xref: /xnu-11417.140.69/osfmk/kern/lock_rw.c (revision 43a90889846e00bfb5cf1d255cdc0a701a1e05a4)
1 /*
2  * Copyright (c) 2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 #define LOCK_PRIVATE 1
57 #include <debug.h>
58 #include <kern/locks_internal.h>
59 #include <kern/lock_stat.h>
60 #include <kern/locks.h>
61 #include <kern/zalloc.h>
62 #include <kern/thread.h>
63 #include <kern/processor.h>
64 #include <kern/sched_prim.h>
65 #include <kern/debug.h>
66 #include <machine/atomic.h>
67 #include <machine/machine_cpu.h>
68 
69 KALLOC_TYPE_DEFINE(KT_LCK_RW, lck_rw_t, KT_PRIV_ACCT);
70 
71 #define LCK_RW_WRITER_EVENT(lck)                (event_t)((uintptr_t)(lck)+1)
72 #define LCK_RW_READER_EVENT(lck)                (event_t)((uintptr_t)(lck)+2)
73 #define WRITE_EVENT_TO_RWLOCK(event)            ((lck_rw_t *)((uintptr_t)(event)-1))
74 #define READ_EVENT_TO_RWLOCK(event)             ((lck_rw_t *)((uintptr_t)(event)-2))
75 
76 #if CONFIG_DTRACE
77 #define DTRACE_RW_SHARED        0x0     //reader
78 #define DTRACE_RW_EXCL          0x1     //writer
79 #define DTRACE_NO_FLAG          0x0     //not applicable
80 #endif  /* CONFIG_DTRACE */
81 
82 #define LCK_RW_LCK_EXCLUSIVE_CODE       0x100
83 #define LCK_RW_LCK_EXCLUSIVE1_CODE      0x101
84 #define LCK_RW_LCK_SHARED_CODE          0x102
85 #define LCK_RW_LCK_SH_TO_EX_CODE        0x103
86 #define LCK_RW_LCK_SH_TO_EX1_CODE       0x104
87 #define LCK_RW_LCK_EX_TO_SH_CODE        0x105
88 
89 #if __x86_64__
90 #define LCK_RW_LCK_EX_WRITER_SPIN_CODE  0x106
91 #define LCK_RW_LCK_EX_WRITER_WAIT_CODE  0x107
92 #define LCK_RW_LCK_EX_READER_SPIN_CODE  0x108
93 #define LCK_RW_LCK_EX_READER_WAIT_CODE  0x109
94 #define LCK_RW_LCK_SHARED_SPIN_CODE     0x110
95 #define LCK_RW_LCK_SHARED_WAIT_CODE     0x111
96 #define LCK_RW_LCK_SH_TO_EX_SPIN_CODE   0x112
97 #define LCK_RW_LCK_SH_TO_EX_WAIT_CODE   0x113
98 #endif
99 
100 #define lck_rw_ilk_lock(lock)   hw_lock_bit  ((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT, LCK_GRP_NULL)
101 #define lck_rw_ilk_unlock(lock) hw_unlock_bit((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT)
102 
103 #define ordered_load_rw(lock)                   os_atomic_load(&(lock)->lck_rw_data, compiler_acq_rel)
104 #define ordered_store_rw(lock, value)           os_atomic_store(&(lock)->lck_rw_data, (value), compiler_acq_rel)
105 #define ordered_store_rw_owner(lock, value)     os_atomic_store(&(lock)->lck_rw_owner, (value), compiler_acq_rel)
106 
107 #ifdef DEBUG_RW
108 
109 STATIC_IF_KEY_DEFINE_TRUE(lck_rw_assert);
110 
111 static TUNABLE(bool, lck_rw_recursive_shared_assert_74048094, "lck_rw_recursive_shared_assert", false);
112 SECURITY_READ_ONLY_EARLY(vm_packing_params_t) rwlde_caller_packing_params =
113     VM_PACKING_PARAMS(LCK_RW_CALLER_PACKED);
114 
115 #define set_rwlde_caller_packed(entry, caller)          ((entry)->rwlde_caller_packed = VM_PACK_POINTER((vm_offset_t)caller, LCK_RW_CALLER_PACKED))
116 #define get_rwlde_caller(entry)                         ((void*)VM_UNPACK_POINTER(entry->rwlde_caller_packed, LCK_RW_CALLER_PACKED))
117 
118 #endif /* DEBUG_RW */
119 
120 /*!
121  * @function lck_rw_alloc_init
122  *
123  * @abstract
124  * Allocates and initializes a rw_lock_t.
125  *
126  * @discussion
127  * The function can block. See lck_rw_init() for initialization details.
128  *
129  * @param grp           lock group to associate with the lock.
130  * @param attr          lock attribute to initialize the lock.
131  *
132  * @returns             NULL or the allocated lock
133  */
134 lck_rw_t *
lck_rw_alloc_init(lck_grp_t * grp,lck_attr_t * attr)135 lck_rw_alloc_init(
136 	lck_grp_t       *grp,
137 	lck_attr_t      *attr)
138 {
139 	lck_rw_t *lck;
140 
141 	lck = zalloc_flags(KT_LCK_RW, Z_WAITOK | Z_ZERO);
142 	lck_rw_init(lck, grp, attr);
143 	return lck;
144 }
145 
146 /*!
147  * @function lck_rw_init
148  *
149  * @abstract
150  * Initializes a rw_lock_t.
151  *
152  * @discussion
153  * Usage statistics for the lock are going to be added to the lock group provided.
154  *
155  * The lock attribute can be used to specify the lock contention behaviour.
156  * RW_WRITER_PRIORITY is the default behaviour (LCK_ATTR_NULL defaults to RW_WRITER_PRIORITY)
157  * and lck_attr_rw_shared_priority() can be used to set the behaviour to RW_SHARED_PRIORITY.
158  *
159  * RW_WRITER_PRIORITY gives priority to the writers upon contention with the readers;
160  * if the lock is held and a writer starts waiting for the lock, readers will not be able
161  * to acquire the lock until all writers stop contending. Readers could
162  * potentially starve.
163  * RW_SHARED_PRIORITY gives priority to the readers upon contention with the writers:
164  * unleass the lock is held in exclusive mode, readers will always be able to acquire the lock.
165  * Readers can lock a shared lock even if there are writers waiting. Writers could potentially
166  * starve.
167  *
168  * @param lck           lock to initialize.
169  * @param grp           lock group to associate with the lock.
170  * @param attr          lock attribute to initialize the lock.
171  *
172  */
173 void
lck_rw_init(lck_rw_t * lck,lck_grp_t * grp,lck_attr_t * attr)174 lck_rw_init(
175 	lck_rw_t        *lck,
176 	lck_grp_t       *grp,
177 	lck_attr_t      *attr)
178 {
179 	/* keep this so that the lck_type_t type is referenced for lldb */
180 	lck_type_t type = LCK_TYPE_RW;
181 
182 	if (attr == LCK_ATTR_NULL) {
183 		attr = &lck_attr_default;
184 	}
185 	*lck = (lck_rw_t){
186 		.lck_rw_type = type,
187 		.lck_rw_can_sleep = true,
188 		.lck_rw_priv_excl = !(attr->lck_attr_val & LCK_ATTR_RW_SHARED_PRIORITY),
189 	};
190 	lck_grp_reference(grp, &grp->lck_grp_rwcnt);
191 }
192 
193 /*!
194  * @function lck_rw_free
195  *
196  * @abstract
197  * Frees a rw_lock previously allocated with lck_rw_alloc_init().
198  *
199  * @discussion
200  * The lock must be not held by any thread.
201  *
202  * @param lck           rw_lock to free.
203  */
204 void
lck_rw_free(lck_rw_t * lck,lck_grp_t * grp)205 lck_rw_free(
206 	lck_rw_t        *lck,
207 	lck_grp_t       *grp)
208 {
209 	lck_rw_destroy(lck, grp);
210 	zfree(KT_LCK_RW, lck);
211 }
212 
213 /*!
214  * @function lck_rw_destroy
215  *
216  * @abstract
217  * Destroys a rw_lock previously initialized with lck_rw_init().
218  *
219  * @discussion
220  * The lock must be not held by any thread.
221  *
222  * @param lck           rw_lock to destroy.
223  */
224 void
lck_rw_destroy(lck_rw_t * lck,lck_grp_t * grp)225 lck_rw_destroy(
226 	lck_rw_t        *lck,
227 	lck_grp_t       *grp)
228 {
229 	if (lck->lck_rw_type != LCK_TYPE_RW ||
230 	    lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) {
231 		panic("Destroying previously destroyed lock %p", lck);
232 	}
233 	lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
234 
235 	lck->lck_rw_type = LCK_TYPE_NONE;
236 	lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
237 	lck_grp_deallocate(grp, &grp->lck_grp_rwcnt);
238 }
239 
240 #ifdef DEBUG_RW
241 
242 /*
243  * Best effort mechanism to debug rw_locks.
244  *
245  * This mechanism is in addition to the owner checks. The owner is set
246  * only when the lock is held in exclusive mode so the checks do not cover
247  * the cases in which the lock is held in shared mode.
248  *
249  * This mechanism tentatively stores the rw_lock acquired and its debug
250  * information on the thread struct.
251  * Just up to LCK_RW_EXPECTED_MAX_NUMBER rw lock debug information can be stored.
252  *
253  * NOTE: LCK_RW_EXPECTED_MAX_NUMBER is the expected number of rw_locks held
254  * at the same time. If a thread holds more than this number of rw_locks we
255  * will start losing debug information.
256  * Increasing LCK_RW_EXPECTED_MAX_NUMBER will increase the probability we will
257  * store the debug information but it will require more memory per thread
258  * and longer lock/unlock time.
259  *
260  * If an empty slot is found for the debug information, we record the lock
261  * otherwise we set the overflow threshold flag.
262  *
263  * If we reached the overflow threshold we might stop asserting because we cannot be sure
264  * anymore if the lock was acquired or not.
265  *
266  * Even if we reached the overflow threshold, we try to store the debug information
267  * for the new locks acquired. This can be useful in core dumps to debug
268  * possible return to userspace without unlocking and to find possible readers
269  * holding the lock.
270  */
271 #if DEBUG_RW
272 
273 __static_if_init_func
274 void
lck_rw_assert_init(const char * args,uint64_t kf_ovrd)275 lck_rw_assert_init(const char *args, uint64_t kf_ovrd)
276 {
277 	bool lck_rw_assert_disable = false;
278 
279 	if (kf_ovrd & KF_MACH_ASSERT_OVRD) {
280 		lck_rw_assert_disable = true;
281 	}
282 
283 	if (static_if_boot_arg_uint64(args, "lcks", 0) &
284 	    LCK_OPTION_DISABLE_RW_DEBUG) {
285 		lck_rw_assert_disable = true;
286 	}
287 
288 	if (lck_rw_assert_disable) {
289 		static_if_key_disable(lck_rw_assert);
290 	}
291 }
292 
293 #endif /* DEBUG_RW */
294 
295 static inline struct rw_lock_debug_entry *
find_lock_in_savedlocks(lck_rw_t * lock,rw_lock_debug_t * rw_locks_held)296 find_lock_in_savedlocks(lck_rw_t* lock, rw_lock_debug_t *rw_locks_held)
297 {
298 	int i;
299 	for (i = 0; i < LCK_RW_EXPECTED_MAX_NUMBER; i++) {
300 		struct rw_lock_debug_entry *existing = &rw_locks_held->rwld_locks[i];
301 		if (existing->rwlde_lock == lock) {
302 			return existing;
303 		}
304 	}
305 
306 	return NULL;
307 }
308 
309 __abortlike
310 static void
rwlock_slot_panic(rw_lock_debug_t * rw_locks_held)311 rwlock_slot_panic(rw_lock_debug_t *rw_locks_held)
312 {
313 	panic("No empty slot found in %p slot_used %d", rw_locks_held, rw_locks_held->rwld_locks_saved);
314 }
315 
316 static inline struct rw_lock_debug_entry *
find_empty_slot(rw_lock_debug_t * rw_locks_held)317 find_empty_slot(rw_lock_debug_t *rw_locks_held)
318 {
319 	int i;
320 	for (i = 0; i < LCK_RW_EXPECTED_MAX_NUMBER; i++) {
321 		struct rw_lock_debug_entry *entry = &rw_locks_held->rwld_locks[i];
322 		if (entry->rwlde_lock == NULL) {
323 			return entry;
324 		}
325 	}
326 	rwlock_slot_panic(rw_locks_held);
327 }
328 
329 __abortlike
330 static void
canlock_rwlock_panic(lck_rw_t * lock,thread_t thread,struct rw_lock_debug_entry * entry)331 canlock_rwlock_panic(lck_rw_t* lock, thread_t thread, struct rw_lock_debug_entry *entry)
332 {
333 	panic("RW lock %p already held by %p caller %p mode_count %d state 0x%x owner 0x%p ",
334 	    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
335 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
336 }
337 
338 __attribute__((noinline))
339 static void
assert_canlock_rwlock_slow(lck_rw_t * lock,thread_t thread,lck_rw_type_t type)340 assert_canlock_rwlock_slow(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
341 {
342 	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
343 	if (__probable(rw_locks_held->rwld_locks_acquired == 0)) {
344 		//no locks saved, safe to lock
345 		return;
346 	}
347 
348 	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
349 	if (__improbable(entry != NULL)) {
350 		boolean_t can_be_shared_recursive;
351 		if (lck_rw_recursive_shared_assert_74048094) {
352 			can_be_shared_recursive = (lock->lck_rw_priv_excl == 0);
353 		} else {
354 			/* currently rw_lock_shared is called recursively,
355 			 * until the code is fixed allow to lock
356 			 * recursively in shared mode
357 			 */
358 			can_be_shared_recursive = TRUE;
359 		}
360 		if ((type == LCK_RW_TYPE_SHARED) && can_be_shared_recursive && entry->rwlde_mode_count >= 1) {
361 			return;
362 		}
363 		canlock_rwlock_panic(lock, thread, entry);
364 	}
365 }
366 
367 static inline void
assert_canlock_rwlock(lck_rw_t * lock,thread_t thread,lck_rw_type_t type)368 assert_canlock_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
369 {
370 	if (lck_rw_assert_enabled()) {
371 		assert_canlock_rwlock_slow(lock, thread, type);
372 	}
373 }
374 
375 __abortlike
376 static void
held_rwlock_notheld_panic(lck_rw_t * lock,thread_t thread)377 held_rwlock_notheld_panic(lck_rw_t* lock, thread_t thread)
378 {
379 	panic("RW lock %p not held by %p", lock, thread);
380 }
381 
382 __abortlike
383 static void
held_rwlock_notheld_with_info_panic(lck_rw_t * lock,thread_t thread,lck_rw_type_t type,struct rw_lock_debug_entry * entry)384 held_rwlock_notheld_with_info_panic(lck_rw_t* lock, thread_t thread, lck_rw_type_t type, struct rw_lock_debug_entry *entry)
385 {
386 	if (type == LCK_RW_TYPE_EXCLUSIVE) {
387 		panic("RW lock %p not held in exclusive by %p caller %p read %d state 0x%x owner 0x%p ",
388 		    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
389 		    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
390 	} else {
391 		panic("RW lock %p not held in shared by %p caller %p read %d state 0x%x owner 0x%p ",
392 		    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
393 		    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
394 	}
395 }
396 
397 __attribute__((noinline))
398 static void
assert_held_rwlock_slow(lck_rw_t * lock,thread_t thread,lck_rw_type_t type)399 assert_held_rwlock_slow(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
400 {
401 	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
402 
403 	if (__improbable(rw_locks_held->rwld_locks_acquired == 0 || rw_locks_held->rwld_locks_saved == 0)) {
404 		if (rw_locks_held->rwld_locks_acquired == 0 || rw_locks_held->rwld_overflow == 0) {
405 			held_rwlock_notheld_panic(lock, thread);
406 		}
407 		return;
408 	}
409 
410 	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
411 	if (__probable(entry != NULL)) {
412 		if (type == LCK_RW_TYPE_EXCLUSIVE && entry->rwlde_mode_count != -1) {
413 			held_rwlock_notheld_with_info_panic(lock, thread, type, entry);
414 		} else {
415 			if (type == LCK_RW_TYPE_SHARED && entry->rwlde_mode_count <= 0) {
416 				held_rwlock_notheld_with_info_panic(lock, thread, type, entry);
417 			}
418 		}
419 	} else {
420 		if (rw_locks_held->rwld_overflow == 0) {
421 			held_rwlock_notheld_panic(lock, thread);
422 		}
423 	}
424 }
425 
426 static inline void
assert_held_rwlock(lck_rw_t * lock,thread_t thread,lck_rw_type_t type)427 assert_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
428 {
429 	if (lck_rw_assert_enabled()) {
430 		assert_held_rwlock_slow(lock, thread, type);
431 	}
432 }
433 
434 __attribute__((noinline))
435 static void
change_held_rwlock_slow(lck_rw_t * lock,thread_t thread,lck_rw_type_t typeFrom,void * caller)436 change_held_rwlock_slow(lck_rw_t* lock, thread_t thread, lck_rw_type_t typeFrom, void* caller)
437 {
438 	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
439 	if (__improbable(rw_locks_held->rwld_locks_saved == 0)) {
440 		if (rw_locks_held->rwld_overflow == 0) {
441 			held_rwlock_notheld_panic(lock, thread);
442 		}
443 		return;
444 	}
445 
446 	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
447 	if (__probable(entry != NULL)) {
448 		if (typeFrom == LCK_RW_TYPE_SHARED) {
449 			//We are upgrading
450 			assertf(entry->rwlde_mode_count == 1,
451 			    "RW lock %p not held by a single shared when upgrading "
452 			    "by %p caller %p read %d state 0x%x owner 0x%p ",
453 			    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
454 			    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
455 			entry->rwlde_mode_count = -1;
456 			set_rwlde_caller_packed(entry, caller);
457 		} else {
458 			//We are downgrading
459 			assertf(entry->rwlde_mode_count == -1,
460 			    "RW lock %p not held in write mode when downgrading "
461 			    "by %p caller %p read %d state 0x%x owner 0x%p ",
462 			    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
463 			    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
464 			entry->rwlde_mode_count = 1;
465 			set_rwlde_caller_packed(entry, caller);
466 		}
467 		return;
468 	}
469 
470 	if (rw_locks_held->rwld_overflow == 0) {
471 		held_rwlock_notheld_panic(lock, thread);
472 	}
473 
474 	if (rw_locks_held->rwld_locks_saved == LCK_RW_EXPECTED_MAX_NUMBER) {
475 		//array is full
476 		return;
477 	}
478 
479 	struct rw_lock_debug_entry *null_entry = find_empty_slot(rw_locks_held);
480 	null_entry->rwlde_lock = lock;
481 	set_rwlde_caller_packed(null_entry, caller);
482 	if (typeFrom == LCK_RW_TYPE_SHARED) {
483 		null_entry->rwlde_mode_count = -1;
484 	} else {
485 		null_entry->rwlde_mode_count = 1;
486 	}
487 	rw_locks_held->rwld_locks_saved++;
488 }
489 
490 static inline void
change_held_rwlock(lck_rw_t * lock,thread_t thread,lck_rw_type_t typeFrom,void * caller)491 change_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t typeFrom, void* caller)
492 {
493 	if (lck_rw_assert_enabled()) {
494 		change_held_rwlock_slow(lock, thread, typeFrom, caller);
495 	}
496 }
497 
498 __abortlike
499 static void
add_held_rwlock_too_many_panic(thread_t thread)500 add_held_rwlock_too_many_panic(thread_t thread)
501 {
502 	panic("RW lock too many rw locks held, rwld_locks_acquired maxed out for thread %p", thread);
503 }
504 
505 static __attribute__((noinline)) void
add_held_rwlock_slow(lck_rw_t * lock,thread_t thread,lck_rw_type_t type,void * caller)506 add_held_rwlock_slow(lck_rw_t* lock, thread_t thread, lck_rw_type_t type, void* caller)
507 {
508 	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
509 	struct rw_lock_debug_entry *null_entry;
510 	if (__improbable(rw_locks_held->rwld_locks_acquired == UINT32_MAX)) {
511 		add_held_rwlock_too_many_panic(thread);
512 	}
513 	rw_locks_held->rwld_locks_acquired++;
514 
515 	if (type == LCK_RW_TYPE_EXCLUSIVE) {
516 		if (__improbable(rw_locks_held->rwld_locks_saved == LCK_RW_EXPECTED_MAX_NUMBER)) {
517 			//array is full
518 			rw_locks_held->rwld_overflow = 1;
519 			return;
520 		}
521 		null_entry = find_empty_slot(rw_locks_held);
522 		null_entry->rwlde_lock = lock;
523 		set_rwlde_caller_packed(null_entry, caller);
524 		null_entry->rwlde_mode_count = -1;
525 		rw_locks_held->rwld_locks_saved++;
526 		return;
527 	} else {
528 		if (__probable(rw_locks_held->rwld_locks_saved == 0)) {
529 			//array is empty
530 			goto add_shared;
531 		}
532 
533 		boolean_t allow_shared_recursive;
534 		if (lck_rw_recursive_shared_assert_74048094) {
535 			allow_shared_recursive = (lock->lck_rw_priv_excl == 0);
536 		} else {
537 			allow_shared_recursive = TRUE;
538 		}
539 		if (allow_shared_recursive) {
540 			//It could be already locked in shared mode
541 			struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
542 			if (entry != NULL) {
543 				assert(entry->rwlde_mode_count > 0);
544 				assertf(entry->rwlde_mode_count != INT8_MAX,
545 				    "RW lock %p with too many recursive shared held "
546 				    "from %p caller %p read %d state 0x%x owner 0x%p",
547 				    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
548 				    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
549 				entry->rwlde_mode_count += 1;
550 				return;
551 			}
552 		}
553 
554 		//none of the locks were a match
555 		//try to add a new entry
556 		if (__improbable(rw_locks_held->rwld_locks_saved == LCK_RW_EXPECTED_MAX_NUMBER)) {
557 			//array is full
558 			rw_locks_held->rwld_overflow = 1;
559 			return;
560 		}
561 
562 add_shared:
563 		null_entry = find_empty_slot(rw_locks_held);
564 		null_entry->rwlde_lock = lock;
565 		set_rwlde_caller_packed(null_entry, caller);
566 		null_entry->rwlde_mode_count = 1;
567 		rw_locks_held->rwld_locks_saved++;
568 	}
569 }
570 
571 static inline void
add_held_rwlock(lck_rw_t * lock,thread_t thread,lck_rw_type_t type,void * caller)572 add_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type, void* caller)
573 {
574 	if (lck_rw_assert_enabled()) {
575 		add_held_rwlock_slow(lock, thread, type, caller);
576 	}
577 }
578 
579 static void
remove_held_rwlock_slow(lck_rw_t * lock,thread_t thread,lck_rw_type_t type)580 remove_held_rwlock_slow(lck_rw_t *lock, thread_t thread, lck_rw_type_t type)
581 {
582 	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
583 	if (__improbable(rw_locks_held->rwld_locks_acquired == 0)) {
584 		return;
585 	}
586 	rw_locks_held->rwld_locks_acquired--;
587 
588 	if (rw_locks_held->rwld_locks_saved == 0) {
589 		assert(rw_locks_held->rwld_overflow == 1);
590 		goto out;
591 	}
592 
593 	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
594 	if (__probable(entry != NULL)) {
595 		if (type == LCK_RW_TYPE_EXCLUSIVE) {
596 			assert(entry->rwlde_mode_count == -1);
597 			entry->rwlde_mode_count = 0;
598 		} else {
599 			assert(entry->rwlde_mode_count > 0);
600 			entry->rwlde_mode_count--;
601 			if (entry->rwlde_mode_count > 0) {
602 				goto out;
603 			}
604 		}
605 		entry->rwlde_caller_packed = 0;
606 		entry->rwlde_lock = NULL;
607 		rw_locks_held->rwld_locks_saved--;
608 	} else {
609 		assert(rw_locks_held->rwld_overflow == 1);
610 	}
611 
612 out:
613 	if (rw_locks_held->rwld_locks_acquired == 0) {
614 		rw_locks_held->rwld_overflow = 0;
615 	}
616 	return;
617 }
618 
619 static inline void
remove_held_rwlock(lck_rw_t * lock,thread_t thread,lck_rw_type_t type)620 remove_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
621 {
622 	if (lck_rw_assert_enabled()) {
623 		remove_held_rwlock_slow(lock, thread, type);
624 	}
625 }
626 #endif /* DEBUG_RW */
627 
628 /*
629  * We disable interrupts while holding the RW interlock to prevent an
630  * interrupt from exacerbating hold time.
631  * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
632  */
633 static inline boolean_t
lck_interlock_lock(lck_rw_t * lck)634 lck_interlock_lock(
635 	lck_rw_t        *lck)
636 {
637 	boolean_t       istate;
638 
639 	istate = ml_set_interrupts_enabled(FALSE);
640 	lck_rw_ilk_lock(lck);
641 	return istate;
642 }
643 
644 static inline void
lck_interlock_unlock(lck_rw_t * lck,boolean_t istate)645 lck_interlock_unlock(
646 	lck_rw_t        *lck,
647 	boolean_t       istate)
648 {
649 	lck_rw_ilk_unlock(lck);
650 	ml_set_interrupts_enabled(istate);
651 }
652 
653 /*
654  * compute the deadline to spin against when
655  * waiting for a change of state on a lck_rw_t
656  */
657 static inline uint64_t
lck_rw_deadline_for_spin(lck_rw_t * lck)658 lck_rw_deadline_for_spin(
659 	lck_rw_t        *lck)
660 {
661 	lck_rw_word_t   word;
662 
663 	word.data = ordered_load_rw(lck);
664 	if (word.can_sleep) {
665 		if (word.r_waiting || word.w_waiting || (word.shared_count > machine_info.max_cpus)) {
666 			/*
667 			 * there are already threads waiting on this lock... this
668 			 * implies that they have spun beyond their deadlines waiting for
669 			 * the desired state to show up so we will not bother spinning at this time...
670 			 *   or
671 			 * the current number of threads sharing this lock exceeds our capacity to run them
672 			 * concurrently and since all states we're going to spin for require the rw_shared_count
673 			 * to be at 0, we'll not bother spinning since the latency for this to happen is
674 			 * unpredictable...
675 			 */
676 			return mach_absolute_time();
677 		}
678 		return mach_absolute_time() + os_atomic_load(&MutexSpin, relaxed);
679 	} else {
680 		return mach_absolute_time() + (100000LL * 1000000000LL);
681 	}
682 }
683 
684 /*
685  * This inline is used when busy-waiting for an rw lock.
686  * If interrupts were disabled when the lock primitive was called,
687  * we poll the IPI handler for pending tlb flushes in x86.
688  */
689 static inline void
lck_rw_lock_pause(boolean_t interrupts_enabled)690 lck_rw_lock_pause(
691 	boolean_t       interrupts_enabled)
692 {
693 #if X86_64
694 	if (!interrupts_enabled) {
695 		handle_pending_TLB_flushes();
696 	}
697 	cpu_pause();
698 #else
699 	(void) interrupts_enabled;
700 	wait_for_event();
701 #endif
702 }
703 
704 typedef enum __enum_closed {
705 	LCK_RW_DRAIN_S_DRAINED       = 0,
706 	LCK_RW_DRAIN_S_NOT_DRAINED   = 1,
707 	LCK_RW_DRAIN_S_EARLY_RETURN  = 2,
708 	LCK_RW_DRAIN_S_TIMED_OUT     = 3,
709 } lck_rw_drain_state_t;
710 
711 static lck_rw_drain_state_t
712 lck_rw_drain_status(
713 	lck_rw_t        *lock,
714 	uint32_t        status_mask,
715 	boolean_t       wait,
716 	bool            (^lock_pause)(void))
717 {
718 	uint64_t        deadline = 0;
719 	uint32_t        data;
720 	boolean_t       istate = FALSE;
721 
722 	if (wait) {
723 		deadline = lck_rw_deadline_for_spin(lock);
724 #if __x86_64__
725 		istate = ml_get_interrupts_enabled();
726 #endif
727 	}
728 
729 	for (;;) {
730 #if __x86_64__
731 		data = os_atomic_load(&lock->lck_rw_data, relaxed);
732 #else
733 		data = load_exclusive32(&lock->lck_rw_data, memory_order_acquire_smp);
734 #endif
735 		if ((data & status_mask) == 0) {
736 			atomic_exchange_abort();
737 			return LCK_RW_DRAIN_S_DRAINED;
738 		}
739 
740 		if (!wait) {
741 			atomic_exchange_abort();
742 			return LCK_RW_DRAIN_S_NOT_DRAINED;
743 		}
744 
745 		lck_rw_lock_pause(istate);
746 
747 		if (mach_absolute_time() >= deadline) {
748 			return LCK_RW_DRAIN_S_TIMED_OUT;
749 		}
750 
751 		if (lock_pause && lock_pause()) {
752 			return LCK_RW_DRAIN_S_EARLY_RETURN;
753 		}
754 	}
755 }
756 
757 /*
758  * Spin while interlock is held.
759  */
760 static inline void
lck_rw_interlock_spin(lck_rw_t * lock)761 lck_rw_interlock_spin(
762 	lck_rw_t        *lock)
763 {
764 	uint32_t        data, prev;
765 
766 	for (;;) {
767 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_relaxed);
768 		if (data & LCK_RW_INTERLOCK) {
769 #if __x86_64__
770 			cpu_pause();
771 #else
772 			wait_for_event();
773 #endif
774 		} else {
775 			atomic_exchange_abort();
776 			return;
777 		}
778 	}
779 }
780 
781 #define LCK_RW_GRAB_WANT        0
782 #define LCK_RW_GRAB_SHARED      1
783 
784 typedef enum __enum_closed __enum_options {
785 	LCK_RW_GRAB_F_SHARED    = 0x0,  // Not really a flag obviously but makes call sites more readable.
786 	LCK_RW_GRAB_F_WANT_EXCL = 0x1,
787 	LCK_RW_GRAB_F_WAIT      = 0x2,
788 } lck_rw_grab_flags_t;
789 
790 typedef enum __enum_closed {
791 	LCK_RW_GRAB_S_NOT_LOCKED    = 0,
792 	LCK_RW_GRAB_S_LOCKED        = 1,
793 	LCK_RW_GRAB_S_EARLY_RETURN  = 2,
794 	LCK_RW_GRAB_S_TIMED_OUT     = 3,
795 } lck_rw_grab_state_t;
796 
797 static lck_rw_grab_state_t
798 lck_rw_grab(
799 	lck_rw_t            *lock,
800 	lck_rw_grab_flags_t flags,
801 	bool                (^lock_pause)(void))
802 {
803 	uint64_t        deadline = 0;
804 	uint32_t        data, prev;
805 	boolean_t       do_exch, istate = FALSE;
806 
807 	assert3u(flags & ~(LCK_RW_GRAB_F_WANT_EXCL | LCK_RW_GRAB_F_WAIT), ==, 0);
808 
809 	if ((flags & LCK_RW_GRAB_F_WAIT) != 0) {
810 		deadline = lck_rw_deadline_for_spin(lock);
811 #if __x86_64__
812 		istate = ml_get_interrupts_enabled();
813 #endif
814 	}
815 
816 	for (;;) {
817 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
818 		if (data & LCK_RW_INTERLOCK) {
819 			atomic_exchange_abort();
820 			lck_rw_interlock_spin(lock);
821 			continue;
822 		}
823 		do_exch = FALSE;
824 		if ((flags & LCK_RW_GRAB_F_WANT_EXCL) != 0) {
825 			if ((data & LCK_RW_WANT_EXCL) == 0) {
826 				data |= LCK_RW_WANT_EXCL;
827 				do_exch = TRUE;
828 			}
829 		} else {        // LCK_RW_GRAB_SHARED
830 			if (((data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) == 0) ||
831 			    (((data & LCK_RW_SHARED_MASK)) && ((data & LCK_RW_PRIV_EXCL) == 0))) {
832 				data += LCK_RW_SHARED_READER;
833 				do_exch = TRUE;
834 			}
835 		}
836 		if (do_exch) {
837 			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
838 				return LCK_RW_GRAB_S_LOCKED;
839 			}
840 		} else {
841 			if ((flags & LCK_RW_GRAB_F_WAIT) == 0) {
842 				atomic_exchange_abort();
843 				return LCK_RW_GRAB_S_NOT_LOCKED;
844 			}
845 
846 			lck_rw_lock_pause(istate);
847 
848 			if (mach_absolute_time() >= deadline) {
849 				return LCK_RW_GRAB_S_TIMED_OUT;
850 			}
851 			if (lock_pause && lock_pause()) {
852 				return LCK_RW_GRAB_S_EARLY_RETURN;
853 			}
854 		}
855 	}
856 }
857 
858 /*
859  * The inverse of lck_rw_grab - drops either the LCK_RW_WANT_EXCL bit or
860  * decrements the reader count. Doesn't deal with waking up waiters - i.e.
861  * should only be called when can_sleep is false.
862  */
863 static void
lck_rw_drop(lck_rw_t * lock,lck_rw_grab_flags_t flags)864 lck_rw_drop(lck_rw_t *lock, lck_rw_grab_flags_t flags)
865 {
866 	uint32_t data, prev;
867 
868 	assert3u(flags & ~(LCK_RW_GRAB_F_WANT_EXCL | LCK_RW_GRAB_F_WAIT), ==, 0);
869 	assert(!lock->lck_rw_can_sleep);
870 
871 	for (;;) {
872 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
873 
874 		/* Interlock should never be taken when can_sleep is false. */
875 		assert3u(data & LCK_RW_INTERLOCK, ==, 0);
876 
877 		if ((flags & LCK_RW_GRAB_F_WANT_EXCL) != 0) {
878 			data &= ~LCK_RW_WANT_EXCL;
879 		} else {
880 			data -= LCK_RW_SHARED_READER;
881 		}
882 
883 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
884 			break;
885 		}
886 
887 		cpu_pause();
888 	}
889 
890 	return;
891 }
892 
893 static boolean_t
894 lck_rw_lock_exclusive_gen(
895 	lck_rw_t        *lock,
896 	bool            (^lock_pause)(void))
897 {
898 	__assert_only thread_t self = current_thread();
899 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
900 	lck_rw_word_t           word;
901 	int                     slept = 0;
902 	lck_rw_grab_state_t     grab_state = LCK_RW_GRAB_S_NOT_LOCKED;
903 	lck_rw_drain_state_t    drain_state = LCK_RW_DRAIN_S_NOT_DRAINED;
904 	wait_result_t           res = 0;
905 	boolean_t               istate;
906 
907 #if     CONFIG_DTRACE
908 	boolean_t dtrace_ls_initialized = FALSE;
909 	boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
910 	uint64_t wait_interval = 0;
911 	int readers_at_sleep = 0;
912 #endif
913 
914 	assertf(lock->lck_rw_owner != self->ctid,
915 	    "Lock already held state=0x%x, owner=%p",
916 	    ordered_load_rw(lock), self);
917 
918 #ifdef DEBUG_RW
919 	/*
920 	 * Best effort attempt to check that this thread
921 	 * is not already holding the lock (this checks read mode too).
922 	 */
923 	assert_canlock_rwlock(lock, self, LCK_RW_TYPE_EXCLUSIVE);
924 #endif /* DEBUG_RW */
925 
926 	/*
927 	 *	Try to acquire the lck_rw_want_excl bit.
928 	 */
929 	while (lck_rw_grab(lock, LCK_RW_GRAB_F_WANT_EXCL, NULL) != LCK_RW_GRAB_S_LOCKED) {
930 #if     CONFIG_DTRACE
931 		if (dtrace_ls_initialized == FALSE) {
932 			dtrace_ls_initialized = TRUE;
933 			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
934 			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
935 			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
936 			if (dtrace_ls_enabled) {
937 				/*
938 				 * Either sleeping or spinning is happening,
939 				 *  start a timing of our delay interval now.
940 				 */
941 				readers_at_sleep = lock->lck_rw_shared_count;
942 				wait_interval = mach_absolute_time();
943 			}
944 		}
945 #endif
946 
947 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START,
948 		    trace_lck, 0, 0, 0, 0);
949 
950 		grab_state = lck_rw_grab(lock, LCK_RW_GRAB_F_WANT_EXCL | LCK_RW_GRAB_F_WAIT, lock_pause);
951 
952 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END,
953 		    trace_lck, 0, 0, grab_state, 0);
954 
955 		if (grab_state == LCK_RW_GRAB_S_LOCKED ||
956 		    grab_state == LCK_RW_GRAB_S_EARLY_RETURN) {
957 			break;
958 		}
959 		/*
960 		 * if we get here, the deadline has expired w/o us
961 		 * being able to grab the lock exclusively
962 		 * check to see if we're allowed to do a thread_block
963 		 */
964 		word.data = ordered_load_rw(lock);
965 		if (word.can_sleep) {
966 			istate = lck_interlock_lock(lock);
967 			word.data = ordered_load_rw(lock);
968 
969 			if (word.want_excl) {
970 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
971 
972 				word.w_waiting = 1;
973 				ordered_store_rw(lock, word.data);
974 
975 				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
976 				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
977 				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
978 				lck_interlock_unlock(lock, istate);
979 				if (res == THREAD_WAITING) {
980 					res = thread_block(THREAD_CONTINUE_NULL);
981 					slept++;
982 				}
983 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
984 			} else {
985 				word.want_excl = 1;
986 				ordered_store_rw(lock, word.data);
987 				lck_interlock_unlock(lock, istate);
988 				break;
989 			}
990 		}
991 	}
992 
993 	if (grab_state == LCK_RW_GRAB_S_EARLY_RETURN) {
994 		assert(lock_pause);
995 		return FALSE;
996 	}
997 
998 	/*
999 	 * Wait for readers (and upgrades) to finish...
1000 	 */
1001 	while (lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, FALSE, NULL) != LCK_RW_DRAIN_S_DRAINED) {
1002 #if     CONFIG_DTRACE
1003 		/*
1004 		 * Either sleeping or spinning is happening, start
1005 		 * a timing of our delay interval now.  If we set it
1006 		 * to -1 we don't have accurate data so we cannot later
1007 		 * decide to record a dtrace spin or sleep event.
1008 		 */
1009 		if (dtrace_ls_initialized == FALSE) {
1010 			dtrace_ls_initialized = TRUE;
1011 			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1012 			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1013 			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1014 			if (dtrace_ls_enabled) {
1015 				/*
1016 				 * Either sleeping or spinning is happening,
1017 				 *  start a timing of our delay interval now.
1018 				 */
1019 				readers_at_sleep = lock->lck_rw_shared_count;
1020 				wait_interval = mach_absolute_time();
1021 			}
1022 		}
1023 #endif
1024 
1025 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1026 
1027 		drain_state = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, TRUE, lock_pause);
1028 
1029 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, drain_state, 0);
1030 
1031 		if (drain_state == LCK_RW_DRAIN_S_DRAINED ||
1032 		    drain_state == LCK_RW_DRAIN_S_EARLY_RETURN) {
1033 			break;
1034 		}
1035 		/*
1036 		 * if we get here, the deadline has expired w/o us
1037 		 * being able to grab the lock exclusively
1038 		 * check to see if we're allowed to do a thread_block
1039 		 */
1040 		word.data = ordered_load_rw(lock);
1041 		if (word.can_sleep) {
1042 			istate = lck_interlock_lock(lock);
1043 			word.data = ordered_load_rw(lock);
1044 
1045 			if (word.shared_count != 0 || word.want_upgrade) {
1046 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1047 
1048 				word.w_waiting = 1;
1049 				ordered_store_rw(lock, word.data);
1050 
1051 				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1052 				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
1053 				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1054 				lck_interlock_unlock(lock, istate);
1055 
1056 				if (res == THREAD_WAITING) {
1057 					res = thread_block(THREAD_CONTINUE_NULL);
1058 					slept++;
1059 				}
1060 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1061 			} else {
1062 				lck_interlock_unlock(lock, istate);
1063 				/*
1064 				 * must own the lock now, since we checked for
1065 				 * readers or upgrade owner behind the interlock
1066 				 * no need for a call to 'lck_rw_drain_status'
1067 				 */
1068 				break;
1069 			}
1070 		}
1071 	}
1072 
1073 #if     CONFIG_DTRACE
1074 	/*
1075 	 * Decide what latencies we suffered that are Dtrace events.
1076 	 * If we have set wait_interval, then we either spun or slept.
1077 	 * At least we get out from under the interlock before we record
1078 	 * which is the best we can do here to minimize the impact
1079 	 * of the tracing.
1080 	 * If we have set wait_interval to -1, then dtrace was not enabled when we
1081 	 * started sleeping/spinning so we don't record this event.
1082 	 */
1083 	if (dtrace_ls_enabled == TRUE) {
1084 		if (slept == 0) {
1085 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lock,
1086 			    mach_absolute_time() - wait_interval, 1);
1087 		} else {
1088 			/*
1089 			 * For the blocking case, we also record if when we blocked
1090 			 * it was held for read or write, and how many readers.
1091 			 * Notice that above we recorded this before we dropped
1092 			 * the interlock so the count is accurate.
1093 			 */
1094 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lock,
1095 			    mach_absolute_time() - wait_interval, 1,
1096 			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1097 		}
1098 	}
1099 #endif /* CONFIG_DTRACE */
1100 
1101 	if (drain_state == LCK_RW_DRAIN_S_EARLY_RETURN) {
1102 		lck_rw_drop(lock, LCK_RW_GRAB_F_WANT_EXCL);
1103 		assert(lock_pause);
1104 		return FALSE;
1105 	}
1106 
1107 #if CONFIG_DTRACE
1108 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, 1);
1109 #endif  /* CONFIG_DTRACE */
1110 
1111 	return TRUE;
1112 }
1113 
1114 static inline void
lck_rw_lock_check_preemption(lck_rw_t * lock __unused)1115 lck_rw_lock_check_preemption(lck_rw_t *lock __unused)
1116 {
1117 	assertf((get_preemption_level() == 0 && ml_get_interrupts_enabled()) ||
1118 	    startup_phase < STARTUP_SUB_EARLY_BOOT ||
1119 	    current_cpu_datap()->cpu_hibernate ||
1120 	    ml_is_quiescing() ||
1121 	    !not_in_kdp,
1122 	    "%s: attempt to take rwlock %p in non-preemptible or interrupt context: "
1123 	    "preemption level = %d, interruptible = %d", __func__, lock,
1124 	    get_preemption_level(), (int)ml_get_interrupts_enabled());
1125 }
1126 
1127 #define LCK_RW_LOCK_EXCLUSIVE_TAS(lck) (atomic_test_and_set32(&(lck)->lck_rw_data, \
1128 	    (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK), \
1129 	    LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE))
1130 /*!
1131  * @function lck_rw_lock_exclusive_check_contended
1132  *
1133  * @abstract
1134  * Locks a rw_lock in exclusive mode.
1135  *
1136  * @discussion
1137  * This routine IS EXPERIMENTAL.
1138  * It's only used for the vm object lock, and use for other subsystems is UNSUPPORTED.
1139  * Note that the return value is ONLY A HEURISTIC w.r.t. the lock's contention.
1140  *
1141  * @param lock           rw_lock to lock.
1142  *
1143  * @returns Returns TRUE if the thread spun or blocked while attempting to acquire the lock, FALSE
1144  *          otherwise.
1145  */
1146 bool
lck_rw_lock_exclusive_check_contended(lck_rw_t * lock)1147 lck_rw_lock_exclusive_check_contended(
1148 	lck_rw_t        *lock)
1149 {
1150 	thread_t        thread = current_thread();
1151 	bool            contended  = false;
1152 
1153 	if (lock->lck_rw_can_sleep) {
1154 		lck_rw_lock_check_preemption(lock);
1155 		lck_rw_lock_count_inc(thread, lock);
1156 	} else if (get_preemption_level() == 0) {
1157 		panic("Taking non-sleepable RW lock with preemption enabled");
1158 	}
1159 
1160 	if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
1161 #if     CONFIG_DTRACE
1162 		LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1163 #endif  /* CONFIG_DTRACE */
1164 	} else {
1165 		contended = true;
1166 		(void) lck_rw_lock_exclusive_gen(lock, NULL);
1167 	}
1168 	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
1169 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
1170 	ordered_store_rw_owner(lock, thread->ctid);
1171 
1172 #ifdef DEBUG_RW
1173 	add_held_rwlock(lock, thread, LCK_RW_TYPE_EXCLUSIVE, __builtin_return_address(0));
1174 #endif /* DEBUG_RW */
1175 	return contended;
1176 }
1177 
1178 __attribute__((always_inline))
1179 static boolean_t
1180 lck_rw_lock_exclusive_internal_inline(
1181 	lck_rw_t        *lock,
1182 	void            *caller,
1183 	bool            (^lock_pause)(void))
1184 {
1185 #pragma unused(caller)
1186 	thread_t        thread = current_thread();
1187 
1188 	if (lock->lck_rw_can_sleep) {
1189 		lck_rw_lock_check_preemption(lock);
1190 		lck_rw_lock_count_inc(thread, lock);
1191 	} else if (get_preemption_level() == 0) {
1192 		panic("Taking non-sleepable RW lock with preemption enabled");
1193 	}
1194 
1195 	if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
1196 #if     CONFIG_DTRACE
1197 		LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1198 #endif  /* CONFIG_DTRACE */
1199 	} else if (!lck_rw_lock_exclusive_gen(lock, lock_pause)) {
1200 		/*
1201 		 * lck_rw_lock_exclusive_gen() should only return
1202 		 * early if lock_pause has been passed and
1203 		 * returns FALSE. lock_pause is exclusive with
1204 		 * lck_rw_can_sleep().
1205 		 */
1206 		assert(!lock->lck_rw_can_sleep);
1207 		return FALSE;
1208 	}
1209 
1210 	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
1211 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
1212 	ordered_store_rw_owner(lock, thread->ctid);
1213 
1214 #if DEBUG_RW
1215 	add_held_rwlock(lock, thread, LCK_RW_TYPE_EXCLUSIVE, caller);
1216 #endif /* DEBUG_RW */
1217 
1218 	return TRUE;
1219 }
1220 
1221 __attribute__((noinline))
1222 static void
lck_rw_lock_exclusive_internal(lck_rw_t * lock,void * caller)1223 lck_rw_lock_exclusive_internal(
1224 	lck_rw_t        *lock,
1225 	void            *caller)
1226 {
1227 	(void) lck_rw_lock_exclusive_internal_inline(lock, caller, NULL);
1228 }
1229 
1230 /*!
1231  * @function lck_rw_lock_exclusive
1232  *
1233  * @abstract
1234  * Locks a rw_lock in exclusive mode.
1235  *
1236  * @discussion
1237  * This function can block.
1238  * Multiple threads can acquire the lock in shared mode at the same time, but only one thread at a time
1239  * can acquire it in exclusive mode.
1240  * NOTE: the thread cannot return to userspace while the lock is held. Recursive locking is not supported.
1241  *
1242  * @param lock           rw_lock to lock.
1243  */
1244 void
lck_rw_lock_exclusive(lck_rw_t * lock)1245 lck_rw_lock_exclusive(
1246 	lck_rw_t        *lock)
1247 {
1248 	(void) lck_rw_lock_exclusive_internal_inline(lock, __builtin_return_address(0), NULL);
1249 }
1250 
1251 /*!
1252  * @function lck_rw_lock_exclusive_b
1253  *
1254  * @abstract
1255  * Locks a rw_lock in exclusive mode. Returns early if the lock can't be acquired
1256  * and the specified block returns true.
1257  *
1258  * @discussion
1259  * Identical to lck_rw_lock_exclusive() but can return early if the lock can't be
1260  * acquired and the specified block returns true. The block is called
1261  * repeatedly when waiting to acquire the lock.
1262  * Should only be called when the lock cannot sleep (i.e. when
1263  * lock->lck_rw_can_sleep is false).
1264  *
1265  * @param lock           rw_lock to lock.
1266  * @param lock_pause     block invoked while waiting to acquire lock
1267  *
1268  * @returns              Returns TRUE if the lock is successfully taken,
1269  *                       FALSE if the block returns true and the lock has
1270  *                       not been acquired.
1271  */
1272 boolean_t
1273 lck_rw_lock_exclusive_b(
1274 	lck_rw_t        *lock,
1275 	bool            (^lock_pause)(void))
1276 {
1277 	assert(!lock->lck_rw_can_sleep);
1278 
1279 	return lck_rw_lock_exclusive_internal_inline(lock, __builtin_return_address(0), lock_pause);
1280 }
1281 
1282 /*
1283  *	Routine:	lck_rw_lock_shared_gen
1284  *	Function:
1285  *		Fast path code has determined that this lock
1286  *		is held exclusively... this is where we spin/block
1287  *		until we can acquire the lock in the shared mode
1288  */
1289 static boolean_t
1290 lck_rw_lock_shared_gen(
1291 	lck_rw_t        *lck,
1292 	bool            (^lock_pause)(void))
1293 {
1294 	__assert_only thread_t  self = current_thread();
1295 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1296 	lck_rw_word_t           word;
1297 	lck_rw_grab_state_t     grab_state = LCK_RW_GRAB_S_NOT_LOCKED;
1298 	int                     slept = 0;
1299 	wait_result_t           res = 0;
1300 	boolean_t               istate;
1301 
1302 #if     CONFIG_DTRACE
1303 	uint64_t wait_interval = 0;
1304 	int readers_at_sleep = 0;
1305 	boolean_t dtrace_ls_initialized = FALSE;
1306 	boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1307 #endif /* CONFIG_DTRACE */
1308 
1309 	assertf(lck->lck_rw_owner != self->ctid,
1310 	    "Lock already held state=0x%x, owner=%p",
1311 	    ordered_load_rw(lck), self);
1312 
1313 #ifdef DEBUG_RW
1314 	/*
1315 	 * Best effort attempt to check that this thread
1316 	 * is not already holding the lock in shared mode.
1317 	 */
1318 	assert_canlock_rwlock(lck, self, LCK_RW_TYPE_SHARED);
1319 #endif
1320 
1321 	while (lck_rw_grab(lck, LCK_RW_GRAB_F_SHARED, NULL) != LCK_RW_GRAB_S_LOCKED) {
1322 #if     CONFIG_DTRACE
1323 		if (dtrace_ls_initialized == FALSE) {
1324 			dtrace_ls_initialized = TRUE;
1325 			dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1326 			dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1327 			dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1328 			if (dtrace_ls_enabled) {
1329 				/*
1330 				 * Either sleeping or spinning is happening,
1331 				 *  start a timing of our delay interval now.
1332 				 */
1333 				readers_at_sleep = lck->lck_rw_shared_count;
1334 				wait_interval = mach_absolute_time();
1335 			}
1336 		}
1337 #endif
1338 
1339 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1340 		    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, 0, 0);
1341 
1342 		grab_state = lck_rw_grab(lck, LCK_RW_GRAB_F_SHARED | LCK_RW_GRAB_F_WAIT, lock_pause);
1343 
1344 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1345 		    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, grab_state, 0);
1346 
1347 		if (grab_state == LCK_RW_GRAB_S_LOCKED ||
1348 		    grab_state == LCK_RW_GRAB_S_EARLY_RETURN) {
1349 			break;
1350 		}
1351 
1352 		/*
1353 		 * if we get here, the deadline has expired w/o us
1354 		 * being able to grab the lock for read
1355 		 * check to see if we're allowed to do a thread_block
1356 		 */
1357 		if (lck->lck_rw_can_sleep) {
1358 			istate = lck_interlock_lock(lck);
1359 
1360 			word.data = ordered_load_rw(lck);
1361 			if ((word.want_excl || word.want_upgrade) &&
1362 			    ((word.shared_count == 0) || word.priv_excl)) {
1363 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1364 				    trace_lck, word.want_excl, word.want_upgrade, 0, 0);
1365 
1366 				word.r_waiting = 1;
1367 				ordered_store_rw(lck, word.data);
1368 
1369 				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
1370 				res = assert_wait(LCK_RW_READER_EVENT(lck),
1371 				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1372 				lck_interlock_unlock(lck, istate);
1373 
1374 				if (res == THREAD_WAITING) {
1375 					res = thread_block(THREAD_CONTINUE_NULL);
1376 					slept++;
1377 				}
1378 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1379 				    trace_lck, res, slept, 0, 0);
1380 			} else {
1381 				word.shared_count++;
1382 				ordered_store_rw(lck, word.data);
1383 				lck_interlock_unlock(lck, istate);
1384 				break;
1385 			}
1386 		}
1387 	}
1388 
1389 #if     CONFIG_DTRACE
1390 	if (dtrace_ls_enabled == TRUE) {
1391 		if (slept == 0) {
1392 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1393 		} else {
1394 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1395 			    mach_absolute_time() - wait_interval, 0,
1396 			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1397 		}
1398 	}
1399 #endif /* CONFIG_DTRACE */
1400 
1401 	if (grab_state == LCK_RW_GRAB_S_EARLY_RETURN) {
1402 		assert(lock_pause);
1403 		return FALSE;
1404 	}
1405 
1406 #if     CONFIG_DTRACE
1407 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1408 #endif  /* CONFIG_DTRACE */
1409 
1410 	return TRUE;
1411 }
1412 
1413 __attribute__((always_inline))
1414 static boolean_t
1415 lck_rw_lock_shared_internal_inline(
1416 	lck_rw_t        *lock,
1417 	void            *caller,
1418 	bool            (^lock_pause)(void))
1419 {
1420 #pragma unused(caller)
1421 
1422 	uint32_t        data, prev;
1423 	thread_t        thread = current_thread();
1424 #ifdef DEBUG_RW
1425 	boolean_t       check_canlock = TRUE;
1426 #endif
1427 
1428 	if (lock->lck_rw_can_sleep) {
1429 		lck_rw_lock_check_preemption(lock);
1430 		lck_rw_lock_count_inc(thread, lock);
1431 	} else if (get_preemption_level() == 0) {
1432 		panic("Taking non-sleepable RW lock with preemption enabled");
1433 	}
1434 
1435 	for (;;) {
1436 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1437 		if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
1438 			atomic_exchange_abort();
1439 			if (!lck_rw_lock_shared_gen(lock, lock_pause)) {
1440 				/*
1441 				 * lck_rw_lock_shared_gen() should only return
1442 				 * early if lock_pause has been passed and
1443 				 * returns FALSE. lock_pause is exclusive with
1444 				 * lck_rw_can_sleep().
1445 				 */
1446 				assert(!lock->lck_rw_can_sleep);
1447 				return FALSE;
1448 			}
1449 
1450 			goto locked;
1451 		}
1452 #ifdef DEBUG_RW
1453 		if ((data & LCK_RW_SHARED_MASK) == 0) {
1454 			/*
1455 			 * If the lock is uncontended,
1456 			 * we do not need to check if we can lock it
1457 			 */
1458 			check_canlock = FALSE;
1459 		}
1460 #endif
1461 		data += LCK_RW_SHARED_READER;
1462 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1463 			break;
1464 		}
1465 		cpu_pause();
1466 	}
1467 #ifdef DEBUG_RW
1468 	if (check_canlock) {
1469 		/*
1470 		 * Best effort attempt to check that this thread
1471 		 * is not already holding the lock (this checks read mode too).
1472 		 */
1473 		assert_canlock_rwlock(lock, thread, LCK_RW_TYPE_SHARED);
1474 	}
1475 #endif
1476 locked:
1477 	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
1478 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
1479 
1480 #if     CONFIG_DTRACE
1481 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1482 #endif  /* CONFIG_DTRACE */
1483 
1484 #ifdef DEBUG_RW
1485 	add_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED, caller);
1486 #endif /* DEBUG_RW */
1487 
1488 	return TRUE;
1489 }
1490 
1491 __attribute__((noinline))
1492 static void
lck_rw_lock_shared_internal(lck_rw_t * lock,void * caller)1493 lck_rw_lock_shared_internal(
1494 	lck_rw_t        *lock,
1495 	void            *caller)
1496 {
1497 	(void) lck_rw_lock_shared_internal_inline(lock, caller, NULL);
1498 }
1499 
1500 /*!
1501  * @function lck_rw_lock_shared
1502  *
1503  * @abstract
1504  * Locks a rw_lock in shared mode.
1505  *
1506  * @discussion
1507  * This function can block.
1508  * Multiple threads can acquire the lock in shared mode at the same time, but only one thread at a time
1509  * can acquire it in exclusive mode.
1510  * If the lock is held in shared mode and there are no writers waiting, a reader will be able to acquire
1511  * the lock without waiting.
1512  * If the lock is held in shared mode and there is at least a writer waiting, a reader will wait
1513  * for all the writers to make progress if the lock was initialized with the default settings. Instead if
1514  * RW_SHARED_PRIORITY was selected at initialization time, a reader will never wait if the lock is held
1515  * in shared mode.
1516  * NOTE: the thread cannot return to userspace while the lock is held. Recursive locking is not supported.
1517  *
1518  * @param lock           rw_lock to lock.
1519  */
1520 void
lck_rw_lock_shared(lck_rw_t * lock)1521 lck_rw_lock_shared(
1522 	lck_rw_t        *lock)
1523 {
1524 	(void) lck_rw_lock_shared_internal_inline(lock, __builtin_return_address(0), NULL);
1525 }
1526 
1527 /*!
1528  * @function lck_rw_lock_shared_b
1529  *
1530  * @abstract
1531  * Locks a rw_lock in shared mode. Returns early if the lock can't be acquired
1532  * and the specified block returns true.
1533  *
1534  * @discussion
1535  * Identical to lck_rw_lock_shared() but can return early if the lock can't be
1536  * acquired and the specified block returns true. The block is called
1537  * repeatedly when waiting to acquire the lock.
1538  * Should only be called when the lock cannot sleep (i.e. when
1539  * lock->lck_rw_can_sleep is false).
1540  *
1541  * @param lock           rw_lock to lock.
1542  * @param lock_pause     block invoked while waiting to acquire lock
1543  *
1544  * @returns              Returns TRUE if the lock is successfully taken,
1545  *                       FALSE if the block returns true and the lock has
1546  *                       not been acquired.
1547  */
1548 boolean_t
1549 lck_rw_lock_shared_b(
1550 	lck_rw_t        *lock,
1551 	bool            (^lock_pause)(void))
1552 {
1553 	assert(!lock->lck_rw_can_sleep);
1554 
1555 	return lck_rw_lock_shared_internal_inline(lock, __builtin_return_address(0), lock_pause);
1556 }
1557 
1558 /*
1559  *	Routine:	lck_rw_lock_shared_to_exclusive_failure
1560  *	Function:
1561  *		Fast path code has already dropped our read
1562  *		count and determined that someone else owns 'lck_rw_want_upgrade'
1563  *		if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
1564  *		all we need to do here is determine if a wakeup is needed
1565  */
1566 static boolean_t
lck_rw_lock_shared_to_exclusive_failure(lck_rw_t * lck,uint32_t prior_lock_state)1567 lck_rw_lock_shared_to_exclusive_failure(
1568 	lck_rw_t        *lck,
1569 	uint32_t        prior_lock_state)
1570 {
1571 	thread_t        thread = current_thread();
1572 
1573 	if ((prior_lock_state & LCK_RW_W_WAITING) &&
1574 	    ((prior_lock_state & LCK_RW_SHARED_MASK) == LCK_RW_SHARED_READER)) {
1575 		/*
1576 		 *	Someone else has requested upgrade.
1577 		 *	Since we've released the read lock, wake
1578 		 *	him up if he's blocked waiting
1579 		 */
1580 		thread_wakeup(LCK_RW_WRITER_EVENT(lck));
1581 	}
1582 
1583 	/* Check if dropping the lock means that we need to unpromote */
1584 	if (lck->lck_rw_can_sleep) {
1585 		lck_rw_lock_count_dec(thread, lck);
1586 	}
1587 
1588 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1589 	    VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1590 
1591 #ifdef DEBUG_RW
1592 	remove_held_rwlock(lck, thread, LCK_RW_TYPE_SHARED);
1593 #endif /* DEBUG_RW */
1594 
1595 	return FALSE;
1596 }
1597 
1598 /*
1599  *	Routine:	lck_rw_lock_shared_to_exclusive_success
1600  *	Function:
1601  *		the fast path code has already dropped our read
1602  *		count and successfully acquired 'lck_rw_want_upgrade'
1603  *		we just need to wait for the rest of the readers to drain
1604  *		and then we can return as the exclusive holder of this lock
1605  */
1606 static void
lck_rw_lock_shared_to_exclusive_success(lck_rw_t * lock)1607 lck_rw_lock_shared_to_exclusive_success(
1608 	lck_rw_t        *lock)
1609 {
1610 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
1611 	int                     slept = 0;
1612 	lck_rw_word_t           word;
1613 	wait_result_t           res;
1614 	boolean_t               istate;
1615 	lck_rw_drain_state_t    drain_state;
1616 
1617 #if     CONFIG_DTRACE
1618 	uint64_t                wait_interval = 0;
1619 	int                     readers_at_sleep = 0;
1620 	boolean_t               dtrace_ls_initialized = FALSE;
1621 	boolean_t               dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1622 #endif
1623 
1624 	while (lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, FALSE, NULL) != LCK_RW_DRAIN_S_DRAINED) {
1625 		word.data = ordered_load_rw(lock);
1626 #if     CONFIG_DTRACE
1627 		if (dtrace_ls_initialized == FALSE) {
1628 			dtrace_ls_initialized = TRUE;
1629 			dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1630 			dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1631 			dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1632 			if (dtrace_ls_enabled) {
1633 				/*
1634 				 * Either sleeping or spinning is happening,
1635 				 *  start a timing of our delay interval now.
1636 				 */
1637 				readers_at_sleep = word.shared_count;
1638 				wait_interval = mach_absolute_time();
1639 			}
1640 		}
1641 #endif
1642 
1643 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1644 		    trace_lck, word.shared_count, 0, 0, 0);
1645 
1646 		drain_state = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, TRUE, NULL);
1647 
1648 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1649 		    trace_lck, lock->lck_rw_shared_count, 0, 0, 0);
1650 
1651 		if (drain_state == LCK_RW_DRAIN_S_DRAINED) {
1652 			break;
1653 		}
1654 
1655 		/*
1656 		 * if we get here, the spin deadline in lck_rw_wait_on_status()
1657 		 * has expired w/o the rw_shared_count having drained to 0
1658 		 * check to see if we're allowed to do a thread_block
1659 		 */
1660 		if (word.can_sleep) {
1661 			istate = lck_interlock_lock(lock);
1662 
1663 			word.data = ordered_load_rw(lock);
1664 			if (word.shared_count != 0) {
1665 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1666 				    trace_lck, word.shared_count, 0, 0, 0);
1667 
1668 				word.w_waiting = 1;
1669 				ordered_store_rw(lock, word.data);
1670 
1671 				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
1672 				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
1673 				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1674 				lck_interlock_unlock(lock, istate);
1675 
1676 				if (res == THREAD_WAITING) {
1677 					res = thread_block(THREAD_CONTINUE_NULL);
1678 					slept++;
1679 				}
1680 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1681 				    trace_lck, res, slept, 0, 0);
1682 			} else {
1683 				lck_interlock_unlock(lock, istate);
1684 				break;
1685 			}
1686 		}
1687 	}
1688 #if     CONFIG_DTRACE
1689 	/*
1690 	 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
1691 	 */
1692 	if (dtrace_ls_enabled == TRUE) {
1693 		if (slept == 0) {
1694 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lock, mach_absolute_time() - wait_interval, 0);
1695 		} else {
1696 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lock,
1697 			    mach_absolute_time() - wait_interval, 1,
1698 			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1699 		}
1700 	}
1701 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 1);
1702 #endif
1703 }
1704 
1705 /*!
1706  * @function lck_rw_lock_shared_to_exclusive
1707  *
1708  * @abstract
1709  * Upgrades a rw_lock held in shared mode to exclusive.
1710  *
1711  * @discussion
1712  * This function can block.
1713  * Only one reader at a time can upgrade to exclusive mode. If the upgrades fails the function will
1714  * return with the lock not held.
1715  * The caller needs to hold the lock in shared mode to upgrade it.
1716  *
1717  * @param lock           rw_lock already held in shared mode to upgrade.
1718  *
1719  * @returns TRUE if the lock was upgraded, FALSE if it was not possible.
1720  *          If the function was not able to upgrade the lock, the lock will be dropped
1721  *          by the function.
1722  */
1723 boolean_t
lck_rw_lock_shared_to_exclusive(lck_rw_t * lock)1724 lck_rw_lock_shared_to_exclusive(
1725 	lck_rw_t        *lock)
1726 {
1727 	thread_t thread = current_thread();
1728 	uint32_t data, prev;
1729 
1730 	assertf(lock->lck_rw_priv_excl != 0, "lock %p thread %p", lock, current_thread());
1731 
1732 #if DEBUG_RW
1733 	assert_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED);
1734 #endif /* DEBUG_RW */
1735 
1736 	for (;;) {
1737 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1738 		if (data & LCK_RW_INTERLOCK) {
1739 			atomic_exchange_abort();
1740 			lck_rw_interlock_spin(lock);
1741 			continue;
1742 		}
1743 		if (data & LCK_RW_WANT_UPGRADE) {
1744 			data -= LCK_RW_SHARED_READER;
1745 			if ((data & LCK_RW_SHARED_MASK) == 0) {         /* we were the last reader */
1746 				data &= ~(LCK_RW_W_WAITING);            /* so clear the wait indicator */
1747 			}
1748 			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1749 				return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
1750 			}
1751 		} else {
1752 			data |= LCK_RW_WANT_UPGRADE;            /* ask for WANT_UPGRADE */
1753 			data -= LCK_RW_SHARED_READER;           /* and shed our read count */
1754 			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1755 				break;
1756 			}
1757 		}
1758 		cpu_pause();
1759 	}
1760 	/* we now own the WANT_UPGRADE */
1761 	if (data & LCK_RW_SHARED_MASK) {        /* check to see if all of the readers are drained */
1762 		lck_rw_lock_shared_to_exclusive_success(lock);  /* if not, we need to go wait */
1763 	}
1764 
1765 	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
1766 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
1767 
1768 	ordered_store_rw_owner(lock, thread->ctid);
1769 #if     CONFIG_DTRACE
1770 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
1771 #endif  /* CONFIG_DTRACE */
1772 
1773 #if DEBUG_RW
1774 	change_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED, __builtin_return_address(0));
1775 #endif /* DEBUG_RW */
1776 	return TRUE;
1777 }
1778 
1779 /*
1780  *      Routine:        lck_rw_lock_exclusive_to_shared_gen
1781  *      Function:
1782  *		Fast path has already dropped
1783  *		our exclusive state and bumped lck_rw_shared_count
1784  *		all we need to do here is determine if anyone
1785  *		needs to be awakened.
1786  */
1787 static void
lck_rw_lock_exclusive_to_shared_gen(lck_rw_t * lck,uint32_t prior_lock_state,void * caller)1788 lck_rw_lock_exclusive_to_shared_gen(
1789 	lck_rw_t        *lck,
1790 	uint32_t        prior_lock_state,
1791 	void            *caller)
1792 {
1793 #pragma unused(caller)
1794 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1795 	lck_rw_word_t   fake_lck;
1796 
1797 	/*
1798 	 * prior_lock state is a snapshot of the 1st word of the
1799 	 * lock in question... we'll fake up a pointer to it
1800 	 * and carefully not access anything beyond whats defined
1801 	 * in the first word of a lck_rw_t
1802 	 */
1803 	fake_lck.data = prior_lock_state;
1804 
1805 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1806 	    trace_lck, fake_lck->want_excl, fake_lck->want_upgrade, 0, 0);
1807 
1808 	/*
1809 	 * don't wake up anyone waiting to take the lock exclusively
1810 	 * since we hold a read count... when the read count drops to 0,
1811 	 * the writers will be woken.
1812 	 *
1813 	 * wake up any waiting readers if we don't have any writers waiting,
1814 	 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1815 	 */
1816 	if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
1817 		thread_wakeup(LCK_RW_READER_EVENT(lck));
1818 	}
1819 
1820 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1821 	    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1822 
1823 #if CONFIG_DTRACE
1824 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1825 #endif
1826 
1827 #if DEBUG_RW
1828 	thread_t        thread = current_thread();
1829 	change_held_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE, caller);
1830 #endif /* DEBUG_RW */
1831 }
1832 
1833 /*!
1834  * @function lck_rw_lock_exclusive_to_shared
1835  *
1836  * @abstract
1837  * Downgrades a rw_lock held in exclusive mode to shared.
1838  *
1839  * @discussion
1840  * The caller needs to hold the lock in exclusive mode to be able to downgrade it.
1841  *
1842  * @param lock           rw_lock already held in exclusive mode to downgrade.
1843  */
1844 void
lck_rw_lock_exclusive_to_shared(lck_rw_t * lock)1845 lck_rw_lock_exclusive_to_shared(
1846 	lck_rw_t        *lock)
1847 {
1848 	uint32_t        data, prev;
1849 
1850 	assertf(lock->lck_rw_owner == current_thread()->ctid,
1851 	    "state=0x%x, owner=%p", lock->lck_rw_data,
1852 	    ctid_get_thread_unsafe(lock->lck_rw_owner));
1853 	ordered_store_rw_owner(lock, 0);
1854 
1855 	for (;;) {
1856 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
1857 		if (data & LCK_RW_INTERLOCK) {
1858 			atomic_exchange_abort();
1859 			lck_rw_interlock_spin(lock);    /* wait for interlock to clear */
1860 			continue;
1861 		}
1862 		data += LCK_RW_SHARED_READER;
1863 		if (data & LCK_RW_WANT_UPGRADE) {
1864 			data &= ~(LCK_RW_WANT_UPGRADE);
1865 		} else {
1866 			data &= ~(LCK_RW_WANT_EXCL);
1867 		}
1868 		if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) {
1869 			data &= ~(LCK_RW_W_WAITING);
1870 		}
1871 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
1872 			break;
1873 		}
1874 		cpu_pause();
1875 	}
1876 	lck_rw_lock_exclusive_to_shared_gen(lock, prev, __builtin_return_address(0));
1877 }
1878 
1879 /*
1880  * Very sad hack, but the codegen for lck_rw_lock
1881  * is very unhappy with the combination of __builtin_return_address()
1882  * and a noreturn function. For some reason it adds more frames
1883  * than it should. rdar://76570684
1884  */
1885 void
1886 _lck_rw_lock_type_panic(lck_rw_t *lck, lck_rw_type_t lck_rw_type);
1887 #pragma clang diagnostic push
1888 #pragma clang diagnostic ignored "-Wmissing-noreturn"
1889 __attribute__((noinline, weak))
1890 void
_lck_rw_lock_type_panic(lck_rw_t * lck,lck_rw_type_t lck_rw_type)1891 _lck_rw_lock_type_panic(
1892 	lck_rw_t        *lck,
1893 	lck_rw_type_t   lck_rw_type)
1894 {
1895 	panic("lck_rw_lock(): Invalid RW lock type: %x for lock %p", lck_rw_type, lck);
1896 }
1897 #pragma clang diagnostic pop
1898 
1899 /*!
1900  * @function lck_rw_lock
1901  *
1902  * @abstract
1903  * Locks a rw_lock with the specified type.
1904  *
1905  * @discussion
1906  * See lck_rw_lock_shared() or lck_rw_lock_exclusive() for more details.
1907  *
1908  * @param lck           rw_lock to lock.
1909  * @param lck_rw_type   LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
1910  */
1911 void
lck_rw_lock(lck_rw_t * lck,lck_rw_type_t lck_rw_type)1912 lck_rw_lock(
1913 	lck_rw_t        *lck,
1914 	lck_rw_type_t   lck_rw_type)
1915 {
1916 	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1917 		return lck_rw_lock_shared_internal(lck, __builtin_return_address(0));
1918 	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1919 		return lck_rw_lock_exclusive_internal(lck, __builtin_return_address(0));
1920 	}
1921 	_lck_rw_lock_type_panic(lck, lck_rw_type);
1922 }
1923 
1924 __attribute__((always_inline))
1925 static boolean_t
lck_rw_try_lock_shared_internal_inline(lck_rw_t * lock,void * caller)1926 lck_rw_try_lock_shared_internal_inline(
1927 	lck_rw_t        *lock,
1928 	void            *caller)
1929 {
1930 #pragma unused(caller)
1931 
1932 	uint32_t        data, prev;
1933 	thread_t        thread = current_thread();
1934 #ifdef DEBUG_RW
1935 	boolean_t       check_canlock = TRUE;
1936 #endif
1937 
1938 	for (;;) {
1939 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1940 		if (data & LCK_RW_INTERLOCK) {
1941 			atomic_exchange_abort();
1942 			lck_rw_interlock_spin(lock);
1943 			continue;
1944 		}
1945 		if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1946 			atomic_exchange_abort();
1947 			return FALSE;             /* lock is busy */
1948 		}
1949 #ifdef DEBUG_RW
1950 		if ((data & LCK_RW_SHARED_MASK) == 0) {
1951 			/*
1952 			 * If the lock is uncontended,
1953 			 * we do not need to check if we can lock it
1954 			 */
1955 			check_canlock = FALSE;
1956 		}
1957 #endif
1958 		data += LCK_RW_SHARED_READER;     /* Increment reader refcount */
1959 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1960 			break;
1961 		}
1962 		cpu_pause();
1963 	}
1964 #ifdef DEBUG_RW
1965 	if (check_canlock) {
1966 		/*
1967 		 * Best effort attempt to check that this thread
1968 		 * is not already holding the lock (this checks read mode too).
1969 		 */
1970 		assert_canlock_rwlock(lock, thread, LCK_RW_TYPE_SHARED);
1971 	}
1972 #endif
1973 	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
1974 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
1975 
1976 	if (lock->lck_rw_can_sleep) {
1977 		lck_rw_lock_count_inc(thread, lock);
1978 	} else if (get_preemption_level() == 0) {
1979 		panic("Taking non-sleepable RW lock with preemption enabled");
1980 	}
1981 
1982 #if     CONFIG_DTRACE
1983 	LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1984 #endif  /* CONFIG_DTRACE */
1985 
1986 #ifdef DEBUG_RW
1987 	add_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED, caller);
1988 #endif /* DEBUG_RW */
1989 	return TRUE;
1990 }
1991 
1992 __attribute__((noinline))
1993 static boolean_t
lck_rw_try_lock_shared_internal(lck_rw_t * lock,void * caller)1994 lck_rw_try_lock_shared_internal(
1995 	lck_rw_t        *lock,
1996 	void            *caller)
1997 {
1998 	return lck_rw_try_lock_shared_internal_inline(lock, caller);
1999 }
2000 
2001 /*!
2002  * @function lck_rw_try_lock_shared
2003  *
2004  * @abstract
2005  * Tries to locks a rw_lock in read mode.
2006  *
2007  * @discussion
2008  * This function will return and not block in case the lock is already held.
2009  * See lck_rw_lock_shared for more details.
2010  *
2011  * @param lock           rw_lock to lock.
2012  *
2013  * @returns TRUE if the lock is successfully acquired, FALSE in case it was already held.
2014  */
2015 boolean_t
lck_rw_try_lock_shared(lck_rw_t * lock)2016 lck_rw_try_lock_shared(
2017 	lck_rw_t        *lock)
2018 {
2019 	return lck_rw_try_lock_shared_internal_inline(lock, __builtin_return_address(0));
2020 }
2021 
2022 __attribute__((always_inline))
2023 static boolean_t
lck_rw_try_lock_exclusive_internal_inline(lck_rw_t * lock,void * caller)2024 lck_rw_try_lock_exclusive_internal_inline(
2025 	lck_rw_t        *lock,
2026 	void            *caller)
2027 {
2028 #pragma unused(caller)
2029 	uint32_t        data, prev;
2030 
2031 	for (;;) {
2032 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
2033 		if (data & LCK_RW_INTERLOCK) {
2034 			atomic_exchange_abort();
2035 			lck_rw_interlock_spin(lock);
2036 			continue;
2037 		}
2038 		if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
2039 			atomic_exchange_abort();
2040 			return FALSE;
2041 		}
2042 		data |= LCK_RW_WANT_EXCL;
2043 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
2044 			break;
2045 		}
2046 		cpu_pause();
2047 	}
2048 	thread_t thread = current_thread();
2049 
2050 	if (lock->lck_rw_can_sleep) {
2051 		lck_rw_lock_count_inc(thread, lock);
2052 	} else if (get_preemption_level() == 0) {
2053 		panic("Taking non-sleepable RW lock with preemption enabled");
2054 	}
2055 
2056 	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
2057 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
2058 
2059 	ordered_store_rw_owner(lock, thread->ctid);
2060 #if     CONFIG_DTRACE
2061 	LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
2062 #endif  /* CONFIG_DTRACE */
2063 
2064 #ifdef DEBUG_RW
2065 	add_held_rwlock(lock, thread, LCK_RW_TYPE_EXCLUSIVE, caller);
2066 #endif /* DEBUG_RW */
2067 	return TRUE;
2068 }
2069 
2070 __attribute__((noinline))
2071 static boolean_t
lck_rw_try_lock_exclusive_internal(lck_rw_t * lock,void * caller)2072 lck_rw_try_lock_exclusive_internal(
2073 	lck_rw_t        *lock,
2074 	void            *caller)
2075 {
2076 	return lck_rw_try_lock_exclusive_internal_inline(lock, caller);
2077 }
2078 
2079 /*!
2080  * @function lck_rw_try_lock_exclusive
2081  *
2082  * @abstract
2083  * Tries to locks a rw_lock in write mode.
2084  *
2085  * @discussion
2086  * This function will return and not block in case the lock is already held.
2087  * See lck_rw_lock_exclusive for more details.
2088  *
2089  * @param lock           rw_lock to lock.
2090  *
2091  * @returns TRUE if the lock is successfully acquired, FALSE in case it was already held.
2092  */
2093 boolean_t
lck_rw_try_lock_exclusive(lck_rw_t * lock)2094 lck_rw_try_lock_exclusive(
2095 	lck_rw_t        *lock)
2096 {
2097 	return lck_rw_try_lock_exclusive_internal_inline(lock, __builtin_return_address(0));
2098 }
2099 
2100 /*
2101  * Very sad hack, but the codegen for lck_rw_try_lock
2102  * is very unhappy with the combination of __builtin_return_address()
2103  * and a noreturn function. For some reason it adds more frames
2104  * than it should. rdar://76570684
2105  */
2106 boolean_t
2107 _lck_rw_try_lock_type_panic(lck_rw_t *lck, lck_rw_type_t lck_rw_type);
2108 #pragma clang diagnostic push
2109 #pragma clang diagnostic ignored "-Wmissing-noreturn"
2110 __attribute__((noinline, weak))
2111 boolean_t
_lck_rw_try_lock_type_panic(lck_rw_t * lck,lck_rw_type_t lck_rw_type)2112 _lck_rw_try_lock_type_panic(
2113 	lck_rw_t        *lck,
2114 	lck_rw_type_t   lck_rw_type)
2115 {
2116 	panic("lck_rw_lock(): Invalid RW lock type: %x for lock %p", lck_rw_type, lck);
2117 }
2118 #pragma clang diagnostic pop
2119 
2120 /*!
2121  * @function lck_rw_try_lock
2122  *
2123  * @abstract
2124  * Tries to locks a rw_lock with the specified type.
2125  *
2126  * @discussion
2127  * This function will return and not wait/block in case the lock is already held.
2128  * See lck_rw_try_lock_shared() or lck_rw_try_lock_exclusive() for more details.
2129  *
2130  * @param lck           rw_lock to lock.
2131  * @param lck_rw_type   LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
2132  *
2133  * @returns TRUE if the lock is successfully acquired, FALSE in case it was already held.
2134  */
2135 boolean_t
lck_rw_try_lock(lck_rw_t * lck,lck_rw_type_t lck_rw_type)2136 lck_rw_try_lock(
2137 	lck_rw_t        *lck,
2138 	lck_rw_type_t   lck_rw_type)
2139 {
2140 	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
2141 		return lck_rw_try_lock_shared_internal(lck, __builtin_return_address(0));
2142 	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
2143 		return lck_rw_try_lock_exclusive_internal(lck, __builtin_return_address(0));
2144 	}
2145 	return _lck_rw_try_lock_type_panic(lck, lck_rw_type);
2146 }
2147 
2148 /*
2149  *      Routine:        lck_rw_done_gen
2150  *
2151  *	prior_lock_state is the value in the 1st
2152  *      word of the lock at the time of a successful
2153  *	atomic compare and exchange with the new value...
2154  *      it represents the state of the lock before we
2155  *	decremented the rw_shared_count or cleared either
2156  *      rw_want_upgrade or rw_want_write and
2157  *	the lck_x_waiting bits...  since the wrapper
2158  *      routine has already changed the state atomically,
2159  *	we just need to decide if we should
2160  *	wake up anyone and what value to return... we do
2161  *	this by examining the state of the lock before
2162  *	we changed it
2163  */
2164 static lck_rw_type_t
lck_rw_done_gen(lck_rw_t * lck,uint32_t prior_lock_state)2165 lck_rw_done_gen(
2166 	lck_rw_t        *lck,
2167 	uint32_t        prior_lock_state)
2168 {
2169 	lck_rw_word_t   fake_lck;
2170 	lck_rw_type_t   lock_type;
2171 	thread_t        thread;
2172 
2173 	/*
2174 	 * prior_lock state is a snapshot of the 1st word of the
2175 	 * lock in question... we'll fake up a pointer to it
2176 	 * and carefully not access anything beyond whats defined
2177 	 * in the first word of a lck_rw_t
2178 	 */
2179 	fake_lck.data = prior_lock_state;
2180 
2181 	if (fake_lck.shared_count <= 1) {
2182 		if (fake_lck.w_waiting) {
2183 			thread_wakeup(LCK_RW_WRITER_EVENT(lck));
2184 		}
2185 
2186 		if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
2187 			thread_wakeup(LCK_RW_READER_EVENT(lck));
2188 		}
2189 	}
2190 	if (fake_lck.shared_count) {
2191 		lock_type = LCK_RW_TYPE_SHARED;
2192 	} else {
2193 		lock_type = LCK_RW_TYPE_EXCLUSIVE;
2194 	}
2195 
2196 	/* Check if dropping the lock means that we need to unpromote */
2197 	thread = current_thread();
2198 	if (fake_lck.can_sleep) {
2199 		lck_rw_lock_count_dec(thread, lck);
2200 	}
2201 
2202 #if CONFIG_DTRACE
2203 	LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
2204 #endif
2205 
2206 #ifdef DEBUG_RW
2207 	remove_held_rwlock(lck, thread, lock_type);
2208 #endif /* DEBUG_RW */
2209 	return lock_type;
2210 }
2211 
2212 /*!
2213  * @function lck_rw_done
2214  *
2215  * @abstract
2216  * Force unlocks a rw_lock without consistency checks.
2217  *
2218  * @discussion
2219  * Do not use unless sure you can avoid consistency checks.
2220  *
2221  * @param lock           rw_lock to unlock.
2222  */
2223 lck_rw_type_t
lck_rw_done(lck_rw_t * lock)2224 lck_rw_done(
2225 	lck_rw_t        *lock)
2226 {
2227 	uint32_t        data, prev;
2228 	boolean_t       once = FALSE;
2229 
2230 #ifdef DEBUG_RW
2231 	/*
2232 	 * Best effort attempt to check that this thread
2233 	 * is holding the lock.
2234 	 */
2235 	thread_t thread = current_thread();
2236 	assert_held_rwlock(lock, thread, 0);
2237 #endif /* DEBUG_RW */
2238 	for (;;) {
2239 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
2240 		if (data & LCK_RW_INTERLOCK) {          /* wait for interlock to clear */
2241 			atomic_exchange_abort();
2242 			lck_rw_interlock_spin(lock);
2243 			continue;
2244 		}
2245 		if (data & LCK_RW_SHARED_MASK) {        /* lock is held shared */
2246 			assertf(lock->lck_rw_owner == 0,
2247 			    "state=0x%x, owner=%p", lock->lck_rw_data,
2248 			    ctid_get_thread_unsafe(lock->lck_rw_owner));
2249 			data -= LCK_RW_SHARED_READER;
2250 			if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */
2251 				goto check_waiters;
2252 			}
2253 		} else {                                        /* if reader count == 0, must be exclusive lock */
2254 			if (data & LCK_RW_WANT_UPGRADE) {
2255 				data &= ~(LCK_RW_WANT_UPGRADE);
2256 			} else {
2257 				if (data & LCK_RW_WANT_EXCL) {
2258 					data &= ~(LCK_RW_WANT_EXCL);
2259 				} else {                                /* lock is not 'owned', panic */
2260 					panic("Releasing non-exclusive RW lock without a reader refcount!");
2261 				}
2262 			}
2263 			if (!once) {
2264 				// Only check for holder and clear it once
2265 				assertf(lock->lck_rw_owner == current_thread()->ctid,
2266 				    "state=0x%x, owner=%p", lock->lck_rw_data,
2267 				    ctid_get_thread_unsafe(lock->lck_rw_owner));
2268 				ordered_store_rw_owner(lock, 0);
2269 				once = TRUE;
2270 			}
2271 check_waiters:
2272 			/*
2273 			 * test the original values to match what
2274 			 * lck_rw_done_gen is going to do to determine
2275 			 * which wakeups need to happen...
2276 			 *
2277 			 * if !(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting)
2278 			 */
2279 			if (prev & LCK_RW_W_WAITING) {
2280 				data &= ~(LCK_RW_W_WAITING);
2281 				if ((prev & LCK_RW_PRIV_EXCL) == 0) {
2282 					data &= ~(LCK_RW_R_WAITING);
2283 				}
2284 			} else {
2285 				data &= ~(LCK_RW_R_WAITING);
2286 			}
2287 		}
2288 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
2289 			break;
2290 		}
2291 		cpu_pause();
2292 	}
2293 	return lck_rw_done_gen(lock, prev);
2294 }
2295 
2296 /*!
2297  * @function lck_rw_unlock_shared
2298  *
2299  * @abstract
2300  * Unlocks a rw_lock previously locked in shared mode.
2301  *
2302  * @discussion
2303  * The same thread that locked the lock needs to unlock it.
2304  *
2305  * @param lck           rw_lock held in shared mode to unlock.
2306  */
2307 void
lck_rw_unlock_shared(lck_rw_t * lck)2308 lck_rw_unlock_shared(
2309 	lck_rw_t        *lck)
2310 {
2311 	lck_rw_type_t   ret;
2312 
2313 	assertf(lck->lck_rw_owner == 0,
2314 	    "state=0x%x, owner=%p", lck->lck_rw_data,
2315 	    ctid_get_thread_unsafe(lck->lck_rw_owner));
2316 	assertf(lck->lck_rw_shared_count > 0, "shared_count=0x%x", lck->lck_rw_shared_count);
2317 	ret = lck_rw_done(lck);
2318 
2319 	if (ret != LCK_RW_TYPE_SHARED) {
2320 		panic("lck_rw_unlock_shared(): lock %p held in mode: %d", lck, ret);
2321 	}
2322 }
2323 
2324 /*!
2325  * @function lck_rw_unlock_exclusive
2326  *
2327  * @abstract
2328  * Unlocks a rw_lock previously locked in exclusive mode.
2329  *
2330  * @discussion
2331  * The same thread that locked the lock needs to unlock it.
2332  *
2333  * @param lck           rw_lock held in exclusive mode to unlock.
2334  */
2335 void
lck_rw_unlock_exclusive(lck_rw_t * lck)2336 lck_rw_unlock_exclusive(
2337 	lck_rw_t        *lck)
2338 {
2339 	lck_rw_type_t   ret;
2340 
2341 	assertf(lck->lck_rw_owner == current_thread()->ctid,
2342 	    "state=0x%x, owner=%p", lck->lck_rw_data,
2343 	    ctid_get_thread_unsafe(lck->lck_rw_owner));
2344 	ret = lck_rw_done(lck);
2345 
2346 	if (ret != LCK_RW_TYPE_EXCLUSIVE) {
2347 		panic("lck_rw_unlock_exclusive(): lock %p held in mode: %d", lck, ret);
2348 	}
2349 }
2350 
2351 /*!
2352  * @function lck_rw_unlock
2353  *
2354  * @abstract
2355  * Unlocks a rw_lock previously locked with lck_rw_type.
2356  *
2357  * @discussion
2358  * The lock must be unlocked by the same thread it was locked from.
2359  * The type of the lock/unlock have to match, unless an upgrade/downgrade was performed while
2360  * holding the lock.
2361  *
2362  * @param lck           rw_lock to unlock.
2363  * @param lck_rw_type   LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
2364  */
2365 void
lck_rw_unlock(lck_rw_t * lck,lck_rw_type_t lck_rw_type)2366 lck_rw_unlock(
2367 	lck_rw_t         *lck,
2368 	lck_rw_type_t    lck_rw_type)
2369 {
2370 	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
2371 		lck_rw_unlock_shared(lck);
2372 	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
2373 		lck_rw_unlock_exclusive(lck);
2374 	} else {
2375 		panic("lck_rw_unlock(): Invalid RW lock type: %d", lck_rw_type);
2376 	}
2377 }
2378 
2379 /*!
2380  * @function lck_rw_assert
2381  *
2382  * @abstract
2383  * Asserts the rw_lock is held.
2384  *
2385  * @discussion
2386  * read-write locks do not have a concept of ownership when held in shared mode,
2387  * so this function merely asserts that someone is holding the lock, not necessarily the caller.
2388  * However if rw_lock_debug is on, a best effort mechanism to track the owners is in place, and
2389  * this function can be more accurate.
2390  * Type can be LCK_RW_ASSERT_SHARED, LCK_RW_ASSERT_EXCLUSIVE, LCK_RW_ASSERT_HELD
2391  * LCK_RW_ASSERT_NOTHELD.
2392  *
2393  * @param lck   rw_lock to check.
2394  * @param type  assert type
2395  */
2396 void
lck_rw_assert(lck_rw_t * lck,unsigned int type)2397 lck_rw_assert(
2398 	lck_rw_t        *lck,
2399 	unsigned int    type)
2400 {
2401 	thread_t thread = current_thread();
2402 
2403 	switch (type) {
2404 	case LCK_RW_ASSERT_SHARED:
2405 		if ((lck->lck_rw_shared_count != 0) &&
2406 		    (lck->lck_rw_owner == 0)) {
2407 #if DEBUG_RW
2408 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_SHARED);
2409 #endif /* DEBUG_RW */
2410 			return;
2411 		}
2412 		break;
2413 	case LCK_RW_ASSERT_EXCLUSIVE:
2414 		if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
2415 		    (lck->lck_rw_shared_count == 0) &&
2416 		    (lck->lck_rw_owner == thread->ctid)) {
2417 #if DEBUG_RW
2418 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE);
2419 #endif /* DEBUG_RW */
2420 			return;
2421 		}
2422 		break;
2423 	case LCK_RW_ASSERT_HELD:
2424 		if (lck->lck_rw_shared_count != 0) {
2425 #if DEBUG_RW
2426 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_SHARED);
2427 #endif /* DEBUG_RW */
2428 			return;         // Held shared
2429 		}
2430 		if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
2431 		    (lck->lck_rw_owner == thread->ctid)) {
2432 #if DEBUG_RW
2433 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE);
2434 #endif /* DEBUG_RW */
2435 			return;         // Held exclusive
2436 		}
2437 		break;
2438 	case LCK_RW_ASSERT_NOTHELD:
2439 		if ((lck->lck_rw_shared_count == 0) &&
2440 		    !(lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
2441 		    (lck->lck_rw_owner == 0)) {
2442 #ifdef DEBUG_RW
2443 			assert_canlock_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE);
2444 #endif /* DEBUG_RW */
2445 			return;
2446 		}
2447 		break;
2448 	default:
2449 		break;
2450 	}
2451 	panic("rw lock (%p)%s held (mode=%u)", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type);
2452 }
2453 
2454 /*!
2455  * @function kdp_lck_rw_lock_is_acquired_exclusive
2456  *
2457  * @abstract
2458  * Checks if a rw_lock is held exclusevely.
2459  *
2460  * @discussion
2461  * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2462  *
2463  * @param lck   lock to check
2464  *
2465  * @returns TRUE if the lock is held exclusevely
2466  */
2467 boolean_t
kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t * lck)2468 kdp_lck_rw_lock_is_acquired_exclusive(
2469 	lck_rw_t        *lck)
2470 {
2471 	if (not_in_kdp) {
2472 		panic("panic: rw lock exclusive check done outside of kernel debugger");
2473 	}
2474 	return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_excl) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
2475 }
2476 
2477 void
kdp_rwlck_find_owner(__unused struct waitq * waitq,event64_t event,thread_waitinfo_t * waitinfo)2478 kdp_rwlck_find_owner(
2479 	__unused struct waitq   *waitq,
2480 	event64_t               event,
2481 	thread_waitinfo_t       *waitinfo)
2482 {
2483 	lck_rw_t        *rwlck = NULL;
2484 	switch (waitinfo->wait_type) {
2485 	case kThreadWaitKernelRWLockRead:
2486 		rwlck = READ_EVENT_TO_RWLOCK(event);
2487 		break;
2488 	case kThreadWaitKernelRWLockWrite:
2489 	case kThreadWaitKernelRWLockUpgrade:
2490 		rwlck = WRITE_EVENT_TO_RWLOCK(event);
2491 		break;
2492 	default:
2493 		panic("%s was called with an invalid blocking type", __FUNCTION__);
2494 		break;
2495 	}
2496 	waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
2497 	waitinfo->owner = thread_tid(ctid_get_thread(rwlck->lck_rw_owner));
2498 }
2499 
2500 /*!
2501  * @function lck_rw_lock_would_yield_shared
2502  *
2503  * @abstract
2504  * Check whether a rw_lock currently held in shared mode would be yielded
2505  *
2506  * @discussion
2507  * This function can be used when lck_rw_lock_yield_shared() would be
2508  * inappropriate due to the need to perform additional housekeeping
2509  * prior to any yield or when the caller may wish to prematurely terminate
2510  * an operation rather than resume it after regaining the lock.
2511  *
2512  * @param lck           rw_lock already held in shared mode to yield.
2513  *
2514  * @returns TRUE if the lock would yield, FALSE otherwise
2515  */
2516 bool
lck_rw_lock_would_yield_shared(lck_rw_t * lck)2517 lck_rw_lock_would_yield_shared(
2518 	lck_rw_t        *lck)
2519 {
2520 	lck_rw_word_t   word;
2521 
2522 	lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
2523 
2524 	word.data = ordered_load_rw(lck);
2525 	if (word.want_excl || word.want_upgrade) {
2526 		return true;
2527 	}
2528 
2529 	return false;
2530 }
2531 
2532 /*!
2533  * @function lck_rw_lock_yield_shared
2534  *
2535  * @abstract
2536  * Yields a rw_lock held in shared mode.
2537  *
2538  * @discussion
2539  * This function can block.
2540  * Yields the lock in case there are writers waiting.
2541  * The yield will unlock, block, and re-lock the lock in shared mode.
2542  *
2543  * @param lck           rw_lock already held in shared mode to yield.
2544  * @param force_yield   if set to true it will always yield irrespective of the lock status
2545  *
2546  * @returns TRUE if the lock was yield, FALSE otherwise
2547  */
2548 bool
lck_rw_lock_yield_shared(lck_rw_t * lck,boolean_t force_yield)2549 lck_rw_lock_yield_shared(
2550 	lck_rw_t        *lck,
2551 	boolean_t       force_yield)
2552 {
2553 	if (lck_rw_lock_would_yield_shared(lck) || force_yield) {
2554 		lck_rw_unlock_shared(lck);
2555 		mutex_pause(2);
2556 		lck_rw_lock_shared(lck);
2557 		return true;
2558 	}
2559 
2560 	return false;
2561 }
2562 
2563 /*!
2564  * @function lck_rw_lock_would_yield_exclusive
2565  *
2566  * @abstract
2567  * Check whether a rw_lock currently held in exclusive mode would be yielded
2568  *
2569  * @discussion
2570  * This function can be used when lck_rw_lock_yield_exclusive would be
2571  * inappropriate due to the need to perform additional housekeeping
2572  * prior to any yield or when the caller may wish to prematurely terminate
2573  * an operation rather than resume it after regaining the lock.
2574  *
2575  * @param lck           rw_lock already held in exclusive mode to yield.
2576  * @param mode          when to yield.
2577  *
2578  * @returns TRUE if the lock would yield, FALSE otherwise
2579  */
2580 bool
lck_rw_lock_would_yield_exclusive(lck_rw_t * lck,lck_rw_yield_t mode)2581 lck_rw_lock_would_yield_exclusive(
2582 	lck_rw_t        *lck,
2583 	lck_rw_yield_t  mode)
2584 {
2585 	lck_rw_word_t word;
2586 	bool yield = false;
2587 
2588 	lck_rw_assert(lck, LCK_RW_ASSERT_EXCLUSIVE);
2589 
2590 	if (mode == LCK_RW_YIELD_ALWAYS) {
2591 		yield = true;
2592 	} else {
2593 		word.data = ordered_load_rw(lck);
2594 		if (word.w_waiting) {
2595 			yield = true;
2596 		} else if (mode == LCK_RW_YIELD_ANY_WAITER) {
2597 			yield = (word.r_waiting != 0);
2598 		}
2599 	}
2600 
2601 	return yield;
2602 }
2603 
2604 /*!
2605  * @function lck_rw_lock_yield_exclusive
2606  *
2607  * @abstract
2608  * Yields a rw_lock held in exclusive mode.
2609  *
2610  * @discussion
2611  * This function can block.
2612  * Yields the lock in case there are writers waiting.
2613  * The yield will unlock, block, and re-lock the lock in exclusive mode.
2614  *
2615  * @param lck           rw_lock already held in exclusive mode to yield.
2616  * @param mode          when to yield.
2617  *
2618  * @returns TRUE if the lock was yield, FALSE otherwise
2619  */
2620 bool
lck_rw_lock_yield_exclusive(lck_rw_t * lck,lck_rw_yield_t mode)2621 lck_rw_lock_yield_exclusive(
2622 	lck_rw_t        *lck,
2623 	lck_rw_yield_t  mode)
2624 {
2625 	bool yield = lck_rw_lock_would_yield_exclusive(lck, mode);
2626 
2627 	if (yield) {
2628 		lck_rw_unlock_exclusive(lck);
2629 		mutex_pause(2);
2630 		lck_rw_lock_exclusive(lck);
2631 	}
2632 
2633 	return yield;
2634 }
2635 
2636 /*!
2637  * @function lck_rw_sleep
2638  *
2639  * @abstract
2640  * Assert_wait on an event while holding the rw_lock.
2641  *
2642  * @discussion
2643  * the flags can decide how to re-acquire the lock upon wake up
2644  * (LCK_SLEEP_SHARED, or LCK_SLEEP_EXCLUSIVE, or LCK_SLEEP_UNLOCK)
2645  * and if the priority needs to be kept boosted until the lock is
2646  * re-acquired (LCK_SLEEP_PROMOTED_PRI).
2647  *
2648  * @param lck                   rw_lock to use to synch the assert_wait.
2649  * @param lck_sleep_action      flags.
2650  * @param event                 event to assert_wait on.
2651  * @param interruptible         wait type.
2652  */
2653 wait_result_t
lck_rw_sleep(lck_rw_t * lck,lck_sleep_action_t lck_sleep_action,event_t event,wait_interrupt_t interruptible)2654 lck_rw_sleep(
2655 	lck_rw_t                *lck,
2656 	lck_sleep_action_t      lck_sleep_action,
2657 	event_t                 event,
2658 	wait_interrupt_t        interruptible)
2659 {
2660 	wait_result_t           res;
2661 	lck_rw_type_t           lck_rw_type;
2662 	thread_pri_floor_t      token;
2663 
2664 	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0) {
2665 		panic("Invalid lock sleep action %x", lck_sleep_action);
2666 	}
2667 
2668 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2669 		/*
2670 		 * Although we are dropping the RW lock, the intent in most cases
2671 		 * is that this thread remains as an observer, since it may hold
2672 		 * some secondary resource, but must yield to avoid deadlock. In
2673 		 * this situation, make sure that the thread is boosted to the
2674 		 * ceiling while blocked, so that it can re-acquire the
2675 		 * RW lock at that priority.
2676 		 */
2677 		token = thread_priority_floor_start();
2678 	}
2679 
2680 	res = assert_wait(event, interruptible);
2681 	if (res == THREAD_WAITING) {
2682 		lck_rw_type = lck_rw_done(lck);
2683 		res = thread_block(THREAD_CONTINUE_NULL);
2684 		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
2685 			if (!(lck_sleep_action & (LCK_SLEEP_SHARED | LCK_SLEEP_EXCLUSIVE))) {
2686 				lck_rw_lock(lck, lck_rw_type);
2687 			} else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE) {
2688 				lck_rw_lock_exclusive(lck);
2689 			} else {
2690 				lck_rw_lock_shared(lck);
2691 			}
2692 		}
2693 	} else if (lck_sleep_action & LCK_SLEEP_UNLOCK) {
2694 		(void)lck_rw_done(lck);
2695 	}
2696 
2697 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2698 		thread_priority_floor_end(&token);
2699 	}
2700 
2701 	return res;
2702 }
2703 
2704 /*!
2705  * @function lck_rw_sleep_deadline
2706  *
2707  * @abstract
2708  * Assert_wait_deadline on an event while holding the rw_lock.
2709  *
2710  * @discussion
2711  * the flags can decide how to re-acquire the lock upon wake up
2712  * (LCK_SLEEP_SHARED, or LCK_SLEEP_EXCLUSIVE, or LCK_SLEEP_UNLOCK)
2713  * and if the priority needs to be kept boosted until the lock is
2714  * re-acquired (LCK_SLEEP_PROMOTED_PRI).
2715  *
2716  * @param lck                   rw_lock to use to synch the assert_wait.
2717  * @param lck_sleep_action      flags.
2718  * @param event                 event to assert_wait on.
2719  * @param interruptible         wait type.
2720  * @param deadline              maximum time after which being woken up
2721  */
2722 wait_result_t
lck_rw_sleep_deadline(lck_rw_t * lck,lck_sleep_action_t lck_sleep_action,event_t event,wait_interrupt_t interruptible,uint64_t deadline)2723 lck_rw_sleep_deadline(
2724 	lck_rw_t                *lck,
2725 	lck_sleep_action_t      lck_sleep_action,
2726 	event_t                 event,
2727 	wait_interrupt_t        interruptible,
2728 	uint64_t                deadline)
2729 {
2730 	wait_result_t           res;
2731 	lck_rw_type_t           lck_rw_type;
2732 	thread_pri_floor_t      token;
2733 
2734 	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0) {
2735 		panic("Invalid lock sleep action %x", lck_sleep_action);
2736 	}
2737 
2738 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2739 		token = thread_priority_floor_start();
2740 	}
2741 
2742 	res = assert_wait_deadline(event, interruptible, deadline);
2743 	if (res == THREAD_WAITING) {
2744 		lck_rw_type = lck_rw_done(lck);
2745 		res = thread_block(THREAD_CONTINUE_NULL);
2746 		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
2747 			if (!(lck_sleep_action & (LCK_SLEEP_SHARED | LCK_SLEEP_EXCLUSIVE))) {
2748 				lck_rw_lock(lck, lck_rw_type);
2749 			} else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE) {
2750 				lck_rw_lock_exclusive(lck);
2751 			} else {
2752 				lck_rw_lock_shared(lck);
2753 			}
2754 		}
2755 	} else if (lck_sleep_action & LCK_SLEEP_UNLOCK) {
2756 		(void)lck_rw_done(lck);
2757 	}
2758 
2759 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2760 		thread_priority_floor_end(&token);
2761 	}
2762 
2763 	return res;
2764 }
2765 
2766 /*
2767  * Reader-writer lock promotion
2768  *
2769  * We support a limited form of reader-writer
2770  * lock promotion whose effects are:
2771  *
2772  *   * Qualifying threads have decay disabled
2773  *   * Scheduler priority is reset to a floor of
2774  *     of their statically assigned priority
2775  *     or MINPRI_RWLOCK
2776  *
2777  * The rationale is that lck_rw_ts do not have
2778  * a single owner, so we cannot apply a directed
2779  * priority boost from all waiting threads
2780  * to all holding threads without maintaining
2781  * lists of all shared owners and all waiting
2782  * threads for every lock.
2783  *
2784  * Instead (and to preserve the uncontended fast-
2785  * path), acquiring (or attempting to acquire)
2786  * a RW lock in shared or exclusive lock increments
2787  * a per-thread counter. Only if that thread stops
2788  * making forward progress (for instance blocking
2789  * on a mutex, or being preempted) do we consult
2790  * the counter and apply the priority floor.
2791  * When the thread becomes runnable again (or in
2792  * the case of preemption it never stopped being
2793  * runnable), it has the priority boost and should
2794  * be in a good position to run on the CPU and
2795  * release all RW locks (at which point the priority
2796  * boost is cleared).
2797  *
2798  * Care must be taken to ensure that priority
2799  * boosts are not retained indefinitely, since unlike
2800  * mutex priority boosts (where the boost is tied
2801  * to the mutex lifecycle), the boost is tied
2802  * to the thread and independent of any particular
2803  * lck_rw_t. Assertions are in place on return
2804  * to userspace so that the boost is not held
2805  * indefinitely.
2806  *
2807  * The routines that increment/decrement the
2808  * per-thread counter should err on the side of
2809  * incrementing any time a preemption is possible
2810  * and the lock would be visible to the rest of the
2811  * system as held (so it should be incremented before
2812  * interlocks are dropped/preemption is enabled, or
2813  * before a CAS is executed to acquire the lock).
2814  *
2815  */
2816 
2817 /*!
2818  * @function lck_rw_clear_promotion
2819  *
2820  * @abstract
2821  * Undo priority promotions when the last rw_lock
2822  * is released by a thread (if a promotion was active).
2823  *
2824  * @param thread        thread to demote.
2825  * @param lock          object reason for the demotion.
2826  */
2827 __attribute__((noinline))
2828 static void
lck_rw_clear_promotion(thread_t thread,const void * lock)2829 lck_rw_clear_promotion(thread_t thread, const void *lock)
2830 {
2831 	/* Cancel any promotions if the thread had actually blocked while holding a RW lock */
2832 	spl_t s = splsched();
2833 	thread_lock(thread);
2834 
2835 	if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
2836 		sched_thread_unpromote_reason(thread, TH_SFLAG_RW_PROMOTED,
2837 		    unslide_for_kdebug(lock));
2838 	}
2839 
2840 	thread_unlock(thread);
2841 	splx(s);
2842 }
2843 
2844 /*!
2845  * @function lck_rw_set_promotion_locked
2846  *
2847  * @abstract
2848  * Callout from context switch if the thread goes
2849  * off core with a positive rwlock_count.
2850  *
2851  * @discussion
2852  * Called at splsched with the thread locked.
2853  *
2854  * @param thread        thread to promote.
2855  */
2856 __attribute__((always_inline))
2857 void
lck_rw_set_promotion_locked(thread_t thread)2858 lck_rw_set_promotion_locked(thread_t thread)
2859 {
2860 	if (LcksOpts & LCK_OPTION_DISABLE_RW_PRIO) {
2861 		return;
2862 	}
2863 
2864 	assert(thread->rwlock_count > 0);
2865 
2866 	if (!(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
2867 		sched_thread_promote_reason(thread, TH_SFLAG_RW_PROMOTED, 0);
2868 	}
2869 }
2870 
2871 __attribute__((always_inline))
2872 void
lck_rw_lock_count_inc(thread_t thread,const void * lock __unused)2873 lck_rw_lock_count_inc(thread_t thread, const void *lock __unused)
2874 {
2875 	if (thread->rwlock_count++ == 0) {
2876 #if MACH_ASSERT
2877 		/*
2878 		 * Set the ast to check that the
2879 		 * rwlock_count is going to be set to zero when
2880 		 * going back to userspace.
2881 		 * Set it only once when we increment it for the first time.
2882 		 */
2883 		act_set_debug_assert();
2884 #endif
2885 	}
2886 }
2887 
2888 __abortlike
2889 static void
__lck_rw_lock_count_dec_panic(thread_t thread)2890 __lck_rw_lock_count_dec_panic(thread_t thread)
2891 {
2892 	panic("rw lock count underflow for thread %p", thread);
2893 }
2894 
2895 __attribute__((always_inline))
2896 void
lck_rw_lock_count_dec(thread_t thread,const void * lock)2897 lck_rw_lock_count_dec(thread_t thread, const void *lock)
2898 {
2899 	uint32_t rwlock_count = thread->rwlock_count--;
2900 
2901 	if (rwlock_count == 0) {
2902 		__lck_rw_lock_count_dec_panic(thread);
2903 	}
2904 
2905 	if (__probable(rwlock_count == 1)) {
2906 		/* sched_flags checked without lock, but will be rechecked while clearing */
2907 		if (__improbable(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
2908 			lck_rw_clear_promotion(thread, lock);
2909 		}
2910 	}
2911 }
2912