xref: /xnu-8019.80.24/osfmk/kern/lock_rw.c (revision a325d9c4a84054e40bbe985afedcb50ab80993ea)
1 /*
2  * Copyright (c) 2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 #include <debug.h>
57 #include <kern/lock_stat.h>
58 #include <kern/locks.h>
59 #include <kern/zalloc.h>
60 #include <kern/thread.h>
61 #include <kern/processor.h>
62 #include <kern/sched_prim.h>
63 #include <kern/debug.h>
64 #include <machine/atomic.h>
65 #include <machine/machine_cpu.h>
66 
67 KALLOC_TYPE_DEFINE(KT_LCK_RW, lck_rw_t, KT_PRIV_ACCT);
68 
69 #define LCK_RW_WRITER_EVENT(lck)                (event_t)((uintptr_t)(lck)+1)
70 #define LCK_RW_READER_EVENT(lck)                (event_t)((uintptr_t)(lck)+2)
71 #define WRITE_EVENT_TO_RWLOCK(event)            ((lck_rw_t *)((uintptr_t)(event)-1))
72 #define READ_EVENT_TO_RWLOCK(event)             ((lck_rw_t *)((uintptr_t)(event)-2))
73 
74 #if CONFIG_DTRACE
75 #define DTRACE_RW_SHARED        0x0     //reader
76 #define DTRACE_RW_EXCL          0x1     //writer
77 #define DTRACE_NO_FLAG          0x0     //not applicable
78 #endif  /* CONFIG_DTRACE */
79 
80 #define LCK_RW_LCK_EXCLUSIVE_CODE       0x100
81 #define LCK_RW_LCK_EXCLUSIVE1_CODE      0x101
82 #define LCK_RW_LCK_SHARED_CODE          0x102
83 #define LCK_RW_LCK_SH_TO_EX_CODE        0x103
84 #define LCK_RW_LCK_SH_TO_EX1_CODE       0x104
85 #define LCK_RW_LCK_EX_TO_SH_CODE        0x105
86 
87 #if __x86_64__
88 #define LCK_RW_LCK_EX_WRITER_SPIN_CODE  0x106
89 #define LCK_RW_LCK_EX_WRITER_WAIT_CODE  0x107
90 #define LCK_RW_LCK_EX_READER_SPIN_CODE  0x108
91 #define LCK_RW_LCK_EX_READER_WAIT_CODE  0x109
92 #define LCK_RW_LCK_SHARED_SPIN_CODE     0x110
93 #define LCK_RW_LCK_SHARED_WAIT_CODE     0x111
94 #define LCK_RW_LCK_SH_TO_EX_SPIN_CODE   0x112
95 #define LCK_RW_LCK_SH_TO_EX_WAIT_CODE   0x113
96 #endif
97 
98 #define lck_rw_ilk_lock(lock)   hw_lock_bit  ((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT, LCK_GRP_NULL)
99 #define lck_rw_ilk_unlock(lock) hw_unlock_bit((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT)
100 
101 #define ordered_load_rw(lock)                   os_atomic_load(&(lock)->lck_rw_data, compiler_acq_rel)
102 #define ordered_store_rw(lock, value)           os_atomic_store(&(lock)->lck_rw_data, (value), compiler_acq_rel)
103 #define ordered_load_rw_owner(lock)             os_atomic_load(&(lock)->lck_rw_owner, compiler_acq_rel)
104 #define ordered_store_rw_owner(lock, value)     os_atomic_store(&(lock)->lck_rw_owner, (value), compiler_acq_rel)
105 
106 #ifdef DEBUG_RW
107 static TUNABLE(bool, lck_rw_recursive_shared_assert_74048094, "lck_rw_recursive_shared_assert", false);
108 SECURITY_READ_ONLY_EARLY(vm_packing_params_t) rwlde_caller_packing_params =
109     VM_PACKING_PARAMS(LCK_RW_CALLER_PACKED);
110 #define rw_lock_debug_disabled()                ((LcksOpts & disLkRWDebug) == disLkRWDebug)
111 
112 #define set_rwlde_caller_packed(entry, caller)          ((entry)->rwlde_caller_packed = VM_PACK_POINTER((vm_offset_t)caller, LCK_RW_CALLER_PACKED))
113 #define get_rwlde_caller(entry)                         ((void*)VM_UNPACK_POINTER(entry->rwlde_caller_packed, LCK_RW_CALLER_PACKED))
114 
115 #endif /* DEBUG_RW */
116 
117 /*!
118  * @function lck_rw_alloc_init
119  *
120  * @abstract
121  * Allocates and initializes a rw_lock_t.
122  *
123  * @discussion
124  * The function can block. See lck_rw_init() for initialization details.
125  *
126  * @param grp           lock group to associate with the lock.
127  * @param attr          lock attribute to initialize the lock.
128  *
129  * @returns             NULL or the allocated lock
130  */
131 lck_rw_t *
132 lck_rw_alloc_init(
133 	lck_grp_t       *grp,
134 	lck_attr_t      *attr)
135 {
136 	lck_rw_t *lck;
137 
138 	lck = zalloc_flags(KT_LCK_RW, Z_WAITOK | Z_ZERO);
139 	lck_rw_init(lck, grp, attr);
140 	return lck;
141 }
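
/*
 * Illustrative usage sketch (not part of this file): a typical lifecycle
 * for a lock obtained from lck_rw_alloc_init().  The group name
 * "example_grp" and the surrounding code are hypothetical; the calls are
 * the standard lck_grp/lck_rw interfaces.
 *
 *	lck_grp_t *grp = lck_grp_alloc_init("example_grp", LCK_GRP_ATTR_NULL);
 *	lck_rw_t  *lck = lck_rw_alloc_init(grp, LCK_ATTR_NULL);
 *
 *	lck_rw_lock_shared(lck);
 *	... read-side critical section ...
 *	lck_rw_unlock_shared(lck);
 *
 *	lck_rw_free(lck, grp);
 *	lck_grp_free(grp);
 */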
142 
143 /*!
144  * @function lck_rw_init
145  *
146  * @abstract
147  * Initializes a rw_lock_t.
148  *
149  * @discussion
150  * Usage statistics for the lock are going to be added to the lock group provided.
151  *
152  * The lock attribute can be used to specify the lock contention behaviour.
153  * RW_WRITER_PRIORITY is the default behaviour (LCK_ATTR_NULL defaults to RW_WRITER_PRIORITY)
154  * and lck_attr_rw_shared_priority() can be used to set the behaviour to RW_SHARED_PRIORITY.
155  *
156  * RW_WRITER_PRIORITY gives priority to the writers upon contention with the readers;
157  * if the lock is held and a writer starts waiting for the lock, readers will not be able
158  * to acquire the lock until all writers stop contending. Readers could
159  * potentially starve.
160  * RW_SHARED_PRIORITY gives priority to the readers upon contention with the writers:
161  * unless the lock is held in exclusive mode, readers will always be able to acquire the lock.
162  * Readers can lock a shared lock even if there are writers waiting. Writers could potentially
163  * starve.
164  *
165  * @param lck           lock to initialize.
166  * @param grp           lock group to associate with the lock.
167  * @param attr          lock attribute to initialize the lock.
168  *
169  */
170 void
171 lck_rw_init(
172 	lck_rw_t        *lck,
173 	lck_grp_t       *grp,
174 	lck_attr_t      *attr)
175 {
176 	if (attr == LCK_ATTR_NULL) {
177 		attr = &LockDefaultLckAttr;
178 	}
179 	memset(lck, 0, sizeof(lck_rw_t));
180 	lck->lck_rw_can_sleep = TRUE;
181 	if ((attr->lck_attr_val & LCK_ATTR_RW_SHARED_PRIORITY) == 0) {
182 		lck->lck_rw_priv_excl = TRUE;
183 	}
184 
185 	lck_grp_reference(grp);
186 	lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
187 }
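
/*
 * Illustrative sketch (not part of this file): initializing an embedded
 * lck_rw_t with reader-priority behaviour as described above.  "my_grp"
 * and "my_lock" are hypothetical; passing LCK_ATTR_NULL instead would
 * give the default RW_WRITER_PRIORITY behaviour.
 *
 *	lck_attr_t *attr = lck_attr_alloc_init();
 *	lck_attr_rw_shared_priority(attr);
 *	lck_rw_init(&my_lock, my_grp, attr);
 *	lck_attr_free(attr);
 */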
188 
189 /*!
190  * @function lck_rw_free
191  *
192  * @abstract
193  * Frees a rw_lock previously allocated with lck_rw_alloc_init().
194  *
195  * @discussion
196  * The lock must not be held by any thread.
197  *
198  * @param lck           rw_lock to free.
199  */
200 void
201 lck_rw_free(
202 	lck_rw_t        *lck,
203 	lck_grp_t       *grp)
204 {
205 	lck_rw_destroy(lck, grp);
206 	zfree(KT_LCK_RW, lck);
207 }
208 
209 /*!
210  * @function lck_rw_destroy
211  *
212  * @abstract
213  * Destroys a rw_lock previously initialized with lck_rw_init().
214  *
215  * @discussion
216  * The lock must not be held by any thread.
217  *
218  * @param lck           rw_lock to destroy.
219  */
220 void
221 lck_rw_destroy(
222 	lck_rw_t        *lck,
223 	lck_grp_t       *grp)
224 {
225 	if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) {
226 		panic("Destroying previously destroyed lock %p", lck);
227 	}
228 	lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
229 
230 	lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
231 	lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
232 	lck_grp_deallocate(grp);
233 	return;
234 }
235 
236 #ifdef DEBUG_RW
237 
238 /*
239  * Best effort mechanism to debug rw_locks.
240  *
241  * This mechanism is in addition to the owner checks. The owner is set
242  * only when the lock is held in exclusive mode so the checks do not cover
243  * the cases in which the lock is held in shared mode.
244  *
245  * This mechanism tentatively stores the rw_lock acquired and its debug
246  * information on the thread struct.
247  * Only up to LCK_RW_EXPECTED_MAX_NUMBER rw_lock debug entries can be stored.
248  *
249  * NOTE: LCK_RW_EXPECTED_MAX_NUMBER is the expected number of rw_locks held
250  * at the same time. If a thread holds more than this number of rw_locks we
251  * will start losing debug information.
252  * Increasing LCK_RW_EXPECTED_MAX_NUMBER will increase the probability we will
253  * store the debug information but it will require more memory per thread
254  * and longer lock/unlock time.
255  *
256  * If an empty slot is found for the debug information, we record the lock;
257  * otherwise we set the overflow threshold flag.
258  *
259  * If we have reached the overflow threshold we might stop asserting because we can
260  * no longer be sure whether the lock was acquired.
261  *
262  * Even if we reached the overflow threshold, we try to store the debug information
263  * for the new locks acquired. This can be useful in core dumps to debug
264  * possible return to userspace without unlocking and to find possible readers
265  * holding the lock.
266  */
267 void
268 rw_lock_init(void)
269 {
270 	if (kern_feature_override(KF_RW_LOCK_DEBUG_OVRD)) {
271 		LcksOpts |= disLkRWDebug;
272 	}
273 }
274 
275 static inline struct rw_lock_debug_entry *
276 find_lock_in_savedlocks(lck_rw_t* lock, rw_lock_debug_t *rw_locks_held)
277 {
278 	int i;
279 	for (i = 0; i < LCK_RW_EXPECTED_MAX_NUMBER; i++) {
280 		struct rw_lock_debug_entry *existing = &rw_locks_held->rwld_locks[i];
281 		if (existing->rwlde_lock == lock) {
282 			return existing;
283 		}
284 	}
285 
286 	return NULL;
287 }
288 
289 __abortlike
290 static void
291 rwlock_slot_panic(rw_lock_debug_t *rw_locks_held)
292 {
293 	panic("No empty slot found in %p slot_used %d", rw_locks_held, rw_locks_held->rwld_locks_saved);
294 }
295 
296 static inline struct rw_lock_debug_entry *
297 find_empty_slot(rw_lock_debug_t *rw_locks_held)
298 {
299 	int i;
300 	for (i = 0; i < LCK_RW_EXPECTED_MAX_NUMBER; i++) {
301 		struct rw_lock_debug_entry *entry = &rw_locks_held->rwld_locks[i];
302 		if (entry->rwlde_lock == NULL) {
303 			return entry;
304 		}
305 	}
306 	rwlock_slot_panic(rw_locks_held);
307 }
308 
309 __abortlike
310 static void
311 canlock_rwlock_panic(lck_rw_t* lock, thread_t thread, struct rw_lock_debug_entry *entry)
312 {
313 	panic("RW lock %p already held by %p caller %p mode_count %d state 0x%x owner 0x%p ",
314 	    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
315 	    ordered_load_rw(lock), ordered_load_rw_owner(lock));
316 }
317 
318 static inline void
319 assert_canlock_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
320 {
321 	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
322 
323 	if (__probable(rw_lock_debug_disabled() || (rw_locks_held->rwld_locks_acquired == 0))) {
324 		//no locks saved, safe to lock
325 		return;
326 	}
327 
328 	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
329 	if (__improbable(entry != NULL)) {
330 		boolean_t can_be_shared_recursive;
331 		if (lck_rw_recursive_shared_assert_74048094) {
332 			can_be_shared_recursive = (lock->lck_rw_priv_excl == 0);
333 		} else {
334 			/* currently rw_lock_shared is called recursively;
335 			 * until the code is fixed, allow recursive
336 			 * locking in shared mode
337 			 */
338 			can_be_shared_recursive = TRUE;
339 		}
340 		if ((type == LCK_RW_TYPE_SHARED) && can_be_shared_recursive && entry->rwlde_mode_count >= 1) {
341 			return;
342 		}
343 		canlock_rwlock_panic(lock, thread, entry);
344 	}
345 }
346 
347 __abortlike
348 static void
349 held_rwlock_notheld_panic(lck_rw_t* lock, thread_t thread)
350 {
351 	panic("RW lock %p not held by %p", lock, thread);
352 }
353 
354 __abortlike
355 static void
356 held_rwlock_notheld_with_info_panic(lck_rw_t* lock, thread_t thread, lck_rw_type_t type, struct rw_lock_debug_entry *entry)
357 {
358 	if (type == LCK_RW_TYPE_EXCLUSIVE) {
359 		panic("RW lock %p not held in exclusive by %p caller %p read %d state 0x%x owner 0x%p ",
360 		    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
361 		    ordered_load_rw(lock), ordered_load_rw_owner(lock));
362 	} else {
363 		panic("RW lock %p not held in shared by %p caller %p read %d state 0x%x owner 0x%p ",
364 		    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
365 		    ordered_load_rw(lock), ordered_load_rw_owner(lock));
366 	}
367 }
368 
369 static inline void
370 assert_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
371 {
372 	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
373 
374 	if (__probable(rw_lock_debug_disabled())) {
375 		return;
376 	}
377 
378 	if (__improbable(rw_locks_held->rwld_locks_acquired == 0 || rw_locks_held->rwld_locks_saved == 0)) {
379 		if (rw_locks_held->rwld_locks_acquired == 0 || rw_locks_held->rwld_overflow == 0) {
380 			held_rwlock_notheld_panic(lock, thread);
381 		}
382 		return;
383 	}
384 
385 	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
386 	if (__probable(entry != NULL)) {
387 		if (type == LCK_RW_TYPE_EXCLUSIVE && entry->rwlde_mode_count != -1) {
388 			held_rwlock_notheld_with_info_panic(lock, thread, type, entry);
389 		} else {
390 			if (type == LCK_RW_TYPE_SHARED && entry->rwlde_mode_count <= 0) {
391 				held_rwlock_notheld_with_info_panic(lock, thread, type, entry);
392 			}
393 		}
394 	} else {
395 		if (rw_locks_held->rwld_overflow == 0) {
396 			held_rwlock_notheld_panic(lock, thread);
397 		}
398 	}
399 }
400 
401 static inline void
402 change_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t typeFrom, void* caller)
403 {
404 	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
405 
406 	if (__probable(rw_lock_debug_disabled())) {
407 		return;
408 	}
409 
410 	if (__improbable(rw_locks_held->rwld_locks_saved == 0)) {
411 		if (rw_locks_held->rwld_overflow == 0) {
412 			held_rwlock_notheld_panic(lock, thread);
413 		}
414 		return;
415 	}
416 
417 	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
418 	if (__probable(entry != NULL)) {
419 		if (typeFrom == LCK_RW_TYPE_SHARED) {
420 			//We are upgrading
421 			assertf(entry->rwlde_mode_count == 1,
422 			    "RW lock %p not held by a single shared when upgrading "
423 			    "by %p caller %p read %d state 0x%x owner 0x%p ",
424 			    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
425 			    ordered_load_rw(lock), ordered_load_rw_owner(lock));
426 			entry->rwlde_mode_count = -1;
427 			set_rwlde_caller_packed(entry, caller);
428 		} else {
429 			//We are downgrading
430 			assertf(entry->rwlde_mode_count == -1,
431 			    "RW lock %p not held in write mode when downgrading "
432 			    "by %p caller %p read %d state 0x%x owner 0x%p ",
433 			    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
434 			    ordered_load_rw(lock), ordered_load_rw_owner(lock));
435 			entry->rwlde_mode_count = 1;
436 			set_rwlde_caller_packed(entry, caller);
437 		}
438 		return;
439 	}
440 
441 	if (rw_locks_held->rwld_overflow == 0) {
442 		held_rwlock_notheld_panic(lock, thread);
443 	}
444 
445 	if (rw_locks_held->rwld_locks_saved == LCK_RW_EXPECTED_MAX_NUMBER) {
446 		//array is full
447 		return;
448 	}
449 
450 	struct rw_lock_debug_entry *null_entry = find_empty_slot(rw_locks_held);
451 	null_entry->rwlde_lock = lock;
452 	set_rwlde_caller_packed(null_entry, caller);
453 	if (typeFrom == LCK_RW_TYPE_SHARED) {
454 		null_entry->rwlde_mode_count = -1;
455 	} else {
456 		null_entry->rwlde_mode_count = 1;
457 	}
458 	rw_locks_held->rwld_locks_saved++;
459 }
460 
461 __abortlike
462 static void
463 add_held_rwlock_too_many_panic(thread_t thread)
464 {
465 	panic("RW lock too many rw locks held, rwld_locks_acquired maxed out for thread %p", thread);
466 }
467 
468 static inline void
469 add_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type, void* caller)
470 {
471 	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
472 	struct rw_lock_debug_entry *null_entry;
473 
474 	if (__probable(rw_lock_debug_disabled())) {
475 		return;
476 	}
477 
478 	if (__improbable(rw_locks_held->rwld_locks_acquired == UINT32_MAX)) {
479 		add_held_rwlock_too_many_panic(thread);
480 	}
481 	rw_locks_held->rwld_locks_acquired++;
482 
483 	if (type == LCK_RW_TYPE_EXCLUSIVE) {
484 		if (__improbable(rw_locks_held->rwld_locks_saved == LCK_RW_EXPECTED_MAX_NUMBER)) {
485 			//array is full
486 			rw_locks_held->rwld_overflow = 1;
487 			return;
488 		}
489 		null_entry = find_empty_slot(rw_locks_held);
490 		null_entry->rwlde_lock = lock;
491 		set_rwlde_caller_packed(null_entry, caller);
492 		null_entry->rwlde_mode_count = -1;
493 		rw_locks_held->rwld_locks_saved++;
494 		return;
495 	} else {
496 		if (__probable(rw_locks_held->rwld_locks_saved == 0)) {
497 			//array is empty
498 			goto add_shared;
499 		}
500 
501 		boolean_t allow_shared_recursive;
502 		if (lck_rw_recursive_shared_assert_74048094) {
503 			allow_shared_recursive = (lock->lck_rw_priv_excl == 0);
504 		} else {
505 			allow_shared_recursive = TRUE;
506 		}
507 		if (allow_shared_recursive) {
508 			//It could already be locked in shared mode
509 			struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
510 			if (entry != NULL) {
511 				assert(entry->rwlde_mode_count > 0);
512 				assertf(entry->rwlde_mode_count != INT8_MAX,
513 				    "RW lock %p with too many recursive shared held "
514 				    "from %p caller %p read %d state 0x%x owner 0x%p",
515 				    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
516 				    ordered_load_rw(lock), ordered_load_rw_owner(lock));
517 				entry->rwlde_mode_count += 1;
518 				return;
519 			}
520 		}
521 
522 		//none of the locks were a match
523 		//try to add a new entry
524 		if (__improbable(rw_locks_held->rwld_locks_saved == LCK_RW_EXPECTED_MAX_NUMBER)) {
525 			//array is full
526 			rw_locks_held->rwld_overflow = 1;
527 			return;
528 		}
529 
530 add_shared:
531 		null_entry = find_empty_slot(rw_locks_held);
532 		null_entry->rwlde_lock = lock;
533 		set_rwlde_caller_packed(null_entry, caller);
534 		null_entry->rwlde_mode_count = 1;
535 		rw_locks_held->rwld_locks_saved++;
536 	}
537 }
538 
539 static inline void
540 remove_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
541 {
542 	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
543 
544 	if (__probable(rw_lock_debug_disabled())) {
545 		return;
546 	}
547 
548 	if (__improbable(rw_locks_held->rwld_locks_acquired == 0)) {
549 		return;
550 	}
551 	rw_locks_held->rwld_locks_acquired--;
552 
553 	if (rw_locks_held->rwld_locks_saved == 0) {
554 		assert(rw_locks_held->rwld_overflow == 1);
555 		goto out;
556 	}
557 
558 	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
559 	if (__probable(entry != NULL)) {
560 		if (type == LCK_RW_TYPE_EXCLUSIVE) {
561 			assert(entry->rwlde_mode_count == -1);
562 			entry->rwlde_mode_count = 0;
563 		} else {
564 			assert(entry->rwlde_mode_count > 0);
565 			entry->rwlde_mode_count--;
566 			if (entry->rwlde_mode_count > 0) {
567 				goto out;
568 			}
569 		}
570 		entry->rwlde_caller_packed = 0;
571 		entry->rwlde_lock = NULL;
572 		rw_locks_held->rwld_locks_saved--;
573 	} else {
574 		assert(rw_locks_held->rwld_overflow == 1);
575 	}
576 
577 out:
578 	if (rw_locks_held->rwld_locks_acquired == 0) {
579 		rw_locks_held->rwld_overflow = 0;
580 	}
581 	return;
582 }
583 #endif /* DEBUG_RW */
584 
585 /*
586  * We disable interrupts while holding the RW interlock to prevent an
587  * interrupt from exacerbating hold time.
588  * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
589  */
590 static inline boolean_t
591 lck_interlock_lock(
592 	lck_rw_t        *lck)
593 {
594 	boolean_t       istate;
595 
596 	istate = ml_set_interrupts_enabled(FALSE);
597 	lck_rw_ilk_lock(lck);
598 	return istate;
599 }
600 
601 static inline void
602 lck_interlock_unlock(
603 	lck_rw_t        *lck,
604 	boolean_t       istate)
605 {
606 	lck_rw_ilk_unlock(lck);
607 	ml_set_interrupts_enabled(istate);
608 }
609 
610 static inline void
611 lck_rw_inc_thread_count(
612 	thread_t thread)
613 {
614 	__assert_only uint32_t prev_rwlock_count;
615 
616 	prev_rwlock_count = thread->rwlock_count++;
617 #if MACH_ASSERT
618 	/*
619 	 * Set the ast to check that the
620 	 * rwlock_count is going to be set to zero when
621 	 * going back to userspace.
622 	 * Set it only once when we increment it for the first time.
623 	 */
624 	if (prev_rwlock_count == 0) {
625 		act_set_debug_assert();
626 	}
627 #endif
628 }
629 
630 /*
631  * compute the deadline to spin against when
632  * waiting for a change of state on a lck_rw_t
633  */
634 static inline uint64_t
635 lck_rw_deadline_for_spin(
636 	lck_rw_t        *lck)
637 {
638 	lck_rw_word_t   word;
639 
640 	word.data = ordered_load_rw(lck);
641 	if (word.can_sleep) {
642 		if (word.r_waiting || word.w_waiting || (word.shared_count > machine_info.max_cpus)) {
643 			/*
644 			 * there are already threads waiting on this lock... this
645 			 * implies that they have spun beyond their deadlines waiting for
646 			 * the desired state to show up so we will not bother spinning at this time...
647 			 *   or
648 			 * the current number of threads sharing this lock exceeds our capacity to run them
649 			 * concurrently and since all states we're going to spin for require the rw_shared_count
650 			 * to be at 0, we'll not bother spinning since the latency for this to happen is
651 			 * unpredictable...
652 			 */
653 			return mach_absolute_time();
654 		}
655 		return mach_absolute_time() + os_atomic_load(&MutexSpin, relaxed);
656 	} else {
657 		return mach_absolute_time() + (100000LL * 1000000000LL);
658 	}
659 }
660 
661 /*
662  * This inline is used when busy-waiting for an rw lock.
663  * If interrupts were disabled when the lock primitive was called,
664  * we poll the IPI handler for pending TLB flushes on x86.
665  */
666 static inline void
667 lck_rw_lock_pause(
668 	boolean_t       interrupts_enabled)
669 {
670 #if X86_64
671 	if (!interrupts_enabled) {
672 		handle_pending_TLB_flushes();
673 	}
674 	cpu_pause();
675 #else
676 	(void) interrupts_enabled;
677 	wait_for_event();
678 #endif
679 }
680 
681 static boolean_t
682 lck_rw_drain_status(
683 	lck_rw_t        *lock,
684 	uint32_t        status_mask,
685 	boolean_t       wait)
686 {
687 	uint64_t        deadline = 0;
688 	uint32_t        data;
689 	boolean_t       istate = FALSE;
690 
691 	if (wait) {
692 		deadline = lck_rw_deadline_for_spin(lock);
693 #if __x86_64__
694 		istate = ml_get_interrupts_enabled();
695 #endif
696 	}
697 
698 	for (;;) {
699 #if __x86_64__
700 		data = os_atomic_load(&lock->lck_rw_data, relaxed);
701 #else
702 		data = load_exclusive32(&lock->lck_rw_data, memory_order_acquire_smp);
703 #endif
704 		if ((data & status_mask) == 0) {
705 			break;
706 		}
707 		if (wait) {
708 			lck_rw_lock_pause(istate);
709 		} else {
710 			atomic_exchange_abort();
711 		}
712 		if (!wait || (mach_absolute_time() >= deadline)) {
713 			return FALSE;
714 		}
715 	}
716 	atomic_exchange_abort();
717 	return TRUE;
718 }
719 
720 /*
721  * Spin while interlock is held.
722  */
723 static inline void
724 lck_rw_interlock_spin(
725 	lck_rw_t        *lock)
726 {
727 	uint32_t        data, prev;
728 
729 	for (;;) {
730 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_relaxed);
731 		if (data & LCK_RW_INTERLOCK) {
732 #if __x86_64__
733 			cpu_pause();
734 #else
735 			wait_for_event();
736 #endif
737 		} else {
738 			atomic_exchange_abort();
739 			return;
740 		}
741 	}
742 }
743 
744 #define LCK_RW_GRAB_WANT        0
745 #define LCK_RW_GRAB_SHARED      1
746 
747 static boolean_t
748 lck_rw_grab(
749 	lck_rw_t        *lock,
750 	int             mode,
751 	boolean_t       wait)
752 {
753 	uint64_t        deadline = 0;
754 	uint32_t        data, prev;
755 	boolean_t       do_exch, istate = FALSE;
756 
757 	if (wait) {
758 		deadline = lck_rw_deadline_for_spin(lock);
759 #if __x86_64__
760 		istate = ml_get_interrupts_enabled();
761 #endif
762 	}
763 
764 	for (;;) {
765 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
766 		if (data & LCK_RW_INTERLOCK) {
767 			atomic_exchange_abort();
768 			lck_rw_interlock_spin(lock);
769 			continue;
770 		}
771 		do_exch = FALSE;
772 		if (mode == LCK_RW_GRAB_WANT) {
773 			if ((data & LCK_RW_WANT_EXCL) == 0) {
774 				data |= LCK_RW_WANT_EXCL;
775 				do_exch = TRUE;
776 			}
777 		} else {        // LCK_RW_GRAB_SHARED
778 			if (((data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) == 0) ||
779 			    (((data & LCK_RW_SHARED_MASK)) && ((data & LCK_RW_PRIV_EXCL) == 0))) {
780 				data += LCK_RW_SHARED_READER;
781 				do_exch = TRUE;
782 			}
783 		}
784 		if (do_exch) {
785 			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
786 				return TRUE;
787 			}
788 		} else {
789 			if (wait) {
790 				lck_rw_lock_pause(istate);
791 			} else {
792 				atomic_exchange_abort();
793 			}
794 			if (!wait || (mach_absolute_time() >= deadline)) {
795 				return FALSE;
796 			}
797 		}
798 	}
799 }
800 
801 static void
802 lck_rw_lock_exclusive_gen(
803 	lck_rw_t        *lock)
804 {
805 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
806 	lck_rw_word_t           word;
807 	int                     slept = 0;
808 	boolean_t               gotlock = 0;
809 	boolean_t               not_shared_or_upgrade = 0;
810 	wait_result_t           res = 0;
811 	boolean_t               istate;
812 
813 #if     CONFIG_DTRACE
814 	boolean_t dtrace_ls_initialized = FALSE;
815 	boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
816 	uint64_t wait_interval = 0;
817 	int readers_at_sleep = 0;
818 #endif
819 
820 	__assert_only thread_t owner = ordered_load_rw_owner(lock);
821 	assertf(owner != current_thread(), "Lock already held state=0x%x, owner=%p",
822 	    ordered_load_rw(lock), owner);
823 
824 #ifdef DEBUG_RW
825 	/*
826 	 * Best effort attempt to check that this thread
827 	 * is not already holding the lock (this checks read mode too).
828 	 */
829 	assert_canlock_rwlock(lock, current_thread(), LCK_RW_TYPE_EXCLUSIVE);
830 #endif /* DEBUG_RW */
831 
832 	/*
833 	 *	Try to acquire the lck_rw_want_excl bit.
834 	 */
835 	while (!lck_rw_grab(lock, LCK_RW_GRAB_WANT, FALSE)) {
836 #if     CONFIG_DTRACE
837 		if (dtrace_ls_initialized == FALSE) {
838 			dtrace_ls_initialized = TRUE;
839 			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
840 			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
841 			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
842 			if (dtrace_ls_enabled) {
843 				/*
844 				 * Either sleeping or spinning is happening,
845 				 *  start a timing of our delay interval now.
846 				 */
847 				readers_at_sleep = lock->lck_rw_shared_count;
848 				wait_interval = mach_absolute_time();
849 			}
850 		}
851 #endif
852 
853 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
854 
855 		gotlock = lck_rw_grab(lock, LCK_RW_GRAB_WANT, TRUE);
856 
857 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
858 
859 		if (gotlock) {
860 			break;
861 		}
862 		/*
863 		 * if we get here, the deadline has expired w/o us
864 		 * being able to grab the lock exclusively
865 		 * check to see if we're allowed to do a thread_block
866 		 */
867 		word.data = ordered_load_rw(lock);
868 		if (word.can_sleep) {
869 			istate = lck_interlock_lock(lock);
870 			word.data = ordered_load_rw(lock);
871 
872 			if (word.want_excl) {
873 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
874 
875 				word.w_waiting = 1;
876 				ordered_store_rw(lock, word.data);
877 
878 				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
879 				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
880 				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
881 				lck_interlock_unlock(lock, istate);
882 				if (res == THREAD_WAITING) {
883 					res = thread_block(THREAD_CONTINUE_NULL);
884 					slept++;
885 				}
886 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
887 			} else {
888 				word.want_excl = 1;
889 				ordered_store_rw(lock, word.data);
890 				lck_interlock_unlock(lock, istate);
891 				break;
892 			}
893 		}
894 	}
895 	/*
896 	 * Wait for readers (and upgrades) to finish...
897 	 */
898 	while (!lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, FALSE)) {
899 #if     CONFIG_DTRACE
900 		/*
901 		 * Either sleeping or spinning is happening, start
902 		 * a timing of our delay interval now.  If we set it
903 		 * to -1 we don't have accurate data so we cannot later
904 		 * decide to record a dtrace spin or sleep event.
905 		 */
906 		if (dtrace_ls_initialized == FALSE) {
907 			dtrace_ls_initialized = TRUE;
908 			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
909 			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
910 			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
911 			if (dtrace_ls_enabled) {
912 				/*
913 				 * Either sleeping or spinning is happening,
914 				 *  start a timing of our delay interval now.
915 				 */
916 				readers_at_sleep = lock->lck_rw_shared_count;
917 				wait_interval = mach_absolute_time();
918 			}
919 		}
920 #endif
921 
922 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
923 
924 		not_shared_or_upgrade = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, TRUE);
925 
926 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, not_shared_or_upgrade, 0);
927 
928 		if (not_shared_or_upgrade) {
929 			break;
930 		}
931 		/*
932 		 * if we get here, the deadline has expired w/o us
933 		 * being able to grab the lock exclusively
934 		 * check to see if we're allowed to do a thread_block
935 		 */
936 		word.data = ordered_load_rw(lock);
937 		if (word.can_sleep) {
938 			istate = lck_interlock_lock(lock);
939 			word.data = ordered_load_rw(lock);
940 
941 			if (word.shared_count != 0 || word.want_upgrade) {
942 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
943 
944 				word.w_waiting = 1;
945 				ordered_store_rw(lock, word.data);
946 
947 				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
948 				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
949 				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
950 				lck_interlock_unlock(lock, istate);
951 
952 				if (res == THREAD_WAITING) {
953 					res = thread_block(THREAD_CONTINUE_NULL);
954 					slept++;
955 				}
956 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
957 			} else {
958 				lck_interlock_unlock(lock, istate);
959 				/*
960 				 * must own the lock now, since we checked for
961 				 * readers or upgrade owner behind the interlock
962 				 * no need for a call to 'lck_rw_drain_status'
963 				 */
964 				break;
965 			}
966 		}
967 	}
968 
969 #if     CONFIG_DTRACE
970 	/*
971 	 * Decide what latencies we suffered that are Dtrace events.
972 	 * If we have set wait_interval, then we either spun or slept.
973 	 * At least we get out from under the interlock before we record
974 	 * which is the best we can do here to minimize the impact
975 	 * of the tracing.
976 	 * If we have set wait_interval to -1, then dtrace was not enabled when we
977 	 * started sleeping/spinning so we don't record this event.
978 	 */
979 	if (dtrace_ls_enabled == TRUE) {
980 		if (slept == 0) {
981 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lock,
982 			    mach_absolute_time() - wait_interval, 1);
983 		} else {
984 			/*
985 			 * For the blocking case, we also record if when we blocked
986 			 * it was held for read or write, and how many readers.
987 			 * Notice that above we recorded this before we dropped
988 			 * the interlock so the count is accurate.
989 			 */
990 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lock,
991 			    mach_absolute_time() - wait_interval, 1,
992 			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
993 		}
994 	}
995 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, 1);
996 #endif  /* CONFIG_DTRACE */
997 }
998 
999 #define LCK_RW_LOCK_EXCLUSIVE_TAS(lck) (atomic_test_and_set32(&(lck)->lck_rw_data, \
1000 	    (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK), \
1001 	    LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE))
1002 /*!
1003  * @function lck_rw_lock_exclusive_check_contended
1004  *
1005  * @abstract
1006  * Locks a rw_lock in exclusive mode.
1007  *
1008  * @discussion
1009  * This routine IS EXPERIMENTAL.
1010  * It's only used for the vm object lock, and use for other subsystems is UNSUPPORTED.
1011  * Note that the return value is ONLY A HEURISTIC w.r.t. the lock's contention.
1012  *
1013  * @param lock           rw_lock to lock.
1014  *
1015  * @returns Returns TRUE if the thread spun or blocked while attempting to acquire the lock, FALSE
1016  *          otherwise.
1017  */
1018 bool
1019 lck_rw_lock_exclusive_check_contended(
1020 	lck_rw_t        *lock)
1021 {
1022 	thread_t        thread = current_thread();
1023 	bool            contended  = false;
1024 
1025 	if (lock->lck_rw_can_sleep) {
1026 		lck_rw_inc_thread_count(thread);
1027 	} else if (get_preemption_level() == 0) {
1028 		panic("Taking non-sleepable RW lock with preemption enabled");
1029 	}
1030 
1031 	if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
1032 #if     CONFIG_DTRACE
1033 		LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1034 #endif  /* CONFIG_DTRACE */
1035 	} else {
1036 		contended = true;
1037 		lck_rw_lock_exclusive_gen(lock);
1038 	}
1039 	__assert_only thread_t owner = ordered_load_rw_owner(lock);
1040 	assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1041 
1042 	ordered_store_rw_owner(lock, thread);
1043 
1044 #ifdef DEBUG_RW
1045 	add_held_rwlock(lock, thread, LCK_RW_TYPE_EXCLUSIVE, __builtin_return_address(0));
1046 #endif /* DEBUG_RW */
1047 	return contended;
1048 }
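
/*
 * Illustrative sketch (not part of this file, and only meaningful for the
 * vm object lock given the EXPERIMENTAL/UNSUPPORTED note above): consuming
 * the contention heuristic.  "object->Lock" stands for the vm object's rw
 * lock field and "contended_acquisitions" is a hypothetical counter.
 *
 *	if (lck_rw_lock_exclusive_check_contended(&object->Lock)) {
 *		contended_acquisitions++;       // heuristic only
 *	}
 */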
1049 
1050 __attribute__((always_inline))
1051 static void
1052 lck_rw_lock_exclusive_internal_inline(
1053 	lck_rw_t        *lock,
1054 	void            *caller)
1055 {
1056 #pragma unused(caller)
1057 	thread_t        thread = current_thread();
1058 
1059 	if (lock->lck_rw_can_sleep) {
1060 		lck_rw_inc_thread_count(thread);
1061 	} else if (get_preemption_level() == 0) {
1062 		panic("Taking non-sleepable RW lock with preemption enabled");
1063 	}
1064 
1065 	if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
1066 #if     CONFIG_DTRACE
1067 		LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1068 #endif  /* CONFIG_DTRACE */
1069 	} else {
1070 		lck_rw_lock_exclusive_gen(lock);
1071 	}
1072 
1073 	__assert_only thread_t owner = ordered_load_rw_owner(lock);
1074 	assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1075 
1076 	ordered_store_rw_owner(lock, thread);
1077 
1078 #if DEBUG_RW
1079 	add_held_rwlock(lock, thread, LCK_RW_TYPE_EXCLUSIVE, caller);
1080 #endif /* DEBUG_RW */
1081 }
1082 
1083 __attribute__((noinline))
1084 static void
1085 lck_rw_lock_exclusive_internal(
1086 	lck_rw_t        *lock,
1087 	void            *caller)
1088 {
1089 	lck_rw_lock_exclusive_internal_inline(lock, caller);
1090 }
1091 
1092 /*!
1093  * @function lck_rw_lock_exclusive
1094  *
1095  * @abstract
1096  * Locks a rw_lock in exclusive mode.
1097  *
1098  * @discussion
1099  * This function can block.
1100  * Multiple threads can acquire the lock in shared mode at the same time, but only one thread at a time
1101  * can acquire it in exclusive mode.
1102  * NOTE: the thread cannot return to userspace while the lock is held. Recursive locking is not supported.
1103  *
1104  * @param lock           rw_lock to lock.
1105  */
1106 void
1107 lck_rw_lock_exclusive(
1108 	lck_rw_t        *lock)
1109 {
1110 	lck_rw_lock_exclusive_internal_inline(lock, __builtin_return_address(0));
1111 }
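
/*
 * Illustrative sketch (not part of this file): an exclusive (write-side)
 * critical section.  "my_lock" is hypothetical.
 *
 *	lck_rw_lock_exclusive(&my_lock);
 *	... modify the protected state ...
 *	lck_rw_unlock_exclusive(&my_lock);
 */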
1112 
1113 /*
1114  *	Routine:	lck_rw_lock_shared_gen
1115  *	Function:
1116  *		Fast path code has determined that this lock
1117  *		is held exclusively... this is where we spin/block
1118  *		until we can acquire the lock in the shared mode
1119  */
1120 static void
1121 lck_rw_lock_shared_gen(
1122 	lck_rw_t        *lck)
1123 {
1124 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1125 	lck_rw_word_t           word;
1126 	boolean_t               gotlock = 0;
1127 	int                     slept = 0;
1128 	wait_result_t           res = 0;
1129 	boolean_t               istate;
1130 
1131 #if     CONFIG_DTRACE
1132 	uint64_t wait_interval = 0;
1133 	int readers_at_sleep = 0;
1134 	boolean_t dtrace_ls_initialized = FALSE;
1135 	boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1136 #endif /* CONFIG_DTRACE */
1137 
1138 	__assert_only thread_t owner = ordered_load_rw_owner(lck);
1139 	assertf(owner != current_thread(), "Lock already held state=0x%x, owner=%p",
1140 	    ordered_load_rw(lck), owner);
1141 #ifdef DEBUG_RW
1142 	/*
1143 	 * Best effort attempt to check that this thread
1144 	 * is not already holding the lock in shared mode.
1145 	 */
1146 	assert_canlock_rwlock(lck, current_thread(), LCK_RW_TYPE_SHARED);
1147 #endif
1148 
1149 	while (!lck_rw_grab(lck, LCK_RW_GRAB_SHARED, FALSE)) {
1150 #if     CONFIG_DTRACE
1151 		if (dtrace_ls_initialized == FALSE) {
1152 			dtrace_ls_initialized = TRUE;
1153 			dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1154 			dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1155 			dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1156 			if (dtrace_ls_enabled) {
1157 				/*
1158 				 * Either sleeping or spinning is happening,
1159 				 *  start a timing of our delay interval now.
1160 				 */
1161 				readers_at_sleep = lck->lck_rw_shared_count;
1162 				wait_interval = mach_absolute_time();
1163 			}
1164 		}
1165 #endif
1166 
1167 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1168 		    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, 0, 0);
1169 
1170 		gotlock = lck_rw_grab(lck, LCK_RW_GRAB_SHARED, TRUE);
1171 
1172 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1173 		    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, gotlock, 0);
1174 
1175 		if (gotlock) {
1176 			break;
1177 		}
1178 		/*
1179 		 * if we get here, the deadline has expired w/o us
1180 		 * being able to grab the lock for read
1181 		 * check to see if we're allowed to do a thread_block
1182 		 */
1183 		if (lck->lck_rw_can_sleep) {
1184 			istate = lck_interlock_lock(lck);
1185 
1186 			word.data = ordered_load_rw(lck);
1187 			if ((word.want_excl || word.want_upgrade) &&
1188 			    ((word.shared_count == 0) || word.priv_excl)) {
1189 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1190 				    trace_lck, word.want_excl, word.want_upgrade, 0, 0);
1191 
1192 				word.r_waiting = 1;
1193 				ordered_store_rw(lck, word.data);
1194 
1195 				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
1196 				res = assert_wait(LCK_RW_READER_EVENT(lck),
1197 				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1198 				lck_interlock_unlock(lck, istate);
1199 
1200 				if (res == THREAD_WAITING) {
1201 					res = thread_block(THREAD_CONTINUE_NULL);
1202 					slept++;
1203 				}
1204 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1205 				    trace_lck, res, slept, 0, 0);
1206 			} else {
1207 				word.shared_count++;
1208 				ordered_store_rw(lck, word.data);
1209 				lck_interlock_unlock(lck, istate);
1210 				break;
1211 			}
1212 		}
1213 	}
1214 
1215 #if     CONFIG_DTRACE
1216 	if (dtrace_ls_enabled == TRUE) {
1217 		if (slept == 0) {
1218 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1219 		} else {
1220 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1221 			    mach_absolute_time() - wait_interval, 0,
1222 			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1223 		}
1224 	}
1225 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1226 #endif  /* CONFIG_DTRACE */
1227 }
1228 
1229 __attribute__((always_inline))
1230 static void
1231 lck_rw_lock_shared_internal_inline(
1232 	lck_rw_t        *lock,
1233 	void            *caller)
1234 {
1235 #pragma unused(caller)
1236 
1237 	uint32_t        data, prev;
1238 	thread_t        thread = current_thread();
1239 	__assert_only thread_t owner;
1240 #ifdef DEBUG_RW
1241 	boolean_t       check_canlock = TRUE;
1242 #endif
1243 
1244 	if (lock->lck_rw_can_sleep) {
1245 		lck_rw_inc_thread_count(thread);
1246 	} else if (get_preemption_level() == 0) {
1247 		panic("Taking non-sleepable RW lock with preemption enabled");
1248 	}
1249 
1250 	for (;;) {
1251 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1252 		if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
1253 			atomic_exchange_abort();
1254 			lck_rw_lock_shared_gen(lock);
1255 			goto locked;
1256 		}
1257 #ifdef DEBUG_RW
1258 		if ((data & LCK_RW_SHARED_MASK) == 0) {
1259 			/*
1260 			 * If the lock is uncontended,
1261 			 * we do not need to check if we can lock it
1262 			 */
1263 			check_canlock = FALSE;
1264 		}
1265 #endif
1266 		data += LCK_RW_SHARED_READER;
1267 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1268 			break;
1269 		}
1270 		cpu_pause();
1271 	}
1272 #ifdef DEBUG_RW
1273 	if (check_canlock) {
1274 		/*
1275 		 * Best effort attempt to check that this thread
1276 		 * is not already holding the lock (this checks read mode too).
1277 		 */
1278 		assert_canlock_rwlock(lock, thread, LCK_RW_TYPE_SHARED);
1279 	}
1280 #endif
1281 locked:
1282 	owner = ordered_load_rw_owner(lock);
1283 	assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1284 
1285 #if     CONFIG_DTRACE
1286 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1287 #endif  /* CONFIG_DTRACE */
1288 
1289 #ifdef DEBUG_RW
1290 	add_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED, caller);
1291 #endif /* DEBUG_RW */
1292 }
1293 
1294 __attribute__((noinline))
1295 static void
1296 lck_rw_lock_shared_internal(
1297 	lck_rw_t        *lock,
1298 	void            *caller)
1299 {
1300 	lck_rw_lock_shared_internal_inline(lock, caller);
1301 }
1302 
1303 /*!
1304  * @function lck_rw_lock_shared
1305  *
1306  * @abstract
1307  * Locks a rw_lock in shared mode.
1308  *
1309  * @discussion
1310  * This function can block.
1311  * Multiple threads can acquire the lock in shared mode at the same time, but only one thread at a time
1312  * can acquire it in exclusive mode.
1313  * If the lock is held in shared mode and there are no writers waiting, a reader will be able to acquire
1314  * the lock without waiting.
1315  * If the lock is held in shared mode and there is at least one writer waiting, a reader will wait
1316  * for all the writers to make progress if the lock was initialized with the default settings. If instead
1317  * RW_SHARED_PRIORITY was selected at initialization time, a reader will never wait if the lock is held
1318  * in shared mode.
1319  * NOTE: the thread cannot return to userspace while the lock is held. Recursive locking is not supported.
1320  *
1321  * @param lock           rw_lock to lock.
1322  */
1323 void
1324 lck_rw_lock_shared(
1325 	lck_rw_t        *lock)
1326 {
1327 	lck_rw_lock_shared_internal_inline(lock, __builtin_return_address(0));
1328 }
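
/*
 * Illustrative sketch (not part of this file): a read-side critical
 * section.  "my_lock" is hypothetical; lck_rw_done() may be used instead
 * of lck_rw_unlock_shared() when the caller tracks the held mode
 * generically.
 *
 *	lck_rw_lock_shared(&my_lock);
 *	... read the protected state ...
 *	lck_rw_unlock_shared(&my_lock);
 */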
1329 
1330 /*
1331  *	Routine:	lck_rw_lock_shared_to_exclusive_failure
1332  *	Function:
1333  *		Fast path code has already dropped our read
1334  *		count and determined that someone else owns 'lck_rw_want_upgrade'
1335  *		if 'lck_rw_shared_count' == 0, it's also already dropped 'lck_w_waiting'
1336  *		all we need to do here is determine if a wakeup is needed
1337  */
1338 static boolean_t
1339 lck_rw_lock_shared_to_exclusive_failure(
1340 	lck_rw_t        *lck,
1341 	uint32_t        prior_lock_state)
1342 {
1343 	thread_t        thread = current_thread();
1344 	uint32_t        rwlock_count;
1345 
1346 	if ((prior_lock_state & LCK_RW_W_WAITING) &&
1347 	    ((prior_lock_state & LCK_RW_SHARED_MASK) == LCK_RW_SHARED_READER)) {
1348 		/*
1349 		 *	Someone else has requested upgrade.
1350 		 *	Since we've released the read lock, wake
1351 		 *	him up if he's blocked waiting
1352 		 */
1353 		thread_wakeup(LCK_RW_WRITER_EVENT(lck));
1354 	}
1355 
1356 	/* Check if dropping the lock means that we need to unpromote */
1357 	if (lck->lck_rw_can_sleep) {
1358 		rwlock_count = thread->rwlock_count--;
1359 	} else {
1360 		rwlock_count = UINT32_MAX;
1361 	}
1362 
1363 	if (rwlock_count == 0) {
1364 		panic("rw lock count underflow for thread %p", thread);
1365 	}
1366 
1367 	if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1368 		/* sched_flags checked without lock, but will be rechecked while clearing */
1369 		lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1370 	}
1371 
1372 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1373 	    VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1374 
1375 #ifdef DEBUG_RW
1376 	remove_held_rwlock(lck, thread, LCK_RW_TYPE_SHARED);
1377 #endif /* DEBUG_RW */
1378 
1379 	return FALSE;
1380 }
1381 
1382 /*
1383  *	Routine:	lck_rw_lock_shared_to_exclusive_success
1384  *	Function:
1385  *		the fast path code has already dropped our read
1386  *		count and successfully acquired 'lck_rw_want_upgrade'
1387  *		we just need to wait for the rest of the readers to drain
1388  *		and then we can return as the exclusive holder of this lock
1389  */
1390 static void
1391 lck_rw_lock_shared_to_exclusive_success(
1392 	lck_rw_t        *lock)
1393 {
1394 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
1395 	int                     slept = 0;
1396 	lck_rw_word_t           word;
1397 	wait_result_t           res;
1398 	boolean_t               istate;
1399 	boolean_t               not_shared;
1400 
1401 #if     CONFIG_DTRACE
1402 	uint64_t                wait_interval = 0;
1403 	int                     readers_at_sleep = 0;
1404 	boolean_t               dtrace_ls_initialized = FALSE;
1405 	boolean_t               dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1406 #endif
1407 
1408 	while (!lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, FALSE)) {
1409 		word.data = ordered_load_rw(lock);
1410 #if     CONFIG_DTRACE
1411 		if (dtrace_ls_initialized == FALSE) {
1412 			dtrace_ls_initialized = TRUE;
1413 			dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1414 			dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1415 			dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1416 			if (dtrace_ls_enabled) {
1417 				/*
1418 				 * Either sleeping or spinning is happening,
1419 				 *  start a timing of our delay interval now.
1420 				 */
1421 				readers_at_sleep = word.shared_count;
1422 				wait_interval = mach_absolute_time();
1423 			}
1424 		}
1425 #endif
1426 
1427 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1428 		    trace_lck, word.shared_count, 0, 0, 0);
1429 
1430 		not_shared = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, TRUE);
1431 
1432 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1433 		    trace_lck, lock->lck_rw_shared_count, 0, 0, 0);
1434 
1435 		if (not_shared) {
1436 			break;
1437 		}
1438 
1439 		/*
1440 		 * if we get here, the spin deadline in lck_rw_drain_status()
1441 		 * has expired w/o the rw_shared_count having drained to 0
1442 		 * check to see if we're allowed to do a thread_block
1443 		 */
1444 		if (word.can_sleep) {
1445 			istate = lck_interlock_lock(lock);
1446 
1447 			word.data = ordered_load_rw(lock);
1448 			if (word.shared_count != 0) {
1449 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1450 				    trace_lck, word.shared_count, 0, 0, 0);
1451 
1452 				word.w_waiting = 1;
1453 				ordered_store_rw(lock, word.data);
1454 
1455 				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
1456 				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
1457 				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1458 				lck_interlock_unlock(lock, istate);
1459 
1460 				if (res == THREAD_WAITING) {
1461 					res = thread_block(THREAD_CONTINUE_NULL);
1462 					slept++;
1463 				}
1464 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1465 				    trace_lck, res, slept, 0, 0);
1466 			} else {
1467 				lck_interlock_unlock(lock, istate);
1468 				break;
1469 			}
1470 		}
1471 	}
1472 #if     CONFIG_DTRACE
1473 	/*
1474 	 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
1475 	 */
1476 	if (dtrace_ls_enabled == TRUE) {
1477 		if (slept == 0) {
1478 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lock, mach_absolute_time() - wait_interval, 0);
1479 		} else {
1480 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lock,
1481 			    mach_absolute_time() - wait_interval, 1,
1482 			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1483 		}
1484 	}
1485 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 1);
1486 #endif
1487 }
1488 
1489 /*!
1490  * @function lck_rw_lock_shared_to_exclusive
1491  *
1492  * @abstract
1493  * Upgrades a rw_lock held in shared mode to exclusive.
1494  *
1495  * @discussion
1496  * This function can block.
1497  * Only one reader at a time can upgrade to exclusive mode. If the upgrade fails the function will
1498  * return with the lock not held.
1499  * The caller needs to hold the lock in shared mode to upgrade it.
1500  *
1501  * @param lock           rw_lock already held in shared mode to upgrade.
1502  *
1503  * @returns TRUE if the lock was upgraded, FALSE if it was not possible.
1504  *          If the function was not able to upgrade the lock, the lock will be dropped
1505  *          by the function.
1506  */
1507 boolean_t
1508 lck_rw_lock_shared_to_exclusive(
1509 	lck_rw_t        *lock)
1510 {
1511 	uint32_t        data, prev;
1512 
1513 	assertf(lock->lck_rw_priv_excl != 0, "lock %p thread %p", lock, current_thread());
1514 
1515 #if DEBUG_RW
1516 	thread_t thread = current_thread();
1517 	assert_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED);
1518 #endif /* DEBUG_RW */
1519 
1520 	for (;;) {
1521 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1522 		if (data & LCK_RW_INTERLOCK) {
1523 			atomic_exchange_abort();
1524 			lck_rw_interlock_spin(lock);
1525 			continue;
1526 		}
1527 		if (data & LCK_RW_WANT_UPGRADE) {
1528 			data -= LCK_RW_SHARED_READER;
1529 			if ((data & LCK_RW_SHARED_MASK) == 0) {         /* we were the last reader */
1530 				data &= ~(LCK_RW_W_WAITING);            /* so clear the wait indicator */
1531 			}
1532 			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1533 				return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
1534 			}
1535 		} else {
1536 			data |= LCK_RW_WANT_UPGRADE;            /* ask for WANT_UPGRADE */
1537 			data -= LCK_RW_SHARED_READER;           /* and shed our read count */
1538 			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1539 				break;
1540 			}
1541 		}
1542 		cpu_pause();
1543 	}
1544 	/* we now own the WANT_UPGRADE */
1545 	if (data & LCK_RW_SHARED_MASK) {        /* check to see if all of the readers are drained */
1546 		lck_rw_lock_shared_to_exclusive_success(lock);  /* if not, we need to go wait */
1547 	}
1548 	__assert_only thread_t owner = ordered_load_rw_owner(lock);
1549 	assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1550 
1551 	ordered_store_rw_owner(lock, current_thread());
1552 #if     CONFIG_DTRACE
1553 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
1554 #endif  /* CONFIG_DTRACE */
1555 
1556 #if DEBUG_RW
1557 	change_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED, __builtin_return_address(0));
1558 #endif /* DEBUG_RW */
1559 	return TRUE;
1560 }
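
/*
 * Usage sketch (illustrative only; the example_* names below are not part
 * of this file): the caller must handle the failure case, in which the
 * shared hold has already been dropped and the lock has to be re-taken
 * exclusive before the write-side work can proceed.
 */
__unused static void
example_upgrade_pattern(lck_rw_t *lock)
{
	lck_rw_lock_shared(lock);
	/* ... read-side inspection ... */
	if (!lck_rw_lock_shared_to_exclusive(lock)) {
		/* upgrade failed: the lock is no longer held at this point */
		lck_rw_lock_exclusive(lock);
		/* state may have changed while unlocked, so re-validate here */
	}
	/* ... write-side update ... */
	lck_rw_unlock_exclusive(lock);
}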
1561 
1562 /*
1563  *      Routine:        lck_rw_lock_exclusive_to_shared_gen
1564  *      Function:
1565  *		Fast path has already dropped
1566  *		our exclusive state and bumped lck_rw_shared_count;
1567  *		all we need to do here is determine if anyone
1568  *		needs to be awakened.
1569  */
1570 static void
1571 lck_rw_lock_exclusive_to_shared_gen(
1572 	lck_rw_t        *lck,
1573 	uint32_t        prior_lock_state,
1574 	void            *caller)
1575 {
1576 #pragma unused(caller)
1577 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1578 	lck_rw_word_t   fake_lck;
1579 
1580 	/*
1581 	 * prior_lock state is a snapshot of the 1st word of the
1582 	 * lock in question... we'll fake up a copy of it
1583 	 * and carefully not access anything beyond what's defined
1584 	 * in the first word of a lck_rw_t
1585 	 */
1586 	fake_lck.data = prior_lock_state;
1587 
1588 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1589 	    trace_lck, fake_lck.want_excl, fake_lck.want_upgrade, 0, 0);
1590 
1591 	/*
1592 	 * don't wake up anyone waiting to take the lock exclusively
1593 	 * since we hold a read count... when the read count drops to 0,
1594 	 * the writers will be woken.
1595 	 *
1596 	 * wake up any waiting readers if we don't have any writers waiting,
1597 	 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1598 	 */
1599 	if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
1600 		thread_wakeup(LCK_RW_READER_EVENT(lck));
1601 	}
1602 
1603 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1604 	    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1605 
1606 #if CONFIG_DTRACE
1607 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1608 #endif
1609 
1610 #if DEBUG_RW
1611 	thread_t        thread = current_thread();
1612 	change_held_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE, caller);
1613 #endif /* DEBUG_RW */
1614 }
1615 
1616 /*!
1617  * @function lck_rw_lock_exclusive_to_shared
1618  *
1619  * @abstract
1620  * Downgrades a rw_lock held in exclusive mode to shared.
1621  *
1622  * @discussion
1623  * The caller needs to hold the lock in exclusive mode to be able to downgrade it.
1624  *
1625  * @param lock           rw_lock already held in exclusive mode to downgrade.
1626  */
1627 void
1628 lck_rw_lock_exclusive_to_shared(
1629 	lck_rw_t        *lock)
1630 {
1631 	uint32_t        data, prev;
1632 
1633 	assertf(lock->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
1634 	ordered_store_rw_owner(lock, THREAD_NULL);
1635 	for (;;) {
1636 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
1637 		if (data & LCK_RW_INTERLOCK) {
1638 			atomic_exchange_abort();
1639 			lck_rw_interlock_spin(lock);    /* wait for interlock to clear */
1640 			continue;
1641 		}
1642 		data += LCK_RW_SHARED_READER;
1643 		if (data & LCK_RW_WANT_UPGRADE) {
1644 			data &= ~(LCK_RW_WANT_UPGRADE);
1645 		} else {
1646 			data &= ~(LCK_RW_WANT_EXCL);
1647 		}
1648 		if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) {
1649 			data &= ~(LCK_RW_W_WAITING);
1650 		}
1651 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
1652 			break;
1653 		}
1654 		cpu_pause();
1655 	}
1656 	lck_rw_lock_exclusive_to_shared_gen(lock, prev, __builtin_return_address(0));
1657 }
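
/*
 * Usage sketch (illustrative only; example_* is not part of this file):
 * perform the update while exclusive, then downgrade so other readers can
 * proceed while this thread keeps read-only access to the updated state.
 */
__unused static void
example_downgrade_pattern(lck_rw_t *lock)
{
	lck_rw_lock_exclusive(lock);
	/* ... mutate the protected state ... */
	lck_rw_lock_exclusive_to_shared(lock);
	/* ... read-only access, now concurrent with other readers ... */
	lck_rw_unlock_shared(lock);
}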
1658 
1659 /*
1660  * Very sad hack, but the codegen for lck_rw_lock
1661  * is very unhappy with the combination of __builtin_return_address()
1662  * and a noreturn function. For some reason it adds more frames
1663  * than it should. rdar://76570684
1664  */
1665 void
1666 _lck_rw_lock_type_panic(lck_rw_t *lck, lck_rw_type_t lck_rw_type);
1667 #pragma clang diagnostic push
1668 #pragma clang diagnostic ignored "-Wmissing-noreturn"
1669 __attribute__((noinline, weak))
1670 void
1671 _lck_rw_lock_type_panic(
1672 	lck_rw_t        *lck,
1673 	lck_rw_type_t   lck_rw_type)
1674 {
1675 	panic("lck_rw_lock(): Invalid RW lock type: %x for lock %p", lck_rw_type, lck);
1676 }
1677 #pragma clang diagnostic pop
1678 
1679 /*!
1680  * @function lck_rw_lock
1681  *
1682  * @abstract
1683  * Locks a rw_lock with the specified type.
1684  *
1685  * @discussion
1686  * See lck_rw_lock_shared() or lck_rw_lock_exclusive() for more details.
1687  *
1688  * @param lck           rw_lock to lock.
1689  * @param lck_rw_type   LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
1690  */
1691 void
1692 lck_rw_lock(
1693 	lck_rw_t        *lck,
1694 	lck_rw_type_t   lck_rw_type)
1695 {
1696 	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1697 		return lck_rw_lock_shared_internal(lck, __builtin_return_address(0));
1698 	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1699 		return lck_rw_lock_exclusive_internal(lck, __builtin_return_address(0));
1700 	}
1701 	_lck_rw_lock_type_panic(lck, lck_rw_type);
1702 }
1703 
1704 __attribute__((always_inline))
1705 static boolean_t
1706 lck_rw_try_lock_shared_internal_inline(
1707 	lck_rw_t        *lock,
1708 	void            *caller)
1709 {
1710 #pragma unused(caller)
1711 
1712 	uint32_t        data, prev;
1713 	thread_t        thread = current_thread();
1714 #ifdef DEBUG_RW
1715 	boolean_t       check_canlock = TRUE;
1716 #endif
1717 
1718 	for (;;) {
1719 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1720 		if (data & LCK_RW_INTERLOCK) {
1721 			atomic_exchange_abort();
1722 			lck_rw_interlock_spin(lock);
1723 			continue;
1724 		}
1725 		if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1726 			atomic_exchange_abort();
1727 			return FALSE;             /* lock is busy */
1728 		}
1729 #ifdef DEBUG_RW
1730 		if ((data & LCK_RW_SHARED_MASK) == 0) {
1731 			/*
1732 			 * If the lock is uncontended,
1733 			 * we do not need to check if we can lock it
1734 			 */
1735 			check_canlock = FALSE;
1736 		}
1737 #endif
1738 		data += LCK_RW_SHARED_READER;     /* Increment reader refcount */
1739 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1740 			break;
1741 		}
1742 		cpu_pause();
1743 	}
1744 #ifdef DEBUG_RW
1745 	if (check_canlock) {
1746 		/*
1747 		 * Best effort attempt to check that this thread
1748 		 * is not already holding the lock (this checks read mode too).
1749 		 */
1750 		assert_canlock_rwlock(lock, thread, LCK_RW_TYPE_SHARED);
1751 	}
1752 #endif
1753 	__assert_only thread_t owner = ordered_load_rw_owner(lock);
1754 	assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1755 
1756 	if (lock->lck_rw_can_sleep) {
1757 		lck_rw_inc_thread_count(thread);
1758 	} else if (get_preemption_level() == 0) {
1759 		panic("Taking non-sleepable RW lock with preemption enabled");
1760 	}
1761 
1762 #if     CONFIG_DTRACE
1763 	LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1764 #endif  /* CONFIG_DTRACE */
1765 
1766 #ifdef DEBUG_RW
1767 	add_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED, caller);
1768 #endif /* DEBUG_RW */
1769 	return TRUE;
1770 }
1771 
1772 __attribute__((noinline))
1773 static boolean_t
1774 lck_rw_try_lock_shared_internal(
1775 	lck_rw_t        *lock,
1776 	void            *caller)
1777 {
1778 	return lck_rw_try_lock_shared_internal_inline(lock, caller);
1779 }
1780 
1781 /*!
1782  * @function lck_rw_try_lock_shared
1783  *
1784  * @abstract
1785  * Tries to lock a rw_lock in read mode.
1786  *
1787  * @discussion
1788  * This function returns immediately instead of blocking if the lock cannot be acquired.
1789  * See lck_rw_lock_shared for more details.
1790  *
1791  * @param lock           rw_lock to lock.
1792  *
1793  * @returns TRUE if the lock is successfully acquired, FALSE if a writer holds it or is waiting to upgrade.
1794  */
1795 boolean_t
1796 lck_rw_try_lock_shared(
1797 	lck_rw_t        *lock)
1798 {
1799 	return lck_rw_try_lock_shared_internal_inline(lock, __builtin_return_address(0));
1800 }
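
/*
 * Usage sketch (illustrative only; example_* is not part of this file):
 * opportunistic read that backs off instead of blocking when a writer
 * holds the lock or is waiting to upgrade.
 */
__unused static boolean_t
example_try_read(lck_rw_t *lock)
{
	if (!lck_rw_try_lock_shared(lock)) {
		return FALSE;           /* contended by a writer: caller retries later */
	}
	/* ... read-side work ... */
	lck_rw_unlock_shared(lock);
	return TRUE;
}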
1801 
1802 __attribute__((always_inline))
1803 static boolean_t
1804 lck_rw_try_lock_exclusive_internal_inline(
1805 	lck_rw_t        *lock,
1806 	void            *caller)
1807 {
1808 #pragma unused(caller)
1809 	uint32_t        data, prev;
1810 
1811 	for (;;) {
1812 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1813 		if (data & LCK_RW_INTERLOCK) {
1814 			atomic_exchange_abort();
1815 			lck_rw_interlock_spin(lock);
1816 			continue;
1817 		}
1818 		if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1819 			atomic_exchange_abort();
1820 			return FALSE;
1821 		}
1822 		data |= LCK_RW_WANT_EXCL;
1823 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1824 			break;
1825 		}
1826 		cpu_pause();
1827 	}
1828 	thread_t thread = current_thread();
1829 
1830 	if (lock->lck_rw_can_sleep) {
1831 		lck_rw_inc_thread_count(thread);
1832 	} else if (get_preemption_level() == 0) {
1833 		panic("Taking non-sleepable RW lock with preemption enabled");
1834 	}
1835 
1836 	__assert_only thread_t owner = ordered_load_rw_owner(lock);
1837 	assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1838 
1839 	ordered_store_rw_owner(lock, thread);
1840 #if     CONFIG_DTRACE
1841 	LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1842 #endif  /* CONFIG_DTRACE */
1843 
1844 #ifdef DEBUG_RW
1845 	add_held_rwlock(lock, thread, LCK_RW_TYPE_EXCLUSIVE, caller);
1846 #endif /* DEBUG_RW */
1847 	return TRUE;
1848 }
1849 
1850 __attribute__((noinline))
1851 static boolean_t
1852 lck_rw_try_lock_exclusive_internal(
1853 	lck_rw_t        *lock,
1854 	void            *caller)
1855 {
1856 	return lck_rw_try_lock_exclusive_internal_inline(lock, caller);
1857 }
1858 
1859 /*!
1860  * @function lck_rw_try_lock_exclusive
1861  *
1862  * @abstract
1863  * Tries to lock a rw_lock in write mode.
1864  *
1865  * @discussion
1866  * This function returns immediately instead of blocking if the lock is already held.
1867  * See lck_rw_lock_exclusive for more details.
1868  *
1869  * @param lock           rw_lock to lock.
1870  *
1871  * @returns TRUE if the lock is successfully acquired, FALSE in case it was already held.
1872  */
1873 boolean_t
1874 lck_rw_try_lock_exclusive(
1875 	lck_rw_t        *lock)
1876 {
1877 	return lck_rw_try_lock_exclusive_internal_inline(lock, __builtin_return_address(0));
1878 }
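
/*
 * Usage sketch (illustrative only; example_* is not part of this file):
 * opportunistic update that defers the work instead of blocking when the
 * lock is held in any mode.
 */
__unused static boolean_t
example_try_update(lck_rw_t *lock)
{
	if (!lck_rw_try_lock_exclusive(lock)) {
		return FALSE;           /* lock busy: caller retries or defers */
	}
	/* ... write-side update ... */
	lck_rw_unlock_exclusive(lock);
	return TRUE;
}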
1879 
1880 /*
1881  * Very sad hack, but the codegen for lck_rw_try_lock
1882  * is very unhappy with the combination of __builtin_return_address()
1883  * and a noreturn function. For some reason it adds more frames
1884  * than it should. rdar://76570684
1885  */
1886 boolean_t
1887 _lck_rw_try_lock_type_panic(lck_rw_t *lck, lck_rw_type_t lck_rw_type);
1888 #pragma clang diagnostic push
1889 #pragma clang diagnostic ignored "-Wmissing-noreturn"
1890 __attribute__((noinline, weak))
1891 boolean_t
1892 _lck_rw_try_lock_type_panic(
1893 	lck_rw_t        *lck,
1894 	lck_rw_type_t   lck_rw_type)
1895 {
1896 	panic("lck_rw_try_lock(): Invalid RW lock type: %x for lock %p", lck_rw_type, lck);
1897 }
1898 #pragma clang diagnostic pop
1899 
1900 /*!
1901  * @function lck_rw_try_lock
1902  *
1903  * @abstract
1904  * Tries to lock a rw_lock with the specified type.
1905  *
1906  * @discussion
1907  * This function will return and not wait/block in case the lock is already held.
1908  * See lck_rw_try_lock_shared() or lck_rw_try_lock_exclusive() for more details.
1909  *
1910  * @param lck           rw_lock to lock.
1911  * @param lck_rw_type   LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
1912  *
1913  * @returns TRUE if the lock is successfully acquired, FALSE in case it was already held.
1914  */
1915 boolean_t
1916 lck_rw_try_lock(
1917 	lck_rw_t        *lck,
1918 	lck_rw_type_t   lck_rw_type)
1919 {
1920 	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1921 		return lck_rw_try_lock_shared_internal(lck, __builtin_return_address(0));
1922 	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1923 		return lck_rw_try_lock_exclusive_internal(lck, __builtin_return_address(0));
1924 	}
1925 	return _lck_rw_try_lock_type_panic(lck, lck_rw_type);
1926 }
1927 
1928 /*
1929  *      Routine:        lck_rw_done_gen
1930  *
1931  *	prior_lock_state is the value in the 1st
1932  *      word of the lock at the time of a successful
1933  *	atomic compare and exchange with the new value...
1934  *      it represents the state of the lock before we
1935  *	decremented the rw_shared_count or cleared either
1936  *      rw_want_upgrade or rw_want_write and
1937  *	the lck_x_waiting bits...  since the wrapper
1938  *      routine has already changed the state atomically,
1939  *	we just need to decide if we should
1940  *	wake up anyone and what value to return... we do
1941  *	this by examining the state of the lock before
1942  *	we changed it
1943  */
1944 static lck_rw_type_t
1945 lck_rw_done_gen(
1946 	lck_rw_t        *lck,
1947 	uint32_t        prior_lock_state)
1948 {
1949 	lck_rw_word_t   fake_lck;
1950 	lck_rw_type_t   lock_type;
1951 	thread_t        thread;
1952 	uint32_t        rwlock_count;
1953 
1954 	/*
1955 	 * prior_lock state is a snapshot of the 1st word of the
1956 	 * lock in question... we'll fake up a copy of it
1957 	 * and carefully not access anything beyond what's defined
1958 	 * in the first word of a lck_rw_t
1959 	 */
1960 	fake_lck.data = prior_lock_state;
1961 
1962 	if (fake_lck.shared_count <= 1) {
1963 		if (fake_lck.w_waiting) {
1964 			thread_wakeup(LCK_RW_WRITER_EVENT(lck));
1965 		}
1966 
1967 		if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
1968 			thread_wakeup(LCK_RW_READER_EVENT(lck));
1969 		}
1970 	}
1971 	if (fake_lck.shared_count) {
1972 		lock_type = LCK_RW_TYPE_SHARED;
1973 	} else {
1974 		lock_type = LCK_RW_TYPE_EXCLUSIVE;
1975 	}
1976 
1977 	/* Check if dropping the lock means that we need to unpromote */
1978 	thread = current_thread();
1979 	if (fake_lck.can_sleep) {
1980 		rwlock_count = thread->rwlock_count--;
1981 	} else {
1982 		rwlock_count = UINT32_MAX;
1983 	}
1984 
1985 	if (rwlock_count == 0) {
1986 		panic("rw lock count underflow for thread %p", thread);
1987 	}
1988 
1989 	if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1990 		/* sched_flags checked without lock, but will be rechecked while clearing */
1991 		lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1992 	}
1993 #if CONFIG_DTRACE
1994 	LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1995 #endif
1996 
1997 #ifdef DEBUG_RW
1998 	remove_held_rwlock(lck, thread, lock_type);
1999 #endif /* DEBUG_RW */
2000 	return lock_type;
2001 }
2002 
2003 /*!
2004  * @function lck_rw_done
2005  *
2006  * @abstract
2007  * Force unlocks a rw_lock without consistency checks.
2008  *
2009  * @discussion
2010  * Do not use unless sure you can avoid consistency checks.
2011  * Do not use unless you are sure you can avoid the consistency checks.
2012  * @param lock           rw_lock to unlock.
2013  */
2014 lck_rw_type_t
2015 lck_rw_done(
2016 	lck_rw_t        *lock)
2017 {
2018 	uint32_t        data, prev;
2019 	boolean_t       once = FALSE;
2020 
2021 #ifdef DEBUG_RW
2022 	/*
2023 	 * Best effort attempt to check that this thread
2024 	 * is holding the lock.
2025 	 */
2026 	thread_t thread = current_thread();
2027 	assert_held_rwlock(lock, thread, 0);
2028 #endif /* DEBUG_RW */
2029 	for (;;) {
2030 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
2031 		if (data & LCK_RW_INTERLOCK) {          /* wait for interlock to clear */
2032 			atomic_exchange_abort();
2033 			lck_rw_interlock_spin(lock);
2034 			continue;
2035 		}
2036 		if (data & LCK_RW_SHARED_MASK) {        /* lock is held shared */
2037 			assertf(lock->lck_rw_owner == THREAD_NULL, "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
2038 			data -= LCK_RW_SHARED_READER;
2039 			if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */
2040 				goto check_waiters;
2041 			}
2042 		} else {                                        /* if reader count == 0, must be exclusive lock */
2043 			if (data & LCK_RW_WANT_UPGRADE) {
2044 				data &= ~(LCK_RW_WANT_UPGRADE);
2045 			} else {
2046 				if (data & LCK_RW_WANT_EXCL) {
2047 					data &= ~(LCK_RW_WANT_EXCL);
2048 				} else {                                /* lock is not 'owned', panic */
2049 					panic("Releasing non-exclusive RW lock without a reader refcount!");
2050 				}
2051 			}
2052 			if (!once) {
2053 				// Only check for holder and clear it once
2054 				assertf(lock->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
2055 				ordered_store_rw_owner(lock, THREAD_NULL);
2056 				once = TRUE;
2057 			}
2058 check_waiters:
2059 			/*
2060 			 * test the original values to match what
2061 			 * lck_rw_done_gen is going to do to determine
2062 			 * which wakeups need to happen...
2063 			 *
2064 			 * if !(fake_lck.priv_excl && fake_lck.w_waiting)
2065 			 */
2066 			if (prev & LCK_RW_W_WAITING) {
2067 				data &= ~(LCK_RW_W_WAITING);
2068 				if ((prev & LCK_RW_PRIV_EXCL) == 0) {
2069 					data &= ~(LCK_RW_R_WAITING);
2070 				}
2071 			} else {
2072 				data &= ~(LCK_RW_R_WAITING);
2073 			}
2074 		}
2075 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
2076 			break;
2077 		}
2078 		cpu_pause();
2079 	}
2080 	return lck_rw_done_gen(lock, prev);
2081 }
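
/*
 * Usage sketch (illustrative only; example_* is not part of this file):
 * lck_rw_done() is handy when the hold mode is not known statically; the
 * returned type reports which mode was actually released.
 */
__unused static void
example_done_pattern(lck_rw_t *lock, lck_rw_type_t type)
{
	lck_rw_lock(lock, type);
	/* ... work under the lock ... */
	if (lck_rw_done(lock) != type) {
		panic("example: lock %p released in an unexpected mode", lock);
	}
}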
2082 
2083 /*!
2084  * @function lck_rw_unlock_shared
2085  *
2086  * @abstract
2087  * Unlocks a rw_lock previously locked in shared mode.
2088  *
2089  * @discussion
2090  * The same thread that locked the lock needs to unlock it.
2091  *
2092  * @param lck           rw_lock held in shared mode to unlock.
2093  */
2094 void
2095 lck_rw_unlock_shared(
2096 	lck_rw_t        *lck)
2097 {
2098 	lck_rw_type_t   ret;
2099 
2100 	assertf(lck->lck_rw_owner == THREAD_NULL, "state=0x%x, owner=%p", lck->lck_rw_data, lck->lck_rw_owner);
2101 	assertf(lck->lck_rw_shared_count > 0, "shared_count=0x%x", lck->lck_rw_shared_count);
2102 	ret = lck_rw_done(lck);
2103 
2104 	if (ret != LCK_RW_TYPE_SHARED) {
2105 		panic("lck_rw_unlock_shared(): lock %p held in mode: %d", lck, ret);
2106 	}
2107 }
2108 
2109 /*!
2110  * @function lck_rw_unlock_exclusive
2111  *
2112  * @abstract
2113  * Unlocks a rw_lock previously locked in exclusive mode.
2114  *
2115  * @discussion
2116  * The same thread that locked the lock needs to unlock it.
2117  *
2118  * @param lck           rw_lock held in exclusive mode to unlock.
2119  */
2120 void
2121 lck_rw_unlock_exclusive(
2122 	lck_rw_t        *lck)
2123 {
2124 	lck_rw_type_t   ret;
2125 
2126 	assertf(lck->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lck->lck_rw_data, lck->lck_rw_owner);
2127 	ret = lck_rw_done(lck);
2128 
2129 	if (ret != LCK_RW_TYPE_EXCLUSIVE) {
2130 		panic("lck_rw_unlock_exclusive(): lock %p held in mode: %d", lck, ret);
2131 	}
2132 }
2133 
2134 /*!
2135  * @function lck_rw_unlock
2136  *
2137  * @abstract
2138  * Unlocks a rw_lock previously locked with lck_rw_type.
2139  *
2140  * @discussion
2141  * The lock must be unlocked by the same thread it was locked from.
2142  * The type of the lock/unlock has to match, unless an upgrade/downgrade was performed while
2143  * holding the lock.
2144  *
2145  * @param lck           rw_lock to unlock.
2146  * @param lck_rw_type   LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
2147  */
2148 void
2149 lck_rw_unlock(
2150 	lck_rw_t         *lck,
2151 	lck_rw_type_t    lck_rw_type)
2152 {
2153 	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
2154 		lck_rw_unlock_shared(lck);
2155 	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
2156 		lck_rw_unlock_exclusive(lck);
2157 	} else {
2158 		panic("lck_rw_unlock(): Invalid RW lock type: %d", lck_rw_type);
2159 	}
2160 }
2161 
2162 /*!
2163  * @function lck_rw_assert
2164  *
2165  * @abstract
2166  * Asserts the rw_lock is held.
2167  *
2168  * @discussion
2169  * Read-write locks do not have a concept of ownership when held in shared mode,
2170  * so this function merely asserts that someone is holding the lock, not necessarily the caller.
2171  * However, if rw_lock_debug is on, a best-effort mechanism to track the owners is in place, and
2172  * this function can be more accurate.
2173  * Type can be LCK_RW_ASSERT_SHARED, LCK_RW_ASSERT_EXCLUSIVE, LCK_RW_ASSERT_HELD or
2174  * LCK_RW_ASSERT_NOTHELD.
2175  *
2176  * @param lck   rw_lock to check.
2177  * @param type  assert type
2178  */
2179 void
2180 lck_rw_assert(
2181 	lck_rw_t        *lck,
2182 	unsigned int    type)
2183 {
2184 #if DEBUG_RW
2185 	thread_t thread = current_thread();
2186 #endif /* DEBUG_RW */
2187 
2188 	switch (type) {
2189 	case LCK_RW_ASSERT_SHARED:
2190 		if ((lck->lck_rw_shared_count != 0) &&
2191 		    (lck->lck_rw_owner == THREAD_NULL)) {
2192 #if DEBUG_RW
2193 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_SHARED);
2194 #endif /* DEBUG_RW */
2195 			return;
2196 		}
2197 		break;
2198 	case LCK_RW_ASSERT_EXCLUSIVE:
2199 		if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
2200 		    (lck->lck_rw_shared_count == 0) &&
2201 		    (lck->lck_rw_owner == current_thread())) {
2202 #if DEBUG_RW
2203 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE);
2204 #endif /* DEBUG_RW */
2205 			return;
2206 		}
2207 		break;
2208 	case LCK_RW_ASSERT_HELD:
2209 		if (lck->lck_rw_shared_count != 0) {
2210 #if DEBUG_RW
2211 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_SHARED);
2212 #endif /* DEBUG_RW */
2213 			return;         // Held shared
2214 		}
2215 		if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
2216 		    (lck->lck_rw_owner == current_thread())) {
2217 #if DEBUG_RW
2218 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE);
2219 #endif /* DEBUG_RW */
2220 			return;         // Held exclusive
2221 		}
2222 		break;
2223 	case LCK_RW_ASSERT_NOTHELD:
2224 		if ((lck->lck_rw_shared_count == 0) &&
2225 		    !(lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
2226 		    (lck->lck_rw_owner == THREAD_NULL)) {
2227 #ifdef DEBUG_RW
2228 			assert_canlock_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE);
2229 #endif /* DEBUG_RW */
2230 			return;
2231 		}
2232 		break;
2233 	default:
2234 		break;
2235 	}
2236 	panic("rw lock (%p)%s held (mode=%u)", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type);
2237 }
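
/*
 * Usage sketch (illustrative only; example_* is not part of this file):
 * a helper that must only run with the lock held exclusive can document
 * and enforce that precondition up front.
 */
__unused static void
example_locked_helper(lck_rw_t *lock)
{
	lck_rw_assert(lock, LCK_RW_ASSERT_EXCLUSIVE);
	/* ... touch state that only the exclusive holder may modify ... */
}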
2238 
2239 /*!
2240  * @function kdp_lck_rw_lock_is_acquired_exclusive
2241  *
2242  * @abstract
2243  * Checks if a rw_lock is held exclusively.
2244  *
2245  * @discussion
2246  * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2247  *
2248  * @param lck   lock to check
2249  *
2250  * @returns TRUE if the lock is held exclusively
2251  */
2252 boolean_t
2253 kdp_lck_rw_lock_is_acquired_exclusive(
2254 	lck_rw_t        *lck)
2255 {
2256 	if (not_in_kdp) {
2257 		panic("panic: rw lock exclusive check done outside of kernel debugger");
2258 	}
2259 	return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_excl) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
2260 }
2261 
2262 void
2263 kdp_rwlck_find_owner(
2264 	__unused struct waitq   *waitq,
2265 	event64_t               event,
2266 	thread_waitinfo_t       *waitinfo)
2267 {
2268 	lck_rw_t        *rwlck = NULL;
2269 	switch (waitinfo->wait_type) {
2270 	case kThreadWaitKernelRWLockRead:
2271 		rwlck = READ_EVENT_TO_RWLOCK(event);
2272 		break;
2273 	case kThreadWaitKernelRWLockWrite:
2274 	case kThreadWaitKernelRWLockUpgrade:
2275 		rwlck = WRITE_EVENT_TO_RWLOCK(event);
2276 		break;
2277 	default:
2278 		panic("%s was called with an invalid blocking type", __FUNCTION__);
2279 		break;
2280 	}
2281 	if (rwlck->lck_rw_owner) {
2282 		thread_require(rwlck->lck_rw_owner);
2283 	}
2284 	waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
2285 	waitinfo->owner = thread_tid(rwlck->lck_rw_owner);
2286 }
2287 
2288 /*!
2289  * @function lck_rw_lock_yield_shared
2290  *
2291  * @abstract
2292  * Yields a rw_lock held in shared mode.
2293  *
2294  * @discussion
2295  * This function can block.
2296  * Yields the lock in case there are writers waiting.
2297  * The yield will unlock, block, and re-lock the lock in shared mode.
2298  *
2299  * @param lck           rw_lock already held in shared mode to yield.
2300  * @param force_yield   if set to true it will always yield irrespective of the lock status
2301  *
2302  * @returns TRUE if the lock was yielded, FALSE otherwise
2303  */
2304 boolean_t
2305 lck_rw_lock_yield_shared(
2306 	lck_rw_t        *lck,
2307 	boolean_t       force_yield)
2308 {
2309 	lck_rw_word_t   word;
2310 
2311 	lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
2312 
2313 	word.data = ordered_load_rw(lck);
2314 	if (word.want_excl || word.want_upgrade || force_yield) {
2315 		lck_rw_unlock_shared(lck);
2316 		mutex_pause(2);
2317 		lck_rw_lock_shared(lck);
2318 		return TRUE;
2319 	}
2320 
2321 	return FALSE;
2322 }
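
/*
 * Usage sketch (illustrative only; example_* is not part of this file):
 * a long scan holding the lock shared periodically offers it to waiting
 * writers; when the yield happens the lock is dropped and re-taken, so
 * the scan must be prepared to re-validate its position.
 */
__unused static void
example_yielding_scan(lck_rw_t *lock, int nentries)
{
	lck_rw_lock_shared(lock);
	for (int i = 0; i < nentries; i++) {
		/* ... examine entry i ... */
		if (lck_rw_lock_yield_shared(lock, FALSE)) {
			/* the lock was released and re-acquired: re-validate before continuing */
		}
	}
	lck_rw_unlock_shared(lock);
}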
2323 
2324 /*!
2325  * @function lck_rw_sleep
2326  *
2327  * @abstract
2328  * Assert_wait on an event while holding the rw_lock.
2329  *
2330  * @discussion
2331  * The flags decide how to re-acquire the lock upon wake up
2332  * (LCK_SLEEP_SHARED, or LCK_SLEEP_EXCLUSIVE, or LCK_SLEEP_UNLOCK)
2333  * and if the priority needs to be kept boosted until the lock is
2334  * re-acquired (LCK_SLEEP_PROMOTED_PRI).
2335  *
2336  * @param lck                   rw_lock to use to synch the assert_wait.
2337  * @param lck_sleep_action      flags.
2338  * @param event                 event to assert_wait on.
2339  * @param interruptible         wait type.
2340  */
2341 wait_result_t
2342 lck_rw_sleep(
2343 	lck_rw_t                *lck,
2344 	lck_sleep_action_t      lck_sleep_action,
2345 	event_t                 event,
2346 	wait_interrupt_t        interruptible)
2347 {
2348 	wait_result_t           res;
2349 	lck_rw_type_t           lck_rw_type;
2350 	thread_pri_floor_t      token;
2351 
2352 	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0) {
2353 		panic("Invalid lock sleep action %x", lck_sleep_action);
2354 	}
2355 
2356 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2357 		/*
2358 		 * Although we are dropping the RW lock, the intent in most cases
2359 		 * is that this thread remains as an observer, since it may hold
2360 		 * some secondary resource, but must yield to avoid deadlock. In
2361 		 * this situation, make sure that the thread is boosted to the
2362 		 * ceiling while blocked, so that it can re-acquire the
2363 		 * RW lock at that priority.
2364 		 */
2365 		token = thread_priority_floor_start();
2366 	}
2367 
2368 	res = assert_wait(event, interruptible);
2369 	if (res == THREAD_WAITING) {
2370 		lck_rw_type = lck_rw_done(lck);
2371 		res = thread_block(THREAD_CONTINUE_NULL);
2372 		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
2373 			if (!(lck_sleep_action & (LCK_SLEEP_SHARED | LCK_SLEEP_EXCLUSIVE))) {
2374 				lck_rw_lock(lck, lck_rw_type);
2375 			} else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE) {
2376 				lck_rw_lock_exclusive(lck);
2377 			} else {
2378 				lck_rw_lock_shared(lck);
2379 			}
2380 		}
2381 	} else if (lck_sleep_action & LCK_SLEEP_UNLOCK) {
2382 		(void)lck_rw_done(lck);
2383 	}
2384 
2385 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2386 		thread_priority_floor_end(&token);
2387 	}
2388 
2389 	return res;
2390 }
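
/*
 * Usage sketch (illustrative only; example_* and *flagp are not part of
 * this file): wait for a condition protected by the lock. lck_rw_sleep()
 * drops the lock while blocked and, with LCK_SLEEP_DEFAULT, re-acquires it
 * in the same mode before returning, so the condition is re-checked in a
 * loop. The waker sets the flag under the lock and calls thread_wakeup()
 * on the same event.
 */
__unused static void
example_wait_for_flag(lck_rw_t *lock, int *flagp)
{
	lck_rw_lock_exclusive(lock);
	while (*flagp == 0) {
		(void)lck_rw_sleep(lock, LCK_SLEEP_DEFAULT, (event_t)flagp, THREAD_UNINT);
	}
	/* the flag is set and the lock is held exclusive again */
	lck_rw_unlock_exclusive(lock);
}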
2391 
2392 /*!
2393  * @function lck_rw_sleep_deadline
2394  *
2395  * @abstract
2396  * Assert_wait_deadline on an event while holding the rw_lock.
2397  *
2398  * @discussion
2399  * The flags decide how to re-acquire the lock upon wake up
2400  * (LCK_SLEEP_SHARED, or LCK_SLEEP_EXCLUSIVE, or LCK_SLEEP_UNLOCK)
2401  * and if the priority needs to be kept boosted until the lock is
2402  * re-acquired (LCK_SLEEP_PROMOTED_PRI).
2403  *
2404  * @param lck                   rw_lock to use to synch the assert_wait.
2405  * @param lck_sleep_action      flags.
2406  * @param event                 event to assert_wait on.
2407  * @param interruptible         wait type.
2408  * @param deadline              maximum time after which the thread will be woken up
2409  */
2410 wait_result_t
2411 lck_rw_sleep_deadline(
2412 	lck_rw_t                *lck,
2413 	lck_sleep_action_t      lck_sleep_action,
2414 	event_t                 event,
2415 	wait_interrupt_t        interruptible,
2416 	uint64_t                deadline)
2417 {
2418 	wait_result_t           res;
2419 	lck_rw_type_t           lck_rw_type;
2420 	thread_pri_floor_t      token;
2421 
2422 	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0) {
2423 		panic("Invalid lock sleep action %x", lck_sleep_action);
2424 	}
2425 
2426 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2427 		token = thread_priority_floor_start();
2428 	}
2429 
2430 	res = assert_wait_deadline(event, interruptible, deadline);
2431 	if (res == THREAD_WAITING) {
2432 		lck_rw_type = lck_rw_done(lck);
2433 		res = thread_block(THREAD_CONTINUE_NULL);
2434 		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
2435 			if (!(lck_sleep_action & (LCK_SLEEP_SHARED | LCK_SLEEP_EXCLUSIVE))) {
2436 				lck_rw_lock(lck, lck_rw_type);
2437 			} else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE) {
2438 				lck_rw_lock_exclusive(lck);
2439 			} else {
2440 				lck_rw_lock_shared(lck);
2441 			}
2442 		}
2443 	} else if (lck_sleep_action & LCK_SLEEP_UNLOCK) {
2444 		(void)lck_rw_done(lck);
2445 	}
2446 
2447 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2448 		thread_priority_floor_end(&token);
2449 	}
2450 
2451 	return res;
2452 }
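
/*
 * Usage sketch (illustrative only; example_* and *flagp are not part of
 * this file): same pattern as above, but with a bounded wait; a
 * THREAD_TIMED_OUT result means the deadline passed, and with
 * LCK_SLEEP_DEFAULT the lock is still re-acquired before returning.
 */
__unused static boolean_t
example_wait_for_flag_timed(lck_rw_t *lock, int *flagp, uint64_t deadline)
{
	boolean_t       ok;
	wait_result_t   res;

	lck_rw_lock_exclusive(lock);
	while (*flagp == 0) {
		res = lck_rw_sleep_deadline(lock, LCK_SLEEP_DEFAULT, (event_t)flagp,
		    THREAD_UNINT, deadline);
		if (res == THREAD_TIMED_OUT) {
			break;
		}
	}
	ok = (*flagp != 0);
	lck_rw_unlock_exclusive(lock);
	return ok;
}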
2453 
2454 /*
2455  * Reader-writer lock promotion
2456  *
2457  * We support a limited form of reader-writer
2458  * lock promotion whose effects are:
2459  *
2460  *   * Qualifying threads have decay disabled
2461  *   * Scheduler priority is reset to a floor of
2462  *     their statically assigned priority
2463  *     or MINPRI_RWLOCK
2464  *
2465  * The rationale is that lck_rw_ts do not have
2466  * a single owner, so we cannot apply a directed
2467  * priority boost from all waiting threads
2468  * to all holding threads without maintaining
2469  * lists of all shared owners and all waiting
2470  * threads for every lock.
2471  *
2472  * Instead (and to preserve the uncontended fast-
2473  * path), acquiring (or attempting to acquire)
2474  * a RW lock in shared or exclusive mode increments
2475  * a per-thread counter. Only if that thread stops
2476  * making forward progress (for instance blocking
2477  * on a mutex, or being preempted) do we consult
2478  * the counter and apply the priority floor.
2479  * When the thread becomes runnable again (or in
2480  * the case of preemption it never stopped being
2481  * runnable), it has the priority boost and should
2482  * be in a good position to run on the CPU and
2483  * release all RW locks (at which point the priority
2484  * boost is cleared).
2485  *
2486  * Care must be taken to ensure that priority
2487  * boosts are not retained indefinitely, since unlike
2488  * mutex priority boosts (where the boost is tied
2489  * to the mutex lifecycle), the boost is tied
2490  * to the thread and independent of any particular
2491  * lck_rw_t. Assertions are in place on return
2492  * to userspace so that the boost is not held
2493  * indefinitely.
2494  *
2495  * The routines that increment/decrement the
2496  * per-thread counter should err on the side of
2497  * incrementing any time a preemption is possible
2498  * and the lock would be visible to the rest of the
2499  * system as held (so it should be incremented before
2500  * interlocks are dropped/preemption is enabled, or
2501  * before a CAS is executed to acquire the lock).
2502  *
2503  */
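
/*
 * Simplified sketch (illustrative only; example_* is not part of this
 * file) of how the pieces fit together: the scheduler's context-switch
 * path, with the thread locked at splsched, consults the per-thread
 * counter and applies the floor via lck_rw_set_promotion_locked(); the
 * release path (lck_rw_done_gen() above) clears it again once the last
 * RW lock is dropped.
 */
__unused static void
example_promotion_callout(thread_t thread)
{
	if (thread->rwlock_count > 0) {
		/* going off core while holding RW locks: apply the priority floor */
		lck_rw_set_promotion_locked(thread);
	}
}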
2504 
2505 /*!
2506  * @function lck_rw_clear_promotion
2507  *
2508  * @abstract
2509  * Undo priority promotions when the last rw_lock
2510  * is released by a thread (if a promotion was active).
2511  *
2512  * @param thread        thread to demote.
2513  * @param trace_obj     object reason for the demotion.
2514  */
2515 void
2516 lck_rw_clear_promotion(
2517 	thread_t thread,
2518 	uintptr_t trace_obj)
2519 {
2520 	assert(thread->rwlock_count == 0);
2521 
2522 	/* Cancel any promotions if the thread had actually blocked while holding a RW lock */
2523 	spl_t s = splsched();
2524 	thread_lock(thread);
2525 
2526 	if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
2527 		sched_thread_unpromote_reason(thread, TH_SFLAG_RW_PROMOTED, trace_obj);
2528 	}
2529 
2530 	thread_unlock(thread);
2531 	splx(s);
2532 }
2533 
2534 /*!
2535  * @function lck_rw_set_promotion_locked
2536  *
2537  * @abstract
2538  * Callout from context switch if the thread goes
2539  * off core with a positive rwlock_count.
2540  *
2541  * @discussion
2542  * Called at splsched with the thread locked.
2543  *
2544  * @param thread        thread to promote.
2545  */
2546 void
2547 lck_rw_set_promotion_locked(thread_t thread)
2548 {
2549 	if (LcksOpts & disLkRWPrio) {
2550 		return;
2551 	}
2552 
2553 	assert(thread->rwlock_count > 0);
2554 
2555 	if (!(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
2556 		sched_thread_promote_reason(thread, TH_SFLAG_RW_PROMOTED, 0);
2557 	}
2558 }
2559 
2560 #if __x86_64__
2561 void lck_rw_clear_promotions_x86(thread_t thread);
2562 /*
2563  * On return to userspace, this routine is called from assembly
2564  * if the rwlock_count is somehow imbalanced
2565  */
2566 #if MACH_LDEBUG
2567 __dead2
2568 #endif /* MACH_LDEBUG */
2569 void
2570 lck_rw_clear_promotions_x86(thread_t thread)
2571 {
2572 #if MACH_LDEBUG
2573 	/* It's fatal to leave a RW lock locked and return to userspace */
2574 	panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
2575 #else
2576 	/* Paper over the issue */
2577 	thread->rwlock_count = 0;
2578 	lck_rw_clear_promotion(thread, 0);
2579 #endif /* MACH_LDEBUG */
2580 }
2581 #endif /* __x86_64__ */
2582