xref: /xnu-8020.101.4/osfmk/kern/lock_rw.c (revision e7776783b89a353188416a9a346c6cdb4928faad)
1 /*
2  * Copyright (c) 2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 #include <debug.h>
57 #include <kern/lock_stat.h>
58 #include <kern/locks.h>
59 #include <kern/zalloc.h>
60 #include <kern/thread.h>
61 #include <kern/processor.h>
62 #include <kern/sched_prim.h>
63 #include <kern/debug.h>
64 #include <machine/atomic.h>
65 #include <machine/machine_cpu.h>
66 
67 KALLOC_TYPE_DEFINE(KT_LCK_RW, lck_rw_t, KT_PRIV_ACCT);
68 
69 #define LCK_RW_WRITER_EVENT(lck)                (event_t)((uintptr_t)(lck)+1)
70 #define LCK_RW_READER_EVENT(lck)                (event_t)((uintptr_t)(lck)+2)
71 #define WRITE_EVENT_TO_RWLOCK(event)            ((lck_rw_t *)((uintptr_t)(event)-1))
72 #define READ_EVENT_TO_RWLOCK(event)             ((lck_rw_t *)((uintptr_t)(event)-2))
73 
74 #if CONFIG_DTRACE
75 #define DTRACE_RW_SHARED        0x0     //reader
76 #define DTRACE_RW_EXCL          0x1     //writer
77 #define DTRACE_NO_FLAG          0x0     //not applicable
78 #endif  /* CONFIG_DTRACE */
79 
80 #define LCK_RW_LCK_EXCLUSIVE_CODE       0x100
81 #define LCK_RW_LCK_EXCLUSIVE1_CODE      0x101
82 #define LCK_RW_LCK_SHARED_CODE          0x102
83 #define LCK_RW_LCK_SH_TO_EX_CODE        0x103
84 #define LCK_RW_LCK_SH_TO_EX1_CODE       0x104
85 #define LCK_RW_LCK_EX_TO_SH_CODE        0x105
86 
87 #if __x86_64__
88 #define LCK_RW_LCK_EX_WRITER_SPIN_CODE  0x106
89 #define LCK_RW_LCK_EX_WRITER_WAIT_CODE  0x107
90 #define LCK_RW_LCK_EX_READER_SPIN_CODE  0x108
91 #define LCK_RW_LCK_EX_READER_WAIT_CODE  0x109
92 #define LCK_RW_LCK_SHARED_SPIN_CODE     0x110
93 #define LCK_RW_LCK_SHARED_WAIT_CODE     0x111
94 #define LCK_RW_LCK_SH_TO_EX_SPIN_CODE   0x112
95 #define LCK_RW_LCK_SH_TO_EX_WAIT_CODE   0x113
96 #endif
97 
98 #define lck_rw_ilk_lock(lock)   hw_lock_bit  ((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT, LCK_GRP_NULL)
99 #define lck_rw_ilk_unlock(lock) hw_unlock_bit((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT)
100 
101 #define ordered_load_rw(lock)                   os_atomic_load(&(lock)->lck_rw_data, compiler_acq_rel)
102 #define ordered_store_rw(lock, value)           os_atomic_store(&(lock)->lck_rw_data, (value), compiler_acq_rel)
103 #define ordered_load_rw_owner(lock)             os_atomic_load(&(lock)->lck_rw_owner, compiler_acq_rel)
104 #define ordered_store_rw_owner(lock, value)     os_atomic_store(&(lock)->lck_rw_owner, (value), compiler_acq_rel)
105 
106 #ifdef DEBUG_RW
107 static TUNABLE(bool, lck_rw_recursive_shared_assert_74048094, "lck_rw_recursive_shared_assert", false);
108 SECURITY_READ_ONLY_EARLY(vm_packing_params_t) rwlde_caller_packing_params =
109     VM_PACKING_PARAMS(LCK_RW_CALLER_PACKED);
110 #define rw_lock_debug_disabled()                ((LcksOpts & disLkRWDebug) == disLkRWDebug)
111 
112 #define set_rwlde_caller_packed(entry, caller)          ((entry)->rwlde_caller_packed = VM_PACK_POINTER((vm_offset_t)caller, LCK_RW_CALLER_PACKED))
113 #define get_rwlde_caller(entry)                         ((void*)VM_UNPACK_POINTER(entry->rwlde_caller_packed, LCK_RW_CALLER_PACKED))
114 
115 #endif /* DEBUG_RW */
116 
117 /*!
118  * @function lck_rw_alloc_init
119  *
120  * @abstract
121  * Allocates and initializes a rw_lock_t.
122  *
123  * @discussion
124  * The function can block. See lck_rw_init() for initialization details.
125  *
126  * @param grp           lock group to associate with the lock.
127  * @param attr          lock attribute to initialize the lock.
128  *
129  * @returns             NULL or the allocated lock
130  */
131 lck_rw_t *
lck_rw_alloc_init(lck_grp_t * grp,lck_attr_t * attr)132 lck_rw_alloc_init(
133 	lck_grp_t       *grp,
134 	lck_attr_t      *attr)
135 {
136 	lck_rw_t *lck;
137 
138 	lck = zalloc_flags(KT_LCK_RW, Z_WAITOK | Z_ZERO);
139 	lck_rw_init(lck, grp, attr);
140 	return lck;
141 }
142 
143 /*!
144  * @function lck_rw_init
145  *
146  * @abstract
147  * Initializes a rw_lock_t.
148  *
149  * @discussion
150  * Usage statistics for the lock are going to be added to the lock group provided.
151  *
152  * The lock attribute can be used to specify the lock contention behaviour.
153  * RW_WRITER_PRIORITY is the default behaviour (LCK_ATTR_NULL defaults to RW_WRITER_PRIORITY)
154  * and lck_attr_rw_shared_priority() can be used to set the behaviour to RW_SHARED_PRIORITY.
155  *
156  * RW_WRITER_PRIORITY gives priority to the writers upon contention with the readers;
157  * if the lock is held and a writer starts waiting for the lock, readers will not be able
158  * to acquire the lock until all writers stop contending. Readers could
159  * potentially starve.
160  * RW_SHARED_PRIORITY gives priority to the readers upon contention with the writers:
161  * unleass the lock is held in exclusive mode, readers will always be able to acquire the lock.
162  * Readers can lock a shared lock even if there are writers waiting. Writers could potentially
163  * starve.
164  *
165  * @param lck           lock to initialize.
166  * @param grp           lock group to associate with the lock.
167  * @param attr          lock attribute to initialize the lock.
168  *
169  */
170 void
lck_rw_init(lck_rw_t * lck,lck_grp_t * grp,lck_attr_t * attr)171 lck_rw_init(
172 	lck_rw_t        *lck,
173 	lck_grp_t       *grp,
174 	lck_attr_t      *attr)
175 {
176 	if (attr == LCK_ATTR_NULL) {
177 		attr = &LockDefaultLckAttr;
178 	}
179 	memset(lck, 0, sizeof(lck_rw_t));
180 	lck->lck_rw_can_sleep = TRUE;
181 	if ((attr->lck_attr_val & LCK_ATTR_RW_SHARED_PRIORITY) == 0) {
182 		lck->lck_rw_priv_excl = TRUE;
183 	}
184 	lck_grp_reference(grp, &grp->lck_grp_rwcnt);
185 }
186 
187 /*!
188  * @function lck_rw_free
189  *
190  * @abstract
191  * Frees a rw_lock previously allocated with lck_rw_alloc_init().
192  *
193  * @discussion
194  * The lock must be not held by any thread.
195  *
196  * @param lck           rw_lock to free.
197  */
198 void
lck_rw_free(lck_rw_t * lck,lck_grp_t * grp)199 lck_rw_free(
200 	lck_rw_t        *lck,
201 	lck_grp_t       *grp)
202 {
203 	lck_rw_destroy(lck, grp);
204 	zfree(KT_LCK_RW, lck);
205 }
206 
207 /*!
208  * @function lck_rw_destroy
209  *
210  * @abstract
211  * Destroys a rw_lock previously initialized with lck_rw_init().
212  *
213  * @discussion
214  * The lock must be not held by any thread.
215  *
216  * @param lck           rw_lock to destroy.
217  */
218 void
lck_rw_destroy(lck_rw_t * lck,lck_grp_t * grp)219 lck_rw_destroy(
220 	lck_rw_t        *lck,
221 	lck_grp_t       *grp)
222 {
223 	if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) {
224 		panic("Destroying previously destroyed lock %p", lck);
225 	}
226 	lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
227 
228 	lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
229 	lck_grp_deallocate(grp, &grp->lck_grp_rwcnt);
230 }
231 
232 #ifdef DEBUG_RW
233 
234 /*
235  * Best effort mechanism to debug rw_locks.
236  *
237  * This mechanism is in addition to the owner checks. The owner is set
238  * only when the lock is held in exclusive mode so the checks do not cover
239  * the cases in which the lock is held in shared mode.
240  *
241  * This mechanism tentatively stores the rw_lock acquired and its debug
242  * information on the thread struct.
243  * Just up to LCK_RW_EXPECTED_MAX_NUMBER rw lock debug information can be stored.
244  *
245  * NOTE: LCK_RW_EXPECTED_MAX_NUMBER is the expected number of rw_locks held
246  * at the same time. If a thread holds more than this number of rw_locks we
247  * will start losing debug information.
248  * Increasing LCK_RW_EXPECTED_MAX_NUMBER will increase the probability we will
249  * store the debug information but it will require more memory per thread
250  * and longer lock/unlock time.
251  *
252  * If an empty slot is found for the debug information, we record the lock
253  * otherwise we set the overflow threshold flag.
254  *
255  * If we reached the overflow threshold we might stop asserting because we cannot be sure
256  * anymore if the lock was acquired or not.
257  *
258  * Even if we reached the overflow threshold, we try to store the debug information
259  * for the new locks acquired. This can be useful in core dumps to debug
260  * possible return to userspace without unlocking and to find possible readers
261  * holding the lock.
262  */
263 __startup_func
264 static void
rw_lock_init(void)265 rw_lock_init(void)
266 {
267 	if (kern_feature_override(KF_RW_LOCK_DEBUG_OVRD)) {
268 		LcksOpts |= disLkRWDebug;
269 	}
270 }
271 STARTUP(LOCKS_EARLY, STARTUP_RANK_FIRST, rw_lock_init);
272 
273 static inline struct rw_lock_debug_entry *
find_lock_in_savedlocks(lck_rw_t * lock,rw_lock_debug_t * rw_locks_held)274 find_lock_in_savedlocks(lck_rw_t* lock, rw_lock_debug_t *rw_locks_held)
275 {
276 	int i;
277 	for (i = 0; i < LCK_RW_EXPECTED_MAX_NUMBER; i++) {
278 		struct rw_lock_debug_entry *existing = &rw_locks_held->rwld_locks[i];
279 		if (existing->rwlde_lock == lock) {
280 			return existing;
281 		}
282 	}
283 
284 	return NULL;
285 }
286 
287 __abortlike
288 static void
rwlock_slot_panic(rw_lock_debug_t * rw_locks_held)289 rwlock_slot_panic(rw_lock_debug_t *rw_locks_held)
290 {
291 	panic("No empty slot found in %p slot_used %d", rw_locks_held, rw_locks_held->rwld_locks_saved);
292 }
293 
294 static inline struct rw_lock_debug_entry *
find_empty_slot(rw_lock_debug_t * rw_locks_held)295 find_empty_slot(rw_lock_debug_t *rw_locks_held)
296 {
297 	int i;
298 	for (i = 0; i < LCK_RW_EXPECTED_MAX_NUMBER; i++) {
299 		struct rw_lock_debug_entry *entry = &rw_locks_held->rwld_locks[i];
300 		if (entry->rwlde_lock == NULL) {
301 			return entry;
302 		}
303 	}
304 	rwlock_slot_panic(rw_locks_held);
305 }
306 
307 __abortlike
308 static void
canlock_rwlock_panic(lck_rw_t * lock,thread_t thread,struct rw_lock_debug_entry * entry)309 canlock_rwlock_panic(lck_rw_t* lock, thread_t thread, struct rw_lock_debug_entry *entry)
310 {
311 	panic("RW lock %p already held by %p caller %p mode_count %d state 0x%x owner 0x%p ",
312 	    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
313 	    ordered_load_rw(lock), ordered_load_rw_owner(lock));
314 }
315 
316 static inline void
assert_canlock_rwlock(lck_rw_t * lock,thread_t thread,lck_rw_type_t type)317 assert_canlock_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
318 {
319 	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
320 
321 	if (__probable(rw_lock_debug_disabled() || (rw_locks_held->rwld_locks_acquired == 0))) {
322 		//no locks saved, safe to lock
323 		return;
324 	}
325 
326 	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
327 	if (__improbable(entry != NULL)) {
328 		boolean_t can_be_shared_recursive;
329 		if (lck_rw_recursive_shared_assert_74048094) {
330 			can_be_shared_recursive = (lock->lck_rw_priv_excl == 0);
331 		} else {
332 			/* currently rw_lock_shared is called recursively,
333 			 * until the code is fixed allow to lock
334 			 * recursively in shared mode
335 			 */
336 			can_be_shared_recursive = TRUE;
337 		}
338 		if ((type == LCK_RW_TYPE_SHARED) && can_be_shared_recursive && entry->rwlde_mode_count >= 1) {
339 			return;
340 		}
341 		canlock_rwlock_panic(lock, thread, entry);
342 	}
343 }
344 
345 __abortlike
346 static void
held_rwlock_notheld_panic(lck_rw_t * lock,thread_t thread)347 held_rwlock_notheld_panic(lck_rw_t* lock, thread_t thread)
348 {
349 	panic("RW lock %p not held by %p", lock, thread);
350 }
351 
352 __abortlike
353 static void
held_rwlock_notheld_with_info_panic(lck_rw_t * lock,thread_t thread,lck_rw_type_t type,struct rw_lock_debug_entry * entry)354 held_rwlock_notheld_with_info_panic(lck_rw_t* lock, thread_t thread, lck_rw_type_t type, struct rw_lock_debug_entry *entry)
355 {
356 	if (type == LCK_RW_TYPE_EXCLUSIVE) {
357 		panic("RW lock %p not held in exclusive by %p caller %p read %d state 0x%x owner 0x%p ",
358 		    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
359 		    ordered_load_rw(lock), ordered_load_rw_owner(lock));
360 	} else {
361 		panic("RW lock %p not held in shared by %p caller %p read %d state 0x%x owner 0x%p ",
362 		    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
363 		    ordered_load_rw(lock), ordered_load_rw_owner(lock));
364 	}
365 }
366 
367 static inline void
assert_held_rwlock(lck_rw_t * lock,thread_t thread,lck_rw_type_t type)368 assert_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
369 {
370 	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
371 
372 	if (__probable(rw_lock_debug_disabled())) {
373 		return;
374 	}
375 
376 	if (__improbable(rw_locks_held->rwld_locks_acquired == 0 || rw_locks_held->rwld_locks_saved == 0)) {
377 		if (rw_locks_held->rwld_locks_acquired == 0 || rw_locks_held->rwld_overflow == 0) {
378 			held_rwlock_notheld_panic(lock, thread);
379 		}
380 		return;
381 	}
382 
383 	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
384 	if (__probable(entry != NULL)) {
385 		if (type == LCK_RW_TYPE_EXCLUSIVE && entry->rwlde_mode_count != -1) {
386 			held_rwlock_notheld_with_info_panic(lock, thread, type, entry);
387 		} else {
388 			if (type == LCK_RW_TYPE_SHARED && entry->rwlde_mode_count <= 0) {
389 				held_rwlock_notheld_with_info_panic(lock, thread, type, entry);
390 			}
391 		}
392 	} else {
393 		if (rw_locks_held->rwld_overflow == 0) {
394 			held_rwlock_notheld_panic(lock, thread);
395 		}
396 	}
397 }
398 
399 static inline void
change_held_rwlock(lck_rw_t * lock,thread_t thread,lck_rw_type_t typeFrom,void * caller)400 change_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t typeFrom, void* caller)
401 {
402 	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
403 
404 	if (__probable(rw_lock_debug_disabled())) {
405 		return;
406 	}
407 
408 	if (__improbable(rw_locks_held->rwld_locks_saved == 0)) {
409 		if (rw_locks_held->rwld_overflow == 0) {
410 			held_rwlock_notheld_panic(lock, thread);
411 		}
412 		return;
413 	}
414 
415 	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
416 	if (__probable(entry != NULL)) {
417 		if (typeFrom == LCK_RW_TYPE_SHARED) {
418 			//We are upgrading
419 			assertf(entry->rwlde_mode_count == 1,
420 			    "RW lock %p not held by a single shared when upgrading "
421 			    "by %p caller %p read %d state 0x%x owner 0x%p ",
422 			    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
423 			    ordered_load_rw(lock), ordered_load_rw_owner(lock));
424 			entry->rwlde_mode_count = -1;
425 			set_rwlde_caller_packed(entry, caller);
426 		} else {
427 			//We are downgrading
428 			assertf(entry->rwlde_mode_count == -1,
429 			    "RW lock %p not held in write mode when downgrading "
430 			    "by %p caller %p read %d state 0x%x owner 0x%p ",
431 			    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
432 			    ordered_load_rw(lock), ordered_load_rw_owner(lock));
433 			entry->rwlde_mode_count = 1;
434 			set_rwlde_caller_packed(entry, caller);
435 		}
436 		return;
437 	}
438 
439 	if (rw_locks_held->rwld_overflow == 0) {
440 		held_rwlock_notheld_panic(lock, thread);
441 	}
442 
443 	if (rw_locks_held->rwld_locks_saved == LCK_RW_EXPECTED_MAX_NUMBER) {
444 		//array is full
445 		return;
446 	}
447 
448 	struct rw_lock_debug_entry *null_entry = find_empty_slot(rw_locks_held);
449 	null_entry->rwlde_lock = lock;
450 	set_rwlde_caller_packed(null_entry, caller);
451 	if (typeFrom == LCK_RW_TYPE_SHARED) {
452 		null_entry->rwlde_mode_count = -1;
453 	} else {
454 		null_entry->rwlde_mode_count = 1;
455 	}
456 	rw_locks_held->rwld_locks_saved++;
457 }
458 
459 __abortlike
460 static void
add_held_rwlock_too_many_panic(thread_t thread)461 add_held_rwlock_too_many_panic(thread_t thread)
462 {
463 	panic("RW lock too many rw locks held, rwld_locks_acquired maxed out for thread %p", thread);
464 }
465 
466 static inline void
add_held_rwlock(lck_rw_t * lock,thread_t thread,lck_rw_type_t type,void * caller)467 add_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type, void* caller)
468 {
469 	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
470 	struct rw_lock_debug_entry *null_entry;
471 
472 	if (__probable(rw_lock_debug_disabled())) {
473 		return;
474 	}
475 
476 	if (__improbable(rw_locks_held->rwld_locks_acquired == UINT32_MAX)) {
477 		add_held_rwlock_too_many_panic(thread);
478 	}
479 	rw_locks_held->rwld_locks_acquired++;
480 
481 	if (type == LCK_RW_TYPE_EXCLUSIVE) {
482 		if (__improbable(rw_locks_held->rwld_locks_saved == LCK_RW_EXPECTED_MAX_NUMBER)) {
483 			//array is full
484 			rw_locks_held->rwld_overflow = 1;
485 			return;
486 		}
487 		null_entry = find_empty_slot(rw_locks_held);
488 		null_entry->rwlde_lock = lock;
489 		set_rwlde_caller_packed(null_entry, caller);
490 		null_entry->rwlde_mode_count = -1;
491 		rw_locks_held->rwld_locks_saved++;
492 		return;
493 	} else {
494 		if (__probable(rw_locks_held->rwld_locks_saved == 0)) {
495 			//array is empty
496 			goto add_shared;
497 		}
498 
499 		boolean_t allow_shared_recursive;
500 		if (lck_rw_recursive_shared_assert_74048094) {
501 			allow_shared_recursive = (lock->lck_rw_priv_excl == 0);
502 		} else {
503 			allow_shared_recursive = TRUE;
504 		}
505 		if (allow_shared_recursive) {
506 			//It could be already locked in shared mode
507 			struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
508 			if (entry != NULL) {
509 				assert(entry->rwlde_mode_count > 0);
510 				assertf(entry->rwlde_mode_count != INT8_MAX,
511 				    "RW lock %p with too many recursive shared held "
512 				    "from %p caller %p read %d state 0x%x owner 0x%p",
513 				    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
514 				    ordered_load_rw(lock), ordered_load_rw_owner(lock));
515 				entry->rwlde_mode_count += 1;
516 				return;
517 			}
518 		}
519 
520 		//none of the locks were a match
521 		//try to add a new entry
522 		if (__improbable(rw_locks_held->rwld_locks_saved == LCK_RW_EXPECTED_MAX_NUMBER)) {
523 			//array is full
524 			rw_locks_held->rwld_overflow = 1;
525 			return;
526 		}
527 
528 add_shared:
529 		null_entry = find_empty_slot(rw_locks_held);
530 		null_entry->rwlde_lock = lock;
531 		set_rwlde_caller_packed(null_entry, caller);
532 		null_entry->rwlde_mode_count = 1;
533 		rw_locks_held->rwld_locks_saved++;
534 	}
535 }
536 
537 static inline void
remove_held_rwlock(lck_rw_t * lock,thread_t thread,lck_rw_type_t type)538 remove_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
539 {
540 	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
541 
542 	if (__probable(rw_lock_debug_disabled())) {
543 		return;
544 	}
545 
546 	if (__improbable(rw_locks_held->rwld_locks_acquired == 0)) {
547 		return;
548 	}
549 	rw_locks_held->rwld_locks_acquired--;
550 
551 	if (rw_locks_held->rwld_locks_saved == 0) {
552 		assert(rw_locks_held->rwld_overflow == 1);
553 		goto out;
554 	}
555 
556 	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
557 	if (__probable(entry != NULL)) {
558 		if (type == LCK_RW_TYPE_EXCLUSIVE) {
559 			assert(entry->rwlde_mode_count == -1);
560 			entry->rwlde_mode_count = 0;
561 		} else {
562 			assert(entry->rwlde_mode_count > 0);
563 			entry->rwlde_mode_count--;
564 			if (entry->rwlde_mode_count > 0) {
565 				goto out;
566 			}
567 		}
568 		entry->rwlde_caller_packed = 0;
569 		entry->rwlde_lock = NULL;
570 		rw_locks_held->rwld_locks_saved--;
571 	} else {
572 		assert(rw_locks_held->rwld_overflow == 1);
573 	}
574 
575 out:
576 	if (rw_locks_held->rwld_locks_acquired == 0) {
577 		rw_locks_held->rwld_overflow = 0;
578 	}
579 	return;
580 }
581 #endif /* DEBUG_RW */
582 
583 /*
584  * We disable interrupts while holding the RW interlock to prevent an
585  * interrupt from exacerbating hold time.
586  * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
587  */
588 static inline boolean_t
lck_interlock_lock(lck_rw_t * lck)589 lck_interlock_lock(
590 	lck_rw_t        *lck)
591 {
592 	boolean_t       istate;
593 
594 	istate = ml_set_interrupts_enabled(FALSE);
595 	lck_rw_ilk_lock(lck);
596 	return istate;
597 }
598 
599 static inline void
lck_interlock_unlock(lck_rw_t * lck,boolean_t istate)600 lck_interlock_unlock(
601 	lck_rw_t        *lck,
602 	boolean_t       istate)
603 {
604 	lck_rw_ilk_unlock(lck);
605 	ml_set_interrupts_enabled(istate);
606 }
607 
608 static inline void
lck_rw_inc_thread_count(thread_t thread)609 lck_rw_inc_thread_count(
610 	thread_t thread)
611 {
612 	__assert_only uint32_t prev_rwlock_count;
613 
614 	prev_rwlock_count = thread->rwlock_count++;
615 #if MACH_ASSERT
616 	/*
617 	 * Set the ast to check that the
618 	 * rwlock_count is going to be set to zero when
619 	 * going back to userspace.
620 	 * Set it only once when we increment it for the first time.
621 	 */
622 	if (prev_rwlock_count == 0) {
623 		act_set_debug_assert();
624 	}
625 #endif
626 }
627 
628 /*
629  * compute the deadline to spin against when
630  * waiting for a change of state on a lck_rw_t
631  */
632 static inline uint64_t
lck_rw_deadline_for_spin(lck_rw_t * lck)633 lck_rw_deadline_for_spin(
634 	lck_rw_t        *lck)
635 {
636 	lck_rw_word_t   word;
637 
638 	word.data = ordered_load_rw(lck);
639 	if (word.can_sleep) {
640 		if (word.r_waiting || word.w_waiting || (word.shared_count > machine_info.max_cpus)) {
641 			/*
642 			 * there are already threads waiting on this lock... this
643 			 * implies that they have spun beyond their deadlines waiting for
644 			 * the desired state to show up so we will not bother spinning at this time...
645 			 *   or
646 			 * the current number of threads sharing this lock exceeds our capacity to run them
647 			 * concurrently and since all states we're going to spin for require the rw_shared_count
648 			 * to be at 0, we'll not bother spinning since the latency for this to happen is
649 			 * unpredictable...
650 			 */
651 			return mach_absolute_time();
652 		}
653 		return mach_absolute_time() + os_atomic_load(&MutexSpin, relaxed);
654 	} else {
655 		return mach_absolute_time() + (100000LL * 1000000000LL);
656 	}
657 }
658 
659 /*
660  * This inline is used when busy-waiting for an rw lock.
661  * If interrupts were disabled when the lock primitive was called,
662  * we poll the IPI handler for pending tlb flushes in x86.
663  */
664 static inline void
lck_rw_lock_pause(boolean_t interrupts_enabled)665 lck_rw_lock_pause(
666 	boolean_t       interrupts_enabled)
667 {
668 #if X86_64
669 	if (!interrupts_enabled) {
670 		handle_pending_TLB_flushes();
671 	}
672 	cpu_pause();
673 #else
674 	(void) interrupts_enabled;
675 	wait_for_event();
676 #endif
677 }
678 
679 static boolean_t
lck_rw_drain_status(lck_rw_t * lock,uint32_t status_mask,boolean_t wait)680 lck_rw_drain_status(
681 	lck_rw_t        *lock,
682 	uint32_t        status_mask,
683 	boolean_t       wait)
684 {
685 	uint64_t        deadline = 0;
686 	uint32_t        data;
687 	boolean_t       istate = FALSE;
688 
689 	if (wait) {
690 		deadline = lck_rw_deadline_for_spin(lock);
691 #if __x86_64__
692 		istate = ml_get_interrupts_enabled();
693 #endif
694 	}
695 
696 	for (;;) {
697 #if __x86_64__
698 		data = os_atomic_load(&lock->lck_rw_data, relaxed);
699 #else
700 		data = load_exclusive32(&lock->lck_rw_data, memory_order_acquire_smp);
701 #endif
702 		if ((data & status_mask) == 0) {
703 			break;
704 		}
705 		if (wait) {
706 			lck_rw_lock_pause(istate);
707 		} else {
708 			atomic_exchange_abort();
709 		}
710 		if (!wait || (mach_absolute_time() >= deadline)) {
711 			return FALSE;
712 		}
713 	}
714 	atomic_exchange_abort();
715 	return TRUE;
716 }
717 
718 /*
719  * Spin while interlock is held.
720  */
721 static inline void
lck_rw_interlock_spin(lck_rw_t * lock)722 lck_rw_interlock_spin(
723 	lck_rw_t        *lock)
724 {
725 	uint32_t        data, prev;
726 
727 	for (;;) {
728 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_relaxed);
729 		if (data & LCK_RW_INTERLOCK) {
730 #if __x86_64__
731 			cpu_pause();
732 #else
733 			wait_for_event();
734 #endif
735 		} else {
736 			atomic_exchange_abort();
737 			return;
738 		}
739 	}
740 }
741 
742 #define LCK_RW_GRAB_WANT        0
743 #define LCK_RW_GRAB_SHARED      1
744 
745 static boolean_t
lck_rw_grab(lck_rw_t * lock,int mode,boolean_t wait)746 lck_rw_grab(
747 	lck_rw_t        *lock,
748 	int             mode,
749 	boolean_t       wait)
750 {
751 	uint64_t        deadline = 0;
752 	uint32_t        data, prev;
753 	boolean_t       do_exch, istate = FALSE;
754 
755 	if (wait) {
756 		deadline = lck_rw_deadline_for_spin(lock);
757 #if __x86_64__
758 		istate = ml_get_interrupts_enabled();
759 #endif
760 	}
761 
762 	for (;;) {
763 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
764 		if (data & LCK_RW_INTERLOCK) {
765 			atomic_exchange_abort();
766 			lck_rw_interlock_spin(lock);
767 			continue;
768 		}
769 		do_exch = FALSE;
770 		if (mode == LCK_RW_GRAB_WANT) {
771 			if ((data & LCK_RW_WANT_EXCL) == 0) {
772 				data |= LCK_RW_WANT_EXCL;
773 				do_exch = TRUE;
774 			}
775 		} else {        // LCK_RW_GRAB_SHARED
776 			if (((data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) == 0) ||
777 			    (((data & LCK_RW_SHARED_MASK)) && ((data & LCK_RW_PRIV_EXCL) == 0))) {
778 				data += LCK_RW_SHARED_READER;
779 				do_exch = TRUE;
780 			}
781 		}
782 		if (do_exch) {
783 			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
784 				return TRUE;
785 			}
786 		} else {
787 			if (wait) {
788 				lck_rw_lock_pause(istate);
789 			} else {
790 				atomic_exchange_abort();
791 			}
792 			if (!wait || (mach_absolute_time() >= deadline)) {
793 				return FALSE;
794 			}
795 		}
796 	}
797 }
798 
799 static void
lck_rw_lock_exclusive_gen(lck_rw_t * lock)800 lck_rw_lock_exclusive_gen(
801 	lck_rw_t        *lock)
802 {
803 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
804 	lck_rw_word_t           word;
805 	int                     slept = 0;
806 	boolean_t               gotlock = 0;
807 	boolean_t               not_shared_or_upgrade = 0;
808 	wait_result_t           res = 0;
809 	boolean_t               istate;
810 
811 #if     CONFIG_DTRACE
812 	boolean_t dtrace_ls_initialized = FALSE;
813 	boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
814 	uint64_t wait_interval = 0;
815 	int readers_at_sleep = 0;
816 #endif
817 
818 	__assert_only thread_t owner = ordered_load_rw_owner(lock);
819 	assertf(owner != current_thread(), "Lock already held state=0x%x, owner=%p",
820 	    ordered_load_rw(lock), owner);
821 
822 #ifdef DEBUG_RW
823 	/*
824 	 * Best effort attempt to check that this thread
825 	 * is not already holding the lock (this checks read mode too).
826 	 */
827 	assert_canlock_rwlock(lock, current_thread(), LCK_RW_TYPE_EXCLUSIVE);
828 #endif /* DEBUG_RW */
829 
830 	/*
831 	 *	Try to acquire the lck_rw_want_excl bit.
832 	 */
833 	while (!lck_rw_grab(lock, LCK_RW_GRAB_WANT, FALSE)) {
834 #if     CONFIG_DTRACE
835 		if (dtrace_ls_initialized == FALSE) {
836 			dtrace_ls_initialized = TRUE;
837 			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
838 			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
839 			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
840 			if (dtrace_ls_enabled) {
841 				/*
842 				 * Either sleeping or spinning is happening,
843 				 *  start a timing of our delay interval now.
844 				 */
845 				readers_at_sleep = lock->lck_rw_shared_count;
846 				wait_interval = mach_absolute_time();
847 			}
848 		}
849 #endif
850 
851 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
852 
853 		gotlock = lck_rw_grab(lock, LCK_RW_GRAB_WANT, TRUE);
854 
855 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
856 
857 		if (gotlock) {
858 			break;
859 		}
860 		/*
861 		 * if we get here, the deadline has expired w/o us
862 		 * being able to grab the lock exclusively
863 		 * check to see if we're allowed to do a thread_block
864 		 */
865 		word.data = ordered_load_rw(lock);
866 		if (word.can_sleep) {
867 			istate = lck_interlock_lock(lock);
868 			word.data = ordered_load_rw(lock);
869 
870 			if (word.want_excl) {
871 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
872 
873 				word.w_waiting = 1;
874 				ordered_store_rw(lock, word.data);
875 
876 				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
877 				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
878 				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
879 				lck_interlock_unlock(lock, istate);
880 				if (res == THREAD_WAITING) {
881 					res = thread_block(THREAD_CONTINUE_NULL);
882 					slept++;
883 				}
884 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
885 			} else {
886 				word.want_excl = 1;
887 				ordered_store_rw(lock, word.data);
888 				lck_interlock_unlock(lock, istate);
889 				break;
890 			}
891 		}
892 	}
893 	/*
894 	 * Wait for readers (and upgrades) to finish...
895 	 */
896 	while (!lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, FALSE)) {
897 #if     CONFIG_DTRACE
898 		/*
899 		 * Either sleeping or spinning is happening, start
900 		 * a timing of our delay interval now.  If we set it
901 		 * to -1 we don't have accurate data so we cannot later
902 		 * decide to record a dtrace spin or sleep event.
903 		 */
904 		if (dtrace_ls_initialized == FALSE) {
905 			dtrace_ls_initialized = TRUE;
906 			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
907 			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
908 			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
909 			if (dtrace_ls_enabled) {
910 				/*
911 				 * Either sleeping or spinning is happening,
912 				 *  start a timing of our delay interval now.
913 				 */
914 				readers_at_sleep = lock->lck_rw_shared_count;
915 				wait_interval = mach_absolute_time();
916 			}
917 		}
918 #endif
919 
920 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
921 
922 		not_shared_or_upgrade = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, TRUE);
923 
924 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, not_shared_or_upgrade, 0);
925 
926 		if (not_shared_or_upgrade) {
927 			break;
928 		}
929 		/*
930 		 * if we get here, the deadline has expired w/o us
931 		 * being able to grab the lock exclusively
932 		 * check to see if we're allowed to do a thread_block
933 		 */
934 		word.data = ordered_load_rw(lock);
935 		if (word.can_sleep) {
936 			istate = lck_interlock_lock(lock);
937 			word.data = ordered_load_rw(lock);
938 
939 			if (word.shared_count != 0 || word.want_upgrade) {
940 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
941 
942 				word.w_waiting = 1;
943 				ordered_store_rw(lock, word.data);
944 
945 				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
946 				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
947 				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
948 				lck_interlock_unlock(lock, istate);
949 
950 				if (res == THREAD_WAITING) {
951 					res = thread_block(THREAD_CONTINUE_NULL);
952 					slept++;
953 				}
954 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
955 			} else {
956 				lck_interlock_unlock(lock, istate);
957 				/*
958 				 * must own the lock now, since we checked for
959 				 * readers or upgrade owner behind the interlock
960 				 * no need for a call to 'lck_rw_drain_status'
961 				 */
962 				break;
963 			}
964 		}
965 	}
966 
967 #if     CONFIG_DTRACE
968 	/*
969 	 * Decide what latencies we suffered that are Dtrace events.
970 	 * If we have set wait_interval, then we either spun or slept.
971 	 * At least we get out from under the interlock before we record
972 	 * which is the best we can do here to minimize the impact
973 	 * of the tracing.
974 	 * If we have set wait_interval to -1, then dtrace was not enabled when we
975 	 * started sleeping/spinning so we don't record this event.
976 	 */
977 	if (dtrace_ls_enabled == TRUE) {
978 		if (slept == 0) {
979 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lock,
980 			    mach_absolute_time() - wait_interval, 1);
981 		} else {
982 			/*
983 			 * For the blocking case, we also record if when we blocked
984 			 * it was held for read or write, and how many readers.
985 			 * Notice that above we recorded this before we dropped
986 			 * the interlock so the count is accurate.
987 			 */
988 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lock,
989 			    mach_absolute_time() - wait_interval, 1,
990 			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
991 		}
992 	}
993 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, 1);
994 #endif  /* CONFIG_DTRACE */
995 }
996 
997 #define LCK_RW_LOCK_EXCLUSIVE_TAS(lck) (atomic_test_and_set32(&(lck)->lck_rw_data, \
998 	    (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK), \
999 	    LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE))
1000 /*!
1001  * @function lck_rw_lock_exclusive_check_contended
1002  *
1003  * @abstract
1004  * Locks a rw_lock in exclusive mode.
1005  *
1006  * @discussion
1007  * This routine IS EXPERIMENTAL.
1008  * It's only used for the vm object lock, and use for other subsystems is UNSUPPORTED.
1009  * Note that the return value is ONLY A HEURISTIC w.r.t. the lock's contention.
1010  *
1011  * @param lock           rw_lock to lock.
1012  *
1013  * @returns Returns TRUE if the thread spun or blocked while attempting to acquire the lock, FALSE
1014  *          otherwise.
1015  */
1016 bool
lck_rw_lock_exclusive_check_contended(lck_rw_t * lock)1017 lck_rw_lock_exclusive_check_contended(
1018 	lck_rw_t        *lock)
1019 {
1020 	thread_t        thread = current_thread();
1021 	bool            contended  = false;
1022 
1023 	if (lock->lck_rw_can_sleep) {
1024 		lck_rw_inc_thread_count(thread);
1025 	} else if (get_preemption_level() == 0) {
1026 		panic("Taking non-sleepable RW lock with preemption enabled");
1027 	}
1028 
1029 	if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
1030 #if     CONFIG_DTRACE
1031 		LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1032 #endif  /* CONFIG_DTRACE */
1033 	} else {
1034 		contended = true;
1035 		lck_rw_lock_exclusive_gen(lock);
1036 	}
1037 	__assert_only thread_t owner = ordered_load_rw_owner(lock);
1038 	assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1039 
1040 	ordered_store_rw_owner(lock, thread);
1041 
1042 #ifdef DEBUG_RW
1043 	add_held_rwlock(lock, thread, LCK_RW_TYPE_EXCLUSIVE, __builtin_return_address(0));
1044 #endif /* DEBUG_RW */
1045 	return contended;
1046 }
1047 
1048 __attribute__((always_inline))
1049 static void
lck_rw_lock_exclusive_internal_inline(lck_rw_t * lock,void * caller)1050 lck_rw_lock_exclusive_internal_inline(
1051 	lck_rw_t        *lock,
1052 	void            *caller)
1053 {
1054 #pragma unused(caller)
1055 	thread_t        thread = current_thread();
1056 
1057 	if (lock->lck_rw_can_sleep) {
1058 		lck_rw_inc_thread_count(thread);
1059 	} else if (get_preemption_level() == 0) {
1060 		panic("Taking non-sleepable RW lock with preemption enabled");
1061 	}
1062 
1063 	if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
1064 #if     CONFIG_DTRACE
1065 		LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1066 #endif  /* CONFIG_DTRACE */
1067 	} else {
1068 		lck_rw_lock_exclusive_gen(lock);
1069 	}
1070 
1071 	__assert_only thread_t owner = ordered_load_rw_owner(lock);
1072 	assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1073 
1074 	ordered_store_rw_owner(lock, thread);
1075 
1076 #if DEBUG_RW
1077 	add_held_rwlock(lock, thread, LCK_RW_TYPE_EXCLUSIVE, caller);
1078 #endif /* DEBUG_RW */
1079 }
1080 
1081 __attribute__((noinline))
1082 static void
lck_rw_lock_exclusive_internal(lck_rw_t * lock,void * caller)1083 lck_rw_lock_exclusive_internal(
1084 	lck_rw_t        *lock,
1085 	void            *caller)
1086 {
1087 	lck_rw_lock_exclusive_internal_inline(lock, caller);
1088 }
1089 
1090 /*!
1091  * @function lck_rw_lock_exclusive
1092  *
1093  * @abstract
1094  * Locks a rw_lock in exclusive mode.
1095  *
1096  * @discussion
1097  * This function can block.
1098  * Multiple threads can acquire the lock in shared mode at the same time, but only one thread at a time
1099  * can acquire it in exclusive mode.
1100  * NOTE: the thread cannot return to userspace while the lock is held. Recursive locking is not supported.
1101  *
1102  * @param lock           rw_lock to lock.
1103  */
1104 void
lck_rw_lock_exclusive(lck_rw_t * lock)1105 lck_rw_lock_exclusive(
1106 	lck_rw_t        *lock)
1107 {
1108 	lck_rw_lock_exclusive_internal_inline(lock, __builtin_return_address(0));
1109 }
1110 
1111 /*
1112  *	Routine:	lck_rw_lock_shared_gen
1113  *	Function:
1114  *		Fast path code has determined that this lock
1115  *		is held exclusively... this is where we spin/block
1116  *		until we can acquire the lock in the shared mode
1117  */
1118 static void
lck_rw_lock_shared_gen(lck_rw_t * lck)1119 lck_rw_lock_shared_gen(
1120 	lck_rw_t        *lck)
1121 {
1122 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1123 	lck_rw_word_t           word;
1124 	boolean_t               gotlock = 0;
1125 	int                     slept = 0;
1126 	wait_result_t           res = 0;
1127 	boolean_t               istate;
1128 
1129 #if     CONFIG_DTRACE
1130 	uint64_t wait_interval = 0;
1131 	int readers_at_sleep = 0;
1132 	boolean_t dtrace_ls_initialized = FALSE;
1133 	boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1134 #endif /* CONFIG_DTRACE */
1135 
1136 	__assert_only thread_t owner = ordered_load_rw_owner(lck);
1137 	assertf(owner != current_thread(), "Lock already held state=0x%x, owner=%p",
1138 	    ordered_load_rw(lck), owner);
1139 #ifdef DEBUG_RW
1140 	/*
1141 	 * Best effort attempt to check that this thread
1142 	 * is not already holding the lock in shared mode.
1143 	 */
1144 	assert_canlock_rwlock(lck, current_thread(), LCK_RW_TYPE_SHARED);
1145 #endif
1146 
1147 	while (!lck_rw_grab(lck, LCK_RW_GRAB_SHARED, FALSE)) {
1148 #if     CONFIG_DTRACE
1149 		if (dtrace_ls_initialized == FALSE) {
1150 			dtrace_ls_initialized = TRUE;
1151 			dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1152 			dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1153 			dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1154 			if (dtrace_ls_enabled) {
1155 				/*
1156 				 * Either sleeping or spinning is happening,
1157 				 *  start a timing of our delay interval now.
1158 				 */
1159 				readers_at_sleep = lck->lck_rw_shared_count;
1160 				wait_interval = mach_absolute_time();
1161 			}
1162 		}
1163 #endif
1164 
1165 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1166 		    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, 0, 0);
1167 
1168 		gotlock = lck_rw_grab(lck, LCK_RW_GRAB_SHARED, TRUE);
1169 
1170 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1171 		    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, gotlock, 0);
1172 
1173 		if (gotlock) {
1174 			break;
1175 		}
1176 		/*
1177 		 * if we get here, the deadline has expired w/o us
1178 		 * being able to grab the lock for read
1179 		 * check to see if we're allowed to do a thread_block
1180 		 */
1181 		if (lck->lck_rw_can_sleep) {
1182 			istate = lck_interlock_lock(lck);
1183 
1184 			word.data = ordered_load_rw(lck);
1185 			if ((word.want_excl || word.want_upgrade) &&
1186 			    ((word.shared_count == 0) || word.priv_excl)) {
1187 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1188 				    trace_lck, word.want_excl, word.want_upgrade, 0, 0);
1189 
1190 				word.r_waiting = 1;
1191 				ordered_store_rw(lck, word.data);
1192 
1193 				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
1194 				res = assert_wait(LCK_RW_READER_EVENT(lck),
1195 				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1196 				lck_interlock_unlock(lck, istate);
1197 
1198 				if (res == THREAD_WAITING) {
1199 					res = thread_block(THREAD_CONTINUE_NULL);
1200 					slept++;
1201 				}
1202 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1203 				    trace_lck, res, slept, 0, 0);
1204 			} else {
1205 				word.shared_count++;
1206 				ordered_store_rw(lck, word.data);
1207 				lck_interlock_unlock(lck, istate);
1208 				break;
1209 			}
1210 		}
1211 	}
1212 
1213 #if     CONFIG_DTRACE
1214 	if (dtrace_ls_enabled == TRUE) {
1215 		if (slept == 0) {
1216 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1217 		} else {
1218 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1219 			    mach_absolute_time() - wait_interval, 0,
1220 			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1221 		}
1222 	}
1223 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1224 #endif  /* CONFIG_DTRACE */
1225 }
1226 
1227 __attribute__((always_inline))
1228 static void
lck_rw_lock_shared_internal_inline(lck_rw_t * lock,void * caller)1229 lck_rw_lock_shared_internal_inline(
1230 	lck_rw_t        *lock,
1231 	void            *caller)
1232 {
1233 #pragma unused(caller)
1234 
1235 	uint32_t        data, prev;
1236 	thread_t        thread = current_thread();
1237 	__assert_only thread_t owner;
1238 #ifdef DEBUG_RW
1239 	boolean_t       check_canlock = TRUE;
1240 #endif
1241 
1242 	if (lock->lck_rw_can_sleep) {
1243 		lck_rw_inc_thread_count(thread);
1244 	} else if (get_preemption_level() == 0) {
1245 		panic("Taking non-sleepable RW lock with preemption enabled");
1246 	}
1247 
1248 	for (;;) {
1249 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1250 		if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
1251 			atomic_exchange_abort();
1252 			lck_rw_lock_shared_gen(lock);
1253 			goto locked;
1254 		}
1255 #ifdef DEBUG_RW
1256 		if ((data & LCK_RW_SHARED_MASK) == 0) {
1257 			/*
1258 			 * If the lock is uncontended,
1259 			 * we do not need to check if we can lock it
1260 			 */
1261 			check_canlock = FALSE;
1262 		}
1263 #endif
1264 		data += LCK_RW_SHARED_READER;
1265 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1266 			break;
1267 		}
1268 		cpu_pause();
1269 	}
1270 #ifdef DEBUG_RW
1271 	if (check_canlock) {
1272 		/*
1273 		 * Best effort attempt to check that this thread
1274 		 * is not already holding the lock (this checks read mode too).
1275 		 */
1276 		assert_canlock_rwlock(lock, thread, LCK_RW_TYPE_SHARED);
1277 	}
1278 #endif
1279 locked:
1280 	owner = ordered_load_rw_owner(lock);
1281 	assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1282 
1283 #if     CONFIG_DTRACE
1284 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1285 #endif  /* CONFIG_DTRACE */
1286 
1287 #ifdef DEBUG_RW
1288 	add_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED, caller);
1289 #endif /* DEBUG_RW */
1290 }
1291 
1292 __attribute__((noinline))
1293 static void
lck_rw_lock_shared_internal(lck_rw_t * lock,void * caller)1294 lck_rw_lock_shared_internal(
1295 	lck_rw_t        *lock,
1296 	void            *caller)
1297 {
1298 	lck_rw_lock_shared_internal_inline(lock, caller);
1299 }
1300 
1301 /*!
1302  * @function lck_rw_lock_shared
1303  *
1304  * @abstract
1305  * Locks a rw_lock in shared mode.
1306  *
1307  * @discussion
1308  * This function can block.
1309  * Multiple threads can acquire the lock in shared mode at the same time, but only one thread at a time
1310  * can acquire it in exclusive mode.
1311  * If the lock is held in shared mode and there are no writers waiting, a reader will be able to acquire
1312  * the lock without waiting.
1313  * If the lock is held in shared mode and there is at least a writer waiting, a reader will wait
1314  * for all the writers to make progress if the lock was initialized with the default settings. Instead if
1315  * RW_SHARED_PRIORITY was selected at initialization time, a reader will never wait if the lock is held
1316  * in shared mode.
1317  * NOTE: the thread cannot return to userspace while the lock is held. Recursive locking is not supported.
1318  *
1319  * @param lock           rw_lock to lock.
1320  */
1321 void
lck_rw_lock_shared(lck_rw_t * lock)1322 lck_rw_lock_shared(
1323 	lck_rw_t        *lock)
1324 {
1325 	lck_rw_lock_shared_internal_inline(lock, __builtin_return_address(0));
1326 }
1327 
1328 /*
1329  *	Routine:	lck_rw_lock_shared_to_exclusive_failure
1330  *	Function:
1331  *		Fast path code has already dropped our read
1332  *		count and determined that someone else owns 'lck_rw_want_upgrade'
1333  *		if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
1334  *		all we need to do here is determine if a wakeup is needed
1335  */
1336 static boolean_t
lck_rw_lock_shared_to_exclusive_failure(lck_rw_t * lck,uint32_t prior_lock_state)1337 lck_rw_lock_shared_to_exclusive_failure(
1338 	lck_rw_t        *lck,
1339 	uint32_t        prior_lock_state)
1340 {
1341 	thread_t        thread = current_thread();
1342 	uint32_t        rwlock_count;
1343 
1344 	if ((prior_lock_state & LCK_RW_W_WAITING) &&
1345 	    ((prior_lock_state & LCK_RW_SHARED_MASK) == LCK_RW_SHARED_READER)) {
1346 		/*
1347 		 *	Someone else has requested upgrade.
1348 		 *	Since we've released the read lock, wake
1349 		 *	him up if he's blocked waiting
1350 		 */
1351 		thread_wakeup(LCK_RW_WRITER_EVENT(lck));
1352 	}
1353 
1354 	/* Check if dropping the lock means that we need to unpromote */
1355 	if (lck->lck_rw_can_sleep) {
1356 		rwlock_count = thread->rwlock_count--;
1357 	} else {
1358 		rwlock_count = UINT32_MAX;
1359 	}
1360 
1361 	if (rwlock_count == 0) {
1362 		panic("rw lock count underflow for thread %p", thread);
1363 	}
1364 
1365 	if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1366 		/* sched_flags checked without lock, but will be rechecked while clearing */
1367 		lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1368 	}
1369 
1370 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1371 	    VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1372 
1373 #ifdef DEBUG_RW
1374 	remove_held_rwlock(lck, thread, LCK_RW_TYPE_SHARED);
1375 #endif /* DEBUG_RW */
1376 
1377 	return FALSE;
1378 }
1379 
1380 /*
1381  *	Routine:	lck_rw_lock_shared_to_exclusive_success
1382  *	Function:
1383  *		the fast path code has already dropped our read
1384  *		count and successfully acquired 'lck_rw_want_upgrade'
1385  *		we just need to wait for the rest of the readers to drain
1386  *		and then we can return as the exclusive holder of this lock
1387  */
1388 static void
lck_rw_lock_shared_to_exclusive_success(lck_rw_t * lock)1389 lck_rw_lock_shared_to_exclusive_success(
1390 	lck_rw_t        *lock)
1391 {
1392 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
1393 	int                     slept = 0;
1394 	lck_rw_word_t           word;
1395 	wait_result_t           res;
1396 	boolean_t               istate;
1397 	boolean_t               not_shared;
1398 
1399 #if     CONFIG_DTRACE
1400 	uint64_t                wait_interval = 0;
1401 	int                     readers_at_sleep = 0;
1402 	boolean_t               dtrace_ls_initialized = FALSE;
1403 	boolean_t               dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1404 #endif
1405 
1406 	while (!lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, FALSE)) {
1407 		word.data = ordered_load_rw(lock);
1408 #if     CONFIG_DTRACE
1409 		if (dtrace_ls_initialized == FALSE) {
1410 			dtrace_ls_initialized = TRUE;
1411 			dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1412 			dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1413 			dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1414 			if (dtrace_ls_enabled) {
1415 				/*
1416 				 * Either sleeping or spinning is happening,
1417 				 *  start a timing of our delay interval now.
1418 				 */
1419 				readers_at_sleep = word.shared_count;
1420 				wait_interval = mach_absolute_time();
1421 			}
1422 		}
1423 #endif
1424 
1425 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1426 		    trace_lck, word.shared_count, 0, 0, 0);
1427 
1428 		not_shared = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, TRUE);
1429 
1430 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1431 		    trace_lck, lock->lck_rw_shared_count, 0, 0, 0);
1432 
1433 		if (not_shared) {
1434 			break;
1435 		}
1436 
1437 		/*
1438 		 * if we get here, the spin deadline in lck_rw_wait_on_status()
1439 		 * has expired w/o the rw_shared_count having drained to 0
1440 		 * check to see if we're allowed to do a thread_block
1441 		 */
1442 		if (word.can_sleep) {
1443 			istate = lck_interlock_lock(lock);
1444 
1445 			word.data = ordered_load_rw(lock);
1446 			if (word.shared_count != 0) {
1447 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1448 				    trace_lck, word.shared_count, 0, 0, 0);
1449 
1450 				word.w_waiting = 1;
1451 				ordered_store_rw(lock, word.data);
1452 
1453 				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
1454 				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
1455 				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1456 				lck_interlock_unlock(lock, istate);
1457 
1458 				if (res == THREAD_WAITING) {
1459 					res = thread_block(THREAD_CONTINUE_NULL);
1460 					slept++;
1461 				}
1462 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1463 				    trace_lck, res, slept, 0, 0);
1464 			} else {
1465 				lck_interlock_unlock(lock, istate);
1466 				break;
1467 			}
1468 		}
1469 	}
1470 #if     CONFIG_DTRACE
1471 	/*
1472 	 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
1473 	 */
1474 	if (dtrace_ls_enabled == TRUE) {
1475 		if (slept == 0) {
1476 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lock, mach_absolute_time() - wait_interval, 0);
1477 		} else {
1478 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lock,
1479 			    mach_absolute_time() - wait_interval, 1,
1480 			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1481 		}
1482 	}
1483 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 1);
1484 #endif
1485 }
1486 
1487 /*!
1488  * @function lck_rw_lock_shared_to_exclusive
1489  *
1490  * @abstract
1491  * Upgrades a rw_lock held in shared mode to exclusive.
1492  *
1493  * @discussion
1494  * This function can block.
1495  * Only one reader at a time can upgrade to exclusive mode. If the upgrades fails the function will
1496  * return with the lock not held.
1497  * The caller needs to hold the lock in shared mode to upgrade it.
1498  *
1499  * @param lock           rw_lock already held in shared mode to upgrade.
1500  *
1501  * @returns TRUE if the lock was upgraded, FALSE if it was not possible.
1502  *          If the function was not able to upgrade the lock, the lock will be dropped
1503  *          by the function.
1504  */
1505 boolean_t
lck_rw_lock_shared_to_exclusive(lck_rw_t * lock)1506 lck_rw_lock_shared_to_exclusive(
1507 	lck_rw_t        *lock)
1508 {
1509 	uint32_t        data, prev;
1510 
1511 	assertf(lock->lck_rw_priv_excl != 0, "lock %p thread %p", lock, current_thread());
1512 
1513 #if DEBUG_RW
1514 	thread_t thread = current_thread();
1515 	assert_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED);
1516 #endif /* DEBUG_RW */
1517 
1518 	for (;;) {
1519 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1520 		if (data & LCK_RW_INTERLOCK) {
1521 			atomic_exchange_abort();
1522 			lck_rw_interlock_spin(lock);
1523 			continue;
1524 		}
1525 		if (data & LCK_RW_WANT_UPGRADE) {
1526 			data -= LCK_RW_SHARED_READER;
1527 			if ((data & LCK_RW_SHARED_MASK) == 0) {         /* we were the last reader */
1528 				data &= ~(LCK_RW_W_WAITING);            /* so clear the wait indicator */
1529 			}
1530 			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1531 				return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
1532 			}
1533 		} else {
1534 			data |= LCK_RW_WANT_UPGRADE;            /* ask for WANT_UPGRADE */
1535 			data -= LCK_RW_SHARED_READER;           /* and shed our read count */
1536 			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1537 				break;
1538 			}
1539 		}
1540 		cpu_pause();
1541 	}
1542 	/* we now own the WANT_UPGRADE */
1543 	if (data & LCK_RW_SHARED_MASK) {        /* check to see if all of the readers are drained */
1544 		lck_rw_lock_shared_to_exclusive_success(lock);  /* if not, we need to go wait */
1545 	}
1546 	__assert_only thread_t owner = ordered_load_rw_owner(lock);
1547 	assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1548 
1549 	ordered_store_rw_owner(lock, current_thread());
1550 #if     CONFIG_DTRACE
1551 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
1552 #endif  /* CONFIG_DTRACE */
1553 
1554 #if DEBUG_RW
1555 	change_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED, __builtin_return_address(0));
1556 #endif /* DEBUG_RW */
1557 	return TRUE;
1558 }
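/*
 * Illustrative upgrade sketch for a hypothetical caller; `my_lock' is
 * assumed to be an lck_rw_t that was initialized elsewhere (e.g. with
 * lck_rw_init()).  The key point is that a failed upgrade drops the lock
 * entirely, so the caller must re-acquire it and re-validate anything it
 * read under the shared hold.
 *
 *	lck_rw_lock_shared(&my_lock);
 *	// ... read state protected by my_lock ...
 *	if (!lck_rw_lock_shared_to_exclusive(&my_lock)) {
 *		// upgrade failed: the lock is no longer held at all
 *		lck_rw_lock_exclusive(&my_lock);
 *		// ... re-validate the state read under the shared hold ...
 *	}
 *	// ... modify state under the exclusive hold ...
 *	lck_rw_unlock_exclusive(&my_lock);
 */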
1559 
1560 /*
1561  *      Routine:        lck_rw_lock_exclusive_to_shared_gen
1562  *      Function:
1563  *		Fast path has already dropped
1564  *		our exclusive state and bumped lck_rw_shared_count;
1565  *		all we need to do here is determine if anyone
1566  *		needs to be awakened.
1567  */
1568 static void
1569 lck_rw_lock_exclusive_to_shared_gen(
1570 	lck_rw_t        *lck,
1571 	uint32_t        prior_lock_state,
1572 	void            *caller)
1573 {
1574 #pragma unused(caller)
1575 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1576 	lck_rw_word_t   fake_lck;
1577 
1578 	/*
1579 	 * prior_lock_state is a snapshot of the 1st word of the
1580 	 * lock in question... we'll fake up a local copy of it
1581 	 * and carefully not access anything beyond what's defined
1582 	 * in the first word of a lck_rw_t
1583 	 */
1584 	fake_lck.data = prior_lock_state;
1585 
1586 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1587 	    trace_lck, fake_lck.want_excl, fake_lck.want_upgrade, 0, 0);
1588 
1589 	/*
1590 	 * don't wake up anyone waiting to take the lock exclusively
1591 	 * since we hold a read count... when the read count drops to 0,
1592 	 * the writers will be woken.
1593 	 *
1594 	 * wake up any waiting readers if we don't have any writers waiting,
1595 	 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1596 	 */
1597 	if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
1598 		thread_wakeup(LCK_RW_READER_EVENT(lck));
1599 	}
1600 
1601 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1602 	    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1603 
1604 #if CONFIG_DTRACE
1605 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1606 #endif
1607 
1608 #if DEBUG_RW
1609 	thread_t        thread = current_thread();
1610 	change_held_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE, caller);
1611 #endif /* DEBUG_RW */
1612 }
1613 
1614 /*!
1615  * @function lck_rw_lock_exclusive_to_shared
1616  *
1617  * @abstract
1618  * Downgrades a rw_lock held in exclusive mode to shared.
1619  *
1620  * @discussion
1621  * The caller needs to hold the lock in exclusive mode to be able to downgrade it.
1622  *
1623  * @param lock           rw_lock already held in exclusive mode to downgrade.
1624  */
1625 void
1626 lck_rw_lock_exclusive_to_shared(
1627 	lck_rw_t        *lock)
1628 {
1629 	uint32_t        data, prev;
1630 
1631 	assertf(lock->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
1632 	ordered_store_rw_owner(lock, THREAD_NULL);
1633 	for (;;) {
1634 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
1635 		if (data & LCK_RW_INTERLOCK) {
1636 			atomic_exchange_abort();
1637 			lck_rw_interlock_spin(lock);    /* wait for interlock to clear */
1638 			continue;
1639 		}
1640 		data += LCK_RW_SHARED_READER;
1641 		if (data & LCK_RW_WANT_UPGRADE) {
1642 			data &= ~(LCK_RW_WANT_UPGRADE);
1643 		} else {
1644 			data &= ~(LCK_RW_WANT_EXCL);
1645 		}
1646 		if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) {
1647 			data &= ~(LCK_RW_W_WAITING);
1648 		}
1649 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
1650 			break;
1651 		}
1652 		cpu_pause();
1653 	}
1654 	lck_rw_lock_exclusive_to_shared_gen(lock, prev, __builtin_return_address(0));
1655 }
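/*
 * Illustrative downgrade sketch for a hypothetical caller; `my_lock' is an
 * already initialized lck_rw_t.  Unlike the upgrade path, a downgrade
 * cannot fail and the caller keeps the lock (now in shared mode)
 * throughout, so state written under the exclusive hold can still be read
 * safely afterwards.
 *
 *	lck_rw_lock_exclusive(&my_lock);
 *	// ... publish or modify state ...
 *	lck_rw_lock_exclusive_to_shared(&my_lock);
 *	// ... keep reading the state just published ...
 *	lck_rw_unlock_shared(&my_lock);
 */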
1656 
1657 /*
1658  * Very sad hack, but the codegen for lck_rw_lock
1659  * is very unhappy with the combination of __builtin_return_address()
1660  * and a noreturn function. For some reason it adds more frames
1661  * than it should. rdar://76570684
1662  */
1663 void
1664 _lck_rw_lock_type_panic(lck_rw_t *lck, lck_rw_type_t lck_rw_type);
1665 #pragma clang diagnostic push
1666 #pragma clang diagnostic ignored "-Wmissing-noreturn"
1667 __attribute__((noinline, weak))
1668 void
1669 _lck_rw_lock_type_panic(
1670 	lck_rw_t        *lck,
1671 	lck_rw_type_t   lck_rw_type)
1672 {
1673 	panic("lck_rw_lock(): Invalid RW lock type: %x for lock %p", lck_rw_type, lck);
1674 }
1675 #pragma clang diagnostic pop
1676 
1677 /*!
1678  * @function lck_rw_lock
1679  *
1680  * @abstract
1681  * Locks a rw_lock with the specified type.
1682  *
1683  * @discussion
1684  * See lck_rw_lock_shared() or lck_rw_lock_exclusive() for more details.
1685  *
1686  * @param lck           rw_lock to lock.
1687  * @param lck_rw_type   LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
1688  */
1689 void
1690 lck_rw_lock(
1691 	lck_rw_t        *lck,
1692 	lck_rw_type_t   lck_rw_type)
1693 {
1694 	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1695 		return lck_rw_lock_shared_internal(lck, __builtin_return_address(0));
1696 	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1697 		return lck_rw_lock_exclusive_internal(lck, __builtin_return_address(0));
1698 	}
1699 	_lck_rw_lock_type_panic(lck, lck_rw_type);
1700 }
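/*
 * Illustrative sketch of the type-parameterized entry point; `my_lock' and
 * `want_write' are hypothetical.  This is equivalent to calling
 * lck_rw_lock_shared()/lck_rw_lock_exclusive() directly, and is useful when
 * the mode is chosen at run time.
 *
 *	lck_rw_type_t type = want_write ? LCK_RW_TYPE_EXCLUSIVE : LCK_RW_TYPE_SHARED;
 *
 *	lck_rw_lock(&my_lock, type);
 *	// ... access state according to the chosen mode ...
 *	lck_rw_unlock(&my_lock, type);
 */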
1701 
1702 __attribute__((always_inline))
1703 static boolean_t
1704 lck_rw_try_lock_shared_internal_inline(
1705 	lck_rw_t        *lock,
1706 	void            *caller)
1707 {
1708 #pragma unused(caller)
1709 
1710 	uint32_t        data, prev;
1711 	thread_t        thread = current_thread();
1712 #ifdef DEBUG_RW
1713 	boolean_t       check_canlock = TRUE;
1714 #endif
1715 
1716 	for (;;) {
1717 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1718 		if (data & LCK_RW_INTERLOCK) {
1719 			atomic_exchange_abort();
1720 			lck_rw_interlock_spin(lock);
1721 			continue;
1722 		}
1723 		if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1724 			atomic_exchange_abort();
1725 			return FALSE;             /* lock is busy */
1726 		}
1727 #ifdef DEBUG_RW
1728 		if ((data & LCK_RW_SHARED_MASK) == 0) {
1729 			/*
1730 			 * If the lock is uncontended,
1731 			 * we do not need to check if we can lock it
1732 			 */
1733 			check_canlock = FALSE;
1734 		}
1735 #endif
1736 		data += LCK_RW_SHARED_READER;     /* Increment reader refcount */
1737 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1738 			break;
1739 		}
1740 		cpu_pause();
1741 	}
1742 #ifdef DEBUG_RW
1743 	if (check_canlock) {
1744 		/*
1745 		 * Best effort attempt to check that this thread
1746 		 * is not already holding the lock (this checks read mode too).
1747 		 */
1748 		assert_canlock_rwlock(lock, thread, LCK_RW_TYPE_SHARED);
1749 	}
1750 #endif
1751 	__assert_only thread_t owner = ordered_load_rw_owner(lock);
1752 	assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1753 
1754 	if (lock->lck_rw_can_sleep) {
1755 		lck_rw_inc_thread_count(thread);
1756 	} else if (get_preemption_level() == 0) {
1757 		panic("Taking non-sleepable RW lock with preemption enabled");
1758 	}
1759 
1760 #if     CONFIG_DTRACE
1761 	LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1762 #endif  /* CONFIG_DTRACE */
1763 
1764 #ifdef DEBUG_RW
1765 	add_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED, caller);
1766 #endif /* DEBUG_RW */
1767 	return TRUE;
1768 }
1769 
1770 __attribute__((noinline))
1771 static boolean_t
1772 lck_rw_try_lock_shared_internal(
1773 	lck_rw_t        *lock,
1774 	void            *caller)
1775 {
1776 	return lck_rw_try_lock_shared_internal_inline(lock, caller);
1777 }
1778 
1779 /*!
1780  * @function lck_rw_try_lock_shared
1781  *
1782  * @abstract
1783  * Tries to lock a rw_lock in read mode.
1784  *
1785  * @discussion
1786  * This function returns immediately instead of blocking if the lock cannot be acquired.
1787  * See lck_rw_lock_shared for more details.
1788  *
1789  * @param lock           rw_lock to lock.
1790  *
1791  * @returns TRUE if the lock was successfully acquired, FALSE if it could not be acquired without blocking.
1792  */
1793 boolean_t
1794 lck_rw_try_lock_shared(
1795 	lck_rw_t        *lock)
1796 {
1797 	return lck_rw_try_lock_shared_internal_inline(lock, __builtin_return_address(0));
1798 }
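/*
 * Illustrative sketch for a hypothetical caller that must not block (for
 * instance an opportunistic fast path); `my_lock' is assumed to be an
 * initialized lck_rw_t.
 *
 *	if (lck_rw_try_lock_shared(&my_lock)) {
 *		// ... read state protected by my_lock ...
 *		lck_rw_unlock_shared(&my_lock);
 *	} else {
 *		// ... defer to a slow path that is allowed to block ...
 *	}
 */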
1799 
1800 __attribute__((always_inline))
1801 static boolean_t
1802 lck_rw_try_lock_exclusive_internal_inline(
1803 	lck_rw_t        *lock,
1804 	void            *caller)
1805 {
1806 #pragma unused(caller)
1807 	uint32_t        data, prev;
1808 
1809 	for (;;) {
1810 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1811 		if (data & LCK_RW_INTERLOCK) {
1812 			atomic_exchange_abort();
1813 			lck_rw_interlock_spin(lock);
1814 			continue;
1815 		}
1816 		if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1817 			atomic_exchange_abort();
1818 			return FALSE;
1819 		}
1820 		data |= LCK_RW_WANT_EXCL;
1821 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1822 			break;
1823 		}
1824 		cpu_pause();
1825 	}
1826 	thread_t thread = current_thread();
1827 
1828 	if (lock->lck_rw_can_sleep) {
1829 		lck_rw_inc_thread_count(thread);
1830 	} else if (get_preemption_level() == 0) {
1831 		panic("Taking non-sleepable RW lock with preemption enabled");
1832 	}
1833 
1834 	__assert_only thread_t owner = ordered_load_rw_owner(lock);
1835 	assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1836 
1837 	ordered_store_rw_owner(lock, thread);
1838 #if     CONFIG_DTRACE
1839 	LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1840 #endif  /* CONFIG_DTRACE */
1841 
1842 #ifdef DEBUG_RW
1843 	add_held_rwlock(lock, thread, LCK_RW_TYPE_EXCLUSIVE, caller);
1844 #endif /* DEBUG_RW */
1845 	return TRUE;
1846 }
1847 
1848 __attribute__((noinline))
1849 static boolean_t
1850 lck_rw_try_lock_exclusive_internal(
1851 	lck_rw_t        *lock,
1852 	void            *caller)
1853 {
1854 	return lck_rw_try_lock_exclusive_internal_inline(lock, caller);
1855 }
1856 
1857 /*!
1858  * @function lck_rw_try_lock_exclusive
1859  *
1860  * @abstract
1861  * Tries to lock a rw_lock in write mode.
1862  *
1863  * @discussion
1864  * This function returns immediately instead of blocking if the lock is already held.
1865  * See lck_rw_lock_exclusive for more details.
1866  *
1867  * @param lock           rw_lock to lock.
1868  *
1869  * @returns TRUE if the lock is successfully acquired, FALSE in case it was already held.
1870  */
1871 boolean_t
1872 lck_rw_try_lock_exclusive(
1873 	lck_rw_t        *lock)
1874 {
1875 	return lck_rw_try_lock_exclusive_internal_inline(lock, __builtin_return_address(0));
1876 }
1877 
1878 /*
1879  * Very sad hack, but the codegen for lck_rw_try_lock
1880  * is very unhappy with the combination of __builtin_return_address()
1881  * and a noreturn function. For some reason it adds more frames
1882  * than it should. rdar://76570684
1883  */
1884 boolean_t
1885 _lck_rw_try_lock_type_panic(lck_rw_t *lck, lck_rw_type_t lck_rw_type);
1886 #pragma clang diagnostic push
1887 #pragma clang diagnostic ignored "-Wmissing-noreturn"
1888 __attribute__((noinline, weak))
1889 boolean_t
1890 _lck_rw_try_lock_type_panic(
1891 	lck_rw_t        *lck,
1892 	lck_rw_type_t   lck_rw_type)
1893 {
1894 	panic("lck_rw_try_lock(): Invalid RW lock type: %x for lock %p", lck_rw_type, lck);
1895 }
1896 #pragma clang diagnostic pop
1897 
1898 /*!
1899  * @function lck_rw_try_lock
1900  *
1901  * @abstract
1902  * Tries to lock a rw_lock with the specified type.
1903  *
1904  * @discussion
1905  * This function will return and not wait/block in case the lock is already held.
1906  * See lck_rw_try_lock_shared() or lck_rw_try_lock_exclusive() for more details.
1907  *
1908  * @param lck           rw_lock to lock.
1909  * @param lck_rw_type   LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
1910  *
1911  * @returns TRUE if the lock is successfully acquired, FALSE in case it was already held.
1912  */
1913 boolean_t
1914 lck_rw_try_lock(
1915 	lck_rw_t        *lck,
1916 	lck_rw_type_t   lck_rw_type)
1917 {
1918 	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1919 		return lck_rw_try_lock_shared_internal(lck, __builtin_return_address(0));
1920 	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1921 		return lck_rw_try_lock_exclusive_internal(lck, __builtin_return_address(0));
1922 	}
1923 	return _lck_rw_try_lock_type_panic(lck, lck_rw_type);
1924 }
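/*
 * Illustrative sketch of the type-parameterized try path; `my_lock' is
 * hypothetical.  The boolean return only says whether the lock was taken,
 * so the caller must remember which mode it asked for when it unlocks.
 *
 *	if (lck_rw_try_lock(&my_lock, LCK_RW_TYPE_EXCLUSIVE)) {
 *		// ... modify state protected by my_lock ...
 *		lck_rw_unlock(&my_lock, LCK_RW_TYPE_EXCLUSIVE);
 *	}
 */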
1925 
1926 /*
1927  *      Routine:        lck_rw_done_gen
1928  *
1929  *	prior_lock_state is the value in the 1st
1930  *      word of the lock at the time of a successful
1931  *	atomic compare and exchange with the new value...
1932  *      it represents the state of the lock before we
1933  *	decremented the rw_shared_count or cleared either
1934  *      rw_want_upgrade or rw_want_write and
1935  *	the lck_x_waiting bits...  since the wrapper
1936  *      routine has already changed the state atomically,
1937  *	we just need to decide if we should
1938  *	wake up anyone and what value to return... we do
1939  *	this by examining the state of the lock before
1940  *	we changed it
1941  */
1942 static lck_rw_type_t
1943 lck_rw_done_gen(
1944 	lck_rw_t        *lck,
1945 	uint32_t        prior_lock_state)
1946 {
1947 	lck_rw_word_t   fake_lck;
1948 	lck_rw_type_t   lock_type;
1949 	thread_t        thread;
1950 	uint32_t        rwlock_count;
1951 
1952 	/*
1953 	 * prior_lock_state is a snapshot of the 1st word of the
1954 	 * lock in question... we'll fake up a local copy of it
1955 	 * and carefully not access anything beyond what's defined
1956 	 * in the first word of a lck_rw_t
1957 	 */
1958 	fake_lck.data = prior_lock_state;
1959 
1960 	if (fake_lck.shared_count <= 1) {
1961 		if (fake_lck.w_waiting) {
1962 			thread_wakeup(LCK_RW_WRITER_EVENT(lck));
1963 		}
1964 
1965 		if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
1966 			thread_wakeup(LCK_RW_READER_EVENT(lck));
1967 		}
1968 	}
1969 	if (fake_lck.shared_count) {
1970 		lock_type = LCK_RW_TYPE_SHARED;
1971 	} else {
1972 		lock_type = LCK_RW_TYPE_EXCLUSIVE;
1973 	}
1974 
1975 	/* Check if dropping the lock means that we need to unpromote */
1976 	thread = current_thread();
1977 	if (fake_lck.can_sleep) {
1978 		rwlock_count = thread->rwlock_count--;
1979 	} else {
1980 		rwlock_count = UINT32_MAX;
1981 	}
1982 
1983 	if (rwlock_count == 0) {
1984 		panic("rw lock count underflow for thread %p", thread);
1985 	}
1986 
1987 	if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1988 		/* sched_flags checked without lock, but will be rechecked while clearing */
1989 		lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1990 	}
1991 #if CONFIG_DTRACE
1992 	LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1993 #endif
1994 
1995 #ifdef DEBUG_RW
1996 	remove_held_rwlock(lck, thread, lock_type);
1997 #endif /* DEBUG_RW */
1998 	return lock_type;
1999 }
2000 
2001 /*!
2002  * @function lck_rw_done
2003  *
2004  * @abstract
2005  * Force unlocks a rw_lock without consistency checks.
2006  *
2007  * @discussion
2008  * Do not use unless you are sure you can avoid the consistency checks.
2009  *
2010  * @param lock           rw_lock to unlock.
2011  */
2012 lck_rw_type_t
2013 lck_rw_done(
2014 	lck_rw_t        *lock)
2015 {
2016 	uint32_t        data, prev;
2017 	boolean_t       once = FALSE;
2018 
2019 #ifdef DEBUG_RW
2020 	/*
2021 	 * Best effort attempt to check that this thread
2022 	 * is holding the lock.
2023 	 */
2024 	thread_t thread = current_thread();
2025 	assert_held_rwlock(lock, thread, 0);
2026 #endif /* DEBUG_RW */
2027 	for (;;) {
2028 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
2029 		if (data & LCK_RW_INTERLOCK) {          /* wait for interlock to clear */
2030 			atomic_exchange_abort();
2031 			lck_rw_interlock_spin(lock);
2032 			continue;
2033 		}
2034 		if (data & LCK_RW_SHARED_MASK) {        /* lock is held shared */
2035 			assertf(lock->lck_rw_owner == THREAD_NULL, "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
2036 			data -= LCK_RW_SHARED_READER;
2037 			if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */
2038 				goto check_waiters;
2039 			}
2040 		} else {                                        /* if reader count == 0, must be exclusive lock */
2041 			if (data & LCK_RW_WANT_UPGRADE) {
2042 				data &= ~(LCK_RW_WANT_UPGRADE);
2043 			} else {
2044 				if (data & LCK_RW_WANT_EXCL) {
2045 					data &= ~(LCK_RW_WANT_EXCL);
2046 				} else {                                /* lock is not 'owned', panic */
2047 					panic("Releasing non-exclusive RW lock without a reader refcount!");
2048 				}
2049 			}
2050 			if (!once) {
2051 				// Only check for holder and clear it once
2052 				assertf(lock->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
2053 				ordered_store_rw_owner(lock, THREAD_NULL);
2054 				once = TRUE;
2055 			}
2056 check_waiters:
2057 			/*
2058 			 * test the original values to match what
2059 			 * lck_rw_done_gen is going to do to determine
2060 			 * which wakeups need to happen...
2061 			 *
2062 			 * if !(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting)
2063 			 */
2064 			if (prev & LCK_RW_W_WAITING) {
2065 				data &= ~(LCK_RW_W_WAITING);
2066 				if ((prev & LCK_RW_PRIV_EXCL) == 0) {
2067 					data &= ~(LCK_RW_R_WAITING);
2068 				}
2069 			} else {
2070 				data &= ~(LCK_RW_R_WAITING);
2071 			}
2072 		}
2073 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
2074 			break;
2075 		}
2076 		cpu_pause();
2077 	}
2078 	return lck_rw_done_gen(lock, prev);
2079 }
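/*
 * Illustrative sketch of lck_rw_done() for a hypothetical caller that no
 * longer knows which mode it holds `my_lock' in (for example after a
 * possible upgrade); the return value reports the mode that was actually
 * released.
 *
 *	lck_rw_type_t held = lck_rw_done(&my_lock);
 *
 *	if (held == LCK_RW_TYPE_EXCLUSIVE) {
 *		// ... the write side was just released ...
 *	}
 */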
2080 
2081 /*!
2082  * @function lck_rw_unlock_shared
2083  *
2084  * @abstract
2085  * Unlocks a rw_lock previously locked in shared mode.
2086  *
2087  * @discussion
2088  * The same thread that locked the lock needs to unlock it.
2089  *
2090  * @param lck           rw_lock held in shared mode to unlock.
2091  */
2092 void
2093 lck_rw_unlock_shared(
2094 	lck_rw_t        *lck)
2095 {
2096 	lck_rw_type_t   ret;
2097 
2098 	assertf(lck->lck_rw_owner == THREAD_NULL, "state=0x%x, owner=%p", lck->lck_rw_data, lck->lck_rw_owner);
2099 	assertf(lck->lck_rw_shared_count > 0, "shared_count=0x%x", lck->lck_rw_shared_count);
2100 	ret = lck_rw_done(lck);
2101 
2102 	if (ret != LCK_RW_TYPE_SHARED) {
2103 		panic("lck_rw_unlock_shared(): lock %p held in mode: %d", lck, ret);
2104 	}
2105 }
2106 
2107 /*!
2108  * @function lck_rw_unlock_exclusive
2109  *
2110  * @abstract
2111  * Unlocks a rw_lock previously locked in exclusive mode.
2112  *
2113  * @discussion
2114  * The same thread that locked the lock needs to unlock it.
2115  *
2116  * @param lck           rw_lock held in exclusive mode to unlock.
2117  */
2118 void
2119 lck_rw_unlock_exclusive(
2120 	lck_rw_t        *lck)
2121 {
2122 	lck_rw_type_t   ret;
2123 
2124 	assertf(lck->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lck->lck_rw_data, lck->lck_rw_owner);
2125 	ret = lck_rw_done(lck);
2126 
2127 	if (ret != LCK_RW_TYPE_EXCLUSIVE) {
2128 		panic("lck_rw_unlock_exclusive(): lock %p held in mode: %d", lck, ret);
2129 	}
2130 }
2131 
2132 /*!
2133  * @function lck_rw_unlock
2134  *
2135  * @abstract
2136  * Unlocks a rw_lock previously locked with lck_rw_type.
2137  *
2138  * @discussion
2139  * The lock must be unlocked by the same thread that locked it.
2140  * The type of the lock/unlock has to match, unless an upgrade/downgrade was performed while
2141  * holding the lock.
2142  *
2143  * @param lck           rw_lock to unlock.
2144  * @param lck_rw_type   LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
2145  */
2146 void
2147 lck_rw_unlock(
2148 	lck_rw_t         *lck,
2149 	lck_rw_type_t    lck_rw_type)
2150 {
2151 	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
2152 		lck_rw_unlock_shared(lck);
2153 	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
2154 		lck_rw_unlock_exclusive(lck);
2155 	} else {
2156 		panic("lck_rw_unlock(): Invalid RW lock type: %d", lck_rw_type);
2157 	}
2158 }
2159 
2160 /*!
2161  * @function lck_rw_assert
2162  *
2163  * @abstract
2164  * Asserts the rw_lock is held.
2165  *
2166  * @discussion
2167  * read-write locks do not have a concept of ownership when held in shared mode,
2168  * so this function merely asserts that someone is holding the lock, not necessarily the caller.
2169  * However if rw_lock_debug is on, a best effort mechanism to track the owners is in place, and
2170  * this function can be more accurate.
2171  * Type can be LCK_RW_ASSERT_SHARED, LCK_RW_ASSERT_EXCLUSIVE, LCK_RW_ASSERT_HELD
2172  * LCK_RW_ASSERT_NOTHELD.
2173  *
2174  * @param lck   rw_lock to check.
2175  * @param type  assert type
2176  */
2177 void
2178 lck_rw_assert(
2179 	lck_rw_t        *lck,
2180 	unsigned int    type)
2181 {
2182 #if DEBUG_RW
2183 	thread_t thread = current_thread();
2184 #endif /* DEBUG_RW */
2185 
2186 	switch (type) {
2187 	case LCK_RW_ASSERT_SHARED:
2188 		if ((lck->lck_rw_shared_count != 0) &&
2189 		    (lck->lck_rw_owner == THREAD_NULL)) {
2190 #if DEBUG_RW
2191 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_SHARED);
2192 #endif /* DEBUG_RW */
2193 			return;
2194 		}
2195 		break;
2196 	case LCK_RW_ASSERT_EXCLUSIVE:
2197 		if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
2198 		    (lck->lck_rw_shared_count == 0) &&
2199 		    (lck->lck_rw_owner == current_thread())) {
2200 #if DEBUG_RW
2201 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE);
2202 #endif /* DEBUG_RW */
2203 			return;
2204 		}
2205 		break;
2206 	case LCK_RW_ASSERT_HELD:
2207 		if (lck->lck_rw_shared_count != 0) {
2208 #if DEBUG_RW
2209 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_SHARED);
2210 #endif /* DEBUG_RW */
2211 			return;         // Held shared
2212 		}
2213 		if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
2214 		    (lck->lck_rw_owner == current_thread())) {
2215 #if DEBUG_RW
2216 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE);
2217 #endif /* DEBUG_RW */
2218 			return;         // Held exclusive
2219 		}
2220 		break;
2221 	case LCK_RW_ASSERT_NOTHELD:
2222 		if ((lck->lck_rw_shared_count == 0) &&
2223 		    !(lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
2224 		    (lck->lck_rw_owner == THREAD_NULL)) {
2225 #ifdef DEBUG_RW
2226 			assert_canlock_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE);
2227 #endif /* DEBUG_RW */
2228 			return;
2229 		}
2230 		break;
2231 	default:
2232 		break;
2233 	}
2234 	panic("rw lock (%p)%s held (mode=%u)", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type);
2235 }
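/*
 * Illustrative sketch: a hypothetical helper that requires its caller to
 * hold `my_lock' exclusively can document and enforce that requirement at
 * its entry point.
 *
 *	static void
 *	update_protected_state(void)
 *	{
 *		lck_rw_assert(&my_lock, LCK_RW_ASSERT_EXCLUSIVE);
 *		// ... modify state protected by my_lock ...
 *	}
 */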
2236 
2237 /*!
2238  * @function kdp_lck_rw_lock_is_acquired_exclusive
2239  *
2240  * @abstract
2241  * Checks if a rw_lock is held exclusively.
2242  *
2243  * @discussion
2244  * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2245  *
2246  * @param lck   lock to check
2247  *
2248  * @returns TRUE if the lock is held exclusively
2249  */
2250 boolean_t
2251 kdp_lck_rw_lock_is_acquired_exclusive(
2252 	lck_rw_t        *lck)
2253 {
2254 	if (not_in_kdp) {
2255 		panic("rw lock exclusive check done outside of kernel debugger");
2256 	}
2257 	return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_excl) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
2258 }
2259 
2260 void
2261 kdp_rwlck_find_owner(
2262 	__unused struct waitq   *waitq,
2263 	event64_t               event,
2264 	thread_waitinfo_t       *waitinfo)
2265 {
2266 	lck_rw_t        *rwlck = NULL;
2267 	switch (waitinfo->wait_type) {
2268 	case kThreadWaitKernelRWLockRead:
2269 		rwlck = READ_EVENT_TO_RWLOCK(event);
2270 		break;
2271 	case kThreadWaitKernelRWLockWrite:
2272 	case kThreadWaitKernelRWLockUpgrade:
2273 		rwlck = WRITE_EVENT_TO_RWLOCK(event);
2274 		break;
2275 	default:
2276 		panic("%s was called with an invalid blocking type", __FUNCTION__);
2277 		break;
2278 	}
2279 	if (rwlck->lck_rw_owner) {
2280 		thread_require(rwlck->lck_rw_owner);
2281 	}
2282 	waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
2283 	waitinfo->owner = thread_tid(rwlck->lck_rw_owner);
2284 }
2285 
2286 /*!
2287  * @function lck_rw_lock_yield_shared
2288  *
2289  * @abstract
2290  * Yields a rw_lock held in shared mode.
2291  *
2292  * @discussion
2293  * This function can block.
2294  * Yields the lock in case there are writers waiting.
2295  * The yield will unlock, block, and re-lock the lock in shared mode.
2296  *
2297  * @param lck           rw_lock already held in shared mode to yield.
2298  * @param force_yield   if set to true it will always yield irrespective of the lock status
2299  *
2300  * @returns TRUE if the lock was yielded, FALSE otherwise
2301  */
2302 boolean_t
2303 lck_rw_lock_yield_shared(
2304 	lck_rw_t        *lck,
2305 	boolean_t       force_yield)
2306 {
2307 	lck_rw_word_t   word;
2308 
2309 	lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
2310 
2311 	word.data = ordered_load_rw(lck);
2312 	if (word.want_excl || word.want_upgrade || force_yield) {
2313 		lck_rw_unlock_shared(lck);
2314 		mutex_pause(2);
2315 		lck_rw_lock_shared(lck);
2316 		return TRUE;
2317 	}
2318 
2319 	return FALSE;
2320 }
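/*
 * Illustrative sketch of yielding inside a long shared-mode scan; `my_lock'
 * and the list-walking helpers are hypothetical.  If the yield returns TRUE
 * the lock was dropped and re-taken, so any state examined before the yield
 * must be re-validated.
 *
 *	lck_rw_lock_shared(&my_lock);
 *	for (item = first_item(); item != NULL; item = next_item(item)) {
 *		if (lck_rw_lock_yield_shared(&my_lock, FALSE)) {
 *			// lock was dropped and re-acquired: restart or re-validate
 *		}
 *		// ... examine `item' ...
 *	}
 *	lck_rw_unlock_shared(&my_lock);
 */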
2321 
2322 /*!
2323  * @function lck_rw_sleep
2324  *
2325  * @abstract
2326  * Assert_wait on an event while holding the rw_lock.
2327  *
2328  * @discussion
2329  * the flags can decide how to re-acquire the lock upon wake up
2330  * (LCK_SLEEP_SHARED, or LCK_SLEEP_EXCLUSIVE, or LCK_SLEEP_UNLOCK)
2331  * and if the priority needs to be kept boosted until the lock is
2332  * re-acquired (LCK_SLEEP_PROMOTED_PRI).
2333  *
2334  * @param lck                   rw_lock to use to synch the assert_wait.
2335  * @param lck_sleep_action      flags.
2336  * @param event                 event to assert_wait on.
2337  * @param interruptible         wait type.
2338  */
2339 wait_result_t
2340 lck_rw_sleep(
2341 	lck_rw_t                *lck,
2342 	lck_sleep_action_t      lck_sleep_action,
2343 	event_t                 event,
2344 	wait_interrupt_t        interruptible)
2345 {
2346 	wait_result_t           res;
2347 	lck_rw_type_t           lck_rw_type;
2348 	thread_pri_floor_t      token;
2349 
2350 	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0) {
2351 		panic("Invalid lock sleep action %x", lck_sleep_action);
2352 	}
2353 
2354 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2355 		/*
2356 		 * Although we are dropping the RW lock, the intent in most cases
2357 		 * is that this thread remains as an observer, since it may hold
2358 		 * some secondary resource, but must yield to avoid deadlock. In
2359 		 * this situation, make sure that the thread is boosted to the
2360 		 * ceiling while blocked, so that it can re-acquire the
2361 		 * RW lock at that priority.
2362 		 */
2363 		token = thread_priority_floor_start();
2364 	}
2365 
2366 	res = assert_wait(event, interruptible);
2367 	if (res == THREAD_WAITING) {
2368 		lck_rw_type = lck_rw_done(lck);
2369 		res = thread_block(THREAD_CONTINUE_NULL);
2370 		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
2371 			if (!(lck_sleep_action & (LCK_SLEEP_SHARED | LCK_SLEEP_EXCLUSIVE))) {
2372 				lck_rw_lock(lck, lck_rw_type);
2373 			} else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE) {
2374 				lck_rw_lock_exclusive(lck);
2375 			} else {
2376 				lck_rw_lock_shared(lck);
2377 			}
2378 		}
2379 	} else if (lck_sleep_action & LCK_SLEEP_UNLOCK) {
2380 		(void)lck_rw_done(lck);
2381 	}
2382 
2383 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2384 		thread_priority_floor_end(&token);
2385 	}
2386 
2387 	return res;
2388 }
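/*
 * Illustrative sketch of waiting for a condition protected by `my_lock';
 * `my_lock', `condition' and the event used are hypothetical.  With
 * LCK_SLEEP_DEFAULT the lock is dropped while blocked and re-acquired in
 * the same mode before lck_rw_sleep() returns, so the condition must be
 * re-checked in a loop.
 *
 *	lck_rw_lock_exclusive(&my_lock);
 *	while (!condition) {
 *		(void) lck_rw_sleep(&my_lock, LCK_SLEEP_DEFAULT,
 *		    (event_t)&condition, THREAD_UNINT);
 *	}
 *	// condition now holds and the lock is held exclusive again
 *	lck_rw_unlock_exclusive(&my_lock);
 */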
2389 
2390 /*!
2391  * @function lck_rw_sleep_deadline
2392  *
2393  * @abstract
2394  * Assert_wait_deadline on an event while holding the rw_lock.
2395  *
2396  * @discussion
2397  * the flags can decide how to re-acquire the lock upon wake up
2398  * (LCK_SLEEP_SHARED, or LCK_SLEEP_EXCLUSIVE, or LCK_SLEEP_UNLOCK)
2399  * and if the priority needs to be kept boosted until the lock is
2400  * re-acquired (LCK_SLEEP_PROMOTED_PRI).
2401  *
2402  * @param lck                   rw_lock to use to synch the assert_wait.
2403  * @param lck_sleep_action      flags.
2404  * @param event                 event to assert_wait on.
2405  * @param interruptible         wait type.
2406  * @param deadline              absolute time after which the thread will be woken up.
2407  */
2408 wait_result_t
2409 lck_rw_sleep_deadline(
2410 	lck_rw_t                *lck,
2411 	lck_sleep_action_t      lck_sleep_action,
2412 	event_t                 event,
2413 	wait_interrupt_t        interruptible,
2414 	uint64_t                deadline)
2415 {
2416 	wait_result_t           res;
2417 	lck_rw_type_t           lck_rw_type;
2418 	thread_pri_floor_t      token;
2419 
2420 	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0) {
2421 		panic("Invalid lock sleep action %x", lck_sleep_action);
2422 	}
2423 
2424 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2425 		token = thread_priority_floor_start();
2426 	}
2427 
2428 	res = assert_wait_deadline(event, interruptible, deadline);
2429 	if (res == THREAD_WAITING) {
2430 		lck_rw_type = lck_rw_done(lck);
2431 		res = thread_block(THREAD_CONTINUE_NULL);
2432 		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
2433 			if (!(lck_sleep_action & (LCK_SLEEP_SHARED | LCK_SLEEP_EXCLUSIVE))) {
2434 				lck_rw_lock(lck, lck_rw_type);
2435 			} else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE) {
2436 				lck_rw_lock_exclusive(lck);
2437 			} else {
2438 				lck_rw_lock_shared(lck);
2439 			}
2440 		}
2441 	} else if (lck_sleep_action & LCK_SLEEP_UNLOCK) {
2442 		(void)lck_rw_done(lck);
2443 	}
2444 
2445 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2446 		thread_priority_floor_end(&token);
2447 	}
2448 
2449 	return res;
2450 }
2451 
2452 /*
2453  * Reader-writer lock promotion
2454  *
2455  * We support a limited form of reader-writer
2456  * lock promotion whose effects are:
2457  *
2458  *   * Qualifying threads have decay disabled
2459  *   * Scheduler priority is reset to a floor of
2460  *     their statically assigned priority
2461  *     or MINPRI_RWLOCK
2462  *
2463  * The rationale is that lck_rw_ts do not have
2464  * a single owner, so we cannot apply a directed
2465  * priority boost from all waiting threads
2466  * to all holding threads without maintaining
2467  * lists of all shared owners and all waiting
2468  * threads for every lock.
2469  *
2470  * Instead (and to preserve the uncontended fast-
2471  * path), acquiring (or attempting to acquire)
2472  * a RW lock in shared or exclusive mode increments
2473  * a per-thread counter. Only if that thread stops
2474  * making forward progress (for instance blocking
2475  * on a mutex, or being preempted) do we consult
2476  * the counter and apply the priority floor.
2477  * When the thread becomes runnable again (or in
2478  * the case of preemption it never stopped being
2479  * runnable), it has the priority boost and should
2480  * be in a good position to run on the CPU and
2481  * release all RW locks (at which point the priority
2482  * boost is cleared).
2483  *
2484  * Care must be taken to ensure that priority
2485  * boosts are not retained indefinitely, since unlike
2486  * mutex priority boosts (where the boost is tied
2487  * to the mutex lifecycle), the boost is tied
2488  * to the thread and independent of any particular
2489  * lck_rw_t. Assertions are in place on return
2490  * to userspace so that the boost is not held
2491  * indefinitely.
2492  *
2493  * The routines that increment/decrement the
2494  * per-thread counter should err on the side of
2495  * incrementing any time a preemption is possible
2496  * and the lock would be visible to the rest of the
2497  * system as held (so it should be incremented before
2498  * interlocks are dropped/preemption is enabled, or
2499  * before a CAS is executed to acquire the lock).
2500  *
2501  */
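/*
 * A rough sketch of the lifecycle described above, in hypothetical
 * pseudo-code rather than the actual call sites:
 *
 *	// acquire path (shared or exclusive), sleepable locks only
 *	thread->rwlock_count++;
 *	// ... take the lock ...
 *
 *	// thread goes off core while rwlock_count > 0: the scheduler calls
 *	lck_rw_set_promotion_locked(thread);	// apply the priority floor
 *
 *	// release path (see lck_rw_done_gen())
 *	if (--thread->rwlock_count == 0 &&
 *	    (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
 *		lck_rw_clear_promotion(thread, trace_obj);	// drop the floor
 *	}
 */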
2502 
2503 /*!
2504  * @function lck_rw_clear_promotion
2505  *
2506  * @abstract
2507  * Undo priority promotions when the last rw_lock
2508  * is released by a thread (if a promotion was active).
2509  *
2510  * @param thread        thread to demote.
2511  * @param trace_obj     object reason for the demotion.
2512  */
2513 void
2514 lck_rw_clear_promotion(
2515 	thread_t thread,
2516 	uintptr_t trace_obj)
2517 {
2518 	assert(thread->rwlock_count == 0);
2519 
2520 	/* Cancel any promotions if the thread had actually blocked while holding a RW lock */
2521 	spl_t s = splsched();
2522 	thread_lock(thread);
2523 
2524 	if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
2525 		sched_thread_unpromote_reason(thread, TH_SFLAG_RW_PROMOTED, trace_obj);
2526 	}
2527 
2528 	thread_unlock(thread);
2529 	splx(s);
2530 }
2531 
2532 /*!
2533  * @function lck_rw_set_promotion_locked
2534  *
2535  * @abstract
2536  * Callout from context switch if the thread goes
2537  * off core with a positive rwlock_count.
2538  *
2539  * @discussion
2540  * Called at splsched with the thread locked.
2541  *
2542  * @param thread        thread to promote.
2543  */
2544 void
2545 lck_rw_set_promotion_locked(thread_t thread)
2546 {
2547 	if (LcksOpts & disLkRWPrio) {
2548 		return;
2549 	}
2550 
2551 	assert(thread->rwlock_count > 0);
2552 
2553 	if (!(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
2554 		sched_thread_promote_reason(thread, TH_SFLAG_RW_PROMOTED, 0);
2555 	}
2556 }
2557 
2558 #if __x86_64__
2559 void lck_rw_clear_promotions_x86(thread_t thread);
2560 /*
2561  * On return to userspace, this routine is called from assembly
2562  * if the rwlock_count is somehow imbalanced
2563  */
2564 #if MACH_LDEBUG
2565 __dead2
2566 #endif /* MACH_LDEBUG */
2567 void
2568 lck_rw_clear_promotions_x86(thread_t thread)
2569 {
2570 #if MACH_LDEBUG
2571 	/* It's fatal to leave a RW lock locked and return to userspace */
2572 	panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
2573 #else
2574 	/* Paper over the issue */
2575 	thread->rwlock_count = 0;
2576 	lck_rw_clear_promotion(thread, 0);
2577 #endif /* MACH_LDEBUG */
2578 }
2579 #endif /* __x86_64__ */
2580