/*
 * Copyright (c) 2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/exc_guard.h>
#include <kern/locks.h>
#include <kern/task.h>
#include <kern/zalloc.h>
#include <kern/misc_protos.h>
#include <kern/sched_prim.h>
#include <kern/startup.h>
#include <kern/thread_group.h>
#include <libkern/OSAtomic.h>
#include <mach/kern_return.h>
#include <mach/mach_types.h>
#include <mach/vm_reclaim_private.h>
#include <os/atomic_private.h>
#include <os/base_private.h>
#include <os/log.h>
#include <os/refcnt.h>
#include <os/refcnt_internal.h>
#include <pexpert/pexpert.h>
#include <sys/errno.h>
#include <sys/kdebug.h>
#include <sys/queue.h>
#include <sys/reason.h>
#include <vm/vm_fault_xnu.h>
#include <vm/vm_map.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_pageout_internal.h>
#include <vm/vm_reclaim_internal.h>
#include <vm/vm_sanitize_internal.h>
#include <vm/vm_kern_xnu.h>

#pragma mark Tunables

#if DEVELOPMENT || DEBUG
TUNABLE(uint32_t, kReclaimChunkSize, "vm_reclaim_chunk_size", 16);
#else /* RELEASE */
const uint32_t kReclaimChunkSize = 16;
#endif /* DEVELOPMENT || DEBUG */
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_sampling_period_ns, "vm_reclaim_sampling_period_ns",
#if XNU_TARGET_OS_OSX
    10ULL * NSEC_PER_SEC);
#else
    1ULL * NSEC_PER_SEC);
#endif
TUNABLE_DEV_WRITEABLE(bool, vm_reclaim_enabled, "vm_reclaim_enabled", true);
TUNABLE_DEV_WRITEABLE(uint32_t, vm_reclaim_autotrim_pct_normal, "vm_reclaim_autotrim_pct_normal", 10);
TUNABLE_DEV_WRITEABLE(uint32_t, vm_reclaim_autotrim_pct_pressure, "vm_reclaim_autotrim_pct_pressure", 5);
TUNABLE_DEV_WRITEABLE(uint32_t, vm_reclaim_autotrim_pct_critical, "vm_reclaim_autotrim_pct_critical", 1);
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_wma_weight_base, "vm_reclaim_wma_weight_base", 3);
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_wma_weight_cur, "vm_reclaim_wma_weight_cur", 1);
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_wma_denom, "vm_reclaim_wma_denom", 4);
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_abandonment_threshold, "vm_reclaim_abandonment_threshold", 512);

TUNABLE(bool, panic_on_kill, "vm_reclaim_panic_on_kill", false);
#if DEVELOPMENT || DEBUG
TUNABLE_WRITEABLE(bool, vm_reclaim_debug, "vm_reclaim_debug", false);
#endif

#pragma mark Declarations
typedef struct proc *proc_t;
extern const char *proc_best_name(struct proc *);
extern void *proc_find(int pid);
extern task_t proc_task(proc_t);
extern kern_return_t kern_return_for_errno(int);
extern int mach_to_bsd_errno(kern_return_t kr);
extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code, mach_exception_data_type_t subcode);
struct proc *proc_ref(struct proc *p, int locked);
int proc_rele(proc_t p);

#define _vmdr_log_type(type, fmt, ...) os_log_with_type(vm_reclaim_log_handle, type, "vm_reclaim: " fmt, ##__VA_ARGS__)
#define vmdr_log(fmt, ...) _vmdr_log_type(OS_LOG_TYPE_DEFAULT, fmt, ##__VA_ARGS__)
#define vmdr_log_info(fmt, ...) _vmdr_log_type(OS_LOG_TYPE_INFO, fmt, ##__VA_ARGS__)
#define vmdr_log_error(fmt, ...) _vmdr_log_type(OS_LOG_TYPE_ERROR, fmt, ##__VA_ARGS__)
#if DEVELOPMENT || DEBUG
#define vmdr_log_debug(fmt, ...) \
MACRO_BEGIN \
	if (os_unlikely(vm_reclaim_debug)) { \
		_vmdr_log_type(OS_LOG_TYPE_DEBUG, fmt, ##__VA_ARGS__); \
	} \
MACRO_END
#else /* !(DEVELOPMENT || DEBUG) */
#define vmdr_log_debug(...)
#endif /* DEVELOPMENT || DEBUG */

static kern_return_t reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head);
static kern_return_t reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail);
static kern_return_t reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy);
static kern_return_t reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result);
static mach_error_t vmdr_sample_working_set(
	vm_deferred_reclamation_metadata_t metadata,
	mach_vm_size_t *trim_threshold_out,
	vm_deferred_reclamation_options_t options);
static void vmdr_metadata_release(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_list_append_locked(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_list_remove_locked(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_metadata_own(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_metadata_disown(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_garbage_collect(vm_deferred_reclamation_gc_action_t action,
    mach_vm_size_t *total_bytes_reclaimed_out,
    vm_deferred_reclamation_options_t options);
static kern_return_t reclaim_chunk(vm_deferred_reclamation_metadata_t metadata,
    uint64_t bytes_to_reclaim, uint64_t *bytes_reclaimed_out,
    mach_vm_reclaim_count_t chunk_size, mach_vm_reclaim_count_t *num_reclaimed_out);

struct vm_deferred_reclamation_metadata_s {
	/*
	 * Global list containing every reclamation buffer. Protected by the
	 * reclamation_buffers_lock.
	 */
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_list;
	/* Protects all struct fields (except where denoted otherwise) */
	decl_lck_mtx_data(, vdrm_lock);
	/* Gate to be acquired when performing copyio on the user ring */
	decl_lck_mtx_gate_data(, vdrm_gate);
	/*
	 * The task owns this structure but we maintain a backpointer here
	 * so that we can send an exception if we hit an error.
	 * Since this is a backpointer we don't hold a reference (it's a weak pointer).
	 */
	task_t vdrm_task;
	pid_t vdrm_pid;
	vm_map_t vdrm_map;
	/*
	 * The owning task holds a ref on this object. When the task dies, it
	 * will set vdrm_task := NULL and drop its ref. Threads operating on the buffer
	 * should hold a +1 on the metadata structure to ensure its validity.
	 */
	os_refcnt_t vdrm_refcnt;
	/* The virtual address of the ringbuffer in the user map (immutable) */
	user_addr_t vdrm_ring_addr;
	/* The size of the VM allocation containing the ringbuffer (immutable) */
	mach_vm_size_t vdrm_ring_size;
	/* The length of the ringbuffer. This may be changed on buffer re-size */
	mach_vm_reclaim_count_t vdrm_buffer_len;
	/* Which GC epoch this buffer was last considered in */
	uint64_t vdrm_reclaimed_at;
	/*
	 * The number of threads waiting for a pending reclamation
	 * on this buffer to complete.
	 */
	uint32_t vdrm_waiters;
	/* timestamp (MAS) of the last working set sample for this ringbuffer */
	uint64_t vdrm_last_sample_abs;
	/*
	 * The number of bytes reclaimed by kernel GC since the last user
	 * accounting update. Protected by @c vdrm_gate.
	 */
	size_t vdrm_kernel_bytes_reclaimed;
	/*
	 * The last amount of reclaimable bytes reported to the kernel.
	 */
	uint64_t vdrm_reclaimable_bytes_last;
	/*
	 * Exponential moving average of the minimum reclaimable buffer size
	 * (in units of VMDR_WMA_UNIT). Protected by @c vdrm_gate.
	 */
	uint64_t vdrm_reclaimable_bytes_wma;
	/*
	 * Tracks whether or not this reclamation metadata has been added
	 * to the global list yet. Normally, this happens when it is allocated,
	 * except in the case of fork(). In this case, we have to duplicate the
	 * parent's metadata before it returns from fork(), but this occurs
	 * before the child's address space is set up.
	 */
	uint8_t vdrm_is_registered : 1,
	    __unused1 : 7;
};
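
/*
 * Illustrative sketch (comment only, not part of the build) of the
 * reference-counting protocol described above: the owning task holds one
 * reference, and any other thread must take its own +1 (e.g. via
 * vmdr_acquire_task_metadata(), defined below) before touching the ring:
 *
 *	vm_deferred_reclamation_metadata_t m = vmdr_acquire_task_metadata(task);
 *	if (m != NULL) {
 *		// ... m stays valid here even if the task dies concurrently,
 *		// though vdrm_task may become TASK_NULL under vdrm_lock ...
 *		vmdr_metadata_release(m);	// drop our +1
 *	}
 */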

#pragma mark Globals
static KALLOC_TYPE_DEFINE(vm_reclaim_metadata_zone, struct vm_deferred_reclamation_metadata_s, KT_DEFAULT);
static LCK_GRP_DECLARE(vm_reclaim_lock_grp, "vm_reclaim");
os_refgrp_decl(static, vm_reclaim_metadata_refgrp, "vm_reclaim_metadata_refgrp", NULL);
/*
 * The reclamation_buffers list contains every buffer in the system.
 * The reclamation_buffers_lock protects the reclamation_buffers list.
 * It must be held when iterating over the list or manipulating the list.
 * It should be dropped when acting on a specific metadata entry after acquiring the vdrm_lock.
 */
static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) reclaim_buffers = TAILQ_HEAD_INITIALIZER(reclaim_buffers);
LCK_MTX_DECLARE(reclaim_buffers_lock, &vm_reclaim_lock_grp);
/* Number of times Reclaim GC has run */
uint64_t vm_reclaim_gc_epoch = 0;
/* The number of reclamation actions (drains/trims) done during GC */
uint64_t vm_reclaim_gc_reclaim_count;
/* Gate for GC */
static decl_lck_mtx_gate_data(, vm_reclaim_gc_gate);
os_log_t vm_reclaim_log_handle;
/* Number of initialized reclaim buffers */
_Atomic uint32_t vm_reclaim_buffer_count;
uint64_t vm_reclaim_sampling_period_abs = 0;
static SECURITY_READ_ONLY_LATE(thread_t) vm_reclaim_scavenger_thread = THREAD_NULL;
static sched_cond_atomic_t vm_reclaim_scavenger_cond = SCHED_COND_INIT;

#pragma mark Buffer Initialization/Destruction

static vm_deferred_reclamation_metadata_t
vmdr_metadata_alloc(
	task_t                  task,
	user_addr_t             buffer,
	mach_vm_size_t          size,
	mach_vm_reclaim_count_t len)
{
	vm_deferred_reclamation_metadata_t metadata;
	vm_map_t map = task->map;

	assert(!map->is_nested_map);

	metadata = zalloc_flags(vm_reclaim_metadata_zone, Z_WAITOK | Z_ZERO);
	lck_mtx_init(&metadata->vdrm_lock, &vm_reclaim_lock_grp, LCK_ATTR_NULL);
	lck_mtx_gate_init(&metadata->vdrm_lock, &metadata->vdrm_gate);
	os_ref_init(&metadata->vdrm_refcnt, &vm_reclaim_metadata_refgrp);

	metadata->vdrm_task = task;
	metadata->vdrm_map = map;
	metadata->vdrm_ring_addr = buffer;
	metadata->vdrm_ring_size = size;
	metadata->vdrm_buffer_len = len;

	if (os_atomic_inc(&vm_reclaim_buffer_count, relaxed) == UINT32_MAX) {
		panic("Overflowed vm_reclaim_buffer_count");
	}

	/*
	 * we do not need to hold a lock on `task` because this is called
	 * either at fork() time or from the context of current_task().
	 */
	vm_map_reference(map);
	return metadata;
}

static void
vmdr_metadata_free(vm_deferred_reclamation_metadata_t metadata)
{
	vm_map_deallocate(metadata->vdrm_map);
	lck_mtx_gate_destroy(&metadata->vdrm_lock, &metadata->vdrm_gate);
	lck_mtx_destroy(&metadata->vdrm_lock, &vm_reclaim_lock_grp);
	zfree(vm_reclaim_metadata_zone, metadata);
	if (os_atomic_dec_orig(&vm_reclaim_buffer_count, relaxed) == 0) {
		panic("Underflowed vm_reclaim_buffer_count");
	}
}

static mach_vm_size_t
vmdr_round_len_to_size(vm_map_t map, mach_vm_reclaim_count_t count)
{
	mach_vm_size_t metadata_size = offsetof(struct mach_vm_reclaim_ring_s, entries);
	mach_vm_size_t entries_size = count * sizeof(struct mach_vm_reclaim_entry_s);
	return vm_map_round_page(metadata_size + entries_size, vm_map_page_mask(map));
}
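
/*
 * Worked example for vmdr_round_len_to_size() (the page size and the
 * 16-byte sizeof(struct mach_vm_reclaim_entry_s) are illustrative
 * assumptions): with 16 KiB pages, a ring of count = 256 entries needs
 * offsetof(struct mach_vm_reclaim_ring_s, entries) + 256 * 16 bytes,
 * which vm_map_round_page() rounds up to a single 16 KiB page.
 */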

mach_error_t
vm_deferred_reclamation_buffer_allocate_internal(
	task_t                   task,
	mach_vm_address_ut       *address_u,
	uint64_t                 *sampling_period,
	mach_vm_reclaim_count_t  len,
	mach_vm_reclaim_count_t  max_len)
{
	kern_return_t kr;
	kern_return_t tmp_kr;
	vm_deferred_reclamation_metadata_t metadata = NULL;
	vm_map_t map;
	uint64_t head = 0, tail = 0, busy = 0;
	static bool reclaim_disabled_logged = false;

	if (task == TASK_NULL) {
		return KERN_INVALID_TASK;
	}
	if (address_u == NULL || sampling_period == NULL ||
	    len == 0 || max_len == 0 || max_len < len) {
		return KERN_INVALID_ARGUMENT;
	}
	if (!vm_reclaim_enabled) {
		if (!reclaim_disabled_logged) {
			/* Avoid logging failure for every new process */
			reclaim_disabled_logged = true;
			vmdr_log_error("failed to initialize deferred "
			    "reclamation buffer - vm_reclaim is disabled\n");
		}
		return VM_RECLAIM_NOT_SUPPORTED;
	}

	map = task->map;
	mach_vm_size_t rounded_vm_size = vmdr_round_len_to_size(map, max_len);
	if (rounded_vm_size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	if (rounded_vm_size > VM_RECLAIM_MAX_BUFFER_SIZE) {
		vmdr_log_error("denying request to allocate ringbuffer of size "
		    "%llu KiB (max %llu KiB)\n",
		    rounded_vm_size >> 10,
		    VM_RECLAIM_MAX_BUFFER_SIZE >> 10);
		return KERN_NO_SPACE;
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_START,
	    task_pid(task), len);

	/*
	 * Allocate a VM region that can contain the maximum buffer size. The
	 * allocation starts as VM_PROT_NONE and may be unprotected on buffer
	 * resize.
	 *
	 * TODO: If clients other than libmalloc adopt deferred reclaim, a
	 * different tag should be given
	 *
	 * `address` was sanitized under the assumption that we'll only use
	 * it as a hint (overflow checks were used) so we must pass the
	 * anywhere flag.
	 */
	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE_PERMANENT(
		.vm_tag = VM_MEMORY_MALLOC);
	mach_vm_size_ut size_u = vm_sanitize_wrap_size(rounded_vm_size);
	kr = mach_vm_map_kernel(map, address_u, size_u, VM_MAP_PAGE_MASK(map),
	    vmk_flags, IPC_PORT_NULL, 0, FALSE,
	    VM_PROT_DEFAULT, VM_PROT_DEFAULT, VM_INHERIT_COPY);
	if (kr != KERN_SUCCESS) {
		vmdr_log_error("%s [%d] failed to allocate VA for reclaim "
		    "buffer (%d)\n", task_best_name(task), task_pid(task), kr);
		return kr;
	}
	mach_vm_address_t address = VM_SANITIZE_UNSAFE_UNWRAP(*address_u);
	assert3u(address, !=, 0);

	metadata = vmdr_metadata_alloc(task, address, rounded_vm_size, len);
	metadata->vdrm_pid = task_pid(task);

	/*
	 * Validate the starting indices.
	 */
	kr = reclaim_copyin_busy(metadata, &busy);
	if (kr != KERN_SUCCESS) {
		goto out;
	}
	kr = reclaim_copyin_head(metadata, &head);
	if (kr != KERN_SUCCESS) {
		goto out;
	}
	kr = reclaim_copyin_tail(metadata, &tail);
	if (kr != KERN_SUCCESS) {
		goto out;
	}

	if (head != 0 || tail != 0 || busy != 0) {
		vmdr_log_error("indices were not zero-initialized\n");
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	/*
	 * Publish the metadata to the task & global buffer list. This must be
	 * done under the task lock to synchronize with task termination - i.e.
	 * task_terminate_internal is guaranteed to see the published metadata and
	 * tear it down.
	 */
	lck_mtx_lock(&reclaim_buffers_lock);
	task_lock(task);

	if (!task_is_active(task) || task_is_halting(task)) {
		vmdr_log_error(
			"failed to initialize buffer on dying task %s [%d]",
			task_best_name(task), task_pid(task));
		kr = KERN_ABORTED;
		goto fail_task;
	}
	if (task->deferred_reclamation_metadata != NULL) {
		vmdr_log_error(
			"tried to overwrite existing reclaim buffer for %s [%d]",
			task_best_name(task), task_pid(task));
		kr = VM_RECLAIM_RESOURCE_SHORTAGE;
		goto fail_task;
	}

	metadata->vdrm_is_registered = true;
	vmdr_list_append_locked(metadata);
	task->deferred_reclamation_metadata = metadata;

	task_unlock(task);
	lck_mtx_unlock(&reclaim_buffers_lock);

	*sampling_period = vm_reclaim_sampling_period_abs;
	vmdr_log_debug("%s [%d] allocated ring with capacity %u/%u\n",
	    task_best_name(task), task_pid(task),
	    len, max_len);
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END,
	    task_pid(task), KERN_SUCCESS, address);
	DTRACE_VM3(reclaim_ring_allocate,
	    mach_vm_address_t, address,
	    mach_vm_reclaim_count_t, len,
	    mach_vm_reclaim_count_t, max_len);
	return KERN_SUCCESS;

fail_task:
	task_unlock(task);
	lck_mtx_unlock(&reclaim_buffers_lock);

	tmp_kr = mach_vm_deallocate(map,
	    *address_u, size_u);
	assert(tmp_kr == KERN_SUCCESS);

out:
	*address_u = vm_sanitize_wrap_addr(0ull);
	*sampling_period = vm_reclaim_sampling_period_abs;
	vmdr_metadata_release(metadata);
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END,
	    kr, 0);
	return kr;
}

#pragma mark Synchronization & Lifecycle

static inline void
vmdr_metadata_lock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_lock(&metadata->vdrm_lock);
}

static inline void
vmdr_metadata_unlock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_unlock(&metadata->vdrm_lock);
}

static inline void
vmdr_metadata_assert_owned_locked(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_gate_assert(&metadata->vdrm_lock, &metadata->vdrm_gate,
	    GATE_ASSERT_HELD);
}

static inline void
vmdr_metadata_assert_owned(vm_deferred_reclamation_metadata_t metadata)
{
#if MACH_ASSERT
	vmdr_metadata_lock(metadata);
	vmdr_metadata_assert_owned_locked(metadata);
	vmdr_metadata_unlock(metadata);
#else /* MACH_ASSERT */
	(void)metadata;
#endif /* MACH_ASSERT */
}

static bool
vmdr_metadata_try_own_locked(vm_deferred_reclamation_metadata_t metadata)
{
	kern_return_t kr = lck_mtx_gate_try_close(&metadata->vdrm_lock,
	    &metadata->vdrm_gate);
	return kr == KERN_SUCCESS;
}

/*
 * Try to take ownership of the buffer. Returns true if successful.
 */
static bool
vmdr_metadata_own_locked(vm_deferred_reclamation_metadata_t metadata,
    vm_deferred_reclamation_options_t options)
{
	__assert_only gate_wait_result_t wait_result;
	if (!vmdr_metadata_try_own_locked(metadata)) {
		if (options & RECLAIM_NO_WAIT) {
			return false;
		}
		wait_result = lck_mtx_gate_wait(
			&metadata->vdrm_lock, &metadata->vdrm_gate, LCK_SLEEP_DEFAULT,
			THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
		assert(wait_result == GATE_HANDOFF);
	}
	return true;
}

/*
 * Set the current thread as the owner of a reclaim buffer. May block. Will
 * propagate priority.
 */
static void
vmdr_metadata_own(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_lock(metadata);
	vmdr_metadata_own_locked(metadata, RECLAIM_OPTIONS_NONE);
	vmdr_metadata_unlock(metadata);
}

static void
vmdr_metadata_disown_locked(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_assert_owned_locked(metadata);
	lck_mtx_gate_handoff(&metadata->vdrm_lock, &metadata->vdrm_gate,
	    GATE_HANDOFF_OPEN_IF_NO_WAITERS);
}

/*
 * Release ownership of a reclaim buffer and wakeup any threads waiting for
 * ownership. Must be called from the thread that acquired ownership.
 */
static void
vmdr_metadata_disown(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_lock(metadata);
	vmdr_metadata_disown_locked(metadata);
	vmdr_metadata_unlock(metadata);
}
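
/*
 * Hypothetical usage sketch for the ownership gate (comment only, not part
 * of the build). Ownership serializes copyio on the user ring; the
 * lck_mtx_gate handoff wakes the next waiter and donates priority to the
 * current owner while anyone waits:
 *
 *	vmdr_metadata_own(m);		// may block until the gate is handed off
 *	// ... exclusive copyio on the ring indices and entries ...
 *	vmdr_metadata_disown(m);	// open the gate, or hand off to a waiter
 */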

static void
vmdr_metadata_retain(vm_deferred_reclamation_metadata_t metadata)
{
	os_ref_retain(&metadata->vdrm_refcnt);
}

static void
vmdr_metadata_release(vm_deferred_reclamation_metadata_t metadata)
{
	if (os_ref_release(&metadata->vdrm_refcnt) == 0) {
		vmdr_metadata_free(metadata);
	}
}

static void
vmdr_list_remove_locked(vm_deferred_reclamation_metadata_t metadata)
{
	LCK_MTX_ASSERT(&reclaim_buffers_lock, LCK_MTX_ASSERT_OWNED);
	assert3p(metadata->vdrm_list.tqe_prev, !=, NULL);
	TAILQ_REMOVE(&reclaim_buffers, metadata, vdrm_list);
	metadata->vdrm_list.tqe_prev = NULL;
	metadata->vdrm_list.tqe_next = NULL;
}

static void
vmdr_list_append_locked(vm_deferred_reclamation_metadata_t metadata)
{
	LCK_MTX_ASSERT(&reclaim_buffers_lock, LCK_MTX_ASSERT_OWNED);
	assert3p(metadata->vdrm_list.tqe_prev, ==, NULL);
	TAILQ_INSERT_TAIL(&reclaim_buffers, metadata, vdrm_list);
}

void
vm_deferred_reclamation_buffer_deallocate(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	/*
	 * First remove the buffer from the global list so no one else can get access to it.
	 */
	lck_mtx_lock(&reclaim_buffers_lock);
	if (metadata->vdrm_is_registered) {
		vmdr_list_remove_locked(metadata);
	}
	lck_mtx_unlock(&reclaim_buffers_lock);

	/*
	 * The task is dropping its ref on this buffer. First remove the buffer's
	 * back-reference to the task so that any threads currently operating on
	 * this buffer do not try to operate on the dead/dying task
	 */
	vmdr_metadata_lock(metadata);
	assert3p(metadata->vdrm_task, !=, TASK_NULL);
	metadata->vdrm_task = TASK_NULL;
	vmdr_metadata_unlock(metadata);
	vmdr_metadata_release(metadata);
}

#pragma mark Exception Delivery

static void
reclaim_kill_with_reason(
	vm_deferred_reclamation_metadata_t metadata,
	unsigned reason,
	mach_exception_data_type_t subcode)
{
	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
	mach_exception_code_t code = 0;
	task_t task;
	proc_t p = NULL;
	boolean_t fatal = TRUE;
	bool killing_self;
	pid_t pid;
	int err;

	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_NOTOWNED);

	EXC_GUARD_ENCODE_TYPE(code, guard_type);
	EXC_GUARD_ENCODE_FLAVOR(code, reason);
	EXC_GUARD_ENCODE_TARGET(code, 0);

	vmdr_metadata_lock(metadata);
	task = metadata->vdrm_task;
	if (task == TASK_NULL || !task_is_active(task) || task_is_halting(task)) {
		/* Task is no longer alive */
		vmdr_metadata_unlock(metadata);
		vmdr_log_error(
			"Unable to deliver guard exception because task "
			"[%d] is already dead.\n",
			metadata->vdrm_pid);
		return;
	}

	if (panic_on_kill) {
		panic("About to kill %p due to %d with subcode %lld\n", task, reason, subcode);
	}

	killing_self = (task == current_task());
	if (!killing_self) {
		task_reference(task);
	}
	assert(task != kernel_task);
	vmdr_metadata_unlock(metadata);

	if (reason == kGUARD_EXC_DEALLOC_GAP) {
		task_lock(task);
		fatal = (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL);
		task_unlock(task);
	}

	if (!fatal) {
		vmdr_log_info(
			"Skipping non-fatal guard exception for %s [%d]\n",
			task_best_name(task), task_pid(task));
		goto out;
	}

	pid = task_pid(task);
	if (killing_self) {
		p = get_bsdtask_info(task);
	} else {
		p = proc_find(pid);
		if (p && proc_task(p) != task) {
			vmdr_log_error(
				"Unable to deliver guard exception because proc is gone & pid rolled over.\n");
			goto out;
		}
	}

	if (!p) {
		vmdr_log_error(
			"Unable to deliver guard exception because task does not have a proc.\n");
		goto out;
	}

	int flags = PX_DEBUG_NO_HONOR;
	exception_info_t info = {
		.os_reason = OS_REASON_GUARD,
		.exception_type = EXC_GUARD,
		.mx_code = code,
		.mx_subcode = subcode
	};

	vmdr_log("Force-exiting %s [%d]\n", task_best_name(task), task_pid(task));

	err = exit_with_mach_exception(p, info, flags);
	if (err != 0) {
		vmdr_log_error("Unable to deliver guard exception to %p: %d\n", p, err);
		goto out;
	}

out:
	if (!killing_self) {
		if (p) {
			proc_rele(p);
			p = NULL;
		}
		if (task) {
			task_deallocate(task);
			task = NULL;
		}
	}
}

#pragma mark Copy I/O

static user_addr_t
get_entries_ptr(vm_deferred_reclamation_metadata_t metadata)
{
	return metadata->vdrm_ring_addr +
	       offsetof(struct mach_vm_reclaim_ring_s, entries);
}

static user_addr_t
get_head_ptr(vm_deferred_reclamation_metadata_t metadata)
{
	return metadata->vdrm_ring_addr + offsetof(struct mach_vm_reclaim_ring_s, head);
}

static user_addr_t
get_tail_ptr(vm_deferred_reclamation_metadata_t metadata)
{
	return metadata->vdrm_ring_addr + offsetof(struct mach_vm_reclaim_ring_s, tail);
}

static user_addr_t
get_busy_ptr(vm_deferred_reclamation_metadata_t metadata)
{
	return metadata->vdrm_ring_addr + offsetof(struct mach_vm_reclaim_ring_s, busy);
}

static kern_return_t
reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result)
{
	if (result != 0 && (result != EFAULT || !vm_fault_get_disabled())) {
		vmdr_log_error("Killing [%d] due to copy I/O error\n", metadata->vdrm_pid);
		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_COPYIO_FAILURE,
		    result);
	}
	return kern_return_for_errno(result);
}

/*
 * Helper functions to do copyio on the head, tail, and busy pointers.
 * Note that the kernel will only write to the busy and head pointers.
 * Userspace is not supposed to write to the head or busy pointers, but the kernel
 * must be resilient to that kind of bug in userspace.
 */
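
/*
 * Illustrative view of the ring indices (the numbers are made up): indices
 * increase monotonically and are reduced modulo the ring length on access.
 * With len = 256, head = 300, busy = 310, tail = 320:
 *
 *	slot(head) = 300 % 256 = 44
 *	entries[44..53]  are busy (head..busy: being reclaimed by the kernel)
 *	entries[54..63]  are pending (busy..tail: reclaimable, not yet busy)
 *
 * The kernel expects head <= busy <= tail; busy < head is treated as
 * userspace corruption, and tail < head as a racing cancellation.
 */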

static kern_return_t
reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head)
{
	int result;
	kern_return_t kr;
	user_addr_t head_ptr = get_head_ptr(metadata);

	result = copyin_atomic64(head_ptr, head);
	kr = reclaim_handle_copyio_error(metadata, result);
	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
		vmdr_log_error(
			"Unable to copy head ptr from 0x%llx: err=%d\n", head_ptr, result);
	}
	return kr;
}

static kern_return_t
reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail)
{
	int result;
	kern_return_t kr;
	user_addr_t tail_ptr = get_tail_ptr(metadata);

	result = copyin_atomic64(tail_ptr, tail);
	kr = reclaim_handle_copyio_error(metadata, result);
	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
		vmdr_log_error(
			"Unable to copy tail ptr from 0x%llx: err=%d\n", tail_ptr, result);
	}
	return kr;
}

static kern_return_t
reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy)
{
	int result;
	kern_return_t kr;
	user_addr_t busy_ptr = get_busy_ptr(metadata);

	result = copyin_atomic64(busy_ptr, busy);
	kr = reclaim_handle_copyio_error(metadata, result);
	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
		vmdr_log_error(
			"Unable to copy busy ptr from 0x%llx: err=%d\n", busy_ptr, result);
	}
	return kr;
}

static kern_return_t
reclaim_copyin_reclaimable_bytes(vm_deferred_reclamation_metadata_t metadata, size_t *reclaimable_bytes_out)
{
	int result;
	kern_return_t kr = KERN_SUCCESS;
	uint64_t reclaimable_bytes;
	user_addr_t ptr = (uintptr_t)metadata->vdrm_ring_addr +
	    offsetof(struct mach_vm_reclaim_ring_s, reclaimable_bytes);

	result = copyin_atomic64(ptr, &reclaimable_bytes);
	if (result) {
		kr = reclaim_handle_copyio_error(metadata, result);
		if (result != EFAULT || !vm_fault_get_disabled()) {
			vmdr_log_error("Unable to copyin reclaimable byte count err=%d\n", result);
		}
	} else {
		*reclaimable_bytes_out = (size_t)reclaimable_bytes;
	}
	return kr;
}

static kern_return_t
reclaim_copyin_min_reclaimable_bytes(vm_deferred_reclamation_metadata_t metadata, size_t *min_reclaimable_bytes_out)
{
	int result;
	kern_return_t kr = KERN_SUCCESS;
	uint64_t min_reclaimable_bytes;
	user_addr_t ptr = (uintptr_t)metadata->vdrm_ring_addr +
	    offsetof(struct mach_vm_reclaim_ring_s, reclaimable_bytes_min);

	result = copyin_atomic64(ptr, &min_reclaimable_bytes);
	if (result) {
		kr = reclaim_handle_copyio_error(metadata, result);
		if (result != EFAULT || !vm_fault_get_disabled()) {
			vmdr_log_error("Unable to copyin min reclaimable byte count err=%d\n", result);
		}
	} else {
		*min_reclaimable_bytes_out = (size_t)min_reclaimable_bytes;
	}
	return kr;
}

static kern_return_t
reclaim_copyout_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	kern_return_t kr = KERN_SUCCESS;
	user_addr_t busy_ptr = get_busy_ptr(metadata);

	result = copyout_atomic64(value, busy_ptr);
	if (result) {
		kr = reclaim_handle_copyio_error(metadata, result);
		if (result != EFAULT || !vm_fault_get_disabled()) {
			vmdr_log_error(
				"Unable to copy %llu to busy ptr at 0x%llx: err=%d\n", value, busy_ptr, result);
		}
	}
	return kr;
}

static kern_return_t
reclaim_copyout_head(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	kern_return_t kr = KERN_SUCCESS;
	user_addr_t head_ptr = get_head_ptr(metadata);

	result = copyout_atomic64(value, head_ptr);
	if (result) {
		kr = reclaim_handle_copyio_error(metadata, result);
		if (result != EFAULT || !vm_fault_get_disabled()) {
			vmdr_log_error(
				"Unable to copy %llu to head ptr at 0x%llx: err=%d\n", value, head_ptr, result);
		}
	}
	return kr;
}

static kern_return_t
reclaim_copyout_min_reclaimable_bytes(vm_deferred_reclamation_metadata_t metadata, size_t min_reclaimable_bytes)
{
	int result;
	kern_return_t kr = KERN_SUCCESS;
	user_addr_t ptr = (uintptr_t)metadata->vdrm_ring_addr +
	    offsetof(struct mach_vm_reclaim_ring_s, reclaimable_bytes_min);

	result = copyout_atomic64(min_reclaimable_bytes, ptr);
	if (result) {
		kr = reclaim_handle_copyio_error(metadata, result);
		if (result != EFAULT || !vm_fault_get_disabled()) {
			vmdr_log_error("Unable to copyout min reclaimable byte count err=%d\n", result);
		}
	}
	return kr;
}

#pragma mark Reclamation

/*
 * @func reclaim_chunk
 *
 * @brief
 * Reclaim a batch of entries from the buffer.
 *
 * @param bytes_to_reclaim
 * Number of bytes the caller wishes to reclaim from the buffer
 *
 * @param bytes_reclaimed_out
 * The number of bytes reclaimed from the buffer written out
 *
 * @param chunk_size
 * The maximum number of entries to hold busy and reclaim from (must
 * be <= kReclaimChunkSize)
 *
 * @param num_reclaimed_out
 * The number of entries reclaimed written out
 *
 * @discussion
 * If the buffer has been exhausted of entries (tail == head),
 * num_reclaimed_out will be zero. It is important that the caller abort any
 * loops if such a condition is met.
 */
static kern_return_t
reclaim_chunk(vm_deferred_reclamation_metadata_t metadata,
    uint64_t bytes_to_reclaim, uint64_t *bytes_reclaimed_out,
    mach_vm_reclaim_count_t chunk_size, mach_vm_reclaim_count_t *num_reclaimed_out)
{
	kern_return_t kr = KERN_SUCCESS;
	int result = 0;
	mach_vm_reclaim_count_t num_reclaimed = 0, num_copied = 0;
	uint64_t bytes_reclaimed = 0;
	uint64_t head = 0, tail = 0, busy = 0, num_to_reclaim = 0, new_tail = 0;
	vm_map_t map = metadata->vdrm_map;
	vm_map_switch_context_t switch_ctx;
	struct mach_vm_reclaim_entry_s copied_entries[kReclaimChunkSize];

	assert(metadata != NULL);
	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_NOTOWNED);
	vmdr_metadata_assert_owned(metadata);

	assert(chunk_size <= kReclaimChunkSize);

	KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_START,
	    metadata->vdrm_pid, bytes_to_reclaim);

	memset(copied_entries, 0, sizeof(copied_entries));

	switch_ctx = vm_map_switch_to(map);

	kr = reclaim_copyin_busy(metadata, &busy);
	if (kr != KERN_SUCCESS) {
		goto done;
	}
	kr = reclaim_copyin_head(metadata, &head);
	if (kr != KERN_SUCCESS) {
		goto done;
	}
	kr = reclaim_copyin_tail(metadata, &tail);
	if (kr != KERN_SUCCESS) {
		goto done;
	}

	/*
	 * NB: busy may not be exactly equal to head if the jetsam
	 * thread fails to fault on the indices after having marked
	 * entries busy
	 */
	if (busy < head || (busy - head) > kReclaimChunkSize) {
		vmdr_log_error(
			"Userspace modified head or busy pointer! head: %llu "
			"(0x%llx) | busy: %llu (0x%llx) | tail = %llu (0x%llx)\n",
			head, get_head_ptr(metadata), busy, get_busy_ptr(metadata), tail,
			get_tail_ptr(metadata));
		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_INDEX_FAILURE,
		    busy);
		kr = KERN_FAILURE;
		goto done;
	}

	if (tail < head) {
		/*
		 * Userspace is likely in the middle of trying to re-use an entry,
		 * bail on this reclamation.
		 */
		vmdr_log_error(
			"Tail < head! Userspace is likely attempting a "
			"cancellation; aborting reclamation | head: %llu "
			"(0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
			head, get_head_ptr(metadata), tail, get_tail_ptr(metadata), busy,
			get_busy_ptr(metadata));
		kr = KERN_ABORTED;
		goto done;
	}

	/*
	 * NB: If any of the copyouts below fail due to faults being disabled,
	 * the buffer may be left in a state where several entries are unusable
	 * until the next reclamation (i.e. busy > head)
	 */
	num_to_reclaim = tail - head;
	while (true) {
		num_to_reclaim = MIN(num_to_reclaim, chunk_size);
		if (num_to_reclaim == 0) {
			break;
		}
		busy = head + num_to_reclaim;
		kr = reclaim_copyout_busy(metadata, busy);
		if (kr != KERN_SUCCESS) {
			goto done;
		}
		os_atomic_thread_fence(seq_cst);
		kr = reclaim_copyin_tail(metadata, &new_tail);
		if (kr != KERN_SUCCESS) {
			goto done;
		}

		if (new_tail >= busy) {
			/* Got num_to_reclaim entries */
			break;
		}
		tail = new_tail;
		if (tail < head) {
			/*
			 * Userspace is likely in the middle of trying to re-use an entry,
			 * bail on this reclamation
			 */
			vmdr_log_error(
				"Tail < head! Userspace is likely attempting a "
				"cancellation; aborting reclamation | head: %llu "
				"(0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
				head, get_head_ptr(metadata), tail, get_tail_ptr(metadata), busy,
				get_busy_ptr(metadata));
			/* Reset busy back to head */
			reclaim_copyout_busy(metadata, head);
			kr = KERN_ABORTED;
			goto done;
		}
		/* Can't reclaim these entries. Try again */
		num_to_reclaim = tail - head;
		if (num_to_reclaim == 0) {
			/* Nothing left to reclaim. Reset busy to head. */
			kr = reclaim_copyout_busy(metadata, head);
			if (kr != KERN_SUCCESS) {
				goto done;
			}
			break;
		}
		/*
		 * Note that num_to_reclaim must have gotten smaller since tail got smaller,
		 * so this is guaranteed to converge.
		 */
	}
	vmdr_log_debug("[%d] reclaiming up to %llu entries (%llu bytes) head=%llu "
	    "busy=%llu tail=%llu len=%u", metadata->vdrm_pid, num_to_reclaim,
	    bytes_reclaimed, head, busy, tail, metadata->vdrm_buffer_len);

	uint64_t memcpy_start_idx = head % metadata->vdrm_buffer_len;
	while (num_copied < num_to_reclaim) {
		uint64_t memcpy_end_idx = memcpy_start_idx + num_to_reclaim - num_copied;
		/* Clamp the end idx to the buffer. We'll handle wrap-around in our next go around the loop. */
		memcpy_end_idx = MIN(memcpy_end_idx, metadata->vdrm_buffer_len);
		uint64_t num_to_copy = memcpy_end_idx - memcpy_start_idx;

		assert(num_to_copy + num_copied <= kReclaimChunkSize);
		user_addr_t src_ptr = get_entries_ptr(metadata) +
		    (memcpy_start_idx * sizeof(struct mach_vm_reclaim_entry_s));
		struct mach_vm_reclaim_entry_s *dst_ptr = copied_entries + num_copied;
		result = copyin(src_ptr, dst_ptr,
		    (num_to_copy * sizeof(struct mach_vm_reclaim_entry_s)));
		kr = reclaim_handle_copyio_error(metadata, result);
		if (kr != KERN_SUCCESS) {
			if (kr != KERN_MEMORY_ERROR || !vm_fault_get_disabled()) {
				vmdr_log_error(
					"Unable to copyin %llu entries in reclaim "
					"buffer at 0x%llx to 0x%llx: err=%d\n",
					num_to_copy, src_ptr, (uint64_t) dst_ptr, result);
			}
			goto done;
		}

		num_copied += num_to_copy;
		memcpy_start_idx = (memcpy_start_idx + num_to_copy) % metadata->vdrm_buffer_len;
	}

	for (num_reclaimed = 0; num_reclaimed < num_to_reclaim && bytes_reclaimed < bytes_to_reclaim; num_reclaimed++) {
		mach_vm_reclaim_entry_t entry = &copied_entries[num_reclaimed];
		KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_START,
		    metadata->vdrm_pid, entry->address, entry->size,
		    entry->behavior);
		if (entry->address != 0 && entry->size != 0) {
			vm_map_address_t start = vm_map_trunc_page(entry->address,
			    VM_MAP_PAGE_MASK(map));
			vm_map_address_t end = vm_map_round_page(entry->address + entry->size,
			    VM_MAP_PAGE_MASK(map));
			DTRACE_VM4(vm_reclaim_entry,
			    pid_t, metadata->vdrm_pid,
			    mach_vm_address_t, entry->address,
			    mach_vm_address_t, end,
			    mach_vm_reclaim_action_t, entry->behavior);
			KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_START,
			    metadata->vdrm_pid, start, end,
			    entry->behavior);
			vmdr_log_debug("[%d] Reclaiming entry %llu (0x%llx, 0x%llx)\n", metadata->vdrm_pid, head + num_reclaimed, start, end);
			switch (entry->behavior) {
			case VM_RECLAIM_DEALLOCATE:
				kr = vm_map_remove_guard(map,
				    start, end, VM_MAP_REMOVE_GAPS_FAIL,
				    KMEM_GUARD_NONE).kmr_return;
				if (kr == KERN_INVALID_VALUE) {
					vmdr_log_error(
						"[%d] Killing due to virtual-memory guard at (0x%llx, 0x%llx)\n",
						metadata->vdrm_pid, start, end);
					reclaim_kill_with_reason(metadata, kGUARD_EXC_DEALLOC_GAP, entry->address);
					goto done;
				} else if (kr != KERN_SUCCESS) {
					vmdr_log_error(
						"[%d] Killing due to deallocation failure at (0x%llx, 0x%llx) err=%d\n",
						metadata->vdrm_pid, start, end, kr);
					reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
					goto done;
				}
				break;
			case VM_RECLAIM_FREE:
				/*
				 * TODO: This should free the backing pages directly instead of using
				 * VM_BEHAVIOR_REUSABLE, which will mark the pages as clean and let them
				 * age in the LRU.
				 */
				kr = vm_map_behavior_set(map, start,
				    end, VM_BEHAVIOR_REUSABLE);
				if (kr != KERN_SUCCESS) {
					vmdr_log_error(
						"[%d] Failed to free(reusable) (0x%llx, 0x%llx) err=%d\n",
						metadata->vdrm_pid, start, end, kr);
				}
				break;
			default:
				vmdr_log_error(
					"attempted to reclaim entry with unsupported behavior %u",
					entry->behavior);
				reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
				kr = KERN_INVALID_VALUE;
				goto done;
			}
			bytes_reclaimed += entry->size;
			KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_END,
			    kr);
		}
	}

	assert(head + num_reclaimed <= busy);
	head += num_reclaimed;
	kr = reclaim_copyout_head(metadata, head);
	if (kr != KERN_SUCCESS) {
		goto done;
	}
	if (busy > head) {
		busy = head;
		kr = reclaim_copyout_busy(metadata, busy);
		if (kr != KERN_SUCCESS) {
			goto done;
		}
	}

done:
	vmdr_log_debug("[%d] reclaimed %u entries (%llu bytes) head=%llu "
	    "busy=%llu tail=%llu len=%u", metadata->vdrm_pid, num_reclaimed,
	    bytes_reclaimed, head, busy, tail, metadata->vdrm_buffer_len);
	vm_map_switch_back(switch_ctx);
	KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_END,
	    bytes_reclaimed, num_reclaimed, kr);
	if (bytes_reclaimed_out) {
		*bytes_reclaimed_out = bytes_reclaimed;
	}
	if (num_reclaimed_out) {
		*num_reclaimed_out = num_reclaimed;
	}
	return kr;
}
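
/*
 * Example walk-through of a successful reclaim_chunk() pass (the numbers
 * are illustrative): suppose head = 10, tail = 14, chunk_size = 16.
 *
 *	num_to_reclaim = tail - head = 4
 *	busy := 14 is copied out, then tail is re-read to detect races
 *	4 entries are copied in and deallocated / marked reusable
 *	head := 14 is copied out; busy is pulled back to head if it ran ahead
 *
 * If userspace concurrently cancels entries and moves tail below head, the
 * pass returns KERN_ABORTED after resetting busy to head.
 */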

/*
 * @func vmdr_reclaim_from_buffer
 *
 * @brief
 * Reclaim entries from the buffer until at least @c bytes_to_reclaim bytes
 * have been reclaimed (or the buffer is exhausted).
 *
 * @param bytes_to_reclaim
 * The minimum number of bytes to reclaim
 *
 * @param num_bytes_reclaimed_out
 * The number of bytes reclaimed written out
 *
 * @param options
 * If RECLAIM_NO_FAULT is set, do not fault on the buffer if it has been paged
 * out.
 *
 * @discussion
 * The buffer should be owned by the caller.
 */
static kern_return_t
vmdr_reclaim_from_buffer(vm_deferred_reclamation_metadata_t metadata,
    mach_vm_size_t bytes_to_reclaim, mach_vm_size_t *num_bytes_reclaimed_out,
    vm_deferred_reclamation_options_t options)
{
	kern_return_t kr = KERN_SUCCESS;

	if (options & RECLAIM_NO_FAULT) {
		vm_fault_disable();
	}

	mach_vm_size_t total_bytes_reclaimed = 0;
	while (total_bytes_reclaimed < bytes_to_reclaim) {
		mach_vm_size_t cur_bytes_reclaimed;
		mach_vm_reclaim_count_t entries_reclaimed;
		kr = reclaim_chunk(metadata, bytes_to_reclaim - total_bytes_reclaimed,
		    &cur_bytes_reclaimed, kReclaimChunkSize, &entries_reclaimed);
		total_bytes_reclaimed += cur_bytes_reclaimed;
		if (entries_reclaimed == 0 || kr != KERN_SUCCESS) {
			break;
		}
	}

	if (options & RECLAIM_NO_FAULT) {
		vm_fault_enable();
	}
	vmdr_log_debug("reclaimed %llu B / %llu B from %d\n", total_bytes_reclaimed, bytes_to_reclaim, metadata->vdrm_pid);
	if (num_bytes_reclaimed_out) {
		*num_bytes_reclaimed_out = total_bytes_reclaimed;
	}
	return kr;
}

/*
 * Get and retain the reclamation metadata buffer for the given task.
 */
static vm_deferred_reclamation_metadata_t
vmdr_acquire_task_metadata(task_t task)
{
	vm_deferred_reclamation_metadata_t meta = NULL;
	assert(task != NULL);
	task_lock(task);
	if (!task_is_halting(task) && task_is_active(task)) {
		meta = task->deferred_reclamation_metadata;
	}
	if (meta != NULL) {
		vmdr_metadata_retain(meta);
	}
	task_unlock(task);
	return meta;
}

#pragma mark Buffer Resize/Synchronization

kern_return_t
vm_deferred_reclamation_buffer_flush_internal(task_t task,
    mach_vm_reclaim_count_t num_entries_to_reclaim,
    mach_vm_size_t *bytes_reclaimed_out)
{
	kern_return_t kr;
	vm_deferred_reclamation_metadata_t metadata = NULL;
	mach_vm_reclaim_count_t total_reclaimed = 0;
	uint64_t bytes_reclaimed = 0;

	if (!task_is_active(task)) {
		return KERN_INVALID_TASK;
	}

	metadata = vmdr_acquire_task_metadata(task);
	if (metadata == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	vmdr_metadata_own(metadata);

	vmdr_log_debug("[%d] flushing %u entries\n", task_pid(task), num_entries_to_reclaim);
	KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_FLUSH) | DBG_FUNC_START, metadata->vdrm_pid, num_entries_to_reclaim);

	while (total_reclaimed < num_entries_to_reclaim) {
		mach_vm_reclaim_count_t cur_reclaimed;
		uint64_t cur_bytes_reclaimed;
		mach_vm_reclaim_count_t chunk_size = MIN(num_entries_to_reclaim - total_reclaimed, kReclaimChunkSize);
		kr = reclaim_chunk(metadata, UINT64_MAX, &cur_bytes_reclaimed, chunk_size,
		    &cur_reclaimed);
		total_reclaimed += cur_reclaimed;
		bytes_reclaimed += cur_bytes_reclaimed;
		if (kr == KERN_ABORTED) {
			/*
			 * Unable to reclaim due to a lost race with
			 * userspace, yield the gate and try again.
			 * (Checked before the empty-chunk test because an
			 * aborted pass reclaims no entries.)
			 */
			vmdr_metadata_disown(metadata);
			vmdr_metadata_own(metadata);
			continue;
		} else if (cur_reclaimed == 0 || kr != KERN_SUCCESS) {
			break;
		}
	}
	/*
	 * Tell the client how many bytes the kernel has reclaimed
	 * since the last time it updated its accounting
	 */
	bytes_reclaimed += metadata->vdrm_kernel_bytes_reclaimed;
	metadata->vdrm_kernel_bytes_reclaimed = 0;

	vmdr_metadata_disown(metadata);

	*bytes_reclaimed_out = bytes_reclaimed;
	KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_FLUSH) | DBG_FUNC_END, kr, total_reclaimed, bytes_reclaimed);
	DTRACE_VM2(reclaim_flush,
	    mach_vm_reclaim_count_t, num_entries_to_reclaim,
	    size_t, bytes_reclaimed);
	return kr;
}

kern_return_t
vm_deferred_reclamation_buffer_resize_internal(
	task_t                   task,
	mach_vm_reclaim_count_t len,
	mach_vm_size_t *bytes_reclaimed_out)
{
	kern_return_t kr;
	mach_vm_reclaim_count_t num_entries_reclaimed = 0;
	mach_vm_reclaim_count_t old_len;

	if (task == TASK_NULL) {
		return KERN_INVALID_TASK;
	}
	if (len == 0) {
		return KERN_INVALID_ARGUMENT;
	}
	vm_deferred_reclamation_metadata_t metadata = vmdr_acquire_task_metadata(task);
	if (metadata == NULL) {
		return KERN_INVALID_TASK;
	}

	/* Size must be a multiple of the page size */
	vm_map_t map = task->map;
	mach_vm_size_t new_size = vmdr_round_len_to_size(map, len);
	if (new_size == 0) {
		vmdr_metadata_release(metadata);
		return KERN_INVALID_ARGUMENT;
	}
	if (new_size > metadata->vdrm_ring_size) {
		vmdr_metadata_release(metadata);
		return KERN_NO_SPACE;
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_RESIZE) | DBG_FUNC_START,
	    task_pid(task), new_size);

	/*
	 * Prevent other threads from operating on this buffer while it is
	 * resized. It is the caller's responsibility to ensure mutual
	 * exclusion with other user threads
	 */
	vmdr_metadata_own(metadata);

	old_len = metadata->vdrm_buffer_len;

	vmdr_log_debug("%s [%d] resizing buffer %u -> %u entries\n",
	    task_best_name(task), task_pid(task), old_len, len);

	/*
	 * Reclaim all the entries currently in the buffer to prevent re-use
	 * of old reclaim ids that will alias differently into the newly sized
	 * buffer.
	 *
	 * TODO: Consider encoding the ringbuffer-capacity in the
	 * mach_vm_reclaim_id_t, so reuses can still find objects after a resize.
	 */
	mach_vm_size_t total_bytes_reclaimed = 0;
	do {
		mach_vm_size_t cur_bytes_reclaimed;
		kr = reclaim_chunk(metadata, UINT64_MAX, &cur_bytes_reclaimed, kReclaimChunkSize,
		    &num_entries_reclaimed);
		total_bytes_reclaimed += cur_bytes_reclaimed;
		if (kr != KERN_SUCCESS) {
			goto fail;
		}
	} while (num_entries_reclaimed > 0);

	vmdr_log_debug("[%d] successfully resized buffer | reclaimed: %llu B "
	    "kernel_reclaimed: %zu B\n", metadata->vdrm_pid,
	    total_bytes_reclaimed, metadata->vdrm_kernel_bytes_reclaimed);

	total_bytes_reclaimed += metadata->vdrm_kernel_bytes_reclaimed;
	metadata->vdrm_kernel_bytes_reclaimed = 0;

	/* Publish the new ring length in the kernel metadata */
	vmdr_metadata_lock(metadata);
	metadata->vdrm_buffer_len = len;
	vmdr_metadata_disown_locked(metadata);
	vmdr_metadata_unlock(metadata);
	vmdr_metadata_release(metadata);

	*bytes_reclaimed_out = total_bytes_reclaimed;

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_RESIZE) | DBG_FUNC_END, KERN_SUCCESS, num_entries_reclaimed, total_bytes_reclaimed);
	DTRACE_VM2(reclaim_ring_resize,
	    mach_vm_reclaim_count_t, old_len,
	    mach_vm_reclaim_count_t, len);
	return KERN_SUCCESS;

fail:
	vmdr_metadata_disown(metadata);
	vmdr_metadata_release(metadata);
	*bytes_reclaimed_out = total_bytes_reclaimed;
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_RESIZE) | DBG_FUNC_END, kr, num_entries_reclaimed);
	return kr;
}
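
/*
 * Illustrative note on the full drain above (the numbers are made up):
 * reclaim ids index the ring modulo its length, so the same id can land in
 * a different slot once the length changes. With id = 200, a 256-entry ring
 * places it at slot 200 % 256 = 200, while a 128-entry ring places it at
 * slot 200 % 128 = 72. Draining the ring before the resize ensures no stale
 * id can be re-used against the new geometry.
 */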

#pragma mark Accounting

extern vm_pressure_level_t memorystatus_vm_pressure_level;

static kern_return_t
vmdr_calculate_autotrim_threshold(vm_deferred_reclamation_metadata_t metadata, size_t *trim_threshold_out)
{
	kern_return_t kr;
	uint32_t autotrim_pct;

	/*
	 * Determine the autotrim threshold based on the current pressure level
	 */
	vm_pressure_level_t pressure_level = os_atomic_load(&memorystatus_vm_pressure_level, relaxed);
	switch (pressure_level) {
	case kVMPressureNormal:
		autotrim_pct = vm_reclaim_autotrim_pct_normal;
		break;
	case kVMPressureWarning:
	case kVMPressureUrgent:
		autotrim_pct = vm_reclaim_autotrim_pct_pressure;
		break;
	case kVMPressureCritical:
		autotrim_pct = vm_reclaim_autotrim_pct_critical;
		break;
	default:
		panic("vm_reclaim: unexpected vm_pressure_level %d", pressure_level);
	}

	/*
	 * Estimate the task's maximum working set size
	 */
	ledger_amount_t phys_footprint_max = 0;

	vmdr_metadata_lock(metadata);
	task_t task = metadata->vdrm_task;
	if (task == TASK_NULL) {
		vmdr_metadata_unlock(metadata);
		return KERN_INVALID_TASK;
	}
	task_reference(task);
	vmdr_metadata_unlock(metadata);

	kr = ledger_get_lifetime_max(task->ledger,
	    task_ledgers.phys_footprint, &phys_footprint_max);
	assert3u(kr, ==, KERN_SUCCESS);

	task_deallocate(task);

	*trim_threshold_out = phys_footprint_max * autotrim_pct / 100;
	return KERN_SUCCESS;
}
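
/*
 * Example (illustrative numbers): with a lifetime-peak phys_footprint of
 * 200 MiB, the trim threshold is 20 MiB at normal pressure
 * (vm_reclaim_autotrim_pct_normal = 10), 10 MiB under pressure (5%), and
 * 2 MiB when critical (1%), so buffers are trimmed more eagerly as memory
 * pressure rises.
 */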

#define VMDR_WMA_UNIT (1 << 8)
#define VMDR_WMA_MIX(base, e)  ((vm_reclaim_wma_weight_base * (base) + (e) * VMDR_WMA_UNIT * vm_reclaim_wma_weight_cur) / vm_reclaim_wma_denom)
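
/*
 * With the default weights (base = 3, cur = 1, denom = 4) VMDR_WMA_MIX is a
 * fixed-point EWMA: new = (3 * old + 256 * sample) / 4, kept in units of
 * 1/256 byte (VMDR_WMA_UNIT). Example with a constant 4096-byte sample,
 * starting from 0: 262144, 458752, 606208, ... converging to
 * 4096 * 256 = 1048576, i.e. 4096 bytes once divided by VMDR_WMA_UNIT.
 */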
1458 
1459 /*
1460  * @func vmdr_ws_sample
1461  *
1462  * @brief sample the working set size of the given buffer
1463  *
1464  * @param metadata
1465  * The reclaim buffer to sample
1466  *
1467  * @param trim_threshold_out
1468  * If the buffer should be trimmed, the amount to trim (in bytes) will be
1469  * written out
1470  *
1471  * @returns KERN_MEMORY_ERROR if copyio failed due to RECLAIM_NO_FAULT
1472  *
1473  * @discussion
1474  * The caller must own the buffer
1475  */
1476 static mach_error_t
vmdr_sample_working_set(vm_deferred_reclamation_metadata_t metadata,mach_vm_size_t * trim_threshold_out,vm_deferred_reclamation_options_t options)1477 vmdr_sample_working_set(vm_deferred_reclamation_metadata_t metadata,
1478     mach_vm_size_t *trim_threshold_out, vm_deferred_reclamation_options_t options)
1479 {
1480 	mach_error_t err = ERR_SUCCESS;
1481 	size_t min_reclaimable_bytes = 0, cur_reclaimable_bytes = 0;
1482 	uint64_t wma = 0;
1483 
1484 	vmdr_metadata_assert_owned(metadata);
1485 
1486 	*trim_threshold_out = 0;
1487 
1488 	vm_map_switch_context_t map_ctx = vm_map_switch_to(metadata->vdrm_map);
1489 
1490 	if (options & RECLAIM_NO_FAULT) {
1491 		vm_fault_disable();
1492 	}
1493 
1494 	err = reclaim_copyin_min_reclaimable_bytes(metadata, &min_reclaimable_bytes);
1495 	if (err != ERR_SUCCESS) {
1496 		goto done;
1497 	}
1498 
1499 	uint64_t now = mach_absolute_time();
1500 	if (now - metadata->vdrm_last_sample_abs < vm_reclaim_sampling_period_abs) {
1501 		/* A sampling period has not elapsed */
1502 		goto done;
1503 	}
1504 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_SAMPLE) | DBG_FUNC_START,
1505 	    metadata->vdrm_pid,
1506 	    now,
1507 	    metadata->vdrm_last_sample_abs,
1508 	    min_reclaimable_bytes);
1509 
1510 	err = reclaim_copyin_reclaimable_bytes(metadata, &cur_reclaimable_bytes);
1511 	if (err != ERR_SUCCESS) {
1512 		goto done;
1513 	}
1514 
1515 	/* Reset the minimum to start a new sampling interval */
1516 	err = reclaim_copyout_min_reclaimable_bytes(metadata, cur_reclaimable_bytes);
1517 	if (err != ERR_SUCCESS) {
1518 		goto done;
1519 	}
1520 
1521 	/*
1522 	 * The user accounting will overcount if the kernel has reclaimed
1523 	 * without telling the client about it.
1524 	 */
1525 	if (cur_reclaimable_bytes >= metadata->vdrm_kernel_bytes_reclaimed) {
1526 		cur_reclaimable_bytes -= metadata->vdrm_kernel_bytes_reclaimed;
1527 	} else {
1528 		vmdr_log_error("[%d] more bytes have been reclaimed (%zu) than "
1529 		    "are supposedly in buffer (%zu)\n", metadata->vdrm_pid,
1530 		    metadata->vdrm_kernel_bytes_reclaimed, cur_reclaimable_bytes);
1531 		/* This will cause an underflow in user accounting */
1532 		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_ACCOUNTING_FAILURE, cur_reclaimable_bytes);
1533 		err = KERN_ABORTED;
1534 		goto done;
1535 	}
1536 	if (min_reclaimable_bytes >= metadata->vdrm_kernel_bytes_reclaimed) {
1537 		min_reclaimable_bytes -= metadata->vdrm_kernel_bytes_reclaimed;
1538 	} else {
1539 		min_reclaimable_bytes = 0;
1540 	}
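
	/*
	 * Example of the adjustment above (illustrative numbers): if user
	 * space reports 8 MiB currently reclaimable but the kernel has
	 * already reclaimed 3 MiB it has not yet reported back, only 5 MiB
	 * is genuinely outstanding. The kernel total exceeding the user
	 * figure means the accounting has desynchronized and the client is
	 * killed (see the guard exception above).
	 */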
1541 
1542 	uint64_t samples_elapsed = (now - metadata->vdrm_last_sample_abs) /
1543 	    vm_reclaim_sampling_period_abs;
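	/*
	 * E.g. with a 1 s sampling period and the default abandonment
	 * threshold of 512 periods, a ring left unsampled for roughly 8.5
	 * minutes is treated as abandoned below.
	 */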
1544 	if (samples_elapsed > vm_reclaim_abandonment_threshold) {
1545 		/*
1546 		 * Many sampling periods have elapsed since the ring was
1547 		 * last sampled. Don't bother computing the WMA and assume
1548 		 * the buffer's current contents are unneeded.
1549 		 */
1550 		wma = VMDR_WMA_MIX(0, cur_reclaimable_bytes);
1551 	} else {
1552 		/*
1553 		 * Compute an exponential moving average of the minimum amount of
1554 		 * reclaimable memory in this buffer. Multiple sampling periods may
1555 		 * have elapsed since the last sample; by definition, the minimum is
1556 		 * the same for all of them (otherwise libmalloc would have called
1557 		 * down to update accounting), so apply one decay step per period
1558 		 * by feeding the average back into the mix.
1559 		 */
1560 		wma = metadata->vdrm_reclaimable_bytes_wma;
1561 		for (unsigned int i = 0; i < samples_elapsed; i++) {
1562 			wma = VMDR_WMA_MIX(wma, min_reclaimable_bytes);
1563 		}
1564 	}
1565 
1566 	metadata->vdrm_reclaimable_bytes_wma = wma;
1567 	size_t unneeded_bytes = MIN(min_reclaimable_bytes,
1568 	    metadata->vdrm_reclaimable_bytes_wma / VMDR_WMA_UNIT);
1569 
1570 	size_t autotrim_threshold;
1571 	err = vmdr_calculate_autotrim_threshold(metadata, &autotrim_threshold);
1572 	if (err != ERR_SUCCESS) {
1573 		goto done;
1574 	}
1575 
1576 	if (unneeded_bytes >= vm_map_page_size(metadata->vdrm_map) &&
1577 	    unneeded_bytes >= autotrim_threshold) {
1578 		*trim_threshold_out = vm_map_round_page(unneeded_bytes,
1579 		    vm_map_page_mask(metadata->vdrm_map));
1580 	}
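
	/*
	 * Illustrative numbers for the check above: with 16 KiB pages and a
	 * 40 MB autotrim threshold, an unneeded estimate of 8 KiB (sub-page)
	 * or 1 MB (below threshold) requests no trim, while 50 MB would be
	 * rounded to a page boundary and written to *trim_threshold_out.
	 */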
1581 
1582 	metadata->vdrm_last_sample_abs = mach_absolute_time();
1583 	metadata->vdrm_reclaimable_bytes_last = cur_reclaimable_bytes;
1584 
1585 done:
1586 	vm_map_switch_back(map_ctx);
1587 	if (options & RECLAIM_NO_FAULT) {
1588 		vm_fault_enable();
1589 	}
1590 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_SAMPLE) | DBG_FUNC_END,
1591 	    wma,
1592 	    min_reclaimable_bytes,
1593 	    cur_reclaimable_bytes,
1594 	    *trim_threshold_out);
1595 	DTRACE_VM5(reclaim_sample,
1596 	    pid_t, metadata->vdrm_pid,
1597 	    uint64_t, wma,
1598 	    size_t, min_reclaimable_bytes,
1599 	    size_t, cur_reclaimable_bytes,
1600 	    size_t, *trim_threshold_out);
1601 	vmdr_log_debug("sampled buffer with min %lu est %lu trim %llu wma %llu\n",
1602 	    min_reclaimable_bytes,
1603 	    cur_reclaimable_bytes,
1604 	    *trim_threshold_out,
1605 	    wma);
1606 	return err;
1607 }
1608 
1609 /*
1610  * Caller must have buffer owned and unlocked
1611  */
1612 static kern_return_t
1613 vmdr_trim(vm_deferred_reclamation_metadata_t metadata, mach_vm_size_t bytes_to_reclaim,
1614     mach_vm_size_t *bytes_reclaimed, vm_deferred_reclamation_options_t options)
1615 {
1616 	kern_return_t kr;
1617 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_TRIM) | DBG_FUNC_START,
1618 	    metadata->vdrm_pid, bytes_to_reclaim);
1619 
1620 	kr = vmdr_reclaim_from_buffer(metadata, bytes_to_reclaim,
1621 	    bytes_reclaimed, options);
1622 
1623 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_TRIM) | DBG_FUNC_END, kr, *bytes_reclaimed);
1624 	DTRACE_VM3(reclaim_trim,
1625 	    pid_t, metadata->vdrm_pid,
1626 	    size_t, bytes_to_reclaim,
1627 	    size_t, *bytes_reclaimed);
1628 	return kr;
1629 }
1630 
1631 /*
1632  * Caller must have buffer owned and unlocked
1633  */
1634 static kern_return_t
1635 vmdr_drain(vm_deferred_reclamation_metadata_t metadata, mach_vm_size_t *bytes_reclaimed,
1636     vm_deferred_reclamation_options_t options)
1637 {
1638 	kern_return_t kr;
1639 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_DRAIN) | DBG_FUNC_START,
1640 	    metadata->vdrm_pid);
1641 
1642 	kr = vmdr_reclaim_from_buffer(metadata, UINT64_MAX,
1643 	    bytes_reclaimed, options);
1644 
1645 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_DRAIN) | DBG_FUNC_END, kr, *bytes_reclaimed);
1646 	DTRACE_VM2(reclaim_drain,
1647 	    pid_t, metadata->vdrm_pid,
1648 	    size_t, *bytes_reclaimed);
1649 	return kr;
1650 }
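
/*
 * Both helpers assume the ring ownership protocol; a minimal sketch of a
 * hypothetical caller (mirroring vm_deferred_reclamation_task_drain
 * below):
 *
 *	vmdr_metadata_own(metadata);            // take exclusive ring access
 *	kr = vmdr_drain(metadata, &bytes_reclaimed, options);
 *	vmdr_metadata_disown(metadata);         // let other reclaimers in
 */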
1651 
1652 mach_error_t
1653 vm_deferred_reclamation_update_accounting_internal(task_t task, uint64_t *bytes_reclaimed_out)
1654 {
1655 	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
1656 	mach_vm_size_t bytes_to_reclaim = 0, bytes_reclaimed = 0;
1657 	mach_error_t err = ERR_SUCCESS;
1658 
1659 	if (metadata == NULL) {
1660 		return KERN_NOT_FOUND;
1661 	}
1662 
1663 	if (!metadata->vdrm_pid) {
1664 		/* If this is a forked child, we may not yet have a pid */
1665 		metadata->vdrm_pid = task_pid(task);
1666 	}
1667 
1668 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_START,
1669 	    metadata->vdrm_pid);
1670 
1671 	vmdr_metadata_lock(metadata);
1672 	uint64_t now = mach_absolute_time();
1673 	if (now - metadata->vdrm_last_sample_abs < vm_reclaim_sampling_period_abs) {
1674 		/*
1675 		 * This is a fast path to avoid waiting on the gate if another
1676 		 * thread beat us to sampling.
1677 		 */
1678 		vmdr_metadata_unlock(metadata);
1679 		goto done;
1680 	}
1681 	vmdr_metadata_own_locked(metadata, RECLAIM_OPTIONS_NONE);
1682 	vmdr_metadata_unlock(metadata);
1683 
1684 	err = vmdr_sample_working_set(metadata, &bytes_to_reclaim, RECLAIM_OPTIONS_NONE);
1685 	if (err != ERR_SUCCESS) {
1686 		vmdr_metadata_disown(metadata);
1687 		goto done;
1688 	}
1689 	if (bytes_to_reclaim) {
1690 		vmdr_log_debug("[%d] trimming %llu B\n", metadata->vdrm_pid, bytes_to_reclaim);
1691 
1692 		err = vmdr_trim(metadata, bytes_to_reclaim, &bytes_reclaimed, RECLAIM_OPTIONS_NONE);
1693 
1694 		if (err == KERN_ABORTED) {
1695 			/*
1696 			 * We were unable to complete the trim due to a lost
1697 			 * race with userspace. This need not be fatal because the
1698 			 * accounting was successfully updated.
1699 			 */
1700 			err = KERN_SUCCESS;
1701 		}
1702 	}
1703 
1704 	/*
1705 	 * Tell the client how many bytes the kernel has reclaimed
1706 	 * since the last time it updated its accounting
1707 	 */
1708 	bytes_reclaimed += metadata->vdrm_kernel_bytes_reclaimed;
1709 	metadata->vdrm_kernel_bytes_reclaimed = 0;
1710 
1711 	vmdr_metadata_disown(metadata);
1712 
1713 done:
1714 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_END,
1715 	    metadata->vdrm_last_sample_abs,
1716 	    bytes_to_reclaim,
1717 	    bytes_reclaimed);
1718 	*bytes_reclaimed_out = (uint64_t)bytes_reclaimed;
1719 	return err;
1720 }
1721 
1722 kern_return_t
1723 vm_deferred_reclamation_task_drain(task_t task,
1724     vm_deferred_reclamation_options_t options)
1725 {
1726 	kern_return_t kr;
1727 	mach_vm_size_t bytes_reclaimed = 0;
1728 
1729 	task_lock(task);
1730 	if (!task_is_active(task) || task_is_halting(task)) {
1731 		task_unlock(task);
1732 		return KERN_ABORTED;
1733 	}
1734 	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
1735 	if (metadata == NULL) {
1736 		task_unlock(task);
1737 		return KERN_SUCCESS;
1738 	}
1739 	vmdr_metadata_retain(metadata);
1740 	task_unlock(task);
1741 
1742 	vmdr_metadata_own(metadata);
1743 
1744 	kr = vmdr_drain(metadata, &bytes_reclaimed, options);
1745 	metadata->vdrm_kernel_bytes_reclaimed += bytes_reclaimed;
1746 
1747 	vmdr_metadata_disown(metadata);
1748 	vmdr_metadata_release(metadata);
1749 	return kr;
1750 }
1751 
1752 void
1753 vm_deferred_reclamation_task_suspend(task_t task)
1754 {
1755 	if (task->deferred_reclamation_metadata) {
1756 		sched_cond_signal(&vm_reclaim_scavenger_cond, vm_reclaim_scavenger_thread);
1757 	}
1758 }
1759 
1760 #pragma mark KPIs
1761 
1762 vm_deferred_reclamation_metadata_t
1763 vm_deferred_reclamation_task_fork(task_t task, vm_deferred_reclamation_metadata_t parent)
1764 {
1765 	vm_deferred_reclamation_metadata_t metadata = NULL;
1766 	vmdr_metadata_assert_owned(parent);
1767 	vmdr_log_debug("forking [%d]\n", parent->vdrm_pid);
1768 
1769 	assert(task->deferred_reclamation_metadata == NULL);
1770 	metadata = vmdr_metadata_alloc(task, parent->vdrm_ring_addr,
1771 	    parent->vdrm_ring_size, parent->vdrm_buffer_len);
1772 
1773 	metadata->vdrm_last_sample_abs = parent->vdrm_last_sample_abs;
1774 	metadata->vdrm_kernel_bytes_reclaimed = parent->vdrm_kernel_bytes_reclaimed;
1775 	metadata->vdrm_reclaimable_bytes_wma = parent->vdrm_reclaimable_bytes_wma;
1776 
1777 	return metadata;
1778 }
1779 
1780 void
1781 vm_deferred_reclamation_task_fork_register(vm_deferred_reclamation_metadata_t metadata)
1782 {
1783 	assert(metadata != NULL);
1784 	assert(!metadata->vdrm_is_registered);
1785 
1786 	lck_mtx_lock(&reclaim_buffers_lock);
1787 	metadata->vdrm_is_registered = true;
1788 	vmdr_list_append_locked(metadata);
1789 	lck_mtx_unlock(&reclaim_buffers_lock);
1790 }
1791 
1792 bool
1793 vm_deferred_reclamation_task_has_ring(task_t task)
1794 {
1795 	return task->deferred_reclamation_metadata != NULL;
1796 }
1797 
1798 void
1799 vm_deferred_reclamation_ring_own(vm_deferred_reclamation_metadata_t metadata)
1800 {
1801 	vmdr_metadata_own(metadata);
1802 }
1803 
1804 void
1805 vm_deferred_reclamation_ring_disown(vm_deferred_reclamation_metadata_t metadata)
1806 {
1807 	vmdr_metadata_disown(metadata);
1808 }
1809 
1810 void
1811 vm_deferred_reclamation_gc(vm_deferred_reclamation_gc_action_t action,
1812     mach_vm_size_t *total_bytes_reclaimed,
1813     vm_deferred_reclamation_options_t options)
1814 {
1815 	vmdr_garbage_collect(action, total_bytes_reclaimed, options);
1816 }
1817 
1818 void
1819 vm_deferred_reclamation_settle_ledger(task_t task)
1820 {
1821 	vm_deferred_reclamation_metadata_t meta = vmdr_acquire_task_metadata(task);
1822 	if (meta == NULL) {
1823 		return;
1824 	}
1825 	vmdr_metadata_lock(meta);
1826 	ledger_zero_balance(task->ledger, task_ledgers.est_reclaimable);
1827 	ledger_credit(
1828 		task->ledger,
1829 		task_ledgers.est_reclaimable,
1830 		meta->vdrm_reclaimable_bytes_last);
1831 	vmdr_metadata_unlock(meta);
1832 	vmdr_metadata_release(meta);
1833 }
1834 
1835 #pragma mark Global Reclamation GC
1836 
1837 static void
1838 vmdr_garbage_collect(vm_deferred_reclamation_gc_action_t action,
1839     mach_vm_size_t *total_bytes_reclaimed_out,
1840     vm_deferred_reclamation_options_t options)
1841 {
1842 	kern_return_t kr;
1843 	mach_vm_size_t total_bytes_reclaimed = *total_bytes_reclaimed_out = 0; /* pre-zero the out-param for the RECLAIM_NO_WAIT early return */
1844 	gate_wait_result_t wr;
1845 
1846 	lck_mtx_lock(&reclaim_buffers_lock);
1847 	kr = lck_mtx_gate_try_close(&reclaim_buffers_lock, &vm_reclaim_gc_gate);
1848 	if (kr != KERN_SUCCESS) {
1849 		if (options & RECLAIM_NO_WAIT) {
1850 			lck_mtx_unlock(&reclaim_buffers_lock);
1851 			return;
1852 		}
1853 		wr = lck_mtx_gate_wait(&reclaim_buffers_lock, &vm_reclaim_gc_gate, LCK_SLEEP_DEFAULT, THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
1854 		assert3u(wr, ==, GATE_HANDOFF);
1855 	}
1856 
1857 	vm_reclaim_gc_epoch++;
1858 	vmdr_log_debug("running global GC\n");
1859 	while (true) {
1860 		vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&reclaim_buffers);
1861 		if (metadata == NULL) {
1862 			break;
1863 		}
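		/* Rotate the buffer to the tail so this epoch's scan terminates once it wraps around */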
1864 		vmdr_list_remove_locked(metadata);
1865 		vmdr_list_append_locked(metadata);
1866 		vmdr_metadata_retain(metadata);
1867 		lck_mtx_unlock(&reclaim_buffers_lock);
1868 
1869 		vmdr_metadata_lock(metadata);
1870 
1871 		if (metadata->vdrm_reclaimed_at >= vm_reclaim_gc_epoch) {
1872 			/* We've already seen this one. We're done */
1873 			vmdr_metadata_unlock(metadata);
1874 			vmdr_metadata_release(metadata);
1875 			lck_mtx_lock(&reclaim_buffers_lock);
1876 			break;
1877 		}
1878 		metadata->vdrm_reclaimed_at = vm_reclaim_gc_epoch;
1879 
1880 		task_t task = metadata->vdrm_task;
1881 		if (task == TASK_NULL ||
1882 		    !task_is_active(task) ||
1883 		    task_is_halting(task)) {
1884 			goto next;
1885 		}
1886 		bool buffer_is_suspended = task_is_app_suspended(task);
1887 		task = TASK_NULL;
1888 
1889 		mach_vm_size_t bytes_reclaimed = 0;
1890 		mach_vm_size_t bytes_to_reclaim = 0;
1891 
1892 		switch (action) {
1893 		case RECLAIM_GC_DRAIN:
1894 			if (!vmdr_metadata_own_locked(metadata, options)) {
1895 				goto next;
1896 			}
1897 			vmdr_metadata_unlock(metadata);
1898 
1899 			vmdr_log_debug("draining [%d]\n", metadata->vdrm_pid);
1900 			kr = vmdr_drain(metadata, &bytes_reclaimed, options);
1901 			metadata->vdrm_kernel_bytes_reclaimed += bytes_reclaimed;
1902 
1903 			vmdr_metadata_lock(metadata);
1904 			vmdr_metadata_disown_locked(metadata);
1905 			break;
1906 		case RECLAIM_GC_SCAVENGE:
1907 			if (buffer_is_suspended) {
1908 				if (!vmdr_metadata_own_locked(metadata, options)) {
1909 					goto next;
1910 				}
1911 				vmdr_metadata_unlock(metadata);
1912 
1913 				/* This buffer is no longer in use, fully reclaim it. */
1914 				vmdr_log_debug("found suspended buffer [%d], draining\n", metadata->vdrm_pid);
1915 				kr = vmdr_drain(metadata, &bytes_reclaimed, options);
1916 				metadata->vdrm_kernel_bytes_reclaimed += bytes_reclaimed;
1917 
1918 				vmdr_metadata_lock(metadata);
1919 				vmdr_metadata_disown_locked(metadata);
1920 			}
1921 			break;
1922 		case RECLAIM_GC_TRIM:
1923 			if (!vmdr_metadata_own_locked(metadata, options)) {
1924 				goto next;
1925 			}
1926 			vmdr_metadata_unlock(metadata);
1927 			kr = vmdr_sample_working_set(metadata, &bytes_to_reclaim, options);
1928 			if (kr == KERN_SUCCESS && bytes_to_reclaim) {
1929 				vmdr_log_debug("GC found stale buffer [%d], trimming\n", metadata->vdrm_pid);
1930 				kr = vmdr_trim(metadata, bytes_to_reclaim, &bytes_reclaimed, options);
1931 				metadata->vdrm_kernel_bytes_reclaimed += bytes_reclaimed;
1932 			}
1933 			vmdr_metadata_lock(metadata);
1934 			vmdr_metadata_disown_locked(metadata);
1935 			break;
1936 		}
1937 		if (bytes_reclaimed) {
1938 			vm_reclaim_gc_reclaim_count++;
1939 			total_bytes_reclaimed += bytes_reclaimed;
1940 		}
1941 		if (metadata->vdrm_waiters && action != RECLAIM_GC_TRIM) {
1942 			thread_wakeup((event_t)&metadata->vdrm_waiters);
1943 		}
1944 next:
1945 		vmdr_metadata_unlock(metadata);
1946 		vmdr_metadata_release(metadata);
1947 		lck_mtx_lock(&reclaim_buffers_lock);
1948 	}
1949 	lck_mtx_gate_handoff(&reclaim_buffers_lock, &vm_reclaim_gc_gate, GATE_HANDOFF_OPEN_IF_NO_WAITERS);
1950 	lck_mtx_unlock(&reclaim_buffers_lock);
1951 	*total_bytes_reclaimed_out = total_bytes_reclaimed;
1952 }
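
/*
 * The gate above serializes GC passes: the first thread closes the gate
 * and runs the scan; latecomers either bail out (RECLAIM_NO_WAIT) or
 * block until the runner hands the gate off. A minimal sketch of the
 * pattern with the same lck_mtx gate primitives (hypothetical caller):
 *
 *	lck_mtx_lock(&lock);
 *	if (lck_mtx_gate_try_close(&lock, &gate) != KERN_SUCCESS) {
 *		(void)lck_mtx_gate_wait(&lock, &gate, LCK_SLEEP_DEFAULT,
 *		    THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
 *	}
 *	// ... serialized work, dropping the mutex as needed ...
 *	lck_mtx_gate_handoff(&lock, &gate, GATE_HANDOFF_OPEN_IF_NO_WAITERS);
 *	lck_mtx_unlock(&lock);
 */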
1953 
1954 OS_NORETURN
1955 static void
1956 vm_reclaim_scavenger_thread_continue(__unused void *param, __unused wait_result_t wr)
1957 {
1958 	sched_cond_ack(&vm_reclaim_scavenger_cond);
1959 
1960 	while (true) {
1961 		mach_vm_size_t total_bytes_reclaimed;
1962 		vmdr_garbage_collect(RECLAIM_GC_SCAVENGE, &total_bytes_reclaimed,
1963 		    RECLAIM_OPTIONS_NONE);
1964 		vmdr_log_info("scavenger reclaimed %llu KiB of virtual memory\n",
1965 		    total_bytes_reclaimed >> 10);
1966 		sched_cond_wait(&vm_reclaim_scavenger_cond, THREAD_UNINT,
1967 		    vm_reclaim_scavenger_thread_continue);
1968 	}
1969 }
1970 
1971 OS_NORETURN
1972 static void
1973 vm_reclaim_scavenger_thread_init(__unused void *param, __unused wait_result_t wr)
1974 {
1975 	thread_set_thread_name(current_thread(), "VM_reclaim_scavenger");
1976 #if CONFIG_THREAD_GROUPS
1977 	thread_group_vm_add();
1978 #endif /* CONFIG_THREAD_GROUPS */
1979 	sched_cond_wait(&vm_reclaim_scavenger_cond, THREAD_UNINT, vm_reclaim_scavenger_thread_continue);
1980 	__builtin_unreachable();
1981 }
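
/*
 * The scavenger parks on its scheduler condition via a continuation, so
 * it holds no kernel stack while idle. Wakers only need the condition
 * and thread handles, as vm_deferred_reclamation_task_suspend does above:
 *
 *	sched_cond_signal(&vm_reclaim_scavenger_cond,
 *	    vm_reclaim_scavenger_thread);
 */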
1982 
1983 __startup_func
1984 static void
1985 vm_deferred_reclamation_init(void)
1986 {
1987 	vm_reclaim_log_handle = os_log_create("com.apple.xnu", "vm_reclaim");
1988 	nanoseconds_to_absolutetime((uint64_t)vm_reclaim_sampling_period_ns,
1989 	    &vm_reclaim_sampling_period_abs);
1990 
1991 	sched_cond_init(&vm_reclaim_scavenger_cond);
1992 	lck_mtx_gate_init(&reclaim_buffers_lock, &vm_reclaim_gc_gate);
1993 	kern_return_t kr = kernel_thread_start_priority(vm_reclaim_scavenger_thread_init,
1994 	    NULL, BASEPRI_KERNEL, &vm_reclaim_scavenger_thread);
1995 	if (kr != KERN_SUCCESS) {
1996 		panic("Unable to create VM reclaim thread, %d", kr);
1997 	}
1998 }
1999 
2000 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_deferred_reclamation_init);
2001 
2002 #pragma mark Debug Interfaces
2003 
2004 #if DEVELOPMENT || DEBUG
2005 
2006 bool
2007 vm_deferred_reclamation_block_until_task_has_been_reclaimed(task_t task)
2008 {
2009 	bool reclaimed;
2010 	vm_deferred_reclamation_metadata_t metadata;
2011 
2012 	metadata = vmdr_acquire_task_metadata(task);
2013 	if (metadata == NULL) {
2014 		return false;
2015 	}
2016 	vmdr_metadata_lock(metadata);
2017 
2018 	metadata->vdrm_waiters++;
2019 	/* Wake up the scavenger thread */
2020 	sched_cond_signal(&vm_reclaim_scavenger_cond, vm_reclaim_scavenger_thread);
2021 	wait_result_t wr = lck_mtx_sleep(&metadata->vdrm_lock,
2022 	    LCK_SLEEP_DEFAULT, (event_t)&metadata->vdrm_waiters,
2023 	    THREAD_ABORTSAFE);
2024 	metadata->vdrm_waiters--;
2025 	reclaimed = (wr == THREAD_AWAKENED);
2026 
2027 	vmdr_metadata_unlock(metadata);
2028 	vmdr_metadata_release(metadata);
2029 	return reclaimed;
2030 }
2031 
2032 #endif /* DEVELOPMENT || DEBUG */
2033 
2034 #pragma mark Introspectibility
2035 
2036 kern_return_t
2037 vm_deferred_reclamation_buffer_query_internal(
2038 	task_t task,
2039 	mach_vm_address_ut *addr_out_u,
2040 	mach_vm_size_ut *size_out_u)
2041 {
2042 	vm_deferred_reclamation_metadata_t meta;
2043 
2044 	if (task == NULL) {
2045 		return KERN_INVALID_TASK;
2046 	}
2047 
2048 	if ((addr_out_u == NULL) || (size_out_u == NULL)) {
2049 		return KERN_INVALID_ARGUMENT;
2050 	}
2051 
2052 	meta = vmdr_acquire_task_metadata(task);
2053 
2054 	if (meta == NULL) {
2055 		*addr_out_u = vm_sanitize_wrap_addr(0);
2056 		*size_out_u = vm_sanitize_wrap_size(0);
2057 	} else {
2058 		vmdr_metadata_lock(meta);
2059 		*addr_out_u = vm_sanitize_wrap_addr(meta->vdrm_ring_addr);
2060 		*size_out_u = vm_sanitize_wrap_size(meta->vdrm_ring_size);
2061 		vmdr_metadata_unlock(meta);
2062 		vmdr_metadata_release(meta);
2063 	}
2064 
2065 	return KERN_SUCCESS;
2066 }
2067