xref: /xnu-11417.101.15/osfmk/vm/vm_reclaim.c (revision e3723e1f17661b24996789d8afc084c0c3303b26)
1 /*
2  * Copyright (c) 2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <kern/exc_guard.h>
30 #include <kern/locks.h>
31 #include <kern/task.h>
32 #include <kern/zalloc.h>
33 #include <kern/misc_protos.h>
34 #include <kern/sched_prim.h>
35 #include <kern/startup.h>
36 #include <kern/thread_group.h>
37 #include <libkern/OSAtomic.h>
38 #include <mach/kern_return.h>
39 #include <mach/mach_types.h>
40 #include <mach/vm_reclaim_private.h>
41 #include <os/atomic_private.h>
42 #include <os/base_private.h>
43 #include <os/log.h>
44 #include <os/refcnt.h>
45 #include <os/refcnt_internal.h>
46 #include <pexpert/pexpert.h>
47 #include <sys/errno.h>
48 #include <sys/kdebug.h>
49 #include <sys/queue.h>
50 #include <sys/reason.h>
51 #include <vm/vm_fault_xnu.h>
52 #include <vm/vm_map.h>
53 #include <vm/vm_map_internal.h>
54 #include <vm/vm_pageout_internal.h>
55 #include <vm/vm_reclaim_internal.h>
56 #include <vm/vm_sanitize_internal.h>
57 #include <vm/vm_kern_xnu.h>
58 
59 #pragma mark Tunables
60 
61 #if XNU_TARGET_OS_IOS && !XNU_TARGET_OS_XR
62 /* Temporarily opt iOS into the legacy behavior as a stop-gap */
63 #define CONFIG_WORKING_SET_ESTIMATION 0
64 /*
65  * Deferred reclaim may be enabled via EDT for select iOS devices, but
66  * defaults to disabled
67  */
68 #define VM_RECLAIM_ENABLED_DEFAULT false
69 #else
70 #define CONFIG_WORKING_SET_ESTIMATION 1
71 #define VM_RECLAIM_ENABLED_DEFAULT true
72 #endif
73 
74 #if DEVELOPMENT || DEBUG
75 TUNABLE(uint32_t, kReclaimChunkSize, "vm_reclaim_chunk_size", 16);
76 #else /* RELEASE */
77 const uint32_t kReclaimChunkSize = 16;
78 #endif /* DEVELOPMENT || DEBUG */
79 #if CONFIG_WORKING_SET_ESTIMATION
80 TUNABLE_DT_DEV_WRITEABLE(bool, vm_reclaim_enabled, "/defaults",
81     "kern.vm_reclaim_enabled", "vm_reclaim_enabled", VM_RECLAIM_ENABLED_DEFAULT, TUNABLE_DT_NONE);
82 /* TODO: Consider varying the sampling rate based on rusage, ringbuffer-velocity, memory pressure */
83 TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_sampling_period_ns, "vm_reclaim_sampling_period_ns", 1ULL * NSEC_PER_SEC);
84 TUNABLE_DEV_WRITEABLE(uint32_t, vm_reclaim_autotrim_pct_normal, "vm_reclaim_autotrim_pct_normal", 10);
85 TUNABLE_DEV_WRITEABLE(uint32_t, vm_reclaim_autotrim_pct_pressure, "vm_reclaim_autotrim_pct_pressure", 5);
86 TUNABLE_DEV_WRITEABLE(uint32_t, vm_reclaim_autotrim_pct_critical, "vm_reclaim_autotrim_pct_critical", 1);
87 TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_wma_weight_base, "vm_reclaim_wma_weight_base", 3);
88 TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_wma_weight_cur, "vm_reclaim_wma_weight_cur", 1);
89 TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_wma_denom, "vm_reclaim_wma_denom", 4);
90 TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_abandonment_threshold, "vm_reclaim_abandonment_threshold", 512);
91 #else /* CONFIG_WORKING_SET_ESTIMATION */
92 TUNABLE_DT_DEV_WRITEABLE(uint64_t, vm_reclaim_max_threshold, "/defaults",
93     "kern.vm_reclaim_max_threshold", "vm_reclaim_max_threshold", 0, TUNABLE_DT_NONE);
94 #endif /* CONFIG_WORKING_SET_ESTIMATION */
95 TUNABLE(bool, panic_on_kill, "vm_reclaim_panic_on_kill", false);
96 #if DEVELOPMENT || DEBUG
97 TUNABLE_WRITEABLE(bool, vm_reclaim_debug, "vm_reclaim_debug", false);
98 #endif
99 
100 #pragma mark Declarations
101 typedef struct proc *proc_t;
102 extern const char *proc_best_name(struct proc *);
103 extern void *proc_find(int pid);
104 extern task_t proc_task(proc_t);
105 extern kern_return_t kern_return_for_errno(int);
106 extern int mach_to_bsd_errno(kern_return_t kr);
107 extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code, mach_exception_data_type_t subcode);
108 struct proc *proc_ref(struct proc *p, int locked);
109 int proc_rele(proc_t p);
110 
111 #define _vmdr_log_type(type, fmt, ...) os_log_with_type(vm_reclaim_log_handle, type, "vm_reclaim: " fmt, ##__VA_ARGS__)
112 #define vmdr_log(fmt, ...) _vmdr_log_type(OS_LOG_TYPE_DEFAULT, fmt, ##__VA_ARGS__)
113 #define vmdr_log_info(fmt, ...) _vmdr_log_type(OS_LOG_TYPE_INFO, fmt, ##__VA_ARGS__)
114 #define vmdr_log_error(fmt, ...) _vmdr_log_type(OS_LOG_TYPE_ERROR, fmt, ##__VA_ARGS__)
115 #if DEVELOPMENT || DEBUG
116 #define vmdr_log_debug(fmt, ...) \
117 MACRO_BEGIN \
118 if (os_unlikely(vm_reclaim_debug)) { \
119 	_vmdr_log_type(OS_LOG_TYPE_DEBUG, fmt, ##__VA_ARGS__); \
120 } \
121 MACRO_END
122 #else /* !(DEVELOPMENT || DEBUG)*/
123 #define vmdr_log_debug(...)
124 #endif /* DEVELOPMENT || DEBUG */
125 
126 static kern_return_t reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head);
127 static kern_return_t reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail);
128 static kern_return_t reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy);
129 static kern_return_t reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result);
130 #if CONFIG_WORKING_SET_ESTIMATION
131 static bool vmdr_sample_working_set(vm_deferred_reclamation_metadata_t metadata, size_t *trim_threshold_out);
132 #endif
133 static void vmdr_metadata_release(vm_deferred_reclamation_metadata_t metadata);
134 static void vmdr_list_append_locked(vm_deferred_reclamation_metadata_t metadata);
135 static void vmdr_list_remove_locked(vm_deferred_reclamation_metadata_t metadata);
136 static void vmdr_metadata_own(vm_deferred_reclamation_metadata_t metadata);
137 static void vmdr_metadata_disown(vm_deferred_reclamation_metadata_t metadata);
138 static void vmdr_garbage_collect(vm_deferred_reclamation_gc_action_t action, vm_deferred_reclamation_options_t options);
139 static kern_return_t reclaim_chunk(vm_deferred_reclamation_metadata_t metadata,
140     uint64_t bytes_to_reclaim, uint64_t *bytes_reclaimed_out,
141     mach_vm_reclaim_count_t chunk_size, mach_vm_reclaim_count_t *num_reclaimed_out);
142 
143 struct vm_deferred_reclamation_metadata_s {
144 	/*
145 	 * Global list containing every reclamation buffer. Protected by the
146 	 * reclamation_buffers_lock.
147 	 */
148 	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_list;
149 	/* Protects all struct fields (except denoted otherwise) */
150 	decl_lck_mtx_data(, vdrm_lock);
151 	decl_lck_mtx_gate_data(, vdrm_gate);
152 	/*
153 	 * The task owns this structure but we maintain a backpointer here
154 	 * so that we can send an exception if we hit an error.
155 	 * Since this is a backpointer we don't hold a reference (it's a weak pointer).
156 	 */
157 	task_t vdrm_task;
158 	pid_t vdrm_pid;
159 	vm_map_t vdrm_map;
160 	/*
161 	 * The owning task holds a ref on this object. When the task dies, it
162 	 * will set vdrm_task := NULL and drop its ref. Threads operating on the buffer
163 	 * should hold a +1 on the metadata structure to ensure it's validity.
164 	 */
165 	os_refcnt_t vdrm_refcnt;
166 	/* The virtual address of the ringbuffer in the user map (immutable) */
167 	user_addr_t vdrm_buffer_addr;
168 	/* The size of the VM allocation containing the ringbuffer (immutable) */
169 	mach_vm_size_t vdrm_buffer_size;
170 	/* The length of the ringbuffer. This may be changed on buffer re-size */
171 	mach_vm_reclaim_count_t vdrm_buffer_len;
172 	/* Which GC epoch this buffer was last considered in */
173 	uint64_t vdrm_reclaimed_at;
174 	/*
175 	 * The number of threads waiting for a pending reclamation
176 	 * on this buffer to complete.
177 	 */
178 	uint32_t vdrm_waiters;
179 #if CONFIG_WORKING_SET_ESTIMATION
180 	/* timestamp (MAS) of the last working set sample for this ringbuffer */
181 	uint64_t vdrm_last_sample_abs;
182 	/*
183 	 * Exponential moving average of the minimum reclaimable buffer size (in VMDR_WMA_UNIT's)
184 	 */
185 	uint64_t vdrm_reclaimable_bytes_wma;
186 	/*
187 	 * The minimum amount of reclaimable memory in this buffer for the current
188 	 * sampling interval.
189 	 */
190 	size_t vdrm_reclaimable_bytes_min;
191 #endif /* CONFIG_WORKING_SET_ESTIMATION */
192 	/*
193 	 * These two values represent running sums of uncancelled bytes
194 	 * entered into the ring by userspace and bytes reclaimed out of the
195 	 * buffer by the kernel.
196 	 *
197 	 * The uncancelled byte-count may fluctuate as the client enters and
198 	 * cancels new reclamation requests. Reclamation requests which have
199 	 * been completed by the kernel will not deduct from the uncancelled
200 	 * count but will be added to the reclaimed byte count.
201 	 *
202 	 *  - `vdrm_cumulative_reclaimed_bytes` is monotonically increasing.
203 	 *  - `vdrm_cumulative_uncancelled_bytes` may fluctuate but
204 	 *    should trend upward.
205 	 *  - `vdrm_cumulative_uncancelled_bytes` must be kept >=
206 	 *    `vdrm_cumulative_reclaimed_bytes`
207 	 *
208 	 * Both values are in terms of virtual memory,
209 	 * so they give an upper bound on the amount of physical memory that
210 	 * can be reclaimed. To get an estimate of the current amount of VA in
211 	 * the buffer do vdrm_cumulative_uncancelled_bytes -
212 	 * vdrm_cumulative_reclaimed_bytes.
213 	 */
214 	size_t vdrm_cumulative_uncancelled_bytes;
215 	size_t vdrm_cumulative_reclaimed_bytes;
216 
217 	/*
218 	 * Tracks whether or not this reclamation metadata has been added
219 	 * to the global list yet. Normally, this happens when it is allocated,
220 	 * except in the case of fork(). In this case, we have to duplicate the
221 	 * parent's metadata before it returns from fork(), but this occurs
222 	 * before the child's address space is set up.
223 	 */
224 	uint8_t vdrm_is_registered : 1,
225 	    __unused1 : 7;
226 };
227 
228 #pragma mark Globals
229 static KALLOC_TYPE_DEFINE(vm_reclaim_metadata_zone, struct vm_deferred_reclamation_metadata_s, KT_DEFAULT);
230 static LCK_GRP_DECLARE(vm_reclaim_lock_grp, "vm_reclaim");
231 os_refgrp_decl(static, vm_reclaim_metadata_refgrp, "vm_reclaim_metadata_refgrp", NULL);
232 /*
233  * The reclamation_buffers list contains every buffer in the system.
234  * The reclamation_buffers_lock protects the reclamation_buffers list.
235  * It must be held when iterating over the list or manipulating the list.
236  * It should be dropped when acting on a specific metadata entry after acquiring the vdrm_lock.
237  */
238 static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) reclaim_buffers = TAILQ_HEAD_INITIALIZER(reclaim_buffers);
239 LCK_MTX_DECLARE(reclaim_buffers_lock, &vm_reclaim_lock_grp);
240 /* Number of times Reclaim GC has run */
241 uint64_t vm_reclaim_gc_epoch = 0;
242 /* The number of reclamation actions (drains/trims) done during GC */
243 uint64_t vm_reclaim_gc_reclaim_count;
244 /* Gate for GC */
245 static decl_lck_mtx_gate_data(, vm_reclaim_gc_gate);
246 os_log_t vm_reclaim_log_handle;
247 /* Number of initialized reclaim buffers */
248 _Atomic uint32_t vm_reclaim_buffer_count;
249 uint64_t vm_reclaim_sampling_period_abs = 0;
250 static SECURITY_READ_ONLY_LATE(thread_t) vm_reclaim_scavenger_thread = THREAD_NULL;
251 static sched_cond_atomic_t vm_reclaim_scavenger_cond = SCHED_COND_INIT;
252 
253 #pragma mark Buffer Initialization/Destruction
254 
255 static vm_deferred_reclamation_metadata_t
vmdr_metadata_alloc(task_t task,user_addr_t buffer,mach_vm_size_t size,mach_vm_reclaim_count_t len)256 vmdr_metadata_alloc(
257 	task_t                  task,
258 	user_addr_t             buffer,
259 	mach_vm_size_t          size,
260 	mach_vm_reclaim_count_t len)
261 {
262 	vm_deferred_reclamation_metadata_t metadata;
263 	vm_map_t map = task->map;
264 
265 	assert(!map->is_nested_map);
266 
267 	metadata = zalloc_flags(vm_reclaim_metadata_zone, Z_WAITOK | Z_ZERO);
268 	lck_mtx_init(&metadata->vdrm_lock, &vm_reclaim_lock_grp, LCK_ATTR_NULL);
269 	lck_mtx_gate_init(&metadata->vdrm_lock, &metadata->vdrm_gate);
270 	os_ref_init(&metadata->vdrm_refcnt, &vm_reclaim_metadata_refgrp);
271 
272 	metadata->vdrm_task = task;
273 	metadata->vdrm_map = map;
274 	metadata->vdrm_buffer_addr = buffer;
275 	metadata->vdrm_buffer_size = size;
276 	metadata->vdrm_buffer_len = len;
277 
278 	if (os_atomic_inc(&vm_reclaim_buffer_count, relaxed) == UINT32_MAX) {
279 		panic("Overflowed vm_reclaim_buffer_count");
280 	}
281 
282 	/*
283 	 * we do not need to hold a lock on `task` because this is called
284 	 * either at fork() time or from the context of current_task().
285 	 */
286 	vm_map_reference(map);
287 	return metadata;
288 }
289 
290 static void
vmdr_metadata_free(vm_deferred_reclamation_metadata_t metadata)291 vmdr_metadata_free(vm_deferred_reclamation_metadata_t metadata)
292 {
293 	vm_map_deallocate(metadata->vdrm_map);
294 	lck_mtx_gate_destroy(&metadata->vdrm_lock, &metadata->vdrm_gate);
295 	lck_mtx_destroy(&metadata->vdrm_lock, &vm_reclaim_lock_grp);
296 	zfree(vm_reclaim_metadata_zone, metadata);
297 	if (os_atomic_dec_orig(&vm_reclaim_buffer_count, relaxed) == 0) {
298 		panic("Underflowed vm_reclaim_buffer_count");
299 	}
300 }
301 
302 static mach_vm_size_t
vmdr_round_len_to_size(vm_map_t map,mach_vm_reclaim_count_t count)303 vmdr_round_len_to_size(vm_map_t map, mach_vm_reclaim_count_t count)
304 {
305 	mach_vm_size_t metadata_size = offsetof(struct mach_vm_reclaim_ring_s, entries);
306 	mach_vm_size_t entries_size = count * sizeof(struct mach_vm_reclaim_entry_s);
307 	return vm_map_round_page(metadata_size + entries_size, vm_map_page_mask(map));
308 }
309 
310 mach_error_t
vm_deferred_reclamation_buffer_allocate_internal(task_t task,mach_vm_address_ut * address_u,mach_vm_reclaim_count_t len,mach_vm_reclaim_count_t max_len)311 vm_deferred_reclamation_buffer_allocate_internal(
312 	task_t                   task,
313 	mach_vm_address_ut       *address_u,
314 	mach_vm_reclaim_count_t  len,
315 	mach_vm_reclaim_count_t  max_len)
316 {
317 	kern_return_t kr;
318 	kern_return_t tmp_kr;
319 	vm_deferred_reclamation_metadata_t metadata = NULL;
320 	vm_map_t map;
321 	uint64_t head = 0, tail = 0, busy = 0;
322 	static bool reclaim_disabled_logged = false;
323 
324 	if (task == TASK_NULL) {
325 		return KERN_INVALID_TASK;
326 	}
327 	if (address_u == NULL) {
328 		return KERN_INVALID_ADDRESS;
329 	}
330 	if (len == 0 || max_len == 0 || max_len < len) {
331 		return KERN_INVALID_ARGUMENT;
332 	}
333 	map = task->map;
334 #if CONFIG_WORKING_SET_ESTIMATION
335 	if (!vm_reclaim_enabled) {
336 #else /* !CONFIG_WORKING_SET_ESTIMATION */
337 	if (!vm_reclaim_max_threshold) {
338 #endif /* CONFIG_WORKING_SET_ESTIMATION */
339 		if (!reclaim_disabled_logged) {
340 			/* Avoid logging failure for every new process */
341 			reclaim_disabled_logged = true;
342 			vmdr_log_error("failed to initialize deferred "
343 			    "reclamation buffer - vm_reclaim is disabled\n");
344 		}
345 		return VM_RECLAIM_NOT_SUPPORTED;
346 	}
347 
348 	map = task->map;
349 	mach_vm_size_t rounded_vm_size = vmdr_round_len_to_size(map, max_len);
350 	if (rounded_vm_size == 0) {
351 		return KERN_INVALID_ARGUMENT;
352 	}
353 
354 	if (rounded_vm_size > VM_RECLAIM_MAX_BUFFER_SIZE) {
355 		vmdr_log_error("denying request to allocate ringbuffer of size "
356 		    "%llu KiB (max %llu KiB)\n",
357 		    rounded_vm_size,
358 		    VM_RECLAIM_MAX_BUFFER_SIZE);
359 		return KERN_NO_SPACE;
360 	}
361 
362 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_START,
363 	    task_pid(task), len);
364 
365 	/*
366 	 * Allocate a VM region that can contain the maximum buffer size. The
367 	 * allocation starts as VM_PROT_NONE and may be unprotected on buffer
368 	 * resize.
369 	 *
370 	 * TODO: If clients other than libmalloc adopt deferred reclaim, a
371 	 * different tag should be given
372 	 *
373 	 * `address` was sanitized under the assumption that we'll only use
374 	 * it as a hint (overflow checks were used) so we must pass the
375 	 * anywhere flag.
376 	 */
377 	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE_PERMANENT(
378 		.vm_tag = VM_MEMORY_MALLOC);
379 	mach_vm_size_ut size_u = vm_sanitize_wrap_size(rounded_vm_size);
380 	kr = mach_vm_map_kernel(map, address_u, size_u, VM_MAP_PAGE_MASK(map),
381 	    vmk_flags, IPC_PORT_NULL, 0, FALSE,
382 	    VM_PROT_DEFAULT, VM_PROT_DEFAULT, VM_INHERIT_COPY);
383 	if (kr != KERN_SUCCESS) {
384 		vmdr_log_error("%s [%d] failed to allocate VA for reclaim "
385 		    "buffer (%d)\n", task_best_name(task), task_pid(task), kr);
386 		return kr;
387 	}
388 	mach_vm_address_t address = VM_SANITIZE_UNSAFE_UNWRAP(*address_u);
389 	assert3u(address, !=, 0);
390 
391 	metadata = vmdr_metadata_alloc(task, address, rounded_vm_size, len);
392 	metadata->vdrm_pid = task_pid(task);
393 
394 	/*
395 	 * Validate the starting indices.
396 	 */
397 	kr = reclaim_copyin_busy(metadata, &busy);
398 	if (kr != KERN_SUCCESS) {
399 		goto out;
400 	}
401 	kr = reclaim_copyin_head(metadata, &head);
402 	if (kr != KERN_SUCCESS) {
403 		goto out;
404 	}
405 	kr = reclaim_copyin_tail(metadata, &tail);
406 	if (kr != KERN_SUCCESS) {
407 		goto out;
408 	}
409 
410 	if (head != 0 || tail != 0 || busy != 0) {
411 		vmdr_log_error("indices were not "
412 		    "zero-initialized\n");
413 		kr = KERN_INVALID_ARGUMENT;
414 		goto out;
415 	}
416 
417 	/*
418 	 * Publish the metadata to the task & global buffer list. This must be
419 	 * done under the task lock to synchronize with task termination - i.e.
420 	 * task_terminate_internal is guaranteed to see the published metadata and
421 	 * tear it down.
422 	 */
423 	lck_mtx_lock(&reclaim_buffers_lock);
424 	task_lock(task);
425 
426 	if (!task_is_active(task) || task_is_halting(task)) {
427 		vmdr_log_error(
428 			"failed to initialize buffer on dying task %s [%d]",
429 			task_best_name(task), task_pid(task));
430 		kr = KERN_ABORTED;
431 		goto fail_task;
432 	}
433 	if (task->deferred_reclamation_metadata != NULL) {
434 		vmdr_log_error(
435 			"tried to overwrite existing reclaim buffer for %s [%d]", task_best_name(task), task_pid(task));
436 		kr = VM_RECLAIM_RESOURCE_SHORTAGE;
437 		goto fail_task;
438 	}
439 
440 	metadata->vdrm_is_registered = true;
441 	vmdr_list_append_locked(metadata);
442 	task->deferred_reclamation_metadata = metadata;
443 
444 	task_unlock(task);
445 	lck_mtx_unlock(&reclaim_buffers_lock);
446 
447 	vmdr_log_debug("%s [%d] allocated ring with capacity %u/%u\n",
448 	    task_best_name(task), task_pid(task),
449 	    len, max_len);
450 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END,
451 	    task_pid(task), KERN_SUCCESS, address);
452 	DTRACE_VM3(reclaim_ring_allocate,
453 	    mach_vm_address_t, address,
454 	    mach_vm_reclaim_count_t, len,
455 	    mach_vm_reclaim_count_t, max_len);
456 	return KERN_SUCCESS;
457 
458 fail_task:
459 	task_unlock(task);
460 	lck_mtx_unlock(&reclaim_buffers_lock);
461 
462 	tmp_kr = mach_vm_deallocate(map,
463 	    *address_u, size_u);
464 	assert(tmp_kr == KERN_SUCCESS);
465 
466 out:
467 	*address_u = vm_sanitize_wrap_addr(0ull);
468 	vmdr_metadata_release(metadata);
469 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END,
470 	    kr, NULL);
471 	return kr;
472 }
473 
474 #pragma mark Synchronization & Lifecycle
475 
476 static inline void
477 vmdr_metadata_lock(vm_deferred_reclamation_metadata_t metadata)
478 {
479 	lck_mtx_lock(&metadata->vdrm_lock);
480 }
481 
482 static inline void
483 vmdr_metadata_unlock(vm_deferred_reclamation_metadata_t metadata)
484 {
485 	lck_mtx_unlock(&metadata->vdrm_lock);
486 }
487 
488 static inline void
489 vmdr_metadata_assert_owned_locked(vm_deferred_reclamation_metadata_t metadata)
490 {
491 	lck_mtx_gate_assert(&metadata->vdrm_lock, &metadata->vdrm_gate,
492 	    GATE_ASSERT_HELD);
493 }
494 
495 static inline void
496 vmdr_metadata_assert_owned(vm_deferred_reclamation_metadata_t metadata)
497 {
498 #if MACH_ASSERT
499 	vmdr_metadata_lock(metadata);
500 	vmdr_metadata_assert_owned_locked(metadata);
501 	vmdr_metadata_unlock(metadata);
502 #else /* MACH_ASSERT */
503 	(void)metadata;
504 #endif /* MACH_ASSERT */
505 }
506 
507 static bool
508 vmdr_metadata_try_own_locked(vm_deferred_reclamation_metadata_t metadata)
509 {
510 	kern_return_t kr = lck_mtx_gate_try_close(&metadata->vdrm_lock,
511 	    &metadata->vdrm_gate);
512 	return kr == KERN_SUCCESS;
513 }
514 
515 /*
516  * Try to take ownership of the buffer. Returns true if successful.
517  */
518 static bool
519 vmdr_metadata_own_locked(vm_deferred_reclamation_metadata_t metadata,
520     vm_deferred_reclamation_options_t options)
521 {
522 	__assert_only gate_wait_result_t wait_result;
523 	if (!vmdr_metadata_try_own_locked(metadata)) {
524 		if (options & RECLAIM_NO_WAIT) {
525 			return false;
526 		}
527 		wait_result = lck_mtx_gate_wait(
528 			&metadata->vdrm_lock, &metadata->vdrm_gate, LCK_SLEEP_DEFAULT,
529 			THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
530 		assert(wait_result == GATE_HANDOFF);
531 	}
532 	return true;
533 }
534 
535 /*
536  * Set the current thread as the owner of a reclaim buffer. May block. Will
537  * propagate priority.
538  */
539 static void
540 vmdr_metadata_own(vm_deferred_reclamation_metadata_t metadata)
541 {
542 	vmdr_metadata_lock(metadata);
543 	vmdr_metadata_own_locked(metadata, RECLAIM_OPTIONS_NONE);
544 	vmdr_metadata_unlock(metadata);
545 }
546 
547 static void
548 vmdr_metadata_disown_locked(vm_deferred_reclamation_metadata_t metadata)
549 {
550 	vmdr_metadata_assert_owned_locked(metadata);
551 	lck_mtx_gate_handoff(&metadata->vdrm_lock, &metadata->vdrm_gate,
552 	    GATE_HANDOFF_OPEN_IF_NO_WAITERS);
553 }
554 
555 /*
556  * Release ownership of a reclaim buffer and wakeup any threads waiting for
557  * ownership. Must be called from the thread that acquired ownership.
558  */
559 static void
560 vmdr_metadata_disown(vm_deferred_reclamation_metadata_t metadata)
561 {
562 	vmdr_metadata_lock(metadata);
563 	vmdr_metadata_disown_locked(metadata);
564 	vmdr_metadata_unlock(metadata);
565 }
566 
567 static void
568 vmdr_metadata_retain(vm_deferred_reclamation_metadata_t metadata)
569 {
570 	os_ref_retain(&metadata->vdrm_refcnt);
571 }
572 
573 static void
574 vmdr_metadata_release(vm_deferred_reclamation_metadata_t metadata)
575 {
576 	if (os_ref_release(&metadata->vdrm_refcnt) == 0) {
577 		vmdr_metadata_free(metadata);
578 	}
579 }
580 
581 static void
582 vmdr_list_remove_locked(vm_deferred_reclamation_metadata_t metadata)
583 {
584 	LCK_MTX_ASSERT(&reclaim_buffers_lock, LCK_MTX_ASSERT_OWNED);
585 	assert3p(metadata->vdrm_list.tqe_prev, !=, NULL);
586 	TAILQ_REMOVE(&reclaim_buffers, metadata, vdrm_list);
587 	metadata->vdrm_list.tqe_prev = NULL;
588 	metadata->vdrm_list.tqe_next = NULL;
589 }
590 
591 static void
592 vmdr_list_append_locked(vm_deferred_reclamation_metadata_t metadata)
593 {
594 	LCK_MTX_ASSERT(&reclaim_buffers_lock, LCK_MTX_ASSERT_OWNED);
595 	assert3p(metadata->vdrm_list.tqe_prev, ==, NULL);
596 	TAILQ_INSERT_TAIL(&reclaim_buffers, metadata, vdrm_list);
597 }
598 
599 void
600 vm_deferred_reclamation_buffer_deallocate(vm_deferred_reclamation_metadata_t metadata)
601 {
602 	assert(metadata != NULL);
603 	/*
604 	 * First remove the buffer from the global list so no one else can get access to it.
605 	 */
606 	lck_mtx_lock(&reclaim_buffers_lock);
607 	if (metadata->vdrm_is_registered) {
608 		vmdr_list_remove_locked(metadata);
609 	}
610 	lck_mtx_unlock(&reclaim_buffers_lock);
611 
612 	/*
613 	 * The task is dropping its ref on this buffer. First remove the buffer's
614 	 * back-reference to the task so that any threads currently operating on
615 	 * this buffer do not try to operate on the dead/dying task
616 	 */
617 	vmdr_metadata_lock(metadata);
618 	assert3p(metadata->vdrm_task, !=, TASK_NULL);
619 	metadata->vdrm_task = TASK_NULL;
620 	vmdr_metadata_unlock(metadata);
621 	vmdr_metadata_release(metadata);
622 }
623 
624 #pragma mark Exception Delivery
625 
626 static void
627 reclaim_kill_with_reason(
628 	vm_deferred_reclamation_metadata_t metadata,
629 	unsigned reason,
630 	mach_exception_data_type_t subcode)
631 {
632 	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
633 	mach_exception_code_t code = 0;
634 	task_t task;
635 	proc_t p = NULL;
636 	boolean_t fatal = TRUE;
637 	bool killing_self;
638 	pid_t pid;
639 	int err;
640 
641 	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_NOTOWNED);
642 
643 	EXC_GUARD_ENCODE_TYPE(code, guard_type);
644 	EXC_GUARD_ENCODE_FLAVOR(code, reason);
645 	EXC_GUARD_ENCODE_TARGET(code, 0);
646 
647 	vmdr_metadata_lock(metadata);
648 	task = metadata->vdrm_task;
649 	if (task == TASK_NULL || !task_is_active(task) || task_is_halting(task)) {
650 		/* Task is no longer alive */
651 		vmdr_metadata_unlock(metadata);
652 		vmdr_log_error(
653 			"Unable to deliver guard exception because task "
654 			"[%d] is already dead.\n",
655 			metadata->vdrm_pid);
656 		return;
657 	}
658 
659 	if (panic_on_kill) {
660 		panic("About to kill %p due to %d with subcode %lld\n", task, reason, subcode);
661 	}
662 
663 	killing_self = (task == current_task());
664 	if (!killing_self) {
665 		task_reference(task);
666 	}
667 	assert(task != kernel_task);
668 	vmdr_metadata_unlock(metadata);
669 
670 	if (reason == kGUARD_EXC_DEALLOC_GAP) {
671 		task_lock(task);
672 		fatal = (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL);
673 		task_unlock(task);
674 	}
675 
676 	if (!fatal) {
677 		vmdr_log_info(
678 			"Skipping non fatal guard exception for %s [%d]\n",
679 			task_best_name(task), task_pid(task));
680 		goto out;
681 	}
682 
683 	pid = task_pid(task);
684 	if (killing_self) {
685 		p = get_bsdtask_info(task);
686 	} else {
687 		p = proc_find(pid);
688 		if (p && proc_task(p) != task) {
689 			vmdr_log_error(
690 				"Unable to deliver guard exception because proc is gone & pid rolled over.\n");
691 			goto out;
692 		}
693 	}
694 
695 	if (!p) {
696 		vmdr_log_error(
697 			"Unable to deliver guard exception because task does not have a proc.\n");
698 		goto out;
699 	}
700 
701 	int flags = PX_DEBUG_NO_HONOR;
702 	exception_info_t info = {
703 		.os_reason = OS_REASON_GUARD,
704 		.exception_type = EXC_GUARD,
705 		.mx_code = code,
706 		.mx_subcode = subcode
707 	};
708 
709 	vmdr_log("Force-exiting %s [%d]\n", task_best_name(task), task_pid(task));
710 
711 	err = exit_with_mach_exception(p, info, flags);
712 	if (err != 0) {
713 		vmdr_log_error("Unable to deliver guard exception to %p: %d\n", p, err);
714 		goto out;
715 	}
716 
717 
718 out:
719 	if (!killing_self) {
720 		if (p) {
721 			proc_rele(p);
722 			p = NULL;
723 		}
724 		if (task) {
725 			task_deallocate(task);
726 			task = NULL;
727 		}
728 	}
729 }
730 
731 #pragma mark Copy I/O
732 
733 static user_addr_t
734 get_entries_ptr(vm_deferred_reclamation_metadata_t metadata)
735 {
736 	return metadata->vdrm_buffer_addr +
737 	       offsetof(struct mach_vm_reclaim_ring_s, entries);
738 }
739 
740 static user_addr_t
741 get_indices_ptr(user_addr_t buffer_addr)
742 {
743 	return buffer_addr +
744 	       offsetof(struct mach_vm_reclaim_ring_s, indices);
745 }
746 
747 static user_addr_t
748 get_head_ptr(user_addr_t indices)
749 {
750 	return indices + offsetof(struct mach_vm_reclaim_indices_s, head);
751 }
752 
753 static user_addr_t
754 get_tail_ptr(user_addr_t indices)
755 {
756 	return indices + offsetof(struct mach_vm_reclaim_indices_s, tail);
757 }
758 
759 static user_addr_t
760 get_busy_ptr(user_addr_t indices)
761 {
762 	return indices + offsetof(struct mach_vm_reclaim_indices_s, busy);
763 }
764 
765 static kern_return_t
766 reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result)
767 {
768 	if (result != 0 && (result != EFAULT || !vm_fault_get_disabled())) {
769 		vmdr_log_error("Killing [%d] due to copy I/O error\n", metadata->vdrm_pid);
770 		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_COPYIO_FAILURE,
771 		    result);
772 	}
773 	return kern_return_for_errno(result);
774 }
775 
776 /*
777  * Helper functions to do copyio on the head, tail, and busy pointers.
778  * Note that the kernel will only write to the busy and head pointers.
779  * Userspace is not supposed to write to the head or busy pointers, but the kernel
780  * must be resilient to that kind of bug in userspace.
781  */
782 
783 static kern_return_t
784 reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head)
785 {
786 	int result;
787 	kern_return_t kr;
788 	user_addr_t indices = get_indices_ptr(metadata->vdrm_buffer_addr);
789 	user_addr_t head_ptr = get_head_ptr(indices);
790 
791 	result = copyin_atomic64(head_ptr, head);
792 	kr = reclaim_handle_copyio_error(metadata, result);
793 	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
794 		vmdr_log_error(
795 			"Unable to copy head ptr from 0x%llx: err=%d\n", head_ptr, result);
796 	}
797 	return kr;
798 }
799 
800 static kern_return_t
801 reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail)
802 {
803 	int result;
804 	kern_return_t kr;
805 	user_addr_t indices = get_indices_ptr(metadata->vdrm_buffer_addr);
806 	user_addr_t tail_ptr = get_tail_ptr(indices);
807 
808 	result = copyin_atomic64(tail_ptr, tail);
809 	kr = reclaim_handle_copyio_error(metadata, result);
810 	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
811 		vmdr_log_error(
812 			"Unable to copy tail ptr from 0x%llx: err=%d\n", tail_ptr, result);
813 	}
814 	return kr;
815 }
816 
817 static kern_return_t
818 reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy)
819 {
820 	int result;
821 	kern_return_t kr;
822 	user_addr_t indices = get_indices_ptr(metadata->vdrm_buffer_addr);
823 	user_addr_t busy_ptr = get_busy_ptr(indices);
824 
825 	result = copyin_atomic64(busy_ptr, busy);
826 	kr = reclaim_handle_copyio_error(metadata, result);
827 	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
828 		vmdr_log_error(
829 			"Unable to copy busy ptr from 0x%llx: err=%d\n", busy_ptr, result);
830 	}
831 	return kr;
832 }
833 
834 static bool
835 reclaim_copyout_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
836 {
837 	int result;
838 	kern_return_t kr;
839 	user_addr_t indices = get_indices_ptr(metadata->vdrm_buffer_addr);
840 	user_addr_t busy_ptr = get_busy_ptr(indices);
841 
842 	result = copyout_atomic64(value, busy_ptr);
843 	kr = reclaim_handle_copyio_error(metadata, result);
844 	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
845 		vmdr_log_error(
846 			"Unable to copy %llu to busy ptr at 0x%llx: err=%d\n", value, busy_ptr, result);
847 	}
848 	return kr;
849 }
850 
851 static bool
852 reclaim_copyout_head(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
853 {
854 	int result;
855 	kern_return_t kr;
856 	user_addr_t indices = get_indices_ptr(metadata->vdrm_buffer_addr);
857 	user_addr_t head_ptr = get_head_ptr(indices);
858 
859 	result = copyout_atomic64(value, head_ptr);
860 	kr = reclaim_handle_copyio_error(metadata, result);
861 	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
862 		vmdr_log_error(
863 			"Unable to copy %llu to head ptr at 0x%llx: err=%d\n", value, head_ptr, result);
864 	}
865 	return kr;
866 }
867 
868 #pragma mark Reclamation
869 
870 /*
871  * @func reclaim_chunk
872  *
873  * @brief
874  * Reclaim a batch of entries from the buffer.
875  *
876  * @param bytes_to_reclaim
877  * Number of bytes caller wishes to reclaim from the buffer
878  *
879  * @param bytes_reclaimed_out
880  * The number of bytes reclaimed from the buffer written out
881  *
882  * @param chunk_size
883  * The maximum number of entries to hold busy and reclaim from (must
884  * be <= kReclaimChunkSize)
885  *
886  * @param num_reclaimed_out
887  * The number of entries reclaimed written out
888  *
889  * @discussion
890  * If the buffer has been exhausted of entries (tail == head),
891  * num_reclaimed_out will be zero. It is important that the caller abort any
892  * loops if such a condition is met.
893  */
894 static kern_return_t
895 reclaim_chunk(vm_deferred_reclamation_metadata_t metadata,
896     uint64_t bytes_to_reclaim, uint64_t *bytes_reclaimed_out,
897     mach_vm_reclaim_count_t chunk_size, mach_vm_reclaim_count_t *num_reclaimed_out)
898 {
899 	kern_return_t kr = KERN_SUCCESS;
900 	int result = 0;
901 	mach_vm_reclaim_count_t num_reclaimed = 0, num_copied = 0;
902 	uint64_t bytes_reclaimed = 0;
903 	uint64_t head = 0, tail = 0, busy = 0, num_to_reclaim = 0, new_tail = 0;
904 	user_addr_t indices;
905 	vm_map_t map = metadata->vdrm_map;
906 	vm_map_switch_context_t switch_ctx;
907 	struct mach_vm_reclaim_entry_s copied_entries[kReclaimChunkSize];
908 
909 	assert(metadata != NULL);
910 	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_NOTOWNED);
911 	vmdr_metadata_assert_owned(metadata);
912 
913 	assert(chunk_size <= kReclaimChunkSize);
914 
915 	KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_START,
916 	    metadata->vdrm_pid, bytes_to_reclaim);
917 
918 	memset(copied_entries, 0, sizeof(copied_entries));
919 
920 	indices = get_indices_ptr(metadata->vdrm_buffer_addr);
921 	switch_ctx = vm_map_switch_to(map);
922 
923 	kr = reclaim_copyin_busy(metadata, &busy);
924 	if (kr != KERN_SUCCESS) {
925 		goto done;
926 	}
927 	kr = reclaim_copyin_head(metadata, &head);
928 	if (kr != KERN_SUCCESS) {
929 		goto done;
930 	}
931 	kr = reclaim_copyin_tail(metadata, &tail);
932 	if (kr != KERN_SUCCESS) {
933 		goto done;
934 	}
935 
936 	/*
937 	 * NB: busy may not be exactly equal to head if the jetsam
938 	 * thread fails to fault on the indices after having marked
939 	 * entries busy
940 	 */
941 	if (busy < head || (busy - head) > kReclaimChunkSize) {
942 		vmdr_log_error(
943 			"Userspace modified head or busy pointer! head: %llu "
944 			"(0x%llx) | busy: %llu (0x%llx) | tail = %llu (0x%llx)\n",
945 			head, get_head_ptr(indices), busy, get_busy_ptr(indices), tail,
946 			get_tail_ptr(indices));
947 		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_INDEX_FAILURE,
948 		    busy);
949 		kr = KERN_FAILURE;
950 		goto done;
951 	}
952 
953 	if (tail < head) {
954 		/*
955 		 * Userspace is likely in the middle of trying to re-use an entry,
956 		 * bail on this reclamation.
957 		 */
958 		vmdr_log_error(
959 			"Tail < head! Userspace is likely attempting a "
960 			"cancellation; aborting reclamation | head: %llu "
961 			"(0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
962 			head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy,
963 			get_busy_ptr(indices));
964 		kr = KERN_ABORTED;
965 		goto done;
966 	}
967 
968 	/*
969 	 * NB: If any of the copyouts below fail due to faults being disabled,
970 	 * the buffer may be left in a state where several entries are unusable
971 	 * until the next reclamation (i.e. busy > head)
972 	 */
973 	num_to_reclaim = tail - head;
974 	while (true) {
975 		num_to_reclaim = MIN(num_to_reclaim, chunk_size);
976 		if (num_to_reclaim == 0) {
977 			break;
978 		}
979 		busy = head + num_to_reclaim;
980 		kr = reclaim_copyout_busy(metadata, busy);
981 		if (kr != KERN_SUCCESS) {
982 			goto done;
983 		}
984 		os_atomic_thread_fence(seq_cst);
985 		kr = reclaim_copyin_tail(metadata, &new_tail);
986 		if (kr != KERN_SUCCESS) {
987 			goto done;
988 		}
989 
990 		if (new_tail >= busy) {
991 			/* Got num_to_reclaim entries */
992 			break;
993 		}
994 		tail = new_tail;
995 		if (tail < head) {
996 			/*
997 			 * Userspace is likely in the middle of trying to re-use an entry,
998 			 * bail on this reclamation
999 			 */
1000 			vmdr_log_error(
1001 				"Tail < head! Userspace is likely attempting a "
1002 				"cancellation; aborting reclamation | head: %llu "
1003 				"(0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
1004 				head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy,
1005 				get_busy_ptr(indices));
1006 			/* Reset busy back to head */
1007 			reclaim_copyout_busy(metadata, head);
1008 			kr = KERN_ABORTED;
1009 			goto done;
1010 		}
1011 		/* Can't reclaim these entries. Try again */
1012 		num_to_reclaim = tail - head;
1013 		if (num_to_reclaim == 0) {
1014 			/* Nothing left to reclaim. Reset busy to head. */
1015 			kr = reclaim_copyout_busy(metadata, head);
1016 			if (kr != KERN_SUCCESS) {
1017 				goto done;
1018 			}
1019 			break;
1020 		}
1021 		/*
1022 		 * Note that num_to_reclaim must have gotten smaller since tail got smaller,
1023 		 * so this is gauranteed to converge.
1024 		 */
1025 	}
1026 	vmdr_log_debug("[%d] reclaiming up to %llu entries (%llu KiB) head=%llu "
1027 	    "busy=%llu tail=%llu len=%u", metadata->vdrm_pid, num_to_reclaim,
1028 	    bytes_reclaimed, head, busy, tail, metadata->vdrm_buffer_len);
1029 
1030 	uint64_t memcpy_start_idx = head % metadata->vdrm_buffer_len;
1031 	while (num_copied < num_to_reclaim) {
1032 		uint64_t memcpy_end_idx = memcpy_start_idx + num_to_reclaim - num_copied;
1033 		// Clamp the end idx to the buffer. We'll handle wrap-around in our next go around the loop.
1034 		memcpy_end_idx = MIN(memcpy_end_idx, metadata->vdrm_buffer_len);
1035 		uint64_t num_to_copy = memcpy_end_idx - memcpy_start_idx;
1036 
1037 		assert(num_to_copy + num_copied <= kReclaimChunkSize);
1038 		user_addr_t src_ptr = get_entries_ptr(metadata) +
1039 		    (memcpy_start_idx * sizeof(struct mach_vm_reclaim_entry_s));
1040 		struct mach_vm_reclaim_entry_s *dst_ptr = copied_entries + num_copied;
1041 		result = copyin(src_ptr, dst_ptr,
1042 		    (num_to_copy * sizeof(struct mach_vm_reclaim_entry_s)));
1043 		kr = reclaim_handle_copyio_error(metadata, result);
1044 		if (kr != KERN_SUCCESS) {
1045 			if (kr != KERN_MEMORY_ERROR || !vm_fault_get_disabled()) {
1046 				vmdr_log_error(
1047 					"Unable to copyin %llu entries in reclaim "
1048 					"buffer at 0x%llx to 0x%llx: err=%d\n",
1049 					num_to_copy, src_ptr, (uint64_t) dst_ptr, result);
1050 			}
1051 			goto done;
1052 		}
1053 
1054 		num_copied += num_to_copy;
1055 		memcpy_start_idx = (memcpy_start_idx + num_to_copy) % metadata->vdrm_buffer_len;
1056 	}
1057 
1058 	for (num_reclaimed = 0; num_reclaimed < num_to_reclaim && bytes_reclaimed < bytes_to_reclaim; num_reclaimed++) {
1059 		mach_vm_reclaim_entry_t entry = &copied_entries[num_reclaimed];
1060 		KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_START,
1061 		    metadata->vdrm_pid, entry->address, entry->size,
1062 		    entry->behavior);
1063 		if (entry->address != 0 && entry->size != 0) {
1064 			vm_map_address_t start = vm_map_trunc_page(entry->address,
1065 			    VM_MAP_PAGE_MASK(map));
1066 			vm_map_address_t end = vm_map_round_page(entry->address + entry->size,
1067 			    VM_MAP_PAGE_MASK(map));
1068 			DTRACE_VM4(vm_reclaim_entry,
1069 			    pid_t, metadata->vdrm_pid,
1070 			    mach_vm_address_t, entry->address,
1071 			    mach_vm_address_t, end,
1072 			    mach_vm_reclaim_action_t, entry->behavior);
1073 			KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_START,
1074 			    metadata->vdrm_pid, start, end,
1075 			    entry->behavior);
1076 			vmdr_log_debug("[%d] Reclaiming entry %llu (0x%llx, 0x%llx)\n", metadata->vdrm_pid, head + num_reclaimed, start, end);
1077 			switch (entry->behavior) {
1078 			case VM_RECLAIM_DEALLOCATE:
1079 				kr = vm_map_remove_guard(map,
1080 				    start, end, VM_MAP_REMOVE_GAPS_FAIL,
1081 				    KMEM_GUARD_NONE).kmr_return;
1082 				if (kr == KERN_INVALID_VALUE) {
1083 					vmdr_log_error(
1084 						"[%d] Killing due to virtual-memory guard at (0x%llx, 0x%llx)\n",
1085 						metadata->vdrm_pid, start, end);
1086 					reclaim_kill_with_reason(metadata, kGUARD_EXC_DEALLOC_GAP, entry->address);
1087 					goto done;
1088 				} else if (kr != KERN_SUCCESS) {
1089 					vmdr_log_error(
1090 						"[%d] Killing due to deallocation failure at (0x%llx, 0x%llx) err=%d\n",
1091 						metadata->vdrm_pid, start, end, kr);
1092 					reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
1093 					goto done;
1094 				}
1095 				break;
1096 			case VM_RECLAIM_FREE:
1097 				/*
1098 				 * TODO: This should free the backing pages directly instead of using
1099 				 * VM_BEHAVIOR_REUSABLE, which will mark the pages as clean and let them
1100 				 * age in the LRU.
1101 				 */
1102 				kr = vm_map_behavior_set(map, start,
1103 				    end, VM_BEHAVIOR_REUSABLE);
1104 				if (kr != KERN_SUCCESS) {
1105 					vmdr_log_error(
1106 						"[%d] Failed to free(reusable) (0x%llx, 0x%llx) err=%d\n",
1107 						metadata->vdrm_pid, start, end, kr);
1108 				}
1109 				break;
1110 			default:
1111 				vmdr_log_error(
1112 					"attempted to reclaim entry with unsupported behavior %uh",
1113 					entry->behavior);
1114 				reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
1115 				kr = KERN_INVALID_VALUE;
1116 				goto done;
1117 			}
1118 			bytes_reclaimed += entry->size;
1119 			KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_END,
1120 			    kr);
1121 		}
1122 	}
1123 
1124 	assert(head + num_reclaimed <= busy);
1125 	head += num_reclaimed;
1126 	kr = reclaim_copyout_head(metadata, head);
1127 	if (kr != KERN_SUCCESS) {
1128 		goto done;
1129 	}
1130 	if (busy > head) {
1131 		busy = head;
1132 		kr = reclaim_copyout_busy(metadata, busy);
1133 		if (kr != KERN_SUCCESS) {
1134 			goto done;
1135 		}
1136 	}
1137 
1138 done:
1139 	vmdr_log_debug("[%d] reclaimed %u entries (%llu KiB) head=%llu "
1140 	    "busy=%llu tail=%llu len=%u", metadata->vdrm_pid, num_reclaimed,
1141 	    bytes_reclaimed, head, busy, tail, metadata->vdrm_buffer_len);
1142 	vm_map_switch_back(switch_ctx);
1143 	KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_END,
1144 	    bytes_reclaimed, num_reclaimed, kr);
1145 	if (bytes_reclaimed_out) {
1146 		*bytes_reclaimed_out = bytes_reclaimed;
1147 	}
1148 	if (num_reclaimed_out) {
1149 		*num_reclaimed_out = num_reclaimed;
1150 	}
1151 	return kr;
1152 }
1153 
1154 /*
1155  * @func vmdr_reclaim_from_buffer
1156  *
1157  * @brief
1158  * Reclaim entries until the buffer's estimated number of available bytes
1159  * is <= @c bytes_to_reclaim.
1160  *
1161  * @param bytes_to_reclaim
1162  * The minimum number of bytes to reclaim
1163  *
1164  * @param num_bytes_reclaimed_out
1165  * The number of bytes reclaimed written out
1166  *
1167  * @param options
1168  * If RECLAIM_NO_FAULT is set, do not fault on the buffer if it has been paged
1169  * out.
1170  *
1171  * @discussion
1172  * The buffer should be owned by the caller.
1173  */
1174 static kern_return_t
1175 vmdr_reclaim_from_buffer(vm_deferred_reclamation_metadata_t metadata,
1176     size_t bytes_to_reclaim, size_t *num_bytes_reclaimed_out,
1177     vm_deferred_reclamation_options_t options)
1178 {
1179 	kern_return_t kr = KERN_SUCCESS;
1180 
1181 	if (options & RECLAIM_NO_FAULT) {
1182 		vm_fault_disable();
1183 	}
1184 
1185 	size_t total_bytes_reclaimed = 0;
1186 	while (total_bytes_reclaimed < bytes_to_reclaim) {
1187 		uint64_t cur_bytes_reclaimed;
1188 		mach_vm_reclaim_count_t entries_reclaimed;
1189 		kr = reclaim_chunk(metadata, bytes_to_reclaim - total_bytes_reclaimed,
1190 		    &cur_bytes_reclaimed, kReclaimChunkSize, &entries_reclaimed);
1191 		total_bytes_reclaimed += cur_bytes_reclaimed;
1192 		if (entries_reclaimed == 0 || kr != KERN_SUCCESS) {
1193 			break;
1194 		}
1195 	}
1196 
1197 	if (options & RECLAIM_NO_FAULT) {
1198 		vm_fault_enable();
1199 	}
1200 	vmdr_log_debug("reclaimed %lu B / %lu B from %d\n", total_bytes_reclaimed, bytes_to_reclaim, metadata->vdrm_pid);
1201 	if (num_bytes_reclaimed_out) {
1202 		*num_bytes_reclaimed_out = total_bytes_reclaimed;
1203 	}
1204 	return kr;
1205 }
1206 
1207 /*
1208  * Get the reclamation metadata buffer for the given map.
1209  */
1210 static vm_deferred_reclamation_metadata_t
1211 get_task_reclaim_metadata(task_t task)
1212 {
1213 	assert(task != NULL);
1214 	vm_deferred_reclamation_metadata_t metadata = NULL;
1215 	task_lock(task);
1216 	metadata = task->deferred_reclamation_metadata;
1217 	task_unlock(task);
1218 	return metadata;
1219 }
1220 
1221 #pragma mark Buffer Resize/Synchronization
1222 
1223 kern_return_t
1224 vm_deferred_reclamation_buffer_flush_internal(task_t task,
1225     mach_vm_reclaim_count_t num_entries_to_reclaim)
1226 {
1227 	kern_return_t kr;
1228 	vm_deferred_reclamation_metadata_t metadata = NULL;
1229 	mach_vm_reclaim_count_t total_reclaimed = 0;
1230 	uint64_t bytes_reclaimed = 0;
1231 
1232 	if (!task_is_active(task)) {
1233 		return KERN_INVALID_TASK;
1234 	}
1235 
1236 	metadata = get_task_reclaim_metadata(task);
1237 	if (metadata == NULL) {
1238 		return KERN_INVALID_ARGUMENT;
1239 	}
1240 
1241 	vmdr_metadata_own(metadata);
1242 
1243 	vmdr_log_debug("[%d] flushing %u entries\n", task_pid(task), num_entries_to_reclaim);
1244 	KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_FLUSH) | DBG_FUNC_START, metadata->vdrm_pid, num_entries_to_reclaim);
1245 
1246 	while (total_reclaimed < num_entries_to_reclaim) {
1247 		mach_vm_reclaim_count_t cur_reclaimed;
1248 		uint64_t cur_bytes_reclaimed;
1249 		mach_vm_reclaim_count_t chunk_size = MIN(num_entries_to_reclaim - total_reclaimed, kReclaimChunkSize);
1250 		kr = reclaim_chunk(metadata, UINT64_MAX, &cur_bytes_reclaimed, chunk_size,
1251 		    &cur_reclaimed);
1252 		total_reclaimed += cur_reclaimed;
1253 		bytes_reclaimed += cur_bytes_reclaimed;
1254 		if (cur_reclaimed == 0) {
1255 			break;
1256 		} else if (kr == KERN_ABORTED) {
1257 			/*
1258 			 * Unable to reclaim due to a lost race with
1259 			 * userspace, yield the gate and try again
1260 			 */
1261 			vmdr_metadata_disown(metadata);
1262 			vmdr_metadata_own(metadata);
1263 			continue;
1264 		} else if (kr != KERN_SUCCESS) {
1265 			break;
1266 		}
1267 	}
1268 
1269 	vmdr_metadata_lock(metadata);
1270 	metadata->vdrm_cumulative_reclaimed_bytes += bytes_reclaimed;
1271 	vmdr_metadata_disown_locked(metadata);
1272 	vmdr_metadata_unlock(metadata);
1273 
1274 	KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_FLUSH) | DBG_FUNC_END, kr, total_reclaimed, bytes_reclaimed);
1275 	DTRACE_VM2(reclaim_flush,
1276 	    mach_vm_reclaim_count_t, num_entries_to_reclaim,
1277 	    size_t, bytes_reclaimed);
1278 	return kr;
1279 }
1280 
1281 kern_return_t
1282 vm_deferred_reclamation_buffer_resize_internal(
1283 	task_t                   task,
1284 	mach_vm_reclaim_count_t len)
1285 {
1286 	kern_return_t kr;
1287 	mach_vm_reclaim_count_t num_entries_reclaimed = 0;
1288 	mach_vm_reclaim_count_t old_len;
1289 
1290 	if (task == TASK_NULL) {
1291 		return KERN_INVALID_TASK;
1292 	}
1293 	if (len == 0) {
1294 		return KERN_INVALID_ARGUMENT;
1295 	}
1296 	vm_deferred_reclamation_metadata_t metadata = get_task_reclaim_metadata(task);
1297 	if (metadata == NULL) {
1298 		return KERN_INVALID_TASK;
1299 	}
1300 
1301 	/* Size must be multiple of page size */
1302 	vm_map_t map = task->map;
1303 	mach_vm_size_t new_size = vmdr_round_len_to_size(map, len);
1304 	if (new_size == 0) {
1305 		return KERN_INVALID_ARGUMENT;
1306 	}
1307 	if (new_size > metadata->vdrm_buffer_size) {
1308 		return KERN_NO_SPACE;
1309 	}
1310 
1311 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_RESIZE) | DBG_FUNC_START,
1312 	    task_pid(task), new_size);
1313 
1314 	/*
1315 	 * Prevent other threads from operating on this buffer while it is
1316 	 * resized. It is the caller's responsibility to ensure mutual
1317 	 * exclusion with other user threads
1318 	 */
1319 	vmdr_metadata_own(metadata);
1320 
1321 	old_len = metadata->vdrm_buffer_len;
1322 
1323 	vmdr_log_debug("%s [%d] resizing buffer %u -> %u entries\n",
1324 	    task_best_name(task), task_pid(task), old_len, len);
1325 
1326 	/*
1327 	 * Reclaim all the entries currently in the buffer to prevent re-use
1328 	 * of old reclaim ids that will alias differently into the newly sized
1329 	 * buffer.
1330 	 *
1331 	 * TODO: Consider encoding the ringbuffer-capacity in the
1332 	 * mach_vm_reclaim_id_t, so reuses can still find objects after a resize.
1333 	 */
1334 	do {
1335 		kr = reclaim_chunk(metadata, UINT64_MAX, NULL, kReclaimChunkSize,
1336 		    &num_entries_reclaimed);
1337 		if (kr != KERN_SUCCESS) {
1338 			goto fail;
1339 		}
1340 	} while (num_entries_reclaimed > 0);
1341 
1342 	/* Publish new user addresses in kernel metadata */
1343 	vmdr_metadata_lock(metadata);
1344 	metadata->vdrm_buffer_len = len;
1345 	vmdr_metadata_disown_locked(metadata);
1346 	vmdr_metadata_unlock(metadata);
1347 
1348 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_RESIZE) | DBG_FUNC_END, KERN_SUCCESS, num_entries_reclaimed);
1349 	DTRACE_VM2(reclaim_ring_resize,
1350 	    mach_vm_reclaim_count_t, old_len,
1351 	    mach_vm_reclaim_count_t, len);
1352 	return KERN_SUCCESS;
1353 
1354 fail:
1355 	vmdr_metadata_disown(metadata);
1356 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_RESIZE) | DBG_FUNC_END, kr, num_entries_reclaimed);
1357 	return kr;
1358 }
1359 
1360 #pragma mark Accounting
1361 
1362 #if CONFIG_WORKING_SET_ESTIMATION
1363 extern vm_pressure_level_t memorystatus_vm_pressure_level;
1364 
1365 static uint64_t
1366 vmdr_metadata_autotrim_threshold(vm_deferred_reclamation_metadata_t metadata)
1367 {
1368 	kern_return_t kr;
1369 	uint32_t autotrim_pct;
1370 
1371 	/*
1372 	 * Determine the autotrim threshold based on the current pressure level
1373 	 */
1374 	vm_pressure_level_t pressure_level = os_atomic_load(&memorystatus_vm_pressure_level, relaxed);
1375 	switch (pressure_level) {
1376 	case kVMPressureNormal:
1377 		autotrim_pct = vm_reclaim_autotrim_pct_normal;
1378 		break;
1379 	case kVMPressureWarning:
1380 	case kVMPressureUrgent:
1381 		autotrim_pct = vm_reclaim_autotrim_pct_pressure;
1382 		break;
1383 	case kVMPressureCritical:
1384 		autotrim_pct = vm_reclaim_autotrim_pct_critical;
1385 		break;
1386 	default:
1387 		panic("vm_reclaim: unexpected vm_pressure_level %d", pressure_level);
1388 	}
1389 
1390 	/*
1391 	 * Estimate the task's maximum working set size
1392 	 */
1393 	ledger_amount_t phys_footprint_max = 0;
1394 	kr = ledger_get_lifetime_max(metadata->vdrm_task->ledger,
1395 	    task_ledgers.phys_footprint, &phys_footprint_max);
1396 	assert3u(kr, ==, KERN_SUCCESS);
1397 
1398 	return phys_footprint_max * autotrim_pct / 100;
1399 }
1400 
1401 #define VMDR_WMA_UNIT (1 << 8)
1402 #define VMDR_WMA_MIX(base, e)  ((vm_reclaim_wma_weight_base * (base) + (e) * VMDR_WMA_UNIT * vm_reclaim_wma_weight_cur) / vm_reclaim_wma_denom)
1403 
1404 static size_t
1405 vmdr_metadata_reset_min_bytes(vm_deferred_reclamation_metadata_t metadata)
1406 {
1407 	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED);
1408 	metadata->vdrm_reclaimable_bytes_min =
1409 	    metadata->vdrm_cumulative_uncancelled_bytes -
1410 	    metadata->vdrm_cumulative_reclaimed_bytes;
1411 	return metadata->vdrm_reclaimable_bytes_min;
1412 }
1413 
1414 /*
1415  * @func vmdr_ws_sample
1416  *
1417  * @brief sample the working set size of the given buffer
1418  *
1419  * @param metadata
1420  * The reclaim buffer to sample
1421  *
1422  * @param trim_threshold_out
1423  * If the buffer should be trimmed, the amount to trim (in bytes) will be
1424  * written out
1425  *
1426  * @returns true iff the buffer should be trimmed
1427  *
1428  * @discussion
1429  * The caller must hold the buffer locked.
1430  */
1431 static bool
1432 vmdr_sample_working_set(vm_deferred_reclamation_metadata_t metadata,
1433     size_t *trim_threshold_out)
1434 {
1435 	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED);
1436 
1437 	uint64_t now = mach_absolute_time();
1438 	if (now - metadata->vdrm_last_sample_abs < vm_reclaim_sampling_period_abs) {
1439 		/* A sampling period has not elapsed */
1440 		return false;
1441 	}
1442 
1443 	size_t estimated_reclaimable_bytes;
1444 	uint64_t samples_elapsed = (now - metadata->vdrm_last_sample_abs) /
1445 	    vm_reclaim_sampling_period_abs;
1446 
1447 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_SAMPLE) | DBG_FUNC_START,
1448 	    metadata->vdrm_pid,
1449 	    now,
1450 	    metadata->vdrm_last_sample_abs,
1451 	    metadata->vdrm_reclaimable_bytes_min);
1452 
1453 	if (samples_elapsed > vm_reclaim_abandonment_threshold) {
1454 		/*
1455 		 * Many sampling periods have elapsed since the ring was
1456 		 * last sampled. Don't bother computing the WMA and assume
1457 		 * the buffer's current contents are unneeded.
1458 		 */
1459 		estimated_reclaimable_bytes =
1460 		    metadata->vdrm_cumulative_uncancelled_bytes -
1461 		    metadata->vdrm_cumulative_reclaimed_bytes;
1462 		metadata->vdrm_reclaimable_bytes_min = estimated_reclaimable_bytes;
1463 		metadata->vdrm_reclaimable_bytes_wma = estimated_reclaimable_bytes;
1464 	} else {
1465 		/*
1466 		 * Compute an exponential moving average of the minimum amount of reclaimable
1467 		 * memory in this buffer. Multiple sampling periods may have elapsed
1468 		 * since the last sample. By definition, the minimum must be the same for
1469 		 * all elapsed periods (otherwise libmalloc would have called down to
1470 		 * update accounting)
1471 		 */
1472 		for (unsigned int i = 0; i < samples_elapsed; i++) {
1473 			metadata->vdrm_reclaimable_bytes_wma = VMDR_WMA_MIX(
1474 				metadata->vdrm_reclaimable_bytes_wma,
1475 				metadata->vdrm_reclaimable_bytes_min);
1476 		}
1477 
1478 		/* Reset the minimum to start a new sampling interval */
1479 		estimated_reclaimable_bytes = vmdr_metadata_reset_min_bytes(metadata);
1480 	}
1481 
1482 	metadata->vdrm_last_sample_abs = now;
1483 
1484 	size_t trim_threshold_bytes = MIN(metadata->vdrm_reclaimable_bytes_min,
1485 	    metadata->vdrm_reclaimable_bytes_wma / VMDR_WMA_UNIT);
1486 	size_t autotrim_threshold = vmdr_metadata_autotrim_threshold(metadata);
1487 
1488 	bool trim_needed = trim_threshold_bytes >= vm_map_page_size(metadata->vdrm_map) &&
1489 	    trim_threshold_bytes >= autotrim_threshold;
1490 
1491 	*trim_threshold_out = vm_map_round_page(trim_threshold_bytes,
1492 	    vm_map_page_mask(metadata->vdrm_map));
1493 
1494 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_SAMPLE) | DBG_FUNC_END,
1495 	    *trim_threshold_out,
1496 	    trim_needed,
1497 	    estimated_reclaimable_bytes);
1498 	DTRACE_VM5(reclaim_sample,
1499 	    pid_t, metadata->vdrm_pid,
1500 	    uint64_t, metadata->vdrm_reclaimable_bytes_wma,
1501 	    size_t, metadata->vdrm_reclaimable_bytes_min,
1502 	    size_t, estimated_reclaimable_bytes,
1503 	    size_t, *trim_threshold_out);
1504 	vmdr_log_debug("sampled buffer with min %lu est %lu trim %lu wma %llu\n",
1505 	    metadata->vdrm_reclaimable_bytes_min,
1506 	    estimated_reclaimable_bytes,
1507 	    trim_threshold_bytes,
1508 	    metadata->vdrm_reclaimable_bytes_wma / VMDR_WMA_UNIT);
1509 
1510 	return trim_needed;
1511 }
1512 #endif /* CONFIG_WORKING_SET_ESTIMATION */
1513 
1514 /*
1515  * Caller must have buffer owned and unlocked
1516  */
1517 static kern_return_t
1518 vmdr_trim(vm_deferred_reclamation_metadata_t metadata, size_t bytes_to_reclaim,
1519     size_t *bytes_reclaimed, vm_deferred_reclamation_options_t options)
1520 {
1521 	kern_return_t kr;
1522 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_TRIM) | DBG_FUNC_START,
1523 	    metadata->vdrm_pid, bytes_to_reclaim);
1524 
1525 	kr = vmdr_reclaim_from_buffer(metadata, bytes_to_reclaim,
1526 	    bytes_reclaimed, options);
1527 
1528 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_TRIM) | DBG_FUNC_END, kr, bytes_reclaimed);
1529 	DTRACE_VM3(reclaim_trim,
1530 	    pid_t, metadata->vdrm_pid,
1531 	    size_t, bytes_to_reclaim,
1532 	    size_t, *bytes_reclaimed);
1533 	return kr;
1534 }
1535 
1536 /*
1537  * Caller must have buffer owned and unlocked
1538  */
1539 static kern_return_t
1540 vmdr_drain(vm_deferred_reclamation_metadata_t metadata, size_t *bytes_reclaimed,
1541     vm_deferred_reclamation_options_t options)
1542 {
1543 	kern_return_t kr;
1544 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_DRAIN) | DBG_FUNC_START,
1545 	    metadata->vdrm_pid);
1546 
1547 	kr = vmdr_reclaim_from_buffer(metadata, UINT64_MAX,
1548 	    bytes_reclaimed, options);
1549 
1550 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_DRAIN) | DBG_FUNC_END, kr, bytes_reclaimed);
1551 	DTRACE_VM2(reclaim_drain,
1552 	    pid_t, metadata->vdrm_pid,
1553 	    size_t, *bytes_reclaimed);
1554 	return kr;
1555 }
1556 
1557 kern_return_t
1558 vm_deferred_reclamation_buffer_update_reclaimable_bytes_internal(task_t task, uint64_t bytes_placed_in_buffer)
1559 {
1560 	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
1561 	size_t estimated_reclaimable_bytes, bytes_to_reclaim, bytes_reclaimed = 0;
1562 	kern_return_t kr = KERN_SUCCESS;
1563 	if (metadata == NULL) {
1564 		return KERN_INVALID_ARGUMENT;
1565 	}
1566 
1567 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_START,
1568 	    metadata->vdrm_pid, bytes_placed_in_buffer);
1569 
1570 	vmdr_metadata_lock(metadata);
1571 
1572 	if (!metadata->vdrm_pid) {
1573 		/* If this is a forked child, we may not yet have a pid */
1574 		metadata->vdrm_pid = task_pid(task);
1575 	}
1576 
1577 	/*
1578 	 * The client is allowed to make this call in parallel from multiple threads.
1579 	 * It's possible that, while we were waiting for the lock, another
1580 	 * thread updated accounting with a larger/newer uncancelled_bytes
1581 	 * value that resulted in a reclaim. We can't provide strict ordering
1582 	 * with the current implementation, but we can at least detect very
1583 	 * erroneous stale values that would result in the uncancelled-byte
1584 	 * count being less than the reclaimed-byte-count (which cannot be
1585 	 * accurate).
1586 	 *
1587 	 * TODO: Consider making this a try_copyin of the userspace value
1588 	 * under the mutex to ensure ordering/consistency (rdar://137607771)
1589 	 */
1590 	if (bytes_placed_in_buffer < metadata->vdrm_cumulative_reclaimed_bytes) {
1591 		goto done;
1592 	}
1593 
1594 	metadata->vdrm_cumulative_uncancelled_bytes = bytes_placed_in_buffer;
1595 	estimated_reclaimable_bytes = bytes_placed_in_buffer - metadata->vdrm_cumulative_reclaimed_bytes;
1596 #if CONFIG_WORKING_SET_ESTIMATION
1597 	bool should_reclaim = vmdr_sample_working_set(metadata, &bytes_to_reclaim);
1598 	if (should_reclaim) {
1599 		vmdr_metadata_own_locked(metadata, RECLAIM_OPTIONS_NONE);
1600 		lck_mtx_unlock(&metadata->vdrm_lock);
1601 		vmdr_log_debug("trimming pid %d\n", metadata->vdrm_pid);
1602 
1603 		kr = vmdr_trim(metadata, bytes_to_reclaim, &bytes_reclaimed, RECLAIM_OPTIONS_NONE);
1604 
1605 		vmdr_metadata_lock(metadata);
1606 		metadata->vdrm_cumulative_reclaimed_bytes += bytes_reclaimed;
1607 		/* Reset the current minimum now that the buffer has been trimmed down */
1608 		vmdr_metadata_reset_min_bytes(metadata);
1609 		vmdr_metadata_disown_locked(metadata);
1610 		if (kr == KERN_ABORTED) {
1611 			/*
1612 			 * We were unable to complete the trim due to a lost
1613 			 * race with userspace. This need not be fatal b/c the
1614 			 * accounting was successfully updated.
1615 			 */
1616 			kr = KERN_SUCCESS;
1617 		}
1618 	} else {
1619 		/* Update the minimum for the current sampling period */
1620 		metadata->vdrm_reclaimable_bytes_min = MIN(metadata->vdrm_reclaimable_bytes_min, estimated_reclaimable_bytes);
1621 	}
1622 #else /* !CONFIG_WORKING_SET_ESTIMATION */
1623 	if (estimated_reclaimable_bytes > vm_reclaim_max_threshold) {
1624 		bytes_to_reclaim = vm_reclaim_max_threshold - estimated_reclaimable_bytes;
1625 		vmdr_metadata_own_locked(metadata, RECLAIM_OPTIONS_NONE);
1626 		vmdr_metadata_unlock(metadata);
1627 		kr = vmdr_trim(metadata, bytes_to_reclaim, &bytes_reclaimed, RECLAIM_OPTIONS_NONE);
1628 		vmdr_metadata_lock(metadata);
1629 		metadata->vdrm_cumulative_reclaimed_bytes += bytes_reclaimed;
1630 		vmdr_metadata_disown_locked(metadata);
1631 		if (kr == KERN_ABORTED) {
1632 			/*
1633 			 * We were unable to complete the trim due to a lost
1634 			 * race with userspace. This need not be fatal b/c the
1635 			 * accounting was successfully updated.
1636 			 */
1637 			kr = KERN_SUCCESS;
1638 		}
1639 	}
1640 #endif /* CONFIG_WORKING_SET_ESTIMATION */
1641 
1642 done:
1643 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_END,
1644 	    metadata->vdrm_cumulative_uncancelled_bytes,
1645 	    metadata->vdrm_cumulative_reclaimed_bytes,
1646 	    bytes_reclaimed);
1647 	vmdr_metadata_unlock(metadata);
1648 	return kr;
1649 }
1650 
1651 kern_return_t
1652 vm_deferred_reclamation_task_drain(task_t task,
1653     vm_deferred_reclamation_options_t options)
1654 {
1655 	kern_return_t kr;
1656 	size_t bytes_reclaimed;
1657 
1658 	task_lock(task);
1659 	if (!task_is_active(task) || task_is_halting(task)) {
1660 		task_unlock(task);
1661 		return KERN_ABORTED;
1662 	}
1663 	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
1664 	if (metadata == NULL) {
1665 		task_unlock(task);
1666 		return KERN_SUCCESS;
1667 	}
1668 	vmdr_metadata_retain(metadata);
1669 	task_unlock(task);
1670 
1671 	vmdr_metadata_own(metadata);
1672 
1673 	kr = vmdr_drain(metadata, &bytes_reclaimed, options);
1674 
1675 	vmdr_metadata_lock(metadata);
1676 	metadata->vdrm_cumulative_reclaimed_bytes += bytes_reclaimed;
1677 	vmdr_metadata_disown_locked(metadata);
1678 	vmdr_metadata_unlock(metadata);
1679 
1680 	vmdr_metadata_release(metadata);
1681 	return kr;
1682 }
1683 
1684 void
1685 vm_deferred_reclamation_task_suspend(task_t task)
1686 {
1687 	if (task->deferred_reclamation_metadata) {
1688 		sched_cond_signal(&vm_reclaim_scavenger_cond, vm_reclaim_scavenger_thread);
1689 	}
1690 }
1691 
1692 #pragma mark KPIs
1693 
1694 vm_deferred_reclamation_metadata_t
1695 vm_deferred_reclamation_task_fork(task_t task, vm_deferred_reclamation_metadata_t parent)
1696 {
1697 	vm_deferred_reclamation_metadata_t metadata = NULL;
1698 	vmdr_metadata_assert_owned(parent);
1699 
1700 	assert(task->deferred_reclamation_metadata == NULL);
1701 	metadata = vmdr_metadata_alloc(task, parent->vdrm_buffer_addr,
1702 	    parent->vdrm_buffer_size, parent->vdrm_buffer_len);
1703 
1704 	metadata->vdrm_cumulative_reclaimed_bytes = parent->vdrm_cumulative_reclaimed_bytes;
1705 	metadata->vdrm_cumulative_uncancelled_bytes = parent->vdrm_cumulative_uncancelled_bytes;
1706 #if CONFIG_WORKING_SET_ESTIMATION
1707 	metadata->vdrm_reclaimable_bytes_min = parent->vdrm_reclaimable_bytes_min;
1708 	metadata->vdrm_reclaimable_bytes_wma = parent->vdrm_reclaimable_bytes_wma;
1709 	metadata->vdrm_last_sample_abs = parent->vdrm_last_sample_abs;
1710 #endif /* CONFIG_WORKING_SET_ESTIMATION */
1711 
1712 	return metadata;
1713 }
1714 
1715 void
1716 vm_deferred_reclamation_task_fork_register(vm_deferred_reclamation_metadata_t metadata)
1717 {
1718 	assert(metadata != NULL);
1719 	assert(!metadata->vdrm_is_registered);
1720 
1721 	lck_mtx_lock(&reclaim_buffers_lock);
1722 	metadata->vdrm_is_registered = true;
1723 	vmdr_list_append_locked(metadata);
1724 	lck_mtx_unlock(&reclaim_buffers_lock);
1725 }
1726 
1727 bool
1728 vm_deferred_reclamation_task_has_ring(task_t task)
1729 {
1730 	return task->deferred_reclamation_metadata != NULL;
1731 }
1732 
1733 void
1734 vm_deferred_reclamation_ring_own(vm_deferred_reclamation_metadata_t metadata)
1735 {
1736 	vmdr_metadata_own(metadata);
1737 }
1738 
1739 void
1740 vm_deferred_reclamation_ring_disown(vm_deferred_reclamation_metadata_t metadata)
1741 {
1742 	vmdr_metadata_disown(metadata);
1743 }
1744 
1745 void
1746 vm_deferred_reclamation_gc(vm_deferred_reclamation_gc_action_t action, vm_deferred_reclamation_options_t options)
1747 {
1748 	vmdr_garbage_collect(action, options);
1749 }
1750 
1751 #pragma mark Global Reclamation GC
1752 
1753 static void
1754 vmdr_garbage_collect(vm_deferred_reclamation_gc_action_t action, vm_deferred_reclamation_options_t options)
1755 {
1756 	kern_return_t kr;
1757 	size_t bytes_reclaimed, bytes_to_reclaim;
1758 	bool should_reclaim;
1759 	gate_wait_result_t wr;
1760 
1761 #if !CONFIG_WORKING_SET_ESTIMATION
1762 	if (action == RECLAIM_GC_TRIM) {
1763 		/* GC_TRIM is a no-op without working set estimation */
1764 		return;
1765 	}
1766 #endif /* !CONFIG_WORKING_SET_ESTIMATION */
1767 
1768 	lck_mtx_lock(&reclaim_buffers_lock);
1769 	kr = lck_mtx_gate_try_close(&reclaim_buffers_lock, &vm_reclaim_gc_gate);
1770 	if (kr != KERN_SUCCESS) {
1771 		if (options & RECLAIM_NO_WAIT) {
1772 			lck_mtx_unlock(&reclaim_buffers_lock);
1773 			return;
1774 		}
1775 		wr = lck_mtx_gate_wait(&reclaim_buffers_lock, &vm_reclaim_gc_gate, LCK_SLEEP_DEFAULT, THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
1776 		assert3u(wr, ==, GATE_HANDOFF);
1777 	}
1778 
1779 	vm_reclaim_gc_epoch++;
1780 	vmdr_log_debug("running global GC\n");
1781 	while (true) {
1782 		vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&reclaim_buffers);
1783 		if (metadata == NULL) {
1784 			break;
1785 		}
1786 		vmdr_list_remove_locked(metadata);
1787 		vmdr_list_append_locked(metadata);
1788 		vmdr_metadata_retain(metadata);
1789 		lck_mtx_unlock(&reclaim_buffers_lock);
1790 
1791 		vmdr_metadata_lock(metadata);
1792 
1793 		if (metadata->vdrm_reclaimed_at >= vm_reclaim_gc_epoch) {
1794 			/* We've already seen this one. We're done */
1795 			vmdr_metadata_unlock(metadata);
1796 			vmdr_metadata_release(metadata);
1797 			lck_mtx_lock(&reclaim_buffers_lock);
1798 			break;
1799 		}
1800 		metadata->vdrm_reclaimed_at = vm_reclaim_gc_epoch;
1801 
1802 		task_t task = metadata->vdrm_task;
1803 		if (task == TASK_NULL ||
1804 		    !task_is_active(task) ||
1805 		    task_is_halting(task)) {
1806 			goto next;
1807 		}
1808 		bool buffer_is_suspended = task_is_app_suspended(task);
1809 		task = TASK_NULL;
1810 
1811 		switch (action) {
1812 		case RECLAIM_GC_DRAIN:
1813 			if (!vmdr_metadata_own_locked(metadata, options)) {
1814 				goto next;
1815 			}
1816 			vmdr_metadata_unlock(metadata);
1817 			vmdr_drain(metadata, &bytes_reclaimed, options);
1818 			vmdr_metadata_lock(metadata);
1819 			vmdr_metadata_disown_locked(metadata);
1820 			break;
1821 		case RECLAIM_GC_SCAVENGE:
1822 			if (buffer_is_suspended) {
1823 				vmdr_metadata_own_locked(metadata, options);
1824 				vmdr_metadata_unlock(metadata);
1825 				/* This buffer is no longer in use, fully reclaim it. */
1826 				vmdr_log_debug("found suspended buffer (%d), draining\n", metadata->vdrm_pid);
1827 				kr = vmdr_drain(metadata, &bytes_reclaimed, options);
1828 				vmdr_metadata_lock(metadata);
1829 				vmdr_metadata_disown_locked(metadata);
1830 			}
1831 			break;
1832 		case RECLAIM_GC_TRIM:
1833 #if CONFIG_WORKING_SET_ESTIMATION
1834 			should_reclaim = vmdr_sample_working_set(metadata, &bytes_to_reclaim);
1835 			if (should_reclaim) {
1836 				vmdr_log_debug("GC found stale buffer (%d), trimming\n", metadata->vdrm_pid);
1837 				vmdr_metadata_own_locked(metadata, options);
1838 				vmdr_metadata_unlock(metadata);
1839 				kr = vmdr_trim(metadata, bytes_to_reclaim, &bytes_reclaimed, options);
1840 				vmdr_metadata_lock(metadata);
1841 				vmdr_metadata_disown_locked(metadata);
1842 			}
1843 #else /* !CONFIG_WORKING_SET_ESTIMATION */
1844 			(void)bytes_to_reclaim;
1845 			(void)should_reclaim;
1846 #endif /* CONFIG_WORKING_SET_ESTIMATION */
1847 			break;
1848 		}
1849 		if (bytes_reclaimed) {
1850 			vm_reclaim_gc_reclaim_count++;
1851 			metadata->vdrm_cumulative_reclaimed_bytes += bytes_reclaimed;
1852 		}
1853 		if (metadata->vdrm_waiters && action != RECLAIM_GC_TRIM) {
1854 			thread_wakeup((event_t)&metadata->vdrm_waiters);
1855 		}
1856 next:
1857 		vmdr_metadata_unlock(metadata);
1858 		vmdr_metadata_release(metadata);
1859 		lck_mtx_lock(&reclaim_buffers_lock);
1860 	}
1861 	lck_mtx_gate_handoff(&reclaim_buffers_lock, &vm_reclaim_gc_gate, GATE_HANDOFF_OPEN_IF_NO_WAITERS);
1862 	lck_mtx_unlock(&reclaim_buffers_lock);
1863 }
1864 
1865 OS_NORETURN
1866 static void
1867 vm_reclaim_scavenger_thread_continue(__unused void *param, __unused wait_result_t wr)
1868 {
1869 	sched_cond_ack(&vm_reclaim_scavenger_cond);
1870 
1871 	while (true) {
1872 		vmdr_garbage_collect(RECLAIM_GC_SCAVENGE, RECLAIM_OPTIONS_NONE);
1873 		sched_cond_wait(&vm_reclaim_scavenger_cond, THREAD_UNINT, vm_reclaim_scavenger_thread_continue);
1874 	}
1875 }
1876 
1877 OS_NORETURN
1878 static void
1879 vm_reclaim_scavenger_thread_init(__unused void *param, __unused wait_result_t wr)
1880 {
1881 	thread_set_thread_name(current_thread(), "VM_reclaim_scavenger");
1882 #if CONFIG_THREAD_GROUPS
1883 	thread_group_vm_add();
1884 #endif /* CONFIG_THREAD_GROUPS */
1885 	sched_cond_wait(&vm_reclaim_scavenger_cond, THREAD_UNINT, vm_reclaim_scavenger_thread_continue);
1886 	__builtin_unreachable();
1887 }
1888 
1889 __startup_func
1890 static void
1891 vm_deferred_reclamation_init(void)
1892 {
1893 	vm_reclaim_log_handle = os_log_create("com.apple.xnu", "vm_reclaim");
1894 #if CONFIG_WORKING_SET_ESTIMATION
1895 	nanoseconds_to_absolutetime((uint64_t)vm_reclaim_sampling_period_ns,
1896 	    &vm_reclaim_sampling_period_abs);
1897 #endif /* CONFIG_WORKING_SET_ESTIMATION */
1898 
1899 	sched_cond_init(&vm_reclaim_scavenger_cond);
1900 	lck_mtx_gate_init(&reclaim_buffers_lock, &vm_reclaim_gc_gate);
1901 	kern_return_t kr = kernel_thread_start_priority(vm_reclaim_scavenger_thread_init,
1902 	    NULL, BASEPRI_KERNEL, &vm_reclaim_scavenger_thread);
1903 	if (kr != KERN_SUCCESS) {
1904 		panic("Unable to create VM reclaim thread, %d", kr);
1905 	}
1906 }
1907 
1908 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_deferred_reclamation_init);
1909 
1910 #pragma mark Debug Interfaces
1911 
1912 #if DEVELOPMENT || DEBUG
1913 
1914 bool
1915 vm_deferred_reclamation_block_until_task_has_been_reclaimed(task_t task)
1916 {
1917 	bool reclaimed;
1918 	vm_deferred_reclamation_metadata_t metadata = NULL;
1919 
1920 	task_lock(task);
1921 	if (!task_is_halting(task) && task_is_active(task)) {
1922 		metadata = task->deferred_reclamation_metadata;
1923 	}
1924 	if (metadata != NULL) {
1925 		vmdr_metadata_retain(metadata);
1926 	}
1927 	task_unlock(task);
1928 	if (metadata == NULL) {
1929 		return false;
1930 	}
1931 
1932 	vmdr_metadata_lock(metadata);
1933 
1934 	metadata->vdrm_waiters++;
1935 	/* Wake up the scavenger thread */
1936 	sched_cond_signal(&vm_reclaim_scavenger_cond, vm_reclaim_scavenger_thread);
1937 	wait_result_t wr = lck_mtx_sleep(&metadata->vdrm_lock,
1938 	    LCK_SLEEP_DEFAULT, (event_t)&metadata->vdrm_waiters,
1939 	    THREAD_ABORTSAFE);
1940 	metadata->vdrm_waiters--;
1941 	reclaimed = (wr == THREAD_AWAKENED);
1942 
1943 	vmdr_metadata_unlock(metadata);
1944 	vmdr_metadata_release(metadata);
1945 	return reclaimed;
1946 }
1947 
1948 #endif /* DEVELOPMENT || DEBUG */
1949