/*
 * Copyright (c) 2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/exc_guard.h>
#include <kern/locks.h>
#include <kern/task.h>
#include <kern/zalloc.h>
#include <kern/misc_protos.h>
#include <kern/sched_prim.h>
#include <kern/startup.h>
#include <kern/thread_group.h>
#include <libkern/OSAtomic.h>
#include <mach/kern_return.h>
#include <mach/mach_types.h>
#include <mach/vm_reclaim_private.h>
#include <os/atomic_private.h>
#include <os/base_private.h>
#include <os/log.h>
#include <os/refcnt.h>
#include <os/refcnt_internal.h>
#include <pexpert/pexpert.h>
#include <sys/errno.h>
#include <sys/kdebug.h>
#include <sys/queue.h>
#include <sys/reason.h>
#include <vm/vm_fault_xnu.h>
#include <vm/vm_map.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_pageout_internal.h>
#include <vm/vm_reclaim_internal.h>
#include <vm/vm_sanitize_internal.h>
#include <vm/vm_kern_xnu.h>

#pragma mark Tunables

#if XNU_TARGET_OS_IOS && !XNU_TARGET_OS_XR
/* Temporarily opt iOS into the legacy behavior as a stop-gap */
#define CONFIG_WORKING_SET_ESTIMATION 0
/*
 * Deferred reclaim may be enabled via EDT for select iOS devices, but
 * defaults to disabled
 */
#define VM_RECLAIM_ENABLED_DEFAULT false
#else
#define CONFIG_WORKING_SET_ESTIMATION 1
#define VM_RECLAIM_ENABLED_DEFAULT true
#endif

#if DEVELOPMENT || DEBUG
TUNABLE(uint32_t, kReclaimChunkSize, "vm_reclaim_chunk_size", 16);
#else /* RELEASE */
const uint32_t kReclaimChunkSize = 16;
#endif /* DEVELOPMENT || DEBUG */
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_sampling_period_ns, "vm_reclaim_sampling_period_ns",
#if CONFIG_WORKING_SET_ESTIMATION
    10ULL * NSEC_PER_SEC);
#else
    0ULL);
#endif
#if CONFIG_WORKING_SET_ESTIMATION
TUNABLE_DT_DEV_WRITEABLE(bool, vm_reclaim_enabled, "/defaults",
    "kern.vm_reclaim_enabled", "vm_reclaim_enabled", VM_RECLAIM_ENABLED_DEFAULT, TUNABLE_DT_NONE);
TUNABLE_DEV_WRITEABLE(uint32_t, vm_reclaim_autotrim_pct_normal, "vm_reclaim_autotrim_pct_normal", 10);
TUNABLE_DEV_WRITEABLE(uint32_t, vm_reclaim_autotrim_pct_pressure, "vm_reclaim_autotrim_pct_pressure", 5);
TUNABLE_DEV_WRITEABLE(uint32_t, vm_reclaim_autotrim_pct_critical, "vm_reclaim_autotrim_pct_critical", 1);
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_wma_weight_base, "vm_reclaim_wma_weight_base", 3);
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_wma_weight_cur, "vm_reclaim_wma_weight_cur", 1);
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_wma_denom, "vm_reclaim_wma_denom", 4);
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_abandonment_threshold, "vm_reclaim_abandonment_threshold", 512);
#else /* CONFIG_WORKING_SET_ESTIMATION */
TUNABLE_DT_DEV_WRITEABLE(uint64_t, vm_reclaim_max_threshold, "/defaults",
    "kern.vm_reclaim_max_threshold", "vm_reclaim_max_threshold", 0, TUNABLE_DT_NONE);
#endif /* CONFIG_WORKING_SET_ESTIMATION */
TUNABLE(bool, panic_on_kill, "vm_reclaim_panic_on_kill", false);
#if DEVELOPMENT || DEBUG
TUNABLE_WRITEABLE(bool, vm_reclaim_debug, "vm_reclaim_debug", false);
#endif
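
/*
 * Example (illustrative only, not an exhaustive list): on a DEVELOPMENT or
 * DEBUG kernel the batch size and sampling cadence above can be overridden
 * via boot-args matching the TUNABLE strings declared above, e.g.
 *
 *     vm_reclaim_chunk_size=32 vm_reclaim_sampling_period_ns=5000000000
 *
 * On RELEASE kernels the set of writable tunables is narrower
 * (kReclaimChunkSize is const there).
 */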

#pragma mark Declarations
typedef struct proc *proc_t;
extern const char *proc_best_name(struct proc *);
extern void *proc_find(int pid);
extern task_t proc_task(proc_t);
extern kern_return_t kern_return_for_errno(int);
extern int mach_to_bsd_errno(kern_return_t kr);
extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code, mach_exception_data_type_t subcode);
struct proc *proc_ref(struct proc *p, int locked);
int proc_rele(proc_t p);

#define _vmdr_log_type(type, fmt, ...) os_log_with_type(vm_reclaim_log_handle, type, "vm_reclaim: " fmt, ##__VA_ARGS__)
#define vmdr_log(fmt, ...) _vmdr_log_type(OS_LOG_TYPE_DEFAULT, fmt, ##__VA_ARGS__)
#define vmdr_log_info(fmt, ...) _vmdr_log_type(OS_LOG_TYPE_INFO, fmt, ##__VA_ARGS__)
#define vmdr_log_error(fmt, ...) _vmdr_log_type(OS_LOG_TYPE_ERROR, fmt, ##__VA_ARGS__)
#if DEVELOPMENT || DEBUG
#define vmdr_log_debug(fmt, ...) \
MACRO_BEGIN \
if (os_unlikely(vm_reclaim_debug)) { \
	_vmdr_log_type(OS_LOG_TYPE_DEBUG, fmt, ##__VA_ARGS__); \
} \
MACRO_END
#else /* !(DEVELOPMENT || DEBUG) */
#define vmdr_log_debug(...)
#endif /* DEVELOPMENT || DEBUG */

static kern_return_t reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head);
static kern_return_t reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail);
static kern_return_t reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy);
static kern_return_t reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result);
#if CONFIG_WORKING_SET_ESTIMATION
static mach_error_t vmdr_sample_working_set(
	vm_deferred_reclamation_metadata_t metadata,
	mach_vm_size_t *trim_threshold_out,
	vm_deferred_reclamation_options_t options);
#endif
static void vmdr_metadata_release(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_list_append_locked(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_list_remove_locked(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_metadata_own(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_metadata_disown(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_garbage_collect(vm_deferred_reclamation_gc_action_t action,
    mach_vm_size_t *total_bytes_reclaimed_out,
    vm_deferred_reclamation_options_t options);
static kern_return_t reclaim_chunk(vm_deferred_reclamation_metadata_t metadata,
    uint64_t bytes_to_reclaim, uint64_t *bytes_reclaimed_out,
    mach_vm_reclaim_count_t chunk_size, mach_vm_reclaim_count_t *num_reclaimed_out);

struct vm_deferred_reclamation_metadata_s {
	/*
	 * Global list containing every reclamation buffer. Protected by the
	 * reclaim_buffers_lock.
	 */
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_list;
	/* Protects all struct fields (except denoted otherwise) */
	decl_lck_mtx_data(, vdrm_lock);
	/* Gate to be acquired when performing copyio on the user ring */
	decl_lck_mtx_gate_data(, vdrm_gate);
	/*
	 * The task owns this structure but we maintain a backpointer here
	 * so that we can send an exception if we hit an error.
	 * Since this is a backpointer we don't hold a reference (it's a weak pointer).
	 */
	task_t vdrm_task;
	pid_t vdrm_pid;
	vm_map_t vdrm_map;
	/*
	 * The owning task holds a ref on this object. When the task dies, it
	 * will set vdrm_task := NULL and drop its ref. Threads operating on the buffer
	 * should hold a +1 on the metadata structure to ensure its validity.
	 */
	os_refcnt_t vdrm_refcnt;
	/* The virtual address of the ringbuffer in the user map (immutable) */
	user_addr_t vdrm_ring_addr;
	/* The size of the VM allocation containing the ringbuffer (immutable) */
	mach_vm_size_t vdrm_ring_size;
	/* The length of the ringbuffer. This may be changed on buffer re-size */
	mach_vm_reclaim_count_t vdrm_buffer_len;
	/* Which GC epoch this buffer was last considered in */
	uint64_t vdrm_reclaimed_at;
	/*
	 * The number of threads waiting for a pending reclamation
	 * on this buffer to complete.
	 */
	uint32_t vdrm_waiters;
	/* timestamp (in mach absolute time) of the last working set sample for this ringbuffer */
	uint64_t vdrm_last_sample_abs;
	/*
	 * The number of bytes reclaimed by kernel GC since the last user
	 * accounting update. Protected by @c vdrm_gate.
	 */
	size_t vdrm_kernel_bytes_reclaimed;
	/*
	 * The last amount of reclaimable bytes reported to the kernel.
	 */
	uint64_t vdrm_reclaimable_bytes_last;
#if CONFIG_WORKING_SET_ESTIMATION
	/*
	 * Exponential moving average of the minimum reclaimable buffer size
	 * (in VMDR_WMA_UNITs). Protected by @c vdrm_gate.
	 */
	uint64_t vdrm_reclaimable_bytes_wma;
#endif /* CONFIG_WORKING_SET_ESTIMATION */
	/*
	 * Tracks whether or not this reclamation metadata has been added
	 * to the global list yet. Normally, this happens when it is allocated,
	 * except in the case of fork(). In this case, we have to duplicate the
	 * parent's metadata before it returns from fork(), but this occurs
	 * before the child's address space is set up.
	 */
	uint8_t vdrm_is_registered : 1,
	    __unused1 : 7;
};

#pragma mark Globals
static KALLOC_TYPE_DEFINE(vm_reclaim_metadata_zone, struct vm_deferred_reclamation_metadata_s, KT_DEFAULT);
static LCK_GRP_DECLARE(vm_reclaim_lock_grp, "vm_reclaim");
os_refgrp_decl(static, vm_reclaim_metadata_refgrp, "vm_reclaim_metadata_refgrp", NULL);
/*
 * The reclaim_buffers list contains every buffer in the system.
 * The reclaim_buffers_lock protects the reclaim_buffers list.
 * It must be held when iterating over the list or manipulating the list.
 * It should be dropped when acting on a specific metadata entry after acquiring the vdrm_lock.
 */
static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) reclaim_buffers = TAILQ_HEAD_INITIALIZER(reclaim_buffers);
LCK_MTX_DECLARE(reclaim_buffers_lock, &vm_reclaim_lock_grp);
/* Number of times Reclaim GC has run */
uint64_t vm_reclaim_gc_epoch = 0;
/* The number of reclamation actions (drains/trims) done during GC */
uint64_t vm_reclaim_gc_reclaim_count;
/* Gate for GC */
static decl_lck_mtx_gate_data(, vm_reclaim_gc_gate);
os_log_t vm_reclaim_log_handle;
/* Number of initialized reclaim buffers */
_Atomic uint32_t vm_reclaim_buffer_count;
uint64_t vm_reclaim_sampling_period_abs = 0;
static SECURITY_READ_ONLY_LATE(thread_t) vm_reclaim_scavenger_thread = THREAD_NULL;
static sched_cond_atomic_t vm_reclaim_scavenger_cond = SCHED_COND_INIT;

#pragma mark Buffer Initialization/Destruction

static vm_deferred_reclamation_metadata_t
vmdr_metadata_alloc(
	task_t                  task,
	user_addr_t             buffer,
	mach_vm_size_t          size,
	mach_vm_reclaim_count_t len)
{
	vm_deferred_reclamation_metadata_t metadata;
	vm_map_t map = task->map;

	assert(!map->is_nested_map);

	metadata = zalloc_flags(vm_reclaim_metadata_zone, Z_WAITOK | Z_ZERO);
	lck_mtx_init(&metadata->vdrm_lock, &vm_reclaim_lock_grp, LCK_ATTR_NULL);
	lck_mtx_gate_init(&metadata->vdrm_lock, &metadata->vdrm_gate);
	os_ref_init(&metadata->vdrm_refcnt, &vm_reclaim_metadata_refgrp);

	metadata->vdrm_task = task;
	metadata->vdrm_map = map;
	metadata->vdrm_ring_addr = buffer;
	metadata->vdrm_ring_size = size;
	metadata->vdrm_buffer_len = len;

	if (os_atomic_inc(&vm_reclaim_buffer_count, relaxed) == UINT32_MAX) {
		panic("Overflowed vm_reclaim_buffer_count");
	}

	/*
	 * we do not need to hold a lock on `task` because this is called
	 * either at fork() time or from the context of current_task().
	 */
	vm_map_reference(map);
	return metadata;
}

static void
vmdr_metadata_free(vm_deferred_reclamation_metadata_t metadata)
{
	vm_map_deallocate(metadata->vdrm_map);
	lck_mtx_gate_destroy(&metadata->vdrm_lock, &metadata->vdrm_gate);
	lck_mtx_destroy(&metadata->vdrm_lock, &vm_reclaim_lock_grp);
	zfree(vm_reclaim_metadata_zone, metadata);
	if (os_atomic_dec_orig(&vm_reclaim_buffer_count, relaxed) == 0) {
		panic("Underflowed vm_reclaim_buffer_count");
	}
}

static mach_vm_size_t
vmdr_round_len_to_size(vm_map_t map, mach_vm_reclaim_count_t count)
{
	mach_vm_size_t metadata_size = offsetof(struct mach_vm_reclaim_ring_s, entries);
	mach_vm_size_t entries_size = count * sizeof(struct mach_vm_reclaim_entry_s);
	return vm_map_round_page(metadata_size + entries_size, vm_map_page_mask(map));
}
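
/*
 * Illustrative sizing example (the concrete numbers are assumptions, not
 * ABI): with 16 KiB pages, a ring header of, say, 64 bytes and a 16-byte
 * mach_vm_reclaim_entry_s, a request for count = 1024 entries needs
 * 64 + 1024 * 16 = 16448 bytes, which vm_map_round_page() rounds up to two
 * 16 KiB pages (32768 bytes).
 */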

mach_error_t
vm_deferred_reclamation_buffer_allocate_internal(
	task_t                   task,
	mach_vm_address_ut       *address_u,
	uint64_t                 *sampling_period,
	mach_vm_reclaim_count_t  len,
	mach_vm_reclaim_count_t  max_len)
{
	kern_return_t kr;
	kern_return_t tmp_kr;
	vm_deferred_reclamation_metadata_t metadata = NULL;
	vm_map_t map;
	uint64_t head = 0, tail = 0, busy = 0;
	static bool reclaim_disabled_logged = false;

	if (task == TASK_NULL) {
		return KERN_INVALID_TASK;
	}
	if (address_u == NULL || sampling_period == NULL ||
	    len == 0 || max_len == 0 || max_len < len) {
		return KERN_INVALID_ARGUMENT;
	}
	map = task->map;
#if CONFIG_WORKING_SET_ESTIMATION
	if (!vm_reclaim_enabled) {
#else /* !CONFIG_WORKING_SET_ESTIMATION */
	if (!vm_reclaim_max_threshold) {
#endif /* CONFIG_WORKING_SET_ESTIMATION */
		if (!reclaim_disabled_logged) {
			/* Avoid logging failure for every new process */
			reclaim_disabled_logged = true;
			vmdr_log_error("failed to initialize deferred "
			    "reclamation buffer - vm_reclaim is disabled\n");
		}
		return VM_RECLAIM_NOT_SUPPORTED;
	}

	mach_vm_size_t rounded_vm_size = vmdr_round_len_to_size(map, max_len);
	if (rounded_vm_size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	if (rounded_vm_size > VM_RECLAIM_MAX_BUFFER_SIZE) {
		vmdr_log_error("denying request to allocate ringbuffer of size "
		    "%llu bytes (max %llu bytes)\n",
		    rounded_vm_size,
		    VM_RECLAIM_MAX_BUFFER_SIZE);
		return KERN_NO_SPACE;
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_START,
	    task_pid(task), len);

	/*
	 * Allocate a VM region that can contain the maximum buffer size. The
	 * allocation starts as VM_PROT_NONE and may be unprotected on buffer
	 * resize.
	 *
	 * TODO: If clients other than libmalloc adopt deferred reclaim, a
	 * different tag should be given
	 *
	 * `address` was sanitized under the assumption that we'll only use
	 * it as a hint (overflow checks were used) so we must pass the
	 * anywhere flag.
	 */
	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE_PERMANENT(
		.vm_tag = VM_MEMORY_MALLOC);
	mach_vm_size_ut size_u = vm_sanitize_wrap_size(rounded_vm_size);
	kr = mach_vm_map_kernel(map, address_u, size_u, VM_MAP_PAGE_MASK(map),
	    vmk_flags, IPC_PORT_NULL, 0, FALSE,
	    VM_PROT_DEFAULT, VM_PROT_DEFAULT, VM_INHERIT_COPY);
	if (kr != KERN_SUCCESS) {
		vmdr_log_error("%s [%d] failed to allocate VA for reclaim "
		    "buffer (%d)\n", task_best_name(task), task_pid(task), kr);
		return kr;
	}
	mach_vm_address_t address = VM_SANITIZE_UNSAFE_UNWRAP(*address_u);
	assert3u(address, !=, 0);

	metadata = vmdr_metadata_alloc(task, address, rounded_vm_size, len);
	metadata->vdrm_pid = task_pid(task);

	/*
	 * Validate the starting indices.
	 */
	kr = reclaim_copyin_busy(metadata, &busy);
	if (kr != KERN_SUCCESS) {
		goto out;
	}
	kr = reclaim_copyin_head(metadata, &head);
	if (kr != KERN_SUCCESS) {
		goto out;
	}
	kr = reclaim_copyin_tail(metadata, &tail);
	if (kr != KERN_SUCCESS) {
		goto out;
	}

	if (head != 0 || tail != 0 || busy != 0) {
		vmdr_log_error("indices were not "
		    "zero-initialized\n");
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	/*
	 * Publish the metadata to the task & global buffer list. This must be
	 * done under the task lock to synchronize with task termination - i.e.
	 * task_terminate_internal is guaranteed to see the published metadata and
	 * tear it down.
	 */
	lck_mtx_lock(&reclaim_buffers_lock);
	task_lock(task);

	if (!task_is_active(task) || task_is_halting(task)) {
		vmdr_log_error(
			"failed to initialize buffer on dying task %s [%d]",
			task_best_name(task), task_pid(task));
		kr = KERN_ABORTED;
		goto fail_task;
	}
	if (task->deferred_reclamation_metadata != NULL) {
		vmdr_log_error(
			"tried to overwrite existing reclaim buffer for %s [%d]", task_best_name(task), task_pid(task));
		kr = VM_RECLAIM_RESOURCE_SHORTAGE;
		goto fail_task;
	}

	metadata->vdrm_is_registered = true;
	vmdr_list_append_locked(metadata);
	task->deferred_reclamation_metadata = metadata;

	task_unlock(task);
	lck_mtx_unlock(&reclaim_buffers_lock);

	vmdr_log_debug("%s [%d] allocated ring with capacity %u/%u\n",
	    task_best_name(task), task_pid(task),
	    len, max_len);
	/* Report the active sampling period to userspace on success as well */
	*sampling_period = vm_reclaim_sampling_period_abs;
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END,
	    task_pid(task), KERN_SUCCESS, address);
	DTRACE_VM3(reclaim_ring_allocate,
	    mach_vm_address_t, address,
	    mach_vm_reclaim_count_t, len,
	    mach_vm_reclaim_count_t, max_len);
	return KERN_SUCCESS;

fail_task:
	task_unlock(task);
	lck_mtx_unlock(&reclaim_buffers_lock);

	tmp_kr = mach_vm_deallocate(map,
	    *address_u, size_u);
	assert(tmp_kr == KERN_SUCCESS);

out:
	*address_u = vm_sanitize_wrap_addr(0ull);
	*sampling_period = vm_reclaim_sampling_period_abs;
	vmdr_metadata_release(metadata);
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END,
	    kr, NULL);
	return kr;
}

#pragma mark Synchronization & Lifecycle

static inline void
vmdr_metadata_lock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_lock(&metadata->vdrm_lock);
}

static inline void
vmdr_metadata_unlock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_unlock(&metadata->vdrm_lock);
}

static inline void
vmdr_metadata_assert_owned_locked(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_gate_assert(&metadata->vdrm_lock, &metadata->vdrm_gate,
	    GATE_ASSERT_HELD);
}

static inline void
vmdr_metadata_assert_owned(vm_deferred_reclamation_metadata_t metadata)
{
#if MACH_ASSERT
	vmdr_metadata_lock(metadata);
	vmdr_metadata_assert_owned_locked(metadata);
	vmdr_metadata_unlock(metadata);
#else /* MACH_ASSERT */
	(void)metadata;
#endif /* MACH_ASSERT */
}

static bool
vmdr_metadata_try_own_locked(vm_deferred_reclamation_metadata_t metadata)
{
	kern_return_t kr = lck_mtx_gate_try_close(&metadata->vdrm_lock,
	    &metadata->vdrm_gate);
	return kr == KERN_SUCCESS;
}

/*
 * Take ownership of the buffer, waiting for a gate handoff if necessary.
 * Returns false only if RECLAIM_NO_WAIT is set and ownership could not be
 * taken immediately.
 */
static bool
vmdr_metadata_own_locked(vm_deferred_reclamation_metadata_t metadata,
    vm_deferred_reclamation_options_t options)
{
	__assert_only gate_wait_result_t wait_result;
	if (!vmdr_metadata_try_own_locked(metadata)) {
		if (options & RECLAIM_NO_WAIT) {
			return false;
		}
		wait_result = lck_mtx_gate_wait(
			&metadata->vdrm_lock, &metadata->vdrm_gate, LCK_SLEEP_DEFAULT,
			THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
		assert(wait_result == GATE_HANDOFF);
	}
	return true;
}

/*
 * Set the current thread as the owner of a reclaim buffer. May block. Will
 * propagate priority.
 */
static void
vmdr_metadata_own(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_lock(metadata);
	vmdr_metadata_own_locked(metadata, RECLAIM_OPTIONS_NONE);
	vmdr_metadata_unlock(metadata);
}

static void
vmdr_metadata_disown_locked(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_assert_owned_locked(metadata);
	lck_mtx_gate_handoff(&metadata->vdrm_lock, &metadata->vdrm_gate,
	    GATE_HANDOFF_OPEN_IF_NO_WAITERS);
}

/*
 * Release ownership of a reclaim buffer and wakeup any threads waiting for
 * ownership. Must be called from the thread that acquired ownership.
 */
static void
vmdr_metadata_disown(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_lock(metadata);
	vmdr_metadata_disown_locked(metadata);
	vmdr_metadata_unlock(metadata);
}
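
/*
 * Typical ownership pattern (a sketch of how the helpers above compose, not
 * a new API): a thread that wants to do copyio against the user ring takes
 * the gate, performs the ring work outside the mutex, then hands off:
 *
 *     vmdr_metadata_own(metadata);
 *     ... reclaim_chunk() / working-set sampling ...
 *     vmdr_metadata_disown(metadata);
 *
 * The vdrm_lock itself is only held long enough to manipulate the gate or
 * the struct fields; the gate serializes the long-running ring operations.
 */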

static void
vmdr_metadata_retain(vm_deferred_reclamation_metadata_t metadata)
{
	os_ref_retain(&metadata->vdrm_refcnt);
}

static void
vmdr_metadata_release(vm_deferred_reclamation_metadata_t metadata)
{
	if (os_ref_release(&metadata->vdrm_refcnt) == 0) {
		vmdr_metadata_free(metadata);
	}
}

static void
vmdr_list_remove_locked(vm_deferred_reclamation_metadata_t metadata)
{
	LCK_MTX_ASSERT(&reclaim_buffers_lock, LCK_MTX_ASSERT_OWNED);
	assert3p(metadata->vdrm_list.tqe_prev, !=, NULL);
	TAILQ_REMOVE(&reclaim_buffers, metadata, vdrm_list);
	metadata->vdrm_list.tqe_prev = NULL;
	metadata->vdrm_list.tqe_next = NULL;
}

static void
vmdr_list_append_locked(vm_deferred_reclamation_metadata_t metadata)
{
	LCK_MTX_ASSERT(&reclaim_buffers_lock, LCK_MTX_ASSERT_OWNED);
	assert3p(metadata->vdrm_list.tqe_prev, ==, NULL);
	TAILQ_INSERT_TAIL(&reclaim_buffers, metadata, vdrm_list);
}

void
vm_deferred_reclamation_buffer_deallocate(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	/*
	 * First remove the buffer from the global list so no one else can get access to it.
	 */
	lck_mtx_lock(&reclaim_buffers_lock);
	if (metadata->vdrm_is_registered) {
		vmdr_list_remove_locked(metadata);
	}
	lck_mtx_unlock(&reclaim_buffers_lock);

	/*
	 * The task is dropping its ref on this buffer. First remove the buffer's
	 * back-reference to the task so that any threads currently operating on
	 * this buffer do not try to operate on the dead/dying task
	 */
	vmdr_metadata_lock(metadata);
	assert3p(metadata->vdrm_task, !=, TASK_NULL);
	metadata->vdrm_task = TASK_NULL;
	vmdr_metadata_unlock(metadata);
	vmdr_metadata_release(metadata);
}

#pragma mark Exception Delivery

static void
reclaim_kill_with_reason(
	vm_deferred_reclamation_metadata_t metadata,
	unsigned reason,
	mach_exception_data_type_t subcode)
{
	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
	mach_exception_code_t code = 0;
	task_t task;
	proc_t p = NULL;
	boolean_t fatal = TRUE;
	bool killing_self;
	pid_t pid;
	int err;

	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_NOTOWNED);

	EXC_GUARD_ENCODE_TYPE(code, guard_type);
	EXC_GUARD_ENCODE_FLAVOR(code, reason);
	EXC_GUARD_ENCODE_TARGET(code, 0);

	vmdr_metadata_lock(metadata);
	task = metadata->vdrm_task;
	if (task == TASK_NULL || !task_is_active(task) || task_is_halting(task)) {
		/* Task is no longer alive */
		vmdr_metadata_unlock(metadata);
		vmdr_log_error(
			"Unable to deliver guard exception because task "
			"[%d] is already dead.\n",
			metadata->vdrm_pid);
		return;
	}

	if (panic_on_kill) {
		panic("About to kill %p due to %d with subcode %lld\n", task, reason, subcode);
	}

	killing_self = (task == current_task());
	if (!killing_self) {
		task_reference(task);
	}
	assert(task != kernel_task);
	vmdr_metadata_unlock(metadata);

	if (reason == kGUARD_EXC_DEALLOC_GAP) {
		task_lock(task);
		fatal = (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL);
		task_unlock(task);
	}

	if (!fatal) {
		vmdr_log_info(
			"Skipping non-fatal guard exception for %s [%d]\n",
			task_best_name(task), task_pid(task));
		goto out;
	}

	pid = task_pid(task);
	if (killing_self) {
		p = get_bsdtask_info(task);
	} else {
		p = proc_find(pid);
		if (p && proc_task(p) != task) {
			vmdr_log_error(
				"Unable to deliver guard exception because proc is gone & pid rolled over.\n");
			goto out;
		}
	}

	if (!p) {
		vmdr_log_error(
			"Unable to deliver guard exception because task does not have a proc.\n");
		goto out;
	}

	int flags = PX_DEBUG_NO_HONOR;
	exception_info_t info = {
		.os_reason = OS_REASON_GUARD,
		.exception_type = EXC_GUARD,
		.mx_code = code,
		.mx_subcode = subcode
	};

	vmdr_log("Force-exiting %s [%d]\n", task_best_name(task), task_pid(task));

	err = exit_with_mach_exception(p, info, flags);
	if (err != 0) {
		vmdr_log_error("Unable to deliver guard exception to %p: %d\n", p, err);
		goto out;
	}

out:
	if (!killing_self) {
		if (p) {
			proc_rele(p);
			p = NULL;
		}
		if (task) {
			task_deallocate(task);
			task = NULL;
		}
	}
}

#pragma mark Copy I/O

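/*
 * Ring layout, as assumed by the offset helpers below: the user-visible
 * struct mach_vm_reclaim_ring_s starts with the head/busy/tail indices and
 * accounting words, followed by a flexible array of
 * struct mach_vm_reclaim_entry_s:
 *
 *     [ head | busy | tail | reclaimable_bytes | ... | entries[0..len) ]
 *
 * head and busy are written by the kernel, tail by userspace; the indices
 * grow monotonically and are taken modulo vdrm_buffer_len to address
 * entries. (Field ordering here is illustrative; the authoritative
 * definition lives in mach/vm_reclaim_private.h.)
 */
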
static user_addr_t
get_entries_ptr(vm_deferred_reclamation_metadata_t metadata)
{
	return metadata->vdrm_ring_addr +
	       offsetof(struct mach_vm_reclaim_ring_s, entries);
}

static user_addr_t
get_head_ptr(vm_deferred_reclamation_metadata_t metadata)
{
	return metadata->vdrm_ring_addr + offsetof(struct mach_vm_reclaim_ring_s, head);
}

static user_addr_t
get_tail_ptr(vm_deferred_reclamation_metadata_t metadata)
{
	return metadata->vdrm_ring_addr + offsetof(struct mach_vm_reclaim_ring_s, tail);
}

static user_addr_t
get_busy_ptr(vm_deferred_reclamation_metadata_t metadata)
{
	return metadata->vdrm_ring_addr + offsetof(struct mach_vm_reclaim_ring_s, busy);
}

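/*
 * Copy I/O failures are fatal for the client unless the fault was an
 * expected one: under RECLAIM_NO_FAULT (vm_fault_disable()), EFAULT simply
 * means the ring is currently paged out, and the caller backs off rather
 * than killing the task.
 */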
static kern_return_t
reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result)
{
	if (result != 0 && (result != EFAULT || !vm_fault_get_disabled())) {
		vmdr_log_error("Killing [%d] due to copy I/O error\n", metadata->vdrm_pid);
		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_COPYIO_FAILURE,
		    result);
	}
	return kern_return_for_errno(result);
}

/*
 * Helper functions to do copyio on the head, tail, and busy pointers.
 * Note that the kernel will only write to the busy and head pointers.
 * Userspace is not supposed to write to the head or busy pointers, but the kernel
 * must be resilient to that kind of bug in userspace.
 */

static kern_return_t
reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head)
{
	int result;
	kern_return_t kr;
	user_addr_t head_ptr = get_head_ptr(metadata);

	result = copyin_atomic64(head_ptr, head);
	kr = reclaim_handle_copyio_error(metadata, result);
	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
		vmdr_log_error(
			"Unable to copy head ptr from 0x%llx: err=%d\n", head_ptr, result);
	}
	return kr;
}

static kern_return_t
reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail)
{
	int result;
	kern_return_t kr;
	user_addr_t tail_ptr = get_tail_ptr(metadata);

	result = copyin_atomic64(tail_ptr, tail);
	kr = reclaim_handle_copyio_error(metadata, result);
	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
		vmdr_log_error(
			"Unable to copy tail ptr from 0x%llx: err=%d\n", tail_ptr, result);
	}
	return kr;
}

static kern_return_t
reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy)
{
	int result;
	kern_return_t kr;
	user_addr_t busy_ptr = get_busy_ptr(metadata);

	result = copyin_atomic64(busy_ptr, busy);
	kr = reclaim_handle_copyio_error(metadata, result);
	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
		vmdr_log_error(
			"Unable to copy busy ptr from 0x%llx: err=%d\n", busy_ptr, result);
	}
	return kr;
}

static kern_return_t
reclaim_copyin_reclaimable_bytes(vm_deferred_reclamation_metadata_t metadata, size_t *reclaimable_bytes_out)
{
	int result;
	kern_return_t kr = KERN_SUCCESS;
	uint64_t reclaimable_bytes;
	user_addr_t ptr = (uintptr_t)metadata->vdrm_ring_addr +
	    offsetof(struct mach_vm_reclaim_ring_s, reclaimable_bytes);

	result = copyin_atomic64(ptr, &reclaimable_bytes);
	if (result) {
		kr = reclaim_handle_copyio_error(metadata, result);
		if (result != EFAULT || !vm_fault_get_disabled()) {
			vmdr_log_error("Unable to copyin reclaimable byte count err=%d\n", result);
		}
	} else {
		*reclaimable_bytes_out = (size_t)reclaimable_bytes;
	}
	return kr;
}

#if CONFIG_WORKING_SET_ESTIMATION
static kern_return_t
reclaim_copyin_min_reclaimable_bytes(vm_deferred_reclamation_metadata_t metadata, size_t *min_reclaimable_bytes_out)
{
	int result;
	kern_return_t kr = KERN_SUCCESS;
	uint64_t min_reclaimable_bytes;
	user_addr_t ptr = (uintptr_t)metadata->vdrm_ring_addr +
	    offsetof(struct mach_vm_reclaim_ring_s, reclaimable_bytes_min);

	result = copyin_atomic64(ptr, &min_reclaimable_bytes);
	if (result) {
		kr = reclaim_handle_copyio_error(metadata, result);
		if (result != EFAULT || !vm_fault_get_disabled()) {
			vmdr_log_error("Unable to copyin min reclaimable byte count err=%d\n", result);
		}
	} else {
		*min_reclaimable_bytes_out = (size_t)min_reclaimable_bytes;
	}
	return kr;
}
#endif /* CONFIG_WORKING_SET_ESTIMATION */

static kern_return_t
reclaim_copyout_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	kern_return_t kr = KERN_SUCCESS;
	user_addr_t busy_ptr = get_busy_ptr(metadata);

	result = copyout_atomic64(value, busy_ptr);
	if (result) {
		kr = reclaim_handle_copyio_error(metadata, result);
		if (result != EFAULT || !vm_fault_get_disabled()) {
			vmdr_log_error(
				"Unable to copy %llu to busy ptr at 0x%llx: err=%d\n", value, busy_ptr, result);
		}
	}
	return kr;
}

static kern_return_t
reclaim_copyout_head(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	kern_return_t kr = KERN_SUCCESS;
	user_addr_t head_ptr = get_head_ptr(metadata);

	result = copyout_atomic64(value, head_ptr);
	if (result) {
		kr = reclaim_handle_copyio_error(metadata, result);
		if (result != EFAULT || !vm_fault_get_disabled()) {
			vmdr_log_error(
				"Unable to copy %llu to head ptr at 0x%llx: err=%d\n", value, head_ptr, result);
		}
	}
	return kr;
}

#if CONFIG_WORKING_SET_ESTIMATION
static kern_return_t
reclaim_copyout_min_reclaimable_bytes(vm_deferred_reclamation_metadata_t metadata, size_t min_reclaimable_bytes)
{
	int result;
	kern_return_t kr = KERN_SUCCESS;
	user_addr_t ptr = (uintptr_t)metadata->vdrm_ring_addr +
	    offsetof(struct mach_vm_reclaim_ring_s, reclaimable_bytes_min);

	result = copyout_atomic64(min_reclaimable_bytes, ptr);
	if (result) {
		kr = reclaim_handle_copyio_error(metadata, result);
		if (result != EFAULT || !vm_fault_get_disabled()) {
			vmdr_log_error("Unable to copyout min reclaimable byte count err=%d\n", result);
		}
	}
	return kr;
}
#endif /* CONFIG_WORKING_SET_ESTIMATION */

#pragma mark Reclamation

/*
 * @func reclaim_chunk
 *
 * @brief
 * Reclaim a batch of entries from the buffer.
 *
 * @param bytes_to_reclaim
 * Number of bytes caller wishes to reclaim from the buffer
 *
 * @param bytes_reclaimed_out
 * The number of bytes reclaimed from the buffer written out
 *
 * @param chunk_size
 * The maximum number of entries to hold busy and reclaim from (must
 * be <= kReclaimChunkSize)
 *
 * @param num_reclaimed_out
 * The number of entries reclaimed written out
 *
 * @discussion
 * If the buffer has been exhausted of entries (tail == head),
 * num_reclaimed_out will be zero. It is important that the caller abort any
 * loops if such a condition is met.
 */
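/*
 * Worked example of the busy/head/tail protocol below (numbers are
 * illustrative): with head = 4, tail = 10 and chunk_size = 16, the kernel
 * publishes busy = 4 + min(10 - 4, 16) = 10, re-reads tail to detect a
 * concurrent cancellation, copies entries[4 % len .. 10 % len) in, frees or
 * deallocates each range, then stores head = 10 and resets busy to match.
 * If userspace had meanwhile moved tail back below head (an entry
 * cancellation), the kernel resets busy to head and returns KERN_ABORTED
 * instead.
 */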
static kern_return_t
reclaim_chunk(vm_deferred_reclamation_metadata_t metadata,
    uint64_t bytes_to_reclaim, uint64_t *bytes_reclaimed_out,
    mach_vm_reclaim_count_t chunk_size, mach_vm_reclaim_count_t *num_reclaimed_out)
{
	kern_return_t kr = KERN_SUCCESS;
	int result = 0;
	mach_vm_reclaim_count_t num_reclaimed = 0, num_copied = 0;
	uint64_t bytes_reclaimed = 0;
	uint64_t head = 0, tail = 0, busy = 0, num_to_reclaim = 0, new_tail = 0;
	vm_map_t map = metadata->vdrm_map;
	vm_map_switch_context_t switch_ctx;
	struct mach_vm_reclaim_entry_s copied_entries[kReclaimChunkSize];

	assert(metadata != NULL);
	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_NOTOWNED);
	vmdr_metadata_assert_owned(metadata);

	assert(chunk_size <= kReclaimChunkSize);

	KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_START,
	    metadata->vdrm_pid, bytes_to_reclaim);

	memset(copied_entries, 0, sizeof(copied_entries));

	switch_ctx = vm_map_switch_to(map);

	kr = reclaim_copyin_busy(metadata, &busy);
	if (kr != KERN_SUCCESS) {
		goto done;
	}
	kr = reclaim_copyin_head(metadata, &head);
	if (kr != KERN_SUCCESS) {
		goto done;
	}
	kr = reclaim_copyin_tail(metadata, &tail);
	if (kr != KERN_SUCCESS) {
		goto done;
	}

	/*
	 * NB: busy may not be exactly equal to head if the jetsam
	 * thread fails to fault on the indices after having marked
	 * entries busy
	 */
	if (busy < head || (busy - head) > kReclaimChunkSize) {
		vmdr_log_error(
			"Userspace modified head or busy pointer! head: %llu "
			"(0x%llx) | busy: %llu (0x%llx) | tail = %llu (0x%llx)\n",
			head, get_head_ptr(metadata), busy, get_busy_ptr(metadata), tail,
			get_tail_ptr(metadata));
		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_INDEX_FAILURE,
		    busy);
		kr = KERN_FAILURE;
		goto done;
	}

	if (tail < head) {
		/*
		 * Userspace is likely in the middle of trying to re-use an entry,
		 * bail on this reclamation.
		 */
		vmdr_log_error(
			"Tail < head! Userspace is likely attempting a "
			"cancellation; aborting reclamation | head: %llu "
			"(0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
			head, get_head_ptr(metadata), tail, get_tail_ptr(metadata), busy,
			get_busy_ptr(metadata));
		kr = KERN_ABORTED;
		goto done;
	}

	/*
	 * NB: If any of the copyouts below fail due to faults being disabled,
	 * the buffer may be left in a state where several entries are unusable
	 * until the next reclamation (i.e. busy > head)
	 */
	num_to_reclaim = tail - head;
	while (true) {
		num_to_reclaim = MIN(num_to_reclaim, chunk_size);
		if (num_to_reclaim == 0) {
			break;
		}
		busy = head + num_to_reclaim;
		kr = reclaim_copyout_busy(metadata, busy);
		if (kr != KERN_SUCCESS) {
			goto done;
		}
		os_atomic_thread_fence(seq_cst);
		kr = reclaim_copyin_tail(metadata, &new_tail);
		if (kr != KERN_SUCCESS) {
			goto done;
		}

		if (new_tail >= busy) {
			/* Got num_to_reclaim entries */
			break;
		}
		tail = new_tail;
		if (tail < head) {
			/*
			 * Userspace is likely in the middle of trying to re-use an entry,
			 * bail on this reclamation
			 */
			vmdr_log_error(
				"Tail < head! Userspace is likely attempting a "
				"cancellation; aborting reclamation | head: %llu "
				"(0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
				head, get_head_ptr(metadata), tail, get_tail_ptr(metadata), busy,
				get_busy_ptr(metadata));
			/* Reset busy back to head */
			reclaim_copyout_busy(metadata, head);
			kr = KERN_ABORTED;
			goto done;
		}
		/* Can't reclaim these entries. Try again */
		num_to_reclaim = tail - head;
		if (num_to_reclaim == 0) {
			/* Nothing left to reclaim. Reset busy to head. */
			kr = reclaim_copyout_busy(metadata, head);
			if (kr != KERN_SUCCESS) {
				goto done;
			}
			break;
		}
		/*
		 * Note that num_to_reclaim must have gotten smaller since tail got smaller,
		 * so this is guaranteed to converge.
		 */
	}
	vmdr_log_debug("[%d] reclaiming up to %llu entries (%llu B) head=%llu "
	    "busy=%llu tail=%llu len=%u", metadata->vdrm_pid, num_to_reclaim,
	    bytes_reclaimed, head, busy, tail, metadata->vdrm_buffer_len);

	uint64_t memcpy_start_idx = head % metadata->vdrm_buffer_len;
	while (num_copied < num_to_reclaim) {
		uint64_t memcpy_end_idx = memcpy_start_idx + num_to_reclaim - num_copied;
		// Clamp the end idx to the buffer. We'll handle wrap-around in our next go around the loop.
		memcpy_end_idx = MIN(memcpy_end_idx, metadata->vdrm_buffer_len);
		uint64_t num_to_copy = memcpy_end_idx - memcpy_start_idx;

		assert(num_to_copy + num_copied <= kReclaimChunkSize);
		user_addr_t src_ptr = get_entries_ptr(metadata) +
		    (memcpy_start_idx * sizeof(struct mach_vm_reclaim_entry_s));
		struct mach_vm_reclaim_entry_s *dst_ptr = copied_entries + num_copied;
		result = copyin(src_ptr, dst_ptr,
		    (num_to_copy * sizeof(struct mach_vm_reclaim_entry_s)));
		kr = reclaim_handle_copyio_error(metadata, result);
		if (kr != KERN_SUCCESS) {
			if (kr != KERN_MEMORY_ERROR || !vm_fault_get_disabled()) {
				vmdr_log_error(
					"Unable to copyin %llu entries in reclaim "
					"buffer at 0x%llx to 0x%llx: err=%d\n",
					num_to_copy, src_ptr, (uint64_t) dst_ptr, result);
			}
			goto done;
		}

		num_copied += num_to_copy;
		memcpy_start_idx = (memcpy_start_idx + num_to_copy) % metadata->vdrm_buffer_len;
	}

	for (num_reclaimed = 0; num_reclaimed < num_to_reclaim && bytes_reclaimed < bytes_to_reclaim; num_reclaimed++) {
		mach_vm_reclaim_entry_t entry = &copied_entries[num_reclaimed];
		KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_START,
		    metadata->vdrm_pid, entry->address, entry->size,
		    entry->behavior);
		if (entry->address != 0 && entry->size != 0) {
			vm_map_address_t start = vm_map_trunc_page(entry->address,
			    VM_MAP_PAGE_MASK(map));
			vm_map_address_t end = vm_map_round_page(entry->address + entry->size,
			    VM_MAP_PAGE_MASK(map));
			DTRACE_VM4(vm_reclaim_entry,
			    pid_t, metadata->vdrm_pid,
			    mach_vm_address_t, entry->address,
			    mach_vm_address_t, end,
			    mach_vm_reclaim_action_t, entry->behavior);
			KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_START,
			    metadata->vdrm_pid, start, end,
			    entry->behavior);
			vmdr_log_debug("[%d] Reclaiming entry %llu (0x%llx, 0x%llx)\n", metadata->vdrm_pid, head + num_reclaimed, start, end);
			switch (entry->behavior) {
			case VM_RECLAIM_DEALLOCATE:
				kr = vm_map_remove_guard(map,
				    start, end, VM_MAP_REMOVE_GAPS_FAIL,
				    KMEM_GUARD_NONE).kmr_return;
				if (kr == KERN_INVALID_VALUE) {
					vmdr_log_error(
						"[%d] Killing due to virtual-memory guard at (0x%llx, 0x%llx)\n",
						metadata->vdrm_pid, start, end);
					reclaim_kill_with_reason(metadata, kGUARD_EXC_DEALLOC_GAP, entry->address);
					goto done;
				} else if (kr != KERN_SUCCESS) {
					vmdr_log_error(
						"[%d] Killing due to deallocation failure at (0x%llx, 0x%llx) err=%d\n",
						metadata->vdrm_pid, start, end, kr);
					reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
					goto done;
				}
				break;
			case VM_RECLAIM_FREE:
				/*
				 * TODO: This should free the backing pages directly instead of using
				 * VM_BEHAVIOR_REUSABLE, which will mark the pages as clean and let them
				 * age in the LRU.
				 */
				kr = vm_map_behavior_set(map, start,
				    end, VM_BEHAVIOR_REUSABLE);
				if (kr != KERN_SUCCESS) {
					vmdr_log_error(
						"[%d] Failed to free(reusable) (0x%llx, 0x%llx) err=%d\n",
						metadata->vdrm_pid, start, end, kr);
				}
				break;
			default:
				vmdr_log_error(
					"attempted to reclaim entry with unsupported behavior %u\n",
					entry->behavior);
				reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
				kr = KERN_INVALID_VALUE;
				goto done;
			}
			bytes_reclaimed += entry->size;
			KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_END,
			    kr);
		}
	}

	assert(head + num_reclaimed <= busy);
	head += num_reclaimed;
	kr = reclaim_copyout_head(metadata, head);
	if (kr != KERN_SUCCESS) {
		goto done;
	}
	if (busy > head) {
		busy = head;
		kr = reclaim_copyout_busy(metadata, busy);
		if (kr != KERN_SUCCESS) {
			goto done;
		}
	}

done:
	vmdr_log_debug("[%d] reclaimed %u entries (%llu B) head=%llu "
	    "busy=%llu tail=%llu len=%u", metadata->vdrm_pid, num_reclaimed,
	    bytes_reclaimed, head, busy, tail, metadata->vdrm_buffer_len);
	vm_map_switch_back(switch_ctx);
	KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_END,
	    bytes_reclaimed, num_reclaimed, kr);
	if (bytes_reclaimed_out) {
		*bytes_reclaimed_out = bytes_reclaimed;
	}
	if (num_reclaimed_out) {
		*num_reclaimed_out = num_reclaimed;
	}
	return kr;
}

/*
 * @func vmdr_reclaim_from_buffer
 *
 * @brief
 * Reclaim entries from the buffer until at least @c bytes_to_reclaim bytes
 * have been reclaimed or the ring is empty.
 *
 * @param bytes_to_reclaim
 * The minimum number of bytes to reclaim
 *
 * @param num_bytes_reclaimed_out
 * The number of bytes reclaimed written out
 *
 * @param options
 * If RECLAIM_NO_FAULT is set, do not fault on the buffer if it has been paged
 * out.
 *
 * @discussion
 * The buffer must be owned by the caller.
 */
static kern_return_t
vmdr_reclaim_from_buffer(vm_deferred_reclamation_metadata_t metadata,
    mach_vm_size_t bytes_to_reclaim, mach_vm_size_t *num_bytes_reclaimed_out,
    vm_deferred_reclamation_options_t options)
{
	kern_return_t kr = KERN_SUCCESS;

	if (options & RECLAIM_NO_FAULT) {
		vm_fault_disable();
	}

	mach_vm_size_t total_bytes_reclaimed = 0;
	while (total_bytes_reclaimed < bytes_to_reclaim) {
		mach_vm_size_t cur_bytes_reclaimed;
		mach_vm_reclaim_count_t entries_reclaimed;
		kr = reclaim_chunk(metadata, bytes_to_reclaim - total_bytes_reclaimed,
		    &cur_bytes_reclaimed, kReclaimChunkSize, &entries_reclaimed);
		total_bytes_reclaimed += cur_bytes_reclaimed;
		if (entries_reclaimed == 0 || kr != KERN_SUCCESS) {
			break;
		}
	}

	if (options & RECLAIM_NO_FAULT) {
		vm_fault_enable();
	}
	vmdr_log_debug("reclaimed %llu B / %llu B from %d\n", total_bytes_reclaimed, bytes_to_reclaim, metadata->vdrm_pid);
	if (num_bytes_reclaimed_out) {
		*num_bytes_reclaimed_out = total_bytes_reclaimed;
	}
	return kr;
}

/*
 * Get and retain the reclamation metadata buffer for the given task.
 */
static vm_deferred_reclamation_metadata_t
vmdr_acquire_task_metadata(task_t task)
{
	vm_deferred_reclamation_metadata_t meta = NULL;
	assert(task != NULL);
	task_lock(task);
	if (!task_is_halting(task) && task_is_active(task)) {
		meta = task->deferred_reclamation_metadata;
	}
	if (meta != NULL) {
		vmdr_metadata_retain(meta);
	}
	task_unlock(task);
	return meta;
}

#pragma mark Buffer Resize/Synchronization

kern_return_t
vm_deferred_reclamation_buffer_flush_internal(task_t task,
    mach_vm_reclaim_count_t num_entries_to_reclaim,
    mach_vm_size_t *bytes_reclaimed_out)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_deferred_reclamation_metadata_t metadata = NULL;
	mach_vm_reclaim_count_t total_reclaimed = 0;
	uint64_t bytes_reclaimed = 0;

	if (!task_is_active(task)) {
		return KERN_INVALID_TASK;
	}

	metadata = vmdr_acquire_task_metadata(task);
	if (metadata == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	vmdr_metadata_own(metadata);

	vmdr_log_debug("[%d] flushing %u entries\n", task_pid(task), num_entries_to_reclaim);
	KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_FLUSH) | DBG_FUNC_START, metadata->vdrm_pid, num_entries_to_reclaim);

	while (total_reclaimed < num_entries_to_reclaim) {
		mach_vm_reclaim_count_t cur_reclaimed;
		uint64_t cur_bytes_reclaimed;
		mach_vm_reclaim_count_t chunk_size = MIN(num_entries_to_reclaim - total_reclaimed, kReclaimChunkSize);
		kr = reclaim_chunk(metadata, UINT64_MAX, &cur_bytes_reclaimed, chunk_size,
		    &cur_reclaimed);
		total_reclaimed += cur_reclaimed;
		bytes_reclaimed += cur_bytes_reclaimed;
		if (cur_reclaimed == 0) {
			break;
		} else if (kr == KERN_ABORTED) {
			/*
			 * Unable to reclaim due to a lost race with
			 * userspace, yield the gate and try again
			 */
			vmdr_metadata_disown(metadata);
			vmdr_metadata_own(metadata);
			continue;
		} else if (kr != KERN_SUCCESS) {
			break;
		}
	}
	/*
	 * Tell the client how many bytes the kernel has reclaimed
	 * since the last time it updated its accounting
	 */
	bytes_reclaimed += metadata->vdrm_kernel_bytes_reclaimed;
	metadata->vdrm_kernel_bytes_reclaimed = 0;

	vmdr_metadata_disown(metadata);

	*bytes_reclaimed_out = bytes_reclaimed;
	KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_FLUSH) | DBG_FUNC_END, kr, total_reclaimed, bytes_reclaimed);
	DTRACE_VM2(reclaim_flush,
	    mach_vm_reclaim_count_t, num_entries_to_reclaim,
	    size_t, bytes_reclaimed);
	return kr;
}

kern_return_t
vm_deferred_reclamation_buffer_resize_internal(
	task_t                   task,
	mach_vm_reclaim_count_t len,
	mach_vm_size_t *bytes_reclaimed_out)
{
	kern_return_t kr;
	mach_vm_reclaim_count_t num_entries_reclaimed = 0;
	mach_vm_reclaim_count_t old_len;

	if (task == TASK_NULL) {
		return KERN_INVALID_TASK;
	}
	if (len == 0) {
		return KERN_INVALID_ARGUMENT;
	}
	vm_deferred_reclamation_metadata_t metadata = vmdr_acquire_task_metadata(task);
	if (metadata == NULL) {
		return KERN_INVALID_TASK;
	}

	/* Size must be multiple of page size */
	vm_map_t map = task->map;
	mach_vm_size_t new_size = vmdr_round_len_to_size(map, len);
	if (new_size == 0) {
		vmdr_metadata_release(metadata);
		return KERN_INVALID_ARGUMENT;
	}
	if (new_size > metadata->vdrm_ring_size) {
		vmdr_metadata_release(metadata);
		return KERN_NO_SPACE;
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_RESIZE) | DBG_FUNC_START,
	    task_pid(task), new_size);

	/*
	 * Prevent other threads from operating on this buffer while it is
	 * resized. It is the caller's responsibility to ensure mutual
	 * exclusion with other user threads
	 */
	vmdr_metadata_own(metadata);

	old_len = metadata->vdrm_buffer_len;

	vmdr_log_debug("%s [%d] resizing buffer %u -> %u entries\n",
	    task_best_name(task), task_pid(task), old_len, len);

	/*
	 * Reclaim all the entries currently in the buffer to prevent re-use
	 * of old reclaim ids that will alias differently into the newly sized
	 * buffer.
	 *
	 * TODO: Consider encoding the ringbuffer-capacity in the
	 * mach_vm_reclaim_id_t, so reuses can still find objects after a resize.
	 */
	mach_vm_size_t total_bytes_reclaimed = 0;
	do {
		mach_vm_size_t cur_bytes_reclaimed;
		kr = reclaim_chunk(metadata, UINT64_MAX, &cur_bytes_reclaimed, kReclaimChunkSize,
		    &num_entries_reclaimed);
		total_bytes_reclaimed += cur_bytes_reclaimed;
		if (kr != KERN_SUCCESS) {
			goto fail;
		}
	} while (num_entries_reclaimed > 0);

	vmdr_log_debug("[%d] successfully resized buffer | reclaimed: %llu B "
	    "kernel_reclaimed: %zu B\n", metadata->vdrm_pid,
	    total_bytes_reclaimed, metadata->vdrm_kernel_bytes_reclaimed);

	total_bytes_reclaimed += metadata->vdrm_kernel_bytes_reclaimed;
	metadata->vdrm_kernel_bytes_reclaimed = 0;

	/* Publish the new ring length in the kernel metadata */
	vmdr_metadata_lock(metadata);
	metadata->vdrm_buffer_len = len;
	vmdr_metadata_disown_locked(metadata);
	vmdr_metadata_unlock(metadata);
	vmdr_metadata_release(metadata);

	*bytes_reclaimed_out = total_bytes_reclaimed;

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_RESIZE) | DBG_FUNC_END, KERN_SUCCESS, num_entries_reclaimed, total_bytes_reclaimed);
	DTRACE_VM2(reclaim_ring_resize,
	    mach_vm_reclaim_count_t, old_len,
	    mach_vm_reclaim_count_t, len);
	return KERN_SUCCESS;

fail:
	vmdr_metadata_disown(metadata);
	vmdr_metadata_release(metadata);
	*bytes_reclaimed_out = total_bytes_reclaimed;
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_RESIZE) | DBG_FUNC_END, kr, num_entries_reclaimed);
	return kr;
}

#pragma mark Accounting

#if CONFIG_WORKING_SET_ESTIMATION
extern vm_pressure_level_t memorystatus_vm_pressure_level;

static kern_return_t
vmdr_calculate_autotrim_threshold(vm_deferred_reclamation_metadata_t metadata, size_t *trim_threshold_out)
{
	kern_return_t kr;
	uint32_t autotrim_pct;

	/*
	 * Determine the autotrim threshold based on the current pressure level
	 */
	vm_pressure_level_t pressure_level = os_atomic_load(&memorystatus_vm_pressure_level, relaxed);
	switch (pressure_level) {
	case kVMPressureNormal:
		autotrim_pct = vm_reclaim_autotrim_pct_normal;
		break;
	case kVMPressureWarning:
	case kVMPressureUrgent:
		autotrim_pct = vm_reclaim_autotrim_pct_pressure;
		break;
	case kVMPressureCritical:
		autotrim_pct = vm_reclaim_autotrim_pct_critical;
		break;
	default:
		panic("vm_reclaim: unexpected vm_pressure_level %d", pressure_level);
	}

	/*
	 * Estimate the task's maximum working set size
	 */
	ledger_amount_t phys_footprint_max = 0;

	vmdr_metadata_lock(metadata);
	task_t task = metadata->vdrm_task;
	if (task == TASK_NULL) {
		vmdr_metadata_unlock(metadata);
		return KERN_INVALID_TASK;
	}
	task_reference(task);
	vmdr_metadata_unlock(metadata);

	kr = ledger_get_lifetime_max(task->ledger,
	    task_ledgers.phys_footprint, &phys_footprint_max);
	assert3u(kr, ==, KERN_SUCCESS);

	task_deallocate(task);

	*trim_threshold_out = phys_footprint_max * autotrim_pct / 100;
	return KERN_SUCCESS;
}

#define VMDR_WMA_UNIT (1 << 8)
#define VMDR_WMA_MIX(base, e)  ((vm_reclaim_wma_weight_base * (base) + (e) * VMDR_WMA_UNIT * vm_reclaim_wma_weight_cur) / vm_reclaim_wma_denom)
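
/*
 * Worked example with the default tunables (weight_base = 3, weight_cur = 1,
 * denom = 4, VMDR_WMA_UNIT = 256): the average is stored pre-scaled by
 * VMDR_WMA_UNIT, so mixing a prior average representing 8 MiB with a new
 * per-period minimum of 4 MiB yields
 *
 *     (3 * (8 MiB * 256) + (4 MiB * 256)) / 4 = 7 MiB * 256
 *
 * i.e. the estimate decays 1/4 of the way toward the new sample each
 * sampling period, in integer arithmetic with no floating point.
 */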
#endif /* CONFIG_WORKING_SET_ESTIMATION */

/*
 * @func vmdr_sample_working_set
 *
 * @brief sample the working set size of the given buffer
 *
 * @param metadata
 * The reclaim buffer to sample
 *
 * @param trim_threshold_out
 * If the buffer should be trimmed, the amount to trim (in bytes) will be
 * written out
 *
 * @returns KERN_MEMORY_ERROR if copyio failed due to RECLAIM_NO_FAULT
 *
 * @discussion
 * The caller must own the buffer
 */
1508 static mach_error_t
1509 vmdr_sample_working_set(vm_deferred_reclamation_metadata_t metadata,
1510     mach_vm_size_t *trim_threshold_out, vm_deferred_reclamation_options_t options)
1511 {
1512 	mach_error_t err = ERR_SUCCESS;
1513 	size_t min_reclaimable_bytes = 0, cur_reclaimable_bytes = 0;
1514 	uint64_t wma = 0;
1515 
1516 	vmdr_metadata_assert_owned(metadata);
1517 
1518 	*trim_threshold_out = 0;
1519 
1520 	vm_map_switch_context_t map_ctx = vm_map_switch_to(metadata->vdrm_map);
1521 
1522 	if (options & RECLAIM_NO_FAULT) {
1523 		vm_fault_disable();
1524 	}
1525 #if CONFIG_WORKING_SET_ESTIMATION
1526 	err = reclaim_copyin_min_reclaimable_bytes(metadata, &min_reclaimable_bytes);
1527 	if (err != ERR_SUCCESS) {
1528 		goto done;
1529 	}
1530 
1531 	uint64_t now = mach_absolute_time();
1532 	if (now - metadata->vdrm_last_sample_abs < vm_reclaim_sampling_period_abs) {
1533 		/* A sampling period has not elapsed */
1534 		goto done;
1535 	}
1536 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_SAMPLE) | DBG_FUNC_START,
1537 	    metadata->vdrm_pid,
1538 	    now,
1539 	    metadata->vdrm_last_sample_abs,
1540 	    min_reclaimable_bytes);
1541 
1542 	err = reclaim_copyin_reclaimable_bytes(metadata, &cur_reclaimable_bytes);
1543 	if (err != ERR_SUCCESS) {
1544 		goto done;
1545 	}
1546 
1547 	/* Reset the minimum to start a new sampling interval */
1548 	err = reclaim_copyout_min_reclaimable_bytes(metadata, cur_reclaimable_bytes);
1549 	if (err != ERR_SUCCESS) {
1550 		goto done;
1551 	}
1552 
1553 	/*
1554 	 * The user accounting will overcount if the kernel has reclaimed
1555 	 * without telling the client about it.
1556 	 */
1557 	if (cur_reclaimable_bytes >= metadata->vdrm_kernel_bytes_reclaimed) {
1558 		cur_reclaimable_bytes -= metadata->vdrm_kernel_bytes_reclaimed;
1559 	} else {
1560 		vmdr_log_error("[%d] more bytes have been reclaimed (%zu) than "
1561 		    "are supposedly in buffer (%zu)\n", metadata->vdrm_pid,
1562 		    metadata->vdrm_kernel_bytes_reclaimed, cur_reclaimable_bytes);
1563 		/* This will cause an underflow in user accounting */
1564 		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_ACCOUNTING_FAILURE, cur_reclaimable_bytes);
1565 		err = KERN_ABORTED;
1566 		goto done;
1567 	}
1568 	if (min_reclaimable_bytes >= metadata->vdrm_kernel_bytes_reclaimed) {
1569 		min_reclaimable_bytes -= metadata->vdrm_kernel_bytes_reclaimed;
1570 	} else {
1571 		min_reclaimable_bytes = 0;
1572 	}
1573 
1574 	uint64_t samples_elapsed = (now - metadata->vdrm_last_sample_abs) /
1575 	    vm_reclaim_sampling_period_abs;
1576 	if (samples_elapsed > vm_reclaim_abandonment_threshold) {
1577 		/*
1578 		 * Many sampling periods have elapsed since the ring was
1579 		 * last sampled. Discard the stale history rather than decaying
1580 		 * it, and assume the buffer's current contents are unneeded.
1581 		 */
1582 		wma = VMDR_WMA_MIX(0, cur_reclaimable_bytes);
1583 	} else {
1584 		/*
1585 		 * Compute an exponential moving average of the minimum amount of reclaimable
1586 		 * memory in this buffer. Multiple sampling periods may have elapsed
1587 		 * since the last sample. By definition, the minimum must be the same for
1588 		 * all elapsed periods (otherwise libmalloc would have called down to
1589 		 * update accounting)
1590 		 */
1591 		wma = metadata->vdrm_reclaimable_bytes_wma;
1592 		for (unsigned int i = 0; i < samples_elapsed; i++) {
1593 			/* Fold the minimum into the running average once per period */
1594 			wma = VMDR_WMA_MIX(wma, min_reclaimable_bytes);
1595 		}
1596 	}
1597 
1598 	metadata->vdrm_reclaimable_bytes_wma = wma;
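	/*
	 * Be conservative: trim only what both this interval's observed floor
	 * and the longer-term average agree is unneeded (the WMA is stored in
	 * fixed point, hence the VMDR_WMA_UNIT division).
	 */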
1599 	size_t unneeded_bytes = MIN(min_reclaimable_bytes,
1600 	    metadata->vdrm_reclaimable_bytes_wma / VMDR_WMA_UNIT);
1601 
1602 	size_t autotrim_threshold;
1603 	err = vmdr_calculate_autotrim_threshold(metadata, &autotrim_threshold);
1604 	if (err != ERR_SUCCESS) {
1605 		goto done;
1606 	}
1607 
1608 	if (unneeded_bytes >= vm_map_page_size(metadata->vdrm_map) &&
1609 	    unneeded_bytes >= autotrim_threshold) {
1610 		*trim_threshold_out = vm_map_round_page(unneeded_bytes,
1611 		    vm_map_page_mask(metadata->vdrm_map));
1612 	}
1613 #else /* !CONFIG_WORKING_SET_ESTIMATION */
1614 	(void)min_reclaimable_bytes;
1615 	(void)wma;
1616 	err = reclaim_copyin_reclaimable_bytes(metadata, &cur_reclaimable_bytes);
1617 	if (err != ERR_SUCCESS) {
1618 		goto done;
1619 	}
1620 	if (cur_reclaimable_bytes >= metadata->vdrm_kernel_bytes_reclaimed) {
1621 		cur_reclaimable_bytes -= metadata->vdrm_kernel_bytes_reclaimed;
1622 	} else {
1623 		vmdr_log_error("[%d] more bytes have been reclaimed (%zu) than "
1624 		    "are supposedly in buffer (%zu)\n", metadata->vdrm_pid,
1625 		    metadata->vdrm_kernel_bytes_reclaimed, cur_reclaimable_bytes);
1626 	}
1627 	if (cur_reclaimable_bytes > vm_reclaim_max_threshold) {
1628 		*trim_threshold_out = cur_reclaimable_bytes - vm_reclaim_max_threshold;
1629 	}
1630 #endif /* CONFIG_WORKING_SET_ESTIMATION */
1631 
1632 	metadata->vdrm_last_sample_abs = mach_absolute_time();
1633 	metadata->vdrm_reclaimable_bytes_last = cur_reclaimable_bytes;
1634 
1635 done:
1636 	vm_map_switch_back(map_ctx);
1637 	if (options & RECLAIM_NO_FAULT) {
1638 		vm_fault_enable();
1639 	}
1640 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_SAMPLE) | DBG_FUNC_END,
1641 	    wma,
1642 	    min_reclaimable_bytes,
1643 	    cur_reclaimable_bytes,
1644 	    *trim_threshold_out);
1645 	DTRACE_VM5(reclaim_sample,
1646 	    pid_t, metadata->vdrm_pid,
1647 	    uint64_t, wma,
1648 	    size_t, min_reclaimable_bytes,
1649 	    size_t, cur_reclaimable_bytes,
1650 	    size_t, *trim_threshold_out);
1651 	vmdr_log_debug("sampled buffer with min %lu est %lu trim %llu wma %llu\n",
1652 	    min_reclaimable_bytes,
1653 	    cur_reclaimable_bytes,
1654 	    *trim_threshold_out,
1655 	    wma);
1656 	return err;
1657 }
1658 
1659 /*
1660  * Caller must have buffer owned and unlocked
1661  */
1662 static kern_return_t
1663 vmdr_trim(vm_deferred_reclamation_metadata_t metadata, mach_vm_size_t bytes_to_reclaim,
1664     mach_vm_size_t *bytes_reclaimed, vm_deferred_reclamation_options_t options)
1665 {
1666 	kern_return_t kr;
1667 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_TRIM) | DBG_FUNC_START,
1668 	    metadata->vdrm_pid, bytes_to_reclaim);
1669 
1670 	kr = vmdr_reclaim_from_buffer(metadata, bytes_to_reclaim,
1671 	    bytes_reclaimed, options);
1672 
1673 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_TRIM) | DBG_FUNC_END, kr, *bytes_reclaimed);
1674 	DTRACE_VM3(reclaim_trim,
1675 	    pid_t, metadata->vdrm_pid,
1676 	    size_t, bytes_to_reclaim,
1677 	    size_t, *bytes_reclaimed);
1678 	return kr;
1679 }
1680 
1681 /*
1682  * Caller must have buffer owned and unlocked
1683  */
1684 static kern_return_t
1685 vmdr_drain(vm_deferred_reclamation_metadata_t metadata, mach_vm_size_t *bytes_reclaimed,
1686     vm_deferred_reclamation_options_t options)
1687 {
1688 	kern_return_t kr;
1689 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_DRAIN) | DBG_FUNC_START,
1690 	    metadata->vdrm_pid);
1691 
1692 	kr = vmdr_reclaim_from_buffer(metadata, UINT64_MAX,
1693 	    bytes_reclaimed, options);
1694 
1695 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_DRAIN) | DBG_FUNC_END, kr, *bytes_reclaimed);
1696 	DTRACE_VM2(reclaim_drain,
1697 	    pid_t, metadata->vdrm_pid,
1698 	    size_t, *bytes_reclaimed);
1699 	return kr;
1700 }
1701 
1702 mach_error_t
1703 vm_deferred_reclamation_update_accounting_internal(task_t task, uint64_t *bytes_reclaimed_out)
1704 {
1705 	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
1706 	mach_vm_size_t bytes_to_reclaim = 0, bytes_reclaimed = 0;
1707 	mach_error_t err = ERR_SUCCESS;
1708 
1709 	if (metadata == NULL) {
1710 		return KERN_NOT_FOUND;
1711 	}
1712 
1713 	if (!metadata->vdrm_pid) {
1714 		/* If this is a forked child, we may not yet have a pid */
1715 		metadata->vdrm_pid = task_pid(task);
1716 	}
1717 
1718 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_START,
1719 	    metadata->vdrm_pid);
1720 
1721 	vmdr_metadata_lock(metadata);
1722 	uint64_t now = mach_absolute_time();
1723 	if (now - metadata->vdrm_last_sample_abs < vm_reclaim_sampling_period_abs) {
1724 		/*
1725 		 * This is a fast path to avoid waiting on the gate if another
1726 		 * thread beat us to sampling.
1727 		 */
1728 		vmdr_metadata_unlock(metadata);
1729 		goto done;
1730 	}
1731 	vmdr_metadata_own_locked(metadata, RECLAIM_OPTIONS_NONE);
1732 	vmdr_metadata_unlock(metadata);
1733 
1734 	err = vmdr_sample_working_set(metadata, &bytes_to_reclaim, RECLAIM_OPTIONS_NONE);
1735 	if (err != ERR_SUCCESS) {
1736 		vmdr_metadata_disown(metadata);
1737 		goto done;
1738 	}
1739 	if (bytes_to_reclaim) {
1740 		vmdr_log_debug("[%d] trimming %llu B\n", metadata->vdrm_pid, bytes_to_reclaim);
1741 
1742 		err = vmdr_trim(metadata, bytes_to_reclaim, &bytes_reclaimed, RECLAIM_OPTIONS_NONE);
1743 
1744 		if (err == KERN_ABORTED) {
1745 			/*
1746 			 * We were unable to complete the trim due to a lost
1747 			 * race with userspace. This need not be fatal because
1748 			 * the accounting was successfully updated.
1749 			 */
1750 			err = KERN_SUCCESS;
1751 		}
1752 	}
1753 
1754 	/*
1755 	 * Tell the client how many bytes the kernel has reclaimed
1756 	 * since the last time it updated its accounting
1757 	 */
1758 	bytes_reclaimed += metadata->vdrm_kernel_bytes_reclaimed;
1759 	metadata->vdrm_kernel_bytes_reclaimed = 0;
1760 
1761 	vmdr_metadata_disown(metadata);
1762 
1763 done:
1764 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_END,
1765 	    metadata->vdrm_last_sample_abs,
1766 	    bytes_to_reclaim,
1767 	    bytes_reclaimed);
1768 	*bytes_reclaimed_out = (uint64_t)bytes_reclaimed;
1769 	return err;
1770 }
1771 
1772 kern_return_t
1773 vm_deferred_reclamation_task_drain(task_t task,
1774     vm_deferred_reclamation_options_t options)
1775 {
1776 	kern_return_t kr;
1777 	mach_vm_size_t bytes_reclaimed = 0;
1778 
1779 	task_lock(task);
1780 	if (!task_is_active(task) || task_is_halting(task)) {
1781 		task_unlock(task);
1782 		return KERN_ABORTED;
1783 	}
1784 	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
1785 	if (metadata == NULL) {
1786 		task_unlock(task);
1787 		return KERN_SUCCESS;
1788 	}
1789 	vmdr_metadata_retain(metadata);
1790 	task_unlock(task);
1791 
1792 	vmdr_metadata_own(metadata);
1793 
1794 	kr = vmdr_drain(metadata, &bytes_reclaimed, options);
1795 	metadata->vdrm_kernel_bytes_reclaimed += bytes_reclaimed;
1796 
1797 	vmdr_metadata_disown(metadata);
1798 	vmdr_metadata_release(metadata);
1799 	return kr;
1800 }
1801 
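/*
 * A suspended task's buffer is a candidate for a full drain. Rather than
 * reclaim inline on the suspension path, poke the scavenger thread, which
 * drains suspended buffers during its RECLAIM_GC_SCAVENGE pass.
 */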
1802 void
1803 vm_deferred_reclamation_task_suspend(task_t task)
1804 {
1805 	if (task->deferred_reclamation_metadata) {
1806 		sched_cond_signal(&vm_reclaim_scavenger_cond, vm_reclaim_scavenger_thread);
1807 	}
1808 }
1809 
1810 #pragma mark KPIs
1811 
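/*
 * On fork, the child inherits the parent's ring geometry and reclaim
 * accounting baseline so its buffer state matches the copied address space.
 */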
1812 vm_deferred_reclamation_metadata_t
1813 vm_deferred_reclamation_task_fork(task_t task, vm_deferred_reclamation_metadata_t parent)
1814 {
1815 	vm_deferred_reclamation_metadata_t metadata = NULL;
1816 	vmdr_metadata_assert_owned(parent);
1817 	vmdr_log_debug("forking [%d]\n", parent->vdrm_pid);
1818 
1819 	assert(task->deferred_reclamation_metadata == NULL);
1820 	metadata = vmdr_metadata_alloc(task, parent->vdrm_ring_addr,
1821 	    parent->vdrm_ring_size, parent->vdrm_buffer_len);
1822 
1823 	metadata->vdrm_last_sample_abs = parent->vdrm_last_sample_abs;
1824 	metadata->vdrm_kernel_bytes_reclaimed = parent->vdrm_kernel_bytes_reclaimed;
1825 #if CONFIG_WORKING_SET_ESTIMATION
1826 	metadata->vdrm_reclaimable_bytes_wma = parent->vdrm_reclaimable_bytes_wma;
1827 #endif /* CONFIG_WORKING_SET_ESTIMATION */
1828 
1829 	return metadata;
1830 }
1831 
1832 void
1833 vm_deferred_reclamation_task_fork_register(vm_deferred_reclamation_metadata_t metadata)
1834 {
1835 	assert(metadata != NULL);
1836 	assert(!metadata->vdrm_is_registered);
1837 
1838 	lck_mtx_lock(&reclaim_buffers_lock);
1839 	metadata->vdrm_is_registered = true;
1840 	vmdr_list_append_locked(metadata);
1841 	lck_mtx_unlock(&reclaim_buffers_lock);
1842 }
1843 
1844 bool
1845 vm_deferred_reclamation_task_has_ring(task_t task)
1846 {
1847 	return task->deferred_reclamation_metadata != NULL;
1848 }
1849 
1850 void
1851 vm_deferred_reclamation_ring_own(vm_deferred_reclamation_metadata_t metadata)
1852 {
1853 	vmdr_metadata_own(metadata);
1854 }
1855 
1856 void
1857 vm_deferred_reclamation_ring_disown(vm_deferred_reclamation_metadata_t metadata)
1858 {
1859 	vmdr_metadata_disown(metadata);
1860 }
1861 
1862 void
1863 vm_deferred_reclamation_gc(vm_deferred_reclamation_gc_action_t action,
1864     mach_vm_size_t *total_bytes_reclaimed,
1865     vm_deferred_reclamation_options_t options)
1866 {
1867 	vmdr_garbage_collect(action, total_bytes_reclaimed, options);
1868 }
1869 
1870 void
1871 vm_deferred_reclamation_settle_ledger(task_t task)
1872 {
1873 	vm_deferred_reclamation_metadata_t meta = vmdr_acquire_task_metadata(task);
1874 	if (meta == NULL) {
1875 		return;
1876 	}
1877 	vmdr_metadata_lock(meta);
1878 	ledger_zero_balance(task->ledger, task_ledgers.est_reclaimable);
1879 	ledger_credit(
1880 		task->ledger,
1881 		task_ledgers.est_reclaimable,
1882 		meta->vdrm_reclaimable_bytes_last);
1883 	vmdr_metadata_unlock(meta);
1884 	vmdr_metadata_release(meta);
1885 }
1886 
1887 #pragma mark Global Reclamation GC
1888 
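/*
 * One GC pass runs at a time: the gate serializes passes, and
 * RECLAIM_NO_WAIT callers bail out instead of queueing behind it. Each
 * pass stamps buffers with the current epoch and rotates them to the list
 * tail, so the walk terminates when it re-encounters a stamped buffer even
 * though the buffers lock is dropped while each buffer is processed.
 */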
1889 static void
1890 vmdr_garbage_collect(vm_deferred_reclamation_gc_action_t action,
1891     mach_vm_size_t *total_bytes_reclaimed_out,
1892     vm_deferred_reclamation_options_t options)
1893 {
1894 	kern_return_t kr;
1895 	mach_vm_size_t total_bytes_reclaimed = 0;
1896 	gate_wait_result_t wr;
1897 
1898 	lck_mtx_lock(&reclaim_buffers_lock);
1899 	kr = lck_mtx_gate_try_close(&reclaim_buffers_lock, &vm_reclaim_gc_gate);
1900 	if (kr != KERN_SUCCESS) {
1901 		if (options & RECLAIM_NO_WAIT) {
1902 			lck_mtx_unlock(&reclaim_buffers_lock);
1903 			return;
1904 		}
1905 		wr = lck_mtx_gate_wait(&reclaim_buffers_lock, &vm_reclaim_gc_gate, LCK_SLEEP_DEFAULT, THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
1906 		assert3u(wr, ==, GATE_HANDOFF);
1907 	}
1908 
1909 	vm_reclaim_gc_epoch++;
1910 	vmdr_log_debug("running global GC\n");
1911 	while (true) {
1912 		vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&reclaim_buffers);
1913 		if (metadata == NULL) {
1914 			break;
1915 		}
1916 		vmdr_list_remove_locked(metadata);
1917 		vmdr_list_append_locked(metadata);
1918 		vmdr_metadata_retain(metadata);
1919 		lck_mtx_unlock(&reclaim_buffers_lock);
1920 
1921 		vmdr_metadata_lock(metadata);
1922 
1923 		if (metadata->vdrm_reclaimed_at >= vm_reclaim_gc_epoch) {
1924 			/* We've already seen this one. We're done */
1925 			vmdr_metadata_unlock(metadata);
1926 			vmdr_metadata_release(metadata);
1927 			lck_mtx_lock(&reclaim_buffers_lock);
1928 			break;
1929 		}
1930 		metadata->vdrm_reclaimed_at = vm_reclaim_gc_epoch;
1931 
1932 		task_t task = metadata->vdrm_task;
1933 		if (task == TASK_NULL ||
1934 		    !task_is_active(task) ||
1935 		    task_is_halting(task)) {
1936 			goto next;
1937 		}
1938 		bool buffer_is_suspended = task_is_app_suspended(task);
1939 		task = TASK_NULL;
1940 
1941 		mach_vm_size_t bytes_reclaimed = 0;
1942 		mach_vm_size_t bytes_to_reclaim = 0;
1943 
1944 		switch (action) {
1945 		case RECLAIM_GC_DRAIN:
1946 			if (!vmdr_metadata_own_locked(metadata, options)) {
1947 				goto next;
1948 			}
1949 			vmdr_metadata_unlock(metadata);
1950 
1951 			vmdr_log_debug("draining [%d]\n", metadata->vdrm_pid);
1952 			kr = vmdr_drain(metadata, &bytes_reclaimed, options);
1953 			metadata->vdrm_kernel_bytes_reclaimed += bytes_reclaimed;
1954 
1955 			vmdr_metadata_lock(metadata);
1956 			vmdr_metadata_disown_locked(metadata);
1957 			break;
1958 		case RECLAIM_GC_SCAVENGE:
1959 			if (buffer_is_suspended) {
1960 				if (!vmdr_metadata_own_locked(metadata, options)) {
1961 					goto next;
1962 				}
1963 				vmdr_metadata_unlock(metadata);
1964 
1965 				/* This buffer is no longer in use, fully reclaim it. */
1966 				vmdr_log_debug("found suspended buffer [%d], draining\n", metadata->vdrm_pid);
1967 				kr = vmdr_drain(metadata, &bytes_reclaimed, options);
1968 				metadata->vdrm_kernel_bytes_reclaimed += bytes_reclaimed;
1969 
1970 				vmdr_metadata_lock(metadata);
1971 				vmdr_metadata_disown_locked(metadata);
1972 			}
1973 			break;
1974 		case RECLAIM_GC_TRIM:
1975 			if (!vmdr_metadata_own_locked(metadata, options)) {
1976 				goto next;
1977 			}
1978 			vmdr_metadata_unlock(metadata);
1979 			kr = vmdr_sample_working_set(metadata, &bytes_to_reclaim, options);
1980 			if (kr == KERN_SUCCESS && bytes_to_reclaim) {
1981 				vmdr_log_debug("GC found stale buffer [%d], trimming\n", metadata->vdrm_pid);
1982 				kr = vmdr_trim(metadata, bytes_to_reclaim, &bytes_reclaimed, options);
1983 				metadata->vdrm_kernel_bytes_reclaimed += bytes_reclaimed;
1984 			}
1985 			vmdr_metadata_lock(metadata);
1986 			vmdr_metadata_disown_locked(metadata);
1987 			break;
1988 		}
1989 		if (bytes_reclaimed) {
1990 			vm_reclaim_gc_reclaim_count++;
1991 			total_bytes_reclaimed += bytes_reclaimed;
1992 		}
1993 		if (metadata->vdrm_waiters && action != RECLAIM_GC_TRIM) {
1994 			thread_wakeup((event_t)&metadata->vdrm_waiters);
1995 		}
1996 next:
1997 		vmdr_metadata_unlock(metadata);
1998 		vmdr_metadata_release(metadata);
1999 		lck_mtx_lock(&reclaim_buffers_lock);
2000 	}
2001 	lck_mtx_gate_handoff(&reclaim_buffers_lock, &vm_reclaim_gc_gate, GATE_HANDOFF_OPEN_IF_NO_WAITERS);
2002 	lck_mtx_unlock(&reclaim_buffers_lock);
2003 	*total_bytes_reclaimed_out = total_bytes_reclaimed;
2004 }
2005 
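/*
 * The scavenger parks in sched_cond_wait() with a continuation, so each
 * wakeup re-enters vm_reclaim_scavenger_thread_continue() from the top
 * rather than returning (hence OS_NORETURN).
 */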
2006 OS_NORETURN
2007 static void
2008 vm_reclaim_scavenger_thread_continue(__unused void *param, __unused wait_result_t wr)
2009 {
2010 	sched_cond_ack(&vm_reclaim_scavenger_cond);
2011 
2012 	while (true) {
2013 		mach_vm_size_t total_bytes_reclaimed;
2014 		vmdr_garbage_collect(RECLAIM_GC_SCAVENGE, &total_bytes_reclaimed,
2015 		    RECLAIM_OPTIONS_NONE);
2016 		vmdr_log_info("scavenger reclaimed %llu KiB of virtual memory\n",
2017 		    total_bytes_reclaimed >> 10);
2018 		sched_cond_wait(&vm_reclaim_scavenger_cond, THREAD_UNINT,
2019 		    vm_reclaim_scavenger_thread_continue);
2020 	}
2021 }
2022 
2023 OS_NORETURN
2024 static void
2025 vm_reclaim_scavenger_thread_init(__unused void *param, __unused wait_result_t wr)
2026 {
2027 	thread_set_thread_name(current_thread(), "VM_reclaim_scavenger");
2028 #if CONFIG_THREAD_GROUPS
2029 	thread_group_vm_add();
2030 #endif /* CONFIG_THREAD_GROUPS */
2031 	sched_cond_wait(&vm_reclaim_scavenger_cond, THREAD_UNINT, vm_reclaim_scavenger_thread_continue);
2032 	__builtin_unreachable();
2033 }
2034 
2035 __startup_func
2036 static void
2037 vm_deferred_reclamation_init(void)
2038 {
2039 	vm_reclaim_log_handle = os_log_create("com.apple.xnu", "vm_reclaim");
2040 	nanoseconds_to_absolutetime((uint64_t)vm_reclaim_sampling_period_ns,
2041 	    &vm_reclaim_sampling_period_abs);
2042 
2043 	sched_cond_init(&vm_reclaim_scavenger_cond);
2044 	lck_mtx_gate_init(&reclaim_buffers_lock, &vm_reclaim_gc_gate);
2045 	kern_return_t kr = kernel_thread_start_priority(vm_reclaim_scavenger_thread_init,
2046 	    NULL, BASEPRI_KERNEL, &vm_reclaim_scavenger_thread);
2047 	if (kr != KERN_SUCCESS) {
2048 		panic("Unable to create VM reclaim thread, %d", kr);
2049 	}
2050 }
2051 
2052 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_deferred_reclamation_init);
2053 
2054 #pragma mark Debug Interfaces
2055 
2056 #if DEVELOPMENT || DEBUG
2057 
2058 bool
2059 vm_deferred_reclamation_block_until_task_has_been_reclaimed(task_t task)
2060 {
2061 	bool reclaimed;
2062 	vm_deferred_reclamation_metadata_t metadata;
2063 
2064 	metadata = vmdr_acquire_task_metadata(task);
2065 	if (metadata == NULL) {
2066 		return false;
2067 	}
2068 	vmdr_metadata_lock(metadata);
2069 
2070 	metadata->vdrm_waiters++;
2071 	/* Wake up the scavenger thread */
2072 	sched_cond_signal(&vm_reclaim_scavenger_cond, vm_reclaim_scavenger_thread);
2073 	wait_result_t wr = lck_mtx_sleep(&metadata->vdrm_lock,
2074 	    LCK_SLEEP_DEFAULT, (event_t)&metadata->vdrm_waiters,
2075 	    THREAD_ABORTSAFE);
2076 	metadata->vdrm_waiters--;
2077 	reclaimed = (wr == THREAD_AWAKENED);
2078 
2079 	vmdr_metadata_unlock(metadata);
2080 	vmdr_metadata_release(metadata);
2081 	return reclaimed;
2082 }
2083 
2084 #endif /* DEVELOPMENT || DEBUG */
2085 
2086 #pragma mark Introspectibility
2087 
2088 kern_return_t
2089 vm_deferred_reclamation_buffer_query_internal(
2090 	task_t task,
2091 	mach_vm_address_ut *addr_out_u,
2092 	mach_vm_size_ut *size_out_u)
2093 {
2094 	vm_deferred_reclamation_metadata_t meta;
2095 
2096 	if (task == NULL) {
2097 		return KERN_INVALID_TASK;
2098 	}
2099 
2100 	if ((addr_out_u == NULL) || (size_out_u == NULL)) {
2101 		return KERN_INVALID_ARGUMENT;
2102 	}
2103 
2104 	meta = vmdr_acquire_task_metadata(task);
2105 
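	/* A task without a ring reports an empty buffer rather than an error */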
2106 	if (meta == NULL) {
2107 		*addr_out_u = vm_sanitize_wrap_addr(0);
2108 		*size_out_u = vm_sanitize_wrap_size(0);
2109 	} else {
2110 		vmdr_metadata_lock(meta);
2111 		*addr_out_u = vm_sanitize_wrap_addr(meta->vdrm_ring_addr);
2112 		*size_out_u = vm_sanitize_wrap_size(meta->vdrm_ring_size);
2113 		vmdr_metadata_unlock(meta);
2114 		vmdr_metadata_release(meta);
2115 	}
2116 
2117 	return KERN_SUCCESS;
2118 }
2119