/*
 * Copyright (c) 2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/exc_guard.h>
#include <kern/locks.h>
#include <kern/task.h>
#include <kern/zalloc.h>
#include <kern/misc_protos.h>
#include <kern/sched_prim.h>
#include <kern/startup.h>
#include <kern/thread_group.h>
#include <libkern/OSAtomic.h>
#include <mach/kern_return.h>
#include <mach/mach_types.h>
#include <mach/vm_reclaim_private.h>
#include <os/atomic_private.h>
#include <os/base_private.h>
#include <os/log.h>
#include <os/refcnt.h>
#include <os/refcnt_internal.h>
#include <pexpert/pexpert.h>
#include <sys/errno.h>
#include <sys/kdebug.h>
#include <sys/queue.h>
#include <sys/reason.h>
#include <vm/vm_fault_xnu.h>
#include <vm/vm_map.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_pageout_internal.h>
#include <vm/vm_reclaim_internal.h>
#include <vm/vm_sanitize_internal.h>
#include <vm/vm_kern_xnu.h>

#pragma mark Tunables

#if XNU_TARGET_OS_IOS && !XNU_TARGET_OS_XR
/* Temporarily opt iOS into the legacy behavior as a stop-gap */
#define CONFIG_WORKING_SET_ESTIMATION 0
/*
 * Deferred reclaim may be enabled via EDT for select iOS devices, but
 * defaults to disabled
 */
#define VM_RECLAIM_ENABLED_DEFAULT false
#else
#define CONFIG_WORKING_SET_ESTIMATION 1
#define VM_RECLAIM_ENABLED_DEFAULT true
#endif

#if DEVELOPMENT || DEBUG
TUNABLE(uint32_t, kReclaimChunkSize, "vm_reclaim_chunk_size", 16);
#else /* RELEASE */
const uint32_t kReclaimChunkSize = 16;
#endif /* DEVELOPMENT || DEBUG */
#if CONFIG_WORKING_SET_ESTIMATION
TUNABLE_DT_DEV_WRITEABLE(bool, vm_reclaim_enabled, "/defaults",
    "kern.vm_reclaim_enabled", "vm_reclaim_enabled", VM_RECLAIM_ENABLED_DEFAULT, TUNABLE_DT_NONE);
/* TODO: Consider varying the sampling rate based on rusage, ringbuffer-velocity, memory pressure */
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_sampling_period_ns, "vm_reclaim_sampling_period_ns", 1ULL * NSEC_PER_SEC);
TUNABLE_DEV_WRITEABLE(uint32_t, vm_reclaim_autotrim_pct_normal, "vm_reclaim_autotrim_pct_normal", 10);
TUNABLE_DEV_WRITEABLE(uint32_t, vm_reclaim_autotrim_pct_pressure, "vm_reclaim_autotrim_pct_pressure", 5);
TUNABLE_DEV_WRITEABLE(uint32_t, vm_reclaim_autotrim_pct_critical, "vm_reclaim_autotrim_pct_critical", 1);
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_wma_weight_base, "vm_reclaim_wma_weight_base", 3);
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_wma_weight_cur, "vm_reclaim_wma_weight_cur", 1);
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_wma_denom, "vm_reclaim_wma_denom", 4);
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_abandonment_threshold, "vm_reclaim_abandonment_threshold", 512);
#else /* CONFIG_WORKING_SET_ESTIMATION */
TUNABLE_DT_DEV_WRITEABLE(uint64_t, vm_reclaim_max_threshold, "/defaults",
    "kern.vm_reclaim_max_threshold", "vm_reclaim_max_threshold", 0, TUNABLE_DT_NONE);
#endif /* CONFIG_WORKING_SET_ESTIMATION */
TUNABLE(bool, panic_on_kill, "vm_reclaim_panic_on_kill", false);
#if DEVELOPMENT || DEBUG
TUNABLE_WRITEABLE(bool, vm_reclaim_debug, "vm_reclaim_debug", false);
#endif
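/*
 * Illustrative usage (a sketch, not an exhaustive list): on
 * DEVELOPMENT/DEBUG kernels the TUNABLE()s above can be overridden via
 * boot-args using the quoted names, e.g.
 *
 *     vm_reclaim_chunk_size=32 vm_reclaim_debug=1
 *
 * The TUNABLE_DT variants additionally read a default from the device
 * tree (the "/defaults" node and "kern.*" property names given above),
 * which a boot-arg of the same name can still override.
 */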

#pragma mark Declarations
typedef struct proc *proc_t;
extern const char *proc_best_name(struct proc *);
extern void *proc_find(int pid);
extern task_t proc_task(proc_t);
extern kern_return_t kern_return_for_errno(int);
extern int mach_to_bsd_errno(kern_return_t kr);
extern int exit_with_mach_exception(void *p, exception_info_t info, int flags);
extern struct proc *proc_ref(struct proc *p, int locked);
extern int proc_rele(proc_t p);

#define _vmdr_log_type(type, fmt, ...) os_log_with_type(vm_reclaim_log_handle, type, "vm_reclaim: " fmt, ##__VA_ARGS__)
#define vmdr_log(fmt, ...) _vmdr_log_type(OS_LOG_TYPE_DEFAULT, fmt, ##__VA_ARGS__)
#define vmdr_log_info(fmt, ...) _vmdr_log_type(OS_LOG_TYPE_INFO, fmt, ##__VA_ARGS__)
#define vmdr_log_error(fmt, ...) _vmdr_log_type(OS_LOG_TYPE_ERROR, fmt, ##__VA_ARGS__)
#if DEVELOPMENT || DEBUG
#define vmdr_log_debug(fmt, ...) \
	MACRO_BEGIN \
	if (os_unlikely(vm_reclaim_debug)) { \
	        _vmdr_log_type(OS_LOG_TYPE_DEBUG, fmt, ##__VA_ARGS__); \
	} \
	MACRO_END
#else /* !(DEVELOPMENT || DEBUG) */
#define vmdr_log_debug(...)
#endif /* DEVELOPMENT || DEBUG */
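/*
 * Example (illustrative): vmdr_log_debug() compiles out on RELEASE and is
 * gated at runtime by the vm_reclaim_debug tunable, so
 *
 *     vmdr_log_debug("[%d] flushed %u entries\n", pid, count);
 *
 * emits "vm_reclaim: [<pid>] flushed <count> entries" at
 * OS_LOG_TYPE_DEBUG via vm_reclaim_log_handle only when enabled.
 */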

static kern_return_t reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head);
static kern_return_t reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail);
static kern_return_t reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy);
static kern_return_t reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result);
#if CONFIG_WORKING_SET_ESTIMATION
static bool vmdr_sample_working_set(vm_deferred_reclamation_metadata_t metadata, size_t *trim_threshold_out);
#endif
static void vmdr_metadata_release(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_list_append_locked(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_list_remove_locked(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_metadata_own(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_metadata_disown(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_garbage_collect(vm_deferred_reclamation_gc_action_t action, vm_deferred_reclamation_options_t options);
static kern_return_t reclaim_chunk(vm_deferred_reclamation_metadata_t metadata,
    uint64_t bytes_to_reclaim, uint64_t *bytes_reclaimed_out,
    mach_vm_reclaim_count_t chunk_size, mach_vm_reclaim_count_t *num_reclaimed_out);

struct vm_deferred_reclamation_metadata_s {
	/*
	 * Global list containing every reclamation buffer. Protected by the
	 * reclamation_buffers_lock.
	 */
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_list;
	/* Protects all struct fields (except where denoted otherwise) */
	decl_lck_mtx_data(, vdrm_lock);
	decl_lck_mtx_gate_data(, vdrm_gate);
	/*
	 * The task owns this structure, but we maintain a backpointer here
	 * so that we can send an exception if we hit an error.
	 * Since this is a backpointer we don't hold a reference (it's a weak pointer).
	 */
	task_t vdrm_task;
	pid_t vdrm_pid;
	vm_map_t vdrm_map;
	/*
	 * The owning task holds a ref on this object. When the task dies, it
	 * will set vdrm_task := NULL and drop its ref. Threads operating on the buffer
	 * should hold a +1 on the metadata structure to ensure its validity.
	 */
	os_refcnt_t vdrm_refcnt;
	/* The virtual address of the ringbuffer in the user map (immutable) */
	user_addr_t vdrm_buffer_addr;
	/* The size of the VM allocation containing the ringbuffer (immutable) */
	mach_vm_size_t vdrm_buffer_size;
	/* The length of the ringbuffer. This may be changed on buffer resize */
	mach_vm_reclaim_count_t vdrm_buffer_len;
	/* Which GC epoch this buffer was last considered in */
	uint64_t vdrm_reclaimed_at;
	/*
	 * The number of threads waiting for a pending reclamation
	 * on this buffer to complete.
	 */
	uint32_t vdrm_waiters;
#if CONFIG_WORKING_SET_ESTIMATION
	/* Timestamp (mach absolute time) of the last working set sample for this ringbuffer */
	uint64_t vdrm_last_sample_abs;
	/*
	 * Exponential moving average of the minimum reclaimable buffer size
	 * (in units of VMDR_WMA_UNIT)
	 */
	uint64_t vdrm_reclaimable_bytes_wma;
	/*
	 * The minimum amount of reclaimable memory in this buffer for the current
	 * sampling interval.
	 */
	size_t vdrm_reclaimable_bytes_min;
#endif /* CONFIG_WORKING_SET_ESTIMATION */
	/*
	 * These two values represent running sums of uncancelled bytes
	 * entered into the ring by userspace and bytes reclaimed out of the
	 * buffer by the kernel.
	 *
	 * The uncancelled byte-count may fluctuate as the client enters and
	 * cancels new reclamation requests. Reclamation requests which have
	 * been completed by the kernel will not deduct from the uncancelled
	 * count but will be added to the reclaimed byte count.
	 *
	 * - `vdrm_cumulative_reclaimed_bytes` is monotonically increasing.
	 * - `vdrm_cumulative_uncancelled_bytes` may fluctuate but
	 *   should trend upward.
	 * - `vdrm_cumulative_uncancelled_bytes` must be kept >=
	 *   `vdrm_cumulative_reclaimed_bytes`
	 *
	 * Both values are in terms of virtual memory,
	 * so they give an upper bound on the amount of physical memory that
	 * can be reclaimed. To get an estimate of the current amount of VA in
	 * the buffer, compute vdrm_cumulative_uncancelled_bytes -
	 * vdrm_cumulative_reclaimed_bytes.
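	 *
	 * Worked example (illustrative numbers): if userspace has entered
	 * 192 KiB worth of uncancelled entries over the ring's lifetime and
	 * the kernel has reclaimed 128 KiB of them, the ring currently holds
	 * an estimated 192 KiB - 128 KiB = 64 KiB of reclaimable VA.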
	 */
	size_t vdrm_cumulative_uncancelled_bytes;
	size_t vdrm_cumulative_reclaimed_bytes;

	/*
	 * Tracks whether or not this reclamation metadata has been added
	 * to the global list yet. Normally, this happens when it is allocated,
	 * except in the case of fork(). In this case, we have to duplicate the
	 * parent's metadata before it returns from fork(), but this occurs
	 * before the child's address space is set up.
	 */
	uint8_t vdrm_is_registered : 1,
	    __unused1 : 7;
};

#pragma mark Globals
static KALLOC_TYPE_DEFINE(vm_reclaim_metadata_zone, struct vm_deferred_reclamation_metadata_s, KT_DEFAULT);
static LCK_GRP_DECLARE(vm_reclaim_lock_grp, "vm_reclaim");
os_refgrp_decl(static, vm_reclaim_metadata_refgrp, "vm_reclaim_metadata_refgrp", NULL);
/*
 * The reclamation_buffers list contains every buffer in the system.
 * The reclamation_buffers_lock protects the reclamation_buffers list.
 * It must be held when iterating over the list or manipulating the list.
 * It should be dropped when acting on a specific metadata entry after acquiring the vdrm_lock.
 */
static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) reclaim_buffers = TAILQ_HEAD_INITIALIZER(reclaim_buffers);
LCK_MTX_DECLARE(reclaim_buffers_lock, &vm_reclaim_lock_grp);
/* Number of times Reclaim GC has run */
uint64_t vm_reclaim_gc_epoch = 0;
/* The number of reclamation actions (drains/trims) done during GC */
uint64_t vm_reclaim_gc_reclaim_count;
/* Gate for GC */
static decl_lck_mtx_gate_data(, vm_reclaim_gc_gate);
os_log_t vm_reclaim_log_handle;
/* Number of initialized reclaim buffers */
_Atomic uint32_t vm_reclaim_buffer_count;
uint64_t vm_reclaim_sampling_period_abs = 0;
static SECURITY_READ_ONLY_LATE(thread_t) vm_reclaim_scavenger_thread = THREAD_NULL;
static sched_cond_atomic_t vm_reclaim_scavenger_cond = SCHED_COND_INIT;

#pragma mark Buffer Initialization/Destruction

static vm_deferred_reclamation_metadata_t
vmdr_metadata_alloc(
	task_t                  task,
	user_addr_t             buffer,
	mach_vm_size_t          size,
	mach_vm_reclaim_count_t len)
{
	vm_deferred_reclamation_metadata_t metadata;
	vm_map_t map = task->map;

	assert(!map->is_nested_map);

	metadata = zalloc_flags(vm_reclaim_metadata_zone, Z_WAITOK | Z_ZERO);
	lck_mtx_init(&metadata->vdrm_lock, &vm_reclaim_lock_grp, LCK_ATTR_NULL);
	lck_mtx_gate_init(&metadata->vdrm_lock, &metadata->vdrm_gate);
	os_ref_init(&metadata->vdrm_refcnt, &vm_reclaim_metadata_refgrp);

	metadata->vdrm_task = task;
	metadata->vdrm_map = map;
	metadata->vdrm_buffer_addr = buffer;
	metadata->vdrm_buffer_size = size;
	metadata->vdrm_buffer_len = len;

	if (os_atomic_inc(&vm_reclaim_buffer_count, relaxed) == UINT32_MAX) {
		panic("Overflowed vm_reclaim_buffer_count");
	}

	/*
	 * We do not need to hold a lock on `task` because this is called
	 * either at fork() time or from the context of current_task().
	 */
	vm_map_reference(map);
	return metadata;
}

static void
vmdr_metadata_free(vm_deferred_reclamation_metadata_t metadata)
{
	vm_map_deallocate(metadata->vdrm_map);
	lck_mtx_gate_destroy(&metadata->vdrm_lock, &metadata->vdrm_gate);
	lck_mtx_destroy(&metadata->vdrm_lock, &vm_reclaim_lock_grp);
	zfree(vm_reclaim_metadata_zone, metadata);
	if (os_atomic_dec_orig(&vm_reclaim_buffer_count, relaxed) == 0) {
		panic("Underflowed vm_reclaim_buffer_count");
	}
}

static mach_vm_size_t
vmdr_round_len_to_size(vm_map_t map, mach_vm_reclaim_count_t count)
{
	mach_vm_size_t metadata_size = offsetof(struct mach_vm_reclaim_ring_s, entries);
	mach_vm_size_t entries_size = count * sizeof(struct mach_vm_reclaim_entry_s);
	return vm_map_round_page(metadata_size + entries_size, vm_map_page_mask(map));
}
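
/*
 * Worked example (illustrative; assumes 16 KiB pages and a 16-byte
 * struct mach_vm_reclaim_entry_s): a ring of 1024 entries needs
 * offsetof(struct mach_vm_reclaim_ring_s, entries) + 1024 * 16 bytes,
 * which is just over 16 KiB and therefore rounds up to two pages (32 KiB).
 */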

mach_error_t
vm_deferred_reclamation_buffer_allocate_internal(
	task_t                   task,
	mach_vm_address_ut      *address_u,
	mach_vm_reclaim_count_t  len,
	mach_vm_reclaim_count_t  max_len)
{
	kern_return_t kr;
	kern_return_t tmp_kr;
	vm_deferred_reclamation_metadata_t metadata = NULL;
	vm_map_t map;
	uint64_t head = 0, tail = 0, busy = 0;
	static bool reclaim_disabled_logged = false;

	if (task == TASK_NULL) {
		return KERN_INVALID_TASK;
	}
	if (address_u == NULL) {
		return KERN_INVALID_ADDRESS;
	}
	if (len == 0 || max_len == 0 || max_len < len) {
		return KERN_INVALID_ARGUMENT;
	}
#if CONFIG_WORKING_SET_ESTIMATION
	if (!vm_reclaim_enabled) {
#else /* !CONFIG_WORKING_SET_ESTIMATION */
	if (!vm_reclaim_max_threshold) {
#endif /* CONFIG_WORKING_SET_ESTIMATION */
		if (!reclaim_disabled_logged) {
			/* Avoid logging failure for every new process */
			reclaim_disabled_logged = true;
			vmdr_log_error("failed to initialize deferred "
			    "reclamation buffer - vm_reclaim is disabled\n");
		}
		return VM_RECLAIM_NOT_SUPPORTED;
	}

	map = task->map;
	mach_vm_size_t rounded_vm_size = vmdr_round_len_to_size(map, max_len);
	if (rounded_vm_size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	if (rounded_vm_size > VM_RECLAIM_MAX_BUFFER_SIZE) {
		vmdr_log_error("denying request to allocate ringbuffer of size "
		    "%llu bytes (max %llu bytes)\n",
		    rounded_vm_size,
		    VM_RECLAIM_MAX_BUFFER_SIZE);
		return KERN_NO_SPACE;
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_START,
	    task_pid(task), len);

	/*
	 * Allocate a VM region large enough to contain the maximum buffer
	 * size; only the current buffer length is in use until the ring is
	 * grown on resize.
	 *
	 * TODO: If clients other than libmalloc adopt deferred reclaim, a
	 * different tag should be given
	 *
	 * `address` was sanitized under the assumption that we'll only use
	 * it as a hint (overflow checks were used) so we must pass the
	 * anywhere flag.
	 */
	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE_PERMANENT(
		.vm_tag = VM_MEMORY_MALLOC);
	mach_vm_size_ut size_u = vm_sanitize_wrap_size(rounded_vm_size);
	kr = mach_vm_map_kernel(map, address_u, size_u, VM_MAP_PAGE_MASK(map),
	    vmk_flags, IPC_PORT_NULL, 0, FALSE,
	    VM_PROT_DEFAULT, VM_PROT_DEFAULT, VM_INHERIT_COPY);
	if (kr != KERN_SUCCESS) {
		vmdr_log_error("%s [%d] failed to allocate VA for reclaim "
		    "buffer (%d)\n", task_best_name(task), task_pid(task), kr);
		return kr;
	}
	mach_vm_address_t address = VM_SANITIZE_UNSAFE_UNWRAP(*address_u);
	assert3u(address, !=, 0);

	metadata = vmdr_metadata_alloc(task, address, rounded_vm_size, len);
	metadata->vdrm_pid = task_pid(task);

	/*
	 * Validate the starting indices.
	 */
	kr = reclaim_copyin_busy(metadata, &busy);
	if (kr != KERN_SUCCESS) {
		goto out;
	}
	kr = reclaim_copyin_head(metadata, &head);
	if (kr != KERN_SUCCESS) {
		goto out;
	}
	kr = reclaim_copyin_tail(metadata, &tail);
	if (kr != KERN_SUCCESS) {
		goto out;
	}

	if (head != 0 || tail != 0 || busy != 0) {
		vmdr_log_error("indices were not zero-initialized\n");
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	/*
	 * Publish the metadata to the task & global buffer list. This must be
	 * done under the task lock to synchronize with task termination - i.e.
	 * task_terminate_internal is guaranteed to see the published metadata and
	 * tear it down.
	 */
	lck_mtx_lock(&reclaim_buffers_lock);
	task_lock(task);

	if (!task_is_active(task) || task_is_halting(task)) {
		vmdr_log_error(
			"failed to initialize buffer on dying task %s [%d]",
			task_best_name(task), task_pid(task));
		kr = KERN_ABORTED;
		goto fail_task;
	}
	if (task->deferred_reclamation_metadata != NULL) {
		vmdr_log_error(
			"tried to overwrite existing reclaim buffer for %s [%d]",
			task_best_name(task), task_pid(task));
		kr = VM_RECLAIM_RESOURCE_SHORTAGE;
		goto fail_task;
	}

	metadata->vdrm_is_registered = true;
	vmdr_list_append_locked(metadata);
	task->deferred_reclamation_metadata = metadata;

	task_unlock(task);
	lck_mtx_unlock(&reclaim_buffers_lock);

	vmdr_log_debug("%s [%d] allocated ring with capacity %u/%u\n",
	    task_best_name(task), task_pid(task),
	    len, max_len);
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END,
	    task_pid(task), KERN_SUCCESS, address);
	DTRACE_VM3(reclaim_ring_allocate,
	    mach_vm_address_t, address,
	    mach_vm_reclaim_count_t, len,
	    mach_vm_reclaim_count_t, max_len);
	return KERN_SUCCESS;

fail_task:
	task_unlock(task);
	lck_mtx_unlock(&reclaim_buffers_lock);

	tmp_kr = mach_vm_deallocate(map, *address_u, size_u);
	assert(tmp_kr == KERN_SUCCESS);

out:
	*address_u = vm_sanitize_wrap_addr(0ull);
	vmdr_metadata_release(metadata);
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END,
	    kr, 0);
	return kr;
}

#pragma mark Synchronization & Lifecycle

static inline void
vmdr_metadata_lock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_lock(&metadata->vdrm_lock);
}

static inline void
vmdr_metadata_unlock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_unlock(&metadata->vdrm_lock);
}

static inline void
vmdr_metadata_assert_owned_locked(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_gate_assert(&metadata->vdrm_lock, &metadata->vdrm_gate,
	    GATE_ASSERT_HELD);
}

static inline void
vmdr_metadata_assert_owned(vm_deferred_reclamation_metadata_t metadata)
{
#if MACH_ASSERT
	vmdr_metadata_lock(metadata);
	vmdr_metadata_assert_owned_locked(metadata);
	vmdr_metadata_unlock(metadata);
#else /* MACH_ASSERT */
	(void)metadata;
#endif /* MACH_ASSERT */
}

static bool
vmdr_metadata_try_own_locked(vm_deferred_reclamation_metadata_t metadata)
{
	kern_return_t kr = lck_mtx_gate_try_close(&metadata->vdrm_lock,
	    &metadata->vdrm_gate);
	return kr == KERN_SUCCESS;
}

/*
 * Take ownership of the buffer's gate. Blocks until ownership is handed
 * off unless RECLAIM_NO_WAIT is given. Returns true if ownership was
 * acquired.
 */
static bool
vmdr_metadata_own_locked(vm_deferred_reclamation_metadata_t metadata,
    vm_deferred_reclamation_options_t options)
{
	__assert_only gate_wait_result_t wait_result;
	if (!vmdr_metadata_try_own_locked(metadata)) {
		if (options & RECLAIM_NO_WAIT) {
			return false;
		}
		wait_result = lck_mtx_gate_wait(
			&metadata->vdrm_lock, &metadata->vdrm_gate, LCK_SLEEP_DEFAULT,
			THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
		assert(wait_result == GATE_HANDOFF);
	}
	return true;
}

/*
 * Set the current thread as the owner of a reclaim buffer. May block. Will
 * propagate priority.
 */
static void
vmdr_metadata_own(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_lock(metadata);
	vmdr_metadata_own_locked(metadata, RECLAIM_OPTIONS_NONE);
	vmdr_metadata_unlock(metadata);
}

static void
vmdr_metadata_disown_locked(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_assert_owned_locked(metadata);
	lck_mtx_gate_handoff(&metadata->vdrm_lock, &metadata->vdrm_gate,
	    GATE_HANDOFF_OPEN_IF_NO_WAITERS);
}

/*
 * Release ownership of a reclaim buffer and wake up any threads waiting for
 * ownership. Must be called from the thread that acquired ownership.
 */
static void
vmdr_metadata_disown(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_lock(metadata);
	vmdr_metadata_disown_locked(metadata);
	vmdr_metadata_unlock(metadata);
}
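
/*
 * Typical ownership pattern (illustrative sketch): a thread that wants to
 * operate on a ring closes the gate, works on the buffer with the mutex
 * dropped, and then hands the gate off:
 *
 *     vmdr_metadata_own(metadata);
 *     ... copy I/O / reclamation on the ring ...
 *     vmdr_metadata_disown(metadata);
 *
 * The gate, not the mutex, is what serializes reclamation, so the mutex
 * need not be held across the copy I/O.
 */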

static void
vmdr_metadata_retain(vm_deferred_reclamation_metadata_t metadata)
{
	os_ref_retain(&metadata->vdrm_refcnt);
}

static void
vmdr_metadata_release(vm_deferred_reclamation_metadata_t metadata)
{
	if (os_ref_release(&metadata->vdrm_refcnt) == 0) {
		vmdr_metadata_free(metadata);
	}
}

static void
vmdr_list_remove_locked(vm_deferred_reclamation_metadata_t metadata)
{
	LCK_MTX_ASSERT(&reclaim_buffers_lock, LCK_MTX_ASSERT_OWNED);
	assert3p(metadata->vdrm_list.tqe_prev, !=, NULL);
	TAILQ_REMOVE(&reclaim_buffers, metadata, vdrm_list);
	metadata->vdrm_list.tqe_prev = NULL;
	metadata->vdrm_list.tqe_next = NULL;
}

static void
vmdr_list_append_locked(vm_deferred_reclamation_metadata_t metadata)
{
	LCK_MTX_ASSERT(&reclaim_buffers_lock, LCK_MTX_ASSERT_OWNED);
	assert3p(metadata->vdrm_list.tqe_prev, ==, NULL);
	TAILQ_INSERT_TAIL(&reclaim_buffers, metadata, vdrm_list);
}

void
vm_deferred_reclamation_buffer_deallocate(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	/*
	 * First remove the buffer from the global list so no one else can get access to it.
	 */
	lck_mtx_lock(&reclaim_buffers_lock);
	if (metadata->vdrm_is_registered) {
		vmdr_list_remove_locked(metadata);
	}
	lck_mtx_unlock(&reclaim_buffers_lock);

	/*
	 * The task is dropping its ref on this buffer. First remove the buffer's
	 * back-reference to the task so that any threads currently operating on
	 * this buffer do not try to operate on the dead/dying task.
	 */
	vmdr_metadata_lock(metadata);
	assert3p(metadata->vdrm_task, !=, TASK_NULL);
	metadata->vdrm_task = TASK_NULL;
	vmdr_metadata_unlock(metadata);
	vmdr_metadata_release(metadata);
}

#pragma mark Exception Delivery

static void
reclaim_kill_with_reason(
	vm_deferred_reclamation_metadata_t metadata,
	unsigned reason,
	mach_exception_data_type_t subcode)
{
	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
	mach_exception_code_t code = 0;
	task_t task;
	proc_t p = NULL;
	boolean_t fatal = TRUE;
	bool killing_self;
	pid_t pid;
	int err;

	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_NOTOWNED);

	EXC_GUARD_ENCODE_TYPE(code, guard_type);
	EXC_GUARD_ENCODE_FLAVOR(code, reason);
	EXC_GUARD_ENCODE_TARGET(code, 0);

	vmdr_metadata_lock(metadata);
	task = metadata->vdrm_task;
	if (task == TASK_NULL || !task_is_active(task) || task_is_halting(task)) {
		/* Task is no longer alive */
		vmdr_metadata_unlock(metadata);
		vmdr_log_error(
			"Unable to deliver guard exception because task "
			"[%d] is already dead.\n",
			metadata->vdrm_pid);
		return;
	}

	if (panic_on_kill) {
		panic("About to kill %p due to %d with subcode %lld\n", task, reason, subcode);
	}

	killing_self = (task == current_task());
	if (!killing_self) {
		task_reference(task);
	}
	assert(task != kernel_task);
	vmdr_metadata_unlock(metadata);

	if (reason == kGUARD_EXC_DEALLOC_GAP) {
		task_lock(task);
		fatal = (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL);
		task_unlock(task);
	}

	if (!fatal) {
		vmdr_log_info(
			"Skipping non-fatal guard exception for %s [%d]\n",
			task_best_name(task), task_pid(task));
		goto out;
	}

	pid = task_pid(task);
	if (killing_self) {
		p = get_bsdtask_info(task);
	} else {
		p = proc_find(pid);
		if (p && proc_task(p) != task) {
			vmdr_log_error(
				"Unable to deliver guard exception because proc is gone & pid rolled over.\n");
			goto out;
		}
	}

	if (!p) {
		vmdr_log_error(
			"Unable to deliver guard exception because task does not have a proc.\n");
		goto out;
	}

	int flags = PX_DEBUG_NO_HONOR;
	exception_info_t info = {
		.os_reason = OS_REASON_GUARD,
		.exception_type = EXC_GUARD,
		.mx_code = code,
		.mx_subcode = subcode
	};

	vmdr_log("Force-exiting %s [%d]\n", task_best_name(task), task_pid(task));

	err = exit_with_mach_exception(p, info, flags);
	if (err != 0) {
		vmdr_log_error("Unable to deliver guard exception to %p: %d\n", p, err);
		goto out;
	}

out:
	if (!killing_self) {
		if (p) {
			proc_rele(p);
			p = NULL;
		}
		if (task) {
			task_deallocate(task);
			task = NULL;
		}
	}
}

#pragma mark Copy I/O

static user_addr_t
get_entries_ptr(vm_deferred_reclamation_metadata_t metadata)
{
	return metadata->vdrm_buffer_addr +
	       offsetof(struct mach_vm_reclaim_ring_s, entries);
}

static user_addr_t
get_indices_ptr(user_addr_t buffer_addr)
{
	return buffer_addr +
	       offsetof(struct mach_vm_reclaim_ring_s, indices);
}

static user_addr_t
get_head_ptr(user_addr_t indices)
{
	return indices + offsetof(struct mach_vm_reclaim_indices_s, head);
}

static user_addr_t
get_tail_ptr(user_addr_t indices)
{
	return indices + offsetof(struct mach_vm_reclaim_indices_s, tail);
}

static user_addr_t
get_busy_ptr(user_addr_t indices)
{
	return indices + offsetof(struct mach_vm_reclaim_indices_s, busy);
}

static kern_return_t
reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result)
{
	if (result != 0 && (result != EFAULT || !vm_fault_get_disabled())) {
		vmdr_log_error("Killing [%d] due to copy I/O error\n", metadata->vdrm_pid);
		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_COPYIO_FAILURE,
		    result);
	}
	return kern_return_for_errno(result);
}

/*
 * Helper functions to do copyio on the head, tail, and busy pointers.
 * Note that the kernel will only write to the busy and head pointers.
 * Userspace is not supposed to write to the head or busy pointers, but the kernel
 * must be resilient to that kind of bug in userspace.
 */
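
/*
 * Ring index sketch: the indices increase monotonically and are
 * interpreted modulo the ring length. Absent userspace bugs,
 * head <= busy <= tail, and the ranges mean:
 *
 *     [head, busy)  entries currently held busy by the kernel
 *     [busy, tail)  entries available to be reclaimed or cancelled
 *     [tail, ...)   free slots for userspace to fill
 */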

static kern_return_t
reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head)
{
	int result;
	kern_return_t kr;
	user_addr_t indices = get_indices_ptr(metadata->vdrm_buffer_addr);
	user_addr_t head_ptr = get_head_ptr(indices);

	result = copyin_atomic64(head_ptr, head);
	kr = reclaim_handle_copyio_error(metadata, result);
	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
		vmdr_log_error(
			"Unable to copy head ptr from 0x%llx: err=%d\n", head_ptr, result);
	}
	return kr;
}

static kern_return_t
reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail)
{
	int result;
	kern_return_t kr;
	user_addr_t indices = get_indices_ptr(metadata->vdrm_buffer_addr);
	user_addr_t tail_ptr = get_tail_ptr(indices);

	result = copyin_atomic64(tail_ptr, tail);
	kr = reclaim_handle_copyio_error(metadata, result);
	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
		vmdr_log_error(
			"Unable to copy tail ptr from 0x%llx: err=%d\n", tail_ptr, result);
	}
	return kr;
}

static kern_return_t
reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy)
{
	int result;
	kern_return_t kr;
	user_addr_t indices = get_indices_ptr(metadata->vdrm_buffer_addr);
	user_addr_t busy_ptr = get_busy_ptr(indices);

	result = copyin_atomic64(busy_ptr, busy);
	kr = reclaim_handle_copyio_error(metadata, result);
	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
		vmdr_log_error(
			"Unable to copy busy ptr from 0x%llx: err=%d\n", busy_ptr, result);
	}
	return kr;
}

static kern_return_t
reclaim_copyout_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	kern_return_t kr;
	user_addr_t indices = get_indices_ptr(metadata->vdrm_buffer_addr);
	user_addr_t busy_ptr = get_busy_ptr(indices);

	result = copyout_atomic64(value, busy_ptr);
	kr = reclaim_handle_copyio_error(metadata, result);
	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
		vmdr_log_error(
			"Unable to copy %llu to busy ptr at 0x%llx: err=%d\n", value, busy_ptr, result);
	}
	return kr;
}

static kern_return_t
reclaim_copyout_head(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	kern_return_t kr;
	user_addr_t indices = get_indices_ptr(metadata->vdrm_buffer_addr);
	user_addr_t head_ptr = get_head_ptr(indices);

	result = copyout_atomic64(value, head_ptr);
	kr = reclaim_handle_copyio_error(metadata, result);
	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
		vmdr_log_error(
			"Unable to copy %llu to head ptr at 0x%llx: err=%d\n", value, head_ptr, result);
	}
	return kr;
}

#pragma mark Reclamation

/*
 * @func reclaim_chunk
 *
 * @brief
 * Reclaim a batch of entries from the buffer.
 *
 * @param bytes_to_reclaim
 * Number of bytes the caller wishes to reclaim from the buffer.
 *
 * @param bytes_reclaimed_out
 * The number of bytes reclaimed from the buffer is written out here.
 *
 * @param chunk_size
 * The maximum number of entries to hold busy and reclaim (must
 * be <= kReclaimChunkSize).
 *
 * @param num_reclaimed_out
 * The number of entries reclaimed is written out here.
 *
 * @discussion
 * If the buffer has been exhausted of entries (tail == head),
 * num_reclaimed_out will be zero. It is important that the caller abort any
 * loops if such a condition is met.
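 *
 * Illustrative caller loop (this mirrors the drain and resize paths in
 * this file):
 *
 *     do {
 *         kr = reclaim_chunk(metadata, UINT64_MAX, NULL,
 *             kReclaimChunkSize, &num_reclaimed);
 *     } while (kr == KERN_SUCCESS && num_reclaimed > 0);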
 */
static kern_return_t
reclaim_chunk(vm_deferred_reclamation_metadata_t metadata,
    uint64_t bytes_to_reclaim, uint64_t *bytes_reclaimed_out,
    mach_vm_reclaim_count_t chunk_size, mach_vm_reclaim_count_t *num_reclaimed_out)
{
	kern_return_t kr = KERN_SUCCESS;
	int result = 0;
	mach_vm_reclaim_count_t num_reclaimed = 0, num_copied = 0;
	uint64_t bytes_reclaimed = 0;
	uint64_t head = 0, tail = 0, busy = 0, num_to_reclaim = 0, new_tail = 0;
	user_addr_t indices;
	vm_map_t map = metadata->vdrm_map;
	vm_map_switch_context_t switch_ctx;
	struct mach_vm_reclaim_entry_s copied_entries[kReclaimChunkSize];

	assert(metadata != NULL);
	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_NOTOWNED);
	vmdr_metadata_assert_owned(metadata);

	assert(chunk_size <= kReclaimChunkSize);

	KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_START,
	    metadata->vdrm_pid, bytes_to_reclaim);

	memset(copied_entries, 0, sizeof(copied_entries));

	indices = get_indices_ptr(metadata->vdrm_buffer_addr);
	switch_ctx = vm_map_switch_to(map);

	kr = reclaim_copyin_busy(metadata, &busy);
	if (kr != KERN_SUCCESS) {
		goto done;
	}
	kr = reclaim_copyin_head(metadata, &head);
	if (kr != KERN_SUCCESS) {
		goto done;
	}
	kr = reclaim_copyin_tail(metadata, &tail);
	if (kr != KERN_SUCCESS) {
		goto done;
	}

	/*
	 * NB: busy may not be exactly equal to head if the jetsam
	 * thread fails to fault on the indices after having marked
	 * entries busy
	 */
	if (busy < head || (busy - head) > kReclaimChunkSize) {
		vmdr_log_error(
			"Userspace modified head or busy pointer! head: %llu "
			"(0x%llx) | busy: %llu (0x%llx) | tail = %llu (0x%llx)\n",
			head, get_head_ptr(indices), busy, get_busy_ptr(indices), tail,
			get_tail_ptr(indices));
		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_INDEX_FAILURE,
		    busy);
		kr = KERN_FAILURE;
		goto done;
	}

	if (tail < head) {
		/*
		 * Userspace is likely in the middle of trying to re-use an entry,
		 * bail on this reclamation.
		 */
		vmdr_log_error(
			"Tail < head! Userspace is likely attempting a "
			"cancellation; aborting reclamation | head: %llu "
			"(0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
			head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy,
			get_busy_ptr(indices));
		kr = KERN_ABORTED;
		goto done;
	}

	/*
	 * NB: If any of the copyouts below fail due to faults being disabled,
	 * the buffer may be left in a state where several entries are unusable
	 * until the next reclamation (i.e. busy > head)
	 */
	num_to_reclaim = tail - head;
	while (true) {
		num_to_reclaim = MIN(num_to_reclaim, chunk_size);
		if (num_to_reclaim == 0) {
			break;
		}
		busy = head + num_to_reclaim;
		kr = reclaim_copyout_busy(metadata, busy);
		if (kr != KERN_SUCCESS) {
			goto done;
		}
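		/*
		 * Publish the busy update before re-reading the tail so that a
		 * concurrent userspace cancellation (which moves tail backward)
		 * is caught by the checks below.
		 */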
		os_atomic_thread_fence(seq_cst);
		kr = reclaim_copyin_tail(metadata, &new_tail);
		if (kr != KERN_SUCCESS) {
			goto done;
		}

		if (new_tail >= busy) {
			/* Got num_to_reclaim entries */
			break;
		}
		tail = new_tail;
		if (tail < head) {
			/*
			 * Userspace is likely in the middle of trying to re-use an entry,
			 * bail on this reclamation.
			 */
			vmdr_log_error(
				"Tail < head! Userspace is likely attempting a "
				"cancellation; aborting reclamation | head: %llu "
				"(0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
				head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy,
				get_busy_ptr(indices));
			/* Reset busy back to head */
			reclaim_copyout_busy(metadata, head);
			kr = KERN_ABORTED;
			goto done;
		}
		/* Can't reclaim these entries. Try again */
		num_to_reclaim = tail - head;
		if (num_to_reclaim == 0) {
			/* Nothing left to reclaim. Reset busy to head. */
			kr = reclaim_copyout_busy(metadata, head);
			if (kr != KERN_SUCCESS) {
				goto done;
			}
			break;
		}
		/*
		 * Note that num_to_reclaim must have gotten smaller since tail got smaller,
		 * so this is guaranteed to converge.
		 */
	}
	vmdr_log_debug("[%d] reclaiming up to %llu entries (budget %llu B) head=%llu "
	    "busy=%llu tail=%llu len=%u", metadata->vdrm_pid, num_to_reclaim,
	    bytes_to_reclaim, head, busy, tail, metadata->vdrm_buffer_len);

	uint64_t memcpy_start_idx = head % metadata->vdrm_buffer_len;
	while (num_copied < num_to_reclaim) {
		uint64_t memcpy_end_idx = memcpy_start_idx + num_to_reclaim - num_copied;
		/* Clamp the end idx to the buffer. We'll handle wrap-around in our next go around the loop. */
		memcpy_end_idx = MIN(memcpy_end_idx, metadata->vdrm_buffer_len);
		uint64_t num_to_copy = memcpy_end_idx - memcpy_start_idx;

		assert(num_to_copy + num_copied <= kReclaimChunkSize);
		user_addr_t src_ptr = get_entries_ptr(metadata) +
		    (memcpy_start_idx * sizeof(struct mach_vm_reclaim_entry_s));
		struct mach_vm_reclaim_entry_s *dst_ptr = copied_entries + num_copied;
		result = copyin(src_ptr, dst_ptr,
		    (num_to_copy * sizeof(struct mach_vm_reclaim_entry_s)));
		kr = reclaim_handle_copyio_error(metadata, result);
		if (kr != KERN_SUCCESS) {
			if (kr != KERN_MEMORY_ERROR || !vm_fault_get_disabled()) {
				vmdr_log_error(
					"Unable to copyin %llu entries in reclaim "
					"buffer at 0x%llx to 0x%llx: err=%d\n",
					num_to_copy, src_ptr, (uint64_t)dst_ptr, result);
			}
			goto done;
		}

		num_copied += num_to_copy;
		memcpy_start_idx = (memcpy_start_idx + num_to_copy) % metadata->vdrm_buffer_len;
	}

	for (num_reclaimed = 0; num_reclaimed < num_to_reclaim && bytes_reclaimed < bytes_to_reclaim; num_reclaimed++) {
		mach_vm_reclaim_entry_t entry = &copied_entries[num_reclaimed];
		KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_START,
		    metadata->vdrm_pid, entry->address, entry->size,
		    entry->behavior);
		if (entry->address != 0 && entry->size != 0) {
			vm_map_address_t start = vm_map_trunc_page(entry->address,
			    VM_MAP_PAGE_MASK(map));
			vm_map_address_t end = vm_map_round_page(entry->address + entry->size,
			    VM_MAP_PAGE_MASK(map));
			DTRACE_VM4(vm_reclaim_entry,
			    pid_t, metadata->vdrm_pid,
			    mach_vm_address_t, entry->address,
			    mach_vm_address_t, end,
			    mach_vm_reclaim_action_t, entry->behavior);
			KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_START,
			    metadata->vdrm_pid, start, end,
			    entry->behavior);
			vmdr_log_debug("[%d] Reclaiming entry %llu (0x%llx, 0x%llx)\n", metadata->vdrm_pid, head + num_reclaimed, start, end);
			switch (entry->behavior) {
			case VM_RECLAIM_DEALLOCATE:
				kr = vm_map_remove_guard(map,
				    start, end, VM_MAP_REMOVE_GAPS_FAIL,
				    KMEM_GUARD_NONE).kmr_return;
				if (kr == KERN_INVALID_VALUE) {
					vmdr_log_error(
						"[%d] Killing due to virtual-memory guard at (0x%llx, 0x%llx)\n",
						metadata->vdrm_pid, start, end);
					reclaim_kill_with_reason(metadata, kGUARD_EXC_DEALLOC_GAP, entry->address);
					goto done;
				} else if (kr != KERN_SUCCESS) {
					vmdr_log_error(
						"[%d] Killing due to deallocation failure at (0x%llx, 0x%llx) err=%d\n",
						metadata->vdrm_pid, start, end, kr);
					reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
					goto done;
				}
				break;
			case VM_RECLAIM_FREE:
				/*
				 * TODO: This should free the backing pages directly instead of using
				 * VM_BEHAVIOR_REUSABLE, which will mark the pages as clean and let them
				 * age in the LRU.
				 */
				kr = vm_map_behavior_set(map, start,
				    end, VM_BEHAVIOR_REUSABLE);
				if (kr != KERN_SUCCESS) {
					vmdr_log_error(
						"[%d] Failed to free(reusable) (0x%llx, 0x%llx) err=%d\n",
						metadata->vdrm_pid, start, end, kr);
				}
				break;
			default:
				vmdr_log_error(
					"attempted to reclaim entry with unsupported behavior %u",
					entry->behavior);
				kr = KERN_INVALID_VALUE;
				reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
				goto done;
			}
			bytes_reclaimed += entry->size;
			KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_END,
			    kr);
		}
	}

	assert(head + num_reclaimed <= busy);
	head += num_reclaimed;
	kr = reclaim_copyout_head(metadata, head);
	if (kr != KERN_SUCCESS) {
		goto done;
	}
	if (busy > head) {
		busy = head;
		kr = reclaim_copyout_busy(metadata, busy);
		if (kr != KERN_SUCCESS) {
			goto done;
		}
	}

done:
	vmdr_log_debug("[%d] reclaimed %u entries (%llu B) head=%llu "
	    "busy=%llu tail=%llu len=%u", metadata->vdrm_pid, num_reclaimed,
	    bytes_reclaimed, head, busy, tail, metadata->vdrm_buffer_len);
	vm_map_switch_back(switch_ctx);
	KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_END,
	    bytes_reclaimed, num_reclaimed, kr);
	if (bytes_reclaimed_out) {
		*bytes_reclaimed_out = bytes_reclaimed;
	}
	if (num_reclaimed_out) {
		*num_reclaimed_out = num_reclaimed;
	}
	return kr;
}

/*
 * @func vmdr_reclaim_from_buffer
 *
 * @brief
 * Reclaim entries from the buffer until at least @c bytes_to_reclaim bytes
 * have been reclaimed or the buffer is exhausted.
 *
 * @param bytes_to_reclaim
 * The minimum number of bytes to reclaim
 *
 * @param num_bytes_reclaimed_out
 * The number of bytes reclaimed is written out here.
 *
 * @param options
 * If RECLAIM_NO_FAULT is set, do not fault on the buffer if it has been paged
 * out.
 *
 * @discussion
 * The buffer must be owned by the caller.
 */
static kern_return_t
vmdr_reclaim_from_buffer(vm_deferred_reclamation_metadata_t metadata,
    size_t bytes_to_reclaim, size_t *num_bytes_reclaimed_out,
    vm_deferred_reclamation_options_t options)
{
	kern_return_t kr = KERN_SUCCESS;

	if (options & RECLAIM_NO_FAULT) {
		vm_fault_disable();
	}

	size_t total_bytes_reclaimed = 0;
	while (total_bytes_reclaimed < bytes_to_reclaim) {
		uint64_t cur_bytes_reclaimed;
		mach_vm_reclaim_count_t entries_reclaimed;
		kr = reclaim_chunk(metadata, bytes_to_reclaim - total_bytes_reclaimed,
		    &cur_bytes_reclaimed, kReclaimChunkSize, &entries_reclaimed);
		total_bytes_reclaimed += cur_bytes_reclaimed;
		if (entries_reclaimed == 0 || kr != KERN_SUCCESS) {
			break;
		}
	}

	if (options & RECLAIM_NO_FAULT) {
		vm_fault_enable();
	}
	vmdr_log_debug("reclaimed %lu B / %lu B from %d\n", total_bytes_reclaimed, bytes_to_reclaim, metadata->vdrm_pid);
	if (num_bytes_reclaimed_out) {
		*num_bytes_reclaimed_out = total_bytes_reclaimed;
	}
	return kr;
}

/*
 * Get the reclamation metadata buffer for the given task.
 */
static vm_deferred_reclamation_metadata_t
get_task_reclaim_metadata(task_t task)
{
	assert(task != NULL);
	vm_deferred_reclamation_metadata_t metadata = NULL;
	task_lock(task);
	metadata = task->deferred_reclamation_metadata;
	task_unlock(task);
	return metadata;
}

#pragma mark Buffer Resize/Synchronization

kern_return_t
vm_deferred_reclamation_buffer_flush_internal(task_t task,
    mach_vm_reclaim_count_t num_entries_to_reclaim)
{
	kern_return_t kr;
	vm_deferred_reclamation_metadata_t metadata = NULL;
	mach_vm_reclaim_count_t total_reclaimed = 0;
	uint64_t bytes_reclaimed = 0;

	if (!task_is_active(task)) {
		return KERN_INVALID_TASK;
	}

	metadata = get_task_reclaim_metadata(task);
	if (metadata == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	vmdr_metadata_own(metadata);

	vmdr_log_debug("[%d] flushing %u entries\n", task_pid(task), num_entries_to_reclaim);
	KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_FLUSH) | DBG_FUNC_START, metadata->vdrm_pid, num_entries_to_reclaim);

	while (total_reclaimed < num_entries_to_reclaim) {
		mach_vm_reclaim_count_t cur_reclaimed;
		uint64_t cur_bytes_reclaimed;
		mach_vm_reclaim_count_t chunk_size = MIN(num_entries_to_reclaim - total_reclaimed, kReclaimChunkSize);
		kr = reclaim_chunk(metadata, UINT64_MAX, &cur_bytes_reclaimed, chunk_size,
		    &cur_reclaimed);
		total_reclaimed += cur_reclaimed;
		bytes_reclaimed += cur_bytes_reclaimed;
		if (kr == KERN_ABORTED) {
			/*
			 * Unable to reclaim due to a lost race with
			 * userspace; yield the gate and try again.
			 */
			vmdr_metadata_disown(metadata);
			vmdr_metadata_own(metadata);
			continue;
		} else if (kr != KERN_SUCCESS) {
			break;
		} else if (cur_reclaimed == 0) {
			break;
		}
	}

	vmdr_metadata_lock(metadata);
	metadata->vdrm_cumulative_reclaimed_bytes += bytes_reclaimed;
	vmdr_metadata_disown_locked(metadata);
	vmdr_metadata_unlock(metadata);

	KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_FLUSH) | DBG_FUNC_END, kr, total_reclaimed, bytes_reclaimed);
	DTRACE_VM2(reclaim_flush,
	    mach_vm_reclaim_count_t, num_entries_to_reclaim,
	    size_t, bytes_reclaimed);
	return kr;
}

kern_return_t
vm_deferred_reclamation_buffer_resize_internal(
	task_t task,
	mach_vm_reclaim_count_t len)
{
	kern_return_t kr;
	mach_vm_reclaim_count_t num_entries_reclaimed = 0;
	mach_vm_reclaim_count_t old_len;

	if (task == TASK_NULL) {
		return KERN_INVALID_TASK;
	}
	if (len == 0) {
		return KERN_INVALID_ARGUMENT;
	}
	vm_deferred_reclamation_metadata_t metadata = get_task_reclaim_metadata(task);
	if (metadata == NULL) {
		return KERN_INVALID_TASK;
	}

	/* The size must be a multiple of the page size */
	vm_map_t map = task->map;
	mach_vm_size_t new_size = vmdr_round_len_to_size(map, len);
	if (new_size == 0) {
		return KERN_INVALID_ARGUMENT;
	}
	if (new_size > metadata->vdrm_buffer_size) {
		return KERN_NO_SPACE;
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_RESIZE) | DBG_FUNC_START,
	    task_pid(task), new_size);

	/*
	 * Prevent other threads from operating on this buffer while it is
	 * resized. It is the caller's responsibility to ensure mutual
	 * exclusion with other user threads.
	 */
	vmdr_metadata_own(metadata);

	old_len = metadata->vdrm_buffer_len;

	vmdr_log_debug("%s [%d] resizing buffer %u -> %u entries\n",
	    task_best_name(task), task_pid(task), old_len, len);

	/*
	 * Reclaim all the entries currently in the buffer to prevent re-use
	 * of old reclaim ids that would alias differently into the newly sized
	 * buffer.
	 *
	 * TODO: Consider encoding the ringbuffer capacity in the
	 * mach_vm_reclaim_id_t, so re-uses can still find objects after a resize.
	 */
	do {
		kr = reclaim_chunk(metadata, UINT64_MAX, NULL, kReclaimChunkSize,
		    &num_entries_reclaimed);
		if (kr != KERN_SUCCESS) {
			goto fail;
		}
	} while (num_entries_reclaimed > 0);

	/* Publish the new ring length in the kernel metadata */
	vmdr_metadata_lock(metadata);
	metadata->vdrm_buffer_len = len;
	vmdr_metadata_disown_locked(metadata);
	vmdr_metadata_unlock(metadata);

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_RESIZE) | DBG_FUNC_END, KERN_SUCCESS, num_entries_reclaimed);
	DTRACE_VM2(reclaim_ring_resize,
	    mach_vm_reclaim_count_t, old_len,
	    mach_vm_reclaim_count_t, len);
	return KERN_SUCCESS;

fail:
	vmdr_metadata_disown(metadata);
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_RESIZE) | DBG_FUNC_END, kr, num_entries_reclaimed);
	return kr;
}

#pragma mark Accounting

#if CONFIG_WORKING_SET_ESTIMATION
extern vm_pressure_level_t memorystatus_vm_pressure_level;

static uint64_t
vmdr_metadata_autotrim_threshold(vm_deferred_reclamation_metadata_t metadata)
{
	__assert_only kern_return_t kr;
	uint32_t autotrim_pct;

	/*
	 * Determine the autotrim threshold based on the current pressure level
	 */
	vm_pressure_level_t pressure_level = os_atomic_load(&memorystatus_vm_pressure_level, relaxed);
	switch (pressure_level) {
	case kVMPressureNormal:
		autotrim_pct = vm_reclaim_autotrim_pct_normal;
		break;
	case kVMPressureWarning:
	case kVMPressureUrgent:
		autotrim_pct = vm_reclaim_autotrim_pct_pressure;
		break;
	case kVMPressureCritical:
		autotrim_pct = vm_reclaim_autotrim_pct_critical;
		break;
	default:
		panic("vm_reclaim: unexpected vm_pressure_level %d", pressure_level);
	}

	/*
	 * Estimate the task's maximum working set size
	 */
	ledger_amount_t phys_footprint_max = 0;
	kr = ledger_get_lifetime_max(metadata->vdrm_task->ledger,
	    task_ledgers.phys_footprint, &phys_footprint_max);
	assert3u(kr, ==, KERN_SUCCESS);

	return phys_footprint_max * autotrim_pct / 100;
}
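
/*
 * Worked example (illustrative numbers): with a lifetime-max phys
 * footprint of 500 MiB and vm_reclaim_autotrim_pct_normal = 10, a ring
 * is considered worth trimming under normal memory pressure only once
 * roughly 50 MiB of it is estimated reclaimable.
 */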

#define VMDR_WMA_UNIT (1 << 8)
#define VMDR_WMA_MIX(base, e) ((vm_reclaim_wma_weight_base * (base) + (e) * VMDR_WMA_UNIT * vm_reclaim_wma_weight_cur) / vm_reclaim_wma_denom)
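
/*
 * The average is kept scaled by VMDR_WMA_UNIT (256) so that integer math
 * retains fractional precision. With the default weights (base = 3,
 * cur = 1, denom = 4) each sampling period computes:
 *
 *     wma' = (3 * wma + 256 * min) / 4
 *
 * Worked example (illustrative): starting from wma = 0 with a steady
 * minimum of 1 MiB, the unscaled estimate wma / VMDR_WMA_UNIT approaches
 * 1 MiB as 0.25 MiB -> 0.44 MiB -> 0.58 MiB -> ..., closing ~25% of the
 * remaining gap each period.
 */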

static size_t
vmdr_metadata_reset_min_bytes(vm_deferred_reclamation_metadata_t metadata)
{
	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED);
	metadata->vdrm_reclaimable_bytes_min =
	    metadata->vdrm_cumulative_uncancelled_bytes -
	    metadata->vdrm_cumulative_reclaimed_bytes;
	return metadata->vdrm_reclaimable_bytes_min;
}

/*
 * @func vmdr_sample_working_set
 *
 * @brief
 * Sample the working set size of the given buffer.
 *
 * @param metadata
 * The reclaim buffer to sample
 *
 * @param trim_threshold_out
 * If the buffer should be trimmed, the amount to trim (in bytes) will be
 * written out
 *
 * @returns true iff the buffer should be trimmed
 *
 * @discussion
 * The caller must hold the buffer locked.
 */
static bool
vmdr_sample_working_set(vm_deferred_reclamation_metadata_t metadata,
    size_t *trim_threshold_out)
{
	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED);

	uint64_t now = mach_absolute_time();
	if (now - metadata->vdrm_last_sample_abs < vm_reclaim_sampling_period_abs) {
		/* A sampling period has not elapsed */
		return false;
	}

	size_t estimated_reclaimable_bytes;
	uint64_t samples_elapsed = (now - metadata->vdrm_last_sample_abs) /
	    vm_reclaim_sampling_period_abs;

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_SAMPLE) | DBG_FUNC_START,
	    metadata->vdrm_pid,
	    now,
	    metadata->vdrm_last_sample_abs,
	    metadata->vdrm_reclaimable_bytes_min);

	if (samples_elapsed > vm_reclaim_abandonment_threshold) {
		/*
		 * Many sampling periods have elapsed since the ring was
		 * last sampled. Don't bother computing the WMA and assume
		 * the buffer's current contents are unneeded.
		 */
		estimated_reclaimable_bytes =
		    metadata->vdrm_cumulative_uncancelled_bytes -
		    metadata->vdrm_cumulative_reclaimed_bytes;
		metadata->vdrm_reclaimable_bytes_min = estimated_reclaimable_bytes;
		/* The WMA is stored scaled by VMDR_WMA_UNIT (see VMDR_WMA_MIX) */
		metadata->vdrm_reclaimable_bytes_wma = estimated_reclaimable_bytes * VMDR_WMA_UNIT;
	} else {
		/*
		 * Compute an exponential moving average of the minimum amount of reclaimable
		 * memory in this buffer. Multiple sampling periods may have elapsed
		 * since the last sample. By definition, the minimum must be the same for
		 * all elapsed periods (otherwise libmalloc would have called down to
		 * update accounting)
		 */
		for (unsigned int i = 0; i < samples_elapsed; i++) {
			metadata->vdrm_reclaimable_bytes_wma = VMDR_WMA_MIX(
				metadata->vdrm_reclaimable_bytes_wma,
				metadata->vdrm_reclaimable_bytes_min);
		}

		/* Reset the minimum to start a new sampling interval */
		estimated_reclaimable_bytes = vmdr_metadata_reset_min_bytes(metadata);
	}

	metadata->vdrm_last_sample_abs = now;

	size_t trim_threshold_bytes = MIN(metadata->vdrm_reclaimable_bytes_min,
	    metadata->vdrm_reclaimable_bytes_wma / VMDR_WMA_UNIT);
	size_t autotrim_threshold = vmdr_metadata_autotrim_threshold(metadata);

	bool trim_needed = trim_threshold_bytes >= vm_map_page_size(metadata->vdrm_map) &&
	    trim_threshold_bytes >= autotrim_threshold;

	*trim_threshold_out = vm_map_round_page(trim_threshold_bytes,
	    vm_map_page_mask(metadata->vdrm_map));

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_SAMPLE) | DBG_FUNC_END,
	    *trim_threshold_out,
	    trim_needed,
	    estimated_reclaimable_bytes);
	DTRACE_VM5(reclaim_sample,
	    pid_t, metadata->vdrm_pid,
	    uint64_t, metadata->vdrm_reclaimable_bytes_wma,
	    size_t, metadata->vdrm_reclaimable_bytes_min,
	    size_t, estimated_reclaimable_bytes,
	    size_t, *trim_threshold_out);
	vmdr_log_debug("sampled buffer with min %lu est %lu trim %lu wma %llu\n",
	    metadata->vdrm_reclaimable_bytes_min,
	    estimated_reclaimable_bytes,
	    trim_threshold_bytes,
	    metadata->vdrm_reclaimable_bytes_wma / VMDR_WMA_UNIT);

	return trim_needed;
}
#endif /* CONFIG_WORKING_SET_ESTIMATION */

/*
 * Caller must have the buffer owned and unlocked.
 */
static kern_return_t
vmdr_trim(vm_deferred_reclamation_metadata_t metadata, size_t bytes_to_reclaim,
    size_t *bytes_reclaimed, vm_deferred_reclamation_options_t options)
{
	kern_return_t kr;
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_TRIM) | DBG_FUNC_START,
	    metadata->vdrm_pid, bytes_to_reclaim);

	kr = vmdr_reclaim_from_buffer(metadata, bytes_to_reclaim,
	    bytes_reclaimed, options);

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_TRIM) | DBG_FUNC_END, kr, *bytes_reclaimed);
	DTRACE_VM3(reclaim_trim,
	    pid_t, metadata->vdrm_pid,
	    size_t, bytes_to_reclaim,
	    size_t, *bytes_reclaimed);
	return kr;
}

/*
 * Caller must have the buffer owned and unlocked.
 */
static kern_return_t
vmdr_drain(vm_deferred_reclamation_metadata_t metadata, size_t *bytes_reclaimed,
    vm_deferred_reclamation_options_t options)
{
	kern_return_t kr;
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_DRAIN) | DBG_FUNC_START,
	    metadata->vdrm_pid);

	kr = vmdr_reclaim_from_buffer(metadata, UINT64_MAX,
	    bytes_reclaimed, options);

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_DRAIN) | DBG_FUNC_END, kr, *bytes_reclaimed);
	DTRACE_VM2(reclaim_drain,
	    pid_t, metadata->vdrm_pid,
	    size_t, *bytes_reclaimed);
	return kr;
}

1557 kern_return_t
1558 vm_deferred_reclamation_buffer_update_reclaimable_bytes_internal(task_t task, uint64_t bytes_placed_in_buffer)
1559 {
1560 vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
1561 size_t estimated_reclaimable_bytes, bytes_to_reclaim, bytes_reclaimed = 0;
1562 kern_return_t kr = KERN_SUCCESS;
1563 if (metadata == NULL) {
1564 return KERN_INVALID_ARGUMENT;
1565 }
1566
1567 KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_START,
1568 metadata->vdrm_pid, bytes_placed_in_buffer);
1569
1570 vmdr_metadata_lock(metadata);
1571
1572 if (!metadata->vdrm_pid) {
1573 /* If this is a forked child, we may not yet have a pid */
1574 metadata->vdrm_pid = task_pid(task);
1575 }
1576
1577 /*
1578 * The client is allowed to make this call in parallel from multiple threads.
1579 * It's possible that, while we were waiting for the lock, another
1580 * thread updated accounting with a larger/newer uncancelled_bytes
1581 * value that resulted in a reclaim. We can't provide strict ordering
1582 * with the current implementation, but we can at least detect very
1583 * erroneous stale values that would result in the uncancelled-byte
1584 * count being less than the reclaimed-byte-count (which cannot be
1585 * accurate).
1586 *
1587 * TODO: Consider making this a try_copyin of the userspace value
1588 * under the mutex to ensure ordering/consistency (rdar://137607771)
1589 */
1590 if (bytes_placed_in_buffer < metadata->vdrm_cumulative_reclaimed_bytes) {
1591 goto done;
1592 }
1593
1594 metadata->vdrm_cumulative_uncancelled_bytes = bytes_placed_in_buffer;
1595 estimated_reclaimable_bytes = bytes_placed_in_buffer - metadata->vdrm_cumulative_reclaimed_bytes;
1596 #if CONFIG_WORKING_SET_ESTIMATION
1597 bool should_reclaim = vmdr_sample_working_set(metadata, &bytes_to_reclaim);
1598 if (should_reclaim) {
1599 vmdr_metadata_own_locked(metadata, RECLAIM_OPTIONS_NONE);
1600 lck_mtx_unlock(&metadata->vdrm_lock);
1601 vmdr_log_debug("trimming pid %d\n", metadata->vdrm_pid);
1602
1603 kr = vmdr_trim(metadata, bytes_to_reclaim, &bytes_reclaimed, RECLAIM_OPTIONS_NONE);
1604
1605 vmdr_metadata_lock(metadata);
1606 metadata->vdrm_cumulative_reclaimed_bytes += bytes_reclaimed;
1607 /* Reset the current minimum now that the buffer has been trimmed down */
1608 vmdr_metadata_reset_min_bytes(metadata);
1609 vmdr_metadata_disown_locked(metadata);
1610 if (kr == KERN_ABORTED) {
			/*
			 * We lost a race with userspace and could not complete
			 * the trim. This need not be fatal because the
			 * accounting was still updated successfully.
			 */
1616 kr = KERN_SUCCESS;
1617 }
1618 } else {
1619 /* Update the minimum for the current sampling period */
1620 metadata->vdrm_reclaimable_bytes_min = MIN(metadata->vdrm_reclaimable_bytes_min, estimated_reclaimable_bytes);
1621 }
1622 #else /* !CONFIG_WORKING_SET_ESTIMATION */
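	/*
	 * Worked example (hypothetical values): with vm_reclaim_max_threshold
	 * at 64MB and 80MB of estimated reclaimable memory, the 16MB excess
	 * above the threshold is trimmed, bringing the buffer back down to
	 * the threshold.
	 */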
	if (estimated_reclaimable_bytes > vm_reclaim_max_threshold) {
		bytes_to_reclaim = estimated_reclaimable_bytes - vm_reclaim_max_threshold;
1625 vmdr_metadata_own_locked(metadata, RECLAIM_OPTIONS_NONE);
1626 vmdr_metadata_unlock(metadata);
1627 kr = vmdr_trim(metadata, bytes_to_reclaim, &bytes_reclaimed, RECLAIM_OPTIONS_NONE);
1628 vmdr_metadata_lock(metadata);
1629 metadata->vdrm_cumulative_reclaimed_bytes += bytes_reclaimed;
1630 vmdr_metadata_disown_locked(metadata);
1631 if (kr == KERN_ABORTED) {
			/*
			 * We lost a race with userspace and could not complete
			 * the trim. This need not be fatal because the
			 * accounting was still updated successfully.
			 */
1637 kr = KERN_SUCCESS;
1638 }
1639 }
1640 #endif /* CONFIG_WORKING_SET_ESTIMATION */
1641
1642 done:
1643 KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_END,
1644 metadata->vdrm_cumulative_uncancelled_bytes,
1645 metadata->vdrm_cumulative_reclaimed_bytes,
1646 bytes_reclaimed);
1647 vmdr_metadata_unlock(metadata);
1648 return kr;
1649 }
1650
1651 kern_return_t
1652 vm_deferred_reclamation_task_drain(task_t task,
1653 vm_deferred_reclamation_options_t options)
1654 {
1655 kern_return_t kr;
	size_t bytes_reclaimed = 0;
1657
1658 task_lock(task);
1659 if (!task_is_active(task) || task_is_halting(task)) {
1660 task_unlock(task);
1661 return KERN_ABORTED;
1662 }
1663 vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
1664 if (metadata == NULL) {
1665 task_unlock(task);
1666 return KERN_SUCCESS;
1667 }
1668 vmdr_metadata_retain(metadata);
1669 task_unlock(task);
1670
1671 vmdr_metadata_own(metadata);
1672
1673 kr = vmdr_drain(metadata, &bytes_reclaimed, options);
1674
1675 vmdr_metadata_lock(metadata);
1676 metadata->vdrm_cumulative_reclaimed_bytes += bytes_reclaimed;
1677 vmdr_metadata_disown_locked(metadata);
1678 vmdr_metadata_unlock(metadata);
1679
1680 vmdr_metadata_release(metadata);
1681 return kr;
1682 }
1683
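/*
 * Called when a task is being suspended. The buffer is not drained
 * inline; the scavenger thread is woken so that its next
 * RECLAIM_GC_SCAVENGE pass can fully drain buffers whose owning tasks
 * are app-suspended (see vmdr_garbage_collect()).
 */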
1684 void
1685 vm_deferred_reclamation_task_suspend(task_t task)
1686 {
1687 if (task->deferred_reclamation_metadata) {
1688 sched_cond_signal(&vm_reclaim_scavenger_cond, vm_reclaim_scavenger_thread);
1689 }
1690 }
1691
1692 #pragma mark KPIs
1693
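/*
 * Create reclamation metadata for a forked child. The child's address
 * space is a copy of the parent's, so it inherits the parent's ring
 * geometry and accounting snapshot; the caller must own the parent's
 * buffer so that snapshot is stable across the copy. The child's pid is
 * filled in lazily on its first accounting update (see
 * vm_deferred_reclamation_buffer_update_reclaimable_bytes_internal()).
 */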
1694 vm_deferred_reclamation_metadata_t
1695 vm_deferred_reclamation_task_fork(task_t task, vm_deferred_reclamation_metadata_t parent)
1696 {
1697 vm_deferred_reclamation_metadata_t metadata = NULL;
1698 vmdr_metadata_assert_owned(parent);
1699
1700 assert(task->deferred_reclamation_metadata == NULL);
1701 metadata = vmdr_metadata_alloc(task, parent->vdrm_buffer_addr,
1702 parent->vdrm_buffer_size, parent->vdrm_buffer_len);
1703
1704 metadata->vdrm_cumulative_reclaimed_bytes = parent->vdrm_cumulative_reclaimed_bytes;
1705 metadata->vdrm_cumulative_uncancelled_bytes = parent->vdrm_cumulative_uncancelled_bytes;
1706 #if CONFIG_WORKING_SET_ESTIMATION
1707 metadata->vdrm_reclaimable_bytes_min = parent->vdrm_reclaimable_bytes_min;
1708 metadata->vdrm_reclaimable_bytes_wma = parent->vdrm_reclaimable_bytes_wma;
1709 metadata->vdrm_last_sample_abs = parent->vdrm_last_sample_abs;
1710 #endif /* CONFIG_WORKING_SET_ESTIMATION */
1711
1712 return metadata;
1713 }
1714
1715 void
1716 vm_deferred_reclamation_task_fork_register(vm_deferred_reclamation_metadata_t metadata)
1717 {
1718 assert(metadata != NULL);
1719 assert(!metadata->vdrm_is_registered);
1720
1721 lck_mtx_lock(&reclaim_buffers_lock);
1722 metadata->vdrm_is_registered = true;
1723 vmdr_list_append_locked(metadata);
1724 lck_mtx_unlock(&reclaim_buffers_lock);
1725 }
1726
1727 bool
1728 vm_deferred_reclamation_task_has_ring(task_t task)
1729 {
1730 return task->deferred_reclamation_metadata != NULL;
1731 }
1732
1733 void
1734 vm_deferred_reclamation_ring_own(vm_deferred_reclamation_metadata_t metadata)
1735 {
1736 vmdr_metadata_own(metadata);
1737 }
1738
1739 void
1740 vm_deferred_reclamation_ring_disown(vm_deferred_reclamation_metadata_t metadata)
1741 {
1742 vmdr_metadata_disown(metadata);
1743 }
1744
1745 void
1746 vm_deferred_reclamation_gc(vm_deferred_reclamation_gc_action_t action, vm_deferred_reclamation_options_t options)
1747 {
1748 vmdr_garbage_collect(action, options);
1749 }
1750
1751 #pragma mark Global Reclamation GC
1752
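/*
 * Only one GC pass may run at a time; contenders either bail out
 * (RECLAIM_NO_WAIT) or wait on the gate for a handoff. A pass walks
 * reclaim_buffers by popping the head, rotating it to the tail, and
 * stamping it with the current epoch; re-encountering a stamped buffer
 * means the whole list has been visited. Rotating keeps the list in
 * least-recently-reclaimed order and lets the pass drop
 * reclaim_buffers_lock while it operates on each buffer.
 */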
1753 static void
1754 vmdr_garbage_collect(vm_deferred_reclamation_gc_action_t action, vm_deferred_reclamation_options_t options)
1755 {
1756 kern_return_t kr;
1757 size_t bytes_reclaimed, bytes_to_reclaim;
1758 bool should_reclaim;
1759 gate_wait_result_t wr;
1760
1761 #if !CONFIG_WORKING_SET_ESTIMATION
1762 if (action == RECLAIM_GC_TRIM) {
1763 /* GC_TRIM is a no-op without working set estimation */
1764 return;
1765 }
1766 #endif /* !CONFIG_WORKING_SET_ESTIMATION */
1767
1768 lck_mtx_lock(&reclaim_buffers_lock);
1769 kr = lck_mtx_gate_try_close(&reclaim_buffers_lock, &vm_reclaim_gc_gate);
1770 if (kr != KERN_SUCCESS) {
1771 if (options & RECLAIM_NO_WAIT) {
1772 lck_mtx_unlock(&reclaim_buffers_lock);
1773 return;
1774 }
1775 wr = lck_mtx_gate_wait(&reclaim_buffers_lock, &vm_reclaim_gc_gate, LCK_SLEEP_DEFAULT, THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
1776 assert3u(wr, ==, GATE_HANDOFF);
1777 }
1778
1779 vm_reclaim_gc_epoch++;
1780 vmdr_log_debug("running global GC\n");
	while (true) {
		/* Reset per buffer so a previous iteration's count is not re-counted below */
		bytes_reclaimed = 0;
		vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&reclaim_buffers);
1783 if (metadata == NULL) {
1784 break;
1785 }
1786 vmdr_list_remove_locked(metadata);
1787 vmdr_list_append_locked(metadata);
1788 vmdr_metadata_retain(metadata);
1789 lck_mtx_unlock(&reclaim_buffers_lock);
1790
1791 vmdr_metadata_lock(metadata);
1792
1793 if (metadata->vdrm_reclaimed_at >= vm_reclaim_gc_epoch) {
			/* We've already visited this buffer; the pass is complete. */
1795 vmdr_metadata_unlock(metadata);
1796 vmdr_metadata_release(metadata);
1797 lck_mtx_lock(&reclaim_buffers_lock);
1798 break;
1799 }
1800 metadata->vdrm_reclaimed_at = vm_reclaim_gc_epoch;
1801
1802 task_t task = metadata->vdrm_task;
1803 if (task == TASK_NULL ||
1804 !task_is_active(task) ||
1805 task_is_halting(task)) {
1806 goto next;
1807 }
1808 bool buffer_is_suspended = task_is_app_suspended(task);
1809 task = TASK_NULL;
1810
1811 switch (action) {
1812 case RECLAIM_GC_DRAIN:
1813 if (!vmdr_metadata_own_locked(metadata, options)) {
1814 goto next;
1815 }
1816 vmdr_metadata_unlock(metadata);
1817 vmdr_drain(metadata, &bytes_reclaimed, options);
1818 vmdr_metadata_lock(metadata);
1819 vmdr_metadata_disown_locked(metadata);
1820 break;
1821 case RECLAIM_GC_SCAVENGE:
1822 if (buffer_is_suspended) {
				if (!vmdr_metadata_own_locked(metadata, options)) {
					goto next;
				}
				vmdr_metadata_unlock(metadata);
1825 /* This buffer is no longer in use, fully reclaim it. */
1826 vmdr_log_debug("found suspended buffer (%d), draining\n", metadata->vdrm_pid);
1827 kr = vmdr_drain(metadata, &bytes_reclaimed, options);
1828 vmdr_metadata_lock(metadata);
1829 vmdr_metadata_disown_locked(metadata);
1830 }
1831 break;
1832 case RECLAIM_GC_TRIM:
1833 #if CONFIG_WORKING_SET_ESTIMATION
1834 should_reclaim = vmdr_sample_working_set(metadata, &bytes_to_reclaim);
1835 if (should_reclaim) {
1836 vmdr_log_debug("GC found stale buffer (%d), trimming\n", metadata->vdrm_pid);
				if (!vmdr_metadata_own_locked(metadata, options)) {
					goto next;
				}
				vmdr_metadata_unlock(metadata);
1839 kr = vmdr_trim(metadata, bytes_to_reclaim, &bytes_reclaimed, options);
1840 vmdr_metadata_lock(metadata);
1841 vmdr_metadata_disown_locked(metadata);
1842 }
1843 #else /* !CONFIG_WORKING_SET_ESTIMATION */
1844 (void)bytes_to_reclaim;
1845 (void)should_reclaim;
1846 #endif /* CONFIG_WORKING_SET_ESTIMATION */
1847 break;
1848 }
1849 if (bytes_reclaimed) {
1850 vm_reclaim_gc_reclaim_count++;
1851 metadata->vdrm_cumulative_reclaimed_bytes += bytes_reclaimed;
1852 }
1853 if (metadata->vdrm_waiters && action != RECLAIM_GC_TRIM) {
1854 thread_wakeup((event_t)&metadata->vdrm_waiters);
1855 }
1856 next:
1857 vmdr_metadata_unlock(metadata);
1858 vmdr_metadata_release(metadata);
1859 lck_mtx_lock(&reclaim_buffers_lock);
1860 }
1861 lck_mtx_gate_handoff(&reclaim_buffers_lock, &vm_reclaim_gc_gate, GATE_HANDOFF_OPEN_IF_NO_WAITERS);
1862 lck_mtx_unlock(&reclaim_buffers_lock);
1863 }
1864
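/*
 * The scavenger parks on vm_reclaim_scavenger_cond between passes. It
 * blocks via a continuation (no kernel stack is held while idle), so
 * every wakeup re-enters this function; sched_cond_ack() consumes the
 * pending wakeup before the next scavenge pass runs.
 */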
1865 OS_NORETURN
1866 static void
1867 vm_reclaim_scavenger_thread_continue(__unused void *param, __unused wait_result_t wr)
1868 {
1869 sched_cond_ack(&vm_reclaim_scavenger_cond);
1870
1871 while (true) {
1872 vmdr_garbage_collect(RECLAIM_GC_SCAVENGE, RECLAIM_OPTIONS_NONE);
1873 sched_cond_wait(&vm_reclaim_scavenger_cond, THREAD_UNINT, vm_reclaim_scavenger_thread_continue);
1874 }
1875 }
1876
1877 OS_NORETURN
1878 static void
1879 vm_reclaim_scavenger_thread_init(__unused void *param, __unused wait_result_t wr)
1880 {
1881 thread_set_thread_name(current_thread(), "VM_reclaim_scavenger");
1882 #if CONFIG_THREAD_GROUPS
1883 thread_group_vm_add();
1884 #endif /* CONFIG_THREAD_GROUPS */
1885 sched_cond_wait(&vm_reclaim_scavenger_cond, THREAD_UNINT, vm_reclaim_scavenger_thread_continue);
1886 __builtin_unreachable();
1887 }
1888
1889 __startup_func
1890 static void
1891 vm_deferred_reclamation_init(void)
1892 {
1893 vm_reclaim_log_handle = os_log_create("com.apple.xnu", "vm_reclaim");
1894 #if CONFIG_WORKING_SET_ESTIMATION
1895 nanoseconds_to_absolutetime((uint64_t)vm_reclaim_sampling_period_ns,
1896 &vm_reclaim_sampling_period_abs);
1897 #endif /* CONFIG_WORKING_SET_ESTIMATION */
1898
1899 sched_cond_init(&vm_reclaim_scavenger_cond);
1900 lck_mtx_gate_init(&reclaim_buffers_lock, &vm_reclaim_gc_gate);
1901 kern_return_t kr = kernel_thread_start_priority(vm_reclaim_scavenger_thread_init,
1902 NULL, BASEPRI_KERNEL, &vm_reclaim_scavenger_thread);
1903 if (kr != KERN_SUCCESS) {
1904 panic("Unable to create VM reclaim thread, %d", kr);
1905 }
1906 }
1907
1908 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_deferred_reclamation_init);
1909
1910 #pragma mark Debug Interfaces
1911
1912 #if DEVELOPMENT || DEBUG
1913
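/*
 * Test-only helper: block the caller until the scavenger has processed
 * this task's buffer. The waiter count doubles as the wakeup event;
 * vmdr_garbage_collect() posts it after a drain or scavenge (but not a
 * trim), and the sleep is abort-safe, so an interrupted waiter returns
 * false.
 */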
1914 bool
1915 vm_deferred_reclamation_block_until_task_has_been_reclaimed(task_t task)
1916 {
1917 bool reclaimed;
1918 vm_deferred_reclamation_metadata_t metadata = NULL;
1919
1920 task_lock(task);
1921 if (!task_is_halting(task) && task_is_active(task)) {
1922 metadata = task->deferred_reclamation_metadata;
1923 }
1924 if (metadata != NULL) {
1925 vmdr_metadata_retain(metadata);
1926 }
1927 task_unlock(task);
1928 if (metadata == NULL) {
1929 return false;
1930 }
1931
1932 vmdr_metadata_lock(metadata);
1933
1934 metadata->vdrm_waiters++;
1935 /* Wake up the scavenger thread */
1936 sched_cond_signal(&vm_reclaim_scavenger_cond, vm_reclaim_scavenger_thread);
1937 wait_result_t wr = lck_mtx_sleep(&metadata->vdrm_lock,
1938 LCK_SLEEP_DEFAULT, (event_t)&metadata->vdrm_waiters,
1939 THREAD_ABORTSAFE);
1940 metadata->vdrm_waiters--;
1941 reclaimed = (wr == THREAD_AWAKENED);
1942
1943 vmdr_metadata_unlock(metadata);
1944 vmdr_metadata_release(metadata);
1945 return reclaimed;
1946 }
1947
1948 #endif /* DEVELOPMENT || DEBUG */
1949