/*
 * Copyright (c) 2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/exc_guard.h>
#include <kern/locks.h>
#include <kern/task.h>
#include <kern/zalloc.h>
#include <kern/misc_protos.h>
#include <kern/sched_prim.h>
#include <kern/startup.h>
#include <kern/thread_group.h>
#include <libkern/OSAtomic.h>
#include <mach/kern_return.h>
#include <mach/mach_types.h>
#include <mach/vm_reclaim_private.h>
#include <os/atomic_private.h>
#include <os/base_private.h>
#include <os/log.h>
#include <os/refcnt.h>
#include <os/refcnt_internal.h>
#include <pexpert/pexpert.h>
#include <sys/errno.h>
#include <sys/kdebug.h>
#include <sys/queue.h>
#include <sys/reason.h>
#include <vm/vm_fault_xnu.h>
#include <vm/vm_map.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_pageout_internal.h>
#include <vm/vm_reclaim_internal.h>
#include <vm/vm_sanitize_internal.h>
#include <vm/vm_kern_xnu.h>

#pragma mark Tunables

#if XNU_TARGET_OS_IOS && !XNU_TARGET_OS_XR
/* Temporarily opt iOS into the legacy behavior as a stop-gap */
#define CONFIG_WORKING_SET_ESTIMATION 0
/*
 * Deferred reclaim may be enabled via EDT for select iOS devices, but
 * defaults to disabled
 */
#define VM_RECLAIM_ENABLED_DEFAULT false
#else
#define CONFIG_WORKING_SET_ESTIMATION 1
#define VM_RECLAIM_ENABLED_DEFAULT true
#endif

#if DEVELOPMENT || DEBUG
TUNABLE(uint32_t, kReclaimChunkSize, "vm_reclaim_chunk_size", 16);
#else /* RELEASE */
const uint32_t kReclaimChunkSize = 16;
#endif /* DEVELOPMENT || DEBUG */
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_sampling_period_ns, "vm_reclaim_sampling_period_ns",
#if CONFIG_WORKING_SET_ESTIMATION
    10ULL * NSEC_PER_SEC);
#else
    0ULL);
#endif
#if CONFIG_WORKING_SET_ESTIMATION
TUNABLE_DT_DEV_WRITEABLE(bool, vm_reclaim_enabled, "/defaults",
    "kern.vm_reclaim_enabled", "vm_reclaim_enabled", VM_RECLAIM_ENABLED_DEFAULT, TUNABLE_DT_NONE);
TUNABLE_DEV_WRITEABLE(uint32_t, vm_reclaim_autotrim_pct_normal, "vm_reclaim_autotrim_pct_normal", 10);
TUNABLE_DEV_WRITEABLE(uint32_t, vm_reclaim_autotrim_pct_pressure, "vm_reclaim_autotrim_pct_pressure", 5);
TUNABLE_DEV_WRITEABLE(uint32_t, vm_reclaim_autotrim_pct_critical, "vm_reclaim_autotrim_pct_critical", 1);
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_wma_weight_base, "vm_reclaim_wma_weight_base", 3);
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_wma_weight_cur, "vm_reclaim_wma_weight_cur", 1);
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_wma_denom, "vm_reclaim_wma_denom", 4);
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_abandonment_threshold, "vm_reclaim_abandonment_threshold", 512);
#else /* CONFIG_WORKING_SET_ESTIMATION */
TUNABLE_DT_DEV_WRITEABLE(uint64_t, vm_reclaim_max_threshold, "/defaults",
    "kern.vm_reclaim_max_threshold", "vm_reclaim_max_threshold", 0, TUNABLE_DT_NONE);
#endif /* CONFIG_WORKING_SET_ESTIMATION */
TUNABLE(bool, panic_on_kill, "vm_reclaim_panic_on_kill", false);
#if DEVELOPMENT || DEBUG
TUNABLE_WRITEABLE(bool, vm_reclaim_debug, "vm_reclaim_debug", false);
#endif

#pragma mark Declarations
typedef struct proc *proc_t;
extern const char *proc_best_name(struct proc *);
extern void *proc_find(int pid);
extern task_t proc_task(proc_t);
extern kern_return_t kern_return_for_errno(int);
extern int mach_to_bsd_errno(kern_return_t kr);
extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code, mach_exception_data_type_t subcode);
/* Declaration matches the call site in reclaim_kill_with_reason() below */
extern int exit_with_mach_exception(void *p, exception_info_t info, int flags);
struct proc *proc_ref(struct proc *p, int locked);
int proc_rele(proc_t p);

#define _vmdr_log_type(type, fmt, ...) os_log_with_type(vm_reclaim_log_handle, type, "vm_reclaim: " fmt, ##__VA_ARGS__)
#define vmdr_log(fmt, ...) _vmdr_log_type(OS_LOG_TYPE_DEFAULT, fmt, ##__VA_ARGS__)
#define vmdr_log_info(fmt, ...) _vmdr_log_type(OS_LOG_TYPE_INFO, fmt, ##__VA_ARGS__)
#define vmdr_log_error(fmt, ...) _vmdr_log_type(OS_LOG_TYPE_ERROR, fmt, ##__VA_ARGS__)
#if DEVELOPMENT || DEBUG
#define vmdr_log_debug(fmt, ...) \
	MACRO_BEGIN \
	if (os_unlikely(vm_reclaim_debug)) { \
	        _vmdr_log_type(OS_LOG_TYPE_DEBUG, fmt, ##__VA_ARGS__); \
	} \
	MACRO_END
#else /* !(DEVELOPMENT || DEBUG) */
#define vmdr_log_debug(...)
#endif /* DEVELOPMENT || DEBUG */

static kern_return_t reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head);
static kern_return_t reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail);
static kern_return_t reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy);
static kern_return_t reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result);
#if CONFIG_WORKING_SET_ESTIMATION
static mach_error_t vmdr_sample_working_set(
	vm_deferred_reclamation_metadata_t metadata,
	mach_vm_size_t *trim_threshold_out,
	vm_deferred_reclamation_options_t options);
#endif
static void vmdr_metadata_release(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_list_append_locked(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_list_remove_locked(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_metadata_own(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_metadata_disown(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_garbage_collect(vm_deferred_reclamation_gc_action_t action,
    mach_vm_size_t *total_bytes_reclaimed_out,
    vm_deferred_reclamation_options_t options);
static kern_return_t reclaim_chunk(vm_deferred_reclamation_metadata_t metadata,
    uint64_t bytes_to_reclaim, uint64_t *bytes_reclaimed_out,
    mach_vm_reclaim_count_t chunk_size, mach_vm_reclaim_count_t *num_reclaimed_out);
struct vm_deferred_reclamation_metadata_s {
	/*
	 * Global list containing every reclamation buffer. Protected by the
	 * reclaim_buffers_lock.
	 */
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_list;
	/* Protects all struct fields (except where denoted otherwise) */
	decl_lck_mtx_data(, vdrm_lock);
	/* Gate to be acquired when performing copyio on the user ring */
	decl_lck_mtx_gate_data(, vdrm_gate);
	/*
	 * The task owns this structure but we maintain a backpointer here
	 * so that we can send an exception if we hit an error.
	 * Since this is a backpointer we don't hold a reference (it's a weak pointer).
	 */
	task_t vdrm_task;
	pid_t vdrm_pid;
	vm_map_t vdrm_map;
	/*
	 * The owning task holds a ref on this object. When the task dies, it
	 * will set vdrm_task := NULL and drop its ref. Threads operating on the buffer
	 * should hold a +1 on the metadata structure to ensure its validity.
	 */
	os_refcnt_t vdrm_refcnt;
	/* The virtual address of the ringbuffer in the user map (immutable) */
	user_addr_t vdrm_ring_addr;
	/* The size of the VM allocation containing the ringbuffer (immutable) */
	mach_vm_size_t vdrm_ring_size;
	/* The length of the ringbuffer. This may be changed on buffer resize */
	mach_vm_reclaim_count_t vdrm_buffer_len;
	/* Which GC epoch this buffer was last considered in */
	uint64_t vdrm_reclaimed_at;
	/*
	 * The number of threads waiting for a pending reclamation
	 * on this buffer to complete.
	 */
	uint32_t vdrm_waiters;
	/* Timestamp (in mach absolute time) of the last working set sample for this ringbuffer */
	uint64_t vdrm_last_sample_abs;
	/*
	 * The number of bytes reclaimed by kernel GC since the last user
	 * accounting update. Protected by @c vdrm_gate.
	 */
	size_t vdrm_kernel_bytes_reclaimed;
	/*
	 * The last amount of reclaimable bytes reported to the kernel.
	 */
	uint64_t vdrm_reclaimable_bytes_last;
#if CONFIG_WORKING_SET_ESTIMATION
	/*
	 * Exponential moving average of the minimum reclaimable buffer size
	 * (in VMDR_WMA_UNITs). Protected by @c vdrm_gate.
	 */
	uint64_t vdrm_reclaimable_bytes_wma;
#endif /* CONFIG_WORKING_SET_ESTIMATION */
	/*
	 * Tracks whether or not this reclamation metadata has been added
	 * to the global list yet. Normally, this happens when it is allocated,
	 * except in the case of fork(). In that case, we have to duplicate the
	 * parent's metadata before it returns from fork(), but this occurs
	 * before the child's address space is set up.
	 */
	uint8_t vdrm_is_registered : 1,
	    __unused1 : 7;
};

#pragma mark Globals
static KALLOC_TYPE_DEFINE(vm_reclaim_metadata_zone, struct vm_deferred_reclamation_metadata_s, KT_DEFAULT);
static LCK_GRP_DECLARE(vm_reclaim_lock_grp, "vm_reclaim");
os_refgrp_decl(static, vm_reclaim_metadata_refgrp, "vm_reclaim_metadata_refgrp", NULL);
/*
 * The reclaim_buffers list contains every buffer in the system.
 * The reclaim_buffers_lock protects the reclaim_buffers list.
 * It must be held when iterating over the list or manipulating it.
 * It should be dropped when acting on a specific metadata entry after acquiring the vdrm_lock.
 */
static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) reclaim_buffers = TAILQ_HEAD_INITIALIZER(reclaim_buffers);
LCK_MTX_DECLARE(reclaim_buffers_lock, &vm_reclaim_lock_grp);
/* Number of times reclaim GC has run */
uint64_t vm_reclaim_gc_epoch = 0;
/* The number of reclamation actions (drains/trims) done during GC */
uint64_t vm_reclaim_gc_reclaim_count;
/* Gate for GC */
static decl_lck_mtx_gate_data(, vm_reclaim_gc_gate);
os_log_t vm_reclaim_log_handle;
/* Number of initialized reclaim buffers */
_Atomic uint32_t vm_reclaim_buffer_count;
uint64_t vm_reclaim_sampling_period_abs = 0;
static SECURITY_READ_ONLY_LATE(thread_t) vm_reclaim_scavenger_thread = THREAD_NULL;
static sched_cond_atomic_t vm_reclaim_scavenger_cond = SCHED_COND_INIT;

#pragma mark Buffer Initialization/Destruction

static vm_deferred_reclamation_metadata_t
vmdr_metadata_alloc(
	task_t task,
	user_addr_t buffer,
	mach_vm_size_t size,
	mach_vm_reclaim_count_t len)
{
	vm_deferred_reclamation_metadata_t metadata;
	vm_map_t map = task->map;

	assert(!map->is_nested_map);

	metadata = zalloc_flags(vm_reclaim_metadata_zone, Z_WAITOK | Z_ZERO);
	lck_mtx_init(&metadata->vdrm_lock, &vm_reclaim_lock_grp, LCK_ATTR_NULL);
	lck_mtx_gate_init(&metadata->vdrm_lock, &metadata->vdrm_gate);
	os_ref_init(&metadata->vdrm_refcnt, &vm_reclaim_metadata_refgrp);

	metadata->vdrm_task = task;
	metadata->vdrm_map = map;
	metadata->vdrm_ring_addr = buffer;
	metadata->vdrm_ring_size = size;
	metadata->vdrm_buffer_len = len;

	if (os_atomic_inc(&vm_reclaim_buffer_count, relaxed) == UINT32_MAX) {
		panic("Overflowed vm_reclaim_buffer_count");
	}

	/*
	 * We do not need to hold a lock on `task` because this is called
	 * either at fork() time or from the context of current_task().
	 */
	vm_map_reference(map);
	return metadata;
}

static void
vmdr_metadata_free(vm_deferred_reclamation_metadata_t metadata)
{
	vm_map_deallocate(metadata->vdrm_map);
	lck_mtx_gate_destroy(&metadata->vdrm_lock, &metadata->vdrm_gate);
	lck_mtx_destroy(&metadata->vdrm_lock, &vm_reclaim_lock_grp);
	zfree(vm_reclaim_metadata_zone, metadata);
	if (os_atomic_dec_orig(&vm_reclaim_buffer_count, relaxed) == 0) {
		panic("Underflowed vm_reclaim_buffer_count");
	}
}

static mach_vm_size_t
vmdr_round_len_to_size(vm_map_t map, mach_vm_reclaim_count_t count)
{
	mach_vm_size_t metadata_size = offsetof(struct mach_vm_reclaim_ring_s, entries);
	mach_vm_size_t entries_size = count * sizeof(struct mach_vm_reclaim_entry_s);
	return vm_map_round_page(metadata_size + entries_size, vm_map_page_mask(map));
}
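
/*
 * Illustrative sizing example (assuming a 16-byte
 * struct mach_vm_reclaim_entry_s and 16 KiB pages): for count = 2048,
 * the entries occupy 2048 * 16 B = 32 KiB, so the ring header plus
 * entries round up to 48 KiB, i.e. three 16 KiB pages. Because of this
 * rounding, ring lengths that fall within the same page count map to
 * the same VM reservation size.
 */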

mach_error_t
vm_deferred_reclamation_buffer_allocate_internal(
	task_t task,
	mach_vm_address_ut *address_u,
	uint64_t *sampling_period,
	mach_vm_reclaim_count_t len,
	mach_vm_reclaim_count_t max_len)
{
	kern_return_t kr;
	kern_return_t tmp_kr;
	vm_deferred_reclamation_metadata_t metadata = NULL;
	vm_map_t map;
	uint64_t head = 0, tail = 0, busy = 0;
	static bool reclaim_disabled_logged = false;

	if (task == TASK_NULL) {
		return KERN_INVALID_TASK;
	}
	if (address_u == NULL || sampling_period == NULL ||
	    len == 0 || max_len == 0 || max_len < len) {
		return KERN_INVALID_ARGUMENT;
	}
	map = task->map;
#if CONFIG_WORKING_SET_ESTIMATION
	if (!vm_reclaim_enabled) {
#else /* !CONFIG_WORKING_SET_ESTIMATION */
	if (!vm_reclaim_max_threshold) {
#endif /* CONFIG_WORKING_SET_ESTIMATION */
		if (!reclaim_disabled_logged) {
			/* Avoid logging failure for every new process */
			reclaim_disabled_logged = true;
			vmdr_log_error("failed to initialize deferred "
			    "reclamation buffer - vm_reclaim is disabled\n");
		}
		return VM_RECLAIM_NOT_SUPPORTED;
	}

	mach_vm_size_t rounded_vm_size = vmdr_round_len_to_size(map, max_len);
	if (rounded_vm_size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	if (rounded_vm_size > VM_RECLAIM_MAX_BUFFER_SIZE) {
		vmdr_log_error("denying request to allocate ringbuffer of size "
		    "%llu bytes (max %llu bytes)\n",
		    rounded_vm_size,
		    VM_RECLAIM_MAX_BUFFER_SIZE);
		return KERN_NO_SPACE;
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_START,
	    task_pid(task), len);

	/*
	 * Allocate a VM region that can contain the maximum buffer size. The
	 * allocation starts as VM_PROT_NONE and may be unprotected on buffer
	 * resize.
	 *
	 * TODO: If clients other than libmalloc adopt deferred reclaim, a
	 * different tag should be given
	 *
	 * `address` was sanitized under the assumption that we'll only use
	 * it as a hint (overflow checks were used) so we must pass the
	 * anywhere flag.
	 */
	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE_PERMANENT(
		.vm_tag = VM_MEMORY_MALLOC);
	mach_vm_size_ut size_u = vm_sanitize_wrap_size(rounded_vm_size);
	kr = mach_vm_map_kernel(map, address_u, size_u, VM_MAP_PAGE_MASK(map),
	    vmk_flags, IPC_PORT_NULL, 0, FALSE,
	    VM_PROT_DEFAULT, VM_PROT_DEFAULT, VM_INHERIT_COPY);
	if (kr != KERN_SUCCESS) {
		vmdr_log_error("%s [%d] failed to allocate VA for reclaim "
		    "buffer (%d)\n", task_best_name(task), task_pid(task), kr);
		return kr;
	}
	mach_vm_address_t address = VM_SANITIZE_UNSAFE_UNWRAP(*address_u);
	assert3u(address, !=, 0);

	metadata = vmdr_metadata_alloc(task, address, rounded_vm_size, len);
	metadata->vdrm_pid = task_pid(task);

	/*
	 * Validate the starting indices.
	 */
	kr = reclaim_copyin_busy(metadata, &busy);
	if (kr != KERN_SUCCESS) {
		goto out;
	}
	kr = reclaim_copyin_head(metadata, &head);
	if (kr != KERN_SUCCESS) {
		goto out;
	}
	kr = reclaim_copyin_tail(metadata, &tail);
	if (kr != KERN_SUCCESS) {
		goto out;
	}

	if (head != 0 || tail != 0 || busy != 0) {
		vmdr_log_error("indices were not zero-initialized\n");
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	/*
	 * Publish the metadata to the task & global buffer list. This must be
	 * done under the task lock to synchronize with task termination - i.e.
	 * task_terminate_internal is guaranteed to see the published metadata and
	 * tear it down.
	 */
	lck_mtx_lock(&reclaim_buffers_lock);
	task_lock(task);

	if (!task_is_active(task) || task_is_halting(task)) {
		vmdr_log_error(
			"failed to initialize buffer on dying task %s [%d]",
			task_best_name(task), task_pid(task));
		kr = KERN_ABORTED;
		goto fail_task;
	}
	if (task->deferred_reclamation_metadata != NULL) {
		vmdr_log_error(
			"tried to overwrite existing reclaim buffer for %s [%d]",
			task_best_name(task), task_pid(task));
		kr = VM_RECLAIM_RESOURCE_SHORTAGE;
		goto fail_task;
	}

	metadata->vdrm_is_registered = true;
	vmdr_list_append_locked(metadata);
	task->deferred_reclamation_metadata = metadata;

	task_unlock(task);
	lck_mtx_unlock(&reclaim_buffers_lock);

	vmdr_log_debug("%s [%d] allocated ring with capacity %u/%u\n",
	    task_best_name(task), task_pid(task),
	    len, max_len);
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END,
	    task_pid(task), KERN_SUCCESS, address);
	DTRACE_VM3(reclaim_ring_allocate,
	    mach_vm_address_t, address,
	    mach_vm_reclaim_count_t, len,
	    mach_vm_reclaim_count_t, max_len);
	/* Report the effective sampling period on success as well */
	*sampling_period = vm_reclaim_sampling_period_abs;
	return KERN_SUCCESS;

fail_task:
	task_unlock(task);
	lck_mtx_unlock(&reclaim_buffers_lock);

	tmp_kr = mach_vm_deallocate(map,
	    *address_u, size_u);
	assert(tmp_kr == KERN_SUCCESS);

out:
	*address_u = vm_sanitize_wrap_addr(0ull);
	*sampling_period = vm_reclaim_sampling_period_abs;
	vmdr_metadata_release(metadata);
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END,
	    kr, NULL);
	return kr;
}

#pragma mark Synchronization & Lifecycle

static inline void
vmdr_metadata_lock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_lock(&metadata->vdrm_lock);
}

static inline void
vmdr_metadata_unlock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_unlock(&metadata->vdrm_lock);
}

static inline void
vmdr_metadata_assert_owned_locked(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_gate_assert(&metadata->vdrm_lock, &metadata->vdrm_gate,
	    GATE_ASSERT_HELD);
}

static inline void
vmdr_metadata_assert_owned(vm_deferred_reclamation_metadata_t metadata)
{
#if MACH_ASSERT
	vmdr_metadata_lock(metadata);
	vmdr_metadata_assert_owned_locked(metadata);
	vmdr_metadata_unlock(metadata);
#else /* MACH_ASSERT */
	(void)metadata;
#endif /* MACH_ASSERT */
}

static bool
vmdr_metadata_try_own_locked(vm_deferred_reclamation_metadata_t metadata)
{
	kern_return_t kr = lck_mtx_gate_try_close(&metadata->vdrm_lock,
	    &metadata->vdrm_gate);
	return kr == KERN_SUCCESS;
}

/*
 * Take ownership of the buffer, waiting for the current owner to hand it
 * off unless RECLAIM_NO_WAIT is set. Returns true if successful.
 */
static bool
vmdr_metadata_own_locked(vm_deferred_reclamation_metadata_t metadata,
    vm_deferred_reclamation_options_t options)
{
	__assert_only gate_wait_result_t wait_result;
	if (!vmdr_metadata_try_own_locked(metadata)) {
		if (options & RECLAIM_NO_WAIT) {
			return false;
		}
		wait_result = lck_mtx_gate_wait(
			&metadata->vdrm_lock, &metadata->vdrm_gate, LCK_SLEEP_DEFAULT,
			THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
		assert(wait_result == GATE_HANDOFF);
	}
	return true;
}

/*
 * Set the current thread as the owner of a reclaim buffer. May block. Will
 * propagate priority.
 */
static void
vmdr_metadata_own(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_lock(metadata);
	vmdr_metadata_own_locked(metadata, RECLAIM_OPTIONS_NONE);
	vmdr_metadata_unlock(metadata);
}

static void
vmdr_metadata_disown_locked(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_assert_owned_locked(metadata);
	lck_mtx_gate_handoff(&metadata->vdrm_lock, &metadata->vdrm_gate,
	    GATE_HANDOFF_OPEN_IF_NO_WAITERS);
}

/*
 * Release ownership of a reclaim buffer and wake up any threads waiting for
 * ownership. Must be called from the thread that acquired ownership.
 */
static void
vmdr_metadata_disown(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_lock(metadata);
	vmdr_metadata_disown_locked(metadata);
	vmdr_metadata_unlock(metadata);
}

static void
vmdr_metadata_retain(vm_deferred_reclamation_metadata_t metadata)
{
	os_ref_retain(&metadata->vdrm_refcnt);
}

static void
vmdr_metadata_release(vm_deferred_reclamation_metadata_t metadata)
{
	if (os_ref_release(&metadata->vdrm_refcnt) == 0) {
		vmdr_metadata_free(metadata);
	}
}

static void
vmdr_list_remove_locked(vm_deferred_reclamation_metadata_t metadata)
{
	LCK_MTX_ASSERT(&reclaim_buffers_lock, LCK_MTX_ASSERT_OWNED);
	assert3p(metadata->vdrm_list.tqe_prev, !=, NULL);
	TAILQ_REMOVE(&reclaim_buffers, metadata, vdrm_list);
	metadata->vdrm_list.tqe_prev = NULL;
	metadata->vdrm_list.tqe_next = NULL;
}

static void
vmdr_list_append_locked(vm_deferred_reclamation_metadata_t metadata)
{
	LCK_MTX_ASSERT(&reclaim_buffers_lock, LCK_MTX_ASSERT_OWNED);
	assert3p(metadata->vdrm_list.tqe_prev, ==, NULL);
	TAILQ_INSERT_TAIL(&reclaim_buffers, metadata, vdrm_list);
}

void
vm_deferred_reclamation_buffer_deallocate(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	/*
	 * First remove the buffer from the global list so no one else can get access to it.
	 */
	lck_mtx_lock(&reclaim_buffers_lock);
	if (metadata->vdrm_is_registered) {
		vmdr_list_remove_locked(metadata);
	}
	lck_mtx_unlock(&reclaim_buffers_lock);

	/*
	 * The task is dropping its ref on this buffer. First remove the buffer's
	 * back-reference to the task so that any threads currently operating on
	 * this buffer do not try to operate on the dead/dying task.
	 */
	vmdr_metadata_lock(metadata);
	assert3p(metadata->vdrm_task, !=, TASK_NULL);
	metadata->vdrm_task = TASK_NULL;
	vmdr_metadata_unlock(metadata);
	vmdr_metadata_release(metadata);
}

#pragma mark Exception Delivery

static void
reclaim_kill_with_reason(
	vm_deferred_reclamation_metadata_t metadata,
	unsigned reason,
	mach_exception_data_type_t subcode)
{
	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
	mach_exception_code_t code = 0;
	task_t task;
	proc_t p = NULL;
	boolean_t fatal = TRUE;
	bool killing_self;
	pid_t pid;
	int err;

	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_NOTOWNED);

	EXC_GUARD_ENCODE_TYPE(code, guard_type);
	EXC_GUARD_ENCODE_FLAVOR(code, reason);
	EXC_GUARD_ENCODE_TARGET(code, 0);

	vmdr_metadata_lock(metadata);
	task = metadata->vdrm_task;
	if (task == TASK_NULL || !task_is_active(task) || task_is_halting(task)) {
		/* Task is no longer alive */
		vmdr_metadata_unlock(metadata);
		vmdr_log_error(
			"Unable to deliver guard exception because task "
			"[%d] is already dead.\n",
			metadata->vdrm_pid);
		return;
	}

	if (panic_on_kill) {
		panic("About to kill %p due to %u with subcode %lld\n", task, reason, subcode);
	}

	killing_self = (task == current_task());
	if (!killing_self) {
		task_reference(task);
	}
	assert(task != kernel_task);
	vmdr_metadata_unlock(metadata);

	if (reason == kGUARD_EXC_DEALLOC_GAP) {
		task_lock(task);
		fatal = (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL);
		task_unlock(task);
	}

	if (!fatal) {
		vmdr_log_info(
			"Skipping non-fatal guard exception for %s [%d]\n",
			task_best_name(task), task_pid(task));
		goto out;
	}

	pid = task_pid(task);
	if (killing_self) {
		p = get_bsdtask_info(task);
	} else {
		p = proc_find(pid);
		if (p && proc_task(p) != task) {
			vmdr_log_error(
				"Unable to deliver guard exception because proc is gone & pid rolled over.\n");
			goto out;
		}
	}

	if (!p) {
		vmdr_log_error(
			"Unable to deliver guard exception because task does not have a proc.\n");
		goto out;
	}

	int flags = PX_DEBUG_NO_HONOR;
	exception_info_t info = {
		.os_reason = OS_REASON_GUARD,
		.exception_type = EXC_GUARD,
		.mx_code = code,
		.mx_subcode = subcode
	};

	vmdr_log("Force-exiting %s [%d]\n", task_best_name(task), task_pid(task));

	err = exit_with_mach_exception(p, info, flags);
	if (err != 0) {
		vmdr_log_error("Unable to deliver guard exception to %p: %d\n", p, err);
		goto out;
	}

out:
	if (!killing_self) {
		if (p) {
			proc_rele(p);
			p = NULL;
		}
		if (task) {
			task_deallocate(task);
			task = NULL;
		}
	}
}

#pragma mark Copy I/O

static user_addr_t
get_entries_ptr(vm_deferred_reclamation_metadata_t metadata)
{
	return metadata->vdrm_ring_addr +
	       offsetof(struct mach_vm_reclaim_ring_s, entries);
}

static user_addr_t
get_head_ptr(vm_deferred_reclamation_metadata_t metadata)
{
	return metadata->vdrm_ring_addr + offsetof(struct mach_vm_reclaim_ring_s, head);
}

static user_addr_t
get_tail_ptr(vm_deferred_reclamation_metadata_t metadata)
{
	return metadata->vdrm_ring_addr + offsetof(struct mach_vm_reclaim_ring_s, tail);
}

static user_addr_t
get_busy_ptr(vm_deferred_reclamation_metadata_t metadata)
{
	return metadata->vdrm_ring_addr + offsetof(struct mach_vm_reclaim_ring_s, busy);
}

static kern_return_t
reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result)
{
	if (result != 0 && (result != EFAULT || !vm_fault_get_disabled())) {
		vmdr_log_error("Killing [%d] due to copy I/O error\n", metadata->vdrm_pid);
		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_COPYIO_FAILURE,
		    result);
	}
	return kern_return_for_errno(result);
}

/*
 * Helper functions to do copyio on the head, tail, and busy pointers.
 * Note that the kernel will only write to the busy and head pointers.
 * Userspace is not supposed to write to the head or busy pointers, but the kernel
 * must be resilient to that kind of bug in userspace.
 */
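
/*
 * Sketch of the ring indices as used below (assumed layout; the canonical
 * definition is struct mach_vm_reclaim_ring_s):
 *
 *	[0, head)	entries already reclaimed by the kernel
 *	[head, busy)	entries currently held busy by the kernel
 *	[busy, tail)	entries available to be reclaimed
 *
 * head, busy, and tail are monotonically increasing 64-bit counters; the
 * slot backing index i is entries[i % vdrm_buffer_len]. In a quiescent
 * ring, head == busy <= tail.
 */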

static kern_return_t
reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head)
{
	int result;
	kern_return_t kr;
	user_addr_t head_ptr = get_head_ptr(metadata);

	result = copyin_atomic64(head_ptr, head);
	kr = reclaim_handle_copyio_error(metadata, result);
	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
		vmdr_log_error(
			"Unable to copy head ptr from 0x%llx: err=%d\n", head_ptr, result);
	}
	return kr;
}

static kern_return_t
reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail)
{
	int result;
	kern_return_t kr;
	user_addr_t tail_ptr = get_tail_ptr(metadata);

	result = copyin_atomic64(tail_ptr, tail);
	kr = reclaim_handle_copyio_error(metadata, result);
	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
		vmdr_log_error(
			"Unable to copy tail ptr from 0x%llx: err=%d\n", tail_ptr, result);
	}
	return kr;
}

static kern_return_t
reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy)
{
	int result;
	kern_return_t kr;
	user_addr_t busy_ptr = get_busy_ptr(metadata);

	result = copyin_atomic64(busy_ptr, busy);
	kr = reclaim_handle_copyio_error(metadata, result);
	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
		vmdr_log_error(
			"Unable to copy busy ptr from 0x%llx: err=%d\n", busy_ptr, result);
	}
	return kr;
}

static kern_return_t
reclaim_copyin_reclaimable_bytes(vm_deferred_reclamation_metadata_t metadata, size_t *reclaimable_bytes_out)
{
	int result;
	kern_return_t kr = KERN_SUCCESS;
	uint64_t reclaimable_bytes;
	user_addr_t ptr = (uintptr_t)metadata->vdrm_ring_addr +
	    offsetof(struct mach_vm_reclaim_ring_s, reclaimable_bytes);

	result = copyin_atomic64(ptr, &reclaimable_bytes);
	if (result) {
		kr = reclaim_handle_copyio_error(metadata, result);
		if (result != EFAULT || !vm_fault_get_disabled()) {
			vmdr_log_error("Unable to copyin reclaimable byte count err=%d\n", result);
		}
	} else {
		*reclaimable_bytes_out = (size_t)reclaimable_bytes;
	}
	return kr;
}

#if CONFIG_WORKING_SET_ESTIMATION
static kern_return_t
reclaim_copyin_min_reclaimable_bytes(vm_deferred_reclamation_metadata_t metadata, size_t *min_reclaimable_bytes_out)
{
	int result;
	kern_return_t kr = KERN_SUCCESS;
	uint64_t min_reclaimable_bytes;
	user_addr_t ptr = (uintptr_t)metadata->vdrm_ring_addr +
	    offsetof(struct mach_vm_reclaim_ring_s, reclaimable_bytes_min);

	result = copyin_atomic64(ptr, &min_reclaimable_bytes);
	if (result) {
		kr = reclaim_handle_copyio_error(metadata, result);
		if (result != EFAULT || !vm_fault_get_disabled()) {
			vmdr_log_error("Unable to copyin minimum reclaimable byte count err=%d\n", result);
		}
	} else {
		*min_reclaimable_bytes_out = (size_t)min_reclaimable_bytes;
	}
	return kr;
}
#endif /* CONFIG_WORKING_SET_ESTIMATION */

static kern_return_t
reclaim_copyout_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	kern_return_t kr = KERN_SUCCESS;
	user_addr_t busy_ptr = get_busy_ptr(metadata);

	result = copyout_atomic64(value, busy_ptr);
	if (result) {
		kr = reclaim_handle_copyio_error(metadata, result);
		if (result != EFAULT || !vm_fault_get_disabled()) {
			vmdr_log_error(
				"Unable to copy %llu to busy ptr at 0x%llx: err=%d\n", value, busy_ptr, result);
		}
	}
	return kr;
}

static kern_return_t
reclaim_copyout_head(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	kern_return_t kr = KERN_SUCCESS;
	user_addr_t head_ptr = get_head_ptr(metadata);

	result = copyout_atomic64(value, head_ptr);
	if (result) {
		kr = reclaim_handle_copyio_error(metadata, result);
		if (result != EFAULT || !vm_fault_get_disabled()) {
			vmdr_log_error(
				"Unable to copy %llu to head ptr at 0x%llx: err=%d\n", value, head_ptr, result);
		}
	}
	return kr;
}

#if CONFIG_WORKING_SET_ESTIMATION
static kern_return_t
reclaim_copyout_min_reclaimable_bytes(vm_deferred_reclamation_metadata_t metadata, size_t min_reclaimable_bytes)
{
	int result;
	kern_return_t kr = KERN_SUCCESS;
	user_addr_t ptr = (uintptr_t)metadata->vdrm_ring_addr +
	    offsetof(struct mach_vm_reclaim_ring_s, reclaimable_bytes_min);

	result = copyout_atomic64(min_reclaimable_bytes, ptr);
	if (result) {
		kr = reclaim_handle_copyio_error(metadata, result);
		if (result != EFAULT || !vm_fault_get_disabled()) {
			vmdr_log_error("Unable to copyout minimum reclaimable byte count err=%d\n", result);
		}
	}
	return kr;
}
#endif /* CONFIG_WORKING_SET_ESTIMATION */

#pragma mark Reclamation

/*
 * @func reclaim_chunk
 *
 * @brief
 * Reclaim a batch of entries from the buffer.
 *
 * @param bytes_to_reclaim
 * Number of bytes the caller wishes to reclaim from the buffer
 *
 * @param bytes_reclaimed_out
 * The number of bytes reclaimed from the buffer written out
 *
 * @param chunk_size
 * The maximum number of entries to hold busy and reclaim (must
 * be <= kReclaimChunkSize)
 *
 * @param num_reclaimed_out
 * The number of entries reclaimed written out
 *
 * @discussion
 * If the buffer has been exhausted of entries (tail == head),
 * num_reclaimed_out will be zero. It is important that the caller abort any
 * loops if such a condition is met.
 */
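/*
 * Sketch of the busy-window handshake implemented below (descriptive only):
 * the kernel advances busy to head + n, issues a full memory fence, and
 * re-reads tail. If userspace concurrently moved tail backwards to cancel
 * an entry, the kernel either shrinks the window to the new tail or, if
 * tail dropped below head, resets busy back to head and returns
 * KERN_ABORTED.
 */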
static kern_return_t
reclaim_chunk(vm_deferred_reclamation_metadata_t metadata,
    uint64_t bytes_to_reclaim, uint64_t *bytes_reclaimed_out,
    mach_vm_reclaim_count_t chunk_size, mach_vm_reclaim_count_t *num_reclaimed_out)
{
	kern_return_t kr = KERN_SUCCESS;
	int result = 0;
	mach_vm_reclaim_count_t num_reclaimed = 0, num_copied = 0;
	uint64_t bytes_reclaimed = 0;
	uint64_t head = 0, tail = 0, busy = 0, num_to_reclaim = 0, new_tail = 0;
	vm_map_t map = metadata->vdrm_map;
	vm_map_switch_context_t switch_ctx;
	struct mach_vm_reclaim_entry_s copied_entries[kReclaimChunkSize];

	assert(metadata != NULL);
	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_NOTOWNED);
	vmdr_metadata_assert_owned(metadata);

	assert(chunk_size <= kReclaimChunkSize);

	KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_START,
	    metadata->vdrm_pid, bytes_to_reclaim);

	memset(copied_entries, 0, sizeof(copied_entries));

	switch_ctx = vm_map_switch_to(map);

	kr = reclaim_copyin_busy(metadata, &busy);
	if (kr != KERN_SUCCESS) {
		goto done;
	}
	kr = reclaim_copyin_head(metadata, &head);
	if (kr != KERN_SUCCESS) {
		goto done;
	}
	kr = reclaim_copyin_tail(metadata, &tail);
	if (kr != KERN_SUCCESS) {
		goto done;
	}

	/*
	 * NB: busy may not be exactly equal to head if the jetsam
	 * thread fails to fault on the indices after having marked
	 * entries busy
	 */
	if (busy < head || (busy - head) > kReclaimChunkSize) {
		vmdr_log_error(
			"Userspace modified head or busy pointer! head: %llu "
			"(0x%llx) | busy: %llu (0x%llx) | tail = %llu (0x%llx)\n",
			head, get_head_ptr(metadata), busy, get_busy_ptr(metadata), tail,
			get_tail_ptr(metadata));
		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_INDEX_FAILURE,
		    busy);
		kr = KERN_FAILURE;
		goto done;
	}

	if (tail < head) {
		/*
		 * Userspace is likely in the middle of trying to re-use an entry,
		 * bail on this reclamation.
		 */
		vmdr_log_error(
			"Tail < head! Userspace is likely attempting a "
			"cancellation; aborting reclamation | head: %llu "
			"(0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
			head, get_head_ptr(metadata), tail, get_tail_ptr(metadata), busy,
			get_busy_ptr(metadata));
		kr = KERN_ABORTED;
		goto done;
	}

	/*
	 * NB: If any of the copyouts below fail due to faults being disabled,
	 * the buffer may be left in a state where several entries are unusable
	 * until the next reclamation (i.e. busy > head)
	 */
	num_to_reclaim = tail - head;
	while (true) {
		num_to_reclaim = MIN(num_to_reclaim, chunk_size);
		if (num_to_reclaim == 0) {
			break;
		}
		busy = head + num_to_reclaim;
		kr = reclaim_copyout_busy(metadata, busy);
		if (kr != KERN_SUCCESS) {
			goto done;
		}
		os_atomic_thread_fence(seq_cst);
		kr = reclaim_copyin_tail(metadata, &new_tail);
		if (kr != KERN_SUCCESS) {
			goto done;
		}

		if (new_tail >= busy) {
			/* Got num_to_reclaim entries */
			break;
		}
		tail = new_tail;
		if (tail < head) {
			/*
			 * Userspace is likely in the middle of trying to re-use an entry,
			 * bail on this reclamation.
			 */
			vmdr_log_error(
				"Tail < head! Userspace is likely attempting a "
				"cancellation; aborting reclamation | head: %llu "
				"(0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
				head, get_head_ptr(metadata), tail, get_tail_ptr(metadata), busy,
				get_busy_ptr(metadata));
			/* Reset busy back to head */
			reclaim_copyout_busy(metadata, head);
			kr = KERN_ABORTED;
			goto done;
		}
		/* Can't reclaim these entries. Try again */
		num_to_reclaim = tail - head;
		if (num_to_reclaim == 0) {
			/* Nothing left to reclaim. Reset busy to head. */
			kr = reclaim_copyout_busy(metadata, head);
			if (kr != KERN_SUCCESS) {
				goto done;
			}
			break;
		}
		/*
		 * Note that num_to_reclaim must have gotten smaller since tail got smaller,
		 * so this is guaranteed to converge.
		 */
	}
	vmdr_log_debug("[%d] reclaiming up to %llu entries (target %llu B) head=%llu "
	    "busy=%llu tail=%llu len=%u", metadata->vdrm_pid, num_to_reclaim,
	    bytes_to_reclaim, head, busy, tail, metadata->vdrm_buffer_len);

	uint64_t memcpy_start_idx = head % metadata->vdrm_buffer_len;
	while (num_copied < num_to_reclaim) {
		uint64_t memcpy_end_idx = memcpy_start_idx + num_to_reclaim - num_copied;
		/* Clamp the end idx to the buffer. We'll handle wrap-around in our next go around the loop. */
		memcpy_end_idx = MIN(memcpy_end_idx, metadata->vdrm_buffer_len);
		uint64_t num_to_copy = memcpy_end_idx - memcpy_start_idx;

		assert(num_to_copy + num_copied <= kReclaimChunkSize);
		user_addr_t src_ptr = get_entries_ptr(metadata) +
		    (memcpy_start_idx * sizeof(struct mach_vm_reclaim_entry_s));
		struct mach_vm_reclaim_entry_s *dst_ptr = copied_entries + num_copied;
		result = copyin(src_ptr, dst_ptr,
		    (num_to_copy * sizeof(struct mach_vm_reclaim_entry_s)));
		kr = reclaim_handle_copyio_error(metadata, result);
		if (kr != KERN_SUCCESS) {
			if (kr != KERN_MEMORY_ERROR || !vm_fault_get_disabled()) {
				vmdr_log_error(
					"Unable to copyin %llu entries in reclaim "
					"buffer at 0x%llx to 0x%llx: err=%d\n",
					num_to_copy, src_ptr, (uint64_t)dst_ptr, result);
			}
			goto done;
		}

		num_copied += num_to_copy;
		memcpy_start_idx = (memcpy_start_idx + num_to_copy) % metadata->vdrm_buffer_len;
	}

	for (num_reclaimed = 0; num_reclaimed < num_to_reclaim && bytes_reclaimed < bytes_to_reclaim; num_reclaimed++) {
		mach_vm_reclaim_entry_t entry = &copied_entries[num_reclaimed];
		KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_START,
		    metadata->vdrm_pid, entry->address, entry->size,
		    entry->behavior);
		if (entry->address != 0 && entry->size != 0) {
			vm_map_address_t start = vm_map_trunc_page(entry->address,
			    VM_MAP_PAGE_MASK(map));
			vm_map_address_t end = vm_map_round_page(entry->address + entry->size,
			    VM_MAP_PAGE_MASK(map));
			DTRACE_VM4(vm_reclaim_entry,
			    pid_t, metadata->vdrm_pid,
			    mach_vm_address_t, entry->address,
			    mach_vm_address_t, end,
			    mach_vm_reclaim_action_t, entry->behavior);
			KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_START,
			    metadata->vdrm_pid, start, end,
			    entry->behavior);
			vmdr_log_debug("[%d] Reclaiming entry %llu (0x%llx, 0x%llx)\n", metadata->vdrm_pid, head + num_reclaimed, start, end);
			switch (entry->behavior) {
			case VM_RECLAIM_DEALLOCATE:
				kr = vm_map_remove_guard(map,
				    start, end, VM_MAP_REMOVE_GAPS_FAIL,
				    KMEM_GUARD_NONE).kmr_return;
				if (kr == KERN_INVALID_VALUE) {
					vmdr_log_error(
						"[%d] Killing due to virtual-memory guard at (0x%llx, 0x%llx)\n",
						metadata->vdrm_pid, start, end);
					reclaim_kill_with_reason(metadata, kGUARD_EXC_DEALLOC_GAP, entry->address);
					goto done;
				} else if (kr != KERN_SUCCESS) {
					vmdr_log_error(
						"[%d] Killing due to deallocation failure at (0x%llx, 0x%llx) err=%d\n",
						metadata->vdrm_pid, start, end, kr);
					reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
					goto done;
				}
				break;
			case VM_RECLAIM_FREE:
				/*
				 * TODO: This should free the backing pages directly instead of using
				 * VM_BEHAVIOR_REUSABLE, which will mark the pages as clean and let them
				 * age in the LRU.
				 */
				kr = vm_map_behavior_set(map, start,
				    end, VM_BEHAVIOR_REUSABLE);
				if (kr != KERN_SUCCESS) {
					vmdr_log_error(
						"[%d] Failed to free(reusable) (0x%llx, 0x%llx) err=%d\n",
						metadata->vdrm_pid, start, end, kr);
				}
				break;
			default:
				vmdr_log_error(
					"attempted to reclaim entry with unsupported behavior %u",
					entry->behavior);
				reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
				kr = KERN_INVALID_VALUE;
				goto done;
			}
			bytes_reclaimed += entry->size;
			KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_END,
			    kr);
		}
	}

	assert(head + num_reclaimed <= busy);
	head += num_reclaimed;
	kr = reclaim_copyout_head(metadata, head);
	if (kr != KERN_SUCCESS) {
		goto done;
	}
	if (busy > head) {
		busy = head;
		kr = reclaim_copyout_busy(metadata, busy);
		if (kr != KERN_SUCCESS) {
			goto done;
		}
	}

done:
	vmdr_log_debug("[%d] reclaimed %u entries (%llu B) head=%llu "
	    "busy=%llu tail=%llu len=%u", metadata->vdrm_pid, num_reclaimed,
	    bytes_reclaimed, head, busy, tail, metadata->vdrm_buffer_len);
	vm_map_switch_back(switch_ctx);
	KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_END,
	    bytes_reclaimed, num_reclaimed, kr);
	if (bytes_reclaimed_out) {
		*bytes_reclaimed_out = bytes_reclaimed;
	}
	if (num_reclaimed_out) {
		*num_reclaimed_out = num_reclaimed;
	}
	return kr;
}

/*
 * @func vmdr_reclaim_from_buffer
 *
 * @brief
 * Reclaim entries from the buffer until at least @c bytes_to_reclaim bytes
 * have been reclaimed or the buffer is exhausted.
 *
 * @param bytes_to_reclaim
 * The minimum number of bytes to reclaim
 *
 * @param num_bytes_reclaimed_out
 * The number of bytes reclaimed written out
 *
 * @param options
 * If RECLAIM_NO_FAULT is set, do not fault on the buffer if it has been paged
 * out.
 *
 * @discussion
 * The buffer must be owned by the caller.
 */
static kern_return_t
vmdr_reclaim_from_buffer(vm_deferred_reclamation_metadata_t metadata,
    mach_vm_size_t bytes_to_reclaim, mach_vm_size_t *num_bytes_reclaimed_out,
    vm_deferred_reclamation_options_t options)
{
	kern_return_t kr = KERN_SUCCESS;

	if (options & RECLAIM_NO_FAULT) {
		vm_fault_disable();
	}

	mach_vm_size_t total_bytes_reclaimed = 0;
	while (total_bytes_reclaimed < bytes_to_reclaim) {
		mach_vm_size_t cur_bytes_reclaimed;
		mach_vm_reclaim_count_t entries_reclaimed;
		kr = reclaim_chunk(metadata, bytes_to_reclaim - total_bytes_reclaimed,
		    &cur_bytes_reclaimed, kReclaimChunkSize, &entries_reclaimed);
		total_bytes_reclaimed += cur_bytes_reclaimed;
		if (entries_reclaimed == 0 || kr != KERN_SUCCESS) {
			break;
		}
	}

	if (options & RECLAIM_NO_FAULT) {
		vm_fault_enable();
	}
	vmdr_log_debug("reclaimed %llu B / %llu B from %d\n", total_bytes_reclaimed, bytes_to_reclaim, metadata->vdrm_pid);
	if (num_bytes_reclaimed_out) {
		*num_bytes_reclaimed_out = total_bytes_reclaimed;
	}
	return kr;
}

/*
 * Get and retain the reclamation metadata buffer for the given task.
 */
static vm_deferred_reclamation_metadata_t
vmdr_acquire_task_metadata(task_t task)
{
	vm_deferred_reclamation_metadata_t meta = NULL;
	assert(task != NULL);
	task_lock(task);
	if (!task_is_halting(task) && task_is_active(task)) {
		meta = task->deferred_reclamation_metadata;
	}
	if (meta != NULL) {
		vmdr_metadata_retain(meta);
	}
	task_unlock(task);
	return meta;
}

#pragma mark Buffer Resize/Synchronization

kern_return_t
vm_deferred_reclamation_buffer_flush_internal(task_t task,
    mach_vm_reclaim_count_t num_entries_to_reclaim,
    mach_vm_size_t *bytes_reclaimed_out)
{
	/* Initialize in case the reclaim loop below runs zero iterations */
	kern_return_t kr = KERN_SUCCESS;
	vm_deferred_reclamation_metadata_t metadata = NULL;
	mach_vm_reclaim_count_t total_reclaimed = 0;
	uint64_t bytes_reclaimed = 0;

	if (!task_is_active(task)) {
		return KERN_INVALID_TASK;
	}

	metadata = vmdr_acquire_task_metadata(task);
	if (metadata == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	vmdr_metadata_own(metadata);

	vmdr_log_debug("[%d] flushing %u entries\n", task_pid(task), num_entries_to_reclaim);
	KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_FLUSH) | DBG_FUNC_START, metadata->vdrm_pid, num_entries_to_reclaim);

	while (total_reclaimed < num_entries_to_reclaim) {
		mach_vm_reclaim_count_t cur_reclaimed;
		uint64_t cur_bytes_reclaimed;
		mach_vm_reclaim_count_t chunk_size = MIN(num_entries_to_reclaim - total_reclaimed, kReclaimChunkSize);
		kr = reclaim_chunk(metadata, UINT64_MAX, &cur_bytes_reclaimed, chunk_size,
		    &cur_reclaimed);
		total_reclaimed += cur_reclaimed;
		bytes_reclaimed += cur_bytes_reclaimed;
		if (cur_reclaimed == 0) {
			break;
		} else if (kr == KERN_ABORTED) {
			/*
			 * Unable to reclaim due to a lost race with
			 * userspace; yield the gate and try again
			 */
			vmdr_metadata_disown(metadata);
			vmdr_metadata_own(metadata);
			continue;
		} else if (kr != KERN_SUCCESS) {
			break;
		}
	}
	/*
	 * Tell the client how many bytes the kernel has reclaimed
	 * since the last time it updated its accounting
	 */
	bytes_reclaimed += metadata->vdrm_kernel_bytes_reclaimed;
	metadata->vdrm_kernel_bytes_reclaimed = 0;

	vmdr_metadata_disown(metadata);

	*bytes_reclaimed_out = bytes_reclaimed;
	KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_FLUSH) | DBG_FUNC_END, kr, total_reclaimed, bytes_reclaimed);
	DTRACE_VM2(reclaim_flush,
	    mach_vm_reclaim_count_t, num_entries_to_reclaim,
	    size_t, bytes_reclaimed);
	return kr;
}

kern_return_t
vm_deferred_reclamation_buffer_resize_internal(
	task_t task,
	mach_vm_reclaim_count_t len,
	mach_vm_size_t *bytes_reclaimed_out)
{
	kern_return_t kr;
	mach_vm_reclaim_count_t num_entries_reclaimed = 0;
	mach_vm_reclaim_count_t old_len;

	if (task == TASK_NULL) {
		return KERN_INVALID_TASK;
	}
	if (len == 0) {
		return KERN_INVALID_ARGUMENT;
	}
	vm_deferred_reclamation_metadata_t metadata = vmdr_acquire_task_metadata(task);
	if (metadata == NULL) {
		return KERN_INVALID_TASK;
	}

	/* Size must be a multiple of the page size */
	vm_map_t map = task->map;
	mach_vm_size_t new_size = vmdr_round_len_to_size(map, len);
	if (new_size == 0) {
		vmdr_metadata_release(metadata);
		return KERN_INVALID_ARGUMENT;
	}
	if (new_size > metadata->vdrm_ring_size) {
		vmdr_metadata_release(metadata);
		return KERN_NO_SPACE;
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_RESIZE) | DBG_FUNC_START,
	    task_pid(task), new_size);

	/*
	 * Prevent other threads from operating on this buffer while it is
	 * resized. It is the caller's responsibility to ensure mutual
	 * exclusion with other user threads.
	 */
	vmdr_metadata_own(metadata);

	old_len = metadata->vdrm_buffer_len;

	vmdr_log_debug("%s [%d] resizing buffer %u -> %u entries\n",
	    task_best_name(task), task_pid(task), old_len, len);

	/*
	 * Reclaim all the entries currently in the buffer to prevent re-use
	 * of old reclaim ids that will alias differently into the newly sized
	 * buffer.
	 *
	 * TODO: Consider encoding the ringbuffer capacity in the
	 * mach_vm_reclaim_id_t, so reuses can still find objects after a resize.
	 */
	mach_vm_size_t total_bytes_reclaimed = 0;
	do {
		mach_vm_size_t cur_bytes_reclaimed;
		kr = reclaim_chunk(metadata, UINT64_MAX, &cur_bytes_reclaimed, kReclaimChunkSize,
		    &num_entries_reclaimed);
		total_bytes_reclaimed += cur_bytes_reclaimed;
		if (kr != KERN_SUCCESS) {
			goto fail;
		}
	} while (num_entries_reclaimed > 0);

	vmdr_log_debug("[%d] successfully resized buffer | reclaimed: %llu B "
	    "kernel_reclaimed: %zu B\n", metadata->vdrm_pid,
	    total_bytes_reclaimed, metadata->vdrm_kernel_bytes_reclaimed);

	total_bytes_reclaimed += metadata->vdrm_kernel_bytes_reclaimed;
	metadata->vdrm_kernel_bytes_reclaimed = 0;

	/* Publish the new ring length in the kernel metadata */
	vmdr_metadata_lock(metadata);
	metadata->vdrm_buffer_len = len;
	vmdr_metadata_disown_locked(metadata);
	vmdr_metadata_unlock(metadata);
	vmdr_metadata_release(metadata);

	*bytes_reclaimed_out = total_bytes_reclaimed;

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_RESIZE) | DBG_FUNC_END, KERN_SUCCESS, num_entries_reclaimed, total_bytes_reclaimed);
	DTRACE_VM2(reclaim_ring_resize,
	    mach_vm_reclaim_count_t, old_len,
	    mach_vm_reclaim_count_t, len);
	return KERN_SUCCESS;

fail:
	vmdr_metadata_disown(metadata);
	vmdr_metadata_release(metadata);
	*bytes_reclaimed_out = total_bytes_reclaimed;
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_RESIZE) | DBG_FUNC_END, kr, num_entries_reclaimed);
	return kr;
}

#pragma mark Accounting

#if CONFIG_WORKING_SET_ESTIMATION
extern vm_pressure_level_t memorystatus_vm_pressure_level;

static kern_return_t
vmdr_calculate_autotrim_threshold(vm_deferred_reclamation_metadata_t metadata, size_t *trim_threshold_out)
{
	kern_return_t kr;
	uint32_t autotrim_pct;

	/*
	 * Determine the autotrim threshold based on the current pressure level
	 */
	vm_pressure_level_t pressure_level = os_atomic_load(&memorystatus_vm_pressure_level, relaxed);
	switch (pressure_level) {
	case kVMPressureNormal:
		autotrim_pct = vm_reclaim_autotrim_pct_normal;
		break;
	case kVMPressureWarning:
	case kVMPressureUrgent:
		autotrim_pct = vm_reclaim_autotrim_pct_pressure;
		break;
	case kVMPressureCritical:
		autotrim_pct = vm_reclaim_autotrim_pct_critical;
		break;
	default:
		panic("vm_reclaim: unexpected vm_pressure_level %d", pressure_level);
	}

	/*
	 * Estimate the task's maximum working set size
	 */
	ledger_amount_t phys_footprint_max = 0;

	vmdr_metadata_lock(metadata);
	task_t task = metadata->vdrm_task;
	if (task == TASK_NULL) {
		vmdr_metadata_unlock(metadata);
		return KERN_INVALID_TASK;
	}
	task_reference(task);
	vmdr_metadata_unlock(metadata);

	kr = ledger_get_lifetime_max(task->ledger,
	    task_ledgers.phys_footprint, &phys_footprint_max);
	assert3u(kr, ==, KERN_SUCCESS);

	task_deallocate(task);

	*trim_threshold_out = phys_footprint_max * autotrim_pct / 100;
	return KERN_SUCCESS;
}
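
/*
 * Worked example with the default tunables above (assuming a task whose
 * lifetime peak phys_footprint is 1 GiB): the autotrim threshold is
 * ~102 MiB at kVMPressureNormal (10%), ~51 MiB at warning/urgent (5%),
 * and ~10 MiB at critical (1%), so buffers are trimmed more aggressively
 * as system-wide memory pressure rises.
 */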

#define VMDR_WMA_UNIT (1 << 8)
#define VMDR_WMA_MIX(base, e) ((vm_reclaim_wma_weight_base * (base) + (e) * VMDR_WMA_UNIT * vm_reclaim_wma_weight_cur) / vm_reclaim_wma_denom)
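/*
 * With the default weights (base = 3, cur = 1, denom = 4) this is the
 * fixed-point recurrence wma' = (3 * wma + 256 * e) / 4: each sampling
 * period moves the average a quarter of the way toward the new minimum e,
 * scaled by VMDR_WMA_UNIT = 256 to keep 8 fractional bits. For example,
 * starting from wma = 0 with a steady e of 1 MiB, wma / VMDR_WMA_UNIT is
 * 256 KiB after one period, 448 KiB after two, and converges toward 1 MiB.
 */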
#endif /* CONFIG_WORKING_SET_ESTIMATION */

/*
 * @func vmdr_sample_working_set
 *
 * @brief sample the working set size of the given buffer
 *
 * @param metadata
 * The reclaim buffer to sample
 *
 * @param trim_threshold_out
 * If the buffer should be trimmed, the amount to trim (in bytes) will be
 * written out
 *
 * @returns KERN_MEMORY_ERROR if copyio failed due to RECLAIM_NO_FAULT
 *
 * @discussion
 * The caller must own the buffer
 */
static mach_error_t
vmdr_sample_working_set(vm_deferred_reclamation_metadata_t metadata,
    mach_vm_size_t *trim_threshold_out, vm_deferred_reclamation_options_t options)
{
	mach_error_t err = ERR_SUCCESS;
	size_t min_reclaimable_bytes = 0, cur_reclaimable_bytes = 0;
	uint64_t wma = 0;

	vmdr_metadata_assert_owned(metadata);

	*trim_threshold_out = 0;

	vm_map_switch_context_t map_ctx = vm_map_switch_to(metadata->vdrm_map);

	if (options & RECLAIM_NO_FAULT) {
		vm_fault_disable();
	}
#if CONFIG_WORKING_SET_ESTIMATION
	err = reclaim_copyin_min_reclaimable_bytes(metadata, &min_reclaimable_bytes);
	if (err != ERR_SUCCESS) {
		goto done;
	}

	uint64_t now = mach_absolute_time();
	if (now - metadata->vdrm_last_sample_abs < vm_reclaim_sampling_period_abs) {
		/* A sampling period has not elapsed */
		goto done;
	}
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_SAMPLE) | DBG_FUNC_START,
	    metadata->vdrm_pid,
	    now,
	    metadata->vdrm_last_sample_abs,
	    min_reclaimable_bytes);

	err = reclaim_copyin_reclaimable_bytes(metadata, &cur_reclaimable_bytes);
	if (err != ERR_SUCCESS) {
		goto done;
	}

	/* Reset the minimum to start a new sampling interval */
	err = reclaim_copyout_min_reclaimable_bytes(metadata, cur_reclaimable_bytes);
	if (err != ERR_SUCCESS) {
		goto done;
	}

	/*
	 * The user accounting will overcount if the kernel has reclaimed
	 * without telling the client about it.
	 */
	if (cur_reclaimable_bytes >= metadata->vdrm_kernel_bytes_reclaimed) {
		cur_reclaimable_bytes -= metadata->vdrm_kernel_bytes_reclaimed;
	} else {
		vmdr_log_error("[%d] more bytes have been reclaimed (%zu) than "
		    "are supposedly in buffer (%zu)\n", metadata->vdrm_pid,
		    metadata->vdrm_kernel_bytes_reclaimed, cur_reclaimable_bytes);
		/* This will cause an underflow in user accounting */
		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_ACCOUNTING_FAILURE, cur_reclaimable_bytes);
		err = KERN_ABORTED;
		goto done;
	}
	if (min_reclaimable_bytes >= metadata->vdrm_kernel_bytes_reclaimed) {
		min_reclaimable_bytes -= metadata->vdrm_kernel_bytes_reclaimed;
	} else {
		min_reclaimable_bytes = 0;
	}
	uint64_t samples_elapsed = (now - metadata->vdrm_last_sample_abs) /
	    vm_reclaim_sampling_period_abs;
	if (samples_elapsed > vm_reclaim_abandonment_threshold) {
		/*
		 * Many sampling periods have elapsed since the ring was
		 * last sampled. Don't bother computing the WMA and assume
		 * the buffer's current contents are unneeded.
		 */
		wma = VMDR_WMA_MIX(0, cur_reclaimable_bytes);
	} else {
		/*
		 * Compute an exponential moving average of the minimum amount of
		 * reclaimable memory in this buffer. Multiple sampling periods may
		 * have elapsed since the last sample. By definition, the minimum must
		 * be the same for all elapsed periods (otherwise libmalloc would have
		 * called down to update accounting), so feed the average back into
		 * the mix once per elapsed period.
		 */
		wma = metadata->vdrm_reclaimable_bytes_wma;
		for (unsigned int i = 0; i < samples_elapsed; i++) {
			wma = VMDR_WMA_MIX(wma, min_reclaimable_bytes);
		}
	}

	metadata->vdrm_reclaimable_bytes_wma = wma;
	size_t unneeded_bytes = MIN(min_reclaimable_bytes,
	    metadata->vdrm_reclaimable_bytes_wma / VMDR_WMA_UNIT);

	size_t autotrim_threshold;
	err = vmdr_calculate_autotrim_threshold(metadata, &autotrim_threshold);
	if (err != ERR_SUCCESS) {
		goto done;
	}

	if (unneeded_bytes >= vm_map_page_size(metadata->vdrm_map) &&
	    unneeded_bytes >= autotrim_threshold) {
		*trim_threshold_out = vm_map_round_page(unneeded_bytes,
		    vm_map_page_mask(metadata->vdrm_map));
	}
1613 #else /* !CONFIG_WORKING_SET_ESTIMATION */
1614 (void)min_reclaimable_bytes;
1615 (void)wma;
1616 err = reclaim_copyin_reclaimable_bytes(metadata, &cur_reclaimable_bytes);
1617 if (err != ERR_SUCCESS) {
1618 goto done;
1619 }
1620 if (cur_reclaimable_bytes >= metadata->vdrm_kernel_bytes_reclaimed) {
1621 cur_reclaimable_bytes -= metadata->vdrm_kernel_bytes_reclaimed;
1622 } else {
		vmdr_log_error("[%d] more bytes have been reclaimed (%zu) than "
		    "are supposedly in the buffer (%zu)\n", metadata->vdrm_pid,
		    metadata->vdrm_kernel_bytes_reclaimed, cur_reclaimable_bytes);
1626 }
	if (cur_reclaimable_bytes > vm_reclaim_max_threshold) {
		/* Trim the excess above the maximum threshold */
		*trim_threshold_out = cur_reclaimable_bytes - vm_reclaim_max_threshold;
	}
1630 #endif /* CONFIG_WORKING_SET_ESTIMATION */
1631
1632 metadata->vdrm_last_sample_abs = mach_absolute_time();
1633 metadata->vdrm_reclaimable_bytes_last = cur_reclaimable_bytes;
1634
1635 done:
1636 vm_map_switch_back(map_ctx);
1637 if (options & RECLAIM_NO_FAULT) {
1638 vm_fault_enable();
1639 }
1640 KDBG(VM_RECLAIM_CODE(VM_RECLAIM_SAMPLE) | DBG_FUNC_END,
1641 wma,
1642 min_reclaimable_bytes,
1643 cur_reclaimable_bytes,
1644 *trim_threshold_out);
1645 DTRACE_VM5(reclaim_sample,
1646 pid_t, metadata->vdrm_pid,
1647 uint64_t, wma,
1648 size_t, min_reclaimable_bytes,
1649 size_t, cur_reclaimable_bytes,
1650 size_t, *trim_threshold_out);
1651 vmdr_log_debug("sampled buffer with min %lu est %lu trim %llu wma %llu\n",
1652 min_reclaimable_bytes,
1653 cur_reclaimable_bytes,
1654 *trim_threshold_out,
1655 wma);
1656 return err;
1657 }
1658
1659 /*
1660 * Caller must have buffer owned and unlocked
1661 */
1662 static kern_return_t
1663 vmdr_trim(vm_deferred_reclamation_metadata_t metadata, mach_vm_size_t bytes_to_reclaim,
1664 mach_vm_size_t *bytes_reclaimed, vm_deferred_reclamation_options_t options)
1665 {
1666 kern_return_t kr;
1667 KDBG(VM_RECLAIM_CODE(VM_RECLAIM_TRIM) | DBG_FUNC_START,
1668 metadata->vdrm_pid, bytes_to_reclaim);
1669
1670 kr = vmdr_reclaim_from_buffer(metadata, bytes_to_reclaim,
1671 bytes_reclaimed, options);
1672
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_TRIM) | DBG_FUNC_END, kr, *bytes_reclaimed);
1674 DTRACE_VM3(reclaim_trim,
1675 pid_t, metadata->vdrm_pid,
1676 size_t, bytes_to_reclaim,
1677 size_t, *bytes_reclaimed);
1678 return kr;
1679 }
1680
1681 /*
1682 * Caller must have buffer owned and unlocked
1683 */
1684 static kern_return_t
1685 vmdr_drain(vm_deferred_reclamation_metadata_t metadata, mach_vm_size_t *bytes_reclaimed,
1686 vm_deferred_reclamation_options_t options)
1687 {
1688 kern_return_t kr;
1689 KDBG(VM_RECLAIM_CODE(VM_RECLAIM_DRAIN) | DBG_FUNC_START,
1690 metadata->vdrm_pid);
1691
1692 kr = vmdr_reclaim_from_buffer(metadata, UINT64_MAX,
1693 bytes_reclaimed, options);
1694
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_DRAIN) | DBG_FUNC_END, kr, *bytes_reclaimed);
1696 DTRACE_VM2(reclaim_drain,
1697 pid_t, metadata->vdrm_pid,
1698 size_t, *bytes_reclaimed);
1699 return kr;
1700 }
1701
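/*
 * Update the client's reclaim accounting. Samples the ring's working
 * set, trims any bytes the estimator deems unneeded, and reports the
 * total number of bytes the kernel has reclaimed since the client last
 * updated its accounting.
 */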
1702 mach_error_t
1703 vm_deferred_reclamation_update_accounting_internal(task_t task, uint64_t *bytes_reclaimed_out)
1704 {
1705 vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
	mach_vm_size_t bytes_to_reclaim = 0, bytes_reclaimed = 0;
1707 mach_error_t err = ERR_SUCCESS;
1708
1709 if (metadata == NULL) {
1710 return KERN_NOT_FOUND;
1711 }
1712
1713 if (!metadata->vdrm_pid) {
1714 /* If this is a forked child, we may not yet have a pid */
1715 metadata->vdrm_pid = task_pid(task);
1716 }
1717
1718 KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_START,
1719 metadata->vdrm_pid);
1720
1721 vmdr_metadata_lock(metadata);
1722 uint64_t now = mach_absolute_time();
1723 if (now - metadata->vdrm_last_sample_abs < vm_reclaim_sampling_period_abs) {
1724 /*
1725 * This is a fast path to avoid waiting on the gate if another
1726 * thread beat us to sampling.
1727 */
1728 vmdr_metadata_unlock(metadata);
1729 goto done;
1730 }
1731 vmdr_metadata_own_locked(metadata, RECLAIM_OPTIONS_NONE);
1732 vmdr_metadata_unlock(metadata);
1733
1734 err = vmdr_sample_working_set(metadata, &bytes_to_reclaim, RECLAIM_OPTIONS_NONE);
1735 if (err != ERR_SUCCESS) {
1736 vmdr_metadata_disown(metadata);
1737 goto done;
1738 }
1739 if (bytes_to_reclaim) {
1740 vmdr_log_debug("[%d] trimming %llu B\n", metadata->vdrm_pid, bytes_to_reclaim);
1741
1742 err = vmdr_trim(metadata, bytes_to_reclaim, &bytes_reclaimed, RECLAIM_OPTIONS_NONE);
1743
1744 if (err == KERN_ABORTED) {
1745 /*
			 * We were unable to complete the trim because we lost
			 * a race with userspace. This need not be fatal since
			 * the accounting was successfully updated.
1749 */
1750 err = KERN_SUCCESS;
1751 }
1752 }
1753
1754 /*
1755 * Tell the client how many bytes the kernel has reclaimed
1756 * since the last time it updated its accounting
1757 */
1758 bytes_reclaimed += metadata->vdrm_kernel_bytes_reclaimed;
1759 metadata->vdrm_kernel_bytes_reclaimed = 0;
1760
1761 vmdr_metadata_disown(metadata);
1762
1763 done:
1764 KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_END,
1765 metadata->vdrm_last_sample_abs,
1766 bytes_to_reclaim,
1767 bytes_reclaimed);
1768 *bytes_reclaimed_out = (uint64_t)bytes_reclaimed;
1769 return err;
1770 }
1771
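/*
 * Fully reclaim all entries in the given task's ring. Returns
 * KERN_ABORTED if the task is inactive or halting.
 */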
1772 kern_return_t
1773 vm_deferred_reclamation_task_drain(task_t task,
1774 vm_deferred_reclamation_options_t options)
1775 {
1776 kern_return_t kr;
1777 mach_vm_size_t bytes_reclaimed;
1778
1779 task_lock(task);
1780 if (!task_is_active(task) || task_is_halting(task)) {
1781 task_unlock(task);
1782 return KERN_ABORTED;
1783 }
1784 vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
1785 if (metadata == NULL) {
1786 task_unlock(task);
1787 return KERN_SUCCESS;
1788 }
1789 vmdr_metadata_retain(metadata);
1790 task_unlock(task);
1791
1792 vmdr_metadata_own(metadata);
1793
1794 kr = vmdr_drain(metadata, &bytes_reclaimed, options);
1795 metadata->vdrm_kernel_bytes_reclaimed += bytes_reclaimed;
1796
1797 vmdr_metadata_disown(metadata);
1798 vmdr_metadata_release(metadata);
1799 return kr;
1800 }
1801
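/*
 * The task is being suspended. Wake the scavenger thread so it can
 * fully drain the rings of suspended tasks (see RECLAIM_GC_SCAVENGE).
 */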
1802 void
1803 vm_deferred_reclamation_task_suspend(task_t task)
1804 {
1805 if (task->deferred_reclamation_metadata) {
1806 sched_cond_signal(&vm_reclaim_scavenger_cond, vm_reclaim_scavenger_thread);
1807 }
1808 }
1809
1810 #pragma mark KPIs
1811
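/*
 * Clone the parent's reclaim-ring metadata for a forked child. The
 * caller must own the parent's buffer. The child's metadata is not
 * placed on the global buffer list here; that happens later via
 * vm_deferred_reclamation_task_fork_register().
 */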
1812 vm_deferred_reclamation_metadata_t
1813 vm_deferred_reclamation_task_fork(task_t task, vm_deferred_reclamation_metadata_t parent)
1814 {
1815 vm_deferred_reclamation_metadata_t metadata = NULL;
1816 vmdr_metadata_assert_owned(parent);
1817 vmdr_log_debug("forking [%d]\n", parent->vdrm_pid);
1818
1819 assert(task->deferred_reclamation_metadata == NULL);
1820 metadata = vmdr_metadata_alloc(task, parent->vdrm_ring_addr,
1821 parent->vdrm_ring_size, parent->vdrm_buffer_len);
1822
1823 metadata->vdrm_last_sample_abs = parent->vdrm_last_sample_abs;
1824 metadata->vdrm_kernel_bytes_reclaimed = parent->vdrm_kernel_bytes_reclaimed;
1825 #if CONFIG_WORKING_SET_ESTIMATION
1826 metadata->vdrm_reclaimable_bytes_wma = parent->vdrm_reclaimable_bytes_wma;
1827 #endif /* CONFIG_WORKING_SET_ESTIMATION */
1828
1829 return metadata;
1830 }
1831
1832 void
1833 vm_deferred_reclamation_task_fork_register(vm_deferred_reclamation_metadata_t metadata)
1834 {
1835 assert(metadata != NULL);
1836 assert(!metadata->vdrm_is_registered);
1837
1838 lck_mtx_lock(&reclaim_buffers_lock);
1839 metadata->vdrm_is_registered = true;
1840 vmdr_list_append_locked(metadata);
1841 lck_mtx_unlock(&reclaim_buffers_lock);
1842 }
1843
1844 bool
1845 vm_deferred_reclamation_task_has_ring(task_t task)
1846 {
1847 return task->deferred_reclamation_metadata != NULL;
1848 }
1849
1850 void
1851 vm_deferred_reclamation_ring_own(vm_deferred_reclamation_metadata_t metadata)
1852 {
1853 vmdr_metadata_own(metadata);
1854 }
1855
1856 void
1857 vm_deferred_reclamation_ring_disown(vm_deferred_reclamation_metadata_t metadata)
1858 {
1859 vmdr_metadata_disown(metadata);
1860 }
1861
1862 void
1863 vm_deferred_reclamation_gc(vm_deferred_reclamation_gc_action_t action,
1864 mach_vm_size_t *total_bytes_reclaimed,
1865 vm_deferred_reclamation_options_t options)
1866 {
1867 vmdr_garbage_collect(action, total_bytes_reclaimed, options);
1868 }
1869
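/*
 * Re-sync the task's est_reclaimable ledger with the most recently
 * sampled estimate of the ring's reclaimable bytes.
 */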
1870 void
1871 vm_deferred_reclamation_settle_ledger(task_t task)
1872 {
1873 vm_deferred_reclamation_metadata_t meta = vmdr_acquire_task_metadata(task);
1874 if (meta == NULL) {
1875 return;
1876 }
1877 vmdr_metadata_lock(meta);
1878 ledger_zero_balance(task->ledger, task_ledgers.est_reclaimable);
1879 ledger_credit(
1880 task->ledger,
1881 task_ledgers.est_reclaimable,
1882 meta->vdrm_reclaimable_bytes_last);
1883 vmdr_metadata_unlock(meta);
1884 vmdr_metadata_release(meta);
1885 }
1886
1887 #pragma mark Global Reclamation GC
1888
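/*
 * Walk every registered reclaim ring once and apply the given GC
 * action. Passes are serialized on vm_reclaim_gc_gate: concurrent
 * callers either wait for a handoff or, with RECLAIM_NO_WAIT, return
 * immediately.
 */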
1889 static void
1890 vmdr_garbage_collect(vm_deferred_reclamation_gc_action_t action,
1891 mach_vm_size_t *total_bytes_reclaimed_out,
1892 vm_deferred_reclamation_options_t options)
1893 {
1894 kern_return_t kr;
1895 mach_vm_size_t total_bytes_reclaimed = 0;
1896 gate_wait_result_t wr;
1897
1898 lck_mtx_lock(&reclaim_buffers_lock);
1899 kr = lck_mtx_gate_try_close(&reclaim_buffers_lock, &vm_reclaim_gc_gate);
1900 if (kr != KERN_SUCCESS) {
1901 if (options & RECLAIM_NO_WAIT) {
1902 lck_mtx_unlock(&reclaim_buffers_lock);
1903 return;
1904 }
1905 wr = lck_mtx_gate_wait(&reclaim_buffers_lock, &vm_reclaim_gc_gate, LCK_SLEEP_DEFAULT, THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
1906 assert3u(wr, ==, GATE_HANDOFF);
1907 }
1908
1909 vm_reclaim_gc_epoch++;
1910 vmdr_log_debug("running global GC\n");
1911 while (true) {
1912 vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&reclaim_buffers);
1913 if (metadata == NULL) {
1914 break;
1915 }
1916 vmdr_list_remove_locked(metadata);
1917 vmdr_list_append_locked(metadata);
1918 vmdr_metadata_retain(metadata);
1919 lck_mtx_unlock(&reclaim_buffers_lock);
1920
1921 vmdr_metadata_lock(metadata);
1922
1923 if (metadata->vdrm_reclaimed_at >= vm_reclaim_gc_epoch) {
1924 /* We've already seen this one. We're done */
1925 vmdr_metadata_unlock(metadata);
1926 vmdr_metadata_release(metadata);
1927 lck_mtx_lock(&reclaim_buffers_lock);
1928 break;
1929 }
1930 metadata->vdrm_reclaimed_at = vm_reclaim_gc_epoch;
1931
1932 task_t task = metadata->vdrm_task;
1933 if (task == TASK_NULL ||
1934 !task_is_active(task) ||
1935 task_is_halting(task)) {
1936 goto next;
1937 }
1938 bool buffer_is_suspended = task_is_app_suspended(task);
1939 task = TASK_NULL;
1940
1941 mach_vm_size_t bytes_reclaimed = 0;
1942 mach_vm_size_t bytes_to_reclaim = 0;
1943
1944 switch (action) {
1945 case RECLAIM_GC_DRAIN:
1946 if (!vmdr_metadata_own_locked(metadata, options)) {
1947 goto next;
1948 }
1949 vmdr_metadata_unlock(metadata);
1950
1951 vmdr_log_debug("draining [%d]\n", metadata->vdrm_pid);
1952 kr = vmdr_drain(metadata, &bytes_reclaimed, options);
1953 metadata->vdrm_kernel_bytes_reclaimed += bytes_reclaimed;
1954
1955 vmdr_metadata_lock(metadata);
1956 vmdr_metadata_disown_locked(metadata);
1957 break;
1958 case RECLAIM_GC_SCAVENGE:
1959 if (buffer_is_suspended) {
1960 if (!vmdr_metadata_own_locked(metadata, options)) {
1961 goto next;
1962 }
1963 vmdr_metadata_unlock(metadata);
1964
1965 /* This buffer is no longer in use, fully reclaim it. */
1966 vmdr_log_debug("found suspended buffer [%d], draining\n", metadata->vdrm_pid);
1967 kr = vmdr_drain(metadata, &bytes_reclaimed, options);
1968 metadata->vdrm_kernel_bytes_reclaimed += bytes_reclaimed;
1969
1970 vmdr_metadata_lock(metadata);
1971 vmdr_metadata_disown_locked(metadata);
1972 }
1973 break;
1974 case RECLAIM_GC_TRIM:
1975 if (!vmdr_metadata_own_locked(metadata, options)) {
1976 goto next;
1977 }
1978 vmdr_metadata_unlock(metadata);
1979 kr = vmdr_sample_working_set(metadata, &bytes_to_reclaim, options);
1980 if (kr == KERN_SUCCESS && bytes_to_reclaim) {
1981 vmdr_log_debug("GC found stale buffer (%d), trimming\n", metadata->vdrm_pid);
1982 kr = vmdr_trim(metadata, bytes_to_reclaim, &bytes_reclaimed, options);
1983 metadata->vdrm_kernel_bytes_reclaimed += bytes_reclaimed;
1984 }
1985 vmdr_metadata_lock(metadata);
1986 vmdr_metadata_disown_locked(metadata);
1987 break;
1988 }
1989 if (bytes_reclaimed) {
1990 vm_reclaim_gc_reclaim_count++;
1991 total_bytes_reclaimed += bytes_reclaimed;
1992 }
1993 if (metadata->vdrm_waiters && action != RECLAIM_GC_TRIM) {
1994 thread_wakeup((event_t)&metadata->vdrm_waiters);
1995 }
1996 next:
1997 vmdr_metadata_unlock(metadata);
1998 vmdr_metadata_release(metadata);
1999 lck_mtx_lock(&reclaim_buffers_lock);
2000 }
2001 lck_mtx_gate_handoff(&reclaim_buffers_lock, &vm_reclaim_gc_gate, GATE_HANDOFF_OPEN_IF_NO_WAITERS);
2002 lck_mtx_unlock(&reclaim_buffers_lock);
2003 *total_bytes_reclaimed_out = total_bytes_reclaimed;
2004 }
2005
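/*
 * Scavenger thread body: run a SCAVENGE pass over all rings each time
 * the condition is signaled (e.g. when a task is suspended).
 */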
2006 OS_NORETURN
2007 static void
2008 vm_reclaim_scavenger_thread_continue(__unused void *param, __unused wait_result_t wr)
2009 {
2010 sched_cond_ack(&vm_reclaim_scavenger_cond);
2011
2012 while (true) {
2013 mach_vm_size_t total_bytes_reclaimed;
2014 vmdr_garbage_collect(RECLAIM_GC_SCAVENGE, &total_bytes_reclaimed,
2015 RECLAIM_OPTIONS_NONE);
2016 vmdr_log_info("scavenger reclaimed %llu KiB of virtual memory\n",
2017 total_bytes_reclaimed >> 10);
2018 sched_cond_wait(&vm_reclaim_scavenger_cond, THREAD_UNINT,
2019 vm_reclaim_scavenger_thread_continue);
2020 }
2021 }
2022
2023 OS_NORETURN
2024 static void
2025 vm_reclaim_scavenger_thread_init(__unused void *param, __unused wait_result_t wr)
2026 {
2027 thread_set_thread_name(current_thread(), "VM_reclaim_scavenger");
2028 #if CONFIG_THREAD_GROUPS
2029 thread_group_vm_add();
2030 #endif /* CONFIG_THREAD_GROUPS */
2031 sched_cond_wait(&vm_reclaim_scavenger_cond, THREAD_UNINT, vm_reclaim_scavenger_thread_continue);
2032 __builtin_unreachable();
2033 }
2034
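/*
 * Boot-time initialization: create the log handle, convert the sampling
 * period to absolute-time units, and launch the scavenger thread.
 */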
2035 __startup_func
2036 static void
2037 vm_deferred_reclamation_init(void)
2038 {
2039 vm_reclaim_log_handle = os_log_create("com.apple.xnu", "vm_reclaim");
2040 nanoseconds_to_absolutetime((uint64_t)vm_reclaim_sampling_period_ns,
2041 &vm_reclaim_sampling_period_abs);
2042
2043 sched_cond_init(&vm_reclaim_scavenger_cond);
2044 lck_mtx_gate_init(&reclaim_buffers_lock, &vm_reclaim_gc_gate);
2045 kern_return_t kr = kernel_thread_start_priority(vm_reclaim_scavenger_thread_init,
2046 NULL, BASEPRI_KERNEL, &vm_reclaim_scavenger_thread);
2047 if (kr != KERN_SUCCESS) {
2048 panic("Unable to create VM reclaim thread, %d", kr);
2049 }
2050 }
2051
2052 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_deferred_reclamation_init);
2053
2054 #pragma mark Debug Interfaces
2055
2056 #if DEVELOPMENT || DEBUG
2057
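/*
 * Test-only helper: block until the scavenger has processed this task's
 * ring. Returns true if woken by a reclaim, false if the wait was
 * interrupted.
 */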
2058 bool
2059 vm_deferred_reclamation_block_until_task_has_been_reclaimed(task_t task)
2060 {
2061 bool reclaimed;
2062 vm_deferred_reclamation_metadata_t metadata;
2063
2064 metadata = vmdr_acquire_task_metadata(task);
2065 if (metadata == NULL) {
2066 return false;
2067 }
2068 vmdr_metadata_lock(metadata);
2069
2070 metadata->vdrm_waiters++;
2071 /* Wake up the scavenger thread */
2072 sched_cond_signal(&vm_reclaim_scavenger_cond, vm_reclaim_scavenger_thread);
2073 wait_result_t wr = lck_mtx_sleep(&metadata->vdrm_lock,
2074 LCK_SLEEP_DEFAULT, (event_t)&metadata->vdrm_waiters,
2075 THREAD_ABORTSAFE);
2076 metadata->vdrm_waiters--;
2077 reclaimed = (wr == THREAD_AWAKENED);
2078
2079 vmdr_metadata_unlock(metadata);
2080 vmdr_metadata_release(metadata);
2081 return reclaimed;
2082 }
2083
2084 #endif /* DEVELOPMENT || DEBUG */
2085
2086 #pragma mark Introspectibility
2087
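/*
 * Report the address and size of the task's reclaim ring; yields 0/0
 * if the task has no ring registered.
 */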
2088 kern_return_t
2089 vm_deferred_reclamation_buffer_query_internal(
2090 task_t task,
2091 mach_vm_address_ut *addr_out_u,
2092 mach_vm_size_ut *size_out_u)
2093 {
2094 vm_deferred_reclamation_metadata_t meta;
2095
2096 if (task == NULL) {
2097 return KERN_INVALID_TASK;
2098 }
2099
2100 if ((addr_out_u == NULL) || (size_out_u == NULL)) {
2101 return KERN_INVALID_ARGUMENT;
2102 }
2103
2104 meta = vmdr_acquire_task_metadata(task);
2105
2106 if (meta == NULL) {
2107 *addr_out_u = vm_sanitize_wrap_addr(0);
2108 *size_out_u = vm_sanitize_wrap_size(0);
2109 } else {
2110 vmdr_metadata_lock(meta);
2111 *addr_out_u = vm_sanitize_wrap_addr(meta->vdrm_ring_addr);
2112 *size_out_u = vm_sanitize_wrap_size(meta->vdrm_ring_size);
2113 vmdr_metadata_unlock(meta);
2114 vmdr_metadata_release(meta);
2115 }
2116
2117 return KERN_SUCCESS;
2118 }
2119