/*
 * Copyright (c) 2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/exc_guard.h>
#include <kern/locks.h>
#include <kern/task.h>
#include <kern/zalloc.h>
#include <kern/misc_protos.h>
#include <kern/sched_prim.h>
#include <kern/startup.h>
#include <kern/thread_group.h>
#include <libkern/OSAtomic.h>
#include <mach/kern_return.h>
#include <mach/mach_types.h>
#include <mach/vm_reclaim_private.h>
#include <os/atomic_private.h>
#include <os/base_private.h>
#include <os/log.h>
#include <os/refcnt.h>
#include <os/refcnt_internal.h>
#include <pexpert/pexpert.h>
#include <sys/errno.h>
#include <sys/kdebug.h>
#include <sys/queue.h>
#include <sys/reason.h>
#include <vm/vm_fault_xnu.h>
#include <vm/vm_map.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_pageout_internal.h>
#include <vm/vm_reclaim_internal.h>
#include <vm/vm_sanitize_internal.h>
#include <vm/vm_kern_xnu.h>

#pragma mark Tunables

#if DEVELOPMENT || DEBUG
TUNABLE(uint32_t, kReclaimChunkSize, "vm_reclaim_chunk_size", 16);
#else /* RELEASE */
const uint32_t kReclaimChunkSize = 16;
#endif /* DEVELOPMENT || DEBUG */
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_sampling_period_ns, "vm_reclaim_sampling_period_ns",
#if XNU_TARGET_OS_OSX
    10ULL * NSEC_PER_SEC);
#else
    1ULL * NSEC_PER_SEC);
#endif
TUNABLE_DEV_WRITEABLE(bool, vm_reclaim_enabled, "vm_reclaim_enabled", true);
TUNABLE_DEV_WRITEABLE(uint32_t, vm_reclaim_autotrim_pct_normal, "vm_reclaim_autotrim_pct_normal", 10);
TUNABLE_DEV_WRITEABLE(uint32_t, vm_reclaim_autotrim_pct_pressure, "vm_reclaim_autotrim_pct_pressure", 5);
TUNABLE_DEV_WRITEABLE(uint32_t, vm_reclaim_autotrim_pct_critical, "vm_reclaim_autotrim_pct_critical", 1);
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_wma_weight_base, "vm_reclaim_wma_weight_base", 3);
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_wma_weight_cur, "vm_reclaim_wma_weight_cur", 1);
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_wma_denom, "vm_reclaim_wma_denom", 4);
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_abandonment_threshold, "vm_reclaim_abandonment_threshold", 512);

TUNABLE(bool, panic_on_kill, "vm_reclaim_panic_on_kill", false);
#if DEVELOPMENT || DEBUG
TUNABLE_WRITEABLE(bool, vm_reclaim_debug, "vm_reclaim_debug", false);
#endif

#pragma mark Declarations
typedef struct proc *proc_t;
extern const char *proc_best_name(struct proc *);
extern void *proc_find(int pid);
extern task_t proc_task(proc_t);
extern kern_return_t kern_return_for_errno(int);
extern int mach_to_bsd_errno(kern_return_t kr);
extern int exit_with_mach_exception(void *p, exception_info_t info, int flags);
struct proc *proc_ref(struct proc *p, int locked);
int proc_rele(proc_t p);

#define _vmdr_log_type(type, fmt, ...) os_log_with_type(vm_reclaim_log_handle, type, "vm_reclaim: " fmt, ##__VA_ARGS__)
#define vmdr_log(fmt, ...) _vmdr_log_type(OS_LOG_TYPE_DEFAULT, fmt, ##__VA_ARGS__)
#define vmdr_log_info(fmt, ...) _vmdr_log_type(OS_LOG_TYPE_INFO, fmt, ##__VA_ARGS__)
#define vmdr_log_error(fmt, ...) _vmdr_log_type(OS_LOG_TYPE_ERROR, fmt, ##__VA_ARGS__)
#if DEVELOPMENT || DEBUG
#define vmdr_log_debug(fmt, ...) \
    MACRO_BEGIN \
    if (os_unlikely(vm_reclaim_debug)) { \
            _vmdr_log_type(OS_LOG_TYPE_DEBUG, fmt, ##__VA_ARGS__); \
    } \
    MACRO_END
#else /* !(DEVELOPMENT || DEBUG) */
#define vmdr_log_debug(...)
#endif /* DEVELOPMENT || DEBUG */

static kern_return_t reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head);
static kern_return_t reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail);
static kern_return_t reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy);
static kern_return_t reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result);
static mach_error_t vmdr_sample_working_set(
    vm_deferred_reclamation_metadata_t metadata,
    mach_vm_size_t *trim_threshold_out,
    vm_deferred_reclamation_options_t options);
static void vmdr_metadata_release(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_list_append_locked(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_list_remove_locked(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_metadata_own(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_metadata_disown(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_garbage_collect(vm_deferred_reclamation_gc_action_t action,
    mach_vm_size_t *total_bytes_reclaimed_out,
    vm_deferred_reclamation_options_t options);
static kern_return_t reclaim_chunk(vm_deferred_reclamation_metadata_t metadata,
    uint64_t bytes_to_reclaim, uint64_t *bytes_reclaimed_out,
    mach_vm_reclaim_count_t chunk_size, mach_vm_reclaim_count_t *num_reclaimed_out);

struct vm_deferred_reclamation_metadata_s {
    /*
     * Global list containing every reclamation buffer. Protected by the
     * reclaim_buffers_lock.
     */
    TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_list;
    /* Protects all struct fields (except where denoted otherwise) */
    decl_lck_mtx_data(, vdrm_lock);
    /* Gate to be acquired when performing copyio on the user ring */
    decl_lck_mtx_gate_data(, vdrm_gate);
    /*
     * The task owns this structure but we maintain a backpointer here
     * so that we can send an exception if we hit an error.
     * Since this is a backpointer we don't hold a reference (it's a weak pointer).
     */
    task_t vdrm_task;
    pid_t vdrm_pid;
    vm_map_t vdrm_map;
    /*
     * The owning task holds a ref on this object. When the task dies, it
     * will set vdrm_task := NULL and drop its ref. Threads operating on the
     * buffer should hold a +1 on the metadata structure to ensure its
     * validity.
     */
    os_refcnt_t vdrm_refcnt;
    /* The virtual address of the ringbuffer in the user map (immutable) */
    user_addr_t vdrm_ring_addr;
    /* The size of the VM allocation containing the ringbuffer (immutable) */
    mach_vm_size_t vdrm_ring_size;
    /* The length of the ringbuffer. This may be changed on buffer re-size */
    mach_vm_reclaim_count_t vdrm_buffer_len;
    /* Which GC epoch this buffer was last considered in */
    uint64_t vdrm_reclaimed_at;
    /*
     * The number of threads waiting for a pending reclamation
     * on this buffer to complete.
     */
    uint32_t vdrm_waiters;
    /* timestamp (MAS) of the last working set sample for this ringbuffer */
    uint64_t vdrm_last_sample_abs;
    /*
     * The number of bytes reclaimed by kernel GC since the last user
     * accounting update. Protected by @c vdrm_gate.
     */
    size_t vdrm_kernel_bytes_reclaimed;
    /*
     * The last amount of reclaimable bytes reported to the kernel.
     */
    uint64_t vdrm_reclaimable_bytes_last;
    /*
     * Exponential moving average of the minimum reclaimable buffer size
     * (in units of VMDR_WMA_UNIT). Protected by @c vdrm_gate.
     */
    uint64_t vdrm_reclaimable_bytes_wma;
    /*
     * Tracks whether or not this reclamation metadata has been added
     * to the global list yet. Normally, this happens when it is allocated,
     * except in the case of fork(). In this case, we have to duplicate the
     * parent's metadata before it returns from fork(), but this occurs
     * before the child's address space is set up.
     */
    uint8_t vdrm_is_registered : 1,
        __unused1 : 7;
};

#pragma mark Globals
static KALLOC_TYPE_DEFINE(vm_reclaim_metadata_zone, struct vm_deferred_reclamation_metadata_s, KT_DEFAULT);
static LCK_GRP_DECLARE(vm_reclaim_lock_grp, "vm_reclaim");
os_refgrp_decl(static, vm_reclaim_metadata_refgrp, "vm_reclaim_metadata_refgrp", NULL);
/*
 * The reclaim_buffers list contains every buffer in the system.
 * The reclaim_buffers_lock protects the reclaim_buffers list.
 * It must be held when iterating over the list or manipulating the list.
 * It should be dropped when acting on a specific metadata entry after
 * acquiring the vdrm_lock.
 */
static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) reclaim_buffers = TAILQ_HEAD_INITIALIZER(reclaim_buffers);
LCK_MTX_DECLARE(reclaim_buffers_lock, &vm_reclaim_lock_grp);
/* Number of times Reclaim GC has run */
uint64_t vm_reclaim_gc_epoch = 0;
/* The number of reclamation actions (drains/trims) done during GC */
uint64_t vm_reclaim_gc_reclaim_count;
/* Gate for GC */
static decl_lck_mtx_gate_data(, vm_reclaim_gc_gate);
os_log_t vm_reclaim_log_handle;
/* Number of initialized reclaim buffers */
_Atomic uint32_t vm_reclaim_buffer_count;
uint64_t vm_reclaim_sampling_period_abs = 0;
static SECURITY_READ_ONLY_LATE(thread_t) vm_reclaim_scavenger_thread = THREAD_NULL;
static sched_cond_atomic_t vm_reclaim_scavenger_cond = SCHED_COND_INIT;

#pragma mark Buffer Initialization/Destruction

static vm_deferred_reclamation_metadata_t
vmdr_metadata_alloc(
    task_t task,
    user_addr_t buffer,
    mach_vm_size_t size,
    mach_vm_reclaim_count_t len)
{
    vm_deferred_reclamation_metadata_t metadata;
    vm_map_t map = task->map;

    assert(!map->is_nested_map);

    metadata = zalloc_flags(vm_reclaim_metadata_zone, Z_WAITOK | Z_ZERO);
    lck_mtx_init(&metadata->vdrm_lock, &vm_reclaim_lock_grp, LCK_ATTR_NULL);
    lck_mtx_gate_init(&metadata->vdrm_lock, &metadata->vdrm_gate);
    os_ref_init(&metadata->vdrm_refcnt, &vm_reclaim_metadata_refgrp);

    metadata->vdrm_task = task;
    metadata->vdrm_map = map;
    metadata->vdrm_ring_addr = buffer;
    metadata->vdrm_ring_size = size;
    metadata->vdrm_buffer_len = len;

    if (os_atomic_inc(&vm_reclaim_buffer_count, relaxed) == UINT32_MAX) {
        panic("Overflowed vm_reclaim_buffer_count");
    }

    /*
     * we do not need to hold a lock on `task` because this is called
     * either at fork() time or from the context of current_task().
     */
    vm_map_reference(map);
    return metadata;
}

static void
vmdr_metadata_free(vm_deferred_reclamation_metadata_t metadata)
{
    vm_map_deallocate(metadata->vdrm_map);
    lck_mtx_gate_destroy(&metadata->vdrm_lock, &metadata->vdrm_gate);
    lck_mtx_destroy(&metadata->vdrm_lock, &vm_reclaim_lock_grp);
    zfree(vm_reclaim_metadata_zone, metadata);
    if (os_atomic_dec_orig(&vm_reclaim_buffer_count, relaxed) == 0) {
        panic("Underflowed vm_reclaim_buffer_count");
    }
}

static mach_vm_size_t
vmdr_round_len_to_size(vm_map_t map, mach_vm_reclaim_count_t count)
{
    mach_vm_size_t metadata_size = offsetof(struct mach_vm_reclaim_ring_s, entries);
    mach_vm_size_t entries_size = count * sizeof(struct mach_vm_reclaim_entry_s);
    return vm_map_round_page(metadata_size + entries_size, vm_map_page_mask(map));
}
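
/*
 * Worked example (illustrative only; assumes 16 KiB map pages and a
 * 16-byte struct mach_vm_reclaim_entry_s, neither of which is guaranteed
 * here): a request for count = 1024 entries needs
 * offsetof(ring, entries) + 1024 * 16 B, i.e. a little over 16 KiB, which
 * vm_map_round_page() rounds up to 32 KiB. Because of this rounding, a
 * ring can often hold slightly more entries than requested at no
 * additional VA cost.
 */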

mach_error_t
vm_deferred_reclamation_buffer_allocate_internal(
    task_t task,
    mach_vm_address_ut *address_u,
    uint64_t *sampling_period,
    mach_vm_reclaim_count_t len,
    mach_vm_reclaim_count_t max_len)
{
    kern_return_t kr;
    kern_return_t tmp_kr;
    vm_deferred_reclamation_metadata_t metadata = NULL;
    vm_map_t map;
    uint64_t head = 0, tail = 0, busy = 0;
    static bool reclaim_disabled_logged = false;

    if (task == TASK_NULL) {
        return KERN_INVALID_TASK;
    }
    if (address_u == NULL || sampling_period == NULL ||
        len == 0 || max_len == 0 || max_len < len) {
        return KERN_INVALID_ARGUMENT;
    }
    map = task->map;
    if (!vm_reclaim_enabled) {
        if (!reclaim_disabled_logged) {
            /* Avoid logging failure for every new process */
            reclaim_disabled_logged = true;
            vmdr_log_error("failed to initialize deferred "
                "reclamation buffer - vm_reclaim is disabled\n");
        }
        return VM_RECLAIM_NOT_SUPPORTED;
    }

    mach_vm_size_t rounded_vm_size = vmdr_round_len_to_size(map, max_len);
    if (rounded_vm_size == 0) {
        return KERN_INVALID_ARGUMENT;
    }

    if (rounded_vm_size > VM_RECLAIM_MAX_BUFFER_SIZE) {
318 vmdr_log_error("denying request to allocate ringbuffer of size "
319 "%llu KiB (max %llu KiB)\n",
320 rounded_vm_size,
321 VM_RECLAIM_MAX_BUFFER_SIZE);
        return KERN_NO_SPACE;
    }

    KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_START,
        task_pid(task), len);

    /*
     * Allocate a VM region that can contain the maximum buffer size. The
     * allocation starts as VM_PROT_NONE and may be unprotected on buffer
     * resize.
     *
     * TODO: If clients other than libmalloc adopt deferred reclaim, a
     * different tag should be given
     *
     * `address` was sanitized under the assumption that we'll only use
     * it as a hint (overflow checks were used) so we must pass the
     * anywhere flag.
     */
    vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE_PERMANENT(
        .vm_tag = VM_MEMORY_MALLOC);
    mach_vm_size_ut size_u = vm_sanitize_wrap_size(rounded_vm_size);
    kr = mach_vm_map_kernel(map, address_u, size_u, VM_MAP_PAGE_MASK(map),
        vmk_flags, IPC_PORT_NULL, 0, FALSE,
        VM_PROT_DEFAULT, VM_PROT_DEFAULT, VM_INHERIT_COPY);
    if (kr != KERN_SUCCESS) {
        vmdr_log_error("%s [%d] failed to allocate VA for reclaim "
            "buffer (%d)\n", task_best_name(task), task_pid(task), kr);
        return kr;
    }
    mach_vm_address_t address = VM_SANITIZE_UNSAFE_UNWRAP(*address_u);
    assert3u(address, !=, 0);

    metadata = vmdr_metadata_alloc(task, address, rounded_vm_size, len);
    metadata->vdrm_pid = task_pid(task);

    /*
     * Validate the starting indices.
     */
    kr = reclaim_copyin_busy(metadata, &busy);
    if (kr != KERN_SUCCESS) {
        goto out;
    }
    kr = reclaim_copyin_head(metadata, &head);
    if (kr != KERN_SUCCESS) {
        goto out;
    }
    kr = reclaim_copyin_tail(metadata, &tail);
    if (kr != KERN_SUCCESS) {
        goto out;
    }

    if (head != 0 || tail != 0 || busy != 0) {
        vmdr_log_error("indices were not "
            "zero-initialized\n");
        kr = KERN_INVALID_ARGUMENT;
        goto out;
    }

    /*
     * Publish the metadata to the task & global buffer list. This must be
     * done under the task lock to synchronize with task termination - i.e.
     * task_terminate_internal is guaranteed to see the published metadata and
     * tear it down.
     */
    lck_mtx_lock(&reclaim_buffers_lock);
    task_lock(task);

    if (!task_is_active(task) || task_is_halting(task)) {
        vmdr_log_error(
            "failed to initialize buffer on dying task %s [%d]",
            task_best_name(task), task_pid(task));
        kr = KERN_ABORTED;
        goto fail_task;
    }
    if (task->deferred_reclamation_metadata != NULL) {
        vmdr_log_error(
            "tried to overwrite existing reclaim buffer for %s [%d]",
            task_best_name(task), task_pid(task));
        kr = VM_RECLAIM_RESOURCE_SHORTAGE;
        goto fail_task;
    }

    metadata->vdrm_is_registered = true;
    vmdr_list_append_locked(metadata);
    task->deferred_reclamation_metadata = metadata;

    task_unlock(task);
    lck_mtx_unlock(&reclaim_buffers_lock);

    vmdr_log_debug("%s [%d] allocated ring with capacity %u/%u\n",
        task_best_name(task), task_pid(task),
        len, max_len);
    KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END,
        task_pid(task), KERN_SUCCESS, address);
    DTRACE_VM3(reclaim_ring_allocate,
        mach_vm_address_t, address,
        mach_vm_reclaim_count_t, len,
        mach_vm_reclaim_count_t, max_len);
    return KERN_SUCCESS;

fail_task:
    task_unlock(task);
    lck_mtx_unlock(&reclaim_buffers_lock);

    tmp_kr = mach_vm_deallocate(map,
        *address_u, size_u);
    assert(tmp_kr == KERN_SUCCESS);

out:
    *address_u = vm_sanitize_wrap_addr(0ull);
    *sampling_period = vm_reclaim_sampling_period_abs;
    vmdr_metadata_release(metadata);
    KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END,
        kr, 0);
    return kr;
}

#pragma mark Synchronization & Lifecycle

static inline void
vmdr_metadata_lock(vm_deferred_reclamation_metadata_t metadata)
{
    lck_mtx_lock(&metadata->vdrm_lock);
}

static inline void
vmdr_metadata_unlock(vm_deferred_reclamation_metadata_t metadata)
{
    lck_mtx_unlock(&metadata->vdrm_lock);
}

static inline void
vmdr_metadata_assert_owned_locked(vm_deferred_reclamation_metadata_t metadata)
{
    lck_mtx_gate_assert(&metadata->vdrm_lock, &metadata->vdrm_gate,
        GATE_ASSERT_HELD);
}

static inline void
vmdr_metadata_assert_owned(vm_deferred_reclamation_metadata_t metadata)
{
#if MACH_ASSERT
    vmdr_metadata_lock(metadata);
    vmdr_metadata_assert_owned_locked(metadata);
    vmdr_metadata_unlock(metadata);
#else /* MACH_ASSERT */
    (void)metadata;
#endif /* MACH_ASSERT */
}

static bool
vmdr_metadata_try_own_locked(vm_deferred_reclamation_metadata_t metadata)
{
    kern_return_t kr = lck_mtx_gate_try_close(&metadata->vdrm_lock,
        &metadata->vdrm_gate);
    return kr == KERN_SUCCESS;
}

/*
 * Take ownership of the buffer, waiting for the current owner if necessary
 * (unless RECLAIM_NO_WAIT is set). Returns true if ownership was acquired.
 */
static bool
vmdr_metadata_own_locked(vm_deferred_reclamation_metadata_t metadata,
    vm_deferred_reclamation_options_t options)
{
    __assert_only gate_wait_result_t wait_result;
    if (!vmdr_metadata_try_own_locked(metadata)) {
        if (options & RECLAIM_NO_WAIT) {
            return false;
        }
        wait_result = lck_mtx_gate_wait(
            &metadata->vdrm_lock, &metadata->vdrm_gate, LCK_SLEEP_DEFAULT,
            THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
        assert(wait_result == GATE_HANDOFF);
    }
    return true;
}

/*
 * Set the current thread as the owner of a reclaim buffer. May block. Will
 * propagate priority.
 */
static void
vmdr_metadata_own(vm_deferred_reclamation_metadata_t metadata)
{
    vmdr_metadata_lock(metadata);
    vmdr_metadata_own_locked(metadata, RECLAIM_OPTIONS_NONE);
    vmdr_metadata_unlock(metadata);
}

static void
vmdr_metadata_disown_locked(vm_deferred_reclamation_metadata_t metadata)
{
    vmdr_metadata_assert_owned_locked(metadata);
    lck_mtx_gate_handoff(&metadata->vdrm_lock, &metadata->vdrm_gate,
        GATE_HANDOFF_OPEN_IF_NO_WAITERS);
}

/*
 * Release ownership of a reclaim buffer and wakeup any threads waiting for
 * ownership. Must be called from the thread that acquired ownership.
 */
static void
vmdr_metadata_disown(vm_deferred_reclamation_metadata_t metadata)
{
    vmdr_metadata_lock(metadata);
    vmdr_metadata_disown_locked(metadata);
    vmdr_metadata_unlock(metadata);
}
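
/*
 * Typical ownership pattern (sketch; mirrors the flush and resize paths
 * below). The gate, rather than vdrm_lock itself, is held across the
 * copyio calls so that no mutex is held while faulting on user memory:
 *
 *	vmdr_metadata_own(metadata);
 *	kr = reclaim_chunk(metadata, ...);
 *	vmdr_metadata_disown(metadata);
 */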

static void
vmdr_metadata_retain(vm_deferred_reclamation_metadata_t metadata)
{
    os_ref_retain(&metadata->vdrm_refcnt);
}

static void
vmdr_metadata_release(vm_deferred_reclamation_metadata_t metadata)
{
    if (os_ref_release(&metadata->vdrm_refcnt) == 0) {
        vmdr_metadata_free(metadata);
    }
}

static void
vmdr_list_remove_locked(vm_deferred_reclamation_metadata_t metadata)
{
    LCK_MTX_ASSERT(&reclaim_buffers_lock, LCK_MTX_ASSERT_OWNED);
    assert3p(metadata->vdrm_list.tqe_prev, !=, NULL);
    TAILQ_REMOVE(&reclaim_buffers, metadata, vdrm_list);
    metadata->vdrm_list.tqe_prev = NULL;
    metadata->vdrm_list.tqe_next = NULL;
}

static void
vmdr_list_append_locked(vm_deferred_reclamation_metadata_t metadata)
{
    LCK_MTX_ASSERT(&reclaim_buffers_lock, LCK_MTX_ASSERT_OWNED);
    assert3p(metadata->vdrm_list.tqe_prev, ==, NULL);
    TAILQ_INSERT_TAIL(&reclaim_buffers, metadata, vdrm_list);
}

void
vm_deferred_reclamation_buffer_deallocate(vm_deferred_reclamation_metadata_t metadata)
{
    assert(metadata != NULL);
    /*
     * First remove the buffer from the global list so no one else can get access to it.
     */
    lck_mtx_lock(&reclaim_buffers_lock);
    if (metadata->vdrm_is_registered) {
        vmdr_list_remove_locked(metadata);
    }
    lck_mtx_unlock(&reclaim_buffers_lock);

    /*
     * The task is dropping its ref on this buffer. First remove the buffer's
     * back-reference to the task so that any threads currently operating on
     * this buffer do not try to operate on the dead/dying task
     */
    vmdr_metadata_lock(metadata);
    assert3p(metadata->vdrm_task, !=, TASK_NULL);
    metadata->vdrm_task = TASK_NULL;
    vmdr_metadata_unlock(metadata);
    vmdr_metadata_release(metadata);
}

#pragma mark Exception Delivery

static void
reclaim_kill_with_reason(
    vm_deferred_reclamation_metadata_t metadata,
    unsigned reason,
    mach_exception_data_type_t subcode)
{
    unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
    mach_exception_code_t code = 0;
    task_t task;
    proc_t p = NULL;
    boolean_t fatal = TRUE;
    bool killing_self;
    pid_t pid;
    int err;

    LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_NOTOWNED);

    EXC_GUARD_ENCODE_TYPE(code, guard_type);
    EXC_GUARD_ENCODE_FLAVOR(code, reason);
    EXC_GUARD_ENCODE_TARGET(code, 0);

    vmdr_metadata_lock(metadata);
    task = metadata->vdrm_task;
    if (task == TASK_NULL || !task_is_active(task) || task_is_halting(task)) {
        /* Task is no longer alive */
        vmdr_metadata_unlock(metadata);
        vmdr_log_error(
            "Unable to deliver guard exception because task "
            "[%d] is already dead.\n",
            metadata->vdrm_pid);
        return;
    }

    if (panic_on_kill) {
        panic("About to kill %p due to %d with subcode %lld\n", task, reason, subcode);
    }

    killing_self = (task == current_task());
    if (!killing_self) {
        task_reference(task);
    }
    assert(task != kernel_task);
    vmdr_metadata_unlock(metadata);

    if (reason == kGUARD_EXC_DEALLOC_GAP) {
        task_lock(task);
        fatal = (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL);
        task_unlock(task);
    }

    if (!fatal) {
        vmdr_log_info(
            "Skipping non fatal guard exception for %s [%d]\n",
            task_best_name(task), task_pid(task));
        goto out;
    }

    pid = task_pid(task);
    if (killing_self) {
        p = get_bsdtask_info(task);
    } else {
        p = proc_find(pid);
        if (p && proc_task(p) != task) {
            vmdr_log_error(
                "Unable to deliver guard exception because proc is gone & pid rolled over.\n");
            goto out;
        }
    }

    if (!p) {
        vmdr_log_error(
            "Unable to deliver guard exception because task does not have a proc.\n");
        goto out;
    }

    int flags = PX_DEBUG_NO_HONOR;
    exception_info_t info = {
        .os_reason = OS_REASON_GUARD,
        .exception_type = EXC_GUARD,
        .mx_code = code,
        .mx_subcode = subcode
    };

    vmdr_log("Force-exiting %s [%d]\n", task_best_name(task), task_pid(task));

    err = exit_with_mach_exception(p, info, flags);
    if (err != 0) {
        vmdr_log_error("Unable to deliver guard exception to %p: %d\n", p, err);
        goto out;
    }

out:
    if (!killing_self) {
        if (p) {
            proc_rele(p);
            p = NULL;
        }
        if (task) {
            task_deallocate(task);
            task = NULL;
        }
    }
}

#pragma mark Copy I/O

static user_addr_t
get_entries_ptr(vm_deferred_reclamation_metadata_t metadata)
{
    return metadata->vdrm_ring_addr +
           offsetof(struct mach_vm_reclaim_ring_s, entries);
}

static user_addr_t
get_head_ptr(vm_deferred_reclamation_metadata_t metadata)
{
    return metadata->vdrm_ring_addr + offsetof(struct mach_vm_reclaim_ring_s, head);
}

static user_addr_t
get_tail_ptr(vm_deferred_reclamation_metadata_t metadata)
{
    return metadata->vdrm_ring_addr + offsetof(struct mach_vm_reclaim_ring_s, tail);
}

static user_addr_t
get_busy_ptr(vm_deferred_reclamation_metadata_t metadata)
{
    return metadata->vdrm_ring_addr + offsetof(struct mach_vm_reclaim_ring_s, busy);
}

static kern_return_t
reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result)
{
    if (result != 0 && (result != EFAULT || !vm_fault_get_disabled())) {
        vmdr_log_error("Killing [%d] due to copy I/O error\n", metadata->vdrm_pid);
        reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_COPYIO_FAILURE,
            result);
    }
    return kern_return_for_errno(result);
}

/*
 * Helper functions to do copyio on the head, tail, and busy pointers.
 * Note that the kernel will only write to the busy and head pointers.
 * Userspace is not supposed to write to the head or busy pointers, but the
 * kernel must be resilient to that kind of bug in userspace.
 */
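
/*
 * Assumed userspace ring layout (a sketch inferred from the offsetof()
 * usage in this file; field order is illustrative and the authoritative
 * definition lives in <mach/vm_reclaim_private.h>):
 *
 *	struct mach_vm_reclaim_ring_s {
 *		uint64_t head;                  // first entry not yet reclaimed
 *		uint64_t busy;                  // kernel's claimed high-water mark
 *		uint64_t tail;                  // next slot userspace will fill
 *		uint64_t reclaimable_bytes;     // user-maintained accounting
 *		uint64_t reclaimable_bytes_min; // minimum over the sampling period
 *		struct mach_vm_reclaim_entry_s entries[];
 *	};
 *
 * Indices are free-running 64-bit counters; the slot for index i is
 * entries[i % vdrm_buffer_len].
 */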

static kern_return_t
reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head)
{
    int result;
    kern_return_t kr;
    user_addr_t head_ptr = get_head_ptr(metadata);

    result = copyin_atomic64(head_ptr, head);
    kr = reclaim_handle_copyio_error(metadata, result);
    if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
        vmdr_log_error(
            "Unable to copy head ptr from 0x%llx: err=%d\n", head_ptr, result);
    }
    return kr;
}

static kern_return_t
reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail)
{
    int result;
    kern_return_t kr;
    user_addr_t tail_ptr = get_tail_ptr(metadata);

    result = copyin_atomic64(tail_ptr, tail);
    kr = reclaim_handle_copyio_error(metadata, result);
    if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
        vmdr_log_error(
            "Unable to copy tail ptr from 0x%llx: err=%d\n", tail_ptr, result);
    }
    return kr;
}

static kern_return_t
reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy)
{
    int result;
    kern_return_t kr;
    user_addr_t busy_ptr = get_busy_ptr(metadata);

    result = copyin_atomic64(busy_ptr, busy);
    kr = reclaim_handle_copyio_error(metadata, result);
    if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
        vmdr_log_error(
            "Unable to copy busy ptr from 0x%llx: err=%d\n", busy_ptr, result);
    }
    return kr;
}

static kern_return_t
reclaim_copyin_reclaimable_bytes(vm_deferred_reclamation_metadata_t metadata, size_t *reclaimable_bytes_out)
{
    int result;
    kern_return_t kr = KERN_SUCCESS;
    uint64_t reclaimable_bytes;
    user_addr_t ptr = (uintptr_t)metadata->vdrm_ring_addr +
        offsetof(struct mach_vm_reclaim_ring_s, reclaimable_bytes);

    result = copyin_atomic64(ptr, &reclaimable_bytes);
    if (result) {
        kr = reclaim_handle_copyio_error(metadata, result);
        if (result != EFAULT || !vm_fault_get_disabled()) {
            vmdr_log_error("Unable to copyin reclaimable byte count err=%d\n", result);
        }
    } else {
        *reclaimable_bytes_out = (size_t)reclaimable_bytes;
    }
    return kr;
}

static kern_return_t
reclaim_copyin_min_reclaimable_bytes(vm_deferred_reclamation_metadata_t metadata, size_t *min_reclaimable_bytes_out)
{
    int result;
    kern_return_t kr = KERN_SUCCESS;
    uint64_t min_reclaimable_bytes;
    user_addr_t ptr = (uintptr_t)metadata->vdrm_ring_addr +
        offsetof(struct mach_vm_reclaim_ring_s, reclaimable_bytes_min);

    result = copyin_atomic64(ptr, &min_reclaimable_bytes);
    if (result) {
        kr = reclaim_handle_copyio_error(metadata, result);
        if (result != EFAULT || !vm_fault_get_disabled()) {
            vmdr_log_error("Unable to copyin min reclaimable byte count err=%d\n", result);
        }
    } else {
        *min_reclaimable_bytes_out = (size_t)min_reclaimable_bytes;
    }
    return kr;
}

static kern_return_t
reclaim_copyout_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
    int result;
    kern_return_t kr = KERN_SUCCESS;
    user_addr_t busy_ptr = get_busy_ptr(metadata);

    result = copyout_atomic64(value, busy_ptr);
    if (result) {
        kr = reclaim_handle_copyio_error(metadata, result);
        if (result != EFAULT || !vm_fault_get_disabled()) {
            vmdr_log_error(
                "Unable to copy %llu to busy ptr at 0x%llx: err=%d\n", value, busy_ptr, result);
        }
    }
    return kr;
}

static kern_return_t
reclaim_copyout_head(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
    int result;
    kern_return_t kr = KERN_SUCCESS;
    user_addr_t head_ptr = get_head_ptr(metadata);

    result = copyout_atomic64(value, head_ptr);
    if (result) {
        kr = reclaim_handle_copyio_error(metadata, result);
        if (result != EFAULT || !vm_fault_get_disabled()) {
            vmdr_log_error(
                "Unable to copy %llu to head ptr at 0x%llx: err=%d\n", value, head_ptr, result);
        }
    }
    return kr;
}

static kern_return_t
reclaim_copyout_min_reclaimable_bytes(vm_deferred_reclamation_metadata_t metadata, size_t min_reclaimable_bytes)
{
    int result;
    kern_return_t kr = KERN_SUCCESS;
    user_addr_t ptr = (uintptr_t)metadata->vdrm_ring_addr +
        offsetof(struct mach_vm_reclaim_ring_s, reclaimable_bytes_min);

    result = copyout_atomic64(min_reclaimable_bytes, ptr);
    if (result) {
        kr = reclaim_handle_copyio_error(metadata, result);
        if (result != EFAULT || !vm_fault_get_disabled()) {
            vmdr_log_error("Unable to copyout min reclaimable byte count err=%d\n", result);
        }
    }
    return kr;
}

#pragma mark Reclamation

/*
 * @func reclaim_chunk
 *
 * @brief
 * Reclaim a batch of entries from the buffer.
 *
 * @param bytes_to_reclaim
 * Number of bytes caller wishes to reclaim from the buffer
 *
 * @param bytes_reclaimed_out
 * The number of bytes reclaimed from the buffer written out
 *
 * @param chunk_size
 * The maximum number of entries to hold busy and reclaim from (must
 * be <= kReclaimChunkSize)
 *
 * @param num_reclaimed_out
 * The number of entries reclaimed written out
 *
 * @discussion
 * If the buffer has been exhausted of entries (tail == head),
 * num_reclaimed_out will be zero. It is important that the caller abort any
 * loops if such a condition is met.
 */
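
/*
 * Index protocol, illustratively (the kernel marks a batch busy before
 * copying it in so a concurrent userspace cancellation can be detected):
 *
 *	busy = head + n; copyout(busy);	// claim n entries
 *	fence; copyin(tail);		// re-read tail
 *	tail >= busy -> the n claimed entries are safe to reclaim
 *	tail <  busy -> userspace moved tail back; retry with fewer
 *
 * The sanity checks below enforce head <= busy <= head + kReclaimChunkSize;
 * tail < head means userspace is re-using (cancelling) an entry, and the
 * reclamation is aborted.
 */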
static kern_return_t
reclaim_chunk(vm_deferred_reclamation_metadata_t metadata,
    uint64_t bytes_to_reclaim, uint64_t *bytes_reclaimed_out,
    mach_vm_reclaim_count_t chunk_size, mach_vm_reclaim_count_t *num_reclaimed_out)
{
    kern_return_t kr = KERN_SUCCESS;
    int result = 0;
    mach_vm_reclaim_count_t num_reclaimed = 0, num_copied = 0;
    uint64_t bytes_reclaimed = 0;
    uint64_t head = 0, tail = 0, busy = 0, num_to_reclaim = 0, new_tail = 0;
    vm_map_t map = metadata->vdrm_map;
    vm_map_switch_context_t switch_ctx;
    struct mach_vm_reclaim_entry_s copied_entries[kReclaimChunkSize];

    assert(metadata != NULL);
    LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_NOTOWNED);
    vmdr_metadata_assert_owned(metadata);

    assert(chunk_size <= kReclaimChunkSize);

    KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_START,
        metadata->vdrm_pid, bytes_to_reclaim);

    memset(copied_entries, 0, sizeof(copied_entries));

    switch_ctx = vm_map_switch_to(map);

    kr = reclaim_copyin_busy(metadata, &busy);
    if (kr != KERN_SUCCESS) {
        goto done;
    }
    kr = reclaim_copyin_head(metadata, &head);
    if (kr != KERN_SUCCESS) {
        goto done;
    }
    kr = reclaim_copyin_tail(metadata, &tail);
    if (kr != KERN_SUCCESS) {
        goto done;
    }

    /*
     * NB: busy may not be exactly equal to head if the jetsam
     * thread fails to fault on the indices after having marked
     * entries busy
     */
    if (busy < head || (busy - head) > kReclaimChunkSize) {
        vmdr_log_error(
            "Userspace modified head or busy pointer! head: %llu "
            "(0x%llx) | busy: %llu (0x%llx) | tail = %llu (0x%llx)\n",
            head, get_head_ptr(metadata), busy, get_busy_ptr(metadata), tail,
            get_tail_ptr(metadata));
        reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_INDEX_FAILURE,
            busy);
        kr = KERN_FAILURE;
        goto done;
    }

    if (tail < head) {
        /*
         * Userspace is likely in the middle of trying to re-use an entry,
         * bail on this reclamation.
         */
        vmdr_log_error(
            "Tail < head! Userspace is likely attempting a "
            "cancellation; aborting reclamation | head: %llu "
            "(0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
            head, get_head_ptr(metadata), tail, get_tail_ptr(metadata), busy,
            get_busy_ptr(metadata));
        kr = KERN_ABORTED;
        goto done;
    }

    /*
     * NB: If any of the copyouts below fail due to faults being disabled,
     * the buffer may be left in a state where several entries are unusable
     * until the next reclamation (i.e. busy > head)
     */
    num_to_reclaim = tail - head;
    while (true) {
        num_to_reclaim = MIN(num_to_reclaim, chunk_size);
        if (num_to_reclaim == 0) {
            break;
        }
        busy = head + num_to_reclaim;
        kr = reclaim_copyout_busy(metadata, busy);
        if (kr != KERN_SUCCESS) {
            goto done;
        }
        os_atomic_thread_fence(seq_cst);
        kr = reclaim_copyin_tail(metadata, &new_tail);
        if (kr != KERN_SUCCESS) {
            goto done;
        }

        if (new_tail >= busy) {
            /* Got num_to_reclaim entries */
            break;
        }
        tail = new_tail;
        if (tail < head) {
            /*
             * Userspace is likely in the middle of trying to re-use an entry,
             * bail on this reclamation
             */
            vmdr_log_error(
                "Tail < head! Userspace is likely attempting a "
                "cancellation; aborting reclamation | head: %llu "
                "(0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
                head, get_head_ptr(metadata), tail, get_tail_ptr(metadata), busy,
                get_busy_ptr(metadata));
            /* Reset busy back to head */
            reclaim_copyout_busy(metadata, head);
            kr = KERN_ABORTED;
            goto done;
        }
        /* Can't reclaim these entries. Try again */
        num_to_reclaim = tail - head;
        if (num_to_reclaim == 0) {
            /* Nothing left to reclaim. Reset busy to head. */
            kr = reclaim_copyout_busy(metadata, head);
            if (kr != KERN_SUCCESS) {
                goto done;
            }
            break;
        }
        /*
         * Note that num_to_reclaim must have gotten smaller since tail got
         * smaller, so this is guaranteed to converge.
         */
    }
    vmdr_log_debug("[%d] reclaiming up to %llu entries (%llu KiB) head=%llu "
        "busy=%llu tail=%llu len=%u", metadata->vdrm_pid, num_to_reclaim,
        bytes_reclaimed, head, busy, tail, metadata->vdrm_buffer_len);

    uint64_t memcpy_start_idx = head % metadata->vdrm_buffer_len;
    while (num_copied < num_to_reclaim) {
        uint64_t memcpy_end_idx = memcpy_start_idx + num_to_reclaim - num_copied;
        /* Clamp the end idx to the buffer. We'll handle wrap-around in our next go around the loop. */
        memcpy_end_idx = MIN(memcpy_end_idx, metadata->vdrm_buffer_len);
        uint64_t num_to_copy = memcpy_end_idx - memcpy_start_idx;

        assert(num_to_copy + num_copied <= kReclaimChunkSize);
        user_addr_t src_ptr = get_entries_ptr(metadata) +
            (memcpy_start_idx * sizeof(struct mach_vm_reclaim_entry_s));
        struct mach_vm_reclaim_entry_s *dst_ptr = copied_entries + num_copied;
        result = copyin(src_ptr, dst_ptr,
            (num_to_copy * sizeof(struct mach_vm_reclaim_entry_s)));
        kr = reclaim_handle_copyio_error(metadata, result);
        if (kr != KERN_SUCCESS) {
            if (kr != KERN_MEMORY_ERROR || !vm_fault_get_disabled()) {
                vmdr_log_error(
                    "Unable to copyin %llu entries in reclaim "
                    "buffer at 0x%llx to 0x%llx: err=%d\n",
                    num_to_copy, src_ptr, (uint64_t)dst_ptr, result);
            }
            goto done;
        }

        num_copied += num_to_copy;
        memcpy_start_idx = (memcpy_start_idx + num_to_copy) % metadata->vdrm_buffer_len;
    }

    for (num_reclaimed = 0; num_reclaimed < num_to_reclaim && bytes_reclaimed < bytes_to_reclaim; num_reclaimed++) {
        mach_vm_reclaim_entry_t entry = &copied_entries[num_reclaimed];
        KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_START,
            metadata->vdrm_pid, entry->address, entry->size,
            entry->behavior);
        if (entry->address != 0 && entry->size != 0) {
            vm_map_address_t start = vm_map_trunc_page(entry->address,
                VM_MAP_PAGE_MASK(map));
            vm_map_address_t end = vm_map_round_page(entry->address + entry->size,
                VM_MAP_PAGE_MASK(map));
            DTRACE_VM4(vm_reclaim_entry,
                pid_t, metadata->vdrm_pid,
                mach_vm_address_t, entry->address,
                mach_vm_address_t, end,
                mach_vm_reclaim_action_t, entry->behavior);
            KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_START,
                metadata->vdrm_pid, start, end,
                entry->behavior);
            vmdr_log_debug("[%d] Reclaiming entry %llu (0x%llx, 0x%llx)\n",
                metadata->vdrm_pid, head + num_reclaimed, start, end);
            switch (entry->behavior) {
            case VM_RECLAIM_DEALLOCATE:
                kr = vm_map_remove_guard(map,
                    start, end, VM_MAP_REMOVE_GAPS_FAIL,
                    KMEM_GUARD_NONE).kmr_return;
                if (kr == KERN_INVALID_VALUE) {
                    vmdr_log_error(
                        "[%d] Killing due to virtual-memory guard at (0x%llx, 0x%llx)\n",
                        metadata->vdrm_pid, start, end);
                    reclaim_kill_with_reason(metadata, kGUARD_EXC_DEALLOC_GAP, entry->address);
                    goto done;
                } else if (kr != KERN_SUCCESS) {
                    vmdr_log_error(
                        "[%d] Killing due to deallocation failure at (0x%llx, 0x%llx) err=%d\n",
                        metadata->vdrm_pid, start, end, kr);
                    reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
                    goto done;
                }
                break;
            case VM_RECLAIM_FREE:
                /*
                 * TODO: This should free the backing pages directly instead of using
                 * VM_BEHAVIOR_REUSABLE, which will mark the pages as clean and let them
                 * age in the LRU.
                 */
                kr = vm_map_behavior_set(map, start,
                    end, VM_BEHAVIOR_REUSABLE);
                if (kr != KERN_SUCCESS) {
                    vmdr_log_error(
                        "[%d] Failed to free(reusable) (0x%llx, 0x%llx) err=%d\n",
                        metadata->vdrm_pid, start, end, kr);
                }
                break;
            default:
                vmdr_log_error(
                    "attempted to reclaim entry with unsupported behavior %u",
                    entry->behavior);
                reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
                kr = KERN_INVALID_VALUE;
                goto done;
            }
            bytes_reclaimed += entry->size;
            KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_END,
                kr);
        }
    }

    assert(head + num_reclaimed <= busy);
    head += num_reclaimed;
    kr = reclaim_copyout_head(metadata, head);
    if (kr != KERN_SUCCESS) {
        goto done;
    }
    if (busy > head) {
        busy = head;
        kr = reclaim_copyout_busy(metadata, busy);
        if (kr != KERN_SUCCESS) {
            goto done;
        }
    }

done:
    vmdr_log_debug("[%d] reclaimed %u entries (%llu KiB) head=%llu "
        "busy=%llu tail=%llu len=%u", metadata->vdrm_pid, num_reclaimed,
        bytes_reclaimed, head, busy, tail, metadata->vdrm_buffer_len);
    vm_map_switch_back(switch_ctx);
    KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_END,
        bytes_reclaimed, num_reclaimed, kr);
    if (bytes_reclaimed_out) {
        *bytes_reclaimed_out = bytes_reclaimed;
    }
    if (num_reclaimed_out) {
        *num_reclaimed_out = num_reclaimed;
    }
    return kr;
}

/*
 * @func vmdr_reclaim_from_buffer
 *
 * @brief
 * Reclaim entries from the buffer until at least @c bytes_to_reclaim bytes
 * have been reclaimed or the buffer is exhausted.
 *
 * @param bytes_to_reclaim
 * The minimum number of bytes to reclaim
 *
 * @param num_bytes_reclaimed_out
 * The number of bytes reclaimed written out
 *
 * @param options
 * If RECLAIM_NO_FAULT is set, do not fault on the buffer if it has been paged
 * out.
 *
 * @discussion
 * The buffer should be owned by the caller.
 */
static kern_return_t
vmdr_reclaim_from_buffer(vm_deferred_reclamation_metadata_t metadata,
    mach_vm_size_t bytes_to_reclaim, mach_vm_size_t *num_bytes_reclaimed_out,
    vm_deferred_reclamation_options_t options)
{
    kern_return_t kr = KERN_SUCCESS;

    if (options & RECLAIM_NO_FAULT) {
        vm_fault_disable();
    }

    mach_vm_size_t total_bytes_reclaimed = 0;
    while (total_bytes_reclaimed < bytes_to_reclaim) {
        mach_vm_size_t cur_bytes_reclaimed;
        mach_vm_reclaim_count_t entries_reclaimed;
        kr = reclaim_chunk(metadata, bytes_to_reclaim - total_bytes_reclaimed,
            &cur_bytes_reclaimed, kReclaimChunkSize, &entries_reclaimed);
        total_bytes_reclaimed += cur_bytes_reclaimed;
        if (entries_reclaimed == 0 || kr != KERN_SUCCESS) {
            break;
        }
    }

    if (options & RECLAIM_NO_FAULT) {
        vm_fault_enable();
    }
    vmdr_log_debug("reclaimed %llu B / %llu B from %d\n",
        total_bytes_reclaimed, bytes_to_reclaim, metadata->vdrm_pid);
    if (num_bytes_reclaimed_out) {
        *num_bytes_reclaimed_out = total_bytes_reclaimed;
    }
    return kr;
}

/*
 * Get and retain the reclamation metadata buffer for the given task.
 */
static vm_deferred_reclamation_metadata_t
vmdr_acquire_task_metadata(task_t task)
{
    vm_deferred_reclamation_metadata_t meta = NULL;
    assert(task != NULL);
    task_lock(task);
    if (!task_is_halting(task) && task_is_active(task)) {
        meta = task->deferred_reclamation_metadata;
    }
    if (meta != NULL) {
        vmdr_metadata_retain(meta);
    }
    task_unlock(task);
    return meta;
}


#pragma mark Buffer Resize/Synchronization

kern_return_t
vm_deferred_reclamation_buffer_flush_internal(task_t task,
    mach_vm_reclaim_count_t num_entries_to_reclaim,
    mach_vm_size_t *bytes_reclaimed_out)
{
    kern_return_t kr = KERN_SUCCESS;
    vm_deferred_reclamation_metadata_t metadata = NULL;
    mach_vm_reclaim_count_t total_reclaimed = 0;
    uint64_t bytes_reclaimed = 0;

    if (!task_is_active(task)) {
        return KERN_INVALID_TASK;
    }

    metadata = vmdr_acquire_task_metadata(task);
    if (metadata == NULL) {
        return KERN_INVALID_ARGUMENT;
    }

    vmdr_metadata_own(metadata);

    vmdr_log_debug("[%d] flushing %u entries\n", task_pid(task), num_entries_to_reclaim);
    KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_FLUSH) | DBG_FUNC_START, metadata->vdrm_pid, num_entries_to_reclaim);

    while (total_reclaimed < num_entries_to_reclaim) {
        mach_vm_reclaim_count_t cur_reclaimed;
        uint64_t cur_bytes_reclaimed;
        mach_vm_reclaim_count_t chunk_size = MIN(num_entries_to_reclaim - total_reclaimed, kReclaimChunkSize);
        kr = reclaim_chunk(metadata, UINT64_MAX, &cur_bytes_reclaimed, chunk_size,
            &cur_reclaimed);
        total_reclaimed += cur_reclaimed;
        bytes_reclaimed += cur_bytes_reclaimed;
        if (cur_reclaimed == 0) {
            break;
        } else if (kr == KERN_ABORTED) {
            /*
             * Unable to reclaim due to a lost race with
             * userspace, yield the gate and try again
             */
            vmdr_metadata_disown(metadata);
            vmdr_metadata_own(metadata);
            continue;
        } else if (kr != KERN_SUCCESS) {
            break;
        }
    }
    /*
     * Tell the client how many bytes the kernel has reclaimed
     * since the last time it updated its accounting
     */
    bytes_reclaimed += metadata->vdrm_kernel_bytes_reclaimed;
    metadata->vdrm_kernel_bytes_reclaimed = 0;

    vmdr_metadata_disown(metadata);

    *bytes_reclaimed_out = bytes_reclaimed;
    KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_FLUSH) | DBG_FUNC_END, kr, total_reclaimed, bytes_reclaimed);
    DTRACE_VM2(reclaim_flush,
        mach_vm_reclaim_count_t, num_entries_to_reclaim,
        size_t, bytes_reclaimed);
    return kr;
}

kern_return_t
vm_deferred_reclamation_buffer_resize_internal(
    task_t task,
    mach_vm_reclaim_count_t len,
    mach_vm_size_t *bytes_reclaimed_out)
{
    kern_return_t kr;
    mach_vm_reclaim_count_t num_entries_reclaimed = 0;
    mach_vm_reclaim_count_t old_len;

    if (task == TASK_NULL) {
        return KERN_INVALID_TASK;
    }
    if (len == 0) {
        return KERN_INVALID_ARGUMENT;
    }
    vm_deferred_reclamation_metadata_t metadata = vmdr_acquire_task_metadata(task);
    if (metadata == NULL) {
        return KERN_INVALID_TASK;
    }

    /* Size must be multiple of page size */
    vm_map_t map = task->map;
    mach_vm_size_t new_size = vmdr_round_len_to_size(map, len);
    if (new_size == 0) {
        vmdr_metadata_release(metadata);
        return KERN_INVALID_ARGUMENT;
    }
    if (new_size > metadata->vdrm_ring_size) {
        vmdr_metadata_release(metadata);
        return KERN_NO_SPACE;
    }

    KDBG(VM_RECLAIM_CODE(VM_RECLAIM_RESIZE) | DBG_FUNC_START,
        task_pid(task), new_size);

    /*
     * Prevent other threads from operating on this buffer while it is
     * resized. It is the caller's responsibility to ensure mutual
     * exclusion with other user threads
     */
    vmdr_metadata_own(metadata);

    old_len = metadata->vdrm_buffer_len;

    vmdr_log_debug("%s [%d] resizing buffer %u -> %u entries\n",
        task_best_name(task), task_pid(task), old_len, len);

    /*
     * Reclaim all the entries currently in the buffer to prevent re-use
     * of old reclaim ids that will alias differently into the newly sized
     * buffer.
     *
     * TODO: Consider encoding the ringbuffer-capacity in the
     * mach_vm_reclaim_id_t, so reuses can still find objects after a resize.
     */
    mach_vm_size_t total_bytes_reclaimed = 0;
    do {
        mach_vm_size_t cur_bytes_reclaimed;
        kr = reclaim_chunk(metadata, UINT64_MAX, &cur_bytes_reclaimed, kReclaimChunkSize,
            &num_entries_reclaimed);
        total_bytes_reclaimed += cur_bytes_reclaimed;
        if (kr != KERN_SUCCESS) {
            goto fail;
        }
    } while (num_entries_reclaimed > 0);

    vmdr_log_debug("[%d] successfully resized buffer | reclaimed: %llu B "
        "kernel_reclaimed: %zu B\n", metadata->vdrm_pid,
        total_bytes_reclaimed, metadata->vdrm_kernel_bytes_reclaimed);

    total_bytes_reclaimed += metadata->vdrm_kernel_bytes_reclaimed;
    metadata->vdrm_kernel_bytes_reclaimed = 0;

    /* Publish new user addresses in kernel metadata */
    vmdr_metadata_lock(metadata);
    metadata->vdrm_buffer_len = len;
    vmdr_metadata_disown_locked(metadata);
    vmdr_metadata_unlock(metadata);
    vmdr_metadata_release(metadata);

    *bytes_reclaimed_out = total_bytes_reclaimed;

    KDBG(VM_RECLAIM_CODE(VM_RECLAIM_RESIZE) | DBG_FUNC_END, KERN_SUCCESS, num_entries_reclaimed, total_bytes_reclaimed);
    DTRACE_VM2(reclaim_ring_resize,
        mach_vm_reclaim_count_t, old_len,
        mach_vm_reclaim_count_t, len);
    return KERN_SUCCESS;

fail:
    vmdr_metadata_disown(metadata);
    vmdr_metadata_release(metadata);
    *bytes_reclaimed_out = total_bytes_reclaimed;
    KDBG(VM_RECLAIM_CODE(VM_RECLAIM_RESIZE) | DBG_FUNC_END, kr, num_entries_reclaimed);
    return kr;
}

#pragma mark Accounting

extern vm_pressure_level_t memorystatus_vm_pressure_level;

static kern_return_t
vmdr_calculate_autotrim_threshold(vm_deferred_reclamation_metadata_t metadata, size_t *trim_threshold_out)
{
    kern_return_t kr;
    uint32_t autotrim_pct;

    /*
     * Determine the autotrim threshold based on the current pressure level
     */
    vm_pressure_level_t pressure_level = os_atomic_load(&memorystatus_vm_pressure_level, relaxed);
    switch (pressure_level) {
    case kVMPressureNormal:
        autotrim_pct = vm_reclaim_autotrim_pct_normal;
        break;
    case kVMPressureWarning:
    case kVMPressureUrgent:
        autotrim_pct = vm_reclaim_autotrim_pct_pressure;
        break;
    case kVMPressureCritical:
        autotrim_pct = vm_reclaim_autotrim_pct_critical;
        break;
    default:
        panic("vm_reclaim: unexpected vm_pressure_level %d", pressure_level);
    }

    /*
     * Estimate the task's maximum working set size
     */
    ledger_amount_t phys_footprint_max = 0;

    vmdr_metadata_lock(metadata);
    task_t task = metadata->vdrm_task;
    if (task == TASK_NULL) {
        vmdr_metadata_unlock(metadata);
        return KERN_INVALID_TASK;
    }
    task_reference(task);
    vmdr_metadata_unlock(metadata);

    kr = ledger_get_lifetime_max(task->ledger,
        task_ledgers.phys_footprint, &phys_footprint_max);
    assert3u(kr, ==, KERN_SUCCESS);

    task_deallocate(task);

    *trim_threshold_out = phys_footprint_max * autotrim_pct / 100;
    return KERN_SUCCESS;
}
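
/*
 * Example (illustrative, using the default vm_reclaim_autotrim_pct_*
 * tunables): a task whose lifetime-max phys_footprint is 1 GiB gets a
 * trim threshold of ~102 MiB at normal pressure (10%), ~51 MiB under
 * pressure (5%), and ~10 MiB when critical (1%).
 */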

#define VMDR_WMA_UNIT (1 << 8)
#define VMDR_WMA_MIX(base, e) ((vm_reclaim_wma_weight_base * (base) + (e) * VMDR_WMA_UNIT * vm_reclaim_wma_weight_cur) / vm_reclaim_wma_denom)
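
/*
 * VMDR_WMA_MIX maintains a fixed-point weighted moving average in units of
 * 1/VMDR_WMA_UNIT (1/256) of a byte. With the default weights (base = 3,
 * cur = 1, denom = 4), each application moves the average 25% of the way
 * toward the new sample. Worked example (sample = 1 MiB, starting from
 * wma = 0):
 *
 *	after 1 period:  wma / VMDR_WMA_UNIT = 256 KiB  (25%)
 *	after 2 periods: wma / VMDR_WMA_UNIT = 448 KiB  (43.75%)
 *	...converging on 1 MiB while the sample persists.
 */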

/*
 * @func vmdr_sample_working_set
 *
 * @brief sample the working set size of the given buffer
 *
 * @param metadata
 * The reclaim buffer to sample
 *
 * @param trim_threshold_out
 * If the buffer should be trimmed, the amount to trim (in bytes) will be
 * written out
 *
 * @returns KERN_MEMORY_ERROR if copyio failed due to RECLAIM_NO_FAULT
 *
 * @discussion
 * The caller must own the buffer
 */
static mach_error_t
vmdr_sample_working_set(vm_deferred_reclamation_metadata_t metadata,
    mach_vm_size_t *trim_threshold_out, vm_deferred_reclamation_options_t options)
{
    mach_error_t err = ERR_SUCCESS;
    size_t min_reclaimable_bytes = 0, cur_reclaimable_bytes = 0;
    uint64_t wma = 0;

    vmdr_metadata_assert_owned(metadata);

    *trim_threshold_out = 0;

    vm_map_switch_context_t map_ctx = vm_map_switch_to(metadata->vdrm_map);

    if (options & RECLAIM_NO_FAULT) {
        vm_fault_disable();
    }

    err = reclaim_copyin_min_reclaimable_bytes(metadata, &min_reclaimable_bytes);
    if (err != ERR_SUCCESS) {
        goto done;
    }

    uint64_t now = mach_absolute_time();
    if (now - metadata->vdrm_last_sample_abs < vm_reclaim_sampling_period_abs) {
        /* A sampling period has not elapsed */
        goto done;
    }
    KDBG(VM_RECLAIM_CODE(VM_RECLAIM_SAMPLE) | DBG_FUNC_START,
        metadata->vdrm_pid,
        now,
        metadata->vdrm_last_sample_abs,
        min_reclaimable_bytes);

    err = reclaim_copyin_reclaimable_bytes(metadata, &cur_reclaimable_bytes);
    if (err != ERR_SUCCESS) {
        goto done;
    }

    /* Reset the minimum to start a new sampling interval */
    err = reclaim_copyout_min_reclaimable_bytes(metadata, cur_reclaimable_bytes);
    if (err != ERR_SUCCESS) {
        goto done;
    }

    /*
     * The user accounting will overcount if the kernel has reclaimed
     * without telling the client about it.
     */
    if (cur_reclaimable_bytes >= metadata->vdrm_kernel_bytes_reclaimed) {
        cur_reclaimable_bytes -= metadata->vdrm_kernel_bytes_reclaimed;
    } else {
        vmdr_log_error("[%d] more bytes have been reclaimed (%zu) than "
            "are supposedly in buffer (%zu)\n", metadata->vdrm_pid,
            metadata->vdrm_kernel_bytes_reclaimed, cur_reclaimable_bytes);
        /* This will cause an underflow in user accounting */
        reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_ACCOUNTING_FAILURE, cur_reclaimable_bytes);
        err = KERN_ABORTED;
        goto done;
    }
    if (min_reclaimable_bytes >= metadata->vdrm_kernel_bytes_reclaimed) {
        min_reclaimable_bytes -= metadata->vdrm_kernel_bytes_reclaimed;
    } else {
        min_reclaimable_bytes = 0;
    }

    uint64_t samples_elapsed = (now - metadata->vdrm_last_sample_abs) /
        vm_reclaim_sampling_period_abs;
    if (samples_elapsed > vm_reclaim_abandonment_threshold) {
        /*
         * Many sampling periods have elapsed since the ring was
         * last sampled. Don't bother computing the WMA and assume
         * the buffer's current contents are unneeded.
         */
        wma = VMDR_WMA_MIX(0, cur_reclaimable_bytes);
    } else {
        /*
         * Compute an exponential moving average of the minimum amount of reclaimable
         * memory in this buffer. Multiple sampling periods may have elapsed
         * since the last sample. By definition, the minimum must be the same for
         * all elapsed periods (otherwise libmalloc would have called down to
         * update accounting)
         */
1559 for (unsigned int i = 0; i < samples_elapsed; i++) {
1560 wma = VMDR_WMA_MIX(
1561 metadata->vdrm_reclaimable_bytes_wma,
1562 min_reclaimable_bytes);
1563 }
1564 }
1565
1566 metadata->vdrm_reclaimable_bytes_wma = wma;
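    /*
     * The WMA is descaled from fixed point here (divide by
     * VMDR_WMA_UNIT). Capping the result at min_reclaimable_bytes
     * ensures a momentarily-high average can never trim memory the
     * client actually used during this sampling period.
     */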
    size_t unneeded_bytes = MIN(min_reclaimable_bytes,
        metadata->vdrm_reclaimable_bytes_wma / VMDR_WMA_UNIT);

    size_t autotrim_threshold;
    err = vmdr_calculate_autotrim_threshold(metadata, &autotrim_threshold);
    if (err != ERR_SUCCESS) {
        goto done;
    }

    if (unneeded_bytes >= vm_map_page_size(metadata->vdrm_map) &&
        unneeded_bytes >= autotrim_threshold) {
        *trim_threshold_out = vm_map_round_page(unneeded_bytes,
            vm_map_page_mask(metadata->vdrm_map));
    }

    metadata->vdrm_last_sample_abs = mach_absolute_time();
    metadata->vdrm_reclaimable_bytes_last = cur_reclaimable_bytes;

done:
    vm_map_switch_back(map_ctx);
    if (options & RECLAIM_NO_FAULT) {
        vm_fault_enable();
    }
    KDBG(VM_RECLAIM_CODE(VM_RECLAIM_SAMPLE) | DBG_FUNC_END,
        wma,
        min_reclaimable_bytes,
        cur_reclaimable_bytes,
        *trim_threshold_out);
    DTRACE_VM5(reclaim_sample,
        pid_t, metadata->vdrm_pid,
        uint64_t, wma,
        size_t, min_reclaimable_bytes,
        size_t, cur_reclaimable_bytes,
        size_t, *trim_threshold_out);
    vmdr_log_debug("sampled buffer with min %lu est %lu trim %llu wma %llu\n",
        min_reclaimable_bytes,
        cur_reclaimable_bytes,
        *trim_threshold_out,
        wma);
    return err;
}

/*
 * Caller must have buffer owned and unlocked
 */
static kern_return_t
vmdr_trim(vm_deferred_reclamation_metadata_t metadata, mach_vm_size_t bytes_to_reclaim,
    mach_vm_size_t *bytes_reclaimed, vm_deferred_reclamation_options_t options)
{
    kern_return_t kr;
    KDBG(VM_RECLAIM_CODE(VM_RECLAIM_TRIM) | DBG_FUNC_START,
        metadata->vdrm_pid, bytes_to_reclaim);

    kr = vmdr_reclaim_from_buffer(metadata, bytes_to_reclaim,
        bytes_reclaimed, options);

    KDBG(VM_RECLAIM_CODE(VM_RECLAIM_TRIM) | DBG_FUNC_END, kr, *bytes_reclaimed);
    DTRACE_VM3(reclaim_trim,
        pid_t, metadata->vdrm_pid,
        size_t, bytes_to_reclaim,
        size_t, *bytes_reclaimed);
    return kr;
}

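/*
 * vmdr_drain() empties the entire ring: it is vmdr_trim() with an
 * unbounded target (UINT64_MAX is passed to vmdr_reclaim_from_buffer()).
 */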
/*
 * Caller must have buffer owned and unlocked
 */
static kern_return_t
vmdr_drain(vm_deferred_reclamation_metadata_t metadata, mach_vm_size_t *bytes_reclaimed,
    vm_deferred_reclamation_options_t options)
{
    kern_return_t kr;
    KDBG(VM_RECLAIM_CODE(VM_RECLAIM_DRAIN) | DBG_FUNC_START,
        metadata->vdrm_pid);

    kr = vmdr_reclaim_from_buffer(metadata, UINT64_MAX,
        bytes_reclaimed, options);

    KDBG(VM_RECLAIM_CODE(VM_RECLAIM_DRAIN) | DBG_FUNC_END, kr, *bytes_reclaimed);
    DTRACE_VM2(reclaim_drain,
        pid_t, metadata->vdrm_pid,
        size_t, *bytes_reclaimed);
    return kr;
}

mach_error_t
vm_deferred_reclamation_update_accounting_internal(task_t task, uint64_t *bytes_reclaimed_out)
{
    vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
    mach_vm_size_t bytes_to_reclaim = 0, bytes_reclaimed = 0;
    mach_error_t err = ERR_SUCCESS;

    if (metadata == NULL) {
        return KERN_NOT_FOUND;
    }

    if (!metadata->vdrm_pid) {
        /* If this is a forked child, we may not yet have a pid */
        metadata->vdrm_pid = task_pid(task);
    }

    KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_START,
        metadata->vdrm_pid);

    vmdr_metadata_lock(metadata);
    uint64_t now = mach_absolute_time();
    if (now - metadata->vdrm_last_sample_abs < vm_reclaim_sampling_period_abs) {
        /*
         * This is a fast path to avoid waiting on the gate if another
         * thread beat us to sampling.
         */
        vmdr_metadata_unlock(metadata);
        goto done;
    }
    vmdr_metadata_own_locked(metadata, RECLAIM_OPTIONS_NONE);
    vmdr_metadata_unlock(metadata);

    err = vmdr_sample_working_set(metadata, &bytes_to_reclaim, RECLAIM_OPTIONS_NONE);
    if (err != ERR_SUCCESS) {
        vmdr_metadata_disown(metadata);
        goto done;
    }
    if (bytes_to_reclaim) {
        vmdr_log_debug("[%d] trimming %llu B\n", metadata->vdrm_pid, bytes_to_reclaim);

        err = vmdr_trim(metadata, bytes_to_reclaim, &bytes_reclaimed, RECLAIM_OPTIONS_NONE);

        if (err == KERN_ABORTED) {
            /*
             * We were unable to complete the trim due to a lost
             * race with userspace. This need not be fatal because the
             * accounting was successfully updated.
             */
            err = KERN_SUCCESS;
        }
    }

    /*
     * Tell the client how many bytes the kernel has reclaimed
     * since the last time it updated its accounting
     */
    bytes_reclaimed += metadata->vdrm_kernel_bytes_reclaimed;
    metadata->vdrm_kernel_bytes_reclaimed = 0;

    vmdr_metadata_disown(metadata);

done:
    KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_END,
        metadata->vdrm_last_sample_abs,
        bytes_to_reclaim,
        bytes_reclaimed);
    *bytes_reclaimed_out = (uint64_t)bytes_reclaimed;
    return err;
}

kern_return_t
vm_deferred_reclamation_task_drain(task_t task,
    vm_deferred_reclamation_options_t options)
{
    kern_return_t kr;
    mach_vm_size_t bytes_reclaimed;

    task_lock(task);
    if (!task_is_active(task) || task_is_halting(task)) {
        task_unlock(task);
        return KERN_ABORTED;
    }
    vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
    if (metadata == NULL) {
        task_unlock(task);
        return KERN_SUCCESS;
    }
    vmdr_metadata_retain(metadata);
    task_unlock(task);

    vmdr_metadata_own(metadata);

    kr = vmdr_drain(metadata, &bytes_reclaimed, options);
    metadata->vdrm_kernel_bytes_reclaimed += bytes_reclaimed;

    vmdr_metadata_disown(metadata);
    vmdr_metadata_release(metadata);
    return kr;
}

void
vm_deferred_reclamation_task_suspend(task_t task)
{
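    /*
     * Nothing is reclaimed inline on the suspension path; the
     * scavenger thread is poked instead and will drain suspended
     * buffers asynchronously (see the RECLAIM_GC_SCAVENGE case in
     * vmdr_garbage_collect() below).
     */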
    if (task->deferred_reclamation_metadata) {
        sched_cond_signal(&vm_reclaim_scavenger_cond, vm_reclaim_scavenger_thread);
    }
}

#pragma mark KPIs

vm_deferred_reclamation_metadata_t
vm_deferred_reclamation_task_fork(task_t task, vm_deferred_reclamation_metadata_t parent)
{
    vm_deferred_reclamation_metadata_t metadata = NULL;
    vmdr_metadata_assert_owned(parent);
    vmdr_log_debug("forking [%d]\n", parent->vdrm_pid);

    assert(task->deferred_reclamation_metadata == NULL);
    metadata = vmdr_metadata_alloc(task, parent->vdrm_ring_addr,
        parent->vdrm_ring_size, parent->vdrm_buffer_len);

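    /*
     * Carry the parent's sampling state over so the child starts from
     * the same working-set estimate rather than from zero.
     */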
    metadata->vdrm_last_sample_abs = parent->vdrm_last_sample_abs;
    metadata->vdrm_kernel_bytes_reclaimed = parent->vdrm_kernel_bytes_reclaimed;
    metadata->vdrm_reclaimable_bytes_wma = parent->vdrm_reclaimable_bytes_wma;

    return metadata;
}

void
vm_deferred_reclamation_task_fork_register(vm_deferred_reclamation_metadata_t metadata)
{
    assert(metadata != NULL);
    assert(!metadata->vdrm_is_registered);

    lck_mtx_lock(&reclaim_buffers_lock);
    metadata->vdrm_is_registered = true;
    vmdr_list_append_locked(metadata);
    lck_mtx_unlock(&reclaim_buffers_lock);
}

bool
vm_deferred_reclamation_task_has_ring(task_t task)
{
    return task->deferred_reclamation_metadata != NULL;
}

void
vm_deferred_reclamation_ring_own(vm_deferred_reclamation_metadata_t metadata)
{
    vmdr_metadata_own(metadata);
}

void
vm_deferred_reclamation_ring_disown(vm_deferred_reclamation_metadata_t metadata)
{
    vmdr_metadata_disown(metadata);
}

void
vm_deferred_reclamation_gc(vm_deferred_reclamation_gc_action_t action,
    mach_vm_size_t *total_bytes_reclaimed,
    vm_deferred_reclamation_options_t options)
{
    vmdr_garbage_collect(action, total_bytes_reclaimed, options);
}

void
vm_deferred_reclamation_settle_ledger(task_t task)
{
    vm_deferred_reclamation_metadata_t meta = vmdr_acquire_task_metadata(task);
    if (meta == NULL) {
        return;
    }
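    /*
     * Rebase the est_reclaimable ledger to the most recent sample
     * rather than crediting incrementally, so an out-of-date estimate
     * is simply replaced instead of accumulating.
     */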
    vmdr_metadata_lock(meta);
    ledger_zero_balance(task->ledger, task_ledgers.est_reclaimable);
    ledger_credit(
        task->ledger,
        task_ledgers.est_reclaimable,
        meta->vdrm_reclaimable_bytes_last);
    vmdr_metadata_unlock(meta);
    vmdr_metadata_release(meta);
}

#pragma mark Global Reclamation GC

static void
vmdr_garbage_collect(vm_deferred_reclamation_gc_action_t action,
    mach_vm_size_t *total_bytes_reclaimed_out,
    vm_deferred_reclamation_options_t options)
{
    kern_return_t kr;
    mach_vm_size_t total_bytes_reclaimed = 0;
    gate_wait_result_t wr;

    lck_mtx_lock(&reclaim_buffers_lock);
    kr = lck_mtx_gate_try_close(&reclaim_buffers_lock, &vm_reclaim_gc_gate);
    if (kr != KERN_SUCCESS) {
        if (options & RECLAIM_NO_WAIT) {
            lck_mtx_unlock(&reclaim_buffers_lock);
            /* Report zero so callers never read an uninitialized count */
            *total_bytes_reclaimed_out = 0;
            return;
        }
        wr = lck_mtx_gate_wait(&reclaim_buffers_lock, &vm_reclaim_gc_gate,
            LCK_SLEEP_DEFAULT, THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
        assert3u(wr, ==, GATE_HANDOFF);
    }

    vm_reclaim_gc_epoch++;
    vmdr_log_debug("running global GC\n");
    while (true) {
        vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&reclaim_buffers);
        if (metadata == NULL) {
            break;
        }
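        /*
         * Rotate the buffer to the list tail before working on it;
         * combined with the per-buffer vdrm_reclaimed_at epoch check
         * below, this guarantees each buffer is visited at most once
         * per GC pass even though the list lock is dropped.
         */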
        vmdr_list_remove_locked(metadata);
        vmdr_list_append_locked(metadata);
        vmdr_metadata_retain(metadata);
        lck_mtx_unlock(&reclaim_buffers_lock);

        vmdr_metadata_lock(metadata);

        if (metadata->vdrm_reclaimed_at >= vm_reclaim_gc_epoch) {
            /* We've already seen this one. We're done. */
            vmdr_metadata_unlock(metadata);
            vmdr_metadata_release(metadata);
            lck_mtx_lock(&reclaim_buffers_lock);
            break;
        }
        metadata->vdrm_reclaimed_at = vm_reclaim_gc_epoch;

        task_t task = metadata->vdrm_task;
        if (task == TASK_NULL ||
            !task_is_active(task) ||
            task_is_halting(task)) {
            goto next;
        }
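        /*
         * Snapshot the app-suspension state and drop the task pointer:
         * it is not retained here, and only the snapshot is used once
         * the metadata lock may be released below.
         */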
        bool buffer_is_suspended = task_is_app_suspended(task);
        task = TASK_NULL;

        mach_vm_size_t bytes_reclaimed = 0;
        mach_vm_size_t bytes_to_reclaim = 0;

        switch (action) {
        case RECLAIM_GC_DRAIN:
            if (!vmdr_metadata_own_locked(metadata, options)) {
                goto next;
            }
            vmdr_metadata_unlock(metadata);

            vmdr_log_debug("draining [%d]\n", metadata->vdrm_pid);
            kr = vmdr_drain(metadata, &bytes_reclaimed, options);
            metadata->vdrm_kernel_bytes_reclaimed += bytes_reclaimed;

            vmdr_metadata_lock(metadata);
            vmdr_metadata_disown_locked(metadata);
            break;
        case RECLAIM_GC_SCAVENGE:
            if (buffer_is_suspended) {
                if (!vmdr_metadata_own_locked(metadata, options)) {
                    goto next;
                }
                vmdr_metadata_unlock(metadata);

                /* This buffer is no longer in use; fully reclaim it. */
                vmdr_log_debug("found suspended buffer [%d], draining\n", metadata->vdrm_pid);
                kr = vmdr_drain(metadata, &bytes_reclaimed, options);
                metadata->vdrm_kernel_bytes_reclaimed += bytes_reclaimed;

                vmdr_metadata_lock(metadata);
                vmdr_metadata_disown_locked(metadata);
            }
            break;
        case RECLAIM_GC_TRIM:
            if (!vmdr_metadata_own_locked(metadata, options)) {
                goto next;
            }
            vmdr_metadata_unlock(metadata);
            kr = vmdr_sample_working_set(metadata, &bytes_to_reclaim, options);
            if (kr == KERN_SUCCESS && bytes_to_reclaim) {
                vmdr_log_debug("GC found stale buffer [%d], trimming\n", metadata->vdrm_pid);
                kr = vmdr_trim(metadata, bytes_to_reclaim, &bytes_reclaimed, options);
                metadata->vdrm_kernel_bytes_reclaimed += bytes_reclaimed;
            }
            vmdr_metadata_lock(metadata);
            vmdr_metadata_disown_locked(metadata);
            break;
        }
        if (bytes_reclaimed) {
            vm_reclaim_gc_reclaim_count++;
            total_bytes_reclaimed += bytes_reclaimed;
        }
        if (metadata->vdrm_waiters && action != RECLAIM_GC_TRIM) {
            thread_wakeup((event_t)&metadata->vdrm_waiters);
        }
next:
        vmdr_metadata_unlock(metadata);
        vmdr_metadata_release(metadata);
        lck_mtx_lock(&reclaim_buffers_lock);
    }
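    /*
     * Hand the GC gate off to the next waiter, if any; otherwise
     * reopen it so a future GC can close it again.
     */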
    lck_mtx_gate_handoff(&reclaim_buffers_lock, &vm_reclaim_gc_gate, GATE_HANDOFF_OPEN_IF_NO_WAITERS);
    lck_mtx_unlock(&reclaim_buffers_lock);
    *total_bytes_reclaimed_out = total_bytes_reclaimed;
}

OS_NORETURN
static void
vm_reclaim_scavenger_thread_continue(__unused void *param, __unused wait_result_t wr)
{
    sched_cond_ack(&vm_reclaim_scavenger_cond);

    while (true) {
        mach_vm_size_t total_bytes_reclaimed;
        vmdr_garbage_collect(RECLAIM_GC_SCAVENGE, &total_bytes_reclaimed,
            RECLAIM_OPTIONS_NONE);
        vmdr_log_info("scavenger reclaimed %llu KiB of virtual memory\n",
            total_bytes_reclaimed >> 10);
        sched_cond_wait(&vm_reclaim_scavenger_cond, THREAD_UNINT,
            vm_reclaim_scavenger_thread_continue);
    }
}

OS_NORETURN
static void
vm_reclaim_scavenger_thread_init(__unused void *param, __unused wait_result_t wr)
{
    thread_set_thread_name(current_thread(), "VM_reclaim_scavenger");
#if CONFIG_THREAD_GROUPS
    thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */
    sched_cond_wait(&vm_reclaim_scavenger_cond, THREAD_UNINT, vm_reclaim_scavenger_thread_continue);
    __builtin_unreachable();
}

__startup_func
static void
vm_deferred_reclamation_init(void)
{
    vm_reclaim_log_handle = os_log_create("com.apple.xnu", "vm_reclaim");
    nanoseconds_to_absolutetime((uint64_t)vm_reclaim_sampling_period_ns,
        &vm_reclaim_sampling_period_abs);

    sched_cond_init(&vm_reclaim_scavenger_cond);
    lck_mtx_gate_init(&reclaim_buffers_lock, &vm_reclaim_gc_gate);
    kern_return_t kr = kernel_thread_start_priority(vm_reclaim_scavenger_thread_init,
        NULL, BASEPRI_KERNEL, &vm_reclaim_scavenger_thread);
    if (kr != KERN_SUCCESS) {
        panic("Unable to create VM reclaim thread, %d", kr);
    }
}

STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_deferred_reclamation_init);

#pragma mark Debug Interfaces

#if DEVELOPMENT || DEBUG

bool
vm_deferred_reclamation_block_until_task_has_been_reclaimed(task_t task)
{
    bool reclaimed;
    vm_deferred_reclamation_metadata_t metadata;

    metadata = vmdr_acquire_task_metadata(task);
    if (metadata == NULL) {
        return false;
    }
    vmdr_metadata_lock(metadata);

    metadata->vdrm_waiters++;
    /* Wake up the scavenger thread */
    sched_cond_signal(&vm_reclaim_scavenger_cond, vm_reclaim_scavenger_thread);
    wait_result_t wr = lck_mtx_sleep(&metadata->vdrm_lock,
        LCK_SLEEP_DEFAULT, (event_t)&metadata->vdrm_waiters,
        THREAD_ABORTSAFE);
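    /*
     * lck_mtx_sleep() dropped and reacquired vdrm_lock around the
     * wait, so the decrement below still happens under the lock.
     */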
    metadata->vdrm_waiters--;
    reclaimed = (wr == THREAD_AWAKENED);

    vmdr_metadata_unlock(metadata);
    vmdr_metadata_release(metadata);
    return reclaimed;
}

#endif /* DEVELOPMENT || DEBUG */

#pragma mark Introspectibility

kern_return_t
vm_deferred_reclamation_buffer_query_internal(
    task_t task,
    mach_vm_address_ut *addr_out_u,
    mach_vm_size_ut *size_out_u)
{
    vm_deferred_reclamation_metadata_t meta;

    if (task == NULL) {
        return KERN_INVALID_TASK;
    }

    if ((addr_out_u == NULL) || (size_out_u == NULL)) {
        return KERN_INVALID_ARGUMENT;
    }

    meta = vmdr_acquire_task_metadata(task);

    if (meta == NULL) {
        *addr_out_u = vm_sanitize_wrap_addr(0);
        *size_out_u = vm_sanitize_wrap_size(0);
    } else {
        vmdr_metadata_lock(meta);
        *addr_out_u = vm_sanitize_wrap_addr(meta->vdrm_ring_addr);
        *size_out_u = vm_sanitize_wrap_size(meta->vdrm_ring_size);
        vmdr_metadata_unlock(meta);
        vmdr_metadata_release(meta);
    }

    return KERN_SUCCESS;
}