/*
 * Copyright (c) 2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/exc_guard.h>
#include <kern/locks.h>
#include <kern/task.h>
#include <kern/zalloc.h>
#include <kern/misc_protos.h>
#include <kern/startup.h>
#include <kern/sched.h>
#include <libkern/OSAtomic.h>
#include <mach/kern_return.h>
#include <mach/mach_types.h>
#include <mach/vm_reclaim.h>
#include <os/log.h>
#include <pexpert/pexpert.h>
#include <vm/vm_fault_xnu.h>
#include <vm/vm_map.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_reclaim_internal.h>
#include <vm/vm_sanitize_internal.h>
#include <sys/errno.h>
#include <sys/kdebug.h>
#include <vm/vm_kern_xnu.h>
#include <sys/queue.h>
#include <sys/reason.h>
#include <os/atomic_private.h>
#include <os/refcnt.h>
#include <os/refcnt_internal.h>

#pragma mark Tunables

#define VM_RECLAIM_THRESHOLD_DISABLED 0ULL

TUNABLE(uint32_t, kReclaimChunkSize, "vm_reclaim_chunk_size", 16);
static integer_t kReclaimThreadPriority = BASEPRI_VM;
// Reclaim down to vm_reclaim_max_threshold / vm_reclaim_trim_divisor when doing a trim reclaim operation
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_trim_divisor, "vm_reclaim_trim_divisor", 2);
TUNABLE_DT_DEV_WRITEABLE(uint64_t, vm_reclaim_max_threshold, "/defaults", "kern.vm_reclaim_max_threshold", "vm_reclaim_max_threshold", 0, TUNABLE_DT_NONE);
// Used to debug vm_reclaim kills
TUNABLE(bool, panic_on_kill, "vm_reclaim_panic_on_kill", false);

#pragma mark Declarations
typedef struct proc *proc_t;
extern const char *proc_best_name(struct proc *);
extern kern_return_t kern_return_for_errno(int);
extern int exit_with_mach_exception(void *p, exception_info_t info, int flags);
struct proc *proc_ref(struct proc *p, int locked);
int proc_rele(proc_t p);
static kern_return_t reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head);
static kern_return_t reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail);
static kern_return_t reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy);

os_refgrp_decl(static, vdrm_refgrp, "vm_reclaim_metadata_refgrp", NULL);

struct vm_deferred_reclamation_metadata_s {
	/*
	 * Global list containing every reclamation buffer. Protected by the
	 * reclamation_buffers_lock.
	 */
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_list;
	/*
	 * A list containing buffers that are ripe for reclamation. Protected by
	 * the async_reclamation_buffers_lock.
	 */
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_async_list;
	/* Protects all struct fields (except where denoted otherwise) */
	decl_lck_mtx_data(, vdrm_lock);
	decl_lck_mtx_gate_data(, vdrm_gate);
	/*
	 * The task owns this structure but we maintain a backpointer here
	 * so that we can send an exception if we hit an error.
	 * Since this is a backpointer we don't hold a reference (it's a weak pointer).
	 */
	task_t vdrm_task;
	pid_t vdrm_pid;
	vm_map_t vdrm_map;
	/*
	 * The owning task holds a ref on this object. When the task dies, it
	 * will set vdrm_task := NULL and drop its ref. Threads operating on the
	 * buffer should hold a +1 on the metadata structure to ensure its
	 * validity.
	 */
	os_refcnt_t vdrm_refcnt;
	user_addr_t vdrm_reclaim_buffer;
	mach_vm_size_t vdrm_buffer_size;
	user_addr_t vdrm_reclaim_indices;
	uint64_t vdrm_reclaimed_at;
	/*
	 * These two values are running sums of the bytes placed in the buffer
	 * and the bytes reclaimed out of the buffer, respectively. Both are in
	 * terms of virtual memory, so they give an upper bound on the amount
	 * of physical memory that can be reclaimed. To estimate the amount of
	 * reclaimable VA currently in the buffer, compute
	 * vdrm_num_bytes_put_in_buffer - vdrm_num_bytes_reclaimed.
	 * Note that neither value is protected by the vdrm_lock.
	 */
	_Atomic size_t vdrm_num_bytes_put_in_buffer;
	_Atomic size_t vdrm_num_bytes_reclaimed;
	/*
	 * The number of threads waiting for a pending reclamation
	 * on this buffer to complete. Protected by the
	 * async_reclamation_buffers_lock.
	 */
	uint32_t vdrm_waiters;
};
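
/*
 * Worked example of the byte accounting above (illustrative numbers only):
 * if userspace has placed a cumulative 96MB of VA in the buffer
 * (vdrm_num_bytes_put_in_buffer) and the kernel has reclaimed a cumulative
 * 64MB (vdrm_num_bytes_reclaimed), roughly 32MB of reclaimable VA remains.
 * Because the two counters are unsynchronized running sums, the difference
 * can transiently appear negative; readers clamp it to zero (see
 * reclaim_entries_from_buffer()).
 */
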
static void vmdr_process_async_reclamation_list(void);

extern void *proc_find(int pid);
extern task_t proc_task(proc_t);

#pragma mark Globals
static KALLOC_TYPE_DEFINE(vm_reclaim_metadata_zone, struct vm_deferred_reclamation_metadata_s, KT_DEFAULT);
static LCK_GRP_DECLARE(vm_reclaim_lock_grp, "vm_reclaim");
static os_log_t vm_reclaim_log_handle;

/*
 * We maintain two lists of reclamation buffers.
 * The reclamation_buffers list contains every buffer in the system.
 * The async_reclamation_buffers_list contains buffers that are ripe for reclamation.
 * Each list has its own lock.
 */
static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) reclamation_buffers = TAILQ_HEAD_INITIALIZER(reclamation_buffers);

static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) async_reclamation_buffers = TAILQ_HEAD_INITIALIZER(async_reclamation_buffers);
/*
 * The reclamation_buffers_lock protects the reclamation_buffers list.
 * It must be held when iterating over the list or manipulating the list.
 * It should be dropped when acting on a specific metadata entry after acquiring the vdrm_lock.
 */
LCK_MTX_DECLARE(reclamation_buffers_lock, &vm_reclaim_lock_grp);
LCK_MTX_DECLARE(async_reclamation_buffers_lock, &vm_reclaim_lock_grp);
static uint64_t reclamation_counter; // generation count for global reclaims


static void vmdr_list_append_locked(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_list_remove_locked(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_async_list_append_locked(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_async_list_remove_locked(vm_deferred_reclamation_metadata_t metadata);

static SECURITY_READ_ONLY_LATE(thread_t) vm_reclaim_thread;
static void reclaim_thread(void *param __unused, wait_result_t wr __unused);
static void vmdr_metadata_release(vm_deferred_reclamation_metadata_t metadata);

#pragma mark Implementation

static vm_deferred_reclamation_metadata_t
vmdr_metadata_alloc(
	task_t task,
	user_addr_t buffer,
	mach_vm_size_t size,
	user_addr_t indices)
{
	vm_deferred_reclamation_metadata_t metadata;
	vm_map_t map = task->map;

	assert(!map->is_nested_map);

	metadata = zalloc_flags(vm_reclaim_metadata_zone, Z_WAITOK | Z_ZERO);
	lck_mtx_init(&metadata->vdrm_lock, &vm_reclaim_lock_grp, LCK_ATTR_NULL);
	lck_mtx_gate_init(&metadata->vdrm_lock, &metadata->vdrm_gate);
	os_ref_init(&metadata->vdrm_refcnt, &vdrm_refgrp);

	metadata->vdrm_task = task;
	/*
	 * Forked children will not yet have a pid. Lazily set the pid once the
	 * task has been started.
	 *
	 * TODO: do not support buffer initialization during fork and have libmalloc
	 * initialize the buffer after fork. (rdar://124295804)
	 */
	metadata->vdrm_pid = 0;
	metadata->vdrm_map = map;
	metadata->vdrm_reclaim_buffer = buffer;
	metadata->vdrm_buffer_size = size;
	metadata->vdrm_reclaim_indices = indices;

	/*
	 * we do not need to hold a lock on `task` because this is called
	 * either at fork() time or from the context of current_task().
	 */
	vm_map_reference(map);
	return metadata;
}

static void
vmdr_metadata_free(vm_deferred_reclamation_metadata_t metadata)
{
	assert3u(os_ref_get_count(&metadata->vdrm_refcnt), ==, 0);
	vm_map_deallocate(metadata->vdrm_map);
	lck_mtx_destroy(&metadata->vdrm_lock, &vm_reclaim_lock_grp);
	lck_mtx_gate_destroy(&metadata->vdrm_lock, &metadata->vdrm_gate);
	zfree(vm_reclaim_metadata_zone, metadata);
}

kern_return_t
vm_deferred_reclamation_buffer_init_internal(
	task_t task,
	mach_vm_address_t *address,
	mach_vm_size_t size)
{
	kern_return_t kr = KERN_FAILURE;
	vm_deferred_reclamation_metadata_t metadata = NULL;
	vm_map_t map;
	uint64_t head = 0, tail = 0, busy = 0;
	static bool reclaim_disabled_logged = false;

	if (task == TASK_NULL || address == NULL || size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	if (!vm_reclaim_max_threshold) {
		if (!reclaim_disabled_logged) {
			/* Avoid logging failure for every new process */
			reclaim_disabled_logged = true;
			os_log_error(vm_reclaim_log_handle,
			    "vm_reclaim: failed to initialize vmdr buffer - reclaim is disabled (%llu)\n",
			    vm_reclaim_max_threshold);
		}
		return KERN_NOT_SUPPORTED;
	}

	map = task->map;
	size = vm_map_round_page(size, VM_MAP_PAGE_MASK(map));

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_START,
	    task_pid(task), size);
	/*
	 * TODO: If clients other than libmalloc adopt deferred reclaim, a
	 * different tag should be given
	 */
	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE_PERMANENT(
		.vm_tag = VM_MEMORY_MALLOC);
	mach_vm_offset_ut *offset_u = vm_sanitize_wrap_addr_ref(address);
	mach_vm_size_ut size_u = vm_sanitize_wrap_size(size);
	kr = mach_vm_allocate_kernel(map, offset_u, size_u, vmk_flags);
	if (kr != KERN_SUCCESS) {
		os_log_error(vm_reclaim_log_handle, "vm_reclaim: failed to allocate VA for reclaim "
		    "buffer (%d) - %s [%d]\n", kr, task_best_name(task), task_pid(task));
		return kr;
	}
	assert3u(*address, !=, 0);

	user_addr_t buffer = *address +
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, entries);
	mach_vm_size_t buffer_size = size -
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, entries);
	user_addr_t indices = *address +
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, indices);
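	/*
	 * Layout sketch (illustrative; the authoritative definition is
	 * struct mach_vm_reclaim_buffer_v1_s, and the field order shown here
	 * is an assumption): the single allocation at *address holds the
	 * shared indices (head, tail, busy) alongside the entries[] ring;
	 * `buffer` points at entries[0] and `buffer_size` is the number of
	 * bytes available for entries.
	 */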

	metadata = vmdr_metadata_alloc(task, buffer, buffer_size, indices);

	/*
	 * Validate the starting indices.
	 */
	kr = reclaim_copyin_busy(metadata, &busy);
	if (kr != KERN_SUCCESS) {
		goto out;
	}
	kr = reclaim_copyin_head(metadata, &head);
	if (kr != KERN_SUCCESS) {
		goto out;
	}
	kr = reclaim_copyin_tail(metadata, &tail);
	if (kr != KERN_SUCCESS) {
		goto out;
	}

	if (head != 0 || tail != 0 || busy != 0) {
		os_log_error(vm_reclaim_log_handle, "vm_reclaim: indices were not "
		    "zero-initialized\n");
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	/*
	 * Publish the metadata to the task & global buffer list. This must be
	 * done under the task lock to synchronize with task termination - i.e.
	 * task_terminate_internal is guaranteed to see the published metadata and
	 * tear it down.
	 */
	lck_mtx_lock(&reclamation_buffers_lock);
	task_lock(task);

	if (!task_is_active(task) || task_is_halting(task)) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: failed to initialize buffer on dying task %s [%d]", task_best_name(task), task_pid(task));
		kr = KERN_ABORTED;
		goto fail_task;
	}
	if (task->deferred_reclamation_metadata != NULL) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: tried to overwrite existing reclaim buffer for %s [%d]", task_best_name(task), task_pid(task));
		kr = KERN_INVALID_ARGUMENT;
		goto fail_task;
	}

	vmdr_list_append_locked(metadata);

	task->deferred_reclamation_metadata = metadata;

	task_unlock(task);
	lck_mtx_unlock(&reclamation_buffers_lock);

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END,
	    task_pid(task), KERN_SUCCESS, *address);
	return KERN_SUCCESS;

fail_task:
	task_unlock(task);
	lck_mtx_unlock(&reclamation_buffers_lock);

out:
	vmdr_metadata_release(metadata);
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END,
	    task_pid(task), kr);
	return kr;
}

#pragma mark Synchronization

static inline void
vmdr_metadata_lock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_lock(&metadata->vdrm_lock);
}

static inline void
vmdr_metadata_unlock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_unlock(&metadata->vdrm_lock);
}

static inline void
vmdr_metadata_assert_owned_locked(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_gate_assert(&metadata->vdrm_lock, &metadata->vdrm_gate,
	    GATE_ASSERT_HELD);
}

static inline void
vmdr_metadata_assert_owned(vm_deferred_reclamation_metadata_t metadata)
{
#if MACH_ASSERT
	vmdr_metadata_lock(metadata);
	vmdr_metadata_assert_owned_locked(metadata);
	vmdr_metadata_unlock(metadata);
#else /* MACH_ASSERT */
	(void)metadata;
#endif /* MACH_ASSERT */
}


/*
 * Try to take ownership of the buffer. Returns true if successful.
 */
static bool
vmdr_metadata_try_own_locked(vm_deferred_reclamation_metadata_t metadata)
{
	kern_return_t kr = lck_mtx_gate_try_close(&metadata->vdrm_lock,
	    &metadata->vdrm_gate);
	return kr == KERN_SUCCESS;
}

static void
vmdr_metadata_own_locked(vm_deferred_reclamation_metadata_t metadata)
{
	__assert_only gate_wait_result_t wait_result;
	if (!vmdr_metadata_try_own_locked(metadata)) {
		wait_result = lck_mtx_gate_wait(
			&metadata->vdrm_lock, &metadata->vdrm_gate, LCK_SLEEP_DEFAULT,
			THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
		assert(wait_result == GATE_HANDOFF);
	}
}

/*
 * Set the current thread as the owner of a reclaim buffer. May block. Will
 * propagate priority.
 */
static void
vmdr_metadata_own(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_lock(metadata);
	vmdr_metadata_own_locked(metadata);
	vmdr_metadata_unlock(metadata);
}

static void
vmdr_metadata_disown_locked(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_assert_owned_locked(metadata);
	lck_mtx_gate_handoff(&metadata->vdrm_lock, &metadata->vdrm_gate,
	    GATE_HANDOFF_OPEN_IF_NO_WAITERS);
}

/*
 * Release ownership of a reclaim buffer and wakeup any threads waiting for
 * ownership. Must be called from the thread that acquired ownership.
 */
static void
vmdr_metadata_disown(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_lock(metadata);
	vmdr_metadata_disown_locked(metadata);
	vmdr_metadata_unlock(metadata);
}

static void
vmdr_metadata_retain(vm_deferred_reclamation_metadata_t metadata)
{
	os_ref_retain(&metadata->vdrm_refcnt);
}

static void
vmdr_metadata_release(vm_deferred_reclamation_metadata_t metadata)
{
	if (os_ref_release(&metadata->vdrm_refcnt) == 0) {
		vmdr_metadata_free(metadata);
	}
}

void
vm_deferred_reclamation_buffer_own(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_own(metadata);
}

void
vm_deferred_reclamation_buffer_disown(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_disown(metadata);
}
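
/*
 * Typical ownership sequence (a sketch modeled on
 * vm_deferred_reclamation_buffer_synchronize_internal() below, not a
 * verbatim caller):
 *
 *	vmdr_metadata_own(metadata);    // may block; propagates priority
 *	kr = reclaim_chunk(metadata, &num_reclaimed, RECLAIM_OPTIONS_NONE);
 *	vmdr_metadata_disown(metadata); // hand off or open the gate
 *
 * Ownership (the gate) serializes reclamation on a buffer without having
 * to hold the vdrm_lock across the copyio and unmap work.
 */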

#pragma mark Global Queue Management

static void
vmdr_list_remove_locked(vm_deferred_reclamation_metadata_t metadata)
{
	LCK_MTX_ASSERT(&reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);
	assert(metadata->vdrm_list.tqe_prev != NULL);
	TAILQ_REMOVE(&reclamation_buffers, metadata, vdrm_list);
	metadata->vdrm_list.tqe_prev = NULL;
	metadata->vdrm_list.tqe_next = NULL;
}

static void
vmdr_list_append_locked(vm_deferred_reclamation_metadata_t metadata)
{
	LCK_MTX_ASSERT(&reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);
	assert(metadata->vdrm_list.tqe_prev == NULL);
	TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
}

static void
vmdr_async_list_remove_locked(vm_deferred_reclamation_metadata_t metadata)
{
	LCK_MTX_ASSERT(&async_reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);
	assert(metadata->vdrm_async_list.tqe_prev != NULL);
	TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
	metadata->vdrm_async_list.tqe_prev = NULL;
	metadata->vdrm_async_list.tqe_next = NULL;
}

static void
vmdr_async_list_append_locked(vm_deferred_reclamation_metadata_t metadata)
{
	LCK_MTX_ASSERT(&async_reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);
	assert(metadata->vdrm_async_list.tqe_prev == NULL);
	TAILQ_INSERT_TAIL(&async_reclamation_buffers, metadata, vdrm_async_list);
}

static bool
vmdr_metadata_has_pending_reclamation(vm_deferred_reclamation_metadata_t metadata)
{
	LCK_MTX_ASSERT(&async_reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);
	return metadata->vdrm_async_list.tqe_prev != NULL;
}

#pragma mark Lifecycle

void
vm_deferred_reclamation_buffer_uninstall(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	/*
	 * First remove the buffer from the global list so no one else can get access to it.
	 */
	lck_mtx_lock(&reclamation_buffers_lock);
	vmdr_list_remove_locked(metadata);
	lck_mtx_unlock(&reclamation_buffers_lock);

	/*
	 * Now remove it from the async list (if present)
	 */
	lck_mtx_lock(&async_reclamation_buffers_lock);
	if (vmdr_metadata_has_pending_reclamation(metadata)) {
		vmdr_async_list_remove_locked(metadata);
	}
	lck_mtx_unlock(&async_reclamation_buffers_lock);
}

void
vm_deferred_reclamation_buffer_deallocate(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	/* Buffer must be uninstalled before being deallocated */
	assert(metadata->vdrm_async_list.tqe_prev == NULL);
	assert(metadata->vdrm_async_list.tqe_next == NULL);
	assert(metadata->vdrm_list.tqe_prev == NULL);
	assert(metadata->vdrm_list.tqe_next == NULL);
	/*
	 * The task is dropping its ref on this buffer. First remove the buffer's
	 * back-reference to the task so that any threads currently operating on
	 * this buffer do not try to operate on the dead/dying task
	 */
	vmdr_metadata_lock(metadata);
	metadata->vdrm_task = TASK_NULL;
	vmdr_metadata_unlock(metadata);

	vmdr_metadata_release(metadata);
}

#pragma mark Exception Delivery

static void
reclaim_kill_with_reason(
	vm_deferred_reclamation_metadata_t metadata,
	unsigned reason,
	mach_exception_data_type_t subcode)
{
	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
	mach_exception_code_t code = 0;
	task_t task;
	proc_t p = NULL;
	boolean_t fatal = TRUE;
	bool killing_self;
	pid_t pid;
	int err;

	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_NOTOWNED);

	EXC_GUARD_ENCODE_TYPE(code, guard_type);
	EXC_GUARD_ENCODE_FLAVOR(code, reason);
	EXC_GUARD_ENCODE_TARGET(code, 0);

	vmdr_metadata_lock(metadata);
	task = metadata->vdrm_task;
	if (task == TASK_NULL || !task_is_active(task) || task_is_halting(task)) {
		/* Task is no longer alive */
		vmdr_metadata_unlock(metadata);
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to deliver guard exception because task "
		    "[%d] is already dead.\n",
		    task ? task_pid(task) : -1);
		return;
	}

	if (panic_on_kill) {
		panic("vm_reclaim: About to kill %p due to %d with subcode %lld\n", task, reason, subcode);
	}

	killing_self = (task == current_task());
	if (!killing_self) {
		task_reference(task);
	}
	assert(task != kernel_task);
	vmdr_metadata_unlock(metadata);

	if (reason == kGUARD_EXC_DEALLOC_GAP) {
		task_lock(task);
		fatal = (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL);
		task_unlock(task);
	}

	if (!fatal) {
		os_log_info(vm_reclaim_log_handle,
		    "vm_reclaim: Skipping non-fatal guard exception.\n");
		goto out;
	}

	pid = task_pid(task);
	if (killing_self) {
		p = get_bsdtask_info(task);
	} else {
		p = proc_find(pid);
		if (p && proc_task(p) != task) {
			os_log_error(vm_reclaim_log_handle,
			    "vm_reclaim: Unable to deliver guard exception because proc is gone & pid rolled over.\n");
			goto out;
		}
	}

	if (!p) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to deliver guard exception because task does not have a proc.\n");
		goto out;
	}

	int flags = PX_DEBUG_NO_HONOR;
	exception_info_t info = {
		.os_reason = OS_REASON_GUARD,
		.exception_type = EXC_GUARD,
		.mx_code = code,
		.mx_subcode = subcode
	};

	err = exit_with_mach_exception(p, info, flags);
	if (err != 0) {
		os_log_error(vm_reclaim_log_handle, "vm_reclaim: Unable to deliver guard exception to %p: %d\n", p, err);
		goto out;
	}


out:
	if (!killing_self) {
		if (p) {
			proc_rele(p);
			p = NULL;
		}
		if (task) {
			task_deallocate(task);
			task = NULL;
		}
	}
}

#pragma mark CopyI/O

static user_addr_t
get_head_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, head);
}

static user_addr_t
get_tail_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, tail);
}

static user_addr_t
get_busy_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, busy);
}

static kern_return_t
reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result)
{
	if (result != 0 && (result != EFAULT || !vm_fault_get_disabled())) {
		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_COPYIO_FAILURE,
		    result);
	}
	return kern_return_for_errno(result);
}

/*
 * Helper functions to do copyio on the head, tail, and busy pointers.
 * Note that the kernel will only write to the busy and head pointers.
 * Userspace is not supposed to write to the head or busy pointers, but the kernel
 * must be resilient to that kind of bug in userspace.
 */
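/*
 * Assumed userspace view of the indices (a sketch; the authoritative
 * definition is mach_vm_reclaim_indices_v1_t):
 *
 *	head: first entry not yet reclaimed; advanced only by the kernel
 *	busy: end of the range the kernel is currently reclaiming
 *	      (busy == head when no reclamation is in flight); written only
 *	      by the kernel
 *	tail: end of the entries published by userspace; written only by
 *	      userspace
 *
 * All three are 64-bit values accessed with single-copy atomicity via
 * copyin_atomic64()/copyout_atomic64().
 */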

static kern_return_t
reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head)
{
	int result;
	kern_return_t kr;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t head_ptr = get_head_ptr(indices);

	result = copyin_atomic64(head_ptr, head);
	kr = reclaim_handle_copyio_error(metadata, result);
	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy head ptr from 0x%llx: err=%d\n", head_ptr, result);
	}
	return kr;
}

static kern_return_t
reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail)
{
	int result;
	kern_return_t kr;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t tail_ptr = get_tail_ptr(indices);

	result = copyin_atomic64(tail_ptr, tail);
	kr = reclaim_handle_copyio_error(metadata, result);
	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy tail ptr from 0x%llx: err=%d\n", tail_ptr, result);
	}
	return kr;
}

static kern_return_t
reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy)
{
	int result;
	kern_return_t kr;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t busy_ptr = get_busy_ptr(indices);

	result = copyin_atomic64(busy_ptr, busy);
	kr = reclaim_handle_copyio_error(metadata, result);
	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy busy ptr from 0x%llx: err=%d\n", busy_ptr, result);
	}
	return kr;
}

static kern_return_t
reclaim_copyout_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	kern_return_t kr;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t busy_ptr = get_busy_ptr(indices);

	result = copyout_atomic64(value, busy_ptr);
	kr = reclaim_handle_copyio_error(metadata, result);
	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy %llu to busy ptr at 0x%llx: err=%d\n", value, busy_ptr, result);
	}
	return kr;
}

static kern_return_t
reclaim_copyout_head(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	kern_return_t kr;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t head_ptr = get_head_ptr(indices);

	result = copyout_atomic64(value, head_ptr);
	kr = reclaim_handle_copyio_error(metadata, result);
	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy %llu to head ptr at 0x%llx: err=%d\n", value, head_ptr, result);
	}
	return kr;
}

#pragma mark Reclamation

/*
 * Reclaim a chunk (kReclaimChunkSize entries) from the buffer.
 *
 * Writes the number of entries reclaimed to `num_reclaimed_out`. Note that
 * there may be zero reclaimable entries in the chunk (they have all been
 * re-used by userspace).
 *
 * Returns:
 * - KERN_NOT_FOUND if the buffer has been exhausted (head == tail)
 * - KERN_FAILURE on failure to reclaim -- metadata lock will be dropped
 *   before returning
 */
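/*
 * Example of the busy/tail negotiation performed below (indices are
 * illustrative): with head == busy == 10 and tail == 14, the kernel
 * publishes busy = 14 and then re-reads tail. If userspace has meanwhile
 * rewound tail to 12 to re-use entries 12 and 13, the kernel shrinks its
 * claim to [10, 12) and retries. The claimed range only ever shrinks, so
 * the negotiation terminates.
 */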
static kern_return_t
reclaim_chunk(vm_deferred_reclamation_metadata_t metadata,
    size_t *num_reclaimed_out, vm_deferred_reclamation_options_t options)
{
	kern_return_t kr;
	int result = 0;
	size_t num_reclaimed = 0;
	uint64_t head = 0, tail = 0, busy = 0, num_to_reclaim = 0, new_tail = 0,
	    num_copied = 0, buffer_len = 0;
	user_addr_t indices;
	vm_map_t map = metadata->vdrm_map, old_map;
	mach_vm_reclaim_entry_v1_t reclaim_entries[kReclaimChunkSize];

	assert(metadata != NULL);
	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_NOTOWNED);

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_START,
	    metadata->vdrm_pid, kReclaimChunkSize);

	buffer_len = metadata->vdrm_buffer_size /
	    sizeof(mach_vm_reclaim_entry_v1_t);

	memset(reclaim_entries, 0, sizeof(reclaim_entries));

	indices = (user_addr_t) metadata->vdrm_reclaim_indices;
	old_map = vm_map_switch(map);

	if (options & RECLAIM_NO_FAULT) {
		vm_fault_disable();
	}

	kr = reclaim_copyin_busy(metadata, &busy);
	if (kr != KERN_SUCCESS) {
		goto fail;
	}
	kr = reclaim_copyin_head(metadata, &head);
	if (kr != KERN_SUCCESS) {
		goto fail;
	}
	kr = reclaim_copyin_tail(metadata, &tail);
	if (kr != KERN_SUCCESS) {
		goto fail;
	}

	if (busy != head) {
		// Userspace overwrote one of the pointers
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Userspace modified head or busy pointer! head: %llu "
		    "(0x%llx) != busy: %llu (0x%llx) | tail = %llu (0x%llx)\n",
		    head, get_head_ptr(indices), busy, get_busy_ptr(indices), tail,
		    get_tail_ptr(indices));
		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_INDEX_FAILURE,
		    busy);
		kr = KERN_FAILURE;
		goto fail;
	}

	if (tail < head) {
		/*
		 * Userspace is likely in the middle of trying to re-use an entry,
		 * bail on this reclamation.
		 */
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Userspace modified head or tail pointer! head: %llu "
		    "(0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
		    head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy,
		    get_busy_ptr(indices));
		kr = KERN_FAILURE;
		goto fail;
	}

	/*
	 * NB: If any of the copyouts below fail due to faults being disabled,
	 * the buffer may be left in a state where several entries are unusable
	 * until the next reclamation (i.e. busy > head)
	 */
	num_to_reclaim = tail - head;
	while (true) {
		num_to_reclaim = MIN(num_to_reclaim, kReclaimChunkSize);
		if (num_to_reclaim == 0) {
			break;
		}
		busy = head + num_to_reclaim;
		kr = reclaim_copyout_busy(metadata, busy);
		if (kr != KERN_SUCCESS) {
			goto fail;
		}
		os_atomic_thread_fence(seq_cst);
		kr = reclaim_copyin_tail(metadata, &new_tail);
		if (kr != KERN_SUCCESS) {
			goto fail;
		}

		if (new_tail >= busy) {
			/* Got num_to_reclaim entries */
			break;
		}
		tail = new_tail;
		if (tail < head) {
			/*
			 * Userspace is likely in the middle of trying to re-use an entry,
			 * bail on this reclamation
			 */
			os_log_error(vm_reclaim_log_handle,
			    "vm_reclaim: Userspace modified head or tail pointer! head: "
			    "%llu (0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
			    head, get_head_ptr(indices), tail, get_tail_ptr(indices),
			    busy, get_busy_ptr(indices));
			/* Reset busy back to head */
			reclaim_copyout_busy(metadata, head);
			kr = KERN_FAILURE;
			goto fail;
		}
		/* Can't reclaim these entries. Try again */
		num_to_reclaim = tail - head;
		if (num_to_reclaim == 0) {
			/* Nothing left to reclaim. Reset busy to head. */
			kr = reclaim_copyout_busy(metadata, head);
			if (kr != KERN_SUCCESS) {
				goto fail;
			}
			break;
		}
		/*
		 * Note that num_to_reclaim must have gotten smaller since tail
		 * got smaller, so this is guaranteed to converge.
		 */
	}

	while (num_copied < num_to_reclaim) {
		uint64_t memcpy_start_idx = (head % buffer_len);
		uint64_t memcpy_end_idx = memcpy_start_idx + num_to_reclaim - num_copied;
		// Clamp the end idx to the buffer. We'll handle wrap-around in our next go around the loop.
		memcpy_end_idx = MIN(memcpy_end_idx, buffer_len);
		uint64_t num_to_copy = memcpy_end_idx - memcpy_start_idx;

		assert(num_to_copy + num_copied <= kReclaimChunkSize);
		user_addr_t src_ptr = metadata->vdrm_reclaim_buffer +
		    (memcpy_start_idx * sizeof(mach_vm_reclaim_entry_v1_t));
		mach_vm_reclaim_entry_v1_t *dst_ptr = reclaim_entries + num_copied;

		result = copyin(src_ptr, dst_ptr,
		    (num_to_copy * sizeof(mach_vm_reclaim_entry_v1_t)));
		kr = reclaim_handle_copyio_error(metadata, result);
		if (kr != KERN_SUCCESS) {
			if (kr != KERN_MEMORY_ERROR) {
				os_log_error(vm_reclaim_log_handle,
				    "vm_reclaim: Unable to copyin %llu entries in reclaim "
				    "buffer at 0x%llx to 0x%llx: err=%d\n",
				    num_to_copy, src_ptr, (uint64_t) dst_ptr, result);
			}
			goto fail;
		}

		num_copied += num_to_copy;
		head += num_to_copy;
	}

	for (size_t i = 0; i < num_to_reclaim; i++) {
		mach_vm_reclaim_entry_v1_t *entry = &reclaim_entries[i];
		KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_START,
		    metadata->vdrm_pid, entry->address, entry->size,
		    entry->behavior);
		DTRACE_VM4(vm_reclaim_chunk,
		    int, metadata->vdrm_pid,
		    mach_vm_address_t, entry->address,
		    size_t, entry->size,
		    mach_vm_reclaim_behavior_v1_t, entry->behavior);
		if (entry->address != 0 && entry->size != 0) {
			switch (entry->behavior) {
			case MACH_VM_RECLAIM_DEALLOCATE:
				kr = vm_map_remove_guard(map,
				    vm_map_trunc_page(entry->address,
				    VM_MAP_PAGE_MASK(map)),
				    vm_map_round_page(entry->address + entry->size,
				    VM_MAP_PAGE_MASK(map)),
				    VM_MAP_REMOVE_GAPS_FAIL,
				    KMEM_GUARD_NONE).kmr_return;
				if (kr == KERN_INVALID_VALUE) {
					reclaim_kill_with_reason(metadata, kGUARD_EXC_DEALLOC_GAP, entry->address);
					goto fail;
				} else if (kr != KERN_SUCCESS) {
					os_log_error(vm_reclaim_log_handle,
					    "vm_reclaim: Unable to deallocate 0x%llx (%u) from 0x%llx err=%d\n",
					    entry->address, entry->size, (uint64_t) map, kr);
					reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
					goto fail;
				}
				break;
			case MACH_VM_RECLAIM_REUSABLE:
				kr = vm_map_behavior_set(map,
				    vm_map_trunc_page(entry->address, VM_MAP_PAGE_MASK(map)),
				    vm_map_round_page(entry->address + entry->size, VM_MAP_PAGE_MASK(map)),
				    VM_BEHAVIOR_REUSABLE);
				if (kr != KERN_SUCCESS) {
					os_log_error(vm_reclaim_log_handle,
					    "vm_reclaim: unable to free(reusable) 0x%llx (%u) for pid %d err=%d\n",
					    entry->address, entry->size, metadata->vdrm_pid, kr);
				}
				break;
			default:
				os_log_error(vm_reclaim_log_handle,
				    "vm_reclaim: attempted to reclaim entry with unsupported behavior %uh",
				    entry->behavior);
				reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
				kr = KERN_INVALID_VALUE;
				goto fail;
			}
			num_reclaimed++;
			os_atomic_add(&metadata->vdrm_num_bytes_reclaimed, entry->size, relaxed);
			KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_END,
			    metadata->vdrm_pid, entry->address);
		}
	}

	kr = reclaim_copyout_head(metadata, head);
	if (kr != KERN_SUCCESS) {
		goto fail;
	}

	if (options & RECLAIM_NO_FAULT) {
		vm_fault_enable();
	}
	vm_map_switch(old_map);
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_END,
	    metadata->vdrm_pid, num_to_reclaim, num_reclaimed, true);
	*num_reclaimed_out = num_reclaimed;
	if (num_to_reclaim == 0) {
		// We have exhausted the reclaimable portion of the buffer
		return KERN_NOT_FOUND;
	}
	return KERN_SUCCESS;

fail:
	if (options & RECLAIM_NO_FAULT) {
		vm_fault_enable();
	}
	vm_map_switch(old_map);
	*num_reclaimed_out = num_reclaimed;
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_END,
	    metadata->vdrm_pid, num_to_reclaim, num_reclaimed, false);
	return kr;
}

/*
 * Attempts to reclaim until the buffer's estimated number of available bytes
 * is <= num_bytes_reclaimable_threshold. The caller must own the buffer
 * (see vmdr_metadata_own()); the vdrm_lock itself must not be held.
 *
 * Writes the number of entries reclaimed to `num_reclaimed_out`.
 */
static kern_return_t
reclaim_entries_from_buffer(vm_deferred_reclamation_metadata_t metadata,
    size_t num_bytes_reclaimable_threshold, size_t *num_reclaimed_out)
{
	assert(metadata != NULL);
	assert(num_reclaimed_out != NULL);
	vmdr_metadata_assert_owned(metadata);
	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_NOTOWNED);

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ENTRIES) | DBG_FUNC_START, metadata->vdrm_pid);

	size_t num_entries_reclaimed = 0, num_bytes_reclaimed, estimated_reclaimable_bytes, reclaimable_bytes;
	while (true) {
		kern_return_t kr;
		size_t curr_entries_reclaimed = 0;
		num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed);
		reclaimable_bytes = os_atomic_load(&metadata->vdrm_num_bytes_put_in_buffer, relaxed);
		if (num_bytes_reclaimed > reclaimable_bytes) {
			estimated_reclaimable_bytes = 0;
		} else {
			estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed;
		}
		if (estimated_reclaimable_bytes <= num_bytes_reclaimable_threshold) {
			break;
		}
		kr = reclaim_chunk(metadata, &curr_entries_reclaimed,
		    RECLAIM_OPTIONS_NONE);
		if (kr == KERN_NOT_FOUND) {
			// Nothing left to reclaim
			break;
		} else if (kr != KERN_SUCCESS) {
			KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ENTRIES) | DBG_FUNC_END,
			    metadata->vdrm_pid, num_entries_reclaimed,
			    estimated_reclaimable_bytes, kr);
			*num_reclaimed_out = num_entries_reclaimed;
			return kr;
		}
		num_entries_reclaimed += curr_entries_reclaimed;
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ENTRIES) | DBG_FUNC_END,
	    metadata->vdrm_pid, num_entries_reclaimed,
	    estimated_reclaimable_bytes, KERN_SUCCESS);
	*num_reclaimed_out = num_entries_reclaimed;
	return KERN_SUCCESS;
}

/*
 * Get the reclamation metadata buffer for the given map.
 */
static vm_deferred_reclamation_metadata_t
get_task_reclaim_metadata(task_t task)
{
	assert(task != NULL);
	vm_deferred_reclamation_metadata_t metadata = NULL;
	task_lock(task);
	metadata = task->deferred_reclamation_metadata;
	task_unlock(task);
	return metadata;
}

kern_return_t
vm_deferred_reclamation_buffer_synchronize_internal(task_t task, size_t num_entries_to_reclaim)
{
	kern_return_t kr;
	vm_deferred_reclamation_metadata_t metadata = NULL;
	size_t total_reclaimed = 0;

	if (!task_is_active(task)) {
		return KERN_FAILURE;
	}

	metadata = get_task_reclaim_metadata(task);
	if (metadata == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	vmdr_metadata_own(metadata);

	while (total_reclaimed < num_entries_to_reclaim) {
		size_t num_reclaimed;
		kr = reclaim_chunk(metadata, &num_reclaimed, RECLAIM_OPTIONS_NONE);
		if (kr == KERN_NOT_FOUND) {
			/* buffer has been fully reclaimed from */
			break;
		} else if (kr != KERN_SUCCESS) {
			vmdr_metadata_disown(metadata);
			return kr;
		}

		total_reclaimed += num_reclaimed;
	}

	vmdr_metadata_disown(metadata);
	return KERN_SUCCESS;
}

kern_return_t
vm_deferred_reclamation_buffer_update_reclaimable_bytes_internal(task_t task, size_t reclaimable_bytes)
{
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
	size_t num_bytes_reclaimed, estimated_reclaimable_bytes, num_bytes_in_buffer, num_reclaimed = 0;
	bool success;
	kern_return_t kr = KERN_SUCCESS;
	if (metadata == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (!metadata->vdrm_pid) {
		metadata->vdrm_pid = task_pid(task);
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_START,
	    metadata->vdrm_pid, reclaimable_bytes);

	/*
	 * The client is allowed to make this call in parallel from multiple threads.
	 * Ensure we only ever increase the value of vdrm_num_bytes_put_in_buffer.
	 * If the client's value is smaller than what we've stored, another thread
	 * raced ahead of them and we've already acted on that accounting so this
	 * call should be a no-op.
	 */
	success = os_atomic_rmw_loop(&metadata->vdrm_num_bytes_put_in_buffer, num_bytes_in_buffer,
	    reclaimable_bytes, acquire,
	    {
		if (num_bytes_in_buffer > reclaimable_bytes) {
		        os_atomic_rmw_loop_give_up(break);
		}
	});
	if (!success) {
		/* Stale value. Nothing new to reclaim */
		goto done;
	}
	num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed);

	if (reclaimable_bytes > num_bytes_reclaimed) {
		estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed;
		if (estimated_reclaimable_bytes > vm_reclaim_max_threshold) {
			vmdr_metadata_own(metadata);
			kr = reclaim_entries_from_buffer(metadata,
			    vm_reclaim_max_threshold, &num_reclaimed);
			vmdr_metadata_disown(metadata);
		}
	}

done:
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_END,
	    metadata->vdrm_pid, reclaimable_bytes, num_bytes_reclaimed,
	    num_reclaimed);

	return kr;
}

static inline size_t
pick_reclaim_threshold(vm_deferred_reclamation_action_t action)
{
	switch (action) {
	case RECLAIM_FULL:
		return 0;
	case RECLAIM_TRIM:
		return vm_reclaim_max_threshold / vm_reclaim_trim_divisor;
	case RECLAIM_ASYNC:
		return 0;
	}
}
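
/*
 * Worked example with the default tunables: if vm_reclaim_max_threshold is
 * 64MB on a given device (the value comes from the device tree; 64MB is
 * illustrative only) and vm_reclaim_trim_divisor is 2, RECLAIM_TRIM reclaims
 * until at most 32MB of reclaimable VA remains, while RECLAIM_FULL and
 * RECLAIM_ASYNC drain the buffer completely (threshold 0).
 */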

void
vm_deferred_reclamation_reclaim_memory(vm_deferred_reclamation_action_t action, vm_deferred_reclamation_options_t options)
{
	kern_return_t kr;
	size_t num_reclaimed;
	size_t reclaim_threshold;

	switch (action) {
	case RECLAIM_ASYNC:
		lck_mtx_lock(&async_reclamation_buffers_lock);
		vmdr_process_async_reclamation_list();
		lck_mtx_unlock(&async_reclamation_buffers_lock);
		break;
	case RECLAIM_TRIM:
	case RECLAIM_FULL:
		reclaim_threshold = pick_reclaim_threshold(action);
		KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ALL_MEMORY) | DBG_FUNC_START,
		    action, reclaim_threshold);
		lck_mtx_lock(&reclamation_buffers_lock);
		reclamation_counter++;
		vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&reclamation_buffers);
		while (metadata != NULL) {
			vmdr_list_remove_locked(metadata);
			vmdr_list_append_locked(metadata);
			vmdr_metadata_retain(metadata);
			lck_mtx_unlock(&reclamation_buffers_lock);

			vmdr_metadata_lock(metadata);

			if (metadata->vdrm_reclaimed_at >= reclamation_counter) {
				// We've already seen this one. We're done
				vmdr_metadata_unlock(metadata);
				/* Drop the ref taken above before bailing out */
				vmdr_metadata_release(metadata);
				lck_mtx_lock(&reclamation_buffers_lock);
				break;
			}
			metadata->vdrm_reclaimed_at = reclamation_counter;

			if (options & RECLAIM_NO_WAIT) {
				bool acquired = vmdr_metadata_try_own_locked(metadata);
				if (!acquired) {
					vmdr_metadata_unlock(metadata);
					goto next;
				}
			} else {
				vmdr_metadata_own_locked(metadata);
			}
			vmdr_metadata_unlock(metadata);

			kr = reclaim_entries_from_buffer(metadata,
			    reclaim_threshold, &num_reclaimed);

			vmdr_metadata_disown(metadata);
next:
			vmdr_metadata_release(metadata);
			lck_mtx_lock(&reclamation_buffers_lock);
			metadata = TAILQ_FIRST(&reclamation_buffers);
		}
		lck_mtx_unlock(&reclamation_buffers_lock);
		KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ALL_MEMORY) | DBG_FUNC_END,
		    reclamation_counter);
		break;
	default:
		panic("Unexpected reclaim action %d", action);
	}
}

void
vm_deferred_reclamation_reclaim_all_memory(
	vm_deferred_reclamation_options_t options)
{
	vm_deferred_reclamation_reclaim_memory(RECLAIM_FULL, options);
}

bool
vm_deferred_reclamation_reclaim_from_task_async(task_t task)
{
	bool queued = false;
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;

	if (metadata != NULL) {
		os_log_debug(vm_reclaim_log_handle, "vm_reclaim: enqueueing %d for "
		    "asynchronous reclamation.\n", task_pid(task));
		lck_mtx_lock(&async_reclamation_buffers_lock);
		// move this buffer to the tail if still on the async list
		if (vmdr_metadata_has_pending_reclamation(metadata)) {
			vmdr_async_list_remove_locked(metadata);
		}
		vmdr_async_list_append_locked(metadata);
		lck_mtx_unlock(&async_reclamation_buffers_lock);
		queued = true;
		thread_wakeup_thread(&vm_reclaim_thread, vm_reclaim_thread);
	}

	return queued;
}

kern_return_t
vm_deferred_reclamation_reclaim_from_task_sync(task_t task, size_t max_entries_to_reclaim)
{
	kern_return_t kr;
	size_t num_reclaimed = 0;
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;

	if (!task_is_active(task) || task_is_halting(task)) {
		return KERN_ABORTED;
	}

	if (metadata != NULL) {
		vmdr_metadata_own(metadata);
		while (num_reclaimed < max_entries_to_reclaim) {
			size_t num_reclaimed_now;
			kr = reclaim_chunk(metadata, &num_reclaimed_now, RECLAIM_OPTIONS_NONE);
			if (kr == KERN_NOT_FOUND) {
				// Nothing left to reclaim
				break;
			} else if (kr != KERN_SUCCESS) {
				/* Lock has already been released and task is being killed. */
				vmdr_metadata_disown(metadata);
				return kr;
			}
			num_reclaimed += num_reclaimed_now;
		}
		vmdr_metadata_disown(metadata);
	}

	return KERN_SUCCESS;
}

vm_deferred_reclamation_metadata_t
vm_deferred_reclamation_buffer_fork(task_t task, vm_deferred_reclamation_metadata_t parent)
{
	vm_deferred_reclamation_metadata_t metadata = NULL;
	vmdr_metadata_assert_owned(parent);

	assert(task->deferred_reclamation_metadata == NULL);
	metadata = vmdr_metadata_alloc(task, parent->vdrm_reclaim_buffer,
	    parent->vdrm_buffer_size, parent->vdrm_reclaim_indices);
	vmdr_metadata_disown(parent);

	lck_mtx_lock(&reclamation_buffers_lock);
	vmdr_list_append_locked(metadata);
	lck_mtx_unlock(&reclamation_buffers_lock);

	return metadata;
}

static void
reclaim_thread_init(void)
{
#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif
	thread_set_thread_name(current_thread(), "VM_reclaim");
}


static void
vmdr_process_async_reclamation_list(void)
{
	kern_return_t kr;
	size_t total_entries_reclaimed = 0;
	size_t num_tasks_reclaimed = 0;
	LCK_MTX_ASSERT(&async_reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ASYNC_MEMORY) | DBG_FUNC_START);

	vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&async_reclamation_buffers);
	while (metadata != NULL) {
		size_t num_reclaimed;
		vmdr_metadata_retain(metadata);
		/*
		 * NB: It is safe to drop the async list lock without removing the
		 * buffer because only one thread (the reclamation thread) may consume
		 * from the async list. The buffer is guaranteed to still be in the
		 * list when the lock is re-taken.
		 */
		lck_mtx_unlock(&async_reclamation_buffers_lock);

		vmdr_metadata_own(metadata);

		/* NB: Currently the async reclaim thread fully reclaims the buffer */
		kr = reclaim_entries_from_buffer(metadata, 0, &num_reclaimed);
		total_entries_reclaimed += num_reclaimed;
		num_tasks_reclaimed++;

		assert(current_thread()->map == kernel_map);
		vmdr_metadata_disown(metadata);

		lck_mtx_lock(&async_reclamation_buffers_lock);
		/* Wakeup anyone waiting on this buffer getting processed */
		if (metadata->vdrm_waiters) {
			wakeup_all_with_inheritor(&metadata->vdrm_async_list,
			    THREAD_AWAKENED);
		}
		/*
		 * Check that the buffer has not been removed from the async list
		 * while being reclaimed from. This can happen if the task terminates
		 * while the reclamation is in flight.
		 */
		if (vmdr_metadata_has_pending_reclamation(metadata)) {
			vmdr_async_list_remove_locked(metadata);
		}
		vmdr_metadata_release(metadata);
		metadata = TAILQ_FIRST(&async_reclamation_buffers);
	}
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ASYNC_MEMORY) | DBG_FUNC_END,
	    num_tasks_reclaimed, total_entries_reclaimed);
}

__enum_decl(reclaim_thread_state, uint32_t, {
	RECLAIM_THREAD_INIT = 0,
	RECLAIM_THREAD_CONT = 1,
});

static void
reclaim_thread_continue(void)
{
	lck_mtx_lock(&async_reclamation_buffers_lock);

	vmdr_process_async_reclamation_list();
	assert_wait(&vm_reclaim_thread, THREAD_UNINT);

	lck_mtx_unlock(&async_reclamation_buffers_lock);
}

void
reclaim_thread(void *param, wait_result_t wr __unused)
{
	if (param == (void *) RECLAIM_THREAD_INIT) {
		reclaim_thread_init();
	} else {
		assert(param == (void *) RECLAIM_THREAD_CONT);
	}

	reclaim_thread_continue();

	(void) thread_block_parameter(reclaim_thread, (void*) RECLAIM_THREAD_CONT);
}

__startup_func
static void
vm_deferred_reclamation_init(void)
{
	// Note: no-op pending rdar://27006343 (Custom kernel log handles)
	vm_reclaim_log_handle = os_log_create("com.apple.xnu", "vm_reclaim");

	(void)kernel_thread_start_priority(reclaim_thread,
	    (void *)RECLAIM_THREAD_INIT, kReclaimThreadPriority,
	    &vm_reclaim_thread);
}

STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_deferred_reclamation_init);

#if DEVELOPMENT || DEBUG

bool
vm_deferred_reclamation_block_until_pid_has_been_reclaimed(int pid)
{
	vm_deferred_reclamation_metadata_t metadata = NULL;
	proc_t p = proc_find(pid);
	if (p == NULL) {
		return false;
	}
	task_t t = proc_task(p);
	if (t == NULL) {
		proc_rele(p);
		return false;
	}

	task_lock(t);
	if (!task_is_halting(t) && task_is_active(t)) {
		metadata = t->deferred_reclamation_metadata;
		if (metadata != NULL) {
			vmdr_metadata_retain(metadata);
		}
	}
	task_unlock(t);
	proc_rele(p);
	if (metadata == NULL) {
		return false;
	}

	lck_mtx_lock(&async_reclamation_buffers_lock);
	while (vmdr_metadata_has_pending_reclamation(metadata)) {
		metadata->vdrm_waiters++;
		lck_mtx_sleep_with_inheritor(&async_reclamation_buffers_lock,
		    LCK_SLEEP_DEFAULT, &metadata->vdrm_async_list, vm_reclaim_thread,
		    THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
		metadata->vdrm_waiters--;
	}
	lck_mtx_unlock(&async_reclamation_buffers_lock);

	vmdr_metadata_release(metadata);
	return true;
}

#endif /* DEVELOPMENT || DEBUG */