/*
 * Copyright (c) 2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/exc_guard.h>
#include <kern/locks.h>
#include <kern/task.h>
#include <kern/zalloc.h>
#include <kern/misc_protos.h>
#include <kern/startup.h>
#include <kern/sched.h>
#include <libkern/OSAtomic.h>
#include <mach/mach_types.h>
#include <mach/mach_vm.h>
#include <mach/vm_reclaim.h>
#include <os/log.h>
#include <pexpert/pexpert.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_reclaim_internal.h>
#include <sys/queue.h>
#include <os/atomic_private.h>

#pragma mark Tunables
TUNABLE(uint32_t, kReclaimChunkSize, "vm_reclaim_chunk_size", 16);
static integer_t kReclaimThreadPriority = BASEPRI_VM;
// Reclaim down to vm_reclaim_max_threshold / vm_reclaim_trim_divisor when doing a trim reclaim operation
TUNABLE_WRITEABLE(uint64_t, vm_reclaim_trim_divisor, "vm_reclaim_trim_divisor", 2);
// Used to debug vm_reclaim kills
TUNABLE(bool, panic_on_kill, "vm_reclaim_panic_on_kill", false);
uint64_t vm_reclaim_max_threshold;
#pragma mark Declarations
typedef struct proc *proc_t;
extern char *proc_best_name(proc_t proc);
extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code, mach_exception_data_type_t subcode);
struct proc *proc_ref(struct proc *p, int locked);
int proc_rele(proc_t p);
static bool reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head);
static bool reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail);
static bool reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy);

struct vm_deferred_reclamation_metadata_s {
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_list; // Global list containing every reclamation buffer
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_async_list; // A list containing buffers that are ripe for reclamation
	decl_lck_mtx_data(, vdrm_lock); /* Held when reclaiming from the buffer */
	/*
	 * The task owns this structure but we maintain a backpointer here
	 * so that we can send an exception if we hit an error.
	 * Since this is a backpointer we don't hold a reference (it's a weak pointer).
	 */
	task_t vdrm_task;
	vm_map_t vdrm_map;
	user_addr_t vdrm_reclaim_buffer;
	mach_vm_size_t vdrm_buffer_size;
	user_addr_t vdrm_reclaim_indices;
	uint64_t vdrm_reclaimed_at;
	/*
	 * These two values are cumulative running sums of the bytes placed in the
	 * buffer and the bytes reclaimed out of the buffer, respectively. Both are
	 * in terms of virtual memory, so they give an upper bound on the amount of
	 * physical memory that can be reclaimed. To estimate the current amount of
	 * VA in the buffer, compute vdrm_num_bytes_put_in_buffer - vdrm_num_bytes_reclaimed.
	 * Note that neither value is protected by the vdrm_lock.
	 */
	_Atomic size_t vdrm_num_bytes_put_in_buffer;
	_Atomic size_t vdrm_num_bytes_reclaimed;
};
static void process_async_reclamation_list(void);

extern void *proc_find(int pid);
extern task_t proc_task(proc_t);

#pragma mark Globals
static KALLOC_TYPE_DEFINE(vm_reclaim_metadata_zone, struct vm_deferred_reclamation_metadata_s, KT_DEFAULT);
static LCK_GRP_DECLARE(vm_reclaim_lock_grp, "vm_reclaim");
static size_t kReclaimChunkFailed = UINT64_MAX;

/*
 * We maintain two lists of reclamation buffers.
 * The reclamation_buffers list contains every buffer in the system.
 * The async_reclamation_buffers list contains buffers that are ripe for reclamation.
 * Each list has its own lock.
 */
static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) reclamation_buffers = TAILQ_HEAD_INITIALIZER(reclamation_buffers);

static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) async_reclamation_buffers = TAILQ_HEAD_INITIALIZER(async_reclamation_buffers);
/*
 * The reclamation_buffers_lock protects the reclamation_buffers list.
 * It must be held when iterating over the list or manipulating it.
 * It should be dropped when acting on a specific metadata entry after acquiring the vdrm_lock.
 */
LCK_MTX_DECLARE(reclamation_buffers_lock, &vm_reclaim_lock_grp);
LCK_MTX_DECLARE(async_reclamation_buffers_lock, &vm_reclaim_lock_grp);
static size_t reclamation_buffers_length;
static uint64_t reclamation_counter; // generation count for global reclaims

static SECURITY_READ_ONLY_LATE(thread_t) vm_reclaim_thread;
static void reclaim_thread(void *param __unused, wait_result_t wr __unused);

#pragma mark Implementation

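/*
 * Allocate and initialize the metadata for a reclamation buffer backed by
 * the given task's map. Takes a reference on the map but, deliberately,
 * not on the task (see the vdrm_task comment above).
 */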
static vm_deferred_reclamation_metadata_t
vmdr_metadata_alloc(
	task_t task,
	user_addr_t buffer,
	mach_vm_size_t size,
	user_addr_t indices)
{
	vm_deferred_reclamation_metadata_t metadata;
	vm_map_t map = task->map;

	assert(!map->is_nested_map);

	metadata = zalloc_flags(vm_reclaim_metadata_zone, Z_WAITOK | Z_ZERO);
	lck_mtx_init(&metadata->vdrm_lock, &vm_reclaim_lock_grp, LCK_ATTR_NULL);
	metadata->vdrm_task = task;
	metadata->vdrm_map = map;
	metadata->vdrm_reclaim_buffer = buffer;
	metadata->vdrm_buffer_size = size;
	metadata->vdrm_reclaim_indices = indices;

	/*
	 * We do not need to hold a lock on `task` because this is called
	 * either at fork() time or from the context of current_task().
	 */
	vm_map_reference(map);
	return metadata;
}

static void
vmdr_metadata_free(vm_deferred_reclamation_metadata_t metadata)
{
	vm_map_deallocate(metadata->vdrm_map);
	lck_mtx_destroy(&metadata->vdrm_lock, &vm_reclaim_lock_grp);
	zfree(vm_reclaim_metadata_zone, metadata);
}

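/*
 * Register a deferred reclamation buffer for `task`. The userspace-supplied
 * buffer and indices pointers must be non-NULL, the buffer must hold at
 * least two entries, and all three ring indices must start out zeroed.
 * A task may have at most one reclamation buffer.
 */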
kern_return_t
vm_deferred_reclamation_buffer_init_internal(
	task_t task,
	mach_vm_offset_t address,
	mach_vm_size_t size,
	user_addr_t indices)
{
	kern_return_t kr = KERN_FAILURE;
	vm_deferred_reclamation_metadata_t metadata = NULL;
	bool success;
	uint64_t head = 0, tail = 0, busy = 0;

	if (address == 0 || indices == 0 || size < 2 * sizeof(mach_vm_reclaim_entry_v1_t)) {
		return KERN_INVALID_ARGUMENT;
	}

	metadata = vmdr_metadata_alloc(task, address, size, indices);

	/*
	 * Validate the starting indices.
	 */
	success = reclaim_copyin_busy(metadata, &busy);
	if (!success) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}
	success = reclaim_copyin_head(metadata, &head);
	if (!success) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}
	success = reclaim_copyin_tail(metadata, &tail);
	if (!success) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}
	if (head != 0 || tail != 0 || busy != 0) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	task_lock(task);
	if (task->deferred_reclamation_metadata != NULL) {
		/* Attempting to overwrite an existing reclaim buffer is not allowed. */
		os_log_error(OS_LOG_DEFAULT,
		    "vm_reclaim: tried to overwrite existing reclaim buffer for task %p", task);
		kr = KERN_INVALID_ARGUMENT;
		task_unlock(task);
		goto out;
	}
	task->deferred_reclamation_metadata = metadata;

	task_unlock(task);
	lck_mtx_lock(&reclamation_buffers_lock);
	TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
	reclamation_buffers_length++;
	lck_mtx_unlock(&reclamation_buffers_lock);

	return KERN_SUCCESS;

out:
	vmdr_metadata_free(metadata);
	return kr;
}

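/*
 * Tear down a reclamation buffer's metadata. Unlinks it from the global
 * list (and from the async list, if queued) so that no other thread can
 * reach it, then drops the map reference and frees the metadata itself.
 */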
void
vm_deferred_reclamation_buffer_deallocate(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	/*
	 * First remove the buffer from the global list so no one else can get access to it.
	 */
	lck_mtx_lock(&reclamation_buffers_lock);
	TAILQ_REMOVE(&reclamation_buffers, metadata, vdrm_list);
	reclamation_buffers_length--;
	lck_mtx_unlock(&reclamation_buffers_lock);

	/*
	 * Now remove it from the async list (if present).
	 */
	lck_mtx_lock(&async_reclamation_buffers_lock);
	if (metadata->vdrm_async_list.tqe_next != NULL || metadata->vdrm_async_list.tqe_prev != NULL) {
		TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
		metadata->vdrm_async_list.tqe_next = NULL;
		metadata->vdrm_async_list.tqe_prev = NULL;
	}
	lck_mtx_unlock(&async_reclamation_buffers_lock);

	vmdr_metadata_free(metadata);
}

static user_addr_t
get_head_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, head);
}

static user_addr_t
get_tail_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, tail);
}

static user_addr_t
get_busy_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, busy);
}

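/*
 * Deliver a fatal guard exception to the task that owns `metadata`.
 * Expects the metadata lock to be held on entry; drops it before
 * delivering the exception. kGUARD_EXC_DEALLOC_GAP may be downgraded to
 * non-fatal depending on the task's task_exc_guard policy.
 */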
static void
reclaim_kill_with_reason(
	vm_deferred_reclamation_metadata_t metadata,
	unsigned reason,
	mach_exception_data_type_t subcode)
{
	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
	mach_exception_code_t code = 0;
	task_t task = metadata->vdrm_task;
	proc_t p = NULL;
	boolean_t fatal = TRUE;
	bool killing_self = false;
	pid_t pid;
	int err;

	if (panic_on_kill) {
		panic("vm_reclaim: About to kill %p due to %d with subcode %lld\n", task, reason, subcode);
	}

	EXC_GUARD_ENCODE_TYPE(code, guard_type);
	EXC_GUARD_ENCODE_FLAVOR(code, reason);
	EXC_GUARD_ENCODE_TARGET(code, 0);

	assert(metadata->vdrm_task != kernel_task);
	killing_self = task == current_task();
	if (!killing_self) {
		/*
		 * Grab a reference on the task to make sure it doesn't go away
		 * after we drop the metadata lock.
		 */
		task_reference(task);
	}
	/*
	 * We need to issue a wakeup in case this kill is coming from the async path.
	 * Once we drop the lock the caller can no longer do this wakeup, but
	 * if there's someone blocked on this reclaim they hold a map reference
	 * and thus need to be woken up so the map can be freed.
	 */
	thread_wakeup(&metadata->vdrm_async_list);
	lck_mtx_unlock(&metadata->vdrm_lock);

	if (reason == kGUARD_EXC_DEALLOC_GAP) {
		task_lock(task);
		fatal = (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL);
		task_unlock(task);
	}

	if (!fatal) {
		os_log_info(OS_LOG_DEFAULT,
		    "vm_reclaim: Skipping non-fatal guard exception.\n");
		goto out;
	}

	pid = task_pid(task);
	if (killing_self) {
		p = get_bsdtask_info(task);
	} else {
		p = proc_find(pid);
		if (p && proc_task(p) != task) {
			os_log_error(OS_LOG_DEFAULT,
			    "vm_reclaim: Unable to deliver guard exception because proc is gone & pid rolled over.\n");
			goto out;
		}

		task_deallocate(task);
		task = NULL;
	}

	if (!p) {
		os_log_error(OS_LOG_DEFAULT,
		    "vm_reclaim: Unable to deliver guard exception because task does not have a proc.\n");
		goto out;
	}

	err = exit_with_guard_exception(p, code, subcode);
	if (err != 0) {
		os_log_error(OS_LOG_DEFAULT, "vm_reclaim: Unable to deliver guard exception to %p: %d\n", p, err);
	}
out:
	if (!killing_self) {
		if (p) {
			proc_rele(p);
			p = NULL;
		}
		if (task) {
			task_deallocate(task);
			task = NULL;
		}
	}
}

static void
reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result)
{
	reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_COPYIO_FAILURE, result);
}

/*
 * Helper functions to do copyio on the head, tail, and busy pointers.
 * Note that the kernel will only write to the busy and head pointers.
 * Userspace is not supposed to write to the head or busy pointers, but the kernel
 * must be resilient to that kind of bug in userspace.
 */

static bool
reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t head_ptr = get_head_ptr(indices);

	result = copyin_atomic64(head_ptr, head);

	if (result != 0) {
		os_log_error(OS_LOG_DEFAULT,
		    "vm_reclaim: Unable to copy head ptr from 0x%llx: err=%d\n", head_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t tail_ptr = get_tail_ptr(indices);

	result = copyin_atomic64(tail_ptr, tail);

	if (result != 0) {
		os_log_error(OS_LOG_DEFAULT,
		    "vm_reclaim: Unable to copy tail ptr from 0x%llx: err=%d\n", tail_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t busy_ptr = get_busy_ptr(indices);

	result = copyin_atomic64(busy_ptr, busy);

	if (result != 0) {
		os_log_error(OS_LOG_DEFAULT,
		    "vm_reclaim: Unable to copy busy ptr from 0x%llx: err=%d\n", busy_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyout_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t busy_ptr = get_busy_ptr(indices);

	result = copyout_atomic64(value, busy_ptr);

	if (result != 0) {
		os_log_error(OS_LOG_DEFAULT,
		    "vm_reclaim: Unable to copy %llu to busy ptr at 0x%llx: err=%d\n", value, busy_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyout_head(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t head_ptr = get_head_ptr(indices);

	result = copyout_atomic64(value, head_ptr);

	if (result != 0) {
		os_log_error(OS_LOG_DEFAULT,
		    "vm_reclaim: Unable to copy %llu to head ptr at 0x%llx: err=%d\n", value, head_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

/*
 * Reclaim a chunk of entries from the buffer.
 * Returns the number of entries reclaimed, 0 if there are no entries left
 * in the buffer, or kReclaimChunkFailed on failure (in which case the
 * metadata lock has already been dropped).
 */
static size_t
reclaim_chunk(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED);

	int result = 0;
	size_t num_reclaimed = 0;
	uint64_t head = 0, tail = 0, busy = 0, num_to_reclaim = 0, new_tail = 0, num_copied = 0, buffer_len = 0;
	user_addr_t indices;
	vm_map_t map = metadata->vdrm_map, old_map;
	mach_vm_reclaim_entry_v1_t reclaim_entries[kReclaimChunkSize];
	bool success;

	buffer_len = metadata->vdrm_buffer_size / sizeof(mach_vm_reclaim_entry_v1_t);

	memset(reclaim_entries, 0, sizeof(reclaim_entries));

	indices = (user_addr_t) metadata->vdrm_reclaim_indices;
	old_map = vm_map_switch(map);

	success = reclaim_copyin_busy(metadata, &busy);
	if (!success) {
		goto fail;
	}
	success = reclaim_copyin_head(metadata, &head);
	if (!success) {
		goto fail;
	}
	success = reclaim_copyin_tail(metadata, &tail);
	if (!success) {
		goto fail;
	}

	if (busy != head) {
		// Userspace overwrote one of the pointers
		os_log_error(OS_LOG_DEFAULT,
		    "vm_reclaim: Userspace modified head or busy pointer! head: %llu (0x%llx) != busy: %llu (0x%llx) | tail = %llu (0x%llx)\n",
		    head, get_head_ptr(indices), busy, get_busy_ptr(indices), tail, get_tail_ptr(indices));
		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_INDEX_FAILURE, busy);
		goto fail;
	}

	if (tail < head) {
		// Userspace is likely in the middle of trying to re-use an entry, bail on this reclamation
		os_log_error(OS_LOG_DEFAULT,
		    "vm_reclaim: Userspace modified head or tail pointer! head: %llu (0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
		    head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy, get_busy_ptr(indices));
		lck_mtx_unlock(&metadata->vdrm_lock);
		goto fail;
	}

	num_to_reclaim = tail - head;
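	/*
	 * Claim entries by advancing busy past head, then re-read the tail:
	 * if userspace concurrently moved the tail back (to re-use entries),
	 * shrink the claim and retry until both sides agree on a
	 * [head, busy) window.
	 */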
	while (true) {
		num_to_reclaim = MIN(num_to_reclaim, kReclaimChunkSize);
		if (num_to_reclaim == 0) {
			break;
		}
		busy = head + num_to_reclaim;
		success = reclaim_copyout_busy(metadata, busy);
		if (!success) {
			goto fail;
		}
		os_atomic_thread_fence(seq_cst);
		success = reclaim_copyin_tail(metadata, &new_tail);
		if (!success) {
			goto fail;
		}

		if (new_tail >= busy) {
			/* Got num_to_reclaim entries */
			break;
		}
		tail = new_tail;
		if (tail < head) {
			// Userspace is likely in the middle of trying to re-use an entry, bail on this reclamation
			os_log_error(OS_LOG_DEFAULT,
			    "vm_reclaim: Userspace modified head or tail pointer! head: %llu (0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
			    head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy, get_busy_ptr(indices));
			lck_mtx_unlock(&metadata->vdrm_lock);
			goto fail;
		}
		/* Can't reclaim these entries. Try again */
		num_to_reclaim = tail - head;
		if (num_to_reclaim == 0) {
			/* Nothing left to reclaim. Reset busy to head. */
			success = reclaim_copyout_busy(metadata, head);
			if (!success) {
				goto fail;
			}
			break;
		}
		/*
		 * Note that num_to_reclaim must have gotten smaller since tail got smaller,
		 * so this is guaranteed to converge.
		 */
	}

	while (num_copied < num_to_reclaim) {
		uint64_t memcpy_start_idx = (head % buffer_len);
		uint64_t memcpy_end_idx = memcpy_start_idx + num_to_reclaim - num_copied;
		// Clamp the end idx to the buffer. We'll handle wrap-around in our next go around the loop.
		memcpy_end_idx = MIN(memcpy_end_idx, buffer_len);
		uint64_t num_to_copy = memcpy_end_idx - memcpy_start_idx;

		assert(num_to_copy + num_copied <= kReclaimChunkSize);
		user_addr_t src_ptr = metadata->vdrm_reclaim_buffer + memcpy_start_idx * sizeof(mach_vm_reclaim_entry_v1_t);
		mach_vm_reclaim_entry_v1_t *dst_ptr = reclaim_entries + num_copied;

		result = copyin(src_ptr, dst_ptr, num_to_copy * sizeof(mach_vm_reclaim_entry_v1_t));

		if (result != 0) {
			os_log_error(OS_LOG_DEFAULT,
			    "vm_reclaim: Unable to copyin %llu entries in reclaim buffer at 0x%llx to 0x%llx: err=%d\n",
			    num_to_copy, src_ptr, (uint64_t) dst_ptr, result);
			reclaim_handle_copyio_error(metadata, result);
			goto fail;
		}

		num_copied += num_to_copy;
		head += num_to_copy;
	}

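	/*
	 * Deallocate the claimed entries from the task's map. A gap (an entry
	 * covering VA that is no longer mapped) is treated as a
	 * double-free-style bug and raises a guard exception.
	 */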
	for (size_t i = 0; i < num_to_reclaim; i++) {
		mach_vm_reclaim_entry_v1_t *entry = &reclaim_entries[i];
		if (entry->address != 0 && entry->size != 0) {
			kern_return_t kr = vm_map_remove_guard(map,
			    vm_map_trunc_page(entry->address,
			    VM_MAP_PAGE_MASK(map)),
			    vm_map_round_page(entry->address + entry->size,
			    VM_MAP_PAGE_MASK(map)),
			    VM_MAP_REMOVE_GAPS_FAIL,
			    KMEM_GUARD_NONE).kmr_return;
			if (kr == KERN_INVALID_VALUE) {
				reclaim_kill_with_reason(metadata, kGUARD_EXC_DEALLOC_GAP, entry->address);
				goto fail;
			} else if (kr != KERN_SUCCESS) {
				os_log_error(OS_LOG_DEFAULT,
				    "vm_reclaim: Unable to deallocate 0x%llx (%u) from 0x%llx. Err: %d\n",
				    entry->address, entry->size, (uint64_t) map, kr);
				reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
				goto fail;
			}
			num_reclaimed++;
			os_atomic_add(&metadata->vdrm_num_bytes_reclaimed, entry->size, relaxed);
		}
	}

	success = reclaim_copyout_head(metadata, head);
	if (!success) {
		goto fail;
	}

	vm_map_switch(old_map);
	return num_reclaimed;
fail:
	vm_map_switch(old_map);
	return kReclaimChunkFailed;
}

/*
 * Attempts to reclaim until the buffer's estimated number of available bytes
 * is <= num_bytes_reclaimable_threshold. The metadata buffer lock should be
 * held by the caller.
 *
 * Returns the number of entries reclaimed, or kReclaimChunkFailed on failure.
 */
static size_t
reclaim_entries_from_buffer(vm_deferred_reclamation_metadata_t metadata, size_t num_bytes_reclaimable_threshold)
{
	assert(metadata != NULL);
	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED);
	if (!task_is_active(metadata->vdrm_task)) {
		/*
		 * If the task is exiting, the reclaim below will likely fail and fall through
		 * to the (slower) error path.
		 * So as an optimization, we bail out early here.
		 */
		return 0;
	}

	size_t num_entries_reclaimed = 0, num_bytes_reclaimed, estimated_reclaimable_bytes, reclaimable_bytes;
	while (true) {
		size_t curr_entries_reclaimed = 0;
		num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed);
		reclaimable_bytes = os_atomic_load(&metadata->vdrm_num_bytes_put_in_buffer, relaxed);
		if (num_bytes_reclaimed > reclaimable_bytes) {
			estimated_reclaimable_bytes = 0;
		} else {
			estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed;
		}
		if (estimated_reclaimable_bytes <= num_bytes_reclaimable_threshold) {
			break;
		}
		curr_entries_reclaimed = reclaim_chunk(metadata);
		if (curr_entries_reclaimed == kReclaimChunkFailed) {
			return kReclaimChunkFailed;
		}
		if (curr_entries_reclaimed == 0) {
			break;
		}
		num_entries_reclaimed += curr_entries_reclaimed;
	}

	return num_entries_reclaimed;
}

/*
 * Get the reclamation metadata buffer for the given task.
 * If the buffer exists it is returned locked.
 */
static vm_deferred_reclamation_metadata_t
get_task_reclaim_metadata(task_t task)
{
	assert(task != NULL);
	vm_deferred_reclamation_metadata_t metadata = NULL;
	task_lock(task);
	metadata = task->deferred_reclamation_metadata;
	if (metadata != NULL) {
		lck_mtx_lock(&metadata->vdrm_lock);
	}
	task_unlock(task);
	return metadata;
}

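/*
 * Synchronously reclaim up to num_entries_to_reclaim entries from the
 * task's buffer, stopping early if the buffer runs dry. Returns
 * KERN_FAILURE if the task is inactive or being killed, and
 * KERN_INVALID_ARGUMENT if it has no reclamation buffer.
 */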
kern_return_t
vm_deferred_reclamation_buffer_synchronize_internal(task_t task, size_t num_entries_to_reclaim)
{
	vm_deferred_reclamation_metadata_t metadata = NULL;
	size_t total_reclaimed = 0;

	if (!task_is_active(task)) {
		return KERN_FAILURE;
	}

	metadata = get_task_reclaim_metadata(task);
	if (metadata == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	while (total_reclaimed < num_entries_to_reclaim) {
		size_t num_reclaimed = reclaim_chunk(metadata);
		if (num_reclaimed == kReclaimChunkFailed) {
			/* Lock has already been released and task is being killed. */
			return KERN_FAILURE;
		}
		if (num_reclaimed == 0) {
			/* There was nothing to reclaim. A reclamation thread must have beaten us to it. Nothing to do here. */
			break;
		}

		total_reclaimed += num_reclaimed;
	}
	lck_mtx_unlock(&metadata->vdrm_lock);

	return KERN_SUCCESS;
}

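/*
 * Record userspace's latest running total of bytes placed in the buffer.
 * If the estimated reclaimable VA now exceeds vm_reclaim_max_threshold,
 * reclaim the buffer back down to that threshold. Safe to call from
 * multiple threads in parallel; stale (smaller) totals are ignored.
 */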
kern_return_t
vm_deferred_reclamation_buffer_update_reclaimable_bytes_internal(task_t task, size_t reclaimable_bytes)
{
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
	size_t num_bytes_reclaimed, estimated_reclaimable_bytes, num_bytes_in_buffer;
	bool success;
	if (metadata == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * The client is allowed to make this call in parallel from multiple threads.
	 * Ensure we only ever increase the value of vdrm_num_bytes_put_in_buffer.
	 * If the client's value is smaller than what we've stored, another thread
	 * raced ahead of them and we've already acted on that accounting so this
	 * call should be a no-op.
	 */
	success = os_atomic_rmw_loop(&metadata->vdrm_num_bytes_put_in_buffer, num_bytes_in_buffer,
	    reclaimable_bytes, acquire,
	    {
		if (num_bytes_in_buffer > reclaimable_bytes) {
		        os_atomic_rmw_loop_give_up(break);
		}
	});
	if (!success) {
		/* Stale value. Nothing new to reclaim. */
		return KERN_SUCCESS;
	}
	num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed);

	if (reclaimable_bytes > num_bytes_reclaimed) {
		estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed;
		if (estimated_reclaimable_bytes > vm_reclaim_max_threshold) {
			lck_mtx_lock(&metadata->vdrm_lock);
			size_t num_reclaimed = reclaim_entries_from_buffer(metadata, vm_reclaim_max_threshold);
			if (num_reclaimed == kReclaimChunkFailed) {
				/* Lock has already been released & task is in the process of getting killed. */
				return KERN_INVALID_ARGUMENT;
			}
			lck_mtx_unlock(&metadata->vdrm_lock);
		}
	}

	return KERN_SUCCESS;
}

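/*
 * Map a reclamation action to the number of reclaimable bytes that may be
 * left behind in a buffer: full and async reclaims drain buffers
 * completely, while trims only reclaim down to a fraction of the max
 * threshold.
 */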
static inline size_t
pick_reclaim_threshold(vm_deferred_reclamation_action_t action)
{
	switch (action) {
	case RECLAIM_FULL:
		return 0;
	case RECLAIM_TRIM:
		return vm_reclaim_max_threshold / vm_reclaim_trim_divisor;
	case RECLAIM_ASYNC:
		return 0;
	}
}

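/*
 * Reclaim from every buffer in the system. For RECLAIM_ASYNC this drains
 * the async queue; otherwise it walks the global buffer list exactly once,
 * using the reclamation_counter generation count to detect buffers it has
 * already visited (entries are rotated to the tail of the list as they
 * are processed).
 */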
void
vm_deferred_reclamation_reclaim_memory(vm_deferred_reclamation_action_t action)
{
	if (action == RECLAIM_ASYNC) {
		lck_mtx_lock(&async_reclamation_buffers_lock);

		process_async_reclamation_list();
		lck_mtx_unlock(&async_reclamation_buffers_lock);
	} else {
		size_t reclaim_threshold = pick_reclaim_threshold(action);
		lck_mtx_lock(&reclamation_buffers_lock);
		reclamation_counter++;
		while (true) {
			vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&reclamation_buffers);
			if (metadata == NULL) {
				break;
			}
			lck_mtx_lock(&metadata->vdrm_lock);
			if (metadata->vdrm_reclaimed_at >= reclamation_counter) {
				// We've already seen this one. We're done.
				lck_mtx_unlock(&metadata->vdrm_lock);
				break;
			}
			metadata->vdrm_reclaimed_at = reclamation_counter;

			TAILQ_REMOVE(&reclamation_buffers, metadata, vdrm_list);
			TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
			lck_mtx_unlock(&reclamation_buffers_lock);

			size_t num_reclaimed = reclaim_entries_from_buffer(metadata, reclaim_threshold);
			if (num_reclaimed != kReclaimChunkFailed) {
				lck_mtx_unlock(&metadata->vdrm_lock);
			}

			lck_mtx_lock(&reclamation_buffers_lock);
		}
		lck_mtx_unlock(&reclamation_buffers_lock);
	}
}

void
vm_deferred_reclamation_reclaim_all_memory(void)
{
	vm_deferred_reclamation_reclaim_memory(RECLAIM_FULL);
}

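/*
 * Queue the task's buffer for the async reclaim thread and wake it up.
 * Returns true if the buffer was queued.
 */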
bool
vm_deferred_reclamation_reclaim_from_task_async(task_t task)
{
	bool queued = false;
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;

	if (metadata != NULL) {
		lck_mtx_lock(&async_reclamation_buffers_lock);
		TAILQ_INSERT_TAIL(&async_reclamation_buffers, metadata, vdrm_async_list);
		lck_mtx_unlock(&async_reclamation_buffers_lock);
		queued = true;
		thread_wakeup(&vm_reclaim_thread);
	}

	return queued;
}

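/*
 * Synchronously reclaim up to max_entries_to_reclaim entries from the
 * task's buffer. Returns true if at least one entry was reclaimed.
 */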
bool
vm_deferred_reclamation_reclaim_from_task_sync(task_t task, size_t max_entries_to_reclaim)
{
	size_t num_reclaimed = 0;
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;

	if (!task_is_active(task)) {
		return false;
	}

	if (metadata != NULL) {
		lck_mtx_lock(&metadata->vdrm_lock);
		while (num_reclaimed < max_entries_to_reclaim) {
			size_t num_reclaimed_now = reclaim_chunk(metadata);
			if (num_reclaimed_now == kReclaimChunkFailed) {
				/* Lock has already been released and task is being killed. */
				return false;
			}
			if (num_reclaimed_now == 0) {
				// Nothing left to reclaim
				break;
			}
			num_reclaimed += num_reclaimed_now;
		}
		lck_mtx_unlock(&metadata->vdrm_lock);
	}

	return num_reclaimed > 0;
}

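/*
 * Create a child task's reclamation metadata at fork() time, inheriting
 * the parent's buffer location and size. Expects the parent's metadata
 * lock to be held on entry and drops it before returning.
 */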
vm_deferred_reclamation_metadata_t
vm_deferred_reclamation_buffer_fork(task_t task, vm_deferred_reclamation_metadata_t parent)
{
	vm_deferred_reclamation_metadata_t metadata = NULL;

	LCK_MTX_ASSERT(&parent->vdrm_lock, LCK_MTX_ASSERT_OWNED);

	assert(task->deferred_reclamation_metadata == NULL);
	metadata = vmdr_metadata_alloc(task, parent->vdrm_reclaim_buffer,
	    parent->vdrm_buffer_size, parent->vdrm_reclaim_indices);
	lck_mtx_unlock(&parent->vdrm_lock);

	lck_mtx_lock(&reclamation_buffers_lock);
	TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
	reclamation_buffers_length++;
	lck_mtx_unlock(&reclamation_buffers_lock);

	return metadata;
}

void
vm_deferred_reclamation_buffer_lock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_lock(&metadata->vdrm_lock);
}

void
vm_deferred_reclamation_buffer_unlock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_unlock(&metadata->vdrm_lock);
}

static void
reclaim_thread_init(void)
{
#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif
	thread_set_thread_name(current_thread(), "VM_reclaim");
}

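/*
 * Drain the async reclamation queue, fully reclaiming each buffer on it.
 * Called with the async_reclamation_buffers_lock held; the lock is dropped
 * while reclaiming from an individual buffer and re-acquired before
 * returning.
 */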
static void
process_async_reclamation_list(void)
{
	LCK_MTX_ASSERT(&async_reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);

	vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&async_reclamation_buffers);
	while (metadata != NULL) {
		TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
		metadata->vdrm_async_list.tqe_next = NULL;
		metadata->vdrm_async_list.tqe_prev = NULL;
		lck_mtx_lock(&metadata->vdrm_lock);
		lck_mtx_unlock(&async_reclamation_buffers_lock);

		// NB: Currently the async reclaim thread fully reclaims the buffer.
		size_t num_reclaimed = reclaim_entries_from_buffer(metadata, 0);
		if (num_reclaimed == kReclaimChunkFailed) {
			/* Lock has already been released & task is in the process of getting killed. */
			goto next;
		}
		/* Wake up anyone waiting on this buffer getting processed */
		thread_wakeup(&metadata->vdrm_async_list);
		assert(current_thread()->map == kernel_map);
		lck_mtx_unlock(&metadata->vdrm_lock);

next:
		lck_mtx_lock(&async_reclamation_buffers_lock);
		metadata = TAILQ_FIRST(&async_reclamation_buffers);
	}
}

__enum_decl(reclaim_thread_state, uint32_t, {
	RECLAIM_THREAD_INIT = 0,
	RECLAIM_THREAD_CONT = 1,
});

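/*
 * Body of the dedicated reclamation thread: drain the async queue, then
 * block until vm_deferred_reclamation_reclaim_from_task_async() issues the
 * next wakeup. The thread re-enters reclaim_thread() as a continuation
 * with RECLAIM_THREAD_CONT.
 */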
static void
reclaim_thread_continue(void)
{
	lck_mtx_lock(&async_reclamation_buffers_lock);

	process_async_reclamation_list();
	assert_wait(&vm_reclaim_thread, THREAD_UNINT);

	lck_mtx_unlock(&async_reclamation_buffers_lock);
}

void
reclaim_thread(void *param, wait_result_t wr __unused)
{
	if (param == (void *) RECLAIM_THREAD_INIT) {
		reclaim_thread_init();
	} else {
		assert(param == (void *) RECLAIM_THREAD_CONT);
	}

	reclaim_thread_continue();

	(void) thread_block_parameter(reclaim_thread, (void *) RECLAIM_THREAD_CONT);
}

__startup_func
static void
vm_deferred_reclamation_init(void)
{
	if (!PE_parse_boot_argn("vm_reclaim_max_threshold",
	    &vm_reclaim_max_threshold, sizeof(vm_reclaim_max_threshold))) {
		vm_reclaim_max_threshold = PAGE_SIZE;
	}

	(void)kernel_thread_start_priority(reclaim_thread,
	    (void *)RECLAIM_THREAD_INIT, kReclaimThreadPriority,
	    &vm_reclaim_thread);
}

STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_deferred_reclamation_init);

#if DEVELOPMENT || DEBUG

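/*
 * Test-only helper: block until pid's buffer has been drained by the async
 * reclaim thread, i.e. it is off the async list and any in-flight reclaim
 * has completed. Returns false if the pid cannot be resolved or has no
 * reclamation buffer.
 */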
bool
vm_deferred_reclamation_block_until_pid_has_been_reclaimed(int pid)
{
	vm_deferred_reclamation_metadata_t metadata = NULL;
	proc_t p = proc_find(pid);
	vm_map_t map = NULL;
	if (p == NULL) {
		return false;
	}
	task_t t = proc_task(p);
	if (t == NULL) {
		proc_rele(p);
		return false;
	}

	task_lock(t);
	if (t->map) {
		metadata = t->deferred_reclamation_metadata;
		if (metadata != NULL) {
			map = t->map;
			vm_map_reference(t->map);
		}
	}
	task_unlock(t);
	proc_rele(p);
	if (metadata == NULL) {
		return false;
	}

	lck_mtx_lock(&async_reclamation_buffers_lock);
	while (metadata->vdrm_async_list.tqe_next != NULL || metadata->vdrm_async_list.tqe_prev != NULL) {
		assert_wait(&metadata->vdrm_async_list, THREAD_UNINT);
		lck_mtx_unlock(&async_reclamation_buffers_lock);
		thread_block(THREAD_CONTINUE_NULL);
		lck_mtx_lock(&async_reclamation_buffers_lock);
	}

	/*
	 * The async reclaim thread first removes the buffer from the list
	 * and then reclaims it (while holding its lock).
	 * So grab the metadata buffer's lock here to ensure the
	 * reclaim is done.
	 */
	lck_mtx_lock(&metadata->vdrm_lock);
	lck_mtx_unlock(&metadata->vdrm_lock);
	lck_mtx_unlock(&async_reclamation_buffers_lock);

	vm_map_deallocate(map);
	return true;
}

#endif /* DEVELOPMENT || DEBUG */