/*
 * Copyright (c) 2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/exc_guard.h>
#include <kern/locks.h>
#include <kern/task.h>
#include <kern/zalloc.h>
#include <kern/misc_protos.h>
#include <kern/startup.h>
#include <kern/sched.h>
#include <libkern/OSAtomic.h>
#include <mach/mach_types.h>
#include <mach/mach_vm.h>
#include <mach/vm_reclaim.h>
#include <os/log.h>
#include <pexpert/pexpert.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_reclaim_internal.h>
#include <sys/queue.h>
#include <os/atomic_private.h>

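/*
 * Overview (editorial note, derived from the code below): user space
 * publishes deferred-reclaimable address ranges in a ring buffer of
 * mach_vm_reclaim_entry_v1_t entries that it shares with the kernel.
 * A triple of indices (head, tail, busy) lives at vdrm_reclaim_indices:
 * user space appends entries at tail, the kernel consumes entries between
 * head and tail, and busy is advanced ahead of head to mark the range the
 * kernel is currently reclaiming. All index traffic crosses the
 * user/kernel boundary through atomic copyin/copyout.
 */
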
#pragma mark Tunables
TUNABLE(uint32_t, kReclaimChunkSize, "vm_reclaim_chunk_size", 16);
static integer_t kReclaimThreadPriority = BASEPRI_VM;
// Reclaim down to vm_reclaim_max_threshold / vm_reclaim_trim_divisor when doing a trim reclaim operation
TUNABLE_WRITEABLE(uint64_t, vm_reclaim_trim_divisor, "vm_reclaim_trim_divisor", 2);
// Used to debug vm_reclaim kills
TUNABLE(bool, panic_on_kill, "vm_reclaim_panic_on_kill", false);
uint64_t vm_reclaim_max_threshold;

#pragma mark Declarations
typedef struct proc *proc_t;
extern char *proc_best_name(proc_t proc);
extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code, mach_exception_data_type_t subcode);
struct proc *proc_ref(struct proc *p, int locked);
int proc_rele(proc_t p);
static bool reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head);
static bool reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail);
static bool reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy);

struct vm_deferred_reclamation_metadata_s {
    TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_list; // Global list containing every reclamation buffer
    TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_async_list; // A list containing buffers that are ripe for reclamation
    decl_lck_mtx_data(, vdrm_lock); /* Held when reclaiming from the buffer */
    /*
     * The task owns this structure, but we maintain a backpointer here
     * so that we can send an exception if we hit an error.
     * Since this is a backpointer, we don't hold a reference (it's a weak pointer).
     */
    task_t vdrm_task;
    vm_map_t vdrm_map;
    user_addr_t vdrm_reclaim_buffer;
    mach_vm_size_t vdrm_buffer_size;
    user_addr_t vdrm_reclaim_indices;
    uint64_t vdrm_reclaimed_at;
    /*
     * These two values are cumulative running sums of the bytes placed in the
     * buffer and the bytes reclaimed out of it. Both are in terms of virtual
     * memory, so they give an upper bound on the amount of physical memory
     * that can be reclaimed.
     * To estimate the amount of VA currently in the buffer, compute
     * vdrm_num_bytes_put_in_buffer - vdrm_num_bytes_reclaimed.
     * Note that neither value is protected by the vdrm_lock.
     */
    _Atomic size_t vdrm_num_bytes_put_in_buffer;
    _Atomic size_t vdrm_num_bytes_reclaimed;
};
static void process_async_reclamation_list(void);

extern void *proc_find(int pid);
extern task_t proc_task(proc_t);

#pragma mark Globals
static KALLOC_TYPE_DEFINE(vm_reclaim_metadata_zone, struct vm_deferred_reclamation_metadata_s, KT_DEFAULT);
static LCK_GRP_DECLARE(vm_reclaim_lock_grp, "vm_reclaim");
static const size_t kReclaimChunkFailed = SIZE_MAX;

/*
 * We maintain two lists of reclamation buffers.
 * The reclamation_buffers list contains every buffer in the system.
 * The async_reclamation_buffers list contains buffers that are ripe for reclamation.
 * Each list has its own lock.
 */
static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) reclamation_buffers = TAILQ_HEAD_INITIALIZER(reclamation_buffers);

static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) async_reclamation_buffers = TAILQ_HEAD_INITIALIZER(async_reclamation_buffers);
/*
 * The reclamation_buffers_lock protects the reclamation_buffers list.
 * It must be held when iterating over or manipulating the list.
 * It should be dropped when acting on a specific metadata entry after acquiring the vdrm_lock.
 */
LCK_MTX_DECLARE(reclamation_buffers_lock, &vm_reclaim_lock_grp);
LCK_MTX_DECLARE(async_reclamation_buffers_lock, &vm_reclaim_lock_grp);
static size_t reclamation_buffers_length;
static uint64_t reclamation_counter; // generation count for global reclaims

static SECURITY_READ_ONLY_LATE(thread_t) vm_reclaim_thread;
static void reclaim_thread(void *param __unused, wait_result_t wr __unused);

#pragma mark Implementation

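/*
 * Allocate and initialize buffer metadata for a task. The caller donates
 * a map reference, which is dropped when the metadata is deallocated.
 */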
static vm_deferred_reclamation_metadata_t
metadata_init(
    task_t          task,
    vm_map_t        map,
    user_addr_t     buffer,
    mach_vm_size_t  size,
    user_addr_t     indices)
{
    vm_deferred_reclamation_metadata_t metadata = zalloc_flags(vm_reclaim_metadata_zone, Z_WAITOK | Z_ZERO);
    lck_mtx_init(&(metadata->vdrm_lock), &vm_reclaim_lock_grp, LCK_ATTR_NULL);
    assert(!map->is_nested_map);
    assert(map == task->map);

    metadata->vdrm_task = task;
    metadata->vdrm_map = map;
    metadata->vdrm_reclaim_buffer = buffer;
    metadata->vdrm_buffer_size = size;
    metadata->vdrm_reclaim_indices = indices;
    return metadata;
}

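/*
 * Register a reclamation ring buffer for a task. The indices must start
 * out zeroed (head == tail == busy == 0), and a task may only register a
 * single buffer; attempts to replace an existing buffer fail.
 */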
kern_return_t
vm_deferred_reclamation_buffer_init_internal(
    task_t              task,
    mach_vm_offset_t    address,
    mach_vm_size_t      size,
    user_addr_t         indices)
{
    kern_return_t kr = KERN_FAILURE;
    vm_deferred_reclamation_metadata_t metadata = NULL;
    vm_map_t map = VM_MAP_NULL;
    bool success;
    uint64_t head = 0, tail = 0, busy = 0;

    if (address == 0 || indices == 0 || size < 2 * sizeof(mach_vm_reclaim_entry_v1_t)) {
        return KERN_INVALID_ARGUMENT;
    }

    task_lock(task);
    /* The reclamation buffer will adopt this reference. */
    map = task->map;
    vm_map_reference(map);
    task_unlock(task);

    metadata = metadata_init(task, map, address, size, indices);

    /*
     * Validate the starting indices.
     */
    success = reclaim_copyin_busy(metadata, &busy);
    if (!success) {
        kr = KERN_INVALID_ARGUMENT;
        goto out;
    }
    success = reclaim_copyin_head(metadata, &head);
    if (!success) {
        kr = KERN_INVALID_ARGUMENT;
        goto out;
    }
    success = reclaim_copyin_tail(metadata, &tail);
    if (!success) {
        kr = KERN_INVALID_ARGUMENT;
        goto out;
    }
    if (head != 0 || tail != 0 || busy != 0) {
        kr = KERN_INVALID_ARGUMENT;
        goto out;
    }

    task_lock(task);
    if (task->deferred_reclamation_metadata != NULL) {
        /* Attempting to overwrite an existing reclaim buffer is not allowed. */
        os_log_with_startup_serial(OS_LOG_DEFAULT,
            "vm_reclaim: tried to overwrite existing reclaim buffer for task %p", task);
        kr = KERN_INVALID_ARGUMENT;
        task_unlock(task);
        goto out;
    }
    task->deferred_reclamation_metadata = metadata;
    map = VM_MAP_NULL;
    kr = KERN_SUCCESS;

    task_unlock(task);
    lck_mtx_lock(&reclamation_buffers_lock);
    TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
    reclamation_buffers_length++;
    lck_mtx_unlock(&reclamation_buffers_lock);
    metadata = NULL;

out:
    if (metadata) {
        zfree(vm_reclaim_metadata_zone, metadata);
    }
    if (map) {
        vm_map_deallocate(map);
    }

    return kr;
}

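/*
 * Tear down a task's reclamation buffer: unlink it from the global and
 * async lists, drop the map reference, and free the metadata.
 */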
void
vm_deferred_reclamation_buffer_deallocate(vm_deferred_reclamation_metadata_t metadata)
{
    assert(metadata != NULL);
    /*
     * First remove the buffer from the global list so no one else can get access to it.
     */
    lck_mtx_lock(&reclamation_buffers_lock);
    TAILQ_REMOVE(&reclamation_buffers, metadata, vdrm_list);
    reclamation_buffers_length--;
    lck_mtx_unlock(&reclamation_buffers_lock);

    /*
     * Now remove it from the async list (if present).
     */
    lck_mtx_lock(&async_reclamation_buffers_lock);
    if (metadata->vdrm_async_list.tqe_next != NULL || metadata->vdrm_async_list.tqe_prev != NULL) {
        TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
        metadata->vdrm_async_list.tqe_next = NULL;
        metadata->vdrm_async_list.tqe_prev = NULL;
    }
    lck_mtx_unlock(&async_reclamation_buffers_lock);

    /*
     * Now take the metadata lock. Once we've acquired it, it's safe to free
     * the metadata: the buffer is off both lists, so no one else can find it.
     */
    lck_mtx_lock(&metadata->vdrm_lock);
    /* Drop our reference on the map */
    vm_map_deallocate(metadata->vdrm_map);
    zfree(vm_reclaim_metadata_zone, metadata);
}

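/* Compute the user-space addresses of the shared head/tail/busy indices. */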
static user_addr_t
get_head_ptr(user_addr_t indices)
{
    return indices + offsetof(mach_vm_reclaim_indices_v1_t, head);
}

static user_addr_t
get_tail_ptr(user_addr_t indices)
{
    return indices + offsetof(mach_vm_reclaim_indices_v1_t, tail);
}

static user_addr_t
get_busy_ptr(user_addr_t indices)
{
    return indices + offsetof(mach_vm_reclaim_indices_v1_t, busy);
}

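/*
 * Kill the task that owns this buffer by delivering a guard exception.
 * Expects the metadata lock to be held on entry; it is dropped before the
 * exception is delivered.
 */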
static void
reclaim_kill_with_reason(
    vm_deferred_reclamation_metadata_t metadata,
    unsigned reason,
    mach_exception_data_type_t subcode)
{
    unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
    mach_exception_code_t code = 0;
    task_t task = metadata->vdrm_task;
    proc_t p = NULL;
    boolean_t fatal = TRUE;
    bool killing_self = false;
    pid_t pid;
    int err;

    if (panic_on_kill) {
        panic("vm_reclaim: About to kill %p due to %d with subcode %lld\n", task, reason, subcode);
    }

    EXC_GUARD_ENCODE_TYPE(code, guard_type);
    EXC_GUARD_ENCODE_FLAVOR(code, reason);
    EXC_GUARD_ENCODE_TARGET(code, 0);

    assert(metadata->vdrm_task != kernel_task);
    killing_self = task == current_task();
    if (!killing_self) {
        /*
         * Grab a reference on the task to make sure it doesn't go away
         * after we drop the metadata lock.
         */
        task_reference(task);
    }
    /*
     * We need to issue a wakeup in case this kill is coming from the async path.
     * Once we drop the lock the caller can no longer do this wakeup, but
     * if there's someone blocked on this reclaim they hold a map reference
     * and thus need to be woken up so the map can be freed.
     */
    thread_wakeup(&metadata->vdrm_async_list);
    lck_mtx_unlock(&metadata->vdrm_lock);

    if (reason == kGUARD_EXC_DEALLOC_GAP) {
        task_lock(task);
        fatal = (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL);
        task_unlock(task);
    }

    if (!fatal) {
        os_log_with_startup_serial(OS_LOG_DEFAULT,
            "vm_reclaim: Skipping non-fatal guard exception.\n");
        goto out;
    }

    pid = task_pid(task);
    if (killing_self) {
        p = get_bsdtask_info(task);
    } else {
        p = proc_find(pid);
        if (p && proc_task(p) != task) {
            os_log_with_startup_serial(OS_LOG_DEFAULT,
                "vm_reclaim: Unable to deliver guard exception because proc is gone & pid rolled over.\n");
            goto out;
        }

        task_deallocate(task);
        task = NULL;
    }

    if (!p) {
        os_log_with_startup_serial(OS_LOG_DEFAULT,
            "vm_reclaim: Unable to deliver guard exception because task does not have a proc.\n");
        goto out;
    }

    err = exit_with_guard_exception(p, code, subcode);
    if (err != 0) {
        os_log_with_startup_serial(OS_LOG_DEFAULT, "vm_reclaim: Unable to deliver guard exception to %p: %d\n", p, err);
    }
out:
    if (!killing_self) {
        if (p) {
            proc_rele(p);
            p = NULL;
        }
        if (task) {
            task_deallocate(task);
            task = NULL;
        }
    }
}

static void
reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result)
{
    reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_COPYIO_FAILURE, result);
}

/*
 * Helper functions to do copyio on the head, tail, and busy pointers.
 * Note that the kernel will only write to the busy and head pointers.
 * Userspace is not supposed to write to the head or busy pointers, but the kernel
 * must be resilient to that kind of bug in userspace.
 */

static bool
reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head)
{
    int result;
    user_addr_t indices = metadata->vdrm_reclaim_indices;
    user_addr_t head_ptr = get_head_ptr(indices);

    result = copyin_atomic64(head_ptr, head);

    if (result != 0) {
        os_log_with_startup_serial(OS_LOG_DEFAULT,
            "vm_reclaim: Unable to copy head ptr from 0x%llx: err=%d\n", head_ptr, result);
        reclaim_handle_copyio_error(metadata, result);
        return false;
    }
    return true;
}

static bool
reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail)
{
    int result;
    user_addr_t indices = metadata->vdrm_reclaim_indices;
    user_addr_t tail_ptr = get_tail_ptr(indices);

    result = copyin_atomic64(tail_ptr, tail);

    if (result != 0) {
        os_log_with_startup_serial(OS_LOG_DEFAULT,
            "vm_reclaim: Unable to copy tail ptr from 0x%llx: err=%d\n", tail_ptr, result);
        reclaim_handle_copyio_error(metadata, result);
        return false;
    }
    return true;
}

static bool
reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy)
{
    int result;
    user_addr_t indices = metadata->vdrm_reclaim_indices;
    user_addr_t busy_ptr = get_busy_ptr(indices);

    result = copyin_atomic64(busy_ptr, busy);

    if (result != 0) {
        os_log_with_startup_serial(OS_LOG_DEFAULT,
            "vm_reclaim: Unable to copy busy ptr from 0x%llx: err=%d\n", busy_ptr, result);
        reclaim_handle_copyio_error(metadata, result);
        return false;
    }
    return true;
}

static bool
reclaim_copyout_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
    int result;
    user_addr_t indices = metadata->vdrm_reclaim_indices;
    user_addr_t busy_ptr = get_busy_ptr(indices);

    result = copyout_atomic64(value, busy_ptr);

    if (result != 0) {
        os_log_with_startup_serial(OS_LOG_DEFAULT,
            "vm_reclaim: Unable to copy %llu to busy ptr at 0x%llx: err=%d\n", value, busy_ptr, result);
        reclaim_handle_copyio_error(metadata, result);
        return false;
    }
    return true;
}

static bool
reclaim_copyout_head(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
    int result;
    user_addr_t indices = metadata->vdrm_reclaim_indices;
    user_addr_t head_ptr = get_head_ptr(indices);

    result = copyout_atomic64(value, head_ptr);

    if (result != 0) {
        os_log_with_startup_serial(OS_LOG_DEFAULT,
            "vm_reclaim: Unable to copy %llu to head ptr at 0x%llx: err=%d\n", value, head_ptr, result);
        reclaim_handle_copyio_error(metadata, result);
        return false;
    }
    return true;
}

/*
 * Reclaim a chunk from the buffer.
 * Returns the number of entries reclaimed, 0 if there are no entries left
 * in the buffer, or kReclaimChunkFailed on failure.
 */
static size_t
reclaim_chunk(vm_deferred_reclamation_metadata_t metadata)
{
    assert(metadata != NULL);
    LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED);

    int result = 0;
    size_t num_reclaimed = 0;
    uint64_t head = 0, tail = 0, busy = 0, num_to_reclaim = 0, new_tail = 0, num_copied = 0, buffer_len = 0;
    user_addr_t indices;
    vm_map_t map = metadata->vdrm_map, old_map;
    mach_vm_reclaim_entry_v1_t reclaim_entries[kReclaimChunkSize];
    bool success;

    buffer_len = metadata->vdrm_buffer_size / sizeof(mach_vm_reclaim_entry_v1_t);

    memset(reclaim_entries, 0, sizeof(reclaim_entries));

    indices = (user_addr_t) metadata->vdrm_reclaim_indices;
    old_map = vm_map_switch(map);

    success = reclaim_copyin_busy(metadata, &busy);
    if (!success) {
        goto fail;
    }
    success = reclaim_copyin_head(metadata, &head);
    if (!success) {
        goto fail;
    }
    success = reclaim_copyin_tail(metadata, &tail);
    if (!success) {
        goto fail;
    }

    if (busy != head) {
        // Userspace overwrote one of the pointers
        os_log_with_startup_serial(OS_LOG_DEFAULT,
            "vm_reclaim: Userspace modified head or busy pointer! %llu (0x%llx) != %llu (0x%llx) tail = %llu (0x%llx)\n",
            head, get_head_ptr(indices), busy, get_busy_ptr(indices), tail, get_tail_ptr(indices));
        reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_INDEX_FAILURE, busy);
        goto fail;
    }

    if (tail < head) {
        os_log_with_startup_serial(OS_LOG_DEFAULT,
            "vm_reclaim: Userspace modified head or tail pointer! %llu (0x%llx) != %llu (0x%llx) busy = %llu (0x%llx)\n",
            head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy, get_busy_ptr(indices));
        reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_INDEX_FAILURE, tail);
        goto fail;
    }

    num_to_reclaim = tail - head;
    while (true) {
        num_to_reclaim = MIN(num_to_reclaim, kReclaimChunkSize);
        if (num_to_reclaim == 0) {
            break;
        }
        busy = head + num_to_reclaim;
        success = reclaim_copyout_busy(metadata, busy);
        if (!success) {
            goto fail;
        }
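        /*
         * Ensure the busy update is visible to user space before we
         * re-read the tail index (editorial note: this fence orders the
         * copyout above against the copyin below).
         */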
        os_atomic_thread_fence(seq_cst);
        success = reclaim_copyin_tail(metadata, &new_tail);
        if (!success) {
            goto fail;
        }

        if (new_tail >= busy) {
            /* Got num_to_reclaim entries */
            break;
        }
        tail = new_tail;
        if (tail < head) {
            os_log_with_startup_serial(OS_LOG_DEFAULT,
                "vm_reclaim: Userspace modified head or tail pointer! %llu (0x%llx) != %llu (0x%llx) busy = %llu (0x%llx)\n",
                head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy, get_busy_ptr(indices));
            reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_INDEX_FAILURE, tail);
            goto fail;
        }
        /* Can't reclaim these entries. Try again */
        num_to_reclaim = tail - head;
        if (num_to_reclaim == 0) {
            /* Nothing left to reclaim. Reset busy to head. */
            success = reclaim_copyout_busy(metadata, head);
            if (!success) {
                goto fail;
            }
            break;
        }
        /*
         * Note that num_to_reclaim must have gotten smaller since tail got smaller,
         * so this is guaranteed to converge.
         */
    }

    while (num_copied < num_to_reclaim) {
        uint64_t memcpy_start_idx = (head % buffer_len);
        uint64_t memcpy_end_idx = memcpy_start_idx + num_to_reclaim - num_copied;
        // Clamp the end index to the buffer. We'll handle wrap-around on the next pass through the loop.
        memcpy_end_idx = MIN(memcpy_end_idx, buffer_len);
        uint64_t num_to_copy = memcpy_end_idx - memcpy_start_idx;

        assert(num_to_copy + num_copied <= kReclaimChunkSize);
        user_addr_t src_ptr = metadata->vdrm_reclaim_buffer + memcpy_start_idx * sizeof(mach_vm_reclaim_entry_v1_t);
        mach_vm_reclaim_entry_v1_t *dst_ptr = reclaim_entries + num_copied;

        result = copyin(src_ptr, dst_ptr, num_to_copy * sizeof(mach_vm_reclaim_entry_v1_t));

        if (result != 0) {
            os_log_with_startup_serial(OS_LOG_DEFAULT,
                "vm_reclaim: Unable to copyin %llu entries in reclaim buffer at 0x%llx to 0x%llx: err=%d\n",
                num_to_copy, src_ptr, (uint64_t) dst_ptr, result);
            reclaim_handle_copyio_error(metadata, result);
            goto fail;
        }

        num_copied += num_to_copy;
        head += num_to_copy;
    }

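    /* Deallocate each copied entry from the task's address space. */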
    for (size_t i = 0; i < num_to_reclaim; i++) {
        mach_vm_reclaim_entry_v1_t *entry = &reclaim_entries[i];
        if (entry->address != 0 && entry->size != 0) {
            kern_return_t kr = vm_map_remove_guard(map,
                vm_map_trunc_page(entry->address,
                VM_MAP_PAGE_MASK(map)),
                vm_map_round_page(entry->address + entry->size,
                VM_MAP_PAGE_MASK(map)),
                VM_MAP_REMOVE_GAPS_FAIL,
                KMEM_GUARD_NONE).kmr_return;
            if (kr == KERN_INVALID_VALUE) {
                reclaim_kill_with_reason(metadata, kGUARD_EXC_DEALLOC_GAP, entry->address);
                goto fail;
            } else if (kr != KERN_SUCCESS) {
                os_log_with_startup_serial(OS_LOG_DEFAULT,
                    "vm_reclaim: Unable to deallocate 0x%llx (%u) from 0x%llx. Err: %d\n",
                    entry->address, entry->size, (uint64_t) map, kr);
                reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
                goto fail;
            }
            num_reclaimed++;
            os_atomic_add(&metadata->vdrm_num_bytes_reclaimed, entry->size, relaxed);
        }
    }

    success = reclaim_copyout_head(metadata, head);
    if (!success) {
        goto fail;
    }

    vm_map_switch(old_map);
    return num_reclaimed;
fail:
    vm_map_switch(old_map);
    return kReclaimChunkFailed;
}

/*
 * Attempts to reclaim until the buffer's estimated number of available bytes
 * is <= num_bytes_reclaimable_threshold.
 * The metadata buffer lock should be held by the caller.
 *
 * Returns the number of entries reclaimed, or kReclaimChunkFailed on failure.
 */
static size_t
reclaim_entries_from_buffer(vm_deferred_reclamation_metadata_t metadata, size_t num_bytes_reclaimable_threshold)
{
    assert(metadata != NULL);
    LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED);
    if (!task_is_active(metadata->vdrm_task)) {
        /*
         * If the task is exiting, the reclaim below would likely fail and
         * fall through to the (slower) error path. So, as an optimization,
         * bail out early and report that nothing was reclaimed.
         */
        return 0;
    }

    size_t num_entries_reclaimed = 0, num_bytes_reclaimed, estimated_reclaimable_bytes, reclaimable_bytes;
    while (true) {
        size_t curr_entries_reclaimed = 0;
        num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed);
        reclaimable_bytes = os_atomic_load(&metadata->vdrm_num_bytes_put_in_buffer, relaxed);
        if (num_bytes_reclaimed > reclaimable_bytes) {
            estimated_reclaimable_bytes = 0;
        } else {
            estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed;
        }
        if (estimated_reclaimable_bytes <= num_bytes_reclaimable_threshold) {
            break;
        }
        curr_entries_reclaimed = reclaim_chunk(metadata);
        if (curr_entries_reclaimed == kReclaimChunkFailed) {
            return kReclaimChunkFailed;
        }
        if (curr_entries_reclaimed == 0) {
            break;
        }
        num_entries_reclaimed += curr_entries_reclaimed;
    }

    return num_entries_reclaimed;
}

/*
 * Get the reclamation metadata buffer for the given task.
 * If the buffer exists, it is returned locked.
 */
static vm_deferred_reclamation_metadata_t
get_task_reclaim_metadata(task_t task)
{
    assert(task != NULL);
    vm_deferred_reclamation_metadata_t metadata = NULL;
    task_lock(task);
    metadata = task->deferred_reclamation_metadata;
    if (metadata != NULL) {
        lck_mtx_lock(&metadata->vdrm_lock);
    }
    task_unlock(task);
    return metadata;
}

kern_return_t
vm_deferred_reclamation_buffer_synchronize_internal(task_t task, size_t num_entries_to_reclaim)
{
    vm_deferred_reclamation_metadata_t metadata = NULL;
    size_t total_reclaimed = 0;

    if (!task_is_active(task)) {
        return KERN_FAILURE;
    }

    metadata = get_task_reclaim_metadata(task);
    if (metadata == NULL) {
        return KERN_INVALID_ARGUMENT;
    }

    while (total_reclaimed < num_entries_to_reclaim) {
        size_t num_reclaimed = reclaim_chunk(metadata);
        if (num_reclaimed == kReclaimChunkFailed) {
            /* The lock has already been released and the task is being killed. */
            return KERN_FAILURE;
        }
        if (num_reclaimed == 0) {
            /* Nothing left to reclaim; a reclamation thread must have beaten us to it. */
            break;
        }

        total_reclaimed += num_reclaimed;
    }
    lck_mtx_unlock(&metadata->vdrm_lock);

    return KERN_SUCCESS;
}

kern_return_t
vm_deferred_reclamation_buffer_update_reclaimable_bytes_internal(task_t task, size_t reclaimable_bytes)
{
    vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
    size_t num_bytes_reclaimed, estimated_reclaimable_bytes, num_bytes_in_buffer;
    bool success;

    if (metadata == NULL) {
        return KERN_INVALID_ARGUMENT;
    }

    /*
     * The client is allowed to make this call in parallel from multiple threads.
     * Ensure we only ever increase the value of vdrm_num_bytes_put_in_buffer.
     * If the client's value is smaller than what we've stored, another thread
     * raced ahead of them and we've already acted on that accounting, so this
     * call should be a no-op.
     */
    success = os_atomic_rmw_loop(&metadata->vdrm_num_bytes_put_in_buffer,
        num_bytes_in_buffer, reclaimable_bytes, acquire, {
        if (num_bytes_in_buffer > reclaimable_bytes) {
            os_atomic_rmw_loop_give_up(break);
        }
    });
    if (!success) {
        /* Stale value. Nothing new to reclaim */
        return KERN_SUCCESS;
    }
    num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed);

    if (reclaimable_bytes > num_bytes_reclaimed) {
        estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed;
        if (estimated_reclaimable_bytes > vm_reclaim_max_threshold) {
            lck_mtx_lock(&metadata->vdrm_lock);
            size_t num_reclaimed = reclaim_entries_from_buffer(metadata, vm_reclaim_max_threshold);
            if (num_reclaimed == kReclaimChunkFailed) {
                /* The lock has already been released and the task is being killed. */
                return KERN_INVALID_ARGUMENT;
            }
            lck_mtx_unlock(&metadata->vdrm_lock);
        }
    }

    return KERN_SUCCESS;
}

static inline size_t
pick_reclaim_threshold(vm_deferred_reclamation_action_t action)
{
    switch (action) {
    case RECLAIM_FULL:
        return 0;
    case RECLAIM_TRIM:
        return vm_reclaim_max_threshold / vm_reclaim_trim_divisor;
    case RECLAIM_ASYNC:
        return 0;
    }
}

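/*
 * Reclaim memory system-wide. For RECLAIM_ASYNC, drain the async list;
 * otherwise, walk every registered buffer once, reclaiming each down to the
 * threshold for the given action. The vdrm_reclaimed_at generation count
 * keeps us from visiting a buffer twice as the list is rotated.
 */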
void
vm_deferred_reclamation_reclaim_memory(vm_deferred_reclamation_action_t action)
{
    if (action == RECLAIM_ASYNC) {
        lck_mtx_lock(&async_reclamation_buffers_lock);

        process_async_reclamation_list();
        lck_mtx_unlock(&async_reclamation_buffers_lock);
    } else {
        size_t reclaim_threshold = pick_reclaim_threshold(action);
        lck_mtx_lock(&reclamation_buffers_lock);
        reclamation_counter++;
        while (true) {
            vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&reclamation_buffers);
            if (metadata == NULL) {
                break;
            }
            lck_mtx_lock(&metadata->vdrm_lock);
            if (metadata->vdrm_reclaimed_at >= reclamation_counter) {
                // We've already seen this one. We're done
                lck_mtx_unlock(&metadata->vdrm_lock);
                break;
            }
            metadata->vdrm_reclaimed_at = reclamation_counter;

            TAILQ_REMOVE(&reclamation_buffers, metadata, vdrm_list);
            TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
            lck_mtx_unlock(&reclamation_buffers_lock);

            size_t num_reclaimed = reclaim_entries_from_buffer(metadata, reclaim_threshold);
            if (num_reclaimed != kReclaimChunkFailed) {
                lck_mtx_unlock(&metadata->vdrm_lock);
            }

            lck_mtx_lock(&reclamation_buffers_lock);
        }
        lck_mtx_unlock(&reclamation_buffers_lock);
    }
}

void
vm_deferred_reclamation_reclaim_all_memory(void)
{
    vm_deferred_reclamation_reclaim_memory(RECLAIM_FULL);
}

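/*
 * Queue a task's reclamation buffer for the async reclaim thread and wake
 * the thread. Returns true if the buffer was queued.
 */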
bool
vm_deferred_reclamation_reclaim_from_task_async(task_t task)
{
    bool queued = false;
    vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;

    if (metadata != NULL) {
        lck_mtx_lock(&async_reclamation_buffers_lock);
        TAILQ_INSERT_TAIL(&async_reclamation_buffers, metadata, vdrm_async_list);
        lck_mtx_unlock(&async_reclamation_buffers_lock);
        queued = true;
        thread_wakeup(&vm_reclaim_thread);
    }

    return queued;
}

bool
vm_deferred_reclamation_reclaim_from_task_sync(task_t task, size_t max_entries_to_reclaim)
{
    size_t num_reclaimed = 0;
    vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;

    if (!task_is_active(task)) {
        return false;
    }

    if (metadata != NULL) {
        lck_mtx_lock(&metadata->vdrm_lock);
        while (num_reclaimed < max_entries_to_reclaim) {
            size_t num_reclaimed_now = reclaim_chunk(metadata);
            if (num_reclaimed_now == kReclaimChunkFailed) {
                /* Lock has already been released and task is being killed. */
                return false;
            }
            if (num_reclaimed_now == 0) {
                // Nothing left to reclaim
                break;
            }
            num_reclaimed += num_reclaimed_now;
        }
        lck_mtx_unlock(&metadata->vdrm_lock);
    }

    return num_reclaimed > 0;
}

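/*
 * Clone the parent's buffer registration into a newly forked task.
 * Called with the parent's metadata lock held; that lock is dropped here.
 */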
vm_deferred_reclamation_metadata_t
vm_deferred_reclamation_buffer_fork(task_t task, vm_deferred_reclamation_metadata_t parent)
{
    LCK_MTX_ASSERT(&parent->vdrm_lock, LCK_MTX_ASSERT_OWNED);
    vm_map_t map = task->map;
    vm_deferred_reclamation_metadata_t metadata = NULL;

    vm_map_reference(map);
    assert(task->deferred_reclamation_metadata == NULL);
    metadata = metadata_init(task, map, parent->vdrm_reclaim_buffer, parent->vdrm_buffer_size, parent->vdrm_reclaim_indices);
    lck_mtx_unlock(&parent->vdrm_lock);

    lck_mtx_lock(&reclamation_buffers_lock);
    TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
    reclamation_buffers_length++;
    lck_mtx_unlock(&reclamation_buffers_lock);

    return metadata;
}

void
vm_deferred_reclamation_buffer_lock(vm_deferred_reclamation_metadata_t metadata)
{
    lck_mtx_lock(&metadata->vdrm_lock);
}

void
vm_deferred_reclamation_buffer_unlock(vm_deferred_reclamation_metadata_t metadata)
{
    lck_mtx_unlock(&metadata->vdrm_lock);
}

static void
reclaim_thread_init(void)
{
#if CONFIG_THREAD_GROUPS
    thread_group_vm_add();
#endif
    thread_set_thread_name(current_thread(), "VM_reclaim");
}

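/*
 * Drain the async list, fully reclaiming each queued buffer. Entered and
 * exited with the async list lock held; the lock is dropped around each
 * reclaim, and waiters on a buffer are woken once it has been processed.
 */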
static void
process_async_reclamation_list(void)
{
    LCK_MTX_ASSERT(&async_reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);

    vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&async_reclamation_buffers);
    while (metadata != NULL) {
        TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
        metadata->vdrm_async_list.tqe_next = NULL;
        metadata->vdrm_async_list.tqe_prev = NULL;
        lck_mtx_lock(&metadata->vdrm_lock);
        lck_mtx_unlock(&async_reclamation_buffers_lock);

        // NB: Currently the async reclaim thread fully reclaims the buffer.
        size_t num_reclaimed = reclaim_entries_from_buffer(metadata, 0);
        if (num_reclaimed == kReclaimChunkFailed) {
            /* The lock has already been released and the task is being killed. */
            goto next;
        }
        /* Wake up anyone waiting on this buffer getting processed */
        thread_wakeup(&metadata->vdrm_async_list);
        assert(current_thread()->map == kernel_map);
        lck_mtx_unlock(&metadata->vdrm_lock);

next:
        lck_mtx_lock(&async_reclamation_buffers_lock);
        metadata = TAILQ_FIRST(&async_reclamation_buffers);
    }
}

__enum_decl(reclaim_thread_state, uint32_t, {
    RECLAIM_THREAD_INIT = 0,
    RECLAIM_THREAD_CONT = 1,
});

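/*
 * Body of the reclaim thread: drain the async list, then block as a
 * continuation until vm_deferred_reclamation_reclaim_from_task_async
 * queues more work and wakes us.
 */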
static void
reclaim_thread_continue(void)
{
    lck_mtx_lock(&async_reclamation_buffers_lock);

    process_async_reclamation_list();
    assert_wait(&vm_reclaim_thread, THREAD_UNINT);

    lck_mtx_unlock(&async_reclamation_buffers_lock);
}

void
reclaim_thread(void *param, wait_result_t wr __unused)
{
    if (param == (void *) RECLAIM_THREAD_INIT) {
        reclaim_thread_init();
    } else {
        assert(param == (void *) RECLAIM_THREAD_CONT);
    }

    reclaim_thread_continue();

    (void) thread_block_parameter(reclaim_thread, (void *) RECLAIM_THREAD_CONT);
}

__startup_func
static void
vm_deferred_reclamation_init(void)
{
    kern_return_t result;

    if (!PE_parse_boot_argn("vm_reclaim_max_threshold", &vm_reclaim_max_threshold,
        sizeof(vm_reclaim_max_threshold))) {
        vm_reclaim_max_threshold = PAGE_SIZE;
    }

    result = kernel_thread_start_priority(reclaim_thread,
        (void *)RECLAIM_THREAD_INIT, kReclaimThreadPriority,
        &vm_reclaim_thread);
    if (result != KERN_SUCCESS) {
        /* Editorial addition: surface a startup failure instead of silently dropping it. */
        panic("vm_reclaim: unable to start reclaim thread: %d", result);
    }
}

STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_deferred_reclamation_init);

#if DEVELOPMENT || DEBUG

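/*
 * Test-only helper: block until the given pid's reclamation buffer has been
 * pulled off the async list and fully reclaimed.
 */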
bool
vm_deferred_reclamation_block_until_pid_has_been_reclaimed(int pid)
{
    vm_deferred_reclamation_metadata_t metadata = NULL;
    proc_t p = proc_find(pid);
    vm_map_t map = NULL;

    if (p == NULL) {
        return false;
    }
    task_t t = proc_task(p);
    if (t == NULL) {
        proc_rele(p);
        return false;
    }

    task_lock(t);
    if (t->map) {
        metadata = t->deferred_reclamation_metadata;
        if (metadata != NULL) {
            map = t->map;
            vm_map_reference(t->map);
        }
    }
    task_unlock(t);
    proc_rele(p);
    if (metadata == NULL) {
        return false;
    }

    lck_mtx_lock(&async_reclamation_buffers_lock);
    while (metadata->vdrm_async_list.tqe_next != NULL || metadata->vdrm_async_list.tqe_prev != NULL) {
        assert_wait(&metadata->vdrm_async_list, THREAD_UNINT);
        lck_mtx_unlock(&async_reclamation_buffers_lock);
        thread_block(THREAD_CONTINUE_NULL);
        lck_mtx_lock(&async_reclamation_buffers_lock);
    }

    /*
     * The async reclaim thread first removes the buffer from the list
     * and then reclaims it (while holding its lock).
     * So grab the metadata buffer's lock here to ensure the
     * reclaim is done.
     */
    lck_mtx_lock(&metadata->vdrm_lock);
    lck_mtx_unlock(&metadata->vdrm_lock);
    lck_mtx_unlock(&async_reclamation_buffers_lock);

    vm_map_deallocate(map);
    return true;
}

#endif /* DEVELOPMENT || DEBUG */