/* xref: /xnu-8796.141.3/osfmk/vm/vm_reclaim.c (revision 1b191cb58250d0705d8a51287127505aa4bc0789) */
1 /*
2  * Copyright (c) 2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <kern/exc_guard.h>
30 #include <kern/locks.h>
31 #include <kern/task.h>
32 #include <kern/zalloc.h>
33 #include <kern/misc_protos.h>
34 #include <kern/startup.h>
35 #include <kern/sched.h>
36 #include <libkern/OSAtomic.h>
37 #include <mach/mach_types.h>
38 #include <mach/mach_vm.h>
39 #include <mach/vm_reclaim.h>
40 #include <os/log.h>
41 #include <pexpert/pexpert.h>
42 #include <vm/vm_map_internal.h>
43 #include <vm/vm_reclaim_internal.h>
44 #include <sys/queue.h>
45 #include <os/atomic_private.h>
46 
#pragma mark Tunables
// Number of reclaim entries copied in and processed per batch (boot-arg tunable)
TUNABLE(uint32_t, kReclaimChunkSize, "vm_reclaim_chunk_size", 16);
static integer_t kReclaimThreadPriority = BASEPRI_VM;
// Reclaim down to vm_reclaim_max_threshold / vm_reclaim_trim_divisor when doing a trim reclaim operation
TUNABLE_WRITEABLE(uint64_t, vm_reclaim_trim_divisor, "vm_reclaim_trim_divisor", 2);
// Used to debug vm_reclaim kills
TUNABLE(bool, panic_on_kill, "vm_reclaim_panic_on_kill", false);
// Max estimated reclaimable bytes a buffer may accumulate before a reclaim is
// forced (see vm_deferred_reclamation_buffer_update_reclaimable_bytes_internal)
uint64_t vm_reclaim_max_threshold;
55 
#pragma mark Declarations
typedef struct proc *proc_t;
extern char *proc_best_name(proc_t proc);
extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code, mach_exception_data_type_t subcode);
struct proc *proc_ref(struct proc *p, int locked);
int proc_rele(proc_t p);
/* Forward declarations for the copyin helpers used during buffer validation */
static bool reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head);
static bool reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail);
static bool reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy);

/*
 * Per-task state describing one userspace deferred-reclamation ring buffer.
 */
struct vm_deferred_reclamation_metadata_s {
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_list; // Global list containing every reclamation buffer
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_async_list; // A list containing buffers that are ripe for reclamation
	decl_lck_mtx_data(, vdrm_lock); /* Held when reclaiming from the buffer */
	/*
	 * The task owns this structure but we maintain a backpointer here
	 * so that we can send an exception if we hit an error.
	 * Since this is a backpointer we don't hold a reference (it's a weak pointer).
	 */
	task_t vdrm_task;
	vm_map_t vdrm_map;                    /* referenced; released in vmdr_metadata_free() */
	user_addr_t vdrm_reclaim_buffer;      /* userspace ring of mach_vm_reclaim_entry_v1_t */
	mach_vm_size_t vdrm_buffer_size;      /* size of that ring, in bytes */
	user_addr_t vdrm_reclaim_indices;     /* userspace mach_vm_reclaim_indices_v1_t (head/tail/busy) */
	uint64_t vdrm_reclaimed_at;           /* generation stamp; compared to reclamation_counter */
	/*
	 * These two values represent running sums of bytes placed in the buffer and bytes reclaimed out of the buffer
	 * cumulatively. Both values are in terms of virtual memory, so they give an upper bound
	 * on the amount of physical memory that can be reclaimed.
	 * To get an estimate of the current amount of VA in the buffer do
	 * vdrm_num_bytes_put_in_buffer - vdrm_num_bytes_reclaimed.
	 * Note that neither value is protected by the vdrm_lock.
	 */
	_Atomic size_t vdrm_num_bytes_put_in_buffer;
	_Atomic size_t vdrm_num_bytes_reclaimed;
};
static void process_async_reclamation_list(void);

extern void *proc_find(int pid);
extern task_t proc_task(proc_t);
95 
#pragma mark Globals
static KALLOC_TYPE_DEFINE(vm_reclaim_metadata_zone, struct vm_deferred_reclamation_metadata_s, KT_DEFAULT);
static LCK_GRP_DECLARE(vm_reclaim_lock_grp, "vm_reclaim");
// Sentinel returned by reclaim_chunk()/reclaim_entries_from_buffer() on failure.
// Never a valid count; it also signals that vdrm_lock has already been dropped.
static size_t kReclaimChunkFailed = UINT64_MAX;

/*
 * We maintain two lists of reclamation buffers.
 * The reclamation_buffers list contains every buffer in the system.
 * The async_reclamation_buffers_list contains buffers that are ripe for reclamation.
 * Each list has its own lock.
 */
static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) reclamation_buffers = TAILQ_HEAD_INITIALIZER(reclamation_buffers);

static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) async_reclamation_buffers = TAILQ_HEAD_INITIALIZER(async_reclamation_buffers);
/*
 * The reclamation_buffers_lock protects the reclamation_buffers list.
 * It must be held when iterating over the list or manipulating the list.
 * It should be dropped when acting on a specific metadata entry after acquiring the vdrm_lock.
 */
LCK_MTX_DECLARE(reclamation_buffers_lock, &vm_reclaim_lock_grp);
LCK_MTX_DECLARE(async_reclamation_buffers_lock, &vm_reclaim_lock_grp);
static size_t reclamation_buffers_length;
static uint64_t reclamation_counter; // generation count for global reclaims

static SECURITY_READ_ONLY_LATE(thread_t) vm_reclaim_thread;
static void reclaim_thread(void *param __unused, wait_result_t wr __unused);
122 
123 #pragma mark Implementation
124 
125 static vm_deferred_reclamation_metadata_t
vmdr_metadata_alloc(task_t task,user_addr_t buffer,mach_vm_size_t size,user_addr_t indices)126 vmdr_metadata_alloc(
127 	task_t                  task,
128 	user_addr_t             buffer,
129 	mach_vm_size_t          size,
130 	user_addr_t             indices)
131 {
132 	vm_deferred_reclamation_metadata_t metadata;
133 	vm_map_t map = task->map;
134 
135 	assert(!map->is_nested_map);
136 
137 	metadata = zalloc_flags(vm_reclaim_metadata_zone, Z_WAITOK | Z_ZERO);
138 	lck_mtx_init(&metadata->vdrm_lock, &vm_reclaim_lock_grp, LCK_ATTR_NULL);
139 	metadata->vdrm_task = task;
140 	metadata->vdrm_map = map;
141 	metadata->vdrm_reclaim_buffer = buffer;
142 	metadata->vdrm_buffer_size = size;
143 	metadata->vdrm_reclaim_indices = indices;
144 
145 	/*
146 	 * we do not need to hold a lock on `task` because this is called
147 	 * either at fork() time or from the context of current_task().
148 	 */
149 	vm_map_reference(map);
150 	return metadata;
151 }
152 
/*
 * Release a metadata structure allocated by vmdr_metadata_alloc().
 * Caller must have removed it from all lists and must not hold vdrm_lock.
 */
static void
vmdr_metadata_free(vm_deferred_reclamation_metadata_t metadata)
{
	/* Drop the map reference taken in vmdr_metadata_alloc() */
	vm_map_deallocate(metadata->vdrm_map);
	lck_mtx_destroy(&metadata->vdrm_lock, &vm_reclaim_lock_grp);
	zfree(vm_reclaim_metadata_zone, metadata);
}
160 
161 kern_return_t
vm_deferred_reclamation_buffer_init_internal(task_t task,mach_vm_offset_t address,mach_vm_size_t size,user_addr_t indices)162 vm_deferred_reclamation_buffer_init_internal(
163 	task_t                  task,
164 	mach_vm_offset_t        address,
165 	mach_vm_size_t          size,
166 	user_addr_t             indices)
167 {
168 	kern_return_t kr = KERN_FAILURE;
169 	vm_deferred_reclamation_metadata_t metadata = NULL;
170 	bool success;
171 	uint64_t head = 0, tail = 0, busy = 0;
172 
173 	if (address == 0 || indices == 0 || size < 2 * sizeof(mach_vm_reclaim_entry_v1_t)) {
174 		return KERN_INVALID_ARGUMENT;
175 	}
176 
177 	metadata = vmdr_metadata_alloc(task, address, size, indices);
178 
179 	/*
180 	 * Validate the starting indices
181 	 */
182 	success = reclaim_copyin_busy(metadata, &busy);
183 	if (!success) {
184 		kr = KERN_INVALID_ARGUMENT;
185 		goto out;
186 	}
187 	success = reclaim_copyin_head(metadata, &head);
188 	if (!success) {
189 		kr = KERN_INVALID_ARGUMENT;
190 		goto out;
191 	}
192 	success = reclaim_copyin_tail(metadata, &tail);
193 	if (!success) {
194 		kr = KERN_INVALID_ARGUMENT;
195 		goto out;
196 	}
197 	if (head != 0 || tail != 0 || busy != 0) {
198 		kr = KERN_INVALID_ARGUMENT;
199 		goto out;
200 	}
201 
202 	task_lock(task);
203 	if (task->deferred_reclamation_metadata != NULL) {
204 		/* Attempt to overwrite existing reclaim buffer. This is not allowed. */
205 		os_log_error(OS_LOG_DEFAULT,
206 		    "vm_reclaim: tried to overwrite exisiting reclaim buffer for task %p", task);
207 		kr = KERN_INVALID_ARGUMENT;
208 		task_unlock(task);
209 		goto out;
210 	}
211 	task->deferred_reclamation_metadata = metadata;
212 
213 	task_unlock(task);
214 	lck_mtx_lock(&reclamation_buffers_lock);
215 	TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
216 	reclamation_buffers_length++;
217 	lck_mtx_unlock(&reclamation_buffers_lock);
218 
219 	return KERN_SUCCESS;
220 
221 out:
222 	vmdr_metadata_free(metadata);
223 	return kr;
224 }
225 
/*
 * Tear down a task's reclamation buffer: unlink it from both global lists
 * and free the metadata. Caller must guarantee no other thread still holds
 * vdrm_lock or a pointer to this metadata.
 */
void
vm_deferred_reclamation_buffer_deallocate(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	/*
	 * First remove the buffer from the global list so no one else can get access to it.
	 */
	lck_mtx_lock(&reclamation_buffers_lock);
	TAILQ_REMOVE(&reclamation_buffers, metadata, vdrm_list);
	reclamation_buffers_length--;
	lck_mtx_unlock(&reclamation_buffers_lock);

	/*
	 * Now remove it from the async list (if present).
	 * Membership is inferred from the linkage pointers, which are NULLed
	 * out on removal here so the check stays accurate.
	 */
	lck_mtx_lock(&async_reclamation_buffers_lock);
	if (metadata->vdrm_async_list.tqe_next != NULL || metadata->vdrm_async_list.tqe_prev != NULL) {
		TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
		metadata->vdrm_async_list.tqe_next = NULL;
		metadata->vdrm_async_list.tqe_prev = NULL;
	}
	lck_mtx_unlock(&async_reclamation_buffers_lock);

	vmdr_metadata_free(metadata);
}
251 
252 static user_addr_t
get_head_ptr(user_addr_t indices)253 get_head_ptr(user_addr_t indices)
254 {
255 	return indices + offsetof(mach_vm_reclaim_indices_v1_t, head);
256 }
257 
258 static user_addr_t
get_tail_ptr(user_addr_t indices)259 get_tail_ptr(user_addr_t indices)
260 {
261 	return indices + offsetof(mach_vm_reclaim_indices_v1_t, tail);
262 }
263 
264 static user_addr_t
get_busy_ptr(user_addr_t indices)265 get_busy_ptr(user_addr_t indices)
266 {
267 	return indices + offsetof(mach_vm_reclaim_indices_v1_t, busy);
268 }
269 
/*
 * Deliver a (usually fatal) EXC_GUARD virtual-memory exception to the task
 * owning this reclaim buffer.
 *
 * Caller must hold metadata->vdrm_lock; it is dropped here and NOT
 * reacquired — every caller must treat the lock as gone on return.
 */
static void
reclaim_kill_with_reason(
	vm_deferred_reclamation_metadata_t metadata,
	unsigned reason,
	mach_exception_data_type_t subcode)
{
	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
	mach_exception_code_t code = 0;
	task_t task = metadata->vdrm_task;
	proc_t p = NULL;
	boolean_t fatal = TRUE;
	bool killing_self = false;
	pid_t pid;
	int err;

	if (panic_on_kill) {
		/* Debugging aid (vm_reclaim_panic_on_kill boot-arg) */
		panic("vm_reclaim: About to kill %p due to %d with subcode %lld\n", task, reason, subcode);
	}

	/* Pack guard type / flavor / target into the exception code */
	EXC_GUARD_ENCODE_TYPE(code, guard_type);
	EXC_GUARD_ENCODE_FLAVOR(code, reason);
	EXC_GUARD_ENCODE_TARGET(code, 0);

	assert(metadata->vdrm_task != kernel_task);
	killing_self = task == current_task();
	if (!killing_self) {
		/*
		 * Grab a reference on the task to make sure it doesn't go away
		 * after we drop the metadata lock
		 */
		task_reference(task);
	}
	/*
	 * We need to issue a wakeup in case this kill is coming from the async path.
	 * Once we drop the lock the caller can no longer do this wakeup, but
	 * if there's someone blocked on this reclaim they hold a map reference
	 * and thus need to be woken up so the map can be freed.
	 */
	thread_wakeup(&metadata->vdrm_async_list);
	lck_mtx_unlock(&metadata->vdrm_lock);

	if (reason == kGUARD_EXC_DEALLOC_GAP) {
		/* Dealloc-gap exceptions are fatal only if the task opted in */
		task_lock(task);
		fatal = (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL);
		task_unlock(task);
	}

	if (!fatal) {
		os_log_info(OS_LOG_DEFAULT,
		    "vm_reclaim: Skipping non fatal guard exception.\n");
		goto out;
	}

	pid = task_pid(task);
	if (killing_self) {
		/* Our own proc cannot exit out from under us; no extra ref taken */
		p = get_bsdtask_info(task);
	} else {
		/* Translate pid -> proc, guarding against pid reuse */
		p = proc_find(pid);
		if (p && proc_task(p) != task) {
			os_log_error(OS_LOG_DEFAULT,
			    "vm_reclaim: Unable to deliver guard exception because proc is gone & pid rolled over.\n");
			goto out;
		}

		/* The proc reference (if any) keeps state alive from here on */
		task_deallocate(task);
		task = NULL;
	}

	if (!p) {
		os_log_error(OS_LOG_DEFAULT,
		    "vm_reclaim: Unable to deliver guard exception because task does not have a proc.\n");
		goto out;
	}

	err = exit_with_guard_exception(p, code, subcode);
	if (err != 0) {
		os_log_error(OS_LOG_DEFAULT, "vm_reclaim: Unable to deliver guard exception to %p: %d\n", p, err);
	}
out:
	/* Drop whichever references are still held on the early-exit paths */
	if (!killing_self) {
		if (p) {
			proc_rele(p);
			p = NULL;
		}
		if (task) {
			task_deallocate(task);
			task = NULL;
		}
	}
}
360 
/*
 * Kill the buffer's owner after a copyin/copyout failure.
 * Caller must hold metadata->vdrm_lock; it is dropped by the kill path.
 */
static void
reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result)
{
	reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_COPYIO_FAILURE, result);
}
366 
367 /*
368  * Helper functions to do copyio on the head, tail, and busy pointers.
369  * Note that the kernel will only write to the busy and head pointers.
370  * Userspace is not supposed to write to the head or busy pointers, but the kernel
371  * must be resilient to that kind of bug in userspace.
372  */
373 
374 
375 static bool
reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata,uint64_t * head)376 reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head)
377 {
378 	int result;
379 	user_addr_t indices = metadata->vdrm_reclaim_indices;
380 	user_addr_t head_ptr = get_head_ptr(indices);
381 
382 	result = copyin_atomic64(head_ptr, head);
383 
384 	if (result != 0) {
385 		os_log_error(OS_LOG_DEFAULT,
386 		    "vm_reclaim: Unable to copy head ptr from 0x%llx: err=%d\n", head_ptr, result);
387 		reclaim_handle_copyio_error(metadata, result);
388 		return false;
389 	}
390 	return true;
391 }
392 
393 static bool
reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata,uint64_t * tail)394 reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail)
395 {
396 	int result;
397 	user_addr_t indices = metadata->vdrm_reclaim_indices;
398 	user_addr_t tail_ptr = get_tail_ptr(indices);
399 
400 	result = copyin_atomic64(tail_ptr, tail);
401 
402 	if (result != 0) {
403 		os_log_error(OS_LOG_DEFAULT,
404 		    "vm_reclaim: Unable to copy tail ptr from 0x%llx: err=%d\n", tail_ptr, result);
405 		reclaim_handle_copyio_error(metadata, result);
406 		return false;
407 	}
408 	return true;
409 }
410 
411 static bool
reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata,uint64_t * busy)412 reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy)
413 {
414 	int result;
415 	user_addr_t indices = metadata->vdrm_reclaim_indices;
416 	user_addr_t busy_ptr = get_busy_ptr(indices);
417 
418 	result = copyin_atomic64(busy_ptr, busy);
419 
420 	if (result != 0) {
421 		os_log_error(OS_LOG_DEFAULT,
422 		    "vm_reclaim: Unable to copy busy ptr from 0x%llx: err=%d\n", busy_ptr, result);
423 		reclaim_handle_copyio_error(metadata, result);
424 		return false;
425 	}
426 	return true;
427 }
428 
429 static bool
reclaim_copyout_busy(vm_deferred_reclamation_metadata_t metadata,uint64_t value)430 reclaim_copyout_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
431 {
432 	int result;
433 	user_addr_t indices = metadata->vdrm_reclaim_indices;
434 	user_addr_t busy_ptr = get_busy_ptr(indices);
435 
436 	result = copyout_atomic64(value, busy_ptr);
437 
438 	if (result != 0) {
439 		os_log_error(OS_LOG_DEFAULT,
440 		    "vm_reclaim: Unable to copy %llu to busy ptr at 0x%llx: err=%d\n", value, busy_ptr, result);
441 		reclaim_handle_copyio_error(metadata, result);
442 		return false;
443 	}
444 	return true;
445 }
446 
447 static bool
reclaim_copyout_head(vm_deferred_reclamation_metadata_t metadata,uint64_t value)448 reclaim_copyout_head(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
449 {
450 	int result;
451 	user_addr_t indices = metadata->vdrm_reclaim_indices;
452 	user_addr_t head_ptr = get_head_ptr(indices);
453 
454 	result = copyout_atomic64(value, head_ptr);
455 
456 	if (result != 0) {
457 		os_log_error(OS_LOG_DEFAULT,
458 		    "vm_reclaim: Unable to copy %llu to head ptr at 0x%llx: err=%d\n", value, head_ptr, result);
459 		reclaim_handle_copyio_error(metadata, result);
460 		return false;
461 	}
462 	return true;
463 }
464 
465 /*
466  * Reclaim a chunk from the buffer.
467  * Returns the number of entries reclaimed or 0 if there are no entries left in the buffer.
468  */
469 static size_t
reclaim_chunk(vm_deferred_reclamation_metadata_t metadata)470 reclaim_chunk(vm_deferred_reclamation_metadata_t metadata)
471 {
472 	assert(metadata != NULL);
473 	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED);
474 
475 	int result = 0;
476 	size_t num_reclaimed = 0;
477 	uint64_t head = 0, tail = 0, busy = 0, num_to_reclaim = 0, new_tail = 0, num_copied = 0, buffer_len = 0;
478 	user_addr_t indices;
479 	vm_map_t map = metadata->vdrm_map, old_map;
480 	mach_vm_reclaim_entry_v1_t reclaim_entries[kReclaimChunkSize];
481 	bool success;
482 
483 	buffer_len = metadata->vdrm_buffer_size / sizeof(mach_vm_reclaim_entry_v1_t);
484 
485 	memset(reclaim_entries, 0, sizeof(reclaim_entries));
486 
487 	indices = (user_addr_t) metadata->vdrm_reclaim_indices;
488 	old_map = vm_map_switch(map);
489 
490 	success = reclaim_copyin_busy(metadata, &busy);
491 	if (!success) {
492 		goto fail;
493 	}
494 	success = reclaim_copyin_head(metadata, &head);
495 	if (!success) {
496 		goto fail;
497 	}
498 	success = reclaim_copyin_tail(metadata, &tail);
499 	if (!success) {
500 		goto fail;
501 	}
502 
503 	if (busy != head) {
504 		// Userspace overwrote one of the pointers
505 		os_log_error(OS_LOG_DEFAULT,
506 		    "vm_reclaim: Userspace modified head or busy pointer! head: %llu (0x%llx) != busy: %llu (0x%llx) | tail = %llu (0x%llx)\n",
507 		    head, get_head_ptr(indices), busy, get_busy_ptr(indices), tail, get_tail_ptr(indices));
508 		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_INDEX_FAILURE, busy);
509 		goto fail;
510 	}
511 
512 	if (tail < head) {
513 		// Userspace is likely in the middle of trying to re-use an entry, bail on this reclamation
514 		os_log_error(OS_LOG_DEFAULT,
515 		    "vm_reclaim: Userspace modified head or tail pointer! head: %llu (0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
516 		    head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy, get_busy_ptr(indices));
517 		lck_mtx_unlock(&metadata->vdrm_lock);
518 		goto fail;
519 	}
520 
521 	num_to_reclaim = tail - head;
522 	while (true) {
523 		num_to_reclaim = MIN(num_to_reclaim, kReclaimChunkSize);
524 		if (num_to_reclaim == 0) {
525 			break;
526 		}
527 		busy = head + num_to_reclaim;
528 		success = reclaim_copyout_busy(metadata, busy);
529 		if (!success) {
530 			goto fail;
531 		}
532 		os_atomic_thread_fence(seq_cst);
533 		success = reclaim_copyin_tail(metadata, &new_tail);
534 		if (!success) {
535 			goto fail;
536 		}
537 
538 		if (new_tail >= busy) {
539 			/* Got num_to_reclaim entries */
540 			break;
541 		}
542 		tail = new_tail;
543 		if (tail < head) {
544 			// Userspace is likely in the middle of trying to re-use an entry, bail on this reclamation
545 			os_log_error(OS_LOG_DEFAULT,
546 			    "vm_reclaim: Userspace modified head or tail pointer! head: %llu (0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
547 			    head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy, get_busy_ptr(indices));
548 			lck_mtx_unlock(&metadata->vdrm_lock);
549 			goto fail;
550 		}
551 		/* Can't reclaim these entries. Try again */
552 		num_to_reclaim = tail - head;
553 		if (num_to_reclaim == 0) {
554 			/* Nothing left to reclaim. Reset busy to head. */
555 			success = reclaim_copyout_busy(metadata, head);
556 			if (!success) {
557 				goto fail;
558 			}
559 			break;
560 		}
561 		/*
562 		 * Note that num_to_reclaim must have gotten smaller since tail got smaller,
563 		 * so this is gauranteed to converge.
564 		 */
565 	}
566 
567 	while (num_copied < num_to_reclaim) {
568 		uint64_t memcpy_start_idx = (head % buffer_len);
569 		uint64_t memcpy_end_idx = memcpy_start_idx + num_to_reclaim - num_copied;
570 		// Clamp the end idx to the buffer. We'll handle wrap-around in our next go around the loop.
571 		memcpy_end_idx = MIN(memcpy_end_idx, buffer_len);
572 		uint64_t num_to_copy = memcpy_end_idx - memcpy_start_idx;
573 
574 		assert(num_to_copy + num_copied <= kReclaimChunkSize);
575 		user_addr_t src_ptr = metadata->vdrm_reclaim_buffer + memcpy_start_idx * sizeof(mach_vm_reclaim_entry_v1_t);
576 		mach_vm_reclaim_entry_v1_t *dst_ptr = reclaim_entries + num_copied;
577 
578 		result = copyin(src_ptr, dst_ptr, num_to_copy * sizeof(mach_vm_reclaim_entry_v1_t));
579 
580 		if (result != 0) {
581 			os_log_error(OS_LOG_DEFAULT,
582 			    "vm_reclaim: Unable to copyin %llu entries in reclaim buffer at 0x%llx to 0x%llx: err=%d\n",
583 			    num_to_copy, src_ptr, (uint64_t) dst_ptr, result);
584 			reclaim_handle_copyio_error(metadata, result);
585 			goto fail;
586 		}
587 
588 		num_copied += num_to_copy;
589 		head += num_to_copy;
590 	}
591 
592 	for (size_t i = 0; i < num_to_reclaim; i++) {
593 		mach_vm_reclaim_entry_v1_t *entry = &reclaim_entries[i];
594 		if (entry->address != 0 && entry->size != 0) {
595 			kern_return_t kr = vm_map_remove_guard(map,
596 			    vm_map_trunc_page(entry->address,
597 			    VM_MAP_PAGE_MASK(map)),
598 			    vm_map_round_page(entry->address + entry->size,
599 			    VM_MAP_PAGE_MASK(map)),
600 			    VM_MAP_REMOVE_GAPS_FAIL,
601 			    KMEM_GUARD_NONE).kmr_return;
602 			if (kr == KERN_INVALID_VALUE) {
603 				reclaim_kill_with_reason(metadata, kGUARD_EXC_DEALLOC_GAP, entry->address);
604 				goto fail;
605 			} else if (kr != KERN_SUCCESS) {
606 				os_log_error(OS_LOG_DEFAULT,
607 				    "vm_reclaim: Unable to deallocate 0x%llx (%u) from 0x%llx. Err: %d\n",
608 				    entry->address, entry->size, (uint64_t) map, kr);
609 				reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
610 				goto fail;
611 			}
612 			num_reclaimed++;
613 			os_atomic_add(&metadata->vdrm_num_bytes_reclaimed, entry->size, relaxed);
614 		}
615 	}
616 
617 	success = reclaim_copyout_head(metadata, head);
618 	if (!success) {
619 		goto fail;
620 	}
621 
622 	vm_map_switch(old_map);
623 	return num_reclaimed;
624 fail:
625 	vm_map_switch(old_map);
626 	return kReclaimChunkFailed;
627 }
628 
629 /*
630  * Attempts to reclaim until the buffer's estimated number of available bytes is <= num_bytes_reclaimable_threshold
631  * The metadata buffer lock should be held by the caller.
632  *
633  * Returns the number of entries reclaimed.
634  */
635 static size_t
reclaim_entries_from_buffer(vm_deferred_reclamation_metadata_t metadata,size_t num_bytes_reclaimable_threshold)636 reclaim_entries_from_buffer(vm_deferred_reclamation_metadata_t metadata, size_t num_bytes_reclaimable_threshold)
637 {
638 	assert(metadata != NULL);
639 	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED);
640 	if (!task_is_active(metadata->vdrm_task)) {
641 		/*
642 		 * If the task is exiting, the reclaim below will likely fail and fall through
643 		 * to the (slower) error path.
644 		 * So as an optimization, we bail out early here.
645 		 */
646 		return KERN_FAILURE;
647 	}
648 
649 	size_t num_entries_reclaimed = 0, num_bytes_reclaimed, estimated_reclaimable_bytes, reclaimable_bytes;
650 	while (true) {
651 		size_t curr_entries_reclaimed = 0;
652 		num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed);
653 		reclaimable_bytes = os_atomic_load(&metadata->vdrm_num_bytes_put_in_buffer, relaxed);
654 		if (num_bytes_reclaimed > reclaimable_bytes) {
655 			estimated_reclaimable_bytes = 0;
656 		} else {
657 			estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed;
658 		}
659 		if (reclaimable_bytes <= num_bytes_reclaimable_threshold) {
660 			break;
661 		}
662 		curr_entries_reclaimed = reclaim_chunk(metadata);
663 		if (curr_entries_reclaimed == kReclaimChunkFailed) {
664 			return kReclaimChunkFailed;
665 		}
666 		if (curr_entries_reclaimed == 0) {
667 			break;
668 		}
669 		num_entries_reclaimed += curr_entries_reclaimed;
670 	}
671 
672 	return num_entries_reclaimed;
673 }
674 
675 /*
676  * Get the reclamation metadata buffer for the given map.
677  * If the buffer exists it is returned locked.
678  */
679 static vm_deferred_reclamation_metadata_t
get_task_reclaim_metadata(task_t task)680 get_task_reclaim_metadata(task_t task)
681 {
682 	assert(task != NULL);
683 	vm_deferred_reclamation_metadata_t metadata = NULL;
684 	task_lock(task);
685 	metadata = task->deferred_reclamation_metadata;
686 	if (metadata != NULL) {
687 		lck_mtx_lock(&metadata->vdrm_lock);
688 	}
689 	task_unlock(task);
690 	return metadata;
691 }
692 
/*
 * Synchronously reclaim up to num_entries_to_reclaim entries from `task`'s
 * buffer. Returns KERN_FAILURE if the task is exiting or is killed during
 * the reclaim, KERN_INVALID_ARGUMENT if it has no buffer, KERN_SUCCESS
 * otherwise (including when the buffer was already empty).
 */
kern_return_t
vm_deferred_reclamation_buffer_synchronize_internal(task_t task, size_t num_entries_to_reclaim)
{
	vm_deferred_reclamation_metadata_t metadata = NULL;
	size_t total_reclaimed = 0;

	if (!task_is_active(task)) {
		return KERN_FAILURE;
	}

	/* Returned locked (vdrm_lock) if non-NULL */
	metadata = get_task_reclaim_metadata(task);
	if (metadata == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	while (total_reclaimed < num_entries_to_reclaim) {
		size_t num_reclaimed = reclaim_chunk(metadata);
		if (num_reclaimed == kReclaimChunkFailed) {
			/* Lock has already been released and task is being killed. */
			return KERN_FAILURE;
		}
		if (num_reclaimed == 0) {
			/* There was nothing to reclaim. A reclamation thread must have beaten us to it. Nothing to do here. */
			break;
		}

		total_reclaimed += num_reclaimed;
	}
	lck_mtx_unlock(&metadata->vdrm_lock);

	return KERN_SUCCESS;
}
725 
/*
 * Userspace accounting update: record that `reclaimable_bytes` (a cumulative
 * total) have been placed in the buffer, and kick a reclaim if the estimated
 * outstanding VA exceeds vm_reclaim_max_threshold.
 */
kern_return_t
vm_deferred_reclamation_buffer_update_reclaimable_bytes_internal(task_t task, size_t reclaimable_bytes)
{
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
	size_t num_bytes_reclaimed, estimated_reclaimable_bytes, num_bytes_in_buffer;
	bool success;
	if (metadata == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * The client is allowed to make this call in parallel from multiple threads.
	 * Ensure we only ever increase the value of vdrm_num_bytes_put_in_buffer.
	 * If the client's value is smaller than what we've stored, another thread
	 * raced ahead of them and we've already acted on that accounting so this
	 * call should be a no-op.
	 */
	success = os_atomic_rmw_loop(&metadata->vdrm_num_bytes_put_in_buffer, num_bytes_in_buffer,
	    reclaimable_bytes, acquire,
	{
		if (num_bytes_in_buffer > reclaimable_bytes) {
		        os_atomic_rmw_loop_give_up(break);
		}
	});
	if (!success) {
		/* Stale value. Nothing new to reclaim */
		return KERN_SUCCESS;
	}
	num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed);

	/* Estimated outstanding VA = cumulative put-in minus cumulative reclaimed */
	if (reclaimable_bytes > num_bytes_reclaimed) {
		estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed;
		if (estimated_reclaimable_bytes > vm_reclaim_max_threshold) {
			lck_mtx_lock(&metadata->vdrm_lock);
			size_t num_reclaimed = reclaim_entries_from_buffer(metadata, vm_reclaim_max_threshold);
			if (num_reclaimed == kReclaimChunkFailed) {
				/* Lock has already been released & task is in the process of getting killed. */
				return KERN_INVALID_ARGUMENT;
			}
			lck_mtx_unlock(&metadata->vdrm_lock);
		}
	}

	return KERN_SUCCESS;
}
771 
772 static inline size_t
pick_reclaim_threshold(vm_deferred_reclamation_action_t action)773 pick_reclaim_threshold(vm_deferred_reclamation_action_t action)
774 {
775 	switch (action) {
776 	case RECLAIM_FULL:
777 		return 0;
778 	case RECLAIM_TRIM:
779 		return vm_reclaim_max_threshold / vm_reclaim_trim_divisor;
780 	case RECLAIM_ASYNC:
781 		return 0;
782 	}
783 }
784 
/*
 * Global reclaim pass. RECLAIM_ASYNC drains the async list; other actions
 * walk every buffer in the system once, reclaiming each down to the
 * action's threshold.
 */
void
vm_deferred_reclamation_reclaim_memory(vm_deferred_reclamation_action_t action)
{
	if (action == RECLAIM_ASYNC) {
		lck_mtx_lock(&async_reclamation_buffers_lock);

		process_async_reclamation_list();
		lck_mtx_unlock(&async_reclamation_buffers_lock);
	} else {
		size_t reclaim_threshold = pick_reclaim_threshold(action);
		lck_mtx_lock(&reclamation_buffers_lock);
		/*
		 * Bump the generation and rotate each visited buffer to the
		 * tail; a buffer stamped with the current generation marks the
		 * point where the walk has come full circle.
		 */
		reclamation_counter++;
		while (true) {
			vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&reclamation_buffers);
			if (metadata == NULL) {
				break;
			}
			lck_mtx_lock(&metadata->vdrm_lock);
			if (metadata->vdrm_reclaimed_at >= reclamation_counter) {
				// We've already seen this one. We're done
				lck_mtx_unlock(&metadata->vdrm_lock);
				break;
			}
			metadata->vdrm_reclaimed_at = reclamation_counter;

			TAILQ_REMOVE(&reclamation_buffers, metadata, vdrm_list);
			TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
			/* Drop the list lock while reclaiming from this buffer */
			lck_mtx_unlock(&reclamation_buffers_lock);

			size_t num_reclaimed = reclaim_entries_from_buffer(metadata, reclaim_threshold);
			if (num_reclaimed != kReclaimChunkFailed) {
				/* On failure the vdrm_lock was already dropped */
				lck_mtx_unlock(&metadata->vdrm_lock);
			}

			lck_mtx_lock(&reclamation_buffers_lock);
		}
		lck_mtx_unlock(&reclamation_buffers_lock);
	}
}
824 
/* Convenience wrapper: reclaim every buffer in the system completely. */
void
vm_deferred_reclamation_reclaim_all_memory(void)
{
	vm_deferred_reclamation_reclaim_memory(RECLAIM_FULL);
}
830 
/*
 * Queue `task`'s buffer for asynchronous reclamation by the reclaim thread.
 * Returns true if the buffer was queued (i.e. the task has one).
 */
bool
vm_deferred_reclamation_reclaim_from_task_async(task_t task)
{
	bool queued = false;
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;

	if (metadata != NULL) {
		lck_mtx_lock(&async_reclamation_buffers_lock);
		/*
		 * NOTE(review): no check that this metadata is already on the
		 * async list; a double insert would corrupt the TAILQ.
		 * Presumably process_async_reclamation_list() (not visible
		 * here) removes entries before this can be called again —
		 * confirm its linkage-pointer handling.
		 */
		TAILQ_INSERT_TAIL(&async_reclamation_buffers, metadata, vdrm_async_list);
		lck_mtx_unlock(&async_reclamation_buffers_lock);
		queued = true;
		thread_wakeup(&vm_reclaim_thread);
	}

	return queued;
}
847 
/*
 * Synchronously reclaim up to max_entries_to_reclaim entries from `task`'s
 * buffer on the caller's thread. Returns true if at least one entry was
 * reclaimed; false if the task is exiting, has no buffer, the buffer was
 * empty, or the reclaim failed (task being killed, lock already dropped).
 */
bool
vm_deferred_reclamation_reclaim_from_task_sync(task_t task, size_t max_entries_to_reclaim)
{
	size_t num_reclaimed = 0;
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;

	if (!task_is_active(task)) {
		return false;
	}

	if (metadata != NULL) {
		lck_mtx_lock(&metadata->vdrm_lock);
		while (num_reclaimed < max_entries_to_reclaim) {
			size_t num_reclaimed_now = reclaim_chunk(metadata);
			if (num_reclaimed_now == kReclaimChunkFailed) {
				/* Lock has already been released and task is being killed. */
				return false;
			}
			if (num_reclaimed_now == 0) {
				// Nothing left to reclaim
				break;
			}
			num_reclaimed += num_reclaimed_now;
		}
		lck_mtx_unlock(&metadata->vdrm_lock);
	}

	return num_reclaimed > 0;
}
877 
/*
 * Clone the parent's deferred-reclamation buffer metadata for a newly
 * forked task.
 *
 * On entry the parent's vdrm_lock must be held (asserted below); this
 * keeps the parent's buffer address/size/indices stable while they are
 * copied into the child's freshly allocated metadata.  The parent's lock
 * is released here once the copy is done.  The new metadata is then
 * appended to the global reclamation_buffers list.
 *
 * Returns the child's metadata; the caller attaches it to the child task
 * (the child must not already have one — asserted).
 */
vm_deferred_reclamation_metadata_t
vm_deferred_reclamation_buffer_fork(task_t task, vm_deferred_reclamation_metadata_t parent)
{
	vm_deferred_reclamation_metadata_t metadata = NULL;

	LCK_MTX_ASSERT(&parent->vdrm_lock, LCK_MTX_ASSERT_OWNED);

	assert(task->deferred_reclamation_metadata == NULL);
	metadata = vmdr_metadata_alloc(task, parent->vdrm_reclaim_buffer,
	    parent->vdrm_buffer_size, parent->vdrm_reclaim_indices);
	lck_mtx_unlock(&parent->vdrm_lock);

	lck_mtx_lock(&reclamation_buffers_lock);
	TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
	reclamation_buffers_length++;
	lck_mtx_unlock(&reclamation_buffers_lock);

	return metadata;
}
897 
/* Acquire the buffer metadata's mutex on behalf of an external caller. */
void
vm_deferred_reclamation_buffer_lock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_lock(&metadata->vdrm_lock);
}
903 
/* Release the buffer metadata's mutex; pairs with vm_deferred_reclamation_buffer_lock(). */
void
vm_deferred_reclamation_buffer_unlock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_unlock(&metadata->vdrm_lock);
}
909 
910 
/*
 * One-time setup for the async reclaim thread: join the VM thread group
 * (when thread groups are configured) and name the thread for debugging.
 */
static void
reclaim_thread_init(void)
{
#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif
	thread_set_thread_name(current_thread(), "VM_reclaim");
}
919 
920 
/*
 * Drain the async reclamation queue: dequeue each buffer, fully reclaim
 * it, and wake anyone waiting for that buffer to be processed.
 *
 * Called with async_reclamation_buffers_lock held.  The lock is dropped
 * while each buffer is reclaimed and re-acquired before the queue is
 * examined again; it is held again on return.
 */
static void
process_async_reclamation_list(void)
{
	LCK_MTX_ASSERT(&async_reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);

	vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&async_reclamation_buffers);
	while (metadata != NULL) {
		TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
		/*
		 * NULL link pointers mark the entry as off-list; waiters (see
		 * vm_deferred_reclamation_block_until_pid_has_been_reclaimed)
		 * test these to know the buffer has been dequeued.
		 */
		metadata->vdrm_async_list.tqe_next = NULL;
		metadata->vdrm_async_list.tqe_prev = NULL;
		/* Take the buffer lock before dropping the list lock. */
		lck_mtx_lock(&metadata->vdrm_lock);
		lck_mtx_unlock(&async_reclamation_buffers_lock);

		// NB: Currently the async reclaim thread fully reclaims the buffer.
		size_t num_reclaimed = reclaim_entries_from_buffer(metadata, 0);
		if (num_reclaimed == kReclaimChunkFailed) {
			/* Lock has already been released & task is in the process of getting killed. */
			goto next;
		}
		/* Wakeup anyone waiting on this buffer getting processed */
		thread_wakeup(&metadata->vdrm_async_list);
		assert(current_thread()->map == kernel_map);
		lck_mtx_unlock(&metadata->vdrm_lock);

next:
		lck_mtx_lock(&async_reclamation_buffers_lock);
		metadata = TAILQ_FIRST(&async_reclamation_buffers);
	}
}
950 
/*
 * Start-up parameter handed to reclaim_thread(): INIT on first entry,
 * CONT on every continuation via thread_block_parameter().
 */
__enum_decl(reclaim_thread_state, uint32_t, {
	RECLAIM_THREAD_INIT = 0,
	RECLAIM_THREAD_CONT = 1,
});
955 
/*
 * One pass of the reclaim thread: drain the async queue, then arm a wait
 * on vm_reclaim_thread.  assert_wait() is issued while still holding the
 * list lock so a wakeup posted between the drain and the wait cannot be
 * missed.
 */
static void
reclaim_thread_continue(void)
{
	lck_mtx_lock(&async_reclamation_buffers_lock);

	process_async_reclamation_list();
	assert_wait(&vm_reclaim_thread, THREAD_UNINT);

	lck_mtx_unlock(&async_reclamation_buffers_lock);
}
966 
967 void
reclaim_thread(void * param,wait_result_t wr __unused)968 reclaim_thread(void *param, wait_result_t wr __unused)
969 {
970 	if (param == (void *) RECLAIM_THREAD_INIT) {
971 		reclaim_thread_init();
972 	} else {
973 		assert(param == (void *) RECLAIM_THREAD_CONT);
974 	}
975 
976 	reclaim_thread_continue();
977 
978 	(void) thread_block_parameter(reclaim_thread, (void*) RECLAIM_THREAD_CONT);
979 }
980 
981 __startup_func
982 static void
vm_deferred_reclamation_init(void)983 vm_deferred_reclamation_init(void)
984 {
985 	vm_reclaim_max_threshold = PAGE_SIZE;
986 	if (!PE_parse_boot_argn("vm_reclaim_max_threshold",
987 	    &vm_reclaim_max_threshold, sizeof(vm_reclaim_max_threshold))) {
988 		vm_reclaim_max_threshold = PAGE_SIZE;
989 	}
990 
991 	(void)kernel_thread_start_priority(reclaim_thread,
992 	    (void *)RECLAIM_THREAD_INIT, kReclaimThreadPriority,
993 	    &vm_reclaim_thread);
994 }
995 
996 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_deferred_reclamation_init);
997 
998 #if DEVELOPMENT || DEBUG
999 
/*
 * Debug/test helper: block until the given pid's reclamation buffer has
 * been fully processed by the async reclaim thread.
 *
 * Takes a proc reference only long enough to look up the task's buffer
 * metadata and grab a reference on the task's map (which keeps the
 * metadata's backing map alive across the wait).
 *
 * Returns false if the pid, task, or buffer cannot be found; true once
 * the buffer has been reclaimed.
 */
bool
vm_deferred_reclamation_block_until_pid_has_been_reclaimed(int pid)
{
	vm_deferred_reclamation_metadata_t metadata = NULL;
	proc_t p = proc_find(pid);
	vm_map_t map = NULL;
	if (p == NULL) {
		return false;
	}
	task_t t = proc_task(p);
	if (t == NULL) {
		proc_rele(p);
		return false;
	}

	task_lock(t);
	if (t->map) {
		metadata = t->deferred_reclamation_metadata;
		if (metadata != NULL) {
			map = t->map;
			vm_map_reference(t->map);
		}
	}
	task_unlock(t);
	proc_rele(p);
	if (metadata == NULL) {
		return false;
	}

	lck_mtx_lock(&async_reclamation_buffers_lock);
	/*
	 * Non-NULL list linkage means the buffer is still queued for async
	 * reclaim; the reclaim thread NULLs both pointers when it dequeues
	 * the entry and wakes us via thread_wakeup on vdrm_async_list.
	 */
	while (metadata->vdrm_async_list.tqe_next != NULL || metadata->vdrm_async_list.tqe_prev != NULL) {
		assert_wait(&metadata->vdrm_async_list, THREAD_UNINT);
		lck_mtx_unlock(&async_reclamation_buffers_lock);
		thread_block(THREAD_CONTINUE_NULL);
		lck_mtx_lock(&async_reclamation_buffers_lock);
	}

	/*
	 * The async reclaim thread first removes the buffer from the list
	 * and then reclaims it (while holding its lock).
	 * So grab the metadata buffer's lock here to ensure the
	 * reclaim is done.
	 */
	lck_mtx_lock(&metadata->vdrm_lock);
	lck_mtx_unlock(&metadata->vdrm_lock);
	lck_mtx_unlock(&async_reclamation_buffers_lock);

	vm_map_deallocate(map);
	return true;
}
1050 
1051 #endif /* DEVELOPMENT || DEBUG */
1052