/*
 * Copyright (c) 2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/exc_guard.h>
#include <kern/locks.h>
#include <kern/task.h>
#include <kern/zalloc.h>
#include <kern/misc_protos.h>
#include <kern/startup.h>
#include <kern/sched.h>
#include <libkern/OSAtomic.h>
#include <mach/mach_types.h>
#include <mach/mach_vm.h>
#include <mach/vm_reclaim.h>
#include <os/log.h>
#include <pexpert/pexpert.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_reclaim_internal.h>
#include <sys/queue.h>
#include <os/atomic_private.h>

#pragma mark Tunables
TUNABLE(uint32_t, kReclaimChunkSize, "vm_reclaim_chunk_size", 16);
static integer_t kReclaimThreadPriority = BASEPRI_VM;
// Reclaim down to vm_reclaim_max_threshold / vm_reclaim_trim_divisor when doing a trim reclaim operation
TUNABLE_WRITEABLE(uint64_t, vm_reclaim_trim_divisor, "vm_reclaim_trim_divisor", 2);
// Used to debug vm_reclaim kills
TUNABLE(bool, panic_on_kill, "vm_reclaim_panic_on_kill", false);
uint64_t vm_reclaim_max_threshold;

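/*
 * Illustrative note (not in the upstream file): with the defaults above,
 * reclaim_chunk() drains at most vm_reclaim_chunk_size (16) entries per pass,
 * and a trim-style reclaim stops once a buffer's estimated reclaimable bytes
 * fall to vm_reclaim_max_threshold / vm_reclaim_trim_divisor (see
 * pick_reclaim_threshold() below).
 */
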
#pragma mark Declarations
typedef struct proc *   proc_t;
extern char *           proc_best_name(proc_t proc);
extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code, mach_exception_data_type_t subcode);
struct proc *proc_ref(struct proc *p, int locked);
int proc_rele(proc_t p);
static bool reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head);
static bool reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail);
static bool reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy);

struct vm_deferred_reclamation_metadata_s {
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_list; // Global list containing every reclamation buffer
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_async_list; // A list containing buffers that are ripe for reclamation
	decl_lck_mtx_data(, vdrm_lock); /* Held when reclaiming from the buffer */
	/*
	 * The task owns this structure but we maintain a backpointer here
	 * so that we can send an exception if we hit an error.
	 * Since this is a backpointer we don't hold a reference (it's a weak pointer).
	 */
	task_t vdrm_task;
	vm_map_t vdrm_map;
	user_addr_t vdrm_reclaim_buffer;
	mach_vm_size_t vdrm_buffer_size;
	user_addr_t vdrm_reclaim_indices;
	uint64_t vdrm_reclaimed_at;
	/*
	 * These two values are cumulative running sums: bytes placed in the buffer
	 * and bytes reclaimed out of the buffer. Both are in terms of virtual memory,
	 * so they give an upper bound on the amount of physical memory that can be
	 * reclaimed. To estimate the amount of VA currently in the buffer, compute
	 * vdrm_num_bytes_put_in_buffer - vdrm_num_bytes_reclaimed.
	 * Note that neither value is protected by the vdrm_lock.
	 */
	_Atomic size_t vdrm_num_bytes_put_in_buffer;
	_Atomic size_t vdrm_num_bytes_reclaimed;
};
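
/*
 * Illustrative sketch (hypothetical helper, not in the upstream file): the
 * estimate described in the comment above. Both loads are relaxed, so the
 * result is only an approximation and may be momentarily stale.
 */
static inline size_t
vdrm_estimated_bytes_in_buffer_sketch(vm_deferred_reclamation_metadata_t metadata)
{
	size_t put_in_buffer = os_atomic_load(&metadata->vdrm_num_bytes_put_in_buffer, relaxed);
	size_t reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed);

	/* The two counters race with each other; clamp at zero. */
	return (put_in_buffer > reclaimed) ? (put_in_buffer - reclaimed) : 0;
}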
static void process_async_reclamation_list(void);

extern void *proc_find(int pid);
extern task_t proc_task(proc_t);

#pragma mark Globals
static KALLOC_TYPE_DEFINE(vm_reclaim_metadata_zone, struct vm_deferred_reclamation_metadata_s, KT_DEFAULT);
static LCK_GRP_DECLARE(vm_reclaim_lock_grp, "vm_reclaim");
static size_t kReclaimChunkFailed = UINT64_MAX;

/*
 * We maintain two lists of reclamation buffers.
 * The reclamation_buffers list contains every buffer in the system.
 * The async_reclamation_buffers list contains buffers that are ripe for reclamation.
 * Each list has its own lock.
 */
static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) reclamation_buffers = TAILQ_HEAD_INITIALIZER(reclamation_buffers);

static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) async_reclamation_buffers = TAILQ_HEAD_INITIALIZER(async_reclamation_buffers);
/*
 * The reclamation_buffers_lock protects the reclamation_buffers list.
 * It must be held when iterating over or otherwise manipulating the list.
 * It should be dropped when acting on a specific metadata entry after acquiring the vdrm_lock.
 */
LCK_MTX_DECLARE(reclamation_buffers_lock, &vm_reclaim_lock_grp);
LCK_MTX_DECLARE(async_reclamation_buffers_lock, &vm_reclaim_lock_grp);
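
/*
 * Illustrative sketch (hypothetical helper, not in the upstream file) of the
 * locking discipline described above: find a buffer while holding the list
 * lock, take its vdrm_lock, then drop the list lock before doing any real
 * work on the buffer.
 */
static inline vm_deferred_reclamation_metadata_t
reclaim_lock_first_buffer_sketch(void)
{
	lck_mtx_lock(&reclamation_buffers_lock);
	vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&reclamation_buffers);
	if (metadata != NULL) {
		lck_mtx_lock(&metadata->vdrm_lock);
	}
	lck_mtx_unlock(&reclamation_buffers_lock);
	/* Caller unlocks metadata->vdrm_lock when done, if non-NULL. */
	return metadata;
}
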
static size_t reclamation_buffers_length;
static uint64_t reclamation_counter; // generation count for global reclaims

static SECURITY_READ_ONLY_LATE(thread_t) vm_reclaim_thread;
static void reclaim_thread(void *param __unused, wait_result_t wr __unused);

#pragma mark Implementation

static vm_deferred_reclamation_metadata_t
metadata_init(
	task_t task,
	vm_map_t map,
	user_addr_t buffer,
	mach_vm_size_t size,
	user_addr_t indices)
{
	vm_deferred_reclamation_metadata_t metadata = zalloc_flags(vm_reclaim_metadata_zone, Z_WAITOK | Z_ZERO);
	lck_mtx_init(&(metadata->vdrm_lock), &vm_reclaim_lock_grp, LCK_ATTR_NULL);
	assert(!map->is_nested_map);
	assert(map == task->map);

	metadata->vdrm_task = task;
	metadata->vdrm_map = map;
	metadata->vdrm_reclaim_buffer = buffer;
	metadata->vdrm_buffer_size = size;
	metadata->vdrm_reclaim_indices = indices;
	return metadata;
}

kern_return_t
vm_deferred_reclamation_buffer_init_internal(
	task_t task,
	mach_vm_offset_t address,
	mach_vm_size_t size,
	user_addr_t indices)
{
	kern_return_t kr = KERN_FAILURE;
	vm_deferred_reclamation_metadata_t metadata = NULL;
	vm_map_t map = VM_MAP_NULL;
	bool success;
	uint64_t head = 0, tail = 0, busy = 0;
	if (address == 0 || indices == 0 || size < 2 * sizeof(mach_vm_reclaim_entry_v1_t)) {
		return KERN_INVALID_ARGUMENT;
	}

	task_lock(task);
	/* The reclamation buffer will adopt this reference. */
	map = task->map;
	vm_map_reference(map);
	task_unlock(task);

	metadata = metadata_init(task, map, address, size, indices);

	/*
	 * Validate the starting indices.
	 */
	success = reclaim_copyin_busy(metadata, &busy);
	if (!success) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}
	success = reclaim_copyin_head(metadata, &head);
	if (!success) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}
	success = reclaim_copyin_tail(metadata, &tail);
	if (!success) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}
	if (head != 0 || tail != 0 || busy != 0) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	task_lock(task);
	if (task->deferred_reclamation_metadata != NULL) {
		/* Attempt to overwrite an existing reclaim buffer. This is not allowed. */
		os_log_with_startup_serial(OS_LOG_DEFAULT,
		    "vm_reclaim: tried to overwrite existing reclaim buffer for task %p", task);
		kr = KERN_INVALID_ARGUMENT;
		task_unlock(task);
		goto out;
	}
	task->deferred_reclamation_metadata = metadata;
	map = VM_MAP_NULL;
	kr = KERN_SUCCESS;

	task_unlock(task);
	lck_mtx_lock(&reclamation_buffers_lock);
	TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
	reclamation_buffers_length++;
	lck_mtx_unlock(&reclamation_buffers_lock);
	metadata = NULL;

out:
	if (metadata) {
		zfree(vm_reclaim_metadata_zone, metadata);
	}
	if (map) {
		vm_map_deallocate(map);
	}

	return kr;
}

void
vm_deferred_reclamation_buffer_deallocate(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	/*
	 * First remove the buffer from the global list so no one else can get access to it.
	 */
	lck_mtx_lock(&reclamation_buffers_lock);
	TAILQ_REMOVE(&reclamation_buffers, metadata, vdrm_list);
	reclamation_buffers_length--;
	lck_mtx_unlock(&reclamation_buffers_lock);

	/*
	 * Now remove it from the async list (if present).
	 */
	lck_mtx_lock(&async_reclamation_buffers_lock);
	if (metadata->vdrm_async_list.tqe_next != NULL || metadata->vdrm_async_list.tqe_prev != NULL) {
		TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
		metadata->vdrm_async_list.tqe_next = NULL;
		metadata->vdrm_async_list.tqe_prev = NULL;
	}
	lck_mtx_unlock(&async_reclamation_buffers_lock);

	/*
	 * Now take the metadata lock. Once we acquire it, any in-flight reclaim
	 * has drained and it is safe to free the metadata.
	 */
	lck_mtx_lock(&metadata->vdrm_lock);
	/* Drop our reference on the map */
	vm_map_deallocate(metadata->vdrm_map);
	zfree(vm_reclaim_metadata_zone, metadata);
}

static user_addr_t
get_head_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, head);
}

static user_addr_t
get_tail_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, tail);
}

static user_addr_t
get_busy_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, busy);
}
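
/*
 * Illustrative check (hypothetical, not in the upstream file): the helpers
 * above assume the userspace indices block is laid out as the v1 struct with
 * 64-bit head, tail, and busy counters.
 */
_Static_assert(sizeof(mach_vm_reclaim_indices_v1_t) >= 3 * sizeof(uint64_t),
    "mach_vm_reclaim_indices_v1_t is expected to hold three 64-bit indices");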

static void
reclaim_kill_with_reason(
	vm_deferred_reclamation_metadata_t metadata,
	unsigned reason,
	mach_exception_data_type_t subcode)
{
	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
	mach_exception_code_t code = 0;
	task_t task = metadata->vdrm_task;
	proc_t p = NULL;
	boolean_t fatal = TRUE;
	bool killing_self = false;
	pid_t pid;
	int err;

	if (panic_on_kill) {
		panic("vm_reclaim: About to kill %p due to %d with subcode %lld\n", task, reason, subcode);
	}

	EXC_GUARD_ENCODE_TYPE(code, guard_type);
	EXC_GUARD_ENCODE_FLAVOR(code, reason);
	EXC_GUARD_ENCODE_TARGET(code, 0);

	assert(metadata->vdrm_task != kernel_task);
	killing_self = task == current_task();
	if (!killing_self) {
		/*
		 * Grab a reference on the task to make sure it doesn't go away
		 * after we drop the metadata lock.
		 */
		task_reference(task);
	}
	/*
	 * We need to issue a wakeup in case this kill is coming from the async path.
	 * Once we drop the lock the caller can no longer do this wakeup, but
	 * if there's someone blocked on this reclaim they hold a map reference
	 * and thus need to be woken up so the map can be freed.
	 */
	thread_wakeup(&metadata->vdrm_async_list);
	lck_mtx_unlock(&metadata->vdrm_lock);

	if (reason == kGUARD_EXC_DEALLOC_GAP) {
		task_lock(task);
		fatal = (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL);
		task_unlock(task);
	}

	if (!fatal) {
		os_log_with_startup_serial(OS_LOG_DEFAULT,
		    "vm_reclaim: Skipping non-fatal guard exception.\n");
		goto out;
	}

	pid = task_pid(task);
	if (killing_self) {
		p = get_bsdtask_info(task);
	} else {
		p = proc_find(pid);
		if (p && proc_task(p) != task) {
			os_log_with_startup_serial(OS_LOG_DEFAULT,
			    "vm_reclaim: Unable to deliver guard exception because proc is gone & pid rolled over.\n");
			goto out;
		}

		task_deallocate(task);
		task = NULL;
	}

	if (!p) {
		os_log_with_startup_serial(OS_LOG_DEFAULT,
		    "vm_reclaim: Unable to deliver guard exception because task does not have a proc.\n");
		goto out;
	}

	err = exit_with_guard_exception(p, code, subcode);
	if (err != 0) {
		os_log_with_startup_serial(OS_LOG_DEFAULT, "vm_reclaim: Unable to deliver guard exception to %p: %d\n", p, err);
	}
out:
	if (!killing_self) {
		if (p) {
			proc_rele(p);
			p = NULL;
		}
		if (task) {
			task_deallocate(task);
			task = NULL;
		}
	}
}

static void
reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result)
{
	reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_COPYIO_FAILURE, result);
}

/*
 * Helper functions to do copyio on the head, tail, and busy pointers.
 * Note that the kernel will only write to the busy and head pointers.
 * Userspace is not supposed to write to the head or busy pointers, but the kernel
 * must be resilient to that kind of bug in userspace.
 */

static bool
reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t head_ptr = get_head_ptr(indices);

	result = copyin_atomic64(head_ptr, head);

	if (result != 0) {
		os_log_with_startup_serial(OS_LOG_DEFAULT,
		    "vm_reclaim: Unable to copy head ptr from 0x%llx: err=%d\n", head_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t tail_ptr = get_tail_ptr(indices);

	result = copyin_atomic64(tail_ptr, tail);

	if (result != 0) {
		os_log_with_startup_serial(OS_LOG_DEFAULT,
		    "vm_reclaim: Unable to copy tail ptr from 0x%llx: err=%d\n", tail_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t busy_ptr = get_busy_ptr(indices);

	result = copyin_atomic64(busy_ptr, busy);

	if (result != 0) {
		os_log_with_startup_serial(OS_LOG_DEFAULT,
		    "vm_reclaim: Unable to copy busy ptr from 0x%llx: err=%d\n", busy_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyout_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t busy_ptr = get_busy_ptr(indices);

	result = copyout_atomic64(value, busy_ptr);

	if (result != 0) {
		os_log_with_startup_serial(OS_LOG_DEFAULT,
		    "vm_reclaim: Unable to copy %llu to busy ptr at 0x%llx: err=%d\n", value, busy_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyout_head(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t head_ptr = get_head_ptr(indices);

	result = copyout_atomic64(value, head_ptr);

	if (result != 0) {
		os_log_with_startup_serial(OS_LOG_DEFAULT,
		    "vm_reclaim: Unable to copy %llu to head ptr at 0x%llx: err=%d\n", value, head_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

/*
 * Reclaim a chunk from the buffer.
 * Returns the number of entries reclaimed, 0 if there are no entries left
 * in the buffer, or kReclaimChunkFailed on failure (in which case the
 * metadata lock has been dropped).
 */
static size_t
reclaim_chunk(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED);

	int result = 0;
	size_t num_reclaimed = 0;
	uint64_t head = 0, tail = 0, busy = 0, num_to_reclaim = 0, new_tail = 0, num_copied = 0, buffer_len = 0;
	user_addr_t indices;
	vm_map_t map = metadata->vdrm_map, old_map;
	mach_vm_reclaim_entry_v1_t reclaim_entries[kReclaimChunkSize];
	bool success;

	buffer_len = metadata->vdrm_buffer_size / sizeof(mach_vm_reclaim_entry_v1_t);

	memset(reclaim_entries, 0, sizeof(reclaim_entries));

	indices = (user_addr_t) metadata->vdrm_reclaim_indices;
	old_map = vm_map_switch(map);

	success = reclaim_copyin_busy(metadata, &busy);
	if (!success) {
		goto fail;
	}
	success = reclaim_copyin_head(metadata, &head);
	if (!success) {
		goto fail;
	}
	success = reclaim_copyin_tail(metadata, &tail);
	if (!success) {
		goto fail;
	}

	if (busy != head) {
		// Userspace overwrote one of the pointers
		os_log_with_startup_serial(OS_LOG_DEFAULT,
		    "vm_reclaim: Userspace modified head or busy pointer! %llu (0x%llx) != %llu (0x%llx) tail = %llu (0x%llx)\n",
		    head, get_head_ptr(indices), busy, get_busy_ptr(indices), tail, get_tail_ptr(indices));
		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_INDEX_FAILURE, busy);
		goto fail;
	}

	if (tail < head) {
		os_log_with_startup_serial(OS_LOG_DEFAULT,
		    "vm_reclaim: Userspace modified head or tail pointer! %llu (0x%llx) != %llu (0x%llx) busy = %llu (0x%llx)\n",
		    head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy, get_busy_ptr(indices));
		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_INDEX_FAILURE, tail);
		goto fail;
	}

	num_to_reclaim = tail - head;
	while (true) {
		num_to_reclaim = MIN(num_to_reclaim, kReclaimChunkSize);
		if (num_to_reclaim == 0) {
			break;
		}
		busy = head + num_to_reclaim;
		success = reclaim_copyout_busy(metadata, busy);
		if (!success) {
			goto fail;
		}
		os_atomic_thread_fence(seq_cst);
		success = reclaim_copyin_tail(metadata, &new_tail);
		if (!success) {
			goto fail;
		}

		if (new_tail >= busy) {
			/* Got num_to_reclaim entries */
			break;
		}
		tail = new_tail;
		if (tail < head) {
			os_log_with_startup_serial(OS_LOG_DEFAULT,
			    "vm_reclaim: Userspace modified head or tail pointer! %llu (0x%llx) != %llu (0x%llx) busy = %llu (0x%llx)\n",
			    head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy, get_busy_ptr(indices));
			reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_INDEX_FAILURE, tail);
			goto fail;
		}
		/* Can't reclaim these entries. Try again */
		num_to_reclaim = tail - head;
		if (num_to_reclaim == 0) {
			/* Nothing left to reclaim. Reset busy to head. */
			success = reclaim_copyout_busy(metadata, head);
			if (!success) {
				goto fail;
			}
			break;
		}
		/*
		 * Note that num_to_reclaim must have gotten smaller since tail got smaller,
		 * so this loop is guaranteed to converge.
		 */
	}

	while (num_copied < num_to_reclaim) {
		uint64_t memcpy_start_idx = (head % buffer_len);
		uint64_t memcpy_end_idx = memcpy_start_idx + num_to_reclaim - num_copied;
		// Clamp the end idx to the buffer. We'll handle wrap-around on the next pass through the loop.
		memcpy_end_idx = MIN(memcpy_end_idx, buffer_len);
		uint64_t num_to_copy = memcpy_end_idx - memcpy_start_idx;

		assert(num_to_copy + num_copied <= kReclaimChunkSize);
		user_addr_t src_ptr = metadata->vdrm_reclaim_buffer + memcpy_start_idx * sizeof(mach_vm_reclaim_entry_v1_t);
		mach_vm_reclaim_entry_v1_t *dst_ptr = reclaim_entries + num_copied;

		result = copyin(src_ptr, dst_ptr, num_to_copy * sizeof(mach_vm_reclaim_entry_v1_t));

		if (result != 0) {
			os_log_with_startup_serial(OS_LOG_DEFAULT,
			    "vm_reclaim: Unable to copyin %llu entries in reclaim buffer at 0x%llx to 0x%llx: err=%d\n",
			    num_to_copy, src_ptr, (uint64_t) dst_ptr, result);
			reclaim_handle_copyio_error(metadata, result);
			goto fail;
		}

		num_copied += num_to_copy;
		head += num_to_copy;
	}

	for (size_t i = 0; i < num_to_reclaim; i++) {
		mach_vm_reclaim_entry_v1_t *entry = &reclaim_entries[i];
		if (entry->address != 0 && entry->size != 0) {
			kern_return_t kr = vm_map_remove_guard(map,
			    vm_map_trunc_page(entry->address,
			    VM_MAP_PAGE_MASK(map)),
			    vm_map_round_page(entry->address + entry->size,
			    VM_MAP_PAGE_MASK(map)),
			    VM_MAP_REMOVE_GAPS_FAIL,
			    KMEM_GUARD_NONE).kmr_return;
			if (kr == KERN_INVALID_VALUE) {
				reclaim_kill_with_reason(metadata, kGUARD_EXC_DEALLOC_GAP, entry->address);
				goto fail;
			} else if (kr != KERN_SUCCESS) {
				os_log_with_startup_serial(OS_LOG_DEFAULT,
				    "vm_reclaim: Unable to deallocate 0x%llx (%u) from 0x%llx. Err: %d\n",
				    entry->address, entry->size, (uint64_t) map, kr);
				reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
				goto fail;
			}
			num_reclaimed++;
			os_atomic_add(&metadata->vdrm_num_bytes_reclaimed, entry->size, relaxed);
		}
	}

	success = reclaim_copyout_head(metadata, head);
	if (!success) {
		goto fail;
	}

	vm_map_switch(old_map);
	return num_reclaimed;
fail:
	vm_map_switch(old_map);
	return kReclaimChunkFailed;
}
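
/*
 * Illustrative sketch (hypothetical helper, not in the upstream file) of the
 * ring arithmetic used by reclaim_chunk() above: head, busy, and tail grow
 * monotonically and are reduced modulo the ring size only when touching
 * buffer memory. E.g. with buffer_len = 8, head = 6, and 4 entries to copy,
 * the first pass copies slots 6-7 and the second pass wraps to slots 0-1.
 */
static inline uint64_t
reclaim_ring_slot_sketch(uint64_t logical_index, uint64_t buffer_len)
{
	/* buffer_len == vdrm_buffer_size / sizeof(mach_vm_reclaim_entry_v1_t) */
	return logical_index % buffer_len;
}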

/*
 * Attempts to reclaim until the buffer's estimated number of available bytes
 * is <= num_bytes_reclaimable_threshold.
 * The metadata buffer lock must be held by the caller.
 *
 * Returns the number of entries reclaimed, or kReclaimChunkFailed on failure
 * (in which case the metadata lock has been dropped).
 */
static size_t
reclaim_entries_from_buffer(vm_deferred_reclamation_metadata_t metadata, size_t num_bytes_reclaimable_threshold)
{
	assert(metadata != NULL);
	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED);
	if (!task_is_active(metadata->vdrm_task)) {
		/*
		 * If the task is exiting, the reclaim below would likely fail and fall
		 * through to the (slower) error path.
		 * As an optimization, bail out early and report nothing reclaimed.
		 */
		return 0;
	}

	size_t num_entries_reclaimed = 0, num_bytes_reclaimed, estimated_reclaimable_bytes, reclaimable_bytes;
	while (true) {
		size_t curr_entries_reclaimed = 0;
		num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed);
		reclaimable_bytes = os_atomic_load(&metadata->vdrm_num_bytes_put_in_buffer, relaxed);
		if (num_bytes_reclaimed > reclaimable_bytes) {
			estimated_reclaimable_bytes = 0;
		} else {
			estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed;
		}
		if (estimated_reclaimable_bytes <= num_bytes_reclaimable_threshold) {
			break;
		}
		curr_entries_reclaimed = reclaim_chunk(metadata);
		if (curr_entries_reclaimed == kReclaimChunkFailed) {
			return kReclaimChunkFailed;
		}
		if (curr_entries_reclaimed == 0) {
			break;
		}
		num_entries_reclaimed += curr_entries_reclaimed;
	}

	return num_entries_reclaimed;
}

/*
 * Get the reclamation metadata buffer for the given task.
 * If the buffer exists it is returned locked.
 */
static vm_deferred_reclamation_metadata_t
get_task_reclaim_metadata(task_t task)
{
	assert(task != NULL);
	vm_deferred_reclamation_metadata_t metadata = NULL;
	task_lock(task);
	metadata = task->deferred_reclamation_metadata;
	if (metadata != NULL) {
		lck_mtx_lock(&metadata->vdrm_lock);
	}
	task_unlock(task);
	return metadata;
}

kern_return_t
vm_deferred_reclamation_buffer_synchronize_internal(task_t task, size_t num_entries_to_reclaim)
{
	vm_deferred_reclamation_metadata_t metadata = NULL;
	size_t total_reclaimed = 0;

	if (!task_is_active(task)) {
		return KERN_FAILURE;
	}

	metadata = get_task_reclaim_metadata(task);
	if (metadata == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	while (total_reclaimed < num_entries_to_reclaim) {
		size_t num_reclaimed = reclaim_chunk(metadata);
		if (num_reclaimed == kReclaimChunkFailed) {
			/* Lock has already been released and the task is being killed. */
			return KERN_FAILURE;
		}
		if (num_reclaimed == 0) {
			/* There was nothing to reclaim; a reclamation thread must have beaten us to it. */
			break;
		}

		total_reclaimed += num_reclaimed;
	}
	lck_mtx_unlock(&metadata->vdrm_lock);

	return KERN_SUCCESS;
}

kern_return_t
vm_deferred_reclamation_buffer_update_reclaimable_bytes_internal(task_t task, size_t reclaimable_bytes)
{
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
	size_t num_bytes_reclaimed, estimated_reclaimable_bytes, num_bytes_in_buffer;
	bool success;
	if (metadata == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * The client is allowed to make this call in parallel from multiple threads.
	 * Ensure we only ever increase the value of vdrm_num_bytes_put_in_buffer.
	 * If the client's value is smaller than what we've stored, another thread
	 * raced ahead of them and we've already acted on that accounting, so this
	 * call should be a no-op.
	 */
	success = os_atomic_rmw_loop(&metadata->vdrm_num_bytes_put_in_buffer, num_bytes_in_buffer,
	    reclaimable_bytes, acquire,
	{
		if (num_bytes_in_buffer > reclaimable_bytes) {
			os_atomic_rmw_loop_give_up(break);
		}
	});
	if (!success) {
		/* Stale value. Nothing new to reclaim. */
		return KERN_SUCCESS;
	}
	num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed);

	if (reclaimable_bytes > num_bytes_reclaimed) {
		estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed;
		if (estimated_reclaimable_bytes > vm_reclaim_max_threshold) {
			lck_mtx_lock(&metadata->vdrm_lock);
			size_t num_reclaimed = reclaim_entries_from_buffer(metadata, vm_reclaim_max_threshold);
			if (num_reclaimed == kReclaimChunkFailed) {
				/* Lock has already been released & the task is in the process of getting killed. */
				return KERN_INVALID_ARGUMENT;
			}
			lck_mtx_unlock(&metadata->vdrm_lock);
		}
	}

	return KERN_SUCCESS;
}
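
/*
 * Worked example (illustrative, not in the upstream file): if two client
 * threads race to report cumulative totals of 96KB and 128KB, the rmw loop
 * above keeps the larger value. The 96KB caller loses the race, gives up,
 * and returns KERN_SUCCESS as a no-op, because that accounting has already
 * been acted upon.
 */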

static inline size_t
pick_reclaim_threshold(vm_deferred_reclamation_action_t action)
{
	switch (action) {
	case RECLAIM_FULL:
		return 0;
	case RECLAIM_TRIM:
		return vm_reclaim_max_threshold / vm_reclaim_trim_divisor;
	case RECLAIM_ASYNC:
		return 0;
	}
}
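
/*
 * Worked example (illustrative, not in the upstream file): with the default
 * tunables (vm_reclaim_max_threshold = PAGE_SIZE, vm_reclaim_trim_divisor = 2),
 * RECLAIM_TRIM reclaims a buffer down to an estimated PAGE_SIZE / 2
 * reclaimable bytes, while RECLAIM_FULL and RECLAIM_ASYNC reclaim to zero.
 */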

void
vm_deferred_reclamation_reclaim_memory(vm_deferred_reclamation_action_t action)
{
	if (action == RECLAIM_ASYNC) {
		lck_mtx_lock(&async_reclamation_buffers_lock);

		process_async_reclamation_list();
		lck_mtx_unlock(&async_reclamation_buffers_lock);
	} else {
		size_t reclaim_threshold = pick_reclaim_threshold(action);
		lck_mtx_lock(&reclamation_buffers_lock);
		reclamation_counter++;
		while (true) {
			vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&reclamation_buffers);
			if (metadata == NULL) {
				break;
			}
			lck_mtx_lock(&metadata->vdrm_lock);
			if (metadata->vdrm_reclaimed_at >= reclamation_counter) {
				// We've already seen this one. We're done.
				lck_mtx_unlock(&metadata->vdrm_lock);
				break;
			}
			metadata->vdrm_reclaimed_at = reclamation_counter;

			TAILQ_REMOVE(&reclamation_buffers, metadata, vdrm_list);
			TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
			lck_mtx_unlock(&reclamation_buffers_lock);

			size_t num_reclaimed = reclaim_entries_from_buffer(metadata, reclaim_threshold);
			if (num_reclaimed != kReclaimChunkFailed) {
				lck_mtx_unlock(&metadata->vdrm_lock);
			}

			lck_mtx_lock(&reclamation_buffers_lock);
		}
		lck_mtx_unlock(&reclamation_buffers_lock);
	}
}

void
vm_deferred_reclamation_reclaim_all_memory(void)
{
	vm_deferred_reclamation_reclaim_memory(RECLAIM_FULL);
}

bool
vm_deferred_reclamation_reclaim_from_task_async(task_t task)
{
	bool queued = false;
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;

	if (metadata != NULL) {
		lck_mtx_lock(&async_reclamation_buffers_lock);
		TAILQ_INSERT_TAIL(&async_reclamation_buffers, metadata, vdrm_async_list);
		lck_mtx_unlock(&async_reclamation_buffers_lock);
		queued = true;
		thread_wakeup(&vm_reclaim_thread);
	}

	return queued;
}

bool
vm_deferred_reclamation_reclaim_from_task_sync(task_t task, size_t max_entries_to_reclaim)
{
	size_t num_reclaimed = 0;
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;

	if (!task_is_active(task)) {
		return false;
	}

	if (metadata != NULL) {
		lck_mtx_lock(&metadata->vdrm_lock);
		while (num_reclaimed < max_entries_to_reclaim) {
			size_t num_reclaimed_now = reclaim_chunk(metadata);
			if (num_reclaimed_now == kReclaimChunkFailed) {
				/* Lock has already been released and task is being killed. */
				return false;
			}
			if (num_reclaimed_now == 0) {
				// Nothing left to reclaim
				break;
			}
			num_reclaimed += num_reclaimed_now;
		}
		lck_mtx_unlock(&metadata->vdrm_lock);
	}

	return num_reclaimed > 0;
}

vm_deferred_reclamation_metadata_t
vm_deferred_reclamation_buffer_fork(task_t task, vm_deferred_reclamation_metadata_t parent)
{
	LCK_MTX_ASSERT(&parent->vdrm_lock, LCK_MTX_ASSERT_OWNED);
	vm_map_t map = task->map;
	vm_deferred_reclamation_metadata_t metadata = NULL;

	vm_map_reference(map);
	assert(task->deferred_reclamation_metadata == NULL);
	metadata = metadata_init(task, map, parent->vdrm_reclaim_buffer, parent->vdrm_buffer_size, parent->vdrm_reclaim_indices);
	lck_mtx_unlock(&parent->vdrm_lock);

	lck_mtx_lock(&reclamation_buffers_lock);
	TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
	reclamation_buffers_length++;
	lck_mtx_unlock(&reclamation_buffers_lock);

	return metadata;
}

void
vm_deferred_reclamation_buffer_lock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_lock(&metadata->vdrm_lock);
}

void
vm_deferred_reclamation_buffer_unlock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_unlock(&metadata->vdrm_lock);
}


static void
reclaim_thread_init(void)
{
#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif
	thread_set_thread_name(current_thread(), "VM_reclaim");
}


static void
process_async_reclamation_list(void)
{
	LCK_MTX_ASSERT(&async_reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);

	vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&async_reclamation_buffers);
	while (metadata != NULL) {
		TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
		metadata->vdrm_async_list.tqe_next = NULL;
		metadata->vdrm_async_list.tqe_prev = NULL;
		lck_mtx_lock(&metadata->vdrm_lock);
		lck_mtx_unlock(&async_reclamation_buffers_lock);

		// NB: Currently the async reclaim thread fully reclaims the buffer.
		size_t num_reclaimed = reclaim_entries_from_buffer(metadata, 0);
		if (num_reclaimed == kReclaimChunkFailed) {
			/* Lock has already been released & task is in the process of getting killed. */
			goto next;
		}
		/* Wakeup anyone waiting on this buffer getting processed */
		thread_wakeup(&metadata->vdrm_async_list);
		assert(current_thread()->map == kernel_map);
		lck_mtx_unlock(&metadata->vdrm_lock);

next:
		lck_mtx_lock(&async_reclamation_buffers_lock);
		metadata = TAILQ_FIRST(&async_reclamation_buffers);
	}
}

__enum_decl(reclaim_thread_state, uint32_t, {
	RECLAIM_THREAD_INIT = 0,
	RECLAIM_THREAD_CONT = 1,
});

static void
reclaim_thread_continue(void)
{
	lck_mtx_lock(&async_reclamation_buffers_lock);

	process_async_reclamation_list();
	assert_wait(&vm_reclaim_thread, THREAD_UNINT);

	lck_mtx_unlock(&async_reclamation_buffers_lock);
}

void
reclaim_thread(void *param, wait_result_t wr __unused)
{
	if (param == (void *) RECLAIM_THREAD_INIT) {
		reclaim_thread_init();
	} else {
		assert(param == (void *) RECLAIM_THREAD_CONT);
	}

	reclaim_thread_continue();

	(void) thread_block_parameter(reclaim_thread, (void*) RECLAIM_THREAD_CONT);
}

__startup_func
static void
vm_deferred_reclamation_init(void)
{
	kern_return_t result;

	/* Default, overridable via the vm_reclaim_max_threshold boot-arg. */
	vm_reclaim_max_threshold = PAGE_SIZE;
	PE_parse_boot_argn("vm_reclaim_max_threshold", &vm_reclaim_max_threshold,
	    sizeof(vm_reclaim_max_threshold));

	result = kernel_thread_start_priority(reclaim_thread,
	    (void *)RECLAIM_THREAD_INIT, kReclaimThreadPriority,
	    &vm_reclaim_thread);
	if (result != KERN_SUCCESS) {
		panic("vm_reclaim: failed to start reclaim thread: %d", result);
	}
}

STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_deferred_reclamation_init);
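
/*
 * Illustrative usage note (not in the upstream file): the tunables in this
 * file can be set from boot-args, e.g.
 *     vm_reclaim_max_threshold=65536 vm_reclaim_chunk_size=32
 *     vm_reclaim_trim_divisor=4 vm_reclaim_panic_on_kill=1
 * vm_reclaim_panic_on_kill turns reclaim-triggered kills into panics for
 * debugging; see reclaim_kill_with_reason() above.
 */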

#if DEVELOPMENT || DEBUG

bool
vm_deferred_reclamation_block_until_pid_has_been_reclaimed(int pid)
{
	vm_deferred_reclamation_metadata_t metadata = NULL;
	proc_t p = proc_find(pid);
	vm_map_t map = NULL;
	if (p == NULL) {
		return false;
	}
	task_t t = proc_task(p);
	if (t == NULL) {
		proc_rele(p);
		return false;
	}

	task_lock(t);
	if (t->map) {
		metadata = t->deferred_reclamation_metadata;
		if (metadata != NULL) {
			map = t->map;
			vm_map_reference(t->map);
		}
	}
	task_unlock(t);
	proc_rele(p);
	if (metadata == NULL) {
		return false;
	}

	lck_mtx_lock(&async_reclamation_buffers_lock);
	while (metadata->vdrm_async_list.tqe_next != NULL || metadata->vdrm_async_list.tqe_prev != NULL) {
		assert_wait(&metadata->vdrm_async_list, THREAD_UNINT);
		lck_mtx_unlock(&async_reclamation_buffers_lock);
		thread_block(THREAD_CONTINUE_NULL);
		lck_mtx_lock(&async_reclamation_buffers_lock);
	}

	/*
	 * The async reclaim thread first removes the buffer from the list
	 * and then reclaims it (while holding its lock).
	 * So grab the metadata buffer's lock here to ensure the
	 * reclaim is done.
	 */
	lck_mtx_lock(&metadata->vdrm_lock);
	lck_mtx_unlock(&metadata->vdrm_lock);
	lck_mtx_unlock(&async_reclamation_buffers_lock);

	vm_map_deallocate(map);
	return true;
}

#endif /* DEVELOPMENT || DEBUG */