xref: /xnu-10002.1.13/osfmk/vm/vm_reclaim.c (revision 1031c584a5e37aff177559b9f69dbd3c8c3fd30a)
/*
 * Copyright (c) 2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/exc_guard.h>
#include <kern/locks.h>
#include <kern/task.h>
#include <kern/zalloc.h>
#include <kern/misc_protos.h>
#include <kern/startup.h>
#include <kern/sched.h>
#include <libkern/OSAtomic.h>
#include <mach/mach_types.h>
#include <mach/mach_vm.h>
#include <mach/vm_reclaim.h>
#include <os/log.h>
#include <pexpert/pexpert.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_reclaim_internal.h>
#include <sys/queue.h>
#include <os/atomic_private.h>

#pragma mark Tunables
TUNABLE(uint32_t, kReclaimChunkSize, "vm_reclaim_chunk_size", 16);
static integer_t kReclaimThreadPriority = BASEPRI_VM;
// Reclaim down to vm_reclaim_max_threshold / vm_reclaim_trim_divisor when doing a trim reclaim operation
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_trim_divisor, "vm_reclaim_trim_divisor", 2);
TUNABLE_DT_DEV_WRITEABLE(uint64_t, vm_reclaim_max_threshold, "/defaults", "kern.vm_reclaim_max_threshold", "vm_reclaim_max_threshold", 0, TUNABLE_DT_NONE);
// Used to debug vm_reclaim kills
TUNABLE(bool, panic_on_kill, "vm_reclaim_panic_on_kill", false);

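/*
 * Example (hypothetical values, assuming a kernel that honors boot-args):
 *
 *	vm_reclaim_chunk_size=32 vm_reclaim_trim_divisor=4 vm_reclaim_max_threshold=0x4000000
 *
 * Leaving vm_reclaim_max_threshold at 0 (the compiled-in default, absent a
 * device-tree override) keeps deferred reclamation disabled; buffer
 * registration then fails in vm_deferred_reclamation_buffer_init_internal().
 */
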
#pragma mark Declarations
typedef struct proc *proc_t;
extern char *proc_best_name(proc_t proc);
extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code, mach_exception_data_type_t subcode);
struct proc *proc_ref(struct proc *p, int locked);
int proc_rele(proc_t p);
static bool reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head);
static bool reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail);
static bool reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy);

struct vm_deferred_reclamation_metadata_s {
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_list; // Global list containing every reclamation buffer
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_async_list; // A list containing buffers that are ripe for reclamation
	decl_lck_mtx_data(, vdrm_lock); /* Held when reclaiming from the buffer */
	/*
	 * The task owns this structure but we maintain a backpointer here
	 * so that we can send an exception if we hit an error.
	 * Since this is a backpointer we don't hold a reference (it's a weak pointer).
	 */
	task_t vdrm_task;
	vm_map_t vdrm_map;
	user_addr_t vdrm_reclaim_buffer;
	mach_vm_size_t vdrm_buffer_size;
	user_addr_t vdrm_reclaim_indices;
	uint64_t vdrm_reclaimed_at;
	/*
	 * These two values are cumulative running sums of the bytes placed in the
	 * buffer and the bytes reclaimed out of the buffer, respectively. Both are
	 * in terms of virtual memory, so they give an upper bound on the amount of
	 * physical memory that can be reclaimed.
	 * To estimate the current amount of reclaimable VA in the buffer, compute
	 * vdrm_num_bytes_put_in_buffer - vdrm_num_bytes_reclaimed.
	 * Note that neither value is protected by the vdrm_lock.
	 */
	_Atomic size_t vdrm_num_bytes_put_in_buffer;
	_Atomic size_t vdrm_num_bytes_reclaimed;
};
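
/*
 * Worked example for the two counters above (illustrative numbers): if
 * userspace has appended entries totalling 10 MB of VA
 * (vdrm_num_bytes_put_in_buffer == 10 MB) and the kernel has so far
 * reclaimed 4 MB worth of entries (vdrm_num_bytes_reclaimed == 4 MB),
 * roughly 6 MB of reclaimable VA remains. Because both counters are relaxed
 * atomics read without vdrm_lock, the difference is only an estimate and
 * can transiently appear negative; readers clamp it to 0.
 */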
static void process_async_reclamation_list(void);

extern void *proc_find(int pid);
extern task_t proc_task(proc_t);

#pragma mark Globals
static KALLOC_TYPE_DEFINE(vm_reclaim_metadata_zone, struct vm_deferred_reclamation_metadata_s, KT_DEFAULT);
static LCK_GRP_DECLARE(vm_reclaim_lock_grp, "vm_reclaim");
static os_log_t vm_reclaim_log_handle;
static size_t kReclaimChunkFailed = UINT64_MAX;

/*
 * We maintain two lists of reclamation buffers.
 * The reclamation_buffers list contains every buffer in the system.
 * The async_reclamation_buffers list contains buffers that are ripe for reclamation.
 * Each list has its own lock.
 */
static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) reclamation_buffers = TAILQ_HEAD_INITIALIZER(reclamation_buffers);

static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) async_reclamation_buffers = TAILQ_HEAD_INITIALIZER(async_reclamation_buffers);
/*
 * The reclamation_buffers_lock protects the reclamation_buffers list.
 * It must be held when iterating over or manipulating the list.
 * It should be dropped when acting on a specific metadata entry after acquiring the vdrm_lock.
 */
LCK_MTX_DECLARE(reclamation_buffers_lock, &vm_reclaim_lock_grp);
LCK_MTX_DECLARE(async_reclamation_buffers_lock, &vm_reclaim_lock_grp);
static size_t reclamation_buffers_length;
static uint64_t reclamation_counter; // generation count for global reclaims

static SECURITY_READ_ONLY_LATE(thread_t) vm_reclaim_thread;
static void reclaim_thread(void *param __unused, wait_result_t wr __unused);

#pragma mark Implementation

static vm_deferred_reclamation_metadata_t
vmdr_metadata_alloc(
	task_t                  task,
	user_addr_t             buffer,
	mach_vm_size_t          size,
	user_addr_t             indices)
{
	vm_deferred_reclamation_metadata_t metadata;
	vm_map_t map = task->map;

	assert(!map->is_nested_map);

	metadata = zalloc_flags(vm_reclaim_metadata_zone, Z_WAITOK | Z_ZERO);
	lck_mtx_init(&metadata->vdrm_lock, &vm_reclaim_lock_grp, LCK_ATTR_NULL);
	metadata->vdrm_task = task;
	metadata->vdrm_map = map;
	metadata->vdrm_reclaim_buffer = buffer;
	metadata->vdrm_buffer_size = size;
	metadata->vdrm_reclaim_indices = indices;

	/*
	 * We do not need to hold a lock on `task` because this is called
	 * either at fork() time or from the context of current_task().
	 */
	vm_map_reference(map);
	return metadata;
}

static void
vmdr_metadata_free(vm_deferred_reclamation_metadata_t metadata)
{
	vm_map_deallocate(metadata->vdrm_map);
	lck_mtx_destroy(&metadata->vdrm_lock, &vm_reclaim_lock_grp);
	zfree(vm_reclaim_metadata_zone, metadata);
}

kern_return_t
vm_deferred_reclamation_buffer_init_internal(
	task_t                  task,
	mach_vm_offset_t        address,
	mach_vm_size_t          size,
	user_addr_t             indices)
{
	kern_return_t kr = KERN_FAILURE;
	vm_deferred_reclamation_metadata_t metadata = NULL;
	bool success;
	uint64_t head = 0, tail = 0, busy = 0;

	if (address == 0 || indices == 0 || size < 2 * sizeof(mach_vm_reclaim_entry_v1_t)) {
		return KERN_INVALID_ARGUMENT;
	}

	/* vm_reclaim is disabled */
	if (vm_reclaim_max_threshold == 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: failed to initialize vmdr buffer - reclaim is disabled (%llu)",
		    vm_reclaim_max_threshold);
		return KERN_FAILURE;
	}

	metadata = vmdr_metadata_alloc(task, address, size, indices);

	/*
	 * Validate the starting indices
	 */
	success = reclaim_copyin_busy(metadata, &busy);
	if (!success) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}
	success = reclaim_copyin_head(metadata, &head);
	if (!success) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}
	success = reclaim_copyin_tail(metadata, &tail);
	if (!success) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}
	if (head != 0 || tail != 0 || busy != 0) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	task_lock(task);
	if (task->deferred_reclamation_metadata != NULL) {
		/* Attempt to overwrite an existing reclaim buffer. This is not allowed. */
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: tried to overwrite existing reclaim buffer for task %p", task);
		kr = KERN_INVALID_ARGUMENT;
		task_unlock(task);
		goto out;
	}
	task->deferred_reclamation_metadata = metadata;

	task_unlock(task);
	lck_mtx_lock(&reclamation_buffers_lock);
	TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
	reclamation_buffers_length++;
	lck_mtx_unlock(&reclamation_buffers_lock);

	return KERN_SUCCESS;

out:
	vmdr_metadata_free(metadata);
	return kr;
}
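
/*
 * Illustrative sketch (not the authoritative definition, which lives in
 * <mach/vm_reclaim.h>): a newly registered ring is expected to have all
 * three indices zeroed, which is exactly what the validation above insists
 * on:
 *
 *	mach_vm_reclaim_indices_v1_t indices = { .head = 0, .tail = 0, .busy = 0 };
 *
 * Any other starting state is rejected with KERN_INVALID_ARGUMENT.
 */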

void
vm_deferred_reclamation_buffer_uninstall(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	/*
	 * First remove the buffer from the global list so no one else can get access to it.
	 */
	lck_mtx_lock(&reclamation_buffers_lock);
	TAILQ_REMOVE(&reclamation_buffers, metadata, vdrm_list);
	reclamation_buffers_length--;
	lck_mtx_unlock(&reclamation_buffers_lock);

	/*
	 * Now remove it from the async list (if present)
	 */
	lck_mtx_lock(&async_reclamation_buffers_lock);
	if (metadata->vdrm_async_list.tqe_next != NULL || metadata->vdrm_async_list.tqe_prev != NULL) {
		TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
		metadata->vdrm_async_list.tqe_next = NULL;
		metadata->vdrm_async_list.tqe_prev = NULL;
	}
	lck_mtx_unlock(&async_reclamation_buffers_lock);

	// A kernel thread may have grabbed the lock for this buffer before we had
	// a chance to remove it from the queues. Take the metadata lock to ensure
	// any such workers are finished operating on the buffer.
	lck_mtx_lock(&metadata->vdrm_lock);
	lck_mtx_unlock(&metadata->vdrm_lock);
}

void
vm_deferred_reclamation_buffer_deallocate(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	vmdr_metadata_free(metadata);
}

static user_addr_t
get_head_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, head);
}

static user_addr_t
get_tail_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, tail);
}

static user_addr_t
get_busy_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, busy);
}
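
/*
 * Example (assuming, for illustration only, that the v1 indices struct lays
 * out head, tail, and busy as three consecutive 64-bit fields): with the
 * userspace struct at 0x1000, the helpers above would yield
 *
 *	get_head_ptr(0x1000) == 0x1000
 *	get_tail_ptr(0x1000) == 0x1008
 *	get_busy_ptr(0x1000) == 0x1010
 *
 * The offsetof() arithmetic keeps these in sync with whatever layout
 * <mach/vm_reclaim.h> actually defines.
 */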

static void
reclaim_kill_with_reason(
	vm_deferred_reclamation_metadata_t metadata,
	unsigned reason,
	mach_exception_data_type_t subcode)
{
	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
	mach_exception_code_t code = 0;
	task_t task = metadata->vdrm_task;
	proc_t p = NULL;
	boolean_t fatal = TRUE;
	bool killing_self = false;
	pid_t pid;
	int err;

	if (panic_on_kill) {
		panic("vm_reclaim: About to kill %p due to %d with subcode %lld\n", task, reason, subcode);
	}

	EXC_GUARD_ENCODE_TYPE(code, guard_type);
	EXC_GUARD_ENCODE_FLAVOR(code, reason);
	EXC_GUARD_ENCODE_TARGET(code, 0);

	assert(metadata->vdrm_task != kernel_task);
	killing_self = task == current_task();
	if (!killing_self) {
		/*
		 * Grab a reference on the task to make sure it doesn't go away
		 * after we drop the metadata lock
		 */
		task_reference(task);
	}
	/*
	 * We need to issue a wakeup in case this kill is coming from the async path.
	 * Once we drop the lock the caller can no longer do this wakeup, but
	 * if there's someone blocked on this reclaim they hold a map reference
	 * and thus need to be woken up so the map can be freed.
	 */
	thread_wakeup(&metadata->vdrm_async_list);
	lck_mtx_unlock(&metadata->vdrm_lock);

	if (reason == kGUARD_EXC_DEALLOC_GAP) {
		task_lock(task);
		fatal = (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL);
		task_unlock(task);
	}

	if (!fatal) {
		os_log_info(vm_reclaim_log_handle,
		    "vm_reclaim: Skipping non-fatal guard exception.\n");
		goto out;
	}

	pid = task_pid(task);
	if (killing_self) {
		p = get_bsdtask_info(task);
	} else {
		p = proc_find(pid);
		if (p && proc_task(p) != task) {
			os_log_error(vm_reclaim_log_handle,
			    "vm_reclaim: Unable to deliver guard exception because proc is gone & pid rolled over.\n");
			goto out;
		}

		task_deallocate(task);
		task = NULL;
	}

	if (!p) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to deliver guard exception because task does not have a proc.\n");
		goto out;
	}

	err = exit_with_guard_exception(p, code, subcode);
	if (err != 0) {
		os_log_error(vm_reclaim_log_handle, "vm_reclaim: Unable to deliver guard exception to %p: %d\n", p, err);
	}
out:
	if (!killing_self) {
		if (p) {
			proc_rele(p);
			p = NULL;
		}
		if (task) {
			task_deallocate(task);
			task = NULL;
		}
	}
}

static void
reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result)
{
	reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_COPYIO_FAILURE, result);
}

/*
 * Helper functions to do copyio on the head, tail, and busy pointers.
 * Note that the kernel will only write to the busy and head pointers.
 * Userspace is not supposed to write to the head or busy pointers, but the kernel
 * must be resilient to that kind of bug in userspace.
 */


static bool
reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t head_ptr = get_head_ptr(indices);

	result = copyin_atomic64(head_ptr, head);

	if (result != 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy head ptr from 0x%llx: err=%d\n", head_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t tail_ptr = get_tail_ptr(indices);

	result = copyin_atomic64(tail_ptr, tail);

	if (result != 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy tail ptr from 0x%llx: err=%d\n", tail_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t busy_ptr = get_busy_ptr(indices);

	result = copyin_atomic64(busy_ptr, busy);

	if (result != 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy busy ptr from 0x%llx: err=%d\n", busy_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyout_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t busy_ptr = get_busy_ptr(indices);

	result = copyout_atomic64(value, busy_ptr);

	if (result != 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy %llu to busy ptr at 0x%llx: err=%d\n", value, busy_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyout_head(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t head_ptr = get_head_ptr(indices);

	result = copyout_atomic64(value, head_ptr);

	if (result != 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy %llu to head ptr at 0x%llx: err=%d\n", value, head_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

/*
 * Reclaim a chunk from the buffer.
 * Returns the number of entries reclaimed, or 0 if there are no entries left in the buffer.
 */
static size_t
reclaim_chunk(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED);

	int result = 0;
	size_t num_reclaimed = 0;
	uint64_t head = 0, tail = 0, busy = 0, num_to_reclaim = 0, new_tail = 0, num_copied = 0, buffer_len = 0;
	user_addr_t indices;
	vm_map_t map = metadata->vdrm_map, old_map;
	mach_vm_reclaim_entry_v1_t reclaim_entries[kReclaimChunkSize];
	bool success;

	buffer_len = metadata->vdrm_buffer_size / sizeof(mach_vm_reclaim_entry_v1_t);

	memset(reclaim_entries, 0, sizeof(reclaim_entries));

	indices = (user_addr_t) metadata->vdrm_reclaim_indices;
	old_map = vm_map_switch(map);

	success = reclaim_copyin_busy(metadata, &busy);
	if (!success) {
		goto fail;
	}
	success = reclaim_copyin_head(metadata, &head);
	if (!success) {
		goto fail;
	}
	success = reclaim_copyin_tail(metadata, &tail);
	if (!success) {
		goto fail;
	}

	if (busy != head) {
		// Userspace overwrote one of the pointers
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Userspace modified head or busy pointer! head: %llu (0x%llx) != busy: %llu (0x%llx) | tail = %llu (0x%llx)\n",
		    head, get_head_ptr(indices), busy, get_busy_ptr(indices), tail, get_tail_ptr(indices));
		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_INDEX_FAILURE, busy);
		goto fail;
	}

	if (tail < head) {
		// Userspace is likely in the middle of trying to re-use an entry; bail on this reclamation
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Userspace modified head or tail pointer! head: %llu (0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
		    head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy, get_busy_ptr(indices));
		lck_mtx_unlock(&metadata->vdrm_lock);
		goto fail;
	}

	num_to_reclaim = tail - head;
	while (true) {
		num_to_reclaim = MIN(num_to_reclaim, kReclaimChunkSize);
		if (num_to_reclaim == 0) {
			break;
		}
		busy = head + num_to_reclaim;
		success = reclaim_copyout_busy(metadata, busy);
		if (!success) {
			goto fail;
		}
		os_atomic_thread_fence(seq_cst);
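		/*
		 * Explanatory note: the copyout of busy above, the seq_cst fence,
		 * and the re-read of tail below form a handshake with userspace.
		 * The kernel publishes how far it intends to reclaim (busy) before
		 * checking whether userspace has concurrently moved tail backwards
		 * to re-use entries; either userspace observes the new busy value,
		 * or the kernel observes the shrunken tail and retries with fewer
		 * entries.
		 */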
		success = reclaim_copyin_tail(metadata, &new_tail);
		if (!success) {
			goto fail;
		}

		if (new_tail >= busy) {
			/* Got num_to_reclaim entries */
			break;
		}
		tail = new_tail;
		if (tail < head) {
			// Userspace is likely in the middle of trying to re-use an entry; bail on this reclamation
			os_log_error(vm_reclaim_log_handle,
			    "vm_reclaim: Userspace modified head or tail pointer! head: %llu (0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
			    head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy, get_busy_ptr(indices));
			lck_mtx_unlock(&metadata->vdrm_lock);
			goto fail;
		}
		/* Can't reclaim these entries. Try again */
		num_to_reclaim = tail - head;
		if (num_to_reclaim == 0) {
			/* Nothing left to reclaim. Reset busy to head. */
			success = reclaim_copyout_busy(metadata, head);
			if (!success) {
				goto fail;
			}
			break;
		}
		/*
		 * Note that num_to_reclaim must have gotten smaller since tail got smaller,
		 * so this is guaranteed to converge.
		 */
	}

	while (num_copied < num_to_reclaim) {
		uint64_t memcpy_start_idx = (head % buffer_len);
		uint64_t memcpy_end_idx = memcpy_start_idx + num_to_reclaim - num_copied;
		// Clamp the end idx to the buffer. We'll handle wrap-around in our next go around the loop.
		memcpy_end_idx = MIN(memcpy_end_idx, buffer_len);
		uint64_t num_to_copy = memcpy_end_idx - memcpy_start_idx;

		assert(num_to_copy + num_copied <= kReclaimChunkSize);
		user_addr_t src_ptr = metadata->vdrm_reclaim_buffer + memcpy_start_idx * sizeof(mach_vm_reclaim_entry_v1_t);
		mach_vm_reclaim_entry_v1_t *dst_ptr = reclaim_entries + num_copied;

		result = copyin(src_ptr, dst_ptr, num_to_copy * sizeof(mach_vm_reclaim_entry_v1_t));

		if (result != 0) {
			os_log_error(vm_reclaim_log_handle,
			    "vm_reclaim: Unable to copyin %llu entries in reclaim buffer at 0x%llx to 0x%llx: err=%d\n",
			    num_to_copy, src_ptr, (uint64_t) dst_ptr, result);
			reclaim_handle_copyio_error(metadata, result);
			goto fail;
		}

		num_copied += num_to_copy;
		head += num_to_copy;
	}

	for (size_t i = 0; i < num_to_reclaim; i++) {
		mach_vm_reclaim_entry_v1_t *entry = &reclaim_entries[i];
		if (entry->address != 0 && entry->size != 0) {
			kern_return_t kr = vm_map_remove_guard(map,
			    vm_map_trunc_page(entry->address,
			    VM_MAP_PAGE_MASK(map)),
			    vm_map_round_page(entry->address + entry->size,
			    VM_MAP_PAGE_MASK(map)),
			    VM_MAP_REMOVE_GAPS_FAIL,
			    KMEM_GUARD_NONE).kmr_return;
			if (kr == KERN_INVALID_VALUE) {
				reclaim_kill_with_reason(metadata, kGUARD_EXC_DEALLOC_GAP, entry->address);
				goto fail;
			} else if (kr != KERN_SUCCESS) {
				os_log_error(vm_reclaim_log_handle,
				    "vm_reclaim: Unable to deallocate 0x%llx (%u) from 0x%llx. Err: %d\n",
				    entry->address, entry->size, (uint64_t) map, kr);
				reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
				goto fail;
			}
			num_reclaimed++;
			os_atomic_add(&metadata->vdrm_num_bytes_reclaimed, entry->size, relaxed);
		}
	}

	success = reclaim_copyout_head(metadata, head);
	if (!success) {
		goto fail;
	}

	vm_map_switch(old_map);
	return num_reclaimed;
fail:
	vm_map_switch(old_map);
	return kReclaimChunkFailed;
}
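
/*
 * Worked example of the negotiation loop above (illustrative numbers, with
 * kReclaimChunkSize == 16): suppose head == busy == 5 and tail == 40. The
 * kernel caps the chunk at 16 entries and publishes busy = 21. If userspace
 * has meanwhile pulled tail back to 18 to re-use entries, the re-read sees
 * new_tail (18) < busy (21), so the kernel shrinks the chunk to
 * tail - head == 13 entries and tries again; had tail dropped all the way to
 * head, busy would be reset to head and nothing reclaimed this pass.
 */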

/*
 * Attempts to reclaim until the buffer's estimated number of available bytes
 * is <= num_bytes_reclaimable_threshold. The metadata buffer lock should be
 * held by the caller.
 *
 * Returns the number of entries reclaimed.
 */
static size_t
reclaim_entries_from_buffer(vm_deferred_reclamation_metadata_t metadata, size_t num_bytes_reclaimable_threshold)
{
	assert(metadata != NULL);
	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED);
	if (!task_is_active(metadata->vdrm_task)) {
		/*
		 * If the task is exiting, the reclaim below will likely fail and fall through
		 * to the (slower) error path.
		 * So as an optimization, we bail out early here.
		 */
		return 0;
	}

	size_t num_entries_reclaimed = 0, num_bytes_reclaimed, estimated_reclaimable_bytes, reclaimable_bytes;
	while (true) {
		size_t curr_entries_reclaimed = 0;
		num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed);
		reclaimable_bytes = os_atomic_load(&metadata->vdrm_num_bytes_put_in_buffer, relaxed);
		if (num_bytes_reclaimed > reclaimable_bytes) {
			estimated_reclaimable_bytes = 0;
		} else {
			estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed;
		}
		if (estimated_reclaimable_bytes <= num_bytes_reclaimable_threshold) {
			break;
		}
		curr_entries_reclaimed = reclaim_chunk(metadata);
		if (curr_entries_reclaimed == kReclaimChunkFailed) {
			return kReclaimChunkFailed;
		}
		if (curr_entries_reclaimed == 0) {
			break;
		}
		num_entries_reclaimed += curr_entries_reclaimed;
	}

	return num_entries_reclaimed;
}
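
/*
 * Example (illustrative numbers): with vdrm_num_bytes_put_in_buffer == 96 MB,
 * vdrm_num_bytes_reclaimed == 32 MB, and a threshold of 16 MB, the estimated
 * reclaimable VA is 64 MB, so the loop above keeps reclaiming chunks of up to
 * kReclaimChunkSize entries until the estimate drops to 16 MB or the buffer
 * runs dry. A threshold of 0 requests a full drain.
 */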

/*
 * Get the reclamation metadata buffer for the given task.
 * If the buffer exists it is returned locked.
 */
static vm_deferred_reclamation_metadata_t
get_task_reclaim_metadata(task_t task)
{
	assert(task != NULL);
	vm_deferred_reclamation_metadata_t metadata = NULL;
	task_lock(task);
	metadata = task->deferred_reclamation_metadata;
	if (metadata != NULL) {
		lck_mtx_lock(&metadata->vdrm_lock);
	}
	task_unlock(task);
	return metadata;
}

kern_return_t
vm_deferred_reclamation_buffer_synchronize_internal(task_t task, size_t num_entries_to_reclaim)
{
	vm_deferred_reclamation_metadata_t metadata = NULL;
	size_t total_reclaimed = 0;

	if (!task_is_active(task)) {
		return KERN_FAILURE;
	}

	metadata = get_task_reclaim_metadata(task);
	if (metadata == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	while (total_reclaimed < num_entries_to_reclaim) {
		size_t num_reclaimed = reclaim_chunk(metadata);
		if (num_reclaimed == kReclaimChunkFailed) {
			/* Lock has already been released and the task is being killed. */
			return KERN_FAILURE;
		}
		if (num_reclaimed == 0) {
			/* There was nothing to reclaim. A reclamation thread must have beaten us to it. */
			break;
		}

		total_reclaimed += num_reclaimed;
	}
	lck_mtx_unlock(&metadata->vdrm_lock);

	return KERN_SUCCESS;
}

kern_return_t
vm_deferred_reclamation_buffer_update_reclaimable_bytes_internal(task_t task, size_t reclaimable_bytes)
{
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
	size_t num_bytes_reclaimed, estimated_reclaimable_bytes, num_bytes_in_buffer;
	bool success;
	if (metadata == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * The client is allowed to make this call in parallel from multiple threads.
	 * Ensure we only ever increase the value of vdrm_num_bytes_put_in_buffer.
	 * If the client's value is smaller than what we've stored, another thread
	 * raced ahead of them and we've already acted on that accounting, so this
	 * call should be a no-op.
	 */
	success = os_atomic_rmw_loop(&metadata->vdrm_num_bytes_put_in_buffer, num_bytes_in_buffer,
	    reclaimable_bytes, acquire,
	{
		if (num_bytes_in_buffer > reclaimable_bytes) {
		        os_atomic_rmw_loop_give_up(break);
		}
	});
	if (!success) {
		/* Stale value. Nothing new to reclaim */
		return KERN_SUCCESS;
	}
	num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed);

	if (reclaimable_bytes > num_bytes_reclaimed) {
		estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed;
		if (estimated_reclaimable_bytes > vm_reclaim_max_threshold) {
			lck_mtx_lock(&metadata->vdrm_lock);
			size_t num_reclaimed = reclaim_entries_from_buffer(metadata, vm_reclaim_max_threshold);
			if (num_reclaimed == kReclaimChunkFailed) {
				/* Lock has already been released & the task is in the process of getting killed. */
				return KERN_INVALID_ARGUMENT;
			}
			lck_mtx_unlock(&metadata->vdrm_lock);
		}
	}

	return KERN_SUCCESS;
}
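
/*
 * Example of the monotonic update above (illustrative numbers): two threads
 * race to report buffer sizes of 8 MB and 6 MB. If the 8 MB update lands
 * first, the 6 MB caller loses the rmw loop and its call becomes a no-op, so
 * vdrm_num_bytes_put_in_buffer only ever moves forward. Reclaim is then
 * kicked off only when the estimated reclaimable VA exceeds
 * vm_reclaim_max_threshold.
 */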

static inline size_t
pick_reclaim_threshold(vm_deferred_reclamation_action_t action)
{
	switch (action) {
	case RECLAIM_FULL:
		return 0;
	case RECLAIM_TRIM:
		return vm_reclaim_max_threshold / vm_reclaim_trim_divisor;
	case RECLAIM_ASYNC:
		return 0;
	}
}
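
/*
 * With the default tunables (vm_reclaim_trim_divisor == 2), RECLAIM_TRIM
 * reclaims a buffer down to half of vm_reclaim_max_threshold, while
 * RECLAIM_FULL and RECLAIM_ASYNC use a threshold of 0 and drain the buffer
 * completely. Note that vm_reclaim_trim_divisor is dev-writeable and is
 * assumed here to stay nonzero; a value of 0 would divide by zero above.
 */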

void
vm_deferred_reclamation_reclaim_memory(vm_deferred_reclamation_action_t action)
{
	if (action == RECLAIM_ASYNC) {
		lck_mtx_lock(&async_reclamation_buffers_lock);

		process_async_reclamation_list();
		lck_mtx_unlock(&async_reclamation_buffers_lock);
	} else {
		size_t reclaim_threshold = pick_reclaim_threshold(action);
		lck_mtx_lock(&reclamation_buffers_lock);
		reclamation_counter++;
		while (true) {
			vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&reclamation_buffers);
			if (metadata == NULL) {
				break;
			}
			lck_mtx_lock(&metadata->vdrm_lock);
			if (metadata->vdrm_reclaimed_at >= reclamation_counter) {
				// We've already seen this one. We're done.
				lck_mtx_unlock(&metadata->vdrm_lock);
				break;
			}
			metadata->vdrm_reclaimed_at = reclamation_counter;

			TAILQ_REMOVE(&reclamation_buffers, metadata, vdrm_list);
			TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
			lck_mtx_unlock(&reclamation_buffers_lock);

			size_t num_reclaimed = reclaim_entries_from_buffer(metadata, reclaim_threshold);
			if (num_reclaimed != kReclaimChunkFailed) {
				lck_mtx_unlock(&metadata->vdrm_lock);
			}

			lck_mtx_lock(&reclamation_buffers_lock);
		}
		lck_mtx_unlock(&reclamation_buffers_lock);
	}
}
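
/*
 * Note on the traversal above: bumping reclamation_counter once per global
 * pass and stamping each visited buffer with it turns the list into a
 * round-robin queue. Each visited buffer is moved to the tail, so the pass
 * terminates when it reaches a buffer already stamped with the current
 * generation, even though the list lock is dropped while each buffer is
 * being reclaimed from.
 */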

void
vm_deferred_reclamation_reclaim_all_memory(void)
{
	vm_deferred_reclamation_reclaim_memory(RECLAIM_FULL);
}

bool
vm_deferred_reclamation_reclaim_from_task_async(task_t task)
{
	bool queued = false;
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;

	if (metadata != NULL) {
		lck_mtx_lock(&async_reclamation_buffers_lock);
		if (metadata->vdrm_async_list.tqe_next != NULL ||
		    metadata->vdrm_async_list.tqe_prev != NULL) {
			// Already on the async list; remove it so it can be re-queued at the tail
			TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
		}
		TAILQ_INSERT_TAIL(&async_reclamation_buffers, metadata, vdrm_async_list);
		lck_mtx_unlock(&async_reclamation_buffers_lock);
		queued = true;
		thread_wakeup(&vm_reclaim_thread);
	}

	return queued;
}

bool
vm_deferred_reclamation_reclaim_from_task_sync(task_t task, size_t max_entries_to_reclaim)
{
	size_t num_reclaimed = 0;
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;

	if (!task_is_active(task)) {
		return false;
	}

	if (metadata != NULL) {
		lck_mtx_lock(&metadata->vdrm_lock);
		while (num_reclaimed < max_entries_to_reclaim) {
			size_t num_reclaimed_now = reclaim_chunk(metadata);
			if (num_reclaimed_now == kReclaimChunkFailed) {
				/* Lock has already been released and the task is being killed. */
				return false;
			}
			if (num_reclaimed_now == 0) {
				// Nothing left to reclaim
				break;
			}
			num_reclaimed += num_reclaimed_now;
		}
		lck_mtx_unlock(&metadata->vdrm_lock);
	}

	return num_reclaimed > 0;
}

vm_deferred_reclamation_metadata_t
vm_deferred_reclamation_buffer_fork(task_t task, vm_deferred_reclamation_metadata_t parent)
{
	vm_deferred_reclamation_metadata_t metadata = NULL;

	LCK_MTX_ASSERT(&parent->vdrm_lock, LCK_MTX_ASSERT_OWNED);

	assert(task->deferred_reclamation_metadata == NULL);
	metadata = vmdr_metadata_alloc(task, parent->vdrm_reclaim_buffer,
	    parent->vdrm_buffer_size, parent->vdrm_reclaim_indices);
	lck_mtx_unlock(&parent->vdrm_lock);

	lck_mtx_lock(&reclamation_buffers_lock);
	TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
	reclamation_buffers_length++;
	lck_mtx_unlock(&reclamation_buffers_lock);

	return metadata;
}

void
vm_deferred_reclamation_buffer_lock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_lock(&metadata->vdrm_lock);
}

void
vm_deferred_reclamation_buffer_unlock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_unlock(&metadata->vdrm_lock);
}


static void
reclaim_thread_init(void)
{
#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif
	thread_set_thread_name(current_thread(), "VM_reclaim");
}


static void
process_async_reclamation_list(void)
{
	LCK_MTX_ASSERT(&async_reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);

	vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&async_reclamation_buffers);
	while (metadata != NULL) {
		TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
		metadata->vdrm_async_list.tqe_next = NULL;
		metadata->vdrm_async_list.tqe_prev = NULL;
		lck_mtx_lock(&metadata->vdrm_lock);
		lck_mtx_unlock(&async_reclamation_buffers_lock);

		// NB: Currently the async reclaim thread fully reclaims the buffer.
		size_t num_reclaimed = reclaim_entries_from_buffer(metadata, 0);
		if (num_reclaimed == kReclaimChunkFailed) {
			/* Lock has already been released & the task is in the process of getting killed. */
			goto next;
		}
		/* Wakeup anyone waiting on this buffer getting processed */
		thread_wakeup(&metadata->vdrm_async_list);
		assert(current_thread()->map == kernel_map);
		lck_mtx_unlock(&metadata->vdrm_lock);

next:
		lck_mtx_lock(&async_reclamation_buffers_lock);
		metadata = TAILQ_FIRST(&async_reclamation_buffers);
	}
}

__enum_decl(reclaim_thread_state, uint32_t, {
	RECLAIM_THREAD_INIT = 0,
	RECLAIM_THREAD_CONT = 1,
});

static void
reclaim_thread_continue(void)
{
	lck_mtx_lock(&async_reclamation_buffers_lock);

	process_async_reclamation_list();
	assert_wait(&vm_reclaim_thread, THREAD_UNINT);

	lck_mtx_unlock(&async_reclamation_buffers_lock);
}

void
reclaim_thread(void *param, wait_result_t wr __unused)
{
	if (param == (void *) RECLAIM_THREAD_INIT) {
		reclaim_thread_init();
	} else {
		assert(param == (void *) RECLAIM_THREAD_CONT);
	}

	reclaim_thread_continue();

	(void) thread_block_parameter(reclaim_thread, (void*) RECLAIM_THREAD_CONT);
}
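
/*
 * Design note: reclaim_thread never returns. It drains the async list, parks
 * itself with assert_wait(), and blocks with itself as the continuation, so
 * it keeps no stack state across blocks. Each wakeup (e.g. from
 * vm_deferred_reclamation_reclaim_from_task_async) re-enters reclaim_thread
 * with RECLAIM_THREAD_CONT and repeats the cycle.
 */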

__startup_func
static void
vm_deferred_reclamation_init(void)
{
	// Note: no-op pending rdar://27006343 (Custom kernel log handles)
	vm_reclaim_log_handle = os_log_create("com.apple.mach.vm", "reclaim");

	(void)kernel_thread_start_priority(reclaim_thread,
	    (void *)RECLAIM_THREAD_INIT, kReclaimThreadPriority,
	    &vm_reclaim_thread);
}

STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_deferred_reclamation_init);

#if DEVELOPMENT || DEBUG

bool
vm_deferred_reclamation_block_until_pid_has_been_reclaimed(int pid)
{
	vm_deferred_reclamation_metadata_t metadata = NULL;
	proc_t p = proc_find(pid);
	vm_map_t map = NULL;
	if (p == NULL) {
		return false;
	}
	task_t t = proc_task(p);
	if (t == NULL) {
		proc_rele(p);
		return false;
	}

	task_lock(t);
	if (t->map) {
		metadata = t->deferred_reclamation_metadata;
		if (metadata != NULL) {
			map = t->map;
			vm_map_reference(t->map);
		}
	}
	task_unlock(t);
	proc_rele(p);
	if (metadata == NULL) {
		return false;
	}

	lck_mtx_lock(&async_reclamation_buffers_lock);
	while (metadata->vdrm_async_list.tqe_next != NULL || metadata->vdrm_async_list.tqe_prev != NULL) {
		assert_wait(&metadata->vdrm_async_list, THREAD_UNINT);
		lck_mtx_unlock(&async_reclamation_buffers_lock);
		thread_block(THREAD_CONTINUE_NULL);
		lck_mtx_lock(&async_reclamation_buffers_lock);
	}

	/*
	 * The async reclaim thread first removes the buffer from the list
	 * and then reclaims it (while holding its lock).
	 * So grab the metadata buffer's lock here to ensure the
	 * reclaim is done.
	 */
	lck_mtx_lock(&metadata->vdrm_lock);
	lck_mtx_unlock(&metadata->vdrm_lock);
	lck_mtx_unlock(&async_reclamation_buffers_lock);

	vm_map_deallocate(map);
	return true;
}

#endif /* DEVELOPMENT || DEBUG */