xref: /xnu-10002.41.9/osfmk/vm/vm_reclaim.c (revision 699cd48037512bf4380799317ca44ca453c82f57)
/*
 * Copyright (c) 2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/exc_guard.h>
#include <kern/locks.h>
#include <kern/task.h>
#include <kern/zalloc.h>
#include <kern/misc_protos.h>
#include <kern/startup.h>
#include <kern/sched.h>
#include <libkern/OSAtomic.h>
#include <mach/kern_return.h>
#include <mach/mach_types.h>
#include <mach/mach_vm.h>
#include <mach/vm_reclaim.h>
#include <os/log.h>
#include <pexpert/pexpert.h>
#include <vm/vm_map.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_reclaim_internal.h>
#include <sys/queue.h>
#include <os/atomic_private.h>

#pragma mark Tunables
TUNABLE(uint32_t, kReclaimChunkSize, "vm_reclaim_chunk_size", 16);
static integer_t kReclaimThreadPriority = BASEPRI_VM;
// Reclaim down to vm_reclaim_max_threshold / vm_reclaim_trim_divisor when doing a trim reclaim operation
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_trim_divisor, "vm_reclaim_trim_divisor", 2);
TUNABLE_DT_DEV_WRITEABLE(uint64_t, vm_reclaim_max_threshold, "/defaults", "kern.vm_reclaim_max_threshold", "vm_reclaim_max_threshold", 0, TUNABLE_DT_NONE);
// Used to debug vm_reclaim kills
TUNABLE(bool, panic_on_kill, "vm_reclaim_panic_on_kill", false);

#pragma mark Declarations
typedef struct proc *proc_t;
extern char *proc_best_name(proc_t proc);
extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code, mach_exception_data_type_t subcode);
struct proc *proc_ref(struct proc *p, int locked);
int proc_rele(proc_t p);
static bool reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head);
static bool reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail);
static bool reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy);

struct vm_deferred_reclamation_metadata_s {
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_list; // Global list containing every reclamation buffer
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_async_list; // A list containing buffers that are ripe for reclamation
	decl_lck_mtx_data(, vdrm_lock); /* Held when reclaiming from the buffer */
	/*
	 * The task owns this structure but we maintain a backpointer here
	 * so that we can send an exception if we hit an error.
	 * Since this is a backpointer we don't hold a reference (it's a weak pointer).
	 */
	task_t vdrm_task;
	vm_map_t vdrm_map;
	user_addr_t vdrm_reclaim_buffer;
	mach_vm_size_t vdrm_buffer_size;
	user_addr_t vdrm_reclaim_indices;
	uint64_t vdrm_reclaimed_at;
	/*
	 * These two values are cumulative running sums of bytes placed in the buffer
	 * and bytes reclaimed out of the buffer. Both values are in terms of virtual
	 * memory, so they give an upper bound on the amount of physical memory that
	 * can be reclaimed.
	 * To get an estimate of the current amount of VA in the buffer, compute
	 * vdrm_num_bytes_put_in_buffer - vdrm_num_bytes_reclaimed.
	 * Note that neither value is protected by the vdrm_lock.
	 */
	_Atomic size_t vdrm_num_bytes_put_in_buffer;
	_Atomic size_t vdrm_num_bytes_reclaimed;
};
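/*
 * Example (illustrative): if userspace has placed 8 MB of VA in the buffer
 * over its lifetime and the kernel has reclaimed 5 MB of it, the estimated
 * VA still sitting in the buffer is
 *   vdrm_num_bytes_put_in_buffer - vdrm_num_bytes_reclaimed = 8 MB - 5 MB = 3 MB.
 * Because the counters are read without synchronization, the estimate can
 * transiently come out "negative"; callers such as
 * reclaim_entries_from_buffer() clamp it to 0 in that case.
 */
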
static void process_async_reclamation_list(void);

extern void *proc_find(int pid);
extern task_t proc_task(proc_t);

#pragma mark Globals
static KALLOC_TYPE_DEFINE(vm_reclaim_metadata_zone, struct vm_deferred_reclamation_metadata_s, KT_DEFAULT);
static LCK_GRP_DECLARE(vm_reclaim_lock_grp, "vm_reclaim");
static os_log_t vm_reclaim_log_handle;
static size_t kReclaimChunkFailed = SIZE_MAX; // sentinel returned by reclaim_chunk() on failure

/*
 * The ringbuffer must contain at least 2 entries to distinguish between empty
 * (head == tail) and full (head == tail + 1).
 */
#define BUFFER_MIN_ENTRY_COUNT 2
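
/*
 * A minimal sketch of the index math the comment above implies (illustrative
 * only; assumes modular indexing into a ring of len entries; the real indices
 * live in userspace-shared memory and are free-running):
 */
#if 0
static inline bool
ring_is_empty(uint64_t head, uint64_t tail)
{
	return head == tail;
}

static inline bool
ring_is_full(uint64_t head, uint64_t tail, uint64_t len)
{
	/* One slot is sacrificed so that "full" is distinguishable from "empty". */
	return (tail + 1) % len == head % len;
}
#endif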

/*
 * We maintain two lists of reclamation buffers.
 * The reclamation_buffers list contains every buffer in the system.
 * The async_reclamation_buffers list contains buffers that are ripe for reclamation.
 * Each list has its own lock.
 */
static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) reclamation_buffers = TAILQ_HEAD_INITIALIZER(reclamation_buffers);

static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) async_reclamation_buffers = TAILQ_HEAD_INITIALIZER(async_reclamation_buffers);
/*
 * The reclamation_buffers_lock protects the reclamation_buffers list.
 * It must be held when iterating over the list or manipulating the list.
 * It should be dropped when acting on a specific metadata entry after acquiring the vdrm_lock.
 */
LCK_MTX_DECLARE(reclamation_buffers_lock, &vm_reclaim_lock_grp);
LCK_MTX_DECLARE(async_reclamation_buffers_lock, &vm_reclaim_lock_grp);
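
/*
 * Sketch of the lock ordering described above (illustrative fragment; see
 * vm_deferred_reclamation_reclaim_memory() for the real thing): pin a buffer
 * with its vdrm_lock, then drop the list lock before reclaiming from it.
 */
#if 0
	lck_mtx_lock(&reclamation_buffers_lock);
	metadata = TAILQ_FIRST(&reclamation_buffers);
	lck_mtx_lock(&metadata->vdrm_lock);        /* pin this buffer */
	lck_mtx_unlock(&reclamation_buffers_lock); /* then drop the list lock */
	/* ... reclaim from the buffer under vdrm_lock ... */
	lck_mtx_unlock(&metadata->vdrm_lock);
#endif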
static size_t reclamation_buffers_length;
static uint64_t reclamation_counter; // generation count for global reclaims

static SECURITY_READ_ONLY_LATE(thread_t) vm_reclaim_thread;
static void reclaim_thread(void *param __unused, wait_result_t wr __unused);

#pragma mark Implementation

/*
 * The current design is not tolerant to faulting on the buffer under the
 * metadata lock. Wire the buffer as a stop-gap solution for now; in the
 * future, the synchronization scheme should be revised to allow the buffer
 * to be pageable (rdar://112039103).
 */

static kern_return_t
vmdr_metadata_wire(vm_deferred_reclamation_metadata_t metadata)
{
	kern_return_t kr;
	vm_map_offset_t buffer_start = (metadata->vdrm_reclaim_buffer -
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, entries));
	vm_map_offset_t buffer_end = (metadata->vdrm_reclaim_buffer +
	    metadata->vdrm_buffer_size);
	kr = vm_map_wire_kernel(metadata->vdrm_map, buffer_start, buffer_end,
	    VM_PROT_NONE, VM_KERN_MEMORY_OSFMK, TRUE);
	if (kr != KERN_SUCCESS) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: failed to wire userspace reclaim buffer for pid %d (%d)",
		    task_pid(metadata->vdrm_task), kr);
	}
	return kr;
}

static kern_return_t
vmdr_metadata_unwire(vm_deferred_reclamation_metadata_t metadata)
{
	kern_return_t kr;
	vm_map_offset_t buffer_start = (metadata->vdrm_reclaim_buffer -
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, entries));
	vm_map_offset_t buffer_end = (metadata->vdrm_reclaim_buffer +
	    metadata->vdrm_buffer_size);
	kr = vm_map_unwire(metadata->vdrm_map, buffer_start, buffer_end, TRUE);
	if (kr != KERN_SUCCESS) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: unable to un-wire buffer %p (%llu) for pid %d (%d)",
		    (void *)buffer_start, (buffer_end - buffer_start),
		    task_pid(metadata->vdrm_task), kr);
	}
	return kr;
}

static vm_deferred_reclamation_metadata_t
vmdr_metadata_alloc(
	task_t                  task,
	user_addr_t             buffer,
	mach_vm_size_t          size,
	user_addr_t             indices)
{
	vm_deferred_reclamation_metadata_t metadata;
	vm_map_t map = task->map;

	assert(!map->is_nested_map);

	metadata = zalloc_flags(vm_reclaim_metadata_zone, Z_WAITOK | Z_ZERO);
	lck_mtx_init(&metadata->vdrm_lock, &vm_reclaim_lock_grp, LCK_ATTR_NULL);
	metadata->vdrm_task = task;
	metadata->vdrm_map = map;
	metadata->vdrm_reclaim_buffer = buffer;
	metadata->vdrm_buffer_size = size;
	metadata->vdrm_reclaim_indices = indices;

	/*
	 * we do not need to hold a lock on `task` because this is called
	 * either at fork() time or from the context of current_task().
	 */
	vm_map_reference(map);
	return metadata;
}

static void
vmdr_metadata_free(vm_deferred_reclamation_metadata_t metadata)
{
	vm_map_deallocate(metadata->vdrm_map);
	lck_mtx_destroy(&metadata->vdrm_lock, &vm_reclaim_lock_grp);
	zfree(vm_reclaim_metadata_zone, metadata);
}
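
/*
 * The user-visible buffer layout (declared in <mach/vm_reclaim.h>; sketched
 * here schematically, with unrelated fields elided) is what drives the
 * offsetof() arithmetic in this file:
 *
 *   struct mach_vm_reclaim_buffer_v1_s {
 *       mach_vm_reclaim_indices_v1_t indices;   // head, tail, busy
 *       ...
 *       mach_vm_reclaim_entry_v1_t   entries[]; // the ring itself
 *   };
 *
 * vdrm_reclaim_buffer points at entries[0] and vdrm_reclaim_indices at the
 * indices, which is why wiring and unwiring subtract
 * offsetof(struct mach_vm_reclaim_buffer_v1_s, entries) to recover the start
 * of the mapping.
 */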

kern_return_t
vm_deferred_reclamation_buffer_init_internal(
	task_t                  task,
	mach_vm_offset_t        address,
	mach_vm_size_t          size)
{
	kern_return_t kr = KERN_FAILURE, tmp_kr;
	vm_deferred_reclamation_metadata_t metadata = NULL;
	bool success;
	uint64_t head = 0, tail = 0, busy = 0;

	if (address == 0 ||
	    size < (sizeof(struct mach_vm_reclaim_buffer_v1_s) +
	    BUFFER_MIN_ENTRY_COUNT * sizeof(mach_vm_reclaim_entry_v1_t)) ||
	    !VM_MAP_PAGE_ALIGNED(address, VM_MAP_PAGE_MASK(task->map)) ||
	    !VM_MAP_PAGE_ALIGNED((address + size), VM_MAP_PAGE_MASK(task->map))) {
		return KERN_INVALID_ARGUMENT;
	}

	/* vm_reclaim is disabled */
	if (vm_reclaim_max_threshold == 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: failed to initialize vmdr buffer - reclaim is disabled (%llu)",
		    vm_reclaim_max_threshold);
		return KERN_NOT_SUPPORTED;
	}

	user_addr_t buffer = address +
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, entries);
	mach_vm_size_t buffer_size = size -
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, entries);
	user_addr_t indices = address +
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, indices);

	metadata = vmdr_metadata_alloc(task, buffer, buffer_size, indices);

	/*
	 * Validate the starting indices
	 */
	success = reclaim_copyin_busy(metadata, &busy);
	if (!success) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}
	success = reclaim_copyin_head(metadata, &head);
	if (!success) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}
	success = reclaim_copyin_tail(metadata, &tail);
	if (!success) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}
	if (head != 0 || tail != 0 || busy != 0) {
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	kr = vmdr_metadata_wire(metadata);
	if (kr != KERN_SUCCESS) {
		goto out;
	}

	/*
	 * Publish the metadata to the task & global buffer list. This must be
	 * done under the task lock to synchronize with task termination - i.e.
	 * task_terminate_internal is guaranteed to see the published metadata and
	 * tear it down.
	 */
	lck_mtx_lock(&reclamation_buffers_lock);
	task_lock(task);

	if (!task_is_active(task)) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: failed to initialize buffer on dying task (pid %d)", task_pid(task));
		kr = KERN_TERMINATED;
		goto fail_wired;
	} else if (task->deferred_reclamation_metadata != NULL) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: tried to overwrite existing reclaim buffer for pid %d", task_pid(task));
		kr = KERN_INVALID_ARGUMENT;
		goto fail_wired;
	}

	TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
	reclamation_buffers_length++;

	task->deferred_reclamation_metadata = metadata;

	task_unlock(task);
	lck_mtx_unlock(&reclamation_buffers_lock);

	return KERN_SUCCESS;

fail_wired:
	task_unlock(task);
	lck_mtx_unlock(&reclamation_buffers_lock);
	tmp_kr = vmdr_metadata_unwire(metadata);
	assert3u(tmp_kr, ==, KERN_SUCCESS);

out:
	vmdr_metadata_free(metadata);
	return kr;
}

void
vm_deferred_reclamation_buffer_uninstall(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	/*
	 * First remove the buffer from the global list so no one else can get access to it.
	 */
	lck_mtx_lock(&reclamation_buffers_lock);
	TAILQ_REMOVE(&reclamation_buffers, metadata, vdrm_list);
	reclamation_buffers_length--;
	lck_mtx_unlock(&reclamation_buffers_lock);

	/*
	 * Now remove it from the async list (if present)
	 */
	lck_mtx_lock(&async_reclamation_buffers_lock);
	if (metadata->vdrm_async_list.tqe_next != NULL || metadata->vdrm_async_list.tqe_prev != NULL) {
		TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
		metadata->vdrm_async_list.tqe_next = NULL;
		metadata->vdrm_async_list.tqe_prev = NULL;
	}
	lck_mtx_unlock(&async_reclamation_buffers_lock);

	// A kernel thread may have grabbed the lock for this buffer before we had
	// a chance to remove it from the queues. Take the metadata lock to ensure
	// any such workers are finished operating on the buffer.
	lck_mtx_lock(&metadata->vdrm_lock);
	lck_mtx_unlock(&metadata->vdrm_lock);

	vmdr_metadata_unwire(metadata);
}

void
vm_deferred_reclamation_buffer_deallocate(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	vmdr_metadata_free(metadata);
}

static user_addr_t
get_head_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, head);
}

static user_addr_t
get_tail_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, tail);
}

static user_addr_t
get_busy_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, busy);
}

static void
reclaim_kill_with_reason(
	vm_deferred_reclamation_metadata_t metadata,
	unsigned reason,
	mach_exception_data_type_t subcode)
{
	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
	mach_exception_code_t code = 0;
	task_t task = metadata->vdrm_task;
	proc_t p = NULL;
	boolean_t fatal = TRUE;
	bool killing_self = false;
	pid_t pid;
	int err;

	if (panic_on_kill) {
		panic("vm_reclaim: About to kill %p due to %d with subcode %lld\n", task, reason, subcode);
	}

	EXC_GUARD_ENCODE_TYPE(code, guard_type);
	EXC_GUARD_ENCODE_FLAVOR(code, reason);
	EXC_GUARD_ENCODE_TARGET(code, 0);

	assert(metadata->vdrm_task != kernel_task);
	killing_self = task == current_task();
	if (!killing_self) {
		/*
		 * Grab a reference on the task to make sure it doesn't go away
		 * after we drop the metadata lock
		 */
		task_reference(task);
	}
	/*
	 * We need to issue a wakeup in case this kill is coming from the async path.
	 * Once we drop the lock the caller can no longer do this wakeup, but
	 * if there's someone blocked on this reclaim they hold a map reference
	 * and thus need to be woken up so the map can be freed.
	 */
	thread_wakeup(&metadata->vdrm_async_list);
	lck_mtx_unlock(&metadata->vdrm_lock);

	if (reason == kGUARD_EXC_DEALLOC_GAP) {
		task_lock(task);
		fatal = (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL);
		task_unlock(task);
	}

	if (!fatal) {
		os_log_info(vm_reclaim_log_handle,
		    "vm_reclaim: Skipping non-fatal guard exception.\n");
		goto out;
	}

	pid = task_pid(task);
	if (killing_self) {
		p = get_bsdtask_info(task);
	} else {
		p = proc_find(pid);
		if (p && proc_task(p) != task) {
			os_log_error(vm_reclaim_log_handle,
			    "vm_reclaim: Unable to deliver guard exception because proc is gone & pid rolled over.\n");
			goto out;
		}

		task_deallocate(task);
		task = NULL;
	}

	if (!p) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to deliver guard exception because task does not have a proc.\n");
		goto out;
	}

	err = exit_with_guard_exception(p, code, subcode);
	if (err != 0) {
		os_log_error(vm_reclaim_log_handle, "vm_reclaim: Unable to deliver guard exception to %p: %d\n", p, err);
	}
out:
	if (!killing_self) {
		if (p) {
			proc_rele(p);
			p = NULL;
		}
		if (task) {
			task_deallocate(task);
			task = NULL;
		}
	}
}

static void
reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result)
{
	reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_COPYIO_FAILURE, result);
}

/*
 * Helper functions to do copyio on the head, tail, and busy pointers.
 * Note that the kernel will only write to the busy and head pointers.
 * Userspace is not supposed to write to the head or busy pointers, but the kernel
 * must be resilient to that kind of bug in userspace.
 */

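/*
 * Ownership of the shared indices, as implied by the comment above
 * (a summary, not a formal contract):
 *
 *   index | kernel       | userspace
 *   ------+--------------+---------------------------------
 *   head  | reads/writes | reads
 *   busy  | reads/writes | reads
 *   tail  | reads        | writes (advances as entries land)
 */
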
static bool
reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t head_ptr = get_head_ptr(indices);

	result = copyin_atomic64(head_ptr, head);

	if (result != 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy head ptr from 0x%llx: err=%d\n", head_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t tail_ptr = get_tail_ptr(indices);

	result = copyin_atomic64(tail_ptr, tail);

	if (result != 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy tail ptr from 0x%llx: err=%d\n", tail_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t busy_ptr = get_busy_ptr(indices);

	result = copyin_atomic64(busy_ptr, busy);

	if (result != 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy busy ptr from 0x%llx: err=%d\n", busy_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyout_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t busy_ptr = get_busy_ptr(indices);

	result = copyout_atomic64(value, busy_ptr);

	if (result != 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy %llu to busy ptr at 0x%llx: err=%d\n", value, busy_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyout_head(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t head_ptr = get_head_ptr(indices);

	result = copyout_atomic64(value, head_ptr);

	if (result != 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy %llu to head ptr at 0x%llx: err=%d\n", value, head_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

/*
 * Reclaim a chunk of entries from the buffer.
 * Returns the number of entries reclaimed, 0 if there are no entries left in
 * the buffer, or kReclaimChunkFailed on failure.
 */
static size_t
reclaim_chunk(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED);

	int result = 0;
	size_t num_reclaimed = 0;
	uint64_t head = 0, tail = 0, busy = 0, num_to_reclaim = 0, new_tail = 0, num_copied = 0, buffer_len = 0;
	user_addr_t indices;
	vm_map_t map = metadata->vdrm_map, old_map;
	mach_vm_reclaim_entry_v1_t reclaim_entries[kReclaimChunkSize];
	bool success;

	buffer_len = metadata->vdrm_buffer_size / sizeof(mach_vm_reclaim_entry_v1_t);

	memset(reclaim_entries, 0, sizeof(reclaim_entries));

	indices = (user_addr_t) metadata->vdrm_reclaim_indices;
	old_map = vm_map_switch(map);

	success = reclaim_copyin_busy(metadata, &busy);
	if (!success) {
		goto fail;
	}
	success = reclaim_copyin_head(metadata, &head);
	if (!success) {
		goto fail;
	}
	success = reclaim_copyin_tail(metadata, &tail);
	if (!success) {
		goto fail;
	}

	if (busy != head) {
		// Userspace overwrote one of the pointers
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Userspace modified head or busy pointer! head: %llu (0x%llx) != busy: %llu (0x%llx) | tail = %llu (0x%llx)\n",
		    head, get_head_ptr(indices), busy, get_busy_ptr(indices), tail, get_tail_ptr(indices));
		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_INDEX_FAILURE, busy);
		goto fail;
	}

	if (tail < head) {
		// Userspace is likely in the middle of trying to re-use an entry, bail on this reclamation
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Userspace modified head or tail pointer! head: %llu (0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
		    head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy, get_busy_ptr(indices));
		lck_mtx_unlock(&metadata->vdrm_lock);
		goto fail;
	}

	num_to_reclaim = tail - head;
	while (true) {
		num_to_reclaim = MIN(num_to_reclaim, kReclaimChunkSize);
		if (num_to_reclaim == 0) {
			break;
		}
		busy = head + num_to_reclaim;
		success = reclaim_copyout_busy(metadata, busy);
		if (!success) {
			goto fail;
		}
		os_atomic_thread_fence(seq_cst);
		success = reclaim_copyin_tail(metadata, &new_tail);
		if (!success) {
			goto fail;
		}

		if (new_tail >= busy) {
			/* Got num_to_reclaim entries */
			break;
		}
		tail = new_tail;
		if (tail < head) {
			// Userspace is likely in the middle of trying to re-use an entry, bail on this reclamation
			os_log_error(vm_reclaim_log_handle,
			    "vm_reclaim: Userspace modified head or tail pointer! head: %llu (0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
			    head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy, get_busy_ptr(indices));
			lck_mtx_unlock(&metadata->vdrm_lock);
			goto fail;
		}
		/* Can't reclaim these entries. Try again */
		num_to_reclaim = tail - head;
		if (num_to_reclaim == 0) {
			/* Nothing left to reclaim. Reset busy to head. */
			success = reclaim_copyout_busy(metadata, head);
			if (!success) {
				goto fail;
			}
			break;
		}
		/*
		 * Note that num_to_reclaim must have gotten smaller since tail got smaller,
		 * so this is guaranteed to converge.
		 */
	}

	while (num_copied < num_to_reclaim) {
		uint64_t memcpy_start_idx = (head % buffer_len);
		uint64_t memcpy_end_idx = memcpy_start_idx + num_to_reclaim - num_copied;
		// Clamp the end idx to the buffer. We'll handle wrap-around in our next go around the loop.
		memcpy_end_idx = MIN(memcpy_end_idx, buffer_len);
		uint64_t num_to_copy = memcpy_end_idx - memcpy_start_idx;

		assert(num_to_copy + num_copied <= kReclaimChunkSize);
		user_addr_t src_ptr = metadata->vdrm_reclaim_buffer + memcpy_start_idx * sizeof(mach_vm_reclaim_entry_v1_t);
		mach_vm_reclaim_entry_v1_t *dst_ptr = reclaim_entries + num_copied;

		result = copyin(src_ptr, dst_ptr, num_to_copy * sizeof(mach_vm_reclaim_entry_v1_t));

		if (result != 0) {
			os_log_error(vm_reclaim_log_handle,
			    "vm_reclaim: Unable to copyin %llu entries in reclaim buffer at 0x%llx to 0x%llx: err=%d\n",
			    num_to_copy, src_ptr, (uint64_t) dst_ptr, result);
			reclaim_handle_copyio_error(metadata, result);
			goto fail;
		}

		num_copied += num_to_copy;
		head += num_to_copy;
	}

	for (size_t i = 0; i < num_to_reclaim; i++) {
		mach_vm_reclaim_entry_v1_t *entry = &reclaim_entries[i];
		if (entry->address != 0 && entry->size != 0) {
			kern_return_t kr = vm_map_remove_guard(map,
			    vm_map_trunc_page(entry->address,
			    VM_MAP_PAGE_MASK(map)),
			    vm_map_round_page(entry->address + entry->size,
			    VM_MAP_PAGE_MASK(map)),
			    VM_MAP_REMOVE_GAPS_FAIL,
			    KMEM_GUARD_NONE).kmr_return;
			if (kr == KERN_INVALID_VALUE) {
				reclaim_kill_with_reason(metadata, kGUARD_EXC_DEALLOC_GAP, entry->address);
				goto fail;
			} else if (kr != KERN_SUCCESS) {
				os_log_error(vm_reclaim_log_handle,
				    "vm_reclaim: Unable to deallocate 0x%llx (%u) from 0x%llx. Err: %d\n",
				    entry->address, entry->size, (uint64_t) map, kr);
				reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
				goto fail;
			}
			num_reclaimed++;
			os_atomic_add(&metadata->vdrm_num_bytes_reclaimed, entry->size, relaxed);
		}
	}

	success = reclaim_copyout_head(metadata, head);
	if (!success) {
		goto fail;
	}

	vm_map_switch(old_map);
	return num_reclaimed;
fail:
	vm_map_switch(old_map);
	return kReclaimChunkFailed;
}
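
/*
 * Worked example of the busy/tail handshake above (illustrative): suppose
 * head = 4 and tail = 9. The kernel publishes busy = 9 to claim entries
 * 4..8, then re-reads tail. If userspace raced and pulled tail back to 7
 * to re-use entries 7 and 8, new_tail (7) < busy (9), so the kernel shrinks
 * its claim to num_to_reclaim = 7 - 4 = 3 and re-publishes busy = 7. Within
 * one call tail only ever shrinks, so the loop converges.
 */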

/*
 * Attempts to reclaim until the buffer's estimated number of available bytes
 * is <= num_bytes_reclaimable_threshold.
 * The metadata buffer lock should be held by the caller.
 *
 * Returns the number of entries reclaimed, or kReclaimChunkFailed on failure
 * (in which case the metadata lock has already been released).
 */
static size_t
reclaim_entries_from_buffer(vm_deferred_reclamation_metadata_t metadata, size_t num_bytes_reclaimable_threshold)
{
	assert(metadata != NULL);
	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED);
	if (!task_is_active(metadata->vdrm_task)) {
		/*
		 * If the task is exiting, the reclaim below will likely fail and fall through
		 * to the (slower) error path.
		 * So as an optimization, we bail out early here.
		 */
		return 0;
	}

	size_t num_entries_reclaimed = 0, num_bytes_reclaimed, estimated_reclaimable_bytes, reclaimable_bytes;
	while (true) {
		size_t curr_entries_reclaimed = 0;
		num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed);
		reclaimable_bytes = os_atomic_load(&metadata->vdrm_num_bytes_put_in_buffer, relaxed);
		if (num_bytes_reclaimed > reclaimable_bytes) {
			estimated_reclaimable_bytes = 0;
		} else {
			estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed;
		}
		if (estimated_reclaimable_bytes <= num_bytes_reclaimable_threshold) {
			break;
		}
		curr_entries_reclaimed = reclaim_chunk(metadata);
		if (curr_entries_reclaimed == kReclaimChunkFailed) {
			return kReclaimChunkFailed;
		}
		if (curr_entries_reclaimed == 0) {
			break;
		}
		num_entries_reclaimed += curr_entries_reclaimed;
	}

	return num_entries_reclaimed;
}

/*
 * Get the reclamation metadata buffer for the given task.
 * If the buffer exists it is returned locked.
 */
static vm_deferred_reclamation_metadata_t
get_task_reclaim_metadata(task_t task)
{
	assert(task != NULL);
	vm_deferred_reclamation_metadata_t metadata = NULL;
	task_lock(task);
	metadata = task->deferred_reclamation_metadata;
	if (metadata != NULL) {
		lck_mtx_lock(&metadata->vdrm_lock);
	}
	task_unlock(task);
	return metadata;
}

kern_return_t
vm_deferred_reclamation_buffer_synchronize_internal(task_t task, size_t num_entries_to_reclaim)
{
	vm_deferred_reclamation_metadata_t metadata = NULL;
	size_t total_reclaimed = 0;

	if (!task_is_active(task)) {
		return KERN_FAILURE;
	}

	metadata = get_task_reclaim_metadata(task);
	if (metadata == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	while (total_reclaimed < num_entries_to_reclaim) {
		size_t num_reclaimed = reclaim_chunk(metadata);
		if (num_reclaimed == kReclaimChunkFailed) {
			/* Lock has already been released and task is being killed. */
			return KERN_FAILURE;
		}
		if (num_reclaimed == 0) {
			/* There was nothing to reclaim. A reclamation thread must have beaten us to it. Nothing to do here. */
			break;
		}

		total_reclaimed += num_reclaimed;
	}
	lck_mtx_unlock(&metadata->vdrm_lock);

	return KERN_SUCCESS;
}

kern_return_t
vm_deferred_reclamation_buffer_update_reclaimable_bytes_internal(task_t task, size_t reclaimable_bytes)
{
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
	size_t num_bytes_reclaimed, estimated_reclaimable_bytes, num_bytes_in_buffer;
	bool success;
	if (metadata == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * The client is allowed to make this call in parallel from multiple threads.
	 * Ensure we only ever increase the value of vdrm_num_bytes_put_in_buffer.
	 * If the client's value is smaller than what we've stored, another thread
	 * raced ahead of them and we've already acted on that accounting so this
	 * call should be a no-op.
	 */
	success = os_atomic_rmw_loop(&metadata->vdrm_num_bytes_put_in_buffer, num_bytes_in_buffer,
	    reclaimable_bytes, acquire,
	{
		if (num_bytes_in_buffer > reclaimable_bytes) {
		        os_atomic_rmw_loop_give_up(break);
		}
	});
	if (!success) {
		/* Stale value. Nothing new to reclaim */
		return KERN_SUCCESS;
	}
	num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed);

	if (reclaimable_bytes > num_bytes_reclaimed) {
		estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed;
		if (estimated_reclaimable_bytes > vm_reclaim_max_threshold) {
			lck_mtx_lock(&metadata->vdrm_lock);
			size_t num_reclaimed = reclaim_entries_from_buffer(metadata, vm_reclaim_max_threshold);
			if (num_reclaimed == kReclaimChunkFailed) {
				/* Lock has already been released & task is in the process of getting killed. */
				return KERN_INVALID_ARGUMENT;
			}
			lck_mtx_unlock(&metadata->vdrm_lock);
		}
	}

	return KERN_SUCCESS;
}
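
/*
 * Example of the monotonic update above (illustrative): if thread A reports
 * reclaimable_bytes = 4 MB and thread B, working from an older snapshot,
 * reports 3 MB afterwards, B's os_atomic_rmw_loop gives up (3 MB is less
 * than the 4 MB already stored) and B's call is a no-op.
 */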

static inline size_t
pick_reclaim_threshold(vm_deferred_reclamation_action_t action)
{
	switch (action) {
	case RECLAIM_FULL:
		return 0;
	case RECLAIM_TRIM:
		return vm_reclaim_max_threshold / vm_reclaim_trim_divisor;
	case RECLAIM_ASYNC:
		return 0;
	}
}
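
/*
 * Example: with vm_reclaim_max_threshold = 64 MB (hypothetical) and the
 * default vm_reclaim_trim_divisor = 2, RECLAIM_TRIM reclaims until at most
 * 32 MB of reclaimable VA remains, while RECLAIM_FULL and RECLAIM_ASYNC
 * drain the buffer completely (threshold 0).
 */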

void
vm_deferred_reclamation_reclaim_memory(vm_deferred_reclamation_action_t action)
{
	if (action == RECLAIM_ASYNC) {
		lck_mtx_lock(&async_reclamation_buffers_lock);

		process_async_reclamation_list();
		lck_mtx_unlock(&async_reclamation_buffers_lock);
	} else {
		size_t reclaim_threshold = pick_reclaim_threshold(action);
		lck_mtx_lock(&reclamation_buffers_lock);
		reclamation_counter++;
		while (true) {
			vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&reclamation_buffers);
			if (metadata == NULL) {
				break;
			}
			lck_mtx_lock(&metadata->vdrm_lock);
			if (metadata->vdrm_reclaimed_at >= reclamation_counter) {
				// We've already seen this one. We're done
				lck_mtx_unlock(&metadata->vdrm_lock);
				break;
			}
			metadata->vdrm_reclaimed_at = reclamation_counter;

			TAILQ_REMOVE(&reclamation_buffers, metadata, vdrm_list);
			TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
			lck_mtx_unlock(&reclamation_buffers_lock);

			size_t num_reclaimed = reclaim_entries_from_buffer(metadata, reclaim_threshold);
			if (num_reclaimed != kReclaimChunkFailed) {
				lck_mtx_unlock(&metadata->vdrm_lock);
			}

			lck_mtx_lock(&reclamation_buffers_lock);
		}
		lck_mtx_unlock(&reclamation_buffers_lock);
	}
}

void
vm_deferred_reclamation_reclaim_all_memory(void)
{
	vm_deferred_reclamation_reclaim_memory(RECLAIM_FULL);
}

bool
vm_deferred_reclamation_reclaim_from_task_async(task_t task)
{
	bool queued = false;
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;

	if (metadata != NULL) {
		lck_mtx_lock(&async_reclamation_buffers_lock);
		if (metadata->vdrm_async_list.tqe_next != NULL ||
		    metadata->vdrm_async_list.tqe_prev != NULL) {
			// Already on the async list; remove it first so it is requeued at the tail
			TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
		}
		TAILQ_INSERT_TAIL(&async_reclamation_buffers, metadata, vdrm_async_list);
		lck_mtx_unlock(&async_reclamation_buffers_lock);
		queued = true;
		thread_wakeup(&vm_reclaim_thread);
	}

	return queued;
}

bool
vm_deferred_reclamation_reclaim_from_task_sync(task_t task, size_t max_entries_to_reclaim)
{
	size_t num_reclaimed = 0;
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;

	if (!task_is_active(task)) {
		return false;
	}

	if (metadata != NULL) {
		lck_mtx_lock(&metadata->vdrm_lock);
		while (num_reclaimed < max_entries_to_reclaim) {
			size_t num_reclaimed_now = reclaim_chunk(metadata);
			if (num_reclaimed_now == kReclaimChunkFailed) {
				/* Lock has already been released and task is being killed. */
				return false;
			}
			if (num_reclaimed_now == 0) {
				// Nothing left to reclaim
				break;
			}
			num_reclaimed += num_reclaimed_now;
		}
		lck_mtx_unlock(&metadata->vdrm_lock);
	}

	return num_reclaimed > 0;
}

vm_deferred_reclamation_metadata_t
vm_deferred_reclamation_buffer_fork(task_t task, vm_deferred_reclamation_metadata_t parent)
{
	kern_return_t kr;
	vm_deferred_reclamation_metadata_t metadata = NULL;

	LCK_MTX_ASSERT(&parent->vdrm_lock, LCK_MTX_ASSERT_OWNED);

	assert(task->deferred_reclamation_metadata == NULL);
	metadata = vmdr_metadata_alloc(task, parent->vdrm_reclaim_buffer,
	    parent->vdrm_buffer_size, parent->vdrm_reclaim_indices);
	lck_mtx_unlock(&parent->vdrm_lock);

	kr = vmdr_metadata_wire(metadata);
	if (kr != KERN_SUCCESS) {
		vmdr_metadata_free(metadata);
		return NULL;
	}

	lck_mtx_lock(&reclamation_buffers_lock);
	TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
	reclamation_buffers_length++;
	lck_mtx_unlock(&reclamation_buffers_lock);

	return metadata;
}

void
vm_deferred_reclamation_buffer_lock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_lock(&metadata->vdrm_lock);
}

void
vm_deferred_reclamation_buffer_unlock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_unlock(&metadata->vdrm_lock);
}


static void
reclaim_thread_init(void)
{
#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif
	thread_set_thread_name(current_thread(), "VM_reclaim");
}


static void
process_async_reclamation_list(void)
{
	LCK_MTX_ASSERT(&async_reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);

	vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&async_reclamation_buffers);
	while (metadata != NULL) {
		TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
		metadata->vdrm_async_list.tqe_next = NULL;
		metadata->vdrm_async_list.tqe_prev = NULL;
		lck_mtx_lock(&metadata->vdrm_lock);
		lck_mtx_unlock(&async_reclamation_buffers_lock);

		// NB: Currently the async reclaim thread fully reclaims the buffer.
		size_t num_reclaimed = reclaim_entries_from_buffer(metadata, 0);
		if (num_reclaimed == kReclaimChunkFailed) {
			/* Lock has already been released & task is in the process of getting killed. */
			goto next;
		}
		/* Wakeup anyone waiting on this buffer getting processed */
		thread_wakeup(&metadata->vdrm_async_list);
		assert(current_thread()->map == kernel_map);
		lck_mtx_unlock(&metadata->vdrm_lock);

next:
		lck_mtx_lock(&async_reclamation_buffers_lock);
		metadata = TAILQ_FIRST(&async_reclamation_buffers);
	}
}

__enum_decl(reclaim_thread_state, uint32_t, {
	RECLAIM_THREAD_INIT = 0,
	RECLAIM_THREAD_CONT = 1,
});

static void
reclaim_thread_continue(void)
{
	lck_mtx_lock(&async_reclamation_buffers_lock);

	process_async_reclamation_list();
	assert_wait(&vm_reclaim_thread, THREAD_UNINT);

	lck_mtx_unlock(&async_reclamation_buffers_lock);
}

void
reclaim_thread(void *param, wait_result_t wr __unused)
{
	if (param == (void *) RECLAIM_THREAD_INIT) {
		reclaim_thread_init();
	} else {
		assert(param == (void *) RECLAIM_THREAD_CONT);
	}

	reclaim_thread_continue();

	(void) thread_block_parameter(reclaim_thread, (void*) RECLAIM_THREAD_CONT);
}

__startup_func
static void
vm_deferred_reclamation_init(void)
{
	// Note: no-op pending rdar://27006343 (Custom kernel log handles)
	vm_reclaim_log_handle = os_log_create("com.apple.mach.vm", "reclaim");

	(void)kernel_thread_start_priority(reclaim_thread,
	    (void *)RECLAIM_THREAD_INIT, kReclaimThreadPriority,
	    &vm_reclaim_thread);
}

STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_deferred_reclamation_init);

#if DEVELOPMENT || DEBUG

bool
vm_deferred_reclamation_block_until_pid_has_been_reclaimed(int pid)
{
	vm_deferred_reclamation_metadata_t metadata = NULL;
	proc_t p = proc_find(pid);
	vm_map_t map = NULL;
	if (p == NULL) {
		return false;
	}
	task_t t = proc_task(p);
	if (t == NULL) {
		proc_rele(p);
		return false;
	}

	task_lock(t);
	if (t->map) {
		metadata = t->deferred_reclamation_metadata;
		if (metadata != NULL) {
			map = t->map;
			vm_map_reference(t->map);
		}
	}
	task_unlock(t);
	proc_rele(p);
	if (metadata == NULL) {
		return false;
	}

	lck_mtx_lock(&async_reclamation_buffers_lock);
	while (metadata->vdrm_async_list.tqe_next != NULL || metadata->vdrm_async_list.tqe_prev != NULL) {
		assert_wait(&metadata->vdrm_async_list, THREAD_UNINT);
		lck_mtx_unlock(&async_reclamation_buffers_lock);
		thread_block(THREAD_CONTINUE_NULL);
		lck_mtx_lock(&async_reclamation_buffers_lock);
	}

	/*
	 * The async reclaim thread first removes the buffer from the list
	 * and then reclaims it (while holding its lock).
	 * So grab the metadata buffer's lock here to ensure the
	 * reclaim is done.
	 */
	lck_mtx_lock(&metadata->vdrm_lock);
	lck_mtx_unlock(&metadata->vdrm_lock);
	lck_mtx_unlock(&async_reclamation_buffers_lock);

	vm_map_deallocate(map);
	return true;
}

#endif /* DEVELOPMENT || DEBUG */