/* xref: /xnu-10063.121.3/osfmk/vm/vm_reclaim.c (revision 2c2f96dc2b9a4408a43d3150ae9c105355ca3daa) */
/*
 * Copyright (c) 2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/exc_guard.h>
#include <kern/locks.h>
#include <kern/task.h>
#include <kern/zalloc.h>
#include <kern/misc_protos.h>
#include <kern/startup.h>
#include <kern/sched.h>
#include <libkern/OSAtomic.h>
#include <mach/kern_return.h>
#include <mach/mach_types.h>
#include <mach/mach_vm.h>
#include <mach/vm_reclaim.h>
#include <os/log.h>
#include <pexpert/pexpert.h>
#include <vm/vm_map.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_reclaim_internal.h>
#include <sys/kdebug.h>
#include <sys/queue.h>
#include <os/atomic_private.h>

#pragma mark Tunables
TUNABLE(uint32_t, kReclaimChunkSize, "vm_reclaim_chunk_size", 16);
static integer_t kReclaimThreadPriority = BASEPRI_VM;
// Reclaim down to vm_reclaim_max_threshold / vm_reclaim_trim_divisor when doing a trim reclaim operation
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_trim_divisor, "vm_reclaim_trim_divisor", 2);
TUNABLE_DT_DEV_WRITEABLE(uint64_t, vm_reclaim_max_threshold, "/defaults", "kern.vm_reclaim_max_threshold", "vm_reclaim_max_threshold", 0, TUNABLE_DT_NONE);
// Used to debug vm_reclaim kills
TUNABLE(bool, panic_on_kill, "vm_reclaim_panic_on_kill", false);

#pragma mark Declarations
typedef struct proc *proc_t;
extern char *proc_best_name(proc_t proc);
extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code, mach_exception_data_type_t subcode);
struct proc *proc_ref(struct proc *p, int locked);
int proc_rele(proc_t p);
static bool reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head);
static bool reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail);
static bool reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy);

struct vm_deferred_reclamation_metadata_s {
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_list; // Global list containing every reclamation buffer
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_async_list; // A list containing buffers that are ripe for reclamation
	decl_lck_mtx_data(, vdrm_lock); /* Held when reclaiming from the buffer */
	/*
	 * The task owns this structure but we maintain a backpointer here
	 * so that we can send an exception if we hit an error.
	 * Since this is a backpointer we don't hold a reference (it's a weak pointer).
	 */
	task_t vdrm_task;
	vm_map_t vdrm_map;
	user_addr_t vdrm_reclaim_buffer;
	mach_vm_size_t vdrm_buffer_size;
	user_addr_t vdrm_reclaim_indices;
	uint64_t vdrm_reclaimed_at;
	/*
	 * These two values are cumulative running sums of the bytes placed in the
	 * buffer and the bytes reclaimed out of the buffer, respectively. Both are
	 * in terms of virtual memory, so they give an upper bound on the amount of
	 * physical memory that can be reclaimed. To estimate the current amount of
	 * reclaimable VA in the buffer, compute
	 * vdrm_num_bytes_put_in_buffer - vdrm_num_bytes_reclaimed.
	 * Note that neither value is protected by the vdrm_lock.
	 */
	_Atomic size_t vdrm_num_bytes_put_in_buffer;
	_Atomic size_t vdrm_num_bytes_reclaimed;
};
static void process_async_reclamation_list(void);

extern void *proc_find(int pid);
extern task_t proc_task(proc_t);

#pragma mark Globals
static KALLOC_TYPE_DEFINE(vm_reclaim_metadata_zone, struct vm_deferred_reclamation_metadata_s, KT_DEFAULT);
static LCK_GRP_DECLARE(vm_reclaim_lock_grp, "vm_reclaim");
static os_log_t vm_reclaim_log_handle;

/*
 * The ringbuffer must contain at least 2 entries to distinguish between empty
 * (head == tail) and full (head == tail + 1).
 */
#define BUFFER_MIN_ENTRY_COUNT 2
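
/*
 * For reference, a sketch of the userspace-visible layout implied by the
 * offsetof() and copyio usage below (the authoritative definitions live in
 * mach/vm_reclaim.h; field order here is illustrative only):
 *
 *	mach_vm_reclaim_indices_v1_t  { uint64_t head, tail, busy; }
 *	mach_vm_reclaim_entry_v1_t    { address, size, behavior, ... }
 *	struct mach_vm_reclaim_buffer_v1_s {
 *		mach_vm_reclaim_indices_v1_t indices;
 *		...
 *		mach_vm_reclaim_entry_v1_t entries[];
 *	};
 *
 * head/tail/busy are free-running 64-bit indices (they are never wrapped);
 * entry slots are addressed modulo the entry capacity (see reclaim_chunk()).
 */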

/*
 * We maintain two lists of reclamation buffers.
 * The reclamation_buffers list contains every buffer in the system.
 * The async_reclamation_buffers_list contains buffers that are ripe for reclamation.
 * Each list has its own lock.
 */
static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) reclamation_buffers = TAILQ_HEAD_INITIALIZER(reclamation_buffers);

static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) async_reclamation_buffers = TAILQ_HEAD_INITIALIZER(async_reclamation_buffers);
/*
 * The reclamation_buffers_lock protects the reclamation_buffers list.
 * It must be held when iterating over the list or manipulating the list.
 * It should be dropped when acting on a specific metadata entry after acquiring the vdrm_lock.
 */
LCK_MTX_DECLARE(reclamation_buffers_lock, &vm_reclaim_lock_grp);
LCK_MTX_DECLARE(async_reclamation_buffers_lock, &vm_reclaim_lock_grp);
static size_t reclamation_buffers_length;
static uint64_t reclamation_counter; // generation count for global reclaims

static SECURITY_READ_ONLY_LATE(thread_t) vm_reclaim_thread;
static void reclaim_thread(void *param __unused, wait_result_t wr __unused);

#pragma mark Implementation

/*
 * The current design is not tolerant to faulting on the buffer under the
 * metadata lock. Wire the buffer as a stop-gap solution for now; in the
 * future, the synchronization scheme should be revised to allow the buffer
 * to be pageable (rdar://112039103).
 */

static kern_return_t
vmdr_metadata_wire(vm_deferred_reclamation_metadata_t metadata)
{
	kern_return_t kr;
	vm_map_offset_t buffer_start = (metadata->vdrm_reclaim_buffer -
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, entries));
	vm_map_offset_t buffer_end = (metadata->vdrm_reclaim_buffer +
	    metadata->vdrm_buffer_size);
	kr = vm_map_wire_kernel(metadata->vdrm_map, buffer_start, buffer_end,
	    VM_PROT_NONE, VM_KERN_MEMORY_OSFMK, TRUE);
	if (kr != KERN_SUCCESS) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: failed to wire userspace reclaim buffer for pid %d (%d)",
		    task_pid(metadata->vdrm_task), kr);
	}
	return kr;
}

static kern_return_t
vmdr_metadata_unwire(vm_deferred_reclamation_metadata_t metadata)
{
	kern_return_t kr;
	vm_map_offset_t buffer_start = (metadata->vdrm_reclaim_buffer -
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, entries));
	vm_map_offset_t buffer_end = (metadata->vdrm_reclaim_buffer +
	    metadata->vdrm_buffer_size);
	kr = vm_map_unwire(metadata->vdrm_map, buffer_start, buffer_end, TRUE);
	if (kr != KERN_SUCCESS) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: unable to un-wire buffer %p (%llu) for pid %d (%d)",
		    (void *)buffer_start, (buffer_end - buffer_start),
		    task_pid(metadata->vdrm_task), kr);
	}
	return kr;
}

static vm_deferred_reclamation_metadata_t
vmdr_metadata_alloc(
	task_t                  task,
	user_addr_t             buffer,
	mach_vm_size_t          size,
	user_addr_t             indices)
{
	vm_deferred_reclamation_metadata_t metadata;
	vm_map_t map = task->map;

	assert(!map->is_nested_map);

	metadata = zalloc_flags(vm_reclaim_metadata_zone, Z_WAITOK | Z_ZERO);
	lck_mtx_init(&metadata->vdrm_lock, &vm_reclaim_lock_grp, LCK_ATTR_NULL);
	metadata->vdrm_task = task;
	metadata->vdrm_map = map;
	metadata->vdrm_reclaim_buffer = buffer;
	metadata->vdrm_buffer_size = size;
	metadata->vdrm_reclaim_indices = indices;

	/*
	 * we do not need to hold a lock on `task` because this is called
	 * either at fork() time or from the context of current_task().
	 */
	vm_map_reference(map);
	return metadata;
}

static void
vmdr_metadata_free(vm_deferred_reclamation_metadata_t metadata)
{
	vm_map_deallocate(metadata->vdrm_map);
	lck_mtx_destroy(&metadata->vdrm_lock, &vm_reclaim_lock_grp);
	zfree(vm_reclaim_metadata_zone, metadata);
}

kern_return_t
vm_deferred_reclamation_buffer_init_internal(
	task_t                  task,
	mach_vm_offset_t        address,
	mach_vm_size_t          size)
{
	kern_return_t kr = KERN_FAILURE, tmp_kr;
	vm_deferred_reclamation_metadata_t metadata = NULL;
	bool success;
	uint64_t head = 0, tail = 0, busy = 0;

	if (address == 0 ||
	    size < (sizeof(struct mach_vm_reclaim_buffer_v1_s) +
	    BUFFER_MIN_ENTRY_COUNT * sizeof(mach_vm_reclaim_entry_v1_t)) ||
	    !VM_MAP_PAGE_ALIGNED(address, VM_MAP_PAGE_MASK(task->map)) ||
	    !VM_MAP_PAGE_ALIGNED((address + size), VM_MAP_PAGE_MASK(task->map))) {
		return KERN_INVALID_ARGUMENT;
	}

	/* vm_reclaim is disabled */
	if (vm_reclaim_max_threshold == 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: failed to initialize vmdr buffer - reclaim is disabled (%llu)",
		    vm_reclaim_max_threshold);
		return KERN_NOT_SUPPORTED;
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_START,
	    task_pid(task), address, size);

	user_addr_t buffer = address + \
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, entries);
	mach_vm_size_t buffer_size = size - \
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, entries);
	user_addr_t indices = address + \
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, indices);

	metadata = vmdr_metadata_alloc(task, buffer, buffer_size, indices);

	kr = vmdr_metadata_wire(metadata);
	if (kr != KERN_SUCCESS) {
		goto out;
	}

	/*
	 * Validate the starting indices.
	 *
	 * NB: At this point it is impossible for another thread to hold a
	 * reference to this metadata. However, reclaim_copyin may call reclaim_kill
	 * on failure, which assumes the metadata lock is held.
	 */
	lck_mtx_lock(&metadata->vdrm_lock);

	success = reclaim_copyin_busy(metadata, &busy);
	if (!success) {
		/* metadata lock has been dropped and exception delivered to task */
		kr = KERN_INVALID_ARGUMENT;
		goto fail_wired;
	}
	success = reclaim_copyin_head(metadata, &head);
	if (!success) {
		/* metadata lock has been dropped and exception delivered to task */
		kr = KERN_INVALID_ARGUMENT;
		goto fail_wired;
	}
	success = reclaim_copyin_tail(metadata, &tail);
	if (!success) {
		/* metadata lock has been dropped and exception delivered to task */
		kr = KERN_INVALID_ARGUMENT;
		goto fail_wired;
	}

	lck_mtx_unlock(&metadata->vdrm_lock);

	if (head != 0 || tail != 0 || busy != 0) {
		kr = KERN_INVALID_ARGUMENT;
		goto fail_wired;
	}

	/*
	 * Publish the metadata to the task & global buffer list. This must be
	 * done under the task lock to synchronize with task termination - i.e.
	 * task_terminate_internal is guaranteed to see the published metadata and
	 * tear it down.
	 */
	lck_mtx_lock(&reclamation_buffers_lock);
	task_lock(task);

	if (!task_is_active(task) || task_is_halting(task)) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: failed to initialize buffer on dying task (pid %d)", task_pid(task));
		kr = KERN_TERMINATED;
		goto fail_task;
	} else if (task->deferred_reclamation_metadata != NULL) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: tried to overwrite existing reclaim buffer for pid %d", task_pid(task));
		kr = KERN_INVALID_ARGUMENT;
		goto fail_task;
	}

	TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
	reclamation_buffers_length++;

	task->deferred_reclamation_metadata = metadata;

	task_unlock(task);
	lck_mtx_unlock(&reclamation_buffers_lock);

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END,
	    task_pid(task), KERN_SUCCESS);
	return KERN_SUCCESS;

fail_task:
	task_unlock(task);
	lck_mtx_unlock(&reclamation_buffers_lock);

fail_wired:
	tmp_kr = vmdr_metadata_unwire(metadata);
	assert3u(tmp_kr, ==, KERN_SUCCESS);

out:
	vmdr_metadata_free(metadata);
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END,
	    task_pid(task), kr);
	return kr;
}

void
vm_deferred_reclamation_buffer_uninstall(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	/*
	 * First remove the buffer from the global list so no one else can get access to it.
	 */
	lck_mtx_lock(&reclamation_buffers_lock);
	TAILQ_REMOVE(&reclamation_buffers, metadata, vdrm_list);
	reclamation_buffers_length--;
	lck_mtx_unlock(&reclamation_buffers_lock);

	/*
	 * Now remove it from the async list (if present)
	 */
	lck_mtx_lock(&async_reclamation_buffers_lock);
	if (metadata->vdrm_async_list.tqe_next != NULL || metadata->vdrm_async_list.tqe_prev != NULL) {
		TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
		metadata->vdrm_async_list.tqe_next = NULL;
		metadata->vdrm_async_list.tqe_prev = NULL;
	}
	lck_mtx_unlock(&async_reclamation_buffers_lock);

	// A kernel thread may have grabbed the lock for this buffer before we had
	// a chance to remove it from the queues. Take the metadata lock to ensure
	// any such workers are finished operating on the buffer.
	lck_mtx_lock(&metadata->vdrm_lock);
	lck_mtx_unlock(&metadata->vdrm_lock);

	vmdr_metadata_unwire(metadata);
}

void
vm_deferred_reclamation_buffer_deallocate(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	vmdr_metadata_free(metadata);
}

static user_addr_t
get_head_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, head);
}

static user_addr_t
get_tail_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, tail);
}

static user_addr_t
get_busy_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, busy);
}

static void
reclaim_kill_with_reason(
	vm_deferred_reclamation_metadata_t metadata,
	unsigned reason,
	mach_exception_data_type_t subcode)
{
	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
	mach_exception_code_t code = 0;
	task_t task = metadata->vdrm_task;
	proc_t p = NULL;
	boolean_t fatal = TRUE;
	bool killing_self = false;
	pid_t pid;
	int err;

	if (panic_on_kill) {
		panic("vm_reclaim: About to kill %p due to %d with subcode %lld\n", task, reason, subcode);
	}

	EXC_GUARD_ENCODE_TYPE(code, guard_type);
	EXC_GUARD_ENCODE_FLAVOR(code, reason);
	EXC_GUARD_ENCODE_TARGET(code, 0);

	assert(metadata->vdrm_task != kernel_task);
	killing_self = task == current_task();
	if (!killing_self) {
		/*
		 * Grab a reference on the task to make sure it doesn't go away
		 * after we drop the metadata lock
		 */
		task_reference(task);
	}
	/*
	 * We need to issue a wakeup in case this kill is coming from the async path.
	 * Once we drop the lock the caller can no longer do this wakeup, but
	 * if there's someone blocked on this reclaim they hold a map reference
	 * and thus need to be woken up so the map can be freed.
	 */
	thread_wakeup(&metadata->vdrm_async_list);
	lck_mtx_unlock(&metadata->vdrm_lock);

	if (reason == kGUARD_EXC_DEALLOC_GAP) {
		task_lock(task);
		fatal = (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL);
		task_unlock(task);
	}

	if (!fatal) {
		os_log_info(vm_reclaim_log_handle,
		    "vm_reclaim: Skipping non-fatal guard exception.\n");
		goto out;
	}

	pid = task_pid(task);
	if (killing_self) {
		p = get_bsdtask_info(task);
	} else {
		p = proc_find(pid);
		if (p && proc_task(p) != task) {
			os_log_error(vm_reclaim_log_handle,
			    "vm_reclaim: Unable to deliver guard exception because proc is gone & pid rolled over.\n");
			goto out;
		}

		task_deallocate(task);
		task = NULL;
	}

	if (!p) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to deliver guard exception because task does not have a proc.\n");
		goto out;
	}

	err = exit_with_guard_exception(p, code, subcode);
	if (err != 0) {
		os_log_error(vm_reclaim_log_handle, "vm_reclaim: Unable to deliver guard exception to %p: %d\n", p, err);
	}
out:
	if (!killing_self) {
		if (p) {
			proc_rele(p);
			p = NULL;
		}
		if (task) {
			task_deallocate(task);
			task = NULL;
		}
	}
}

static void
reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result)
{
	reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_COPYIO_FAILURE, result);
}

/*
 * Helper functions to do copyio on the head, tail, and busy pointers.
 * Note that the kernel will only write to the busy and head pointers.
 * Userspace is not supposed to write to the head or busy pointers, but the kernel
 * must be resilient to that kind of bug in userspace.
 */

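/*
 * A sketch of the assumed userspace side of this protocol (it lives outside
 * this file): to defer a free, userspace writes an entry at tail % capacity
 * and then advances tail; to take an entry back for re-use it moves tail
 * backwards. The handshake in reclaim_chunk() below is therefore written to
 * tolerate tail moving backwards at any time, while treating any userspace
 * write to head or busy as a fatal error.
 */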

static bool
reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t head_ptr = get_head_ptr(indices);

	result = copyin_atomic64(head_ptr, head);

	if (result != 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy head ptr from 0x%llx: err=%d\n", head_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t tail_ptr = get_tail_ptr(indices);

	result = copyin_atomic64(tail_ptr, tail);

	if (result != 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy tail ptr from 0x%llx: err=%d\n", tail_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t busy_ptr = get_busy_ptr(indices);

	result = copyin_atomic64(busy_ptr, busy);

	if (result != 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy busy ptr from 0x%llx: err=%d\n", busy_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyout_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t busy_ptr = get_busy_ptr(indices);

	result = copyout_atomic64(value, busy_ptr);

	if (result != 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy %llu to busy ptr at 0x%llx: err=%d\n", value, busy_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyout_head(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t head_ptr = get_head_ptr(indices);

	result = copyout_atomic64(value, head_ptr);

	if (result != 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy %llu to head ptr at 0x%llx: err=%d\n", value, head_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

/*
 * Reclaim a chunk (kReclaimChunkSize entries) from the buffer.
 *
 * Writes the number of entries reclaimed to `num_reclaimed_out`. Note that
 * there may be zero reclaimable entries in the chunk (they have all been
 * re-used by userspace).
 *
 * Returns:
 *  - KERN_NOT_FOUND if the buffer has been exhausted (head == tail)
 *  - KERN_FAILURE on failure to reclaim -- metadata lock will be dropped
 *    before returning
 */
static kern_return_t
reclaim_chunk(vm_deferred_reclamation_metadata_t metadata, size_t *num_reclaimed_out)
{
	assert(metadata != NULL);
	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED);
	int result = 0;
	size_t num_reclaimed = 0;
	uint64_t head = 0, tail = 0, busy = 0, num_to_reclaim = 0, new_tail = 0, num_copied = 0, buffer_len = 0;
	user_addr_t indices;
	vm_map_t map = metadata->vdrm_map, old_map;
	mach_vm_reclaim_entry_v1_t reclaim_entries[kReclaimChunkSize];
	bool success;

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_START,
	    task_pid(metadata->vdrm_task), kReclaimChunkSize);

	buffer_len = metadata->vdrm_buffer_size / sizeof(mach_vm_reclaim_entry_v1_t);

	memset(reclaim_entries, 0, sizeof(reclaim_entries));

	indices = (user_addr_t) metadata->vdrm_reclaim_indices;
	old_map = vm_map_switch(map);

	success = reclaim_copyin_busy(metadata, &busy);
	if (!success) {
		goto fail;
	}
	success = reclaim_copyin_head(metadata, &head);
	if (!success) {
		goto fail;
	}
	success = reclaim_copyin_tail(metadata, &tail);
	if (!success) {
		goto fail;
	}

	if (busy != head) {
		// Userspace overwrote one of the pointers
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Userspace modified head or busy pointer! head: %llu (0x%llx) != busy: %llu (0x%llx) | tail = %llu (0x%llx)\n",
		    head, get_head_ptr(indices), busy, get_busy_ptr(indices), tail, get_tail_ptr(indices));
		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_INDEX_FAILURE, busy);
		goto fail;
	}

	if (tail < head) {
		// Userspace is likely in the middle of trying to re-use an entry, bail on this reclamation
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Userspace modified head or tail pointer! head: %llu (0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
		    head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy, get_busy_ptr(indices));
		lck_mtx_unlock(&metadata->vdrm_lock);
		goto fail;
	}

	num_to_reclaim = tail - head;
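	/*
	 * Claim entries by publishing busy = head + num_to_reclaim, then
	 * re-read tail. If userspace moved tail backwards past busy in the
	 * meantime (to re-use entries), shrink the claim and retry; the
	 * seq_cst fence orders the busy store before the tail re-read.
	 */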
	while (true) {
		num_to_reclaim = MIN(num_to_reclaim, kReclaimChunkSize);
		if (num_to_reclaim == 0) {
			break;
		}
		busy = head + num_to_reclaim;
		success = reclaim_copyout_busy(metadata, busy);
		if (!success) {
			goto fail;
		}
		os_atomic_thread_fence(seq_cst);
		success = reclaim_copyin_tail(metadata, &new_tail);
		if (!success) {
			goto fail;
		}

		if (new_tail >= busy) {
			/* Got num_to_reclaim entries */
			break;
		}
		tail = new_tail;
		if (tail < head) {
			// Userspace is likely in the middle of trying to re-use an entry, bail on this reclamation
			os_log_error(vm_reclaim_log_handle,
			    "vm_reclaim: Userspace modified head or tail pointer! head: %llu (0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
			    head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy, get_busy_ptr(indices));
			lck_mtx_unlock(&metadata->vdrm_lock);
			goto fail;
		}
		/* Can't reclaim these entries. Try again */
		num_to_reclaim = tail - head;
		if (num_to_reclaim == 0) {
			/* Nothing left to reclaim. Reset busy to head. */
			success = reclaim_copyout_busy(metadata, head);
			if (!success) {
				goto fail;
			}
			break;
		}
		/*
		 * Note that num_to_reclaim must have gotten smaller since tail got smaller,
		 * so this is guaranteed to converge.
		 */
	}

	while (num_copied < num_to_reclaim) {
		uint64_t memcpy_start_idx = (head % buffer_len);
		uint64_t memcpy_end_idx = memcpy_start_idx + num_to_reclaim - num_copied;
		// Clamp the end idx to the buffer. We'll handle wrap-around in our next go around the loop.
		memcpy_end_idx = MIN(memcpy_end_idx, buffer_len);
		uint64_t num_to_copy = memcpy_end_idx - memcpy_start_idx;

		assert(num_to_copy + num_copied <= kReclaimChunkSize);
		user_addr_t src_ptr = metadata->vdrm_reclaim_buffer + memcpy_start_idx * sizeof(mach_vm_reclaim_entry_v1_t);
		mach_vm_reclaim_entry_v1_t *dst_ptr = reclaim_entries + num_copied;

		result = copyin(src_ptr, dst_ptr, num_to_copy * sizeof(mach_vm_reclaim_entry_v1_t));

		if (result != 0) {
			os_log_error(vm_reclaim_log_handle,
			    "vm_reclaim: Unable to copyin %llu entries in reclaim buffer at 0x%llx to 0x%llx: err=%d\n",
			    num_to_copy, src_ptr, (uint64_t) dst_ptr, result);
			reclaim_handle_copyio_error(metadata, result);
			goto fail;
		}

		num_copied += num_to_copy;
		head += num_to_copy;
	}

	for (size_t i = 0; i < num_to_reclaim; i++) {
		mach_vm_reclaim_entry_v1_t *entry = &reclaim_entries[i];
		KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_START,
		    task_pid(metadata->vdrm_task), entry->address, entry->size,
		    entry->behavior);
		DTRACE_VM4(vm_reclaim_chunk,
		    int, task_pid(metadata->vdrm_task),
		    mach_vm_address_t, entry->address,
		    size_t, entry->size,
		    mach_vm_reclaim_behavior_v1_t, entry->behavior);
		if (entry->address != 0 && entry->size != 0) {
			kern_return_t kr;
			switch (entry->behavior) {
			case MACH_VM_RECLAIM_DEALLOCATE:
				kr = vm_map_remove_guard(map,
				    vm_map_trunc_page(entry->address,
				    VM_MAP_PAGE_MASK(map)),
				    vm_map_round_page(entry->address + entry->size,
				    VM_MAP_PAGE_MASK(map)),
				    VM_MAP_REMOVE_GAPS_FAIL,
				    KMEM_GUARD_NONE).kmr_return;
				if (kr == KERN_INVALID_VALUE) {
					reclaim_kill_with_reason(metadata, kGUARD_EXC_DEALLOC_GAP, entry->address);
					goto fail;
				} else if (kr != KERN_SUCCESS) {
					os_log_error(vm_reclaim_log_handle,
					    "vm_reclaim: Unable to deallocate 0x%llx (%u) from 0x%llx err=%d\n",
					    entry->address, entry->size, (uint64_t) map, kr);
					reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
					goto fail;
				}
				break;
			case MACH_VM_RECLAIM_REUSABLE:
				kr = vm_map_behavior_set(map,
				    vm_map_trunc_page(entry->address, VM_MAP_PAGE_MASK(map)),
				    vm_map_round_page(entry->address + entry->size, VM_MAP_PAGE_MASK(map)),
				    VM_BEHAVIOR_REUSABLE);
				if (kr != KERN_SUCCESS) {
					os_log_error(vm_reclaim_log_handle,
					    "vm_reclaim: unable to free(reusable) 0x%llx (%u) for pid %d err=%d\n",
					    entry->address, entry->size, task_pid(metadata->vdrm_task), kr);
				}
				break;
			default:
				os_log_error(vm_reclaim_log_handle,
				    "vm_reclaim: attempted to reclaim entry with unsupported behavior %u",
				    entry->behavior);
				/* no kern_return_t is produced on this path; use the bad behavior value as the subcode */
				reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, entry->behavior);
				goto fail;
			}
			num_reclaimed++;
			os_atomic_add(&metadata->vdrm_num_bytes_reclaimed, entry->size, relaxed);
			KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_END,
			    task_pid(metadata->vdrm_task), entry->address);
		}
	}

	success = reclaim_copyout_head(metadata, head);
	if (!success) {
		goto fail;
	}

	vm_map_switch(old_map);
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_END,
	    task_pid(metadata->vdrm_task), num_to_reclaim, num_reclaimed, true);
	*num_reclaimed_out = num_reclaimed;
	if (num_to_reclaim == 0) {
		// We have exhausted the reclaimable portion of the buffer
		return KERN_NOT_FOUND;
	}
	return KERN_SUCCESS;
fail:
	vm_map_switch(old_map);
	*num_reclaimed_out = num_reclaimed;
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_END,
	    task_pid(metadata->vdrm_task), num_to_reclaim, num_reclaimed, false);
	return KERN_FAILURE;
}

/*
 * Attempts to reclaim until the buffer's estimated number of available bytes
 * is <= num_bytes_reclaimable_threshold. The metadata buffer lock must be
 * held by the caller. On failure the metadata lock is dropped (an exception
 * has been delivered to the task) before returning.
 *
 * Writes the number of entries reclaimed to `num_reclaimed_out`.
 */
static kern_return_t
reclaim_entries_from_buffer(vm_deferred_reclamation_metadata_t metadata,
    size_t num_bytes_reclaimable_threshold, size_t *num_reclaimed_out)
{
	assert(metadata != NULL);
	assert(num_reclaimed_out != NULL);
	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED);
	if (!task_is_active(metadata->vdrm_task)) {
		/*
		 * If the task is exiting, the reclaim below will likely fail and fall through
		 * to the (slower) error path.
		 * So as an optimization, we bail out early here.
		 */
		*num_reclaimed_out = 0;
		return KERN_SUCCESS;
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ENTRIES) | DBG_FUNC_START, task_pid(metadata->vdrm_task));

	size_t num_entries_reclaimed = 0, num_bytes_reclaimed, estimated_reclaimable_bytes, reclaimable_bytes;
	while (true) {
		kern_return_t kr;
		size_t curr_entries_reclaimed = 0;
		num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed);
		reclaimable_bytes = os_atomic_load(&metadata->vdrm_num_bytes_put_in_buffer, relaxed);
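		/*
		 * Neither counter is protected by the vdrm_lock (see the
		 * struct definition), so the reclaimed sum can transiently
		 * exceed the put-in-buffer sum; clamp the estimate at zero.
		 */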
		if (num_bytes_reclaimed > reclaimable_bytes) {
			estimated_reclaimable_bytes = 0;
		} else {
			estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed;
		}
		if (estimated_reclaimable_bytes <= num_bytes_reclaimable_threshold) {
			break;
		}
		kr = reclaim_chunk(metadata, &curr_entries_reclaimed);
		if (kr == KERN_NOT_FOUND) {
			break;
		} else if (kr != KERN_SUCCESS) {
			KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ENTRIES) | DBG_FUNC_END,
			    task_pid(metadata->vdrm_task), num_entries_reclaimed,
			    estimated_reclaimable_bytes, kr);
			*num_reclaimed_out = num_entries_reclaimed;
			return kr;
		}
		num_entries_reclaimed += curr_entries_reclaimed;
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ENTRIES) | DBG_FUNC_END,
	    task_pid(metadata->vdrm_task), num_entries_reclaimed,
	    estimated_reclaimable_bytes, KERN_SUCCESS);
	*num_reclaimed_out = num_entries_reclaimed;
	return KERN_SUCCESS;
}

/*
 * Get the reclamation metadata buffer for the given task.
 * If the buffer exists it is returned locked.
 */
static vm_deferred_reclamation_metadata_t
get_task_reclaim_metadata(task_t task)
{
	assert(task != NULL);
	vm_deferred_reclamation_metadata_t metadata = NULL;
	task_lock(task);
	metadata = task->deferred_reclamation_metadata;
	if (metadata != NULL) {
		lck_mtx_lock(&metadata->vdrm_lock);
	}
	task_unlock(task);
	return metadata;
}

kern_return_t
vm_deferred_reclamation_buffer_synchronize_internal(task_t task, size_t num_entries_to_reclaim)
{
	kern_return_t kr;
	vm_deferred_reclamation_metadata_t metadata = NULL;
	size_t total_reclaimed = 0;

	if (!task_is_active(task)) {
		return KERN_FAILURE;
	}

	metadata = get_task_reclaim_metadata(task);
	if (metadata == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	while (total_reclaimed < num_entries_to_reclaim) {
		size_t num_reclaimed;
		kr = reclaim_chunk(metadata, &num_reclaimed);
		if (kr == KERN_NOT_FOUND) {
			/* buffer has been fully reclaimed from */
			break;
		} else if (kr != KERN_SUCCESS) {
			/* Lock has already been released and task is being killed. */
			return kr;
		}

		total_reclaimed += num_reclaimed;
	}
	lck_mtx_unlock(&metadata->vdrm_lock);

	return KERN_SUCCESS;
}

kern_return_t
vm_deferred_reclamation_buffer_update_reclaimable_bytes_internal(task_t task, size_t reclaimable_bytes)
{
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
	/* num_bytes_reclaimed is logged at `done` even on the early-exit paths, so initialize it */
	size_t num_bytes_reclaimed = 0, estimated_reclaimable_bytes, num_bytes_in_buffer, num_reclaimed = 0;
	bool success;
	kern_return_t kr = KERN_SUCCESS;
	if (metadata == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_START,
	    task_pid(task), reclaimable_bytes);

	/*
	 * The client is allowed to make this call in parallel from multiple threads.
	 * Ensure we only ever increase the value of vdrm_num_bytes_put_in_buffer.
	 * If the client's value is smaller than what we've stored, another thread
	 * raced ahead of them and we've already acted on that accounting so this
	 * call should be a no-op.
	 */
	success = os_atomic_rmw_loop(&metadata->vdrm_num_bytes_put_in_buffer, num_bytes_in_buffer,
	    reclaimable_bytes, acquire,
	{
		if (num_bytes_in_buffer > reclaimable_bytes) {
		        os_atomic_rmw_loop_give_up(break);
		}
	});
	if (!success) {
		/* Stale value. Nothing new to reclaim */
		goto done;
	}
	num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed);

	if (reclaimable_bytes > num_bytes_reclaimed) {
		estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed;
		if (estimated_reclaimable_bytes > vm_reclaim_max_threshold) {
			lck_mtx_lock(&metadata->vdrm_lock);
			kr = reclaim_entries_from_buffer(metadata,
			    vm_reclaim_max_threshold, &num_reclaimed);
			if (kr != KERN_SUCCESS) {
				/* Lock has already been released & task is in the process of getting killed. */
				goto done;
			}
			lck_mtx_unlock(&metadata->vdrm_lock);
		}
	}

done:
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_END,
	    task_pid(task), reclaimable_bytes, num_bytes_reclaimed, num_reclaimed);

	return kr;
}

static inline size_t
pick_reclaim_threshold(vm_deferred_reclamation_action_t action)
{
	switch (action) {
	case RECLAIM_FULL:
		return 0;
	case RECLAIM_TRIM:
		return vm_reclaim_max_threshold / vm_reclaim_trim_divisor;
	case RECLAIM_ASYNC:
		return 0;
	}
}
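
/*
 * E.g. with the default tunables (vm_reclaim_trim_divisor = 2), RECLAIM_TRIM
 * drains a buffer down to an estimated vm_reclaim_max_threshold / 2
 * reclaimable bytes, while RECLAIM_FULL and RECLAIM_ASYNC drain it
 * completely (threshold 0).
 */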

void
vm_deferred_reclamation_reclaim_memory(vm_deferred_reclamation_action_t action)
{
	kern_return_t kr;
	size_t num_reclaimed;

	if (action == RECLAIM_ASYNC) {
		lck_mtx_lock(&async_reclamation_buffers_lock);

		process_async_reclamation_list();
		lck_mtx_unlock(&async_reclamation_buffers_lock);
	} else {
		size_t reclaim_threshold = pick_reclaim_threshold(action);
		KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ALL_MEMORY) | DBG_FUNC_START,
		    action, reclaim_threshold);
		lck_mtx_lock(&reclamation_buffers_lock);
		reclamation_counter++;
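		/*
		 * Walk the global list round-robin: each buffer visited is
		 * stamped with the current generation and rotated to the tail,
		 * so the walk terminates once it reaches a buffer it has
		 * already stamped this generation.
		 */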
		while (true) {
			vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&reclamation_buffers);
			if (metadata == NULL) {
				break;
			}
			lck_mtx_lock(&metadata->vdrm_lock);
			if (metadata->vdrm_reclaimed_at >= reclamation_counter) {
				// We've already seen this one. We're done
				lck_mtx_unlock(&metadata->vdrm_lock);
				break;
			}
			metadata->vdrm_reclaimed_at = reclamation_counter;

			TAILQ_REMOVE(&reclamation_buffers, metadata, vdrm_list);
			TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
			lck_mtx_unlock(&reclamation_buffers_lock);

			kr = reclaim_entries_from_buffer(metadata,
			    reclaim_threshold, &num_reclaimed);
			if (kr == KERN_SUCCESS) {
				lck_mtx_unlock(&metadata->vdrm_lock);
			}

			lck_mtx_lock(&reclamation_buffers_lock);
		}
		lck_mtx_unlock(&reclamation_buffers_lock);
		KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ALL_MEMORY) | DBG_FUNC_END,
		    reclamation_counter);
	}
}

void
vm_deferred_reclamation_reclaim_all_memory(void)
{
	vm_deferred_reclamation_reclaim_memory(RECLAIM_FULL);
}

bool
vm_deferred_reclamation_reclaim_from_task_async(task_t task)
{
	bool queued = false;
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;

	if (metadata != NULL) {
		lck_mtx_lock(&async_reclamation_buffers_lock);
		if (metadata->vdrm_async_list.tqe_next != NULL ||
		    metadata->vdrm_async_list.tqe_prev != NULL) {
			// move this buffer to the tail if still on the async list
			TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
		}
		TAILQ_INSERT_TAIL(&async_reclamation_buffers, metadata, vdrm_async_list);
		lck_mtx_unlock(&async_reclamation_buffers_lock);
		queued = true;
		thread_wakeup(&vm_reclaim_thread);
	}

	return queued;
}

bool
vm_deferred_reclamation_reclaim_from_task_sync(task_t task, size_t max_entries_to_reclaim)
{
	kern_return_t kr;
	size_t num_reclaimed = 0;
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;

	if (!task_is_active(task)) {
		return false;
	}

	if (metadata != NULL) {
		lck_mtx_lock(&metadata->vdrm_lock);
		while (num_reclaimed < max_entries_to_reclaim) {
			size_t num_reclaimed_now;
			kr = reclaim_chunk(metadata, &num_reclaimed_now);
			if (kr == KERN_NOT_FOUND) {
				// Nothing left to reclaim
				break;
			} else if (kr != KERN_SUCCESS) {
				/* Lock has already been released and task is being killed. */
				return false;
			}
			num_reclaimed += num_reclaimed_now;
		}
		lck_mtx_unlock(&metadata->vdrm_lock);
	}

	return num_reclaimed > 0;
}

vm_deferred_reclamation_metadata_t
vm_deferred_reclamation_buffer_fork(task_t task, vm_deferred_reclamation_metadata_t parent)
{
	kern_return_t kr;
	vm_deferred_reclamation_metadata_t metadata = NULL;

	LCK_MTX_ASSERT(&parent->vdrm_lock, LCK_MTX_ASSERT_OWNED);

	assert(task->deferred_reclamation_metadata == NULL);
	metadata = vmdr_metadata_alloc(task, parent->vdrm_reclaim_buffer,
	    parent->vdrm_buffer_size, parent->vdrm_reclaim_indices);
	lck_mtx_unlock(&parent->vdrm_lock);

	kr = vmdr_metadata_wire(metadata);
	if (kr != KERN_SUCCESS) {
		vmdr_metadata_free(metadata);
		return NULL;
	}

	lck_mtx_lock(&reclamation_buffers_lock);
	TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
	reclamation_buffers_length++;
	lck_mtx_unlock(&reclamation_buffers_lock);

	return metadata;
}

void
vm_deferred_reclamation_buffer_lock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_lock(&metadata->vdrm_lock);
}

void
vm_deferred_reclamation_buffer_unlock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_unlock(&metadata->vdrm_lock);
}


static void
reclaim_thread_init(void)
{
#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif
	thread_set_thread_name(current_thread(), "VM_reclaim");
}


static void
process_async_reclamation_list(void)
{
	kern_return_t kr;
	size_t total_entries_reclaimed = 0;
	size_t num_tasks_reclaimed = 0;
	LCK_MTX_ASSERT(&async_reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ASYNC_MEMORY) | DBG_FUNC_START);

	vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&async_reclamation_buffers);
	while (metadata != NULL) {
		size_t num_reclaimed;
		TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
		metadata->vdrm_async_list.tqe_next = NULL;
		metadata->vdrm_async_list.tqe_prev = NULL;
		lck_mtx_lock(&metadata->vdrm_lock);
		lck_mtx_unlock(&async_reclamation_buffers_lock);

		// NB: Currently the async reclaim thread fully reclaims the buffer.
		kr = reclaim_entries_from_buffer(metadata, 0, &num_reclaimed);
		total_entries_reclaimed += num_reclaimed;
		if (kr != KERN_SUCCESS) {
			/* Lock has already been released & task is in the process of getting killed. */
			goto next;
		}
		num_tasks_reclaimed++;
		/* Wakeup anyone waiting on this buffer getting processed */
		thread_wakeup(&metadata->vdrm_async_list);
		assert(current_thread()->map == kernel_map);
		lck_mtx_unlock(&metadata->vdrm_lock);

next:
		lck_mtx_lock(&async_reclamation_buffers_lock);
		metadata = TAILQ_FIRST(&async_reclamation_buffers);
	}
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ASYNC_MEMORY) | DBG_FUNC_END,
	    num_tasks_reclaimed, total_entries_reclaimed);
}

__enum_decl(reclaim_thread_state, uint32_t, {
	RECLAIM_THREAD_INIT = 0,
	RECLAIM_THREAD_CONT = 1,
});

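/*
 * The reclaim thread runs continuation-style: each pass drains the async
 * list, re-arms assert_wait(), and then blocks with thread_block_parameter()
 * naming reclaim_thread itself as the continuation, so the thread holds no
 * kernel stack while idle.
 */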
static void
reclaim_thread_continue(void)
{
	lck_mtx_lock(&async_reclamation_buffers_lock);

	process_async_reclamation_list();
	assert_wait(&vm_reclaim_thread, THREAD_UNINT);

	lck_mtx_unlock(&async_reclamation_buffers_lock);
}

void
reclaim_thread(void *param, wait_result_t wr __unused)
{
	if (param == (void *) RECLAIM_THREAD_INIT) {
		reclaim_thread_init();
	} else {
		assert(param == (void *) RECLAIM_THREAD_CONT);
	}

	reclaim_thread_continue();

	(void) thread_block_parameter(reclaim_thread, (void*) RECLAIM_THREAD_CONT);
}

__startup_func
static void
vm_deferred_reclamation_init(void)
{
	// Note: no-op pending rdar://27006343 (Custom kernel log handles)
	vm_reclaim_log_handle = os_log_create("com.apple.mach.vm", "reclaim");

	(void)kernel_thread_start_priority(reclaim_thread,
	    (void *)RECLAIM_THREAD_INIT, kReclaimThreadPriority,
	    &vm_reclaim_thread);
}

STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_deferred_reclamation_init);

#if DEVELOPMENT || DEBUG

bool
vm_deferred_reclamation_block_until_pid_has_been_reclaimed(int pid)
{
	vm_deferred_reclamation_metadata_t metadata = NULL;
	proc_t p = proc_find(pid);
	vm_map_t map = NULL;
	if (p == NULL) {
		return false;
	}
	task_t t = proc_task(p);
	if (t == NULL) {
		proc_rele(p);
		return false;
	}

	task_lock(t);
	if (t->map) {
		metadata = t->deferred_reclamation_metadata;
		if (metadata != NULL) {
			map = t->map;
			vm_map_reference(t->map);
		}
	}
	task_unlock(t);
	proc_rele(p);
	if (metadata == NULL) {
		return false;
	}

	lck_mtx_lock(&async_reclamation_buffers_lock);
	while (metadata->vdrm_async_list.tqe_next != NULL || metadata->vdrm_async_list.tqe_prev != NULL) {
		assert_wait(&metadata->vdrm_async_list, THREAD_UNINT);
		lck_mtx_unlock(&async_reclamation_buffers_lock);
		thread_block(THREAD_CONTINUE_NULL);
		lck_mtx_lock(&async_reclamation_buffers_lock);
	}

	/*
	 * The async reclaim thread first removes the buffer from the list
	 * and then reclaims it (while holding its lock).
	 * So grab the metadata buffer's lock here to ensure the
	 * reclaim is done.
	 */
	lck_mtx_lock(&metadata->vdrm_lock);
	lck_mtx_unlock(&metadata->vdrm_lock);
	lck_mtx_unlock(&async_reclamation_buffers_lock);

	vm_map_deallocate(map);
	return true;
}

#endif /* DEVELOPMENT || DEBUG */