xref: /xnu-11215.61.5/osfmk/vm/vm_reclaim.c (revision 4f1223e81cd707a65cc109d0b8ad6653699da3c4)
/*
 * Copyright (c) 2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/exc_guard.h>
#include <kern/locks.h>
#include <kern/task.h>
#include <kern/zalloc.h>
#include <kern/misc_protos.h>
#include <kern/startup.h>
#include <kern/sched.h>
#include <libkern/OSAtomic.h>
#include <mach/kern_return.h>
#include <mach/mach_types.h>
#include <mach/vm_reclaim.h>
#include <os/log.h>
#include <pexpert/pexpert.h>
#include <vm/vm_fault_xnu.h>
#include <vm/vm_map.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_reclaim_internal.h>
#include <vm/vm_sanitize_internal.h>
#include <sys/errno.h>
#include <sys/kdebug.h>
#include <vm/vm_kern_xnu.h>
#include <sys/queue.h>
#include <sys/reason.h>
#include <os/atomic_private.h>
#include <os/refcnt.h>
#include <os/refcnt_internal.h>

#pragma mark Tunables

#define VM_RECLAIM_THRESHOLD_DISABLED 0ULL

TUNABLE(uint32_t, kReclaimChunkSize, "vm_reclaim_chunk_size", 16);
static integer_t kReclaimThreadPriority = BASEPRI_VM;
// Reclaim down to vm_reclaim_max_threshold / vm_reclaim_trim_divisor when doing a trim reclaim operation
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_trim_divisor, "vm_reclaim_trim_divisor", 2);
TUNABLE_DT_DEV_WRITEABLE(uint64_t, vm_reclaim_max_threshold, "/defaults", "kern.vm_reclaim_max_threshold", "vm_reclaim_max_threshold", 0, TUNABLE_DT_NONE);
// Used to debug vm_reclaim kills
TUNABLE(bool, panic_on_kill, "vm_reclaim_panic_on_kill", false);

#pragma mark Declarations
typedef struct proc *proc_t;
extern const char *proc_best_name(struct proc *);
extern kern_return_t kern_return_for_errno(int);
extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code, mach_exception_data_type_t subcode);
struct proc *proc_ref(struct proc *p, int locked);
int proc_rele(proc_t p);
static kern_return_t reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head);
static kern_return_t reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail);
static kern_return_t reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy);

os_refgrp_decl(static, vdrm_refgrp, "vm_reclaim_metadata_refgrp", NULL);

struct vm_deferred_reclamation_metadata_s {
	/*
	 * Global list containing every reclamation buffer. Protected by the
	 * reclamation_buffers_lock.
	 */
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_list;
	/*
	 * A list containing buffers that are ripe for reclamation. Protected by
	 * the async_reclamation_buffers_lock.
	 */
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_async_list;
	/* Protects all struct fields (except where denoted otherwise) */
	decl_lck_mtx_data(, vdrm_lock);
	decl_lck_mtx_gate_data(, vdrm_gate);
	/*
	 * The task owns this structure but we maintain a backpointer here
	 * so that we can send an exception if we hit an error.
	 * Since this is a backpointer we don't hold a reference (it's a weak pointer).
	 */
	task_t vdrm_task;
	pid_t vdrm_pid;
	vm_map_t vdrm_map;
	/*
	 * The owning task holds a ref on this object. When the task dies, it
	 * will set vdrm_task := NULL and drop its ref. Threads operating on the buffer
	 * should hold a +1 on the metadata structure to ensure its validity.
	 */
	os_refcnt_t vdrm_refcnt;
	user_addr_t vdrm_reclaim_buffer;
	mach_vm_size_t vdrm_buffer_size;
	user_addr_t vdrm_reclaim_indices;
	uint64_t vdrm_reclaimed_at;
	/*
	 * These two values are running sums of the bytes placed in the buffer
	 * and the bytes reclaimed out of the buffer, respectively. Both are in
	 * terms of virtual memory, so they give an upper bound on the amount of
	 * physical memory that can be reclaimed. To get an estimate of the
	 * current amount of VA in the buffer, compute
	 * vdrm_num_bytes_put_in_buffer - vdrm_num_bytes_reclaimed.
	 * Note that neither value is protected by the vdrm_lock.
	 */
	_Atomic size_t vdrm_num_bytes_put_in_buffer;
	_Atomic size_t vdrm_num_bytes_reclaimed;
	/*
	 * The number of threads waiting for a pending reclamation
	 * on this buffer to complete. Protected by the
	 * async_reclamation_buffers_lock.
	 */
	uint32_t vdrm_waiters;
};
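
/*
 * Illustrative sketch (not part of the original file): how the two byte
 * counters above combine into the "estimated reclaimable bytes" value used
 * by reclaim_entries_from_buffer() below. Both loads are relaxed because
 * neither counter is protected by the vdrm_lock; the subtraction is clamped
 * at zero to tolerate racing updates.
 */
#if 0 /* example only */
static size_t
vmdr_estimated_reclaimable_bytes(vm_deferred_reclamation_metadata_t metadata)
{
	size_t put_in_buffer = os_atomic_load(
		&metadata->vdrm_num_bytes_put_in_buffer, relaxed);
	size_t reclaimed = os_atomic_load(
		&metadata->vdrm_num_bytes_reclaimed, relaxed);

	return (reclaimed > put_in_buffer) ? 0 : (put_in_buffer - reclaimed);
}
#endif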
static void vmdr_process_async_reclamation_list(void);

extern void *proc_find(int pid);
extern task_t proc_task(proc_t);

#pragma mark Globals
static KALLOC_TYPE_DEFINE(vm_reclaim_metadata_zone, struct vm_deferred_reclamation_metadata_s, KT_DEFAULT);
static LCK_GRP_DECLARE(vm_reclaim_lock_grp, "vm_reclaim");
static os_log_t vm_reclaim_log_handle;

/*
 * We maintain two lists of reclamation buffers.
 * The reclamation_buffers list contains every buffer in the system.
 * The async_reclamation_buffers list contains buffers that are ripe for reclamation.
 * Each list has its own lock.
 */
static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) reclamation_buffers = TAILQ_HEAD_INITIALIZER(reclamation_buffers);

static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) async_reclamation_buffers = TAILQ_HEAD_INITIALIZER(async_reclamation_buffers);
/*
 * The reclamation_buffers_lock protects the reclamation_buffers list.
 * It must be held when iterating over the list or manipulating the list.
 * It should be dropped when acting on a specific metadata entry after acquiring the vdrm_lock.
 */
LCK_MTX_DECLARE(reclamation_buffers_lock, &vm_reclaim_lock_grp);
LCK_MTX_DECLARE(async_reclamation_buffers_lock, &vm_reclaim_lock_grp);
static uint64_t reclamation_counter; // generation count for global reclaims


static void vmdr_list_append_locked(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_list_remove_locked(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_async_list_append_locked(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_async_list_remove_locked(vm_deferred_reclamation_metadata_t metadata);

static SECURITY_READ_ONLY_LATE(thread_t) vm_reclaim_thread;
static void reclaim_thread(void *param __unused, wait_result_t wr __unused);
static void vmdr_metadata_release(vm_deferred_reclamation_metadata_t metadata);

#pragma mark Implementation

static vm_deferred_reclamation_metadata_t
vmdr_metadata_alloc(
	task_t                  task,
	user_addr_t             buffer,
	mach_vm_size_t          size,
	user_addr_t             indices)
{
	vm_deferred_reclamation_metadata_t metadata;
	vm_map_t map = task->map;

	assert(!map->is_nested_map);

	metadata = zalloc_flags(vm_reclaim_metadata_zone, Z_WAITOK | Z_ZERO);
	lck_mtx_init(&metadata->vdrm_lock, &vm_reclaim_lock_grp, LCK_ATTR_NULL);
	lck_mtx_gate_init(&metadata->vdrm_lock, &metadata->vdrm_gate);
	os_ref_init(&metadata->vdrm_refcnt, &vdrm_refgrp);

	metadata->vdrm_task = task;
	/*
	 * Forked children will not yet have a pid. Lazily set the pid once the
	 * task has been started.
	 *
	 * TODO: do not support buffer initialization during fork and have libmalloc
	 * initialize the buffer after fork. (rdar://124295804)
	 */
	metadata->vdrm_pid = 0;
	metadata->vdrm_map = map;
	metadata->vdrm_reclaim_buffer = buffer;
	metadata->vdrm_buffer_size = size;
	metadata->vdrm_reclaim_indices = indices;

	/*
	 * we do not need to hold a lock on `task` because this is called
	 * either at fork() time or from the context of current_task().
	 */
	vm_map_reference(map);
	return metadata;
}

static void
vmdr_metadata_free(vm_deferred_reclamation_metadata_t metadata)
{
	assert3u(os_ref_get_count(&metadata->vdrm_refcnt), ==, 0);
	vm_map_deallocate(metadata->vdrm_map);
	lck_mtx_destroy(&metadata->vdrm_lock, &vm_reclaim_lock_grp);
	lck_mtx_gate_destroy(&metadata->vdrm_lock, &metadata->vdrm_gate);
	zfree(vm_reclaim_metadata_zone, metadata);
}

static inline __result_use_check
kern_return_t
vm_deferred_reclamation_buffer_init_internal_sanitize(
	vm_map_t           map,
	mach_vm_address_ut address_u,
	mach_vm_size_ut    size_u,
	mach_vm_address_t  *address,
	mach_vm_size_t     *size)
{
	/* Sanitize addr/size separately since addr is only a hint. */
	*address = vm_sanitize_addr(map, address_u);

	static_assert(
		sizeof(struct mach_vm_reclaim_buffer_v1_s) < FOURK_PAGE_SIZE,
		"If growing struct mach_vm_reclaim_buffer_v1_s beyond 4K, "
		"add a runtime check on size to prevent subtraction "
		"underflow.");
	return vm_sanitize_size(
		0,
		size_u,
		VM_SANITIZE_CALLER_MACH_VM_DEFERRED_RECLAMATION_BUFFER_INIT,
		map,
		VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS,
		size);
}

kern_return_t
vm_deferred_reclamation_buffer_init_internal(
	task_t             task,
	mach_vm_address_ut *address_u,
	mach_vm_size_ut    size_u)
{
	kern_return_t kr = KERN_FAILURE;
	mach_vm_address_t address;
	mach_vm_size_t size;
	vm_deferred_reclamation_metadata_t metadata = NULL;
	vm_map_t map;
	uint64_t head = 0, tail = 0, busy = 0;
	static bool reclaim_disabled_logged = false;

	if (task == TASK_NULL || address_u == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	map = task->map;

	kr = vm_deferred_reclamation_buffer_init_internal_sanitize(
		map,
		*address_u,
		size_u,
		&address,
		&size);
	if (__improbable(kr != KERN_SUCCESS)) {
		return vm_sanitize_get_kr(kr);
	}

	if (!vm_reclaim_max_threshold) {
		if (!reclaim_disabled_logged) {
			/* Avoid logging failure for every new process */
			reclaim_disabled_logged = true;
			os_log_error(vm_reclaim_log_handle,
			    "vm_reclaim: failed to initialize vmdr buffer - reclaim is disabled (%llu)\n",
			    vm_reclaim_max_threshold);
		}
		return KERN_NOT_SUPPORTED;
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_START,
	    task_pid(task), size);
	/*
	 * TODO: If clients other than libmalloc adopt deferred reclaim, a
	 * different tag should be given
	 */
	/*
	 * `address` was sanitized under the assumption that we'll only use
	 * it as a hint (overflow checks were used) so we must pass the
	 * anywhere flag.
	 */
	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE_PERMANENT(
		.vm_tag = VM_MEMORY_MALLOC);
	kr = mach_vm_allocate_kernel(
		map,
		vm_sanitize_wrap_addr_ref(&address),
		vm_sanitize_wrap_size(size),
		vmk_flags);
	if (kr != KERN_SUCCESS) {
		os_log_error(vm_reclaim_log_handle, "vm_reclaim: failed to allocate VA for reclaim "
		    "buffer (%d) - %s [%d]\n", kr, task_best_name(task), task_pid(task));
		return kr;
	}
	assert3u(address, !=, 0);

	user_addr_t buffer = address + \
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, entries);
	/*
	 * vm_sanitize_size above guarantees that size is at least one map
	 * page. This guarantees that subtraction below doesn't underflow.
	 */
	mach_vm_size_t buffer_size = size - \
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, entries);
	user_addr_t indices = address + \
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, indices);

	metadata = vmdr_metadata_alloc(task, buffer, buffer_size, indices);

	/*
	 * Validate the starting indices.
	 */
	kr = reclaim_copyin_busy(metadata, &busy);
	if (kr != KERN_SUCCESS) {
		goto out;
	}
	kr = reclaim_copyin_head(metadata, &head);
	if (kr != KERN_SUCCESS) {
		goto out;
	}
	kr = reclaim_copyin_tail(metadata, &tail);
	if (kr != KERN_SUCCESS) {
		goto out;
	}

	if (head != 0 || tail != 0 || busy != 0) {
		os_log_error(vm_reclaim_log_handle, "vm_reclaim: indices were not "
		    "zero-initialized\n");
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	/*
	 * Publish the metadata to the task & global buffer list. This must be
	 * done under the task lock to synchronize with task termination - i.e.
	 * task_terminate_internal is guaranteed to see the published metadata and
	 * tear it down.
	 */
	lck_mtx_lock(&reclamation_buffers_lock);
	task_lock(task);

	if (!task_is_active(task) || task_is_halting(task)) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: failed to initialize buffer on dying task %s [%d]", task_best_name(task), task_pid(task));
		kr = KERN_ABORTED;
		goto fail_task;
	}
	if (task->deferred_reclamation_metadata != NULL) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: tried to overwrite existing reclaim buffer for %s [%d]", task_best_name(task), task_pid(task));
		kr = KERN_INVALID_ARGUMENT;
		goto fail_task;
	}

	vmdr_list_append_locked(metadata);

	task->deferred_reclamation_metadata = metadata;

	task_unlock(task);
	lck_mtx_unlock(&reclamation_buffers_lock);

	*address_u = vm_sanitize_wrap_addr(address);

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END,
	    task_pid(task), KERN_SUCCESS, address);
	return KERN_SUCCESS;

fail_task:
	task_unlock(task);
	lck_mtx_unlock(&reclamation_buffers_lock);

out:
	vmdr_metadata_release(metadata);
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END,
	    task_pid(task), kr);
	return kr;
}

#pragma mark Synchronization

static inline void
vmdr_metadata_lock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_lock(&metadata->vdrm_lock);
}

static inline void
vmdr_metadata_unlock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_unlock(&metadata->vdrm_lock);
}

static inline void
vmdr_metadata_assert_owned_locked(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_gate_assert(&metadata->vdrm_lock, &metadata->vdrm_gate,
	    GATE_ASSERT_HELD);
}

static inline void
vmdr_metadata_assert_owned(vm_deferred_reclamation_metadata_t metadata)
{
#if MACH_ASSERT
	vmdr_metadata_lock(metadata);
	vmdr_metadata_assert_owned_locked(metadata);
	vmdr_metadata_unlock(metadata);
#else /* MACH_ASSERT */
	(void)metadata;
#endif /* MACH_ASSERT */
}


/*
 * Try to take ownership of the buffer. Returns true if successful.
 */
static bool
vmdr_metadata_try_own_locked(vm_deferred_reclamation_metadata_t metadata)
{
	kern_return_t kr = lck_mtx_gate_try_close(&metadata->vdrm_lock,
	    &metadata->vdrm_gate);
	return kr == KERN_SUCCESS;
}

static void
vmdr_metadata_own_locked(vm_deferred_reclamation_metadata_t metadata)
{
	__assert_only gate_wait_result_t wait_result;
	if (!vmdr_metadata_try_own_locked(metadata)) {
		wait_result = lck_mtx_gate_wait(
			&metadata->vdrm_lock, &metadata->vdrm_gate, LCK_SLEEP_DEFAULT,
			THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
		assert(wait_result == GATE_HANDOFF);
	}
}

/*
 * Set the current thread as the owner of a reclaim buffer. May block. Will
 * propagate priority.
 */
static void
vmdr_metadata_own(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_lock(metadata);
	vmdr_metadata_own_locked(metadata);
	vmdr_metadata_unlock(metadata);
}

static void
vmdr_metadata_disown_locked(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_assert_owned_locked(metadata);
	lck_mtx_gate_handoff(&metadata->vdrm_lock, &metadata->vdrm_gate,
	    GATE_HANDOFF_OPEN_IF_NO_WAITERS);
}

/*
 * Release ownership of a reclaim buffer and wakeup any threads waiting for
 * ownership. Must be called from the thread that acquired ownership.
 */
static void
vmdr_metadata_disown(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_lock(metadata);
	vmdr_metadata_disown_locked(metadata);
	vmdr_metadata_unlock(metadata);
}
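
/*
 * Illustrative sketch (not part of the original file): the intended pairing
 * of the ownership calls above. A reclaiming thread closes the gate for the
 * whole (possibly blocking) reclamation so that concurrent reclaimers
 * serialize behind it with priority propagated to the owner, then hands the
 * gate off when done.
 */
#if 0 /* example only */
static void
vmdr_reclaim_owned_example(vm_deferred_reclamation_metadata_t metadata)
{
	size_t num_reclaimed = 0;

	vmdr_metadata_own(metadata);    /* may block until handed the gate */
	(void)reclaim_entries_from_buffer(metadata, 0, &num_reclaimed);
	vmdr_metadata_disown(metadata); /* hands off to any waiting thread */
}
#endif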

static void
vmdr_metadata_retain(vm_deferred_reclamation_metadata_t metadata)
{
	os_ref_retain(&metadata->vdrm_refcnt);
}

static void
vmdr_metadata_release(vm_deferred_reclamation_metadata_t metadata)
{
	if (os_ref_release(&metadata->vdrm_refcnt) == 0) {
		vmdr_metadata_free(metadata);
	}
}

void
vm_deferred_reclamation_buffer_own(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_own(metadata);
}

void
vm_deferred_reclamation_buffer_disown(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_disown(metadata);
}

#pragma mark Global Queue Management

static void
vmdr_list_remove_locked(vm_deferred_reclamation_metadata_t metadata)
{
	LCK_MTX_ASSERT(&reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);
	assert(metadata->vdrm_list.tqe_prev != NULL);
	TAILQ_REMOVE(&reclamation_buffers, metadata, vdrm_list);
	metadata->vdrm_list.tqe_prev = NULL;
	metadata->vdrm_list.tqe_next = NULL;
}

static void
vmdr_list_append_locked(vm_deferred_reclamation_metadata_t metadata)
{
	LCK_MTX_ASSERT(&reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);
	assert(metadata->vdrm_list.tqe_prev == NULL);
	TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
}

static void
vmdr_async_list_remove_locked(vm_deferred_reclamation_metadata_t metadata)
{
	LCK_MTX_ASSERT(&async_reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);
	assert(metadata->vdrm_async_list.tqe_prev != NULL);
	TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
	metadata->vdrm_async_list.tqe_prev = NULL;
	metadata->vdrm_async_list.tqe_next = NULL;
}

static void
vmdr_async_list_append_locked(vm_deferred_reclamation_metadata_t metadata)
{
	LCK_MTX_ASSERT(&async_reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);
	assert(metadata->vdrm_async_list.tqe_prev == NULL);
	TAILQ_INSERT_TAIL(&async_reclamation_buffers, metadata, vdrm_async_list);
}

static bool
vmdr_metadata_has_pending_reclamation(vm_deferred_reclamation_metadata_t metadata)
{
	LCK_MTX_ASSERT(&async_reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);
	return metadata->vdrm_async_list.tqe_prev != NULL;
}

#pragma mark Lifecycle

void
vm_deferred_reclamation_buffer_uninstall(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	/*
	 * First remove the buffer from the global list so no one else can get access to it.
	 */
	lck_mtx_lock(&reclamation_buffers_lock);
	vmdr_list_remove_locked(metadata);
	lck_mtx_unlock(&reclamation_buffers_lock);

	/*
	 * Now remove it from the async list (if present)
	 */
	lck_mtx_lock(&async_reclamation_buffers_lock);
	if (vmdr_metadata_has_pending_reclamation(metadata)) {
		vmdr_async_list_remove_locked(metadata);
	}
	lck_mtx_unlock(&async_reclamation_buffers_lock);
}

void
vm_deferred_reclamation_buffer_deallocate(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	/* Buffer must be uninstalled before being deallocated */
	assert(metadata->vdrm_async_list.tqe_prev == NULL);
	assert(metadata->vdrm_async_list.tqe_next == NULL);
	assert(metadata->vdrm_list.tqe_prev == NULL);
	assert(metadata->vdrm_list.tqe_next == NULL);
	/*
	 * The task is dropping its ref on this buffer. First remove the buffer's
	 * back-reference to the task so that any threads currently operating on
	 * this buffer do not try to operate on the dead/dying task
	 */
	vmdr_metadata_lock(metadata);
	metadata->vdrm_task = TASK_NULL;
	vmdr_metadata_unlock(metadata);

	vmdr_metadata_release(metadata);
}
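
/*
 * Illustrative sketch (not part of the original file): the teardown ordering
 * the assertions above encode. On task termination the buffer is first
 * uninstalled (unlinked from both global lists) and only then deallocated,
 * which severs the weak vdrm_task backpointer before the task's reference
 * is dropped.
 */
#if 0 /* example only */
static void
example_task_teardown(vm_deferred_reclamation_metadata_t metadata)
{
	vm_deferred_reclamation_buffer_uninstall(metadata);  /* unlink from lists */
	vm_deferred_reclamation_buffer_deallocate(metadata); /* drop the task's +1 */
}
#endif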

#pragma mark Exception Delivery

static void
reclaim_kill_with_reason(
	vm_deferred_reclamation_metadata_t metadata,
	unsigned reason,
	mach_exception_data_type_t subcode)
{
	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
	mach_exception_code_t code = 0;
	task_t task;
	proc_t p = NULL;
	boolean_t fatal = TRUE;
	bool killing_self;
	pid_t pid;
	int err;

	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_NOTOWNED);

	EXC_GUARD_ENCODE_TYPE(code, guard_type);
	EXC_GUARD_ENCODE_FLAVOR(code, reason);
	EXC_GUARD_ENCODE_TARGET(code, 0);

	vmdr_metadata_lock(metadata);
	task = metadata->vdrm_task;
	if (task == TASK_NULL || !task_is_active(task) || task_is_halting(task)) {
		/* Task is no longer alive */
		vmdr_metadata_unlock(metadata);
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to deliver guard exception because task "
		    "[%d] is already dead.\n",
		    task ? task_pid(task) : -1);
		return;
	}

	if (panic_on_kill) {
		panic("vm_reclaim: About to kill %p due to %d with subcode %lld\n", task, reason, subcode);
	}

	killing_self = (task == current_task());
	if (!killing_self) {
		task_reference(task);
	}
	assert(task != kernel_task);
	vmdr_metadata_unlock(metadata);

	if (reason == kGUARD_EXC_DEALLOC_GAP) {
		task_lock(task);
		fatal = (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL);
		task_unlock(task);
	}
	if (!fatal) {
		os_log_info(vm_reclaim_log_handle,
		    "vm_reclaim: Skipping non-fatal guard exception.\n");
		goto out;
	}

	pid = task_pid(task);
	if (killing_self) {
		p = get_bsdtask_info(task);
	} else {
		p = proc_find(pid);
		if (p && proc_task(p) != task) {
			os_log_error(vm_reclaim_log_handle,
			    "vm_reclaim: Unable to deliver guard exception because proc is gone & pid rolled over.\n");
			goto out;
		}
	}

	if (!p) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to deliver guard exception because task does not have a proc.\n");
		goto out;
	}

	int flags = PX_DEBUG_NO_HONOR;
	exception_info_t info = {
		.os_reason = OS_REASON_GUARD,
		.exception_type = EXC_GUARD,
		.mx_code = code,
		.mx_subcode = subcode
	};

	err = exit_with_mach_exception(p, info, flags);
	if (err != 0) {
		os_log_error(vm_reclaim_log_handle, "vm_reclaim: Unable to deliver guard exception to %p: %d\n", p, err);
		goto out;
	}


out:
	if (!killing_self) {
		if (p) {
			proc_rele(p);
			p = NULL;
		}
		if (task) {
			task_deallocate(task);
			task = NULL;
		}
	}
}

#pragma mark CopyI/O

static user_addr_t
get_head_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, head);
}

static user_addr_t
get_tail_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, tail);
}

static user_addr_t
get_busy_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, busy);
}

static kern_return_t
reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result)
{
	if (result != 0 && (result != EFAULT || !vm_fault_get_disabled())) {
		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_COPYIO_FAILURE,
		    result);
	}
	return kern_return_for_errno(result);
}

/*
 * Helper functions to do copyio on the head, tail, and busy pointers.
 * Note that the kernel will only write to the busy and head pointers.
 * Userspace is not supposed to write to the head or busy pointers, but the kernel
 * must be resilient to that kind of bug in userspace.
 */

static kern_return_t
reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head)
{
	int result;
	kern_return_t kr;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t head_ptr = get_head_ptr(indices);

	result = copyin_atomic64(head_ptr, head);
	kr = reclaim_handle_copyio_error(metadata, result);
	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy head ptr from 0x%llx: err=%d\n", head_ptr, result);
	}
	return kr;
}

static kern_return_t
reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail)
{
	int result;
	kern_return_t kr;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t tail_ptr = get_tail_ptr(indices);

	result = copyin_atomic64(tail_ptr, tail);
	kr = reclaim_handle_copyio_error(metadata, result);
	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy tail ptr from 0x%llx: err=%d\n", tail_ptr, result);
	}
	return kr;
}

static kern_return_t
reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy)
{
	int result;
	kern_return_t kr;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t busy_ptr = get_busy_ptr(indices);

	result = copyin_atomic64(busy_ptr, busy);
	kr = reclaim_handle_copyio_error(metadata, result);
	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy busy ptr from 0x%llx: err=%d\n", busy_ptr, result);
	}
	return kr;
}
static kern_return_t
reclaim_copyout_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	kern_return_t kr;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t busy_ptr = get_busy_ptr(indices);

	result = copyout_atomic64(value, busy_ptr);
	kr = reclaim_handle_copyio_error(metadata, result);
	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy %llu to busy ptr at 0x%llx: err=%d\n", value, busy_ptr, result);
	}
	return kr;
}

static kern_return_t
reclaim_copyout_head(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	kern_return_t kr;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t head_ptr = get_head_ptr(indices);

	result = copyout_atomic64(value, head_ptr);
	kr = reclaim_handle_copyio_error(metadata, result);
	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy %llu to head ptr at 0x%llx: err=%d\n", value, head_ptr, result);
	}
	return kr;
}
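
/*
 * Illustrative sketch (not part of the original file): a simplified model of
 * the userspace-shared indices the helpers above read and write. The real
 * definition is mach_vm_reclaim_indices_v1_t in <mach/vm_reclaim.h>; the
 * field order here is assumed for illustration only. The contract: the
 * kernel advances head and busy, userspace advances tail, and every access
 * goes through a 64-bit atomic copyin/copyout.
 */
#if 0 /* example only */
typedef struct {
	_Atomic uint64_t head; /* first entry not yet reclaimed (kernel-written) */
	_Atomic uint64_t tail; /* next slot userspace will fill (userspace-written) */
	_Atomic uint64_t busy; /* end of the range being reclaimed (kernel-written) */
} example_reclaim_indices_t;
#endif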

#pragma mark Reclamation

/*
 * Reclaim a chunk (kReclaimChunkSize entries) from the buffer.
 *
 * Writes the number of entries reclaimed to `num_reclaimed_out`. Note that
 * there may be zero reclaimable entries in the chunk (they have all been
 * re-used by userspace).
 *
 * Returns:
 *  - KERN_NOT_FOUND if the buffer has been exhausted (head == tail)
 *  - KERN_FAILURE on failure to reclaim -- metadata lock will be dropped
 *    before returning
 */
static kern_return_t
reclaim_chunk(vm_deferred_reclamation_metadata_t metadata,
    size_t *num_reclaimed_out, vm_deferred_reclamation_options_t options)
{
	kern_return_t kr;
	int result = 0;
	size_t num_reclaimed = 0;
	uint64_t head = 0, tail = 0, busy = 0, num_to_reclaim = 0, new_tail = 0,
	    num_copied = 0, buffer_len = 0;
	user_addr_t indices;
	vm_map_t map = metadata->vdrm_map, old_map;
	mach_vm_reclaim_entry_v1_t reclaim_entries[kReclaimChunkSize];

	assert(metadata != NULL);
	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_NOTOWNED);

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_START,
	    metadata->vdrm_pid, kReclaimChunkSize);

	buffer_len = metadata->vdrm_buffer_size /
	    sizeof(mach_vm_reclaim_entry_v1_t);

	memset(reclaim_entries, 0, sizeof(reclaim_entries));

	indices = (user_addr_t) metadata->vdrm_reclaim_indices;
	old_map = vm_map_switch(map);

	if (options & RECLAIM_NO_FAULT) {
		vm_fault_disable();
	}

	kr = reclaim_copyin_busy(metadata, &busy);
	if (kr != KERN_SUCCESS) {
		goto fail;
	}
	kr = reclaim_copyin_head(metadata, &head);
	if (kr != KERN_SUCCESS) {
		goto fail;
	}
	kr = reclaim_copyin_tail(metadata, &tail);
	if (kr != KERN_SUCCESS) {
		goto fail;
	}

	if (busy != head) {
		// Userspace overwrote one of the pointers
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Userspace modified head or busy pointer! head: %llu "
		    "(0x%llx) != busy: %llu (0x%llx) | tail = %llu (0x%llx)\n",
		    head, get_head_ptr(indices), busy, get_busy_ptr(indices), tail,
		    get_tail_ptr(indices));
		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_INDEX_FAILURE,
		    busy);
		kr = KERN_FAILURE;
		goto fail;
	}

	if (tail < head) {
		/*
		 * Userspace is likely in the middle of trying to re-use an entry,
		 * bail on this reclamation.
		 */
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Userspace modified head or tail pointer! head: %llu "
		    "(0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
		    head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy,
		    get_busy_ptr(indices));
		kr = KERN_FAILURE;
		goto fail;
	}

	/*
	 * NB: If any of the copyouts below fail due to faults being disabled,
	 * the buffer may be left in a state where several entries are unusable
	 * until the next reclamation (i.e. busy > head)
	 */
	num_to_reclaim = tail - head;
	while (true) {
		num_to_reclaim = MIN(num_to_reclaim, kReclaimChunkSize);
		if (num_to_reclaim == 0) {
			break;
		}
		busy = head + num_to_reclaim;
		kr = reclaim_copyout_busy(metadata, busy);
		if (kr != KERN_SUCCESS) {
			goto fail;
		}
		os_atomic_thread_fence(seq_cst);
		kr = reclaim_copyin_tail(metadata, &new_tail);
		if (kr != KERN_SUCCESS) {
			goto fail;
		}

		if (new_tail >= busy) {
			/* Got num_to_reclaim entries */
			break;
		}
		tail = new_tail;
		if (tail < head) {
			/*
			 * Userspace is likely in the middle of trying to re-use an entry,
			 * bail on this reclamation
			 */
			os_log_error(vm_reclaim_log_handle,
			    "vm_reclaim: Userspace modified head or tail pointer! head: "
			    "%llu (0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
			    head, get_head_ptr(indices), tail, get_tail_ptr(indices),
			    busy, get_busy_ptr(indices));
			/* Reset busy back to head */
			reclaim_copyout_busy(metadata, head);
			kr = KERN_FAILURE;
			goto fail;
		}
		/* Can't reclaim these entries. Try again */
		num_to_reclaim = tail - head;
		if (num_to_reclaim == 0) {
			/* Nothing left to reclaim. Reset busy to head. */
			kr = reclaim_copyout_busy(metadata, head);
			if (kr != KERN_SUCCESS) {
				goto fail;
			}
			break;
		}
		/*
		 * Note that num_to_reclaim must have gotten smaller since tail got
		 * smaller, so this is guaranteed to converge.
		 */
	}

	while (num_copied < num_to_reclaim) {
		uint64_t memcpy_start_idx = (head % buffer_len);
		uint64_t memcpy_end_idx = memcpy_start_idx + num_to_reclaim - num_copied;
		// Clamp the end idx to the buffer. We'll handle wrap-around in our next go around the loop.
		memcpy_end_idx = MIN(memcpy_end_idx, buffer_len);
		uint64_t num_to_copy = memcpy_end_idx - memcpy_start_idx;

		assert(num_to_copy + num_copied <= kReclaimChunkSize);
		user_addr_t src_ptr = metadata->vdrm_reclaim_buffer +
		    (memcpy_start_idx * sizeof(mach_vm_reclaim_entry_v1_t));
		mach_vm_reclaim_entry_v1_t *dst_ptr = reclaim_entries + num_copied;

		result = copyin(src_ptr, dst_ptr,
		    (num_to_copy * sizeof(mach_vm_reclaim_entry_v1_t)));
		kr = reclaim_handle_copyio_error(metadata, result);
		if (kr != KERN_SUCCESS) {
			if (kr != KERN_MEMORY_ERROR) {
				os_log_error(vm_reclaim_log_handle,
				    "vm_reclaim: Unable to copyin %llu entries in reclaim "
				    "buffer at 0x%llx to 0x%llx: err=%d\n",
				    num_to_copy, src_ptr, (uint64_t) dst_ptr, result);
			}
			goto fail;
		}

		num_copied += num_to_copy;
		head += num_to_copy;
	}

	for (size_t i = 0; i < num_to_reclaim; i++) {
		mach_vm_reclaim_entry_v1_t *entry = &reclaim_entries[i];
		KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_START,
		    metadata->vdrm_pid, entry->address, entry->size,
		    entry->behavior);
		DTRACE_VM4(vm_reclaim_chunk,
		    int, metadata->vdrm_pid,
		    mach_vm_address_t, entry->address,
		    size_t, entry->size,
		    mach_vm_reclaim_behavior_v1_t, entry->behavior);
		if (entry->address != 0 && entry->size != 0) {
			switch (entry->behavior) {
			case MACH_VM_RECLAIM_DEALLOCATE:
				kr = vm_map_remove_guard(map,
				    vm_map_trunc_page(entry->address,
				    VM_MAP_PAGE_MASK(map)),
				    vm_map_round_page(entry->address + entry->size,
				    VM_MAP_PAGE_MASK(map)),
				    VM_MAP_REMOVE_GAPS_FAIL,
				    KMEM_GUARD_NONE).kmr_return;
				if (kr == KERN_INVALID_VALUE) {
					reclaim_kill_with_reason(metadata, kGUARD_EXC_DEALLOC_GAP, entry->address);
					goto fail;
				} else if (kr != KERN_SUCCESS) {
					os_log_error(vm_reclaim_log_handle,
					    "vm_reclaim: Unable to deallocate 0x%llx (%u) from 0x%llx err=%d\n",
					    entry->address, entry->size, (uint64_t) map, kr);
					reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
					goto fail;
				}
				break;
			case MACH_VM_RECLAIM_REUSABLE:
				kr = vm_map_behavior_set(map,
				    vm_map_trunc_page(entry->address, VM_MAP_PAGE_MASK(map)),
				    vm_map_round_page(entry->address + entry->size, VM_MAP_PAGE_MASK(map)),
				    VM_BEHAVIOR_REUSABLE);
				if (kr != KERN_SUCCESS) {
					os_log_error(vm_reclaim_log_handle,
					    "vm_reclaim: unable to free(reusable) 0x%llx (%u) for pid %d err=%d\n",
					    entry->address, entry->size, metadata->vdrm_pid, kr);
				}
				break;
			default:
				os_log_error(vm_reclaim_log_handle,
				    "vm_reclaim: attempted to reclaim entry with unsupported behavior %u",
				    entry->behavior);
				reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
				kr = KERN_INVALID_VALUE;
				goto fail;
			}
			num_reclaimed++;
			os_atomic_add(&metadata->vdrm_num_bytes_reclaimed, entry->size, relaxed);
			KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_END,
			    metadata->vdrm_pid, entry->address);
		}
	}

	kr = reclaim_copyout_head(metadata, head);
	if (kr != KERN_SUCCESS) {
		goto fail;
	}

	if (options & RECLAIM_NO_FAULT) {
		vm_fault_enable();
	}
	vm_map_switch(old_map);
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_END,
	    metadata->vdrm_pid, num_to_reclaim, num_reclaimed, true);
	*num_reclaimed_out = num_reclaimed;
	if (num_to_reclaim == 0) {
		// We have exhausted the reclaimable portion of the buffer
		return KERN_NOT_FOUND;
	}
	return KERN_SUCCESS;

fail:
	if (options & RECLAIM_NO_FAULT) {
		vm_fault_enable();
	}
	vm_map_switch(old_map);
	*num_reclaimed_out = num_reclaimed;
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_END,
	    metadata->vdrm_pid, num_to_reclaim, num_reclaimed, false);
	return kr;
}
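
/*
 * Illustrative sketch (not part of the original file): the wrap-around copy
 * math reclaim_chunk() uses above. Entry i lives at ring slot
 * (i % buffer_len), so the range [head, head + n) may straddle the end of
 * the ring and is copied in at most two pieces, each clamped at the buffer
 * end.
 */
#if 0 /* example only */
static void
example_ring_copy(uint64_t head, uint64_t n, uint64_t buffer_len)
{
	uint64_t copied = 0;

	while (copied < n) {
		uint64_t start_idx = (head + copied) % buffer_len;
		uint64_t end_idx = MIN(start_idx + (n - copied), buffer_len);
		/* copy entries [start_idx, end_idx) out of the ring here */
		copied += end_idx - start_idx;
	}
}
#endif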

/*
 * Attempts to reclaim until the buffer's estimated number of available bytes
 * is <= num_bytes_reclaimable_threshold. The metadata buffer lock should be
 * held by the caller.
 *
 * Writes the number of entries reclaimed to `num_reclaimed_out`.
 */
static kern_return_t
reclaim_entries_from_buffer(vm_deferred_reclamation_metadata_t metadata,
    size_t num_bytes_reclaimable_threshold, size_t *num_reclaimed_out)
{
	assert(metadata != NULL);
	assert(num_reclaimed_out != NULL);
	vmdr_metadata_assert_owned(metadata);
	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_NOTOWNED);

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ENTRIES) | DBG_FUNC_START, metadata->vdrm_pid);

	size_t num_entries_reclaimed = 0, num_bytes_reclaimed, estimated_reclaimable_bytes, reclaimable_bytes;
	while (true) {
		kern_return_t kr;
		size_t curr_entries_reclaimed = 0;
		num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed);
		reclaimable_bytes = os_atomic_load(&metadata->vdrm_num_bytes_put_in_buffer, relaxed);
		if (num_bytes_reclaimed > reclaimable_bytes) {
			estimated_reclaimable_bytes = 0;
		} else {
			estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed;
		}
		if (estimated_reclaimable_bytes <= num_bytes_reclaimable_threshold) {
			break;
		}
		kr = reclaim_chunk(metadata, &curr_entries_reclaimed,
		    RECLAIM_OPTIONS_NONE);
		if (kr == KERN_NOT_FOUND) {
			// Nothing left to reclaim
			break;
		} else if (kr != KERN_SUCCESS) {
			KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ENTRIES) | DBG_FUNC_END,
			    metadata->vdrm_pid, num_entries_reclaimed,
			    estimated_reclaimable_bytes, kr);
			*num_reclaimed_out = num_entries_reclaimed;
			return kr;
		}
		num_entries_reclaimed += curr_entries_reclaimed;
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ENTRIES) | DBG_FUNC_END,
	    metadata->vdrm_pid, num_entries_reclaimed,
	    estimated_reclaimable_bytes, KERN_SUCCESS);
	*num_reclaimed_out = num_entries_reclaimed;
	return KERN_SUCCESS;
}

/*
 * Get the reclamation metadata buffer for the given task.
 */
static vm_deferred_reclamation_metadata_t
get_task_reclaim_metadata(task_t task)
{
	assert(task != NULL);
	vm_deferred_reclamation_metadata_t metadata = NULL;
	task_lock(task);
	metadata = task->deferred_reclamation_metadata;
	task_unlock(task);
	return metadata;
}

kern_return_t
vm_deferred_reclamation_buffer_synchronize_internal(task_t task, size_t num_entries_to_reclaim)
{
	kern_return_t kr;
	vm_deferred_reclamation_metadata_t metadata = NULL;
	size_t total_reclaimed = 0;

	if (!task_is_active(task)) {
		return KERN_FAILURE;
	}

	metadata = get_task_reclaim_metadata(task);
	if (metadata == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	vmdr_metadata_own(metadata);

	while (total_reclaimed < num_entries_to_reclaim) {
		size_t num_reclaimed;
		kr = reclaim_chunk(metadata, &num_reclaimed, RECLAIM_OPTIONS_NONE);
		if (kr == KERN_NOT_FOUND) {
			/* buffer has been fully reclaimed from */
			break;
		} else if (kr != KERN_SUCCESS) {
			vmdr_metadata_disown(metadata);
			return kr;
		}

		total_reclaimed += num_reclaimed;
	}

	vmdr_metadata_disown(metadata);
	return KERN_SUCCESS;
}

kern_return_t
vm_deferred_reclamation_buffer_update_reclaimable_bytes_internal(task_t task, size_t reclaimable_bytes)
{
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
	size_t num_bytes_reclaimed = 0, estimated_reclaimable_bytes, num_bytes_in_buffer, num_reclaimed = 0;
	bool success;
	kern_return_t kr = KERN_SUCCESS;
	if (metadata == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (!metadata->vdrm_pid) {
		metadata->vdrm_pid = task_pid(task);
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_START,
	    metadata->vdrm_pid, reclaimable_bytes);

	/*
	 * The client is allowed to make this call in parallel from multiple threads.
	 * Ensure we only ever increase the value of vdrm_num_bytes_put_in_buffer.
	 * If the client's value is smaller than what we've stored, another thread
	 * raced ahead of them and we've already acted on that accounting so this
	 * call should be a no-op.
	 */
	success = os_atomic_rmw_loop(&metadata->vdrm_num_bytes_put_in_buffer, num_bytes_in_buffer,
	    reclaimable_bytes, acquire,
	{
		if (num_bytes_in_buffer > reclaimable_bytes) {
		        os_atomic_rmw_loop_give_up(break);
		}
	});
	if (!success) {
		/* Stale value. Nothing new to reclaim */
		goto done;
	}
	num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed);

	if (reclaimable_bytes > num_bytes_reclaimed) {
		estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed;
		if (estimated_reclaimable_bytes > vm_reclaim_max_threshold) {
			vmdr_metadata_own(metadata);
			kr = reclaim_entries_from_buffer(metadata,
			    vm_reclaim_max_threshold, &num_reclaimed);
			vmdr_metadata_disown(metadata);
		}
	}

done:
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_END,
	    metadata->vdrm_pid, reclaimable_bytes, num_bytes_reclaimed,
	    num_reclaimed);

	return kr;
}
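
/*
 * Illustrative sketch (not part of the original file): the monotonic-max
 * update performed by the os_atomic_rmw_loop above, written as a plain C11
 * compare-exchange loop. The counter only ever moves forward; a smaller
 * (stale) value from a racing thread is dropped without a store.
 */
#if 0 /* example only */
static bool
example_store_max(_Atomic size_t *counter, size_t new_value)
{
	size_t old = atomic_load_explicit(counter, memory_order_relaxed);

	do {
		if (old > new_value) {
			return false; /* stale update; nothing new to account */
		}
	} while (!atomic_compare_exchange_weak_explicit(counter, &old,
	    new_value, memory_order_acquire, memory_order_relaxed));
	return true;
}
#endif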

static inline size_t
pick_reclaim_threshold(vm_deferred_reclamation_action_t action)
{
	switch (action) {
	case RECLAIM_FULL:
		return 0;
	case RECLAIM_TRIM:
		return vm_reclaim_max_threshold / vm_reclaim_trim_divisor;
	case RECLAIM_ASYNC:
		return 0;
	}
}
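
/*
 * Illustrative sketch (not part of the original file): the thresholds the
 * switch above produces. With the default vm_reclaim_trim_divisor of 2, a
 * TRIM pass stops once roughly half of vm_reclaim_max_threshold worth of VA
 * remains in the buffer, while FULL and ASYNC passes drain it completely.
 */
#if 0 /* example only */
static void
example_thresholds(void)
{
	assert3u(pick_reclaim_threshold(RECLAIM_FULL), ==, 0);
	assert3u(pick_reclaim_threshold(RECLAIM_ASYNC), ==, 0);
	assert3u(pick_reclaim_threshold(RECLAIM_TRIM), ==,
	    vm_reclaim_max_threshold / vm_reclaim_trim_divisor);
}
#endif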

void
vm_deferred_reclamation_reclaim_memory(vm_deferred_reclamation_action_t action, vm_deferred_reclamation_options_t options)
{
	kern_return_t kr;
	size_t num_reclaimed;
	size_t reclaim_threshold;

	switch (action) {
	case RECLAIM_ASYNC:
		lck_mtx_lock(&async_reclamation_buffers_lock);
		vmdr_process_async_reclamation_list();
		lck_mtx_unlock(&async_reclamation_buffers_lock);
		break;
	case RECLAIM_TRIM:
	case RECLAIM_FULL:
		reclaim_threshold = pick_reclaim_threshold(action);
		KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ALL_MEMORY) | DBG_FUNC_START,
		    action, reclaim_threshold);
		lck_mtx_lock(&reclamation_buffers_lock);
		reclamation_counter++;
		vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&reclamation_buffers);
		while (metadata != NULL) {
			vmdr_list_remove_locked(metadata);
			vmdr_list_append_locked(metadata);
			vmdr_metadata_retain(metadata);
			lck_mtx_unlock(&reclamation_buffers_lock);

			vmdr_metadata_lock(metadata);
			if (metadata->vdrm_reclaimed_at >= reclamation_counter) {
				// We've already seen this one. We're done
				vmdr_metadata_unlock(metadata);
				/* Drop the reference taken before the list lock was released */
				vmdr_metadata_release(metadata);
				lck_mtx_lock(&reclamation_buffers_lock);
				break;
			}
			metadata->vdrm_reclaimed_at = reclamation_counter;

			if (options & RECLAIM_NO_WAIT) {
				bool acquired = vmdr_metadata_try_own_locked(metadata);
				if (!acquired) {
					vmdr_metadata_unlock(metadata);
					goto next;
				}
			} else {
				vmdr_metadata_own_locked(metadata);
			}
			vmdr_metadata_unlock(metadata);

			kr = reclaim_entries_from_buffer(metadata,
			    reclaim_threshold, &num_reclaimed);

			vmdr_metadata_disown(metadata);
next:
			vmdr_metadata_release(metadata);
			lck_mtx_lock(&reclamation_buffers_lock);
			metadata = TAILQ_FIRST(&reclamation_buffers);
		}
		lck_mtx_unlock(&reclamation_buffers_lock);
		KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ALL_MEMORY) | DBG_FUNC_END,
		    reclamation_counter);
		break;
	default:
		panic("Unexpected reclaim action %d", action);
	}
}

void
vm_deferred_reclamation_reclaim_all_memory(
	vm_deferred_reclamation_options_t options)
{
	vm_deferred_reclamation_reclaim_memory(RECLAIM_FULL, options);
}

bool
vm_deferred_reclamation_reclaim_from_task_async(task_t task)
{
	bool queued = false;
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;

	if (metadata != NULL) {
		os_log_debug(vm_reclaim_log_handle, "vm_reclaim: enqueueing %d for "
		    "asynchronous reclamation.\n", task_pid(task));
		lck_mtx_lock(&async_reclamation_buffers_lock);
		// move this buffer to the tail if still on the async list
		if (vmdr_metadata_has_pending_reclamation(metadata)) {
			vmdr_async_list_remove_locked(metadata);
		}
		vmdr_async_list_append_locked(metadata);
		lck_mtx_unlock(&async_reclamation_buffers_lock);
		queued = true;
		thread_wakeup_thread(&vm_reclaim_thread, vm_reclaim_thread);
	}

	return queued;
}

kern_return_t
vm_deferred_reclamation_reclaim_from_task_sync(task_t task, size_t max_entries_to_reclaim)
{
	kern_return_t kr;
	size_t num_reclaimed = 0;
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;

	if (!task_is_active(task) || task_is_halting(task)) {
		return KERN_ABORTED;
	}

	if (metadata != NULL) {
		vmdr_metadata_own(metadata);
		while (num_reclaimed < max_entries_to_reclaim) {
			size_t num_reclaimed_now;
			kr = reclaim_chunk(metadata, &num_reclaimed_now, RECLAIM_OPTIONS_NONE);
			if (kr == KERN_NOT_FOUND) {
				// Nothing left to reclaim
				break;
			} else if (kr != KERN_SUCCESS) {
				/* Lock has already been released and task is being killed. */
				vmdr_metadata_disown(metadata);
				return kr;
			}
			num_reclaimed += num_reclaimed_now;
		}
		vmdr_metadata_disown(metadata);
	}

	return KERN_SUCCESS;
}

vm_deferred_reclamation_metadata_t
vm_deferred_reclamation_buffer_fork(task_t task, vm_deferred_reclamation_metadata_t parent)
{
	vm_deferred_reclamation_metadata_t metadata = NULL;
	vmdr_metadata_assert_owned(parent);

	assert(task->deferred_reclamation_metadata == NULL);
	metadata = vmdr_metadata_alloc(task, parent->vdrm_reclaim_buffer,
	    parent->vdrm_buffer_size, parent->vdrm_reclaim_indices);
	vmdr_metadata_disown(parent);

	lck_mtx_lock(&reclamation_buffers_lock);
	vmdr_list_append_locked(metadata);
	lck_mtx_unlock(&reclamation_buffers_lock);

	return metadata;
}

static void
reclaim_thread_init(void)
{
#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif
	thread_set_thread_name(current_thread(), "VM_reclaim");
}


static void
vmdr_process_async_reclamation_list(void)
{
	kern_return_t kr;
	size_t total_entries_reclaimed = 0;
	size_t num_tasks_reclaimed = 0;
	LCK_MTX_ASSERT(&async_reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ASYNC_MEMORY) | DBG_FUNC_START);

	vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&async_reclamation_buffers);
	while (metadata != NULL) {
		size_t num_reclaimed;
		vmdr_metadata_retain(metadata);
		/*
		 * NB: It is safe to drop the async list lock without removing the
		 * buffer because only one thread (the reclamation thread) may consume
		 * from the async list. The buffer is guaranteed to still be in the
		 * list when the lock is re-taken.
		 */
		lck_mtx_unlock(&async_reclamation_buffers_lock);

		vmdr_metadata_own(metadata);

		/* NB: Currently the async reclaim thread fully reclaims the buffer */
		kr = reclaim_entries_from_buffer(metadata, 0, &num_reclaimed);
		total_entries_reclaimed += num_reclaimed;
		num_tasks_reclaimed++;

		assert(current_thread()->map == kernel_map);
		vmdr_metadata_disown(metadata);

		lck_mtx_lock(&async_reclamation_buffers_lock);
		/* Wakeup anyone waiting on this buffer getting processed */
		if (metadata->vdrm_waiters) {
			wakeup_all_with_inheritor(&metadata->vdrm_async_list,
			    THREAD_AWAKENED);
		}
		/*
		 * Check that the buffer has not been removed from the async list
		 * while being reclaimed from. This can happen if the task terminates
		 * while the reclamation is in flight.
		 */
		if (vmdr_metadata_has_pending_reclamation(metadata)) {
			vmdr_async_list_remove_locked(metadata);
		}
		vmdr_metadata_release(metadata);
		metadata = TAILQ_FIRST(&async_reclamation_buffers);
	}
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ASYNC_MEMORY) | DBG_FUNC_END,
	    num_tasks_reclaimed, total_entries_reclaimed);
}

__enum_decl(reclaim_thread_state, uint32_t, {
	RECLAIM_THREAD_INIT = 0,
	RECLAIM_THREAD_CONT = 1,
});

static void
reclaim_thread_continue(void)
{
	lck_mtx_lock(&async_reclamation_buffers_lock);

	vmdr_process_async_reclamation_list();
	assert_wait(&vm_reclaim_thread, THREAD_UNINT);

	lck_mtx_unlock(&async_reclamation_buffers_lock);
}

void
reclaim_thread(void *param, wait_result_t wr __unused)
{
	if (param == (void *) RECLAIM_THREAD_INIT) {
		reclaim_thread_init();
	} else {
		assert(param == (void *) RECLAIM_THREAD_CONT);
	}

	reclaim_thread_continue();

	(void) thread_block_parameter(reclaim_thread, (void*) RECLAIM_THREAD_CONT);
}

__startup_func
static void
vm_deferred_reclamation_init(void)
{
	// Note: no-op pending rdar://27006343 (Custom kernel log handles)
	vm_reclaim_log_handle = os_log_create("com.apple.xnu", "vm_reclaim");

	(void)kernel_thread_start_priority(reclaim_thread,
	    (void *)RECLAIM_THREAD_INIT, kReclaimThreadPriority,
	    &vm_reclaim_thread);
}

STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_deferred_reclamation_init);

#if DEVELOPMENT || DEBUG

bool
vm_deferred_reclamation_block_until_pid_has_been_reclaimed(int pid)
{
	vm_deferred_reclamation_metadata_t metadata = NULL;
	proc_t p = proc_find(pid);
	if (p == NULL) {
		return false;
	}
	task_t t = proc_task(p);
	if (t == NULL) {
		proc_rele(p);
		return false;
	}

	task_lock(t);
	if (!task_is_halting(t) && task_is_active(t)) {
		metadata = t->deferred_reclamation_metadata;
		if (metadata != NULL) {
			vmdr_metadata_retain(metadata);
		}
	}
	task_unlock(t);
	proc_rele(p);
	if (metadata == NULL) {
		return false;
	}

	lck_mtx_lock(&async_reclamation_buffers_lock);
	while (vmdr_metadata_has_pending_reclamation(metadata)) {
		metadata->vdrm_waiters++;
		lck_mtx_sleep_with_inheritor(&async_reclamation_buffers_lock,
		    LCK_SLEEP_DEFAULT, &metadata->vdrm_async_list, vm_reclaim_thread,
		    THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
		metadata->vdrm_waiters--;
	}
	lck_mtx_unlock(&async_reclamation_buffers_lock);

	vmdr_metadata_release(metadata);
	return true;
}

#endif /* DEVELOPMENT || DEBUG */