xref: /xnu-11215.1.10/osfmk/vm/vm_reclaim.c (revision 8d741a5de7ff4191bf97d57b9f54c2f6d4a15585)
1 /*
2  * Copyright (c) 2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <kern/exc_guard.h>
30 #include <kern/locks.h>
31 #include <kern/task.h>
32 #include <kern/zalloc.h>
33 #include <kern/misc_protos.h>
34 #include <kern/startup.h>
35 #include <kern/sched.h>
36 #include <libkern/OSAtomic.h>
37 #include <mach/kern_return.h>
38 #include <mach/mach_types.h>
39 #include <mach/vm_reclaim.h>
40 #include <os/log.h>
41 #include <pexpert/pexpert.h>
42 #include <vm/vm_fault_xnu.h>
43 #include <vm/vm_map.h>
44 #include <vm/vm_map_internal.h>
45 #include <vm/vm_reclaim_internal.h>
46 #include <vm/vm_sanitize_internal.h>
47 #include <sys/errno.h>
48 #include <sys/kdebug.h>
49 #include <vm/vm_kern_xnu.h>
50 #include <sys/queue.h>
51 #include <sys/reason.h>
52 #include <os/atomic_private.h>
53 #include <os/refcnt.h>
54 #include <os/refcnt_internal.h>
55 
#pragma mark Tunables

/* A max threshold of 0 disables deferred reclamation entirely */
#define VM_RECLAIM_THRESHOLD_DISABLED 0ULL

/* Number of buffer entries processed per reclamation batch */
TUNABLE(uint32_t, kReclaimChunkSize, "vm_reclaim_chunk_size", 16);
static integer_t kReclaimThreadPriority = BASEPRI_VM;
// Reclaim down to vm_reclaim_max_threshold / vm_reclaim_trim_divisor when doing a trim reclaim operation
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_trim_divisor, "vm_reclaim_trim_divisor", 2);
TUNABLE_DT_DEV_WRITEABLE(uint64_t, vm_reclaim_max_threshold, "/defaults", "kern.vm_reclaim_max_threshold", "vm_reclaim_max_threshold", 0, TUNABLE_DT_NONE);
// Used to debug vm_reclaim kills
TUNABLE(bool, panic_on_kill, "vm_reclaim_panic_on_kill", false);
67 
#pragma mark Declarations
/* BSD proc-layer interfaces, declared here to avoid header dependencies */
typedef struct proc *proc_t;
extern const char *proc_best_name(struct proc *);
extern kern_return_t kern_return_for_errno(int);
extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code, mach_exception_data_type_t subcode);
struct proc *proc_ref(struct proc *p, int locked);
int proc_rele(proc_t p);
/* Copyin helpers for the userspace-shared ring indices (defined below) */
static kern_return_t reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head);
static kern_return_t reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail);
static kern_return_t reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy);

os_refgrp_decl(static, vdrm_refgrp, "vm_reclaim_metadata_refgrp", NULL);
80 
/*
 * Per-task deferred-reclamation state. Tracks the userspace ring buffer of
 * reclaimable VA ranges shared between libmalloc and the kernel.
 */
struct vm_deferred_reclamation_metadata_s {
	/*
	 * Global list containing every reclamation buffer. Protected by the
	 * reclamation_buffers_lock.
	 */
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_list;
	/*
	 * A list containing buffers that are ripe for reclamation. Protected by
	 * the async_reclamation_buffers_lock.
	 */
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_async_list;
	/* Protects all struct fields (except denoted otherwise) */
	decl_lck_mtx_data(, vdrm_lock);
	/* Gate providing exclusive "ownership" of the buffer (see vmdr_metadata_own) */
	decl_lck_mtx_gate_data(, vdrm_gate);
	/*
	 * The task owns this structure but we maintain a backpointer here
	 * so that we can send an exception if we hit an error.
	 * Since this is a backpointer we don't hold a reference (it's a weak pointer).
	 */
	task_t vdrm_task;
	pid_t vdrm_pid;
	vm_map_t vdrm_map;
	/*
	 * The owning task holds a ref on this object. When the task dies, it
	 * will set vdrm_task := NULL and drop its ref. Threads operating on the buffer
	 * should hold a +1 on the metadata structure to ensure its validity.
	 */
	os_refcnt_t vdrm_refcnt;
	user_addr_t vdrm_reclaim_buffer;
	mach_vm_size_t vdrm_buffer_size;
	user_addr_t vdrm_reclaim_indices;
	uint64_t vdrm_reclaimed_at;
	/*
	 * These two values represent running sums of bytes placed in the buffer and bytes reclaimed out of the buffer
	 * cumulatively. Both values are in terms of virtual memory, so they give an upper bound
	 * on the amount of physical memory that can be reclaimed.
	 * To get an estimate of the current amount of VA in the buffer do vdrm_num_bytes_put_in_buffer - vdrm_num_bytes_reclaimed.
	 * Note that neither value is protected by the vdrm_lock.
	 */
	_Atomic size_t vdrm_num_bytes_put_in_buffer;
	_Atomic size_t vdrm_num_bytes_reclaimed;
	/*
	 * The number of threads waiting for a pending reclamation
	 * on this buffer to complete. Protected by the
	 * async_reclamation_buffers_lock.
	 */
	uint32_t vdrm_waiters;
};
static void vmdr_process_async_reclamation_list(void);

/* BSD proc-layer lookups */
extern void *proc_find(int pid);
extern task_t proc_task(proc_t);

#pragma mark Globals
/* Zone backing all vm_deferred_reclamation_metadata_s allocations */
static KALLOC_TYPE_DEFINE(vm_reclaim_metadata_zone, struct vm_deferred_reclamation_metadata_s, KT_DEFAULT);
static LCK_GRP_DECLARE(vm_reclaim_lock_grp, "vm_reclaim");
static os_log_t vm_reclaim_log_handle;

/*
 * We maintain two lists of reclamation buffers.
 * The reclamation_buffers list contains every buffer in the system.
 * The async_reclamation_buffers_list contains buffers that are ripe for reclamation.
 * Each list has its own lock.
 */
static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) reclamation_buffers = TAILQ_HEAD_INITIALIZER(reclamation_buffers);

static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) async_reclamation_buffers = TAILQ_HEAD_INITIALIZER(async_reclamation_buffers);
/*
 * The reclamation_buffers_lock protects the reclamation_buffers list.
 * It must be held when iterating over the list or manipulating the list.
 * It should be dropped when acting on a specific metadata entry after acquiring the vdrm_lock.
 */
LCK_MTX_DECLARE(reclamation_buffers_lock, &vm_reclaim_lock_grp);
LCK_MTX_DECLARE(async_reclamation_buffers_lock, &vm_reclaim_lock_grp);
static uint64_t reclamation_counter; // generation count for global reclaims


/* List-manipulation helpers; each requires the corresponding list lock */
static void vmdr_list_append_locked(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_list_remove_locked(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_async_list_append_locked(vm_deferred_reclamation_metadata_t metadata);
static void vmdr_async_list_remove_locked(vm_deferred_reclamation_metadata_t metadata);

/* Kernel thread that drains the async reclamation list */
static SECURITY_READ_ONLY_LATE(thread_t) vm_reclaim_thread;
static void reclaim_thread(void *param __unused, wait_result_t wr __unused);
static void vmdr_metadata_release(vm_deferred_reclamation_metadata_t metadata);
166 
167 #pragma mark Implementation
168 
169 static vm_deferred_reclamation_metadata_t
vmdr_metadata_alloc(task_t task,user_addr_t buffer,mach_vm_size_t size,user_addr_t indices)170 vmdr_metadata_alloc(
171 	task_t                  task,
172 	user_addr_t             buffer,
173 	mach_vm_size_t          size,
174 	user_addr_t             indices)
175 {
176 	vm_deferred_reclamation_metadata_t metadata;
177 	vm_map_t map = task->map;
178 
179 	assert(!map->is_nested_map);
180 
181 	metadata = zalloc_flags(vm_reclaim_metadata_zone, Z_WAITOK | Z_ZERO);
182 	lck_mtx_init(&metadata->vdrm_lock, &vm_reclaim_lock_grp, LCK_ATTR_NULL);
183 	lck_mtx_gate_init(&metadata->vdrm_lock, &metadata->vdrm_gate);
184 	os_ref_init(&metadata->vdrm_refcnt, &vdrm_refgrp);
185 
186 	metadata->vdrm_task = task;
187 	/*
188 	 * Forked children will not yet have a pid. Lazily set the pid once the
189 	 * task has been started.
190 	 *
191 	 * TODO: do not support buffer initialization during fork and have libmalloc
192 	 * initialize the buffer after fork. (rdar://124295804)
193 	 */
194 	metadata->vdrm_pid = 0;
195 	metadata->vdrm_map = map;
196 	metadata->vdrm_reclaim_buffer = buffer;
197 	metadata->vdrm_buffer_size = size;
198 	metadata->vdrm_reclaim_indices = indices;
199 
200 	/*
201 	 * we do not need to hold a lock on `task` because this is called
202 	 * either at fork() time or from the context of current_task().
203 	 */
204 	vm_map_reference(map);
205 	return metadata;
206 }
207 
208 static void
vmdr_metadata_free(vm_deferred_reclamation_metadata_t metadata)209 vmdr_metadata_free(vm_deferred_reclamation_metadata_t metadata)
210 {
211 	assert3u(os_ref_get_count(&metadata->vdrm_refcnt), ==, 0);
212 	vm_map_deallocate(metadata->vdrm_map);
213 	lck_mtx_destroy(&metadata->vdrm_lock, &vm_reclaim_lock_grp);
214 	lck_mtx_gate_destroy(&metadata->vdrm_lock, &metadata->vdrm_gate);
215 	zfree(vm_reclaim_metadata_zone, metadata);
216 }
217 
/*
 * Allocate and install a deferred-reclamation ring buffer for `task`.
 *
 * Allocates `size` bytes (rounded to the map's page size) of VA in the
 * task's map, returning the base address via `*address`; verifies that the
 * buffer's head/tail/busy indices start out zeroed; then publishes the
 * metadata to the task and the global buffer list under the task lock so
 * task termination is guaranteed to observe it.
 *
 * Returns:
 *  - KERN_INVALID_ARGUMENT on bad parameters, an existing buffer, or
 *    non-zero initial indices
 *  - KERN_NOT_SUPPORTED when reclaim is disabled by tunable
 *  - KERN_ABORTED if the task is dying
 */
kern_return_t
vm_deferred_reclamation_buffer_init_internal(
	task_t                   task,
	mach_vm_address_t        *address,
	mach_vm_size_t           size)
{
	kern_return_t kr = KERN_FAILURE;
	vm_deferred_reclamation_metadata_t metadata = NULL;
	vm_map_t map;
	uint64_t head = 0, tail = 0, busy = 0;
	static bool reclaim_disabled_logged = false;

	if (task == TASK_NULL || address == NULL || size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	if (!vm_reclaim_max_threshold) {
		if (!reclaim_disabled_logged) {
			/* Avoid logging failure for every new process */
			reclaim_disabled_logged = true;
			os_log_error(vm_reclaim_log_handle,
			    "vm_reclaim: failed to initialize vmdr buffer - reclaim is disabled (%llu)\n",
			    vm_reclaim_max_threshold);
		}
		return KERN_NOT_SUPPORTED;
	}

	map = task->map;
	size = vm_map_round_page(size, VM_MAP_PAGE_MASK(map));

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_START,
	    task_pid(task), size);
	/*
	 * TODO: If clients other than libmalloc adopt deferred reclaim, a
	 * different tag should be given
	 */
	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE_PERMANENT(
		.vm_tag = VM_MEMORY_MALLOC);
	mach_vm_offset_ut *offset_u = vm_sanitize_wrap_addr_ref(address);
	mach_vm_size_ut size_u = vm_sanitize_wrap_size(size);
	kr = mach_vm_allocate_kernel(map, offset_u, size_u, vmk_flags);
	if (kr != KERN_SUCCESS) {
		os_log_error(vm_reclaim_log_handle, "vm_reclaim: failed to allocate VA for reclaim "
		    "buffer (%d) - %s [%d]\n", kr, task_best_name(task), task_pid(task));
		return kr;
	}
	assert3u(*address, !=, 0);

	/* Carve the allocation into the entries ring and the shared indices */
	user_addr_t buffer = *address + \
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, entries);
	mach_vm_size_t buffer_size = size - \
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, entries);
	user_addr_t indices = *address + \
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, indices);

	metadata = vmdr_metadata_alloc(task, buffer, buffer_size, indices);

	/*
	 * Validate the starting indices.
	 */
	kr = reclaim_copyin_busy(metadata, &busy);
	if (kr != KERN_SUCCESS) {
		goto out;
	}
	kr = reclaim_copyin_head(metadata, &head);
	if (kr != KERN_SUCCESS) {
		goto out;
	}
	kr = reclaim_copyin_tail(metadata, &tail);
	if (kr != KERN_SUCCESS) {
		goto out;
	}

	if (head != 0 || tail != 0 || busy != 0) {
		os_log_error(vm_reclaim_log_handle, "vm_reclaim: indices were not "
		    "zero-initialized\n");
		kr = KERN_INVALID_ARGUMENT;
		goto out;
	}

	/*
	 * Publish the metadata to the task & global buffer list. This must be
	 * done under the task lock to synchronize with task termination - i.e.
	 * task_terminate_internal is guaranteed to see the published metadata and
	 * tear it down.
	 */
	lck_mtx_lock(&reclamation_buffers_lock);
	task_lock(task);

	if (!task_is_active(task) || task_is_halting(task)) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: failed to initialize buffer on dying task %s [%d]", task_best_name(task), task_pid(task));
		kr = KERN_ABORTED;
		goto fail_task;
	}
	if (task->deferred_reclamation_metadata != NULL) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: tried to overwrite existing reclaim buffer for %s [%d]", task_best_name(task), task_pid(task));
		kr = KERN_INVALID_ARGUMENT;
		goto fail_task;
	}

	vmdr_list_append_locked(metadata);

	task->deferred_reclamation_metadata = metadata;

	task_unlock(task);
	lck_mtx_unlock(&reclamation_buffers_lock);

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END,
	    task_pid(task), KERN_SUCCESS, *address);
	return KERN_SUCCESS;

fail_task:
	task_unlock(task);
	lck_mtx_unlock(&reclamation_buffers_lock);

out:
	/*
	 * NOTE(review): the VA allocated above is not deallocated on this
	 * failure path — presumably it is reclaimed when the task's map is
	 * torn down or by the caller; confirm this is intentional.
	 */
	vmdr_metadata_release(metadata);
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END,
	    task_pid(task), kr);
	return kr;
}
341 
342 #pragma mark Synchronization
343 
/* Acquire the metadata mutex (protects most struct fields) */
static inline void
vmdr_metadata_lock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_lock(&metadata->vdrm_lock);
}
349 
/* Release the metadata mutex */
static inline void
vmdr_metadata_unlock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_unlock(&metadata->vdrm_lock);
}
355 
/* Assert the buffer gate is held; caller must hold vdrm_lock */
static inline void
vmdr_metadata_assert_owned_locked(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_gate_assert(&metadata->vdrm_lock, &metadata->vdrm_gate,
	    GATE_ASSERT_HELD);
}
362 
/* Assert the buffer gate is held (takes vdrm_lock); no-op unless MACH_ASSERT */
static inline void
vmdr_metadata_assert_owned(vm_deferred_reclamation_metadata_t metadata)
{
#if MACH_ASSERT
	vmdr_metadata_lock(metadata);
	vmdr_metadata_assert_owned_locked(metadata);
	vmdr_metadata_unlock(metadata);
#else /* MACH_ASSERT */
	(void)metadata;
#endif /* MACH_ASSERT */
}
374 
375 
/*
 * Try to take ownership of the buffer. Returns true if successful.
 * Caller must hold vdrm_lock. Does not block.
 */
static bool
vmdr_metadata_try_own_locked(vm_deferred_reclamation_metadata_t metadata)
{
	/* Close the gate if it is currently open (i.e. unowned) */
	kern_return_t kr = lck_mtx_gate_try_close(&metadata->vdrm_lock,
	    &metadata->vdrm_gate);
	return kr == KERN_SUCCESS;
}
386 
/*
 * Take ownership of the buffer, blocking uninterruptibly until the current
 * owner hands it off. Caller must hold vdrm_lock.
 */
static void
vmdr_metadata_own_locked(vm_deferred_reclamation_metadata_t metadata)
{
	__assert_only gate_wait_result_t wait_result;
	if (!vmdr_metadata_try_own_locked(metadata)) {
		wait_result = lck_mtx_gate_wait(
			&metadata->vdrm_lock, &metadata->vdrm_gate, LCK_SLEEP_DEFAULT,
			THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
		/* Owners release via handoff, so that is the only expected wakeup */
		assert(wait_result == GATE_HANDOFF);
	}
}
398 
/*
 * Set the current thread as the owner of a reclaim buffer. May block. Will
 * propagate priority.
 */
static void
vmdr_metadata_own(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_lock(metadata);
	vmdr_metadata_own_locked(metadata);
	vmdr_metadata_unlock(metadata);
}
410 
/* Release buffer ownership; caller must hold vdrm_lock and own the gate */
static void
vmdr_metadata_disown_locked(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_assert_owned_locked(metadata);
	/* Hand off to a waiter if present, otherwise open the gate */
	lck_mtx_gate_handoff(&metadata->vdrm_lock, &metadata->vdrm_gate,
	    GATE_HANDOFF_OPEN_IF_NO_WAITERS);
}
418 
/*
 * Release ownership of a reclaim buffer and wakeup any threads waiting for
 * ownership. Must be called from the thread that acquired ownership.
 */
static void
vmdr_metadata_disown(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_lock(metadata);
	vmdr_metadata_disown_locked(metadata);
	vmdr_metadata_unlock(metadata);
}
430 
/* Take a +1 reference on the metadata */
static void
vmdr_metadata_retain(vm_deferred_reclamation_metadata_t metadata)
{
	os_ref_retain(&metadata->vdrm_refcnt);
}
436 
/* Drop a reference on the metadata; frees it when the last ref is dropped */
static void
vmdr_metadata_release(vm_deferred_reclamation_metadata_t metadata)
{
	if (os_ref_release(&metadata->vdrm_refcnt) == 0) {
		vmdr_metadata_free(metadata);
	}
}
444 
/* External wrapper: take ownership of a reclaim buffer (may block) */
void
vm_deferred_reclamation_buffer_own(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_own(metadata);
}
450 
/* External wrapper: release ownership of a reclaim buffer */
void
vm_deferred_reclamation_buffer_disown(vm_deferred_reclamation_metadata_t metadata)
{
	vmdr_metadata_disown(metadata);
}
456 
457 #pragma mark Global Queue Management
458 
/*
 * Unlink a buffer from the global list. NULLs the link fields so membership
 * can be tested (and asserted) later. Requires reclamation_buffers_lock.
 */
static void
vmdr_list_remove_locked(vm_deferred_reclamation_metadata_t metadata)
{
	LCK_MTX_ASSERT(&reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);
	assert(metadata->vdrm_list.tqe_prev != NULL);
	TAILQ_REMOVE(&reclamation_buffers, metadata, vdrm_list);
	metadata->vdrm_list.tqe_prev = NULL;
	metadata->vdrm_list.tqe_next = NULL;
}
468 
/* Append a buffer to the global list. Requires reclamation_buffers_lock. */
static void
vmdr_list_append_locked(vm_deferred_reclamation_metadata_t metadata)
{
	LCK_MTX_ASSERT(&reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);
	/* Must not already be on the list (tqe_prev doubles as a membership flag) */
	assert(metadata->vdrm_list.tqe_prev == NULL);
	TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
}
476 
/*
 * Unlink a buffer from the async (pending-reclaim) list. NULLs the link
 * fields for membership tests. Requires async_reclamation_buffers_lock.
 */
static void
vmdr_async_list_remove_locked(vm_deferred_reclamation_metadata_t metadata)
{
	LCK_MTX_ASSERT(&async_reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);
	assert(metadata->vdrm_async_list.tqe_prev != NULL);
	TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
	metadata->vdrm_async_list.tqe_prev = NULL;
	metadata->vdrm_async_list.tqe_next = NULL;
}
486 
/* Append a buffer to the async list. Requires async_reclamation_buffers_lock. */
static void
vmdr_async_list_append_locked(vm_deferred_reclamation_metadata_t metadata)
{
	LCK_MTX_ASSERT(&async_reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);
	/* Must not already be queued for async reclamation */
	assert(metadata->vdrm_async_list.tqe_prev == NULL);
	TAILQ_INSERT_TAIL(&async_reclamation_buffers, metadata, vdrm_async_list);
}
494 
/*
 * True if the buffer is on the async list (i.e. a reclamation is pending).
 * Requires async_reclamation_buffers_lock.
 */
static bool
vmdr_metadata_has_pending_reclamation(vm_deferred_reclamation_metadata_t metadata)
{
	LCK_MTX_ASSERT(&async_reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);
	/* tqe_prev is non-NULL iff the entry is linked on the async list */
	return metadata->vdrm_async_list.tqe_prev != NULL;
}
501 
502 #pragma mark Lifecycle
503 
/*
 * Remove a buffer from both global lists so no other thread can find it.
 * Does not drop the task's reference — see
 * vm_deferred_reclamation_buffer_deallocate for that.
 */
void
vm_deferred_reclamation_buffer_uninstall(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	/*
	 * First remove the buffer from the global list so no one else can get access to it.
	 */
	lck_mtx_lock(&reclamation_buffers_lock);
	vmdr_list_remove_locked(metadata);
	lck_mtx_unlock(&reclamation_buffers_lock);

	/*
	 * Now remove it from the async list (if present)
	 */
	lck_mtx_lock(&async_reclamation_buffers_lock);
	if (vmdr_metadata_has_pending_reclamation(metadata)) {
		vmdr_async_list_remove_locked(metadata);
	}
	lck_mtx_unlock(&async_reclamation_buffers_lock);
}
524 
/*
 * Drop the owning task's reference on the buffer at task teardown.
 * The buffer must already have been uninstalled from both lists.
 */
void
vm_deferred_reclamation_buffer_deallocate(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	/* Buffer must be uninstalled before being deallocated */
	assert(metadata->vdrm_async_list.tqe_prev == NULL);
	assert(metadata->vdrm_async_list.tqe_next == NULL);
	assert(metadata->vdrm_list.tqe_prev == NULL);
	assert(metadata->vdrm_list.tqe_next == NULL);
	/*
	 * The task is dropping its ref on this buffer. First remove the buffer's
	 * back-reference to the task so that any threads currently operating on
	 * this buffer do not try to operate on the dead/dying task
	 */
	vmdr_metadata_lock(metadata);
	metadata->vdrm_task = TASK_NULL;
	vmdr_metadata_unlock(metadata);

	vmdr_metadata_release(metadata);
}
545 
546 #pragma mark Exception Delivery
547 
/*
 * Deliver a fatal virtual-memory EXC_GUARD exception to the task that owns
 * `metadata`, in response to a reclaim error (`reason` is the guard flavor,
 * `subcode` carries error detail). Non-fatal kGUARD_EXC_DEALLOC_GAP
 * exceptions (per the task's exc_guard policy) are logged and skipped.
 * Must be called without vdrm_lock held (it takes the lock itself).
 */
static void
reclaim_kill_with_reason(
	vm_deferred_reclamation_metadata_t metadata,
	unsigned reason,
	mach_exception_data_type_t subcode)
{
	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
	mach_exception_code_t code = 0;
	task_t task;
	proc_t p = NULL;
	boolean_t fatal = TRUE;
	bool killing_self;
	pid_t pid;
	int err;

	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_NOTOWNED);

	/* Pack the guard type/flavor into the EXC_GUARD code */
	EXC_GUARD_ENCODE_TYPE(code, guard_type);
	EXC_GUARD_ENCODE_FLAVOR(code, reason);
	EXC_GUARD_ENCODE_TARGET(code, 0);

	vmdr_metadata_lock(metadata);
	task = metadata->vdrm_task;
	if (task == TASK_NULL || !task_is_active(task) || task_is_halting(task)) {
		/* Task is no longer alive */
		vmdr_metadata_unlock(metadata);
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to deliver guard exception because task "
		    "[%d] is already dead.\n",
		    task ? task_pid(task) : -1);
		return;
	}

	if (panic_on_kill) {
		panic("vm_reclaim: About to kill %p due to %d with subcode %lld\n", task, reason, subcode);
	}

	/* Cross-task kills need a task reference to keep `task` alive below */
	killing_self = (task == current_task());
	if (!killing_self) {
		task_reference(task);
	}
	assert(task != kernel_task);
	vmdr_metadata_unlock(metadata);

	if (reason == kGUARD_EXC_DEALLOC_GAP) {
		/* Dealloc-gap fatality is governed by the task's exc_guard policy */
		task_lock(task);
		fatal = (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL);
		task_unlock(task);
	}

	if (!fatal) {
		os_log_info(vm_reclaim_log_handle,
		    "vm_reclaim: Skipping non fatal guard exception.\n");
		goto out;
	}

	pid = task_pid(task);
	if (killing_self) {
		p = get_bsdtask_info(task);
	} else {
		p = proc_find(pid);
		/* Guard against pid rollover: the proc must still map to `task` */
		if (p && proc_task(p) != task) {
			os_log_error(vm_reclaim_log_handle,
			    "vm_reclaim: Unable to deliver guard exception because proc is gone & pid rolled over.\n");
			goto out;
		}
	}

	if (!p) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to deliver guard exception because task does not have a proc.\n");
		goto out;
	}

	int flags = PX_DEBUG_NO_HONOR;
	exception_info_t info = {
		.os_reason = OS_REASON_GUARD,
		.exception_type = EXC_GUARD,
		.mx_code = code,
		.mx_subcode = subcode
	};

	err = exit_with_mach_exception(p, info, flags);
	if (err != 0) {
		os_log_error(vm_reclaim_log_handle, "vm_reclaim: Unable to deliver guard exception to %p: %d\n", p, err);
		goto out;
	}


out:
	/* Drop only the references taken for a cross-task kill */
	if (!killing_self) {
		if (p) {
			proc_rele(p);
			p = NULL;
		}
		if (task) {
			task_deallocate(task);
			task = NULL;
		}
	}
}
649 
650 #pragma mark CopyI/O
651 
/* Userspace address of the head index within the shared indices struct */
static user_addr_t
get_head_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, head);
}
657 
/* Userspace address of the tail index within the shared indices struct */
static user_addr_t
get_tail_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, tail);
}
663 
/* Userspace address of the busy index within the shared indices struct */
static user_addr_t
get_busy_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, busy);
}
669 
/*
 * Translate a copyio errno into a kern_return_t, delivering a fatal guard
 * exception for genuine copyio failures. EFAULT while faulting is disabled
 * (the RECLAIM_NO_FAULT path) is expected and does not kill the task.
 */
static kern_return_t
reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result)
{
	if (result != 0 && (result != EFAULT || !vm_fault_get_disabled())) {
		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_COPYIO_FAILURE,
		    result);
	}
	return kern_return_for_errno(result);
}
679 
680 /*
681  * Helper functions to do copyio on the head, tail, and busy pointers.
682  * Note that the kernel will only write to the busy and head pointers.
683  * Userspace is not supposed to write to the head or busy pointers, but the kernel
684  * must be resilient to that kind of bug in userspace.
685  */
686 
687 static kern_return_t
reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata,uint64_t * head)688 reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head)
689 {
690 	int result;
691 	kern_return_t kr;
692 	user_addr_t indices = metadata->vdrm_reclaim_indices;
693 	user_addr_t head_ptr = get_head_ptr(indices);
694 
695 	result = copyin_atomic64(head_ptr, head);
696 	kr = reclaim_handle_copyio_error(metadata, result);
697 	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
698 		os_log_error(vm_reclaim_log_handle,
699 		    "vm_reclaim: Unable to copy head ptr from 0x%llx: err=%d\n", head_ptr, result);
700 	}
701 	return kr;
702 }
703 
704 static kern_return_t
reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata,uint64_t * tail)705 reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail)
706 {
707 	int result;
708 	kern_return_t kr;
709 	user_addr_t indices = metadata->vdrm_reclaim_indices;
710 	user_addr_t tail_ptr = get_tail_ptr(indices);
711 
712 	result = copyin_atomic64(tail_ptr, tail);
713 	kr = reclaim_handle_copyio_error(metadata, result);
714 	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
715 		os_log_error(vm_reclaim_log_handle,
716 		    "vm_reclaim: Unable to copy tail ptr from 0x%llx: err=%d\n", tail_ptr, result);
717 	}
718 	return kr;
719 }
720 
721 static kern_return_t
reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata,uint64_t * busy)722 reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy)
723 {
724 	int result;
725 	kern_return_t kr;
726 	user_addr_t indices = metadata->vdrm_reclaim_indices;
727 	user_addr_t busy_ptr = get_busy_ptr(indices);
728 
729 	result = copyin_atomic64(busy_ptr, busy);
730 	kr = reclaim_handle_copyio_error(metadata, result);
731 	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
732 		os_log_error(vm_reclaim_log_handle,
733 		    "vm_reclaim: Unable to copy busy ptr from 0x%llx: err=%d\n", busy_ptr, result);
734 	}
735 	return kr;
736 }
737 
738 static bool
reclaim_copyout_busy(vm_deferred_reclamation_metadata_t metadata,uint64_t value)739 reclaim_copyout_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
740 {
741 	int result;
742 	kern_return_t kr;
743 	user_addr_t indices = metadata->vdrm_reclaim_indices;
744 	user_addr_t busy_ptr = get_busy_ptr(indices);
745 
746 	result = copyout_atomic64(value, busy_ptr);
747 	kr = reclaim_handle_copyio_error(metadata, result);
748 	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
749 		os_log_error(vm_reclaim_log_handle,
750 		    "vm_reclaim: Unable to copy %llu to busy ptr at 0x%llx: err=%d\n", value, busy_ptr, result);
751 	}
752 	return kr;
753 }
754 
755 static bool
reclaim_copyout_head(vm_deferred_reclamation_metadata_t metadata,uint64_t value)756 reclaim_copyout_head(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
757 {
758 	int result;
759 	kern_return_t kr;
760 	user_addr_t indices = metadata->vdrm_reclaim_indices;
761 	user_addr_t head_ptr = get_head_ptr(indices);
762 
763 	result = copyout_atomic64(value, head_ptr);
764 	kr = reclaim_handle_copyio_error(metadata, result);
765 	if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) {
766 		os_log_error(vm_reclaim_log_handle,
767 		    "vm_reclaim: Unable to copy %llu to head ptr at 0x%llx: err=%d\n", value, head_ptr, result);
768 	}
769 	return kr;
770 }
771 
772 #pragma mark Reclamation
773 
774 /*
775  * Reclaim a chunk (kReclaimChunkSize entries) from the buffer.
776  *
777  * Writes the number of entries reclaimed to `num_reclaimed_out`. Note that
778  * there may be zero reclaimable entries in the chunk (they have all been
779  * re-used by userspace).
780  *
781  * Returns:
782  *  - KERN_NOT_FOUND if the buffer has been exhausted (head == tail)
783  *  - KERN_FAILURE on failure to reclaim -- metadata lock will be dropped
784  *    before returning
785  */
786 static kern_return_t
reclaim_chunk(vm_deferred_reclamation_metadata_t metadata,size_t * num_reclaimed_out,vm_deferred_reclamation_options_t options)787 reclaim_chunk(vm_deferred_reclamation_metadata_t metadata,
788     size_t *num_reclaimed_out, vm_deferred_reclamation_options_t options)
789 {
790 	kern_return_t kr;
791 	int result = 0;
792 	size_t num_reclaimed = 0;
793 	uint64_t head = 0, tail = 0, busy = 0, num_to_reclaim = 0, new_tail = 0,
794 	    num_copied = 0, buffer_len = 0;
795 	user_addr_t indices;
796 	vm_map_t map = metadata->vdrm_map, old_map;
797 	mach_vm_reclaim_entry_v1_t reclaim_entries[kReclaimChunkSize];
798 
799 	assert(metadata != NULL);
800 	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_NOTOWNED);
801 
802 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_START,
803 	    metadata->vdrm_pid, kReclaimChunkSize);
804 
805 	buffer_len = metadata->vdrm_buffer_size /
806 	    sizeof(mach_vm_reclaim_entry_v1_t);
807 
808 	memset(reclaim_entries, 0, sizeof(reclaim_entries));
809 
810 	indices = (user_addr_t) metadata->vdrm_reclaim_indices;
811 	old_map = vm_map_switch(map);
812 
813 	if (options & RECLAIM_NO_FAULT) {
814 		vm_fault_disable();
815 	}
816 
817 	kr = reclaim_copyin_busy(metadata, &busy);
818 	if (kr != KERN_SUCCESS) {
819 		goto fail;
820 	}
821 	kr = reclaim_copyin_head(metadata, &head);
822 	if (kr != KERN_SUCCESS) {
823 		goto fail;
824 	}
825 	kr = reclaim_copyin_tail(metadata, &tail);
826 	if (kr != KERN_SUCCESS) {
827 		goto fail;
828 	}
829 
830 	if (busy != head) {
831 		// Userspace overwrote one of the pointers
832 		os_log_error(vm_reclaim_log_handle,
833 		    "vm_reclaim: Userspace modified head or busy pointer! head: %llu "
834 		    "(0x%llx) != busy: %llu (0x%llx) | tail = %llu (0x%llx)\n",
835 		    head, get_head_ptr(indices), busy, get_busy_ptr(indices), tail,
836 		    get_tail_ptr(indices));
837 		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_INDEX_FAILURE,
838 		    busy);
839 		kr = KERN_FAILURE;
840 		goto fail;
841 	}
842 
843 	if (tail < head) {
844 		/*
845 		 * Userspace is likely in the middle of trying to re-use an entry,
846 		 * bail on this reclamation.
847 		 */
848 		os_log_error(vm_reclaim_log_handle,
849 		    "vm_reclaim: Userspace modified head or tail pointer! head: %llu "
850 		    "(0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
851 		    head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy,
852 		    get_busy_ptr(indices));
853 		kr = KERN_FAILURE;
854 		goto fail;
855 	}
856 
857 	/*
858 	 * NB: If any of the copyouts below fail due to faults being disabled,
859 	 * the buffer may be left in a state where several entries are unusable
860 	 * until the next reclamation (i.e. busy > head)
861 	 */
862 	num_to_reclaim = tail - head;
863 	while (true) {
864 		num_to_reclaim = MIN(num_to_reclaim, kReclaimChunkSize);
865 		if (num_to_reclaim == 0) {
866 			break;
867 		}
868 		busy = head + num_to_reclaim;
869 		kr = reclaim_copyout_busy(metadata, busy);
870 		if (kr != KERN_SUCCESS) {
871 			goto fail;
872 		}
873 		os_atomic_thread_fence(seq_cst);
874 		kr = reclaim_copyin_tail(metadata, &new_tail);
875 		if (kr != KERN_SUCCESS) {
876 			goto fail;
877 		}
878 
879 		if (new_tail >= busy) {
880 			/* Got num_to_reclaim entries */
881 			break;
882 		}
883 		tail = new_tail;
884 		if (tail < head) {
885 			/*
886 			 * Userspace is likely in the middle of trying to re-use an entry,
887 			 * bail on this reclamation
888 			 */
889 			os_log_error(vm_reclaim_log_handle,
890 			    "vm_reclaim: Userspace modified head or tail pointer! head: "
891 			    "%llu (0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
892 			    head, get_head_ptr(indices), tail, get_tail_ptr(indices),
893 			    busy, get_busy_ptr(indices));
894 			/* Reset busy back to head */
895 			reclaim_copyout_busy(metadata, head);
896 			kr = KERN_FAILURE;
897 			goto fail;
898 		}
899 		/* Can't reclaim these entries. Try again */
900 		num_to_reclaim = tail - head;
901 		if (num_to_reclaim == 0) {
902 			/* Nothing left to reclaim. Reset busy to head. */
903 			kr = reclaim_copyout_busy(metadata, head);
904 			if (kr != KERN_SUCCESS) {
905 				goto fail;
906 			}
907 			break;
908 		}
909 		/*
910 		 * Note that num_to_reclaim must have gotten smaller since tail got smaller,
911 		 * so this is gauranteed to converge.
912 		 */
913 	}
914 
915 	while (num_copied < num_to_reclaim) {
916 		uint64_t memcpy_start_idx = (head % buffer_len);
917 		uint64_t memcpy_end_idx = memcpy_start_idx + num_to_reclaim - num_copied;
918 		// Clamp the end idx to the buffer. We'll handle wrap-around in our next go around the loop.
919 		memcpy_end_idx = MIN(memcpy_end_idx, buffer_len);
920 		uint64_t num_to_copy = memcpy_end_idx - memcpy_start_idx;
921 
922 		assert(num_to_copy + num_copied <= kReclaimChunkSize);
923 		user_addr_t src_ptr = metadata->vdrm_reclaim_buffer +
924 		    (memcpy_start_idx * sizeof(mach_vm_reclaim_entry_v1_t));
925 		mach_vm_reclaim_entry_v1_t *dst_ptr = reclaim_entries + num_copied;
926 
927 		result = copyin(src_ptr, dst_ptr,
928 		    (num_to_copy * sizeof(mach_vm_reclaim_entry_v1_t)));
929 		kr = reclaim_handle_copyio_error(metadata, result);
930 		if (kr != KERN_SUCCESS) {
931 			if (kr != KERN_MEMORY_ERROR) {
932 				os_log_error(vm_reclaim_log_handle,
933 				    "vm_reclaim: Unable to copyin %llu entries in reclaim "
934 				    "buffer at 0x%llx to 0x%llx: err=%d\n",
935 				    num_to_copy, src_ptr, (uint64_t) dst_ptr, result);
936 			}
937 			goto fail;
938 		}
939 
940 		num_copied += num_to_copy;
941 		head += num_to_copy;
942 	}
943 
944 	for (size_t i = 0; i < num_to_reclaim; i++) {
945 		mach_vm_reclaim_entry_v1_t *entry = &reclaim_entries[i];
946 		KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_START,
947 		    metadata->vdrm_pid, entry->address, entry->size,
948 		    entry->behavior);
949 		DTRACE_VM4(vm_reclaim_chunk,
950 		    int, metadata->vdrm_pid,
951 		    mach_vm_address_t, entry->address,
952 		    size_t, entry->size,
953 		    mach_vm_reclaim_behavior_v1_t, entry->behavior);
954 		if (entry->address != 0 && entry->size != 0) {
955 			switch (entry->behavior) {
956 			case MACH_VM_RECLAIM_DEALLOCATE:
957 				kr = vm_map_remove_guard(map,
958 				    vm_map_trunc_page(entry->address,
959 				    VM_MAP_PAGE_MASK(map)),
960 				    vm_map_round_page(entry->address + entry->size,
961 				    VM_MAP_PAGE_MASK(map)),
962 				    VM_MAP_REMOVE_GAPS_FAIL,
963 				    KMEM_GUARD_NONE).kmr_return;
964 				if (kr == KERN_INVALID_VALUE) {
965 					reclaim_kill_with_reason(metadata, kGUARD_EXC_DEALLOC_GAP, entry->address);
966 					goto fail;
967 				} else if (kr != KERN_SUCCESS) {
968 					os_log_error(vm_reclaim_log_handle,
969 					    "vm_reclaim: Unable to deallocate 0x%llx (%u) from 0x%llx err=%d\n",
970 					    entry->address, entry->size, (uint64_t) map, kr);
971 					reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
972 					goto fail;
973 				}
974 				break;
975 			case MACH_VM_RECLAIM_REUSABLE:
976 				kr = vm_map_behavior_set(map,
977 				    vm_map_trunc_page(entry->address, VM_MAP_PAGE_MASK(map)),
978 				    vm_map_round_page(entry->address + entry->size, VM_MAP_PAGE_MASK(map)),
979 				    VM_BEHAVIOR_REUSABLE);
980 				if (kr != KERN_SUCCESS) {
981 					os_log_error(vm_reclaim_log_handle,
982 					    "vm_reclaim: unable to free(reusable) 0x%llx (%u) for pid %d err=%d\n",
983 					    entry->address, entry->size, metadata->vdrm_pid, kr);
984 				}
985 				break;
986 			default:
987 				os_log_error(vm_reclaim_log_handle,
988 				    "vm_reclaim: attempted to reclaim entry with unsupported behavior %uh",
989 				    entry->behavior);
990 				reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
991 				kr = KERN_INVALID_VALUE;
992 				goto fail;
993 			}
994 			num_reclaimed++;
995 			os_atomic_add(&metadata->vdrm_num_bytes_reclaimed, entry->size, relaxed);
996 			KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_END,
997 			    metadata->vdrm_pid, entry->address);
998 		}
999 	}
1000 
1001 	kr = reclaim_copyout_head(metadata, head);
1002 	if (kr != KERN_SUCCESS) {
1003 		goto fail;
1004 	}
1005 
1006 	if (options & RECLAIM_NO_FAULT) {
1007 		vm_fault_enable();
1008 	}
1009 	vm_map_switch(old_map);
1010 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_END,
1011 	    metadata->vdrm_pid, num_to_reclaim, num_reclaimed, true);
1012 	*num_reclaimed_out = num_reclaimed;
1013 	if (num_to_reclaim == 0) {
1014 		// We have exhausted the reclaimable portion of the buffer
1015 		return KERN_NOT_FOUND;
1016 	}
1017 	return KERN_SUCCESS;
1018 
1019 fail:
1020 	if (options & RECLAIM_NO_FAULT) {
1021 		vm_fault_enable();
1022 	}
1023 	vm_map_switch(old_map);
1024 	*num_reclaimed_out = num_reclaimed;
1025 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_END,
1026 	    metadata->vdrm_pid, num_to_reclaim, num_reclaimed, false);
1027 	return kr;
1028 }
1029 
/*
 * Attempts to reclaim until the buffer's estimated number of available bytes
 * is <= num_bytes_reclaimable_threshold. The metadata buffer lock should be
 * held by the caller.
 *
 * Writes the number of entries reclaimed to `num_reclaimed_out`.
 */
static kern_return_t
reclaim_entries_from_buffer(vm_deferred_reclamation_metadata_t metadata,
    size_t num_bytes_reclaimable_threshold, size_t *num_reclaimed_out)
{
	assert(metadata != NULL);
	assert(num_reclaimed_out != NULL);
	/* Caller must own the buffer but must NOT hold the metadata mutex --
	 * reclaim_chunk() takes it internally. */
	vmdr_metadata_assert_owned(metadata);
	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_NOTOWNED);

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ENTRIES) | DBG_FUNC_START, metadata->vdrm_pid);

	size_t num_entries_reclaimed = 0, num_bytes_reclaimed, estimated_reclaimable_bytes, reclaimable_bytes;
	while (true) {
		kern_return_t kr;
		size_t curr_entries_reclaimed = 0;
		/* Both counters are maintained with relaxed atomics; this is a
		 * heuristic estimate, not an exact accounting. */
		num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed);
		reclaimable_bytes = os_atomic_load(&metadata->vdrm_num_bytes_put_in_buffer, relaxed);
		if (num_bytes_reclaimed > reclaimable_bytes) {
			/* Counters raced; clamp the estimate at zero. */
			estimated_reclaimable_bytes = 0;
		} else {
			estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed;
		}
		/* NOTE(review): this compares reclaimable_bytes (total put in
		 * buffer) against the threshold, not estimated_reclaimable_bytes,
		 * although the function comment speaks of the estimate; the
		 * estimate is only emitted via KDBG below. Confirm intent. */
		if (reclaimable_bytes <= num_bytes_reclaimable_threshold) {
			break;
		}
		kr = reclaim_chunk(metadata, &curr_entries_reclaimed,
		    RECLAIM_OPTIONS_NONE);
		if (kr == KERN_NOT_FOUND) {
			// Nothing left to reclaim
			break;
		} else if (kr != KERN_SUCCESS) {
			/* Propagate the failure; report progress made so far. */
			KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ENTRIES) | DBG_FUNC_END,
			    metadata->vdrm_pid, num_entries_reclaimed,
			    estimated_reclaimable_bytes, kr);
			*num_reclaimed_out = num_entries_reclaimed;
			return kr;
		}
		num_entries_reclaimed += curr_entries_reclaimed;
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ENTRIES) | DBG_FUNC_END,
	    metadata->vdrm_pid, num_entries_reclaimed,
	    estimated_reclaimable_bytes, KERN_SUCCESS);
	*num_reclaimed_out = num_entries_reclaimed;
	return KERN_SUCCESS;
}
1083 
/*
 * Get the reclamation metadata buffer for the given task.
 */
1087 static vm_deferred_reclamation_metadata_t
get_task_reclaim_metadata(task_t task)1088 get_task_reclaim_metadata(task_t task)
1089 {
1090 	assert(task != NULL);
1091 	vm_deferred_reclamation_metadata_t metadata = NULL;
1092 	task_lock(task);
1093 	metadata = task->deferred_reclamation_metadata;
1094 	task_unlock(task);
1095 	return metadata;
1096 }
1097 
1098 kern_return_t
vm_deferred_reclamation_buffer_synchronize_internal(task_t task,size_t num_entries_to_reclaim)1099 vm_deferred_reclamation_buffer_synchronize_internal(task_t task, size_t num_entries_to_reclaim)
1100 {
1101 	kern_return_t kr;
1102 	vm_deferred_reclamation_metadata_t metadata = NULL;
1103 	size_t total_reclaimed = 0;
1104 
1105 	if (!task_is_active(task)) {
1106 		return KERN_FAILURE;
1107 	}
1108 
1109 	metadata = get_task_reclaim_metadata(task);
1110 	if (metadata == NULL) {
1111 		return KERN_INVALID_ARGUMENT;
1112 	}
1113 
1114 	vmdr_metadata_own(metadata);
1115 
1116 	while (total_reclaimed < num_entries_to_reclaim) {
1117 		size_t num_reclaimed;
1118 		kr = reclaim_chunk(metadata, &num_reclaimed, RECLAIM_OPTIONS_NONE);
1119 		if (kr == KERN_NOT_FOUND) {
1120 			/* buffer has been fully reclaimed from */
1121 			break;
1122 		} else if (kr != KERN_SUCCESS) {
1123 			vmdr_metadata_disown(metadata);
1124 			return kr;
1125 		}
1126 
1127 		total_reclaimed += num_reclaimed;
1128 	}
1129 
1130 	vmdr_metadata_disown(metadata);
1131 	return KERN_SUCCESS;
1132 }
1133 
1134 kern_return_t
vm_deferred_reclamation_buffer_update_reclaimable_bytes_internal(task_t task,size_t reclaimable_bytes)1135 vm_deferred_reclamation_buffer_update_reclaimable_bytes_internal(task_t task, size_t reclaimable_bytes)
1136 {
1137 	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
1138 	size_t num_bytes_reclaimed, estimated_reclaimable_bytes, num_bytes_in_buffer, num_reclaimed = 0;
1139 	bool success;
1140 	kern_return_t kr = KERN_SUCCESS;
1141 	if (metadata == NULL) {
1142 		return KERN_INVALID_ARGUMENT;
1143 	}
1144 
1145 	if (!metadata->vdrm_pid) {
1146 		metadata->vdrm_pid = task_pid(task);
1147 	}
1148 
1149 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_START,
1150 	    metadata->vdrm_pid, reclaimable_bytes);
1151 
1152 	/*
1153 	 * The client is allowed to make this call in parallel from multiple threads.
1154 	 * Ensure we only ever increase the value of vdrm_num_bytes_put_in_buffer.
1155 	 * If the client's value is smaller than what we've stored, another thread
1156 	 * raced ahead of them and we've already acted on that accounting so this
1157 	 * call should be a no-op.
1158 	 */
1159 	success = os_atomic_rmw_loop(&metadata->vdrm_num_bytes_put_in_buffer, num_bytes_in_buffer,
1160 	    reclaimable_bytes, acquire,
1161 	{
1162 		if (num_bytes_in_buffer > reclaimable_bytes) {
1163 		        os_atomic_rmw_loop_give_up(break);
1164 		}
1165 	});
1166 	if (!success) {
1167 		/* Stale value. Nothing new to reclaim */
1168 		goto done;
1169 	}
1170 	num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed);
1171 
1172 	if (reclaimable_bytes > num_bytes_reclaimed) {
1173 		estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed;
1174 		if (estimated_reclaimable_bytes > vm_reclaim_max_threshold) {
1175 			vmdr_metadata_own(metadata);
1176 			kr = reclaim_entries_from_buffer(metadata,
1177 			    vm_reclaim_max_threshold, &num_reclaimed);
1178 			vmdr_metadata_disown(metadata);
1179 		}
1180 	}
1181 
1182 done:
1183 	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_END,
1184 	    metadata->vdrm_pid, reclaimable_bytes, num_bytes_reclaimed,
1185 	    num_reclaimed);
1186 
1187 	return kr;
1188 }
1189 
1190 static inline size_t
pick_reclaim_threshold(vm_deferred_reclamation_action_t action)1191 pick_reclaim_threshold(vm_deferred_reclamation_action_t action)
1192 {
1193 	switch (action) {
1194 	case RECLAIM_FULL:
1195 		return 0;
1196 	case RECLAIM_TRIM:
1197 		return vm_reclaim_max_threshold / vm_reclaim_trim_divisor;
1198 	case RECLAIM_ASYNC:
1199 		return 0;
1200 	}
1201 }
1202 
/*
 * Perform a system-wide reclamation pass. RECLAIM_ASYNC drains the async
 * list; RECLAIM_TRIM/RECLAIM_FULL walk every registered buffer once,
 * reclaiming each down to the action's threshold.
 */
void
vm_deferred_reclamation_reclaim_memory(vm_deferred_reclamation_action_t action, vm_deferred_reclamation_options_t options)
{
	kern_return_t kr;
	size_t num_reclaimed;
	size_t reclaim_threshold;

	switch (action) {
	case RECLAIM_ASYNC:
		lck_mtx_lock(&async_reclamation_buffers_lock);
		vmdr_process_async_reclamation_list();
		lck_mtx_unlock(&async_reclamation_buffers_lock);
		break;
	case RECLAIM_TRIM:
	case RECLAIM_FULL:
		reclaim_threshold = pick_reclaim_threshold(action);
		KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ALL_MEMORY) | DBG_FUNC_START,
		    action, reclaim_threshold);
		lck_mtx_lock(&reclamation_buffers_lock);
		/* Bump the generation counter; buffers stamped with it below
		 * are recognized as already visited this pass. */
		reclamation_counter++;
		vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&reclamation_buffers);
		while (metadata != NULL) {
			/* Rotate to the tail so the walk terminates when we
			 * re-encounter a buffer stamped with this pass. */
			vmdr_list_remove_locked(metadata);
			vmdr_list_append_locked(metadata);
			/* Hold a ref across the list-lock drop. */
			vmdr_metadata_retain(metadata);
			lck_mtx_unlock(&reclamation_buffers_lock);

			vmdr_metadata_lock(metadata);

			if (metadata->vdrm_reclaimed_at >= reclamation_counter) {
				// We've already seen this one. We're done
				vmdr_metadata_unlock(metadata);
				lck_mtx_lock(&reclamation_buffers_lock);
				break;
			}
			metadata->vdrm_reclaimed_at = reclamation_counter;

			if (options & RECLAIM_NO_WAIT) {
				/* Best-effort pass: skip buffers someone else owns. */
				bool acquired = vmdr_metadata_try_own_locked(metadata);
				if (!acquired) {
					vmdr_metadata_unlock(metadata);
					goto next;
				}
			} else {
				vmdr_metadata_own_locked(metadata);
			}
			vmdr_metadata_unlock(metadata);

			kr = reclaim_entries_from_buffer(metadata,
			    reclaim_threshold, &num_reclaimed);

			vmdr_metadata_disown(metadata);
next:
			vmdr_metadata_release(metadata);
			lck_mtx_lock(&reclamation_buffers_lock);
			metadata = TAILQ_FIRST(&reclamation_buffers);
		}
		lck_mtx_unlock(&reclamation_buffers_lock);
		KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ALL_MEMORY) | DBG_FUNC_END,
		    reclamation_counter);
		break;
	default:
		panic("Unexpected reclaim action %d", action);
	}
}
1268 
/*
 * Convenience wrapper: fully reclaim every registered buffer.
 */
void
vm_deferred_reclamation_reclaim_all_memory(
	vm_deferred_reclamation_options_t options)
{
	vm_deferred_reclamation_reclaim_memory(RECLAIM_FULL, options);
}
1275 
1276 bool
vm_deferred_reclamation_reclaim_from_task_async(task_t task)1277 vm_deferred_reclamation_reclaim_from_task_async(task_t task)
1278 {
1279 	bool queued = false;
1280 	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
1281 
1282 	if (metadata != NULL) {
1283 		os_log_debug(vm_reclaim_log_handle, "vm_reclaim: enquequeing %d for "
1284 		    "asynchronous reclamation.\n", task_pid(task));
1285 		lck_mtx_lock(&async_reclamation_buffers_lock);
1286 		// move this buffer to the tail if still on the async list
1287 		if (vmdr_metadata_has_pending_reclamation(metadata)) {
1288 			vmdr_async_list_remove_locked(metadata);
1289 		}
1290 		vmdr_async_list_append_locked(metadata);
1291 		lck_mtx_unlock(&async_reclamation_buffers_lock);
1292 		queued = true;
1293 		thread_wakeup_thread(&vm_reclaim_thread, vm_reclaim_thread);
1294 	}
1295 
1296 	return queued;
1297 }
1298 
1299 kern_return_t
vm_deferred_reclamation_reclaim_from_task_sync(task_t task,size_t max_entries_to_reclaim)1300 vm_deferred_reclamation_reclaim_from_task_sync(task_t task, size_t max_entries_to_reclaim)
1301 {
1302 	kern_return_t kr;
1303 	size_t num_reclaimed = 0;
1304 	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
1305 
1306 	if (!task_is_active(task) || task_is_halting(task)) {
1307 		return KERN_ABORTED;
1308 	}
1309 
1310 	if (metadata != NULL) {
1311 		vmdr_metadata_own(metadata);
1312 		while (num_reclaimed < max_entries_to_reclaim) {
1313 			size_t num_reclaimed_now;
1314 			kr = reclaim_chunk(metadata, &num_reclaimed_now, RECLAIM_OPTIONS_NONE);
1315 			if (kr == KERN_NOT_FOUND) {
1316 				// Nothing left to reclaim
1317 				break;
1318 			} else if (kr != KERN_SUCCESS) {
1319 				/* Lock has already been released and task is being killed. */
1320 				vmdr_metadata_disown(metadata);
1321 				return kr;
1322 			}
1323 			num_reclaimed += num_reclaimed_now;
1324 		}
1325 		vmdr_metadata_disown(metadata);
1326 	}
1327 
1328 	return KERN_SUCCESS;
1329 }
1330 
1331 vm_deferred_reclamation_metadata_t
vm_deferred_reclamation_buffer_fork(task_t task,vm_deferred_reclamation_metadata_t parent)1332 vm_deferred_reclamation_buffer_fork(task_t task, vm_deferred_reclamation_metadata_t parent)
1333 {
1334 	vm_deferred_reclamation_metadata_t metadata = NULL;
1335 	vmdr_metadata_assert_owned(parent);
1336 
1337 	assert(task->deferred_reclamation_metadata == NULL);
1338 	metadata = vmdr_metadata_alloc(task, parent->vdrm_reclaim_buffer,
1339 	    parent->vdrm_buffer_size, parent->vdrm_reclaim_indices);
1340 	vmdr_metadata_disown(parent);
1341 
1342 	lck_mtx_lock(&reclamation_buffers_lock);
1343 	vmdr_list_append_locked(metadata);
1344 	lck_mtx_unlock(&reclamation_buffers_lock);
1345 
1346 	return metadata;
1347 }
1348 
/*
 * One-time setup for the reclaim thread: join the VM thread group (when
 * thread groups are configured) and name the thread for debugging.
 */
static void
reclaim_thread_init(void)
{
#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif
	thread_set_thread_name(current_thread(), "VM_reclaim");
}
1357 
1358 
/*
 * Drain the async reclamation list: fully reclaim each queued buffer.
 * Called with async_reclamation_buffers_lock held; the lock is dropped
 * around each reclamation and re-taken afterwards.
 */
static void
vmdr_process_async_reclamation_list(void)
{
	kern_return_t kr;
	size_t total_entries_reclaimed = 0;
	size_t num_tasks_reclaimed = 0;
	LCK_MTX_ASSERT(&async_reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ASYNC_MEMORY) | DBG_FUNC_START);

	vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&async_reclamation_buffers);
	while (metadata != NULL) {
		size_t num_reclaimed;
		/* Ref held across the lock drop below. */
		vmdr_metadata_retain(metadata);
		/*
		 * NB: It is safe to drop the async list lock without removing the
		 * buffer because only one thread (the reclamation thread) may consume
		 * from the async list. The buffer is guaranteed to still be in the
		 * list when the lock is re-taken.
		 */
		lck_mtx_unlock(&async_reclamation_buffers_lock);

		vmdr_metadata_own(metadata);

		/* NB: Currently the async reclaim thread fully reclaims the buffer */
		kr = reclaim_entries_from_buffer(metadata, 0, &num_reclaimed);
		total_entries_reclaimed += num_reclaimed;
		num_tasks_reclaimed++;

		/* Sanity: reclaim_chunk must have restored the kernel map. */
		assert(current_thread()->map == kernel_map);
		vmdr_metadata_disown(metadata);

		lck_mtx_lock(&async_reclamation_buffers_lock);
		/* Wakeup anyone waiting on this buffer getting processed */
		if (metadata->vdrm_waiters) {
			wakeup_all_with_inheritor(&metadata->vdrm_async_list,
			    THREAD_AWAKENED);
		}
		/*
		 * Check that the buffer has not been removed from the async list
		 * while being reclaimed from. This can happen if the task terminates
		 * while the reclamation is in flight.
		 */
		if (vmdr_metadata_has_pending_reclamation(metadata)) {
			vmdr_async_list_remove_locked(metadata);
		}
		vmdr_metadata_release(metadata);
		metadata = TAILQ_FIRST(&async_reclamation_buffers);
	}
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ASYNC_MEMORY) | DBG_FUNC_END,
	    num_tasks_reclaimed, total_entries_reclaimed);
}
1410 
/*
 * Continuation parameter for reclaim_thread(): distinguishes the first
 * entry (one-time init) from subsequent continuation wakeups.
 */
__enum_decl(reclaim_thread_state, uint32_t, {
	RECLAIM_THREAD_INIT = 0,
	RECLAIM_THREAD_CONT = 1,
});
1415 
/*
 * One iteration of the reclaim thread: drain the async list, then arm the
 * next wait. assert_wait() is issued while still holding the list lock so
 * a wakeup posted between draining and waiting cannot be lost.
 */
static void
reclaim_thread_continue(void)
{
	lck_mtx_lock(&async_reclamation_buffers_lock);

	vmdr_process_async_reclamation_list();
	assert_wait(&vm_reclaim_thread, THREAD_UNINT);

	lck_mtx_unlock(&async_reclamation_buffers_lock);
}
1426 
/*
 * Body of the dedicated VM reclaim kernel thread. Runs one drain pass,
 * then blocks with itself as the continuation (passing RECLAIM_THREAD_CONT
 * so init is performed only on the first entry).
 */
void
reclaim_thread(void *param, wait_result_t wr __unused)
{
	if (param == (void *) RECLAIM_THREAD_INIT) {
		/* First entry: one-time thread setup. */
		reclaim_thread_init();
	} else {
		assert(param == (void *) RECLAIM_THREAD_CONT);
	}

	reclaim_thread_continue();

	(void) thread_block_parameter(reclaim_thread, (void*) RECLAIM_THREAD_CONT);
}
1440 
/*
 * Boot-time initialization: create the log handle and start the reclaim
 * thread (which performs its own init via RECLAIM_THREAD_INIT).
 */
__startup_func
static void
vm_deferred_reclamation_init(void)
{
	// Note: no-op pending rdar://27006343 (Custom kernel log handles)
	vm_reclaim_log_handle = os_log_create("com.apple.xnu", "vm_reclaim");

	(void)kernel_thread_start_priority(reclaim_thread,
	    (void *)RECLAIM_THREAD_INIT, kReclaimThreadPriority,
	    &vm_reclaim_thread);
}

STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_deferred_reclamation_init);
1454 
1455 #if DEVELOPMENT || DEBUG
1456 
1457 bool
vm_deferred_reclamation_block_until_pid_has_been_reclaimed(int pid)1458 vm_deferred_reclamation_block_until_pid_has_been_reclaimed(int pid)
1459 {
1460 	vm_deferred_reclamation_metadata_t metadata = NULL;
1461 	proc_t p = proc_find(pid);
1462 	if (p == NULL) {
1463 		return false;
1464 	}
1465 	task_t t = proc_task(p);
1466 	if (t == NULL) {
1467 		proc_rele(p);
1468 		return false;
1469 	}
1470 
1471 	task_lock(t);
1472 	if (!task_is_halting(t) && task_is_active(t)) {
1473 		metadata = t->deferred_reclamation_metadata;
1474 		if (metadata != NULL) {
1475 			vmdr_metadata_retain(metadata);
1476 		}
1477 	}
1478 	task_unlock(t);
1479 	proc_rele(p);
1480 	if (metadata == NULL) {
1481 		return false;
1482 	}
1483 
1484 	lck_mtx_lock(&async_reclamation_buffers_lock);
1485 	while (vmdr_metadata_has_pending_reclamation(metadata)) {
1486 		metadata->vdrm_waiters++;
1487 		lck_mtx_sleep_with_inheritor(&async_reclamation_buffers_lock,
1488 		    LCK_SLEEP_DEFAULT, &metadata->vdrm_async_list, vm_reclaim_thread,
1489 		    THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
1490 		metadata->vdrm_waiters--;
1491 	}
1492 	lck_mtx_unlock(&async_reclamation_buffers_lock);
1493 
1494 	vmdr_metadata_release(metadata);
1495 	return true;
1496 }
1497 
1498 #endif /* DEVELOPMENT || DEBUG */
1499