xref: /xnu-11215.81.4/libsyscall/mach/vm_reclaim.c (revision d4514f0bc1d3f944c22d92e68b646ac3fb40d452)
/*
 * Copyright (c) 2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#if defined(__LP64__)
/*
 * Userspace functions for manipulating the reclaim buffer.
 */
#include <inttypes.h>
#include <stdbool.h>
#include <stdlib.h>
#include <mach/vm_reclaim.h>
#include <mach/mach.h>
#include <mach/mach_vm.h>
#undef _mach_vm_user_
#include <mach/mach_vm_internal.h>
#include <mach/vm_map.h>
#include <mach/vm_statistics.h>
#include <os/atomic_private.h>
#include <mach/vm_page_size.h>

#pragma mark Utilities
/*
 * Trap if __condition is false. __op and __cause are unused here; they let
 * call sites name the operation and the failure cause for readability.
 */
#define _assert(__op, __condition, __cause) \
	do { \
	        if (!(__condition)) { \
	                __builtin_trap(); \
	        } \
	} while (0)

/*
 * Minimum drift (in bytes) between the locally tracked reclaimable VA and the
 * value last reported to the kernel before callers are asked to refresh the
 * kernel's accounting. Initialized to vm_page_size in
 * mach_vm_reclaim_ringbuffer_init().
 */
static uint64_t kAccountingThreshold;
/*
 * Add `size` (which may be negative) to the reclaimable VA tracked for this
 * ring buffer. Returns true when the tracked value has drifted more than
 * kAccountingThreshold from what was last reported to the kernel, in which
 * case the caller should call mach_vm_reclaim_update_kernel_accounting.
 */
static bool
update_accounting(mach_vm_reclaim_ringbuffer_v1_t ring_buffer, int64_t size)
{
	ring_buffer->va_in_buffer += size;
	if ((ring_buffer->va_in_buffer > ring_buffer->last_accounting_given_to_kernel &&
	    ring_buffer->va_in_buffer - ring_buffer->last_accounting_given_to_kernel > kAccountingThreshold) ||
	    (ring_buffer->last_accounting_given_to_kernel > ring_buffer->va_in_buffer &&
	    ring_buffer->last_accounting_given_to_kernel - ring_buffer->va_in_buffer > kAccountingThreshold)) {
		/*
		 * The caller should call mach_vm_reclaim_update_kernel_accounting.
		 * We store the value that they will give to the kernel here while we hold the lock.
		 * Technically it's out of sync with what the kernel has seen, but
		 * that will be rectified once the caller makes the mach_vm_reclaim_update_kernel_accounting call.
		 * If we forced this value to stay in sync with the kernel's value,
		 * all callers would keep calling mach_vm_reclaim_update_kernel_accounting until one of them
		 * finished, and we'd have to take the ringbuffer lock again in
		 * mach_vm_reclaim_update_kernel_accounting.
		 */
		ring_buffer->last_accounting_given_to_kernel = ring_buffer->va_in_buffer;
		return true;
	}
	return false;
}

static inline
mach_vm_reclaim_entry_v1_t
construct_entry(mach_vm_address_t start_addr, uint32_t size, mach_vm_reclaim_behavior_v1_t behavior)
{
	mach_vm_reclaim_entry_v1_t entry = {0ULL};
	entry.address = start_addr;
	entry.size = size;
	entry.behavior = behavior;
	return entry;
}

kern_return_t
mach_vm_reclaim_ringbuffer_init(mach_vm_reclaim_ringbuffer_v1_t ring_buffer)
{
	kAccountingThreshold = vm_page_size;
	kern_return_t kr;
	mach_vm_size_t buffer_size = vm_page_size;
	bzero(ring_buffer, sizeof(struct mach_vm_reclaim_ringbuffer_v1_s));
	size_t entries_size = buffer_size - \
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, entries);
	ring_buffer->buffer_len = entries_size / sizeof(mach_vm_reclaim_entry_v1_t);

	kr = mach_vm_deferred_reclamation_buffer_init(mach_task_self(),
	    (mach_vm_address_t *)&ring_buffer->buffer, buffer_size);
	return kr;
}
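
/*
 * Illustrative sketch (not part of the original file): a minimal example of
 * setting up the per-task ring buffer once at allocator startup. The names
 * `g_example_ringbuffer` and `example_reclaim_init` are hypothetical, and
 * this assumes mach_vm_reclaim_ringbuffer_v1_t is a pointer to
 * struct mach_vm_reclaim_ringbuffer_v1_s, as the init path above suggests.
 */
#if 0
static struct mach_vm_reclaim_ringbuffer_v1_s g_example_ringbuffer;

static kern_return_t
example_reclaim_init(void)
{
	/* Registers a one-page reclaim buffer for this task with the kernel. */
	return mach_vm_reclaim_ringbuffer_init(&g_example_ringbuffer);
}
#endif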

uint64_t
mach_vm_reclaim_mark_free(
	mach_vm_reclaim_ringbuffer_v1_t ring_buffer, mach_vm_address_t start_addr, uint32_t size,
	mach_vm_reclaim_behavior_v1_t behavior, bool *should_update_kernel_accounting)
{
	uint64_t idx = 0, head = 0;
	kern_return_t kr;
	mach_vm_reclaim_entry_v1_t entry = construct_entry(start_addr, size, behavior);
	mach_vm_reclaim_indices_v1_t *indices = &ring_buffer->buffer->indices;
	mach_vm_reclaim_entry_v1_t *buffer = ring_buffer->buffer->entries;
	mach_vm_size_t buffer_len = ring_buffer->buffer_len;
	*should_update_kernel_accounting = false;

	idx = os_atomic_load_wide(&indices->tail, relaxed);
	head = os_atomic_load_wide(&indices->head, relaxed);

	// This leaves one entry empty at the end of the buffer to differentiate an empty buffer from a full one
	while ((idx + 1) % buffer_len == head % buffer_len) {
		/*
		 * Buffer is full. Ask the kernel to reap it.
		 */
		kr = mach_vm_deferred_reclamation_buffer_synchronize(mach_task_self(), buffer_len - 1);
		_assert("mach_vm_reclaim_mark_free", kr == KERN_SUCCESS, kr);
		head = os_atomic_load_wide(&indices->head, relaxed);
		/* The kernel must have marched head forward by the requested number of entries. We hold the buffer lock, so tail cannot have changed. */
		_assert("mach_vm_reclaim_mark_free", (idx + 1) % buffer_len != head % buffer_len, head);
	}

	/*
	 * idx must be >= head and the buffer is not full, so it's not possible for the kernel to be acting on the entry at (tail + 1) % size.
	 * Thus we don't need to check the busy pointer here.
	 */
	buffer[idx % buffer_len] = entry;
	os_atomic_thread_fence(seq_cst); // tail increment cannot be seen before the entry is written to the buffer
	os_atomic_inc(&indices->tail, relaxed);
	*should_update_kernel_accounting = update_accounting(ring_buffer, size);

	return idx;
}
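
/*
 * Illustrative sketch (not part of the original file): how a hypothetical
 * deallocation path might defer-free a region through the ring buffer. The
 * names `example_defer_free`, `rb`, `addr`, and `len` are assumptions; the
 * behavior value is passed through rather than hard-coded.
 */
#if 0
static uint64_t
example_defer_free(mach_vm_reclaim_ringbuffer_v1_t rb, mach_vm_address_t addr,
    uint32_t len, mach_vm_reclaim_behavior_v1_t behavior)
{
	bool should_update = false;
	/* Publish the entry; the returned id can later be passed to mark_used. */
	uint64_t id = mach_vm_reclaim_mark_free(rb, addr, len, behavior,
	    &should_update);
	if (should_update) {
		/* Enough reclaimable VA has accumulated; refresh the kernel's view. */
		(void)mach_vm_reclaim_update_kernel_accounting(rb);
	}
	return id;
}
#endif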

kern_return_t
mach_vm_reclaim_mark_free_with_id(
	mach_vm_reclaim_ringbuffer_v1_t ring_buffer,
	mach_vm_address_t start_addr,
	uint32_t size,
	mach_vm_reclaim_behavior_v1_t behavior,
	uint64_t id,
	bool *should_update_kernel_accounting)
{
	mach_vm_reclaim_indices_v1_t *indices = &ring_buffer->buffer->indices;
	mach_vm_reclaim_entry_v1_t *buffer = ring_buffer->buffer->entries;
	mach_vm_size_t buffer_len = ring_buffer->buffer_len;
	uint64_t head = 0, busy = 0, original_tail = 0;

	if (id == VM_RECLAIM_INDEX_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	head = os_atomic_load_wide(&indices->head, relaxed);
	if (id < head) {
		/*
		 * This is just a fast path for the case where the buffer has wrapped.
		 * It's not strictly necessary because id must also be < busy.
		 * That's why we can use a relaxed load for the head ptr.
		 */
		return KERN_FAILURE;
	}
	/* Attempt to move tail to id */
	original_tail = os_atomic_load_wide(&indices->tail, relaxed);
	_assert("mach_vm_reclaim_mark_free_with_id",
	    id < original_tail, original_tail);

	os_atomic_store_wide(&indices->tail, id, relaxed);
	os_atomic_thread_fence(seq_cst); // Our write to tail must happen before our read of busy
	busy = os_atomic_load_wide(&indices->busy, relaxed);
	if (id < busy) {
		/* Kernel is acting on this entry. Undo. */
		os_atomic_store_wide(&indices->tail, original_tail, relaxed);
		return KERN_FAILURE;
	}

	mach_vm_reclaim_entry_v1_t *entry = &buffer[id % buffer_len];
	_assert("mach_vm_reclaim_mark_free_with_id",
	    entry->address == 0 && entry->size == 0, entry->address);

	/* Successfully moved tail back. Can now overwrite the entry */
	*entry = construct_entry(start_addr, size, behavior);

	/* Tail increment cannot be seen before the entry is set in the buffer */
	os_atomic_thread_fence(seq_cst);
	/* Reset tail. */
	os_atomic_store_wide(&indices->tail, original_tail, relaxed);

	*should_update_kernel_accounting = update_accounting(ring_buffer, size);

	return KERN_SUCCESS;
}

bool
mach_vm_reclaim_mark_used(
	mach_vm_reclaim_ringbuffer_v1_t ring_buffer, uint64_t id,
	mach_vm_address_t start_addr, uint32_t size)
{
	mach_vm_reclaim_indices_v1_t *indices = &ring_buffer->buffer->indices;
	mach_vm_reclaim_entry_v1_t *buffer = ring_buffer->buffer->entries;
	mach_vm_size_t buffer_len = ring_buffer->buffer_len;
	uint64_t head = 0, busy = 0, original_tail = 0;
	if (id == VM_RECLAIM_INDEX_NULL) {
		// entry was never put in the reclaim ring buffer, so it's safe to re-use.
		return true;
	}

	head = os_atomic_load_wide(&indices->head, relaxed);
	if (id < head) {
		/*
		 * This is just a fast path for the case where the buffer has wrapped.
		 * It's not strictly necessary because id must also be < busy.
		 * That's why we can use a relaxed load for the head ptr.
		 */
		return false;
	}

	/* Attempt to move tail to id */
	original_tail = os_atomic_load_wide(&indices->tail, relaxed);
	_assert("mach_vm_reclaim_mark_used", id < original_tail, original_tail);

	os_atomic_store_wide(&indices->tail, id, relaxed);
	os_atomic_thread_fence(seq_cst); // Our write to tail must happen before our read of busy
	busy = os_atomic_load_wide(&indices->busy, relaxed);
	if (id < busy) {
		/* Kernel is acting on this entry. Undo. */
		os_atomic_store_wide(&indices->tail, original_tail, relaxed);
		return false;
	}
	mach_vm_reclaim_entry_v1_t *entry = &buffer[id % buffer_len];
	_assert("mach_vm_reclaim_mark_used", entry->size == size && entry->address == start_addr, entry->address);

	/* Successfully moved tail back. Can now overwrite the entry */
	memset(entry, 0, sizeof(mach_vm_reclaim_entry_v1_t));
	os_atomic_thread_fence(seq_cst); // tail increment cannot be seen before the entry is cleared in the buffer
	/* Reset tail. */
	os_atomic_store_wide(&indices->tail, original_tail, relaxed);

	update_accounting(ring_buffer, -(int64_t) size);

	return true;
}
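
/*
 * Illustrative sketch (not part of the original file): a hypothetical caller
 * that wants to reuse a region it previously deferred-freed. If mark_used()
 * returns false the kernel has already begun (or finished) reclaiming the
 * range, so the caller must obtain fresh memory instead. All names here are
 * assumptions.
 */
#if 0
static bool
example_try_reuse(mach_vm_reclaim_ringbuffer_v1_t rb, uint64_t id,
    mach_vm_address_t addr, uint32_t len)
{
	if (mach_vm_reclaim_mark_used(rb, id, addr, len)) {
		/* Entry was pulled back out of the ring buffer; the region is still ours. */
		return true;
	}
	/* Lost the race with the kernel; treat the region as gone. */
	return false;
}
#endif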

kern_return_t
mach_vm_reclaim_update_kernel_accounting(const mach_vm_reclaim_ringbuffer_v1_t ring_buffer)
{
	return mach_vm_deferred_reclamation_buffer_update_reclaimable_bytes(current_task(),
	           ring_buffer->va_in_buffer);
}

bool
mach_vm_reclaim_is_available(const mach_vm_reclaim_ringbuffer_v1_t ring_buffer,
    uint64_t id)
{
	const mach_vm_reclaim_indices_v1_t *indices = &ring_buffer->buffer->indices;
	if (id == VM_RECLAIM_INDEX_NULL) {
		// entry was never put in the reclaim ring buffer, so it's safe to re-use.
		return true;
	}

	/*
	 * If the kernel has marched its busy pointer past this entry, consider it reclaimed.
	 * It's possible that the kernel will not reclaim this entry yet because we're racing with it on
	 * another thread via mach_vm_reclaim_mark_used.
	 */
	uint64_t busy = os_atomic_load_wide(&indices->busy, relaxed);

	return id >= busy;
}

bool
mach_vm_reclaim_is_reclaimed(const mach_vm_reclaim_ringbuffer_v1_t ring_buffer,
    uint64_t id)
{
	const mach_vm_reclaim_indices_v1_t *indices = &ring_buffer->buffer->indices;
	if (id == VM_RECLAIM_INDEX_NULL) {
		// entry was never put in reclaim ring buffer, consider it un-reclaimed
		return false;
	}

	/*
	 * If the kernel has marched its head pointer past this entry, consider it
	 * reclaimed.
	 */
	uint64_t head = os_atomic_load_wide(&indices->head, relaxed);

	return id < head;
}

kern_return_t
mach_vm_reclaim_synchronize(mach_vm_reclaim_ringbuffer_v1_t ringbuffer, mach_vm_size_t num_entries_to_reclaim)
{
	if (ringbuffer == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	return mach_vm_deferred_reclamation_buffer_synchronize(mach_task_self(), num_entries_to_reclaim);
}
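
/*
 * Illustrative sketch (not part of the original file): forcing a synchronous
 * reclaim, e.g. from a hypothetical memory-pressure handler. The entry count
 * used here is arbitrary and for illustration only.
 */
#if 0
static void
example_trim_under_pressure(mach_vm_reclaim_ringbuffer_v1_t rb)
{
	/* Ask the kernel to reclaim a batch of entries from the buffer now. */
	(void)mach_vm_reclaim_synchronize(rb, 32);
}
#endif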

#endif /* defined(__LP64__) */