/*
 * Copyright (c) 2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#if defined(__LP64__)
/*
 * Userspace functions for manipulating the reclaim buffer.
 */
#include <inttypes.h>
#include <stdbool.h>
#include <stdlib.h>
#include <mach/error.h>
#include <mach/kern_return.h>
#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <mach/vm_reclaim_private.h>
#undef _mach_vm_user_
#include <mach/mach_vm_internal.h>
#include <mach/vm_map.h>
#include <os/atomic_private.h>
#include <os/overflow.h>
#include <mach/vm_page_size.h>
#include <TargetConditionals.h>

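/*
 * Overview
 *
 * A reclaim ring is a single VA region shared with the kernel: a
 * mach_vm_reclaim_ring_s header (bookkeeping plus the head/busy/tail
 * indices) followed by an array of mach_vm_reclaim_entry_s slots.
 * Userspace appends entries at `tail`; the kernel consumes entries from
 * `head` and advances `busy` past the entry it is currently reclaiming.
 * Index updates use atomics and fences rather than a lock.
 *
 * Typical usage (a sketch only; error handling and the caller's own
 * bookkeeping of addresses and ids are omitted, and `addr`/`size` are
 * placeholder caller-side values):
 *
 *	mach_vm_reclaim_ring_t ring;
 *	mach_vm_reclaim_count_t cap = mach_vm_reclaim_round_capacity(1024);
 *	mach_vm_reclaim_ring_allocate(&ring, cap, cap);
 *
 *	bool update_accounting = false;
 *	mach_vm_reclaim_id_t id = VM_RECLAIM_ID_NULL;
 *	mach_vm_reclaim_try_enter(ring, addr, size, VM_RECLAIM_FREE,
 *	    &id, &update_accounting);
 *	if (update_accounting) {
 *		mach_vm_reclaim_update_kernel_accounting(ring);
 *	}
 */
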
#pragma mark Utilities
#define _assert(__op, __condition, __cause) \
	do { \
	        if (!(__condition)) { \
	                __builtin_trap(); \
	        } \
	} while (false)
#define _abort(__op, __cause) \
	do { \
	        __builtin_trap(); \
	} while(false)

_Static_assert(VM_RECLAIM_MAX_CAPACITY <= UINT32_MAX, "Max capacity must fit in mach_vm_reclaim_count_t");

static uint64_t kAccountingThreshold;

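/*
 * Track the net amount of VA sitting in the ring. Returns true when the
 * total has drifted more than kAccountingThreshold bytes from the value
 * last reported to the kernel, in which case the caller should invoke
 * mach_vm_reclaim_update_kernel_accounting().
 */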
static bool
update_accounting(mach_vm_reclaim_ring_t ring_buffer, int64_t size)
{
	ring_buffer->va_in_buffer += size;
	if ((ring_buffer->va_in_buffer > ring_buffer->last_accounting_given_to_kernel &&
	    ring_buffer->va_in_buffer - ring_buffer->last_accounting_given_to_kernel > kAccountingThreshold) ||
	    (ring_buffer->last_accounting_given_to_kernel > ring_buffer->va_in_buffer &&
	    ring_buffer->last_accounting_given_to_kernel - ring_buffer->va_in_buffer > kAccountingThreshold)) {
		/*
		 * The caller should call mach_vm_reclaim_update_kernel_accounting.
		 * We store the value that they will give to the kernel here while we hold the lock.
		 * Technically it's out of sync with what the kernel has seen, but
		 * that will be rectified once the caller makes the mach_vm_reclaim_update_kernel_accounting call.
		 * If we forced this value to be in sync with the kernel's value
		 * all callers would start calling mach_vm_reclaim_update_kernel_accounting until one of them
		 * finishes & we'd have to take the ringbuffer lock again in
		 * mach_vm_reclaim_update_kernel_accounting.
		 */
		ring_buffer->last_accounting_given_to_kernel = ring_buffer->va_in_buffer;
		return true;
	}
	return false;
}

static inline struct mach_vm_reclaim_entry_s
construct_entry(
	mach_vm_address_t start_addr,
	uint32_t size,
	mach_vm_reclaim_action_t behavior)
{
	struct mach_vm_reclaim_entry_s entry = {0ULL};
	entry.address = start_addr;
	entry.size = size;
	entry.behavior = behavior;
	return entry;
}

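/*
 * A ring buffer occupies whole pages: a mach_vm_reclaim_ring_s header
 * followed by as many entries as fit. These helpers convert between an
 * entry count and the page-rounded buffer size that backs it.
 */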
static uint64_t
max_buffer_len_for_size(mach_vm_size_t size)
{
	mach_vm_size_t entries_size = size - offsetof(struct mach_vm_reclaim_ring_s, entries);
	return entries_size / sizeof(struct mach_vm_reclaim_entry_s);
}

static mach_vm_reclaim_count_t
round_buffer_len(mach_vm_reclaim_count_t count)
{
	mach_vm_reclaim_count_t rounded_count;
	mach_vm_size_t buffer_size =
	    offsetof(struct mach_vm_reclaim_ring_s, entries) +
	    (count * sizeof(struct mach_vm_reclaim_entry_s));
	mach_vm_size_t rounded_size = mach_vm_round_page(buffer_size);
	uint64_t num_entries = max_buffer_len_for_size(rounded_size);
	if (os_convert_overflow(num_entries, &rounded_count)) {
		return UINT32_MAX;
	}
	return rounded_count;
}

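/*
 * Ask the kernel to allocate and map the shared reclaim ring for this task,
 * then initialize the userspace-owned bookkeeping fields. Returns
 * VM_RECLAIM_INVALID_ARGUMENT / VM_RECLAIM_INVALID_CAPACITY for bad
 * parameters, otherwise the kernel's return code.
 */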
mach_vm_reclaim_error_t
mach_vm_reclaim_ring_allocate(
	mach_vm_reclaim_ring_t *ring_out,
	mach_vm_reclaim_count_t initial_capacity,
	mach_vm_reclaim_count_t max_capacity)
{
	kAccountingThreshold = vm_page_size;
	kern_return_t kr;
	mach_vm_address_t vm_addr = 0;
	if (ring_out == NULL || max_capacity < initial_capacity ||
	    initial_capacity == 0 || max_capacity == 0) {
		return VM_RECLAIM_INVALID_ARGUMENT;
	}
	if (max_capacity > VM_RECLAIM_MAX_CAPACITY) {
		return VM_RECLAIM_INVALID_CAPACITY;
	}

	*ring_out = NULL;
	kr = mach_vm_deferred_reclamation_buffer_allocate(mach_task_self(),
	    &vm_addr, initial_capacity, max_capacity);
	if (kr == ERR_SUCCESS) {
		mach_vm_reclaim_ring_t ringbuffer =
		    (mach_vm_reclaim_ring_t)vm_addr;

		ringbuffer->va_in_buffer = 0;
		ringbuffer->last_accounting_given_to_kernel = 0;
		ringbuffer->len = initial_capacity;
		ringbuffer->max_len = max_capacity;
		*ring_out = ringbuffer;
	}
	return kr;
}

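/*
 * Ask the kernel to resize the ring to `capacity` entries. `capacity` may
 * not exceed the max_capacity chosen at allocation time.
 */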
mach_vm_reclaim_error_t
mach_vm_reclaim_ring_resize(
	mach_vm_reclaim_ring_t ring,
	mach_vm_reclaim_count_t capacity)
{
	kern_return_t kr;
	if (ring == NULL) {
		return VM_RECLAIM_INVALID_RING;
	}
	if (capacity == 0 || capacity > ring->max_len) {
		return VM_RECLAIM_INVALID_CAPACITY;
	}
	kr = mach_vm_deferred_reclamation_buffer_resize(mach_task_self(),
	    capacity);
	if (kr == KERN_SUCCESS) {
		ring->len = capacity;
	}
	return kr;
}

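/*
 * Round a requested capacity up to the number of entries that fit in the
 * page-rounded buffer, clamped to VM_RECLAIM_MAX_CAPACITY.
 */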
mach_vm_reclaim_count_t
mach_vm_reclaim_round_capacity(
	mach_vm_reclaim_count_t count)
{
	if (count > VM_RECLAIM_MAX_CAPACITY) {
		return VM_RECLAIM_MAX_CAPACITY;
	}
	return round_buffer_len(count);
}

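/*
 * Try to place [region_start, region_start + region_size) in the ring for
 * deferred reclamation with the given action. On entry, *id must be
 * VM_RECLAIM_ID_NULL to request a fresh slot at the tail, or a previously
 * returned id to re-enter that slot. On return, *id holds the entry's id,
 * or VM_RECLAIM_ID_NULL if the region could not be entered (ring full, or
 * the kernel has reclaimed / is reclaiming the requested slot).
 * *should_update_kernel_accounting is set when the caller should follow up
 * with mach_vm_reclaim_update_kernel_accounting(); that pointer is not
 * validated and must be non-NULL.
 */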
mach_vm_reclaim_error_t
mach_vm_reclaim_try_enter(
	mach_vm_reclaim_ring_t ring,
	mach_vm_address_t region_start,
	mach_vm_size_t region_size,
	mach_vm_reclaim_action_t action,
	mach_vm_reclaim_id_t *id,
	bool *should_update_kernel_accounting)
{
	mach_vm_reclaim_id_t tail = 0, head = 0, original_tail = 0, busy = 0;

	if (ring == NULL) {
		return VM_RECLAIM_INVALID_RING;
	}
	if (id == NULL) {
		return VM_RECLAIM_INVALID_ID;
	}

	mach_vm_reclaim_indices_t indices = &ring->indices;
	mach_vm_reclaim_entry_t entries = ring->entries;
	uint64_t buffer_len = (uint64_t)ring->len;
	*should_update_kernel_accounting = false;

	uint32_t size32;
	if (os_convert_overflow(region_size, &size32)) {
		/* regions must fit in 32-bits */
		*id = VM_RECLAIM_ID_NULL;
		return VM_RECLAIM_INVALID_REGION_SIZE;
	}

	mach_vm_reclaim_id_t requested_id = *id;
	*id = VM_RECLAIM_ID_NULL;

	if (requested_id == VM_RECLAIM_ID_NULL) {
		tail = os_atomic_load_wide(&indices->tail, relaxed);
		head = os_atomic_load_wide(&indices->head, relaxed);

		if (tail % buffer_len == head % buffer_len && tail > head) {
			/* Buffer is full */
			return VM_RECLAIM_SUCCESS;
		}

		/*
		 * tail must be >= head and the buffer is not full, so it's not
		 * possible for the kernel to be acting on the slot at tail % buffer_len.
		 * Thus we don't need to check the busy pointer here.
		 */
		struct mach_vm_reclaim_entry_s entry = construct_entry(region_start, size32, action);
		entries[tail % buffer_len] = entry;
		os_atomic_thread_fence(seq_cst); // tail increment can not be seen before the entry is set in the buffer
		os_atomic_inc(&indices->tail, relaxed);
		*id = tail;
	} else {
		head = os_atomic_load_wide(&indices->head, relaxed);
		if (requested_id < head) {
			/*
			 * This is just a fast path for the case where the buffer has wrapped.
			 * It's not strictly necessary because idx must also be < busy.
			 * That's why we can use a relaxed load for the head ptr.
			 */
			return VM_RECLAIM_SUCCESS;
		}
		/* Attempt to move tail to idx */
		original_tail = os_atomic_load_wide(&indices->tail, relaxed);
		_assert("mach_vm_reclaim_mark_free_with_id",
		    requested_id < original_tail, original_tail);

		os_atomic_store_wide(&indices->tail, requested_id, relaxed);
		os_atomic_thread_fence(seq_cst); // Our write to tail must happen before our read of busy
		busy = os_atomic_load_wide(&indices->busy, relaxed);
		if (requested_id < busy) {
			/* Kernel is acting on this entry. Undo. */
			os_atomic_store_wide(&indices->tail, original_tail, relaxed);
			return VM_RECLAIM_SUCCESS;
		}

		mach_vm_reclaim_entry_t entry = &entries[requested_id % buffer_len];
		_assert("mach_vm_reclaim_try_enter",
		    entry->address == 0 && entry->size == 0, entry->address);

		/* Successfully moved tail back. Can now overwrite the entry */
		*entry = construct_entry(region_start, size32, action);

		/* Tail restore can not be seen before the entry is set in the buffer */
		os_atomic_thread_fence(seq_cst);
		/* Reset tail. */
		os_atomic_store_wide(&indices->tail, original_tail, relaxed);
		*id = requested_id;
	}
	*should_update_kernel_accounting = update_accounting(ring, region_size);
	return VM_RECLAIM_SUCCESS;
}

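/*
 * Try to take the entry `id` back out of the ring so the caller can reuse
 * the region. *state reports the outcome: VM_RECLAIM_UNRECLAIMED if the
 * entry was removed before the kernel touched it, VM_RECLAIM_BUSY if the
 * kernel is currently reclaiming it, or VM_RECLAIM_FREED /
 * VM_RECLAIM_DEALLOCATED if it has already been reclaimed. The region and
 * behavior must match what was passed to mach_vm_reclaim_try_enter().
 */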
279 mach_vm_reclaim_error_t
mach_vm_reclaim_try_cancel(mach_vm_reclaim_ring_t ring_buffer,mach_vm_reclaim_id_t id,mach_vm_address_t region_start,mach_vm_size_t region_size,mach_vm_reclaim_action_t behavior,mach_vm_reclaim_state_t * state,bool * should_update_kernel_accounting)280 mach_vm_reclaim_try_cancel(
281 	mach_vm_reclaim_ring_t ring_buffer,
282 	mach_vm_reclaim_id_t id,
283 	mach_vm_address_t region_start,
284 	mach_vm_size_t region_size,
285 	mach_vm_reclaim_action_t behavior,
286 	mach_vm_reclaim_state_t *state,
287 	bool *should_update_kernel_accounting)
288 {
289 	mach_vm_reclaim_indices_t indices = &ring_buffer->indices;
290 	mach_vm_reclaim_entry_t entries = ring_buffer->entries;
291 	uint64_t buffer_len = (uint64_t)ring_buffer->len;
292 	uint64_t head = 0, busy = 0, original_tail = 0;
293 
294 	if (ring_buffer == NULL) {
295 		return VM_RECLAIM_INVALID_RING;
296 	}
297 	if (id == VM_RECLAIM_ID_NULL) {
298 		/* The entry was never put in the reclaim ring buffer */
299 		return VM_RECLAIM_INVALID_ID;
300 	}
301 	if (state == NULL || should_update_kernel_accounting == NULL) {
302 		return VM_RECLAIM_INVALID_ARGUMENT;
303 	}
304 
305 	*should_update_kernel_accounting = false;
306 
307 	uint32_t size32;
308 	if (os_convert_overflow(region_size, &size32)) {
309 		/* Regions must fit in 32-bits */
310 		return VM_RECLAIM_INVALID_REGION_SIZE;
311 	}
312 
313 	head = os_atomic_load_wide(&indices->head, relaxed);
314 	if (id < head) {
315 		/*
316 		 * This is just a fast path for the case where the buffer has wrapped.
317 		 * It's not strictly necessary beacuse idx must also be < busy.
318 		 * That's why we can use a relaxed load for the head ptr.
319 		 */
320 		switch (behavior) {
321 		case VM_RECLAIM_DEALLOCATE:
322 			/* Entry has been deallocated and is not safe to re-use */
323 			*state = VM_RECLAIM_DEALLOCATED;
324 			break;
325 		case VM_RECLAIM_FREE:
326 			/* Entry has been freed, the virtual region is now safe to re-use */
327 			*state = VM_RECLAIM_FREED;
328 			break;
329 		default:
330 			return VM_RECLAIM_INVALID_ARGUMENT;
331 		}
332 		return VM_RECLAIM_SUCCESS;
333 	}
334 
335 	/* Attempt to move tail to idx */
336 	original_tail = os_atomic_load_wide(&indices->tail, relaxed);
337 	_assert("mach_vm_reclaim_mark_used", id < original_tail, original_tail);
338 
339 	os_atomic_store_wide(&indices->tail, id, relaxed);
340 	/* Our write to tail must happen before our read of busy */
341 	os_atomic_thread_fence(seq_cst);
342 	busy = os_atomic_load_wide(&indices->busy, relaxed);
343 	if (id < busy) {
344 		/*
345 		 * This entry is in the process of being reclaimed. It is
346 		 * never safe to re-use while in this state.
347 		 */
348 		os_atomic_store_wide(&indices->tail, original_tail, relaxed);
349 		*state = VM_RECLAIM_BUSY;
350 		return VM_RECLAIM_SUCCESS;
351 	}
352 	mach_vm_reclaim_entry_t entry = &entries[id % buffer_len];
353 	_assert("mach_vm_reclaim_mark_used", entry->size == region_size, entry->size);
354 	_assert("mach_vm_reclaim_mark_used", entry->address == region_start, entry->address);
355 	_assert("mach_vm_reclaim_mark_used", entry->behavior == behavior, entry->behavior);
356 
357 	/* Sucessfully moved tail back. Can now overwrite the entry */
358 	memset(entry, 0, sizeof(struct mach_vm_reclaim_entry_s));
359 	/* tail increment can not be seen before the entry is cleared in the buffer */
360 	os_atomic_thread_fence(seq_cst);
361 	/* Reset tail. */
362 	os_atomic_store_wide(&indices->tail, original_tail, relaxed);
363 
364 	*should_update_kernel_accounting = update_accounting(ring_buffer, -(int64_t)region_size);
365 	*state = VM_RECLAIM_UNRECLAIMED;
366 	return VM_RECLAIM_SUCCESS;
367 }
368 
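/*
 * Report the current state of entry `id` without modifying the ring:
 * already reclaimed (freed or deallocated, depending on `action`), busy,
 * or still unreclaimed.
 */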
mach_vm_reclaim_error_t
mach_vm_reclaim_query_state(
	mach_vm_reclaim_ring_t ring,
	mach_vm_reclaim_id_t id,
	mach_vm_reclaim_action_t action,
	mach_vm_reclaim_state_t *state)
{
	if (ring == NULL) {
		return VM_RECLAIM_INVALID_RING;
	}
	if (id == VM_RECLAIM_ID_NULL) {
		return VM_RECLAIM_INVALID_ID;
	}
	mach_vm_reclaim_indices_t indices = &ring->indices;

	mach_vm_reclaim_id_t head = os_atomic_load_wide(&indices->head, relaxed);
	if (id < head) {
		switch (action) {
		case VM_RECLAIM_FREE:
			*state = VM_RECLAIM_FREED;
			break;
		case VM_RECLAIM_DEALLOCATE:
			*state = VM_RECLAIM_DEALLOCATED;
			break;
		default:
			return VM_RECLAIM_INVALID_ARGUMENT;
		}
		return VM_RECLAIM_SUCCESS;
	}

	mach_vm_reclaim_id_t busy = os_atomic_load_wide(&indices->busy, relaxed);
	if (id < busy) {
		*state = VM_RECLAIM_BUSY;
	} else {
		*state = VM_RECLAIM_UNRECLAIMED;
	}
	return VM_RECLAIM_SUCCESS;
}

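/*
 * Report the ring's current reclaimable-byte total to the kernel. Callers
 * are expected to invoke this whenever an enter/cancel operation sets
 * *should_update_kernel_accounting. `ring` must be a valid ring; it is not
 * NULL-checked here.
 */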
mach_vm_reclaim_error_t
mach_vm_reclaim_update_kernel_accounting(const mach_vm_reclaim_ring_t ring)
{
	return mach_vm_deferred_reclamation_buffer_update_reclaimable_bytes(current_task(),
	           ring->va_in_buffer);
}

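/*
 * Returns true if a region in the given state may be safely reused by the
 * caller (i.e. it was freed by the kernel or was never reclaimed).
 */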
bool
mach_vm_reclaim_is_reusable(
	mach_vm_reclaim_state_t state)
{
	return state == VM_RECLAIM_FREED || state == VM_RECLAIM_UNRECLAIMED;
}

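/*
 * Return the ring's current capacity (in entries) via *capacity.
 */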
mach_vm_reclaim_error_t
mach_vm_reclaim_ring_capacity(mach_vm_reclaim_ring_t ring, mach_vm_reclaim_count_t *capacity)
{
	if (ring == NULL) {
		return VM_RECLAIM_INVALID_RING;
	}
	if (capacity == NULL) {
		return VM_RECLAIM_INVALID_ARGUMENT;
	}
	*capacity = ring->len;
	return VM_RECLAIM_SUCCESS;
}

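/*
 * Ask the kernel to reclaim `num_entries_to_reclaim` entries from the ring
 * now, rather than waiting for memory pressure.
 */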
mach_vm_reclaim_error_t
mach_vm_reclaim_ring_flush(
	mach_vm_reclaim_ring_t ring_buffer,
	mach_vm_reclaim_count_t num_entries_to_reclaim)
{
	if (ring_buffer == NULL) {
		return VM_RECLAIM_INVALID_RING;
	}
	if (num_entries_to_reclaim == 0) {
		return VM_RECLAIM_INVALID_ARGUMENT;
	}

	return mach_vm_deferred_reclamation_buffer_flush(mach_task_self(), num_entries_to_reclaim);
}

#endif /* defined(__LP64__) */