xref: /xnu-12377.1.9/osfmk/vm/vm_compressor_backing_store.c (revision f6217f891ac0bb64f3d375211650a4c1ff8ca1ea)
1 /*
2  * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include "vm_compressor_backing_store_internal.h"
30 #include <vm/vm_pageout_xnu.h>
31 #include <vm/vm_protos_internal.h>
32 #include <vm/vm_kern_xnu.h>
33 #include <vm/vm_map_xnu.h>
34 #include <vm/vm_compressor_internal.h>
35 #include <vm/vm_iokit.h>
36 #include <vm/vm_map_internal.h>
37 
38 #include <IOKit/IOHibernatePrivate.h>
39 #include <kern/policy_internal.h>
40 #include <sys/kern_memorystatus_xnu.h>
41 
/*
 * Lock group and mutex for the swap-file bookkeeping in this file. The mutex
 * is held, for example, while looking up entries on swf_global_queue and
 * while updating the force-defrag / force-reclaim and hibernate flags.
 */
42 LCK_GRP_DECLARE(vm_swap_data_lock_grp, "vm_swap_data");
43 LCK_MTX_DECLARE(vm_swap_data_lock, &vm_swap_data_lock_grp);
44 
45 #if defined(XNU_TARGET_OS_OSX)
46 /*
47  * launchd explicitly turns ON swap later during boot on macOS devices.
48  */
49 boolean_t       compressor_store_stop_compaction = TRUE;
50 #else
51 boolean_t       compressor_store_stop_compaction = FALSE;
52 #endif
53 
/*
 * The addresses of these two variables are used as wait/wakeup events:
 * vm_swapfile_create_thread() sleeps on &vm_swapfile_create_needed and
 * vm_swap_consider_defragmenting() wakes &vm_swapfile_gc_needed.
 */
54 boolean_t       vm_swapfile_create_needed = FALSE;
55 boolean_t       vm_swapfile_gc_needed = FALSE;
56 
/* Compared against THROTTLE_LEVEL_COMPRESSOR_TIER0 by VM_SWAP_BUSY(). */
57 int             vm_swapper_throttle = -1;
/* thread_id of the "VM_swapout" thread; recorded by vm_compressor_swap_init(). */
58 uint64_t        vm_swapout_thread_id;
59 
60 uint64_t        vm_swap_put_failures = 0; /* Likely failed I/O. Data is still in memory. */
61 uint64_t        vm_swap_get_failures = 0; /* Fatal */
62 uint64_t        vm_swap_put_failures_no_swap_file = 0; /* Possibly not fatal because we might just need a new swapfile. */
/* Maximum number of swap files; computed by vm_compressor_swap_init_swap_file_limit(). */
63 int             vm_num_swap_files_config = 0;
64 int             vm_num_swap_files = 0;
65 int             vm_num_pinned_swap_files = 0;
/* Stays 0 until the swap volume capacity is known (see vm_compressor_swap_init_swap_file_limit()). */
66 uint64_t        vm_swap_volume_capacity = 0;
67 int             vm_swapout_thread_processed_segments = 0;
68 int             vm_swapout_thread_awakened = 0;
69 bool            vm_swapout_thread_running = FALSE;
70 _Atomic bool    vm_swapout_wake_pending = false;
/* Bumped on every entry to / set while inside the corresponding thread function. */
71 int             vm_swapfile_create_thread_awakened = 0;
72 int             vm_swapfile_create_thread_running = 0;
73 int             vm_swapfile_gc_thread_awakened = 0;
74 int             vm_swapfile_gc_thread_running = 0;
75 
76 int64_t         vm_swappin_avail = 0;
77 boolean_t       vm_swappin_enabled = FALSE;
78 unsigned int    vm_swapfile_total_segs_alloced = 0;
79 unsigned int    vm_swapfile_total_segs_alloced_max = 0;
80 unsigned int    vm_swapfile_total_segs_used = 0;
81 unsigned int    vm_swapfile_total_segs_used_max = 0;
82 
/*
 * Base path of the swap files. vm_compaction_swapper_do_init() appends a
 * decimal index ("%s%d") to it to form the name of an individual file.
 */
83 char            swapfilename[MAX_SWAPFILENAME_LEN + 1] = SWAP_FILE_NAME;
84 
85 extern vm_map_t compressor_map;
86 extern uint32_t c_seg_bufsize, c_seg_allocsize, c_seg_off_limit;
87 
/* NOTE(review): presumably these are the bits stored in swapfile.swp_flags - confirm. */
88 #define SWAP_READY      0x1     /* Swap file is ready to be used */
89 #define SWAP_RECLAIM    0x2     /* Swap file is marked to be reclaimed */
90 #define SWAP_WANTED     0x4     /* Swap file has waiters */
91 #define SWAP_REUSE      0x8     /* Swap file is on the Q and has a name. Reuse after init-ing.*/
92 #define SWAP_PINNED     0x10    /* Swap file is pinned (FusionDrive) */
93 
94 
/*
 * Per-swap-file state.
 *
 * vm_swapfile_for_handle() walks swf_global_queue by casting queue entries
 * directly to struct swapfile * and stepping through swp_queue, so that code
 * relies on swp_queue being the first member.
 */
95 struct swapfile {
96 	queue_head_t            swp_queue;      /* list of swap files */
97 	char                    *swp_path;      /* saved pathname of swap file */
98 	struct vnode            *swp_vp;        /* backing vnode */
99 	uint64_t                swp_size;       /* size of this swap file */
100 	uint8_t                 *swp_bitmap;    /* bitmap showing the alloced/freed slots in the swap file */
101 	unsigned int            swp_pathlen;    /* length of pathname */
102 	unsigned int            swp_nsegs;      /* #segments we can use */
103 	unsigned int            swp_nseginuse;  /* #segments in use */
104 	unsigned int            swp_index;      /* index of this swap file */
105 	unsigned int            swp_flags;      /* state of swap file */
106 	unsigned int            swp_free_hint;  /* offset of 1st free chunk */
107 	unsigned int            swp_io_count;   /* count of outstanding I/Os */
108 	c_segment_t             *swp_csegs;     /* back pointers to the c_segments. Used during swap reclaim. */
109 
	/* NOTE(review): presumably the pending (not yet issued) trims for this file - confirm. */
110 	struct trim_list        *swp_delayed_trim_list_head;
	/* compared against VM_SWAPFILE_DELAYED_TRIM_MAX by VM_SWAP_SHOULD_TRIM() */
111 	unsigned int            swp_delayed_trim_count;
112 };
113 
/*
 * Queue of struct swapfile entries. Initialized by vm_compressor_swap_init()
 * and searched by vm_swapfile_for_handle().
 */
114 queue_head_t    swf_global_queue;
115 boolean_t       swp_trim_supported = FALSE;
116 
117 extern uint64_t         dont_trim_until_ts;
/*
 * mach_absolute_time() values stored by vm_swapfile_create_thread() after a
 * failed / successful vm_swap_create_file(). The "failed" timestamp is what
 * vm_swapfile_should_create() measures its retry delay from.
 */
118 uint64_t                vm_swapfile_last_failed_to_create_ts = 0;
119 uint64_t                vm_swapfile_last_successful_create_ts = 0;
120 static bool             vm_swapfile_can_be_created = false;
121 static bool             delayed_trim_handling_in_progress = false;
122 
/*
 * Set by hibernate_pin_swap(). While TRUE the swapfile create thread leaves
 * its work loop; the variable's address also serves as the event
 * hibernate_pin_swap() waits on.
 */
123 boolean_t               hibernate_in_progress_with_pinned_swap = FALSE;
124 
125 static void vm_swapout_thread_throttle_adjust(void);
126 static void vm_swap_free_now(struct swapfile *swf, uint64_t f_offset);
127 static void vm_swapfile_create_thread(void);
128 static void vm_swapfile_gc_thread(void);
129 static void vm_swap_defragment(void);
130 static void vm_swap_handle_delayed_trims(boolean_t);
131 static void vm_swap_do_delayed_trim(struct swapfile *);
132 static void vm_swap_wait_on_trim_handling_in_progress(void);
133 static void vm_swapout_finish(c_segment_t c_seg, uint64_t f_offset, uint32_t size, kern_return_t kr);
134 
135 extern int vnode_getwithref(struct vnode* vp);
136 
/*
 * Set under vm_swap_data_lock by vm_swap_consider_defragmenting(); read by
 * VM_SWAP_SHOULD_DEFRAGMENT(), VM_SWAP_SHOULD_RECLAIM() and
 * VM_SWAP_SHOULD_ABORT_RECLAIM().
 */
137 boolean_t vm_swap_force_defrag = FALSE, vm_swap_force_reclaim = FALSE;
138 
#if !XNU_TARGET_OS_OSX

/*
 * For CONFIG_FREEZE, we scale the c_segments_limit based on the
 * number of swapfiles allowed. That increases wired memory overhead.
 * So we want to keep the max swapfiles same on both DEV/RELEASE so
 * that the memory overhead is similar for performance comparisons.
 */
#define VM_MAX_SWAP_FILE_NUM            5
#if defined(__arm64__) && defined(ARM_LARGE_MEMORY)
#define VM_MAX_SWAP_FILE_SWAP_ENABLED_NUM (64ULL * (1ULL << 30) / MAX_SWAP_FILE_SIZE)
#define VM_MIN_SWAP_FILE_SWAP_ENABLED_NUM (16ULL * (1ULL << 30) / MAX_SWAP_FILE_SIZE)
#else /* defined(__arm64__) && defined(ARM_LARGE_MEMORY) */
/*
 * We reserve compressor pool VA at boot for the max # of swap files. If someone
 * has enabled app swap but we're not an arm large memory device we can't hog
 * all of the VA so we only go up to 4GB.
 */
#define VM_MAX_SWAP_FILE_SWAP_ENABLED_NUM (4ULL * (1ULL << 30) / MAX_SWAP_FILE_SIZE)
#define VM_MIN_SWAP_FILE_SWAP_ENABLED_NUM (4ULL * (1ULL << 30) / MAX_SWAP_FILE_SIZE)
#endif /* defined(__arm64__) && defined(ARM_LARGE_MEMORY) */
/* Volume capacity (128GB) used as the unit when scaling the swap file count. */
#define VM_SWAP_MIN_VOLUME_CAPACITY (128ULL * (1ULL << 30))

/* Number of delayed trims on one swap file at which VM_SWAP_SHOULD_TRIM() fires. */
#define VM_SWAPFILE_DELAYED_TRIM_MAX    4

/* 1 when defrag is forced or the sparse segment count exceeds 1/16 of the used segments. */
#define VM_SWAP_SHOULD_DEFRAGMENT()     (((vm_swap_force_defrag == TRUE) || (c_swappedout_sparse_count > (vm_swapfile_total_segs_used / 16))) ? 1 : 0)
/* Pinning is never requested in this configuration. */
#define VM_SWAP_SHOULD_PIN(_size)       FALSE
/* 1 when the swap file has accumulated at least VM_SWAPFILE_DELAYED_TRIM_MAX delayed trims. */
#define VM_SWAP_SHOULD_TRIM(swf)        (((swf)->swp_delayed_trim_count >= VM_SWAPFILE_DELAYED_TRIM_MAX) ? 1 : 0)

#else /* !XNU_TARGET_OS_OSX */

#define VM_MAX_SWAP_FILE_NUM            100
/* Number of delayed trims on one swap file at which VM_SWAP_SHOULD_TRIM() fires. */
#define VM_SWAPFILE_DELAYED_TRIM_MAX    128

/* 1 when defrag is forced or the sparse segment count exceeds 1/4 of the used segments. */
#define VM_SWAP_SHOULD_DEFRAGMENT()     (((vm_swap_force_defrag == TRUE) || (c_swappedout_sparse_count > (vm_swapfile_total_segs_used / 4))) ? 1 : 0)
/* True while vm_swappin_avail is positive and at least _size. */
#define VM_SWAP_SHOULD_PIN(_size)       (vm_swappin_avail > 0 && vm_swappin_avail >= (int64_t)(_size))
/* 1 when the swap file has accumulated at least VM_SWAPFILE_DELAYED_TRIM_MAX delayed trims. */
#define VM_SWAP_SHOULD_TRIM(swf)        (((swf)->swp_delayed_trim_count >= VM_SWAPFILE_DELAYED_TRIM_MAX) ? 1 : 0)

#endif /* !XNU_TARGET_OS_OSX */

/*
 * Reclaim starts when it is forced or when the number of allocated-but-unused
 * swap segments reaches swapfile_reclaim_threshold_segs, and is abandoned
 * (unless forced) once that number is down to swapfile_reclam_minimum_segs.
 */
#define VM_SWAP_SHOULD_RECLAIM()        (((vm_swap_force_reclaim == TRUE) || ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) >= swapfile_reclaim_threshold_segs)) ? 1 : 0)
#define VM_SWAP_SHOULD_ABORT_RECLAIM()  (((vm_swap_force_reclaim == FALSE) && ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) <= swapfile_reclam_minimum_segs)) ? 1 : 0)

/*
 * 1 when the sum of the early/regular/late swapout counts is non-zero and the
 * swapper is running at THROTTLE_LEVEL_COMPRESSOR_TIER0.
 */
#define VM_SWAP_BUSY()  (((c_early_swapout_count + c_regular_swapout_count + c_late_swapout_count) && (vm_swapper_throttle == THROTTLE_LEVEL_COMPRESSOR_TIER0)) ? 1 : 0)
183 
184 
185 #if CHECKSUM_THE_SWAP
186 extern unsigned int hash_string(char *cp, int len);
187 #endif
188 
189 #if RECORD_THE_COMPRESSED_DATA
/*
 * State for the optional compressed-data record file.
 * Note that c_compressed_record_init() sets the "init done" flag after
 * attempting the open, without checking whether the open succeeded.
 */
190 boolean_t       c_compressed_record_init_done = FALSE;  /* was the record file opened? */
191 int             c_compressed_record_write_error = 0;    /* non-zero stops all further record writes */
192 struct vnode    *c_compressed_record_vp = NULL;         /* the file opened for record write */
193 uint64_t        c_compressed_record_file_offset = 0;    /* next write offset */
194 void    c_compressed_record_init(void);
195 void    c_compressed_record_write(char *, int);
196 #endif
197 
198 extern void                     vm_pageout_io_throttle(void);
199 
200 static struct swapfile *vm_swapfile_for_handle(uint64_t);
201 
202 /*
203  * Called with the vm_swap_data_lock held.
204  */
205 
206 static struct swapfile *
vm_swapfile_for_handle(uint64_t f_offset)207 vm_swapfile_for_handle(uint64_t f_offset)
208 {
209 	uint64_t                file_offset = 0;
210 	unsigned int            swapfile_index = 0;
211 	struct swapfile*        swf = NULL;
212 
213 	file_offset = (f_offset & SWAP_SLOT_MASK);
214 	swapfile_index = (f_offset >> SWAP_DEVICE_SHIFT);
215 
216 	swf = (struct swapfile*) queue_first(&swf_global_queue);
217 
218 	while (queue_end(&swf_global_queue, (queue_entry_t)swf) == FALSE) {
219 		if (swapfile_index == swf->swp_index) {
220 			break;
221 		}
222 
223 		swf = (struct swapfile*) queue_next(&swf->swp_queue);
224 	}
225 
226 	if (queue_end(&swf_global_queue, (queue_entry_t) swf)) {
227 		swf = NULL;
228 	}
229 
230 	return swf;
231 }
232 
233 #if ENCRYPTED_SWAP
234 
235 #include <libkern/crypto/aesxts.h>
236 
237 extern int cc_rand_generate(void *, size_t);     /* from <libkern/crypto/rand.h> */
238 
/* Set to TRUE by swap_crypt_initialize() once the XTS context below is usable. */
239 boolean_t       swap_crypt_initialized;
240 void            swap_crypt_initialize(void);
241 
/* AES-XTS context shared by vm_swap_encrypt() and vm_swap_decrypt(). */
242 symmetric_xts   xts_modectx;
243 uint32_t        swap_crypt_key1[8];   /* big enough for a 256 bit random key */
244 uint32_t        swap_crypt_key2[8];   /* big enough for a 256 bit random key */
245 
246 #if DEVELOPMENT || DEBUG
/* Scratch pages for the encrypt/decrypt self-test run by swap_crypt_initialize(). */
247 boolean_t       swap_crypt_xts_tested = FALSE;
248 unsigned char   swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
249 unsigned char   swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
250 unsigned char   swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
251 #endif /* DEVELOPMENT || DEBUG */
252 
/* Running totals, in pages, of data passed through vm_swap_encrypt() / vm_swap_decrypt(). */
253 unsigned long   vm_page_encrypt_counter;
254 unsigned long   vm_page_decrypt_counter;
256 
257 void
swap_crypt_initialize(void)258 swap_crypt_initialize(void)
259 {
260 	uint8_t  *enckey1, *enckey2;
261 	int      keylen1, keylen2;
262 	int      error;
263 
264 	assert(swap_crypt_initialized == FALSE);
265 
266 	keylen1 = sizeof(swap_crypt_key1);
267 	enckey1 = (uint8_t *)&swap_crypt_key1;
268 	keylen2 = sizeof(swap_crypt_key2);
269 	enckey2 = (uint8_t *)&swap_crypt_key2;
270 
271 	error = cc_rand_generate((void *)enckey1, keylen1);
272 	assert(!error);
273 
274 	error = cc_rand_generate((void *)enckey2, keylen2);
275 	assert(!error);
276 
277 	error = xts_start(0, NULL, enckey1, keylen1, enckey2, keylen2, 0, 0, &xts_modectx);
278 	assert(!error);
279 
280 	swap_crypt_initialized = TRUE;
281 
282 #if DEVELOPMENT || DEBUG
283 	uint8_t *encptr;
284 	uint8_t *decptr;
285 	uint8_t *refptr;
286 	uint8_t *iv;
287 	uint64_t ivnum[2];
288 	int size = 0;
289 	int i    = 0;
290 	int rc   = 0;
291 
292 	assert(swap_crypt_xts_tested == FALSE);
293 
294 	/*
295 	 * Validate the encryption algorithms.
296 	 *
297 	 * First initialize the test data.
298 	 */
299 	for (i = 0; i < 4096; i++) {
300 		swap_crypt_test_page_ref[i] = (char) i;
301 	}
302 	ivnum[0] = (uint64_t)0xaa;
303 	ivnum[1] = 0;
304 	iv = (uint8_t *)ivnum;
305 
306 	refptr = (uint8_t *)swap_crypt_test_page_ref;
307 	encptr = (uint8_t *)swap_crypt_test_page_encrypt;
308 	decptr = (uint8_t *)swap_crypt_test_page_decrypt;
309 	size = 4096;
310 
311 	/* encrypt */
312 	rc = xts_encrypt(refptr, size, encptr, iv, &xts_modectx);
313 	assert(!rc);
314 
315 	/* compare result with original - should NOT match */
316 	for (i = 0; i < 4096; i++) {
317 		if (swap_crypt_test_page_encrypt[i] !=
318 		    swap_crypt_test_page_ref[i]) {
319 			break;
320 		}
321 	}
322 	assert(i != 4096);
323 
324 	/* decrypt */
325 	rc = xts_decrypt(encptr, size, decptr, iv, &xts_modectx);
326 	assert(!rc);
327 
328 	/* compare result with original */
329 	for (i = 0; i < 4096; i++) {
330 		if (swap_crypt_test_page_decrypt[i] !=
331 		    swap_crypt_test_page_ref[i]) {
332 			panic("encryption test failed");
333 		}
334 	}
335 	/* encrypt in place */
336 	rc = xts_encrypt(decptr, size, decptr, iv, &xts_modectx);
337 	assert(!rc);
338 
339 	/* decrypt in place */
340 	rc = xts_decrypt(decptr, size, decptr, iv, &xts_modectx);
341 	assert(!rc);
342 
343 	for (i = 0; i < 4096; i++) {
344 		if (swap_crypt_test_page_decrypt[i] !=
345 		    swap_crypt_test_page_ref[i]) {
346 			panic("in place encryption test failed");
347 		}
348 	}
349 	swap_crypt_xts_tested = TRUE;
350 #endif /* DEVELOPMENT || DEBUG */
351 }
352 
353 
354 void
vm_swap_encrypt(c_segment_t c_seg)355 vm_swap_encrypt(c_segment_t c_seg)
356 {
357 	uint8_t *ptr;
358 	uint8_t *iv;
359 	uint64_t ivnum[2];
360 	int size = 0;
361 	int rc   = 0;
362 
363 	if (swap_crypt_initialized == FALSE) {
364 		swap_crypt_initialize();
365 	}
366 
367 	/*
368 	 * Data stored in the compressor should never need to be faulted in.
369 	 * Make sure pages storing data that we're encrypting cannot
370 	 * be stolen out from under us in the off chance that the mapping
371 	 * gets disconnected while we're actively encrypting.
372 	 */
373 	PAGE_REPLACEMENT_DISALLOWED(TRUE);
374 #if DEVELOPMENT || DEBUG
375 	C_SEG_MAKE_WRITEABLE(c_seg);
376 #endif
377 	ptr = (uint8_t *)c_seg->c_store.c_buffer;
378 	size = round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset));
379 
380 	ivnum[0] = (uint64_t)c_seg;
381 	ivnum[1] = 0;
382 	iv = (uint8_t *)ivnum;
383 
384 	rc = xts_encrypt(ptr, size, ptr, iv, &xts_modectx);
385 	assert(!rc);
386 
387 	vm_page_encrypt_counter += (size / PAGE_SIZE_64);
388 
389 #if DEVELOPMENT || DEBUG
390 	C_SEG_WRITE_PROTECT(c_seg);
391 #endif
392 	PAGE_REPLACEMENT_DISALLOWED(FALSE);
393 }
394 
395 void
vm_swap_decrypt(c_segment_t c_seg,bool disallow_page_replacement)396 vm_swap_decrypt(c_segment_t c_seg, bool disallow_page_replacement)
397 {
398 	uint8_t *ptr;
399 	uint8_t *iv;
400 	uint64_t ivnum[2];
401 	int size = 0;
402 	int rc   = 0;
403 
404 	assert(swap_crypt_initialized);
405 
406 	/*
407 	 * See comment in vm_swap_encrypt().
408 	 * The master lock may already be held, though, which is why we don't do
409 	 * PAGE_REPLACEMENT_DISALLOWED(TRUE) and do a try_lock instead.
410 	 */
411 	if (disallow_page_replacement) {
412 		PAGE_REPLACEMENT_DISALLOWED(TRUE);
413 	}
414 
415 #if DEVELOPMENT || DEBUG
416 	C_SEG_MAKE_WRITEABLE(c_seg);
417 #endif
418 	ptr = (uint8_t *)c_seg->c_store.c_buffer;
419 	size = round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset));
420 
421 	ivnum[0] = (uint64_t)c_seg;
422 	ivnum[1] = 0;
423 	iv = (uint8_t *)ivnum;
424 
425 	rc = xts_decrypt(ptr, size, ptr, iv, &xts_modectx);
426 	assert(!rc);
427 
428 	vm_page_decrypt_counter += (size / PAGE_SIZE_64);
429 
430 #if DEVELOPMENT || DEBUG
431 	C_SEG_WRITE_PROTECT(c_seg);
432 #endif
433 	if (disallow_page_replacement) {
434 		PAGE_REPLACEMENT_DISALLOWED(FALSE);
435 	}
436 }
437 #endif /* ENCRYPTED_SWAP */
438 
/*
 * Sizing values, all filled in by vm_compressor_swap_init():
 * compressed_swap_chunk_size is c_seg_bufsize; the *_segs values are segment
 * counts derived from MIN_SWAP_FILE_SIZE / MAX_SWAP_FILE_SIZE divided by that
 * chunk size.
 */
439 uint64_t compressed_swap_chunk_size, vm_swapfile_hiwater_segs, swapfile_reclaim_threshold_segs, swapfile_reclam_minimum_segs;
440 extern bool memorystatus_swap_all_apps;
441 
442 void
vm_compressor_swap_init_swap_file_limit(void)443 vm_compressor_swap_init_swap_file_limit(void)
444 {
445 	vm_num_swap_files_config = VM_MAX_SWAP_FILE_NUM;
446 #if CONFIG_JETSAM
447 	if (memorystatus_swap_all_apps) {
448 		if (vm_swap_volume_capacity == 0) {
449 			/*
450 			 * Early in boot we don't know the swap volume capacity.
451 			 * That's fine. Reserve space for the maximum config
452 			 * and we'll lower this later in boot once we have the capacity.
453 			 */
454 			vm_num_swap_files_config = VM_MAX_SWAP_FILE_SWAP_ENABLED_NUM;
455 		} else {
456 			static uint64_t kFixedPointFactor = 100;
457 			/*
458 			 * Scale the max number of swap files linearly.
459 			 * But we can never go above VM_MAX_SWAP_FILE_SWAP_ENABLED_NUM.
460 			 */
461 			vm_num_swap_files_config = vm_swap_volume_capacity * kFixedPointFactor / VM_SWAP_MIN_VOLUME_CAPACITY
462 			    * VM_MIN_SWAP_FILE_SWAP_ENABLED_NUM / kFixedPointFactor;
463 			vm_num_swap_files_config = MAX(vm_num_swap_files_config, VM_MIN_SWAP_FILE_SWAP_ENABLED_NUM);
464 			vm_num_swap_files_config = MIN(vm_num_swap_files_config, VM_MAX_SWAP_FILE_SWAP_ENABLED_NUM);
465 		}
466 	}
467 #endif /* CONFIG_JETSAM */
468 #if DEVELOPMENT || DEBUG
469 	typeof(vm_num_swap_files_config) parsed_vm_max_num_swap_files = 0;
470 	if (PE_parse_boot_argn("vm_max_num_swap_files", &parsed_vm_max_num_swap_files, sizeof(parsed_vm_max_num_swap_files))) {
471 		if (parsed_vm_max_num_swap_files > 0) {
472 			vm_num_swap_files_config = parsed_vm_max_num_swap_files;
473 		} else {
474 			printf("WARNING: Ignoring vm_max_num_swap_files=%d boot-arg. Value must be > 0\n", parsed_vm_max_num_swap_files);
475 		}
476 	}
477 #endif
478 	printf("Maximum number of VM swap files: %d\n", vm_num_swap_files_config);
479 }
480 
481 int vm_swap_enabled = 0;
482 void
vm_compressor_swap_init(void)483 vm_compressor_swap_init(void)
484 {
485 	thread_t        thread = NULL;
486 
487 	queue_init(&swf_global_queue);
488 
489 #if !XNU_TARGET_OS_OSX
490 	/*
491 	 * dummy value until the swap file gets created
492 	 * when we drive the first c_segment_t to the
493 	 * swapout queue... at that time we will
494 	 * know the true size we have to work with
495 	 */
496 	c_overage_swapped_limit = 16;
497 #endif /* !XNU_TARGET_OS_OSX */
498 
499 	compressed_swap_chunk_size = c_seg_bufsize;
500 	vm_swapfile_hiwater_segs = (MIN_SWAP_FILE_SIZE / compressed_swap_chunk_size);
501 	swapfile_reclaim_threshold_segs = ((17 * (MAX_SWAP_FILE_SIZE / compressed_swap_chunk_size)) / 10);
502 	swapfile_reclam_minimum_segs = ((13 * (MAX_SWAP_FILE_SIZE / compressed_swap_chunk_size)) / 10);
503 
504 	if (kernel_thread_start_priority((thread_continue_t)vm_swapout_thread, NULL,
505 	    BASEPRI_VM, &thread) != KERN_SUCCESS) {
506 		panic("vm_swapout_thread: create failed");
507 	}
508 	thread_set_thread_name(thread, "VM_swapout");
509 	vm_swapout_thread_id = thread->thread_id;
510 	thread_deallocate(thread);
511 
512 	if (kernel_thread_start_priority((thread_continue_t)vm_swapfile_create_thread, NULL,
513 	    BASEPRI_VM, &thread) != KERN_SUCCESS) {
514 		panic("vm_swapfile_create_thread: create failed");
515 	}
516 	thread_set_thread_name(thread, "VM_swapfile_create");
517 	thread_deallocate(thread);
518 
519 	if (kernel_thread_start_priority((thread_continue_t)vm_swapfile_gc_thread, NULL,
520 	    BASEPRI_VM, &thread) != KERN_SUCCESS) {
521 		panic("vm_swapfile_gc_thread: create failed");
522 	}
523 	thread_set_thread_name(thread, "VM_swapfile_gc");
524 	/*
525 	 * Swapfile garbage collection will need to allocate memory
526 	 * to complete its swap reclaim and in-memory compaction.
527 	 * So allow it to dip into the reserved VM page pool.
528 	 */
529 	thread_lock(thread);
530 	thread->options |= TH_OPT_VMPRIV;
531 	thread_unlock(thread);
532 	thread_deallocate(thread);
533 	proc_set_thread_policy_with_tid(kernel_task, thread->thread_id,
534 	    TASK_POLICY_INTERNAL, TASK_POLICY_IO, THROTTLE_LEVEL_COMPRESSOR_TIER2);
535 	proc_set_thread_policy_with_tid(kernel_task, thread->thread_id,
536 	    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
537 
538 	vm_swap_enabled = 1;
539 	printf("VM Swap Subsystem is ON\n");
540 }
541 
542 
543 #if RECORD_THE_COMPRESSED_DATA
544 
545 void
c_compressed_record_init()546 c_compressed_record_init()
547 {
548 	if (c_compressed_record_init_done == FALSE) {
549 		vm_swapfile_open("/tmp/compressed_data", &c_compressed_record_vp);
550 		c_compressed_record_init_done = TRUE;
551 	}
552 }
553 
554 void
c_compressed_record_write(char * buf,int size)555 c_compressed_record_write(char *buf, int size)
556 {
557 	if (c_compressed_record_write_error == 0) {
558 		c_compressed_record_write_error = vm_record_file_write(c_compressed_record_vp, c_compressed_record_file_offset, buf, size);
559 		c_compressed_record_file_offset += size;
560 	}
561 }
562 #endif
563 
564 
565 int             compaction_swapper_inited = 0;
566 
567 void
vm_compaction_swapper_do_init(void)568 vm_compaction_swapper_do_init(void)
569 {
570 	struct  vnode *vp;
571 	char    *pathname;
572 	int     namelen;
573 
574 	if (compaction_swapper_inited) {
575 		return;
576 	}
577 
578 	if (vm_compressor_mode != VM_PAGER_COMPRESSOR_WITH_SWAP) {
579 		compaction_swapper_inited = 1;
580 		return;
581 	}
582 	lck_mtx_lock(&vm_swap_data_lock);
583 
584 	if (!compaction_swapper_inited) {
585 		namelen = (int)strlen(swapfilename) + SWAPFILENAME_INDEX_LEN + 1;
586 		pathname = kalloc_data(namelen, Z_WAITOK | Z_ZERO);
587 		snprintf(pathname, namelen, "%s%d", swapfilename, 0);
588 
589 		vm_swapfile_open(pathname, &vp);
590 
591 		if (vp) {
592 			if (vnode_pager_isSSD(vp) == FALSE) {
593 				/*
594 				 * swap files live on an HDD, so let's make sure to start swapping
595 				 * much earlier since we're not worried about SSD write-wear and
596 				 * we have so little write bandwidth to work with
597 				 * these values were derived expermentially by running the performance
598 				 * teams stock test for evaluating HDD performance against various
599 				 * combinations and looking and comparing overall results.
600 				 * Note that the > relationship between these 4 values must be maintained
601 				 */
602 				if (vm_compressor_minorcompact_threshold_divisor_overridden == 0) {
603 					vm_compressor_minorcompact_threshold_divisor = 15;
604 				}
605 				if (vm_compressor_majorcompact_threshold_divisor_overridden == 0) {
606 					vm_compressor_majorcompact_threshold_divisor = 18;
607 				}
608 				if (vm_compressor_unthrottle_threshold_divisor_overridden == 0) {
609 					vm_compressor_unthrottle_threshold_divisor = 24;
610 				}
611 				if (vm_compressor_catchup_threshold_divisor_overridden == 0) {
612 					vm_compressor_catchup_threshold_divisor = 30;
613 				}
614 			}
615 #if XNU_TARGET_OS_OSX
616 			vnode_setswapmount(vp);
617 			vm_swappin_avail = vnode_getswappin_avail(vp);
618 
619 			if (vm_swappin_avail) {
620 				vm_swappin_enabled = TRUE;
621 			}
622 #endif /* XNU_TARGET_OS_OSX */
623 			vm_swapfile_close((uint64_t)pathname, vp);
624 		}
625 		kfree_data(pathname, namelen);
626 
627 		compaction_swapper_inited = 1;
628 	}
629 	lck_mtx_unlock(&vm_swap_data_lock);
630 }
631 
632 
633 void
vm_swap_consider_defragmenting(int flags)634 vm_swap_consider_defragmenting(int flags)
635 {
636 	boolean_t force_defrag = (flags & VM_SWAP_FLAGS_FORCE_DEFRAG);
637 	boolean_t force_reclaim = (flags & VM_SWAP_FLAGS_FORCE_RECLAIM);
638 
639 	if (compressor_store_stop_compaction == FALSE && !VM_SWAP_BUSY() &&
640 	    (force_defrag || force_reclaim || VM_SWAP_SHOULD_DEFRAGMENT() || VM_SWAP_SHOULD_RECLAIM())) {
641 		if (!vm_swapfile_gc_thread_running || force_defrag || force_reclaim) {
642 			lck_mtx_lock(&vm_swap_data_lock);
643 
644 			if (force_defrag) {
645 				vm_swap_force_defrag = TRUE;
646 			}
647 
648 			if (force_reclaim) {
649 				vm_swap_force_reclaim = TRUE;
650 			}
651 
652 			if (!vm_swapfile_gc_thread_running) {
653 				thread_wakeup((event_t) &vm_swapfile_gc_needed);
654 			}
655 
656 			lck_mtx_unlock(&vm_swap_data_lock);
657 		}
658 	}
659 }
660 
661 
/*
 * Statistics maintained by vm_swap_defragment():
 *   yielded - the pass stopped early (compaction stopped or swapper busy)
 *   swapin  - a sparse segment was handed to c_seg_swapin()
 *   free    - an empty sparse segment was freed
 *   busy    - the pass had to wait for a busy segment
 */
662 int vm_swap_defragment_yielded = 0;
663 int vm_swap_defragment_swapin = 0;
664 int vm_swap_defragment_free = 0;
665 int vm_swap_defragment_busy = 0;
666 
667 #if CONFIG_FREEZE
/* Compressor accounting consulted by vm_swap_defragment() before it swaps a segment back in. */
668 extern int32_t c_segment_pages_compressed_incore;
669 extern int32_t c_segment_pages_compressed_incore_late_swapout;
670 extern uint32_t c_segment_pages_compressed_nearing_limit;
671 extern uint32_t c_segment_count;
672 extern uint32_t c_segments_nearing_limit;
673 
674 extern bool freezer_incore_cseg_acct;
675 #endif /* CONFIG_FREEZE */
676 
/*
 * vm_swap_defragment:
 *
 * Drain c_swappedout_sparse_list_head. The segment at the head of the list
 * is handled on each iteration:
 *   - if it is busy, wait for it and look at the list again;
 *   - if it holds no data (c_bytes_used == 0), free it;
 *   - otherwise bring it back in with c_seg_swapin().
 * The pass stops early when compaction has been stopped or the swapper is
 * busy. Between segments the master lock and c_list_lock are dropped and
 * the thread goes through vm_pageout_io_throttle().
 */
677 static void
vm_swap_defragment()678 vm_swap_defragment()
679 {
680 	c_segment_t     c_seg;
681 
682 	/*
683 	 * have to grab the master lock w/o holding
684 	 * any locks in spin mode
685 	 */
686 	PAGE_REPLACEMENT_DISALLOWED(TRUE);
687 
688 	lck_mtx_lock_spin_always(c_list_lock);
689 
690 	while (!queue_empty(&c_swappedout_sparse_list_head)) {
691 		if (compressor_store_stop_compaction == TRUE || VM_SWAP_BUSY()) {
692 			vm_swap_defragment_yielded++;
693 			break;
694 		}
695 		c_seg = (c_segment_t)queue_first(&c_swappedout_sparse_list_head);
696 
697 		lck_mtx_lock_spin_always(&c_seg->c_lock);
698 
699 		assert(c_seg->c_state == C_ON_SWAPPEDOUTSPARSE_Q);
700 
701 		if (c_seg->c_busy) {
			/* drop everything but the segment lock before sleeping on the segment */
702 			lck_mtx_unlock_always(c_list_lock);
703 
704 			PAGE_REPLACEMENT_DISALLOWED(FALSE);
705 			/*
706 			 * c_seg_wait_on_busy consumes c_seg->c_lock
707 			 */
708 			c_seg_wait_on_busy(c_seg);
709 
			/* retake the locks in the same order as at function entry, then re-read the list head */
710 			PAGE_REPLACEMENT_DISALLOWED(TRUE);
711 
712 			lck_mtx_lock_spin_always(c_list_lock);
713 
714 			vm_swap_defragment_busy++;
715 			continue;
716 		}
717 		if (c_seg->c_bytes_used == 0) {
718 			/*
719 			 * c_seg_free_locked consumes the c_list_lock
720 			 * and c_seg->c_lock
721 			 */
722 			C_SEG_BUSY(c_seg);
723 			c_seg_free_locked(c_seg);
724 
725 			vm_swap_defragment_free++;
726 		} else {
727 			lck_mtx_unlock_always(c_list_lock);
728 
729 #if CONFIG_FREEZE
730 			if (freezer_incore_cseg_acct) {
731 				/*
732 				 * TODO(jason): These two are tricky because they're pre-emptive jetsams.
733 				 * The system is not unhealthy, but we know that it's about to become unhealthy once
734 				 * we do this swapin.
735 				 * So we're waking up the memorystatus thread to make space
736 				 * (hopefully) before this segment comes in.
737 				 *
738 				 * I think the compressor_backing_store needs to keep track of
739 				 * two new globals that will track the number of segments
740 				 * being swapped in due to defrag and the number of slots used
741 				 * in those segments.
742 				 * Then the health check below can be called from the memorystatus
743 				 * thread.
744 				 */
745 				if ((c_seg->c_slots_used + c_segment_pages_compressed_incore) >= c_segment_pages_compressed_nearing_limit) {
746 					memorystatus_kill_on_VM_compressor_space_shortage(TRUE /* async */);
747 				}
748 
				/* segments neither on the swapped-out nor on the sparse list count as in-core */
749 				uint32_t incore_seg_count = c_segment_count - c_swappedout_count - c_swappedout_sparse_count;
750 				if ((incore_seg_count + 1) >= c_segments_nearing_limit) {
751 					memorystatus_kill_on_VM_compressor_space_shortage(TRUE /* async */);
752 				}
753 			}
754 #endif /* CONFIG_FREEZE */
			/* only a 0 return leaves c_seg->c_lock for this function to release */
755 			if (c_seg_swapin(c_seg, TRUE, FALSE) == 0) {
756 				lck_mtx_unlock_always(&c_seg->c_lock);
757 				vmcs_stats.defrag_swapins += (round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset))) >> PAGE_SHIFT;
758 			}
759 
760 			vm_swap_defragment_swapin++;
761 		}
762 		PAGE_REPLACEMENT_DISALLOWED(FALSE);
763 
764 		vm_pageout_io_throttle();
765 
766 		/*
767 		 * because write waiters have privilege over readers,
768 		 * dropping and immediately retaking the master lock will
769 		 * still allow any thread waiting to acquire the
770 		 * master lock exclusively an opportunity to take it
771 		 */
772 		PAGE_REPLACEMENT_DISALLOWED(TRUE);
773 
774 		lck_mtx_lock_spin_always(c_list_lock);
775 	}
776 	lck_mtx_unlock_always(c_list_lock);
777 
778 	PAGE_REPLACEMENT_DISALLOWED(FALSE);
779 }
780 
781 TUNABLE(uint64_t, vm_swapfile_creation_delay_ns, "vm_swapfile_creation_delay_ns", 15 * NSEC_PER_SEC);
782 
783 static inline bool
vm_swapfile_should_create(uint64_t now)784 vm_swapfile_should_create(uint64_t now)
785 {
786 	uint64_t delta_failed_creation_ns;
787 	absolutetime_to_nanoseconds(now - vm_swapfile_last_failed_to_create_ts, &delta_failed_creation_ns);
788 
789 	return (vm_num_swap_files < vm_num_swap_files_config) &&
790 	       ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) < (unsigned int)vm_swapfile_hiwater_segs) &&
791 	       (delta_failed_creation_ns > vm_swapfile_creation_delay_ns);
792 }
793 
/* Guards the one-time setup at the top of vm_swapfile_create_thread(). */
794 bool vm_swapfile_create_thread_inited = false;
795 
/*
 * vm_swapfile_create_thread:
 *
 * Body of the "VM_swapfile_create" kernel thread. The function never
 * returns: it ends by blocking with itself as the continuation, so every
 * wakeup on &vm_swapfile_create_needed re-enters it from the top.
 *
 * Each run handles delayed trims and then creates swap files for as long as
 * vm_swapfile_should_create() says so, stopping early while hibernation with
 * pinned swap is in progress or compaction has been stopped.
 */
796 static void
vm_swapfile_create_thread(void)797 vm_swapfile_create_thread(void)
798 {
799 	uint64_t now;
800 
801 	if (!vm_swapfile_create_thread_inited) {
802 #if CONFIG_THREAD_GROUPS
803 		thread_group_vm_add();
804 #endif /* CONFIG_THREAD_GROUPS */
805 		current_thread()->options |= TH_OPT_VMPRIV;
806 
807 		vm_swapfile_create_thread_inited = true;
808 	}
809 
810 	vm_swapfile_create_thread_awakened++;
811 	vm_swapfile_create_thread_running = 1;
812 
	/* every "break" below leaves this loop with vm_swap_data_lock held */
813 	while (TRUE) {
814 		/*
815 		 * walk through the list of swap files
816 		 * and do the delayed frees/trims for
817 		 * any swap file whose count of delayed
818 		 * frees is above the batch limit
819 		 */
820 		vm_swap_handle_delayed_trims(FALSE);
821 
822 		lck_mtx_lock(&vm_swap_data_lock);
823 
824 		if (hibernate_in_progress_with_pinned_swap == TRUE) {
825 			break;
826 		}
827 
828 		if (compressor_store_stop_compaction == TRUE) {
829 			break;
830 		}
831 
832 		now = mach_absolute_time();
833 
834 		if (!vm_swapfile_should_create(now)) {
835 			break;
836 		}
837 
		/* the file is created without the lock held */
838 		lck_mtx_unlock(&vm_swap_data_lock);
839 
840 		if (vm_swap_create_file() == FALSE) {
841 			vm_swapfile_last_failed_to_create_ts = now;
842 			HIBLOG("low swap: failed to create swapfile\n");
843 		} else {
844 			vm_swapfile_last_successful_create_ts = now;
845 		}
846 	}
847 	vm_swapfile_create_thread_running = 0;
848 
	/* wake anyone sleeping on these flags now that this thread is no longer running (e.g. hibernate_pin_swap()) */
849 	if (hibernate_in_progress_with_pinned_swap == TRUE) {
850 		thread_wakeup((event_t)&hibernate_in_progress_with_pinned_swap);
851 	}
852 
853 	if (compressor_store_stop_compaction == TRUE) {
854 		thread_wakeup((event_t)&compressor_store_stop_compaction);
855 	}
856 
	/* register for the next wakeup before the lock is released */
857 	assert_wait((event_t)&vm_swapfile_create_needed, THREAD_UNINT);
858 
859 	lck_mtx_unlock(&vm_swap_data_lock);
860 
861 	thread_block((thread_continue_t)vm_swapfile_create_thread);
862 
863 	/* NOTREACHED */
864 }
865 
866 
867 #if HIBERNATION
868 
869 kern_return_t
hibernate_pin_swap(boolean_t start)870 hibernate_pin_swap(boolean_t start)
871 {
872 	vm_compaction_swapper_do_init();
873 
874 	if (start == FALSE) {
875 		lck_mtx_lock(&vm_swap_data_lock);
876 		hibernate_in_progress_with_pinned_swap = FALSE;
877 		lck_mtx_unlock(&vm_swap_data_lock);
878 
879 		return KERN_SUCCESS;
880 	}
881 	if (vm_swappin_enabled == FALSE) {
882 		return KERN_SUCCESS;
883 	}
884 
885 	lck_mtx_lock(&vm_swap_data_lock);
886 
887 	hibernate_in_progress_with_pinned_swap = TRUE;
888 
889 	while (vm_swapfile_create_thread_running || vm_swapfile_gc_thread_running) {
890 		assert_wait((event_t)&hibernate_in_progress_with_pinned_swap, THREAD_UNINT);
891 
892 		lck_mtx_unlock(&vm_swap_data_lock);
893 
894 		thread_block(THREAD_CONTINUE_NULL);
895 
896 		lck_mtx_lock(&vm_swap_data_lock);
897 	}
898 	if (vm_num_swap_files > vm_num_pinned_swap_files) {
899 		hibernate_in_progress_with_pinned_swap = FALSE;
900 		lck_mtx_unlock(&vm_swap_data_lock);
901 
902 		HIBLOG("hibernate_pin_swap failed - vm_num_swap_files = %d, vm_num_pinned_swap_files = %d\n",
903 		    vm_num_swap_files, vm_num_pinned_swap_files);
904 		return KERN_FAILURE;
905 	}
906 	lck_mtx_unlock(&vm_swap_data_lock);
907 
908 	while (VM_SWAP_SHOULD_PIN(MAX_SWAP_FILE_SIZE)) {
909 		if (vm_swap_create_file() == FALSE) {
910 			break;
911 		}
912 	}
913 	return KERN_SUCCESS;
914 }
915 #endif
916 bool vm_swapfile_gc_thread_inited = false;
917 static void
vm_swapfile_gc_thread(void)918 vm_swapfile_gc_thread(void)
919 {
920 	boolean_t       need_defragment;
921 	boolean_t       need_reclaim;
922 
923 	if (!vm_swapfile_gc_thread_inited) {
924 #if CONFIG_THREAD_GROUPS
925 		thread_group_vm_add();
926 #endif /* CONFIG_THREAD_GROUPS */
927 		vm_swapfile_gc_thread_inited = true;
928 	}
929 
930 	vm_swapfile_gc_thread_awakened++;
931 	vm_swapfile_gc_thread_running = 1;
932 
933 	while (TRUE) {
934 		lck_mtx_lock(&vm_swap_data_lock);
935 
936 		if (hibernate_in_progress_with_pinned_swap == TRUE) {
937 			break;
938 		}
939 
940 		if (VM_SWAP_BUSY() || compressor_store_stop_compaction == TRUE) {
941 			break;
942 		}
943 
944 		need_defragment = FALSE;
945 		need_reclaim = FALSE;
946 
947 		if (VM_SWAP_SHOULD_DEFRAGMENT()) {
948 			need_defragment = TRUE;
949 		}
950 
951 		if (VM_SWAP_SHOULD_RECLAIM()) {
952 			need_defragment = TRUE;
953 			need_reclaim = TRUE;
954 		}
955 		if (need_defragment == FALSE && need_reclaim == FALSE) {
956 			break;
957 		}
958 
959 		vm_swap_force_defrag = FALSE;
960 		vm_swap_force_reclaim = FALSE;
961 
962 		lck_mtx_unlock(&vm_swap_data_lock);
963 
964 		if (need_defragment == TRUE) {
965 			vm_swap_defragment();
966 		}
967 		if (need_reclaim == TRUE) {
968 			vm_swap_reclaim();
969 		}
970 	}
971 	vm_swapfile_gc_thread_running = 0;
972 
973 	if (hibernate_in_progress_with_pinned_swap == TRUE) {
974 		thread_wakeup((event_t)&hibernate_in_progress_with_pinned_swap);
975 	}
976 
977 	if (compressor_store_stop_compaction == TRUE) {
978 		thread_wakeup((event_t)&compressor_store_stop_compaction);
979 	}
980 
981 	assert_wait((event_t)&vm_swapfile_gc_needed, THREAD_UNINT);
982 
983 	lck_mtx_unlock(&vm_swap_data_lock);
984 
985 	thread_block((thread_continue_t)vm_swapfile_gc_thread);
986 
987 	/* NOTREACHED */
988 }
989 
990 
991 
992 #define   VM_SWAPOUT_LIMIT_T2P  4
993 #define   VM_SWAPOUT_LIMIT_T1P  4
994 #define   VM_SWAPOUT_LIMIT_T0P  6
995 #define   VM_SWAPOUT_LIMIT_T0   8
996 #define   VM_SWAPOUT_LIMIT_MAX  8
997 
998 #define   VM_SWAPOUT_START      0
999 #define   VM_SWAPOUT_T2_PASSIVE 1
1000 #define   VM_SWAPOUT_T1_PASSIVE 2
1001 #define   VM_SWAPOUT_T0_PASSIVE 3
1002 #define   VM_SWAPOUT_T0         4
1003 
1004 int vm_swapout_state = VM_SWAPOUT_START;
1005 int vm_swapout_limit = 1;
1006 
1007 int vm_swapper_entered_T0  = 0;
1008 int vm_swapper_entered_T0P = 0;
1009 int vm_swapper_entered_T1P = 0;
1010 int vm_swapper_entered_T2P = 0;
1011 
1012 
1013 static void
vm_swapout_thread_throttle_adjust(void)1014 vm_swapout_thread_throttle_adjust(void)
1015 {
1016 	switch (vm_swapout_state) {
1017 	case VM_SWAPOUT_START:
1018 
1019 		vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER2;
1020 		vm_swapper_entered_T2P++;
1021 
1022 		proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1023 		    TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle);
1024 		proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1025 		    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
1026 		vm_swapout_limit = VM_SWAPOUT_LIMIT_T2P;
1027 		vm_swapout_state = VM_SWAPOUT_T2_PASSIVE;
1028 
1029 		break;
1030 
1031 	case VM_SWAPOUT_T2_PASSIVE:
1032 
1033 		if (SWAPPER_NEEDS_TO_UNTHROTTLE()) {
1034 			vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER0;
1035 			vm_swapper_entered_T0P++;
1036 
1037 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1038 			    TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle);
1039 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1040 			    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
1041 			vm_swapout_limit = VM_SWAPOUT_LIMIT_T0P;
1042 			vm_swapout_state = VM_SWAPOUT_T0_PASSIVE;
1043 
1044 			break;
1045 		}
1046 		if (swapout_target_age || hibernate_flushing == TRUE) {
1047 			vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER1;
1048 			vm_swapper_entered_T1P++;
1049 
1050 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1051 			    TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle);
1052 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1053 			    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
1054 			vm_swapout_limit = VM_SWAPOUT_LIMIT_T1P;
1055 			vm_swapout_state = VM_SWAPOUT_T1_PASSIVE;
1056 		}
1057 		break;
1058 
1059 	case VM_SWAPOUT_T1_PASSIVE:
1060 
1061 		if (SWAPPER_NEEDS_TO_UNTHROTTLE()) {
1062 			vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER0;
1063 			vm_swapper_entered_T0P++;
1064 
1065 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1066 			    TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle);
1067 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1068 			    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
1069 			vm_swapout_limit = VM_SWAPOUT_LIMIT_T0P;
1070 			vm_swapout_state = VM_SWAPOUT_T0_PASSIVE;
1071 
1072 			break;
1073 		}
1074 		if (swapout_target_age == 0 && hibernate_flushing == FALSE) {
1075 			vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER2;
1076 			vm_swapper_entered_T2P++;
1077 
1078 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1079 			    TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle);
1080 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1081 			    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
1082 			vm_swapout_limit = VM_SWAPOUT_LIMIT_T2P;
1083 			vm_swapout_state = VM_SWAPOUT_T2_PASSIVE;
1084 		}
1085 		break;
1086 
1087 	case VM_SWAPOUT_T0_PASSIVE:
1088 
1089 		if (SWAPPER_NEEDS_TO_RETHROTTLE()) {
1090 			vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER2;
1091 			vm_swapper_entered_T2P++;
1092 
1093 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1094 			    TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle);
1095 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1096 			    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
1097 			vm_swapout_limit = VM_SWAPOUT_LIMIT_T2P;
1098 			vm_swapout_state = VM_SWAPOUT_T2_PASSIVE;
1099 
1100 			break;
1101 		}
1102 		if (SWAPPER_NEEDS_TO_CATCHUP()) {
1103 			vm_swapper_entered_T0++;
1104 
1105 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1106 			    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_DISABLE);
1107 			vm_swapout_limit = VM_SWAPOUT_LIMIT_T0;
1108 			vm_swapout_state = VM_SWAPOUT_T0;
1109 		}
1110 		break;
1111 
1112 	case VM_SWAPOUT_T0:
1113 
1114 		if (SWAPPER_HAS_CAUGHTUP()) {
1115 			vm_swapper_entered_T0P++;
1116 
1117 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1118 			    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
1119 			vm_swapout_limit = VM_SWAPOUT_LIMIT_T0P;
1120 			vm_swapout_state = VM_SWAPOUT_T0_PASSIVE;
1121 		}
1122 		break;
1123 	}
1124 }
1125 
1126 int vm_swapout_found_empty = 0;
1127 
1128 struct swapout_io_completion vm_swapout_ctx[VM_SWAPOUT_LIMIT_MAX];
1129 
1130 int vm_swapout_soc_busy = 0;
1131 int vm_swapout_soc_done = 0;
1132 
1133 
1134 static struct swapout_io_completion *
vm_swapout_find_free_soc(void)1135 vm_swapout_find_free_soc(void)
1136 {
1137 	int      i;
1138 
1139 	for (i = 0; i < VM_SWAPOUT_LIMIT_MAX; i++) {
1140 		if (vm_swapout_ctx[i].swp_io_busy == 0) {
1141 			return &vm_swapout_ctx[i];
1142 		}
1143 	}
1144 	assert(vm_swapout_soc_busy == VM_SWAPOUT_LIMIT_MAX);
1145 
1146 	return NULL;
1147 }
1148 
1149 static struct swapout_io_completion *
vm_swapout_find_done_soc(void)1150 vm_swapout_find_done_soc(void)
1151 {
1152 	int      i;
1153 
1154 	if (vm_swapout_soc_done) {
1155 		for (i = 0; i < VM_SWAPOUT_LIMIT_MAX; i++) {
1156 			if (vm_swapout_ctx[i].swp_io_done) {
1157 				return &vm_swapout_ctx[i];
1158 			}
1159 		}
1160 	}
1161 	return NULL;
1162 }
1163 
1164 static void
vm_swapout_complete_soc(struct swapout_io_completion * soc)1165 vm_swapout_complete_soc(struct swapout_io_completion *soc)
1166 {
1167 	kern_return_t  kr;
1168 
1169 	if (soc->swp_io_error) {
1170 		kr = KERN_FAILURE;
1171 	} else {
1172 		kr = KERN_SUCCESS;
1173 	}
1174 
1175 	lck_mtx_unlock_always(c_list_lock);
1176 
1177 	vm_swap_put_finish(soc->swp_swf, &soc->swp_f_offset, soc->swp_io_error, TRUE /*drop iocount*/);
1178 	vm_swapout_finish(soc->swp_c_seg, soc->swp_f_offset, soc->swp_c_size, kr);
1179 
1180 	lck_mtx_lock_spin_always(c_list_lock);
1181 
1182 	soc->swp_io_done = 0;
1183 	soc->swp_io_busy = 0;
1184 
1185 	vm_swapout_soc_busy--;
1186 	vm_swapout_soc_done--;
1187 }
1188 
1189 bool vm_swapout_thread_inited = false;
1190 extern uint32_t c_donate_swapout_count;
1191 #if CONFIG_JETSAM
1192 bool memorystatus_swap_over_trigger(uint64_t adjustment_factor);
1193 /*
1194  * swapout_sleep_threshold sets the percentage of the swapout threshold at which
1195  * the swap thread will stop processing the swapout queue.
1196  * By default this is 90 which means we will swap until the
1197  * swapout queue size is at 90% of the threshold to wake the swap thread.
1198  * By definition the queue  length must be >= 100% of the threshold when the.
1199  * swap thread is woken up. On development builds this can be adjusted with
1200  * the vm.swapout_sleep_threshold sysctl.
1201  */
1202 uint32_t swapout_sleep_threshold = 90;
1203 #endif /* CONFIG_JETSAM */
1204 static bool
should_process_swapout_queue(const queue_head_t * swapout_list_head)1205 should_process_swapout_queue(const queue_head_t *swapout_list_head)
1206 {
1207 	bool process_queue = !queue_empty(swapout_list_head) &&
1208 	    vm_swapout_soc_busy < vm_swapout_limit &&
1209 	    !compressor_store_stop_compaction;
1210 #if CONFIG_JETSAM
1211 	if (memorystatus_swap_all_apps && swapout_list_head == &c_late_swapout_list_head) {
1212 		process_queue = process_queue && memorystatus_swap_over_trigger(swapout_sleep_threshold);
1213 	}
1214 #endif /* CONFIG_JETSAM */
1215 	return process_queue;
1216 }
1217 
1218 void
vm_swapout_thread(void)1219 vm_swapout_thread(void)
1220 {
1221 	uint32_t        size = 0;
1222 	c_segment_t     c_seg = NULL;
1223 	kern_return_t   kr = KERN_SUCCESS;
1224 	struct swapout_io_completion *soc;
1225 	queue_head_t    *swapout_list_head;
1226 	bool            queues_empty = false;
1227 
1228 	if (!vm_swapout_thread_inited) {
1229 #if CONFIG_THREAD_GROUPS
1230 		thread_group_vm_add();
1231 #endif /* CONFIG_THREAD_GROUPS */
1232 		current_thread()->options |= TH_OPT_VMPRIV;
1233 		vm_swapout_thread_inited = true;
1234 	}
1235 
1236 	vm_swapout_thread_awakened++;
1237 
1238 	lck_mtx_lock_spin_always(c_list_lock);
1239 
1240 	swapout_list_head = &c_early_swapout_list_head;
1241 	vm_swapout_thread_running = TRUE;
1242 	os_atomic_store(&vm_swapout_wake_pending, false, relaxed);
1243 again:
1244 	while (should_process_swapout_queue(swapout_list_head)) {
1245 		c_seg = (c_segment_t)queue_first(swapout_list_head);
1246 
1247 		lck_mtx_lock_spin_always(&c_seg->c_lock);
1248 
1249 		assert(c_seg->c_state == C_ON_SWAPOUT_Q);
1250 
1251 		if (c_seg->c_busy) {
1252 			lck_mtx_unlock_always(c_list_lock);
1253 
1254 			c_seg_wait_on_busy(c_seg);
1255 
1256 			lck_mtx_lock_spin_always(c_list_lock);
1257 
1258 			continue;
1259 		}
1260 		vm_swapout_thread_processed_segments++;
1261 
1262 		size = round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset));
1263 
1264 		if (size == 0) {
1265 			assert(c_seg->c_bytes_used == 0);
1266 
1267 			/*
1268 			 * c_seg_free_locked will drop the c_list_lock and
1269 			 * the c_seg->c_lock.
1270 			 */
1271 			C_SEG_BUSY(c_seg);
1272 			c_seg_free_locked(c_seg);
1273 			c_seg = NULL;
1274 
1275 			vm_swapout_found_empty++;
1276 			goto c_seg_is_empty;
1277 		}
1278 		C_SEG_BUSY(c_seg);
1279 		c_seg->c_busy_swapping = 1;
1280 
1281 		c_seg_switch_state(c_seg, C_ON_SWAPIO_Q, FALSE);
1282 
1283 		lck_mtx_unlock_always(c_list_lock);
1284 		lck_mtx_unlock_always(&c_seg->c_lock);
1285 
1286 #if CHECKSUM_THE_SWAP
1287 		c_seg->cseg_hash = hash_string((char *)c_seg->c_store.c_buffer, (int)size);
1288 		c_seg->cseg_swap_size = size;
1289 #endif /* CHECKSUM_THE_SWAP */
1290 
1291 #if ENCRYPTED_SWAP
1292 		vm_swap_encrypt(c_seg);
1293 #endif /* ENCRYPTED_SWAP */
1294 
1295 		soc = vm_swapout_find_free_soc();
1296 		assert(soc);
1297 
1298 		soc->swp_upl_ctx.io_context = (void *)soc;
1299 		soc->swp_upl_ctx.io_done = (void *)vm_swapout_iodone;
1300 		soc->swp_upl_ctx.io_error = 0;
1301 
1302 		kr = vm_swap_put((vm_offset_t)c_seg->c_store.c_buffer, &soc->swp_f_offset, size, c_seg, soc);
1303 
1304 		if (kr != KERN_SUCCESS) {
1305 			if (soc->swp_io_done) {
1306 				lck_mtx_lock_spin_always(c_list_lock);
1307 
1308 				soc->swp_io_done = 0;
1309 				vm_swapout_soc_done--;
1310 
1311 				lck_mtx_unlock_always(c_list_lock);
1312 			}
1313 			vm_swapout_finish(c_seg, soc->swp_f_offset, size, kr);
1314 		} else {
1315 			soc->swp_io_busy = 1;
1316 			vm_swapout_soc_busy++;
1317 		}
1318 
1319 c_seg_is_empty:
1320 		if (!(c_early_swapout_count + c_regular_swapout_count + c_late_swapout_count)) {
1321 			vm_swap_consider_defragmenting(VM_SWAP_FLAGS_NONE);
1322 		}
1323 
1324 		lck_mtx_lock_spin_always(c_list_lock);
1325 
1326 		while ((soc = vm_swapout_find_done_soc())) {
1327 			vm_swapout_complete_soc(soc);
1328 		}
1329 		lck_mtx_unlock_always(c_list_lock);
1330 
1331 		vm_swapout_thread_throttle_adjust();
1332 
1333 		lck_mtx_lock_spin_always(c_list_lock);
1334 	}
1335 	while ((soc = vm_swapout_find_done_soc())) {
1336 		vm_swapout_complete_soc(soc);
1337 	}
1338 	lck_mtx_unlock_always(c_list_lock);
1339 
1340 	vm_pageout_io_throttle();
1341 
1342 	lck_mtx_lock_spin_always(c_list_lock);
1343 
1344 	/*
1345 	 * Recheck if we have some c_segs to wakeup
1346 	 * post throttle. And, check to see if we
1347 	 * have any more swapouts needed.
1348 	 */
1349 	if (vm_swapout_soc_done) {
1350 		goto again;
1351 	}
1352 
1353 #if XNU_TARGET_OS_OSX
1354 	queues_empty = queue_empty(&c_early_swapout_list_head) && queue_empty(&c_regular_swapout_list_head) && queue_empty(&c_late_swapout_list_head);
1355 #else /* XNU_TARGET_OS_OSX */
1356 	queues_empty = queue_empty(&c_early_swapout_list_head) && queue_empty(&c_late_swapout_list_head);
1357 #endif /* XNU_TARGET_OS_OSX */
1358 
1359 	if (!queues_empty) {
1360 		swapout_list_head = NULL;
1361 		if (!queue_empty(&c_early_swapout_list_head)) {
1362 			swapout_list_head = &c_early_swapout_list_head;
1363 		} else {
1364 #if XNU_TARGET_OS_OSX
1365 			/*
1366 			 * On macOS we _always_ processs all swapout queues.
1367 			 */
1368 			if (!queue_empty(&c_regular_swapout_list_head)) {
1369 				swapout_list_head = &c_regular_swapout_list_head;
1370 			} else {
1371 				swapout_list_head = &c_late_swapout_list_head;
1372 			}
1373 #else /* XNU_TARGET_OS_OSX */
1374 			/*
1375 			 * On non-macOS swap-capable platforms, we might want to
1376 			 * processs just the early queue (Freezer) or process both
1377 			 * early and late queues (app swap). We processed the early
1378 			 * queue up above. The late Q will only be processed if the
1379 			 * checks in should_process_swapout_queue give the go-ahead.
1380 			 */
1381 			swapout_list_head = &c_late_swapout_list_head;
1382 #endif /* XNU_TARGET_OS_OSX */
1383 		}
1384 		if (swapout_list_head && should_process_swapout_queue(swapout_list_head)) {
1385 			goto again;
1386 		}
1387 	}
1388 
1389 	assert_wait((event_t)&vm_swapout_thread, THREAD_UNINT);
1390 
1391 	vm_swapout_thread_running = FALSE;
1392 
1393 	lck_mtx_unlock_always(c_list_lock);
1394 
1395 	thread_block((thread_continue_t)vm_swapout_thread);
1396 
1397 	/* NOTREACHED */
1398 }
1399 
1400 
1401 void
vm_swapout_iodone(void * io_context,int error)1402 vm_swapout_iodone(void *io_context, int error)
1403 {
1404 	struct swapout_io_completion *soc;
1405 
1406 	soc = (struct swapout_io_completion *)io_context;
1407 
1408 	lck_mtx_lock_spin_always(c_list_lock);
1409 
1410 	soc->swp_io_done = 1;
1411 	soc->swp_io_error = error;
1412 	vm_swapout_soc_done++;
1413 
1414 	if (!vm_swapout_thread_running) {
1415 		thread_wakeup((event_t)&vm_swapout_thread);
1416 	}
1417 
1418 	lck_mtx_unlock_always(c_list_lock);
1419 }
1420 
1421 
1422 static void
vm_swapout_finish(c_segment_t c_seg,uint64_t f_offset,uint32_t size,kern_return_t kr)1423 vm_swapout_finish(c_segment_t c_seg, uint64_t f_offset, uint32_t size, kern_return_t kr)
1424 {
1425 	PAGE_REPLACEMENT_DISALLOWED(TRUE);
1426 
1427 	if (kr == KERN_SUCCESS) {
1428 		kernel_memory_depopulate((vm_offset_t)c_seg->c_store.c_buffer, size,
1429 		    KMA_COMPRESSOR, VM_KERN_MEMORY_COMPRESSOR);
1430 	}
1431 #if ENCRYPTED_SWAP
1432 	else {
1433 		vm_swap_decrypt(c_seg, false);
1434 	}
1435 #endif /* ENCRYPTED_SWAP */
1436 	lck_mtx_lock_spin_always(c_list_lock);
1437 	lck_mtx_lock_spin_always(&c_seg->c_lock);
1438 
1439 	if (kr == KERN_SUCCESS) {
1440 		int             new_state = C_ON_SWAPPEDOUT_Q;
1441 		boolean_t       insert_head = FALSE;
1442 
1443 		if (hibernate_flushing == TRUE) {
1444 			if (c_seg->c_generation_id >= first_c_segment_to_warm_generation_id &&
1445 			    c_seg->c_generation_id <= last_c_segment_to_warm_generation_id) {
1446 				insert_head = TRUE;
1447 			}
1448 		} else if (C_SEG_ONDISK_IS_SPARSE(c_seg)) {
1449 			new_state = C_ON_SWAPPEDOUTSPARSE_Q;
1450 		}
1451 
1452 		c_seg_switch_state(c_seg, new_state, insert_head);
1453 
1454 		c_seg->c_store.c_swap_handle = f_offset;
1455 
1456 		counter_add(&vm_statistics_swapouts, size >> PAGE_SHIFT);
1457 		__assert_only unsigned int new_swapped_count = os_atomic_add(
1458 			&vm_page_swapped_count, c_seg->c_slots_used, relaxed);
1459 		/* Detect overflow */
1460 		assert3u(new_swapped_count, >=, c_seg->c_slots_used);
1461 
1462 		c_seg->c_swappedin = false;
1463 
1464 		if (c_seg->c_bytes_used) {
1465 			os_atomic_sub(&compressor_bytes_used, c_seg->c_bytes_used, relaxed);
1466 		}
1467 
1468 #if CONFIG_FREEZE
1469 		/*
1470 		 * Successful swapout. Decrement the in-core compressed pages count.
1471 		 */
1472 		os_atomic_sub(&c_segment_pages_compressed_incore, c_seg->c_slots_used, relaxed);
1473 		assertf(c_segment_pages_compressed_incore >= 0, "-ve incore count %p 0x%x", c_seg, c_segment_pages_compressed_incore);
1474 		if (c_seg->c_has_donated_pages) {
1475 			os_atomic_sub(&c_segment_pages_compressed_incore_late_swapout, (c_seg->c_slots_used), relaxed);
1476 		}
1477 #endif /* CONFIG_FREEZE */
1478 	} else {
1479 		if (c_seg->c_overage_swap == TRUE) {
1480 			c_seg->c_overage_swap = FALSE;
1481 			c_overage_swapped_count--;
1482 		}
1483 
1484 #if CONFIG_FREEZE
1485 		if (c_seg->c_has_freezer_pages) {
1486 			if (c_seg->c_task_owner) {
1487 				c_seg_update_task_owner(c_seg, NULL);
1488 			}
1489 			/*
1490 			 * We failed to swapout a frozen cseg. We need
1491 			 * to put it back in the queues, specifically the
1492 			 * AGE_Q. So clear the donated bit otherwise it'll
1493 			 * land on the swapped_in Q.
1494 			 */
1495 			c_seg->c_has_donated_pages = 0;
1496 			c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
1497 		} else
1498 #endif /* CONFIG_FREEZE */
1499 		{
1500 			if (c_seg->c_has_donated_pages) {
1501 				c_seg_switch_state(c_seg, C_ON_SWAPPEDIN_Q, FALSE);
1502 			} else {
1503 				c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
1504 			}
1505 		}
1506 
1507 		if (!c_seg->c_on_minorcompact_q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) {
1508 			c_seg_need_delayed_compaction(c_seg, TRUE);
1509 		}
1510 	}
1511 	assert(c_seg->c_busy_swapping);
1512 	assert(c_seg->c_busy);
1513 
1514 	c_seg->c_busy_swapping = 0;
1515 	lck_mtx_unlock_always(c_list_lock);
1516 
1517 	C_SEG_WAKEUP_DONE(c_seg);
1518 	lck_mtx_unlock_always(&c_seg->c_lock);
1519 
1520 	PAGE_REPLACEMENT_DISALLOWED(FALSE);
1521 }
1522 
1523 
1524 boolean_t
vm_swap_create_file()1525 vm_swap_create_file()
1526 {
1527 	uint64_t        size = 0;
1528 	int             namelen = 0;
1529 	boolean_t       swap_file_created = FALSE;
1530 	boolean_t       swap_file_reuse = FALSE;
1531 	boolean_t       swap_file_pin = FALSE;
1532 	struct swapfile *swf = NULL;
1533 
1534 	/*
1535 	 * make sure we've got all the info we need
1536 	 * to potentially pin a swap file... we could
1537 	 * be swapping out due to hibernation w/o ever
1538 	 * having run vm_pageout_scan, which is normally
1539 	 * the trigger to do the init
1540 	 */
1541 	vm_compaction_swapper_do_init();
1542 
1543 	/*
1544 	 * Any swapfile structure ready for re-use?
1545 	 */
1546 
1547 	lck_mtx_lock(&vm_swap_data_lock);
1548 
1549 	swf = (struct swapfile*) queue_first(&swf_global_queue);
1550 
1551 	while (queue_end(&swf_global_queue, (queue_entry_t)swf) == FALSE) {
1552 		if (swf->swp_flags == SWAP_REUSE) {
1553 			swap_file_reuse = TRUE;
1554 			break;
1555 		}
1556 		swf = (struct swapfile*) queue_next(&swf->swp_queue);
1557 	}
1558 
1559 	lck_mtx_unlock(&vm_swap_data_lock);
1560 
1561 	if (swap_file_reuse == FALSE) {
1562 		namelen = (int)strlen(swapfilename) + SWAPFILENAME_INDEX_LEN + 1;
1563 
1564 		swf = kalloc_type(struct swapfile, Z_WAITOK | Z_ZERO);
1565 		swf->swp_index = vm_num_swap_files + 1;
1566 		swf->swp_pathlen = namelen;
1567 		swf->swp_path = kalloc_data(swf->swp_pathlen, Z_WAITOK | Z_ZERO);
1568 
1569 		snprintf(swf->swp_path, namelen, "%s%d", swapfilename, vm_num_swap_files);
1570 	}
1571 
1572 	vm_swapfile_open(swf->swp_path, &swf->swp_vp);
1573 
1574 	if (swf->swp_vp == NULL) {
1575 		if (swap_file_reuse == FALSE) {
1576 			kfree_data(swf->swp_path, swf->swp_pathlen);
1577 			kfree_type(struct swapfile, swf);
1578 		}
1579 		return FALSE;
1580 	}
1581 	vm_swapfile_can_be_created = true;
1582 
1583 	size = MAX_SWAP_FILE_SIZE;
1584 
1585 	while (size >= MIN_SWAP_FILE_SIZE) {
1586 		swap_file_pin = VM_SWAP_SHOULD_PIN(size);
1587 
1588 		if (vm_swapfile_preallocate(swf->swp_vp, &size, &swap_file_pin) == 0) {
1589 			int num_bytes_for_bitmap = 0;
1590 
1591 			swap_file_created = TRUE;
1592 
1593 			swf->swp_size = size;
1594 			swf->swp_nsegs = (unsigned int) (size / compressed_swap_chunk_size);
1595 			swf->swp_nseginuse = 0;
1596 			swf->swp_free_hint = 0;
1597 
1598 			num_bytes_for_bitmap = MAX((swf->swp_nsegs >> 3), 1);
1599 			/*
1600 			 * Allocate a bitmap that describes the
1601 			 * number of segments held by this swapfile.
1602 			 */
1603 			swf->swp_bitmap = kalloc_data(num_bytes_for_bitmap,
1604 			    Z_WAITOK | Z_ZERO);
1605 
1606 			swf->swp_csegs = kalloc_type(c_segment_t, swf->swp_nsegs,
1607 			    Z_WAITOK | Z_ZERO);
1608 
1609 			/*
1610 			 * passing a NULL trim_list into vnode_trim_list
1611 			 * will return ENOTSUP if trim isn't supported
1612 			 * and 0 if it is
1613 			 */
1614 			if (vnode_trim_list(swf->swp_vp, NULL, FALSE) == 0) {
1615 				swp_trim_supported = TRUE;
1616 			}
1617 
1618 			lck_mtx_lock(&vm_swap_data_lock);
1619 
1620 			swf->swp_flags = SWAP_READY;
1621 
1622 			if (swap_file_reuse == FALSE) {
1623 				queue_enter(&swf_global_queue, swf, struct swapfile*, swp_queue);
1624 			}
1625 
1626 			vm_num_swap_files++;
1627 
1628 			vm_swapfile_total_segs_alloced += swf->swp_nsegs;
1629 			if (vm_swapfile_total_segs_alloced > vm_swapfile_total_segs_alloced_max) {
1630 				vm_swapfile_total_segs_alloced_max = vm_swapfile_total_segs_alloced;
1631 			}
1632 
1633 			if (swap_file_pin == TRUE) {
1634 				vm_num_pinned_swap_files++;
1635 				swf->swp_flags |= SWAP_PINNED;
1636 				vm_swappin_avail -= swf->swp_size;
1637 			}
1638 
1639 			lck_mtx_unlock(&vm_swap_data_lock);
1640 
1641 			thread_wakeup((event_t) &vm_num_swap_files);
1642 #if !XNU_TARGET_OS_OSX
1643 			if (vm_num_swap_files == 1) {
1644 				c_overage_swapped_limit = (uint32_t)size / c_seg_bufsize;
1645 
1646 				if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
1647 					c_overage_swapped_limit /= 2;
1648 				}
1649 			}
1650 #endif /* !XNU_TARGET_OS_OSX */
1651 			break;
1652 		} else {
1653 			size = size / 2;
1654 		}
1655 	}
1656 	if (swap_file_created == FALSE) {
1657 		vm_swapfile_close((uint64_t)(swf->swp_path), swf->swp_vp);
1658 
1659 		swf->swp_vp = NULL;
1660 
1661 		if (swap_file_reuse == FALSE) {
1662 			kfree_data(swf->swp_path, swf->swp_pathlen);
1663 			kfree_type(struct swapfile, swf);
1664 		}
1665 	}
1666 	return swap_file_created;
1667 }
1668 
1669 extern void vnode_put(struct vnode* vp);
1670 kern_return_t
vm_swap_get(c_segment_t c_seg,uint64_t f_offset,uint64_t size)1671 vm_swap_get(c_segment_t c_seg, uint64_t f_offset, uint64_t size)
1672 {
1673 	struct swapfile *swf = NULL;
1674 	uint64_t        file_offset = 0;
1675 	int             retval = 0;
1676 
1677 	assert(c_seg->c_store.c_buffer);
1678 
1679 	lck_mtx_lock(&vm_swap_data_lock);
1680 
1681 	swf = vm_swapfile_for_handle(f_offset);
1682 
1683 	if (swf == NULL || (!(swf->swp_flags & SWAP_READY) && !(swf->swp_flags & SWAP_RECLAIM))) {
1684 		vm_swap_get_failures++;
1685 		retval = 1;
1686 		goto done;
1687 	}
1688 	swf->swp_io_count++;
1689 
1690 	lck_mtx_unlock(&vm_swap_data_lock);
1691 
1692 #if DEVELOPMENT || DEBUG
1693 	C_SEG_MAKE_WRITEABLE(c_seg);
1694 #endif
1695 	file_offset = (f_offset & SWAP_SLOT_MASK);
1696 
1697 	if ((retval = vnode_getwithref(swf->swp_vp)) != 0) {
1698 		printf("vm_swap_get: vnode_getwithref on swapfile failed with %d\n", retval);
1699 	} else {
1700 		retval = vm_swapfile_io(swf->swp_vp, file_offset, (uint64_t)c_seg->c_store.c_buffer, (int)(size / PAGE_SIZE_64), SWAP_READ, NULL);
1701 		vnode_put(swf->swp_vp);
1702 	}
1703 
1704 #if DEVELOPMENT || DEBUG
1705 	C_SEG_WRITE_PROTECT(c_seg);
1706 #endif
1707 	if (retval == 0) {
1708 		counter_add(&vm_statistics_swapins, size >> PAGE_SHIFT);
1709 	} else {
1710 		vm_swap_get_failures++;
1711 	}
1712 
1713 	/*
1714 	 * Free this slot in the swap structure.
1715 	 */
1716 	vm_swap_free(f_offset);
1717 
1718 	lck_mtx_lock(&vm_swap_data_lock);
1719 	swf->swp_io_count--;
1720 
1721 	if ((swf->swp_flags & SWAP_WANTED) && swf->swp_io_count == 0) {
1722 		swf->swp_flags &= ~SWAP_WANTED;
1723 		thread_wakeup((event_t) &swf->swp_flags);
1724 	}
1725 done:
1726 	lck_mtx_unlock(&vm_swap_data_lock);
1727 
1728 	if (retval == 0) {
1729 		return KERN_SUCCESS;
1730 	} else {
1731 		return KERN_FAILURE;
1732 	}
1733 }
1734 
1735 kern_return_t
vm_swap_put(vm_offset_t addr,uint64_t * f_offset,uint32_t size,c_segment_t c_seg,struct swapout_io_completion * soc)1736 vm_swap_put(vm_offset_t addr, uint64_t *f_offset, uint32_t size, c_segment_t c_seg, struct swapout_io_completion *soc)
1737 {
1738 	unsigned int    segidx = 0;
1739 	struct swapfile *swf = NULL;
1740 	uint64_t        file_offset = 0;
1741 	uint64_t        swapfile_index = 0;
1742 	unsigned int    byte_for_segidx = 0;
1743 	unsigned int    offset_within_byte = 0;
1744 	boolean_t       swf_eligible = FALSE;
1745 	boolean_t       waiting = FALSE;
1746 	boolean_t       retried = FALSE;
1747 	int             error = 0;
1748 	uint64_t        now;
1749 	void            *upl_ctx = NULL;
1750 	boolean_t       drop_iocount = FALSE;
1751 
1752 	if (addr == 0 || f_offset == NULL || compressor_store_stop_compaction) {
1753 		return KERN_FAILURE;
1754 	}
1755 retry:
1756 	lck_mtx_lock(&vm_swap_data_lock);
1757 
1758 	swf = (struct swapfile*) queue_first(&swf_global_queue);
1759 
1760 	while (queue_end(&swf_global_queue, (queue_entry_t)swf) == FALSE) {
1761 		segidx = swf->swp_free_hint;
1762 
1763 		swf_eligible =  (swf->swp_flags & SWAP_READY) && (swf->swp_nseginuse < swf->swp_nsegs);
1764 
1765 		if (swf_eligible) {
1766 			while (segidx < swf->swp_nsegs) {
1767 				byte_for_segidx = segidx >> 3;
1768 				offset_within_byte = segidx % 8;
1769 
1770 				if ((swf->swp_bitmap)[byte_for_segidx] & (1 << offset_within_byte)) {
1771 					segidx++;
1772 					continue;
1773 				}
1774 
1775 				(swf->swp_bitmap)[byte_for_segidx] |= (uint8_t)(1 << offset_within_byte);
1776 
1777 				file_offset = segidx * compressed_swap_chunk_size;
1778 				swf->swp_nseginuse++;
1779 				swf->swp_io_count++;
1780 				swf->swp_csegs[segidx] = c_seg;
1781 
1782 				swapfile_index = swf->swp_index;
1783 				vm_swapfile_total_segs_used++;
1784 				if (vm_swapfile_total_segs_used > vm_swapfile_total_segs_used_max) {
1785 					vm_swapfile_total_segs_used_max = vm_swapfile_total_segs_used;
1786 				}
1787 
1788 				now = mach_absolute_time();
1789 
1790 				if (vm_swapfile_should_create(now) && !vm_swapfile_create_thread_running) {
1791 					thread_wakeup((event_t) &vm_swapfile_create_needed);
1792 				}
1793 
1794 				lck_mtx_unlock(&vm_swap_data_lock);
1795 
1796 				goto issue_io;
1797 			}
1798 		}
1799 		swf = (struct swapfile*) queue_next(&swf->swp_queue);
1800 	}
1801 	assert(queue_end(&swf_global_queue, (queue_entry_t) swf));
1802 
1803 	/*
1804 	 * we've run out of swap segments, but may not
1805 	 * be in a position to immediately create a new swap
1806 	 * file if we've recently failed to create due to a lack
1807 	 * of free space in the root filesystem... we'll try
1808 	 * to kick that create off, but in any event we're going
1809 	 * to take a breather (up to 1 second) so that we're not caught in a tight
1810 	 * loop back in "vm_compressor_compact_and_swap" trying to stuff
1811 	 * segments into swap files only to have them immediately put back
1812 	 * on the c_age queue due to vm_swap_put failing.
1813 	 *
1814 	 * if we're doing these puts due to a hibernation flush,
1815 	 * no need to block... setting hibernate_no_swapspace to TRUE,
1816 	 * will cause "vm_compressor_compact_and_swap" to immediately abort
1817 	 */
1818 	now = mach_absolute_time();
1819 
1820 	if (vm_swapfile_should_create(now)) {
1821 		if (!vm_swapfile_create_thread_running) {
1822 			thread_wakeup((event_t) &vm_swapfile_create_needed);
1823 		}
1824 		waiting = TRUE;
1825 		assert_wait_timeout((event_t) &vm_num_swap_files, THREAD_INTERRUPTIBLE, 1000, 1000 * NSEC_PER_USEC);
1826 	} else {
1827 		if (hibernate_flushing) {
1828 			hibernate_no_swapspace = TRUE;
1829 		}
1830 	}
1831 
1832 	lck_mtx_unlock(&vm_swap_data_lock);
1833 
1834 	if (waiting == TRUE) {
1835 		thread_block(THREAD_CONTINUE_NULL);
1836 
1837 		if (retried == FALSE && hibernate_flushing == TRUE) {
1838 			retried = TRUE;
1839 			goto retry;
1840 		}
1841 	}
1842 	vm_swap_put_failures_no_swap_file++;
1843 
1844 	return KERN_FAILURE;
1845 
1846 issue_io:
1847 	assert(c_seg->c_busy_swapping);
1848 	assert(c_seg->c_busy);
1849 	assert(!c_seg->c_on_minorcompact_q);
1850 
1851 	*f_offset = (swapfile_index << SWAP_DEVICE_SHIFT) | file_offset;
1852 
1853 	if (soc) {
1854 		soc->swp_c_seg = c_seg;
1855 		soc->swp_c_size = size;
1856 
1857 		soc->swp_swf = swf;
1858 
1859 		soc->swp_io_error = 0;
1860 		soc->swp_io_done = 0;
1861 
1862 		upl_ctx = (void *)&soc->swp_upl_ctx;
1863 	}
1864 
1865 	if ((error = vnode_getwithref(swf->swp_vp)) != 0) {
1866 		printf("vm_swap_put: vnode_getwithref on swapfile failed with %d\n", error);
1867 	} else {
1868 		error = vm_swapfile_io(swf->swp_vp, file_offset, addr, (int) (size / PAGE_SIZE_64), SWAP_WRITE, upl_ctx);
1869 		drop_iocount = TRUE;
1870 	}
1871 
1872 	if (error || upl_ctx == NULL) {
1873 		return vm_swap_put_finish(swf, f_offset, error, drop_iocount);
1874 	}
1875 
1876 	return KERN_SUCCESS;
1877 }
1878 
1879 kern_return_t
vm_swap_put_finish(struct swapfile * swf,uint64_t * f_offset,int error,boolean_t drop_iocount)1880 vm_swap_put_finish(struct swapfile *swf, uint64_t *f_offset, int error, boolean_t drop_iocount)
1881 {
1882 	if (drop_iocount) {
1883 		vnode_put(swf->swp_vp);
1884 	}
1885 
1886 	lck_mtx_lock(&vm_swap_data_lock);
1887 
1888 	swf->swp_io_count--;
1889 
1890 	if ((swf->swp_flags & SWAP_WANTED) && swf->swp_io_count == 0) {
1891 		swf->swp_flags &= ~SWAP_WANTED;
1892 		thread_wakeup((event_t) &swf->swp_flags);
1893 	}
1894 	lck_mtx_unlock(&vm_swap_data_lock);
1895 
1896 	if (error) {
1897 		vm_swap_free(*f_offset);
1898 		vm_swap_put_failures++;
1899 
1900 		return KERN_FAILURE;
1901 	}
1902 	return KERN_SUCCESS;
1903 }
1904 
1905 
1906 static void
vm_swap_free_now(struct swapfile * swf,uint64_t f_offset)1907 vm_swap_free_now(struct swapfile *swf, uint64_t f_offset)
1908 {
1909 	uint64_t        file_offset = 0;
1910 	unsigned int    segidx = 0;
1911 
1912 
1913 	if ((swf->swp_flags & SWAP_READY) || (swf->swp_flags & SWAP_RECLAIM)) {
1914 		unsigned int byte_for_segidx = 0;
1915 		unsigned int offset_within_byte = 0;
1916 
1917 		file_offset = (f_offset & SWAP_SLOT_MASK);
1918 		segidx = (unsigned int) (file_offset / compressed_swap_chunk_size);
1919 
1920 		byte_for_segidx = segidx >> 3;
1921 		offset_within_byte = segidx % 8;
1922 
1923 		if ((swf->swp_bitmap)[byte_for_segidx] & (1 << offset_within_byte)) {
1924 			(swf->swp_bitmap)[byte_for_segidx] &= ~(1 << offset_within_byte);
1925 
1926 			swf->swp_csegs[segidx] = NULL;
1927 
1928 			swf->swp_nseginuse--;
1929 			vm_swapfile_total_segs_used--;
1930 
1931 			if (segidx < swf->swp_free_hint) {
1932 				swf->swp_free_hint = segidx;
1933 			}
1934 		}
1935 		if (VM_SWAP_SHOULD_RECLAIM() && !vm_swapfile_gc_thread_running) {
1936 			thread_wakeup((event_t) &vm_swapfile_gc_needed);
1937 		}
1938 	}
1939 }
1940 
1941 
1942 uint32_t vm_swap_free_now_count = 0;
1943 uint32_t vm_swap_free_delayed_count = 0;
1944 
1945 
1946 void
vm_swap_free(uint64_t f_offset)1947 vm_swap_free(uint64_t f_offset)
1948 {
1949 	struct swapfile *swf = NULL;
1950 	struct trim_list *tl = NULL;
1951 	uint64_t now;
1952 
1953 	if (swp_trim_supported == TRUE) {
1954 		tl = kalloc_type(struct trim_list, Z_WAITOK);
1955 	}
1956 
1957 	lck_mtx_lock(&vm_swap_data_lock);
1958 
1959 	swf = vm_swapfile_for_handle(f_offset);
1960 
1961 	if (swf && (swf->swp_flags & (SWAP_READY | SWAP_RECLAIM))) {
1962 		if (swp_trim_supported == FALSE || (swf->swp_flags & SWAP_RECLAIM)) {
1963 			/*
1964 			 * don't delay the free if the underlying disk doesn't support
1965 			 * trim, or we're in the midst of reclaiming this swap file since
1966 			 * we don't want to move segments that are technically free
1967 			 * but not yet handled by the delayed free mechanism
1968 			 */
1969 			vm_swap_free_now(swf, f_offset);
1970 
1971 			vm_swap_free_now_count++;
1972 			goto done;
1973 		}
1974 		tl->tl_offset = f_offset & SWAP_SLOT_MASK;
1975 		tl->tl_length = compressed_swap_chunk_size;
1976 
1977 		tl->tl_next = swf->swp_delayed_trim_list_head;
1978 		swf->swp_delayed_trim_list_head = tl;
1979 		swf->swp_delayed_trim_count++;
1980 		tl = NULL;
1981 
1982 		if (VM_SWAP_SHOULD_TRIM(swf) && !vm_swapfile_create_thread_running) {
1983 			now = mach_absolute_time();
1984 
1985 			if (now > dont_trim_until_ts) {
1986 				thread_wakeup((event_t) &vm_swapfile_create_needed);
1987 			}
1988 		}
1989 		vm_swap_free_delayed_count++;
1990 	}
1991 done:
1992 	lck_mtx_unlock(&vm_swap_data_lock);
1993 
1994 	if (tl != NULL) {
1995 		kfree_type(struct trim_list, tl);
1996 	}
1997 }
1998 
1999 
2000 static void
vm_swap_wait_on_trim_handling_in_progress()2001 vm_swap_wait_on_trim_handling_in_progress()
2002 {
2003 	while (delayed_trim_handling_in_progress) {
2004 		assert_wait((event_t) &delayed_trim_handling_in_progress, THREAD_UNINT);
2005 		lck_mtx_unlock(&vm_swap_data_lock);
2006 
2007 		thread_block(THREAD_CONTINUE_NULL);
2008 
2009 		lck_mtx_lock(&vm_swap_data_lock);
2010 	}
2011 }
2012 
2013 
2014 static void
vm_swap_handle_delayed_trims(boolean_t force_now)2015 vm_swap_handle_delayed_trims(boolean_t force_now)
2016 {
2017 	struct swapfile *swf = NULL;
2018 
2019 	/*
2020 	 * serialize the race between us and vm_swap_reclaim...
2021 	 * if vm_swap_reclaim wins it will turn off SWAP_READY
2022 	 * on the victim it has chosen... we can just skip over
2023 	 * that file since vm_swap_reclaim will first process
2024 	 * all of the delayed trims associated with it
2025 	 */
2026 
2027 	if (compressor_store_stop_compaction == TRUE) {
2028 		return;
2029 	}
2030 
2031 	lck_mtx_lock(&vm_swap_data_lock);
2032 
2033 	delayed_trim_handling_in_progress = true;
2034 
2035 	lck_mtx_unlock(&vm_swap_data_lock);
2036 
2037 	/*
2038 	 * no need to hold the lock to walk the swf list since
2039 	 * vm_swap_create (the only place where we add to this list)
2040 	 * is run on the same thread as this function
2041 	 * and vm_swap_reclaim doesn't remove items from this list
2042 	 * instead marking them with SWAP_REUSE for future re-use
2043 	 */
2044 	swf = (struct swapfile*) queue_first(&swf_global_queue);
2045 
2046 	while (queue_end(&swf_global_queue, (queue_entry_t)swf) == FALSE) {
2047 		if ((swf->swp_flags & SWAP_READY) && (force_now == TRUE || VM_SWAP_SHOULD_TRIM(swf))) {
2048 			assert(!(swf->swp_flags & SWAP_RECLAIM));
2049 			vm_swap_do_delayed_trim(swf);
2050 		}
2051 		swf = (struct swapfile*) queue_next(&swf->swp_queue);
2052 	}
2053 	lck_mtx_lock(&vm_swap_data_lock);
2054 
2055 	delayed_trim_handling_in_progress = false;
2056 	thread_wakeup((event_t) &delayed_trim_handling_in_progress);
2057 
2058 	if (VM_SWAP_SHOULD_RECLAIM() && !vm_swapfile_gc_thread_running) {
2059 		thread_wakeup((event_t) &vm_swapfile_gc_needed);
2060 	}
2061 
2062 	lck_mtx_unlock(&vm_swap_data_lock);
2063 }
2064 
2065 static void
vm_swap_do_delayed_trim(struct swapfile * swf)2066 vm_swap_do_delayed_trim(struct swapfile *swf)
2067 {
2068 	struct trim_list *tl, *tl_head;
2069 	int error;
2070 
2071 	if (compressor_store_stop_compaction == TRUE) {
2072 		return;
2073 	}
2074 
2075 	if ((error = vnode_getwithref(swf->swp_vp)) != 0) {
2076 		printf("vm_swap_do_delayed_trim: vnode_getwithref on swapfile failed with %d\n", error);
2077 		return;
2078 	}
2079 
2080 	lck_mtx_lock(&vm_swap_data_lock);
2081 
2082 	tl_head = swf->swp_delayed_trim_list_head;
2083 	swf->swp_delayed_trim_list_head = NULL;
2084 	swf->swp_delayed_trim_count = 0;
2085 
2086 	lck_mtx_unlock(&vm_swap_data_lock);
2087 
2088 	vnode_trim_list(swf->swp_vp, tl_head, TRUE);
2089 
2090 	(void) vnode_put(swf->swp_vp);
2091 
2092 	while ((tl = tl_head) != NULL) {
2093 		unsigned int    segidx = 0;
2094 		unsigned int    byte_for_segidx = 0;
2095 		unsigned int    offset_within_byte = 0;
2096 
2097 		lck_mtx_lock(&vm_swap_data_lock);
2098 
2099 		segidx = (unsigned int) (tl->tl_offset / compressed_swap_chunk_size);
2100 
2101 		byte_for_segidx = segidx >> 3;
2102 		offset_within_byte = segidx % 8;
2103 
2104 		if ((swf->swp_bitmap)[byte_for_segidx] & (1 << offset_within_byte)) {
2105 			(swf->swp_bitmap)[byte_for_segidx] &= ~(1 << offset_within_byte);
2106 
2107 			swf->swp_csegs[segidx] = NULL;
2108 
2109 			swf->swp_nseginuse--;
2110 			vm_swapfile_total_segs_used--;
2111 
2112 			if (segidx < swf->swp_free_hint) {
2113 				swf->swp_free_hint = segidx;
2114 			}
2115 		}
2116 		lck_mtx_unlock(&vm_swap_data_lock);
2117 
2118 		tl_head = tl->tl_next;
2119 
2120 		kfree_type(struct trim_list, tl);
2121 	}
2122 }
2123 
2124 
/*
 * No-op: the body performs no work.  Takes no arguments and returns
 * nothing.
 */
void
vm_swap_flush(void)
{
}
2130 
2131 int     vm_swap_reclaim_yielded = 0;
2132 
2133 void
vm_swap_reclaim(void)2134 vm_swap_reclaim(void)
2135 {
2136 	vm_offset_t     addr = 0;
2137 	unsigned int    segidx = 0;
2138 	uint64_t        f_offset = 0;
2139 	struct swapfile *swf = NULL;
2140 	struct swapfile *smallest_swf = NULL;
2141 	unsigned int    min_nsegs = 0;
2142 	unsigned int    byte_for_segidx = 0;
2143 	unsigned int    offset_within_byte = 0;
2144 	uint32_t        c_size = 0;
2145 
2146 	c_segment_t     c_seg = NULL;
2147 
2148 	kmem_alloc(compressor_map, (vm_offset_t *)&addr, c_seg_bufsize,
2149 	    KMA_NOFAIL | KMA_KOBJECT | KMA_DATA_SHARED, VM_KERN_MEMORY_COMPRESSOR);
2150 
2151 	lck_mtx_lock(&vm_swap_data_lock);
2152 
2153 	/*
2154 	 * if we're running the swapfile list looking for
2155 	 * candidates with delayed trims, we need to
2156 	 * wait before making our decision concerning
2157 	 * the swapfile we want to reclaim
2158 	 */
2159 	vm_swap_wait_on_trim_handling_in_progress();
2160 
2161 	/*
2162 	 * from here until we knock down the SWAP_READY bit,
2163 	 * we need to remain behind the vm_swap_data_lock...
2164 	 * once that bit has been turned off, "vm_swap_handle_delayed_trims"
2165 	 * will not consider this swapfile for processing
2166 	 */
2167 	swf = (struct swapfile*) queue_first(&swf_global_queue);
2168 	min_nsegs = MAX_SWAP_FILE_SIZE / compressed_swap_chunk_size;
2169 	smallest_swf = NULL;
2170 
2171 	while (queue_end(&swf_global_queue, (queue_entry_t)swf) == FALSE) {
2172 		if ((swf->swp_flags & SWAP_READY) && (swf->swp_nseginuse <= min_nsegs)) {
2173 			smallest_swf = swf;
2174 			min_nsegs = swf->swp_nseginuse;
2175 		}
2176 		swf = (struct swapfile*) queue_next(&swf->swp_queue);
2177 	}
2178 
2179 	if (smallest_swf == NULL) {
2180 		goto done;
2181 	}
2182 
2183 	swf = smallest_swf;
2184 
2185 
2186 	swf->swp_flags &= ~SWAP_READY;
2187 	swf->swp_flags |= SWAP_RECLAIM;
2188 
2189 	if (swf->swp_delayed_trim_count) {
2190 		lck_mtx_unlock(&vm_swap_data_lock);
2191 
2192 		vm_swap_do_delayed_trim(swf);
2193 
2194 		lck_mtx_lock(&vm_swap_data_lock);
2195 	}
2196 	segidx = 0;
2197 
2198 	while (segidx < swf->swp_nsegs) {
2199 ReTry_for_cseg:
2200 		/*
2201 		 * Wait for outgoing I/Os.
2202 		 */
2203 		while (swf->swp_io_count) {
2204 			swf->swp_flags |= SWAP_WANTED;
2205 
2206 			assert_wait((event_t) &swf->swp_flags, THREAD_UNINT);
2207 			lck_mtx_unlock(&vm_swap_data_lock);
2208 
2209 			thread_block(THREAD_CONTINUE_NULL);
2210 
2211 			lck_mtx_lock(&vm_swap_data_lock);
2212 		}
2213 		if (compressor_store_stop_compaction == TRUE || VM_SWAP_SHOULD_ABORT_RECLAIM() || VM_SWAP_BUSY()) {
2214 			vm_swap_reclaim_yielded++;
2215 			break;
2216 		}
2217 
2218 		byte_for_segidx = segidx >> 3;
2219 		offset_within_byte = segidx % 8;
2220 
2221 		if (((swf->swp_bitmap)[byte_for_segidx] & (1 << offset_within_byte)) == 0) {
2222 			segidx++;
2223 			continue;
2224 		}
2225 
2226 		c_seg = swf->swp_csegs[segidx];
2227 		assert(c_seg);
2228 
2229 		lck_mtx_lock_spin_always(&c_seg->c_lock);
2230 
2231 		if (c_seg->c_busy) {
2232 			/*
2233 			 * a swapped out c_segment in the process of being freed will remain in the
2234 			 * busy state until after the vm_swap_free is called on it... vm_swap_free
2235 			 * takes the vm_swap_data_lock, so can't change the swap state until after
2236 			 * we drop the vm_swap_data_lock... once we do, vm_swap_free will complete
2237 			 * which will allow c_seg_free_locked to clear busy and wake up this thread...
2238 			 * at that point, we re-look up the swap state which will now indicate that
2239 			 * this c_segment no longer exists.
2240 			 */
2241 			c_seg->c_wanted = 1;
2242 
2243 			assert_wait((event_t) (c_seg), THREAD_UNINT);
2244 			lck_mtx_unlock_always(&c_seg->c_lock);
2245 
2246 			lck_mtx_unlock(&vm_swap_data_lock);
2247 
2248 			thread_block(THREAD_CONTINUE_NULL);
2249 
2250 			lck_mtx_lock(&vm_swap_data_lock);
2251 
2252 			goto ReTry_for_cseg;
2253 		}
2254 		(swf->swp_bitmap)[byte_for_segidx] &= ~(1 << offset_within_byte);
2255 
2256 		f_offset = segidx * compressed_swap_chunk_size;
2257 
2258 		assert(c_seg == swf->swp_csegs[segidx]);
2259 		swf->swp_csegs[segidx] = NULL;
2260 		swf->swp_nseginuse--;
2261 
2262 		vm_swapfile_total_segs_used--;
2263 
2264 		lck_mtx_unlock(&vm_swap_data_lock);
2265 
2266 		assert(C_SEG_IS_ONDISK(c_seg));
2267 
2268 		C_SEG_BUSY(c_seg);
2269 		c_seg->c_busy_swapping = 1;
2270 #if !CHECKSUM_THE_SWAP
2271 		c_seg_trim_tail(c_seg);
2272 #endif
2273 		c_size = round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset));
2274 
2275 		assert(c_size <= c_seg_bufsize && c_size);
2276 
2277 		lck_mtx_unlock_always(&c_seg->c_lock);
2278 
2279 		if (vnode_getwithref(swf->swp_vp)) {
2280 			printf("vm_swap_reclaim: vnode_getwithref on swapfile failed.\n");
2281 			vm_swap_get_failures++;
2282 			goto swap_io_failed;
2283 		} else {
2284 			if (vm_swapfile_io(swf->swp_vp, f_offset, addr, (int)(c_size / PAGE_SIZE_64), SWAP_READ, NULL)) {
2285 				/*
2286 				 * reading the data back in failed, so convert c_seg
2287 				 * to a swapped in c_segment that contains no data
2288 				 */
2289 				c_seg_swapin_requeue(c_seg, FALSE, TRUE, FALSE);
2290 				/*
2291 				 * returns with c_busy_swapping cleared
2292 				 */
2293 				vnode_put(swf->swp_vp);
2294 				vm_swap_get_failures++;
2295 				goto swap_io_failed;
2296 			}
2297 			vnode_put(swf->swp_vp);
2298 		}
2299 
2300 		counter_add(&vm_statistics_swapins, c_size >> PAGE_SHIFT);
2301 		vmcs_stats.reclaim_swapins += c_size >> PAGE_SHIFT;
2302 
2303 		if (vm_swap_put(addr, &f_offset, c_size, c_seg, NULL)) {
2304 			vm_offset_t     c_buffer;
2305 
2306 			/*
2307 			 * the put failed, so convert c_seg to a fully swapped in c_segment
2308 			 * with valid data
2309 			 */
2310 			c_buffer = (vm_offset_t)C_SEG_BUFFER_ADDRESS(c_seg->c_mysegno);
2311 
2312 			kernel_memory_populate(c_buffer, c_size,
2313 			    KMA_NOFAIL | KMA_COMPRESSOR,
2314 			    VM_KERN_MEMORY_COMPRESSOR);
2315 
2316 			memcpy((char *)c_buffer, (char *)addr, c_size);
2317 
2318 			c_seg->c_store.c_buffer = (int32_t *)c_buffer;
2319 #if ENCRYPTED_SWAP
2320 			vm_swap_decrypt(c_seg, true);
2321 #endif /* ENCRYPTED_SWAP */
2322 			c_seg_swapin_requeue(c_seg, TRUE, TRUE, FALSE);
2323 			/*
2324 			 * returns with c_busy_swapping cleared
2325 			 */
2326 			OSAddAtomic64(c_seg->c_bytes_used, &compressor_bytes_used);
2327 
2328 			goto swap_io_failed;
2329 		}
2330 		counter_add(&vm_statistics_swapouts, c_size >> PAGE_SHIFT);
2331 
2332 		lck_mtx_lock_spin_always(&c_seg->c_lock);
2333 
2334 		c_seg->c_swappedin = false;
2335 
2336 		assert(C_SEG_IS_ONDISK(c_seg));
2337 		/*
2338 		 * The c_seg will now know about the new location on disk.
2339 		 */
2340 		c_seg->c_store.c_swap_handle = f_offset;
2341 
2342 		assert(c_seg->c_busy_swapping);
2343 		c_seg->c_busy_swapping = 0;
2344 swap_io_failed:
2345 		assert(c_seg->c_busy);
2346 		C_SEG_WAKEUP_DONE(c_seg);
2347 
2348 		lck_mtx_unlock_always(&c_seg->c_lock);
2349 		lck_mtx_lock(&vm_swap_data_lock);
2350 	}
2351 
2352 	if (swf->swp_nseginuse) {
2353 		swf->swp_flags &= ~SWAP_RECLAIM;
2354 		swf->swp_flags |= SWAP_READY;
2355 
2356 		goto done;
2357 	}
2358 	/*
2359 	 * We don't remove this inactive swf from the queue.
2360 	 * That way, we can re-use it when needed again and
2361 	 * preserve the namespace. The delayed_trim processing
2362 	 * is also dependent on us not removing swfs from the queue.
2363 	 */
2364 	//queue_remove(&swf_global_queue, swf, struct swapfile*, swp_queue);
2365 
2366 	vm_swapfile_total_segs_alloced -= swf->swp_nsegs;
2367 
2368 	lck_mtx_unlock(&vm_swap_data_lock);
2369 
2370 	vm_swapfile_close((uint64_t)(swf->swp_path), swf->swp_vp);
2371 
2372 	kfree_type(c_segment_t, swf->swp_nsegs, swf->swp_csegs);
2373 	kfree_data(swf->swp_bitmap, MAX((swf->swp_nsegs >> 3), 1));
2374 
2375 	lck_mtx_lock(&vm_swap_data_lock);
2376 
2377 	if (swf->swp_flags & SWAP_PINNED) {
2378 		vm_num_pinned_swap_files--;
2379 		vm_swappin_avail += swf->swp_size;
2380 	}
2381 
2382 	swf->swp_vp = NULL;
2383 	swf->swp_size = 0;
2384 	swf->swp_free_hint = 0;
2385 	swf->swp_nsegs = 0;
2386 	swf->swp_flags = SWAP_REUSE;
2387 
2388 	vm_num_swap_files--;
2389 
2390 done:
2391 	thread_wakeup((event_t) &swf->swp_flags);
2392 	lck_mtx_unlock(&vm_swap_data_lock);
2393 
2394 	kmem_free(compressor_map, (vm_offset_t) addr, c_seg_bufsize);
2395 }
2396 
2397 
2398 uint64_t
vm_swap_get_total_space(void)2399 vm_swap_get_total_space(void)
2400 {
2401 	uint64_t total_space = 0;
2402 
2403 	total_space = (uint64_t)vm_swapfile_total_segs_alloced * compressed_swap_chunk_size;
2404 
2405 	return total_space;
2406 }
2407 
2408 uint64_t
vm_swap_get_used_space(void)2409 vm_swap_get_used_space(void)
2410 {
2411 	uint64_t used_space = 0;
2412 
2413 	used_space = (uint64_t)vm_swapfile_total_segs_used * compressed_swap_chunk_size;
2414 
2415 	return used_space;
2416 }
2417 
/*
 * Bytes of allocated swap space that are not in use
 * (total space minus used space).
 */
uint64_t
vm_swap_get_free_space(void)
{
	uint64_t total_space = vm_swap_get_total_space();
	uint64_t used_space = vm_swap_get_used_space();

	return total_space - used_space;
}
2423 
2424 uint64_t
vm_swap_get_max_configured_space(void)2425 vm_swap_get_max_configured_space(void)
2426 {
2427 	int num_swap_files = (vm_num_swap_files_config ? vm_num_swap_files_config : VM_MAX_SWAP_FILE_NUM);
2428 	return num_swap_files * MAX_SWAP_FILE_SIZE;
2429 }
2430 
2431 bool
vm_swap_low_on_space(void)2432 vm_swap_low_on_space(void)
2433 {
2434 	if (vm_num_swap_files == 0 &&
2435 	    (!vm_swapfile_can_be_created || !SWAPPER_NEEDS_TO_UNTHROTTLE())) {
2436 		/* We haven't started creating swap files yet */
2437 		return false;
2438 	}
2439 
2440 	if (vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used <
2441 	    (unsigned int)vm_swapfile_hiwater_segs / 8) {
2442 		/*
2443 		 * We're running low on swapfile segments
2444 		 */
2445 		if (vm_swapfile_last_failed_to_create_ts >= vm_swapfile_last_successful_create_ts) {
2446 			/*
2447 			 * We've recently failed to create a new swapfile, likely due to disk
2448 			 * space exhaustion
2449 			 */
2450 			return true;
2451 		}
2452 
2453 		if (vm_num_swap_files == vm_num_swap_files_config) {
2454 			/* We've reached the swapfile limit */
2455 			return true;
2456 		}
2457 	}
2458 	return false;
2459 }
2460 
2461 bool
vm_swap_out_of_space(void)2462 vm_swap_out_of_space(void)
2463 {
2464 	if (vm_num_swap_files == 0 &&
2465 	    (!vm_swapfile_can_be_created || !SWAPPER_NEEDS_TO_UNTHROTTLE())) {
2466 		/* We haven't started creating swap files yet */
2467 		return false;
2468 	}
2469 
2470 	if (vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used <
2471 	    VM_SWAPOUT_LIMIT_MAX) {
2472 		/*
2473 		 * We have run out of swapfile segments
2474 		 */
2475 		if (vm_num_swap_files == vm_num_swap_files_config) {
2476 			/* And we can't create any more swapfiles */
2477 			return true;
2478 		}
2479 	}
2480 
2481 	return false;
2482 }
2483 
2484 boolean_t
vm_swap_files_pinned(void)2485 vm_swap_files_pinned(void)
2486 {
2487 	boolean_t result;
2488 
2489 	if (vm_swappin_enabled == FALSE) {
2490 		return TRUE;
2491 	}
2492 
2493 	result = (vm_num_pinned_swap_files == vm_num_swap_files);
2494 
2495 	return result;
2496 }
2497 
#if CONFIG_FREEZE
/*
 * Ask the swap volume for its freeze budget.
 *
 *   freeze_daily_budget  out: filled in by vm_swap_vol_get_budget(); set to
 *                        0 when no swap file exists yet and the temporary
 *                        swap file could not be opened.  Left untouched
 *                        when swap files exist but none is SWAP_READY.
 *
 * Returns TRUE when vm_swap_vol_get_budget() succeeded (returned 0),
 * FALSE otherwise.
 */
boolean_t
vm_swap_max_budget(uint64_t *freeze_daily_budget)
{
	boolean_t       use_device_value = FALSE;
	struct swapfile *swf = NULL;

	if (vm_num_swap_files) {
		lck_mtx_lock(&vm_swap_data_lock);

		swf = (struct swapfile*) queue_first(&swf_global_queue);

		if (swf) {
			while (queue_end(&swf_global_queue, (queue_entry_t)swf) == FALSE) {
				/*
				 * Query the first swap file that is ready.  Test the
				 * SWAP_READY bit rather than comparing the whole flags
				 * word, so that a ready file carrying additional flag
				 * bits (e.g. SWAP_PINNED) is not skipped.
				 */
				if (swf->swp_flags & SWAP_READY) {
					assert(swf->swp_vp);

					if (vm_swap_vol_get_budget(swf->swp_vp, freeze_daily_budget) == 0) {
						use_device_value = TRUE;
					}
					break;
				}
				swf = (struct swapfile*) queue_next(&swf->swp_queue);
			}
		}

		lck_mtx_unlock(&vm_swap_data_lock);
	} else {
		/*
		 * This block is used for the initial budget value before any swap files
		 * are created. We create a temp swap file to get the budget.
		 */

		struct vnode *temp_vp = NULL;

		vm_swapfile_open(swapfilename, &temp_vp);

		if (temp_vp) {
			if (vm_swap_vol_get_budget(temp_vp, freeze_daily_budget) == 0) {
				use_device_value = TRUE;
			}

			vm_swapfile_close((uint64_t)&swapfilename, temp_vp);
			temp_vp = NULL;
		} else {
			*freeze_daily_budget = 0;
		}
	}

	return use_device_value;
}
#endif /* CONFIG_FREEZE */
2550 
2551 void
vm_swap_reset_max_segs_tracking(uint64_t * alloced_max,uint64_t * used_max)2552 vm_swap_reset_max_segs_tracking(uint64_t *alloced_max, uint64_t *used_max)
2553 {
2554 	lck_mtx_lock(&vm_swap_data_lock);
2555 
2556 	*alloced_max = (uint64_t) vm_swapfile_total_segs_alloced_max * compressed_swap_chunk_size;
2557 	*used_max = (uint64_t) vm_swapfile_total_segs_used_max * compressed_swap_chunk_size;
2558 
2559 	vm_swapfile_total_segs_alloced_max = vm_swapfile_total_segs_alloced;
2560 	vm_swapfile_total_segs_used_max = vm_swapfile_total_segs_used;
2561 
2562 	lck_mtx_unlock(&vm_swap_data_lock);
2563 }
2564