xref: /xnu-11215.61.5/osfmk/vm/vm_compressor_backing_store.c (revision 4f1223e81cd707a65cc109d0b8ad6653699da3c4)
1 /*
2  * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include "vm_compressor_backing_store_internal.h"
30 #include <vm/vm_pageout_xnu.h>
31 #include <vm/vm_protos_internal.h>
32 #include <vm/vm_kern_xnu.h>
33 #include <vm/vm_map_xnu.h>
34 #include <vm/vm_compressor_internal.h>
35 #include <vm/vm_iokit.h>
36 #include <vm/vm_map_internal.h>
37 
38 #include <IOKit/IOHibernatePrivate.h>
39 
40 #include <kern/policy_internal.h>
41 
/*
 * Lock group and mutex for the swap bookkeeping in this file.
 * vm_swap_data_lock is held around the swapfile queue lookup and around
 * the state checks made by the swapfile create and gc threads.
 */
LCK_GRP_DECLARE(vm_swap_data_lock_grp, "vm_swap_data");
LCK_MTX_DECLARE(vm_swap_data_lock, &vm_swap_data_lock_grp);

#if defined(XNU_TARGET_OS_OSX)
/*
 * launchd explicitly turns ON swap later during boot on macOS devices.
 */
boolean_t       compressor_store_stop_compaction = TRUE;
#else
boolean_t       compressor_store_stop_compaction = FALSE;
#endif

/*
 * The addresses of these two variables are used as wait events:
 * the swapfile create thread sleeps on &vm_swapfile_create_needed and
 * the gc thread is woken through &vm_swapfile_gc_needed.
 */
boolean_t       vm_swapfile_create_needed = FALSE;
boolean_t       vm_swapfile_gc_needed = FALSE;

int             vm_swapper_throttle = -1;
uint64_t        vm_swapout_thread_id;   /* thread_id of the "VM_swapout" thread, recorded by vm_compressor_swap_init() */

uint64_t        vm_swap_put_failures = 0; /* Likely failed I/O. Data is still in memory. */
uint64_t        vm_swap_get_failures = 0; /* Fatal */
uint64_t        vm_swap_put_failures_no_swap_file = 0; /* Possibly not fatal because we might just need a new swapfile. */
int             vm_num_swap_files_config = 0;   /* upper bound on swapfiles; set by vm_compressor_swap_init_swap_file_limit() */
int             vm_num_swap_files = 0;
int             vm_num_pinned_swap_files = 0;
uint64_t        vm_swap_volume_capacity = 0;    /* 0 means "not known yet" to the swap file limit computation */
int             vm_swapout_thread_processed_segments = 0;
int             vm_swapout_thread_awakened = 0;
bool            vm_swapout_thread_running = FALSE;
_Atomic bool    vm_swapout_wake_pending = false;
int             vm_swapfile_create_thread_awakened = 0; /* bumped on every entry of the create thread */
int             vm_swapfile_create_thread_running = 0;  /* 1 while the create thread is in its work loop */
int             vm_swapfile_gc_thread_awakened = 0;     /* bumped on every entry of the gc thread */
int             vm_swapfile_gc_thread_running = 0;      /* 1 while the gc thread is in its work loop */

int64_t         vm_swappin_avail = 0;
boolean_t       vm_swappin_enabled = FALSE;
unsigned int    vm_swapfile_total_segs_alloced = 0;
unsigned int    vm_swapfile_total_segs_alloced_max = 0;
unsigned int    vm_swapfile_total_segs_used = 0;
unsigned int    vm_swapfile_total_segs_used_max = 0;

/* Base path of the swapfiles; a numeric index is appended to form each file name. */
char            swapfilename[MAX_SWAPFILENAME_LEN + 1] = SWAP_FILE_NAME;
84 
extern vm_map_t compressor_map;
extern uint32_t c_seg_bufsize, c_seg_allocsize, c_seg_off_limit;

/* Bit values kept in swapfile.swp_flags. */
#define SWAP_READY      0x1     /* Swap file is ready to be used */
#define SWAP_RECLAIM    0x2     /* Swap file is marked to be reclaimed */
#define SWAP_WANTED     0x4     /* Swap file has waiters */
#define SWAP_REUSE      0x8     /* Swap file is on the Q and has a name. Reuse after init-ing.*/
#define SWAP_PINNED     0x10    /* Swap file is pinned (FusionDrive) */


/*
 * Bookkeeping for one swap file.
 *
 * Instances are linked on swf_global_queue through swp_queue and are
 * located from a swap handle by matching swp_index
 * (see vm_swapfile_for_handle()).
 */
struct swapfile {
	queue_head_t            swp_queue;      /* list of swap files */
	char                    *swp_path;      /* saved pathname of swap file */
	struct vnode            *swp_vp;        /* backing vnode */
	uint64_t                swp_size;       /* size of this swap file */
	uint8_t                 *swp_bitmap;    /* bitmap showing the alloced/freed slots in the swap file */
	unsigned int            swp_pathlen;    /* length of pathname */
	unsigned int            swp_nsegs;      /* #segments we can use */
	unsigned int            swp_nseginuse;  /* #segments in use */
	unsigned int            swp_index;      /* index of this swap file */
	unsigned int            swp_flags;      /* state of swap file */
	unsigned int            swp_free_hint;  /* offset of 1st free chunk */
	unsigned int            swp_io_count;   /* count of outstanding I/Os */
	c_segment_t             *swp_csegs;     /* back pointers to the c_segments. Used during swap reclaim. */

	/*
	 * Pending (delayed) trims for this file. VM_SWAP_SHOULD_TRIM()
	 * compares swp_delayed_trim_count against
	 * VM_SWAPFILE_DELAYED_TRIM_MAX.
	 */
	struct trim_list        *swp_delayed_trim_list_head;
	unsigned int            swp_delayed_trim_count;
};
113 
/* Queue of all struct swapfile instances, linked through swp_queue. */
queue_head_t    swf_global_queue;
boolean_t       swp_trim_supported = FALSE;

extern uint64_t         dont_trim_until_ts;
/*
 * mach_absolute_time() stamps taken by the swapfile create thread just
 * before its most recent failed / successful vm_swap_create_file() call.
 * The "failed" stamp rate-limits further creation attempts
 * (see vm_swapfile_should_create()).
 */
uint64_t                vm_swapfile_last_failed_to_create_ts = 0;
uint64_t                vm_swapfile_last_successful_create_ts = 0;
int                     vm_swapfile_can_be_created = FALSE;
boolean_t               delayed_trim_handling_in_progress = FALSE;

/*
 * Set by hibernate_pin_swap(TRUE). The create and gc threads leave their
 * work loops when they see it, and its address doubles as the event
 * hibernate_pin_swap() waits on for them to go idle.
 */
boolean_t               hibernate_in_progress_with_pinned_swap = FALSE;

/* Forward declarations of file-local helpers. */
static void vm_swapout_thread_throttle_adjust(void);
static void vm_swap_free_now(struct swapfile *swf, uint64_t f_offset);
static void vm_swapfile_create_thread(void);
static void vm_swapfile_gc_thread(void);
static void vm_swap_defragment(void);
static void vm_swap_handle_delayed_trims(boolean_t);
static void vm_swap_do_delayed_trim(struct swapfile *);
static void vm_swap_wait_on_trim_handling_in_progress(void);
static void vm_swapout_finish(c_segment_t c_seg, uint64_t f_offset, uint32_t size, kern_return_t kr);

extern int vnode_getwithref(struct vnode* vp);

/*
 * One-shot requests raised by vm_swap_consider_defragmenting() and
 * cleared by the gc thread once it has picked up the work.
 */
boolean_t vm_swap_force_defrag = FALSE, vm_swap_force_reclaim = FALSE;
138 
#if !XNU_TARGET_OS_OSX

/*
 * For CONFIG_FREEZE, we scale the c_segments_limit based on the
 * number of swapfiles allowed. That increases wired memory overhead.
 * So we want to keep the max swapfiles same on both DEV/RELEASE so
 * that the memory overhead is similar for performance comparisons.
 */
#define VM_MAX_SWAP_FILE_NUM            5
#if defined(__arm64__) && defined(ARM_LARGE_MEMORY)
/* Swap file counts equivalent to 64GB (max) and 16GB (min) of swap. */
#define VM_MAX_SWAP_FILE_SWAP_ENABLED_NUM (64ULL * (1ULL << 30) / MAX_SWAP_FILE_SIZE)
#define VM_MIN_SWAP_FILE_SWAP_ENABLED_NUM (16ULL * (1ULL << 30) / MAX_SWAP_FILE_SIZE)
#else /* defined(__arm64__) && defined(ARM_LARGE_MEMORY) */
/*
 * We reserve compressor pool VA at boot for the max # of swap files. If someone
 * has enabled app swap but we're not an arm large memory device we can't hog
 * all of the VA so we only go up to 4GB.
 */
#define VM_MAX_SWAP_FILE_SWAP_ENABLED_NUM (4ULL * (1ULL << 30) / MAX_SWAP_FILE_SIZE)
#define VM_MIN_SWAP_FILE_SWAP_ENABLED_NUM (4ULL * (1ULL << 30) / MAX_SWAP_FILE_SIZE)
#endif /* defined(__arm64__) && defined(ARM_LARGE_MEMORY) */
/* Volume capacity (128GB) used as the unit when scaling the swap file limit. */
#define VM_SWAP_MIN_VOLUME_CAPACITY (128ULL * (1ULL << 30))

#define VM_SWAPFILE_DELAYED_TRIM_MAX    4

/* Defragment when forced, or when sparse swapped-out segments exceed 1/16 of the used segments. */
#define VM_SWAP_SHOULD_DEFRAGMENT()     (((vm_swap_force_defrag == TRUE) || (c_swappedout_sparse_count > (vm_swapfile_total_segs_used / 16))) ? 1 : 0)
/* Pinning is never requested in this configuration. */
#define VM_SWAP_SHOULD_PIN(_size)       FALSE
/* Trim once a swapfile has accumulated VM_SWAPFILE_DELAYED_TRIM_MAX delayed trims. */
#define VM_SWAP_SHOULD_TRIM(swf)        ((swf->swp_delayed_trim_count >= VM_SWAPFILE_DELAYED_TRIM_MAX) ? 1 : 0)

#else /* !XNU_TARGET_OS_OSX */

#define VM_MAX_SWAP_FILE_NUM            100
#define VM_SWAPFILE_DELAYED_TRIM_MAX    128

/* Defragment when forced, or when sparse swapped-out segments exceed 1/4 of the used segments. */
#define VM_SWAP_SHOULD_DEFRAGMENT()     (((vm_swap_force_defrag == TRUE) || (c_swappedout_sparse_count > (vm_swapfile_total_segs_used / 4))) ? 1 : 0)
/* Pin when the remaining pinnable space (vm_swappin_avail) covers _size bytes. */
#define VM_SWAP_SHOULD_PIN(_size)       (vm_swappin_avail > 0 && vm_swappin_avail >= (int64_t)(_size))
/* Trim once a swapfile has accumulated VM_SWAPFILE_DELAYED_TRIM_MAX delayed trims. */
#define VM_SWAP_SHOULD_TRIM(swf)        ((swf->swp_delayed_trim_count >= VM_SWAPFILE_DELAYED_TRIM_MAX) ? 1 : 0)

#endif /* !XNU_TARGET_OS_OSX */

/*
 * Reclaim starts when forced, or when the number of allocated-but-unused
 * segments reaches swapfile_reclaim_threshold_segs; an unforced reclaim is
 * abandoned once that number drops to swapfile_reclam_minimum_segs (sic).
 */
#define VM_SWAP_SHOULD_RECLAIM()        (((vm_swap_force_reclaim == TRUE) || ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) >= swapfile_reclaim_threshold_segs)) ? 1 : 0)
#define VM_SWAP_SHOULD_ABORT_RECLAIM()  (((vm_swap_force_reclaim == FALSE) && ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) <= swapfile_reclam_minimum_segs)) ? 1 : 0)

/* Busy: at least one swapout count is non-zero while the swapper throttle is at THROTTLE_LEVEL_COMPRESSOR_TIER0. */
#define VM_SWAP_BUSY()  (((c_early_swapout_count + c_regular_swapout_count + c_late_swapout_count) && (vm_swapper_throttle == THROTTLE_LEVEL_COMPRESSOR_TIER0)) ? 1 : 0)
183 
184 
#if CHECKSUM_THE_SWAP
extern unsigned int hash_string(char *cp, int len);
#endif

#if RECORD_THE_COMPRESSED_DATA
/* State of the optional debug facility that records compressed data to a file. */
boolean_t       c_compressed_record_init_done = FALSE;  /* was the record file opened? */
int             c_compressed_record_write_error = 0;    /* result of the last record write; non-zero stops further writes */
struct vnode    *c_compressed_record_vp = NULL;         /* the file opened for record write */
uint64_t        c_compressed_record_file_offset = 0;    /* next write offset */
void    c_compressed_record_init(void);
void    c_compressed_record_write(char *, int);
#endif

extern void                     vm_pageout_io_throttle(void);

static struct swapfile *vm_swapfile_for_handle(uint64_t);
201 
202 /*
203  * Called with the vm_swap_data_lock held.
204  */
205 
206 static struct swapfile *
vm_swapfile_for_handle(uint64_t f_offset)207 vm_swapfile_for_handle(uint64_t f_offset)
208 {
209 	uint64_t                file_offset = 0;
210 	unsigned int            swapfile_index = 0;
211 	struct swapfile*        swf = NULL;
212 
213 	file_offset = (f_offset & SWAP_SLOT_MASK);
214 	swapfile_index = (f_offset >> SWAP_DEVICE_SHIFT);
215 
216 	swf = (struct swapfile*) queue_first(&swf_global_queue);
217 
218 	while (queue_end(&swf_global_queue, (queue_entry_t)swf) == FALSE) {
219 		if (swapfile_index == swf->swp_index) {
220 			break;
221 		}
222 
223 		swf = (struct swapfile*) queue_next(&swf->swp_queue);
224 	}
225 
226 	if (queue_end(&swf_global_queue, (queue_entry_t) swf)) {
227 		swf = NULL;
228 	}
229 
230 	return swf;
231 }
232 
233 #if ENCRYPTED_SWAP
234 
235 #include <libkern/crypto/aesxts.h>
236 
extern int cc_rand_generate(void *, size_t);     /* from <libkern/crypto/rand.h> */

/* Set once swap_crypt_initialize() has generated the keys and started the XTS context. */
boolean_t       swap_crypt_initialized;
void            swap_crypt_initialize(void);

/* XTS context shared by vm_swap_encrypt() and vm_swap_decrypt(). */
symmetric_xts   xts_modectx;
uint32_t        swap_crypt_key1[8];   /* big enough for a 256 bit random key */
uint32_t        swap_crypt_key2[8];   /* big enough for a 256 bit random key */

#if DEVELOPMENT || DEBUG
/* Buffers for the encrypt/decrypt round-trip self test run by swap_crypt_initialize(). */
boolean_t       swap_crypt_xts_tested = FALSE;
unsigned char   swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
unsigned char   swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
unsigned char   swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
#endif /* DEVELOPMENT || DEBUG */

/* Running totals of pages passed through vm_swap_encrypt() / vm_swap_decrypt(). */
unsigned long   vm_page_encrypt_counter;
unsigned long   vm_page_decrypt_counter;
255 
256 
257 void
swap_crypt_initialize(void)258 swap_crypt_initialize(void)
259 {
260 	uint8_t  *enckey1, *enckey2;
261 	int      keylen1, keylen2;
262 	int      error;
263 
264 	assert(swap_crypt_initialized == FALSE);
265 
266 	keylen1 = sizeof(swap_crypt_key1);
267 	enckey1 = (uint8_t *)&swap_crypt_key1;
268 	keylen2 = sizeof(swap_crypt_key2);
269 	enckey2 = (uint8_t *)&swap_crypt_key2;
270 
271 	error = cc_rand_generate((void *)enckey1, keylen1);
272 	assert(!error);
273 
274 	error = cc_rand_generate((void *)enckey2, keylen2);
275 	assert(!error);
276 
277 	error = xts_start(0, NULL, enckey1, keylen1, enckey2, keylen2, 0, 0, &xts_modectx);
278 	assert(!error);
279 
280 	swap_crypt_initialized = TRUE;
281 
282 #if DEVELOPMENT || DEBUG
283 	uint8_t *encptr;
284 	uint8_t *decptr;
285 	uint8_t *refptr;
286 	uint8_t *iv;
287 	uint64_t ivnum[2];
288 	int size = 0;
289 	int i    = 0;
290 	int rc   = 0;
291 
292 	assert(swap_crypt_xts_tested == FALSE);
293 
294 	/*
295 	 * Validate the encryption algorithms.
296 	 *
297 	 * First initialize the test data.
298 	 */
299 	for (i = 0; i < 4096; i++) {
300 		swap_crypt_test_page_ref[i] = (char) i;
301 	}
302 	ivnum[0] = (uint64_t)0xaa;
303 	ivnum[1] = 0;
304 	iv = (uint8_t *)ivnum;
305 
306 	refptr = (uint8_t *)swap_crypt_test_page_ref;
307 	encptr = (uint8_t *)swap_crypt_test_page_encrypt;
308 	decptr = (uint8_t *)swap_crypt_test_page_decrypt;
309 	size = 4096;
310 
311 	/* encrypt */
312 	rc = xts_encrypt(refptr, size, encptr, iv, &xts_modectx);
313 	assert(!rc);
314 
315 	/* compare result with original - should NOT match */
316 	for (i = 0; i < 4096; i++) {
317 		if (swap_crypt_test_page_encrypt[i] !=
318 		    swap_crypt_test_page_ref[i]) {
319 			break;
320 		}
321 	}
322 	assert(i != 4096);
323 
324 	/* decrypt */
325 	rc = xts_decrypt(encptr, size, decptr, iv, &xts_modectx);
326 	assert(!rc);
327 
328 	/* compare result with original */
329 	for (i = 0; i < 4096; i++) {
330 		if (swap_crypt_test_page_decrypt[i] !=
331 		    swap_crypt_test_page_ref[i]) {
332 			panic("encryption test failed");
333 		}
334 	}
335 	/* encrypt in place */
336 	rc = xts_encrypt(decptr, size, decptr, iv, &xts_modectx);
337 	assert(!rc);
338 
339 	/* decrypt in place */
340 	rc = xts_decrypt(decptr, size, decptr, iv, &xts_modectx);
341 	assert(!rc);
342 
343 	for (i = 0; i < 4096; i++) {
344 		if (swap_crypt_test_page_decrypt[i] !=
345 		    swap_crypt_test_page_ref[i]) {
346 			panic("in place encryption test failed");
347 		}
348 	}
349 	swap_crypt_xts_tested = TRUE;
350 #endif /* DEVELOPMENT || DEBUG */
351 }
352 
353 
354 void
vm_swap_encrypt(c_segment_t c_seg)355 vm_swap_encrypt(c_segment_t c_seg)
356 {
357 	uint8_t *ptr;
358 	uint8_t *iv;
359 	uint64_t ivnum[2];
360 	int size = 0;
361 	int rc   = 0;
362 
363 	if (swap_crypt_initialized == FALSE) {
364 		swap_crypt_initialize();
365 	}
366 
367 #if DEVELOPMENT || DEBUG
368 	C_SEG_MAKE_WRITEABLE(c_seg);
369 #endif
370 	ptr = (uint8_t *)c_seg->c_store.c_buffer;
371 	size = round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset));
372 
373 	ivnum[0] = (uint64_t)c_seg;
374 	ivnum[1] = 0;
375 	iv = (uint8_t *)ivnum;
376 
377 	rc = xts_encrypt(ptr, size, ptr, iv, &xts_modectx);
378 	assert(!rc);
379 
380 	vm_page_encrypt_counter += (size / PAGE_SIZE_64);
381 
382 #if DEVELOPMENT || DEBUG
383 	C_SEG_WRITE_PROTECT(c_seg);
384 #endif
385 }
386 
387 void
vm_swap_decrypt(c_segment_t c_seg)388 vm_swap_decrypt(c_segment_t c_seg)
389 {
390 	uint8_t *ptr;
391 	uint8_t *iv;
392 	uint64_t ivnum[2];
393 	int size = 0;
394 	int rc   = 0;
395 
396 	assert(swap_crypt_initialized);
397 
398 #if DEVELOPMENT || DEBUG
399 	C_SEG_MAKE_WRITEABLE(c_seg);
400 #endif
401 	ptr = (uint8_t *)c_seg->c_store.c_buffer;
402 	size = round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset));
403 
404 	ivnum[0] = (uint64_t)c_seg;
405 	ivnum[1] = 0;
406 	iv = (uint8_t *)ivnum;
407 
408 	rc = xts_decrypt(ptr, size, ptr, iv, &xts_modectx);
409 	assert(!rc);
410 
411 	vm_page_decrypt_counter += (size / PAGE_SIZE_64);
412 
413 #if DEVELOPMENT || DEBUG
414 	C_SEG_WRITE_PROTECT(c_seg);
415 #endif
416 }
417 #endif /* ENCRYPTED_SWAP */
418 
419 uint64_t compressed_swap_chunk_size, vm_swapfile_hiwater_segs, swapfile_reclaim_threshold_segs, swapfile_reclam_minimum_segs;
420 extern bool memorystatus_swap_all_apps;
421 
422 void
vm_compressor_swap_init_swap_file_limit(void)423 vm_compressor_swap_init_swap_file_limit(void)
424 {
425 	vm_num_swap_files_config = VM_MAX_SWAP_FILE_NUM;
426 #if CONFIG_JETSAM
427 	if (memorystatus_swap_all_apps) {
428 		if (vm_swap_volume_capacity == 0) {
429 			/*
430 			 * Early in boot we don't know the swap volume capacity.
431 			 * That's fine. Reserve space for the maximum config
432 			 * and we'll lower this later in boot once we have the capacity.
433 			 */
434 			vm_num_swap_files_config = VM_MAX_SWAP_FILE_SWAP_ENABLED_NUM;
435 		} else {
436 			static uint64_t kFixedPointFactor = 100;
437 			/*
438 			 * Scale the max number of swap files linearly.
439 			 * But we can never go above VM_MAX_SWAP_FILE_SWAP_ENABLED_NUM.
440 			 */
441 			vm_num_swap_files_config = vm_swap_volume_capacity * kFixedPointFactor / VM_SWAP_MIN_VOLUME_CAPACITY
442 			    * VM_MIN_SWAP_FILE_SWAP_ENABLED_NUM / kFixedPointFactor;
443 			vm_num_swap_files_config = MAX(vm_num_swap_files_config, VM_MIN_SWAP_FILE_SWAP_ENABLED_NUM);
444 			vm_num_swap_files_config = MIN(vm_num_swap_files_config, VM_MAX_SWAP_FILE_SWAP_ENABLED_NUM);
445 		}
446 	}
447 #endif /* CONFIG_JETSAM */
448 #if DEVELOPMENT || DEBUG
449 	typeof(vm_num_swap_files_config) parsed_vm_max_num_swap_files = 0;
450 	if (PE_parse_boot_argn("vm_max_num_swap_files", &parsed_vm_max_num_swap_files, sizeof(parsed_vm_max_num_swap_files))) {
451 		if (parsed_vm_max_num_swap_files > 0) {
452 			vm_num_swap_files_config = parsed_vm_max_num_swap_files;
453 		} else {
454 			printf("WARNING: Ignoring vm_max_num_swap_files=%d boot-arg. Value must be > 0\n", parsed_vm_max_num_swap_files);
455 		}
456 	}
457 #endif
458 	printf("Maximum number of VM swap files: %d\n", vm_num_swap_files_config);
459 }
460 
461 int vm_swap_enabled = 0;
462 void
vm_compressor_swap_init(void)463 vm_compressor_swap_init(void)
464 {
465 	thread_t        thread = NULL;
466 
467 	queue_init(&swf_global_queue);
468 
469 #if !XNU_TARGET_OS_OSX
470 	/*
471 	 * dummy value until the swap file gets created
472 	 * when we drive the first c_segment_t to the
473 	 * swapout queue... at that time we will
474 	 * know the true size we have to work with
475 	 */
476 	c_overage_swapped_limit = 16;
477 #endif /* !XNU_TARGET_OS_OSX */
478 
479 	compressed_swap_chunk_size = c_seg_bufsize;
480 	vm_swapfile_hiwater_segs = (MIN_SWAP_FILE_SIZE / compressed_swap_chunk_size);
481 	swapfile_reclaim_threshold_segs = ((17 * (MAX_SWAP_FILE_SIZE / compressed_swap_chunk_size)) / 10);
482 	swapfile_reclam_minimum_segs = ((13 * (MAX_SWAP_FILE_SIZE / compressed_swap_chunk_size)) / 10);
483 
484 	if (kernel_thread_start_priority((thread_continue_t)vm_swapout_thread, NULL,
485 	    BASEPRI_VM, &thread) != KERN_SUCCESS) {
486 		panic("vm_swapout_thread: create failed");
487 	}
488 	thread_set_thread_name(thread, "VM_swapout");
489 	vm_swapout_thread_id = thread->thread_id;
490 	thread_deallocate(thread);
491 
492 	if (kernel_thread_start_priority((thread_continue_t)vm_swapfile_create_thread, NULL,
493 	    BASEPRI_VM, &thread) != KERN_SUCCESS) {
494 		panic("vm_swapfile_create_thread: create failed");
495 	}
496 	thread_set_thread_name(thread, "VM_swapfile_create");
497 	thread_deallocate(thread);
498 
499 	if (kernel_thread_start_priority((thread_continue_t)vm_swapfile_gc_thread, NULL,
500 	    BASEPRI_VM, &thread) != KERN_SUCCESS) {
501 		panic("vm_swapfile_gc_thread: create failed");
502 	}
503 	thread_set_thread_name(thread, "VM_swapfile_gc");
504 	/*
505 	 * Swapfile garbage collection will need to allocate memory
506 	 * to complete its swap reclaim and in-memory compaction.
507 	 * So allow it to dip into the reserved VM page pool.
508 	 */
509 	thread_lock(thread);
510 	thread->options |= TH_OPT_VMPRIV;
511 	thread_unlock(thread);
512 	thread_deallocate(thread);
513 	proc_set_thread_policy_with_tid(kernel_task, thread->thread_id,
514 	    TASK_POLICY_INTERNAL, TASK_POLICY_IO, THROTTLE_LEVEL_COMPRESSOR_TIER2);
515 	proc_set_thread_policy_with_tid(kernel_task, thread->thread_id,
516 	    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
517 
518 	vm_swap_enabled = 1;
519 	printf("VM Swap Subsystem is ON\n");
520 }
521 
522 
523 #if RECORD_THE_COMPRESSED_DATA
524 
525 void
c_compressed_record_init()526 c_compressed_record_init()
527 {
528 	if (c_compressed_record_init_done == FALSE) {
529 		vm_swapfile_open("/tmp/compressed_data", &c_compressed_record_vp);
530 		c_compressed_record_init_done = TRUE;
531 	}
532 }
533 
534 void
c_compressed_record_write(char * buf,int size)535 c_compressed_record_write(char *buf, int size)
536 {
537 	if (c_compressed_record_write_error == 0) {
538 		c_compressed_record_write_error = vm_record_file_write(c_compressed_record_vp, c_compressed_record_file_offset, buf, size);
539 		c_compressed_record_file_offset += size;
540 	}
541 }
542 #endif
543 
544 
545 int             compaction_swapper_inited = 0;
546 
547 void
vm_compaction_swapper_do_init(void)548 vm_compaction_swapper_do_init(void)
549 {
550 	struct  vnode *vp;
551 	char    *pathname;
552 	int     namelen;
553 
554 	if (compaction_swapper_inited) {
555 		return;
556 	}
557 
558 	if (vm_compressor_mode != VM_PAGER_COMPRESSOR_WITH_SWAP) {
559 		compaction_swapper_inited = 1;
560 		return;
561 	}
562 	lck_mtx_lock(&vm_swap_data_lock);
563 
564 	if (!compaction_swapper_inited) {
565 		namelen = (int)strlen(swapfilename) + SWAPFILENAME_INDEX_LEN + 1;
566 		pathname = kalloc_data(namelen, Z_WAITOK | Z_ZERO);
567 		snprintf(pathname, namelen, "%s%d", swapfilename, 0);
568 
569 		vm_swapfile_open(pathname, &vp);
570 
571 		if (vp) {
572 			if (vnode_pager_isSSD(vp) == FALSE) {
573 				/*
574 				 * swap files live on an HDD, so let's make sure to start swapping
575 				 * much earlier since we're not worried about SSD write-wear and
576 				 * we have so little write bandwidth to work with
577 				 * these values were derived expermentially by running the performance
578 				 * teams stock test for evaluating HDD performance against various
579 				 * combinations and looking and comparing overall results.
580 				 * Note that the > relationship between these 4 values must be maintained
581 				 */
582 				if (vm_compressor_minorcompact_threshold_divisor_overridden == 0) {
583 					vm_compressor_minorcompact_threshold_divisor = 15;
584 				}
585 				if (vm_compressor_majorcompact_threshold_divisor_overridden == 0) {
586 					vm_compressor_majorcompact_threshold_divisor = 18;
587 				}
588 				if (vm_compressor_unthrottle_threshold_divisor_overridden == 0) {
589 					vm_compressor_unthrottle_threshold_divisor = 24;
590 				}
591 				if (vm_compressor_catchup_threshold_divisor_overridden == 0) {
592 					vm_compressor_catchup_threshold_divisor = 30;
593 				}
594 			}
595 #if XNU_TARGET_OS_OSX
596 			vnode_setswapmount(vp);
597 			vm_swappin_avail = vnode_getswappin_avail(vp);
598 
599 			if (vm_swappin_avail) {
600 				vm_swappin_enabled = TRUE;
601 			}
602 #endif /* XNU_TARGET_OS_OSX */
603 			vm_swapfile_close((uint64_t)pathname, vp);
604 		}
605 		kfree_data(pathname, namelen);
606 
607 		compaction_swapper_inited = 1;
608 	}
609 	lck_mtx_unlock(&vm_swap_data_lock);
610 }
611 
612 
613 void
vm_swap_consider_defragmenting(int flags)614 vm_swap_consider_defragmenting(int flags)
615 {
616 	boolean_t force_defrag = (flags & VM_SWAP_FLAGS_FORCE_DEFRAG);
617 	boolean_t force_reclaim = (flags & VM_SWAP_FLAGS_FORCE_RECLAIM);
618 
619 	if (compressor_store_stop_compaction == FALSE && !VM_SWAP_BUSY() &&
620 	    (force_defrag || force_reclaim || VM_SWAP_SHOULD_DEFRAGMENT() || VM_SWAP_SHOULD_RECLAIM())) {
621 		if (!vm_swapfile_gc_thread_running || force_defrag || force_reclaim) {
622 			lck_mtx_lock(&vm_swap_data_lock);
623 
624 			if (force_defrag) {
625 				vm_swap_force_defrag = TRUE;
626 			}
627 
628 			if (force_reclaim) {
629 				vm_swap_force_reclaim = TRUE;
630 			}
631 
632 			if (!vm_swapfile_gc_thread_running) {
633 				thread_wakeup((event_t) &vm_swapfile_gc_needed);
634 			}
635 
636 			lck_mtx_unlock(&vm_swap_data_lock);
637 		}
638 	}
639 }
640 
641 
/* Event counters maintained by vm_swap_defragment(). */
int vm_swap_defragment_yielded = 0;     /* passes cut short because compaction was stopped or the swapper was busy */
int vm_swap_defragment_swapin = 0;      /* sparse segments handed to c_seg_swapin() */
int vm_swap_defragment_free = 0;        /* sparse segments with no bytes in use that were freed */
int vm_swap_defragment_busy = 0;        /* times a busy segment had to be waited for */

#if CONFIG_FREEZE
/* Compressor accounting consulted before a defrag swap-in. */
extern int32_t c_segment_pages_compressed_incore;
extern int32_t c_segment_pages_compressed_incore_late_swapout;
extern uint32_t c_segment_pages_compressed_nearing_limit;
extern uint32_t c_segment_count;
extern uint32_t c_segments_nearing_limit;

boolean_t       memorystatus_kill_on_VM_compressor_space_shortage(boolean_t);

extern bool freezer_incore_cseg_acct;
#endif /* CONFIG_FREEZE */
658 
659 static void
vm_swap_defragment()660 vm_swap_defragment()
661 {
662 	c_segment_t     c_seg;
663 
664 	/*
665 	 * have to grab the master lock w/o holding
666 	 * any locks in spin mode
667 	 */
668 	PAGE_REPLACEMENT_DISALLOWED(TRUE);
669 
670 	lck_mtx_lock_spin_always(c_list_lock);
671 
672 	while (!queue_empty(&c_swappedout_sparse_list_head)) {
673 		if (compressor_store_stop_compaction == TRUE || VM_SWAP_BUSY()) {
674 			vm_swap_defragment_yielded++;
675 			break;
676 		}
677 		c_seg = (c_segment_t)queue_first(&c_swappedout_sparse_list_head);
678 
679 		lck_mtx_lock_spin_always(&c_seg->c_lock);
680 
681 		assert(c_seg->c_state == C_ON_SWAPPEDOUTSPARSE_Q);
682 
683 		if (c_seg->c_busy) {
684 			lck_mtx_unlock_always(c_list_lock);
685 
686 			PAGE_REPLACEMENT_DISALLOWED(FALSE);
687 			/*
688 			 * c_seg_wait_on_busy consumes c_seg->c_lock
689 			 */
690 			c_seg_wait_on_busy(c_seg);
691 
692 			PAGE_REPLACEMENT_DISALLOWED(TRUE);
693 
694 			lck_mtx_lock_spin_always(c_list_lock);
695 
696 			vm_swap_defragment_busy++;
697 			continue;
698 		}
699 		if (c_seg->c_bytes_used == 0) {
700 			/*
701 			 * c_seg_free_locked consumes the c_list_lock
702 			 * and c_seg->c_lock
703 			 */
704 			C_SEG_BUSY(c_seg);
705 			c_seg_free_locked(c_seg);
706 
707 			vm_swap_defragment_free++;
708 		} else {
709 			lck_mtx_unlock_always(c_list_lock);
710 
711 #if CONFIG_FREEZE
712 			if (freezer_incore_cseg_acct) {
713 				/*
714 				 * TODO(jason): These two are tricky because they're pre-emptive jetsams.
715 				 * The system is not unhealthy, but we know that it's about to become unhealthy once
716 				 * we do this swapin.
717 				 * So we're waking up the memorystatus thread to make space
718 				 * (hopefully) before this segment comes in.
719 				 *
720 				 * I think the compressor_backing_store needs to keep track of
721 				 * two new globals that will track the number of segments
722 				 * being swapped in due to defrag and the number of slots used
723 				 * in those segments.
724 				 * Then the health check below can be called from the memorystatus
725 				 * thread.
726 				 */
727 				if ((c_seg->c_slots_used + c_segment_pages_compressed_incore) >= c_segment_pages_compressed_nearing_limit) {
728 					memorystatus_kill_on_VM_compressor_space_shortage(TRUE /* async */);
729 				}
730 
731 				uint32_t incore_seg_count = c_segment_count - c_swappedout_count - c_swappedout_sparse_count;
732 				if ((incore_seg_count + 1) >= c_segments_nearing_limit) {
733 					memorystatus_kill_on_VM_compressor_space_shortage(TRUE /* async */);
734 				}
735 			}
736 #endif /* CONFIG_FREEZE */
737 			if (c_seg_swapin(c_seg, TRUE, FALSE) == 0) {
738 				lck_mtx_unlock_always(&c_seg->c_lock);
739 				vmcs_stats.defrag_swapins += (round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset))) >> PAGE_SHIFT;
740 			}
741 
742 			vm_swap_defragment_swapin++;
743 		}
744 		PAGE_REPLACEMENT_DISALLOWED(FALSE);
745 
746 		vm_pageout_io_throttle();
747 
748 		/*
749 		 * because write waiters have privilege over readers,
750 		 * dropping and immediately retaking the master lock will
751 		 * still allow any thread waiting to acquire the
752 		 * master lock exclusively an opportunity to take it
753 		 */
754 		PAGE_REPLACEMENT_DISALLOWED(TRUE);
755 
756 		lck_mtx_lock_spin_always(c_list_lock);
757 	}
758 	lck_mtx_unlock_always(c_list_lock);
759 
760 	PAGE_REPLACEMENT_DISALLOWED(FALSE);
761 }
762 
763 TUNABLE(uint64_t, vm_swapfile_creation_delay_ns, "vm_swapfile_creation_delay_ns", 15 * NSEC_PER_SEC);
764 
765 static inline bool
vm_swapfile_should_create(uint64_t now)766 vm_swapfile_should_create(uint64_t now)
767 {
768 	uint64_t delta_failed_creation_ns;
769 	absolutetime_to_nanoseconds(now - vm_swapfile_last_failed_to_create_ts, &delta_failed_creation_ns);
770 
771 	return (vm_num_swap_files < vm_num_swap_files_config) &&
772 	       ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) < (unsigned int)vm_swapfile_hiwater_segs) &&
773 	       (delta_failed_creation_ns > vm_swapfile_creation_delay_ns);
774 }
775 
776 bool vm_swapfile_create_thread_inited = false;
777 
778 static void
vm_swapfile_create_thread(void)779 vm_swapfile_create_thread(void)
780 {
781 	uint64_t now;
782 
783 	if (!vm_swapfile_create_thread_inited) {
784 #if CONFIG_THREAD_GROUPS
785 		thread_group_vm_add();
786 #endif /* CONFIG_THREAD_GROUPS */
787 		current_thread()->options |= TH_OPT_VMPRIV;
788 
789 		vm_swapfile_create_thread_inited = true;
790 	}
791 
792 	vm_swapfile_create_thread_awakened++;
793 	vm_swapfile_create_thread_running = 1;
794 
795 	while (TRUE) {
796 		/*
797 		 * walk through the list of swap files
798 		 * and do the delayed frees/trims for
799 		 * any swap file whose count of delayed
800 		 * frees is above the batch limit
801 		 */
802 		vm_swap_handle_delayed_trims(FALSE);
803 
804 		lck_mtx_lock(&vm_swap_data_lock);
805 
806 		if (hibernate_in_progress_with_pinned_swap == TRUE) {
807 			break;
808 		}
809 
810 		if (compressor_store_stop_compaction == TRUE) {
811 			break;
812 		}
813 
814 		now = mach_absolute_time();
815 
816 		if (!vm_swapfile_should_create(now)) {
817 			break;
818 		}
819 
820 		lck_mtx_unlock(&vm_swap_data_lock);
821 
822 		if (vm_swap_create_file() == FALSE) {
823 			vm_swapfile_last_failed_to_create_ts = now;
824 			HIBLOG("low swap: failed to create swapfile\n");
825 		} else {
826 			vm_swapfile_last_successful_create_ts = now;
827 		}
828 	}
829 	vm_swapfile_create_thread_running = 0;
830 
831 	if (hibernate_in_progress_with_pinned_swap == TRUE) {
832 		thread_wakeup((event_t)&hibernate_in_progress_with_pinned_swap);
833 	}
834 
835 	if (compressor_store_stop_compaction == TRUE) {
836 		thread_wakeup((event_t)&compressor_store_stop_compaction);
837 	}
838 
839 	assert_wait((event_t)&vm_swapfile_create_needed, THREAD_UNINT);
840 
841 	lck_mtx_unlock(&vm_swap_data_lock);
842 
843 	thread_block((thread_continue_t)vm_swapfile_create_thread);
844 
845 	/* NOTREACHED */
846 }
847 
848 
849 #if HIBERNATION
850 
851 kern_return_t
hibernate_pin_swap(boolean_t start)852 hibernate_pin_swap(boolean_t start)
853 {
854 	vm_compaction_swapper_do_init();
855 
856 	if (start == FALSE) {
857 		lck_mtx_lock(&vm_swap_data_lock);
858 		hibernate_in_progress_with_pinned_swap = FALSE;
859 		lck_mtx_unlock(&vm_swap_data_lock);
860 
861 		return KERN_SUCCESS;
862 	}
863 	if (vm_swappin_enabled == FALSE) {
864 		return KERN_SUCCESS;
865 	}
866 
867 	lck_mtx_lock(&vm_swap_data_lock);
868 
869 	hibernate_in_progress_with_pinned_swap = TRUE;
870 
871 	while (vm_swapfile_create_thread_running || vm_swapfile_gc_thread_running) {
872 		assert_wait((event_t)&hibernate_in_progress_with_pinned_swap, THREAD_UNINT);
873 
874 		lck_mtx_unlock(&vm_swap_data_lock);
875 
876 		thread_block(THREAD_CONTINUE_NULL);
877 
878 		lck_mtx_lock(&vm_swap_data_lock);
879 	}
880 	if (vm_num_swap_files > vm_num_pinned_swap_files) {
881 		hibernate_in_progress_with_pinned_swap = FALSE;
882 		lck_mtx_unlock(&vm_swap_data_lock);
883 
884 		HIBLOG("hibernate_pin_swap failed - vm_num_swap_files = %d, vm_num_pinned_swap_files = %d\n",
885 		    vm_num_swap_files, vm_num_pinned_swap_files);
886 		return KERN_FAILURE;
887 	}
888 	lck_mtx_unlock(&vm_swap_data_lock);
889 
890 	while (VM_SWAP_SHOULD_PIN(MAX_SWAP_FILE_SIZE)) {
891 		if (vm_swap_create_file() == FALSE) {
892 			break;
893 		}
894 	}
895 	return KERN_SUCCESS;
896 }
897 #endif
898 bool vm_swapfile_gc_thread_inited = false;
899 static void
vm_swapfile_gc_thread(void)900 vm_swapfile_gc_thread(void)
901 {
902 	boolean_t       need_defragment;
903 	boolean_t       need_reclaim;
904 
905 	if (!vm_swapfile_gc_thread_inited) {
906 #if CONFIG_THREAD_GROUPS
907 		thread_group_vm_add();
908 #endif /* CONFIG_THREAD_GROUPS */
909 		vm_swapfile_gc_thread_inited = true;
910 	}
911 
912 	vm_swapfile_gc_thread_awakened++;
913 	vm_swapfile_gc_thread_running = 1;
914 
915 	while (TRUE) {
916 		lck_mtx_lock(&vm_swap_data_lock);
917 
918 		if (hibernate_in_progress_with_pinned_swap == TRUE) {
919 			break;
920 		}
921 
922 		if (VM_SWAP_BUSY() || compressor_store_stop_compaction == TRUE) {
923 			break;
924 		}
925 
926 		need_defragment = FALSE;
927 		need_reclaim = FALSE;
928 
929 		if (VM_SWAP_SHOULD_DEFRAGMENT()) {
930 			need_defragment = TRUE;
931 		}
932 
933 		if (VM_SWAP_SHOULD_RECLAIM()) {
934 			need_defragment = TRUE;
935 			need_reclaim = TRUE;
936 		}
937 		if (need_defragment == FALSE && need_reclaim == FALSE) {
938 			break;
939 		}
940 
941 		vm_swap_force_defrag = FALSE;
942 		vm_swap_force_reclaim = FALSE;
943 
944 		lck_mtx_unlock(&vm_swap_data_lock);
945 
946 		if (need_defragment == TRUE) {
947 			vm_swap_defragment();
948 		}
949 		if (need_reclaim == TRUE) {
950 			vm_swap_reclaim();
951 		}
952 	}
953 	vm_swapfile_gc_thread_running = 0;
954 
955 	if (hibernate_in_progress_with_pinned_swap == TRUE) {
956 		thread_wakeup((event_t)&hibernate_in_progress_with_pinned_swap);
957 	}
958 
959 	if (compressor_store_stop_compaction == TRUE) {
960 		thread_wakeup((event_t)&compressor_store_stop_compaction);
961 	}
962 
963 	assert_wait((event_t)&vm_swapfile_gc_needed, THREAD_UNINT);
964 
965 	lck_mtx_unlock(&vm_swap_data_lock);
966 
967 	thread_block((thread_continue_t)vm_swapfile_gc_thread);
968 
969 	/* NOTREACHED */
970 }
971 
972 
973 
974 #define   VM_SWAPOUT_LIMIT_T2P  4
975 #define   VM_SWAPOUT_LIMIT_T1P  4
976 #define   VM_SWAPOUT_LIMIT_T0P  6
977 #define   VM_SWAPOUT_LIMIT_T0   8
978 #define   VM_SWAPOUT_LIMIT_MAX  8
979 
980 #define   VM_SWAPOUT_START      0
981 #define   VM_SWAPOUT_T2_PASSIVE 1
982 #define   VM_SWAPOUT_T1_PASSIVE 2
983 #define   VM_SWAPOUT_T0_PASSIVE 3
984 #define   VM_SWAPOUT_T0         4
985 
986 int vm_swapout_state = VM_SWAPOUT_START;
987 int vm_swapout_limit = 1;
988 
989 int vm_swapper_entered_T0  = 0;
990 int vm_swapper_entered_T0P = 0;
991 int vm_swapper_entered_T1P = 0;
992 int vm_swapper_entered_T2P = 0;
993 
994 
995 static void
vm_swapout_thread_throttle_adjust(void)996 vm_swapout_thread_throttle_adjust(void)
997 {
998 	switch (vm_swapout_state) {
999 	case VM_SWAPOUT_START:
1000 
1001 		vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER2;
1002 		vm_swapper_entered_T2P++;
1003 
1004 		proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1005 		    TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle);
1006 		proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1007 		    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
1008 		vm_swapout_limit = VM_SWAPOUT_LIMIT_T2P;
1009 		vm_swapout_state = VM_SWAPOUT_T2_PASSIVE;
1010 
1011 		break;
1012 
1013 	case VM_SWAPOUT_T2_PASSIVE:
1014 
1015 		if (SWAPPER_NEEDS_TO_UNTHROTTLE()) {
1016 			vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER0;
1017 			vm_swapper_entered_T0P++;
1018 
1019 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1020 			    TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle);
1021 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1022 			    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
1023 			vm_swapout_limit = VM_SWAPOUT_LIMIT_T0P;
1024 			vm_swapout_state = VM_SWAPOUT_T0_PASSIVE;
1025 
1026 			break;
1027 		}
1028 		if (swapout_target_age || hibernate_flushing == TRUE) {
1029 			vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER1;
1030 			vm_swapper_entered_T1P++;
1031 
1032 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1033 			    TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle);
1034 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1035 			    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
1036 			vm_swapout_limit = VM_SWAPOUT_LIMIT_T1P;
1037 			vm_swapout_state = VM_SWAPOUT_T1_PASSIVE;
1038 		}
1039 		break;
1040 
1041 	case VM_SWAPOUT_T1_PASSIVE:
1042 
1043 		if (SWAPPER_NEEDS_TO_UNTHROTTLE()) {
1044 			vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER0;
1045 			vm_swapper_entered_T0P++;
1046 
1047 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1048 			    TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle);
1049 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1050 			    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
1051 			vm_swapout_limit = VM_SWAPOUT_LIMIT_T0P;
1052 			vm_swapout_state = VM_SWAPOUT_T0_PASSIVE;
1053 
1054 			break;
1055 		}
1056 		if (swapout_target_age == 0 && hibernate_flushing == FALSE) {
1057 			vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER2;
1058 			vm_swapper_entered_T2P++;
1059 
1060 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1061 			    TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle);
1062 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1063 			    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
1064 			vm_swapout_limit = VM_SWAPOUT_LIMIT_T2P;
1065 			vm_swapout_state = VM_SWAPOUT_T2_PASSIVE;
1066 		}
1067 		break;
1068 
1069 	case VM_SWAPOUT_T0_PASSIVE:
1070 
1071 		if (SWAPPER_NEEDS_TO_RETHROTTLE()) {
1072 			vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER2;
1073 			vm_swapper_entered_T2P++;
1074 
1075 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1076 			    TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle);
1077 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1078 			    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
1079 			vm_swapout_limit = VM_SWAPOUT_LIMIT_T2P;
1080 			vm_swapout_state = VM_SWAPOUT_T2_PASSIVE;
1081 
1082 			break;
1083 		}
1084 		if (SWAPPER_NEEDS_TO_CATCHUP()) {
1085 			vm_swapper_entered_T0++;
1086 
1087 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1088 			    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_DISABLE);
1089 			vm_swapout_limit = VM_SWAPOUT_LIMIT_T0;
1090 			vm_swapout_state = VM_SWAPOUT_T0;
1091 		}
1092 		break;
1093 
1094 	case VM_SWAPOUT_T0:
1095 
1096 		if (SWAPPER_HAS_CAUGHTUP()) {
1097 			vm_swapper_entered_T0P++;
1098 
1099 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1100 			    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
1101 			vm_swapout_limit = VM_SWAPOUT_LIMIT_T0P;
1102 			vm_swapout_state = VM_SWAPOUT_T0_PASSIVE;
1103 		}
1104 		break;
1105 	}
1106 }
1107 
1108 int vm_swapout_found_empty = 0;
1109 
1110 struct swapout_io_completion vm_swapout_ctx[VM_SWAPOUT_LIMIT_MAX];
1111 
1112 int vm_swapout_soc_busy = 0;
1113 int vm_swapout_soc_done = 0;
1114 
1115 
1116 static struct swapout_io_completion *
vm_swapout_find_free_soc(void)1117 vm_swapout_find_free_soc(void)
1118 {
1119 	int      i;
1120 
1121 	for (i = 0; i < VM_SWAPOUT_LIMIT_MAX; i++) {
1122 		if (vm_swapout_ctx[i].swp_io_busy == 0) {
1123 			return &vm_swapout_ctx[i];
1124 		}
1125 	}
1126 	assert(vm_swapout_soc_busy == VM_SWAPOUT_LIMIT_MAX);
1127 
1128 	return NULL;
1129 }
1130 
1131 static struct swapout_io_completion *
vm_swapout_find_done_soc(void)1132 vm_swapout_find_done_soc(void)
1133 {
1134 	int      i;
1135 
1136 	if (vm_swapout_soc_done) {
1137 		for (i = 0; i < VM_SWAPOUT_LIMIT_MAX; i++) {
1138 			if (vm_swapout_ctx[i].swp_io_done) {
1139 				return &vm_swapout_ctx[i];
1140 			}
1141 		}
1142 	}
1143 	return NULL;
1144 }
1145 
1146 static void
vm_swapout_complete_soc(struct swapout_io_completion * soc)1147 vm_swapout_complete_soc(struct swapout_io_completion *soc)
1148 {
1149 	kern_return_t  kr;
1150 
1151 	if (soc->swp_io_error) {
1152 		kr = KERN_FAILURE;
1153 	} else {
1154 		kr = KERN_SUCCESS;
1155 	}
1156 
1157 	lck_mtx_unlock_always(c_list_lock);
1158 
1159 	vm_swap_put_finish(soc->swp_swf, &soc->swp_f_offset, soc->swp_io_error, TRUE /*drop iocount*/);
1160 	vm_swapout_finish(soc->swp_c_seg, soc->swp_f_offset, soc->swp_c_size, kr);
1161 
1162 	lck_mtx_lock_spin_always(c_list_lock);
1163 
1164 	soc->swp_io_done = 0;
1165 	soc->swp_io_busy = 0;
1166 
1167 	vm_swapout_soc_busy--;
1168 	vm_swapout_soc_done--;
1169 }
1170 
1171 bool vm_swapout_thread_inited = false;
1172 extern uint32_t c_donate_swapout_count;
1173 #if CONFIG_JETSAM
1174 bool memorystatus_swap_over_trigger(uint64_t adjustment_factor);
1175 /*
1176  * swapout_sleep_threshold sets the percentage of the swapout threshold at which
1177  * the swap thread will stop processing the swapout queue.
1178  * By default this is 90 which means we will swap until the
1179  * swapout queue size is at 90% of the threshold to wake the swap thread.
1180  * By definition the queue  length must be >= 100% of the threshold when the.
1181  * swap thread is woken up. On development builds this can be adjusted with
1182  * the vm.swapout_sleep_threshold sysctl.
1183  */
1184 uint32_t swapout_sleep_threshold = 90;
1185 #endif /* CONFIG_JETSAM */
1186 static bool
should_process_swapout_queue(const queue_head_t * swapout_list_head)1187 should_process_swapout_queue(const queue_head_t *swapout_list_head)
1188 {
1189 	bool process_queue = !queue_empty(swapout_list_head) &&
1190 	    vm_swapout_soc_busy < vm_swapout_limit &&
1191 	    !compressor_store_stop_compaction;
1192 #if CONFIG_JETSAM
1193 	if (memorystatus_swap_all_apps && swapout_list_head == &c_late_swapout_list_head) {
1194 		process_queue = process_queue && memorystatus_swap_over_trigger(swapout_sleep_threshold);
1195 	}
1196 #endif /* CONFIG_JETSAM */
1197 	return process_queue;
1198 }
1199 
1200 void
vm_swapout_thread(void)1201 vm_swapout_thread(void)
1202 {
1203 	uint32_t        size = 0;
1204 	c_segment_t     c_seg = NULL;
1205 	kern_return_t   kr = KERN_SUCCESS;
1206 	struct swapout_io_completion *soc;
1207 	queue_head_t    *swapout_list_head;
1208 	bool            queues_empty = false;
1209 
1210 	if (!vm_swapout_thread_inited) {
1211 #if CONFIG_THREAD_GROUPS
1212 		thread_group_vm_add();
1213 #endif /* CONFIG_THREAD_GROUPS */
1214 		current_thread()->options |= TH_OPT_VMPRIV;
1215 		vm_swapout_thread_inited = true;
1216 	}
1217 
1218 	vm_swapout_thread_awakened++;
1219 
1220 	lck_mtx_lock_spin_always(c_list_lock);
1221 
1222 	swapout_list_head = &c_early_swapout_list_head;
1223 	vm_swapout_thread_running = TRUE;
1224 	os_atomic_store(&vm_swapout_wake_pending, false, relaxed);
1225 again:
1226 	while (should_process_swapout_queue(swapout_list_head)) {
1227 		c_seg = (c_segment_t)queue_first(swapout_list_head);
1228 
1229 		lck_mtx_lock_spin_always(&c_seg->c_lock);
1230 
1231 		assert(c_seg->c_state == C_ON_SWAPOUT_Q);
1232 
1233 		if (c_seg->c_busy) {
1234 			lck_mtx_unlock_always(c_list_lock);
1235 
1236 			c_seg_wait_on_busy(c_seg);
1237 
1238 			lck_mtx_lock_spin_always(c_list_lock);
1239 
1240 			continue;
1241 		}
1242 		vm_swapout_thread_processed_segments++;
1243 
1244 		size = round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset));
1245 
1246 		if (size == 0) {
1247 			assert(c_seg->c_bytes_used == 0);
1248 
1249 			/*
1250 			 * c_seg_free_locked will drop the c_list_lock and
1251 			 * the c_seg->c_lock.
1252 			 */
1253 			C_SEG_BUSY(c_seg);
1254 			c_seg_free_locked(c_seg);
1255 			c_seg = NULL;
1256 
1257 			vm_swapout_found_empty++;
1258 			goto c_seg_is_empty;
1259 		}
1260 		C_SEG_BUSY(c_seg);
1261 		c_seg->c_busy_swapping = 1;
1262 
1263 		c_seg_switch_state(c_seg, C_ON_SWAPIO_Q, FALSE);
1264 
1265 		lck_mtx_unlock_always(c_list_lock);
1266 		lck_mtx_unlock_always(&c_seg->c_lock);
1267 
1268 #if CHECKSUM_THE_SWAP
1269 		c_seg->cseg_hash = hash_string((char *)c_seg->c_store.c_buffer, (int)size);
1270 		c_seg->cseg_swap_size = size;
1271 #endif /* CHECKSUM_THE_SWAP */
1272 
1273 #if ENCRYPTED_SWAP
1274 		vm_swap_encrypt(c_seg);
1275 #endif /* ENCRYPTED_SWAP */
1276 
1277 		soc = vm_swapout_find_free_soc();
1278 		assert(soc);
1279 
1280 		soc->swp_upl_ctx.io_context = (void *)soc;
1281 		soc->swp_upl_ctx.io_done = (void *)vm_swapout_iodone;
1282 		soc->swp_upl_ctx.io_error = 0;
1283 
1284 		kr = vm_swap_put((vm_offset_t)c_seg->c_store.c_buffer, &soc->swp_f_offset, size, c_seg, soc);
1285 
1286 		if (kr != KERN_SUCCESS) {
1287 			if (soc->swp_io_done) {
1288 				lck_mtx_lock_spin_always(c_list_lock);
1289 
1290 				soc->swp_io_done = 0;
1291 				vm_swapout_soc_done--;
1292 
1293 				lck_mtx_unlock_always(c_list_lock);
1294 			}
1295 			vm_swapout_finish(c_seg, soc->swp_f_offset, size, kr);
1296 		} else {
1297 			soc->swp_io_busy = 1;
1298 			vm_swapout_soc_busy++;
1299 		}
1300 
1301 c_seg_is_empty:
1302 		if (!(c_early_swapout_count + c_regular_swapout_count + c_late_swapout_count)) {
1303 			vm_swap_consider_defragmenting(VM_SWAP_FLAGS_NONE);
1304 		}
1305 
1306 		lck_mtx_lock_spin_always(c_list_lock);
1307 
1308 		while ((soc = vm_swapout_find_done_soc())) {
1309 			vm_swapout_complete_soc(soc);
1310 		}
1311 		lck_mtx_unlock_always(c_list_lock);
1312 
1313 		vm_swapout_thread_throttle_adjust();
1314 
1315 		lck_mtx_lock_spin_always(c_list_lock);
1316 	}
1317 	while ((soc = vm_swapout_find_done_soc())) {
1318 		vm_swapout_complete_soc(soc);
1319 	}
1320 	lck_mtx_unlock_always(c_list_lock);
1321 
1322 	vm_pageout_io_throttle();
1323 
1324 	lck_mtx_lock_spin_always(c_list_lock);
1325 
1326 	/*
1327 	 * Recheck if we have some c_segs to wakeup
1328 	 * post throttle. And, check to see if we
1329 	 * have any more swapouts needed.
1330 	 */
1331 	if (vm_swapout_soc_done) {
1332 		goto again;
1333 	}
1334 
1335 #if XNU_TARGET_OS_OSX
1336 	queues_empty = queue_empty(&c_early_swapout_list_head) && queue_empty(&c_regular_swapout_list_head) && queue_empty(&c_late_swapout_list_head);
1337 #else /* XNU_TARGET_OS_OSX */
1338 	queues_empty = queue_empty(&c_early_swapout_list_head) && queue_empty(&c_late_swapout_list_head);
1339 #endif /* XNU_TARGET_OS_OSX */
1340 
1341 	if (!queues_empty) {
1342 		swapout_list_head = NULL;
1343 		if (!queue_empty(&c_early_swapout_list_head)) {
1344 			swapout_list_head = &c_early_swapout_list_head;
1345 		} else {
1346 #if XNU_TARGET_OS_OSX
1347 			/*
1348 			 * On macOS we _always_ processs all swapout queues.
1349 			 */
1350 			if (!queue_empty(&c_regular_swapout_list_head)) {
1351 				swapout_list_head = &c_regular_swapout_list_head;
1352 			} else {
1353 				swapout_list_head = &c_late_swapout_list_head;
1354 			}
1355 #else /* XNU_TARGET_OS_OSX */
1356 			/*
1357 			 * On non-macOS swap-capable platforms, we might want to
1358 			 * processs just the early queue (Freezer) or process both
1359 			 * early and late queues (app swap). We processed the early
1360 			 * queue up above. The late Q will only be processed if the
1361 			 * checks in should_process_swapout_queue give the go-ahead.
1362 			 */
1363 			swapout_list_head = &c_late_swapout_list_head;
1364 #endif /* XNU_TARGET_OS_OSX */
1365 		}
1366 		if (swapout_list_head && should_process_swapout_queue(swapout_list_head)) {
1367 			goto again;
1368 		}
1369 	}
1370 
1371 	assert_wait((event_t)&vm_swapout_thread, THREAD_UNINT);
1372 
1373 	vm_swapout_thread_running = FALSE;
1374 
1375 	lck_mtx_unlock_always(c_list_lock);
1376 
1377 	thread_block((thread_continue_t)vm_swapout_thread);
1378 
1379 	/* NOTREACHED */
1380 }
1381 
1382 
1383 void
vm_swapout_iodone(void * io_context,int error)1384 vm_swapout_iodone(void *io_context, int error)
1385 {
1386 	struct swapout_io_completion *soc;
1387 
1388 	soc = (struct swapout_io_completion *)io_context;
1389 
1390 	lck_mtx_lock_spin_always(c_list_lock);
1391 
1392 	soc->swp_io_done = 1;
1393 	soc->swp_io_error = error;
1394 	vm_swapout_soc_done++;
1395 
1396 	if (!vm_swapout_thread_running) {
1397 		thread_wakeup((event_t)&vm_swapout_thread);
1398 	}
1399 
1400 	lck_mtx_unlock_always(c_list_lock);
1401 }
1402 
1403 
1404 static void
vm_swapout_finish(c_segment_t c_seg,uint64_t f_offset,uint32_t size,kern_return_t kr)1405 vm_swapout_finish(c_segment_t c_seg, uint64_t f_offset, uint32_t size, kern_return_t kr)
1406 {
1407 	PAGE_REPLACEMENT_DISALLOWED(TRUE);
1408 
1409 	if (kr == KERN_SUCCESS) {
1410 		kernel_memory_depopulate((vm_offset_t)c_seg->c_store.c_buffer, size,
1411 		    KMA_COMPRESSOR, VM_KERN_MEMORY_COMPRESSOR);
1412 	}
1413 #if ENCRYPTED_SWAP
1414 	else {
1415 		vm_swap_decrypt(c_seg);
1416 	}
1417 #endif /* ENCRYPTED_SWAP */
1418 	lck_mtx_lock_spin_always(c_list_lock);
1419 	lck_mtx_lock_spin_always(&c_seg->c_lock);
1420 
1421 	if (kr == KERN_SUCCESS) {
1422 		int             new_state = C_ON_SWAPPEDOUT_Q;
1423 		boolean_t       insert_head = FALSE;
1424 
1425 		if (hibernate_flushing == TRUE) {
1426 			if (c_seg->c_generation_id >= first_c_segment_to_warm_generation_id &&
1427 			    c_seg->c_generation_id <= last_c_segment_to_warm_generation_id) {
1428 				insert_head = TRUE;
1429 			}
1430 		} else if (C_SEG_ONDISK_IS_SPARSE(c_seg)) {
1431 			new_state = C_ON_SWAPPEDOUTSPARSE_Q;
1432 		}
1433 
1434 		c_seg_switch_state(c_seg, new_state, insert_head);
1435 
1436 		c_seg->c_store.c_swap_handle = f_offset;
1437 
1438 		counter_add(&vm_statistics_swapouts, size >> PAGE_SHIFT);
1439 
1440 		c_seg->c_swappedin = false;
1441 
1442 		if (c_seg->c_bytes_used) {
1443 			OSAddAtomic64(-c_seg->c_bytes_used, &compressor_bytes_used);
1444 		}
1445 
1446 #if CONFIG_FREEZE
1447 		/*
1448 		 * Successful swapout. Decrement the in-core compressed pages count.
1449 		 */
1450 		OSAddAtomic(-(c_seg->c_slots_used), &c_segment_pages_compressed_incore);
1451 		assertf(c_segment_pages_compressed_incore >= 0, "-ve incore count %p 0x%x", c_seg, c_segment_pages_compressed_incore);
1452 		if (c_seg->c_has_donated_pages) {
1453 			OSAddAtomic(-(c_seg->c_slots_used), &c_segment_pages_compressed_incore_late_swapout);
1454 		}
1455 #endif /* CONFIG_FREEZE */
1456 	} else {
1457 		if (c_seg->c_overage_swap == TRUE) {
1458 			c_seg->c_overage_swap = FALSE;
1459 			c_overage_swapped_count--;
1460 		}
1461 
1462 #if CONFIG_FREEZE
1463 		if (c_seg->c_has_freezer_pages) {
1464 			if (c_seg->c_task_owner) {
1465 				c_seg_update_task_owner(c_seg, NULL);
1466 			}
1467 			/*
1468 			 * We failed to swapout a frozen cseg. We need
1469 			 * to put it back in the queues, specifically the
1470 			 * AGE_Q. So clear the donated bit otherwise it'll
1471 			 * land on the swapped_in Q.
1472 			 */
1473 			c_seg->c_has_donated_pages = 0;
1474 			c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
1475 		} else
1476 #endif /* CONFIG_FREEZE */
1477 		{
1478 			if (c_seg->c_has_donated_pages) {
1479 				c_seg_switch_state(c_seg, C_ON_SWAPPEDIN_Q, FALSE);
1480 			} else {
1481 				c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
1482 			}
1483 		}
1484 
1485 		if (!c_seg->c_on_minorcompact_q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) {
1486 			c_seg_need_delayed_compaction(c_seg, TRUE);
1487 		}
1488 	}
1489 	assert(c_seg->c_busy_swapping);
1490 	assert(c_seg->c_busy);
1491 
1492 	c_seg->c_busy_swapping = 0;
1493 	lck_mtx_unlock_always(c_list_lock);
1494 
1495 	C_SEG_WAKEUP_DONE(c_seg);
1496 	lck_mtx_unlock_always(&c_seg->c_lock);
1497 
1498 	PAGE_REPLACEMENT_DISALLOWED(FALSE);
1499 }
1500 
1501 
1502 boolean_t
vm_swap_create_file()1503 vm_swap_create_file()
1504 {
1505 	uint64_t        size = 0;
1506 	int             namelen = 0;
1507 	boolean_t       swap_file_created = FALSE;
1508 	boolean_t       swap_file_reuse = FALSE;
1509 	boolean_t       swap_file_pin = FALSE;
1510 	struct swapfile *swf = NULL;
1511 
1512 	/*
1513 	 * make sure we've got all the info we need
1514 	 * to potentially pin a swap file... we could
1515 	 * be swapping out due to hibernation w/o ever
1516 	 * having run vm_pageout_scan, which is normally
1517 	 * the trigger to do the init
1518 	 */
1519 	vm_compaction_swapper_do_init();
1520 
1521 	/*
1522 	 * Any swapfile structure ready for re-use?
1523 	 */
1524 
1525 	lck_mtx_lock(&vm_swap_data_lock);
1526 
1527 	swf = (struct swapfile*) queue_first(&swf_global_queue);
1528 
1529 	while (queue_end(&swf_global_queue, (queue_entry_t)swf) == FALSE) {
1530 		if (swf->swp_flags == SWAP_REUSE) {
1531 			swap_file_reuse = TRUE;
1532 			break;
1533 		}
1534 		swf = (struct swapfile*) queue_next(&swf->swp_queue);
1535 	}
1536 
1537 	lck_mtx_unlock(&vm_swap_data_lock);
1538 
1539 	if (swap_file_reuse == FALSE) {
1540 		namelen = (int)strlen(swapfilename) + SWAPFILENAME_INDEX_LEN + 1;
1541 
1542 		swf = kalloc_type(struct swapfile, Z_WAITOK | Z_ZERO);
1543 		swf->swp_index = vm_num_swap_files + 1;
1544 		swf->swp_pathlen = namelen;
1545 		swf->swp_path = kalloc_data(swf->swp_pathlen, Z_WAITOK | Z_ZERO);
1546 
1547 		snprintf(swf->swp_path, namelen, "%s%d", swapfilename, vm_num_swap_files);
1548 	}
1549 
1550 	vm_swapfile_open(swf->swp_path, &swf->swp_vp);
1551 
1552 	if (swf->swp_vp == NULL) {
1553 		if (swap_file_reuse == FALSE) {
1554 			kfree_data(swf->swp_path, swf->swp_pathlen);
1555 			kfree_type(struct swapfile, swf);
1556 		}
1557 		return FALSE;
1558 	}
1559 	vm_swapfile_can_be_created = TRUE;
1560 
1561 	size = MAX_SWAP_FILE_SIZE;
1562 
1563 	while (size >= MIN_SWAP_FILE_SIZE) {
1564 		swap_file_pin = VM_SWAP_SHOULD_PIN(size);
1565 
1566 		if (vm_swapfile_preallocate(swf->swp_vp, &size, &swap_file_pin) == 0) {
1567 			int num_bytes_for_bitmap = 0;
1568 
1569 			swap_file_created = TRUE;
1570 
1571 			swf->swp_size = size;
1572 			swf->swp_nsegs = (unsigned int) (size / compressed_swap_chunk_size);
1573 			swf->swp_nseginuse = 0;
1574 			swf->swp_free_hint = 0;
1575 
1576 			num_bytes_for_bitmap = MAX((swf->swp_nsegs >> 3), 1);
1577 			/*
1578 			 * Allocate a bitmap that describes the
1579 			 * number of segments held by this swapfile.
1580 			 */
1581 			swf->swp_bitmap = kalloc_data(num_bytes_for_bitmap,
1582 			    Z_WAITOK | Z_ZERO);
1583 
1584 			swf->swp_csegs = kalloc_type(c_segment_t, swf->swp_nsegs,
1585 			    Z_WAITOK | Z_ZERO);
1586 
1587 			/*
1588 			 * passing a NULL trim_list into vnode_trim_list
1589 			 * will return ENOTSUP if trim isn't supported
1590 			 * and 0 if it is
1591 			 */
1592 			if (vnode_trim_list(swf->swp_vp, NULL, FALSE) == 0) {
1593 				swp_trim_supported = TRUE;
1594 			}
1595 
1596 			lck_mtx_lock(&vm_swap_data_lock);
1597 
1598 			swf->swp_flags = SWAP_READY;
1599 
1600 			if (swap_file_reuse == FALSE) {
1601 				queue_enter(&swf_global_queue, swf, struct swapfile*, swp_queue);
1602 			}
1603 
1604 			vm_num_swap_files++;
1605 
1606 			vm_swapfile_total_segs_alloced += swf->swp_nsegs;
1607 			if (vm_swapfile_total_segs_alloced > vm_swapfile_total_segs_alloced_max) {
1608 				vm_swapfile_total_segs_alloced_max = vm_swapfile_total_segs_alloced;
1609 			}
1610 
1611 			if (swap_file_pin == TRUE) {
1612 				vm_num_pinned_swap_files++;
1613 				swf->swp_flags |= SWAP_PINNED;
1614 				vm_swappin_avail -= swf->swp_size;
1615 			}
1616 
1617 			lck_mtx_unlock(&vm_swap_data_lock);
1618 
1619 			thread_wakeup((event_t) &vm_num_swap_files);
1620 #if !XNU_TARGET_OS_OSX
1621 			if (vm_num_swap_files == 1) {
1622 				c_overage_swapped_limit = (uint32_t)size / c_seg_bufsize;
1623 
1624 				if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
1625 					c_overage_swapped_limit /= 2;
1626 				}
1627 			}
1628 #endif /* !XNU_TARGET_OS_OSX */
1629 			break;
1630 		} else {
1631 			size = size / 2;
1632 		}
1633 	}
1634 	if (swap_file_created == FALSE) {
1635 		vm_swapfile_close((uint64_t)(swf->swp_path), swf->swp_vp);
1636 
1637 		swf->swp_vp = NULL;
1638 
1639 		if (swap_file_reuse == FALSE) {
1640 			kfree_data(swf->swp_path, swf->swp_pathlen);
1641 			kfree_type(struct swapfile, swf);
1642 		}
1643 	}
1644 	return swap_file_created;
1645 }
1646 
1647 extern void vnode_put(struct vnode* vp);
1648 kern_return_t
vm_swap_get(c_segment_t c_seg,uint64_t f_offset,uint64_t size)1649 vm_swap_get(c_segment_t c_seg, uint64_t f_offset, uint64_t size)
1650 {
1651 	struct swapfile *swf = NULL;
1652 	uint64_t        file_offset = 0;
1653 	int             retval = 0;
1654 
1655 	assert(c_seg->c_store.c_buffer);
1656 
1657 	lck_mtx_lock(&vm_swap_data_lock);
1658 
1659 	swf = vm_swapfile_for_handle(f_offset);
1660 
1661 	if (swf == NULL || (!(swf->swp_flags & SWAP_READY) && !(swf->swp_flags & SWAP_RECLAIM))) {
1662 		vm_swap_get_failures++;
1663 		retval = 1;
1664 		goto done;
1665 	}
1666 	swf->swp_io_count++;
1667 
1668 	lck_mtx_unlock(&vm_swap_data_lock);
1669 
1670 #if DEVELOPMENT || DEBUG
1671 	C_SEG_MAKE_WRITEABLE(c_seg);
1672 #endif
1673 	file_offset = (f_offset & SWAP_SLOT_MASK);
1674 
1675 	if ((retval = vnode_getwithref(swf->swp_vp)) != 0) {
1676 		printf("vm_swap_get: vnode_getwithref on swapfile failed with %d\n", retval);
1677 	} else {
1678 		retval = vm_swapfile_io(swf->swp_vp, file_offset, (uint64_t)c_seg->c_store.c_buffer, (int)(size / PAGE_SIZE_64), SWAP_READ, NULL);
1679 		vnode_put(swf->swp_vp);
1680 	}
1681 
1682 #if DEVELOPMENT || DEBUG
1683 	C_SEG_WRITE_PROTECT(c_seg);
1684 #endif
1685 	if (retval == 0) {
1686 		counter_add(&vm_statistics_swapins, size >> PAGE_SHIFT);
1687 	} else {
1688 		vm_swap_get_failures++;
1689 	}
1690 
1691 	/*
1692 	 * Free this slot in the swap structure.
1693 	 */
1694 	vm_swap_free(f_offset);
1695 
1696 	lck_mtx_lock(&vm_swap_data_lock);
1697 	swf->swp_io_count--;
1698 
1699 	if ((swf->swp_flags & SWAP_WANTED) && swf->swp_io_count == 0) {
1700 		swf->swp_flags &= ~SWAP_WANTED;
1701 		thread_wakeup((event_t) &swf->swp_flags);
1702 	}
1703 done:
1704 	lck_mtx_unlock(&vm_swap_data_lock);
1705 
1706 	if (retval == 0) {
1707 		return KERN_SUCCESS;
1708 	} else {
1709 		return KERN_FAILURE;
1710 	}
1711 }
1712 
1713 kern_return_t
vm_swap_put(vm_offset_t addr,uint64_t * f_offset,uint32_t size,c_segment_t c_seg,struct swapout_io_completion * soc)1714 vm_swap_put(vm_offset_t addr, uint64_t *f_offset, uint32_t size, c_segment_t c_seg, struct swapout_io_completion *soc)
1715 {
1716 	unsigned int    segidx = 0;
1717 	struct swapfile *swf = NULL;
1718 	uint64_t        file_offset = 0;
1719 	uint64_t        swapfile_index = 0;
1720 	unsigned int    byte_for_segidx = 0;
1721 	unsigned int    offset_within_byte = 0;
1722 	boolean_t       swf_eligible = FALSE;
1723 	boolean_t       waiting = FALSE;
1724 	boolean_t       retried = FALSE;
1725 	int             error = 0;
1726 	uint64_t        now;
1727 	void            *upl_ctx = NULL;
1728 	boolean_t       drop_iocount = FALSE;
1729 
1730 	if (addr == 0 || f_offset == NULL || compressor_store_stop_compaction) {
1731 		return KERN_FAILURE;
1732 	}
1733 retry:
1734 	lck_mtx_lock(&vm_swap_data_lock);
1735 
1736 	swf = (struct swapfile*) queue_first(&swf_global_queue);
1737 
1738 	while (queue_end(&swf_global_queue, (queue_entry_t)swf) == FALSE) {
1739 		segidx = swf->swp_free_hint;
1740 
1741 		swf_eligible =  (swf->swp_flags & SWAP_READY) && (swf->swp_nseginuse < swf->swp_nsegs);
1742 
1743 		if (swf_eligible) {
1744 			while (segidx < swf->swp_nsegs) {
1745 				byte_for_segidx = segidx >> 3;
1746 				offset_within_byte = segidx % 8;
1747 
1748 				if ((swf->swp_bitmap)[byte_for_segidx] & (1 << offset_within_byte)) {
1749 					segidx++;
1750 					continue;
1751 				}
1752 
1753 				(swf->swp_bitmap)[byte_for_segidx] |= (uint8_t)(1 << offset_within_byte);
1754 
1755 				file_offset = segidx * compressed_swap_chunk_size;
1756 				swf->swp_nseginuse++;
1757 				swf->swp_io_count++;
1758 				swf->swp_csegs[segidx] = c_seg;
1759 
1760 				swapfile_index = swf->swp_index;
1761 				vm_swapfile_total_segs_used++;
1762 				if (vm_swapfile_total_segs_used > vm_swapfile_total_segs_used_max) {
1763 					vm_swapfile_total_segs_used_max = vm_swapfile_total_segs_used;
1764 				}
1765 
1766 				now = mach_absolute_time();
1767 
1768 				if (vm_swapfile_should_create(now) && !vm_swapfile_create_thread_running) {
1769 					thread_wakeup((event_t) &vm_swapfile_create_needed);
1770 				}
1771 
1772 				lck_mtx_unlock(&vm_swap_data_lock);
1773 
1774 				goto issue_io;
1775 			}
1776 		}
1777 		swf = (struct swapfile*) queue_next(&swf->swp_queue);
1778 	}
1779 	assert(queue_end(&swf_global_queue, (queue_entry_t) swf));
1780 
1781 	/*
1782 	 * we've run out of swap segments, but may not
1783 	 * be in a position to immediately create a new swap
1784 	 * file if we've recently failed to create due to a lack
1785 	 * of free space in the root filesystem... we'll try
1786 	 * to kick that create off, but in any event we're going
1787 	 * to take a breather (up to 1 second) so that we're not caught in a tight
1788 	 * loop back in "vm_compressor_compact_and_swap" trying to stuff
1789 	 * segments into swap files only to have them immediately put back
1790 	 * on the c_age queue due to vm_swap_put failing.
1791 	 *
1792 	 * if we're doing these puts due to a hibernation flush,
1793 	 * no need to block... setting hibernate_no_swapspace to TRUE,
1794 	 * will cause "vm_compressor_compact_and_swap" to immediately abort
1795 	 */
1796 	now = mach_absolute_time();
1797 
1798 	if (vm_swapfile_should_create(now)) {
1799 		if (!vm_swapfile_create_thread_running) {
1800 			thread_wakeup((event_t) &vm_swapfile_create_needed);
1801 		}
1802 		waiting = TRUE;
1803 		assert_wait_timeout((event_t) &vm_num_swap_files, THREAD_INTERRUPTIBLE, 1000, 1000 * NSEC_PER_USEC);
1804 	} else {
1805 		if (hibernate_flushing) {
1806 			hibernate_no_swapspace = TRUE;
1807 		}
1808 	}
1809 
1810 	lck_mtx_unlock(&vm_swap_data_lock);
1811 
1812 	if (waiting == TRUE) {
1813 		thread_block(THREAD_CONTINUE_NULL);
1814 
1815 		if (retried == FALSE && hibernate_flushing == TRUE) {
1816 			retried = TRUE;
1817 			goto retry;
1818 		}
1819 	}
1820 	vm_swap_put_failures_no_swap_file++;
1821 
1822 	return KERN_FAILURE;
1823 
1824 issue_io:
1825 	assert(c_seg->c_busy_swapping);
1826 	assert(c_seg->c_busy);
1827 	assert(!c_seg->c_on_minorcompact_q);
1828 
1829 	*f_offset = (swapfile_index << SWAP_DEVICE_SHIFT) | file_offset;
1830 
1831 	if (soc) {
1832 		soc->swp_c_seg = c_seg;
1833 		soc->swp_c_size = size;
1834 
1835 		soc->swp_swf = swf;
1836 
1837 		soc->swp_io_error = 0;
1838 		soc->swp_io_done = 0;
1839 
1840 		upl_ctx = (void *)&soc->swp_upl_ctx;
1841 	}
1842 
1843 	if ((error = vnode_getwithref(swf->swp_vp)) != 0) {
1844 		printf("vm_swap_put: vnode_getwithref on swapfile failed with %d\n", error);
1845 	} else {
1846 		error = vm_swapfile_io(swf->swp_vp, file_offset, addr, (int) (size / PAGE_SIZE_64), SWAP_WRITE, upl_ctx);
1847 		drop_iocount = TRUE;
1848 	}
1849 
1850 	if (error || upl_ctx == NULL) {
1851 		return vm_swap_put_finish(swf, f_offset, error, drop_iocount);
1852 	}
1853 
1854 	return KERN_SUCCESS;
1855 }
1856 
1857 kern_return_t
vm_swap_put_finish(struct swapfile * swf,uint64_t * f_offset,int error,boolean_t drop_iocount)1858 vm_swap_put_finish(struct swapfile *swf, uint64_t *f_offset, int error, boolean_t drop_iocount)
1859 {
1860 	if (drop_iocount) {
1861 		vnode_put(swf->swp_vp);
1862 	}
1863 
1864 	lck_mtx_lock(&vm_swap_data_lock);
1865 
1866 	swf->swp_io_count--;
1867 
1868 	if ((swf->swp_flags & SWAP_WANTED) && swf->swp_io_count == 0) {
1869 		swf->swp_flags &= ~SWAP_WANTED;
1870 		thread_wakeup((event_t) &swf->swp_flags);
1871 	}
1872 	lck_mtx_unlock(&vm_swap_data_lock);
1873 
1874 	if (error) {
1875 		vm_swap_free(*f_offset);
1876 		vm_swap_put_failures++;
1877 
1878 		return KERN_FAILURE;
1879 	}
1880 	return KERN_SUCCESS;
1881 }
1882 
1883 
1884 static void
vm_swap_free_now(struct swapfile * swf,uint64_t f_offset)1885 vm_swap_free_now(struct swapfile *swf, uint64_t f_offset)
1886 {
1887 	uint64_t        file_offset = 0;
1888 	unsigned int    segidx = 0;
1889 
1890 
1891 	if ((swf->swp_flags & SWAP_READY) || (swf->swp_flags & SWAP_RECLAIM)) {
1892 		unsigned int byte_for_segidx = 0;
1893 		unsigned int offset_within_byte = 0;
1894 
1895 		file_offset = (f_offset & SWAP_SLOT_MASK);
1896 		segidx = (unsigned int) (file_offset / compressed_swap_chunk_size);
1897 
1898 		byte_for_segidx = segidx >> 3;
1899 		offset_within_byte = segidx % 8;
1900 
1901 		if ((swf->swp_bitmap)[byte_for_segidx] & (1 << offset_within_byte)) {
1902 			(swf->swp_bitmap)[byte_for_segidx] &= ~(1 << offset_within_byte);
1903 
1904 			swf->swp_csegs[segidx] = NULL;
1905 
1906 			swf->swp_nseginuse--;
1907 			vm_swapfile_total_segs_used--;
1908 
1909 			if (segidx < swf->swp_free_hint) {
1910 				swf->swp_free_hint = segidx;
1911 			}
1912 		}
1913 		if (VM_SWAP_SHOULD_RECLAIM() && !vm_swapfile_gc_thread_running) {
1914 			thread_wakeup((event_t) &vm_swapfile_gc_needed);
1915 		}
1916 	}
1917 }
1918 
1919 
/* number of slots vm_swap_free() released immediately via vm_swap_free_now() */
uint32_t vm_swap_free_now_count = 0;
/* number of slots vm_swap_free() queued on a swapfile's delayed trim list */
uint32_t vm_swap_free_delayed_count = 0;
1922 
1923 
1924 void
vm_swap_free(uint64_t f_offset)1925 vm_swap_free(uint64_t f_offset)
1926 {
1927 	struct swapfile *swf = NULL;
1928 	struct trim_list *tl = NULL;
1929 	uint64_t now;
1930 
1931 	if (swp_trim_supported == TRUE) {
1932 		tl = kalloc_type(struct trim_list, Z_WAITOK);
1933 	}
1934 
1935 	lck_mtx_lock(&vm_swap_data_lock);
1936 
1937 	swf = vm_swapfile_for_handle(f_offset);
1938 
1939 	if (swf && (swf->swp_flags & (SWAP_READY | SWAP_RECLAIM))) {
1940 		if (swp_trim_supported == FALSE || (swf->swp_flags & SWAP_RECLAIM)) {
1941 			/*
1942 			 * don't delay the free if the underlying disk doesn't support
1943 			 * trim, or we're in the midst of reclaiming this swap file since
1944 			 * we don't want to move segments that are technically free
1945 			 * but not yet handled by the delayed free mechanism
1946 			 */
1947 			vm_swap_free_now(swf, f_offset);
1948 
1949 			vm_swap_free_now_count++;
1950 			goto done;
1951 		}
1952 		tl->tl_offset = f_offset & SWAP_SLOT_MASK;
1953 		tl->tl_length = compressed_swap_chunk_size;
1954 
1955 		tl->tl_next = swf->swp_delayed_trim_list_head;
1956 		swf->swp_delayed_trim_list_head = tl;
1957 		swf->swp_delayed_trim_count++;
1958 		tl = NULL;
1959 
1960 		if (VM_SWAP_SHOULD_TRIM(swf) && !vm_swapfile_create_thread_running) {
1961 			now = mach_absolute_time();
1962 
1963 			if (now > dont_trim_until_ts) {
1964 				thread_wakeup((event_t) &vm_swapfile_create_needed);
1965 			}
1966 		}
1967 		vm_swap_free_delayed_count++;
1968 	}
1969 done:
1970 	lck_mtx_unlock(&vm_swap_data_lock);
1971 
1972 	if (tl != NULL) {
1973 		kfree_type(struct trim_list, tl);
1974 	}
1975 }
1976 
1977 
1978 static void
vm_swap_wait_on_trim_handling_in_progress()1979 vm_swap_wait_on_trim_handling_in_progress()
1980 {
1981 	while (delayed_trim_handling_in_progress == TRUE) {
1982 		assert_wait((event_t) &delayed_trim_handling_in_progress, THREAD_UNINT);
1983 		lck_mtx_unlock(&vm_swap_data_lock);
1984 
1985 		thread_block(THREAD_CONTINUE_NULL);
1986 
1987 		lck_mtx_lock(&vm_swap_data_lock);
1988 	}
1989 }
1990 
1991 
1992 static void
vm_swap_handle_delayed_trims(boolean_t force_now)1993 vm_swap_handle_delayed_trims(boolean_t force_now)
1994 {
1995 	struct swapfile *swf = NULL;
1996 
1997 	/*
1998 	 * serialize the race between us and vm_swap_reclaim...
1999 	 * if vm_swap_reclaim wins it will turn off SWAP_READY
2000 	 * on the victim it has chosen... we can just skip over
2001 	 * that file since vm_swap_reclaim will first process
2002 	 * all of the delayed trims associated with it
2003 	 */
2004 
2005 	if (compressor_store_stop_compaction == TRUE) {
2006 		return;
2007 	}
2008 
2009 	lck_mtx_lock(&vm_swap_data_lock);
2010 
2011 	delayed_trim_handling_in_progress = TRUE;
2012 
2013 	lck_mtx_unlock(&vm_swap_data_lock);
2014 
2015 	/*
2016 	 * no need to hold the lock to walk the swf list since
2017 	 * vm_swap_create (the only place where we add to this list)
2018 	 * is run on the same thread as this function
2019 	 * and vm_swap_reclaim doesn't remove items from this list
2020 	 * instead marking them with SWAP_REUSE for future re-use
2021 	 */
2022 	swf = (struct swapfile*) queue_first(&swf_global_queue);
2023 
2024 	while (queue_end(&swf_global_queue, (queue_entry_t)swf) == FALSE) {
2025 		if ((swf->swp_flags & SWAP_READY) && (force_now == TRUE || VM_SWAP_SHOULD_TRIM(swf))) {
2026 			assert(!(swf->swp_flags & SWAP_RECLAIM));
2027 			vm_swap_do_delayed_trim(swf);
2028 		}
2029 		swf = (struct swapfile*) queue_next(&swf->swp_queue);
2030 	}
2031 	lck_mtx_lock(&vm_swap_data_lock);
2032 
2033 	delayed_trim_handling_in_progress = FALSE;
2034 	thread_wakeup((event_t) &delayed_trim_handling_in_progress);
2035 
2036 	if (VM_SWAP_SHOULD_RECLAIM() && !vm_swapfile_gc_thread_running) {
2037 		thread_wakeup((event_t) &vm_swapfile_gc_needed);
2038 	}
2039 
2040 	lck_mtx_unlock(&vm_swap_data_lock);
2041 }
2042 
2043 static void
vm_swap_do_delayed_trim(struct swapfile * swf)2044 vm_swap_do_delayed_trim(struct swapfile *swf)
2045 {
2046 	struct trim_list *tl, *tl_head;
2047 	int error;
2048 
2049 	if (compressor_store_stop_compaction == TRUE) {
2050 		return;
2051 	}
2052 
2053 	if ((error = vnode_getwithref(swf->swp_vp)) != 0) {
2054 		printf("vm_swap_do_delayed_trim: vnode_getwithref on swapfile failed with %d\n", error);
2055 		return;
2056 	}
2057 
2058 	lck_mtx_lock(&vm_swap_data_lock);
2059 
2060 	tl_head = swf->swp_delayed_trim_list_head;
2061 	swf->swp_delayed_trim_list_head = NULL;
2062 	swf->swp_delayed_trim_count = 0;
2063 
2064 	lck_mtx_unlock(&vm_swap_data_lock);
2065 
2066 	vnode_trim_list(swf->swp_vp, tl_head, TRUE);
2067 
2068 	(void) vnode_put(swf->swp_vp);
2069 
2070 	while ((tl = tl_head) != NULL) {
2071 		unsigned int    segidx = 0;
2072 		unsigned int    byte_for_segidx = 0;
2073 		unsigned int    offset_within_byte = 0;
2074 
2075 		lck_mtx_lock(&vm_swap_data_lock);
2076 
2077 		segidx = (unsigned int) (tl->tl_offset / compressed_swap_chunk_size);
2078 
2079 		byte_for_segidx = segidx >> 3;
2080 		offset_within_byte = segidx % 8;
2081 
2082 		if ((swf->swp_bitmap)[byte_for_segidx] & (1 << offset_within_byte)) {
2083 			(swf->swp_bitmap)[byte_for_segidx] &= ~(1 << offset_within_byte);
2084 
2085 			swf->swp_csegs[segidx] = NULL;
2086 
2087 			swf->swp_nseginuse--;
2088 			vm_swapfile_total_segs_used--;
2089 
2090 			if (segidx < swf->swp_free_hint) {
2091 				swf->swp_free_hint = segidx;
2092 			}
2093 		}
2094 		lck_mtx_unlock(&vm_swap_data_lock);
2095 
2096 		tl_head = tl->tl_next;
2097 
2098 		kfree_type(struct trim_list, tl);
2099 	}
2100 }
2101 
2102 
/*
 * Intentionally a no-op.
 */
void
vm_swap_flush()
{
	/* nothing to flush */
}
2108 
/* number of times vm_swap_reclaim() stopped relocating segments before reaching the end of the swapfile */
int     vm_swap_reclaim_yielded = 0;
2110 
2111 void
vm_swap_reclaim(void)2112 vm_swap_reclaim(void)
2113 {
2114 	vm_offset_t     addr = 0;
2115 	unsigned int    segidx = 0;
2116 	uint64_t        f_offset = 0;
2117 	struct swapfile *swf = NULL;
2118 	struct swapfile *smallest_swf = NULL;
2119 	unsigned int    min_nsegs = 0;
2120 	unsigned int    byte_for_segidx = 0;
2121 	unsigned int    offset_within_byte = 0;
2122 	uint32_t        c_size = 0;
2123 
2124 	c_segment_t     c_seg = NULL;
2125 
2126 	kmem_alloc(compressor_map, (vm_offset_t *)&addr, c_seg_bufsize,
2127 	    KMA_NOFAIL | KMA_KOBJECT | KMA_DATA, VM_KERN_MEMORY_COMPRESSOR);
2128 
2129 	lck_mtx_lock(&vm_swap_data_lock);
2130 
2131 	/*
2132 	 * if we're running the swapfile list looking for
2133 	 * candidates with delayed trims, we need to
2134 	 * wait before making our decision concerning
2135 	 * the swapfile we want to reclaim
2136 	 */
2137 	vm_swap_wait_on_trim_handling_in_progress();
2138 
2139 	/*
2140 	 * from here until we knock down the SWAP_READY bit,
2141 	 * we need to remain behind the vm_swap_data_lock...
2142 	 * once that bit has been turned off, "vm_swap_handle_delayed_trims"
2143 	 * will not consider this swapfile for processing
2144 	 */
2145 	swf = (struct swapfile*) queue_first(&swf_global_queue);
2146 	min_nsegs = MAX_SWAP_FILE_SIZE / compressed_swap_chunk_size;
2147 	smallest_swf = NULL;
2148 
2149 	while (queue_end(&swf_global_queue, (queue_entry_t)swf) == FALSE) {
2150 		if ((swf->swp_flags & SWAP_READY) && (swf->swp_nseginuse <= min_nsegs)) {
2151 			smallest_swf = swf;
2152 			min_nsegs = swf->swp_nseginuse;
2153 		}
2154 		swf = (struct swapfile*) queue_next(&swf->swp_queue);
2155 	}
2156 
2157 	if (smallest_swf == NULL) {
2158 		goto done;
2159 	}
2160 
2161 	swf = smallest_swf;
2162 
2163 
2164 	swf->swp_flags &= ~SWAP_READY;
2165 	swf->swp_flags |= SWAP_RECLAIM;
2166 
2167 	if (swf->swp_delayed_trim_count) {
2168 		lck_mtx_unlock(&vm_swap_data_lock);
2169 
2170 		vm_swap_do_delayed_trim(swf);
2171 
2172 		lck_mtx_lock(&vm_swap_data_lock);
2173 	}
2174 	segidx = 0;
2175 
2176 	while (segidx < swf->swp_nsegs) {
2177 ReTry_for_cseg:
2178 		/*
2179 		 * Wait for outgoing I/Os.
2180 		 */
2181 		while (swf->swp_io_count) {
2182 			swf->swp_flags |= SWAP_WANTED;
2183 
2184 			assert_wait((event_t) &swf->swp_flags, THREAD_UNINT);
2185 			lck_mtx_unlock(&vm_swap_data_lock);
2186 
2187 			thread_block(THREAD_CONTINUE_NULL);
2188 
2189 			lck_mtx_lock(&vm_swap_data_lock);
2190 		}
2191 		if (compressor_store_stop_compaction == TRUE || VM_SWAP_SHOULD_ABORT_RECLAIM() || VM_SWAP_BUSY()) {
2192 			vm_swap_reclaim_yielded++;
2193 			break;
2194 		}
2195 
2196 		byte_for_segidx = segidx >> 3;
2197 		offset_within_byte = segidx % 8;
2198 
2199 		if (((swf->swp_bitmap)[byte_for_segidx] & (1 << offset_within_byte)) == 0) {
2200 			segidx++;
2201 			continue;
2202 		}
2203 
2204 		c_seg = swf->swp_csegs[segidx];
2205 		assert(c_seg);
2206 
2207 		lck_mtx_lock_spin_always(&c_seg->c_lock);
2208 
2209 		if (c_seg->c_busy) {
2210 			/*
2211 			 * a swapped out c_segment in the process of being freed will remain in the
2212 			 * busy state until after the vm_swap_free is called on it... vm_swap_free
2213 			 * takes the vm_swap_data_lock, so can't change the swap state until after
2214 			 * we drop the vm_swap_data_lock... once we do, vm_swap_free will complete
2215 			 * which will allow c_seg_free_locked to clear busy and wake up this thread...
2216 			 * at that point, we re-look up the swap state which will now indicate that
2217 			 * this c_segment no longer exists.
2218 			 */
2219 			c_seg->c_wanted = 1;
2220 
2221 			assert_wait((event_t) (c_seg), THREAD_UNINT);
2222 			lck_mtx_unlock_always(&c_seg->c_lock);
2223 
2224 			lck_mtx_unlock(&vm_swap_data_lock);
2225 
2226 			thread_block(THREAD_CONTINUE_NULL);
2227 
2228 			lck_mtx_lock(&vm_swap_data_lock);
2229 
2230 			goto ReTry_for_cseg;
2231 		}
2232 		(swf->swp_bitmap)[byte_for_segidx] &= ~(1 << offset_within_byte);
2233 
2234 		f_offset = segidx * compressed_swap_chunk_size;
2235 
2236 		assert(c_seg == swf->swp_csegs[segidx]);
2237 		swf->swp_csegs[segidx] = NULL;
2238 		swf->swp_nseginuse--;
2239 
2240 		vm_swapfile_total_segs_used--;
2241 
2242 		lck_mtx_unlock(&vm_swap_data_lock);
2243 
2244 		assert(C_SEG_IS_ONDISK(c_seg));
2245 
2246 		C_SEG_BUSY(c_seg);
2247 		c_seg->c_busy_swapping = 1;
2248 #if !CHECKSUM_THE_SWAP
2249 		c_seg_trim_tail(c_seg);
2250 #endif
2251 		c_size = round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset));
2252 
2253 		assert(c_size <= c_seg_bufsize && c_size);
2254 
2255 		lck_mtx_unlock_always(&c_seg->c_lock);
2256 
2257 		if (vnode_getwithref(swf->swp_vp)) {
2258 			printf("vm_swap_reclaim: vnode_getwithref on swapfile failed.\n");
2259 			vm_swap_get_failures++;
2260 			goto swap_io_failed;
2261 		} else {
2262 			if (vm_swapfile_io(swf->swp_vp, f_offset, addr, (int)(c_size / PAGE_SIZE_64), SWAP_READ, NULL)) {
2263 				/*
2264 				 * reading the data back in failed, so convert c_seg
2265 				 * to a swapped in c_segment that contains no data
2266 				 */
2267 				c_seg_swapin_requeue(c_seg, FALSE, TRUE, FALSE);
2268 				/*
2269 				 * returns with c_busy_swapping cleared
2270 				 */
2271 				vnode_put(swf->swp_vp);
2272 				vm_swap_get_failures++;
2273 				goto swap_io_failed;
2274 			}
2275 			vnode_put(swf->swp_vp);
2276 		}
2277 
2278 		counter_add(&vm_statistics_swapins, c_size >> PAGE_SHIFT);
2279 		vmcs_stats.reclaim_swapins += c_size >> PAGE_SHIFT;
2280 
2281 		if (vm_swap_put(addr, &f_offset, c_size, c_seg, NULL)) {
2282 			vm_offset_t     c_buffer;
2283 
2284 			/*
2285 			 * the put failed, so convert c_seg to a fully swapped in c_segment
2286 			 * with valid data
2287 			 */
2288 			c_buffer = (vm_offset_t)C_SEG_BUFFER_ADDRESS(c_seg->c_mysegno);
2289 
2290 			kernel_memory_populate(c_buffer, c_size,
2291 			    KMA_NOFAIL | KMA_COMPRESSOR,
2292 			    VM_KERN_MEMORY_COMPRESSOR);
2293 
2294 			memcpy((char *)c_buffer, (char *)addr, c_size);
2295 
2296 			c_seg->c_store.c_buffer = (int32_t *)c_buffer;
2297 #if ENCRYPTED_SWAP
2298 			vm_swap_decrypt(c_seg);
2299 #endif /* ENCRYPTED_SWAP */
2300 			c_seg_swapin_requeue(c_seg, TRUE, TRUE, FALSE);
2301 			/*
2302 			 * returns with c_busy_swapping cleared
2303 			 */
2304 			OSAddAtomic64(c_seg->c_bytes_used, &compressor_bytes_used);
2305 
2306 			goto swap_io_failed;
2307 		}
2308 		counter_add(&vm_statistics_swapouts, c_size >> PAGE_SHIFT);
2309 
2310 		lck_mtx_lock_spin_always(&c_seg->c_lock);
2311 
2312 		c_seg->c_swappedin = false;
2313 
2314 		assert(C_SEG_IS_ONDISK(c_seg));
2315 		/*
2316 		 * The c_seg will now know about the new location on disk.
2317 		 */
2318 		c_seg->c_store.c_swap_handle = f_offset;
2319 
2320 		assert(c_seg->c_busy_swapping);
2321 		c_seg->c_busy_swapping = 0;
2322 swap_io_failed:
2323 		assert(c_seg->c_busy);
2324 		C_SEG_WAKEUP_DONE(c_seg);
2325 
2326 		lck_mtx_unlock_always(&c_seg->c_lock);
2327 		lck_mtx_lock(&vm_swap_data_lock);
2328 	}
2329 
2330 	if (swf->swp_nseginuse) {
2331 		swf->swp_flags &= ~SWAP_RECLAIM;
2332 		swf->swp_flags |= SWAP_READY;
2333 
2334 		goto done;
2335 	}
2336 	/*
2337 	 * We don't remove this inactive swf from the queue.
2338 	 * That way, we can re-use it when needed again and
2339 	 * preserve the namespace. The delayed_trim processing
2340 	 * is also dependent on us not removing swfs from the queue.
2341 	 */
2342 	//queue_remove(&swf_global_queue, swf, struct swapfile*, swp_queue);
2343 
2344 	vm_swapfile_total_segs_alloced -= swf->swp_nsegs;
2345 
2346 	lck_mtx_unlock(&vm_swap_data_lock);
2347 
2348 	vm_swapfile_close((uint64_t)(swf->swp_path), swf->swp_vp);
2349 
2350 	kfree_type(c_segment_t, swf->swp_nsegs, swf->swp_csegs);
2351 	kfree_data(swf->swp_bitmap, MAX((swf->swp_nsegs >> 3), 1));
2352 
2353 	lck_mtx_lock(&vm_swap_data_lock);
2354 
2355 	if (swf->swp_flags & SWAP_PINNED) {
2356 		vm_num_pinned_swap_files--;
2357 		vm_swappin_avail += swf->swp_size;
2358 	}
2359 
2360 	swf->swp_vp = NULL;
2361 	swf->swp_size = 0;
2362 	swf->swp_free_hint = 0;
2363 	swf->swp_nsegs = 0;
2364 	swf->swp_flags = SWAP_REUSE;
2365 
2366 	vm_num_swap_files--;
2367 
2368 done:
2369 	thread_wakeup((event_t) &swf->swp_flags);
2370 	lck_mtx_unlock(&vm_swap_data_lock);
2371 
2372 	kmem_free(compressor_map, (vm_offset_t) addr, c_seg_bufsize);
2373 }
2374 
2375 
2376 uint64_t
vm_swap_get_total_space(void)2377 vm_swap_get_total_space(void)
2378 {
2379 	uint64_t total_space = 0;
2380 
2381 	total_space = (uint64_t)vm_swapfile_total_segs_alloced * compressed_swap_chunk_size;
2382 
2383 	return total_space;
2384 }
2385 
2386 uint64_t
vm_swap_get_used_space(void)2387 vm_swap_get_used_space(void)
2388 {
2389 	uint64_t used_space = 0;
2390 
2391 	used_space = (uint64_t)vm_swapfile_total_segs_used * compressed_swap_chunk_size;
2392 
2393 	return used_space;
2394 }
2395 
/*
 * Return the number of bytes of allocated swap space that are not in
 * use (total space minus used space).
 */
uint64_t
vm_swap_get_free_space(void)
{
	uint64_t total_bytes = vm_swap_get_total_space();
	uint64_t used_bytes = vm_swap_get_used_space();

	return total_bytes - used_bytes;
}
2401 
2402 uint64_t
vm_swap_get_max_configured_space(void)2403 vm_swap_get_max_configured_space(void)
2404 {
2405 	int num_swap_files = (vm_num_swap_files_config ? vm_num_swap_files_config : VM_MAX_SWAP_FILE_NUM);
2406 	return num_swap_files * MAX_SWAP_FILE_SIZE;
2407 }
2408 
2409 bool
vm_swap_low_on_space(void)2410 vm_swap_low_on_space(void)
2411 {
2412 	if (vm_num_swap_files == 0 && vm_swapfile_can_be_created == FALSE) {
2413 		return false;
2414 	}
2415 
2416 	if (((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) < ((unsigned int)vm_swapfile_hiwater_segs) / 8)) {
2417 		if (vm_num_swap_files == 0 && !SWAPPER_NEEDS_TO_UNTHROTTLE()) {
2418 			return false;
2419 		}
2420 
2421 		if (vm_swapfile_last_failed_to_create_ts >= vm_swapfile_last_successful_create_ts) {
2422 			return true;
2423 		}
2424 	}
2425 	return false;
2426 }
2427 
2428 int
vm_swap_out_of_space(void)2429 vm_swap_out_of_space(void)
2430 {
2431 	if ((vm_num_swap_files == vm_num_swap_files_config) &&
2432 	    ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) < VM_SWAPOUT_LIMIT_MAX)) {
2433 		/*
2434 		 * Last swapfile and we have only space for the
2435 		 * last few swapouts.
2436 		 */
2437 		return 1;
2438 	}
2439 
2440 	return 0;
2441 }
2442 
2443 boolean_t
vm_swap_files_pinned(void)2444 vm_swap_files_pinned(void)
2445 {
2446 	boolean_t result;
2447 
2448 	if (vm_swappin_enabled == FALSE) {
2449 		return TRUE;
2450 	}
2451 
2452 	result = (vm_num_pinned_swap_files == vm_num_swap_files);
2453 
2454 	return result;
2455 }
2456 
#if CONFIG_FREEZE
/*
 * Query the swap volume for its freeze budget.
 *
 * freeze_daily_budget: passed through to vm_swap_vol_get_budget();
 *                      set to 0 here if no swapfile exists yet and a
 *                      temporary one can't be opened.
 *
 * Returns TRUE if vm_swap_vol_get_budget() succeeded (returned 0),
 * FALSE otherwise.
 */
boolean_t
vm_swap_max_budget(uint64_t *freeze_daily_budget)
{
	boolean_t       got_device_value = FALSE;
	struct swapfile *swf = NULL;

	if (vm_num_swap_files == 0) {
		/*
		 * This block is used for the initial budget value before any swap files
		 * are created. We create a temp swap file to get the budget.
		 */
		struct vnode *probe_vp = NULL;

		vm_swapfile_open(swapfilename, &probe_vp);

		if (probe_vp == NULL) {
			*freeze_daily_budget = 0;
			return got_device_value;
		}

		if (vm_swap_vol_get_budget(probe_vp, freeze_daily_budget) == 0) {
			got_device_value = TRUE;
		}

		vm_swapfile_close((uint64_t)&swapfilename, probe_vp);
		probe_vp = NULL;

		return got_device_value;
	}

	lck_mtx_lock(&vm_swap_data_lock);

	swf = (struct swapfile*) queue_first(&swf_global_queue);

	if (swf) {
		/* ask the first swapfile whose flags are exactly SWAP_READY */
		for (; queue_end(&swf_global_queue, (queue_entry_t)swf) == FALSE;
		    swf = (struct swapfile*) queue_next(&swf->swp_queue)) {
			if (swf->swp_flags != SWAP_READY) {
				continue;
			}
			assert(swf->swp_vp);

			if (vm_swap_vol_get_budget(swf->swp_vp, freeze_daily_budget) == 0) {
				got_device_value = TRUE;
			}
			break;
		}
	}

	lck_mtx_unlock(&vm_swap_data_lock);

	return got_device_value;
}
#endif /* CONFIG_FREEZE */
2509 
2510 void
vm_swap_reset_max_segs_tracking(uint64_t * alloced_max,uint64_t * used_max)2511 vm_swap_reset_max_segs_tracking(uint64_t *alloced_max, uint64_t *used_max)
2512 {
2513 	lck_mtx_lock(&vm_swap_data_lock);
2514 
2515 	*alloced_max = (uint64_t) vm_swapfile_total_segs_alloced_max * compressed_swap_chunk_size;
2516 	*used_max = (uint64_t) vm_swapfile_total_segs_used_max * compressed_swap_chunk_size;
2517 
2518 	vm_swapfile_total_segs_alloced_max = vm_swapfile_total_segs_alloced;
2519 	vm_swapfile_total_segs_used_max = vm_swapfile_total_segs_used;
2520 
2521 	lck_mtx_unlock(&vm_swap_data_lock);
2522 }
2523