xref: /xnu-10002.61.3/osfmk/vm/vm_compressor_backing_store.c (revision 0f4c859e951fba394238ab619495c4e1d54d0f34)
1 /*
2  * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include "vm_compressor_backing_store.h"
30 #include <vm/vm_pageout.h>
31 #include <vm/vm_protos.h>
32 
33 #include <IOKit/IOHibernatePrivate.h>
34 
35 #include <kern/policy_internal.h>
36 
37 LCK_GRP_DECLARE(vm_swap_data_lock_grp, "vm_swap_data");
38 LCK_MTX_DECLARE(vm_swap_data_lock, &vm_swap_data_lock_grp);
39 
40 #if defined(XNU_TARGET_OS_OSX)
41 /*
42  * launchd explicitly turns ON swap later during boot on macOS devices.
43  */
44 boolean_t       compressor_store_stop_compaction = TRUE;
45 #else
46 boolean_t       compressor_store_stop_compaction = FALSE;
47 #endif
48 
49 boolean_t       vm_swapfile_create_needed = FALSE;
50 boolean_t       vm_swapfile_gc_needed = FALSE;
51 
52 int             vm_swapper_throttle = -1;
53 uint64_t        vm_swapout_thread_id;
54 
55 uint64_t        vm_swap_put_failures = 0; /* Likely failed I/O. Data is still in memory. */
56 uint64_t        vm_swap_get_failures = 0; /* Fatal */
57 uint64_t        vm_swap_put_failures_no_swap_file = 0; /* Possibly not fatal because we might just need a new swapfile. */
58 int             vm_num_swap_files_config = 0;
59 int             vm_num_swap_files = 0;
60 int             vm_num_pinned_swap_files = 0;
61 uint64_t        vm_swap_volume_capacity = 0;
62 int             vm_swapout_thread_processed_segments = 0;
63 int             vm_swapout_thread_awakened = 0;
64 bool            vm_swapout_thread_running = FALSE;
65 _Atomic bool    vm_swapout_wake_pending = false;
66 int             vm_swapfile_create_thread_awakened = 0;
67 int             vm_swapfile_create_thread_running = 0;
68 int             vm_swapfile_gc_thread_awakened = 0;
69 int             vm_swapfile_gc_thread_running = 0;
70 
71 int64_t         vm_swappin_avail = 0;
72 boolean_t       vm_swappin_enabled = FALSE;
73 unsigned int    vm_swapfile_total_segs_alloced = 0;
74 unsigned int    vm_swapfile_total_segs_alloced_max = 0;
75 unsigned int    vm_swapfile_total_segs_used = 0;
76 unsigned int    vm_swapfile_total_segs_used_max = 0;
77 
78 char            swapfilename[MAX_SWAPFILENAME_LEN + 1] = SWAP_FILE_NAME;
79 
80 extern vm_map_t compressor_map;
81 extern uint32_t c_seg_bufsize, c_seg_allocsize, c_seg_off_limit;
82 
83 #define SWAP_READY      0x1     /* Swap file is ready to be used */
84 #define SWAP_RECLAIM    0x2     /* Swap file is marked to be reclaimed */
85 #define SWAP_WANTED     0x4     /* Swap file has waiters */
86 #define SWAP_REUSE      0x8     /* Swap file is on the Q and has a name. Reuse after init-ing.*/
87 #define SWAP_PINNED     0x10    /* Swap file is pinned (FusionDrive) */
88 
89 
90 struct swapfile {
91 	queue_head_t            swp_queue;      /* list of swap files */
92 	char                    *swp_path;      /* saved pathname of swap file */
93 	struct vnode            *swp_vp;        /* backing vnode */
94 	uint64_t                swp_size;       /* size of this swap file */
95 	uint8_t                 *swp_bitmap;    /* bitmap showing the alloced/freed slots in the swap file */
96 	unsigned int            swp_pathlen;    /* length of pathname */
97 	unsigned int            swp_nsegs;      /* #segments we can use */
98 	unsigned int            swp_nseginuse;  /* #segments in use */
99 	unsigned int            swp_index;      /* index of this swap file */
100 	unsigned int            swp_flags;      /* state of swap file */
101 	unsigned int            swp_free_hint;  /* offset of 1st free chunk */
102 	unsigned int            swp_io_count;   /* count of outstanding I/Os */
103 	c_segment_t             *swp_csegs;     /* back pointers to the c_segments. Used during swap reclaim. */
104 
105 	struct trim_list        *swp_delayed_trim_list_head;
106 	unsigned int            swp_delayed_trim_count;
107 };
108 
109 queue_head_t    swf_global_queue;
110 boolean_t       swp_trim_supported = FALSE;
111 
112 extern clock_sec_t      dont_trim_until_ts;
113 clock_sec_t             vm_swapfile_last_failed_to_create_ts = 0;
114 clock_sec_t             vm_swapfile_last_successful_create_ts = 0;
115 int                     vm_swapfile_can_be_created = FALSE;
116 boolean_t               delayed_trim_handling_in_progress = FALSE;
117 
118 boolean_t               hibernate_in_progress_with_pinned_swap = FALSE;
119 
120 static void vm_swapout_thread_throttle_adjust(void);
121 static void vm_swap_free_now(struct swapfile *swf, uint64_t f_offset);
122 void vm_swapout_thread(void);
123 static void vm_swapfile_create_thread(void);
124 static void vm_swapfile_gc_thread(void);
125 static void vm_swap_defragment(void);
126 static void vm_swap_handle_delayed_trims(boolean_t);
127 static void vm_swap_do_delayed_trim(struct swapfile *);
128 static void vm_swap_wait_on_trim_handling_in_progress(void);
129 static void vm_swapout_finish(c_segment_t c_seg, uint64_t f_offset, uint32_t size, kern_return_t kr);
130 
131 extern int vnode_getwithref(struct vnode* vp);
132 
133 boolean_t vm_swap_force_defrag = FALSE, vm_swap_force_reclaim = FALSE;
134 
#if !XNU_TARGET_OS_OSX

/*
 * For CONFIG_FREEZE, we scale the c_segments_limit based on the
 * number of swapfiles allowed. That increases wired memory overhead.
 * So we want to keep the max swapfiles same on both DEV/RELEASE so
 * that the memory overhead is similar for performance comparisons.
 */
#define VM_MAX_SWAP_FILE_NUM            5
#if defined(__arm64__) && defined(ARM_LARGE_MEMORY)
#define VM_MAX_SWAP_FILE_SWAP_ENABLED_NUM (64ULL * (1ULL << 30) / MAX_SWAP_FILE_SIZE)
#define VM_MIN_SWAP_FILE_SWAP_ENABLED_NUM (16ULL * (1ULL << 30) / MAX_SWAP_FILE_SIZE)
#else /* defined(__arm64__) && defined(ARM_LARGE_MEMORY) */
/*
 * We reserve compressor pool VA at boot for the max # of swap files. If someone
 * has enabled app swap but we're not an arm large memory device we can't hog
 * all of the VA so we only go up to 4GB.
 */
#define VM_MAX_SWAP_FILE_SWAP_ENABLED_NUM (4ULL * (1ULL << 30) / MAX_SWAP_FILE_SIZE)
#define VM_MIN_SWAP_FILE_SWAP_ENABLED_NUM (4ULL * (1ULL << 30) / MAX_SWAP_FILE_SIZE)
#endif /* defined(__arm64__) && defined(ARM_LARGE_MEMORY) */
#define VM_SWAP_MIN_VOLUME_CAPACITY (128ULL * (1ULL << 30))

#define VM_SWAPFILE_DELAYED_TRIM_MAX    4

#define VM_SWAP_SHOULD_DEFRAGMENT()     (((vm_swap_force_defrag == TRUE) || (c_swappedout_sparse_count > (vm_swapfile_total_segs_used / 16))) ? 1 : 0)
#define VM_SWAP_SHOULD_PIN(_size)       FALSE
#define VM_SWAP_SHOULD_CREATE(cur_ts)   ((vm_num_swap_files < vm_num_swap_files_config) && ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) < (unsigned int)vm_swapfile_hiwater_segs) && \
	                                 (((cur_ts) - vm_swapfile_last_failed_to_create_ts) > VM_SWAPFILE_DELAYED_CREATE) ? 1 : 0)
#define VM_SWAP_SHOULD_TRIM(swf)        (((swf)->swp_delayed_trim_count >= VM_SWAPFILE_DELAYED_TRIM_MAX) ? 1 : 0)

#else /* !XNU_TARGET_OS_OSX */

#define VM_MAX_SWAP_FILE_NUM            100
#define VM_SWAPFILE_DELAYED_TRIM_MAX    128

#define VM_SWAP_SHOULD_DEFRAGMENT()     (((vm_swap_force_defrag == TRUE) || (c_swappedout_sparse_count > (vm_swapfile_total_segs_used / 4))) ? 1 : 0)
#define VM_SWAP_SHOULD_PIN(_size)       (vm_swappin_avail > 0 && vm_swappin_avail >= (int64_t)(_size))
#define VM_SWAP_SHOULD_CREATE(cur_ts)   ((vm_num_swap_files < vm_num_swap_files_config) && ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) < (unsigned int)vm_swapfile_hiwater_segs) && \
	                                 (((cur_ts) - vm_swapfile_last_failed_to_create_ts) > VM_SWAPFILE_DELAYED_CREATE) ? 1 : 0)
#define VM_SWAP_SHOULD_TRIM(swf)        (((swf)->swp_delayed_trim_count >= VM_SWAPFILE_DELAYED_TRIM_MAX) ? 1 : 0)

#endif /* !XNU_TARGET_OS_OSX */

/*
 * Reclaim starts once the count of allocated-but-unused segments reaches
 * swapfile_reclaim_threshold_segs (or a reclaim was forced) and is abandoned
 * once that count drops to swapfile_reclam_minimum_segs (unless forced).
 */
#define VM_SWAP_SHOULD_RECLAIM()        (((vm_swap_force_reclaim == TRUE) || ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) >= swapfile_reclaim_threshold_segs)) ? 1 : 0)
#define VM_SWAP_SHOULD_ABORT_RECLAIM()  (((vm_swap_force_reclaim == FALSE) && ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) <= swapfile_reclam_minimum_segs)) ? 1 : 0)
/* Compared against a difference of clock_sec_t values, i.e. seconds. */
#define VM_SWAPFILE_DELAYED_CREATE      15

/* Non-zero while swapouts are queued and the swapper runs at compressor tier 0. */
#define VM_SWAP_BUSY()  (((c_early_swapout_count + c_regular_swapout_count + c_late_swapout_count) && (vm_swapper_throttle == THROTTLE_LEVEL_COMPRESSOR_TIER0)) ? 1 : 0)
184 
185 
#if CHECKSUM_THE_SWAP
extern unsigned int hash_string(char *cp, int len);
#endif

#if RECORD_THE_COMPRESSED_DATA
/* State for the optional on-disk record of compressed data. */
boolean_t       c_compressed_record_init_done = FALSE;
int             c_compressed_record_write_error = 0;
struct vnode    *c_compressed_record_vp = NULL;
uint64_t        c_compressed_record_file_offset = 0;
void    c_compressed_record_init(void);
void    c_compressed_record_write(char *, int);
#endif

extern void                     vm_pageout_io_throttle(void);

static struct swapfile *vm_swapfile_for_handle(uint64_t);
202 
203 /*
204  * Called with the vm_swap_data_lock held.
205  */
206 
207 static struct swapfile *
vm_swapfile_for_handle(uint64_t f_offset)208 vm_swapfile_for_handle(uint64_t f_offset)
209 {
210 	uint64_t                file_offset = 0;
211 	unsigned int            swapfile_index = 0;
212 	struct swapfile*        swf = NULL;
213 
214 	file_offset = (f_offset & SWAP_SLOT_MASK);
215 	swapfile_index = (f_offset >> SWAP_DEVICE_SHIFT);
216 
217 	swf = (struct swapfile*) queue_first(&swf_global_queue);
218 
219 	while (queue_end(&swf_global_queue, (queue_entry_t)swf) == FALSE) {
220 		if (swapfile_index == swf->swp_index) {
221 			break;
222 		}
223 
224 		swf = (struct swapfile*) queue_next(&swf->swp_queue);
225 	}
226 
227 	if (queue_end(&swf_global_queue, (queue_entry_t) swf)) {
228 		swf = NULL;
229 	}
230 
231 	return swf;
232 }
233 
234 #if ENCRYPTED_SWAP
235 
236 #include <libkern/crypto/aesxts.h>
237 
238 extern int cc_rand_generate(void *, size_t);     /* from libkern/cyrpto/rand.h> */
239 
240 boolean_t       swap_crypt_initialized;
241 void            swap_crypt_initialize(void);
242 
243 symmetric_xts   xts_modectx;
244 uint32_t        swap_crypt_key1[8];   /* big enough for a 256 bit random key */
245 uint32_t        swap_crypt_key2[8];   /* big enough for a 256 bit random key */
246 
247 #if DEVELOPMENT || DEBUG
248 boolean_t       swap_crypt_xts_tested = FALSE;
249 unsigned char   swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
250 unsigned char   swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
251 unsigned char   swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
252 #endif /* DEVELOPMENT || DEBUG */
253 
254 unsigned long   vm_page_encrypt_counter;
255 unsigned long   vm_page_decrypt_counter;
256 
257 
258 void
swap_crypt_initialize(void)259 swap_crypt_initialize(void)
260 {
261 	uint8_t  *enckey1, *enckey2;
262 	int      keylen1, keylen2;
263 	int      error;
264 
265 	assert(swap_crypt_initialized == FALSE);
266 
267 	keylen1 = sizeof(swap_crypt_key1);
268 	enckey1 = (uint8_t *)&swap_crypt_key1;
269 	keylen2 = sizeof(swap_crypt_key2);
270 	enckey2 = (uint8_t *)&swap_crypt_key2;
271 
272 	error = cc_rand_generate((void *)enckey1, keylen1);
273 	assert(!error);
274 
275 	error = cc_rand_generate((void *)enckey2, keylen2);
276 	assert(!error);
277 
278 	error = xts_start(0, NULL, enckey1, keylen1, enckey2, keylen2, 0, 0, &xts_modectx);
279 	assert(!error);
280 
281 	swap_crypt_initialized = TRUE;
282 
283 #if DEVELOPMENT || DEBUG
284 	uint8_t *encptr;
285 	uint8_t *decptr;
286 	uint8_t *refptr;
287 	uint8_t *iv;
288 	uint64_t ivnum[2];
289 	int size = 0;
290 	int i    = 0;
291 	int rc   = 0;
292 
293 	assert(swap_crypt_xts_tested == FALSE);
294 
295 	/*
296 	 * Validate the encryption algorithms.
297 	 *
298 	 * First initialize the test data.
299 	 */
300 	for (i = 0; i < 4096; i++) {
301 		swap_crypt_test_page_ref[i] = (char) i;
302 	}
303 	ivnum[0] = (uint64_t)0xaa;
304 	ivnum[1] = 0;
305 	iv = (uint8_t *)ivnum;
306 
307 	refptr = (uint8_t *)swap_crypt_test_page_ref;
308 	encptr = (uint8_t *)swap_crypt_test_page_encrypt;
309 	decptr = (uint8_t *)swap_crypt_test_page_decrypt;
310 	size = 4096;
311 
312 	/* encrypt */
313 	rc = xts_encrypt(refptr, size, encptr, iv, &xts_modectx);
314 	assert(!rc);
315 
316 	/* compare result with original - should NOT match */
317 	for (i = 0; i < 4096; i++) {
318 		if (swap_crypt_test_page_encrypt[i] !=
319 		    swap_crypt_test_page_ref[i]) {
320 			break;
321 		}
322 	}
323 	assert(i != 4096);
324 
325 	/* decrypt */
326 	rc = xts_decrypt(encptr, size, decptr, iv, &xts_modectx);
327 	assert(!rc);
328 
329 	/* compare result with original */
330 	for (i = 0; i < 4096; i++) {
331 		if (swap_crypt_test_page_decrypt[i] !=
332 		    swap_crypt_test_page_ref[i]) {
333 			panic("encryption test failed");
334 		}
335 	}
336 	/* encrypt in place */
337 	rc = xts_encrypt(decptr, size, decptr, iv, &xts_modectx);
338 	assert(!rc);
339 
340 	/* decrypt in place */
341 	rc = xts_decrypt(decptr, size, decptr, iv, &xts_modectx);
342 	assert(!rc);
343 
344 	for (i = 0; i < 4096; i++) {
345 		if (swap_crypt_test_page_decrypt[i] !=
346 		    swap_crypt_test_page_ref[i]) {
347 			panic("in place encryption test failed");
348 		}
349 	}
350 	swap_crypt_xts_tested = TRUE;
351 #endif /* DEVELOPMENT || DEBUG */
352 }
353 
354 
355 void
vm_swap_encrypt(c_segment_t c_seg)356 vm_swap_encrypt(c_segment_t c_seg)
357 {
358 	uint8_t *ptr;
359 	uint8_t *iv;
360 	uint64_t ivnum[2];
361 	int size = 0;
362 	int rc   = 0;
363 
364 	if (swap_crypt_initialized == FALSE) {
365 		swap_crypt_initialize();
366 	}
367 
368 #if DEVELOPMENT || DEBUG
369 	C_SEG_MAKE_WRITEABLE(c_seg);
370 #endif
371 	ptr = (uint8_t *)c_seg->c_store.c_buffer;
372 	size = round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset));
373 
374 	ivnum[0] = (uint64_t)c_seg;
375 	ivnum[1] = 0;
376 	iv = (uint8_t *)ivnum;
377 
378 	rc = xts_encrypt(ptr, size, ptr, iv, &xts_modectx);
379 	assert(!rc);
380 
381 	vm_page_encrypt_counter += (size / PAGE_SIZE_64);
382 
383 #if DEVELOPMENT || DEBUG
384 	C_SEG_WRITE_PROTECT(c_seg);
385 #endif
386 }
387 
388 void
vm_swap_decrypt(c_segment_t c_seg)389 vm_swap_decrypt(c_segment_t c_seg)
390 {
391 	uint8_t *ptr;
392 	uint8_t *iv;
393 	uint64_t ivnum[2];
394 	int size = 0;
395 	int rc   = 0;
396 
397 	assert(swap_crypt_initialized);
398 
399 #if DEVELOPMENT || DEBUG
400 	C_SEG_MAKE_WRITEABLE(c_seg);
401 #endif
402 	ptr = (uint8_t *)c_seg->c_store.c_buffer;
403 	size = round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset));
404 
405 	ivnum[0] = (uint64_t)c_seg;
406 	ivnum[1] = 0;
407 	iv = (uint8_t *)ivnum;
408 
409 	rc = xts_decrypt(ptr, size, ptr, iv, &xts_modectx);
410 	assert(!rc);
411 
412 	vm_page_decrypt_counter += (size / PAGE_SIZE_64);
413 
414 #if DEVELOPMENT || DEBUG
415 	C_SEG_WRITE_PROTECT(c_seg);
416 #endif
417 }
418 #endif /* ENCRYPTED_SWAP */
419 
420 uint64_t compressed_swap_chunk_size, vm_swapfile_hiwater_segs, swapfile_reclaim_threshold_segs, swapfile_reclam_minimum_segs;
421 extern bool memorystatus_swap_all_apps;
422 
423 void
vm_compressor_swap_init_swap_file_limit(void)424 vm_compressor_swap_init_swap_file_limit(void)
425 {
426 	vm_num_swap_files_config = VM_MAX_SWAP_FILE_NUM;
427 #if CONFIG_JETSAM
428 	if (memorystatus_swap_all_apps) {
429 		if (vm_swap_volume_capacity == 0) {
430 			/*
431 			 * Early in boot we don't know the swap volume capacity.
432 			 * That's fine. Reserve space for the maximum config
433 			 * and we'll lower this later in boot once we have the capacity.
434 			 */
435 			vm_num_swap_files_config = VM_MAX_SWAP_FILE_SWAP_ENABLED_NUM;
436 		} else {
437 			static uint64_t kFixedPointFactor = 100;
438 			/*
439 			 * Scale the max number of swap files linearly.
440 			 * But we can never go above VM_MAX_SWAP_FILE_SWAP_ENABLED_NUM.
441 			 */
442 			vm_num_swap_files_config = vm_swap_volume_capacity * kFixedPointFactor / VM_SWAP_MIN_VOLUME_CAPACITY
443 			    * VM_MIN_SWAP_FILE_SWAP_ENABLED_NUM / kFixedPointFactor;
444 			vm_num_swap_files_config = MAX(vm_num_swap_files_config, VM_MIN_SWAP_FILE_SWAP_ENABLED_NUM);
445 			vm_num_swap_files_config = MIN(vm_num_swap_files_config, VM_MAX_SWAP_FILE_SWAP_ENABLED_NUM);
446 		}
447 	}
448 #endif /* CONFIG_JETSAM */
449 #if DEVELOPMENT || DEBUG
450 	typeof(vm_num_swap_files_config) parsed_vm_max_num_swap_files = 0;
451 	if (PE_parse_boot_argn("vm_max_num_swap_files", &parsed_vm_max_num_swap_files, sizeof(parsed_vm_max_num_swap_files))) {
452 		if (parsed_vm_max_num_swap_files > 0) {
453 			vm_num_swap_files_config = parsed_vm_max_num_swap_files;
454 		} else {
455 			printf("WARNING: Ignoring vm_max_num_swap_files=%d boot-arg. Value must be > 0\n", parsed_vm_max_num_swap_files);
456 		}
457 	}
458 #endif
459 	printf("Maximum number of VM swap files: %d\n", vm_num_swap_files_config);
460 }
461 
462 int vm_swap_enabled = 0;
463 void
vm_compressor_swap_init(void)464 vm_compressor_swap_init(void)
465 {
466 	thread_t        thread = NULL;
467 
468 	queue_init(&swf_global_queue);
469 
470 #if !XNU_TARGET_OS_OSX
471 	/*
472 	 * dummy value until the swap file gets created
473 	 * when we drive the first c_segment_t to the
474 	 * swapout queue... at that time we will
475 	 * know the true size we have to work with
476 	 */
477 	c_overage_swapped_limit = 16;
478 #endif /* !XNU_TARGET_OS_OSX */
479 
480 	compressed_swap_chunk_size = c_seg_bufsize;
481 	vm_swapfile_hiwater_segs = (MIN_SWAP_FILE_SIZE / compressed_swap_chunk_size);
482 	swapfile_reclaim_threshold_segs = ((17 * (MAX_SWAP_FILE_SIZE / compressed_swap_chunk_size)) / 10);
483 	swapfile_reclam_minimum_segs = ((13 * (MAX_SWAP_FILE_SIZE / compressed_swap_chunk_size)) / 10);
484 
485 	if (kernel_thread_start_priority((thread_continue_t)vm_swapout_thread, NULL,
486 	    BASEPRI_VM, &thread) != KERN_SUCCESS) {
487 		panic("vm_swapout_thread: create failed");
488 	}
489 	thread_set_thread_name(thread, "VM_swapout");
490 	vm_swapout_thread_id = thread->thread_id;
491 	thread_deallocate(thread);
492 
493 	if (kernel_thread_start_priority((thread_continue_t)vm_swapfile_create_thread, NULL,
494 	    BASEPRI_VM, &thread) != KERN_SUCCESS) {
495 		panic("vm_swapfile_create_thread: create failed");
496 	}
497 	thread_set_thread_name(thread, "VM_swapfile_create");
498 	thread_deallocate(thread);
499 
500 	if (kernel_thread_start_priority((thread_continue_t)vm_swapfile_gc_thread, NULL,
501 	    BASEPRI_VM, &thread) != KERN_SUCCESS) {
502 		panic("vm_swapfile_gc_thread: create failed");
503 	}
504 	thread_set_thread_name(thread, "VM_swapfile_gc");
505 	/*
506 	 * Swapfile garbage collection will need to allocate memory
507 	 * to complete its swap reclaim and in-memory compaction.
508 	 * So allow it to dip into the reserved VM page pool.
509 	 */
510 	thread_lock(thread);
511 	thread->options |= TH_OPT_VMPRIV;
512 	thread_unlock(thread);
513 	thread_deallocate(thread);
514 	proc_set_thread_policy_with_tid(kernel_task, thread->thread_id,
515 	    TASK_POLICY_INTERNAL, TASK_POLICY_IO, THROTTLE_LEVEL_COMPRESSOR_TIER2);
516 	proc_set_thread_policy_with_tid(kernel_task, thread->thread_id,
517 	    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
518 
519 	vm_swap_enabled = 1;
520 	printf("VM Swap Subsystem is ON\n");
521 }
522 
523 
524 #if RECORD_THE_COMPRESSED_DATA
525 
526 void
c_compressed_record_init()527 c_compressed_record_init()
528 {
529 	if (c_compressed_record_init_done == FALSE) {
530 		vm_swapfile_open("/tmp/compressed_data", &c_compressed_record_vp);
531 		c_compressed_record_init_done = TRUE;
532 	}
533 }
534 
535 void
c_compressed_record_write(char * buf,int size)536 c_compressed_record_write(char *buf, int size)
537 {
538 	if (c_compressed_record_write_error == 0) {
539 		c_compressed_record_write_error = vm_record_file_write(c_compressed_record_vp, c_compressed_record_file_offset, buf, size);
540 		c_compressed_record_file_offset += size;
541 	}
542 }
543 #endif
544 
545 
546 int             compaction_swapper_inited = 0;
547 
548 void
vm_compaction_swapper_do_init(void)549 vm_compaction_swapper_do_init(void)
550 {
551 	struct  vnode *vp;
552 	char    *pathname;
553 	int     namelen;
554 
555 	if (compaction_swapper_inited) {
556 		return;
557 	}
558 
559 	if (vm_compressor_mode != VM_PAGER_COMPRESSOR_WITH_SWAP) {
560 		compaction_swapper_inited = 1;
561 		return;
562 	}
563 	lck_mtx_lock(&vm_swap_data_lock);
564 
565 	if (!compaction_swapper_inited) {
566 		namelen = (int)strlen(swapfilename) + SWAPFILENAME_INDEX_LEN + 1;
567 		pathname = kalloc_data(namelen, Z_WAITOK | Z_ZERO);
568 		snprintf(pathname, namelen, "%s%d", swapfilename, 0);
569 
570 		vm_swapfile_open(pathname, &vp);
571 
572 		if (vp) {
573 			if (vnode_pager_isSSD(vp) == FALSE) {
574 				/*
575 				 * swap files live on an HDD, so let's make sure to start swapping
576 				 * much earlier since we're not worried about SSD write-wear and
577 				 * we have so little write bandwidth to work with
578 				 * these values were derived expermentially by running the performance
579 				 * teams stock test for evaluating HDD performance against various
580 				 * combinations and looking and comparing overall results.
581 				 * Note that the > relationship between these 4 values must be maintained
582 				 */
583 				if (vm_compressor_minorcompact_threshold_divisor_overridden == 0) {
584 					vm_compressor_minorcompact_threshold_divisor = 15;
585 				}
586 				if (vm_compressor_majorcompact_threshold_divisor_overridden == 0) {
587 					vm_compressor_majorcompact_threshold_divisor = 18;
588 				}
589 				if (vm_compressor_unthrottle_threshold_divisor_overridden == 0) {
590 					vm_compressor_unthrottle_threshold_divisor = 24;
591 				}
592 				if (vm_compressor_catchup_threshold_divisor_overridden == 0) {
593 					vm_compressor_catchup_threshold_divisor = 30;
594 				}
595 			}
596 #if XNU_TARGET_OS_OSX
597 			vnode_setswapmount(vp);
598 			vm_swappin_avail = vnode_getswappin_avail(vp);
599 
600 			if (vm_swappin_avail) {
601 				vm_swappin_enabled = TRUE;
602 			}
603 #endif /* XNU_TARGET_OS_OSX */
604 			vm_swapfile_close((uint64_t)pathname, vp);
605 		}
606 		kfree_data(pathname, namelen);
607 
608 		compaction_swapper_inited = 1;
609 	}
610 	lck_mtx_unlock(&vm_swap_data_lock);
611 }
612 
613 
614 void
vm_swap_consider_defragmenting(int flags)615 vm_swap_consider_defragmenting(int flags)
616 {
617 	boolean_t force_defrag = (flags & VM_SWAP_FLAGS_FORCE_DEFRAG);
618 	boolean_t force_reclaim = (flags & VM_SWAP_FLAGS_FORCE_RECLAIM);
619 
620 	if (compressor_store_stop_compaction == FALSE && !VM_SWAP_BUSY() &&
621 	    (force_defrag || force_reclaim || VM_SWAP_SHOULD_DEFRAGMENT() || VM_SWAP_SHOULD_RECLAIM())) {
622 		if (!vm_swapfile_gc_thread_running || force_defrag || force_reclaim) {
623 			lck_mtx_lock(&vm_swap_data_lock);
624 
625 			if (force_defrag) {
626 				vm_swap_force_defrag = TRUE;
627 			}
628 
629 			if (force_reclaim) {
630 				vm_swap_force_reclaim = TRUE;
631 			}
632 
633 			if (!vm_swapfile_gc_thread_running) {
634 				thread_wakeup((event_t) &vm_swapfile_gc_needed);
635 			}
636 
637 			lck_mtx_unlock(&vm_swap_data_lock);
638 		}
639 	}
640 }
641 
642 
/* Event counters maintained by vm_swap_defragment(). */
int vm_swap_defragment_yielded = 0;
int vm_swap_defragment_swapin = 0;
int vm_swap_defragment_free = 0;
int vm_swap_defragment_busy = 0;

#if CONFIG_FREEZE
extern int32_t c_segment_pages_compressed_incore;
extern int32_t c_segment_pages_compressed_incore_late_swapout;
extern uint32_t c_segment_pages_compressed_nearing_limit;
extern uint32_t c_segment_count;
extern uint32_t c_segments_nearing_limit;

boolean_t       memorystatus_kill_on_VM_compressor_space_shortage(boolean_t);

extern bool freezer_incore_cseg_acct;
#endif /* CONFIG_FREEZE */
659 
660 static void
vm_swap_defragment()661 vm_swap_defragment()
662 {
663 	c_segment_t     c_seg;
664 
665 	/*
666 	 * have to grab the master lock w/o holding
667 	 * any locks in spin mode
668 	 */
669 	PAGE_REPLACEMENT_DISALLOWED(TRUE);
670 
671 	lck_mtx_lock_spin_always(c_list_lock);
672 
673 	while (!queue_empty(&c_swappedout_sparse_list_head)) {
674 		if (compressor_store_stop_compaction == TRUE || VM_SWAP_BUSY()) {
675 			vm_swap_defragment_yielded++;
676 			break;
677 		}
678 		c_seg = (c_segment_t)queue_first(&c_swappedout_sparse_list_head);
679 
680 		lck_mtx_lock_spin_always(&c_seg->c_lock);
681 
682 		assert(c_seg->c_state == C_ON_SWAPPEDOUTSPARSE_Q);
683 
684 		if (c_seg->c_busy) {
685 			lck_mtx_unlock_always(c_list_lock);
686 
687 			PAGE_REPLACEMENT_DISALLOWED(FALSE);
688 			/*
689 			 * c_seg_wait_on_busy consumes c_seg->c_lock
690 			 */
691 			c_seg_wait_on_busy(c_seg);
692 
693 			PAGE_REPLACEMENT_DISALLOWED(TRUE);
694 
695 			lck_mtx_lock_spin_always(c_list_lock);
696 
697 			vm_swap_defragment_busy++;
698 			continue;
699 		}
700 		if (c_seg->c_bytes_used == 0) {
701 			/*
702 			 * c_seg_free_locked consumes the c_list_lock
703 			 * and c_seg->c_lock
704 			 */
705 			C_SEG_BUSY(c_seg);
706 			c_seg_free_locked(c_seg);
707 
708 			vm_swap_defragment_free++;
709 		} else {
710 			lck_mtx_unlock_always(c_list_lock);
711 
712 #if CONFIG_FREEZE
713 			if (freezer_incore_cseg_acct) {
714 				/*
715 				 * TODO(jason): These two are tricky because they're pre-emptive jetsams.
716 				 * The system is not unhealthy, but we know that it's about to become unhealthy once
717 				 * we do this swapin.
718 				 * So we're waking up the memorystatus thread to make space
719 				 * (hopefully) before this segment comes in.
720 				 *
721 				 * I think the compressor_backing_store needs to keep track of
722 				 * two new globals that will track the number of segments
723 				 * being swapped in due to defrag and the number of slots used
724 				 * in those segments.
725 				 * Then the health check below can be called from the memorystatus
726 				 * thread.
727 				 */
728 				if ((c_seg->c_slots_used + c_segment_pages_compressed_incore) >= c_segment_pages_compressed_nearing_limit) {
729 					memorystatus_kill_on_VM_compressor_space_shortage(TRUE /* async */);
730 				}
731 
732 				uint32_t incore_seg_count = c_segment_count - c_swappedout_count - c_swappedout_sparse_count;
733 				if ((incore_seg_count + 1) >= c_segments_nearing_limit) {
734 					memorystatus_kill_on_VM_compressor_space_shortage(TRUE /* async */);
735 				}
736 			}
737 #endif /* CONFIG_FREEZE */
738 			if (c_seg_swapin(c_seg, TRUE, FALSE) == 0) {
739 				lck_mtx_unlock_always(&c_seg->c_lock);
740 				vmcs_stats.defrag_swapins += (round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset))) >> PAGE_SHIFT;
741 			}
742 
743 			vm_swap_defragment_swapin++;
744 		}
745 		PAGE_REPLACEMENT_DISALLOWED(FALSE);
746 
747 		vm_pageout_io_throttle();
748 
749 		/*
750 		 * because write waiters have privilege over readers,
751 		 * dropping and immediately retaking the master lock will
752 		 * still allow any thread waiting to acquire the
753 		 * master lock exclusively an opportunity to take it
754 		 */
755 		PAGE_REPLACEMENT_DISALLOWED(TRUE);
756 
757 		lck_mtx_lock_spin_always(c_list_lock);
758 	}
759 	lck_mtx_unlock_always(c_list_lock);
760 
761 	PAGE_REPLACEMENT_DISALLOWED(FALSE);
762 }
763 
764 
765 bool vm_swapfile_create_thread_inited = false;
766 static void
vm_swapfile_create_thread(void)767 vm_swapfile_create_thread(void)
768 {
769 	clock_sec_t     sec;
770 	clock_nsec_t    nsec;
771 
772 	if (!vm_swapfile_create_thread_inited) {
773 #if CONFIG_THREAD_GROUPS
774 		thread_group_vm_add();
775 #endif /* CONFIG_THREAD_GROUPS */
776 		current_thread()->options |= TH_OPT_VMPRIV;
777 		vm_swapfile_create_thread_inited = true;
778 	}
779 
780 	vm_swapfile_create_thread_awakened++;
781 	vm_swapfile_create_thread_running = 1;
782 
783 	while (TRUE) {
784 		/*
785 		 * walk through the list of swap files
786 		 * and do the delayed frees/trims for
787 		 * any swap file whose count of delayed
788 		 * frees is above the batch limit
789 		 */
790 		vm_swap_handle_delayed_trims(FALSE);
791 
792 		lck_mtx_lock(&vm_swap_data_lock);
793 
794 		if (hibernate_in_progress_with_pinned_swap == TRUE) {
795 			break;
796 		}
797 
798 		if (compressor_store_stop_compaction == TRUE) {
799 			break;
800 		}
801 
802 		clock_get_system_nanotime(&sec, &nsec);
803 
804 		if (VM_SWAP_SHOULD_CREATE(sec) == 0) {
805 			break;
806 		}
807 
808 		lck_mtx_unlock(&vm_swap_data_lock);
809 
810 		if (vm_swap_create_file() == FALSE) {
811 			vm_swapfile_last_failed_to_create_ts = sec;
812 			HIBLOG("vm_swap_create_file failed @ %lu secs\n", (unsigned long)sec);
813 		} else {
814 			vm_swapfile_last_successful_create_ts = sec;
815 		}
816 	}
817 	vm_swapfile_create_thread_running = 0;
818 
819 	if (hibernate_in_progress_with_pinned_swap == TRUE) {
820 		thread_wakeup((event_t)&hibernate_in_progress_with_pinned_swap);
821 	}
822 
823 	if (compressor_store_stop_compaction == TRUE) {
824 		thread_wakeup((event_t)&compressor_store_stop_compaction);
825 	}
826 
827 	assert_wait((event_t)&vm_swapfile_create_needed, THREAD_UNINT);
828 
829 	lck_mtx_unlock(&vm_swap_data_lock);
830 
831 	thread_block((thread_continue_t)vm_swapfile_create_thread);
832 
833 	/* NOTREACHED */
834 }
835 
836 
837 #if HIBERNATION
838 
839 kern_return_t
hibernate_pin_swap(boolean_t start)840 hibernate_pin_swap(boolean_t start)
841 {
842 	vm_compaction_swapper_do_init();
843 
844 	if (start == FALSE) {
845 		lck_mtx_lock(&vm_swap_data_lock);
846 		hibernate_in_progress_with_pinned_swap = FALSE;
847 		lck_mtx_unlock(&vm_swap_data_lock);
848 
849 		return KERN_SUCCESS;
850 	}
851 	if (vm_swappin_enabled == FALSE) {
852 		return KERN_SUCCESS;
853 	}
854 
855 	lck_mtx_lock(&vm_swap_data_lock);
856 
857 	hibernate_in_progress_with_pinned_swap = TRUE;
858 
859 	while (vm_swapfile_create_thread_running || vm_swapfile_gc_thread_running) {
860 		assert_wait((event_t)&hibernate_in_progress_with_pinned_swap, THREAD_UNINT);
861 
862 		lck_mtx_unlock(&vm_swap_data_lock);
863 
864 		thread_block(THREAD_CONTINUE_NULL);
865 
866 		lck_mtx_lock(&vm_swap_data_lock);
867 	}
868 	if (vm_num_swap_files > vm_num_pinned_swap_files) {
869 		hibernate_in_progress_with_pinned_swap = FALSE;
870 		lck_mtx_unlock(&vm_swap_data_lock);
871 
872 		HIBLOG("hibernate_pin_swap failed - vm_num_swap_files = %d, vm_num_pinned_swap_files = %d\n",
873 		    vm_num_swap_files, vm_num_pinned_swap_files);
874 		return KERN_FAILURE;
875 	}
876 	lck_mtx_unlock(&vm_swap_data_lock);
877 
878 	while (VM_SWAP_SHOULD_PIN(MAX_SWAP_FILE_SIZE)) {
879 		if (vm_swap_create_file() == FALSE) {
880 			break;
881 		}
882 	}
883 	return KERN_SUCCESS;
884 }
885 #endif
886 bool vm_swapfile_gc_thread_inited = false;
887 static void
vm_swapfile_gc_thread(void)888 vm_swapfile_gc_thread(void)
889 {
890 	boolean_t       need_defragment;
891 	boolean_t       need_reclaim;
892 
893 	if (!vm_swapfile_gc_thread_inited) {
894 #if CONFIG_THREAD_GROUPS
895 		thread_group_vm_add();
896 #endif /* CONFIG_THREAD_GROUPS */
897 		vm_swapfile_gc_thread_inited = true;
898 	}
899 
900 	vm_swapfile_gc_thread_awakened++;
901 	vm_swapfile_gc_thread_running = 1;
902 
903 	while (TRUE) {
904 		lck_mtx_lock(&vm_swap_data_lock);
905 
906 		if (hibernate_in_progress_with_pinned_swap == TRUE) {
907 			break;
908 		}
909 
910 		if (VM_SWAP_BUSY() || compressor_store_stop_compaction == TRUE) {
911 			break;
912 		}
913 
914 		need_defragment = FALSE;
915 		need_reclaim = FALSE;
916 
917 		if (VM_SWAP_SHOULD_DEFRAGMENT()) {
918 			need_defragment = TRUE;
919 		}
920 
921 		if (VM_SWAP_SHOULD_RECLAIM()) {
922 			need_defragment = TRUE;
923 			need_reclaim = TRUE;
924 		}
925 		if (need_defragment == FALSE && need_reclaim == FALSE) {
926 			break;
927 		}
928 
929 		vm_swap_force_defrag = FALSE;
930 		vm_swap_force_reclaim = FALSE;
931 
932 		lck_mtx_unlock(&vm_swap_data_lock);
933 
934 		if (need_defragment == TRUE) {
935 			vm_swap_defragment();
936 		}
937 		if (need_reclaim == TRUE) {
938 			vm_swap_reclaim();
939 		}
940 	}
941 	vm_swapfile_gc_thread_running = 0;
942 
943 	if (hibernate_in_progress_with_pinned_swap == TRUE) {
944 		thread_wakeup((event_t)&hibernate_in_progress_with_pinned_swap);
945 	}
946 
947 	if (compressor_store_stop_compaction == TRUE) {
948 		thread_wakeup((event_t)&compressor_store_stop_compaction);
949 	}
950 
951 	assert_wait((event_t)&vm_swapfile_gc_needed, THREAD_UNINT);
952 
953 	lck_mtx_unlock(&vm_swap_data_lock);
954 
955 	thread_block((thread_continue_t)vm_swapfile_gc_thread);
956 
957 	/* NOTREACHED */
958 }
959 
960 
961 
/*
 * Per-tier cap on outstanding swapout I/Os.
 * vm_swapout_thread_throttle_adjust() copies the value for the tier it
 * enters into vm_swapout_limit, and should_process_swapout_queue() stops
 * handing out new work once vm_swapout_soc_busy reaches that limit.
 * VM_SWAPOUT_LIMIT_MAX is also the number of entries in vm_swapout_ctx[].
 */
962 #define   VM_SWAPOUT_LIMIT_T2P  4
963 #define   VM_SWAPOUT_LIMIT_T1P  4
964 #define   VM_SWAPOUT_LIMIT_T0P  6
965 #define   VM_SWAPOUT_LIMIT_T0   8
966 #define   VM_SWAPOUT_LIMIT_MAX  8
967 
/*
 * Values of vm_swapout_state, the state of the throttle state machine
 * driven by vm_swapout_thread_throttle_adjust().  The *_PASSIVE states run
 * with TASK_POLICY_PASSIVE_IO enabled; VM_SWAPOUT_T0 runs with it disabled.
 */
968 #define   VM_SWAPOUT_START      0
969 #define   VM_SWAPOUT_T2_PASSIVE 1
970 #define   VM_SWAPOUT_T1_PASSIVE 2
971 #define   VM_SWAPOUT_T0_PASSIVE 3
972 #define   VM_SWAPOUT_T0         4
973 
/* current state-machine state and the in-flight I/O cap that goes with it */
974 int vm_swapout_state = VM_SWAPOUT_START;
975 int vm_swapout_limit = 1;
976 
/* statistics: how many times the state machine has entered each tier */
977 int vm_swapper_entered_T0  = 0;
978 int vm_swapper_entered_T0P = 0;
979 int vm_swapper_entered_T1P = 0;
980 int vm_swapper_entered_T2P = 0;
981 
982 
983 static void
vm_swapout_thread_throttle_adjust(void)984 vm_swapout_thread_throttle_adjust(void)
985 {
986 	switch (vm_swapout_state) {
987 	case VM_SWAPOUT_START:
988 
989 		vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER2;
990 		vm_swapper_entered_T2P++;
991 
992 		proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
993 		    TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle);
994 		proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
995 		    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
996 		vm_swapout_limit = VM_SWAPOUT_LIMIT_T2P;
997 		vm_swapout_state = VM_SWAPOUT_T2_PASSIVE;
998 
999 		break;
1000 
1001 	case VM_SWAPOUT_T2_PASSIVE:
1002 
1003 		if (SWAPPER_NEEDS_TO_UNTHROTTLE()) {
1004 			vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER0;
1005 			vm_swapper_entered_T0P++;
1006 
1007 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1008 			    TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle);
1009 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1010 			    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
1011 			vm_swapout_limit = VM_SWAPOUT_LIMIT_T0P;
1012 			vm_swapout_state = VM_SWAPOUT_T0_PASSIVE;
1013 
1014 			break;
1015 		}
1016 		if (swapout_target_age || hibernate_flushing == TRUE) {
1017 			vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER1;
1018 			vm_swapper_entered_T1P++;
1019 
1020 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1021 			    TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle);
1022 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1023 			    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
1024 			vm_swapout_limit = VM_SWAPOUT_LIMIT_T1P;
1025 			vm_swapout_state = VM_SWAPOUT_T1_PASSIVE;
1026 		}
1027 		break;
1028 
1029 	case VM_SWAPOUT_T1_PASSIVE:
1030 
1031 		if (SWAPPER_NEEDS_TO_UNTHROTTLE()) {
1032 			vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER0;
1033 			vm_swapper_entered_T0P++;
1034 
1035 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1036 			    TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle);
1037 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1038 			    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
1039 			vm_swapout_limit = VM_SWAPOUT_LIMIT_T0P;
1040 			vm_swapout_state = VM_SWAPOUT_T0_PASSIVE;
1041 
1042 			break;
1043 		}
1044 		if (swapout_target_age == 0 && hibernate_flushing == FALSE) {
1045 			vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER2;
1046 			vm_swapper_entered_T2P++;
1047 
1048 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1049 			    TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle);
1050 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1051 			    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
1052 			vm_swapout_limit = VM_SWAPOUT_LIMIT_T2P;
1053 			vm_swapout_state = VM_SWAPOUT_T2_PASSIVE;
1054 		}
1055 		break;
1056 
1057 	case VM_SWAPOUT_T0_PASSIVE:
1058 
1059 		if (SWAPPER_NEEDS_TO_RETHROTTLE()) {
1060 			vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER2;
1061 			vm_swapper_entered_T2P++;
1062 
1063 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1064 			    TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle);
1065 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1066 			    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
1067 			vm_swapout_limit = VM_SWAPOUT_LIMIT_T2P;
1068 			vm_swapout_state = VM_SWAPOUT_T2_PASSIVE;
1069 
1070 			break;
1071 		}
1072 		if (SWAPPER_NEEDS_TO_CATCHUP()) {
1073 			vm_swapper_entered_T0++;
1074 
1075 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1076 			    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_DISABLE);
1077 			vm_swapout_limit = VM_SWAPOUT_LIMIT_T0;
1078 			vm_swapout_state = VM_SWAPOUT_T0;
1079 		}
1080 		break;
1081 
1082 	case VM_SWAPOUT_T0:
1083 
1084 		if (SWAPPER_HAS_CAUGHTUP()) {
1085 			vm_swapper_entered_T0P++;
1086 
1087 			proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
1088 			    TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
1089 			vm_swapout_limit = VM_SWAPOUT_LIMIT_T0P;
1090 			vm_swapout_state = VM_SWAPOUT_T0_PASSIVE;
1091 		}
1092 		break;
1093 	}
1094 }
1095 
/*
 * Number of segments vm_swapout_thread() found on a swapout queue with
 * nothing populated and therefore freed instead of writing out.
 */
1096 int vm_swapout_found_empty = 0;
1097 
/*
 * Fixed pool of completion contexts, one per swapout I/O that can be in
 * flight at once.  A slot is in use while swp_io_busy is set, and has a
 * completion waiting to be reaped while swp_io_done is set.
 */
1098 struct swapout_io_completion vm_swapout_ctx[VM_SWAPOUT_LIMIT_MAX];
1099 
/*
 * vm_swapout_soc_busy: slots with an I/O issued and not yet retired by
 *                      vm_swapout_complete_soc().
 * vm_swapout_soc_done: completions posted by vm_swapout_iodone() that
 *                      have not been reaped yet.
 */
1100 int vm_swapout_soc_busy = 0;
1101 int vm_swapout_soc_done = 0;
1102 
1103 
1104 static struct swapout_io_completion *
vm_swapout_find_free_soc(void)1105 vm_swapout_find_free_soc(void)
1106 {
1107 	int      i;
1108 
1109 	for (i = 0; i < VM_SWAPOUT_LIMIT_MAX; i++) {
1110 		if (vm_swapout_ctx[i].swp_io_busy == 0) {
1111 			return &vm_swapout_ctx[i];
1112 		}
1113 	}
1114 	assert(vm_swapout_soc_busy == VM_SWAPOUT_LIMIT_MAX);
1115 
1116 	return NULL;
1117 }
1118 
1119 static struct swapout_io_completion *
vm_swapout_find_done_soc(void)1120 vm_swapout_find_done_soc(void)
1121 {
1122 	int      i;
1123 
1124 	if (vm_swapout_soc_done) {
1125 		for (i = 0; i < VM_SWAPOUT_LIMIT_MAX; i++) {
1126 			if (vm_swapout_ctx[i].swp_io_done) {
1127 				return &vm_swapout_ctx[i];
1128 			}
1129 		}
1130 	}
1131 	return NULL;
1132 }
1133 
1134 static void
vm_swapout_complete_soc(struct swapout_io_completion * soc)1135 vm_swapout_complete_soc(struct swapout_io_completion *soc)
1136 {
1137 	kern_return_t  kr;
1138 
1139 	if (soc->swp_io_error) {
1140 		kr = KERN_FAILURE;
1141 	} else {
1142 		kr = KERN_SUCCESS;
1143 	}
1144 
1145 	lck_mtx_unlock_always(c_list_lock);
1146 
1147 	vm_swap_put_finish(soc->swp_swf, &soc->swp_f_offset, soc->swp_io_error, TRUE /*drop iocount*/);
1148 	vm_swapout_finish(soc->swp_c_seg, soc->swp_f_offset, soc->swp_c_size, kr);
1149 
1150 	lck_mtx_lock_spin_always(c_list_lock);
1151 
1152 	soc->swp_io_done = 0;
1153 	soc->swp_io_busy = 0;
1154 
1155 	vm_swapout_soc_busy--;
1156 	vm_swapout_soc_done--;
1157 }
1158 
/* set once vm_swapout_thread() has done its first-run setup */
1159 bool vm_swapout_thread_inited = false;
1160 extern uint32_t c_donate_swapout_count;
1161 #if CONFIG_JETSAM
1162 bool memorystatus_swap_over_trigger(uint64_t adjustment_factor);
1163 /*
 * swapout_sleep_threshold sets the percentage of the swapout threshold at
 * which the swap thread will stop processing the swapout queue.
 * By default this is 90, which means we will swap until the swapout queue
 * size is at 90% of the threshold used to wake the swap thread.
 * By definition the queue length must be >= 100% of the threshold when the
 * swap thread is woken up. On development builds this can be adjusted with
 * the vm.swapout_sleep_threshold sysctl.
1171  */
1172 uint32_t swapout_sleep_threshold = 90;
1173 #endif /* CONFIG_JETSAM */
1174 static bool
should_process_swapout_queue(const queue_head_t * swapout_list_head)1175 should_process_swapout_queue(const queue_head_t *swapout_list_head)
1176 {
1177 	bool process_queue = !queue_empty(swapout_list_head) &&
1178 	    vm_swapout_soc_busy < vm_swapout_limit &&
1179 	    !compressor_store_stop_compaction;
1180 #if CONFIG_JETSAM
1181 	if (memorystatus_swap_all_apps && swapout_list_head == &c_late_swapout_list_head) {
1182 		process_queue = process_queue && memorystatus_swap_over_trigger(swapout_sleep_threshold);
1183 	}
1184 #endif /* CONFIG_JETSAM */
1185 	return process_queue;
1186 }
1187 
/*
 * vm_swapout_thread:
 *
 * Swapout worker, written as a continuation: it blocks on the address of
 * this function with itself as the continuation, so every wakeup re-enters
 * it from the top.
 *
 * Each run takes segments off the current swapout queue (starting with the
 * early queue) for as long as should_process_swapout_queue() allows, and
 * for each one either frees it (nothing populated) or marks it busy, moves
 * it to C_ON_SWAPIO_Q and hands it to vm_swap_put() with a completion slot
 * from vm_swapout_ctx[].  Completed I/Os are reaped through
 * vm_swapout_complete_soc() and the I/O throttle state machine is stepped
 * after every segment.  When nothing more can be done the thread clears
 * vm_swapout_thread_running and blocks.
 */
1188 void
vm_swapout_thread(void)1189 vm_swapout_thread(void)
1190 {
1191 	uint32_t        size = 0;
1192 	c_segment_t     c_seg = NULL;
1193 	kern_return_t   kr = KERN_SUCCESS;
1194 	struct swapout_io_completion *soc;
1195 	queue_head_t    *swapout_list_head;
1196 	bool            queues_empty = false;
1197 
	/* one-time setup the first time the thread runs */
1198 	if (!vm_swapout_thread_inited) {
1199 #if CONFIG_THREAD_GROUPS
1200 		thread_group_vm_add();
1201 #endif /* CONFIG_THREAD_GROUPS */
1202 		current_thread()->options |= TH_OPT_VMPRIV;
1203 		vm_swapout_thread_inited = true;
1204 	}
1205 
1206 	vm_swapout_thread_awakened++;
1207 
1208 	lck_mtx_lock_spin_always(c_list_lock);
1209 
	/* every run starts with the early queue */
1210 	swapout_list_head = &c_early_swapout_list_head;
1211 	vm_swapout_thread_running = TRUE;
1212 	os_atomic_store(&vm_swapout_wake_pending, false, relaxed);
1213 again:
	/* c_list_lock is held at the top of every iteration */
1214 	while (should_process_swapout_queue(swapout_list_head)) {
1215 		c_seg = (c_segment_t)queue_first(swapout_list_head);
1216 
1217 		lck_mtx_lock_spin_always(&c_seg->c_lock);
1218 
1219 		assert(c_seg->c_state == C_ON_SWAPOUT_Q);
1220 
		/*
		 * Somebody else has the segment busy: drop the list lock,
		 * wait for them, then re-take the list lock and look at the
		 * head of the queue again.
		 */
1221 		if (c_seg->c_busy) {
1222 			lck_mtx_unlock_always(c_list_lock);
1223 
1224 			c_seg_wait_on_busy(c_seg);
1225 
1226 			lck_mtx_lock_spin_always(c_list_lock);
1227 
1228 			continue;
1229 		}
1230 		vm_swapout_thread_processed_segments++;
1231 
		/* bytes to write: the populated part, rounded up to a page */
1232 		size = round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset));
1233 
1234 		if (size == 0) {
1235 			assert(c_seg->c_bytes_used == 0);
1236 
1237 			/*
1238 			 * c_seg_free_locked will drop the c_list_lock and
1239 			 * the c_seg->c_lock.
1240 			 */
1241 			C_SEG_BUSY(c_seg);
1242 			c_seg_free_locked(c_seg);
1243 			c_seg = NULL;
1244 
1245 			vm_swapout_found_empty++;
1246 			goto c_seg_is_empty;
1247 		}
		/*
		 * Mark the segment busy and move it to the swap-I/O state
		 * before dropping both locks for the duration of the write.
		 */
1248 		C_SEG_BUSY(c_seg);
1249 		c_seg->c_busy_swapping = 1;
1250 
1251 		c_seg_switch_state(c_seg, C_ON_SWAPIO_Q, FALSE);
1252 
1253 		lck_mtx_unlock_always(c_list_lock);
1254 		lck_mtx_unlock_always(&c_seg->c_lock);
1255 
1256 #if CHECKSUM_THE_SWAP
1257 		c_seg->cseg_hash = hash_string((char *)c_seg->c_store.c_buffer, (int)size);
1258 		c_seg->cseg_swap_size = size;
1259 #endif /* CHECKSUM_THE_SWAP */
1260 
		/* vm_swapout_finish() decrypts again if the write fails */
1261 #if ENCRYPTED_SWAP
1262 		vm_swap_encrypt(c_seg);
1263 #endif /* ENCRYPTED_SWAP */
1264 
		/*
		 * should_process_swapout_queue() only lets us get here with
		 * vm_swapout_soc_busy below vm_swapout_limit, so a free slot
		 * is expected.
		 */
1265 		soc = vm_swapout_find_free_soc();
1266 		assert(soc);
1267 
		/* vm_swapout_iodone(soc, error) is the I/O completion callback */
1268 		soc->swp_upl_ctx.io_context = (void *)soc;
1269 		soc->swp_upl_ctx.io_done = (void *)vm_swapout_iodone;
1270 		soc->swp_upl_ctx.io_error = 0;
1271 
1272 		kr = vm_swap_put((vm_offset_t)c_seg->c_store.c_buffer, &soc->swp_f_offset, size, c_seg, soc);
1273 
1274 		if (kr != KERN_SUCCESS) {
			/*
			 * The put failed.  If the completion callback has
			 * already marked the slot done, take that back so the
			 * slot is not reaped later, and finish the segment
			 * with the failure code right here.
			 */
1275 			if (soc->swp_io_done) {
1276 				lck_mtx_lock_spin_always(c_list_lock);
1277 
1278 				soc->swp_io_done = 0;
1279 				vm_swapout_soc_done--;
1280 
1281 				lck_mtx_unlock_always(c_list_lock);
1282 			}
1283 			vm_swapout_finish(c_seg, soc->swp_f_offset, size, kr);
1284 		} else {
			/* I/O is in flight; the slot stays busy until it is reaped */
1285 			soc->swp_io_busy = 1;
1286 			vm_swapout_soc_busy++;
1287 		}
1288 
1289 c_seg_is_empty:
		/* no locks are held here */
1290 		if (!(c_early_swapout_count + c_regular_swapout_count + c_late_swapout_count)) {
1291 			vm_swap_consider_defragmenting(VM_SWAP_FLAGS_NONE);
1292 		}
1293 
1294 		lck_mtx_lock_spin_always(c_list_lock);
1295 
		/* reap whatever has completed in the meantime */
1296 		while ((soc = vm_swapout_find_done_soc())) {
1297 			vm_swapout_complete_soc(soc);
1298 		}
1299 		lck_mtx_unlock_always(c_list_lock);
1300 
		/* step the I/O throttle state machine with the list lock dropped */
1301 		vm_swapout_thread_throttle_adjust();
1302 
1303 		lck_mtx_lock_spin_always(c_list_lock);
1304 	}
	/* out of work for this queue (or not allowed to issue more): reap */
1305 	while ((soc = vm_swapout_find_done_soc())) {
1306 		vm_swapout_complete_soc(soc);
1307 	}
1308 	lck_mtx_unlock_always(c_list_lock);
1309 
1310 	vm_pageout_io_throttle();
1311 
1312 	lck_mtx_lock_spin_always(c_list_lock);
1313 
1314 	/*
1315 	 * Recheck if we have some c_segs to wakeup
1316 	 * post throttle. And, check to see if we
1317 	 * have any more swapouts needed.
1318 	 */
1319 	if (vm_swapout_soc_done) {
1320 		goto again;
1321 	}
1322 
1323 #if XNU_TARGET_OS_OSX
1324 	queues_empty = queue_empty(&c_early_swapout_list_head) && queue_empty(&c_regular_swapout_list_head) && queue_empty(&c_late_swapout_list_head);
1325 #else /* XNU_TARGET_OS_OSX */
1326 	queues_empty = queue_empty(&c_early_swapout_list_head) && queue_empty(&c_late_swapout_list_head);
1327 #endif /* XNU_TARGET_OS_OSX */
1328 
	/* pick the next non-empty queue: early first, then regular/late */
1329 	if (!queues_empty) {
1330 		swapout_list_head = NULL;
1331 		if (!queue_empty(&c_early_swapout_list_head)) {
1332 			swapout_list_head = &c_early_swapout_list_head;
1333 		} else {
1334 #if XNU_TARGET_OS_OSX
1335 			/*
1336 			 * On macOS we _always_ process all swapout queues.
1337 			 */
1338 			if (!queue_empty(&c_regular_swapout_list_head)) {
1339 				swapout_list_head = &c_regular_swapout_list_head;
1340 			} else {
1341 				swapout_list_head = &c_late_swapout_list_head;
1342 			}
1343 #else /* XNU_TARGET_OS_OSX */
1344 			/*
1345 			 * On non-macOS swap-capable platforms, we might want to
1346 			 * process just the early queue (Freezer) or process both
1347 			 * early and late queues (app swap). We processed the early
1348 			 * queue up above. The late Q will only be processed if the
1349 			 * checks in should_process_swapout_queue give the go-ahead.
1350 			 */
1351 			swapout_list_head = &c_late_swapout_list_head;
1352 #endif /* XNU_TARGET_OS_OSX */
1353 		}
1354 		if (swapout_list_head && should_process_swapout_queue(swapout_list_head)) {
1355 			goto again;
1356 		}
1357 	}
1358 
	/*
	 * Going idle.  The wait is queued and the running flag cleared while
	 * c_list_lock is still held; vm_swapout_iodone() checks that flag
	 * under the same lock to decide whether it has to wake us.
	 */
1359 	assert_wait((event_t)&vm_swapout_thread, THREAD_UNINT);
1360 
1361 	vm_swapout_thread_running = FALSE;
1362 
1363 	lck_mtx_unlock_always(c_list_lock);
1364 
1365 	thread_block((thread_continue_t)vm_swapout_thread);
1366 
1367 	/* NOTREACHED */
1368 }
1369 
1370 
1371 void
vm_swapout_iodone(void * io_context,int error)1372 vm_swapout_iodone(void *io_context, int error)
1373 {
1374 	struct swapout_io_completion *soc;
1375 
1376 	soc = (struct swapout_io_completion *)io_context;
1377 
1378 	lck_mtx_lock_spin_always(c_list_lock);
1379 
1380 	soc->swp_io_done = 1;
1381 	soc->swp_io_error = error;
1382 	vm_swapout_soc_done++;
1383 
1384 	if (!vm_swapout_thread_running) {
1385 		thread_wakeup((event_t)&vm_swapout_thread);
1386 	}
1387 
1388 	lck_mtx_unlock_always(c_list_lock);
1389 }
1390 
1391 
/*
 * vm_swapout_finish:
 *
 * Segment-side bookkeeping once a swapout attempt for c_seg is over.
 *
 *   c_seg     segment that was (or was to be) written; must still have
 *             c_busy and c_busy_swapping set
 *   f_offset  swap handle the data was written to (recorded on success)
 *   size      number of bytes that were written
 *   kr        KERN_SUCCESS if the write succeeded, anything else otherwise
 *
 * On success the segment's buffer is depopulated and the segment moves to
 * the swapped-out (or swapped-out-sparse) queue; on failure it is put back
 * on the age or swapped-in queue.  In both cases the busy flags are
 * cleared and waiters are woken.  Takes and releases c_list_lock and the
 * segment lock itself.
 */
1392 static void
vm_swapout_finish(c_segment_t c_seg,uint64_t f_offset,uint32_t size,kern_return_t kr)1393 vm_swapout_finish(c_segment_t c_seg, uint64_t f_offset, uint32_t size, kern_return_t kr)
1394 {
1395 	PAGE_REPLACEMENT_DISALLOWED(TRUE);
1396 
	/* the data is on disk now: give the buffer's memory back */
1397 	if (kr == KERN_SUCCESS) {
1398 		kernel_memory_depopulate((vm_offset_t)c_seg->c_store.c_buffer, size,
1399 		    KMA_COMPRESSOR, VM_KERN_MEMORY_COMPRESSOR);
1400 	}
1401 #if ENCRYPTED_SWAP
	/* the write failed: undo the encryption done before the put */
1402 	else {
1403 		vm_swap_decrypt(c_seg);
1404 	}
1405 #endif /* ENCRYPTED_SWAP */
1406 	lck_mtx_lock_spin_always(c_list_lock);
1407 	lck_mtx_lock_spin_always(&c_seg->c_lock);
1408 
1409 	if (kr == KERN_SUCCESS) {
1410 		int             new_state = C_ON_SWAPPEDOUT_Q;
1411 		boolean_t       insert_head = FALSE;
1412 
		/*
		 * During a hibernation flush, segments whose generation id
		 * lies in the "to warm" range go to the head of the queue;
		 * otherwise a sparse segment goes to the sparse queue.
		 */
1413 		if (hibernate_flushing == TRUE) {
1414 			if (c_seg->c_generation_id >= first_c_segment_to_warm_generation_id &&
1415 			    c_seg->c_generation_id <= last_c_segment_to_warm_generation_id) {
1416 				insert_head = TRUE;
1417 			}
1418 		} else if (C_SEG_ONDISK_IS_SPARSE(c_seg)) {
1419 			new_state = C_ON_SWAPPEDOUTSPARSE_Q;
1420 		}
1421 
1422 		c_seg_switch_state(c_seg, new_state, insert_head);
1423 
		/* remember where in swap the segment lives */
1424 		c_seg->c_store.c_swap_handle = f_offset;
1425 
1426 		counter_add(&vm_statistics_swapouts, size >> PAGE_SHIFT);
1427 
1428 		c_seg->c_swappedin = false;
1429 
		/* these bytes no longer count as in-memory compressor usage */
1430 		if (c_seg->c_bytes_used) {
1431 			OSAddAtomic64(-c_seg->c_bytes_used, &compressor_bytes_used);
1432 		}
1433 
1434 #if CONFIG_FREEZE
1435 		/*
1436 		 * Successful swapout. Decrement the in-core compressed pages count.
1437 		 */
1438 		OSAddAtomic(-(c_seg->c_slots_used), &c_segment_pages_compressed_incore);
1439 		assertf(c_segment_pages_compressed_incore >= 0, "-ve incore count %p 0x%x", c_seg, c_segment_pages_compressed_incore);
1440 		if (c_seg->c_has_donated_pages) {
1441 			OSAddAtomic(-(c_seg->c_slots_used), &c_segment_pages_compressed_incore_late_swapout);
1442 		}
1443 #endif /* CONFIG_FREEZE */
1444 	} else {
		/* failed swapout: undo the overage accounting, if any */
1445 		if (c_seg->c_overage_swap == TRUE) {
1446 			c_seg->c_overage_swap = FALSE;
1447 			c_overage_swapped_count--;
1448 		}
1449 
1450 #if CONFIG_FREEZE
1451 		if (c_seg->c_has_freezer_pages) {
1452 			if (c_seg->c_task_owner) {
1453 				c_seg_update_task_owner(c_seg, NULL);
1454 			}
1455 			/*
1456 			 * We failed to swapout a frozen cseg. We need
1457 			 * to put it back in the queues, specifically the
1458 			 * AGE_Q. So clear the donated bit otherwise it'll
1459 			 * land on the swapped_in Q.
1460 			 */
1461 			c_seg->c_has_donated_pages = 0;
1462 			c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
1463 		} else
1464 #endif /* CONFIG_FREEZE */
		/* with CONFIG_FREEZE this block is the "else" of the test above */
1465 		{
1466 			if (c_seg->c_has_donated_pages) {
1467 				c_seg_switch_state(c_seg, C_ON_SWAPPEDIN_Q, FALSE);
1468 			} else {
1469 				c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
1470 			}
1471 		}
1472 
		/* at least a page of slack and not queued yet: ask for compaction */
1473 		if (!c_seg->c_on_minorcompact_q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) {
1474 			c_seg_need_delayed_compaction(c_seg, TRUE);
1475 		}
1476 	}
1477 	assert(c_seg->c_busy_swapping);
1478 	assert(c_seg->c_busy);
1479 
1480 	c_seg->c_busy_swapping = 0;
1481 	lck_mtx_unlock_always(c_list_lock);
1482 
	/* clear busy and wake waiters while the segment lock is still held */
1483 	C_SEG_WAKEUP_DONE(c_seg);
1484 	lck_mtx_unlock_always(&c_seg->c_lock);
1485 
1486 	PAGE_REPLACEMENT_DISALLOWED(FALSE);
1487 }
1488 
1489 
/*
 * vm_swap_create_file:
 *
 * Bring one more swapfile into service.
 *
 * Reuses a swapfile structure flagged SWAP_REUSE from swf_global_queue if
 * there is one, otherwise allocates a new structure and builds its path
 * from swapfilename.  The file is opened and preallocated, starting at
 * MAX_SWAP_FILE_SIZE and halving the size after every failed attempt down
 * to MIN_SWAP_FILE_SIZE.  On success the segment bitmap and segment table
 * are allocated and the file is published as SWAP_READY (and SWAP_PINNED
 * if the preallocation pinned it) under vm_swap_data_lock.
 *
 * Returns TRUE if a swapfile was created, FALSE otherwise.  On failure a
 * freshly allocated structure is freed again; a reused one is kept.
 */
1490 boolean_t
vm_swap_create_file()1491 vm_swap_create_file()
1492 {
1493 	uint64_t        size = 0;
1494 	int             namelen = 0;
1495 	boolean_t       swap_file_created = FALSE;
1496 	boolean_t       swap_file_reuse = FALSE;
1497 	boolean_t       swap_file_pin = FALSE;
1498 	struct swapfile *swf = NULL;
1499 
1500 	/*
1501 	 * make sure we've got all the info we need
1502 	 * to potentially pin a swap file... we could
1503 	 * be swapping out due to hibernation w/o ever
1504 	 * having run vm_pageout_scan, which is normally
1505 	 * the trigger to do the init
1506 	 */
1507 	vm_compaction_swapper_do_init();
1508 
1509 	/*
1510 	 * Any swapfile structure ready for re-use?
1511 	 */
1512 
1513 	lck_mtx_lock(&vm_swap_data_lock);
1514 
1515 	swf = (struct swapfile*) queue_first(&swf_global_queue);
1516 
1517 	while (queue_end(&swf_global_queue, (queue_entry_t)swf) == FALSE) {
1518 		if (swf->swp_flags == SWAP_REUSE) {
1519 			swap_file_reuse = TRUE;
1520 			break;
1521 		}
1522 		swf = (struct swapfile*) queue_next(&swf->swp_queue);
1523 	}
1524 
1525 	lck_mtx_unlock(&vm_swap_data_lock);
1526 
	/*
	 * Nothing to reuse: allocate a new structure.  The path is the
	 * swapfilename prefix followed by the current swapfile count; the
	 * structure's index is that count plus one.
	 */
1527 	if (swap_file_reuse == FALSE) {
1528 		namelen = (int)strlen(swapfilename) + SWAPFILENAME_INDEX_LEN + 1;
1529 
1530 		swf = kalloc_type(struct swapfile, Z_WAITOK | Z_ZERO);
1531 		swf->swp_index = vm_num_swap_files + 1;
1532 		swf->swp_pathlen = namelen;
1533 		swf->swp_path = kalloc_data(swf->swp_pathlen, Z_WAITOK | Z_ZERO);
1534 
1535 		snprintf(swf->swp_path, namelen, "%s%d", swapfilename, vm_num_swap_files);
1536 	}
1537 
1538 	vm_swapfile_open(swf->swp_path, &swf->swp_vp);
1539 
	/* the open leaves swp_vp NULL on failure */
1540 	if (swf->swp_vp == NULL) {
1541 		if (swap_file_reuse == FALSE) {
1542 			kfree_data(swf->swp_path, swf->swp_pathlen);
1543 			kfree_type(struct swapfile, swf);
1544 		}
1545 		return FALSE;
1546 	}
1547 	vm_swapfile_can_be_created = TRUE;
1548 
1549 	size = MAX_SWAP_FILE_SIZE;
1550 
	/* try progressively smaller files until one can be preallocated */
1551 	while (size >= MIN_SWAP_FILE_SIZE) {
1552 		swap_file_pin = VM_SWAP_SHOULD_PIN(size);
1553 
1554 		if (vm_swapfile_preallocate(swf->swp_vp, &size, &swap_file_pin) == 0) {
1555 			int num_bytes_for_bitmap = 0;
1556 
1557 			swap_file_created = TRUE;
1558 
1559 			swf->swp_size = size;
1560 			swf->swp_nsegs = (unsigned int) (size / compressed_swap_chunk_size);
1561 			swf->swp_nseginuse = 0;
1562 			swf->swp_free_hint = 0;
1563 
			/*
			 * One bit per segment.
			 * NOTE(review): nsegs >> 3 rounds down, so the bitmap only
			 * covers every segment when swp_nsegs is below 8 or a
			 * multiple of 8 - confirm the file/chunk sizes guarantee it.
			 */
1564 			num_bytes_for_bitmap = MAX((swf->swp_nsegs >> 3), 1);
1565 			/*
1566 			 * Allocate a bitmap that describes the
1567 			 * number of segments held by this swapfile.
1568 			 */
1569 			swf->swp_bitmap = kalloc_data(num_bytes_for_bitmap,
1570 			    Z_WAITOK | Z_ZERO);
1571 
			/* per-segment back pointers to the c_segment stored there */
1572 			swf->swp_csegs = kalloc_type(c_segment_t, swf->swp_nsegs,
1573 			    Z_WAITOK | Z_ZERO);
1574 
1575 			/*
1576 			 * passing a NULL trim_list into vnode_trim_list
1577 			 * will return ENOTSUP if trim isn't supported
1578 			 * and 0 if it is
1579 			 */
1580 			if (vnode_trim_list(swf->swp_vp, NULL, FALSE) == 0) {
1581 				swp_trim_supported = TRUE;
1582 			}
1583 
			/* publish the file and update the global accounting */
1584 			lck_mtx_lock(&vm_swap_data_lock);
1585 
1586 			swf->swp_flags = SWAP_READY;
1587 
1588 			if (swap_file_reuse == FALSE) {
1589 				queue_enter(&swf_global_queue, swf, struct swapfile*, swp_queue);
1590 			}
1591 
1592 			vm_num_swap_files++;
1593 
1594 			vm_swapfile_total_segs_alloced += swf->swp_nsegs;
1595 			if (vm_swapfile_total_segs_alloced > vm_swapfile_total_segs_alloced_max) {
1596 				vm_swapfile_total_segs_alloced_max = vm_swapfile_total_segs_alloced;
1597 			}
1598 
			/* vm_swapfile_preallocate() may have updated swap_file_pin */
1599 			if (swap_file_pin == TRUE) {
1600 				vm_num_pinned_swap_files++;
1601 				swf->swp_flags |= SWAP_PINNED;
1602 				vm_swappin_avail -= swf->swp_size;
1603 			}
1604 
1605 			lck_mtx_unlock(&vm_swap_data_lock);
1606 
			/* vm_swap_put() sleeps on this event when it runs out of space */
1607 			thread_wakeup((event_t) &vm_num_swap_files);
1608 #if !XNU_TARGET_OS_OSX
			/* first swapfile: derive the overage limit from its size */
1609 			if (vm_num_swap_files == 1) {
1610 				c_overage_swapped_limit = (uint32_t)size / c_seg_bufsize;
1611 
1612 				if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
1613 					c_overage_swapped_limit /= 2;
1614 				}
1615 			}
1616 #endif /* !XNU_TARGET_OS_OSX */
1617 			break;
1618 		} else {
1619 			size = size / 2;
1620 		}
1621 	}
	/* no size could be preallocated: close the file and clean up */
1622 	if (swap_file_created == FALSE) {
1623 		vm_swapfile_close((uint64_t)(swf->swp_path), swf->swp_vp);
1624 
1625 		swf->swp_vp = NULL;
1626 
1627 		if (swap_file_reuse == FALSE) {
1628 			kfree_data(swf->swp_path, swf->swp_pathlen);
1629 			kfree_type(struct swapfile, swf);
1630 		}
1631 	}
1632 	return swap_file_created;
1633 }
1634 
1635 extern void vnode_put(struct vnode* vp);
1636 kern_return_t
vm_swap_get(c_segment_t c_seg,uint64_t f_offset,uint64_t size)1637 vm_swap_get(c_segment_t c_seg, uint64_t f_offset, uint64_t size)
1638 {
1639 	struct swapfile *swf = NULL;
1640 	uint64_t        file_offset = 0;
1641 	int             retval = 0;
1642 
1643 	assert(c_seg->c_store.c_buffer);
1644 
1645 	lck_mtx_lock(&vm_swap_data_lock);
1646 
1647 	swf = vm_swapfile_for_handle(f_offset);
1648 
1649 	if (swf == NULL || (!(swf->swp_flags & SWAP_READY) && !(swf->swp_flags & SWAP_RECLAIM))) {
1650 		vm_swap_get_failures++;
1651 		retval = 1;
1652 		goto done;
1653 	}
1654 	swf->swp_io_count++;
1655 
1656 	lck_mtx_unlock(&vm_swap_data_lock);
1657 
1658 #if DEVELOPMENT || DEBUG
1659 	C_SEG_MAKE_WRITEABLE(c_seg);
1660 #endif
1661 	file_offset = (f_offset & SWAP_SLOT_MASK);
1662 
1663 	if ((retval = vnode_getwithref(swf->swp_vp)) != 0) {
1664 		printf("vm_swap_get: vnode_getwithref on swapfile failed with %d\n", retval);
1665 	} else {
1666 		retval = vm_swapfile_io(swf->swp_vp, file_offset, (uint64_t)c_seg->c_store.c_buffer, (int)(size / PAGE_SIZE_64), SWAP_READ, NULL);
1667 		vnode_put(swf->swp_vp);
1668 	}
1669 
1670 #if DEVELOPMENT || DEBUG
1671 	C_SEG_WRITE_PROTECT(c_seg);
1672 #endif
1673 	if (retval == 0) {
1674 		counter_add(&vm_statistics_swapins, size >> PAGE_SHIFT);
1675 	} else {
1676 		vm_swap_get_failures++;
1677 	}
1678 
1679 	/*
1680 	 * Free this slot in the swap structure.
1681 	 */
1682 	vm_swap_free(f_offset);
1683 
1684 	lck_mtx_lock(&vm_swap_data_lock);
1685 	swf->swp_io_count--;
1686 
1687 	if ((swf->swp_flags & SWAP_WANTED) && swf->swp_io_count == 0) {
1688 		swf->swp_flags &= ~SWAP_WANTED;
1689 		thread_wakeup((event_t) &swf->swp_flags);
1690 	}
1691 done:
1692 	lck_mtx_unlock(&vm_swap_data_lock);
1693 
1694 	if (retval == 0) {
1695 		return KERN_SUCCESS;
1696 	} else {
1697 		return KERN_FAILURE;
1698 	}
1699 }
1700 
/*
 * vm_swap_put:
 *
 * Write a segment's data to a free slot in some swapfile.
 *
 *   addr      address of the data to write
 *   f_offset  out: swap handle (swapfile index and offset within the file)
 *   size      number of bytes to write
 *   c_seg     segment being swapped out; recorded in the swapfile's
 *             segment table and expected to be busy / busy_swapping
 *   soc       completion slot for an asynchronous write, or NULL
 *
 * A slot is claimed by scanning the bitmap of each SWAP_READY swapfile
 * that still has room, starting at its free hint.  If no slot is found the
 * create thread is nudged and the caller may sleep waiting for a new
 * swapfile before the call fails.
 *
 * When soc is non-NULL and the I/O is issued without error, KERN_SUCCESS
 * is returned with the swapfile's I/O count still held; the completion
 * path (vm_swapout_complete_soc) calls vm_swap_put_finish() later.  In
 * every other case vm_swap_put_finish() runs here and its result is
 * returned.  Returns KERN_FAILURE for bad arguments, when compaction has
 * been stopped, or when no swap space could be found.
 */
1701 kern_return_t
vm_swap_put(vm_offset_t addr,uint64_t * f_offset,uint32_t size,c_segment_t c_seg,struct swapout_io_completion * soc)1702 vm_swap_put(vm_offset_t addr, uint64_t *f_offset, uint32_t size, c_segment_t c_seg, struct swapout_io_completion *soc)
1703 {
1704 	unsigned int    segidx = 0;
1705 	struct swapfile *swf = NULL;
1706 	uint64_t        file_offset = 0;
1707 	uint64_t        swapfile_index = 0;
1708 	unsigned int    byte_for_segidx = 0;
1709 	unsigned int    offset_within_byte = 0;
1710 	boolean_t       swf_eligible = FALSE;
1711 	boolean_t       waiting = FALSE;
1712 	boolean_t       retried = FALSE;
1713 	int             error = 0;
1714 	clock_sec_t     sec;
1715 	clock_nsec_t    nsec;
1716 	void            *upl_ctx = NULL;
1717 	boolean_t       drop_iocount = FALSE;
1718 
1719 	if (addr == 0 || f_offset == NULL || compressor_store_stop_compaction) {
1720 		return KERN_FAILURE;
1721 	}
1722 retry:
1723 	lck_mtx_lock(&vm_swap_data_lock);
1724 
1725 	swf = (struct swapfile*) queue_first(&swf_global_queue);
1726 
1727 	while (queue_end(&swf_global_queue, (queue_entry_t)swf) == FALSE) {
1728 		segidx = swf->swp_free_hint;
1729 
		/* usable only if ready for I/O and not completely full */
1730 		swf_eligible =  (swf->swp_flags & SWAP_READY) && (swf->swp_nseginuse < swf->swp_nsegs);
1731 
1732 		if (swf_eligible) {
			/* scan the bitmap from the free hint for a clear bit */
1733 			while (segidx < swf->swp_nsegs) {
1734 				byte_for_segidx = segidx >> 3;
1735 				offset_within_byte = segidx % 8;
1736 
1737 				if ((swf->swp_bitmap)[byte_for_segidx] & (1 << offset_within_byte)) {
1738 					segidx++;
1739 					continue;
1740 				}
1741 
				/* claim the slot and take an I/O count on the file */
1742 				(swf->swp_bitmap)[byte_for_segidx] |= (uint8_t)(1 << offset_within_byte);
1743 
1744 				file_offset = segidx * compressed_swap_chunk_size;
1745 				swf->swp_nseginuse++;
1746 				swf->swp_io_count++;
1747 				swf->swp_csegs[segidx] = c_seg;
1748 
1749 				swapfile_index = swf->swp_index;
1750 				vm_swapfile_total_segs_used++;
1751 				if (vm_swapfile_total_segs_used > vm_swapfile_total_segs_used_max) {
1752 					vm_swapfile_total_segs_used_max = vm_swapfile_total_segs_used;
1753 				}
1754 
1755 				clock_get_system_nanotime(&sec, &nsec);
1756 
				/* ask for another swapfile ahead of time if one is called for */
1757 				if (VM_SWAP_SHOULD_CREATE(sec) && !vm_swapfile_create_thread_running) {
1758 					thread_wakeup((event_t) &vm_swapfile_create_needed);
1759 				}
1760 
1761 				lck_mtx_unlock(&vm_swap_data_lock);
1762 
1763 				goto issue_io;
1764 			}
1765 		}
1766 		swf = (struct swapfile*) queue_next(&swf->swp_queue);
1767 	}
1768 	assert(queue_end(&swf_global_queue, (queue_entry_t) swf));
1769 
1770 	/*
1771 	 * we've run out of swap segments, but may not
1772 	 * be in a position to immediately create a new swap
1773 	 * file if we've recently failed to create due to a lack
1774 	 * of free space in the root filesystem... we'll try
1775 	 * to kick that create off, but in any event we're going
1776 	 * to take a breather (up to 1 second) so that we're not caught in a tight
1777 	 * loop back in "vm_compressor_compact_and_swap" trying to stuff
1778 	 * segments into swap files only to have them immediately put back
1779 	 * on the c_age queue due to vm_swap_put failing.
1780 	 *
1781 	 * if we're doing these puts due to a hibernation flush,
1782 	 * no need to block... setting hibernate_no_swapspace to TRUE,
1783 	 * will cause "vm_compressor_compact_and_swap" to immediately abort
1784 	 */
1785 	clock_get_system_nanotime(&sec, &nsec);
1786 
1787 	if (VM_SWAP_SHOULD_CREATE(sec)) {
1788 		if (!vm_swapfile_create_thread_running) {
1789 			thread_wakeup((event_t) &vm_swapfile_create_needed);
1790 		}
1791 		waiting = TRUE;
		/* vm_swap_create_file() posts this event when a file is added */
1792 		assert_wait_timeout((event_t) &vm_num_swap_files, THREAD_INTERRUPTIBLE, 1000, 1000 * NSEC_PER_USEC);
1793 	} else {
1794 		if (hibernate_flushing) {
1795 			hibernate_no_swapspace = TRUE;
1796 		}
1797 	}
1798 
1799 	lck_mtx_unlock(&vm_swap_data_lock);
1800 
1801 	if (waiting == TRUE) {
1802 		thread_block(THREAD_CONTINUE_NULL);
1803 
		/* only a hibernation flush retries, and only once */
1804 		if (retried == FALSE && hibernate_flushing == TRUE) {
1805 			retried = TRUE;
1806 			goto retry;
1807 		}
1808 	}
1809 	vm_swap_put_failures_no_swap_file++;
1810 
1811 	return KERN_FAILURE;
1812 
1813 issue_io:
	/* a slot is reserved, swf points at its swapfile, no locks are held */
1814 	assert(c_seg->c_busy_swapping);
1815 	assert(c_seg->c_busy);
1816 	assert(!c_seg->c_on_minorcompact_q);
1817 
	/* the handle combines the swapfile index with the offset in the file */
1818 	*f_offset = (swapfile_index << SWAP_DEVICE_SHIFT) | file_offset;
1819 
	/* asynchronous request: record what the completion path will need */
1820 	if (soc) {
1821 		soc->swp_c_seg = c_seg;
1822 		soc->swp_c_size = size;
1823 
1824 		soc->swp_swf = swf;
1825 
1826 		soc->swp_io_error = 0;
1827 		soc->swp_io_done = 0;
1828 
1829 		upl_ctx = (void *)&soc->swp_upl_ctx;
1830 	}
1831 
1832 	if ((error = vnode_getwithref(swf->swp_vp)) != 0) {
1833 		printf("vm_swap_put: vnode_getwithref on swapfile failed with %d\n", error);
1834 	} else {
1835 		error = vm_swapfile_io(swf->swp_vp, file_offset, addr, (int) (size / PAGE_SIZE_64), SWAP_WRITE, upl_ctx);
		/* the vnode reference was obtained, so it has to be dropped later */
1836 		drop_iocount = TRUE;
1837 	}
1838 
	/* failed, or no completion context was supplied: finish up right here */
1839 	if (error || upl_ctx == NULL) {
1840 		return vm_swap_put_finish(swf, f_offset, error, drop_iocount);
1841 	}
1842 
1843 	return KERN_SUCCESS;
1844 }
1845 
1846 kern_return_t
vm_swap_put_finish(struct swapfile * swf,uint64_t * f_offset,int error,boolean_t drop_iocount)1847 vm_swap_put_finish(struct swapfile *swf, uint64_t *f_offset, int error, boolean_t drop_iocount)
1848 {
1849 	if (drop_iocount) {
1850 		vnode_put(swf->swp_vp);
1851 	}
1852 
1853 	lck_mtx_lock(&vm_swap_data_lock);
1854 
1855 	swf->swp_io_count--;
1856 
1857 	if ((swf->swp_flags & SWAP_WANTED) && swf->swp_io_count == 0) {
1858 		swf->swp_flags &= ~SWAP_WANTED;
1859 		thread_wakeup((event_t) &swf->swp_flags);
1860 	}
1861 	lck_mtx_unlock(&vm_swap_data_lock);
1862 
1863 	if (error) {
1864 		vm_swap_free(*f_offset);
1865 		vm_swap_put_failures++;
1866 
1867 		return KERN_FAILURE;
1868 	}
1869 	return KERN_SUCCESS;
1870 }
1871 
1872 
1873 static void
vm_swap_free_now(struct swapfile * swf,uint64_t f_offset)1874 vm_swap_free_now(struct swapfile *swf, uint64_t f_offset)
1875 {
1876 	uint64_t        file_offset = 0;
1877 	unsigned int    segidx = 0;
1878 
1879 
1880 	if ((swf->swp_flags & SWAP_READY) || (swf->swp_flags & SWAP_RECLAIM)) {
1881 		unsigned int byte_for_segidx = 0;
1882 		unsigned int offset_within_byte = 0;
1883 
1884 		file_offset = (f_offset & SWAP_SLOT_MASK);
1885 		segidx = (unsigned int) (file_offset / compressed_swap_chunk_size);
1886 
1887 		byte_for_segidx = segidx >> 3;
1888 		offset_within_byte = segidx % 8;
1889 
1890 		if ((swf->swp_bitmap)[byte_for_segidx] & (1 << offset_within_byte)) {
1891 			(swf->swp_bitmap)[byte_for_segidx] &= ~(1 << offset_within_byte);
1892 
1893 			swf->swp_csegs[segidx] = NULL;
1894 
1895 			swf->swp_nseginuse--;
1896 			vm_swapfile_total_segs_used--;
1897 
1898 			if (segidx < swf->swp_free_hint) {
1899 				swf->swp_free_hint = segidx;
1900 			}
1901 		}
1902 		if (VM_SWAP_SHOULD_RECLAIM() && !vm_swapfile_gc_thread_running) {
1903 			thread_wakeup((event_t) &vm_swapfile_gc_needed);
1904 		}
1905 	}
1906 }
1907 
1908 
1909 uint32_t vm_swap_free_now_count = 0;
1910 uint32_t vm_swap_free_delayed_count = 0;
1911 
1912 
1913 void
vm_swap_free(uint64_t f_offset)1914 vm_swap_free(uint64_t f_offset)
1915 {
1916 	struct swapfile *swf = NULL;
1917 	struct trim_list *tl = NULL;
1918 	clock_sec_t     sec;
1919 	clock_nsec_t    nsec;
1920 
1921 	if (swp_trim_supported == TRUE) {
1922 		tl = kalloc_type(struct trim_list, Z_WAITOK);
1923 	}
1924 
1925 	lck_mtx_lock(&vm_swap_data_lock);
1926 
1927 	swf = vm_swapfile_for_handle(f_offset);
1928 
1929 	if (swf && (swf->swp_flags & (SWAP_READY | SWAP_RECLAIM))) {
1930 		if (swp_trim_supported == FALSE || (swf->swp_flags & SWAP_RECLAIM)) {
1931 			/*
1932 			 * don't delay the free if the underlying disk doesn't support
1933 			 * trim, or we're in the midst of reclaiming this swap file since
1934 			 * we don't want to move segments that are technically free
1935 			 * but not yet handled by the delayed free mechanism
1936 			 */
1937 			vm_swap_free_now(swf, f_offset);
1938 
1939 			vm_swap_free_now_count++;
1940 			goto done;
1941 		}
1942 		tl->tl_offset = f_offset & SWAP_SLOT_MASK;
1943 		tl->tl_length = compressed_swap_chunk_size;
1944 
1945 		tl->tl_next = swf->swp_delayed_trim_list_head;
1946 		swf->swp_delayed_trim_list_head = tl;
1947 		swf->swp_delayed_trim_count++;
1948 		tl = NULL;
1949 
1950 		if (VM_SWAP_SHOULD_TRIM(swf) && !vm_swapfile_create_thread_running) {
1951 			clock_get_system_nanotime(&sec, &nsec);
1952 
1953 			if (sec > dont_trim_until_ts) {
1954 				thread_wakeup((event_t) &vm_swapfile_create_needed);
1955 			}
1956 		}
1957 		vm_swap_free_delayed_count++;
1958 	}
1959 done:
1960 	lck_mtx_unlock(&vm_swap_data_lock);
1961 
1962 	if (tl != NULL) {
1963 		kfree_type(struct trim_list, tl);
1964 	}
1965 }
1966 
1967 
1968 static void
vm_swap_wait_on_trim_handling_in_progress()1969 vm_swap_wait_on_trim_handling_in_progress()
1970 {
1971 	while (delayed_trim_handling_in_progress == TRUE) {
1972 		assert_wait((event_t) &delayed_trim_handling_in_progress, THREAD_UNINT);
1973 		lck_mtx_unlock(&vm_swap_data_lock);
1974 
1975 		thread_block(THREAD_CONTINUE_NULL);
1976 
1977 		lck_mtx_lock(&vm_swap_data_lock);
1978 	}
1979 }
1980 
1981 
1982 static void
vm_swap_handle_delayed_trims(boolean_t force_now)1983 vm_swap_handle_delayed_trims(boolean_t force_now)
1984 {
1985 	struct swapfile *swf = NULL;
1986 
1987 	/*
1988 	 * serialize the race between us and vm_swap_reclaim...
1989 	 * if vm_swap_reclaim wins it will turn off SWAP_READY
1990 	 * on the victim it has chosen... we can just skip over
1991 	 * that file since vm_swap_reclaim will first process
1992 	 * all of the delayed trims associated with it
1993 	 */
1994 
1995 	if (compressor_store_stop_compaction == TRUE) {
1996 		return;
1997 	}
1998 
1999 	lck_mtx_lock(&vm_swap_data_lock);
2000 
2001 	delayed_trim_handling_in_progress = TRUE;
2002 
2003 	lck_mtx_unlock(&vm_swap_data_lock);
2004 
2005 	/*
2006 	 * no need to hold the lock to walk the swf list since
2007 	 * vm_swap_create (the only place where we add to this list)
2008 	 * is run on the same thread as this function
2009 	 * and vm_swap_reclaim doesn't remove items from this list
2010 	 * instead marking them with SWAP_REUSE for future re-use
2011 	 */
2012 	swf = (struct swapfile*) queue_first(&swf_global_queue);
2013 
2014 	while (queue_end(&swf_global_queue, (queue_entry_t)swf) == FALSE) {
2015 		if ((swf->swp_flags & SWAP_READY) && (force_now == TRUE || VM_SWAP_SHOULD_TRIM(swf))) {
2016 			assert(!(swf->swp_flags & SWAP_RECLAIM));
2017 			vm_swap_do_delayed_trim(swf);
2018 		}
2019 		swf = (struct swapfile*) queue_next(&swf->swp_queue);
2020 	}
2021 	lck_mtx_lock(&vm_swap_data_lock);
2022 
2023 	delayed_trim_handling_in_progress = FALSE;
2024 	thread_wakeup((event_t) &delayed_trim_handling_in_progress);
2025 
2026 	if (VM_SWAP_SHOULD_RECLAIM() && !vm_swapfile_gc_thread_running) {
2027 		thread_wakeup((event_t) &vm_swapfile_gc_needed);
2028 	}
2029 
2030 	lck_mtx_unlock(&vm_swap_data_lock);
2031 }
2032 
2033 static void
vm_swap_do_delayed_trim(struct swapfile * swf)2034 vm_swap_do_delayed_trim(struct swapfile *swf)
2035 {
2036 	struct trim_list *tl, *tl_head;
2037 	int error;
2038 
2039 	if (compressor_store_stop_compaction == TRUE) {
2040 		return;
2041 	}
2042 
2043 	if ((error = vnode_getwithref(swf->swp_vp)) != 0) {
2044 		printf("vm_swap_do_delayed_trim: vnode_getwithref on swapfile failed with %d\n", error);
2045 		return;
2046 	}
2047 
2048 	lck_mtx_lock(&vm_swap_data_lock);
2049 
2050 	tl_head = swf->swp_delayed_trim_list_head;
2051 	swf->swp_delayed_trim_list_head = NULL;
2052 	swf->swp_delayed_trim_count = 0;
2053 
2054 	lck_mtx_unlock(&vm_swap_data_lock);
2055 
2056 	vnode_trim_list(swf->swp_vp, tl_head, TRUE);
2057 
2058 	(void) vnode_put(swf->swp_vp);
2059 
2060 	while ((tl = tl_head) != NULL) {
2061 		unsigned int    segidx = 0;
2062 		unsigned int    byte_for_segidx = 0;
2063 		unsigned int    offset_within_byte = 0;
2064 
2065 		lck_mtx_lock(&vm_swap_data_lock);
2066 
2067 		segidx = (unsigned int) (tl->tl_offset / compressed_swap_chunk_size);
2068 
2069 		byte_for_segidx = segidx >> 3;
2070 		offset_within_byte = segidx % 8;
2071 
2072 		if ((swf->swp_bitmap)[byte_for_segidx] & (1 << offset_within_byte)) {
2073 			(swf->swp_bitmap)[byte_for_segidx] &= ~(1 << offset_within_byte);
2074 
2075 			swf->swp_csegs[segidx] = NULL;
2076 
2077 			swf->swp_nseginuse--;
2078 			vm_swapfile_total_segs_used--;
2079 
2080 			if (segidx < swf->swp_free_hint) {
2081 				swf->swp_free_hint = segidx;
2082 			}
2083 		}
2084 		lck_mtx_unlock(&vm_swap_data_lock);
2085 
2086 		tl_head = tl->tl_next;
2087 
2088 		kfree_type(struct trim_list, tl);
2089 	}
2090 }
2091 
2092 
/*
 * No-op: performs no work and returns immediately.
 */
void
vm_swap_flush()
{
}
2098 
/*
 * Incremented each time vm_swap_reclaim() leaves its segment walk early
 * because of the stop-compaction / abort-reclaim / busy check in the loop.
 */
2099 int     vm_swap_reclaim_yielded = 0;
2100 
/*
 * Tries to empty one swapfile so that it can be closed.
 *
 * Picks the SWAP_READY swapfile with the fewest segments in use, flips it
 * from SWAP_READY to SWAP_RECLAIM, processes its delayed trims, and then for
 * every segment still marked in its bitmap reads the data back from the file
 * into a scratch buffer and writes it out again through vm_swap_put().
 *
 * If the walk finishes with no segments left in use, the file is closed, its
 * bookkeeping arrays are freed and the entry is left on the global queue
 * marked SWAP_REUSE. Otherwise the file is put back to SWAP_READY.
 */
2101 void
vm_swap_reclaim(void)2102 vm_swap_reclaim(void)
2103 {
2104 	vm_offset_t     addr = 0;
2105 	unsigned int    segidx = 0;
2106 	uint64_t        f_offset = 0;
2107 	struct swapfile *swf = NULL;
2108 	struct swapfile *smallest_swf = NULL;
2109 	unsigned int    min_nsegs = 0;
2110 	unsigned int    byte_for_segidx = 0;
2111 	unsigned int    offset_within_byte = 0;
2112 	uint32_t        c_size = 0;
2113 
2114 	c_segment_t     c_seg = NULL;
2115 
	/* scratch buffer of c_seg_bufsize bytes used for every read-back; released at "done" */
2116 	kmem_alloc(compressor_map, (vm_offset_t *)&addr, c_seg_bufsize,
2117 	    KMA_NOFAIL | KMA_KOBJECT | KMA_DATA, VM_KERN_MEMORY_COMPRESSOR);
2118 
2119 	lck_mtx_lock(&vm_swap_data_lock);
2120 
2121 	/*
2122 	 * if we're running the swapfile list looking for
2123 	 * candidates with delayed trims, we need to
2124 	 * wait before making our decision concerning
2125 	 * the swapfile we want to reclaim
2126 	 */
2127 	vm_swap_wait_on_trim_handling_in_progress();
2128 
2129 	/*
2130 	 * from here until we knock down the SWAP_READY bit,
2131 	 * we need to remain behind the vm_swap_data_lock...
2132 	 * once that bit has been turned off, "vm_swap_handle_delayed_trims"
2133 	 * will not consider this swapfile for processing
2134 	 */
2135 	swf = (struct swapfile*) queue_first(&swf_global_queue);
2136 	min_nsegs = MAX_SWAP_FILE_SIZE / compressed_swap_chunk_size;
2137 	smallest_swf = NULL;
2138 
	/*
	 * victim selection: the comparison is "<=", so among files with the
	 * same in-use count the one visited last in the queue wins
	 */
2139 	while (queue_end(&swf_global_queue, (queue_entry_t)swf) == FALSE) {
2140 		if ((swf->swp_flags & SWAP_READY) && (swf->swp_nseginuse <= min_nsegs)) {
2141 			smallest_swf = swf;
2142 			min_nsegs = swf->swp_nseginuse;
2143 		}
2144 		swf = (struct swapfile*) queue_next(&swf->swp_queue);
2145 	}
2146 
2147 	if (smallest_swf == NULL) {
2148 		goto done;
2149 	}
2150 
2151 	swf = smallest_swf;
2152 
2153 
2154 	swf->swp_flags &= ~SWAP_READY;
2155 	swf->swp_flags |= SWAP_RECLAIM;
2156 
	/* flush this file's queued trims first; the helper takes the lock itself */
2157 	if (swf->swp_delayed_trim_count) {
2158 		lck_mtx_unlock(&vm_swap_data_lock);
2159 
2160 		vm_swap_do_delayed_trim(swf);
2161 
2162 		lck_mtx_lock(&vm_swap_data_lock);
2163 	}
2164 	segidx = 0;
2165 
	/*
	 * segidx is only advanced when a slot is found free; after a slot has
	 * been processed its bitmap bit is clear, so the next pass over the
	 * same index takes the "free" branch and moves on
	 */
2166 	while (segidx < swf->swp_nsegs) {
2167 ReTry_for_cseg:
2168 		/*
2169 		 * Wait for outgoing I/Os.
2170 		 */
2171 		while (swf->swp_io_count) {
2172 			swf->swp_flags |= SWAP_WANTED;
2173 
2174 			assert_wait((event_t) &swf->swp_flags, THREAD_UNINT);
2175 			lck_mtx_unlock(&vm_swap_data_lock);
2176 
2177 			thread_block(THREAD_CONTINUE_NULL);
2178 
2179 			lck_mtx_lock(&vm_swap_data_lock);
2180 		}
2181 		if (compressor_store_stop_compaction == TRUE || VM_SWAP_SHOULD_ABORT_RECLAIM() || VM_SWAP_BUSY()) {
2182 			vm_swap_reclaim_yielded++;
2183 			break;
2184 		}
2185 
2186 		byte_for_segidx = segidx >> 3;
2187 		offset_within_byte = segidx % 8;
2188 
2189 		if (((swf->swp_bitmap)[byte_for_segidx] & (1 << offset_within_byte)) == 0) {
2190 			segidx++;
2191 			continue;
2192 		}
2193 
2194 		c_seg = swf->swp_csegs[segidx];
2195 		assert(c_seg);
2196 
2197 		lck_mtx_lock_spin_always(&c_seg->c_lock);
2198 
2199 		if (c_seg->c_busy) {
2200 			/*
2201 			 * a swapped out c_segment in the process of being freed will remain in the
2202 			 * busy state until after the vm_swap_free is called on it... vm_swap_free
2203 			 * takes the vm_swap_data_lock, so can't change the swap state until after
2204 			 * we drop the vm_swap_data_lock... once we do, vm_swap_free will complete
2205 			 * which will allow c_seg_free_locked to clear busy and wake up this thread...
2206 			 * at that point, we re-look up the swap state which will now indicate that
2207 			 * this c_segment no longer exists.
2208 			 */
2209 			c_seg->c_wanted = 1;
2210 
2211 			assert_wait((event_t) (c_seg), THREAD_UNINT);
2212 			lck_mtx_unlock_always(&c_seg->c_lock);
2213 
2214 			lck_mtx_unlock(&vm_swap_data_lock);
2215 
2216 			thread_block(THREAD_CONTINUE_NULL);
2217 
2218 			lck_mtx_lock(&vm_swap_data_lock);
2219 
2220 			goto ReTry_for_cseg;
2221 		}
		/*
		 * release the slot in this file's accounting before the data is
		 * read back; f_offset is the byte offset of the slot within the file
		 */
2222 		(swf->swp_bitmap)[byte_for_segidx] &= ~(1 << offset_within_byte);
2223 
2224 		f_offset = segidx * compressed_swap_chunk_size;
2225 
2226 		assert(c_seg == swf->swp_csegs[segidx]);
2227 		swf->swp_csegs[segidx] = NULL;
2228 		swf->swp_nseginuse--;
2229 
2230 		vm_swapfile_total_segs_used--;
2231 
2232 		lck_mtx_unlock(&vm_swap_data_lock);
2233 
2234 		assert(C_SEG_IS_ONDISK(c_seg));
2235 
2236 		C_SEG_BUSY(c_seg);
2237 		c_seg->c_busy_swapping = 1;
2238 #if !CHECKSUM_THE_SWAP
2239 		c_seg_trim_tail(c_seg);
2240 #endif
2241 		c_size = round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset));
2242 
2243 		assert(c_size <= c_seg_bufsize && c_size);
2244 
2245 		lck_mtx_unlock_always(&c_seg->c_lock);
2246 
2247 		if (vnode_getwithref(swf->swp_vp)) {
2248 			printf("vm_swap_reclaim: vnode_getwithref on swapfile failed.\n");
2249 			vm_swap_get_failures++;
			/*
			 * NOTE(review): c_seg->c_lock was dropped just above and this
			 * path does not call c_seg_swapin_requeue(), yet swap_io_failed
			 * unlocks c_seg->c_lock and c_busy_swapping is still set here.
			 * The other two failure paths go through c_seg_swapin_requeue()
			 * first (presumably returning with the lock held - confirm).
			 * Verify the lock state and segment state on this path.
			 */
2250 			goto swap_io_failed;
2251 		} else {
2252 			if (vm_swapfile_io(swf->swp_vp, f_offset, addr, (int)(c_size / PAGE_SIZE_64), SWAP_READ, NULL)) {
2253 				/*
2254 				 * reading the data back in failed, so convert c_seg
2255 				 * to a swapped in c_segment that contains no data
2256 				 */
2257 				c_seg_swapin_requeue(c_seg, FALSE, TRUE, FALSE);
2258 				/*
2259 				 * returns with c_busy_swapping cleared
2260 				 */
2261 				vnode_put(swf->swp_vp);
2262 				vm_swap_get_failures++;
2263 				goto swap_io_failed;
2264 			}
2265 			vnode_put(swf->swp_vp);
2266 		}
2267 
2268 		counter_add(&vm_statistics_swapins, c_size >> PAGE_SHIFT);
2269 		vmcs_stats.reclaim_swapins += c_size >> PAGE_SHIFT;
2270 
		/*
		 * write the data back out; f_offset is passed by address and is
		 * stored into the c_seg as its new swap handle on success below
		 */
2271 		if (vm_swap_put(addr, &f_offset, c_size, c_seg, NULL)) {
2272 			vm_offset_t     c_buffer;
2273 
2274 			/*
2275 			 * the put failed, so convert c_seg to a fully swapped in c_segment
2276 			 * with valid data
2277 			 */
2278 			c_buffer = (vm_offset_t)C_SEG_BUFFER_ADDRESS(c_seg->c_mysegno);
2279 
2280 			kernel_memory_populate(c_buffer, c_size,
2281 			    KMA_NOFAIL | KMA_COMPRESSOR,
2282 			    VM_KERN_MEMORY_COMPRESSOR);
2283 
2284 			memcpy((char *)c_buffer, (char *)addr, c_size);
2285 
2286 			c_seg->c_store.c_buffer = (int32_t *)c_buffer;
2287 #if ENCRYPTED_SWAP
2288 			vm_swap_decrypt(c_seg);
2289 #endif /* ENCRYPTED_SWAP */
2290 			c_seg_swapin_requeue(c_seg, TRUE, TRUE, FALSE);
2291 			/*
2292 			 * returns with c_busy_swapping cleared
2293 			 */
2294 			OSAddAtomic64(c_seg->c_bytes_used, &compressor_bytes_used);
2295 
2296 			goto swap_io_failed;
2297 		}
2298 		counter_add(&vm_statistics_swapouts, c_size >> PAGE_SHIFT);
2299 
2300 		lck_mtx_lock_spin_always(&c_seg->c_lock);
2301 
2302 		c_seg->c_swappedin = false;
2303 
2304 		assert(C_SEG_IS_ONDISK(c_seg));
2305 		/*
2306 		 * The c_seg will now know about the new location on disk.
2307 		 */
2308 		c_seg->c_store.c_swap_handle = f_offset;
2309 
2310 		assert(c_seg->c_busy_swapping);
2311 		c_seg->c_busy_swapping = 0;
		/* common exit for this segment (success falls through): clear busy, drop c_lock, re-take the swap data lock */
2312 swap_io_failed:
2313 		assert(c_seg->c_busy);
2314 		C_SEG_WAKEUP_DONE(c_seg);
2315 
2316 		lck_mtx_unlock_always(&c_seg->c_lock);
2317 		lck_mtx_lock(&vm_swap_data_lock);
2318 	}
2319 
	/* segments remain (walk was cut short or a slot could not be moved): make the file usable again */
2320 	if (swf->swp_nseginuse) {
2321 		swf->swp_flags &= ~SWAP_RECLAIM;
2322 		swf->swp_flags |= SWAP_READY;
2323 
2324 		goto done;
2325 	}
2326 	/*
2327 	 * We don't remove this inactive swf from the queue.
2328 	 * That way, we can re-use it when needed again and
2329 	 * preserve the namespace. The delayed_trim processing
2330 	 * is also dependent on us not removing swfs from the queue.
2331 	 */
2332 	//queue_remove(&swf_global_queue, swf, struct swapfile*, swp_queue);
2333 
2334 	vm_swapfile_total_segs_alloced -= swf->swp_nsegs;
2335 
	/* close the file and free its arrays with the lock dropped */
2336 	lck_mtx_unlock(&vm_swap_data_lock);
2337 
2338 	vm_swapfile_close((uint64_t)(swf->swp_path), swf->swp_vp);
2339 
2340 	kfree_type(c_segment_t, swf->swp_nsegs, swf->swp_csegs);
2341 	kfree_data(swf->swp_bitmap, MAX((swf->swp_nsegs >> 3), 1));
2342 
2343 	lck_mtx_lock(&vm_swap_data_lock);
2344 
2345 	if (swf->swp_flags & SWAP_PINNED) {
2346 		vm_num_pinned_swap_files--;
2347 		vm_swappin_avail += swf->swp_size;
2348 	}
2349 
	/* reset the entry; assigning SWAP_REUSE replaces all other flag bits */
2350 	swf->swp_vp = NULL;
2351 	swf->swp_size = 0;
2352 	swf->swp_free_hint = 0;
2353 	swf->swp_nsegs = 0;
2354 	swf->swp_flags = SWAP_REUSE;
2355 
2356 	vm_num_swap_files--;
2357 
2358 done:
	/*
	 * NOTE(review): when no candidate was found (smallest_swf == NULL) swf
	 * still holds the value the scan loop ended on, i.e. the queue end
	 * rather than a real swapfile, so the event address below is not a
	 * swapfile's flags word. No dereference happens; confirm this wakeup
	 * is harmless in that case.
	 */
2359 	thread_wakeup((event_t) &swf->swp_flags);
2360 	lck_mtx_unlock(&vm_swap_data_lock);
2361 
2362 	kmem_free(compressor_map, (vm_offset_t) addr, c_seg_bufsize);
2363 }
2364 
2365 
2366 uint64_t
vm_swap_get_total_space(void)2367 vm_swap_get_total_space(void)
2368 {
2369 	uint64_t total_space = 0;
2370 
2371 	total_space = (uint64_t)vm_swapfile_total_segs_alloced * compressed_swap_chunk_size;
2372 
2373 	return total_space;
2374 }
2375 
2376 uint64_t
vm_swap_get_used_space(void)2377 vm_swap_get_used_space(void)
2378 {
2379 	uint64_t used_space = 0;
2380 
2381 	used_space = (uint64_t)vm_swapfile_total_segs_used * compressed_swap_chunk_size;
2382 
2383 	return used_space;
2384 }
2385 
2386 uint64_t
vm_swap_get_free_space(void)2387 vm_swap_get_free_space(void)
2388 {
2389 	return vm_swap_get_total_space() - vm_swap_get_used_space();
2390 }
2391 
2392 uint64_t
vm_swap_get_max_configured_space(void)2393 vm_swap_get_max_configured_space(void)
2394 {
2395 	int num_swap_files = (vm_num_swap_files_config ? vm_num_swap_files_config : VM_MAX_SWAP_FILE_NUM);
2396 	return num_swap_files * MAX_SWAP_FILE_SIZE;
2397 }
2398 
2399 int
vm_swap_low_on_space(void)2400 vm_swap_low_on_space(void)
2401 {
2402 	if (vm_num_swap_files == 0 && vm_swapfile_can_be_created == FALSE) {
2403 		return 0;
2404 	}
2405 
2406 	if (((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) < ((unsigned int)vm_swapfile_hiwater_segs) / 8)) {
2407 		if (vm_num_swap_files == 0 && !SWAPPER_NEEDS_TO_UNTHROTTLE()) {
2408 			return 0;
2409 		}
2410 
2411 		if (vm_swapfile_last_failed_to_create_ts >= vm_swapfile_last_successful_create_ts) {
2412 			return 1;
2413 		}
2414 	}
2415 	return 0;
2416 }
2417 
2418 int
vm_swap_out_of_space(void)2419 vm_swap_out_of_space(void)
2420 {
2421 	if ((vm_num_swap_files == vm_num_swap_files_config) &&
2422 	    ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) < VM_SWAPOUT_LIMIT_MAX)) {
2423 		/*
2424 		 * Last swapfile and we have only space for the
2425 		 * last few swapouts.
2426 		 */
2427 		return 1;
2428 	}
2429 
2430 	return 0;
2431 }
2432 
2433 boolean_t
vm_swap_files_pinned(void)2434 vm_swap_files_pinned(void)
2435 {
2436 	boolean_t result;
2437 
2438 	if (vm_swappin_enabled == FALSE) {
2439 		return TRUE;
2440 	}
2441 
2442 	result = (vm_num_pinned_swap_files == vm_num_swap_files);
2443 
2444 	return result;
2445 }
2446 
2447 #if CONFIG_FREEZE
2448 boolean_t
vm_swap_max_budget(uint64_t * freeze_daily_budget)2449 vm_swap_max_budget(uint64_t *freeze_daily_budget)
2450 {
2451 	boolean_t       use_device_value = FALSE;
2452 	struct swapfile *swf = NULL;
2453 
2454 	if (vm_num_swap_files) {
2455 		lck_mtx_lock(&vm_swap_data_lock);
2456 
2457 		swf = (struct swapfile*) queue_first(&swf_global_queue);
2458 
2459 		if (swf) {
2460 			while (queue_end(&swf_global_queue, (queue_entry_t)swf) == FALSE) {
2461 				if (swf->swp_flags == SWAP_READY) {
2462 					assert(swf->swp_vp);
2463 
2464 					if (vm_swap_vol_get_budget(swf->swp_vp, freeze_daily_budget) == 0) {
2465 						use_device_value = TRUE;
2466 					}
2467 					break;
2468 				}
2469 				swf = (struct swapfile*) queue_next(&swf->swp_queue);
2470 			}
2471 		}
2472 
2473 		lck_mtx_unlock(&vm_swap_data_lock);
2474 	} else {
2475 		/*
2476 		 * This block is used for the initial budget value before any swap files
2477 		 * are created. We create a temp swap file to get the budget.
2478 		 */
2479 
2480 		struct vnode *temp_vp = NULL;
2481 
2482 		vm_swapfile_open(swapfilename, &temp_vp);
2483 
2484 		if (temp_vp) {
2485 			if (vm_swap_vol_get_budget(temp_vp, freeze_daily_budget) == 0) {
2486 				use_device_value = TRUE;
2487 			}
2488 
2489 			vm_swapfile_close((uint64_t)&swapfilename, temp_vp);
2490 			temp_vp = NULL;
2491 		} else {
2492 			*freeze_daily_budget = 0;
2493 		}
2494 	}
2495 
2496 	return use_device_value;
2497 }
2498 #endif /* CONFIG_FREEZE */
2499 
2500 void
vm_swap_reset_max_segs_tracking(uint64_t * alloced_max,uint64_t * used_max)2501 vm_swap_reset_max_segs_tracking(uint64_t *alloced_max, uint64_t *used_max)
2502 {
2503 	lck_mtx_lock(&vm_swap_data_lock);
2504 
2505 	*alloced_max = (uint64_t) vm_swapfile_total_segs_alloced_max * compressed_swap_chunk_size;
2506 	*used_max = (uint64_t) vm_swapfile_total_segs_used_max * compressed_swap_chunk_size;
2507 
2508 	vm_swapfile_total_segs_alloced_max = vm_swapfile_total_segs_alloced;
2509 	vm_swapfile_total_segs_used_max = vm_swapfile_total_segs_used;
2510 
2511 	lck_mtx_unlock(&vm_swap_data_lock);
2512 }
2513