xref: /xnu-8796.141.3/osfmk/vm/vm_compressor.c (revision 1b191cb58250d0705d8a51287127505aa4bc0789)
1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <vm/vm_compressor.h>
30 
31 #if CONFIG_PHANTOM_CACHE
32 #include <vm/vm_phantom_cache.h>
33 #endif
34 
35 #include <vm/vm_map.h>
36 #include <vm/vm_pageout.h>
37 #include <vm/memory_object.h>
38 #include <vm/vm_compressor_algorithms.h>
39 #include <vm/vm_compressor_backing_store.h>
40 #include <vm/vm_fault.h>
41 #include <vm/vm_protos.h>
42 #include <mach/mach_host.h>             /* for host_info() */
43 #if DEVELOPMENT || DEBUG
44 #include <kern/hvg_hypercall.h>
45 #endif
46 #include <kern/ledger.h>
47 #include <kern/policy_internal.h>
48 #include <kern/thread_group.h>
49 #include <san/kasan.h>
50 #include <os/log.h>
51 #include <pexpert/pexpert.h>
52 #include <pexpert/device_tree.h>
53 
54 #if defined(__x86_64__)
55 #include <i386/misc_protos.h>
56 #endif
57 #if defined(__arm64__)
58 #include <arm/machine_routines.h>
59 #endif
60 
61 #include <IOKit/IOHibernatePrivate.h>
62 
63 /*
64  * The segment buffer size is a tradeoff.
65  * A larger buffer leads to faster I/O throughput, better compression ratios
66  * (since fewer bytes are wasted at the end of the segment),
67  * and less overhead (both in time and space).
 * However, a smaller buffer causes less swap when the system is overcommitted
69  * b/c a higher percentage of the swapped-in segment is definitely accessed
70  * before it goes back out to storage.
71  *
72  * So on systems without swap, a larger segment is a clear win.
73  * On systems with swap, the choice is murkier. Empirically, we've
74  * found that a 64KB segment provides a better tradeoff both in terms of
75  * performance and swap writes than a 256KB segment on systems with fast SSDs
76  * and a HW compression block.
77  */
78 #define C_SEG_BUFSIZE_ARM_SWAP (1024 * 64)
79 #if XNU_TARGET_OS_OSX && defined(__arm64__)
80 #define C_SEG_BUFSIZE_DEFAULT C_SEG_BUFSIZE_ARM_SWAP
81 #else
82 #define C_SEG_BUFSIZE_DEFAULT (1024 * 256)
#endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */
84 uint32_t c_seg_bufsize;
85 
86 uint32_t c_seg_max_pages, c_seg_off_limit, c_seg_allocsize, c_seg_slot_var_array_min_len;
87 
88 extern boolean_t vm_darkwake_mode;
89 extern zone_t vm_page_zone;
90 
91 #if DEVELOPMENT || DEBUG
92 /* sysctl defined in bsd/dev/arm64/sysctl.c */
93 int do_cseg_wedge_thread(void);
94 int do_cseg_unwedge_thread(void);
95 static event_t debug_cseg_wait_event = NULL;
96 #endif /* DEVELOPMENT || DEBUG */
97 
98 #if CONFIG_FREEZE
99 bool freezer_incore_cseg_acct = TRUE; /* Only count incore compressed memory for jetsams. */
100 void task_disown_frozen_csegs(task_t owner_task);
101 #endif /* CONFIG_FREEZE */
102 
103 #if POPCOUNT_THE_COMPRESSED_DATA
104 boolean_t popcount_c_segs = TRUE;
105 
106 static inline uint32_t
vmc_pop(uintptr_t ins,int sz)107 vmc_pop(uintptr_t ins, int sz)
108 {
109 	uint32_t rv = 0;
110 
111 	if (__probable(popcount_c_segs == FALSE)) {
112 		return 0xDEAD707C;
113 	}
114 
115 	while (sz >= 16) {
116 		uint32_t rv1, rv2;
117 		uint64_t *ins64 = (uint64_t *) ins;
118 		uint64_t *ins642 = (uint64_t *) (ins + 8);
119 		rv1 = __builtin_popcountll(*ins64);
120 		rv2 = __builtin_popcountll(*ins642);
121 		rv += rv1 + rv2;
122 		sz -= 16;
123 		ins += 16;
124 	}
125 
126 	while (sz >= 4) {
127 		uint32_t *ins32 = (uint32_t *) ins;
128 		rv += __builtin_popcount(*ins32);
129 		sz -= 4;
130 		ins += 4;
131 	}
132 
133 	while (sz > 0) {
134 		char *ins8 = (char *)ins;
135 		rv += __builtin_popcount(*ins8);
136 		sz--;
137 		ins++;
138 	}
139 	return rv;
140 }
141 #endif
142 
143 #if VALIDATE_C_SEGMENTS
144 boolean_t validate_c_segs = TRUE;
145 #endif
146 /*
147  * vm_compressor_mode has a hierarchy of control to set its value.
148  * boot-args are checked first, then device-tree, and finally
149  * the default value that is defined below. See vm_fault_init() for
150  * the boot-arg & device-tree code.
151  */
152 
153 #if !XNU_TARGET_OS_OSX
154 
155 #if CONFIG_FREEZE
156 int     vm_compressor_mode = VM_PAGER_FREEZER_DEFAULT;
157 struct  freezer_context freezer_context_global;
158 #else /* CONFIG_FREEZE */
159 int     vm_compressor_mode = VM_PAGER_NOT_CONFIGURED;
160 #endif /* CONFIG_FREEZE */
161 
162 #else /* !XNU_TARGET_OS_OSX */
163 int             vm_compressor_mode = VM_PAGER_COMPRESSOR_WITH_SWAP;
164 
165 #endif /* !XNU_TARGET_OS_OSX */
166 
167 TUNABLE(uint32_t, vm_compression_limit, "vm_compression_limit", 0);
168 int             vm_compressor_is_active = 0;
169 int             vm_compressor_available = 0;
170 
171 extern uint64_t vm_swap_get_max_configured_space(void);
172 extern void     vm_pageout_io_throttle(void);
173 bool vm_compressor_swapout_is_ripe(void);
174 
175 #if CHECKSUM_THE_DATA || CHECKSUM_THE_SWAP || CHECKSUM_THE_COMPRESSED_DATA
176 extern unsigned int hash_string(char *cp, int len);
177 static unsigned int vmc_hash(char *, int);
178 boolean_t checksum_c_segs = TRUE;
179 
180 unsigned int
vmc_hash(char * cp,int len)181 vmc_hash(char *cp, int len)
182 {
183 	if (__probable(checksum_c_segs == FALSE)) {
184 		return 0xDEAD7A37;
185 	}
186 	return hash_string(cp, len);
187 }
188 #endif
189 
190 #define UNPACK_C_SIZE(cs)       ((cs->c_size == (PAGE_SIZE-1)) ? PAGE_SIZE : cs->c_size)
191 #define PACK_C_SIZE(cs, size)   (cs->c_size = ((size == PAGE_SIZE) ? PAGE_SIZE - 1 : size))
192 
193 
194 struct c_sv_hash_entry {
195 	union {
196 		struct  {
197 			uint32_t        c_sv_he_ref;
198 			uint32_t        c_sv_he_data;
199 		} c_sv_he;
200 		uint64_t        c_sv_he_record;
201 	} c_sv_he_un;
202 };
203 
204 #define he_ref  c_sv_he_un.c_sv_he.c_sv_he_ref
205 #define he_data c_sv_he_un.c_sv_he.c_sv_he_data
206 #define he_record c_sv_he_un.c_sv_he_record
207 
208 #define C_SV_HASH_MAX_MISS      32
209 #define C_SV_HASH_SIZE          ((1 << 10))
210 #define C_SV_HASH_MASK          ((1 << 10) - 1)
211 #define C_SV_CSEG_ID            ((1 << 22) - 1)
212 
213 
214 union c_segu {
215 	c_segment_t     c_seg;
216 	uintptr_t       c_segno;
217 };
218 
219 #define C_SLOT_ASSERT_PACKABLE(ptr) \
220 	VM_ASSERT_POINTER_PACKABLE((vm_offset_t)(ptr), C_SLOT_PACKED_PTR);
221 
222 #define C_SLOT_PACK_PTR(ptr) \
223 	VM_PACK_POINTER((vm_offset_t)(ptr), C_SLOT_PACKED_PTR)
224 
225 #define C_SLOT_UNPACK_PTR(cslot) \
226 	(c_slot_mapping_t)VM_UNPACK_POINTER((cslot)->c_packed_ptr, C_SLOT_PACKED_PTR)
227 
228 /* for debugging purposes */
229 SECURITY_READ_ONLY_EARLY(vm_packing_params_t) c_slot_packing_params =
230     VM_PACKING_PARAMS(C_SLOT_PACKED_PTR);
231 
232 uint32_t        c_segment_count = 0;
233 uint32_t        c_segment_count_max = 0;
234 
235 uint64_t        c_generation_id = 0;
236 uint64_t        c_generation_id_flush_barrier;
237 
238 
239 #define         HIBERNATE_FLUSHING_SECS_TO_COMPLETE     120
240 
241 boolean_t       hibernate_no_swapspace = FALSE;
242 boolean_t       hibernate_flush_timed_out = FALSE;
243 clock_sec_t     hibernate_flushing_deadline = 0;
244 
245 #if RECORD_THE_COMPRESSED_DATA
246 char    *c_compressed_record_sbuf;
247 char    *c_compressed_record_ebuf;
248 char    *c_compressed_record_cptr;
249 #endif
250 
251 
252 queue_head_t    c_age_list_head;
253 queue_head_t    c_early_swappedin_list_head, c_regular_swappedin_list_head, c_late_swappedin_list_head;
254 queue_head_t    c_early_swapout_list_head, c_regular_swapout_list_head, c_late_swapout_list_head;
255 queue_head_t    c_swapio_list_head;
256 queue_head_t    c_swappedout_list_head;
257 queue_head_t    c_swappedout_sparse_list_head;
258 queue_head_t    c_major_list_head;
259 queue_head_t    c_filling_list_head;
260 queue_head_t    c_bad_list_head;
261 
262 uint32_t        c_age_count = 0;
263 uint32_t        c_early_swappedin_count = 0, c_regular_swappedin_count = 0, c_late_swappedin_count = 0;
264 uint32_t        c_early_swapout_count = 0, c_regular_swapout_count = 0, c_late_swapout_count = 0;
265 uint32_t        c_swapio_count = 0;
266 uint32_t        c_swappedout_count = 0;
267 uint32_t        c_swappedout_sparse_count = 0;
268 uint32_t        c_major_count = 0;
269 uint32_t        c_filling_count = 0;
270 uint32_t        c_empty_count = 0;
271 uint32_t        c_bad_count = 0;
272 
273 
274 queue_head_t    c_minor_list_head;
275 uint32_t        c_minor_count = 0;
276 
277 int             c_overage_swapped_count = 0;
278 int             c_overage_swapped_limit = 0;
279 
280 int             c_seg_fixed_array_len;
281 union  c_segu   *c_segments;
282 vm_offset_t     c_buffers;
283 vm_size_t       c_buffers_size;
284 caddr_t         c_segments_next_page;
285 boolean_t       c_segments_busy;
286 uint32_t        c_segments_available;
287 uint32_t        c_segments_limit;
288 uint32_t        c_segments_nearing_limit;
289 
290 uint32_t        c_segment_svp_in_hash;
291 uint32_t        c_segment_svp_hash_succeeded;
292 uint32_t        c_segment_svp_hash_failed;
293 uint32_t        c_segment_svp_zero_compressions;
294 uint32_t        c_segment_svp_nonzero_compressions;
295 uint32_t        c_segment_svp_zero_decompressions;
296 uint32_t        c_segment_svp_nonzero_decompressions;
297 
298 uint32_t        c_segment_noncompressible_pages;
299 
300 uint32_t        c_segment_pages_compressed = 0; /* Tracks # of uncompressed pages fed into the compressor */
301 #if CONFIG_FREEZE
302 int32_t         c_segment_pages_compressed_incore = 0; /* Tracks # of uncompressed pages fed into the compressor that are in memory */
303 int32_t         c_segment_pages_compressed_incore_late_swapout = 0; /* Tracks # of uncompressed pages fed into the compressor that are in memory and tagged for swapout */
304 uint32_t        c_segments_incore_limit = 0; /* Tracks # of segments allowed to be in-core. Based on compressor pool size */
305 #endif /* CONFIG_FREEZE */
306 
307 uint32_t        c_segment_pages_compressed_limit;
308 uint32_t        c_segment_pages_compressed_nearing_limit;
309 uint32_t        c_free_segno_head = (uint32_t)-1;
310 
311 uint32_t        vm_compressor_minorcompact_threshold_divisor = 10;
312 uint32_t        vm_compressor_majorcompact_threshold_divisor = 10;
313 uint32_t        vm_compressor_unthrottle_threshold_divisor = 10;
314 uint32_t        vm_compressor_catchup_threshold_divisor = 10;
315 
316 uint32_t        vm_compressor_minorcompact_threshold_divisor_overridden = 0;
317 uint32_t        vm_compressor_majorcompact_threshold_divisor_overridden = 0;
318 uint32_t        vm_compressor_unthrottle_threshold_divisor_overridden = 0;
319 uint32_t        vm_compressor_catchup_threshold_divisor_overridden = 0;
320 
321 #define         C_SEGMENTS_PER_PAGE     (PAGE_SIZE / sizeof(union c_segu))
322 
323 LCK_GRP_DECLARE(vm_compressor_lck_grp, "vm_compressor");
324 LCK_RW_DECLARE(c_master_lock, &vm_compressor_lck_grp);
325 LCK_MTX_DECLARE(c_list_lock_storage, &vm_compressor_lck_grp);
326 
327 boolean_t       decompressions_blocked = FALSE;
328 
329 zone_t          compressor_segment_zone;
330 int             c_compressor_swap_trigger = 0;
331 
332 uint32_t        compressor_cpus;
333 char            *compressor_scratch_bufs;
334 char            *kdp_compressor_scratch_buf;
335 char            *kdp_compressor_decompressed_page;
336 addr64_t        kdp_compressor_decompressed_page_paddr;
337 ppnum_t         kdp_compressor_decompressed_page_ppnum;
338 
339 clock_sec_t     start_of_sample_period_sec = 0;
340 clock_nsec_t    start_of_sample_period_nsec = 0;
341 clock_sec_t     start_of_eval_period_sec = 0;
342 clock_nsec_t    start_of_eval_period_nsec = 0;
343 uint32_t        sample_period_decompression_count = 0;
344 uint32_t        sample_period_compression_count = 0;
345 uint32_t        last_eval_decompression_count = 0;
346 uint32_t        last_eval_compression_count = 0;
347 
348 #define         DECOMPRESSION_SAMPLE_MAX_AGE            (60 * 30)
349 
350 boolean_t       vm_swapout_ripe_segments = FALSE;
351 uint32_t        vm_ripe_target_age = (60 * 60 * 48);
352 
353 uint32_t        swapout_target_age = 0;
354 uint32_t        age_of_decompressions_during_sample_period[DECOMPRESSION_SAMPLE_MAX_AGE];
355 uint32_t        overage_decompressions_during_sample_period = 0;
356 
357 
358 void            do_fastwake_warmup(queue_head_t *, boolean_t);
359 boolean_t       fastwake_warmup = FALSE;
360 boolean_t       fastwake_recording_in_progress = FALSE;
361 clock_sec_t     dont_trim_until_ts = 0;
362 
363 uint64_t        c_segment_warmup_count;
364 uint64_t        first_c_segment_to_warm_generation_id = 0;
365 uint64_t        last_c_segment_to_warm_generation_id = 0;
366 boolean_t       hibernate_flushing = FALSE;
367 
368 int64_t         c_segment_input_bytes __attribute__((aligned(8))) = 0;
369 int64_t         c_segment_compressed_bytes __attribute__((aligned(8))) = 0;
370 int64_t         compressor_bytes_used __attribute__((aligned(8))) = 0;
371 
372 /* Keeps track of the most recent timestamp for when major compaction finished. */
373 mach_timespec_t major_compact_ts;
374 
375 struct c_sv_hash_entry c_segment_sv_hash_table[C_SV_HASH_SIZE]  __attribute__ ((aligned(8)));
376 
377 static void vm_compressor_swap_trigger_thread(void);
378 static void vm_compressor_do_delayed_compactions(boolean_t);
379 static void vm_compressor_compact_and_swap(boolean_t);
380 static void vm_compressor_process_regular_swapped_in_segments(boolean_t);
381 void vm_compressor_process_special_swapped_in_segments(void);
382 static void vm_compressor_process_special_swapped_in_segments_locked(void);
383 
384 struct vm_compressor_swapper_stats vmcs_stats;
385 
386 #if XNU_TARGET_OS_OSX
387 #if (__arm64__)
388 static void vm_compressor_process_major_segments(void);
389 #endif /* (__arm64__) */
390 static void vm_compressor_take_paging_space_action(void);
391 #endif /* XNU_TARGET_OS_OSX */
392 
393 void compute_swapout_target_age(void);
394 
395 boolean_t c_seg_major_compact(c_segment_t, c_segment_t);
396 boolean_t c_seg_major_compact_ok(c_segment_t, c_segment_t);
397 
398 int  c_seg_minor_compaction_and_unlock(c_segment_t, boolean_t);
399 int  c_seg_do_minor_compaction_and_unlock(c_segment_t, boolean_t, boolean_t, boolean_t);
400 void c_seg_try_minor_compaction_and_unlock(c_segment_t c_seg);
401 
402 void c_seg_move_to_sparse_list(c_segment_t);
403 void c_seg_insert_into_q(queue_head_t *, c_segment_t);
404 
405 uint64_t vm_available_memory(void);
406 uint64_t vm_compressor_pages_compressed(void);
407 uint32_t vm_compressor_pool_size(void);
408 uint32_t vm_compressor_fragmentation_level(void);
409 uint32_t vm_compression_ratio(void);
410 
411 /*
412  * indicate the need to do a major compaction if
413  * the overall set of in-use compression segments
414  * becomes sparse... on systems that support pressure
415  * driven swapping, this will also cause swapouts to
416  * be initiated.
417  */
static inline bool
vm_compressor_needs_to_major_compact()
{
	uint32_t        incore_seg_count;

	/* Segments currently resident in memory (i.e. not swapped out). */
	incore_seg_count = c_segment_count - c_swappedout_count - c_swappedout_sparse_count;

	/*
	 * Compact once (a) the pool holds at least 1/8th of the
	 * nearing-limit segment count, and (b) the slack in the in-core
	 * segments -- their nominal page capacity minus the pages the
	 * compressor actually holds -- exceeds 1/8th of that capacity.
	 * NOTE(review): counters are read without the list lock, so this
	 * is a heuristic over possibly-stale values.
	 */
	if ((c_segment_count >= (c_segments_nearing_limit / 8)) &&
	    ((incore_seg_count * c_seg_max_pages) - VM_PAGE_COMPRESSOR_COUNT) >
	    ((incore_seg_count / 8) * c_seg_max_pages)) {
		return true;
	}
	return false;
}
432 
433 
uint64_t
vm_available_memory(void)
{
	/* Available (non-compressed) memory, converted from pages to bytes. */
	return ((uint64_t)AVAILABLE_NON_COMPRESSED_MEMORY) * PAGE_SIZE_64;
}
439 
440 
uint32_t
vm_compressor_pool_size(void)
{
	/* Number of physical pages currently held by the compressor pool. */
	return VM_PAGE_COMPRESSOR_COUNT;
}
446 
447 uint32_t
vm_compressor_fragmentation_level(void)448 vm_compressor_fragmentation_level(void)
449 {
450 	const uint32_t incore_seg_count = c_segment_count - c_swappedout_count - c_swappedout_sparse_count;
451 	if ((incore_seg_count == 0) || (c_seg_max_pages == 0)) {
452 		return 0;
453 	}
454 	return 100 - (vm_compressor_pool_size() * 100 / (incore_seg_count * c_seg_max_pages));
455 }
456 
uint32_t
vm_compression_ratio(void)
{
	/* Average number of uncompressed pages stored per compressor-pool page. */
	if (vm_compressor_pool_size() == 0) {
		/* Empty pool: report "infinite" compression rather than dividing by zero. */
		return UINT32_MAX;
	}
	return c_segment_pages_compressed / vm_compressor_pool_size();
}
465 
uint64_t
vm_compressor_pages_compressed(void)
{
	/* Uncompressed pages fed into the compressor, expressed in bytes. */
	return c_segment_pages_compressed * PAGE_SIZE_64;
}
471 
/*
 * Return true when the count of compressed pages has crossed the
 * "nearing limit" watermark (98% of the configured page limit).
 */
bool
vm_compressor_compressed_pages_nearing_limit(void)
{
	uint32_t pages = 0;

#if CONFIG_FREEZE
	/* Freezer configs only count compressed pages still resident in memory. */
	pages = os_atomic_load(&c_segment_pages_compressed_incore, relaxed);
#else /* CONFIG_FREEZE */
	pages = c_segment_pages_compressed;
#endif /* CONFIG_FREEZE */

	return pages > c_segment_pages_compressed_nearing_limit;
}
485 
/*
 * Return true when the segment count has crossed the "nearing limit"
 * watermark (98% of c_segments_limit).
 */
static bool
vm_compressor_segments_nearing_limit(void)
{
	uint64_t segments;

#if CONFIG_FREEZE
	if (freezer_incore_cseg_acct) {
		/*
		 * Only in-core segments count against the limit.  The
		 * subtractions are overflow-checked and clamped to zero in
		 * case the independently-updated counters are momentarily
		 * inconsistent.
		 */
		if (os_sub_overflow(c_segment_count, c_swappedout_count, &segments)) {
			segments = 0;
		}
		if (os_sub_overflow(segments, c_swappedout_sparse_count, &segments)) {
			segments = 0;
		}
	} else {
		segments = os_atomic_load(&c_segment_count, relaxed);
	}
#else /* CONFIG_FREEZE */
	segments = c_segment_count;
#endif /* CONFIG_FREEZE */

	return segments > c_segments_nearing_limit;
}
508 
/*
 * True when either the compressed-page count or the segment count is
 * approaching its configured limit.
 */
boolean_t
vm_compressor_low_on_space(void)
{
	return vm_compressor_compressed_pages_nearing_limit() ||
	       vm_compressor_segments_nearing_limit();
}
515 
516 
/*
 * True when the compressor has hit a hard limit: either the compressed
 * page limit or the segment limit has been reached.
 */
boolean_t
vm_compressor_out_of_space(void)
{
#if CONFIG_FREEZE
	uint64_t incore_seg_count;
	uint32_t incore_compressed_pages;
	if (freezer_incore_cseg_acct) {
		/*
		 * Only in-core segments count; clamp the overflow-checked
		 * subtractions to zero in case the independently-updated
		 * counters are momentarily inconsistent.
		 */
		if (os_sub_overflow(c_segment_count, c_swappedout_count, &incore_seg_count)) {
			incore_seg_count = 0;
		}
		if (os_sub_overflow(incore_seg_count, c_swappedout_sparse_count, &incore_seg_count)) {
			incore_seg_count = 0;
		}
		incore_compressed_pages = os_atomic_load(&c_segment_pages_compressed_incore, relaxed);
	} else {
		incore_seg_count = os_atomic_load(&c_segment_count, relaxed);
		incore_compressed_pages = os_atomic_load(&c_segment_pages_compressed_incore, relaxed);
	}

	if ((incore_compressed_pages >= c_segment_pages_compressed_limit) ||
	    (incore_seg_count > c_segments_incore_limit)) {
		return TRUE;
	}
#else /* CONFIG_FREEZE */
	if ((c_segment_pages_compressed >= c_segment_pages_compressed_limit) ||
	    (c_segment_count >= c_segments_limit)) {
		return TRUE;
	}
#endif /* CONFIG_FREEZE */
	return FALSE;
}
548 
/*
 * Recompute the swapout target age and report whether the compressor is
 * thrashing (i.e. a non-zero target age survived the sanity check below).
 */
bool
vm_compressor_is_thrashing()
{
	compute_swapout_target_age();

	if (swapout_target_age) {
		c_segment_t     c_seg;

		lck_mtx_lock_spin_always(c_list_lock);

		if (!queue_empty(&c_age_list_head)) {
			c_seg = (c_segment_t) queue_first(&c_age_list_head);

			/*
			 * The age queue's head is the oldest segment; if even it
			 * is younger than the target age, nothing qualifies for
			 * swapout, so clear the target.
			 */
			if (c_seg->c_creation_ts > swapout_target_age) {
				swapout_target_age = 0;
			}
		}
		lck_mtx_unlock_always(c_list_lock);
	}

	return swapout_target_age != 0;
}
571 
572 
/*
 * Decide whether `task` should be throttled because of compressor space
 * pressure.  Returns 1 to throttle, 0 otherwise.  The kernel task is
 * never throttled.
 */
int
vm_wants_task_throttled(task_t task)
{
	ledger_amount_t compressed;
	if (task == kernel_task) {
		return 0;
	}

	if (VM_CONFIG_SWAP_IS_ACTIVE) {
		if ((vm_compressor_low_on_space() || HARD_THROTTLE_LIMIT_REACHED())) {
			/*
			 * While space is short, throttle any task responsible
			 * for more than 1/4 of all compressed pages.
			 */
			ledger_get_balance(task->ledger, task_ledgers.internal_compressed, &compressed);
			/* Ledger balance is in bytes; convert to the task's page size. */
			compressed >>= VM_MAP_PAGE_SHIFT(task->map);
			if ((unsigned int)compressed > (c_segment_pages_compressed / 4)) {
				return 1;
			}
		}
	}
	return 0;
}
592 
593 
594 #if DEVELOPMENT || DEBUG
595 /*
596  * On compressor/swap exhaustion, kill the largest process regardless of
597  * its chosen process policy.
598  */
599 TUNABLE(bool, kill_on_no_paging_space, "-kill_on_no_paging_space", false);
600 #endif /* DEVELOPMENT || DEBUG */
601 
602 #if CONFIG_JETSAM
603 boolean_t       memorystatus_kill_on_VM_compressor_space_shortage(boolean_t);
604 void            memorystatus_thread_wake(void);
605 extern uint32_t jetsam_kill_on_low_swap;
606 bool            memorystatus_disable_swap(void);
607 #if CONFIG_PHANTOM_CACHE
608 extern bool memorystatus_phantom_cache_pressure;
609 #endif /* CONFIG_PHANTOM_CACHE */
610 int             compressor_thrashing_induced_jetsam = 0;
611 int             filecache_thrashing_induced_jetsam = 0;
612 static boolean_t        vm_compressor_thrashing_detected = FALSE;
613 #else  /* CONFIG_JETSAM */
614 static uint32_t no_paging_space_action_in_progress = 0;
615 extern void memorystatus_send_low_swap_note(void);
616 #endif /* CONFIG_JETSAM */
617 
/*
 * React to running low on paging (compressor/swap) space.
 * Jetsam configs delegate to the memorystatus thread; otherwise run the
 * no-paging-space action at most once at a time and notify user space.
 */
static void
vm_compressor_take_paging_space_action(void)
{
#if CONFIG_JETSAM
	/*
	 * On systems with both swap and jetsam,
	 * just wake up the jetsam thread and have it handle the low swap condition
	 * by killing apps.
	 */
	if (jetsam_kill_on_low_swap) {
		memorystatus_thread_wake();
	}
#else /* CONFIG_JETSAM */
	/*
	 * Single-flight: the compare-and-swap on the in-progress flag
	 * guarantees only one thread runs the action at a time.
	 */
	if (no_paging_space_action_in_progress == 0) {
		if (OSCompareAndSwap(0, 1, (UInt32 *)&no_paging_space_action_in_progress)) {
			if (no_paging_space_action()) {
#if DEVELOPMENT || DEBUG
				if (kill_on_no_paging_space) {
					/*
					 * Since we are choosing to always kill a process, we don't need the
					 * "out of application memory" dialog box in this mode. And, hence we won't
					 * send the knote.
					 */
					no_paging_space_action_in_progress = 0;
					return;
				}
#endif /* DEVELOPMENT || DEBUG */
				memorystatus_send_low_swap_note();
			}

			no_paging_space_action_in_progress = 0;
		}
	}
#endif /* !CONFIG_JETSAM */
}
653 
654 
/*
 * Block decompressions: set the flag under the page-replacement
 * protection so decompressors observe it consistently.
 * Paired with vm_decompressor_unlock().
 */
void
vm_decompressor_lock(void)
{
	PAGE_REPLACEMENT_ALLOWED(TRUE);

	decompressions_blocked = TRUE;

	PAGE_REPLACEMENT_ALLOWED(FALSE);
}
664 
/*
 * Re-enable decompressions and wake any threads that blocked on
 * decompressions_blocked while vm_decompressor_lock() was in effect.
 */
void
vm_decompressor_unlock(void)
{
	PAGE_REPLACEMENT_ALLOWED(TRUE);

	decompressions_blocked = FALSE;

	PAGE_REPLACEMENT_ALLOWED(FALSE);

	thread_wakeup((event_t)&decompressions_blocked);
}
676 
/*
 * Copy one compressor slot's metadata into another, including whichever
 * optional integrity fields (hashes / popcount / codec) are compiled in.
 * NOTE(review): presumably used when compaction relocates a compressed
 * page's slot -- confirm at the call sites.
 */
static inline void
cslot_copy(c_slot_t cdst, c_slot_t csrc)
{
#if CHECKSUM_THE_DATA
	cdst->c_hash_data = csrc->c_hash_data;
#endif
#if CHECKSUM_THE_COMPRESSED_DATA
	cdst->c_hash_compressed_data = csrc->c_hash_compressed_data;
#endif
#if POPCOUNT_THE_COMPRESSED_DATA
	cdst->c_pop_cdata = csrc->c_pop_cdata;
#endif
	cdst->c_size = csrc->c_size;
	cdst->c_packed_ptr = csrc->c_packed_ptr;
#if defined(__arm64__)
	cdst->c_codec = csrc->c_codec;
#endif
}
695 
696 #if XNU_TARGET_OS_OSX
697 #define VM_COMPRESSOR_MAX_POOL_SIZE (192UL << 30)
698 #else
699 #define VM_COMPRESSOR_MAX_POOL_SIZE (0)
700 #endif
701 
702 static vm_map_size_t compressor_size;
703 static SECURITY_READ_ONLY_LATE(struct mach_vm_range) compressor_range;
704 vm_map_t compressor_map;
705 uint64_t compressor_pool_max_size;
706 uint64_t compressor_pool_size;
707 uint32_t compressor_pool_multiplier;
708 
709 #if DEVELOPMENT || DEBUG
710 /*
711  * Compressor segments are write-protected in development/debug
712  * kernels to help debug memory corruption.
713  * In cases where performance is a concern, this can be disabled
714  * via the boot-arg "-disable_cseg_write_protection".
715  */
716 boolean_t write_protect_c_segs = TRUE;
717 int vm_compressor_test_seg_wp;
718 uint32_t vm_ktrace_enabled;
719 #endif /* DEVELOPMENT || DEBUG */
720 
721 #if (XNU_TARGET_OS_OSX && __arm64__)
722 
723 #include <IOKit/IOPlatformExpert.h>
724 #include <sys/random.h>
725 
726 static const char *csegbufsizeExperimentProperty = "_csegbufsz_experiment";
727 static thread_call_t csegbufsz_experiment_thread_call;
728 
729 extern boolean_t IOServiceWaitForMatchingResource(const char * property, uint64_t timeout);
/*
 * Thread-call target: once NVRAM is writable, remove the segment-buffer
 * experiment property and free the (single-use) thread call that
 * scheduled us.
 */
static void
erase_csegbufsz_experiment_property(__unused void *param0, __unused void *param1)
{
	// Wait for NVRAM to be writable
	if (!IOServiceWaitForMatchingResource("IONVRAM", UINT64_MAX)) {
		printf("csegbufsz_experiment_property: Failed to wait for IONVRAM.");
	}

	/* Best-effort removal: failure is only logged, not retried. */
	if (!PERemoveNVRAMProperty(csegbufsizeExperimentProperty)) {
		printf("csegbufsize_experiment_property: Failed to remove %s from NVRAM.", csegbufsizeExperimentProperty);
	}
	thread_call_free(csegbufsz_experiment_thread_call);
}
743 
/*
 * Schedule erase_csegbufsz_experiment_property on a low-priority thread
 * call so early boot is not blocked waiting for NVRAM.
 */
static void
erase_csegbufsz_experiment_property_async()
{
	csegbufsz_experiment_thread_call = thread_call_allocate_with_priority(
		erase_csegbufsz_experiment_property,
		NULL,
		THREAD_CALL_PRIORITY_LOW
		);
	if (csegbufsz_experiment_thread_call == NULL) {
		/* Allocation failure: log and skip; the property simply persists. */
		printf("csegbufsize_experiment_property: Unable to allocate thread call.");
	} else {
		thread_call_enter(csegbufsz_experiment_thread_call);
	}
}
758 
/*
 * Early-boot startup hook: if the segment-buffer-size experiment property
 * is present in NVRAM, kick off its asynchronous removal.
 */
static void
cleanup_csegbufsz_experiment(__unused void *arg0)
{
	char nvram = 0;
	unsigned int len = sizeof(nvram);
	if (PEReadNVRAMProperty(csegbufsizeExperimentProperty, &nvram, &len)) {
		erase_csegbufsz_experiment_property_async();
	}
}
768 
769 STARTUP_ARG(EARLY_BOOT, STARTUP_RANK_FIRST, cleanup_csegbufsz_experiment, NULL);
770 #endif /* XNU_TARGET_OS_OSX && __arm64__ */
771 
772 #if CONFIG_JETSAM
773 extern unsigned int memorystatus_swap_all_apps;
774 #endif /* CONFIG_JETSAM */
775 
776 TUNABLE_DT(uint64_t, swap_vol_min_capacity, "/defaults", "kern.swap_min_capacity", "kern.swap_min_capacity", 0, TUNABLE_DT_NONE);
777 
778 static void
vm_compressor_set_size(void)779 vm_compressor_set_size(void)
780 {
781 	/*
782 	 * Note that this function may be called multiple times on systems with app swap
783 	 * because the value of vm_swap_get_max_configured_space() and memorystatus_swap_all_apps
784 	 * can change based the size of the swap volume. On these systems, we'll call
785 	 * this function once early in boot to reserve the maximum amount of VA required
786 	 * for the compressor submap and then one more time in vm_compressor_init after
787 	 * determining the swap volume size. We must not return a larger value the second
788 	 * time around.
789 	 */
790 	vm_size_t       c_segments_arr_size = 0;
791 	struct c_slot_mapping tmp_slot_ptr;
792 
793 	/* The segment size can be overwritten by a boot-arg */
794 	if (!PE_parse_boot_argn("vm_compressor_segment_buffer_size", &c_seg_bufsize, sizeof(c_seg_bufsize))) {
795 #if CONFIG_JETSAM
796 		if (memorystatus_swap_all_apps) {
797 			c_seg_bufsize = C_SEG_BUFSIZE_ARM_SWAP;
798 		} else {
799 			c_seg_bufsize = C_SEG_BUFSIZE_DEFAULT;
800 		}
801 #else
802 		c_seg_bufsize = C_SEG_BUFSIZE_DEFAULT;
803 #endif /* CONFIG_JETSAM */
804 	}
805 
806 	vm_compressor_swap_init_swap_file_limit();
807 	if (vm_compression_limit) {
808 		compressor_pool_size = ptoa_64(vm_compression_limit);
809 	}
810 
811 	compressor_pool_max_size = C_SEG_MAX_LIMIT;
812 	compressor_pool_max_size *= c_seg_bufsize;
813 
814 #if XNU_TARGET_OS_OSX
815 
816 	if (vm_compression_limit == 0) {
817 		if (max_mem <= (4ULL * 1024ULL * 1024ULL * 1024ULL)) {
818 			compressor_pool_size = 16ULL * max_mem;
819 		} else if (max_mem <= (8ULL * 1024ULL * 1024ULL * 1024ULL)) {
820 			compressor_pool_size = 8ULL * max_mem;
821 		} else if (max_mem <= (32ULL * 1024ULL * 1024ULL * 1024ULL)) {
822 			compressor_pool_size = 4ULL * max_mem;
823 		} else {
824 			compressor_pool_size = 2ULL * max_mem;
825 		}
826 	}
827 	/*
828 	 * Cap the compressor pool size to a max of 192G
829 	 */
830 	if (compressor_pool_size > VM_COMPRESSOR_MAX_POOL_SIZE) {
831 		compressor_pool_size = VM_COMPRESSOR_MAX_POOL_SIZE;
832 	}
833 	if (max_mem <= (8ULL * 1024ULL * 1024ULL * 1024ULL)) {
834 		compressor_pool_multiplier = 1;
835 	} else if (max_mem <= (32ULL * 1024ULL * 1024ULL * 1024ULL)) {
836 		compressor_pool_multiplier = 2;
837 	} else {
838 		compressor_pool_multiplier = 4;
839 	}
840 
841 #elif defined(__arm64__) && defined(XNU_TARGET_OS_WATCH)
842 
843 	/*
844 	 * On M9 watches the compressor can become big and can lead to
845 	 * churn in workingset resulting in audio drops. Setting a cap
846 	 * on the compressor size favors reclaiming unused memory
847 	 * sitting in idle band via jetsams
848 	 */
849 
850 #define COMPRESSOR_CAP_PERCENTAGE        37ULL
851 
852 	if (compressor_pool_max_size > max_mem) {
853 		compressor_pool_max_size = max_mem;
854 	}
855 
856 	if (vm_compression_limit == 0) {
857 		compressor_pool_size = (max_mem * COMPRESSOR_CAP_PERCENTAGE) / 100ULL;
858 	}
859 	compressor_pool_multiplier = 1;
860 
861 #else
862 
863 	if (compressor_pool_max_size > max_mem) {
864 		compressor_pool_max_size = max_mem;
865 	}
866 
867 	if (vm_compression_limit == 0) {
868 		compressor_pool_size = max_mem;
869 	}
870 	compressor_pool_multiplier = 1;
871 #endif
872 	if (compressor_pool_size > compressor_pool_max_size) {
873 		compressor_pool_size = compressor_pool_max_size;
874 	}
875 
876 	c_seg_max_pages = (c_seg_bufsize / PAGE_SIZE);
877 	c_seg_slot_var_array_min_len = c_seg_max_pages;
878 
879 #if !defined(__x86_64__)
880 	c_seg_off_limit = (C_SEG_BYTES_TO_OFFSET((c_seg_bufsize - 512)));
881 	c_seg_allocsize = (c_seg_bufsize + PAGE_SIZE);
882 #else
883 	c_seg_off_limit = (C_SEG_BYTES_TO_OFFSET((c_seg_bufsize - 128)));
884 	c_seg_allocsize = c_seg_bufsize;
885 #endif /* !defined(__x86_64__) */
886 
887 	c_segments_limit = (uint32_t)(compressor_pool_size / (vm_size_t)(c_seg_allocsize));
888 	tmp_slot_ptr.s_cseg = c_segments_limit;
889 	/* Panic on internal configs*/
890 	assertf((tmp_slot_ptr.s_cseg == c_segments_limit), "vm_compressor_init: overflowed s_cseg field in c_slot_mapping with c_segno: %d", c_segments_limit);
891 
892 	if (tmp_slot_ptr.s_cseg != c_segments_limit) {
893 		tmp_slot_ptr.s_cseg = -1;
894 		c_segments_limit = tmp_slot_ptr.s_cseg - 1; /*limited by segment idx bits in c_slot_mapping*/
895 		compressor_pool_size = (c_segments_limit * (vm_size_t)(c_seg_allocsize));
896 	}
897 
898 	c_segments_nearing_limit = (uint32_t)(((uint64_t)c_segments_limit * 98ULL) / 100ULL);
899 
900 	c_segment_pages_compressed_limit = (c_segments_limit * (c_seg_bufsize / PAGE_SIZE) * compressor_pool_multiplier);
901 
902 	if (c_segment_pages_compressed_limit < (uint32_t)(max_mem / PAGE_SIZE)) {
903 #if defined(XNU_TARGET_OS_WATCH)
904 		c_segment_pages_compressed_limit = (uint32_t)(max_mem / PAGE_SIZE);
905 #else
906 		if (!vm_compression_limit) {
907 			c_segment_pages_compressed_limit = (uint32_t)(max_mem / PAGE_SIZE);
908 		}
909 #endif
910 	}
911 
912 	c_segment_pages_compressed_nearing_limit = (uint32_t)(((uint64_t)c_segment_pages_compressed_limit * 98ULL) / 100ULL);
913 
914 #if CONFIG_FREEZE
915 	/*
916 	 * Our in-core limits are based on the size of the compressor pool.
917 	 * The c_segments_nearing_limit is also based on the compressor pool
918 	 * size and calculated above.
919 	 */
920 	c_segments_incore_limit = c_segments_limit;
921 
922 	if (freezer_incore_cseg_acct) {
923 		/*
924 		 * Add enough segments to track all frozen c_segs that can be stored in swap.
925 		 */
926 		c_segments_limit += (uint32_t)(vm_swap_get_max_configured_space() / (vm_size_t)(c_seg_allocsize));
927 		tmp_slot_ptr.s_cseg = c_segments_limit;
928 		/* Panic on internal configs*/
929 		assertf((tmp_slot_ptr.s_cseg == c_segments_limit), "vm_compressor_init: freezer reserve overflowed s_cseg field in c_slot_mapping with c_segno: %d", c_segments_limit);
930 	}
931 #endif
932 	/*
933 	 * Submap needs space for:
934 	 * - c_segments
935 	 * - c_buffers
936 	 * - swap reclaimations -- c_seg_bufsize
937 	 */
938 	c_segments_arr_size = vm_map_round_page((sizeof(union c_segu) * c_segments_limit), VM_MAP_PAGE_MASK(kernel_map));
939 	c_buffers_size = vm_map_round_page(((vm_size_t)c_seg_allocsize * (vm_size_t)c_segments_limit), VM_MAP_PAGE_MASK(kernel_map));
940 
941 	compressor_size = c_segments_arr_size + c_buffers_size + c_seg_bufsize;
942 
943 #if RECORD_THE_COMPRESSED_DATA
944 	c_compressed_record_sbuf_size = (vm_size_t)c_seg_allocsize + (PAGE_SIZE * 2);
945 	compressor_size += c_compressed_record_sbuf_size;
946 #endif /* RECORD_THE_COMPRESSED_DATA */
947 }
/* Compute the compressor pool/segment sizing once the KMEM subsystem is up. */
STARTUP(KMEM, STARTUP_RANK_FIRST, vm_compressor_set_size);

/* Reserve a kernel VA range for the compressor submap, sized by the value
 * computed in vm_compressor_set_size() above. */
KMEM_RANGE_REGISTER_DYNAMIC(compressor, &compressor_range, ^() {
	return compressor_size;
});
953 
954 bool
osenvironment_is_diagnostics(void)955 osenvironment_is_diagnostics(void)
956 {
957 	DTEntry chosen;
958 	const char *osenvironment;
959 	unsigned int size;
960 	if (kSuccess == SecureDTLookupEntry(0, "/chosen", &chosen)) {
961 		if (kSuccess == SecureDTGetProperty(chosen, "osenvironment", (void const **) &osenvironment, &size)) {
962 			return strcmp(osenvironment, "diagnostics") == 0;
963 		}
964 	}
965 	return false;
966 }
967 
/*
 * One-time initialization of the VM compressor.
 *
 * Re-validates/adjusts the sizing computed at startup by
 * vm_compressor_set_size() (on CONFIG_JETSAM app-swap devices, after the
 * swap volume capacity is known), parses validation/debug boot-args,
 * initializes all c_seg state queues, creates the compressor submap and
 * its VA-only backing arrays, sizes the compressor_segment zone to
 * minimize zalloc fragmentation, allocates the per-CPU and KDP scratch
 * buffers, and starts the swap trigger and internal pageout threads.
 */
void
vm_compressor_init(void)
{
	thread_t        thread;
#if RECORD_THE_COMPRESSED_DATA
	vm_size_t       c_compressed_record_sbuf_size = 0;
#endif /* RECORD_THE_COMPRESSED_DATA */

#if DEVELOPMENT || DEBUG || CONFIG_FREEZE
	char bootarg_name[32];
#endif /* DEVELOPMENT || DEBUG || CONFIG_FREEZE */
	/* Remember the early-boot VA reservation so a resize can't exceed it. */
	__unused uint64_t early_boot_compressor_size = compressor_size;

#if CONFIG_JETSAM
	if (memorystatus_swap_all_apps && osenvironment_is_diagnostics()) {
		printf("osenvironment == \"diagnostics\". Disabling app swap.\n");
		memorystatus_disable_swap();
	}

	if (memorystatus_swap_all_apps) {
		/*
		 * App swap is disabled on devices with small NANDs.
		 * Now that we're no longer in early boot, we can get
		 * the NAND size and re-run vm_compressor_set_size.
		 */
		int error = vm_swap_vol_get_capacity(SWAP_VOLUME_NAME, &vm_swap_volume_capacity);
#if DEVELOPMENT || DEBUG
		if (error != 0) {
			panic("vm_compressor_init: Unable to get swap volume capacity. error=%d\n", error);
		}
#else
		if (error != 0) {
			os_log_with_startup_serial(OS_LOG_DEFAULT, "vm_compressor_init: Unable to get swap volume capacity. error=%d\n", error);
		}
#endif /* DEVELOPMENT || DEBUG */
		if (vm_swap_volume_capacity < swap_vol_min_capacity) {
			memorystatus_disable_swap();
		}
		/*
		 * Resize the compressor and swap now that we know the capacity
		 * of the swap volume.
		 */
		vm_compressor_set_size();
		/*
		 * We reserved a chunk of VA early in boot for the compressor submap.
		 * We can't allocate more than that.
		 */
		assert(compressor_size <= early_boot_compressor_size);
	}
#endif /* CONFIG_JETSAM */

#if DEVELOPMENT || DEBUG
	if (PE_parse_boot_argn("-disable_cseg_write_protection", bootarg_name, sizeof(bootarg_name))) {
		write_protect_c_segs = FALSE;
	}

	/* Compressed-data validation defaults on for internal builds, off on watches. */
	int vmcval = 1;
#if defined(XNU_TARGET_OS_WATCH)
	vmcval = 0;
#endif /* XNU_TARGET_OS_WATCH */
	PE_parse_boot_argn("vm_compressor_validation", &vmcval, sizeof(vmcval));

	if (kern_feature_override(KF_COMPRSV_OVRD)) {
		vmcval = 0;
	}

	if (vmcval == 0) {
		/* Disable every validation mechanism compiled into this build. */
#if POPCOUNT_THE_COMPRESSED_DATA
		popcount_c_segs = FALSE;
#endif
#if CHECKSUM_THE_DATA || CHECKSUM_THE_COMPRESSED_DATA
		checksum_c_segs = FALSE;
#endif
#if VALIDATE_C_SEGMENTS
		validate_c_segs = FALSE;
#endif
		write_protect_c_segs = FALSE;
	}
#endif /* DEVELOPMENT || DEBUG */

#if CONFIG_FREEZE
	if (PE_parse_boot_argn("-disable_freezer_cseg_acct", bootarg_name, sizeof(bootarg_name))) {
		freezer_incore_cseg_acct = FALSE;
	}
#endif /* CONFIG_FREEZE */

	assert((C_SEGMENTS_PER_PAGE * sizeof(union c_segu)) == PAGE_SIZE);

	/*
	 * Compaction/throttle thresholds: expressed as divisors of the pool
	 * size; smaller-memory configs use more aggressive (smaller) divisors.
	 */
#if !XNU_TARGET_OS_OSX
	vm_compressor_minorcompact_threshold_divisor = 20;
	vm_compressor_majorcompact_threshold_divisor = 30;
	vm_compressor_unthrottle_threshold_divisor = 40;
	vm_compressor_catchup_threshold_divisor = 60;
#else /* !XNU_TARGET_OS_OSX */
	if (max_mem <= (3ULL * 1024ULL * 1024ULL * 1024ULL)) {
		vm_compressor_minorcompact_threshold_divisor = 11;
		vm_compressor_majorcompact_threshold_divisor = 13;
		vm_compressor_unthrottle_threshold_divisor = 20;
		vm_compressor_catchup_threshold_divisor = 35;
	} else {
		vm_compressor_minorcompact_threshold_divisor = 20;
		vm_compressor_majorcompact_threshold_divisor = 25;
		vm_compressor_unthrottle_threshold_divisor = 35;
		vm_compressor_catchup_threshold_divisor = 50;
	}
#endif /* !XNU_TARGET_OS_OSX */

	/* Initialize every c_seg state queue (see c_seg_switch_state()). */
	queue_init(&c_bad_list_head);
	queue_init(&c_age_list_head);
	queue_init(&c_minor_list_head);
	queue_init(&c_major_list_head);
	queue_init(&c_filling_list_head);
	queue_init(&c_early_swapout_list_head);
	queue_init(&c_regular_swapout_list_head);
	queue_init(&c_late_swapout_list_head);
	queue_init(&c_swapio_list_head);
	queue_init(&c_early_swappedin_list_head);
	queue_init(&c_regular_swappedin_list_head);
	queue_init(&c_late_swappedin_list_head);
	queue_init(&c_swappedout_list_head);
	queue_init(&c_swappedout_sparse_list_head);

	c_free_segno_head = -1;
	c_segments_available = 0;

	/* Create the compressor submap at the VA range reserved early in boot. */
	compressor_map = kmem_suballoc(kernel_map, &compressor_range.min_address,
	    compressor_size, VM_MAP_CREATE_NEVER_FAULTS,
	    VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE, KMS_NOFAIL | KMS_PERMANENT,
	    VM_KERN_MEMORY_COMPRESSOR).kmr_submap;

	/* VA-only allocations: pages are populated on demand later. */
	kmem_alloc(compressor_map, (vm_offset_t *)(&c_segments),
	    (sizeof(union c_segu) * c_segments_limit),
	    KMA_NOFAIL | KMA_KOBJECT | KMA_VAONLY | KMA_PERMANENT,
	    VM_KERN_MEMORY_COMPRESSOR);
	kmem_alloc(compressor_map, &c_buffers, c_buffers_size,
	    KMA_NOFAIL | KMA_COMPRESSOR | KMA_VAONLY | KMA_PERMANENT,
	    VM_KERN_MEMORY_COMPRESSOR);

#if DEVELOPMENT || DEBUG
	if (hvg_is_hcall_available(HVG_HCALL_SET_COREDUMP_DATA)) {
		hvg_hcall_set_coredump_data();
	}
#endif

	/*
	 * Pick a good size that will minimize fragmentation in zalloc
	 * by minimizing the fragmentation in a 16k run.
	 *
	 * c_seg_slot_var_array_min_len is larger on 4k systems than 16k ones,
	 * making the fragmentation in a 4k page terrible. Using 16k for all
	 * systems matches zalloc() and will minimize fragmentation.
	 */
	uint32_t c_segment_size = sizeof(struct c_segment) + (c_seg_slot_var_array_min_len * sizeof(struct c_slot));
	uint32_t cnt  = (16 << 10) / c_segment_size;
	uint32_t frag = (16 << 10) % c_segment_size;

	c_seg_fixed_array_len = c_seg_slot_var_array_min_len;

	/* Grow the fixed slot array while doing so reduces the leftover space. */
	while (cnt * sizeof(struct c_slot) < frag) {
		c_segment_size += sizeof(struct c_slot);
		c_seg_fixed_array_len++;
		frag -= cnt * sizeof(struct c_slot);
	}

	compressor_segment_zone = zone_create("compressor_segment",
	    c_segment_size, ZC_PGZ_USE_GUARDS | ZC_NOENCRYPT | ZC_ZFREE_CLEARMEM);

	c_segments_busy = FALSE;

	c_segments_next_page = (caddr_t)c_segments;
	vm_compressor_algorithm_init();

	{
		host_basic_info_data_t hinfo;
		mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
		size_t bufsize;
		char *buf;

#define BSD_HOST 1
		host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);

		compressor_cpus = hinfo.max_cpus;

		/*
		 * Single allocation carved into: one page for the KDP
		 * decompressed-page buffer, per-CPU decode scratch areas, a KDP
		 * decode scratch area, and (optionally) freezer encode scratch
		 * and the compressed-data record buffer.
		 */
		bufsize = PAGE_SIZE;
		bufsize += compressor_cpus * vm_compressor_get_decode_scratch_size();
		/* For the KDP path */
		bufsize += vm_compressor_get_decode_scratch_size();
#if CONFIG_FREEZE
		bufsize += vm_compressor_get_encode_scratch_size();
#endif
#if RECORD_THE_COMPRESSED_DATA
		bufsize += c_compressed_record_sbuf_size;
#endif

		kmem_alloc(kernel_map, (vm_offset_t *)&buf, bufsize,
		    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT,
		    VM_KERN_MEMORY_COMPRESSOR);

		/*
		 * kdp_compressor_decompressed_page must be page aligned because we access
		 * it through the physical aperture by page number.
		 */
		kdp_compressor_decompressed_page = buf;
		kdp_compressor_decompressed_page_paddr = kvtophys((vm_offset_t)kdp_compressor_decompressed_page);
		kdp_compressor_decompressed_page_ppnum = (ppnum_t) atop(kdp_compressor_decompressed_page_paddr);
		buf += PAGE_SIZE;
		bufsize -= PAGE_SIZE;

		compressor_scratch_bufs = buf;
		buf += compressor_cpus * vm_compressor_get_decode_scratch_size();
		bufsize -= compressor_cpus * vm_compressor_get_decode_scratch_size();

		kdp_compressor_scratch_buf = buf;
		buf += vm_compressor_get_decode_scratch_size();
		bufsize -= vm_compressor_get_decode_scratch_size();

#if CONFIG_FREEZE
		freezer_context_global.freezer_ctx_compressor_scratch_buf = buf;
		buf += vm_compressor_get_encode_scratch_size();
		bufsize -= vm_compressor_get_encode_scratch_size();
#endif

#if RECORD_THE_COMPRESSED_DATA
		c_compressed_record_sbuf = buf;
		c_compressed_record_cptr = buf;
		c_compressed_record_ebuf = c_compressed_record_sbuf + c_compressed_record_sbuf_size;
		buf += c_compressed_record_sbuf_size;
		bufsize -= c_compressed_record_sbuf_size;
#endif
		/* Every byte of the allocation must have been handed out. */
		assert(bufsize == 0);
	}

	if (kernel_thread_start_priority((thread_continue_t)vm_compressor_swap_trigger_thread, NULL,
	    BASEPRI_VM, &thread) != KERN_SUCCESS) {
		panic("vm_compressor_swap_trigger_thread: create failed");
	}
	thread_deallocate(thread);

	if (vm_pageout_internal_start() != KERN_SUCCESS) {
		panic("vm_compressor_init: Failed to start the internal pageout thread.");
	}
	if (VM_CONFIG_SWAP_IS_PRESENT) {
		vm_compressor_swap_init();
	}

	if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
		vm_compressor_is_active = 1;
	}

#if CONFIG_FREEZE
	memorystatus_freeze_enabled = TRUE;
#endif /* CONFIG_FREEZE */

	vm_compressor_available = 1;

	vm_page_reactivate_all_throttled();

	bzero(&vmcs_stats, sizeof(struct vm_compressor_swapper_stats));
}
1227 
1228 
#if VALIDATE_C_SEGMENTS

/*
 * Debug-only consistency check of a c_segment's slot bookkeeping.
 *
 * Verifies that the first-empty-slot hint really is empty, that the sum of
 * rounded slot sizes matches c_seg->c_bytes_used, that c_bytes_used never
 * exceeds c_nextoffset, and (if must_be_compact) that the two are exactly
 * equal. With CHECKSUM/POPCOUNT builds it also re-verifies each slot's
 * compressed data. Panics on any mismatch; no-op unless validate_c_segs.
 */
static void
c_seg_validate(c_segment_t c_seg, boolean_t must_be_compact)
{
	uint16_t        c_indx;
	int32_t         bytes_used;
	uint32_t        c_rounded_size;
	uint32_t        c_size;
	c_slot_t        cs;

	if (__probable(validate_c_segs == FALSE)) {
		return;
	}
	if (c_seg->c_firstemptyslot < c_seg->c_nextslot) {
		/* The first-empty-slot hint must reference a real, zero-size slot. */
		c_indx = c_seg->c_firstemptyslot;
		cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);

		if (cs == NULL) {
			panic("c_seg_validate:  no slot backing c_firstemptyslot");
		}

		if (cs->c_size) {
			panic("c_seg_validate:  c_firstemptyslot has non-zero size (%d)", cs->c_size);
		}
	}
	bytes_used = 0;

	/* Walk every allocated slot and re-total the segment's usage. */
	for (c_indx = 0; c_indx < c_seg->c_nextslot; c_indx++) {
		cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);

		c_size = UNPACK_C_SIZE(cs);

		/* Slot sizes are accounted rounded up to the offset alignment. */
		c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;

		bytes_used += c_rounded_size;

#if CHECKSUM_THE_COMPRESSED_DATA
		unsigned csvhash;
		if (c_size && cs->c_hash_compressed_data != (csvhash = vmc_hash((char *)&c_seg->c_store.c_buffer[cs->c_offset], c_size))) {
			addr64_t csvphys = kvtophys((vm_offset_t)&c_seg->c_store.c_buffer[cs->c_offset]);
			panic("Compressed data doesn't match original %p phys: 0x%llx %d %p %d %d 0x%x 0x%x", c_seg, csvphys, cs->c_offset, cs, c_indx, c_size, cs->c_hash_compressed_data, csvhash);
		}
#endif
#if POPCOUNT_THE_COMPRESSED_DATA
		unsigned csvpop;
		if (c_size) {
			uintptr_t csvaddr = (uintptr_t) &c_seg->c_store.c_buffer[cs->c_offset];
			if (cs->c_pop_cdata != (csvpop = vmc_pop(csvaddr, c_size))) {
				panic("Compressed data popcount doesn't match original, bit distance: %d %p (phys: %p) %p %p 0x%llx 0x%x 0x%x 0x%x", (csvpop - cs->c_pop_cdata), (void *)csvaddr, (void *) kvtophys(csvaddr), c_seg, cs, (uint64_t)cs->c_offset, c_size, csvpop, cs->c_pop_cdata);
			}
		}
#endif
	}

	if (bytes_used != c_seg->c_bytes_used) {
		panic("c_seg_validate: bytes_used mismatch - found %d, segment has %d", bytes_used, c_seg->c_bytes_used);
	}

	if (c_seg->c_bytes_used > C_SEG_OFFSET_TO_BYTES((int32_t)c_seg->c_nextoffset)) {
		panic("c_seg_validate: c_bytes_used > c_nextoffset - c_nextoffset = %d,  c_bytes_used = %d",
		    (int32_t)C_SEG_OFFSET_TO_BYTES((int32_t)c_seg->c_nextoffset), c_seg->c_bytes_used);
	}

	if (must_be_compact) {
		/* A compacted segment has no holes: usage equals the next offset. */
		if (c_seg->c_bytes_used != C_SEG_OFFSET_TO_BYTES((int32_t)c_seg->c_nextoffset)) {
			panic("c_seg_validate: c_bytes_used doesn't match c_nextoffset - c_nextoffset = %d,  c_bytes_used = %d",
			    (int32_t)C_SEG_OFFSET_TO_BYTES((int32_t)c_seg->c_nextoffset), c_seg->c_bytes_used);
		}
	}
}

#endif
1302 
1303 
/*
 * Put c_seg on the delayed minor compaction queue (unless it is already
 * there, on disk / the swapout queue, or holds donated pages).
 *
 * Called with c_seg->c_lock held; if c_list_lock_held is FALSE the
 * c_list_lock is acquired here. Returns with c_seg->c_lock still held
 * and the c_list_lock in the same state as on entry.
 */
void
c_seg_need_delayed_compaction(c_segment_t c_seg, boolean_t c_list_lock_held)
{
	boolean_t       clear_busy = FALSE;

	if (c_list_lock_held == FALSE) {
		if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
			/*
			 * Lock order is c_list_lock then c_seg->c_lock and we hold
			 * the latter: mark the segment busy so it can't disappear,
			 * drop its lock, and reacquire both in the correct order.
			 */
			C_SEG_BUSY(c_seg);

			lck_mtx_unlock_always(&c_seg->c_lock);
			lck_mtx_lock_spin_always(c_list_lock);
			lck_mtx_lock_spin_always(&c_seg->c_lock);

			clear_busy = TRUE;
		}
	}
	assert(c_seg->c_state != C_IS_FILLING);

	if (!c_seg->c_on_minorcompact_q && !(C_SEG_IS_ON_DISK_OR_SOQ(c_seg)) && !c_seg->c_has_donated_pages) {
		queue_enter(&c_minor_list_head, c_seg, c_segment_t, c_list);
		c_seg->c_on_minorcompact_q = 1;
		c_minor_count++;
	}
	if (c_list_lock_held == FALSE) {
		lck_mtx_unlock_always(c_list_lock);
	}

	if (clear_busy == TRUE) {
		C_SEG_WAKEUP_DONE(c_seg);
	}
}
1335 
1336 
/* Running count of segments moved onto the sparse swapped-out queue. */
unsigned int c_seg_moved_to_sparse_list = 0;

/*
 * Move c_seg to the C_ON_SWAPPEDOUTSPARSE_Q state/queue.
 *
 * Called with c_seg->c_lock held; acquires the c_list_lock (busying the
 * segment and re-taking both locks in c_list_lock -> c_seg->c_lock order
 * if the try-lock fails). Returns with c_seg->c_lock still held.
 */
void
c_seg_move_to_sparse_list(c_segment_t c_seg)
{
	boolean_t       clear_busy = FALSE;

	if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
		C_SEG_BUSY(c_seg);

		lck_mtx_unlock_always(&c_seg->c_lock);
		lck_mtx_lock_spin_always(c_list_lock);
		lck_mtx_lock_spin_always(&c_seg->c_lock);

		clear_busy = TRUE;
	}
	c_seg_switch_state(c_seg, C_ON_SWAPPEDOUTSPARSE_Q, FALSE);

	c_seg_moved_to_sparse_list++;

	lck_mtx_unlock_always(c_list_lock);

	if (clear_busy == TRUE) {
		C_SEG_WAKEUP_DONE(c_seg);
	}
}
1363 
1364 
1365 void
c_seg_insert_into_q(queue_head_t * qhead,c_segment_t c_seg)1366 c_seg_insert_into_q(queue_head_t *qhead, c_segment_t c_seg)
1367 {
1368 	c_segment_t c_seg_next;
1369 
1370 	if (queue_empty(qhead)) {
1371 		queue_enter(qhead, c_seg, c_segment_t, c_age_list);
1372 	} else {
1373 		c_seg_next = (c_segment_t)queue_first(qhead);
1374 
1375 		while (TRUE) {
1376 			if (c_seg->c_generation_id < c_seg_next->c_generation_id) {
1377 				queue_insert_before(qhead, c_seg, c_seg_next, c_segment_t, c_age_list);
1378 				break;
1379 			}
1380 			c_seg_next = (c_segment_t) queue_next(&c_seg_next->c_age_list);
1381 
1382 			if (queue_end(qhead, (queue_entry_t) c_seg_next)) {
1383 				queue_enter(qhead, c_seg, c_segment_t, c_age_list);
1384 				break;
1385 			}
1386 		}
1387 	}
1388 }
1389 
1390 
/* Outcome counters for c_seg_try_minor_compaction_and_unlock(). */
int try_minor_compaction_failed = 0;
int try_minor_compaction_succeeded = 0;

/*
 * Opportunistically minor-compact a segment that is on the delayed minor
 * compaction queue. Called with c_seg->c_lock held; always returns with
 * that lock dropped.
 */
void
c_seg_try_minor_compaction_and_unlock(c_segment_t c_seg)
{
	assert(c_seg->c_on_minorcompact_q);
	/*
	 * c_seg is currently on the delayed minor compaction
	 * queue and we have c_seg locked... if we can get the
	 * c_list_lock w/o blocking (if we blocked we could deadlock
	 * because the lock order is c_list_lock then c_seg's lock)
	 * we'll pull it from the delayed list and free it directly
	 */
	if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
		/*
		 * c_list_lock is held, we need to bail
		 */
		try_minor_compaction_failed++;

		lck_mtx_unlock_always(&c_seg->c_lock);
	} else {
		try_minor_compaction_succeeded++;

		C_SEG_BUSY(c_seg);
		/* Drops both locks (clear_busy=TRUE, need_list_lock=FALSE). */
		c_seg_do_minor_compaction_and_unlock(c_seg, TRUE, FALSE, FALSE);
	}
}
1419 
1420 
/*
 * Remove c_seg from the delayed minor compaction queue and minor-compact it.
 *
 * Entered with both the c_list_lock and c_seg->c_lock held and c_seg busy.
 * The c_list_lock is dropped (and reacquired only if need_list_lock) and
 * c_seg->c_lock is released by c_seg_minor_compaction_and_unlock().
 * If clear_busy is TRUE the busy bit is cleared on the way out. With
 * disallow_page_replacement, page replacement is disabled around the
 * compaction. Returns the number of segments freed (0 if the segment was
 * no longer on the minor compaction queue).
 */
int
c_seg_do_minor_compaction_and_unlock(c_segment_t c_seg, boolean_t clear_busy, boolean_t need_list_lock, boolean_t disallow_page_replacement)
{
	int     c_seg_freed;

	assert(c_seg->c_busy);
	assert(!C_SEG_IS_ON_DISK_OR_SOQ(c_seg));

	/*
	 * check for the case that can occur when we are not swapping
	 * and this segment has been major compacted in the past
	 * and moved to the majorcompact q to remove it from further
	 * consideration... if the occupancy falls too low we need
	 * to put it back on the age_q so that it will be considered
	 * in the next major compaction sweep... if we don't do this
	 * we will eventually run into the c_segments_limit
	 */
	if (c_seg->c_state == C_ON_MAJORCOMPACT_Q && C_SEG_SHOULD_MAJORCOMPACT_NOW(c_seg)) {
		c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
	}
	if (!c_seg->c_on_minorcompact_q) {
		/* Segment already left the minor compaction queue: nothing to do. */
		if (clear_busy == TRUE) {
			C_SEG_WAKEUP_DONE(c_seg);
		}

		lck_mtx_unlock_always(&c_seg->c_lock);

		return 0;
	}
	queue_remove(&c_minor_list_head, c_seg, c_segment_t, c_list);
	c_seg->c_on_minorcompact_q = 0;
	c_minor_count--;

	lck_mtx_unlock_always(c_list_lock);

	if (disallow_page_replacement == TRUE) {
		/* Must drop the segment lock before taking the page-replacement gate. */
		lck_mtx_unlock_always(&c_seg->c_lock);

		PAGE_REPLACEMENT_DISALLOWED(TRUE);

		lck_mtx_lock_spin_always(&c_seg->c_lock);
	}
	c_seg_freed = c_seg_minor_compaction_and_unlock(c_seg, clear_busy);

	if (disallow_page_replacement == TRUE) {
		PAGE_REPLACEMENT_DISALLOWED(FALSE);
	}

	if (need_list_lock == TRUE) {
		lck_mtx_lock_spin_always(c_list_lock);
	}

	return c_seg_freed;
}
1475 
1476 void
kdp_compressor_busy_find_owner(event64_t wait_event,thread_waitinfo_t * waitinfo)1477 kdp_compressor_busy_find_owner(event64_t wait_event, thread_waitinfo_t *waitinfo)
1478 {
1479 	c_segment_t c_seg = (c_segment_t) wait_event;
1480 
1481 	waitinfo->owner = thread_tid(c_seg->c_busy_for_thread);
1482 	waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(c_seg);
1483 }
1484 
#if DEVELOPMENT || DEBUG
/*
 * Test hook: block the calling thread on a stack-local fake c_segment so
 * the kdp "busy compressor segment" wait-info path can be exercised.
 * The thread stays blocked (local c_seg stays valid) until
 * do_cseg_unwedge_thread() wakes it.
 */
int
do_cseg_wedge_thread(void)
{
	struct c_segment c_seg;
	c_seg.c_busy_for_thread = current_thread();

	/* Remember the wait event so the unwedge side can issue the wakeup. */
	debug_cseg_wait_event = (event_t) &c_seg;

	thread_set_pending_block_hint(current_thread(), kThreadWaitCompressor);
	assert_wait((event_t) (&c_seg), THREAD_INTERRUPTIBLE);

	thread_block(THREAD_CONTINUE_NULL);

	return 0;
}

/*
 * Test hook: wake the thread parked by do_cseg_wedge_thread().
 */
int
do_cseg_unwedge_thread(void)
{
	thread_wakeup(debug_cseg_wait_event);
	debug_cseg_wait_event = NULL;

	return 0;
}
#endif /* DEVELOPMENT || DEBUG */
1511 
/*
 * Block (uninterruptibly) until the busy c_seg is woken up.
 *
 * Called with c_seg->c_lock held; the wait is asserted before the lock
 * is dropped so the wakeup cannot be missed. Returns with the lock
 * NOT held.
 */
void
c_seg_wait_on_busy(c_segment_t c_seg)
{
	/* Tell the busy holder that someone is waiting for a wakeup. */
	c_seg->c_wanted = 1;

	thread_set_pending_block_hint(current_thread(), kThreadWaitCompressor);
	assert_wait((event_t) (c_seg), THREAD_UNINT);

	lck_mtx_unlock_always(&c_seg->c_lock);
	thread_block(THREAD_CONTINUE_NULL);
}
1523 
#if CONFIG_FREEZE
/*
 * We don't have the task lock held while updating the task's
 * c_seg queues. We can do that because of the following restrictions:
 *
 * - SINGLE FREEZER CONTEXT:
 *   We 'insert' c_segs into the task list on the task_freeze path.
 *   There can only be one such freeze in progress and the task
 *   isn't disappearing because we have the VM map lock held throughout
 *   and we have a reference on the proc too.
 *
 * - SINGLE TASK DISOWN CONTEXT:
 *   We 'disown' c_segs of a task ONLY from the task_terminate context. So
 *   we don't need the task lock but we need the c_list_lock and the
 *   compressor master lock (shared). We also hold the individual
 *   c_seg locks (exclusive).
 *
 *   If we either:
 *   - can't get the c_seg lock on a try, then we start again because maybe
 *   the c_seg is part of a compaction and might get freed. So we can't trust
 *   that linkage and need to restart our queue traversal.
 *   - OR, we run into a busy c_seg (say being swapped in or free-ing) we
 *   drop all locks again and wait and restart our queue traversal.
 *
 * - The new_owner_task below is currently only the kernel or NULL.
 *
 */
/*
 * Transfer ownership of a frozen c_seg between tasks, moving it between
 * the tasks' frozen-cseg queues and adjusting each task's frozen-to-swap
 * accounting by the segment's uncompressed footprint. Either task may be
 * NULL. Requires the c_list_lock and c_seg->c_lock (asserted below).
 */
void
c_seg_update_task_owner(c_segment_t c_seg, task_t new_owner_task)
{
	task_t          owner_task = c_seg->c_task_owner;
	/* Uncompressed footprint: one page per slot in use. */
	uint64_t        uncompressed_bytes = ((c_seg->c_slots_used) * PAGE_SIZE_64);

	LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED);
	LCK_MTX_ASSERT(&c_seg->c_lock, LCK_MTX_ASSERT_OWNED);

	if (owner_task) {
		task_update_frozen_to_swap_acct(owner_task, uncompressed_bytes, DEBIT_FROM_SWAP);
		queue_remove(&owner_task->task_frozen_cseg_q, c_seg,
		    c_segment_t, c_task_list_next_cseg);
	}

	if (new_owner_task) {
		queue_enter(&new_owner_task->task_frozen_cseg_q, c_seg,
		    c_segment_t, c_task_list_next_cseg);
		task_update_frozen_to_swap_acct(new_owner_task, uncompressed_bytes, CREDIT_TO_SWAP);
	}

	c_seg->c_task_owner = new_owner_task;
}
1574 
/*
 * Hand every frozen c_seg owned by owner_task over to the kernel_task
 * (used on task termination; see the locking discussion above
 * c_seg_update_task_owner()). The traversal restarts from scratch
 * whenever a segment's lock can't be taken without blocking or the
 * segment is busy, since the queue linkage can't be trusted after
 * dropping the locks.
 */
void
task_disown_frozen_csegs(task_t owner_task)
{
	c_segment_t c_seg = NULL, next_cseg = NULL;

again:
	PAGE_REPLACEMENT_DISALLOWED(TRUE);
	lck_mtx_lock_spin_always(c_list_lock);

	for (c_seg = (c_segment_t) queue_first(&owner_task->task_frozen_cseg_q);
	    !queue_end(&owner_task->task_frozen_cseg_q, (queue_entry_t) c_seg);
	    c_seg = next_cseg) {
		next_cseg = (c_segment_t) queue_next(&c_seg->c_task_list_next_cseg);

		if (!lck_mtx_try_lock_spin_always(&c_seg->c_lock)) {
			/* Contended: drop everything and restart the traversal. */
			lck_mtx_unlock(c_list_lock);
			PAGE_REPLACEMENT_DISALLOWED(FALSE);
			goto again;
		}

		if (c_seg->c_busy) {
			/* Busy (e.g. swapping/freeing): wait for it, then restart. */
			lck_mtx_unlock(c_list_lock);
			PAGE_REPLACEMENT_DISALLOWED(FALSE);

			c_seg_wait_on_busy(c_seg);

			goto again;
		}
		assert(c_seg->c_task_owner == owner_task);
		c_seg_update_task_owner(c_seg, kernel_task);
		lck_mtx_unlock_always(&c_seg->c_lock);
	}

	lck_mtx_unlock(c_list_lock);
	PAGE_REPLACEMENT_DISALLOWED(FALSE);
}
#endif /* CONFIG_FREEZE */
1612 
1613 void
c_seg_switch_state(c_segment_t c_seg,int new_state,boolean_t insert_head)1614 c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head)
1615 {
1616 	int     old_state = c_seg->c_state;
1617 	queue_head_t *donate_swapout_list_head, *donate_swappedin_list_head;
1618 	uint32_t     *donate_swapout_count, *donate_swappedin_count;
1619 
1620 	/*
1621 	 * On macOS the donate queue is swapped first ie the c_early_swapout queue.
1622 	 * On other swap-capable platforms, we want to swap those out last. So we
1623 	 * use the c_late_swapout queue.
1624 	 */
1625 #if XNU_TARGET_OS_OSX
1626 #if (DEVELOPMENT || DEBUG)
1627 	if (new_state != C_IS_FILLING) {
1628 		LCK_MTX_ASSERT(&c_seg->c_lock, LCK_MTX_ASSERT_OWNED);
1629 	}
1630 	LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED);
1631 #endif /* DEVELOPMENT || DEBUG */
1632 
1633 	donate_swapout_list_head = &c_early_swapout_list_head;
1634 	donate_swapout_count = &c_early_swapout_count;
1635 	donate_swappedin_list_head = &c_early_swappedin_list_head;
1636 	donate_swappedin_count = &c_early_swappedin_count;
1637 #else /* XNU_TARGET_OS_OSX */
1638 	donate_swapout_list_head = &c_late_swapout_list_head;
1639 	donate_swapout_count = &c_late_swapout_count;
1640 	donate_swappedin_list_head = &c_late_swappedin_list_head;
1641 	donate_swappedin_count = &c_late_swappedin_count;
1642 #endif /* XNU_TARGET_OS_OSX */
1643 
1644 	switch (old_state) {
1645 	case C_IS_EMPTY:
1646 		assert(new_state == C_IS_FILLING || new_state == C_IS_FREE);
1647 
1648 		c_empty_count--;
1649 		break;
1650 
1651 	case C_IS_FILLING:
1652 		assert(new_state == C_ON_AGE_Q || new_state == C_ON_SWAPOUT_Q);
1653 
1654 		queue_remove(&c_filling_list_head, c_seg, c_segment_t, c_age_list);
1655 		c_filling_count--;
1656 		break;
1657 
1658 	case C_ON_AGE_Q:
1659 		assert(new_state == C_ON_SWAPOUT_Q || new_state == C_ON_MAJORCOMPACT_Q ||
1660 		    new_state == C_IS_FREE);
1661 
1662 		queue_remove(&c_age_list_head, c_seg, c_segment_t, c_age_list);
1663 		c_age_count--;
1664 		break;
1665 
1666 	case C_ON_SWAPPEDIN_Q:
1667 		if (c_seg->c_has_donated_pages) {
1668 			assert(new_state == C_ON_SWAPOUT_Q || new_state == C_IS_FREE);
1669 			queue_remove(donate_swappedin_list_head, c_seg, c_segment_t, c_age_list);
1670 			*donate_swappedin_count -= 1;
1671 		} else {
1672 			assert(new_state == C_ON_AGE_Q || new_state == C_IS_FREE);
1673 #if CONFIG_FREEZE
1674 			assert(c_seg->c_has_freezer_pages);
1675 			queue_remove(&c_early_swappedin_list_head, c_seg, c_segment_t, c_age_list);
1676 			c_early_swappedin_count--;
1677 #else /* CONFIG_FREEZE */
1678 			queue_remove(&c_regular_swappedin_list_head, c_seg, c_segment_t, c_age_list);
1679 			c_regular_swappedin_count--;
1680 #endif /* CONFIG_FREEZE */
1681 		}
1682 		break;
1683 
1684 	case C_ON_SWAPOUT_Q:
1685 		assert(new_state == C_ON_AGE_Q || new_state == C_IS_FREE || new_state == C_IS_EMPTY || new_state == C_ON_SWAPIO_Q);
1686 
1687 #if CONFIG_FREEZE
1688 		if (c_seg->c_has_freezer_pages) {
1689 			if (c_seg->c_task_owner && (new_state != C_ON_SWAPIO_Q)) {
1690 				c_seg_update_task_owner(c_seg, NULL);
1691 			}
1692 			queue_remove(&c_early_swapout_list_head, c_seg, c_segment_t, c_age_list);
1693 			c_early_swapout_count--;
1694 		} else
1695 #endif /* CONFIG_FREEZE */
1696 		{
1697 			if (c_seg->c_has_donated_pages) {
1698 				queue_remove(donate_swapout_list_head, c_seg, c_segment_t, c_age_list);
1699 				*donate_swapout_count -= 1;
1700 			} else {
1701 				queue_remove(&c_regular_swapout_list_head, c_seg, c_segment_t, c_age_list);
1702 				c_regular_swapout_count--;
1703 			}
1704 		}
1705 
1706 		if (new_state == C_ON_AGE_Q) {
1707 			c_seg->c_has_donated_pages = 0;
1708 		}
1709 		thread_wakeup((event_t)&compaction_swapper_running);
1710 		break;
1711 
1712 	case C_ON_SWAPIO_Q:
1713 #if CONFIG_FREEZE
1714 		if (c_seg->c_has_freezer_pages) {
1715 			assert(new_state == C_ON_SWAPPEDOUT_Q || new_state == C_ON_SWAPPEDOUTSPARSE_Q || new_state == C_ON_AGE_Q);
1716 		} else
1717 #endif /* CONFIG_FREEZE */
1718 		{
1719 			if (c_seg->c_has_donated_pages) {
1720 				assert(new_state == C_ON_SWAPPEDOUT_Q || new_state == C_ON_SWAPPEDOUTSPARSE_Q || new_state == C_ON_SWAPPEDIN_Q);
1721 			} else {
1722 				assert(new_state == C_ON_SWAPPEDOUT_Q || new_state == C_ON_SWAPPEDOUTSPARSE_Q || new_state == C_ON_AGE_Q);
1723 			}
1724 		}
1725 
1726 		queue_remove(&c_swapio_list_head, c_seg, c_segment_t, c_age_list);
1727 		c_swapio_count--;
1728 		break;
1729 
1730 	case C_ON_SWAPPEDOUT_Q:
1731 		assert(new_state == C_ON_SWAPPEDIN_Q || new_state == C_ON_AGE_Q ||
1732 		    new_state == C_ON_SWAPPEDOUTSPARSE_Q ||
1733 		    new_state == C_ON_BAD_Q || new_state == C_IS_EMPTY || new_state == C_IS_FREE);
1734 
1735 		queue_remove(&c_swappedout_list_head, c_seg, c_segment_t, c_age_list);
1736 		c_swappedout_count--;
1737 		break;
1738 
1739 	case C_ON_SWAPPEDOUTSPARSE_Q:
1740 		assert(new_state == C_ON_SWAPPEDIN_Q || new_state == C_ON_AGE_Q ||
1741 		    new_state == C_ON_BAD_Q || new_state == C_IS_EMPTY || new_state == C_IS_FREE);
1742 
1743 		queue_remove(&c_swappedout_sparse_list_head, c_seg, c_segment_t, c_age_list);
1744 		c_swappedout_sparse_count--;
1745 		break;
1746 
1747 	case C_ON_MAJORCOMPACT_Q:
1748 		assert(new_state == C_ON_AGE_Q || new_state == C_IS_FREE);
1749 
1750 		queue_remove(&c_major_list_head, c_seg, c_segment_t, c_age_list);
1751 		c_major_count--;
1752 		break;
1753 
1754 	case C_ON_BAD_Q:
1755 		assert(new_state == C_IS_FREE);
1756 
1757 		queue_remove(&c_bad_list_head, c_seg, c_segment_t, c_age_list);
1758 		c_bad_count--;
1759 		break;
1760 
1761 	default:
1762 		panic("c_seg %p has bad c_state = %d", c_seg, old_state);
1763 	}
1764 
1765 	switch (new_state) {
1766 	case C_IS_FREE:
1767 		assert(old_state != C_IS_FILLING);
1768 
1769 		break;
1770 
1771 	case C_IS_EMPTY:
1772 		assert(old_state == C_ON_SWAPOUT_Q || old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q);
1773 
1774 		c_empty_count++;
1775 		break;
1776 
1777 	case C_IS_FILLING:
1778 		assert(old_state == C_IS_EMPTY);
1779 
1780 		queue_enter(&c_filling_list_head, c_seg, c_segment_t, c_age_list);
1781 		c_filling_count++;
1782 		break;
1783 
1784 	case C_ON_AGE_Q:
1785 		assert(old_state == C_IS_FILLING || old_state == C_ON_SWAPPEDIN_Q ||
1786 		    old_state == C_ON_SWAPOUT_Q || old_state == C_ON_SWAPIO_Q ||
1787 		    old_state == C_ON_MAJORCOMPACT_Q || old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q);
1788 
1789 		assert(!c_seg->c_has_donated_pages);
1790 		if (old_state == C_IS_FILLING) {
1791 			queue_enter(&c_age_list_head, c_seg, c_segment_t, c_age_list);
1792 		} else {
1793 			if (!queue_empty(&c_age_list_head)) {
1794 				c_segment_t     c_first;
1795 
1796 				c_first = (c_segment_t)queue_first(&c_age_list_head);
1797 				c_seg->c_creation_ts = c_first->c_creation_ts;
1798 			}
1799 			queue_enter_first(&c_age_list_head, c_seg, c_segment_t, c_age_list);
1800 		}
1801 		c_age_count++;
1802 		break;
1803 
1804 	case C_ON_SWAPPEDIN_Q:
1805 	{
1806 		queue_head_t *list_head;
1807 
1808 		assert(old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q || old_state == C_ON_SWAPIO_Q);
1809 		if (c_seg->c_has_donated_pages) {
1810 			/* Error in swapouts could happen while the c_seg is still on the swapio queue */
1811 			list_head = donate_swappedin_list_head;
1812 			*donate_swappedin_count += 1;
1813 		} else {
1814 #if CONFIG_FREEZE
1815 			assert(c_seg->c_has_freezer_pages);
1816 			list_head = &c_early_swappedin_list_head;
1817 			c_early_swappedin_count++;
1818 #else /* CONFIG_FREEZE */
1819 			list_head = &c_regular_swappedin_list_head;
1820 			c_regular_swappedin_count++;
1821 #endif /* CONFIG_FREEZE */
1822 		}
1823 
1824 		if (insert_head == TRUE) {
1825 			queue_enter_first(list_head, c_seg, c_segment_t, c_age_list);
1826 		} else {
1827 			queue_enter(list_head, c_seg, c_segment_t, c_age_list);
1828 		}
1829 		break;
1830 	}
1831 
1832 	case C_ON_SWAPOUT_Q:
1833 	{
1834 		queue_head_t *list_head;
1835 
1836 #if CONFIG_FREEZE
1837 		/*
1838 		 * A segment with both identities of frozen + donated pages
1839 		 * will be put on early swapout Q ie the frozen identity wins.
1840 		 * This is because when both identities are set, the donation bit
1841 		 * is added on after in the c_current_seg_filled path for accounting
1842 		 * purposes.
1843 		 */
1844 		if (c_seg->c_has_freezer_pages) {
1845 			assert(old_state == C_ON_AGE_Q || old_state == C_IS_FILLING);
1846 			list_head = &c_early_swapout_list_head;
1847 			c_early_swapout_count++;
1848 		} else
1849 #endif
1850 		{
1851 			if (c_seg->c_has_donated_pages) {
1852 				assert(old_state == C_ON_SWAPPEDIN_Q || old_state == C_IS_FILLING);
1853 				list_head = donate_swapout_list_head;
1854 				*donate_swapout_count += 1;
1855 			} else {
1856 				assert(old_state == C_ON_AGE_Q || old_state == C_IS_FILLING);
1857 				list_head = &c_regular_swapout_list_head;
1858 				c_regular_swapout_count++;
1859 			}
1860 		}
1861 
1862 		if (insert_head == TRUE) {
1863 			queue_enter_first(list_head, c_seg, c_segment_t, c_age_list);
1864 		} else {
1865 			queue_enter(list_head, c_seg, c_segment_t, c_age_list);
1866 		}
1867 		break;
1868 	}
1869 
1870 	case C_ON_SWAPIO_Q:
1871 		assert(old_state == C_ON_SWAPOUT_Q);
1872 
1873 		if (insert_head == TRUE) {
1874 			queue_enter_first(&c_swapio_list_head, c_seg, c_segment_t, c_age_list);
1875 		} else {
1876 			queue_enter(&c_swapio_list_head, c_seg, c_segment_t, c_age_list);
1877 		}
1878 		c_swapio_count++;
1879 		break;
1880 
1881 	case C_ON_SWAPPEDOUT_Q:
1882 		assert(old_state == C_ON_SWAPIO_Q);
1883 
1884 		if (insert_head == TRUE) {
1885 			queue_enter_first(&c_swappedout_list_head, c_seg, c_segment_t, c_age_list);
1886 		} else {
1887 			queue_enter(&c_swappedout_list_head, c_seg, c_segment_t, c_age_list);
1888 		}
1889 		c_swappedout_count++;
1890 		break;
1891 
1892 	case C_ON_SWAPPEDOUTSPARSE_Q:
1893 		assert(old_state == C_ON_SWAPIO_Q || old_state == C_ON_SWAPPEDOUT_Q);
1894 
1895 		if (insert_head == TRUE) {
1896 			queue_enter_first(&c_swappedout_sparse_list_head, c_seg, c_segment_t, c_age_list);
1897 		} else {
1898 			queue_enter(&c_swappedout_sparse_list_head, c_seg, c_segment_t, c_age_list);
1899 		}
1900 
1901 		c_swappedout_sparse_count++;
1902 		break;
1903 
1904 	case C_ON_MAJORCOMPACT_Q:
1905 		assert(old_state == C_ON_AGE_Q);
1906 		assert(!c_seg->c_has_donated_pages);
1907 
1908 		if (insert_head == TRUE) {
1909 			queue_enter_first(&c_major_list_head, c_seg, c_segment_t, c_age_list);
1910 		} else {
1911 			queue_enter(&c_major_list_head, c_seg, c_segment_t, c_age_list);
1912 		}
1913 		c_major_count++;
1914 		break;
1915 
1916 	case C_ON_BAD_Q:
1917 		assert(old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q);
1918 
1919 		if (insert_head == TRUE) {
1920 			queue_enter_first(&c_bad_list_head, c_seg, c_segment_t, c_age_list);
1921 		} else {
1922 			queue_enter(&c_bad_list_head, c_seg, c_segment_t, c_age_list);
1923 		}
1924 		c_bad_count++;
1925 		break;
1926 
1927 	default:
1928 		panic("c_seg %p requesting bad c_state = %d", c_seg, new_state);
1929 	}
1930 	c_seg->c_state = new_state;
1931 }
1932 
1933 
1934 
/*
 * Free a busy compressor segment.
 *
 * Convenience wrapper around c_seg_free_locked(): the caller holds
 * only the segment lock, but the free path needs c_list_lock taken
 * first, so drop the segment lock, take the list lock, then re-take
 * the segment lock.  The c_busy bit (asserted here) is what keeps
 * the segment from being touched during the window where neither
 * lock is held.
 */
void
c_seg_free(c_segment_t c_seg)
{
	assert(c_seg->c_busy);

	lck_mtx_unlock_always(&c_seg->c_lock);
	lck_mtx_lock_spin_always(c_list_lock);
	lck_mtx_lock_spin_always(&c_seg->c_lock);

	c_seg_free_locked(c_seg);
}
1946 
1947 
/*
 * Release a compressor segment back to the system: return its backing
 * pages (or its on-disk swap space), recycle its segment number, and
 * free the c_segment structure itself.
 *
 * Entered with c_list_lock and the segment lock held and the segment
 * marked c_busy; all compressed slots must already be gone
 * (c_slots_used == 0).  Both locks are dropped before returning.
 */
void
c_seg_free_locked(c_segment_t c_seg)
{
	int             segno;
	int             pages_populated = 0;
	int32_t         *c_buffer = NULL;
	uint64_t        c_swap_handle = 0;

	assert(c_seg->c_busy);
	assert(c_seg->c_slots_used == 0);
	assert(!c_seg->c_on_minorcompact_q);
	assert(!c_seg->c_busy_swapping);

	if (c_seg->c_overage_swap == TRUE) {
		c_overage_swapped_count--;
		c_seg->c_overage_swap = FALSE;
	}
	/* A segment is backed either by memory or by a swap handle, not both. */
	if (!(C_SEG_IS_ONDISK(c_seg))) {
		c_buffer = c_seg->c_store.c_buffer;
	} else {
		c_swap_handle = c_seg->c_store.c_swap_handle;
	}

	c_seg_switch_state(c_seg, C_IS_FREE, FALSE);

	if (c_buffer) {
		pages_populated = (round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset))) / PAGE_SIZE;
		c_seg->c_store.c_buffer = NULL;
	} else {
#if CONFIG_FREEZE
		c_seg_update_task_owner(c_seg, NULL);
#endif /* CONFIG_FREEZE */

		c_seg->c_store.c_swap_handle = (uint64_t)-1;
	}

	/* drop both locks: depopulate / vm_swap_free can't run under them */
	lck_mtx_unlock_always(&c_seg->c_lock);

	lck_mtx_unlock_always(c_list_lock);

	if (c_buffer) {
		if (pages_populated) {
			kernel_memory_depopulate((vm_offset_t)c_buffer,
			    ptoa(pages_populated), KMA_COMPRESSOR,
			    VM_KERN_MEMORY_COMPRESSOR);
		}
	} else if (c_swap_handle) {
		/*
		 * Free swap space on disk.
		 */
		vm_swap_free(c_swap_handle);
	}
	lck_mtx_lock_spin_always(&c_seg->c_lock);
	/*
	 * c_seg must remain busy until
	 * after the call to vm_swap_free
	 */
	C_SEG_WAKEUP_DONE(c_seg);
	lck_mtx_unlock_always(&c_seg->c_lock);

	segno = c_seg->c_mysegno;

	lck_mtx_lock_spin_always(c_list_lock);
	/*
	 * because the c_buffer is now associated with the segno,
	 * we can't put the segno back on the free list until
	 * after we have depopulated the c_buffer range, or
	 * we run the risk of depopulating a range that is
	 * now being used in one of the compressor heads
	 */
	c_segments[segno].c_segno = c_free_segno_head;
	c_free_segno_head = segno;
	c_segment_count--;

	lck_mtx_unlock_always(c_list_lock);

	lck_mtx_destroy(&c_seg->c_lock, &vm_compressor_lck_grp);

	/* free the overflow slot array, if one was ever allocated */
	if (c_seg->c_slot_var_array_len) {
		kfree_type(struct c_slot, c_seg->c_slot_var_array_len,
		    c_seg->c_slot_var_array);
	}

	zfree(compressor_segment_zone, c_seg);
}
2033 
#if DEVELOPMENT || DEBUG
/* Running total of pages made reclaimable by c_seg_trim_tail() (debug builds only). */
int c_seg_trim_page_count = 0;
#endif
2037 
/*
 * Trim trailing empty slots off a segment: walk backwards from
 * c_nextslot until the last slot holding compressed data is found,
 * then pull c_nextoffset and c_populated_offset back so they end
 * right after that slot.  No-op on a segment with no bytes in use.
 */
void
c_seg_trim_tail(c_segment_t c_seg)
{
	c_slot_t        cs;
	uint32_t        c_size;
	uint32_t        c_offset;
	uint32_t        c_rounded_size;
	uint16_t        current_nextslot;
	uint32_t        current_populated_offset;

	if (c_seg->c_bytes_used == 0) {
		return;
	}
	/* remember the starting values so we can tell whether we trimmed anything */
	current_nextslot = c_seg->c_nextslot;
	current_populated_offset = c_seg->c_populated_offset;

	while (c_seg->c_nextslot) {
		cs = C_SEG_SLOT_FROM_INDEX(c_seg, (c_seg->c_nextslot - 1));

		c_size = UNPACK_C_SIZE(cs);

		if (c_size) {
			/* found the last slot with live data */
			if (current_nextslot != c_seg->c_nextslot) {
				/*
				 * at least one empty slot was dropped off the tail...
				 * recompute the segment's end offsets to land just
				 * past this last live slot
				 */
				c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;
				c_offset = cs->c_offset + C_SEG_BYTES_TO_OFFSET(c_rounded_size);

				c_seg->c_nextoffset = c_offset;
				c_seg->c_populated_offset = (c_offset + (C_SEG_BYTES_TO_OFFSET(PAGE_SIZE) - 1)) &
				    ~(C_SEG_BYTES_TO_OFFSET(PAGE_SIZE) - 1);

				if (c_seg->c_firstemptyslot > c_seg->c_nextslot) {
					c_seg->c_firstemptyslot = c_seg->c_nextslot;
				}
#if DEVELOPMENT || DEBUG
				/* account for the pages this trim made reclaimable */
				c_seg_trim_page_count += ((round_page_32(C_SEG_OFFSET_TO_BYTES(current_populated_offset)) -
				    round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset))) /
				    PAGE_SIZE);
#endif
			}
			break;
		}
		c_seg->c_nextslot--;
	}
	/* c_bytes_used != 0, so at least one live slot must remain */
	assert(c_seg->c_nextslot);
}
2083 
2084 
/*
 * Minor-compact a segment in place: slide the live slots down over the
 * holes left by freed ones, then depopulate any buffer pages that fall
 * beyond the new populated offset.
 *
 * Entered with the segment lock held and the segment marked c_busy;
 * the segment lock is always dropped.  Returns 1 if the segment was
 * completely empty and got freed, 0 otherwise.  When clear_busy is
 * TRUE, c_busy is cleared (and waiters woken) before returning.
 */
int
c_seg_minor_compaction_and_unlock(c_segment_t c_seg, boolean_t clear_busy)
{
	c_slot_mapping_t slot_ptr;
	uint32_t        c_offset = 0;
	uint32_t        old_populated_offset;
	uint32_t        c_rounded_size;
	uint32_t        c_size;
	uint16_t        c_indx = 0;
	int             i;
	c_slot_t        c_dst;
	c_slot_t        c_src;

	assert(c_seg->c_busy);

#if VALIDATE_C_SEGMENTS
	c_seg_validate(c_seg, FALSE);
#endif
	if (c_seg->c_bytes_used == 0) {
		/* nothing live: free the whole segment instead of compacting */
		c_seg_free(c_seg);
		return 1;
	}
	lck_mtx_unlock_always(&c_seg->c_lock);

	/* not worth compacting if no holes or less than a page to recover */
	if (c_seg->c_firstemptyslot >= c_seg->c_nextslot || C_SEG_UNUSED_BYTES(c_seg) < PAGE_SIZE) {
		goto done;
	}

/* TODO: assert first emptyslot's c_size is actually 0 */

#if DEVELOPMENT || DEBUG
	C_SEG_MAKE_WRITEABLE(c_seg);
#endif

#if VALIDATE_C_SEGMENTS
	c_seg->c_was_minor_compacted++;
#endif
	/* start writing at the first hole; everything before it is already packed */
	c_indx = c_seg->c_firstemptyslot;
	c_dst = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);

	old_populated_offset = c_seg->c_populated_offset;
	c_offset = c_dst->c_offset;

	for (i = c_indx + 1; i < c_seg->c_nextslot && c_offset < c_seg->c_nextoffset; i++) {
		c_src = C_SEG_SLOT_FROM_INDEX(c_seg, i);

		c_size = UNPACK_C_SIZE(c_src);

		if (c_size == 0) {
			/* freed slot: skip it, the gap is what we're squeezing out */
			continue;
		}

		c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;
/* N.B.: This memcpy may be an overlapping copy */
		memcpy(&c_seg->c_store.c_buffer[c_offset], &c_seg->c_store.c_buffer[c_src->c_offset], c_rounded_size);

		cslot_copy(c_dst, c_src);
		c_dst->c_offset = c_offset;

		/* repoint the mapping slot at the slot's new index */
		slot_ptr = C_SLOT_UNPACK_PTR(c_dst);
		slot_ptr->s_cindx = c_indx;

		c_offset += C_SEG_BYTES_TO_OFFSET(c_rounded_size);
		PACK_C_SIZE(c_src, 0);
		c_indx++;

		c_dst = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
	}
	/* everything is now packed: [0, c_indx) are live, no holes remain */
	c_seg->c_firstemptyslot = c_indx;
	c_seg->c_nextslot = c_indx;
	c_seg->c_nextoffset = c_offset;
	c_seg->c_populated_offset = (c_offset + (C_SEG_BYTES_TO_OFFSET(PAGE_SIZE) - 1)) & ~(C_SEG_BYTES_TO_OFFSET(PAGE_SIZE) - 1);
	c_seg->c_bytes_unused = 0;

#if VALIDATE_C_SEGMENTS
	c_seg_validate(c_seg, TRUE);
#endif
	if (old_populated_offset > c_seg->c_populated_offset) {
		/* give back the pages freed up by the compaction */
		uint32_t        gc_size;
		int32_t         *gc_ptr;

		gc_size = C_SEG_OFFSET_TO_BYTES(old_populated_offset - c_seg->c_populated_offset);
		gc_ptr = &c_seg->c_store.c_buffer[c_seg->c_populated_offset];

		kernel_memory_depopulate((vm_offset_t)gc_ptr, gc_size,
		    KMA_COMPRESSOR, VM_KERN_MEMORY_COMPRESSOR);
	}

#if DEVELOPMENT || DEBUG
	C_SEG_WRITE_PROTECT(c_seg);
#endif

done:
	if (clear_busy == TRUE) {
		lck_mtx_lock_spin_always(&c_seg->c_lock);
		C_SEG_WAKEUP_DONE(c_seg);
		lck_mtx_unlock_always(&c_seg->c_lock);
	}
	return 0;
}
2185 
2186 
/*
 * Ensure a c_slot exists for the segment's c_nextslot index.  The
 * first c_seg_fixed_array_len slots live inline in the segment and
 * never need allocation; indices past that are kept in a variable
 * array grown by doubling.  The new array is allocated without the
 * segment lock held; only the copy and the pointer/length swap are
 * done under the lock, after which the old array is freed.
 */
static void
c_seg_alloc_nextslot(c_segment_t c_seg)
{
	struct c_slot   *old_slot_array = NULL;
	struct c_slot   *new_slot_array = NULL;
	int             newlen;
	int             oldlen;

	if (c_seg->c_nextslot < c_seg_fixed_array_len) {
		return;
	}

	if ((c_seg->c_nextslot - c_seg_fixed_array_len) >= c_seg->c_slot_var_array_len) {
		oldlen = c_seg->c_slot_var_array_len;
		old_slot_array = c_seg->c_slot_var_array;

		/* first growth starts at the minimum length; after that, double */
		if (oldlen == 0) {
			newlen = c_seg_slot_var_array_min_len;
		} else {
			newlen = oldlen * 2;
		}

		new_slot_array = kalloc_type(struct c_slot, newlen, Z_WAITOK);

		lck_mtx_lock_spin_always(&c_seg->c_lock);

		if (old_slot_array) {
			memcpy(new_slot_array, old_slot_array,
			    sizeof(struct c_slot) * oldlen);
		}

		c_seg->c_slot_var_array_len = newlen;
		c_seg->c_slot_var_array = new_slot_array;

		lck_mtx_unlock_always(&c_seg->c_lock);

		/* kfree_type() tolerates a NULL array when oldlen == 0 */
		kfree_type(struct c_slot, oldlen, old_slot_array);
	}
}
2226 
2227 
#define C_SEG_MAJOR_COMPACT_STATS_MAX   (30)

/*
 * Major-compaction statistics, kept as an array of snapshots; the
 * entry currently being accumulated into is selected by
 * c_seg_major_compact_stats_now.
 */
struct {
	uint64_t asked_permission;        /* calls to c_seg_major_compact_ok() */
	uint64_t compactions;             /* calls to c_seg_major_compact() */
	uint64_t moved_slots;             /* slots copied dst <- src */
	uint64_t moved_bytes;             /* compressed bytes copied dst <- src */
	uint64_t wasted_space_in_swapouts;
	uint64_t count_of_swapouts;
	uint64_t count_of_freed_segs;
	uint64_t bailed_compactions;
	uint64_t bytes_freed_rate_us;
} c_seg_major_compact_stats[C_SEG_MAJOR_COMPACT_STATS_MAX];

int c_seg_major_compact_stats_now = 0;


/* A segment at or above 90% of its buffer size is considered full enough. */
#define C_MAJOR_COMPACTION_SIZE_APPROPRIATE     ((c_seg_bufsize * 90) / 100)
2246 
2247 
2248 boolean_t
c_seg_major_compact_ok(c_segment_t c_seg_dst,c_segment_t c_seg_src)2249 c_seg_major_compact_ok(
2250 	c_segment_t c_seg_dst,
2251 	c_segment_t c_seg_src)
2252 {
2253 	c_seg_major_compact_stats[c_seg_major_compact_stats_now].asked_permission++;
2254 
2255 	if (c_seg_src->c_bytes_used >= C_MAJOR_COMPACTION_SIZE_APPROPRIATE &&
2256 	    c_seg_dst->c_bytes_used >= C_MAJOR_COMPACTION_SIZE_APPROPRIATE) {
2257 		return FALSE;
2258 	}
2259 
2260 	if (c_seg_dst->c_nextoffset >= c_seg_off_limit || c_seg_dst->c_nextslot >= C_SLOT_MAX_INDEX) {
2261 		/*
2262 		 * destination segment is full... can't compact
2263 		 */
2264 		return FALSE;
2265 	}
2266 
2267 	return TRUE;
2268 }
2269 
2270 
/*
 * Move as much compressed data as will fit from c_seg_src into
 * c_seg_dst, then fix up the slot back-pointers so the mapping slots
 * reference their new home in the destination segment.
 *
 * Both segments must be marked c_busy by the caller (no locks held);
 * see the comment below for why that suffices.  Returns FALSE once
 * the destination fills up, TRUE if more compaction is possible.
 */
boolean_t
c_seg_major_compact(
	c_segment_t c_seg_dst,
	c_segment_t c_seg_src)
{
	c_slot_mapping_t slot_ptr;
	uint32_t        c_rounded_size;
	uint32_t        c_size;
	uint16_t        dst_slot;
	int             i;
	c_slot_t        c_dst;
	c_slot_t        c_src;
	boolean_t       keep_compacting = TRUE;

	/*
	 * segments are not locked but they are both marked c_busy
	 * which keeps c_decompress from working on them...
	 * we can safely allocate new pages, move compressed data
	 * from c_seg_src to c_seg_dst and update both c_segment's
	 * state w/o holding the master lock
	 */
#if DEVELOPMENT || DEBUG
	C_SEG_MAKE_WRITEABLE(c_seg_dst);
#endif

#if VALIDATE_C_SEGMENTS
	c_seg_dst->c_was_major_compacted++;
	c_seg_src->c_was_major_donor++;
#endif
	assertf(c_seg_dst->c_has_donated_pages == c_seg_src->c_has_donated_pages, "Mismatched donation status Dst: %p, Src: %p\n", c_seg_dst, c_seg_src);
	c_seg_major_compact_stats[c_seg_major_compact_stats_now].compactions++;

	/* remember where the copied-in slots begin, for back-pointer fixup below */
	dst_slot = c_seg_dst->c_nextslot;

	for (i = 0; i < c_seg_src->c_nextslot; i++) {
		c_src = C_SEG_SLOT_FROM_INDEX(c_seg_src, i);

		c_size = UNPACK_C_SIZE(c_src);

		if (c_size == 0) {
			/* BATCH: move what we have so far; */
			continue;
		}

		if (C_SEG_OFFSET_TO_BYTES(c_seg_dst->c_populated_offset - c_seg_dst->c_nextoffset) < (unsigned) c_size) {
			int     size_to_populate;

			/* doesn't fit */
			size_to_populate = c_seg_bufsize - C_SEG_OFFSET_TO_BYTES(c_seg_dst->c_populated_offset);

			if (size_to_populate == 0) {
				/* can't fit */
				keep_compacting = FALSE;
				break;
			}
			if (size_to_populate > C_SEG_MAX_POPULATE_SIZE) {
				size_to_populate = C_SEG_MAX_POPULATE_SIZE;
			}

			/* grow the destination's populated buffer to make room */
			kernel_memory_populate(
				(vm_offset_t) &c_seg_dst->c_store.c_buffer[c_seg_dst->c_populated_offset],
				size_to_populate,
				KMA_NOFAIL | KMA_COMPRESSOR,
				VM_KERN_MEMORY_COMPRESSOR);

			c_seg_dst->c_populated_offset += C_SEG_BYTES_TO_OFFSET(size_to_populate);
			assert(C_SEG_OFFSET_TO_BYTES(c_seg_dst->c_populated_offset) <= c_seg_bufsize);
		}
		c_seg_alloc_nextslot(c_seg_dst);

		c_dst = C_SEG_SLOT_FROM_INDEX(c_seg_dst, c_seg_dst->c_nextslot);

		memcpy(&c_seg_dst->c_store.c_buffer[c_seg_dst->c_nextoffset], &c_seg_src->c_store.c_buffer[c_src->c_offset], c_size);

		c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;

		c_seg_major_compact_stats[c_seg_major_compact_stats_now].moved_slots++;
		c_seg_major_compact_stats[c_seg_major_compact_stats_now].moved_bytes += c_size;

		cslot_copy(c_dst, c_src);
		c_dst->c_offset = c_seg_dst->c_nextoffset;

		if (c_seg_dst->c_firstemptyslot == c_seg_dst->c_nextslot) {
			c_seg_dst->c_firstemptyslot++;
		}
		c_seg_dst->c_slots_used++;
		c_seg_dst->c_nextslot++;
		c_seg_dst->c_bytes_used += c_rounded_size;
		c_seg_dst->c_nextoffset += C_SEG_BYTES_TO_OFFSET(c_rounded_size);

		/* mark the source slot empty; its bytes now live in the destination */
		PACK_C_SIZE(c_src, 0);

		c_seg_src->c_bytes_used -= c_rounded_size;
		c_seg_src->c_bytes_unused += c_rounded_size;
		c_seg_src->c_firstemptyslot = 0;

		assert(c_seg_src->c_slots_used);
		c_seg_src->c_slots_used--;

		if (!c_seg_src->c_swappedin) {
			/* Pessimistically lose swappedin status when non-swappedin pages are added. */
			c_seg_dst->c_swappedin = false;
		}

		if (c_seg_dst->c_nextoffset >= c_seg_off_limit || c_seg_dst->c_nextslot >= C_SLOT_MAX_INDEX) {
			/* dest segment is now full */
			keep_compacting = FALSE;
			break;
		}
	}
#if DEVELOPMENT || DEBUG
	C_SEG_WRITE_PROTECT(c_seg_dst);
#endif
	if (dst_slot < c_seg_dst->c_nextslot) {
		PAGE_REPLACEMENT_ALLOWED(TRUE);
		/*
		 * we've now locked out c_decompress from
		 * converting the slot passed into it into
		 * a c_segment_t which allows us to use
		 * the backptr to change which c_segment and
		 * index the slot points to
		 */
		while (dst_slot < c_seg_dst->c_nextslot) {
			c_dst = C_SEG_SLOT_FROM_INDEX(c_seg_dst, dst_slot);

			slot_ptr = C_SLOT_UNPACK_PTR(c_dst);
			/* <csegno=0,indx=0> would mean "empty slot", so use csegno+1 */
			slot_ptr->s_cseg = c_seg_dst->c_mysegno + 1;
			slot_ptr->s_cindx = dst_slot++;
		}
		PAGE_REPLACEMENT_ALLOWED(FALSE);
	}
	return keep_compacting;
}
2405 
2406 
2407 uint64_t
vm_compressor_compute_elapsed_msecs(clock_sec_t end_sec,clock_nsec_t end_nsec,clock_sec_t start_sec,clock_nsec_t start_nsec)2408 vm_compressor_compute_elapsed_msecs(clock_sec_t end_sec, clock_nsec_t end_nsec, clock_sec_t start_sec, clock_nsec_t start_nsec)
2409 {
2410 	uint64_t end_msecs;
2411 	uint64_t start_msecs;
2412 
2413 	end_msecs = (end_sec * 1000) + end_nsec / 1000000;
2414 	start_msecs = (start_sec * 1000) + start_nsec / 1000000;
2415 
2416 	return end_msecs - start_msecs;
2417 }
2418 
2419 
2420 
/* How often compute_swapout_target_age() runs a full evaluation. */
uint32_t compressor_eval_period_in_msecs = 250;
/* Minimum / maximum length of a sampling window; past the max, sampled data is reset. */
uint32_t compressor_sample_min_in_msecs = 500;
uint32_t compressor_sample_max_in_msecs = 10000;
/* Decompression rate (per 10 msecs) above which the sample indicates thrashing. */
uint32_t compressor_thrashing_threshold_per_10msecs = 50;
/* Minimum compression/decompression activity (per 10 msecs) needed for a meaningful sample. */
uint32_t compressor_thrashing_min_per_10msecs = 20;

/* When true, reset sample data next chance we get. */
static boolean_t        compressor_need_sample_reset = FALSE;
2429 
2430 
/*
 * Evaluate the sampled decompression activity to decide whether the
 * compressor is being thrashed, and if so compute swapout_target_age:
 * the creation timestamp before which segments become swapout
 * candidates.  Runs a cheap check every compressor_eval_period_in_msecs
 * and a full decision once a sample window of at least
 * compressor_sample_min_in_msecs has accumulated; both the eval and
 * sample state are reset via the flags/done block at the bottom.
 */
void
compute_swapout_target_age(void)
{
	clock_sec_t     cur_ts_sec;
	clock_nsec_t    cur_ts_nsec;
	uint32_t        min_operations_needed_in_this_sample;
	uint64_t        elapsed_msecs_in_eval;
	uint64_t        elapsed_msecs_in_sample;
	boolean_t       need_eval_reset = FALSE;

	clock_get_system_nanotime(&cur_ts_sec, &cur_ts_nsec);

	elapsed_msecs_in_sample = vm_compressor_compute_elapsed_msecs(cur_ts_sec, cur_ts_nsec, start_of_sample_period_sec, start_of_sample_period_nsec);

	/* sample window expired (or a reset was requested): throw the data away */
	if (compressor_need_sample_reset ||
	    elapsed_msecs_in_sample >= compressor_sample_max_in_msecs) {
		compressor_need_sample_reset = TRUE;
		need_eval_reset = TRUE;
		goto done;
	}
	elapsed_msecs_in_eval = vm_compressor_compute_elapsed_msecs(cur_ts_sec, cur_ts_nsec, start_of_eval_period_sec, start_of_eval_period_nsec);

	/* too soon since the last evaluation */
	if (elapsed_msecs_in_eval < compressor_eval_period_in_msecs) {
		goto done;
	}
	need_eval_reset = TRUE;

	KERNEL_DEBUG(0xe0400020 | DBG_FUNC_START, elapsed_msecs_in_eval, sample_period_compression_count, sample_period_decompression_count, 0, 0);

	min_operations_needed_in_this_sample = (compressor_thrashing_min_per_10msecs * (uint32_t)elapsed_msecs_in_eval) / 10;

	/* not enough activity since the last eval to say anything: clear the target and resample */
	if ((sample_period_compression_count - last_eval_compression_count) < min_operations_needed_in_this_sample ||
	    (sample_period_decompression_count - last_eval_decompression_count) < min_operations_needed_in_this_sample) {
		KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, sample_period_compression_count - last_eval_compression_count,
		    sample_period_decompression_count - last_eval_decompression_count, 0, 1, 0);

		swapout_target_age = 0;

		compressor_need_sample_reset = TRUE;
		need_eval_reset = TRUE;
		goto done;
	}
	last_eval_compression_count = sample_period_compression_count;
	last_eval_decompression_count = sample_period_decompression_count;

	/* keep sampling until the window is long enough to decide on */
	if (elapsed_msecs_in_sample < compressor_sample_min_in_msecs) {
		KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, swapout_target_age, 0, 0, 5, 0);
		goto done;
	}
	if (sample_period_decompression_count > ((compressor_thrashing_threshold_per_10msecs * elapsed_msecs_in_sample) / 10)) {
		uint64_t        running_total;
		uint64_t        working_target;
		uint64_t        aging_target;
		uint32_t        oldest_age_of_csegs_sampled = 0;
		uint64_t        working_set_approximation = 0;

		swapout_target_age = 0;

		working_target = (sample_period_decompression_count / 100) * 95;                /* 95 percent */
		aging_target = (sample_period_decompression_count / 100) * 1;                   /* 1 percent */
		running_total = 0;

		/*
		 * walk the age histogram from youngest to oldest until 95% of
		 * the sampled decompressions are covered; also accumulate an
		 * age-weighted approximation of the working set along the way
		 */
		for (oldest_age_of_csegs_sampled = 0; oldest_age_of_csegs_sampled < DECOMPRESSION_SAMPLE_MAX_AGE; oldest_age_of_csegs_sampled++) {
			running_total += age_of_decompressions_during_sample_period[oldest_age_of_csegs_sampled];

			working_set_approximation += oldest_age_of_csegs_sampled * age_of_decompressions_during_sample_period[oldest_age_of_csegs_sampled];

			if (running_total >= working_target) {
				break;
			}
		}
		if (oldest_age_of_csegs_sampled < DECOMPRESSION_SAMPLE_MAX_AGE) {
			working_set_approximation = (working_set_approximation * 1000) / elapsed_msecs_in_sample;

			if (working_set_approximation < VM_PAGE_COMPRESSOR_COUNT) {
				/*
				 * now walk from oldest to youngest until 1% of the
				 * decompressions (plus overage) are covered: segments
				 * older than that age become swapout candidates
				 */
				running_total = overage_decompressions_during_sample_period;

				for (oldest_age_of_csegs_sampled = DECOMPRESSION_SAMPLE_MAX_AGE - 1; oldest_age_of_csegs_sampled; oldest_age_of_csegs_sampled--) {
					running_total += age_of_decompressions_during_sample_period[oldest_age_of_csegs_sampled];

					if (running_total >= aging_target) {
						break;
					}
				}
				swapout_target_age = (uint32_t)cur_ts_sec - oldest_age_of_csegs_sampled;

				KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, swapout_target_age, working_set_approximation, VM_PAGE_COMPRESSOR_COUNT, 2, 0);
			} else {
				KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, working_set_approximation, VM_PAGE_COMPRESSOR_COUNT, 0, 3, 0);
			}
		} else {
			KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, working_target, running_total, 0, 4, 0);
		}

		compressor_need_sample_reset = TRUE;
		need_eval_reset = TRUE;
	} else {
		KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, sample_period_decompression_count, (compressor_thrashing_threshold_per_10msecs * elapsed_msecs_in_sample) / 10, 0, 6, 0);
	}
done:
	if (compressor_need_sample_reset == TRUE) {
		bzero(age_of_decompressions_during_sample_period, sizeof(age_of_decompressions_during_sample_period));
		overage_decompressions_during_sample_period = 0;

		start_of_sample_period_sec = cur_ts_sec;
		start_of_sample_period_nsec = cur_ts_nsec;
		sample_period_decompression_count = 0;
		sample_period_compression_count = 0;
		last_eval_decompression_count = 0;
		last_eval_compression_count = 0;
		compressor_need_sample_reset = FALSE;
	}
	if (need_eval_reset == TRUE) {
		start_of_eval_period_sec = cur_ts_sec;
		start_of_eval_period_nsec = cur_ts_nsec;
	}
}
2548 
2549 
/* State flags coordinating the compaction swapper with its callers/wakers. */
int             compaction_swapper_init_now = 0;
int             compaction_swapper_running = 0;
int             compaction_swapper_awakened = 0;
int             compaction_swapper_abort = 0;
2554 
2555 bool
vm_compressor_swapout_is_ripe()2556 vm_compressor_swapout_is_ripe()
2557 {
2558 	bool is_ripe = false;
2559 	if (vm_swapout_ripe_segments == TRUE && c_overage_swapped_count < c_overage_swapped_limit) {
2560 		c_segment_t     c_seg;
2561 		clock_sec_t     now;
2562 		clock_sec_t     age;
2563 		clock_nsec_t    nsec;
2564 
2565 		clock_get_system_nanotime(&now, &nsec);
2566 		age = 0;
2567 
2568 		lck_mtx_lock_spin_always(c_list_lock);
2569 
2570 		if (!queue_empty(&c_age_list_head)) {
2571 			c_seg = (c_segment_t) queue_first(&c_age_list_head);
2572 
2573 			age = now - c_seg->c_creation_ts;
2574 		}
2575 		lck_mtx_unlock_always(c_list_lock);
2576 
2577 		if (age >= vm_ripe_target_age) {
2578 			is_ripe = true;
2579 		}
2580 	}
2581 	return is_ripe;
2582 }
2583 
2584 static bool
compressor_swapout_conditions_met(void)2585 compressor_swapout_conditions_met(void)
2586 {
2587 	bool should_swap = false;
2588 	if (COMPRESSOR_NEEDS_TO_SWAP()) {
2589 		should_swap = true;
2590 		vmcs_stats.compressor_swap_threshold_exceeded++;
2591 	}
2592 	if (VM_PAGE_Q_THROTTLED(&vm_pageout_queue_external) && vm_page_anonymous_count < (vm_page_inactive_count / 20)) {
2593 		should_swap = true;
2594 		vmcs_stats.external_q_throttled++;
2595 	}
2596 	if (vm_page_free_count < (vm_page_free_reserved - (COMPRESSOR_FREE_RESERVED_LIMIT * 2))) {
2597 		should_swap = true;
2598 		vmcs_stats.free_count_below_reserve++;
2599 	}
2600 	return should_swap;
2601 }
2602 
/*
 * Decide whether the compaction swapper should push segments out.
 * Consults overage ripeness, the swapout thresholds, and (where
 * enabled) the thrashing heuristics.  Under CONFIG_JETSAM, a
 * thrashing/low-space condition wakes the memorystatus thread and
 * suppresses swapping so jetsam can act first.  A TRUE return with
 * swap unsupported still drives major compaction (see comment at
 * the bottom).
 */
static bool
compressor_needs_to_swap()
{
	bool should_swap = false;
	if (vm_compressor_swapout_is_ripe()) {
		should_swap = true;
		goto check_if_low_space;
	}

	if (VM_CONFIG_SWAP_IS_ACTIVE) {
		should_swap =  compressor_swapout_conditions_met();
		if (should_swap) {
			goto check_if_low_space;
		}
	}

#if (XNU_TARGET_OS_OSX && __arm64__)
	/*
	 * Thrashing detection disabled.
	 */
#else /* (XNU_TARGET_OS_OSX && __arm64__) */

	if (vm_compressor_is_thrashing()) {
		should_swap = true;
		vmcs_stats.thrashing_detected++;
	}

#if CONFIG_PHANTOM_CACHE
	if (vm_phantom_cache_check_pressure()) {
		os_atomic_store(&memorystatus_phantom_cache_pressure, true, release);
		should_swap = true;
	}
#endif
	if (swapout_target_age) {
		should_swap = true;
	}
#endif /* (XNU_TARGET_OS_OSX && __arm64__) */

check_if_low_space:

#if CONFIG_JETSAM
	if (should_swap || vm_compressor_low_on_space() == TRUE) {
		if (vm_compressor_thrashing_detected == FALSE) {
			vm_compressor_thrashing_detected = TRUE;

			/* attribute the jetsam to the condition that triggered it */
			if (swapout_target_age) {
				compressor_thrashing_induced_jetsam++;
			} else if (vm_compressor_low_on_space() == TRUE) {
				compressor_thrashing_induced_jetsam++;
			} else {
				filecache_thrashing_induced_jetsam++;
			}
			/*
			 * Wake up the memorystatus thread so that it can return
			 * the system to a healthy state (by killing processes).
			 */
			memorystatus_thread_wake();
		}
		/*
		 * let the jetsam take precedence over
		 * any major compactions we might have
		 * been able to do... otherwise we run
		 * the risk of doing major compactions
		 * on segments we're about to free up
		 * due to the jetsam activity.
		 */
		should_swap = false;
		if (memorystatus_swap_all_apps && vm_swap_low_on_space()) {
			vm_compressor_take_paging_space_action();
		}
	}

#else /* CONFIG_JETSAM */
	if (should_swap && vm_swap_low_on_space()) {
		vm_compressor_take_paging_space_action();
	}
#endif /* CONFIG_JETSAM */

	if (should_swap == false) {
		/*
		 * vm_compressor_needs_to_major_compact returns true only if we're
		 * about to run out of available compressor segments... in this
		 * case, we absolutely need to run a major compaction even if
		 * we've just kicked off a jetsam or we don't otherwise need to
		 * swap... terminating objects releases
		 * pages back to the uncompressed cache, but does not guarantee
		 * that we will free up even a single compression segment
		 */
		should_swap = vm_compressor_needs_to_major_compact();
		if (should_swap) {
			vmcs_stats.fragmentation_detected++;
		}
	}

	/*
	 * returning TRUE when swap_supported == FALSE
	 * will cause the major compaction engine to
	 * run, but will not trigger any swapping...
	 * segments that have been major compacted
	 * will be moved to the majorcompact queue
	 */
	return should_swap;
}
2706 
2707 #if CONFIG_JETSAM
2708 /*
2709  * This function is called from the jetsam thread after killing something to
2710  * mitigate thrashing.
2711  *
2712  * We need to restart our thrashing detection heuristics since memory pressure
2713  * has potentially changed significantly, and we don't want to detect on old
2714  * data from before the jetsam.
2715  */
void
vm_thrashing_jetsam_done(void)
{
	/* forget the old verdict; post-kill conditions are measured afresh */
	vm_compressor_thrashing_detected = FALSE;

	/* Were we compressor-thrashing or filecache-thrashing? */
	if (swapout_target_age) {
		/* compressor-thrashing: clear the swapout target and restart sampling */
		swapout_target_age = 0;
		compressor_need_sample_reset = TRUE;
	}
#if CONFIG_PHANTOM_CACHE
	else {
		/* filecache-thrashing: restart the phantom-cache sampling instead */
		vm_phantom_cache_restart_sample();
	}
#endif
}
2732 #endif /* CONFIG_JETSAM */
2733 
/* Diagnostic counters for vm_wake_compactor_swapper() / vm_run_compactor(). */
uint32_t vm_wake_compactor_swapper_calls = 0;   /* wakeups issued to the swap trigger thread */
uint32_t vm_run_compactor_already_running = 0;  /* bailed out: another thread was already compacting */
uint32_t vm_run_compactor_empty_minor_q = 0;    /* bailed out: nothing on the minor-compaction queue */
uint32_t vm_run_compactor_did_compact = 0;      /* ran a delayed-compaction pass ourselves */
uint32_t vm_run_compactor_waited = 0;           /* blocked waiting for a running compaction to finish */
2739 
/*
 * Synchronously run one pass of delayed minor compactions if there is
 * work on the minor queue and no other thread is already compacting.
 */
void
vm_run_compactor(void)
{
	if (c_segment_count == 0) {
		/* no compressor segments exist; nothing to do */
		return;
	}

	lck_mtx_lock_spin_always(c_list_lock);

	if (c_minor_count == 0) {
		/* minor-compaction queue is empty */
		vm_run_compactor_empty_minor_q++;

		lck_mtx_unlock_always(c_list_lock);
		return;
	}
	if (compaction_swapper_running) {
		if (vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
			/* someone else is on it; let them do the work */
			vm_run_compactor_already_running++;

			lck_mtx_unlock_always(c_list_lock);
			return;
		}
		/*
		 * single-processor configuration: block until the running
		 * compaction finishes instead of returning immediately
		 */
		vm_run_compactor_waited++;

		assert_wait((event_t)&compaction_swapper_running, THREAD_UNINT);

		lck_mtx_unlock_always(c_list_lock);

		thread_block(THREAD_CONTINUE_NULL);

		return;
	}
	vm_run_compactor_did_compact++;

	fastwake_warmup = FALSE;
	compaction_swapper_running = 1;

	vm_compressor_do_delayed_compactions(FALSE);

	compaction_swapper_running = 0;

	lck_mtx_unlock_always(c_list_lock);

	/* wake any thread that blocked on us in the waited path above */
	thread_wakeup((event_t)&compaction_swapper_running);
}
2785 
2786 
/*
 * Wake the compaction/swap trigger thread if there is minor-compaction
 * work queued or a major compaction appears to be needed.  The initial
 * unlocked checks are advisory; the authoritative test is repeated
 * under c_list_lock before the wakeup is issued.
 */
void
vm_wake_compactor_swapper(void)
{
	if (compaction_swapper_running || compaction_swapper_awakened || c_segment_count == 0) {
		return;
	}

	if (c_minor_count || vm_compressor_needs_to_major_compact()) {
		lck_mtx_lock_spin_always(c_list_lock);

		fastwake_warmup = FALSE;

		/* re-check under the lock so at most one wakeup is posted */
		if (compaction_swapper_running == 0 && compaction_swapper_awakened == 0) {
			vm_wake_compactor_swapper_calls++;

			compaction_swapper_awakened = 1;
			thread_wakeup((event_t)&c_compressor_swap_trigger);
		}
		lck_mtx_unlock_always(c_list_lock);
	}
}
2808 
2809 
/*
 * Force a compact-and-swap pass now.  Aborts any compaction currently
 * in progress and waits for it to drain, then moves "ripe" segments
 * (older than vm_ripe_target_age) from the major queue back onto the
 * age queue so the subsequent vm_compressor_compact_and_swap() pass
 * will consider them.  Requires a swap-capable configuration.
 */
void
vm_consider_swapping()
{
	c_segment_t     c_seg, c_seg_next;
	clock_sec_t     now;
	clock_nsec_t    nsec;

	assert(VM_CONFIG_SWAP_IS_PRESENT);

	lck_mtx_lock_spin_always(c_list_lock);

	compaction_swapper_abort = 1;

	/* wait for any in-flight compaction to notice the abort and exit */
	while (compaction_swapper_running) {
		assert_wait((event_t)&compaction_swapper_running, THREAD_UNINT);

		lck_mtx_unlock_always(c_list_lock);

		thread_block(THREAD_CONTINUE_NULL);

		lck_mtx_lock_spin_always(c_list_lock);
	}
	compaction_swapper_abort = 0;
	compaction_swapper_running = 1;

	vm_swapout_ripe_segments = TRUE;

	if (!queue_empty(&c_major_list_head)) {
		clock_get_system_nanotime(&now, &nsec);

		c_seg = (c_segment_t)queue_first(&c_major_list_head);

		while (!queue_end(&c_major_list_head, (queue_entry_t)c_seg)) {
			if (c_overage_swapped_count >= c_overage_swapped_limit) {
				/* already at the cap of swapped overage segments */
				break;
			}

			/* capture the next link before c_seg may be requeued */
			c_seg_next = (c_segment_t) queue_next(&c_seg->c_age_list);

			if ((now - c_seg->c_creation_ts) >= vm_ripe_target_age) {
				lck_mtx_lock_spin_always(&c_seg->c_lock);

				c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);

				lck_mtx_unlock_always(&c_seg->c_lock);
			}
			c_seg = c_seg_next;
		}
	}
	vm_compressor_compact_and_swap(FALSE);

	compaction_swapper_running = 0;

	vm_swapout_ripe_segments = FALSE;

	lck_mtx_unlock_always(c_list_lock);

	/* wake any thread waiting for the compaction we just completed */
	thread_wakeup((event_t)&compaction_swapper_running);
}
2869 
2870 
/*
 * Decide — with cheap unlocked checks first — whether the compaction/
 * swap trigger thread should be woken, and wake it if so.  The first
 * call also arranges for the swapper's deferred initialization by
 * setting compaction_swapper_init_now.
 */
void
vm_consider_waking_compactor_swapper(void)
{
	boolean_t       need_wakeup = FALSE;

	if (c_segment_count == 0) {
		return;
	}

	if (compaction_swapper_running || compaction_swapper_awakened) {
		return;
	}

	if (!compaction_swapper_inited && !compaction_swapper_init_now) {
		/* first wakeup: have the trigger thread run its deferred init */
		compaction_swapper_init_now = 1;
		need_wakeup = TRUE;
	}

	if (c_minor_count && (COMPRESSOR_NEEDS_TO_MINOR_COMPACT())) {
		need_wakeup = TRUE;
	} else if (compressor_needs_to_swap()) {
		need_wakeup = TRUE;
	} else if (c_minor_count) {
		uint64_t        total_bytes;

		total_bytes = compressor_object->resident_page_count * PAGE_SIZE_64;

		/* wake up if more than 10% of the compressor's resident bytes are unused */
		if ((total_bytes - compressor_bytes_used) > total_bytes / 10) {
			need_wakeup = TRUE;
		}
	}
	if (need_wakeup == TRUE) {
		lck_mtx_lock_spin_always(c_list_lock);

		fastwake_warmup = FALSE;

		/* re-check under the lock so only one wakeup is issued */
		if (compaction_swapper_running == 0 && compaction_swapper_awakened == 0) {
			memoryshot(VM_WAKEUP_COMPACTOR_SWAPPER, DBG_FUNC_NONE);

			compaction_swapper_awakened = 1;
			thread_wakeup((event_t)&c_compressor_swap_trigger);
		}
		lck_mtx_unlock_always(c_list_lock);
	}
}
2916 
2917 
/* swapout-queue depth threshold consulted by vm_compressor_do_delayed_compactions() */
#define C_SWAPOUT_LIMIT                 4
/* minor compactions performed between needs-to-swap re-evaluations */
#define DELAYED_COMPACTIONS_PER_PASS    30
2920 
/*
 * Drain the minor-compaction queue, minor-compacting each segment.
 * Every DELAYED_COMPACTIONS_PER_PASS segments we re-evaluate whether
 * swapping is called for (always so when flush_all is TRUE) and, if
 * it is and the swapout queues have room, stop so the caller can swap.
 *
 * Called with c_list_lock held; the lock is dropped and retaken while
 * waiting on busy segments and across each minor compaction.
 */
void
vm_compressor_do_delayed_compactions(boolean_t flush_all)
{
	c_segment_t     c_seg;
	int             number_compacted = 0;
	boolean_t       needs_to_swap = FALSE;
	uint32_t        c_swapout_count = 0;


	VM_DEBUG_CONSTANT_EVENT(vm_compressor_do_delayed_compactions, VM_COMPRESSOR_DO_DELAYED_COMPACTIONS, DBG_FUNC_START, c_minor_count, flush_all, 0, 0);

#if XNU_TARGET_OS_OSX
	LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED);
#endif /* XNU_TARGET_OS_OSX */

	while (!queue_empty(&c_minor_list_head) && needs_to_swap == FALSE) {
		c_seg = (c_segment_t)queue_first(&c_minor_list_head);

		lck_mtx_lock_spin_always(&c_seg->c_lock);

		if (c_seg->c_busy) {
			/* drop the list lock while waiting for the busy segment */
			lck_mtx_unlock_always(c_list_lock);
			c_seg_wait_on_busy(c_seg);
			lck_mtx_lock_spin_always(c_list_lock);

			continue;
		}
		C_SEG_BUSY(c_seg);

		c_seg_do_minor_compaction_and_unlock(c_seg, TRUE, FALSE, TRUE);

		c_swapout_count = c_early_swapout_count + c_regular_swapout_count + c_late_swapout_count;
		if (VM_CONFIG_SWAP_IS_ACTIVE && (number_compacted++ > DELAYED_COMPACTIONS_PER_PASS)) {
			/* time to re-evaluate whether we should stop and swap */
			if ((flush_all == TRUE || compressor_needs_to_swap()) && c_swapout_count < C_SWAPOUT_LIMIT) {
				needs_to_swap = TRUE;
			}

			number_compacted = 0;
		}
		lck_mtx_lock_spin_always(c_list_lock);
	}

	VM_DEBUG_CONSTANT_EVENT(vm_compressor_do_delayed_compactions, VM_COMPRESSOR_DO_DELAYED_COMPACTIONS, DBG_FUNC_END, c_minor_count, number_compacted, needs_to_swap, 0);
}
2965 
2966 int min_csegs_per_major_compaction = DELAYED_COMPACTIONS_PER_PASS;
2967 
/*
 * Major-compact 'c_seg' by repeatedly pulling compressed data in from
 * its successive neighbors on the queue it occupies (the age queue for
 * C_ON_AGE_Q segments, the late-swappedin queue for donated
 * C_ON_SWAPPEDIN_Q segments), freeing neighbors that end up empty.
 *
 * Called with c_list_lock held and c_seg busy; c_list_lock is dropped
 * and retaken around waits and compactions.  Returns with c_seg's own
 * lock held (c_seg still busy).
 *
 * Returns true if compaction ran to completion, false if we bailed out
 * because c_seg was wanted by another thread (*bail_wanted_cseg is set
 * in that case).  *c_seg_considered is incremented by the number of
 * neighbors examined and *total_bytes_freed by the bytes released.
 */
static bool
vm_compressor_major_compact_cseg(c_segment_t c_seg, uint32_t* c_seg_considered, bool* bail_wanted_cseg, uint64_t* total_bytes_freed)
{
	/*
	 * Major compaction
	 */
	bool keep_compacting = true, fully_compacted = true;
	queue_head_t *list_head = NULL;
	c_segment_t c_seg_next;
	uint64_t        bytes_to_free = 0, bytes_freed = 0;
	uint32_t        number_considered = 0;

	if (c_seg->c_state == C_ON_AGE_Q) {
		assert(!c_seg->c_has_donated_pages);
		list_head = &c_age_list_head;
	} else if (c_seg->c_state == C_ON_SWAPPEDIN_Q) {
		assert(c_seg->c_has_donated_pages);
		list_head = &c_late_swappedin_list_head;
	}

	while (keep_compacting == TRUE) {
		assert(c_seg->c_busy);

		/* look for another segment to consolidate */

		c_seg_next = (c_segment_t) queue_next(&c_seg->c_age_list);

		if (queue_end(list_head, (queue_entry_t)c_seg_next)) {
			break;
		}

		assert(c_seg_next->c_state == c_seg->c_state);

		number_considered++;

		if (c_seg_major_compact_ok(c_seg, c_seg_next) == FALSE) {
			break;
		}

		lck_mtx_lock_spin_always(&c_seg_next->c_lock);

		if (c_seg_next->c_busy) {
			/*
			 * We are going to block for our neighbor.
			 * If our c_seg is wanted, we should unbusy
			 * it because we don't know how long we might
			 * have to block here.
			 */
			if (c_seg->c_wanted) {
				lck_mtx_unlock_always(&c_seg_next->c_lock);
				fully_compacted = false;
				c_seg_major_compact_stats[c_seg_major_compact_stats_now].bailed_compactions++;
				*bail_wanted_cseg = true;
				break;
			}

			lck_mtx_unlock_always(c_list_lock);

			VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 8, (void*) VM_KERNEL_ADDRPERM(c_seg_next), 0, 0);

			c_seg_wait_on_busy(c_seg_next);
			lck_mtx_lock_spin_always(c_list_lock);

			continue;
		}
		/* grab that segment */
		C_SEG_BUSY(c_seg_next);

		bytes_to_free = C_SEG_OFFSET_TO_BYTES(c_seg_next->c_populated_offset);
		if (c_seg_do_minor_compaction_and_unlock(c_seg_next, FALSE, TRUE, TRUE)) {
			/*
			 * found an empty c_segment and freed it
			 * so we can't continue to use c_seg_next
			 */
			bytes_freed += bytes_to_free;
			c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_freed_segs++;
			continue;
		}

		/* unlock the list ... */
		lck_mtx_unlock_always(c_list_lock);

		/* do the major compaction */

		keep_compacting = c_seg_major_compact(c_seg, c_seg_next);

		VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 9, keep_compacting, 0, 0);

		PAGE_REPLACEMENT_DISALLOWED(TRUE);

		lck_mtx_lock_spin_always(&c_seg_next->c_lock);
		/*
		 * run a minor compaction on the donor segment
		 * since we pulled at least some of it's
		 * data into our target...  if we've emptied
		 * it, now is a good time to free it which
		 * c_seg_minor_compaction_and_unlock also takes care of
		 *
		 * by passing TRUE, we ask for c_busy to be cleared
		 * and c_wanted to be taken care of
		 */
		bytes_to_free = C_SEG_OFFSET_TO_BYTES(c_seg_next->c_populated_offset);
		if (c_seg_minor_compaction_and_unlock(c_seg_next, TRUE)) {
			/* donor was emptied and freed: count all of its bytes */
			bytes_freed += bytes_to_free;
			c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_freed_segs++;
		} else {
			/* donor survives: count only the bytes it actually shrank by */
			bytes_to_free -= C_SEG_OFFSET_TO_BYTES(c_seg_next->c_populated_offset);
			bytes_freed += bytes_to_free;
		}

		PAGE_REPLACEMENT_DISALLOWED(FALSE);

		/* relock the list */
		lck_mtx_lock_spin_always(c_list_lock);

		if (c_seg->c_wanted) {
			/*
			 * Our c_seg is in demand. Let's
			 * unbusy it and wakeup the waiters
			 * instead of continuing the compaction
			 * because we could be in this loop
			 * for a while.
			 */
			fully_compacted = false;
			*bail_wanted_cseg = true;
			c_seg_major_compact_stats[c_seg_major_compact_stats_now].bailed_compactions++;
			break;
		}
	} /* major compaction */

	VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 10, number_considered, *bail_wanted_cseg, 0);

	*c_seg_considered += number_considered;
	*total_bytes_freed += bytes_freed;

	lck_mtx_lock_spin_always(&c_seg->c_lock);
	return fully_compacted;
}
3106 
/*
 * TIME_SUB: in-place subtraction of the (secs, frac) timestamp from
 * (rsecs, rfrac), borrowing one second ('unit' fractional units per
 * second) when the fractional part underflows.
 */
#define TIME_SUB(rsecs, secs, rfrac, frac, unit)                        \
	MACRO_BEGIN                                                     \
	if ((int)((rfrac) -= (frac)) < 0) {                             \
	        (rfrac) += (unit);                                      \
	        (rsecs) -= 1;                                           \
	}                                                               \
	(rsecs) -= (secs);                                              \
	MACRO_END
3115 
#if (XNU_TARGET_OS_OSX && __arm64__)
clock_nsec_t c_process_major_report_over_ms = 9; /* report if over 9 ms */
int c_process_major_yield_after = 1000; /* yield after moving 1,000 segments */
uint64_t c_process_major_reports = 0;           /* count of over-threshold reports logged */
clock_sec_t c_process_major_max_sec = 0;        /* longest batch observed: seconds part */
clock_nsec_t c_process_major_max_nsec = 0;      /* longest batch observed: nanoseconds part */
uint32_t c_process_major_peak_segcount = 0;     /* most segments moved in a single call */
/*
 * Move every segment from the major-compaction queue back to the age
 * queue, taking them from the tail so aging order is preserved.  The
 * work is batched: after c_process_major_yield_after segments we drop
 * c_list_lock briefly (mutex_pause) to let other threads in, and any
 * batch that took longer than c_process_major_report_over_ms is logged.
 *
 * Called with c_list_lock held.
 */
static void
vm_compressor_process_major_segments(void)
{
	c_segment_t c_seg = NULL;
	int count = 0, total = 0, breaks = 0;
	clock_sec_t start_sec, end_sec;
	clock_nsec_t start_nsec, end_nsec;
	clock_nsec_t report_over_ns;

	if (queue_empty(&c_major_list_head)) {
		return;
	}

	// printf("%s: starting to move segments from MAJORQ to AGEQ\n", __FUNCTION__);
	if (c_process_major_report_over_ms != 0) {
		report_over_ns = c_process_major_report_over_ms * NSEC_PER_MSEC;
	} else {
		/* reporting disabled: use an unreachable threshold */
		report_over_ns = (clock_nsec_t)-1;
	}
	clock_get_system_nanotime(&start_sec, &start_nsec);
	while (!queue_empty(&c_major_list_head)) {
		/* start from the end to preserve aging order */
		c_seg = (c_segment_t)queue_last(&c_major_list_head);
		lck_mtx_lock_spin_always(&c_seg->c_lock);
		c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
		lck_mtx_unlock_always(&c_seg->c_lock);

		count++;
		if (count == c_process_major_yield_after ||
		    queue_empty(&c_major_list_head)) {
			/* done or time to take a break */
		} else {
			/* keep going */
			continue;
		}

		/* end of a batch: update the high-water diagnostics */
		total += count;
		clock_get_system_nanotime(&end_sec, &end_nsec);
		TIME_SUB(end_sec, start_sec, end_nsec, start_nsec, NSEC_PER_SEC);
		if (end_sec > c_process_major_max_sec) {
			c_process_major_max_sec = end_sec;
			c_process_major_max_nsec = end_nsec;
		} else if (end_sec == c_process_major_max_sec &&
		    end_nsec > c_process_major_max_nsec) {
			c_process_major_max_nsec = end_nsec;
		}
		if (total > c_process_major_peak_segcount) {
			c_process_major_peak_segcount = total;
		}
		if (end_sec > 0 ||
		    end_nsec >= report_over_ns) {
			/* we used more than expected */
			c_process_major_reports++;
			printf("%s: moved %d/%d segments from MAJORQ to AGEQ in %lu.%09u seconds and %d breaks\n",
			    __FUNCTION__, count, total,
			    end_sec, end_nsec, breaks);
		}
		if (queue_empty(&c_major_list_head)) {
			/* done */
			break;
		}
		/* take a break to allow someone else to grab the lock */
		lck_mtx_unlock_always(c_list_lock);
		mutex_pause(0); /* 10 microseconds */
		lck_mtx_lock_spin_always(c_list_lock);
		/* start again */
		clock_get_system_nanotime(&start_sec, &start_nsec);
		count = 0;
		breaks++;
	}
}
#endif /* (XNU_TARGET_OS_OSX && __arm64__) */
3195 
3196 /*
3197  * macOS special swappable csegs -> early_swapin queue
3198  * non-macOS special swappable+non-freezer csegs -> late_swapin queue
3199  * Processing special csegs means minor compacting each cseg and then
3200  * major compacting it and putting them on the early or late
3201  * (depending on platform) swapout queue.
3202  */
/*
 * Minor- and then major-compact each segment on the platform's special
 * swappedin queue (early on macOS, late on swap-capable iPadOS), moving
 * well-compacted survivors to the swapout queue.  See the block comment
 * above for the policy rationale.
 *
 * Called with c_list_lock held; the lock is dropped and retaken around
 * busy-waits, compactions, and periodic yields.
 */
static void
vm_compressor_process_special_swapped_in_segments_locked(void)
{
	c_segment_t c_seg = NULL;
	bool            switch_state = true, bail_wanted_cseg = false;
	unsigned int    number_considered = 0, yield_after_considered_per_pass = 0;
	uint64_t        bytes_freed = 0;
	queue_head_t    *special_swappedin_list_head;

#if XNU_TARGET_OS_OSX
	special_swappedin_list_head = &c_early_swappedin_list_head;
#else /* XNU_TARGET_OS_OSX */
	if (memorystatus_swap_all_apps) {
		special_swappedin_list_head = &c_late_swappedin_list_head;
	} else {
		/* called on unsupported config*/
		return;
	}
#endif /* XNU_TARGET_OS_OSX */

	yield_after_considered_per_pass = MAX(min_csegs_per_major_compaction, DELAYED_COMPACTIONS_PER_PASS);
	while (!queue_empty(special_swappedin_list_head)) {
		c_seg = (c_segment_t)queue_first(special_swappedin_list_head);

		lck_mtx_lock_spin_always(&c_seg->c_lock);

		if (c_seg->c_busy) {
			/* drop the list lock while waiting for the busy segment */
			lck_mtx_unlock_always(c_list_lock);
			c_seg_wait_on_busy(c_seg);
			lck_mtx_lock_spin_always(c_list_lock);
			continue;
		}

		C_SEG_BUSY(c_seg);
		lck_mtx_unlock_always(&c_seg->c_lock);
		lck_mtx_unlock_always(c_list_lock);

		PAGE_REPLACEMENT_DISALLOWED(TRUE);

		lck_mtx_lock_spin_always(&c_seg->c_lock);

		if (c_seg_minor_compaction_and_unlock(c_seg, FALSE /*clear busy?*/)) {
			/*
			 * found an empty c_segment and freed it
			 * so go grab the next guy in the queue
			 */
			PAGE_REPLACEMENT_DISALLOWED(FALSE);
			lck_mtx_lock_spin_always(c_list_lock);
			continue;
		}

		PAGE_REPLACEMENT_DISALLOWED(FALSE);
		lck_mtx_lock_spin_always(c_list_lock);

		switch_state = vm_compressor_major_compact_cseg(c_seg, &number_considered, &bail_wanted_cseg, &bytes_freed);
		assert(c_seg->c_busy);
		assert(!c_seg->c_on_minorcompact_q);

		if (switch_state) {
			if (VM_CONFIG_SWAP_IS_ACTIVE || VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
				/*
				 * Ordinarily we let swapped in segments age out + get
				 * major compacted with the rest of the c_segs on the ageQ.
				 * But the early donated c_segs, if well compacted, should be
				 * kept ready to be swapped out if needed. These are typically
				 * describing memory belonging to a leaky app (macOS) or a swap-
				 * capable app (iPadOS) and for the latter we can keep these
				 * around longer because we control the triggers in the memorystatus
				 * subsystem
				 */
				c_seg_switch_state(c_seg, C_ON_SWAPOUT_Q, FALSE);
			}
		}

		C_SEG_WAKEUP_DONE(c_seg);

		lck_mtx_unlock_always(&c_seg->c_lock);

		if (number_considered >= yield_after_considered_per_pass) {
			if (bail_wanted_cseg) {
				/*
				 * We stopped major compactions on a c_seg
				 * that is wanted. We don't know the priority
				 * of the waiter unfortunately but we are at
				 * a very high priority and so, just in case
				 * the waiter is a critical system daemon or
				 * UI thread, let's give up the CPU in case
				 * the system is running a few CPU intensive
				 * tasks.
				 */
				bail_wanted_cseg = false;
				lck_mtx_unlock_always(c_list_lock);

				mutex_pause(2); /* 100us yield */

				lck_mtx_lock_spin_always(c_list_lock);
			}

			number_considered = 0;
		}
	}
}
3305 
/* Public entry point: takes c_list_lock around the locked worker above. */
void
vm_compressor_process_special_swapped_in_segments(void)
{
	lck_mtx_lock_spin_always(c_list_lock);
	vm_compressor_process_special_swapped_in_segments_locked();
	lck_mtx_unlock_always(c_list_lock);
}
3313 
/* seconds a segment stays on the regular swappedin queue before being aged */
#define C_SEGMENT_SWAPPEDIN_AGE_LIMIT   10
/*
 * Processing regular csegs means aging them.
 *
 * Move segments from the regular swappedin queue to the age queue once
 * they have been swapped in for C_SEGMENT_SWAPPEDIN_AGE_LIMIT seconds
 * (or unconditionally when flush_all is TRUE).  The queue is ordered by
 * swappedin time, so we can stop at the first segment that is too young.
 */
static void
vm_compressor_process_regular_swapped_in_segments(boolean_t flush_all)
{
	c_segment_t     c_seg;
	clock_sec_t     now;
	clock_nsec_t    nsec;

	clock_get_system_nanotime(&now, &nsec);

	while (!queue_empty(&c_regular_swappedin_list_head)) {
		c_seg = (c_segment_t)queue_first(&c_regular_swappedin_list_head);

		if (flush_all == FALSE && (now - c_seg->c_swappedin_ts) < C_SEGMENT_SWAPPEDIN_AGE_LIMIT) {
			/* head of the queue is still too young; so is everything behind it */
			break;
		}

		lck_mtx_lock_spin_always(&c_seg->c_lock);

		c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
		c_seg->c_agedin_ts = (uint32_t) now;

		lck_mtx_unlock_always(&c_seg->c_lock);
	}
}
3342 
3343 
3344 extern  int     vm_num_swap_files;
3345 extern  int     vm_num_pinned_swap_files;
3346 extern  int     vm_swappin_enabled;
3347 
3348 extern  unsigned int    vm_swapfile_total_segs_used;
3349 extern  unsigned int    vm_swapfile_total_segs_alloced;
3350 
3351 
/*
 * Flush the compressor in preparation for hibernation: abort and drain
 * any running compaction, move major-queue segments back to the age
 * queue so they are considered, run a flush-all compact-and-swap pass,
 * then wait (with a 5 s timeout per iteration) for the swapout queues
 * to drain.  Put failures are logged at the end.
 */
void
vm_compressor_flush(void)
{
	uint64_t        vm_swap_put_failures_at_start;
	wait_result_t   wait_result = 0;
	AbsoluteTime    startTime, endTime;
	clock_sec_t     now_sec;
	clock_nsec_t    now_nsec;
	uint64_t        nsec;
	c_segment_t     c_seg, c_seg_next;

	HIBLOG("vm_compressor_flush - starting\n");

	clock_get_uptime(&startTime);

	lck_mtx_lock_spin_always(c_list_lock);

	fastwake_warmup = FALSE;
	compaction_swapper_abort = 1;

	/* wait for any in-flight compaction to notice the abort and exit */
	while (compaction_swapper_running) {
		assert_wait((event_t)&compaction_swapper_running, THREAD_UNINT);

		lck_mtx_unlock_always(c_list_lock);

		thread_block(THREAD_CONTINUE_NULL);

		lck_mtx_lock_spin_always(c_list_lock);
	}
	compaction_swapper_abort = 0;
	compaction_swapper_running = 1;

	hibernate_flushing = TRUE;
	hibernate_no_swapspace = FALSE;
	hibernate_flush_timed_out = FALSE;
	c_generation_id_flush_barrier = c_generation_id + 1000;

	clock_get_system_nanotime(&now_sec, &now_nsec);
	hibernate_flushing_deadline = now_sec + HIBERNATE_FLUSHING_SECS_TO_COMPLETE;

	vm_swap_put_failures_at_start = vm_swap_put_failures;

	/*
	 * We are about to hibernate and so we want all segments flushed to disk.
	 * Segments that are on the major compaction queue won't be considered in
	 * the vm_compressor_compact_and_swap() pass. So we need to bring them to
	 * the ageQ for consideration.
	 */
	if (!queue_empty(&c_major_list_head)) {
		c_seg = (c_segment_t)queue_first(&c_major_list_head);

		while (!queue_end(&c_major_list_head, (queue_entry_t)c_seg)) {
			/* capture the next link before c_seg is requeued */
			c_seg_next = (c_segment_t) queue_next(&c_seg->c_age_list);
			lck_mtx_lock_spin_always(&c_seg->c_lock);
			c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
			lck_mtx_unlock_always(&c_seg->c_lock);
			c_seg = c_seg_next;
		}
	}
	vm_compressor_compact_and_swap(TRUE);

	/* wait for the swapout queues to drain, giving up if a wait times out */
	while (!queue_empty(&c_early_swapout_list_head) || !queue_empty(&c_regular_swapout_list_head) || !queue_empty(&c_late_swapout_list_head)) {
		assert_wait_timeout((event_t) &compaction_swapper_running, THREAD_INTERRUPTIBLE, 5000, 1000 * NSEC_PER_USEC);

		lck_mtx_unlock_always(c_list_lock);

		wait_result = thread_block(THREAD_CONTINUE_NULL);

		lck_mtx_lock_spin_always(c_list_lock);

		if (wait_result == THREAD_TIMED_OUT) {
			break;
		}
	}
	hibernate_flushing = FALSE;
	compaction_swapper_running = 0;

	if (vm_swap_put_failures > vm_swap_put_failures_at_start) {
		HIBLOG("vm_compressor_flush failed to clean %llu segments - vm_page_compressor_count(%d)\n",
		    vm_swap_put_failures - vm_swap_put_failures_at_start, VM_PAGE_COMPRESSOR_COUNT);
	}

	lck_mtx_unlock_always(c_list_lock);

	thread_wakeup((event_t)&compaction_swapper_running);

	clock_get_uptime(&endTime);
	SUB_ABSOLUTETIME(&endTime, &startTime);
	absolutetime_to_nanoseconds(endTime, &nsec);

	HIBLOG("vm_compressor_flush completed - took %qd msecs - vm_num_swap_files = %d, vm_num_pinned_swap_files = %d, vm_swappin_enabled = %d\n",
	    nsec / 1000000ULL, vm_num_swap_files, vm_num_pinned_swap_files, vm_swappin_enabled);
}
3445 
3446 
/* diagnostic: number of times the trigger thread has been awakened */
int             compaction_swap_trigger_thread_awakened = 0;

/*
 * Continuation body of the compaction/swap trigger thread: runs one
 * compact-and-swap pass per wakeup, then blocks on
 * c_compressor_swap_trigger with itself as the continuation.
 */
static void
vm_compressor_swap_trigger_thread(void)
{
	current_thread()->options |= TH_OPT_VMPRIV;

	/*
	 * compaction_swapper_init_now is set when the first call to
	 * vm_consider_waking_compactor_swapper is made from
	 * vm_pageout_scan... since this function is called upon
	 * thread creation, we want to make sure to delay adjusting
	 * the tuneables until we are awakened via vm_pageout_scan
	 * so that we are at a point where the vm_swapfile_open will
	 * be operating on the correct directory (in case the default
	 * of using the VM volume is overridden by the dynamic_pager)
	 */
	if (compaction_swapper_init_now) {
		vm_compaction_swapper_do_init();

		if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
			thread_vm_bind_group_add();
		}
#if CONFIG_THREAD_GROUPS
		thread_group_vm_add();
#endif
		thread_set_thread_name(current_thread(), "VM_cswap_trigger");
		compaction_swapper_init_now = 0;
	}
	lck_mtx_lock_spin_always(c_list_lock);

	compaction_swap_trigger_thread_awakened++;
	compaction_swapper_awakened = 0;

	if (compaction_swapper_running == 0) {
		compaction_swapper_running = 1;

		vm_compressor_compact_and_swap(FALSE);

		compaction_swapper_running = 0;
	}
	assert_wait((event_t)&c_compressor_swap_trigger, THREAD_UNINT);

	/* notify anyone waiting for the pass we just finished */
	if (compaction_swapper_running == 0) {
		thread_wakeup((event_t)&compaction_swapper_running);
	}

	lck_mtx_unlock_always(c_list_lock);

	thread_block((thread_continue_t)vm_compressor_swap_trigger_thread);

	/* NOTREACHED */
}
3500 
3501 
3502 void
vm_compressor_record_warmup_start(void)3503 vm_compressor_record_warmup_start(void)
3504 {
3505 	c_segment_t     c_seg;
3506 
3507 	lck_mtx_lock_spin_always(c_list_lock);
3508 
3509 	if (first_c_segment_to_warm_generation_id == 0) {
3510 		if (!queue_empty(&c_age_list_head)) {
3511 			c_seg = (c_segment_t)queue_last(&c_age_list_head);
3512 
3513 			first_c_segment_to_warm_generation_id = c_seg->c_generation_id;
3514 		} else {
3515 			first_c_segment_to_warm_generation_id = 0;
3516 		}
3517 
3518 		fastwake_recording_in_progress = TRUE;
3519 	}
3520 	lck_mtx_unlock_always(c_list_lock);
3521 }
3522 
3523 
3524 void
vm_compressor_record_warmup_end(void)3525 vm_compressor_record_warmup_end(void)
3526 {
3527 	c_segment_t     c_seg;
3528 
3529 	lck_mtx_lock_spin_always(c_list_lock);
3530 
3531 	if (fastwake_recording_in_progress == TRUE) {
3532 		if (!queue_empty(&c_age_list_head)) {
3533 			c_seg = (c_segment_t)queue_last(&c_age_list_head);
3534 
3535 			last_c_segment_to_warm_generation_id = c_seg->c_generation_id;
3536 		} else {
3537 			last_c_segment_to_warm_generation_id = first_c_segment_to_warm_generation_id;
3538 		}
3539 
3540 		fastwake_recording_in_progress = FALSE;
3541 
3542 		HIBLOG("vm_compressor_record_warmup (%qd - %qd)\n", first_c_segment_to_warm_generation_id, last_c_segment_to_warm_generation_id);
3543 	}
3544 	lck_mtx_unlock_always(c_list_lock);
3545 }
3546 
3547 
3548 #define DELAY_TRIM_ON_WAKE_SECS         25
3549 
3550 void
vm_compressor_delay_trim(void)3551 vm_compressor_delay_trim(void)
3552 {
3553 	clock_sec_t     sec;
3554 	clock_nsec_t    nsec;
3555 
3556 	clock_get_system_nanotime(&sec, &nsec);
3557 	dont_trim_until_ts = sec + DELAY_TRIM_ON_WAKE_SECS;
3558 }
3559 
3560 
/*
 * Kick off fastwake warmup on the trigger thread if a warmup window was
 * recorded.  When the window is empty (first == last generation id) it
 * is cleared and no wakeup is issued.
 */
void
vm_compressor_do_warmup(void)
{
	lck_mtx_lock_spin_always(c_list_lock);

	if (first_c_segment_to_warm_generation_id == last_c_segment_to_warm_generation_id) {
		/* nothing was recorded between warmup start and end */
		first_c_segment_to_warm_generation_id = last_c_segment_to_warm_generation_id = 0;

		lck_mtx_unlock_always(c_list_lock);
		return;
	}

	/* only wake the trigger thread if it is idle and not already signalled */
	if (compaction_swapper_running == 0 && compaction_swapper_awakened == 0) {
		fastwake_warmup = TRUE;

		compaction_swapper_awakened = 1;
		thread_wakeup((event_t)&c_compressor_swap_trigger);
	}
	lck_mtx_unlock_always(c_list_lock);
}
3581 
3582 void
do_fastwake_warmup_all(void)3583 do_fastwake_warmup_all(void)
3584 {
3585 	lck_mtx_lock_spin_always(c_list_lock);
3586 
3587 	if (queue_empty(&c_swappedout_list_head) && queue_empty(&c_swappedout_sparse_list_head)) {
3588 		lck_mtx_unlock_always(c_list_lock);
3589 		return;
3590 	}
3591 
3592 	fastwake_warmup = TRUE;
3593 
3594 	do_fastwake_warmup(&c_swappedout_list_head, TRUE);
3595 
3596 	do_fastwake_warmup(&c_swappedout_sparse_list_head, TRUE);
3597 
3598 	fastwake_warmup = FALSE;
3599 
3600 	lck_mtx_unlock_always(c_list_lock);
3601 }
3602 
/*
 * Swap segments from 'c_queue' back in ahead of demand ("warmup").
 * When consider_all_cseg is FALSE, only segments whose generation id
 * falls inside the recorded warmup window are processed, and the pass
 * stops early if free pages drop below a quarter of AVAILABLE_MEMORY.
 *
 * Called and returns with c_list_lock held.  Warmup I/O is issued at
 * the compressor tier-2 throttle and restored to tier-0 on exit.
 */
void
do_fastwake_warmup(queue_head_t *c_queue, boolean_t consider_all_cseg)
{
	c_segment_t     c_seg = NULL;
	AbsoluteTime    startTime, endTime;
	uint64_t        nsec;


	HIBLOG("vm_compressor_fastwake_warmup (%qd - %qd) - starting\n", first_c_segment_to_warm_generation_id, last_c_segment_to_warm_generation_id);

	clock_get_uptime(&startTime);

	lck_mtx_unlock_always(c_list_lock);

	proc_set_thread_policy(current_thread(),
	    TASK_POLICY_INTERNAL, TASK_POLICY_IO, THROTTLE_LEVEL_COMPRESSOR_TIER2);

	PAGE_REPLACEMENT_DISALLOWED(TRUE);

	lck_mtx_lock_spin_always(c_list_lock);

	/* fastwake_warmup can be cleared externally to cancel the pass */
	while (!queue_empty(c_queue) && fastwake_warmup == TRUE) {
		c_seg = (c_segment_t) queue_first(c_queue);

		if (consider_all_cseg == FALSE) {
			if (c_seg->c_generation_id < first_c_segment_to_warm_generation_id ||
			    c_seg->c_generation_id > last_c_segment_to_warm_generation_id) {
				break;
			}

			/* don't deplete free memory on behalf of the warmup */
			if (vm_page_free_count < (AVAILABLE_MEMORY / 4)) {
				break;
			}
		}

		lck_mtx_lock_spin_always(&c_seg->c_lock);
		lck_mtx_unlock_always(c_list_lock);

		if (c_seg->c_busy) {
			PAGE_REPLACEMENT_DISALLOWED(FALSE);
			c_seg_wait_on_busy(c_seg);
			PAGE_REPLACEMENT_DISALLOWED(TRUE);
		} else {
			/* a zero return means the segment lock is still held; drop it */
			if (c_seg_swapin(c_seg, TRUE, FALSE) == 0) {
				lck_mtx_unlock_always(&c_seg->c_lock);
			}
			c_segment_warmup_count++;

			PAGE_REPLACEMENT_DISALLOWED(FALSE);
			vm_pageout_io_throttle();
			PAGE_REPLACEMENT_DISALLOWED(TRUE);
		}
		lck_mtx_lock_spin_always(c_list_lock);
	}
	lck_mtx_unlock_always(c_list_lock);

	PAGE_REPLACEMENT_DISALLOWED(FALSE);

	proc_set_thread_policy(current_thread(),
	    TASK_POLICY_INTERNAL, TASK_POLICY_IO, THROTTLE_LEVEL_COMPRESSOR_TIER0);

	clock_get_uptime(&endTime);
	SUB_ABSOLUTETIME(&endTime, &startTime);
	absolutetime_to_nanoseconds(endTime, &nsec);

	HIBLOG("vm_compressor_fastwake_warmup completed - took %qd msecs\n", nsec / 1000000ULL);

	lck_mtx_lock_spin_always(c_list_lock);

	if (consider_all_cseg == FALSE) {
		/* window consumed: reset it so the warmup isn't repeated */
		first_c_segment_to_warm_generation_id = last_c_segment_to_warm_generation_id = 0;
	}
}
3676 
/* Defined elsewhere; TRUE while the swapout thread is actively running. */
extern bool     vm_swapout_thread_running;
/* Defined elsewhere; when TRUE the compaction sweep below must terminate. */
extern boolean_t        compressor_store_stop_compaction;
3679 
/*
 * Perform one full major-compaction / swapout sweep over the compressor's
 * age queue (c_age_list_head).
 *
 * flush_all == TRUE means we are flushing everything out (e.g. for
 * hibernation): eligible segments are pushed toward swap unconditionally.
 * Otherwise the sweep stops as soon as compressor_needs_to_swap() says
 * no further swapping is required.
 *
 * Entered and exited with c_list_lock held; the lock is dropped and
 * re-taken around every blocking operation inside the loop.
 */
void
vm_compressor_compact_and_swap(boolean_t flush_all)
{
	c_segment_t     c_seg;
	bool            switch_state, bail_wanted_cseg = false;
	clock_sec_t     now;
	clock_nsec_t    nsec;
	mach_timespec_t start_ts, end_ts;
	unsigned int    number_considered, wanted_cseg_found, yield_after_considered_per_pass, number_yields;
	uint64_t        bytes_freed, delta_usec;
	uint32_t        c_swapout_count = 0;

	VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_START, c_age_count, c_minor_count, c_major_count, vm_page_free_count);

	/*
	 * Fastwake warmup (hibernation-related, per the DBG_HIBERNATE tracing):
	 * swap the previously recorded generation range of c_segs back in
	 * before doing any compaction work.
	 */
	if (fastwake_warmup == TRUE) {
		uint64_t        starting_warmup_count;

		starting_warmup_count = c_segment_warmup_count;

		KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 11) | DBG_FUNC_START, c_segment_warmup_count,
		    first_c_segment_to_warm_generation_id, last_c_segment_to_warm_generation_id, 0, 0);
		do_fastwake_warmup(&c_swappedout_list_head, FALSE);
		KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 11) | DBG_FUNC_END, c_segment_warmup_count, c_segment_warmup_count - starting_warmup_count, 0, 0, 0);

		fastwake_warmup = FALSE;
	}

#if (XNU_TARGET_OS_OSX && __arm64__)
	/*
	 * Re-considering major csegs showed benefits on all platforms by
	 * significantly reducing fragmentation and getting back memory.
	 * However, on smaller devices, eg watch, there was increased power
	 * use for the additional compactions. And the turnover in csegs on
	 * those smaller platforms is high enough in the decompression/free
	 * path that we can skip reconsidering them here because we already
	 * consider them for major compaction in those paths.
	 */
	vm_compressor_process_major_segments();
#endif /* (XNU_TARGET_OS_OSX && __arm64__) */

	/*
	 * it's possible for the c_age_list_head to be empty if we
	 * hit our limits for growing the compressor pool and we subsequently
	 * hibernated... on the next hibernation we could see the queue as
	 * empty and not proceeed even though we have a bunch of segments on
	 * the swapped in queue that need to be dealt with.
	 */
	vm_compressor_do_delayed_compactions(flush_all);
	vm_compressor_process_special_swapped_in_segments_locked();
	vm_compressor_process_regular_swapped_in_segments(flush_all);

	/*
	 * we only need to grab the timestamp once per
	 * invocation of this function since the
	 * timescale we're interested in is measured
	 * in days
	 */
	clock_get_system_nanotime(&now, &nsec);

	start_ts.tv_sec = (int) now;
	start_ts.tv_nsec = nsec;
	delta_usec = 0;
	number_considered = 0;
	wanted_cseg_found = 0;
	number_yields = 0;
	bytes_freed = 0;
	yield_after_considered_per_pass = MAX(min_csegs_per_major_compaction, DELAYED_COMPACTIONS_PER_PASS);

#if 0
	/**
	 * SW: Need to figure out how to properly rate limit this log because it is currently way too
	 * noisy. rdar://99379414 (Figure out how to rate limit the fragmentation level logging)
	 */
	os_log(OS_LOG_DEFAULT, "memorystatus: before compaction fragmentation level %u\n", vm_compressor_fragmentation_level());
#endif

	while (!queue_empty(&c_age_list_head) && !compaction_swapper_abort && !compressor_store_stop_compaction) {
		/*
		 * While flushing for hibernation, bail out of the sweep early
		 * on any abort / out-of-space / deadline condition.
		 */
		if (hibernate_flushing == TRUE) {
			clock_sec_t     sec;

			if (hibernate_should_abort()) {
				HIBLOG("vm_compressor_flush - hibernate_should_abort returned TRUE\n");
				break;
			}
			if (hibernate_no_swapspace == TRUE) {
				HIBLOG("vm_compressor_flush - out of swap space\n");
				break;
			}
			if (vm_swap_files_pinned() == FALSE) {
				HIBLOG("vm_compressor_flush - unpinned swap files\n");
				break;
			}
			if (hibernate_in_progress_with_pinned_swap == TRUE &&
			    (vm_swapfile_total_segs_alloced == vm_swapfile_total_segs_used)) {
				HIBLOG("vm_compressor_flush - out of pinned swap space\n");
				break;
			}
			clock_get_system_nanotime(&sec, &nsec);

			if (sec > hibernate_flushing_deadline) {
				hibernate_flush_timed_out = TRUE;
				HIBLOG("vm_compressor_flush - failed to finish before deadline\n");
				break;
			}
		}

		/*
		 * Swapout queues are full: park this thread (with a 100ms
		 * timeout) until the swapout thread has drained them some.
		 */
		c_swapout_count = c_early_swapout_count + c_regular_swapout_count + c_late_swapout_count;
		if (VM_CONFIG_SWAP_IS_ACTIVE && !vm_swap_out_of_space() && c_swapout_count >= C_SWAPOUT_LIMIT) {
			assert_wait_timeout((event_t) &compaction_swapper_running, THREAD_INTERRUPTIBLE, 100, 1000 * NSEC_PER_USEC);

			if (!vm_swapout_thread_running) {
				thread_wakeup((event_t)&vm_swapout_thread);
			}

			lck_mtx_unlock_always(c_list_lock);

			VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 1, c_swapout_count, 0, 0);

			thread_block(THREAD_CONTINUE_NULL);

			lck_mtx_lock_spin_always(c_list_lock);
		}
		/*
		 * Minor compactions
		 */
		vm_compressor_do_delayed_compactions(flush_all);

		/*
		 * vm_compressor_process_early_swapped_in_segments()
		 * might be too aggressive. So OFF for now.
		 */
		vm_compressor_process_regular_swapped_in_segments(flush_all);

		/* Recompute because we dropped the c_list_lock above*/
		c_swapout_count = c_early_swapout_count + c_regular_swapout_count + c_late_swapout_count;
		if (VM_CONFIG_SWAP_IS_ACTIVE && !vm_swap_out_of_space() && c_swapout_count >= C_SWAPOUT_LIMIT) {
			/*
			 * we timed out on the above thread_block
			 * let's loop around and try again
			 * the timeout allows us to continue
			 * to do minor compactions to make
			 * more memory available
			 */
			VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 2, c_swapout_count, 0, 0);

			continue;
		}

		/*
		 * Swap out segments?
		 */
		if (flush_all == FALSE) {
			bool needs_to_swap;

			lck_mtx_unlock_always(c_list_lock);

			needs_to_swap = compressor_needs_to_swap();

			lck_mtx_lock_spin_always(c_list_lock);

			VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 3, needs_to_swap, 0, 0);

			if (!needs_to_swap) {
				break;
			}
		}
		/* The queue may have drained while the lock was dropped above. */
		if (queue_empty(&c_age_list_head)) {
			VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 4, c_age_count, 0, 0);
			break;
		}
		c_seg = (c_segment_t) queue_first(&c_age_list_head);

		assert(c_seg->c_state == C_ON_AGE_Q);

		/* When flushing, never reach past the generation-id barrier. */
		if (flush_all == TRUE && c_seg->c_generation_id > c_generation_id_flush_barrier) {
			VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 5, 0, 0, 0);
			break;
		}

		lck_mtx_lock_spin_always(&c_seg->c_lock);

		if (c_seg->c_busy) {
			/* Someone else owns this segment; wait and retry the queue head. */
			VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 6, (void*) VM_KERNEL_ADDRPERM(c_seg), 0, 0);

			lck_mtx_unlock_always(c_list_lock);
			c_seg_wait_on_busy(c_seg);
			lck_mtx_lock_spin_always(c_list_lock);

			continue;
		}
		C_SEG_BUSY(c_seg);

		if (c_seg_do_minor_compaction_and_unlock(c_seg, FALSE, TRUE, TRUE)) {
			/*
			 * found an empty c_segment and freed it
			 * so go grab the next guy in the queue
			 */
			VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 7, 0, 0, 0);
			c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_freed_segs++;
			continue;
		}

		switch_state = vm_compressor_major_compact_cseg(c_seg, &number_considered, &bail_wanted_cseg, &bytes_freed);
		if (bail_wanted_cseg) {
			/* major compaction bailed because another thread wants this c_seg */
			wanted_cseg_found++;
			bail_wanted_cseg = false;
		}

		assert(c_seg->c_busy);
		assert(!c_seg->c_on_minorcompact_q);

		if (switch_state) {
			if (VM_CONFIG_SWAP_IS_ACTIVE) {
				int new_state = C_ON_SWAPOUT_Q;
#if (XNU_TARGET_OS_OSX && __arm64__)
				if (flush_all == false && compressor_swapout_conditions_met() == false) {
					new_state = C_ON_MAJORCOMPACT_Q;
				}
#endif /* (XNU_TARGET_OS_OSX && __arm64__) */

				if (new_state == C_ON_SWAPOUT_Q) {
					/*
					 * This mode of putting a generic c_seg on the swapout list is
					 * only supported when we have general swapping enabled
					 */
					clock_sec_t lnow;
					clock_nsec_t lnsec;
					clock_get_system_nanotime(&lnow, &lnsec);
					/* Track how "ripe" the segment was when picked for swapout. */
					if (c_seg->c_agedin_ts && (lnow - c_seg->c_agedin_ts) < 30) {
						vmcs_stats.unripe_under_30s++;
					} else if (c_seg->c_agedin_ts && (lnow - c_seg->c_agedin_ts) < 60) {
						vmcs_stats.unripe_under_60s++;
					} else if (c_seg->c_agedin_ts && (lnow - c_seg->c_agedin_ts) < 300) {
						vmcs_stats.unripe_under_300s++;
					}
				}

				c_seg_switch_state(c_seg, new_state, FALSE);
			} else {
				if ((vm_swapout_ripe_segments == TRUE && c_overage_swapped_count < c_overage_swapped_limit)) {
					assert(VM_CONFIG_SWAP_IS_PRESENT);
					/*
					 * we are running compressor sweeps with swap-behind
					 * make sure the c_seg has aged enough before swapping it
					 * out...
					 */
					if ((now - c_seg->c_creation_ts) >= vm_ripe_target_age) {
						c_seg->c_overage_swap = TRUE;
						c_overage_swapped_count++;
						c_seg_switch_state(c_seg, C_ON_SWAPOUT_Q, FALSE);
					}
				}
			}
			if (c_seg->c_state == C_ON_AGE_Q) {
				/*
				 * this c_seg didn't get moved to the swapout queue
				 * so we need to move it out of the way...
				 * we just did a major compaction on it so put it
				 * on that queue
				 */
				c_seg_switch_state(c_seg, C_ON_MAJORCOMPACT_Q, FALSE);
			} else {
				c_seg_major_compact_stats[c_seg_major_compact_stats_now].wasted_space_in_swapouts += c_seg_bufsize - c_seg->c_bytes_used;
				c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_swapouts++;
			}
		}

		C_SEG_WAKEUP_DONE(c_seg);

		lck_mtx_unlock_always(&c_seg->c_lock);

		/*
		 * On systems _with_ general swap, regardless of jetsam, we wake up the swapout thread here.
		 * On systems _without_ general swap, it's the responsibility of the memorystatus
		 * subsystem to wake up the swapper.
		 * TODO: When we have full jetsam support on a swap enabled system, we will need to revisit
		 * this policy.
		 */
		if (VM_CONFIG_SWAP_IS_ACTIVE && c_swapout_count) {
			/*
			 * We don't pause/yield here because we will either
			 * yield below or at the top of the loop with the
			 * assert_wait_timeout.
			 */
			if (!vm_swapout_thread_running) {
				thread_wakeup((event_t)&vm_swapout_thread);
			}
		}

		if (number_considered >= yield_after_considered_per_pass) {
			if (wanted_cseg_found) {
				/*
				 * We stopped major compactions on a c_seg
				 * that is wanted. We don't know the priority
				 * of the waiter unfortunately but we are at
				 * a very high priority and so, just in case
				 * the waiter is a critical system daemon or
				 * UI thread, let's give up the CPU in case
				 * the system is running a few CPU intensive
				 * tasks.
				 */
				lck_mtx_unlock_always(c_list_lock);

				mutex_pause(2); /* 100us yield */

				number_yields++;

				VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 11, number_considered, number_yields, 0);

				lck_mtx_lock_spin_always(c_list_lock);
			}

			number_considered = 0;
			wanted_cseg_found = 0;
		}
	}
	clock_get_system_nanotime(&now, &nsec);

	end_ts = major_compact_ts = (mach_timespec_t){.tv_sec = (int)now, .tv_nsec = nsec};

	SUB_MACH_TIMESPEC(&end_ts, &start_ts);

	/* elapsed run time, minus the ~100us spent in each mutex_pause() yield above */
	delta_usec = (end_ts.tv_sec * USEC_PER_SEC) + (end_ts.tv_nsec / NSEC_PER_USEC) - (number_yields * 100);

	delta_usec = MAX(1, delta_usec); /* we could have 0 usec run if conditions weren't right */

	c_seg_major_compact_stats[c_seg_major_compact_stats_now].bytes_freed_rate_us = (bytes_freed / delta_usec);

	/* advance the circular stats slot for the next sweep */
	if ((c_seg_major_compact_stats_now + 1) == C_SEG_MAJOR_COMPACT_STATS_MAX) {
		c_seg_major_compact_stats_now = 0;
	} else {
		c_seg_major_compact_stats_now++;
	}

	assert(c_seg_major_compact_stats_now < C_SEG_MAJOR_COMPACT_STATS_MAX);

	VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_END, c_age_count, c_minor_count, c_major_count, vm_page_free_count);
}
4018 
4019 
/*
 * Return the c_segment that new compressed data should be appended to for
 * the caller's "current chead" slot, allocating and initializing a fresh
 * C_IS_FILLING segment when *current_chead is NULL.
 *
 * Returns NULL if the compressor pool has hit its segment-count or
 * compressed-page limits.  On success, returns with the segment's c_lock
 * held (spin) and PAGE_REPLACEMENT_DISALLOWED(TRUE) in effect; a slot has
 * been reserved via c_seg_alloc_nextslot() and buffer backing pages have
 * been populated as needed.
 */
static c_segment_t
c_seg_allocate(c_segment_t *current_chead)
{
	c_segment_t     c_seg;
	int             min_needed;
	int             size_to_populate;
	c_segment_t     *donate_queue_head;

#if XNU_TARGET_OS_OSX
	if (vm_compressor_low_on_space()) {
		vm_compressor_take_paging_space_action();
	}
#endif /* XNU_TARGET_OS_OSX */

	if ((c_seg = *current_chead) == NULL) {
		uint32_t        c_segno;

		lck_mtx_lock_spin_always(c_list_lock);

		/* someone else is growing the c_segments array; wait for them */
		while (c_segments_busy == TRUE) {
			assert_wait((event_t) (&c_segments_busy), THREAD_UNINT);

			lck_mtx_unlock_always(c_list_lock);

			thread_block(THREAD_CONTINUE_NULL);

			lck_mtx_lock_spin_always(c_list_lock);
		}
		/*
		 * Free list of segment slots is empty: populate another page of
		 * the c_segments array and chain its entries onto the free list.
		 * (Free entries use their c_segno field as the next-free link.)
		 */
		if (c_free_segno_head == (uint32_t)-1) {
			uint32_t        c_segments_available_new;
			uint32_t        compressed_pages;

#if CONFIG_FREEZE
			if (freezer_incore_cseg_acct) {
				compressed_pages = c_segment_pages_compressed_incore;
			} else {
				compressed_pages = c_segment_pages_compressed;
			}
#else
			compressed_pages = c_segment_pages_compressed;
#endif /* CONFIG_FREEZE */

			if (c_segments_available >= c_segments_limit || compressed_pages >= c_segment_pages_compressed_limit) {
				lck_mtx_unlock_always(c_list_lock);

				return NULL;
			}
			c_segments_busy = TRUE;
			lck_mtx_unlock_always(c_list_lock);

			kernel_memory_populate((vm_offset_t)c_segments_next_page,
			    PAGE_SIZE, KMA_NOFAIL | KMA_KOBJECT,
			    VM_KERN_MEMORY_COMPRESSOR);
			c_segments_next_page += PAGE_SIZE;

			c_segments_available_new = c_segments_available + C_SEGMENTS_PER_PAGE;

			if (c_segments_available_new > c_segments_limit) {
				c_segments_available_new = c_segments_limit;
			}

			/* link the new entries into an ascending free chain */
			for (c_segno = c_segments_available + 1; c_segno < c_segments_available_new; c_segno++) {
				c_segments[c_segno - 1].c_segno = c_segno;
			}

			lck_mtx_lock_spin_always(c_list_lock);

			/* terminate the new chain with the old head and publish it */
			c_segments[c_segno - 1].c_segno = c_free_segno_head;
			c_free_segno_head = c_segments_available;
			c_segments_available = c_segments_available_new;

			c_segments_busy = FALSE;
			thread_wakeup((event_t) (&c_segments_busy));
		}
		/* pop a slot off the free list */
		c_segno = c_free_segno_head;
		assert(c_segno >= 0 && c_segno < c_segments_limit);

		c_free_segno_head = (uint32_t)c_segments[c_segno].c_segno;

		/*
		 * do the rest of the bookkeeping now while we're still behind
		 * the list lock and grab our generation id now into a local
		 * so that we can install it once we have the c_seg allocated
		 */
		c_segment_count++;
		if (c_segment_count > c_segment_count_max) {
			c_segment_count_max = c_segment_count;
		}

		lck_mtx_unlock_always(c_list_lock);

		c_seg = zalloc_flags(compressor_segment_zone, Z_WAITOK | Z_ZERO);

		c_seg->c_store.c_buffer = (int32_t *)C_SEG_BUFFER_ADDRESS(c_segno);

		lck_mtx_init(&c_seg->c_lock, &vm_compressor_lck_grp, LCK_ATTR_NULL);

		c_seg->c_state = C_IS_EMPTY;
		c_seg->c_firstemptyslot = C_SLOT_MAX_INDEX;
		c_seg->c_mysegno = c_segno;

		lck_mtx_lock_spin_always(c_list_lock);
		c_empty_count++;
		c_seg_switch_state(c_seg, C_IS_FILLING, FALSE);
		c_segments[c_segno].c_seg = c_seg;
		assert(c_segments[c_segno].c_segno > c_segments_available);
		lck_mtx_unlock_always(c_list_lock);

		/*
		 * If this chead is one of the per-thread donation queue heads,
		 * mark the segment as holding donated pages.
		 */
		for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
#if XNU_TARGET_OS_OSX
			donate_queue_head = (c_segment_t*) &(pgo_iothread_internal_state[i].current_early_swapout_chead);
#else /* XNU_TARGET_OS_OSX */
			if (memorystatus_swap_all_apps) {
				donate_queue_head = (c_segment_t*) &(pgo_iothread_internal_state[i].current_late_swapout_chead);
			} else {
				donate_queue_head = NULL;
			}
#endif /* XNU_TARGET_OS_OSX */

			if (current_chead == donate_queue_head) {
				c_seg->c_has_donated_pages = 1;
				break;
			}
		}

		*current_chead = c_seg;

#if DEVELOPMENT || DEBUG
		C_SEG_MAKE_WRITEABLE(c_seg);
#endif
	}
	c_seg_alloc_nextslot(c_seg);

	size_to_populate = c_seg_allocsize - C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset);

	if (size_to_populate) {
		min_needed = PAGE_SIZE + (c_seg_allocsize - c_seg_bufsize);

		/* only populate more pages if the unused-but-populated tail is low */
		if (C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset - c_seg->c_nextoffset) < (unsigned) min_needed) {
			if (size_to_populate > C_SEG_MAX_POPULATE_SIZE) {
				size_to_populate = C_SEG_MAX_POPULATE_SIZE;
			}

			OSAddAtomic64(size_to_populate / PAGE_SIZE, &vm_pageout_vminfo.vm_compressor_pages_grabbed);

			kernel_memory_populate(
				(vm_offset_t) &c_seg->c_store.c_buffer[c_seg->c_populated_offset],
				size_to_populate,
				KMA_NOFAIL | KMA_COMPRESSOR,
				VM_KERN_MEMORY_COMPRESSOR);
		} else {
			size_to_populate = 0;
		}
	}
	PAGE_REPLACEMENT_DISALLOWED(TRUE);

	lck_mtx_lock_spin_always(&c_seg->c_lock);

	if (size_to_populate) {
		c_seg->c_populated_offset += C_SEG_BYTES_TO_OFFSET(size_to_populate);
	}

	return c_seg;
}
4184 
#if DEVELOPMENT || DEBUG
#if CONFIG_FREEZE
extern boolean_t memorystatus_freeze_to_memory;
#endif /* CONFIG_FREEZE */
#endif /* DEVELOPMENT || DEBUG */
uint64_t c_seg_total_donated_bytes = 0; /* For testing/debugging only for now. Remove and add new counters for vm_stat.*/

/*
 * Stats maintained by c_current_seg_filled(): counts of how often the
 * c_list_lock was acquired without/with contention, and the longest
 * observed wait for it (seconds + nanoseconds).
 */
uint64_t c_seg_filled_no_contention = 0;
uint64_t c_seg_filled_contention = 0;
clock_sec_t c_seg_filled_contention_sec_max = 0;
clock_nsec_t c_seg_filled_contention_nsec_max = 0;
4196 
/*
 * Finish off the currently-filling segment (*current_chead == c_seg):
 * depopulate the unused tail pages, choose the queue the segment should
 * move to (C_ON_AGE_Q by default; C_ON_SWAPOUT_Q for freezer, darkwake,
 * or donated-page segments), stamp its creation time and generation id,
 * and switch it there.  Clears *current_chead so the next compression
 * allocates a fresh segment.
 *
 * Entered with the segment's c_lock held; the lock is dropped and
 * re-taken around the depopulate and write-protect operations.
 */
static void
c_current_seg_filled(c_segment_t c_seg, c_segment_t *current_chead)
{
	uint32_t        unused_bytes;
	uint32_t        offset_to_depopulate;
	int             new_state = C_ON_AGE_Q;
	clock_sec_t     sec;
	clock_nsec_t    nsec;
	bool            head_insert = false, wakeup_swapout_thread = false;

	unused_bytes = trunc_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset - c_seg->c_nextoffset));

	if (unused_bytes) {
		offset_to_depopulate = C_SEG_BYTES_TO_OFFSET(round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_nextoffset)));

		/*
		 *  release the extra physical page(s) at the end of the segment
		 */
		lck_mtx_unlock_always(&c_seg->c_lock);

		kernel_memory_depopulate(
			(vm_offset_t) &c_seg->c_store.c_buffer[offset_to_depopulate],
			unused_bytes,
			KMA_COMPRESSOR,
			VM_KERN_MEMORY_COMPRESSOR);

		lck_mtx_lock_spin_always(&c_seg->c_lock);

		c_seg->c_populated_offset = offset_to_depopulate;
	}
	assert(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset) <= c_seg_bufsize);

#if DEVELOPMENT || DEBUG
	{
		/*
		 * Write-protect the buffer to catch stray stores; the segment
		 * must be marked busy while its lock is dropped to do so.
		 */
		boolean_t       c_seg_was_busy = FALSE;

		if (!c_seg->c_busy) {
			C_SEG_BUSY(c_seg);
		} else {
			c_seg_was_busy = TRUE;
		}

		lck_mtx_unlock_always(&c_seg->c_lock);

		C_SEG_WRITE_PROTECT(c_seg);

		lck_mtx_lock_spin_always(&c_seg->c_lock);

		if (c_seg_was_busy == FALSE) {
			C_SEG_WAKEUP_DONE(c_seg);
		}
	}
#endif

#if CONFIG_FREEZE
	/* Freezer-filled segments go straight to the swapout queue. */
	if (current_chead == (c_segment_t*) &(freezer_context_global.freezer_ctx_chead) &&
	    VM_CONFIG_SWAP_IS_PRESENT &&
	    VM_CONFIG_FREEZER_SWAP_IS_ACTIVE
#if DEVELOPMENT || DEBUG
	    && !memorystatus_freeze_to_memory
#endif /* DEVELOPMENT || DEBUG */
	    ) {
		new_state = C_ON_SWAPOUT_Q;
		wakeup_swapout_thread = true;
	}
#endif /* CONFIG_FREEZE */

	if (vm_darkwake_mode == TRUE) {
		/* darkwake: swap out immediately, at the head of the queue */
		new_state = C_ON_SWAPOUT_Q;
		head_insert = true;
		wakeup_swapout_thread = true;
	} else {
		/* segments filled from a donation queue head also go to swapout */
		c_segment_t *donate_queue_head;
		for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
#if XNU_TARGET_OS_OSX
			donate_queue_head = (c_segment_t*) &(pgo_iothread_internal_state[i].current_early_swapout_chead);
#else /* XNU_TARGET_OS_OSX */
			donate_queue_head = (c_segment_t*) &(pgo_iothread_internal_state[i].current_late_swapout_chead);
#endif /* XNU_TARGET_OS_OSX */

			if (current_chead == donate_queue_head) {
				assert(c_seg->c_has_donated_pages);
				new_state = C_ON_SWAPOUT_Q;
				c_seg_total_donated_bytes += c_seg->c_bytes_used;
				break;
			}
		}
	}

	clock_get_system_nanotime(&sec, &nsec);
	c_seg->c_creation_ts = (uint32_t)sec;

	/* take c_list_lock, recording how long we had to wait for it */
	if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
		clock_sec_t     sec2;
		clock_nsec_t    nsec2;

		lck_mtx_lock_spin_always(c_list_lock);
		clock_get_system_nanotime(&sec2, &nsec2);
		TIME_SUB(sec2, sec, nsec2, nsec, NSEC_PER_SEC);
		// printf("FBDP %s: head %p waited for c_list_lock for %lu.%09u seconds\n", __FUNCTION__, current_chead, sec2, nsec2);
		if (sec2 > c_seg_filled_contention_sec_max) {
			c_seg_filled_contention_sec_max = sec2;
			c_seg_filled_contention_nsec_max = nsec2;
		} else if (sec2 == c_seg_filled_contention_sec_max &&
		    nsec2 > c_seg_filled_contention_nsec_max) {
			c_seg_filled_contention_nsec_max = nsec2;
		}
		c_seg_filled_contention++;
	} else {
		c_seg_filled_no_contention++;
	}

#if CONFIG_FREEZE
	if (current_chead == (c_segment_t*) &(freezer_context_global.freezer_ctx_chead)) {
		if (freezer_context_global.freezer_ctx_task->donates_own_pages) {
			assert(!c_seg->c_has_donated_pages);
			c_seg->c_has_donated_pages = 1;
			OSAddAtomic(c_seg->c_slots_used, &c_segment_pages_compressed_incore_late_swapout);
		}
		c_seg->c_has_freezer_pages = 1;
	}
#endif /* CONFIG_FREEZE */

	c_seg->c_generation_id = c_generation_id++;
	c_seg_switch_state(c_seg, new_state, head_insert);

#if CONFIG_FREEZE
	/*
	 * Donated segments count as frozen to swap if we go through the freezer.
	 * TODO: What we need is a new ledger and cseg state that can describe
	 * a frozen cseg from a donated task so we can accurately decrement it on
	 * swapins.
	 */
	if (current_chead == (c_segment_t*) &(freezer_context_global.freezer_ctx_chead) && (c_seg->c_state == C_ON_SWAPOUT_Q)) {
		/*
		 * darkwake and freezer can't co-exist together
		 * We'll need to fix this accounting as a start.
		 * And early donation c_segs are separate from frozen c_segs.
		 */
		assert(vm_darkwake_mode == FALSE);
		c_seg_update_task_owner(c_seg, freezer_context_global.freezer_ctx_task);
		freezer_context_global.freezer_ctx_swapped_bytes += c_seg->c_bytes_used;
	}
#endif /* CONFIG_FREEZE */

	/* queue aged segments with a page or more of slack for minor compaction */
	if (c_seg->c_state == C_ON_AGE_Q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) {
#if CONFIG_FREEZE
		assert(c_seg->c_task_owner == NULL);
#endif /* CONFIG_FREEZE */
		c_seg_need_delayed_compaction(c_seg, TRUE);
	}

	lck_mtx_unlock_always(c_list_lock);

	if (wakeup_swapout_thread) {
		/*
		 * Darkwake and Freeze configs always
		 * wake up the swapout thread because
		 * the compactor thread that normally handles
		 * it may not be running as much in these
		 * configs.
		 */
		thread_wakeup((event_t)&vm_swapout_thread);
	}

	*current_chead = NULL;
}
4364 
4365 /*
4366  * returns with c_seg locked
4367  */
4368 void
c_seg_swapin_requeue(c_segment_t c_seg,boolean_t has_data,boolean_t minor_compact_ok,boolean_t age_on_swapin_q)4369 c_seg_swapin_requeue(c_segment_t c_seg, boolean_t has_data, boolean_t minor_compact_ok, boolean_t age_on_swapin_q)
4370 {
4371 	clock_sec_t     sec;
4372 	clock_nsec_t    nsec;
4373 
4374 	clock_get_system_nanotime(&sec, &nsec);
4375 
4376 	lck_mtx_lock_spin_always(c_list_lock);
4377 	lck_mtx_lock_spin_always(&c_seg->c_lock);
4378 
4379 	assert(c_seg->c_busy_swapping);
4380 	assert(c_seg->c_busy);
4381 
4382 	c_seg->c_busy_swapping = 0;
4383 
4384 	if (c_seg->c_overage_swap == TRUE) {
4385 		c_overage_swapped_count--;
4386 		c_seg->c_overage_swap = FALSE;
4387 	}
4388 	if (has_data == TRUE) {
4389 		if (age_on_swapin_q == TRUE || c_seg->c_has_donated_pages) {
4390 #if CONFIG_FREEZE
4391 			/*
4392 			 * If a segment has both identities, frozen and donated bits set, the donated
4393 			 * bit wins on the swapin path. This is because the segment is being swapped back
4394 			 * in and so is in demand and should be given more time to spend in memory before
4395 			 * being swapped back out under pressure.
4396 			 */
4397 			if (c_seg->c_has_donated_pages) {
4398 				c_seg->c_has_freezer_pages = 0;
4399 			}
4400 #endif /* CONFIG_FREEZE */
4401 			c_seg_switch_state(c_seg, C_ON_SWAPPEDIN_Q, FALSE);
4402 		} else {
4403 			c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
4404 		}
4405 
4406 		if (minor_compact_ok == TRUE && !c_seg->c_on_minorcompact_q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) {
4407 			c_seg_need_delayed_compaction(c_seg, TRUE);
4408 		}
4409 	} else {
4410 		c_seg->c_store.c_buffer = (int32_t*) NULL;
4411 		c_seg->c_populated_offset = C_SEG_BYTES_TO_OFFSET(0);
4412 
4413 		c_seg_switch_state(c_seg, C_ON_BAD_Q, FALSE);
4414 	}
4415 	c_seg->c_swappedin_ts = (uint32_t)sec;
4416 	c_seg->c_swappedin = true;
4417 
4418 	lck_mtx_unlock_always(c_list_lock);
4419 }
4420 
4421 
4422 
4423 /*
4424  * c_seg has to be locked and is returned locked if the c_seg isn't freed
4425  * PAGE_REPLACMENT_DISALLOWED has to be TRUE on entry and is returned TRUE
4426  * c_seg_swapin returns 1 if the c_seg was freed, 0 otherwise
4427  */
4428 
4429 int
c_seg_swapin(c_segment_t c_seg,boolean_t force_minor_compaction,boolean_t age_on_swapin_q)4430 c_seg_swapin(c_segment_t c_seg, boolean_t force_minor_compaction, boolean_t age_on_swapin_q)
4431 {
4432 	vm_offset_t     addr = 0;
4433 	uint32_t        io_size = 0;
4434 	uint64_t        f_offset;
4435 	thread_pri_floor_t token;
4436 
4437 	assert(C_SEG_IS_ONDISK(c_seg));
4438 
4439 #if !CHECKSUM_THE_SWAP
4440 	c_seg_trim_tail(c_seg);
4441 #endif
4442 	io_size = round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset));
4443 	f_offset = c_seg->c_store.c_swap_handle;
4444 
4445 	C_SEG_BUSY(c_seg);
4446 	c_seg->c_busy_swapping = 1;
4447 
4448 	/*
4449 	 * This thread is likely going to block for I/O.
4450 	 * Make sure it is ready to run when the I/O completes because
4451 	 * it needs to clear the busy bit on the c_seg so that other
4452 	 * waiting threads can make progress too.
4453 	 */
4454 	token = thread_priority_floor_start();
4455 	lck_mtx_unlock_always(&c_seg->c_lock);
4456 
4457 	PAGE_REPLACEMENT_DISALLOWED(FALSE);
4458 
4459 	addr = (vm_offset_t)C_SEG_BUFFER_ADDRESS(c_seg->c_mysegno);
4460 	c_seg->c_store.c_buffer = (int32_t*) addr;
4461 
4462 	kernel_memory_populate(addr, io_size, KMA_NOFAIL | KMA_COMPRESSOR,
4463 	    VM_KERN_MEMORY_COMPRESSOR);
4464 
4465 	if (vm_swap_get(c_seg, f_offset, io_size) != KERN_SUCCESS) {
4466 		PAGE_REPLACEMENT_DISALLOWED(TRUE);
4467 
4468 		kernel_memory_depopulate(addr, io_size, KMA_COMPRESSOR,
4469 		    VM_KERN_MEMORY_COMPRESSOR);
4470 
4471 		c_seg_swapin_requeue(c_seg, FALSE, TRUE, age_on_swapin_q);
4472 	} else {
4473 #if ENCRYPTED_SWAP
4474 		vm_swap_decrypt(c_seg);
4475 #endif /* ENCRYPTED_SWAP */
4476 
4477 #if CHECKSUM_THE_SWAP
4478 		if (c_seg->cseg_swap_size != io_size) {
4479 			panic("swapin size doesn't match swapout size");
4480 		}
4481 
4482 		if (c_seg->cseg_hash != vmc_hash((char*) c_seg->c_store.c_buffer, (int)io_size)) {
4483 			panic("c_seg_swapin - Swap hash mismatch");
4484 		}
4485 #endif /* CHECKSUM_THE_SWAP */
4486 
4487 		PAGE_REPLACEMENT_DISALLOWED(TRUE);
4488 
4489 		c_seg_swapin_requeue(c_seg, TRUE, force_minor_compaction == TRUE ? FALSE : TRUE, age_on_swapin_q);
4490 
4491 #if CONFIG_FREEZE
4492 		/*
4493 		 * c_seg_swapin_requeue() returns with the c_seg lock held.
4494 		 */
4495 		if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
4496 			assert(c_seg->c_busy);
4497 
4498 			lck_mtx_unlock_always(&c_seg->c_lock);
4499 			lck_mtx_lock_spin_always(c_list_lock);
4500 			lck_mtx_lock_spin_always(&c_seg->c_lock);
4501 		}
4502 
4503 		if (c_seg->c_task_owner) {
4504 			c_seg_update_task_owner(c_seg, NULL);
4505 		}
4506 
4507 		lck_mtx_unlock_always(c_list_lock);
4508 
4509 		OSAddAtomic(c_seg->c_slots_used, &c_segment_pages_compressed_incore);
4510 		if (c_seg->c_has_donated_pages) {
4511 			OSAddAtomic(c_seg->c_slots_used, &c_segment_pages_compressed_incore_late_swapout);
4512 		}
4513 #endif /* CONFIG_FREEZE */
4514 
4515 		OSAddAtomic64(c_seg->c_bytes_used, &compressor_bytes_used);
4516 
4517 		if (force_minor_compaction == TRUE) {
4518 			if (c_seg_minor_compaction_and_unlock(c_seg, FALSE)) {
4519 				/*
4520 				 * c_seg was completely empty so it was freed,
4521 				 * so be careful not to reference it again
4522 				 *
4523 				 * Drop the boost so that the thread priority
4524 				 * is returned back to where it is supposed to be.
4525 				 */
4526 				thread_priority_floor_end(&token);
4527 				return 1;
4528 			}
4529 
4530 			lck_mtx_lock_spin_always(&c_seg->c_lock);
4531 		}
4532 	}
4533 	C_SEG_WAKEUP_DONE(c_seg);
4534 
4535 	/*
4536 	 * Drop the boost so that the thread priority
4537 	 * is returned back to where it is supposed to be.
4538 	 */
4539 	thread_priority_floor_end(&token);
4540 
4541 	return 0;
4542 }
4543 
4544 
4545 static void
c_segment_sv_hash_drop_ref(int hash_indx)4546 c_segment_sv_hash_drop_ref(int hash_indx)
4547 {
4548 	struct c_sv_hash_entry o_sv_he, n_sv_he;
4549 
4550 	while (1) {
4551 		o_sv_he.he_record = c_segment_sv_hash_table[hash_indx].he_record;
4552 
4553 		n_sv_he.he_ref = o_sv_he.he_ref - 1;
4554 		n_sv_he.he_data = o_sv_he.he_data;
4555 
4556 		if (OSCompareAndSwap64((UInt64)o_sv_he.he_record, (UInt64)n_sv_he.he_record, (UInt64 *) &c_segment_sv_hash_table[hash_indx].he_record) == TRUE) {
4557 			if (n_sv_he.he_ref == 0) {
4558 				OSAddAtomic(-1, &c_segment_svp_in_hash);
4559 			}
4560 			break;
4561 		}
4562 	}
4563 }
4564 
4565 
/*
 * Try to take a reference on a single-value hash entry for 'data'
 * (a page filled entirely with that 32-bit pattern).
 *
 * Probes linearly from (data & C_SV_HASH_MASK) for up to
 * C_SV_HASH_MAX_MISS slots, claiming the first slot that either already
 * holds 'data' or is unreferenced.  Each claim is a 64-bit CAS on the
 * combined {ref, data} record, so no lock is needed.
 *
 * Returns the hash index on success, or -1 if no slot could be claimed
 * within the probe limit (caller falls back to storing the value in a
 * compressor slot).
 */
static int
c_segment_sv_hash_insert(uint32_t data)
{
	int             hash_sindx;
	int             misses;
	struct c_sv_hash_entry o_sv_he, n_sv_he;
	boolean_t       got_ref = FALSE;

	/* stats only: track zero-filled vs other single-value pages */
	if (data == 0) {
		OSAddAtomic(1, &c_segment_svp_zero_compressions);
	} else {
		OSAddAtomic(1, &c_segment_svp_nonzero_compressions);
	}

	hash_sindx = data & C_SV_HASH_MASK;

	for (misses = 0; misses < C_SV_HASH_MAX_MISS; misses++) {
		/* snapshot of the candidate slot's {ref, data} record */
		o_sv_he.he_record = c_segment_sv_hash_table[hash_sindx].he_record;

		/* only usable if it already holds our value or is free (ref == 0) */
		while (o_sv_he.he_data == data || o_sv_he.he_ref == 0) {
			n_sv_he.he_ref = o_sv_he.he_ref + 1;
			n_sv_he.he_data = data;

			if (OSCompareAndSwap64((UInt64)o_sv_he.he_record, (UInt64)n_sv_he.he_record, (UInt64 *) &c_segment_sv_hash_table[hash_sindx].he_record) == TRUE) {
				if (n_sv_he.he_ref == 1) {
					/* we transitioned the slot from free to in-use */
					OSAddAtomic(1, &c_segment_svp_in_hash);
				}
				got_ref = TRUE;
				break;
			}
			/* CAS lost a race; reload and re-check the usability condition */
			o_sv_he.he_record = c_segment_sv_hash_table[hash_sindx].he_record;
		}
		if (got_ref == TRUE) {
			break;
		}
		/* linear probe, wrapping at the end of the table */
		hash_sindx++;

		if (hash_sindx == C_SV_HASH_SIZE) {
			hash_sindx = 0;
		}
	}
	if (got_ref == FALSE) {
		return -1;
	}

	return hash_sindx;
}
4613 
4614 
4615 #if RECORD_THE_COMPRESSED_DATA
4616 
4617 static void
c_compressed_record_data(char * src,int c_size)4618 c_compressed_record_data(char *src, int c_size)
4619 {
4620 	if ((c_compressed_record_cptr + c_size + 4) >= c_compressed_record_ebuf) {
4621 		panic("c_compressed_record_cptr >= c_compressed_record_ebuf");
4622 	}
4623 
4624 	*(int *)((void *)c_compressed_record_cptr) = c_size;
4625 
4626 	c_compressed_record_cptr += 4;
4627 
4628 	memcpy(c_compressed_record_cptr, src, c_size);
4629 	c_compressed_record_cptr += c_size;
4630 }
4631 #endif
4632 
4633 
/*
 * Compress one page (PAGE_SIZE bytes at 'src') into the currently-filling
 * compressor segment and record its location in '*slot_ptr'.
 *
 * 'current_chead' is the caller's per-thread/per-queue current segment
 * pointer; c_seg_allocate() fills it in and c_current_seg_filled() clears
 * it when a segment is closed out.  'scratch_buf' is per-CPU scratch
 * space for the codec.
 *
 * Special cases handled here:
 *  - incompressible page: stored raw (c_size == PAGE_SIZE)
 *  - page that is a single repeated 32-bit value: either parked in the
 *    single-value hash (no segment space consumed) or stored as 4 bytes
 *
 * Returns 0 on success, 1 if a segment could not be allocated.
 */
static int
c_compress_page(char *src, c_slot_mapping_t slot_ptr, c_segment_t *current_chead, char *scratch_buf)
{
	int             c_size = -1;
	int             c_rounded_size = 0;
	int             max_csize;
	c_slot_t        cs;
	c_segment_t     c_seg;
	/*
	 * NOTE(review): 'single_value' is written on the c_size == 0 path but
	 * never read in this function body — confirm whether it is still needed.
	 */
	bool            single_value = false;

	KERNEL_DEBUG(0xe0400000 | DBG_FUNC_START, *current_chead, 0, 0, 0, 0);
retry:
	if ((c_seg = c_seg_allocate(current_chead)) == NULL) {
		return 1;
	}
	/*
	 * returns with c_seg lock held
	 * and PAGE_REPLACEMENT_DISALLOWED(TRUE)...
	 * c_nextslot has been allocated and
	 * c_store.c_buffer populated
	 */
	assert(c_seg->c_state == C_IS_FILLING);

	cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_seg->c_nextslot);

	/* record the back-pointer from the slot to its mapping */
	C_SLOT_ASSERT_PACKABLE(slot_ptr);
	cs->c_packed_ptr = C_SLOT_PACK_PTR(slot_ptr);

	cs->c_offset = c_seg->c_nextoffset;

	/* space remaining in this segment's buffer, capped at one page */
	max_csize = c_seg_bufsize - C_SEG_OFFSET_TO_BYTES((int32_t)cs->c_offset);

	if (max_csize > PAGE_SIZE) {
		max_csize = PAGE_SIZE;
	}

#if CHECKSUM_THE_DATA
	cs->c_hash_data = vmc_hash(src, PAGE_SIZE);
#endif
	boolean_t incomp_copy = FALSE;
	/* reserve 4 bytes so a "stored raw" marker/value always fits */
	int max_csize_adj = (max_csize - 4);

	if (vm_compressor_algorithm() != VM_COMPRESSOR_DEFAULT_CODEC) {
#if defined(__arm64__)
		uint16_t ccodec = CINVALID;
		uint32_t inline_popcount;
		if (max_csize >= C_SEG_OFFSET_ALIGNMENT_BOUNDARY) {
			c_size = metacompressor((const uint8_t *) src,
			    (uint8_t *) &c_seg->c_store.c_buffer[cs->c_offset],
			    max_csize_adj, &ccodec,
			    scratch_buf, &incomp_copy, &inline_popcount);
			assert(inline_popcount == C_SLOT_NO_POPCOUNT);

#if C_SEG_OFFSET_ALIGNMENT_BOUNDARY > 4
			/* output exceeded the budget; treat as incompressible */
			if (c_size > max_csize_adj) {
				c_size = -1;
			}
#endif
		} else {
			c_size = -1;
		}
		assert(ccodec == CCWK || ccodec == CCLZ4);
		cs->c_codec = ccodec;
#endif
	} else {
#if defined(__arm64__)
		cs->c_codec = CCWK;
		__unreachable_ok_push
		if (PAGE_SIZE == 4096) {
			c_size = WKdm_compress_4k((WK_word *)(uintptr_t)src, (WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
			    (WK_word *)(uintptr_t)scratch_buf, max_csize_adj);
		} else {
			c_size = WKdm_compress_16k((WK_word *)(uintptr_t)src, (WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
			    (WK_word *)(uintptr_t)scratch_buf, max_csize_adj);
		}
		__unreachable_ok_pop
#else
		c_size = WKdm_compress_new((const WK_word *)(uintptr_t)src, (WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
		    (WK_word *)(uintptr_t)scratch_buf, max_csize_adj);
#endif
	}
	assertf(((c_size <= max_csize_adj) && (c_size >= -1)),
	    "c_size invalid (%d, %d), cur compressions: %d", c_size, max_csize_adj, c_segment_pages_compressed);

	if (c_size == -1) {
		/* codec couldn't fit the page in the space remaining */
		if (max_csize < PAGE_SIZE) {
			/*
			 * the segment didn't have a full page of room left;
			 * close it out and retry with a fresh segment
			 */
			c_current_seg_filled(c_seg, current_chead);
			assert(*current_chead == NULL);

			lck_mtx_unlock_always(&c_seg->c_lock);
			/* TODO: it may be worth requiring codecs to distinguish
			 * between incompressible inputs and failures due to
			 * budget exhaustion.
			 */
			PAGE_REPLACEMENT_DISALLOWED(FALSE);
			goto retry;
		}
		/* genuinely incompressible: store the page uncompressed */
		c_size = PAGE_SIZE;

		/* metacompressor may have already copied the raw page in */
		if (incomp_copy == FALSE) {
			memcpy(&c_seg->c_store.c_buffer[cs->c_offset], src, c_size);
		}

		OSAddAtomic(1, &c_segment_noncompressible_pages);
	} else if (c_size == 0) {
		int             hash_index;

		/*
		 * special case - this is a page completely full of a single 32 bit value
		 */
		single_value = true;
		hash_index = c_segment_sv_hash_insert(*(uint32_t *)(uintptr_t)src);

		if (hash_index != -1) {
			/* parked in the SV hash: no segment space needed for this page */
			slot_ptr->s_cindx = hash_index;
			slot_ptr->s_cseg = C_SV_CSEG_ID;

			OSAddAtomic(1, &c_segment_svp_hash_succeeded);
#if RECORD_THE_COMPRESSED_DATA
			c_compressed_record_data(src, 4);
#endif
			goto sv_compression;
		}
		/* hash full: store just the 4-byte pattern in the segment */
		c_size = 4;

		memcpy(&c_seg->c_store.c_buffer[cs->c_offset], src, c_size);

		OSAddAtomic(1, &c_segment_svp_hash_failed);
	}

#if RECORD_THE_COMPRESSED_DATA
	c_compressed_record_data((char *)&c_seg->c_store.c_buffer[cs->c_offset], c_size);
#endif
#if CHECKSUM_THE_COMPRESSED_DATA
	cs->c_hash_compressed_data = vmc_hash((char *)&c_seg->c_store.c_buffer[cs->c_offset], c_size);
#endif
#if POPCOUNT_THE_COMPRESSED_DATA
	cs->c_pop_cdata = vmc_pop((uintptr_t) &c_seg->c_store.c_buffer[cs->c_offset], c_size);
#endif
	/* segment offsets are tracked at alignment-boundary granularity */
	c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;

	PACK_C_SIZE(cs, c_size);
	c_seg->c_bytes_used += c_rounded_size;
	c_seg->c_nextoffset += C_SEG_BYTES_TO_OFFSET(c_rounded_size);
	c_seg->c_slots_used++;

#if CONFIG_FREEZE
	/* TODO: should c_segment_pages_compressed be up here too? See 88598046 for details */
	OSAddAtomic(1, &c_segment_pages_compressed_incore);
	if (c_seg->c_has_donated_pages) {
		OSAddAtomic(1, &c_segment_pages_compressed_incore_late_swapout);
	}
#endif /* CONFIG_FREEZE */

	slot_ptr->s_cindx = c_seg->c_nextslot++;
	/* <csegno=0,indx=0> would mean "empty slot", so use csegno+1 */
	slot_ptr->s_cseg = c_seg->c_mysegno + 1;

sv_compression:
	/* close out the segment if it's out of space or out of slots */
	if (c_seg->c_nextoffset >= c_seg_off_limit || c_seg->c_nextslot >= C_SLOT_MAX_INDEX) {
		c_current_seg_filled(c_seg, current_chead);
		assert(*current_chead == NULL);
	}

	lck_mtx_unlock_always(&c_seg->c_lock);

	PAGE_REPLACEMENT_DISALLOWED(FALSE);

#if RECORD_THE_COMPRESSED_DATA
	if ((c_compressed_record_cptr - c_compressed_record_sbuf) >= c_seg_allocsize) {
		c_compressed_record_write(c_compressed_record_sbuf, (int)(c_compressed_record_cptr - c_compressed_record_sbuf));
		c_compressed_record_cptr = c_compressed_record_sbuf;
	}
#endif
	/* c_size == 0 here means the page went into the SV hash: no bytes used */
	if (c_size) {
		OSAddAtomic64(c_size, &c_segment_compressed_bytes);
		OSAddAtomic64(c_rounded_size, &compressor_bytes_used);
	}
	OSAddAtomic64(PAGE_SIZE, &c_segment_input_bytes);

	OSAddAtomic(1, &c_segment_pages_compressed);
#if DEVELOPMENT || DEBUG
	if (!compressor_running_perf_test) {
		/*
		 * The perf_compressor benchmark should not be able to trigger
		 * compressor thrashing jetsams.
		 */
		OSAddAtomic(1, &sample_period_compression_count);
	}
#else /* DEVELOPMENT || DEBUG */
	OSAddAtomic(1, &sample_period_compression_count);
#endif /* DEVELOPMENT || DEBUG */

	KERNEL_DEBUG(0xe0400000 | DBG_FUNC_END, *current_chead, c_size, c_segment_input_bytes, c_segment_compressed_bytes, 0);

	return 0;
}
4831 
/*
 * Fill one page at 'ddst' with the repeated 32-bit value 'pattern'
 * (reconstitutes a "single value" compressed page).
 * Uses the fastest platform-specific fill available; the generic
 * fallback is deliberately hand-unrolled (see comments below).
 */
static inline void
sv_decompress(int32_t *ddst, int32_t pattern)
{
//	assert(__builtin_constant_p(PAGE_SIZE) != 0);
#if defined(__x86_64__)
	memset_word(ddst, pattern, PAGE_SIZE / sizeof(int32_t));
#elif defined(__arm64__)
	assert((PAGE_SIZE % 128) == 0);
	if (pattern == 0) {
		/* zero fill can use the DC ZVA fast path */
		fill32_dczva((addr64_t)ddst, PAGE_SIZE);
	} else {
		fill32_nt((addr64_t)ddst, PAGE_SIZE, pattern);
	}
#else
	size_t          i;

	/* Unroll the pattern fill loop 4x to encourage the
	 * compiler to emit NEON stores, cf.
	 * <rdar://problem/25839866> Loop autovectorization
	 * anomalies.
	 */
	/* * We use separate loops for each PAGE_SIZE
	 * to allow the autovectorizer to engage, as PAGE_SIZE
	 * may not be a constant.
	 */

	__unreachable_ok_push
	if (PAGE_SIZE == 4096) {
		for (i = 0; i < (4096U / sizeof(int32_t)); i += 4) {
			*ddst++ = pattern;
			*ddst++ = pattern;
			*ddst++ = pattern;
			*ddst++ = pattern;
		}
	} else {
		assert(PAGE_SIZE == 16384);
		for (i = 0; i < (int)(16384U / sizeof(int32_t)); i += 4) {
			*ddst++ = pattern;
			*ddst++ = pattern;
			*ddst++ = pattern;
			*ddst++ = pattern;
		}
	}
	__unreachable_ok_pop
#endif
}
4878 
/*
 * Decompress and/or free the compressed page identified by '*slot_ptr'.
 *
 * dst != NULL: decompress the page into 'dst' (a mapped kernel VA).
 * dst == NULL: free the compressed copy without decompressing it.
 *
 * flags:
 *   C_KEEP       - don't release the compressed copy after decompressing
 *   C_DONT_BLOCK - return -2 instead of blocking on a busy segment or
 *                  a swapped-out segment
 *   C_KDP        - called from debugger context; requires C_KEEP and
 *                  C_DONT_BLOCK and must never block or take locks that
 *                  are already held
 *
 * '*zeroslot' is set to 0 whenever the caller must NOT clear the slot
 * mapping (slot kept, or the operation didn't complete).
 *
 * Returns:
 *   0  decompressed from a segment already in memory
 *   1  decompressed after first swapping the segment in
 *  -1  error (bad queue / codec failure)
 *  -2  would have blocked and C_DONT_BLOCK (or kdp constraints) applied
 */
static int
c_decompress_page(char *dst, volatile c_slot_mapping_t slot_ptr, int flags, int *zeroslot)
{
	c_slot_t        cs;
	c_segment_t     c_seg;
	uint32_t        c_segno;
	uint16_t        c_indx;
	int             c_rounded_size;
	uint32_t        c_size;
	int             retval = 0;
	boolean_t       need_unlock = TRUE;
	boolean_t       consider_defragmenting = FALSE;
	boolean_t       kdp_mode = FALSE;

	if (__improbable(flags & C_KDP)) {
		if (not_in_kdp) {
			panic("C_KDP passed to decompress page from outside of debugger context");
		}

		assert((flags & C_KEEP) == C_KEEP);
		assert((flags & C_DONT_BLOCK) == C_DONT_BLOCK);

		if ((flags & (C_DONT_BLOCK | C_KEEP)) != (C_DONT_BLOCK | C_KEEP)) {
			return -2;
		}

		kdp_mode = TRUE;
		*zeroslot = 0;
	}

ReTry:
	if (__probable(!kdp_mode)) {
		PAGE_REPLACEMENT_DISALLOWED(TRUE);
	} else {
		/* in kdp mode we can't wait for the master lock; bail instead */
		if (kdp_lck_rw_lock_is_acquired_exclusive(&c_master_lock)) {
			return -2;
		}
	}

#if HIBERNATION
	/*
	 * if hibernation is enabled, it indicates (via a call
	 * to 'vm_decompressor_lock' that no further
	 * decompressions are allowed once it reaches
	 * the point of flushing all of the currently dirty
	 * anonymous memory through the compressor and out
	 * to disk... in this state we allow freeing of compressed
	 * pages and must honor the C_DONT_BLOCK case
	 */
	if (__improbable(dst && decompressions_blocked == TRUE)) {
		if (flags & C_DONT_BLOCK) {
			if (__probable(!kdp_mode)) {
				PAGE_REPLACEMENT_DISALLOWED(FALSE);
			}

			*zeroslot = 0;
			return -2;
		}
		/*
		 * it's safe to atomically assert and block behind the
		 * lock held in shared mode because "decompressions_blocked" is
		 * only set and cleared and the thread_wakeup done when the lock
		 * is held exclusively
		 */
		assert_wait((event_t)&decompressions_blocked, THREAD_UNINT);

		PAGE_REPLACEMENT_DISALLOWED(FALSE);

		thread_block(THREAD_CONTINUE_NULL);

		goto ReTry;
	}
#endif
	/* s_cseg is actually "segno+1" */
	c_segno = slot_ptr->s_cseg - 1;

	if (__improbable(c_segno >= c_segments_available)) {
		panic("c_decompress_page: c_segno %d >= c_segments_available %d, slot_ptr(%p), slot_data(%x)",
		    c_segno, c_segments_available, slot_ptr, *(int *)((void *)slot_ptr));
	}

	/* an in-use c_segments[] entry holds a pointer, not a small free-list index */
	if (__improbable(c_segments[c_segno].c_segno < c_segments_available)) {
		panic("c_decompress_page: c_segno %d is free, slot_ptr(%p), slot_data(%x)",
		    c_segno, slot_ptr, *(int *)((void *)slot_ptr));
	}

	c_seg = c_segments[c_segno].c_seg;

	if (__probable(!kdp_mode)) {
		lck_mtx_lock_spin_always(&c_seg->c_lock);
	} else {
		if (kdp_lck_mtx_lock_spin_is_acquired(&c_seg->c_lock)) {
			return -2;
		}
	}

	assert(c_seg->c_state != C_IS_EMPTY && c_seg->c_state != C_IS_FREE);

	/* frees during swapin/swapout are allowed even while the seg is busy */
	if (dst == NULL && c_seg->c_busy_swapping) {
		assert(c_seg->c_busy);

		goto bypass_busy_check;
	}
	if (flags & C_DONT_BLOCK) {
		if (c_seg->c_busy || (C_SEG_IS_ONDISK(c_seg) && dst)) {
			*zeroslot = 0;

			retval = -2;
			goto done;
		}
	}
	if (c_seg->c_busy) {
		PAGE_REPLACEMENT_DISALLOWED(FALSE);

		c_seg_wait_on_busy(c_seg);

		goto ReTry;
	}
bypass_busy_check:

	c_indx = slot_ptr->s_cindx;

	if (__improbable(c_indx >= c_seg->c_nextslot)) {
		panic("c_decompress_page: c_indx %d >= c_nextslot %d, c_seg(%p), slot_ptr(%p), slot_data(%x)",
		    c_indx, c_seg->c_nextslot, c_seg, slot_ptr, *(int *)((void *)slot_ptr));
	}

	cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);

	c_size = UNPACK_C_SIZE(cs);

	if (__improbable(c_size == 0)) {
		panic("c_decompress_page: c_size == 0, c_seg(%p), slot_ptr(%p), slot_data(%x)",
		    c_seg, slot_ptr, *(int *)((void *)slot_ptr));
	}

	c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;

	if (dst) {
		uint32_t        age_of_cseg;
		clock_sec_t     cur_ts_sec;
		clock_nsec_t    cur_ts_nsec;

		if (C_SEG_IS_ONDISK(c_seg)) {
#if CONFIG_FREEZE
			/*
			 * before swapping the segment in, make sure doing so
			 * won't push us past the compressed-page or segment
			 * limits; if it would, trigger a jetsam and retry
			 */
			if (freezer_incore_cseg_acct) {
				if ((c_seg->c_slots_used + c_segment_pages_compressed_incore) >= c_segment_pages_compressed_nearing_limit) {
					PAGE_REPLACEMENT_DISALLOWED(FALSE);
					lck_mtx_unlock_always(&c_seg->c_lock);

					memorystatus_kill_on_VM_compressor_space_shortage(FALSE /* async */);

					goto ReTry;
				}

				uint32_t incore_seg_count = c_segment_count - c_swappedout_count - c_swappedout_sparse_count;
				if ((incore_seg_count + 1) >= c_segments_nearing_limit) {
					PAGE_REPLACEMENT_DISALLOWED(FALSE);
					lck_mtx_unlock_always(&c_seg->c_lock);

					memorystatus_kill_on_VM_compressor_space_shortage(FALSE /* async */);

					goto ReTry;
				}
			}
#endif /* CONFIG_FREEZE */
			assert(kdp_mode == FALSE);
			retval = c_seg_swapin(c_seg, FALSE, TRUE);
			assert(retval == 0);

			/* report "decompressed after swapin" to the caller */
			retval = 1;
		}
		if (c_seg->c_state == C_ON_BAD_Q) {
			assert(c_seg->c_store.c_buffer == NULL);
			*zeroslot = 0;

			retval = -1;
			goto done;
		}

#if POPCOUNT_THE_COMPRESSED_DATA
		unsigned csvpop;
		uintptr_t csvaddr = (uintptr_t) &c_seg->c_store.c_buffer[cs->c_offset];
		if (cs->c_pop_cdata != (csvpop = vmc_pop(csvaddr, c_size))) {
			panic("Compressed data popcount doesn't match original, bit distance: %d %p (phys: %p) %p %p 0x%x 0x%x 0x%x 0x%x", (csvpop - cs->c_pop_cdata), (void *)csvaddr, (void *) kvtophys(csvaddr), c_seg, cs, cs->c_offset, c_size, csvpop, cs->c_pop_cdata);
		}
#endif

#if CHECKSUM_THE_COMPRESSED_DATA
		unsigned csvhash;
		if (cs->c_hash_compressed_data != (csvhash = vmc_hash((char *)&c_seg->c_store.c_buffer[cs->c_offset], c_size))) {
			panic("Compressed data doesn't match original %p %p %u %u %u", c_seg, cs, c_size, cs->c_hash_compressed_data, csvhash);
		}
#endif
		if (c_rounded_size == PAGE_SIZE) {
			/*
			 * page wasn't compressible... just copy it out
			 */
			memcpy(dst, &c_seg->c_store.c_buffer[cs->c_offset], PAGE_SIZE);
		} else if (c_size == 4) {
			int32_t         data;
			int32_t         *dptr;

			/*
			 * page was populated with a single value
			 * that didn't fit into our fast hash
			 * so we packed it in as a single non-compressed value
			 * that we need to populate the page with
			 */
			dptr = (int32_t *)(uintptr_t)dst;
			data = *(int32_t *)(&c_seg->c_store.c_buffer[cs->c_offset]);
			sv_decompress(dptr, data);
		} else {
			uint32_t        my_cpu_no;
			char            *scratch_buf;

			if (__probable(!kdp_mode)) {
				/*
				 * we're behind the c_seg lock held in spin mode
				 * which means pre-emption is disabled... therefore
				 * the following sequence is atomic and safe
				 */
				my_cpu_no = cpu_number();

				assert(my_cpu_no < compressor_cpus);

				scratch_buf = &compressor_scratch_bufs[my_cpu_no * vm_compressor_get_decode_scratch_size()];
			} else {
				scratch_buf = kdp_compressor_scratch_buf;
			}

			if (vm_compressor_algorithm() != VM_COMPRESSOR_DEFAULT_CODEC) {
#if defined(__arm64__)
				uint16_t c_codec = cs->c_codec;
				uint32_t inline_popcount;
				if (!metadecompressor((const uint8_t *) &c_seg->c_store.c_buffer[cs->c_offset],
				    (uint8_t *)dst, c_size, c_codec, (void *)scratch_buf, &inline_popcount)) {
					retval = -1;
				} else {
					assert(inline_popcount == C_SLOT_NO_POPCOUNT);
				}
#endif
			} else {
#if defined(__arm64__)
				__unreachable_ok_push
				if (PAGE_SIZE == 4096) {
					WKdm_decompress_4k((WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
					    (WK_word *)(uintptr_t)dst, (WK_word *)(uintptr_t)scratch_buf, c_size);
				} else {
					WKdm_decompress_16k((WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
					    (WK_word *)(uintptr_t)dst, (WK_word *)(uintptr_t)scratch_buf, c_size);
				}
				__unreachable_ok_pop
#else
				WKdm_decompress_new((WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
				    (WK_word *)(uintptr_t)dst, (WK_word *)(uintptr_t)scratch_buf, c_size);
#endif
			}
		}

#if CHECKSUM_THE_DATA
		if (cs->c_hash_data != vmc_hash(dst, PAGE_SIZE)) {
#if defined(__arm64__)
			int32_t *dinput = &c_seg->c_store.c_buffer[cs->c_offset];
			panic("decompressed data doesn't match original cs: %p, hash: 0x%x, offset: %d, c_size: %d, c_rounded_size: %d, codec: %d, header: 0x%x 0x%x 0x%x", cs, cs->c_hash_data, cs->c_offset, c_size, c_rounded_size, cs->c_codec, *dinput, *(dinput + 1), *(dinput + 2));
#else
			panic("decompressed data doesn't match original cs: %p, hash: %d, offset: 0x%x, c_size: %d", cs, cs->c_hash_data, cs->c_offset, c_size);
#endif
		}
#endif
		/* sample decompression age stats (skip segs that were swapped in) */
		if (c_seg->c_swappedin_ts == 0 && !kdp_mode) {
			clock_get_system_nanotime(&cur_ts_sec, &cur_ts_nsec);

			age_of_cseg = (uint32_t)cur_ts_sec - c_seg->c_creation_ts;
			if (age_of_cseg < DECOMPRESSION_SAMPLE_MAX_AGE) {
				OSAddAtomic(1, &age_of_decompressions_during_sample_period[age_of_cseg]);
			} else {
				OSAddAtomic(1, &overage_decompressions_during_sample_period);
			}

			OSAddAtomic(1, &sample_period_decompression_count);
		}
	}
#if CONFIG_FREEZE
	else {
		/*
		 * We are freeing an uncompressed page from this c_seg and so balance the ledgers.
		 */
		if (C_SEG_IS_ONDISK(c_seg)) {
			/*
			 * The compression sweep feature will push out anonymous pages to disk
			 * without going through the freezer path and so those c_segs, while
			 * swapped out, won't have an owner.
			 */
			if (c_seg->c_task_owner) {
				task_update_frozen_to_swap_acct(c_seg->c_task_owner, PAGE_SIZE_64, DEBIT_FROM_SWAP);
			}

			/*
			 * We are freeing a page in swap without swapping it in. We bump the in-core
			 * count here to simulate a swapin of a page so that we can accurately
			 * decrement it below.
			 */
			OSAddAtomic(1, &c_segment_pages_compressed_incore);
			if (c_seg->c_has_donated_pages) {
				OSAddAtomic(1, &c_segment_pages_compressed_incore_late_swapout);
			}
		} else if (c_seg->c_state == C_ON_BAD_Q) {
			assert(c_seg->c_store.c_buffer == NULL);
			*zeroslot = 0;

			retval = -1;
			goto done;
		}
	}
#endif /* CONFIG_FREEZE */

	if (flags & C_KEEP) {
		*zeroslot = 0;
		goto done;
	}
	assert(kdp_mode == FALSE);

	/* release the slot: everything below here frees the compressed copy */
	c_seg->c_bytes_unused += c_rounded_size;
	c_seg->c_bytes_used -= c_rounded_size;

	assert(c_seg->c_slots_used);
	c_seg->c_slots_used--;
	if (dst && c_seg->c_swappedin) {
		task_t task = current_task();
		if (task) {
			ledger_credit(task->ledger, task_ledgers.swapins, PAGE_SIZE);
		}
	}

	PACK_C_SIZE(cs, 0);

	if (c_indx < c_seg->c_firstemptyslot) {
		c_seg->c_firstemptyslot = c_indx;
	}

	OSAddAtomic(-1, &c_segment_pages_compressed);
#if CONFIG_FREEZE
	OSAddAtomic(-1, &c_segment_pages_compressed_incore);
	assertf(c_segment_pages_compressed_incore >= 0, "-ve incore count %p 0x%x", c_seg, c_segment_pages_compressed_incore);
	if (c_seg->c_has_donated_pages) {
		OSAddAtomic(-1, &c_segment_pages_compressed_incore_late_swapout);
		assertf(c_segment_pages_compressed_incore_late_swapout >= 0, "-ve lateswapout count %p 0x%x", c_seg, c_segment_pages_compressed_incore_late_swapout);
	}
#endif /* CONFIG_FREEZE */

	if (c_seg->c_state != C_ON_BAD_Q && !(C_SEG_IS_ONDISK(c_seg))) {
		/*
		 * C_SEG_IS_ONDISK == TRUE can occur when we're doing a
		 * free of a compressed page (i.e. dst == NULL)
		 */
		OSAddAtomic64(-c_rounded_size, &compressor_bytes_used);
	}
	if (c_seg->c_busy_swapping) {
		/*
		 * bypass case for c_busy_swapping...
		 * let the swapin/swapout paths deal with putting
		 * the c_seg on the minor compaction queue if needed
		 */
		assert(c_seg->c_busy);
		goto done;
	}
	assert(!c_seg->c_busy);

	/* housekeeping: depopulate/requeue the segment if this free emptied it */
	if (c_seg->c_state != C_IS_FILLING) {
		if (c_seg->c_bytes_used == 0) {
			if (!(C_SEG_IS_ONDISK(c_seg))) {
				int     pages_populated;

				pages_populated = (round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset))) / PAGE_SIZE;
				c_seg->c_populated_offset = C_SEG_BYTES_TO_OFFSET(0);

				if (pages_populated) {
					assert(c_seg->c_state != C_ON_BAD_Q);
					assert(c_seg->c_store.c_buffer != NULL);

					/* must drop the seg lock (busy protects us) to depopulate */
					C_SEG_BUSY(c_seg);
					lck_mtx_unlock_always(&c_seg->c_lock);

					kernel_memory_depopulate(
						(vm_offset_t) c_seg->c_store.c_buffer,
						ptoa(pages_populated),
						KMA_COMPRESSOR, VM_KERN_MEMORY_COMPRESSOR);

					lck_mtx_lock_spin_always(&c_seg->c_lock);
					C_SEG_WAKEUP_DONE(c_seg);
				}
				if (!c_seg->c_on_minorcompact_q && c_seg->c_state != C_ON_SWAPIO_Q) {
					if (c_seg->c_state == C_ON_SWAPOUT_Q) {
						/* pull the now-empty seg back off the swapout queue */
						bool clear_busy = false;
						if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
							/* honor lock ordering: list lock before seg lock */
							C_SEG_BUSY(c_seg);

							lck_mtx_unlock_always(&c_seg->c_lock);
							lck_mtx_lock_spin_always(c_list_lock);
							lck_mtx_lock_spin_always(&c_seg->c_lock);
							clear_busy = true;
						}
						c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
						if (clear_busy) {
							C_SEG_WAKEUP_DONE(c_seg);
							clear_busy = false;
						}
						lck_mtx_unlock_always(c_list_lock);
					}
					c_seg_need_delayed_compaction(c_seg, FALSE);
				}
			} else {
				if (c_seg->c_state != C_ON_SWAPPEDOUTSPARSE_Q) {
					c_seg_move_to_sparse_list(c_seg);
					consider_defragmenting = TRUE;
				}
			}
		} else if (c_seg->c_on_minorcompact_q) {
			assert(c_seg->c_state != C_ON_BAD_Q);
			assert(!C_SEG_IS_ON_DISK_OR_SOQ(c_seg));

			if (C_SEG_SHOULD_MINORCOMPACT_NOW(c_seg)) {
				/* drops the seg lock on our behalf */
				c_seg_try_minor_compaction_and_unlock(c_seg);
				need_unlock = FALSE;
			}
		} else if (!(C_SEG_IS_ONDISK(c_seg))) {
			if (c_seg->c_state != C_ON_BAD_Q && c_seg->c_state != C_ON_SWAPOUT_Q && c_seg->c_state != C_ON_SWAPIO_Q &&
			    C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) {
				c_seg_need_delayed_compaction(c_seg, FALSE);
			}
		} else if (c_seg->c_state != C_ON_SWAPPEDOUTSPARSE_Q && C_SEG_ONDISK_IS_SPARSE(c_seg)) {
			c_seg_move_to_sparse_list(c_seg);
			consider_defragmenting = TRUE;
		}
	}
done:
	/* in kdp mode we never took the locks, so there's nothing to release */
	if (__improbable(kdp_mode)) {
		return retval;
	}

	if (need_unlock == TRUE) {
		lck_mtx_unlock_always(&c_seg->c_lock);
	}

	PAGE_REPLACEMENT_DISALLOWED(FALSE);

	if (consider_defragmenting == TRUE) {
		vm_swap_consider_defragmenting(VM_SWAP_FLAGS_NONE);
	}

#if !XNU_TARGET_OS_OSX
	if ((c_minor_count && COMPRESSOR_NEEDS_TO_MINOR_COMPACT()) || vm_compressor_needs_to_major_compact()) {
		vm_wake_compactor_swapper();
	}
#endif /* !XNU_TARGET_OS_OSX */

	return retval;
}
5338 
5339 
5340 int
vm_compressor_get(ppnum_t pn,int * slot,int flags)5341 vm_compressor_get(ppnum_t pn, int *slot, int flags)
5342 {
5343 	c_slot_mapping_t  slot_ptr;
5344 	char    *dst;
5345 	int     zeroslot = 1;
5346 	int     retval;
5347 
5348 	dst = pmap_map_compressor_page(pn);
5349 	slot_ptr = (c_slot_mapping_t)slot;
5350 
5351 	assert(dst != NULL);
5352 
5353 	if (slot_ptr->s_cseg == C_SV_CSEG_ID) {
5354 		int32_t         data;
5355 		int32_t         *dptr;
5356 
5357 		/*
5358 		 * page was populated with a single value
5359 		 * that found a home in our hash table
5360 		 * grab that value from the hash and populate the page
5361 		 * that we need to populate the page with
5362 		 */
5363 		dptr = (int32_t *)(uintptr_t)dst;
5364 		data = c_segment_sv_hash_table[slot_ptr->s_cindx].he_data;
5365 		sv_decompress(dptr, data);
5366 		if (!(flags & C_KEEP)) {
5367 			c_segment_sv_hash_drop_ref(slot_ptr->s_cindx);
5368 
5369 			OSAddAtomic(-1, &c_segment_pages_compressed);
5370 			*slot = 0;
5371 		}
5372 		if (data) {
5373 			OSAddAtomic(1, &c_segment_svp_nonzero_decompressions);
5374 		} else {
5375 			OSAddAtomic(1, &c_segment_svp_zero_decompressions);
5376 		}
5377 
5378 		pmap_unmap_compressor_page(pn, dst);
5379 		return 0;
5380 	}
5381 
5382 	retval = c_decompress_page(dst, slot_ptr, flags, &zeroslot);
5383 
5384 	/*
5385 	 * zeroslot will be set to 0 by c_decompress_page if (flags & C_KEEP)
5386 	 * or (flags & C_DONT_BLOCK) and we found 'c_busy' or 'C_SEG_IS_ONDISK' to be TRUE
5387 	 */
5388 	if (zeroslot) {
5389 		*slot = 0;
5390 	}
5391 
5392 	pmap_unmap_compressor_page(pn, dst);
5393 
5394 	/*
5395 	 * returns 0 if we successfully decompressed a page from a segment already in memory
5396 	 * returns 1 if we had to first swap in the segment, before successfully decompressing the page
5397 	 * returns -1 if we encountered an error swapping in the segment - decompression failed
5398 	 * returns -2 if (flags & C_DONT_BLOCK) and we found 'c_busy' or 'C_SEG_IS_ONDISK' to be true
5399 	 */
5400 	return retval;
5401 }
5402 
5403 #if DEVELOPMENT || DEBUG
5404 
/*
 * Test-only hook (DEVELOPMENT || DEBUG): flip one bit in the compressed
 * data backing '*slot' so that a later decompression detects corruption.
 *
 * The segment may be write-protected, so it is marked busy, temporarily
 * made writable, mutated, and then re-protected.  SV-hash slots and
 * on-disk/swapout-queue segments are skipped.
 */
void
vm_compressor_inject_error(int *slot)
{
	c_slot_mapping_t slot_ptr = (c_slot_mapping_t)slot;

	/* No error detection for single-value compression. */
	if (slot_ptr->s_cseg == C_SV_CSEG_ID) {
		printf("%s(): cannot inject errors in SV-compressed pages\n", __func__ );
		return;
	}

	/* s_cseg is actually "segno+1" */
	const uint32_t c_segno = slot_ptr->s_cseg - 1;

	assert(c_segno < c_segments_available);
	assert(c_segments[c_segno].c_segno >= c_segments_available);

	const c_segment_t c_seg = c_segments[c_segno].c_seg;

	PAGE_REPLACEMENT_DISALLOWED(TRUE);

	lck_mtx_lock_spin_always(&c_seg->c_lock);
	assert(c_seg->c_state != C_IS_EMPTY && c_seg->c_state != C_IS_FREE);

	const uint16_t c_indx = slot_ptr->s_cindx;
	assert(c_indx < c_seg->c_nextslot);

	/*
	 * To safely make this segment temporarily writable, we need to mark
	 * the segment busy, which allows us to release the segment lock.
	 */
	while (c_seg->c_busy) {
		/* c_seg_wait_on_busy() drops the seg lock; re-take it and re-check */
		c_seg_wait_on_busy(c_seg);
		lck_mtx_lock_spin_always(&c_seg->c_lock);
	}
	C_SEG_BUSY(c_seg);

	/* a segment still being filled is already mapped writable */
	bool already_writable = (c_seg->c_state == C_IS_FILLING);
	if (!already_writable) {
		/*
		 * Protection update must be performed preemptibly, so temporarily drop
		 * the lock. Having set c_busy will prevent most other concurrent
		 * operations.
		 */
		lck_mtx_unlock_always(&c_seg->c_lock);
		C_SEG_MAKE_WRITEABLE(c_seg);
		lck_mtx_lock_spin_always(&c_seg->c_lock);
	}

	/*
	 * Once we've released the lock following our c_state == C_IS_FILLING check,
	 * c_current_seg_filled() can (re-)write-protect the segment. However, it
	 * will transition from C_IS_FILLING before releasing the c_seg lock, so we
	 * can detect this by re-checking after we've reobtained the lock.
	 */
	if (already_writable && c_seg->c_state != C_IS_FILLING) {
		lck_mtx_unlock_always(&c_seg->c_lock);
		C_SEG_MAKE_WRITEABLE(c_seg);
		lck_mtx_lock_spin_always(&c_seg->c_lock);
		already_writable = false;
		/* Segment can't be freed while c_busy is set. */
		assert(c_seg->c_state != C_IS_FILLING);
	}

	/*
	 * Skip if the segment is on disk. This check can only be performed after
	 * the final acquisition of the segment lock before we attempt to write to
	 * the segment.
	 */
	if (!C_SEG_IS_ON_DISK_OR_SOQ(c_seg)) {
		c_slot_t cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
		int32_t *data = &c_seg->c_store.c_buffer[cs->c_offset];
		/* assume that the compressed data holds at least one int32_t */
		assert(UNPACK_C_SIZE(cs) > sizeof(*data));
		/*
		 * This bit is known to be in the payload of a MISS packet resulting from
		 * the pattern used in the test pattern from decompression_failure.c.
		 * Flipping it should result in many corrupted bits in the test page.
		 */
		data[0] ^= 0x00000100;
	}

	/* restore the original protection if we made the segment writable */
	if (!already_writable) {
		lck_mtx_unlock_always(&c_seg->c_lock);
		C_SEG_WRITE_PROTECT(c_seg);
		lck_mtx_lock_spin_always(&c_seg->c_lock);
	}

	C_SEG_WAKEUP_DONE(c_seg);
	lck_mtx_unlock_always(&c_seg->c_lock);

	PAGE_REPLACEMENT_DISALLOWED(FALSE);
}
5498 
5499 #endif /* DEVELOPMENT || DEBUG */
5500 
5501 int
vm_compressor_free(int * slot,int flags)5502 vm_compressor_free(int *slot, int flags)
5503 {
5504 	c_slot_mapping_t  slot_ptr;
5505 	int     zeroslot = 1;
5506 	int     retval;
5507 
5508 	assert(flags == 0 || flags == C_DONT_BLOCK);
5509 
5510 	slot_ptr = (c_slot_mapping_t)slot;
5511 
5512 	if (slot_ptr->s_cseg == C_SV_CSEG_ID) {
5513 		c_segment_sv_hash_drop_ref(slot_ptr->s_cindx);
5514 		OSAddAtomic(-1, &c_segment_pages_compressed);
5515 
5516 		*slot = 0;
5517 		return 0;
5518 	}
5519 	retval = c_decompress_page(NULL, slot_ptr, flags, &zeroslot);
5520 	/*
5521 	 * returns 0 if we successfully freed the specified compressed page
5522 	 * returns -1 if we encountered an error swapping in the segment - decompression failed
5523 	 * returns -2 if (flags & C_DONT_BLOCK) and we found 'c_busy' set
5524 	 */
5525 
5526 	if (retval == 0) {
5527 		*slot = 0;
5528 	}
5529 
5530 	return retval;
5531 }
5532 
5533 
5534 int
vm_compressor_put(ppnum_t pn,int * slot,void ** current_chead,char * scratch_buf)5535 vm_compressor_put(ppnum_t pn, int *slot, void  **current_chead, char *scratch_buf)
5536 {
5537 	char    *src;
5538 	int     retval;
5539 
5540 	src = pmap_map_compressor_page(pn);
5541 	assert(src != NULL);
5542 
5543 	retval = c_compress_page(src, (c_slot_mapping_t)slot, (c_segment_t *)current_chead, scratch_buf);
5544 	pmap_unmap_compressor_page(pn, src);
5545 
5546 	return retval;
5547 }
5548 
/*
 * Move a compressed-page handle from *src_slot_p to *dst_slot_p,
 * updating the c_slot's packed back-pointer so it references the new
 * handle location.  On return *dst_slot_p holds the handle and
 * *src_slot_p is zeroed.
 */
void
vm_compressor_transfer(
	int     *dst_slot_p,
	int     *src_slot_p)
{
	c_slot_mapping_t        dst_slot, src_slot;
	c_segment_t             c_seg;
	uint16_t                c_indx;
	c_slot_t                cs;

	src_slot = (c_slot_mapping_t) src_slot_p;

	if (src_slot->s_cseg == C_SV_CSEG_ID) {
		/*
		 * Single-value page: not backed by a c_segment, so there is
		 * no back-pointer to fix up — just move the handle.
		 */
		*dst_slot_p = *src_slot_p;
		*src_slot_p = 0;
		return;
	}
	dst_slot = (c_slot_mapping_t) dst_slot_p;
Retry:
	PAGE_REPLACEMENT_DISALLOWED(TRUE);
	/* get segment for src_slot */
	c_seg = c_segments[src_slot->s_cseg - 1].c_seg;
	/* lock segment */
	lck_mtx_lock_spin_always(&c_seg->c_lock);
	/* wait if it's busy */
	if (c_seg->c_busy && !c_seg->c_busy_swapping) {
		/*
		 * Must drop the page-replacement hold before sleeping in
		 * c_seg_wait_on_busy() (which also drops the c_lock), then
		 * restart the lookup from scratch.
		 */
		PAGE_REPLACEMENT_DISALLOWED(FALSE);
		c_seg_wait_on_busy(c_seg);
		goto Retry;
	}
	/* find the c_slot */
	c_indx = src_slot->s_cindx;
	cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
	/* point the c_slot back to dst_slot instead of src_slot */
	C_SLOT_ASSERT_PACKABLE(dst_slot);
	cs->c_packed_ptr = C_SLOT_PACK_PTR(dst_slot);
	/* transfer */
	*dst_slot_p = *src_slot_p;
	*src_slot_p = 0;
	lck_mtx_unlock_always(&c_seg->c_lock);
	PAGE_REPLACEMENT_DISALLOWED(FALSE);
}
5591 
5592 #if defined(__arm64__)
5593 extern clock_sec_t             vm_swapfile_last_failed_to_create_ts;
__attribute__((noreturn))
void
vm_panic_hibernate_write_image_failed(int err)
{
	/*
	 * Panic with a snapshot of compressor/swap state to aid triage of
	 * hibernation image-write failures: compressor page count, wired
	 * pages, per-queue c_seg counts, swapfile state, and the
	 * no-swapspace / flush-timeout indicators, in format-string order.
	 */
	panic("hibernate_write_image encountered error 0x%x - %u, %u, %d, %d, %d, %d, %d, %d, %d, %d, %llu, %d, %d, %d\n",
	    err,
	    VM_PAGE_COMPRESSOR_COUNT, vm_page_wire_count,
	    c_age_count, c_major_count, c_minor_count, (c_early_swapout_count + c_regular_swapout_count + c_late_swapout_count), c_swappedout_sparse_count,
	    vm_num_swap_files, vm_num_pinned_swap_files, vm_swappin_enabled, vm_swap_put_failures,
	    (vm_swapfile_last_failed_to_create_ts ? 1:0), hibernate_no_swapspace, hibernate_flush_timed_out);
}
5605 #endif /*(__arm64__)*/
5606 
5607 #if CONFIG_FREEZE
5608 
/* Stat: number of fill segments closed out via vm_compressor_finished_filling(). */
int     freezer_finished_filling = 0;
5610 
5611 void
vm_compressor_finished_filling(void ** current_chead)5612 vm_compressor_finished_filling(
5613 	void    **current_chead)
5614 {
5615 	c_segment_t     c_seg;
5616 
5617 	if ((c_seg = *(c_segment_t *)current_chead) == NULL) {
5618 		return;
5619 	}
5620 
5621 	assert(c_seg->c_state == C_IS_FILLING);
5622 
5623 	lck_mtx_lock_spin_always(&c_seg->c_lock);
5624 
5625 	c_current_seg_filled(c_seg, (c_segment_t *)current_chead);
5626 
5627 	lck_mtx_unlock_always(&c_seg->c_lock);
5628 
5629 	freezer_finished_filling++;
5630 }
5631 
5632 
5633 /*
5634  * This routine is used to transfer the compressed chunks from
5635  * the c_seg/cindx pointed to by slot_p into a new c_seg headed
5636  * by the current_chead and a new cindx within that c_seg.
5637  *
5638  * Currently, this routine is only used by the "freezer backed by
5639  * compressor with swap" mode to create a series of c_segs that
5640  * only contain compressed data belonging to one task. So, we
5641  * move a task's previously compressed data into a set of new
5642  * c_segs which will also hold the task's yet to be compressed data.
5643  */
5644 
/*
 * Relocate the compressed data referenced by *slot_p out of its current
 * c_seg and into the freezer's fill segment (*current_chead), updating
 * the slot handle to point at the new <c_seg, c_indx> location.
 *
 * Returns KERN_SUCCESS (also when the page is skipped because its source
 * segment is on disk / being swapped / still filling), or
 * KERN_RESOURCE_SHORTAGE if no destination segment could be allocated.
 */
kern_return_t
vm_compressor_relocate(
	void            **current_chead,
	int             *slot_p)
{
	c_slot_mapping_t        slot_ptr;
	c_slot_mapping_t        src_slot;
	uint32_t                c_rounded_size;
	uint32_t                c_size;
	uint16_t                dst_slot;
	c_slot_t                c_dst;
	c_slot_t                c_src;
	uint16_t                c_indx;
	c_segment_t             c_seg_dst = NULL;
	c_segment_t             c_seg_src = NULL;
	kern_return_t           kr = KERN_SUCCESS;


	src_slot = (c_slot_mapping_t) slot_p;

	if (src_slot->s_cseg == C_SV_CSEG_ID) {
		/*
		 * no need to relocate... this is a page full of a single
		 * value which is hashed to a single entry not contained
		 * in a c_segment_t
		 */
		return kr;
	}

Relookup_dst:
	c_seg_dst = c_seg_allocate((c_segment_t *)current_chead);
	/*
	 * returns with c_seg lock held
	 * and PAGE_REPLACEMENT_DISALLOWED(TRUE)...
	 * c_nextslot has been allocated and
	 * c_store.c_buffer populated
	 */
	if (c_seg_dst == NULL) {
		/*
		 * Out of compression segments?
		 */
		kr = KERN_RESOURCE_SHORTAGE;
		goto out;
	}

	assert(c_seg_dst->c_busy == 0);

	/* Mark dst busy so we can safely drop its lock while copying. */
	C_SEG_BUSY(c_seg_dst);

	/* Remember the dst slot index now; c_nextslot may advance below. */
	dst_slot = c_seg_dst->c_nextslot;

	lck_mtx_unlock_always(&c_seg_dst->c_lock);

Relookup_src:
	c_seg_src = c_segments[src_slot->s_cseg - 1].c_seg;

	assert(c_seg_dst != c_seg_src);

	lck_mtx_lock_spin_always(&c_seg_src->c_lock);

	if (C_SEG_IS_ON_DISK_OR_SOQ(c_seg_src) ||
	    c_seg_src->c_state == C_IS_FILLING) {
		/*
		 * Skip this page if :-
		 * a) the src c_seg is already on-disk (or on its way there)
		 *    A "thaw" can mark a process as eligible for
		 * another freeze cycle without bringing any of
		 * its swapped out c_segs back from disk (because
		 * that is done on-demand).
		 *    Or, this page may be mapped elsewhere in the task's map,
		 * and we may have marked it for swap already.
		 *
		 * b) Or, the src c_seg is being filled by the compressor
		 * thread. We don't want the added latency of waiting for
		 * this c_seg in the freeze path and so we skip it.
		 */

		PAGE_REPLACEMENT_DISALLOWED(FALSE);

		lck_mtx_unlock_always(&c_seg_src->c_lock);

		c_seg_src = NULL;

		/* kr stays KERN_SUCCESS: skipping is not an error. */
		goto out;
	}

	if (c_seg_src->c_busy) {
		/*
		 * Drop the page-replacement hold before sleeping, then
		 * re-look-up the src segment — it may have moved or been
		 * freed while we waited.
		 */
		PAGE_REPLACEMENT_DISALLOWED(FALSE);
		c_seg_wait_on_busy(c_seg_src);

		c_seg_src = NULL;

		PAGE_REPLACEMENT_DISALLOWED(TRUE);

		goto Relookup_src;
	}

	C_SEG_BUSY(c_seg_src);

	lck_mtx_unlock_always(&c_seg_src->c_lock);

	PAGE_REPLACEMENT_DISALLOWED(FALSE);

	/* find the c_slot */
	c_indx = src_slot->s_cindx;

	c_src = C_SEG_SLOT_FROM_INDEX(c_seg_src, c_indx);

	c_size = UNPACK_C_SIZE(c_src);

	assert(c_size);

	if (c_size > (uint32_t)(c_seg_bufsize - C_SEG_OFFSET_TO_BYTES((int32_t)c_seg_dst->c_nextoffset))) {
		/*
		 * This segment is full. We need a new one.
		 */

		PAGE_REPLACEMENT_DISALLOWED(TRUE);

		/* Release the src segment first (we haven't touched it). */
		lck_mtx_lock_spin_always(&c_seg_src->c_lock);
		C_SEG_WAKEUP_DONE(c_seg_src);
		lck_mtx_unlock_always(&c_seg_src->c_lock);

		c_seg_src = NULL;

		/* Close out the full dst segment and retry with a fresh one. */
		lck_mtx_lock_spin_always(&c_seg_dst->c_lock);

		assert(c_seg_dst->c_busy);
		assert(c_seg_dst->c_state == C_IS_FILLING);
		assert(!c_seg_dst->c_on_minorcompact_q);

		c_current_seg_filled(c_seg_dst, (c_segment_t *)current_chead);
		assert(*current_chead == NULL);

		C_SEG_WAKEUP_DONE(c_seg_dst);

		lck_mtx_unlock_always(&c_seg_dst->c_lock);

		c_seg_dst = NULL;

		PAGE_REPLACEMENT_DISALLOWED(FALSE);

		goto Relookup_dst;
	}

	/* Copy the compressed bytes into the dst segment's buffer. */
	c_dst = C_SEG_SLOT_FROM_INDEX(c_seg_dst, c_seg_dst->c_nextslot);

	memcpy(&c_seg_dst->c_store.c_buffer[c_seg_dst->c_nextoffset], &c_seg_src->c_store.c_buffer[c_src->c_offset], c_size);
	/*
	 * Is platform alignment actually necessary since wkdm aligns its output?
	 */
	c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;

	cslot_copy(c_dst, c_src);
	c_dst->c_offset = c_seg_dst->c_nextoffset;

	if (c_seg_dst->c_firstemptyslot == c_seg_dst->c_nextslot) {
		c_seg_dst->c_firstemptyslot++;
	}

	/* Account the new slot in dst... */
	c_seg_dst->c_slots_used++;
	c_seg_dst->c_nextslot++;
	c_seg_dst->c_bytes_used += c_rounded_size;
	c_seg_dst->c_nextoffset += C_SEG_BYTES_TO_OFFSET(c_rounded_size);


	/* ...and free the slot in src (size 0 marks it empty). */
	PACK_C_SIZE(c_src, 0);

	c_seg_src->c_bytes_used -= c_rounded_size;
	c_seg_src->c_bytes_unused += c_rounded_size;

	assert(c_seg_src->c_slots_used);
	c_seg_src->c_slots_used--;

	if (!c_seg_src->c_swappedin) {
		/* Pessimistically lose swappedin status when non-swappedin pages are added. */
		c_seg_dst->c_swappedin = false;
	}

	if (c_indx < c_seg_src->c_firstemptyslot) {
		c_seg_src->c_firstemptyslot = c_indx;
	}

	c_dst = C_SEG_SLOT_FROM_INDEX(c_seg_dst, dst_slot);

	/* Re-point the slot handle at its new <c_seg, c_indx> home. */
	PAGE_REPLACEMENT_ALLOWED(TRUE);
	slot_ptr = C_SLOT_UNPACK_PTR(c_dst);
	/* <csegno=0,indx=0> would mean "empty slot", so use csegno+1 */
	slot_ptr->s_cseg = c_seg_dst->c_mysegno + 1;
	slot_ptr->s_cindx = dst_slot;

	PAGE_REPLACEMENT_ALLOWED(FALSE);

out:
	if (c_seg_src) {
		lck_mtx_lock_spin_always(&c_seg_src->c_lock);

		C_SEG_WAKEUP_DONE(c_seg_src);

		/* If we emptied src, queue it for minor compaction/reclaim. */
		if (c_seg_src->c_bytes_used == 0 && c_seg_src->c_state != C_IS_FILLING) {
			if (!c_seg_src->c_on_minorcompact_q) {
				c_seg_need_delayed_compaction(c_seg_src, FALSE);
			}
		}

		lck_mtx_unlock_always(&c_seg_src->c_lock);
	}

	if (c_seg_dst) {
		PAGE_REPLACEMENT_DISALLOWED(TRUE);

		lck_mtx_lock_spin_always(&c_seg_dst->c_lock);

		if (c_seg_dst->c_nextoffset >= c_seg_off_limit || c_seg_dst->c_nextslot >= C_SLOT_MAX_INDEX) {
			/*
			 * Nearing or exceeded maximum slot and offset capacity.
			 */
			assert(c_seg_dst->c_busy);
			assert(c_seg_dst->c_state == C_IS_FILLING);
			assert(!c_seg_dst->c_on_minorcompact_q);

			c_current_seg_filled(c_seg_dst, (c_segment_t *)current_chead);
			assert(*current_chead == NULL);
		}

		C_SEG_WAKEUP_DONE(c_seg_dst);

		lck_mtx_unlock_always(&c_seg_dst->c_lock);

		c_seg_dst = NULL;

		PAGE_REPLACEMENT_DISALLOWED(FALSE);
	}

	return kr;
}
5881 #endif /* CONFIG_FREEZE */
5882