xref: /xnu-8796.121.2/osfmk/vm/vm_compressor.c (revision c54f35ca767986246321eb901baf8f5ff7923f6a)
1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <vm/vm_compressor.h>
30 
31 #if CONFIG_PHANTOM_CACHE
32 #include <vm/vm_phantom_cache.h>
33 #endif
34 
35 #include <vm/vm_map.h>
36 #include <vm/vm_pageout.h>
37 #include <vm/memory_object.h>
38 #include <vm/vm_compressor_algorithms.h>
39 #include <vm/vm_compressor_backing_store.h>
40 #include <vm/vm_fault.h>
41 #include <vm/vm_protos.h>
42 #include <mach/mach_host.h>             /* for host_info() */
43 #if DEVELOPMENT || DEBUG
44 #include <kern/hvg_hypercall.h>
45 #endif
46 #include <kern/ledger.h>
47 #include <kern/policy_internal.h>
48 #include <kern/thread_group.h>
49 #include <san/kasan.h>
50 #include <os/log.h>
51 #include <pexpert/pexpert.h>
52 #include <pexpert/device_tree.h>
53 
54 #if defined(__x86_64__)
55 #include <i386/misc_protos.h>
56 #endif
57 #if defined(__arm64__)
58 #include <arm/machine_routines.h>
59 #endif
60 
61 #include <IOKit/IOHibernatePrivate.h>
62 
63 /*
64  * The segment buffer size is a tradeoff.
65  * A larger buffer leads to faster I/O throughput, better compression ratios
66  * (since fewer bytes are wasted at the end of the segment),
67  * and less overhead (both in time and space).
68  * However, a smaller buffer causes less swap when the system is overcommitted
69  * because a higher percentage of each swapped-in segment is actually accessed
70  * before it goes back out to storage.
71  *
72  * So on systems without swap, a larger segment is a clear win.
73  * On systems with swap, the choice is murkier. Empirically, we've
74  * found that a 64KB segment provides a better tradeoff both in terms of
75  * performance and swap writes than a 256KB segment on systems with fast SSDs
76  * and a HW compression block.
77  */
78 #define C_SEG_BUFSIZE_ARM_SWAP (1024 * 64)
79 #if XNU_TARGET_OS_OSX && defined(__arm64__)
80 #define C_SEG_BUFSIZE_DEFAULT C_SEG_BUFSIZE_ARM_SWAP
81 #else
82 #define C_SEG_BUFSIZE_DEFAULT (1024 * 256)
83 #endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */
84 uint32_t c_seg_bufsize;
85 
86 uint32_t c_seg_max_pages, c_seg_off_limit, c_seg_allocsize, c_seg_slot_var_array_min_len;
87 
88 extern boolean_t vm_darkwake_mode;
89 extern zone_t vm_page_zone;
90 
91 #if DEVELOPMENT || DEBUG
92 /* sysctl defined in bsd/dev/arm64/sysctl.c */
93 int do_cseg_wedge_thread(void);
94 int do_cseg_unwedge_thread(void);
95 static event_t debug_cseg_wait_event = NULL;
96 #endif /* DEVELOPMENT || DEBUG */
97 
98 #if CONFIG_FREEZE
99 bool freezer_incore_cseg_acct = TRUE; /* Only count incore compressed memory for jetsams. */
100 void task_disown_frozen_csegs(task_t owner_task);
101 #endif /* CONFIG_FREEZE */
102 
103 #if POPCOUNT_THE_COMPRESSED_DATA
104 boolean_t popcount_c_segs = TRUE;
105 
106 static inline uint32_t
107 vmc_pop(uintptr_t ins, int sz)
108 {
109 	uint32_t rv = 0;
110 
111 	if (__probable(popcount_c_segs == FALSE)) {
112 		return 0xDEAD707C;
113 	}
114 
115 	while (sz >= 16) {
116 		uint32_t rv1, rv2;
117 		uint64_t *ins64 = (uint64_t *) ins;
118 		uint64_t *ins642 = (uint64_t *) (ins + 8);
119 		rv1 = __builtin_popcountll(*ins64);
120 		rv2 = __builtin_popcountll(*ins642);
121 		rv += rv1 + rv2;
122 		sz -= 16;
123 		ins += 16;
124 	}
125 
126 	while (sz >= 4) {
127 		uint32_t *ins32 = (uint32_t *) ins;
128 		rv += __builtin_popcount(*ins32);
129 		sz -= 4;
130 		ins += 4;
131 	}
132 
133 	while (sz > 0) {
134 		char *ins8 = (char *)ins;
135 		rv += __builtin_popcount(*ins8);
136 		sz--;
137 		ins++;
138 	}
139 	return rv;
140 }
141 #endif
142 
143 #if VALIDATE_C_SEGMENTS
144 boolean_t validate_c_segs = TRUE;
145 #endif
146 /*
147  * vm_compressor_mode has a hierarchy of control to set its value.
148  * boot-args are checked first, then device-tree, and finally
149  * the default value that is defined below. See vm_fault_init() for
150  * the boot-arg & device-tree code.
151  */
152 
153 #if !XNU_TARGET_OS_OSX
154 
155 #if CONFIG_FREEZE
156 int     vm_compressor_mode = VM_PAGER_FREEZER_DEFAULT;
157 struct  freezer_context freezer_context_global;
158 #else /* CONFIG_FREEZE */
159 int     vm_compressor_mode = VM_PAGER_NOT_CONFIGURED;
160 #endif /* CONFIG_FREEZE */
161 
162 #else /* !XNU_TARGET_OS_OSX */
163 int             vm_compressor_mode = VM_PAGER_COMPRESSOR_WITH_SWAP;
164 
165 #endif /* !XNU_TARGET_OS_OSX */
166 
167 TUNABLE(uint32_t, vm_compression_limit, "vm_compression_limit", 0);
168 int             vm_compressor_is_active = 0;
169 int             vm_compressor_available = 0;
170 
171 extern uint64_t vm_swap_get_max_configured_space(void);
172 extern void     vm_pageout_io_throttle(void);
173 bool vm_compressor_swapout_is_ripe(void);
174 
175 #if CHECKSUM_THE_DATA || CHECKSUM_THE_SWAP || CHECKSUM_THE_COMPRESSED_DATA
176 extern unsigned int hash_string(char *cp, int len);
177 static unsigned int vmc_hash(char *, int);
178 boolean_t checksum_c_segs = TRUE;
179 
180 unsigned int
181 vmc_hash(char *cp, int len)
182 {
183 	if (__probable(checksum_c_segs == FALSE)) {
184 		return 0xDEAD7A37;
185 	}
186 	return hash_string(cp, len);
187 }
188 #endif
189 
190 #define UNPACK_C_SIZE(cs)       ((cs->c_size == (PAGE_SIZE-1)) ? PAGE_SIZE : cs->c_size)
191 #define PACK_C_SIZE(cs, size)   (cs->c_size = ((size == PAGE_SIZE) ? PAGE_SIZE - 1 : size))
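
/*
 * Worked example (illustrative): a full, incompressible page is stored
 * verbatim, but its size is encoded with the sentinel PAGE_SIZE - 1 so the
 * slot's narrow c_size field never has to represent PAGE_SIZE itself.
 * On a 16K-page system, PACK_C_SIZE(cs, 16384) records 16383 and
 * UNPACK_C_SIZE(cs) maps 16383 back to 16384; any other size round-trips
 * unchanged.
 */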
192 
193 
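/*
 * Roughly: this hash backs the "single value" page optimization. A page whose
 * 32-bit words all contain the same value (most commonly an all-zero page) is
 * not stored in a c_segment at all; the value and a reference count are
 * recorded in one of these entries instead, which is what the
 * c_segment_svp_* counters below account for.
 */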
194 struct c_sv_hash_entry {
195 	union {
196 		struct  {
197 			uint32_t        c_sv_he_ref;
198 			uint32_t        c_sv_he_data;
199 		} c_sv_he;
200 		uint64_t        c_sv_he_record;
201 	} c_sv_he_un;
202 };
203 
204 #define he_ref  c_sv_he_un.c_sv_he.c_sv_he_ref
205 #define he_data c_sv_he_un.c_sv_he.c_sv_he_data
206 #define he_record c_sv_he_un.c_sv_he_record
207 
208 #define C_SV_HASH_MAX_MISS      32
209 #define C_SV_HASH_SIZE          ((1 << 10))
210 #define C_SV_HASH_MASK          ((1 << 10) - 1)
211 #define C_SV_CSEG_ID            ((1 << 22) - 1)
212 
213 
214 union c_segu {
215 	c_segment_t     c_seg;
216 	uintptr_t       c_segno;
217 };
218 
219 #define C_SLOT_ASSERT_PACKABLE(ptr) \
220 	VM_ASSERT_POINTER_PACKABLE((vm_offset_t)(ptr), C_SLOT_PACKED_PTR);
221 
222 #define C_SLOT_PACK_PTR(ptr) \
223 	VM_PACK_POINTER((vm_offset_t)(ptr), C_SLOT_PACKED_PTR)
224 
225 #define C_SLOT_UNPACK_PTR(cslot) \
226 	(c_slot_mapping_t)VM_UNPACK_POINTER((cslot)->c_packed_ptr, C_SLOT_PACKED_PTR)
227 
228 /* for debugging purposes */
229 SECURITY_READ_ONLY_EARLY(vm_packing_params_t) c_slot_packing_params =
230     VM_PACKING_PARAMS(C_SLOT_PACKED_PTR);
231 
232 uint32_t        c_segment_count = 0;
233 uint32_t        c_segment_count_max = 0;
234 
235 uint64_t        c_generation_id = 0;
236 uint64_t        c_generation_id_flush_barrier;
237 
238 
239 #define         HIBERNATE_FLUSHING_SECS_TO_COMPLETE     120
240 
241 boolean_t       hibernate_no_swapspace = FALSE;
242 boolean_t       hibernate_flush_timed_out = FALSE;
243 clock_sec_t     hibernate_flushing_deadline = 0;
244 
245 #if RECORD_THE_COMPRESSED_DATA
246 char    *c_compressed_record_sbuf;
247 char    *c_compressed_record_ebuf;
248 char    *c_compressed_record_cptr;
249 #endif
250 
251 
252 queue_head_t    c_age_list_head;
253 queue_head_t    c_early_swappedin_list_head, c_regular_swappedin_list_head, c_late_swappedin_list_head;
254 queue_head_t    c_early_swapout_list_head, c_regular_swapout_list_head, c_late_swapout_list_head;
255 queue_head_t    c_swapio_list_head;
256 queue_head_t    c_swappedout_list_head;
257 queue_head_t    c_swappedout_sparse_list_head;
258 queue_head_t    c_major_list_head;
259 queue_head_t    c_filling_list_head;
260 queue_head_t    c_bad_list_head;
261 
262 uint32_t        c_age_count = 0;
263 uint32_t        c_early_swappedin_count = 0, c_regular_swappedin_count = 0, c_late_swappedin_count = 0;
264 uint32_t        c_early_swapout_count = 0, c_regular_swapout_count = 0, c_late_swapout_count = 0;
265 uint32_t        c_swapio_count = 0;
266 uint32_t        c_swappedout_count = 0;
267 uint32_t        c_swappedout_sparse_count = 0;
268 uint32_t        c_major_count = 0;
269 uint32_t        c_filling_count = 0;
270 uint32_t        c_empty_count = 0;
271 uint32_t        c_bad_count = 0;
272 
273 
274 queue_head_t    c_minor_list_head;
275 uint32_t        c_minor_count = 0;
276 
277 int             c_overage_swapped_count = 0;
278 int             c_overage_swapped_limit = 0;
279 
280 int             c_seg_fixed_array_len;
281 union  c_segu   *c_segments;
282 vm_offset_t     c_buffers;
283 vm_size_t       c_buffers_size;
284 caddr_t         c_segments_next_page;
285 boolean_t       c_segments_busy;
286 uint32_t        c_segments_available;
287 uint32_t        c_segments_limit;
288 uint32_t        c_segments_nearing_limit;
289 
290 uint32_t        c_segment_svp_in_hash;
291 uint32_t        c_segment_svp_hash_succeeded;
292 uint32_t        c_segment_svp_hash_failed;
293 uint32_t        c_segment_svp_zero_compressions;
294 uint32_t        c_segment_svp_nonzero_compressions;
295 uint32_t        c_segment_svp_zero_decompressions;
296 uint32_t        c_segment_svp_nonzero_decompressions;
297 
298 uint32_t        c_segment_noncompressible_pages;
299 
300 uint32_t        c_segment_pages_compressed = 0; /* Tracks # of uncompressed pages fed into the compressor */
301 #if CONFIG_FREEZE
302 int32_t         c_segment_pages_compressed_incore = 0; /* Tracks # of uncompressed pages fed into the compressor that are in memory */
303 int32_t         c_segment_pages_compressed_incore_late_swapout = 0; /* Tracks # of uncompressed pages fed into the compressor that are in memory and tagged for swapout */
304 uint32_t        c_segments_incore_limit = 0; /* Tracks # of segments allowed to be in-core. Based on compressor pool size */
305 #endif /* CONFIG_FREEZE */
306 
307 uint32_t        c_segment_pages_compressed_limit;
308 uint32_t        c_segment_pages_compressed_nearing_limit;
309 uint32_t        c_free_segno_head = (uint32_t)-1;
310 
311 uint32_t        vm_compressor_minorcompact_threshold_divisor = 10;
312 uint32_t        vm_compressor_majorcompact_threshold_divisor = 10;
313 uint32_t        vm_compressor_unthrottle_threshold_divisor = 10;
314 uint32_t        vm_compressor_catchup_threshold_divisor = 10;
315 
316 uint32_t        vm_compressor_minorcompact_threshold_divisor_overridden = 0;
317 uint32_t        vm_compressor_majorcompact_threshold_divisor_overridden = 0;
318 uint32_t        vm_compressor_unthrottle_threshold_divisor_overridden = 0;
319 uint32_t        vm_compressor_catchup_threshold_divisor_overridden = 0;
320 
321 #define         C_SEGMENTS_PER_PAGE     (PAGE_SIZE / sizeof(union c_segu))
322 
323 LCK_GRP_DECLARE(vm_compressor_lck_grp, "vm_compressor");
324 LCK_RW_DECLARE(c_master_lock, &vm_compressor_lck_grp);
325 LCK_MTX_DECLARE(c_list_lock_storage, &vm_compressor_lck_grp);
326 
327 boolean_t       decompressions_blocked = FALSE;
328 
329 zone_t          compressor_segment_zone;
330 int             c_compressor_swap_trigger = 0;
331 
332 uint32_t        compressor_cpus;
333 char            *compressor_scratch_bufs;
334 char            *kdp_compressor_scratch_buf;
335 char            *kdp_compressor_decompressed_page;
336 addr64_t        kdp_compressor_decompressed_page_paddr;
337 ppnum_t         kdp_compressor_decompressed_page_ppnum;
338 
339 clock_sec_t     start_of_sample_period_sec = 0;
340 clock_nsec_t    start_of_sample_period_nsec = 0;
341 clock_sec_t     start_of_eval_period_sec = 0;
342 clock_nsec_t    start_of_eval_period_nsec = 0;
343 uint32_t        sample_period_decompression_count = 0;
344 uint32_t        sample_period_compression_count = 0;
345 uint32_t        last_eval_decompression_count = 0;
346 uint32_t        last_eval_compression_count = 0;
347 
348 #define         DECOMPRESSION_SAMPLE_MAX_AGE            (60 * 30)
349 
350 boolean_t       vm_swapout_ripe_segments = FALSE;
351 uint32_t        vm_ripe_target_age = (60 * 60 * 48);
352 
353 uint32_t        swapout_target_age = 0;
354 uint32_t        age_of_decompressions_during_sample_period[DECOMPRESSION_SAMPLE_MAX_AGE];
355 uint32_t        overage_decompressions_during_sample_period = 0;
356 
357 
358 void            do_fastwake_warmup(queue_head_t *, boolean_t);
359 boolean_t       fastwake_warmup = FALSE;
360 boolean_t       fastwake_recording_in_progress = FALSE;
361 clock_sec_t     dont_trim_until_ts = 0;
362 
363 uint64_t        c_segment_warmup_count;
364 uint64_t        first_c_segment_to_warm_generation_id = 0;
365 uint64_t        last_c_segment_to_warm_generation_id = 0;
366 boolean_t       hibernate_flushing = FALSE;
367 
368 int64_t         c_segment_input_bytes __attribute__((aligned(8))) = 0;
369 int64_t         c_segment_compressed_bytes __attribute__((aligned(8))) = 0;
370 int64_t         compressor_bytes_used __attribute__((aligned(8))) = 0;
371 
372 /* Keeps track of the most recent timestamp for when major compaction finished. */
373 mach_timespec_t major_compact_ts;
374 
375 struct c_sv_hash_entry c_segment_sv_hash_table[C_SV_HASH_SIZE]  __attribute__ ((aligned(8)));
376 
377 static void vm_compressor_swap_trigger_thread(void);
378 static void vm_compressor_do_delayed_compactions(boolean_t);
379 static void vm_compressor_compact_and_swap(boolean_t);
380 static void vm_compressor_process_regular_swapped_in_segments(boolean_t);
381 void vm_compressor_process_special_swapped_in_segments(void);
382 static void vm_compressor_process_special_swapped_in_segments_locked(void);
383 
384 struct vm_compressor_swapper_stats vmcs_stats;
385 
386 #if XNU_TARGET_OS_OSX
387 #if (__arm64__)
388 static void vm_compressor_process_major_segments(void);
389 #endif /* (__arm64__) */
390 static void vm_compressor_take_paging_space_action(void);
391 #endif /* XNU_TARGET_OS_OSX */
392 
393 void compute_swapout_target_age(void);
394 
395 boolean_t c_seg_major_compact(c_segment_t, c_segment_t);
396 boolean_t c_seg_major_compact_ok(c_segment_t, c_segment_t);
397 
398 int  c_seg_minor_compaction_and_unlock(c_segment_t, boolean_t);
399 int  c_seg_do_minor_compaction_and_unlock(c_segment_t, boolean_t, boolean_t, boolean_t);
400 void c_seg_try_minor_compaction_and_unlock(c_segment_t c_seg);
401 
402 void c_seg_move_to_sparse_list(c_segment_t);
403 void c_seg_insert_into_q(queue_head_t *, c_segment_t);
404 
405 uint64_t vm_available_memory(void);
406 uint64_t vm_compressor_pages_compressed(void);
407 uint32_t vm_compressor_pool_size(void);
408 uint32_t vm_compressor_fragmentation_level(void);
409 uint32_t vm_compression_ratio(void);
410 
411 /*
412  * indicate the need to do a major compaction if
413  * the overall set of in-use compression segments
414  * becomes sparse... on systems that support pressure
415  * driven swapping, this will also cause swapouts to
416  * be initiated.
417  */
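/*
 * Concretely (a rough reading of the check below): once c_segment_count is at
 * least 1/8th of c_segments_nearing_limit, request a major compaction when
 * more than roughly 1/8th of the in-core segments' page capacity is not backed
 * by compressor pages, i.e. the live compressed data could fit in noticeably
 * fewer segments.
 */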
418 static inline bool
419 vm_compressor_needs_to_major_compact()
420 {
421 	uint32_t        incore_seg_count;
422 
423 	incore_seg_count = c_segment_count - c_swappedout_count - c_swappedout_sparse_count;
424 
425 	if ((c_segment_count >= (c_segments_nearing_limit / 8)) &&
426 	    ((incore_seg_count * c_seg_max_pages) - VM_PAGE_COMPRESSOR_COUNT) >
427 	    ((incore_seg_count / 8) * c_seg_max_pages)) {
428 		return true;
429 	}
430 	return false;
431 }
432 
433 
434 uint64_t
435 vm_available_memory(void)
436 {
437 	return ((uint64_t)AVAILABLE_NON_COMPRESSED_MEMORY) * PAGE_SIZE_64;
438 }
439 
440 
441 uint32_t
442 vm_compressor_pool_size(void)
443 {
444 	return VM_PAGE_COMPRESSOR_COUNT;
445 }
446 
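/*
 * Illustrative example: with c_seg_max_pages == 16, 1000 in-core segments can
 * hold 16000 compressed pages; if the compressor currently occupies 12000
 * physical pages, the level is 100 - (12000 * 100 / 16000) = 25, i.e. about a
 * quarter of the in-core segment capacity is sitting unused.
 */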
447 uint32_t
448 vm_compressor_fragmentation_level(void)
449 {
450 	const uint32_t incore_seg_count = c_segment_count - c_swappedout_count - c_swappedout_sparse_count;
451 	if ((incore_seg_count == 0) || (c_seg_max_pages == 0)) {
452 		return 0;
453 	}
454 	return 100 - (vm_compressor_pool_size() * 100 / (incore_seg_count * c_seg_max_pages));
455 }
456 
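/*
 * Illustrative example: if the compressor currently holds 30000 pages' worth
 * of data in 10000 physical pages, the (integer) ratio reported here is 3,
 * i.e. roughly 3:1 compression.
 */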
457 uint32_t
458 vm_compression_ratio(void)
459 {
460 	if (vm_compressor_pool_size() == 0) {
461 		return UINT32_MAX;
462 	}
463 	return c_segment_pages_compressed / vm_compressor_pool_size();
464 }
465 
466 uint64_t
467 vm_compressor_pages_compressed(void)
468 {
469 	return c_segment_pages_compressed * PAGE_SIZE_64;
470 }
471 
472 bool
473 vm_compressor_compressed_pages_nearing_limit(void)
474 {
475 	uint32_t pages = 0;
476 
477 #if CONFIG_FREEZE
478 	pages = os_atomic_load(&c_segment_pages_compressed_incore, relaxed);
479 #else /* CONFIG_FREEZE */
480 	pages = c_segment_pages_compressed;
481 #endif /* CONFIG_FREEZE */
482 
483 	return pages > c_segment_pages_compressed_nearing_limit;
484 }
485 
486 static bool
487 vm_compressor_segments_nearing_limit(void)
488 {
489 	uint64_t segments;
490 
491 #if CONFIG_FREEZE
492 	if (freezer_incore_cseg_acct) {
493 		if (os_sub_overflow(c_segment_count, c_swappedout_count, &segments)) {
494 			segments = 0;
495 		}
496 		if (os_sub_overflow(segments, c_swappedout_sparse_count, &segments)) {
497 			segments = 0;
498 		}
499 	} else {
500 		segments = os_atomic_load(&c_segment_count, relaxed);
501 	}
502 #else /* CONFIG_FREEZE */
503 	segments = c_segment_count;
504 #endif /* CONFIG_FREEZE */
505 
506 	return segments > c_segments_nearing_limit;
507 }
508 
509 boolean_t
510 vm_compressor_low_on_space(void)
511 {
512 	return vm_compressor_compressed_pages_nearing_limit() ||
513 	       vm_compressor_segments_nearing_limit();
514 }
515 
516 
517 boolean_t
518 vm_compressor_out_of_space(void)
519 {
520 #if CONFIG_FREEZE
521 	uint64_t incore_seg_count;
522 	uint32_t incore_compressed_pages;
523 	if (freezer_incore_cseg_acct) {
524 		if (os_sub_overflow(c_segment_count, c_swappedout_count, &incore_seg_count)) {
525 			incore_seg_count = 0;
526 		}
527 		if (os_sub_overflow(incore_seg_count, c_swappedout_sparse_count, &incore_seg_count)) {
528 			incore_seg_count = 0;
529 		}
530 		incore_compressed_pages = os_atomic_load(&c_segment_pages_compressed_incore, relaxed);
531 	} else {
532 		incore_seg_count = os_atomic_load(&c_segment_count, relaxed);
533 		incore_compressed_pages = os_atomic_load(&c_segment_pages_compressed_incore, relaxed);
534 	}
535 
536 	if ((incore_compressed_pages >= c_segment_pages_compressed_limit) ||
537 	    (incore_seg_count > c_segments_incore_limit)) {
538 		return TRUE;
539 	}
540 #else /* CONFIG_FREEZE */
541 	if ((c_segment_pages_compressed >= c_segment_pages_compressed_limit) ||
542 	    (c_segment_count >= c_segments_limit)) {
543 		return TRUE;
544 	}
545 #endif /* CONFIG_FREEZE */
546 	return FALSE;
547 }
548 
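/*
 * Thrashing, as detected here: compute_swapout_target_age() proposes a target
 * creation age; we treat the system as thrashing if that target stands, i.e.
 * the oldest segment on the age queue (if any) is not newer than the target.
 */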
549 bool
550 vm_compressor_is_thrashing()
551 {
552 	compute_swapout_target_age();
553 
554 	if (swapout_target_age) {
555 		c_segment_t     c_seg;
556 
557 		lck_mtx_lock_spin_always(c_list_lock);
558 
559 		if (!queue_empty(&c_age_list_head)) {
560 			c_seg = (c_segment_t) queue_first(&c_age_list_head);
561 
562 			if (c_seg->c_creation_ts > swapout_target_age) {
563 				swapout_target_age = 0;
564 			}
565 		}
566 		lck_mtx_unlock_always(c_list_lock);
567 	}
568 
569 	return swapout_target_age != 0;
570 }
571 
572 
573 int
574 vm_wants_task_throttled(task_t task)
575 {
576 	ledger_amount_t compressed;
577 	if (task == kernel_task) {
578 		return 0;
579 	}
580 
581 	if (VM_CONFIG_SWAP_IS_ACTIVE) {
582 		if ((vm_compressor_low_on_space() || HARD_THROTTLE_LIMIT_REACHED())) {
583 			ledger_get_balance(task->ledger, task_ledgers.internal_compressed, &compressed);
584 			compressed >>= VM_MAP_PAGE_SHIFT(task->map);
585 			if ((unsigned int)compressed > (c_segment_pages_compressed / 4)) {
586 				return 1;
587 			}
588 		}
589 	}
590 	return 0;
591 }
592 
593 
594 #if DEVELOPMENT || DEBUG
595 /*
596  * On compressor/swap exhaustion, kill the largest process regardless of
597  * its chosen process policy.
598  */
599 TUNABLE(bool, kill_on_no_paging_space, "-kill_on_no_paging_space", false);
600 #endif /* DEVELOPMENT || DEBUG */
601 
602 #if CONFIG_JETSAM
603 boolean_t       memorystatus_kill_on_VM_compressor_space_shortage(boolean_t);
604 void            memorystatus_thread_wake(void);
605 extern uint32_t jetsam_kill_on_low_swap;
606 bool            memorystatus_disable_swap(void);
607 #if CONFIG_PHANTOM_CACHE
608 extern bool memorystatus_phantom_cache_pressure;
609 #endif /* CONFIG_PHANTOM_CACHE */
610 int             compressor_thrashing_induced_jetsam = 0;
611 int             filecache_thrashing_induced_jetsam = 0;
612 static boolean_t        vm_compressor_thrashing_detected = FALSE;
613 #else  /* CONFIG_JETSAM */
614 static uint32_t no_paging_space_action_in_progress = 0;
615 extern void memorystatus_send_low_swap_note(void);
616 #endif /* CONFIG_JETSAM */
617 
618 static void
619 vm_compressor_take_paging_space_action(void)
620 {
621 #if CONFIG_JETSAM
622 	/*
623 	 * On systems with both swap and jetsam,
624 	 * just wake up the jetsam thread and have it handle the low swap condition
625 	 * by killing apps.
626 	 */
627 	if (jetsam_kill_on_low_swap) {
628 		memorystatus_thread_wake();
629 	}
630 #else /* CONFIG_JETSAM */
631 	if (no_paging_space_action_in_progress == 0) {
632 		if (OSCompareAndSwap(0, 1, (UInt32 *)&no_paging_space_action_in_progress)) {
633 			if (no_paging_space_action()) {
634 #if DEVELOPMENT || DEBUG
635 				if (kill_on_no_paging_space) {
636 					/*
637 					 * Since we are choosing to always kill a process, we don't need the
638 					 * "out of application memory" dialog box in this mode. And, hence we won't
639 					 * send the knote.
640 					 */
641 					no_paging_space_action_in_progress = 0;
642 					return;
643 				}
644 #endif /* DEVELOPMENT || DEBUG */
645 				memorystatus_send_low_swap_note();
646 			}
647 
648 			no_paging_space_action_in_progress = 0;
649 		}
650 	}
651 #endif /* !CONFIG_JETSAM */
652 }
653 
654 
655 void
656 vm_decompressor_lock(void)
657 {
658 	PAGE_REPLACEMENT_ALLOWED(TRUE);
659 
660 	decompressions_blocked = TRUE;
661 
662 	PAGE_REPLACEMENT_ALLOWED(FALSE);
663 }
664 
665 void
666 vm_decompressor_unlock(void)
667 {
668 	PAGE_REPLACEMENT_ALLOWED(TRUE);
669 
670 	decompressions_blocked = FALSE;
671 
672 	PAGE_REPLACEMENT_ALLOWED(FALSE);
673 
674 	thread_wakeup((event_t)&decompressions_blocked);
675 }
676 
677 static inline void
678 cslot_copy(c_slot_t cdst, c_slot_t csrc)
679 {
680 #if CHECKSUM_THE_DATA
681 	cdst->c_hash_data = csrc->c_hash_data;
682 #endif
683 #if CHECKSUM_THE_COMPRESSED_DATA
684 	cdst->c_hash_compressed_data = csrc->c_hash_compressed_data;
685 #endif
686 #if POPCOUNT_THE_COMPRESSED_DATA
687 	cdst->c_pop_cdata = csrc->c_pop_cdata;
688 #endif
689 	cdst->c_size = csrc->c_size;
690 	cdst->c_packed_ptr = csrc->c_packed_ptr;
691 #if defined(__arm64__)
692 	cdst->c_codec = csrc->c_codec;
693 #endif
694 }
695 
696 #if XNU_TARGET_OS_OSX
697 #define VM_COMPRESSOR_MAX_POOL_SIZE (192UL << 30)
698 #else
699 #define VM_COMPRESSOR_MAX_POOL_SIZE (0)
700 #endif
701 
702 static vm_map_size_t compressor_size;
703 static SECURITY_READ_ONLY_LATE(struct mach_vm_range) compressor_range;
704 vm_map_t compressor_map;
705 uint64_t compressor_pool_max_size;
706 uint64_t compressor_pool_size;
707 uint32_t compressor_pool_multiplier;
708 
709 #if DEVELOPMENT || DEBUG
710 /*
711  * Compressor segments are write-protected in development/debug
712  * kernels to help debug memory corruption.
713  * In cases where performance is a concern, this can be disabled
714  * via the boot-arg "-disable_cseg_write_protection".
715  */
716 boolean_t write_protect_c_segs = TRUE;
717 int vm_compressor_test_seg_wp;
718 uint32_t vm_ktrace_enabled;
719 #endif /* DEVELOPMENT || DEBUG */
720 
721 #if (XNU_TARGET_OS_OSX && __arm64__)
722 
723 #include <IOKit/IOPlatformExpert.h>
724 #include <sys/random.h>
725 
726 static const char *csegbufsizeExperimentProperty = "_csegbufsz_experiment";
727 static thread_call_t csegbufsz_experiment_thread_call;
728 
729 extern boolean_t IOServiceWaitForMatchingResource(const char * property, uint64_t timeout);
730 static void
731 erase_csegbufsz_experiment_property(__unused void *param0, __unused void *param1)
732 {
733 	// Wait for NVRAM to be writable
734 	if (!IOServiceWaitForMatchingResource("IONVRAM", UINT64_MAX)) {
735 		printf("csegbufsz_experiment_property: Failed to wait for IONVRAM.");
736 	}
737 
738 	if (!PERemoveNVRAMProperty(csegbufsizeExperimentProperty)) {
739 		printf("csegbufsize_experiment_property: Failed to remove %s from NVRAM.", csegbufsizeExperimentProperty);
740 	}
741 	thread_call_free(csegbufsz_experiment_thread_call);
742 }
743 
744 static void
745 erase_csegbufsz_experiment_property_async()
746 {
747 	csegbufsz_experiment_thread_call = thread_call_allocate_with_priority(
748 		erase_csegbufsz_experiment_property,
749 		NULL,
750 		THREAD_CALL_PRIORITY_LOW
751 		);
752 	if (csegbufsz_experiment_thread_call == NULL) {
753 		printf("csegbufsize_experiment_property: Unable to allocate thread call.");
754 	} else {
755 		thread_call_enter(csegbufsz_experiment_thread_call);
756 	}
757 }
758 
759 static void
760 cleanup_csegbufsz_experiment(__unused void *arg0)
761 {
762 	char nvram = 0;
763 	unsigned int len = sizeof(nvram);
764 	if (PEReadNVRAMProperty(csegbufsizeExperimentProperty, &nvram, &len)) {
765 		erase_csegbufsz_experiment_property_async();
766 	}
767 }
768 
769 STARTUP_ARG(EARLY_BOOT, STARTUP_RANK_FIRST, cleanup_csegbufsz_experiment, NULL);
770 #endif /* XNU_TARGET_OS_OSX && __arm64__ */
771 
772 #if CONFIG_JETSAM
773 extern unsigned int memorystatus_swap_all_apps;
774 #endif /* CONFIG_JETSAM */
775 
776 TUNABLE_DT(uint64_t, swap_vol_min_capacity, "/defaults", "kern.swap_min_capacity", "kern.swap_min_capacity", 0, TUNABLE_DT_NONE);
777 
778 static void
779 vm_compressor_set_size(void)
780 {
781 	/*
782 	 * Note that this function may be called multiple times on systems with app swap
783 	 * because the value of vm_swap_get_max_configured_space() and memorystatus_swap_all_apps
784 	 * can change based on the size of the swap volume. On these systems, we'll call
785 	 * this function once early in boot to reserve the maximum amount of VA required
786 	 * for the compressor submap and then one more time in vm_compressor_init after
787 	 * determining the swap volume size. We must not compute a larger
788 	 * compressor_size the second time around.
789 	 */
790 	vm_size_t       c_segments_arr_size = 0;
791 	struct c_slot_mapping tmp_slot_ptr;
792 
793 	/* The segment size can be overridden by a boot-arg */
794 	if (!PE_parse_boot_argn("vm_compressor_segment_buffer_size", &c_seg_bufsize, sizeof(c_seg_bufsize))) {
795 #if CONFIG_JETSAM
796 		if (memorystatus_swap_all_apps) {
797 			c_seg_bufsize = C_SEG_BUFSIZE_ARM_SWAP;
798 		} else {
799 			c_seg_bufsize = C_SEG_BUFSIZE_DEFAULT;
800 		}
801 #else
802 		c_seg_bufsize = C_SEG_BUFSIZE_DEFAULT;
803 #endif /* CONFIG_JETSAM */
804 	}
805 
806 	vm_compressor_swap_init_swap_file_limit();
807 	if (vm_compression_limit) {
808 		compressor_pool_size = ptoa_64(vm_compression_limit);
809 	}
810 
811 	compressor_pool_max_size = C_SEG_MAX_LIMIT;
812 	compressor_pool_max_size *= c_seg_bufsize;
813 
814 #if XNU_TARGET_OS_OSX
815 
816 	if (vm_compression_limit == 0) {
817 		if (max_mem <= (4ULL * 1024ULL * 1024ULL * 1024ULL)) {
818 			compressor_pool_size = 16ULL * max_mem;
819 		} else if (max_mem <= (8ULL * 1024ULL * 1024ULL * 1024ULL)) {
820 			compressor_pool_size = 8ULL * max_mem;
821 		} else if (max_mem <= (32ULL * 1024ULL * 1024ULL * 1024ULL)) {
822 			compressor_pool_size = 4ULL * max_mem;
823 		} else {
824 			compressor_pool_size = 2ULL * max_mem;
825 		}
826 	}
827 	/*
828 	 * Cap the compressor pool size to a max of 192G
829 	 */
830 	if (compressor_pool_size > VM_COMPRESSOR_MAX_POOL_SIZE) {
831 		compressor_pool_size = VM_COMPRESSOR_MAX_POOL_SIZE;
832 	}
833 	if (max_mem <= (8ULL * 1024ULL * 1024ULL * 1024ULL)) {
834 		compressor_pool_multiplier = 1;
835 	} else if (max_mem <= (32ULL * 1024ULL * 1024ULL * 1024ULL)) {
836 		compressor_pool_multiplier = 2;
837 	} else {
838 		compressor_pool_multiplier = 4;
839 	}
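	/*
	 * Illustrative example: on a 16GB machine with no vm_compression_limit
	 * override, compressor_pool_size comes out to 4 * 16GB = 64GB of
	 * compressor VA (well under the 192GB cap) and compressor_pool_multiplier
	 * is 2.
	 */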
840 
841 #elif defined(__arm64__) && defined(XNU_TARGET_OS_WATCH)
842 
843 	/*
844 	 * On M9 watches the compressor can become large, which can lead to
845 	 * churn in the working set and result in audio drops. Capping the
846 	 * compressor size favors reclaiming unused memory sitting in the
847 	 * idle band via jetsams.
848 	 */
849 
850 #define COMPRESSOR_CAP_PERCENTAGE        37ULL
851 
852 	if (compressor_pool_max_size > max_mem) {
853 		compressor_pool_max_size = max_mem;
854 	}
855 
856 	if (vm_compression_limit == 0) {
857 		compressor_pool_size = (max_mem * COMPRESSOR_CAP_PERCENTAGE) / 100ULL;
858 	}
859 	compressor_pool_multiplier = 1;
860 
861 #else
862 
863 	if (compressor_pool_max_size > max_mem) {
864 		compressor_pool_max_size = max_mem;
865 	}
866 
867 	if (vm_compression_limit == 0) {
868 		compressor_pool_size = max_mem;
869 	}
870 	compressor_pool_multiplier = 1;
871 #endif
872 	if (compressor_pool_size > compressor_pool_max_size) {
873 		compressor_pool_size = compressor_pool_max_size;
874 	}
875 
876 	c_seg_max_pages = (c_seg_bufsize / PAGE_SIZE);
877 	c_seg_slot_var_array_min_len = c_seg_max_pages;
878 
879 #if !defined(__x86_64__)
880 	c_seg_off_limit = (C_SEG_BYTES_TO_OFFSET((c_seg_bufsize - 512)));
881 	c_seg_allocsize = (c_seg_bufsize + PAGE_SIZE);
882 #else
883 	c_seg_off_limit = (C_SEG_BYTES_TO_OFFSET((c_seg_bufsize - 128)));
884 	c_seg_allocsize = c_seg_bufsize;
885 #endif /* !defined(__x86_64__) */
886 
887 	c_segments_limit = (uint32_t)(compressor_pool_size / (vm_size_t)(c_seg_allocsize));
888 	tmp_slot_ptr.s_cseg = c_segments_limit;
889 	/* Panic on internal configs */
890 	assertf((tmp_slot_ptr.s_cseg == c_segments_limit), "vm_compressor_init: overflowed s_cseg field in c_slot_mapping with c_segno: %d", c_segments_limit);
891 
892 	if (tmp_slot_ptr.s_cseg != c_segments_limit) {
893 		tmp_slot_ptr.s_cseg = -1;
894 		c_segments_limit = tmp_slot_ptr.s_cseg - 1; /*limited by segment idx bits in c_slot_mapping*/
895 		compressor_pool_size = (c_segments_limit * (vm_size_t)(c_seg_allocsize));
896 	}
897 
898 	c_segments_nearing_limit = (uint32_t)(((uint64_t)c_segments_limit * 98ULL) / 100ULL);
899 
900 	c_segment_pages_compressed_limit = (c_segments_limit * (c_seg_bufsize / PAGE_SIZE) * compressor_pool_multiplier);
901 
902 	if (c_segment_pages_compressed_limit < (uint32_t)(max_mem / PAGE_SIZE)) {
903 #if defined(XNU_TARGET_OS_WATCH)
904 		c_segment_pages_compressed_limit = (uint32_t)(max_mem / PAGE_SIZE);
905 #else
906 		if (!vm_compression_limit) {
907 			c_segment_pages_compressed_limit = (uint32_t)(max_mem / PAGE_SIZE);
908 		}
909 #endif
910 	}
911 
912 	c_segment_pages_compressed_nearing_limit = (uint32_t)(((uint64_t)c_segment_pages_compressed_limit * 98ULL) / 100ULL);
913 
914 #if CONFIG_FREEZE
915 	/*
916 	 * Our in-core limits are based on the size of the compressor pool.
917 	 * The c_segments_nearing_limit is also based on the compressor pool
918 	 * size and calculated above.
919 	 */
920 	c_segments_incore_limit = c_segments_limit;
921 
922 	if (freezer_incore_cseg_acct) {
923 		/*
924 		 * Add enough segments to track all frozen c_segs that can be stored in swap.
925 		 */
926 		c_segments_limit += (uint32_t)(vm_swap_get_max_configured_space() / (vm_size_t)(c_seg_allocsize));
927 		tmp_slot_ptr.s_cseg = c_segments_limit;
928 		/* Panic on internal configs */
929 		assertf((tmp_slot_ptr.s_cseg == c_segments_limit), "vm_compressor_init: freezer reserve overflowed s_cseg field in c_slot_mapping with c_segno: %d", c_segments_limit);
930 	}
931 #endif
932 	/*
933 	 * Submap needs space for:
934 	 * - c_segments
935 	 * - c_buffers
936 	 * - swap reclamations -- c_seg_bufsize
937 	 */
938 	c_segments_arr_size = vm_map_round_page((sizeof(union c_segu) * c_segments_limit), VM_MAP_PAGE_MASK(kernel_map));
939 	c_buffers_size = vm_map_round_page(((vm_size_t)c_seg_allocsize * (vm_size_t)c_segments_limit), VM_MAP_PAGE_MASK(kernel_map));
940 
941 	compressor_size = c_segments_arr_size + c_buffers_size + c_seg_bufsize;
942 
943 #if RECORD_THE_COMPRESSED_DATA
944 	c_compressed_record_sbuf_size = (vm_size_t)c_seg_allocsize + (PAGE_SIZE * 2);
945 	compressor_size += c_compressed_record_sbuf_size;
946 #endif /* RECORD_THE_COMPRESSED_DATA */
947 }
948 STARTUP(KMEM, STARTUP_RANK_FIRST, vm_compressor_set_size);
949 
950 KMEM_RANGE_REGISTER_DYNAMIC(compressor, &compressor_range, ^() {
951 	return compressor_size;
952 });
953 
954 bool
955 osenvironment_is_diagnostics(void)
956 {
957 	DTEntry chosen;
958 	const char *osenvironment;
959 	unsigned int size;
960 	if (kSuccess == SecureDTLookupEntry(0, "/chosen", &chosen)) {
961 		if (kSuccess == SecureDTGetProperty(chosen, "osenvironment", (void const **) &osenvironment, &size)) {
962 			return strcmp(osenvironment, "diagnostics") == 0;
963 		}
964 	}
965 	return false;
966 }
967 
968 void
969 vm_compressor_init(void)
970 {
971 	thread_t        thread;
972 #if RECORD_THE_COMPRESSED_DATA
973 	vm_size_t       c_compressed_record_sbuf_size = 0;
974 #endif /* RECORD_THE_COMPRESSED_DATA */
975 
976 #if DEVELOPMENT || DEBUG || CONFIG_FREEZE
977 	char bootarg_name[32];
978 #endif /* DEVELOPMENT || DEBUG || CONFIG_FREEZE */
979 	__unused uint64_t early_boot_compressor_size = compressor_size;
980 
981 #if CONFIG_JETSAM
982 	if (memorystatus_swap_all_apps && osenvironment_is_diagnostics()) {
983 		printf("osenvironment == \"diagnostics\". Disabling app swap.\n");
984 		memorystatus_disable_swap();
985 	}
986 
987 	if (memorystatus_swap_all_apps) {
988 		/*
989 		 * App swap is disabled on devices with small NANDs.
990 		 * Now that we're no longer in early boot, we can get
991 		 * the NAND size and re-run vm_compressor_set_size.
992 		 */
993 		int error = vm_swap_vol_get_capacity(SWAP_VOLUME_NAME, &vm_swap_volume_capacity);
994 #if DEVELOPMENT || DEBUG
995 		if (error != 0) {
996 			panic("vm_compressor_init: Unable to get swap volume capacity. error=%d\n", error);
997 		}
998 #else
999 		if (error != 0) {
1000 			os_log_with_startup_serial(OS_LOG_DEFAULT, "vm_compressor_init: Unable to get swap volume capacity. error=%d\n", error);
1001 		}
1002 #endif /* DEVELOPMENT || DEBUG */
1003 		if (vm_swap_volume_capacity < swap_vol_min_capacity) {
1004 			memorystatus_disable_swap();
1005 		}
1006 		/*
1007 		 * Resize the compressor and swap now that we know the capacity
1008 		 * of the swap volume.
1009 		 */
1010 		vm_compressor_set_size();
1011 		/*
1012 		 * We reserved a chunk of VA early in boot for the compressor submap.
1013 		 * We can't allocate more than that.
1014 		 */
1015 		assert(compressor_size <= early_boot_compressor_size);
1016 	}
1017 #endif /* CONFIG_JETSAM */
1018 
1019 #if DEVELOPMENT || DEBUG
1020 	if (PE_parse_boot_argn("-disable_cseg_write_protection", bootarg_name, sizeof(bootarg_name))) {
1021 		write_protect_c_segs = FALSE;
1022 	}
1023 
1024 	int vmcval = 1;
1025 #if defined(XNU_TARGET_OS_WATCH)
1026 	vmcval = 0;
1027 #endif /* XNU_TARGET_OS_WATCH */
1028 	PE_parse_boot_argn("vm_compressor_validation", &vmcval, sizeof(vmcval));
1029 
1030 	if (kern_feature_override(KF_COMPRSV_OVRD)) {
1031 		vmcval = 0;
1032 	}
1033 
1034 	if (vmcval == 0) {
1035 #if POPCOUNT_THE_COMPRESSED_DATA
1036 		popcount_c_segs = FALSE;
1037 #endif
1038 #if CHECKSUM_THE_DATA || CHECKSUM_THE_COMPRESSED_DATA
1039 		checksum_c_segs = FALSE;
1040 #endif
1041 #if VALIDATE_C_SEGMENTS
1042 		validate_c_segs = FALSE;
1043 #endif
1044 		write_protect_c_segs = FALSE;
1045 	}
1046 #endif /* DEVELOPMENT || DEBUG */
1047 
1048 #if CONFIG_FREEZE
1049 	if (PE_parse_boot_argn("-disable_freezer_cseg_acct", bootarg_name, sizeof(bootarg_name))) {
1050 		freezer_incore_cseg_acct = FALSE;
1051 	}
1052 #endif /* CONFIG_FREEZE */
1053 
1054 	assert((C_SEGMENTS_PER_PAGE * sizeof(union c_segu)) == PAGE_SIZE);
1055 
1056 #if !XNU_TARGET_OS_OSX
1057 	vm_compressor_minorcompact_threshold_divisor = 20;
1058 	vm_compressor_majorcompact_threshold_divisor = 30;
1059 	vm_compressor_unthrottle_threshold_divisor = 40;
1060 	vm_compressor_catchup_threshold_divisor = 60;
1061 #else /* !XNU_TARGET_OS_OSX */
1062 	if (max_mem <= (3ULL * 1024ULL * 1024ULL * 1024ULL)) {
1063 		vm_compressor_minorcompact_threshold_divisor = 11;
1064 		vm_compressor_majorcompact_threshold_divisor = 13;
1065 		vm_compressor_unthrottle_threshold_divisor = 20;
1066 		vm_compressor_catchup_threshold_divisor = 35;
1067 	} else {
1068 		vm_compressor_minorcompact_threshold_divisor = 20;
1069 		vm_compressor_majorcompact_threshold_divisor = 25;
1070 		vm_compressor_unthrottle_threshold_divisor = 35;
1071 		vm_compressor_catchup_threshold_divisor = 50;
1072 	}
1073 #endif /* !XNU_TARGET_OS_OSX */
1074 
1075 	queue_init(&c_bad_list_head);
1076 	queue_init(&c_age_list_head);
1077 	queue_init(&c_minor_list_head);
1078 	queue_init(&c_major_list_head);
1079 	queue_init(&c_filling_list_head);
1080 	queue_init(&c_early_swapout_list_head);
1081 	queue_init(&c_regular_swapout_list_head);
1082 	queue_init(&c_late_swapout_list_head);
1083 	queue_init(&c_swapio_list_head);
1084 	queue_init(&c_early_swappedin_list_head);
1085 	queue_init(&c_regular_swappedin_list_head);
1086 	queue_init(&c_late_swappedin_list_head);
1087 	queue_init(&c_swappedout_list_head);
1088 	queue_init(&c_swappedout_sparse_list_head);
1089 
1090 	c_free_segno_head = -1;
1091 	c_segments_available = 0;
1092 
1093 	compressor_map = kmem_suballoc(kernel_map, &compressor_range.min_address,
1094 	    compressor_size, VM_MAP_CREATE_NEVER_FAULTS,
1095 	    VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE, KMS_NOFAIL | KMS_PERMANENT,
1096 	    VM_KERN_MEMORY_COMPRESSOR).kmr_submap;
1097 
1098 	kmem_alloc(compressor_map, (vm_offset_t *)(&c_segments),
1099 	    (sizeof(union c_segu) * c_segments_limit),
1100 	    KMA_NOFAIL | KMA_KOBJECT | KMA_VAONLY | KMA_PERMANENT,
1101 	    VM_KERN_MEMORY_COMPRESSOR);
1102 	kmem_alloc(compressor_map, &c_buffers, c_buffers_size,
1103 	    KMA_NOFAIL | KMA_COMPRESSOR | KMA_VAONLY | KMA_PERMANENT,
1104 	    VM_KERN_MEMORY_COMPRESSOR);
1105 
1106 #if DEVELOPMENT || DEBUG
1107 	if (hvg_is_hcall_available(HVG_HCALL_SET_COREDUMP_DATA)) {
1108 		hvg_hcall_set_coredump_data();
1109 	}
1110 #endif
1111 
1112 	/*
1113 	 * Pick a good size that will minimize fragmentation in zalloc
1114 	 * by minimizing the fragmentation in a 16k run.
1115 	 *
1116 	 * c_seg_slot_var_array_min_len is larger on 4k systems than 16k ones,
1117 	 * making the fragmentation in a 4k page terrible. Using 16k for all
1118 	 * systems matches zalloc() and will minimize fragmentation.
1119 	 */
1120 	uint32_t c_segment_size = sizeof(struct c_segment) + (c_seg_slot_var_array_min_len * sizeof(struct c_slot));
1121 	uint32_t cnt  = (16 << 10) / c_segment_size;
1122 	uint32_t frag = (16 << 10) % c_segment_size;
1123 
1124 	c_seg_fixed_array_len = c_seg_slot_var_array_min_len;
1125 
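	/*
	 * i.e. while the leftover bytes of a 16K run are enough to give each of
	 * the 'cnt' segments in that run one more fixed slot, grow the fixed
	 * array by one slot and shrink the leftover accordingly.
	 */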
1126 	while (cnt * sizeof(struct c_slot) < frag) {
1127 		c_segment_size += sizeof(struct c_slot);
1128 		c_seg_fixed_array_len++;
1129 		frag -= cnt * sizeof(struct c_slot);
1130 	}
1131 
1132 	compressor_segment_zone = zone_create("compressor_segment",
1133 	    c_segment_size, ZC_PGZ_USE_GUARDS | ZC_NOENCRYPT | ZC_ZFREE_CLEARMEM);
1134 
1135 	c_segments_busy = FALSE;
1136 
1137 	c_segments_next_page = (caddr_t)c_segments;
1138 	vm_compressor_algorithm_init();
1139 
1140 	{
1141 		host_basic_info_data_t hinfo;
1142 		mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
1143 		size_t bufsize;
1144 		char *buf;
1145 
1146 #define BSD_HOST 1
1147 		host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
1148 
1149 		compressor_cpus = hinfo.max_cpus;
1150 
1151 		bufsize = PAGE_SIZE;
1152 		bufsize += compressor_cpus * vm_compressor_get_decode_scratch_size();
1153 		/* For the KDP path */
1154 		bufsize += vm_compressor_get_decode_scratch_size();
1155 #if CONFIG_FREEZE
1156 		bufsize += vm_compressor_get_encode_scratch_size();
1157 #endif
1158 #if RECORD_THE_COMPRESSED_DATA
1159 		bufsize += c_compressed_record_sbuf_size;
1160 #endif
1161 
1162 		kmem_alloc(kernel_map, (vm_offset_t *)&buf, bufsize,
1163 		    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT,
1164 		    VM_KERN_MEMORY_COMPRESSOR);
1165 
1166 		/*
1167 		 * kdp_compressor_decompressed_page must be page aligned because we access
1168 		 * it through the physical aperture by page number.
1169 		 */
1170 		kdp_compressor_decompressed_page = buf;
1171 		kdp_compressor_decompressed_page_paddr = kvtophys((vm_offset_t)kdp_compressor_decompressed_page);
1172 		kdp_compressor_decompressed_page_ppnum = (ppnum_t) atop(kdp_compressor_decompressed_page_paddr);
1173 		buf += PAGE_SIZE;
1174 		bufsize -= PAGE_SIZE;
1175 
1176 		compressor_scratch_bufs = buf;
1177 		buf += compressor_cpus * vm_compressor_get_decode_scratch_size();
1178 		bufsize -= compressor_cpus * vm_compressor_get_decode_scratch_size();
1179 
1180 		kdp_compressor_scratch_buf = buf;
1181 		buf += vm_compressor_get_decode_scratch_size();
1182 		bufsize -= vm_compressor_get_decode_scratch_size();
1183 
1184 #if CONFIG_FREEZE
1185 		freezer_context_global.freezer_ctx_compressor_scratch_buf = buf;
1186 		buf += vm_compressor_get_encode_scratch_size();
1187 		bufsize -= vm_compressor_get_encode_scratch_size();
1188 #endif
1189 
1190 #if RECORD_THE_COMPRESSED_DATA
1191 		c_compressed_record_sbuf = buf;
1192 		c_compressed_record_cptr = buf;
1193 		c_compressed_record_ebuf = c_compressed_record_sbuf + c_compressed_record_sbuf_size;
1194 		buf += c_compressed_record_sbuf_size;
1195 		bufsize -= c_compressed_record_sbuf_size;
1196 #endif
1197 		assert(bufsize == 0);
1198 	}
1199 
1200 	if (kernel_thread_start_priority((thread_continue_t)vm_compressor_swap_trigger_thread, NULL,
1201 	    BASEPRI_VM, &thread) != KERN_SUCCESS) {
1202 		panic("vm_compressor_swap_trigger_thread: create failed");
1203 	}
1204 	thread_deallocate(thread);
1205 
1206 	if (vm_pageout_internal_start() != KERN_SUCCESS) {
1207 		panic("vm_compressor_init: Failed to start the internal pageout thread.");
1208 	}
1209 	if (VM_CONFIG_SWAP_IS_PRESENT) {
1210 		vm_compressor_swap_init();
1211 	}
1212 
1213 	if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
1214 		vm_compressor_is_active = 1;
1215 	}
1216 
1217 #if CONFIG_FREEZE
1218 	memorystatus_freeze_enabled = TRUE;
1219 #endif /* CONFIG_FREEZE */
1220 
1221 	vm_compressor_available = 1;
1222 
1223 	vm_page_reactivate_all_throttled();
1224 
1225 	bzero(&vmcs_stats, sizeof(struct vm_compressor_swapper_stats));
1226 }
1227 
1228 
1229 #if VALIDATE_C_SEGMENTS
1230 
1231 static void
1232 c_seg_validate(c_segment_t c_seg, boolean_t must_be_compact)
1233 {
1234 	uint16_t        c_indx;
1235 	int32_t         bytes_used;
1236 	uint32_t        c_rounded_size;
1237 	uint32_t        c_size;
1238 	c_slot_t        cs;
1239 
1240 	if (__probable(validate_c_segs == FALSE)) {
1241 		return;
1242 	}
1243 	if (c_seg->c_firstemptyslot < c_seg->c_nextslot) {
1244 		c_indx = c_seg->c_firstemptyslot;
1245 		cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
1246 
1247 		if (cs == NULL) {
1248 			panic("c_seg_validate:  no slot backing c_firstemptyslot");
1249 		}
1250 
1251 		if (cs->c_size) {
1252 			panic("c_seg_validate:  c_firstemptyslot has non-zero size (%d)", cs->c_size);
1253 		}
1254 	}
1255 	bytes_used = 0;
1256 
1257 	for (c_indx = 0; c_indx < c_seg->c_nextslot; c_indx++) {
1258 		cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
1259 
1260 		c_size = UNPACK_C_SIZE(cs);
1261 
1262 		c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;
1263 
1264 		bytes_used += c_rounded_size;
1265 
1266 #if CHECKSUM_THE_COMPRESSED_DATA
1267 		unsigned csvhash;
1268 		if (c_size && cs->c_hash_compressed_data != (csvhash = vmc_hash((char *)&c_seg->c_store.c_buffer[cs->c_offset], c_size))) {
1269 			addr64_t csvphys = kvtophys((vm_offset_t)&c_seg->c_store.c_buffer[cs->c_offset]);
1270 			panic("Compressed data doesn't match original %p phys: 0x%llx %d %p %d %d 0x%x 0x%x", c_seg, csvphys, cs->c_offset, cs, c_indx, c_size, cs->c_hash_compressed_data, csvhash);
1271 		}
1272 #endif
1273 #if POPCOUNT_THE_COMPRESSED_DATA
1274 		unsigned csvpop;
1275 		if (c_size) {
1276 			uintptr_t csvaddr = (uintptr_t) &c_seg->c_store.c_buffer[cs->c_offset];
1277 			if (cs->c_pop_cdata != (csvpop = vmc_pop(csvaddr, c_size))) {
1278 				panic("Compressed data popcount doesn't match original, bit distance: %d %p (phys: %p) %p %p 0x%llx 0x%x 0x%x 0x%x", (csvpop - cs->c_pop_cdata), (void *)csvaddr, (void *) kvtophys(csvaddr), c_seg, cs, (uint64_t)cs->c_offset, c_size, csvpop, cs->c_pop_cdata);
1279 			}
1280 		}
1281 #endif
1282 	}
1283 
1284 	if (bytes_used != c_seg->c_bytes_used) {
1285 		panic("c_seg_validate: bytes_used mismatch - found %d, segment has %d", bytes_used, c_seg->c_bytes_used);
1286 	}
1287 
1288 	if (c_seg->c_bytes_used > C_SEG_OFFSET_TO_BYTES((int32_t)c_seg->c_nextoffset)) {
1289 		panic("c_seg_validate: c_bytes_used > c_nextoffset - c_nextoffset = %d,  c_bytes_used = %d",
1290 		    (int32_t)C_SEG_OFFSET_TO_BYTES((int32_t)c_seg->c_nextoffset), c_seg->c_bytes_used);
1291 	}
1292 
1293 	if (must_be_compact) {
1294 		if (c_seg->c_bytes_used != C_SEG_OFFSET_TO_BYTES((int32_t)c_seg->c_nextoffset)) {
1295 			panic("c_seg_validate: c_bytes_used doesn't match c_nextoffset - c_nextoffset = %d,  c_bytes_used = %d",
1296 			    (int32_t)C_SEG_OFFSET_TO_BYTES((int32_t)c_seg->c_nextoffset), c_seg->c_bytes_used);
1297 		}
1298 	}
1299 }
1300 
1301 #endif
1302 
1303 
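/*
 * Queue c_seg for a delayed minor compaction. If the caller doesn't already
 * hold c_list_lock we take it here, marking the segment busy while we drop
 * and retake its lock so the c_list_lock -> c_seg lock ordering is respected.
 */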
1304 void
1305 c_seg_need_delayed_compaction(c_segment_t c_seg, boolean_t c_list_lock_held)
1306 {
1307 	boolean_t       clear_busy = FALSE;
1308 
1309 	if (c_list_lock_held == FALSE) {
1310 		if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
1311 			C_SEG_BUSY(c_seg);
1312 
1313 			lck_mtx_unlock_always(&c_seg->c_lock);
1314 			lck_mtx_lock_spin_always(c_list_lock);
1315 			lck_mtx_lock_spin_always(&c_seg->c_lock);
1316 
1317 			clear_busy = TRUE;
1318 		}
1319 	}
1320 	assert(c_seg->c_state != C_IS_FILLING);
1321 
1322 	if (!c_seg->c_on_minorcompact_q && !(C_SEG_IS_ON_DISK_OR_SOQ(c_seg)) && !c_seg->c_has_donated_pages) {
1323 		queue_enter(&c_minor_list_head, c_seg, c_segment_t, c_list);
1324 		c_seg->c_on_minorcompact_q = 1;
1325 		c_minor_count++;
1326 	}
1327 	if (c_list_lock_held == FALSE) {
1328 		lck_mtx_unlock_always(c_list_lock);
1329 	}
1330 
1331 	if (clear_busy == TRUE) {
1332 		C_SEG_WAKEUP_DONE(c_seg);
1333 	}
1334 }
1335 
1336 
1337 unsigned int c_seg_moved_to_sparse_list = 0;
1338 
1339 void
1340 c_seg_move_to_sparse_list(c_segment_t c_seg)
1341 {
1342 	boolean_t       clear_busy = FALSE;
1343 
1344 	if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
1345 		C_SEG_BUSY(c_seg);
1346 
1347 		lck_mtx_unlock_always(&c_seg->c_lock);
1348 		lck_mtx_lock_spin_always(c_list_lock);
1349 		lck_mtx_lock_spin_always(&c_seg->c_lock);
1350 
1351 		clear_busy = TRUE;
1352 	}
1353 	c_seg_switch_state(c_seg, C_ON_SWAPPEDOUTSPARSE_Q, FALSE);
1354 
1355 	c_seg_moved_to_sparse_list++;
1356 
1357 	lck_mtx_unlock_always(c_list_lock);
1358 
1359 	if (clear_busy == TRUE) {
1360 		C_SEG_WAKEUP_DONE(c_seg);
1361 	}
1362 }
1363 
1364 
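/*
 * Insert c_seg into qhead, keeping the queue ordered by ascending
 * c_generation_id (a simple walk-from-the-head insertion).
 */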
1365 void
1366 c_seg_insert_into_q(queue_head_t *qhead, c_segment_t c_seg)
1367 {
1368 	c_segment_t c_seg_next;
1369 
1370 	if (queue_empty(qhead)) {
1371 		queue_enter(qhead, c_seg, c_segment_t, c_age_list);
1372 	} else {
1373 		c_seg_next = (c_segment_t)queue_first(qhead);
1374 
1375 		while (TRUE) {
1376 			if (c_seg->c_generation_id < c_seg_next->c_generation_id) {
1377 				queue_insert_before(qhead, c_seg, c_seg_next, c_segment_t, c_age_list);
1378 				break;
1379 			}
1380 			c_seg_next = (c_segment_t) queue_next(&c_seg_next->c_age_list);
1381 
1382 			if (queue_end(qhead, (queue_entry_t) c_seg_next)) {
1383 				queue_enter(qhead, c_seg, c_segment_t, c_age_list);
1384 				break;
1385 			}
1386 		}
1387 	}
1388 }
1389 
1390 
1391 int try_minor_compaction_failed = 0;
1392 int try_minor_compaction_succeeded = 0;
1393 
1394 void
1395 c_seg_try_minor_compaction_and_unlock(c_segment_t c_seg)
1396 {
1397 	assert(c_seg->c_on_minorcompact_q);
1398 	/*
1399 	 * c_seg is currently on the delayed minor compaction
1400 	 * queue and we have c_seg locked... if we can get the
1401 	 * c_list_lock w/o blocking (if we blocked we could deadlock
1402 	 * because the lock order is c_list_lock then c_seg's lock)
1403 	 * we'll pull it from the delayed list and free it directly
1404 	 */
1405 	if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
1406 		/*
1407 		 * c_list_lock is held, we need to bail
1408 		 */
1409 		try_minor_compaction_failed++;
1410 
1411 		lck_mtx_unlock_always(&c_seg->c_lock);
1412 	} else {
1413 		try_minor_compaction_succeeded++;
1414 
1415 		C_SEG_BUSY(c_seg);
1416 		c_seg_do_minor_compaction_and_unlock(c_seg, TRUE, FALSE, FALSE);
1417 	}
1418 }
1419 
1420 
1421 int
1422 c_seg_do_minor_compaction_and_unlock(c_segment_t c_seg, boolean_t clear_busy, boolean_t need_list_lock, boolean_t disallow_page_replacement)
1423 {
1424 	int     c_seg_freed;
1425 
1426 	assert(c_seg->c_busy);
1427 	assert(!C_SEG_IS_ON_DISK_OR_SOQ(c_seg));
1428 
1429 	/*
1430 	 * check for the case that can occur when we are not swapping
1431 	 * and this segment has been major compacted in the past
1432 	 * and moved to the majorcompact q to remove it from further
1433 	 * consideration... if the occupancy falls too low we need
1434 	 * to put it back on the age_q so that it will be considered
1435 	 * in the next major compaction sweep... if we don't do this
1436 	 * we will eventually run into the c_segments_limit
1437 	 */
1438 	if (c_seg->c_state == C_ON_MAJORCOMPACT_Q && C_SEG_SHOULD_MAJORCOMPACT_NOW(c_seg)) {
1439 		c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
1440 	}
1441 	if (!c_seg->c_on_minorcompact_q) {
1442 		if (clear_busy == TRUE) {
1443 			C_SEG_WAKEUP_DONE(c_seg);
1444 		}
1445 
1446 		lck_mtx_unlock_always(&c_seg->c_lock);
1447 
1448 		return 0;
1449 	}
1450 	queue_remove(&c_minor_list_head, c_seg, c_segment_t, c_list);
1451 	c_seg->c_on_minorcompact_q = 0;
1452 	c_minor_count--;
1453 
1454 	lck_mtx_unlock_always(c_list_lock);
1455 
1456 	if (disallow_page_replacement == TRUE) {
1457 		lck_mtx_unlock_always(&c_seg->c_lock);
1458 
1459 		PAGE_REPLACEMENT_DISALLOWED(TRUE);
1460 
1461 		lck_mtx_lock_spin_always(&c_seg->c_lock);
1462 	}
1463 	c_seg_freed = c_seg_minor_compaction_and_unlock(c_seg, clear_busy);
1464 
1465 	if (disallow_page_replacement == TRUE) {
1466 		PAGE_REPLACEMENT_DISALLOWED(FALSE);
1467 	}
1468 
1469 	if (need_list_lock == TRUE) {
1470 		lck_mtx_lock_spin_always(c_list_lock);
1471 	}
1472 
1473 	return c_seg_freed;
1474 }
1475 
1476 void
1477 kdp_compressor_busy_find_owner(event64_t wait_event, thread_waitinfo_t *waitinfo)
1478 {
1479 	c_segment_t c_seg = (c_segment_t) wait_event;
1480 
1481 	waitinfo->owner = thread_tid(c_seg->c_busy_for_thread);
1482 	waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(c_seg);
1483 }
1484 
1485 #if DEVELOPMENT || DEBUG
1486 int
1487 do_cseg_wedge_thread(void)
1488 {
1489 	struct c_segment c_seg;
1490 	c_seg.c_busy_for_thread = current_thread();
1491 
1492 	debug_cseg_wait_event = (event_t) &c_seg;
1493 
1494 	thread_set_pending_block_hint(current_thread(), kThreadWaitCompressor);
1495 	assert_wait((event_t) (&c_seg), THREAD_INTERRUPTIBLE);
1496 
1497 	thread_block(THREAD_CONTINUE_NULL);
1498 
1499 	return 0;
1500 }
1501 
1502 int
1503 do_cseg_unwedge_thread(void)
1504 {
1505 	thread_wakeup(debug_cseg_wait_event);
1506 	debug_cseg_wait_event = NULL;
1507 
1508 	return 0;
1509 }
1510 #endif /* DEVELOPMENT || DEBUG */
1511 
1512 void
1513 c_seg_wait_on_busy(c_segment_t c_seg)
1514 {
1515 	c_seg->c_wanted = 1;
1516 
1517 	thread_set_pending_block_hint(current_thread(), kThreadWaitCompressor);
1518 	assert_wait((event_t) (c_seg), THREAD_UNINT);
1519 
1520 	lck_mtx_unlock_always(&c_seg->c_lock);
1521 	thread_block(THREAD_CONTINUE_NULL);
1522 }
1523 
1524 #if CONFIG_FREEZE
1525 /*
1526  * We don't have the task lock held while updating the task's
1527  * c_seg queues. We can do that because of the following restrictions:
1528  *
1529  * - SINGLE FREEZER CONTEXT:
1530  *   We 'insert' c_segs into the task list on the task_freeze path.
1531  *   There can only be one such freeze in progress and the task
1532  *   isn't disappearing because we have the VM map lock held throughout
1533  *   and we have a reference on the proc too.
1534  *
1535  * - SINGLE TASK DISOWN CONTEXT:
1536  *   We 'disown' c_segs of a task ONLY from the task_terminate context. So
1537  *   we don't need the task lock but we need the c_list_lock and the
1538  *   compressor master lock (shared). We also hold the individual
1539  *   c_seg locks (exclusive).
1540  *
1541  *   If we either:
1542  *   - can't get the c_seg lock on a try, then we start again because maybe
1543  *   the c_seg is part of a compaction and might get freed. So we can't trust
1544  *   that linkage and need to restart our queue traversal.
1545  *   - OR, we run into a busy c_seg (say being swapped in or free-ing) we
1546  *   drop all locks again and wait and restart our queue traversal.
1547  *
1548  * - The new_owner_task below is currently only the kernel or NULL.
1549  *
1550  */
1551 void
1552 c_seg_update_task_owner(c_segment_t c_seg, task_t new_owner_task)
1553 {
1554 	task_t          owner_task = c_seg->c_task_owner;
1555 	uint64_t        uncompressed_bytes = ((c_seg->c_slots_used) * PAGE_SIZE_64);
1556 
1557 	LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED);
1558 	LCK_MTX_ASSERT(&c_seg->c_lock, LCK_MTX_ASSERT_OWNED);
1559 
1560 	if (owner_task) {
1561 		task_update_frozen_to_swap_acct(owner_task, uncompressed_bytes, DEBIT_FROM_SWAP);
1562 		queue_remove(&owner_task->task_frozen_cseg_q, c_seg,
1563 		    c_segment_t, c_task_list_next_cseg);
1564 	}
1565 
1566 	if (new_owner_task) {
1567 		queue_enter(&new_owner_task->task_frozen_cseg_q, c_seg,
1568 		    c_segment_t, c_task_list_next_cseg);
1569 		task_update_frozen_to_swap_acct(new_owner_task, uncompressed_bytes, CREDIT_TO_SWAP);
1570 	}
1571 
1572 	c_seg->c_task_owner = new_owner_task;
1573 }
1574 
1575 void
1576 task_disown_frozen_csegs(task_t owner_task)
1577 {
1578 	c_segment_t c_seg = NULL, next_cseg = NULL;
1579 
1580 again:
1581 	PAGE_REPLACEMENT_DISALLOWED(TRUE);
1582 	lck_mtx_lock_spin_always(c_list_lock);
1583 
1584 	for (c_seg = (c_segment_t) queue_first(&owner_task->task_frozen_cseg_q);
1585 	    !queue_end(&owner_task->task_frozen_cseg_q, (queue_entry_t) c_seg);
1586 	    c_seg = next_cseg) {
1587 		next_cseg = (c_segment_t) queue_next(&c_seg->c_task_list_next_cseg);
1588 
1589 		if (!lck_mtx_try_lock_spin_always(&c_seg->c_lock)) {
1590 			lck_mtx_unlock(c_list_lock);
1591 			PAGE_REPLACEMENT_DISALLOWED(FALSE);
1592 			goto again;
1593 		}
1594 
1595 		if (c_seg->c_busy) {
1596 			lck_mtx_unlock(c_list_lock);
1597 			PAGE_REPLACEMENT_DISALLOWED(FALSE);
1598 
1599 			c_seg_wait_on_busy(c_seg);
1600 
1601 			goto again;
1602 		}
1603 		assert(c_seg->c_task_owner == owner_task);
1604 		c_seg_update_task_owner(c_seg, kernel_task);
1605 		lck_mtx_unlock_always(&c_seg->c_lock);
1606 	}
1607 
1608 	lck_mtx_unlock(c_list_lock);
1609 	PAGE_REPLACEMENT_DISALLOWED(FALSE);
1610 }
1611 #endif /* CONFIG_FREEZE */
1612 
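/*
 * Move a c_seg from its current state queue to the queue for new_state.
 * In outline: assert that the (old_state -> new_state) transition is one
 * we expect, dequeue the segment from the old state's list and drop that
 * list's count, then enqueue it (at the head if insert_head is TRUE) on
 * the new state's list and bump that count.  The "donate" queues resolve
 * to the early swapout/swappedin lists on macOS and to the late lists on
 * other swap-capable platforms, per the comment below.
 */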
1613 void
1614 c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head)
1615 {
1616 	int     old_state = c_seg->c_state;
1617 	queue_head_t *donate_swapout_list_head, *donate_swappedin_list_head;
1618 	uint32_t     *donate_swapout_count, *donate_swappedin_count;
1619 
1620 	/*
1621 	 * On macOS the donate queue is swapped out first, i.e. it is the c_early_swapout queue.
1622 	 * On other swap-capable platforms, we want to swap those out last. So we
1623 	 * use the c_late_swapout queue.
1624 	 */
1625 #if XNU_TARGET_OS_OSX
1626 #if (DEVELOPMENT || DEBUG)
1627 	if (new_state != C_IS_FILLING) {
1628 		LCK_MTX_ASSERT(&c_seg->c_lock, LCK_MTX_ASSERT_OWNED);
1629 	}
1630 	LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED);
1631 #endif /* DEVELOPMENT || DEBUG */
1632 
1633 	donate_swapout_list_head = &c_early_swapout_list_head;
1634 	donate_swapout_count = &c_early_swapout_count;
1635 	donate_swappedin_list_head = &c_early_swappedin_list_head;
1636 	donate_swappedin_count = &c_early_swappedin_count;
1637 #else /* XNU_TARGET_OS_OSX */
1638 	donate_swapout_list_head = &c_late_swapout_list_head;
1639 	donate_swapout_count = &c_late_swapout_count;
1640 	donate_swappedin_list_head = &c_late_swappedin_list_head;
1641 	donate_swappedin_count = &c_late_swappedin_count;
1642 #endif /* XNU_TARGET_OS_OSX */
1643 
1644 	switch (old_state) {
1645 	case C_IS_EMPTY:
1646 		assert(new_state == C_IS_FILLING || new_state == C_IS_FREE);
1647 
1648 		c_empty_count--;
1649 		break;
1650 
1651 	case C_IS_FILLING:
1652 		assert(new_state == C_ON_AGE_Q || new_state == C_ON_SWAPOUT_Q);
1653 
1654 		queue_remove(&c_filling_list_head, c_seg, c_segment_t, c_age_list);
1655 		c_filling_count--;
1656 		break;
1657 
1658 	case C_ON_AGE_Q:
1659 		assert(new_state == C_ON_SWAPOUT_Q || new_state == C_ON_MAJORCOMPACT_Q ||
1660 		    new_state == C_IS_FREE);
1661 
1662 		queue_remove(&c_age_list_head, c_seg, c_segment_t, c_age_list);
1663 		c_age_count--;
1664 		break;
1665 
1666 	case C_ON_SWAPPEDIN_Q:
1667 		if (c_seg->c_has_donated_pages) {
1668 			assert(new_state == C_ON_SWAPOUT_Q || new_state == C_IS_FREE);
1669 			queue_remove(donate_swappedin_list_head, c_seg, c_segment_t, c_age_list);
1670 			*donate_swappedin_count -= 1;
1671 		} else {
1672 			assert(new_state == C_ON_AGE_Q || new_state == C_IS_FREE);
1673 #if CONFIG_FREEZE
1674 			assert(c_seg->c_has_freezer_pages);
1675 			queue_remove(&c_early_swappedin_list_head, c_seg, c_segment_t, c_age_list);
1676 			c_early_swappedin_count--;
1677 #else /* CONFIG_FREEZE */
1678 			queue_remove(&c_regular_swappedin_list_head, c_seg, c_segment_t, c_age_list);
1679 			c_regular_swappedin_count--;
1680 #endif /* CONFIG_FREEZE */
1681 		}
1682 		break;
1683 
1684 	case C_ON_SWAPOUT_Q:
1685 		assert(new_state == C_ON_AGE_Q || new_state == C_IS_FREE || new_state == C_IS_EMPTY || new_state == C_ON_SWAPIO_Q);
1686 
1687 #if CONFIG_FREEZE
1688 		if (c_seg->c_has_freezer_pages) {
1689 			if (c_seg->c_task_owner && (new_state != C_ON_SWAPIO_Q)) {
1690 				c_seg_update_task_owner(c_seg, NULL);
1691 			}
1692 			queue_remove(&c_early_swapout_list_head, c_seg, c_segment_t, c_age_list);
1693 			c_early_swapout_count--;
1694 		} else
1695 #endif /* CONFIG_FREEZE */
1696 		{
1697 			if (c_seg->c_has_donated_pages) {
1698 				queue_remove(donate_swapout_list_head, c_seg, c_segment_t, c_age_list);
1699 				*donate_swapout_count -= 1;
1700 			} else {
1701 				queue_remove(&c_regular_swapout_list_head, c_seg, c_segment_t, c_age_list);
1702 				c_regular_swapout_count--;
1703 			}
1704 		}
1705 
1706 		if (new_state == C_ON_AGE_Q) {
1707 			c_seg->c_has_donated_pages = 0;
1708 		}
1709 		thread_wakeup((event_t)&compaction_swapper_running);
1710 		break;
1711 
1712 	case C_ON_SWAPIO_Q:
1713 #if CONFIG_FREEZE
1714 		if (c_seg->c_has_freezer_pages) {
1715 			assert(new_state == C_ON_SWAPPEDOUT_Q || new_state == C_ON_SWAPPEDOUTSPARSE_Q || new_state == C_ON_AGE_Q);
1716 		} else
1717 #endif /* CONFIG_FREEZE */
1718 		{
1719 			if (c_seg->c_has_donated_pages) {
1720 				assert(new_state == C_ON_SWAPPEDOUT_Q || new_state == C_ON_SWAPPEDOUTSPARSE_Q || new_state == C_ON_SWAPPEDIN_Q);
1721 			} else {
1722 				assert(new_state == C_ON_SWAPPEDOUT_Q || new_state == C_ON_SWAPPEDOUTSPARSE_Q || new_state == C_ON_AGE_Q);
1723 			}
1724 		}
1725 
1726 		queue_remove(&c_swapio_list_head, c_seg, c_segment_t, c_age_list);
1727 		c_swapio_count--;
1728 		break;
1729 
1730 	case C_ON_SWAPPEDOUT_Q:
1731 		assert(new_state == C_ON_SWAPPEDIN_Q || new_state == C_ON_AGE_Q ||
1732 		    new_state == C_ON_SWAPPEDOUTSPARSE_Q ||
1733 		    new_state == C_ON_BAD_Q || new_state == C_IS_EMPTY || new_state == C_IS_FREE);
1734 
1735 		queue_remove(&c_swappedout_list_head, c_seg, c_segment_t, c_age_list);
1736 		c_swappedout_count--;
1737 		break;
1738 
1739 	case C_ON_SWAPPEDOUTSPARSE_Q:
1740 		assert(new_state == C_ON_SWAPPEDIN_Q || new_state == C_ON_AGE_Q ||
1741 		    new_state == C_ON_BAD_Q || new_state == C_IS_EMPTY || new_state == C_IS_FREE);
1742 
1743 		queue_remove(&c_swappedout_sparse_list_head, c_seg, c_segment_t, c_age_list);
1744 		c_swappedout_sparse_count--;
1745 		break;
1746 
1747 	case C_ON_MAJORCOMPACT_Q:
1748 		assert(new_state == C_ON_AGE_Q || new_state == C_IS_FREE);
1749 
1750 		queue_remove(&c_major_list_head, c_seg, c_segment_t, c_age_list);
1751 		c_major_count--;
1752 		break;
1753 
1754 	case C_ON_BAD_Q:
1755 		assert(new_state == C_IS_FREE);
1756 
1757 		queue_remove(&c_bad_list_head, c_seg, c_segment_t, c_age_list);
1758 		c_bad_count--;
1759 		break;
1760 
1761 	default:
1762 		panic("c_seg %p has bad c_state = %d", c_seg, old_state);
1763 	}
1764 
1765 	switch (new_state) {
1766 	case C_IS_FREE:
1767 		assert(old_state != C_IS_FILLING);
1768 
1769 		break;
1770 
1771 	case C_IS_EMPTY:
1772 		assert(old_state == C_ON_SWAPOUT_Q || old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q);
1773 
1774 		c_empty_count++;
1775 		break;
1776 
1777 	case C_IS_FILLING:
1778 		assert(old_state == C_IS_EMPTY);
1779 
1780 		queue_enter(&c_filling_list_head, c_seg, c_segment_t, c_age_list);
1781 		c_filling_count++;
1782 		break;
1783 
1784 	case C_ON_AGE_Q:
1785 		assert(old_state == C_IS_FILLING || old_state == C_ON_SWAPPEDIN_Q ||
1786 		    old_state == C_ON_SWAPOUT_Q || old_state == C_ON_SWAPIO_Q ||
1787 		    old_state == C_ON_MAJORCOMPACT_Q || old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q);
1788 
1789 		assert(!c_seg->c_has_donated_pages);
1790 		if (old_state == C_IS_FILLING) {
1791 			queue_enter(&c_age_list_head, c_seg, c_segment_t, c_age_list);
1792 		} else {
1793 			if (!queue_empty(&c_age_list_head)) {
1794 				c_segment_t     c_first;
1795 
1796 				c_first = (c_segment_t)queue_first(&c_age_list_head);
1797 				c_seg->c_creation_ts = c_first->c_creation_ts;
1798 			}
1799 			queue_enter_first(&c_age_list_head, c_seg, c_segment_t, c_age_list);
1800 		}
1801 		c_age_count++;
1802 		break;
1803 
1804 	case C_ON_SWAPPEDIN_Q:
1805 	{
1806 		queue_head_t *list_head;
1807 
1808 		assert(old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q || old_state == C_ON_SWAPIO_Q);
1809 		if (c_seg->c_has_donated_pages) {
1810 			/* Error in swapouts could happen while the c_seg is still on the swapio queue */
1811 			list_head = donate_swappedin_list_head;
1812 			*donate_swappedin_count += 1;
1813 		} else {
1814 #if CONFIG_FREEZE
1815 			assert(c_seg->c_has_freezer_pages);
1816 			list_head = &c_early_swappedin_list_head;
1817 			c_early_swappedin_count++;
1818 #else /* CONFIG_FREEZE */
1819 			list_head = &c_regular_swappedin_list_head;
1820 			c_regular_swappedin_count++;
1821 #endif /* CONFIG_FREEZE */
1822 		}
1823 
1824 		if (insert_head == TRUE) {
1825 			queue_enter_first(list_head, c_seg, c_segment_t, c_age_list);
1826 		} else {
1827 			queue_enter(list_head, c_seg, c_segment_t, c_age_list);
1828 		}
1829 		break;
1830 	}
1831 
1832 	case C_ON_SWAPOUT_Q:
1833 	{
1834 		queue_head_t *list_head;
1835 
1836 #if CONFIG_FREEZE
1837 		/*
1838 		 * A segment with both identities (frozen + donated pages)
1839 		 * is put on the early swapout queue, i.e. the frozen identity wins.
1840 		 * This is because when both identities are set, the donation bit
1841 		 * is only set later, in the c_current_seg_filled path, for
1842 		 * accounting purposes.
1843 		 */
1844 		if (c_seg->c_has_freezer_pages) {
1845 			assert(old_state == C_ON_AGE_Q || old_state == C_IS_FILLING);
1846 			list_head = &c_early_swapout_list_head;
1847 			c_early_swapout_count++;
1848 		} else
1849 #endif
1850 		{
1851 			if (c_seg->c_has_donated_pages) {
1852 				assert(old_state == C_ON_SWAPPEDIN_Q || old_state == C_IS_FILLING);
1853 				list_head = donate_swapout_list_head;
1854 				*donate_swapout_count += 1;
1855 			} else {
1856 				assert(old_state == C_ON_AGE_Q || old_state == C_IS_FILLING);
1857 				list_head = &c_regular_swapout_list_head;
1858 				c_regular_swapout_count++;
1859 			}
1860 		}
1861 
1862 		if (insert_head == TRUE) {
1863 			queue_enter_first(list_head, c_seg, c_segment_t, c_age_list);
1864 		} else {
1865 			queue_enter(list_head, c_seg, c_segment_t, c_age_list);
1866 		}
1867 		break;
1868 	}
1869 
1870 	case C_ON_SWAPIO_Q:
1871 		assert(old_state == C_ON_SWAPOUT_Q);
1872 
1873 		if (insert_head == TRUE) {
1874 			queue_enter_first(&c_swapio_list_head, c_seg, c_segment_t, c_age_list);
1875 		} else {
1876 			queue_enter(&c_swapio_list_head, c_seg, c_segment_t, c_age_list);
1877 		}
1878 		c_swapio_count++;
1879 		break;
1880 
1881 	case C_ON_SWAPPEDOUT_Q:
1882 		assert(old_state == C_ON_SWAPIO_Q);
1883 
1884 		if (insert_head == TRUE) {
1885 			queue_enter_first(&c_swappedout_list_head, c_seg, c_segment_t, c_age_list);
1886 		} else {
1887 			queue_enter(&c_swappedout_list_head, c_seg, c_segment_t, c_age_list);
1888 		}
1889 		c_swappedout_count++;
1890 		break;
1891 
1892 	case C_ON_SWAPPEDOUTSPARSE_Q:
1893 		assert(old_state == C_ON_SWAPIO_Q || old_state == C_ON_SWAPPEDOUT_Q);
1894 
1895 		if (insert_head == TRUE) {
1896 			queue_enter_first(&c_swappedout_sparse_list_head, c_seg, c_segment_t, c_age_list);
1897 		} else {
1898 			queue_enter(&c_swappedout_sparse_list_head, c_seg, c_segment_t, c_age_list);
1899 		}
1900 
1901 		c_swappedout_sparse_count++;
1902 		break;
1903 
1904 	case C_ON_MAJORCOMPACT_Q:
1905 		assert(old_state == C_ON_AGE_Q);
1906 		assert(!c_seg->c_has_donated_pages);
1907 
1908 		if (insert_head == TRUE) {
1909 			queue_enter_first(&c_major_list_head, c_seg, c_segment_t, c_age_list);
1910 		} else {
1911 			queue_enter(&c_major_list_head, c_seg, c_segment_t, c_age_list);
1912 		}
1913 		c_major_count++;
1914 		break;
1915 
1916 	case C_ON_BAD_Q:
1917 		assert(old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q);
1918 
1919 		if (insert_head == TRUE) {
1920 			queue_enter_first(&c_bad_list_head, c_seg, c_segment_t, c_age_list);
1921 		} else {
1922 			queue_enter(&c_bad_list_head, c_seg, c_segment_t, c_age_list);
1923 		}
1924 		c_bad_count++;
1925 		break;
1926 
1927 	default:
1928 		panic("c_seg %p requesting bad c_state = %d", c_seg, new_state);
1929 	}
1930 	c_seg->c_state = new_state;
1931 }
1932 
1933 
1934 
1935 void
1936 c_seg_free(c_segment_t c_seg)
1937 {
1938 	assert(c_seg->c_busy);
1939 
1940 	lck_mtx_unlock_always(&c_seg->c_lock);
1941 	lck_mtx_lock_spin_always(c_list_lock);
1942 	lck_mtx_lock_spin_always(&c_seg->c_lock);
1943 
1944 	c_seg_free_locked(c_seg);
1945 }
1946 
1947 
1948 void
1949 c_seg_free_locked(c_segment_t c_seg)
1950 {
1951 	int             segno;
1952 	int             pages_populated = 0;
1953 	int32_t         *c_buffer = NULL;
1954 	uint64_t        c_swap_handle = 0;
1955 
1956 	assert(c_seg->c_busy);
1957 	assert(c_seg->c_slots_used == 0);
1958 	assert(!c_seg->c_on_minorcompact_q);
1959 	assert(!c_seg->c_busy_swapping);
1960 
1961 	if (c_seg->c_overage_swap == TRUE) {
1962 		c_overage_swapped_count--;
1963 		c_seg->c_overage_swap = FALSE;
1964 	}
1965 	if (!(C_SEG_IS_ONDISK(c_seg))) {
1966 		c_buffer = c_seg->c_store.c_buffer;
1967 	} else {
1968 		c_swap_handle = c_seg->c_store.c_swap_handle;
1969 	}
1970 
1971 	c_seg_switch_state(c_seg, C_IS_FREE, FALSE);
1972 
1973 	if (c_buffer) {
1974 		pages_populated = (round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset))) / PAGE_SIZE;
1975 		c_seg->c_store.c_buffer = NULL;
1976 	} else {
1977 #if CONFIG_FREEZE
1978 		c_seg_update_task_owner(c_seg, NULL);
1979 #endif /* CONFIG_FREEZE */
1980 
1981 		c_seg->c_store.c_swap_handle = (uint64_t)-1;
1982 	}
1983 
1984 	lck_mtx_unlock_always(&c_seg->c_lock);
1985 
1986 	lck_mtx_unlock_always(c_list_lock);
1987 
1988 	if (c_buffer) {
1989 		if (pages_populated) {
1990 			kernel_memory_depopulate((vm_offset_t)c_buffer,
1991 			    ptoa(pages_populated), KMA_COMPRESSOR,
1992 			    VM_KERN_MEMORY_COMPRESSOR);
1993 		}
1994 	} else if (c_swap_handle) {
1995 		/*
1996 		 * Free swap space on disk.
1997 		 */
1998 		vm_swap_free(c_swap_handle);
1999 	}
2000 	lck_mtx_lock_spin_always(&c_seg->c_lock);
2001 	/*
2002 	 * c_seg must remain busy until
2003 	 * after the call to vm_swap_free
2004 	 */
2005 	C_SEG_WAKEUP_DONE(c_seg);
2006 	lck_mtx_unlock_always(&c_seg->c_lock);
2007 
2008 	segno = c_seg->c_mysegno;
2009 
2010 	lck_mtx_lock_spin_always(c_list_lock);
2011 	/*
2012 	 * because the c_buffer is now associated with the segno,
2013 	 * we can't put the segno back on the free list until
2014 	 * after we have depopulated the c_buffer range, or
2015 	 * we run the risk of depopulating a range that is
2016 	 * now being used in one of the compressor heads
2017 	 */
2018 	c_segments[segno].c_segno = c_free_segno_head;
2019 	c_free_segno_head = segno;
2020 	c_segment_count--;
2021 
2022 	lck_mtx_unlock_always(c_list_lock);
2023 
2024 	lck_mtx_destroy(&c_seg->c_lock, &vm_compressor_lck_grp);
2025 
2026 	if (c_seg->c_slot_var_array_len) {
2027 		kfree_data(c_seg->c_slot_var_array,
2028 		    sizeof(struct c_slot) * c_seg->c_slot_var_array_len);
2029 	}
2030 
2031 	zfree(compressor_segment_zone, c_seg);
2032 }
2033 
2034 #if DEVELOPMENT || DEBUG
2035 int c_seg_trim_page_count = 0;
2036 #endif
2037 
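/*
 * Trim unused slots off the tail of a segment.
 * Walks c_nextslot backwards past trailing empty slots; once the last
 * non-empty slot is found, c_nextoffset is pulled back to just past its
 * (alignment-rounded) data and c_populated_offset is rounded up to the
 * next page boundary in c_seg offset units, so the pages beyond it can
 * later be depopulated.  Under DEVELOPMENT || DEBUG the number of pages
 * trimmed this way is tracked in c_seg_trim_page_count.
 */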
2038 void
2039 c_seg_trim_tail(c_segment_t c_seg)
2040 {
2041 	c_slot_t        cs;
2042 	uint32_t        c_size;
2043 	uint32_t        c_offset;
2044 	uint32_t        c_rounded_size;
2045 	uint16_t        current_nextslot;
2046 	uint32_t        current_populated_offset;
2047 
2048 	if (c_seg->c_bytes_used == 0) {
2049 		return;
2050 	}
2051 	current_nextslot = c_seg->c_nextslot;
2052 	current_populated_offset = c_seg->c_populated_offset;
2053 
2054 	while (c_seg->c_nextslot) {
2055 		cs = C_SEG_SLOT_FROM_INDEX(c_seg, (c_seg->c_nextslot - 1));
2056 
2057 		c_size = UNPACK_C_SIZE(cs);
2058 
2059 		if (c_size) {
2060 			if (current_nextslot != c_seg->c_nextslot) {
2061 				c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;
2062 				c_offset = cs->c_offset + C_SEG_BYTES_TO_OFFSET(c_rounded_size);
2063 
2064 				c_seg->c_nextoffset = c_offset;
2065 				c_seg->c_populated_offset = (c_offset + (C_SEG_BYTES_TO_OFFSET(PAGE_SIZE) - 1)) &
2066 				    ~(C_SEG_BYTES_TO_OFFSET(PAGE_SIZE) - 1);
2067 
2068 				if (c_seg->c_firstemptyslot > c_seg->c_nextslot) {
2069 					c_seg->c_firstemptyslot = c_seg->c_nextslot;
2070 				}
2071 #if DEVELOPMENT || DEBUG
2072 				c_seg_trim_page_count += ((round_page_32(C_SEG_OFFSET_TO_BYTES(current_populated_offset)) -
2073 				    round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset))) /
2074 				    PAGE_SIZE);
2075 #endif
2076 			}
2077 			break;
2078 		}
2079 		c_seg->c_nextslot--;
2080 	}
2081 	assert(c_seg->c_nextslot);
2082 }
2083 
2084 
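/*
 * Minor compaction: slide the live (non-empty) slots of a single segment
 * down over the holes left by freed slots, starting at c_firstemptyslot.
 * The data copies below may overlap, the slot back-pointers are patched
 * to their new indices, and any whole pages freed up at the tail of the
 * buffer are depopulated.  Returns 1 if the segment turned out to be
 * empty and was freed outright, 0 otherwise.  Expects c_seg to be busy
 * and its lock held on entry; clear_busy controls whether the busy bit
 * is cleared before returning.
 */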
2085 int
2086 c_seg_minor_compaction_and_unlock(c_segment_t c_seg, boolean_t clear_busy)
2087 {
2088 	c_slot_mapping_t slot_ptr;
2089 	uint32_t        c_offset = 0;
2090 	uint32_t        old_populated_offset;
2091 	uint32_t        c_rounded_size;
2092 	uint32_t        c_size;
2093 	uint16_t        c_indx = 0;
2094 	int             i;
2095 	c_slot_t        c_dst;
2096 	c_slot_t        c_src;
2097 
2098 	assert(c_seg->c_busy);
2099 
2100 #if VALIDATE_C_SEGMENTS
2101 	c_seg_validate(c_seg, FALSE);
2102 #endif
2103 	if (c_seg->c_bytes_used == 0) {
2104 		c_seg_free(c_seg);
2105 		return 1;
2106 	}
2107 	lck_mtx_unlock_always(&c_seg->c_lock);
2108 
2109 	if (c_seg->c_firstemptyslot >= c_seg->c_nextslot || C_SEG_UNUSED_BYTES(c_seg) < PAGE_SIZE) {
2110 		goto done;
2111 	}
2112 
2113 /* TODO: assert first emptyslot's c_size is actually 0 */
2114 
2115 #if DEVELOPMENT || DEBUG
2116 	C_SEG_MAKE_WRITEABLE(c_seg);
2117 #endif
2118 
2119 #if VALIDATE_C_SEGMENTS
2120 	c_seg->c_was_minor_compacted++;
2121 #endif
2122 	c_indx = c_seg->c_firstemptyslot;
2123 	c_dst = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
2124 
2125 	old_populated_offset = c_seg->c_populated_offset;
2126 	c_offset = c_dst->c_offset;
2127 
2128 	for (i = c_indx + 1; i < c_seg->c_nextslot && c_offset < c_seg->c_nextoffset; i++) {
2129 		c_src = C_SEG_SLOT_FROM_INDEX(c_seg, i);
2130 
2131 		c_size = UNPACK_C_SIZE(c_src);
2132 
2133 		if (c_size == 0) {
2134 			continue;
2135 		}
2136 
2137 		c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;
2138 /* N.B.: This memcpy may be an overlapping copy */
2139 		memcpy(&c_seg->c_store.c_buffer[c_offset], &c_seg->c_store.c_buffer[c_src->c_offset], c_rounded_size);
2140 
2141 		cslot_copy(c_dst, c_src);
2142 		c_dst->c_offset = c_offset;
2143 
2144 		slot_ptr = C_SLOT_UNPACK_PTR(c_dst);
2145 		slot_ptr->s_cindx = c_indx;
2146 
2147 		c_offset += C_SEG_BYTES_TO_OFFSET(c_rounded_size);
2148 		PACK_C_SIZE(c_src, 0);
2149 		c_indx++;
2150 
2151 		c_dst = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
2152 	}
2153 	c_seg->c_firstemptyslot = c_indx;
2154 	c_seg->c_nextslot = c_indx;
2155 	c_seg->c_nextoffset = c_offset;
2156 	c_seg->c_populated_offset = (c_offset + (C_SEG_BYTES_TO_OFFSET(PAGE_SIZE) - 1)) & ~(C_SEG_BYTES_TO_OFFSET(PAGE_SIZE) - 1);
2157 	c_seg->c_bytes_unused = 0;
2158 
2159 #if VALIDATE_C_SEGMENTS
2160 	c_seg_validate(c_seg, TRUE);
2161 #endif
2162 	if (old_populated_offset > c_seg->c_populated_offset) {
2163 		uint32_t        gc_size;
2164 		int32_t         *gc_ptr;
2165 
2166 		gc_size = C_SEG_OFFSET_TO_BYTES(old_populated_offset - c_seg->c_populated_offset);
2167 		gc_ptr = &c_seg->c_store.c_buffer[c_seg->c_populated_offset];
2168 
2169 		kernel_memory_depopulate((vm_offset_t)gc_ptr, gc_size,
2170 		    KMA_COMPRESSOR, VM_KERN_MEMORY_COMPRESSOR);
2171 	}
2172 
2173 #if DEVELOPMENT || DEBUG
2174 	C_SEG_WRITE_PROTECT(c_seg);
2175 #endif
2176 
2177 done:
2178 	if (clear_busy == TRUE) {
2179 		lck_mtx_lock_spin_always(&c_seg->c_lock);
2180 		C_SEG_WAKEUP_DONE(c_seg);
2181 		lck_mtx_unlock_always(&c_seg->c_lock);
2182 	}
2183 	return 0;
2184 }
2185 
2186 
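/*
 * Make sure there is a c_slot available for c_nextslot.
 * The first c_seg_fixed_array_len slots live in the segment itself;
 * beyond that a variable-length array is used, grown by doubling
 * (starting at c_seg_slot_var_array_min_len).  The new array is
 * allocated outside the segment lock, the old contents are copied
 * while holding the lock, and the old array is freed afterwards.
 */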
2187 static void
2188 c_seg_alloc_nextslot(c_segment_t c_seg)
2189 {
2190 	struct c_slot   *old_slot_array = NULL;
2191 	struct c_slot   *new_slot_array = NULL;
2192 	int             newlen;
2193 	int             oldlen;
2194 
2195 	if (c_seg->c_nextslot < c_seg_fixed_array_len) {
2196 		return;
2197 	}
2198 
2199 	if ((c_seg->c_nextslot - c_seg_fixed_array_len) >= c_seg->c_slot_var_array_len) {
2200 		oldlen = c_seg->c_slot_var_array_len;
2201 		old_slot_array = c_seg->c_slot_var_array;
2202 
2203 		if (oldlen == 0) {
2204 			newlen = c_seg_slot_var_array_min_len;
2205 		} else {
2206 			newlen = oldlen * 2;
2207 		}
2208 
2209 		new_slot_array = kalloc_data(sizeof(struct c_slot) * newlen,
2210 		    Z_WAITOK);
2211 
2212 		lck_mtx_lock_spin_always(&c_seg->c_lock);
2213 
2214 		if (old_slot_array) {
2215 			memcpy(new_slot_array, old_slot_array,
2216 			    sizeof(struct c_slot) * oldlen);
2217 		}
2218 
2219 		c_seg->c_slot_var_array_len = newlen;
2220 		c_seg->c_slot_var_array = new_slot_array;
2221 
2222 		lck_mtx_unlock_always(&c_seg->c_lock);
2223 
2224 		kfree_data(old_slot_array, sizeof(struct c_slot) * oldlen);
2225 	}
2226 }
2227 
2228 
2229 #define C_SEG_MAJOR_COMPACT_STATS_MAX   (30)
2230 
2231 struct {
2232 	uint64_t asked_permission;
2233 	uint64_t compactions;
2234 	uint64_t moved_slots;
2235 	uint64_t moved_bytes;
2236 	uint64_t wasted_space_in_swapouts;
2237 	uint64_t count_of_swapouts;
2238 	uint64_t count_of_freed_segs;
2239 	uint64_t bailed_compactions;
2240 	uint64_t bytes_freed_rate_us;
2241 } c_seg_major_compact_stats[C_SEG_MAJOR_COMPACT_STATS_MAX];
2242 
2243 int c_seg_major_compact_stats_now = 0;
2244 
2245 
2246 #define C_MAJOR_COMPACTION_SIZE_APPROPRIATE     ((c_seg_bufsize * 90) / 100)
2247 
2248 
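/*
 * Decide whether it is worth major-compacting c_seg_src into c_seg_dst.
 * Compaction is declined when both segments are already "appropriately"
 * full (>= 90% of c_seg_bufsize each, per the macro above) or when the
 * destination is out of offsets or slots.  For example, if c_seg_bufsize
 * were 64KB, the 90% threshold would work out to 58982 bytes.
 */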
2249 boolean_t
2250 c_seg_major_compact_ok(
2251 	c_segment_t c_seg_dst,
2252 	c_segment_t c_seg_src)
2253 {
2254 	c_seg_major_compact_stats[c_seg_major_compact_stats_now].asked_permission++;
2255 
2256 	if (c_seg_src->c_bytes_used >= C_MAJOR_COMPACTION_SIZE_APPROPRIATE &&
2257 	    c_seg_dst->c_bytes_used >= C_MAJOR_COMPACTION_SIZE_APPROPRIATE) {
2258 		return FALSE;
2259 	}
2260 
2261 	if (c_seg_dst->c_nextoffset >= c_seg_off_limit || c_seg_dst->c_nextslot >= C_SLOT_MAX_INDEX) {
2262 		/*
2263 		 * destination segment is full... can't compact
2264 		 */
2265 		return FALSE;
2266 	}
2267 
2268 	return TRUE;
2269 }
2270 
2271 
2272 boolean_t
2273 c_seg_major_compact(
2274 	c_segment_t c_seg_dst,
2275 	c_segment_t c_seg_src)
2276 {
2277 	c_slot_mapping_t slot_ptr;
2278 	uint32_t        c_rounded_size;
2279 	uint32_t        c_size;
2280 	uint16_t        dst_slot;
2281 	int             i;
2282 	c_slot_t        c_dst;
2283 	c_slot_t        c_src;
2284 	boolean_t       keep_compacting = TRUE;
2285 
2286 	/*
2287 	 * segments are not locked but they are both marked c_busy
2288 	 * which keeps c_decompress from working on them...
2289 	 * we can safely allocate new pages, move compressed data
2290 	 * from c_seg_src to c_seg_dst and update both c_segments'
2291 	 * state w/o holding the master lock
2292 	 */
2293 #if DEVELOPMENT || DEBUG
2294 	C_SEG_MAKE_WRITEABLE(c_seg_dst);
2295 #endif
2296 
2297 #if VALIDATE_C_SEGMENTS
2298 	c_seg_dst->c_was_major_compacted++;
2299 	c_seg_src->c_was_major_donor++;
2300 #endif
2301 	assertf(c_seg_dst->c_has_donated_pages == c_seg_src->c_has_donated_pages, "Mismatched donation status Dst: %p, Src: %p\n", c_seg_dst, c_seg_src);
2302 	c_seg_major_compact_stats[c_seg_major_compact_stats_now].compactions++;
2303 
2304 	dst_slot = c_seg_dst->c_nextslot;
2305 
2306 	for (i = 0; i < c_seg_src->c_nextslot; i++) {
2307 		c_src = C_SEG_SLOT_FROM_INDEX(c_seg_src, i);
2308 
2309 		c_size = UNPACK_C_SIZE(c_src);
2310 
2311 		if (c_size == 0) {
2312 			/* BATCH: move what we have so far; */
2313 			continue;
2314 		}
2315 
2316 		if (C_SEG_OFFSET_TO_BYTES(c_seg_dst->c_populated_offset - c_seg_dst->c_nextoffset) < (unsigned) c_size) {
2317 			int     size_to_populate;
2318 
2319 			/* doesn't fit */
2320 			size_to_populate = c_seg_bufsize - C_SEG_OFFSET_TO_BYTES(c_seg_dst->c_populated_offset);
2321 
2322 			if (size_to_populate == 0) {
2323 				/* can't fit */
2324 				keep_compacting = FALSE;
2325 				break;
2326 			}
2327 			if (size_to_populate > C_SEG_MAX_POPULATE_SIZE) {
2328 				size_to_populate = C_SEG_MAX_POPULATE_SIZE;
2329 			}
2330 
2331 			kernel_memory_populate(
2332 				(vm_offset_t) &c_seg_dst->c_store.c_buffer[c_seg_dst->c_populated_offset],
2333 				size_to_populate,
2334 				KMA_NOFAIL | KMA_COMPRESSOR,
2335 				VM_KERN_MEMORY_COMPRESSOR);
2336 
2337 			c_seg_dst->c_populated_offset += C_SEG_BYTES_TO_OFFSET(size_to_populate);
2338 			assert(C_SEG_OFFSET_TO_BYTES(c_seg_dst->c_populated_offset) <= c_seg_bufsize);
2339 		}
2340 		c_seg_alloc_nextslot(c_seg_dst);
2341 
2342 		c_dst = C_SEG_SLOT_FROM_INDEX(c_seg_dst, c_seg_dst->c_nextslot);
2343 
2344 		memcpy(&c_seg_dst->c_store.c_buffer[c_seg_dst->c_nextoffset], &c_seg_src->c_store.c_buffer[c_src->c_offset], c_size);
2345 
2346 		c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;
2347 
2348 		c_seg_major_compact_stats[c_seg_major_compact_stats_now].moved_slots++;
2349 		c_seg_major_compact_stats[c_seg_major_compact_stats_now].moved_bytes += c_size;
2350 
2351 		cslot_copy(c_dst, c_src);
2352 		c_dst->c_offset = c_seg_dst->c_nextoffset;
2353 
2354 		if (c_seg_dst->c_firstemptyslot == c_seg_dst->c_nextslot) {
2355 			c_seg_dst->c_firstemptyslot++;
2356 		}
2357 		c_seg_dst->c_slots_used++;
2358 		c_seg_dst->c_nextslot++;
2359 		c_seg_dst->c_bytes_used += c_rounded_size;
2360 		c_seg_dst->c_nextoffset += C_SEG_BYTES_TO_OFFSET(c_rounded_size);
2361 
2362 		PACK_C_SIZE(c_src, 0);
2363 
2364 		c_seg_src->c_bytes_used -= c_rounded_size;
2365 		c_seg_src->c_bytes_unused += c_rounded_size;
2366 		c_seg_src->c_firstemptyslot = 0;
2367 
2368 		assert(c_seg_src->c_slots_used);
2369 		c_seg_src->c_slots_used--;
2370 
2371 		if (!c_seg_src->c_swappedin) {
2372 			/* Pessimistically lose swappedin status when non-swappedin pages are added. */
2373 			c_seg_dst->c_swappedin = false;
2374 		}
2375 
2376 		if (c_seg_dst->c_nextoffset >= c_seg_off_limit || c_seg_dst->c_nextslot >= C_SLOT_MAX_INDEX) {
2377 			/* dest segment is now full */
2378 			keep_compacting = FALSE;
2379 			break;
2380 		}
2381 	}
2382 #if DEVELOPMENT || DEBUG
2383 	C_SEG_WRITE_PROTECT(c_seg_dst);
2384 #endif
2385 	if (dst_slot < c_seg_dst->c_nextslot) {
2386 		PAGE_REPLACEMENT_ALLOWED(TRUE);
2387 		/*
2388 		 * we've now locked out c_decompress from
2389 		 * converting the slot passed into it into
2390 		 * a c_segment_t which allows us to use
2391 		 * the backptr to change which c_segment and
2392 		 * index the slot points to
2393 		 */
2394 		while (dst_slot < c_seg_dst->c_nextslot) {
2395 			c_dst = C_SEG_SLOT_FROM_INDEX(c_seg_dst, dst_slot);
2396 
2397 			slot_ptr = C_SLOT_UNPACK_PTR(c_dst);
2398 			/* <csegno=0,indx=0> would mean "empty slot", so use csegno+1 */
2399 			slot_ptr->s_cseg = c_seg_dst->c_mysegno + 1;
2400 			slot_ptr->s_cindx = dst_slot++;
2401 		}
2402 		PAGE_REPLACEMENT_ALLOWED(FALSE);
2403 	}
2404 	return keep_compacting;
2405 }
2406 
2407 
2408 uint64_t
2409 vm_compressor_compute_elapsed_msecs(clock_sec_t end_sec, clock_nsec_t end_nsec, clock_sec_t start_sec, clock_nsec_t start_nsec)
2410 {
2411 	uint64_t end_msecs;
2412 	uint64_t start_msecs;
2413 
2414 	end_msecs = (end_sec * 1000) + end_nsec / 1000000;
2415 	start_msecs = (start_sec * 1000) + start_nsec / 1000000;
2416 
2417 	return end_msecs - start_msecs;
2418 }
2419 
2420 
2421 
2422 uint32_t compressor_eval_period_in_msecs = 250;
2423 uint32_t compressor_sample_min_in_msecs = 500;
2424 uint32_t compressor_sample_max_in_msecs = 10000;
2425 uint32_t compressor_thrashing_threshold_per_10msecs = 50;
2426 uint32_t compressor_thrashing_min_per_10msecs = 20;
2427 
2428 /* When true, reset sample data next chance we get. */
2429 static boolean_t        compressor_need_sample_reset = FALSE;
2430 
2431 
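/*
 * Thrashing heuristic: look at the ages of the c_segs we recently had to
 * decompress from.  Roughly, the loop below finds the oldest age bucket
 * needed to cover ~95% of the sampled decompressions (working_target) and
 * uses the age-weighted sum as a working-set approximation, scaled to a
 * per-second rate.  If that working set would still fit in the compressor
 * pool (VM_PAGE_COMPRESSOR_COUNT), swapout_target_age is set so that only
 * segments older than the age covering all but ~1% (aging_target) of the
 * decompressions become candidates for swapout.
 */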
2432 void
2433 compute_swapout_target_age(void)
2434 {
2435 	clock_sec_t     cur_ts_sec;
2436 	clock_nsec_t    cur_ts_nsec;
2437 	uint32_t        min_operations_needed_in_this_sample;
2438 	uint64_t        elapsed_msecs_in_eval;
2439 	uint64_t        elapsed_msecs_in_sample;
2440 	boolean_t       need_eval_reset = FALSE;
2441 
2442 	clock_get_system_nanotime(&cur_ts_sec, &cur_ts_nsec);
2443 
2444 	elapsed_msecs_in_sample = vm_compressor_compute_elapsed_msecs(cur_ts_sec, cur_ts_nsec, start_of_sample_period_sec, start_of_sample_period_nsec);
2445 
2446 	if (compressor_need_sample_reset ||
2447 	    elapsed_msecs_in_sample >= compressor_sample_max_in_msecs) {
2448 		compressor_need_sample_reset = TRUE;
2449 		need_eval_reset = TRUE;
2450 		goto done;
2451 	}
2452 	elapsed_msecs_in_eval = vm_compressor_compute_elapsed_msecs(cur_ts_sec, cur_ts_nsec, start_of_eval_period_sec, start_of_eval_period_nsec);
2453 
2454 	if (elapsed_msecs_in_eval < compressor_eval_period_in_msecs) {
2455 		goto done;
2456 	}
2457 	need_eval_reset = TRUE;
2458 
2459 	KERNEL_DEBUG(0xe0400020 | DBG_FUNC_START, elapsed_msecs_in_eval, sample_period_compression_count, sample_period_decompression_count, 0, 0);
2460 
2461 	min_operations_needed_in_this_sample = (compressor_thrashing_min_per_10msecs * (uint32_t)elapsed_msecs_in_eval) / 10;
2462 
2463 	if ((sample_period_compression_count - last_eval_compression_count) < min_operations_needed_in_this_sample ||
2464 	    (sample_period_decompression_count - last_eval_decompression_count) < min_operations_needed_in_this_sample) {
2465 		KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, sample_period_compression_count - last_eval_compression_count,
2466 		    sample_period_decompression_count - last_eval_decompression_count, 0, 1, 0);
2467 
2468 		swapout_target_age = 0;
2469 
2470 		compressor_need_sample_reset = TRUE;
2471 		need_eval_reset = TRUE;
2472 		goto done;
2473 	}
2474 	last_eval_compression_count = sample_period_compression_count;
2475 	last_eval_decompression_count = sample_period_decompression_count;
2476 
2477 	if (elapsed_msecs_in_sample < compressor_sample_min_in_msecs) {
2478 		KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, swapout_target_age, 0, 0, 5, 0);
2479 		goto done;
2480 	}
2481 	if (sample_period_decompression_count > ((compressor_thrashing_threshold_per_10msecs * elapsed_msecs_in_sample) / 10)) {
2482 		uint64_t        running_total;
2483 		uint64_t        working_target;
2484 		uint64_t        aging_target;
2485 		uint32_t        oldest_age_of_csegs_sampled = 0;
2486 		uint64_t        working_set_approximation = 0;
2487 
2488 		swapout_target_age = 0;
2489 
2490 		working_target = (sample_period_decompression_count / 100) * 95;                /* 95 percent */
2491 		aging_target = (sample_period_decompression_count / 100) * 1;                   /* 1 percent */
2492 		running_total = 0;
2493 
2494 		for (oldest_age_of_csegs_sampled = 0; oldest_age_of_csegs_sampled < DECOMPRESSION_SAMPLE_MAX_AGE; oldest_age_of_csegs_sampled++) {
2495 			running_total += age_of_decompressions_during_sample_period[oldest_age_of_csegs_sampled];
2496 
2497 			working_set_approximation += oldest_age_of_csegs_sampled * age_of_decompressions_during_sample_period[oldest_age_of_csegs_sampled];
2498 
2499 			if (running_total >= working_target) {
2500 				break;
2501 			}
2502 		}
2503 		if (oldest_age_of_csegs_sampled < DECOMPRESSION_SAMPLE_MAX_AGE) {
2504 			working_set_approximation = (working_set_approximation * 1000) / elapsed_msecs_in_sample;
2505 
2506 			if (working_set_approximation < VM_PAGE_COMPRESSOR_COUNT) {
2507 				running_total = overage_decompressions_during_sample_period;
2508 
2509 				for (oldest_age_of_csegs_sampled = DECOMPRESSION_SAMPLE_MAX_AGE - 1; oldest_age_of_csegs_sampled; oldest_age_of_csegs_sampled--) {
2510 					running_total += age_of_decompressions_during_sample_period[oldest_age_of_csegs_sampled];
2511 
2512 					if (running_total >= aging_target) {
2513 						break;
2514 					}
2515 				}
2516 				swapout_target_age = (uint32_t)cur_ts_sec - oldest_age_of_csegs_sampled;
2517 
2518 				KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, swapout_target_age, working_set_approximation, VM_PAGE_COMPRESSOR_COUNT, 2, 0);
2519 			} else {
2520 				KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, working_set_approximation, VM_PAGE_COMPRESSOR_COUNT, 0, 3, 0);
2521 			}
2522 		} else {
2523 			KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, working_target, running_total, 0, 4, 0);
2524 		}
2525 
2526 		compressor_need_sample_reset = TRUE;
2527 		need_eval_reset = TRUE;
2528 	} else {
2529 		KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, sample_period_decompression_count, (compressor_thrashing_threshold_per_10msecs * elapsed_msecs_in_sample) / 10, 0, 6, 0);
2530 	}
2531 done:
2532 	if (compressor_need_sample_reset == TRUE) {
2533 		bzero(age_of_decompressions_during_sample_period, sizeof(age_of_decompressions_during_sample_period));
2534 		overage_decompressions_during_sample_period = 0;
2535 
2536 		start_of_sample_period_sec = cur_ts_sec;
2537 		start_of_sample_period_nsec = cur_ts_nsec;
2538 		sample_period_decompression_count = 0;
2539 		sample_period_compression_count = 0;
2540 		last_eval_decompression_count = 0;
2541 		last_eval_compression_count = 0;
2542 		compressor_need_sample_reset = FALSE;
2543 	}
2544 	if (need_eval_reset == TRUE) {
2545 		start_of_eval_period_sec = cur_ts_sec;
2546 		start_of_eval_period_nsec = cur_ts_nsec;
2547 	}
2548 }
2549 
2550 
2551 int             compaction_swapper_init_now = 0;
2552 int             compaction_swapper_running = 0;
2553 int             compaction_swapper_awakened = 0;
2554 int             compaction_swapper_abort = 0;
2555 
2556 bool
2557 vm_compressor_swapout_is_ripe()
2558 {
2559 	bool is_ripe = false;
2560 	if (vm_swapout_ripe_segments == TRUE && c_overage_swapped_count < c_overage_swapped_limit) {
2561 		c_segment_t     c_seg;
2562 		clock_sec_t     now;
2563 		clock_sec_t     age;
2564 		clock_nsec_t    nsec;
2565 
2566 		clock_get_system_nanotime(&now, &nsec);
2567 		age = 0;
2568 
2569 		lck_mtx_lock_spin_always(c_list_lock);
2570 
2571 		if (!queue_empty(&c_age_list_head)) {
2572 			c_seg = (c_segment_t) queue_first(&c_age_list_head);
2573 
2574 			age = now - c_seg->c_creation_ts;
2575 		}
2576 		lck_mtx_unlock_always(c_list_lock);
2577 
2578 		if (age >= vm_ripe_target_age) {
2579 			is_ripe = true;
2580 		}
2581 	}
2582 	return is_ripe;
2583 }
2584 
2585 static bool
2586 compressor_swapout_conditions_met(void)
2587 {
2588 	bool should_swap = false;
2589 	if (COMPRESSOR_NEEDS_TO_SWAP()) {
2590 		should_swap = true;
2591 		vmcs_stats.compressor_swap_threshold_exceeded++;
2592 	}
2593 	if (VM_PAGE_Q_THROTTLED(&vm_pageout_queue_external) && vm_page_anonymous_count < (vm_page_inactive_count / 20)) {
2594 		should_swap = true;
2595 		vmcs_stats.external_q_throttled++;
2596 	}
2597 	if (vm_page_free_count < (vm_page_free_reserved - (COMPRESSOR_FREE_RESERVED_LIMIT * 2))) {
2598 		should_swap = true;
2599 		vmcs_stats.free_count_below_reserve++;
2600 	}
2601 	return should_swap;
2602 }
2603 
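/*
 * Decide whether the compactor/swapper should swap segments out.
 * In order: ripe (old enough) segments when overage swapping is enabled,
 * then the general swapout conditions (threshold exceeded, external queue
 * throttled, free count below reserve), then thrashing / phantom-cache
 * pressure on platforms where that detection is enabled.  Under
 * CONFIG_JETSAM a positive answer (or low compressor space) is converted
 * into a jetsam wakeup instead of a swap.  If nothing else demands it,
 * we still return true when a major compaction is needed to reclaim
 * compressor segments.
 */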
2604 static bool
2605 compressor_needs_to_swap()
2606 {
2607 	bool should_swap = false;
2608 	if (vm_compressor_swapout_is_ripe()) {
2609 		should_swap = true;
2610 		goto check_if_low_space;
2611 	}
2612 
2613 	if (VM_CONFIG_SWAP_IS_ACTIVE) {
2614 		should_swap =  compressor_swapout_conditions_met();
2615 		if (should_swap) {
2616 			goto check_if_low_space;
2617 		}
2618 	}
2619 
2620 #if (XNU_TARGET_OS_OSX && __arm64__)
2621 	/*
2622 	 * Thrashing detection disabled.
2623 	 */
2624 #else /* (XNU_TARGET_OS_OSX && __arm64__) */
2625 
2626 	if (vm_compressor_is_thrashing()) {
2627 		should_swap = true;
2628 		vmcs_stats.thrashing_detected++;
2629 	}
2630 
2631 #if CONFIG_PHANTOM_CACHE
2632 	if (vm_phantom_cache_check_pressure()) {
2633 		os_atomic_store(&memorystatus_phantom_cache_pressure, true, release);
2634 		should_swap = true;
2635 	}
2636 #endif
2637 	if (swapout_target_age) {
2638 		should_swap = true;
2639 	}
2640 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
2641 
2642 check_if_low_space:
2643 
2644 #if CONFIG_JETSAM
2645 	if (should_swap || vm_compressor_low_on_space() == TRUE) {
2646 		if (vm_compressor_thrashing_detected == FALSE) {
2647 			vm_compressor_thrashing_detected = TRUE;
2648 
2649 			if (swapout_target_age) {
2650 				compressor_thrashing_induced_jetsam++;
2651 			} else if (vm_compressor_low_on_space() == TRUE) {
2652 				compressor_thrashing_induced_jetsam++;
2653 			} else {
2654 				filecache_thrashing_induced_jetsam++;
2655 			}
2656 			/*
2657 			 * Wake up the memorystatus thread so that it can return
2658 			 * the system to a healthy state (by killing processes).
2659 			 */
2660 			memorystatus_thread_wake();
2661 		}
2662 		/*
2663 		 * let the jetsam take precedence over
2664 		 * any major compactions we might have
2665 		 * been able to do... otherwise we run
2666 		 * the risk of doing major compactions
2667 		 * on segments we're about to free up
2668 		 * due to the jetsam activity.
2669 		 */
2670 		should_swap = false;
2671 		if (memorystatus_swap_all_apps && vm_swap_low_on_space()) {
2672 			vm_compressor_take_paging_space_action();
2673 		}
2674 	}
2675 
2676 #else /* CONFIG_JETSAM */
2677 	if (should_swap && vm_swap_low_on_space()) {
2678 		vm_compressor_take_paging_space_action();
2679 	}
2680 #endif /* CONFIG_JETSAM */
2681 
2682 	if (should_swap == false) {
2683 		/*
2684 		 * vm_compressor_needs_to_major_compact returns true only if we're
2685 		 * about to run out of available compressor segments... in this
2686 		 * case, we absolutely need to run a major compaction even if
2687 		 * we've just kicked off a jetsam or we don't otherwise need to
2688 		 * swap... terminating objects releases
2689 		 * pages back to the uncompressed cache, but does not guarantee
2690 		 * that we will free up even a single compression segment
2691 		 */
2692 		should_swap = vm_compressor_needs_to_major_compact();
2693 		if (should_swap) {
2694 			vmcs_stats.fragmentation_detected++;
2695 		}
2696 	}
2697 
2698 	/*
2699 	 * returning TRUE when swap_supported == FALSE
2700 	 * will cause the major compaction engine to
2701 	 * run, but will not trigger any swapping...
2702 	 * segments that have been major compacted
2703 	 * will be moved to the majorcompact queue
2704 	 */
2705 	return should_swap;
2706 }
2707 
2708 #if CONFIG_JETSAM
2709 /*
2710  * This function is called from the jetsam thread after killing something to
2711  * mitigate thrashing.
2712  *
2713  * We need to restart our thrashing detection heuristics since memory pressure
2714  * has potentially changed significantly, and we don't want to detect on old
2715  * data from before the jetsam.
2716  */
2717 void
2718 vm_thrashing_jetsam_done(void)
2719 {
2720 	vm_compressor_thrashing_detected = FALSE;
2721 
2722 	/* Were we compressor-thrashing or filecache-thrashing? */
2723 	if (swapout_target_age) {
2724 		swapout_target_age = 0;
2725 		compressor_need_sample_reset = TRUE;
2726 	}
2727 #if CONFIG_PHANTOM_CACHE
2728 	else {
2729 		vm_phantom_cache_restart_sample();
2730 	}
2731 #endif
2732 }
2733 #endif /* CONFIG_JETSAM */
2734 
2735 uint32_t vm_wake_compactor_swapper_calls = 0;
2736 uint32_t vm_run_compactor_already_running = 0;
2737 uint32_t vm_run_compactor_empty_minor_q = 0;
2738 uint32_t vm_run_compactor_did_compact = 0;
2739 uint32_t vm_run_compactor_waited = 0;
2740 
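/*
 * Run the delayed minor compactions inline from the current thread.
 * If the compaction swapper is already running we either bail (on
 * multi-processor configurations) or wait for it to finish (when the
 * pageout machinery is restricted to a single processor).
 */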
2741 void
2742 vm_run_compactor(void)
2743 {
2744 	if (c_segment_count == 0) {
2745 		return;
2746 	}
2747 
2748 	lck_mtx_lock_spin_always(c_list_lock);
2749 
2750 	if (c_minor_count == 0) {
2751 		vm_run_compactor_empty_minor_q++;
2752 
2753 		lck_mtx_unlock_always(c_list_lock);
2754 		return;
2755 	}
2756 	if (compaction_swapper_running) {
2757 		if (vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
2758 			vm_run_compactor_already_running++;
2759 
2760 			lck_mtx_unlock_always(c_list_lock);
2761 			return;
2762 		}
2763 		vm_run_compactor_waited++;
2764 
2765 		assert_wait((event_t)&compaction_swapper_running, THREAD_UNINT);
2766 
2767 		lck_mtx_unlock_always(c_list_lock);
2768 
2769 		thread_block(THREAD_CONTINUE_NULL);
2770 
2771 		return;
2772 	}
2773 	vm_run_compactor_did_compact++;
2774 
2775 	fastwake_warmup = FALSE;
2776 	compaction_swapper_running = 1;
2777 
2778 	vm_compressor_do_delayed_compactions(FALSE);
2779 
2780 	compaction_swapper_running = 0;
2781 
2782 	lck_mtx_unlock_always(c_list_lock);
2783 
2784 	thread_wakeup((event_t)&compaction_swapper_running);
2785 }
2786 
2787 
2788 void
2789 vm_wake_compactor_swapper(void)
2790 {
2791 	if (compaction_swapper_running || compaction_swapper_awakened || c_segment_count == 0) {
2792 		return;
2793 	}
2794 
2795 	if (c_minor_count || vm_compressor_needs_to_major_compact()) {
2796 		lck_mtx_lock_spin_always(c_list_lock);
2797 
2798 		fastwake_warmup = FALSE;
2799 
2800 		if (compaction_swapper_running == 0 && compaction_swapper_awakened == 0) {
2801 			vm_wake_compactor_swapper_calls++;
2802 
2803 			compaction_swapper_awakened = 1;
2804 			thread_wakeup((event_t)&c_compressor_swap_trigger);
2805 		}
2806 		lck_mtx_unlock_always(c_list_lock);
2807 	}
2808 }
2809 
2810 
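/*
 * Force a compact-and-swap pass over ripe segments.  Waits for (and
 * aborts) any compaction swapper currently running, temporarily enables
 * vm_swapout_ripe_segments, moves segments older than vm_ripe_target_age
 * from the major-compact queue back onto the age queue (subject to the
 * overage swap limit), and then calls vm_compressor_compact_and_swap().
 */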
2811 void
2812 vm_consider_swapping()
2813 {
2814 	c_segment_t     c_seg, c_seg_next;
2815 	clock_sec_t     now;
2816 	clock_nsec_t    nsec;
2817 
2818 	assert(VM_CONFIG_SWAP_IS_PRESENT);
2819 
2820 	lck_mtx_lock_spin_always(c_list_lock);
2821 
2822 	compaction_swapper_abort = 1;
2823 
2824 	while (compaction_swapper_running) {
2825 		assert_wait((event_t)&compaction_swapper_running, THREAD_UNINT);
2826 
2827 		lck_mtx_unlock_always(c_list_lock);
2828 
2829 		thread_block(THREAD_CONTINUE_NULL);
2830 
2831 		lck_mtx_lock_spin_always(c_list_lock);
2832 	}
2833 	compaction_swapper_abort = 0;
2834 	compaction_swapper_running = 1;
2835 
2836 	vm_swapout_ripe_segments = TRUE;
2837 
2838 	if (!queue_empty(&c_major_list_head)) {
2839 		clock_get_system_nanotime(&now, &nsec);
2840 
2841 		c_seg = (c_segment_t)queue_first(&c_major_list_head);
2842 
2843 		while (!queue_end(&c_major_list_head, (queue_entry_t)c_seg)) {
2844 			if (c_overage_swapped_count >= c_overage_swapped_limit) {
2845 				break;
2846 			}
2847 
2848 			c_seg_next = (c_segment_t) queue_next(&c_seg->c_age_list);
2849 
2850 			if ((now - c_seg->c_creation_ts) >= vm_ripe_target_age) {
2851 				lck_mtx_lock_spin_always(&c_seg->c_lock);
2852 
2853 				c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
2854 
2855 				lck_mtx_unlock_always(&c_seg->c_lock);
2856 			}
2857 			c_seg = c_seg_next;
2858 		}
2859 	}
2860 	vm_compressor_compact_and_swap(FALSE);
2861 
2862 	compaction_swapper_running = 0;
2863 
2864 	vm_swapout_ripe_segments = FALSE;
2865 
2866 	lck_mtx_unlock_always(c_list_lock);
2867 
2868 	thread_wakeup((event_t)&compaction_swapper_running);
2869 }
2870 
2871 
2872 void
2873 vm_consider_waking_compactor_swapper(void)
2874 {
2875 	boolean_t       need_wakeup = FALSE;
2876 
2877 	if (c_segment_count == 0) {
2878 		return;
2879 	}
2880 
2881 	if (compaction_swapper_running || compaction_swapper_awakened) {
2882 		return;
2883 	}
2884 
2885 	if (!compaction_swapper_inited && !compaction_swapper_init_now) {
2886 		compaction_swapper_init_now = 1;
2887 		need_wakeup = TRUE;
2888 	}
2889 
2890 	if (c_minor_count && (COMPRESSOR_NEEDS_TO_MINOR_COMPACT())) {
2891 		need_wakeup = TRUE;
2892 	} else if (compressor_needs_to_swap()) {
2893 		need_wakeup = TRUE;
2894 	} else if (c_minor_count) {
2895 		uint64_t        total_bytes;
2896 
2897 		total_bytes = compressor_object->resident_page_count * PAGE_SIZE_64;
2898 
2899 		if ((total_bytes - compressor_bytes_used) > total_bytes / 10) {
2900 			need_wakeup = TRUE;
2901 		}
2902 	}
2903 	if (need_wakeup == TRUE) {
2904 		lck_mtx_lock_spin_always(c_list_lock);
2905 
2906 		fastwake_warmup = FALSE;
2907 
2908 		if (compaction_swapper_running == 0 && compaction_swapper_awakened == 0) {
2909 			memoryshot(VM_WAKEUP_COMPACTOR_SWAPPER, DBG_FUNC_NONE);
2910 
2911 			compaction_swapper_awakened = 1;
2912 			thread_wakeup((event_t)&c_compressor_swap_trigger);
2913 		}
2914 		lck_mtx_unlock_always(c_list_lock);
2915 	}
2916 }
2917 
2918 
2919 #define C_SWAPOUT_LIMIT                 4
2920 #define DELAYED_COMPACTIONS_PER_PASS    30
2921 
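/*
 * Minor-compact the segments queued on the minor-compaction list.
 * Every DELAYED_COMPACTIONS_PER_PASS segments we check whether swapouts
 * are needed (and the swapout queue still has room below C_SWAPOUT_LIMIT);
 * if so we stop early so the caller can go do that instead.
 */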
2922 void
2923 vm_compressor_do_delayed_compactions(boolean_t flush_all)
2924 {
2925 	c_segment_t     c_seg;
2926 	int             number_compacted = 0;
2927 	boolean_t       needs_to_swap = FALSE;
2928 	uint32_t        c_swapout_count = 0;
2929 
2930 
2931 	VM_DEBUG_CONSTANT_EVENT(vm_compressor_do_delayed_compactions, VM_COMPRESSOR_DO_DELAYED_COMPACTIONS, DBG_FUNC_START, c_minor_count, flush_all, 0, 0);
2932 
2933 #if XNU_TARGET_OS_OSX
2934 	LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED);
2935 #endif /* XNU_TARGET_OS_OSX */
2936 
2937 	while (!queue_empty(&c_minor_list_head) && needs_to_swap == FALSE) {
2938 		c_seg = (c_segment_t)queue_first(&c_minor_list_head);
2939 
2940 		lck_mtx_lock_spin_always(&c_seg->c_lock);
2941 
2942 		if (c_seg->c_busy) {
2943 			lck_mtx_unlock_always(c_list_lock);
2944 			c_seg_wait_on_busy(c_seg);
2945 			lck_mtx_lock_spin_always(c_list_lock);
2946 
2947 			continue;
2948 		}
2949 		C_SEG_BUSY(c_seg);
2950 
2951 		c_seg_do_minor_compaction_and_unlock(c_seg, TRUE, FALSE, TRUE);
2952 
2953 		c_swapout_count = c_early_swapout_count + c_regular_swapout_count + c_late_swapout_count;
2954 		if (VM_CONFIG_SWAP_IS_ACTIVE && (number_compacted++ > DELAYED_COMPACTIONS_PER_PASS)) {
2955 			if ((flush_all == TRUE || compressor_needs_to_swap()) && c_swapout_count < C_SWAPOUT_LIMIT) {
2956 				needs_to_swap = TRUE;
2957 			}
2958 
2959 			number_compacted = 0;
2960 		}
2961 		lck_mtx_lock_spin_always(c_list_lock);
2962 	}
2963 
2964 	VM_DEBUG_CONSTANT_EVENT(vm_compressor_do_delayed_compactions, VM_COMPRESSOR_DO_DELAYED_COMPACTIONS, DBG_FUNC_END, c_minor_count, number_compacted, needs_to_swap, 0);
2965 }
2966 
2967 int min_csegs_per_major_compaction = DELAYED_COMPACTIONS_PER_PASS;
2968 
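/*
 * Major-compact neighbors into c_seg.  Starting from c_seg's position on
 * its queue (the age queue or the late swappedin queue), keep pulling the
 * next segment of the same state, minor-compact it, and fold its live
 * slots into c_seg until c_seg fills up, the queue runs out, or some other
 * thread is waiting on c_seg (in which case we bail and report it via
 * bail_wanted_cseg).  Returns true if c_seg was fully compacted, false if
 * we bailed early.
 */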
2969 static bool
2970 vm_compressor_major_compact_cseg(c_segment_t c_seg, uint32_t* c_seg_considered, bool* bail_wanted_cseg, uint64_t* total_bytes_freed)
2971 {
2972 	/*
2973 	 * Major compaction
2974 	 */
2975 	bool keep_compacting = true, fully_compacted = true;
2976 	queue_head_t *list_head = NULL;
2977 	c_segment_t c_seg_next;
2978 	uint64_t        bytes_to_free = 0, bytes_freed = 0;
2979 	uint32_t        number_considered = 0;
2980 
2981 	if (c_seg->c_state == C_ON_AGE_Q) {
2982 		assert(!c_seg->c_has_donated_pages);
2983 		list_head = &c_age_list_head;
2984 	} else if (c_seg->c_state == C_ON_SWAPPEDIN_Q) {
2985 		assert(c_seg->c_has_donated_pages);
2986 		list_head = &c_late_swappedin_list_head;
2987 	}
2988 
2989 	while (keep_compacting == TRUE) {
2990 		assert(c_seg->c_busy);
2991 
2992 		/* look for another segment to consolidate */
2993 
2994 		c_seg_next = (c_segment_t) queue_next(&c_seg->c_age_list);
2995 
2996 		if (queue_end(list_head, (queue_entry_t)c_seg_next)) {
2997 			break;
2998 		}
2999 
3000 		assert(c_seg_next->c_state == c_seg->c_state);
3001 
3002 		number_considered++;
3003 
3004 		if (c_seg_major_compact_ok(c_seg, c_seg_next) == FALSE) {
3005 			break;
3006 		}
3007 
3008 		lck_mtx_lock_spin_always(&c_seg_next->c_lock);
3009 
3010 		if (c_seg_next->c_busy) {
3011 			/*
3012 			 * We are going to block for our neighbor.
3013 			 * If our c_seg is wanted, we should unbusy
3014 			 * it because we don't know how long we might
3015 			 * have to block here.
3016 			 */
3017 			if (c_seg->c_wanted) {
3018 				lck_mtx_unlock_always(&c_seg_next->c_lock);
3019 				fully_compacted = false;
3020 				c_seg_major_compact_stats[c_seg_major_compact_stats_now].bailed_compactions++;
3021 				*bail_wanted_cseg = true;
3022 				break;
3023 			}
3024 
3025 			lck_mtx_unlock_always(c_list_lock);
3026 
3027 			VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 8, (void*) VM_KERNEL_ADDRPERM(c_seg_next), 0, 0);
3028 
3029 			c_seg_wait_on_busy(c_seg_next);
3030 			lck_mtx_lock_spin_always(c_list_lock);
3031 
3032 			continue;
3033 		}
3034 		/* grab that segment */
3035 		C_SEG_BUSY(c_seg_next);
3036 
3037 		bytes_to_free = C_SEG_OFFSET_TO_BYTES(c_seg_next->c_populated_offset);
3038 		if (c_seg_do_minor_compaction_and_unlock(c_seg_next, FALSE, TRUE, TRUE)) {
3039 			/*
3040 			 * found an empty c_segment and freed it
3041 			 * so we can't continue to use c_seg_next
3042 			 */
3043 			bytes_freed += bytes_to_free;
3044 			c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_freed_segs++;
3045 			continue;
3046 		}
3047 
3048 		/* unlock the list ... */
3049 		lck_mtx_unlock_always(c_list_lock);
3050 
3051 		/* do the major compaction */
3052 
3053 		keep_compacting = c_seg_major_compact(c_seg, c_seg_next);
3054 
3055 		VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 9, keep_compacting, 0, 0);
3056 
3057 		PAGE_REPLACEMENT_DISALLOWED(TRUE);
3058 
3059 		lck_mtx_lock_spin_always(&c_seg_next->c_lock);
3060 		/*
3061 		 * run a minor compaction on the donor segment
3062 		 * since we pulled at least some of its
3063 		 * data into our target...  if we've emptied
3064 		 * it, now is a good time to free it, which
3065 		 * c_seg_minor_compaction_and_unlock also takes care of
3066 		 *
3067 		 * by passing TRUE, we ask for c_busy to be cleared
3068 		 * and c_wanted to be taken care of
3069 		 */
3070 		bytes_to_free = C_SEG_OFFSET_TO_BYTES(c_seg_next->c_populated_offset);
3071 		if (c_seg_minor_compaction_and_unlock(c_seg_next, TRUE)) {
3072 			bytes_freed += bytes_to_free;
3073 			c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_freed_segs++;
3074 		} else {
3075 			bytes_to_free -= C_SEG_OFFSET_TO_BYTES(c_seg_next->c_populated_offset);
3076 			bytes_freed += bytes_to_free;
3077 		}
3078 
3079 		PAGE_REPLACEMENT_DISALLOWED(FALSE);
3080 
3081 		/* relock the list */
3082 		lck_mtx_lock_spin_always(c_list_lock);
3083 
3084 		if (c_seg->c_wanted) {
3085 			/*
3086 			 * Our c_seg is in demand. Let's
3087 			 * unbusy it and wakeup the waiters
3088 			 * instead of continuing the compaction
3089 			 * because we could be in this loop
3090 			 * for a while.
3091 			 */
3092 			fully_compacted = false;
3093 			*bail_wanted_cseg = true;
3094 			c_seg_major_compact_stats[c_seg_major_compact_stats_now].bailed_compactions++;
3095 			break;
3096 		}
3097 	} /* major compaction */
3098 
3099 	VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 10, number_considered, *bail_wanted_cseg, 0);
3100 
3101 	*c_seg_considered += number_considered;
3102 	*total_bytes_freed += bytes_freed;
3103 
3104 	lck_mtx_lock_spin_always(&c_seg->c_lock);
3105 	return fully_compacted;
3106 }
3107 
3108 #define TIME_SUB(rsecs, secs, rfrac, frac, unit)                        \
3109 	MACRO_BEGIN                                                     \
3110 	if ((int)((rfrac) -= (frac)) < 0) {                             \
3111 	        (rfrac) += (unit);                                      \
3112 	        (rsecs) -= 1;                                           \
3113 	}                                                               \
3114 	(rsecs) -= (secs);                                              \
3115 	MACRO_END
3116 
3117 #if (XNU_TARGET_OS_OSX && __arm64__)
3118 clock_nsec_t c_process_major_report_over_ms = 9; /* report if over 9 ms */
3119 int c_process_major_yield_after = 1000; /* yield after moving 1,000 segments */
3120 uint64_t c_process_major_reports = 0;
3121 clock_sec_t c_process_major_max_sec = 0;
3122 clock_nsec_t c_process_major_max_nsec = 0;
3123 uint32_t c_process_major_peak_segcount = 0;
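/*
 * Drain the major-compact queue back onto the age queue, pulling from the
 * tail of the major queue so that aging order is preserved as segments are
 * re-inserted at the head of the age queue.  To keep c_list_lock from being
 * held too long we take a break every c_process_major_yield_after segments,
 * and we log a report whenever a batch takes longer than
 * c_process_major_report_over_ms.
 */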
3124 static void
3125 vm_compressor_process_major_segments(void)
3126 {
3127 	c_segment_t c_seg = NULL;
3128 	int count = 0, total = 0, breaks = 0;
3129 	clock_sec_t start_sec, end_sec;
3130 	clock_nsec_t start_nsec, end_nsec;
3131 	clock_nsec_t report_over_ns;
3132 
3133 	if (queue_empty(&c_major_list_head)) {
3134 		return;
3135 	}
3136 
3137 	// printf("%s: starting to move segments from MAJORQ to AGEQ\n", __FUNCTION__);
3138 	if (c_process_major_report_over_ms != 0) {
3139 		report_over_ns = c_process_major_report_over_ms * NSEC_PER_MSEC;
3140 	} else {
3141 		report_over_ns = (clock_nsec_t)-1;
3142 	}
3143 	clock_get_system_nanotime(&start_sec, &start_nsec);
3144 	while (!queue_empty(&c_major_list_head)) {
3145 		/* start from the end to preserve aging order */
3146 		c_seg = (c_segment_t)queue_last(&c_major_list_head);
3147 		lck_mtx_lock_spin_always(&c_seg->c_lock);
3148 		c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
3149 		lck_mtx_unlock_always(&c_seg->c_lock);
3150 
3151 		count++;
3152 		if (count == c_process_major_yield_after ||
3153 		    queue_empty(&c_major_list_head)) {
3154 			/* done or time to take a break */
3155 		} else {
3156 			/* keep going */
3157 			continue;
3158 		}
3159 
3160 		total += count;
3161 		clock_get_system_nanotime(&end_sec, &end_nsec);
3162 		TIME_SUB(end_sec, start_sec, end_nsec, start_nsec, NSEC_PER_SEC);
3163 		if (end_sec > c_process_major_max_sec) {
3164 			c_process_major_max_sec = end_sec;
3165 			c_process_major_max_nsec = end_nsec;
3166 		} else if (end_sec == c_process_major_max_sec &&
3167 		    end_nsec > c_process_major_max_nsec) {
3168 			c_process_major_max_nsec = end_nsec;
3169 		}
3170 		if (total > c_process_major_peak_segcount) {
3171 			c_process_major_peak_segcount = total;
3172 		}
3173 		if (end_sec > 0 ||
3174 		    end_nsec >= report_over_ns) {
3175 			/* we used more than expected */
3176 			c_process_major_reports++;
3177 			printf("%s: moved %d/%d segments from MAJORQ to AGEQ in %lu.%09u seconds and %d breaks\n",
3178 			    __FUNCTION__, count, total,
3179 			    end_sec, end_nsec, breaks);
3180 		}
3181 		if (queue_empty(&c_major_list_head)) {
3182 			/* done */
3183 			break;
3184 		}
3185 		/* take a break to allow someone else to grab the lock */
3186 		lck_mtx_unlock_always(c_list_lock);
3187 		mutex_pause(0); /* 10 microseconds */
3188 		lck_mtx_lock_spin_always(c_list_lock);
3189 		/* start again */
3190 		clock_get_system_nanotime(&start_sec, &start_nsec);
3191 		count = 0;
3192 		breaks++;
3193 	}
3194 }
3195 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
3196 
3197 /*
3198  * macOS special swappable csegs -> early swappedin queue
3199  * non-macOS special swappable+non-freezer csegs -> late swappedin queue
3200  * Processing special csegs means minor compacting each cseg, then
3201  * major compacting it and putting it on the early or late
3202  * (depending on platform) swapout queue.
3203  */
3204 static void
3205 vm_compressor_process_special_swapped_in_segments_locked(void)
3206 {
3207 	c_segment_t c_seg = NULL;
3208 	bool            switch_state = true, bail_wanted_cseg = false;
3209 	unsigned int    number_considered = 0, yield_after_considered_per_pass = 0;
3210 	uint64_t        bytes_freed = 0;
3211 	queue_head_t    *special_swappedin_list_head;
3212 
3213 #if XNU_TARGET_OS_OSX
3214 	special_swappedin_list_head = &c_early_swappedin_list_head;
3215 #else /* XNU_TARGET_OS_OSX */
3216 	if (memorystatus_swap_all_apps) {
3217 		special_swappedin_list_head = &c_late_swappedin_list_head;
3218 	} else {
3219 		/* called on unsupported config */
3220 		return;
3221 	}
3222 #endif /* XNU_TARGET_OS_OSX */
3223 
3224 	yield_after_considered_per_pass = MAX(min_csegs_per_major_compaction, DELAYED_COMPACTIONS_PER_PASS);
3225 	while (!queue_empty(special_swappedin_list_head)) {
3226 		c_seg = (c_segment_t)queue_first(special_swappedin_list_head);
3227 
3228 		lck_mtx_lock_spin_always(&c_seg->c_lock);
3229 
3230 		if (c_seg->c_busy) {
3231 			lck_mtx_unlock_always(c_list_lock);
3232 			c_seg_wait_on_busy(c_seg);
3233 			lck_mtx_lock_spin_always(c_list_lock);
3234 			continue;
3235 		}
3236 
3237 		C_SEG_BUSY(c_seg);
3238 		lck_mtx_unlock_always(&c_seg->c_lock);
3239 		lck_mtx_unlock_always(c_list_lock);
3240 
3241 		PAGE_REPLACEMENT_DISALLOWED(TRUE);
3242 
3243 		lck_mtx_lock_spin_always(&c_seg->c_lock);
3244 
3245 		if (c_seg_minor_compaction_and_unlock(c_seg, FALSE /*clear busy?*/)) {
3246 			/*
3247 			 * found an empty c_segment and freed it
3248 			 * so go grab the next guy in the queue
3249 			 */
3250 			PAGE_REPLACEMENT_DISALLOWED(FALSE);
3251 			lck_mtx_lock_spin_always(c_list_lock);
3252 			continue;
3253 		}
3254 
3255 		PAGE_REPLACEMENT_DISALLOWED(FALSE);
3256 		lck_mtx_lock_spin_always(c_list_lock);
3257 
3258 		switch_state = vm_compressor_major_compact_cseg(c_seg, &number_considered, &bail_wanted_cseg, &bytes_freed);
3259 		assert(c_seg->c_busy);
3260 		assert(!c_seg->c_on_minorcompact_q);
3261 
3262 		if (switch_state) {
3263 			if (VM_CONFIG_SWAP_IS_ACTIVE || VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
3264 				/*
3265 				 * Ordinarily we let swapped in segments age out + get
3266 				 * major compacted with the rest of the c_segs on the ageQ.
3267 				 * But the early donated c_segs, if well compacted, should be
3268 				 * kept ready to be swapped out if needed. These are typically
3269 				 * describing memory belonging to a leaky app (macOS) or a swap-
3270 				 * capable app (iPadOS) and for the latter we can keep these
3271 				 * around longer because we control the triggers in the memorystatus
3272 				 * subsystem
3273 				 */
3274 				c_seg_switch_state(c_seg, C_ON_SWAPOUT_Q, FALSE);
3275 			}
3276 		}
3277 
3278 		C_SEG_WAKEUP_DONE(c_seg);
3279 
3280 		lck_mtx_unlock_always(&c_seg->c_lock);
3281 
3282 		if (number_considered >= yield_after_considered_per_pass) {
3283 			if (bail_wanted_cseg) {
3284 				/*
3285 				 * We stopped major compactions on a c_seg
3286 				 * that is wanted. We don't know the priority
3287 				 * of the waiter unfortunately but we are at
3288 				 * a very high priority and so, just in case
3289 				 * the waiter is a critical system daemon or
3290 				 * UI thread, let's give up the CPU in case
3291 				 * the system is running a few CPU intensive
3292 				 * tasks.
3293 				 */
3294 				bail_wanted_cseg = false;
3295 				lck_mtx_unlock_always(c_list_lock);
3296 
3297 				mutex_pause(2); /* 100us yield */
3298 
3299 				lck_mtx_lock_spin_always(c_list_lock);
3300 			}
3301 
3302 			number_considered = 0;
3303 		}
3304 	}
3305 }
3306 
3307 void
3308 vm_compressor_process_special_swapped_in_segments(void)
3309 {
3310 	lck_mtx_lock_spin_always(c_list_lock);
3311 	vm_compressor_process_special_swapped_in_segments_locked();
3312 	lck_mtx_unlock_always(c_list_lock);
3313 }
3314 
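/*
 * Age limit is expressed in seconds: a regular swapped-in cseg stays on the
 * swappedin queue for at least this long (see the c_swappedin_ts check in
 * vm_compressor_process_regular_swapped_in_segments() below) before it is
 * moved back to the age queue.
 */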
3315 #define C_SEGMENT_SWAPPEDIN_AGE_LIMIT   10
3316 /*
3317  * Processing regular csegs means aging them.
3318  */
3319 static void
3320 vm_compressor_process_regular_swapped_in_segments(boolean_t flush_all)
3321 {
3322 	c_segment_t     c_seg;
3323 	clock_sec_t     now;
3324 	clock_nsec_t    nsec;
3325 
3326 	clock_get_system_nanotime(&now, &nsec);
3327 
3328 	while (!queue_empty(&c_regular_swappedin_list_head)) {
3329 		c_seg = (c_segment_t)queue_first(&c_regular_swappedin_list_head);
3330 
3331 		if (flush_all == FALSE && (now - c_seg->c_swappedin_ts) < C_SEGMENT_SWAPPEDIN_AGE_LIMIT) {
3332 			break;
3333 		}
3334 
3335 		lck_mtx_lock_spin_always(&c_seg->c_lock);
3336 
3337 		c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
3338 		c_seg->c_agedin_ts = (uint32_t) now;
3339 
3340 		lck_mtx_unlock_always(&c_seg->c_lock);
3341 	}
3342 }
3343 
3344 
3345 extern  int     vm_num_swap_files;
3346 extern  int     vm_num_pinned_swap_files;
3347 extern  int     vm_swappin_enabled;
3348 
3349 extern  unsigned int    vm_swapfile_total_segs_used;
3350 extern  unsigned int    vm_swapfile_total_segs_alloced;
3351 
3352 
3353 void
3354 vm_compressor_flush(void)
3355 {
3356 	uint64_t        vm_swap_put_failures_at_start;
3357 	wait_result_t   wait_result = 0;
3358 	AbsoluteTime    startTime, endTime;
3359 	clock_sec_t     now_sec;
3360 	clock_nsec_t    now_nsec;
3361 	uint64_t        nsec;
3362 	c_segment_t     c_seg, c_seg_next;
3363 
3364 	HIBLOG("vm_compressor_flush - starting\n");
3365 
3366 	clock_get_uptime(&startTime);
3367 
3368 	lck_mtx_lock_spin_always(c_list_lock);
3369 
3370 	fastwake_warmup = FALSE;
3371 	compaction_swapper_abort = 1;
3372 
3373 	while (compaction_swapper_running) {
3374 		assert_wait((event_t)&compaction_swapper_running, THREAD_UNINT);
3375 
3376 		lck_mtx_unlock_always(c_list_lock);
3377 
3378 		thread_block(THREAD_CONTINUE_NULL);
3379 
3380 		lck_mtx_lock_spin_always(c_list_lock);
3381 	}
3382 	compaction_swapper_abort = 0;
3383 	compaction_swapper_running = 1;
3384 
3385 	hibernate_flushing = TRUE;
3386 	hibernate_no_swapspace = FALSE;
3387 	hibernate_flush_timed_out = FALSE;
3388 	c_generation_id_flush_barrier = c_generation_id + 1000;
3389 
3390 	clock_get_system_nanotime(&now_sec, &now_nsec);
3391 	hibernate_flushing_deadline = now_sec + HIBERNATE_FLUSHING_SECS_TO_COMPLETE;
3392 
3393 	vm_swap_put_failures_at_start = vm_swap_put_failures;
3394 
3395 	/*
3396 	 * We are about to hibernate and so we want all segments flushed to disk.
3397 	 * Segments that are on the major compaction queue won't be considered in
3398 	 * the vm_compressor_compact_and_swap() pass. So we need to bring them to
3399 	 * the ageQ for consideration.
3400 	 */
3401 	if (!queue_empty(&c_major_list_head)) {
3402 		c_seg = (c_segment_t)queue_first(&c_major_list_head);
3403 
3404 		while (!queue_end(&c_major_list_head, (queue_entry_t)c_seg)) {
3405 			c_seg_next = (c_segment_t) queue_next(&c_seg->c_age_list);
3406 			lck_mtx_lock_spin_always(&c_seg->c_lock);
3407 			c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
3408 			lck_mtx_unlock_always(&c_seg->c_lock);
3409 			c_seg = c_seg_next;
3410 		}
3411 	}
3412 	vm_compressor_compact_and_swap(TRUE);
3413 
3414 	while (!queue_empty(&c_early_swapout_list_head) || !queue_empty(&c_regular_swapout_list_head) || !queue_empty(&c_late_swapout_list_head)) {
3415 		assert_wait_timeout((event_t) &compaction_swapper_running, THREAD_INTERRUPTIBLE, 5000, 1000 * NSEC_PER_USEC);
3416 
3417 		lck_mtx_unlock_always(c_list_lock);
3418 
3419 		wait_result = thread_block(THREAD_CONTINUE_NULL);
3420 
3421 		lck_mtx_lock_spin_always(c_list_lock);
3422 
3423 		if (wait_result == THREAD_TIMED_OUT) {
3424 			break;
3425 		}
3426 	}
3427 	hibernate_flushing = FALSE;
3428 	compaction_swapper_running = 0;
3429 
3430 	if (vm_swap_put_failures > vm_swap_put_failures_at_start) {
3431 		HIBLOG("vm_compressor_flush failed to clean %llu segments - vm_page_compressor_count(%d)\n",
3432 		    vm_swap_put_failures - vm_swap_put_failures_at_start, VM_PAGE_COMPRESSOR_COUNT);
3433 	}
3434 
3435 	lck_mtx_unlock_always(c_list_lock);
3436 
3437 	thread_wakeup((event_t)&compaction_swapper_running);
3438 
3439 	clock_get_uptime(&endTime);
3440 	SUB_ABSOLUTETIME(&endTime, &startTime);
3441 	absolutetime_to_nanoseconds(endTime, &nsec);
3442 
3443 	HIBLOG("vm_compressor_flush completed - took %qd msecs - vm_num_swap_files = %d, vm_num_pinned_swap_files = %d, vm_swappin_enabled = %d\n",
3444 	    nsec / 1000000ULL, vm_num_swap_files, vm_num_pinned_swap_files, vm_swappin_enabled);
3445 }
3446 
3447 
3448 int             compaction_swap_trigger_thread_awakened = 0;
3449 
3450 static void
3451 vm_compressor_swap_trigger_thread(void)
3452 {
3453 	current_thread()->options |= TH_OPT_VMPRIV;
3454 
3455 	/*
3456 	 * compaction_swapper_init_now is set when the first call to
3457 	 * vm_consider_waking_compactor_swapper is made from
3458 	 * vm_pageout_scan... since this function is called upon
3459 	 * thread creation, we want to make sure to delay adjusting
3460 	 * the tuneables until we are awakened via vm_pageout_scan
3461 	 * so that we are at a point where the vm_swapfile_open will
3462 	 * be operating on the correct directory (in case the default
3463 	 * of using the VM volume is overridden by the dynamic_pager)
3464 	 */
3465 	if (compaction_swapper_init_now) {
3466 		vm_compaction_swapper_do_init();
3467 
3468 		if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
3469 			thread_vm_bind_group_add();
3470 		}
3471 #if CONFIG_THREAD_GROUPS
3472 		thread_group_vm_add();
3473 #endif
3474 		thread_set_thread_name(current_thread(), "VM_cswap_trigger");
3475 		compaction_swapper_init_now = 0;
3476 	}
3477 	lck_mtx_lock_spin_always(c_list_lock);
3478 
3479 	compaction_swap_trigger_thread_awakened++;
3480 	compaction_swapper_awakened = 0;
3481 
3482 	if (compaction_swapper_running == 0) {
3483 		compaction_swapper_running = 1;
3484 
3485 		vm_compressor_compact_and_swap(FALSE);
3486 
3487 		compaction_swapper_running = 0;
3488 	}
3489 	assert_wait((event_t)&c_compressor_swap_trigger, THREAD_UNINT);
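	/*
	 * Park until the next wakeup on c_compressor_swap_trigger; the
	 * thread_block() below restarts this function from the top via
	 * the continuation.
	 */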
3490 
3491 	if (compaction_swapper_running == 0) {
3492 		thread_wakeup((event_t)&compaction_swapper_running);
3493 	}
3494 
3495 	lck_mtx_unlock_always(c_list_lock);
3496 
3497 	thread_block((thread_continue_t)vm_compressor_swap_trigger_thread);
3498 
3499 	/* NOTREACHED */
3500 }
3501 
3502 
3503 void
3504 vm_compressor_record_warmup_start(void)
3505 {
3506 	c_segment_t     c_seg;
3507 
3508 	lck_mtx_lock_spin_always(c_list_lock);
3509 
3510 	if (first_c_segment_to_warm_generation_id == 0) {
3511 		if (!queue_empty(&c_age_list_head)) {
3512 			c_seg = (c_segment_t)queue_last(&c_age_list_head);
3513 
3514 			first_c_segment_to_warm_generation_id = c_seg->c_generation_id;
3515 		} else {
3516 			first_c_segment_to_warm_generation_id = 0;
3517 		}
3518 
3519 		fastwake_recording_in_progress = TRUE;
3520 	}
3521 	lck_mtx_unlock_always(c_list_lock);
3522 }
3523 
3524 
3525 void
3526 vm_compressor_record_warmup_end(void)
3527 {
3528 	c_segment_t     c_seg;
3529 
3530 	lck_mtx_lock_spin_always(c_list_lock);
3531 
3532 	if (fastwake_recording_in_progress == TRUE) {
3533 		if (!queue_empty(&c_age_list_head)) {
3534 			c_seg = (c_segment_t)queue_last(&c_age_list_head);
3535 
3536 			last_c_segment_to_warm_generation_id = c_seg->c_generation_id;
3537 		} else {
3538 			last_c_segment_to_warm_generation_id = first_c_segment_to_warm_generation_id;
3539 		}
3540 
3541 		fastwake_recording_in_progress = FALSE;
3542 
3543 		HIBLOG("vm_compressor_record_warmup (%qd - %qd)\n", first_c_segment_to_warm_generation_id, last_c_segment_to_warm_generation_id);
3544 	}
3545 	lck_mtx_unlock_always(c_list_lock);
3546 }
3547 
3548 
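/*
 * Seconds to defer swapfile trimming after a wake; consumed by
 * vm_compressor_delay_trim() below, which sets dont_trim_until_ts.
 */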
3549 #define DELAY_TRIM_ON_WAKE_SECS         25
3550 
3551 void
3552 vm_compressor_delay_trim(void)
3553 {
3554 	clock_sec_t     sec;
3555 	clock_nsec_t    nsec;
3556 
3557 	clock_get_system_nanotime(&sec, &nsec);
3558 	dont_trim_until_ts = sec + DELAY_TRIM_ON_WAKE_SECS;
3559 }
3560 
3561 
3562 void
3563 vm_compressor_do_warmup(void)
3564 {
3565 	lck_mtx_lock_spin_always(c_list_lock);
3566 
3567 	if (first_c_segment_to_warm_generation_id == last_c_segment_to_warm_generation_id) {
3568 		first_c_segment_to_warm_generation_id = last_c_segment_to_warm_generation_id = 0;
3569 
3570 		lck_mtx_unlock_always(c_list_lock);
3571 		return;
3572 	}
3573 
3574 	if (compaction_swapper_running == 0 && compaction_swapper_awakened == 0) {
3575 		fastwake_warmup = TRUE;
3576 
3577 		compaction_swapper_awakened = 1;
3578 		thread_wakeup((event_t)&c_compressor_swap_trigger);
3579 	}
3580 	lck_mtx_unlock_always(c_list_lock);
3581 }
3582 
3583 void
3584 do_fastwake_warmup_all(void)
3585 {
3586 	lck_mtx_lock_spin_always(c_list_lock);
3587 
3588 	if (queue_empty(&c_swappedout_list_head) && queue_empty(&c_swappedout_sparse_list_head)) {
3589 		lck_mtx_unlock_always(c_list_lock);
3590 		return;
3591 	}
3592 
3593 	fastwake_warmup = TRUE;
3594 
3595 	do_fastwake_warmup(&c_swappedout_list_head, TRUE);
3596 
3597 	do_fastwake_warmup(&c_swappedout_sparse_list_head, TRUE);
3598 
3599 	fastwake_warmup = FALSE;
3600 
3601 	lck_mtx_unlock_always(c_list_lock);
3602 }
3603 
3604 void
3605 do_fastwake_warmup(queue_head_t *c_queue, boolean_t consider_all_cseg)
3606 {
3607 	c_segment_t     c_seg = NULL;
3608 	AbsoluteTime    startTime, endTime;
3609 	uint64_t        nsec;
3610 
3611 
3612 	HIBLOG("vm_compressor_fastwake_warmup (%qd - %qd) - starting\n", first_c_segment_to_warm_generation_id, last_c_segment_to_warm_generation_id);
3613 
3614 	clock_get_uptime(&startTime);
3615 
3616 	lck_mtx_unlock_always(c_list_lock);
3617 
3618 	proc_set_thread_policy(current_thread(),
3619 	    TASK_POLICY_INTERNAL, TASK_POLICY_IO, THROTTLE_LEVEL_COMPRESSOR_TIER2);
3620 
3621 	PAGE_REPLACEMENT_DISALLOWED(TRUE);
3622 
3623 	lck_mtx_lock_spin_always(c_list_lock);
3624 
3625 	while (!queue_empty(c_queue) && fastwake_warmup == TRUE) {
3626 		c_seg = (c_segment_t) queue_first(c_queue);
3627 
3628 		if (consider_all_cseg == FALSE) {
3629 			if (c_seg->c_generation_id < first_c_segment_to_warm_generation_id ||
3630 			    c_seg->c_generation_id > last_c_segment_to_warm_generation_id) {
3631 				break;
3632 			}
3633 
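			/*
			 * bail out of the warmup if swapping more segments back in
			 * would drive free memory below a quarter of available memory
			 */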
3634 			if (vm_page_free_count < (AVAILABLE_MEMORY / 4)) {
3635 				break;
3636 			}
3637 		}
3638 
3639 		lck_mtx_lock_spin_always(&c_seg->c_lock);
3640 		lck_mtx_unlock_always(c_list_lock);
3641 
3642 		if (c_seg->c_busy) {
3643 			PAGE_REPLACEMENT_DISALLOWED(FALSE);
3644 			c_seg_wait_on_busy(c_seg);
3645 			PAGE_REPLACEMENT_DISALLOWED(TRUE);
3646 		} else {
3647 			if (c_seg_swapin(c_seg, TRUE, FALSE) == 0) {
3648 				lck_mtx_unlock_always(&c_seg->c_lock);
3649 			}
3650 			c_segment_warmup_count++;
3651 
3652 			PAGE_REPLACEMENT_DISALLOWED(FALSE);
3653 			vm_pageout_io_throttle();
3654 			PAGE_REPLACEMENT_DISALLOWED(TRUE);
3655 		}
3656 		lck_mtx_lock_spin_always(c_list_lock);
3657 	}
3658 	lck_mtx_unlock_always(c_list_lock);
3659 
3660 	PAGE_REPLACEMENT_DISALLOWED(FALSE);
3661 
3662 	proc_set_thread_policy(current_thread(),
3663 	    TASK_POLICY_INTERNAL, TASK_POLICY_IO, THROTTLE_LEVEL_COMPRESSOR_TIER0);
3664 
3665 	clock_get_uptime(&endTime);
3666 	SUB_ABSOLUTETIME(&endTime, &startTime);
3667 	absolutetime_to_nanoseconds(endTime, &nsec);
3668 
3669 	HIBLOG("vm_compressor_fastwake_warmup completed - took %qd msecs\n", nsec / 1000000ULL);
3670 
3671 	lck_mtx_lock_spin_always(c_list_lock);
3672 
3673 	if (consider_all_cseg == FALSE) {
3674 		first_c_segment_to_warm_generation_id = last_c_segment_to_warm_generation_id = 0;
3675 	}
3676 }
3677 
3678 extern bool     vm_swapout_thread_running;
3679 extern boolean_t        compressor_store_stop_compaction;
3680 
3681 void
3682 vm_compressor_compact_and_swap(boolean_t flush_all)
3683 {
3684 	c_segment_t     c_seg;
3685 	bool            switch_state, bail_wanted_cseg = false;
3686 	clock_sec_t     now;
3687 	clock_nsec_t    nsec;
3688 	mach_timespec_t start_ts, end_ts;
3689 	unsigned int    number_considered, wanted_cseg_found, yield_after_considered_per_pass, number_yields;
3690 	uint64_t        bytes_freed, delta_usec;
3691 	uint32_t        c_swapout_count = 0;
3692 
3693 	VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_START, c_age_count, c_minor_count, c_major_count, vm_page_free_count);
3694 
3695 	if (fastwake_warmup == TRUE) {
3696 		uint64_t        starting_warmup_count;
3697 
3698 		starting_warmup_count = c_segment_warmup_count;
3699 
3700 		KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 11) | DBG_FUNC_START, c_segment_warmup_count,
3701 		    first_c_segment_to_warm_generation_id, last_c_segment_to_warm_generation_id, 0, 0);
3702 		do_fastwake_warmup(&c_swappedout_list_head, FALSE);
3703 		KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 11) | DBG_FUNC_END, c_segment_warmup_count, c_segment_warmup_count - starting_warmup_count, 0, 0, 0);
3704 
3705 		fastwake_warmup = FALSE;
3706 	}
3707 
3708 #if (XNU_TARGET_OS_OSX && __arm64__)
3709 	/*
3710 	 * Re-considering major csegs showed benefits on all platforms by
3711 	 * significantly reducing fragmentation and getting back memory.
3712 	 * However, on smaller devices, eg watch, there was increased power
3713 	 * use for the additional compactions. And the turnover in csegs on
3714 	 * those smaller platforms is high enough in the decompression/free
3715 	 * path that we can skip reconsidering them here because we already
3716 	 * consider them for major compaction in those paths.
3717 	 */
3718 	vm_compressor_process_major_segments();
3719 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
3720 
3721 	/*
3722 	 * it's possible for the c_age_list_head to be empty if we
3723 	 * hit our limits for growing the compressor pool and we subsequently
3724 	 * hibernated... on the next hibernation we could see the queue as
3725  * empty and not proceed even though we have a bunch of segments on
3726 	 * the swapped in queue that need to be dealt with.
3727 	 */
3728 	vm_compressor_do_delayed_compactions(flush_all);
3729 	vm_compressor_process_special_swapped_in_segments_locked();
3730 	vm_compressor_process_regular_swapped_in_segments(flush_all);
3731 
3732 	/*
3733 	 * we only need to grab the timestamp once per
3734 	 * invocation of this function since the
3735 	 * timescale we're interested in is measured
3736 	 * in days
3737 	 */
3738 	clock_get_system_nanotime(&now, &nsec);
3739 
3740 	start_ts.tv_sec = (int) now;
3741 	start_ts.tv_nsec = nsec;
3742 	delta_usec = 0;
3743 	number_considered = 0;
3744 	wanted_cseg_found = 0;
3745 	number_yields = 0;
3746 	bytes_freed = 0;
3747 	yield_after_considered_per_pass = MAX(min_csegs_per_major_compaction, DELAYED_COMPACTIONS_PER_PASS);
3748 
3749 #if 0
3750 	/**
3751 	 * SW: Need to figure out how to properly rate limit this log because it is currently way too
3752 	 * noisy. rdar://99379414 (Figure out how to rate limit the fragmentation level logging)
3753 	 */
3754 	os_log(OS_LOG_DEFAULT, "memorystatus: before compaction fragmentation level %u\n", vm_compressor_fragmentation_level());
3755 #endif
3756 
3757 	while (!queue_empty(&c_age_list_head) && !compaction_swapper_abort && !compressor_store_stop_compaction) {
3758 		if (hibernate_flushing == TRUE) {
3759 			clock_sec_t     sec;
3760 
3761 			if (hibernate_should_abort()) {
3762 				HIBLOG("vm_compressor_flush - hibernate_should_abort returned TRUE\n");
3763 				break;
3764 			}
3765 			if (hibernate_no_swapspace == TRUE) {
3766 				HIBLOG("vm_compressor_flush - out of swap space\n");
3767 				break;
3768 			}
3769 			if (vm_swap_files_pinned() == FALSE) {
3770 				HIBLOG("vm_compressor_flush - unpinned swap files\n");
3771 				break;
3772 			}
3773 			if (hibernate_in_progress_with_pinned_swap == TRUE &&
3774 			    (vm_swapfile_total_segs_alloced == vm_swapfile_total_segs_used)) {
3775 				HIBLOG("vm_compressor_flush - out of pinned swap space\n");
3776 				break;
3777 			}
3778 			clock_get_system_nanotime(&sec, &nsec);
3779 
3780 			if (sec > hibernate_flushing_deadline) {
3781 				hibernate_flush_timed_out = TRUE;
3782 				HIBLOG("vm_compressor_flush - failed to finish before deadline\n");
3783 				break;
3784 			}
3785 		}
3786 
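		/*
		 * If the swapout queues are already at their limit, give the
		 * swapout thread up to 100ms to drain them before generating
		 * any more work for it.
		 */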
3787 		c_swapout_count = c_early_swapout_count + c_regular_swapout_count + c_late_swapout_count;
3788 		if (VM_CONFIG_SWAP_IS_ACTIVE && !vm_swap_out_of_space() && c_swapout_count >= C_SWAPOUT_LIMIT) {
3789 			assert_wait_timeout((event_t) &compaction_swapper_running, THREAD_INTERRUPTIBLE, 100, 1000 * NSEC_PER_USEC);
3790 
3791 			if (!vm_swapout_thread_running) {
3792 				thread_wakeup((event_t)&vm_swapout_thread);
3793 			}
3794 
3795 			lck_mtx_unlock_always(c_list_lock);
3796 
3797 			VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 1, c_swapout_count, 0, 0);
3798 
3799 			thread_block(THREAD_CONTINUE_NULL);
3800 
3801 			lck_mtx_lock_spin_always(c_list_lock);
3802 		}
3803 		/*
3804 		 * Minor compactions
3805 		 */
3806 		vm_compressor_do_delayed_compactions(flush_all);
3807 
3808 		/*
3809 		 * vm_compressor_process_early_swapped_in_segments()
3810 		 * might be too aggressive. So OFF for now.
3811 		 */
3812 		vm_compressor_process_regular_swapped_in_segments(flush_all);
3813 
3814 		/* Recompute because we dropped the c_list_lock above */
3815 		c_swapout_count = c_early_swapout_count + c_regular_swapout_count + c_late_swapout_count;
3816 		if (VM_CONFIG_SWAP_IS_ACTIVE && !vm_swap_out_of_space() && c_swapout_count >= C_SWAPOUT_LIMIT) {
3817 			/*
3818 			 * we timed out on the above thread_block
3819 			 * let's loop around and try again
3820 			 * the timeout allows us to continue
3821 			 * to do minor compactions to make
3822 			 * more memory available
3823 			 */
3824 			VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 2, c_swapout_count, 0, 0);
3825 
3826 			continue;
3827 		}
3828 
3829 		/*
3830 		 * Swap out segments?
3831 		 */
3832 		if (flush_all == FALSE) {
3833 			bool needs_to_swap;
3834 
3835 			lck_mtx_unlock_always(c_list_lock);
3836 
3837 			needs_to_swap = compressor_needs_to_swap();
3838 
3839 			lck_mtx_lock_spin_always(c_list_lock);
3840 
3841 			VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 3, needs_to_swap, 0, 0);
3842 
3843 			if (!needs_to_swap) {
3844 				break;
3845 			}
3846 		}
3847 		if (queue_empty(&c_age_list_head)) {
3848 			VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 4, c_age_count, 0, 0);
3849 			break;
3850 		}
3851 		c_seg = (c_segment_t) queue_first(&c_age_list_head);
3852 
3853 		assert(c_seg->c_state == C_ON_AGE_Q);
3854 
3855 		if (flush_all == TRUE && c_seg->c_generation_id > c_generation_id_flush_barrier) {
3856 			VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 5, 0, 0, 0);
3857 			break;
3858 		}
3859 
3860 		lck_mtx_lock_spin_always(&c_seg->c_lock);
3861 
3862 		if (c_seg->c_busy) {
3863 			VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 6, (void*) VM_KERNEL_ADDRPERM(c_seg), 0, 0);
3864 
3865 			lck_mtx_unlock_always(c_list_lock);
3866 			c_seg_wait_on_busy(c_seg);
3867 			lck_mtx_lock_spin_always(c_list_lock);
3868 
3869 			continue;
3870 		}
3871 		C_SEG_BUSY(c_seg);
3872 
3873 		if (c_seg_do_minor_compaction_and_unlock(c_seg, FALSE, TRUE, TRUE)) {
3874 			/*
3875 			 * found an empty c_segment and freed it
3876 			 * so go grab the next guy in the queue
3877 			 */
3878 			VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 7, 0, 0, 0);
3879 			c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_freed_segs++;
3880 			continue;
3881 		}
3882 
3883 		switch_state = vm_compressor_major_compact_cseg(c_seg, &number_considered, &bail_wanted_cseg, &bytes_freed);
3884 		if (bail_wanted_cseg) {
3885 			wanted_cseg_found++;
3886 			bail_wanted_cseg = false;
3887 		}
3888 
3889 		assert(c_seg->c_busy);
3890 		assert(!c_seg->c_on_minorcompact_q);
3891 
3892 		if (switch_state) {
3893 			if (VM_CONFIG_SWAP_IS_ACTIVE) {
3894 				int new_state = C_ON_SWAPOUT_Q;
3895 #if (XNU_TARGET_OS_OSX && __arm64__)
3896 				if (flush_all == false && compressor_swapout_conditions_met() == false) {
3897 					new_state = C_ON_MAJORCOMPACT_Q;
3898 				}
3899 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
3900 
3901 				if (new_state == C_ON_SWAPOUT_Q) {
3902 					/*
3903 					 * This mode of putting a generic c_seg on the swapout list is
3904 					 * only supported when we have general swapping enabled
3905 					 */
3906 					clock_sec_t lnow;
3907 					clock_nsec_t lnsec;
3908 					clock_get_system_nanotime(&lnow, &lnsec);
3909 					if (c_seg->c_agedin_ts && (lnow - c_seg->c_agedin_ts) < 30) {
3910 						vmcs_stats.unripe_under_30s++;
3911 					} else if (c_seg->c_agedin_ts && (lnow - c_seg->c_agedin_ts) < 60) {
3912 						vmcs_stats.unripe_under_60s++;
3913 					} else if (c_seg->c_agedin_ts && (lnow - c_seg->c_agedin_ts) < 300) {
3914 						vmcs_stats.unripe_under_300s++;
3915 					}
3916 				}
3917 
3918 				c_seg_switch_state(c_seg, new_state, FALSE);
3919 			} else {
3920 				if ((vm_swapout_ripe_segments == TRUE && c_overage_swapped_count < c_overage_swapped_limit)) {
3921 					assert(VM_CONFIG_SWAP_IS_PRESENT);
3922 					/*
3923 					 * we are running compressor sweeps with swap-behind
3924 					 * make sure the c_seg has aged enough before swapping it
3925 					 * out...
3926 					 */
3927 					if ((now - c_seg->c_creation_ts) >= vm_ripe_target_age) {
3928 						c_seg->c_overage_swap = TRUE;
3929 						c_overage_swapped_count++;
3930 						c_seg_switch_state(c_seg, C_ON_SWAPOUT_Q, FALSE);
3931 					}
3932 				}
3933 			}
3934 			if (c_seg->c_state == C_ON_AGE_Q) {
3935 				/*
3936 				 * this c_seg didn't get moved to the swapout queue
3937 				 * so we need to move it out of the way...
3938 				 * we just did a major compaction on it so put it
3939 				 * on that queue
3940 				 */
3941 				c_seg_switch_state(c_seg, C_ON_MAJORCOMPACT_Q, FALSE);
3942 			} else {
3943 				c_seg_major_compact_stats[c_seg_major_compact_stats_now].wasted_space_in_swapouts += c_seg_bufsize - c_seg->c_bytes_used;
3944 				c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_swapouts++;
3945 			}
3946 		}
3947 
3948 		C_SEG_WAKEUP_DONE(c_seg);
3949 
3950 		lck_mtx_unlock_always(&c_seg->c_lock);
3951 
3952 		/*
3953 		 * On systems _with_ general swap, regardless of jetsam, we wake up the swapout thread here.
3954 		 * On systems _without_ general swap, it's the responsibility of the memorystatus
3955 		 * subsystem to wake up the swapper.
3956 		 * TODO: When we have full jetsam support on a swap enabled system, we will need to revisit
3957 		 * this policy.
3958 		 */
3959 		if (VM_CONFIG_SWAP_IS_ACTIVE && c_swapout_count) {
3960 			/*
3961 			 * We don't pause/yield here because we will either
3962 			 * yield below or at the top of the loop with the
3963 			 * assert_wait_timeout.
3964 			 */
3965 			if (!vm_swapout_thread_running) {
3966 				thread_wakeup((event_t)&vm_swapout_thread);
3967 			}
3968 		}
3969 
3970 		if (number_considered >= yield_after_considered_per_pass) {
3971 			if (wanted_cseg_found) {
3972 				/*
3973 				 * We stopped major compactions on a c_seg
3974 				 * that is wanted. We don't know the priority
3975 				 * of the waiter unfortunately but we are at
3976 				 * a very high priority and so, just in case
3977 				 * the waiter is a critical system daemon or
3978 				 * UI thread, let's give up the CPU in case
3979 				 * the system is running a few CPU intensive
3980 				 * tasks.
3981 				 */
3982 				lck_mtx_unlock_always(c_list_lock);
3983 
3984 				mutex_pause(2); /* 100us yield */
3985 
3986 				number_yields++;
3987 
3988 				VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 11, number_considered, number_yields, 0);
3989 
3990 				lck_mtx_lock_spin_always(c_list_lock);
3991 			}
3992 
3993 			number_considered = 0;
3994 			wanted_cseg_found = 0;
3995 		}
3996 	}
3997 	clock_get_system_nanotime(&now, &nsec);
3998 
3999 	end_ts = major_compact_ts = (mach_timespec_t){.tv_sec = (int)now, .tv_nsec = nsec};
4000 
4001 	SUB_MACH_TIMESPEC(&end_ts, &start_ts);
4002 
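	/*
	 * exclude the ~100us spent in each mutex_pause() yield so the
	 * bytes-freed rate computed below reflects time actually spent compacting
	 */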
4003 	delta_usec = (end_ts.tv_sec * USEC_PER_SEC) + (end_ts.tv_nsec / NSEC_PER_USEC) - (number_yields * 100);
4004 
4005 	delta_usec = MAX(1, delta_usec); /* we could have 0 usec run if conditions weren't right */
4006 
4007 	c_seg_major_compact_stats[c_seg_major_compact_stats_now].bytes_freed_rate_us = (bytes_freed / delta_usec);
4008 
4009 	if ((c_seg_major_compact_stats_now + 1) == C_SEG_MAJOR_COMPACT_STATS_MAX) {
4010 		c_seg_major_compact_stats_now = 0;
4011 	} else {
4012 		c_seg_major_compact_stats_now++;
4013 	}
4014 
4015 	assert(c_seg_major_compact_stats_now < C_SEG_MAJOR_COMPACT_STATS_MAX);
4016 
4017 	VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_END, c_age_count, c_minor_count, c_major_count, vm_page_free_count);
4018 }
4019 
4020 
4021 static c_segment_t
4022 c_seg_allocate(c_segment_t *current_chead)
4023 {
4024 	c_segment_t     c_seg;
4025 	int             min_needed;
4026 	int             size_to_populate;
4027 	c_segment_t     *donate_queue_head;
4028 
4029 #if XNU_TARGET_OS_OSX
4030 	if (vm_compressor_low_on_space()) {
4031 		vm_compressor_take_paging_space_action();
4032 	}
4033 #endif /* XNU_TARGET_OS_OSX */
4034 
4035 	if ((c_seg = *current_chead) == NULL) {
4036 		uint32_t        c_segno;
4037 
4038 		lck_mtx_lock_spin_always(c_list_lock);
4039 
4040 		while (c_segments_busy == TRUE) {
4041 			assert_wait((event_t) (&c_segments_busy), THREAD_UNINT);
4042 
4043 			lck_mtx_unlock_always(c_list_lock);
4044 
4045 			thread_block(THREAD_CONTINUE_NULL);
4046 
4047 			lck_mtx_lock_spin_always(c_list_lock);
4048 		}
4049 		if (c_free_segno_head == (uint32_t)-1) {
4050 			uint32_t        c_segments_available_new;
4051 			uint32_t        compressed_pages;
4052 
4053 #if CONFIG_FREEZE
4054 			if (freezer_incore_cseg_acct) {
4055 				compressed_pages = c_segment_pages_compressed_incore;
4056 			} else {
4057 				compressed_pages = c_segment_pages_compressed;
4058 			}
4059 #else
4060 			compressed_pages = c_segment_pages_compressed;
4061 #endif /* CONFIG_FREEZE */
4062 
4063 			if (c_segments_available >= c_segments_limit || compressed_pages >= c_segment_pages_compressed_limit) {
4064 				lck_mtx_unlock_always(c_list_lock);
4065 
4066 				return NULL;
4067 			}
4068 			c_segments_busy = TRUE;
4069 			lck_mtx_unlock_always(c_list_lock);
4070 
4071 			kernel_memory_populate((vm_offset_t)c_segments_next_page,
4072 			    PAGE_SIZE, KMA_NOFAIL | KMA_KOBJECT,
4073 			    VM_KERN_MEMORY_COMPRESSOR);
4074 			c_segments_next_page += PAGE_SIZE;
4075 
4076 			c_segments_available_new = c_segments_available + C_SEGMENTS_PER_PAGE;
4077 
4078 			if (c_segments_available_new > c_segments_limit) {
4079 				c_segments_available_new = c_segments_limit;
4080 			}
4081 
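			/*
			 * Thread the newly populated slots into the free list:
			 * each new entry points at the next one, and the last
			 * entry is linked to the old free-list head once the
			 * list lock is retaken below.
			 */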
4082 			for (c_segno = c_segments_available + 1; c_segno < c_segments_available_new; c_segno++) {
4083 				c_segments[c_segno - 1].c_segno = c_segno;
4084 			}
4085 
4086 			lck_mtx_lock_spin_always(c_list_lock);
4087 
4088 			c_segments[c_segno - 1].c_segno = c_free_segno_head;
4089 			c_free_segno_head = c_segments_available;
4090 			c_segments_available = c_segments_available_new;
4091 
4092 			c_segments_busy = FALSE;
4093 			thread_wakeup((event_t) (&c_segments_busy));
4094 		}
4095 		c_segno = c_free_segno_head;
4096 		assert(c_segno >= 0 && c_segno < c_segments_limit);
4097 
4098 		c_free_segno_head = (uint32_t)c_segments[c_segno].c_segno;
4099 
4100 		/*
4101 		 * do the rest of the bookkeeping now while we're still behind
4102 		 * the list lock and grab our generation id now into a local
4103 		 * so that we can install it once we have the c_seg allocated
4104 		 */
4105 		c_segment_count++;
4106 		if (c_segment_count > c_segment_count_max) {
4107 			c_segment_count_max = c_segment_count;
4108 		}
4109 
4110 		lck_mtx_unlock_always(c_list_lock);
4111 
4112 		c_seg = zalloc_flags(compressor_segment_zone, Z_WAITOK | Z_ZERO);
4113 
4114 		c_seg->c_store.c_buffer = (int32_t *)C_SEG_BUFFER_ADDRESS(c_segno);
4115 
4116 		lck_mtx_init(&c_seg->c_lock, &vm_compressor_lck_grp, LCK_ATTR_NULL);
4117 
4118 		c_seg->c_state = C_IS_EMPTY;
4119 		c_seg->c_firstemptyslot = C_SLOT_MAX_INDEX;
4120 		c_seg->c_mysegno = c_segno;
4121 
4122 		lck_mtx_lock_spin_always(c_list_lock);
4123 		c_empty_count++;
4124 		c_seg_switch_state(c_seg, C_IS_FILLING, FALSE);
4125 		c_segments[c_segno].c_seg = c_seg;
4126 		assert(c_segments[c_segno].c_segno > c_segments_available);
4127 		lck_mtx_unlock_always(c_list_lock);
4128 
4129 		for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
4130 #if XNU_TARGET_OS_OSX
4131 			donate_queue_head = (c_segment_t*) &(pgo_iothread_internal_state[i].current_early_swapout_chead);
4132 #else /* XNU_TARGET_OS_OSX */
4133 			if (memorystatus_swap_all_apps) {
4134 				donate_queue_head = (c_segment_t*) &(pgo_iothread_internal_state[i].current_late_swapout_chead);
4135 			} else {
4136 				donate_queue_head = NULL;
4137 			}
4138 #endif /* XNU_TARGET_OS_OSX */
4139 
4140 			if (current_chead == donate_queue_head) {
4141 				c_seg->c_has_donated_pages = 1;
4142 				break;
4143 			}
4144 		}
4145 
4146 		*current_chead = c_seg;
4147 
4148 #if DEVELOPMENT || DEBUG
4149 		C_SEG_MAKE_WRITEABLE(c_seg);
4150 #endif
4151 	}
4152 	c_seg_alloc_nextslot(c_seg);
4153 
4154 	size_to_populate = c_seg_allocsize - C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset);
4155 
4156 	if (size_to_populate) {
4157 		min_needed = PAGE_SIZE + (c_seg_allocsize - c_seg_bufsize);
4158 
4159 		if (C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset - c_seg->c_nextoffset) < (unsigned) min_needed) {
4160 			if (size_to_populate > C_SEG_MAX_POPULATE_SIZE) {
4161 				size_to_populate = C_SEG_MAX_POPULATE_SIZE;
4162 			}
4163 
4164 			OSAddAtomic64(size_to_populate / PAGE_SIZE, &vm_pageout_vminfo.vm_compressor_pages_grabbed);
4165 
4166 			kernel_memory_populate(
4167 				(vm_offset_t) &c_seg->c_store.c_buffer[c_seg->c_populated_offset],
4168 				size_to_populate,
4169 				KMA_NOFAIL | KMA_COMPRESSOR,
4170 				VM_KERN_MEMORY_COMPRESSOR);
4171 		} else {
4172 			size_to_populate = 0;
4173 		}
4174 	}
4175 	PAGE_REPLACEMENT_DISALLOWED(TRUE);
4176 
4177 	lck_mtx_lock_spin_always(&c_seg->c_lock);
4178 
4179 	if (size_to_populate) {
4180 		c_seg->c_populated_offset += C_SEG_BYTES_TO_OFFSET(size_to_populate);
4181 	}
4182 
4183 	return c_seg;
4184 }
4185 
4186 #if DEVELOPMENT || DEBUG
4187 #if CONFIG_FREEZE
4188 extern boolean_t memorystatus_freeze_to_memory;
4189 #endif /* CONFIG_FREEZE */
4190 #endif /* DEVELOPMENT || DEBUG */
4191 uint64_t c_seg_total_donated_bytes = 0; /* For testing/debugging only for now. Remove and add new counters for vm_stat.*/
4192 
4193 uint64_t c_seg_filled_no_contention = 0;
4194 uint64_t c_seg_filled_contention = 0;
4195 clock_sec_t c_seg_filled_contention_sec_max = 0;
4196 clock_nsec_t c_seg_filled_contention_nsec_max = 0;
4197 
4198 static void
4199 c_current_seg_filled(c_segment_t c_seg, c_segment_t *current_chead)
4200 {
4201 	uint32_t        unused_bytes;
4202 	uint32_t        offset_to_depopulate;
4203 	int             new_state = C_ON_AGE_Q;
4204 	clock_sec_t     sec;
4205 	clock_nsec_t    nsec;
4206 	bool            head_insert = false, wakeup_swapout_thread = false;
4207 
4208 	unused_bytes = trunc_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset - c_seg->c_nextoffset));
4209 
4210 	if (unused_bytes) {
4211 		offset_to_depopulate = C_SEG_BYTES_TO_OFFSET(round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_nextoffset)));
4212 
4213 		/*
4214 		 *  release the extra physical page(s) at the end of the segment
4215 		 */
4216 		lck_mtx_unlock_always(&c_seg->c_lock);
4217 
4218 		kernel_memory_depopulate(
4219 			(vm_offset_t) &c_seg->c_store.c_buffer[offset_to_depopulate],
4220 			unused_bytes,
4221 			KMA_COMPRESSOR,
4222 			VM_KERN_MEMORY_COMPRESSOR);
4223 
4224 		lck_mtx_lock_spin_always(&c_seg->c_lock);
4225 
4226 		c_seg->c_populated_offset = offset_to_depopulate;
4227 	}
4228 	assert(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset) <= c_seg_bufsize);
4229 
4230 #if DEVELOPMENT || DEBUG
4231 	{
4232 		boolean_t       c_seg_was_busy = FALSE;
4233 
4234 		if (!c_seg->c_busy) {
4235 			C_SEG_BUSY(c_seg);
4236 		} else {
4237 			c_seg_was_busy = TRUE;
4238 		}
4239 
4240 		lck_mtx_unlock_always(&c_seg->c_lock);
4241 
4242 		C_SEG_WRITE_PROTECT(c_seg);
4243 
4244 		lck_mtx_lock_spin_always(&c_seg->c_lock);
4245 
4246 		if (c_seg_was_busy == FALSE) {
4247 			C_SEG_WAKEUP_DONE(c_seg);
4248 		}
4249 	}
4250 #endif
4251 
4252 #if CONFIG_FREEZE
4253 	if (current_chead == (c_segment_t*) &(freezer_context_global.freezer_ctx_chead) &&
4254 	    VM_CONFIG_SWAP_IS_PRESENT &&
4255 	    VM_CONFIG_FREEZER_SWAP_IS_ACTIVE
4256 #if DEVELOPMENT || DEBUG
4257 	    && !memorystatus_freeze_to_memory
4258 #endif /* DEVELOPMENT || DEBUG */
4259 	    ) {
4260 		new_state = C_ON_SWAPOUT_Q;
4261 		wakeup_swapout_thread = true;
4262 	}
4263 #endif /* CONFIG_FREEZE */
4264 
4265 	if (vm_darkwake_mode == TRUE) {
4266 		new_state = C_ON_SWAPOUT_Q;
4267 		head_insert = true;
4268 		wakeup_swapout_thread = true;
4269 	} else {
4270 		c_segment_t *donate_queue_head;
4271 		for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
4272 #if XNU_TARGET_OS_OSX
4273 			donate_queue_head = (c_segment_t*) &(pgo_iothread_internal_state[i].current_early_swapout_chead);
4274 #else /* XNU_TARGET_OS_OSX */
4275 			donate_queue_head = (c_segment_t*) &(pgo_iothread_internal_state[i].current_late_swapout_chead);
4276 #endif /* XNU_TARGET_OS_OSX */
4277 
4278 			if (current_chead == donate_queue_head) {
4279 				assert(c_seg->c_has_donated_pages);
4280 				new_state = C_ON_SWAPOUT_Q;
4281 				c_seg_total_donated_bytes += c_seg->c_bytes_used;
4282 				break;
4283 			}
4284 		}
4285 	}
4286 
4287 	clock_get_system_nanotime(&sec, &nsec);
4288 	c_seg->c_creation_ts = (uint32_t)sec;
4289 
4290 	if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
4291 		clock_sec_t     sec2;
4292 		clock_nsec_t    nsec2;
4293 
4294 		lck_mtx_lock_spin_always(c_list_lock);
4295 		clock_get_system_nanotime(&sec2, &nsec2);
4296 		TIME_SUB(sec2, sec, nsec2, nsec, NSEC_PER_SEC);
4297 		// printf("FBDP %s: head %p waited for c_list_lock for %lu.%09u seconds\n", __FUNCTION__, current_chead, sec2, nsec2);
4298 		if (sec2 > c_seg_filled_contention_sec_max) {
4299 			c_seg_filled_contention_sec_max = sec2;
4300 			c_seg_filled_contention_nsec_max = nsec2;
4301 		} else if (sec2 == c_seg_filled_contention_sec_max &&
4302 		    nsec2 > c_seg_filled_contention_nsec_max) {
4303 			c_seg_filled_contention_nsec_max = nsec2;
4304 		}
4305 		c_seg_filled_contention++;
4306 	} else {
4307 		c_seg_filled_no_contention++;
4308 	}
4309 
4310 #if CONFIG_FREEZE
4311 	if (current_chead == (c_segment_t*) &(freezer_context_global.freezer_ctx_chead)) {
4312 		if (freezer_context_global.freezer_ctx_task->donates_own_pages) {
4313 			assert(!c_seg->c_has_donated_pages);
4314 			c_seg->c_has_donated_pages = 1;
4315 			OSAddAtomic(c_seg->c_slots_used, &c_segment_pages_compressed_incore_late_swapout);
4316 		}
4317 		c_seg->c_has_freezer_pages = 1;
4318 	}
4319 #endif /* CONFIG_FREEZE */
4320 
4321 	c_seg->c_generation_id = c_generation_id++;
4322 	c_seg_switch_state(c_seg, new_state, head_insert);
4323 
4324 #if CONFIG_FREEZE
4325 	/*
4326 	 * Donated segments count as frozen to swap if we go through the freezer.
4327 	 * TODO: What we need is a new ledger and cseg state that can describe
4328 	 * a frozen cseg from a donated task so we can accurately decrement it on
4329 	 * swapins.
4330 	 */
4331 	if (current_chead == (c_segment_t*) &(freezer_context_global.freezer_ctx_chead) && (c_seg->c_state == C_ON_SWAPOUT_Q)) {
4332 		/*
4333 		 * darkwake and freezer can't co-exist together
4334 		 * We'll need to fix this accounting as a start.
4335 		 * And early donation c_segs are separate from frozen c_segs.
4336 		 */
4337 		assert(vm_darkwake_mode == FALSE);
4338 		c_seg_update_task_owner(c_seg, freezer_context_global.freezer_ctx_task);
4339 		freezer_context_global.freezer_ctx_swapped_bytes += c_seg->c_bytes_used;
4340 	}
4341 #endif /* CONFIG_FREEZE */
4342 
4343 	if (c_seg->c_state == C_ON_AGE_Q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) {
4344 #if CONFIG_FREEZE
4345 		assert(c_seg->c_task_owner == NULL);
4346 #endif /* CONFIG_FREEZE */
4347 		c_seg_need_delayed_compaction(c_seg, TRUE);
4348 	}
4349 
4350 	lck_mtx_unlock_always(c_list_lock);
4351 
4352 	if (wakeup_swapout_thread) {
4353 		/*
4354 		 * Darkwake and Freeze configs always
4355 		 * wake up the swapout thread because
4356 		 * the compactor thread that normally handles
4357 		 * it may not be running as much in these
4358 		 * configs.
4359 		 */
4360 		thread_wakeup((event_t)&vm_swapout_thread);
4361 	}
4362 
4363 	*current_chead = NULL;
4364 }
4365 
4366 /*
4367  * returns with c_seg locked
4368  */
4369 void
4370 c_seg_swapin_requeue(c_segment_t c_seg, boolean_t has_data, boolean_t minor_compact_ok, boolean_t age_on_swapin_q)
4371 {
4372 	clock_sec_t     sec;
4373 	clock_nsec_t    nsec;
4374 
4375 	clock_get_system_nanotime(&sec, &nsec);
4376 
4377 	lck_mtx_lock_spin_always(c_list_lock);
4378 	lck_mtx_lock_spin_always(&c_seg->c_lock);
4379 
4380 	assert(c_seg->c_busy_swapping);
4381 	assert(c_seg->c_busy);
4382 
4383 	c_seg->c_busy_swapping = 0;
4384 
4385 	if (c_seg->c_overage_swap == TRUE) {
4386 		c_overage_swapped_count--;
4387 		c_seg->c_overage_swap = FALSE;
4388 	}
4389 	if (has_data == TRUE) {
4390 		if (age_on_swapin_q == TRUE || c_seg->c_has_donated_pages) {
4391 #if CONFIG_FREEZE
4392 			/*
4393 			 * If a segment has both identities, frozen and donated bits set, the donated
4394 			 * bit wins on the swapin path. This is because the segment is being swapped back
4395 			 * in and so is in demand and should be given more time to spend in memory before
4396 			 * being swapped back out under pressure.
4397 			 */
4398 			if (c_seg->c_has_donated_pages) {
4399 				c_seg->c_has_freezer_pages = 0;
4400 			}
4401 #endif /* CONFIG_FREEZE */
4402 			c_seg_switch_state(c_seg, C_ON_SWAPPEDIN_Q, FALSE);
4403 		} else {
4404 			c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
4405 		}
4406 
4407 		if (minor_compact_ok == TRUE && !c_seg->c_on_minorcompact_q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) {
4408 			c_seg_need_delayed_compaction(c_seg, TRUE);
4409 		}
4410 	} else {
4411 		c_seg->c_store.c_buffer = (int32_t*) NULL;
4412 		c_seg->c_populated_offset = C_SEG_BYTES_TO_OFFSET(0);
4413 
4414 		c_seg_switch_state(c_seg, C_ON_BAD_Q, FALSE);
4415 	}
4416 	c_seg->c_swappedin_ts = (uint32_t)sec;
4417 	c_seg->c_swappedin = true;
4418 
4419 	lck_mtx_unlock_always(c_list_lock);
4420 }
4421 
4422 
4423 
4424 /*
4425  * c_seg has to be locked and is returned locked if the c_seg isn't freed
4426  * PAGE_REPLACEMENT_DISALLOWED has to be TRUE on entry and is returned TRUE
4427  * c_seg_swapin returns 1 if the c_seg was freed, 0 otherwise
4428  */
4429 
4430 int
4431 c_seg_swapin(c_segment_t c_seg, boolean_t force_minor_compaction, boolean_t age_on_swapin_q)
4432 {
4433 	vm_offset_t     addr = 0;
4434 	uint32_t        io_size = 0;
4435 	uint64_t        f_offset;
4436 	thread_pri_floor_t token;
4437 
4438 	assert(C_SEG_IS_ONDISK(c_seg));
4439 
4440 #if !CHECKSUM_THE_SWAP
4441 	c_seg_trim_tail(c_seg);
4442 #endif
4443 	io_size = round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset));
4444 	f_offset = c_seg->c_store.c_swap_handle;
4445 
4446 	C_SEG_BUSY(c_seg);
4447 	c_seg->c_busy_swapping = 1;
4448 
4449 	/*
4450 	 * This thread is likely going to block for I/O.
4451 	 * Make sure it is ready to run when the I/O completes because
4452 	 * it needs to clear the busy bit on the c_seg so that other
4453 	 * waiting threads can make progress too.
4454 	 */
4455 	token = thread_priority_floor_start();
4456 	lck_mtx_unlock_always(&c_seg->c_lock);
4457 
4458 	PAGE_REPLACEMENT_DISALLOWED(FALSE);
4459 
4460 	addr = (vm_offset_t)C_SEG_BUFFER_ADDRESS(c_seg->c_mysegno);
4461 	c_seg->c_store.c_buffer = (int32_t*) addr;
4462 
4463 	kernel_memory_populate(addr, io_size, KMA_NOFAIL | KMA_COMPRESSOR,
4464 	    VM_KERN_MEMORY_COMPRESSOR);
4465 
4466 	if (vm_swap_get(c_seg, f_offset, io_size) != KERN_SUCCESS) {
4467 		PAGE_REPLACEMENT_DISALLOWED(TRUE);
4468 
4469 		kernel_memory_depopulate(addr, io_size, KMA_COMPRESSOR,
4470 		    VM_KERN_MEMORY_COMPRESSOR);
4471 
4472 		c_seg_swapin_requeue(c_seg, FALSE, TRUE, age_on_swapin_q);
4473 	} else {
4474 #if ENCRYPTED_SWAP
4475 		vm_swap_decrypt(c_seg);
4476 #endif /* ENCRYPTED_SWAP */
4477 
4478 #if CHECKSUM_THE_SWAP
4479 		if (c_seg->cseg_swap_size != io_size) {
4480 			panic("swapin size doesn't match swapout size");
4481 		}
4482 
4483 		if (c_seg->cseg_hash != vmc_hash((char*) c_seg->c_store.c_buffer, (int)io_size)) {
4484 			panic("c_seg_swapin - Swap hash mismatch");
4485 		}
4486 #endif /* CHECKSUM_THE_SWAP */
4487 
4488 		PAGE_REPLACEMENT_DISALLOWED(TRUE);
4489 
4490 		c_seg_swapin_requeue(c_seg, TRUE, force_minor_compaction == TRUE ? FALSE : TRUE, age_on_swapin_q);
4491 
4492 #if CONFIG_FREEZE
4493 		/*
4494 		 * c_seg_swapin_requeue() returns with the c_seg lock held.
4495 		 */
4496 		if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
4497 			assert(c_seg->c_busy);
4498 
4499 			lck_mtx_unlock_always(&c_seg->c_lock);
4500 			lck_mtx_lock_spin_always(c_list_lock);
4501 			lck_mtx_lock_spin_always(&c_seg->c_lock);
4502 		}
4503 
4504 		if (c_seg->c_task_owner) {
4505 			c_seg_update_task_owner(c_seg, NULL);
4506 		}
4507 
4508 		lck_mtx_unlock_always(c_list_lock);
4509 
4510 		OSAddAtomic(c_seg->c_slots_used, &c_segment_pages_compressed_incore);
4511 		if (c_seg->c_has_donated_pages) {
4512 			OSAddAtomic(c_seg->c_slots_used, &c_segment_pages_compressed_incore_late_swapout);
4513 		}
4514 #endif /* CONFIG_FREEZE */
4515 
4516 		OSAddAtomic64(c_seg->c_bytes_used, &compressor_bytes_used);
4517 
4518 		if (force_minor_compaction == TRUE) {
4519 			if (c_seg_minor_compaction_and_unlock(c_seg, FALSE)) {
4520 				/*
4521 				 * c_seg was completely empty so it was freed,
4522 				 * so be careful not to reference it again
4523 				 *
4524 				 * Drop the boost so that the thread priority
4525 				 * is returned back to where it is supposed to be.
4526 				 */
4527 				thread_priority_floor_end(&token);
4528 				return 1;
4529 			}
4530 
4531 			lck_mtx_lock_spin_always(&c_seg->c_lock);
4532 		}
4533 	}
4534 	C_SEG_WAKEUP_DONE(c_seg);
4535 
4536 	/*
4537 	 * Drop the boost so that the thread priority
4538 	 * is returned back to where it is supposed to be.
4539 	 */
4540 	thread_priority_floor_end(&token);
4541 
4542 	return 0;
4543 }
4544 
4545 
4546 static void
4547 c_segment_sv_hash_drop_ref(int hash_indx)
4548 {
4549 	struct c_sv_hash_entry o_sv_he, n_sv_he;
4550 
4551 	while (1) {
4552 		o_sv_he.he_record = c_segment_sv_hash_table[hash_indx].he_record;
4553 
4554 		n_sv_he.he_ref = o_sv_he.he_ref - 1;
4555 		n_sv_he.he_data = o_sv_he.he_data;
4556 
4557 		if (OSCompareAndSwap64((UInt64)o_sv_he.he_record, (UInt64)n_sv_he.he_record, (UInt64 *) &c_segment_sv_hash_table[hash_indx].he_record) == TRUE) {
4558 			if (n_sv_he.he_ref == 0) {
4559 				OSAddAtomic(-1, &c_segment_svp_in_hash);
4560 			}
4561 			break;
4562 		}
4563 	}
4564 }
4565 
4566 
4567 static int
4568 c_segment_sv_hash_insert(uint32_t data)
4569 {
4570 	int             hash_sindx;
4571 	int             misses;
4572 	struct c_sv_hash_entry o_sv_he, n_sv_he;
4573 	boolean_t       got_ref = FALSE;
4574 
4575 	if (data == 0) {
4576 		OSAddAtomic(1, &c_segment_svp_zero_compressions);
4577 	} else {
4578 		OSAddAtomic(1, &c_segment_svp_nonzero_compressions);
4579 	}
4580 
4581 	hash_sindx = data & C_SV_HASH_MASK;
4582 
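	/*
	 * Open-addressed table: probe linearly from the hashed slot and take
	 * a reference with a 64-bit CAS on the {ref, data} record. Give up
	 * after C_SV_HASH_MAX_MISS probes and let the caller fall back to
	 * storing the value in a segment.
	 */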
4583 	for (misses = 0; misses < C_SV_HASH_MAX_MISS; misses++) {
4584 		o_sv_he.he_record = c_segment_sv_hash_table[hash_sindx].he_record;
4585 
4586 		while (o_sv_he.he_data == data || o_sv_he.he_ref == 0) {
4587 			n_sv_he.he_ref = o_sv_he.he_ref + 1;
4588 			n_sv_he.he_data = data;
4589 
4590 			if (OSCompareAndSwap64((UInt64)o_sv_he.he_record, (UInt64)n_sv_he.he_record, (UInt64 *) &c_segment_sv_hash_table[hash_sindx].he_record) == TRUE) {
4591 				if (n_sv_he.he_ref == 1) {
4592 					OSAddAtomic(1, &c_segment_svp_in_hash);
4593 				}
4594 				got_ref = TRUE;
4595 				break;
4596 			}
4597 			o_sv_he.he_record = c_segment_sv_hash_table[hash_sindx].he_record;
4598 		}
4599 		if (got_ref == TRUE) {
4600 			break;
4601 		}
4602 		hash_sindx++;
4603 
4604 		if (hash_sindx == C_SV_HASH_SIZE) {
4605 			hash_sindx = 0;
4606 		}
4607 	}
4608 	if (got_ref == FALSE) {
4609 		return -1;
4610 	}
4611 
4612 	return hash_sindx;
4613 }
4614 
4615 
4616 #if RECORD_THE_COMPRESSED_DATA
4617 
4618 static void
4619 c_compressed_record_data(char *src, int c_size)
4620 {
4621 	if ((c_compressed_record_cptr + c_size + 4) >= c_compressed_record_ebuf) {
4622 		panic("c_compressed_record_cptr >= c_compressed_record_ebuf");
4623 	}
4624 
4625 	*(int *)((void *)c_compressed_record_cptr) = c_size;
4626 
4627 	c_compressed_record_cptr += 4;
4628 
4629 	memcpy(c_compressed_record_cptr, src, c_size);
4630 	c_compressed_record_cptr += c_size;
4631 }
4632 #endif
4633 
4634 
4635 static int
4636 c_compress_page(char *src, c_slot_mapping_t slot_ptr, c_segment_t *current_chead, char *scratch_buf)
4637 {
4638 	int             c_size = -1;
4639 	int             c_rounded_size = 0;
4640 	int             max_csize;
4641 	c_slot_t        cs;
4642 	c_segment_t     c_seg;
4643 	bool            single_value = false;
4644 
4645 	KERNEL_DEBUG(0xe0400000 | DBG_FUNC_START, *current_chead, 0, 0, 0, 0);
4646 retry:
4647 	if ((c_seg = c_seg_allocate(current_chead)) == NULL) {
4648 		return 1;
4649 	}
4650 	/*
4651 	 * returns with c_seg lock held
4652 	 * and PAGE_REPLACEMENT_DISALLOWED(TRUE)...
4653 	 * c_nextslot has been allocated and
4654 	 * c_store.c_buffer populated
4655 	 */
4656 	assert(c_seg->c_state == C_IS_FILLING);
4657 
4658 	cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_seg->c_nextslot);
4659 
4660 	C_SLOT_ASSERT_PACKABLE(slot_ptr);
4661 	cs->c_packed_ptr = C_SLOT_PACK_PTR(slot_ptr);
4662 
4663 	cs->c_offset = c_seg->c_nextoffset;
4664 
4665 	max_csize = c_seg_bufsize - C_SEG_OFFSET_TO_BYTES((int32_t)cs->c_offset);
4666 
4667 	if (max_csize > PAGE_SIZE) {
4668 		max_csize = PAGE_SIZE;
4669 	}
4670 
4671 #if CHECKSUM_THE_DATA
4672 	cs->c_hash_data = vmc_hash(src, PAGE_SIZE);
4673 #endif
4674 	boolean_t incomp_copy = FALSE;
4675 	int max_csize_adj = (max_csize - 4);
4676 
4677 	if (vm_compressor_algorithm() != VM_COMPRESSOR_DEFAULT_CODEC) {
4678 #if defined(__arm64__)
4679 		uint16_t ccodec = CINVALID;
4680 		uint32_t inline_popcount;
4681 		if (max_csize >= C_SEG_OFFSET_ALIGNMENT_BOUNDARY) {
4682 			c_size = metacompressor((const uint8_t *) src,
4683 			    (uint8_t *) &c_seg->c_store.c_buffer[cs->c_offset],
4684 			    max_csize_adj, &ccodec,
4685 			    scratch_buf, &incomp_copy, &inline_popcount);
4686 			assert(inline_popcount == C_SLOT_NO_POPCOUNT);
4687 
4688 #if C_SEG_OFFSET_ALIGNMENT_BOUNDARY > 4
4689 			if (c_size > max_csize_adj) {
4690 				c_size = -1;
4691 			}
4692 #endif
4693 		} else {
4694 			c_size = -1;
4695 		}
4696 		assert(ccodec == CCWK || ccodec == CCLZ4);
4697 		cs->c_codec = ccodec;
4698 #endif
4699 	} else {
4700 #if defined(__arm64__)
4701 		cs->c_codec = CCWK;
4702 		__unreachable_ok_push
4703 		if (PAGE_SIZE == 4096) {
4704 			c_size = WKdm_compress_4k((WK_word *)(uintptr_t)src, (WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
4705 			    (WK_word *)(uintptr_t)scratch_buf, max_csize_adj);
4706 		} else {
4707 			c_size = WKdm_compress_16k((WK_word *)(uintptr_t)src, (WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
4708 			    (WK_word *)(uintptr_t)scratch_buf, max_csize_adj);
4709 		}
4710 		__unreachable_ok_pop
4711 #else
4712 		c_size = WKdm_compress_new((const WK_word *)(uintptr_t)src, (WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
4713 		    (WK_word *)(uintptr_t)scratch_buf, max_csize_adj);
4714 #endif
4715 	}
4716 	assertf(((c_size <= max_csize_adj) && (c_size >= -1)),
4717 	    "c_size invalid (%d, %d), cur compressions: %d", c_size, max_csize_adj, c_segment_pages_compressed);
4718 
4719 	if (c_size == -1) {
4720 		if (max_csize < PAGE_SIZE) {
4721 			c_current_seg_filled(c_seg, current_chead);
4722 			assert(*current_chead == NULL);
4723 
4724 			lck_mtx_unlock_always(&c_seg->c_lock);
4725 			/* TODO: it may be worth requiring codecs to distinguish
4726 			 * between incompressible inputs and failures due to
4727 			 * budget exhaustion.
4728 			 */
4729 			PAGE_REPLACEMENT_DISALLOWED(FALSE);
4730 			goto retry;
4731 		}
4732 		c_size = PAGE_SIZE;
4733 
4734 		if (incomp_copy == FALSE) {
4735 			memcpy(&c_seg->c_store.c_buffer[cs->c_offset], src, c_size);
4736 		}
4737 
4738 		OSAddAtomic(1, &c_segment_noncompressible_pages);
4739 	} else if (c_size == 0) {
4740 		int             hash_index;
4741 
4742 		/*
4743 		 * special case - this is a page completely full of a single 32 bit value
4744 		 */
4745 		single_value = true;
4746 		hash_index = c_segment_sv_hash_insert(*(uint32_t *)(uintptr_t)src);
4747 
4748 		if (hash_index != -1) {
4749 			slot_ptr->s_cindx = hash_index;
4750 			slot_ptr->s_cseg = C_SV_CSEG_ID;
4751 
4752 			OSAddAtomic(1, &c_segment_svp_hash_succeeded);
4753 #if RECORD_THE_COMPRESSED_DATA
4754 			c_compressed_record_data(src, 4);
4755 #endif
4756 			goto sv_compression;
4757 		}
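		/* the single-value hash was full, so store the 4-byte pattern in the segment instead */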
4758 		c_size = 4;
4759 
4760 		memcpy(&c_seg->c_store.c_buffer[cs->c_offset], src, c_size);
4761 
4762 		OSAddAtomic(1, &c_segment_svp_hash_failed);
4763 	}
4764 
4765 #if RECORD_THE_COMPRESSED_DATA
4766 	c_compressed_record_data((char *)&c_seg->c_store.c_buffer[cs->c_offset], c_size);
4767 #endif
4768 #if CHECKSUM_THE_COMPRESSED_DATA
4769 	cs->c_hash_compressed_data = vmc_hash((char *)&c_seg->c_store.c_buffer[cs->c_offset], c_size);
4770 #endif
4771 #if POPCOUNT_THE_COMPRESSED_DATA
4772 	cs->c_pop_cdata = vmc_pop((uintptr_t) &c_seg->c_store.c_buffer[cs->c_offset], c_size);
4773 #endif
4774 	c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;
4775 
4776 	PACK_C_SIZE(cs, c_size);
4777 	c_seg->c_bytes_used += c_rounded_size;
4778 	c_seg->c_nextoffset += C_SEG_BYTES_TO_OFFSET(c_rounded_size);
4779 	c_seg->c_slots_used++;
4780 
4781 #if CONFIG_FREEZE
4782 	/* TODO: should c_segment_pages_compressed be up here too? See 88598046 for details */
4783 	OSAddAtomic(1, &c_segment_pages_compressed_incore);
4784 	if (c_seg->c_has_donated_pages) {
4785 		OSAddAtomic(1, &c_segment_pages_compressed_incore_late_swapout);
4786 	}
4787 #endif /* CONFIG_FREEZE */
4788 
4789 	slot_ptr->s_cindx = c_seg->c_nextslot++;
4790 	/* <csegno=0,indx=0> would mean "empty slot", so use csegno+1 */
4791 	slot_ptr->s_cseg = c_seg->c_mysegno + 1;
4792 
4793 sv_compression:
4794 	if (c_seg->c_nextoffset >= c_seg_off_limit || c_seg->c_nextslot >= C_SLOT_MAX_INDEX) {
4795 		c_current_seg_filled(c_seg, current_chead);
4796 		assert(*current_chead == NULL);
4797 	}
4798 
4799 	lck_mtx_unlock_always(&c_seg->c_lock);
4800 
4801 	PAGE_REPLACEMENT_DISALLOWED(FALSE);
4802 
4803 #if RECORD_THE_COMPRESSED_DATA
4804 	if ((c_compressed_record_cptr - c_compressed_record_sbuf) >= c_seg_allocsize) {
4805 		c_compressed_record_write(c_compressed_record_sbuf, (int)(c_compressed_record_cptr - c_compressed_record_sbuf));
4806 		c_compressed_record_cptr = c_compressed_record_sbuf;
4807 	}
4808 #endif
4809 	if (c_size) {
4810 		OSAddAtomic64(c_size, &c_segment_compressed_bytes);
4811 		OSAddAtomic64(c_rounded_size, &compressor_bytes_used);
4812 	}
4813 	OSAddAtomic64(PAGE_SIZE, &c_segment_input_bytes);
4814 
4815 	OSAddAtomic(1, &c_segment_pages_compressed);
4816 #if DEVELOPMENT || DEBUG
4817 	if (!compressor_running_perf_test) {
4818 		/*
4819 		 * The perf_compressor benchmark should not be able to trigger
4820 		 * compressor thrashing jetsams.
4821 		 */
4822 		OSAddAtomic(1, &sample_period_compression_count);
4823 	}
4824 #else /* DEVELOPMENT || DEBUG */
4825 	OSAddAtomic(1, &sample_period_compression_count);
4826 #endif /* DEVELOPMENT || DEBUG */
4827 
4828 	KERNEL_DEBUG(0xe0400000 | DBG_FUNC_END, *current_chead, c_size, c_segment_input_bytes, c_segment_compressed_bytes, 0);
4829 
4830 	return 0;
4831 }
4832 
4833 static inline void
4834 sv_decompress(int32_t *ddst, int32_t pattern)
4835 {
4836 //	assert(__builtin_constant_p(PAGE_SIZE) != 0);
4837 #if defined(__x86_64__)
4838 	memset_word(ddst, pattern, PAGE_SIZE / sizeof(int32_t));
4839 #elif defined(__arm64__)
4840 	assert((PAGE_SIZE % 128) == 0);
4841 	if (pattern == 0) {
4842 		fill32_dczva((addr64_t)ddst, PAGE_SIZE);
4843 	} else {
4844 		fill32_nt((addr64_t)ddst, PAGE_SIZE, pattern);
4845 	}
4846 #else
4847 	size_t          i;
4848 
4849 	/* Unroll the pattern fill loop 4x to encourage the
4850 	 * compiler to emit NEON stores, cf.
4851 	 * <rdar://problem/25839866> Loop autovectorization
4852 	 * anomalies.
4853 	 */
4854 	/* We use separate loops for each PAGE_SIZE
4855 	 * to allow the autovectorizer to engage, as PAGE_SIZE
4856 	 * may not be a constant.
4857 	 */
4858 
4859 	__unreachable_ok_push
4860 	if (PAGE_SIZE == 4096) {
4861 		for (i = 0; i < (4096U / sizeof(int32_t)); i += 4) {
4862 			*ddst++ = pattern;
4863 			*ddst++ = pattern;
4864 			*ddst++ = pattern;
4865 			*ddst++ = pattern;
4866 		}
4867 	} else {
4868 		assert(PAGE_SIZE == 16384);
4869 		for (i = 0; i < (int)(16384U / sizeof(int32_t)); i += 4) {
4870 			*ddst++ = pattern;
4871 			*ddst++ = pattern;
4872 			*ddst++ = pattern;
4873 			*ddst++ = pattern;
4874 		}
4875 	}
4876 	__unreachable_ok_pop
4877 #endif
4878 }
4879 
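/*
 * Decompress (or free) the compressed page described by 'slot_ptr'.
 * A NULL 'dst' means the slot is being freed without copying the data out.
 * 'flags' may include C_KEEP (leave the slot intact), C_DONT_BLOCK (return
 * -2 rather than block on a busy or on-disk segment) and C_KDP (kernel
 * debugger context). '*zeroslot' tells the caller whether to clear the slot
 * mapping on return. Returns 0 on success, 1 if the segment first had to be
 * swapped in, -1 if the swapin/decompression failed, and -2 for the
 * C_DONT_BLOCK / C_KDP "would block" cases.
 */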
4880 static int
4881 c_decompress_page(char *dst, volatile c_slot_mapping_t slot_ptr, int flags, int *zeroslot)
4882 {
4883 	c_slot_t        cs;
4884 	c_segment_t     c_seg;
4885 	uint32_t        c_segno;
4886 	uint16_t        c_indx;
4887 	int             c_rounded_size;
4888 	uint32_t        c_size;
4889 	int             retval = 0;
4890 	boolean_t       need_unlock = TRUE;
4891 	boolean_t       consider_defragmenting = FALSE;
4892 	boolean_t       kdp_mode = FALSE;
4893 
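	/*
	 * Debugger (KDP) mode: we may be running with the system stopped, so
	 * the request must be non-destructive (C_KEEP) and non-blocking
	 * (C_DONT_BLOCK), and we bail out with -2 if any lock we would need
	 * is already held.
	 */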
4894 	if (__improbable(flags & C_KDP)) {
4895 		if (not_in_kdp) {
4896 			panic("C_KDP passed to decompress page from outside of debugger context");
4897 		}
4898 
4899 		assert((flags & C_KEEP) == C_KEEP);
4900 		assert((flags & C_DONT_BLOCK) == C_DONT_BLOCK);
4901 
4902 		if ((flags & (C_DONT_BLOCK | C_KEEP)) != (C_DONT_BLOCK | C_KEEP)) {
4903 			return -2;
4904 		}
4905 
4906 		kdp_mode = TRUE;
4907 		*zeroslot = 0;
4908 	}
4909 
4910 ReTry:
4911 	if (__probable(!kdp_mode)) {
4912 		PAGE_REPLACEMENT_DISALLOWED(TRUE);
4913 	} else {
4914 		if (kdp_lck_rw_lock_is_acquired_exclusive(&c_master_lock)) {
4915 			return -2;
4916 		}
4917 	}
4918 
4919 #if HIBERNATION
4920 	/*
4921 	 * if hibernation is enabled, it indicates (via a call
4922 	 * to 'vm_decompressor_lock') that no further
4923 	 * decompressions are allowed once it reaches
4924 	 * the point of flushing all of the currently dirty
4925 	 * anonymous memory through the compressor and out
4926 	 * to disk... in this state we allow freeing of compressed
4927 	 * pages and must honor the C_DONT_BLOCK case
4928 	 */
4929 	if (__improbable(dst && decompressions_blocked == TRUE)) {
4930 		if (flags & C_DONT_BLOCK) {
4931 			if (__probable(!kdp_mode)) {
4932 				PAGE_REPLACEMENT_DISALLOWED(FALSE);
4933 			}
4934 
4935 			*zeroslot = 0;
4936 			return -2;
4937 		}
4938 		/*
4939 		 * it's safe to atomically assert and block behind the
4940 		 * lock held in shared mode because "decompressions_blocked" is
4941 		 * only set and cleared and the thread_wakeup done when the lock
4942 		 * is held exclusively
4943 		 */
4944 		assert_wait((event_t)&decompressions_blocked, THREAD_UNINT);
4945 
4946 		PAGE_REPLACEMENT_DISALLOWED(FALSE);
4947 
4948 		thread_block(THREAD_CONTINUE_NULL);
4949 
4950 		goto ReTry;
4951 	}
4952 #endif
4953 	/* s_cseg is actually "segno+1" */
4954 	c_segno = slot_ptr->s_cseg - 1;
4955 
4956 	if (__improbable(c_segno >= c_segments_available)) {
4957 		panic("c_decompress_page: c_segno %d >= c_segments_available %d, slot_ptr(%p), slot_data(%x)",
4958 		    c_segno, c_segments_available, slot_ptr, *(int *)((void *)slot_ptr));
4959 	}
4960 
4961 	if (__improbable(c_segments[c_segno].c_segno < c_segments_available)) {
4962 		panic("c_decompress_page: c_segno %d is free, slot_ptr(%p), slot_data(%x)",
4963 		    c_segno, slot_ptr, *(int *)((void *)slot_ptr));
4964 	}
4965 
4966 	c_seg = c_segments[c_segno].c_seg;
4967 
4968 	if (__probable(!kdp_mode)) {
4969 		lck_mtx_lock_spin_always(&c_seg->c_lock);
4970 	} else {
4971 		if (kdp_lck_mtx_lock_spin_is_acquired(&c_seg->c_lock)) {
4972 			return -2;
4973 		}
4974 	}
4975 
4976 	assert(c_seg->c_state != C_IS_EMPTY && c_seg->c_state != C_IS_FREE);
4977 
4978 	if (dst == NULL && c_seg->c_busy_swapping) {
4979 		assert(c_seg->c_busy);
4980 
4981 		goto bypass_busy_check;
4982 	}
4983 	if (flags & C_DONT_BLOCK) {
4984 		if (c_seg->c_busy || (C_SEG_IS_ONDISK(c_seg) && dst)) {
4985 			*zeroslot = 0;
4986 
4987 			retval = -2;
4988 			goto done;
4989 		}
4990 	}
4991 	if (c_seg->c_busy) {
4992 		PAGE_REPLACEMENT_DISALLOWED(FALSE);
4993 
4994 		c_seg_wait_on_busy(c_seg);
4995 
4996 		goto ReTry;
4997 	}
4998 bypass_busy_check:
4999 
5000 	c_indx = slot_ptr->s_cindx;
5001 
5002 	if (__improbable(c_indx >= c_seg->c_nextslot)) {
5003 		panic("c_decompress_page: c_indx %d >= c_nextslot %d, c_seg(%p), slot_ptr(%p), slot_data(%x)",
5004 		    c_indx, c_seg->c_nextslot, c_seg, slot_ptr, *(int *)((void *)slot_ptr));
5005 	}
5006 
5007 	cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
5008 
5009 	c_size = UNPACK_C_SIZE(cs);
5010 
5011 	if (__improbable(c_size == 0)) {
5012 		panic("c_decompress_page: c_size == 0, c_seg(%p), slot_ptr(%p), slot_data(%x)",
5013 		    c_seg, slot_ptr, *(int *)((void *)slot_ptr));
5014 	}
5015 
5016 	c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;
5017 
5018 	if (dst) {
5019 		uint32_t        age_of_cseg;
5020 		clock_sec_t     cur_ts_sec;
5021 		clock_nsec_t    cur_ts_nsec;
5022 
5023 		if (C_SEG_IS_ONDISK(c_seg)) {
5024 #if CONFIG_FREEZE
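			/*
			 * Swapping this segment back in will add its slots to the
			 * in-core compressed-page accounting. If that would push us
			 * past the compressor's limits, ask memorystatus to reclaim
			 * space (synchronous kill) and retry rather than overshoot.
			 */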
5025 			if (freezer_incore_cseg_acct) {
5026 				if ((c_seg->c_slots_used + c_segment_pages_compressed_incore) >= c_segment_pages_compressed_nearing_limit) {
5027 					PAGE_REPLACEMENT_DISALLOWED(FALSE);
5028 					lck_mtx_unlock_always(&c_seg->c_lock);
5029 
5030 					memorystatus_kill_on_VM_compressor_space_shortage(FALSE /* async */);
5031 
5032 					goto ReTry;
5033 				}
5034 
5035 				uint32_t incore_seg_count = c_segment_count - c_swappedout_count - c_swappedout_sparse_count;
5036 				if ((incore_seg_count + 1) >= c_segments_nearing_limit) {
5037 					PAGE_REPLACEMENT_DISALLOWED(FALSE);
5038 					lck_mtx_unlock_always(&c_seg->c_lock);
5039 
5040 					memorystatus_kill_on_VM_compressor_space_shortage(FALSE /* async */);
5041 
5042 					goto ReTry;
5043 				}
5044 			}
5045 #endif /* CONFIG_FREEZE */
5046 			assert(kdp_mode == FALSE);
5047 			retval = c_seg_swapin(c_seg, FALSE, TRUE);
5048 			assert(retval == 0);
5049 
5050 			retval = 1;
5051 		}
5052 		if (c_seg->c_state == C_ON_BAD_Q) {
5053 			assert(c_seg->c_store.c_buffer == NULL);
5054 			*zeroslot = 0;
5055 
5056 			retval = -1;
5057 			goto done;
5058 		}
5059 
5060 #if POPCOUNT_THE_COMPRESSED_DATA
5061 		unsigned csvpop;
5062 		uintptr_t csvaddr = (uintptr_t) &c_seg->c_store.c_buffer[cs->c_offset];
5063 		if (cs->c_pop_cdata != (csvpop = vmc_pop(csvaddr, c_size))) {
5064 			panic("Compressed data popcount doesn't match original, bit distance: %d %p (phys: %p) %p %p 0x%x 0x%x 0x%x 0x%x", (csvpop - cs->c_pop_cdata), (void *)csvaddr, (void *) kvtophys(csvaddr), c_seg, cs, cs->c_offset, c_size, csvpop, cs->c_pop_cdata);
5065 		}
5066 #endif
5067 
5068 #if CHECKSUM_THE_COMPRESSED_DATA
5069 		unsigned csvhash;
5070 		if (cs->c_hash_compressed_data != (csvhash = vmc_hash((char *)&c_seg->c_store.c_buffer[cs->c_offset], c_size))) {
5071 			panic("Compressed data doesn't match original %p %p %u %u %u", c_seg, cs, c_size, cs->c_hash_compressed_data, csvhash);
5072 		}
5073 #endif
5074 		if (c_rounded_size == PAGE_SIZE) {
5075 			/*
5076 			 * page wasn't compressible... just copy it out
5077 			 */
5078 			memcpy(dst, &c_seg->c_store.c_buffer[cs->c_offset], PAGE_SIZE);
5079 		} else if (c_size == 4) {
5080 			int32_t         data;
5081 			int32_t         *dptr;
5082 
5083 			/*
5084 			 * page was populated with a single value
5085 			 * that didn't fit into our fast hash
5086 			 * so we packed it in as a single non-compressed value
5087 			 * that we need to populate the page with
5088 			 */
5089 			dptr = (int32_t *)(uintptr_t)dst;
5090 			data = *(int32_t *)(&c_seg->c_store.c_buffer[cs->c_offset]);
5091 			sv_decompress(dptr, data);
5092 		} else {
5093 			uint32_t        my_cpu_no;
5094 			char            *scratch_buf;
5095 
5096 			if (__probable(!kdp_mode)) {
5097 				/*
5098 				 * we're behind the c_seg lock held in spin mode
5099 				 * which means pre-emption is disabled... therefore
5100 				 * the following sequence is atomic and safe
5101 				 */
5102 				my_cpu_no = cpu_number();
5103 
5104 				assert(my_cpu_no < compressor_cpus);
5105 
5106 				scratch_buf = &compressor_scratch_bufs[my_cpu_no * vm_compressor_get_decode_scratch_size()];
5107 			} else {
5108 				scratch_buf = kdp_compressor_scratch_buf;
5109 			}
5110 
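			/*
			 * Dispatch to the configured codec: non-default codecs
			 * (arm64 only here) decode via metadecompressor(), while the
			 * default WKdm codec uses the 4K or 16K variant matching the
			 * page size.
			 */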
5111 			if (vm_compressor_algorithm() != VM_COMPRESSOR_DEFAULT_CODEC) {
5112 #if defined(__arm64__)
5113 				uint16_t c_codec = cs->c_codec;
5114 				uint32_t inline_popcount;
5115 				if (!metadecompressor((const uint8_t *) &c_seg->c_store.c_buffer[cs->c_offset],
5116 				    (uint8_t *)dst, c_size, c_codec, (void *)scratch_buf, &inline_popcount)) {
5117 					retval = -1;
5118 				} else {
5119 					assert(inline_popcount == C_SLOT_NO_POPCOUNT);
5120 				}
5121 #endif
5122 			} else {
5123 #if defined(__arm64__)
5124 				__unreachable_ok_push
5125 				if (PAGE_SIZE == 4096) {
5126 					WKdm_decompress_4k((WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
5127 					    (WK_word *)(uintptr_t)dst, (WK_word *)(uintptr_t)scratch_buf, c_size);
5128 				} else {
5129 					WKdm_decompress_16k((WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
5130 					    (WK_word *)(uintptr_t)dst, (WK_word *)(uintptr_t)scratch_buf, c_size);
5131 				}
5132 				__unreachable_ok_pop
5133 #else
5134 				WKdm_decompress_new((WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
5135 				    (WK_word *)(uintptr_t)dst, (WK_word *)(uintptr_t)scratch_buf, c_size);
5136 #endif
5137 			}
5138 		}
5139 
5140 #if CHECKSUM_THE_DATA
5141 		if (cs->c_hash_data != vmc_hash(dst, PAGE_SIZE)) {
5142 #if defined(__arm64__)
5143 			int32_t *dinput = &c_seg->c_store.c_buffer[cs->c_offset];
5144 			panic("decompressed data doesn't match original cs: %p, hash: 0x%x, offset: %d, c_size: %d, c_rounded_size: %d, codec: %d, header: 0x%x 0x%x 0x%x", cs, cs->c_hash_data, cs->c_offset, c_size, c_rounded_size, cs->c_codec, *dinput, *(dinput + 1), *(dinput + 2));
5145 #else
5146 			panic("decompressed data doesn't match original cs: %p, hash: %d, offset: 0x%x, c_size: %d", cs, cs->c_hash_data, cs->c_offset, c_size);
5147 #endif
5148 		}
5149 #endif
5150 		if (c_seg->c_swappedin_ts == 0 && !kdp_mode) {
5151 			clock_get_system_nanotime(&cur_ts_sec, &cur_ts_nsec);
5152 
5153 			age_of_cseg = (uint32_t)cur_ts_sec - c_seg->c_creation_ts;
5154 			if (age_of_cseg < DECOMPRESSION_SAMPLE_MAX_AGE) {
5155 				OSAddAtomic(1, &age_of_decompressions_during_sample_period[age_of_cseg]);
5156 			} else {
5157 				OSAddAtomic(1, &overage_decompressions_during_sample_period);
5158 			}
5159 
5160 			OSAddAtomic(1, &sample_period_decompression_count);
5161 		}
5162 	}
5163 #if CONFIG_FREEZE
5164 	else {
5165 		/*
5166 		 * We are freeing a compressed page from this c_seg without decompressing it, so balance the ledgers.
5167 		 */
5168 		if (C_SEG_IS_ONDISK(c_seg)) {
5169 			/*
5170 			 * The compression sweep feature will push out anonymous pages to disk
5171 			 * without going through the freezer path and so those c_segs, while
5172 			 * swapped out, won't have an owner.
5173 			 */
5174 			if (c_seg->c_task_owner) {
5175 				task_update_frozen_to_swap_acct(c_seg->c_task_owner, PAGE_SIZE_64, DEBIT_FROM_SWAP);
5176 			}
5177 
5178 			/*
5179 			 * We are freeing a page in swap without swapping it in. We bump the in-core
5180 			 * count here to simulate a swapin of a page so that we can accurately
5181 			 * decrement it below.
5182 			 */
5183 			OSAddAtomic(1, &c_segment_pages_compressed_incore);
5184 			if (c_seg->c_has_donated_pages) {
5185 				OSAddAtomic(1, &c_segment_pages_compressed_incore_late_swapout);
5186 			}
5187 		} else if (c_seg->c_state == C_ON_BAD_Q) {
5188 			assert(c_seg->c_store.c_buffer == NULL);
5189 			*zeroslot = 0;
5190 
5191 			retval = -1;
5192 			goto done;
5193 		}
5194 	}
5195 #endif /* CONFIG_FREEZE */
5196 
5197 	if (flags & C_KEEP) {
5198 		*zeroslot = 0;
5199 		goto done;
5200 	}
5201 	assert(kdp_mode == FALSE);
5202 
5203 	c_seg->c_bytes_unused += c_rounded_size;
5204 	c_seg->c_bytes_used -= c_rounded_size;
5205 
5206 	assert(c_seg->c_slots_used);
5207 	c_seg->c_slots_used--;
5208 	if (dst && c_seg->c_swappedin) {
5209 		task_t task = current_task();
5210 		if (task) {
5211 			ledger_credit(task->ledger, task_ledgers.swapins, PAGE_SIZE);
5212 		}
5213 	}
5214 
5215 	PACK_C_SIZE(cs, 0);
5216 
5217 	if (c_indx < c_seg->c_firstemptyslot) {
5218 		c_seg->c_firstemptyslot = c_indx;
5219 	}
5220 
5221 	OSAddAtomic(-1, &c_segment_pages_compressed);
5222 #if CONFIG_FREEZE
5223 	OSAddAtomic(-1, &c_segment_pages_compressed_incore);
5224 	assertf(c_segment_pages_compressed_incore >= 0, "-ve incore count %p 0x%x", c_seg, c_segment_pages_compressed_incore);
5225 	if (c_seg->c_has_donated_pages) {
5226 		OSAddAtomic(-1, &c_segment_pages_compressed_incore_late_swapout);
5227 		assertf(c_segment_pages_compressed_incore_late_swapout >= 0, "-ve lateswapout count %p 0x%x", c_seg, c_segment_pages_compressed_incore_late_swapout);
5228 	}
5229 #endif /* CONFIG_FREEZE */
5230 
5231 	if (c_seg->c_state != C_ON_BAD_Q && !(C_SEG_IS_ONDISK(c_seg))) {
5232 		/*
5233 		 * C_SEG_IS_ONDISK == TRUE can occur when we're doing a
5234 		 * free of a compressed page (i.e. dst == NULL)
5235 		 */
5236 		OSAddAtomic64(-c_rounded_size, &compressor_bytes_used);
5237 	}
5238 	if (c_seg->c_busy_swapping) {
5239 		/*
5240 		 * bypass case for c_busy_swapping...
5241 		 * let the swapin/swapout paths deal with putting
5242 		 * the c_seg on the minor compaction queue if needed
5243 		 */
5244 		assert(c_seg->c_busy);
5245 		goto done;
5246 	}
5247 	assert(!c_seg->c_busy);
5248 
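	/*
	 * With the slot freed, see whether the segment itself needs attention:
	 * a now-empty in-core segment has its buffer pages depopulated and is
	 * queued for delayed minor compaction, a now-empty on-disk segment moves
	 * to the swapped-out-sparse list, and partially-empty segments are
	 * considered for minor compaction or the sparse list as appropriate.
	 */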
5249 	if (c_seg->c_state != C_IS_FILLING) {
5250 		if (c_seg->c_bytes_used == 0) {
5251 			if (!(C_SEG_IS_ONDISK(c_seg))) {
5252 				int     pages_populated;
5253 
5254 				pages_populated = (round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset))) / PAGE_SIZE;
5255 				c_seg->c_populated_offset = C_SEG_BYTES_TO_OFFSET(0);
5256 
5257 				if (pages_populated) {
5258 					assert(c_seg->c_state != C_ON_BAD_Q);
5259 					assert(c_seg->c_store.c_buffer != NULL);
5260 
5261 					C_SEG_BUSY(c_seg);
5262 					lck_mtx_unlock_always(&c_seg->c_lock);
5263 
5264 					kernel_memory_depopulate(
5265 						(vm_offset_t) c_seg->c_store.c_buffer,
5266 						ptoa(pages_populated),
5267 						KMA_COMPRESSOR, VM_KERN_MEMORY_COMPRESSOR);
5268 
5269 					lck_mtx_lock_spin_always(&c_seg->c_lock);
5270 					C_SEG_WAKEUP_DONE(c_seg);
5271 				}
5272 				if (!c_seg->c_on_minorcompact_q && c_seg->c_state != C_ON_SWAPIO_Q) {
5273 					if (c_seg->c_state == C_ON_SWAPOUT_Q) {
5274 						bool clear_busy = false;
5275 						if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
5276 							C_SEG_BUSY(c_seg);
5277 
5278 							lck_mtx_unlock_always(&c_seg->c_lock);
5279 							lck_mtx_lock_spin_always(c_list_lock);
5280 							lck_mtx_lock_spin_always(&c_seg->c_lock);
5281 							clear_busy = true;
5282 						}
5283 						c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
5284 						if (clear_busy) {
5285 							C_SEG_WAKEUP_DONE(c_seg);
5286 							clear_busy = false;
5287 						}
5288 						lck_mtx_unlock_always(c_list_lock);
5289 					}
5290 					c_seg_need_delayed_compaction(c_seg, FALSE);
5291 				}
5292 			} else {
5293 				if (c_seg->c_state != C_ON_SWAPPEDOUTSPARSE_Q) {
5294 					c_seg_move_to_sparse_list(c_seg);
5295 					consider_defragmenting = TRUE;
5296 				}
5297 			}
5298 		} else if (c_seg->c_on_minorcompact_q) {
5299 			assert(c_seg->c_state != C_ON_BAD_Q);
5300 			assert(!C_SEG_IS_ON_DISK_OR_SOQ(c_seg));
5301 
5302 			if (C_SEG_SHOULD_MINORCOMPACT_NOW(c_seg)) {
5303 				c_seg_try_minor_compaction_and_unlock(c_seg);
5304 				need_unlock = FALSE;
5305 			}
5306 		} else if (!(C_SEG_IS_ONDISK(c_seg))) {
5307 			if (c_seg->c_state != C_ON_BAD_Q && c_seg->c_state != C_ON_SWAPOUT_Q && c_seg->c_state != C_ON_SWAPIO_Q &&
5308 			    C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) {
5309 				c_seg_need_delayed_compaction(c_seg, FALSE);
5310 			}
5311 		} else if (c_seg->c_state != C_ON_SWAPPEDOUTSPARSE_Q && C_SEG_ONDISK_IS_SPARSE(c_seg)) {
5312 			c_seg_move_to_sparse_list(c_seg);
5313 			consider_defragmenting = TRUE;
5314 		}
5315 	}
5316 done:
5317 	if (__improbable(kdp_mode)) {
5318 		return retval;
5319 	}
5320 
5321 	if (need_unlock == TRUE) {
5322 		lck_mtx_unlock_always(&c_seg->c_lock);
5323 	}
5324 
5325 	PAGE_REPLACEMENT_DISALLOWED(FALSE);
5326 
5327 	if (consider_defragmenting == TRUE) {
5328 		vm_swap_consider_defragmenting(VM_SWAP_FLAGS_NONE);
5329 	}
5330 
5331 #if !XNU_TARGET_OS_OSX
5332 	if ((c_minor_count && COMPRESSOR_NEEDS_TO_MINOR_COMPACT()) || vm_compressor_needs_to_major_compact()) {
5333 		vm_wake_compactor_swapper();
5334 	}
5335 #endif /* !XNU_TARGET_OS_OSX */
5336 
5337 	return retval;
5338 }
5339 
5340 
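/*
 * Decompress the page described by '*slot' into the physical page 'pn'.
 * Single-value pages are materialized directly from the SV hash table;
 * everything else goes through c_decompress_page(). See the comment at the
 * end of this routine for the meaning of the return values.
 */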
5341 int
5342 vm_compressor_get(ppnum_t pn, int *slot, int flags)
5343 {
5344 	c_slot_mapping_t  slot_ptr;
5345 	char    *dst;
5346 	int     zeroslot = 1;
5347 	int     retval;
5348 
5349 	dst = pmap_map_compressor_page(pn);
5350 	slot_ptr = (c_slot_mapping_t)slot;
5351 
5352 	assert(dst != NULL);
5353 
5354 	if (slot_ptr->s_cseg == C_SV_CSEG_ID) {
5355 		int32_t         data;
5356 		int32_t         *dptr;
5357 
5358 		/*
5359 		 * page was populated with a single value
5360 		 * that found a home in our hash table
5361 		 * grab that value from the hash and use it to
5362 		 * populate the page we were asked to fill
5363 		 */
5364 		dptr = (int32_t *)(uintptr_t)dst;
5365 		data = c_segment_sv_hash_table[slot_ptr->s_cindx].he_data;
5366 		sv_decompress(dptr, data);
5367 		if (!(flags & C_KEEP)) {
5368 			c_segment_sv_hash_drop_ref(slot_ptr->s_cindx);
5369 
5370 			OSAddAtomic(-1, &c_segment_pages_compressed);
5371 			*slot = 0;
5372 		}
5373 		if (data) {
5374 			OSAddAtomic(1, &c_segment_svp_nonzero_decompressions);
5375 		} else {
5376 			OSAddAtomic(1, &c_segment_svp_zero_decompressions);
5377 		}
5378 
5379 		pmap_unmap_compressor_page(pn, dst);
5380 		return 0;
5381 	}
5382 
5383 	retval = c_decompress_page(dst, slot_ptr, flags, &zeroslot);
5384 
5385 	/*
5386 	 * zeroslot will be set to 0 by c_decompress_page if (flags & C_KEEP)
5387 	 * or (flags & C_DONT_BLOCK) and we found 'c_busy' or 'C_SEG_IS_ONDISK' to be TRUE
5388 	 */
5389 	if (zeroslot) {
5390 		*slot = 0;
5391 	}
5392 
5393 	pmap_unmap_compressor_page(pn, dst);
5394 
5395 	/*
5396 	 * returns 0 if we successfully decompressed a page from a segment already in memory
5397 	 * returns 1 if we had to first swap in the segment, before successfully decompressing the page
5398 	 * returns -1 if we encountered an error swapping in the segment - decompression failed
5399 	 * returns -2 if (flags & C_DONT_BLOCK) and we found 'c_busy' or 'C_SEG_IS_ONDISK' to be true
5400 	 */
5401 	return retval;
5402 }
5403 
5404 #if DEVELOPMENT || DEBUG
5405 
5406 void
5407 vm_compressor_inject_error(int *slot)
5408 {
5409 	c_slot_mapping_t slot_ptr = (c_slot_mapping_t)slot;
5410 
5411 	/* No error detection for single-value compression. */
5412 	if (slot_ptr->s_cseg == C_SV_CSEG_ID) {
5413 		printf("%s(): cannot inject errors in SV-compressed pages\n", __func__);
5414 		return;
5415 	}
5416 
5417 	/* s_cseg is actually "segno+1" */
5418 	const uint32_t c_segno = slot_ptr->s_cseg - 1;
5419 
5420 	assert(c_segno < c_segments_available);
5421 	assert(c_segments[c_segno].c_segno >= c_segments_available);
5422 
5423 	const c_segment_t c_seg = c_segments[c_segno].c_seg;
5424 
5425 	PAGE_REPLACEMENT_DISALLOWED(TRUE);
5426 
5427 	lck_mtx_lock_spin_always(&c_seg->c_lock);
5428 	assert(c_seg->c_state != C_IS_EMPTY && c_seg->c_state != C_IS_FREE);
5429 
5430 	const uint16_t c_indx = slot_ptr->s_cindx;
5431 	assert(c_indx < c_seg->c_nextslot);
5432 
5433 	/*
5434 	 * To safely make this segment temporarily writable, we need to mark
5435 	 * the segment busy, which allows us to release the segment lock.
5436 	 */
5437 	while (c_seg->c_busy) {
5438 		c_seg_wait_on_busy(c_seg);
5439 		lck_mtx_lock_spin_always(&c_seg->c_lock);
5440 	}
5441 	C_SEG_BUSY(c_seg);
5442 
5443 	bool already_writable = (c_seg->c_state == C_IS_FILLING);
5444 	if (!already_writable) {
5445 		/*
5446 		 * Protection update must be performed preemptibly, so temporarily drop
5447 		 * the lock. Having set c_busy will prevent most other concurrent
5448 		 * operations.
5449 		 */
5450 		lck_mtx_unlock_always(&c_seg->c_lock);
5451 		C_SEG_MAKE_WRITEABLE(c_seg);
5452 		lck_mtx_lock_spin_always(&c_seg->c_lock);
5453 	}
5454 
5455 	/*
5456 	 * Once we've released the lock following our c_state == C_IS_FILLING check,
5457 	 * c_current_seg_filled() can (re-)write-protect the segment. However, it
5458 	 * will transition from C_IS_FILLING before releasing the c_seg lock, so we
5459 	 * can detect this by re-checking after we've reobtained the lock.
5460 	 */
5461 	if (already_writable && c_seg->c_state != C_IS_FILLING) {
5462 		lck_mtx_unlock_always(&c_seg->c_lock);
5463 		C_SEG_MAKE_WRITEABLE(c_seg);
5464 		lck_mtx_lock_spin_always(&c_seg->c_lock);
5465 		already_writable = false;
5466 		/* Segment can't be freed while c_busy is set. */
5467 		assert(c_seg->c_state != C_IS_FILLING);
5468 	}
5469 
5470 	/*
5471 	 * Skip if the segment is on disk. This check can only be performed after
5472 	 * the final acquisition of the segment lock before we attempt to write to
5473 	 * the segment.
5474 	 */
5475 	if (!C_SEG_IS_ON_DISK_OR_SOQ(c_seg)) {
5476 		c_slot_t cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
5477 		int32_t *data = &c_seg->c_store.c_buffer[cs->c_offset];
5478 		/* assume that the compressed data holds at least one int32_t */
5479 		assert(UNPACK_C_SIZE(cs) > sizeof(*data));
5480 		/*
5481 		 * This bit is known to be in the payload of a MISS packet resulting from
5482 		 * the pattern used in the test pattern from decompression_failure.c.
5483 		 * Flipping it should result in many corrupted bits in the test page.
5484 		 */
5485 		data[0] ^= 0x00000100;
5486 	}
5487 
5488 	if (!already_writable) {
5489 		lck_mtx_unlock_always(&c_seg->c_lock);
5490 		C_SEG_WRITE_PROTECT(c_seg);
5491 		lck_mtx_lock_spin_always(&c_seg->c_lock);
5492 	}
5493 
5494 	C_SEG_WAKEUP_DONE(c_seg);
5495 	lck_mtx_unlock_always(&c_seg->c_lock);
5496 
5497 	PAGE_REPLACEMENT_DISALLOWED(FALSE);
5498 }
5499 
5500 #endif /* DEVELOPMENT || DEBUG */
5501 
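/*
 * Release the compressed copy of a page without decompressing it. SV-hash
 * slots just drop their hash-table reference; all other slots are freed via
 * c_decompress_page() with a NULL destination.
 */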
5502 int
5503 vm_compressor_free(int *slot, int flags)
5504 {
5505 	c_slot_mapping_t  slot_ptr;
5506 	int     zeroslot = 1;
5507 	int     retval;
5508 
5509 	assert(flags == 0 || flags == C_DONT_BLOCK);
5510 
5511 	slot_ptr = (c_slot_mapping_t)slot;
5512 
5513 	if (slot_ptr->s_cseg == C_SV_CSEG_ID) {
5514 		c_segment_sv_hash_drop_ref(slot_ptr->s_cindx);
5515 		OSAddAtomic(-1, &c_segment_pages_compressed);
5516 
5517 		*slot = 0;
5518 		return 0;
5519 	}
5520 	retval = c_decompress_page(NULL, slot_ptr, flags, &zeroslot);
5521 	/*
5522 	 * returns 0 if we successfully freed the specified compressed page
5523 	 * returns -1 if we encountered an error swapping in the segment - decompression failed
5524 	 * returns -2 if (flags & C_DONT_BLOCK) and we found 'c_busy' set
5525 	 */
5526 
5527 	if (retval == 0) {
5528 		*slot = 0;
5529 	}
5530 
5531 	return retval;
5532 }
5533 
5534 
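/*
 * Compress the contents of physical page 'pn' into the segment currently
 * being filled for this compression context ('current_chead'), recording the
 * location of the compressed data in '*slot'. 'scratch_buf' supplies
 * per-caller scratch space for the codec.
 */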
5535 int
5536 vm_compressor_put(ppnum_t pn, int *slot, void **current_chead, char *scratch_buf)
5537 {
5538 	char    *src;
5539 	int     retval;
5540 
5541 	src = pmap_map_compressor_page(pn);
5542 	assert(src != NULL);
5543 
5544 	retval = c_compress_page(src, (c_slot_mapping_t)slot, (c_segment_t *)current_chead, scratch_buf);
5545 	pmap_unmap_compressor_page(pn, src);
5546 
5547 	return retval;
5548 }
5549 
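/*
 * Move ownership of a compressed slot from '*src_slot_p' to '*dst_slot_p'
 * without touching the compressed data: the c_slot's packed back-pointer is
 * re-aimed at the destination mapping and the mapping value itself is copied
 * over. SV-hash slots carry no back-pointer, so they are simply copied.
 */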
5550 void
5551 vm_compressor_transfer(
5552 	int     *dst_slot_p,
5553 	int     *src_slot_p)
5554 {
5555 	c_slot_mapping_t        dst_slot, src_slot;
5556 	c_segment_t             c_seg;
5557 	uint16_t                c_indx;
5558 	c_slot_t                cs;
5559 
5560 	src_slot = (c_slot_mapping_t) src_slot_p;
5561 
5562 	if (src_slot->s_cseg == C_SV_CSEG_ID) {
5563 		*dst_slot_p = *src_slot_p;
5564 		*src_slot_p = 0;
5565 		return;
5566 	}
5567 	dst_slot = (c_slot_mapping_t) dst_slot_p;
5568 Retry:
5569 	PAGE_REPLACEMENT_DISALLOWED(TRUE);
5570 	/* get segment for src_slot */
5571 	c_seg = c_segments[src_slot->s_cseg - 1].c_seg;
5572 	/* lock segment */
5573 	lck_mtx_lock_spin_always(&c_seg->c_lock);
5574 	/* wait if it's busy */
5575 	if (c_seg->c_busy && !c_seg->c_busy_swapping) {
5576 		PAGE_REPLACEMENT_DISALLOWED(FALSE);
5577 		c_seg_wait_on_busy(c_seg);
5578 		goto Retry;
5579 	}
5580 	/* find the c_slot */
5581 	c_indx = src_slot->s_cindx;
5582 	cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
5583 	/* point the c_slot back to dst_slot instead of src_slot */
5584 	C_SLOT_ASSERT_PACKABLE(dst_slot);
5585 	cs->c_packed_ptr = C_SLOT_PACK_PTR(dst_slot);
5586 	/* transfer */
5587 	*dst_slot_p = *src_slot_p;
5588 	*src_slot_p = 0;
5589 	lck_mtx_unlock_always(&c_seg->c_lock);
5590 	PAGE_REPLACEMENT_DISALLOWED(FALSE);
5591 }
5592 
5593 #if defined(__arm64__)
5594 extern clock_sec_t             vm_swapfile_last_failed_to_create_ts;
5595 __attribute__((noreturn))
5596 void
5597 vm_panic_hibernate_write_image_failed(int err)
5598 {
5599 	panic("hibernate_write_image encountered error 0x%x - %u, %u, %d, %d, %d, %d, %d, %d, %d, %d, %llu, %d, %d, %d\n",
5600 	    err,
5601 	    VM_PAGE_COMPRESSOR_COUNT, vm_page_wire_count,
5602 	    c_age_count, c_major_count, c_minor_count, (c_early_swapout_count + c_regular_swapout_count + c_late_swapout_count), c_swappedout_sparse_count,
5603 	    vm_num_swap_files, vm_num_pinned_swap_files, vm_swappin_enabled, vm_swap_put_failures,
5604 	    (vm_swapfile_last_failed_to_create_ts ? 1:0), hibernate_no_swapspace, hibernate_flush_timed_out);
5605 }
5606 #endif /*(__arm64__)*/
5607 
5608 #if CONFIG_FREEZE
5609 
5610 int     freezer_finished_filling = 0;
5611 
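/*
 * Close out the c_seg this freezer context has been filling (if any),
 * pushing it through c_current_seg_filled() so it leaves the C_IS_FILLING
 * state.
 */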
5612 void
5613 vm_compressor_finished_filling(
5614 	void    **current_chead)
5615 {
5616 	c_segment_t     c_seg;
5617 
5618 	if ((c_seg = *(c_segment_t *)current_chead) == NULL) {
5619 		return;
5620 	}
5621 
5622 	assert(c_seg->c_state == C_IS_FILLING);
5623 
5624 	lck_mtx_lock_spin_always(&c_seg->c_lock);
5625 
5626 	c_current_seg_filled(c_seg, (c_segment_t *)current_chead);
5627 
5628 	lck_mtx_unlock_always(&c_seg->c_lock);
5629 
5630 	freezer_finished_filling++;
5631 }
5632 
5633 
5634 /*
5635  * This routine is used to transfer the compressed chunks from
5636  * the c_seg/cindx pointed to by slot_p into a new c_seg headed
5637  * by the current_chead and a new cindx within that c_seg.
5638  *
5639  * Currently, this routine is only used by the "freezer backed by
5640  * compressor with swap" mode to create a series of c_segs that
5641  * only contain compressed data belonging to one task. So, we
5642  * move a task's previously compressed data into a set of new
5643  * c_segs which will also hold the task's yet to be compressed data.
5644  */
5645 
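/*
 * A minimal sketch of the intended calling pattern (illustrative only; the
 * loop and local names below are hypothetical and the real driver lives in
 * the freezer / task-freeze path):
 *
 *	void *chead = NULL;
 *	for (each compressed slot 'slot_p' owned by the task) {
 *		if (vm_compressor_relocate(&chead, slot_p) != KERN_SUCCESS)
 *			break;		// e.g. KERN_RESOURCE_SHORTAGE
 *	}
 *	vm_compressor_finished_filling(&chead);
 */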
5646 kern_return_t
5647 vm_compressor_relocate(
5648 	void            **current_chead,
5649 	int             *slot_p)
5650 {
5651 	c_slot_mapping_t        slot_ptr;
5652 	c_slot_mapping_t        src_slot;
5653 	uint32_t                c_rounded_size;
5654 	uint32_t                c_size;
5655 	uint16_t                dst_slot;
5656 	c_slot_t                c_dst;
5657 	c_slot_t                c_src;
5658 	uint16_t                c_indx;
5659 	c_segment_t             c_seg_dst = NULL;
5660 	c_segment_t             c_seg_src = NULL;
5661 	kern_return_t           kr = KERN_SUCCESS;
5662 
5663 
5664 	src_slot = (c_slot_mapping_t) slot_p;
5665 
5666 	if (src_slot->s_cseg == C_SV_CSEG_ID) {
5667 		/*
5668 		 * no need to relocate... this is a page full of a single
5669 		 * value which is hashed to a single entry not contained
5670 		 * in a c_segment_t
5671 		 */
5672 		return kr;
5673 	}
5674 
5675 Relookup_dst:
5676 	c_seg_dst = c_seg_allocate((c_segment_t *)current_chead);
5677 	/*
5678 	 * returns with c_seg lock held
5679 	 * and PAGE_REPLACEMENT_DISALLOWED(TRUE)...
5680 	 * c_nextslot has been allocated and
5681 	 * c_store.c_buffer populated
5682 	 */
5683 	if (c_seg_dst == NULL) {
5684 		/*
5685 		 * Out of compression segments?
5686 		 */
5687 		kr = KERN_RESOURCE_SHORTAGE;
5688 		goto out;
5689 	}
5690 
5691 	assert(c_seg_dst->c_busy == 0);
5692 
5693 	C_SEG_BUSY(c_seg_dst);
5694 
5695 	dst_slot = c_seg_dst->c_nextslot;
5696 
5697 	lck_mtx_unlock_always(&c_seg_dst->c_lock);
5698 
5699 Relookup_src:
5700 	c_seg_src = c_segments[src_slot->s_cseg - 1].c_seg;
5701 
5702 	assert(c_seg_dst != c_seg_src);
5703 
5704 	lck_mtx_lock_spin_always(&c_seg_src->c_lock);
5705 
5706 	if (C_SEG_IS_ON_DISK_OR_SOQ(c_seg_src) ||
5707 	    c_seg_src->c_state == C_IS_FILLING) {
5708 		/*
5709 		 * Skip this page if:
5710 		 * a) the src c_seg is already on-disk (or on its way there)
5711 		 *    A "thaw" can mark a process as eligible for
5712 		 * another freeze cycle without bringing any of
5713 		 * its swapped out c_segs back from disk (because
5714 		 * that is done on-demand).
5715 		 *    Or, this page may be mapped elsewhere in the task's map,
5716 		 * and we may have marked it for swap already.
5717 		 *
5718 		 * b) Or, the src c_seg is being filled by the compressor
5719 		 * thread. We don't want the added latency of waiting for
5720 		 * this c_seg in the freeze path and so we skip it.
5721 		 */
5722 
5723 		PAGE_REPLACEMENT_DISALLOWED(FALSE);
5724 
5725 		lck_mtx_unlock_always(&c_seg_src->c_lock);
5726 
5727 		c_seg_src = NULL;
5728 
5729 		goto out;
5730 	}
5731 
5732 	if (c_seg_src->c_busy) {
5733 		PAGE_REPLACEMENT_DISALLOWED(FALSE);
5734 		c_seg_wait_on_busy(c_seg_src);
5735 
5736 		c_seg_src = NULL;
5737 
5738 		PAGE_REPLACEMENT_DISALLOWED(TRUE);
5739 
5740 		goto Relookup_src;
5741 	}
5742 
5743 	C_SEG_BUSY(c_seg_src);
5744 
5745 	lck_mtx_unlock_always(&c_seg_src->c_lock);
5746 
5747 	PAGE_REPLACEMENT_DISALLOWED(FALSE);
5748 
5749 	/* find the c_slot */
5750 	c_indx = src_slot->s_cindx;
5751 
5752 	c_src = C_SEG_SLOT_FROM_INDEX(c_seg_src, c_indx);
5753 
5754 	c_size = UNPACK_C_SIZE(c_src);
5755 
5756 	assert(c_size);
5757 
5758 	if (c_size > (uint32_t)(c_seg_bufsize - C_SEG_OFFSET_TO_BYTES((int32_t)c_seg_dst->c_nextoffset))) {
5759 		/*
5760 		 * This segment is full. We need a new one.
5761 		 */
5762 
5763 		PAGE_REPLACEMENT_DISALLOWED(TRUE);
5764 
5765 		lck_mtx_lock_spin_always(&c_seg_src->c_lock);
5766 		C_SEG_WAKEUP_DONE(c_seg_src);
5767 		lck_mtx_unlock_always(&c_seg_src->c_lock);
5768 
5769 		c_seg_src = NULL;
5770 
5771 		lck_mtx_lock_spin_always(&c_seg_dst->c_lock);
5772 
5773 		assert(c_seg_dst->c_busy);
5774 		assert(c_seg_dst->c_state == C_IS_FILLING);
5775 		assert(!c_seg_dst->c_on_minorcompact_q);
5776 
5777 		c_current_seg_filled(c_seg_dst, (c_segment_t *)current_chead);
5778 		assert(*current_chead == NULL);
5779 
5780 		C_SEG_WAKEUP_DONE(c_seg_dst);
5781 
5782 		lck_mtx_unlock_always(&c_seg_dst->c_lock);
5783 
5784 		c_seg_dst = NULL;
5785 
5786 		PAGE_REPLACEMENT_DISALLOWED(FALSE);
5787 
5788 		goto Relookup_dst;
5789 	}
5790 
5791 	c_dst = C_SEG_SLOT_FROM_INDEX(c_seg_dst, c_seg_dst->c_nextslot);
5792 
5793 	memcpy(&c_seg_dst->c_store.c_buffer[c_seg_dst->c_nextoffset], &c_seg_src->c_store.c_buffer[c_src->c_offset], c_size);
5794 	/*
5795 	 * Is platform alignment actually necessary since wkdm aligns its output?
5796 	 */
5797 	c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;
5798 
5799 	cslot_copy(c_dst, c_src);
5800 	c_dst->c_offset = c_seg_dst->c_nextoffset;
5801 
5802 	if (c_seg_dst->c_firstemptyslot == c_seg_dst->c_nextslot) {
5803 		c_seg_dst->c_firstemptyslot++;
5804 	}
5805 
5806 	c_seg_dst->c_slots_used++;
5807 	c_seg_dst->c_nextslot++;
5808 	c_seg_dst->c_bytes_used += c_rounded_size;
5809 	c_seg_dst->c_nextoffset += C_SEG_BYTES_TO_OFFSET(c_rounded_size);
5810 
5811 
5812 	PACK_C_SIZE(c_src, 0);
5813 
5814 	c_seg_src->c_bytes_used -= c_rounded_size;
5815 	c_seg_src->c_bytes_unused += c_rounded_size;
5816 
5817 	assert(c_seg_src->c_slots_used);
5818 	c_seg_src->c_slots_used--;
5819 
5820 	if (!c_seg_src->c_swappedin) {
5821 		/* Pessimistically lose swappedin status when non-swappedin pages are added. */
5822 		c_seg_dst->c_swappedin = false;
5823 	}
5824 
5825 	if (c_indx < c_seg_src->c_firstemptyslot) {
5826 		c_seg_src->c_firstemptyslot = c_indx;
5827 	}
5828 
5829 	c_dst = C_SEG_SLOT_FROM_INDEX(c_seg_dst, dst_slot);
5830 
5831 	PAGE_REPLACEMENT_ALLOWED(TRUE);
5832 	slot_ptr = C_SLOT_UNPACK_PTR(c_dst);
5833 	/* <csegno=0,indx=0> would mean "empty slot", so use csegno+1 */
5834 	slot_ptr->s_cseg = c_seg_dst->c_mysegno + 1;
5835 	slot_ptr->s_cindx = dst_slot;
5836 
5837 	PAGE_REPLACEMENT_ALLOWED(FALSE);
5838 
5839 out:
5840 	if (c_seg_src) {
5841 		lck_mtx_lock_spin_always(&c_seg_src->c_lock);
5842 
5843 		C_SEG_WAKEUP_DONE(c_seg_src);
5844 
5845 		if (c_seg_src->c_bytes_used == 0 && c_seg_src->c_state != C_IS_FILLING) {
5846 			if (!c_seg_src->c_on_minorcompact_q) {
5847 				c_seg_need_delayed_compaction(c_seg_src, FALSE);
5848 			}
5849 		}
5850 
5851 		lck_mtx_unlock_always(&c_seg_src->c_lock);
5852 	}
5853 
5854 	if (c_seg_dst) {
5855 		PAGE_REPLACEMENT_DISALLOWED(TRUE);
5856 
5857 		lck_mtx_lock_spin_always(&c_seg_dst->c_lock);
5858 
5859 		if (c_seg_dst->c_nextoffset >= c_seg_off_limit || c_seg_dst->c_nextslot >= C_SLOT_MAX_INDEX) {
5860 			/*
5861 			 * Nearing or exceeded maximum slot and offset capacity.
5862 			 */
5863 			assert(c_seg_dst->c_busy);
5864 			assert(c_seg_dst->c_state == C_IS_FILLING);
5865 			assert(!c_seg_dst->c_on_minorcompact_q);
5866 
5867 			c_current_seg_filled(c_seg_dst, (c_segment_t *)current_chead);
5868 			assert(*current_chead == NULL);
5869 		}
5870 
5871 		C_SEG_WAKEUP_DONE(c_seg_dst);
5872 
5873 		lck_mtx_unlock_always(&c_seg_dst->c_lock);
5874 
5875 		c_seg_dst = NULL;
5876 
5877 		PAGE_REPLACEMENT_DISALLOWED(FALSE);
5878 	}
5879 
5880 	return kr;
5881 }
5882 #endif /* CONFIG_FREEZE */
5883