xref: /xnu-8792.81.2/osfmk/vm/vm_compressor.c (revision 19c3b8c28c31cb8130e034cfb5df6bf9ba342d90)
1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <vm/vm_compressor.h>
30 
31 #if CONFIG_PHANTOM_CACHE
32 #include <vm/vm_phantom_cache.h>
33 #endif
34 
35 #include <vm/vm_map.h>
36 #include <vm/vm_pageout.h>
37 #include <vm/memory_object.h>
38 #include <vm/vm_compressor_algorithms.h>
39 #include <vm/vm_compressor_backing_store.h>
40 #include <vm/vm_fault.h>
41 #include <vm/vm_protos.h>
42 #include <mach/mach_host.h>             /* for host_info() */
43 #if DEVELOPMENT || DEBUG
44 #include <kern/hvg_hypercall.h>
45 #endif
46 #include <kern/ledger.h>
47 #include <kern/policy_internal.h>
48 #include <kern/thread_group.h>
49 #include <san/kasan.h>
50 #include <os/log.h>
51 #include <pexpert/pexpert.h>
52 #include <pexpert/device_tree.h>
53 
54 #if defined(__x86_64__)
55 #include <i386/misc_protos.h>
56 #endif
57 #if defined(__arm64__)
58 #include <arm/machine_routines.h>
59 #endif
60 
61 #include <IOKit/IOHibernatePrivate.h>
62 
63 /*
64  * The segment buffer size is a tradeoff.
65  * A larger buffer leads to faster I/O throughput, better compression ratios
66  * (since fewer bytes are wasted at the end of the segment),
67  * and less overhead (both in time and space).
68  * However, a smaller buffer causes less swap when the system is overcommitted
69  * b/c a higher percentage of the swapped-in segment is definitely accessed
70  * before it goes back out to storage.
71  *
72  * So on systems without swap, a larger segment is a clear win.
73  * On systems with swap, the choice is murkier. Empirically, we've
74  * found that a 64KB segment provides a better tradeoff both in terms of
75  * performance and swap writes than a 256KB segment on systems with fast SSDs
76  * and a HW compression block.
77  */
78 #define C_SEG_BUFSIZE_ARM_SWAP (1024 * 64)
79 #if XNU_TARGET_OS_OSX && defined(__arm64__)
80 #define C_SEG_BUFSIZE_DEFAULT C_SEG_BUFSIZE_ARM_SWAP
81 #else
82 #define C_SEG_BUFSIZE_DEFAULT (1024 * 256)
83 #endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */
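/*
 * Illustrative sizing (added comment, not from the original source): with
 * 16KB pages, a 64KB segment buffer spans c_seg_max_pages = 64KB / 16KB = 4
 * buffer pages, and at a typical ~2:1 compression ratio holds on the order
 * of 8 compressed pages before filling. The 256KB default spans 16 buffer
 * pages (64 on 4KB-page systems), amortizing per-segment overhead over
 * correspondingly more compressed pages.
 */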
84 uint32_t c_seg_bufsize;
85 
86 uint32_t c_seg_max_pages, c_seg_off_limit, c_seg_allocsize, c_seg_slot_var_array_min_len;
87 
88 extern boolean_t vm_darkwake_mode;
89 extern zone_t vm_page_zone;
90 
91 #if DEVELOPMENT || DEBUG
92 /* sysctl defined in bsd/dev/arm64/sysctl.c */
93 int do_cseg_wedge_thread(void);
94 int do_cseg_unwedge_thread(void);
95 static event_t debug_cseg_wait_event = NULL;
96 #endif /* DEVELOPMENT || DEBUG */
97 
98 #if CONFIG_FREEZE
99 bool freezer_incore_cseg_acct = TRUE; /* Only count incore compressed memory for jetsams. */
100 void task_disown_frozen_csegs(task_t owner_task);
101 #endif /* CONFIG_FREEZE */
102 
103 #if POPCOUNT_THE_COMPRESSED_DATA
104 boolean_t popcount_c_segs = TRUE;
105 
106 static inline uint32_t
107 vmc_pop(uintptr_t ins, int sz)
108 {
109 	uint32_t rv = 0;
110 
111 	if (__probable(popcount_c_segs == FALSE)) {
112 		return 0xDEAD707C;
113 	}
114 
115 	while (sz >= 16) {
116 		uint32_t rv1, rv2;
117 		uint64_t *ins64 = (uint64_t *) ins;
118 		uint64_t *ins642 = (uint64_t *) (ins + 8);
119 		rv1 = __builtin_popcountll(*ins64);
120 		rv2 = __builtin_popcountll(*ins642);
121 		rv += rv1 + rv2;
122 		sz -= 16;
123 		ins += 16;
124 	}
125 
126 	while (sz >= 4) {
127 		uint32_t *ins32 = (uint32_t *) ins;
128 		rv += __builtin_popcount(*ins32);
129 		sz -= 4;
130 		ins += 4;
131 	}
132 
133 	while (sz > 0) {
134 		char *ins8 = (char *)ins;
135 		rv += __builtin_popcount(*ins8);
136 		sz--;
137 		ins++;
138 	}
139 	return rv;
140 }
141 #endif
142 
143 #if VALIDATE_C_SEGMENTS
144 boolean_t validate_c_segs = TRUE;
145 #endif
146 /*
147  * vm_compressor_mode has a hierarchy of control to set its value.
148  * boot-args are checked first, then device-tree, and finally
149  * the default value that is defined below. See vm_fault_init() for
150  * the boot-arg & device-tree code.
151  */
152 
153 #if !XNU_TARGET_OS_OSX
154 
155 #if CONFIG_FREEZE
156 int     vm_compressor_mode = VM_PAGER_FREEZER_DEFAULT;
157 struct  freezer_context freezer_context_global;
158 #else /* CONFIG_FREEZE */
159 int     vm_compressor_mode = VM_PAGER_NOT_CONFIGURED;
160 #endif /* CONFIG_FREEZE */
161 
162 #else /* !XNU_TARGET_OS_OSX */
163 int             vm_compressor_mode = VM_PAGER_COMPRESSOR_WITH_SWAP;
164 
165 #endif /* !XNU_TARGET_OS_OSX */
166 
167 TUNABLE(uint32_t, vm_compression_limit, "vm_compression_limit", 0);
168 int             vm_compressor_is_active = 0;
169 int             vm_compressor_available = 0;
170 
171 extern uint64_t vm_swap_get_max_configured_space(void);
172 extern void     vm_pageout_io_throttle(void);
173 bool vm_compressor_swapout_is_ripe(void);
174 
175 #if CHECKSUM_THE_DATA || CHECKSUM_THE_SWAP || CHECKSUM_THE_COMPRESSED_DATA
176 extern unsigned int hash_string(char *cp, int len);
177 static unsigned int vmc_hash(char *, int);
178 boolean_t checksum_c_segs = TRUE;
179 
180 unsigned int
181 vmc_hash(char *cp, int len)
182 {
183 	if (__probable(checksum_c_segs == FALSE)) {
184 		return 0xDEAD7A37;
185 	}
186 	return hash_string(cp, len);
187 }
188 #endif
189 
190 #define UNPACK_C_SIZE(cs)       ((cs->c_size == (PAGE_SIZE-1)) ? PAGE_SIZE : cs->c_size)
191 #define PACK_C_SIZE(cs, size)   (cs->c_size = ((size == PAGE_SIZE) ? PAGE_SIZE - 1 : size))
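/*
 * Illustrative example (added, not in the original): the slot's c_size field
 * is too narrow to hold the value PAGE_SIZE itself, so an incompressible page
 * stored at full size is encoded with the sentinel PAGE_SIZE - 1:
 *
 *	PACK_C_SIZE(cs, PAGE_SIZE);	// stores PAGE_SIZE - 1 in cs->c_size
 *	UNPACK_C_SIZE(cs);		// yields PAGE_SIZE again
 */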
192 
193 
194 struct c_sv_hash_entry {
195 	union {
196 		struct  {
197 			uint32_t        c_sv_he_ref;
198 			uint32_t        c_sv_he_data;
199 		} c_sv_he;
200 		uint64_t        c_sv_he_record;
201 	} c_sv_he_un;
202 };
203 
204 #define he_ref  c_sv_he_un.c_sv_he.c_sv_he_ref
205 #define he_data c_sv_he_un.c_sv_he.c_sv_he_data
206 #define he_record c_sv_he_un.c_sv_he_record
207 
208 #define C_SV_HASH_MAX_MISS      32
209 #define C_SV_HASH_SIZE          ((1 << 10))
210 #define C_SV_HASH_MASK          ((1 << 10) - 1)
211 #define C_SV_CSEG_ID            ((1 << 22) - 1)
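/*
 * Context (added comment; an assumption based on how these constants are used
 * later in this file): pages whose contents are a single repeated 32-bit value
 * (e.g. zero-filled pages) are not written into a segment at all. The value is
 * refcounted in the C_SV_HASH_SIZE-entry c_segment_sv_hash_table (he_ref /
 * he_data), and the owning slot records the reserved segment number
 * C_SV_CSEG_ID instead of a real c_seg. C_SV_HASH_MAX_MISS bounds the probe
 * length before falling back to normal compression.
 */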
212 
213 
214 union c_segu {
215 	c_segment_t     c_seg;
216 	uintptr_t       c_segno;
217 };
218 
219 #define C_SLOT_ASSERT_PACKABLE(ptr) \
220 	VM_ASSERT_POINTER_PACKABLE((vm_offset_t)(ptr), C_SLOT_PACKED_PTR);
221 
222 #define C_SLOT_PACK_PTR(ptr) \
223 	VM_PACK_POINTER((vm_offset_t)(ptr), C_SLOT_PACKED_PTR)
224 
225 #define C_SLOT_UNPACK_PTR(cslot) \
226 	(c_slot_mapping_t)VM_UNPACK_POINTER((cslot)->c_packed_ptr, C_SLOT_PACKED_PTR)
227 
228 /* for debugging purposes */
229 SECURITY_READ_ONLY_EARLY(vm_packing_params_t) c_slot_packing_params =
230     VM_PACKING_PARAMS(C_SLOT_PACKED_PTR);
231 
232 uint32_t        c_segment_count = 0;
233 uint32_t        c_segment_count_max = 0;
234 
235 uint64_t        c_generation_id = 0;
236 uint64_t        c_generation_id_flush_barrier;
237 
238 
239 #define         HIBERNATE_FLUSHING_SECS_TO_COMPLETE     120
240 
241 boolean_t       hibernate_no_swapspace = FALSE;
242 boolean_t       hibernate_flush_timed_out = FALSE;
243 clock_sec_t     hibernate_flushing_deadline = 0;
244 
245 #if RECORD_THE_COMPRESSED_DATA
246 char    *c_compressed_record_sbuf;
247 char    *c_compressed_record_ebuf;
248 char    *c_compressed_record_cptr;
249 #endif
250 
251 
252 queue_head_t    c_age_list_head;
253 queue_head_t    c_early_swappedin_list_head, c_regular_swappedin_list_head, c_late_swappedin_list_head;
254 queue_head_t    c_early_swapout_list_head, c_regular_swapout_list_head, c_late_swapout_list_head;
255 queue_head_t    c_swapio_list_head;
256 queue_head_t    c_swappedout_list_head;
257 queue_head_t    c_swappedout_sparse_list_head;
258 queue_head_t    c_major_list_head;
259 queue_head_t    c_filling_list_head;
260 queue_head_t    c_bad_list_head;
261 
262 uint32_t        c_age_count = 0;
263 uint32_t        c_early_swappedin_count = 0, c_regular_swappedin_count = 0, c_late_swappedin_count = 0;
264 uint32_t        c_early_swapout_count = 0, c_regular_swapout_count = 0, c_late_swapout_count = 0;
265 uint32_t        c_swapio_count = 0;
266 uint32_t        c_swappedout_count = 0;
267 uint32_t        c_swappedout_sparse_count = 0;
268 uint32_t        c_major_count = 0;
269 uint32_t        c_filling_count = 0;
270 uint32_t        c_empty_count = 0;
271 uint32_t        c_bad_count = 0;
272 
273 
274 queue_head_t    c_minor_list_head;
275 uint32_t        c_minor_count = 0;
276 
277 int             c_overage_swapped_count = 0;
278 int             c_overage_swapped_limit = 0;
279 
280 int             c_seg_fixed_array_len;
281 union  c_segu   *c_segments;
282 vm_offset_t     c_buffers;
283 vm_size_t       c_buffers_size;
284 caddr_t         c_segments_next_page;
285 boolean_t       c_segments_busy;
286 uint32_t        c_segments_available;
287 uint32_t        c_segments_limit;
288 uint32_t        c_segments_nearing_limit;
289 
290 uint32_t        c_segment_svp_in_hash;
291 uint32_t        c_segment_svp_hash_succeeded;
292 uint32_t        c_segment_svp_hash_failed;
293 uint32_t        c_segment_svp_zero_compressions;
294 uint32_t        c_segment_svp_nonzero_compressions;
295 uint32_t        c_segment_svp_zero_decompressions;
296 uint32_t        c_segment_svp_nonzero_decompressions;
297 
298 uint32_t        c_segment_noncompressible_pages;
299 
300 uint32_t        c_segment_pages_compressed = 0; /* Tracks # of uncompressed pages fed into the compressor */
301 #if CONFIG_FREEZE
302 int32_t         c_segment_pages_compressed_incore = 0; /* Tracks # of uncompressed pages fed into the compressor that are in memory */
303 int32_t         c_segment_pages_compressed_incore_late_swapout = 0; /* Tracks # of uncompressed pages fed into the compressor that are in memory and tagged for swapout */
304 uint32_t        c_segments_incore_limit = 0; /* Tracks # of segments allowed to be in-core. Based on compressor pool size */
305 #endif /* CONFIG_FREEZE */
306 
307 uint32_t        c_segment_pages_compressed_limit;
308 uint32_t        c_segment_pages_compressed_nearing_limit;
309 uint32_t        c_free_segno_head = (uint32_t)-1;
310 
311 uint32_t        vm_compressor_minorcompact_threshold_divisor = 10;
312 uint32_t        vm_compressor_majorcompact_threshold_divisor = 10;
313 uint32_t        vm_compressor_unthrottle_threshold_divisor = 10;
314 uint32_t        vm_compressor_catchup_threshold_divisor = 10;
315 
316 uint32_t        vm_compressor_minorcompact_threshold_divisor_overridden = 0;
317 uint32_t        vm_compressor_majorcompact_threshold_divisor_overridden = 0;
318 uint32_t        vm_compressor_unthrottle_threshold_divisor_overridden = 0;
319 uint32_t        vm_compressor_catchup_threshold_divisor_overridden = 0;
320 
321 #define         C_SEGMENTS_PER_PAGE     (PAGE_SIZE / sizeof(union c_segu))
322 
323 LCK_GRP_DECLARE(vm_compressor_lck_grp, "vm_compressor");
324 LCK_RW_DECLARE(c_master_lock, &vm_compressor_lck_grp);
325 LCK_MTX_DECLARE(c_list_lock_storage, &vm_compressor_lck_grp);
326 
327 boolean_t       decompressions_blocked = FALSE;
328 
329 zone_t          compressor_segment_zone;
330 int             c_compressor_swap_trigger = 0;
331 
332 uint32_t        compressor_cpus;
333 char            *compressor_scratch_bufs;
334 char            *kdp_compressor_scratch_buf;
335 char            *kdp_compressor_decompressed_page;
336 addr64_t        kdp_compressor_decompressed_page_paddr;
337 ppnum_t         kdp_compressor_decompressed_page_ppnum;
338 
339 clock_sec_t     start_of_sample_period_sec = 0;
340 clock_nsec_t    start_of_sample_period_nsec = 0;
341 clock_sec_t     start_of_eval_period_sec = 0;
342 clock_nsec_t    start_of_eval_period_nsec = 0;
343 uint32_t        sample_period_decompression_count = 0;
344 uint32_t        sample_period_compression_count = 0;
345 uint32_t        last_eval_decompression_count = 0;
346 uint32_t        last_eval_compression_count = 0;
347 
348 #define         DECOMPRESSION_SAMPLE_MAX_AGE            (60 * 30)
349 
350 boolean_t       vm_swapout_ripe_segments = FALSE;
351 uint32_t        vm_ripe_target_age = (60 * 60 * 48);
352 
353 uint32_t        swapout_target_age = 0;
354 uint32_t        age_of_decompressions_during_sample_period[DECOMPRESSION_SAMPLE_MAX_AGE];
355 uint32_t        overage_decompressions_during_sample_period = 0;
356 
357 
358 void            do_fastwake_warmup(queue_head_t *, boolean_t);
359 boolean_t       fastwake_warmup = FALSE;
360 boolean_t       fastwake_recording_in_progress = FALSE;
361 clock_sec_t     dont_trim_until_ts = 0;
362 
363 uint64_t        c_segment_warmup_count;
364 uint64_t        first_c_segment_to_warm_generation_id = 0;
365 uint64_t        last_c_segment_to_warm_generation_id = 0;
366 boolean_t       hibernate_flushing = FALSE;
367 
368 int64_t         c_segment_input_bytes __attribute__((aligned(8))) = 0;
369 int64_t         c_segment_compressed_bytes __attribute__((aligned(8))) = 0;
370 int64_t         compressor_bytes_used __attribute__((aligned(8))) = 0;
371 
372 /* Keeps track of the most recent timestamp for when major compaction finished. */
373 mach_timespec_t major_compact_ts;
374 
375 struct c_sv_hash_entry c_segment_sv_hash_table[C_SV_HASH_SIZE]  __attribute__ ((aligned(8)));
376 
377 static void vm_compressor_swap_trigger_thread(void);
378 static void vm_compressor_do_delayed_compactions(boolean_t);
379 static void vm_compressor_compact_and_swap(boolean_t);
380 static void vm_compressor_process_regular_swapped_in_segments(boolean_t);
381 void vm_compressor_process_special_swapped_in_segments(void);
382 static void vm_compressor_process_special_swapped_in_segments_locked(void);
383 
384 struct vm_compressor_swapper_stats vmcs_stats;
385 
386 #if XNU_TARGET_OS_OSX
387 #if (__arm64__)
388 static void vm_compressor_process_major_segments(void);
389 #endif /* (__arm64__) */
390 static void vm_compressor_take_paging_space_action(void);
391 #endif /* XNU_TARGET_OS_OSX */
392 
393 void compute_swapout_target_age(void);
394 
395 boolean_t c_seg_major_compact(c_segment_t, c_segment_t);
396 boolean_t c_seg_major_compact_ok(c_segment_t, c_segment_t);
397 
398 int  c_seg_minor_compaction_and_unlock(c_segment_t, boolean_t);
399 int  c_seg_do_minor_compaction_and_unlock(c_segment_t, boolean_t, boolean_t, boolean_t);
400 void c_seg_try_minor_compaction_and_unlock(c_segment_t c_seg);
401 
402 void c_seg_move_to_sparse_list(c_segment_t);
403 void c_seg_insert_into_q(queue_head_t *, c_segment_t);
404 
405 uint64_t vm_available_memory(void);
406 uint64_t vm_compressor_pages_compressed(void);
407 uint32_t vm_compressor_pool_size(void);
408 uint32_t vm_compressor_fragmentation_level(void);
409 uint32_t vm_compression_ratio(void);
410 
411 /*
412  * indicate the need to do a major compaction if
413  * the overall set of in-use compression segments
414  * becomes sparse... on systems that support pressure
415  * driven swapping, this will also cause swapouts to
416  * be initiated.
417  */
418 static inline bool
419 vm_compressor_needs_to_major_compact()
420 {
421 	uint32_t        incore_seg_count;
422 
423 	incore_seg_count = c_segment_count - c_swappedout_count - c_swappedout_sparse_count;
424 
425 	if ((c_segment_count >= (c_segments_nearing_limit / 8)) &&
426 	    ((incore_seg_count * c_seg_max_pages) - VM_PAGE_COMPRESSOR_COUNT) >
427 	    ((incore_seg_count / 8) * c_seg_max_pages)) {
428 		return true;
429 	}
430 	return false;
431 }
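/*
 * Worked example (illustrative, not from the original source): with
 * c_seg_max_pages = 16 and 800 in-core segments, nominal capacity is
 * 12,800 pages; if VM_PAGE_COMPRESSOR_COUNT is 11,000, the unused capacity
 * of 1,800 pages exceeds one eighth of nominal (1,600), so a major
 * compaction is indicated (assuming the segment count is also past
 * c_segments_nearing_limit / 8).
 */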
432 
433 
434 uint64_t
435 vm_available_memory(void)
436 {
437 	return ((uint64_t)AVAILABLE_NON_COMPRESSED_MEMORY) * PAGE_SIZE_64;
438 }
439 
440 
441 uint32_t
442 vm_compressor_pool_size(void)
443 {
444 	return VM_PAGE_COMPRESSOR_COUNT;
445 }
446 
447 uint32_t
448 vm_compressor_fragmentation_level(void)
449 {
450 	const uint32_t incore_seg_count = c_segment_count - c_swappedout_count - c_swappedout_sparse_count;
451 	if ((incore_seg_count == 0) || (c_seg_max_pages == 0)) {
452 		return 0;
453 	}
454 	return 100 - (vm_compressor_pool_size() * 100 / (incore_seg_count * c_seg_max_pages));
455 }
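/*
 * Worked example (illustrative): 300 in-core segments with c_seg_max_pages
 * = 16 give a nominal capacity of 4,800 pages; if the pool currently holds
 * 4,000 physical pages, the fragmentation level is
 * 100 - (4000 * 100 / 4800) = 17, i.e. roughly 17% of segment space is empty.
 */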
456 
457 uint32_t
458 vm_compression_ratio(void)
459 {
460 	if (vm_compressor_pool_size() == 0) {
461 		return UINT32_MAX;
462 	}
463 	return c_segment_pages_compressed / vm_compressor_pool_size();
464 }
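/*
 * Worked example (illustrative): if 30,000 uncompressed pages have been
 * handed to the compressor (c_segment_pages_compressed) and they occupy
 * 10,000 physical pages (vm_compressor_pool_size()), the reported ratio is
 * 3. An empty pool reports UINT32_MAX to avoid dividing by zero.
 */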
465 
466 uint64_t
467 vm_compressor_pages_compressed(void)
468 {
469 	return c_segment_pages_compressed * PAGE_SIZE_64;
470 }
471 
472 bool
473 vm_compressor_compressed_pages_nearing_limit(void)
474 {
475 	uint32_t pages = 0;
476 
477 #if CONFIG_FREEZE
478 	pages = os_atomic_load(&c_segment_pages_compressed_incore, relaxed);
479 #else /* CONFIG_FREEZE */
480 	pages = c_segment_pages_compressed;
481 #endif /* CONFIG_FREEZE */
482 
483 	return pages > c_segment_pages_compressed_nearing_limit;
484 }
485 
486 static bool
487 vm_compressor_segments_nearing_limit(void)
488 {
489 	uint64_t segments;
490 
491 #if CONFIG_FREEZE
492 	if (freezer_incore_cseg_acct) {
493 		if (os_sub_overflow(c_segment_count, c_swappedout_count, &segments)) {
494 			segments = 0;
495 		}
496 		if (os_sub_overflow(segments, c_swappedout_sparse_count, &segments)) {
497 			segments = 0;
498 		}
499 	} else {
500 		segments = os_atomic_load(&c_segment_count, relaxed);
501 	}
502 #else /* CONFIG_FREEZE */
503 	segments = c_segment_count;
504 #endif /* CONFIG_FREEZE */
505 
506 	return segments > c_segments_nearing_limit;
507 }
508 
509 boolean_t
510 vm_compressor_low_on_space(void)
511 {
512 	return vm_compressor_compressed_pages_nearing_limit() ||
513 	       vm_compressor_segments_nearing_limit();
514 }
515 
516 
517 boolean_t
518 vm_compressor_out_of_space(void)
519 {
520 #if CONFIG_FREEZE
521 	uint64_t incore_seg_count;
522 	uint32_t incore_compressed_pages;
523 	if (freezer_incore_cseg_acct) {
524 		if (os_sub_overflow(c_segment_count, c_swappedout_count, &incore_seg_count)) {
525 			incore_seg_count = 0;
526 		}
527 		if (os_sub_overflow(incore_seg_count, c_swappedout_sparse_count, &incore_seg_count)) {
528 			incore_seg_count = 0;
529 		}
530 		incore_compressed_pages = os_atomic_load(&c_segment_pages_compressed_incore, relaxed);
531 	} else {
532 		incore_seg_count = os_atomic_load(&c_segment_count, relaxed);
533 		incore_compressed_pages = os_atomic_load(&c_segment_pages_compressed_incore, relaxed);
534 	}
535 
536 	if ((incore_compressed_pages >= c_segment_pages_compressed_limit) ||
537 	    (incore_seg_count > c_segments_incore_limit)) {
538 		return TRUE;
539 	}
540 #else /* CONFIG_FREEZE */
541 	if ((c_segment_pages_compressed >= c_segment_pages_compressed_limit) ||
542 	    (c_segment_count >= c_segments_limit)) {
543 		return TRUE;
544 	}
545 #endif /* CONFIG_FREEZE */
546 	return FALSE;
547 }
548 
549 bool
550 vm_compressor_is_thrashing()
551 {
552 	compute_swapout_target_age();
553 
554 	if (swapout_target_age) {
555 		c_segment_t     c_seg;
556 
557 		lck_mtx_lock_spin_always(c_list_lock);
558 
559 		if (!queue_empty(&c_age_list_head)) {
560 			c_seg = (c_segment_t) queue_first(&c_age_list_head);
561 
562 			if (c_seg->c_creation_ts > swapout_target_age) {
563 				swapout_target_age = 0;
564 			}
565 		}
566 		lck_mtx_unlock_always(c_list_lock);
567 	}
568 
569 	return swapout_target_age != 0;
570 }
571 
572 
573 int
574 vm_wants_task_throttled(task_t task)
575 {
576 	ledger_amount_t compressed;
577 	if (task == kernel_task) {
578 		return 0;
579 	}
580 
581 	if (VM_CONFIG_SWAP_IS_ACTIVE) {
582 		if ((vm_compressor_low_on_space() || HARD_THROTTLE_LIMIT_REACHED())) {
583 			ledger_get_balance(task->ledger, task_ledgers.internal_compressed, &compressed);
584 			compressed >>= VM_MAP_PAGE_SHIFT(task->map);
585 			if ((unsigned int)compressed > (c_segment_pages_compressed / 4)) {
586 				return 1;
587 			}
588 		}
589 	}
590 	return 0;
591 }
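/*
 * Summary (added comment): a non-kernel task is throttled only when swap is
 * active, the compressor is low on space (or the hard throttle limit has been
 * reached), and that task's own compressed footprint exceeds a quarter of all
 * compressed pages. E.g. (illustrative) with 400,000 pages compressed
 * system-wide, a task holding more than 100,000 of them gets throttled.
 */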
592 
593 
594 #if DEVELOPMENT || DEBUG
595 /*
596  * On compressor/swap exhaustion, kill the largest process regardless of
597  * its chosen process policy.
598  */
599 TUNABLE(bool, kill_on_no_paging_space, "-kill_on_no_paging_space", false);
600 #endif /* DEVELOPMENT || DEBUG */
601 
602 #if CONFIG_JETSAM
603 boolean_t       memorystatus_kill_on_VM_compressor_space_shortage(boolean_t);
604 void            memorystatus_thread_wake(void);
605 extern uint32_t jetsam_kill_on_low_swap;
606 bool            memorystatus_disable_swap(void);
607 #if CONFIG_PHANTOM_CACHE
608 extern bool memorystatus_phantom_cache_pressure;
609 #endif /* CONFIG_PHANTOM_CACHE */
610 int             compressor_thrashing_induced_jetsam = 0;
611 int             filecache_thrashing_induced_jetsam = 0;
612 static boolean_t        vm_compressor_thrashing_detected = FALSE;
613 #else  /* CONFIG_JETSAM */
614 static uint32_t no_paging_space_action_in_progress = 0;
615 extern void memorystatus_send_low_swap_note(void);
616 #endif /* CONFIG_JETSAM */
617 
618 static void
619 vm_compressor_take_paging_space_action(void)
620 {
621 #if CONFIG_JETSAM
622 	/*
623 	 * On systems with both swap and jetsam,
624 	 * just wake up the jetsam thread and have it handle the low swap condition
625 	 * by killing apps.
626 	 */
627 	if (jetsam_kill_on_low_swap) {
628 		memorystatus_thread_wake();
629 	}
630 #else /* CONFIG_JETSAM */
631 	if (no_paging_space_action_in_progress == 0) {
632 		if (OSCompareAndSwap(0, 1, (UInt32 *)&no_paging_space_action_in_progress)) {
633 			if (no_paging_space_action()) {
634 #if DEVELOPMENT || DEBUG
635 				if (kill_on_no_paging_space) {
636 					/*
637 					 * Since we are choosing to always kill a process, we don't need the
638 					 * "out of application memory" dialog box in this mode, and hence we won't
639 					 * send the knote.
640 					 */
641 					no_paging_space_action_in_progress = 0;
642 					return;
643 				}
644 #endif /* DEVELOPMENT || DEBUG */
645 				memorystatus_send_low_swap_note();
646 			}
647 
648 			no_paging_space_action_in_progress = 0;
649 		}
650 	}
651 #endif /* !CONFIG_JETSAM */
652 }
653 
654 
655 void
656 vm_decompressor_lock(void)
657 {
658 	PAGE_REPLACEMENT_ALLOWED(TRUE);
659 
660 	decompressions_blocked = TRUE;
661 
662 	PAGE_REPLACEMENT_ALLOWED(FALSE);
663 }
664 
665 void
666 vm_decompressor_unlock(void)
667 {
668 	PAGE_REPLACEMENT_ALLOWED(TRUE);
669 
670 	decompressions_blocked = FALSE;
671 
672 	PAGE_REPLACEMENT_ALLOWED(FALSE);
673 
674 	thread_wakeup((event_t)&decompressions_blocked);
675 }
676 
677 static inline void
678 cslot_copy(c_slot_t cdst, c_slot_t csrc)
679 {
680 #if CHECKSUM_THE_DATA
681 	cdst->c_hash_data = csrc->c_hash_data;
682 #endif
683 #if CHECKSUM_THE_COMPRESSED_DATA
684 	cdst->c_hash_compressed_data = csrc->c_hash_compressed_data;
685 #endif
686 #if POPCOUNT_THE_COMPRESSED_DATA
687 	cdst->c_pop_cdata = csrc->c_pop_cdata;
688 #endif
689 	cdst->c_size = csrc->c_size;
690 	cdst->c_packed_ptr = csrc->c_packed_ptr;
691 #if defined(__arm64__)
692 	cdst->c_codec = csrc->c_codec;
693 #endif
694 #if __APPLE_WKDM_POPCNT_EXTENSIONS__
695 	cdst->c_inline_popcount = csrc->c_inline_popcount;
696 #endif
697 }
698 
699 #if XNU_TARGET_OS_OSX
700 #define VM_COMPRESSOR_MAX_POOL_SIZE (192UL << 30)
701 #else
702 #define VM_COMPRESSOR_MAX_POOL_SIZE (0)
703 #endif
704 
705 static vm_map_size_t compressor_size;
706 static SECURITY_READ_ONLY_LATE(struct mach_vm_range) compressor_range;
707 vm_map_t compressor_map;
708 uint64_t compressor_pool_max_size;
709 uint64_t compressor_pool_size;
710 uint32_t compressor_pool_multiplier;
711 
712 #if DEVELOPMENT || DEBUG
713 /*
714  * Compressor segments are write-protected in development/debug
715  * kernels to help debug memory corruption.
716  * In cases where performance is a concern, this can be disabled
717  * via the boot-arg "-disable_cseg_write_protection".
718  */
719 boolean_t write_protect_c_segs = TRUE;
720 int vm_compressor_test_seg_wp;
721 uint32_t vm_ktrace_enabled;
722 #endif /* DEVELOPMENT || DEBUG */
723 
724 #if (XNU_TARGET_OS_OSX && __arm64__)
725 
726 #include <IOKit/IOPlatformExpert.h>
727 #include <sys/random.h>
728 
729 static const char *csegbufsizeExperimentProperty = "_csegbufsz_experiment";
730 static thread_call_t csegbufsz_experiment_thread_call;
731 
732 extern boolean_t IOServiceWaitForMatchingResource(const char * property, uint64_t timeout);
733 static void
734 erase_csegbufsz_experiment_property(__unused void *param0, __unused void *param1)
735 {
736 	// Wait for NVRAM to be writable
737 	if (!IOServiceWaitForMatchingResource("IONVRAM", UINT64_MAX)) {
738 		printf("csegbufsz_experiment_property: Failed to wait for IONVRAM.");
739 	}
740 
741 	if (!PERemoveNVRAMProperty(csegbufsizeExperimentProperty)) {
742 		printf("csegbufsize_experiment_property: Failed to remove %s from NVRAM.", csegbufsizeExperimentProperty);
743 	}
744 	thread_call_free(csegbufsz_experiment_thread_call);
745 }
746 
747 static void
748 erase_csegbufsz_experiment_property_async()
749 {
750 	csegbufsz_experiment_thread_call = thread_call_allocate_with_priority(
751 		erase_csegbufsz_experiment_property,
752 		NULL,
753 		THREAD_CALL_PRIORITY_LOW
754 		);
755 	if (csegbufsz_experiment_thread_call == NULL) {
756 		printf("csegbufsize_experiment_property: Unable to allocate thread call.");
757 	} else {
758 		thread_call_enter(csegbufsz_experiment_thread_call);
759 	}
760 }
761 
762 static void
763 cleanup_csegbufsz_experiment(__unused void *arg0)
764 {
765 	char nvram = 0;
766 	unsigned int len = sizeof(nvram);
767 	if (PEReadNVRAMProperty(csegbufsizeExperimentProperty, &nvram, &len)) {
768 		erase_csegbufsz_experiment_property_async();
769 	}
770 }
771 
772 STARTUP_ARG(EARLY_BOOT, STARTUP_RANK_FIRST, cleanup_csegbufsz_experiment, NULL);
773 #endif /* XNU_TARGET_OS_OSX && __arm64__ */
774 
775 #if CONFIG_JETSAM
776 extern unsigned int memorystatus_swap_all_apps;
777 #endif /* CONFIG_JETSAM */
778 
779 TUNABLE_DT(uint64_t, swap_vol_min_capacity, "/defaults", "kern.swap_min_capacity", "kern.swap_min_capacity", 0, TUNABLE_DT_NONE);
780 
781 static void
782 vm_compressor_set_size(void)
783 {
784 	/*
785 	 * Note that this function may be called multiple times on systems with app swap
786 	 * because the value of vm_swap_get_max_configured_space() and memorystatus_swap_all_apps
787 	 * can change based on the size of the swap volume. On these systems, we'll call
788 	 * this function once early in boot to reserve the maximum amount of VA required
789 	 * for the compressor submap and then one more time in vm_compressor_init after
790 	 * determining the swap volume size. We must not return a larger value the second
791 	 * time around.
792 	 */
793 	vm_size_t       c_segments_arr_size = 0;
794 	struct c_slot_mapping tmp_slot_ptr;
795 
796 	/* The segment size can be overwritten by a boot-arg */
797 	if (!PE_parse_boot_argn("vm_compressor_segment_buffer_size", &c_seg_bufsize, sizeof(c_seg_bufsize))) {
798 #if CONFIG_JETSAM
799 		if (memorystatus_swap_all_apps) {
800 			c_seg_bufsize = C_SEG_BUFSIZE_ARM_SWAP;
801 		} else {
802 			c_seg_bufsize = C_SEG_BUFSIZE_DEFAULT;
803 		}
804 #else
805 		c_seg_bufsize = C_SEG_BUFSIZE_DEFAULT;
806 #endif /* CONFIG_JETSAM */
807 	}
808 
809 	vm_compressor_swap_init_swap_file_limit();
810 	if (vm_compression_limit) {
811 		compressor_pool_size = ptoa_64(vm_compression_limit);
812 	}
813 
814 	compressor_pool_max_size = C_SEG_MAX_LIMIT;
815 	compressor_pool_max_size *= c_seg_bufsize;
816 
817 #if XNU_TARGET_OS_OSX
818 
819 	if (vm_compression_limit == 0) {
820 		if (max_mem <= (4ULL * 1024ULL * 1024ULL * 1024ULL)) {
821 			compressor_pool_size = 16ULL * max_mem;
822 		} else if (max_mem <= (8ULL * 1024ULL * 1024ULL * 1024ULL)) {
823 			compressor_pool_size = 8ULL * max_mem;
824 		} else if (max_mem <= (32ULL * 1024ULL * 1024ULL * 1024ULL)) {
825 			compressor_pool_size = 4ULL * max_mem;
826 		} else {
827 			compressor_pool_size = 2ULL * max_mem;
828 		}
829 	}
830 	/*
831 	 * Cap the compressor pool size to a max of 192G
832 	 */
833 	if (compressor_pool_size > VM_COMPRESSOR_MAX_POOL_SIZE) {
834 		compressor_pool_size = VM_COMPRESSOR_MAX_POOL_SIZE;
835 	}
836 	if (max_mem <= (8ULL * 1024ULL * 1024ULL * 1024ULL)) {
837 		compressor_pool_multiplier = 1;
838 	} else if (max_mem <= (32ULL * 1024ULL * 1024ULL * 1024ULL)) {
839 		compressor_pool_multiplier = 2;
840 	} else {
841 		compressor_pool_multiplier = 4;
842 	}
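	/*
	 * Worked example (illustrative, macOS): a 16GB machine falls in the
	 * "<= 32GB" tier, so compressor_pool_size = 4 * 16GB = 64GB of reserved
	 * VA and compressor_pool_multiplier = 2; a 128GB machine would compute
	 * 2 * 128GB = 256GB but is capped at VM_COMPRESSOR_MAX_POOL_SIZE
	 * (192GB), with a multiplier of 4.
	 */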
843 
844 #elif defined(__arm64__) && defined(XNU_TARGET_OS_WATCH)
845 
846 	/*
847 	 * On M9 watches the compressor can become big and can lead to
848 	 * churn in the working set, resulting in audio drops. Setting a cap
849 	 * on the compressor size favors reclaiming unused memory
850 	 * sitting in the idle band via jetsams.
851 	 */
852 
853 #define COMPRESSOR_CAP_PERCENTAGE        37ULL
854 
855 	if (compressor_pool_max_size > max_mem) {
856 		compressor_pool_max_size = max_mem;
857 	}
858 
859 	if (vm_compression_limit == 0) {
860 		compressor_pool_size = (max_mem * COMPRESSOR_CAP_PERCENTAGE) / 100ULL;
861 	}
862 	compressor_pool_multiplier = 1;
863 
864 #else
865 
866 	if (compressor_pool_max_size > max_mem) {
867 		compressor_pool_max_size = max_mem;
868 	}
869 
870 	if (vm_compression_limit == 0) {
871 		compressor_pool_size = max_mem;
872 	}
873 	compressor_pool_multiplier = 1;
874 #endif
875 	if (compressor_pool_size > compressor_pool_max_size) {
876 		compressor_pool_size = compressor_pool_max_size;
877 	}
878 
879 	c_seg_max_pages = (c_seg_bufsize / PAGE_SIZE);
880 	c_seg_slot_var_array_min_len = c_seg_max_pages;
881 
882 #if !defined(__x86_64__)
883 	c_seg_off_limit = (C_SEG_BYTES_TO_OFFSET((c_seg_bufsize - 512)));
884 	c_seg_allocsize = (c_seg_bufsize + PAGE_SIZE);
885 #else
886 	c_seg_off_limit = (C_SEG_BYTES_TO_OFFSET((c_seg_bufsize - 128)));
887 	c_seg_allocsize = c_seg_bufsize;
888 #endif /* !defined(__x86_64__) */
889 
890 	c_segments_limit = (uint32_t)(compressor_pool_size / (vm_size_t)(c_seg_allocsize));
891 	tmp_slot_ptr.s_cseg = c_segments_limit;
892 	/* Panic on internal configs*/
893 	assertf((tmp_slot_ptr.s_cseg == c_segments_limit), "vm_compressor_init: overflowed s_cseg field in c_slot_mapping with c_segno: %d", c_segments_limit);
894 
895 	if (tmp_slot_ptr.s_cseg != c_segments_limit) {
896 		tmp_slot_ptr.s_cseg = -1;
897 		c_segments_limit = tmp_slot_ptr.s_cseg - 1; /*limited by segment idx bits in c_slot_mapping*/
898 		compressor_pool_size = (c_segments_limit * (vm_size_t)(c_seg_allocsize));
899 	}
900 
901 	c_segments_nearing_limit = (uint32_t)(((uint64_t)c_segments_limit * 98ULL) / 100ULL);
902 
903 	c_segment_pages_compressed_limit = (c_segments_limit * (c_seg_bufsize / PAGE_SIZE) * compressor_pool_multiplier);
904 
905 	if (c_segment_pages_compressed_limit < (uint32_t)(max_mem / PAGE_SIZE)) {
906 #if defined(XNU_TARGET_OS_WATCH)
907 		c_segment_pages_compressed_limit = (uint32_t)(max_mem / PAGE_SIZE);
908 #else
909 		if (!vm_compression_limit) {
910 			c_segment_pages_compressed_limit = (uint32_t)(max_mem / PAGE_SIZE);
911 		}
912 #endif
913 	}
914 
915 	c_segment_pages_compressed_nearing_limit = (uint32_t)(((uint64_t)c_segment_pages_compressed_limit * 98ULL) / 100ULL);
916 
917 #if CONFIG_FREEZE
918 	/*
919 	 * Our in-core limits are based on the size of the compressor pool.
920 	 * The c_segments_nearing_limit is also based on the compressor pool
921 	 * size and calculated above.
922 	 */
923 	c_segments_incore_limit = c_segments_limit;
924 
925 	if (freezer_incore_cseg_acct) {
926 		/*
927 		 * Add enough segments to track all frozen c_segs that can be stored in swap.
928 		 */
929 		c_segments_limit += (uint32_t)(vm_swap_get_max_configured_space() / (vm_size_t)(c_seg_allocsize));
930 		tmp_slot_ptr.s_cseg = c_segments_limit;
931 		/* Panic on internal configs*/
932 		assertf((tmp_slot_ptr.s_cseg == c_segments_limit), "vm_compressor_init: freezer reserve overflowed s_cseg field in c_slot_mapping with c_segno: %d", c_segments_limit);
933 	}
934 #endif
935 	/*
936 	 * Submap needs space for:
937 	 * - c_segments
938 	 * - c_buffers
939 	 * - swap reclamations -- c_seg_bufsize
940 	 */
941 	c_segments_arr_size = vm_map_round_page((sizeof(union c_segu) * c_segments_limit), VM_MAP_PAGE_MASK(kernel_map));
942 	c_buffers_size = vm_map_round_page(((vm_size_t)c_seg_allocsize * (vm_size_t)c_segments_limit), VM_MAP_PAGE_MASK(kernel_map));
943 
944 	compressor_size = c_segments_arr_size + c_buffers_size + c_seg_bufsize;
945 
946 #if RECORD_THE_COMPRESSED_DATA
947 	c_compressed_record_sbuf_size = (vm_size_t)c_seg_allocsize + (PAGE_SIZE * 2);
948 	compressor_size += c_compressed_record_sbuf_size;
949 #endif /* RECORD_THE_COMPRESSED_DATA */
950 }
951 STARTUP(KMEM, STARTUP_RANK_FIRST, vm_compressor_set_size);
952 
953 KMEM_RANGE_REGISTER_DYNAMIC(compressor, &compressor_range, ^() {
954 	return compressor_size;
955 });
956 
957 bool
958 osenvironment_is_diagnostics(void)
959 {
960 	DTEntry chosen;
961 	const char *osenvironment;
962 	unsigned int size;
963 	if (kSuccess == SecureDTLookupEntry(0, "/chosen", &chosen)) {
964 		if (kSuccess == SecureDTGetProperty(chosen, "osenvironment", (void const **) &osenvironment, &size)) {
965 			return strcmp(osenvironment, "diagnostics") == 0;
966 		}
967 	}
968 	return false;
969 }
970 
971 void
972 vm_compressor_init(void)
973 {
974 	thread_t        thread;
975 #if RECORD_THE_COMPRESSED_DATA
976 	vm_size_t       c_compressed_record_sbuf_size = 0;
977 #endif /* RECORD_THE_COMPRESSED_DATA */
978 
979 #if DEVELOPMENT || DEBUG || CONFIG_FREEZE
980 	char bootarg_name[32];
981 #endif /* DEVELOPMENT || DEBUG || CONFIG_FREEZE */
982 	__unused uint64_t early_boot_compressor_size = compressor_size;
983 
984 #if CONFIG_JETSAM
985 	if (memorystatus_swap_all_apps && osenvironment_is_diagnostics()) {
986 		printf("osenvironment == \"diagnostics\". Disabling app swap.\n");
987 		memorystatus_disable_swap();
988 	}
989 
990 	if (memorystatus_swap_all_apps) {
991 		/*
992 		 * App swap is disabled on devices with small NANDs.
993 		 * Now that we're no longer in early boot, we can get
994 		 * the NAND size and re-run vm_compressor_set_size.
995 		 */
996 		int error = vm_swap_vol_get_capacity(SWAP_VOLUME_NAME, &vm_swap_volume_capacity);
997 #if DEVELOPMENT || DEBUG
998 		if (error != 0) {
999 			panic("vm_compressor_init: Unable to get swap volume capacity. error=%d\n", error);
1000 		}
1001 #else
1002 		if (error != 0) {
1003 			os_log_with_startup_serial(OS_LOG_DEFAULT, "vm_compressor_init: Unable to get swap volume capacity. error=%d\n", error);
1004 		}
1005 #endif /* DEVELOPMENT || DEBUG */
1006 		if (vm_swap_volume_capacity < swap_vol_min_capacity) {
1007 			memorystatus_disable_swap();
1008 		}
1009 		/*
1010 		 * Resize the compressor and swap now that we know the capacity
1011 		 * of the swap volume.
1012 		 */
1013 		vm_compressor_set_size();
1014 		/*
1015 		 * We reserved a chunk of VA early in boot for the compressor submap.
1016 		 * We can't allocate more than that.
1017 		 */
1018 		assert(compressor_size <= early_boot_compressor_size);
1019 	}
1020 #endif /* CONFIG_JETSAM */
1021 
1022 #if DEVELOPMENT || DEBUG
1023 	if (PE_parse_boot_argn("-disable_cseg_write_protection", bootarg_name, sizeof(bootarg_name))) {
1024 		write_protect_c_segs = FALSE;
1025 	}
1026 
1027 	int vmcval = 1;
1028 #if defined(XNU_TARGET_OS_WATCH)
1029 	vmcval = 0;
1030 #endif /* XNU_TARGET_OS_WATCH */
1031 	PE_parse_boot_argn("vm_compressor_validation", &vmcval, sizeof(vmcval));
1032 
1033 	if (kern_feature_override(KF_COMPRSV_OVRD)) {
1034 		vmcval = 0;
1035 	}
1036 
1037 	if (vmcval == 0) {
1038 #if POPCOUNT_THE_COMPRESSED_DATA
1039 		popcount_c_segs = FALSE;
1040 #endif
1041 #if CHECKSUM_THE_DATA || CHECKSUM_THE_COMPRESSED_DATA
1042 		checksum_c_segs = FALSE;
1043 #endif
1044 #if VALIDATE_C_SEGMENTS
1045 		validate_c_segs = FALSE;
1046 #endif
1047 		write_protect_c_segs = FALSE;
1048 	}
1049 #endif /* DEVELOPMENT || DEBUG */
1050 
1051 #if CONFIG_FREEZE
1052 	if (PE_parse_boot_argn("-disable_freezer_cseg_acct", bootarg_name, sizeof(bootarg_name))) {
1053 		freezer_incore_cseg_acct = FALSE;
1054 	}
1055 #endif /* CONFIG_FREEZE */
1056 
1057 	assert((C_SEGMENTS_PER_PAGE * sizeof(union c_segu)) == PAGE_SIZE);
1058 
1059 #if !XNU_TARGET_OS_OSX
1060 	vm_compressor_minorcompact_threshold_divisor = 20;
1061 	vm_compressor_majorcompact_threshold_divisor = 30;
1062 	vm_compressor_unthrottle_threshold_divisor = 40;
1063 	vm_compressor_catchup_threshold_divisor = 60;
1064 #else /* !XNU_TARGET_OS_OSX */
1065 	if (max_mem <= (3ULL * 1024ULL * 1024ULL * 1024ULL)) {
1066 		vm_compressor_minorcompact_threshold_divisor = 11;
1067 		vm_compressor_majorcompact_threshold_divisor = 13;
1068 		vm_compressor_unthrottle_threshold_divisor = 20;
1069 		vm_compressor_catchup_threshold_divisor = 35;
1070 	} else {
1071 		vm_compressor_minorcompact_threshold_divisor = 20;
1072 		vm_compressor_majorcompact_threshold_divisor = 25;
1073 		vm_compressor_unthrottle_threshold_divisor = 35;
1074 		vm_compressor_catchup_threshold_divisor = 50;
1075 	}
1076 #endif /* !XNU_TARGET_OS_OSX */
1077 
1078 	queue_init(&c_bad_list_head);
1079 	queue_init(&c_age_list_head);
1080 	queue_init(&c_minor_list_head);
1081 	queue_init(&c_major_list_head);
1082 	queue_init(&c_filling_list_head);
1083 	queue_init(&c_early_swapout_list_head);
1084 	queue_init(&c_regular_swapout_list_head);
1085 	queue_init(&c_late_swapout_list_head);
1086 	queue_init(&c_swapio_list_head);
1087 	queue_init(&c_early_swappedin_list_head);
1088 	queue_init(&c_regular_swappedin_list_head);
1089 	queue_init(&c_late_swappedin_list_head);
1090 	queue_init(&c_swappedout_list_head);
1091 	queue_init(&c_swappedout_sparse_list_head);
1092 
1093 	c_free_segno_head = -1;
1094 	c_segments_available = 0;
1095 
1096 	compressor_map = kmem_suballoc(kernel_map, &compressor_range.min_address,
1097 	    compressor_size, VM_MAP_CREATE_NEVER_FAULTS,
1098 	    VM_FLAGS_FIXED_RANGE_SUBALLOC, KMS_NOFAIL | KMS_PERMANENT,
1099 	    VM_KERN_MEMORY_COMPRESSOR).kmr_submap;
1100 
1101 	kmem_alloc(compressor_map, (vm_offset_t *)(&c_segments),
1102 	    (sizeof(union c_segu) * c_segments_limit),
1103 	    KMA_NOFAIL | KMA_KOBJECT | KMA_VAONLY | KMA_PERMANENT,
1104 	    VM_KERN_MEMORY_COMPRESSOR);
1105 	kmem_alloc(compressor_map, &c_buffers, c_buffers_size,
1106 	    KMA_NOFAIL | KMA_COMPRESSOR | KMA_VAONLY | KMA_PERMANENT,
1107 	    VM_KERN_MEMORY_COMPRESSOR);
1108 
1109 #if DEVELOPMENT || DEBUG
1110 	if (hvg_is_hcall_available(HVG_HCALL_SET_COREDUMP_DATA)) {
1111 		hvg_hcall_set_coredump_data();
1112 	}
1113 #endif
1114 
1115 	/*
1116 	 * Pick a good size that will minimize fragmentation in zalloc
1117 	 * by minimizing the fragmentation in a 16k run.
1118 	 *
1119 	 * c_seg_slot_var_array_min_len is larger on 4k systems than 16k ones,
1120 	 * making the fragmentation in a 4k page terrible. Using 16k for all
1121 	 * systems matches zalloc() and will minimize fragmentation.
1122 	 */
1123 	uint32_t c_segment_size = sizeof(struct c_segment) + (c_seg_slot_var_array_min_len * sizeof(struct c_slot));
1124 	uint32_t cnt  = (16 << 10) / c_segment_size;
1125 	uint32_t frag = (16 << 10) % c_segment_size;
1126 
1127 	c_seg_fixed_array_len = c_seg_slot_var_array_min_len;
1128 
1129 	while (cnt * sizeof(struct c_slot) < frag) {
1130 		c_segment_size += sizeof(struct c_slot);
1131 		c_seg_fixed_array_len++;
1132 		frag -= cnt * sizeof(struct c_slot);
1133 	}
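	/*
	 * Illustrative walk-through of the loop above (sizes are hypothetical,
	 * not the real struct sizes): if c_segment_size started at 340 bytes,
	 * cnt = 16384 / 340 = 48 segments fit per 16KB run with frag = 64 bytes
	 * left over; since adding one c_slot to each of the 48 segments
	 * (48 * 8 = 384 bytes) would not fit in that leftover, the loop stops
	 * and c_seg_fixed_array_len stays at c_seg_slot_var_array_min_len.
	 */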
1134 
1135 	compressor_segment_zone = zone_create("compressor_segment",
1136 	    c_segment_size, ZC_PGZ_USE_GUARDS | ZC_NOENCRYPT | ZC_ZFREE_CLEARMEM);
1137 
1138 	c_segments_busy = FALSE;
1139 
1140 	c_segments_next_page = (caddr_t)c_segments;
1141 	vm_compressor_algorithm_init();
1142 
1143 	{
1144 		host_basic_info_data_t hinfo;
1145 		mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
1146 		size_t bufsize;
1147 		char *buf;
1148 
1149 #define BSD_HOST 1
1150 		host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
1151 
1152 		compressor_cpus = hinfo.max_cpus;
1153 
1154 		bufsize = PAGE_SIZE;
1155 		bufsize += compressor_cpus * vm_compressor_get_decode_scratch_size();
1156 		/* For the KDP path */
1157 		bufsize += vm_compressor_get_decode_scratch_size();
1158 #if CONFIG_FREEZE
1159 		bufsize += vm_compressor_get_encode_scratch_size();
1160 #endif
1161 #if RECORD_THE_COMPRESSED_DATA
1162 		bufsize += c_compressed_record_sbuf_size;
1163 #endif
1164 
1165 		kmem_alloc(kernel_map, (vm_offset_t *)&buf, bufsize,
1166 		    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT,
1167 		    VM_KERN_MEMORY_COMPRESSOR);
1168 
1169 		/*
1170 		 * kdp_compressor_decompressed_page must be page aligned because we access
1171 		 * it through the physical aperture by page number.
1172 		 */
1173 		kdp_compressor_decompressed_page = buf;
1174 		kdp_compressor_decompressed_page_paddr = kvtophys((vm_offset_t)kdp_compressor_decompressed_page);
1175 		kdp_compressor_decompressed_page_ppnum = (ppnum_t) atop(kdp_compressor_decompressed_page_paddr);
1176 		buf += PAGE_SIZE;
1177 		bufsize -= PAGE_SIZE;
1178 
1179 		compressor_scratch_bufs = buf;
1180 		buf += compressor_cpus * vm_compressor_get_decode_scratch_size();
1181 		bufsize -= compressor_cpus * vm_compressor_get_decode_scratch_size();
1182 
1183 		kdp_compressor_scratch_buf = buf;
1184 		buf += vm_compressor_get_decode_scratch_size();
1185 		bufsize -= vm_compressor_get_decode_scratch_size();
1186 
1187 #if CONFIG_FREEZE
1188 		freezer_context_global.freezer_ctx_compressor_scratch_buf = buf;
1189 		buf += vm_compressor_get_encode_scratch_size();
1190 		bufsize -= vm_compressor_get_encode_scratch_size();
1191 #endif
1192 
1193 #if RECORD_THE_COMPRESSED_DATA
1194 		c_compressed_record_sbuf = buf;
1195 		c_compressed_record_cptr = buf;
1196 		c_compressed_record_ebuf = c_compressed_record_sbuf + c_compressed_record_sbuf_size;
1197 		buf += c_compressed_record_sbuf_size;
1198 		bufsize -= c_compressed_record_sbuf_size;
1199 #endif
1200 		assert(bufsize == 0);
1201 	}
1202 
1203 	if (kernel_thread_start_priority((thread_continue_t)vm_compressor_swap_trigger_thread, NULL,
1204 	    BASEPRI_VM, &thread) != KERN_SUCCESS) {
1205 		panic("vm_compressor_swap_trigger_thread: create failed");
1206 	}
1207 	thread_deallocate(thread);
1208 
1209 	if (vm_pageout_internal_start() != KERN_SUCCESS) {
1210 		panic("vm_compressor_init: Failed to start the internal pageout thread.");
1211 	}
1212 	if (VM_CONFIG_SWAP_IS_PRESENT) {
1213 		vm_compressor_swap_init();
1214 	}
1215 
1216 	if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
1217 		vm_compressor_is_active = 1;
1218 	}
1219 
1220 #if CONFIG_FREEZE
1221 	memorystatus_freeze_enabled = TRUE;
1222 #endif /* CONFIG_FREEZE */
1223 
1224 	vm_compressor_available = 1;
1225 
1226 	vm_page_reactivate_all_throttled();
1227 
1228 	bzero(&vmcs_stats, sizeof(struct vm_compressor_swapper_stats));
1229 }
1230 
1231 
1232 #if VALIDATE_C_SEGMENTS
1233 
1234 static void
1235 c_seg_validate(c_segment_t c_seg, boolean_t must_be_compact)
1236 {
1237 	uint16_t        c_indx;
1238 	int32_t         bytes_used;
1239 	uint32_t        c_rounded_size;
1240 	uint32_t        c_size;
1241 	c_slot_t        cs;
1242 
1243 	if (__probable(validate_c_segs == FALSE)) {
1244 		return;
1245 	}
1246 	if (c_seg->c_firstemptyslot < c_seg->c_nextslot) {
1247 		c_indx = c_seg->c_firstemptyslot;
1248 		cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
1249 
1250 		if (cs == NULL) {
1251 			panic("c_seg_validate:  no slot backing c_firstemptyslot");
1252 		}
1253 
1254 		if (cs->c_size) {
1255 			panic("c_seg_validate:  c_firstemptyslot has non-zero size (%d)", cs->c_size);
1256 		}
1257 	}
1258 	bytes_used = 0;
1259 
1260 	for (c_indx = 0; c_indx < c_seg->c_nextslot; c_indx++) {
1261 		cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
1262 
1263 		c_size = UNPACK_C_SIZE(cs);
1264 
1265 		c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;
1266 
1267 		bytes_used += c_rounded_size;
1268 
1269 #if CHECKSUM_THE_COMPRESSED_DATA
1270 		unsigned csvhash;
1271 		if (c_size && cs->c_hash_compressed_data != (csvhash = vmc_hash((char *)&c_seg->c_store.c_buffer[cs->c_offset], c_size))) {
1272 			addr64_t csvphys = kvtophys((vm_offset_t)&c_seg->c_store.c_buffer[cs->c_offset]);
1273 			panic("Compressed data doesn't match original %p phys: 0x%llx %d %p %d %d 0x%x 0x%x", c_seg, csvphys, cs->c_offset, cs, c_indx, c_size, cs->c_hash_compressed_data, csvhash);
1274 		}
1275 #endif
1276 #if POPCOUNT_THE_COMPRESSED_DATA
1277 		unsigned csvpop;
1278 		if (c_size) {
1279 			uintptr_t csvaddr = (uintptr_t) &c_seg->c_store.c_buffer[cs->c_offset];
1280 			if (cs->c_pop_cdata != (csvpop = vmc_pop(csvaddr, c_size))) {
1281 				panic("Compressed data popcount doesn't match original, bit distance: %d %p (phys: %p) %p %p 0x%llx 0x%x 0x%x 0x%x", (csvpop - cs->c_pop_cdata), (void *)csvaddr, (void *) kvtophys(csvaddr), c_seg, cs, (uint64_t)cs->c_offset, c_size, csvpop, cs->c_pop_cdata);
1282 			}
1283 		}
1284 #endif
1285 	}
1286 
1287 	if (bytes_used != c_seg->c_bytes_used) {
1288 		panic("c_seg_validate: bytes_used mismatch - found %d, segment has %d", bytes_used, c_seg->c_bytes_used);
1289 	}
1290 
1291 	if (c_seg->c_bytes_used > C_SEG_OFFSET_TO_BYTES((int32_t)c_seg->c_nextoffset)) {
1292 		panic("c_seg_validate: c_bytes_used > c_nextoffset - c_nextoffset = %d,  c_bytes_used = %d",
1293 		    (int32_t)C_SEG_OFFSET_TO_BYTES((int32_t)c_seg->c_nextoffset), c_seg->c_bytes_used);
1294 	}
1295 
1296 	if (must_be_compact) {
1297 		if (c_seg->c_bytes_used != C_SEG_OFFSET_TO_BYTES((int32_t)c_seg->c_nextoffset)) {
1298 			panic("c_seg_validate: c_bytes_used doesn't match c_nextoffset - c_nextoffset = %d,  c_bytes_used = %d",
1299 			    (int32_t)C_SEG_OFFSET_TO_BYTES((int32_t)c_seg->c_nextoffset), c_seg->c_bytes_used);
1300 		}
1301 	}
1302 }
1303 
1304 #endif
1305 
1306 
1307 void
1308 c_seg_need_delayed_compaction(c_segment_t c_seg, boolean_t c_list_lock_held)
1309 {
1310 	boolean_t       clear_busy = FALSE;
1311 
1312 	if (c_list_lock_held == FALSE) {
1313 		if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
1314 			C_SEG_BUSY(c_seg);
1315 
1316 			lck_mtx_unlock_always(&c_seg->c_lock);
1317 			lck_mtx_lock_spin_always(c_list_lock);
1318 			lck_mtx_lock_spin_always(&c_seg->c_lock);
1319 
1320 			clear_busy = TRUE;
1321 		}
1322 	}
1323 	assert(c_seg->c_state != C_IS_FILLING);
1324 
1325 	if (!c_seg->c_on_minorcompact_q && !(C_SEG_IS_ON_DISK_OR_SOQ(c_seg)) && !c_seg->c_has_donated_pages) {
1326 		queue_enter(&c_minor_list_head, c_seg, c_segment_t, c_list);
1327 		c_seg->c_on_minorcompact_q = 1;
1328 		c_minor_count++;
1329 	}
1330 	if (c_list_lock_held == FALSE) {
1331 		lck_mtx_unlock_always(c_list_lock);
1332 	}
1333 
1334 	if (clear_busy == TRUE) {
1335 		C_SEG_WAKEUP_DONE(c_seg);
1336 	}
1337 }
1338 
1339 
1340 unsigned int c_seg_moved_to_sparse_list = 0;
1341 
1342 void
1343 c_seg_move_to_sparse_list(c_segment_t c_seg)
1344 {
1345 	boolean_t       clear_busy = FALSE;
1346 
1347 	if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
1348 		C_SEG_BUSY(c_seg);
1349 
1350 		lck_mtx_unlock_always(&c_seg->c_lock);
1351 		lck_mtx_lock_spin_always(c_list_lock);
1352 		lck_mtx_lock_spin_always(&c_seg->c_lock);
1353 
1354 		clear_busy = TRUE;
1355 	}
1356 	c_seg_switch_state(c_seg, C_ON_SWAPPEDOUTSPARSE_Q, FALSE);
1357 
1358 	c_seg_moved_to_sparse_list++;
1359 
1360 	lck_mtx_unlock_always(c_list_lock);
1361 
1362 	if (clear_busy == TRUE) {
1363 		C_SEG_WAKEUP_DONE(c_seg);
1364 	}
1365 }
1366 
1367 
1368 void
1369 c_seg_insert_into_q(queue_head_t *qhead, c_segment_t c_seg)
1370 {
1371 	c_segment_t c_seg_next;
1372 
1373 	if (queue_empty(qhead)) {
1374 		queue_enter(qhead, c_seg, c_segment_t, c_age_list);
1375 	} else {
1376 		c_seg_next = (c_segment_t)queue_first(qhead);
1377 
1378 		while (TRUE) {
1379 			if (c_seg->c_generation_id < c_seg_next->c_generation_id) {
1380 				queue_insert_before(qhead, c_seg, c_seg_next, c_segment_t, c_age_list);
1381 				break;
1382 			}
1383 			c_seg_next = (c_segment_t) queue_next(&c_seg_next->c_age_list);
1384 
1385 			if (queue_end(qhead, (queue_entry_t) c_seg_next)) {
1386 				queue_enter(qhead, c_seg, c_segment_t, c_age_list);
1387 				break;
1388 			}
1389 		}
1390 	}
1391 }
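/*
 * Note (added comment): the insertion above keeps the queue ordered by
 * ascending c_generation_id, i.e. oldest segments first, which the age-based
 * swapout and compaction sweeps walk from the head.
 */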
1392 
1393 
1394 int try_minor_compaction_failed = 0;
1395 int try_minor_compaction_succeeded = 0;
1396 
1397 void
1398 c_seg_try_minor_compaction_and_unlock(c_segment_t c_seg)
1399 {
1400 	assert(c_seg->c_on_minorcompact_q);
1401 	/*
1402 	 * c_seg is currently on the delayed minor compaction
1403 	 * queue and we have c_seg locked... if we can get the
1404 	 * c_list_lock w/o blocking (if we blocked we could deadlock
1405 	 * because the lock order is c_list_lock then c_seg's lock)
1406 	 * we'll pull it from the delayed list and free it directly
1407 	 */
1408 	if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
1409 		/*
1410 		 * c_list_lock is held, we need to bail
1411 		 */
1412 		try_minor_compaction_failed++;
1413 
1414 		lck_mtx_unlock_always(&c_seg->c_lock);
1415 	} else {
1416 		try_minor_compaction_succeeded++;
1417 
1418 		C_SEG_BUSY(c_seg);
1419 		c_seg_do_minor_compaction_and_unlock(c_seg, TRUE, FALSE, FALSE);
1420 	}
1421 }
1422 
1423 
1424 int
1425 c_seg_do_minor_compaction_and_unlock(c_segment_t c_seg, boolean_t clear_busy, boolean_t need_list_lock, boolean_t disallow_page_replacement)
1426 {
1427 	int     c_seg_freed;
1428 
1429 	assert(c_seg->c_busy);
1430 	assert(!C_SEG_IS_ON_DISK_OR_SOQ(c_seg));
1431 
1432 	/*
1433 	 * check for the case that can occur when we are not swapping
1434 	 * and this segment has been major compacted in the past
1435 	 * and moved to the majorcompact q to remove it from further
1436 	 * consideration... if the occupancy falls too low we need
1437 	 * to put it back on the age_q so that it will be considered
1438 	 * in the next major compaction sweep... if we don't do this
1439 	 * we will eventually run into the c_segments_limit
1440 	 */
1441 	if (c_seg->c_state == C_ON_MAJORCOMPACT_Q && C_SEG_SHOULD_MAJORCOMPACT_NOW(c_seg)) {
1442 		c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
1443 	}
1444 	if (!c_seg->c_on_minorcompact_q) {
1445 		if (clear_busy == TRUE) {
1446 			C_SEG_WAKEUP_DONE(c_seg);
1447 		}
1448 
1449 		lck_mtx_unlock_always(&c_seg->c_lock);
1450 
1451 		return 0;
1452 	}
1453 	queue_remove(&c_minor_list_head, c_seg, c_segment_t, c_list);
1454 	c_seg->c_on_minorcompact_q = 0;
1455 	c_minor_count--;
1456 
1457 	lck_mtx_unlock_always(c_list_lock);
1458 
1459 	if (disallow_page_replacement == TRUE) {
1460 		lck_mtx_unlock_always(&c_seg->c_lock);
1461 
1462 		PAGE_REPLACEMENT_DISALLOWED(TRUE);
1463 
1464 		lck_mtx_lock_spin_always(&c_seg->c_lock);
1465 	}
1466 	c_seg_freed = c_seg_minor_compaction_and_unlock(c_seg, clear_busy);
1467 
1468 	if (disallow_page_replacement == TRUE) {
1469 		PAGE_REPLACEMENT_DISALLOWED(FALSE);
1470 	}
1471 
1472 	if (need_list_lock == TRUE) {
1473 		lck_mtx_lock_spin_always(c_list_lock);
1474 	}
1475 
1476 	return c_seg_freed;
1477 }
1478 
1479 void
1480 kdp_compressor_busy_find_owner(event64_t wait_event, thread_waitinfo_t *waitinfo)
1481 {
1482 	c_segment_t c_seg = (c_segment_t) wait_event;
1483 
1484 	waitinfo->owner = thread_tid(c_seg->c_busy_for_thread);
1485 	waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(c_seg);
1486 }
1487 
1488 #if DEVELOPMENT || DEBUG
1489 int
1490 do_cseg_wedge_thread(void)
1491 {
1492 	struct c_segment c_seg;
1493 	c_seg.c_busy_for_thread = current_thread();
1494 
1495 	debug_cseg_wait_event = (event_t) &c_seg;
1496 
1497 	thread_set_pending_block_hint(current_thread(), kThreadWaitCompressor);
1498 	assert_wait((event_t) (&c_seg), THREAD_INTERRUPTIBLE);
1499 
1500 	thread_block(THREAD_CONTINUE_NULL);
1501 
1502 	return 0;
1503 }
1504 
1505 int
1506 do_cseg_unwedge_thread(void)
1507 {
1508 	thread_wakeup(debug_cseg_wait_event);
1509 	debug_cseg_wait_event = NULL;
1510 
1511 	return 0;
1512 }
1513 #endif /* DEVELOPMENT || DEBUG */
1514 
1515 void
1516 c_seg_wait_on_busy(c_segment_t c_seg)
1517 {
1518 	c_seg->c_wanted = 1;
1519 
1520 	thread_set_pending_block_hint(current_thread(), kThreadWaitCompressor);
1521 	assert_wait((event_t) (c_seg), THREAD_UNINT);
1522 
1523 	lck_mtx_unlock_always(&c_seg->c_lock);
1524 	thread_block(THREAD_CONTINUE_NULL);
1525 }
1526 
1527 #if CONFIG_FREEZE
1528 /*
1529  * We don't have the task lock held while updating the task's
1530  * c_seg queues. We can do that because of the following restrictions:
1531  *
1532  * - SINGLE FREEZER CONTEXT:
1533  *   We 'insert' c_segs into the task list on the task_freeze path.
1534  *   There can only be one such freeze in progress and the task
1535  *   isn't disappearing because we have the VM map lock held throughout
1536  *   and we have a reference on the proc too.
1537  *
1538  * - SINGLE TASK DISOWN CONTEXT:
1539  *   We 'disown' c_segs of a task ONLY from the task_terminate context. So
1540  *   we don't need the task lock but we need the c_list_lock and the
1541  *   compressor master lock (shared). We also hold the individual
1542  *   c_seg locks (exclusive).
1543  *
1544  *   If we either:
1545  *   - can't get the c_seg lock on a try, then we start again because maybe
1546  *   the c_seg is part of a compaction and might get freed. So we can't trust
1547  *   that linkage and need to restart our queue traversal.
1548  *   - OR, we run into a busy c_seg (say being swapped in or free-ing) we
1549  *   drop all locks again and wait and restart our queue traversal.
1550  *
1551  * - The new_owner_task below is currently only the kernel or NULL.
1552  *
1553  */
1554 void
1555 c_seg_update_task_owner(c_segment_t c_seg, task_t new_owner_task)
1556 {
1557 	task_t          owner_task = c_seg->c_task_owner;
1558 	uint64_t        uncompressed_bytes = ((c_seg->c_slots_used) * PAGE_SIZE_64);
1559 
1560 	LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED);
1561 	LCK_MTX_ASSERT(&c_seg->c_lock, LCK_MTX_ASSERT_OWNED);
1562 
1563 	if (owner_task) {
1564 		task_update_frozen_to_swap_acct(owner_task, uncompressed_bytes, DEBIT_FROM_SWAP);
1565 		queue_remove(&owner_task->task_frozen_cseg_q, c_seg,
1566 		    c_segment_t, c_task_list_next_cseg);
1567 	}
1568 
1569 	if (new_owner_task) {
1570 		queue_enter(&new_owner_task->task_frozen_cseg_q, c_seg,
1571 		    c_segment_t, c_task_list_next_cseg);
1572 		task_update_frozen_to_swap_acct(new_owner_task, uncompressed_bytes, CREDIT_TO_SWAP);
1573 	}
1574 
1575 	c_seg->c_task_owner = new_owner_task;
1576 }
1577 
1578 void
1579 task_disown_frozen_csegs(task_t owner_task)
1580 {
1581 	c_segment_t c_seg = NULL, next_cseg = NULL;
1582 
1583 again:
1584 	PAGE_REPLACEMENT_DISALLOWED(TRUE);
1585 	lck_mtx_lock_spin_always(c_list_lock);
1586 
1587 	for (c_seg = (c_segment_t) queue_first(&owner_task->task_frozen_cseg_q);
1588 	    !queue_end(&owner_task->task_frozen_cseg_q, (queue_entry_t) c_seg);
1589 	    c_seg = next_cseg) {
1590 		next_cseg = (c_segment_t) queue_next(&c_seg->c_task_list_next_cseg);
1591 
1592 		if (!lck_mtx_try_lock_spin_always(&c_seg->c_lock)) {
1593 			lck_mtx_unlock(c_list_lock);
1594 			PAGE_REPLACEMENT_DISALLOWED(FALSE);
1595 			goto again;
1596 		}
1597 
1598 		if (c_seg->c_busy) {
1599 			lck_mtx_unlock(c_list_lock);
1600 			PAGE_REPLACEMENT_DISALLOWED(FALSE);
1601 
1602 			c_seg_wait_on_busy(c_seg);
1603 
1604 			goto again;
1605 		}
1606 		assert(c_seg->c_task_owner == owner_task);
1607 		c_seg_update_task_owner(c_seg, kernel_task);
1608 		lck_mtx_unlock_always(&c_seg->c_lock);
1609 	}
1610 
1611 	lck_mtx_unlock(c_list_lock);
1612 	PAGE_REPLACEMENT_DISALLOWED(FALSE);
1613 }
1614 #endif /* CONFIG_FREEZE */
1615 
1616 void
1617 c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head)
1618 {
1619 	int     old_state = c_seg->c_state;
1620 	queue_head_t *donate_swapout_list_head, *donate_swappedin_list_head;
1621 	uint32_t     *donate_swapout_count, *donate_swappedin_count;
1622 
1623 	/*
1624 	 * On macOS, the donate queue is swapped out first, i.e. it is the c_early_swapout
1625 	 * queue. On other swap-capable platforms, we want to swap those segments out
1626 	 * last, so we use the c_late_swapout queue.
1627 	 */
1628 #if XNU_TARGET_OS_OSX
1629 #if (DEVELOPMENT || DEBUG)
1630 	if (new_state != C_IS_FILLING) {
1631 		LCK_MTX_ASSERT(&c_seg->c_lock, LCK_MTX_ASSERT_OWNED);
1632 	}
1633 	LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED);
1634 #endif /* DEVELOPMENT || DEBUG */
1635 
1636 	donate_swapout_list_head = &c_early_swapout_list_head;
1637 	donate_swapout_count = &c_early_swapout_count;
1638 	donate_swappedin_list_head = &c_early_swappedin_list_head;
1639 	donate_swappedin_count = &c_early_swappedin_count;
1640 #else /* XNU_TARGET_OS_OSX */
1641 	donate_swapout_list_head = &c_late_swapout_list_head;
1642 	donate_swapout_count = &c_late_swapout_count;
1643 	donate_swappedin_list_head = &c_late_swappedin_list_head;
1644 	donate_swappedin_count = &c_late_swappedin_count;
1645 #endif /* XNU_TARGET_OS_OSX */
1646 
1647 	switch (old_state) {
1648 	case C_IS_EMPTY:
1649 		assert(new_state == C_IS_FILLING || new_state == C_IS_FREE);
1650 
1651 		c_empty_count--;
1652 		break;
1653 
1654 	case C_IS_FILLING:
1655 		assert(new_state == C_ON_AGE_Q || new_state == C_ON_SWAPOUT_Q);
1656 
1657 		queue_remove(&c_filling_list_head, c_seg, c_segment_t, c_age_list);
1658 		c_filling_count--;
1659 		break;
1660 
1661 	case C_ON_AGE_Q:
1662 		assert(new_state == C_ON_SWAPOUT_Q || new_state == C_ON_MAJORCOMPACT_Q ||
1663 		    new_state == C_IS_FREE);
1664 
1665 		queue_remove(&c_age_list_head, c_seg, c_segment_t, c_age_list);
1666 		c_age_count--;
1667 		break;
1668 
1669 	case C_ON_SWAPPEDIN_Q:
1670 		if (c_seg->c_has_donated_pages) {
1671 			assert(new_state == C_ON_SWAPOUT_Q || new_state == C_IS_FREE);
1672 			queue_remove(donate_swappedin_list_head, c_seg, c_segment_t, c_age_list);
1673 			*donate_swappedin_count -= 1;
1674 		} else {
1675 			assert(new_state == C_ON_AGE_Q || new_state == C_IS_FREE);
1676 #if CONFIG_FREEZE
1677 			assert(c_seg->c_has_freezer_pages);
1678 			queue_remove(&c_early_swappedin_list_head, c_seg, c_segment_t, c_age_list);
1679 			c_early_swappedin_count--;
1680 #else /* CONFIG_FREEZE */
1681 			queue_remove(&c_regular_swappedin_list_head, c_seg, c_segment_t, c_age_list);
1682 			c_regular_swappedin_count--;
1683 #endif /* CONFIG_FREEZE */
1684 		}
1685 		break;
1686 
1687 	case C_ON_SWAPOUT_Q:
1688 		assert(new_state == C_ON_AGE_Q || new_state == C_IS_FREE || new_state == C_IS_EMPTY || new_state == C_ON_SWAPIO_Q);
1689 
1690 #if CONFIG_FREEZE
1691 		if (c_seg->c_has_freezer_pages) {
1692 			if (c_seg->c_task_owner && (new_state != C_ON_SWAPIO_Q)) {
1693 				c_seg_update_task_owner(c_seg, NULL);
1694 			}
1695 			queue_remove(&c_early_swapout_list_head, c_seg, c_segment_t, c_age_list);
1696 			c_early_swapout_count--;
1697 		} else
1698 #endif /* CONFIG_FREEZE */
1699 		{
1700 			if (c_seg->c_has_donated_pages) {
1701 				queue_remove(donate_swapout_list_head, c_seg, c_segment_t, c_age_list);
1702 				*donate_swapout_count -= 1;
1703 			} else {
1704 				queue_remove(&c_regular_swapout_list_head, c_seg, c_segment_t, c_age_list);
1705 				c_regular_swapout_count--;
1706 			}
1707 		}
1708 
1709 		if (new_state == C_ON_AGE_Q) {
1710 			c_seg->c_has_donated_pages = 0;
1711 		}
1712 		thread_wakeup((event_t)&compaction_swapper_running);
1713 		break;
1714 
1715 	case C_ON_SWAPIO_Q:
1716 #if CONFIG_FREEZE
1717 		if (c_seg->c_has_freezer_pages) {
1718 			assert(new_state == C_ON_SWAPPEDOUT_Q || new_state == C_ON_SWAPPEDOUTSPARSE_Q || new_state == C_ON_AGE_Q);
1719 		} else
1720 #endif /* CONFIG_FREEZE */
1721 		{
1722 			if (c_seg->c_has_donated_pages) {
1723 				assert(new_state == C_ON_SWAPPEDOUT_Q || new_state == C_ON_SWAPPEDOUTSPARSE_Q || new_state == C_ON_SWAPPEDIN_Q);
1724 			} else {
1725 				assert(new_state == C_ON_SWAPPEDOUT_Q || new_state == C_ON_SWAPPEDOUTSPARSE_Q || new_state == C_ON_AGE_Q);
1726 			}
1727 		}
1728 
1729 		queue_remove(&c_swapio_list_head, c_seg, c_segment_t, c_age_list);
1730 		c_swapio_count--;
1731 		break;
1732 
1733 	case C_ON_SWAPPEDOUT_Q:
1734 		assert(new_state == C_ON_SWAPPEDIN_Q || new_state == C_ON_AGE_Q ||
1735 		    new_state == C_ON_SWAPPEDOUTSPARSE_Q ||
1736 		    new_state == C_ON_BAD_Q || new_state == C_IS_EMPTY || new_state == C_IS_FREE);
1737 
1738 		queue_remove(&c_swappedout_list_head, c_seg, c_segment_t, c_age_list);
1739 		c_swappedout_count--;
1740 		break;
1741 
1742 	case C_ON_SWAPPEDOUTSPARSE_Q:
1743 		assert(new_state == C_ON_SWAPPEDIN_Q || new_state == C_ON_AGE_Q ||
1744 		    new_state == C_ON_BAD_Q || new_state == C_IS_EMPTY || new_state == C_IS_FREE);
1745 
1746 		queue_remove(&c_swappedout_sparse_list_head, c_seg, c_segment_t, c_age_list);
1747 		c_swappedout_sparse_count--;
1748 		break;
1749 
1750 	case C_ON_MAJORCOMPACT_Q:
1751 		assert(new_state == C_ON_AGE_Q || new_state == C_IS_FREE);
1752 
1753 		queue_remove(&c_major_list_head, c_seg, c_segment_t, c_age_list);
1754 		c_major_count--;
1755 		break;
1756 
1757 	case C_ON_BAD_Q:
1758 		assert(new_state == C_IS_FREE);
1759 
1760 		queue_remove(&c_bad_list_head, c_seg, c_segment_t, c_age_list);
1761 		c_bad_count--;
1762 		break;
1763 
1764 	default:
1765 		panic("c_seg %p has bad c_state = %d", c_seg, old_state);
1766 	}
1767 
1768 	switch (new_state) {
1769 	case C_IS_FREE:
1770 		assert(old_state != C_IS_FILLING);
1771 
1772 		break;
1773 
1774 	case C_IS_EMPTY:
1775 		assert(old_state == C_ON_SWAPOUT_Q || old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q);
1776 
1777 		c_empty_count++;
1778 		break;
1779 
1780 	case C_IS_FILLING:
1781 		assert(old_state == C_IS_EMPTY);
1782 
1783 		queue_enter(&c_filling_list_head, c_seg, c_segment_t, c_age_list);
1784 		c_filling_count++;
1785 		break;
1786 
1787 	case C_ON_AGE_Q:
1788 		assert(old_state == C_IS_FILLING || old_state == C_ON_SWAPPEDIN_Q ||
1789 		    old_state == C_ON_SWAPOUT_Q || old_state == C_ON_SWAPIO_Q ||
1790 		    old_state == C_ON_MAJORCOMPACT_Q || old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q);
1791 
1792 		assert(!c_seg->c_has_donated_pages);
1793 		if (old_state == C_IS_FILLING) {
1794 			queue_enter(&c_age_list_head, c_seg, c_segment_t, c_age_list);
1795 		} else {
1796 			if (!queue_empty(&c_age_list_head)) {
1797 				c_segment_t     c_first;
1798 
1799 				c_first = (c_segment_t)queue_first(&c_age_list_head);
1800 				c_seg->c_creation_ts = c_first->c_creation_ts;
1801 			}
1802 			queue_enter_first(&c_age_list_head, c_seg, c_segment_t, c_age_list);
1803 		}
1804 		c_age_count++;
1805 		break;
1806 
1807 	case C_ON_SWAPPEDIN_Q:
1808 	{
1809 		queue_head_t *list_head;
1810 
1811 		assert(old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q || old_state == C_ON_SWAPIO_Q);
1812 		if (c_seg->c_has_donated_pages) {
1813 			/* A swapout error can occur while the c_seg is still on the swapio queue */
1814 			list_head = donate_swappedin_list_head;
1815 			*donate_swappedin_count += 1;
1816 		} else {
1817 #if CONFIG_FREEZE
1818 			assert(c_seg->c_has_freezer_pages);
1819 			list_head = &c_early_swappedin_list_head;
1820 			c_early_swappedin_count++;
1821 #else /* CONFIG_FREEZE */
1822 			list_head = &c_regular_swappedin_list_head;
1823 			c_regular_swappedin_count++;
1824 #endif /* CONFIG_FREEZE */
1825 		}
1826 
1827 		if (insert_head == TRUE) {
1828 			queue_enter_first(list_head, c_seg, c_segment_t, c_age_list);
1829 		} else {
1830 			queue_enter(list_head, c_seg, c_segment_t, c_age_list);
1831 		}
1832 		break;
1833 	}
1834 
1835 	case C_ON_SWAPOUT_Q:
1836 	{
1837 		queue_head_t *list_head;
1838 
1839 #if CONFIG_FREEZE
1840 		/*
1841 		 * A segment with both identities (frozen + donated pages)
1842 		 * is put on the early swapout queue, i.e. the frozen identity wins.
1843 		 * This is because, when both identities are set, the donation bit
1844 		 * is added afterwards, in the c_current_seg_filled path, for accounting
1845 		 * purposes.
1846 		 */
1847 		if (c_seg->c_has_freezer_pages) {
1848 			assert(old_state == C_ON_AGE_Q || old_state == C_IS_FILLING);
1849 			list_head = &c_early_swapout_list_head;
1850 			c_early_swapout_count++;
1851 		} else
1852 #endif
1853 		{
1854 			if (c_seg->c_has_donated_pages) {
1855 				assert(old_state == C_ON_SWAPPEDIN_Q || old_state == C_IS_FILLING);
1856 				list_head = donate_swapout_list_head;
1857 				*donate_swapout_count += 1;
1858 			} else {
1859 				assert(old_state == C_ON_AGE_Q || old_state == C_IS_FILLING);
1860 				list_head = &c_regular_swapout_list_head;
1861 				c_regular_swapout_count++;
1862 			}
1863 		}
1864 
1865 		if (insert_head == TRUE) {
1866 			queue_enter_first(list_head, c_seg, c_segment_t, c_age_list);
1867 		} else {
1868 			queue_enter(list_head, c_seg, c_segment_t, c_age_list);
1869 		}
1870 		break;
1871 	}
1872 
1873 	case C_ON_SWAPIO_Q:
1874 		assert(old_state == C_ON_SWAPOUT_Q);
1875 
1876 		if (insert_head == TRUE) {
1877 			queue_enter_first(&c_swapio_list_head, c_seg, c_segment_t, c_age_list);
1878 		} else {
1879 			queue_enter(&c_swapio_list_head, c_seg, c_segment_t, c_age_list);
1880 		}
1881 		c_swapio_count++;
1882 		break;
1883 
1884 	case C_ON_SWAPPEDOUT_Q:
1885 		assert(old_state == C_ON_SWAPIO_Q);
1886 
1887 		if (insert_head == TRUE) {
1888 			queue_enter_first(&c_swappedout_list_head, c_seg, c_segment_t, c_age_list);
1889 		} else {
1890 			queue_enter(&c_swappedout_list_head, c_seg, c_segment_t, c_age_list);
1891 		}
1892 		c_swappedout_count++;
1893 		break;
1894 
1895 	case C_ON_SWAPPEDOUTSPARSE_Q:
1896 		assert(old_state == C_ON_SWAPIO_Q || old_state == C_ON_SWAPPEDOUT_Q);
1897 
1898 		if (insert_head == TRUE) {
1899 			queue_enter_first(&c_swappedout_sparse_list_head, c_seg, c_segment_t, c_age_list);
1900 		} else {
1901 			queue_enter(&c_swappedout_sparse_list_head, c_seg, c_segment_t, c_age_list);
1902 		}
1903 
1904 		c_swappedout_sparse_count++;
1905 		break;
1906 
1907 	case C_ON_MAJORCOMPACT_Q:
1908 		assert(old_state == C_ON_AGE_Q);
1909 		assert(!c_seg->c_has_donated_pages);
1910 
1911 		if (insert_head == TRUE) {
1912 			queue_enter_first(&c_major_list_head, c_seg, c_segment_t, c_age_list);
1913 		} else {
1914 			queue_enter(&c_major_list_head, c_seg, c_segment_t, c_age_list);
1915 		}
1916 		c_major_count++;
1917 		break;
1918 
1919 	case C_ON_BAD_Q:
1920 		assert(old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q);
1921 
1922 		if (insert_head == TRUE) {
1923 			queue_enter_first(&c_bad_list_head, c_seg, c_segment_t, c_age_list);
1924 		} else {
1925 			queue_enter(&c_bad_list_head, c_seg, c_segment_t, c_age_list);
1926 		}
1927 		c_bad_count++;
1928 		break;
1929 
1930 	default:
1931 		panic("c_seg %p requesting bad c_state = %d", c_seg, new_state);
1932 	}
1933 	c_seg->c_state = new_state;
1934 }
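/*
 * Illustrative sketch (not part of the numbered source above): a typical
 * state transition. Except when the new state is C_IS_FILLING, both
 * c_list_lock and the segment lock are held across the call (see the
 * DEVELOPMENT || DEBUG asserts at the top of c_seg_switch_state), mirroring
 * the usage in vm_consider_swapping() later in this file.
 */
#if 0   /* example only */
	lck_mtx_lock_spin_always(c_list_lock);
	lck_mtx_lock_spin_always(&c_seg->c_lock);

	/* e.g. pull a ripe segment off the major-compact queue and re-age it */
	c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);

	lck_mtx_unlock_always(&c_seg->c_lock);
	lck_mtx_unlock_always(c_list_lock);
#endif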
1935 
1936 
1937 
1938 void
1939 c_seg_free(c_segment_t c_seg)
1940 {
1941 	assert(c_seg->c_busy);
1942 
1943 	lck_mtx_unlock_always(&c_seg->c_lock);
1944 	lck_mtx_lock_spin_always(c_list_lock);
1945 	lck_mtx_lock_spin_always(&c_seg->c_lock);
1946 
1947 	c_seg_free_locked(c_seg);
1948 }
1949 
1950 
1951 void
1952 c_seg_free_locked(c_segment_t c_seg)
1953 {
1954 	int             segno;
1955 	int             pages_populated = 0;
1956 	int32_t         *c_buffer = NULL;
1957 	uint64_t        c_swap_handle = 0;
1958 
1959 	assert(c_seg->c_busy);
1960 	assert(c_seg->c_slots_used == 0);
1961 	assert(!c_seg->c_on_minorcompact_q);
1962 	assert(!c_seg->c_busy_swapping);
1963 
1964 	if (c_seg->c_overage_swap == TRUE) {
1965 		c_overage_swapped_count--;
1966 		c_seg->c_overage_swap = FALSE;
1967 	}
1968 	if (!(C_SEG_IS_ONDISK(c_seg))) {
1969 		c_buffer = c_seg->c_store.c_buffer;
1970 	} else {
1971 		c_swap_handle = c_seg->c_store.c_swap_handle;
1972 	}
1973 
1974 	c_seg_switch_state(c_seg, C_IS_FREE, FALSE);
1975 
1976 	if (c_buffer) {
1977 		pages_populated = (round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset))) / PAGE_SIZE;
1978 		c_seg->c_store.c_buffer = NULL;
1979 	} else {
1980 #if CONFIG_FREEZE
1981 		c_seg_update_task_owner(c_seg, NULL);
1982 #endif /* CONFIG_FREEZE */
1983 
1984 		c_seg->c_store.c_swap_handle = (uint64_t)-1;
1985 	}
1986 
1987 	lck_mtx_unlock_always(&c_seg->c_lock);
1988 
1989 	lck_mtx_unlock_always(c_list_lock);
1990 
1991 	if (c_buffer) {
1992 		if (pages_populated) {
1993 			kernel_memory_depopulate((vm_offset_t)c_buffer,
1994 			    ptoa(pages_populated), KMA_COMPRESSOR,
1995 			    VM_KERN_MEMORY_COMPRESSOR);
1996 		}
1997 	} else if (c_swap_handle) {
1998 		/*
1999 		 * Free swap space on disk.
2000 		 */
2001 		vm_swap_free(c_swap_handle);
2002 	}
2003 	lck_mtx_lock_spin_always(&c_seg->c_lock);
2004 	/*
2005 	 * c_seg must remain busy until
2006 	 * after the call to vm_swap_free
2007 	 */
2008 	C_SEG_WAKEUP_DONE(c_seg);
2009 	lck_mtx_unlock_always(&c_seg->c_lock);
2010 
2011 	segno = c_seg->c_mysegno;
2012 
2013 	lck_mtx_lock_spin_always(c_list_lock);
2014 	/*
2015 	 * because the c_buffer is now associated with the segno,
2016 	 * we can't put the segno back on the free list until
2017 	 * after we have depopulated the c_buffer range, or
2018 	 * we run the risk of depopulating a range that is
2019 	 * now being used in one of the compressor heads
2020 	 */
2021 	c_segments[segno].c_segno = c_free_segno_head;
2022 	c_free_segno_head = segno;
2023 	c_segment_count--;
2024 
2025 	lck_mtx_unlock_always(c_list_lock);
2026 
2027 	lck_mtx_destroy(&c_seg->c_lock, &vm_compressor_lck_grp);
2028 
2029 	if (c_seg->c_slot_var_array_len) {
2030 		kfree_data(c_seg->c_slot_var_array,
2031 		    sizeof(struct c_slot) * c_seg->c_slot_var_array_len);
2032 	}
2033 
2034 	zfree(compressor_segment_zone, c_seg);
2035 }
2036 
2037 #if DEVELOPMENT || DEBUG
2038 int c_seg_trim_page_count = 0;
2039 #endif
2040 
2041 void
2042 c_seg_trim_tail(c_segment_t c_seg)
2043 {
2044 	c_slot_t        cs;
2045 	uint32_t        c_size;
2046 	uint32_t        c_offset;
2047 	uint32_t        c_rounded_size;
2048 	uint16_t        current_nextslot;
2049 	uint32_t        current_populated_offset;
2050 
2051 	if (c_seg->c_bytes_used == 0) {
2052 		return;
2053 	}
2054 	current_nextslot = c_seg->c_nextslot;
2055 	current_populated_offset = c_seg->c_populated_offset;
2056 
2057 	while (c_seg->c_nextslot) {
2058 		cs = C_SEG_SLOT_FROM_INDEX(c_seg, (c_seg->c_nextslot - 1));
2059 
2060 		c_size = UNPACK_C_SIZE(cs);
2061 
2062 		if (c_size) {
2063 			if (current_nextslot != c_seg->c_nextslot) {
2064 				c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;
2065 				c_offset = cs->c_offset + C_SEG_BYTES_TO_OFFSET(c_rounded_size);
2066 
2067 				c_seg->c_nextoffset = c_offset;
2068 				c_seg->c_populated_offset = (c_offset + (C_SEG_BYTES_TO_OFFSET(PAGE_SIZE) - 1)) &
2069 				    ~(C_SEG_BYTES_TO_OFFSET(PAGE_SIZE) - 1);
2070 
2071 				if (c_seg->c_firstemptyslot > c_seg->c_nextslot) {
2072 					c_seg->c_firstemptyslot = c_seg->c_nextslot;
2073 				}
2074 #if DEVELOPMENT || DEBUG
2075 				c_seg_trim_page_count += ((round_page_32(C_SEG_OFFSET_TO_BYTES(current_populated_offset)) -
2076 				    round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset))) /
2077 				    PAGE_SIZE);
2078 #endif
2079 			}
2080 			break;
2081 		}
2082 		c_seg->c_nextslot--;
2083 	}
2084 	assert(c_seg->c_nextslot);
2085 }
2086 
2087 
2088 int
2089 c_seg_minor_compaction_and_unlock(c_segment_t c_seg, boolean_t clear_busy)
2090 {
2091 	c_slot_mapping_t slot_ptr;
2092 	uint32_t        c_offset = 0;
2093 	uint32_t        old_populated_offset;
2094 	uint32_t        c_rounded_size;
2095 	uint32_t        c_size;
2096 	uint16_t        c_indx = 0;
2097 	int             i;
2098 	c_slot_t        c_dst;
2099 	c_slot_t        c_src;
2100 
2101 	assert(c_seg->c_busy);
2102 
2103 #if VALIDATE_C_SEGMENTS
2104 	c_seg_validate(c_seg, FALSE);
2105 #endif
2106 	if (c_seg->c_bytes_used == 0) {
2107 		c_seg_free(c_seg);
2108 		return 1;
2109 	}
2110 	lck_mtx_unlock_always(&c_seg->c_lock);
2111 
2112 	if (c_seg->c_firstemptyslot >= c_seg->c_nextslot || C_SEG_UNUSED_BYTES(c_seg) < PAGE_SIZE) {
2113 		goto done;
2114 	}
2115 
2116 /* TODO: assert first emptyslot's c_size is actually 0 */
2117 
2118 #if DEVELOPMENT || DEBUG
2119 	C_SEG_MAKE_WRITEABLE(c_seg);
2120 #endif
2121 
2122 #if VALIDATE_C_SEGMENTS
2123 	c_seg->c_was_minor_compacted++;
2124 #endif
2125 	c_indx = c_seg->c_firstemptyslot;
2126 	c_dst = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
2127 
2128 	old_populated_offset = c_seg->c_populated_offset;
2129 	c_offset = c_dst->c_offset;
2130 
2131 	for (i = c_indx + 1; i < c_seg->c_nextslot && c_offset < c_seg->c_nextoffset; i++) {
2132 		c_src = C_SEG_SLOT_FROM_INDEX(c_seg, i);
2133 
2134 		c_size = UNPACK_C_SIZE(c_src);
2135 
2136 		if (c_size == 0) {
2137 			continue;
2138 		}
2139 
2140 		c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;
2141 /* N.B.: This memcpy may be an overlapping copy */
2142 		memcpy(&c_seg->c_store.c_buffer[c_offset], &c_seg->c_store.c_buffer[c_src->c_offset], c_rounded_size);
2143 
2144 		cslot_copy(c_dst, c_src);
2145 		c_dst->c_offset = c_offset;
2146 
2147 		slot_ptr = C_SLOT_UNPACK_PTR(c_dst);
2148 		slot_ptr->s_cindx = c_indx;
2149 
2150 		c_offset += C_SEG_BYTES_TO_OFFSET(c_rounded_size);
2151 		PACK_C_SIZE(c_src, 0);
2152 		c_indx++;
2153 
2154 		c_dst = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
2155 	}
2156 	c_seg->c_firstemptyslot = c_indx;
2157 	c_seg->c_nextslot = c_indx;
2158 	c_seg->c_nextoffset = c_offset;
2159 	c_seg->c_populated_offset = (c_offset + (C_SEG_BYTES_TO_OFFSET(PAGE_SIZE) - 1)) & ~(C_SEG_BYTES_TO_OFFSET(PAGE_SIZE) - 1);
2160 	c_seg->c_bytes_unused = 0;
2161 
2162 #if VALIDATE_C_SEGMENTS
2163 	c_seg_validate(c_seg, TRUE);
2164 #endif
2165 	if (old_populated_offset > c_seg->c_populated_offset) {
2166 		uint32_t        gc_size;
2167 		int32_t         *gc_ptr;
2168 
2169 		gc_size = C_SEG_OFFSET_TO_BYTES(old_populated_offset - c_seg->c_populated_offset);
2170 		gc_ptr = &c_seg->c_store.c_buffer[c_seg->c_populated_offset];
2171 
2172 		kernel_memory_depopulate((vm_offset_t)gc_ptr, gc_size,
2173 		    KMA_COMPRESSOR, VM_KERN_MEMORY_COMPRESSOR);
2174 	}
2175 
2176 #if DEVELOPMENT || DEBUG
2177 	C_SEG_WRITE_PROTECT(c_seg);
2178 #endif
2179 
2180 done:
2181 	if (clear_busy == TRUE) {
2182 		lck_mtx_lock_spin_always(&c_seg->c_lock);
2183 		C_SEG_WAKEUP_DONE(c_seg);
2184 		lck_mtx_unlock_always(&c_seg->c_lock);
2185 	}
2186 	return 0;
2187 }
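/*
 * Worked example (assumed mask value, not from the numbered source) for the
 * round-up pattern used above: with a hypothetical C_SEG_OFFSET_ALIGNMENT_MASK
 * of 0x3 (4-byte alignment), a 13-byte compressed slot occupies
 *   (13 + 0x3) & ~0x3 == 16
 * bytes in the segment buffer. c_populated_offset is rounded the same way,
 * but to a PAGE_SIZE boundary, so memory is only depopulated in whole pages.
 */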
2188 
2189 
2190 static void
2191 c_seg_alloc_nextslot(c_segment_t c_seg)
2192 {
2193 	struct c_slot   *old_slot_array = NULL;
2194 	struct c_slot   *new_slot_array = NULL;
2195 	int             newlen;
2196 	int             oldlen;
2197 
2198 	if (c_seg->c_nextslot < c_seg_fixed_array_len) {
2199 		return;
2200 	}
2201 
2202 	if ((c_seg->c_nextslot - c_seg_fixed_array_len) >= c_seg->c_slot_var_array_len) {
2203 		oldlen = c_seg->c_slot_var_array_len;
2204 		old_slot_array = c_seg->c_slot_var_array;
2205 
2206 		if (oldlen == 0) {
2207 			newlen = c_seg_slot_var_array_min_len;
2208 		} else {
2209 			newlen = oldlen * 2;
2210 		}
2211 
2212 		new_slot_array = kalloc_data(sizeof(struct c_slot) * newlen,
2213 		    Z_WAITOK);
2214 
2215 		lck_mtx_lock_spin_always(&c_seg->c_lock);
2216 
2217 		if (old_slot_array) {
2218 			memcpy(new_slot_array, old_slot_array,
2219 			    sizeof(struct c_slot) * oldlen);
2220 		}
2221 
2222 		c_seg->c_slot_var_array_len = newlen;
2223 		c_seg->c_slot_var_array = new_slot_array;
2224 
2225 		lck_mtx_unlock_always(&c_seg->c_lock);
2226 
2227 		kfree_data(old_slot_array, sizeof(struct c_slot) * oldlen);
2228 	}
2229 }
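/*
 * Illustrative note (hypothetical starting length): the variable slot array
 * grows geometrically. If, say, c_seg_slot_var_array_min_len were 32, the
 * successive allocations would be 32, 64, 128, ... entries; the old contents
 * are copied under the segment lock, and the old array is freed only after
 * the segment points at the new one.
 */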
2230 
2231 
2232 #define C_SEG_MAJOR_COMPACT_STATS_MAX   (30)
2233 
2234 struct {
2235 	uint64_t asked_permission;
2236 	uint64_t compactions;
2237 	uint64_t moved_slots;
2238 	uint64_t moved_bytes;
2239 	uint64_t wasted_space_in_swapouts;
2240 	uint64_t count_of_swapouts;
2241 	uint64_t count_of_freed_segs;
2242 	uint64_t bailed_compactions;
2243 	uint64_t bytes_freed_rate_us;
2244 } c_seg_major_compact_stats[C_SEG_MAJOR_COMPACT_STATS_MAX];
2245 
2246 int c_seg_major_compact_stats_now = 0;
2247 
2248 
2249 #define C_MAJOR_COMPACTION_SIZE_APPROPRIATE     ((c_seg_bufsize * 90) / 100)
2250 
2251 
2252 boolean_t
2253 c_seg_major_compact_ok(
2254 	c_segment_t c_seg_dst,
2255 	c_segment_t c_seg_src)
2256 {
2257 	c_seg_major_compact_stats[c_seg_major_compact_stats_now].asked_permission++;
2258 
2259 	if (c_seg_src->c_bytes_used >= C_MAJOR_COMPACTION_SIZE_APPROPRIATE &&
2260 	    c_seg_dst->c_bytes_used >= C_MAJOR_COMPACTION_SIZE_APPROPRIATE) {
2261 		return FALSE;
2262 	}
2263 
2264 	if (c_seg_dst->c_nextoffset >= c_seg_off_limit || c_seg_dst->c_nextslot >= C_SLOT_MAX_INDEX) {
2265 		/*
2266 		 * destination segment is full... can't compact
2267 		 */
2268 		return FALSE;
2269 	}
2270 
2271 	return TRUE;
2272 }
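/*
 * Worked example (hypothetical buffer size): assuming a 64KB segment buffer,
 * C_MAJOR_COMPACTION_SIZE_APPROPRIATE == (65536 * 90) / 100 == 58982 bytes.
 * If both the source and destination segments already hold at least that many
 * used bytes, a major compaction would gain little, so permission is denied;
 * it is also denied when the destination has no room left for offsets or slots.
 */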
2273 
2274 
2275 boolean_t
2276 c_seg_major_compact(
2277 	c_segment_t c_seg_dst,
2278 	c_segment_t c_seg_src)
2279 {
2280 	c_slot_mapping_t slot_ptr;
2281 	uint32_t        c_rounded_size;
2282 	uint32_t        c_size;
2283 	uint16_t        dst_slot;
2284 	int             i;
2285 	c_slot_t        c_dst;
2286 	c_slot_t        c_src;
2287 	boolean_t       keep_compacting = TRUE;
2288 
2289 	/*
2290 	 * segments are not locked but they are both marked c_busy
2291 	 * which keeps c_decompress from working on them...
2292 	 * we can safely allocate new pages, move compressed data
2293 	 * from c_seg_src to c_seg_dst and update both c_segments'
2294 	 * state w/o holding the master lock
2295 	 */
2296 #if DEVELOPMENT || DEBUG
2297 	C_SEG_MAKE_WRITEABLE(c_seg_dst);
2298 #endif
2299 
2300 #if VALIDATE_C_SEGMENTS
2301 	c_seg_dst->c_was_major_compacted++;
2302 	c_seg_src->c_was_major_donor++;
2303 #endif
2304 	assertf(c_seg_dst->c_has_donated_pages == c_seg_src->c_has_donated_pages, "Mismatched donation status Dst: %p, Src: %p\n", c_seg_dst, c_seg_src);
2305 	c_seg_major_compact_stats[c_seg_major_compact_stats_now].compactions++;
2306 
2307 	dst_slot = c_seg_dst->c_nextslot;
2308 
2309 	for (i = 0; i < c_seg_src->c_nextslot; i++) {
2310 		c_src = C_SEG_SLOT_FROM_INDEX(c_seg_src, i);
2311 
2312 		c_size = UNPACK_C_SIZE(c_src);
2313 
2314 		if (c_size == 0) {
2315 			/* BATCH: move what we have so far; */
2316 			continue;
2317 		}
2318 
2319 		if (C_SEG_OFFSET_TO_BYTES(c_seg_dst->c_populated_offset - c_seg_dst->c_nextoffset) < (unsigned) c_size) {
2320 			int     size_to_populate;
2321 
2322 			/* doesn't fit */
2323 			size_to_populate = c_seg_bufsize - C_SEG_OFFSET_TO_BYTES(c_seg_dst->c_populated_offset);
2324 
2325 			if (size_to_populate == 0) {
2326 				/* can't fit */
2327 				keep_compacting = FALSE;
2328 				break;
2329 			}
2330 			if (size_to_populate > C_SEG_MAX_POPULATE_SIZE) {
2331 				size_to_populate = C_SEG_MAX_POPULATE_SIZE;
2332 			}
2333 
2334 			kernel_memory_populate(
2335 				(vm_offset_t) &c_seg_dst->c_store.c_buffer[c_seg_dst->c_populated_offset],
2336 				size_to_populate,
2337 				KMA_NOFAIL | KMA_COMPRESSOR,
2338 				VM_KERN_MEMORY_COMPRESSOR);
2339 
2340 			c_seg_dst->c_populated_offset += C_SEG_BYTES_TO_OFFSET(size_to_populate);
2341 			assert(C_SEG_OFFSET_TO_BYTES(c_seg_dst->c_populated_offset) <= c_seg_bufsize);
2342 		}
2343 		c_seg_alloc_nextslot(c_seg_dst);
2344 
2345 		c_dst = C_SEG_SLOT_FROM_INDEX(c_seg_dst, c_seg_dst->c_nextslot);
2346 
2347 		memcpy(&c_seg_dst->c_store.c_buffer[c_seg_dst->c_nextoffset], &c_seg_src->c_store.c_buffer[c_src->c_offset], c_size);
2348 
2349 		c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;
2350 
2351 		c_seg_major_compact_stats[c_seg_major_compact_stats_now].moved_slots++;
2352 		c_seg_major_compact_stats[c_seg_major_compact_stats_now].moved_bytes += c_size;
2353 
2354 		cslot_copy(c_dst, c_src);
2355 		c_dst->c_offset = c_seg_dst->c_nextoffset;
2356 
2357 		if (c_seg_dst->c_firstemptyslot == c_seg_dst->c_nextslot) {
2358 			c_seg_dst->c_firstemptyslot++;
2359 		}
2360 		c_seg_dst->c_slots_used++;
2361 		c_seg_dst->c_nextslot++;
2362 		c_seg_dst->c_bytes_used += c_rounded_size;
2363 		c_seg_dst->c_nextoffset += C_SEG_BYTES_TO_OFFSET(c_rounded_size);
2364 
2365 		PACK_C_SIZE(c_src, 0);
2366 
2367 		c_seg_src->c_bytes_used -= c_rounded_size;
2368 		c_seg_src->c_bytes_unused += c_rounded_size;
2369 		c_seg_src->c_firstemptyslot = 0;
2370 
2371 		assert(c_seg_src->c_slots_used);
2372 		c_seg_src->c_slots_used--;
2373 
2374 		if (!c_seg_src->c_swappedin) {
2375 			/* Pessimistically lose swappedin status when non-swappedin pages are added. */
2376 			c_seg_dst->c_swappedin = false;
2377 		}
2378 
2379 		if (c_seg_dst->c_nextoffset >= c_seg_off_limit || c_seg_dst->c_nextslot >= C_SLOT_MAX_INDEX) {
2380 			/* dest segment is now full */
2381 			keep_compacting = FALSE;
2382 			break;
2383 		}
2384 	}
2385 #if DEVELOPMENT || DEBUG
2386 	C_SEG_WRITE_PROTECT(c_seg_dst);
2387 #endif
2388 	if (dst_slot < c_seg_dst->c_nextslot) {
2389 		PAGE_REPLACEMENT_ALLOWED(TRUE);
2390 		/*
2391 		 * we've now locked out c_decompress from
2392 		 * converting the slot passed into it into
2393 		 * a c_segment_t which allows us to use
2394 		 * the backptr to change which c_segment and
2395 		 * index the slot points to
2396 		 */
2397 		while (dst_slot < c_seg_dst->c_nextslot) {
2398 			c_dst = C_SEG_SLOT_FROM_INDEX(c_seg_dst, dst_slot);
2399 
2400 			slot_ptr = C_SLOT_UNPACK_PTR(c_dst);
2401 			/* <csegno=0,indx=0> would mean "empty slot", so use csegno+1 */
2402 			slot_ptr->s_cseg = c_seg_dst->c_mysegno + 1;
2403 			slot_ptr->s_cindx = dst_slot++;
2404 		}
2405 		PAGE_REPLACEMENT_ALLOWED(FALSE);
2406 	}
2407 	return keep_compacting;
2408 }
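/*
 * Illustrative note (hypothetical numbers): after the copy loop, every moved
 * slot's backing pointer is re-aimed at the destination segment. Because
 * <csegno=0,indx=0> encodes "empty slot", the segment number is stored biased
 * by one: a slot moved into a destination with c_mysegno 7 at index 42 ends
 * up with s_cseg == 8 and s_cindx == 42.
 */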
2409 
2410 
2411 uint64_t
2412 vm_compressor_compute_elapsed_msecs(clock_sec_t end_sec, clock_nsec_t end_nsec, clock_sec_t start_sec, clock_nsec_t start_nsec)
2413 {
2414 	uint64_t end_msecs;
2415 	uint64_t start_msecs;
2416 
2417 	end_msecs = (end_sec * 1000) + end_nsec / 1000000;
2418 	start_msecs = (start_sec * 1000) + start_nsec / 1000000;
2419 
2420 	return end_msecs - start_msecs;
2421 }
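/*
 * Worked example (hypothetical timestamps): for end = (12 s, 500,000,000 ns)
 * and start = (10 s, 250,000,000 ns), the result is
 *   (12 * 1000 + 500) - (10 * 1000 + 250) == 2250 ms.
 * Note that the arithmetic truncates sub-millisecond remainders.
 */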
2422 
2423 
2424 
2425 uint32_t compressor_eval_period_in_msecs = 250;
2426 uint32_t compressor_sample_min_in_msecs = 500;
2427 uint32_t compressor_sample_max_in_msecs = 10000;
2428 uint32_t compressor_thrashing_threshold_per_10msecs = 50;
2429 uint32_t compressor_thrashing_min_per_10msecs = 20;
2430 
2431 /* When true, reset sample data next chance we get. */
2432 static boolean_t        compressor_need_sample_reset = FALSE;
2433 
2434 
2435 void
2436 compute_swapout_target_age(void)
2437 {
2438 	clock_sec_t     cur_ts_sec;
2439 	clock_nsec_t    cur_ts_nsec;
2440 	uint32_t        min_operations_needed_in_this_sample;
2441 	uint64_t        elapsed_msecs_in_eval;
2442 	uint64_t        elapsed_msecs_in_sample;
2443 	boolean_t       need_eval_reset = FALSE;
2444 
2445 	clock_get_system_nanotime(&cur_ts_sec, &cur_ts_nsec);
2446 
2447 	elapsed_msecs_in_sample = vm_compressor_compute_elapsed_msecs(cur_ts_sec, cur_ts_nsec, start_of_sample_period_sec, start_of_sample_period_nsec);
2448 
2449 	if (compressor_need_sample_reset ||
2450 	    elapsed_msecs_in_sample >= compressor_sample_max_in_msecs) {
2451 		compressor_need_sample_reset = TRUE;
2452 		need_eval_reset = TRUE;
2453 		goto done;
2454 	}
2455 	elapsed_msecs_in_eval = vm_compressor_compute_elapsed_msecs(cur_ts_sec, cur_ts_nsec, start_of_eval_period_sec, start_of_eval_period_nsec);
2456 
2457 	if (elapsed_msecs_in_eval < compressor_eval_period_in_msecs) {
2458 		goto done;
2459 	}
2460 	need_eval_reset = TRUE;
2461 
2462 	KERNEL_DEBUG(0xe0400020 | DBG_FUNC_START, elapsed_msecs_in_eval, sample_period_compression_count, sample_period_decompression_count, 0, 0);
2463 
2464 	min_operations_needed_in_this_sample = (compressor_thrashing_min_per_10msecs * (uint32_t)elapsed_msecs_in_eval) / 10;
2465 
2466 	if ((sample_period_compression_count - last_eval_compression_count) < min_operations_needed_in_this_sample ||
2467 	    (sample_period_decompression_count - last_eval_decompression_count) < min_operations_needed_in_this_sample) {
2468 		KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, sample_period_compression_count - last_eval_compression_count,
2469 		    sample_period_decompression_count - last_eval_decompression_count, 0, 1, 0);
2470 
2471 		swapout_target_age = 0;
2472 
2473 		compressor_need_sample_reset = TRUE;
2474 		need_eval_reset = TRUE;
2475 		goto done;
2476 	}
2477 	last_eval_compression_count = sample_period_compression_count;
2478 	last_eval_decompression_count = sample_period_decompression_count;
2479 
2480 	if (elapsed_msecs_in_sample < compressor_sample_min_in_msecs) {
2481 		KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, swapout_target_age, 0, 0, 5, 0);
2482 		goto done;
2483 	}
2484 	if (sample_period_decompression_count > ((compressor_thrashing_threshold_per_10msecs * elapsed_msecs_in_sample) / 10)) {
2485 		uint64_t        running_total;
2486 		uint64_t        working_target;
2487 		uint64_t        aging_target;
2488 		uint32_t        oldest_age_of_csegs_sampled = 0;
2489 		uint64_t        working_set_approximation = 0;
2490 
2491 		swapout_target_age = 0;
2492 
2493 		working_target = (sample_period_decompression_count / 100) * 95;                /* 95 percent */
2494 		aging_target = (sample_period_decompression_count / 100) * 1;                   /* 1 percent */
2495 		running_total = 0;
2496 
2497 		for (oldest_age_of_csegs_sampled = 0; oldest_age_of_csegs_sampled < DECOMPRESSION_SAMPLE_MAX_AGE; oldest_age_of_csegs_sampled++) {
2498 			running_total += age_of_decompressions_during_sample_period[oldest_age_of_csegs_sampled];
2499 
2500 			working_set_approximation += oldest_age_of_csegs_sampled * age_of_decompressions_during_sample_period[oldest_age_of_csegs_sampled];
2501 
2502 			if (running_total >= working_target) {
2503 				break;
2504 			}
2505 		}
2506 		if (oldest_age_of_csegs_sampled < DECOMPRESSION_SAMPLE_MAX_AGE) {
2507 			working_set_approximation = (working_set_approximation * 1000) / elapsed_msecs_in_sample;
2508 
2509 			if (working_set_approximation < VM_PAGE_COMPRESSOR_COUNT) {
2510 				running_total = overage_decompressions_during_sample_period;
2511 
2512 				for (oldest_age_of_csegs_sampled = DECOMPRESSION_SAMPLE_MAX_AGE - 1; oldest_age_of_csegs_sampled; oldest_age_of_csegs_sampled--) {
2513 					running_total += age_of_decompressions_during_sample_period[oldest_age_of_csegs_sampled];
2514 
2515 					if (running_total >= aging_target) {
2516 						break;
2517 					}
2518 				}
2519 				swapout_target_age = (uint32_t)cur_ts_sec - oldest_age_of_csegs_sampled;
2520 
2521 				KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, swapout_target_age, working_set_approximation, VM_PAGE_COMPRESSOR_COUNT, 2, 0);
2522 			} else {
2523 				KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, working_set_approximation, VM_PAGE_COMPRESSOR_COUNT, 0, 3, 0);
2524 			}
2525 		} else {
2526 			KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, working_target, running_total, 0, 4, 0);
2527 		}
2528 
2529 		compressor_need_sample_reset = TRUE;
2530 		need_eval_reset = TRUE;
2531 	} else {
2532 		KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, sample_period_decompression_count, (compressor_thrashing_threshold_per_10msecs * elapsed_msecs_in_sample) / 10, 0, 6, 0);
2533 	}
2534 done:
2535 	if (compressor_need_sample_reset == TRUE) {
2536 		bzero(age_of_decompressions_during_sample_period, sizeof(age_of_decompressions_during_sample_period));
2537 		overage_decompressions_during_sample_period = 0;
2538 
2539 		start_of_sample_period_sec = cur_ts_sec;
2540 		start_of_sample_period_nsec = cur_ts_nsec;
2541 		sample_period_decompression_count = 0;
2542 		sample_period_compression_count = 0;
2543 		last_eval_decompression_count = 0;
2544 		last_eval_compression_count = 0;
2545 		compressor_need_sample_reset = FALSE;
2546 	}
2547 	if (need_eval_reset == TRUE) {
2548 		start_of_eval_period_sec = cur_ts_sec;
2549 		start_of_eval_period_nsec = cur_ts_nsec;
2550 	}
2551 }
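/*
 * Illustrative walk-through of the histogram scan above, using hypothetical
 * numbers: with 1000 decompressions sampled, working_target == 950 (95%) and
 * aging_target == 10 (1%). The first loop walks the age histogram from
 * youngest to oldest until it has covered 950 decompressions, accumulating an
 * age-weighted sum as a working-set approximation. If that approximation,
 * scaled to a per-second rate, fits within VM_PAGE_COMPRESSOR_COUNT, the
 * second loop walks from the oldest bucket downward until it has covered the
 * oldest 10 decompressions; swapout_target_age is then set to the current
 * time minus that age, making segments older than that swapout candidates.
 */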
2552 
2553 
2554 int             compaction_swapper_init_now = 0;
2555 int             compaction_swapper_running = 0;
2556 int             compaction_swapper_awakened = 0;
2557 int             compaction_swapper_abort = 0;
2558 
2559 bool
2560 vm_compressor_swapout_is_ripe()
2561 {
2562 	bool is_ripe = false;
2563 	if (vm_swapout_ripe_segments == TRUE && c_overage_swapped_count < c_overage_swapped_limit) {
2564 		c_segment_t     c_seg;
2565 		clock_sec_t     now;
2566 		clock_sec_t     age;
2567 		clock_nsec_t    nsec;
2568 
2569 		clock_get_system_nanotime(&now, &nsec);
2570 		age = 0;
2571 
2572 		lck_mtx_lock_spin_always(c_list_lock);
2573 
2574 		if (!queue_empty(&c_age_list_head)) {
2575 			c_seg = (c_segment_t) queue_first(&c_age_list_head);
2576 
2577 			age = now - c_seg->c_creation_ts;
2578 		}
2579 		lck_mtx_unlock_always(c_list_lock);
2580 
2581 		if (age >= vm_ripe_target_age) {
2582 			is_ripe = true;
2583 		}
2584 	}
2585 	return is_ripe;
2586 }
2587 
2588 static bool
2589 compressor_swapout_conditions_met(void)
2590 {
2591 	bool should_swap = false;
2592 	if (COMPRESSOR_NEEDS_TO_SWAP()) {
2593 		should_swap = true;
2594 		vmcs_stats.compressor_swap_threshold_exceeded++;
2595 	}
2596 	if (VM_PAGE_Q_THROTTLED(&vm_pageout_queue_external) && vm_page_anonymous_count < (vm_page_inactive_count / 20)) {
2597 		should_swap = true;
2598 		vmcs_stats.external_q_throttled++;
2599 	}
2600 	if (vm_page_free_count < (vm_page_free_reserved - (COMPRESSOR_FREE_RESERVED_LIMIT * 2))) {
2601 		should_swap = true;
2602 		vmcs_stats.free_count_below_reserve++;
2603 	}
2604 	return should_swap;
2605 }
2606 
2607 static bool
2608 compressor_needs_to_swap()
2609 {
2610 	bool should_swap = false;
2611 	if (vm_compressor_swapout_is_ripe()) {
2612 		should_swap = true;
2613 		goto check_if_low_space;
2614 	}
2615 
2616 	if (VM_CONFIG_SWAP_IS_ACTIVE) {
2617 		should_swap =  compressor_swapout_conditions_met();
2618 		if (should_swap) {
2619 			goto check_if_low_space;
2620 		}
2621 	}
2622 
2623 #if (XNU_TARGET_OS_OSX && __arm64__)
2624 	/*
2625 	 * Thrashing detection disabled.
2626 	 */
2627 #else /* (XNU_TARGET_OS_OSX && __arm64__) */
2628 
2629 	if (vm_compressor_is_thrashing()) {
2630 		should_swap = true;
2631 		vmcs_stats.thrashing_detected++;
2632 	}
2633 
2634 #if CONFIG_PHANTOM_CACHE
2635 	if (vm_phantom_cache_check_pressure()) {
2636 		os_atomic_store(&memorystatus_phantom_cache_pressure, true, release);
2637 		should_swap = true;
2638 	}
2639 #endif
2640 	if (swapout_target_age) {
2641 		should_swap = true;
2642 	}
2643 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
2644 
2645 check_if_low_space:
2646 
2647 #if CONFIG_JETSAM
2648 	if (should_swap || vm_compressor_low_on_space() == TRUE) {
2649 		if (vm_compressor_thrashing_detected == FALSE) {
2650 			vm_compressor_thrashing_detected = TRUE;
2651 
2652 			if (swapout_target_age) {
2653 				compressor_thrashing_induced_jetsam++;
2654 			} else if (vm_compressor_low_on_space() == TRUE) {
2655 				compressor_thrashing_induced_jetsam++;
2656 			} else {
2657 				filecache_thrashing_induced_jetsam++;
2658 			}
2659 			/*
2660 			 * Wake up the memorystatus thread so that it can return
2661 			 * the system to a healthy state (by killing processes).
2662 			 */
2663 			memorystatus_thread_wake();
2664 		}
2665 		/*
2666 		 * let the jetsam take precedence over
2667 		 * any major compactions we might have
2668 		 * been able to do... otherwise we run
2669 		 * the risk of doing major compactions
2670 		 * on segments we're about to free up
2671 		 * due to the jetsam activity.
2672 		 */
2673 		should_swap = false;
2674 		if (memorystatus_swap_all_apps && vm_swap_low_on_space()) {
2675 			vm_compressor_take_paging_space_action();
2676 		}
2677 	}
2678 
2679 #else /* CONFIG_JETSAM */
2680 	if (should_swap && vm_swap_low_on_space()) {
2681 		vm_compressor_take_paging_space_action();
2682 	}
2683 #endif /* CONFIG_JETSAM */
2684 
2685 	if (should_swap == false) {
2686 		/*
2687 		 * vm_compressor_needs_to_major_compact returns true only if we're
2688 		 * about to run out of available compressor segments... in this
2689 		 * case, we absolutely need to run a major compaction even if
2690 		 * we've just kicked off a jetsam or we don't otherwise need to
2691 		 * swap... terminating objects releases
2692 		 * pages back to the uncompressed cache, but does not guarantee
2693 		 * that we will free up even a single compression segment
2694 		 */
2695 		should_swap = vm_compressor_needs_to_major_compact();
2696 		if (should_swap) {
2697 			vmcs_stats.fragmentation_detected++;
2698 		}
2699 	}
2700 
2701 	/*
2702 	 * returning TRUE when swap_supported == FALSE
2703 	 * will cause the major compaction engine to
2704 	 * run, but will not trigger any swapping...
2705 	 * segments that have been major compacted
2706 	 * will be moved to the majorcompact queue
2707 	 */
2708 	return should_swap;
2709 }
2710 
2711 #if CONFIG_JETSAM
2712 /*
2713  * This function is called from the jetsam thread after killing something to
2714  * mitigate thrashing.
2715  *
2716  * We need to restart our thrashing detection heuristics since memory pressure
2717  * has potentially changed significantly, and we don't want to detect on old
2718  * data from before the jetsam.
2719  */
2720 void
2721 vm_thrashing_jetsam_done(void)
2722 {
2723 	vm_compressor_thrashing_detected = FALSE;
2724 
2725 	/* Were we compressor-thrashing or filecache-thrashing? */
2726 	if (swapout_target_age) {
2727 		swapout_target_age = 0;
2728 		compressor_need_sample_reset = TRUE;
2729 	}
2730 #if CONFIG_PHANTOM_CACHE
2731 	else {
2732 		vm_phantom_cache_restart_sample();
2733 	}
2734 #endif
2735 }
2736 #endif /* CONFIG_JETSAM */
2737 
2738 uint32_t vm_wake_compactor_swapper_calls = 0;
2739 uint32_t vm_run_compactor_already_running = 0;
2740 uint32_t vm_run_compactor_empty_minor_q = 0;
2741 uint32_t vm_run_compactor_did_compact = 0;
2742 uint32_t vm_run_compactor_waited = 0;
2743 
2744 void
2745 vm_run_compactor(void)
2746 {
2747 	if (c_segment_count == 0) {
2748 		return;
2749 	}
2750 
2751 	lck_mtx_lock_spin_always(c_list_lock);
2752 
2753 	if (c_minor_count == 0) {
2754 		vm_run_compactor_empty_minor_q++;
2755 
2756 		lck_mtx_unlock_always(c_list_lock);
2757 		return;
2758 	}
2759 	if (compaction_swapper_running) {
2760 		if (vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
2761 			vm_run_compactor_already_running++;
2762 
2763 			lck_mtx_unlock_always(c_list_lock);
2764 			return;
2765 		}
2766 		vm_run_compactor_waited++;
2767 
2768 		assert_wait((event_t)&compaction_swapper_running, THREAD_UNINT);
2769 
2770 		lck_mtx_unlock_always(c_list_lock);
2771 
2772 		thread_block(THREAD_CONTINUE_NULL);
2773 
2774 		return;
2775 	}
2776 	vm_run_compactor_did_compact++;
2777 
2778 	fastwake_warmup = FALSE;
2779 	compaction_swapper_running = 1;
2780 
2781 	vm_compressor_do_delayed_compactions(FALSE);
2782 
2783 	compaction_swapper_running = 0;
2784 
2785 	lck_mtx_unlock_always(c_list_lock);
2786 
2787 	thread_wakeup((event_t)&compaction_swapper_running);
2788 }
2789 
2790 
2791 void
2792 vm_wake_compactor_swapper(void)
2793 {
2794 	if (compaction_swapper_running || compaction_swapper_awakened || c_segment_count == 0) {
2795 		return;
2796 	}
2797 
2798 	if (c_minor_count || vm_compressor_needs_to_major_compact()) {
2799 		lck_mtx_lock_spin_always(c_list_lock);
2800 
2801 		fastwake_warmup = FALSE;
2802 
2803 		if (compaction_swapper_running == 0 && compaction_swapper_awakened == 0) {
2804 			vm_wake_compactor_swapper_calls++;
2805 
2806 			compaction_swapper_awakened = 1;
2807 			thread_wakeup((event_t)&c_compressor_swap_trigger);
2808 		}
2809 		lck_mtx_unlock_always(c_list_lock);
2810 	}
2811 }
2812 
2813 
2814 void
2815 vm_consider_swapping()
2816 {
2817 	c_segment_t     c_seg, c_seg_next;
2818 	clock_sec_t     now;
2819 	clock_nsec_t    nsec;
2820 
2821 	assert(VM_CONFIG_SWAP_IS_PRESENT);
2822 
2823 	lck_mtx_lock_spin_always(c_list_lock);
2824 
2825 	compaction_swapper_abort = 1;
2826 
2827 	while (compaction_swapper_running) {
2828 		assert_wait((event_t)&compaction_swapper_running, THREAD_UNINT);
2829 
2830 		lck_mtx_unlock_always(c_list_lock);
2831 
2832 		thread_block(THREAD_CONTINUE_NULL);
2833 
2834 		lck_mtx_lock_spin_always(c_list_lock);
2835 	}
2836 	compaction_swapper_abort = 0;
2837 	compaction_swapper_running = 1;
2838 
2839 	vm_swapout_ripe_segments = TRUE;
2840 
2841 	if (!queue_empty(&c_major_list_head)) {
2842 		clock_get_system_nanotime(&now, &nsec);
2843 
2844 		c_seg = (c_segment_t)queue_first(&c_major_list_head);
2845 
2846 		while (!queue_end(&c_major_list_head, (queue_entry_t)c_seg)) {
2847 			if (c_overage_swapped_count >= c_overage_swapped_limit) {
2848 				break;
2849 			}
2850 
2851 			c_seg_next = (c_segment_t) queue_next(&c_seg->c_age_list);
2852 
2853 			if ((now - c_seg->c_creation_ts) >= vm_ripe_target_age) {
2854 				lck_mtx_lock_spin_always(&c_seg->c_lock);
2855 
2856 				c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
2857 
2858 				lck_mtx_unlock_always(&c_seg->c_lock);
2859 			}
2860 			c_seg = c_seg_next;
2861 		}
2862 	}
2863 	vm_compressor_compact_and_swap(FALSE);
2864 
2865 	compaction_swapper_running = 0;
2866 
2867 	vm_swapout_ripe_segments = FALSE;
2868 
2869 	lck_mtx_unlock_always(c_list_lock);
2870 
2871 	thread_wakeup((event_t)&compaction_swapper_running);
2872 }
2873 
2874 
2875 void
2876 vm_consider_waking_compactor_swapper(void)
2877 {
2878 	boolean_t       need_wakeup = FALSE;
2879 
2880 	if (c_segment_count == 0) {
2881 		return;
2882 	}
2883 
2884 	if (compaction_swapper_running || compaction_swapper_awakened) {
2885 		return;
2886 	}
2887 
2888 	if (!compaction_swapper_inited && !compaction_swapper_init_now) {
2889 		compaction_swapper_init_now = 1;
2890 		need_wakeup = TRUE;
2891 	}
2892 
2893 	if (c_minor_count && (COMPRESSOR_NEEDS_TO_MINOR_COMPACT())) {
2894 		need_wakeup = TRUE;
2895 	} else if (compressor_needs_to_swap()) {
2896 		need_wakeup = TRUE;
2897 	} else if (c_minor_count) {
2898 		uint64_t        total_bytes;
2899 
2900 		total_bytes = compressor_object->resident_page_count * PAGE_SIZE_64;
2901 
2902 		if ((total_bytes - compressor_bytes_used) > total_bytes / 10) {
2903 			need_wakeup = TRUE;
2904 		}
2905 	}
2906 	if (need_wakeup == TRUE) {
2907 		lck_mtx_lock_spin_always(c_list_lock);
2908 
2909 		fastwake_warmup = FALSE;
2910 
2911 		if (compaction_swapper_running == 0 && compaction_swapper_awakened == 0) {
2912 			memoryshot(VM_WAKEUP_COMPACTOR_SWAPPER, DBG_FUNC_NONE);
2913 
2914 			compaction_swapper_awakened = 1;
2915 			thread_wakeup((event_t)&c_compressor_swap_trigger);
2916 		}
2917 		lck_mtx_unlock_always(c_list_lock);
2918 	}
2919 }
2920 
2921 
2922 #define C_SWAPOUT_LIMIT                 4
2923 #define DELAYED_COMPACTIONS_PER_PASS    30
2924 
2925 void
2926 vm_compressor_do_delayed_compactions(boolean_t flush_all)
2927 {
2928 	c_segment_t     c_seg;
2929 	int             number_compacted = 0;
2930 	boolean_t       needs_to_swap = FALSE;
2931 	uint32_t        c_swapout_count = 0;
2932 
2933 
2934 	VM_DEBUG_CONSTANT_EVENT(vm_compressor_do_delayed_compactions, VM_COMPRESSOR_DO_DELAYED_COMPACTIONS, DBG_FUNC_START, c_minor_count, flush_all, 0, 0);
2935 
2936 #if XNU_TARGET_OS_OSX
2937 	LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED);
2938 #endif /* XNU_TARGET_OS_OSX */
2939 
2940 	while (!queue_empty(&c_minor_list_head) && needs_to_swap == FALSE) {
2941 		c_seg = (c_segment_t)queue_first(&c_minor_list_head);
2942 
2943 		lck_mtx_lock_spin_always(&c_seg->c_lock);
2944 
2945 		if (c_seg->c_busy) {
2946 			lck_mtx_unlock_always(c_list_lock);
2947 			c_seg_wait_on_busy(c_seg);
2948 			lck_mtx_lock_spin_always(c_list_lock);
2949 
2950 			continue;
2951 		}
2952 		C_SEG_BUSY(c_seg);
2953 
2954 		c_seg_do_minor_compaction_and_unlock(c_seg, TRUE, FALSE, TRUE);
2955 
2956 		c_swapout_count = c_early_swapout_count + c_regular_swapout_count + c_late_swapout_count;
2957 		if (VM_CONFIG_SWAP_IS_ACTIVE && (number_compacted++ > DELAYED_COMPACTIONS_PER_PASS)) {
2958 			if ((flush_all == TRUE || compressor_needs_to_swap()) && c_swapout_count < C_SWAPOUT_LIMIT) {
2959 				needs_to_swap = TRUE;
2960 			}
2961 
2962 			number_compacted = 0;
2963 		}
2964 		lck_mtx_lock_spin_always(c_list_lock);
2965 	}
2966 
2967 	VM_DEBUG_CONSTANT_EVENT(vm_compressor_do_delayed_compactions, VM_COMPRESSOR_DO_DELAYED_COMPACTIONS, DBG_FUNC_END, c_minor_count, number_compacted, needs_to_swap, 0);
2968 }
2969 
2970 int min_csegs_per_major_compaction = DELAYED_COMPACTIONS_PER_PASS;
2971 
2972 static bool
2973 vm_compressor_major_compact_cseg(c_segment_t c_seg, uint32_t* c_seg_considered, bool* bail_wanted_cseg, uint64_t* total_bytes_freed)
2974 {
2975 	/*
2976 	 * Major compaction
2977 	 */
2978 	bool keep_compacting = true, fully_compacted = true;
2979 	queue_head_t *list_head = NULL;
2980 	c_segment_t c_seg_next;
2981 	uint64_t        bytes_to_free = 0, bytes_freed = 0;
2982 	uint32_t        number_considered = 0;
2983 
2984 	if (c_seg->c_state == C_ON_AGE_Q) {
2985 		assert(!c_seg->c_has_donated_pages);
2986 		list_head = &c_age_list_head;
2987 	} else if (c_seg->c_state == C_ON_SWAPPEDIN_Q) {
2988 		assert(c_seg->c_has_donated_pages);
2989 		list_head = &c_late_swappedin_list_head;
2990 	}
2991 
2992 	while (keep_compacting == TRUE) {
2993 		assert(c_seg->c_busy);
2994 
2995 		/* look for another segment to consolidate */
2996 
2997 		c_seg_next = (c_segment_t) queue_next(&c_seg->c_age_list);
2998 
2999 		if (queue_end(list_head, (queue_entry_t)c_seg_next)) {
3000 			break;
3001 		}
3002 
3003 		assert(c_seg_next->c_state == c_seg->c_state);
3004 
3005 		number_considered++;
3006 
3007 		if (c_seg_major_compact_ok(c_seg, c_seg_next) == FALSE) {
3008 			break;
3009 		}
3010 
3011 		lck_mtx_lock_spin_always(&c_seg_next->c_lock);
3012 
3013 		if (c_seg_next->c_busy) {
3014 			/*
3015 			 * We are going to block for our neighbor.
3016 			 * If our c_seg is wanted, we should unbusy
3017 			 * it because we don't know how long we might
3018 			 * have to block here.
3019 			 */
3020 			if (c_seg->c_wanted) {
3021 				lck_mtx_unlock_always(&c_seg_next->c_lock);
3022 				fully_compacted = false;
3023 				c_seg_major_compact_stats[c_seg_major_compact_stats_now].bailed_compactions++;
3024 				*bail_wanted_cseg = true;
3025 				break;
3026 			}
3027 
3028 			lck_mtx_unlock_always(c_list_lock);
3029 
3030 			VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 8, (void*) VM_KERNEL_ADDRPERM(c_seg_next), 0, 0);
3031 
3032 			c_seg_wait_on_busy(c_seg_next);
3033 			lck_mtx_lock_spin_always(c_list_lock);
3034 
3035 			continue;
3036 		}
3037 		/* grab that segment */
3038 		C_SEG_BUSY(c_seg_next);
3039 
3040 		bytes_to_free = C_SEG_OFFSET_TO_BYTES(c_seg_next->c_populated_offset);
3041 		if (c_seg_do_minor_compaction_and_unlock(c_seg_next, FALSE, TRUE, TRUE)) {
3042 			/*
3043 			 * found an empty c_segment and freed it
3044 			 * so we can't continue to use c_seg_next
3045 			 */
3046 			bytes_freed += bytes_to_free;
3047 			c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_freed_segs++;
3048 			continue;
3049 		}
3050 
3051 		/* unlock the list ... */
3052 		lck_mtx_unlock_always(c_list_lock);
3053 
3054 		/* do the major compaction */
3055 
3056 		keep_compacting = c_seg_major_compact(c_seg, c_seg_next);
3057 
3058 		VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 9, keep_compacting, 0, 0);
3059 
3060 		PAGE_REPLACEMENT_DISALLOWED(TRUE);
3061 
3062 		lck_mtx_lock_spin_always(&c_seg_next->c_lock);
3063 		/*
3064 		 * run a minor compaction on the donor segment
3065 		 * since we pulled at least some of its
3066 		 * data into our target...  if we've emptied
3067 		 * it, now is a good time to free it which
3068 		 * c_seg_minor_compaction_and_unlock also takes care of
3069 		 *
3070 		 * by passing TRUE, we ask for c_busy to be cleared
3071 		 * and c_wanted to be taken care of
3072 		 */
3073 		bytes_to_free = C_SEG_OFFSET_TO_BYTES(c_seg_next->c_populated_offset);
3074 		if (c_seg_minor_compaction_and_unlock(c_seg_next, TRUE)) {
3075 			bytes_freed += bytes_to_free;
3076 			c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_freed_segs++;
3077 		} else {
3078 			bytes_to_free -= C_SEG_OFFSET_TO_BYTES(c_seg_next->c_populated_offset);
3079 			bytes_freed += bytes_to_free;
3080 		}
3081 
3082 		PAGE_REPLACEMENT_DISALLOWED(FALSE);
3083 
3084 		/* relock the list */
3085 		lck_mtx_lock_spin_always(c_list_lock);
3086 
3087 		if (c_seg->c_wanted) {
3088 			/*
3089 			 * Our c_seg is in demand. Let's
3090 			 * unbusy it and wakeup the waiters
3091 			 * instead of continuing the compaction
3092 			 * because we could be in this loop
3093 			 * for a while.
3094 			 */
3095 			fully_compacted = false;
3096 			*bail_wanted_cseg = true;
3097 			c_seg_major_compact_stats[c_seg_major_compact_stats_now].bailed_compactions++;
3098 			break;
3099 		}
3100 	} /* major compaction */
3101 
3102 	VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 10, number_considered, *bail_wanted_cseg, 0);
3103 
3104 	*c_seg_considered += number_considered;
3105 	*total_bytes_freed += bytes_freed;
3106 
3107 	lck_mtx_lock_spin_always(&c_seg->c_lock);
3108 	return fully_compacted;
3109 }
3110 
3111 #define TIME_SUB(rsecs, secs, rfrac, frac, unit)                        \
3112 	MACRO_BEGIN                                                     \
3113 	if ((int)((rfrac) -= (frac)) < 0) {                             \
3114 	        (rfrac) += (unit);                                      \
3115 	        (rsecs) -= 1;                                           \
3116 	}                                                               \
3117 	(rsecs) -= (secs);                                              \
3118 	MACRO_END
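/*
 * Worked example (hypothetical timestamps) for TIME_SUB with a fractional
 * borrow: for end = (5 s, 100 ns) and start = (3 s, 900 ns),
 *   TIME_SUB(end_sec, 3, end_nsec, 900, NSEC_PER_SEC)
 * first computes 100 - 900 < 0, so it adds NSEC_PER_SEC and borrows one
 * second, leaving end_sec == 1 and end_nsec == 999,999,200, i.e. an elapsed
 * time of roughly 1.9999992 seconds.
 */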
3119 
3120 #if (XNU_TARGET_OS_OSX && __arm64__)
3121 clock_nsec_t c_process_major_report_over_ms = 9; /* report if over 9 ms */
3122 int c_process_major_yield_after = 1000; /* yield after moving 1,000 segments */
3123 uint64_t c_process_major_reports = 0;
3124 clock_sec_t c_process_major_max_sec = 0;
3125 clock_nsec_t c_process_major_max_nsec = 0;
3126 uint32_t c_process_major_peak_segcount = 0;
3127 static void
3128 vm_compressor_process_major_segments(void)
3129 {
3130 	c_segment_t c_seg = NULL;
3131 	int count = 0, total = 0, breaks = 0;
3132 	clock_sec_t start_sec, end_sec;
3133 	clock_nsec_t start_nsec, end_nsec;
3134 	clock_nsec_t report_over_ns;
3135 
3136 	if (queue_empty(&c_major_list_head)) {
3137 		return;
3138 	}
3139 
3140 	// printf("%s: starting to move segments from MAJORQ to AGEQ\n", __FUNCTION__);
3141 	if (c_process_major_report_over_ms != 0) {
3142 		report_over_ns = c_process_major_report_over_ms * NSEC_PER_MSEC;
3143 	} else {
3144 		report_over_ns = (clock_nsec_t)-1;
3145 	}
3146 	clock_get_system_nanotime(&start_sec, &start_nsec);
3147 	while (!queue_empty(&c_major_list_head)) {
3148 		/* start from the end to preserve aging order */
3149 		c_seg = (c_segment_t)queue_last(&c_major_list_head);
3150 		lck_mtx_lock_spin_always(&c_seg->c_lock);
3151 		c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
3152 		lck_mtx_unlock_always(&c_seg->c_lock);
3153 
3154 		count++;
3155 		if (count == c_process_major_yield_after ||
3156 		    queue_empty(&c_major_list_head)) {
3157 			/* done or time to take a break */
3158 		} else {
3159 			/* keep going */
3160 			continue;
3161 		}
3162 
3163 		total += count;
3164 		clock_get_system_nanotime(&end_sec, &end_nsec);
3165 		TIME_SUB(end_sec, start_sec, end_nsec, start_nsec, NSEC_PER_SEC);
3166 		if (end_sec > c_process_major_max_sec) {
3167 			c_process_major_max_sec = end_sec;
3168 			c_process_major_max_nsec = end_nsec;
3169 		} else if (end_sec == c_process_major_max_sec &&
3170 		    end_nsec > c_process_major_max_nsec) {
3171 			c_process_major_max_nsec = end_nsec;
3172 		}
3173 		if (total > c_process_major_peak_segcount) {
3174 			c_process_major_peak_segcount = total;
3175 		}
3176 		if (end_sec > 0 ||
3177 		    end_nsec >= report_over_ns) {
3178 			/* we used more than expected */
3179 			c_process_major_reports++;
3180 			printf("%s: moved %d/%d segments from MAJORQ to AGEQ in %lu.%09u seconds and %d breaks\n",
3181 			    __FUNCTION__, count, total,
3182 			    end_sec, end_nsec, breaks);
3183 		}
3184 		if (queue_empty(&c_major_list_head)) {
3185 			/* done */
3186 			break;
3187 		}
3188 		/* take a break to allow someone else to grab the lock */
3189 		lck_mtx_unlock_always(c_list_lock);
3190 		mutex_pause(0); /* 10 microseconds */
3191 		lck_mtx_lock_spin_always(c_list_lock);
3192 		/* start again */
3193 		clock_get_system_nanotime(&start_sec, &start_nsec);
3194 		count = 0;
3195 		breaks++;
3196 	}
3197 }
3198 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
3199 
3200 /*
3201  * macOS special swappable csegs -> early_swapin queue
3202  * non-macOS special swappable+non-freezer csegs -> late_swapin queue
3203  * Processing special csegs means minor compacting each cseg and then
3204  * major compacting it and then putting it on the early or late
3205  * (depending on platform) swapout queue.
3206  */
3207 static void
3208 vm_compressor_process_special_swapped_in_segments_locked(void)
3209 {
3210 	c_segment_t c_seg = NULL;
3211 	bool            switch_state = true, bail_wanted_cseg = false;
3212 	unsigned int    number_considered = 0, yield_after_considered_per_pass = 0;
3213 	uint64_t        bytes_freed = 0;
3214 	queue_head_t    *special_swappedin_list_head;
3215 
3216 #if XNU_TARGET_OS_OSX
3217 	special_swappedin_list_head = &c_early_swappedin_list_head;
3218 #else /* XNU_TARGET_OS_OSX */
3219 	if (memorystatus_swap_all_apps) {
3220 		special_swappedin_list_head = &c_late_swappedin_list_head;
3221 	} else {
3222 		/* called on an unsupported config */
3223 		return;
3224 	}
3225 #endif /* XNU_TARGET_OS_OSX */
3226 
3227 	yield_after_considered_per_pass = MAX(min_csegs_per_major_compaction, DELAYED_COMPACTIONS_PER_PASS);
3228 	while (!queue_empty(special_swappedin_list_head)) {
3229 		c_seg = (c_segment_t)queue_first(special_swappedin_list_head);
3230 
3231 		lck_mtx_lock_spin_always(&c_seg->c_lock);
3232 
3233 		if (c_seg->c_busy) {
3234 			lck_mtx_unlock_always(c_list_lock);
3235 			c_seg_wait_on_busy(c_seg);
3236 			lck_mtx_lock_spin_always(c_list_lock);
3237 			continue;
3238 		}
3239 
3240 		C_SEG_BUSY(c_seg);
3241 		lck_mtx_unlock_always(&c_seg->c_lock);
3242 		lck_mtx_unlock_always(c_list_lock);
3243 
3244 		PAGE_REPLACEMENT_DISALLOWED(TRUE);
3245 
3246 		lck_mtx_lock_spin_always(&c_seg->c_lock);
3247 
3248 		if (c_seg_minor_compaction_and_unlock(c_seg, FALSE /*clear busy?*/)) {
3249 			/*
3250 			 * found an empty c_segment and freed it
3251 			 * so go grab the next guy in the queue
3252 			 */
3253 			PAGE_REPLACEMENT_DISALLOWED(FALSE);
3254 			lck_mtx_lock_spin_always(c_list_lock);
3255 			continue;
3256 		}
3257 
3258 		PAGE_REPLACEMENT_DISALLOWED(FALSE);
3259 		lck_mtx_lock_spin_always(c_list_lock);
3260 
3261 		switch_state = vm_compressor_major_compact_cseg(c_seg, &number_considered, &bail_wanted_cseg, &bytes_freed);
3262 		assert(c_seg->c_busy);
3263 		assert(!c_seg->c_on_minorcompact_q);
3264 
3265 		if (switch_state) {
3266 			if (VM_CONFIG_SWAP_IS_ACTIVE || VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
3267 				/*
3268 				 * Ordinarily we let swapped in segments age out + get
3269 				 * major compacted with the rest of the c_segs on the ageQ.
3270 				 * But the early donated c_segs, if well compacted, should be
3271 				 * kept ready to be swapped out if needed. These are typically
3272 				 * describing memory belonging to a leaky app (macOS) or a swap-
3273 				 * capable app (iPadOS) and for the latter we can keep these
3274 				 * around longer because we control the triggers in the memorystatus
3275 				 * subsystem
3276 				 */
3277 				c_seg_switch_state(c_seg, C_ON_SWAPOUT_Q, FALSE);
3278 			}
3279 		}
3280 
3281 		C_SEG_WAKEUP_DONE(c_seg);
3282 
3283 		lck_mtx_unlock_always(&c_seg->c_lock);
3284 
3285 		if (number_considered >= yield_after_considered_per_pass) {
3286 			if (bail_wanted_cseg) {
3287 				/*
3288 				 * We stopped major compactions on a c_seg
3289 				 * that is wanted. We don't know the priority
3290 				 * of the waiter unfortunately but we are at
3291 				 * a very high priority and so, just in case
3292 				 * the waiter is a critical system daemon or
3293 				 * UI thread, let's give up the CPU in case
3294 				 * the system is running a few CPU intensive
3295 				 * tasks.
3296 				 */
3297 				bail_wanted_cseg = false;
3298 				lck_mtx_unlock_always(c_list_lock);
3299 
3300 				mutex_pause(2); /* 100us yield */
3301 
3302 				lck_mtx_lock_spin_always(c_list_lock);
3303 			}
3304 
3305 			number_considered = 0;
3306 		}
3307 	}
3308 }
3309 
3310 void
3311 vm_compressor_process_special_swapped_in_segments(void)
3312 {
3313 	lck_mtx_lock_spin_always(c_list_lock);
3314 	vm_compressor_process_special_swapped_in_segments_locked();
3315 	lck_mtx_unlock_always(c_list_lock);
3316 }
3317 
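/*
 * Regular swapped-in segments get a grace period (in seconds) on the
 * swappedin queue before being aged back onto the age queue below;
 * a full flush (flush_all == TRUE) ignores this limit.
 */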
3318 #define C_SEGMENT_SWAPPEDIN_AGE_LIMIT   10
3319 /*
3320  * Processing regular csegs means aging them.
3321  */
3322 static void
3323 vm_compressor_process_regular_swapped_in_segments(boolean_t flush_all)
3324 {
3325 	c_segment_t     c_seg;
3326 	clock_sec_t     now;
3327 	clock_nsec_t    nsec;
3328 
3329 	clock_get_system_nanotime(&now, &nsec);
3330 
3331 	while (!queue_empty(&c_regular_swappedin_list_head)) {
3332 		c_seg = (c_segment_t)queue_first(&c_regular_swappedin_list_head);
3333 
3334 		if (flush_all == FALSE && (now - c_seg->c_swappedin_ts) < C_SEGMENT_SWAPPEDIN_AGE_LIMIT) {
3335 			break;
3336 		}
3337 
3338 		lck_mtx_lock_spin_always(&c_seg->c_lock);
3339 
3340 		c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
3341 		c_seg->c_agedin_ts = (uint32_t) now;
3342 
3343 		lck_mtx_unlock_always(&c_seg->c_lock);
3344 	}
3345 }
3346 
3347 
3348 extern  int     vm_num_swap_files;
3349 extern  int     vm_num_pinned_swap_files;
3350 extern  int     vm_swappin_enabled;
3351 
3352 extern  unsigned int    vm_swapfile_total_segs_used;
3353 extern  unsigned int    vm_swapfile_total_segs_alloced;
3354 
3355 
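/*
 * Hibernation support: push every dirty compressed segment out to swap.
 * The flow below is roughly:
 *   1. stop the compaction swapper thread and take over its role,
 *   2. move anything parked on the major-compaction queue back to the
 *      age queue so it is eligible for swapout,
 *   3. run vm_compressor_compact_and_swap(TRUE) under a deadline, and
 *   4. wait (with a timeout) for the swapout queues to drain.
 */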
3356 void
3357 vm_compressor_flush(void)
3358 {
3359 	uint64_t        vm_swap_put_failures_at_start;
3360 	wait_result_t   wait_result = 0;
3361 	AbsoluteTime    startTime, endTime;
3362 	clock_sec_t     now_sec;
3363 	clock_nsec_t    now_nsec;
3364 	uint64_t        nsec;
3365 	c_segment_t     c_seg, c_seg_next;
3366 
3367 	HIBLOG("vm_compressor_flush - starting\n");
3368 
3369 	clock_get_uptime(&startTime);
3370 
3371 	lck_mtx_lock_spin_always(c_list_lock);
3372 
3373 	fastwake_warmup = FALSE;
3374 	compaction_swapper_abort = 1;
3375 
3376 	while (compaction_swapper_running) {
3377 		assert_wait((event_t)&compaction_swapper_running, THREAD_UNINT);
3378 
3379 		lck_mtx_unlock_always(c_list_lock);
3380 
3381 		thread_block(THREAD_CONTINUE_NULL);
3382 
3383 		lck_mtx_lock_spin_always(c_list_lock);
3384 	}
3385 	compaction_swapper_abort = 0;
3386 	compaction_swapper_running = 1;
3387 
3388 	hibernate_flushing = TRUE;
3389 	hibernate_no_swapspace = FALSE;
3390 	hibernate_flush_timed_out = FALSE;
3391 	c_generation_id_flush_barrier = c_generation_id + 1000;
3392 
3393 	clock_get_system_nanotime(&now_sec, &now_nsec);
3394 	hibernate_flushing_deadline = now_sec + HIBERNATE_FLUSHING_SECS_TO_COMPLETE;
3395 
3396 	vm_swap_put_failures_at_start = vm_swap_put_failures;
3397 
3398 	/*
3399 	 * We are about to hibernate and so we want all segments flushed to disk.
3400 	 * Segments that are on the major compaction queue won't be considered in
3401 	 * the vm_compressor_compact_and_swap() pass. So we need to bring them to
3402 	 * the ageQ for consideration.
3403 	 */
3404 	if (!queue_empty(&c_major_list_head)) {
3405 		c_seg = (c_segment_t)queue_first(&c_major_list_head);
3406 
3407 		while (!queue_end(&c_major_list_head, (queue_entry_t)c_seg)) {
3408 			c_seg_next = (c_segment_t) queue_next(&c_seg->c_age_list);
3409 			lck_mtx_lock_spin_always(&c_seg->c_lock);
3410 			c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
3411 			lck_mtx_unlock_always(&c_seg->c_lock);
3412 			c_seg = c_seg_next;
3413 		}
3414 	}
3415 	vm_compressor_compact_and_swap(TRUE);
3416 
3417 	while (!queue_empty(&c_early_swapout_list_head) || !queue_empty(&c_regular_swapout_list_head) || !queue_empty(&c_late_swapout_list_head)) {
3418 		assert_wait_timeout((event_t) &compaction_swapper_running, THREAD_INTERRUPTIBLE, 5000, 1000 * NSEC_PER_USEC);
3419 
3420 		lck_mtx_unlock_always(c_list_lock);
3421 
3422 		wait_result = thread_block(THREAD_CONTINUE_NULL);
3423 
3424 		lck_mtx_lock_spin_always(c_list_lock);
3425 
3426 		if (wait_result == THREAD_TIMED_OUT) {
3427 			break;
3428 		}
3429 	}
3430 	hibernate_flushing = FALSE;
3431 	compaction_swapper_running = 0;
3432 
3433 	if (vm_swap_put_failures > vm_swap_put_failures_at_start) {
3434 		HIBLOG("vm_compressor_flush failed to clean %llu segments - vm_page_compressor_count(%d)\n",
3435 		    vm_swap_put_failures - vm_swap_put_failures_at_start, VM_PAGE_COMPRESSOR_COUNT);
3436 	}
3437 
3438 	lck_mtx_unlock_always(c_list_lock);
3439 
3440 	thread_wakeup((event_t)&compaction_swapper_running);
3441 
3442 	clock_get_uptime(&endTime);
3443 	SUB_ABSOLUTETIME(&endTime, &startTime);
3444 	absolutetime_to_nanoseconds(endTime, &nsec);
3445 
3446 	HIBLOG("vm_compressor_flush completed - took %qd msecs - vm_num_swap_files = %d, vm_num_pinned_swap_files = %d, vm_swappin_enabled = %d\n",
3447 	    nsec / 1000000ULL, vm_num_swap_files, vm_num_pinned_swap_files, vm_swappin_enabled);
3448 }
3449 
3450 
3451 int             compaction_swap_trigger_thread_awakened = 0;
3452 
3453 static void
3454 vm_compressor_swap_trigger_thread(void)
3455 {
3456 	current_thread()->options |= TH_OPT_VMPRIV;
3457 
3458 	/*
3459 	 * compaction_swapper_init_now is set when the first call to
3460 	 * vm_consider_waking_compactor_swapper is made from
3461 	 * vm_pageout_scan... since this function is called upon
3462 	 * thread creation, we want to make sure to delay adjusting
3463 	 * the tuneables until we are awakened via vm_pageout_scan
3464 	 * so that we are at a point where the vm_swapfile_open will
3465 	 * be operating on the correct directory (in case the default
3466 	 * of using the VM volume is overridden by the dynamic_pager)
3467 	 */
3468 	if (compaction_swapper_init_now) {
3469 		vm_compaction_swapper_do_init();
3470 
3471 		if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
3472 			thread_vm_bind_group_add();
3473 		}
3474 #if CONFIG_THREAD_GROUPS
3475 		thread_group_vm_add();
3476 #endif
3477 		thread_set_thread_name(current_thread(), "VM_cswap_trigger");
3478 		compaction_swapper_init_now = 0;
3479 	}
3480 	lck_mtx_lock_spin_always(c_list_lock);
3481 
3482 	compaction_swap_trigger_thread_awakened++;
3483 	compaction_swapper_awakened = 0;
3484 
3485 	if (compaction_swapper_running == 0) {
3486 		compaction_swapper_running = 1;
3487 
3488 		vm_compressor_compact_and_swap(FALSE);
3489 
3490 		compaction_swapper_running = 0;
3491 	}
3492 	assert_wait((event_t)&c_compressor_swap_trigger, THREAD_UNINT);
3493 
3494 	if (compaction_swapper_running == 0) {
3495 		thread_wakeup((event_t)&compaction_swapper_running);
3496 	}
3497 
3498 	lck_mtx_unlock_always(c_list_lock);
3499 
3500 	thread_block((thread_continue_t)vm_compressor_swap_trigger_thread);
3501 
3502 	/* NOTREACHED */
3503 }
3504 
3505 
3506 void
3507 vm_compressor_record_warmup_start(void)
3508 {
3509 	c_segment_t     c_seg;
3510 
3511 	lck_mtx_lock_spin_always(c_list_lock);
3512 
3513 	if (first_c_segment_to_warm_generation_id == 0) {
3514 		if (!queue_empty(&c_age_list_head)) {
3515 			c_seg = (c_segment_t)queue_last(&c_age_list_head);
3516 
3517 			first_c_segment_to_warm_generation_id = c_seg->c_generation_id;
3518 		} else {
3519 			first_c_segment_to_warm_generation_id = 0;
3520 		}
3521 
3522 		fastwake_recording_in_progress = TRUE;
3523 	}
3524 	lck_mtx_unlock_always(c_list_lock);
3525 }
3526 
3527 
3528 void
3529 vm_compressor_record_warmup_end(void)
3530 {
3531 	c_segment_t     c_seg;
3532 
3533 	lck_mtx_lock_spin_always(c_list_lock);
3534 
3535 	if (fastwake_recording_in_progress == TRUE) {
3536 		if (!queue_empty(&c_age_list_head)) {
3537 			c_seg = (c_segment_t)queue_last(&c_age_list_head);
3538 
3539 			last_c_segment_to_warm_generation_id = c_seg->c_generation_id;
3540 		} else {
3541 			last_c_segment_to_warm_generation_id = first_c_segment_to_warm_generation_id;
3542 		}
3543 
3544 		fastwake_recording_in_progress = FALSE;
3545 
3546 		HIBLOG("vm_compressor_record_warmup (%qd - %qd)\n", first_c_segment_to_warm_generation_id, last_c_segment_to_warm_generation_id);
3547 	}
3548 	lck_mtx_unlock_always(c_list_lock);
3549 }
3550 
3551 
3552 #define DELAY_TRIM_ON_WAKE_SECS         25
3553 
3554 void
3555 vm_compressor_delay_trim(void)
3556 {
3557 	clock_sec_t     sec;
3558 	clock_nsec_t    nsec;
3559 
3560 	clock_get_system_nanotime(&sec, &nsec);
3561 	dont_trim_until_ts = sec + DELAY_TRIM_ON_WAKE_SECS;
3562 }
3563 
3564 
3565 void
3566 vm_compressor_do_warmup(void)
3567 {
3568 	lck_mtx_lock_spin_always(c_list_lock);
3569 
3570 	if (first_c_segment_to_warm_generation_id == last_c_segment_to_warm_generation_id) {
3571 		first_c_segment_to_warm_generation_id = last_c_segment_to_warm_generation_id = 0;
3572 
3573 		lck_mtx_unlock_always(c_list_lock);
3574 		return;
3575 	}
3576 
3577 	if (compaction_swapper_running == 0 && compaction_swapper_awakened == 0) {
3578 		fastwake_warmup = TRUE;
3579 
3580 		compaction_swapper_awakened = 1;
3581 		thread_wakeup((event_t)&c_compressor_swap_trigger);
3582 	}
3583 	lck_mtx_unlock_always(c_list_lock);
3584 }
3585 
3586 void
3587 do_fastwake_warmup_all(void)
3588 {
3589 	lck_mtx_lock_spin_always(c_list_lock);
3590 
3591 	if (queue_empty(&c_swappedout_list_head) && queue_empty(&c_swappedout_sparse_list_head)) {
3592 		lck_mtx_unlock_always(c_list_lock);
3593 		return;
3594 	}
3595 
3596 	fastwake_warmup = TRUE;
3597 
3598 	do_fastwake_warmup(&c_swappedout_list_head, TRUE);
3599 
3600 	do_fastwake_warmup(&c_swappedout_sparse_list_head, TRUE);
3601 
3602 	fastwake_warmup = FALSE;
3603 
3604 	lck_mtx_unlock_always(c_list_lock);
3605 }
3606 
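/*
 * Warm the compressor pool after wake: swap segments back into memory at a
 * throttled I/O tier. When consider_all_cseg is FALSE only segments whose
 * generation ids fall inside the recorded warmup window are brought in, and
 * the walk stops early if free memory runs low. Expects the c_list_lock to
 * be held on entry; it is dropped and retaken around the I/O and held again
 * on return.
 */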
3607 void
3608 do_fastwake_warmup(queue_head_t *c_queue, boolean_t consider_all_cseg)
3609 {
3610 	c_segment_t     c_seg = NULL;
3611 	AbsoluteTime    startTime, endTime;
3612 	uint64_t        nsec;
3613 
3614 
3615 	HIBLOG("vm_compressor_fastwake_warmup (%qd - %qd) - starting\n", first_c_segment_to_warm_generation_id, last_c_segment_to_warm_generation_id);
3616 
3617 	clock_get_uptime(&startTime);
3618 
3619 	lck_mtx_unlock_always(c_list_lock);
3620 
3621 	proc_set_thread_policy(current_thread(),
3622 	    TASK_POLICY_INTERNAL, TASK_POLICY_IO, THROTTLE_LEVEL_COMPRESSOR_TIER2);
3623 
3624 	PAGE_REPLACEMENT_DISALLOWED(TRUE);
3625 
3626 	lck_mtx_lock_spin_always(c_list_lock);
3627 
3628 	while (!queue_empty(c_queue) && fastwake_warmup == TRUE) {
3629 		c_seg = (c_segment_t) queue_first(c_queue);
3630 
3631 		if (consider_all_cseg == FALSE) {
3632 			if (c_seg->c_generation_id < first_c_segment_to_warm_generation_id ||
3633 			    c_seg->c_generation_id > last_c_segment_to_warm_generation_id) {
3634 				break;
3635 			}
3636 
3637 			if (vm_page_free_count < (AVAILABLE_MEMORY / 4)) {
3638 				break;
3639 			}
3640 		}
3641 
3642 		lck_mtx_lock_spin_always(&c_seg->c_lock);
3643 		lck_mtx_unlock_always(c_list_lock);
3644 
3645 		if (c_seg->c_busy) {
3646 			PAGE_REPLACEMENT_DISALLOWED(FALSE);
3647 			c_seg_wait_on_busy(c_seg);
3648 			PAGE_REPLACEMENT_DISALLOWED(TRUE);
3649 		} else {
3650 			if (c_seg_swapin(c_seg, TRUE, FALSE) == 0) {
3651 				lck_mtx_unlock_always(&c_seg->c_lock);
3652 			}
3653 			c_segment_warmup_count++;
3654 
3655 			PAGE_REPLACEMENT_DISALLOWED(FALSE);
3656 			vm_pageout_io_throttle();
3657 			PAGE_REPLACEMENT_DISALLOWED(TRUE);
3658 		}
3659 		lck_mtx_lock_spin_always(c_list_lock);
3660 	}
3661 	lck_mtx_unlock_always(c_list_lock);
3662 
3663 	PAGE_REPLACEMENT_DISALLOWED(FALSE);
3664 
3665 	proc_set_thread_policy(current_thread(),
3666 	    TASK_POLICY_INTERNAL, TASK_POLICY_IO, THROTTLE_LEVEL_COMPRESSOR_TIER0);
3667 
3668 	clock_get_uptime(&endTime);
3669 	SUB_ABSOLUTETIME(&endTime, &startTime);
3670 	absolutetime_to_nanoseconds(endTime, &nsec);
3671 
3672 	HIBLOG("vm_compressor_fastwake_warmup completed - took %qd msecs\n", nsec / 1000000ULL);
3673 
3674 	lck_mtx_lock_spin_always(c_list_lock);
3675 
3676 	if (consider_all_cseg == FALSE) {
3677 		first_c_segment_to_warm_generation_id = last_c_segment_to_warm_generation_id = 0;
3678 	}
3679 }
3680 
3681 extern bool     vm_swapout_thread_running;
3682 extern boolean_t        compressor_store_stop_compaction;
3683 
3684 void
3685 vm_compressor_compact_and_swap(boolean_t flush_all)
3686 {
3687 	c_segment_t     c_seg;
3688 	bool            switch_state, bail_wanted_cseg = false;
3689 	clock_sec_t     now;
3690 	clock_nsec_t    nsec;
3691 	mach_timespec_t start_ts, end_ts;
3692 	unsigned int    number_considered, wanted_cseg_found, yield_after_considered_per_pass, number_yields;
3693 	uint64_t        bytes_freed, delta_usec;
3694 	uint32_t        c_swapout_count = 0;
3695 
3696 	VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_START, c_age_count, c_minor_count, c_major_count, vm_page_free_count);
3697 
3698 	if (fastwake_warmup == TRUE) {
3699 		uint64_t        starting_warmup_count;
3700 
3701 		starting_warmup_count = c_segment_warmup_count;
3702 
3703 		KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 11) | DBG_FUNC_START, c_segment_warmup_count,
3704 		    first_c_segment_to_warm_generation_id, last_c_segment_to_warm_generation_id, 0, 0);
3705 		do_fastwake_warmup(&c_swappedout_list_head, FALSE);
3706 		KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 11) | DBG_FUNC_END, c_segment_warmup_count, c_segment_warmup_count - starting_warmup_count, 0, 0, 0);
3707 
3708 		fastwake_warmup = FALSE;
3709 	}
3710 
3711 #if (XNU_TARGET_OS_OSX && __arm64__)
3712 	/*
3713 	 * Re-considering major csegs showed benefits on all platforms by
3714 	 * significantly reducing fragmentation and getting back memory.
3715  * However, on smaller devices, e.g. the watch, there was increased power
3716 	 * use for the additional compactions. And the turnover in csegs on
3717 	 * those smaller platforms is high enough in the decompression/free
3718 	 * path that we can skip reconsidering them here because we already
3719 	 * consider them for major compaction in those paths.
3720 	 */
3721 	vm_compressor_process_major_segments();
3722 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
3723 
3724 	/*
3725 	 * it's possible for the c_age_list_head to be empty if we
3726 	 * hit our limits for growing the compressor pool and we subsequently
3727 	 * hibernated... on the next hibernation we could see the queue as
3728  * empty and not proceed even though we have a bunch of segments on
3729 	 * the swapped in queue that need to be dealt with.
3730 	 */
3731 	vm_compressor_do_delayed_compactions(flush_all);
3732 	vm_compressor_process_special_swapped_in_segments_locked();
3733 	vm_compressor_process_regular_swapped_in_segments(flush_all);
3734 
3735 	/*
3736 	 * we only need to grab the timestamp once per
3737 	 * invocation of this function since the
3738 	 * timescale we're interested in is measured
3739 	 * in days
3740 	 */
3741 	clock_get_system_nanotime(&now, &nsec);
3742 
3743 	start_ts.tv_sec = (int) now;
3744 	start_ts.tv_nsec = nsec;
3745 	delta_usec = 0;
3746 	number_considered = 0;
3747 	wanted_cseg_found = 0;
3748 	number_yields = 0;
3749 	bytes_freed = 0;
3750 	yield_after_considered_per_pass = MAX(min_csegs_per_major_compaction, DELAYED_COMPACTIONS_PER_PASS);
3751 
3752 #if 0
3753 	/**
3754 	 * SW: Need to figure out how to properly rate limit this log because it is currently way too
3755 	 * noisy. rdar://99379414 (Figure out how to rate limit the fragmentation level logging)
3756 	 */
3757 	os_log(OS_LOG_DEFAULT, "memorystatus: before compaction fragmentation level %u\n", vm_compressor_fragmentation_level());
3758 #endif
3759 
3760 	while (!queue_empty(&c_age_list_head) && !compaction_swapper_abort && !compressor_store_stop_compaction) {
3761 		if (hibernate_flushing == TRUE) {
3762 			clock_sec_t     sec;
3763 
3764 			if (hibernate_should_abort()) {
3765 				HIBLOG("vm_compressor_flush - hibernate_should_abort returned TRUE\n");
3766 				break;
3767 			}
3768 			if (hibernate_no_swapspace == TRUE) {
3769 				HIBLOG("vm_compressor_flush - out of swap space\n");
3770 				break;
3771 			}
3772 			if (vm_swap_files_pinned() == FALSE) {
3773 				HIBLOG("vm_compressor_flush - unpinned swap files\n");
3774 				break;
3775 			}
3776 			if (hibernate_in_progress_with_pinned_swap == TRUE &&
3777 			    (vm_swapfile_total_segs_alloced == vm_swapfile_total_segs_used)) {
3778 				HIBLOG("vm_compressor_flush - out of pinned swap space\n");
3779 				break;
3780 			}
3781 			clock_get_system_nanotime(&sec, &nsec);
3782 
3783 			if (sec > hibernate_flushing_deadline) {
3784 				hibernate_flush_timed_out = TRUE;
3785 				HIBLOG("vm_compressor_flush - failed to finish before deadline\n");
3786 				break;
3787 			}
3788 		}
3789 
3790 		c_swapout_count = c_early_swapout_count + c_regular_swapout_count + c_late_swapout_count;
3791 		if (VM_CONFIG_SWAP_IS_ACTIVE && !vm_swap_out_of_space() && c_swapout_count >= C_SWAPOUT_LIMIT) {
3792 			assert_wait_timeout((event_t) &compaction_swapper_running, THREAD_INTERRUPTIBLE, 100, 1000 * NSEC_PER_USEC);
3793 
3794 			if (!vm_swapout_thread_running) {
3795 				thread_wakeup((event_t)&vm_swapout_thread);
3796 			}
3797 
3798 			lck_mtx_unlock_always(c_list_lock);
3799 
3800 			VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 1, c_swapout_count, 0, 0);
3801 
3802 			thread_block(THREAD_CONTINUE_NULL);
3803 
3804 			lck_mtx_lock_spin_always(c_list_lock);
3805 		}
3806 		/*
3807 		 * Minor compactions
3808 		 */
3809 		vm_compressor_do_delayed_compactions(flush_all);
3810 
3811 		/*
3812 		 * vm_compressor_process_early_swapped_in_segments()
3813 		 * might be too aggressive. So OFF for now.
3814 		 */
3815 		vm_compressor_process_regular_swapped_in_segments(flush_all);
3816 
3817 		/* Recompute because we dropped the c_list_lock above */
3818 		c_swapout_count = c_early_swapout_count + c_regular_swapout_count + c_late_swapout_count;
3819 		if (VM_CONFIG_SWAP_IS_ACTIVE && !vm_swap_out_of_space() && c_swapout_count >= C_SWAPOUT_LIMIT) {
3820 			/*
3821 			 * we timed out on the above thread_block
3822 			 * let's loop around and try again
3823 			 * the timeout allows us to continue
3824 			 * to do minor compactions to make
3825 			 * more memory available
3826 			 */
3827 			VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 2, c_swapout_count, 0, 0);
3828 
3829 			continue;
3830 		}
3831 
3832 		/*
3833 		 * Swap out segments?
3834 		 */
3835 		if (flush_all == FALSE) {
3836 			bool needs_to_swap;
3837 
3838 			lck_mtx_unlock_always(c_list_lock);
3839 
3840 			needs_to_swap = compressor_needs_to_swap();
3841 
3842 			lck_mtx_lock_spin_always(c_list_lock);
3843 
3844 			VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 3, needs_to_swap, 0, 0);
3845 
3846 			if (!needs_to_swap) {
3847 				break;
3848 			}
3849 		}
3850 		if (queue_empty(&c_age_list_head)) {
3851 			VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 4, c_age_count, 0, 0);
3852 			break;
3853 		}
3854 		c_seg = (c_segment_t) queue_first(&c_age_list_head);
3855 
3856 		assert(c_seg->c_state == C_ON_AGE_Q);
3857 
3858 		if (flush_all == TRUE && c_seg->c_generation_id > c_generation_id_flush_barrier) {
3859 			VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 5, 0, 0, 0);
3860 			break;
3861 		}
3862 
3863 		lck_mtx_lock_spin_always(&c_seg->c_lock);
3864 
3865 		if (c_seg->c_busy) {
3866 			VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 6, (void*) VM_KERNEL_ADDRPERM(c_seg), 0, 0);
3867 
3868 			lck_mtx_unlock_always(c_list_lock);
3869 			c_seg_wait_on_busy(c_seg);
3870 			lck_mtx_lock_spin_always(c_list_lock);
3871 
3872 			continue;
3873 		}
3874 		C_SEG_BUSY(c_seg);
3875 
3876 		if (c_seg_do_minor_compaction_and_unlock(c_seg, FALSE, TRUE, TRUE)) {
3877 			/*
3878 			 * found an empty c_segment and freed it
3879 			 * so go grab the next guy in the queue
3880 			 */
3881 			VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 7, 0, 0, 0);
3882 			c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_freed_segs++;
3883 			continue;
3884 		}
3885 
3886 		switch_state = vm_compressor_major_compact_cseg(c_seg, &number_considered, &bail_wanted_cseg, &bytes_freed);
3887 		if (bail_wanted_cseg) {
3888 			wanted_cseg_found++;
3889 			bail_wanted_cseg = false;
3890 		}
3891 
3892 		assert(c_seg->c_busy);
3893 		assert(!c_seg->c_on_minorcompact_q);
3894 
3895 		if (switch_state) {
3896 			if (VM_CONFIG_SWAP_IS_ACTIVE) {
3897 				int new_state = C_ON_SWAPOUT_Q;
3898 #if (XNU_TARGET_OS_OSX && __arm64__)
3899 				if (flush_all == false && compressor_swapout_conditions_met() == false) {
3900 					new_state = C_ON_MAJORCOMPACT_Q;
3901 				}
3902 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
3903 
3904 				if (new_state == C_ON_SWAPOUT_Q) {
3905 					/*
3906 					 * This mode of putting a generic c_seg on the swapout list is
3907 					 * only supported when we have general swapping enabled
3908 					 */
3909 					clock_sec_t lnow;
3910 					clock_nsec_t lnsec;
3911 					clock_get_system_nanotime(&lnow, &lnsec);
3912 					if (c_seg->c_agedin_ts && (lnow - c_seg->c_agedin_ts) < 30) {
3913 						vmcs_stats.unripe_under_30s++;
3914 					} else if (c_seg->c_agedin_ts && (lnow - c_seg->c_agedin_ts) < 60) {
3915 						vmcs_stats.unripe_under_60s++;
3916 					} else if (c_seg->c_agedin_ts && (lnow - c_seg->c_agedin_ts) < 300) {
3917 						vmcs_stats.unripe_under_300s++;
3918 					}
3919 				}
3920 
3921 				c_seg_switch_state(c_seg, new_state, FALSE);
3922 			} else {
3923 				if ((vm_swapout_ripe_segments == TRUE && c_overage_swapped_count < c_overage_swapped_limit)) {
3924 					assert(VM_CONFIG_SWAP_IS_PRESENT);
3925 					/*
3926 					 * we are running compressor sweeps with swap-behind
3927 					 * make sure the c_seg has aged enough before swapping it
3928 					 * out...
3929 					 */
3930 					if ((now - c_seg->c_creation_ts) >= vm_ripe_target_age) {
3931 						c_seg->c_overage_swap = TRUE;
3932 						c_overage_swapped_count++;
3933 						c_seg_switch_state(c_seg, C_ON_SWAPOUT_Q, FALSE);
3934 					}
3935 				}
3936 			}
3937 			if (c_seg->c_state == C_ON_AGE_Q) {
3938 				/*
3939 				 * this c_seg didn't get moved to the swapout queue
3940 				 * so we need to move it out of the way...
3941 				 * we just did a major compaction on it so put it
3942 				 * on that queue
3943 				 */
3944 				c_seg_switch_state(c_seg, C_ON_MAJORCOMPACT_Q, FALSE);
3945 			} else {
3946 				c_seg_major_compact_stats[c_seg_major_compact_stats_now].wasted_space_in_swapouts += c_seg_bufsize - c_seg->c_bytes_used;
3947 				c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_swapouts++;
3948 			}
3949 		}
3950 
3951 		C_SEG_WAKEUP_DONE(c_seg);
3952 
3953 		lck_mtx_unlock_always(&c_seg->c_lock);
3954 
3955 		/*
3956 		 * On systems _with_ general swap, regardless of jetsam, we wake up the swapout thread here.
3957 		 * On systems _without_ general swap, it's the responsibility of the memorystatus
3958 		 * subsystem to wake up the swapper.
3959 		 * TODO: When we have full jetsam support on a swap enabled system, we will need to revisit
3960 		 * this policy.
3961 		 */
3962 		if (VM_CONFIG_SWAP_IS_ACTIVE && c_swapout_count) {
3963 			/*
3964 			 * We don't pause/yield here because we will either
3965 			 * yield below or at the top of the loop with the
3966 			 * assert_wait_timeout.
3967 			 */
3968 			if (!vm_swapout_thread_running) {
3969 				thread_wakeup((event_t)&vm_swapout_thread);
3970 			}
3971 		}
3972 
3973 		if (number_considered >= yield_after_considered_per_pass) {
3974 			if (wanted_cseg_found) {
3975 				/*
3976 				 * We stopped major compactions on a c_seg
3977 				 * that is wanted. We don't know the priority
3978 				 * of the waiter unfortunately but we are at
3979 				 * a very high priority and so, just in case
3980 				 * the waiter is a critical system daemon or
3981 				 * UI thread, let's give up the CPU in case
3982 				 * the system is running a few CPU intensive
3983 				 * tasks.
3984 				 */
3985 				lck_mtx_unlock_always(c_list_lock);
3986 
3987 				mutex_pause(2); /* 100us yield */
3988 
3989 				number_yields++;
3990 
3991 				VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 11, number_considered, number_yields, 0);
3992 
3993 				lck_mtx_lock_spin_always(c_list_lock);
3994 			}
3995 
3996 			number_considered = 0;
3997 			wanted_cseg_found = 0;
3998 		}
3999 	}
4000 	clock_get_system_nanotime(&now, &nsec);
4001 
4002 	end_ts = major_compact_ts = (mach_timespec_t){.tv_sec = (int)now, .tv_nsec = nsec};
4003 
4004 	SUB_MACH_TIMESPEC(&end_ts, &start_ts);
4005 
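	/*
	 * Each yield above was a mutex_pause(2), i.e. roughly 100us, so back
	 * those out to approximate the time actually spent compacting.
	 */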
4006 	delta_usec = (end_ts.tv_sec * USEC_PER_SEC) + (end_ts.tv_nsec / NSEC_PER_USEC) - (number_yields * 100);
4007 
4008 	delta_usec = MAX(1, delta_usec); /* we could have 0 usec run if conditions weren't right */
4009 
4010 	c_seg_major_compact_stats[c_seg_major_compact_stats_now].bytes_freed_rate_us = (bytes_freed / delta_usec);
4011 
4012 	if ((c_seg_major_compact_stats_now + 1) == C_SEG_MAJOR_COMPACT_STATS_MAX) {
4013 		c_seg_major_compact_stats_now = 0;
4014 	} else {
4015 		c_seg_major_compact_stats_now++;
4016 	}
4017 
4018 	assert(c_seg_major_compact_stats_now < C_SEG_MAJOR_COMPACT_STATS_MAX);
4019 
4020 	VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_END, c_age_count, c_minor_count, c_major_count, vm_page_free_count);
4021 }
4022 
4023 
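/*
 * Return the segment currently being filled for this compression context,
 * allocating a fresh one when *current_chead is NULL. The c_segments
 * metadata array is populated a page at a time when the free-segment list
 * is exhausted, and NULL is returned once the segment or compressed-page
 * limits are reached. On success the segment is returned locked with page
 * replacement disallowed (see the caller comment in c_compress_page()).
 */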
4024 static c_segment_t
4025 c_seg_allocate(c_segment_t *current_chead)
4026 {
4027 	c_segment_t     c_seg;
4028 	int             min_needed;
4029 	int             size_to_populate;
4030 	c_segment_t     *donate_queue_head;
4031 
4032 #if XNU_TARGET_OS_OSX
4033 	if (vm_compressor_low_on_space()) {
4034 		vm_compressor_take_paging_space_action();
4035 	}
4036 #endif /* XNU_TARGET_OS_OSX */
4037 
4038 	if ((c_seg = *current_chead) == NULL) {
4039 		uint32_t        c_segno;
4040 
4041 		lck_mtx_lock_spin_always(c_list_lock);
4042 
4043 		while (c_segments_busy == TRUE) {
4044 			assert_wait((event_t) (&c_segments_busy), THREAD_UNINT);
4045 
4046 			lck_mtx_unlock_always(c_list_lock);
4047 
4048 			thread_block(THREAD_CONTINUE_NULL);
4049 
4050 			lck_mtx_lock_spin_always(c_list_lock);
4051 		}
4052 		if (c_free_segno_head == (uint32_t)-1) {
4053 			uint32_t        c_segments_available_new;
4054 			uint32_t        compressed_pages;
4055 
4056 #if CONFIG_FREEZE
4057 			if (freezer_incore_cseg_acct) {
4058 				compressed_pages = c_segment_pages_compressed_incore;
4059 			} else {
4060 				compressed_pages = c_segment_pages_compressed;
4061 			}
4062 #else
4063 			compressed_pages = c_segment_pages_compressed;
4064 #endif /* CONFIG_FREEZE */
4065 
4066 			if (c_segments_available >= c_segments_limit || compressed_pages >= c_segment_pages_compressed_limit) {
4067 				lck_mtx_unlock_always(c_list_lock);
4068 
4069 				return NULL;
4070 			}
4071 			c_segments_busy = TRUE;
4072 			lck_mtx_unlock_always(c_list_lock);
4073 
4074 			kernel_memory_populate((vm_offset_t)c_segments_next_page,
4075 			    PAGE_SIZE, KMA_NOFAIL | KMA_KOBJECT,
4076 			    VM_KERN_MEMORY_COMPRESSOR);
4077 			c_segments_next_page += PAGE_SIZE;
4078 
4079 			c_segments_available_new = c_segments_available + C_SEGMENTS_PER_PAGE;
4080 
4081 			if (c_segments_available_new > c_segments_limit) {
4082 				c_segments_available_new = c_segments_limit;
4083 			}
4084 
4085 			for (c_segno = c_segments_available + 1; c_segno < c_segments_available_new; c_segno++) {
4086 				c_segments[c_segno - 1].c_segno = c_segno;
4087 			}
4088 
4089 			lck_mtx_lock_spin_always(c_list_lock);
4090 
4091 			c_segments[c_segno - 1].c_segno = c_free_segno_head;
4092 			c_free_segno_head = c_segments_available;
4093 			c_segments_available = c_segments_available_new;
4094 
4095 			c_segments_busy = FALSE;
4096 			thread_wakeup((event_t) (&c_segments_busy));
4097 		}
4098 		c_segno = c_free_segno_head;
4099 		assert(c_segno >= 0 && c_segno < c_segments_limit);
4100 
4101 		c_free_segno_head = (uint32_t)c_segments[c_segno].c_segno;
4102 
4103 		/*
4104 		 * do the rest of the bookkeeping now while we're still behind
4105 		 * the list lock and grab our generation id now into a local
4106 		 * so that we can install it once we have the c_seg allocated
4107 		 */
4108 		c_segment_count++;
4109 		if (c_segment_count > c_segment_count_max) {
4110 			c_segment_count_max = c_segment_count;
4111 		}
4112 
4113 		lck_mtx_unlock_always(c_list_lock);
4114 
4115 		c_seg = zalloc_flags(compressor_segment_zone, Z_WAITOK | Z_ZERO);
4116 
4117 		c_seg->c_store.c_buffer = (int32_t *)C_SEG_BUFFER_ADDRESS(c_segno);
4118 
4119 		lck_mtx_init(&c_seg->c_lock, &vm_compressor_lck_grp, LCK_ATTR_NULL);
4120 
4121 		c_seg->c_state = C_IS_EMPTY;
4122 		c_seg->c_firstemptyslot = C_SLOT_MAX_INDEX;
4123 		c_seg->c_mysegno = c_segno;
4124 
4125 		lck_mtx_lock_spin_always(c_list_lock);
4126 		c_empty_count++;
4127 		c_seg_switch_state(c_seg, C_IS_FILLING, FALSE);
4128 		c_segments[c_segno].c_seg = c_seg;
4129 		assert(c_segments[c_segno].c_segno > c_segments_available);
4130 		lck_mtx_unlock_always(c_list_lock);
4131 
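		/*
		 * Tag the segment if it is being filled for one of the
		 * per-compressor-thread donation heads (early swapout on macOS,
		 * late swapout when all-app swap is enabled) so its donated
		 * pages can be treated specially (see c_current_seg_filled()).
		 */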
4132 		for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
4133 #if XNU_TARGET_OS_OSX
4134 			donate_queue_head = (c_segment_t*) &(ciq[i].current_early_swapout_chead);
4135 #else /* XNU_TARGET_OS_OSX */
4136 			if (memorystatus_swap_all_apps) {
4137 				donate_queue_head = (c_segment_t*) &(ciq[i].current_late_swapout_chead);
4138 			} else {
4139 				donate_queue_head = NULL;
4140 			}
4141 #endif /* XNU_TARGET_OS_OSX */
4142 
4143 			if (current_chead == donate_queue_head) {
4144 				c_seg->c_has_donated_pages = 1;
4145 				break;
4146 			}
4147 		}
4148 
4149 		*current_chead = c_seg;
4150 
4151 #if DEVELOPMENT || DEBUG
4152 		C_SEG_MAKE_WRITEABLE(c_seg);
4153 #endif
4154 	}
4155 	c_seg_alloc_nextslot(c_seg);
4156 
4157 	size_to_populate = c_seg_allocsize - C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset);
4158 
4159 	if (size_to_populate) {
4160 		min_needed = PAGE_SIZE + (c_seg_allocsize - c_seg_bufsize);
4161 
4162 		if (C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset - c_seg->c_nextoffset) < (unsigned) min_needed) {
4163 			if (size_to_populate > C_SEG_MAX_POPULATE_SIZE) {
4164 				size_to_populate = C_SEG_MAX_POPULATE_SIZE;
4165 			}
4166 
4167 			OSAddAtomic64(size_to_populate / PAGE_SIZE, &vm_pageout_vminfo.vm_compressor_pages_grabbed);
4168 
4169 			kernel_memory_populate(
4170 				(vm_offset_t) &c_seg->c_store.c_buffer[c_seg->c_populated_offset],
4171 				size_to_populate,
4172 				KMA_NOFAIL | KMA_COMPRESSOR,
4173 				VM_KERN_MEMORY_COMPRESSOR);
4174 		} else {
4175 			size_to_populate = 0;
4176 		}
4177 	}
4178 	PAGE_REPLACEMENT_DISALLOWED(TRUE);
4179 
4180 	lck_mtx_lock_spin_always(&c_seg->c_lock);
4181 
4182 	if (size_to_populate) {
4183 		c_seg->c_populated_offset += C_SEG_BYTES_TO_OFFSET(size_to_populate);
4184 	}
4185 
4186 	return c_seg;
4187 }
4188 
4189 #if DEVELOPMENT || DEBUG
4190 #if CONFIG_FREEZE
4191 extern boolean_t memorystatus_freeze_to_memory;
4192 #endif /* CONFIG_FREEZE */
4193 #endif /* DEVELOPMENT || DEBUG */
4194 uint64_t c_seg_total_donated_bytes = 0; /* For testing/debugging only for now. Remove and add new counters for vm_stat. */
4195 
4196 uint64_t c_seg_filled_no_contention = 0;
4197 uint64_t c_seg_filled_contention = 0;
4198 clock_sec_t c_seg_filled_contention_sec_max = 0;
4199 clock_nsec_t c_seg_filled_contention_nsec_max = 0;
4200 
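/*
 * Called (with the c_seg lock held) when the segment being filled is full:
 * depopulate the unused tail pages, stamp the creation time, and move the
 * segment to its next state (normally the age queue; the swapout queue for
 * darkwake, freezer and donated-page segments), waking the swapout thread
 * when necessary. *current_chead is cleared so a new segment gets allocated
 * on the next compression.
 */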
4201 static void
4202 c_current_seg_filled(c_segment_t c_seg, c_segment_t *current_chead)
4203 {
4204 	uint32_t        unused_bytes;
4205 	uint32_t        offset_to_depopulate;
4206 	int             new_state = C_ON_AGE_Q;
4207 	clock_sec_t     sec;
4208 	clock_nsec_t    nsec;
4209 	bool            head_insert = false, wakeup_swapout_thread = false;
4210 
4211 	unused_bytes = trunc_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset - c_seg->c_nextoffset));
4212 
4213 	if (unused_bytes) {
4214 		offset_to_depopulate = C_SEG_BYTES_TO_OFFSET(round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_nextoffset)));
4215 
4216 		/*
4217 		 *  release the extra physical page(s) at the end of the segment
4218 		 */
4219 		lck_mtx_unlock_always(&c_seg->c_lock);
4220 
4221 		kernel_memory_depopulate(
4222 			(vm_offset_t) &c_seg->c_store.c_buffer[offset_to_depopulate],
4223 			unused_bytes,
4224 			KMA_COMPRESSOR,
4225 			VM_KERN_MEMORY_COMPRESSOR);
4226 
4227 		lck_mtx_lock_spin_always(&c_seg->c_lock);
4228 
4229 		c_seg->c_populated_offset = offset_to_depopulate;
4230 	}
4231 	assert(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset) <= c_seg_bufsize);
4232 
4233 #if DEVELOPMENT || DEBUG
4234 	{
4235 		boolean_t       c_seg_was_busy = FALSE;
4236 
4237 		if (!c_seg->c_busy) {
4238 			C_SEG_BUSY(c_seg);
4239 		} else {
4240 			c_seg_was_busy = TRUE;
4241 		}
4242 
4243 		lck_mtx_unlock_always(&c_seg->c_lock);
4244 
4245 		C_SEG_WRITE_PROTECT(c_seg);
4246 
4247 		lck_mtx_lock_spin_always(&c_seg->c_lock);
4248 
4249 		if (c_seg_was_busy == FALSE) {
4250 			C_SEG_WAKEUP_DONE(c_seg);
4251 		}
4252 	}
4253 #endif
4254 
4255 #if CONFIG_FREEZE
4256 	if (current_chead == (c_segment_t*) &(freezer_context_global.freezer_ctx_chead) &&
4257 	    VM_CONFIG_SWAP_IS_PRESENT &&
4258 	    VM_CONFIG_FREEZER_SWAP_IS_ACTIVE
4259 #if DEVELOPMENT || DEBUG
4260 	    && !memorystatus_freeze_to_memory
4261 #endif /* DEVELOPMENT || DEBUG */
4262 	    ) {
4263 		new_state = C_ON_SWAPOUT_Q;
4264 		wakeup_swapout_thread = true;
4265 	}
4266 #endif /* CONFIG_FREEZE */
4267 
4268 	if (vm_darkwake_mode == TRUE) {
4269 		new_state = C_ON_SWAPOUT_Q;
4270 		head_insert = true;
4271 		wakeup_swapout_thread = true;
4272 	} else {
4273 		c_segment_t *donate_queue_head;
4274 		for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
4275 #if XNU_TARGET_OS_OSX
4276 			donate_queue_head = (c_segment_t*) &(ciq[i].current_early_swapout_chead);
4277 #else /* XNU_TARGET_OS_OSX */
4278 			donate_queue_head = (c_segment_t*) &(ciq[i].current_late_swapout_chead);
4279 #endif /* XNU_TARGET_OS_OSX */
4280 
4281 			if (current_chead == donate_queue_head) {
4282 				assert(c_seg->c_has_donated_pages);
4283 				new_state = C_ON_SWAPOUT_Q;
4284 				c_seg_total_donated_bytes += c_seg->c_bytes_used;
4285 				break;
4286 			}
4287 		}
4288 	}
4289 
4290 	clock_get_system_nanotime(&sec, &nsec);
4291 	c_seg->c_creation_ts = (uint32_t)sec;
4292 
4293 	if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
4294 		clock_sec_t     sec2;
4295 		clock_nsec_t    nsec2;
4296 
4297 		lck_mtx_lock_spin_always(c_list_lock);
4298 		clock_get_system_nanotime(&sec2, &nsec2);
4299 		TIME_SUB(sec2, sec, nsec2, nsec, NSEC_PER_SEC);
4300 		// printf("FBDP %s: head %p waited for c_list_lock for %lu.%09u seconds\n", __FUNCTION__, current_chead, sec2, nsec2);
4301 		if (sec2 > c_seg_filled_contention_sec_max) {
4302 			c_seg_filled_contention_sec_max = sec2;
4303 			c_seg_filled_contention_nsec_max = nsec2;
4304 		} else if (sec2 == c_seg_filled_contention_sec_max &&
4305 		    nsec2 > c_seg_filled_contention_nsec_max) {
4306 			c_seg_filled_contention_nsec_max = nsec2;
4307 		}
4308 		c_seg_filled_contention++;
4309 	} else {
4310 		c_seg_filled_no_contention++;
4311 	}
4312 
4313 #if CONFIG_FREEZE
4314 	if (current_chead == (c_segment_t*) &(freezer_context_global.freezer_ctx_chead)) {
4315 		if (freezer_context_global.freezer_ctx_task->donates_own_pages) {
4316 			assert(!c_seg->c_has_donated_pages);
4317 			c_seg->c_has_donated_pages = 1;
4318 			OSAddAtomic(c_seg->c_slots_used, &c_segment_pages_compressed_incore_late_swapout);
4319 		}
4320 		c_seg->c_has_freezer_pages = 1;
4321 	}
4322 #endif /* CONFIG_FREEZE */
4323 
4324 	c_seg->c_generation_id = c_generation_id++;
4325 	c_seg_switch_state(c_seg, new_state, head_insert);
4326 
4327 #if CONFIG_FREEZE
4328 	/*
4329 	 * Donated segments count as frozen to swap if we go through the freezer.
4330 	 * TODO: What we need is a new ledger and cseg state that can describe
4331 	 * a frozen cseg from a donated task so we can accurately decrement it on
4332 	 * swapins.
4333 	 */
4334 	if (current_chead == (c_segment_t*) &(freezer_context_global.freezer_ctx_chead) && (c_seg->c_state == C_ON_SWAPOUT_Q)) {
4335 		/*
4336 		 * darkwake and freezer can't co-exist together
4337 		 * We'll need to fix this accounting as a start.
4338 		 * And early donation c_segs are separate from frozen c_segs.
4339 		 */
4340 		assert(vm_darkwake_mode == FALSE);
4341 		c_seg_update_task_owner(c_seg, freezer_context_global.freezer_ctx_task);
4342 		freezer_context_global.freezer_ctx_swapped_bytes += c_seg->c_bytes_used;
4343 	}
4344 #endif /* CONFIG_FREEZE */
4345 
4346 	if (c_seg->c_state == C_ON_AGE_Q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) {
4347 #if CONFIG_FREEZE
4348 		assert(c_seg->c_task_owner == NULL);
4349 #endif /* CONFIG_FREEZE */
4350 		c_seg_need_delayed_compaction(c_seg, TRUE);
4351 	}
4352 
4353 	lck_mtx_unlock_always(c_list_lock);
4354 
4355 	if (wakeup_swapout_thread) {
4356 		/*
4357 		 * Darkwake and Freeze configs always
4358 		 * wake up the swapout thread because
4359 		 * the compactor thread that normally handles
4360 		 * it may not be running as much in these
4361 		 * configs.
4362 		 */
4363 		thread_wakeup((event_t)&vm_swapout_thread);
4364 	}
4365 
4366 	*current_chead = NULL;
4367 }
4368 
4369 /*
4370  * returns with c_seg locked
4371  */
4372 void
4373 c_seg_swapin_requeue(c_segment_t c_seg, boolean_t has_data, boolean_t minor_compact_ok, boolean_t age_on_swapin_q)
4374 {
4375 	clock_sec_t     sec;
4376 	clock_nsec_t    nsec;
4377 
4378 	clock_get_system_nanotime(&sec, &nsec);
4379 
4380 	lck_mtx_lock_spin_always(c_list_lock);
4381 	lck_mtx_lock_spin_always(&c_seg->c_lock);
4382 
4383 	assert(c_seg->c_busy_swapping);
4384 	assert(c_seg->c_busy);
4385 
4386 	c_seg->c_busy_swapping = 0;
4387 
4388 	if (c_seg->c_overage_swap == TRUE) {
4389 		c_overage_swapped_count--;
4390 		c_seg->c_overage_swap = FALSE;
4391 	}
4392 	if (has_data == TRUE) {
4393 		if (age_on_swapin_q == TRUE || c_seg->c_has_donated_pages) {
4394 #if CONFIG_FREEZE
4395 			/*
4396 			 * If a segment has both identities, frozen and donated bits set, the donated
4397 			 * bit wins on the swapin path. This is because the segment is being swapped back
4398 			 * in and so is in demand and should be given more time to spend in memory before
4399 			 * being swapped back out under pressure.
4400 			 */
4401 			if (c_seg->c_has_donated_pages) {
4402 				c_seg->c_has_freezer_pages = 0;
4403 			}
4404 #endif /* CONFIG_FREEZE */
4405 			c_seg_switch_state(c_seg, C_ON_SWAPPEDIN_Q, FALSE);
4406 		} else {
4407 			c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
4408 		}
4409 
4410 		if (minor_compact_ok == TRUE && !c_seg->c_on_minorcompact_q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) {
4411 			c_seg_need_delayed_compaction(c_seg, TRUE);
4412 		}
4413 	} else {
4414 		c_seg->c_store.c_buffer = (int32_t*) NULL;
4415 		c_seg->c_populated_offset = C_SEG_BYTES_TO_OFFSET(0);
4416 
4417 		c_seg_switch_state(c_seg, C_ON_BAD_Q, FALSE);
4418 	}
4419 	c_seg->c_swappedin_ts = (uint32_t)sec;
4420 	c_seg->c_swappedin = true;
4421 
4422 	lck_mtx_unlock_always(c_list_lock);
4423 }
4424 
4425 
4426 
4427 /*
4428  * c_seg has to be locked and is returned locked if the c_seg isn't freed
4429  * PAGE_REPLACEMENT_DISALLOWED has to be TRUE on entry and is returned TRUE
4430  * c_seg_swapin returns 1 if the c_seg was freed, 0 otherwise
4431  */
4432 
4433 int
4434 c_seg_swapin(c_segment_t c_seg, boolean_t force_minor_compaction, boolean_t age_on_swapin_q)
4435 {
4436 	vm_offset_t     addr = 0;
4437 	uint32_t        io_size = 0;
4438 	uint64_t        f_offset;
4439 	thread_pri_floor_t token;
4440 
4441 	assert(C_SEG_IS_ONDISK(c_seg));
4442 
4443 #if !CHECKSUM_THE_SWAP
4444 	c_seg_trim_tail(c_seg);
4445 #endif
4446 	io_size = round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset));
4447 	f_offset = c_seg->c_store.c_swap_handle;
4448 
4449 	C_SEG_BUSY(c_seg);
4450 	c_seg->c_busy_swapping = 1;
4451 
4452 	/*
4453 	 * This thread is likely going to block for I/O.
4454 	 * Make sure it is ready to run when the I/O completes because
4455 	 * it needs to clear the busy bit on the c_seg so that other
4456 	 * waiting threads can make progress too.
4457 	 */
4458 	token = thread_priority_floor_start();
4459 	lck_mtx_unlock_always(&c_seg->c_lock);
4460 
4461 	PAGE_REPLACEMENT_DISALLOWED(FALSE);
4462 
4463 	addr = (vm_offset_t)C_SEG_BUFFER_ADDRESS(c_seg->c_mysegno);
4464 	c_seg->c_store.c_buffer = (int32_t*) addr;
4465 
4466 	kernel_memory_populate(addr, io_size, KMA_NOFAIL | KMA_COMPRESSOR,
4467 	    VM_KERN_MEMORY_COMPRESSOR);
4468 
4469 	if (vm_swap_get(c_seg, f_offset, io_size) != KERN_SUCCESS) {
4470 		PAGE_REPLACEMENT_DISALLOWED(TRUE);
4471 
4472 		kernel_memory_depopulate(addr, io_size, KMA_COMPRESSOR,
4473 		    VM_KERN_MEMORY_COMPRESSOR);
4474 
4475 		c_seg_swapin_requeue(c_seg, FALSE, TRUE, age_on_swapin_q);
4476 	} else {
4477 #if ENCRYPTED_SWAP
4478 		vm_swap_decrypt(c_seg);
4479 #endif /* ENCRYPTED_SWAP */
4480 
4481 #if CHECKSUM_THE_SWAP
4482 		if (c_seg->cseg_swap_size != io_size) {
4483 			panic("swapin size doesn't match swapout size");
4484 		}
4485 
4486 		if (c_seg->cseg_hash != vmc_hash((char*) c_seg->c_store.c_buffer, (int)io_size)) {
4487 			panic("c_seg_swapin - Swap hash mismatch");
4488 		}
4489 #endif /* CHECKSUM_THE_SWAP */
4490 
4491 		PAGE_REPLACEMENT_DISALLOWED(TRUE);
4492 
4493 		c_seg_swapin_requeue(c_seg, TRUE, force_minor_compaction == TRUE ? FALSE : TRUE, age_on_swapin_q);
4494 
4495 #if CONFIG_FREEZE
4496 		/*
4497 		 * c_seg_swapin_requeue() returns with the c_seg lock held.
4498 		 */
4499 		if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
4500 			assert(c_seg->c_busy);
4501 
4502 			lck_mtx_unlock_always(&c_seg->c_lock);
4503 			lck_mtx_lock_spin_always(c_list_lock);
4504 			lck_mtx_lock_spin_always(&c_seg->c_lock);
4505 		}
4506 
4507 		if (c_seg->c_task_owner) {
4508 			c_seg_update_task_owner(c_seg, NULL);
4509 		}
4510 
4511 		lck_mtx_unlock_always(c_list_lock);
4512 
4513 		OSAddAtomic(c_seg->c_slots_used, &c_segment_pages_compressed_incore);
4514 		if (c_seg->c_has_donated_pages) {
4515 			OSAddAtomic(c_seg->c_slots_used, &c_segment_pages_compressed_incore_late_swapout);
4516 		}
4517 #endif /* CONFIG_FREEZE */
4518 
4519 		OSAddAtomic64(c_seg->c_bytes_used, &compressor_bytes_used);
4520 
4521 		if (force_minor_compaction == TRUE) {
4522 			if (c_seg_minor_compaction_and_unlock(c_seg, FALSE)) {
4523 				/*
4524 				 * c_seg was completely empty so it was freed,
4525 				 * so be careful not to reference it again
4526 				 *
4527 				 * Drop the boost so that the thread priority
4528 				 * is returned back to where it is supposed to be.
4529 				 */
4530 				thread_priority_floor_end(&token);
4531 				return 1;
4532 			}
4533 
4534 			lck_mtx_lock_spin_always(&c_seg->c_lock);
4535 		}
4536 	}
4537 	C_SEG_WAKEUP_DONE(c_seg);
4538 
4539 	/*
4540 	 * Drop the boost so that the thread priority
4541 	 * is returned back to where it is supposed to be.
4542 	 */
4543 	thread_priority_floor_end(&token);
4544 
4545 	return 0;
4546 }
4547 
4548 
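/*
 * Drop one reference on a single-value hash entry. The (ref, data) pair is
 * packed into a single 64-bit record so it can be updated with one
 * compare-and-swap; when the refcount reaches zero the entry is counted out
 * of c_segment_svp_in_hash.
 */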
4549 static void
4550 c_segment_sv_hash_drop_ref(int hash_indx)
4551 {
4552 	struct c_sv_hash_entry o_sv_he, n_sv_he;
4553 
4554 	while (1) {
4555 		o_sv_he.he_record = c_segment_sv_hash_table[hash_indx].he_record;
4556 
4557 		n_sv_he.he_ref = o_sv_he.he_ref - 1;
4558 		n_sv_he.he_data = o_sv_he.he_data;
4559 
4560 		if (OSCompareAndSwap64((UInt64)o_sv_he.he_record, (UInt64)n_sv_he.he_record, (UInt64 *) &c_segment_sv_hash_table[hash_indx].he_record) == TRUE) {
4561 			if (n_sv_he.he_ref == 0) {
4562 				OSAddAtomic(-1, &c_segment_svp_in_hash);
4563 			}
4564 			break;
4565 		}
4566 	}
4567 }
4568 
4569 
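/*
 * Take a reference on the single-value hash entry matching 'data', inserting
 * it if a free slot is found. The table is probed linearly for up to
 * C_SV_HASH_MAX_MISS slots starting at (data & C_SV_HASH_MASK); each claim
 * is a 64-bit compare-and-swap on the packed (ref, data) record. Returns the
 * slot index, or -1 if no reference could be taken.
 */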
4570 static int
4571 c_segment_sv_hash_insert(uint32_t data)
4572 {
4573 	int             hash_sindx;
4574 	int             misses;
4575 	struct c_sv_hash_entry o_sv_he, n_sv_he;
4576 	boolean_t       got_ref = FALSE;
4577 
4578 	if (data == 0) {
4579 		OSAddAtomic(1, &c_segment_svp_zero_compressions);
4580 	} else {
4581 		OSAddAtomic(1, &c_segment_svp_nonzero_compressions);
4582 	}
4583 
4584 	hash_sindx = data & C_SV_HASH_MASK;
4585 
4586 	for (misses = 0; misses < C_SV_HASH_MAX_MISS; misses++) {
4587 		o_sv_he.he_record = c_segment_sv_hash_table[hash_sindx].he_record;
4588 
4589 		while (o_sv_he.he_data == data || o_sv_he.he_ref == 0) {
4590 			n_sv_he.he_ref = o_sv_he.he_ref + 1;
4591 			n_sv_he.he_data = data;
4592 
4593 			if (OSCompareAndSwap64((UInt64)o_sv_he.he_record, (UInt64)n_sv_he.he_record, (UInt64 *) &c_segment_sv_hash_table[hash_sindx].he_record) == TRUE) {
4594 				if (n_sv_he.he_ref == 1) {
4595 					OSAddAtomic(1, &c_segment_svp_in_hash);
4596 				}
4597 				got_ref = TRUE;
4598 				break;
4599 			}
4600 			o_sv_he.he_record = c_segment_sv_hash_table[hash_sindx].he_record;
4601 		}
4602 		if (got_ref == TRUE) {
4603 			break;
4604 		}
4605 		hash_sindx++;
4606 
4607 		if (hash_sindx == C_SV_HASH_SIZE) {
4608 			hash_sindx = 0;
4609 		}
4610 	}
4611 	if (got_ref == FALSE) {
4612 		return -1;
4613 	}
4614 
4615 	return hash_sindx;
4616 }
4617 
4618 
4619 #if RECORD_THE_COMPRESSED_DATA
4620 
4621 static void
4622 c_compressed_record_data(char *src, int c_size)
4623 {
4624 	if ((c_compressed_record_cptr + c_size + 4) >= c_compressed_record_ebuf) {
4625 		panic("c_compressed_record_cptr >= c_compressed_record_ebuf");
4626 	}
4627 
4628 	*(int *)((void *)c_compressed_record_cptr) = c_size;
4629 
4630 	c_compressed_record_cptr += 4;
4631 
4632 	memcpy(c_compressed_record_cptr, src, c_size);
4633 	c_compressed_record_cptr += c_size;
4634 }
4635 #endif
4636 
4637 
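/*
 * Compress one page into the current segment for this context. Returns 0 on
 * success (slot_ptr filled in) or 1 if a segment could not be allocated.
 * Incompressible pages are stored raw (c_size == PAGE_SIZE); pages that are
 * a single repeated 32-bit value (c_size == 0) are deduplicated through the
 * single-value hash when possible, falling back to storing the 4-byte value
 * in the segment.
 */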
4638 static int
4639 c_compress_page(char *src, c_slot_mapping_t slot_ptr, c_segment_t *current_chead, char *scratch_buf)
4640 {
4641 	int             c_size = -1;
4642 	int             c_rounded_size = 0;
4643 	int             max_csize;
4644 	c_slot_t        cs;
4645 	c_segment_t     c_seg;
4646 	bool            single_value = false;
4647 
4648 	KERNEL_DEBUG(0xe0400000 | DBG_FUNC_START, *current_chead, 0, 0, 0, 0);
4649 retry:
4650 	if ((c_seg = c_seg_allocate(current_chead)) == NULL) {
4651 		return 1;
4652 	}
4653 	/*
4654 	 * returns with c_seg lock held
4655 	 * and PAGE_REPLACEMENT_DISALLOWED(TRUE)...
4656 	 * c_nextslot has been allocated and
4657 	 * c_store.c_buffer populated
4658 	 */
4659 	assert(c_seg->c_state == C_IS_FILLING);
4660 
4661 	cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_seg->c_nextslot);
4662 
4663 	C_SLOT_ASSERT_PACKABLE(slot_ptr);
4664 	cs->c_packed_ptr = C_SLOT_PACK_PTR(slot_ptr);
4665 
4666 	cs->c_offset = c_seg->c_nextoffset;
4667 
4668 	max_csize = c_seg_bufsize - C_SEG_OFFSET_TO_BYTES((int32_t)cs->c_offset);
4669 
4670 	if (max_csize > PAGE_SIZE) {
4671 		max_csize = PAGE_SIZE;
4672 	}
4673 
4674 #if CHECKSUM_THE_DATA
4675 	cs->c_hash_data = vmc_hash(src, PAGE_SIZE);
4676 #endif
4677 	boolean_t incomp_copy = FALSE;
4678 	int max_csize_adj = (max_csize - 4);
4679 
4680 	if (vm_compressor_algorithm() != VM_COMPRESSOR_DEFAULT_CODEC) {
4681 #if defined(__arm64__)
4682 		uint16_t ccodec = CINVALID;
4683 		uint32_t inline_popcount;
4684 		if (max_csize >= C_SEG_OFFSET_ALIGNMENT_BOUNDARY) {
4685 			c_size = metacompressor((const uint8_t *) src,
4686 			    (uint8_t *) &c_seg->c_store.c_buffer[cs->c_offset],
4687 			    max_csize_adj, &ccodec,
4688 			    scratch_buf, &incomp_copy, &inline_popcount);
4689 #if __APPLE_WKDM_POPCNT_EXTENSIONS__
4690 			cs->c_inline_popcount = inline_popcount;
4691 #else
4692 			assert(inline_popcount == C_SLOT_NO_POPCOUNT);
4693 #endif
4694 
4695 #if C_SEG_OFFSET_ALIGNMENT_BOUNDARY > 4
4696 			if (c_size > max_csize_adj) {
4697 				c_size = -1;
4698 			}
4699 #endif
4700 		} else {
4701 			c_size = -1;
4702 		}
4703 		assert(ccodec == CCWK || ccodec == CCLZ4);
4704 		cs->c_codec = ccodec;
4705 #endif
4706 	} else {
4707 #if defined(__arm64__)
4708 		cs->c_codec = CCWK;
4709 		__unreachable_ok_push
4710 		if (PAGE_SIZE == 4096) {
4711 			c_size = WKdm_compress_4k((WK_word *)(uintptr_t)src, (WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
4712 			    (WK_word *)(uintptr_t)scratch_buf, max_csize_adj);
4713 		} else {
4714 			c_size = WKdm_compress_16k((WK_word *)(uintptr_t)src, (WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
4715 			    (WK_word *)(uintptr_t)scratch_buf, max_csize_adj);
4716 		}
4717 		__unreachable_ok_pop
4718 #else
4719 		c_size = WKdm_compress_new((const WK_word *)(uintptr_t)src, (WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
4720 		    (WK_word *)(uintptr_t)scratch_buf, max_csize_adj);
4721 #endif
4722 	}
4723 	assertf(((c_size <= max_csize_adj) && (c_size >= -1)),
4724 	    "c_size invalid (%d, %d), cur compressions: %d", c_size, max_csize_adj, c_segment_pages_compressed);
4725 
4726 	if (c_size == -1) {
4727 		if (max_csize < PAGE_SIZE) {
4728 			c_current_seg_filled(c_seg, current_chead);
4729 			assert(*current_chead == NULL);
4730 
4731 			lck_mtx_unlock_always(&c_seg->c_lock);
4732 			/* TODO: it may be worth requiring codecs to distinguish
4733 			 * between incompressible inputs and failures due to
4734 			 * budget exhaustion.
4735 			 */
4736 			PAGE_REPLACEMENT_DISALLOWED(FALSE);
4737 			goto retry;
4738 		}
4739 		c_size = PAGE_SIZE;
4740 
4741 		if (incomp_copy == FALSE) {
4742 			memcpy(&c_seg->c_store.c_buffer[cs->c_offset], src, c_size);
4743 		}
4744 
4745 		OSAddAtomic(1, &c_segment_noncompressible_pages);
4746 	} else if (c_size == 0) {
4747 		int             hash_index;
4748 
4749 		/*
4750 		 * special case - this is a page completely full of a single 32 bit value
4751 		 */
4752 		single_value = true;
4753 		hash_index = c_segment_sv_hash_insert(*(uint32_t *)(uintptr_t)src);
4754 
4755 		if (hash_index != -1) {
4756 			slot_ptr->s_cindx = hash_index;
4757 			slot_ptr->s_cseg = C_SV_CSEG_ID;
4758 
4759 			OSAddAtomic(1, &c_segment_svp_hash_succeeded);
4760 #if RECORD_THE_COMPRESSED_DATA
4761 			c_compressed_record_data(src, 4);
4762 #endif
4763 			goto sv_compression;
4764 		}
4765 		c_size = 4;
4766 
4767 		memcpy(&c_seg->c_store.c_buffer[cs->c_offset], src, c_size);
4768 
4769 		OSAddAtomic(1, &c_segment_svp_hash_failed);
4770 	}
4771 
4772 #if RECORD_THE_COMPRESSED_DATA
4773 	c_compressed_record_data((char *)&c_seg->c_store.c_buffer[cs->c_offset], c_size);
4774 #endif
4775 #if CHECKSUM_THE_COMPRESSED_DATA
4776 	cs->c_hash_compressed_data = vmc_hash((char *)&c_seg->c_store.c_buffer[cs->c_offset], c_size);
4777 #endif
4778 #if POPCOUNT_THE_COMPRESSED_DATA
4779 	cs->c_pop_cdata = vmc_pop((uintptr_t) &c_seg->c_store.c_buffer[cs->c_offset], c_size);
4780 #endif
4781 	c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;
4782 
4783 	PACK_C_SIZE(cs, c_size);
4784 	c_seg->c_bytes_used += c_rounded_size;
4785 	c_seg->c_nextoffset += C_SEG_BYTES_TO_OFFSET(c_rounded_size);
4786 	c_seg->c_slots_used++;
4787 
4788 #if CONFIG_FREEZE
4789 	/* TODO: should c_segment_pages_compressed be up here too? See 88598046 for details */
4790 	OSAddAtomic(1, &c_segment_pages_compressed_incore);
4791 	if (c_seg->c_has_donated_pages) {
4792 		OSAddAtomic(1, &c_segment_pages_compressed_incore_late_swapout);
4793 	}
4794 #endif /* CONFIG_FREEZE */
4795 
4796 	slot_ptr->s_cindx = c_seg->c_nextslot++;
4797 	/* <csegno=0,indx=0> would mean "empty slot", so use csegno+1 */
4798 	slot_ptr->s_cseg = c_seg->c_mysegno + 1;
4799 
4800 sv_compression:
4801 	if (c_seg->c_nextoffset >= c_seg_off_limit || c_seg->c_nextslot >= C_SLOT_MAX_INDEX) {
4802 		c_current_seg_filled(c_seg, current_chead);
4803 		assert(*current_chead == NULL);
4804 	}
4805 
4806 	lck_mtx_unlock_always(&c_seg->c_lock);
4807 
4808 	PAGE_REPLACEMENT_DISALLOWED(FALSE);
4809 
4810 #if RECORD_THE_COMPRESSED_DATA
4811 	if ((c_compressed_record_cptr - c_compressed_record_sbuf) >= c_seg_allocsize) {
4812 		c_compressed_record_write(c_compressed_record_sbuf, (int)(c_compressed_record_cptr - c_compressed_record_sbuf));
4813 		c_compressed_record_cptr = c_compressed_record_sbuf;
4814 	}
4815 #endif
4816 	if (c_size) {
4817 		OSAddAtomic64(c_size, &c_segment_compressed_bytes);
4818 		OSAddAtomic64(c_rounded_size, &compressor_bytes_used);
4819 	}
4820 	OSAddAtomic64(PAGE_SIZE, &c_segment_input_bytes);
4821 
4822 	OSAddAtomic(1, &c_segment_pages_compressed);
4823 #if DEVELOPMENT || DEBUG
4824 	if (!compressor_running_perf_test) {
4825 		/*
4826 		 * The perf_compressor benchmark should not be able to trigger
4827 		 * compressor thrashing jetsams.
4828 		 */
4829 		OSAddAtomic(1, &sample_period_compression_count);
4830 	}
4831 #else /* DEVELOPMENT || DEBUG */
4832 	OSAddAtomic(1, &sample_period_compression_count);
4833 #endif /* DEVELOPMENT || DEBUG */
4834 
4835 	KERNEL_DEBUG(0xe0400000 | DBG_FUNC_END, *current_chead, c_size, c_segment_input_bytes, c_segment_compressed_bytes, 0);
4836 
4837 	return 0;
4838 }
4839 
4840 static inline void
4841 sv_decompress(int32_t *ddst, int32_t pattern)
4842 {
4843 //	assert(__builtin_constant_p(PAGE_SIZE) != 0);
4844 #if defined(__x86_64__)
4845 	memset_word(ddst, pattern, PAGE_SIZE / sizeof(int32_t));
4846 #elif defined(__arm64__)
4847 	assert((PAGE_SIZE % 128) == 0);
4848 	if (pattern == 0) {
4849 		fill32_dczva((addr64_t)ddst, PAGE_SIZE);
4850 	} else {
4851 		fill32_nt((addr64_t)ddst, PAGE_SIZE, pattern);
4852 	}
4853 #else
4854 	size_t          i;
4855 
4856 	/* Unroll the pattern fill loop 4x to encourage the
4857 	 * compiler to emit NEON stores, cf.
4858 	 * <rdar://problem/25839866> Loop autovectorization
4859 	 * anomalies.
4860 	 */
4861 	/* We use separate loops for each PAGE_SIZE
4862 	 * to allow the autovectorizer to engage, as PAGE_SIZE
4863 	 * may not be a constant.
4864 	 */
4865 
4866 	__unreachable_ok_push
4867 	if (PAGE_SIZE == 4096) {
4868 		for (i = 0; i < (4096U / sizeof(int32_t)); i += 4) {
4869 			*ddst++ = pattern;
4870 			*ddst++ = pattern;
4871 			*ddst++ = pattern;
4872 			*ddst++ = pattern;
4873 		}
4874 	} else {
4875 		assert(PAGE_SIZE == 16384);
4876 		for (i = 0; i < (int)(16384U / sizeof(int32_t)); i += 4) {
4877 			*ddst++ = pattern;
4878 			*ddst++ = pattern;
4879 			*ddst++ = pattern;
4880 			*ddst++ = pattern;
4881 		}
4882 	}
4883 	__unreachable_ok_pop
4884 #endif
4885 }
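/*
 * Conceptual sketch (illustrative only, not kernel code): sv_decompress() is
 * the inverse of the single-value compression path.  A page whose 32-bit words
 * are all equal is stored as just that 4-byte pattern, and is recreated by
 * replicating the pattern across the whole page, e.g.:
 *
 *	static void
 *	sv_fill_page(int32_t *dst, int32_t pattern, size_t page_size)
 *	{
 *		for (size_t i = 0; i < page_size / sizeof(int32_t); i++) {
 *			dst[i] = pattern;
 *		}
 *	}
 *
 * The implementations above only differ in how aggressively the fill is
 * optimized for each architecture.
 */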
4886 
4887 static int
4888 c_decompress_page(char *dst, volatile c_slot_mapping_t slot_ptr, int flags, int *zeroslot)
4889 {
4890 	c_slot_t        cs;
4891 	c_segment_t     c_seg;
4892 	uint32_t        c_segno;
4893 	uint16_t        c_indx;
4894 	int             c_rounded_size;
4895 	uint32_t        c_size;
4896 	int             retval = 0;
4897 	boolean_t       need_unlock = TRUE;
4898 	boolean_t       consider_defragmenting = FALSE;
4899 	boolean_t       kdp_mode = FALSE;
4900 
4901 	if (__improbable(flags & C_KDP)) {
4902 		if (not_in_kdp) {
4903 			panic("C_KDP passed to decompress page from outside of debugger context");
4904 		}
4905 
4906 		assert((flags & C_KEEP) == C_KEEP);
4907 		assert((flags & C_DONT_BLOCK) == C_DONT_BLOCK);
4908 
4909 		if ((flags & (C_DONT_BLOCK | C_KEEP)) != (C_DONT_BLOCK | C_KEEP)) {
4910 			return -2;
4911 		}
4912 
4913 		kdp_mode = TRUE;
4914 		*zeroslot = 0;
4915 	}
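	/*
	 * In kdp_mode (debugger context) we must never block: every lock
	 * acquisition below is replaced with a non-blocking "is it already held?"
	 * probe, and any contention makes us bail out with -2 instead of waiting.
	 */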
4916 
4917 ReTry:
4918 	if (__probable(!kdp_mode)) {
4919 		PAGE_REPLACEMENT_DISALLOWED(TRUE);
4920 	} else {
4921 		if (kdp_lck_rw_lock_is_acquired_exclusive(&c_master_lock)) {
4922 			return -2;
4923 		}
4924 	}
4925 
4926 #if HIBERNATION
4927 	/*
4928 	 * If hibernation is enabled, it indicates (via a call
4929 	 * to 'vm_decompressor_lock') that no further
4930 	 * decompressions are allowed once it reaches
4931 	 * the point of flushing all of the currently dirty
4932 	 * anonymous memory through the compressor and out
4933 	 * to disk... in this state we still allow freeing of compressed
4934 	 * pages, but must honor the C_DONT_BLOCK case.
4935 	 */
4936 	if (__improbable(dst && decompressions_blocked == TRUE)) {
4937 		if (flags & C_DONT_BLOCK) {
4938 			if (__probable(!kdp_mode)) {
4939 				PAGE_REPLACEMENT_DISALLOWED(FALSE);
4940 			}
4941 
4942 			*zeroslot = 0;
4943 			return -2;
4944 		}
4945 		/*
4946 		 * it's safe to atomically assert and block behind the
4947 		 * lock held in shared mode because "decompressions_blocked" is
4948 		 * only set and cleared and the thread_wakeup done when the lock
4949 		 * is held exclusively
4950 		 */
4951 		assert_wait((event_t)&decompressions_blocked, THREAD_UNINT);
4952 
4953 		PAGE_REPLACEMENT_DISALLOWED(FALSE);
4954 
4955 		thread_block(THREAD_CONTINUE_NULL);
4956 
4957 		goto ReTry;
4958 	}
4959 #endif
4960 	/* s_cseg is actually "segno+1" */
4961 	c_segno = slot_ptr->s_cseg - 1;
4962 
4963 	if (__improbable(c_segno >= c_segments_available)) {
4964 		panic("c_decompress_page: c_segno %d >= c_segments_available %d, slot_ptr(%p), slot_data(%x)",
4965 		    c_segno, c_segments_available, slot_ptr, *(int *)((void *)slot_ptr));
4966 	}
4967 
4968 	if (__improbable(c_segments[c_segno].c_segno < c_segments_available)) {
4969 		panic("c_decompress_page: c_segno %d is free, slot_ptr(%p), slot_data(%x)",
4970 		    c_segno, slot_ptr, *(int *)((void *)slot_ptr));
4971 	}
4972 
4973 	c_seg = c_segments[c_segno].c_seg;
4974 
4975 	if (__probable(!kdp_mode)) {
4976 		lck_mtx_lock_spin_always(&c_seg->c_lock);
4977 	} else {
4978 		if (kdp_lck_mtx_lock_spin_is_acquired(&c_seg->c_lock)) {
4979 			return -2;
4980 		}
4981 	}
4982 
4983 	assert(c_seg->c_state != C_IS_EMPTY && c_seg->c_state != C_IS_FREE);
4984 
4985 	if (dst == NULL && c_seg->c_busy_swapping) {
4986 		assert(c_seg->c_busy);
4987 
4988 		goto bypass_busy_check;
4989 	}
4990 	if (flags & C_DONT_BLOCK) {
4991 		if (c_seg->c_busy || (C_SEG_IS_ONDISK(c_seg) && dst)) {
4992 			*zeroslot = 0;
4993 
4994 			retval = -2;
4995 			goto done;
4996 		}
4997 	}
4998 	if (c_seg->c_busy) {
4999 		PAGE_REPLACEMENT_DISALLOWED(FALSE);
5000 
5001 		c_seg_wait_on_busy(c_seg);
5002 
5003 		goto ReTry;
5004 	}
5005 bypass_busy_check:
5006 
5007 	c_indx = slot_ptr->s_cindx;
5008 
5009 	if (__improbable(c_indx >= c_seg->c_nextslot)) {
5010 		panic("c_decompress_page: c_indx %d >= c_nextslot %d, c_seg(%p), slot_ptr(%p), slot_data(%x)",
5011 		    c_indx, c_seg->c_nextslot, c_seg, slot_ptr, *(int *)((void *)slot_ptr));
5012 	}
5013 
5014 	cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
5015 
5016 	c_size = UNPACK_C_SIZE(cs);
5017 
5018 	if (__improbable(c_size == 0)) {
5019 		panic("c_decompress_page: c_size == 0, c_seg(%p), slot_ptr(%p), slot_data(%x)",
5020 		    c_seg, slot_ptr, *(int *)((void *)slot_ptr));
5021 	}
5022 
5023 	c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;
5024 
5025 	if (dst) {
5026 		uint32_t        age_of_cseg;
5027 		clock_sec_t     cur_ts_sec;
5028 		clock_nsec_t    cur_ts_nsec;
5029 
5030 		if (C_SEG_IS_ONDISK(c_seg)) {
5031 #if CONFIG_FREEZE
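			/*
			 * Swapping this segment back in will add its pages and the
			 * segment itself to the in-core totals; if that would push us
			 * past the nearing-limit thresholds, trigger a memorystatus
			 * (jetsam) response to make room first, then retry the
			 * decompression from the top.
			 */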
5032 			if (freezer_incore_cseg_acct) {
5033 				if ((c_seg->c_slots_used + c_segment_pages_compressed_incore) >= c_segment_pages_compressed_nearing_limit) {
5034 					PAGE_REPLACEMENT_DISALLOWED(FALSE);
5035 					lck_mtx_unlock_always(&c_seg->c_lock);
5036 
5037 					memorystatus_kill_on_VM_compressor_space_shortage(FALSE /* async */);
5038 
5039 					goto ReTry;
5040 				}
5041 
5042 				uint32_t incore_seg_count = c_segment_count - c_swappedout_count - c_swappedout_sparse_count;
5043 				if ((incore_seg_count + 1) >= c_segments_nearing_limit) {
5044 					PAGE_REPLACEMENT_DISALLOWED(FALSE);
5045 					lck_mtx_unlock_always(&c_seg->c_lock);
5046 
5047 					memorystatus_kill_on_VM_compressor_space_shortage(FALSE /* async */);
5048 
5049 					goto ReTry;
5050 				}
5051 			}
5052 #endif /* CONFIG_FREEZE */
5053 			assert(kdp_mode == FALSE);
5054 			retval = c_seg_swapin(c_seg, FALSE, TRUE);
5055 			assert(retval == 0);
5056 
5057 			retval = 1;
5058 		}
5059 		if (c_seg->c_state == C_ON_BAD_Q) {
5060 			assert(c_seg->c_store.c_buffer == NULL);
5061 			*zeroslot = 0;
5062 
5063 			retval = -1;
5064 			goto done;
5065 		}
5066 
5067 #if POPCOUNT_THE_COMPRESSED_DATA
5068 		unsigned csvpop;
5069 		uintptr_t csvaddr = (uintptr_t) &c_seg->c_store.c_buffer[cs->c_offset];
5070 		if (cs->c_pop_cdata != (csvpop = vmc_pop(csvaddr, c_size))) {
5071 			panic("Compressed data popcount doesn't match original, bit distance: %d %p (phys: %p) %p %p 0x%x 0x%x 0x%x 0x%x", (csvpop - cs->c_pop_cdata), (void *)csvaddr, (void *) kvtophys(csvaddr), c_seg, cs, cs->c_offset, c_size, csvpop, cs->c_pop_cdata);
5072 		}
5073 #endif
5074 
5075 #if CHECKSUM_THE_COMPRESSED_DATA
5076 		unsigned csvhash;
5077 		if (cs->c_hash_compressed_data != (csvhash = vmc_hash((char *)&c_seg->c_store.c_buffer[cs->c_offset], c_size))) {
5078 			panic("Compressed data doesn't match original %p %p %u %u %u", c_seg, cs, c_size, cs->c_hash_compressed_data, csvhash);
5079 		}
5080 #endif
5081 		if (c_rounded_size == PAGE_SIZE) {
5082 			/*
5083 			 * page wasn't compressible... just copy it out
5084 			 */
5085 			memcpy(dst, &c_seg->c_store.c_buffer[cs->c_offset], PAGE_SIZE);
5086 		} else if (c_size == 4) {
5087 			int32_t         data;
5088 			int32_t         *dptr;
5089 
5090 			/*
5091 			 * page was populated with a single value
5092 			 * that didn't fit into our fast hash,
5093 			 * so we stored it as a single non-compressed value
5094 			 * which we now replicate across the destination page
5095 			 */
5096 			dptr = (int32_t *)(uintptr_t)dst;
5097 			data = *(int32_t *)(&c_seg->c_store.c_buffer[cs->c_offset]);
5098 			sv_decompress(dptr, data);
5099 		} else {
5100 			uint32_t        my_cpu_no;
5101 			char            *scratch_buf;
5102 
5103 			if (__probable(!kdp_mode)) {
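			/*
			 * Pick a decode scratch area: compressor_scratch_bufs is carved
			 * into one slice of vm_compressor_get_decode_scratch_size() bytes
			 * per CPU, while the debugger path uses its own dedicated
			 * kdp_compressor_scratch_buf.
			 */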
5104 				/*
5105 				 * we're behind the c_seg lock held in spin mode
5106 				 * which means pre-emption is disabled... therefore
5107 				 * the following sequence is atomic and safe
5108 				 */
5109 				my_cpu_no = cpu_number();
5110 
5111 				assert(my_cpu_no < compressor_cpus);
5112 
5113 				scratch_buf = &compressor_scratch_bufs[my_cpu_no * vm_compressor_get_decode_scratch_size()];
5114 			} else {
5115 				scratch_buf = kdp_compressor_scratch_buf;
5116 			}
5117 
5118 			if (vm_compressor_algorithm() != VM_COMPRESSOR_DEFAULT_CODEC) {
5119 #if defined(__arm64__)
5120 				uint16_t c_codec = cs->c_codec;
5121 				uint32_t inline_popcount;
5122 				if (!metadecompressor((const uint8_t *) &c_seg->c_store.c_buffer[cs->c_offset],
5123 				    (uint8_t *)dst, c_size, c_codec, (void *)scratch_buf, &inline_popcount)) {
5124 					retval = -1;
5125 				} else {
5126 #if __APPLE_WKDM_POPCNT_EXTENSIONS__
5127 					if (inline_popcount != cs->c_inline_popcount) {
5128 						/*
5129 						 * The codec choice in compression and
5130 						 * decompression must agree, so there
5131 						 * should never be a disagreement in
5132 						 * whether an inline population count
5133 						 * was performed.
5134 						 */
5135 						assert(inline_popcount != C_SLOT_NO_POPCOUNT);
5136 						assert(cs->c_inline_popcount != C_SLOT_NO_POPCOUNT);
5137 						printf("decompression failure from physical region %llx+%05x: popcount mismatch (%d != %d)\n",
5138 						    (unsigned long long)kvtophys((uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset]), c_size,
5139 						    inline_popcount,
5140 						    cs->c_inline_popcount);
5141 						retval = -1;
5142 					}
5143 #else
5144 					assert(inline_popcount == C_SLOT_NO_POPCOUNT);
5145 #endif /* __APPLE_WKDM_POPCNT_EXTENSIONS__ */
5146 				}
5147 #endif
5148 			} else {
5149 #if defined(__arm64__)
5150 				__unreachable_ok_push
5151 				if (PAGE_SIZE == 4096) {
5152 					WKdm_decompress_4k((WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
5153 					    (WK_word *)(uintptr_t)dst, (WK_word *)(uintptr_t)scratch_buf, c_size);
5154 				} else {
5155 					WKdm_decompress_16k((WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
5156 					    (WK_word *)(uintptr_t)dst, (WK_word *)(uintptr_t)scratch_buf, c_size);
5157 				}
5158 				__unreachable_ok_pop
5159 #else
5160 				WKdm_decompress_new((WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
5161 				    (WK_word *)(uintptr_t)dst, (WK_word *)(uintptr_t)scratch_buf, c_size);
5162 #endif
5163 			}
5164 		}
5165 
5166 #if CHECKSUM_THE_DATA
5167 		if (cs->c_hash_data != vmc_hash(dst, PAGE_SIZE)) {
5168 #if defined(__arm64__)
5169 			int32_t *dinput = &c_seg->c_store.c_buffer[cs->c_offset];
5170 			panic("decompressed data doesn't match original cs: %p, hash: 0x%x, offset: %d, c_size: %d, c_rounded_size: %d, codec: %d, header: 0x%x 0x%x 0x%x", cs, cs->c_hash_data, cs->c_offset, c_size, c_rounded_size, cs->c_codec, *dinput, *(dinput + 1), *(dinput + 2));
5171 #else
5172 			panic("decompressed data doesn't match original cs: %p, hash: %d, offset: 0x%x, c_size: %d", cs, cs->c_hash_data, cs->c_offset, c_size);
5173 #endif
5174 		}
5175 #endif
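		/*
		 * Sample decompression recency: segments that get decompressed soon
		 * after creation feed the age-of-decompression histogram for the
		 * current sample period (consumed elsewhere, e.g. when computing a
		 * swapout target age).
		 */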
5176 		if (c_seg->c_swappedin_ts == 0 && !kdp_mode) {
5177 			clock_get_system_nanotime(&cur_ts_sec, &cur_ts_nsec);
5178 
5179 			age_of_cseg = (uint32_t)cur_ts_sec - c_seg->c_creation_ts;
5180 			if (age_of_cseg < DECOMPRESSION_SAMPLE_MAX_AGE) {
5181 				OSAddAtomic(1, &age_of_decompressions_during_sample_period[age_of_cseg]);
5182 			} else {
5183 				OSAddAtomic(1, &overage_decompressions_during_sample_period);
5184 			}
5185 
5186 			OSAddAtomic(1, &sample_period_decompression_count);
5187 		}
5188 	}
5189 #if CONFIG_FREEZE
5190 	else {
5191 		/*
5192 		 * We are freeing a compressed page from this c_seg without decompressing it, so balance the ledgers.
5193 		 */
5194 		if (C_SEG_IS_ONDISK(c_seg)) {
5195 			/*
5196 			 * The compression sweep feature will push out anonymous pages to disk
5197 			 * without going through the freezer path and so those c_segs, while
5198 			 * swapped out, won't have an owner.
5199 			 */
5200 			if (c_seg->c_task_owner) {
5201 				task_update_frozen_to_swap_acct(c_seg->c_task_owner, PAGE_SIZE_64, DEBIT_FROM_SWAP);
5202 			}
5203 
5204 			/*
5205 			 * We are freeing a page in swap without swapping it in. We bump the in-core
5206 			 * count here to simulate a swapin of a page so that we can accurately
5207 			 * decrement it below.
5208 			 */
5209 			OSAddAtomic(1, &c_segment_pages_compressed_incore);
5210 			if (c_seg->c_has_donated_pages) {
5211 				OSAddAtomic(1, &c_segment_pages_compressed_incore_late_swapout);
5212 			}
5213 		} else if (c_seg->c_state == C_ON_BAD_Q) {
5214 			assert(c_seg->c_store.c_buffer == NULL);
5215 			*zeroslot = 0;
5216 
5217 			retval = -1;
5218 			goto done;
5219 		}
5220 	}
5221 #endif /* CONFIG_FREEZE */
5222 
5223 	if (flags & C_KEEP) {
5224 		*zeroslot = 0;
5225 		goto done;
5226 	}
5227 	assert(kdp_mode == FALSE);
5228 
5229 	c_seg->c_bytes_unused += c_rounded_size;
5230 	c_seg->c_bytes_used -= c_rounded_size;
5231 
5232 	assert(c_seg->c_slots_used);
5233 	c_seg->c_slots_used--;
5234 	if (dst && c_seg->c_swappedin) {
5235 		task_t task = current_task();
5236 		if (task) {
5237 			ledger_credit(task->ledger, task_ledgers.swapins, PAGE_SIZE);
5238 		}
5239 	}
5240 
5241 	PACK_C_SIZE(cs, 0);
5242 
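	/*
	 * Remember the lowest free slot index so a later minor compaction can
	 * start its scan at the first hole instead of at slot 0.
	 */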
5243 	if (c_indx < c_seg->c_firstemptyslot) {
5244 		c_seg->c_firstemptyslot = c_indx;
5245 	}
5246 
5247 	OSAddAtomic(-1, &c_segment_pages_compressed);
5248 #if CONFIG_FREEZE
5249 	OSAddAtomic(-1, &c_segment_pages_compressed_incore);
5250 	assertf(c_segment_pages_compressed_incore >= 0, "-ve incore count %p 0x%x", c_seg, c_segment_pages_compressed_incore);
5251 	if (c_seg->c_has_donated_pages) {
5252 		OSAddAtomic(-1, &c_segment_pages_compressed_incore_late_swapout);
5253 		assertf(c_segment_pages_compressed_incore_late_swapout >= 0, "-ve lateswapout count %p 0x%x", c_seg, c_segment_pages_compressed_incore_late_swapout);
5254 	}
5255 #endif /* CONFIG_FREEZE */
5256 
5257 	if (c_seg->c_state != C_ON_BAD_Q && !(C_SEG_IS_ONDISK(c_seg))) {
5258 		/*
5259 		 * C_SEG_IS_ONDISK == TRUE can occur when we're doing a
5260 		 * free of a compressed page (i.e. dst == NULL)
5261 		 */
5262 		OSAddAtomic64(-c_rounded_size, &compressor_bytes_used);
5263 	}
5264 	if (c_seg->c_busy_swapping) {
5265 		/*
5266 		 * bypass case for c_busy_swapping...
5267 		 * let the swapin/swapout paths deal with putting
5268 		 * the c_seg on the minor compaction queue if needed
5269 		 */
5270 		assert(c_seg->c_busy);
5271 		goto done;
5272 	}
5273 	assert(!c_seg->c_busy);
5274 
5275 	if (c_seg->c_state != C_IS_FILLING) {
5276 		if (c_seg->c_bytes_used == 0) {
5277 			if (!(C_SEG_IS_ONDISK(c_seg))) {
5278 				int     pages_populated;
5279 
5280 				pages_populated = (round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset))) / PAGE_SIZE;
5281 				c_seg->c_populated_offset = C_SEG_BYTES_TO_OFFSET(0);
5282 
5283 				if (pages_populated) {
5284 					assert(c_seg->c_state != C_ON_BAD_Q);
5285 					assert(c_seg->c_store.c_buffer != NULL);
5286 
5287 					C_SEG_BUSY(c_seg);
5288 					lck_mtx_unlock_always(&c_seg->c_lock);
5289 
5290 					kernel_memory_depopulate(
5291 						(vm_offset_t) c_seg->c_store.c_buffer,
5292 						ptoa(pages_populated),
5293 						KMA_COMPRESSOR, VM_KERN_MEMORY_COMPRESSOR);
5294 
5295 					lck_mtx_lock_spin_always(&c_seg->c_lock);
5296 					C_SEG_WAKEUP_DONE(c_seg);
5297 				}
5298 				if (!c_seg->c_on_minorcompact_q && c_seg->c_state != C_ON_SWAPIO_Q) {
5299 					if (c_seg->c_state == C_ON_SWAPOUT_Q) {
5300 						bool clear_busy = false;
5301 						if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
5302 							C_SEG_BUSY(c_seg);
5303 
5304 							lck_mtx_unlock_always(&c_seg->c_lock);
5305 							lck_mtx_lock_spin_always(c_list_lock);
5306 							lck_mtx_lock_spin_always(&c_seg->c_lock);
5307 							clear_busy = true;
5308 						}
5309 						c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
5310 						if (clear_busy) {
5311 							C_SEG_WAKEUP_DONE(c_seg);
5312 							clear_busy = false;
5313 						}
5314 						lck_mtx_unlock_always(c_list_lock);
5315 					}
5316 					c_seg_need_delayed_compaction(c_seg, FALSE);
5317 				}
5318 			} else {
5319 				if (c_seg->c_state != C_ON_SWAPPEDOUTSPARSE_Q) {
5320 					c_seg_move_to_sparse_list(c_seg);
5321 					consider_defragmenting = TRUE;
5322 				}
5323 			}
5324 		} else if (c_seg->c_on_minorcompact_q) {
5325 			assert(c_seg->c_state != C_ON_BAD_Q);
5326 			assert(!C_SEG_IS_ON_DISK_OR_SOQ(c_seg));
5327 
5328 			if (C_SEG_SHOULD_MINORCOMPACT_NOW(c_seg)) {
5329 				c_seg_try_minor_compaction_and_unlock(c_seg);
5330 				need_unlock = FALSE;
5331 			}
5332 		} else if (!(C_SEG_IS_ONDISK(c_seg))) {
5333 			if (c_seg->c_state != C_ON_BAD_Q && c_seg->c_state != C_ON_SWAPOUT_Q && c_seg->c_state != C_ON_SWAPIO_Q &&
5334 			    C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) {
5335 				c_seg_need_delayed_compaction(c_seg, FALSE);
5336 			}
5337 		} else if (c_seg->c_state != C_ON_SWAPPEDOUTSPARSE_Q && C_SEG_ONDISK_IS_SPARSE(c_seg)) {
5338 			c_seg_move_to_sparse_list(c_seg);
5339 			consider_defragmenting = TRUE;
5340 		}
5341 	}
5342 done:
5343 	if (__improbable(kdp_mode)) {
5344 		return retval;
5345 	}
5346 
5347 	if (need_unlock == TRUE) {
5348 		lck_mtx_unlock_always(&c_seg->c_lock);
5349 	}
5350 
5351 	PAGE_REPLACEMENT_DISALLOWED(FALSE);
5352 
5353 	if (consider_defragmenting == TRUE) {
5354 		vm_swap_consider_defragmenting(VM_SWAP_FLAGS_NONE);
5355 	}
5356 
5357 #if !XNU_TARGET_OS_OSX
5358 	if ((c_minor_count && COMPRESSOR_NEEDS_TO_MINOR_COMPACT()) || vm_compressor_needs_to_major_compact()) {
5359 		vm_wake_compactor_swapper();
5360 	}
5361 #endif /* !XNU_TARGET_OS_OSX */
5362 
5363 	return retval;
5364 }
5365 
5366 
5367 int
5368 vm_compressor_get(ppnum_t pn, int *slot, int flags)
5369 {
5370 	c_slot_mapping_t  slot_ptr;
5371 	char    *dst;
5372 	int     zeroslot = 1;
5373 	int     retval;
5374 
5375 	dst = pmap_map_compressor_page(pn);
5376 	slot_ptr = (c_slot_mapping_t)slot;
5377 
5378 	assert(dst != NULL);
5379 
5380 	if (slot_ptr->s_cseg == C_SV_CSEG_ID) {
5381 		int32_t         data;
5382 		int32_t         *dptr;
5383 
5384 		/*
5385 		 * page was populated with a single value
5386 		 * that found a home in our hash table;
5387 		 * grab that value from the hash and
5388 		 * repopulate the page with it
5389 		 */
5390 		dptr = (int32_t *)(uintptr_t)dst;
5391 		data = c_segment_sv_hash_table[slot_ptr->s_cindx].he_data;
5392 		sv_decompress(dptr, data);
5393 		if (!(flags & C_KEEP)) {
5394 			c_segment_sv_hash_drop_ref(slot_ptr->s_cindx);
5395 
5396 			OSAddAtomic(-1, &c_segment_pages_compressed);
5397 			*slot = 0;
5398 		}
5399 		if (data) {
5400 			OSAddAtomic(1, &c_segment_svp_nonzero_decompressions);
5401 		} else {
5402 			OSAddAtomic(1, &c_segment_svp_zero_decompressions);
5403 		}
5404 
5405 		pmap_unmap_compressor_page(pn, dst);
5406 		return 0;
5407 	}
5408 
5409 	retval = c_decompress_page(dst, slot_ptr, flags, &zeroslot);
5410 
5411 	/*
5412 	 * zeroslot will be set to 0 by c_decompress_page if (flags & C_KEEP)
5413 	 * or (flags & C_DONT_BLOCK) and we found 'c_busy' or 'C_SEG_IS_ONDISK' to be TRUE
5414 	 */
5415 	if (zeroslot) {
5416 		*slot = 0;
5417 	}
5418 
5419 	pmap_unmap_compressor_page(pn, dst);
5420 
5421 	/*
5422 	 * returns 0 if we successfully decompressed a page from a segment already in memory
5423 	 * returns 1 if we had to first swap in the segment, before successfully decompressing the page
5424 	 * returns -1 if we encountered an error swapping in the segment - decompression failed
5425 	 * returns -2 if (flags & C_DONT_BLOCK) and we found 'c_busy' or 'C_SEG_IS_ONDISK' to be true
5426 	 */
5427 	return retval;
5428 }
5429 
5430 #if DEVELOPMENT || DEBUG
5431 
5432 void
5433 vm_compressor_inject_error(int *slot)
5434 {
5435 	c_slot_mapping_t slot_ptr = (c_slot_mapping_t)slot;
5436 
5437 	/* No error detection for single-value compression. */
5438 	if (slot_ptr->s_cseg == C_SV_CSEG_ID) {
5439 		printf("%s(): cannot inject errors in SV-compressed pages\n", __func__ );
5440 		return;
5441 	}
5442 
5443 	/* s_cseg is actually "segno+1" */
5444 	const uint32_t c_segno = slot_ptr->s_cseg - 1;
5445 
5446 	assert(c_segno < c_segments_available);
5447 	assert(c_segments[c_segno].c_segno >= c_segments_available);
5448 
5449 	const c_segment_t c_seg = c_segments[c_segno].c_seg;
5450 
5451 	PAGE_REPLACEMENT_DISALLOWED(TRUE);
5452 
5453 	lck_mtx_lock_spin_always(&c_seg->c_lock);
5454 	assert(c_seg->c_state != C_IS_EMPTY && c_seg->c_state != C_IS_FREE);
5455 
5456 	const uint16_t c_indx = slot_ptr->s_cindx;
5457 	assert(c_indx < c_seg->c_nextslot);
5458 
5459 	/*
5460 	 * To safely make this segment temporarily writable, we need to mark
5461 	 * the segment busy, which allows us to release the segment lock.
5462 	 */
5463 	while (c_seg->c_busy) {
5464 		c_seg_wait_on_busy(c_seg);
5465 		lck_mtx_lock_spin_always(&c_seg->c_lock);
5466 	}
5467 	C_SEG_BUSY(c_seg);
5468 
5469 	bool already_writable = (c_seg->c_state == C_IS_FILLING);
5470 	if (!already_writable) {
5471 		/*
5472 		 * Protection update must be performed preemptibly, so temporarily drop
5473 		 * the lock. Having set c_busy will prevent most other concurrent
5474 		 * operations.
5475 		 */
5476 		lck_mtx_unlock_always(&c_seg->c_lock);
5477 		C_SEG_MAKE_WRITEABLE(c_seg);
5478 		lck_mtx_lock_spin_always(&c_seg->c_lock);
5479 	}
5480 
5481 	/*
5482 	 * Once we've released the lock following our c_state == C_IS_FILLING check,
5483 	 * c_current_seg_filled() can (re-)write-protect the segment. However, it
5484 	 * will transition from C_IS_FILLING before releasing the c_seg lock, so we
5485 	 * can detect this by re-checking after we've reobtained the lock.
5486 	 */
5487 	if (already_writable && c_seg->c_state != C_IS_FILLING) {
5488 		lck_mtx_unlock_always(&c_seg->c_lock);
5489 		C_SEG_MAKE_WRITEABLE(c_seg);
5490 		lck_mtx_lock_spin_always(&c_seg->c_lock);
5491 		already_writable = false;
5492 		/* Segment can't be freed while c_busy is set. */
5493 		assert(c_seg->c_state != C_IS_FILLING);
5494 	}
5495 
5496 	/*
5497 	 * Skip if the segment is on disk. This check can only be performed after
5498 	 * the final acquisition of the segment lock before we attempt to write to
5499 	 * the segment.
5500 	 */
5501 	if (!C_SEG_IS_ON_DISK_OR_SOQ(c_seg)) {
5502 		c_slot_t cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
5503 		int32_t *data = &c_seg->c_store.c_buffer[cs->c_offset];
5504 		/* assume that the compressed data holds at least one int32_t */
5505 		assert(UNPACK_C_SIZE(cs) > sizeof(*data));
5506 		/*
5507 		 * This bit is known to be in the payload of a MISS packet resulting from
5508 		 * the pattern used in the test pattern from decompression_failure.c.
5509 		 * Flipping it should result in many corrupted bits in the test page.
5510 		 */
5511 		data[0] ^= 0x00000100;
5512 	}
5513 
5514 	if (!already_writable) {
5515 		lck_mtx_unlock_always(&c_seg->c_lock);
5516 		C_SEG_WRITE_PROTECT(c_seg);
5517 		lck_mtx_lock_spin_always(&c_seg->c_lock);
5518 	}
5519 
5520 	C_SEG_WAKEUP_DONE(c_seg);
5521 	lck_mtx_unlock_always(&c_seg->c_lock);
5522 
5523 	PAGE_REPLACEMENT_DISALLOWED(FALSE);
5524 }
5525 
5526 #endif /* DEVELOPMENT || DEBUG */
5527 
5528 int
5529 vm_compressor_free(int *slot, int flags)
5530 {
5531 	c_slot_mapping_t  slot_ptr;
5532 	int     zeroslot = 1;
5533 	int     retval;
5534 
5535 	assert(flags == 0 || flags == C_DONT_BLOCK);
5536 
5537 	slot_ptr = (c_slot_mapping_t)slot;
5538 
5539 	if (slot_ptr->s_cseg == C_SV_CSEG_ID) {
5540 		c_segment_sv_hash_drop_ref(slot_ptr->s_cindx);
5541 		OSAddAtomic(-1, &c_segment_pages_compressed);
5542 
5543 		*slot = 0;
5544 		return 0;
5545 	}
5546 	retval = c_decompress_page(NULL, slot_ptr, flags, &zeroslot);
5547 	/*
5548 	 * returns 0 if we successfully freed the specified compressed page
5549 	 * returns -1 if we encountered an error swapping in the segment - decompression failed
5550 	 * returns -2 if (flags & C_DONT_BLOCK) and we found 'c_busy' set
5551 	 */
5552 
5553 	if (retval == 0) {
5554 		*slot = 0;
5555 	}
5556 
5557 	return retval;
5558 }
5559 
5560 
5561 int
5562 vm_compressor_put(ppnum_t pn, int *slot, void  **current_chead, char *scratch_buf)
5563 {
5564 	char    *src;
5565 	int     retval;
5566 
5567 	src = pmap_map_compressor_page(pn);
5568 	assert(src != NULL);
5569 
5570 	retval = c_compress_page(src, (c_slot_mapping_t)slot, (c_segment_t *)current_chead, scratch_buf);
5571 	pmap_unmap_compressor_page(pn, src);
5572 
5573 	return retval;
5574 }
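/*
 * Hedged usage sketch (illustrative only; the real callers live in the VM
 * pageout and fault paths and carry considerably more state).  A page is
 * handed to the compressor with vm_compressor_put(), and the opaque slot
 * value is later used either to re-materialize it with vm_compressor_get()
 * or to discard it with vm_compressor_free().  A slot value of 0 means
 * "no compressed copy exists".
 *
 *	void *chead = NULL;
 *	int   slot  = 0;
 *
 *	vm_compressor_put(pn, &slot, &chead, scratch_buf);
 *	...
 *	ret = vm_compressor_get(new_pn, &slot, 0);	// decompress on fault
 *	// or: vm_compressor_free(&slot, 0);		// page discarded instead
 */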
5575 
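/*
 * vm_compressor_transfer() re-homes a compressed slot mapping: the c_slot's
 * packed back-pointer is switched from src_slot_p to dst_slot_p so that later
 * decompress/free operations through the new location find the same data, and
 * the old location is zeroed.  Single-value (C_SV_CSEG_ID) mappings have no
 * c_slot behind them, so they are simply copied.
 */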
5576 void
5577 vm_compressor_transfer(
5578 	int     *dst_slot_p,
5579 	int     *src_slot_p)
5580 {
5581 	c_slot_mapping_t        dst_slot, src_slot;
5582 	c_segment_t             c_seg;
5583 	uint16_t                c_indx;
5584 	c_slot_t                cs;
5585 
5586 	src_slot = (c_slot_mapping_t) src_slot_p;
5587 
5588 	if (src_slot->s_cseg == C_SV_CSEG_ID) {
5589 		*dst_slot_p = *src_slot_p;
5590 		*src_slot_p = 0;
5591 		return;
5592 	}
5593 	dst_slot = (c_slot_mapping_t) dst_slot_p;
5594 Retry:
5595 	PAGE_REPLACEMENT_DISALLOWED(TRUE);
5596 	/* get segment for src_slot */
5597 	c_seg = c_segments[src_slot->s_cseg - 1].c_seg;
5598 	/* lock segment */
5599 	lck_mtx_lock_spin_always(&c_seg->c_lock);
5600 	/* wait if it's busy */
5601 	if (c_seg->c_busy && !c_seg->c_busy_swapping) {
5602 		PAGE_REPLACEMENT_DISALLOWED(FALSE);
5603 		c_seg_wait_on_busy(c_seg);
5604 		goto Retry;
5605 	}
5606 	/* find the c_slot */
5607 	c_indx = src_slot->s_cindx;
5608 	cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
5609 	/* point the c_slot back to dst_slot instead of src_slot */
5610 	C_SLOT_ASSERT_PACKABLE(dst_slot);
5611 	cs->c_packed_ptr = C_SLOT_PACK_PTR(dst_slot);
5612 	/* transfer */
5613 	*dst_slot_p = *src_slot_p;
5614 	*src_slot_p = 0;
5615 	lck_mtx_unlock_always(&c_seg->c_lock);
5616 	PAGE_REPLACEMENT_DISALLOWED(FALSE);
5617 }
5618 
5619 #if defined(__arm64__)
5620 extern clock_sec_t             vm_swapfile_last_failed_to_create_ts;
5621 __attribute__((noreturn))
5622 void
5623 vm_panic_hibernate_write_image_failed(int err)
5624 {
5625 	panic("hibernate_write_image encountered error 0x%x - %u, %u, %d, %d, %d, %d, %d, %d, %d, %d, %llu, %d, %d, %d\n",
5626 	    err,
5627 	    VM_PAGE_COMPRESSOR_COUNT, vm_page_wire_count,
5628 	    c_age_count, c_major_count, c_minor_count, (c_early_swapout_count + c_regular_swapout_count + c_late_swapout_count), c_swappedout_sparse_count,
5629 	    vm_num_swap_files, vm_num_pinned_swap_files, vm_swappin_enabled, vm_swap_put_failures,
5630 	    (vm_swapfile_last_failed_to_create_ts ? 1:0), hibernate_no_swapspace, hibernate_flush_timed_out);
5631 }
5632 #endif /*(__arm64__)*/
5633 
5634 #if CONFIG_FREEZE
5635 
5636 int     freezer_finished_filling = 0;
5637 
5638 void
5639 vm_compressor_finished_filling(
5640 	void    **current_chead)
5641 {
5642 	c_segment_t     c_seg;
5643 
5644 	if ((c_seg = *(c_segment_t *)current_chead) == NULL) {
5645 		return;
5646 	}
5647 
5648 	assert(c_seg->c_state == C_IS_FILLING);
5649 
5650 	lck_mtx_lock_spin_always(&c_seg->c_lock);
5651 
5652 	c_current_seg_filled(c_seg, (c_segment_t *)current_chead);
5653 
5654 	lck_mtx_unlock_always(&c_seg->c_lock);
5655 
5656 	freezer_finished_filling++;
5657 }
5658 
5659 
5660 /*
5661  * This routine is used to transfer the compressed chunks from
5662  * the c_seg/cindx pointed to by slot_p into a new c_seg headed
5663  * by the current_chead and a new cindx within that c_seg.
5664  *
5665  * Currently, this routine is only used by the "freezer backed by
5666  * compressor with swap" mode to create a series of c_segs that
5667  * only contain compressed data belonging to one task. So, we
5668  * move a task's previously compressed data into a set of new
5669  * c_segs which will also hold the task's yet to be compressed data.
5670  */
5671 
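/*
 * Roughly, the freezer drives this as follows (illustrative sketch only; the
 * real loop lives in the freezer code and handles errors, accounting and
 * throttling):
 *
 *	void *chead = NULL;
 *	for (each compressed slot 'slot_p' belonging to the task) {
 *		kr = vm_compressor_relocate(&chead, slot_p);
 *	}
 *	vm_compressor_finished_filling(&chead);	// close out the partial c_seg
 */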
5672 kern_return_t
5673 vm_compressor_relocate(
5674 	void            **current_chead,
5675 	int             *slot_p)
5676 {
5677 	c_slot_mapping_t        slot_ptr;
5678 	c_slot_mapping_t        src_slot;
5679 	uint32_t                c_rounded_size;
5680 	uint32_t                c_size;
5681 	uint16_t                dst_slot;
5682 	c_slot_t                c_dst;
5683 	c_slot_t                c_src;
5684 	uint16_t                c_indx;
5685 	c_segment_t             c_seg_dst = NULL;
5686 	c_segment_t             c_seg_src = NULL;
5687 	kern_return_t           kr = KERN_SUCCESS;
5688 
5689 
5690 	src_slot = (c_slot_mapping_t) slot_p;
5691 
5692 	if (src_slot->s_cseg == C_SV_CSEG_ID) {
5693 		/*
5694 		 * no need to relocate... this is a page full of a single
5695 		 * value which is hashed to a single entry not contained
5696 		 * in a c_segment_t
5697 		 */
5698 		return kr;
5699 	}
5700 
5701 Relookup_dst:
5702 	c_seg_dst = c_seg_allocate((c_segment_t *)current_chead);
5703 	/*
5704 	 * returns with c_seg lock held
5705 	 * and PAGE_REPLACEMENT_DISALLOWED(TRUE)...
5706 	 * c_nextslot has been allocated and
5707 	 * c_store.c_buffer populated
5708 	 */
5709 	if (c_seg_dst == NULL) {
5710 		/*
5711 		 * Out of compression segments?
5712 		 */
5713 		kr = KERN_RESOURCE_SHORTAGE;
5714 		goto out;
5715 	}
5716 
5717 	assert(c_seg_dst->c_busy == 0);
5718 
5719 	C_SEG_BUSY(c_seg_dst);
5720 
5721 	dst_slot = c_seg_dst->c_nextslot;
5722 
5723 	lck_mtx_unlock_always(&c_seg_dst->c_lock);
5724 
5725 Relookup_src:
5726 	c_seg_src = c_segments[src_slot->s_cseg - 1].c_seg;
5727 
5728 	assert(c_seg_dst != c_seg_src);
5729 
5730 	lck_mtx_lock_spin_always(&c_seg_src->c_lock);
5731 
5732 	if (C_SEG_IS_ON_DISK_OR_SOQ(c_seg_src) ||
5733 	    c_seg_src->c_state == C_IS_FILLING) {
5734 		/*
5735 		 * Skip this page if:
5736 		 * a) the src c_seg is already on-disk (or on its way there)
5737 		 *    A "thaw" can mark a process as eligible for
5738 		 * another freeze cycle without bringing any of
5739 		 * its swapped out c_segs back from disk (because
5740 		 * that is done on-demand).
5741 		 *    Or, this page may be mapped elsewhere in the task's map,
5742 		 * and we may have marked it for swap already.
5743 		 *
5744 		 * b) Or, the src c_seg is being filled by the compressor
5745 		 * thread. We don't want the added latency of waiting for
5746 		 * this c_seg in the freeze path and so we skip it.
5747 		 */
5748 
5749 		PAGE_REPLACEMENT_DISALLOWED(FALSE);
5750 
5751 		lck_mtx_unlock_always(&c_seg_src->c_lock);
5752 
5753 		c_seg_src = NULL;
5754 
5755 		goto out;
5756 	}
5757 
5758 	if (c_seg_src->c_busy) {
5759 		PAGE_REPLACEMENT_DISALLOWED(FALSE);
5760 		c_seg_wait_on_busy(c_seg_src);
5761 
5762 		c_seg_src = NULL;
5763 
5764 		PAGE_REPLACEMENT_DISALLOWED(TRUE);
5765 
5766 		goto Relookup_src;
5767 	}
5768 
5769 	C_SEG_BUSY(c_seg_src);
5770 
5771 	lck_mtx_unlock_always(&c_seg_src->c_lock);
5772 
5773 	PAGE_REPLACEMENT_DISALLOWED(FALSE);
5774 
5775 	/* find the c_slot */
5776 	c_indx = src_slot->s_cindx;
5777 
5778 	c_src = C_SEG_SLOT_FROM_INDEX(c_seg_src, c_indx);
5779 
5780 	c_size = UNPACK_C_SIZE(c_src);
5781 
5782 	assert(c_size);
5783 
5784 	if (c_size > (uint32_t)(c_seg_bufsize - C_SEG_OFFSET_TO_BYTES((int32_t)c_seg_dst->c_nextoffset))) {
5785 		/*
5786 		 * This segment is full. We need a new one.
5787 		 */
5788 
5789 		PAGE_REPLACEMENT_DISALLOWED(TRUE);
5790 
5791 		lck_mtx_lock_spin_always(&c_seg_src->c_lock);
5792 		C_SEG_WAKEUP_DONE(c_seg_src);
5793 		lck_mtx_unlock_always(&c_seg_src->c_lock);
5794 
5795 		c_seg_src = NULL;
5796 
5797 		lck_mtx_lock_spin_always(&c_seg_dst->c_lock);
5798 
5799 		assert(c_seg_dst->c_busy);
5800 		assert(c_seg_dst->c_state == C_IS_FILLING);
5801 		assert(!c_seg_dst->c_on_minorcompact_q);
5802 
5803 		c_current_seg_filled(c_seg_dst, (c_segment_t *)current_chead);
5804 		assert(*current_chead == NULL);
5805 
5806 		C_SEG_WAKEUP_DONE(c_seg_dst);
5807 
5808 		lck_mtx_unlock_always(&c_seg_dst->c_lock);
5809 
5810 		c_seg_dst = NULL;
5811 
5812 		PAGE_REPLACEMENT_DISALLOWED(FALSE);
5813 
5814 		goto Relookup_dst;
5815 	}
5816 
5817 	c_dst = C_SEG_SLOT_FROM_INDEX(c_seg_dst, c_seg_dst->c_nextslot);
5818 
5819 	memcpy(&c_seg_dst->c_store.c_buffer[c_seg_dst->c_nextoffset], &c_seg_src->c_store.c_buffer[c_src->c_offset], c_size);
5820 	/*
5821 	 * Is platform alignment actually necessary since wkdm aligns its output?
5822 	 */
5823 	c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;
5824 
5825 	cslot_copy(c_dst, c_src);
5826 	c_dst->c_offset = c_seg_dst->c_nextoffset;
5827 
5828 	if (c_seg_dst->c_firstemptyslot == c_seg_dst->c_nextslot) {
5829 		c_seg_dst->c_firstemptyslot++;
5830 	}
5831 
5832 	c_seg_dst->c_slots_used++;
5833 	c_seg_dst->c_nextslot++;
5834 	c_seg_dst->c_bytes_used += c_rounded_size;
5835 	c_seg_dst->c_nextoffset += C_SEG_BYTES_TO_OFFSET(c_rounded_size);
5836 
5837 
5838 	PACK_C_SIZE(c_src, 0);
5839 
5840 	c_seg_src->c_bytes_used -= c_rounded_size;
5841 	c_seg_src->c_bytes_unused += c_rounded_size;
5842 
5843 	assert(c_seg_src->c_slots_used);
5844 	c_seg_src->c_slots_used--;
5845 
5846 	if (!c_seg_src->c_swappedin) {
5847 		/* Pessimistically lose swappedin status when non-swappedin pages are added. */
5848 		c_seg_dst->c_swappedin = false;
5849 	}
5850 
5851 	if (c_indx < c_seg_src->c_firstemptyslot) {
5852 		c_seg_src->c_firstemptyslot = c_indx;
5853 	}
5854 
5855 	c_dst = C_SEG_SLOT_FROM_INDEX(c_seg_dst, dst_slot);
5856 
5857 	PAGE_REPLACEMENT_ALLOWED(TRUE);
5858 	slot_ptr = C_SLOT_UNPACK_PTR(c_dst);
5859 	/* <csegno=0,indx=0> would mean "empty slot", so use csegno+1 */
5860 	slot_ptr->s_cseg = c_seg_dst->c_mysegno + 1;
5861 	slot_ptr->s_cindx = dst_slot;
5862 
5863 	PAGE_REPLACEMENT_ALLOWED(FALSE);
5864 
5865 out:
5866 	if (c_seg_src) {
5867 		lck_mtx_lock_spin_always(&c_seg_src->c_lock);
5868 
5869 		C_SEG_WAKEUP_DONE(c_seg_src);
5870 
5871 		if (c_seg_src->c_bytes_used == 0 && c_seg_src->c_state != C_IS_FILLING) {
5872 			if (!c_seg_src->c_on_minorcompact_q) {
5873 				c_seg_need_delayed_compaction(c_seg_src, FALSE);
5874 			}
5875 		}
5876 
5877 		lck_mtx_unlock_always(&c_seg_src->c_lock);
5878 	}
5879 
5880 	if (c_seg_dst) {
5881 		PAGE_REPLACEMENT_DISALLOWED(TRUE);
5882 
5883 		lck_mtx_lock_spin_always(&c_seg_dst->c_lock);
5884 
5885 		if (c_seg_dst->c_nextoffset >= c_seg_off_limit || c_seg_dst->c_nextslot >= C_SLOT_MAX_INDEX) {
5886 			/*
5887 			 * Nearing or exceeding the maximum slot and offset capacity.
5888 			 */
5889 			assert(c_seg_dst->c_busy);
5890 			assert(c_seg_dst->c_state == C_IS_FILLING);
5891 			assert(!c_seg_dst->c_on_minorcompact_q);
5892 
5893 			c_current_seg_filled(c_seg_dst, (c_segment_t *)current_chead);
5894 			assert(*current_chead == NULL);
5895 		}
5896 
5897 		C_SEG_WAKEUP_DONE(c_seg_dst);
5898 
5899 		lck_mtx_unlock_always(&c_seg_dst->c_lock);
5900 
5901 		c_seg_dst = NULL;
5902 
5903 		PAGE_REPLACEMENT_DISALLOWED(FALSE);
5904 	}
5905 
5906 	return kr;
5907 }
5908 #endif /* CONFIG_FREEZE */
5909