1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <vm/vm_compressor_internal.h>
30
31 #if CONFIG_PHANTOM_CACHE
32 #include <vm/vm_phantom_cache_internal.h>
33 #endif
34
35 #include <vm/vm_map_xnu.h>
36 #include <vm/vm_pageout_xnu.h>
37 #include <vm/vm_map_internal.h>
38 #include <vm/memory_object.h>
39 #include <vm/vm_compressor_algorithms_internal.h>
40 #include <vm/vm_compressor_backing_store_internal.h>
41 #include <vm/vm_fault.h>
42 #include <vm/vm_protos.h>
43 #include <vm/vm_kern_xnu.h>
44 #include <vm/vm_compressor_pager_internal.h>
45 #include <vm/vm_iokit.h>
46 #include <vm/vm_far.h>
47 #include <mach/mach_host.h> /* for host_info() */
48 #if DEVELOPMENT || DEBUG
49 #include <kern/hvg_hypercall.h>
50 #include <vm/vm_compressor_info.h> /* for c_segment_info */
51 #endif
52 #include <kern/ledger.h>
53 #include <kern/policy_internal.h>
54 #include <kern/thread_group.h>
55 #include <san/kasan.h>
56 #include <sys/kern_memorystatus_xnu.h>
57 #include <os/atomic_private.h>
58 #include <vm/vm_log.h>
59 #include <pexpert/pexpert.h>
60 #include <pexpert/device_tree.h>
61
62 #if defined(__x86_64__)
63 #include <i386/misc_protos.h>
64 #endif
65 #if defined(__arm64__)
66 #include <arm/machine_routines.h>
67 #endif
68
69 #include <IOKit/IOHibernatePrivate.h>
70
71 /*
72 * The segment buffer size is a tradeoff.
73 * A larger buffer leads to faster I/O throughput, better compression ratios
74 * (since fewer bytes are wasted at the end of the segment),
75 * and less overhead (both in time and space).
76 * However, a smaller buffer causes less swap when the system is overcommitted,
77 * because a higher percentage of the swapped-in segment is definitely accessed
78 * before it goes back out to storage.
79 *
80 * So on systems without swap, a larger segment is a clear win.
81 * On systems with swap, the choice is murkier. Empirically, we've
82 * found that a 64KB segment provides a better tradeoff both in terms of
83 * performance and swap writes than a 256KB segment on systems with fast SSDs
84 * and a HW compression block.
85 */
86 #define C_SEG_BUFSIZE_ARM_SWAP (1024 * 64)
87 #if XNU_TARGET_OS_OSX && defined(__arm64__)
88 #define C_SEG_BUFSIZE_DEFAULT C_SEG_BUFSIZE_ARM_SWAP
89 #else
90 #define C_SEG_BUFSIZE_DEFAULT (1024 * 256)
91 #endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */
92 uint32_t c_seg_bufsize;
93
94 uint32_t c_seg_max_pages; /* maximum number of pages the compressed data of a segment can take */
95 uint32_t c_seg_off_limit; /* once a segment being filled reaches this offset, stop trying to fill it further
96 * because another compressed page is unlikely to fit; in units of uint32_t, same as c_nextoffset */
97 uint32_t c_seg_allocsize, c_seg_slot_var_array_min_len;
98
99 extern boolean_t vm_darkwake_mode;
100 extern zone_t vm_page_zone;
101
102 #if DEVELOPMENT || DEBUG
103 /* sysctl defined in bsd/dev/arm64/sysctl.c */
104 static event_t debug_cseg_wait_event = NULL;
105 #endif /* DEVELOPMENT || DEBUG */
106
107 #if CONFIG_FREEZE
108 bool freezer_incore_cseg_acct = TRUE; /* Only count incore compressed memory for jetsams. */
109 #endif /* CONFIG_FREEZE */
110
111 #if POPCOUNT_THE_COMPRESSED_DATA
112 boolean_t popcount_c_segs = TRUE;
113
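/*
 * Debug helper: popcount of an arbitrary buffer, walked in 16-, 4- and
 * 1-byte strides. Used to record and later verify the bit population of
 * compressed data when POPCOUNT_THE_COMPRESSED_DATA validation is enabled;
 * returns a sentinel value when the check is disabled at runtime.
 */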
114 static inline uint32_t
115 vmc_pop(uintptr_t ins, int sz)
116 {
117 uint32_t rv = 0;
118
119 if (__probable(popcount_c_segs == FALSE)) {
120 return 0xDEAD707C;
121 }
122
123 while (sz >= 16) {
124 uint32_t rv1, rv2;
125 uint64_t *ins64 = (uint64_t *) ins;
126 uint64_t *ins642 = (uint64_t *) (ins + 8);
127 rv1 = __builtin_popcountll(*ins64);
128 rv2 = __builtin_popcountll(*ins642);
129 rv += rv1 + rv2;
130 sz -= 16;
131 ins += 16;
132 }
133
134 while (sz >= 4) {
135 uint32_t *ins32 = (uint32_t *) ins;
136 rv += __builtin_popcount(*ins32);
137 sz -= 4;
138 ins += 4;
139 }
140
141 while (sz > 0) {
142 char *ins8 = (char *)ins;
143 rv += __builtin_popcount(*ins8);
144 sz--;
145 ins++;
146 }
147 return rv;
148 }
149 #endif
150
151 #if VALIDATE_C_SEGMENTS
152 boolean_t validate_c_segs = TRUE;
153 #endif
154 /*
155 * vm_compressor_mode has a hierarchy of control to set its value.
156 * boot-args are checked first, then device-tree, and finally
157 * the default value that is defined below. See vm_fault_init() for
158 * the boot-arg & device-tree code.
159 */
160
161 #if !XNU_TARGET_OS_OSX
162
163 #if CONFIG_FREEZE
164 int vm_compressor_mode = VM_PAGER_FREEZER_DEFAULT;
165 struct freezer_context freezer_context_global;
166 #else /* CONFIG_FREEZE */
167 int vm_compressor_mode = VM_PAGER_NOT_CONFIGURED;
168 #endif /* CONFIG_FREEZE */
169
170 #else /* !XNU_TARGET_OS_OSX */
171 int vm_compressor_mode = VM_PAGER_COMPRESSOR_WITH_SWAP;
172
173 #endif /* !XNU_TARGET_OS_OSX */
174
175 TUNABLE(uint32_t, vm_compression_limit, "vm_compression_limit", 0);
176 int vm_compressor_is_active = 0;
177 int vm_compressor_available = 0;
178
179 extern uint64_t vm_swap_get_max_configured_space(void);
180 extern void vm_pageout_io_throttle(void);
181
182 #if CHECKSUM_THE_DATA || CHECKSUM_THE_SWAP || CHECKSUM_THE_COMPRESSED_DATA
183 extern unsigned int hash_string(char *cp, int len);
184 static unsigned int vmc_hash(char *, int);
185 boolean_t checksum_c_segs = TRUE;
186
187 unsigned int
188 vmc_hash(char *cp, int len)
189 {
190 unsigned int result;
191 if (__probable(checksum_c_segs == FALSE)) {
192 return 0xDEAD7A37;
193 }
194 vm_memtag_disable_checking();
195 result = hash_string(cp, len);
196 vm_memtag_enable_checking();
197 return result;
198 }
199 #endif
200
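/*
 * The c_size field of a c_slot is too narrow to hold PAGE_SIZE itself, so a
 * full (uncompressed) page is encoded as PAGE_SIZE - 1 by PACK_C_SIZE and
 * mapped back to PAGE_SIZE by UNPACK_C_SIZE.
 */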
201 #define UNPACK_C_SIZE(cs) ((cs->c_size == (PAGE_SIZE-1)) ? PAGE_SIZE : cs->c_size)
202 #define PACK_C_SIZE(cs, size) (cs->c_size = ((size == PAGE_SIZE) ? PAGE_SIZE - 1 : size))
203
204
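/*
 * Single-value (SV) page dedup: pages whose 32-bit words are all identical
 * (most commonly all-zero pages) are not stored in a segment at all. Instead
 * the value and a reference count live in c_segment_sv_hash_table below, and
 * the slot's segment number is set to the reserved C_SV_CSEG_ID.
 */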
205 struct c_sv_hash_entry {
206 union {
207 struct {
208 uint32_t c_sv_he_ref;
209 uint32_t c_sv_he_data;
210 } c_sv_he;
211 uint64_t c_sv_he_record;
212 } c_sv_he_un;
213 };
214
215 #define he_ref c_sv_he_un.c_sv_he.c_sv_he_ref
216 #define he_data c_sv_he_un.c_sv_he.c_sv_he_data
217 #define he_record c_sv_he_un.c_sv_he_record
218
219 #define C_SV_HASH_MAX_MISS 32
220 #define C_SV_HASH_SIZE ((1 << 10))
221 #define C_SV_HASH_MASK ((1 << 10) - 1)
222
223 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
224 #define C_SV_CSEG_ID ((1 << 21) - 1)
225 #else /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
226 #define C_SV_CSEG_ID ((1 << 22) - 1)
227 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
228
229 /* elements of c_segments array */
230 union c_segu {
231 c_segment_t c_seg;
232 uintptr_t c_segno; /* index of the next element in the segments free-list, c_free_segno_head is the head */
233 };
234
235 #define C_SLOT_ASSERT_PACKABLE(ptr) \
236 VM_ASSERT_POINTER_PACKABLE((vm_offset_t)(ptr), C_SLOT_PACKED_PTR);
237
238 #define C_SLOT_PACK_PTR(ptr) \
239 VM_PACK_POINTER((vm_offset_t)(ptr), C_SLOT_PACKED_PTR)
240
241 #define C_SLOT_UNPACK_PTR(cslot) \
242 (c_slot_mapping_t)VM_UNPACK_POINTER((cslot)->c_packed_ptr, C_SLOT_PACKED_PTR)
243
244 /* for debugging purposes */
245 SECURITY_READ_ONLY_EARLY(vm_packing_params_t) c_slot_packing_params =
246 VM_PACKING_PARAMS(C_SLOT_PACKED_PTR);
247
248 uint32_t c_segment_count = 0; /* count all allocated c_segments in all queues */
249 uint32_t c_segment_count_max = 0; /* the maximum value c_segment_count has ever reached */
250
251 uint64_t c_generation_id = 0;
252 uint64_t c_generation_id_flush_barrier;
253
254 boolean_t hibernate_no_swapspace = FALSE;
255 boolean_t hibernate_flush_timed_out = FALSE;
256 clock_sec_t hibernate_flushing_deadline = 0;
257
258 #if RECORD_THE_COMPRESSED_DATA
259 /* buffer used as an intermediate stage before writing to file */
260 char *c_compressed_record_sbuf; /* start */
261 char *c_compressed_record_ebuf; /* end */
262 char *c_compressed_record_cptr; /* next buffered write */
263 #endif
264
265 /* the different queues a c_segment can be in via c_age_list */
266 queue_head_t c_age_list_head;
267 queue_head_t c_early_swappedin_list_head, c_regular_swappedin_list_head, c_late_swappedin_list_head;
268 queue_head_t c_early_swapout_list_head, c_regular_swapout_list_head, c_late_swapout_list_head;
269 queue_head_t c_swapio_list_head;
270 queue_head_t c_swappedout_list_head;
271 queue_head_t c_swappedout_sparse_list_head;
272 queue_head_t c_major_list_head;
273 queue_head_t c_filling_list_head;
274 queue_head_t c_bad_list_head;
275
276 /* count of each of the queues above */
277 uint32_t c_age_count = 0;
278 uint32_t c_early_swappedin_count = 0, c_regular_swappedin_count = 0, c_late_swappedin_count = 0;
279 uint32_t c_early_swapout_count = 0, c_regular_swapout_count = 0, c_late_swapout_count = 0;
280 uint32_t c_swapio_count = 0;
281 uint32_t c_swappedout_count = 0;
282 uint32_t c_swappedout_sparse_count = 0;
283 uint32_t c_major_count = 0;
284 uint32_t c_filling_count = 0;
285 uint32_t c_empty_count = 0;
286 uint32_t c_bad_count = 0;
287
288 /* a c_segment can be in the minor-compact queue as well as one of the above ones, via c_list */
289 queue_head_t c_minor_list_head;
290 uint32_t c_minor_count = 0;
291
292 int c_overage_swapped_count = 0;
293 int c_overage_swapped_limit = 0;
294
295 int c_seg_fixed_array_len; /* number of slots in the c_segment inline slots array */
296 union c_segu *c_segments; /* array of all c_segments, not all of it may be populated */
297 vm_offset_t c_buffers; /* starting address of all compressed data pointed to by c_segment.c_store.c_buffer */
298 vm_size_t c_buffers_size; /* total size allocated in c_buffers */
299 caddr_t c_segments_next_page; /* next page to populate for extending c_segments */
300 boolean_t c_segments_busy;
301 uint32_t c_segments_available; /* how many segments are in populated memory (used or free), populated size of c_segments array */
302 uint32_t c_segments_limit; /* max size of c_segments array */
303 uint32_t c_segments_nearing_limit;
304
305 uint32_t c_segment_svp_in_hash;
306 uint32_t c_segment_svp_hash_succeeded;
307 uint32_t c_segment_svp_hash_failed;
308 uint32_t c_segment_svp_zero_compressions;
309 uint32_t c_segment_svp_nonzero_compressions;
310 uint32_t c_segment_svp_zero_decompressions;
311 uint32_t c_segment_svp_nonzero_decompressions;
312
313 uint32_t c_segment_noncompressible_pages;
314
315 uint32_t c_segment_pages_compressed = 0; /* Tracks # of uncompressed pages fed into the compressor, including SV (single value) pages */
316 #if CONFIG_FREEZE
317 int32_t c_segment_pages_compressed_incore = 0; /* Tracks # of uncompressed pages fed into the compressor that are in memory */
318 int32_t c_segment_pages_compressed_incore_late_swapout = 0; /* Tracks # of uncompressed pages fed into the compressor that are in memory and tagged for swapout */
319 uint32_t c_segments_incore_limit = 0; /* Tracks # of segments allowed to be in-core. Based on compressor pool size */
320 #endif /* CONFIG_FREEZE */
321
322 uint32_t c_segment_pages_compressed_limit;
323 uint32_t c_segment_pages_compressed_nearing_limit;
324 uint32_t c_free_segno_head = (uint32_t)-1; /* head of free list of c_segment pointers in c_segments */
325
326 uint32_t vm_compressor_minorcompact_threshold_divisor = 10;
327 uint32_t vm_compressor_majorcompact_threshold_divisor = 10;
328 uint32_t vm_compressor_unthrottle_threshold_divisor = 10;
329 uint32_t vm_compressor_catchup_threshold_divisor = 10;
330
331 uint32_t vm_compressor_minorcompact_threshold_divisor_overridden = 0;
332 uint32_t vm_compressor_majorcompact_threshold_divisor_overridden = 0;
333 uint32_t vm_compressor_unthrottle_threshold_divisor_overridden = 0;
334 uint32_t vm_compressor_catchup_threshold_divisor_overridden = 0;
335
336 #define C_SEGMENTS_PER_PAGE (PAGE_SIZE / sizeof(union c_segu))
337
338 LCK_GRP_DECLARE(vm_compressor_lck_grp, "vm_compressor");
339 LCK_RW_DECLARE(c_master_lock, &vm_compressor_lck_grp);
340 LCK_MTX_DECLARE(c_list_lock_storage, &vm_compressor_lck_grp);
341
342 boolean_t decompressions_blocked = FALSE;
343
344 zone_t compressor_segment_zone;
345 int c_compressor_swap_trigger = 0;
346
347 uint32_t compressor_cpus;
348 char *compressor_scratch_bufs;
349
350 struct vm_compressor_kdp_state vm_compressor_kdp_state;
351
352 clock_sec_t start_of_sample_period_sec = 0;
353 clock_nsec_t start_of_sample_period_nsec = 0;
354 clock_sec_t start_of_eval_period_sec = 0;
355 clock_nsec_t start_of_eval_period_nsec = 0;
356 uint32_t sample_period_decompression_count = 0;
357 uint32_t sample_period_compression_count = 0;
358 uint32_t last_eval_decompression_count = 0;
359 uint32_t last_eval_compression_count = 0;
360
361 #define DECOMPRESSION_SAMPLE_MAX_AGE (60 * 30)
362
363 boolean_t vm_swapout_ripe_segments = FALSE;
364 uint32_t vm_ripe_target_age = (60 * 60 * 48);
365
366 uint32_t swapout_target_age = 0;
367 uint32_t age_of_decompressions_during_sample_period[DECOMPRESSION_SAMPLE_MAX_AGE];
368 uint32_t overage_decompressions_during_sample_period = 0;
369
370
371 void do_fastwake_warmup(queue_head_t *, boolean_t);
372 boolean_t fastwake_warmup = FALSE;
373 boolean_t fastwake_recording_in_progress = FALSE;
374 uint64_t dont_trim_until_ts = 0;
375
376 uint64_t c_segment_warmup_count;
377 uint64_t first_c_segment_to_warm_generation_id = 0;
378 uint64_t last_c_segment_to_warm_generation_id = 0;
379 boolean_t hibernate_flushing = FALSE;
380
381 _Atomic uint64_t c_segment_input_bytes = 0;
382 _Atomic uint64_t c_segment_compressed_bytes = 0;
383 _Atomic uint64_t compressor_bytes_used = 0;
384
385 /* Keeps track of the most recent timestamp for when major compaction finished. */
386 mach_timespec_t major_compact_ts;
387
388 struct c_sv_hash_entry c_segment_sv_hash_table[C_SV_HASH_SIZE] __attribute__ ((aligned(8)));
389
390 static void vm_compressor_swap_trigger_thread(void);
391 static void vm_compressor_do_delayed_compactions(boolean_t);
392 static void vm_compressor_compact_and_swap(boolean_t);
393 static void vm_compressor_process_regular_swapped_in_segments(boolean_t);
394 static void vm_compressor_process_special_swapped_in_segments_locked(void);
395
396 struct vm_compressor_swapper_stats vmcs_stats;
397
398 static void vm_compressor_process_major_segments(bool);
399
400 void compute_swapout_target_age(void);
401
402 boolean_t c_seg_major_compact(c_segment_t, c_segment_t);
403 boolean_t c_seg_major_compact_ok(c_segment_t, c_segment_t);
404
405 int c_seg_minor_compaction_and_unlock(c_segment_t, boolean_t);
406 int c_seg_do_minor_compaction_and_unlock(c_segment_t, boolean_t, boolean_t, boolean_t);
407 void c_seg_try_minor_compaction_and_unlock(c_segment_t c_seg);
408
409 void c_seg_move_to_sparse_list(c_segment_t);
410 void c_seg_insert_into_q(queue_head_t *, c_segment_t);
411
412 uint64_t vm_available_memory(void);
413
414 /*
415 * Get the address of a given entry in the c_segments array
416 */
417 static inline union c_segu *
418 c_segments_get(uint32_t segno)
419 {
420 return VM_FAR_ADD_PTR_UNBOUNDED(c_segments, segno);
421 }
422
423 /*
424 * indicate the need to do a major compaction if
425 * the overall set of in-use compression segments
426 * becomes sparse... on systems that support pressure
427 * driven swapping, this will also cause swapouts to
428 * be initiated.
429 */
430 static bool
431 vm_compressor_needs_to_major_compact(void)
432 {
433 uint32_t incore_seg_count;
434
435 incore_seg_count = c_segment_count - c_swappedout_count - c_swappedout_sparse_count;
436
437 /* second condition:
438 * first term:
439 * - (incore_seg_count * c_seg_max_pages) is the maximum number of pages that all resident segments can hold in their buffers
440 * - VM_PAGE_COMPRESSOR_COUNT is the current size that is actually held by the buffers
441 * -- subtracting these gives the amount of pages that is wasted as holes due to segments not being full
442 * second term:
443 * - 1/8 of the maximum size that can be held by this many segments
444 * meaning of the comparison: is the ratio of wasted space greater than 1/8
445 * first condition:
446 * compare number of segments being used vs the number of segments that can ever be allocated
447 * if we don't have a lot of data in the compressor, then we don't need to bother caring about wasted space in holes
448 */
449
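/*
 * Worked example (illustrative numbers only): with c_seg_max_pages = 64 and
 * 1000 resident segments, buffer capacity is 64000 pages. If the buffers
 * currently hold 50000 pages, 14000 pages are "holes", which exceeds 1/8th
 * of capacity (8000), so the fragmentation condition holds; the first check
 * only requires that enough segments exist for compaction to be worthwhile.
 */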
450 if ((c_segment_count >= (c_segments_nearing_limit / 8)) &&
451 ((incore_seg_count * c_seg_max_pages) - VM_PAGE_COMPRESSOR_COUNT) >
452 ((incore_seg_count / 8) * c_seg_max_pages)) {
453 return true;
454 }
455 return false;
456 }
457
458 uint32_t
459 vm_compressor_get_swapped_segment_count(void)
460 {
461 return c_swappedout_count + c_swappedout_sparse_count;
462 }
463
464 uint32_t
465 vm_compressor_incore_fragmentation_wasted_pages(void)
466 {
467 /* return one of the components of the calculation in vm_compressor_needs_to_major_compact() */
468 uint32_t incore_seg_count = c_segment_count - c_swappedout_count - c_swappedout_sparse_count;
469 return (incore_seg_count * c_seg_max_pages) - VM_PAGE_COMPRESSOR_COUNT;
470 }
471
472 TUNABLE_WRITEABLE(uint64_t, vm_compressor_minor_fragmentation_threshold_pct, "vm_compressor_minor_frag_threshold_pct", 10);
473
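/*
 * Minor compaction is warranted only when non-compressed memory is low enough
 * to be considered pressured AND the fraction of compressor pool bytes sitting
 * unused inside populated segments exceeds
 * vm_compressor_minor_fragmentation_threshold_pct.
 */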
474 static bool
475 vm_compressor_needs_to_minor_compact(void)
476 {
477 uint32_t compactible_seg_count = os_atomic_load(&c_minor_count, relaxed);
478 if (compactible_seg_count == 0) {
479 return false;
480 }
481
482 bool is_pressured = AVAILABLE_NON_COMPRESSED_MEMORY <
483 VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD;
484 if (!is_pressured) {
485 return false;
486 }
487
488 uint64_t bytes_used = os_atomic_load(&compressor_bytes_used, relaxed);
489 uint64_t bytes_total = VM_PAGE_COMPRESSOR_COUNT * PAGE_SIZE_64;
490 uint64_t bytes_frag = bytes_total - bytes_used;
491 bool is_fragmented = bytes_frag >
492 bytes_total * vm_compressor_minor_fragmentation_threshold_pct / 100;
493
494 return is_fragmented;
495 }
496
497 uint64_t
498 vm_available_memory(void)
499 {
500 return ((uint64_t)AVAILABLE_NON_COMPRESSED_MEMORY) * PAGE_SIZE_64;
501 }
502
503 uint32_t
504 vm_compressor_pool_size(void)
505 {
506 return VM_PAGE_COMPRESSOR_COUNT;
507 }
508
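/*
 * Percentage of in-core segment buffer capacity that is not occupied by
 * compressor pool pages (0 when nothing is resident).
 */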
509 uint32_t
510 vm_compressor_fragmentation_level(void)
511 {
512 const uint32_t incore_seg_count = c_segment_count - c_swappedout_count - c_swappedout_sparse_count;
513 if ((incore_seg_count == 0) || (c_seg_max_pages == 0)) {
514 return 0;
515 }
516 return 100 - (vm_compressor_pool_size() * 100 / (incore_seg_count * c_seg_max_pages));
517 }
518
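/*
 * Integer ratio of pages fed into the compressor (including single-value
 * pages) to pages of physical memory currently used to hold them;
 * UINT32_MAX when the pool is empty.
 */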
519 uint32_t
520 vm_compression_ratio(void)
521 {
522 if (vm_compressor_pool_size() == 0) {
523 return UINT32_MAX;
524 }
525 return c_segment_pages_compressed / vm_compressor_pool_size();
526 }
527
528 uint32_t
529 vm_compressor_pages_compressed(void)
530 {
531 #if CONFIG_FREEZE
532 if (freezer_incore_cseg_acct) {
533 return os_atomic_load(&c_segment_pages_compressed_incore, relaxed);
534 }
535 #endif /* CONFIG_FREEZE */
536 return os_atomic_load(&c_segment_pages_compressed, relaxed);
537 }
538
539 bool
540 vm_compressor_compressed_pages_nearing_limit(void)
541 {
542 return vm_compressor_pages_compressed() > c_segment_pages_compressed_nearing_limit;
543 }
544
545 static bool
546 vm_compressor_segments_nearing_limit(void)
547 {
548 uint64_t segments;
549
550 #if CONFIG_FREEZE
551 if (freezer_incore_cseg_acct) {
552 if (os_sub_overflow(c_segment_count, c_swappedout_count, &segments)) {
553 segments = 0;
554 }
555 if (os_sub_overflow(segments, c_swappedout_sparse_count, &segments)) {
556 segments = 0;
557 }
558 } else {
559 segments = os_atomic_load(&c_segment_count, relaxed);
560 }
561 #else /* CONFIG_FREEZE */
562 segments = c_segment_count;
563 #endif /* CONFIG_FREEZE */
564
565 return segments > c_segments_nearing_limit;
566 }
567
568 bool
569 vm_compressor_low_on_space(void)
570 {
571 return vm_compressor_compressed_pages_nearing_limit() ||
572 vm_compressor_segments_nearing_limit();
573 }
574
575
576 bool
577 vm_compressor_out_of_space(void)
578 {
579 #if CONFIG_FREEZE
580 uint64_t incore_seg_count;
581 uint32_t incore_compressed_pages;
582 if (freezer_incore_cseg_acct) {
583 if (os_sub_overflow(c_segment_count, c_swappedout_count, &incore_seg_count)) {
584 incore_seg_count = 0;
585 }
586 if (os_sub_overflow(incore_seg_count, c_swappedout_sparse_count, &incore_seg_count)) {
587 incore_seg_count = 0;
588 }
589 incore_compressed_pages = os_atomic_load(&c_segment_pages_compressed_incore, relaxed);
590 } else {
591 incore_seg_count = os_atomic_load(&c_segment_count, relaxed);
592 incore_compressed_pages = os_atomic_load(&c_segment_pages_compressed_incore, relaxed);
593 }
594
595 if ((incore_compressed_pages >= c_segment_pages_compressed_limit) ||
596 (incore_seg_count > c_segments_incore_limit)) {
597 return true;
598 }
599 #else /* CONFIG_FREEZE */
600 if ((c_segment_pages_compressed >= c_segment_pages_compressed_limit) ||
601 (c_segment_count >= c_segments_limit)) {
602 return true;
603 }
604 #endif /* CONFIG_FREEZE */
605 return false;
606 }
607
608 bool
609 vm_compressor_is_thrashing()
610 {
611 compute_swapout_target_age();
612
613 if (swapout_target_age) {
614 c_segment_t c_seg;
615
616 lck_mtx_lock_spin_always(c_list_lock);
617
618 if (!queue_empty(&c_age_list_head)) {
619 c_seg = (c_segment_t) queue_first(&c_age_list_head);
620
621 if (c_seg->c_creation_ts > swapout_target_age) {
622 swapout_target_age = 0;
623 }
624 }
625 lck_mtx_unlock_always(c_list_lock);
626 }
627
628 return swapout_target_age != 0;
629 }
630
631
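/*
 * With swap active and the compressor low on space (or hard-throttled),
 * ask callers to throttle any task whose compressed footprint exceeds a
 * quarter of all pages currently held by the compressor.
 */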
632 int
633 vm_wants_task_throttled(task_t task)
634 {
635 ledger_amount_t compressed;
636 if (task == kernel_task) {
637 return 0;
638 }
639
640 if (VM_CONFIG_SWAP_IS_ACTIVE) {
641 if ((vm_compressor_low_on_space() || HARD_THROTTLE_LIMIT_REACHED())) {
642 ledger_get_balance(task->ledger, task_ledgers.internal_compressed, &compressed);
643 compressed >>= VM_MAP_PAGE_SHIFT(task->map);
644 if ((unsigned int)compressed > (c_segment_pages_compressed / 4)) {
645 return 1;
646 }
647 }
648 }
649 return 0;
650 }
651
652 #if CONFIG_JETSAM
653 bool memorystatus_disable_swap(void);
654 #if CONFIG_PHANTOM_CACHE
655 extern bool memorystatus_phantom_cache_pressure;
656 #endif /* CONFIG_PHANTOM_CACHE */
657 int compressor_thrashing_induced_jetsam = 0;
658 int filecache_thrashing_induced_jetsam = 0;
659 static boolean_t vm_compressor_thrashing_detected = FALSE;
660 #endif /* CONFIG_JETSAM */
661
662 void
663 vm_decompressor_lock(void)
664 {
665 PAGE_REPLACEMENT_ALLOWED(TRUE);
666
667 decompressions_blocked = TRUE;
668
669 PAGE_REPLACEMENT_ALLOWED(FALSE);
670 }
671
672 void
673 vm_decompressor_unlock(void)
674 {
675 PAGE_REPLACEMENT_ALLOWED(TRUE);
676
677 decompressions_blocked = FALSE;
678
679 PAGE_REPLACEMENT_ALLOWED(FALSE);
680
681 thread_wakeup((event_t)&decompressions_blocked);
682 }
683
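/*
 * Copy a slot's metadata (size, packed backpointer, codec and any validation
 * hashes) from one c_slot to another; used when compaction relocates
 * compressed data within or between segments.
 */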
684 static inline void
685 cslot_copy(c_slot_t cdst, c_slot_t csrc)
686 {
687 #if CHECKSUM_THE_DATA
688 cdst->c_hash_data = csrc->c_hash_data;
689 #endif
690 #if CHECKSUM_THE_COMPRESSED_DATA
691 cdst->c_hash_compressed_data = csrc->c_hash_compressed_data;
692 #endif
693 #if POPCOUNT_THE_COMPRESSED_DATA
694 cdst->c_pop_cdata = csrc->c_pop_cdata;
695 #endif
696 cdst->c_size = csrc->c_size;
697 cdst->c_packed_ptr = csrc->c_packed_ptr;
698 #if defined(__arm64__)
699 cdst->c_codec = csrc->c_codec;
700 #endif
701 }
702
703 #if XNU_TARGET_OS_OSX
704 #define VM_COMPRESSOR_MAX_POOL_SIZE (192UL << 30)
705 #else
706 #define VM_COMPRESSOR_MAX_POOL_SIZE (0)
707 #endif
708
709 static vm_map_size_t compressor_size;
710 static SECURITY_READ_ONLY_LATE(struct mach_vm_range) compressor_range;
711 vm_map_t compressor_map;
712 uint64_t compressor_pool_max_size;
713 uint64_t compressor_pool_size;
714 uint32_t compressor_pool_multiplier;
715
716 #if DEVELOPMENT || DEBUG
717 /*
718 * Compressor segments are write-protected in development/debug
719 * kernels to help debug memory corruption.
720 * In cases where performance is a concern, this can be disabled
721 * via the boot-arg "-disable_cseg_write_protection".
722 */
723 boolean_t write_protect_c_segs = TRUE;
724 int vm_compressor_test_seg_wp;
725 uint32_t vm_ktrace_enabled;
726 #endif /* DEVELOPMENT || DEBUG */
727
728 #if (XNU_TARGET_OS_OSX && __arm64__)
729
730 #include <IOKit/IOPlatformExpert.h>
731 #include <sys/random.h>
732
733 static const char *csegbufsizeExperimentProperty = "_csegbufsz_experiment";
734 static thread_call_t csegbufsz_experiment_thread_call;
735
736 extern boolean_t IOServiceWaitForMatchingResource(const char * property, uint64_t timeout);
737 static void
738 erase_csegbufsz_experiment_property(__unused void *param0, __unused void *param1)
739 {
740 // Wait for NVRAM to be writable
741 if (!IOServiceWaitForMatchingResource("IONVRAM", UINT64_MAX)) {
742 printf("csegbufsz_experiment_property: Failed to wait for IONVRAM.");
743 }
744
745 if (!PERemoveNVRAMProperty(csegbufsizeExperimentProperty)) {
746 printf("csegbufsize_experiment_property: Failed to remove %s from NVRAM.", csegbufsizeExperimentProperty);
747 }
748 thread_call_free(csegbufsz_experiment_thread_call);
749 }
750
751 static void
752 erase_csegbufsz_experiment_property_async()
753 {
754 csegbufsz_experiment_thread_call = thread_call_allocate_with_priority(
755 erase_csegbufsz_experiment_property,
756 NULL,
757 THREAD_CALL_PRIORITY_LOW
758 );
759 if (csegbufsz_experiment_thread_call == NULL) {
760 printf("csegbufsize_experiment_property: Unable to allocate thread call.");
761 } else {
762 thread_call_enter(csegbufsz_experiment_thread_call);
763 }
764 }
765
766 static void
767 cleanup_csegbufsz_experiment(__unused void *arg0)
768 {
769 char nvram = 0;
770 unsigned int len = sizeof(nvram);
771 if (PEReadNVRAMProperty(csegbufsizeExperimentProperty, &nvram, &len)) {
772 erase_csegbufsz_experiment_property_async();
773 }
774 }
775
776 STARTUP_ARG(EARLY_BOOT, STARTUP_RANK_FIRST, cleanup_csegbufsz_experiment, NULL);
777 #endif /* XNU_TARGET_OS_OSX && __arm64__ */
778
779 #if CONFIG_JETSAM
780 extern unsigned int memorystatus_swap_all_apps;
781 #endif /* CONFIG_JETSAM */
782
783 TUNABLE_DT(uint64_t, swap_vol_min_capacity, "/defaults", "kern.swap_min_capacity", "kern.swap_min_capacity", 0, TUNABLE_DT_NONE);
784
785 static void
786 vm_compressor_set_size(void)
787 {
788 /*
789 * Note that this function may be called multiple times on systems with app swap
790 * because the value of vm_swap_get_max_configured_space() and memorystatus_swap_all_apps
791 * can change based on the size of the swap volume. On these systems, we'll call
792 * this function once early in boot to reserve the maximum amount of VA required
793 * for the compressor submap and then one more time in vm_compressor_init after
794 * determining the swap volume size. We must not return a larger value the second
795 * time around.
796 */
797 vm_size_t c_segments_arr_size = 0;
798 struct c_slot_mapping tmp_slot_ptr;
799
800 /* The segment size can be overwritten by a boot-arg */
801 if (!PE_parse_boot_argn("vm_compressor_segment_buffer_size", &c_seg_bufsize, sizeof(c_seg_bufsize))) {
802 #if CONFIG_JETSAM
803 if (memorystatus_swap_all_apps) {
804 c_seg_bufsize = C_SEG_BUFSIZE_ARM_SWAP;
805 } else {
806 c_seg_bufsize = C_SEG_BUFSIZE_DEFAULT;
807 }
808 #else
809 c_seg_bufsize = C_SEG_BUFSIZE_DEFAULT;
810 #endif /* CONFIG_JETSAM */
811 }
812
813 vm_compressor_swap_init_swap_file_limit();
814 if (vm_compression_limit) {
815 compressor_pool_size = ptoa_64(vm_compression_limit);
816 }
817
818 compressor_pool_max_size = C_SEG_MAX_LIMIT;
819 compressor_pool_max_size *= c_seg_bufsize;
820
821 #if XNU_TARGET_OS_OSX
822
823 if (vm_compression_limit == 0) {
824 if (max_mem <= (4ULL * 1024ULL * 1024ULL * 1024ULL)) {
825 compressor_pool_size = 16ULL * max_mem;
826 } else if (max_mem <= (8ULL * 1024ULL * 1024ULL * 1024ULL)) {
827 compressor_pool_size = 8ULL * max_mem;
828 } else if (max_mem <= (32ULL * 1024ULL * 1024ULL * 1024ULL)) {
829 compressor_pool_size = 4ULL * max_mem;
830 } else {
831 compressor_pool_size = 2ULL * max_mem;
832 }
833 }
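/*
 * Illustrative macOS sizing when no vm_compression_limit boot-arg is set:
 * a 16 GB machine falls in the "<= 32 GB" bucket above, so the virtual pool
 * is sized at 4 * 16 GB = 64 GB of compressor VA (before the cap below).
 */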
834 /*
835 * Cap the compressor pool size to a max of 192G
836 */
837 if (compressor_pool_size > VM_COMPRESSOR_MAX_POOL_SIZE) {
838 compressor_pool_size = VM_COMPRESSOR_MAX_POOL_SIZE;
839 }
840 if (max_mem <= (8ULL * 1024ULL * 1024ULL * 1024ULL)) {
841 compressor_pool_multiplier = 1;
842 } else if (max_mem <= (32ULL * 1024ULL * 1024ULL * 1024ULL)) {
843 compressor_pool_multiplier = 2;
844 } else {
845 compressor_pool_multiplier = 4;
846 }
847
848 #else
849
850 if (compressor_pool_max_size > max_mem) {
851 compressor_pool_max_size = max_mem;
852 }
853
854 if (vm_compression_limit == 0) {
855 compressor_pool_size = max_mem;
856 }
857
858 #if XNU_TARGET_OS_WATCH
859 compressor_pool_multiplier = 2;
860 #elif XNU_TARGET_OS_IOS
861 if (max_mem <= (2ULL * 1024ULL * 1024ULL * 1024ULL)) {
862 compressor_pool_multiplier = 2;
863 } else {
864 compressor_pool_multiplier = 1;
865 }
866 #else
867 compressor_pool_multiplier = 1;
868 #endif
869
870 #endif
871
872 PE_parse_boot_argn("kern.compressor_pool_multiplier", &compressor_pool_multiplier, sizeof(compressor_pool_multiplier));
873 if (compressor_pool_multiplier < 1) {
874 compressor_pool_multiplier = 1;
875 }
876
877 if (compressor_pool_size > compressor_pool_max_size) {
878 compressor_pool_size = compressor_pool_max_size;
879 }
880
881 c_seg_max_pages = (c_seg_bufsize / PAGE_SIZE);
882 c_seg_slot_var_array_min_len = c_seg_max_pages;
883
884 #if !defined(__x86_64__)
885 c_seg_off_limit = (C_SEG_BYTES_TO_OFFSET((c_seg_bufsize - 512)));
886 c_seg_allocsize = (c_seg_bufsize + PAGE_SIZE);
887 #else
888 c_seg_off_limit = (C_SEG_BYTES_TO_OFFSET((c_seg_bufsize - 128)));
889 c_seg_allocsize = c_seg_bufsize;
890 #endif /* !defined(__x86_64__) */
891
892 c_segments_limit = (uint32_t)(compressor_pool_size / (vm_size_t)(c_seg_allocsize));
893 tmp_slot_ptr.s_cseg = c_segments_limit;
894 /* Panic on internal configs */
895 assertf((tmp_slot_ptr.s_cseg == c_segments_limit), "vm_compressor_init: overflowed s_cseg field in c_slot_mapping with c_segno: %d", c_segments_limit);
896
897 if (tmp_slot_ptr.s_cseg != c_segments_limit) {
898 tmp_slot_ptr.s_cseg = -1;
899 c_segments_limit = tmp_slot_ptr.s_cseg - 1; /*limited by segment idx bits in c_slot_mapping*/
900 compressor_pool_size = (c_segments_limit * (vm_size_t)(c_seg_allocsize));
901 }
902
903 c_segments_nearing_limit = (uint32_t)(((uint64_t)c_segments_limit * 98ULL) / 100ULL);
904
905 /* an upper limit on how many input pages the compressor can hold */
906 c_segment_pages_compressed_limit = (c_segments_limit * (c_seg_bufsize / PAGE_SIZE) * compressor_pool_multiplier);
907
908 if (c_segment_pages_compressed_limit < (uint32_t)(max_mem / PAGE_SIZE)) {
909 #if defined(XNU_TARGET_OS_WATCH)
910 c_segment_pages_compressed_limit = (uint32_t)(max_mem / PAGE_SIZE);
911 #else
912 if (!vm_compression_limit) {
913 c_segment_pages_compressed_limit = (uint32_t)(max_mem / PAGE_SIZE);
914 }
915 #endif
916 }
917
918 c_segment_pages_compressed_nearing_limit = (uint32_t)(((uint64_t)c_segment_pages_compressed_limit * 98ULL) / 100ULL);
919
920 #if CONFIG_FREEZE
921 /*
922 * Our in-core limits are based on the size of the compressor pool.
923 * The c_segments_nearing_limit is also based on the compressor pool
924 * size and calculated above.
925 */
926 c_segments_incore_limit = c_segments_limit;
927
928 if (freezer_incore_cseg_acct) {
929 /*
930 * Add enough segments to track all frozen c_segs that can be stored in swap.
931 */
932 c_segments_limit += (uint32_t)(vm_swap_get_max_configured_space() / (vm_size_t)(c_seg_allocsize));
933 tmp_slot_ptr.s_cseg = c_segments_limit;
934 /* Panic on internal configs */
935 assertf((tmp_slot_ptr.s_cseg == c_segments_limit), "vm_compressor_init: freezer reserve overflowed s_cseg field in c_slot_mapping with c_segno: %d", c_segments_limit);
936 }
937 #endif
938 /*
939 * Submap needs space for:
940 * - c_segments
941 * - c_buffers
942 * - swap reclamations -- c_seg_bufsize
943 */
944 c_segments_arr_size = vm_map_round_page((sizeof(union c_segu) * c_segments_limit), VM_MAP_PAGE_MASK(kernel_map));
945 c_buffers_size = vm_map_round_page(((vm_size_t)c_seg_allocsize * (vm_size_t)c_segments_limit), VM_MAP_PAGE_MASK(kernel_map));
946
947 compressor_size = c_segments_arr_size + c_buffers_size + c_seg_bufsize;
948
949 #if RECORD_THE_COMPRESSED_DATA
950 c_compressed_record_sbuf_size = (vm_size_t)c_seg_allocsize + (PAGE_SIZE * 2);
951 compressor_size += c_compressed_record_sbuf_size;
952 #endif /* RECORD_THE_COMPRESSED_DATA */
953 }
954 STARTUP(KMEM, STARTUP_RANK_FIRST, vm_compressor_set_size);
955
956 KMEM_RANGE_REGISTER_DYNAMIC(compressor, &compressor_range, ^() {
957 return compressor_size;
958 });
959
960 bool
961 osenvironment_is_diagnostics(void)
962 {
963 DTEntry chosen;
964 const char *osenvironment;
965 unsigned int size;
966 if (kSuccess == SecureDTLookupEntry(0, "/chosen", &chosen)) {
967 if (kSuccess == SecureDTGetProperty(chosen, "osenvironment", (void const **) &osenvironment, &size)) {
968 return strcmp(osenvironment, "diagnostics") == 0;
969 }
970 }
971 return false;
972 }
973
974 void
975 vm_compressor_init(void)
976 {
977 thread_t thread;
978 #if RECORD_THE_COMPRESSED_DATA
979 vm_size_t c_compressed_record_sbuf_size = 0;
980 #endif /* RECORD_THE_COMPRESSED_DATA */
981
982 #if DEVELOPMENT || DEBUG || CONFIG_FREEZE
983 char bootarg_name[32];
984 #endif /* DEVELOPMENT || DEBUG || CONFIG_FREEZE */
985 __unused uint64_t early_boot_compressor_size = compressor_size;
986
987 #if CONFIG_JETSAM
988 if (memorystatus_swap_all_apps && osenvironment_is_diagnostics()) {
989 printf("osenvironment == \"diagnostics\". Disabling app swap.\n");
990 memorystatus_disable_swap();
991 }
992
993 if (memorystatus_swap_all_apps) {
994 /*
995 * App swap is disabled on devices with small NANDs.
996 * Now that we're no longer in early boot, we can get
997 * the NAND size and re-run vm_compressor_set_size.
998 */
999 int error = vm_swap_vol_get_capacity(SWAP_VOLUME_NAME, &vm_swap_volume_capacity);
1000 #if DEVELOPMENT || DEBUG
1001 if (error != 0) {
1002 panic("vm_compressor_init: Unable to get swap volume capacity. error=%d\n", error);
1003 }
1004 #else
1005 if (error != 0) {
1006 vm_log_error("vm_compressor_init: Unable to get swap volume capacity. error=%d\n", error);
1007 }
1008 #endif /* DEVELOPMENT || DEBUG */
1009 if (vm_swap_volume_capacity < swap_vol_min_capacity) {
1010 memorystatus_disable_swap();
1011 }
1012 /*
1013 * Resize the compressor and swap now that we know the capacity
1014 * of the swap volume.
1015 */
1016 vm_compressor_set_size();
1017 /*
1018 * We reserved a chunk of VA early in boot for the compressor submap.
1019 * We can't allocate more than that.
1020 */
1021 assert(compressor_size <= early_boot_compressor_size);
1022 }
1023 #endif /* CONFIG_JETSAM */
1024
1025 #if DEVELOPMENT || DEBUG
1026 if (PE_parse_boot_argn("-disable_cseg_write_protection", bootarg_name, sizeof(bootarg_name))) {
1027 write_protect_c_segs = FALSE;
1028 }
1029
1030 int vmcval = 1;
1031 #if defined(XNU_TARGET_OS_WATCH)
1032 vmcval = 0;
1033 #endif /* XNU_TARGET_OS_WATCH */
1034 PE_parse_boot_argn("vm_compressor_validation", &vmcval, sizeof(vmcval));
1035
1036 if (kern_feature_override(KF_COMPRSV_OVRD)) {
1037 vmcval = 0;
1038 }
1039
1040 if (vmcval == 0) {
1041 #if POPCOUNT_THE_COMPRESSED_DATA
1042 popcount_c_segs = FALSE;
1043 #endif
1044 #if CHECKSUM_THE_DATA || CHECKSUM_THE_COMPRESSED_DATA
1045 checksum_c_segs = FALSE;
1046 #endif
1047 #if VALIDATE_C_SEGMENTS
1048 validate_c_segs = FALSE;
1049 #endif
1050 write_protect_c_segs = FALSE;
1051 }
1052 #endif /* DEVELOPMENT || DEBUG */
1053
1054 #if CONFIG_FREEZE
1055 if (PE_parse_boot_argn("-disable_freezer_cseg_acct", bootarg_name, sizeof(bootarg_name))) {
1056 freezer_incore_cseg_acct = FALSE;
1057 }
1058 #endif /* CONFIG_FREEZE */
1059
1060 assert((C_SEGMENTS_PER_PAGE * sizeof(union c_segu)) == PAGE_SIZE);
1061
1062 #if !XNU_TARGET_OS_OSX
1063 vm_compressor_minorcompact_threshold_divisor = 20;
1064 vm_compressor_majorcompact_threshold_divisor = 30;
1065 vm_compressor_unthrottle_threshold_divisor = 40;
1066 vm_compressor_catchup_threshold_divisor = 60;
1067 #else /* !XNU_TARGET_OS_OSX */
1068 if (max_mem <= (3ULL * 1024ULL * 1024ULL * 1024ULL)) {
1069 vm_compressor_minorcompact_threshold_divisor = 11;
1070 vm_compressor_majorcompact_threshold_divisor = 13;
1071 vm_compressor_unthrottle_threshold_divisor = 20;
1072 vm_compressor_catchup_threshold_divisor = 35;
1073 } else {
1074 vm_compressor_minorcompact_threshold_divisor = 20;
1075 vm_compressor_majorcompact_threshold_divisor = 25;
1076 vm_compressor_unthrottle_threshold_divisor = 35;
1077 vm_compressor_catchup_threshold_divisor = 50;
1078 }
1079 #endif /* !XNU_TARGET_OS_OSX */
1080
1081 queue_init(&c_bad_list_head);
1082 queue_init(&c_age_list_head);
1083 queue_init(&c_minor_list_head);
1084 queue_init(&c_major_list_head);
1085 queue_init(&c_filling_list_head);
1086 queue_init(&c_early_swapout_list_head);
1087 queue_init(&c_regular_swapout_list_head);
1088 queue_init(&c_late_swapout_list_head);
1089 queue_init(&c_swapio_list_head);
1090 queue_init(&c_early_swappedin_list_head);
1091 queue_init(&c_regular_swappedin_list_head);
1092 queue_init(&c_late_swappedin_list_head);
1093 queue_init(&c_swappedout_list_head);
1094 queue_init(&c_swappedout_sparse_list_head);
1095
1096 c_free_segno_head = -1;
1097 c_segments_available = 0;
1098
1099 compressor_map = kmem_suballoc(kernel_map, &compressor_range.min_address,
1100 compressor_size, VM_MAP_CREATE_NEVER_FAULTS,
1101 VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE,
1102 KMS_NOFAIL | KMS_PERMANENT | KMS_NOSOFTLIMIT,
1103 VM_KERN_MEMORY_COMPRESSOR).kmr_submap;
1104
1105 kmem_alloc(compressor_map, (vm_offset_t *)(&c_segments),
1106 (sizeof(union c_segu) * c_segments_limit),
1107 KMA_NOFAIL | KMA_KOBJECT | KMA_VAONLY | KMA_PERMANENT | KMA_NOSOFTLIMIT,
1108 VM_KERN_MEMORY_COMPRESSOR);
1109 kmem_alloc(compressor_map, &c_buffers, c_buffers_size,
1110 KMA_NOFAIL | KMA_COMPRESSOR | KMA_VAONLY | KMA_PERMANENT | KMA_NOSOFTLIMIT,
1111 VM_KERN_MEMORY_COMPRESSOR);
1112
1113 #if DEVELOPMENT || DEBUG
1114 if (hvg_is_hcall_available(HVG_HCALL_SET_COREDUMP_DATA)) {
1115 hvg_hcall_set_coredump_data();
1116 }
1117 #endif
1118
1119 /*
1120 * Pick a segment struct size that minimizes zalloc fragmentation
1121 * within a 16k allocation run.
1122 *
1123 * c_seg_slot_var_array_min_len is larger on 4k systems than on 16k ones,
1124 * which would make the fragmentation within a 4k page terrible. Sizing
1125 * against a 16k run for all systems matches zalloc() and minimizes fragmentation.
1126 */
1127 uint32_t c_segment_size = sizeof(struct c_segment) + (c_seg_slot_var_array_min_len * sizeof(struct c_slot));
1128 uint32_t cnt = (16 << 10) / c_segment_size;
1129 uint32_t frag = (16 << 10) % c_segment_size;
1130
1131 c_seg_fixed_array_len = c_seg_slot_var_array_min_len;
1132
1133 while (cnt * sizeof(struct c_slot) < frag) {
1134 c_segment_size += sizeof(struct c_slot);
1135 c_seg_fixed_array_len++;
1136 frag -= cnt * sizeof(struct c_slot);
1137 }
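/*
 * Illustrative arithmetic: if sizeof(struct c_segment) plus the minimum inline
 * slot array came to roughly 2 KB, 8 segments would fit in a 16 KB run with
 * some remainder. The loop above grows the inline slot array one c_slot at a
 * time as long as doing so across all 'cnt' segments costs less than that
 * remainder, converting would-be fragmentation into usable slots.
 */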
1138
1139 compressor_segment_zone = zone_create("compressor_segment",
1140 c_segment_size, ZC_NOENCRYPT | ZC_ZFREE_CLEARMEM);
1141
1142 c_segments_busy = FALSE;
1143
1144 c_segments_next_page = (caddr_t)c_segments;
1145 vm_compressor_algorithm_init();
1146
1147 {
1148 host_basic_info_data_t hinfo;
1149 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
1150 size_t bufsize;
1151 char *buf;
1152
1153 #define BSD_HOST 1
1154 host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
1155
1156 compressor_cpus = hinfo.max_cpus;
1157
1158 /* allocate the various scratch buffers from one contiguous allocation */
1159 bufsize = PAGE_SIZE;
1160 bufsize += compressor_cpus * vm_compressor_get_decode_scratch_size();
1161 /* For the panic path */
1162 bufsize += vm_compressor_get_decode_scratch_size();
1163 #if CONFIG_FREEZE
1164 bufsize += vm_compressor_get_encode_scratch_size();
1165 #endif
1166 #if RECORD_THE_COMPRESSED_DATA
1167 bufsize += c_compressed_record_sbuf_size;
1168 #endif
1169
1170 kmem_alloc(kernel_map, (vm_offset_t *)&buf, bufsize,
1171 KMA_DATA_SHARED | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT,
1172 VM_KERN_MEMORY_COMPRESSOR);
1173
1174 /*
1175 * vm_compressor_kdp_state.kc_panic_decompressed_page must be page aligned because we access
1176 * it through the physical aperture by page number.
1177 */
1178 vm_compressor_kdp_state.kc_panic_decompressed_page = buf;
1179 vm_compressor_kdp_state.kc_panic_decompressed_page_paddr = kvtophys((vm_offset_t)vm_compressor_kdp_state.kc_panic_decompressed_page);
1180 vm_compressor_kdp_state.kc_panic_decompressed_page_ppnum = (ppnum_t) atop(vm_compressor_kdp_state.kc_panic_decompressed_page_paddr);
1181 buf += PAGE_SIZE;
1182 bufsize -= PAGE_SIZE;
1183
1184 compressor_scratch_bufs = buf;
1185 buf += compressor_cpus * vm_compressor_get_decode_scratch_size();
1186 bufsize -= compressor_cpus * vm_compressor_get_decode_scratch_size();
1187
1188 vm_compressor_kdp_state.kc_panic_scratch_buf = buf;
1189 buf += vm_compressor_get_decode_scratch_size();
1190 bufsize -= vm_compressor_get_decode_scratch_size();
1191
1192 /* This is set up before each stackshot in vm_compressor_kdp_init */
1193 vm_compressor_kdp_state.kc_scratch_bufs = NULL;
1194
1195 #if CONFIG_FREEZE
1196 freezer_context_global.freezer_ctx_compressor_scratch_buf = buf;
1197 buf += vm_compressor_get_encode_scratch_size();
1198 bufsize -= vm_compressor_get_encode_scratch_size();
1199 #endif
1200
1201 #if RECORD_THE_COMPRESSED_DATA
1202 c_compressed_record_sbuf = buf;
1203 c_compressed_record_cptr = buf;
1204 c_compressed_record_ebuf = c_compressed_record_sbuf + c_compressed_record_sbuf_size;
1205 buf += c_compressed_record_sbuf_size;
1206 bufsize -= c_compressed_record_sbuf_size;
1207 #endif
1208 assert(bufsize == 0);
1209 }
1210
1211 if (kernel_thread_start_priority((thread_continue_t)vm_compressor_swap_trigger_thread, NULL,
1212 BASEPRI_VM, &thread) != KERN_SUCCESS) {
1213 panic("vm_compressor_swap_trigger_thread: create failed");
1214 }
1215 thread_deallocate(thread);
1216
1217 if (vm_pageout_internal_start() != KERN_SUCCESS) {
1218 panic("vm_compressor_init: Failed to start the internal pageout thread.");
1219 }
1220 if (VM_CONFIG_SWAP_IS_PRESENT) {
1221 vm_compressor_swap_init();
1222 }
1223
1224 if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
1225 vm_compressor_is_active = 1;
1226 }
1227
1228 vm_compressor_available = 1;
1229
1230 vm_page_reactivate_all_throttled();
1231
1232 bzero(&vmcs_stats, sizeof(struct vm_compressor_swapper_stats));
1233 }
1234
1235 #define COMPRESSOR_KDP_BUFSIZE ( \
1236 (vm_compressor_get_decode_scratch_size() * compressor_cpus) + \
1237 (PAGE_SIZE * compressor_cpus) + \
1238 (sizeof(*vm_compressor_kdp_state.kc_decompressed_pages_paddr) * compressor_cpus) + \
1239 (sizeof(*vm_compressor_kdp_state.kc_decompressed_pages_ppnum) * compressor_cpus))
1240
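/*
 * The single stackshot allocation is carved up in this order (see
 * vm_compressor_kdp_init below): per-CPU decode scratch buffers, one
 * decompression page per CPU, then the parallel arrays holding those pages'
 * physical addresses and page numbers.
 */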
1241
1242 /**
1243 * Initializes the VM compressor in preparation for a stackshot.
1244 * Stackshot mutex must be held.
1245 */
1246 kern_return_t
1247 vm_compressor_kdp_init(void)
1248 {
1249 char *buf;
1250 kern_return_t err;
1251 size_t bufsize;
1252 size_t total_decode_size;
1253
1254 #if DEVELOPMENT || DEBUG
1255 extern lck_mtx_t stackshot_subsys_mutex;
1256 lck_mtx_assert(&stackshot_subsys_mutex, LCK_MTX_ASSERT_OWNED);
1257 #endif /* DEVELOPMENT || DEBUG */
1258
1259 if (!vm_compressor_available) {
1260 return KERN_SUCCESS;
1261 }
1262
1263 bufsize = COMPRESSOR_KDP_BUFSIZE;
1264
1265 /* Allocate the per-cpu decompression pages. */
1266 err = kmem_alloc(kernel_map, (vm_offset_t *)&buf, bufsize,
1267 KMA_DATA_SHARED | KMA_NOFAIL | KMA_KOBJECT,
1268 VM_KERN_MEMORY_COMPRESSOR);
1269
1270 if (err != KERN_SUCCESS) {
1271 return err;
1272 }
1273
1274 assert(vm_compressor_kdp_state.kc_scratch_bufs == NULL);
1275 vm_compressor_kdp_state.kc_scratch_bufs = buf;
1276 total_decode_size = vm_compressor_get_decode_scratch_size() * compressor_cpus;
1277 buf += total_decode_size;
1278 bufsize -= total_decode_size;
1279
1280 /*
1281 * vm_compressor_kdp_state.kc_decompressed_pages must be page aligned because we access
1282 * it through the physical aperture by page number.
1283 */
1284 assert(vm_compressor_kdp_state.kc_decompressed_pages == NULL);
1285 vm_compressor_kdp_state.kc_decompressed_pages = buf;
1286 buf += PAGE_SIZE * compressor_cpus;
1287 bufsize -= PAGE_SIZE * compressor_cpus;
1288
1289 /* Scary! This will be aligned, I promise :) */
1290 assert(((vm_address_t) buf) % _Alignof(addr64_t) == 0);
1291 assert(vm_compressor_kdp_state.kc_decompressed_pages_paddr == NULL);
1292 vm_compressor_kdp_state.kc_decompressed_pages_paddr = (addr64_t*) (void*) buf;
1293 buf += sizeof(*vm_compressor_kdp_state.kc_decompressed_pages_paddr) * compressor_cpus;
1294 bufsize -= sizeof(*vm_compressor_kdp_state.kc_decompressed_pages_paddr) * compressor_cpus;
1295
1296 assert(((vm_address_t) buf) % _Alignof(ppnum_t) == 0);
1297 assert(vm_compressor_kdp_state.kc_decompressed_pages_ppnum == NULL);
1298 vm_compressor_kdp_state.kc_decompressed_pages_ppnum = (ppnum_t*) (void*) buf;
1299 buf += sizeof(*vm_compressor_kdp_state.kc_decompressed_pages_ppnum) * compressor_cpus;
1300 bufsize -= sizeof(*vm_compressor_kdp_state.kc_decompressed_pages_ppnum) * compressor_cpus;
1301
1302 assert(bufsize == 0);
1303
1304 for (size_t i = 0; i < compressor_cpus; i++) {
1305 vm_offset_t offset = (vm_offset_t) &vm_compressor_kdp_state.kc_decompressed_pages[i * PAGE_SIZE];
1306 vm_compressor_kdp_state.kc_decompressed_pages_paddr[i] = kvtophys(offset);
1307 vm_compressor_kdp_state.kc_decompressed_pages_ppnum[i] = (ppnum_t) atop(vm_compressor_kdp_state.kc_decompressed_pages_paddr[i]);
1308 }
1309
1310 return KERN_SUCCESS;
1311 }
1312
1313 /*
1314 * Frees up compressor buffers used by stackshot.
1315 * Stackshot mutex must be held.
1316 */
1317 void
1318 vm_compressor_kdp_teardown(void)
1319 {
1320 extern lck_mtx_t stackshot_subsys_mutex;
1321 LCK_MTX_ASSERT(&stackshot_subsys_mutex, LCK_MTX_ASSERT_OWNED);
1322
1323 if (vm_compressor_kdp_state.kc_scratch_bufs == NULL) {
1324 return;
1325 }
1326
1327 /* Deallocate the per-cpu decompression pages. */
1328 kmem_free(kernel_map, (vm_offset_t) vm_compressor_kdp_state.kc_scratch_bufs, COMPRESSOR_KDP_BUFSIZE);
1329
1330 vm_compressor_kdp_state.kc_scratch_bufs = NULL;
1331 vm_compressor_kdp_state.kc_decompressed_pages = NULL;
1332 vm_compressor_kdp_state.kc_decompressed_pages_paddr = 0;
1333 vm_compressor_kdp_state.kc_decompressed_pages_ppnum = 0;
1334 }
1335
1336 static uint32_t
1337 c_slot_extra_size(c_slot_t cs)
1338 {
1339 #pragma unused(cs)
1340 return 0;
1341 }
1342
1343 #if VALIDATE_C_SEGMENTS
1344
1345 static void
1346 c_seg_validate(c_segment_t c_seg, boolean_t must_be_compact)
1347 {
1348 uint16_t c_indx;
1349 int32_t bytes_used;
1350 uint32_t c_rounded_size;
1351 uint32_t c_size;
1352 c_slot_t cs;
1353
1354 if (__probable(validate_c_segs == FALSE)) {
1355 return;
1356 }
1357 if (c_seg->c_firstemptyslot < c_seg->c_nextslot) {
1358 c_indx = c_seg->c_firstemptyslot;
1359 cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
1360
1361 if (cs == NULL) {
1362 panic("c_seg_validate: no slot backing c_firstemptyslot");
1363 }
1364
1365 if (cs->c_size) {
1366 panic("c_seg_validate: c_firstemptyslot has non-zero size (%d)", cs->c_size);
1367 }
1368 }
1369 bytes_used = 0;
1370
1371 for (c_indx = 0; c_indx < c_seg->c_nextslot; c_indx++) {
1372 cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
1373
1374 c_size = UNPACK_C_SIZE(cs);
1375
1376 c_rounded_size = C_SEG_ROUND_TO_ALIGNMENT(c_size + c_slot_extra_size(cs));
1377
1378 bytes_used += c_rounded_size;
1379
1380 #if CHECKSUM_THE_COMPRESSED_DATA
1381 unsigned csvhash;
1382 if (c_size && cs->c_hash_compressed_data != (csvhash = vmc_hash((char *)&c_seg->c_store.c_buffer[cs->c_offset], c_size))) {
1383 addr64_t csvphys = kvtophys((vm_offset_t)&c_seg->c_store.c_buffer[cs->c_offset]);
1384 panic("Compressed data doesn't match original %p phys: 0x%llx %d %p %d %d 0x%x 0x%x", c_seg, csvphys, cs->c_offset, cs, c_indx, c_size, cs->c_hash_compressed_data, csvhash);
1385 }
1386 #endif
1387 #if POPCOUNT_THE_COMPRESSED_DATA
1388 unsigned csvpop;
1389 if (c_size) {
1390 uintptr_t csvaddr = (uintptr_t) &c_seg->c_store.c_buffer[cs->c_offset];
1391 if (cs->c_pop_cdata != (csvpop = vmc_pop(csvaddr, c_size))) {
1392 panic("Compressed data popcount doesn't match original, bit distance: %d %p (phys: %p) %p %p 0x%llx 0x%x 0x%x 0x%x", (csvpop - cs->c_pop_cdata), (void *)csvaddr, (void *) kvtophys(csvaddr), c_seg, cs, (uint64_t)cs->c_offset, c_size, csvpop, cs->c_pop_cdata);
1393 }
1394 }
1395 #endif
1396 }
1397
1398 if (bytes_used != c_seg->c_bytes_used) {
1399 panic("c_seg_validate: bytes_used mismatch - found %d, segment has %d", bytes_used, c_seg->c_bytes_used);
1400 }
1401
1402 if (c_seg->c_bytes_used > C_SEG_OFFSET_TO_BYTES((int32_t)c_seg->c_nextoffset)) {
1403 panic("c_seg_validate: c_bytes_used > c_nextoffset - c_nextoffset = %d, c_bytes_used = %d",
1404 (int32_t)C_SEG_OFFSET_TO_BYTES((int32_t)c_seg->c_nextoffset), c_seg->c_bytes_used);
1405 }
1406
1407 if (must_be_compact) {
1408 if (c_seg->c_bytes_used != C_SEG_OFFSET_TO_BYTES((int32_t)c_seg->c_nextoffset)) {
1409 panic("c_seg_validate: c_bytes_used doesn't match c_nextoffset - c_nextoffset = %d, c_bytes_used = %d",
1410 (int32_t)C_SEG_OFFSET_TO_BYTES((int32_t)c_seg->c_nextoffset), c_seg->c_bytes_used);
1411 }
1412 }
1413 }
1414
1415 #endif
1416
1417
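/*
 * Mark a segment as a candidate for delayed (minor) compaction by putting it
 * on c_minor_list_head. If the caller doesn't already hold c_list_lock we may
 * have to drop and retake the segment lock to honor the c_list_lock ->
 * c_seg->c_lock ordering, marking the segment busy across that window.
 */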
1418 void
1419 c_seg_need_delayed_compaction(c_segment_t c_seg, boolean_t c_list_lock_held)
1420 {
1421 boolean_t clear_busy = FALSE;
1422
1423 if (c_list_lock_held == FALSE) {
1424 if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
1425 C_SEG_BUSY(c_seg);
1426
1427 lck_mtx_unlock_always(&c_seg->c_lock);
1428 lck_mtx_lock_spin_always(c_list_lock);
1429 lck_mtx_lock_spin_always(&c_seg->c_lock);
1430
1431 clear_busy = TRUE;
1432 }
1433 }
1434 assert(c_seg->c_state != C_IS_FILLING);
1435
1436 if (!c_seg->c_on_minorcompact_q && !(C_SEG_IS_ON_DISK_OR_SOQ(c_seg)) && !c_seg->c_has_donated_pages) {
1437 queue_enter(&c_minor_list_head, c_seg, c_segment_t, c_list);
1438 c_seg->c_on_minorcompact_q = 1;
1439 os_atomic_inc(&c_minor_count, relaxed);
1440 }
1441 if (c_list_lock_held == FALSE) {
1442 lck_mtx_unlock_always(c_list_lock);
1443 }
1444
1445 if (clear_busy == TRUE) {
1446 C_SEG_WAKEUP_DONE(c_seg);
1447 }
1448 }
1449
1450
1451 unsigned int c_seg_moved_to_sparse_list = 0;
1452
1453 void
1454 c_seg_move_to_sparse_list(c_segment_t c_seg)
1455 {
1456 boolean_t clear_busy = FALSE;
1457
1458 if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
1459 C_SEG_BUSY(c_seg);
1460
1461 lck_mtx_unlock_always(&c_seg->c_lock);
1462 lck_mtx_lock_spin_always(c_list_lock);
1463 lck_mtx_lock_spin_always(&c_seg->c_lock);
1464
1465 clear_busy = TRUE;
1466 }
1467 c_seg_switch_state(c_seg, C_ON_SWAPPEDOUTSPARSE_Q, FALSE);
1468
1469 c_seg_moved_to_sparse_list++;
1470
1471 lck_mtx_unlock_always(c_list_lock);
1472
1473 if (clear_busy == TRUE) {
1474 C_SEG_WAKEUP_DONE(c_seg);
1475 }
1476 }
1477
1478
1479
1480
1481 int try_minor_compaction_failed = 0;
1482 int try_minor_compaction_succeeded = 0;
1483
1484 void
1485 c_seg_try_minor_compaction_and_unlock(c_segment_t c_seg)
1486 {
1487 assert(c_seg->c_on_minorcompact_q);
1488 /*
1489 * c_seg is currently on the delayed minor compaction
1490 * queue and we have c_seg locked... if we can get the
1491 * c_list_lock w/o blocking (if we blocked we could deadlock
1492 * because the lock order is c_list_lock then c_seg's lock)
1493 * we'll pull it from the delayed list and minor-compact it directly
1494 */
1495 if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
1496 /*
1497 * c_list_lock is held, we need to bail
1498 */
1499 try_minor_compaction_failed++;
1500
1501 lck_mtx_unlock_always(&c_seg->c_lock);
1502 } else {
1503 try_minor_compaction_succeeded++;
1504
1505 C_SEG_BUSY(c_seg);
1506 c_seg_do_minor_compaction_and_unlock(c_seg, TRUE, FALSE, FALSE);
1507 }
1508 }
1509
1510
1511 int
1512 c_seg_do_minor_compaction_and_unlock(c_segment_t c_seg, boolean_t clear_busy, boolean_t need_list_lock, boolean_t disallow_page_replacement)
1513 {
1514 int c_seg_freed;
1515
1516 assert(c_seg->c_busy);
1517 assert(!C_SEG_IS_ON_DISK_OR_SOQ(c_seg));
1518
1519 /*
1520 * check for the case that can occur when we are not swapping
1521 * and this segment has been major compacted in the past
1522 * and moved to the majorcompact q to remove it from further
1523 * consideration... if the occupancy falls too low we need
1524 * to put it back on the age_q so that it will be considered
1525 * in the next major compaction sweep... if we don't do this
1526 * we will eventually run into the c_segments_limit
1527 */
1528 if (c_seg->c_state == C_ON_MAJORCOMPACT_Q && C_SEG_SHOULD_MAJORCOMPACT_NOW(c_seg)) {
1529 c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
1530 }
1531 if (!c_seg->c_on_minorcompact_q) {
1532 if (clear_busy == TRUE) {
1533 C_SEG_WAKEUP_DONE(c_seg);
1534 }
1535
1536 lck_mtx_unlock_always(&c_seg->c_lock);
1537
1538 return 0;
1539 }
1540 queue_remove(&c_minor_list_head, c_seg, c_segment_t, c_list);
1541 c_seg->c_on_minorcompact_q = 0;
1542 os_atomic_dec(&c_minor_count, relaxed);
1543
1544 lck_mtx_unlock_always(c_list_lock);
1545
1546 if (disallow_page_replacement == TRUE) {
1547 lck_mtx_unlock_always(&c_seg->c_lock);
1548
1549 PAGE_REPLACEMENT_DISALLOWED(TRUE);
1550
1551 lck_mtx_lock_spin_always(&c_seg->c_lock);
1552 }
1553 c_seg_freed = c_seg_minor_compaction_and_unlock(c_seg, clear_busy);
1554
1555 if (disallow_page_replacement == TRUE) {
1556 PAGE_REPLACEMENT_DISALLOWED(FALSE);
1557 }
1558
1559 if (need_list_lock == TRUE) {
1560 lck_mtx_lock_spin_always(c_list_lock);
1561 }
1562
1563 return c_seg_freed;
1564 }
1565
1566 void
1567 kdp_compressor_busy_find_owner(event64_t wait_event, thread_waitinfo_t *waitinfo)
1568 {
1569 c_segment_t c_seg = (c_segment_t) wait_event;
1570
1571 waitinfo->owner = thread_tid(c_seg->c_busy_for_thread);
1572 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(c_seg);
1573 }
1574
1575 #if DEVELOPMENT || DEBUG
1576 int
1577 do_cseg_wedge_thread(void)
1578 {
1579 struct c_segment c_seg;
1580 c_seg.c_busy_for_thread = current_thread();
1581
1582 debug_cseg_wait_event = (event_t) &c_seg;
1583
1584 thread_set_pending_block_hint(current_thread(), kThreadWaitCompressor);
1585 assert_wait((event_t) (&c_seg), THREAD_INTERRUPTIBLE);
1586
1587 thread_block(THREAD_CONTINUE_NULL);
1588
1589 return 0;
1590 }
1591
1592 int
1593 do_cseg_unwedge_thread(void)
1594 {
1595 thread_wakeup(debug_cseg_wait_event);
1596 debug_cseg_wait_event = NULL;
1597
1598 return 0;
1599 }
1600 #endif /* DEVELOPMENT || DEBUG */
1601
1602 void
1603 c_seg_wait_on_busy(c_segment_t c_seg)
1604 {
1605 c_seg->c_wanted = 1;
1606
1607 thread_set_pending_block_hint(current_thread(), kThreadWaitCompressor);
1608 assert_wait((event_t) (c_seg), THREAD_UNINT);
1609
1610 lck_mtx_unlock_always(&c_seg->c_lock);
1611 thread_block(THREAD_CONTINUE_NULL);
1612 }
1613
1614 #if CONFIG_FREEZE
1615 /*
1616 * We don't have the task lock held while updating the task's
1617 * c_seg queues. We can do that because of the following restrictions:
1618 *
1619 * - SINGLE FREEZER CONTEXT:
1620 * We 'insert' c_segs into the task list on the task_freeze path.
1621 * There can only be one such freeze in progress and the task
1622 * isn't disappearing because we have the VM map lock held throughout
1623 * and we have a reference on the proc too.
1624 *
1625 * - SINGLE TASK DISOWN CONTEXT:
1626 * We 'disown' c_segs of a task ONLY from the task_terminate context. So
1627 * we don't need the task lock but we need the c_list_lock and the
1628 * compressor master lock (shared). We also hold the individual
1629 * c_seg locks (exclusive).
1630 *
1631 * If we either:
1632 * - can't get the c_seg lock on a try, then we start again because maybe
1633 * the c_seg is part of a compaction and might get freed. So we can't trust
1634 * that linkage and need to restart our queue traversal.
1635  * - OR, if we run into a busy c_seg (say, one being swapped in or being freed), we
1636  * drop all locks again, wait, and restart our queue traversal.
1637 *
1638 * - The new_owner_task below is currently only the kernel or NULL.
1639 *
1640 */
1641 void
1642 c_seg_update_task_owner(c_segment_t c_seg, task_t new_owner_task)
1643 {
1644 task_t owner_task = c_seg->c_task_owner;
1645 uint64_t uncompressed_bytes = ((c_seg->c_slots_used) * PAGE_SIZE_64);
1646
1647 LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED);
1648 LCK_MTX_ASSERT(&c_seg->c_lock, LCK_MTX_ASSERT_OWNED);
1649
1650 if (owner_task) {
1651 task_update_frozen_to_swap_acct(owner_task, uncompressed_bytes, DEBIT_FROM_SWAP);
1652 queue_remove(&owner_task->task_frozen_cseg_q, c_seg,
1653 c_segment_t, c_task_list_next_cseg);
1654 }
1655
1656 if (new_owner_task) {
1657 queue_enter(&new_owner_task->task_frozen_cseg_q, c_seg,
1658 c_segment_t, c_task_list_next_cseg);
1659 task_update_frozen_to_swap_acct(new_owner_task, uncompressed_bytes, CREDIT_TO_SWAP);
1660 }
1661
1662 c_seg->c_task_owner = new_owner_task;
1663 }
1664
1665 void
1666 task_disown_frozen_csegs(task_t owner_task)
1667 {
1668 c_segment_t c_seg = NULL, next_cseg = NULL;
1669
1670 again:
1671 PAGE_REPLACEMENT_DISALLOWED(TRUE);
1672 lck_mtx_lock_spin_always(c_list_lock);
1673
1674 for (c_seg = (c_segment_t) queue_first(&owner_task->task_frozen_cseg_q);
1675 !queue_end(&owner_task->task_frozen_cseg_q, (queue_entry_t) c_seg);
1676 c_seg = next_cseg) {
1677 next_cseg = (c_segment_t) queue_next(&c_seg->c_task_list_next_cseg);
1678
1679 if (!lck_mtx_try_lock_spin_always(&c_seg->c_lock)) {
1680 lck_mtx_unlock(c_list_lock);
1681 PAGE_REPLACEMENT_DISALLOWED(FALSE);
1682 goto again;
1683 }
1684
1685 if (c_seg->c_busy) {
1686 lck_mtx_unlock(c_list_lock);
1687 PAGE_REPLACEMENT_DISALLOWED(FALSE);
1688
1689 c_seg_wait_on_busy(c_seg);
1690
1691 goto again;
1692 }
1693 assert(c_seg->c_task_owner == owner_task);
1694 c_seg_update_task_owner(c_seg, kernel_task);
1695 lck_mtx_unlock_always(&c_seg->c_lock);
1696 }
1697
1698 lck_mtx_unlock(c_list_lock);
1699 PAGE_REPLACEMENT_DISALLOWED(FALSE);
1700 }
1701 #endif /* CONFIG_FREEZE */
1702
1703 void
1704 c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head)
1705 {
1706 int old_state = c_seg->c_state;
1707 queue_head_t *donate_swapout_list_head, *donate_swappedin_list_head;
1708 uint32_t *donate_swapout_count, *donate_swappedin_count;
1709
1710 /*
1711  * On macOS the donate queue is swapped first, i.e. the c_early_swapout queue.
1712 * On other swap-capable platforms, we want to swap those out last. So we
1713 * use the c_late_swapout queue.
1714 */
1715 #if XNU_TARGET_OS_OSX /* tag:DONATE */
1716 #if (DEVELOPMENT || DEBUG)
1717 if (new_state != C_IS_FILLING) {
1718 LCK_MTX_ASSERT(&c_seg->c_lock, LCK_MTX_ASSERT_OWNED);
1719 }
1720 LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED);
1721 #endif /* DEVELOPMENT || DEBUG */
1722
1723 donate_swapout_list_head = &c_early_swapout_list_head;
1724 donate_swapout_count = &c_early_swapout_count;
1725 donate_swappedin_list_head = &c_early_swappedin_list_head;
1726 donate_swappedin_count = &c_early_swappedin_count;
1727 #else /* XNU_TARGET_OS_OSX */
1728 donate_swapout_list_head = &c_late_swapout_list_head;
1729 donate_swapout_count = &c_late_swapout_count;
1730 donate_swappedin_list_head = &c_late_swappedin_list_head;
1731 donate_swappedin_count = &c_late_swappedin_count;
1732 #endif /* XNU_TARGET_OS_OSX */
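/*
 * Net effect of the selection above (summary, not normative): on macOS the
 * donated traffic shares the early queues (c_early_swapout_list_head /
 * c_early_swappedin_list_head), while on other swap-capable platforms it is
 * routed to the late queues (c_late_swapout_list_head /
 * c_late_swappedin_list_head), so donated segments are swapped out last there.
 */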
1733
1734 switch (old_state) {
1735 case C_IS_EMPTY:
1736 assert(new_state == C_IS_FILLING || new_state == C_IS_FREE);
1737
1738 c_empty_count--;
1739 break;
1740
1741 case C_IS_FILLING:
1742 assert(new_state == C_ON_AGE_Q || new_state == C_ON_SWAPOUT_Q);
1743
1744 queue_remove(&c_filling_list_head, c_seg, c_segment_t, c_age_list);
1745 c_filling_count--;
1746 break;
1747
1748 case C_ON_AGE_Q:
1749 assert(new_state == C_ON_SWAPOUT_Q || new_state == C_ON_MAJORCOMPACT_Q ||
1750 new_state == C_IS_FREE);
1751
1752 queue_remove(&c_age_list_head, c_seg, c_segment_t, c_age_list);
1753 c_age_count--;
1754 break;
1755
1756 case C_ON_SWAPPEDIN_Q:
1757 if (c_seg->c_has_donated_pages) {
1758 assert(new_state == C_ON_SWAPOUT_Q || new_state == C_IS_FREE);
1759 queue_remove(donate_swappedin_list_head, c_seg, c_segment_t, c_age_list);
1760 *donate_swappedin_count -= 1;
1761 } else {
1762 assert(new_state == C_ON_AGE_Q || new_state == C_IS_FREE);
1763 #if CONFIG_FREEZE
1764 assert(c_seg->c_has_freezer_pages);
1765 queue_remove(&c_early_swappedin_list_head, c_seg, c_segment_t, c_age_list);
1766 c_early_swappedin_count--;
1767 #else /* CONFIG_FREEZE */
1768 queue_remove(&c_regular_swappedin_list_head, c_seg, c_segment_t, c_age_list);
1769 c_regular_swappedin_count--;
1770 #endif /* CONFIG_FREEZE */
1771 }
1772 break;
1773
1774 case C_ON_SWAPOUT_Q:
1775 assert(new_state == C_ON_AGE_Q || new_state == C_IS_FREE || new_state == C_IS_EMPTY || new_state == C_ON_SWAPIO_Q);
1776
1777 #if CONFIG_FREEZE
1778 if (c_seg->c_has_freezer_pages) {
1779 if (c_seg->c_task_owner && (new_state != C_ON_SWAPIO_Q)) {
1780 c_seg_update_task_owner(c_seg, NULL);
1781 }
1782 queue_remove(&c_early_swapout_list_head, c_seg, c_segment_t, c_age_list);
1783 c_early_swapout_count--;
1784 } else
1785 #endif /* CONFIG_FREEZE */
1786 {
1787 if (c_seg->c_has_donated_pages) {
1788 queue_remove(donate_swapout_list_head, c_seg, c_segment_t, c_age_list);
1789 *donate_swapout_count -= 1;
1790 } else {
1791 queue_remove(&c_regular_swapout_list_head, c_seg, c_segment_t, c_age_list);
1792 c_regular_swapout_count--;
1793 }
1794 }
1795
1796 if (new_state == C_ON_AGE_Q) {
1797 c_seg->c_has_donated_pages = 0;
1798 }
1799 thread_wakeup((event_t)&compaction_swapper_running);
1800 break;
1801
1802 case C_ON_SWAPIO_Q:
1803 #if CONFIG_FREEZE
1804 if (c_seg->c_has_freezer_pages) {
1805 assert(new_state == C_ON_SWAPPEDOUT_Q || new_state == C_ON_SWAPPEDOUTSPARSE_Q || new_state == C_ON_AGE_Q);
1806 } else
1807 #endif /* CONFIG_FREEZE */
1808 {
1809 if (c_seg->c_has_donated_pages) {
1810 assert(new_state == C_ON_SWAPPEDOUT_Q || new_state == C_ON_SWAPPEDOUTSPARSE_Q || new_state == C_ON_SWAPPEDIN_Q);
1811 } else {
1812 assert(new_state == C_ON_SWAPPEDOUT_Q || new_state == C_ON_SWAPPEDOUTSPARSE_Q || new_state == C_ON_AGE_Q);
1813 }
1814 }
1815
1816 queue_remove(&c_swapio_list_head, c_seg, c_segment_t, c_age_list);
1817 c_swapio_count--;
1818 break;
1819
1820 case C_ON_SWAPPEDOUT_Q:
1821 assert(new_state == C_ON_SWAPPEDIN_Q || new_state == C_ON_AGE_Q ||
1822 new_state == C_ON_SWAPPEDOUTSPARSE_Q ||
1823 new_state == C_ON_BAD_Q || new_state == C_IS_EMPTY || new_state == C_IS_FREE);
1824
1825 queue_remove(&c_swappedout_list_head, c_seg, c_segment_t, c_age_list);
1826 c_swappedout_count--;
1827 break;
1828
1829 case C_ON_SWAPPEDOUTSPARSE_Q:
1830 assert(new_state == C_ON_SWAPPEDIN_Q || new_state == C_ON_AGE_Q ||
1831 new_state == C_ON_BAD_Q || new_state == C_IS_EMPTY || new_state == C_IS_FREE);
1832
1833 queue_remove(&c_swappedout_sparse_list_head, c_seg, c_segment_t, c_age_list);
1834 c_swappedout_sparse_count--;
1835 break;
1836
1837 case C_ON_MAJORCOMPACT_Q:
1838 assert(new_state == C_ON_AGE_Q || new_state == C_IS_FREE);
1839
1840 queue_remove(&c_major_list_head, c_seg, c_segment_t, c_age_list);
1841 c_major_count--;
1842 break;
1843
1844 case C_ON_BAD_Q:
1845 assert(new_state == C_IS_FREE);
1846
1847 queue_remove(&c_bad_list_head, c_seg, c_segment_t, c_age_list);
1848 c_bad_count--;
1849 break;
1850
1851 default:
1852 panic("c_seg %p has bad c_state = %d", c_seg, old_state);
1853 }
1854
1855 switch (new_state) {
1856 case C_IS_FREE:
1857 assert(old_state != C_IS_FILLING);
1858
1859 break;
1860
1861 case C_IS_EMPTY:
1862 assert(old_state == C_ON_SWAPOUT_Q || old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q);
1863
1864 c_empty_count++;
1865 break;
1866
1867 case C_IS_FILLING:
1868 assert(old_state == C_IS_EMPTY);
1869
1870 queue_enter(&c_filling_list_head, c_seg, c_segment_t, c_age_list);
1871 c_filling_count++;
1872 break;
1873
1874 case C_ON_AGE_Q:
1875 assert(old_state == C_IS_FILLING || old_state == C_ON_SWAPPEDIN_Q ||
1876 old_state == C_ON_SWAPOUT_Q || old_state == C_ON_SWAPIO_Q ||
1877 old_state == C_ON_MAJORCOMPACT_Q || old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q);
1878
1879 assert(!c_seg->c_has_donated_pages);
1880 if (old_state == C_IS_FILLING) {
1881 queue_enter(&c_age_list_head, c_seg, c_segment_t, c_age_list);
1882 } else {
1883 if (!queue_empty(&c_age_list_head)) {
1884 c_segment_t c_first;
1885
1886 c_first = (c_segment_t)queue_first(&c_age_list_head);
1887 c_seg->c_creation_ts = c_first->c_creation_ts;
1888 }
1889 queue_enter_first(&c_age_list_head, c_seg, c_segment_t, c_age_list);
1890 }
1891 c_age_count++;
1892 break;
1893
1894 case C_ON_SWAPPEDIN_Q:
1895 {
1896 queue_head_t *list_head;
1897
1898 assert(old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q || old_state == C_ON_SWAPIO_Q);
1899 if (c_seg->c_has_donated_pages) {
1900 /* A swapout error can occur while the c_seg is still on the swapio queue */
1901 list_head = donate_swappedin_list_head;
1902 *donate_swappedin_count += 1;
1903 } else {
1904 #if CONFIG_FREEZE
1905 assert(c_seg->c_has_freezer_pages);
1906 list_head = &c_early_swappedin_list_head;
1907 c_early_swappedin_count++;
1908 #else /* CONFIG_FREEZE */
1909 list_head = &c_regular_swappedin_list_head;
1910 c_regular_swappedin_count++;
1911 #endif /* CONFIG_FREEZE */
1912 }
1913
1914 if (insert_head == TRUE) {
1915 queue_enter_first(list_head, c_seg, c_segment_t, c_age_list);
1916 } else {
1917 queue_enter(list_head, c_seg, c_segment_t, c_age_list);
1918 }
1919 break;
1920 }
1921
1922 case C_ON_SWAPOUT_Q:
1923 {
1924 queue_head_t *list_head;
1925
1926 #if CONFIG_FREEZE
1927 /*
1928  * A segment carrying both identities (frozen + donated pages)
1929  * will be put on the early swapout Q, i.e. the frozen identity wins.
1930  * This is because when both identities are set, the donation bit
1931  * is added afterwards in the c_current_seg_filled path for accounting
1932 * purposes.
1933 */
1934 if (c_seg->c_has_freezer_pages) {
1935 assert(old_state == C_ON_AGE_Q || old_state == C_IS_FILLING);
1936 list_head = &c_early_swapout_list_head;
1937 c_early_swapout_count++;
1938 } else
1939 #endif
1940 {
1941 if (c_seg->c_has_donated_pages) {
1942 assert(old_state == C_ON_SWAPPEDIN_Q || old_state == C_IS_FILLING);
1943 list_head = donate_swapout_list_head;
1944 *donate_swapout_count += 1;
1945 } else {
1946 assert(old_state == C_ON_AGE_Q || old_state == C_IS_FILLING);
1947 list_head = &c_regular_swapout_list_head;
1948 c_regular_swapout_count++;
1949 }
1950 }
1951
1952 if (insert_head == TRUE) {
1953 queue_enter_first(list_head, c_seg, c_segment_t, c_age_list);
1954 } else {
1955 queue_enter(list_head, c_seg, c_segment_t, c_age_list);
1956 }
1957 break;
1958 }
1959
1960 case C_ON_SWAPIO_Q:
1961 assert(old_state == C_ON_SWAPOUT_Q);
1962
1963 if (insert_head == TRUE) {
1964 queue_enter_first(&c_swapio_list_head, c_seg, c_segment_t, c_age_list);
1965 } else {
1966 queue_enter(&c_swapio_list_head, c_seg, c_segment_t, c_age_list);
1967 }
1968 c_swapio_count++;
1969 break;
1970
1971 case C_ON_SWAPPEDOUT_Q:
1972 assert(old_state == C_ON_SWAPIO_Q);
1973
1974 if (insert_head == TRUE) {
1975 queue_enter_first(&c_swappedout_list_head, c_seg, c_segment_t, c_age_list);
1976 } else {
1977 queue_enter(&c_swappedout_list_head, c_seg, c_segment_t, c_age_list);
1978 }
1979 c_swappedout_count++;
1980 break;
1981
1982 case C_ON_SWAPPEDOUTSPARSE_Q:
1983 assert(old_state == C_ON_SWAPIO_Q || old_state == C_ON_SWAPPEDOUT_Q);
1984
1985 if (insert_head == TRUE) {
1986 queue_enter_first(&c_swappedout_sparse_list_head, c_seg, c_segment_t, c_age_list);
1987 } else {
1988 queue_enter(&c_swappedout_sparse_list_head, c_seg, c_segment_t, c_age_list);
1989 }
1990
1991 c_swappedout_sparse_count++;
1992 break;
1993
1994 case C_ON_MAJORCOMPACT_Q:
1995 assert(old_state == C_ON_AGE_Q);
1996 assert(!c_seg->c_has_donated_pages);
1997
1998 if (insert_head == TRUE) {
1999 queue_enter_first(&c_major_list_head, c_seg, c_segment_t, c_age_list);
2000 } else {
2001 queue_enter(&c_major_list_head, c_seg, c_segment_t, c_age_list);
2002 }
2003 c_major_count++;
2004 break;
2005
2006 case C_ON_BAD_Q:
2007 assert(old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q);
2008
2009 if (insert_head == TRUE) {
2010 queue_enter_first(&c_bad_list_head, c_seg, c_segment_t, c_age_list);
2011 } else {
2012 queue_enter(&c_bad_list_head, c_seg, c_segment_t, c_age_list);
2013 }
2014 c_bad_count++;
2015 break;
2016
2017 default:
2018 panic("c_seg %p requesting bad c_state = %d", c_seg, new_state);
2019 }
2020 c_seg->c_state = new_state;
2021 }
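/*
 * Taken together, the asserts above encode the segment state machine. One
 * common life cycle, reconstructed from those asserts (other paths exist,
 * e.g. most states may also transition directly to C_IS_FREE):
 *
 *	C_IS_EMPTY -> C_IS_FILLING -> C_ON_AGE_Q -> C_ON_SWAPOUT_Q
 *	    -> C_ON_SWAPIO_Q -> C_ON_SWAPPEDOUT_Q -> C_ON_SWAPPEDIN_Q
 *	    -> C_ON_AGE_Q -> C_ON_MAJORCOMPACT_Q -> C_ON_AGE_Q -> ...
 *
 * C_ON_BAD_Q is only entered from the swapped-out states and only exits
 * to C_IS_FREE.
 */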
2022
2023
2024
2025 void
2026 c_seg_free(c_segment_t c_seg)
2027 {
2028 assert(c_seg->c_busy);
2029
2030 lck_mtx_unlock_always(&c_seg->c_lock);
2031 lck_mtx_lock_spin_always(c_list_lock);
2032 lck_mtx_lock_spin_always(&c_seg->c_lock);
2033
2034 c_seg_free_locked(c_seg);
2035 }
2036
2037
2038 void
2039 c_seg_free_locked(c_segment_t c_seg)
2040 {
2041 int segno;
2042 int pages_populated = 0;
2043 int32_t *c_buffer = NULL;
2044 uint64_t c_swap_handle = 0;
2045
2046 assert(c_seg->c_busy);
2047 assert(c_seg->c_slots_used == 0);
2048 assert(!c_seg->c_on_minorcompact_q);
2049 assert(!c_seg->c_busy_swapping);
2050
2051 if (c_seg->c_overage_swap == TRUE) {
2052 c_overage_swapped_count--;
2053 c_seg->c_overage_swap = FALSE;
2054 }
2055 if (!(C_SEG_IS_ONDISK(c_seg))) {
2056 c_buffer = c_seg->c_store.c_buffer;
2057 } else {
2058 c_swap_handle = c_seg->c_store.c_swap_handle;
2059 }
2060
2061 c_seg_switch_state(c_seg, C_IS_FREE, FALSE);
2062
2063 if (c_buffer) {
2064 pages_populated = (round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset))) / PAGE_SIZE;
2065 c_seg->c_store.c_buffer = NULL;
2066 } else {
2067 #if CONFIG_FREEZE
2068 c_seg_update_task_owner(c_seg, NULL);
2069 #endif /* CONFIG_FREEZE */
2070
2071 c_seg->c_store.c_swap_handle = (uint64_t)-1;
2072 }
2073
2074 lck_mtx_unlock_always(&c_seg->c_lock);
2075
2076 lck_mtx_unlock_always(c_list_lock);
2077
2078 if (c_buffer) {
2079 if (pages_populated) {
2080 kernel_memory_depopulate((vm_offset_t)c_buffer,
2081 ptoa(pages_populated), KMA_COMPRESSOR,
2082 VM_KERN_MEMORY_COMPRESSOR);
2083 }
2084 } else if (c_swap_handle) {
2085 /*
2086 * Free swap space on disk.
2087 */
2088 vm_swap_free(c_swap_handle);
2089 }
2090 lck_mtx_lock_spin_always(&c_seg->c_lock);
2091 /*
2092 * c_seg must remain busy until
2093 * after the call to vm_swap_free
2094 */
2095 C_SEG_WAKEUP_DONE(c_seg);
2096 lck_mtx_unlock_always(&c_seg->c_lock);
2097
2098 segno = c_seg->c_mysegno;
2099
2100 lck_mtx_lock_spin_always(c_list_lock);
2101 /*
2102 * because the c_buffer is now associated with the segno,
2103 * we can't put the segno back on the free list until
2104 * after we have depopulated the c_buffer range, or
2105 * we run the risk of depopulating a range that is
2106 * now being used in one of the compressor heads
2107 */
2108 c_segments_get(segno)->c_segno = c_free_segno_head;
2109 c_free_segno_head = segno;
2110 c_segment_count--;
2111
2112 lck_mtx_unlock_always(c_list_lock);
2113
2114 lck_mtx_destroy(&c_seg->c_lock, &vm_compressor_lck_grp);
2115
2116 if (c_seg->c_slot_var_array_len) {
2117 kfree_type(struct c_slot, c_seg->c_slot_var_array_len,
2118 c_seg->c_slot_var_array);
2119 }
2120
2121 zfree(compressor_segment_zone, c_seg);
2122 }
2123
2124 #if DEVELOPMENT || DEBUG
2125 int c_seg_trim_page_count = 0;
2126 #endif
2127
2128 void
2129 c_seg_trim_tail(c_segment_t c_seg)
2130 {
2131 c_slot_t cs;
2132 uint32_t c_size;
2133 uint32_t c_offset;
2134 uint32_t c_rounded_size;
2135 uint16_t current_nextslot;
2136 uint32_t current_populated_offset;
2137
2138 if (c_seg->c_bytes_used == 0) {
2139 return;
2140 }
2141 current_nextslot = c_seg->c_nextslot;
2142 current_populated_offset = c_seg->c_populated_offset;
2143
2144 while (c_seg->c_nextslot) {
2145 cs = C_SEG_SLOT_FROM_INDEX(c_seg, (c_seg->c_nextslot - 1));
2146
2147 c_size = UNPACK_C_SIZE(cs);
2148
2149 if (c_size) {
2150 if (current_nextslot != c_seg->c_nextslot) {
2151 c_rounded_size = C_SEG_ROUND_TO_ALIGNMENT(c_size + c_slot_extra_size(cs));
2152 c_offset = cs->c_offset + C_SEG_BYTES_TO_OFFSET(c_rounded_size);
2153
2154 c_seg->c_nextoffset = c_offset;
2155 c_seg->c_populated_offset = (c_offset + (C_SEG_BYTES_TO_OFFSET(PAGE_SIZE) - 1)) &
2156 ~(C_SEG_BYTES_TO_OFFSET(PAGE_SIZE) - 1);
2157
2158 if (c_seg->c_firstemptyslot > c_seg->c_nextslot) {
2159 c_seg->c_firstemptyslot = c_seg->c_nextslot;
2160 }
2161 #if DEVELOPMENT || DEBUG
2162 c_seg_trim_page_count += ((round_page_32(C_SEG_OFFSET_TO_BYTES(current_populated_offset)) -
2163 round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset))) /
2164 PAGE_SIZE);
2165 #endif
2166 }
2167 break;
2168 }
2169 c_seg->c_nextslot--;
2170 }
2171 assert(c_seg->c_nextslot);
2172 }
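/*
 * Worked example of the trim above (values are illustrative only): with
 * c_nextslot == 10 and slots 8 and 9 empty (c_size == 0) while slot 7 still
 * holds data, the loop steps c_nextslot back to 8; c_nextoffset is then set
 * to slot 7's offset plus its rounded size, and c_populated_offset is rounded
 * up to the next page boundary past that data (the DEBUG-only
 * c_seg_trim_page_count tracks how many pages this reclaims on paper).
 */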
2173
2174
2175 int
2176 c_seg_minor_compaction_and_unlock(c_segment_t c_seg, boolean_t clear_busy)
2177 {
2178 c_slot_mapping_t slot_ptr;
2179 uint32_t c_offset = 0;
2180 uint32_t old_populated_offset;
2181 uint32_t c_rounded_size;
2182 uint32_t c_size;
2183 uint16_t c_indx = 0;
2184 int i;
2185 c_slot_t c_dst;
2186 c_slot_t c_src;
2187
2188 assert(c_seg->c_busy);
2189
2190 #if VALIDATE_C_SEGMENTS
2191 c_seg_validate(c_seg, FALSE);
2192 #endif
2193 if (c_seg->c_bytes_used == 0) {
2194 c_seg_free(c_seg);
2195 return 1;
2196 }
2197 lck_mtx_unlock_always(&c_seg->c_lock);
2198
2199 if (c_seg->c_firstemptyslot >= c_seg->c_nextslot || C_SEG_UNUSED_BYTES(c_seg) < PAGE_SIZE) {
2200 goto done;
2201 }
2202
2203 /* TODO: assert first emptyslot's c_size is actually 0 */
2204
2205 #if DEVELOPMENT || DEBUG
2206 C_SEG_MAKE_WRITEABLE(c_seg);
2207 #endif
2208
2209 #if VALIDATE_C_SEGMENTS
2210 c_seg->c_was_minor_compacted++;
2211 #endif
2212 c_indx = c_seg->c_firstemptyslot;
2213 c_dst = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
2214
2215 old_populated_offset = c_seg->c_populated_offset;
2216 c_offset = c_dst->c_offset;
2217
2218 for (i = c_indx + 1; i < c_seg->c_nextslot && c_offset < c_seg->c_nextoffset; i++) {
2219 c_src = C_SEG_SLOT_FROM_INDEX(c_seg, i);
2220
2221 c_size = UNPACK_C_SIZE(c_src);
2222
2223 if (c_size == 0) {
2224 continue;
2225 }
2226
2227 c_rounded_size = C_SEG_ROUND_TO_ALIGNMENT(c_size + c_slot_extra_size(c_src));
2228
2229 /* N.B.: This memcpy may be an overlapping copy */
2230 memcpy(&c_seg->c_store.c_buffer[c_offset], &c_seg->c_store.c_buffer[c_src->c_offset], c_rounded_size);
2231
2232 cslot_copy(c_dst, c_src);
2233 c_dst->c_offset = c_offset;
2234
2235 slot_ptr = C_SLOT_UNPACK_PTR(c_dst);
2236 slot_ptr->s_cindx = c_indx;
2237
2238 c_offset += C_SEG_BYTES_TO_OFFSET(c_rounded_size);
2239 PACK_C_SIZE(c_src, 0);
2240 c_indx++;
2241
2242 c_dst = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
2243 }
2244 c_seg->c_firstemptyslot = c_indx;
2245 c_seg->c_nextslot = c_indx;
2246 c_seg->c_nextoffset = c_offset;
2247 c_seg->c_populated_offset = (c_offset + (C_SEG_BYTES_TO_OFFSET(PAGE_SIZE) - 1)) & ~(C_SEG_BYTES_TO_OFFSET(PAGE_SIZE) - 1);
2248 c_seg->c_bytes_unused = 0;
2249
2250 #if VALIDATE_C_SEGMENTS
2251 c_seg_validate(c_seg, TRUE);
2252 #endif
2253 if (old_populated_offset > c_seg->c_populated_offset) {
2254 uint32_t gc_size;
2255 int32_t *gc_ptr;
2256
2257 gc_size = C_SEG_OFFSET_TO_BYTES(old_populated_offset - c_seg->c_populated_offset);
2258 gc_ptr = &c_seg->c_store.c_buffer[c_seg->c_populated_offset];
2259
2260 kernel_memory_depopulate((vm_offset_t)gc_ptr, gc_size,
2261 KMA_COMPRESSOR, VM_KERN_MEMORY_COMPRESSOR);
2262 }
2263
2264 #if DEVELOPMENT || DEBUG
2265 C_SEG_WRITE_PROTECT(c_seg);
2266 #endif
2267
2268 done:
2269 if (clear_busy == TRUE) {
2270 lck_mtx_lock_spin_always(&c_seg->c_lock);
2271 C_SEG_WAKEUP_DONE(c_seg);
2272 lck_mtx_unlock_always(&c_seg->c_lock);
2273 }
2274 return 0;
2275 }
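/*
 * Sketch of what the loop above does to the slot/buffer layout (illustrative
 * sizes): a segment whose buffer holds [A][ ][B][ ][C], where [ ] marks space
 * left behind by freed slots, is rewritten in place to [A][B][C]; each
 * surviving slot's backpointer (s_cindx) is updated to its new index, and any
 * whole pages that now fall beyond the new c_populated_offset are handed back
 * via kernel_memory_depopulate().
 */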
2276
2277
2278 static void
2279 c_seg_alloc_nextslot(c_segment_t c_seg)
2280 {
2281 struct c_slot *old_slot_array = NULL;
2282 struct c_slot *new_slot_array = NULL;
2283 int newlen;
2284 int oldlen;
2285
2286 if (c_seg->c_nextslot < c_seg_fixed_array_len) {
2287 return;
2288 }
2289
2290 if ((c_seg->c_nextslot - c_seg_fixed_array_len) >= c_seg->c_slot_var_array_len) {
2291 oldlen = c_seg->c_slot_var_array_len;
2292 old_slot_array = c_seg->c_slot_var_array;
2293
2294 if (oldlen == 0) {
2295 newlen = c_seg_slot_var_array_min_len;
2296 } else {
2297 newlen = oldlen * 2;
2298 }
2299
2300 new_slot_array = kalloc_type(struct c_slot, newlen, Z_WAITOK);
2301
2302 lck_mtx_lock_spin_always(&c_seg->c_lock);
2303
2304 if (old_slot_array) {
2305 memcpy(new_slot_array, old_slot_array,
2306 sizeof(struct c_slot) * oldlen);
2307 }
2308
2309 c_seg->c_slot_var_array_len = newlen;
2310 c_seg->c_slot_var_array = new_slot_array;
2311
2312 lck_mtx_unlock_always(&c_seg->c_lock);
2313
2314 kfree_type(struct c_slot, oldlen, old_slot_array);
2315 }
2316 }
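/*
 * The variable slot array above grows geometrically: it starts at
 * c_seg_slot_var_array_min_len entries and doubles on each overflow, so
 * (purely as an example, if the minimum were 16) it would be reallocated
 * through 16, 32, 64, ... entries, with the old contents memcpy'd across
 * under the c_seg lock and the old array freed after the lock is dropped.
 */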
2317
2318
2319 #define C_SEG_MAJOR_COMPACT_STATS_MAX (30)
2320
2321 struct {
2322 uint64_t asked_permission;
2323 uint64_t compactions;
2324 uint64_t moved_slots;
2325 uint64_t moved_bytes;
2326 uint64_t wasted_space_in_swapouts;
2327 uint64_t count_of_swapouts;
2328 uint64_t count_of_freed_segs;
2329 uint64_t bailed_compactions;
2330 uint64_t bytes_freed_rate_us;
2331 } c_seg_major_compact_stats[C_SEG_MAJOR_COMPACT_STATS_MAX];
2332
2333 int c_seg_major_compact_stats_now = 0;
2334
2335
2336 #define C_MAJOR_COMPACTION_SIZE_APPROPRIATE ((c_seg_bufsize * 90) / 100)
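/*
 * For example, with a 64 KB segment buffer this evaluates to
 * (65536 * 90) / 100 == 58982 bytes; c_seg_major_compact_ok() below refuses
 * to merge two segments when both already hold at least this many bytes.
 */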
2337
2338
2339 boolean_t
2340 c_seg_major_compact_ok(
2341 c_segment_t c_seg_dst,
2342 c_segment_t c_seg_src)
2343 {
2344 c_seg_major_compact_stats[c_seg_major_compact_stats_now].asked_permission++;
2345
2346 if (c_seg_src->c_bytes_used >= C_MAJOR_COMPACTION_SIZE_APPROPRIATE &&
2347 c_seg_dst->c_bytes_used >= C_MAJOR_COMPACTION_SIZE_APPROPRIATE) {
2348 return FALSE;
2349 }
2350
2351 if (c_seg_dst->c_nextoffset >= c_seg_off_limit || c_seg_dst->c_nextslot >= C_SLOT_MAX_INDEX) {
2352 /*
2353 * destination segment is full... can't compact
2354 */
2355 return FALSE;
2356 }
2357
2358 return TRUE;
2359 }
2360
2361 /*
2362 * Move slots from src to dst
2363 * returns TRUE if we can continue compacting further to the same dst segment
2364 */
2365 boolean_t
2366 c_seg_major_compact(
2367 c_segment_t c_seg_dst,
2368 c_segment_t c_seg_src)
2369 {
2370 c_slot_mapping_t slot_ptr;
2371 uint32_t c_rounded_size;
2372 uint32_t c_size;
2373 uint16_t dst_slot;
2374 int i;
2375 c_slot_t c_dst;
2376 c_slot_t c_src;
2377 boolean_t keep_compacting = TRUE;
2378
2379 /*
2380 * segments are not locked but they are both marked c_busy
2381 * which keeps c_decompress from working on them...
2382 * we can safely allocate new pages, move compressed data
2383 * from c_seg_src to c_seg_dst and update both c_segment's
2384 * state w/o holding the master lock
2385 */
2386 #if DEVELOPMENT || DEBUG
2387 C_SEG_MAKE_WRITEABLE(c_seg_dst);
2388 #endif
2389
2390 #if VALIDATE_C_SEGMENTS
2391 c_seg_dst->c_was_major_compacted++;
2392 c_seg_src->c_was_major_donor++;
2393 #endif
2394 assertf(c_seg_dst->c_has_donated_pages == c_seg_src->c_has_donated_pages, "Mismatched donation status Dst: %p, Src: %p\n", c_seg_dst, c_seg_src);
2395 c_seg_major_compact_stats[c_seg_major_compact_stats_now].compactions++;
2396
2397 dst_slot = c_seg_dst->c_nextslot;
2398
2399 for (i = 0; i < c_seg_src->c_nextslot; i++) {
2400 c_src = C_SEG_SLOT_FROM_INDEX(c_seg_src, i);
2401
2402 c_size = UNPACK_C_SIZE(c_src);
2403
2404 if (c_size == 0) {
2405 /* BATCH: move what we have so far; */
2406 continue;
2407 }
2408
2409 int combined_size = c_size + c_slot_extra_size(c_src);
2410
2411 c_rounded_size = C_SEG_ROUND_TO_ALIGNMENT(combined_size);
2412
2413 int size_left = c_seg_bufsize - C_SEG_OFFSET_TO_BYTES(c_seg_dst->c_nextoffset);
2414 /* we're going to increment c_nextoffset by c_rounded_size so it should not overflow the segment bufsize */
2415 if (size_left < c_rounded_size) {
2416 keep_compacting = FALSE;
2417 break;
2418 }
2419
2420 /* Do we have enough populated space left in dst? */
2421 assertf(c_seg_dst->c_populated_offset >= c_seg_dst->c_nextoffset, "Unexpected segment offsets: %u,%u", c_seg_dst->c_populated_offset, c_seg_dst->c_nextoffset);
2422 if (C_SEG_OFFSET_TO_BYTES(c_seg_dst->c_populated_offset - c_seg_dst->c_nextoffset) < (unsigned) combined_size) {
2423 int size_to_populate;
2424
2425 /* eagerly populate the entire segment in the expectation of filling it */
2426 assert(c_seg_bufsize >= C_SEG_OFFSET_TO_BYTES(c_seg_dst->c_populated_offset));
2427 size_to_populate = c_seg_bufsize - C_SEG_OFFSET_TO_BYTES(c_seg_dst->c_populated_offset);
2428
2429 if (size_to_populate == 0) {
2430 /* can't populate any more pages in this segment */
2431 keep_compacting = FALSE;
2432 break;
2433 }
2434 if (size_to_populate > C_SEG_MAX_POPULATE_SIZE) {
2435 size_to_populate = C_SEG_MAX_POPULATE_SIZE;
2436 }
2437
2438 kernel_memory_populate(
2439 (vm_offset_t) &c_seg_dst->c_store.c_buffer[c_seg_dst->c_populated_offset],
2440 size_to_populate,
2441 KMA_NOFAIL | KMA_COMPRESSOR,
2442 VM_KERN_MEMORY_COMPRESSOR);
2443
2444 c_seg_dst->c_populated_offset += C_SEG_BYTES_TO_OFFSET(size_to_populate);
2445 assert(C_SEG_OFFSET_TO_BYTES(c_seg_dst->c_populated_offset) <= c_seg_bufsize);
2446 }
2447 c_seg_alloc_nextslot(c_seg_dst);
2448
2449 c_dst = C_SEG_SLOT_FROM_INDEX(c_seg_dst, c_seg_dst->c_nextslot);
2450
2451 /*
2452 * We don't want pages to get stolen by the contiguous memory allocator
2453 * when copying data from one segment to another.
2454 */
2455 PAGE_REPLACEMENT_DISALLOWED(TRUE);
2456 memcpy(&c_seg_dst->c_store.c_buffer[c_seg_dst->c_nextoffset], &c_seg_src->c_store.c_buffer[c_src->c_offset], combined_size);
2457 PAGE_REPLACEMENT_DISALLOWED(FALSE);
2458
2459 c_seg_major_compact_stats[c_seg_major_compact_stats_now].moved_slots++;
2460 c_seg_major_compact_stats[c_seg_major_compact_stats_now].moved_bytes += combined_size;
2461
2462 cslot_copy(c_dst, c_src);
2463 c_dst->c_offset = c_seg_dst->c_nextoffset;
2464
2465 if (c_seg_dst->c_firstemptyslot == c_seg_dst->c_nextslot) {
2466 c_seg_dst->c_firstemptyslot++;
2467 }
2468 c_seg_dst->c_slots_used++;
2469 c_seg_dst->c_nextslot++;
2470 c_seg_dst->c_bytes_used += c_rounded_size;
2471 c_seg_dst->c_nextoffset += C_SEG_BYTES_TO_OFFSET(c_rounded_size);
2472
2473 PACK_C_SIZE(c_src, 0);
2474
2475 c_seg_src->c_bytes_used -= c_rounded_size;
2476 c_seg_src->c_bytes_unused += c_rounded_size;
2477 c_seg_src->c_firstemptyslot = 0;
2478
2479 assert(c_seg_src->c_slots_used);
2480 c_seg_src->c_slots_used--;
2481
2482 if (!c_seg_src->c_swappedin) {
2483 /* Pessimistically lose swappedin status when non-swappedin pages are added. */
2484 c_seg_dst->c_swappedin = false;
2485 }
2486
2487 if (c_seg_dst->c_nextoffset >= c_seg_off_limit || c_seg_dst->c_nextslot >= C_SLOT_MAX_INDEX) {
2488 /* dest segment is now full */
2489 keep_compacting = FALSE;
2490 break;
2491 }
2492 }
2493 #if DEVELOPMENT || DEBUG
2494 C_SEG_WRITE_PROTECT(c_seg_dst);
2495 #endif
2496 if (dst_slot < c_seg_dst->c_nextslot) {
2497 PAGE_REPLACEMENT_ALLOWED(TRUE);
2498 /*
2499 * we've now locked out c_decompress from
2500 * converting the slot passed into it into
2501 * a c_segment_t which allows us to use
2502 * the backptr to change which c_segment and
2503 * index the slot points to
2504 */
2505 while (dst_slot < c_seg_dst->c_nextslot) {
2506 c_dst = C_SEG_SLOT_FROM_INDEX(c_seg_dst, dst_slot);
2507
2508 slot_ptr = C_SLOT_UNPACK_PTR(c_dst);
2509 /* <csegno=0,indx=0> would mean "empty slot", so use csegno+1 */
2510 slot_ptr->s_cseg = c_seg_dst->c_mysegno + 1;
2511 slot_ptr->s_cindx = dst_slot++;
2512 }
2513 PAGE_REPLACEMENT_ALLOWED(FALSE);
2514 }
2515 return keep_compacting;
2516 }
2517
2518
2519 uint64_t
2520 vm_compressor_compute_elapsed_msecs(clock_sec_t end_sec, clock_nsec_t end_nsec, clock_sec_t start_sec, clock_nsec_t start_nsec)
2521 {
2522 uint64_t end_msecs;
2523 uint64_t start_msecs;
2524
2525 end_msecs = (end_sec * 1000) + end_nsec / 1000000;
2526 start_msecs = (start_sec * 1000) + start_nsec / 1000000;
2527
2528 return end_msecs - start_msecs;
2529 }
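/*
 * Worked example for the conversion above: start = {sec 100, nsec 250000000}
 * and end = {sec 102, nsec 50000000} gives
 * (102 * 1000 + 50000000 / 1000000) - (100 * 1000 + 250000000 / 1000000)
 *   = 102050 - 100250 = 1800 msecs.
 */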
2530
2531
2532
2533 uint32_t compressor_eval_period_in_msecs = 250;
2534 uint32_t compressor_sample_min_in_msecs = 500;
2535 uint32_t compressor_sample_max_in_msecs = 10000;
2536 uint32_t compressor_thrashing_threshold_per_10msecs = 50;
2537 uint32_t compressor_thrashing_min_per_10msecs = 20;
2538
2539 /* When true, reset sample data next chance we get. */
2540 static boolean_t compressor_need_sample_reset = FALSE;
2541
2542
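/*
 * Overview of the heuristic implemented below: once a sample window has seen
 * enough compression/decompression activity, the age histogram of recent
 * decompressions is scanned to find the age bucket that covers ~95% of them
 * (working_target), accumulating an age-weighted sum as it goes. That sum,
 * normalized to one second of sample time, approximates the working set; if
 * it fits within the compressor (VM_PAGE_COMPRESSOR_COUNT), a second scan
 * from the oldest bucket downward finds the ~1% tail (aging_target) and
 * swapout_target_age is set so that only segments older than that tail
 * become swapout candidates.
 */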
2543 void
2544 compute_swapout_target_age(void)
2545 {
2546 clock_sec_t cur_ts_sec;
2547 clock_nsec_t cur_ts_nsec;
2548 uint32_t min_operations_needed_in_this_sample;
2549 uint64_t elapsed_msecs_in_eval;
2550 uint64_t elapsed_msecs_in_sample;
2551 boolean_t need_eval_reset = FALSE;
2552
2553 clock_get_system_nanotime(&cur_ts_sec, &cur_ts_nsec);
2554
2555 elapsed_msecs_in_sample = vm_compressor_compute_elapsed_msecs(cur_ts_sec, cur_ts_nsec, start_of_sample_period_sec, start_of_sample_period_nsec);
2556
2557 if (compressor_need_sample_reset ||
2558 elapsed_msecs_in_sample >= compressor_sample_max_in_msecs) {
2559 compressor_need_sample_reset = TRUE;
2560 need_eval_reset = TRUE;
2561 goto done;
2562 }
2563 elapsed_msecs_in_eval = vm_compressor_compute_elapsed_msecs(cur_ts_sec, cur_ts_nsec, start_of_eval_period_sec, start_of_eval_period_nsec);
2564
2565 if (elapsed_msecs_in_eval < compressor_eval_period_in_msecs) {
2566 goto done;
2567 }
2568 need_eval_reset = TRUE;
2569
2570 KERNEL_DEBUG(0xe0400020 | DBG_FUNC_START, elapsed_msecs_in_eval, sample_period_compression_count, sample_period_decompression_count, 0, 0);
2571
2572 min_operations_needed_in_this_sample = (compressor_thrashing_min_per_10msecs * (uint32_t)elapsed_msecs_in_eval) / 10;
2573
2574 if ((sample_period_compression_count - last_eval_compression_count) < min_operations_needed_in_this_sample ||
2575 (sample_period_decompression_count - last_eval_decompression_count) < min_operations_needed_in_this_sample) {
2576 KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, sample_period_compression_count - last_eval_compression_count,
2577 sample_period_decompression_count - last_eval_decompression_count, 0, 1, 0);
2578
2579 swapout_target_age = 0;
2580
2581 compressor_need_sample_reset = TRUE;
2582 need_eval_reset = TRUE;
2583 goto done;
2584 }
2585 last_eval_compression_count = sample_period_compression_count;
2586 last_eval_decompression_count = sample_period_decompression_count;
2587
2588 if (elapsed_msecs_in_sample < compressor_sample_min_in_msecs) {
2589 KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, swapout_target_age, 0, 0, 5, 0);
2590 goto done;
2591 }
2592 if (sample_period_decompression_count > ((compressor_thrashing_threshold_per_10msecs * elapsed_msecs_in_sample) / 10)) {
2593 uint64_t running_total;
2594 uint64_t working_target;
2595 uint64_t aging_target;
2596 uint32_t oldest_age_of_csegs_sampled = 0;
2597 uint64_t working_set_approximation = 0;
2598
2599 swapout_target_age = 0;
2600
2601 working_target = (sample_period_decompression_count / 100) * 95; /* 95 percent */
2602 aging_target = (sample_period_decompression_count / 100) * 1; /* 1 percent */
2603 running_total = 0;
2604
2605 for (oldest_age_of_csegs_sampled = 0; oldest_age_of_csegs_sampled < DECOMPRESSION_SAMPLE_MAX_AGE; oldest_age_of_csegs_sampled++) {
2606 running_total += age_of_decompressions_during_sample_period[oldest_age_of_csegs_sampled];
2607
2608 working_set_approximation += oldest_age_of_csegs_sampled * age_of_decompressions_during_sample_period[oldest_age_of_csegs_sampled];
2609
2610 if (running_total >= working_target) {
2611 break;
2612 }
2613 }
2614 if (oldest_age_of_csegs_sampled < DECOMPRESSION_SAMPLE_MAX_AGE) {
2615 working_set_approximation = (working_set_approximation * 1000) / elapsed_msecs_in_sample;
2616
2617 if (working_set_approximation < VM_PAGE_COMPRESSOR_COUNT) {
2618 running_total = overage_decompressions_during_sample_period;
2619
2620 for (oldest_age_of_csegs_sampled = DECOMPRESSION_SAMPLE_MAX_AGE - 1; oldest_age_of_csegs_sampled; oldest_age_of_csegs_sampled--) {
2621 running_total += age_of_decompressions_during_sample_period[oldest_age_of_csegs_sampled];
2622
2623 if (running_total >= aging_target) {
2624 break;
2625 }
2626 }
2627 swapout_target_age = (uint32_t)cur_ts_sec - oldest_age_of_csegs_sampled;
2628
2629 KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, swapout_target_age, working_set_approximation, VM_PAGE_COMPRESSOR_COUNT, 2, 0);
2630 } else {
2631 KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, working_set_approximation, VM_PAGE_COMPRESSOR_COUNT, 0, 3, 0);
2632 }
2633 } else {
2634 KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, working_target, running_total, 0, 4, 0);
2635 }
2636
2637 compressor_need_sample_reset = TRUE;
2638 need_eval_reset = TRUE;
2639 } else {
2640 KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, sample_period_decompression_count, (compressor_thrashing_threshold_per_10msecs * elapsed_msecs_in_sample) / 10, 0, 6, 0);
2641 }
2642 done:
2643 if (compressor_need_sample_reset == TRUE) {
2644 bzero(age_of_decompressions_during_sample_period, sizeof(age_of_decompressions_during_sample_period));
2645 overage_decompressions_during_sample_period = 0;
2646
2647 start_of_sample_period_sec = cur_ts_sec;
2648 start_of_sample_period_nsec = cur_ts_nsec;
2649 sample_period_decompression_count = 0;
2650 sample_period_compression_count = 0;
2651 last_eval_decompression_count = 0;
2652 last_eval_compression_count = 0;
2653 compressor_need_sample_reset = FALSE;
2654 }
2655 if (need_eval_reset == TRUE) {
2656 start_of_eval_period_sec = cur_ts_sec;
2657 start_of_eval_period_nsec = cur_ts_nsec;
2658 }
2659 }
2660
2661
2662 int compaction_swapper_init_now = 0;
2663 int compaction_swapper_running = 0;
2664 int compaction_swapper_awakened = 0;
2665 int compaction_swapper_abort = 0;
2666
2667 bool
2668 vm_compressor_swapout_is_ripe()
2669 {
2670 bool is_ripe = false;
2671 if (vm_swapout_ripe_segments == TRUE && c_overage_swapped_count < c_overage_swapped_limit) {
2672 c_segment_t c_seg;
2673 clock_sec_t now;
2674 clock_sec_t age;
2675 clock_nsec_t nsec;
2676
2677 clock_get_system_nanotime(&now, &nsec);
2678 age = 0;
2679
2680 lck_mtx_lock_spin_always(c_list_lock);
2681
2682 if (!queue_empty(&c_age_list_head)) {
2683 c_seg = (c_segment_t) queue_first(&c_age_list_head);
2684
2685 age = now - c_seg->c_creation_ts;
2686 }
2687 lck_mtx_unlock_always(c_list_lock);
2688
2689 if (age >= vm_ripe_target_age) {
2690 is_ripe = true;
2691 }
2692 }
2693 return is_ripe;
2694 }
2695
2696 static bool
2697 compressor_swapout_conditions_met(void)
2698 {
2699 bool should_swap = false;
2700 if (COMPRESSOR_NEEDS_TO_SWAP()) {
2701 should_swap = true;
2702 vmcs_stats.compressor_swap_threshold_exceeded++;
2703 }
2704 if (VM_PAGE_Q_THROTTLED(&vm_pageout_queue_external) && vm_page_anonymous_count < (vm_page_inactive_count / 20)) {
2705 should_swap = true;
2706 vmcs_stats.external_q_throttled++;
2707 }
2708 if (vm_page_free_count < (vm_page_free_reserved - (COMPRESSOR_FREE_RESERVED_LIMIT * 2))) {
2709 should_swap = true;
2710 vmcs_stats.free_count_below_reserve++;
2711 }
2712 return should_swap;
2713 }
2714
2715 static bool
2716 compressor_needs_to_swap()
2717 {
2718 bool should_swap = false;
2719 if (vm_compressor_swapout_is_ripe()) {
2720 should_swap = true;
2721 goto check_if_low_space;
2722 }
2723
2724 if (VM_CONFIG_SWAP_IS_ACTIVE) {
2725 should_swap = compressor_swapout_conditions_met();
2726 if (should_swap) {
2727 goto check_if_low_space;
2728 }
2729 }
2730
2731 #if (XNU_TARGET_OS_OSX && __arm64__)
2732 /*
2733 * Thrashing detection disabled.
2734 */
2735 #else /* (XNU_TARGET_OS_OSX && __arm64__) */
2736
2737 if (vm_compressor_is_thrashing()) {
2738 should_swap = true;
2739 vmcs_stats.thrashing_detected++;
2740 }
2741
2742 #if CONFIG_PHANTOM_CACHE
2743 if (vm_phantom_cache_check_pressure()) {
2744 os_atomic_store(&memorystatus_phantom_cache_pressure, true, release);
2745 should_swap = true;
2746 }
2747 #endif
2748 if (swapout_target_age) {
2749 should_swap = true;
2750 }
2751 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
2752
2753 check_if_low_space:
2754
2755 #if CONFIG_JETSAM
2756 if (should_swap || vm_compressor_low_on_space()) {
2757 if (vm_compressor_thrashing_detected == FALSE) {
2758 vm_compressor_thrashing_detected = TRUE;
2759
2760 if (swapout_target_age) {
2761 compressor_thrashing_induced_jetsam++;
2762 } else if (vm_compressor_low_on_space()) {
2763 compressor_thrashing_induced_jetsam++;
2764 } else {
2765 filecache_thrashing_induced_jetsam++;
2766 }
2767 /*
2768 * Wake up the memorystatus thread so that it can return
2769 * the system to a healthy state (by killing processes).
2770 */
2771 memorystatus_thread_wake();
2772 }
2773 /*
2774 * let the jetsam take precedence over
2775 * any major compactions we might have
2776 * been able to do... otherwise we run
2777 * the risk of doing major compactions
2778 * on segments we're about to free up
2779 * due to the jetsam activity.
2780 */
2781 should_swap = false;
2782 if (memorystatus_swap_all_apps && vm_swap_low_on_space()) {
2783 memorystatus_respond_to_swap_exhaustion();
2784 }
2785 }
2786 #else /* CONFIG_JETSAM */
2787 if (should_swap && vm_swap_low_on_space()) {
2788 memorystatus_respond_to_swap_exhaustion();
2789 }
2790 #endif /* CONFIG_JETSAM */
2791
2792 if (should_swap == false) {
2793 /*
2794 * vm_compressor_needs_to_major_compact returns true only if we're
2795 * about to run out of available compressor segments... in this
2796 * case, we absolutely need to run a major compaction even if
2797 * we've just kicked off a jetsam or we don't otherwise need to
2798 * swap... terminating objects releases
2799 * pages back to the uncompressed cache, but does not guarantee
2800 * that we will free up even a single compression segment
2801 */
2802 should_swap = vm_compressor_needs_to_major_compact();
2803 if (should_swap) {
2804 vmcs_stats.fragmentation_detected++;
2805 }
2806 }
2807
2808 /*
2809 * returning TRUE when swap_supported == FALSE
2810 * will cause the major compaction engine to
2811 * run, but will not trigger any swapping...
2812 * segments that have been major compacted
2813 * will be moved to the majorcompact queue
2814 */
2815 return should_swap;
2816 }
2817
2818 #if CONFIG_JETSAM
2819 /*
2820 * This function is called from the jetsam thread after killing something to
2821 * mitigate thrashing.
2822 *
2823 * We need to restart our thrashing detection heuristics since memory pressure
2824  * has potentially changed significantly, and we don't want to base detection
2825  * on stale data from before the jetsam.
2826 */
2827 void
2828 vm_thrashing_jetsam_done(void)
2829 {
2830 vm_compressor_thrashing_detected = FALSE;
2831
2832 /* Were we compressor-thrashing or filecache-thrashing? */
2833 if (swapout_target_age) {
2834 swapout_target_age = 0;
2835 compressor_need_sample_reset = TRUE;
2836 }
2837 #if CONFIG_PHANTOM_CACHE
2838 else {
2839 vm_phantom_cache_restart_sample();
2840 }
2841 #endif
2842 }
2843 #endif /* CONFIG_JETSAM */
2844
2845 uint32_t vm_wake_compactor_swapper_calls = 0;
2846 uint32_t vm_run_compactor_already_running = 0;
2847 uint32_t vm_run_compactor_empty_minor_q = 0;
2848 uint32_t vm_run_compactor_did_compact = 0;
2849 uint32_t vm_run_compactor_waited = 0;
2850
2851 /* run minor compaction right now, if the compaction-swapper thread is not already running */
2852 void
2853 vm_run_compactor(void)
2854 {
2855 if (c_segment_count == 0) {
2856 return;
2857 }
2858
2859 if (os_atomic_load(&c_minor_count, relaxed) == 0) {
2860 vm_run_compactor_empty_minor_q++;
2861 return;
2862 }
2863
2864 lck_mtx_lock_spin_always(c_list_lock);
2865
2866 if (compaction_swapper_running) {
2867 if (vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
2868 vm_run_compactor_already_running++;
2869
2870 lck_mtx_unlock_always(c_list_lock);
2871 return;
2872 }
2873 vm_run_compactor_waited++;
2874
2875 assert_wait((event_t)&compaction_swapper_running, THREAD_UNINT);
2876
2877 lck_mtx_unlock_always(c_list_lock);
2878
2879 thread_block(THREAD_CONTINUE_NULL);
2880
2881 return;
2882 }
2883 vm_run_compactor_did_compact++;
2884
2885 fastwake_warmup = FALSE;
2886 compaction_swapper_running = 1;
2887
2888 vm_compressor_do_delayed_compactions(FALSE);
2889
2890 compaction_swapper_running = 0;
2891
2892 lck_mtx_unlock_always(c_list_lock);
2893
2894 thread_wakeup((event_t)&compaction_swapper_running);
2895 }
2896
2897
2898 void
2899 vm_wake_compactor_swapper(void)
2900 {
2901 if (compaction_swapper_running || compaction_swapper_awakened || c_segment_count == 0) {
2902 return;
2903 }
2904
2905 if (os_atomic_load(&c_minor_count, relaxed) ||
2906 vm_compressor_needs_to_major_compact()) {
2907 lck_mtx_lock_spin_always(c_list_lock);
2908
2909 fastwake_warmup = FALSE;
2910
2911 if (compaction_swapper_running == 0 && compaction_swapper_awakened == 0) {
2912 vm_wake_compactor_swapper_calls++;
2913
2914 compaction_swapper_awakened = 1;
2915 thread_wakeup((event_t)&c_compressor_swap_trigger);
2916 }
2917 lck_mtx_unlock_always(c_list_lock);
2918 }
2919 }
2920
2921
2922 void
2923 vm_consider_swapping()
2924 {
2925 assert(VM_CONFIG_SWAP_IS_PRESENT);
2926
2927 lck_mtx_lock_spin_always(c_list_lock);
2928
2929 compaction_swapper_abort = 1;
2930
2931 while (compaction_swapper_running) {
2932 assert_wait((event_t)&compaction_swapper_running, THREAD_UNINT);
2933
2934 lck_mtx_unlock_always(c_list_lock);
2935
2936 thread_block(THREAD_CONTINUE_NULL);
2937
2938 lck_mtx_lock_spin_always(c_list_lock);
2939 }
2940 compaction_swapper_abort = 0;
2941 compaction_swapper_running = 1;
2942
2943 vm_swapout_ripe_segments = TRUE;
2944
2945 vm_compressor_process_major_segments(vm_swapout_ripe_segments);
2946
2947 vm_compressor_compact_and_swap(FALSE);
2948
2949 compaction_swapper_running = 0;
2950
2951 vm_swapout_ripe_segments = FALSE;
2952
2953 lck_mtx_unlock_always(c_list_lock);
2954
2955 thread_wakeup((event_t)&compaction_swapper_running);
2956 }
2957
2958
2959 void
2960 vm_consider_waking_compactor_swapper(void)
2961 {
2962 bool need_wakeup = false;
2963
2964 if (c_segment_count == 0) {
2965 return;
2966 }
2967
2968 if (compaction_swapper_running || compaction_swapper_awakened) {
2969 return;
2970 }
2971
2972 if (!compaction_swapper_inited && !compaction_swapper_init_now) {
2973 compaction_swapper_init_now = 1;
2974 need_wakeup = true;
2975 } else if (vm_compressor_needs_to_minor_compact() ||
2976 compressor_needs_to_swap()) {
2977 need_wakeup = true;
2978 }
2979
2980 if (need_wakeup) {
2981 lck_mtx_lock_spin_always(c_list_lock);
2982
2983 fastwake_warmup = FALSE;
2984
2985 if (compaction_swapper_running == 0 && compaction_swapper_awakened == 0) {
2986 memoryshot(DBG_VM_WAKEUP_COMPACTOR_SWAPPER, DBG_FUNC_NONE);
2987
2988 compaction_swapper_awakened = 1;
2989 thread_wakeup((event_t)&c_compressor_swap_trigger);
2990 }
2991 lck_mtx_unlock_always(c_list_lock);
2992 }
2993 }
2994
2995
2996 #define C_SWAPOUT_LIMIT 4
2997 #define DELAYED_COMPACTIONS_PER_PASS 30
2998
2999 /* process segments that are in the minor compaction queue */
3000 void
3001 vm_compressor_do_delayed_compactions(boolean_t flush_all)
3002 {
3003 c_segment_t c_seg;
3004 int number_compacted = 0;
3005 boolean_t needs_to_swap = FALSE;
3006 uint32_t c_swapout_count = 0;
3007
3008
3009 VM_DEBUG_CONSTANT_EVENT(vm_compressor_do_delayed_compactions, DBG_VM_COMPRESSOR_DELAYED_COMPACT, DBG_FUNC_START, c_minor_count, flush_all, 0, 0);
3010
3011 #if XNU_TARGET_OS_OSX
3012 LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED);
3013 #endif /* XNU_TARGET_OS_OSX */
3014
3015 while (!queue_empty(&c_minor_list_head) && needs_to_swap == FALSE) {
3016 c_seg = (c_segment_t)queue_first(&c_minor_list_head);
3017
3018 lck_mtx_lock_spin_always(&c_seg->c_lock);
3019
3020 if (c_seg->c_busy) {
3021 lck_mtx_unlock_always(c_list_lock);
3022 c_seg_wait_on_busy(c_seg);
3023 lck_mtx_lock_spin_always(c_list_lock);
3024
3025 continue;
3026 }
3027 C_SEG_BUSY(c_seg);
3028
3029 c_seg_do_minor_compaction_and_unlock(c_seg, TRUE, FALSE, TRUE);
3030
3031 c_swapout_count = c_early_swapout_count + c_regular_swapout_count + c_late_swapout_count;
3032 if (VM_CONFIG_SWAP_IS_ACTIVE && (number_compacted++ > DELAYED_COMPACTIONS_PER_PASS)) {
3033 if ((flush_all == TRUE || compressor_needs_to_swap()) && c_swapout_count < C_SWAPOUT_LIMIT) {
3034 needs_to_swap = TRUE;
3035 }
3036
3037 number_compacted = 0;
3038 }
3039 lck_mtx_lock_spin_always(c_list_lock);
3040 }
3041
3042 VM_DEBUG_CONSTANT_EVENT(vm_compressor_do_delayed_compactions, DBG_VM_COMPRESSOR_DELAYED_COMPACT, DBG_FUNC_END, c_minor_count, number_compacted, needs_to_swap, 0);
3043 }
3044
3045 int min_csegs_per_major_compaction = DELAYED_COMPACTIONS_PER_PASS;
3046
3047 static bool
3048 vm_compressor_major_compact_cseg(c_segment_t c_seg, uint32_t* c_seg_considered, bool* bail_wanted_cseg, uint64_t* total_bytes_freed)
3049 {
3050 /*
3051 * Major compaction
3052 */
3053 bool keep_compacting = true, fully_compacted = true;
3054 queue_head_t *list_head = NULL;
3055 c_segment_t c_seg_next;
3056 uint64_t bytes_to_free = 0, bytes_freed = 0;
3057 uint32_t number_considered = 0;
3058
3059 if (c_seg->c_state == C_ON_AGE_Q) {
3060 assert(!c_seg->c_has_donated_pages);
3061 list_head = &c_age_list_head;
3062 } else if (c_seg->c_state == C_ON_SWAPPEDIN_Q) {
3063 assert(c_seg->c_has_donated_pages);
3064 list_head = &c_late_swappedin_list_head;
3065 }
3066
3067 while (keep_compacting == TRUE) {
3068 assert(c_seg->c_busy);
3069
3070 /* look for another segment to consolidate */
3071
3072 c_seg_next = (c_segment_t) queue_next(&c_seg->c_age_list);
3073
3074 if (queue_end(list_head, (queue_entry_t)c_seg_next)) {
3075 break;
3076 }
3077
3078 assert(c_seg_next->c_state == c_seg->c_state);
3079
3080 number_considered++;
3081
3082 if (c_seg_major_compact_ok(c_seg, c_seg_next) == FALSE) {
3083 break;
3084 }
3085
3086 lck_mtx_lock_spin_always(&c_seg_next->c_lock);
3087
3088 if (c_seg_next->c_busy) {
3089 /*
3090 * We are going to block for our neighbor.
3091 * If our c_seg is wanted, we should unbusy
3092 * it because we don't know how long we might
3093 * have to block here.
3094 */
3095 if (c_seg->c_wanted) {
3096 lck_mtx_unlock_always(&c_seg_next->c_lock);
3097 fully_compacted = false;
3098 c_seg_major_compact_stats[c_seg_major_compact_stats_now].bailed_compactions++;
3099 *bail_wanted_cseg = true;
3100 break;
3101 }
3102
3103 lck_mtx_unlock_always(c_list_lock);
3104
3105 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, DBG_VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 8, (void*) VM_KERNEL_ADDRPERM(c_seg_next), 0, 0);
3106
3107 c_seg_wait_on_busy(c_seg_next);
3108 lck_mtx_lock_spin_always(c_list_lock);
3109
3110 continue;
3111 }
3112 /* grab that segment */
3113 C_SEG_BUSY(c_seg_next);
3114
3115 bytes_to_free = C_SEG_OFFSET_TO_BYTES(c_seg_next->c_populated_offset);
3116 if (c_seg_do_minor_compaction_and_unlock(c_seg_next, FALSE, TRUE, TRUE)) {
3117 /*
3118 * found an empty c_segment and freed it
3119 * so we can't continue to use c_seg_next
3120 */
3121 bytes_freed += bytes_to_free;
3122 c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_freed_segs++;
3123 continue;
3124 }
3125
3126 /* unlock the list ... */
3127 lck_mtx_unlock_always(c_list_lock);
3128
3129 /* do the major compaction */
3130
3131 keep_compacting = c_seg_major_compact(c_seg, c_seg_next);
3132
3133 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, DBG_VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 9, keep_compacting, 0, 0);
3134
3135 PAGE_REPLACEMENT_DISALLOWED(TRUE);
3136
3137 lck_mtx_lock_spin_always(&c_seg_next->c_lock);
3138 /*
3139 * run a minor compaction on the donor segment
3140  * since we pulled at least some of its
3141  * data into our target... if we've emptied
3142  * it, now is a good time to free it, which
3143  * c_seg_minor_compaction_and_unlock also takes care of
3144 *
3145 * by passing TRUE, we ask for c_busy to be cleared
3146 * and c_wanted to be taken care of
3147 */
3148 bytes_to_free = C_SEG_OFFSET_TO_BYTES(c_seg_next->c_populated_offset);
3149 if (c_seg_minor_compaction_and_unlock(c_seg_next, TRUE)) {
3150 bytes_freed += bytes_to_free;
3151 c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_freed_segs++;
3152 } else {
3153 bytes_to_free -= C_SEG_OFFSET_TO_BYTES(c_seg_next->c_populated_offset);
3154 bytes_freed += bytes_to_free;
3155 }
3156
3157 PAGE_REPLACEMENT_DISALLOWED(FALSE);
3158
3159 /* relock the list */
3160 lck_mtx_lock_spin_always(c_list_lock);
3161
3162 if (c_seg->c_wanted) {
3163 /*
3164 * Our c_seg is in demand. Let's
3165 * unbusy it and wakeup the waiters
3166 * instead of continuing the compaction
3167 * because we could be in this loop
3168 * for a while.
3169 */
3170 fully_compacted = false;
3171 *bail_wanted_cseg = true;
3172 c_seg_major_compact_stats[c_seg_major_compact_stats_now].bailed_compactions++;
3173 break;
3174 }
3175 } /* major compaction */
3176
3177 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, DBG_VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 10, number_considered, *bail_wanted_cseg, 0);
3178
3179 *c_seg_considered += number_considered;
3180 *total_bytes_freed += bytes_freed;
3181
3182 lck_mtx_lock_spin_always(&c_seg->c_lock);
3183 return fully_compacted;
3184 }
3185
3186 #define TIME_SUB(rsecs, secs, rfrac, frac, unit) \
3187 MACRO_BEGIN \
3188 if ((int)((rfrac) -= (frac)) < 0) { \
3189 (rfrac) += (unit); \
3190 (rsecs) -= 1; \
3191 } \
3192 (rsecs) -= (secs); \
3193 MACRO_END
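/*
 * Worked example of the borrow above: subtracting {secs 2, frac 800000000}
 * from {rsecs 5, rfrac 100000000} with unit == NSEC_PER_SEC first computes
 * rfrac = 100000000 - 800000000 < 0, so rfrac += 1000000000 (now 300000000)
 * and rsecs drops to 4; then rsecs -= 2, leaving {rsecs 2, rfrac 300000000},
 * i.e. 2.3 seconds.
 */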
3194
3195 clock_nsec_t c_process_major_report_over_ms = 9; /* report if over 9 ms */
3196 int c_process_major_yield_after = 1000; /* yield after moving 1,000 segments */
3197 uint64_t c_process_major_reports = 0;
3198 clock_sec_t c_process_major_max_sec = 0;
3199 clock_nsec_t c_process_major_max_nsec = 0;
3200 uint32_t c_process_major_peak_segcount = 0;
3201 static void
3202 vm_compressor_process_major_segments(bool ripe_age_only)
3203 {
3204 c_segment_t c_seg = NULL;
3205 int count = 0, total = 0, breaks = 0;
3206 clock_sec_t start_sec, end_sec;
3207 clock_nsec_t start_nsec, end_nsec;
3208 clock_nsec_t report_over_ns;
3209
3210 if (queue_empty(&c_major_list_head)) {
3211 return;
3212 }
3213
3214 // printf("%s: starting to move segments from MAJORQ to AGEQ\n", __FUNCTION__);
3215 if (c_process_major_report_over_ms != 0) {
3216 report_over_ns = c_process_major_report_over_ms * NSEC_PER_MSEC;
3217 } else {
3218 report_over_ns = (clock_nsec_t)-1;
3219 }
3220
3221 if (ripe_age_only) {
3222 if (c_overage_swapped_count >= c_overage_swapped_limit) {
3223 /*
3224 * Return while we wait for the overage segments
3225 * in our queue to get pushed out first.
3226 */
3227 return;
3228 }
3229 }
3230
3231 clock_get_system_nanotime(&start_sec, &start_nsec);
3232 while (!queue_empty(&c_major_list_head)) {
3233 if (!ripe_age_only) {
3234 /*
3235 * Start from the end to preserve aging order. The newer
3236 * segments are at the tail and so need to be inserted in
3237 * the aging queue in this way so we have the older segments
3238 * at the end of the AGE_Q.
3239 */
3240 c_seg = (c_segment_t)queue_last(&c_major_list_head);
3241 } else {
3242 c_seg = (c_segment_t)queue_first(&c_major_list_head);
3243 if ((start_sec - c_seg->c_creation_ts) < vm_ripe_target_age) {
3244 /*
3245 * We have found the first segment in our queue that is not ripe. Segments after it
3246 * will be the same. So let's bail here. Return with c_list_lock held.
3247 */
3248 break;
3249 }
3250 }
3251
3252 lck_mtx_lock_spin_always(&c_seg->c_lock);
3253 c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
3254 lck_mtx_unlock_always(&c_seg->c_lock);
3255
3256 count++;
3257 if (count == c_process_major_yield_after ||
3258 queue_empty(&c_major_list_head)) {
3259 /* done or time to take a break */
3260 } else {
3261 /* keep going */
3262 continue;
3263 }
3264
3265 total += count;
3266 clock_get_system_nanotime(&end_sec, &end_nsec);
3267 TIME_SUB(end_sec, start_sec, end_nsec, start_nsec, NSEC_PER_SEC);
3268 if (end_sec > c_process_major_max_sec) {
3269 c_process_major_max_sec = end_sec;
3270 c_process_major_max_nsec = end_nsec;
3271 } else if (end_sec == c_process_major_max_sec &&
3272 end_nsec > c_process_major_max_nsec) {
3273 c_process_major_max_nsec = end_nsec;
3274 }
3275 if (total > c_process_major_peak_segcount) {
3276 c_process_major_peak_segcount = total;
3277 }
3278 if (end_sec > 0 ||
3279 end_nsec >= report_over_ns) {
3280 /* we used more than expected */
3281 c_process_major_reports++;
3282 printf("%s: moved %d/%d segments from MAJORQ to AGEQ in %lu.%09u seconds and %d breaks\n",
3283 __FUNCTION__, count, total,
3284 end_sec, end_nsec, breaks);
3285 }
3286 if (queue_empty(&c_major_list_head)) {
3287 /* done */
3288 break;
3289 }
3290 /* take a break to allow someone else to grab the lock */
3291 lck_mtx_unlock_always(c_list_lock);
3292 mutex_pause(0); /* 10 microseconds */
3293 lck_mtx_lock_spin_always(c_list_lock);
3294 /* start again */
3295 clock_get_system_nanotime(&start_sec, &start_nsec);
3296 count = 0;
3297 breaks++;
3298 }
3299 }
3300
3301 /*
3302 * macOS special swappable csegs -> early_swapin queue
3303 * non-macOS special swappable+non-freezer csegs -> late_swapin queue
3304 * Processing special csegs means minor compacting each cseg and then
3305 * major compacting it and putting them on the early or late
3306 * (depending on platform) swapout queue. tag:DONATE
3307 */
3308 static void
3309 vm_compressor_process_special_swapped_in_segments_locked(void)
3310 {
3311 c_segment_t c_seg = NULL;
3312 bool switch_state = true, bail_wanted_cseg = false;
3313 unsigned int number_considered = 0, yield_after_considered_per_pass = 0;
3314 uint64_t bytes_freed = 0;
3315 queue_head_t *special_swappedin_list_head;
3316
3317 #if XNU_TARGET_OS_OSX
3318 special_swappedin_list_head = &c_early_swappedin_list_head;
3319 #else /* XNU_TARGET_OS_OSX */
3320 if (memorystatus_swap_all_apps) {
3321 special_swappedin_list_head = &c_late_swappedin_list_head;
3322 } else {
3323 /* called on unsupported config*/
3324 return;
3325 }
3326 #endif /* XNU_TARGET_OS_OSX */
3327
3328 yield_after_considered_per_pass = MAX(min_csegs_per_major_compaction, DELAYED_COMPACTIONS_PER_PASS);
3329 while (!queue_empty(special_swappedin_list_head)) {
3330 c_seg = (c_segment_t)queue_first(special_swappedin_list_head);
3331
3332 lck_mtx_lock_spin_always(&c_seg->c_lock);
3333
3334 if (c_seg->c_busy) {
3335 lck_mtx_unlock_always(c_list_lock);
3336 c_seg_wait_on_busy(c_seg);
3337 lck_mtx_lock_spin_always(c_list_lock);
3338 continue;
3339 }
3340
3341 C_SEG_BUSY(c_seg);
3342 lck_mtx_unlock_always(&c_seg->c_lock);
3343 lck_mtx_unlock_always(c_list_lock);
3344
3345 PAGE_REPLACEMENT_DISALLOWED(TRUE);
3346
3347 lck_mtx_lock_spin_always(&c_seg->c_lock);
3348
3349 if (c_seg_minor_compaction_and_unlock(c_seg, FALSE /*clear busy?*/)) {
3350 /*
3351 * found an empty c_segment and freed it
3352 * so go grab the next guy in the queue
3353 */
3354 PAGE_REPLACEMENT_DISALLOWED(FALSE);
3355 lck_mtx_lock_spin_always(c_list_lock);
3356 continue;
3357 }
3358
3359 PAGE_REPLACEMENT_DISALLOWED(FALSE);
3360 lck_mtx_lock_spin_always(c_list_lock);
3361
3362 switch_state = vm_compressor_major_compact_cseg(c_seg, &number_considered, &bail_wanted_cseg, &bytes_freed);
3363 assert(c_seg->c_busy);
3364 assert(!c_seg->c_on_minorcompact_q);
3365
3366 if (switch_state) {
3367 if (VM_CONFIG_SWAP_IS_ACTIVE || VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
3368 /*
3369 * Ordinarily we let swapped in segments age out + get
3370 * major compacted with the rest of the c_segs on the ageQ.
3371 * But the early donated c_segs, if well compacted, should be
3372 * kept ready to be swapped out if needed. These typically
3373 * describe memory belonging to a leaky app (macOS) or a swap-
3374 * capable app (iPadOS) and for the latter we can keep these
3375 * around longer because we control the triggers in the memorystatus
3376 * subsystem.
3377 */
3378 c_seg_switch_state(c_seg, C_ON_SWAPOUT_Q, FALSE);
3379 }
3380 }
3381
3382 C_SEG_WAKEUP_DONE(c_seg);
3383
3384 lck_mtx_unlock_always(&c_seg->c_lock);
3385
3386 if (number_considered >= yield_after_considered_per_pass) {
3387 if (bail_wanted_cseg) {
3388 /*
3389 * We stopped major compactions on a c_seg
3390 * that is wanted. We don't know the priority
3391 * of the waiter unfortunately but we are at
3392 * a very high priority and so, just in case
3393 * the waiter is a critical system daemon or
3394 * UI thread, let's give up the CPU in case
3395 * the system is running a few CPU intensive
3396 * tasks.
3397 */
3398 bail_wanted_cseg = false;
3399 lck_mtx_unlock_always(c_list_lock);
3400
3401 mutex_pause(2); /* 100us yield */
3402
3403 lck_mtx_lock_spin_always(c_list_lock);
3404 }
3405
3406 number_considered = 0;
3407 }
3408 }
3409 }
3410
3411 void
3412 vm_compressor_process_special_swapped_in_segments(void)
3413 {
3414 lck_mtx_lock_spin_always(c_list_lock);
3415 vm_compressor_process_special_swapped_in_segments_locked();
3416 lck_mtx_unlock_always(c_list_lock);
3417 }
3418
3419 #define ENABLE_DYNAMIC_SWAPPED_AGE_LIMIT 1
3420
3421 /* minimum time that segments stay on the swapped-in queue, as a grace period after they were swapped in,
3422  * before they are added to the age-q */
3423 #define C_SEGMENT_SWAPPEDIN_AGE_LIMIT_LOW 1 /* seconds */
3424 #define C_SEGMENT_SWAPPEDIN_AGE_LIMIT_NORMAL 10 /* seconds */
3425 #define C_AGE_Q_COUNT_LOW_THRESHOLD 50
3426
3427 /*
3428 * Processing regular csegs means aging them.
3429 */
3430 static void
3431 vm_compressor_process_regular_swapped_in_segments(boolean_t flush_all)
3432 {
3433 c_segment_t c_seg;
3434 clock_sec_t now;
3435 clock_nsec_t nsec;
3436
3437 unsigned long limit = C_SEGMENT_SWAPPEDIN_AGE_LIMIT_NORMAL;
3438
3439 #ifdef ENABLE_DYNAMIC_SWAPPED_AGE_LIMIT
3440 /* In normal operation, segments are kept in the swapped-in-q for a grace period of 10 seconds so that whoever
3441 * needed to decompress something from a segment that was just swapped-in would have a chance to decompress
3442 * more out of it.
3443 * If the system is in high memory pressure state, this may cause the age-q to be completely empty so that
3444 * there are no candidate segments for swap-out. In this state we use a lower limit of 1 second.
3445 * condition 1: the age-q absolute size is too low
3446 * condition 2: there are more segments in swapped-in-q than in age-q
3447 * each of these represents a bad situation which we want to try to alleviate by moving more segments from
3448 * swapped-in-q to age-q so that we have a better selection of candidates to swap out
3449 */
3450 if (c_age_count < C_AGE_Q_COUNT_LOW_THRESHOLD || c_age_count < c_regular_swappedin_count) {
3451 limit = C_SEGMENT_SWAPPEDIN_AGE_LIMIT_LOW;
3452 }
3453 #endif
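	/*
	 * Worked example (illustrative numbers only): under the normal limit a
	 * segment swapped in at t=100s is not aged until t=110s. If the age-q
	 * holds, say, 30 segments (below C_AGE_Q_COUNT_LOW_THRESHOLD) or fewer
	 * segments than the swapped-in-q, the limit drops to 1 second and the
	 * same segment becomes an age-q (and hence swap-out) candidate at t=101s.
	 */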
3454
3455 clock_get_system_nanotime(&now, &nsec);
3456
3457 while (!queue_empty(&c_regular_swappedin_list_head)) {
3458 c_seg = (c_segment_t)queue_first(&c_regular_swappedin_list_head);
3459
3460 if (flush_all == FALSE && (now - c_seg->c_swappedin_ts) < limit) {
3461 /* swappedin q is sorted by time of addition, so if we reached a seg that's too
3462  * young, we know that all the rest after it are also too young */
3463 break;
3464 }
3465
3466 lck_mtx_lock_spin_always(&c_seg->c_lock);
3467
3468 c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
3469 c_seg->c_agedin_ts = (uint32_t) now;
3470
3471 lck_mtx_unlock_always(&c_seg->c_lock);
3472 }
3473 }
3474
3475
3476 extern int vm_num_swap_files;
3477 extern int vm_num_pinned_swap_files;
3478 extern int vm_swappin_enabled;
3479
3480 extern unsigned int vm_swapfile_total_segs_used;
3481 extern unsigned int vm_swapfile_total_segs_alloced;
3482
3483
3484 void
3485 vm_compressor_flush(void)
3486 {
3487 uint64_t vm_swap_put_failures_at_start;
3488 wait_result_t wait_result = 0;
3489 AbsoluteTime startTime, endTime;
3490 clock_sec_t now_sec;
3491 clock_nsec_t now_nsec;
3492 uint64_t nsec;
3493 c_segment_t c_seg, c_seg_next;
3494
3495 HIBLOG("vm_compressor_flush - starting\n");
3496
3497 clock_get_uptime(&startTime);
3498
3499 lck_mtx_lock_spin_always(c_list_lock);
3500
3501 fastwake_warmup = FALSE;
3502 compaction_swapper_abort = 1;
3503
3504 while (compaction_swapper_running) {
3505 assert_wait((event_t)&compaction_swapper_running, THREAD_UNINT);
3506
3507 lck_mtx_unlock_always(c_list_lock);
3508
3509 thread_block(THREAD_CONTINUE_NULL);
3510
3511 lck_mtx_lock_spin_always(c_list_lock);
3512 }
3513 compaction_swapper_abort = 0;
3514 compaction_swapper_running = 1;
3515
3516 hibernate_flushing = TRUE;
3517 hibernate_no_swapspace = FALSE;
3518 hibernate_flush_timed_out = FALSE;
3519 c_generation_id_flush_barrier = c_generation_id + 1000;
3520
3521 clock_get_system_nanotime(&now_sec, &now_nsec);
3522 hibernate_flushing_deadline = now_sec + HIBERNATE_FLUSHING_SECS_TO_COMPLETE;
3523
3524 vm_swap_put_failures_at_start = vm_swap_put_failures;
3525
3526 /*
3527 * We are about to hibernate and so we want all segments flushed to disk.
3528 * Segments that are on the major compaction queue won't be considered in
3529 * the vm_compressor_compact_and_swap() pass. So we need to bring them to
3530 * the ageQ for consideration.
3531 */
3532 if (!queue_empty(&c_major_list_head)) {
3533 c_seg = (c_segment_t)queue_first(&c_major_list_head);
3534
3535 while (!queue_end(&c_major_list_head, (queue_entry_t)c_seg)) {
3536 c_seg_next = (c_segment_t) queue_next(&c_seg->c_age_list);
3537 lck_mtx_lock_spin_always(&c_seg->c_lock);
3538 c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
3539 lck_mtx_unlock_always(&c_seg->c_lock);
3540 c_seg = c_seg_next;
3541 }
3542 }
3543 vm_compressor_compact_and_swap(TRUE);
3544 /* need to wait here since the swap thread may also be running in parallel and handling segments */
3545 while (!queue_empty(&c_early_swapout_list_head) || !queue_empty(&c_regular_swapout_list_head) || !queue_empty(&c_late_swapout_list_head)) {
3546 assert_wait_timeout((event_t) &compaction_swapper_running, THREAD_INTERRUPTIBLE, 5000, 1000 * NSEC_PER_USEC);
3547
3548 lck_mtx_unlock_always(c_list_lock);
3549
3550 wait_result = thread_block(THREAD_CONTINUE_NULL);
3551
3552 lck_mtx_lock_spin_always(c_list_lock);
3553
3554 if (wait_result == THREAD_TIMED_OUT) {
3555 break;
3556 }
3557 }
3558 hibernate_flushing = FALSE;
3559 compaction_swapper_running = 0;
3560
3561 if (vm_swap_put_failures > vm_swap_put_failures_at_start) {
3562 HIBLOG("vm_compressor_flush failed to clean %llu segments - vm_page_compressor_count(%d)\n",
3563 vm_swap_put_failures - vm_swap_put_failures_at_start, VM_PAGE_COMPRESSOR_COUNT);
3564 }
3565
3566 lck_mtx_unlock_always(c_list_lock);
3567
3568 thread_wakeup((event_t)&compaction_swapper_running);
3569
3570 clock_get_uptime(&endTime);
3571 SUB_ABSOLUTETIME(&endTime, &startTime);
3572 absolutetime_to_nanoseconds(endTime, &nsec);
3573
3574 HIBLOG("vm_compressor_flush completed - took %qd msecs - vm_num_swap_files = %d, vm_num_pinned_swap_files = %d, vm_swappin_enabled = %d\n",
3575 nsec / 1000000ULL, vm_num_swap_files, vm_num_pinned_swap_files, vm_swappin_enabled);
3576 }
3577
3578
3579 int compaction_swap_trigger_thread_awakened = 0;
3580
3581 static void
3582 vm_compressor_swap_trigger_thread(void)
3583 {
3584 current_thread()->options |= TH_OPT_VMPRIV;
3585
3586 /*
3587 * compaction_swapper_init_now is set when the first call to
3588 * vm_consider_waking_compactor_swapper is made from
3589 * vm_pageout_scan... since this function is called upon
3590 * thread creation, we want to make sure to delay adjusting
3591 * the tuneables until we are awakened via vm_pageout_scan
3592 * so that we are at a point where the vm_swapfile_open will
3593 * be operating on the correct directory (in case the default
3594 * of using the VM volume is overridden by the dynamic_pager)
3595 */
3596 if (compaction_swapper_init_now) {
3597 vm_compaction_swapper_do_init();
3598
3599 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
3600 thread_vm_bind_group_add();
3601 }
3602 #if CONFIG_THREAD_GROUPS
3603 thread_group_vm_add();
3604 #endif
3605 thread_set_thread_name(current_thread(), "VM_cswap_trigger");
3606 compaction_swapper_init_now = 0;
3607 }
3608 lck_mtx_lock_spin_always(c_list_lock);
3609
3610 compaction_swap_trigger_thread_awakened++;
3611 compaction_swapper_awakened = 0;
3612
3613 if (compaction_swapper_running == 0) {
3614 compaction_swapper_running = 1;
3615
3616 vm_compressor_compact_and_swap(FALSE);
3617
3618 compaction_swapper_running = 0;
3619 }
3620 assert_wait((event_t)&c_compressor_swap_trigger, THREAD_UNINT);
3621
3622 if (compaction_swapper_running == 0) {
3623 thread_wakeup((event_t)&compaction_swapper_running);
3624 }
3625
3626 lck_mtx_unlock_always(c_list_lock);
3627
3628 thread_block((thread_continue_t)vm_compressor_swap_trigger_thread);
3629
3630 /* NOTREACHED */
3631 }
3632
3633
3634 void
3635 vm_compressor_record_warmup_start(void)
3636 {
3637 c_segment_t c_seg;
3638
3639 lck_mtx_lock_spin_always(c_list_lock);
3640
3641 if (first_c_segment_to_warm_generation_id == 0) {
3642 if (!queue_empty(&c_age_list_head)) {
3643 c_seg = (c_segment_t)queue_last(&c_age_list_head);
3644
3645 first_c_segment_to_warm_generation_id = c_seg->c_generation_id;
3646 } else {
3647 first_c_segment_to_warm_generation_id = 0;
3648 }
3649
3650 fastwake_recording_in_progress = TRUE;
3651 }
3652 lck_mtx_unlock_always(c_list_lock);
3653 }
3654
3655
3656 void
3657 vm_compressor_record_warmup_end(void)
3658 {
3659 c_segment_t c_seg;
3660
3661 lck_mtx_lock_spin_always(c_list_lock);
3662
3663 if (fastwake_recording_in_progress == TRUE) {
3664 if (!queue_empty(&c_age_list_head)) {
3665 c_seg = (c_segment_t)queue_last(&c_age_list_head);
3666
3667 last_c_segment_to_warm_generation_id = c_seg->c_generation_id;
3668 } else {
3669 last_c_segment_to_warm_generation_id = first_c_segment_to_warm_generation_id;
3670 }
3671
3672 fastwake_recording_in_progress = FALSE;
3673
3674 HIBLOG("vm_compressor_record_warmup (%qd - %qd)\n", first_c_segment_to_warm_generation_id, last_c_segment_to_warm_generation_id);
3675 }
3676 lck_mtx_unlock_always(c_list_lock);
3677 }
3678
3679
3680 #define DELAY_TRIM_ON_WAKE_NS (25 * NSEC_PER_SEC)
3681
3682 void
3683 vm_compressor_delay_trim(void)
3684 {
3685 uint64_t now = mach_absolute_time();
3686 uint64_t delay_abstime;
3687 nanoseconds_to_absolutetime(DELAY_TRIM_ON_WAKE_NS, &delay_abstime);
3688 dont_trim_until_ts = now + delay_abstime;
3689 }
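/*
 * Sketch of a consumer honoring this grace period (illustrative only, not
 * necessarily how the trim path is actually written):
 *
 *	if (mach_absolute_time() < dont_trim_until_ts) {
 *		return;		(still within the post-wake grace window, skip trims)
 *	}
 */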
3690
3691
3692 void
3693 vm_compressor_do_warmup(void)
3694 {
3695 lck_mtx_lock_spin_always(c_list_lock);
3696
3697 if (first_c_segment_to_warm_generation_id == last_c_segment_to_warm_generation_id) {
3698 first_c_segment_to_warm_generation_id = last_c_segment_to_warm_generation_id = 0;
3699
3700 lck_mtx_unlock_always(c_list_lock);
3701 return;
3702 }
3703
3704 if (compaction_swapper_running == 0 && compaction_swapper_awakened == 0) {
3705 fastwake_warmup = TRUE;
3706
3707 compaction_swapper_awakened = 1;
3708 thread_wakeup((event_t)&c_compressor_swap_trigger);
3709 }
3710 lck_mtx_unlock_always(c_list_lock);
3711 }
3712
3713 void
3714 do_fastwake_warmup_all(void)
3715 {
3716 lck_mtx_lock_spin_always(c_list_lock);
3717
3718 if (queue_empty(&c_swappedout_list_head) && queue_empty(&c_swappedout_sparse_list_head)) {
3719 lck_mtx_unlock_always(c_list_lock);
3720 return;
3721 }
3722
3723 fastwake_warmup = TRUE;
3724
3725 do_fastwake_warmup(&c_swappedout_list_head, TRUE);
3726
3727 do_fastwake_warmup(&c_swappedout_sparse_list_head, TRUE);
3728
3729 fastwake_warmup = FALSE;
3730
3731 lck_mtx_unlock_always(c_list_lock);
3732 }
3733
3734 void
3735 do_fastwake_warmup(queue_head_t *c_queue, boolean_t consider_all_cseg)
3736 {
3737 c_segment_t c_seg = NULL;
3738 AbsoluteTime startTime, endTime;
3739 uint64_t nsec;
3740
3741
3742 HIBLOG("vm_compressor_fastwake_warmup (%qd - %qd) - starting\n", first_c_segment_to_warm_generation_id, last_c_segment_to_warm_generation_id);
3743
3744 clock_get_uptime(&startTime);
3745
3746 lck_mtx_unlock_always(c_list_lock);
3747
3748 proc_set_thread_policy(current_thread(),
3749 TASK_POLICY_INTERNAL, TASK_POLICY_IO, THROTTLE_LEVEL_COMPRESSOR_TIER2);
3750
3751 PAGE_REPLACEMENT_DISALLOWED(TRUE);
3752
3753 lck_mtx_lock_spin_always(c_list_lock);
3754
3755 while (!queue_empty(c_queue) && fastwake_warmup == TRUE) {
3756 c_seg = (c_segment_t) queue_first(c_queue);
3757
3758 if (consider_all_cseg == FALSE) {
3759 if (c_seg->c_generation_id < first_c_segment_to_warm_generation_id ||
3760 c_seg->c_generation_id > last_c_segment_to_warm_generation_id) {
3761 break;
3762 }
3763
3764 if (vm_page_free_count < (AVAILABLE_MEMORY / 4)) {
3765 break;
3766 }
3767 }
3768
3769 lck_mtx_lock_spin_always(&c_seg->c_lock);
3770 lck_mtx_unlock_always(c_list_lock);
3771
3772 if (c_seg->c_busy) {
3773 PAGE_REPLACEMENT_DISALLOWED(FALSE);
3774 c_seg_wait_on_busy(c_seg);
3775 PAGE_REPLACEMENT_DISALLOWED(TRUE);
3776 } else {
3777 if (c_seg_swapin(c_seg, TRUE, FALSE) == 0) {
3778 lck_mtx_unlock_always(&c_seg->c_lock);
3779 }
3780 c_segment_warmup_count++;
3781
3782 PAGE_REPLACEMENT_DISALLOWED(FALSE);
3783 vm_pageout_io_throttle();
3784 PAGE_REPLACEMENT_DISALLOWED(TRUE);
3785 }
3786 lck_mtx_lock_spin_always(c_list_lock);
3787 }
3788 lck_mtx_unlock_always(c_list_lock);
3789
3790 PAGE_REPLACEMENT_DISALLOWED(FALSE);
3791
3792 proc_set_thread_policy(current_thread(),
3793 TASK_POLICY_INTERNAL, TASK_POLICY_IO, THROTTLE_LEVEL_COMPRESSOR_TIER0);
3794
3795 clock_get_uptime(&endTime);
3796 SUB_ABSOLUTETIME(&endTime, &startTime);
3797 absolutetime_to_nanoseconds(endTime, &nsec);
3798
3799 HIBLOG("vm_compressor_fastwake_warmup completed - took %qd msecs\n", nsec / 1000000ULL);
3800
3801 lck_mtx_lock_spin_always(c_list_lock);
3802
3803 if (consider_all_cseg == FALSE) {
3804 first_c_segment_to_warm_generation_id = last_c_segment_to_warm_generation_id = 0;
3805 }
3806 }
3807
3808 extern bool vm_swapout_thread_running;
3809 extern boolean_t compressor_store_stop_compaction;
3810
3811 void
3812 vm_compressor_compact_and_swap(boolean_t flush_all)
3813 {
3814 c_segment_t c_seg;
3815 bool switch_state, bail_wanted_cseg = false;
3816 clock_sec_t now;
3817 clock_nsec_t nsec;
3818 mach_timespec_t start_ts, end_ts;
3819 unsigned int number_considered, wanted_cseg_found, yield_after_considered_per_pass, number_yields;
3820 uint64_t bytes_freed, delta_usec;
3821 uint32_t c_swapout_count = 0;
3822
3823 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, DBG_VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_START, c_age_count, c_minor_count, c_major_count, vm_page_free_count);
3824
3825 if (fastwake_warmup == TRUE) {
3826 uint64_t starting_warmup_count;
3827
3828 starting_warmup_count = c_segment_warmup_count;
3829
3830 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 11) | DBG_FUNC_START, c_segment_warmup_count,
3831 first_c_segment_to_warm_generation_id, last_c_segment_to_warm_generation_id, 0, 0);
3832 do_fastwake_warmup(&c_swappedout_list_head, FALSE);
3833 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 11) | DBG_FUNC_END, c_segment_warmup_count, c_segment_warmup_count - starting_warmup_count, 0, 0, 0);
3834
3835 fastwake_warmup = FALSE;
3836 }
3837
3838 #if (XNU_TARGET_OS_OSX && __arm64__)
3839 /*
3840 * Re-considering major csegs showed benefits on all platforms by
3841 * significantly reducing fragmentation and getting back memory.
3842 * However, on smaller devices, eg watch, there was increased power
3843 * use for the additional compactions. And the turnover in csegs on
3844 * those smaller platforms is high enough in the decompression/free
3845 * path that we can skip reconsidering them here because we already
3846 * consider them for major compaction in those paths.
3847 */
3848 vm_compressor_process_major_segments(false /*all segments and not just the ripe-aged ones*/);
3849 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
3850
3851 /*
3852 * it's possible for the c_age_list_head to be empty if we
3853 * hit our limits for growing the compressor pool and we subsequently
3854 * hibernated... on the next hibernation we could see the queue as
3855 * empty and not proceed even though we have a bunch of segments on
3856 * the swapped-in queue that need to be dealt with.
3857 */
3858 vm_compressor_do_delayed_compactions(flush_all);
3859 vm_compressor_process_special_swapped_in_segments_locked();
3860 vm_compressor_process_regular_swapped_in_segments(flush_all);
3861
3862 /*
3863 * we only need to grab the timestamp once per
3864 * invocation of this function since the
3865 * timescale we're interested in is measured
3866 * in days
3867 */
3868 clock_get_system_nanotime(&now, &nsec);
3869
3870 start_ts.tv_sec = (int) now;
3871 start_ts.tv_nsec = nsec;
3872 delta_usec = 0;
3873 number_considered = 0;
3874 wanted_cseg_found = 0;
3875 number_yields = 0;
3876 bytes_freed = 0;
3877 yield_after_considered_per_pass = MAX(min_csegs_per_major_compaction, DELAYED_COMPACTIONS_PER_PASS);
3878
3879 /**
3880 * SW: Need to figure out how to properly rate limit this log because it is currently way too
3881 * noisy. rdar://99379414 (Figure out how to rate limit the fragmentation level logging)
3882 */
3883 vm_log_debug("before compaction fragmentation level %u\n", vm_compressor_fragmentation_level());
3884
3885 while (!queue_empty(&c_age_list_head) && !compaction_swapper_abort && !compressor_store_stop_compaction) {
3886 if (hibernate_flushing == TRUE) {
3887 clock_sec_t sec;
3888
3889 if (hibernate_should_abort()) {
3890 HIBLOG("vm_compressor_flush - hibernate_should_abort returned TRUE\n");
3891 break;
3892 }
3893 if (hibernate_no_swapspace == TRUE) {
3894 HIBLOG("vm_compressor_flush - out of swap space\n");
3895 break;
3896 }
3897 if (vm_swap_files_pinned() == FALSE) {
3898 HIBLOG("vm_compressor_flush - unpinned swap files\n");
3899 break;
3900 }
3901 if (hibernate_in_progress_with_pinned_swap == TRUE &&
3902 (vm_swapfile_total_segs_alloced == vm_swapfile_total_segs_used)) {
3903 HIBLOG("vm_compressor_flush - out of pinned swap space\n");
3904 break;
3905 }
3906 clock_get_system_nanotime(&sec, &nsec);
3907
3908 if (sec > hibernate_flushing_deadline) {
3909 hibernate_flush_timed_out = TRUE;
3910 HIBLOG("vm_compressor_flush - failed to finish before deadline\n");
3911 break;
3912 }
3913 }
3914
3915 c_swapout_count = c_early_swapout_count + c_regular_swapout_count + c_late_swapout_count;
3916 if (VM_CONFIG_SWAP_IS_ACTIVE && !vm_swap_out_of_space() && c_swapout_count >= C_SWAPOUT_LIMIT) {
3917 assert_wait_timeout((event_t) &compaction_swapper_running, THREAD_INTERRUPTIBLE, 100, 1000 * NSEC_PER_USEC);
3918
3919 if (!vm_swapout_thread_running) {
3920 thread_wakeup((event_t)&vm_swapout_thread);
3921 }
3922
3923 lck_mtx_unlock_always(c_list_lock);
3924
3925 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, DBG_VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 1, c_swapout_count, 0, 0);
3926
3927 thread_block(THREAD_CONTINUE_NULL);
3928
3929 lck_mtx_lock_spin_always(c_list_lock);
3930 }
3931 /*
3932 * Minor compactions
3933 */
3934 vm_compressor_do_delayed_compactions(flush_all);
3935
3936 /*
3937 * vm_compressor_process_early_swapped_in_segments()
3938 * might be too aggressive. So OFF for now.
3939 */
3940 vm_compressor_process_regular_swapped_in_segments(flush_all);
3941
3942 /* Recompute because we dropped the c_list_lock above*/
3943 c_swapout_count = c_early_swapout_count + c_regular_swapout_count + c_late_swapout_count;
3944 if (VM_CONFIG_SWAP_IS_ACTIVE && !vm_swap_out_of_space() && c_swapout_count >= C_SWAPOUT_LIMIT) {
3945 /*
3946 * we timed out on the above thread_block
3947 * let's loop around and try again
3948 * the timeout allows us to continue
3949 * to do minor compactions to make
3950 * more memory available
3951 */
3952 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, DBG_VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 2, c_swapout_count, 0, 0);
3953
3954 continue;
3955 }
3956
3957 /*
3958 * Swap out segments?
3959 */
3960 if (flush_all == FALSE) {
3961 bool needs_to_swap;
3962
3963 lck_mtx_unlock_always(c_list_lock);
3964
3965 needs_to_swap = compressor_needs_to_swap();
3966
3967 lck_mtx_lock_spin_always(c_list_lock);
3968
3969 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, DBG_VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 3, needs_to_swap, 0, 0);
3970
3971 if (!needs_to_swap) {
3972 break;
3973 }
3974 }
3975 if (queue_empty(&c_age_list_head)) {
3976 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, DBG_VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 4, c_age_count, 0, 0);
3977 break;
3978 }
3979 c_seg = (c_segment_t) queue_first(&c_age_list_head);
3980
3981 assert(c_seg->c_state == C_ON_AGE_Q);
3982
3983 if (flush_all == TRUE && c_seg->c_generation_id > c_generation_id_flush_barrier) {
3984 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, DBG_VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 5, 0, 0, 0);
3985 break;
3986 }
3987
3988 lck_mtx_lock_spin_always(&c_seg->c_lock);
3989
3990 if (c_seg->c_busy) {
3991 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, DBG_VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 6, (void*) VM_KERNEL_ADDRPERM(c_seg), 0, 0);
3992
3993 lck_mtx_unlock_always(c_list_lock);
3994 c_seg_wait_on_busy(c_seg);
3995 lck_mtx_lock_spin_always(c_list_lock);
3996
3997 continue;
3998 }
3999 C_SEG_BUSY(c_seg);
4000
4001 if (c_seg_do_minor_compaction_and_unlock(c_seg, FALSE, TRUE, TRUE)) {
4002 /*
4003 * found an empty c_segment and freed it
4004 * so go grab the next guy in the queue
4005 */
4006 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, DBG_VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 7, 0, 0, 0);
4007 c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_freed_segs++;
4008 continue;
4009 }
4010
4011 switch_state = vm_compressor_major_compact_cseg(c_seg, &number_considered, &bail_wanted_cseg, &bytes_freed);
4012 if (bail_wanted_cseg) {
4013 wanted_cseg_found++;
4014 bail_wanted_cseg = false;
4015 }
4016
4017 assert(c_seg->c_busy);
4018 assert(!c_seg->c_on_minorcompact_q);
4019
4020 if (switch_state) {
4021 if (VM_CONFIG_SWAP_IS_ACTIVE) {
4022 int new_state = C_ON_SWAPOUT_Q;
4023 #if (XNU_TARGET_OS_OSX && __arm64__)
4024 if (flush_all == false && compressor_swapout_conditions_met() == false) {
4025 new_state = C_ON_MAJORCOMPACT_Q;
4026 }
4027 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
4028
4029 if (new_state == C_ON_SWAPOUT_Q) {
4030 /*
4031 * This mode of putting a generic c_seg on the swapout list is
4032 * only supported when we have general swapping enabled
4033 */
4034 clock_sec_t lnow;
4035 clock_nsec_t lnsec;
4036 clock_get_system_nanotime(&lnow, &lnsec);
4037 if (c_seg->c_agedin_ts && (lnow - c_seg->c_agedin_ts) < 30) {
4038 vmcs_stats.unripe_under_30s++;
4039 } else if (c_seg->c_agedin_ts && (lnow - c_seg->c_agedin_ts) < 60) {
4040 vmcs_stats.unripe_under_60s++;
4041 } else if (c_seg->c_agedin_ts && (lnow - c_seg->c_agedin_ts) < 300) {
4042 vmcs_stats.unripe_under_300s++;
4043 }
4044 }
4045
4046 c_seg_switch_state(c_seg, new_state, FALSE);
4047 } else {
4048 if ((vm_swapout_ripe_segments == TRUE && c_overage_swapped_count < c_overage_swapped_limit)) {
4049 assert(VM_CONFIG_SWAP_IS_PRESENT);
4050 /*
4051 * we are running compressor sweeps with swap-behind
4052 * make sure the c_seg has aged enough before swapping it
4053 * out...
4054 */
4055 if ((now - c_seg->c_creation_ts) >= vm_ripe_target_age) {
4056 c_seg->c_overage_swap = TRUE;
4057 c_overage_swapped_count++;
4058 c_seg_switch_state(c_seg, C_ON_SWAPOUT_Q, FALSE);
4059 }
4060 }
4061 }
4062 if (c_seg->c_state == C_ON_AGE_Q) {
4063 /*
4064 * this c_seg didn't get moved to the swapout queue
4065 * so we need to move it out of the way...
4066 * we just did a major compaction on it so put it
4067 * on that queue
4068 */
4069 c_seg_switch_state(c_seg, C_ON_MAJORCOMPACT_Q, FALSE);
4070 } else {
4071 c_seg_major_compact_stats[c_seg_major_compact_stats_now].wasted_space_in_swapouts += c_seg_bufsize - c_seg->c_bytes_used;
4072 c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_swapouts++;
4073 }
4074 }
4075
4076 C_SEG_WAKEUP_DONE(c_seg);
4077
4078 lck_mtx_unlock_always(&c_seg->c_lock);
4079
4080 /*
4081 * On systems _with_ general swap, regardless of jetsam, we wake up the swapout thread here.
4082 * On systems _without_ general swap, it's the responsibility of the memorystatus
4083 * subsystem to wake up the swapper.
4084 * TODO: When we have full jetsam support on a swap enabled system, we will need to revisit
4085 * this policy.
4086 */
4087 if (VM_CONFIG_SWAP_IS_ACTIVE && c_swapout_count) {
4088 /*
4089 * We don't pause/yield here because we will either
4090 * yield below or at the top of the loop with the
4091 * assert_wait_timeout.
4092 */
4093 if (!vm_swapout_thread_running) {
4094 thread_wakeup((event_t)&vm_swapout_thread);
4095 }
4096 }
4097
4098 if (number_considered >= yield_after_considered_per_pass) {
4099 if (wanted_cseg_found) {
4100 /*
4101 * We stopped major compactions on a c_seg
4102 * that is wanted. We don't know the priority
4103 * of the waiter unfortunately but we are at
4104 * a very high priority and so, just in case
4105 * the waiter is a critical system daemon or
4106 * UI thread, let's give up the CPU in case
4107 * the system is running a few CPU intensive
4108 * tasks.
4109 */
4110 lck_mtx_unlock_always(c_list_lock);
4111
4112 mutex_pause(2); /* 100us yield */
4113
4114 number_yields++;
4115
4116 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, DBG_VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 11, number_considered, number_yields, 0);
4117
4118 lck_mtx_lock_spin_always(c_list_lock);
4119 }
4120
4121 number_considered = 0;
4122 wanted_cseg_found = 0;
4123 }
4124 }
4125 clock_get_system_nanotime(&now, &nsec);
4126
4127 end_ts = major_compact_ts = (mach_timespec_t){.tv_sec = (int)now, .tv_nsec = nsec};
4128
4129 SUB_MACH_TIMESPEC(&end_ts, &start_ts);
4130
4131 delta_usec = (end_ts.tv_sec * USEC_PER_SEC) + (end_ts.tv_nsec / NSEC_PER_USEC) - (number_yields * 100);
4132
4133 delta_usec = MAX(1, delta_usec); /* we could have 0 usec run if conditions weren't right */
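	/*
	 * Worked example (illustrative): a pass that ran for 2.5 msecs and yielded
	 * 5 times is charged 2500 - (5 * 100) = 2000 usecs, so only the time spent
	 * actually compacting feeds the bytes_freed_rate_us figure below.
	 */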
4134
4135 c_seg_major_compact_stats[c_seg_major_compact_stats_now].bytes_freed_rate_us = (bytes_freed / delta_usec);
4136
4137 if ((c_seg_major_compact_stats_now + 1) == C_SEG_MAJOR_COMPACT_STATS_MAX) {
4138 c_seg_major_compact_stats_now = 0;
4139 } else {
4140 c_seg_major_compact_stats_now++;
4141 }
4142
4143 assert(c_seg_major_compact_stats_now < C_SEG_MAJOR_COMPACT_STATS_MAX);
4144
4145 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, DBG_VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_END, c_age_count, c_minor_count, c_major_count, vm_page_free_count);
4146 }
4147
4148
4149 static c_segment_t
4150 c_seg_allocate(c_segment_t *current_chead, bool *nearing_limits)
4151 {
4152 c_segment_t c_seg;
4153 int min_needed;
4154 int size_to_populate;
4155 c_segment_t *donate_queue_head;
4156 uint32_t compressed_pages;
4157
4158 *nearing_limits = false;
4159
4160 compressed_pages = vm_compressor_pages_compressed();
4161
4162 if (compressed_pages >= c_segment_pages_compressed_nearing_limit) {
4163 *nearing_limits = true;
4164 }
4165 if (compressed_pages >= c_segment_pages_compressed_limit) {
4166 /*
4167 * We've reached the compressed pages limit, don't return
4168 * a segment to compress into
4169 */
4170 return NULL;
4171 }
4172
4173 if ((c_seg = *current_chead) == NULL) {
4174 uint32_t c_segno;
4175
4176 lck_mtx_lock_spin_always(c_list_lock);
4177
4178 while (c_segments_busy == TRUE) {
4179 assert_wait((event_t) (&c_segments_busy), THREAD_UNINT);
4180
4181 lck_mtx_unlock_always(c_list_lock);
4182
4183 thread_block(THREAD_CONTINUE_NULL);
4184
4185 lck_mtx_lock_spin_always(c_list_lock);
4186 }
4187 if (c_free_segno_head == (uint32_t)-1) {
4188 uint32_t c_segments_available_new;
4189
4190 /*
4191 * We may have dropped the c_list_lock, re-evaluate
4192 * the compressed pages count
4193 */
4194 compressed_pages = vm_compressor_pages_compressed();
4195
4196 if (c_segments_available >= c_segments_nearing_limit ||
4197 compressed_pages >= c_segment_pages_compressed_nearing_limit) {
4198 *nearing_limits = true;
4199 }
4200 if (c_segments_available >= c_segments_limit ||
4201 compressed_pages >= c_segment_pages_compressed_limit) {
4202 lck_mtx_unlock_always(c_list_lock);
4203
4204 return NULL;
4205 }
4206 c_segments_busy = TRUE;
4207 lck_mtx_unlock_always(c_list_lock);
4208
4209 /* pages for c_segments are never depopulated, c_segments_available never goes down */
4210 kernel_memory_populate((vm_offset_t)c_segments_next_page,
4211 PAGE_SIZE, KMA_NOFAIL | KMA_KOBJECT,
4212 VM_KERN_MEMORY_COMPRESSOR);
4213 c_segments_next_page += PAGE_SIZE;
4214
4215 c_segments_available_new = c_segments_available + C_SEGMENTS_PER_PAGE;
4216
4217 if (c_segments_available_new > c_segments_limit) {
4218 c_segments_available_new = c_segments_limit;
4219 }
4220
4221 /* add the just-added segments to the top of the free-list */
4222 for (c_segno = c_segments_available + 1; c_segno < c_segments_available_new; c_segno++) {
4223 c_segments_get(c_segno - 1)->c_segno = c_segno; /* next free is the one after you */
4224 }
4225
4226 lck_mtx_lock_spin_always(c_list_lock);
4227
4228 c_segments_get(c_segno - 1)->c_segno = c_free_segno_head; /* link to the rest of the existing freelist */
4229 c_free_segno_head = c_segments_available; /* first one in the page that was just allocated */
4230 c_segments_available = c_segments_available_new;
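			/*
			 * Worked example (hypothetical small numbers): if C_SEGMENTS_PER_PAGE
			 * were 4, c_segments_available were 8 and the free list was empty,
			 * the loop above links entry 8 -> 9, 9 -> 10, 10 -> 11, entry 11 is
			 * pointed at the old (empty) head, c_free_segno_head becomes 8 and
			 * c_segments_available becomes 12.
			 */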
4231
4232 c_segments_busy = FALSE;
4233 thread_wakeup((event_t) (&c_segments_busy));
4234 }
4235 c_segno = c_free_segno_head;
4236 assert(c_segno >= 0 && c_segno < c_segments_limit);
4237
4238 c_free_segno_head = (uint32_t)c_segments_get(c_segno)->c_segno;
4239
4240 /*
4241 * do the rest of the bookkeeping now while we're still behind
4242 * the list lock and grab our generation id now into a local
4243 * so that we can install it once we have the c_seg allocated
4244 */
4245 c_segment_count++;
4246 if (c_segment_count > c_segment_count_max) {
4247 c_segment_count_max = c_segment_count;
4248 }
4249
4250 lck_mtx_unlock_always(c_list_lock);
4251
4252 c_seg = zalloc_flags(compressor_segment_zone, Z_WAITOK | Z_ZERO);
4253
4254 c_seg->c_store.c_buffer = (int32_t *)C_SEG_BUFFER_ADDRESS(c_segno);
4255
4256 lck_mtx_init(&c_seg->c_lock, &vm_compressor_lck_grp, LCK_ATTR_NULL);
4257
4258 c_seg->c_state = C_IS_EMPTY;
4259 c_seg->c_firstemptyslot = C_SLOT_MAX_INDEX;
4260 c_seg->c_mysegno = c_segno;
4261
4262 lck_mtx_lock_spin_always(c_list_lock);
4263 c_empty_count++; /* going to be immediately decremented in the next call */
4264 c_seg_switch_state(c_seg, C_IS_FILLING, FALSE);
4265 c_segments_get(c_segno)->c_seg = c_seg;
4266 assert(c_segments_get(c_segno)->c_segno > c_segments_available); /* we just assigned a pointer to it so this is an indication that it is occupied */
4267 lck_mtx_unlock_always(c_list_lock);
4268
4269 for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
4270 #if XNU_TARGET_OS_OSX /* tag:DONATE */
4271 donate_queue_head = (c_segment_t*) &(pgo_iothread_internal_state[i].current_early_swapout_chead);
4272 #else /* XNU_TARGET_OS_OSX */
4273 if (memorystatus_swap_all_apps) {
4274 donate_queue_head = (c_segment_t*) &(pgo_iothread_internal_state[i].current_late_swapout_chead);
4275 } else {
4276 donate_queue_head = NULL;
4277 }
4278 #endif /* XNU_TARGET_OS_OSX */
4279
4280 if (current_chead == donate_queue_head) {
4281 c_seg->c_has_donated_pages = 1;
4282 break;
4283 }
4284 }
4285
4286 *current_chead = c_seg;
4287
4288 #if DEVELOPMENT || DEBUG
4289 C_SEG_MAKE_WRITEABLE(c_seg);
4290 #endif
4291 }
4292 c_seg_alloc_nextslot(c_seg);
4293
4294 size_to_populate = c_seg_allocsize - C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset);
4295
4296 if (size_to_populate) {
4297 min_needed = PAGE_SIZE + (c_seg_allocsize - c_seg_bufsize);
4298
4299 if (C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset - c_seg->c_nextoffset) < (unsigned) min_needed) {
4300 if (size_to_populate > C_SEG_MAX_POPULATE_SIZE) {
4301 size_to_populate = C_SEG_MAX_POPULATE_SIZE;
4302 }
4303
4304 os_atomic_add(&vm_pageout_vminfo.vm_compressor_pages_grabbed, size_to_populate / PAGE_SIZE, relaxed);
4305
4306 kernel_memory_populate(
4307 (vm_offset_t) &c_seg->c_store.c_buffer[c_seg->c_populated_offset],
4308 size_to_populate,
4309 KMA_NOFAIL | KMA_COMPRESSOR,
4310 VM_KERN_MEMORY_COMPRESSOR);
4311 } else {
4312 size_to_populate = 0;
4313 }
4314 }
4315 PAGE_REPLACEMENT_DISALLOWED(TRUE);
4316
4317 lck_mtx_lock_spin_always(&c_seg->c_lock);
4318
4319 if (size_to_populate) {
4320 c_seg->c_populated_offset += C_SEG_BYTES_TO_OFFSET(size_to_populate);
4321 }
4322
4323 return c_seg;
4324 }
4325
4326 #if DEVELOPMENT || DEBUG
4327 #if CONFIG_FREEZE
4328 extern boolean_t memorystatus_freeze_to_memory;
4329 #endif /* CONFIG_FREEZE */
4330 #endif /* DEVELOPMENT || DEBUG */
4331 uint64_t c_seg_total_donated_bytes = 0; /* For testing/debugging only for now. Remove and add new counters for vm_stat.*/
4332
4333 uint64_t c_seg_filled_no_contention = 0;
4334 uint64_t c_seg_filled_contention = 0;
4335 clock_sec_t c_seg_filled_contention_sec_max = 0;
4336 clock_nsec_t c_seg_filled_contention_nsec_max = 0;
4337
4338 static void
4339 c_current_seg_filled(c_segment_t c_seg, c_segment_t *current_chead)
4340 {
4341 uint32_t unused_bytes;
4342 uint32_t offset_to_depopulate;
4343 int new_state = C_ON_AGE_Q;
4344 clock_sec_t sec;
4345 clock_nsec_t nsec;
4346 bool head_insert = false, wakeup_swapout_thread = false;
4347
4348 unused_bytes = trunc_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset - c_seg->c_nextoffset));
4349
4350 if (unused_bytes) {
4351 /* if this is a platform that needs an extra page at the end of the segment when running compression,
4352  * then now is the time to depopulate that extra page. It still takes virtual space but doesn't
4353  * actually waste memory */
4354 offset_to_depopulate = C_SEG_BYTES_TO_OFFSET(round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_nextoffset)));
4355
4356 /* release the extra physical page(s) at the end of the segment */
4357 lck_mtx_unlock_always(&c_seg->c_lock);
4358
4359 kernel_memory_depopulate(
4360 (vm_offset_t) &c_seg->c_store.c_buffer[offset_to_depopulate],
4361 unused_bytes,
4362 KMA_COMPRESSOR,
4363 VM_KERN_MEMORY_COMPRESSOR);
4364
4365 lck_mtx_lock_spin_always(&c_seg->c_lock);
4366
4367 c_seg->c_populated_offset = offset_to_depopulate;
4368 }
4369 assert(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset) <= c_seg_bufsize);
4370
4371 #if DEVELOPMENT || DEBUG
4372 {
4373 boolean_t c_seg_was_busy = FALSE;
4374
4375 if (!c_seg->c_busy) {
4376 C_SEG_BUSY(c_seg);
4377 } else {
4378 c_seg_was_busy = TRUE;
4379 }
4380
4381 lck_mtx_unlock_always(&c_seg->c_lock);
4382
4383 C_SEG_WRITE_PROTECT(c_seg);
4384
4385 lck_mtx_lock_spin_always(&c_seg->c_lock);
4386
4387 if (c_seg_was_busy == FALSE) {
4388 C_SEG_WAKEUP_DONE(c_seg);
4389 }
4390 }
4391 #endif
4392
4393 #if CONFIG_FREEZE
4394 if (current_chead == (c_segment_t*) &(freezer_context_global.freezer_ctx_chead) &&
4395 VM_CONFIG_SWAP_IS_PRESENT &&
4396 VM_CONFIG_FREEZER_SWAP_IS_ACTIVE
4397 #if DEVELOPMENT || DEBUG
4398 && !memorystatus_freeze_to_memory
4399 #endif /* DEVELOPMENT || DEBUG */
4400 ) {
4401 new_state = C_ON_SWAPOUT_Q;
4402 wakeup_swapout_thread = true;
4403 }
4404 #endif /* CONFIG_FREEZE */
4405
4406 if (vm_darkwake_mode == TRUE) {
4407 new_state = C_ON_SWAPOUT_Q;
4408 head_insert = true;
4409 wakeup_swapout_thread = true;
4410 } else {
4411 c_segment_t *donate_queue_head;
4412 for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
4413 #if XNU_TARGET_OS_OSX /* tag:DONATE */
4414 donate_queue_head = (c_segment_t*) &(pgo_iothread_internal_state[i].current_early_swapout_chead);
4415 #else /* XNU_TARGET_OS_OSX */
4416 donate_queue_head = (c_segment_t*) &(pgo_iothread_internal_state[i].current_late_swapout_chead);
4417 #endif /* XNU_TARGET_OS_OSX */
4418 if (current_chead == donate_queue_head) {
4419 /* This is the place where the "donating" task actually does the so-called donation.
4420  * Instead of continuing to reside in memory in the compressor, the segment goes directly
4421  * to the swap-out queue rather than the AGE_Q */
4422 assert(c_seg->c_has_donated_pages);
4423 new_state = C_ON_SWAPOUT_Q;
4424 c_seg_total_donated_bytes += c_seg->c_bytes_used;
4425 break;
4426 }
4427 }
4428 }
4429
4430 clock_get_system_nanotime(&sec, &nsec);
4431 c_seg->c_creation_ts = (uint32_t)sec;
4432
4433 if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
4434 clock_sec_t sec2;
4435 clock_nsec_t nsec2;
4436
4437 lck_mtx_lock_spin_always(c_list_lock);
4438 clock_get_system_nanotime(&sec2, &nsec2);
4439 TIME_SUB(sec2, sec, nsec2, nsec, NSEC_PER_SEC);
4440 /* keep track of how much time we've waited for c_list_lock */
4441 if (sec2 > c_seg_filled_contention_sec_max) {
4442 c_seg_filled_contention_sec_max = sec2;
4443 c_seg_filled_contention_nsec_max = nsec2;
4444 } else if (sec2 == c_seg_filled_contention_sec_max && nsec2 > c_seg_filled_contention_nsec_max) {
4445 c_seg_filled_contention_nsec_max = nsec2;
4446 }
4447 c_seg_filled_contention++;
4448 } else {
4449 c_seg_filled_no_contention++;
4450 }
4451
4452 #if CONFIG_FREEZE
4453 if (current_chead == (c_segment_t*) &(freezer_context_global.freezer_ctx_chead)) {
4454 if (freezer_context_global.freezer_ctx_task->donates_own_pages) {
4455 assert(!c_seg->c_has_donated_pages);
4456 c_seg->c_has_donated_pages = 1;
4457 os_atomic_add(&c_segment_pages_compressed_incore_late_swapout, c_seg->c_slots_used, relaxed);
4458 }
4459 c_seg->c_has_freezer_pages = 1;
4460 }
4461 #endif /* CONFIG_FREEZE */
4462
4463 c_seg->c_generation_id = c_generation_id++;
4464 c_seg_switch_state(c_seg, new_state, head_insert);
4465
4466 #if CONFIG_FREEZE
4467 /*
4468 * Donated segments count as frozen to swap if we go through the freezer.
4469 * TODO: What we need is a new ledger and cseg state that can describe
4470 * a frozen cseg from a donated task so we can accurately decrement it on
4471 * swapins.
4472 */
4473 if (current_chead == (c_segment_t*) &(freezer_context_global.freezer_ctx_chead) && (c_seg->c_state == C_ON_SWAPOUT_Q)) {
4474 /*
4475 * darkwake and freezer can't co-exist together
4476 * We'll need to fix this accounting as a start.
4477 * And early donation c_segs are separate from frozen c_segs.
4478 */
4479 assert(vm_darkwake_mode == FALSE);
4480 c_seg_update_task_owner(c_seg, freezer_context_global.freezer_ctx_task);
4481 freezer_context_global.freezer_ctx_swapped_bytes += c_seg->c_bytes_used;
4482 }
4483 #endif /* CONFIG_FREEZE */
4484
4485 if (c_seg->c_state == C_ON_AGE_Q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) {
4486 /* this is possible if we decompressed a page from the segment before it finished filling */
4487 #if CONFIG_FREEZE
4488 assert(c_seg->c_task_owner == NULL);
4489 #endif /* CONFIG_FREEZE */
4490 c_seg_need_delayed_compaction(c_seg, TRUE);
4491 }
4492
4493 lck_mtx_unlock_always(c_list_lock);
4494
4495 if (wakeup_swapout_thread) {
4496 /*
4497 * Darkwake and Freeze configs always
4498 * wake up the swapout thread because
4499 * the compactor thread that normally handles
4500 * it may not be running as much in these
4501 * configs.
4502 */
4503 thread_wakeup((event_t)&vm_swapout_thread);
4504 }
4505
4506 *current_chead = NULL;
4507 }
4508
4509 /*
4510 * returns with c_seg locked
4511 */
4512 void
4513 c_seg_swapin_requeue(c_segment_t c_seg, boolean_t has_data, boolean_t minor_compact_ok, boolean_t age_on_swapin_q)
4514 {
4515 clock_sec_t sec;
4516 clock_nsec_t nsec;
4517
4518 clock_get_system_nanotime(&sec, &nsec);
4519
4520 lck_mtx_lock_spin_always(c_list_lock);
4521 lck_mtx_lock_spin_always(&c_seg->c_lock);
4522
4523 assert(c_seg->c_busy_swapping);
4524 assert(c_seg->c_busy);
4525
4526 c_seg->c_busy_swapping = 0;
4527
4528 if (c_seg->c_overage_swap == TRUE) {
4529 c_overage_swapped_count--;
4530 c_seg->c_overage_swap = FALSE;
4531 }
4532 if (has_data == TRUE) {
4533 if (age_on_swapin_q == TRUE || c_seg->c_has_donated_pages) {
4534 #if CONFIG_FREEZE
4535 /*
4536 * If a segment has both identities, frozen and donated bits set, the donated
4537 * bit wins on the swapin path. This is because the segment is being swapped back
4538 * in and so is in demand and should be given more time to spend in memory before
4539 * being swapped back out under pressure.
4540 */
4541 if (c_seg->c_has_donated_pages) {
4542 c_seg->c_has_freezer_pages = 0;
4543 }
4544 #endif /* CONFIG_FREEZE */
4545 c_seg_switch_state(c_seg, C_ON_SWAPPEDIN_Q, FALSE);
4546 } else {
4547 c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
4548 }
4549
4550 if (minor_compact_ok == TRUE && !c_seg->c_on_minorcompact_q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) {
4551 c_seg_need_delayed_compaction(c_seg, TRUE);
4552 }
4553 } else {
4554 c_seg->c_store.c_buffer = (int32_t*) NULL;
4555 c_seg->c_populated_offset = C_SEG_BYTES_TO_OFFSET(0);
4556
4557 c_seg_switch_state(c_seg, C_ON_BAD_Q, FALSE);
4558 }
4559 c_seg->c_swappedin_ts = (uint32_t)sec;
4560 c_seg->c_swappedin = true;
4561 #if TRACK_C_SEGMENT_UTILIZATION
4562 c_seg->c_decompressions_since_swapin = 0;
4563 #endif /* TRACK_C_SEGMENT_UTILIZATION */
4564
4565 lck_mtx_unlock_always(c_list_lock);
4566 }
4567
4568
4569
4570 /*
4571 * c_seg has to be locked and is returned locked if the c_seg isn't freed
4572 * PAGE_REPLACEMENT_DISALLOWED has to be TRUE on entry and is returned TRUE
4573 * c_seg_swapin returns 1 if the c_seg was freed, 0 otherwise
4574 */
4575
4576 int
4577 c_seg_swapin(c_segment_t c_seg, boolean_t force_minor_compaction, boolean_t age_on_swapin_q)
4578 {
4579 vm_offset_t addr = 0;
4580 uint32_t io_size = 0;
4581 uint64_t f_offset;
4582 thread_pri_floor_t token;
4583
4584 assert(C_SEG_IS_ONDISK(c_seg));
4585
4586 #if !CHECKSUM_THE_SWAP
4587 c_seg_trim_tail(c_seg);
4588 #endif
4589 io_size = round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset));
4590 f_offset = c_seg->c_store.c_swap_handle;
4591
4592 C_SEG_BUSY(c_seg);
4593 c_seg->c_busy_swapping = 1;
4594
4595 /*
4596 * This thread is likely going to block for I/O.
4597 * Make sure it is ready to run when the I/O completes because
4598 * it needs to clear the busy bit on the c_seg so that other
4599 * waiting threads can make progress too.
4600 */
4601 token = thread_priority_floor_start();
4602 lck_mtx_unlock_always(&c_seg->c_lock);
4603
4604 PAGE_REPLACEMENT_DISALLOWED(FALSE);
4605
4606 addr = (vm_offset_t)C_SEG_BUFFER_ADDRESS(c_seg->c_mysegno);
4607 c_seg->c_store.c_buffer = (int32_t*) addr;
4608
4609 kernel_memory_populate(addr, io_size, KMA_NOFAIL | KMA_COMPRESSOR,
4610 VM_KERN_MEMORY_COMPRESSOR);
4611
4612 if (vm_swap_get(c_seg, f_offset, io_size) != KERN_SUCCESS) {
4613 PAGE_REPLACEMENT_DISALLOWED(TRUE);
4614
4615 kernel_memory_depopulate(addr, io_size, KMA_COMPRESSOR,
4616 VM_KERN_MEMORY_COMPRESSOR);
4617
4618 c_seg_swapin_requeue(c_seg, FALSE, TRUE, age_on_swapin_q);
4619 } else {
4620 #if ENCRYPTED_SWAP
4621 vm_swap_decrypt(c_seg, true);
4622 #endif /* ENCRYPTED_SWAP */
4623
4624 #if CHECKSUM_THE_SWAP
4625 if (c_seg->cseg_swap_size != io_size) {
4626 panic("swapin size doesn't match swapout size");
4627 }
4628
4629 if (c_seg->cseg_hash != vmc_hash((char*) c_seg->c_store.c_buffer, (int)io_size)) {
4630 panic("c_seg_swapin - Swap hash mismatch");
4631 }
4632 #endif /* CHECKSUM_THE_SWAP */
4633
4634 PAGE_REPLACEMENT_DISALLOWED(TRUE);
4635
4636 c_seg_swapin_requeue(c_seg, TRUE, force_minor_compaction == TRUE ? FALSE : TRUE, age_on_swapin_q);
4637
4638 #if CONFIG_FREEZE
4639 /*
4640 * c_seg_swapin_requeue() returns with the c_seg lock held.
4641 */
4642 if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
4643 assert(c_seg->c_busy);
4644
4645 lck_mtx_unlock_always(&c_seg->c_lock);
4646 lck_mtx_lock_spin_always(c_list_lock);
4647 lck_mtx_lock_spin_always(&c_seg->c_lock);
4648 }
4649
4650 if (c_seg->c_task_owner) {
4651 c_seg_update_task_owner(c_seg, NULL);
4652 }
4653
4654 lck_mtx_unlock_always(c_list_lock);
4655
4656 os_atomic_add(&c_segment_pages_compressed_incore, c_seg->c_slots_used, relaxed);
4657 if (c_seg->c_has_donated_pages) {
4658 os_atomic_add(&c_segment_pages_compressed_incore_late_swapout, c_seg->c_slots_used, relaxed);
4659 }
4660 #endif /* CONFIG_FREEZE */
4661
4662 __assert_only unsigned int prev_swapped_count = os_atomic_sub_orig(
4663 &vm_page_swapped_count, c_seg->c_slots_used, relaxed);
4664 assert3u(prev_swapped_count, >=, c_seg->c_slots_used);
4665 os_atomic_add(&compressor_bytes_used, c_seg->c_bytes_used, relaxed);
4666
4667 if (force_minor_compaction == TRUE) {
4668 if (c_seg_minor_compaction_and_unlock(c_seg, FALSE)) {
4669 /*
4670 * c_seg was completely empty so it was freed,
4671 * so be careful not to reference it again
4672 *
4673 * Drop the boost so that the thread priority
4674 * is returned back to where it is supposed to be.
4675 */
4676 thread_priority_floor_end(&token);
4677 return 1;
4678 }
4679
4680 lck_mtx_lock_spin_always(&c_seg->c_lock);
4681 }
4682 }
4683 C_SEG_WAKEUP_DONE(c_seg);
4684
4685 /*
4686 * Drop the boost so that the thread priority
4687 * is returned back to where it is supposed to be.
4688 */
4689 thread_priority_floor_end(&token);
4690
4691 return 0;
4692 }
4693
4694 /*
4695 * TODO: refactor the CAS loops in c_segment_sv_hash_drop_ref() and c_segment_sv_hash_insert()
4696 * to os_atomic_rmw_loop() [rdar://139546215]
4697 */
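/*
 * A minimal sketch of that refactor for the drop-ref case, assuming
 * os_atomic_rmw_loop() from <os/atomic_private.h> with its usual
 * (target, old, new, ordering, body) shape -- illustrative only, not what
 * currently ships:
 *
 *	struct c_sv_hash_entry o_sv_he, n_sv_he;
 *	os_atomic_rmw_loop(&c_segment_sv_hash_table[hash_indx].he_record,
 *	    o_sv_he.he_record, n_sv_he.he_record, relaxed, {
 *		n_sv_he.he_ref = o_sv_he.he_ref - 1;
 *		n_sv_he.he_data = o_sv_he.he_data;
 *	});
 *	if (n_sv_he.he_ref == 0) {
 *		os_atomic_dec(&c_segment_svp_in_hash, relaxed);
 *	}
 */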
4698
4699 static void
4700 c_segment_sv_hash_drop_ref(int hash_indx)
4701 {
4702 struct c_sv_hash_entry o_sv_he, n_sv_he;
4703
4704 while (1) {
4705 o_sv_he.he_record = c_segment_sv_hash_table[hash_indx].he_record;
4706
4707 n_sv_he.he_ref = o_sv_he.he_ref - 1;
4708 n_sv_he.he_data = o_sv_he.he_data;
4709
4710 if (OSCompareAndSwap64((UInt64)o_sv_he.he_record, (UInt64)n_sv_he.he_record, (UInt64 *) &c_segment_sv_hash_table[hash_indx].he_record) == TRUE) {
4711 if (n_sv_he.he_ref == 0) {
4712 os_atomic_dec(&c_segment_svp_in_hash, relaxed);
4713 }
4714 break;
4715 }
4716 }
4717 }
4718
4719
4720 static int
4721 c_segment_sv_hash_insert(uint32_t data)
4722 {
4723 int hash_sindx;
4724 int misses;
4725 struct c_sv_hash_entry o_sv_he, n_sv_he;
4726 boolean_t got_ref = FALSE;
4727
4728 if (data == 0) {
4729 os_atomic_inc(&c_segment_svp_zero_compressions, relaxed);
4730 } else {
4731 os_atomic_inc(&c_segment_svp_nonzero_compressions, relaxed);
4732 }
4733
4734 hash_sindx = data & C_SV_HASH_MASK;
4735
4736 for (misses = 0; misses < C_SV_HASH_MAX_MISS; misses++) {
4737 o_sv_he.he_record = c_segment_sv_hash_table[hash_sindx].he_record;
4738
4739 while (o_sv_he.he_data == data || o_sv_he.he_ref == 0) {
4740 n_sv_he.he_ref = o_sv_he.he_ref + 1;
4741 n_sv_he.he_data = data;
4742
4743 if (OSCompareAndSwap64((UInt64)o_sv_he.he_record, (UInt64)n_sv_he.he_record, (UInt64 *) &c_segment_sv_hash_table[hash_sindx].he_record) == TRUE) {
4744 if (n_sv_he.he_ref == 1) {
4745 os_atomic_inc(&c_segment_svp_in_hash, relaxed);
4746 }
4747 got_ref = TRUE;
4748 break;
4749 }
4750 o_sv_he.he_record = c_segment_sv_hash_table[hash_sindx].he_record;
4751 }
4752 if (got_ref == TRUE) {
4753 break;
4754 }
4755 hash_sindx++;
4756
4757 if (hash_sindx == C_SV_HASH_SIZE) {
4758 hash_sindx = 0;
4759 }
4760 }
4761 if (got_ref == FALSE) {
4762 return -1;
4763 }
4764
4765 return hash_sindx;
4766 }
4767
4768
4769 #if RECORD_THE_COMPRESSED_DATA
4770
4771 static void
4772 c_compressed_record_data(char *src, int c_size)
4773 {
4774 if ((c_compressed_record_cptr + c_size + 4) >= c_compressed_record_ebuf) {
4775 panic("c_compressed_record_cptr >= c_compressed_record_ebuf");
4776 }
4777
4778 *(int *)((void *)c_compressed_record_cptr) = c_size;
4779
4780 c_compressed_record_cptr += 4;
4781
4782 memcpy(c_compressed_record_cptr, src, c_size);
4783 c_compressed_record_cptr += c_size;
4784 }
4785 #endif
4786
4787
4788 /**
4789 * Do the actual compression of the given page
4790 * @param src [IN] address in the physical aperture of the page to compress.
4791 * @param slot_ptr [OUT] fill the slot-mapping of the c_seg+slot where the page ends up being stored
4792 * @param current_chead [IN-OUT] current filling c_seg. pointer comes from the current compression thread state
4793  * On the very first call this is going to point to NULL and this function will fill that pointer with a new
4794  * filling c_seg. If the current filling c_seg doesn't have enough space, it will be replaced in this location
4795  * with a new filling c_seg
4796 * @param scratch_buf [IN] pointer from the current thread state, used by the compression codec
4797 * @return KERN_RESOURCE_SHORTAGE if the compressor has been exhausted
4798 */
4799 static kern_return_t
4800 c_compress_page(
4801 char *src,
4802 c_slot_mapping_t slot_ptr,
4803 c_segment_t *current_chead,
4804 char *scratch_buf,
4805 __unused vm_compressor_options_t flags)
4806 {
4807 int c_size = -1;
4808 int c_rounded_size = 0;
4809 int max_csize;
4810 bool nearing_limits;
4811 c_slot_t cs;
4812 c_segment_t c_seg;
4813
4814 KERNEL_DEBUG(0xe0400000 | DBG_FUNC_START, *current_chead, 0, 0, 0, 0);
4815 retry: /* may need to retry if the currently filling c_seg will not have enough space */
4816 c_seg = c_seg_allocate(current_chead, &nearing_limits);
4817 if (c_seg == NULL) {
4818 if (nearing_limits) {
4819 memorystatus_respond_to_compressor_exhaustion();
4820 }
4821 return KERN_RESOURCE_SHORTAGE;
4822 }
4823
4824 /*
4825 * c_seg_allocate() returns with c_seg lock held
4826 * and PAGE_REPLACEMENT_DISALLOWED(TRUE)...
4827 * c_nextslot has been allocated and
4828 * c_store.c_buffer populated
4829 */
4830 assert(c_seg->c_state == C_IS_FILLING);
4831
4832 cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_seg->c_nextslot);
4833
4834 C_SLOT_ASSERT_PACKABLE(slot_ptr);
4835 cs->c_packed_ptr = C_SLOT_PACK_PTR(slot_ptr);
4836
4837 cs->c_offset = c_seg->c_nextoffset;
4838
4839 unsigned int avail_space = c_seg_bufsize - C_SEG_OFFSET_TO_BYTES((int32_t)cs->c_offset);
4840
4841
4842 max_csize = avail_space;
4843 if (max_csize > PAGE_SIZE) {
4844 max_csize = PAGE_SIZE;
4845 }
4846
4847 #if CHECKSUM_THE_DATA
4848 cs->c_hash_data = vmc_hash(src, PAGE_SIZE);
4849 #endif
4850 boolean_t incomp_copy = FALSE; /* codec indicates it already copied the incompressible page */
4851 /* The SW codec case needs 4 bytes for its header and these are not accounted for in the bytes_budget argument.
4852 * Also, the SV-not-in-hash case needs 4 bytes. */
4853 int max_csize_adj = (max_csize - 4);
4854 if (__improbable(max_csize_adj < 0)) {
4855 max_csize_adj = 0;
4856 }
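	/*
	 * Worked example (typical 4K-page values): with at least a full page of
	 * buffer space left, max_csize == 4096 and max_csize_adj == 4092, so the
	 * codec gets a 4092-byte budget and the extra 4 bytes of header still fit
	 * in the space available for this slot.
	 */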
4857
4858 if (max_csize > 0 && max_csize_adj > 0) {
4859 if (vm_compressor_algorithm() != VM_COMPRESSOR_DEFAULT_CODEC) {
4860 #if defined(__arm64__)
4861 uint16_t ccodec = CINVALID;
4862 uint32_t inline_popcount;
4863 if (max_csize >= C_SEG_OFFSET_ALIGNMENT_BOUNDARY) {
4864 vm_memtag_disable_checking();
4865 c_size = metacompressor((const uint8_t *) src,
4866 (uint8_t *) &c_seg->c_store.c_buffer[cs->c_offset],
4867 max_csize_adj, &ccodec,
4868 scratch_buf, &incomp_copy, &inline_popcount);
4869 vm_memtag_enable_checking();
4870 assert(inline_popcount == C_SLOT_NO_POPCOUNT);
4871
4872 #if C_SEG_OFFSET_ALIGNMENT_BOUNDARY > 4
4873 /* The HW codec case doesn't detect overflow on its own; instead it spills into the next page
4874  * and we need to detect that this happened */
4875 if (c_size > max_csize_adj) {
4876 c_size = -1;
4877 }
4878 #endif
4879 } else {
4880 c_size = -1;
4881 }
4882 assert(ccodec == CCWK || ccodec == CCLZ4);
4883 cs->c_codec = ccodec;
4884 #endif
4885 } else {
4886 #if defined(__arm64__)
4887 vm_memtag_disable_checking();
4888 cs->c_codec = CCWK;
4889 __unreachable_ok_push
4890 if (PAGE_SIZE == 4096) {
4891 c_size = WKdm_compress_4k((WK_word *)(uintptr_t)src, (WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
4892 (WK_word *)(uintptr_t)scratch_buf, max_csize_adj);
4893 } else {
4894 c_size = WKdm_compress_16k((WK_word *)(uintptr_t)src, (WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
4895 (WK_word *)(uintptr_t)scratch_buf, max_csize_adj);
4896 }
4897 __unreachable_ok_pop
4898 vm_memtag_enable_checking();
4899 #else
4900 vm_memtag_disable_checking();
4901 c_size = WKdm_compress_new((const WK_word *)(uintptr_t)src, (WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
4902 (WK_word *)(uintptr_t)scratch_buf, max_csize_adj);
4903 vm_memtag_enable_checking();
4904 #endif
4905 }
4906 } else { /* max_csize == 0 or max_csize_adj == 0 */
4907 c_size = -1;
4908 }
4909 /* c_size is the size written by the codec, 0 if the page is a uniform 32-bit value, or -1 if there was
4910  * not enough space or the page was incompressible */
4911 assertf(((c_size <= max_csize_adj) && (c_size >= -1)),
4912 "c_size invalid (%d, %d), cur compressions: %d", c_size, max_csize_adj, c_segment_pages_compressed);
4913
4914 if (c_size == -1) {
4915 if (max_csize < PAGE_SIZE) {
4916 c_current_seg_filled(c_seg, current_chead);
4917 assert(*current_chead == NULL);
4918
4919 lck_mtx_unlock_always(&c_seg->c_lock);
4920 /* TODO: it may be worth requiring codecs to distinguish
4921 * between incompressible inputs and failures due to budget exhaustion.
4922 * right now this assumes that if the space we had was a full PAGE_SIZE, the codec failed due to incompressible input */
4923
4924 PAGE_REPLACEMENT_DISALLOWED(FALSE);
4925 goto retry; /* previous c_seg didn't have enough space, we finalized it and can try again with a fresh c_seg */
4926 }
4927 c_size = PAGE_SIZE; /* tag:WK-INCOMPRESSIBLE */
4928
4929 if (incomp_copy == FALSE) { /* codec did not copy the incompressible input */
4930 vm_memtag_disable_checking();
4931 memcpy(&c_seg->c_store.c_buffer[cs->c_offset], src, c_size);
4932 vm_memtag_enable_checking();
4933 }
4934
4935 os_atomic_inc(&c_segment_noncompressible_pages, relaxed);
4936 } else if (c_size == 0) {
4937 {
4938 /*
4939 * Special case - this is a page completely full of a single 32 bit value.
4940 * We store some values directly in the c_slot_mapping, if not there, the
4941 * 4 byte value goes in the compressor segment.
4942 */
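			/*
			 * Example (illustrative value): a page consisting entirely of the
			 * 32-bit pattern 0xDEADBEEF compresses to c_size == 0; the value is
			 * refcounted in c_segment_sv_hash_table starting at index
			 * (0xDEADBEEF & C_SV_HASH_MASK) and the slot records
			 * s_cseg == C_SV_CSEG_ID, so no bytes land in any segment buffer.
			 */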
4943 int hash_index = c_segment_sv_hash_insert(*(uint32_t *) (uintptr_t) src);
4944
4945 if (hash_index != -1) {
4946 slot_ptr->s_cindx = hash_index;
4947 slot_ptr->s_cseg = C_SV_CSEG_ID;
4948 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
4949 slot_ptr->s_uncompressed = 0;
4950 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
4951
4952 os_atomic_inc(&c_segment_svp_hash_succeeded, relaxed);
4953 #if RECORD_THE_COMPRESSED_DATA
4954 c_compressed_record_data(src, 4);
4955 #endif
4956 /* we didn't write anything to c_buffer and didn't end up using a slot in the c_seg at all, so skip all
4957  * the bookkeeping we would have done had we used one */
4958 goto sv_compression;
4959 }
4960 }
4961 os_atomic_inc(&c_segment_svp_hash_failed, relaxed);
4962
4963 c_size = 4;
4964 vm_memtag_disable_checking();
4965 memcpy(&c_seg->c_store.c_buffer[cs->c_offset], src, c_size);
4966 vm_memtag_enable_checking();
4967 }
4968
4969 #if RECORD_THE_COMPRESSED_DATA
4970 c_compressed_record_data((char *)&c_seg->c_store.c_buffer[cs->c_offset], c_size);
4971 #endif
4972 #if CHECKSUM_THE_COMPRESSED_DATA
4973 cs->c_hash_compressed_data = vmc_hash((char *)&c_seg->c_store.c_buffer[cs->c_offset], c_size);
4974 #endif
4975 #if POPCOUNT_THE_COMPRESSED_DATA
4976 cs->c_pop_cdata = vmc_pop((uintptr_t) &c_seg->c_store.c_buffer[cs->c_offset], c_size);
4977 #endif
4978
4979 PACK_C_SIZE(cs, c_size);
4980
4981 c_rounded_size = C_SEG_ROUND_TO_ALIGNMENT(c_size);
4982
4983 c_seg->c_bytes_used += c_rounded_size;
4984 c_seg->c_nextoffset += C_SEG_BYTES_TO_OFFSET(c_rounded_size);
4985 c_seg->c_slots_used++;
4986
4987 #if CONFIG_FREEZE
4988 /* TODO: should c_segment_pages_compressed be up here too? See 88598046 for details */
4989 os_atomic_inc(&c_segment_pages_compressed_incore, relaxed);
4990 if (c_seg->c_has_donated_pages) {
4991 os_atomic_inc(&c_segment_pages_compressed_incore_late_swapout, relaxed);
4992 }
4993 #endif /* CONFIG_FREEZE */
4994
4995 slot_ptr->s_cindx = c_seg->c_nextslot++;
4996 /* <csegno=0,indx=0> would mean "empty slot", so use csegno+1, see other usages of s_cseg where it's decremented */
4997 slot_ptr->s_cseg = c_seg->c_mysegno + 1;
4998
4999 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
5000 slot_ptr->s_uncompressed = 0;
5001 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
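/*
 * Encode/decode sketch for the segment number stored in the slot mapping
 * (the matching decode is in c_decompress_page below):
 *
 *   encode:  slot_ptr->s_cseg = c_seg->c_mysegno + 1;   // 0 is reserved to mean "empty slot"
 *   decode:  c_segno          = slot_ptr->s_cseg - 1;
 */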
5002
5003 sv_compression:
5004 /* can we say this c_seg is full? */
5005 if (c_seg->c_nextoffset >= c_seg_off_limit || c_seg->c_nextslot >= C_SLOT_MAX_INDEX) {
5006 /* condition 1: segment buffer is almost full, don't bother trying to fill it further.
5007 * condition 2: we can't have any more slots in this c_segment even if we had buffer space */
5008 c_current_seg_filled(c_seg, current_chead);
5009 assert(*current_chead == NULL);
5010 }
5011
5012 lck_mtx_unlock_always(&c_seg->c_lock);
5013
5014 PAGE_REPLACEMENT_DISALLOWED(FALSE);
5015
5016 #if RECORD_THE_COMPRESSED_DATA
5017 if ((c_compressed_record_cptr - c_compressed_record_sbuf) >= c_seg_allocsize) {
5018 c_compressed_record_write(c_compressed_record_sbuf, (int)(c_compressed_record_cptr - c_compressed_record_sbuf));
5019 c_compressed_record_cptr = c_compressed_record_sbuf;
5020 }
5021 #endif
5022 if (c_size) {
5023 os_atomic_add(&c_segment_compressed_bytes, c_size, relaxed);
5024 os_atomic_add(&compressor_bytes_used, c_rounded_size, relaxed);
5025 }
5026 os_atomic_add(&c_segment_input_bytes, PAGE_SIZE, relaxed);
5027
5028 os_atomic_inc(&c_segment_pages_compressed, relaxed);
5029 #if DEVELOPMENT || DEBUG
5030 if (!compressor_running_perf_test) {
5031 /*
5032 * The perf_compressor benchmark should not be able to trigger
5033 * compressor thrashing jetsams.
5034 */
5035 os_atomic_inc(&sample_period_compression_count, relaxed);
5036 }
5037 #else /* DEVELOPMENT || DEBUG */
5038 os_atomic_inc(&sample_period_compression_count, relaxed);
5039 #endif /* DEVELOPMENT || DEBUG */
5040
5041 if (nearing_limits) {
5042 memorystatus_respond_to_compressor_exhaustion();
5043 }
5044
5045 KERNEL_DEBUG(0xe0400000 | DBG_FUNC_END, *current_chead, c_size, c_segment_input_bytes, c_segment_compressed_bytes, 0);
5046
5047 return KERN_SUCCESS;
5048 }
5049
5050 static inline void
5051 sv_decompress(int32_t *ddst, int32_t pattern)
5052 {
5053 // assert(__builtin_constant_p(PAGE_SIZE) != 0);
5054 #if defined(__x86_64__)
5055 memset_word(ddst, pattern, PAGE_SIZE / sizeof(int32_t));
5056 #elif defined(__arm64__)
5057 assert((PAGE_SIZE % 128) == 0);
5058 if (pattern == 0) {
5059 fill32_dczva((addr64_t)ddst, PAGE_SIZE);
5060 } else {
5061 fill32_nt((addr64_t)ddst, PAGE_SIZE, pattern);
5062 }
5063 #else
5064 size_t i;
5065
5066 /* Unroll the pattern fill loop 4x to encourage the
5067 * compiler to emit NEON stores, cf.
5068 * <rdar://problem/25839866> Loop autovectorization
5069 * anomalies.
5070 */
5071 /* We use separate loops for each PAGE_SIZE
5072 * to allow the autovectorizer to engage, as PAGE_SIZE
5073 * may not be a constant.
5074 */
5075
5076 __unreachable_ok_push
5077 if (PAGE_SIZE == 4096) {
5078 for (i = 0; i < (4096U / sizeof(int32_t)); i += 4) {
5079 *ddst++ = pattern;
5080 *ddst++ = pattern;
5081 *ddst++ = pattern;
5082 *ddst++ = pattern;
5083 }
5084 } else {
5085 assert(PAGE_SIZE == 16384);
5086 for (i = 0; i < (int)(16384U / sizeof(int32_t)); i += 4) {
5087 *ddst++ = pattern;
5088 *ddst++ = pattern;
5089 *ddst++ = pattern;
5090 *ddst++ = pattern;
5091 }
5092 }
5093 __unreachable_ok_pop
5094 #endif
5095 }
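/*
 * Minimal usage sketch for sv_decompress(), assuming 'dst' is a PAGE_SIZE-sized,
 * writable kernel mapping (hypothetical caller, not part of the code in this file):
 *
 *   int32_t *page = (int32_t *)(uintptr_t)dst;
 *   sv_decompress(page, 0x5A5A5A5A);      // fill the whole page with the pattern
 *   assert(page[0] == 0x5A5A5A5A);
 *   assert(page[(PAGE_SIZE / sizeof(int32_t)) - 1] == 0x5A5A5A5A);
 */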
5096
5097 static vm_decompress_result_t
5098 c_decompress_page(
5099 char *dst,
5100 volatile c_slot_mapping_t slot_ptr, /* why volatile? perhaps due to changes across hibernation */
5101 vm_compressor_options_t flags,
5102 int *zeroslot)
5103 {
5104 c_slot_t cs;
5105 c_segment_t c_seg;
5106 uint32_t c_segno;
5107 uint16_t c_indx;
5108 int c_rounded_size;
5109 uint32_t c_size;
5110 vm_decompress_result_t retval = 0;
5111 boolean_t need_unlock = TRUE;
5112 boolean_t consider_defragmenting = FALSE;
5113 boolean_t kdp_mode = FALSE;
5114
5115 if (__improbable(flags & C_KDP)) {
5116 if (not_in_kdp) {
5117 panic("C_KDP passed to decompress page from outside of debugger context");
5118 }
5119
5120 assert((flags & C_KEEP) == C_KEEP);
5121 assert((flags & C_DONT_BLOCK) == C_DONT_BLOCK);
5122
5123 if ((flags & (C_DONT_BLOCK | C_KEEP)) != (C_DONT_BLOCK | C_KEEP)) {
5124 return DECOMPRESS_NEED_BLOCK;
5125 }
5126
5127 kdp_mode = TRUE;
5128 *zeroslot = 0;
5129 }
5130
5131 ReTry:
5132 if (__probable(!kdp_mode)) {
5133 PAGE_REPLACEMENT_DISALLOWED(TRUE);
5134 } else {
5135 if (kdp_lck_rw_lock_is_acquired_exclusive(&c_master_lock)) {
5136 return DECOMPRESS_NEED_BLOCK;
5137 }
5138 }
5139
5140 #if HIBERNATION
5141 /*
5142 * if hibernation is enabled, it indicates (via a call
5143  * to 'vm_decompressor_lock') that no further
5144 * decompressions are allowed once it reaches
5145 * the point of flushing all of the currently dirty
5146 * anonymous memory through the compressor and out
5147 * to disk... in this state we allow freeing of compressed
5148 * pages and must honor the C_DONT_BLOCK case
5149 */
5150 if (__improbable(dst && decompressions_blocked == TRUE)) {
5151 if (flags & C_DONT_BLOCK) {
5152 if (__probable(!kdp_mode)) {
5153 PAGE_REPLACEMENT_DISALLOWED(FALSE);
5154 }
5155
5156 *zeroslot = 0;
5157 return -2;
5158 }
5159 /*
5160 * it's safe to atomically assert and block behind the
5161 * lock held in shared mode because "decompressions_blocked" is
5162  * only set and cleared, and the thread_wakeup issued, while the lock
5163  * is held exclusively
5164 */
5165 assert_wait((event_t)&decompressions_blocked, THREAD_UNINT);
5166
5167 PAGE_REPLACEMENT_DISALLOWED(FALSE);
5168
5169 thread_block(THREAD_CONTINUE_NULL);
5170
5171 goto ReTry;
5172 }
5173 #endif
5174 /* s_cseg is actually "segno+1" */
5175 c_segno = slot_ptr->s_cseg - 1;
5176
5177 if (__improbable(c_segno >= c_segments_available)) {
5178 panic("c_decompress_page: c_segno %d >= c_segments_available %d, slot_ptr(%p), slot_data(%x)",
5179 c_segno, c_segments_available, slot_ptr, *(int *)((void *)slot_ptr));
5180 }
5181
5182 if (__improbable(c_segments_get(c_segno)->c_segno < c_segments_available)) {
5183 panic("c_decompress_page: c_segno %d is free, slot_ptr(%p), slot_data(%x)",
5184 c_segno, slot_ptr, *(int *)((void *)slot_ptr));
5185 }
5186
5187 c_seg = c_segments_get(c_segno)->c_seg;
5188
5189 if (__probable(!kdp_mode)) {
5190 lck_mtx_lock_spin_always(&c_seg->c_lock);
5191 } else {
5192 if (kdp_lck_mtx_lock_spin_is_acquired(&c_seg->c_lock)) {
5193 return DECOMPRESS_NEED_BLOCK;
5194 }
5195 }
5196
5197 assert(c_seg->c_state != C_IS_EMPTY && c_seg->c_state != C_IS_FREE);
5198
5199 if (dst == NULL && c_seg->c_busy_swapping) {
5200 assert(c_seg->c_busy);
5201
5202 goto bypass_busy_check;
5203 }
5204 if (flags & C_DONT_BLOCK) {
5205 if (c_seg->c_busy || (C_SEG_IS_ONDISK(c_seg) && dst)) {
5206 *zeroslot = 0;
5207
5208 retval = DECOMPRESS_NEED_BLOCK;
5209 goto done;
5210 }
5211 }
5212 if (c_seg->c_busy) {
5213 PAGE_REPLACEMENT_DISALLOWED(FALSE);
5214
5215 c_seg_wait_on_busy(c_seg);
5216
5217 goto ReTry;
5218 }
5219 bypass_busy_check:
5220
5221 c_indx = slot_ptr->s_cindx;
5222
5223 if (__improbable(c_indx >= c_seg->c_nextslot)) {
5224 panic("c_decompress_page: c_indx %d >= c_nextslot %d, c_seg(%p), slot_ptr(%p), slot_data(%x)",
5225 c_indx, c_seg->c_nextslot, c_seg, slot_ptr, *(int *)((void *)slot_ptr));
5226 }
5227
5228 cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
5229
5230 c_size = UNPACK_C_SIZE(cs);
5231
5232
5233 if (__improbable(c_size == 0)) { /* sanity check it's not an empty slot */
5234 panic("c_decompress_page: c_size == 0, c_seg(%p), slot_ptr(%p), slot_data(%x)",
5235 c_seg, slot_ptr, *(int *)((void *)slot_ptr));
5236 }
5237
5238 c_rounded_size = C_SEG_ROUND_TO_ALIGNMENT(c_size + c_slot_extra_size(cs));
5239 /* c_rounded_size should not change after this point so that it remains consistent on all branches */
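/*
 * Rounding sketch (the alignment boundary is configuration dependent; 4 bytes is
 * assumed here purely for illustration): a c_size of 1234 with no extra slot bytes
 * would give c_rounded_size == 1236, and that rounded figure is what the
 * per-segment byte accounting below credits back when the slot is freed.
 */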
5240
5241 if (dst) { /* would be NULL if we don't want the page content, from free */
5242 uint32_t age_of_cseg;
5243 clock_sec_t cur_ts_sec;
5244 clock_nsec_t cur_ts_nsec;
5245
5246 if (C_SEG_IS_ONDISK(c_seg)) {
5247 #if CONFIG_FREEZE
5248 if (freezer_incore_cseg_acct) {
5249 if ((c_seg->c_slots_used + c_segment_pages_compressed_incore) >= c_segment_pages_compressed_nearing_limit) {
5250 PAGE_REPLACEMENT_DISALLOWED(FALSE);
5251 lck_mtx_unlock_always(&c_seg->c_lock);
5252
5253 memorystatus_kill_on_VM_compressor_space_shortage(FALSE /* async */);
5254
5255 goto ReTry;
5256 }
5257
5258 uint32_t incore_seg_count = c_segment_count - c_swappedout_count - c_swappedout_sparse_count;
5259 if ((incore_seg_count + 1) >= c_segments_nearing_limit) {
5260 PAGE_REPLACEMENT_DISALLOWED(FALSE);
5261 lck_mtx_unlock_always(&c_seg->c_lock);
5262
5263 memorystatus_kill_on_VM_compressor_space_shortage(FALSE /* async */);
5264
5265 goto ReTry;
5266 }
5267 }
5268 #endif /* CONFIG_FREEZE */
5269 assert(kdp_mode == FALSE);
5270 retval = c_seg_swapin(c_seg, FALSE, TRUE);
5271 assert(retval == 0);
5272
5273 retval = DECOMPRESS_SUCCESS_SWAPPEDIN;
5274 }
5275 if (c_seg->c_state == C_ON_BAD_Q) {
5276 assert(c_seg->c_store.c_buffer == NULL);
5277 *zeroslot = 0;
5278
5279 retval = DECOMPRESS_FAILED_BAD_Q;
5280 goto done;
5281 }
5282
5283 #if POPCOUNT_THE_COMPRESSED_DATA
5284 unsigned csvpop;
5285 uintptr_t csvaddr = (uintptr_t) &c_seg->c_store.c_buffer[cs->c_offset];
5286 if (cs->c_pop_cdata != (csvpop = vmc_pop(csvaddr, c_size))) {
5287 panic("Compressed data popcount doesn't match original, bit distance: %d %p (phys: %p) %p %p 0x%x 0x%x 0x%x 0x%x", (csvpop - cs->c_pop_cdata), (void *)csvaddr, (void *) kvtophys(csvaddr), c_seg, cs, cs->c_offset, c_size, csvpop, cs->c_pop_cdata);
5288 }
5289 #endif
5290
5291 #if CHECKSUM_THE_COMPRESSED_DATA
5292 unsigned csvhash;
5293 if (cs->c_hash_compressed_data != (csvhash = vmc_hash((char *)&c_seg->c_store.c_buffer[cs->c_offset], c_size))) {
5294 panic("Compressed data doesn't match original %p %p %u %u %u", c_seg, cs, c_size, cs->c_hash_compressed_data, csvhash);
5295 }
5296 #endif
5297 if (c_size == PAGE_SIZE) { /* tag:WK-INCOMPRESSIBLE */
5298 /* page wasn't compressible... just copy it out */
5299 vm_memtag_disable_checking();
5300 memcpy(dst, &c_seg->c_store.c_buffer[cs->c_offset], PAGE_SIZE);
5301 vm_memtag_enable_checking();
5302 } else if (c_size == 4) {
5303 int32_t data;
5304 int32_t *dptr;
5305
5306 /*
5307 * page was populated with a single value
5308 * that didn't fit into our fast hash
5309 * so we packed it in as a single non-compressed value
5310  * which we now use to populate the page
5311 */
5312 dptr = (int32_t *)(uintptr_t)dst;
5313 data = *(int32_t *)(&c_seg->c_store.c_buffer[cs->c_offset]);
5314 vm_memtag_disable_checking();
5315 sv_decompress(dptr, data);
5316 vm_memtag_enable_checking();
5317 } else { /* normal segment decompress */
5318 uint32_t my_cpu_no;
5319 char *scratch_buf;
5320
5321 my_cpu_no = cpu_number();
5322
5323 assert(my_cpu_no < compressor_cpus);
5324
5325 if (__probable(!kdp_mode)) {
5326 /*
5327 * we're behind the c_seg lock held in spin mode
5328 * which means pre-emption is disabled... therefore
5329 * the following sequence is atomic and safe
5330 */
5331 scratch_buf = &compressor_scratch_bufs[my_cpu_no * vm_compressor_get_decode_scratch_size()];
5332 } else if (flags & C_KDP_MULTICPU) {
5333 assert(vm_compressor_kdp_state.kc_scratch_bufs != NULL);
5334 scratch_buf = &vm_compressor_kdp_state.kc_scratch_bufs[my_cpu_no * vm_compressor_get_decode_scratch_size()];
5335 } else {
5336 scratch_buf = vm_compressor_kdp_state.kc_panic_scratch_buf;
5337 }
5338
5339 if (vm_compressor_algorithm() != VM_COMPRESSOR_DEFAULT_CODEC) {
5340 #if defined(__arm64__)
5341 uint16_t c_codec = cs->c_codec;
5342 uint32_t inline_popcount;
5343 vm_memtag_disable_checking();
5344 if (!metadecompressor((const uint8_t *) &c_seg->c_store.c_buffer[cs->c_offset],
5345 (uint8_t *)dst, c_size, c_codec, (void *)scratch_buf, &inline_popcount)) {
5346 vm_memtag_enable_checking();
5347 retval = DECOMPRESS_FAILED_ALGO_ERROR;
5348 } else {
5349 vm_memtag_enable_checking();
5350 assert(inline_popcount == C_SLOT_NO_POPCOUNT);
5351 }
5352 #endif
5353 } else { /* algorithm == VM_COMPRESSOR_DEFAULT_CODEC */
5354 vm_memtag_disable_checking();
5355 #if defined(__arm64__)
5356 __unreachable_ok_push
5357 if (PAGE_SIZE == 4096) {
5358 WKdm_decompress_4k((WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
5359 (WK_word *)(uintptr_t)dst, (WK_word *)(uintptr_t)scratch_buf, c_size);
5360 } else {
5361 WKdm_decompress_16k((WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
5362 (WK_word *)(uintptr_t)dst, (WK_word *)(uintptr_t)scratch_buf, c_size);
5363 }
5364 __unreachable_ok_pop
5365 #else
5366 WKdm_decompress_new((WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
5367 (WK_word *)(uintptr_t)dst, (WK_word *)(uintptr_t)scratch_buf, c_size);
5368 #endif
5369 vm_memtag_enable_checking();
5370 }
5371 } /* normal segment decompress */
5372
5373 #if CHECKSUM_THE_DATA
5374 if (cs->c_hash_data != vmc_hash(dst, PAGE_SIZE)) {
5375 #if defined(__arm64__)
5376 int32_t *dinput = &c_seg->c_store.c_buffer[cs->c_offset];
5377 panic("decompressed data doesn't match original cs: %p, hash: 0x%x, offset: %d, c_size: %d, c_rounded_size: %d, codec: %d, header: 0x%x 0x%x 0x%x", cs, cs->c_hash_data, cs->c_offset, c_size, c_rounded_size, cs->c_codec, *dinput, *(dinput + 1), *(dinput + 2));
5378 #else /* defined(__arm64__) */
5379 panic("decompressed data doesn't match original cs: %p, hash: %d, offset: 0x%x, c_size: %d", cs, cs->c_hash_data, cs->c_offset, c_size);
5380 #endif /* defined(__arm64__) */
5381 }
5382 #endif /* CHECKSUM_THE_DATA */
5383 if (c_seg->c_swappedin_ts == 0 && !kdp_mode) {
5384 clock_get_system_nanotime(&cur_ts_sec, &cur_ts_nsec);
5385
5386 age_of_cseg = (uint32_t)cur_ts_sec - c_seg->c_creation_ts;
5387 if (age_of_cseg < DECOMPRESSION_SAMPLE_MAX_AGE) {
5388 os_atomic_inc(&age_of_decompressions_during_sample_period[age_of_cseg], relaxed);
5389 } else {
5390 os_atomic_inc(&overage_decompressions_during_sample_period, relaxed);
5391 }
5392
5393 os_atomic_inc(&sample_period_decompression_count, relaxed);
5394 }
5395
5396
5397 #if TRACK_C_SEGMENT_UTILIZATION
5398 if (c_seg->c_swappedin) {
5399 c_seg->c_decompressions_since_swapin++;
5400 }
5401 #endif /* TRACK_C_SEGMENT_UTILIZATION */
5402 } /* dst */
5403 else {
5404 /*
5405  * We are freeing a compressed page from this c_seg without decompressing it, so balance the ledgers.
5406 */
5407 if (C_SEG_IS_ONDISK(c_seg)) {
5408 __assert_only unsigned int prev_swapped_count =
5409 os_atomic_dec_orig(&vm_page_swapped_count, relaxed);
5410 assert3u(prev_swapped_count, >, 0);
5411 #if CONFIG_FREEZE
5412 /*
5413 * The compression sweep feature will push out anonymous pages to disk
5414 * without going through the freezer path and so those c_segs, while
5415 * swapped out, won't have an owner.
5416 */
5417 if (c_seg->c_task_owner) {
5418 task_update_frozen_to_swap_acct(c_seg->c_task_owner, PAGE_SIZE_64, DEBIT_FROM_SWAP);
5419 }
5420
5421 /*
5422 * We are freeing a page in swap without swapping it in. We bump the in-core
5423 * count here to simulate a swapin of a page so that we can accurately
5424 * decrement it below.
5425 */
5426 os_atomic_inc(&c_segment_pages_compressed_incore, relaxed);
5427 if (c_seg->c_has_donated_pages) {
5428 os_atomic_inc(&c_segment_pages_compressed_incore_late_swapout, relaxed);
5429 }
5430 } else if (c_seg->c_state == C_ON_BAD_Q) {
5431 assert(c_seg->c_store.c_buffer == NULL);
5432 *zeroslot = 0;
5433
5434 retval = DECOMPRESS_FAILED_BAD_Q_FREEZE;
5435 goto done; /* this is intended to avoid the decrement of c_segment_pages_compressed_incore below */
5436 #endif /* CONFIG_FREEZE */
5437 }
5438 }
5439
5440 if (flags & C_KEEP) {
5441 *zeroslot = 0;
5442 goto done;
5443 }
5444
5445
5446 /* now perform needed bookkeeping for the removal of the slot from the segment */
5447 assert(kdp_mode == FALSE);
5448
5449 c_seg->c_bytes_unused += c_rounded_size;
5450 c_seg->c_bytes_used -= c_rounded_size;
5451
5452 assert(c_seg->c_slots_used);
5453 c_seg->c_slots_used--;
5454 if (dst && c_seg->c_swappedin) {
5455 task_t task = current_task();
5456 if (task) {
5457 ledger_credit(task->ledger, task_ledgers.swapins, PAGE_SIZE);
5458 }
5459 }
5460
5461 PACK_C_SIZE(cs, 0); /* mark slot as empty */
5462
5463 if (c_indx < c_seg->c_firstemptyslot) {
5464 c_seg->c_firstemptyslot = c_indx;
5465 }
5466
5467 os_atomic_dec(&c_segment_pages_compressed, relaxed);
5468 #if CONFIG_FREEZE
5469 os_atomic_dec(&c_segment_pages_compressed_incore, relaxed);
5470 assertf(c_segment_pages_compressed_incore >= 0, "-ve incore count %p 0x%x", c_seg, c_segment_pages_compressed_incore);
5471 if (c_seg->c_has_donated_pages) {
5472 os_atomic_dec(&c_segment_pages_compressed_incore_late_swapout, relaxed);
5473 assertf(c_segment_pages_compressed_incore_late_swapout >= 0, "-ve lateswapout count %p 0x%x", c_seg, c_segment_pages_compressed_incore_late_swapout);
5474 }
5475 #endif /* CONFIG_FREEZE */
5476
5477 if (c_seg->c_state != C_ON_BAD_Q && !(C_SEG_IS_ONDISK(c_seg))) {
5478 /*
5479 * C_SEG_IS_ONDISK == TRUE can occur when we're doing a
5480 * free of a compressed page (i.e. dst == NULL)
5481 */
5482 os_atomic_sub(&compressor_bytes_used, c_rounded_size, relaxed);
5483 }
5484 if (c_seg->c_busy_swapping) {
5485 /*
5486 * bypass case for c_busy_swapping...
5487 * let the swapin/swapout paths deal with putting
5488 * the c_seg on the minor compaction queue if needed
5489 */
5490 assert(c_seg->c_busy);
5491 goto done;
5492 }
5493 assert(!c_seg->c_busy);
5494
5495 if (c_seg->c_state != C_IS_FILLING) {
5496 /* did we just remove the last slot from the segment? */
5497 if (c_seg->c_bytes_used == 0) {
5498 if (!(C_SEG_IS_ONDISK(c_seg))) {
5499 /* it was compressed resident in memory */
5500 int pages_populated;
5501
5502 pages_populated = (round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset))) / PAGE_SIZE;
5503 c_seg->c_populated_offset = C_SEG_BYTES_TO_OFFSET(0);
5504
5505 if (pages_populated) {
5506 assert(c_seg->c_state != C_ON_BAD_Q);
5507 assert(c_seg->c_store.c_buffer != NULL);
5508
5509 C_SEG_BUSY(c_seg);
5510 lck_mtx_unlock_always(&c_seg->c_lock);
5511
5512 kernel_memory_depopulate(
5513 (vm_offset_t) c_seg->c_store.c_buffer,
5514 ptoa(pages_populated),
5515 KMA_COMPRESSOR, VM_KERN_MEMORY_COMPRESSOR);
5516
5517 lck_mtx_lock_spin_always(&c_seg->c_lock);
5518 C_SEG_WAKEUP_DONE(c_seg);
5519 }
5520 /* minor compaction will free it */
5521 if (!c_seg->c_on_minorcompact_q && c_seg->c_state != C_ON_SWAPIO_Q) {
5522 if (c_seg->c_state == C_ON_SWAPOUT_Q) {
5523 /* If we're on the swapout queue, we want to get off of it since there's no reason to swap out
5524  * anymore; put it on the age queue in the meantime, until minor compaction frees it */
5525 bool clear_busy = false;
5526 if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
5527 C_SEG_BUSY(c_seg);
5528
5529 lck_mtx_unlock_always(&c_seg->c_lock);
5530 lck_mtx_lock_spin_always(c_list_lock);
5531 lck_mtx_lock_spin_always(&c_seg->c_lock);
5532 clear_busy = true;
5533 }
5534 c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
5535 if (clear_busy) {
5536 C_SEG_WAKEUP_DONE(c_seg);
5537 clear_busy = false;
5538 }
5539 lck_mtx_unlock_always(c_list_lock);
5540 }
5541 c_seg_need_delayed_compaction(c_seg, FALSE);
5542 }
5543 } else { /* C_SEG_IS_ONDISK(c_seg) */
5544 /* it's empty and on-disk, make sure it's marked as sparse */
5545 if (c_seg->c_state != C_ON_SWAPPEDOUTSPARSE_Q) {
5546 c_seg_move_to_sparse_list(c_seg);
5547 consider_defragmenting = TRUE;
5548 }
5549 }
5550 } else if (c_seg->c_on_minorcompact_q) {
5551 assert(c_seg->c_state != C_ON_BAD_Q);
5552 assert(!C_SEG_IS_ON_DISK_OR_SOQ(c_seg));
5553
5554 if (C_SEG_SHOULD_MINORCOMPACT_NOW(c_seg)) {
5555 c_seg_try_minor_compaction_and_unlock(c_seg);
5556 need_unlock = FALSE;
5557 }
5558 } else if (!(C_SEG_IS_ONDISK(c_seg))) {
5559 if (c_seg->c_state != C_ON_BAD_Q && c_seg->c_state != C_ON_SWAPOUT_Q && c_seg->c_state != C_ON_SWAPIO_Q &&
5560 C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) {
5561 c_seg_need_delayed_compaction(c_seg, FALSE);
5562 }
5563 } else if (c_seg->c_state != C_ON_SWAPPEDOUTSPARSE_Q && C_SEG_ONDISK_IS_SPARSE(c_seg)) {
5564 c_seg_move_to_sparse_list(c_seg);
5565 consider_defragmenting = TRUE;
5566 }
5567 } /* c_state != C_IS_FILLING */
5568 done:
5569 if (__improbable(kdp_mode)) {
5570 return retval;
5571 }
5572
5573 if (need_unlock == TRUE) {
5574 lck_mtx_unlock_always(&c_seg->c_lock);
5575 }
5576
5577 PAGE_REPLACEMENT_DISALLOWED(FALSE);
5578
5579 if (consider_defragmenting == TRUE) {
5580 vm_swap_consider_defragmenting(VM_SWAP_FLAGS_NONE);
5581 }
5582
5583 #if !XNU_TARGET_OS_OSX
5584 /*
5585 * Decompressions will generate fragmentation in the compressor pool
5586 * over time. Consider waking the compactor thread if any of the
5587 * fragmentation thresholds have been crossed as a result of this
5588 * decompression.
5589 */
5590 vm_consider_waking_compactor_swapper();
5591 #endif /* !XNU_TARGET_OS_OSX */
5592
5593 return retval;
5594 }
5595
5596
5597 inline bool
5598 vm_compressor_is_slot_compressed(int *slot)
5599 {
5600 #if !CONFIG_TRACK_UNMODIFIED_ANON_PAGES
5601 #pragma unused(slot)
5602 return true;
5603 #else /* !CONFIG_TRACK_UNMODIFIED_ANON_PAGES*/
5604 c_slot_mapping_t slot_ptr = (c_slot_mapping_t)slot;
5605 return !slot_ptr->s_uncompressed;
5606 #endif /* !CONFIG_TRACK_UNMODIFIED_ANON_PAGES*/
5607 }
5608
5609 vm_decompress_result_t
5610 vm_compressor_get(ppnum_t pn, int *slot, vm_compressor_options_t flags)
5611 {
5612 c_slot_mapping_t slot_ptr;
5613 char *dst;
5614 int zeroslot = 1;
5615 vm_decompress_result_t retval;
5616
5617 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
5618 if (flags & C_PAGE_UNMODIFIED) {
5619 int iretval = vm_uncompressed_get(pn, slot, flags | C_KEEP);
5620 if (iretval == 0) {
5621 os_atomic_inc(&compressor_ro_uncompressed_get, relaxed);
5622 return DECOMPRESS_SUCCESS;
5623 }
5624
5625 return DECOMPRESS_FAILED_UNMODIFIED;
5626 }
5627 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
5628
5629 /* get the address of this page in the physical aperture to fill into */
5630 dst = pmap_map_compressor_page(pn);
5631 slot_ptr = (c_slot_mapping_t)slot;
5632
5633 assert(dst != NULL);
5634
5635 if (slot_ptr->s_cseg == C_SV_CSEG_ID) {
5636 int32_t data;
5637 int32_t *dptr;
5638
5639 /*
5640 * page was populated with a single value
5641  * that found a home in our hash table;
5642  * grab that value from the hash and
5643  * populate the page with it
5644 */
5645 dptr = (int32_t *)(uintptr_t)dst;
5646 data = c_segment_sv_hash_table[slot_ptr->s_cindx].he_data;
5647 sv_decompress(dptr, data);
5648
5649 if (!(flags & C_KEEP)) {
5650 c_segment_sv_hash_drop_ref(slot_ptr->s_cindx);
5651
5652 os_atomic_dec(&c_segment_pages_compressed, relaxed);
5653 *slot = 0;
5654 }
5655 if (data) {
5656 os_atomic_inc(&c_segment_svp_nonzero_decompressions, relaxed);
5657 } else {
5658 os_atomic_inc(&c_segment_svp_zero_decompressions, relaxed);
5659 }
5660
5661 pmap_unmap_compressor_page(pn, dst);
5662 return DECOMPRESS_SUCCESS;
5663 }
5664 retval = c_decompress_page(dst, slot_ptr, flags, &zeroslot);
5665
5666 /*
5667 * zeroslot will be set to 0 by c_decompress_page if (flags & C_KEEP)
5668 * or (flags & C_DONT_BLOCK) and we found 'c_busy' or 'C_SEG_IS_ONDISK' to be TRUE
5669 */
5670 if (zeroslot) {
5671 *slot = 0;
5672 }
5673
5674 pmap_unmap_compressor_page(pn, dst);
5675
5676 /*
5677 * returns 0 if we successfully decompressed a page from a segment already in memory
5678 * returns 1 if we had to first swap in the segment, before successfully decompressing the page
5679 * returns -1 if we encountered an error swapping in the segment - decompression failed
5680 * returns -2 if (flags & C_DONT_BLOCK) and we found 'c_busy' or 'C_SEG_IS_ONDISK' to be true
5681 */
5682 return retval;
5683 }
5684
5685 vm_decompress_result_t
5686 vm_compressor_free(int *slot, vm_compressor_options_t flags)
5687 {
5688 bool slot_is_compressed = vm_compressor_is_slot_compressed(slot);
5689
5690 if (slot_is_compressed) {
5691 c_slot_mapping_t slot_ptr;
5692 int zeroslot = 1;
5693 vm_decompress_result_t retval = DECOMPRESS_SUCCESS;
5694
5695 assert(flags == 0 || flags == C_DONT_BLOCK);
5696
5697 slot_ptr = (c_slot_mapping_t)slot;
5698
5699 if (slot_ptr->s_cseg == C_SV_CSEG_ID) {
5700 c_segment_sv_hash_drop_ref(slot_ptr->s_cindx);
5701 os_atomic_dec(&c_segment_pages_compressed, relaxed);
5702
5703 *slot = 0;
5704 return 0;
5705 }
5706
5707 retval = c_decompress_page(NULL, slot_ptr, flags, &zeroslot);
5708 /*
5709 * returns 0 if we successfully freed the specified compressed page
5710 * returns -1 if we encountered an error swapping in the segment - decompression failed
5711 * returns -2 if (flags & C_DONT_BLOCK) and we found 'c_busy' set
5712 */
5713
5714 if (retval == DECOMPRESS_SUCCESS) {
5715 *slot = 0;
5716 }
5717
5718 return retval;
5719 }
5720 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
5721 else {
5722 if ((flags & C_PAGE_UNMODIFIED) == 0) {
5723 /* moving from uncompressed state to compressed. Free it.*/
5724 vm_uncompressed_free(slot, 0);
5725 assert(*slot == 0);
5726 }
5727 }
5728 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
5729 return KERN_SUCCESS;
5730 }
5731
5732 kern_return_t
5733 vm_compressor_put(ppnum_t pn, int *slot, void **current_chead, char *scratch_buf, vm_compressor_options_t flags)
5734 {
5735 char *src;
5736 kern_return_t kr;
5737
5738 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
5739 if (flags & C_PAGE_UNMODIFIED) {
5740 if (*slot) {
5741 os_atomic_inc(&compressor_ro_uncompressed_skip_returned, relaxed);
5742 return KERN_SUCCESS;
5743 } else {
5744 kr = vm_uncompressed_put(pn, slot);
5745 if (kr == KERN_SUCCESS) {
5746 os_atomic_inc(&compressor_ro_uncompressed_put, relaxed);
5747 return kr;
5748 }
5749 }
5750 }
5751 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
5752
5753 /* get the address of the page in the physical aperture in the kernel task's virtual memory */
5754 src = pmap_map_compressor_page(pn);
5755 assert(src != NULL);
5756
5757 kr = c_compress_page(src, (c_slot_mapping_t)slot, (c_segment_t *)current_chead, scratch_buf, flags);
5758 pmap_unmap_compressor_page(pn, src);
5759
5760 return kr;
5761 }
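/*
 * Typical lifecycle sketch for the three entry points above (hypothetical
 * pager-side caller, error handling elided):
 *
 *   int   slot  = 0;
 *   void *chead = NULL;
 *
 *   vm_compressor_put(pn, &slot, &chead, scratch_buf, 0);   // compress page 'pn' into a segment
 *   ...
 *   vm_compressor_get(pn, &slot, 0);                        // decompress it back into 'pn'; slot is cleared
 *   // or, to discard the compressed copy without reading it back:
 *   vm_compressor_free(&slot, 0);
 */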
5762
5763 void
5764 vm_compressor_transfer(
5765 int *dst_slot_p,
5766 int *src_slot_p)
5767 {
5768 c_slot_mapping_t dst_slot, src_slot;
5769 c_segment_t c_seg;
5770 uint16_t c_indx;
5771 c_slot_t cs;
5772
5773 src_slot = (c_slot_mapping_t) src_slot_p;
5774
5775 if (src_slot->s_cseg == C_SV_CSEG_ID || !vm_compressor_is_slot_compressed(src_slot_p)) {
5776 *dst_slot_p = *src_slot_p;
5777 *src_slot_p = 0;
5778 return;
5779 }
5780 dst_slot = (c_slot_mapping_t) dst_slot_p;
5781 Retry:
5782 PAGE_REPLACEMENT_DISALLOWED(TRUE);
5783 /* get segment for src_slot */
5784 c_seg = c_segments_get(src_slot->s_cseg - 1)->c_seg;
5785 /* lock segment */
5786 lck_mtx_lock_spin_always(&c_seg->c_lock);
5787 /* wait if it's busy */
5788 if (c_seg->c_busy && !c_seg->c_busy_swapping) {
5789 PAGE_REPLACEMENT_DISALLOWED(FALSE);
5790 c_seg_wait_on_busy(c_seg);
5791 goto Retry;
5792 }
5793 /* find the c_slot */
5794 c_indx = src_slot->s_cindx;
5795 cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
5796 /* point the c_slot back to dst_slot instead of src_slot */
5797 C_SLOT_ASSERT_PACKABLE(dst_slot);
5798 cs->c_packed_ptr = C_SLOT_PACK_PTR(dst_slot);
5799 /* transfer */
5800 *dst_slot_p = *src_slot_p;
5801 *src_slot_p = 0;
5802 lck_mtx_unlock_always(&c_seg->c_lock);
5803 PAGE_REPLACEMENT_DISALLOWED(FALSE);
5804 }
5805
5806 #if defined(__arm64__)
5807 extern uint64_t vm_swapfile_last_failed_to_create_ts;
5808 __attribute__((noreturn))
5809 void
5810 vm_panic_hibernate_write_image_failed(
5811 int err,
5812 uint64_t file_size_min,
5813 uint64_t file_size_max,
5814 uint64_t file_size)
5815 {
5816 panic("hibernate_write_image encountered error 0x%x - %u, %u, %d, %d, %d, %d, %d, %d, %d, %d, %llu, %d, %d, %d, %llu, %llu, %llu\n",
5817 err,
5818 VM_PAGE_COMPRESSOR_COUNT, vm_page_wire_count,
5819 c_age_count, c_major_count, c_minor_count, (c_early_swapout_count + c_regular_swapout_count + c_late_swapout_count), c_swappedout_sparse_count,
5820 vm_num_swap_files, vm_num_pinned_swap_files, vm_swappin_enabled, vm_swap_put_failures,
5821 (vm_swapfile_last_failed_to_create_ts ? 1:0), hibernate_no_swapspace, hibernate_flush_timed_out,
5822 file_size_min, file_size_max, file_size);
5823 }
5824 #endif /*(__arm64__)*/
5825
5826 #if CONFIG_FREEZE
5827
5828 int freezer_finished_filling = 0;
5829
5830 void
5831 vm_compressor_finished_filling(
5832 void **current_chead)
5833 {
5834 c_segment_t c_seg;
5835
5836 if ((c_seg = *(c_segment_t *)current_chead) == NULL) {
5837 return;
5838 }
5839
5840 assert(c_seg->c_state == C_IS_FILLING);
5841
5842 lck_mtx_lock_spin_always(&c_seg->c_lock);
5843
5844 c_current_seg_filled(c_seg, (c_segment_t *)current_chead);
5845
5846 lck_mtx_unlock_always(&c_seg->c_lock);
5847
5848 freezer_finished_filling++;
5849 }
5850
5851
5852 /*
5853 * This routine is used to transfer the compressed chunks from
5854 * the c_seg/cindx pointed to by slot_p into a new c_seg headed
5855 * by the current_chead and a new cindx within that c_seg.
5856 *
5857 * Currently, this routine is only used by the "freezer backed by
5858 * compressor with swap" mode to create a series of c_segs that
5859 * only contain compressed data belonging to one task. So, we
5860 * move a task's previously compressed data into a set of new
5861 * c_segs which will also hold the task's yet to be compressed data.
5862 */
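/*
 * Hypothetical freezer-side usage sketch (the real caller lives in the freezer
 * code and is not shown here): walk every compressed slot owned by the task and
 * pull it into the task-private chain of c_segs, then close out the last one.
 *
 *   void *task_chead = NULL;
 *   // for each compressed slot 'slotp' belonging to the task:
 *   if (vm_compressor_relocate(&task_chead, slotp) != KERN_SUCCESS) {
 *       // e.g. KERN_RESOURCE_SHORTAGE: out of compressor segments
 *   }
 *   vm_compressor_finished_filling(&task_chead);
 */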
5863
5864 kern_return_t
5865 vm_compressor_relocate(
5866 void **current_chead,
5867 int *slot_p)
5868 {
5869 c_slot_mapping_t slot_ptr;
5870 c_slot_mapping_t src_slot;
5871 uint32_t c_rounded_size;
5872 uint32_t c_size;
5873 uint16_t dst_slot;
5874 c_slot_t c_dst;
5875 c_slot_t c_src;
5876 uint16_t c_indx;
5877 c_segment_t c_seg_dst = NULL;
5878 c_segment_t c_seg_src = NULL;
5879 kern_return_t kr = KERN_SUCCESS;
5880 bool nearing_limits;
5881
5882
5883 src_slot = (c_slot_mapping_t) slot_p;
5884
5885 if (src_slot->s_cseg == C_SV_CSEG_ID) {
5886 /*
5887 * no need to relocate... this is a page full of a single
5888 * value which is hashed to a single entry not contained
5889 * in a c_segment_t
5890 */
5891 return kr;
5892 }
5893
5894 if (vm_compressor_is_slot_compressed((int *)src_slot) == false) {
5895 /*
5896 * Unmodified anonymous pages are sitting uncompressed on disk.
5897 * So don't pull them back in again.
5898 */
5899 return kr;
5900 }
5901
5902 Relookup_dst:
5903 c_seg_dst = c_seg_allocate((c_segment_t *)current_chead, &nearing_limits);
5904 /*
5905 * returns with c_seg lock held
5906 * and PAGE_REPLACEMENT_DISALLOWED(TRUE)...
5907 * c_nextslot has been allocated and
5908 * c_store.c_buffer populated
5909 */
5910 if (c_seg_dst == NULL) {
5911 /*
5912 * Out of compression segments?
5913 */
5914 if (nearing_limits) {
5915 memorystatus_respond_to_compressor_exhaustion();
5916 }
5917 kr = KERN_RESOURCE_SHORTAGE;
5918 goto out;
5919 }
5920
5921 assert(c_seg_dst->c_busy == 0);
5922
5923 C_SEG_BUSY(c_seg_dst);
5924
5925 dst_slot = c_seg_dst->c_nextslot;
5926
5927 lck_mtx_unlock_always(&c_seg_dst->c_lock);
5928 if (nearing_limits) {
5929 memorystatus_respond_to_compressor_exhaustion();
5930 }
5931
5932 Relookup_src:
5933 c_seg_src = c_segments_get(src_slot->s_cseg - 1)->c_seg;
5934
5935 assert(c_seg_dst != c_seg_src);
5936
5937 lck_mtx_lock_spin_always(&c_seg_src->c_lock);
5938
5939 if (C_SEG_IS_ON_DISK_OR_SOQ(c_seg_src) ||
5940 c_seg_src->c_state == C_IS_FILLING) {
5941 /*
5942 * Skip this page if :-
5943 * a) the src c_seg is already on-disk (or on its way there)
5944 * A "thaw" can mark a process as eligible for
5945 * another freeze cycle without bringing any of
5946 * its swapped out c_segs back from disk (because
5947 * that is done on-demand).
5948 * Or, this page may be mapped elsewhere in the task's map,
5949 * and we may have marked it for swap already.
5950 *
5951 * b) Or, the src c_seg is being filled by the compressor
5952 * thread. We don't want the added latency of waiting for
5953 * this c_seg in the freeze path and so we skip it.
5954 */
5955
5956 PAGE_REPLACEMENT_DISALLOWED(FALSE);
5957
5958 lck_mtx_unlock_always(&c_seg_src->c_lock);
5959
5960 c_seg_src = NULL;
5961
5962 goto out;
5963 }
5964
5965 if (c_seg_src->c_busy) {
5966 PAGE_REPLACEMENT_DISALLOWED(FALSE);
5967 c_seg_wait_on_busy(c_seg_src);
5968
5969 c_seg_src = NULL;
5970
5971 PAGE_REPLACEMENT_DISALLOWED(TRUE);
5972
5973 goto Relookup_src;
5974 }
5975
5976 C_SEG_BUSY(c_seg_src);
5977
5978 lck_mtx_unlock_always(&c_seg_src->c_lock);
5979
5980 /* find the c_slot */
5981 c_indx = src_slot->s_cindx;
5982
5983 c_src = C_SEG_SLOT_FROM_INDEX(c_seg_src, c_indx);
5984
5985 c_size = UNPACK_C_SIZE(c_src);
5986
5987 assert(c_size);
5988 int combined_size = c_size + c_slot_extra_size(c_src);
5989
5990 if (combined_size > (uint32_t)(c_seg_bufsize - C_SEG_OFFSET_TO_BYTES((int32_t)c_seg_dst->c_nextoffset))) {
5991 /*
5992 * This segment is full. We need a new one.
5993 */
5994
5995 lck_mtx_lock_spin_always(&c_seg_src->c_lock);
5996 C_SEG_WAKEUP_DONE(c_seg_src);
5997 lck_mtx_unlock_always(&c_seg_src->c_lock);
5998
5999 c_seg_src = NULL;
6000
6001 lck_mtx_lock_spin_always(&c_seg_dst->c_lock);
6002
6003 assert(c_seg_dst->c_busy);
6004 assert(c_seg_dst->c_state == C_IS_FILLING);
6005 assert(!c_seg_dst->c_on_minorcompact_q);
6006
6007 c_current_seg_filled(c_seg_dst, (c_segment_t *)current_chead);
6008 assert(*current_chead == NULL);
6009
6010 C_SEG_WAKEUP_DONE(c_seg_dst);
6011
6012 lck_mtx_unlock_always(&c_seg_dst->c_lock);
6013
6014 c_seg_dst = NULL;
6015
6016 PAGE_REPLACEMENT_DISALLOWED(FALSE);
6017
6018 goto Relookup_dst;
6019 }
6020
6021 c_dst = C_SEG_SLOT_FROM_INDEX(c_seg_dst, c_seg_dst->c_nextslot);
6022
6023 memcpy(&c_seg_dst->c_store.c_buffer[c_seg_dst->c_nextoffset], &c_seg_src->c_store.c_buffer[c_src->c_offset], combined_size);
6024 PAGE_REPLACEMENT_DISALLOWED(FALSE);
6025 /*
6026 * Is platform alignment actually necessary since wkdm aligns its output?
6027 */
6028 c_rounded_size = C_SEG_ROUND_TO_ALIGNMENT(combined_size);
6029
6030 cslot_copy(c_dst, c_src);
6031 c_dst->c_offset = c_seg_dst->c_nextoffset;
6032
6033 if (c_seg_dst->c_firstemptyslot == c_seg_dst->c_nextslot) {
6034 c_seg_dst->c_firstemptyslot++;
6035 }
6036
6037 c_seg_dst->c_slots_used++;
6038 c_seg_dst->c_nextslot++;
6039 c_seg_dst->c_bytes_used += c_rounded_size;
6040 c_seg_dst->c_nextoffset += C_SEG_BYTES_TO_OFFSET(c_rounded_size);
6041
6042
6043 PACK_C_SIZE(c_src, 0);
6044
6045 c_seg_src->c_bytes_used -= c_rounded_size;
6046 c_seg_src->c_bytes_unused += c_rounded_size;
6047
6048 assert(c_seg_src->c_slots_used);
6049 c_seg_src->c_slots_used--;
6050
6051 if (!c_seg_src->c_swappedin) {
6052 /* Pessimistically lose swappedin status when non-swappedin pages are added. */
6053 c_seg_dst->c_swappedin = false;
6054 }
6055
6056 if (c_indx < c_seg_src->c_firstemptyslot) {
6057 c_seg_src->c_firstemptyslot = c_indx;
6058 }
6059
6060 c_dst = C_SEG_SLOT_FROM_INDEX(c_seg_dst, dst_slot);
6061
6062 PAGE_REPLACEMENT_ALLOWED(TRUE);
6063 slot_ptr = C_SLOT_UNPACK_PTR(c_dst);
6064 /* <csegno=0,indx=0> would mean "empty slot", so use csegno+1 */
6065 slot_ptr->s_cseg = c_seg_dst->c_mysegno + 1;
6066 slot_ptr->s_cindx = dst_slot;
6067
6068 PAGE_REPLACEMENT_ALLOWED(FALSE);
6069
6070 out:
6071 if (c_seg_src) {
6072 lck_mtx_lock_spin_always(&c_seg_src->c_lock);
6073
6074 C_SEG_WAKEUP_DONE(c_seg_src);
6075
6076 if (c_seg_src->c_bytes_used == 0 && c_seg_src->c_state != C_IS_FILLING) {
6077 if (!c_seg_src->c_on_minorcompact_q) {
6078 c_seg_need_delayed_compaction(c_seg_src, FALSE);
6079 }
6080 }
6081
6082 lck_mtx_unlock_always(&c_seg_src->c_lock);
6083 }
6084
6085 if (c_seg_dst) {
6086 PAGE_REPLACEMENT_DISALLOWED(TRUE);
6087
6088 lck_mtx_lock_spin_always(&c_seg_dst->c_lock);
6089
6090 if (c_seg_dst->c_nextoffset >= c_seg_off_limit || c_seg_dst->c_nextslot >= C_SLOT_MAX_INDEX) {
6091 /*
6092 * Nearing or exceeded maximum slot and offset capacity.
6093 */
6094 assert(c_seg_dst->c_busy);
6095 assert(c_seg_dst->c_state == C_IS_FILLING);
6096 assert(!c_seg_dst->c_on_minorcompact_q);
6097
6098 c_current_seg_filled(c_seg_dst, (c_segment_t *)current_chead);
6099 assert(*current_chead == NULL);
6100 }
6101
6102 C_SEG_WAKEUP_DONE(c_seg_dst);
6103
6104 lck_mtx_unlock_always(&c_seg_dst->c_lock);
6105
6106 c_seg_dst = NULL;
6107
6108 PAGE_REPLACEMENT_DISALLOWED(FALSE);
6109 }
6110
6111 return kr;
6112 }
6113 #endif /* CONFIG_FREEZE */
6114
6115 #if DEVELOPMENT || DEBUG
6116
6117 void
6118 vm_compressor_inject_error(int *slot)
6119 {
6120 c_slot_mapping_t slot_ptr = (c_slot_mapping_t)slot;
6121
6122 /* No error detection for single-value compression. */
6123 if (slot_ptr->s_cseg == C_SV_CSEG_ID) {
6124 printf("%s(): cannot inject errors in SV-compressed pages\n", __func__ );
6125 return;
6126 }
6127
6128 /* s_cseg is actually "segno+1" */
6129 const uint32_t c_segno = slot_ptr->s_cseg - 1;
6130
6131 assert(c_segno < c_segments_available);
6132 assert(c_segments_get(c_segno)->c_segno >= c_segments_available);
6133
6134 const c_segment_t c_seg = c_segments_get(c_segno)->c_seg;
6135
6136 PAGE_REPLACEMENT_DISALLOWED(TRUE);
6137
6138 lck_mtx_lock_spin_always(&c_seg->c_lock);
6139 assert(c_seg->c_state != C_IS_EMPTY && c_seg->c_state != C_IS_FREE);
6140
6141 const uint16_t c_indx = slot_ptr->s_cindx;
6142 assert(c_indx < c_seg->c_nextslot);
6143
6144 /*
6145 * To safely make this segment temporarily writable, we need to mark
6146 * the segment busy, which allows us to release the segment lock.
6147 */
6148 while (c_seg->c_busy) {
6149 c_seg_wait_on_busy(c_seg);
6150 lck_mtx_lock_spin_always(&c_seg->c_lock);
6151 }
6152 C_SEG_BUSY(c_seg);
6153
6154 bool already_writable = (c_seg->c_state == C_IS_FILLING);
6155 if (!already_writable) {
6156 /*
6157 * Protection update must be performed preemptibly, so temporarily drop
6158 * the lock. Having set c_busy will prevent most other concurrent
6159 * operations.
6160 */
6161 lck_mtx_unlock_always(&c_seg->c_lock);
6162 C_SEG_MAKE_WRITEABLE(c_seg);
6163 lck_mtx_lock_spin_always(&c_seg->c_lock);
6164 }
6165
6166 /*
6167 * Once we've released the lock following our c_state == C_IS_FILLING check,
6168 * c_current_seg_filled() can (re-)write-protect the segment. However, it
6169 * will transition from C_IS_FILLING before releasing the c_seg lock, so we
6170 * can detect this by re-checking after we've reobtained the lock.
6171 */
6172 if (already_writable && c_seg->c_state != C_IS_FILLING) {
6173 lck_mtx_unlock_always(&c_seg->c_lock);
6174 C_SEG_MAKE_WRITEABLE(c_seg);
6175 lck_mtx_lock_spin_always(&c_seg->c_lock);
6176 already_writable = false;
6177 /* Segment can't be freed while c_busy is set. */
6178 assert(c_seg->c_state != C_IS_FILLING);
6179 }
6180
6181 /*
6182 * Skip if the segment is on disk. This check can only be performed after
6183 * the final acquisition of the segment lock before we attempt to write to
6184 * the segment.
6185 */
6186 if (!C_SEG_IS_ON_DISK_OR_SOQ(c_seg)) {
6187 c_slot_t cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
6188 int32_t *data = &c_seg->c_store.c_buffer[cs->c_offset];
6189 /* assume that the compressed data holds at least one int32_t */
6190 assert(UNPACK_C_SIZE(cs) > sizeof(*data));
6191 /*
6192 * This bit is known to be in the payload of a MISS packet resulting from
6193 * the pattern used in the test pattern from decompression_failure.c.
6194 * Flipping it should result in many corrupted bits in the test page.
6195 */
6196 data[0] ^= 0x00000100;
6197 }
6198
6199 if (!already_writable) {
6200 lck_mtx_unlock_always(&c_seg->c_lock);
6201 C_SEG_WRITE_PROTECT(c_seg);
6202 lck_mtx_lock_spin_always(&c_seg->c_lock);
6203 }
6204
6205 C_SEG_WAKEUP_DONE(c_seg);
6206 lck_mtx_unlock_always(&c_seg->c_lock);
6207
6208 PAGE_REPLACEMENT_DISALLOWED(FALSE);
6209 }
6210
6211 /*
6212 * Serialize information about a specific segment
6213  * returns KERN_SUCCESS if the segment was written or there is nothing to write for this segno,
6214  * KERN_NO_SPACE if the buffer is not big enough
6215  * argument size: on input, the size of the buffer; on output, the size written (set to 0 on failure)
6216 */
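/*
 * Output layout sketch: the caller's buffer receives one struct c_segment_info
 * followed by csi_slots_len struct c_slot_info records. A hypothetical caller
 * sizing for the worst case (C_SLOT_MAX_INDEX slots) might do:
 *
 *   size_t bufsize = sizeof(struct c_segment_info) +
 *                    C_SLOT_MAX_INDEX * sizeof(struct c_slot_info);
 *   kern_return_t kr = vm_compressor_serialize_segment_debug_info(segno, buf, &bufsize, with_data);
 *   // on KERN_SUCCESS, bufsize now holds the number of bytes actually written
 */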
6217 kern_return_t
6218 vm_compressor_serialize_segment_debug_info(int segno, char *buf, size_t *size, vm_c_serialize_add_data_t with_data)
6219 {
6220 size_t insize = *size;
6221 size_t offset = 0;
6222 *size = 0;
6223 if (c_segments_get(segno)->c_segno < c_segments_available) {
6224 /* This check means no pointer is assigned here, so it must be an index into the free list.
6225  * If this were an active c_segment, .c_seg would hold a pointer, which interpreted as an integer
6226  * would be larger than c_segments_available. See also the assert to this effect right after c_seg
6227  * is assigned in c_seg_allocate()
6228 */
6229 return KERN_SUCCESS;
6230 }
6231 if (c_segments_get(segno)->c_segno == (uint32_t)-1) {
6232 /* c_segno of the end of the free-list */
6233 return KERN_SUCCESS;
6234 }
6235
6236 const struct c_segment* c_seg = c_segments_get(segno)->c_seg;
6237 if (c_seg->c_state == C_IS_FREE) {
6238 return KERN_SUCCESS; /* nothing needs to be done */
6239 }
6240
6241 int nslots = c_seg->c_nextslot;
6242 /* do we have enough space for slots (without data)? */
6243 if (sizeof(struct c_segment_info) + (nslots * sizeof(struct c_slot_info)) > insize) {
6244 return KERN_NO_SPACE; /* not enough space, please call me again */
6245 }
6246
6247 struct c_segment_info* csi = (struct c_segment_info*)buf;
6248 offset += sizeof(struct c_segment_info);
6249
6250 csi->csi_mysegno = c_seg->c_mysegno;
6251 csi->csi_creation_ts = c_seg->c_creation_ts;
6252 csi->csi_swappedin_ts = c_seg->c_swappedin_ts;
6253 csi->csi_bytes_unused = c_seg->c_bytes_unused;
6254 csi->csi_bytes_used = c_seg->c_bytes_used;
6255 csi->csi_populated_offset = c_seg->c_populated_offset;
6256 csi->csi_state = c_seg->c_state;
6257 csi->csi_swappedin = c_seg->c_swappedin;
6258 csi->csi_on_minor_compact_q = c_seg->c_on_minorcompact_q;
6259 csi->csi_has_donated_pages = c_seg->c_has_donated_pages;
6260 csi->csi_slots_used = (uint16_t)c_seg->c_slots_used;
6261 csi->csi_slot_var_array_len = c_seg->c_slot_var_array_len;
6262 csi->csi_slots_len = (uint16_t)nslots;
6263 #if TRACK_C_SEGMENT_UTILIZATION
6264 csi->csi_decompressions_since_swapin = c_seg->c_decompressions_since_swapin;
6265 #else
6266 csi->csi_decompressions_since_swapin = 0;
6267 #endif /* TRACK_C_SEGMENT_UTILIZATION */
6268 /* This entire data collection races with the compressor threads, which can change any
6269  * of these data members and, specifically, can drop the data buffer to swap.
6270  * We don't take the segment lock since that would slow down the iteration over the segments
6271 * and hurt the "snapshot-ness" of the data. The race risk is acceptable since this is
6272 * used only for a tester in development. */
6273
6274 for (int si = 0; si < nslots; ++si) {
6275 if (offset + sizeof(struct c_slot_info) > insize) {
6276 return KERN_NO_SPACE;
6277 }
6278 /* see also c_seg_validate() for some of the details */
6279 const struct c_slot* cs = C_SEG_SLOT_FROM_INDEX(c_seg, si);
6280 struct c_slot_info* ssi = (struct c_slot_info*)(buf + offset);
6281 offset += sizeof(struct c_slot_info);
6282 ssi->csi_size = (uint16_t)UNPACK_C_SIZE(cs);
6283 #pragma unused(with_data)
6284 ssi->csi_unused = 0;
6285 }
6286 *size = offset;
6287 return KERN_SUCCESS;
6288 }
6289
6290 #endif /* DEVELOPMENT || DEBUG */
6291
6292 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
6293
6294 struct vnode;
6295 extern void vm_swapfile_open(const char *path, struct vnode **vp);
6296 extern int vm_swapfile_preallocate(struct vnode *vp, uint64_t *size, boolean_t *pin);
6297
6298 struct vnode *uncompressed_vp0 = NULL;
6299 struct vnode *uncompressed_vp1 = NULL;
6300 uint32_t uncompressed_file0_free_pages = 0, uncompressed_file1_free_pages = 0;
6301 uint64_t uncompressed_file0_free_offset = 0, uncompressed_file1_free_offset = 0;
6302
6303 uint64_t compressor_ro_uncompressed = 0;
6304 uint64_t compressor_ro_uncompressed_total_returned = 0;
6305 uint64_t compressor_ro_uncompressed_skip_returned = 0;
6306 uint64_t compressor_ro_uncompressed_get = 0;
6307 uint64_t compressor_ro_uncompressed_put = 0;
6308 uint64_t compressor_ro_uncompressed_swap_usage = 0;
6309
6310 extern void vnode_put(struct vnode* vp);
6311 extern int vnode_getwithref(struct vnode* vp);
6312 extern int vm_swapfile_io(struct vnode *vp, uint64_t offset, uint64_t start, int npages, int flags, void *upl_ctx);
6313
6314 #define MAX_OFFSET_PAGES (255)
6315 uint64_t uncompressed_file0_space_bitmap[MAX_OFFSET_PAGES];
6316 uint64_t uncompressed_file1_space_bitmap[MAX_OFFSET_PAGES];
6317
6318 #define UNCOMPRESSED_FILEIDX_OFFSET_MASK (((uint32_t)1<<31ull) - 1)
6319 #define UNCOMPRESSED_FILEIDX_SHIFT (29)
6320 #define UNCOMPRESSED_FILEIDX_MASK (3)
6321 #define UNCOMPRESSED_OFFSET_SHIFT (29)
6322 #define UNCOMPRESSED_OFFSET_MASK (7)
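/*
 * Slot encoding sketch derived from the masks above and the extract helpers
 * below: the low 29 bits carry the byte offset into the swapfile and the next
 * two bits carry the file index (1 or 2). For example, file 0 (encoded as
 * index 1) at page index 3 on a hypothetical 16K-page system:
 *
 *   int slot = (1 << UNCOMPRESSED_FILEIDX_SHIFT) + (3 * PAGE_SIZE);
 *   assert(vm_uncompressed_extract_swap_file(slot)   == 1);
 *   assert(vm_uncompressed_extract_swap_offset(slot) == 3 * PAGE_SIZE);
 */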
6323
6324 static uint32_t
6325 vm_uncompressed_extract_swap_file(int slot)
6326 {
6327 uint32_t fileidx = (((uint32_t)slot & UNCOMPRESSED_FILEIDX_OFFSET_MASK) >> UNCOMPRESSED_FILEIDX_SHIFT) & UNCOMPRESSED_FILEIDX_MASK;
6328 return fileidx;
6329 }
6330
6331 static uint32_t
6332 vm_uncompressed_extract_swap_offset(int slot)
6333 {
6334 return slot & (uint32_t)(~(UNCOMPRESSED_OFFSET_MASK << UNCOMPRESSED_OFFSET_SHIFT));
6335 }
6336
6337 static void
6338 vm_uncompressed_return_space_to_swap(int slot)
6339 {
6340 PAGE_REPLACEMENT_ALLOWED(TRUE);
6341 uint32_t fileidx = vm_uncompressed_extract_swap_file(slot);
6342 if (fileidx == 1) {
6343 uint32_t free_offset = vm_uncompressed_extract_swap_offset(slot);
6344 uint64_t pgidx = free_offset / PAGE_SIZE_64;
6345 uint64_t chunkidx = pgidx / 64;
6346 uint64_t chunkoffset = pgidx % 64;
6347 #if DEVELOPMENT || DEBUG
6348 uint64_t vaddr = (uint64_t)&uncompressed_file0_space_bitmap[chunkidx];
6349 uint64_t maxvaddr = (uint64_t)&uncompressed_file0_space_bitmap[MAX_OFFSET_PAGES];
6350 assertf(vaddr < maxvaddr, "0x%llx 0x%llx", vaddr, maxvaddr);
6351 #endif /*DEVELOPMENT || DEBUG*/
6352 assertf((uncompressed_file0_space_bitmap[chunkidx] & ((uint64_t)1 << chunkoffset)),
6353 "0x%x %llu %llu", slot, chunkidx, chunkoffset);
6354 uncompressed_file0_space_bitmap[chunkidx] &= ~((uint64_t)1 << chunkoffset);
6355 assertf(!(uncompressed_file0_space_bitmap[chunkidx] & ((uint64_t)1 << chunkoffset)),
6356 "0x%x %llu %llu", slot, chunkidx, chunkoffset);
6357
6358 uncompressed_file0_free_pages++;
6359 } else {
6360 uint32_t free_offset = vm_uncompressed_extract_swap_offset(slot);
6361 uint64_t pgidx = free_offset / PAGE_SIZE_64;
6362 uint64_t chunkidx = pgidx / 64;
6363 uint64_t chunkoffset = pgidx % 64;
6364 assertf((uncompressed_file1_space_bitmap[chunkidx] & ((uint64_t)1 << chunkoffset)),
6365 "%llu %llu", chunkidx, chunkoffset);
6366 uncompressed_file1_space_bitmap[chunkidx] &= ~((uint64_t)1 << chunkoffset);
6367
6368 uncompressed_file1_free_pages++;
6369 }
6370 compressor_ro_uncompressed_swap_usage--;
6371 PAGE_REPLACEMENT_ALLOWED(FALSE);
6372 }
6373
6374 static int
6375 vm_uncompressed_reserve_space_in_swap()
6376 {
6377 int slot = 0;
6378 if (uncompressed_file0_free_pages == 0 && uncompressed_file1_free_pages == 0) {
6379 return -1;
6380 }
6381
6382 PAGE_REPLACEMENT_ALLOWED(TRUE);
6383 if (uncompressed_file0_free_pages) {
6384 uint64_t chunkidx = 0;
6385 uint64_t chunkoffset = 0;
6386 while (uncompressed_file0_space_bitmap[chunkidx] == 0xffffffffffffffff) {
6387 chunkidx++;
6388 }
6389 while (uncompressed_file0_space_bitmap[chunkidx] & ((uint64_t)1 << chunkoffset)) {
6390 chunkoffset++;
6391 }
6392
6393 assertf((uncompressed_file0_space_bitmap[chunkidx] & ((uint64_t)1 << chunkoffset)) == 0,
6394 "%llu %llu", chunkidx, chunkoffset);
6395 #if DEVELOPMENT || DEBUG
6396 uint64_t vaddr = (uint64_t)&uncompressed_file0_space_bitmap[chunkidx];
6397 uint64_t maxvaddr = (uint64_t)&uncompressed_file0_space_bitmap[MAX_OFFSET_PAGES];
6398 assertf(vaddr < maxvaddr, "0x%llx 0x%llx", vaddr, maxvaddr);
6399 #endif /*DEVELOPMENT || DEBUG*/
6400 uncompressed_file0_space_bitmap[chunkidx] |= ((uint64_t)1 << chunkoffset);
6401 uncompressed_file0_free_offset = ((chunkidx * 64) + chunkoffset) * PAGE_SIZE_64;
6402 assertf((uncompressed_file0_space_bitmap[chunkidx] & ((uint64_t)1 << chunkoffset)),
6403 "%llu %llu", chunkidx, chunkoffset);
6404
6405 assert(uncompressed_file0_free_offset <= (1 << UNCOMPRESSED_OFFSET_SHIFT));
6406 slot = (int)((1 << UNCOMPRESSED_FILEIDX_SHIFT) + uncompressed_file0_free_offset);
6407 uncompressed_file0_free_pages--;
6408 } else {
6409 uint64_t chunkidx = 0;
6410 uint64_t chunkoffset = 0;
6411 while (uncompressed_file1_space_bitmap[chunkidx] == 0xFFFFFFFFFFFFFFFF) {
6412 chunkidx++;
6413 }
6414 while (uncompressed_file1_space_bitmap[chunkidx] & ((uint64_t)1 << chunkoffset)) {
6415 chunkoffset++;
6416 }
6417 assert((uncompressed_file1_space_bitmap[chunkidx] & ((uint64_t)1 << chunkoffset)) == 0);
6418 uncompressed_file1_space_bitmap[chunkidx] |= ((uint64_t)1 << chunkoffset);
6419 uncompressed_file1_free_offset = ((chunkidx * 64) + chunkoffset) * PAGE_SIZE_64;
6420 slot = (int)((2 << UNCOMPRESSED_FILEIDX_SHIFT) + uncompressed_file1_free_offset);
6421 uncompressed_file1_free_pages--;
6422 }
6423 compressor_ro_uncompressed_swap_usage++;
6424 PAGE_REPLACEMENT_ALLOWED(FALSE);
6425 return slot;
6426 }
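/*
 * Bitmap bookkeeping sketch for the two routines above: each swapfile is
 * tracked one bit per page in an array of 64-bit words. A byte offset maps to
 * a bit as follows (numbers assume a hypothetical 16K PAGE_SIZE):
 *
 *   uint64_t pgidx       = offset / PAGE_SIZE_64;         // e.g. 0x41C000 -> page 263
 *   uint64_t chunkidx    = pgidx / 64;                     // word 4 of the bitmap
 *   uint64_t chunkoffset = pgidx % 64;                     // bit 7 within that word
 *   bitmap[chunkidx] |=  ((uint64_t)1 << chunkoffset);     // reserve the page
 *   bitmap[chunkidx] &= ~((uint64_t)1 << chunkoffset);     // return it
 */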
6427
6428 #define MAX_IO_REQ (16)
6429 struct _uncompressor_io_req {
6430 uint64_t addr;
6431 bool inuse;
6432 } uncompressor_io_req[MAX_IO_REQ];
6433
6434 int
6435 vm_uncompressed_put(ppnum_t pn, int *slot)
6436 {
6437 int retval = 0;
6438 struct vnode *uncompressed_vp = NULL;
6439 uint64_t uncompress_offset = 0;
6440
6441 again:
6442 if (uncompressed_vp0 == NULL) {
6443 PAGE_REPLACEMENT_ALLOWED(TRUE);
6444 if (uncompressed_vp0 == NULL) {
6445 uint64_t size = (MAX_OFFSET_PAGES * 1024 * 1024ULL);
6446 vm_swapfile_open("/private/var/vm/uncompressedswap0", &uncompressed_vp0);
6447 if (uncompressed_vp0 == NULL) {
6448 PAGE_REPLACEMENT_ALLOWED(FALSE);
6449 return KERN_NO_ACCESS;
6450 }
6451 vm_swapfile_preallocate(uncompressed_vp0, &size, NULL);
6452 uncompressed_file0_free_pages = (uint32_t)atop(size);
6453 bzero(uncompressed_file0_space_bitmap, sizeof(uint64_t) * MAX_OFFSET_PAGES);
6454
6455 int i = 0;
6456 for (; i < MAX_IO_REQ; i++) {
6457 kmem_alloc(kernel_map, (vm_offset_t*)&uncompressor_io_req[i].addr, PAGE_SIZE_64, KMA_NOFAIL | KMA_KOBJECT, VM_KERN_MEMORY_COMPRESSOR);
6458 uncompressor_io_req[i].inuse = false;
6459 }
6460
6461 vm_swapfile_open("/private/var/vm/uncompressedswap1", &uncompressed_vp1);
6462 assert(uncompressed_vp1);
6463 vm_swapfile_preallocate(uncompressed_vp1, &size, NULL);
6464 uncompressed_file1_free_pages = (uint32_t)atop(size);
6465 bzero(uncompressed_file1_space_bitmap, sizeof(uint64_t) * MAX_OFFSET_PAGES);
6466 PAGE_REPLACEMENT_ALLOWED(FALSE);
6467 } else {
6468 PAGE_REPLACEMENT_ALLOWED(FALSE);
6469 delay(100);
6470 goto again;
6471 }
6472 }
6473
6474 int swapinfo = vm_uncompressed_reserve_space_in_swap();
6475 if (swapinfo == -1) {
6476 *slot = 0;
6477 return KERN_RESOURCE_SHORTAGE;
6478 }
6479
6480 if (vm_uncompressed_extract_swap_file(swapinfo) == 1) {
6481 uncompressed_vp = uncompressed_vp0;
6482 } else {
6483 uncompressed_vp = uncompressed_vp1;
6484 }
6485 uncompress_offset = vm_uncompressed_extract_swap_offset(swapinfo);
6486 if ((retval = vnode_getwithref(uncompressed_vp)) != 0) {
6487 vm_log_error("vm_uncompressed_put: vnode_getwithref on swapfile failed with %d\n", retval);
6488 } else {
6489 int i = 0;
6490 retry:
6491 PAGE_REPLACEMENT_ALLOWED(TRUE);
6492 for (i = 0; i < MAX_IO_REQ; i++) {
6493 if (uncompressor_io_req[i].inuse == false) {
6494 uncompressor_io_req[i].inuse = true;
6495 break;
6496 }
6497 }
6498 if (i == MAX_IO_REQ) {
6499 assert_wait((event_t)&uncompressor_io_req, THREAD_UNINT);
6500 PAGE_REPLACEMENT_ALLOWED(FALSE);
6501 thread_block(THREAD_CONTINUE_NULL);
6502 goto retry;
6503 }
6504 PAGE_REPLACEMENT_ALLOWED(FALSE);
6505 void *addr = pmap_map_compressor_page(pn);
6506 memcpy((void*)uncompressor_io_req[i].addr, addr, PAGE_SIZE_64);
6507 pmap_unmap_compressor_page(pn, addr);
6508
6509 retval = vm_swapfile_io(uncompressed_vp, uncompress_offset, (uint64_t)uncompressor_io_req[i].addr, 1, SWAP_WRITE, NULL);
6510 if (retval) {
6511 *slot = 0;
6512 } else {
6513 *slot = (int)swapinfo;
6514 ((c_slot_mapping_t)(slot))->s_uncompressed = 1;
6515 }
6516 vnode_put(uncompressed_vp);
6517 PAGE_REPLACEMENT_ALLOWED(TRUE);
6518 uncompressor_io_req[i].inuse = false;
6519 thread_wakeup((event_t)&uncompressor_io_req);
6520 PAGE_REPLACEMENT_ALLOWED(FALSE);
6521 }
6522 return retval;
6523 }
6524
6525 int
6526 vm_uncompressed_get(ppnum_t pn, int *slot, __unused vm_compressor_options_t flags)
6527 {
6528 int retval = 0;
6529 struct vnode *uncompressed_vp = NULL;
6530 uint32_t fileidx = vm_uncompressed_extract_swap_file(*slot);
6531 uint64_t uncompress_offset = vm_uncompressed_extract_swap_offset(*slot);
6532
6533 if (__improbable(flags & C_KDP)) {
6534 return -2;
6535 }
6536
6537 if (fileidx == 1) {
6538 uncompressed_vp = uncompressed_vp0;
6539 } else {
6540 uncompressed_vp = uncompressed_vp1;
6541 }
6542
6543 if ((retval = vnode_getwithref(uncompressed_vp)) != 0) {
6544 vm_log_error("vm_uncompressed_get: vnode_getwithref on swapfile failed with %d\n", retval);
6545 } else {
6546 int i = 0;
6547 retry:
6548 PAGE_REPLACEMENT_ALLOWED(TRUE);
6549 for (i = 0; i < MAX_IO_REQ; i++) {
6550 if (uncompressor_io_req[i].inuse == false) {
6551 uncompressor_io_req[i].inuse = true;
6552 break;
6553 }
6554 }
6555 if (i == MAX_IO_REQ) {
6556 assert_wait((event_t)&uncompressor_io_req, THREAD_UNINT);
6557 PAGE_REPLACEMENT_ALLOWED(FALSE);
6558 thread_block(THREAD_CONTINUE_NULL);
6559 goto retry;
6560 }
6561 PAGE_REPLACEMENT_ALLOWED(FALSE);
6562 retval = vm_swapfile_io(uncompressed_vp, uncompress_offset, (uint64_t)uncompressor_io_req[i].addr, 1, SWAP_READ, NULL);
6563 vnode_put(uncompressed_vp);
6564 void *addr = pmap_map_compressor_page(pn);
6565 memcpy(addr, (void*)uncompressor_io_req[i].addr, PAGE_SIZE_64);
6566 pmap_unmap_compressor_page(pn, addr);
6567 PAGE_REPLACEMENT_ALLOWED(TRUE);
6568 uncompressor_io_req[i].inuse = false;
6569 thread_wakeup((event_t)&uncompressor_io_req);
6570 PAGE_REPLACEMENT_ALLOWED(FALSE);
6571 }
6572 return retval;
6573 }
6574
6575 int
6576 vm_uncompressed_free(int *slot, __unused vm_compressor_options_t flags)
6577 {
6578 vm_uncompressed_return_space_to_swap(*slot);
6579 *slot = 0;
6580 return 0;
6581 }
6582
6583 #endif /*CONFIG_TRACK_UNMODIFIED_ANON_PAGES*/
6584