1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <vm/vm_compressor.h>
30
31 #if CONFIG_PHANTOM_CACHE
32 #include <vm/vm_phantom_cache.h>
33 #endif
34
35 #include <vm/vm_map.h>
36 #include <vm/vm_pageout.h>
37 #include <vm/memory_object.h>
38 #include <vm/vm_compressor_algorithms.h>
39 #include <vm/vm_fault.h>
40 #include <vm/vm_protos.h>
41 #include <mach/mach_host.h> /* for host_info() */
42 #if DEVELOPMENT || DEBUG
43 #include <kern/hvg_hypercall.h>
44 #endif
45 #include <kern/ledger.h>
46 #include <kern/policy_internal.h>
47 #include <kern/thread_group.h>
48 #include <san/kasan.h>
49
50 #if defined(__x86_64__)
51 #include <i386/misc_protos.h>
52 #endif
53 #if defined(__arm64__)
54 #include <arm/machine_routines.h>
55 #endif
56
57 #include <IOKit/IOHibernatePrivate.h>
58
59 /*
60 * The segment buffer size is a tradeoff.
61 * A larger buffer leads to faster I/O throughput, better compression ratios
62 * (since fewer bytes are wasted at the end of the segment),
63 * and less overhead (both in time and space).
64 * However, a smaller buffer causes less swap when the system is overcommitted,
65 * because a higher fraction of each swapped-in segment is actually accessed
66 * before it goes back out to storage.
67 *
68 * So on systems without swap, a larger segment is a clear win.
69 * On systems with swap, the choice is murkier. Empirically, we've
70 * found that a 64KB segment provides a better tradeoff both in terms of
71 * performance and swap writes than a 256KB segment on systems with fast SSDs
72 * and a HW compression block.
73 */
74 #if XNU_TARGET_OS_OSX && defined(__arm64__)
75 #define C_SEG_BUFSIZE (1024 * 64)
76 #else
77 #define C_SEG_BUFSIZE (1024 * 256)
78 #endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */
79
80 TUNABLE(uint32_t, c_seg_bufsize, "vm_compressor_segment_buffer_size", C_SEG_BUFSIZE);
81
82 uint32_t c_seg_max_pages, c_seg_off_limit, c_seg_allocsize, c_seg_slot_var_array_min_len;
83
84 extern boolean_t vm_darkwake_mode;
85 extern zone_t vm_page_zone;
86
87 #if DEVELOPMENT || DEBUG
88 /* sysctl defined in bsd/dev/arm64/sysctl.c */
89 int do_cseg_wedge_thread(void);
90 int do_cseg_unwedge_thread(void);
91 static event_t debug_cseg_wait_event = NULL;
92 #endif /* DEVELOPMENT || DEBUG */
93
94 #if CONFIG_FREEZE
95 bool freezer_incore_cseg_acct = TRUE; /* Only count incore compressed memory for jetsams. */
96 void task_disown_frozen_csegs(task_t owner_task);
97 #endif /* CONFIG_FREEZE */
98
99 #if POPCOUNT_THE_COMPRESSED_DATA
100 boolean_t popcount_c_segs = TRUE;
101
102 static inline uint32_t
103 vmc_pop(uintptr_t ins, int sz)
104 {
105 uint32_t rv = 0;
106
107 if (__probable(popcount_c_segs == FALSE)) {
108 return 0xDEAD707C;
109 }
110
111 while (sz >= 16) {
112 uint32_t rv1, rv2;
113 uint64_t *ins64 = (uint64_t *) ins;
114 uint64_t *ins642 = (uint64_t *) (ins + 8);
115 rv1 = __builtin_popcountll(*ins64);
116 rv2 = __builtin_popcountll(*ins642);
117 rv += rv1 + rv2;
118 sz -= 16;
119 ins += 16;
120 }
121
122 while (sz >= 4) {
123 uint32_t *ins32 = (uint32_t *) ins;
124 rv += __builtin_popcount(*ins32);
125 sz -= 4;
126 ins += 4;
127 }
128
129 while (sz > 0) {
130 char *ins8 = (char *)ins;
131 rv += __builtin_popcount(*ins8);
132 sz--;
133 ins++;
134 }
135 return rv;
136 }
137 #endif
138
139 #if VALIDATE_C_SEGMENTS
140 boolean_t validate_c_segs = TRUE;
141 #endif
142 /*
143 * vm_compressor_mode has a hierarchy of control to set its value.
144 * boot-args are checked first, then device-tree, and finally
145 * the default value that is defined below. See vm_fault_init() for
146 * the boot-arg & device-tree code.
147 */
148
149 #if !XNU_TARGET_OS_OSX
150
151 #if CONFIG_FREEZE
152 int vm_compressor_mode = VM_PAGER_FREEZER_DEFAULT;
153 struct freezer_context freezer_context_global;
154 #else /* CONFIG_FREEZE */
155 int vm_compressor_mode = VM_PAGER_NOT_CONFIGURED;
156 #endif /* CONFIG_FREEZE */
157
158 #else /* !XNU_TARGET_OS_OSX */
159 int vm_compressor_mode = VM_PAGER_COMPRESSOR_WITH_SWAP;
160
161 #endif /* !XNU_TARGET_OS_OSX */
162
163 TUNABLE(uint32_t, vm_compression_limit, "vm_compression_limit", 0);
164 int vm_compressor_is_active = 0;
165 int vm_compressor_available = 0;
166
167 extern uint64_t vm_swap_get_max_configured_space(void);
168 extern void vm_pageout_io_throttle(void);
169
170 #if CHECKSUM_THE_DATA || CHECKSUM_THE_SWAP || CHECKSUM_THE_COMPRESSED_DATA
171 extern unsigned int hash_string(char *cp, int len);
172 static unsigned int vmc_hash(char *, int);
173 boolean_t checksum_c_segs = TRUE;
174
175 unsigned int
176 vmc_hash(char *cp, int len)
177 {
178 if (__probable(checksum_c_segs == FALSE)) {
179 return 0xDEAD7A37;
180 }
181 return hash_string(cp, len);
182 }
183 #endif
184
185 #define UNPACK_C_SIZE(cs) ((cs->c_size == (PAGE_SIZE-1)) ? PAGE_SIZE : cs->c_size)
186 #define PACK_C_SIZE(cs, size) (cs->c_size = ((size == PAGE_SIZE) ? PAGE_SIZE - 1 : size))
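/*
 * Note on the encoding above: c_size presumably cannot represent a full
 * PAGE_SIZE in its bit-field, so an incompressible (full page) payload is
 * stored as PAGE_SIZE - 1 and decoded back to PAGE_SIZE. A hypothetical
 * round trip with 4K pages:
 *
 *     PACK_C_SIZE(cs, 4096);              // cs->c_size ends up as 4095
 *     assert(UNPACK_C_SIZE(cs) == 4096);  // decoded back to a full page
 */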
187
188
189 struct c_sv_hash_entry {
190 union {
191 struct {
192 uint32_t c_sv_he_ref;
193 uint32_t c_sv_he_data;
194 } c_sv_he;
195 uint64_t c_sv_he_record;
196 } c_sv_he_un;
197 };
198
199 #define he_ref c_sv_he_un.c_sv_he.c_sv_he_ref
200 #define he_data c_sv_he_un.c_sv_he.c_sv_he_data
201 #define he_record c_sv_he_un.c_sv_he_record
202
203 #define C_SV_HASH_MAX_MISS 32
204 #define C_SV_HASH_SIZE ((1 << 10))
205 #define C_SV_HASH_MASK ((1 << 10) - 1)
206 #define C_SV_CSEG_ID ((1 << 22) - 1)
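/*
 * A sketch of the single-value (SV) path these constants describe: a page whose
 * contents reduce to one repeated 32-bit value is not stored in a segment at all.
 * The value is reference-counted in c_segment_sv_hash_table (C_SV_HASH_SIZE
 * buckets, indexed via C_SV_HASH_MASK, probing at most C_SV_HASH_MAX_MISS
 * entries), and the slot's segment number is set to the sentinel C_SV_CSEG_ID
 * so decompression knows to look in the hash instead of a c_segment.
 */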
207
208
209 union c_segu {
210 c_segment_t c_seg;
211 uintptr_t c_segno;
212 };
213
214 #define C_SLOT_ASSERT_PACKABLE(ptr) \
215 VM_ASSERT_POINTER_PACKABLE((vm_offset_t)(ptr), C_SLOT_PACKED_PTR);
216
217 #define C_SLOT_PACK_PTR(ptr) \
218 VM_PACK_POINTER((vm_offset_t)(ptr), C_SLOT_PACKED_PTR)
219
220 #define C_SLOT_UNPACK_PTR(cslot) \
221 (c_slot_mapping_t)VM_UNPACK_POINTER((cslot)->c_packed_ptr, C_SLOT_PACKED_PTR)
222
223 /* for debugging purposes */
224 SECURITY_READ_ONLY_EARLY(vm_packing_params_t) c_slot_packing_params =
225 VM_PACKING_PARAMS(C_SLOT_PACKED_PTR);
226
227 uint32_t c_segment_count = 0;
228 uint32_t c_segment_count_max = 0;
229
230 uint64_t c_generation_id = 0;
231 uint64_t c_generation_id_flush_barrier;
232
233
234 #define HIBERNATE_FLUSHING_SECS_TO_COMPLETE 120
235
236 boolean_t hibernate_no_swapspace = FALSE;
237 boolean_t hibernate_flush_timed_out = FALSE;
238 clock_sec_t hibernate_flushing_deadline = 0;
239
240
241 #if RECORD_THE_COMPRESSED_DATA
242 char *c_compressed_record_sbuf;
243 char *c_compressed_record_ebuf;
244 char *c_compressed_record_cptr;
245 #endif
246
247
248 queue_head_t c_age_list_head;
249 queue_head_t c_swappedin_list_head;
250 queue_head_t c_swapout_list_head;
251 queue_head_t c_swapio_list_head;
252 queue_head_t c_swappedout_list_head;
253 queue_head_t c_swappedout_sparse_list_head;
254 queue_head_t c_major_list_head;
255 queue_head_t c_filling_list_head;
256 queue_head_t c_bad_list_head;
257
258 uint32_t c_age_count = 0;
259 uint32_t c_swappedin_count = 0;
260 uint32_t c_swapout_count = 0;
261 uint32_t c_swapio_count = 0;
262 uint32_t c_swappedout_count = 0;
263 uint32_t c_swappedout_sparse_count = 0;
264 uint32_t c_major_count = 0;
265 uint32_t c_filling_count = 0;
266 uint32_t c_empty_count = 0;
267 uint32_t c_bad_count = 0;
268
269
270 queue_head_t c_minor_list_head;
271 uint32_t c_minor_count = 0;
272
273 int c_overage_swapped_count = 0;
274 int c_overage_swapped_limit = 0;
275
276 int c_seg_fixed_array_len;
277 union c_segu *c_segments;
278 vm_offset_t c_buffers;
279 vm_size_t c_buffers_size;
280 caddr_t c_segments_next_page;
281 boolean_t c_segments_busy;
282 uint32_t c_segments_available;
283 uint32_t c_segments_limit;
284 uint32_t c_segments_nearing_limit;
285
286 uint32_t c_segment_svp_in_hash;
287 uint32_t c_segment_svp_hash_succeeded;
288 uint32_t c_segment_svp_hash_failed;
289 uint32_t c_segment_svp_zero_compressions;
290 uint32_t c_segment_svp_nonzero_compressions;
291 uint32_t c_segment_svp_zero_decompressions;
292 uint32_t c_segment_svp_nonzero_decompressions;
293
294 uint32_t c_segment_noncompressible_pages;
295
296 uint32_t c_segment_pages_compressed = 0; /* Tracks # of uncompressed pages fed into the compressor */
297 #if CONFIG_FREEZE
298 int32_t c_segment_pages_compressed_incore = 0; /* Tracks # of uncompressed pages fed into the compressor that are in memory */
299 uint32_t c_segments_incore_limit = 0; /* Tracks # of segments allowed to be in-core. Based on compressor pool size */
300 #endif /* CONFIG_FREEZE */
301
302 uint32_t c_segment_pages_compressed_limit;
303 uint32_t c_segment_pages_compressed_nearing_limit;
304 uint32_t c_free_segno_head = (uint32_t)-1;
305
306 uint32_t vm_compressor_minorcompact_threshold_divisor = 10;
307 uint32_t vm_compressor_majorcompact_threshold_divisor = 10;
308 uint32_t vm_compressor_unthrottle_threshold_divisor = 10;
309 uint32_t vm_compressor_catchup_threshold_divisor = 10;
310
311 uint32_t vm_compressor_minorcompact_threshold_divisor_overridden = 0;
312 uint32_t vm_compressor_majorcompact_threshold_divisor_overridden = 0;
313 uint32_t vm_compressor_unthrottle_threshold_divisor_overridden = 0;
314 uint32_t vm_compressor_catchup_threshold_divisor_overridden = 0;
315
316 #define C_SEGMENTS_PER_PAGE (PAGE_SIZE / sizeof(union c_segu))
317
318 LCK_GRP_DECLARE(vm_compressor_lck_grp, "vm_compressor");
319 LCK_RW_DECLARE(c_master_lock, &vm_compressor_lck_grp);
320 LCK_MTX_DECLARE(c_list_lock_storage, &vm_compressor_lck_grp);
321
322 boolean_t decompressions_blocked = FALSE;
323
324 zone_t compressor_segment_zone;
325 int c_compressor_swap_trigger = 0;
326
327 uint32_t compressor_cpus;
328 char *compressor_scratch_bufs;
329 char *kdp_compressor_scratch_buf;
330 char *kdp_compressor_decompressed_page;
331 addr64_t kdp_compressor_decompressed_page_paddr;
332 ppnum_t kdp_compressor_decompressed_page_ppnum;
333
334 clock_sec_t start_of_sample_period_sec = 0;
335 clock_nsec_t start_of_sample_period_nsec = 0;
336 clock_sec_t start_of_eval_period_sec = 0;
337 clock_nsec_t start_of_eval_period_nsec = 0;
338 uint32_t sample_period_decompression_count = 0;
339 uint32_t sample_period_compression_count = 0;
340 uint32_t last_eval_decompression_count = 0;
341 uint32_t last_eval_compression_count = 0;
342
343 #define DECOMPRESSION_SAMPLE_MAX_AGE (60 * 30)
344
345 boolean_t vm_swapout_ripe_segments = FALSE;
346 uint32_t vm_ripe_target_age = (60 * 60 * 48);
347
348 uint32_t swapout_target_age = 0;
349 uint32_t age_of_decompressions_during_sample_period[DECOMPRESSION_SAMPLE_MAX_AGE];
350 uint32_t overage_decompressions_during_sample_period = 0;
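/*
 * age_of_decompressions_during_sample_period[] appears to act as a histogram,
 * indexed by the age (in seconds, capped at DECOMPRESSION_SAMPLE_MAX_AGE) of the
 * data being decompressed during the current sample period; decompressions older
 * than that are tallied in overage_decompressions_during_sample_period.
 * compute_swapout_target_age() derives swapout_target_age from these counters.
 */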
351
352
353 void do_fastwake_warmup(queue_head_t *, boolean_t);
354 boolean_t fastwake_warmup = FALSE;
355 boolean_t fastwake_recording_in_progress = FALSE;
356 clock_sec_t dont_trim_until_ts = 0;
357
358 uint64_t c_segment_warmup_count;
359 uint64_t first_c_segment_to_warm_generation_id = 0;
360 uint64_t last_c_segment_to_warm_generation_id = 0;
361 boolean_t hibernate_flushing = FALSE;
362
363 int64_t c_segment_input_bytes __attribute__((aligned(8))) = 0;
364 int64_t c_segment_compressed_bytes __attribute__((aligned(8))) = 0;
365 int64_t compressor_bytes_used __attribute__((aligned(8))) = 0;
366
367
368 struct c_sv_hash_entry c_segment_sv_hash_table[C_SV_HASH_SIZE] __attribute__ ((aligned(8)));
369
370 static boolean_t compressor_needs_to_swap(void);
371 static void vm_compressor_swap_trigger_thread(void);
372 static void vm_compressor_do_delayed_compactions(boolean_t);
373 static void vm_compressor_compact_and_swap(boolean_t);
374 static void vm_compressor_age_swapped_in_segments(boolean_t);
375
376 struct vm_compressor_swapper_stats vmcs_stats;
377
378 #if XNU_TARGET_OS_OSX
379 #if (__arm64__)
380 static void vm_compressor_process_major_segments(void);
381 #endif /* (__arm64__) */
382 static void vm_compressor_take_paging_space_action(void);
383 #endif /* XNU_TARGET_OS_OSX */
384
385 void compute_swapout_target_age(void);
386
387 boolean_t c_seg_major_compact(c_segment_t, c_segment_t);
388 boolean_t c_seg_major_compact_ok(c_segment_t, c_segment_t);
389
390 int c_seg_minor_compaction_and_unlock(c_segment_t, boolean_t);
391 int c_seg_do_minor_compaction_and_unlock(c_segment_t, boolean_t, boolean_t, boolean_t);
392 void c_seg_try_minor_compaction_and_unlock(c_segment_t c_seg);
393
394 void c_seg_move_to_sparse_list(c_segment_t);
395 void c_seg_insert_into_q(queue_head_t *, c_segment_t);
396
397 uint64_t vm_available_memory(void);
398 uint64_t vm_compressor_pages_compressed(void);
399 uint32_t vm_compressor_pool_size(void);
400
401 /*
402 * indicate the need to do a major compaction if
403 * the overall set of in-use compression segments
404 * becomes sparse... on systems that support pressure
405 * driven swapping, this will also cause swapouts to
406 * be initiated.
407 */
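/*
 * Reading the check below: incore_seg_count * c_seg_max_pages is the page
 * capacity of the in-core segments and VM_PAGE_COMPRESSOR_COUNT is the number
 * of pages the compressor actually holds, so the difference is unused slack.
 * A major compaction is indicated once that slack exceeds 1/8th of the in-core
 * capacity, provided at least 1/8th of c_segments_nearing_limit segments exist.
 */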
408 static inline boolean_t
409 vm_compressor_needs_to_major_compact()
410 {
411 uint32_t incore_seg_count;
412
413 incore_seg_count = c_segment_count - c_swappedout_count - c_swappedout_sparse_count;
414
415 if ((c_segment_count >= (c_segments_nearing_limit / 8)) &&
416 ((incore_seg_count * c_seg_max_pages) - VM_PAGE_COMPRESSOR_COUNT) >
417 ((incore_seg_count / 8) * c_seg_max_pages)) {
418 return 1;
419 }
420 return 0;
421 }
422
423
424 uint64_t
425 vm_available_memory(void)
426 {
427 return ((uint64_t)AVAILABLE_NON_COMPRESSED_MEMORY) * PAGE_SIZE_64;
428 }
429
430
431 uint32_t
432 vm_compressor_pool_size(void)
433 {
434 return VM_PAGE_COMPRESSOR_COUNT;
435 }
436
437 uint64_t
438 vm_compressor_pages_compressed(void)
439 {
440 return c_segment_pages_compressed * PAGE_SIZE_64;
441 }
442
443
444 boolean_t
445 vm_compressor_low_on_space(void)
446 {
447 #if CONFIG_FREEZE
448 uint64_t incore_seg_count;
449 uint32_t incore_compressed_pages;
450 if (freezer_incore_cseg_acct) {
451 incore_seg_count = c_segment_count - c_swappedout_count - c_swappedout_sparse_count;
452 incore_compressed_pages = c_segment_pages_compressed_incore;
453 } else {
454 incore_seg_count = c_segment_count;
455 incore_compressed_pages = c_segment_pages_compressed;
456 }
457
458 if ((incore_compressed_pages > c_segment_pages_compressed_nearing_limit) ||
459 (incore_seg_count > c_segments_nearing_limit)) {
460 return TRUE;
461 }
462 #else /* CONFIG_FREEZE */
463 if ((c_segment_pages_compressed > c_segment_pages_compressed_nearing_limit) ||
464 (c_segment_count > c_segments_nearing_limit)) {
465 return TRUE;
466 }
467 #endif /* CONFIG_FREEZE */
468 return FALSE;
469 }
470
471
472 boolean_t
473 vm_compressor_out_of_space(void)
474 {
475 #if CONFIG_FREEZE
476 uint64_t incore_seg_count;
477 uint32_t incore_compressed_pages;
478 if (freezer_incore_cseg_acct) {
479 incore_seg_count = c_segment_count - c_swappedout_count - c_swappedout_sparse_count;
480 incore_compressed_pages = c_segment_pages_compressed_incore;
481 } else {
482 incore_seg_count = c_segment_count;
483 incore_compressed_pages = c_segment_pages_compressed;
484 }
485
486 if ((incore_compressed_pages >= c_segment_pages_compressed_limit) ||
487 (incore_seg_count > c_segments_incore_limit)) {
488 return TRUE;
489 }
490 #else /* CONFIG_FREEZE */
491 if ((c_segment_pages_compressed >= c_segment_pages_compressed_limit) ||
492 (c_segment_count >= c_segments_limit)) {
493 return TRUE;
494 }
495 #endif /* CONFIG_FREEZE */
496 return FALSE;
497 }
498
499
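/*
 * Summary of the policy below: with swap active and the compressor low on
 * space (or the hard throttle limit reached), a task is flagged for throttling
 * once its compressed footprint, in pages, exceeds a quarter of all pages
 * currently held by the compressor; the kernel_task is never throttled.
 */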
500 int
501 vm_wants_task_throttled(task_t task)
502 {
503 ledger_amount_t compressed;
504 if (task == kernel_task) {
505 return 0;
506 }
507
508 if (VM_CONFIG_SWAP_IS_ACTIVE) {
509 if ((vm_compressor_low_on_space() || HARD_THROTTLE_LIMIT_REACHED())) {
510 ledger_get_balance(task->ledger, task_ledgers.internal_compressed, &compressed);
511 compressed >>= VM_MAP_PAGE_SHIFT(task->map);
512 if ((unsigned int)compressed > (c_segment_pages_compressed / 4)) {
513 return 1;
514 }
515 }
516 }
517 return 0;
518 }
519
520
521 #if DEVELOPMENT || DEBUG
522 /*
523 * On compressor/swap exhaustion, kill the largest process regardless of
524 * its chosen process policy.
525 */
526 TUNABLE(bool, kill_on_no_paging_space, "-kill_on_no_paging_space", false);
527 #endif /* DEVELOPMENT || DEBUG */
528
529 #if XNU_TARGET_OS_OSX
530
531 static uint32_t no_paging_space_action_in_progress = 0;
532 extern void memorystatus_send_low_swap_note(void);
533
534 static void
535 vm_compressor_take_paging_space_action(void)
536 {
537 if (no_paging_space_action_in_progress == 0) {
538 if (OSCompareAndSwap(0, 1, (UInt32 *)&no_paging_space_action_in_progress)) {
539 if (no_paging_space_action()) {
540 #if DEVELOPMENT || DEBUG
541 if (kill_on_no_paging_space) {
542 /*
543 * Since we are choosing to always kill a process, we don't need the
544 * "out of application memory" dialog box in this mode. And, hence we won't
545 * send the knote.
546 */
547 no_paging_space_action_in_progress = 0;
548 return;
549 }
550 #endif /* DEVELOPMENT || DEBUG */
551 memorystatus_send_low_swap_note();
552 }
553
554 no_paging_space_action_in_progress = 0;
555 }
556 }
557 }
558 #endif /* XNU_TARGET_OS_OSX */
559
560
561 void
562 vm_decompressor_lock(void)
563 {
564 PAGE_REPLACEMENT_ALLOWED(TRUE);
565
566 decompressions_blocked = TRUE;
567
568 PAGE_REPLACEMENT_ALLOWED(FALSE);
569 }
570
571 void
572 vm_decompressor_unlock(void)
573 {
574 PAGE_REPLACEMENT_ALLOWED(TRUE);
575
576 decompressions_blocked = FALSE;
577
578 PAGE_REPLACEMENT_ALLOWED(FALSE);
579
580 thread_wakeup((event_t)&decompressions_blocked);
581 }
582
583 static inline void
584 cslot_copy(c_slot_t cdst, c_slot_t csrc)
585 {
586 #if CHECKSUM_THE_DATA
587 cdst->c_hash_data = csrc->c_hash_data;
588 #endif
589 #if CHECKSUM_THE_COMPRESSED_DATA
590 cdst->c_hash_compressed_data = csrc->c_hash_compressed_data;
591 #endif
592 #if POPCOUNT_THE_COMPRESSED_DATA
593 cdst->c_pop_cdata = csrc->c_pop_cdata;
594 #endif
595 cdst->c_size = csrc->c_size;
596 cdst->c_packed_ptr = csrc->c_packed_ptr;
597 #if defined(__arm__) || defined(__arm64__)
598 cdst->c_codec = csrc->c_codec;
599 #endif
600 #if __APPLE_WKDM_POPCNT_EXTENSIONS__
601 cdst->c_inline_popcount = csrc->c_inline_popcount;
602 #endif
603 }
604
605 #if XNU_TARGET_OS_OSX
606 #define VM_COMPRESSOR_MAX_POOL_SIZE (192UL << 30)
607 #else
608 #define VM_COMPRESSOR_MAX_POOL_SIZE (0)
609 #endif
610
611 static vm_map_size_t compressor_size;
612 static SECURITY_READ_ONLY_LATE(struct kmem_range) compressor_range;
613 vm_map_t compressor_map;
614 uint64_t compressor_pool_max_size;
615 uint64_t compressor_pool_size;
616 uint32_t compressor_pool_multiplier;
617
618 #if DEVELOPMENT || DEBUG
619 /*
620 * Compressor segments are write-protected in development/debug
621 * kernels to help debug memory corruption.
622 * In cases where performance is a concern, this can be disabled
623 * via the boot-arg "-disable_cseg_write_protection".
624 */
625 boolean_t write_protect_c_segs = TRUE;
626 int vm_compressor_test_seg_wp;
627 uint32_t vm_ktrace_enabled;
628 #endif /* DEVELOPMENT || DEBUG */
629
630 #if (XNU_TARGET_OS_OSX && __arm64__)
631
632 #include <IOKit/IOPlatformExpert.h>
633 #include <sys/random.h>
634
635 static const char *csegbufsizeExperimentProperty = "_csegbufsz_experiment";
636 static thread_call_t csegbufsz_experiment_thread_call;
637
638 extern boolean_t IOServiceWaitForMatchingResource(const char * property, uint64_t timeout);
639 static void
640 erase_csegbufsz_experiment_property(__unused void *param0, __unused void *param1)
641 {
642 // Wait for NVRAM to be writable
643 if (!IOServiceWaitForMatchingResource("IONVRAM", UINT64_MAX)) {
644 printf("csegbufsz_experiment_property: Failed to wait for IONVRAM.");
645 }
646
647 if (!PERemoveNVRAMProperty(csegbufsizeExperimentProperty)) {
648 printf("csegbufsize_experiment_property: Failed to remove %s from NVRAM.", csegbufsizeExperimentProperty);
649 }
650 thread_call_free(csegbufsz_experiment_thread_call);
651 }
652
653 static void
654 erase_csegbufsz_experiment_property_async()
655 {
656 csegbufsz_experiment_thread_call = thread_call_allocate_with_priority(
657 erase_csegbufsz_experiment_property,
658 NULL,
659 THREAD_CALL_PRIORITY_LOW
660 );
661 if (csegbufsz_experiment_thread_call == NULL) {
662 printf("csegbufsize_experiment_property: Unable to allocate thread call.");
663 } else {
664 thread_call_enter(csegbufsz_experiment_thread_call);
665 }
666 }
667
668 static void
669 cleanup_csegbufsz_experiment(__unused void *arg0)
670 {
671 char nvram = 0;
672 unsigned int len = sizeof(nvram);
673 if (PEReadNVRAMProperty(csegbufsizeExperimentProperty, &nvram, &len)) {
674 erase_csegbufsz_experiment_property_async();
675 }
676 }
677
678 STARTUP_ARG(EARLY_BOOT, STARTUP_RANK_FIRST, cleanup_csegbufsz_experiment, NULL);
679 #endif /* XNU_TARGET_OS_OSX && __arm64__ */
680
681 __startup_func
682 static void
683 vm_compressor_set_size(void)
684 {
685 __assert_only struct c_slot_mapping tmp_slot_ptr;
686 vm_size_t c_segments_arr_size = 0;
687
688 if (vm_compression_limit) {
689 compressor_pool_size = ptoa_64(vm_compression_limit);
690 }
691
692 compressor_pool_max_size = C_SEG_MAX_LIMIT;
693 compressor_pool_max_size *= c_seg_bufsize;
694
695 #if XNU_TARGET_OS_OSX
696
697 if (vm_compression_limit == 0) {
698 if (max_mem <= (4ULL * 1024ULL * 1024ULL * 1024ULL)) {
699 compressor_pool_size = 16ULL * max_mem;
700 } else if (max_mem <= (8ULL * 1024ULL * 1024ULL * 1024ULL)) {
701 compressor_pool_size = 8ULL * max_mem;
702 } else if (max_mem <= (32ULL * 1024ULL * 1024ULL * 1024ULL)) {
703 compressor_pool_size = 4ULL * max_mem;
704 } else {
705 compressor_pool_size = 2ULL * max_mem;
706 }
707 }
708 /*
709 * Cap the compressor pool size to a max of 192G
710 */
711 if (compressor_pool_size > VM_COMPRESSOR_MAX_POOL_SIZE) {
712 compressor_pool_size = VM_COMPRESSOR_MAX_POOL_SIZE;
713 }
714 if (max_mem <= (8ULL * 1024ULL * 1024ULL * 1024ULL)) {
715 compressor_pool_multiplier = 1;
716 } else if (max_mem <= (32ULL * 1024ULL * 1024ULL * 1024ULL)) {
717 compressor_pool_multiplier = 2;
718 } else {
719 compressor_pool_multiplier = 4;
720 }
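/*
 * Hypothetical example of the tiers above (when no vm_compression_limit
 * boot-arg is set): a 16GB machine falls in the "<= 32GB" bucket, so
 * compressor_pool_size = 4 * max_mem = 64GB of compressor address space
 * (well under the 192GB cap) and compressor_pool_multiplier = 2.
 */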
721
722 #elif defined(__arm__)
723
724 #define MAX_COMPRESSOR_POOL_SIZE (1024 * 1024 * 300)
725
726 if (compressor_pool_max_size > MAX_COMPRESSOR_POOL_SIZE) {
727 compressor_pool_max_size = MAX_COMPRESSOR_POOL_SIZE;
728 }
729
730 if (vm_compression_limit == 0) {
731 compressor_pool_size = MAX_COMPRESSOR_POOL_SIZE;
732 }
733 compressor_pool_multiplier = 1;
734
735 #elif defined(__arm64__) && defined(XNU_TARGET_OS_WATCH)
736
737 /*
738 * On M9 watches the compressor can grow large and lead to
739 * churn in the working set, resulting in audio drops. Setting a cap
740 * on the compressor size favors reclaiming unused memory
741 * sitting in the idle band via jetsams.
742 */
743
744 #define COMPRESSOR_CAP_PERCENTAGE 37ULL
745
746 if (compressor_pool_max_size > max_mem) {
747 compressor_pool_max_size = max_mem;
748 }
749
750 if (vm_compression_limit == 0) {
751 compressor_pool_size = (max_mem * COMPRESSOR_CAP_PERCENTAGE) / 100ULL;
752 }
753 compressor_pool_multiplier = 1;
754
755 #else
756
757 if (compressor_pool_max_size > max_mem) {
758 compressor_pool_max_size = max_mem;
759 }
760
761 if (vm_compression_limit == 0) {
762 compressor_pool_size = max_mem;
763 }
764 compressor_pool_multiplier = 1;
765 #endif
766 if (compressor_pool_size > compressor_pool_max_size) {
767 compressor_pool_size = compressor_pool_max_size;
768 }
769
770 c_seg_max_pages = (c_seg_bufsize / PAGE_SIZE);
771 c_seg_slot_var_array_min_len = c_seg_max_pages;
772
773 #if !defined(__x86_64__)
774 c_seg_off_limit = (C_SEG_BYTES_TO_OFFSET((c_seg_bufsize - 512)));
775 c_seg_allocsize = (c_seg_bufsize + PAGE_SIZE);
776 #else
777 c_seg_off_limit = (C_SEG_BYTES_TO_OFFSET((c_seg_bufsize - 128)));
778 c_seg_allocsize = c_seg_bufsize;
779 #endif /* !defined(__x86_64__) */
780
781 c_segments_limit = (uint32_t)(compressor_pool_size / (vm_size_t)(c_seg_allocsize));
782 tmp_slot_ptr.s_cseg = c_segments_limit;
783 /* Panic on internal configs */
784 assertf((tmp_slot_ptr.s_cseg == c_segments_limit), "vm_compressor_init: overflowed s_cseg field in c_slot_mapping with c_segno: %d", c_segments_limit);
785
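/*
 * s_cseg is a narrow bit-field in struct c_slot_mapping, so the round trip
 * through tmp_slot_ptr detects overflow of the segment index. If it overflows
 * (the assertf above only fires on internal configs), writing -1 yields the
 * largest representable value and the limit is clamped to one below it, with
 * the pool size recomputed to match.
 */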
786 if (tmp_slot_ptr.s_cseg != c_segments_limit) {
787 tmp_slot_ptr.s_cseg = -1;
788 c_segments_limit = tmp_slot_ptr.s_cseg - 1; /*limited by segment idx bits in c_slot_mapping*/
789 compressor_pool_size = (c_segments_limit * (vm_size_t)(c_seg_allocsize));
790 }
791
792 c_segments_nearing_limit = (uint32_t)(((uint64_t)c_segments_limit * 98ULL) / 100ULL);
793
794 c_segment_pages_compressed_limit = (c_segments_limit * (c_seg_bufsize / PAGE_SIZE) * compressor_pool_multiplier);
795
796 if (c_segment_pages_compressed_limit < (uint32_t)(max_mem / PAGE_SIZE)) {
797 #if defined(XNU_TARGET_OS_WATCH)
798 c_segment_pages_compressed_limit = (uint32_t)(max_mem / PAGE_SIZE);
799 #else
800 if (!vm_compression_limit) {
801 c_segment_pages_compressed_limit = (uint32_t)(max_mem / PAGE_SIZE);
802 }
803 #endif
804 }
805
806 c_segment_pages_compressed_nearing_limit = (uint32_t)(((uint64_t)c_segment_pages_compressed_limit * 98ULL) / 100ULL);
807
808 #if CONFIG_FREEZE
809 /*
810 * Our in-core limits are based on the size of the compressor pool.
811 * The c_segments_nearing_limit is also based on the compressor pool
812 * size and calculated above.
813 */
814 c_segments_incore_limit = c_segments_limit;
815
816 if (freezer_incore_cseg_acct) {
817 /*
818 * Add enough segments to track all frozen c_segs that can be stored in swap.
819 */
820 c_segments_limit += (uint32_t)(vm_swap_get_max_configured_space() / (vm_size_t)(c_seg_allocsize));
821 tmp_slot_ptr.s_cseg = c_segments_limit;
822 /* Panic on internal configs */
823 assertf((tmp_slot_ptr.s_cseg == c_segments_limit), "vm_compressor_init: freezer reserve overflowed s_cseg field in c_slot_mapping with c_segno: %d", c_segments_limit);
824 }
825 #endif
826 /*
827 * Submap needs space for:
828 * - c_segments
829 * - c_buffers
830 * - swap reclamations -- c_seg_bufsize
831 */
832 c_segments_arr_size = vm_map_round_page((sizeof(union c_segu) * c_segments_limit), VM_MAP_PAGE_MASK(kernel_map));
833 c_buffers_size = vm_map_round_page(((vm_size_t)c_seg_allocsize * (vm_size_t)c_segments_limit), VM_MAP_PAGE_MASK(kernel_map));
834
835 compressor_size = c_segments_arr_size + c_buffers_size + c_seg_bufsize;
836
837 #if RECORD_THE_COMPRESSED_DATA
838 c_compressed_record_sbuf_size = (vm_size_t)c_seg_allocsize + (PAGE_SIZE * 2);
839 compressor_size += c_compressed_record_sbuf_size;
840 #endif /* RECORD_THE_COMPRESSED_DATA */
841 }
842 STARTUP(KMEM, STARTUP_RANK_FIRST, vm_compressor_set_size);
843
844 KMEM_RANGE_REGISTER_DYNAMIC(compressor, &compressor_range, ^() {
845 return compressor_size;
846 });
847
848 void
849 vm_compressor_init(void)
850 {
851 thread_t thread;
852 #if RECORD_THE_COMPRESSED_DATA
853 vm_size_t c_compressed_record_sbuf_size = 0;
854 #endif /* RECORD_THE_COMPRESSED_DATA */
855
856 #if DEVELOPMENT || DEBUG || CONFIG_FREEZE
857 char bootarg_name[32];
858 #endif /* DEVELOPMENT || DEBUG || CONFIG_FREEZE */
859
860 #if DEVELOPMENT || DEBUG
861 if (PE_parse_boot_argn("-disable_cseg_write_protection", bootarg_name, sizeof(bootarg_name))) {
862 write_protect_c_segs = FALSE;
863 }
864 int vmcval = 1;
865 PE_parse_boot_argn("vm_compressor_validation", &vmcval, sizeof(vmcval));
866
867 if (kern_feature_override(KF_COMPRSV_OVRD)) {
868 vmcval = 0;
869 }
870 if (vmcval == 0) {
871 #if POPCOUNT_THE_COMPRESSED_DATA
872 popcount_c_segs = FALSE;
873 #endif
874 #if CHECKSUM_THE_DATA || CHECKSUM_THE_COMPRESSED_DATA
875 checksum_c_segs = FALSE;
876 #endif
877 #if VALIDATE_C_SEGMENTS
878 validate_c_segs = FALSE;
879 #endif
880 write_protect_c_segs = FALSE;
881 }
882 #endif /* DEVELOPMENT || DEBUG */
883
884 #if CONFIG_FREEZE
885 if (PE_parse_boot_argn("-disable_freezer_cseg_acct", bootarg_name, sizeof(bootarg_name))) {
886 freezer_incore_cseg_acct = FALSE;
887 }
888 #endif /* CONFIG_FREEZE */
889
890 assert((C_SEGMENTS_PER_PAGE * sizeof(union c_segu)) == PAGE_SIZE);
891
892 #if !XNU_TARGET_OS_OSX
893 vm_compressor_minorcompact_threshold_divisor = 20;
894 vm_compressor_majorcompact_threshold_divisor = 30;
895 vm_compressor_unthrottle_threshold_divisor = 40;
896 vm_compressor_catchup_threshold_divisor = 60;
897 #else /* !XNU_TARGET_OS_OSX */
898 if (max_mem <= (3ULL * 1024ULL * 1024ULL * 1024ULL)) {
899 vm_compressor_minorcompact_threshold_divisor = 11;
900 vm_compressor_majorcompact_threshold_divisor = 13;
901 vm_compressor_unthrottle_threshold_divisor = 20;
902 vm_compressor_catchup_threshold_divisor = 35;
903 } else {
904 vm_compressor_minorcompact_threshold_divisor = 20;
905 vm_compressor_majorcompact_threshold_divisor = 25;
906 vm_compressor_unthrottle_threshold_divisor = 35;
907 vm_compressor_catchup_threshold_divisor = 50;
908 }
909 #endif /* !XNU_TARGET_OS_OSX */
910
911 queue_init(&c_bad_list_head);
912 queue_init(&c_age_list_head);
913 queue_init(&c_minor_list_head);
914 queue_init(&c_major_list_head);
915 queue_init(&c_filling_list_head);
916 queue_init(&c_swapout_list_head);
917 queue_init(&c_swapio_list_head);
918 queue_init(&c_swappedin_list_head);
919 queue_init(&c_swappedout_list_head);
920 queue_init(&c_swappedout_sparse_list_head);
921
922 c_free_segno_head = -1;
923 c_segments_available = 0;
924
925 compressor_map = kmem_suballoc(kernel_map, &compressor_range.min_address,
926 compressor_size, VM_MAP_CREATE_NEVER_FAULTS,
927 VM_FLAGS_FIXED_RANGE_SUBALLOC, KMS_NOFAIL | KMS_PERMANENT,
928 VM_KERN_MEMORY_COMPRESSOR).kmr_submap;
929
930 kmem_alloc(compressor_map, (vm_offset_t *)(&c_segments),
931 (sizeof(union c_segu) * c_segments_limit),
932 KMA_NOFAIL | KMA_KOBJECT | KMA_VAONLY | KMA_PERMANENT,
933 VM_KERN_MEMORY_COMPRESSOR);
934 kmem_alloc(compressor_map, &c_buffers, c_buffers_size,
935 KMA_NOFAIL | KMA_COMPRESSOR | KMA_VAONLY | KMA_PERMANENT,
936 VM_KERN_MEMORY_COMPRESSOR);
937
938 #if DEVELOPMENT || DEBUG
939 hvg_hcall_set_coredump_data();
940 #endif
941
942 /*
943 * Pick a good size that will minimize fragmentation in zalloc
944 * by minimizing the fragmentation in a 16k run.
945 *
946 * c_seg_slot_var_array_min_len is larger on 4k systems than 16k ones,
947 * making the fragmentation in a 4k page terrible. Using 16k for all
948 * systems matches zalloc() and will minimize fragmentation.
949 */
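/*
 * Hypothetical sizing example: if c_segment_size started out at 5000 bytes,
 * then cnt = 16384 / 5000 = 3 segments per 16k run and frag = 1384 leftover
 * bytes. Each pass of the loop below grows every segment in the run by one
 * struct c_slot (bumping c_seg_fixed_array_len), repeating while the leftover
 * can still fund one extra slot for all cnt segments in the run.
 */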
950 uint32_t c_segment_size = sizeof(struct c_segment) + (c_seg_slot_var_array_min_len * sizeof(struct c_slot));
951 uint32_t cnt = (16 << 10) / c_segment_size;
952 uint32_t frag = (16 << 10) % c_segment_size;
953
954 c_seg_fixed_array_len = c_seg_slot_var_array_min_len;
955
956 while (cnt * sizeof(struct c_slot) < frag) {
957 c_segment_size += sizeof(struct c_slot);
958 c_seg_fixed_array_len++;
959 frag -= cnt * sizeof(struct c_slot);
960 }
961
962 compressor_segment_zone = zone_create("compressor_segment",
963 c_segment_size, ZC_PGZ_USE_GUARDS | ZC_NOENCRYPT | ZC_ZFREE_CLEARMEM);
964
965 c_segments_busy = FALSE;
966
967 c_segments_next_page = (caddr_t)c_segments;
968 vm_compressor_algorithm_init();
969
970 {
971 host_basic_info_data_t hinfo;
972 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
973 size_t bufsize;
974 char *buf;
975
976 #define BSD_HOST 1
977 host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
978
979 compressor_cpus = hinfo.max_cpus;
980
981 bufsize = PAGE_SIZE;
982 bufsize += compressor_cpus * vm_compressor_get_decode_scratch_size();
983 /* For the KDP path */
984 bufsize += vm_compressor_get_decode_scratch_size();
985 #if CONFIG_FREEZE
986 bufsize += vm_compressor_get_encode_scratch_size();
987 #endif
988 #if RECORD_THE_COMPRESSED_DATA
989 bufsize += c_compressed_record_sbuf_size;
990 #endif
991
992 kmem_alloc(kernel_map, (vm_offset_t *)&buf, bufsize,
993 KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT,
994 VM_KERN_MEMORY_COMPRESSOR);
995
996 /*
997 * kdp_compressor_decompressed_page must be page aligned because we access
998 * it through the physical aperture by page number.
999 */
1000 kdp_compressor_decompressed_page = buf;
1001 kdp_compressor_decompressed_page_paddr = kvtophys((vm_offset_t)kdp_compressor_decompressed_page);
1002 kdp_compressor_decompressed_page_ppnum = (ppnum_t) atop(kdp_compressor_decompressed_page_paddr);
1003 buf += PAGE_SIZE;
1004 bufsize -= PAGE_SIZE;
1005
1006 compressor_scratch_bufs = buf;
1007 buf += compressor_cpus * vm_compressor_get_decode_scratch_size();
1008 bufsize -= compressor_cpus * vm_compressor_get_decode_scratch_size();
1009
1010 kdp_compressor_scratch_buf = buf;
1011 buf += vm_compressor_get_decode_scratch_size();
1012 bufsize -= vm_compressor_get_decode_scratch_size();
1013
1014 #if CONFIG_FREEZE
1015 freezer_context_global.freezer_ctx_compressor_scratch_buf = buf;
1016 buf += vm_compressor_get_encode_scratch_size();
1017 bufsize -= vm_compressor_get_encode_scratch_size();
1018 #endif
1019
1020 #if RECORD_THE_COMPRESSED_DATA
1021 c_compressed_record_sbuf = buf;
1022 c_compressed_record_cptr = buf;
1023 c_compressed_record_ebuf = c_compressed_record_sbuf + c_compressed_record_sbuf_size;
1024 buf += c_compressed_record_sbuf_size;
1025 bufsize -= c_compressed_record_sbuf_size;
1026 #endif
1027 assert(bufsize == 0);
1028 }
1029
1030 if (kernel_thread_start_priority((thread_continue_t)vm_compressor_swap_trigger_thread, NULL,
1031 BASEPRI_VM, &thread) != KERN_SUCCESS) {
1032 panic("vm_compressor_swap_trigger_thread: create failed");
1033 }
1034 thread_deallocate(thread);
1035
1036 if (vm_pageout_internal_start() != KERN_SUCCESS) {
1037 panic("vm_compressor_init: Failed to start the internal pageout thread.");
1038 }
1039 if (VM_CONFIG_SWAP_IS_PRESENT) {
1040 vm_compressor_swap_init();
1041 }
1042
1043 if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
1044 vm_compressor_is_active = 1;
1045 }
1046
1047 #if CONFIG_FREEZE
1048 memorystatus_freeze_enabled = TRUE;
1049 #endif /* CONFIG_FREEZE */
1050
1051 vm_compressor_available = 1;
1052
1053 vm_page_reactivate_all_throttled();
1054
1055 bzero(&vmcs_stats, sizeof(struct vm_compressor_swapper_stats));
1056 }
1057
1058
1059 #if VALIDATE_C_SEGMENTS
1060
1061 static void
1062 c_seg_validate(c_segment_t c_seg, boolean_t must_be_compact)
1063 {
1064 uint16_t c_indx;
1065 int32_t bytes_used;
1066 uint32_t c_rounded_size;
1067 uint32_t c_size;
1068 c_slot_t cs;
1069
1070 if (__probable(validate_c_segs == FALSE)) {
1071 return;
1072 }
1073 if (c_seg->c_firstemptyslot < c_seg->c_nextslot) {
1074 c_indx = c_seg->c_firstemptyslot;
1075 cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
1076
1077 if (cs == NULL) {
1078 panic("c_seg_validate: no slot backing c_firstemptyslot");
1079 }
1080
1081 if (cs->c_size) {
1082 panic("c_seg_validate: c_firstemptyslot has non-zero size (%d)", cs->c_size);
1083 }
1084 }
1085 bytes_used = 0;
1086
1087 for (c_indx = 0; c_indx < c_seg->c_nextslot; c_indx++) {
1088 cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
1089
1090 c_size = UNPACK_C_SIZE(cs);
1091
1092 c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;
1093
1094 bytes_used += c_rounded_size;
1095
1096 #if CHECKSUM_THE_COMPRESSED_DATA
1097 unsigned csvhash;
1098 if (c_size && cs->c_hash_compressed_data != (csvhash = vmc_hash((char *)&c_seg->c_store.c_buffer[cs->c_offset], c_size))) {
1099 addr64_t csvphys = kvtophys((vm_offset_t)&c_seg->c_store.c_buffer[cs->c_offset]);
1100 panic("Compressed data doesn't match original %p phys: 0x%llx %d %p %d %d 0x%x 0x%x", c_seg, csvphys, cs->c_offset, cs, c_indx, c_size, cs->c_hash_compressed_data, csvhash);
1101 }
1102 #endif
1103 #if POPCOUNT_THE_COMPRESSED_DATA
1104 unsigned csvpop;
1105 if (c_size) {
1106 uintptr_t csvaddr = (uintptr_t) &c_seg->c_store.c_buffer[cs->c_offset];
1107 if (cs->c_pop_cdata != (csvpop = vmc_pop(csvaddr, c_size))) {
1108 panic("Compressed data popcount doesn't match original, bit distance: %d %p (phys: %p) %p %p 0x%llx 0x%x 0x%x 0x%x", (csvpop - cs->c_pop_cdata), (void *)csvaddr, (void *) kvtophys(csvaddr), c_seg, cs, (uint64_t)cs->c_offset, c_size, csvpop, cs->c_pop_cdata);
1109 }
1110 }
1111 #endif
1112 }
1113
1114 if (bytes_used != c_seg->c_bytes_used) {
1115 panic("c_seg_validate: bytes_used mismatch - found %d, segment has %d", bytes_used, c_seg->c_bytes_used);
1116 }
1117
1118 if (c_seg->c_bytes_used > C_SEG_OFFSET_TO_BYTES((int32_t)c_seg->c_nextoffset)) {
1119 panic("c_seg_validate: c_bytes_used > c_nextoffset - c_nextoffset = %d, c_bytes_used = %d",
1120 (int32_t)C_SEG_OFFSET_TO_BYTES((int32_t)c_seg->c_nextoffset), c_seg->c_bytes_used);
1121 }
1122
1123 if (must_be_compact) {
1124 if (c_seg->c_bytes_used != C_SEG_OFFSET_TO_BYTES((int32_t)c_seg->c_nextoffset)) {
1125 panic("c_seg_validate: c_bytes_used doesn't match c_nextoffset - c_nextoffset = %d, c_bytes_used = %d",
1126 (int32_t)C_SEG_OFFSET_TO_BYTES((int32_t)c_seg->c_nextoffset), c_seg->c_bytes_used);
1127 }
1128 }
1129 }
1130
1131 #endif
1132
1133
1134 void
1135 c_seg_need_delayed_compaction(c_segment_t c_seg, boolean_t c_list_lock_held)
1136 {
1137 boolean_t clear_busy = FALSE;
1138
1139 if (c_list_lock_held == FALSE) {
1140 if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
1141 C_SEG_BUSY(c_seg);
1142
1143 lck_mtx_unlock_always(&c_seg->c_lock);
1144 lck_mtx_lock_spin_always(c_list_lock);
1145 lck_mtx_lock_spin_always(&c_seg->c_lock);
1146
1147 clear_busy = TRUE;
1148 }
1149 }
1150 assert(c_seg->c_state != C_IS_FILLING);
1151
1152 if (!c_seg->c_on_minorcompact_q && !(C_SEG_IS_ON_DISK_OR_SOQ(c_seg))) {
1153 queue_enter(&c_minor_list_head, c_seg, c_segment_t, c_list);
1154 c_seg->c_on_minorcompact_q = 1;
1155 c_minor_count++;
1156 }
1157 if (c_list_lock_held == FALSE) {
1158 lck_mtx_unlock_always(c_list_lock);
1159 }
1160
1161 if (clear_busy == TRUE) {
1162 C_SEG_WAKEUP_DONE(c_seg);
1163 }
1164 }
1165
1166
1167 unsigned int c_seg_moved_to_sparse_list = 0;
1168
1169 void
1170 c_seg_move_to_sparse_list(c_segment_t c_seg)
1171 {
1172 boolean_t clear_busy = FALSE;
1173
1174 if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
1175 C_SEG_BUSY(c_seg);
1176
1177 lck_mtx_unlock_always(&c_seg->c_lock);
1178 lck_mtx_lock_spin_always(c_list_lock);
1179 lck_mtx_lock_spin_always(&c_seg->c_lock);
1180
1181 clear_busy = TRUE;
1182 }
1183 c_seg_switch_state(c_seg, C_ON_SWAPPEDOUTSPARSE_Q, FALSE);
1184
1185 c_seg_moved_to_sparse_list++;
1186
1187 lck_mtx_unlock_always(c_list_lock);
1188
1189 if (clear_busy == TRUE) {
1190 C_SEG_WAKEUP_DONE(c_seg);
1191 }
1192 }
1193
1194
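/*
 * c_seg_insert_into_q() keeps the queue ordered by ascending c_generation_id:
 * the new segment is inserted just before the first entry with a larger
 * generation id, or at the tail if no such entry is found.
 */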
1195 void
1196 c_seg_insert_into_q(queue_head_t *qhead, c_segment_t c_seg)
1197 {
1198 c_segment_t c_seg_next;
1199
1200 if (queue_empty(qhead)) {
1201 queue_enter(qhead, c_seg, c_segment_t, c_age_list);
1202 } else {
1203 c_seg_next = (c_segment_t)queue_first(qhead);
1204
1205 while (TRUE) {
1206 if (c_seg->c_generation_id < c_seg_next->c_generation_id) {
1207 queue_insert_before(qhead, c_seg, c_seg_next, c_segment_t, c_age_list);
1208 break;
1209 }
1210 c_seg_next = (c_segment_t) queue_next(&c_seg_next->c_age_list);
1211
1212 if (queue_end(qhead, (queue_entry_t) c_seg_next)) {
1213 queue_enter(qhead, c_seg, c_segment_t, c_age_list);
1214 break;
1215 }
1216 }
1217 }
1218 }
1219
1220
1221 int try_minor_compaction_failed = 0;
1222 int try_minor_compaction_succeeded = 0;
1223
1224 void
1225 c_seg_try_minor_compaction_and_unlock(c_segment_t c_seg)
1226 {
1227 assert(c_seg->c_on_minorcompact_q);
1228 /*
1229 * c_seg is currently on the delayed minor compaction
1230 * queue and we have c_seg locked... if we can get the
1231 * c_list_lock w/o blocking (if we blocked we could deadlock
1232 * because the lock order is c_list_lock then c_seg's lock)
1233 * we'll pull it from the delayed list and free it directly
1234 */
1235 if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
1236 /*
1237 * c_list_lock is held, we need to bail
1238 */
1239 try_minor_compaction_failed++;
1240
1241 lck_mtx_unlock_always(&c_seg->c_lock);
1242 } else {
1243 try_minor_compaction_succeeded++;
1244
1245 C_SEG_BUSY(c_seg);
1246 c_seg_do_minor_compaction_and_unlock(c_seg, TRUE, FALSE, FALSE);
1247 }
1248 }
1249
1250
1251 int
1252 c_seg_do_minor_compaction_and_unlock(c_segment_t c_seg, boolean_t clear_busy, boolean_t need_list_lock, boolean_t disallow_page_replacement)
1253 {
1254 int c_seg_freed;
1255
1256 assert(c_seg->c_busy);
1257 assert(!C_SEG_IS_ON_DISK_OR_SOQ(c_seg));
1258
1259 /*
1260 * check for the case that can occur when we are not swapping
1261 * and this segment has been major compacted in the past
1262 * and moved to the majorcompact q to remove it from further
1263 * consideration... if the occupancy falls too low we need
1264 * to put it back on the age_q so that it will be considered
1265 * in the next major compaction sweep... if we don't do this
1266 * we will eventually run into the c_segments_limit
1267 */
1268 if (c_seg->c_state == C_ON_MAJORCOMPACT_Q && C_SEG_SHOULD_MAJORCOMPACT_NOW(c_seg)) {
1269 c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
1270 }
1271 if (!c_seg->c_on_minorcompact_q) {
1272 if (clear_busy == TRUE) {
1273 C_SEG_WAKEUP_DONE(c_seg);
1274 }
1275
1276 lck_mtx_unlock_always(&c_seg->c_lock);
1277
1278 return 0;
1279 }
1280 queue_remove(&c_minor_list_head, c_seg, c_segment_t, c_list);
1281 c_seg->c_on_minorcompact_q = 0;
1282 c_minor_count--;
1283
1284 lck_mtx_unlock_always(c_list_lock);
1285
1286 if (disallow_page_replacement == TRUE) {
1287 lck_mtx_unlock_always(&c_seg->c_lock);
1288
1289 PAGE_REPLACEMENT_DISALLOWED(TRUE);
1290
1291 lck_mtx_lock_spin_always(&c_seg->c_lock);
1292 }
1293 c_seg_freed = c_seg_minor_compaction_and_unlock(c_seg, clear_busy);
1294
1295 if (disallow_page_replacement == TRUE) {
1296 PAGE_REPLACEMENT_DISALLOWED(FALSE);
1297 }
1298
1299 if (need_list_lock == TRUE) {
1300 lck_mtx_lock_spin_always(c_list_lock);
1301 }
1302
1303 return c_seg_freed;
1304 }
1305
1306 void
1307 kdp_compressor_busy_find_owner(event64_t wait_event, thread_waitinfo_t *waitinfo)
1308 {
1309 c_segment_t c_seg = (c_segment_t) wait_event;
1310
1311 waitinfo->owner = thread_tid(c_seg->c_busy_for_thread);
1312 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(c_seg);
1313 }
1314
1315 #if DEVELOPMENT || DEBUG
1316 int
1317 do_cseg_wedge_thread(void)
1318 {
1319 struct c_segment c_seg;
1320 c_seg.c_busy_for_thread = current_thread();
1321
1322 debug_cseg_wait_event = (event_t) &c_seg;
1323
1324 thread_set_pending_block_hint(current_thread(), kThreadWaitCompressor);
1325 assert_wait((event_t) (&c_seg), THREAD_INTERRUPTIBLE);
1326
1327 thread_block(THREAD_CONTINUE_NULL);
1328
1329 return 0;
1330 }
1331
1332 int
1333 do_cseg_unwedge_thread(void)
1334 {
1335 thread_wakeup(debug_cseg_wait_event);
1336 debug_cseg_wait_event = NULL;
1337
1338 return 0;
1339 }
1340 #endif /* DEVELOPMENT || DEBUG */
1341
1342 void
1343 c_seg_wait_on_busy(c_segment_t c_seg)
1344 {
1345 c_seg->c_wanted = 1;
1346
1347 thread_set_pending_block_hint(current_thread(), kThreadWaitCompressor);
1348 assert_wait((event_t) (c_seg), THREAD_UNINT);
1349
1350 lck_mtx_unlock_always(&c_seg->c_lock);
1351 thread_block(THREAD_CONTINUE_NULL);
1352 }
1353
1354 #if CONFIG_FREEZE
1355 /*
1356 * We don't have the task lock held while updating the task's
1357 * c_seg queues. We can do that because of the following restrictions:
1358 *
1359 * - SINGLE FREEZER CONTEXT:
1360 * We 'insert' c_segs into the task list on the task_freeze path.
1361 * There can only be one such freeze in progress and the task
1362 * isn't disappearing because we have the VM map lock held throughout
1363 * and we have a reference on the proc too.
1364 *
1365 * - SINGLE TASK DISOWN CONTEXT:
1366 * We 'disown' c_segs of a task ONLY from the task_terminate context. So
1367 * we don't need the task lock but we need the c_list_lock and the
1368 * compressor master lock (shared). We also hold the individual
1369 * c_seg locks (exclusive).
1370 *
1371 * If we either:
1372 * - can't get the c_seg lock on a try, then we start again because maybe
1373 * the c_seg is part of a compaction and might get freed. So we can't trust
1374 * that linkage and need to restart our queue traversal.
1375 * - OR, if we run into a busy c_seg (say being swapped in or being freed), we
1376 * drop all locks again and wait and restart our queue traversal.
1377 *
1378 * - The new_owner_task below is currently only the kernel or NULL.
1379 *
1380 */
1381 void
1382 c_seg_update_task_owner(c_segment_t c_seg, task_t new_owner_task)
1383 {
1384 task_t owner_task = c_seg->c_task_owner;
1385 uint64_t uncompressed_bytes = ((c_seg->c_slots_used) * PAGE_SIZE_64);
1386
1387 LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED);
1388 LCK_MTX_ASSERT(&c_seg->c_lock, LCK_MTX_ASSERT_OWNED);
1389
1390 if (owner_task) {
1391 task_update_frozen_to_swap_acct(owner_task, uncompressed_bytes, DEBIT_FROM_SWAP);
1392 queue_remove(&owner_task->task_frozen_cseg_q, c_seg,
1393 c_segment_t, c_task_list_next_cseg);
1394 }
1395
1396 if (new_owner_task) {
1397 queue_enter(&new_owner_task->task_frozen_cseg_q, c_seg,
1398 c_segment_t, c_task_list_next_cseg);
1399 task_update_frozen_to_swap_acct(new_owner_task, uncompressed_bytes, CREDIT_TO_SWAP);
1400 }
1401
1402 c_seg->c_task_owner = new_owner_task;
1403 }
1404
1405 void
1406 task_disown_frozen_csegs(task_t owner_task)
1407 {
1408 c_segment_t c_seg = NULL, next_cseg = NULL;
1409
1410 again:
1411 PAGE_REPLACEMENT_DISALLOWED(TRUE);
1412 lck_mtx_lock_spin_always(c_list_lock);
1413
1414 for (c_seg = (c_segment_t) queue_first(&owner_task->task_frozen_cseg_q);
1415 !queue_end(&owner_task->task_frozen_cseg_q, (queue_entry_t) c_seg);
1416 c_seg = next_cseg) {
1417 next_cseg = (c_segment_t) queue_next(&c_seg->c_task_list_next_cseg);
1418
1419 if (!lck_mtx_try_lock_spin_always(&c_seg->c_lock)) {
1420 lck_mtx_unlock(c_list_lock);
1421 PAGE_REPLACEMENT_DISALLOWED(FALSE);
1422 goto again;
1423 }
1424
1425 if (c_seg->c_busy) {
1426 lck_mtx_unlock(c_list_lock);
1427 PAGE_REPLACEMENT_DISALLOWED(FALSE);
1428
1429 c_seg_wait_on_busy(c_seg);
1430
1431 goto again;
1432 }
1433 assert(c_seg->c_task_owner == owner_task);
1434 c_seg_update_task_owner(c_seg, kernel_task);
1435 lck_mtx_unlock_always(&c_seg->c_lock);
1436 }
1437
1438 lck_mtx_unlock(c_list_lock);
1439 PAGE_REPLACEMENT_DISALLOWED(FALSE);
1440 }
1441 #endif /* CONFIG_FREEZE */
1442
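/*
 * c_seg_switch_state() moves a segment between the compressor's state queues.
 * The first switch below asserts the transition is legal and leaves the old
 * state (removing the segment from that queue, where one exists, and
 * decrementing its count); the second enters the new state. Any transition
 * not listed panics.
 */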
1443 void
1444 c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head)
1445 {
1446 int old_state = c_seg->c_state;
1447
1448 #if XNU_TARGET_OS_OSX
1449 #if DEVELOPMENT || DEBUG
1450 if (new_state != C_IS_FILLING) {
1451 LCK_MTX_ASSERT(&c_seg->c_lock, LCK_MTX_ASSERT_OWNED);
1452 }
1453 LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED);
1454 #endif
1455 #endif /* XNU_TARGET_OS_OSX */
1456 switch (old_state) {
1457 case C_IS_EMPTY:
1458 assert(new_state == C_IS_FILLING || new_state == C_IS_FREE);
1459
1460 c_empty_count--;
1461 break;
1462
1463 case C_IS_FILLING:
1464 assert(new_state == C_ON_AGE_Q || new_state == C_ON_SWAPOUT_Q);
1465
1466 queue_remove(&c_filling_list_head, c_seg, c_segment_t, c_age_list);
1467 c_filling_count--;
1468 break;
1469
1470 case C_ON_AGE_Q:
1471 assert(new_state == C_ON_SWAPOUT_Q || new_state == C_ON_MAJORCOMPACT_Q ||
1472 new_state == C_IS_FREE);
1473
1474 queue_remove(&c_age_list_head, c_seg, c_segment_t, c_age_list);
1475 c_age_count--;
1476 break;
1477
1478 case C_ON_SWAPPEDIN_Q:
1479 assert(new_state == C_ON_AGE_Q || new_state == C_IS_FREE);
1480
1481 queue_remove(&c_swappedin_list_head, c_seg, c_segment_t, c_age_list);
1482 c_swappedin_count--;
1483 break;
1484
1485 case C_ON_SWAPOUT_Q:
1486 assert(new_state == C_ON_AGE_Q || new_state == C_IS_FREE || new_state == C_IS_EMPTY || new_state == C_ON_SWAPIO_Q);
1487
1488 #if CONFIG_FREEZE
1489 if (c_seg->c_task_owner && (new_state != C_ON_SWAPIO_Q)) {
1490 c_seg_update_task_owner(c_seg, NULL);
1491 }
1492 #endif /* CONFIG_FREEZE */
1493
1494 queue_remove(&c_swapout_list_head, c_seg, c_segment_t, c_age_list);
1495 thread_wakeup((event_t)&compaction_swapper_running);
1496 c_swapout_count--;
1497 break;
1498
1499 case C_ON_SWAPIO_Q:
1500 assert(new_state == C_ON_SWAPPEDOUT_Q || new_state == C_ON_SWAPPEDOUTSPARSE_Q || new_state == C_ON_AGE_Q);
1501
1502 queue_remove(&c_swapio_list_head, c_seg, c_segment_t, c_age_list);
1503 c_swapio_count--;
1504 break;
1505
1506 case C_ON_SWAPPEDOUT_Q:
1507 assert(new_state == C_ON_SWAPPEDIN_Q || new_state == C_ON_AGE_Q ||
1508 new_state == C_ON_SWAPPEDOUTSPARSE_Q ||
1509 new_state == C_ON_BAD_Q || new_state == C_IS_EMPTY || new_state == C_IS_FREE);
1510
1511 queue_remove(&c_swappedout_list_head, c_seg, c_segment_t, c_age_list);
1512 c_swappedout_count--;
1513 break;
1514
1515 case C_ON_SWAPPEDOUTSPARSE_Q:
1516 assert(new_state == C_ON_SWAPPEDIN_Q || new_state == C_ON_AGE_Q ||
1517 new_state == C_ON_BAD_Q || new_state == C_IS_EMPTY || new_state == C_IS_FREE);
1518
1519 queue_remove(&c_swappedout_sparse_list_head, c_seg, c_segment_t, c_age_list);
1520 c_swappedout_sparse_count--;
1521 break;
1522
1523 case C_ON_MAJORCOMPACT_Q:
1524 assert(new_state == C_ON_AGE_Q || new_state == C_IS_FREE);
1525
1526 queue_remove(&c_major_list_head, c_seg, c_segment_t, c_age_list);
1527 c_major_count--;
1528 break;
1529
1530 case C_ON_BAD_Q:
1531 assert(new_state == C_IS_FREE);
1532
1533 queue_remove(&c_bad_list_head, c_seg, c_segment_t, c_age_list);
1534 c_bad_count--;
1535 break;
1536
1537 default:
1538 panic("c_seg %p has bad c_state = %d", c_seg, old_state);
1539 }
1540
1541 switch (new_state) {
1542 case C_IS_FREE:
1543 assert(old_state != C_IS_FILLING);
1544
1545 break;
1546
1547 case C_IS_EMPTY:
1548 assert(old_state == C_ON_SWAPOUT_Q || old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q);
1549
1550 c_empty_count++;
1551 break;
1552
1553 case C_IS_FILLING:
1554 assert(old_state == C_IS_EMPTY);
1555
1556 queue_enter(&c_filling_list_head, c_seg, c_segment_t, c_age_list);
1557 c_filling_count++;
1558 break;
1559
1560 case C_ON_AGE_Q:
1561 assert(old_state == C_IS_FILLING || old_state == C_ON_SWAPPEDIN_Q ||
1562 old_state == C_ON_SWAPOUT_Q || old_state == C_ON_SWAPIO_Q ||
1563 old_state == C_ON_MAJORCOMPACT_Q || old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q);
1564
1565 if (old_state == C_IS_FILLING) {
1566 queue_enter(&c_age_list_head, c_seg, c_segment_t, c_age_list);
1567 } else {
1568 if (!queue_empty(&c_age_list_head)) {
1569 c_segment_t c_first;
1570
1571 c_first = (c_segment_t)queue_first(&c_age_list_head);
1572 c_seg->c_creation_ts = c_first->c_creation_ts;
1573 }
1574 queue_enter_first(&c_age_list_head, c_seg, c_segment_t, c_age_list);
1575 }
1576 c_age_count++;
1577 break;
1578
1579 case C_ON_SWAPPEDIN_Q:
1580 assert(old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q);
1581
1582 if (insert_head == TRUE) {
1583 queue_enter_first(&c_swappedin_list_head, c_seg, c_segment_t, c_age_list);
1584 } else {
1585 queue_enter(&c_swappedin_list_head, c_seg, c_segment_t, c_age_list);
1586 }
1587 c_swappedin_count++;
1588 break;
1589
1590 case C_ON_SWAPOUT_Q:
1591 assert(old_state == C_ON_AGE_Q || old_state == C_IS_FILLING);
1592
1593 if (insert_head == TRUE) {
1594 queue_enter_first(&c_swapout_list_head, c_seg, c_segment_t, c_age_list);
1595 } else {
1596 queue_enter(&c_swapout_list_head, c_seg, c_segment_t, c_age_list);
1597 }
1598 c_swapout_count++;
1599 break;
1600
1601 case C_ON_SWAPIO_Q:
1602 assert(old_state == C_ON_SWAPOUT_Q);
1603
1604 if (insert_head == TRUE) {
1605 queue_enter_first(&c_swapio_list_head, c_seg, c_segment_t, c_age_list);
1606 } else {
1607 queue_enter(&c_swapio_list_head, c_seg, c_segment_t, c_age_list);
1608 }
1609 c_swapio_count++;
1610 break;
1611
1612 case C_ON_SWAPPEDOUT_Q:
1613 assert(old_state == C_ON_SWAPIO_Q);
1614
1615 if (insert_head == TRUE) {
1616 queue_enter_first(&c_swappedout_list_head, c_seg, c_segment_t, c_age_list);
1617 } else {
1618 queue_enter(&c_swappedout_list_head, c_seg, c_segment_t, c_age_list);
1619 }
1620 c_swappedout_count++;
1621 break;
1622
1623 case C_ON_SWAPPEDOUTSPARSE_Q:
1624 assert(old_state == C_ON_SWAPIO_Q || old_state == C_ON_SWAPPEDOUT_Q);
1625
1626 if (insert_head == TRUE) {
1627 queue_enter_first(&c_swappedout_sparse_list_head, c_seg, c_segment_t, c_age_list);
1628 } else {
1629 queue_enter(&c_swappedout_sparse_list_head, c_seg, c_segment_t, c_age_list);
1630 }
1631
1632 c_swappedout_sparse_count++;
1633 break;
1634
1635 case C_ON_MAJORCOMPACT_Q:
1636 assert(old_state == C_ON_AGE_Q);
1637
1638 if (insert_head == TRUE) {
1639 queue_enter_first(&c_major_list_head, c_seg, c_segment_t, c_age_list);
1640 } else {
1641 queue_enter(&c_major_list_head, c_seg, c_segment_t, c_age_list);
1642 }
1643 c_major_count++;
1644 break;
1645
1646 case C_ON_BAD_Q:
1647 assert(old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q);
1648
1649 if (insert_head == TRUE) {
1650 queue_enter_first(&c_bad_list_head, c_seg, c_segment_t, c_age_list);
1651 } else {
1652 queue_enter(&c_bad_list_head, c_seg, c_segment_t, c_age_list);
1653 }
1654 c_bad_count++;
1655 break;
1656
1657 default:
1658 panic("c_seg %p requesting bad c_state = %d", c_seg, new_state);
1659 }
1660 c_seg->c_state = new_state;
1661 }
1662
1663
1664
1665 void
1666 c_seg_free(c_segment_t c_seg)
1667 {
1668 assert(c_seg->c_busy);
1669
1670 lck_mtx_unlock_always(&c_seg->c_lock);
1671 lck_mtx_lock_spin_always(c_list_lock);
1672 lck_mtx_lock_spin_always(&c_seg->c_lock);
1673
1674 c_seg_free_locked(c_seg);
1675 }
1676
1677
1678 void
1679 c_seg_free_locked(c_segment_t c_seg)
1680 {
1681 int segno;
1682 int pages_populated = 0;
1683 int32_t *c_buffer = NULL;
1684 uint64_t c_swap_handle = 0;
1685
1686 assert(c_seg->c_busy);
1687 assert(c_seg->c_slots_used == 0);
1688 assert(!c_seg->c_on_minorcompact_q);
1689 assert(!c_seg->c_busy_swapping);
1690
1691 if (c_seg->c_overage_swap == TRUE) {
1692 c_overage_swapped_count--;
1693 c_seg->c_overage_swap = FALSE;
1694 }
1695 if (!(C_SEG_IS_ONDISK(c_seg))) {
1696 c_buffer = c_seg->c_store.c_buffer;
1697 } else {
1698 c_swap_handle = c_seg->c_store.c_swap_handle;
1699 }
1700
1701 c_seg_switch_state(c_seg, C_IS_FREE, FALSE);
1702
1703 if (c_buffer) {
1704 pages_populated = (round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset))) / PAGE_SIZE;
1705 c_seg->c_store.c_buffer = NULL;
1706 } else {
1707 #if CONFIG_FREEZE
1708 c_seg_update_task_owner(c_seg, NULL);
1709 #endif /* CONFIG_FREEZE */
1710
1711 c_seg->c_store.c_swap_handle = (uint64_t)-1;
1712 }
1713
1714 lck_mtx_unlock_always(&c_seg->c_lock);
1715
1716 lck_mtx_unlock_always(c_list_lock);
1717
1718 if (c_buffer) {
1719 if (pages_populated) {
1720 kernel_memory_depopulate((vm_offset_t)c_buffer,
1721 ptoa(pages_populated), KMA_COMPRESSOR,
1722 VM_KERN_MEMORY_COMPRESSOR);
1723 }
1724 } else if (c_swap_handle) {
1725 /*
1726 * Free swap space on disk.
1727 */
1728 vm_swap_free(c_swap_handle);
1729 }
1730 lck_mtx_lock_spin_always(&c_seg->c_lock);
1731 /*
1732 * c_seg must remain busy until
1733 * after the call to vm_swap_free
1734 */
1735 C_SEG_WAKEUP_DONE(c_seg);
1736 lck_mtx_unlock_always(&c_seg->c_lock);
1737
1738 segno = c_seg->c_mysegno;
1739
1740 lck_mtx_lock_spin_always(c_list_lock);
1741 /*
1742 * because the c_buffer is now associated with the segno,
1743 * we can't put the segno back on the free list until
1744 * after we have depopulated the c_buffer range, or
1745 * we run the risk of depopulating a range that is
1746 * now being used in one of the compressor heads
1747 */
1748 c_segments[segno].c_segno = c_free_segno_head;
1749 c_free_segno_head = segno;
1750 c_segment_count--;
1751
1752 lck_mtx_unlock_always(c_list_lock);
1753
1754 lck_mtx_destroy(&c_seg->c_lock, &vm_compressor_lck_grp);
1755
1756 if (c_seg->c_slot_var_array_len) {
1757 kfree_data(c_seg->c_slot_var_array,
1758 sizeof(struct c_slot) * c_seg->c_slot_var_array_len);
1759 }
1760
1761 zfree(compressor_segment_zone, c_seg);
1762 }
1763
1764 #if DEVELOPMENT || DEBUG
1765 int c_seg_trim_page_count = 0;
1766 #endif
1767
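/*
 * walk backwards from the last slot, discarding empty (zero-size) slots;
 * once a live slot is found, pull c_nextoffset and c_populated_offset back
 * so they just cover that last live slot.
 */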
1768 void
1769 c_seg_trim_tail(c_segment_t c_seg)
1770 {
1771 c_slot_t cs;
1772 uint32_t c_size;
1773 uint32_t c_offset;
1774 uint32_t c_rounded_size;
1775 uint16_t current_nextslot;
1776 uint32_t current_populated_offset;
1777
1778 if (c_seg->c_bytes_used == 0) {
1779 return;
1780 }
1781 current_nextslot = c_seg->c_nextslot;
1782 current_populated_offset = c_seg->c_populated_offset;
1783
1784 while (c_seg->c_nextslot) {
1785 cs = C_SEG_SLOT_FROM_INDEX(c_seg, (c_seg->c_nextslot - 1));
1786
1787 c_size = UNPACK_C_SIZE(cs);
1788
1789 if (c_size) {
1790 if (current_nextslot != c_seg->c_nextslot) {
1791 c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;
1792 c_offset = cs->c_offset + C_SEG_BYTES_TO_OFFSET(c_rounded_size);
1793
1794 c_seg->c_nextoffset = c_offset;
1795 c_seg->c_populated_offset = (c_offset + (C_SEG_BYTES_TO_OFFSET(PAGE_SIZE) - 1)) &
1796 ~(C_SEG_BYTES_TO_OFFSET(PAGE_SIZE) - 1);
1797
1798 if (c_seg->c_firstemptyslot > c_seg->c_nextslot) {
1799 c_seg->c_firstemptyslot = c_seg->c_nextslot;
1800 }
1801 #if DEVELOPMENT || DEBUG
1802 c_seg_trim_page_count += ((round_page_32(C_SEG_OFFSET_TO_BYTES(current_populated_offset)) -
1803 round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset))) /
1804 PAGE_SIZE);
1805 #endif
1806 }
1807 break;
1808 }
1809 c_seg->c_nextslot--;
1810 }
1811 assert(c_seg->c_nextslot);
1812 }
1813
1814
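/*
 * minor compaction: slide every live slot down over the holes left by freed
 * slots within this one segment, fix up each slot's back-pointer to its new
 * index, and depopulate any buffer pages freed up at the tail.
 * returns 1 if the segment turned out to be empty and was freed, 0 otherwise;
 * the segment lock is dropped in either case.
 */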
1815 int
1816 c_seg_minor_compaction_and_unlock(c_segment_t c_seg, boolean_t clear_busy)
1817 {
1818 c_slot_mapping_t slot_ptr;
1819 uint32_t c_offset = 0;
1820 uint32_t old_populated_offset;
1821 uint32_t c_rounded_size;
1822 uint32_t c_size;
1823 uint16_t c_indx = 0;
1824 int i;
1825 c_slot_t c_dst;
1826 c_slot_t c_src;
1827
1828 assert(c_seg->c_busy);
1829
1830 #if VALIDATE_C_SEGMENTS
1831 c_seg_validate(c_seg, FALSE);
1832 #endif
1833 if (c_seg->c_bytes_used == 0) {
1834 c_seg_free(c_seg);
1835 return 1;
1836 }
1837 lck_mtx_unlock_always(&c_seg->c_lock);
1838
1839 if (c_seg->c_firstemptyslot >= c_seg->c_nextslot || C_SEG_UNUSED_BYTES(c_seg) < PAGE_SIZE) {
1840 goto done;
1841 }
1842
1843 /* TODO: assert first emptyslot's c_size is actually 0 */
1844
1845 #if DEVELOPMENT || DEBUG
1846 C_SEG_MAKE_WRITEABLE(c_seg);
1847 #endif
1848
1849 #if VALIDATE_C_SEGMENTS
1850 c_seg->c_was_minor_compacted++;
1851 #endif
1852 c_indx = c_seg->c_firstemptyslot;
1853 c_dst = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
1854
1855 old_populated_offset = c_seg->c_populated_offset;
1856 c_offset = c_dst->c_offset;
1857
1858 for (i = c_indx + 1; i < c_seg->c_nextslot && c_offset < c_seg->c_nextoffset; i++) {
1859 c_src = C_SEG_SLOT_FROM_INDEX(c_seg, i);
1860
1861 c_size = UNPACK_C_SIZE(c_src);
1862
1863 if (c_size == 0) {
1864 continue;
1865 }
1866
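/*
 * round the slot's size up to the compressor's offset granule
 * (C_SEG_OFFSET_ALIGNMENT_MASK + 1 bytes); e.g. with a 4-byte granule
 * a 701-byte slot occupies 704 bytes of buffer
 */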
1867 c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;
1868 /* N.B.: This memcpy may be an overlapping copy */
1869 memcpy(&c_seg->c_store.c_buffer[c_offset], &c_seg->c_store.c_buffer[c_src->c_offset], c_rounded_size);
1870
1871 cslot_copy(c_dst, c_src);
1872 c_dst->c_offset = c_offset;
1873
1874 slot_ptr = C_SLOT_UNPACK_PTR(c_dst);
1875 slot_ptr->s_cindx = c_indx;
1876
1877 c_offset += C_SEG_BYTES_TO_OFFSET(c_rounded_size);
1878 PACK_C_SIZE(c_src, 0);
1879 c_indx++;
1880
1881 c_dst = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
1882 }
1883 c_seg->c_firstemptyslot = c_indx;
1884 c_seg->c_nextslot = c_indx;
1885 c_seg->c_nextoffset = c_offset;
1886 c_seg->c_populated_offset = (c_offset + (C_SEG_BYTES_TO_OFFSET(PAGE_SIZE) - 1)) & ~(C_SEG_BYTES_TO_OFFSET(PAGE_SIZE) - 1);
1887 c_seg->c_bytes_unused = 0;
1888
1889 #if VALIDATE_C_SEGMENTS
1890 c_seg_validate(c_seg, TRUE);
1891 #endif
1892 if (old_populated_offset > c_seg->c_populated_offset) {
1893 uint32_t gc_size;
1894 int32_t *gc_ptr;
1895
1896 gc_size = C_SEG_OFFSET_TO_BYTES(old_populated_offset - c_seg->c_populated_offset);
1897 gc_ptr = &c_seg->c_store.c_buffer[c_seg->c_populated_offset];
1898
1899 kernel_memory_depopulate((vm_offset_t)gc_ptr, gc_size,
1900 KMA_COMPRESSOR, VM_KERN_MEMORY_COMPRESSOR);
1901 }
1902
1903 #if DEVELOPMENT || DEBUG
1904 C_SEG_WRITE_PROTECT(c_seg);
1905 #endif
1906
1907 done:
1908 if (clear_busy == TRUE) {
1909 lck_mtx_lock_spin_always(&c_seg->c_lock);
1910 C_SEG_WAKEUP_DONE(c_seg);
1911 lck_mtx_unlock_always(&c_seg->c_lock);
1912 }
1913 return 0;
1914 }
1915
1916
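/*
 * grow the segment's variable-length slot array once the next slot index
 * runs past the fixed inline array: allocate a new array (doubling the
 * previous length), copy the old contents, and swap the pointer in under
 * the segment lock before freeing the old array.
 */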
1917 static void
1918 c_seg_alloc_nextslot(c_segment_t c_seg)
1919 {
1920 struct c_slot *old_slot_array = NULL;
1921 struct c_slot *new_slot_array = NULL;
1922 int newlen;
1923 int oldlen;
1924
1925 if (c_seg->c_nextslot < c_seg_fixed_array_len) {
1926 return;
1927 }
1928
1929 if ((c_seg->c_nextslot - c_seg_fixed_array_len) >= c_seg->c_slot_var_array_len) {
1930 oldlen = c_seg->c_slot_var_array_len;
1931 old_slot_array = c_seg->c_slot_var_array;
1932
1933 if (oldlen == 0) {
1934 newlen = c_seg_slot_var_array_min_len;
1935 } else {
1936 newlen = oldlen * 2;
1937 }
1938
1939 new_slot_array = kalloc_data(sizeof(struct c_slot) * newlen,
1940 Z_WAITOK);
1941
1942 lck_mtx_lock_spin_always(&c_seg->c_lock);
1943
1944 if (old_slot_array) {
1945 memcpy(new_slot_array, old_slot_array,
1946 sizeof(struct c_slot) * oldlen);
1947 }
1948
1949 c_seg->c_slot_var_array_len = newlen;
1950 c_seg->c_slot_var_array = new_slot_array;
1951
1952 lck_mtx_unlock_always(&c_seg->c_lock);
1953
1954 kfree_data(old_slot_array, sizeof(struct c_slot) * oldlen);
1955 }
1956 }
1957
1958
1959 #define C_SEG_MAJOR_COMPACT_STATS_MAX (30)
1960
1961 struct {
1962 uint64_t asked_permission;
1963 uint64_t compactions;
1964 uint64_t moved_slots;
1965 uint64_t moved_bytes;
1966 uint64_t wasted_space_in_swapouts;
1967 uint64_t count_of_swapouts;
1968 uint64_t count_of_freed_segs;
1969 uint64_t bailed_compactions;
1970 uint64_t bytes_freed_rate_us;
1971 } c_seg_major_compact_stats[C_SEG_MAJOR_COMPACT_STATS_MAX];
1972
1973 int c_seg_major_compact_stats_now = 0;
1974
1975
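/*
 * a source/destination pair is only worth major-compacting if at least one
 * of the two segments is below ~90% of the segment buffer size; merging two
 * nearly-full segments can't free one up.
 */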
1976 #define C_MAJOR_COMPACTION_SIZE_APPROPRIATE ((c_seg_bufsize * 90) / 100)
1977
1978
1979 boolean_t
1980 c_seg_major_compact_ok(
1981 c_segment_t c_seg_dst,
1982 c_segment_t c_seg_src)
1983 {
1984 c_seg_major_compact_stats[c_seg_major_compact_stats_now].asked_permission++;
1985
1986 if (c_seg_src->c_bytes_used >= C_MAJOR_COMPACTION_SIZE_APPROPRIATE &&
1987 c_seg_dst->c_bytes_used >= C_MAJOR_COMPACTION_SIZE_APPROPRIATE) {
1988 return FALSE;
1989 }
1990
1991 if (c_seg_dst->c_nextoffset >= c_seg_off_limit || c_seg_dst->c_nextslot >= C_SLOT_MAX_INDEX) {
1992 /*
1993 * destination segment is full... can't compact
1994 */
1995 return FALSE;
1996 }
1997
1998 return TRUE;
1999 }
2000
2001
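/*
 * copy every live slot from c_seg_src into c_seg_dst, populating destination
 * buffer pages on demand and updating the per-slot back-pointers so future
 * decompressions find the data at its new segment/index.
 * returns TRUE if the destination can accept more donors, FALSE once it fills.
 */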
2002 boolean_t
2003 c_seg_major_compact(
2004 c_segment_t c_seg_dst,
2005 c_segment_t c_seg_src)
2006 {
2007 c_slot_mapping_t slot_ptr;
2008 uint32_t c_rounded_size;
2009 uint32_t c_size;
2010 uint16_t dst_slot;
2011 int i;
2012 c_slot_t c_dst;
2013 c_slot_t c_src;
2014 boolean_t keep_compacting = TRUE;
2015
2016 /*
2017 * segments are not locked but they are both marked c_busy
2018 * which keeps c_decompress from working on them...
2019 * we can safely allocate new pages, move compressed data
2020 * from c_seg_src to c_seg_dst and update both c_segments'
2021 * state w/o holding the master lock
2022 */
2023 #if DEVELOPMENT || DEBUG
2024 C_SEG_MAKE_WRITEABLE(c_seg_dst);
2025 #endif
2026
2027 #if VALIDATE_C_SEGMENTS
2028 c_seg_dst->c_was_major_compacted++;
2029 c_seg_src->c_was_major_donor++;
2030 #endif
2031 c_seg_major_compact_stats[c_seg_major_compact_stats_now].compactions++;
2032
2033 dst_slot = c_seg_dst->c_nextslot;
2034
2035 for (i = 0; i < c_seg_src->c_nextslot; i++) {
2036 c_src = C_SEG_SLOT_FROM_INDEX(c_seg_src, i);
2037
2038 c_size = UNPACK_C_SIZE(c_src);
2039
2040 if (c_size == 0) {
2041 /* BATCH: move what we have so far; */
2042 continue;
2043 }
2044
2045 if (C_SEG_OFFSET_TO_BYTES(c_seg_dst->c_populated_offset - c_seg_dst->c_nextoffset) < (unsigned) c_size) {
2046 int size_to_populate;
2047
2048 /* doesn't fit */
2049 size_to_populate = c_seg_bufsize - C_SEG_OFFSET_TO_BYTES(c_seg_dst->c_populated_offset);
2050
2051 if (size_to_populate == 0) {
2052 /* can't fit */
2053 keep_compacting = FALSE;
2054 break;
2055 }
2056 if (size_to_populate > C_SEG_MAX_POPULATE_SIZE) {
2057 size_to_populate = C_SEG_MAX_POPULATE_SIZE;
2058 }
2059
2060 kernel_memory_populate(
2061 (vm_offset_t) &c_seg_dst->c_store.c_buffer[c_seg_dst->c_populated_offset],
2062 size_to_populate,
2063 KMA_NOFAIL | KMA_COMPRESSOR,
2064 VM_KERN_MEMORY_COMPRESSOR);
2065
2066 c_seg_dst->c_populated_offset += C_SEG_BYTES_TO_OFFSET(size_to_populate);
2067 assert(C_SEG_OFFSET_TO_BYTES(c_seg_dst->c_populated_offset) <= c_seg_bufsize);
2068 }
2069 c_seg_alloc_nextslot(c_seg_dst);
2070
2071 c_dst = C_SEG_SLOT_FROM_INDEX(c_seg_dst, c_seg_dst->c_nextslot);
2072
2073 memcpy(&c_seg_dst->c_store.c_buffer[c_seg_dst->c_nextoffset], &c_seg_src->c_store.c_buffer[c_src->c_offset], c_size);
2074
2075 c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;
2076
2077 c_seg_major_compact_stats[c_seg_major_compact_stats_now].moved_slots++;
2078 c_seg_major_compact_stats[c_seg_major_compact_stats_now].moved_bytes += c_size;
2079
2080 cslot_copy(c_dst, c_src);
2081 c_dst->c_offset = c_seg_dst->c_nextoffset;
2082
2083 if (c_seg_dst->c_firstemptyslot == c_seg_dst->c_nextslot) {
2084 c_seg_dst->c_firstemptyslot++;
2085 }
2086 c_seg_dst->c_slots_used++;
2087 c_seg_dst->c_nextslot++;
2088 c_seg_dst->c_bytes_used += c_rounded_size;
2089 c_seg_dst->c_nextoffset += C_SEG_BYTES_TO_OFFSET(c_rounded_size);
2090
2091 PACK_C_SIZE(c_src, 0);
2092
2093 c_seg_src->c_bytes_used -= c_rounded_size;
2094 c_seg_src->c_bytes_unused += c_rounded_size;
2095 c_seg_src->c_firstemptyslot = 0;
2096
2097 assert(c_seg_src->c_slots_used);
2098 c_seg_src->c_slots_used--;
2099
2100 if (!c_seg_src->c_swappedin) {
2101 /* Pessimistically lose swappedin status when non-swappedin pages are added. */
2102 c_seg_dst->c_swappedin = false;
2103 }
2104
2105 if (c_seg_dst->c_nextoffset >= c_seg_off_limit || c_seg_dst->c_nextslot >= C_SLOT_MAX_INDEX) {
2106 /* dest segment is now full */
2107 keep_compacting = FALSE;
2108 break;
2109 }
2110 }
2111 #if DEVELOPMENT || DEBUG
2112 C_SEG_WRITE_PROTECT(c_seg_dst);
2113 #endif
2114 if (dst_slot < c_seg_dst->c_nextslot) {
2115 PAGE_REPLACEMENT_ALLOWED(TRUE);
2116 /*
2117 * we've now locked out c_decompress from
2118 * converting the slot passed into it into
2119 * a c_segment_t which allows us to use
2120 * the backptr to change which c_segment and
2121 * index the slot points to
2122 */
2123 while (dst_slot < c_seg_dst->c_nextslot) {
2124 c_dst = C_SEG_SLOT_FROM_INDEX(c_seg_dst, dst_slot);
2125
2126 slot_ptr = C_SLOT_UNPACK_PTR(c_dst);
2127 /* <csegno=0,indx=0> would mean "empty slot", so use csegno+1 */
2128 slot_ptr->s_cseg = c_seg_dst->c_mysegno + 1;
2129 slot_ptr->s_cindx = dst_slot++;
2130 }
2131 PAGE_REPLACEMENT_ALLOWED(FALSE);
2132 }
2133 return keep_compacting;
2134 }
2135
2136
2137 uint64_t
2138 vm_compressor_compute_elapsed_msecs(clock_sec_t end_sec, clock_nsec_t end_nsec, clock_sec_t start_sec, clock_nsec_t start_nsec)
2139 {
2140 uint64_t end_msecs;
2141 uint64_t start_msecs;
2142
2143 end_msecs = (end_sec * 1000) + end_nsec / 1000000;
2144 start_msecs = (start_sec * 1000) + start_nsec / 1000000;
2145
2146 return end_msecs - start_msecs;
2147 }
2148
2149
2150
2151 uint32_t compressor_eval_period_in_msecs = 250;
2152 uint32_t compressor_sample_min_in_msecs = 500;
2153 uint32_t compressor_sample_max_in_msecs = 10000;
2154 uint32_t compressor_thrashing_threshold_per_10msecs = 50;
2155 uint32_t compressor_thrashing_min_per_10msecs = 20;
2156
2157 /* When true, reset sample data next chance we get. */
2158 static boolean_t compressor_need_sample_reset = FALSE;
2159
2160
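/*
 * examine the decompression-age histogram collected over the current sample
 * period; when decompression traffic looks like thrashing, derive
 * swapout_target_age: segments created before that timestamp are treated as
 * outside the working set and become preferred swapout candidates.
 */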
2161 void
2162 compute_swapout_target_age(void)
2163 {
2164 clock_sec_t cur_ts_sec;
2165 clock_nsec_t cur_ts_nsec;
2166 uint32_t min_operations_needed_in_this_sample;
2167 uint64_t elapsed_msecs_in_eval;
2168 uint64_t elapsed_msecs_in_sample;
2169 boolean_t need_eval_reset = FALSE;
2170
2171 clock_get_system_nanotime(&cur_ts_sec, &cur_ts_nsec);
2172
2173 elapsed_msecs_in_sample = vm_compressor_compute_elapsed_msecs(cur_ts_sec, cur_ts_nsec, start_of_sample_period_sec, start_of_sample_period_nsec);
2174
2175 if (compressor_need_sample_reset ||
2176 elapsed_msecs_in_sample >= compressor_sample_max_in_msecs) {
2177 compressor_need_sample_reset = TRUE;
2178 need_eval_reset = TRUE;
2179 goto done;
2180 }
2181 elapsed_msecs_in_eval = vm_compressor_compute_elapsed_msecs(cur_ts_sec, cur_ts_nsec, start_of_eval_period_sec, start_of_eval_period_nsec);
2182
2183 if (elapsed_msecs_in_eval < compressor_eval_period_in_msecs) {
2184 goto done;
2185 }
2186 need_eval_reset = TRUE;
2187
2188 KERNEL_DEBUG(0xe0400020 | DBG_FUNC_START, elapsed_msecs_in_eval, sample_period_compression_count, sample_period_decompression_count, 0, 0);
2189
2190 min_operations_needed_in_this_sample = (compressor_thrashing_min_per_10msecs * (uint32_t)elapsed_msecs_in_eval) / 10;
2191
2192 if ((sample_period_compression_count - last_eval_compression_count) < min_operations_needed_in_this_sample ||
2193 (sample_period_decompression_count - last_eval_decompression_count) < min_operations_needed_in_this_sample) {
2194 KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, sample_period_compression_count - last_eval_compression_count,
2195 sample_period_decompression_count - last_eval_decompression_count, 0, 1, 0);
2196
2197 swapout_target_age = 0;
2198
2199 compressor_need_sample_reset = TRUE;
2200 need_eval_reset = TRUE;
2201 goto done;
2202 }
2203 last_eval_compression_count = sample_period_compression_count;
2204 last_eval_decompression_count = sample_period_decompression_count;
2205
2206 if (elapsed_msecs_in_sample < compressor_sample_min_in_msecs) {
2207 KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, swapout_target_age, 0, 0, 5, 0);
2208 goto done;
2209 }
2210 if (sample_period_decompression_count > ((compressor_thrashing_threshold_per_10msecs * elapsed_msecs_in_sample) / 10)) {
2211 uint64_t running_total;
2212 uint64_t working_target;
2213 uint64_t aging_target;
2214 uint32_t oldest_age_of_csegs_sampled = 0;
2215 uint64_t working_set_approximation = 0;
2216
2217 swapout_target_age = 0;
2218
2219 working_target = (sample_period_decompression_count / 100) * 95; /* 95 percent */
2220 aging_target = (sample_period_decompression_count / 100) * 1; /* 1 percent */
2221 running_total = 0;
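/*
 * scan the histogram from the youngest age upward until ~95% of the
 * sampled decompressions are covered; the age-weighted sum is a rough
 * working-set estimate, normalized to a per-second rate below
 */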
2222
2223 for (oldest_age_of_csegs_sampled = 0; oldest_age_of_csegs_sampled < DECOMPRESSION_SAMPLE_MAX_AGE; oldest_age_of_csegs_sampled++) {
2224 running_total += age_of_decompressions_during_sample_period[oldest_age_of_csegs_sampled];
2225
2226 working_set_approximation += oldest_age_of_csegs_sampled * age_of_decompressions_during_sample_period[oldest_age_of_csegs_sampled];
2227
2228 if (running_total >= working_target) {
2229 break;
2230 }
2231 }
2232 if (oldest_age_of_csegs_sampled < DECOMPRESSION_SAMPLE_MAX_AGE) {
2233 working_set_approximation = (working_set_approximation * 1000) / elapsed_msecs_in_sample;
2234
2235 if (working_set_approximation < VM_PAGE_COMPRESSOR_COUNT) {
2236 running_total = overage_decompressions_during_sample_period;
2237
2238 for (oldest_age_of_csegs_sampled = DECOMPRESSION_SAMPLE_MAX_AGE - 1; oldest_age_of_csegs_sampled; oldest_age_of_csegs_sampled--) {
2239 running_total += age_of_decompressions_during_sample_period[oldest_age_of_csegs_sampled];
2240
2241 if (running_total >= aging_target) {
2242 break;
2243 }
2244 }
2245 swapout_target_age = (uint32_t)cur_ts_sec - oldest_age_of_csegs_sampled;
2246
2247 KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, swapout_target_age, working_set_approximation, VM_PAGE_COMPRESSOR_COUNT, 2, 0);
2248 } else {
2249 KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, working_set_approximation, VM_PAGE_COMPRESSOR_COUNT, 0, 3, 0);
2250 }
2251 } else {
2252 KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, working_target, running_total, 0, 4, 0);
2253 }
2254
2255 compressor_need_sample_reset = TRUE;
2256 need_eval_reset = TRUE;
2257 } else {
2258 KERNEL_DEBUG(0xe0400020 | DBG_FUNC_END, sample_period_decompression_count, (compressor_thrashing_threshold_per_10msecs * elapsed_msecs_in_sample) / 10, 0, 6, 0);
2259 }
2260 done:
2261 if (compressor_need_sample_reset == TRUE) {
2262 bzero(age_of_decompressions_during_sample_period, sizeof(age_of_decompressions_during_sample_period));
2263 overage_decompressions_during_sample_period = 0;
2264
2265 start_of_sample_period_sec = cur_ts_sec;
2266 start_of_sample_period_nsec = cur_ts_nsec;
2267 sample_period_decompression_count = 0;
2268 sample_period_compression_count = 0;
2269 last_eval_decompression_count = 0;
2270 last_eval_compression_count = 0;
2271 compressor_need_sample_reset = FALSE;
2272 }
2273 if (need_eval_reset == TRUE) {
2274 start_of_eval_period_sec = cur_ts_sec;
2275 start_of_eval_period_nsec = cur_ts_nsec;
2276 }
2277 }
2278
2279
2280 int compaction_swapper_init_now = 0;
2281 int compaction_swapper_running = 0;
2282 int compaction_swapper_awakened = 0;
2283 int compaction_swapper_abort = 0;
2284
2285
2286 #if CONFIG_JETSAM
2287 boolean_t memorystatus_kill_on_VM_compressor_thrashing(boolean_t);
2288 boolean_t memorystatus_kill_on_VM_compressor_space_shortage(boolean_t);
2289 boolean_t memorystatus_kill_on_FC_thrashing(boolean_t);
2290 int compressor_thrashing_induced_jetsam = 0;
2291 int filecache_thrashing_induced_jetsam = 0;
2292 static boolean_t vm_compressor_thrashing_detected = FALSE;
2293 #endif /* CONFIG_JETSAM */
2294
2295 static bool
2296 compressor_swapout_conditions_met(void)
2297 {
2298 bool should_swap = false;
2299
2300 if (COMPRESSOR_NEEDS_TO_SWAP()) {
2301 should_swap = true;
2302 vmcs_stats.compressor_swap_threshold_exceeded++;
2303 }
2304 if (VM_PAGE_Q_THROTTLED(&vm_pageout_queue_external) && vm_page_anonymous_count < (vm_page_inactive_count / 20)) {
2305 should_swap = true;
2306 vmcs_stats.external_q_throttled++;
2307 }
2308 if (vm_page_free_count < (vm_page_free_reserved - (COMPRESSOR_FREE_RESERVED_LIMIT * 2))) {
2309 should_swap = true;
2310 vmcs_stats.free_count_below_reserve++;
2311 }
2312 return should_swap;
2313 }
2314
2315 static boolean_t
2316 compressor_needs_to_swap(void)
2317 {
2318 boolean_t should_swap = FALSE;
2319
2320 if (vm_swapout_ripe_segments == TRUE && c_overage_swapped_count < c_overage_swapped_limit) {
2321 c_segment_t c_seg;
2322 clock_sec_t now;
2323 clock_sec_t age;
2324 clock_nsec_t nsec;
2325
2326 clock_get_system_nanotime(&now, &nsec);
2327 age = 0;
2328
2329 lck_mtx_lock_spin_always(c_list_lock);
2330
2331 if (!queue_empty(&c_age_list_head)) {
2332 c_seg = (c_segment_t) queue_first(&c_age_list_head);
2333
2334 age = now - c_seg->c_creation_ts;
2335 }
2336 lck_mtx_unlock_always(c_list_lock);
2337
2338 if (age >= vm_ripe_target_age) {
2339 should_swap = TRUE;
2340 goto check_if_low_space;
2341 }
2342 }
2343 if (VM_CONFIG_SWAP_IS_ACTIVE) {
2344 should_swap = compressor_swapout_conditions_met();
2345 if (should_swap) {
2346 goto check_if_low_space;
2347 }
2348 }
2349
2350 #if (XNU_TARGET_OS_OSX && __arm64__)
2351 /*
2352 * Thrashing detection disabled.
2353 */
2354 #else /* (XNU_TARGET_OS_OSX && __arm64__) */
2355
2356 compute_swapout_target_age();
2357
2358 if (swapout_target_age) {
2359 c_segment_t c_seg;
2360
2361 lck_mtx_lock_spin_always(c_list_lock);
2362
2363 if (!queue_empty(&c_age_list_head)) {
2364 c_seg = (c_segment_t) queue_first(&c_age_list_head);
2365
2366 if (c_seg->c_creation_ts > swapout_target_age) {
2367 swapout_target_age = 0;
2368 }
2369 }
2370 lck_mtx_unlock_always(c_list_lock);
2371 }
2372 #if CONFIG_PHANTOM_CACHE
2373 if (vm_phantom_cache_check_pressure()) {
2374 should_swap = TRUE;
2375 }
2376 #endif
2377 if (swapout_target_age) {
2378 should_swap = TRUE;
2379 vmcs_stats.thrashing_detected++;
2380 }
2381 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
2382
2383 check_if_low_space:
2384
2385 #if CONFIG_JETSAM
2386 if (should_swap || vm_compressor_low_on_space() == TRUE) {
2387 if (vm_compressor_thrashing_detected == FALSE) {
2388 vm_compressor_thrashing_detected = TRUE;
2389
2390 if (swapout_target_age) {
2391 /* The compressor is thrashing. */
2392 memorystatus_kill_on_VM_compressor_thrashing(TRUE /* async */);
2393 compressor_thrashing_induced_jetsam++;
2394 } else if (vm_compressor_low_on_space() == TRUE) {
2395 /* The compressor is running low on space. */
2396 memorystatus_kill_on_VM_compressor_space_shortage(TRUE /* async */);
2397 compressor_thrashing_induced_jetsam++;
2398 } else {
2399 memorystatus_kill_on_FC_thrashing(TRUE /* async */);
2400 filecache_thrashing_induced_jetsam++;
2401 }
2402 }
2403 /*
2404 * let the jetsam take precedence over
2405 * any major compactions we might have
2406 * been able to do... otherwise we run
2407 * the risk of doing major compactions
2408 * on segments we're about to free up
2409 * due to the jetsam activity.
2410 */
2411 should_swap = FALSE;
2412 }
2413
2414 #else /* CONFIG_JETSAM */
2415 if (should_swap && vm_swap_low_on_space()) {
2416 vm_compressor_take_paging_space_action();
2417 }
2418 #endif /* CONFIG_JETSAM */
2419
2420 if (should_swap == FALSE) {
2421 /*
2422 * vm_compressor_needs_to_major_compact returns true only if we're
2423 * about to run out of available compressor segments... in this
2424 * case, we absolutely need to run a major compaction even if
2425 * we've just kicked off a jetsam or we don't otherwise need to
2426 * swap... terminating objects releases
2427 * pages back to the uncompressed cache, but does not guarantee
2428 * that we will free up even a single compression segment
2429 */
2430 should_swap = vm_compressor_needs_to_major_compact();
2431 if (should_swap) {
2432 vmcs_stats.fragmentation_detected++;
2433 }
2434 }
2435
2436 /*
2437 * returning TRUE when swap_supported == FALSE
2438 * will cause the major compaction engine to
2439 * run, but will not trigger any swapping...
2440 * segments that have been major compacted
2441 * will be moved to the majorcompact queue
2442 */
2443 return should_swap;
2444 }
2445
2446 #if CONFIG_JETSAM
2447 /*
2448 * This function is called from the jetsam thread after killing something to
2449 * mitigate thrashing.
2450 *
2451 * We need to restart our thrashing detection heuristics since memory pressure
2452 * has potentially changed significantly, and we don't want to detect on old
2453 * data from before the jetsam.
2454 */
2455 void
2456 vm_thrashing_jetsam_done(void)
2457 {
2458 vm_compressor_thrashing_detected = FALSE;
2459
2460 /* Were we compressor-thrashing or filecache-thrashing? */
2461 if (swapout_target_age) {
2462 swapout_target_age = 0;
2463 compressor_need_sample_reset = TRUE;
2464 }
2465 #if CONFIG_PHANTOM_CACHE
2466 else {
2467 vm_phantom_cache_restart_sample();
2468 }
2469 #endif
2470 }
2471 #endif /* CONFIG_JETSAM */
2472
2473 uint32_t vm_wake_compactor_swapper_calls = 0;
2474 uint32_t vm_run_compactor_already_running = 0;
2475 uint32_t vm_run_compactor_empty_minor_q = 0;
2476 uint32_t vm_run_compactor_did_compact = 0;
2477 uint32_t vm_run_compactor_waited = 0;
2478
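/*
 * called from the pageout path to drain the delayed minor-compaction queue
 * inline; if the compactor/swapper is already running we normally just
 * return, but on configurations restricted to a single processor we wait
 * for it so compaction actually makes progress before we continue.
 */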
2479 void
2480 vm_run_compactor(void)
2481 {
2482 if (c_segment_count == 0) {
2483 return;
2484 }
2485
2486 lck_mtx_lock_spin_always(c_list_lock);
2487
2488 if (c_minor_count == 0) {
2489 vm_run_compactor_empty_minor_q++;
2490
2491 lck_mtx_unlock_always(c_list_lock);
2492 return;
2493 }
2494 if (compaction_swapper_running) {
2495 if (vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
2496 vm_run_compactor_already_running++;
2497
2498 lck_mtx_unlock_always(c_list_lock);
2499 return;
2500 }
2501 vm_run_compactor_waited++;
2502
2503 assert_wait((event_t)&compaction_swapper_running, THREAD_UNINT);
2504
2505 lck_mtx_unlock_always(c_list_lock);
2506
2507 thread_block(THREAD_CONTINUE_NULL);
2508
2509 return;
2510 }
2511 vm_run_compactor_did_compact++;
2512
2513 fastwake_warmup = FALSE;
2514 compaction_swapper_running = 1;
2515
2516 vm_compressor_do_delayed_compactions(FALSE);
2517
2518 compaction_swapper_running = 0;
2519
2520 lck_mtx_unlock_always(c_list_lock);
2521
2522 thread_wakeup((event_t)&compaction_swapper_running);
2523 }
2524
2525
2526 void
2527 vm_wake_compactor_swapper(void)
2528 {
2529 if (compaction_swapper_running || compaction_swapper_awakened || c_segment_count == 0) {
2530 return;
2531 }
2532
2533 if (c_minor_count || vm_compressor_needs_to_major_compact()) {
2534 lck_mtx_lock_spin_always(c_list_lock);
2535
2536 fastwake_warmup = FALSE;
2537
2538 if (compaction_swapper_running == 0 && compaction_swapper_awakened == 0) {
2539 vm_wake_compactor_swapper_calls++;
2540
2541 compaction_swapper_awakened = 1;
2542 thread_wakeup((event_t)&c_compressor_swap_trigger);
2543 }
2544 lck_mtx_unlock_always(c_list_lock);
2545 }
2546 }
2547
2548
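/*
 * force a full compact-and-swap pass: quiesce the compactor thread, turn on
 * swap-behind of ripe segments, migrate sufficiently aged segments from the
 * major-compact queue back onto the age queue so they are reconsidered, then
 * run vm_compressor_compact_and_swap().
 */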
2549 void
2550 vm_consider_swapping()
2551 {
2552 c_segment_t c_seg, c_seg_next;
2553 clock_sec_t now;
2554 clock_nsec_t nsec;
2555
2556 assert(VM_CONFIG_SWAP_IS_PRESENT);
2557
2558 lck_mtx_lock_spin_always(c_list_lock);
2559
2560 compaction_swapper_abort = 1;
2561
2562 while (compaction_swapper_running) {
2563 assert_wait((event_t)&compaction_swapper_running, THREAD_UNINT);
2564
2565 lck_mtx_unlock_always(c_list_lock);
2566
2567 thread_block(THREAD_CONTINUE_NULL);
2568
2569 lck_mtx_lock_spin_always(c_list_lock);
2570 }
2571 compaction_swapper_abort = 0;
2572 compaction_swapper_running = 1;
2573
2574 vm_swapout_ripe_segments = TRUE;
2575
2576 if (!queue_empty(&c_major_list_head)) {
2577 clock_get_system_nanotime(&now, &nsec);
2578
2579 c_seg = (c_segment_t)queue_first(&c_major_list_head);
2580
2581 while (!queue_end(&c_major_list_head, (queue_entry_t)c_seg)) {
2582 if (c_overage_swapped_count >= c_overage_swapped_limit) {
2583 break;
2584 }
2585
2586 c_seg_next = (c_segment_t) queue_next(&c_seg->c_age_list);
2587
2588 if ((now - c_seg->c_creation_ts) >= vm_ripe_target_age) {
2589 lck_mtx_lock_spin_always(&c_seg->c_lock);
2590
2591 c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
2592
2593 lck_mtx_unlock_always(&c_seg->c_lock);
2594 }
2595 c_seg = c_seg_next;
2596 }
2597 }
2598 vm_compressor_compact_and_swap(FALSE);
2599
2600 compaction_swapper_running = 0;
2601
2602 vm_swapout_ripe_segments = FALSE;
2603
2604 lck_mtx_unlock_always(c_list_lock);
2605
2606 thread_wakeup((event_t)&compaction_swapper_running);
2607 }
2608
2609
2610 void
2611 vm_consider_waking_compactor_swapper(void)
2612 {
2613 boolean_t need_wakeup = FALSE;
2614
2615 if (c_segment_count == 0) {
2616 return;
2617 }
2618
2619 if (compaction_swapper_running || compaction_swapper_awakened) {
2620 return;
2621 }
2622
2623 if (!compaction_swapper_inited && !compaction_swapper_init_now) {
2624 compaction_swapper_init_now = 1;
2625 need_wakeup = TRUE;
2626 }
2627
2628 if (c_minor_count && (COMPRESSOR_NEEDS_TO_MINOR_COMPACT())) {
2629 need_wakeup = TRUE;
2630 } else if (compressor_needs_to_swap()) {
2631 need_wakeup = TRUE;
2632 } else if (c_minor_count) {
2633 uint64_t total_bytes;
2634
2635 total_bytes = compressor_object->resident_page_count * PAGE_SIZE_64;
2636
2637 if ((total_bytes - compressor_bytes_used) > total_bytes / 10) {
2638 need_wakeup = TRUE;
2639 }
2640 }
2641 if (need_wakeup == TRUE) {
2642 lck_mtx_lock_spin_always(c_list_lock);
2643
2644 fastwake_warmup = FALSE;
2645
2646 if (compaction_swapper_running == 0 && compaction_swapper_awakened == 0) {
2647 memoryshot(VM_WAKEUP_COMPACTOR_SWAPPER, DBG_FUNC_NONE);
2648
2649 compaction_swapper_awakened = 1;
2650 thread_wakeup((event_t)&c_compressor_swap_trigger);
2651 }
2652 lck_mtx_unlock_always(c_list_lock);
2653 }
2654 }
2655
2656
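/*
 * per-pass bounds for the delayed (minor) compaction loop: after roughly
 * DELAYED_COMPACTIONS_PER_PASS segments we check whether a swapout is due
 * and, if the swapout queue still has room (C_SWAPOUT_LIMIT), stop
 * compacting so the swapout can proceed.
 */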
2657 #define C_SWAPOUT_LIMIT 4
2658 #define DELAYED_COMPACTIONS_PER_PASS 30
2659
2660 void
2661 vm_compressor_do_delayed_compactions(boolean_t flush_all)
2662 {
2663 c_segment_t c_seg;
2664 int number_compacted = 0;
2665 boolean_t needs_to_swap = FALSE;
2666
2667
2668 VM_DEBUG_CONSTANT_EVENT(vm_compressor_do_delayed_compactions, VM_COMPRESSOR_DO_DELAYED_COMPACTIONS, DBG_FUNC_START, c_minor_count, flush_all, 0, 0);
2669
2670 #if XNU_TARGET_OS_OSX
2671 LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED);
2672 #endif /* XNU_TARGET_OS_OSX */
2673
2674 while (!queue_empty(&c_minor_list_head) && needs_to_swap == FALSE) {
2675 c_seg = (c_segment_t)queue_first(&c_minor_list_head);
2676
2677 lck_mtx_lock_spin_always(&c_seg->c_lock);
2678
2679 if (c_seg->c_busy) {
2680 lck_mtx_unlock_always(c_list_lock);
2681 c_seg_wait_on_busy(c_seg);
2682 lck_mtx_lock_spin_always(c_list_lock);
2683
2684 continue;
2685 }
2686 C_SEG_BUSY(c_seg);
2687
2688 c_seg_do_minor_compaction_and_unlock(c_seg, TRUE, FALSE, TRUE);
2689
2690 if (VM_CONFIG_SWAP_IS_ACTIVE && (number_compacted++ > DELAYED_COMPACTIONS_PER_PASS)) {
2691 if ((flush_all == TRUE || compressor_needs_to_swap() == TRUE) && c_swapout_count < C_SWAPOUT_LIMIT) {
2692 needs_to_swap = TRUE;
2693 }
2694
2695 number_compacted = 0;
2696 }
2697 lck_mtx_lock_spin_always(c_list_lock);
2698 }
2699
2700 VM_DEBUG_CONSTANT_EVENT(vm_compressor_do_delayed_compactions, VM_COMPRESSOR_DO_DELAYED_COMPACTIONS, DBG_FUNC_END, c_minor_count, number_compacted, needs_to_swap, 0);
2701 }
2702
2703
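/*
 * segments that have sat on the swapped-in queue for more than
 * C_SEGMENT_SWAPPEDIN_AGE_LIMIT seconds (or all of them when flushing)
 * are moved back to the age queue so they become swapout candidates again.
 */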
2704 #define C_SEGMENT_SWAPPEDIN_AGE_LIMIT 10
2705
2706 static void
2707 vm_compressor_age_swapped_in_segments(boolean_t flush_all)
2708 {
2709 c_segment_t c_seg;
2710 clock_sec_t now;
2711 clock_nsec_t nsec;
2712
2713 clock_get_system_nanotime(&now, &nsec);
2714
2715 while (!queue_empty(&c_swappedin_list_head)) {
2716 c_seg = (c_segment_t)queue_first(&c_swappedin_list_head);
2717
2718 if (flush_all == FALSE && (now - c_seg->c_swappedin_ts) < C_SEGMENT_SWAPPEDIN_AGE_LIMIT) {
2719 break;
2720 }
2721
2722 lck_mtx_lock_spin_always(&c_seg->c_lock);
2723
2724 c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
2725 c_seg->c_agedin_ts = (uint32_t) now;
2726
2727 lck_mtx_unlock_always(&c_seg->c_lock);
2728 }
2729 }
2730
2731
2732 extern int vm_num_swap_files;
2733 extern int vm_num_pinned_swap_files;
2734 extern int vm_swappin_enabled;
2735
2736 extern unsigned int vm_swapfile_total_segs_used;
2737 extern unsigned int vm_swapfile_total_segs_alloced;
2738
2739
2740 void
2741 vm_compressor_flush(void)
2742 {
2743 uint64_t vm_swap_put_failures_at_start;
2744 wait_result_t wait_result = 0;
2745 AbsoluteTime startTime, endTime;
2746 clock_sec_t now_sec;
2747 clock_nsec_t now_nsec;
2748 uint64_t nsec;
2749 c_segment_t c_seg, c_seg_next;
2750
2751 HIBLOG("vm_compressor_flush - starting\n");
2752
2753 clock_get_uptime(&startTime);
2754
2755 lck_mtx_lock_spin_always(c_list_lock);
2756
2757 fastwake_warmup = FALSE;
2758 compaction_swapper_abort = 1;
2759
2760 while (compaction_swapper_running) {
2761 assert_wait((event_t)&compaction_swapper_running, THREAD_UNINT);
2762
2763 lck_mtx_unlock_always(c_list_lock);
2764
2765 thread_block(THREAD_CONTINUE_NULL);
2766
2767 lck_mtx_lock_spin_always(c_list_lock);
2768 }
2769 compaction_swapper_abort = 0;
2770 compaction_swapper_running = 1;
2771
2772 hibernate_flushing = TRUE;
2773 hibernate_no_swapspace = FALSE;
2774 hibernate_flush_timed_out = FALSE;
2775 c_generation_id_flush_barrier = c_generation_id + 1000;
2776
2777 clock_get_system_nanotime(&now_sec, &now_nsec);
2778 hibernate_flushing_deadline = now_sec + HIBERNATE_FLUSHING_SECS_TO_COMPLETE;
2779
2780 vm_swap_put_failures_at_start = vm_swap_put_failures;
2781
2782 /*
2783 * We are about to hibernate and so we want all segments flushed to disk.
2784 * Segments that are on the major compaction queue won't be considered in
2785 * the vm_compressor_compact_and_swap() pass. So we need to bring them to
2786 * the ageQ for consideration.
2787 */
2788 if (!queue_empty(&c_major_list_head)) {
2789 c_seg = (c_segment_t)queue_first(&c_major_list_head);
2790
2791 while (!queue_end(&c_major_list_head, (queue_entry_t)c_seg)) {
2792 c_seg_next = (c_segment_t) queue_next(&c_seg->c_age_list);
2793 lck_mtx_lock_spin_always(&c_seg->c_lock);
2794 c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
2795 lck_mtx_unlock_always(&c_seg->c_lock);
2796 c_seg = c_seg_next;
2797 }
2798 }
2799 vm_compressor_compact_and_swap(TRUE);
2800
2801 while (!queue_empty(&c_swapout_list_head)) {
2802 assert_wait_timeout((event_t) &compaction_swapper_running, THREAD_INTERRUPTIBLE, 5000, 1000 * NSEC_PER_USEC);
2803
2804 lck_mtx_unlock_always(c_list_lock);
2805
2806 wait_result = thread_block(THREAD_CONTINUE_NULL);
2807
2808 lck_mtx_lock_spin_always(c_list_lock);
2809
2810 if (wait_result == THREAD_TIMED_OUT) {
2811 break;
2812 }
2813 }
2814 hibernate_flushing = FALSE;
2815 compaction_swapper_running = 0;
2816
2817 if (vm_swap_put_failures > vm_swap_put_failures_at_start) {
2818 HIBLOG("vm_compressor_flush failed to clean %llu segments - vm_page_compressor_count(%d)\n",
2819 vm_swap_put_failures - vm_swap_put_failures_at_start, VM_PAGE_COMPRESSOR_COUNT);
2820 }
2821
2822 lck_mtx_unlock_always(c_list_lock);
2823
2824 thread_wakeup((event_t)&compaction_swapper_running);
2825
2826 clock_get_uptime(&endTime);
2827 SUB_ABSOLUTETIME(&endTime, &startTime);
2828 absolutetime_to_nanoseconds(endTime, &nsec);
2829
2830 HIBLOG("vm_compressor_flush completed - took %qd msecs - vm_num_swap_files = %d, vm_num_pinned_swap_files = %d, vm_swappin_enabled = %d\n",
2831 nsec / 1000000ULL, vm_num_swap_files, vm_num_pinned_swap_files, vm_swappin_enabled);
2832 }
2833
2834
2835 int compaction_swap_trigger_thread_awakened = 0;
2836
2837 static void
2838 vm_compressor_swap_trigger_thread(void)
2839 {
2840 current_thread()->options |= TH_OPT_VMPRIV;
2841
2842 /*
2843 * compaction_swapper_init_now is set when the first call to
2844 * vm_consider_waking_compactor_swapper is made from
2845 * vm_pageout_scan... since this function is called upon
2846 * thread creation, we want to make sure to delay adjusting
2847 * the tuneables until we are awakened via vm_pageout_scan
2848 * so that we are at a point where the vm_swapfile_open will
2849 * be operating on the correct directory (in case the default
2850 * of using the VM volume is overridden by the dynamic_pager)
2851 */
2852 if (compaction_swapper_init_now) {
2853 vm_compaction_swapper_do_init();
2854
2855 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
2856 thread_vm_bind_group_add();
2857 }
2858 #if CONFIG_THREAD_GROUPS
2859 thread_group_vm_add();
2860 #endif
2861 thread_set_thread_name(current_thread(), "VM_cswap_trigger");
2862 compaction_swapper_init_now = 0;
2863 }
2864 lck_mtx_lock_spin_always(c_list_lock);
2865
2866 compaction_swap_trigger_thread_awakened++;
2867 compaction_swapper_awakened = 0;
2868
2869 if (compaction_swapper_running == 0) {
2870 compaction_swapper_running = 1;
2871
2872 vm_compressor_compact_and_swap(FALSE);
2873
2874 compaction_swapper_running = 0;
2875 }
2876 assert_wait((event_t)&c_compressor_swap_trigger, THREAD_UNINT);
2877
2878 if (compaction_swapper_running == 0) {
2879 thread_wakeup((event_t)&compaction_swapper_running);
2880 }
2881
2882 lck_mtx_unlock_always(c_list_lock);
2883
2884 thread_block((thread_continue_t)vm_compressor_swap_trigger_thread);
2885
2886 /* NOTREACHED */
2887 }
2888
2889
2890 void
2891 vm_compressor_record_warmup_start(void)
2892 {
2893 c_segment_t c_seg;
2894
2895 lck_mtx_lock_spin_always(c_list_lock);
2896
2897 if (first_c_segment_to_warm_generation_id == 0) {
2898 if (!queue_empty(&c_age_list_head)) {
2899 c_seg = (c_segment_t)queue_last(&c_age_list_head);
2900
2901 first_c_segment_to_warm_generation_id = c_seg->c_generation_id;
2902 } else {
2903 first_c_segment_to_warm_generation_id = 0;
2904 }
2905
2906 fastwake_recording_in_progress = TRUE;
2907 }
2908 lck_mtx_unlock_always(c_list_lock);
2909 }
2910
2911
2912 void
2913 vm_compressor_record_warmup_end(void)
2914 {
2915 c_segment_t c_seg;
2916
2917 lck_mtx_lock_spin_always(c_list_lock);
2918
2919 if (fastwake_recording_in_progress == TRUE) {
2920 if (!queue_empty(&c_age_list_head)) {
2921 c_seg = (c_segment_t)queue_last(&c_age_list_head);
2922
2923 last_c_segment_to_warm_generation_id = c_seg->c_generation_id;
2924 } else {
2925 last_c_segment_to_warm_generation_id = first_c_segment_to_warm_generation_id;
2926 }
2927
2928 fastwake_recording_in_progress = FALSE;
2929
2930 HIBLOG("vm_compressor_record_warmup (%qd - %qd)\n", first_c_segment_to_warm_generation_id, last_c_segment_to_warm_generation_id);
2931 }
2932 lck_mtx_unlock_always(c_list_lock);
2933 }
2934
2935
2936 #define DELAY_TRIM_ON_WAKE_SECS 25
2937
2938 void
2939 vm_compressor_delay_trim(void)
2940 {
2941 clock_sec_t sec;
2942 clock_nsec_t nsec;
2943
2944 clock_get_system_nanotime(&sec, &nsec);
2945 dont_trim_until_ts = sec + DELAY_TRIM_ON_WAKE_SECS;
2946 }
2947
2948
2949 void
2950 vm_compressor_do_warmup(void)
2951 {
2952 lck_mtx_lock_spin_always(c_list_lock);
2953
2954 if (first_c_segment_to_warm_generation_id == last_c_segment_to_warm_generation_id) {
2955 first_c_segment_to_warm_generation_id = last_c_segment_to_warm_generation_id = 0;
2956
2957 lck_mtx_unlock_always(c_list_lock);
2958 return;
2959 }
2960
2961 if (compaction_swapper_running == 0 && compaction_swapper_awakened == 0) {
2962 fastwake_warmup = TRUE;
2963
2964 compaction_swapper_awakened = 1;
2965 thread_wakeup((event_t)&c_compressor_swap_trigger);
2966 }
2967 lck_mtx_unlock_always(c_list_lock);
2968 }
2969
2970 void
2971 do_fastwake_warmup_all(void)
2972 {
2973 lck_mtx_lock_spin_always(c_list_lock);
2974
2975 if (queue_empty(&c_swappedout_list_head) && queue_empty(&c_swappedout_sparse_list_head)) {
2976 lck_mtx_unlock_always(c_list_lock);
2977 return;
2978 }
2979
2980 fastwake_warmup = TRUE;
2981
2982 do_fastwake_warmup(&c_swappedout_list_head, TRUE);
2983
2984 do_fastwake_warmup(&c_swappedout_sparse_list_head, TRUE);
2985
2986 fastwake_warmup = FALSE;
2987
2988 lck_mtx_unlock_always(c_list_lock);
2989 }
2990
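/*
 * prefetch compressed segments from swap after a hibernation resume.
 * with consider_all_cseg == FALSE only segments whose generation ids fall
 * within the recorded warmup window are swapped in, and we bail out early
 * if free memory drops too low.
 */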
2991 void
2992 do_fastwake_warmup(queue_head_t *c_queue, boolean_t consider_all_cseg)
2993 {
2994 c_segment_t c_seg = NULL;
2995 AbsoluteTime startTime, endTime;
2996 uint64_t nsec;
2997
2998
2999 HIBLOG("vm_compressor_fastwake_warmup (%qd - %qd) - starting\n", first_c_segment_to_warm_generation_id, last_c_segment_to_warm_generation_id);
3000
3001 clock_get_uptime(&startTime);
3002
3003 lck_mtx_unlock_always(c_list_lock);
3004
3005 proc_set_thread_policy(current_thread(),
3006 TASK_POLICY_INTERNAL, TASK_POLICY_IO, THROTTLE_LEVEL_COMPRESSOR_TIER2);
3007
3008 PAGE_REPLACEMENT_DISALLOWED(TRUE);
3009
3010 lck_mtx_lock_spin_always(c_list_lock);
3011
3012 while (!queue_empty(c_queue) && fastwake_warmup == TRUE) {
3013 c_seg = (c_segment_t) queue_first(c_queue);
3014
3015 if (consider_all_cseg == FALSE) {
3016 if (c_seg->c_generation_id < first_c_segment_to_warm_generation_id ||
3017 c_seg->c_generation_id > last_c_segment_to_warm_generation_id) {
3018 break;
3019 }
3020
3021 if (vm_page_free_count < (AVAILABLE_MEMORY / 4)) {
3022 break;
3023 }
3024 }
3025
3026 lck_mtx_lock_spin_always(&c_seg->c_lock);
3027 lck_mtx_unlock_always(c_list_lock);
3028
3029 if (c_seg->c_busy) {
3030 PAGE_REPLACEMENT_DISALLOWED(FALSE);
3031 c_seg_wait_on_busy(c_seg);
3032 PAGE_REPLACEMENT_DISALLOWED(TRUE);
3033 } else {
3034 if (c_seg_swapin(c_seg, TRUE, FALSE) == 0) {
3035 lck_mtx_unlock_always(&c_seg->c_lock);
3036 }
3037 c_segment_warmup_count++;
3038
3039 PAGE_REPLACEMENT_DISALLOWED(FALSE);
3040 vm_pageout_io_throttle();
3041 PAGE_REPLACEMENT_DISALLOWED(TRUE);
3042 }
3043 lck_mtx_lock_spin_always(c_list_lock);
3044 }
3045 lck_mtx_unlock_always(c_list_lock);
3046
3047 PAGE_REPLACEMENT_DISALLOWED(FALSE);
3048
3049 proc_set_thread_policy(current_thread(),
3050 TASK_POLICY_INTERNAL, TASK_POLICY_IO, THROTTLE_LEVEL_COMPRESSOR_TIER0);
3051
3052 clock_get_uptime(&endTime);
3053 SUB_ABSOLUTETIME(&endTime, &startTime);
3054 absolutetime_to_nanoseconds(endTime, &nsec);
3055
3056 HIBLOG("vm_compressor_fastwake_warmup completed - took %qd msecs\n", nsec / 1000000ULL);
3057
3058 lck_mtx_lock_spin_always(c_list_lock);
3059
3060 if (consider_all_cseg == FALSE) {
3061 first_c_segment_to_warm_generation_id = last_c_segment_to_warm_generation_id = 0;
3062 }
3063 }
3064
3065 int min_csegs_per_major_compaction = DELAYED_COMPACTIONS_PER_PASS;
3066 extern bool vm_swapout_thread_running;
3067 extern boolean_t compressor_store_stop_compaction;
3068
3069 void
3070 vm_compressor_compact_and_swap(boolean_t flush_all)
3071 {
3072 c_segment_t c_seg, c_seg_next;
3073 boolean_t keep_compacting, switch_state;
3074 clock_sec_t now;
3075 clock_nsec_t nsec;
3076 mach_timespec_t start_ts, end_ts;
3077 unsigned int number_considered, wanted_cseg_found, yield_after_considered_per_pass, number_yields;
3078 uint64_t bytes_to_free, bytes_freed, delta_usec;
3079
3080 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_START, c_age_count, c_minor_count, c_major_count, vm_page_free_count);
3081
3082 if (fastwake_warmup == TRUE) {
3083 uint64_t starting_warmup_count;
3084
3085 starting_warmup_count = c_segment_warmup_count;
3086
3087 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 11) | DBG_FUNC_START, c_segment_warmup_count,
3088 first_c_segment_to_warm_generation_id, last_c_segment_to_warm_generation_id, 0, 0);
3089 do_fastwake_warmup(&c_swappedout_list_head, FALSE);
3090 KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 11) | DBG_FUNC_END, c_segment_warmup_count, c_segment_warmup_count - starting_warmup_count, 0, 0, 0);
3091
3092 fastwake_warmup = FALSE;
3093 }
3094
3095 #if (XNU_TARGET_OS_OSX && __arm64__)
3096 /*
3097 * Re-considering major csegs showed benefits on all platforms by
3098 * significantly reducing fragmentation and getting back memory.
3099 * However, on smaller devices, e.g. the watch, there was increased power
3100 * use for the additional compactions. And the turnover in csegs on
3101 * those smaller platforms is high enough in the decompression/free
3102 * path that we can skip reconsidering them here because we already
3103 * consider them for major compaction in those paths.
3104 */
3105 vm_compressor_process_major_segments();
3106 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
3107
3108 /*
3109 * it's possible for the c_age_list_head to be empty if we
3110 * hit our limits for growing the compressor pool and we subsequently
3111 * hibernated... on the next hibernation we could see the queue as
3112 * empty and not proceed even though we have a bunch of segments on
3113 * the swapped-in queue that need to be dealt with.
3114 */
3115 vm_compressor_do_delayed_compactions(flush_all);
3116
3117 vm_compressor_age_swapped_in_segments(flush_all);
3118
3119 /*
3120 * we only need to grab the timestamp once per
3121 * invocation of this function since the
3122 * timescale we're interested in is measured
3123 * in days
3124 */
3125 clock_get_system_nanotime(&now, &nsec);
3126
3127 start_ts.tv_sec = (int) now;
3128 start_ts.tv_nsec = nsec;
3129 delta_usec = 0;
3130 number_considered = 0;
3131 wanted_cseg_found = 0;
3132 number_yields = 0;
3133 bytes_to_free = 0;
3134 bytes_freed = 0;
3135 yield_after_considered_per_pass = MAX(min_csegs_per_major_compaction, DELAYED_COMPACTIONS_PER_PASS);
3136
3137 while (!queue_empty(&c_age_list_head) && !compaction_swapper_abort && !compressor_store_stop_compaction) {
3138 if (hibernate_flushing == TRUE) {
3139 clock_sec_t sec;
3140
3141 if (hibernate_should_abort()) {
3142 HIBLOG("vm_compressor_flush - hibernate_should_abort returned TRUE\n");
3143 break;
3144 }
3145 if (hibernate_no_swapspace == TRUE) {
3146 HIBLOG("vm_compressor_flush - out of swap space\n");
3147 break;
3148 }
3149 if (vm_swap_files_pinned() == FALSE) {
3150 HIBLOG("vm_compressor_flush - unpinned swap files\n");
3151 break;
3152 }
3153 if (hibernate_in_progress_with_pinned_swap == TRUE &&
3154 (vm_swapfile_total_segs_alloced == vm_swapfile_total_segs_used)) {
3155 HIBLOG("vm_compressor_flush - out of pinned swap space\n");
3156 break;
3157 }
3158 clock_get_system_nanotime(&sec, &nsec);
3159
3160 if (sec > hibernate_flushing_deadline) {
3161 hibernate_flush_timed_out = TRUE;
3162 HIBLOG("vm_compressor_flush - failed to finish before deadline\n");
3163 break;
3164 }
3165 }
3166 if (!vm_swap_out_of_space() && c_swapout_count >= C_SWAPOUT_LIMIT) {
3167 assert_wait_timeout((event_t) &compaction_swapper_running, THREAD_INTERRUPTIBLE, 100, 1000 * NSEC_PER_USEC);
3168
3169 if (!vm_swapout_thread_running) {
3170 thread_wakeup((event_t)&c_swapout_list_head);
3171 }
3172
3173 lck_mtx_unlock_always(c_list_lock);
3174
3175 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 1, c_swapout_count, 0, 0);
3176
3177 thread_block(THREAD_CONTINUE_NULL);
3178
3179 lck_mtx_lock_spin_always(c_list_lock);
3180 }
3181 /*
3182 * Minor compactions
3183 */
3184 vm_compressor_do_delayed_compactions(flush_all);
3185
3186 vm_compressor_age_swapped_in_segments(flush_all);
3187
3188 if (!vm_swap_out_of_space() && c_swapout_count >= C_SWAPOUT_LIMIT) {
3189 /*
3190 * we timed out on the above thread_block...
3191 * let's loop around and try again;
3192 * the timeout allows us to continue
3193 * to do minor compactions to make
3194 * more memory available
3195 */
3196 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 2, c_swapout_count, 0, 0);
3197
3198 continue;
3199 }
3200
3201 /*
3202 * Swap out segments?
3203 */
3204 if (flush_all == FALSE) {
3205 boolean_t needs_to_swap;
3206
3207 lck_mtx_unlock_always(c_list_lock);
3208
3209 needs_to_swap = compressor_needs_to_swap();
3210
3211 lck_mtx_lock_spin_always(c_list_lock);
3212
3213 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 3, needs_to_swap, 0, 0);
3214
3215 if (needs_to_swap == FALSE) {
3216 break;
3217 }
3218 }
3219 if (queue_empty(&c_age_list_head)) {
3220 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 4, c_age_count, 0, 0);
3221 break;
3222 }
3223 c_seg = (c_segment_t) queue_first(&c_age_list_head);
3224
3225 assert(c_seg->c_state == C_ON_AGE_Q);
3226
3227 if (flush_all == TRUE && c_seg->c_generation_id > c_generation_id_flush_barrier) {
3228 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 5, 0, 0, 0);
3229 break;
3230 }
3231
3232 lck_mtx_lock_spin_always(&c_seg->c_lock);
3233
3234 if (c_seg->c_busy) {
3235 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 6, (void*) VM_KERNEL_ADDRPERM(c_seg), 0, 0);
3236
3237 lck_mtx_unlock_always(c_list_lock);
3238 c_seg_wait_on_busy(c_seg);
3239 lck_mtx_lock_spin_always(c_list_lock);
3240
3241 continue;
3242 }
3243 C_SEG_BUSY(c_seg);
3244
3245 if (c_seg_do_minor_compaction_and_unlock(c_seg, FALSE, TRUE, TRUE)) {
3246 /*
3247 * found an empty c_segment and freed it
3248 * so go grab the next guy in the queue
3249 */
3250 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 7, 0, 0, 0);
3251 c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_freed_segs++;
3252 continue;
3253 }
3254 /*
3255 * Major compaction
3256 */
3257 keep_compacting = TRUE;
3258 switch_state = TRUE;
3259
3260 while (keep_compacting == TRUE) {
3261 assert(c_seg->c_busy);
3262
3263 /* look for another segment to consolidate */
3264
3265 c_seg_next = (c_segment_t) queue_next(&c_seg->c_age_list);
3266
3267 if (queue_end(&c_age_list_head, (queue_entry_t)c_seg_next)) {
3268 break;
3269 }
3270
3271 assert(c_seg_next->c_state == C_ON_AGE_Q);
3272
3273 number_considered++;
3274
3275 if (c_seg_major_compact_ok(c_seg, c_seg_next) == FALSE) {
3276 break;
3277 }
3278
3279 lck_mtx_lock_spin_always(&c_seg_next->c_lock);
3280
3281 if (c_seg_next->c_busy) {
3282 /*
3283 * We are going to block for our neighbor.
3284 * If our c_seg is wanted, we should unbusy
3285 * it because we don't know how long we might
3286 * have to block here.
3287 */
3288 if (c_seg->c_wanted) {
3289 lck_mtx_unlock_always(&c_seg_next->c_lock);
3290 switch_state = FALSE;
3291 c_seg_major_compact_stats[c_seg_major_compact_stats_now].bailed_compactions++;
3292 wanted_cseg_found++;
3293 break;
3294 }
3295
3296 lck_mtx_unlock_always(c_list_lock);
3297
3298 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 8, (void*) VM_KERNEL_ADDRPERM(c_seg_next), 0, 0);
3299
3300 c_seg_wait_on_busy(c_seg_next);
3301 lck_mtx_lock_spin_always(c_list_lock);
3302
3303 continue;
3304 }
3305 /* grab that segment */
3306 C_SEG_BUSY(c_seg_next);
3307
3308 bytes_to_free = C_SEG_OFFSET_TO_BYTES(c_seg_next->c_populated_offset);
3309 if (c_seg_do_minor_compaction_and_unlock(c_seg_next, FALSE, TRUE, TRUE)) {
3310 /*
3311 * found an empty c_segment and freed it
3312 * so we can't continue to use c_seg_next
3313 */
3314 bytes_freed += bytes_to_free;
3315 c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_freed_segs++;
3316 continue;
3317 }
3318
3319 /* unlock the list ... */
3320 lck_mtx_unlock_always(c_list_lock);
3321
3322 /* do the major compaction */
3323
3324 keep_compacting = c_seg_major_compact(c_seg, c_seg_next);
3325
3326 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 9, keep_compacting, 0, 0);
3327
3328 PAGE_REPLACEMENT_DISALLOWED(TRUE);
3329
3330 lck_mtx_lock_spin_always(&c_seg_next->c_lock);
3331 /*
3332 * run a minor compaction on the donor segment
3333 * since we pulled at least some of its
3334 * data into our target... if we've emptied
3335 * it, now is a good time to free it, which
3336 * c_seg_minor_compaction_and_unlock also takes care of
3337 *
3338 * by passing TRUE, we ask for c_busy to be cleared
3339 * and c_wanted to be taken care of
3340 */
3341 bytes_to_free = C_SEG_OFFSET_TO_BYTES(c_seg_next->c_populated_offset);
3342 if (c_seg_minor_compaction_and_unlock(c_seg_next, TRUE)) {
3343 bytes_freed += bytes_to_free;
3344 c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_freed_segs++;
3345 } else {
3346 bytes_to_free -= C_SEG_OFFSET_TO_BYTES(c_seg_next->c_populated_offset);
3347 bytes_freed += bytes_to_free;
3348 }
3349
3350 PAGE_REPLACEMENT_DISALLOWED(FALSE);
3351
3352 /* relock the list */
3353 lck_mtx_lock_spin_always(c_list_lock);
3354
3355 if (c_seg->c_wanted) {
3356 /*
3357 * Our c_seg is in demand. Let's
3358 * unbusy it and wakeup the waiters
3359 * instead of continuing the compaction
3360 * because we could be in this loop
3361 * for a while.
3362 */
3363 switch_state = FALSE;
3364 wanted_cseg_found++;
3365 c_seg_major_compact_stats[c_seg_major_compact_stats_now].bailed_compactions++;
3366 break;
3367 }
3368 } /* major compaction */
3369
3370 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 10, number_considered, wanted_cseg_found, 0);
3371
3372 lck_mtx_lock_spin_always(&c_seg->c_lock);
3373
3374 assert(c_seg->c_busy);
3375 assert(!c_seg->c_on_minorcompact_q);
3376
3377 if (switch_state) {
3378 if (VM_CONFIG_SWAP_IS_ACTIVE) {
3379 int new_state = C_ON_SWAPOUT_Q;
3380
3381 #if (XNU_TARGET_OS_OSX && __arm64__)
3382 if (flush_all == false && compressor_swapout_conditions_met() == false) {
3383 new_state = C_ON_MAJORCOMPACT_Q;
3384 }
3385 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
3386
3387 if (new_state == C_ON_SWAPOUT_Q) {
3388 /*
3389 * This mode of putting a generic c_seg on the swapout list is
3390 * only supported when we have general swapping enabled
3391 */
3392 clock_sec_t lnow;
3393 clock_nsec_t lnsec;
3394 clock_get_system_nanotime(&lnow, &lnsec);
3395 if (c_seg->c_agedin_ts && (lnow - c_seg->c_agedin_ts) < 30) {
3396 vmcs_stats.unripe_under_30s++;
3397 } else if (c_seg->c_agedin_ts && (lnow - c_seg->c_agedin_ts) < 60) {
3398 vmcs_stats.unripe_under_60s++;
3399 } else if (c_seg->c_agedin_ts && (lnow - c_seg->c_agedin_ts) < 300) {
3400 vmcs_stats.unripe_under_300s++;
3401 }
3402 }
3403
3404 c_seg_switch_state(c_seg, new_state, FALSE);
3405 } else {
3406 if ((vm_swapout_ripe_segments == TRUE && c_overage_swapped_count < c_overage_swapped_limit)) {
3407 assert(VM_CONFIG_SWAP_IS_PRESENT);
3408 /*
3409 * we are running compressor sweeps with swap-behind;
3410 * make sure the c_seg has aged enough before swapping it
3411 * out...
3412 */
3413 if ((now - c_seg->c_creation_ts) >= vm_ripe_target_age) {
3414 c_seg->c_overage_swap = TRUE;
3415 c_overage_swapped_count++;
3416 c_seg_switch_state(c_seg, C_ON_SWAPOUT_Q, FALSE);
3417 }
3418 }
3419 }
3420 if (c_seg->c_state == C_ON_AGE_Q) {
3421 /*
3422 * this c_seg didn't get moved to the swapout queue
3423 * so we need to move it out of the way...
3424 * we just did a major compaction on it so put it
3425 * on that queue
3426 */
3427 c_seg_switch_state(c_seg, C_ON_MAJORCOMPACT_Q, FALSE);
3428 } else {
3429 c_seg_major_compact_stats[c_seg_major_compact_stats_now].wasted_space_in_swapouts += c_seg_bufsize - c_seg->c_bytes_used;
3430 c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_swapouts++;
3431 }
3432 }
3433
3434 C_SEG_WAKEUP_DONE(c_seg);
3435
3436 lck_mtx_unlock_always(&c_seg->c_lock);
3437
3438 if (c_swapout_count) {
3439 /*
3440 * We don't pause/yield here because we will either
3441 * yield below or at the top of the loop with the
3442 * assert_wait_timeout.
3443 */
3444 if (!vm_swapout_thread_running) {
3445 thread_wakeup((event_t)&c_swapout_list_head);
3446 }
3447 }
3448
3449 if (number_considered >= yield_after_considered_per_pass) {
3450 if (wanted_cseg_found) {
3451 /*
3452 * We stopped major compactions on a c_seg
3453 * that is wanted. We don't know the priority
3454 * of the waiter unfortunately but we are at
3455 * a very high priority and so, just in case
3456 * the waiter is a critical system daemon or
3457 * UI thread, let's give up the CPU in case
3458 * the system is running a few CPU intensive
3459 * tasks.
3460 */
3461 lck_mtx_unlock_always(c_list_lock);
3462
3463 mutex_pause(2); /* 100us yield */
3464
3465 number_yields++;
3466
3467 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 11, number_considered, number_yields, 0);
3468
3469 lck_mtx_lock_spin_always(c_list_lock);
3470 }
3471
3472 number_considered = 0;
3473 wanted_cseg_found = 0;
3474 }
3475 }
3476 clock_get_system_nanotime(&now, &nsec);
3477 end_ts.tv_sec = (int) now;
3478 end_ts.tv_nsec = nsec;
3479
3480 SUB_MACH_TIMESPEC(&end_ts, &start_ts);
3481
3482 delta_usec = (end_ts.tv_sec * USEC_PER_SEC) + (end_ts.tv_nsec / NSEC_PER_USEC) - (number_yields * 100);
3483
3484 delta_usec = MAX(1, delta_usec); /* we could have 0 usec run if conditions weren't right */
3485
3486 c_seg_major_compact_stats[c_seg_major_compact_stats_now].bytes_freed_rate_us = (bytes_freed / delta_usec);
3487
3488 if ((c_seg_major_compact_stats_now + 1) == C_SEG_MAJOR_COMPACT_STATS_MAX) {
3489 c_seg_major_compact_stats_now = 0;
3490 } else {
3491 c_seg_major_compact_stats_now++;
3492 }
3493
3494 assert(c_seg_major_compact_stats_now < C_SEG_MAJOR_COMPACT_STATS_MAX);
3495
3496 VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_END, c_age_count, c_minor_count, c_major_count, vm_page_free_count);
3497 }
3498
3499
3500 static c_segment_t
3501 c_seg_allocate(c_segment_t *current_chead)
3502 {
3503 c_segment_t c_seg;
3504 int min_needed;
3505 int size_to_populate;
3506
3507 #if XNU_TARGET_OS_OSX
3508 if (vm_compressor_low_on_space()) {
3509 vm_compressor_take_paging_space_action();
3510 }
3511 #endif /* XNU_TARGET_OS_OSX */
3512
3513 if ((c_seg = *current_chead) == NULL) {
3514 uint32_t c_segno;
3515
3516 lck_mtx_lock_spin_always(c_list_lock);
3517
3518 while (c_segments_busy == TRUE) {
3519 assert_wait((event_t) (&c_segments_busy), THREAD_UNINT);
3520
3521 lck_mtx_unlock_always(c_list_lock);
3522
3523 thread_block(THREAD_CONTINUE_NULL);
3524
3525 lck_mtx_lock_spin_always(c_list_lock);
3526 }
3527 if (c_free_segno_head == (uint32_t)-1) {
3528 uint32_t c_segments_available_new;
3529 uint32_t compressed_pages;
3530
3531 #if CONFIG_FREEZE
3532 if (freezer_incore_cseg_acct) {
3533 compressed_pages = c_segment_pages_compressed_incore;
3534 } else {
3535 compressed_pages = c_segment_pages_compressed;
3536 }
3537 #else
3538 compressed_pages = c_segment_pages_compressed;
3539 #endif /* CONFIG_FREEZE */
3540
3541 if (c_segments_available >= c_segments_limit || compressed_pages >= c_segment_pages_compressed_limit) {
3542 lck_mtx_unlock_always(c_list_lock);
3543
3544 return NULL;
3545 }
3546 c_segments_busy = TRUE;
3547 lck_mtx_unlock_always(c_list_lock);
3548
3549 kernel_memory_populate((vm_offset_t)c_segments_next_page,
3550 PAGE_SIZE, KMA_NOFAIL | KMA_KOBJECT,
3551 VM_KERN_MEMORY_COMPRESSOR);
3552 c_segments_next_page += PAGE_SIZE;
3553
3554 c_segments_available_new = c_segments_available + C_SEGMENTS_PER_PAGE;
3555
3556 if (c_segments_available_new > c_segments_limit) {
3557 c_segments_available_new = c_segments_limit;
3558 }
3559
3560 for (c_segno = c_segments_available + 1; c_segno < c_segments_available_new; c_segno++) {
3561 c_segments[c_segno - 1].c_segno = c_segno;
3562 }
3563
3564 lck_mtx_lock_spin_always(c_list_lock);
3565
3566 c_segments[c_segno - 1].c_segno = c_free_segno_head;
3567 c_free_segno_head = c_segments_available;
3568 c_segments_available = c_segments_available_new;
3569
3570 c_segments_busy = FALSE;
3571 thread_wakeup((event_t) (&c_segments_busy));
3572 }
3573 c_segno = c_free_segno_head;
3574 assert(c_segno >= 0 && c_segno < c_segments_limit);
3575
3576 c_free_segno_head = (uint32_t)c_segments[c_segno].c_segno;
3577
3578 /*
3579 * do the rest of the bookkeeping now while we're still behind
3580 * the list lock and grab our generation id now into a local
3581 * so that we can install it once we have the c_seg allocated
3582 */
3583 c_segment_count++;
3584 if (c_segment_count > c_segment_count_max) {
3585 c_segment_count_max = c_segment_count;
3586 }
3587
3588 lck_mtx_unlock_always(c_list_lock);
3589
3590 c_seg = zalloc_flags(compressor_segment_zone, Z_WAITOK | Z_ZERO);
3591
3592 c_seg->c_store.c_buffer = (int32_t *)C_SEG_BUFFER_ADDRESS(c_segno);
3593
3594 lck_mtx_init(&c_seg->c_lock, &vm_compressor_lck_grp, LCK_ATTR_NULL);
3595
3596 c_seg->c_state = C_IS_EMPTY;
3597 c_seg->c_firstemptyslot = C_SLOT_MAX_INDEX;
3598 c_seg->c_mysegno = c_segno;
3599
3600 lck_mtx_lock_spin_always(c_list_lock);
3601 c_empty_count++;
3602 c_seg_switch_state(c_seg, C_IS_FILLING, FALSE);
3603 c_segments[c_segno].c_seg = c_seg;
3604 assert(c_segments[c_segno].c_segno > c_segments_available);
3605 lck_mtx_unlock_always(c_list_lock);
3606
3607 *current_chead = c_seg;
3608
3609 #if DEVELOPMENT || DEBUG
3610 C_SEG_MAKE_WRITEABLE(c_seg);
3611 #endif
3612 }
3613 c_seg_alloc_nextslot(c_seg);
3614
3615 size_to_populate = c_seg_allocsize - C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset);
3616
3617 if (size_to_populate) {
3618 min_needed = PAGE_SIZE + (c_seg_allocsize - c_seg_bufsize);
3619
3620 if (C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset - c_seg->c_nextoffset) < (unsigned) min_needed) {
3621 if (size_to_populate > C_SEG_MAX_POPULATE_SIZE) {
3622 size_to_populate = C_SEG_MAX_POPULATE_SIZE;
3623 }
3624
3625 OSAddAtomic64(size_to_populate / PAGE_SIZE, &vm_pageout_vminfo.vm_compressor_pages_grabbed);
3626
3627 kernel_memory_populate(
3628 (vm_offset_t) &c_seg->c_store.c_buffer[c_seg->c_populated_offset],
3629 size_to_populate,
3630 KMA_NOFAIL | KMA_COMPRESSOR,
3631 VM_KERN_MEMORY_COMPRESSOR);
3632 } else {
3633 size_to_populate = 0;
3634 }
3635 }
3636 PAGE_REPLACEMENT_DISALLOWED(TRUE);
3637
3638 lck_mtx_lock_spin_always(&c_seg->c_lock);
3639
3640 if (size_to_populate) {
3641 c_seg->c_populated_offset += C_SEG_BYTES_TO_OFFSET(size_to_populate);
3642 }
3643
3644 return c_seg;
3645 }
3646
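/*
 * Illustrative sketch (not part of the original source): the compressor keeps
 * its free segment slots as an intrusive, index-based free list threaded
 * through the c_segments[] array itself -- c_free_segno_head holds the index
 * of the first free slot and, while a slot is free, its c_segno field holds
 * the index of the next free slot.  In the kernel these manipulations happen
 * under c_list_lock.  The simplified, self-contained example below uses
 * hypothetical names and is a sketch of the same pop/push pattern only.
 */
#if 0 /* example only, never compiled */
#define EXAMPLE_NSLOTS 8

struct example_slot {
	uint32_t next_free;     /* link to the next free slot while this slot is free */
	void    *payload;       /* meaningful only while the slot is allocated */
};

static struct example_slot example_slots[EXAMPLE_NSLOTS];
static uint32_t example_free_head = (uint32_t)-1;       /* (uint32_t)-1 == empty list */

static uint32_t
example_slot_pop(void)
{
	uint32_t idx = example_free_head;

	if (idx != (uint32_t)-1) {
		example_free_head = example_slots[idx].next_free;
	}
	return idx;
}

static void
example_slot_push(uint32_t idx)
{
	example_slots[idx].next_free = example_free_head;
	example_free_head = idx;
}
#endif
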
3647 #if DEVELOPMENT || DEBUG
3648 #if CONFIG_FREEZE
3649 extern boolean_t memorystatus_freeze_to_memory;
3650 #endif /* CONFIG_FREEZE */
3651 #endif /* DEVELOPMENT || DEBUG */
3652
3653 #define TIME_SUB(rsecs, secs, rfrac, frac, unit) \
3654 MACRO_BEGIN \
3655 if ((int)((rfrac) -= (frac)) < 0) { \
3656 (rfrac) += (unit); \
3657 (rsecs) -= 1; \
3658 } \
3659 (rsecs) -= (secs); \
3660 MACRO_END
3661
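/*
 * Illustrative usage of TIME_SUB (not part of the original source): the
 * pattern used later in this file is to sample clock_get_system_nanotime()
 * before and after an operation and subtract the two samples in place,
 * borrowing one second when the nanosecond field underflows.  Variable names
 * below are hypothetical; this is a sketch only.
 */
#if 0 /* example only, never compiled */
{
	clock_sec_t  start_sec, end_sec;
	clock_nsec_t start_nsec, end_nsec;

	clock_get_system_nanotime(&start_sec, &start_nsec);
	/* ... the work being timed ... */
	clock_get_system_nanotime(&end_sec, &end_nsec);

	TIME_SUB(end_sec, start_sec, end_nsec, start_nsec, NSEC_PER_SEC);
	/* end_sec/end_nsec now hold the elapsed time */
	printf("elapsed %lu.%09u seconds\n", end_sec, end_nsec);
}
#endif
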
3662 uint64_t c_seg_filled_no_contention = 0;
3663 uint64_t c_seg_filled_contention = 0;
3664 clock_sec_t c_seg_filled_contention_sec_max = 0;
3665 clock_nsec_t c_seg_filled_contention_nsec_max = 0;
3666
3667 static void
3668 c_current_seg_filled(c_segment_t c_seg, c_segment_t *current_chead)
3669 {
3670 uint32_t unused_bytes;
3671 uint32_t offset_to_depopulate;
3672 int new_state = C_ON_AGE_Q;
3673 clock_sec_t sec;
3674 clock_nsec_t nsec;
3675 boolean_t head_insert = FALSE;
3676
3677 unused_bytes = trunc_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset - c_seg->c_nextoffset));
3678
3679 if (unused_bytes) {
3680 offset_to_depopulate = C_SEG_BYTES_TO_OFFSET(round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_nextoffset)));
3681
3682 /*
3683 * release the extra physical page(s) at the end of the segment
3684 */
3685 lck_mtx_unlock_always(&c_seg->c_lock);
3686
3687 kernel_memory_depopulate(
3688 (vm_offset_t) &c_seg->c_store.c_buffer[offset_to_depopulate],
3689 unused_bytes,
3690 KMA_COMPRESSOR,
3691 VM_KERN_MEMORY_COMPRESSOR);
3692
3693 lck_mtx_lock_spin_always(&c_seg->c_lock);
3694
3695 c_seg->c_populated_offset = offset_to_depopulate;
3696 }
3697 assert(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset) <= c_seg_bufsize);
3698
3699 #if DEVELOPMENT || DEBUG
3700 {
3701 boolean_t c_seg_was_busy = FALSE;
3702
3703 if (!c_seg->c_busy) {
3704 C_SEG_BUSY(c_seg);
3705 } else {
3706 c_seg_was_busy = TRUE;
3707 }
3708
3709 lck_mtx_unlock_always(&c_seg->c_lock);
3710
3711 C_SEG_WRITE_PROTECT(c_seg);
3712
3713 lck_mtx_lock_spin_always(&c_seg->c_lock);
3714
3715 if (c_seg_was_busy == FALSE) {
3716 C_SEG_WAKEUP_DONE(c_seg);
3717 }
3718 }
3719 #endif
3720
3721 #if CONFIG_FREEZE
3722 if (current_chead == (c_segment_t*) &(freezer_context_global.freezer_ctx_chead) &&
3723 VM_CONFIG_SWAP_IS_PRESENT &&
3724 VM_CONFIG_FREEZER_SWAP_IS_ACTIVE
3725 #if DEVELOPMENT || DEBUG
3726 && !memorystatus_freeze_to_memory
3727 #endif /* DEVELOPMENT || DEBUG */
3728 ) {
3729 new_state = C_ON_SWAPOUT_Q;
3730 }
3731 #endif /* CONFIG_FREEZE */
3732
3733 if (vm_darkwake_mode == TRUE) {
3734 new_state = C_ON_SWAPOUT_Q;
3735 head_insert = TRUE;
3736 }
3737
3738 clock_get_system_nanotime(&sec, &nsec);
3739 c_seg->c_creation_ts = (uint32_t)sec;
3740
3741 if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
3742 clock_sec_t sec2;
3743 clock_nsec_t nsec2;
3744
3745 lck_mtx_lock_spin_always(c_list_lock);
3746 clock_get_system_nanotime(&sec2, &nsec2);
3747 TIME_SUB(sec2, sec, nsec2, nsec, NSEC_PER_SEC);
3748 // printf("FBDP %s: head %p waited for c_list_lock for %lu.%09u seconds\n", __FUNCTION__, current_chead, sec2, nsec2);
3749 if (sec2 > c_seg_filled_contention_sec_max) {
3750 c_seg_filled_contention_sec_max = sec2;
3751 c_seg_filled_contention_nsec_max = nsec2;
3752 } else if (sec2 == c_seg_filled_contention_sec_max &&
3753 nsec2 > c_seg_filled_contention_nsec_max) {
3754 c_seg_filled_contention_nsec_max = nsec2;
3755 }
3756 c_seg_filled_contention++;
3757 } else {
3758 c_seg_filled_no_contention++;
3759 }
3760
3761 c_seg->c_generation_id = c_generation_id++;
3762 c_seg_switch_state(c_seg, new_state, head_insert);
3763
3764 #if CONFIG_FREEZE
3765 if (c_seg->c_state == C_ON_SWAPOUT_Q) {
3766 /*
3767 * darkwake and freezer can't coexist
3768 * We'll need to fix this accounting as a start.
3769 */
3770 assert(vm_darkwake_mode == FALSE);
3771 c_seg_update_task_owner(c_seg, freezer_context_global.freezer_ctx_task);
3772 freezer_context_global.freezer_ctx_swapped_bytes += c_seg->c_bytes_used;
3773 }
3774 #endif /* CONFIG_FREEZE */
3775
3776 if (c_seg->c_state == C_ON_AGE_Q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) {
3777 #if CONFIG_FREEZE
3778 assert(c_seg->c_task_owner == NULL);
3779 #endif /* CONFIG_FREEZE */
3780 c_seg_need_delayed_compaction(c_seg, TRUE);
3781 }
3782
3783 lck_mtx_unlock_always(c_list_lock);
3784
3785 if (c_seg->c_state == C_ON_SWAPOUT_Q) {
3786 /*
3787 * Darkwake and Freeze configs always
3788 * wake up the swapout thread because
3789 * the compactor thread that normally handles
3790 * it may not be running as much in these
3791 * configs.
3792 */
3793 thread_wakeup((event_t)&c_swapout_list_head);
3794 }
3795
3796 *current_chead = NULL;
3797 }
3798
3799
3800 #if (XNU_TARGET_OS_OSX && __arm64__)
3801 clock_nsec_t c_process_major_report_over_ms = 9; /* report if over 9 ms */
3802 int c_process_major_yield_after = 1000; /* yield after moving 1,000 segments */
3803 uint64_t c_process_major_reports = 0;
3804 clock_sec_t c_process_major_max_sec = 0;
3805 clock_nsec_t c_process_major_max_nsec = 0;
3806 uint32_t c_process_major_peak_segcount = 0;
3807 static void
3808 vm_compressor_process_major_segments(void)
3809 {
3810 c_segment_t c_seg = NULL;
3811 int count = 0, total = 0, breaks = 0;
3812 clock_sec_t start_sec, end_sec;
3813 clock_nsec_t start_nsec, end_nsec;
3814 clock_nsec_t report_over_ns;
3815
3816 if (queue_empty(&c_major_list_head)) {
3817 return;
3818 }
3819
3820 // printf("%s: starting to move segments from MAJORQ to AGEQ\n", __FUNCTION__);
3821 if (c_process_major_report_over_ms != 0) {
3822 report_over_ns = c_process_major_report_over_ms * NSEC_PER_MSEC;
3823 } else {
3824 report_over_ns = (clock_nsec_t)-1;
3825 }
3826 clock_get_system_nanotime(&start_sec, &start_nsec);
3827 while (!queue_empty(&c_major_list_head)) {
3828 /* start from the end to preserve aging order */
3829 c_seg = (c_segment_t)queue_last(&c_major_list_head);
3830 lck_mtx_lock_spin_always(&c_seg->c_lock);
3831 c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
3832 lck_mtx_unlock_always(&c_seg->c_lock);
3833
3834 count++;
3835 if (count == c_process_major_yield_after ||
3836 queue_empty(&c_major_list_head)) {
3837 /* done or time to take a break */
3838 } else {
3839 /* keep going */
3840 continue;
3841 }
3842
3843 total += count;
3844 clock_get_system_nanotime(&end_sec, &end_nsec);
3845 TIME_SUB(end_sec, start_sec, end_nsec, start_nsec, NSEC_PER_SEC);
3846 if (end_sec > c_process_major_max_sec) {
3847 c_process_major_max_sec = end_sec;
3848 c_process_major_max_nsec = end_nsec;
3849 } else if (end_sec == c_process_major_max_sec &&
3850 end_nsec > c_process_major_max_nsec) {
3851 c_process_major_max_nsec = end_nsec;
3852 }
3853 if (total > c_process_major_peak_segcount) {
3854 c_process_major_peak_segcount = total;
3855 }
3856 if (end_sec > 0 ||
3857 end_nsec >= report_over_ns) {
3858 /* we used more than expected */
3859 c_process_major_reports++;
3860 printf("%s: moved %d/%d segments from MAJORQ to AGEQ in %lu.%09u seconds and %d breaks\n",
3861 __FUNCTION__, count, total,
3862 end_sec, end_nsec, breaks);
3863 }
3864 if (queue_empty(&c_major_list_head)) {
3865 /* done */
3866 break;
3867 }
3868 /* take a break to allow someone else to grab the lock */
3869 lck_mtx_unlock_always(c_list_lock);
3870 mutex_pause(0); /* 10 microseconds */
3871 lck_mtx_lock_spin_always(c_list_lock);
3872 /* start again */
3873 clock_get_system_nanotime(&start_sec, &start_nsec);
3874 count = 0;
3875 breaks++;
3876 }
3877 }
3878 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
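/*
 * Illustrative sketch (not part of the original source): the routine above
 * drains a long queue while holding c_list_lock, but bounds how long other
 * threads can be starved by dropping the lock and pausing after every
 * c_process_major_yield_after segments.  The generic batched-drain pattern it
 * follows is sketched below with hypothetical names (example_list_lock,
 * example_queue, EXAMPLE_BATCH_SIZE); it is not kernel code.
 */
#if 0 /* example only, never compiled */
static void
example_drain_with_yield(void)
{
	int moved_this_batch = 0;

	/* caller is assumed to hold example_list_lock, as with c_list_lock above */
	while (!queue_empty(&example_queue)) {
		example_move_one_entry();       /* hypothetical per-item work */

		if (++moved_this_batch < EXAMPLE_BATCH_SIZE) {
			continue;
		}
		/* batch complete: let any waiters take the lock before continuing */
		lck_mtx_unlock_always(example_list_lock);
		mutex_pause(0);                 /* brief pause, ~10 microseconds */
		lck_mtx_lock_spin_always(example_list_lock);
		moved_this_batch = 0;
	}
}
#endif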
3879
3880 /*
3881 * returns with c_seg locked
3882 */
3883 void
3884 c_seg_swapin_requeue(c_segment_t c_seg, boolean_t has_data, boolean_t minor_compact_ok, boolean_t age_on_swapin_q)
3885 {
3886 clock_sec_t sec;
3887 clock_nsec_t nsec;
3888
3889 clock_get_system_nanotime(&sec, &nsec);
3890
3891 lck_mtx_lock_spin_always(c_list_lock);
3892 lck_mtx_lock_spin_always(&c_seg->c_lock);
3893
3894 assert(c_seg->c_busy_swapping);
3895 assert(c_seg->c_busy);
3896
3897 c_seg->c_busy_swapping = 0;
3898
3899 if (c_seg->c_overage_swap == TRUE) {
3900 c_overage_swapped_count--;
3901 c_seg->c_overage_swap = FALSE;
3902 }
3903 if (has_data == TRUE) {
3904 if (age_on_swapin_q == TRUE) {
3905 c_seg_switch_state(c_seg, C_ON_SWAPPEDIN_Q, FALSE);
3906 } else {
3907 c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
3908 }
3909
3910 if (minor_compact_ok == TRUE && !c_seg->c_on_minorcompact_q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) {
3911 c_seg_need_delayed_compaction(c_seg, TRUE);
3912 }
3913 } else {
3914 c_seg->c_store.c_buffer = (int32_t*) NULL;
3915 c_seg->c_populated_offset = C_SEG_BYTES_TO_OFFSET(0);
3916
3917 c_seg_switch_state(c_seg, C_ON_BAD_Q, FALSE);
3918 }
3919 c_seg->c_swappedin_ts = (uint32_t)sec;
3920 c_seg->c_swappedin = true;
3921
3922 lck_mtx_unlock_always(c_list_lock);
3923 }
3924
3925
3926
3927 /*
3928 * c_seg has to be locked and is returned locked if the c_seg isn't freed
3929 * PAGE_REPLACEMENT_DISALLOWED has to be TRUE on entry and is returned TRUE
3930 * c_seg_swapin returns 1 if the c_seg was freed, 0 otherwise
3931 */
3932
3933 int
3934 c_seg_swapin(c_segment_t c_seg, boolean_t force_minor_compaction, boolean_t age_on_swapin_q)
3935 {
3936 vm_offset_t addr = 0;
3937 uint32_t io_size = 0;
3938 uint64_t f_offset;
3939 thread_pri_floor_t token;
3940
3941 assert(C_SEG_IS_ONDISK(c_seg));
3942
3943 #if !CHECKSUM_THE_SWAP
3944 c_seg_trim_tail(c_seg);
3945 #endif
3946 io_size = round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset));
3947 f_offset = c_seg->c_store.c_swap_handle;
3948
3949 C_SEG_BUSY(c_seg);
3950 c_seg->c_busy_swapping = 1;
3951
3952 /*
3953 * This thread is likely going to block for I/O.
3954 * Make sure it is ready to run when the I/O completes because
3955 * it needs to clear the busy bit on the c_seg so that other
3956 * waiting threads can make progress too.
3957 */
3958 token = thread_priority_floor_start();
3959 lck_mtx_unlock_always(&c_seg->c_lock);
3960
3961 PAGE_REPLACEMENT_DISALLOWED(FALSE);
3962
3963 addr = (vm_offset_t)C_SEG_BUFFER_ADDRESS(c_seg->c_mysegno);
3964 c_seg->c_store.c_buffer = (int32_t*) addr;
3965
3966 kernel_memory_populate(addr, io_size, KMA_NOFAIL | KMA_COMPRESSOR,
3967 VM_KERN_MEMORY_COMPRESSOR);
3968
3969 if (vm_swap_get(c_seg, f_offset, io_size) != KERN_SUCCESS) {
3970 PAGE_REPLACEMENT_DISALLOWED(TRUE);
3971
3972 kernel_memory_depopulate(addr, io_size, KMA_COMPRESSOR,
3973 VM_KERN_MEMORY_COMPRESSOR);
3974
3975 c_seg_swapin_requeue(c_seg, FALSE, TRUE, age_on_swapin_q);
3976 } else {
3977 #if ENCRYPTED_SWAP
3978 vm_swap_decrypt(c_seg);
3979 #endif /* ENCRYPTED_SWAP */
3980
3981 #if CHECKSUM_THE_SWAP
3982 if (c_seg->cseg_swap_size != io_size) {
3983 panic("swapin size doesn't match swapout size");
3984 }
3985
3986 if (c_seg->cseg_hash != vmc_hash((char*) c_seg->c_store.c_buffer, (int)io_size)) {
3987 panic("c_seg_swapin - Swap hash mismatch");
3988 }
3989 #endif /* CHECKSUM_THE_SWAP */
3990
3991 PAGE_REPLACEMENT_DISALLOWED(TRUE);
3992
3993 c_seg_swapin_requeue(c_seg, TRUE, force_minor_compaction == TRUE ? FALSE : TRUE, age_on_swapin_q);
3994
3995 #if CONFIG_FREEZE
3996 /*
3997 * c_seg_swapin_requeue() returns with the c_seg lock held.
3998 */
3999 if (!lck_mtx_try_lock_spin_always(c_list_lock)) {
4000 assert(c_seg->c_busy);
4001
4002 lck_mtx_unlock_always(&c_seg->c_lock);
4003 lck_mtx_lock_spin_always(c_list_lock);
4004 lck_mtx_lock_spin_always(&c_seg->c_lock);
4005 }
4006
4007 if (c_seg->c_task_owner) {
4008 c_seg_update_task_owner(c_seg, NULL);
4009 }
4010
4011 lck_mtx_unlock_always(c_list_lock);
4012
4013 OSAddAtomic(c_seg->c_slots_used, &c_segment_pages_compressed_incore);
4014 #endif /* CONFIG_FREEZE */
4015
4016 OSAddAtomic64(c_seg->c_bytes_used, &compressor_bytes_used);
4017
4018 if (force_minor_compaction == TRUE) {
4019 if (c_seg_minor_compaction_and_unlock(c_seg, FALSE)) {
4020 /*
4021 * c_seg was completely empty so it was freed,
4022 * so be careful not to reference it again
4023 *
4024 * Drop the boost so that the thread priority
4025 * is returned to where it is supposed to be.
4026 */
4027 thread_priority_floor_end(&token);
4028 return 1;
4029 }
4030
4031 lck_mtx_lock_spin_always(&c_seg->c_lock);
4032 }
4033 }
4034 C_SEG_WAKEUP_DONE(c_seg);
4035
4036 /*
4037 * Drop the boost so that the thread priority
4038 * is returned to where it is supposed to be.
4039 */
4040 thread_priority_floor_end(&token);
4041
4042 return 0;
4043 }
4044
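/*
 * Illustrative sketch (not part of the original source): c_seg_swapin()
 * brackets its potentially blocking I/O with thread_priority_floor_start() /
 * thread_priority_floor_end() so that the thread still runs promptly when the
 * I/O completes and can clear the busy bit other threads may be waiting on.
 * The shape of that bracket, with hypothetical work in the middle, is
 * sketched below.
 */
#if 0 /* example only, never compiled */
{
	thread_pri_floor_t token;

	token = thread_priority_floor_start();

	example_blocking_io();              /* hypothetical: may block for a while */
	example_clear_busy_and_wakeup();    /* hypothetical: waiters depend on this running soon */

	/* every floor start must be paired with exactly one end */
	thread_priority_floor_end(&token);
}
#endif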
4045
4046 static void
4047 c_segment_sv_hash_drop_ref(int hash_indx)
4048 {
4049 struct c_sv_hash_entry o_sv_he, n_sv_he;
4050
4051 while (1) {
4052 o_sv_he.he_record = c_segment_sv_hash_table[hash_indx].he_record;
4053
4054 n_sv_he.he_ref = o_sv_he.he_ref - 1;
4055 n_sv_he.he_data = o_sv_he.he_data;
4056
4057 if (OSCompareAndSwap64((UInt64)o_sv_he.he_record, (UInt64)n_sv_he.he_record, (UInt64 *) &c_segment_sv_hash_table[hash_indx].he_record) == TRUE) {
4058 if (n_sv_he.he_ref == 0) {
4059 OSAddAtomic(-1, &c_segment_svp_in_hash);
4060 }
4061 break;
4062 }
4063 }
4064 }
4065
4066
4067 static int
4068 c_segment_sv_hash_insert(uint32_t data)
4069 {
4070 int hash_sindx;
4071 int misses;
4072 struct c_sv_hash_entry o_sv_he, n_sv_he;
4073 boolean_t got_ref = FALSE;
4074
4075 if (data == 0) {
4076 OSAddAtomic(1, &c_segment_svp_zero_compressions);
4077 } else {
4078 OSAddAtomic(1, &c_segment_svp_nonzero_compressions);
4079 }
4080
4081 hash_sindx = data & C_SV_HASH_MASK;
4082
4083 for (misses = 0; misses < C_SV_HASH_MAX_MISS; misses++) {
4084 o_sv_he.he_record = c_segment_sv_hash_table[hash_sindx].he_record;
4085
4086 while (o_sv_he.he_data == data || o_sv_he.he_ref == 0) {
4087 n_sv_he.he_ref = o_sv_he.he_ref + 1;
4088 n_sv_he.he_data = data;
4089
4090 if (OSCompareAndSwap64((UInt64)o_sv_he.he_record, (UInt64)n_sv_he.he_record, (UInt64 *) &c_segment_sv_hash_table[hash_sindx].he_record) == TRUE) {
4091 if (n_sv_he.he_ref == 1) {
4092 OSAddAtomic(1, &c_segment_svp_in_hash);
4093 }
4094 got_ref = TRUE;
4095 break;
4096 }
4097 o_sv_he.he_record = c_segment_sv_hash_table[hash_sindx].he_record;
4098 }
4099 if (got_ref == TRUE) {
4100 break;
4101 }
4102 hash_sindx++;
4103
4104 if (hash_sindx == C_SV_HASH_SIZE) {
4105 hash_sindx = 0;
4106 }
4107 }
4108 if (got_ref == FALSE) {
4109 return -1;
4110 }
4111
4112 return hash_sindx;
4113 }
4114
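/*
 * Illustrative sketch (not part of the original source): the single-value
 * hash above packs a 32-bit reference count and the 32-bit page value into
 * one 64-bit record so both can be read and updated with a single
 * compare-and-swap, avoiding any lock on this hot path.  A self-contained
 * userspace sketch of the same technique, using C11 atomics and hypothetical
 * names, is shown below.
 */
#if 0 /* example only, never compiled */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

union example_sv_entry {
	uint64_t record;
	struct {
		uint32_t ref;
		uint32_t data;
	};
};

static _Atomic uint64_t example_entry;

/* take a reference if the entry is unused or already holds 'data' */
static bool
example_sv_take_ref(uint32_t data)
{
	union example_sv_entry oldv, newv;

	oldv.record = atomic_load(&example_entry);
	while (oldv.ref == 0 || oldv.data == data) {
		newv.ref = oldv.ref + 1;
		newv.data = data;
		if (atomic_compare_exchange_weak(&example_entry, &oldv.record, newv.record)) {
			return true;
		}
		/* the failed CAS reloaded oldv.record; loop and re-check */
	}
	return false;
}
#endif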
4115
4116 #if RECORD_THE_COMPRESSED_DATA
4117
4118 static void
4119 c_compressed_record_data(char *src, int c_size)
4120 {
4121 if ((c_compressed_record_cptr + c_size + 4) >= c_compressed_record_ebuf) {
4122 panic("c_compressed_record_cptr >= c_compressed_record_ebuf");
4123 }
4124
4125 *(int *)((void *)c_compressed_record_cptr) = c_size;
4126
4127 c_compressed_record_cptr += 4;
4128
4129 memcpy(c_compressed_record_cptr, src, c_size);
4130 c_compressed_record_cptr += c_size;
4131 }
4132 #endif
4133
4134
4135 static int
4136 c_compress_page(char *src, c_slot_mapping_t slot_ptr, c_segment_t *current_chead, char *scratch_buf)
4137 {
4138 int c_size = -1;
4139 int c_rounded_size = 0;
4140 int max_csize;
4141 c_slot_t cs;
4142 c_segment_t c_seg;
4143
4144 KERNEL_DEBUG(0xe0400000 | DBG_FUNC_START, *current_chead, 0, 0, 0, 0);
4145 retry:
4146 if ((c_seg = c_seg_allocate(current_chead)) == NULL) {
4147 return 1;
4148 }
4149 /*
4150 * returns with c_seg lock held
4151 * and PAGE_REPLACEMENT_DISALLOWED(TRUE)...
4152 * c_nextslot has been allocated and
4153 * c_store.c_buffer populated
4154 */
4155 assert(c_seg->c_state == C_IS_FILLING);
4156
4157 cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_seg->c_nextslot);
4158
4159 C_SLOT_ASSERT_PACKABLE(slot_ptr);
4160 cs->c_packed_ptr = C_SLOT_PACK_PTR(slot_ptr);
4161
4162 cs->c_offset = c_seg->c_nextoffset;
4163
4164 max_csize = c_seg_bufsize - C_SEG_OFFSET_TO_BYTES((int32_t)cs->c_offset);
4165
4166 if (max_csize > PAGE_SIZE) {
4167 max_csize = PAGE_SIZE;
4168 }
4169
4170 #if CHECKSUM_THE_DATA
4171 cs->c_hash_data = vmc_hash(src, PAGE_SIZE);
4172 #endif
4173 boolean_t incomp_copy = FALSE;
4174 int max_csize_adj = (max_csize - 4);
4175
4176 if (vm_compressor_algorithm() != VM_COMPRESSOR_DEFAULT_CODEC) {
4177 #if defined(__arm__) || defined(__arm64__)
4178 uint16_t ccodec = CINVALID;
4179 uint32_t inline_popcount;
4180 if (max_csize >= C_SEG_OFFSET_ALIGNMENT_BOUNDARY) {
4181 c_size = metacompressor((const uint8_t *) src,
4182 (uint8_t *) &c_seg->c_store.c_buffer[cs->c_offset],
4183 max_csize_adj, &ccodec,
4184 scratch_buf, &incomp_copy, &inline_popcount);
4185 #if __APPLE_WKDM_POPCNT_EXTENSIONS__
4186 cs->c_inline_popcount = inline_popcount;
4187 #else
4188 assert(inline_popcount == C_SLOT_NO_POPCOUNT);
4189 #endif
4190
4191 #if C_SEG_OFFSET_ALIGNMENT_BOUNDARY > 4
4192 if (c_size > max_csize_adj) {
4193 c_size = -1;
4194 }
4195 #endif
4196 } else {
4197 c_size = -1;
4198 }
4199 assert(ccodec == CCWK || ccodec == CCLZ4);
4200 cs->c_codec = ccodec;
4201 #endif
4202 } else {
4203 #if defined(__arm__) || defined(__arm64__)
4204 cs->c_codec = CCWK;
4205 #endif
4206 #if defined(__arm64__)
4207 __unreachable_ok_push
4208 if (PAGE_SIZE == 4096) {
4209 c_size = WKdm_compress_4k((WK_word *)(uintptr_t)src, (WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
4210 (WK_word *)(uintptr_t)scratch_buf, max_csize_adj);
4211 } else {
4212 c_size = WKdm_compress_16k((WK_word *)(uintptr_t)src, (WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
4213 (WK_word *)(uintptr_t)scratch_buf, max_csize_adj);
4214 }
4215 __unreachable_ok_pop
4216 #else
4217 c_size = WKdm_compress_new((const WK_word *)(uintptr_t)src, (WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
4218 (WK_word *)(uintptr_t)scratch_buf, max_csize_adj);
4219 #endif
4220 }
4221 assertf(((c_size <= max_csize_adj) && (c_size >= -1)),
4222 "c_size invalid (%d, %d), cur compressions: %d", c_size, max_csize_adj, c_segment_pages_compressed);
4223
4224 if (c_size == -1) {
4225 if (max_csize < PAGE_SIZE) {
4226 c_current_seg_filled(c_seg, current_chead);
4227 assert(*current_chead == NULL);
4228
4229 lck_mtx_unlock_always(&c_seg->c_lock);
4230 /* TODO: it may be worth requiring codecs to distinguish
4231 * between incompressible inputs and failures due to
4232 * budget exhaustion.
4233 */
4234 PAGE_REPLACEMENT_DISALLOWED(FALSE);
4235 goto retry;
4236 }
4237 c_size = PAGE_SIZE;
4238
4239 if (incomp_copy == FALSE) {
4240 memcpy(&c_seg->c_store.c_buffer[cs->c_offset], src, c_size);
4241 }
4242
4243 OSAddAtomic(1, &c_segment_noncompressible_pages);
4244 } else if (c_size == 0) {
4245 int hash_index;
4246
4247 /*
4248 * special case - this is a page completely full of a single 32 bit value
4249 */
4250 hash_index = c_segment_sv_hash_insert(*(uint32_t *)(uintptr_t)src);
4251
4252 if (hash_index != -1) {
4253 slot_ptr->s_cindx = hash_index;
4254 slot_ptr->s_cseg = C_SV_CSEG_ID;
4255
4256 OSAddAtomic(1, &c_segment_svp_hash_succeeded);
4257 #if RECORD_THE_COMPRESSED_DATA
4258 c_compressed_record_data(src, 4);
4259 #endif
4260 goto sv_compression;
4261 }
4262 c_size = 4;
4263
4264 memcpy(&c_seg->c_store.c_buffer[cs->c_offset], src, c_size);
4265
4266 OSAddAtomic(1, &c_segment_svp_hash_failed);
4267 }
4268
4269 #if RECORD_THE_COMPRESSED_DATA
4270 c_compressed_record_data((char *)&c_seg->c_store.c_buffer[cs->c_offset], c_size);
4271 #endif
4272 #if CHECKSUM_THE_COMPRESSED_DATA
4273 cs->c_hash_compressed_data = vmc_hash((char *)&c_seg->c_store.c_buffer[cs->c_offset], c_size);
4274 #endif
4275 #if POPCOUNT_THE_COMPRESSED_DATA
4276 cs->c_pop_cdata = vmc_pop((uintptr_t) &c_seg->c_store.c_buffer[cs->c_offset], c_size);
4277 #endif
4278 c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;
4279
4280 PACK_C_SIZE(cs, c_size);
4281 c_seg->c_bytes_used += c_rounded_size;
4282 c_seg->c_nextoffset += C_SEG_BYTES_TO_OFFSET(c_rounded_size);
4283 c_seg->c_slots_used++;
4284
4285 slot_ptr->s_cindx = c_seg->c_nextslot++;
4286 /* <csegno=0,indx=0> would mean "empty slot", so use csegno+1 */
4287 slot_ptr->s_cseg = c_seg->c_mysegno + 1;
4288
4289 sv_compression:
4290 if (c_seg->c_nextoffset >= c_seg_off_limit || c_seg->c_nextslot >= C_SLOT_MAX_INDEX) {
4291 c_current_seg_filled(c_seg, current_chead);
4292 assert(*current_chead == NULL);
4293 }
4294 lck_mtx_unlock_always(&c_seg->c_lock);
4295
4296 PAGE_REPLACEMENT_DISALLOWED(FALSE);
4297
4298 #if RECORD_THE_COMPRESSED_DATA
4299 if ((c_compressed_record_cptr - c_compressed_record_sbuf) >= c_seg_allocsize) {
4300 c_compressed_record_write(c_compressed_record_sbuf, (int)(c_compressed_record_cptr - c_compressed_record_sbuf));
4301 c_compressed_record_cptr = c_compressed_record_sbuf;
4302 }
4303 #endif
4304 if (c_size) {
4305 OSAddAtomic64(c_size, &c_segment_compressed_bytes);
4306 OSAddAtomic64(c_rounded_size, &compressor_bytes_used);
4307 }
4308 OSAddAtomic64(PAGE_SIZE, &c_segment_input_bytes);
4309
4310 OSAddAtomic(1, &c_segment_pages_compressed);
4311 #if CONFIG_FREEZE
4312 OSAddAtomic(1, &c_segment_pages_compressed_incore);
4313 #endif /* CONFIG_FREEZE */
4314 OSAddAtomic(1, &sample_period_compression_count);
4315
4316 KERNEL_DEBUG(0xe0400000 | DBG_FUNC_END, *current_chead, c_size, c_segment_input_bytes, c_segment_compressed_bytes, 0);
4317
4318 return 0;
4319 }
4320
4321 static inline void
4322 sv_decompress(int32_t *ddst, int32_t pattern)
4323 {
4324 // assert(__builtin_constant_p(PAGE_SIZE) != 0);
4325 #if defined(__x86_64__)
4326 memset_word(ddst, pattern, PAGE_SIZE / sizeof(int32_t));
4327 #elif defined(__arm64__)
4328 assert((PAGE_SIZE % 128) == 0);
4329 if (pattern == 0) {
4330 fill32_dczva((addr64_t)ddst, PAGE_SIZE);
4331 } else {
4332 fill32_nt((addr64_t)ddst, PAGE_SIZE, pattern);
4333 }
4334 #else
4335 size_t i;
4336
4337 /* Unroll the pattern fill loop 4x to encourage the
4338 * compiler to emit NEON stores, cf.
4339 * <rdar://problem/25839866> Loop autovectorization
4340 * anomalies.
4341 */
4342 /* We use separate loops for each PAGE_SIZE
4343 * to allow the autovectorizer to engage, as PAGE_SIZE
4344 * may not be a constant.
4345 */
4346
4347 __unreachable_ok_push
4348 if (PAGE_SIZE == 4096) {
4349 for (i = 0; i < (4096U / sizeof(int32_t)); i += 4) {
4350 *ddst++ = pattern;
4351 *ddst++ = pattern;
4352 *ddst++ = pattern;
4353 *ddst++ = pattern;
4354 }
4355 } else {
4356 assert(PAGE_SIZE == 16384);
4357 for (i = 0; i < (int)(16384U / sizeof(int32_t)); i += 4) {
4358 *ddst++ = pattern;
4359 *ddst++ = pattern;
4360 *ddst++ = pattern;
4361 *ddst++ = pattern;
4362 }
4363 }
4364 __unreachable_ok_pop
4365 #endif
4366 }
4367
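/*
 * Illustrative note (not part of the original source): pages made up of a
 * single repeating 32-bit value are stored as just that 4-byte value (or as a
 * reference into the single-value hash), and decompression is a pattern fill
 * like sv_decompress() above.  A portable equivalent with hypothetical names
 * is sketched below.
 */
#if 0 /* example only, never compiled */
static void
example_fill_page_with_pattern(int32_t *dst, int32_t pattern, size_t page_size)
{
	for (size_t i = 0; i < page_size / sizeof(int32_t); i++) {
		dst[i] = pattern;
	}
}
#endif
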
4368 static int
4369 c_decompress_page(char *dst, volatile c_slot_mapping_t slot_ptr, int flags, int *zeroslot)
4370 {
4371 c_slot_t cs;
4372 c_segment_t c_seg;
4373 uint32_t c_segno;
4374 uint16_t c_indx;
4375 int c_rounded_size;
4376 uint32_t c_size;
4377 int retval = 0;
4378 boolean_t need_unlock = TRUE;
4379 boolean_t consider_defragmenting = FALSE;
4380 boolean_t kdp_mode = FALSE;
4381
4382 if (__improbable(flags & C_KDP)) {
4383 if (not_in_kdp) {
4384 panic("C_KDP passed to decompress page from outside of debugger context");
4385 }
4386
4387 assert((flags & C_KEEP) == C_KEEP);
4388 assert((flags & C_DONT_BLOCK) == C_DONT_BLOCK);
4389
4390 if ((flags & (C_DONT_BLOCK | C_KEEP)) != (C_DONT_BLOCK | C_KEEP)) {
4391 return -2;
4392 }
4393
4394 kdp_mode = TRUE;
4395 *zeroslot = 0;
4396 }
4397
4398 ReTry:
4399 if (__probable(!kdp_mode)) {
4400 PAGE_REPLACEMENT_DISALLOWED(TRUE);
4401 } else {
4402 if (kdp_lck_rw_lock_is_acquired_exclusive(&c_master_lock)) {
4403 return -2;
4404 }
4405 }
4406
4407 #if HIBERNATION
4408 /*
4409 * if hibernation is enabled, it indicates (via a call
4410 * to 'vm_decompressor_lock') that no further
4411 * decompressions are allowed once it reaches
4412 * the point of flushing all of the currently dirty
4413 * anonymous memory through the compressor and out
4414 * to disk... in this state we allow freeing of compressed
4415 * pages and must honor the C_DONT_BLOCK case
4416 */
4417 if (__improbable(dst && decompressions_blocked == TRUE)) {
4418 if (flags & C_DONT_BLOCK) {
4419 if (__probable(!kdp_mode)) {
4420 PAGE_REPLACEMENT_DISALLOWED(FALSE);
4421 }
4422
4423 *zeroslot = 0;
4424 return -2;
4425 }
4426 /*
4427 * it's safe to atomically assert and block behind the
4428 * lock held in shared mode because "decompressions_blocked" is
4429 * only set and cleared and the thread_wakeup done when the lock
4430 * is held exclusively
4431 */
4432 assert_wait((event_t)&decompressions_blocked, THREAD_UNINT);
4433
4434 PAGE_REPLACEMENT_DISALLOWED(FALSE);
4435
4436 thread_block(THREAD_CONTINUE_NULL);
4437
4438 goto ReTry;
4439 }
4440 #endif
4441 /* s_cseg is actually "segno+1" */
4442 c_segno = slot_ptr->s_cseg - 1;
4443
4444 if (__improbable(c_segno >= c_segments_available)) {
4445 panic("c_decompress_page: c_segno %d >= c_segments_available %d, slot_ptr(%p), slot_data(%x)",
4446 c_segno, c_segments_available, slot_ptr, *(int *)((void *)slot_ptr));
4447 }
4448
4449 if (__improbable(c_segments[c_segno].c_segno < c_segments_available)) {
4450 panic("c_decompress_page: c_segno %d is free, slot_ptr(%p), slot_data(%x)",
4451 c_segno, slot_ptr, *(int *)((void *)slot_ptr));
4452 }
4453
4454 c_seg = c_segments[c_segno].c_seg;
4455
4456 if (__probable(!kdp_mode)) {
4457 lck_mtx_lock_spin_always(&c_seg->c_lock);
4458 } else {
4459 if (kdp_lck_mtx_lock_spin_is_acquired(&c_seg->c_lock)) {
4460 return -2;
4461 }
4462 }
4463
4464 assert(c_seg->c_state != C_IS_EMPTY && c_seg->c_state != C_IS_FREE);
4465
4466 if (dst == NULL && c_seg->c_busy_swapping) {
4467 assert(c_seg->c_busy);
4468
4469 goto bypass_busy_check;
4470 }
4471 if (flags & C_DONT_BLOCK) {
4472 if (c_seg->c_busy || (C_SEG_IS_ONDISK(c_seg) && dst)) {
4473 *zeroslot = 0;
4474
4475 retval = -2;
4476 goto done;
4477 }
4478 }
4479 if (c_seg->c_busy) {
4480 PAGE_REPLACEMENT_DISALLOWED(FALSE);
4481
4482 c_seg_wait_on_busy(c_seg);
4483
4484 goto ReTry;
4485 }
4486 bypass_busy_check:
4487
4488 c_indx = slot_ptr->s_cindx;
4489
4490 if (__improbable(c_indx >= c_seg->c_nextslot)) {
4491 panic("c_decompress_page: c_indx %d >= c_nextslot %d, c_seg(%p), slot_ptr(%p), slot_data(%x)",
4492 c_indx, c_seg->c_nextslot, c_seg, slot_ptr, *(int *)((void *)slot_ptr));
4493 }
4494
4495 cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
4496
4497 c_size = UNPACK_C_SIZE(cs);
4498
4499 if (__improbable(c_size == 0)) {
4500 panic("c_decompress_page: c_size == 0, c_seg(%p), slot_ptr(%p), slot_data(%x)",
4501 c_seg, slot_ptr, *(int *)((void *)slot_ptr));
4502 }
4503
4504 c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;
4505
4506 if (dst) {
4507 uint32_t age_of_cseg;
4508 clock_sec_t cur_ts_sec;
4509 clock_nsec_t cur_ts_nsec;
4510
4511 if (C_SEG_IS_ONDISK(c_seg)) {
4512 #if CONFIG_FREEZE
4513 if (freezer_incore_cseg_acct) {
4514 if ((c_seg->c_slots_used + c_segment_pages_compressed_incore) >= c_segment_pages_compressed_nearing_limit) {
4515 PAGE_REPLACEMENT_DISALLOWED(FALSE);
4516 lck_mtx_unlock_always(&c_seg->c_lock);
4517
4518 memorystatus_kill_on_VM_compressor_space_shortage(FALSE /* async */);
4519
4520 goto ReTry;
4521 }
4522
4523 uint32_t incore_seg_count = c_segment_count - c_swappedout_count - c_swappedout_sparse_count;
4524 if ((incore_seg_count + 1) >= c_segments_nearing_limit) {
4525 PAGE_REPLACEMENT_DISALLOWED(FALSE);
4526 lck_mtx_unlock_always(&c_seg->c_lock);
4527
4528 memorystatus_kill_on_VM_compressor_space_shortage(FALSE /* async */);
4529
4530 goto ReTry;
4531 }
4532 }
4533 #endif /* CONFIG_FREEZE */
4534 assert(kdp_mode == FALSE);
4535 retval = c_seg_swapin(c_seg, FALSE, TRUE);
4536 assert(retval == 0);
4537
4538 retval = 1;
4539 }
4540 if (c_seg->c_state == C_ON_BAD_Q) {
4541 assert(c_seg->c_store.c_buffer == NULL);
4542 *zeroslot = 0;
4543
4544 retval = -1;
4545 goto done;
4546 }
4547
4548 #if POPCOUNT_THE_COMPRESSED_DATA
4549 unsigned csvpop;
4550 uintptr_t csvaddr = (uintptr_t) &c_seg->c_store.c_buffer[cs->c_offset];
4551 if (cs->c_pop_cdata != (csvpop = vmc_pop(csvaddr, c_size))) {
4552 panic("Compressed data popcount doesn't match original, bit distance: %d %p (phys: %p) %p %p 0x%x 0x%x 0x%x 0x%x", (csvpop - cs->c_pop_cdata), (void *)csvaddr, (void *) kvtophys(csvaddr), c_seg, cs, cs->c_offset, c_size, csvpop, cs->c_pop_cdata);
4553 }
4554 #endif
4555
4556 #if CHECKSUM_THE_COMPRESSED_DATA
4557 unsigned csvhash;
4558 if (cs->c_hash_compressed_data != (csvhash = vmc_hash((char *)&c_seg->c_store.c_buffer[cs->c_offset], c_size))) {
4559 panic("Compressed data doesn't match original %p %p %u %u %u", c_seg, cs, c_size, cs->c_hash_compressed_data, csvhash);
4560 }
4561 #endif
4562 if (c_rounded_size == PAGE_SIZE) {
4563 /*
4564 * page wasn't compressible... just copy it out
4565 */
4566 memcpy(dst, &c_seg->c_store.c_buffer[cs->c_offset], PAGE_SIZE);
4567 } else if (c_size == 4) {
4568 int32_t data;
4569 int32_t *dptr;
4570
4571 /*
4572 * page was populated with a single value
4573 * that didn't fit into our fast hash
4574 * so it was stored as a single non-compressed value
4575 * which we now use to repopulate the page
4576 */
4577 dptr = (int32_t *)(uintptr_t)dst;
4578 data = *(int32_t *)(&c_seg->c_store.c_buffer[cs->c_offset]);
4579 sv_decompress(dptr, data);
4580 } else {
4581 uint32_t my_cpu_no;
4582 char *scratch_buf;
4583
4584 if (__probable(!kdp_mode)) {
4585 /*
4586 * we're behind the c_seg lock held in spin mode
4587 * which means pre-emption is disabled... therefore
4588 * the following sequence is atomic and safe
4589 */
4590 my_cpu_no = cpu_number();
4591
4592 assert(my_cpu_no < compressor_cpus);
4593
4594 scratch_buf = &compressor_scratch_bufs[my_cpu_no * vm_compressor_get_decode_scratch_size()];
4595 } else {
4596 scratch_buf = kdp_compressor_scratch_buf;
4597 }
4598
4599 if (vm_compressor_algorithm() != VM_COMPRESSOR_DEFAULT_CODEC) {
4600 #if defined(__arm__) || defined(__arm64__)
4601 uint16_t c_codec = cs->c_codec;
4602 uint32_t inline_popcount;
4603 if (!metadecompressor((const uint8_t *) &c_seg->c_store.c_buffer[cs->c_offset],
4604 (uint8_t *)dst, c_size, c_codec, (void *)scratch_buf, &inline_popcount)) {
4605 retval = -1;
4606 } else {
4607 #if __APPLE_WKDM_POPCNT_EXTENSIONS__
4608 if (inline_popcount != cs->c_inline_popcount) {
4609 /*
4610 * The codec choice in compression and
4611 * decompression must agree, so there
4612 * should never be a disagreement in
4613 * whether an inline population count
4614 * was performed.
4615 */
4616 assert(inline_popcount != C_SLOT_NO_POPCOUNT);
4617 assert(cs->c_inline_popcount != C_SLOT_NO_POPCOUNT);
4618 printf("decompression failure from physical region %llx+%05x: popcount mismatch (%d != %d)\n",
4619 (unsigned long long)kvtophys((uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset]), c_size,
4620 inline_popcount,
4621 cs->c_inline_popcount);
4622 retval = -1;
4623 }
4624 #else
4625 assert(inline_popcount == C_SLOT_NO_POPCOUNT);
4626 #endif /* __APPLE_WKDM_POPCNT_EXTENSIONS__ */
4627 }
4628 #endif
4629 } else {
4630 #if defined(__arm64__)
4631 __unreachable_ok_push
4632 if (PAGE_SIZE == 4096) {
4633 WKdm_decompress_4k((WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
4634 (WK_word *)(uintptr_t)dst, (WK_word *)(uintptr_t)scratch_buf, c_size);
4635 } else {
4636 WKdm_decompress_16k((WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
4637 (WK_word *)(uintptr_t)dst, (WK_word *)(uintptr_t)scratch_buf, c_size);
4638 }
4639 __unreachable_ok_pop
4640 #else
4641 WKdm_decompress_new((WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset],
4642 (WK_word *)(uintptr_t)dst, (WK_word *)(uintptr_t)scratch_buf, c_size);
4643 #endif
4644 }
4645 }
4646
4647 #if CHECKSUM_THE_DATA
4648 if (cs->c_hash_data != vmc_hash(dst, PAGE_SIZE)) {
4649 #if defined(__arm__) || defined(__arm64__)
4650 int32_t *dinput = &c_seg->c_store.c_buffer[cs->c_offset];
4651 panic("decompressed data doesn't match original cs: %p, hash: 0x%x, offset: %d, c_size: %d, c_rounded_size: %d, codec: %d, header: 0x%x 0x%x 0x%x", cs, cs->c_hash_data, cs->c_offset, c_size, c_rounded_size, cs->c_codec, *dinput, *(dinput + 1), *(dinput + 2));
4652 #else
4653 panic("decompressed data doesn't match original cs: %p, hash: %d, offset: 0x%x, c_size: %d", cs, cs->c_hash_data, cs->c_offset, c_size);
4654 #endif
4655 }
4656 #endif
4657 if (c_seg->c_swappedin_ts == 0 && !kdp_mode) {
4658 clock_get_system_nanotime(&cur_ts_sec, &cur_ts_nsec);
4659
4660 age_of_cseg = (uint32_t)cur_ts_sec - c_seg->c_creation_ts;
4661 if (age_of_cseg < DECOMPRESSION_SAMPLE_MAX_AGE) {
4662 OSAddAtomic(1, &age_of_decompressions_during_sample_period[age_of_cseg]);
4663 } else {
4664 OSAddAtomic(1, &overage_decompressions_during_sample_period);
4665 }
4666
4667 OSAddAtomic(1, &sample_period_decompression_count);
4668 }
4669 }
4670 #if CONFIG_FREEZE
4671 else {
4672 /*
4673 * We are freeing an uncompressed page from this c_seg and so balance the ledgers.
4674 */
4675 if (C_SEG_IS_ONDISK(c_seg)) {
4676 /*
4677 * The compression sweep feature will push out anonymous pages to disk
4678 * without going through the freezer path and so those c_segs, while
4679 * swapped out, won't have an owner.
4680 */
4681 if (c_seg->c_task_owner) {
4682 task_update_frozen_to_swap_acct(c_seg->c_task_owner, PAGE_SIZE_64, DEBIT_FROM_SWAP);
4683 }
4684
4685 /*
4686 * We are freeing a page in swap without swapping it in. We bump the in-core
4687 * count here to simulate a swapin of a page so that we can accurately
4688 * decrement it below.
4689 */
4690 OSAddAtomic(1, &c_segment_pages_compressed_incore);
4691 }
4692 }
4693 #endif /* CONFIG_FREEZE */
4694
4695 if (flags & C_KEEP) {
4696 *zeroslot = 0;
4697 goto done;
4698 }
4699 assert(kdp_mode == FALSE);
4700
4701 c_seg->c_bytes_unused += c_rounded_size;
4702 c_seg->c_bytes_used -= c_rounded_size;
4703
4704 assert(c_seg->c_slots_used);
4705 c_seg->c_slots_used--;
4706 if (dst && c_seg->c_swappedin) {
4707 task_t task = current_task();
4708 if (task) {
4709 ledger_credit(task->ledger, task_ledgers.swapins, PAGE_SIZE);
4710 }
4711 }
4712
4713 PACK_C_SIZE(cs, 0);
4714
4715 if (c_indx < c_seg->c_firstemptyslot) {
4716 c_seg->c_firstemptyslot = c_indx;
4717 }
4718
4719 OSAddAtomic(-1, &c_segment_pages_compressed);
4720 #if CONFIG_FREEZE
4721 OSAddAtomic(-1, &c_segment_pages_compressed_incore);
4722 assertf(c_segment_pages_compressed_incore >= 0, "-ve incore count %p 0x%x", c_seg, c_segment_pages_compressed_incore);
4723 #endif /* CONFIG_FREEZE */
4724
4725 if (c_seg->c_state != C_ON_BAD_Q && !(C_SEG_IS_ONDISK(c_seg))) {
4726 /*
4727 * C_SEG_IS_ONDISK == TRUE can occur when we're doing a
4728 * free of a compressed page (i.e. dst == NULL)
4729 */
4730 OSAddAtomic64(-c_rounded_size, &compressor_bytes_used);
4731 }
4732 if (c_seg->c_busy_swapping) {
4733 /*
4734 * bypass case for c_busy_swapping...
4735 * let the swapin/swapout paths deal with putting
4736 * the c_seg on the minor compaction queue if needed
4737 */
4738 assert(c_seg->c_busy);
4739 goto done;
4740 }
4741 assert(!c_seg->c_busy);
4742
4743 if (c_seg->c_state != C_IS_FILLING) {
4744 if (c_seg->c_bytes_used == 0) {
4745 if (!(C_SEG_IS_ONDISK(c_seg))) {
4746 int pages_populated;
4747
4748 pages_populated = (round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset))) / PAGE_SIZE;
4749 c_seg->c_populated_offset = C_SEG_BYTES_TO_OFFSET(0);
4750
4751 if (pages_populated) {
4752 assert(c_seg->c_state != C_ON_BAD_Q);
4753 assert(c_seg->c_store.c_buffer != NULL);
4754
4755 C_SEG_BUSY(c_seg);
4756 lck_mtx_unlock_always(&c_seg->c_lock);
4757
4758 kernel_memory_depopulate(
4759 (vm_offset_t) c_seg->c_store.c_buffer,
4760 ptoa(pages_populated),
4761 KMA_COMPRESSOR, VM_KERN_MEMORY_COMPRESSOR);
4762
4763 lck_mtx_lock_spin_always(&c_seg->c_lock);
4764 C_SEG_WAKEUP_DONE(c_seg);
4765 }
4766 if (!c_seg->c_on_minorcompact_q && c_seg->c_state != C_ON_SWAPOUT_Q && c_seg->c_state != C_ON_SWAPIO_Q) {
4767 c_seg_need_delayed_compaction(c_seg, FALSE);
4768 }
4769 } else {
4770 if (c_seg->c_state != C_ON_SWAPPEDOUTSPARSE_Q) {
4771 c_seg_move_to_sparse_list(c_seg);
4772 consider_defragmenting = TRUE;
4773 }
4774 }
4775 } else if (c_seg->c_on_minorcompact_q) {
4776 assert(c_seg->c_state != C_ON_BAD_Q);
4777 assert(!C_SEG_IS_ON_DISK_OR_SOQ(c_seg));
4778
4779 if (C_SEG_SHOULD_MINORCOMPACT_NOW(c_seg)) {
4780 c_seg_try_minor_compaction_and_unlock(c_seg);
4781 need_unlock = FALSE;
4782 }
4783 } else if (!(C_SEG_IS_ONDISK(c_seg))) {
4784 if (c_seg->c_state != C_ON_BAD_Q && c_seg->c_state != C_ON_SWAPOUT_Q && c_seg->c_state != C_ON_SWAPIO_Q &&
4785 C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) {
4786 c_seg_need_delayed_compaction(c_seg, FALSE);
4787 }
4788 } else if (c_seg->c_state != C_ON_SWAPPEDOUTSPARSE_Q && C_SEG_ONDISK_IS_SPARSE(c_seg)) {
4789 c_seg_move_to_sparse_list(c_seg);
4790 consider_defragmenting = TRUE;
4791 }
4792 }
4793 done:
4794 if (__improbable(kdp_mode)) {
4795 return retval;
4796 }
4797
4798 if (need_unlock == TRUE) {
4799 lck_mtx_unlock_always(&c_seg->c_lock);
4800 }
4801
4802 PAGE_REPLACEMENT_DISALLOWED(FALSE);
4803
4804 if (consider_defragmenting == TRUE) {
4805 vm_swap_consider_defragmenting(VM_SWAP_FLAGS_NONE);
4806 }
4807
4808 #if !XNU_TARGET_OS_OSX
4809 if ((c_minor_count && COMPRESSOR_NEEDS_TO_MINOR_COMPACT()) || vm_compressor_needs_to_major_compact()) {
4810 vm_wake_compactor_swapper();
4811 }
4812 #endif /* !XNU_TARGET_OS_OSX */
4813
4814 return retval;
4815 }
4816
4817
4818 int
4819 vm_compressor_get(ppnum_t pn, int *slot, int flags)
4820 {
4821 c_slot_mapping_t slot_ptr;
4822 char *dst;
4823 int zeroslot = 1;
4824 int retval;
4825
4826 dst = pmap_map_compressor_page(pn);
4827 slot_ptr = (c_slot_mapping_t)slot;
4828
4829 assert(dst != NULL);
4830
4831 if (slot_ptr->s_cseg == C_SV_CSEG_ID) {
4832 int32_t data;
4833 int32_t *dptr;
4834
4835 /*
4836 * page was populated with a single value
4837 * that found a home in our hash table
4838 * grab that value from the hash and use it
4839 * to populate the page we need to fill
4840 */
4841 dptr = (int32_t *)(uintptr_t)dst;
4842 data = c_segment_sv_hash_table[slot_ptr->s_cindx].he_data;
4843 sv_decompress(dptr, data);
4844 if (!(flags & C_KEEP)) {
4845 c_segment_sv_hash_drop_ref(slot_ptr->s_cindx);
4846
4847 OSAddAtomic(-1, &c_segment_pages_compressed);
4848 #if CONFIG_FREEZE
4849 OSAddAtomic(-1, &c_segment_pages_compressed_incore);
4850 assertf(c_segment_pages_compressed_incore >= 0, "-ve incore count 0x%x", c_segment_pages_compressed_incore);
4851 #endif /* CONFIG_FREEZE */
4852 *slot = 0;
4853 }
4854 if (data) {
4855 OSAddAtomic(1, &c_segment_svp_nonzero_decompressions);
4856 } else {
4857 OSAddAtomic(1, &c_segment_svp_zero_decompressions);
4858 }
4859
4860 pmap_unmap_compressor_page(pn, dst);
4861 return 0;
4862 }
4863
4864 retval = c_decompress_page(dst, slot_ptr, flags, &zeroslot);
4865
4866 /*
4867 * zeroslot will be set to 0 by c_decompress_page if (flags & C_KEEP)
4868 * or (flags & C_DONT_BLOCK) and we found 'c_busy' or 'C_SEG_IS_ONDISK' to be TRUE
4869 */
4870 if (zeroslot) {
4871 *slot = 0;
4872 }
4873
4874 pmap_unmap_compressor_page(pn, dst);
4875
4876 /*
4877 * returns 0 if we successfully decompressed a page from a segment already in memory
4878 * returns 1 if we had to first swap in the segment, before successfully decompressing the page
4879 * returns -1 if we encountered an error swapping in the segment - decompression failed
4880 * returns -2 if (flags & C_DONT_BLOCK) and we found 'c_busy' or 'C_SEG_IS_ONDISK' to be true
4881 */
4882 return retval;
4883 }
4884
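/*
 * Illustrative sketch (not part of the original source): a caller of
 * vm_compressor_get() has to distinguish the four return values documented
 * above.  The hypothetical handling below shows one way a caller could branch
 * on them; it is a sketch only, not how the VM fault path is written.
 */
#if 0 /* example only, never compiled */
static void
example_handle_compressor_get(ppnum_t pn, int *slot)
{
	int kret = vm_compressor_get(pn, slot, C_DONT_BLOCK);

	switch (kret) {
	case 0:         /* decompressed from a segment already in memory */
	case 1:         /* decompressed after the segment was swapped in */
		example_page_is_valid();                /* hypothetical */
		break;
	case -2:        /* C_DONT_BLOCK and the segment was busy or on disk */
		example_retry_later();                  /* hypothetical */
		break;
	default:        /* -1: swapin failed, the page was not decompressed */
		example_handle_decompress_error();      /* hypothetical */
		break;
	}
}
#endif
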
4885 #if DEVELOPMENT || DEBUG
4886
4887 void
4888 vm_compressor_inject_error(int *slot)
4889 {
4890 c_slot_mapping_t slot_ptr = (c_slot_mapping_t)slot;
4891
4892 /* No error detection for single-value compression. */
4893 if (slot_ptr->s_cseg == C_SV_CSEG_ID) {
4894 printf("%s(): cannot inject errors in SV-compressed pages\n", __func__ );
4895 return;
4896 }
4897
4898 /* s_cseg is actually "segno+1" */
4899 const uint32_t c_segno = slot_ptr->s_cseg - 1;
4900
4901 assert(c_segno < c_segments_available);
4902 assert(c_segments[c_segno].c_segno >= c_segments_available);
4903
4904 const c_segment_t c_seg = c_segments[c_segno].c_seg;
4905
4906 PAGE_REPLACEMENT_DISALLOWED(TRUE);
4907
4908 lck_mtx_lock_spin_always(&c_seg->c_lock);
4909 assert(c_seg->c_state != C_IS_EMPTY && c_seg->c_state != C_IS_FREE);
4910
4911 const uint16_t c_indx = slot_ptr->s_cindx;
4912 assert(c_indx < c_seg->c_nextslot);
4913
4914 /*
4915 * To safely make this segment temporarily writable, we need to mark
4916 * the segment busy, which allows us to release the segment lock.
4917 */
4918 while (c_seg->c_busy) {
4919 c_seg_wait_on_busy(c_seg);
4920 lck_mtx_lock_spin_always(&c_seg->c_lock);
4921 }
4922 C_SEG_BUSY(c_seg);
4923
4924 bool already_writable = (c_seg->c_state == C_IS_FILLING);
4925 if (!already_writable) {
4926 /*
4927 * Protection update must be performed preemptibly, so temporarily drop
4928 * the lock. Having set c_busy will prevent most other concurrent
4929 * operations.
4930 */
4931 lck_mtx_unlock_always(&c_seg->c_lock);
4932 C_SEG_MAKE_WRITEABLE(c_seg);
4933 lck_mtx_lock_spin_always(&c_seg->c_lock);
4934 }
4935
4936 /*
4937 * Once we've released the lock following our c_state == C_IS_FILLING check,
4938 * c_current_seg_filled() can (re-)write-protect the segment. However, it
4939 * will transition from C_IS_FILLING before releasing the c_seg lock, so we
4940 * can detect this by re-checking after we've reobtained the lock.
4941 */
4942 if (already_writable && c_seg->c_state != C_IS_FILLING) {
4943 lck_mtx_unlock_always(&c_seg->c_lock);
4944 C_SEG_MAKE_WRITEABLE(c_seg);
4945 lck_mtx_lock_spin_always(&c_seg->c_lock);
4946 already_writable = false;
4947 /* Segment can't be freed while c_busy is set. */
4948 assert(c_seg->c_state != C_IS_FILLING);
4949 }
4950
4951 /*
4952 * Skip if the segment is on disk. This check can only be performed after
4953 * the final acquisition of the segment lock before we attempt to write to
4954 * the segment.
4955 */
4956 if (!C_SEG_IS_ON_DISK_OR_SOQ(c_seg)) {
4957 c_slot_t cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
4958 int32_t *data = &c_seg->c_store.c_buffer[cs->c_offset];
4959 /* assume that the compressed data holds at least one int32_t */
4960 assert(UNPACK_C_SIZE(cs) > sizeof(*data));
4961 /*
4962 * This bit is known to be in the payload of a MISS packet resulting from
4963 * the pattern used in the test pattern from decompression_failure.c.
4964 * Flipping it should result in many corrupted bits in the test page.
4965 */
4966 data[0] ^= 0x00000100;
4967 }
4968
4969 if (!already_writable) {
4970 lck_mtx_unlock_always(&c_seg->c_lock);
4971 C_SEG_WRITE_PROTECT(c_seg);
4972 lck_mtx_lock_spin_always(&c_seg->c_lock);
4973 }
4974
4975 C_SEG_WAKEUP_DONE(c_seg);
4976 lck_mtx_unlock_always(&c_seg->c_lock);
4977
4978 PAGE_REPLACEMENT_DISALLOWED(FALSE);
4979 }
4980
4981 #endif /* DEVELOPMENT || DEBUG */
4982
4983 int
4984 vm_compressor_free(int *slot, int flags)
4985 {
4986 c_slot_mapping_t slot_ptr;
4987 int zeroslot = 1;
4988 int retval;
4989
4990 assert(flags == 0 || flags == C_DONT_BLOCK);
4991
4992 slot_ptr = (c_slot_mapping_t)slot;
4993
4994 if (slot_ptr->s_cseg == C_SV_CSEG_ID) {
4995 c_segment_sv_hash_drop_ref(slot_ptr->s_cindx);
4996 OSAddAtomic(-1, &c_segment_pages_compressed);
4997 #if CONFIG_FREEZE
4998 OSAddAtomic(-1, &c_segment_pages_compressed_incore);
4999 assertf(c_segment_pages_compressed_incore >= 0, "-ve incore count 0x%x", c_segment_pages_compressed_incore);
5000 #endif /* CONFIG_FREEZE */
5001
5002 *slot = 0;
5003 return 0;
5004 }
5005 retval = c_decompress_page(NULL, slot_ptr, flags, &zeroslot);
5006 /*
5007 * returns 0 if we successfully freed the specified compressed page
5008 * returns -2 if (flags & C_DONT_BLOCK) and we found 'c_busy' set
5009 */
5010
5011 if (retval == 0) {
5012 *slot = 0;
5013 } else {
5014 assert(retval == -2);
5015 }
5016
5017 return retval;
5018 }
5019
5020
5021 int
5022 vm_compressor_put(ppnum_t pn, int *slot, void **current_chead, char *scratch_buf)
5023 {
5024 char *src;
5025 int retval;
5026
5027 src = pmap_map_compressor_page(pn);
5028 assert(src != NULL);
5029
5030 retval = c_compress_page(src, (c_slot_mapping_t)slot, (c_segment_t *)current_chead, scratch_buf);
5031 pmap_unmap_compressor_page(pn, src);
5032
5033 return retval;
5034 }
5035
5036 void
5037 vm_compressor_transfer(
5038 int *dst_slot_p,
5039 int *src_slot_p)
5040 {
5041 c_slot_mapping_t dst_slot, src_slot;
5042 c_segment_t c_seg;
5043 uint16_t c_indx;
5044 c_slot_t cs;
5045
5046 src_slot = (c_slot_mapping_t) src_slot_p;
5047
5048 if (src_slot->s_cseg == C_SV_CSEG_ID) {
5049 *dst_slot_p = *src_slot_p;
5050 *src_slot_p = 0;
5051 return;
5052 }
5053 dst_slot = (c_slot_mapping_t) dst_slot_p;
5054 Retry:
5055 PAGE_REPLACEMENT_DISALLOWED(TRUE);
5056 /* get segment for src_slot */
5057 c_seg = c_segments[src_slot->s_cseg - 1].c_seg;
5058 /* lock segment */
5059 lck_mtx_lock_spin_always(&c_seg->c_lock);
5060 /* wait if it's busy */
5061 if (c_seg->c_busy && !c_seg->c_busy_swapping) {
5062 PAGE_REPLACEMENT_DISALLOWED(FALSE);
5063 c_seg_wait_on_busy(c_seg);
5064 goto Retry;
5065 }
5066 /* find the c_slot */
5067 c_indx = src_slot->s_cindx;
5068 cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
5069 /* point the c_slot back to dst_slot instead of src_slot */
5070 C_SLOT_ASSERT_PACKABLE(dst_slot);
5071 cs->c_packed_ptr = C_SLOT_PACK_PTR(dst_slot);
5072 /* transfer */
5073 *dst_slot_p = *src_slot_p;
5074 *src_slot_p = 0;
5075 lck_mtx_unlock_always(&c_seg->c_lock);
5076 PAGE_REPLACEMENT_DISALLOWED(FALSE);
5077 }
5078
5079 #if defined(__arm64__)
5080 extern clock_sec_t vm_swapfile_last_failed_to_create_ts;
5081 __attribute__((noreturn))
5082 void
5083 vm_panic_hibernate_write_image_failed(int err)
5084 {
5085 panic("hibernate_write_image encountered error 0x%x - %u, %u, %d, %d, %d, %d, %d, %d, %d, %d, %llu, %d, %d, %d\n",
5086 err,
5087 VM_PAGE_COMPRESSOR_COUNT, vm_page_wire_count,
5088 c_age_count, c_major_count, c_minor_count, c_swapout_count, c_swappedout_sparse_count,
5089 vm_num_swap_files, vm_num_pinned_swap_files, vm_swappin_enabled, vm_swap_put_failures,
5090 (vm_swapfile_last_failed_to_create_ts ? 1:0), hibernate_no_swapspace, hibernate_flush_timed_out);
5091 }
5092 #endif /*(__arm64__)*/
5093
5094 #if CONFIG_FREEZE
5095
5096 int freezer_finished_filling = 0;
5097
5098 void
5099 vm_compressor_finished_filling(
5100 void **current_chead)
5101 {
5102 c_segment_t c_seg;
5103
5104 if ((c_seg = *(c_segment_t *)current_chead) == NULL) {
5105 return;
5106 }
5107
5108 assert(c_seg->c_state == C_IS_FILLING);
5109
5110 lck_mtx_lock_spin_always(&c_seg->c_lock);
5111
5112 c_current_seg_filled(c_seg, (c_segment_t *)current_chead);
5113
5114 lck_mtx_unlock_always(&c_seg->c_lock);
5115
5116 freezer_finished_filling++;
5117 }
5118
5119
5120 /*
5121 * This routine is used to transfer the compressed chunks from
5122 * the c_seg/cindx pointed to by slot_p into a new c_seg headed
5123 * by the current_chead and a new cindx within that c_seg.
5124 *
5125 * Currently, this routine is only used by the "freezer backed by
5126 * compressor with swap" mode to create a series of c_segs that
5127 * only contain compressed data belonging to one task. So, we
5128 * move a task's previously compressed data into a set of new
5129 * c_segs which will also hold the task's yet to be compressed data.
5130 */
5131
5132 kern_return_t
5133 vm_compressor_relocate(
5134 void **current_chead,
5135 int *slot_p)
5136 {
5137 c_slot_mapping_t slot_ptr;
5138 c_slot_mapping_t src_slot;
5139 uint32_t c_rounded_size;
5140 uint32_t c_size;
5141 uint16_t dst_slot;
5142 c_slot_t c_dst;
5143 c_slot_t c_src;
5144 uint16_t c_indx;
5145 c_segment_t c_seg_dst = NULL;
5146 c_segment_t c_seg_src = NULL;
5147 kern_return_t kr = KERN_SUCCESS;
5148
5149
5150 src_slot = (c_slot_mapping_t) slot_p;
5151
5152 if (src_slot->s_cseg == C_SV_CSEG_ID) {
5153 /*
5154 * no need to relocate... this is a page full of a single
5155 * value which is hashed to a single entry not contained
5156 * in a c_segment_t
5157 */
5158 return kr;
5159 }
5160
5161 Relookup_dst:
5162 c_seg_dst = c_seg_allocate((c_segment_t *)current_chead);
5163 /*
5164 * returns with c_seg lock held
5165 * and PAGE_REPLACEMENT_DISALLOWED(TRUE)...
5166 * c_nextslot has been allocated and
5167 * c_store.c_buffer populated
5168 */
5169 if (c_seg_dst == NULL) {
5170 /*
5171 * Out of compression segments?
5172 */
5173 kr = KERN_RESOURCE_SHORTAGE;
5174 goto out;
5175 }
5176
5177 assert(c_seg_dst->c_busy == 0);
5178
5179 C_SEG_BUSY(c_seg_dst);
5180
5181 dst_slot = c_seg_dst->c_nextslot;
5182
5183 lck_mtx_unlock_always(&c_seg_dst->c_lock);
5184
5185 Relookup_src:
5186 c_seg_src = c_segments[src_slot->s_cseg - 1].c_seg;
5187
5188 assert(c_seg_dst != c_seg_src);
5189
5190 lck_mtx_lock_spin_always(&c_seg_src->c_lock);
5191
5192 if (C_SEG_IS_ON_DISK_OR_SOQ(c_seg_src) ||
5193 c_seg_src->c_state == C_IS_FILLING) {
5194 /*
5195 * Skip this page if :-
5196 * a) the src c_seg is already on-disk (or on its way there)
5197 * A "thaw" can mark a process as eligible for
5198 * another freeze cycle without bringing any of
5199 * its swapped out c_segs back from disk (because
5200 * that is done on-demand).
5201 * Or, this page may be mapped elsewhere in the task's map,
5202 * and we may have marked it for swap already.
5203 *
5204 * b) Or, the src c_seg is being filled by the compressor
5205 * thread. We don't want the added latency of waiting for
5206 * this c_seg in the freeze path and so we skip it.
5207 */
5208
5209 PAGE_REPLACEMENT_DISALLOWED(FALSE);
5210
5211 lck_mtx_unlock_always(&c_seg_src->c_lock);
5212
5213 c_seg_src = NULL;
5214
5215 goto out;
5216 }
5217
5218 if (c_seg_src->c_busy) {
5219 PAGE_REPLACEMENT_DISALLOWED(FALSE);
5220 c_seg_wait_on_busy(c_seg_src);
5221
5222 c_seg_src = NULL;
5223
5224 PAGE_REPLACEMENT_DISALLOWED(TRUE);
5225
5226 goto Relookup_src;
5227 }
5228
5229 C_SEG_BUSY(c_seg_src);
5230
5231 lck_mtx_unlock_always(&c_seg_src->c_lock);
5232
5233 PAGE_REPLACEMENT_DISALLOWED(FALSE);
5234
5235 /* find the c_slot */
5236 c_indx = src_slot->s_cindx;
5237
5238 c_src = C_SEG_SLOT_FROM_INDEX(c_seg_src, c_indx);
5239
5240 c_size = UNPACK_C_SIZE(c_src);
5241
5242 assert(c_size);
5243
5244 if (c_size > (uint32_t)(c_seg_bufsize - C_SEG_OFFSET_TO_BYTES((int32_t)c_seg_dst->c_nextoffset))) {
5245 /*
5246 * This segment is full. We need a new one.
5247 */
5248
5249 PAGE_REPLACEMENT_DISALLOWED(TRUE);
5250
5251 lck_mtx_lock_spin_always(&c_seg_src->c_lock);
5252 C_SEG_WAKEUP_DONE(c_seg_src);
5253 lck_mtx_unlock_always(&c_seg_src->c_lock);
5254
5255 c_seg_src = NULL;
5256
5257 lck_mtx_lock_spin_always(&c_seg_dst->c_lock);
5258
5259 assert(c_seg_dst->c_busy);
5260 assert(c_seg_dst->c_state == C_IS_FILLING);
5261 assert(!c_seg_dst->c_on_minorcompact_q);
5262
5263 c_current_seg_filled(c_seg_dst, (c_segment_t *)current_chead);
5264 assert(*current_chead == NULL);
5265
5266 C_SEG_WAKEUP_DONE(c_seg_dst);
5267
5268 lck_mtx_unlock_always(&c_seg_dst->c_lock);
5269
5270 c_seg_dst = NULL;
5271
5272 PAGE_REPLACEMENT_DISALLOWED(FALSE);
5273
5274 goto Relookup_dst;
5275 }
5276
5277 c_dst = C_SEG_SLOT_FROM_INDEX(c_seg_dst, c_seg_dst->c_nextslot);
5278
5279 memcpy(&c_seg_dst->c_store.c_buffer[c_seg_dst->c_nextoffset], &c_seg_src->c_store.c_buffer[c_src->c_offset], c_size);
5280 /*
5281 * Is platform alignment actually necessary since wkdm aligns its output?
5282 */
5283 c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;
5284
5285 cslot_copy(c_dst, c_src);
5286 c_dst->c_offset = c_seg_dst->c_nextoffset;
5287
5288 if (c_seg_dst->c_firstemptyslot == c_seg_dst->c_nextslot) {
5289 c_seg_dst->c_firstemptyslot++;
5290 }
5291
5292 c_seg_dst->c_slots_used++;
5293 c_seg_dst->c_nextslot++;
5294 c_seg_dst->c_bytes_used += c_rounded_size;
5295 c_seg_dst->c_nextoffset += C_SEG_BYTES_TO_OFFSET(c_rounded_size);
5296
5297
5298 PACK_C_SIZE(c_src, 0);
5299
5300 c_seg_src->c_bytes_used -= c_rounded_size;
5301 c_seg_src->c_bytes_unused += c_rounded_size;
5302
5303 assert(c_seg_src->c_slots_used);
5304 c_seg_src->c_slots_used--;
5305
5306 if (!c_seg_src->c_swappedin) {
5307 /* Pessimistically lose swappedin status when non-swappedin pages are added. */
5308 c_seg_dst->c_swappedin = false;
5309 }
5310
5311 if (c_indx < c_seg_src->c_firstemptyslot) {
5312 c_seg_src->c_firstemptyslot = c_indx;
5313 }
5314
5315 c_dst = C_SEG_SLOT_FROM_INDEX(c_seg_dst, dst_slot);
5316
5317 PAGE_REPLACEMENT_ALLOWED(TRUE);
5318 slot_ptr = C_SLOT_UNPACK_PTR(c_dst);
5319 /* <csegno=0,indx=0> would mean "empty slot", so use csegno+1 */
5320 slot_ptr->s_cseg = c_seg_dst->c_mysegno + 1;
5321 slot_ptr->s_cindx = dst_slot;
5322
5323 PAGE_REPLACEMENT_ALLOWED(FALSE);
5324
5325 out:
5326 if (c_seg_src) {
5327 lck_mtx_lock_spin_always(&c_seg_src->c_lock);
5328
5329 C_SEG_WAKEUP_DONE(c_seg_src);
5330
5331 if (c_seg_src->c_bytes_used == 0 && c_seg_src->c_state != C_IS_FILLING) {
5332 if (!c_seg_src->c_on_minorcompact_q) {
5333 c_seg_need_delayed_compaction(c_seg_src, FALSE);
5334 }
5335 }
5336
5337 lck_mtx_unlock_always(&c_seg_src->c_lock);
5338 }
5339
5340 if (c_seg_dst) {
5341 PAGE_REPLACEMENT_DISALLOWED(TRUE);
5342
5343 lck_mtx_lock_spin_always(&c_seg_dst->c_lock);
5344
5345 if (c_seg_dst->c_nextoffset >= c_seg_off_limit || c_seg_dst->c_nextslot >= C_SLOT_MAX_INDEX) {
5346 /*
5347 * Nearing or exceeded maximum slot and offset capacity.
5348 */
5349 assert(c_seg_dst->c_busy);
5350 assert(c_seg_dst->c_state == C_IS_FILLING);
5351 assert(!c_seg_dst->c_on_minorcompact_q);
5352
5353 c_current_seg_filled(c_seg_dst, (c_segment_t *)current_chead);
5354 assert(*current_chead == NULL);
5355 }
5356
5357 C_SEG_WAKEUP_DONE(c_seg_dst);
5358
5359 lck_mtx_unlock_always(&c_seg_dst->c_lock);
5360
5361 c_seg_dst = NULL;
5362
5363 PAGE_REPLACEMENT_DISALLOWED(FALSE);
5364 }
5365
5366 return kr;
5367 }
5368 #endif /* CONFIG_FREEZE */
5369