xref: /xnu-12377.1.9/bsd/kern/kern_memorystatus_policy.c (revision f6217f891ac0bb64f3d375211650a4c1ff8ca1ea)
1 /*
2  * Copyright (c) 2006-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  *
28  */
29 
30 #include <kern/task.h>
31 #include <libkern/libkern.h>
32 #include <machine/atomic.h>
33 #include <mach/coalition.h>
34 #include <os/log.h>
35 #include <sys/coalition.h>
36 #include <sys/proc.h>
37 #include <sys/proc_internal.h>
38 #include <sys/sysctl.h>
39 #include <sys/kdebug.h>
40 #include <sys/kern_memorystatus.h>
41 #include <vm/vm_protos.h>
42 #include <vm/vm_compressor_xnu.h>
43 
44 #include <kern/kern_memorystatus_internal.h>
45 
46 /*
47  * All memory pressure policy decisions should live here, and there should be
48  * as little mechanism as possible. This file prioritizes readability.
49  */
50 
51 #pragma mark Policy Function Declarations
52 
53 #if CONFIG_JETSAM
54 static bool memorystatus_check_aggressive_jetsam_needed(int *jld_idle_kills);
55 #endif /* CONFIG_JETSAM */
56 
57 #pragma mark Memorystatus Health Check
58 
59 /*
60  * Each subsystem that relies on the memorystatus thread
61  * for resource exhaustion should put a health check in this section.
62  * The memorystatus thread runs all of the health checks
63  * to determine if the system is healthy. If the system is unhealthy
64  * it picks an action based on the system health status. See the
65  * Memorystatus Thread Actions section below.
66  */
67 
68 
69 #if XNU_TARGET_OS_WATCH
70 #define FREEZE_PREVENT_REFREEZE_OF_LAST_THAWED true
71 #define FREEZE_PREVENT_REFREEZE_OF_LAST_THAWED_TIMEOUT_SECONDS (60 * 15)
72 #else
73 #define FREEZE_PREVENT_REFREEZE_OF_LAST_THAWED false
74 #endif
75 extern pid_t memorystatus_freeze_last_pid_thawed;
76 extern uint64_t memorystatus_freeze_last_pid_thawed_ts;
77 
78 extern uint64_t memstat_oldest_reapable_proc_prio_start;
79 extern uint64_t memstat_reaper_min_age_secs;
80 extern uint64_t memstat_oldest_reapable_proc_will_be_reapable_at_ts_matu;
81 extern bool     memstat_reaper_is_currently_sweeping;
82 
83 extern vm_pressure_level_t memorystatus_vm_pressure_level;
84 
85 static void
memstat_evaluate_health_conditions(memorystatus_system_health_t status)86 memstat_evaluate_health_conditions(memorystatus_system_health_t status)
87 {
88 	memset(status, 0, sizeof(memorystatus_system_health_t));
89 	status->msh_compressor_low_on_space = vm_compressor_low_on_space() ||
90 	    os_atomic_load(&memorystatus_compressor_space_shortage, relaxed);
91 	status->msh_compressor_exhausted = vm_compressor_out_of_space();
92 	status->msh_swap_low_on_space = vm_swap_low_on_space();
93 	status->msh_swap_exhausted = vm_swap_out_of_space();
94 #if CONFIG_JETSAM
95 	memstat_evaluate_page_shortage(
96 		&status->msh_available_pages_below_soft,
97 		&status->msh_available_pages_below_idle,
98 		&status->msh_available_pages_below_critical,
99 		&status->msh_available_pages_below_reaper);
100 	status->msh_compressor_is_thrashing = !memorystatus_swap_all_apps && vm_compressor_is_thrashing();
101 #if CONFIG_PHANTOM_CACHE
102 	status->msh_phantom_cache_pressure = os_atomic_load(&memorystatus_phantom_cache_pressure, relaxed);
103 #else
104 	status->msh_phantom_cache_pressure = false;
105 #endif /* CONFIG_PHANTOM_CACHE */
106 	if (!memorystatus_swap_all_apps &&
107 	    status->msh_phantom_cache_pressure &&
108 	    !(status->msh_compressor_is_thrashing && status->msh_compressor_exhausted)) {
109 		status->msh_filecache_is_thrashing = true;
110 	}
111 	status->msh_pageout_starved = os_atomic_load(&memorystatus_pageout_starved, relaxed);
112 	status->msh_swappable_compressor_segments_over_limit = memorystatus_swap_over_trigger(100);
113 	status->msh_swapin_queue_over_limit = memorystatus_swapin_over_trigger();
114 #else /* !CONFIG_JETSAM */
115 	vm_pressure_level_t pressure_level = memorystatus_vm_pressure_level;
116 	status->msh_vm_pressure_critical = (pressure_level == kVMPressureCritical);
117 	status->msh_vm_pressure_warning = (pressure_level >= kVMPressureWarning);
118 #endif /* CONFIG_JETSAM */
119 	status->msh_zone_map_is_exhausted = os_atomic_load(&memorystatus_zone_map_is_exhausted, relaxed);
120 }
121 
122 static bool
memstat_is_system_healthy(const memorystatus_system_health_t status)123 memstat_is_system_healthy(const memorystatus_system_health_t status)
124 {
125 #if CONFIG_JETSAM
126 	return !(status->msh_available_pages_below_critical ||
127 	       status->msh_compressor_is_thrashing ||
128 	       status->msh_compressor_exhausted ||
129 	       status->msh_compressor_low_on_space ||
130 	       status->msh_filecache_is_thrashing ||
131 	       status->msh_zone_map_is_exhausted ||
132 	       status->msh_pageout_starved);
133 #else /* CONFIG_JETSAM */
134 	return !(status->msh_zone_map_is_exhausted ||
135 	       status->msh_compressor_exhausted ||
136 	       status->msh_compressor_low_on_space ||
137 	       status->msh_swap_exhausted ||
138 	       status->msh_swap_low_on_space ||
139 	       status->msh_vm_pressure_critical ||
140 	       status->msh_vm_pressure_warning);
141 #endif /* CONFIG_JETSAM */
142 }
143 
144 static void
memstat_log_system_health(const memorystatus_system_health_t status)145 memstat_log_system_health(const memorystatus_system_health_t status)
146 {
147 	static struct memorystatus_system_health_s prev_status = {0};
148 
149 	bool healthy = memstat_is_system_healthy(status);
150 
151 	/*
152 	 * Avoid spamming logs by only logging when the system status has changed.
153 	 */
154 	if (prev_status.msh_zone_map_is_exhausted == status->msh_zone_map_is_exhausted &&
155 	    prev_status.msh_compressor_exhausted == status->msh_compressor_exhausted &&
156 	    prev_status.msh_swap_low_on_space == status->msh_swap_low_on_space &&
157 	    prev_status.msh_swap_exhausted == status->msh_swap_exhausted
158 #if CONFIG_JETSAM
159 	    &&
160 	    prev_status.msh_available_pages_below_idle == status->msh_available_pages_below_idle &&
161 	    prev_status.msh_available_pages_below_soft == status->msh_available_pages_below_soft &&
162 	    prev_status.msh_available_pages_below_critical == status->msh_available_pages_below_critical &&
163 	    prev_status.msh_available_pages_below_reaper == status->msh_available_pages_below_reaper &&
164 	    prev_status.msh_compressor_needs_to_swap == status->msh_compressor_needs_to_swap &&
165 	    prev_status.msh_compressor_is_thrashing == status->msh_compressor_is_thrashing &&
166 	    prev_status.msh_filecache_is_thrashing == status->msh_filecache_is_thrashing &&
167 	    prev_status.msh_phantom_cache_pressure == status->msh_phantom_cache_pressure &&
168 	    prev_status.msh_swapin_queue_over_limit == status->msh_swapin_queue_over_limit &&
169 	    prev_status.msh_pageout_starved == status->msh_pageout_starved
170 #endif /* CONFIG_JETSAM */
171 	    ) {
172 		/* No change */
173 		return;
174 	}
175 
176 #if CONFIG_JETSAM
177 	if (healthy) {
178 		if (status->msh_available_pages_below_soft) {
179 			memorystatus_log(
180 				"memorystatus: System will begin enforcing "
181 				"soft memory limits. "
182 				"memorystatus_available_pages: %llu compressor_size: %u\n",
183 				(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
184 		} else if (status->msh_available_pages_below_idle) {
185 			memorystatus_log(
186 				"memorystatus: System will begin enacting "
187 				"idle-exits. "
188 				"memorystatus_available_pages: %llu compressor_size: %u\n",
189 				(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
190 		} else if (status->msh_available_pages_below_reaper) {
191 			memorystatus_log(
192 				"memorystatus: System will begin reaping "
193 				"long-idle processes. "
194 				"memorystatus_available_pages: %llu compressor_size: %u\n",
195 				(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
196 		} else {
197 			memorystatus_log(
198 				"memorystatus: System is healthy. "
199 				"memorystatus_available_pages: %llu compressor_size:%u\n",
200 				(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
201 		}
202 	} else {
203 		/* Unhealthy */
204 		memorystatus_log("memorystatus: System is unhealthy! memorystatus_available_pages: %llu compressor_size:%u\n",
205 		    (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
206 		memorystatus_log(
207 			"memorystatus: {"
208 			"\"available_pages_below_critical\": %d, "
209 			"\"available_pages_below_idle\": %d, "
210 			"\"available_pages_below_soft\": %d, "
211 			"\"available_pages_below_reaper\": %d, "
212 			"\"compressor_needs_to_swap\": %d, "
213 			"\"compressor_exhausted\": %d, "
214 			"\"compressor_is_thrashing\": %d, "
215 			"\"filecache_is_thrashing\": %d, "
216 			"\"zone_map_is_exhausted\": %d, "
217 			"\"phantom_cache_pressure\": %d, "
218 			"\"swappable_compressor_segments_over_limit\": %d, "
219 			"\"swapin_queue_over_limit\": %d, "
220 			"\"swap_low\": %d, "
221 			"\"swap_exhausted\": %d"
222 			"}\n",
223 			status->msh_available_pages_below_critical,
224 			status->msh_available_pages_below_idle,
225 			status->msh_available_pages_below_soft,
226 			status->msh_available_pages_below_reaper,
227 			status->msh_compressor_needs_to_swap,
228 			status->msh_compressor_exhausted,
229 			status->msh_compressor_is_thrashing,
230 			status->msh_filecache_is_thrashing,
231 			status->msh_zone_map_is_exhausted,
232 			status->msh_phantom_cache_pressure,
233 			status->msh_swappable_compressor_segments_over_limit,
234 			status->msh_swapin_queue_over_limit,
235 			status->msh_swap_low_on_space,
236 			status->msh_swap_exhausted);
237 	}
238 #else /* CONFIG_JETSAM */
239 	memorystatus_log("memorystatus: System is %s. memorystatus_available_pages: %llu compressor_size:%u\n",
240 	    healthy ? "healthy" : "unhealthy",
241 	    (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
242 	if (!healthy) {
243 		memorystatus_log(
244 			"memorystatus: {"
245 			"\"compressor_exhausted\": %d, "
246 			"\"zone_map_is_exhausted\": %d, "
247 			"\"swap_low\": %d, "
248 			"\"swap_exhausted\": %d"
249 			"}\n",
250 			status->msh_compressor_exhausted,
251 			status->msh_zone_map_is_exhausted,
252 			status->msh_swap_low_on_space,
253 			status->msh_swap_exhausted);
254 	}
255 #endif /* CONFIG_JETSAM */
256 	prev_status = *status;
257 }
258 
259 bool
memstat_check_system_health(memorystatus_system_health_t status)260 memstat_check_system_health(memorystatus_system_health_t status)
261 {
262 	memstat_evaluate_health_conditions(status);
263 	memstat_log_system_health(status);
264 	return memstat_is_system_healthy(status);
265 }
266 
267 #pragma mark Memorystatus Thread Actions
268 
269 /*
270  * This section picks the appropriate memorystatus_action & deploys it.
271  */
272 
273 uint64_t memstat_last_cache_purge_ts;
274 /* Purge caches under critical pressure up to every 1 min */
275 TUNABLE(uint64_t, memstat_cache_purge_backoff_ns,
276     "memorystatus_cache_purge_backoff_ns", 1 * 60 * NSEC_PER_SEC);
277 
278 static uint32_t
memorystatus_pick_kill_cause(const memorystatus_system_health_t status)279 memorystatus_pick_kill_cause(const memorystatus_system_health_t status)
280 {
281 	assert(!memstat_is_system_healthy(status));
282 #if CONFIG_JETSAM
283 	if (status->msh_compressor_is_thrashing) {
284 		return kMemorystatusKilledVMCompressorThrashing;
285 	} else if (status->msh_compressor_exhausted) {
286 		return kMemorystatusKilledVMCompressorSpaceShortage;
287 	} else if (status->msh_swap_low_on_space) {
288 		return kMemorystatusKilledLowSwap;
289 	} else if (status->msh_filecache_is_thrashing) {
290 		return kMemorystatusKilledFCThrashing;
291 	} else if (status->msh_zone_map_is_exhausted) {
292 		return kMemorystatusKilledZoneMapExhaustion;
293 	} else if (status->msh_pageout_starved) {
294 		return kMemorystatusKilledVMPageoutStarvation;
295 	} else {
296 		assert(status->msh_available_pages_below_critical);
297 		return kMemorystatusKilledVMPageShortage;
298 	}
299 #else /* CONFIG_JETSAM */
300 	if (status->msh_zone_map_is_exhausted) {
301 		return kMemorystatusKilledZoneMapExhaustion;
302 	} else if (status->msh_compressor_exhausted) {
303 		return kMemorystatusKilledVMCompressorSpaceShortage;
304 	} else if (status->msh_swap_exhausted) {
305 		return kMemorystatusKilledLowSwap;
306 	} else {
307 		return kMemorystatusKilled;
308 	}
309 #endif /* CONFIG_JETSAM */
310 }
311 
312 /*
313  * Inspects the state of various resources in the system to see if
314  * the system is healthy. If the system is not healthy, picks a
315  * memorystatus_action_t to recover the system.
316  *
317  * Every time the memorystatus thread wakes up it calls into here
318  * to pick an action. It will continue performing memorystatus actions until this
319  * function returns MEMORYSTATUS_KILL_NONE. At that point the thread will block.
320  */
321 memorystatus_action_t
memorystatus_pick_action(jetsam_state_t state,uint32_t * kill_cause,bool highwater_remaining,bool suspended_swappable_apps_remaining,bool swappable_apps_remaining,int * jld_idle_kills)322 memorystatus_pick_action(jetsam_state_t state,
323     uint32_t *kill_cause,
324     bool highwater_remaining,
325     bool suspended_swappable_apps_remaining,
326     bool swappable_apps_remaining,
327     int *jld_idle_kills)
328 {
329 	struct memorystatus_system_health_s status;
330 	bool is_system_healthy = memstat_check_system_health(&status);
331 
332 #if CONFIG_JETSAM
333 	if (status.msh_available_pages_below_soft || !is_system_healthy) {
334 		/*
335 		 * If swap is enabled, first check if we're running low or are out of swap space.
336 		 */
337 		if (memorystatus_swap_all_apps && jetsam_kill_on_low_swap) {
338 			if (swappable_apps_remaining && status.msh_swap_exhausted) {
339 				*kill_cause = kMemorystatusKilledLowSwap;
340 				return MEMORYSTATUS_KILL_SWAPPABLE;
341 			} else if (suspended_swappable_apps_remaining && status.msh_swap_low_on_space) {
342 				*kill_cause = kMemorystatusKilledLowSwap;
343 				return MEMORYSTATUS_KILL_SUSPENDED_SWAPPABLE;
344 			}
345 		}
346 
347 		/*
348 		 * We're below the pressure level or the system is unhealthy,
349 		 * regardless of the system health let's check if we should be swapping
350 		 * and if there are high watermark kills left to do.
351 		 */
352 		if (memorystatus_swap_all_apps) {
353 			if (status.msh_swappable_compressor_segments_over_limit && !vm_swapout_thread_running && !os_atomic_load(&vm_swapout_wake_pending, relaxed)) {
354 				/*
355 				 * TODO: The swapper will keep running until it has drained the entire early swapout queue.
356 				 * That might be overly aggressive & we should look into tuning it.
357 				 * See rdar://84102304.
358 				 */
359 				return MEMORYSTATUS_WAKE_SWAPPER;
360 			} else if (status.msh_swapin_queue_over_limit) {
361 				return MEMORYSTATUS_PROCESS_SWAPIN_QUEUE;
362 			} else if (status.msh_swappable_compressor_segments_over_limit) {
363 				memorystatus_log_info(
364 					"memorystatus: Skipping swap wakeup because the swap thread is already running. vm_swapout_thread_running=%d, vm_swapout_wake_pending=%d\n",
365 					vm_swapout_thread_running, os_atomic_load(&vm_swapout_wake_pending, relaxed));
366 			}
367 		}
368 
369 		if (status.msh_compressor_exhausted || status.msh_compressor_low_on_space) {
370 			*kill_cause = kMemorystatusKilledVMCompressorSpaceShortage;
371 			return MEMORYSTATUS_KILL_TOP_PROCESS;
372 		}
373 
374 		if (highwater_remaining) {
375 			*kill_cause = kMemorystatusKilledHiwat;
376 			return MEMORYSTATUS_KILL_HIWATER;
377 		}
378 	}
379 
380 	if (status.msh_available_pages_below_idle &&
381 	    memstat_get_idle_proccnt() > 0 &&
382 	    is_system_healthy) {
383 		/*
384 		 * The system is below the idle threshold but otherwise healthy.
385 		 */
386 		*kill_cause = kMemorystatusKilledIdleExit;
387 		return MEMORYSTATUS_KILL_IDLE;
388 	}
389 
390 	if (memstat_reaper_is_currently_sweeping && is_system_healthy) {
391 		/*
392 		 * The system is healthy and we're in a reaper sweep.
393 		 */
394 		*kill_cause = kMemorystatusKilledLongIdleExit;
395 		return MEMORYSTATUS_KILL_LONG_IDLE;
396 	}
397 
398 	if (is_system_healthy) {
399 		*kill_cause = 0;
400 		return MEMORYSTATUS_KILL_NONE;
401 	}
402 
403 	/*
404 	 * At this point the system is unhealthy and there are no
405 	 * more highwatermark processes to kill.
406 	 */
407 
408 	if (!state->limit_to_low_bands) {
409 		if (memorystatus_check_aggressive_jetsam_needed(jld_idle_kills)) {
410 			memorystatus_log("memorystatus: Starting aggressive jetsam.\n");
411 			*kill_cause = kMemorystatusKilledProcThrashing;
412 			return MEMORYSTATUS_KILL_AGGRESSIVE;
413 		}
414 	}
415 
416 	/*
417 	 * The system is unhealthy and we either don't need aggressive jetsam
418 	 * or are not allowed to deploy it.
419 	 * Kill in priority order. We'll use LRU within every band except the
420 	 * FG (which will be sorted by coalition role).
421 	 */
422 	*kill_cause = memorystatus_pick_kill_cause(&status);
423 	return MEMORYSTATUS_KILL_TOP_PROCESS;
424 #else /* !CONFIG_JETSAM */
425 	(void) state;
426 	(void) jld_idle_kills;
427 	(void) suspended_swappable_apps_remaining;
428 	(void) swappable_apps_remaining;
429 	(void) highwater_remaining;
430 
431 	/*
432 	 * Without CONFIG_JETSAM, we only kill if the system is unhealthy.
433 	 * There is no aggressive jetsam and no
434 	 * early highwatermark killing.
435 	 */
436 	if (is_system_healthy) {
437 		*kill_cause = 0;
438 		return MEMORYSTATUS_KILL_NONE;
439 	}
440 	*kill_cause = memorystatus_pick_kill_cause(&status);
441 	if (status.msh_zone_map_is_exhausted) {
442 		return MEMORYSTATUS_KILL_TOP_PROCESS;
443 	}
444 	if (status.msh_compressor_exhausted || status.msh_swap_exhausted) {
445 		if (kill_on_no_paging_space) {
446 			return MEMORYSTATUS_KILL_TOP_PROCESS;
447 		}
448 	}
449 	if (status.msh_compressor_low_on_space || status.msh_swap_low_on_space) {
450 		if (memstat_get_idle_proccnt() > 0) {
451 			/* Kill all idle processes before invoking the no paging space action */
452 			return MEMORYSTATUS_KILL_IDLE;
453 		}
454 		/*
455 		 * Throttle how often the no-paging-space action is performed.
456 		 */
457 		uint64_t now = mach_absolute_time();
458 		uint64_t delta_since_last_no_space_ns;
459 		uint64_t last_action_ts = os_atomic_load(&last_no_space_action_ts, relaxed);
460 		assert3u(now, >=, last_action_ts);
461 		absolutetime_to_nanoseconds(now - last_action_ts, &delta_since_last_no_space_ns);
462 		if (delta_since_last_no_space_ns > no_paging_space_action_throttle_delay_ns) {
463 			return MEMORYSTATUS_NO_PAGING_SPACE;
464 		} else {
465 			return MEMORYSTATUS_KILL_NONE;
466 		}
467 	}
468 	if (status.msh_vm_pressure_critical) {
469 		/*
470 		 * The system is under critical memory pressure. First terminate any low-risk
471 		 * idle processes. When they are exhausted, purge system memory caches.
472 		 */
473 		if (memstat_pressure_config & MEMSTAT_WARNING_KILL_LONG_IDLE &&
474 		    memstat_get_long_idle_proccnt() > 0) {
475 			*kill_cause = kMemorystatusKilledLongIdleExit;
476 			return MEMORYSTATUS_KILL_LONG_IDLE;
477 		}
478 		if (memstat_pressure_config & MEMSTAT_CRITICAL_KILL_IDLE &&
479 		    memstat_get_idle_proccnt() > 0) {
480 			*kill_cause = kMemorystatusKilledIdleExit;
481 			return MEMORYSTATUS_KILL_IDLE;
482 		}
483 		if (memstat_pressure_config & MEMSTAT_CRITICAL_PURGE_CACHES) {
484 			uint64_t now = mach_absolute_time();
485 			uint64_t delta_ns;
486 			uint64_t last_purge_ts = os_atomic_load(&memstat_last_cache_purge_ts, relaxed);
487 			assert3u(now, >=, last_purge_ts);
488 			absolutetime_to_nanoseconds(now - last_purge_ts, &delta_ns);
489 			if (delta_ns > memstat_cache_purge_backoff_ns) {
490 				memstat_last_cache_purge_ts = now;
491 				return MEMORYSTATUS_PURGE_CACHES;
492 			}
493 		}
494 		return MEMORYSTATUS_KILL_NONE;
495 	} else if (status.msh_vm_pressure_warning) {
496 		/*
497 		 * The system is under pressure and is likely to start swapping soon. Reap
498 		 * any long-idle daemons.
499 		 */
500 		if (memstat_pressure_config & MEMSTAT_WARNING_KILL_LONG_IDLE &&
501 		    memstat_get_long_idle_proccnt() > 0) {
502 			*kill_cause = kMemorystatusKilledLongIdleExit;
503 			return MEMORYSTATUS_KILL_LONG_IDLE;
504 		}
505 		return MEMORYSTATUS_KILL_NONE;
506 	}
507 #endif /* CONFIG_JETSAM */
508 	panic("System is unhealthy but no action has been chosen");
509 }
510 
511 #pragma mark Aggressive Jetsam
512 /*
513  * This section defines when we deploy aggressive jetsam.
514  * Aggressive jetsam kills everything up to the jld_priority_band_max band.
515  */
516 
517 #if CONFIG_JETSAM
518 
519 static bool
520 memorystatus_aggressive_jetsam_needed_sysproc_aging(__unused int jld_eval_aggressive_count, __unused int *jld_idle_kills, __unused int jld_idle_kill_candidates, int *total_candidates);
521 
522 /*
523  * kJetsamHighRelaunchCandidatesThreshold defines the percentage of candidates
524  * in the idle & deferred bands that need to be bad candidates in order to trigger
525  * aggressive jetsam.
526  */
527 TUNABLE_DEV_WRITEABLE(unsigned int, kJetsamHighRelaunchCandidatesThreshold, "jetsam_high_relaunch_candidates_threshold_percent", 100);
528 #if DEVELOPMENT || DEBUG
529 SYSCTL_UINT(_kern, OID_AUTO, jetsam_high_relaunch_candidates_threshold_percent, CTLFLAG_RW | CTLFLAG_LOCKED, &kJetsamHighRelaunchCandidatesThreshold, 100, "");
530 #endif /* DEVELOPMENT || DEBUG */
531 
532 /* kJetsamMinCandidatesThreshold defines the minimum number of candidates in the
533  * idle/deferred bands to trigger aggressive jetsam. This value basically decides
534  * how much memory the system is ready to hold in the lower bands without triggering
535  * aggressive jetsam. This number should ideally be tuned based on the memory config
536  * of the device.
537  */
538 TUNABLE_DT_DEV_WRITEABLE(unsigned int, kJetsamMinCandidatesThreshold, "/defaults", "kern.jetsam_min_candidates_threshold", "jetsam_min_candidates_threshold", 5, TUNABLE_DT_CHECK_CHOSEN);
539 #if DEVELOPMENT || DEBUG
540 SYSCTL_UINT(_kern, OID_AUTO, jetsam_min_candidates_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &kJetsamMinCandidatesThreshold, 5, "");
541 #endif /* DEVELOPMENT || DEBUG */
542 
543 static bool
memorystatus_check_aggressive_jetsam_needed(int * jld_idle_kills)544 memorystatus_check_aggressive_jetsam_needed(int *jld_idle_kills)
545 {
546 	bool aggressive_jetsam_needed = false;
547 	int total_candidates = 0;
548 	/*
549 	 * The aggressive jetsam logic looks at the number of times it has been in the
550 	 * aggressive loop to determine the max priority band it should kill upto. The
551 	 * static variables below are used to track that property.
552 	 *
553 	 * To reset those values, the implementation checks if it has been
554 	 * memorystatus_jld_eval_period_msecs since the parameters were reset.
555 	 */
556 
557 	if (memorystatus_jld_enabled == FALSE) {
558 		/* If aggressive jetsam is disabled, nothing to do here */
559 		return false;
560 	}
561 
562 	/* Get current timestamp (msecs only) */
563 	struct timeval  jld_now_tstamp = {0, 0};
564 	uint64_t        jld_now_msecs = 0;
565 	microuptime(&jld_now_tstamp);
566 	jld_now_msecs = (jld_now_tstamp.tv_sec * 1000);
567 
568 	/*
569 	 * Look at the number of candidates in the idle and deferred band and
570 	 * how many out of them are marked as high relaunch probability.
571 	 */
572 	aggressive_jetsam_needed = memorystatus_aggressive_jetsam_needed_sysproc_aging(jld_eval_aggressive_count,
573 	    jld_idle_kills, jld_idle_kill_candidates, &total_candidates);
574 
575 	/*
576 	 * It is also possible that the system is down to a very small number of processes in the candidate
577 	 * bands. In that case, the decisions made by the memorystatus_aggressive_jetsam_needed_* routines
578 	 * would not be useful. In that case, do not trigger aggressive jetsam.
579 	 */
580 	if (total_candidates < kJetsamMinCandidatesThreshold) {
581 		memorystatus_log_debug(
582 			"memorystatus: aggressive: [FAILED] Low Candidate "
583 			"Count (current: %d, threshold: %d)\n",
584 			total_candidates, kJetsamMinCandidatesThreshold);
585 		aggressive_jetsam_needed = false;
586 	}
587 
588 	/*
589 	 * Check if its been really long since the aggressive jetsam evaluation
590 	 * parameters have been refreshed. This logic also resets the jld_eval_aggressive_count
591 	 * counter to make sure we reset the aggressive jetsam severity.
592 	 */
593 	if ((total_candidates == 0) ||
594 	    (jld_now_msecs > (jld_timestamp_msecs + memorystatus_jld_eval_period_msecs))) {
595 		jld_timestamp_msecs       = jld_now_msecs;
596 		jld_idle_kill_candidates  = total_candidates;
597 		*jld_idle_kills           = 0;
598 		jld_eval_aggressive_count = 0;
599 	}
600 
601 	return aggressive_jetsam_needed;
602 }
603 
604 static bool
memorystatus_aggressive_jetsam_needed_sysproc_aging(__unused int eval_aggressive_count,__unused int * idle_kills,__unused int idle_kill_candidates,int * total_candidates)605 memorystatus_aggressive_jetsam_needed_sysproc_aging(__unused int eval_aggressive_count, __unused int *idle_kills, __unused int idle_kill_candidates, int *total_candidates)
606 {
607 	bool aggressive_jetsam_needed = false;
608 
609 	/*
610 	 * For the kJetsamAgingPolicySysProcsReclaimedFirst aging policy, we maintain the jetsam
611 	 * relaunch behavior for all daemons. Also, daemons and apps are aged in deferred bands on
612 	 * every dirty->clean transition. For this aging policy, the best way to determine if
613 	 * aggressive jetsam is needed, is to see if the kill candidates are mostly bad candidates.
614 	 * If yes, then we need to go to higher bands to reclaim memory.
615 	 */
616 	proc_list_lock();
617 	/* Get total candidate counts for idle and idle deferred bands */
618 	*total_candidates = memstat_bucket[JETSAM_PRIORITY_IDLE].count + memstat_bucket[system_procs_aging_band].count;
619 	/* Get counts of bad kill candidates in idle and idle deferred bands */
620 	int bad_candidates = memstat_bucket[JETSAM_PRIORITY_IDLE].relaunch_high_count + memstat_bucket[system_procs_aging_band].relaunch_high_count;
621 
622 	proc_list_unlock();
623 
624 	/* Check if the number of bad candidates is greater than kJetsamHighRelaunchCandidatesThreshold % */
625 	aggressive_jetsam_needed = (((bad_candidates * 100) / *total_candidates) >= kJetsamHighRelaunchCandidatesThreshold);
626 
627 	/*
628 	 * Since the new aging policy bases the aggressive jetsam trigger on percentage of
629 	 * bad candidates, it is prone to being overly aggressive. In order to mitigate that,
630 	 * make sure the system is really under memory pressure before triggering aggressive
631 	 * jetsam.
632 	 */
633 	if (memorystatus_available_pages > memorystatus_sysproc_aging_aggr_pages) {
634 		aggressive_jetsam_needed = false;
635 	}
636 
637 #if DEVELOPMENT || DEBUG
638 	memorystatus_log_info(
639 		"memorystatus: aggressive%d: [%s] Bad Candidate Threshold Check (total: %d, bad: %d, threshold: %d %%); Memory Pressure Check (available_pgs: %llu, threshold_pgs: %llu)\n",
640 		eval_aggressive_count, aggressive_jetsam_needed ? "PASSED" : "FAILED", *total_candidates, bad_candidates,
641 		kJetsamHighRelaunchCandidatesThreshold, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, (uint64_t)memorystatus_sysproc_aging_aggr_pages);
642 #endif /* DEVELOPMENT || DEBUG */
643 	return aggressive_jetsam_needed;
644 }
645 
646 #endif /* CONFIG_JETSAM */
647 
648 #pragma mark Freezer
649 #if CONFIG_FREEZE
650 /*
651  * Freezer policies
652  */
653 
654 /*
655  * These functions determine what is eligible for the freezer
656  * and the order that we consider freezing them
657  */
658 
659 /*
660  * Checks if the given process is eligible for the freezer.
661  * Processes can only be frozen if this returns true.
662  */
663 bool
memorystatus_is_process_eligible_for_freeze(proc_t p)664 memorystatus_is_process_eligible_for_freeze(proc_t p)
665 {
666 	/*
667 	 * Called with proc_list_lock held.
668 	 */
669 
670 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
671 
672 	bool should_freeze = false;
673 	uint32_t state = 0, pages = 0;
674 	bool first_consideration = true;
675 	task_t task;
676 
677 	state = p->p_memstat_state;
678 
679 	if (state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FREEZE_DISABLED | P_MEMSTAT_FREEZE_IGNORE)) {
680 		if (state & P_MEMSTAT_FREEZE_DISABLED) {
681 			p->p_memstat_freeze_skip_reason = kMemorystatusFreezeSkipReasonDisabled;
682 		}
683 		goto out;
684 	}
685 
686 	task = proc_task(p);
687 
688 	if (isSysProc(p)) {
689 		/*
690 		 * Daemon:- We consider freezing it if:
691 		 * - it belongs to a coalition and the leader is frozen, and,
692 		 * - its role in the coalition is XPC service.
693 		 *
694 		 * We skip memory size requirements in this case.
695 		 */
696 		int task_role_in_coalition = 0;
697 		proc_t leader_proc = memorystatus_get_coalition_leader_and_role(p, &task_role_in_coalition);
698 		if (leader_proc == PROC_NULL || leader_proc == p) {
699 			/*
700 			 * Jetsam coalition is leaderless or the leader is not an app.
701 			 * Either way, don't freeze this proc.
702 			 */
703 			goto out;
704 		}
705 
706 		/* Leader must be frozen */
707 		if (!(leader_proc->p_memstat_state & P_MEMSTAT_FROZEN)) {
708 			goto out;
709 		}
710 		/* Only freeze XPC services */
711 		if (task_role_in_coalition == COALITION_TASKROLE_XPC) {
712 			should_freeze = true;
713 		}
714 
715 		goto out;
716 	} else {
717 		/*
718 		 * Application. Only freeze if it's suspended.
719 		 */
720 		if (!(state & P_MEMSTAT_SUSPENDED)) {
721 			goto out;
722 		}
723 	}
724 
725 	/*
726 	 * We're interested in tracking what percentage of
727 	 * eligible apps actually get frozen.
728 	 * To avoid skewing the metrics towards processes which
729 	 * are considered more frequently, we only track failures once
730 	 * per process.
731 	 */
732 	first_consideration = !(state & P_MEMSTAT_FREEZE_CONSIDERED);
733 
734 	if (first_consideration) {
735 		memorystatus_freezer_stats.mfs_process_considered_count++;
736 		p->p_memstat_state |= P_MEMSTAT_FREEZE_CONSIDERED;
737 	}
738 
739 	/* Only freeze applications meeting our minimum resident page criteria */
740 	memorystatus_get_task_page_counts(proc_task(p), &pages, NULL, NULL);
741 	if (pages < memorystatus_freeze_pages_min) {
742 		if (first_consideration) {
743 			memorystatus_freezer_stats.mfs_error_below_min_pages_count++;
744 		}
745 		p->p_memstat_freeze_skip_reason = kMemorystatusFreezeSkipReasonBelowMinPages;
746 		goto out;
747 	}
748 
749 	/* Don't freeze processes that are already exiting on core. It may have started exiting
750 	 * after we chose it for freeze, but before we obtained the proc_list_lock.
751 	 * NB: This is only possible if we're coming in from memorystatus_freeze_process_sync.
752 	 * memorystatus_freeze_top_process holds the proc_list_lock while it traverses the bands.
753 	 */
754 	if (proc_list_exited(p)) {
755 		if (first_consideration) {
756 			memorystatus_freezer_stats.mfs_error_other_count++;
757 		}
758 		p->p_memstat_freeze_skip_reason = kMemorystatusFreezeSkipReasonOther;
759 		goto out;
760 	}
761 
762 	if (!memorystatus_freezer_use_ordered_list) {
763 		/*
764 		 * We're not using the ordered list so we need to check
765 		 * that dasd recommended the process. Note that the ordered list
766 		 * algorithm only considers processes on the list in the first place
767 		 * so there's no need to double check here.
768 		 */
769 		if (!memorystatus_freeze_process_is_recommended(p)) {
770 			if (first_consideration) {
771 				memorystatus_freezer_stats.mfs_error_low_probability_of_use_count++;
772 			}
773 			p->p_memstat_freeze_skip_reason = kMemorystatusFreezeSkipReasonLowProbOfUse;
774 			goto out;
775 		}
776 	}
777 
778 	if (!(state & P_MEMSTAT_FROZEN) && p->p_memstat_effectivepriority > memorystatus_freeze_max_candidate_band) {
779 		/*
780 		 * Proc has been elevated by something else.
781 		 * Don't freeze it.
782 		 */
783 		if (first_consideration) {
784 			memorystatus_freezer_stats.mfs_error_elevated_count++;
785 		}
786 		p->p_memstat_freeze_skip_reason = kMemorystatusFreezeSkipReasonElevated;
787 		goto out;
788 	}
789 
790 	should_freeze = true;
791 out:
792 	if (should_freeze && !(state & P_MEMSTAT_FROZEN)) {
793 		/*
794 		 * Reset the skip reason. If it's killed before we manage to actually freeze it
795 		 * we failed to consider it early enough.
796 		 */
797 		p->p_memstat_freeze_skip_reason = kMemorystatusFreezeSkipReasonNone;
798 		if (!first_consideration) {
799 			/*
800 			 * We're freezing this for the first time and we previously considered it ineligible.
801 			 * Bump the considered count so that we track this as 1 failure
802 			 * and 1 success.
803 			 */
804 			memorystatus_freezer_stats.mfs_process_considered_count++;
805 		}
806 	}
807 	return should_freeze;
808 }
809 
810 bool
memorystatus_freeze_proc_is_refreeze_eligible(proc_t p)811 memorystatus_freeze_proc_is_refreeze_eligible(proc_t p)
812 {
813 	return (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) != 0;
814 }
815 
816 
817 static proc_t
memorystatus_freeze_pick_refreeze_process(proc_t last_p)818 memorystatus_freeze_pick_refreeze_process(proc_t last_p)
819 {
820 	proc_t p = PROC_NULL, next_p = PROC_NULL;
821 	unsigned int band = (unsigned int) memorystatus_freeze_jetsam_band;
822 	if (last_p == PROC_NULL) {
823 		next_p = memorystatus_get_first_proc_locked(&band, FALSE);
824 	} else {
825 		next_p = memorystatus_get_next_proc_locked(&band, last_p, FALSE);
826 	}
827 	while (next_p) {
828 		p = next_p;
829 		next_p = memorystatus_get_next_proc_locked(&band, p, FALSE);
830 		if ((p->p_memstat_state & P_MEMSTAT_FROZEN) && !memorystatus_freeze_proc_is_refreeze_eligible(p)) {
831 			/* Process is already frozen & hasn't been thawed. */
832 			continue;
833 		}
834 		/*
835 		 * Has to have been frozen once before.
836 		 */
837 		if (!(p->p_memstat_state & P_MEMSTAT_FROZEN)) {
838 			continue;
839 		}
840 
841 		/*
842 		 * Not currently being looked at for something.
843 		 */
844 		if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
845 			continue;
846 		}
847 
848 #if FREEZE_PREVENT_REFREEZE_OF_LAST_THAWED
849 		/*
850 		 * Don't refreeze the last process we just thawed if still within the timeout window
851 		 */
852 		if (p->p_pid == memorystatus_freeze_last_pid_thawed) {
853 			uint64_t timeout_delta_abs;
854 			nanoseconds_to_absolutetime(FREEZE_PREVENT_REFREEZE_OF_LAST_THAWED_TIMEOUT_SECONDS * NSEC_PER_SEC, &timeout_delta_abs);
855 			if (mach_absolute_time() < (memorystatus_freeze_last_pid_thawed_ts + timeout_delta_abs)) {
856 				continue;
857 			}
858 		}
859 #endif
860 
861 		/*
862 		 * Found it
863 		 */
864 		return p;
865 	}
866 	return PROC_NULL;
867 }
868 
869 proc_t
memorystatus_freeze_pick_process(struct memorystatus_freeze_list_iterator * iterator)870 memorystatus_freeze_pick_process(struct memorystatus_freeze_list_iterator *iterator)
871 {
872 	proc_t p = PROC_NULL, next_p = PROC_NULL;
873 	unsigned int band = JETSAM_PRIORITY_IDLE;
874 
875 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
876 	/*
877 	 * If the freezer is full, only consider refreezes.
878 	 */
879 	if (iterator->refreeze_only || memorystatus_frozen_count >= memorystatus_frozen_processes_max) {
880 		if (!iterator->refreeze_only) {
881 			/*
882 			 * The first time the iterator starts to return refreeze
883 			 * candidates, we need to reset the last pointer b/c it's pointing into the wrong band.
884 			 */
885 			iterator->last_p = PROC_NULL;
886 			iterator->refreeze_only = true;
887 		}
888 		iterator->last_p = memorystatus_freeze_pick_refreeze_process(iterator->last_p);
889 		return iterator->last_p;
890 	}
891 
892 	/*
893 	 * Search for the next freezer candidate.
894 	 */
895 	if (memorystatus_freezer_use_ordered_list) {
896 		while (iterator->global_freeze_list_index < memorystatus_global_freeze_list.mfcl_length) {
897 			p = memorystatus_freezer_candidate_list_get_proc(
898 				&memorystatus_global_freeze_list,
899 				(iterator->global_freeze_list_index)++,
900 				&memorystatus_freezer_stats.mfs_freeze_pid_mismatches);
901 
902 			if (p != PROC_NULL && memorystatus_is_process_eligible_for_freeze(p)) {
903 #if FREEZE_PREVENT_REFREEZE_OF_LAST_THAWED
904 				/*
905 				 * Don't refreeze the last process we just thawed if still within the timeout window
906 				 */
907 				if (p->p_pid == memorystatus_freeze_last_pid_thawed) {
908 					uint64_t timeout_delta_abs;
909 					nanoseconds_to_absolutetime(FREEZE_PREVENT_REFREEZE_OF_LAST_THAWED_TIMEOUT_SECONDS * NSEC_PER_SEC, &timeout_delta_abs);
910 					if (mach_absolute_time() < (memorystatus_freeze_last_pid_thawed_ts + timeout_delta_abs)) {
911 						continue;
912 					}
913 				}
914 #endif
915 				iterator->last_p = p;
916 				return iterator->last_p;
917 			}
918 		}
919 	} else {
920 		if (iterator->last_p == PROC_NULL) {
921 			next_p = memorystatus_get_first_proc_locked(&band, FALSE);
922 		} else {
923 			next_p = memorystatus_get_next_proc_locked(&band, iterator->last_p, FALSE);
924 		}
925 		while (next_p) {
926 			p = next_p;
927 			if (memorystatus_is_process_eligible_for_freeze(p)) {
928 				iterator->last_p = p;
929 				return iterator->last_p;
930 			} else {
931 				next_p = memorystatus_get_next_proc_locked(&band, p, FALSE);
932 			}
933 		}
934 	}
935 
936 	/*
937 	 * Failed to find a new freezer candidate.
938 	 * Try to re-freeze.
939 	 */
940 	if (memorystatus_refreeze_eligible_count >= memorystatus_min_thaw_refreeze_threshold) {
941 		assert(!iterator->refreeze_only);
942 		iterator->refreeze_only = true;
943 		iterator->last_p = memorystatus_freeze_pick_refreeze_process(PROC_NULL);
944 		return iterator->last_p;
945 	}
946 	return PROC_NULL;
947 }
948 
949 /*
950  * memorystatus_pages_update calls this function whenever the number
951  * of available pages changes. It wakes the freezer thread iff the function returns
952  * true. The freezer thread will try to freeze (or refreeze) up to 1 process
953  * before blocking again.
954  *
955  * Note the freezer thread is also woken up by memorystatus_on_inactivity.
956  */
957 
958 bool
memorystatus_freeze_thread_should_run()959 memorystatus_freeze_thread_should_run()
960 {
961 	/*
962 	 * No freezer_mutex held here...see why near call-site
963 	 * within memorystatus_pages_update().
964 	 */
965 
966 	if (memorystatus_freeze_enabled == false) {
967 		return false;
968 	}
969 
970 	if (memorystatus_available_pages > memorystatus_freeze_threshold) {
971 		return false;
972 	}
973 
974 	memorystatus_freezer_stats.mfs_below_threshold_count++;
975 
976 	if ((memorystatus_frozen_count >= memorystatus_frozen_processes_max)) {
977 		/*
978 		 * Consider this as a skip even if we wake up to refreeze because
979 		 * we won't freeze any new procs.
980 		 */
981 		memorystatus_freezer_stats.mfs_skipped_full_count++;
982 		if (memorystatus_refreeze_eligible_count < memorystatus_min_thaw_refreeze_threshold) {
983 			return false;
984 		}
985 	}
986 
987 	if (memorystatus_frozen_shared_mb_max && (memorystatus_frozen_shared_mb >= memorystatus_frozen_shared_mb_max)) {
988 		memorystatus_freezer_stats.mfs_skipped_shared_mb_high_count++;
989 		return false;
990 	}
991 
992 	uint64_t curr_time = mach_absolute_time();
993 
994 	if (curr_time < memorystatus_freezer_thread_next_run_ts) {
995 		return false;
996 	}
997 
998 	return true;
999 }
1000 
1001 size_t
memorystatus_pick_freeze_count_for_wakeup()1002 memorystatus_pick_freeze_count_for_wakeup()
1003 {
1004 	size_t num_to_freeze = 0;
1005 	if (!memorystatus_swap_all_apps) {
1006 		num_to_freeze = 1;
1007 	} else {
1008 		/*
1009 		 * When app swap is enabled, we want the freezer thread to aggressively freeze
1010 		 * all candidates so we clear out space for the fg working set.
1011 		 * But we still cap it to the current size of the candidate bands to avoid
1012 		 * consuming excessive CPU if there's a lot of churn in the candidate band.
1013 		 */
1014 		proc_list_lock();
1015 		for (unsigned int band = JETSAM_PRIORITY_IDLE; band <= memorystatus_freeze_max_candidate_band; band++) {
1016 			num_to_freeze += memstat_bucket[band].count;
1017 		}
1018 		proc_list_unlock();
1019 	}
1020 
1021 	return num_to_freeze;
1022 }
1023 
1024 #endif /* CONFIG_FREEZE */
1025