xref: /xnu-12377.81.4/bsd/kern/kern_memorystatus_policy.c (revision 043036a2b3718f7f0be807e2870f8f47d3fa0796)
1 /*
2  * Copyright (c) 2006-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  *
28  */
29 
30 #include <kern/task.h>
31 #include <libkern/libkern.h>
32 #include <machine/atomic.h>
33 #include <mach/coalition.h>
34 #include <os/log.h>
35 #include <sys/coalition.h>
36 #include <sys/proc.h>
37 #include <sys/proc_internal.h>
38 #include <sys/sysctl.h>
39 #include <sys/kdebug.h>
40 #include <sys/kern_memorystatus.h>
41 #include <vm/vm_protos.h>
42 #include <vm/vm_compressor_xnu.h>
43 
44 #include <kern/kern_memorystatus_internal.h>
45 
46 /*
47  * All memory pressure policy decisions should live here, and there should be
48  * as little mechanism as possible. This file prioritizes readability.
49  */
50 
51 #pragma mark Policy Function Declarations
52 
53 #if CONFIG_JETSAM
54 static bool memorystatus_check_aggressive_jetsam_needed(int *jld_idle_kills);
55 #endif /* CONFIG_JETSAM */
56 
57 #pragma mark Memorystatus Health Check
58 
59 /*
60  * Each subsystem that relies on the memorystatus thread
61  * for resource exhaustion should put a health check in this section.
62  * The memorystatus thread runs all of the health checks
63  * to determine if the system is healthy. If the system is unhealthy
64  * it picks an action based on the system health status. See the
65  * Memorystatus Thread Actions section below.
66  */
67 
68 
/* Reaper state defined elsewhere; read here for health/policy decisions. */
extern uint64_t memstat_oldest_reapable_proc_prio_start;
extern uint64_t memstat_reaper_min_age_secs;
extern uint64_t memstat_oldest_reapable_proc_will_be_reapable_at_ts_matu;
extern bool     memstat_reaper_is_currently_sweeping;

/* Current VM pressure level; read by the !CONFIG_JETSAM health evaluation. */
extern vm_pressure_level_t memorystatus_vm_pressure_level;
75 
76 static void
memstat_evaluate_health_conditions(memorystatus_system_health_t status)77 memstat_evaluate_health_conditions(memorystatus_system_health_t status)
78 {
79 	memset(status, 0, sizeof(memorystatus_system_health_t));
80 	status->msh_compressor_low_on_space = vm_compressor_low_on_space() ||
81 	    os_atomic_load(&memorystatus_compressor_space_shortage, relaxed);
82 	status->msh_compressor_exhausted = vm_compressor_out_of_space();
83 	status->msh_swap_low_on_space = vm_swap_low_on_space();
84 	status->msh_swap_exhausted = vm_swap_out_of_space();
85 #if CONFIG_JETSAM
86 	memstat_evaluate_page_shortage(
87 		&status->msh_available_pages_below_soft,
88 		&status->msh_available_pages_below_idle,
89 		&status->msh_available_pages_below_critical,
90 		&status->msh_available_pages_below_reaper);
91 	status->msh_compressor_is_thrashing = !memorystatus_swap_all_apps && vm_compressor_is_thrashing();
92 #if CONFIG_PHANTOM_CACHE
93 	status->msh_phantom_cache_pressure = os_atomic_load(&memorystatus_phantom_cache_pressure, relaxed);
94 #else
95 	status->msh_phantom_cache_pressure = false;
96 #endif /* CONFIG_PHANTOM_CACHE */
97 	if (!memorystatus_swap_all_apps &&
98 	    status->msh_phantom_cache_pressure &&
99 	    !(status->msh_compressor_is_thrashing && status->msh_compressor_exhausted)) {
100 		status->msh_filecache_is_thrashing = true;
101 	}
102 	status->msh_pageout_starved = os_atomic_load(&memorystatus_pageout_starved, relaxed);
103 	status->msh_swappable_compressor_segments_over_limit = memorystatus_swap_over_trigger(100);
104 	status->msh_swapin_queue_over_limit = memorystatus_swapin_over_trigger();
105 #else /* !CONFIG_JETSAM */
106 	vm_pressure_level_t pressure_level = memorystatus_vm_pressure_level;
107 	status->msh_vm_pressure_critical = (pressure_level == kVMPressureCritical);
108 	status->msh_vm_pressure_warning = (pressure_level >= kVMPressureWarning);
109 #endif /* CONFIG_JETSAM */
110 	status->msh_zone_map_is_exhausted = os_atomic_load(&memorystatus_zone_map_is_exhausted, relaxed);
111 }
112 
113 static bool
memstat_is_system_healthy(const memorystatus_system_health_t status)114 memstat_is_system_healthy(const memorystatus_system_health_t status)
115 {
116 #if CONFIG_JETSAM
117 	return !(status->msh_available_pages_below_critical ||
118 	       status->msh_compressor_is_thrashing ||
119 	       status->msh_compressor_exhausted ||
120 	       status->msh_compressor_low_on_space ||
121 	       status->msh_filecache_is_thrashing ||
122 	       status->msh_zone_map_is_exhausted ||
123 	       status->msh_pageout_starved);
124 #else /* CONFIG_JETSAM */
125 	return !(status->msh_zone_map_is_exhausted ||
126 	       status->msh_compressor_exhausted ||
127 	       status->msh_compressor_low_on_space ||
128 	       status->msh_swap_exhausted ||
129 	       status->msh_swap_low_on_space ||
130 	       status->msh_vm_pressure_critical ||
131 	       status->msh_vm_pressure_warning);
132 #endif /* CONFIG_JETSAM */
133 }
134 
/*
 * Log the current system memory health.
 *
 * A static copy of the previous snapshot is kept so that a log line is
 * emitted only when at least one of the compared fields changed since the
 * last call. When the system is unhealthy, a JSON-style breakdown of the
 * individual conditions is logged in addition to the summary line.
 *
 * NOTE(review): msh_compressor_low_on_space (and, under CONFIG_JETSAM,
 * msh_swappable_compressor_segments_over_limit) are not part of the
 * change-detection comparison below, so a transition in those fields alone
 * will not produce a new log line — confirm this is intentional.
 */
static void
memstat_log_system_health(const memorystatus_system_health_t status)
{
	/* Snapshot from the previous invocation; used to suppress duplicate logs. */
	static struct memorystatus_system_health_s prev_status = {0};

	bool healthy = memstat_is_system_healthy(status);

	/*
	 * Avoid spamming logs by only logging when the system status has changed.
	 */
	if (prev_status.msh_zone_map_is_exhausted == status->msh_zone_map_is_exhausted &&
	    prev_status.msh_compressor_exhausted == status->msh_compressor_exhausted &&
	    prev_status.msh_swap_low_on_space == status->msh_swap_low_on_space &&
	    prev_status.msh_swap_exhausted == status->msh_swap_exhausted
#if CONFIG_JETSAM
	    &&
	    prev_status.msh_available_pages_below_idle == status->msh_available_pages_below_idle &&
	    prev_status.msh_available_pages_below_soft == status->msh_available_pages_below_soft &&
	    prev_status.msh_available_pages_below_critical == status->msh_available_pages_below_critical &&
	    prev_status.msh_available_pages_below_reaper == status->msh_available_pages_below_reaper &&
	    prev_status.msh_compressor_needs_to_swap == status->msh_compressor_needs_to_swap &&
	    prev_status.msh_compressor_is_thrashing == status->msh_compressor_is_thrashing &&
	    prev_status.msh_filecache_is_thrashing == status->msh_filecache_is_thrashing &&
	    prev_status.msh_phantom_cache_pressure == status->msh_phantom_cache_pressure &&
	    prev_status.msh_swapin_queue_over_limit == status->msh_swapin_queue_over_limit &&
	    prev_status.msh_pageout_starved == status->msh_pageout_starved
#endif /* CONFIG_JETSAM */
	    ) {
		/* No change */
		return;
	}

#if CONFIG_JETSAM
	if (healthy) {
		/*
		 * Healthy overall, but one of the lower page thresholds may still be
		 * breached; report the most severe of soft/idle/reaper.
		 */
		if (status->msh_available_pages_below_soft) {
			memorystatus_log(
				"memorystatus: System will begin enforcing "
				"soft memory limits. "
				"memorystatus_available_pages: %llu compressor_size: %u\n",
				(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
		} else if (status->msh_available_pages_below_idle) {
			memorystatus_log(
				"memorystatus: System will begin enacting "
				"idle-exits. "
				"memorystatus_available_pages: %llu compressor_size: %u\n",
				(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
		} else if (status->msh_available_pages_below_reaper) {
			memorystatus_log(
				"memorystatus: System will begin reaping "
				"long-idle processes. "
				"memorystatus_available_pages: %llu compressor_size: %u\n",
				(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
		} else {
			memorystatus_log(
				"memorystatus: System is healthy. "
				"memorystatus_available_pages: %llu compressor_size:%u\n",
				(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
		}
	} else {
		/* Unhealthy */
		memorystatus_log("memorystatus: System is unhealthy! memorystatus_available_pages: %llu compressor_size:%u\n",
		    (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
		/* JSON-style dump of every condition for triage from the log alone. */
		memorystatus_log(
			"memorystatus: {"
			"\"available_pages_below_critical\": %d, "
			"\"available_pages_below_idle\": %d, "
			"\"available_pages_below_soft\": %d, "
			"\"available_pages_below_reaper\": %d, "
			"\"compressor_needs_to_swap\": %d, "
			"\"compressor_exhausted\": %d, "
			"\"compressor_is_thrashing\": %d, "
			"\"filecache_is_thrashing\": %d, "
			"\"zone_map_is_exhausted\": %d, "
			"\"phantom_cache_pressure\": %d, "
			"\"swappable_compressor_segments_over_limit\": %d, "
			"\"swapin_queue_over_limit\": %d, "
			"\"swap_low\": %d, "
			"\"swap_exhausted\": %d"
			"}\n",
			status->msh_available_pages_below_critical,
			status->msh_available_pages_below_idle,
			status->msh_available_pages_below_soft,
			status->msh_available_pages_below_reaper,
			status->msh_compressor_needs_to_swap,
			status->msh_compressor_exhausted,
			status->msh_compressor_is_thrashing,
			status->msh_filecache_is_thrashing,
			status->msh_zone_map_is_exhausted,
			status->msh_phantom_cache_pressure,
			status->msh_swappable_compressor_segments_over_limit,
			status->msh_swapin_queue_over_limit,
			status->msh_swap_low_on_space,
			status->msh_swap_exhausted);
	}
#else /* CONFIG_JETSAM */
	memorystatus_log("memorystatus: System is %s. memorystatus_available_pages: %llu compressor_size:%u\n",
	    healthy ? "healthy" : "unhealthy",
	    (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
	if (!healthy) {
		memorystatus_log(
			"memorystatus: {"
			"\"compressor_exhausted\": %d, "
			"\"zone_map_is_exhausted\": %d, "
			"\"swap_low\": %d, "
			"\"swap_exhausted\": %d"
			"}\n",
			status->msh_compressor_exhausted,
			status->msh_zone_map_is_exhausted,
			status->msh_swap_low_on_space,
			status->msh_swap_exhausted);
	}
#endif /* CONFIG_JETSAM */
	/* Remember this snapshot so the next call logs only on change. */
	prev_status = *status;
}
249 
250 bool
memstat_check_system_health(memorystatus_system_health_t status)251 memstat_check_system_health(memorystatus_system_health_t status)
252 {
253 	memstat_evaluate_health_conditions(status);
254 	memstat_log_system_health(status);
255 	return memstat_is_system_healthy(status);
256 }
257 
258 #pragma mark Memorystatus Thread Actions
259 
260 /*
261  * This section picks the appropriate memorystatus_action & deploys it.
262  */
263 
/* mach_absolute_time() of the most recent critical-pressure cache purge. */
uint64_t memstat_last_cache_purge_ts;
/* Purge caches under critical pressure up to every 1 min */
TUNABLE(uint64_t, memstat_cache_purge_backoff_ns,
    "memorystatus_cache_purge_backoff_ns", 1 * 60 * NSEC_PER_SEC);
268 
269 static uint32_t
memorystatus_pick_kill_cause(const memorystatus_system_health_t status)270 memorystatus_pick_kill_cause(const memorystatus_system_health_t status)
271 {
272 	assert(!memstat_is_system_healthy(status));
273 #if CONFIG_JETSAM
274 	if (status->msh_available_pages_below_critical) {
275 		return kMemorystatusKilledVMPageShortage;
276 	} else if (status->msh_compressor_exhausted) {
277 		return kMemorystatusKilledVMCompressorSpaceShortage;
278 	} else if (status->msh_compressor_is_thrashing) {
279 		return kMemorystatusKilledVMCompressorThrashing;
280 	} else if (status->msh_filecache_is_thrashing) {
281 		return kMemorystatusKilledFCThrashing;
282 	} else if (status->msh_zone_map_is_exhausted) {
283 		return kMemorystatusKilledZoneMapExhaustion;
284 	} else if (status->msh_pageout_starved) {
285 		return kMemorystatusKilledVMPageoutStarvation;
286 	} else {
287 		panic("decided to kill-top-process for unknown cause");
288 	}
289 #else /* CONFIG_JETSAM */
290 	if (status->msh_zone_map_is_exhausted) {
291 		return kMemorystatusKilledZoneMapExhaustion;
292 	} else if (status->msh_compressor_exhausted) {
293 		return kMemorystatusKilledVMCompressorSpaceShortage;
294 	} else if (status->msh_swap_exhausted) {
295 		return kMemorystatusKilledLowSwap;
296 	} else {
297 		return kMemorystatusKilled;
298 	}
299 #endif /* CONFIG_JETSAM */
300 }
301 
/*
 * Inspects the state of various resources in the system to see if
 * the system is healthy. If the system is not healthy, picks a
 * memorystatus_action_t to recover the system.
 *
 * Every time the memorystatus thread wakes up it calls into here
 * to pick an action. It will continue performing memorystatus actions until this
 * function returns MEMORYSTATUS_KILL_NONE. At that point the thread will block.
 *
 * Parameters:
 *   state                              - memorystatus thread state; only
 *                                        limit_to_low_bands is read here.
 *   kill_cause (out)                   - kill cause matching the returned
 *                                        action (0 for MEMORYSTATUS_KILL_NONE).
 *   highwater_remaining                - processes over their soft (highwater)
 *                                        limit remain to be killed.
 *   suspended_swappable_apps_remaining - suspended swappable apps remain.
 *   swappable_apps_remaining           - swappable apps remain.
 *   jld_idle_kills (in/out)            - idle-kill counter consumed by the
 *                                        aggressive-jetsam evaluation.
 */
memorystatus_action_t
memorystatus_pick_action(jetsam_state_t state,
    uint32_t *kill_cause,
    bool highwater_remaining,
    bool suspended_swappable_apps_remaining,
    bool swappable_apps_remaining,
    int *jld_idle_kills)
{
	struct memorystatus_system_health_s status;
	bool is_system_healthy = memstat_check_system_health(&status);

#if CONFIG_JETSAM
	if (status.msh_available_pages_below_soft || !is_system_healthy) {
		/*
		 * If swap is enabled, first check if we're running low or are out of swap space.
		 */
		if (memorystatus_swap_all_apps && jetsam_kill_on_low_swap) {
			if (swappable_apps_remaining && status.msh_swap_exhausted) {
				*kill_cause = kMemorystatusKilledLowSwap;
				return MEMORYSTATUS_KILL_SWAPPABLE;
			} else if (suspended_swappable_apps_remaining && status.msh_swap_low_on_space) {
				*kill_cause = kMemorystatusKilledLowSwap;
				return MEMORYSTATUS_KILL_SUSPENDED_SWAPPABLE;
			}
		}

		/*
		 * We're below the pressure level or the system is unhealthy,
		 * regardless of the system health let's check if we should be swapping
		 * and if there are high watermark kills left to do.
		 */
		if (memorystatus_swap_all_apps) {
			if (status.msh_swappable_compressor_segments_over_limit && !vm_swapout_thread_running && !os_atomic_load(&vm_swapout_wake_pending, relaxed)) {
				/*
				 * TODO: The swapper will keep running until it has drained the entire early swapout queue.
				 * That might be overly aggressive & we should look into tuning it.
				 * See rdar://84102304.
				 */
				return MEMORYSTATUS_WAKE_SWAPPER;
			} else if (status.msh_swapin_queue_over_limit) {
				return MEMORYSTATUS_PROCESS_SWAPIN_QUEUE;
			} else if (status.msh_swappable_compressor_segments_over_limit) {
				/* Swapout is warranted but the swap thread is already busy. */
				memorystatus_log_info(
					"memorystatus: Skipping swap wakeup because the swap thread is already running. vm_swapout_thread_running=%d, vm_swapout_wake_pending=%d\n",
					vm_swapout_thread_running, os_atomic_load(&vm_swapout_wake_pending, relaxed));
			}
		}

		/* Compressor space shortage always warrants a top-process kill. */
		if (status.msh_compressor_exhausted || status.msh_compressor_low_on_space) {
			*kill_cause = kMemorystatusKilledVMCompressorSpaceShortage;
			return MEMORYSTATUS_KILL_TOP_PROCESS;
		}

		if (highwater_remaining) {
			*kill_cause = kMemorystatusKilledHiwat;
			return MEMORYSTATUS_KILL_HIWATER;
		}
	}

	if (status.msh_available_pages_below_idle &&
	    memstat_get_idle_proccnt() > 0 &&
	    is_system_healthy) {
		/*
		 * The system is below the idle threshold but otherwise healthy.
		 */
		*kill_cause = kMemorystatusKilledIdleExit;
		return MEMORYSTATUS_KILL_IDLE;
	}

	if (memstat_reaper_is_currently_sweeping && is_system_healthy) {
		/*
		 * The system is healthy and we're in a reaper sweep.
		 */
		*kill_cause = kMemorystatusKilledLongIdleExit;
		return MEMORYSTATUS_KILL_LONG_IDLE;
	}

	if (is_system_healthy) {
		*kill_cause = 0;
		return MEMORYSTATUS_KILL_NONE;
	}

	/*
	 * At this point the system is unhealthy and there are no
	 * more highwatermark processes to kill.
	 */

	if (!state->limit_to_low_bands) {
		if (memorystatus_check_aggressive_jetsam_needed(jld_idle_kills)) {
			memorystatus_log("memorystatus: Starting aggressive jetsam.\n");
			*kill_cause = kMemorystatusKilledProcThrashing;
			return MEMORYSTATUS_KILL_AGGRESSIVE;
		}
	}

	/*
	 * The system is unhealthy and we either don't need aggressive jetsam
	 * or are not allowed to deploy it.
	 * Kill in priority order. We'll use LRU within every band except the
	 * FG (which will be sorted by coalition role).
	 */
	*kill_cause = memorystatus_pick_kill_cause(&status);
	return MEMORYSTATUS_KILL_TOP_PROCESS;
#else /* !CONFIG_JETSAM */
	(void) state;
	(void) jld_idle_kills;
	(void) suspended_swappable_apps_remaining;
	(void) swappable_apps_remaining;
	(void) highwater_remaining;

	/*
	 * Without CONFIG_JETSAM, we only kill if the system is unhealthy.
	 * There is no aggressive jetsam and no
	 * early highwatermark killing.
	 */
	if (is_system_healthy) {
		*kill_cause = 0;
		return MEMORYSTATUS_KILL_NONE;
	}
	*kill_cause = memorystatus_pick_kill_cause(&status);
	if (status.msh_zone_map_is_exhausted) {
		return MEMORYSTATUS_KILL_TOP_PROCESS;
	}
	if (status.msh_compressor_exhausted || status.msh_swap_exhausted) {
		/* Only kill for exhausted paging space when the policy opts in. */
		if (kill_on_no_paging_space) {
			return MEMORYSTATUS_KILL_TOP_PROCESS;
		}
	}
	if (status.msh_compressor_low_on_space || status.msh_swap_low_on_space) {
		if (memstat_get_idle_proccnt() > 0) {
			/* Kill all idle processes before invoking the no paging space action */
			return MEMORYSTATUS_KILL_IDLE;
		}
		/*
		 * Throttle how often the no-paging-space action is performed.
		 */
		uint64_t now = mach_absolute_time();
		uint64_t delta_since_last_no_space_ns;
		uint64_t last_action_ts = os_atomic_load(&last_no_space_action_ts, relaxed);
		assert3u(now, >=, last_action_ts);
		absolutetime_to_nanoseconds(now - last_action_ts, &delta_since_last_no_space_ns);
		if (delta_since_last_no_space_ns > no_paging_space_action_throttle_delay_ns) {
			return MEMORYSTATUS_NO_PAGING_SPACE;
		} else {
			return MEMORYSTATUS_KILL_NONE;
		}
	}
	if (status.msh_vm_pressure_critical) {
		/*
		 * The system is under critical memory pressure. First terminate any low-risk
		 * idle processes. When they are exhausted, purge system memory caches.
		 */
		if (memstat_pressure_config & MEMSTAT_WARNING_KILL_LONG_IDLE &&
		    memstat_get_long_idle_proccnt() > 0) {
			*kill_cause = kMemorystatusKilledLongIdleExit;
			return MEMORYSTATUS_KILL_LONG_IDLE;
		}
		if (memstat_pressure_config & MEMSTAT_CRITICAL_KILL_IDLE &&
		    memstat_get_idle_proccnt() > 0) {
			*kill_cause = kMemorystatusKilledIdleExit;
			return MEMORYSTATUS_KILL_IDLE;
		}
		if (memstat_pressure_config & MEMSTAT_CRITICAL_PURGE_CACHES) {
			/* Rate-limit purges to memstat_cache_purge_backoff_ns. */
			uint64_t now = mach_absolute_time();
			uint64_t delta_ns;
			uint64_t last_purge_ts = os_atomic_load(&memstat_last_cache_purge_ts, relaxed);
			assert3u(now, >=, last_purge_ts);
			absolutetime_to_nanoseconds(now - last_purge_ts, &delta_ns);
			if (delta_ns > memstat_cache_purge_backoff_ns) {
				memstat_last_cache_purge_ts = now;
				return MEMORYSTATUS_PURGE_CACHES;
			}
		}
		return MEMORYSTATUS_KILL_NONE;
	} else if (status.msh_vm_pressure_warning) {
		/*
		 * The system is under pressure and is likely to start swapping soon. Reap
		 * any long-idle daemons.
		 */
		if (memstat_pressure_config & MEMSTAT_WARNING_KILL_LONG_IDLE &&
		    memstat_get_long_idle_proccnt() > 0) {
			*kill_cause = kMemorystatusKilledLongIdleExit;
			return MEMORYSTATUS_KILL_LONG_IDLE;
		}
		return MEMORYSTATUS_KILL_NONE;
	}
#endif /* CONFIG_JETSAM */
	/* Every unhealthy condition must be handled by one of the branches above. */
	panic("System is unhealthy but no action has been chosen");
}
500 
501 #pragma mark Aggressive Jetsam
502 /*
503  * This section defines when we deploy aggressive jetsam.
504  * Aggressive jetsam kills everything up to the jld_priority_band_max band.
505  */
506 
#if CONFIG_JETSAM

/* Forward declaration; definition follows memorystatus_check_aggressive_jetsam_needed(). */
static bool
memorystatus_aggressive_jetsam_needed_sysproc_aging(__unused int jld_eval_aggressive_count, __unused int *jld_idle_kills, __unused int jld_idle_kill_candidates, int *total_candidates);

/*
 * kJetsamHighRelaunchCandidatesThreshold defines the percentage of candidates
 * in the idle & deferred bands that need to be bad candidates in order to trigger
 * aggressive jetsam.
 */
TUNABLE_DEV_WRITEABLE(unsigned int, kJetsamHighRelaunchCandidatesThreshold, "jetsam_high_relaunch_candidates_threshold_percent", 100);
#if DEVELOPMENT || DEBUG
SYSCTL_UINT(_kern, OID_AUTO, jetsam_high_relaunch_candidates_threshold_percent, CTLFLAG_RW | CTLFLAG_LOCKED, &kJetsamHighRelaunchCandidatesThreshold, 100, "");
#endif /* DEVELOPMENT || DEBUG */

/* kJetsamMinCandidatesThreshold defines the minimum number of candidates in the
 * idle/deferred bands to trigger aggressive jetsam. This value basically decides
 * how much memory the system is ready to hold in the lower bands without triggering
 * aggressive jetsam. This number should ideally be tuned based on the memory config
 * of the device.
 */
TUNABLE_DT_DEV_WRITEABLE(unsigned int, kJetsamMinCandidatesThreshold, "/defaults", "kern.jetsam_min_candidates_threshold", "jetsam_min_candidates_threshold", 5, TUNABLE_DT_CHECK_CHOSEN);
#if DEVELOPMENT || DEBUG
SYSCTL_UINT(_kern, OID_AUTO, jetsam_min_candidates_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &kJetsamMinCandidatesThreshold, 5, "");
#endif /* DEVELOPMENT || DEBUG */
532 
/*
 * Decide whether aggressive jetsam should be deployed.
 *
 * Returns true when the idle/deferred candidate population is both large
 * enough (>= kJetsamMinCandidatesThreshold) and dominated by
 * high-relaunch-probability ("bad") candidates. Also refreshes the global
 * evaluation-window state (jld_timestamp_msecs, jld_idle_kill_candidates,
 * jld_eval_aggressive_count) when the window has expired or there are no
 * candidates.
 *
 * `jld_idle_kills` (in/out): caller's idle-kill counter; reset to 0 when the
 * evaluation window is refreshed.
 */
static bool
memorystatus_check_aggressive_jetsam_needed(int *jld_idle_kills)
{
	bool aggressive_jetsam_needed = false;
	int total_candidates = 0;
	/*
	 * The aggressive jetsam logic looks at the number of times it has been in the
	 * aggressive loop to determine the max priority band it should kill upto. The
	 * static variables below are used to track that property.
	 *
	 * To reset those values, the implementation checks if it has been
	 * memorystatus_jld_eval_period_msecs since the parameters were reset.
	 */

	if (memorystatus_jld_enabled == FALSE) {
		/* If aggressive jetsam is disabled, nothing to do here */
		return false;
	}

	/* Get current timestamp (msecs only) */
	struct timeval  jld_now_tstamp = {0, 0};
	uint64_t        jld_now_msecs = 0;
	microuptime(&jld_now_tstamp);
	/* NOTE(review): tv_usec is ignored, so the window check below has second granularity — confirm intentional. */
	jld_now_msecs = (jld_now_tstamp.tv_sec * 1000);

	/*
	 * Look at the number of candidates in the idle and deferred band and
	 * how many out of them are marked as high relaunch probability.
	 */
	aggressive_jetsam_needed = memorystatus_aggressive_jetsam_needed_sysproc_aging(jld_eval_aggressive_count,
	    jld_idle_kills, jld_idle_kill_candidates, &total_candidates);

	/*
	 * It is also possible that the system is down to a very small number of processes in the candidate
	 * bands. In that case, the decisions made by the memorystatus_aggressive_jetsam_needed_* routines
	 * would not be useful. In that case, do not trigger aggressive jetsam.
	 */
	if (total_candidates < kJetsamMinCandidatesThreshold) {
		memorystatus_log_debug(
			"memorystatus: aggressive: [FAILED] Low Candidate "
			"Count (current: %d, threshold: %d)\n",
			total_candidates, kJetsamMinCandidatesThreshold);
		aggressive_jetsam_needed = false;
	}

	/*
	 * Check if its been really long since the aggressive jetsam evaluation
	 * parameters have been refreshed. This logic also resets the jld_eval_aggressive_count
	 * counter to make sure we reset the aggressive jetsam severity.
	 */
	if ((total_candidates == 0) ||
	    (jld_now_msecs > (jld_timestamp_msecs + memorystatus_jld_eval_period_msecs))) {
		jld_timestamp_msecs       = jld_now_msecs;
		jld_idle_kill_candidates  = total_candidates;
		*jld_idle_kills           = 0;
		jld_eval_aggressive_count = 0;
	}

	return aggressive_jetsam_needed;
}
593 
594 static bool
memorystatus_aggressive_jetsam_needed_sysproc_aging(__unused int eval_aggressive_count,__unused int * idle_kills,__unused int idle_kill_candidates,int * total_candidates)595 memorystatus_aggressive_jetsam_needed_sysproc_aging(__unused int eval_aggressive_count, __unused int *idle_kills, __unused int idle_kill_candidates, int *total_candidates)
596 {
597 	bool aggressive_jetsam_needed = false;
598 
599 	/*
600 	 * For the kJetsamAgingPolicySysProcsReclaimedFirst aging policy, we maintain the jetsam
601 	 * relaunch behavior for all daemons. Also, daemons and apps are aged in deferred bands on
602 	 * every dirty->clean transition. For this aging policy, the best way to determine if
603 	 * aggressive jetsam is needed, is to see if the kill candidates are mostly bad candidates.
604 	 * If yes, then we need to go to higher bands to reclaim memory.
605 	 */
606 	proc_list_lock();
607 	/* Get total candidate counts for idle and idle deferred bands */
608 	*total_candidates = memstat_bucket[JETSAM_PRIORITY_IDLE].count + memstat_bucket[system_procs_aging_band].count;
609 	/* Get counts of bad kill candidates in idle and idle deferred bands */
610 	int bad_candidates = memstat_bucket[JETSAM_PRIORITY_IDLE].relaunch_high_count + memstat_bucket[system_procs_aging_band].relaunch_high_count;
611 
612 	proc_list_unlock();
613 
614 	/* Check if the number of bad candidates is greater than kJetsamHighRelaunchCandidatesThreshold % */
615 	aggressive_jetsam_needed = (((bad_candidates * 100) / *total_candidates) >= kJetsamHighRelaunchCandidatesThreshold);
616 
617 	/*
618 	 * Since the new aging policy bases the aggressive jetsam trigger on percentage of
619 	 * bad candidates, it is prone to being overly aggressive. In order to mitigate that,
620 	 * make sure the system is really under memory pressure before triggering aggressive
621 	 * jetsam.
622 	 */
623 	if (memorystatus_available_pages > memorystatus_sysproc_aging_aggr_pages) {
624 		aggressive_jetsam_needed = false;
625 	}
626 
627 #if DEVELOPMENT || DEBUG
628 	memorystatus_log_info(
629 		"memorystatus: aggressive%d: [%s] Bad Candidate Threshold Check (total: %d, bad: %d, threshold: %d %%); Memory Pressure Check (available_pgs: %llu, threshold_pgs: %llu)\n",
630 		eval_aggressive_count, aggressive_jetsam_needed ? "PASSED" : "FAILED", *total_candidates, bad_candidates,
631 		kJetsamHighRelaunchCandidatesThreshold, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, (uint64_t)memorystatus_sysproc_aging_aggr_pages);
632 #endif /* DEVELOPMENT || DEBUG */
633 	return aggressive_jetsam_needed;
634 }
635 
636 #endif /* CONFIG_JETSAM */
637 
638 #pragma mark Freezer
639 #if CONFIG_FREEZE
640 /*
641  * Freezer policies
642  */
643 
644 /*
645  * These functions determine what is eligible for the freezer
646  * and the order that we consider freezing them
647  */
648 
649 /*
650  * Checks if the given process is eligible for the freezer.
651  * Processes can only be frozen if this returns true.
652  */
653 bool
memorystatus_is_process_eligible_for_freeze(proc_t p)654 memorystatus_is_process_eligible_for_freeze(proc_t p)
655 {
656 	/*
657 	 * Called with proc_list_lock held.
658 	 */
659 
660 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
661 
662 	bool should_freeze = false;
663 	uint32_t state = 0, pages = 0;
664 	bool first_consideration = true;
665 	task_t task;
666 
667 	state = p->p_memstat_state;
668 
669 	if (state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FREEZE_DISABLED | P_MEMSTAT_FREEZE_IGNORE)) {
670 		if (state & P_MEMSTAT_FREEZE_DISABLED) {
671 			p->p_memstat_freeze_skip_reason = kMemorystatusFreezeSkipReasonDisabled;
672 		}
673 		goto out;
674 	}
675 
676 	task = proc_task(p);
677 
678 	if (isSysProc(p)) {
679 		/*
680 		 * Daemon:- We consider freezing it if:
681 		 * - it belongs to a coalition and the leader is frozen, and,
682 		 * - its role in the coalition is XPC service.
683 		 *
684 		 * We skip memory size requirements in this case.
685 		 */
686 		int task_role_in_coalition = 0;
687 		proc_t leader_proc = memorystatus_get_coalition_leader_and_role(p, &task_role_in_coalition);
688 		if (leader_proc == PROC_NULL || leader_proc == p) {
689 			/*
690 			 * Jetsam coalition is leaderless or the leader is not an app.
691 			 * Either way, don't freeze this proc.
692 			 */
693 			goto out;
694 		}
695 
696 		/* Leader must be frozen */
697 		if (!(leader_proc->p_memstat_state & P_MEMSTAT_FROZEN)) {
698 			goto out;
699 		}
700 		/* Only freeze XPC services */
701 		if (task_role_in_coalition == COALITION_TASKROLE_XPC) {
702 			should_freeze = true;
703 		}
704 
705 		goto out;
706 	} else {
707 		/*
708 		 * Application. Only freeze if it's suspended.
709 		 */
710 		if (!(state & P_MEMSTAT_SUSPENDED)) {
711 			goto out;
712 		}
713 	}
714 
715 	/*
716 	 * We're interested in tracking what percentage of
717 	 * eligible apps actually get frozen.
718 	 * To avoid skewing the metrics towards processes which
719 	 * are considered more frequently, we only track failures once
720 	 * per process.
721 	 */
722 	first_consideration = !(state & P_MEMSTAT_FREEZE_CONSIDERED);
723 
724 	if (first_consideration) {
725 		memorystatus_freezer_stats.mfs_process_considered_count++;
726 		p->p_memstat_state |= P_MEMSTAT_FREEZE_CONSIDERED;
727 	}
728 
729 	/* Only freeze applications meeting our minimum resident page criteria */
730 	memorystatus_get_task_page_counts(proc_task(p), &pages, NULL, NULL);
731 	if (pages < memorystatus_freeze_pages_min) {
732 		if (first_consideration) {
733 			memorystatus_freezer_stats.mfs_error_below_min_pages_count++;
734 		}
735 		p->p_memstat_freeze_skip_reason = kMemorystatusFreezeSkipReasonBelowMinPages;
736 		goto out;
737 	}
738 
739 	/* Don't freeze processes that are already exiting on core. It may have started exiting
740 	 * after we chose it for freeze, but before we obtained the proc_list_lock.
741 	 * NB: This is only possible if we're coming in from memorystatus_freeze_process_sync.
742 	 * memorystatus_freeze_top_process holds the proc_list_lock while it traverses the bands.
743 	 */
744 	if (proc_list_exited(p)) {
745 		if (first_consideration) {
746 			memorystatus_freezer_stats.mfs_error_other_count++;
747 		}
748 		p->p_memstat_freeze_skip_reason = kMemorystatusFreezeSkipReasonOther;
749 		goto out;
750 	}
751 
752 	if (!memorystatus_freezer_use_ordered_list) {
753 		/*
754 		 * We're not using the ordered list so we need to check
755 		 * that dasd recommended the process. Note that the ordered list
756 		 * algorithm only considers processes on the list in the first place
757 		 * so there's no need to double check here.
758 		 */
759 		if (!memorystatus_freeze_process_is_recommended(p)) {
760 			if (first_consideration) {
761 				memorystatus_freezer_stats.mfs_error_low_probability_of_use_count++;
762 			}
763 			p->p_memstat_freeze_skip_reason = kMemorystatusFreezeSkipReasonLowProbOfUse;
764 			goto out;
765 		}
766 	}
767 
768 	if (!(state & P_MEMSTAT_FROZEN) && p->p_memstat_effectivepriority > memorystatus_freeze_max_candidate_band) {
769 		/*
770 		 * Proc has been elevated by something else.
771 		 * Don't freeze it.
772 		 */
773 		if (first_consideration) {
774 			memorystatus_freezer_stats.mfs_error_elevated_count++;
775 		}
776 		p->p_memstat_freeze_skip_reason = kMemorystatusFreezeSkipReasonElevated;
777 		goto out;
778 	}
779 
780 	should_freeze = true;
781 out:
782 	if (should_freeze && !(state & P_MEMSTAT_FROZEN)) {
783 		/*
784 		 * Reset the skip reason. If it's killed before we manage to actually freeze it
785 		 * we failed to consider it early enough.
786 		 */
787 		p->p_memstat_freeze_skip_reason = kMemorystatusFreezeSkipReasonNone;
788 		if (!first_consideration) {
789 			/*
790 			 * We're freezing this for the first time and we previously considered it ineligible.
791 			 * Bump the considered count so that we track this as 1 failure
792 			 * and 1 success.
793 			 */
794 			memorystatus_freezer_stats.mfs_process_considered_count++;
795 		}
796 	}
797 	return should_freeze;
798 }
799 
800 bool
memorystatus_freeze_proc_is_refreeze_eligible(proc_t p)801 memorystatus_freeze_proc_is_refreeze_eligible(proc_t p)
802 {
803 	return (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) != 0;
804 }
805 
806 
807 static proc_t
memorystatus_freeze_pick_refreeze_process(proc_t last_p)808 memorystatus_freeze_pick_refreeze_process(proc_t last_p)
809 {
810 	proc_t p = PROC_NULL, next_p = PROC_NULL;
811 	unsigned int band = (unsigned int) memorystatus_freeze_jetsam_band;
812 	if (last_p == PROC_NULL) {
813 		next_p = memorystatus_get_first_proc_locked(&band, FALSE);
814 	} else {
815 		next_p = memorystatus_get_next_proc_locked(&band, last_p, FALSE);
816 	}
817 	while (next_p) {
818 		p = next_p;
819 		next_p = memorystatus_get_next_proc_locked(&band, p, FALSE);
820 		if ((p->p_memstat_state & P_MEMSTAT_FROZEN) && !memorystatus_freeze_proc_is_refreeze_eligible(p)) {
821 			/* Process is already frozen & hasn't been thawed. */
822 			continue;
823 		}
824 		/*
825 		 * Has to have been frozen once before.
826 		 */
827 		if (!(p->p_memstat_state & P_MEMSTAT_FROZEN)) {
828 			continue;
829 		}
830 
831 		/*
832 		 * Not currently being looked at for something.
833 		 */
834 		if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
835 			continue;
836 		}
837 
838 		/*
839 		 * Don't refreeze a last process we just thawed if still within the timeout window
840 		 */
841 		if (memorystatus_freeze_prevent_refreeze_of_recently_thawed && memorystatus_freeze_was_process_recently_thawed(p)) {
842 			memorystatus_log("memorystatus: too soon to refreeze pid %d [%s], in memorystatus_freeze_pick_refreeze_process\n", p->p_pid, proc_best_name(p));
843 			continue;
844 		}
845 
846 		/*
847 		 * Found it
848 		 */
849 		return p;
850 	}
851 	return PROC_NULL;
852 }
853 
/*
 * Pick the next process the freezer thread should (re)freeze.
 *
 * The iterator carries the scan position across calls: last_p / the
 * ordered-list index for fresh-freeze candidates, and the refreeze_only
 * flag once the scan has switched over to refreeze candidates.
 * Returns PROC_NULL when no candidate is found.
 * Caller must hold the proc_list lock (asserted below).
 */
proc_t
memorystatus_freeze_pick_process(struct memorystatus_freeze_list_iterator *iterator)
{
	proc_t p = PROC_NULL, next_p = PROC_NULL;
	unsigned int band = JETSAM_PRIORITY_IDLE;

	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
	/*
	 * If the freezer is full, only consider refreezes.
	 */
	if (iterator->refreeze_only || memorystatus_frozen_count >= memorystatus_frozen_processes_max) {
		if (!iterator->refreeze_only) {
			/*
			 * The first time the iterator starts to return refreeze
			 * candidates, we need to reset the last pointer b/c it's pointing into the wrong band.
			 */
			iterator->last_p = PROC_NULL;
			iterator->refreeze_only = true;
		}
		iterator->last_p = memorystatus_freeze_pick_refreeze_process(iterator->last_p);
		return iterator->last_p;
	}

	/*
	 * Search for the next freezer candidate.
	 */
	if (memorystatus_freezer_use_ordered_list) {
		/*
		 * Walk the dasd-supplied ordered candidate list; the index is
		 * advanced even for entries that turn out to be ineligible.
		 */
		while (iterator->global_freeze_list_index < memorystatus_global_freeze_list.mfcl_length) {
			p = memorystatus_freezer_candidate_list_get_proc(
				&memorystatus_global_freeze_list,
				(iterator->global_freeze_list_index)++,
				&memorystatus_freezer_stats.mfs_freeze_pid_mismatches);

			if (p != PROC_NULL && memorystatus_is_process_eligible_for_freeze(p)) {
				/*
				 * Don't refreeze a process we just thawed if still within the timeout window
				 */
				if (memorystatus_freeze_prevent_refreeze_of_recently_thawed && memorystatus_freeze_was_process_recently_thawed(p)) {
					memorystatus_log("memorystatus: too soon to refreeze pid %d [%s], in memorystatus_freeze_pick_process\n", p->p_pid, proc_best_name(p));
					continue;
				}
				iterator->last_p = p;
				return iterator->last_p;
			}
		}
	} else {
		/* Band walk: resume after last_p, or start at the idle band. */
		if (iterator->last_p == PROC_NULL) {
			next_p = memorystatus_get_first_proc_locked(&band, FALSE);
		} else {
			next_p = memorystatus_get_next_proc_locked(&band, iterator->last_p, FALSE);
		}
		while (next_p) {
			p = next_p;
			if (memorystatus_is_process_eligible_for_freeze(p)) {
				iterator->last_p = p;
				return iterator->last_p;
			} else {
				next_p = memorystatus_get_next_proc_locked(&band, p, FALSE);
			}
		}
	}

	/*
	 * Failed to find a new freezer candidate.
	 * Try to re-freeze.
	 */
	if (memorystatus_refreeze_eligible_count >= memorystatus_min_thaw_refreeze_threshold) {
		assert(!iterator->refreeze_only);
		iterator->refreeze_only = true;
		iterator->last_p = memorystatus_freeze_pick_refreeze_process(PROC_NULL);
		return iterator->last_p;
	}
	return PROC_NULL;
}
928 
929 /*
930  * memorystatus_pages_update calls this function whenever the number
931  * of available pages changes. It wakes the freezer thread iff the function returns
932  * true. The freezer thread will try to freeze (or refreeze) up to 1 process
933  * before blocking again.
934  *
935  * Note the freezer thread is also woken up by memorystatus_on_inactivity.
936  */
937 
938 bool
memorystatus_freeze_thread_should_run()939 memorystatus_freeze_thread_should_run()
940 {
941 	/*
942 	 * No freezer_mutex held here...see why near call-site
943 	 * within memorystatus_pages_update().
944 	 */
945 
946 	if (memorystatus_freeze_enabled == false) {
947 		return false;
948 	}
949 
950 	if (memorystatus_available_pages > memorystatus_freeze_threshold) {
951 		return false;
952 	}
953 
954 	memorystatus_freezer_stats.mfs_below_threshold_count++;
955 
956 	if ((memorystatus_frozen_count >= memorystatus_frozen_processes_max)) {
957 		/*
958 		 * Consider this as a skip even if we wake up to refreeze because
959 		 * we won't freeze any new procs.
960 		 */
961 		memorystatus_freezer_stats.mfs_skipped_full_count++;
962 		if (memorystatus_refreeze_eligible_count < memorystatus_min_thaw_refreeze_threshold) {
963 			return false;
964 		}
965 	}
966 
967 	if (memorystatus_frozen_shared_mb_max && (memorystatus_frozen_shared_mb >= memorystatus_frozen_shared_mb_max)) {
968 		memorystatus_freezer_stats.mfs_skipped_shared_mb_high_count++;
969 		return false;
970 	}
971 
972 	uint64_t curr_time = mach_absolute_time();
973 
974 	if (curr_time < memorystatus_freezer_thread_next_run_ts) {
975 		return false;
976 	}
977 
978 	return true;
979 }
980 
981 size_t
memorystatus_pick_freeze_count_for_wakeup()982 memorystatus_pick_freeze_count_for_wakeup()
983 {
984 	size_t num_to_freeze = 0;
985 	if (!memorystatus_swap_all_apps) {
986 		num_to_freeze = 1;
987 	} else {
988 		/*
989 		 * When app swap is enabled, we want the freezer thread to aggressively freeze
990 		 * all candidates so we clear out space for the fg working set.
991 		 * But we still cap it to the current size of the candidate bands to avoid
992 		 * consuming excessive CPU if there's a lot of churn in the candidate band.
993 		 */
994 		proc_list_lock();
995 		for (unsigned int band = JETSAM_PRIORITY_IDLE; band <= memorystatus_freeze_max_candidate_band; band++) {
996 			num_to_freeze += memstat_bucket[band].count;
997 		}
998 		proc_list_unlock();
999 	}
1000 
1001 	return num_to_freeze;
1002 }
1003 
1004 #endif /* CONFIG_FREEZE */
1005