xref: /xnu-12377.61.12/bsd/kern/kern_memorystatus.c (revision 4d495c6e23c53686cf65f45067f79024cf5dcee8)
1 /*
2  * Copyright (c) 2006-2019 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  *
28  */
29 
30 #include <kern/sched_prim.h>
31 #include <kern/kalloc.h>
32 #include <kern/assert.h>
33 #include <kern/debug.h>
34 #include <kern/locks.h>
35 #include <kern/task.h>
36 #include <kern/thread.h>
37 #include <kern/host.h>
38 #include <kern/policy_internal.h>
39 #include <kern/thread_group.h>
40 #include <kern/zalloc.h>
41 
42 #include <corpses/task_corpse.h>
43 #include <libkern/libkern.h>
44 #include <mach/mach_time.h>
45 #include <mach/task.h>
46 #include <mach/host_priv.h>
47 #include <mach/mach_host.h>
48 #include <pexpert/pexpert.h>
49 #include <sys/coalition.h>
50 #include <sys/code_signing.h>
51 #include <sys/kern_event.h>
52 #include <sys/proc.h>
53 #include <sys/proc_info.h>
54 #include <sys/reason.h>
55 #include <sys/signal.h>
56 #include <sys/signalvar.h>
57 #include <sys/sysctl.h>
58 #include <sys/sysproto.h>
59 #include <sys/spawn_internal.h>
60 #include <sys/wait.h>
61 #include <sys/tree.h>
62 #include <sys/priv.h>
63 #include <vm/pmap.h>
64 #include <vm/vm_reclaim_xnu.h>
65 #include <vm/vm_pageout_xnu.h>
66 #include <vm/vm_protos.h>
67 #include <vm/vm_purgeable_xnu.h>
68 #include <vm/vm_page.h>
69 #include <vm/vm_compressor_xnu.h>
70 #include <mach/machine/sdt.h>
71 #include <libkern/section_keywords.h>
72 #include <os/atomic_private.h>
73 #include <os/overflow.h>
74 #include <mach/mach_time.h>
75 
76 #include <IOKit/IOBSD.h>
77 
78 #if CONFIG_MACF
79 #include <security/mac_framework.h>
80 #endif
81 
82 #if CONFIG_FREEZE
83 #include <vm/vm_map.h>
84 #endif /* CONFIG_FREEZE */
85 
86 #include <kern/kern_memorystatus_internal.h>
87 #include <sys/kern_memorystatus.h>
88 #include <sys/kern_memorystatus_xnu.h>
89 #include <sys/kern_memorystatus_freeze.h>
90 #include <sys/kern_memorystatus_notify.h>
91 #include <sys/kdebug_triage.h>
92 #include <sys/file_internal.h>
93 #include <net/necp.h>
94 
95 errno_t mach_to_bsd_errno(kern_return_t mach_err);
96 extern uint32_t vm_compressor_pool_size(void);
97 extern uint32_t vm_compressor_fragmentation_level(void);
98 
99 int block_corpses = 0; /* counter to block new corpses if jetsam purges them */
100 
/*
 * Human-readable names for jetsam kill causes, used only for logging.
 * Indexed by the kMemorystatus* kill-cause value; the trailing comments
 * record the enumerator each slot corresponds to, so the order here must
 * stay in sync with the cause enum.
 */
static const char *memstat_kill_cause_name[] = {
	"",                                             /* kMemorystatusInvalid							*/
	"jettisoned",                                   /* kMemorystatusKilled							*/
	"highwater",                                    /* kMemorystatusKilledHiwat						*/
	"vnode-limit",                                  /* kMemorystatusKilledVnodes					*/
	"vm-pageshortage",                              /* kMemorystatusKilledVMPageShortage			*/
	"proc-thrashing",                               /* kMemorystatusKilledProcThrashing				*/
	"fc-thrashing",                                 /* kMemorystatusKilledFCThrashing				*/
	"per-process-limit",                            /* kMemorystatusKilledPerProcessLimit			*/
	"disk-space-shortage",                          /* kMemorystatusKilledDiskSpaceShortage			*/
	"idle-exit",                                    /* kMemorystatusKilledIdleExit					*/
	"zone-map-exhaustion",                          /* kMemorystatusKilledZoneMapExhaustion			*/
	"vm-compressor-thrashing",                      /* kMemorystatusKilledVMCompressorThrashing		*/
	"vm-compressor-space-shortage",                 /* kMemorystatusKilledVMCompressorSpaceShortage	*/
	"low-swap",                                     /* kMemorystatusKilledLowSwap                   */
	"sustained-memory-pressure",                    /* kMemorystatusKilledSustainedPressure         */
	"vm-pageout-starvation",                        /* kMemorystatusKilledVMPageoutStarvation       */
	"conclave-limit",                               /* kMemorystatusKilledConclaveLimit             */
	"long-idle-exit",                               /* kMemorystatusKilledLongIdleExit				*/
};
122 
123 static const char *
memorystatus_priority_band_name(int32_t priority)124 memorystatus_priority_band_name(int32_t priority)
125 {
126 	switch (priority) {
127 	case JETSAM_PRIORITY_FOREGROUND:
128 		return "FOREGROUND";
129 	case JETSAM_PRIORITY_AUDIO_AND_ACCESSORY:
130 		return "AUDIO_AND_ACCESSORY";
131 	case JETSAM_PRIORITY_CONDUCTOR:
132 		return "CONDUCTOR";
133 	case JETSAM_PRIORITY_DRIVER_APPLE:
134 		return "DRIVER_APPLE";
135 	case JETSAM_PRIORITY_HOME:
136 		return "HOME";
137 	case JETSAM_PRIORITY_EXECUTIVE:
138 		return "EXECUTIVE";
139 	case JETSAM_PRIORITY_IMPORTANT:
140 		return "IMPORTANT";
141 	case JETSAM_PRIORITY_CRITICAL:
142 		return "CRITICAL";
143 	}
144 
145 	return "?";
146 }
147 
148 bool
is_reason_thrashing(unsigned cause)149 is_reason_thrashing(unsigned cause)
150 {
151 	switch (cause) {
152 	case kMemorystatusKilledFCThrashing:
153 	case kMemorystatusKilledVMCompressorThrashing:
154 	case kMemorystatusKilledVMCompressorSpaceShortage:
155 		return true;
156 	default:
157 		return false;
158 	}
159 }
160 
161 bool
is_reason_zone_map_exhaustion(unsigned cause)162 is_reason_zone_map_exhaustion(unsigned cause)
163 {
164 	return cause == kMemorystatusKilledZoneMapExhaustion;
165 }
166 
167 /*
168  * Returns the current zone map size and capacity to include in the jetsam snapshot.
169  * Defined in zalloc.c
170  */
171 extern void get_zone_map_size(uint64_t *current_size, uint64_t *capacity);
172 
173 /*
174  * Returns the name of the largest zone and its size to include in the jetsam snapshot.
175  * Defined in zalloc.c
176  */
177 extern void get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size);
178 
179 static int memstat_set_memlimits_locked(proc_t p, int32_t active_limit,
180     int32_t inactive_limit, memlimit_options_t options);
181 static bool memstat_proc_is_active_locked(proc_t);
182 
183 static int memorystatus_highwater_enabled = 1;  /* Update the cached memlimit data. */
184 
185 /*
186  * Cache this proc's active limit as its current limit before writing it to
187  * the ledger. Returns whether the new limit should be written to the ledger.
188  */
189 static inline bool
memstat_update_memlimit_locked(proc_t p,bool use_active)190 memstat_update_memlimit_locked(proc_t p, bool use_active)
191 {
192 	bool ledger_needed = false;
193 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
194 
195 	/* Cache limit value */
196 	if (use_active && p->p_memstat_memlimit != p->p_memstat_memlimit_active) {
197 		p->p_memstat_memlimit = p->p_memstat_memlimit_active;
198 		ledger_needed = true;
199 	} else if (!use_active &&
200 	    p->p_memstat_memlimit != p->p_memstat_memlimit_inactive) {
201 		p->p_memstat_memlimit = p->p_memstat_memlimit_inactive;
202 		ledger_needed = true;
203 	}
204 
205 	/* Cache limit fatality */
206 	if (_memstat_proc_memlimit_is_fatal(p, use_active) &&
207 	    !_memstat_proc_cached_memlimit_is_fatal(p)) {
208 		p->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;
209 		ledger_needed = true;
210 	} else if (!_memstat_proc_memlimit_is_fatal(p, use_active) &&
211 	    _memstat_proc_cached_memlimit_is_fatal(p)) {
212 		p->p_memstat_state &= ~P_MEMSTAT_FATAL_MEMLIMIT;
213 		ledger_needed = true;
214 	}
215 
216 	return ledger_needed;
217 }
218 
219 /*
220  * Write the process' current memlimit to the ledger for enforcement.
221  *
222  * Holding the proc_list_lock while writing to the ledgers (where the task
223  * lock is taken) can be problematic.  The proc list lock may optionally be
224  * dropped and re-taken while writing limits to the ledger. (rdar://21394491)
225  */
226 static int
_memstat_write_memlimit_to_ledger_locked(proc_t p,bool is_active,bool drop_lock)227 _memstat_write_memlimit_to_ledger_locked(proc_t p, bool is_active, bool drop_lock)
228 {
229 	kern_return_t kr;
230 	bool is_fatal = _memstat_proc_cached_memlimit_is_fatal(p);
231 
232 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
233 
234 #if MACH_ASSERT
235 	if (memorystatus_highwater_enabled) {
236 		if (is_active) {
237 			assert3u(is_fatal, ==, _memstat_proc_active_memlimit_is_fatal(p));
238 			assert3u(p->p_memstat_memlimit, ==, p->p_memstat_memlimit_active);
239 		} else {
240 			assert3u(is_fatal, ==, _memstat_proc_inactive_memlimit_is_fatal(p));
241 			assert3u(p->p_memstat_memlimit, ==, p->p_memstat_memlimit_inactive);
242 		}
243 	}
244 #endif /* MACH_ASSERT */
245 
246 	if (drop_lock) {
247 		if (proc_ref(p, true) != p) {
248 			memorystatus_log_error("Unable to take a reference on proc %s [%d]. "
249 			    "Cannot update memlimit", proc_best_name(p), proc_getpid(p));
250 			return ESRCH;
251 		}
252 		proc_list_unlock();
253 	}
254 
255 	memorystatus_log_debug("memorystatus: new limit on pid %d (%dMB %s)\n",
256 	    proc_getpid(p), (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
257 	    (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"));
258 
259 	kr = task_set_phys_footprint_limit_internal(proc_task(p),
260 	    (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1,
261 	    NULL, is_active, is_fatal);
262 
263 	if (drop_lock) {
264 		proc_list_lock();
265 		proc_rele(p);
266 	}
267 
268 	if (kr != KERN_SUCCESS) {
269 		memorystatus_log_fault("memorystatus: error (%d) setting memlimit in "
270 		    "ledger for %s [%d]\n", kr, proc_best_name(p), proc_pid(p));
271 		return mach_to_bsd_errno(kr);
272 	}
273 	return 0;
274 }
275 
276 #pragma mark General Tunables
277 
278 #define MEMORYSTATUS_SMALL_MEMORY_THRESHOLD (3UL * (1UL << 30))
279 #define MEMORYSTATUS_MEDIUM_MEMORY_THRESHOLD (6UL * (1UL << 30))
280 
281 #define MEMORYSTATUS_CLEAR_THE_DECKS_OFFSET_PERCENTAGE 5UL
282 #define MEMORYSTATUS_BALLAST_OFFSET_PERCENTAGE 5UL
283 #define MEMORYSTATUS_AGGR_SYSPROC_AGING_PERCENTAGE 7UL
284 #define MEMORYSTATUS_DELTA_PERCENTAGE_LARGE 4UL
285 #define MEMORYSTATUS_DELTA_PERCENTAGE_SMALL 5UL
286 
287 /*
288  * Fall back to these percentages/ratios if a mb value is not provided via EDT
289  *  DRAM (GB) | critical | idle | pressure | reaper | freeze
290  *  (0,3]     | 5%       | 10%  | 15%      | 20%    | 50%
291  *  (3,6]     | 4%       | 9%   | 15%      | 18%    | 50%
292  *  (6,∞)     | 4%       | 8%   | 12%      | 16%    | 50%
293  */
294 
295 #define MEMORYSTATUS_CRITICAL_PERCENTAGE_SMALL 5UL
296 #define MEMORYSTATUS_CRITICAL_PERCENTAGE_LARGE 4UL
297 
298 #define MEMORYSTATUS_IDLE_RATIO_NUM 2UL
299 #define MEMORYSTATUS_IDLE_RATIO_DENOM 1UL
300 #define MEMORYSTATUS_PRESSURE_RATIO_NUM 3UL
301 #define MEMORYSTATUS_PRESSURE_RATIO_DENOM 1UL
302 #define MEMORYSTATUS_REAPER_RATIO_NUM 4UL
303 #define MEMORYSTATUS_REAPER_RATIO_DENOM 1UL
304 
305 #if (XNU_TARGET_OS_IOS && !XNU_TARGET_OS_XR) || XNU_TARGET_OS_WATCH
306 #define MEMORYSTATUS_REAPER_ENABLED_DEFAULT TRUE
307 #else
308 #define MEMORYSTATUS_REAPER_ENABLED_DEFAULT FALSE
309 #endif /* (XNU_TARGET_OS_IOS && !XNU_TARGET_OS_XR) || XNU_TARGET_OS_WATCH */
310 #define MEMORYSTATUS_REAPER_MIN_AGE_SECS_DEFAULT 300
311 #define MEMORYSTATUS_REAPER_MIN_AGE_APPS_SECS_DEFAULT 300
312 #define MEMORYSTATUS_REAPER_MAX_PRIORITY_DEFAULT JETSAM_PRIORITY_IDLE
313 #define MEMORYSTATUS_REAPER_RESCAN_SECS_DEFAULT 30
314 #define MEMORYSTATUS_REAPER_SENTINAL_VALUE_MEANING_USE_DEFAULT -1
315 
316 #define MEMORYSTATUS_REAPER_REAP_RELAUNCH_MASK_UNKNOWN (P_MEMSTAT_RELAUNCH_HIGH << 1)
317 #define MEMORYSTATUS_REAPER_REAP_RELAUNCH_MASK_DEFAULT (P_MEMSTAT_RELAUNCH_LOW | MEMORYSTATUS_REAPER_REAP_RELAUNCH_MASK_UNKNOWN)
318 
319 /*
320  * For historical reasons, devices with "medium"-sized memory configs have a critical:idle:pressure ratio of
321  * 4:9:15. This ratio is preserved for these devices when a fixed-mb base value has not been provided by EDT/boot-arg;
322  * all other devices use a 1:2:3 ratio.
323  */
324 #define MEMORYSTATUS_IDLE_RATIO_NUM_MEDIUM 9UL
325 #define MEMORYSTATUS_IDLE_RATIO_DENOM_MEDIUM 4UL
326 #define MEMORYSTATUS_PRESSURE_RATIO_NUM_MEDIUM  15UL
327 #define MEMORYSTATUS_PRESSURE_RATIO_DENOM_MEDIUM  4UL
328 #define MEMORYSTATUS_REAPER_RATIO_NUM_MEDIUM 20UL
329 #define MEMORYSTATUS_REAPER_RATIO_DENOM_MEDIUM 4UL
330 #define MEMORYSTATUS_REAPER_MIN_AGE_SECS_DEFAULT_MEDIUM 240
331 #define MEMORYSTATUS_REAPER_MIN_AGE_APPS_SECS_DEFAULT_MEDIUM 240
332 
333 /*
334  * For large-config devices, set the reaper threshold to be 19% of the memory size.
335  */
336 #define MEMORYSTATUS_REAPER_RATIO_NUM_LARGE 19UL
337 #define MEMORYSTATUS_REAPER_RATIO_DENOM_LARGE 4UL
338 
339 static int32_t memorystatus_get_default_task_active_limit(proc_t p);
340 static int32_t memorystatus_get_default_task_inactive_limit(proc_t p);
341 
342 /*
343  * default jetsam snapshot support
344  */
345 memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot;
346 
347 #if CONFIG_FREEZE
348 memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot_freezer;
349 /*
350  * The size of the freezer snapshot is given by memorystatus_jetsam_snapshot_max / JETSAM_SNAPSHOT_FREEZER_MAX_FACTOR
351  * The freezer snapshot can be much smaller than the default snapshot
352  * because it only includes apps that have been killed and dasd consumes it every 30 minutes.
353  * Since the snapshots are always wired we don't want to overallocate too much.
354  */
355 #define JETSAM_SNAPSHOT_FREEZER_MAX_FACTOR 20
356 unsigned int memorystatus_jetsam_snapshot_freezer_max;
357 unsigned int memorystatus_jetsam_snapshot_freezer_size;
358 TUNABLE(bool, memorystatus_jetsam_use_freezer_snapshot, "kern.jetsam_user_freezer_snapshot", true);
359 
360 #define MEMORYSTATUS_FREEZE_THRESHOLD_PERCENTAGE 50UL
361 TUNABLE_DT(uint32_t, memorystatus_freeze_threshold_mb, "/defaults", "kern.memstat_freeze_mb",
362     "memorystatus_freeze_threshold_mb", 0, TUNABLE_DT_NONE);
363 #endif /* CONFIG_FREEZE */
364 
365 unsigned int memorystatus_jetsam_snapshot_count = 0;
366 unsigned int memorystatus_jetsam_snapshot_max = 0;
367 unsigned int memorystatus_jetsam_snapshot_size = 0;
368 uint64_t memorystatus_jetsam_snapshot_last_timestamp = 0;
369 uint64_t memorystatus_jetsam_snapshot_timeout = 0;
370 
371 #define JETSAM_SNAPSHOT_TIMEOUT_SECS 30
372 
373 #if DEVELOPMENT || DEBUG
374 /*
375  * On development and debug kernels, we allow one pid to take ownership
376  * of some memorystatus data structures for testing purposes (via memorystatus_control).
377  * If there's an owner, then only they may consume the jetsam snapshot & set freezer probabilities.
378  * This is used when testing these interface to avoid racing with other
379  * processes on the system that typically use them (namely OSAnalytics & dasd).
380  */
381 static pid_t memorystatus_testing_pid = 0;
382 SYSCTL_INT(_kern, OID_AUTO, memorystatus_testing_pid, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_testing_pid, 0, "");
383 #endif /* DEVELOPMENT || DEBUG */
384 
385 /*
386  * jetsam zprint snapshot data
387  */
388 #if JETSAM_ZPRINT_SNAPSHOT
389 static unsigned int        jzs_trigger_band = JETSAM_PRIORITY_FOREGROUND;
390 static mach_zone_name_t    *jzs_names = NULL;
391 static mach_zone_info_t    *jzs_info = NULL;
392 static int                *jzs_coalesce = NULL;
393 static unsigned int        jzs_zone_cnt = 0;
394 static mach_memory_info_t *jzs_meminfo = NULL;
395 static unsigned int        jzs_meminfo_cnt = 0;
396 static uint64_t            jzs_gencount = (uint64_t) -1ll;
397 
398 #if DEVELOPMENT || DEBUG
399 SYSCTL_UINT(_kern, OID_AUTO, jzs_trigger_band, CTLFLAG_RW | CTLFLAG_LOCKED, &jzs_trigger_band, 0, "Priority band threshold for taking jetsam zprint snapshot");
400 #endif /* DEVELOPMENT || DEBUG */
401 #endif /* JETSAM_ZPRINT_SNAPSHOT */
402 
403 
404 static void memorystatus_init_jetsam_snapshot_header(memorystatus_jetsam_snapshot_t *snapshot);
405 
406 /* General memorystatus stuff */
407 
408 /*
409  * Daemons: The actual idle deferred time for the daemon is based on
410  * the relaunch behavior of the daemon. The relaunch behavior determines
411  * the scaling factor applied to memorystatus_sysprocs_idle_delay_time. See
412  * kJetsamSysProcsIdleDelayTime* ratios defined in kern_memorystatus.c
413  *
414  * Apps: The apps are aged for memorystatus_apps_idle_delay_time factored
415  * by kJetsamAppsIdleDelayTimeRatio.
416  */
417 TUNABLE(uint64_t, memstat_idle_deferral_time_s, "memorystatus_idle_deferral_time_s", 10);
418 TUNABLE(uint64_t, memstat_aging_stuck_time_s, "memorystatus_aging_stuck_time_s", 30);
419 uint64_t memorystatus_sysprocs_idle_delay_time = 0;
420 uint64_t memorystatus_apps_idle_delay_time = 0;
421 uint64_t memorystatus_aging_stuck_delay_time = 0;
422 /* 2GB devices support an entitlement for a higher app memory limit of "almost 2GB". */
423 static int32_t memorystatus_ios13extended_footprint_limit_mb = 1800;
424 
425 #define CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT XNU_TARGET_OS_XR
426 
427 /* Some devices give entitled apps a higher memory limit */
428 TUNABLE_DT_WRITEABLE(int32_t, memorystatus_entitled_max_task_footprint_mb,
429     "/defaults", "kern.entitled_max_task_pmem",
430     "entitled_max_task_pmem", 0, TUNABLE_DT_NONE);
431 TUNABLE_DT_WRITEABLE(int32_t, memorystatus_entitled_dev_max_task_footprint_mb,
432     "/defaults", "kern.entitled_dev_max_task_pmem",
433     "entitled_dev_max_task_pmem", 0, TUNABLE_DT_NONE);
434 #if CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT
435 /* On visionOS, we want a separate high memory limit for bincompat (iOS) apps. */
436 TUNABLE_DT_WRITEABLE(int32_t, memorystatus_entitled_bincompat_max_task_footprint_mb,
437     "/defaults", "kern.entitled_bc_max_task_pmem",
438     "entitled_bincompat_max_task_pmem", 0, TUNABLE_DT_NONE);
439 #endif // CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT
440 #if DEBUG || DEVELOPMENT
441 TUNABLE(bool, memstat_ignore_task_limit_increase, "memstat_no_task_limit_increase", false);
442 #endif /* DEBUG || DEVELOPMENT */
443 
444 #if __arm64__
445 #if DEVELOPMENT || DEBUG
446 SYSCTL_INT(_kern, OID_AUTO, ios13extended_footprint_limit_mb,
447     CTLFLAG_RD | CTLFLAG_LOCKED,
448     &memorystatus_ios13extended_footprint_limit_mb, 0, "");
449 SYSCTL_INT(_kern, OID_AUTO, entitled_max_task_pmem,
450     CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_KERN,
451     &memorystatus_entitled_max_task_footprint_mb, 0, "");
452 SYSCTL_INT(_kern, OID_AUTO, entitled_dev_max_task_pmem,
453     CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_KERN,
454     &memorystatus_entitled_dev_max_task_footprint_mb, 0, "");
455 #if CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT
456 SYSCTL_INT(_kern, OID_AUTO, entitled_bincompat_max_task_pmem,
457     CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_KERN,
458     &memorystatus_entitled_bincompat_max_task_footprint_mb, 0, "");
459 #endif /* CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT */
460 #else /* !(DEVELOPMENT || DEBUG) */
461 SYSCTL_INT(_kern, OID_AUTO, entitled_max_task_pmem,
462     CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_KERN,
463     &memorystatus_entitled_max_task_footprint_mb, 0, "");
464 #if CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT
465 SYSCTL_INT(_kern, OID_AUTO, entitled_bincompat_max_task_pmem,
466     CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_KERN,
467     &memorystatus_entitled_bincompat_max_task_footprint_mb, 0, "");
468 #endif /* CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT */
469 #endif /* DEVELOPMENT || DEBUG */
470 #endif /* __arm64__ */
471 
472 #pragma mark Logging
473 
474 os_log_t memorystatus_log_handle;
475 
476 TUNABLE_WRITEABLE(memorystatus_log_level_t, memorystatus_log_level, "memorystatus_log_level", MEMORYSTATUS_LOG_LEVEL_DEFAULT);
477 
478 #if DEBUG || DEVELOPMENT
479 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_log_level, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_log_level, MEMORYSTATUS_LOG_LEVEL_DEFAULT, "");
480 #endif
481 
482 #pragma mark Locks
483 
484 static LCK_GRP_DECLARE(memorystatus_lock_group, "memorystatus");
485 
486 /* Synchronizes jetsam pressure broadcasts */
487 LCK_MTX_DECLARE(memorystatus_jetsam_broadcast_lock, &memorystatus_lock_group);
488 
489 #if DEVELOPMENT || DEBUG
490 static LCK_MTX_DECLARE(disconnect_page_mappings_mutex, &memorystatus_lock_group);
491 #endif /* DEVELOPMENT || DEBUG */
492 
493 /* Idle guard handling */
494 
495 static void memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2);
496 static void _memstat_invalidate_idle_demotion_locked(proc_t p);
497 static void _memstat_schedule_idle_demotion_locked(proc_t p);
498 static void _memstat_reschedule_idle_demotion_locked(void);
499 int memorystatus_update_priority_for_appnap(proc_t p);
500 vm_pressure_level_t convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);
501 boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t);
502 void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear);
503 void memorystatus_send_low_swap_note(void);
504 boolean_t memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, unsigned int band, int aggr_count,
505     uint32_t *errors, uint64_t *memory_reclaimed);
506 static bool memorystatus_kill_proc(proc_t p, uint32_t cause,
507     os_reason_t jetsam_reason, bool *killed, uint64_t *footprint_of_killed_proc);
508 /* Synchronously kill a process in priority order */
509 static bool memstat_kill_top_process(uint32_t cause, os_reason_t jetsam_reason,
510     int32_t max_priority, memstat_kill_options_t options,
511     int32_t *priority, uint32_t *errors, uint64_t *memory_reclaimed);
512 
513 uint64_t memorystatus_available_memory_internal(proc_t p);
514 void memorystatus_thread_wake(void);
515 static bool _memstat_consider_waking_jetsam_thread(void);
516 #if CONFIG_JETSAM
517 static void memorystatus_thread_pool_default(void);
518 static void memorystatus_thread_pool_max(void);
519 #endif /* CONFIG_JETSAM */
520 
521 unsigned int memorystatus_level = 0;
522 static int memorystatus_list_count = 0;
523 memstat_bucket_t memstat_bucket[MEMSTAT_BUCKET_COUNT];
524 static thread_call_t memorystatus_idle_demotion_call;
525 uint64_t memstat_idle_demotion_deadline = 0;
526 #if CONFIG_FREEZE
527 unsigned int memorystatus_suspended_count = 0;
528 #endif /* CONFIG_FREEZE */
529 
530 #if XNU_TARGET_OS_OSX
531 /*
532  * Effectively disable the system process and application demotion
533  * logic on macOS. This means system processes and apps won't get the
534  * 10 second protection before landing in the IDLE band after moving
535  * out of their active band. Reasons:-
536  * - daemons + extensions + apps on macOS don't behave the way they
537  *   do on iOS and so they are confusing the demotion logic. For example,
538  *   not all apps go from FG to IDLE. Some sit in higher bands instead. This
539  *   is causing multiple asserts to fire internally.
540  * - we use the aging bands to protect processes from jetsam. But on macOS,
541  *   we have a very limited jetsam that is only invoked under extreme conditions
542  *   where we have no more swap / compressor space OR are under critical pressure.
543  */
544 int system_procs_aging_band = 0;
545 int system_procs_aging_band_stuck = 0;
546 int applications_aging_band = 0;
547 #else /* XNU_TARGET_OS_OSX */
548 int system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
549 int system_procs_aging_band_stuck = JETSAM_PRIORITY_AGING_BAND1_STUCK;
550 int applications_aging_band = JETSAM_PRIORITY_AGING_BAND2;
551 #endif /* XNU_TARGET_OS_OSX */
552 
553 /* the jetsam band which will contain P_MEMSTAT_FROZEN processes */
554 #if CONFIG_FREEZE
555 int memorystatus_freeze_jetsam_band = JETSAM_PRIORITY_FREEZER;
556 #else /* CONFIG_FREEZE */
557 int memorystatus_freeze_jetsam_band = 0;
558 #endif /* CONFIG_FREEZE */
559 
560 _Atomic bool memorystatus_zone_map_is_exhausted = false;
561 _Atomic bool memorystatus_compressor_space_shortage = false;
562 _Atomic bool memorystatus_pageout_starved = false;
563 #if CONFIG_PHANTOM_CACHE
564 _Atomic bool memorystatus_phantom_cache_pressure = false;
565 #endif /* CONFIG_PHANTOM_CACHE */
566 
567 bool memorystatus_should_issue_fg_band_notify = true;
568 
569 extern void coalition_mark_swappable(coalition_t coal);
570 extern bool coalition_is_swappable(coalition_t coal);
571 boolean_t memorystatus_allowed_vm_map_fork(task_t, bool *);
572 #if DEVELOPMENT || DEBUG
573 void memorystatus_abort_vm_map_fork(task_t);
574 #endif
575 
576 SYSCTL_NODE(_kern, OID_AUTO, memorystatus,
577     CTLFLAG_RW | CTLFLAG_LOCKED, 0, "memorystatus subsystem");
578 
579 /*
580  * Idle delay timeout factors for daemons based on relaunch behavior. Only used in
581  * kJetsamAgingPolicySysProcsReclaimedFirst aging policy.
582  */
583 #define kJetsamSysProcsIdleDelayTimeLowRatio    (5)
584 #define kJetsamSysProcsIdleDelayTimeMedRatio    (2)
585 #define kJetsamSysProcsIdleDelayTimeHighRatio   (1)
586 
587 /*
588  * For the kJetsamAgingPolicySysProcsReclaimedFirst aging policy, treat apps as well
589  * behaved daemons for aging purposes.
590  */
591 #define kJetsamAppsIdleDelayTimeRatio   (kJetsamSysProcsIdleDelayTimeLowRatio)
592 
593 static uint64_t
memorystatus_sysprocs_idle_time(proc_t p)594 memorystatus_sysprocs_idle_time(proc_t p)
595 {
596 	uint64_t idle_delay_time = 0;
597 	/*
598 	 * For system processes, base the idle delay time on the
599 	 * jetsam relaunch behavior specified by launchd. The idea
600 	 * is to provide extra protection to the daemons which would
601 	 * relaunch immediately after jetsam.
602 	 */
603 	switch (p->p_memstat_relaunch_flags) {
604 	case P_MEMSTAT_RELAUNCH_UNKNOWN:
605 	case P_MEMSTAT_RELAUNCH_LOW:
606 		idle_delay_time = memorystatus_sysprocs_idle_delay_time / kJetsamSysProcsIdleDelayTimeLowRatio;
607 		break;
608 	case P_MEMSTAT_RELAUNCH_MED:
609 		idle_delay_time = memorystatus_sysprocs_idle_delay_time / kJetsamSysProcsIdleDelayTimeMedRatio;
610 		break;
611 	case P_MEMSTAT_RELAUNCH_HIGH:
612 		idle_delay_time = memorystatus_sysprocs_idle_delay_time / kJetsamSysProcsIdleDelayTimeHighRatio;
613 		break;
614 	default:
615 		panic("Unknown relaunch flags on process!");
616 		break;
617 	}
618 	return idle_delay_time;
619 }
620 
621 static uint64_t
memorystatus_apps_idle_time(__unused proc_t p)622 memorystatus_apps_idle_time(__unused proc_t p)
623 {
624 	return memorystatus_apps_idle_delay_time / kJetsamAppsIdleDelayTimeRatio;
625 }
626 
627 static uint64_t
_memstat_sysprocs_aging_stuck_delay_time(__unused proc_t p)628 _memstat_sysprocs_aging_stuck_delay_time(__unused proc_t p)
629 {
630 	return memorystatus_aging_stuck_delay_time;
631 }
632 
633 
634 static int
635 sysctl_jetsam_set_sysprocs_idle_delay_time SYSCTL_HANDLER_ARGS
636 {
637 #pragma unused(oidp, arg1, arg2)
638 
639 	int error = 0, val = 0, old_time_in_secs = 0;
640 	uint64_t old_time_in_ns = 0;
641 
642 	absolutetime_to_nanoseconds(memorystatus_sysprocs_idle_delay_time, &old_time_in_ns);
643 	old_time_in_secs = (int) (old_time_in_ns / NSEC_PER_SEC);
644 
645 	error = sysctl_io_number(req, old_time_in_secs, sizeof(int), &val, NULL);
646 	if (error || !req->newptr) {
647 		return error;
648 	}
649 
650 	if ((val < 0) || (val > INT32_MAX)) {
651 		memorystatus_log_error("jetsam: new idle delay interval has invalid value.\n");
652 		return EINVAL;
653 	}
654 
655 	nanoseconds_to_absolutetime((uint64_t)val * NSEC_PER_SEC, &memorystatus_sysprocs_idle_delay_time);
656 
657 	return 0;
658 }
659 
660 SYSCTL_PROC(_kern_memorystatus, OID_AUTO, sysprocs_idle_delay_time_ns, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
661     0, 0, sysctl_jetsam_set_sysprocs_idle_delay_time, "I", "Aging window for system processes");
662 
663 
664 static int
665 sysctl_jetsam_set_apps_idle_delay_time SYSCTL_HANDLER_ARGS
666 {
667 #pragma unused(oidp, arg1, arg2)
668 
669 	int error = 0, val = 0, old_time_in_secs = 0;
670 	uint64_t old_time_in_ns = 0;
671 
672 	absolutetime_to_nanoseconds(memorystatus_apps_idle_delay_time, &old_time_in_ns);
673 	old_time_in_secs = (int) (old_time_in_ns / NSEC_PER_SEC);
674 
675 	error = sysctl_io_number(req, old_time_in_secs, sizeof(int), &val, NULL);
676 	if (error || !req->newptr) {
677 		return error;
678 	}
679 
680 	if ((val < 0) || (val > INT32_MAX)) {
681 		memorystatus_log_error("jetsam: new idle delay interval has invalid value.\n");
682 		return EINVAL;
683 	}
684 
685 	nanoseconds_to_absolutetime((uint64_t)val * NSEC_PER_SEC, &memorystatus_apps_idle_delay_time);
686 
687 	return 0;
688 }
689 
690 SYSCTL_PROC(_kern_memorystatus, OID_AUTO, apps_idle_delay_time_ns, CTLTYPE_INT | CTLFLAG_RW,
691     0, 0, sysctl_jetsam_set_apps_idle_delay_time, "I", "Aging window for applications");
692 
693 SYSCTL_INT(_kern, OID_AUTO, max_task_pmem, CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_KERN, &max_task_footprint_mb, 0, "");
694 
695 #if __arm64__
696 int legacy_footprint_bonus_mb = 50; /* This value was chosen after looking at the top 30 apps
697                                      * that needed the additional room in their footprint when
698                                      * the 'correct' accounting methods were applied to them.
699                                      */
700 
701 #if DEVELOPMENT || DEBUG
702 SYSCTL_INT(_kern, OID_AUTO, legacy_footprint_bonus_mb, CTLFLAG_RW | CTLFLAG_LOCKED, &legacy_footprint_bonus_mb, 0, "");
703 #endif /* DEVELOPMENT || DEBUG */
/*
 * Raise the inactive and active memory limits to new values.
 *
 * The limits can only move up: each resulting limit is the MAX of the
 * current limit and the requested one.  A current limit of 0 (an internal
 * "no limits" test sentinel) makes this a no-op; a current limit of -1
 * is treated as max_task_footprint_mb before comparison.  Pre-existing
 * fatality of each limit is preserved.
 *
 * Caller must hold the proc_list_lock.
 */
static void
memorystatus_raise_memlimit_locked(proc_t p,
    int new_memlimit_active,
    int new_memlimit_inactive)
{
	int memlimit_mb_active = 0, memlimit_mb_inactive = 0;
	memlimit_options_t memlimit_options = MEMLIMIT_OPTIONS_NONE;

	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);

	if (p->p_memstat_memlimit_active > 0) {
		memlimit_mb_active = p->p_memstat_memlimit_active;
	} else if (p->p_memstat_memlimit_active == -1) {
		/* -1 means "unlimited"; substitute the system default cap. */
		memlimit_mb_active = max_task_footprint_mb;
	} else {
		/*
		 * Nothing to do for '0' which is
		 * a special value only used internally
		 * to test 'no limits'.
		 */
		return;
	}

	if (p->p_memstat_memlimit_inactive > 0) {
		memlimit_mb_inactive = p->p_memstat_memlimit_inactive;
	} else if (p->p_memstat_memlimit_inactive == -1) {
		/* -1 means "unlimited"; substitute the system default cap. */
		memlimit_mb_inactive = max_task_footprint_mb;
	} else {
		/*
		 * Nothing to do for '0' which is
		 * a special value only used internally
		 * to test 'no limits'.
		 */
		return;
	}

	/* Only ever raise limits, never lower them. */
	memlimit_mb_active = MAX(new_memlimit_active, memlimit_mb_active);
	memlimit_mb_inactive = MAX(new_memlimit_inactive, memlimit_mb_inactive);

	/* Maintain pre-existing limit fatality */
	if (_memstat_proc_active_memlimit_is_fatal(p)) {
		memlimit_options |= MEMLIMIT_ACTIVE_FATAL;
	}
	if (_memstat_proc_inactive_memlimit_is_fatal(p)) {
		memlimit_options |= MEMLIMIT_INACTIVE_FATAL;
	}

	memstat_set_memlimits_locked(p, memlimit_mb_active,
	    memlimit_mb_inactive, memlimit_options);
}
760 
/*
 * Add or remove the legacy-footprint entitlement bonus
 * (legacy_footprint_bonus_mb) to/from this process' active and inactive
 * memory limits, then apply the result via
 * memorystatus_raise_memlimit_locked().
 *
 * Stored limits of 0 are a special internal "no limits" test value and
 * cause an early return; stored limits of -1 are expanded to the
 * system-wide default (max_task_footprint_mb) before the bonus is applied.
 *
 * NOTE(review): the footprint_increase == FALSE path hands the reduced
 * values to memorystatus_raise_memlimit_locked(), which only ever raises
 * limits -- confirm whether lowering is intended to be a no-op here.
 */
void
memorystatus_act_on_legacy_footprint_entitlement(proc_t p, boolean_t footprint_increase)
{
	int memlimit_mb_active = 0, memlimit_mb_inactive = 0;

	if (p == NULL) {
		return;
	}

	proc_list_lock();

	if (p->p_memstat_memlimit_active > 0) {
		memlimit_mb_active = p->p_memstat_memlimit_active;
	} else if (p->p_memstat_memlimit_active == -1) {
		/* -1 means the process runs with the system-wide default limit */
		memlimit_mb_active = max_task_footprint_mb;
	} else {
		/*
		 * Nothing to do for '0' which is
		 * a special value only used internally
		 * to test 'no limits'.
		 */
		proc_list_unlock();
		return;
	}

	if (p->p_memstat_memlimit_inactive > 0) {
		memlimit_mb_inactive = p->p_memstat_memlimit_inactive;
	} else if (p->p_memstat_memlimit_inactive == -1) {
		/* -1 means the process runs with the system-wide default limit */
		memlimit_mb_inactive = max_task_footprint_mb;
	} else {
		/*
		 * Nothing to do for '0' which is
		 * a special value only used internally
		 * to test 'no limits'.
		 */
		proc_list_unlock();
		return;
	}

	if (footprint_increase) {
		memlimit_mb_active += legacy_footprint_bonus_mb;
		memlimit_mb_inactive += legacy_footprint_bonus_mb;
	} else {
		memlimit_mb_active -= legacy_footprint_bonus_mb;
		if (memlimit_mb_active == max_task_footprint_mb) {
			memlimit_mb_active = -1; /* reverting back to default system limit */
		}

		memlimit_mb_inactive -= legacy_footprint_bonus_mb;
		if (memlimit_mb_inactive == max_task_footprint_mb) {
			memlimit_mb_inactive = -1; /* reverting back to default system limit */
		}
	}
	memorystatus_raise_memlimit_locked(p, memlimit_mb_active, memlimit_mb_inactive);

	proc_list_unlock();
}
818 
819 void
memorystatus_act_on_ios13extended_footprint_entitlement(proc_t p)820 memorystatus_act_on_ios13extended_footprint_entitlement(proc_t p)
821 {
822 	proc_list_lock();
823 	memorystatus_raise_memlimit_locked(p,
824 	    memorystatus_ios13extended_footprint_limit_mb,
825 	    memorystatus_ios13extended_footprint_limit_mb);
826 	proc_list_unlock();
827 }
828 
829 void
memorystatus_act_on_entitled_task_limit(proc_t p)830 memorystatus_act_on_entitled_task_limit(proc_t p)
831 {
832 	int memlimit;
833 	if (memorystatus_entitled_max_task_footprint_mb == 0) {
834 		// Entitlement is not supported on this device.
835 		return;
836 	}
837 	proc_list_lock();
838 #if CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT
839 	// We want a separate memory limit for bincompat (iPad) apps on visionOS.
840 	switch (proc_platform(p)) {
841 	case PLATFORM_XROS:
842 	case PLATFORM_XROSSIMULATOR:
843 		memlimit = memorystatus_entitled_max_task_footprint_mb;
844 		break;
845 	default:
846 		if (memorystatus_entitled_bincompat_max_task_footprint_mb != 0) {
847 			memlimit = memorystatus_entitled_bincompat_max_task_footprint_mb;
848 		} else {
849 			memlimit = memorystatus_entitled_max_task_footprint_mb;
850 		}
851 		break;
852 	}
853 #else // CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT
854 	memlimit = memorystatus_entitled_max_task_footprint_mb;
855 #endif // !CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT
856 	memorystatus_raise_memlimit_locked(p, memlimit, memlimit);
857 	proc_list_unlock();
858 }
859 
860 void
memorystatus_act_on_entitled_developer_task_limit(proc_t p)861 memorystatus_act_on_entitled_developer_task_limit(proc_t p)
862 {
863 	if (memorystatus_entitled_dev_max_task_footprint_mb == 0) {
864 		// Entitlement not supported on this device
865 		return;
866 	}
867 	memorystatus_log("memorystatus: WARNING %s [%d] is receiving an entitled "
868 	    "debugging memory limit. This is intended only for debugging and "
869 	    "can result in unstable device behavior.",
870 	    proc_best_name(p), proc_getpid(p));
871 	proc_list_lock();
872 	memorystatus_raise_memlimit_locked(p,
873 	    memorystatus_entitled_dev_max_task_footprint_mb,
874 	    memorystatus_entitled_dev_max_task_footprint_mb);
875 	proc_list_unlock();
876 }
877 
878 #endif /* __arm64__ */
879 
880 SYSCTL_INT(_kern, OID_AUTO, memorystatus_level, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_level, 0, "");
881 
882 int
memorystatus_get_level(__unused struct proc * p,struct memorystatus_get_level_args * args,__unused int * ret)883 memorystatus_get_level(__unused struct proc *p, struct memorystatus_get_level_args *args, __unused int *ret)
884 {
885 	user_addr_t     level = 0;
886 
887 	level = args->level;
888 
889 	if (copyout(&memorystatus_level, level, sizeof(memorystatus_level)) != 0) {
890 		return EFAULT;
891 	}
892 
893 	return 0;
894 }
895 
896 static void memorystatus_thread(void *param __unused, wait_result_t wr __unused);
897 
898 /* Memory Limits */
899 
900 static bool memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason);
901 static bool memstat_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason);
902 
903 
904 static int memorystatus_cmd_set_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);
905 
906 #if DEBUG || DEVELOPMENT
907 static int memorystatus_cmd_set_diag_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);
908 static int memorystatus_cmd_get_diag_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);
909 static int memorystatus_set_diag_memlimit_properties_internal(proc_t p, memorystatus_diag_memlimit_properties_t *p_entry);
910 static int memorystatus_get_diag_memlimit_properties_internal(proc_t p, memorystatus_diag_memlimit_properties_t *p_entry);
911 #endif  // DEBUG || DEVELOPMENT
912 static int memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties_t *entry);
913 
914 static int memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);
915 
916 static int memorystatus_cmd_get_memlimit_excess_np(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);
917 
918 static void memorystatus_get_memlimit_properties_internal(proc_t p, memorystatus_memlimit_properties_t *p_entry);
919 
920 int proc_get_memstat_priority(proc_t, boolean_t);
921 
922 static boolean_t memorystatus_idle_snapshot = 0;
923 
924 unsigned int memorystatus_delta = 0;
925 
926 /* Jetsam Loop Detection */
927 boolean_t memorystatus_jld_enabled = FALSE;              /* Enable jetsam loop detection */
928 uint32_t memorystatus_jld_eval_period_msecs = 0;         /* Init pass sets this based on device memory size */
929 int      memorystatus_jld_max_kill_loops = 2;            /* How many times should we try and kill up to the target band */
930 
931 /*
932  * A FG app can request that the aggressive jetsam mechanism display some leniency in the FG band. This 'lenient' mode is described as:
933  * --- if aggressive jetsam kills an app in the FG band and gets back >=AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD memory, it will stop the aggressive march further into and up the jetsam bands.
934  *
935  * RESTRICTIONS:
936  * - Such a request is respected/acknowledged only once while that 'requesting' app is in the FG band i.e. if aggressive jetsam was
937  * needed and the 'lenient' mode was deployed then that's it for this special mode while the app is in the FG band.
938  *
939  * - If the app is still in the FG band and aggressive jetsam is needed again, there will be no stop-and-check the next time around.
940  *
941  * - Also, the transition of the 'requesting' app away from the FG band will void this special behavior.
942  */
943 
944 #define AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD        25
945 boolean_t       memorystatus_aggressive_jetsam_lenient_allowed = FALSE;
946 boolean_t       memorystatus_aggressive_jetsam_lenient = FALSE;
947 
948 #if DEVELOPMENT || DEBUG
949 /*
950  * Jetsam Loop Detection tunables.
951  */
952 
953 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_eval_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_jld_eval_period_msecs, 0, "");
954 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_max_kill_loops, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_jld_max_kill_loops, 0, "");
955 #endif /* DEVELOPMENT || DEBUG */
956 
957 /*
958  * snapshot support for memstats collected at boot.
959  */
960 static memorystatus_jetsam_snapshot_t memorystatus_at_boot_snapshot;
961 
962 static void memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snapshot, uint32_t ods_list_count);
963 static boolean_t memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry, uint64_t gencount);
964 static void memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime);
965 
966 static void memorystatus_clear_errors(void);
967 
968 static void memorystatus_get_task_phys_footprint_page_counts(task_t task,
969     uint64_t *internal_pages, uint64_t *internal_compressed_pages,
970     uint64_t *purgeable_nonvolatile_pages, uint64_t *purgeable_nonvolatile_compressed_pages,
971     uint64_t *alternate_accounting_pages, uint64_t *alternate_accounting_compressed_pages,
972     uint64_t *iokit_mapped_pages, uint64_t *page_table_pages, uint64_t *frozen_to_swap_pages,
973     uint64_t *neural_nofootprint_total_pages);
974 
975 static void memorystatus_get_task_memory_region_count(task_t task, uint64_t *count);
976 
977 static memorystatus_proc_state_t _memstat_build_state(proc_t p);
978 //static boolean_t memorystatus_issue_pressure_kevent(boolean_t pressured);
979 
980 static bool memorystatus_kill_processes_aggressive(uint32_t cause, int aggr_count, int32_t priority_max, int32_t max_kills, uint32_t *errors, uint64_t *memory_reclaimed);
981 static boolean_t memorystatus_kill_hiwat_proc(uint32_t *errors, boolean_t *purged, uint64_t *memory_reclaimed);
982 static bool _memstat_proc_is_reapable(proc_t p);
983 static void _memstat_refresh_oldest_reapable_proc_info(void);
984 static bool _memstat_proc_is_application(proc_t p);
985 
986 #if CONFIG_JETSAM
987 static void _memstat_reaper_check_oldest_reapable_proc_info_timeout(void);
988 static void _memstat_reaper_start_sweep(void);
989 static void _memstat_reaper_end_sweep(void);
990 static void _memstat_reaper_record_kill(uint64_t bytes_freed);
991 #endif /* CONFIG_JETSAM */
992 static const char* _memstat_relaunch_flags_description(uint32_t flags);
993 static const char* _memstat_proc_type_description(proc_t p);
994 
995 
996 /* Priority Band Sorting Routines */
997 static int  memstat_sort_bucket(unsigned int bucket_index, memorystatus_jetsam_sort_order_t sort_order);
998 static void memstat_sort_coals_locked(unsigned int bucket_index, memorystatus_jetsam_sort_order_t sort_order);
999 static void memstat_sort_by_footprint_locked(unsigned int bucket_index);
1000 
1001 #define JETSAM_SORT_IDLE_DEFAULT JETSAM_SORT_FOOTPRINT_NOCOAL
1002 #if XNU_TARGET_OS_IOS && !XNU_TARGET_OS_XR
1003 #define JETSAM_SORT_FG_DEFAULT JETSAM_SORT_LRU
1004 #else /* XNU_TARGET_OS_IOS && !XNU_TARGET_OS_XR */
1005 #define JETSAM_SORT_FG_DEFAULT JETSAM_SORT_FOOTPRINT
1006 #endif /* !(XNU_TARGET_OS_IOS && !XNU_TARGET_OS_XR) */
1007 
1008 TUNABLE_DT(memorystatus_jetsam_sort_order_t, memstat_jetsam_fg_sort_order, "/defaults",
1009     "kern.memstat_fg_sort_order", "memstat_fg_sort_order", JETSAM_SORT_FG_DEFAULT, TUNABLE_DT_NONE);
1010 
1011 /* qsort routines */
1012 typedef int (*cmpfunc_t)(const void *a, const void *b);
1013 extern void qsort(void *a, size_t n, size_t es, cmpfunc_t cmp);
1014 
1015 /* VM pressure */
1016 
1017 #if CONFIG_SECLUDED_MEMORY
1018 extern unsigned int     vm_page_secluded_count;
1019 extern unsigned int     vm_page_secluded_count_over_target;
1020 #endif /* CONFIG_SECLUDED_MEMORY */
1021 
1022 /* Aggressive jetsam pages threshold for sysproc aging policy */
1023 unsigned int memorystatus_sysproc_aging_aggr_pages = 0;
1024 
1025 uint32_t memorystatus_available_pages = UINT32_MAX;
1026 
1027 __options_closed_decl(memorystatus_policy_t, uint8_t, {
1028 	kPolicyDefault        = 0x00,
1029 	kPolicyClearTheDecks  = 0x01,
1030 	kPolicyBallastDrain   = 0x02,
1031 });
1032 
1033 static memorystatus_policy_t memstat_policy_config = kPolicyDefault;
1034 
1035 #define MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX ((uint32_t)(atop_64(max_mem) / 2))
1036 
1037 /*
1038  * Jetsam Page Shortage Thresholds (PSTs):
1039  *  - critical: jetsam above the idle band
1040  *  - idle: jetsam in the idle band
1041  *  - pressure: jetsam soft memory limit violators
1042  *  - reaper: jetsam long-idle processes
1043  *  - ballast: offset applied to non-critical thresholds upon request
1044  *    from userspace
1045  *  - ctd (clear-the-decks): offset applied to non-critical thresholds upon request
1046  *    from userspace
1047  */
1048 uint32_t memstat_critical_threshold = 0;
1049 uint32_t memstat_idle_threshold = 0;
1050 uint32_t memstat_soft_threshold = 0;
1051 uint32_t memstat_reaper_threshold = 0;
1052 uint32_t memstat_ballast_offset = 0;
1053 uint32_t memstat_ctd_offset = 0;
1054 
1055 int32_t  memstat_reaper_min_age_secs = MEMORYSTATUS_REAPER_MIN_AGE_SECS_DEFAULT;
1056 int32_t  memstat_reaper_min_age_apps_secs = MEMORYSTATUS_REAPER_MIN_AGE_APPS_SECS_DEFAULT;
1057 boolean_t memstat_reaper_enabled = MEMORYSTATUS_REAPER_ENABLED_DEFAULT;
1058 uint32_t memstat_reaper_max_priority = MEMORYSTATUS_REAPER_MAX_PRIORITY_DEFAULT;
1059 int32_t  memstat_reaper_rescan_secs = MEMORYSTATUS_REAPER_RESCAN_SECS_DEFAULT;
1060 bool     memstat_reaper_is_currently_sweeping = false;
1061 uint64_t memstat_reaper_can_run_after_ts_matu = 0;
1062 uint64_t memstat_reaper_start_ts_matu = 0;
1063 
1064 uint32_t memstat_reaper_reap_relaunch_mask = MEMORYSTATUS_REAPER_REAP_RELAUNCH_MASK_DEFAULT;
1065 
1066 #define MEMSTAT_OLDEST_REAPABLE_PROC_PRIO_START_UNKNOWN UINT64_MAX
1067 #define MEMSTAT_OLDEST_REAPABLE_PROC_PRIO_START_NONE    (UINT64_MAX-1)
1068 uint64_t memstat_oldest_reapable_proc_prio_start = MEMSTAT_OLDEST_REAPABLE_PROC_PRIO_START_UNKNOWN;
1069 uint64_t memstat_oldest_reapable_proc_info_expiration_ts_matu = 0;
1070 
1071 uint64_t memstat_oldest_reapable_proc_will_be_reapable_at_ts_matu = 0;
1072 
1073 typedef struct memstat_reaper_stats {
1074 	uint32_t sweep_count;
1075 	uint32_t kill_count;
1076 	uint64_t memory_freed_bytes;
1077 } memstat_reaper_stats_t;
1078 
1079 memstat_reaper_stats_t memstat_reaper_current_sweep_stats;
1080 memstat_reaper_stats_t memstat_reaper_cumulative_stats;
1081 uint32_t memstat_reaper_cumulative_memory_freed_mb = 0;
1082 /*
1083  * NB: These MiB thresholds are only read at boot and may become out of sync
1084  * with the PSTs above.
1085  */
1086 TUNABLE_DT(uint32_t, memorystatus_critical_threshold_mb, "/defaults",
1087     "kern.memstat_critical_mb", "memorystatus_critical_threshold_mb", 0, TUNABLE_DT_NONE);
1088 TUNABLE_DT(uint32_t, memorystatus_idle_threshold_mb, "/defaults",
1089     "kern.memstat_idle_mb", "memorystatus_idle_threshold_mb", 0, TUNABLE_DT_NONE);
1090 TUNABLE_DT(uint32_t, memorystatus_pressure_threshold_mb, "/defaults",
1091     "kern.memstat_pressure_mb", "memorystatus_pressure_threshold_mb", 0, TUNABLE_DT_NONE);
1092 TUNABLE_DT(uint32_t, memorystatus_reaper_threshold_mb, "/defaults",
1093     "kern.memstat_reaper_mb", "memorystatus_reaper_threshold_mb", 0, TUNABLE_DT_NONE);
1094 TUNABLE_DT(uint32_t, memstat_ballast_offset_mb, "/defaults",
1095     "kern.memstat_ballast_mb", "memstat_ballast_offset_mb", 0, TUNABLE_DT_NONE);
1096 TUNABLE(uint32_t, memstat_ctd_offset_mb, "memstat_ballast_offset_mb", 0);
1097 
1098 /*
1099  * Kill count tracking
1100  *
1101  * Since idle exit is only applicable to processes in the idle band, track it
1102  * separately to save space. We also don't care about kMemorysatusInvalid.
1103  */
1104 uint32_t _Atomic memorystatus_kill_counts[JETSAM_PRIORITY_MAX + 1][JETSAM_REASON_MEMORYSTATUS_MAX - 1];
1105 uint32_t _Atomic memorystatus_idle_exit_kill_count = 0;
1106 
1107 TUNABLE_DT(int32_t, memorystatus_reaper_minimum_age_seconds, "/defaults",
1108     "kern.memstat_reaper_minage_secs", "memorystatus_reaper_minimum_age_seconds", MEMORYSTATUS_REAPER_SENTINAL_VALUE_MEANING_USE_DEFAULT, TUNABLE_DT_NONE);
1109 TUNABLE_DT(int32_t, memorystatus_reaper_minimum_age_apps_seconds, "/defaults",
1110     "kern.memstat_reaper_minapp_secs", "memorystatus_reaper_minimum_age_apps_seconds", MEMORYSTATUS_REAPER_SENTINAL_VALUE_MEANING_USE_DEFAULT, TUNABLE_DT_NONE);
1111 TUNABLE_DT(uint32_t, memorystatus_reaper_rescan_delay_seconds, "/defaults",
1112     "kern.memstat_reaper_rescan_secs", "memorystatus_reaper_rescan_delay_seconds", MEMORYSTATUS_REAPER_RESCAN_SECS_DEFAULT, TUNABLE_DT_NONE);
1113 TUNABLE_DT(boolean_t, memorystatus_reaper_enabled, "/defaults",
1114     "kern.memstat_reaper_enabled", "memorystatus_reaper_enabled", MEMORYSTATUS_REAPER_ENABLED_DEFAULT, TUNABLE_DT_NONE);
1115 
1116 
1117 #if CONFIG_JETSAM
1118 TUNABLE_DT_WRITEABLE(unsigned int, memorystatus_swap_all_apps, "/defaults", "kern.swap_all_apps", "kern.swap_all_apps", false, TUNABLE_DT_NONE);
1119 /* Will compact the early swapin queue if there are >= this many csegs on it. */
1120 static unsigned int memorystatus_swapin_trigger_segments = 10;
1121 unsigned int memorystatus_swapin_trigger_pages = 0;
1122 
1123 #if DEVELOPMENT || DEBUG
1124 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_available_pages, 0, "");
1125 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_swapin_trigger_pages, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_swapin_trigger_pages, 0, "");
1126 #else
1127 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages, CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, &memorystatus_available_pages, 0, "");
1128 #endif /* DEVELOPMENT || DEBUG */
1129 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_swap_all_apps, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_swap_all_apps, 0, "");
1130 
1131 static int memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __unused int32_t *retval, boolean_t is_fatal_limit);
1132 
1133 proc_name_t memorystatus_jetsam_proc_name_panic; /* Panic when we are about to jetsam this process. */
1134 uint32_t    memorystatus_jetsam_proc_cause_panic = 0; /* If specified, panic only when we are about to jetsam the process above for this cause. */
1135 uint32_t    memorystatus_jetsam_proc_size_panic = 0; /* If specified, panic only when we are about to jetsam the process above and its footprint is more than this in MB. */
1136 
1137 /* If set, kill swappable processes when we're low on swap space. Currently off until we can allocate more swap space (rdar://87800902) */
1138 TUNABLE(bool, jetsam_kill_on_low_swap, "jetsam_kill_on_low_swap", false);
1139 
1140 /*
1141  * Global switch for enabling fast jetsam. Fast jetsam is
1142  * hooked up via the system_override() system call. When
1143  * enabled, the following features can be toggled:
1144  * - clear-the-decks jetsam
1145  * - ballast-drain jetsam
1146  */
1147 TUNABLE_WRITEABLE(bool, fast_jetsam_enabled, "fast_jetsam_enabled", true);
1148 
1149 #else /* !CONFIG_JETSAM */
1150 
1151 /*
1152  * On compressor/swap exhaustion, kill the largest process regardless of
1153  * its chosen process policy.
1154  */
1155 #if DEVELOPMENT || DEBUG
1156 TUNABLE(bool, kill_on_no_paging_space, "-kill_on_no_paging_space", false);
1157 #else /* !(DEVELOPMENT || DEBUG) */
1158 bool kill_on_no_paging_space = false;
1159 #endif /* DEVELOPMENT || DEBUG */
1160 
1161 /* The timestamp (MAS) of the last no paging space action */
1162 _Atomic uint64_t last_no_space_action_ts = 0;
1163 /* The minimum duration between no paging space actions */
1164 TUNABLE(uint64_t, no_paging_space_action_throttle_delay_ns,
1165     "no_paging_space_throttle_delay_ns", 5 * NSEC_PER_SEC);
1166 
1167 #endif /* CONFIG_JETSAM */
1168 
/*
 * Convert a byte count to MiB, rounding any remainder up (despite the
 * "nearest" in the name, this has always been a ceiling; the name is kept
 * for existing callers).
 *
 * Computed as shift-then-adjust rather than add-then-shift so that inputs
 * within 1 MiB of UINT32_MAX no longer overflow and wrap to a tiny result
 * (previously roundToNearestMB(UINT32_MAX) == 0).
 */
static inline uint32_t
roundToNearestMB(uint32_t in)
{
	return (in >> 20) + ((in & ((1U << 20) - 1)) != 0 ? 1 : 0);
}
1174 
1175 #if DEVELOPMENT || DEBUG
1176 static int memorystatus_cmd_increase_jetsam_task_limit(pid_t pid, uint32_t byte_increase);
1177 #endif
1178 
1179 #if __arm64__
1180 extern int legacy_footprint_entitlement_mode;
1181 #endif /* __arm64__ */
1182 
1183 /* Debug */
1184 
1185 extern struct knote *vm_find_knote_from_pid(pid_t, struct klist *);
1186 
1187 #if DEVELOPMENT || DEBUG
1188 
1189 static unsigned int memorystatus_debug_dump_this_bucket = 0;
1190 
1191 static void
memorystatus_debug_dump_bucket_locked(unsigned int bucket_index)1192 memorystatus_debug_dump_bucket_locked(unsigned int bucket_index)
1193 {
1194 	proc_t p = NULL;
1195 	uint64_t bytes = 0;
1196 	int ledger_limit = 0;
1197 	unsigned int b = bucket_index;
1198 	boolean_t traverse_all_buckets = FALSE;
1199 
1200 	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
1201 		traverse_all_buckets = TRUE;
1202 		b = 0;
1203 	} else {
1204 		traverse_all_buckets = FALSE;
1205 		b = bucket_index;
1206 	}
1207 
1208 	/*
1209 	 * footprint reported in [pages / MB ]
1210 	 * limits reported as:
1211 	 *      L-limit  proc's Ledger limit
1212 	 *      C-limit  proc's Cached limit, should match Ledger
1213 	 *      A-limit  proc's Active limit
1214 	 *     IA-limit  proc's Inactive limit
1215 	 *	F==Fatal,  NF==NonFatal
1216 	 */
1217 
1218 	memorystatus_log_debug("memorystatus_debug_dump ***START*(PAGE_SIZE_64=%llu)**\n", PAGE_SIZE_64);
1219 	memorystatus_log_debug("bucket [pid]       [pages / MB]     [state]      [EP / RP / AP]   dirty     deadline [L-limit / C-limit / A-limit / IA-limit] name\n");
1220 	p = memorystatus_get_first_proc_locked(&b, traverse_all_buckets);
1221 	while (p) {
1222 		bytes = get_task_phys_footprint(proc_task(p));
1223 		task_get_phys_footprint_limit(proc_task(p), &ledger_limit);
1224 		memorystatus_log_debug("%2d     [%5d]     [%5lld /%3lldMB]   0x%-8x   [%2d / %2d / %2d]   0x%-3x   %10lld    [%3d / %3d%s / %3d%s / %3d%s]   %s\n",
1225 		    b, proc_getpid(p),
1226 		    (bytes / PAGE_SIZE_64),             /* task's footprint converted from bytes to pages     */
1227 		    (bytes / (1024ULL * 1024ULL)),      /* task's footprint converted from bytes to MB */
1228 		    p->p_memstat_state, p->p_memstat_effectivepriority, p->p_memstat_requestedpriority, p->p_memstat_assertionpriority,
1229 		    p->p_memstat_dirty, p->p_memstat_idledeadline,
1230 		    ledger_limit,
1231 		    p->p_memstat_memlimit,
1232 		    (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"),
1233 		    p->p_memstat_memlimit_active,
1234 		    (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL ? "F " : "NF"),
1235 		    p->p_memstat_memlimit_inactive,
1236 		    (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL ? "F " : "NF"),
1237 		    (*p->p_name ? p->p_name : "unknown"));
1238 		p = memorystatus_get_next_proc_locked(&b, p, traverse_all_buckets);
1239 	}
1240 	memorystatus_log_debug("memorystatus_debug_dump ***END***\n");
1241 }
1242 
1243 static int
1244 sysctl_memorystatus_debug_dump_bucket SYSCTL_HANDLER_ARGS
1245 {
1246 #pragma unused(oidp, arg2)
1247 	int bucket_index = 0;
1248 	int error;
1249 	error = SYSCTL_OUT(req, arg1, sizeof(int));
1250 	if (error || !req->newptr) {
1251 		return error;
1252 	}
1253 	error = SYSCTL_IN(req, &bucket_index, sizeof(int));
1254 	if (error || !req->newptr) {
1255 		return error;
1256 	}
1257 	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
1258 		/*
1259 		 * All jetsam buckets will be dumped.
1260 		 */
1261 	} else {
1262 		/*
1263 		 * Only a single bucket will be dumped.
1264 		 */
1265 	}
1266 
1267 	proc_list_lock();
1268 	memorystatus_debug_dump_bucket_locked(bucket_index);
1269 	proc_list_unlock();
1270 	memorystatus_debug_dump_this_bucket = bucket_index;
1271 	return error;
1272 }
1273 
1274 /*
1275  * Debug aid to look at jetsam buckets and proc jetsam fields.
1276  *	Use this sysctl to act on a particular jetsam bucket.
1277  *	Writing the sysctl triggers the dump.
1278  *      Usage: sysctl kern.memorystatus_debug_dump_this_bucket=<bucket_index>
1279  */
1280 
1281 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_debug_dump_this_bucket, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_debug_dump_this_bucket, 0, sysctl_memorystatus_debug_dump_bucket, "I", "");
1282 
1283 
1284 /* Debug aid to aid determination of limit */
1285 
/*
 * sysctl handler for kern.memorystatus_highwater_enabled.
 *
 * Reading returns the current setting. Writing 0 or 1 toggles high-water
 * (soft limit) enforcement: it walks every process in every jetsam bucket
 * under the proc_list_lock, recomputes (enable) or defaults (disable) the
 * cached memory limit, and pushes the result into the task ledger.
 * Any other written value returns EINVAL.
 */
static int
sysctl_memorystatus_highwater_enable SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	proc_t p;
	unsigned int b = 0;
	int error, enable = 0;
	bool use_active;   /* use the active limit and active limit attributes */

	error = SYSCTL_OUT(req, arg1, sizeof(int));
	if (error || !req->newptr) {
		/* Read-only access (or error): nothing more to do. */
		return error;
	}

	error = SYSCTL_IN(req, &enable, sizeof(int));
	if (error || !req->newptr) {
		return error;
	}

	/* Only 0 (disable) and 1 (enable) are accepted. */
	if (!(enable == 0 || enable == 1)) {
		return EINVAL;
	}

	proc_list_lock();

	memorystatus_highwater_enabled = enable;

	/* Walk every proc in every bucket and re-derive its cached limit. */
	p = memorystatus_get_first_proc_locked(&b, TRUE);
	while (p) {
		use_active = memstat_proc_is_active_locked(p);

		if (enable) {
			(void)memstat_update_memlimit_locked(p, use_active);
		} else {
			/*
			 * Disabling limits does not touch the stored variants.
			 * Set the cached limit fields to system_wide defaults.
			 */
			p->p_memstat_memlimit = -1;
			p->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;
		}

		/*
		 * Enforce the cached limit by writing to the ledger.
		 */
		_memstat_write_memlimit_to_ledger_locked(p, use_active, false);

		p = memorystatus_get_next_proc_locked(&b, p, TRUE);
	}


	proc_list_unlock();

	return 0;
}
1341 
1342 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_highwater_enabled, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_highwater_enabled, 0, sysctl_memorystatus_highwater_enable, "I", "");
1343 
1344 SYSCTL_INT(_kern, OID_AUTO, memorystatus_idle_snapshot, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_idle_snapshot, 0, "");
1345 
1346 #endif /* DEVELOPMENT || DEBUG */
1347 
1348 #if CONFIG_JETSAM
1349 #if DEVELOPMENT || DEBUG
1350 static int
1351 memstat_page_shortage_threshold_sysctl_handler SYSCTL_HANDLER_ARGS
1352 {
1353 	uint32_t threshold;
1354 	if (arg1 == &memstat_idle_threshold) {
1355 		threshold = memorystatus_get_idle_exit_page_shortage_threshold();
1356 	} else if (arg1 == &memstat_soft_threshold) {
1357 		threshold = memorystatus_get_soft_memlimit_page_shortage_threshold();
1358 	} else if (arg1 == &memstat_critical_threshold) {
1359 		threshold = memorystatus_get_critical_page_shortage_threshold();
1360 	} else if (arg1 == &memstat_reaper_threshold) {
1361 		threshold = memorystatus_get_reaper_page_shortage_threshold();
1362 	} else {
1363 		return EINVAL;
1364 	}
1365 	return sysctl_handle_int(oidp, NULL, threshold, req);
1366 }
1367 
1368 SYSCTL_PROC(_kern_memorystatus, OID_AUTO, available_pages_critical,
1369     CTLFLAG_RD | CTLFLAG_LOCKED, &memstat_critical_threshold, 0,
1370     memstat_page_shortage_threshold_sysctl_handler, "IU",
1371     "");
1372 SYSCTL_PROC(_kern_memorystatus, OID_AUTO, available_pages_idle,
1373     CTLFLAG_RD | CTLFLAG_LOCKED, &memstat_idle_threshold, 0,
1374     memstat_page_shortage_threshold_sysctl_handler, "IU",
1375     "");
1376 SYSCTL_PROC(_kern_memorystatus, OID_AUTO, available_pages_soft,
1377     CTLFLAG_RD | CTLFLAG_LOCKED, &memstat_soft_threshold, 0,
1378     memstat_page_shortage_threshold_sysctl_handler, "IU",
1379     "");
1380 SYSCTL_PROC(_kern_memorystatus, OID_AUTO, available_pages_reaper,
1381     CTLFLAG_RD | CTLFLAG_LOCKED, &memstat_reaper_threshold, 0,
1382     memstat_page_shortage_threshold_sysctl_handler, "IU",
1383     "");
1384 
1385 SYSCTL_INT(_kern_memorystatus, OID_AUTO, ballast_offset_pages,
1386     CTLFLAG_RD | CTLFLAG_LOCKED,
1387     &memstat_ballast_offset, 0, "");
1388 SYSCTL_INT(_kern_memorystatus, OID_AUTO, ctd_offset_pages,
1389     CTLFLAG_RD | CTLFLAG_LOCKED,
1390     &memstat_ctd_offset, 0, "");
1391 #endif /* DEBUG || DEVELOPMENT */
1392 
1393 EXPERIMENT_FACTOR_LEGACY_UINT(_kern_memorystatus, reaper_enabled, &memstat_reaper_enabled, FALSE, TRUE, "");
1394 EXPERIMENT_FACTOR_LEGACY_UINT(_kern_memorystatus, reaper_min_age_secs, &memstat_reaper_min_age_secs, 0, UINT32_MAX, "");
1395 EXPERIMENT_FACTOR_LEGACY_UINT(_kern_memorystatus, reaper_min_age_apps_secs, &memstat_reaper_min_age_apps_secs, 0, UINT32_MAX, "");
1396 EXPERIMENT_FACTOR_LEGACY_UINT(_kern_memorystatus, reaper_max_priority, &memstat_reaper_max_priority, 0, JETSAM_PRIORITY_MAX, "");
1397 EXPERIMENT_FACTOR_LEGACY_UINT(_kern_memorystatus, reaper_reap_relaunch_mask, &memstat_reaper_reap_relaunch_mask, 0, UINT32_MAX, "");
1398 EXPERIMENT_FACTOR_LEGACY_UINT(_kern_memorystatus, reaper_rescan_secs, &memstat_reaper_rescan_secs, 0, UINT32_MAX, "");
1399 
1400 SYSCTL_INT(_kern_memorystatus, OID_AUTO, reaper_stats_sweep_count,
1401     CTLFLAG_RD | CTLFLAG_LOCKED,
1402     &(memstat_reaper_cumulative_stats.sweep_count), 0, "");
1403 SYSCTL_INT(_kern_memorystatus, OID_AUTO, reaper_stats_total_kills,
1404     CTLFLAG_RD | CTLFLAG_LOCKED,
1405     &(memstat_reaper_cumulative_stats.kill_count), 0, "");
1406 SYSCTL_INT(_kern_memorystatus, OID_AUTO, reaper_stats_total_freed_mb,
1407     CTLFLAG_RD | CTLFLAG_LOCKED,
1408     &memstat_reaper_cumulative_memory_freed_mb, 0, "");
1409 
/*
 * sysctl/experiment handler that reads or writes one of the jetsam
 * page-shortage thresholds or offsets through arg1, in units of MiB.
 * The backing globals are stored in pages; reads convert pages -> MiB and
 * writes convert MiB -> pages.
 *
 * Writes are validated: the MiB value must not overflow when converted to
 * bytes, the resulting page count may not exceed 1/2 of max_mem, and the
 * four primary thresholds (soft/idle/critical/reaper) must stay non-zero.
 * Unknown arg1 pointers return EINVAL.
 */
static int
memstat_page_shortage_threshold_experiment_handler SYSCTL_HANDLER_ARGS
{
	uint32_t threshold_mb;
	int error;

	assert3p(arg1, !=, NULL);
	/* Report the current value: pages -> bytes (ptoa) -> MiB. */
	threshold_mb = ptoa_32(os_atomic_load((uint32_t *)arg1, relaxed)) >> 20;

	error = sysctl_handle_int(oidp, &threshold_mb, 0, req);
	if (error || !req->newptr) {
		/* Read-only access (or error): nothing more to do. */
		return error;
	}

	if (threshold_mb > UINT32_MAX >> 20) {
		/* Converting to bytes would overflow */
		return EINVAL;
	}

	uint32_t new_threshold_pages = atop_32(threshold_mb << 20);
	/*
	 * Page shortage thresholds may not exceed 1/2 max_mem
	 */
	if (new_threshold_pages > MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX) {
		return EINVAL;
	}
	/*
	 * The primary thresholds must stay non-zero; only the ballast and
	 * clear-the-decks offsets may legitimately be set to 0.
	 */
	if ((arg1 == &memstat_soft_threshold ||
	    arg1 == &memstat_idle_threshold ||
	    arg1 == &memstat_critical_threshold ||
	    arg1 == &memstat_reaper_threshold) &&
	    new_threshold_pages == 0) {
		return EINVAL;
	}

	/* Log which knob changed; an unrecognized pointer is rejected. */
	if (arg1 == &memstat_soft_threshold) {
		memorystatus_log("memorystatus: setting soft memory limit "
		    "page shortage threshold to %u MiB\n", threshold_mb);
	} else if (arg1 == &memstat_idle_threshold) {
		memorystatus_log("memorystatus: setting idle exit page "
		    "shortage threshold to %u MiB\n", threshold_mb);
	} else if (arg1 == &memstat_critical_threshold) {
		memorystatus_log("memorystatus: setting critical page shortage"
		    " threshold to %u MiB\n", threshold_mb);
	} else if (arg1 == &memstat_reaper_threshold) {
		memorystatus_log("memorystatus: setting reaper page shortage"
		    " threshold to %u MiB\n", threshold_mb);
	} else if (arg1 == &memstat_ctd_offset) {
		memorystatus_log("memorystatus: setting clear-the-decks page shortage"
		    " offset to %u MiB\n", threshold_mb);
	} else if (arg1 == &memstat_ballast_offset) {
		memorystatus_log("memorystatus: setting ballast page shortage"
		    " offset to %u MiB\n", threshold_mb);
	} else {
		return EINVAL;
	}
	os_atomic_store((uint32_t *)arg1, new_threshold_pages, relaxed);

	return 0;
}
1469 
#if DEVELOPMENT || DEBUG
#define MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS (CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED)
#else /* RELEASE */
/* CTLFLAG_MASKED on RELEASE builds hides these tunables from sysctl listings. */
#define MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS (CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED)
#endif /* DEVELOPMENT || DEBUG */
1475 
1476 EXPERIMENT_FACTOR_LEGACY_PROC(_kern_memorystatus, soft_threshold_mb,
1477     MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
1478     &memstat_soft_threshold, 0, memstat_page_shortage_threshold_experiment_handler,
1479     "IU",
1480     "The minimum amount of available memory to maintain before killing "
1481     "processes which have violated there soft memory limit");
1482 
/*
 * Remaining page-shortage threshold/offset experiment factors. All are backed
 * by memstat_page_shortage_threshold_experiment_handler: values are exposed
 * in MiB via sysctl but stored internally in pages.
 */
EXPERIMENT_FACTOR_LEGACY_PROC(_kern_memorystatus, idle_threshold_mb,
    MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
    &memstat_idle_threshold, 0, memstat_page_shortage_threshold_experiment_handler,
    "IU",
    "The minimum amount of available memory to maintain before exiting idle "
    "processes");
EXPERIMENT_FACTOR_LEGACY_PROC(_kern_memorystatus, critical_threshold_mb,
    MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
    &memstat_critical_threshold, 0, memstat_page_shortage_threshold_experiment_handler,
    "IU",
    "The minimum amount of available memory to maintain before killing non-idle "
    "processes");
EXPERIMENT_FACTOR_LEGACY_PROC(_kern_memorystatus, reaper_threshold_mb,
    MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
    &memstat_reaper_threshold, 0, memstat_page_shortage_threshold_experiment_handler,
    "IU",
    "The minimum amount of available memory to maintain before killing long-idle "
    "processes");
EXPERIMENT_FACTOR_LEGACY_PROC(_kern_memorystatus, ballast_offset_mb,
    MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
    &memstat_ballast_offset, 0, memstat_page_shortage_threshold_experiment_handler,
    "IU",
    "An offset to apply to all non-critical page shortage thresholds when "
    "ballast is filling");
EXPERIMENT_FACTOR_LEGACY_PROC(_kern_memorystatus, clear_the_decks_offset_mb,
    MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
    &memstat_ctd_offset, 0, memstat_page_shortage_threshold_experiment_handler,
    "IU",
    "An offset to apply to all non-critical page shortage thresholds when "
    "clear-the-decks is engaged");
1513 
1514 int
memorystatus_ballast_control(bool drain)1515 memorystatus_ballast_control(bool drain)
1516 {
1517 	if (!fast_jetsam_enabled) {
1518 		memorystatus_log_error("memorystatus: fast-jetsam "
1519 		    "has been disabled on this system. denying request to %s ballast\n",
1520 		    drain ? "drain" : "flood");
1521 		return ENOTSUP;
1522 	}
1523 	if (memstat_ballast_offset == 0) {
1524 		/* nothing to do */
1525 		return 0;
1526 	}
1527 	if (drain) {
1528 		/*
1529 		 * Drain the ballast tanks, providing additional buoyancy by requiring that
1530 		 * they only be used to store "available" memory.
1531 		 */
1532 		memorystatus_policy_t orig_policy = os_atomic_or_orig(
1533 			&memstat_policy_config,
1534 			(memorystatus_policy_t)kPolicyBallastDrain, relaxed);
1535 		if (orig_policy & kPolicyBallastDrain) {
1536 			return 0;
1537 		}
1538 		memorystatus_log("memorystatus: draining ballast "
1539 		    "-- will add %u MiB to non-critical page shortage "
1540 		    "thresholds\n", ptoa_32(memstat_ballast_offset) >> 20);
1541 		memorystatus_thread_pool_max();
1542 		_memstat_consider_waking_jetsam_thread();
1543 	} else {
1544 		/*
1545 		 * Flood the ballast tanks, removing the extra buoyancy by allowing them to be
1546 		 * filled with "unavailable" memory.
1547 		 */
1548 		memorystatus_policy_t orig_policy = os_atomic_andnot_orig(
1549 			&memstat_policy_config,
1550 			(memorystatus_policy_t)kPolicyBallastDrain, relaxed);
1551 		if (!(orig_policy & kPolicyBallastDrain)) {
1552 			/* already disabled */
1553 			return 0;
1554 		}
1555 		assertf(fast_jetsam_enabled, "ballast was drained while fast-jetsam was disabled");
1556 		memorystatus_log("memorystatus: flooding ballast "
1557 		    "-- will subtract %u MiB from non-critical page shortage "
1558 		    "thresholds\n", ptoa_32(memstat_ballast_offset) >> 20);
1559 		memorystatus_thread_pool_default();
1560 		_memstat_consider_waking_jetsam_thread();
1561 	}
1562 	return 0;
1563 }
1564 
1565 static int
1566 sysctl_kern_memorystatus_ballast_drain SYSCTL_HANDLER_ARGS
1567 {
1568 	int error = 0;
1569 
1570 	boolean_t drained = os_atomic_load(&memstat_policy_config, relaxed) & kPolicyBallastDrain ? TRUE : FALSE;
1571 
1572 	error = sysctl_handle_int(oidp, &drained, 0, req);
1573 	if (error || !req->newptr) {
1574 		return error;
1575 	}
1576 
1577 	/*
1578 	 * Writers must be root or have the com.apple.private.kernel.jetsam entitlement
1579 	 */
1580 	error = priv_check_cred(kauth_cred_get(), PRIV_VM_JETSAM, 0);
1581 	if (error) {
1582 		return error;
1583 	}
1584 
1585 	return memorystatus_ballast_control(drained);
1586 }
1587 
/* Writable policy gate; see sysctl_kern_memorystatus_ballast_drain above. */
SYSCTL_PROC(_kern_memorystatus, OID_AUTO, ballast_drained,
    MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS, 0, 0,
    sysctl_kern_memorystatus_ballast_drain, "IU",
    "If true, apply an offset (kern.memorystatus.ballast_offset_mb) to "
    "all non-critical page shortage thresholds");
1593 
1594 #if DEVELOPMENT || DEBUG
1595 /*
1596  * In preparation for a storm, sailors may "clear the decks" of non-essential
1597  * cargo to increase the seaworthiness of a vessel. In our analogy, the
1598  * non-essential cargo is idle processes or processes which have exceeded
1599  * their memory limit. The storm may be any foreseeable user activity that will
1600  * require significant memory demand.
1601  *
1602  * Mechanically, clearing the decks involves adding a configurable offset to
1603  * the idle and soft available page shortage thresholds.
1604  *
 * Readers may note that the clear-the-decks policy is mechanically identical
 * to the ballast-draining policy. Their difference lies in intended use.
1607  * Clear-the-decks is intended to address imminent memory demand and may be
1608  * configured with an offset that wouldn't be sustainable for long-term system
1609  * use. The interface is generally intended to allow clients to hint to the
1610  * system that they will need a significant amount of memory in the near future,
 * and the system should proactively try to free unneeded reserves to better
 * satisfy the demand.
1613  *
1614  * This policy is currently only exposed on development kernels for prototyping
 * until a productized use case emerges.
1616  *
1617  * TODO: If adopted on production systems, this mechanism should use a
1618  * dedicated system-call / memorystatus-command
1619  */
1620 static int
memstat_clear_the_decks(bool clear)1621 memstat_clear_the_decks(bool clear)
1622 {
1623 	if (!fast_jetsam_enabled) {
1624 		memorystatus_log_error("memorystatus: fast-jetsam "
1625 		    "has been disabled on this system\n");
1626 		return ENOTSUP;
1627 	}
1628 	if (clear) {
1629 		/*
1630 		 * Clear the decks of non-essential cargo.
1631 		 */
1632 		memorystatus_policy_t orig_policy = os_atomic_or_orig(
1633 			&memstat_policy_config,
1634 			(memorystatus_policy_t)kPolicyClearTheDecks, relaxed);
1635 		if (orig_policy & kPolicyClearTheDecks) {
1636 			return EALREADY;
1637 		}
1638 		memorystatus_log("memorystatus: clear-the-decks engaged "
1639 		    "-- will add %u MiB to non-critical page shortage "
1640 		    "thresholds\n", ptoa_32(memstat_ctd_offset) >> 20);
1641 		memorystatus_thread_pool_max();
1642 		_memstat_consider_waking_jetsam_thread();
1643 	} else {
1644 		/*
1645 		 * Allow the decks to be reloaded with non-essential cargo.
1646 		 */
1647 		memorystatus_policy_t orig_policy = os_atomic_andnot_orig(
1648 			&memstat_policy_config,
1649 			(memorystatus_policy_t)kPolicyClearTheDecks, relaxed);
1650 		if (!(orig_policy & kPolicyClearTheDecks)) {
1651 			return EALREADY;
1652 		}
1653 		assertf(fast_jetsam_enabled, "clear the decks was set while fast-jetsam was disabled");
1654 		memorystatus_log("memorystatus: clear-the-decks disengaged "
1655 		    "-- will subtract %u MiB from non-critical page shortage "
1656 		    "thresholds\n", ptoa_32(memstat_ctd_offset) >> 20);
1657 		memorystatus_thread_pool_default();
1658 		_memstat_consider_waking_jetsam_thread();
1659 	}
1660 	return 0;
1661 }
1662 
1663 static int
1664 sysctl_kern_memorystatus_decks_cleared SYSCTL_HANDLER_ARGS
1665 {
1666 	int error = 0;
1667 
1668 	boolean_t cleared = os_atomic_load(&memstat_policy_config, relaxed) & kPolicyClearTheDecks ? TRUE : FALSE;
1669 
1670 	error = sysctl_handle_int(oidp, &cleared, 0, req);
1671 	if (error || !req->newptr) {
1672 		return error;
1673 	}
1674 
1675 	/*
1676 	 * Writers must be root or have the com.apple.private.kernel.jetsam entitlement
1677 	 */
1678 	error = priv_check_cred(kauth_cred_get(), PRIV_VM_JETSAM, 0);
1679 	if (error) {
1680 		return error;
1681 	}
1682 
1683 	return memstat_clear_the_decks(cleared);
1684 }
1685 
1686 SYSCTL_PROC(_kern_memorystatus, OID_AUTO, decks_cleared,
1687     MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
1688     0, 0, sysctl_kern_memorystatus_decks_cleared, "I",
1689     "If true, apply an offset (kern.memorystatus_ctd_offset_mb) to "
1690     "all non-critical page shortage thresholds");
1691 #endif /* DEVELOPMENT || DEBUG */
1692 #endif /* CONFIG_JETSAM */
1693 
1694 extern kern_return_t kernel_thread_start_priority(thread_continue_t continuation,
1695     void *parameter,
1696     integer_t priority,
1697     thread_t *new_thread);
1698 
1699 #if DEVELOPMENT || DEBUG
1700 
1701 static int
1702 sysctl_memorystatus_disconnect_page_mappings SYSCTL_HANDLER_ARGS
1703 {
1704 #pragma unused(arg1, arg2)
1705 	int     error = 0, pid = 0;
1706 	proc_t  p;
1707 
1708 	error = sysctl_handle_int(oidp, &pid, 0, req);
1709 	if (error || !req->newptr) {
1710 		return error;
1711 	}
1712 
1713 	lck_mtx_lock(&disconnect_page_mappings_mutex);
1714 
1715 	if (pid == -1) {
1716 		vm_pageout_disconnect_all_pages();
1717 	} else {
1718 		p = proc_find(pid);
1719 
1720 		if (p != NULL) {
1721 			error = task_disconnect_page_mappings(proc_task(p));
1722 
1723 			proc_rele(p);
1724 
1725 			if (error) {
1726 				error = EIO;
1727 			}
1728 		} else {
1729 			error = EINVAL;
1730 		}
1731 	}
1732 	lck_mtx_unlock(&disconnect_page_mappings_mutex);
1733 
1734 	return error;
1735 }
1736 
/* Write-only, masked: accepts a pid (or -1 for all); see handler above. */
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_disconnect_page_mappings, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_memorystatus_disconnect_page_mappings, "I", "");
1739 
1740 #endif /* DEVELOPMENT || DEBUG */
1741 
1742 /*
1743  * Sorts the given bucket.
1744  *
1745  * Input:
1746  *	bucket_index - jetsam priority band to be sorted.
1747  *	sort_order - JETSAM_SORT_xxx from kern_memorystatus.h
1748  *
1749  * proc_list_lock must be held by the caller.
1750  */
1751 static void
memstat_sort_bucket_locked(unsigned int bucket_index,memorystatus_jetsam_sort_order_t sort_order)1752 memstat_sort_bucket_locked(
1753 	unsigned int bucket_index,
1754 	memorystatus_jetsam_sort_order_t sort_order)
1755 {
1756 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
1757 	assert(bucket_index < MEMSTAT_BUCKET_COUNT);
1758 	if (memstat_bucket[bucket_index].count == 0) {
1759 		return;
1760 	}
1761 
1762 	switch (sort_order) {
1763 	case JETSAM_SORT_NONE:
1764 		break;
1765 	case JETSAM_SORT_LRU:
1766 	case JETSAM_SORT_FOOTPRINT:
1767 		memstat_sort_coals_locked(bucket_index, sort_order);
1768 		break;
1769 	case JETSAM_SORT_FOOTPRINT_NOCOAL:
1770 		memstat_sort_by_footprint_locked(bucket_index);
1771 	}
1772 }
1773 
1774 /*
1775  * Picks the sorting routine for a given jetsam priority band.
1776  *
1777  * Input:
1778  *	bucket_index - jetsam priority band to be sorted.
1779  *	sort_order - sort order to use
1780  *
1781  * Return:
1782  *	0     on success
1783  *      non-0 on failure
1784  */
1785 static int
memstat_sort_bucket(unsigned int bucket_index,memorystatus_jetsam_sort_order_t sort_order)1786 memstat_sort_bucket(
1787 	unsigned int bucket_index,
1788 	memorystatus_jetsam_sort_order_t sort_order)
1789 {
1790 	assert(bucket_index < MEMSTAT_BUCKET_COUNT);
1791 
1792 	proc_list_lock();
1793 	memstat_sort_bucket_locked(bucket_index, sort_order);
1794 	proc_list_unlock();
1795 
1796 	return 0;
1797 }
1798 
1799 /*
1800  * Sort processes by size for a single jetsam bucket.
1801  */
1802 
/*
 * Selection sort over the bucket's TAILQ: each outer pass scans the remaining
 * unsorted tail for the process with the largest resident page count and moves
 * it to the end of the sorted prefix, producing a descending-by-footprint
 * order. O(n^2), acceptable because bands are expected to be small.
 * Caller must hold proc_list_lock.
 */
static void
memstat_sort_by_footprint_locked(unsigned int bucket_index)
{
	proc_t p = NULL, insert_after_proc = NULL, max_proc = NULL;
	proc_t next_p = NULL, prev_max_proc = NULL;
	uint32_t pages = 0, max_pages = 0;
	memstat_bucket_t *current_bucket;

	/* Defensive bounds check (callers normally pass a valid band index). */
	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
		return;
	}

	current_bucket = &memstat_bucket[bucket_index];

	/* p marks the head of the unsorted region for each pass. */
	p = TAILQ_FIRST(&current_bucket->list);

	while (p) {
		/* Seed the pass with the current head's footprint. */
		memorystatus_get_task_page_counts(proc_task(p), &pages, NULL, NULL);
		max_pages = pages;
		max_proc = p;
		prev_max_proc = p;

		while ((next_p = TAILQ_NEXT(p, p_memstat_list)) != NULL) {
			/* traversing list until we find next largest process */
			p = next_p;
			memorystatus_get_task_page_counts(proc_task(p), &pages, NULL, NULL);
			if (pages > max_pages) {
				max_pages = pages;
				max_proc = p;
			}
		}

		if (prev_max_proc != max_proc) {
			/* found a larger process, place it in the list */
			TAILQ_REMOVE(&current_bucket->list, max_proc, p_memstat_list);
			if (insert_after_proc == NULL) {
				/* First pass: the overall maximum becomes the new head. */
				TAILQ_INSERT_HEAD(&current_bucket->list, max_proc, p_memstat_list);
			} else {
				TAILQ_INSERT_AFTER(&current_bucket->list, insert_after_proc, max_proc, p_memstat_list);
			}
			prev_max_proc = max_proc;
		}

		/* The sorted prefix now ends at max_proc; next pass scans after it. */
		insert_after_proc = max_proc;

		p = TAILQ_NEXT(max_proc, p_memstat_list);
	}
}
1851 
1852 proc_t
memorystatus_get_first_proc_locked(unsigned int * bucket_index,boolean_t search)1853 memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search)
1854 {
1855 	memstat_bucket_t *current_bucket;
1856 	proc_t next_p;
1857 
1858 	if ((*bucket_index) >= MEMSTAT_BUCKET_COUNT) {
1859 		return NULL;
1860 	}
1861 
1862 	current_bucket = &memstat_bucket[*bucket_index];
1863 	next_p = TAILQ_FIRST(&current_bucket->list);
1864 	if (!next_p && search) {
1865 		while (!next_p && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
1866 			current_bucket = &memstat_bucket[*bucket_index];
1867 			next_p = TAILQ_FIRST(&current_bucket->list);
1868 		}
1869 	}
1870 
1871 	return next_p;
1872 }
1873 
1874 proc_t
memorystatus_get_next_proc_locked(unsigned int * bucket_index,proc_t p,boolean_t search)1875 memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search)
1876 {
1877 	memstat_bucket_t *current_bucket;
1878 	proc_t next_p;
1879 
1880 	if (!p || ((*bucket_index) >= MEMSTAT_BUCKET_COUNT)) {
1881 		return NULL;
1882 	}
1883 
1884 	next_p = TAILQ_NEXT(p, p_memstat_list);
1885 	while (!next_p && search && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
1886 		current_bucket = &memstat_bucket[*bucket_index];
1887 		next_p = TAILQ_FIRST(&current_bucket->list);
1888 	}
1889 
1890 	return next_p;
1891 }
1892 
1893 jetsam_state_t jetsam_threads;
1894 
1895 /* Maximum number of jetsam threads allowed */
1896 #define JETSAM_THREADS_LIMIT   3
1897 
1898 /* Number of active jetsam threads */
1899 _Atomic unsigned int active_jetsam_threads = 1;
1900 /* Number of maximum jetsam threads configured */
1901 unsigned int max_jetsam_threads = 1;
1902 
1903 static jetsam_state_t
jetsam_current_thread()1904 jetsam_current_thread()
1905 {
1906 	for (int thr_id = 0; thr_id < max_jetsam_threads; thr_id++) {
1907 		if (jetsam_threads[thr_id].thread == current_thread()) {
1908 			return &(jetsam_threads[thr_id]);
1909 		}
1910 	}
1911 	return NULL;
1912 }
1913 
1914 #if CONFIG_JETSAM
/*
 * Finalize the entitled max-task footprint limits from boot-args, device-tree
 * defaults, and the swap configuration. Order matters: the swap-based bump of
 * the entitled limit happens first, then invalid values are clamped, then the
 * developer limit is reconciled against the (possibly adjusted) entitled limit.
 */
static void
initialize_entitled_max_task_limit()
{
#if !XNU_TARGET_OS_XR
	/**
	 * We've already stored the potential boot-arg "entitled_max_task_pmem" in
	 * memorystatus_entitled_max_task_footprint_mb as a TUNABLE_DT.  We provide
	 * argptr=NULL and max_len=0 here to check only for existence of the boot-arg.
	 *
	 * The boot-arg takes precedence over memorystatus_swap_all_apps.
	 */
	if (!PE_parse_boot_argn("entitled_max_task_pmem", NULL, 0) && memorystatus_swap_all_apps) {
		/*
		 * When we have swap, we let entitled apps go up to the dram config
		 * regardless of what's set in EDT,
		 * This can still be overriden with the entitled_max_task_pmem boot-arg.
		 *
		 * We do not want to do this on visionOS, since we can have an effectively
		 * infinite number of apps open at a time, and cannot swap our way to safety.
		 */
		memorystatus_entitled_max_task_footprint_mb =
		    (int32_t)(max_mem_actual / (1ULL << 20));
		memorystatus_entitled_dev_max_task_footprint_mb =
		    memorystatus_entitled_max_task_footprint_mb;
	}
#endif

	/* Negative values are invalid; clamp to 0. */
	if (memorystatus_entitled_max_task_footprint_mb < 0) {
		memorystatus_log_error("Invalid value (%d) for entitled_max_task_pmem. "
		    "Setting to 0\n", memorystatus_entitled_max_task_footprint_mb);
		memorystatus_entitled_max_task_footprint_mb = 0;
	}

#if CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT
	/* Same clamping for the separate bincompat entitled limit. */
	if (memorystatus_entitled_bincompat_max_task_footprint_mb < 0) {
		memorystatus_log_error("Invalid value (%d) for entitled_bincompat_max_task_pmem. "
		    "Setting to 0\n", memorystatus_entitled_bincompat_max_task_footprint_mb);
		memorystatus_entitled_bincompat_max_task_footprint_mb = 0;
	}
#endif /* CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT */

	/* Developer limit: < -1 is invalid; exactly -1 selects full DRAM size. */
	if (memorystatus_entitled_dev_max_task_footprint_mb < -1) {
		memorystatus_log_error("Invalid value (%d) for entitled_max_developer_task_pmem. "
		    "Setting to 0\n", memorystatus_entitled_dev_max_task_footprint_mb);
		memorystatus_entitled_dev_max_task_footprint_mb = 0;
	} else if (memorystatus_entitled_dev_max_task_footprint_mb == -1) {
		memorystatus_entitled_dev_max_task_footprint_mb = (int32_t)
		    (max_mem_actual >> 20);
	}

	/* A non-zero developer limit may not be below the entitled limit. */
	if (memorystatus_entitled_dev_max_task_footprint_mb &&
	    memorystatus_entitled_dev_max_task_footprint_mb <
	    memorystatus_entitled_max_task_footprint_mb) {
		memorystatus_log_error("memorystatus: Entitled developer limit (%d MB) "
		    "must be ≥ entitled task limit (%d MB)\n",
		    memorystatus_entitled_dev_max_task_footprint_mb,
		    memorystatus_entitled_max_task_footprint_mb);
		memorystatus_entitled_dev_max_task_footprint_mb =
		    memorystatus_entitled_max_task_footprint_mb;
	}
}
1976 
1977 #endif /* CONFIG_JETSAM */
1978 
1979 
1980 __private_extern__ void
memorystatus_init(void)1981 memorystatus_init(void)
1982 {
1983 	kern_return_t result;
1984 	int i;
1985 
1986 #if CONFIG_FREEZE
1987 	memorystatus_frozen_processes_max = FREEZE_PROCESSES_MAX_DEFAULT;
1988 	memorystatus_frozen_shared_mb_max = ((MAX_FROZEN_SHARED_MB_PERCENT * max_task_footprint_mb) / 100); /* 10% of the system wide task limit */
1989 	memorystatus_freeze_shared_mb_per_process_max = (memorystatus_frozen_shared_mb_max / 4);
1990 	memorystatus_freeze_pages_min = FREEZE_PAGES_MIN_DEFAULT;
1991 	memorystatus_freeze_pages_max = FREEZE_PAGES_MAX_DEFAULT;
1992 	memorystatus_max_frozen_demotions_daily = MAX_FROZEN_PROCESS_DEMOTIONS_DEFAULT;
1993 	memorystatus_thaw_count_demotion_threshold = MIN_THAW_DEMOTION_THRESHOLD_DEFAULT;
1994 	memorystatus_min_thaw_refreeze_threshold = MIN_THAW_REFREEZE_THRESHOLD_DEFAULT;
1995 #endif /* CONFIG_FREEZE */
1996 
1997 	// Note: no-op pending rdar://27006343 (Custom kernel log handles)
1998 	memorystatus_log_handle = os_log_create("com.apple.xnu", "memorystatus");
1999 
2000 	/* Init buckets */
2001 	for (i = 0; i < MEMSTAT_BUCKET_COUNT; i++) {
2002 		TAILQ_INIT(&memstat_bucket[i].list);
2003 		memstat_bucket[i].count = 0;
2004 		memstat_bucket[i].relaunch_high_count = 0;
2005 	}
2006 	memorystatus_idle_demotion_call = thread_call_allocate((thread_call_func_t)memorystatus_perform_idle_demotion, NULL);
2007 
2008 	nanoseconds_to_absolutetime(memstat_idle_deferral_time_s * NSEC_PER_SEC, &memorystatus_sysprocs_idle_delay_time);
2009 	nanoseconds_to_absolutetime(memstat_idle_deferral_time_s * NSEC_PER_SEC, &memorystatus_apps_idle_delay_time);
2010 	nanoseconds_to_absolutetime(memstat_aging_stuck_time_s * NSEC_PER_SEC, &memorystatus_aging_stuck_delay_time);
2011 	assert3u(memstat_idle_deferral_time_s, >=, kJetsamSysProcsIdleDelayTimeLowRatio);
2012 
2013 #if CONFIG_JETSAM
2014 	bzero(memorystatus_jetsam_proc_name_panic, sizeof(memorystatus_jetsam_proc_name_panic));
2015 	if (PE_parse_boot_argn("jetsam_proc_name_panic", &memorystatus_jetsam_proc_name_panic, sizeof(memorystatus_jetsam_proc_name_panic))) {
2016 		/*
2017 		 * No bounds check to see if this is a valid cause.
2018 		 * This is a debugging aid. The callers should know precisely which cause they wish to track.
2019 		 */
2020 		PE_parse_boot_argn("jetsam_proc_cause_panic", &memorystatus_jetsam_proc_cause_panic, sizeof(memorystatus_jetsam_proc_cause_panic));
2021 		PE_parse_boot_argn("jetsam_proc_size_panic", &memorystatus_jetsam_proc_size_panic, sizeof(memorystatus_jetsam_proc_size_panic));
2022 	}
2023 
2024 	if (memorystatus_swap_all_apps && vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
2025 		panic("kern.swap_all_apps is not supported on this platform");
2026 	}
2027 
2028 	/*
2029 	 * The aging bands cannot overlap with the JETSAM_PRIORITY_ELEVATED_INACTIVE
2030 	 * band and must be below it in priority. This is so that we don't have to make
2031 	 * our 'aging' code worry about a mix of processes, some of which need to age
2032 	 * and some others that need to stay elevated in the jetsam bands.
2033 	 */
2034 	assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > system_procs_aging_band);
2035 	assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > system_procs_aging_band_stuck);
2036 	assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > applications_aging_band);
2037 
2038 	/* Take snapshots for idle-exit kills by default? First check the boot-arg... */
2039 	if (!PE_parse_boot_argn("jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot))) {
2040 		/* ...no boot-arg, so check the device tree */
2041 		PE_get_default("kern.jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot));
2042 	}
2043 
2044 	memorystatus_sysproc_aging_aggr_pages = (unsigned int)MEMSTAT_PERCENT_TOTAL_PAGES(MEMORYSTATUS_AGGR_SYSPROC_AGING_PERCENTAGE);
2045 
2046 	if (max_mem <= MEMORYSTATUS_SMALL_MEMORY_THRESHOLD) {
2047 		memorystatus_delta = (unsigned int)MEMSTAT_PERCENT_TOTAL_PAGES(MEMORYSTATUS_DELTA_PERCENTAGE_SMALL);
2048 	} else {
2049 		memorystatus_delta = (unsigned int)MEMSTAT_PERCENT_TOTAL_PAGES(MEMORYSTATUS_DELTA_PERCENTAGE_LARGE);
2050 	}
2051 
2052 	if (memorystatus_critical_threshold_mb != 0) {
2053 		memstat_critical_threshold = atop_32(memorystatus_critical_threshold_mb << 20);
2054 	} else {
2055 		if (max_mem <= MEMORYSTATUS_SMALL_MEMORY_THRESHOLD) {
2056 			memstat_critical_threshold = MEMSTAT_PERCENT_TOTAL_PAGES(MEMORYSTATUS_CRITICAL_PERCENTAGE_SMALL);
2057 		} else {
2058 			memstat_critical_threshold = MEMSTAT_PERCENT_TOTAL_PAGES(MEMORYSTATUS_CRITICAL_PERCENTAGE_LARGE);
2059 		}
2060 	}
2061 	assert3u(memstat_critical_threshold, <, MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX);
2062 
2063 	if (memorystatus_idle_threshold_mb != 0) {
2064 		memstat_idle_threshold = atop_32(memorystatus_idle_threshold_mb << 20);
2065 	} else {
2066 		/*
2067 		 * For historical reasons, devices with "medium"-sized memory configs have a different critical:idle:pressure ratio
2068 		 */
2069 		if ((max_mem > MEMORYSTATUS_SMALL_MEMORY_THRESHOLD) &&
2070 		    (max_mem <= MEMORYSTATUS_MEDIUM_MEMORY_THRESHOLD)) {
2071 			memstat_idle_threshold = (MEMORYSTATUS_IDLE_RATIO_NUM_MEDIUM * memstat_critical_threshold) /
2072 			    MEMORYSTATUS_IDLE_RATIO_DENOM_MEDIUM;
2073 		} else {
2074 			memstat_idle_threshold = (MEMORYSTATUS_IDLE_RATIO_NUM * memstat_critical_threshold) /
2075 			    MEMORYSTATUS_IDLE_RATIO_DENOM;
2076 		}
2077 	}
2078 	assert3u(memstat_idle_threshold, <, MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX);
2079 
2080 	if (memorystatus_pressure_threshold_mb != 0) {
2081 		memstat_soft_threshold = atop_32(memorystatus_pressure_threshold_mb << 20);
2082 	} else {
2083 		if ((max_mem > MEMORYSTATUS_SMALL_MEMORY_THRESHOLD) &&
2084 		    (max_mem <= MEMORYSTATUS_MEDIUM_MEMORY_THRESHOLD)) {
2085 			memstat_soft_threshold = (MEMORYSTATUS_PRESSURE_RATIO_NUM_MEDIUM * memstat_critical_threshold) /
2086 			    MEMORYSTATUS_PRESSURE_RATIO_DENOM_MEDIUM;
2087 		} else {
2088 			memstat_soft_threshold = (MEMORYSTATUS_PRESSURE_RATIO_NUM * memstat_critical_threshold) /
2089 			    MEMORYSTATUS_PRESSURE_RATIO_DENOM;
2090 		}
2091 	}
2092 	assert3u(memstat_soft_threshold, <, MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX);
2093 
2094 	memstat_reaper_max_priority = MEMORYSTATUS_REAPER_MAX_PRIORITY_DEFAULT;
2095 
2096 	if (memorystatus_reaper_threshold_mb != 0) {
2097 		memstat_reaper_threshold = atop_32(memorystatus_reaper_threshold_mb << 20);
2098 	} else {
2099 		if ((max_mem > MEMORYSTATUS_SMALL_MEMORY_THRESHOLD) &&
2100 		    (max_mem <= MEMORYSTATUS_MEDIUM_MEMORY_THRESHOLD)) {
2101 			memstat_reaper_threshold = (MEMORYSTATUS_REAPER_RATIO_NUM_MEDIUM * memstat_critical_threshold) /
2102 			    MEMORYSTATUS_REAPER_RATIO_DENOM_MEDIUM;
2103 		} else if (max_mem > MEMORYSTATUS_MEDIUM_MEMORY_THRESHOLD) {
2104 			memstat_reaper_threshold = (MEMORYSTATUS_REAPER_RATIO_NUM_LARGE * memstat_critical_threshold) /
2105 			    MEMORYSTATUS_REAPER_RATIO_DENOM_LARGE;
2106 		} else {
2107 			memstat_reaper_threshold = (MEMORYSTATUS_REAPER_RATIO_NUM * memstat_critical_threshold) /
2108 			    MEMORYSTATUS_REAPER_RATIO_DENOM;
2109 		}
2110 	}
2111 	assert3u(memstat_reaper_threshold, <, MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX);
2112 
2113 	if (memorystatus_reaper_minimum_age_seconds != MEMORYSTATUS_REAPER_SENTINAL_VALUE_MEANING_USE_DEFAULT) {
2114 		memstat_reaper_min_age_secs = memorystatus_reaper_minimum_age_seconds;
2115 	} else {
2116 		if ((max_mem > MEMORYSTATUS_SMALL_MEMORY_THRESHOLD) &&
2117 		    (max_mem <= MEMORYSTATUS_MEDIUM_MEMORY_THRESHOLD)) {
2118 			memstat_reaper_min_age_secs = MEMORYSTATUS_REAPER_MIN_AGE_SECS_DEFAULT_MEDIUM;
2119 		} else {
2120 			memstat_reaper_min_age_secs = MEMORYSTATUS_REAPER_MIN_AGE_SECS_DEFAULT;
2121 		}
2122 	}
2123 
2124 	if (memorystatus_reaper_minimum_age_apps_seconds != MEMORYSTATUS_REAPER_SENTINAL_VALUE_MEANING_USE_DEFAULT) {
2125 		memstat_reaper_min_age_apps_secs = memorystatus_reaper_minimum_age_apps_seconds;
2126 	} else {
2127 		if ((max_mem > MEMORYSTATUS_SMALL_MEMORY_THRESHOLD) &&
2128 		    (max_mem <= MEMORYSTATUS_MEDIUM_MEMORY_THRESHOLD)) {
2129 			memstat_reaper_min_age_apps_secs = MEMORYSTATUS_REAPER_MIN_AGE_APPS_SECS_DEFAULT_MEDIUM;
2130 		} else {
2131 			memstat_reaper_min_age_apps_secs = MEMORYSTATUS_REAPER_MIN_AGE_APPS_SECS_DEFAULT;
2132 		}
2133 	}
2134 
2135 	if (memorystatus_reaper_rescan_delay_seconds != MEMORYSTATUS_REAPER_SENTINAL_VALUE_MEANING_USE_DEFAULT) {
2136 		memstat_reaper_rescan_secs = memorystatus_reaper_rescan_delay_seconds;
2137 	} else {
2138 		memstat_reaper_rescan_secs = MEMORYSTATUS_REAPER_RESCAN_SECS_DEFAULT;
2139 	}
2140 
2141 	memstat_reaper_enabled = memorystatus_reaper_enabled;
2142 
2143 	if (memstat_ballast_offset_mb != 0) {
2144 		memstat_ballast_offset = atop_32(memstat_ballast_offset_mb << 20);
2145 	}
2146 	assert3u(memstat_ballast_offset, <, MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX);
2147 
2148 	if (memstat_ctd_offset_mb != 0) {
2149 		memstat_ctd_offset = atop_32(memstat_ctd_offset_mb << 20);
2150 	}
2151 	assert3u(memstat_ctd_offset, <, MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX);
2152 
2153 	/* Set the swapin trigger in pages based on the maximum size allocated for each c_seg */
2154 	memorystatus_swapin_trigger_pages = (unsigned int) atop_64(memorystatus_swapin_trigger_segments * c_seg_allocsize);
2155 
2156 	/* Jetsam Loop Detection */
2157 	if (max_mem <= (512 * 1024 * 1024)) {
2158 		/* 512 MB devices */
2159 		memorystatus_jld_eval_period_msecs = 8000;      /* 8000 msecs == 8 second window */
2160 	} else {
2161 		/* 1GB and larger devices */
2162 		memorystatus_jld_eval_period_msecs = 6000;      /* 6000 msecs == 6 second window */
2163 	}
2164 
2165 	memorystatus_jld_enabled = TRUE;
2166 
2167 	initialize_entitled_max_task_limit();
2168 #endif /* CONFIG_JETSAM */
2169 
2170 	memorystatus_jetsam_snapshot_max = maxproc;
2171 
2172 	memorystatus_jetsam_snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
2173 	    (sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_max);
2174 
2175 	memorystatus_jetsam_snapshot = kalloc_data(memorystatus_jetsam_snapshot_size, Z_WAITOK | Z_ZERO);
2176 	if (!memorystatus_jetsam_snapshot) {
2177 		panic("Could not allocate memorystatus_jetsam_snapshot");
2178 	}
2179 
2180 #if CONFIG_FREEZE
2181 	memorystatus_jetsam_snapshot_freezer_max = memorystatus_jetsam_snapshot_max / JETSAM_SNAPSHOT_FREEZER_MAX_FACTOR;
2182 	memorystatus_jetsam_snapshot_freezer_size = sizeof(memorystatus_jetsam_snapshot_t) +
2183 	    (sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_freezer_max);
2184 
2185 	memorystatus_jetsam_snapshot_freezer =
2186 	    zalloc_permanent(memorystatus_jetsam_snapshot_freezer_size, ZALIGN_PTR);
2187 #endif /* CONFIG_FREEZE */
2188 
2189 	nanoseconds_to_absolutetime((uint64_t)JETSAM_SNAPSHOT_TIMEOUT_SECS * NSEC_PER_SEC, &memorystatus_jetsam_snapshot_timeout);
2190 
2191 	memset(&memorystatus_at_boot_snapshot, 0, sizeof(memorystatus_jetsam_snapshot_t));
2192 
2193 #if CONFIG_FREEZE
2194 	if (memorystatus_freeze_threshold_mb != 0) {
2195 		memorystatus_freeze_threshold = (unsigned int)atop_64((uint64_t)memorystatus_freeze_threshold_mb << 20);
2196 	} else {
2197 		memorystatus_freeze_threshold = (unsigned int)MEMSTAT_PERCENT_TOTAL_PAGES(MEMORYSTATUS_FREEZE_THRESHOLD_PERCENTAGE);
2198 	}
2199 	assert(memorystatus_freeze_threshold < (unsigned int)atop_64(max_mem));
2200 
2201 	if (memorystatus_swap_all_apps) {
2202 		/*
2203 		 * Swap is enabled, so we expect a larger working set & larger apps.
2204 		 * Adjust thresholds accordingly.
2205 		 */
2206 		memorystatus_freeze_configure_for_swap();
2207 	}
2208 #endif
2209 
2210 	/* Check the boot-arg to configure the maximum number of jetsam threads */
2211 	if (!PE_parse_boot_argn("max_jetsam_threads", &max_jetsam_threads, sizeof(max_jetsam_threads))) {
2212 		max_jetsam_threads = JETSAM_THREADS_LIMIT;
2213 	}
2214 
2215 	/* Restrict the maximum number of jetsam threads to JETSAM_THREADS_LIMIT */
2216 	if (max_jetsam_threads > JETSAM_THREADS_LIMIT) {
2217 		max_jetsam_threads = JETSAM_THREADS_LIMIT;
2218 	}
2219 
2220 #if CONFIG_JETSAM
2221 	/* For low CPU systems disable fast jetsam mechanism */
2222 	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
2223 		max_jetsam_threads = 1;
2224 	}
2225 #endif /* CONFIG_JETSAM */
2226 
2227 #if DEVELOPMENT || DEBUG
2228 	if (PE_parse_boot_argn("-memorystatus-skip-fg-notify", &i, sizeof(i))) {
2229 		memorystatus_should_issue_fg_band_notify = false;
2230 	}
2231 
2232 	if (PE_parse_boot_argn("memorystatus_kill_on_sustained_pressure", &i, sizeof(i))) {
2233 		if (i) {
2234 			memstat_pressure_config |= MEMSTAT_WARNING_KILL_SUSTAINED;
2235 		} else {
2236 			memstat_pressure_config &= ~MEMSTAT_WARNING_KILL_SUSTAINED;
2237 		}
2238 	}
2239 #endif /* DEVELOPMENT || DEBUG */
2240 
2241 	/* Initialize the jetsam_threads state array */
2242 	jetsam_threads = zalloc_permanent(sizeof(struct jetsam_state_s) *
2243 	    max_jetsam_threads, ZALIGN(struct jetsam_state_s));
2244 
2245 	/* Initialize all the jetsam threads */
2246 	for (i = 0; i < max_jetsam_threads; i++) {
2247 		jetsam_threads[i].inited = false;
2248 		jetsam_threads[i].index = i;
2249 		result = kernel_thread_start_priority(memorystatus_thread, NULL, 95 /* MAXPRI_KERNEL */, &jetsam_threads[i].thread);
2250 		if (result != KERN_SUCCESS) {
2251 			panic("Could not create memorystatus_thread %d", i);
2252 		}
2253 		thread_deallocate(jetsam_threads[i].thread);
2254 	}
2255 
2256 #if VM_PRESSURE_EVENTS
2257 	memorystatus_notify_init();
2258 #endif /* VM_PRESSURE_EVENTS */
2259 
2260 #if JETSAM_ZPRINT_SNAPSHOT
2261 	size_t jzs_names_size, jzs_info_size, jzs_meminfo_size;
2262 
2263 	jzs_zone_cnt = zone_max_zones();
2264 	jzs_names_size = jzs_zone_cnt * sizeof(mach_zone_name_t);
2265 	jzs_names = zalloc_permanent(jzs_names_size, ZALIGN(mach_zone_name_t));
2266 
2267 	jzs_info_size = jzs_zone_cnt * sizeof(mach_zone_info_t);
2268 	jzs_info = zalloc_permanent(jzs_info_size, ZALIGN(mach_zone_info_t));
2269 
2270 	jzs_coalesce = zalloc_permanent(jzs_zone_cnt * sizeof(int), ZALIGN(int));
2271 
2272 	jzs_meminfo_cnt = vm_page_diagnose_estimate();
2273 	jzs_meminfo_size = jzs_meminfo_cnt * sizeof(mach_memory_info_t);
2274 	jzs_meminfo = kalloc_data_tag(jzs_meminfo_size, Z_WAITOK, VM_KERN_MEMORY_DIAG);
2275 #endif /* JETSAM_ZPRINT_SNAPSHOT */
2276 
2277 	bzero(memorystatus_kill_counts, sizeof(memorystatus_kill_counts));
2278 }
2279 
2280 #if CONFIG_JETSAM
2281 bool
memorystatus_disable_swap(void)2282 memorystatus_disable_swap(void)
2283 {
2284 #if DEVELOPMENT || DEBUG
2285 	int boot_arg_val = 0;
2286 	if (PE_parse_boot_argn("kern.swap_all_apps", &boot_arg_val, sizeof(boot_arg_val))) {
2287 		if (boot_arg_val) {
2288 			/* Can't disable app swap if it was set via a boot-arg */
2289 			return false;
2290 		}
2291 	}
2292 #endif /* DEVELOPMENT || DEBUG */
2293 	memorystatus_swap_all_apps = false;
2294 #if CONFIG_FREEZE
2295 	/* Go back to the smaller freezer thresholds */
2296 	memorystatus_freeze_disable_swap();
2297 #endif /* CONFIG_FREEZE */
2298 	initialize_entitled_max_task_limit();
2299 	return true;
2300 }
2301 #endif /* CONFIG_JETSAM */
2302 
2303 static void
_memstat_record_kill(int32_t priority,memorystatus_kill_cause_t cause)2304 _memstat_record_kill(int32_t priority, memorystatus_kill_cause_t cause)
2305 {
2306 	uint32_t _Atomic *count;
2307 	uint32_t orig;
2308 
2309 	/* Check validity of reason / cause */
2310 	if ((priority < JETSAM_PRIORITY_IDLE) ||
2311 	    (priority > JETSAM_PRIORITY_MAX) ||
2312 	    (cause <= kMemorystatusInvalid) ||
2313 	    (cause > JETSAM_REASON_MEMORYSTATUS_MAX)) {
2314 		memorystatus_log_error("memorystatus: not tracking kill with invalid priority %d / cause %d\n",
2315 		    priority, cause);
2316 		return;
2317 	}
2318 
2319 	if ((priority == JETSAM_PRIORITY_IDLE) && (cause == kMemorystatusKilledIdleExit)) {
2320 		/* rdar://141462516 */
2321 		count = &memorystatus_idle_exit_kill_count;
2322 	} else {
2323 		if (cause == kMemorystatusKilledIdleExit) {
2324 			memorystatus_log_error("memorystatus: not tracking idle exit kill for priority %d\n", priority);
2325 			return;
2326 		}
2327 
2328 		/* kMemorystatusKilledIdleExit and kMemorystatusInvalid are not in the array */
2329 		if (cause < kMemorystatusKilledIdleExit) {
2330 			count = &memorystatus_kill_counts[priority][cause - 1];
2331 		} else {
2332 			count = &memorystatus_kill_counts[priority][cause - 2];
2333 		}
2334 	}
2335 
2336 	orig = os_atomic_inc_orig(count, relaxed);
2337 	if (orig == UINT32_MAX) {
2338 		os_atomic_dec(count, relaxed);
2339 		memorystatus_log_error("memorystatus: overflowed kill count for priority %d + cause %d\n", priority, cause);
2340 	}
2341 }
2342 
2343 /*
2344  * The jetsam no frills kill call
2345  *      Return: 0 on success
2346  *		error code on failure (EINVAL...)
2347  */
2348 static int
jetsam_do_kill(proc_t p,int jetsam_flags,os_reason_t jetsam_reason)2349 jetsam_do_kill(proc_t p, int jetsam_flags, os_reason_t jetsam_reason)
2350 {
2351 	int error = 0;
2352 	error = exit_with_reason(p, W_EXITCODE(0, SIGKILL), (int *)NULL, FALSE, FALSE, jetsam_flags, jetsam_reason);
2353 	return error;
2354 }
2355 
2356 /*
2357  * Wrapper for processes exiting with memorystatus details
2358  */
/*
 * Kill process p on behalf of memorystatus: records the kill in the
 * statistics, emits kdebug/DTrace tracepoints, delivers a SIGKILL exit
 * carrying the jetsam os_reason, and then runs (or wakes) the VM compactor.
 *
 * Returns true when the underlying exit_with_reason() succeeded.
 * On return, *footprint_out (if non-NULL) holds the victim's physical
 * footprint sampled before the kill on success, or 0 on failure.
 *
 * Reference counting: an extra reference is taken on jetsam_reason for
 * jetsam_do_kill() (which drops it), and the local reference is released
 * via os_reason_free() before returning.
 */
static bool
memorystatus_do_kill(proc_t p, uint32_t cause, os_reason_t jetsam_reason, uint64_t *footprint_out)
{
	int error = 0;
	__unused pid_t victim_pid = proc_getpid(p);
	/* Sample the footprint up-front so it can be reported after the kill */
	uint64_t footprint = get_task_phys_footprint(proc_task(p));
#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD)
	int32_t memstat_effectivepriority = p->p_memstat_effectivepriority;
#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_DO_KILL) | DBG_FUNC_START,
	    victim_pid, cause, vm_page_free_count, footprint);
	DTRACE_MEMORYSTATUS4(memorystatus_do_kill, proc_t, p, os_reason_t, jetsam_reason, uint32_t, cause, uint64_t, footprint);

#if CONFIG_JETSAM
	/*
	 * Debugging aid: panic when the victim matches the configured process
	 * name, and the optional cause / minimum footprint filters (0 = any).
	 */
	if (*p->p_name && !strncmp(memorystatus_jetsam_proc_name_panic, p->p_name, sizeof(p->p_name))) { /* name */
		if ((!memorystatus_jetsam_proc_cause_panic || cause == memorystatus_jetsam_proc_cause_panic) && /* cause */
		    (!memorystatus_jetsam_proc_size_panic || (footprint >> 20) >= memorystatus_jetsam_proc_size_panic)) { /* footprint */
			panic("memorystatus_do_kill(): requested panic on jetsam of %s (cause: %d and footprint: %llu mb)",
			    memorystatus_jetsam_proc_name_panic, cause, footprint >> 20);
		}
	}
#else /* CONFIG_JETSAM */
#pragma unused(cause)
#endif /* CONFIG_JETSAM */

	/* High-band kills are unusual enough to always log */
	if (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND) {
		memorystatus_log(
			"memorystatus: killing process %d [%s] in high band %s (%d) - memorystatus_available_pages: %llu\n",
			proc_getpid(p), (*p->p_name ? p->p_name : "unknown"),
			memorystatus_priority_band_name(p->p_memstat_effectivepriority), p->p_memstat_effectivepriority,
			(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
	}

	_memstat_record_kill(p->p_memstat_effectivepriority, cause);

	/*
	 * The jetsam_reason (os_reason_t) has enough information about the kill cause.
	 * We don't really need jetsam_flags anymore, so it's okay that not all possible kill causes have been mapped.
	 */
	int jetsam_flags = P_LTERM_JETSAM;
	switch (cause) {
	case kMemorystatusKilledHiwat:                                          jetsam_flags |= P_JETSAM_HIWAT; break;
	case kMemorystatusKilledVnodes:                                         jetsam_flags |= P_JETSAM_VNODE; break;
	case kMemorystatusKilledVMPageShortage:                         jetsam_flags |= P_JETSAM_VMPAGESHORTAGE; break;
	case kMemorystatusKilledVMCompressorThrashing:
	case kMemorystatusKilledVMCompressorSpaceShortage:      jetsam_flags |= P_JETSAM_VMTHRASHING; break;
	case kMemorystatusKilledFCThrashing:                            jetsam_flags |= P_JETSAM_FCTHRASHING; break;
	case kMemorystatusKilledPerProcessLimit:                        jetsam_flags |= P_JETSAM_PID; break;
	case kMemorystatusKilledIdleExit:                                       jetsam_flags |= P_JETSAM_IDLEEXIT; break;
	case kMemorystatusKilledConclaveLimit:                        jetsam_flags |= P_JETSAM_PID; break;
	}
	/* jetsam_do_kill drops a reference. */
	os_reason_ref(jetsam_reason);
	error = jetsam_do_kill(p, jetsam_flags, jetsam_reason);
	if (footprint_out) {
		*footprint_out = ((error == 0) ? footprint : 0);
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_DO_KILL) | DBG_FUNC_END,
	    victim_pid, memstat_effectivepriority, vm_page_free_count, error);

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_COMPACTOR_RUN) | DBG_FUNC_START,
	    victim_pid, cause, vm_page_free_count, footprint);

	if (jetsam_reason->osr_code == JETSAM_REASON_VNODE) {
		/*
		 * vnode jetsams are synchronous and not caused by memory pressure.
		 * Running the compactor on this thread adds significant latency to the filesystem operation
		 * that triggered this jetsam.
		 * Kick off the compactor thread asynchronously instead.
		 */
		vm_wake_compactor_swapper();
	} else {
		/* Compact now, except for idle reaper kills.
		 * Idle reaper kills are done in batches, so we defer compaction until the end of the batch.
		 */
		if (jetsam_reason->osr_code != JETSAM_REASON_MEMORY_LONGIDLE_EXIT) {
			vm_run_compactor();
		}
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_COMPACTOR_RUN) | DBG_FUNC_END,
	    victim_pid, cause, vm_page_free_count);

	os_reason_free(jetsam_reason);
	return error == 0;
}
2447 
2448 static int
memstat_update_inactive_priority(proc_t p,boolean_t enable,int jetsam_prio,boolean_t effective_now)2449 memstat_update_inactive_priority(proc_t  p, boolean_t enable, int jetsam_prio, boolean_t effective_now)
2450 {
2451 	if (_memstat_proc_is_internal(p)) {
2452 		return EINVAL;
2453 	}
2454 
2455 	if ((enable && _memstat_proc_is_elevated(p)) ||
2456 	    (!enable && !_memstat_proc_is_elevated(p))) {
2457 		/*
2458 		 * No change in state.
2459 		 */
2460 	} else {
2461 		proc_list_lock();
2462 
2463 		if (enable) {
2464 			p->p_memstat_state |= P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
2465 
2466 			if (effective_now) {
2467 				if (p->p_memstat_effectivepriority < jetsam_prio) {
2468 					memstat_update_priority_locked(p, jetsam_prio, MEMSTAT_PRIORITY_OPTIONS_NONE);
2469 				}
2470 			} else {
2471 				if (_memstat_proc_is_aging(p)) {
2472 					memstat_update_priority_locked(p, JETSAM_PRIORITY_IDLE, MEMSTAT_PRIORITY_NO_AGING);
2473 				}
2474 			}
2475 		} else {
2476 			p->p_memstat_state &= ~P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
2477 
2478 			if (effective_now) {
2479 				if (p->p_memstat_effectivepriority == jetsam_prio) {
2480 					memstat_update_priority_locked(p, JETSAM_PRIORITY_IDLE, MEMSTAT_PRIORITY_NO_AGING);
2481 				}
2482 			} else {
2483 				if (_memstat_proc_is_aging(p)) {
2484 					memstat_update_priority_locked(p, JETSAM_PRIORITY_IDLE, MEMSTAT_PRIORITY_NO_AGING);
2485 				}
2486 			}
2487 		}
2488 		proc_list_unlock();
2489 	}
2490 	return 0;
2491 }
2492 
2493 /*
2494  * Pin a process to a particular jetsam band when it is in the background i.e. not doing active work.
2495  * For an application: that means no longer in the FG band
2496  * For a daemon: that means no longer in its 'requested' jetsam priority band
2497  */
2498 
2499 int
memorystatus_update_inactive_jetsam_priority_band(pid_t pid,uint32_t op_flags,int jetsam_prio,boolean_t effective_now)2500 memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t op_flags, int jetsam_prio, boolean_t effective_now)
2501 {
2502 	int error = 0;
2503 	boolean_t enable = FALSE;
2504 	proc_t  p = NULL;
2505 
2506 	/* Validate inputs */
2507 	if (pid == 0) {
2508 		return EINVAL;
2509 	}
2510 
2511 	if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE) {
2512 		enable = TRUE;
2513 	} else if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE) {
2514 		enable = FALSE;
2515 	} else {
2516 		return EINVAL;
2517 	}
2518 
2519 	p = proc_find(pid);
2520 	if (p != NULL) {
2521 		error = memstat_update_inactive_priority(p, enable, jetsam_prio, effective_now);
2522 		proc_rele(p);
2523 	} else {
2524 		error = ESRCH;
2525 	}
2526 	return error;
2527 }
2528 
2529 static bool
_memstat_proc_has_importance_assertion(proc_t p)2530 _memstat_proc_has_importance_assertion(proc_t p)
2531 {
2532 	return (p->p_memstat_state & P_MEMSTAT_TEST_IMP_ASSERTION) || task_has_assertions(proc_task(p));
2533 }
2534 
2535 static void
_memstat_perform_idle_demotion_for_band(unsigned int demote_prio_band)2536 _memstat_perform_idle_demotion_for_band(unsigned int demote_prio_band)
2537 {
2538 	proc_t p;
2539 	uint64_t current_time = 0, idle_delay_time = 0;
2540 	memstat_bucket_t *demotion_bucket;
2541 
2542 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
2543 
2544 	current_time = mach_absolute_time();
2545 
2546 	demotion_bucket = &memstat_bucket[demote_prio_band];
2547 	p = memorystatus_get_first_proc_locked(&demote_prio_band, FALSE);
2548 
2549 	while (p) {
2550 		memorystatus_log_debug("memorystatus_perform_idle_demotion() found %s [%d]\n", proc_best_name(p), proc_getpid(p));
2551 
2552 		assert(p->p_memstat_idledeadline);
2553 
2554 		assert(_memstat_proc_is_aging(p));
2555 
2556 		if (current_time >= p->p_memstat_idledeadline) {
2557 			proc_t next_proc = NULL;
2558 
2559 			next_proc = memorystatus_get_next_proc_locked(&demote_prio_band, p, FALSE);
2560 
2561 			if ((isSysProc(p) && _memstat_proc_is_dirty(p)) || /* system proc marked dirty*/
2562 			    _memstat_proc_has_importance_assertion(p)) {    /* has outstanding assertions which might indicate outstanding work too */
2563 				idle_delay_time = (isSysProc(p)) ? memorystatus_sysprocs_idle_time(p) : memorystatus_apps_idle_time(p);
2564 
2565 				if (isSysProc(p) && _memstat_proc_has_importance_assertion(p)) {
2566 					if (demote_prio_band != system_procs_aging_band_stuck) {
2567 						memorystatus_log_debug("memorystatus_perform_idle_demotion() found stuck process %d [%s], moving to JETSAM_PRIORITY_AGING_BAND1_STUCK\n",
2568 						    proc_getpid(p), (*p->p_name ? p->p_name : "(unknown)"));
2569 						memstat_update_priority_locked(p, JETSAM_PRIORITY_AGING_BAND1_STUCK, MEMSTAT_PRIORITY_NO_AGING);
2570 						idle_delay_time = _memstat_sysprocs_aging_stuck_delay_time(p);
2571 						KDBG(MEMSTAT_CODE(BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_NONE,
2572 						    proc_pid(p), JETSAM_PRIORITY_AGING_BAND1_STUCK, p->p_memstat_idledeadline + idle_delay_time);
2573 					} else {
2574 						memorystatus_log("memorystatus_perform_idle_demotion() timed out stuck process %d [%s], moving to idle band\n",
2575 						    proc_getpid(p), proc_best_name(p));
2576 						memstat_update_priority_locked(p, JETSAM_PRIORITY_IDLE, MEMSTAT_PRIORITY_NO_AGING);
2577 						KDBG(MEMSTAT_CODE(BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_NONE,
2578 						    proc_pid(p), JETSAM_PRIORITY_IDLE, p->p_memstat_idledeadline);
2579 						idle_delay_time = 0;
2580 					}
2581 				}
2582 
2583 				p->p_memstat_idledeadline += idle_delay_time;
2584 			} else {
2585 				memstat_update_priority_locked(p, JETSAM_PRIORITY_IDLE, MEMSTAT_PRIORITY_NO_AGING);
2586 				KDBG(MEMSTAT_CODE(BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_NONE,
2587 				    proc_pid(p), JETSAM_PRIORITY_IDLE, p->p_memstat_idledeadline);
2588 			}
2589 			p = next_proc;
2590 		} else {
2591 			// No further candidates
2592 			break;
2593 		}
2594 	}
2595 }
2596 
2597 static void
memorystatus_perform_idle_demotion(__unused void * spare1,__unused void * spare2)2598 memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2)
2599 {
2600 	memorystatus_log_debug("memorystatus_perform_idle_demotion()\n");
2601 
2602 	if (!system_procs_aging_band && !system_procs_aging_band_stuck && !applications_aging_band) {
2603 		return;
2604 	}
2605 
2606 	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_START);
2607 
2608 	proc_list_lock();
2609 
2610 	_memstat_perform_idle_demotion_for_band(system_procs_aging_band);
2611 	_memstat_perform_idle_demotion_for_band(system_procs_aging_band_stuck);
2612 	_memstat_perform_idle_demotion_for_band(applications_aging_band);
2613 
2614 	_memstat_reschedule_idle_demotion_locked();
2615 
2616 	proc_list_unlock();
2617 
2618 	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_END);
2619 }
2620 
2621 /*
2622  * Schedule a process for idle demotion. Updates the process' idle deadline
2623  * and marks it as aging. The caller is responsible for rescheduling the idle
2624  * demotion thread
2625  */
2626 static void
_memstat_schedule_idle_demotion_locked(proc_t p)2627 _memstat_schedule_idle_demotion_locked(proc_t p)
2628 {
2629 	uint64_t  idle_delay_time = 0;
2630 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
2631 	assert(system_procs_aging_band || applications_aging_band);
2632 	assert(!_memstat_proc_is_aging(p));
2633 
2634 	memorystatus_log_debug(
2635 		"%s: scheduling demotion to idle band for pid %d (dirty:0x%x).\n",
2636 		__func__, proc_getpid(p), p->p_memstat_dirty);
2637 
2638 	idle_delay_time = isSysProc(p) ? memorystatus_sysprocs_idle_time(p) :
2639 	    memorystatus_apps_idle_time(p);
2640 	p->p_memstat_dirty |= P_DIRTY_AGING_IN_PROGRESS;
2641 	p->p_memstat_idledeadline = mach_absolute_time() + idle_delay_time;
2642 }
2643 
2644 /*
2645  * Cancel a process' idle demotion. The caller must also reschedule the idle
2646  * demotion thread.
2647  */
2648 static void
_memstat_invalidate_idle_demotion_locked(proc_t p)2649 _memstat_invalidate_idle_demotion_locked(proc_t p)
2650 {
2651 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
2652 	assert(system_procs_aging_band || applications_aging_band);
2653 	assert(_memstat_proc_is_aging(p));
2654 
2655 	memorystatus_log_debug(
2656 		"%s: invalidating demotion to idle band for %s [%d]\n",
2657 		__func__, proc_best_name(p), proc_getpid(p));
2658 
2659 	p->p_memstat_idledeadline = 0;
2660 	p->p_memstat_dirty &= ~P_DIRTY_AGING_IN_PROGRESS;
2661 }
2662 
2663 /*
2664  * Return the earliest idle deadline of all aging procs. Returns 0 if there are
2665  * no aging procs.
2666  */
2667 static uint64_t
_memstat_find_earliest_idle_deadline(void)2668 _memstat_find_earliest_idle_deadline(void)
2669 {
2670 	memstat_bucket_t *demotion_bucket;
2671 	proc_t oldest_proc = PROC_NULL;
2672 	uint32_t aging_app_count = 0, aging_sysproc_count = 0, aging_sysproc_count_stuck = 0;
2673 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
2674 	assert(system_procs_aging_band || system_procs_aging_band_stuck || applications_aging_band);
2675 
2676 	if (system_procs_aging_band) {
2677 		aging_sysproc_count = memstat_bucket[system_procs_aging_band].count;
2678 	}
2679 	if (system_procs_aging_band_stuck) {
2680 		aging_sysproc_count_stuck = memstat_bucket[system_procs_aging_band_stuck].count;
2681 	}
2682 	if (applications_aging_band) {
2683 		aging_app_count = memstat_bucket[applications_aging_band].count;
2684 	}
2685 
2686 	if ((aging_app_count + aging_sysproc_count + aging_sysproc_count_stuck) == 0) {
2687 		return 0;
2688 	}
2689 
2690 	if (system_procs_aging_band && aging_sysproc_count > 0) {
2691 		demotion_bucket = &memstat_bucket[system_procs_aging_band];
2692 		oldest_proc = TAILQ_FIRST(&demotion_bucket->list);
2693 	}
2694 
2695 	if (system_procs_aging_band_stuck && aging_sysproc_count_stuck > 0) {
2696 		proc_t oldest_sysproc_stuck;
2697 		demotion_bucket = &memstat_bucket[system_procs_aging_band_stuck];
2698 		oldest_sysproc_stuck = TAILQ_FIRST(&demotion_bucket->list);
2699 
2700 		if (oldest_proc) {
2701 			if (oldest_sysproc_stuck->p_memstat_idledeadline <
2702 			    oldest_proc->p_memstat_idledeadline) {
2703 				oldest_proc = oldest_sysproc_stuck;
2704 			}
2705 		} else {
2706 			oldest_proc = oldest_sysproc_stuck;
2707 		}
2708 	}
2709 
2710 	if (applications_aging_band && aging_app_count > 0) {
2711 		proc_t oldest_app;
2712 		demotion_bucket = &memstat_bucket[applications_aging_band];
2713 		oldest_app = TAILQ_FIRST(&demotion_bucket->list);
2714 
2715 		if (!oldest_proc ||
2716 		    (oldest_app->p_memstat_idledeadline <
2717 		    oldest_proc->p_memstat_idledeadline)) {
2718 			oldest_proc = oldest_app;
2719 		}
2720 	}
2721 
2722 	assert(oldest_proc);
2723 	assert(oldest_proc->p_memstat_idledeadline);
2724 	assert(_memstat_proc_is_aging(oldest_proc));
2725 
2726 	return oldest_proc->p_memstat_idledeadline;
2727 }
2728 
2729 /*
2730  * Reschedule or cancel a pending wakeup of the idle_demotion thread. If called
2731  * in response to a process transitioning in/out of the aging band, then
2732  * rescheduling must occur *after* the new priority is updated.
2733  */
2734 static void
_memstat_reschedule_idle_demotion_locked(void)2735 _memstat_reschedule_idle_demotion_locked(void)
2736 {
2737 	uint64_t idle_deadline;
2738 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
2739 
2740 	if (!system_procs_aging_band && !applications_aging_band) {
2741 		return;
2742 	}
2743 	idle_deadline = _memstat_find_earliest_idle_deadline();
2744 	if (idle_deadline == 0) {
2745 		/* No aging processes, cancel call to demotion thread */
2746 		thread_call_cancel(memorystatus_idle_demotion_call);
2747 	} else if (memstat_idle_demotion_deadline != idle_deadline) {
2748 		thread_call_enter_delayed(memorystatus_idle_demotion_call, idle_deadline);
2749 	}
2750 	memstat_idle_demotion_deadline = idle_deadline;
2751 }
2752 
2753 /*
2754  * List manipulation
2755  */
2756 
/*
 * Register process p with memorystatus: insert it at the tail of the bucket
 * matching its current effective priority and initialize its freeze,
 * aging, and suspension bookkeeping. Always returns 0.
 *
 * If `locked` is TRUE the caller already holds proc_list_mlock; otherwise
 * it is taken and dropped here. Internal processes are counted but not
 * placed in a priority bucket.
 */
int
memorystatus_add(proc_t p, boolean_t locked)
{
	memstat_bucket_t *bucket;
	bool reschedule_demotion = false;

	memorystatus_log_debug("memorystatus_list_add(): adding pid %d with priority %d.\n",
	    proc_getpid(p), p->p_memstat_effectivepriority);

	if (!locked) {
		proc_list_lock();
	}

	DTRACE_MEMORYSTATUS2(memorystatus_add, proc_t, p, int32_t, p->p_memstat_effectivepriority);

	/* Processes marked internal do not have priority tracked */
	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
		goto exit;
	}

	/*
	 * Opt out system processes from being frozen by default.
	 * For coalition-based freezing, we only want to freeze sysprocs that have specifically opted in.
	 */
	if (isSysProc(p)) {
		p->p_memstat_state |= P_MEMSTAT_FREEZE_DISABLED;
	}
#if CONFIG_FREEZE
	memorystatus_freeze_init_proc(p);
#endif

	bucket = &memstat_bucket[p->p_memstat_effectivepriority];

	/* Landing directly in an aging band: start its demotion clock */
	if ((system_procs_aging_band &&
	    p->p_memstat_effectivepriority == system_procs_aging_band) ||
	    (applications_aging_band &&
	    p->p_memstat_effectivepriority == applications_aging_band)) {
		_memstat_schedule_idle_demotion_locked(p);
		reschedule_demotion = true;
	}

	/* Timestamp when this proc entered its current band (see _memstat_record_prio_transition) */
	p->p_memstat_prio_start = mach_absolute_time();

	TAILQ_INSERT_TAIL(&bucket->list, p, p_memstat_list);
	bucket->count++;
	if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
		bucket->relaunch_high_count++;
	}

	memorystatus_list_count++;

	/* Reschedule only after the proc is in its bucket */
	if (reschedule_demotion) {
		_memstat_reschedule_idle_demotion_locked();
	}

	task_t t = proc_task(p);
	if (t && task_is_app_suspended(t)) {
		_memstat_proc_set_suspended(p);
	}

	_memstat_consider_waking_jetsam_thread();

exit:
	if (!locked) {
		proc_list_unlock();
	}

	return 0;
}
2826 
2827 /*
2828  * Record timestamps if process p is transitioning in/out of the IDLE band.
2829  */
2830 static void
_memstat_record_prio_transition(proc_t p,int new_priority)2831 _memstat_record_prio_transition(proc_t p, int new_priority)
2832 {
2833 	uint64_t now;
2834 
2835 	if (p->p_memstat_effectivepriority == new_priority) {
2836 		/* no change in priority */
2837 		return;
2838 	}
2839 
2840 	now = mach_absolute_time();
2841 
2842 	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
2843 		/*
2844 		 * Transitioning out of the idle priority bucket.
2845 		 * Record idle delta.
2846 		 */
2847 		assert(p->p_memstat_prio_start != 0);
2848 		if (now < p->p_memstat_prio_start) {
2849 			// rdar://139660508
2850 			memorystatus_log_error("memorystatus: prio_start > mach_absolute_time "
2851 			    "for %s(%d)? Using delta of 0.\n",
2852 			    proc_best_name(p), proc_getpid(p));
2853 			p->p_memstat_prio_start = now;
2854 		}
2855 		p->p_memstat_idle_delta = now - p->p_memstat_prio_start;
2856 
2857 		/*
2858 		 * About to become active and so memory footprint could change.
2859 		 * So mark it eligible for freeze-considerations next time around.
2860 		 */
2861 		p->p_memstat_state &= ~P_MEMSTAT_FREEZE_IGNORE;
2862 
2863 		_memstat_consider_waking_jetsam_thread();
2864 	}
2865 	p->p_memstat_prio_start = now;
2866 }
2867 
2868 /*
2869  * Description:
2870  *	Moves a process from one jetsam bucket to another.
2871  *	which changes the LRU position of the process.
2872  *
2873  *	Monitors transition between buckets and if necessary
2874  *	will update cached memory limits accordingly.
2875  *
2876  */
/*
 * Move process p to the jetsam bucket for `priority`, after clamping that
 * priority against idle-eligibility rules, the elevated-inactive band, and
 * (unless MEMSTAT_PRIORITY_NO_AGING) the aging bands. Updates aging
 * demotion scheduling, cached memlimits, and secluded-memory eligibility
 * as side effects. Caller must hold proc_list_mlock.
 */
void
memstat_update_priority_locked(proc_t p,
    int priority,
    memstat_priority_options_t options)
{
	memstat_bucket_t *old_bucket, *new_bucket;
	bool reschedule_demotion = false;

	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);

	assert(priority < MEMSTAT_BUCKET_COUNT);
	/* Not allowed */
	assert(!_memstat_proc_is_internal(p));

	/* Ensure that exit isn't underway, leaving the proc retained but removed from its bucket */
	if (proc_list_exited(p)) {
		return;
	}

	memorystatus_log_debug("memorystatus: setting %s(%d) to priority %d, inserting at %s\n",
	    (*p->p_name ? p->p_name : "unknown"), proc_getpid(p), priority,
	    (options & MEMSTAT_PRIORITY_INSERT_HEAD) ? "head" : "tail");

	DTRACE_MEMORYSTATUS3(memorystatus_update_priority, proc_t, p, int32_t, p->p_memstat_effectivepriority, int, priority);

	old_bucket = &memstat_bucket[p->p_memstat_effectivepriority];

	/*
	 * Only clean idle-exit-capable procs and assertion-less managed procs
	 * may sit in the idle band; everything else is bumped to BACKGROUND.
	 */
	if (priority == JETSAM_PRIORITY_IDLE &&
	    !(_memstat_proc_can_idle_exit(p) && !_memstat_proc_is_dirty(p)) &&
	    !(_memstat_proc_is_managed(p) && !_memstat_proc_has_priority_assertion(p))) {
		priority = JETSAM_PRIORITY_BACKGROUND;
		memorystatus_log_error("memorystatus: %s [%d] is neither "
		    "clean (0x%x) nor assertion-less (0x%x) and cannot "
		    "therefore be idle - overriding to pri %d\n",
		    proc_best_name(p), proc_getpid(p), p->p_memstat_dirty,
		    p->p_memstat_state, priority);
	}

	if (!(options & MEMSTAT_PRIORITY_NO_AGING)) {
		if (_memstat_proc_is_elevated(p)) {
			/*
			 * 2 types of processes can use the non-standard elevated inactive band:
			 * - Frozen processes that always land in memorystatus_freeze_jetsam_band
			 * OR
			 * - processes that specifically opt-in to the elevated inactive support e.g. docked processes.
			 */
			if (_memstat_proc_is_frozen(p) &&
			    priority <= memorystatus_freeze_jetsam_band) {
				priority = memorystatus_freeze_jetsam_band;
			} else if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE) {
				priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;
			}
		}
		/* Route descending procs through their aging band unless they already aged to idle */
		if (_memstat_proc_is_tracked(p)) {
			if (system_procs_aging_band && priority <= system_procs_aging_band) {
				if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
					/* process has already aged */
					priority = JETSAM_PRIORITY_IDLE;
				} else {
					priority = system_procs_aging_band;
				}
			} else if (system_procs_aging_band_stuck && priority <= system_procs_aging_band_stuck) {
				if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
					/* process has already aged */
					priority = JETSAM_PRIORITY_IDLE;
				} else {
					/* don't let anyone move anything between sysproc and sysproc stuck inclusive */
					priority = system_procs_aging_band;
				}
			}
		} else if (_memstat_proc_is_managed(p)) {
			if (applications_aging_band && priority <= applications_aging_band) {
				if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
					/* process has already aged */
					priority = JETSAM_PRIORITY_IDLE;
				} else {
					priority = applications_aging_band;
				}
			}
		}
	}

	/* Unlink from the old bucket before inserting into the new one */
	TAILQ_REMOVE(&old_bucket->list, p, p_memstat_list);
	old_bucket->count--;
	if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
		old_bucket->relaunch_high_count--;
	}

	new_bucket = &memstat_bucket[priority];
	if (options & MEMSTAT_PRIORITY_INSERT_HEAD) {
		TAILQ_INSERT_HEAD(&new_bucket->list, p, p_memstat_list);
	} else {
		TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list);
	}
	new_bucket->count++;
	if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
		new_bucket->relaunch_high_count++;
	}

	if (p->p_memstat_effectivepriority != priority) {
		/*
		 * This process is transitioning between
		 * jetsam priority buckets.
		 */
		_memstat_record_prio_transition(p, priority);

		if ((system_procs_aging_band &&
		    p->p_memstat_effectivepriority == system_procs_aging_band) ||
		    (system_procs_aging_band_stuck &&
		    p->p_memstat_effectivepriority == system_procs_aging_band_stuck) ||
		    (applications_aging_band &&
		    p->p_memstat_effectivepriority == applications_aging_band)) {
			/* removing this process from an aging band */
			_memstat_invalidate_idle_demotion_locked(p);
			reschedule_demotion = true;
		}

		if ((system_procs_aging_band &&
		    priority == system_procs_aging_band) ||
		    (system_procs_aging_band_stuck &&
		    priority == system_procs_aging_band_stuck) ||
		    (applications_aging_band &&
		    priority == applications_aging_band)) {
			/* placing this process into an aging band */
			_memstat_schedule_idle_demotion_locked(p);
			reschedule_demotion = true;
		}

		if (reschedule_demotion) {
			_memstat_reschedule_idle_demotion_locked();
		}

		KDBG(MEMSTAT_CODE(BSD_MEMSTAT_CHANGE_PRIORITY) | DBG_FUNC_NONE,
		    proc_getpid(p), priority, p->p_memstat_effectivepriority);
		p->p_memstat_effectivepriority = priority;
	}

	/* Active/inactive memlimit may differ; refresh the ledger for the new state */
	if (memorystatus_highwater_enabled) {
		const bool use_active = memstat_proc_is_active_locked(p);
		if (memstat_update_memlimit_locked(p, use_active)) {
			_memstat_write_memlimit_to_ledger_locked(p, use_active, false);
		}
	}

#if CONFIG_SECLUDED_MEMORY
	if (secluded_for_apps &&
	    task_could_use_secluded_mem(proc_task(p))) {
		task_set_can_use_secluded_mem(
			proc_task(p),
			(priority >= JETSAM_PRIORITY_FOREGROUND));
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	_memstat_consider_waking_jetsam_thread();
}
3032 
3033 int
memorystatus_relaunch_flags_update(proc_t p,int relaunch_flags)3034 memorystatus_relaunch_flags_update(proc_t p, int relaunch_flags)
3035 {
3036 	p->p_memstat_relaunch_flags = relaunch_flags;
3037 	KDBG(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_RELAUNCH_FLAGS), proc_getpid(p), relaunch_flags);
3038 	return 0;
3039 }
3040 
3041 #if DEVELOPMENT || DEBUG
3042 static int sysctl_memorystatus_relaunch_flags SYSCTL_HANDLER_ARGS {
3043 #pragma unused(oidp, arg1, arg2)
3044 	proc_t p;
3045 	int relaunch_flags = 0;
3046 
3047 	p = current_proc();
3048 	relaunch_flags = p->p_memstat_relaunch_flags;
3049 	switch (relaunch_flags) {
3050 	case P_MEMSTAT_RELAUNCH_LOW:
3051 		relaunch_flags = POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_LOW;
3052 		break;
3053 	case P_MEMSTAT_RELAUNCH_MED:
3054 		relaunch_flags = POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_MED;
3055 		break;
3056 	case P_MEMSTAT_RELAUNCH_HIGH:
3057 		relaunch_flags = POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_HIGH;
3058 		break;
3059 	}
3060 
3061 	return SYSCTL_OUT(req, &relaunch_flags, sizeof(relaunch_flags));
3062 }
3063 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_relaunch_flags, CTLTYPE_INT | CTLFLAG_RD |
3064     CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, sysctl_memorystatus_relaunch_flags, "I", "get relaunch flags for current process");
3065 #endif /* DEVELOPMENT || DEBUG */
3066 
3067 /*
3068  * Everything between the idle band and the application agining band
3069  * are reserved for internal use. We allow some entitled user space programs
3070  * to use this range for experimentation.
3071  */
3072 static bool
current_task_can_use_entitled_range()3073 current_task_can_use_entitled_range()
3074 {
3075 	static const char kInternalJetsamRangeEntitlement[] = "com.apple.private.internal-jetsam-range";
3076 	task_t task = current_task();
3077 	if (task == kernel_task) {
3078 		return true;
3079 	}
3080 	return IOTaskHasEntitlement(task, kInternalJetsamRangeEntitlement);
3081 }
3082 
/*
 * Set a process' requested priority band. This is the entry point used during
 * spawn and by memorystatus_control.
 *
 * priority:  requested jetsam band, or a shorthand handled below:
 *              -1                        -> JETSAM_PRIORITY_DEFAULT
 *              JETSAM_PRIORITY_IDLE_HEAD -> idle band, inserted at queue head
 *            Bands in (IDLE, AGING_BAND2] are reserved; unentitled requests
 *            there are clamped to JETSAM_PRIORITY_IDLE.
 * user_data: opaque per-process value stored for user space's benefit.
 * options:   MEMSTAT_PRIORITY_* flags; may be augmented below.
 *
 * Returns 0 on success, EINVAL for an out-of-range priority, EALREADY if an
 * "effective" change was already applied, EBUSY if the process is exiting.
 */
int
memorystatus_set_priority(proc_t p, int priority, uint64_t user_data,
    memstat_priority_options_t options)
{
	int ret;

	memorystatus_log_debug("memorystatus: changing (%s) pid %d: priority %d, user_data 0x%llx\n",
	    (*p->p_name ? p->p_name : "unknown"), proc_getpid(p), priority, user_data);

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_UPDATE) | DBG_FUNC_START, proc_getpid(p), priority, user_data, options);

	/* Normalize the requested priority before taking the lock. */
	if (priority == -1) {
		/* Use as shorthand for default priority */
		priority = JETSAM_PRIORITY_DEFAULT;
	} else if (priority > JETSAM_PRIORITY_IDLE && priority <= JETSAM_PRIORITY_AGING_BAND2) {
		/*
		 * Everything between idle and the aging bands are reserved for internal use.
		 * if requested, adjust to JETSAM_PRIORITY_IDLE.
		 * Entitled processes (just munch) can use a subset of this range for testing.
		 */
		if (priority > JETSAM_PRIORITY_ENTITLED_MAX ||
		    !current_task_can_use_entitled_range()) {
			priority = JETSAM_PRIORITY_IDLE;
			options |= MEMSTAT_PRIORITY_NO_AGING;
		}
	} else if (priority == JETSAM_PRIORITY_IDLE_HEAD) {
		/* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle queue */
		priority = JETSAM_PRIORITY_IDLE;
		options |= (MEMSTAT_PRIORITY_INSERT_HEAD | MEMSTAT_PRIORITY_NO_AGING);
	} else if ((priority < 0) || (priority >= MEMSTAT_BUCKET_COUNT)) {
		/* Sanity check */
		ret = EINVAL;
		goto out;
	}

	proc_list_lock();

	/* Internal processes never participate in priority tracking. */
	assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));

	/* An "effective" priority may be applied at most once per process. */
	if ((options & MEMSTAT_PRIORITY_IS_EFFECTIVE) &&
	    (p->p_memstat_state & P_MEMSTAT_PRIORITYUPDATED)) {
		ret = EALREADY;
		proc_list_unlock();
		memorystatus_log_error("memorystatus_update: effective change specified for pid %d, but change already occurred.\n",
		    proc_getpid(p));
		goto out;
	}

	if ((p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP)) || proc_list_exited(p)) {
		/*
		 * This could happen when a process calling posix_spawn() is exiting on the jetsam thread.
		 */
		ret = EBUSY;
		proc_list_unlock();
		goto out;
	}

	p->p_memstat_state |= P_MEMSTAT_PRIORITYUPDATED;
	p->p_memstat_userdata = user_data;

	if ((options & MEMSTAT_PRIORITY_IS_ASSERTION)) {
		if (priority != JETSAM_PRIORITY_IDLE) {
			/*
			 * Process is now being managed by assertions,
			 */
			p->p_memstat_state |= P_MEMSTAT_PRIORITY_ASSERTION;
			p->p_memstat_assertionpriority = priority;
		} else if (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION) {
			/*
			 * Assertions relinquish control when the process is heading to IDLE.
			 */
			p->p_memstat_state &= ~P_MEMSTAT_PRIORITY_ASSERTION;
		}

		/*
		 * A dirty (or non-idle-exit) tracked process stays at the higher of
		 * its assertion and requested priorities.
		 */
		if (_memstat_proc_is_tracked(p) &&
		    (_memstat_proc_is_dirty(p) || !_memstat_proc_can_idle_exit(p))) {
			priority = MAX(p->p_memstat_assertionpriority,
			    p->p_memstat_requestedpriority);
		}
	} else {
		p->p_memstat_requestedpriority = priority;
	}

	memstat_update_priority_locked(p, priority, options);

	proc_list_unlock();
	ret = 0;

out:
	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_UPDATE) | DBG_FUNC_END, ret);

	return ret;
}
3180 
3181 #if DEVELOPMENT || DEBUG
/*
 * Return `limit` bumped by `increase` MB, saturating at INT32_MAX on
 * overflow.  A non-positive limit (i.e. "unlimited"/default) maps to 0.
 * Used only on DEVELOPMENT/DEBUG kernels for overlay-root testing.
 */
static int32_t
memstat_increased_limit(int32_t limit, int32_t increase)
{
	int32_t bumped;

	if (limit <= 0) {
		return 0;
	}
	return os_add_overflow(limit, increase, &bumped) ? INT32_MAX : bumped;
}
3194 #endif /* DEVELOPMENT || DEBUG */
3195 
3196 static int
memstat_set_memlimits_locked(proc_t p,int32_t active_limit,int32_t inactive_limit,memlimit_options_t options)3197 memstat_set_memlimits_locked(proc_t p, int32_t active_limit,
3198     int32_t inactive_limit, memlimit_options_t options)
3199 {
3200 	/*
3201 	 * Posix_spawn'd processes and managed processes come through this path to
3202 	 * instantiate ledger limits. Forked processes do not come through this
3203 	 * path and will always receive the default task limit.
3204 	 */
3205 
3206 	int err = 0;
3207 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
3208 
3209 	int32_t default_active_limit = memorystatus_get_default_task_active_limit(p);
3210 	int32_t default_inactive_limit = memorystatus_get_default_task_inactive_limit(p);
3211 
3212 	/*
3213 	 * The special value of -1 specifies that this proc wants the default
3214 	 * memory limit
3215 	 */
3216 	if (active_limit <= 0) {
3217 		active_limit = default_active_limit;
3218 	}
3219 	if (inactive_limit <= 0) {
3220 		inactive_limit = default_inactive_limit;
3221 	}
3222 
3223 #if DEVELOPMENT || DEBUG
3224 	if (p->p_memlimit_increase) {
3225 		/* Apply memlimit increase (for testing with overlay roots) */
3226 		int32_t memlimit_increase = roundToNearestMB(p->p_memlimit_increase);
3227 		active_limit = memstat_increased_limit(active_limit, memlimit_increase);
3228 		inactive_limit = memstat_increased_limit(inactive_limit, memlimit_increase);
3229 	}
3230 #endif /* DEVELOPMENT || DEBUG */
3231 
3232 	/*
3233 	 * Work around a bug in JetsamProperties whereby processes may mistakenly receive
3234 	 * ActiveSoftMemoryLimit := -1 by forcing the default task limit to be fatal.
3235 	 */
3236 	if (default_active_limit && active_limit == default_active_limit) {
3237 		options |= MEMLIMIT_ACTIVE_FATAL;
3238 	}
3239 
3240 	if (default_inactive_limit && inactive_limit == default_inactive_limit) {
3241 		options |= MEMLIMIT_INACTIVE_FATAL;
3242 	}
3243 
3244 	memorystatus_log_debug(
3245 		"memorystatus: setting memlimit for %s [%d], "
3246 		"Active(%dMB %s), Inactive(%dMB, %s)\n",
3247 		proc_best_name(p), proc_getpid(p),
3248 		active_limit, ((options & MEMLIMIT_ACTIVE_FATAL) ? "F" : "NF"),
3249 		inactive_limit, ((options & MEMLIMIT_INACTIVE_FATAL) ? "F" : "NF"));
3250 
3251 	p->p_memstat_memlimit_active = active_limit;
3252 	p->p_memstat_memlimit_inactive = inactive_limit;
3253 	if (options & MEMLIMIT_INACTIVE_FATAL) {
3254 		p->p_memstat_state |= P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL;
3255 	} else {
3256 		p->p_memstat_state &= ~P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL;
3257 	}
3258 	if (options & MEMLIMIT_ACTIVE_FATAL) {
3259 		p->p_memstat_state |= P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL;
3260 	} else {
3261 		p->p_memstat_state &= ~P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL;
3262 	}
3263 
3264 	/*
3265 	 * Initialize the cached limits for target process.
3266 	 * When the target process is dirty tracked, it's typically
3267 	 * in a clean state.  Non dirty tracked processes are
3268 	 * typically active (Foreground or above).
3269 	 * But just in case, we don't make assumptions...
3270 	 */
3271 	const bool use_active = memstat_proc_is_active_locked(p);
3272 	if (memorystatus_highwater_enabled &&
3273 	    memstat_update_memlimit_locked(p, use_active)) {
3274 		err = _memstat_write_memlimit_to_ledger_locked(p, use_active, false);
3275 	}
3276 
3277 	return err;
3278 }
3279 
3280 int
memorystatus_set_memlimits(proc_t p,int32_t active_limit,int32_t inactive_limit,memlimit_options_t options)3281 memorystatus_set_memlimits(proc_t p, int32_t active_limit,
3282     int32_t inactive_limit, memlimit_options_t options)
3283 {
3284 	int err;
3285 	proc_list_lock();
3286 	err = memstat_set_memlimits_locked(p, active_limit, inactive_limit,
3287 	    options);
3288 	proc_list_unlock();
3289 	return err;
3290 }
3291 
3292 int
memorystatus_remove(proc_t p)3293 memorystatus_remove(proc_t p)
3294 {
3295 	int ret;
3296 	memstat_bucket_t *bucket;
3297 	bool reschedule = false;
3298 
3299 	memorystatus_log_debug("memorystatus_list_remove: removing pid %d\n", proc_getpid(p));
3300 
3301 	/* Processes marked internal do not have priority tracked */
3302 	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
3303 		return 0;
3304 	}
3305 
3306 	/*
3307 	 * Check if this proc is locked (because we're performing a freeze).
3308 	 * If so, we fail and instruct the caller to try again later.
3309 	 */
3310 	if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
3311 		return EAGAIN;
3312 	}
3313 
3314 	assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));
3315 
3316 	bucket = &memstat_bucket[p->p_memstat_effectivepriority];
3317 
3318 	if ((system_procs_aging_band &&
3319 	    p->p_memstat_effectivepriority == system_procs_aging_band) ||
3320 	    (system_procs_aging_band_stuck &&
3321 	    p->p_memstat_effectivepriority == system_procs_aging_band_stuck) ||
3322 	    (applications_aging_band &&
3323 	    p->p_memstat_effectivepriority == applications_aging_band)) {
3324 		_memstat_invalidate_idle_demotion_locked(p);
3325 		reschedule = true;
3326 	}
3327 
3328 	/*
3329 	 * Record idle delta
3330 	 */
3331 
3332 	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
3333 		uint64_t now = mach_absolute_time();
3334 		if (now > p->p_memstat_prio_start) {
3335 			p->p_memstat_idle_delta = now - p->p_memstat_prio_start;
3336 		}
3337 	}
3338 
3339 	TAILQ_REMOVE(&bucket->list, p, p_memstat_list);
3340 	bucket->count--;
3341 	if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
3342 		bucket->relaunch_high_count--;
3343 	}
3344 
3345 	memorystatus_list_count--;
3346 
3347 	/* If awaiting demotion to the idle band, clean up */
3348 	if (reschedule) {
3349 		_memstat_reschedule_idle_demotion_locked();
3350 	}
3351 
3352 #if CONFIG_FREEZE
3353 	if (_memstat_proc_is_frozen(p)) {
3354 		if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) {
3355 			p->p_memstat_state &= ~P_MEMSTAT_REFREEZE_ELIGIBLE;
3356 			assert(memorystatus_refreeze_eligible_count > 0);
3357 			memorystatus_refreeze_eligible_count--;
3358 		}
3359 
3360 		assert(memorystatus_frozen_count > 0);
3361 		memorystatus_frozen_count--;
3362 		if (p->p_memstat_state & P_MEMSTAT_FROZEN_XPC_SERVICE) {
3363 			assert(memorystatus_frozen_count_xpc_service > 0);
3364 			memorystatus_frozen_count_xpc_service--;
3365 		}
3366 		if (strcmp(p->p_name, "com.apple.WebKit.WebContent") == 0) {
3367 			assert(memorystatus_frozen_count_webcontent > 0);
3368 			memorystatus_frozen_count_webcontent--;
3369 		}
3370 		memorystatus_frozen_shared_mb -= p->p_memstat_freeze_sharedanon_pages;
3371 		p->p_memstat_freeze_sharedanon_pages = 0;
3372 	}
3373 #endif /* CONFIG_FREEZE */
3374 
3375 	_memstat_proc_set_resumed(p);
3376 
3377 #if DEVELOPMENT || DEBUG
3378 	if (proc_getpid(p) == memorystatus_testing_pid) {
3379 		memorystatus_testing_pid = 0;
3380 	}
3381 #endif /* DEVELOPMENT || DEBUG */
3382 
3383 	if (p) {
3384 		ret = 0;
3385 	} else {
3386 		ret = ESRCH;
3387 	}
3388 
3389 	return ret;
3390 }
3391 
3392 /*
3393  * Validate dirty tracking flags with process state.
3394  *
3395  * Return:
3396  *	0     on success
3397  *      non-0 on failure
3398  *
3399  * The proc_list_lock is held by the caller.
3400  */
3401 
3402 static int
memorystatus_validate_track_flags(struct proc * target_p,uint32_t pcontrol)3403 memorystatus_validate_track_flags(struct proc *target_p, uint32_t pcontrol)
3404 {
3405 	/* See that the process isn't marked for termination */
3406 	if (_memstat_proc_is_terminating(target_p)) {
3407 		return EBUSY;
3408 	}
3409 
3410 	/* Idle exit requires that process be tracked */
3411 	if ((pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) &&
3412 	    !(pcontrol & PROC_DIRTY_TRACK)) {
3413 		return EINVAL;
3414 	}
3415 
3416 	/* 'Launch in progress' tracking requires that process have enabled dirty tracking too. */
3417 	if ((pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) &&
3418 	    !(pcontrol & PROC_DIRTY_TRACK)) {
3419 		return EINVAL;
3420 	}
3421 
3422 	/* Only one type of DEFER behavior is allowed.*/
3423 	if ((pcontrol & PROC_DIRTY_DEFER) &&
3424 	    (pcontrol & PROC_DIRTY_DEFER_ALWAYS)) {
3425 		return EINVAL;
3426 	}
3427 
3428 	/* Deferral is only relevant if idle exit is specified */
3429 	if (((pcontrol & PROC_DIRTY_DEFER) ||
3430 	    (pcontrol & PROC_DIRTY_DEFER_ALWAYS)) &&
3431 	    !(pcontrol & PROC_DIRTY_ALLOWS_IDLE_EXIT)) {
3432 		return EINVAL;
3433 	}
3434 
3435 	return 0;
3436 }
3437 
3438 /*
3439  * Processes can opt to have their state tracked by the kernel, indicating  when they are busy (dirty) or idle
3440  * (clean). They may also indicate that they support termination when idle, with the result that they are promoted
3441  * to their desired, higher, jetsam priority when dirty (and are therefore killed later), and demoted to the low
 * priority idle band when clean (and killed earlier, protecting higher priority processes).
3443  *
3444  * If the deferral flag is set, then newly tracked processes will be protected for an initial period (as determined by
3445  * memorystatus_sysprocs_idle_delay_time); if they go clean during this time, then they will be moved to a deferred-idle band
3446  * with a slightly higher priority, guarding against immediate termination under memory pressure and being unable to
3447  * make forward progress. Finally, when the guard expires, they will be moved to the standard, lowest-priority, idle
3448  * band. The deferral can be cleared early by clearing the appropriate flag.
3449  *
3450  * The deferral timer is active only for the duration that the process is marked as guarded and clean; if the process
3451  * is marked dirty, the timer will be cancelled. Upon being subsequently marked clean, the deferment will either be
3452  * re-enabled or the guard state cleared, depending on whether the guard deadline has passed.
3453  */
3454 
/*
 * Enable or extend dirty ("ActivityTracking") state tracking for a process.
 *
 * pcontrol is a mask of PROC_DIRTY_* request flags:
 *   PROC_DIRTY_TRACK               opt into dirty tracking
 *   PROC_DIRTY_ALLOW_IDLE_EXIT     process may be idle-exited when clean
 *   PROC_DIRTY_LAUNCH_IN_PROGRESS  mark a launch as in progress
 *   PROC_DIRTY_DEFER[_ALWAYS]      request an idle-demotion deferral window
 *   PROC_DIRTY_SHUTDOWN_ON_CLEAN   kill the process when it next goes clean
 *                                  (killed immediately below if already clean)
 *
 * Flags validated by memorystatus_validate_track_flags(); bits are cumulative.
 * Returns 0 on success, EBUSY/EPERM/EINVAL on failure (and 0 on the silent
 * managed-process denial — see below).
 */
int
memorystatus_dirty_track(proc_t p, uint32_t pcontrol)
{
	unsigned int old_dirty;
	boolean_t defer_now = FALSE;
	int ret = 0;
	int priority;
	bool kill = false;
	memstat_priority_options_t priority_options =
	    MEMSTAT_PRIORITY_OPTIONS_NONE;

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_DIRTY_TRACK),
	    proc_getpid(p), p->p_memstat_dirty, pcontrol);

	proc_list_lock();

	if (proc_list_exited(p)) {
		/*
		 * Process is on its way out.
		 */
		ret = EBUSY;
		goto exit;
	}

	/* Internal processes are never dirty-tracked. */
	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
		ret = EPERM;
		goto exit;
	}

	if ((ret = memorystatus_validate_track_flags(p, pcontrol)) != 0) {
		/* error  */
		goto exit;
	}

	old_dirty = p->p_memstat_dirty;

	/* These bits are cumulative, as per <rdar://problem/11159924> */
	if (pcontrol & PROC_DIRTY_TRACK) {
		/* Request to turn ON Dirty tracking... */
		if (p->p_memstat_state & P_MEMSTAT_MANAGED) {
			/* on a process managed by RunningBoard or its equivalent...*/
			if (!_memstat_proc_cached_memlimit_is_fatal(p)) {
				/* but this might be an app because there's no fatal limits
				 * NB: This _big_ assumption is not universal. What we really
				 * need is a way to say this is an _APP_ and we can't have dirty
				 * tracking turned ON for it. Lacking that functionality we clump
				 * together some checks and try to do the best detection we can.
				 * Reason we can't allow addition of these flags is because, per the
				 * kernel checks, they change the role of a process from app to daemon. And the
				 * AGING_IN_PROGRESS bits might still be set i.e. it needs to be demoted
				 * correctly from the right aging band (app or sysproc). We can't simply try
				 * to invalidate the demotion here because, owing to assertion priorities, we
				 * might not be in the aging bands.
				 */
				memorystatus_log(
					"memorystatus: Denying dirty-tracking opt-in for managed %s [%d]\n",
					proc_best_name(p), proc_getpid(p));
				/* fail silently to avoid an XPC assertion... */
				ret = 0;
				goto exit;
			}
		}

		p->p_memstat_dirty |= P_DIRTY_TRACK;
	}

	if (pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) {
		p->p_memstat_dirty |= P_DIRTY_ALLOW_IDLE_EXIT;
	}

	if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
		p->p_memstat_dirty |= P_DIRTY_LAUNCH_IN_PROGRESS;
	}

	/*
	 * NB: All processes are now automatically enrolled in idle aging
	 * regardless of whether they request to be deferred.
	 */
	if (pcontrol & (PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) {
		/* Each DEFER flavor is recorded only on its first request. */
		if ((pcontrol & (PROC_DIRTY_DEFER)) &&
		    !(old_dirty & P_DIRTY_DEFER)) {
			p->p_memstat_dirty |= P_DIRTY_DEFER;
		}

		if ((pcontrol & (PROC_DIRTY_DEFER_ALWAYS)) &&
		    !(old_dirty & P_DIRTY_DEFER_ALWAYS)) {
			p->p_memstat_dirty |= P_DIRTY_DEFER_ALWAYS;
		}

		defer_now = TRUE;
	}

	if (pcontrol & PROC_DIRTY_SHUTDOWN_ON_CLEAN) {
		p->p_memstat_dirty |= P_DIRTY_SHUTDOWN_ON_CLEAN;

		/* Already clean: deliver the kill on the way out (below). */
		if (_memstat_proc_is_tracked(p) && !_memstat_proc_is_dirty(p)) {
			kill = true;
		}
	}

	memorystatus_log_info(
		"%s [%d] enrolled in ActivityTracking tracked %d / idle-exit %d / defer %d / dirty %d",
		proc_best_name(p), proc_getpid(p),
		_memstat_proc_is_tracked(p), _memstat_proc_can_idle_exit(p), defer_now,
		_memstat_proc_is_dirty(p));

	/*
	 * Pick the band to land in: a clean, idle-exit-capable process heads to
	 * the idle band; anything else keeps its requested priority.
	 */
	if (!_memstat_proc_is_dirty(p) && _memstat_proc_is_tracked(p) &&
	    _memstat_proc_can_idle_exit(p)) {
		priority = JETSAM_PRIORITY_IDLE;
		if (!defer_now && _memstat_proc_is_aging(p)) {
			/*
			 * Historically, some processes have tried to use this to opt out
			 * of the 'aging' facility.
			 */
			priority_options |= MEMSTAT_PRIORITY_NO_AGING;
		}
	} else {
		priority = p->p_memstat_requestedpriority;
	}

	/* An assertion can hold the process above the priority chosen above. */
	if (_memstat_proc_has_priority_assertion(p)) {
		priority = MAX(priority, p->p_memstat_assertionpriority);
	}

	memstat_update_priority_locked(p, priority, priority_options);

exit:
	/* Deliver any pending SIGKILL outside the proc_list lock. */
	if (kill && proc_ref(p, true) == p) {
		proc_list_unlock();
		psignal(p, SIGKILL);
		proc_rele(p);
	} else {
		proc_list_unlock();
	}

	return ret;
}
3592 
3593 int
memorystatus_dirty_set(proc_t p,boolean_t self,uint32_t pcontrol)3594 memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol)
3595 {
3596 	int ret = 0;
3597 	bool kill = false;
3598 	bool was_dirty;
3599 	bool now_dirty = false;
3600 	int priority;
3601 	task_t t = proc_task(p);
3602 
3603 	memorystatus_log_debug("memorystatus_dirty_set(): %d %d 0x%x 0x%x\n", self, proc_getpid(p), pcontrol, p->p_memstat_dirty);
3604 	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_DIRTY_SET), proc_getpid(p), self, pcontrol);
3605 
3606 	proc_list_lock();
3607 
3608 	if (proc_list_exited(p)) {
3609 		/*
3610 		 * Process is on its way out.
3611 		 */
3612 		ret = EBUSY;
3613 		goto exit;
3614 	}
3615 
3616 	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
3617 		ret = EPERM;
3618 		goto exit;
3619 	}
3620 
3621 	was_dirty = _memstat_proc_is_dirty(p);
3622 
3623 	if (!_memstat_proc_is_tracked(p)) {
3624 		/* Dirty tracking not enabled */
3625 		ret = EINVAL;
3626 		goto exit;
3627 	} else if (pcontrol && _memstat_proc_is_terminating(p)) {
3628 		/*
3629 		 * Process is set to be terminated and we're attempting to mark it dirty.
3630 		 * Set for termination and marking as clean is OK - see <rdar://problem/10594349>.
3631 		 */
3632 		ret = EBUSY;
3633 		goto exit;
3634 	}
3635 
3636 	int flag = (self == TRUE) ? P_DIRTY : P_DIRTY_SHUTDOWN;
3637 	if (pcontrol && !(p->p_memstat_dirty & flag)) {
3638 		/* Mark the process as having been dirtied at some point */
3639 		p->p_memstat_dirty |= (flag | P_DIRTY_MARKED);
3640 	} else if ((pcontrol == 0) && (p->p_memstat_dirty & flag)) {
3641 		if ((flag == P_DIRTY_SHUTDOWN) && (!(p->p_memstat_dirty & P_DIRTY))) {
3642 			/* Clearing the dirty shutdown flag, and the process is otherwise clean - kill */
3643 			p->p_memstat_dirty |= P_DIRTY_TERMINATED;
3644 			kill = true;
3645 		} else if ((flag == P_DIRTY) && _memstat_proc_is_terminating(p)) {
3646 			/* Kill previously terminated processes if set clean */
3647 			kill = true;
3648 		}
3649 		p->p_memstat_dirty &= ~flag;
3650 	} else {
3651 		/* Already set */
3652 		ret = EALREADY;
3653 		goto exit;
3654 	}
3655 
3656 	now_dirty = _memstat_proc_is_dirty(p);
3657 
3658 	if (was_dirty && !now_dirty) {
3659 		if (_memstat_proc_can_idle_exit(p)) {
3660 			/*
3661 			 * Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band OR it might be heading back
3662 			 * there once it's clean again. For the legacy case, this only applies if it has some protection window left.
3663 			 * P_DIRTY_DEFER: one-time protection window given at launch
3664 			 * P_DIRTY_DEFER_ALWAYS: protection window given for every dirty->clean transition. Like non-legacy mode.
3665 			 *
3666 			 * Non-Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band. It will always stop over
3667 			 * in that band on it's way to IDLE.
3668 			 */
3669 			assert(!_memstat_proc_is_aging(p));
3670 			priority = JETSAM_PRIORITY_IDLE;
3671 		} else {
3672 			priority = p->p_memstat_requestedpriority;
3673 		}
3674 		task_ledger_settle_dirty_time(t);
3675 		task_set_dirty_start(t, 0);
3676 		if (_memstat_proc_shutdown_on_clean(p)) {
3677 			kill = true;
3678 		}
3679 	} else if (!was_dirty && now_dirty) {
3680 		priority = p->p_memstat_requestedpriority;
3681 		task_set_dirty_start(t, mach_absolute_time());
3682 	}
3683 
3684 	if (_memstat_proc_has_priority_assertion(p)) {
3685 		priority = MAX(priority, p->p_memstat_assertionpriority);
3686 	}
3687 
3688 	memstat_update_priority_locked(p, priority, MEMSTAT_PRIORITY_OPTIONS_NONE);
3689 
3690 exit:
3691 	if (kill && proc_ref(p, true) == p) {
3692 		proc_list_unlock();
3693 		psignal(p, SIGKILL);
3694 		proc_rele(p);
3695 	} else {
3696 		proc_list_unlock();
3697 	}
3698 
3699 	return ret;
3700 }
3701 
3702 int
memorystatus_dirty_clear(proc_t p,uint32_t pcontrol)3703 memorystatus_dirty_clear(proc_t p, uint32_t pcontrol)
3704 {
3705 	int ret = 0;
3706 
3707 	memorystatus_log_debug("memorystatus_dirty_clear(): %d 0x%x 0x%x\n", proc_getpid(p), pcontrol, p->p_memstat_dirty);
3708 	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_DIRTY_CLEAR), proc_getpid(p), pcontrol);
3709 
3710 	proc_list_lock();
3711 
3712 	if (proc_list_exited(p)) {
3713 		/*
3714 		 * Process is on its way out.
3715 		 */
3716 		ret = EBUSY;
3717 		goto exit;
3718 	}
3719 
3720 	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
3721 		ret = EPERM;
3722 		goto exit;
3723 	}
3724 
3725 	if (!_memstat_proc_is_tracked(p)) {
3726 		/* Dirty tracking not enabled */
3727 		ret = EINVAL;
3728 		goto exit;
3729 	}
3730 
3731 	if (!pcontrol || (pcontrol & (PROC_DIRTY_LAUNCH_IN_PROGRESS | PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) == 0) {
3732 		ret = EINVAL;
3733 		goto exit;
3734 	}
3735 
3736 	if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
3737 		p->p_memstat_dirty &= ~P_DIRTY_LAUNCH_IN_PROGRESS;
3738 	}
3739 
3740 	/* This can be set and cleared exactly once. */
3741 	if (pcontrol & (PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) {
3742 		if (p->p_memstat_dirty & P_DIRTY_DEFER) {
3743 			p->p_memstat_dirty &= ~(P_DIRTY_DEFER);
3744 		}
3745 
3746 		if (p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) {
3747 			p->p_memstat_dirty &= ~(P_DIRTY_DEFER_ALWAYS);
3748 		}
3749 
3750 		if (_memstat_proc_is_aging(p)) {
3751 			memstat_update_priority_locked(p, JETSAM_PRIORITY_IDLE,
3752 			    MEMSTAT_PRIORITY_NO_AGING);
3753 		}
3754 	}
3755 
3756 	ret = 0;
3757 exit:
3758 	proc_list_unlock();
3759 
3760 	return ret;
3761 }
3762 
3763 int
memorystatus_dirty_get(proc_t p,boolean_t locked)3764 memorystatus_dirty_get(proc_t p, boolean_t locked)
3765 {
3766 	int ret = 0;
3767 
3768 	if (!locked) {
3769 		proc_list_lock();
3770 	}
3771 
3772 	if (_memstat_proc_is_tracked(p)) {
3773 		ret |= PROC_DIRTY_TRACKED;
3774 		if (_memstat_proc_can_idle_exit(p)) {
3775 			ret |= PROC_DIRTY_ALLOWS_IDLE_EXIT;
3776 		}
3777 		if (p->p_memstat_dirty & P_DIRTY) {
3778 			ret |= PROC_DIRTY_IS_DIRTY;
3779 		}
3780 		if (p->p_memstat_dirty & P_DIRTY_LAUNCH_IN_PROGRESS) {
3781 			ret |= PROC_DIRTY_LAUNCH_IS_IN_PROGRESS;
3782 		}
3783 	}
3784 
3785 	if (!locked) {
3786 		proc_list_unlock();
3787 	}
3788 
3789 	return ret;
3790 }
3791 
3792 int
memorystatus_on_terminate(proc_t p)3793 memorystatus_on_terminate(proc_t p)
3794 {
3795 	int sig;
3796 
3797 	proc_list_lock();
3798 
3799 	p->p_memstat_dirty |= P_DIRTY_TERMINATED;
3800 
3801 	if ((_memstat_proc_is_tracked(p) && !_memstat_proc_is_dirty(p)) ||
3802 	    (_memstat_proc_is_suspended(p))) {
3803 		/*
3804 		 * Mark as terminated and issue SIGKILL if:-
3805 		 * - process is clean, or,
3806 		 * - if process is dirty but suspended. This case is likely
3807 		 * an extension because apps don't opt into dirty-tracking
3808 		 * and daemons aren't suspended.
3809 		 */
3810 #if DEVELOPMENT || DEBUG
3811 		if (_memstat_proc_is_suspended(p)) {
3812 			memorystatus_log(
3813 				"memorystatus: sending suspended process %s (pid %d) SIGKILL\n",
3814 				(*p->p_name ? p->p_name : "unknown"), proc_getpid(p));
3815 		}
3816 #endif /* DEVELOPMENT || DEBUG */
3817 		sig = SIGKILL;
3818 	} else {
3819 		/* Dirty, terminated, or state tracking is unsupported; issue SIGTERM to allow cleanup */
3820 		sig = SIGTERM;
3821 	}
3822 
3823 	proc_list_unlock();
3824 
3825 	return sig;
3826 }
3827 
/*
 * Called when a process is suspended.  Marks it suspended for memorystatus
 * accounting and, if it was already marked for termination, delivers the
 * deferred SIGKILL now.
 */
void
memorystatus_on_suspend(proc_t p)
{
#if CONFIG_FREEZE
	uint32_t pages;
	/*
	 * NOTE(review): `pages` is not read anywhere in this function; the call
	 * appears to be kept for its side effects or is historical — confirm
	 * before removing.
	 */
	memorystatus_get_task_page_counts(proc_task(p), &pages, NULL, NULL);
#endif
	proc_list_lock();

	_memstat_proc_set_suspended(p);

	/* Check if proc is marked for termination */
	bool kill_process = _memstat_proc_is_terminating(p);
	proc_list_unlock();

	/* Deliver the kill outside the proc_list lock */
	if (kill_process) {
		psignal(p, SIGKILL);
	}
}
3847 
3848 extern uint64_t memorystatus_thaw_count_since_boot;
3849 
3850 void
memorystatus_on_resume(proc_t p)3851 memorystatus_on_resume(proc_t p)
3852 {
3853 #if CONFIG_FREEZE
3854 	pid_t pid;
3855 #endif
3856 
3857 	proc_list_lock();
3858 
3859 #if CONFIG_FREEZE
3860 	const bool frozen = _memstat_proc_is_frozen(p);
3861 	if (frozen) {
3862 		/*
3863 		 * Now that we don't _thaw_ a process completely,
3864 		 * resuming it (and having some on-demand swapins)
3865 		 * shouldn't preclude it from being counted as frozen.
3866 		 *
3867 		 * memorystatus_frozen_count--;
3868 		 *
3869 		 * We preserve the P_MEMSTAT_FROZEN state since the process
3870 		 * could have state on disk AND so will deserve some protection
3871 		 * in the jetsam bands.
3872 		 */
3873 		if ((p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) == 0) {
3874 			p->p_memstat_state |= P_MEMSTAT_REFREEZE_ELIGIBLE;
3875 			memorystatus_refreeze_eligible_count++;
3876 		}
3877 		if (p->p_memstat_thaw_count == 0 || p->p_memstat_last_thaw_interval < memorystatus_freeze_current_interval) {
3878 			os_atomic_inc(&(memorystatus_freezer_stats.mfs_processes_thawed), relaxed);
3879 			if (strcmp(p->p_name, "com.apple.WebKit.WebContent") == 0) {
3880 				os_atomic_inc(&(memorystatus_freezer_stats.mfs_processes_thawed_webcontent), relaxed);
3881 			}
3882 		}
3883 		p->p_memstat_last_thaw_interval = memorystatus_freeze_current_interval;
3884 		p->p_memstat_thaw_count++;
3885 
3886 		memorystatus_log("memorystatus: resuming/thawing pid %d [%s]\n", p->p_pid, proc_best_name(p));
3887 		memorystatus_freeze_record_process_thawed(p);
3888 
3889 		memorystatus_thaw_count++;
3890 		memorystatus_thaw_count_since_boot++;
3891 	}
3892 
3893 	pid = proc_getpid(p);
3894 #endif
3895 
3896 	/*
3897 	 * P_MEMSTAT_FROZEN will remain unchanged. This used to be:
3898 	 * p->p_memstat_state &= ~(P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN);
3899 	 */
3900 	_memstat_proc_set_resumed(p);
3901 
3902 	proc_list_unlock();
3903 
3904 #if CONFIG_FREEZE
3905 	if (frozen) {
3906 		memorystatus_freeze_entry_t data = { pid, FALSE, 0 };
3907 		memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
3908 	}
3909 #endif
3910 }
3911 
/*
 * Called when a process becomes inactive.  With CONFIG_FREEZE this nudges
 * the freezer thread to reevaluate candidates; otherwise it is a no-op.
 */
void
memorystatus_on_inactivity(proc_t p)
{
#pragma unused(p)
#if CONFIG_FREEZE
	/* Wake the freeze thread */
	thread_wakeup((event_t)&memorystatus_freeze_wakeup);
#endif
}
3921 
3922 /*
3923  * The proc_list_lock is held by the caller.
3924  */
3925 static memorystatus_proc_state_t
_memstat_build_state(proc_t p)3926 _memstat_build_state(proc_t p)
3927 {
3928 	uint32_t snapshot_state = 0;
3929 
3930 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
3931 
3932 	/* General */
3933 	if (_memstat_proc_is_suspended(p)) {
3934 		snapshot_state |= kMemorystatusSuspended;
3935 	}
3936 	if (_memstat_proc_is_frozen(p)) {
3937 		snapshot_state |= kMemorystatusFrozen;
3938 	}
3939 	if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) {
3940 		snapshot_state |= kMemorystatusWasThawed;
3941 	}
3942 	if (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION) {
3943 		snapshot_state |= kMemorystatusAssertion;
3944 	}
3945 
3946 	/* Tracking */
3947 	if (_memstat_proc_is_tracked(p)) {
3948 		snapshot_state |= kMemorystatusTracked;
3949 	}
3950 	if (_memstat_proc_can_idle_exit(p)) {
3951 		snapshot_state |= kMemorystatusSupportsIdleExit;
3952 	}
3953 	if (_memstat_proc_is_dirty(p)) {
3954 		snapshot_state |= kMemorystatusDirty;
3955 	}
3956 	if (memstat_proc_is_active_locked(p)) {
3957 		snapshot_state |= kMemorystatusActive;
3958 	}
3959 
3960 	/* Probable relaunch behavior */
3961 	if (p->p_memstat_relaunch_flags & P_MEMSTAT_RELAUNCH_LOW) {
3962 		snapshot_state |= kMemorystatusRelaunchLow;
3963 	}
3964 	if (p->p_memstat_relaunch_flags & P_MEMSTAT_RELAUNCH_MED) {
3965 		snapshot_state |= kMemorystatusRelaunchMed;
3966 	}
3967 	if (p->p_memstat_relaunch_flags & P_MEMSTAT_RELAUNCH_HIGH) {
3968 		snapshot_state |= kMemorystatusRelaunchHigh;
3969 	}
3970 
3971 	return snapshot_state;
3972 }
3973 
/*
 * Kill the next eligible idle-exit victim.
 *
 * Scans the idle band (only) for a process that opted into idle exit, is
 * clean, has not already been terminated, and whose idle deadline has
 * passed. Returns true if a process was killed; on success *footprint_out
 * (if non-NULL) receives the victim's footprint.
 */
bool
memstat_kill_idle_process(memorystatus_kill_cause_t cause,
    uint64_t *footprint_out)
{
	proc_t p = PROC_NULL;
	uint64_t current_time;
	bool killed = FALSE;
	unsigned int i = 0;
	os_reason_t jetsam_reason = OS_REASON_NULL;

	/* Pick next idle exit victim. */
	current_time = mach_absolute_time();

	/*
	 * Allocate the exit reason up front, outside the proc_list_lock.
	 * NOTE(review): on allocation failure we proceed with a NULL reason --
	 * presumably memorystatus_kill_proc tolerates OS_REASON_NULL; confirm.
	 */
	jetsam_reason = os_reason_create(OS_REASON_JETSAM, (jetsam_reason_t)cause);
	if (jetsam_reason == OS_REASON_NULL) {
		memorystatus_log_error("memorystatus: failed to allocate jetsam reason\n");
	}

	proc_list_lock();

	p = memorystatus_get_first_proc_locked(&i, FALSE);
	while (p) {
		/* No need to look beyond the idle band */
		if (p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) {
			break;
		}

		/*
		 * Eligible iff the process allows idle exit, is currently clean,
		 * and has not already been marked terminated.
		 */
		if ((p->p_memstat_dirty & (P_DIRTY_ALLOW_IDLE_EXIT | P_DIRTY_IS_DIRTY | P_DIRTY_TERMINATED)) == (P_DIRTY_ALLOW_IDLE_EXIT)) {
			if (current_time >= p->p_memstat_idledeadline) {
				/*
				 * Mark terminated while still holding the list lock so no
				 * other thread picks the same victim, then take a ref so
				 * the proc stays valid after the lock is dropped.
				 */
				p->p_memstat_dirty |= P_DIRTY_TERMINATED;
				p = proc_ref(p, true);
				break;
			}
		}

		p = memorystatus_get_next_proc_locked(&i, p, FALSE);
	}

	proc_list_unlock();

	if (p) {
		memorystatus_log(
			"memorystatus: killing (idle) %s [%d] due to %s (%u)\n",
			proc_best_name(p), proc_getpid(p), memstat_kill_cause_name[cause], cause);
		/* memorystatus_kill_proc consumes the jetsam_reason reference. */
		memorystatus_kill_proc(p, cause, jetsam_reason, &killed, footprint_out);
		proc_rele(p);
	} else {
		/* No victim found: release the unused exit reason. */
		os_reason_free(jetsam_reason);
	}

	return killed;
}
4026 
4027 /*
4028  * Consider waking the jetsam thread. Returns true if the thread was awoken.
4029  */
4030 static bool
_memstat_consider_waking_jetsam_thread(void)4031 _memstat_consider_waking_jetsam_thread(void)
4032 {
4033 #if CONFIG_JETSAM
4034 	if (memstat_evaluate_page_shortage(NULL, NULL, NULL, NULL)) {
4035 		memorystatus_thread_wake();
4036 		return true;
4037 	}
4038 #endif /* CONFIG_JETSAM */
4039 	return false;
4040 }
4041 
4042 void
memorystatus_thread_wake()4043 memorystatus_thread_wake()
4044 {
4045 	int thr_id = 0;
4046 	int active_thr = atomic_load(&active_jetsam_threads);
4047 
4048 	/* Wakeup all the jetsam threads */
4049 	for (thr_id = 0; thr_id < active_thr; thr_id++) {
4050 		jetsam_state_t jetsam_thread = &jetsam_threads[thr_id];
4051 		sched_cond_signal(&(jetsam_thread->jt_wakeup_cond), jetsam_thread->thread);
4052 	}
4053 }
4054 
/*
 * Respond to compressor/swap space exhaustion.
 *
 * On CONFIG_JETSAM systems the jetsam thread is always woken. On non-jetsam
 * systems the wakeup is throttled (unless kill_on_no_paging_space is set) so
 * repeated exhaustion reports do not flood the no-paging-space action path.
 */
void
memorystatus_respond_to_compressor_exhaustion(void)
{
#if CONFIG_JETSAM
	memorystatus_thread_wake();
#else /* !CONFIG_JETSAM */
	if (kill_on_no_paging_space) {
		memorystatus_thread_wake();
	} else {
		/*
		 * Throttle how often the jetsam thread is woken due to
		 * compressor/swap exhaustion
		 */
		uint64_t now = mach_absolute_time();
		uint64_t delta_since_last_no_space_ns;
		uint64_t last_action_ts = os_atomic_load(&last_no_space_action_ts, relaxed);
		if (now < last_action_ts) {
			/* Raced with a concurrent no-paging-space action */
			return;
		}
		absolutetime_to_nanoseconds(now - last_action_ts, &delta_since_last_no_space_ns);
		/* Only wake if the throttle window has fully elapsed. */
		if (delta_since_last_no_space_ns > no_paging_space_action_throttle_delay_ns) {
			memorystatus_thread_wake();
		}
	}
#endif /* CONFIG_JETSAM */
}
4082 
/*
 * Respond to swap space running low.
 *
 * With jetsam, wake the killer thread (if configured to kill on low swap);
 * otherwise fall back to the compressor-exhaustion handling above.
 */
void
memorystatus_respond_to_swap_exhaustion(void)
{
#if CONFIG_JETSAM
	/*
	 * On systems with both swap and jetsam,
	 * just wake up the jetsam thread and have it handle the low swap condition
	 * by killing apps.
	 */
	if (jetsam_kill_on_low_swap) {
		memorystatus_thread_wake();
	}
#else /* !CONFIG_JETSAM */
	memorystatus_respond_to_compressor_exhaustion();
#endif /* CONFIG_JETSAM */
}
4099 
4100 #if CONFIG_JETSAM
4101 static void
memorystatus_thread_pool_max()4102 memorystatus_thread_pool_max()
4103 {
4104 	/* Increase the jetsam thread pool to max_jetsam_threads */
4105 	int max_threads = max_jetsam_threads;
4106 	memorystatus_log_info("Expanding memorystatus pool to %d\n", max_threads);
4107 	os_atomic_store(&active_jetsam_threads, max_threads, relaxed);
4108 }
4109 
static void
memorystatus_thread_pool_default()
{
	/* Restore the jetsam thread pool to a single thread */
	memorystatus_log_info("Reverting memorystatus pool back to 1\n");
	os_atomic_store(&active_jetsam_threads, 1, relaxed);
}
4117 #endif /* CONFIG_JETSAM */
4118 
4119 /*
4120  * An offset applied to non-critical page shortage thresholds.
4121  */
4122 static uint32_t
_memstat_page_shortage_offset(void)4123 _memstat_page_shortage_offset(void)
4124 {
4125 	uint32_t offset = 0;
4126 	if (os_atomic_load(&memstat_policy_config, acquire) & kPolicyClearTheDecks) {
4127 		offset += memstat_ctd_offset;
4128 	}
4129 	if (os_atomic_load(&memstat_policy_config, acquire) & kPolicyBallastDrain) {
4130 		offset += memstat_ballast_offset;
4131 	}
4132 	return offset;
4133 }
4134 
/*
 * Available-pages threshold below which critical jetsam kills begin.
 * Note: unlike the non-critical thresholds below, this one is never offset
 * by policy (clear-the-decks / ballast).
 */
uint32_t
memorystatus_get_critical_page_shortage_threshold(void)
{
	return memstat_critical_threshold;
}
4140 
4141 uint32_t
memorystatus_get_idle_exit_page_shortage_threshold(void)4142 memorystatus_get_idle_exit_page_shortage_threshold(void)
4143 {
4144 	uint32_t offset = _memstat_page_shortage_offset();
4145 	return memstat_idle_threshold + offset;
4146 }
4147 
4148 uint32_t
memorystatus_get_soft_memlimit_page_shortage_threshold(void)4149 memorystatus_get_soft_memlimit_page_shortage_threshold(void)
4150 {
4151 	uint32_t offset = _memstat_page_shortage_offset();
4152 	return memstat_soft_threshold + offset;
4153 }
4154 
4155 uint32_t
memorystatus_get_reaper_page_shortage_threshold(void)4156 memorystatus_get_reaper_page_shortage_threshold(void)
4157 {
4158 	uint32_t offset = _memstat_page_shortage_offset();
4159 	return memstat_reaper_threshold + offset;
4160 }
4161 
4162 #if CONFIG_JETSAM
4163 void
_memstat_reaper_check_oldest_reapable_proc_info_timeout(void)4164 _memstat_reaper_check_oldest_reapable_proc_info_timeout(void)
4165 {
4166 	if (memstat_oldest_reapable_proc_prio_start != MEMSTAT_OLDEST_REAPABLE_PROC_PRIO_START_UNKNOWN) {
4167 		uint64_t curr_ts_matu = mach_absolute_time();
4168 		if (curr_ts_matu > memstat_oldest_reapable_proc_info_expiration_ts_matu) {
4169 			memstat_oldest_reapable_proc_prio_start = MEMSTAT_OLDEST_REAPABLE_PROC_PRIO_START_UNKNOWN;
4170 			memstat_oldest_reapable_proc_will_be_reapable_at_ts_matu = 0;
4171 		}
4172 	}
4173 }
4174 
4175 void
_memstat_reaper_start_sweep(void)4176 _memstat_reaper_start_sweep(void)
4177 {
4178 	if (!memstat_reaper_is_currently_sweeping) {
4179 		memorystatus_log("memorystatus: _memstat_reaper_start_sweep: reaper sweep starting\n");
4180 		memstat_reaper_is_currently_sweeping = true;
4181 		memstat_reaper_start_ts_matu = mach_absolute_time();
4182 		memstat_reaper_current_sweep_stats.kill_count = 0;
4183 		memstat_reaper_current_sweep_stats.memory_freed_bytes = 0;
4184 	}
4185 }
4186 
/*
 * Finish a reaper sweep: run the deferred compaction, log per-sweep and
 * cumulative statistics, and arm the rescan delay before the next sweep
 * may begin. No-op if no sweep is in progress.
 */
void
_memstat_reaper_end_sweep(void)
{
	uint64_t rescan_delta_matu;
	if (memstat_reaper_is_currently_sweeping) {
		uint64_t delta_time_nsec;

		/* For idle reaper kills, we skip the normal compaction after each kill,
		 * and do one compaction here at the end of the sweep.
		 */
		vm_run_compactor();

		absolutetime_to_nanoseconds(mach_absolute_time() - memstat_reaper_start_ts_matu, &delta_time_nsec);
		memstat_reaper_cumulative_stats.sweep_count++;
		/* Cached MB figure (for sysctl/telemetry, presumably) derived from the byte total. */
		memstat_reaper_cumulative_memory_freed_mb = (uint32_t)(memstat_reaper_cumulative_stats.memory_freed_bytes >> 20);
		memorystatus_log("memorystatus: _memstat_reaper_end_sweep: reaper sweep ended, %d processes killed, %lluMB freed, %llums elapsed, %lluus/process\n",
		    memstat_reaper_current_sweep_stats.kill_count,
		    memstat_reaper_current_sweep_stats.memory_freed_bytes >> 20,
		    (delta_time_nsec / NSEC_PER_MSEC),
		    memstat_reaper_current_sweep_stats.kill_count ? ((delta_time_nsec / NSEC_PER_USEC) / memstat_reaper_current_sweep_stats.kill_count) : 0);
		memorystatus_log("memorystatus: _memstat_reaper_end_sweep: reaper totals: %d sweeps, %d processes killed, %dMB freed\n",
		    memstat_reaper_cumulative_stats.sweep_count,
		    memstat_reaper_cumulative_stats.kill_count,
		    memstat_reaper_cumulative_memory_freed_mb);
		memstat_reaper_is_currently_sweeping = false;
		/* Don't allow another sweep until the rescan interval has elapsed. */
		nanoseconds_to_absolutetime((memstat_reaper_rescan_secs * NSEC_PER_SEC), &rescan_delta_matu);
		memstat_reaper_can_run_after_ts_matu = mach_absolute_time() + rescan_delta_matu;
	}
}
4216 
4217 void
_memstat_reaper_record_kill(uint64_t bytes_freed)4218 _memstat_reaper_record_kill(uint64_t bytes_freed)
4219 {
4220 	memstat_reaper_current_sweep_stats.kill_count++;
4221 	memstat_reaper_current_sweep_stats.memory_freed_bytes += bytes_freed;
4222 	memstat_reaper_cumulative_stats.kill_count++;
4223 	memstat_reaper_cumulative_stats.memory_freed_bytes += bytes_freed;
4224 }
4225 #endif /* CONFIG_JETSAM */
4226 
4227 const char*
_memstat_relaunch_flags_description(uint32_t flags)4228 _memstat_relaunch_flags_description(uint32_t flags)
4229 {
4230 	switch (flags) {
4231 	case P_MEMSTAT_RELAUNCH_UNKNOWN:
4232 		return "-";
4233 	case P_MEMSTAT_RELAUNCH_LOW:
4234 		return "low";
4235 	case P_MEMSTAT_RELAUNCH_MED:
4236 		return "med";
4237 	case P_MEMSTAT_RELAUNCH_HIGH:
4238 		return "high";
4239 	default:
4240 		return "??";
4241 	}
4242 }
4243 
4244 const char*
_memstat_proc_type_description(proc_t p)4245 _memstat_proc_type_description(proc_t p)
4246 {
4247 	if (_memstat_proc_is_application(p)) {
4248 		return "app";
4249 	} else {
4250 		return "daemon";
4251 	}
4252 }
4253 
/*
 * Evaluate the current page shortage and decide which memorystatus actions
 * are warranted.
 *
 * Each out-parameter may be NULL if the caller does not care about that
 * action; all non-NULL out-parameters are initialized to false. Returns
 * true if any action requiring the jetsam thread is needed.
 *
 * On !CONFIG_JETSAM builds this always returns false and sets no flags.
 */
bool
memstat_evaluate_page_shortage(
	bool *should_enforce_memlimits,
	bool *should_idle_exit,
	bool *should_jetsam,
	bool *should_reap)
{
	bool requires_action = false;
	if (should_enforce_memlimits) {
		*should_enforce_memlimits = false;
	}
	if (should_idle_exit) {
		*should_idle_exit = false;
	}
	if (should_jetsam) {
		*should_jetsam = false;
	}
	if (should_reap) {
		*should_reap = false;
	}
#if CONFIG_JETSAM
	uint32_t available_page_count = os_atomic_load(&memorystatus_available_pages, relaxed);
#if VM_PRESSURE_EVENTS
	if (available_page_count <
	    memorystatus_get_soft_memlimit_page_shortage_threshold()) {
		/*
		 * Only wake the jetsam thread if there are hwm violators to
		 * kill
		 */
		bool hwm_candidates = os_atomic_load(&memorystatus_hwm_candidates, acquire);
		requires_action = requires_action || hwm_candidates;
		if (should_enforce_memlimits) {
			*should_enforce_memlimits = true;
		}
	}
#endif /* VM_PRESSURE_EVENTS */

	if (memstat_reaper_enabled) {
		/*
		 * Only wake the jetsam thread to do reaper kills if the reaper is currently already running a sweep
		 * OR if other conditions suggest that we should start a sweep
		 */

		// if we are already in the middle of a reaper sweep, continue it
		if (memstat_reaper_is_currently_sweeping) {
			requires_action = true;
			if (should_reap) {
				*should_reap = true;
			}
		} else {
			uint64_t curr_ts_matu = mach_absolute_time();
			// if we are not already in the middle of a reaper sweep, do very quick tests to see if we should possibly start one:
			// - the minimum rescan time has passed since the end of the last sweep
			// - we are below the page threshold
			// - the oldest reapable process is old enough to be a reaper candidate now

			if ((curr_ts_matu > memstat_reaper_can_run_after_ts_matu)
			    && (available_page_count < memorystatus_get_reaper_page_shortage_threshold())) {
				/* May invalidate the cached oldest-reapable info if it has expired. */
				_memstat_reaper_check_oldest_reapable_proc_info_timeout();

				if (memstat_oldest_reapable_proc_prio_start == MEMSTAT_OLDEST_REAPABLE_PROC_PRIO_START_NONE) {
					memorystatus_log_debug("memorystatus: memstat_evaluate_page_shortage: no known-reapable processes\n");
				} else {
					if (curr_ts_matu >= memstat_oldest_reapable_proc_will_be_reapable_at_ts_matu) {
						requires_action = true;
						if (should_reap) {
							*should_reap = true;
							memorystatus_log_debug("memorystatus: memstat_evaluate_page_shortage: should start reaping long-idle processes\n");
						}
						/*
						 * NOTE(review): this "evaluate" routine has the side effect of
						 * starting the sweep here -- callers passing should_reap == NULL
						 * still trigger it; confirm this is intended.
						 */
						_memstat_reaper_start_sweep();
					}
				}
			}
		}
	}
	if (available_page_count < memorystatus_get_idle_exit_page_shortage_threshold()) {
		/*
		 * Only wake the jetsam thread if there are idle processes that
		 * could exit.
		 */
		uint32_t idle_proc_count = os_atomic_load(
			&memstat_bucket[JETSAM_PRIORITY_IDLE].count, relaxed);
		requires_action = requires_action || (idle_proc_count > 0);
		if (should_idle_exit) {
			*should_idle_exit = true;
		}
	}
	if (available_page_count < memorystatus_get_critical_page_shortage_threshold()) {
		/* Critical shortage always requires action. */
		if (should_jetsam) {
			*should_jetsam = true;
		}
		requires_action = true;
	}
#endif /* CONFIG_JETSAM */
	return requires_action;
}
4350 
4351 #if CONFIG_JETSAM
4352 static uint64_t
memorystatus_swap_trigger_pages(void)4353 memorystatus_swap_trigger_pages(void)
4354 {
4355 	/*
4356 	 * The swapout trigger varies based on the current memorystatus_level.
4357 	 * When available memory is somewhat high (at memorystatus_available_pages_pressure)
4358 	 * we keep more swappable compressor segments in memory.
4359 	 * However, as available memory drops to our idle and eventually critical kill
4360 	 * thresholds we start swapping more aggressively.
4361 	 */
4362 	static uint32_t available_pages_factor[] = {0, 1, 1, 1, 2, 2, 3, 5, 7, 8, 10, 13, 15, 17, 20};
4363 	size_t index = MIN(memorystatus_level, sizeof(available_pages_factor) / sizeof(uint32_t) - 1);
4364 	return available_pages_factor[index] * memorystatus_available_pages / 10;
4365 }
4366 
/* Read-only sysctl handler exposing the current swapout trigger (in pages). */
static int
sysctl_memorystatus_swap_trigger_pages SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	uint64_t trigger_pages = memorystatus_swap_trigger_pages();
	return SYSCTL_OUT(req, &trigger_pages, sizeof(trigger_pages));
}
4374 
/*
 * kern.memorystatus_swap_trigger_pages
 * NOTE(review): the handler outputs a uint64_t, but the OID is declared
 * CTLTYPE_INT with format "I" -- confirm whether CTLTYPE_QUAD/"Q" was
 * intended, as type-aware sysctl consumers may see a size mismatch.
 */
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_swap_trigger_pages, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, &sysctl_memorystatus_swap_trigger_pages, "I", "");
4377 
4378 /*
4379  * Check if the number of full swappable csegments is over the trigger
4380  * threshold to start swapping.
4381  * The adjustment_factor is applied to the trigger to raise or lower
4382  * it. For example an adjustement factor of 110 will raise the threshold by 10%.
4383  */
4384 bool
memorystatus_swap_over_trigger(uint64_t adjustment_factor)4385 memorystatus_swap_over_trigger(uint64_t adjustment_factor)
4386 {
4387 	if (!memorystatus_swap_all_apps) {
4388 		return false;
4389 	}
4390 	uint64_t trigger_pages = memorystatus_swap_trigger_pages();
4391 	trigger_pages = trigger_pages * adjustment_factor / 100;
4392 	return atop_64(c_late_swapout_count * c_seg_allocsize) > trigger_pages;
4393 }
4394 
4395 /*
4396  * Check if the number of segments on the early swapin queue
4397  * is over the trigger to start compacting it.
4398  */
bool
memorystatus_swapin_over_trigger(void)
{
	/* Compare the resident late-swappedin pages against the compaction trigger. */
	return atop_64(c_late_swappedin_count * c_seg_allocsize) > memorystatus_swapin_trigger_pages;
}
4404 #endif /* CONFIG_JETSAM */
4405 
#if DEVELOPMENT || DEBUG
/* Expose compressor swapout bookkeeping counters for testing and debugging. */
SYSCTL_UINT(_vm, OID_AUTO, c_late_swapout_count, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, &c_late_swapout_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, c_seg_allocsize, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_allocsize, 0, "");
#if CONFIG_FREEZE
extern int32_t c_segment_pages_compressed_incore_late_swapout;
SYSCTL_INT(_vm, OID_AUTO, c_segment_pages_compressed_incore_late_swapout, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_pages_compressed_incore_late_swapout, 0, "");
#endif /* CONFIG_FREEZE */
#endif /* DEVELOPMENT || DEBUG */
4414 
4415 static boolean_t
memorystatus_should_post_snapshot(int32_t priority,uint32_t cause)4416 memorystatus_should_post_snapshot(int32_t priority, uint32_t cause)
4417 {
4418 	boolean_t is_idle_priority;
4419 
4420 	is_idle_priority = (priority == JETSAM_PRIORITY_IDLE || priority == JETSAM_PRIORITY_IDLE_DEFERRED);
4421 #if CONFIG_JETSAM
4422 #pragma unused(cause)
4423 	/*
4424 	 * Don't generate logs for steady-state idle-exit kills,
4425 	 * unless it is overridden for debug or by the device
4426 	 * tree.
4427 	 */
4428 
4429 	return !is_idle_priority || memorystatus_idle_snapshot;
4430 
4431 #else /* CONFIG_JETSAM */
4432 	/*
4433 	 * Don't generate logs for steady-state idle-exit kills,
4434 	 * unless
4435 	 * - it is overridden for debug or by the device
4436 	 * tree.
4437 	 * OR
4438 	 * - the kill causes are important i.e. not kMemorystatusKilledIdleExit
4439 	 */
4440 
4441 	boolean_t snapshot_eligible_kill_cause = (is_reason_thrashing(cause) || is_reason_zone_map_exhaustion(cause));
4442 	return !is_idle_priority || memorystatus_idle_snapshot || snapshot_eligible_kill_cause;
4443 #endif /* CONFIG_JETSAM */
4444 }
4445 
4446 
4447 static boolean_t
memorystatus_act_on_hiwat_processes(uint32_t * errors,uint32_t * hwm_kill,bool * post_snapshot,uint64_t * memory_reclaimed)4448 memorystatus_act_on_hiwat_processes(uint32_t *errors, uint32_t *hwm_kill, bool *post_snapshot, uint64_t *memory_reclaimed)
4449 {
4450 	boolean_t purged = FALSE, killed = FALSE;
4451 
4452 	*memory_reclaimed = 0;
4453 	killed = memorystatus_kill_hiwat_proc(errors, &purged, memory_reclaimed);
4454 
4455 	if (killed) {
4456 		*hwm_kill = *hwm_kill + 1;
4457 		*post_snapshot = TRUE;
4458 		return TRUE;
4459 	} else {
4460 		if (purged == FALSE) {
4461 			/* couldn't purge and couldn't kill */
4462 			os_atomic_store(&memorystatus_hwm_candidates, false, relaxed);
4463 		}
4464 	}
4465 
4466 	return killed;
4467 }
4468 
4469 /*
4470  * Purge kernel memory caches
4471  */
/*
 * Purge kernel memory caches
 *
 * Last-ditch reclamation before escalating kills: releases pmap pages,
 * purges corpses (once per jetsam event), drains deferred user reclamation,
 * and trims the zone allocator.
 */
static void
memstat_purge_caches(jetsam_state_t state)
{
	memorystatus_log("memorystatus: purging kernel memory caches\n");

	uint64_t pmap_released = pmap_release_pages_fast();
	memorystatus_log("memorystatus: recovered %llu pages from pmap\n",
	    pmap_released);

	/*
	 * Only purge corpses once per jetsam event. No new corpses can be created
	 * after the initial purge (block_corpses)
	 */
	if (!state->corpse_list_purged) {
		memorystatus_log("memorystatus: purging all corpses\n");
		/*
		 * NOTE(review): block_corpses is incremented here but never
		 * decremented in this function -- presumably released when the
		 * jetsam event ends; confirm against the callers.
		 */
		os_atomic_inc(&block_corpses, relaxed);
		assert(block_corpses > 0);
		if (total_corpses_count() > 0) {
			task_purge_all_corpses();
		} else {
			memorystatus_log("memorystatus: no corpses to purge\n");
		}
		state->corpse_list_purged = true;
	}

#if CONFIG_DEFERRED_RECLAIM
	/* TODO: estimate memory recovered from deferred reclaim */
	memorystatus_log("memorystatus: reclaiming all deferred user memory\n");
	mach_vm_size_t vmdr_bytes_reclaimed;
	vm_deferred_reclamation_gc(RECLAIM_GC_DRAIN, &vmdr_bytes_reclaimed,
	    RECLAIM_NO_FAULT | RECLAIM_NO_WAIT);
	memorystatus_log("memorystatus: purged %llu KiB of deferred user memory\n",
	    vmdr_bytes_reclaimed);
#endif /* CONFIG_DEFERRED_RECLAIM */

	/* TODO: estimate wired memory recovered from zone_gc */
	memorystatus_log("memorystatus: trimming kernel zone allocator\n");
	zone_gc_trim();
}
4511 
/*
 * Handle the case where a kill was requested but no victim could be found.
 *
 * Escalation order: clear per-proc kill-error bits and let the caller retry;
 * then purge kernel caches; then fully drain the zone allocator; and if the
 * system is still unhealthy after all of that, panic -- the shortage must be
 * caused by the kernel itself.
 */
static void
memstat_no_victim(jetsam_state_t state,
    memorystatus_kill_cause_t cause)
{
	/*
	 * We tried to kill a process, but failed to find anyone to kill. It's
	 * possible we chose not to because we reclaimed some purgeable memory or
	 * hit this thread's priority limit.
	 */
	assert3u(state->memory_reclaimed, ==, 0);
	if (state->limit_to_low_bands) {
		/*
		 * This thread isn't allowed to reach the high bands -- no need to overreact.
		 */
		return;
	}
	/*
	 * We should have found someone to kill. Either we failed because of a transient
	 * error or we've run out of candidates and the issue is caused by the kernel.
	 */
	memorystatus_log("memorystatus: failed to find a %s victim!\n", memstat_kill_cause_name[cause]);
	if (state->errors && !state->errors_cleared) {
		/*
		 * It's possible that all of the kill candidates had the error bit set
		 * (e.g. because we caught them in exec()). Clear all the error bits and
		 * try to kill them one more time in the hopes that they are now killable.
		 */
		memorystatus_log("memorystatus: clearing kill errors and retrying\n");
		memorystatus_clear_errors();
		state->errors_cleared = true;
	} else {
		/* The memory may be held by a corpse or zalloc. */
		memstat_purge_caches(state);
		struct memorystatus_system_health_s health_status;
		bool is_system_healthy = memstat_check_system_health(&health_status);
		if (!is_system_healthy) {
			memorystatus_log("memorystatus: system still unhealthy after cache purge!\n");
			/*
			 * We trimmed the zones above but it's possible there is a bug with
			 * working set estimation and we needed a full drain.
			 */
			memorystatus_log_fault("memorystatus: fully draining kernel zone allocator\n");
			zone_gc_drain();
			is_system_healthy = memstat_check_system_health(&health_status);
			if (!is_system_healthy) {
				/*
				 * We've killed everything and purged all xnu caches. There is nothing
				 * left to do but panic.
				 */
				panic("memorystatus: all %s victims exhausted", memstat_kill_cause_name[cause]);
			}
		}
	}
}
4566 
4567 /*
4568  * Called before jetsamming in the foreground band in the hope that we'll
4569  * avoid a jetsam.
4570  */
static void
memstat_approaching_fg_band(jetsam_state_t state)
{
	memorystatus_log("memorystatus: jetsam is approaching JETSAM_PRIORITY_FOREGROUND\n");
	/* Give user space a chance to shed memory before FG apps are killed. */
	if (memorystatus_should_issue_fg_band_notify) {
		memorystatus_broadcast_jetsam_pressure(kVMPressureForegroundJetsam);
	}
	/* Reclaim kernel-held memory too, in the hope of avoiding the kill. */
	memstat_purge_caches(state);
}
4580 
/* Number of times the aggressive-kill (jetsam loop detection) path has run. */
unsigned int jld_eval_aggressive_count = 0;
/* NOTE(review): presumably the start of the current JLD observation window
 * (msecs) -- confirm against the JLD window logic elsewhere in this file. */
uint64_t  jld_timestamp_msecs = 0;
/* NOTE(review): presumably the idle-band candidate count sampled at the start
 * of the JLD window -- confirm against the JLD window logic. */
int       jld_idle_kill_candidates = 0;

/*
 * Progressively raise the maximum priority to aggressively kill to
 * when a jetsam loop is detected. Background work often happens at
 * @c JETSAM_PRIORITY_MAIL. Start there and elevate as needed if
 * the jetsam loop re-occurs in a short time window.
 */
int jld_max_priority_arr[] = {
	JETSAM_PRIORITY_MAIL,
	JETSAM_PRIORITY_MAIL,
	JETSAM_PRIORITY_UI_SUPPORT,
	JETSAM_PRIORITY_UI_SUPPORT,
	JETSAM_PRIORITY_DRIVER_APPLE,
};
#define JLD_MAX_PRIORITY_ARR_COUNT (sizeof(jld_max_priority_arr) / sizeof(jld_max_priority_arr[0]))
4599 
/*
 * Perform an aggressive kill pass in response to a detected jetsam loop.
 *
 * First sweeps the elevated-inactive band (bailing early if the page
 * shortage clears), then marches from the idle band up to a cap that is
 * raised on each successive invocation (jld_max_priority_arr).
 *
 * Returns true if at least one process was killed in the final march (or
 * if the elevated sweep relieved the shortage). The caller owns the passed
 * jetsam_reason reference.
 */
static bool
memorystatus_act_aggressive(jetsam_state_t state, uint32_t cause, os_reason_t jetsam_reason)
{
	boolean_t killed;
	uint32_t errors = 0;
	uint64_t footprint_of_killed_proc = 0;
	int elevated_bucket_count = 0, maximum_kills = 0, band = 0;
	state->memory_reclaimed = 0;

	/* Each successive aggressive pass may kill into higher bands. */
	unsigned int iteration_no = jld_eval_aggressive_count++;
	int max_kill_pri = jld_max_priority_arr[MIN(iteration_no, JLD_MAX_PRIORITY_ARR_COUNT - 1)];
	/*
	 * NOTE(review): this allows max_kill_pri == MEMSTAT_BUCKET_COUNT;
	 * confirm whether a strict '<' bound was intended. (max_kill_pri is
	 * only used as an exclusive loop bound below, so this is benign here.)
	 */
	assert3u(max_kill_pri, <=, MEMSTAT_BUCKET_COUNT);

	if (max_kill_pri >= JETSAM_PRIORITY_FOREGROUND) {
		memstat_approaching_fg_band(state);
	}

	proc_list_lock();
	elevated_bucket_count = memstat_bucket[JETSAM_PRIORITY_ELEVATED_INACTIVE].count;
	proc_list_unlock();

	/* Visit elevated processes first */
	while (elevated_bucket_count) {
		elevated_bucket_count--;

		/*
		 * memorystatus_kill_elevated_process() drops a reference,
		 * so take another one so we can continue to use this exit reason
		 * even after it returns.
		 */

		os_reason_ref(jetsam_reason);
		killed = memorystatus_kill_elevated_process(
			cause,
			jetsam_reason,
			JETSAM_PRIORITY_ELEVATED_INACTIVE,
			jld_eval_aggressive_count,
			&errors, &footprint_of_killed_proc);
		if (killed) {
			state->post_snapshot = true;
			state->memory_reclaimed += footprint_of_killed_proc;
			if (!memstat_evaluate_page_shortage(NULL, NULL, NULL, NULL)) {
				/*
				 * System is no longer under pressure --
				 * bail early because the pressure was
				 * coming from an inactive process
				 */
				return true;
			}
		} else {
			/*
			 * No pinned processes left to kill.
			 * Abandon elevated band.
			 */
			break;
		}
	}

	/* Cap the march: total candidates below max_kill_pri, times the loop limit. */
	proc_list_lock();
	for (band = JETSAM_PRIORITY_IDLE; band < max_kill_pri; band++) {
		maximum_kills += memstat_bucket[band].count;
	}
	proc_list_unlock();
	maximum_kills *= memorystatus_jld_max_kill_loops;
	/*
	 * memorystatus_kill_processes_aggressive() allocates its own
	 * jetsam_reason so the kMemorystatusKilledProcThrashing cause
	 * is consistent throughout the aggressive march.
	 */
	killed = memorystatus_kill_processes_aggressive(
		kMemorystatusKilledProcThrashing,
		jld_eval_aggressive_count,
		max_kill_pri,
		maximum_kills,
		&errors, &footprint_of_killed_proc);

	if (killed) {
		/* Always generate logs after aggressive kill */
		state->post_snapshot = true;
		state->memory_reclaimed += footprint_of_killed_proc;
		state->jld_idle_kills = 0;
	}

	return killed;
}
4685 
4686 /*
4687  * Sets up a new jetsam thread.
4688  */
4689 static void
memorystatus_thread_init(jetsam_state_t jetsam_thread)4690 memorystatus_thread_init(jetsam_state_t jetsam_thread)
4691 {
4692 	char name[32];
4693 	thread_wire_internal(host_priv_self(), current_thread(), TRUE, NULL);
4694 	snprintf(name, 32, "VM_memorystatus_%d", jetsam_thread->index + 1);
4695 
4696 	/* Limit all but one thread to the lower jetsam bands, as that's where most of the victims are. */
4697 	if (jetsam_thread->index == 0) {
4698 		if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
4699 			thread_vm_bind_group_add();
4700 		}
4701 		jetsam_thread->limit_to_low_bands = false;
4702 	} else {
4703 		jetsam_thread->limit_to_low_bands = true;
4704 	}
4705 #if CONFIG_THREAD_GROUPS
4706 	thread_group_vm_add();
4707 #endif
4708 	thread_set_thread_name(current_thread(), name);
4709 	sched_cond_init(&(jetsam_thread->jt_wakeup_cond));
4710 	jetsam_thread->inited = true;
4711 }
4712 
4713 /*
4714  * Create a new jetsam reason from the given kill cause.
4715  */
4716 static os_reason_t
create_jetsam_reason(memorystatus_kill_cause_t cause)4717 create_jetsam_reason(memorystatus_kill_cause_t cause)
4718 {
4719 	os_reason_t jetsam_reason = OS_REASON_NULL;
4720 
4721 	jetsam_reason_t reason_code = (jetsam_reason_t)cause;
4722 	assert3u(reason_code, <=, JETSAM_REASON_MEMORYSTATUS_MAX);
4723 
4724 	jetsam_reason = os_reason_create(OS_REASON_JETSAM, reason_code);
4725 	if (jetsam_reason == OS_REASON_NULL) {
4726 		memorystatus_log_error("memorystatus: failed to allocate jetsam reason for cause %u\n", cause);
4727 	}
4728 	return jetsam_reason;
4729 }
4730 
4731 /*
4732  * Do one kill as we're marching up the priority bands.
4733  * This is a wrapper around memstat_kill_top_process that also
4734  * sets post_snapshot, tracks jld_idle_kills, and notifies if we're appraoching the fg band.
4735  */
/*
 * Perform one kill while marching up the priority bands (up to
 * max_priority), updating the thread's jetsam state: snapshot posting,
 * jetsam-loop-detection idle-kill accounting, and the one-shot
 * foreground/background approach notifications.
 *
 * Returns true if a process was killed.
 */
static bool
memstat_do_priority_kill(jetsam_state_t state,
    uint32_t kill_cause, int32_t max_priority, memstat_kill_options_t options)
{
	os_reason_t jetsam_reason = OS_REASON_NULL;
	bool killed = false;
	int priority;

	jetsam_reason = create_jetsam_reason(kill_cause);
	/*
	 * memstat_kill_top_process() drops a reference,
	 * so take another one so we can continue to use this exit reason
	 * even after it returns
	 */
	os_reason_ref(jetsam_reason);

	if (state->sort_flag) {
		options |= MEMSTAT_SORT_BUCKET;
	}
	/* LRU */
	killed = memstat_kill_top_process(kill_cause, jetsam_reason, max_priority,
	    options, &priority, &state->errors, &state->memory_reclaimed);
	/* The sort was consumed by (or is stale after) this kill attempt. */
	state->sort_flag = false;

	if (killed) {
		if (memorystatus_should_post_snapshot(priority, kill_cause) == TRUE) {
			state->post_snapshot = true;
		}

		/* Jetsam Loop Detection */
		if (memorystatus_jld_enabled == TRUE) {
			if (priority <= applications_aging_band) {
				state->jld_idle_kills++;
			} else {
				/*
				 * We've reached into bands beyond idle deferred.
				 * We make no attempt to monitor them
				 */
			}
		}

		/* One-shot notifications as the kill march climbs the bands. */
		if (priority >= JETSAM_PRIORITY_FREEZER && !state->fg_approached) {
			state->fg_approached = true;
			memstat_approaching_fg_band(state);
		}
		if (priority >= JETSAM_PRIORITY_BACKGROUND && !state->bg_approached) {
			state->bg_approached = true;
			memorystatus_broadcast_jetsam_pressure(kVMPressureBackgroundJetsam);
		}
	}
	/* Drop the extra reference taken above. */
	os_reason_free(jetsam_reason);

	return killed;
}
4790 
/*
 * Take the no-paging-space action (non-jetsam platforms only).
 *
 * Records the action timestamp (used by the compressor-exhaustion wakeup
 * throttle), then either posts the "Out of Application Memory" dialogue
 * (returns false -- no kill performed) or indicates the caller should kill
 * (returns true).
 */
static bool
memstat_perform_no_paging_space_action(memorystatus_kill_cause_t cause)
{
#if !CONFIG_JETSAM
	uint64_t now = mach_absolute_time();
	os_atomic_store(&last_no_space_action_ts, now, relaxed);

	bool should_notify = no_paging_space_action(cause);
	if (should_notify) {
		/*
		 * Put up the "Out of Application Memory" dialogue. The user will be
		 * prompted to select applications to Force Quit.
		 */
		memorystatus_log("memorystatus: sending out-of-application memory knote\n");
		memorystatus_send_low_swap_note();
		return false;
	}
	return true;
#else /* CONFIG_JETSAM */
	(void)cause;
	panic("No-Paging-Space Action unsupported on this platform");
#endif /* !CONFIG_JETSAM */
}
4814 
static bool
memorystatus_do_action(jetsam_state_t state, memorystatus_action_t action, memorystatus_kill_cause_t kill_cause)
{
	/*
	 * Execute one action previously chosen by memorystatus_pick_action().
	 * Returns true when the action killed a process (or, for
	 * MEMORYSTATUS_PURGE_CACHES, is treated as equivalent progress).
	 */
	bool killed = false;
	os_reason_t jetsam_reason = OS_REASON_NULL;

	switch (action) {
	case MEMORYSTATUS_KILL_HIWATER:
		/* Kill a process that has exceeded its soft high-water-mark limit. */
		killed = memorystatus_act_on_hiwat_processes(&state->errors, &state->hwm_kills,
		    &state->post_snapshot, &state->memory_reclaimed);
		break;
	case MEMORYSTATUS_KILL_AGGRESSIVE:
		/* Aggressive (multi-kill) jetsam; the reason is created and released here. */
		jetsam_reason = create_jetsam_reason(kill_cause);
		killed = memorystatus_act_aggressive(state, kill_cause, jetsam_reason);
		os_reason_free(jetsam_reason);
		break;
	case MEMORYSTATUS_KILL_TOP_PROCESS:
		/* Kill one eligible process, searching up to the maximum band. */
		killed = memstat_do_priority_kill(state, kill_cause, JETSAM_PRIORITY_MAX, 0);
		break;
	case MEMORYSTATUS_WAKE_SWAPPER:
		/* No kill: nudge the swapout thread to free compressor space. */
		memorystatus_log_info(
			"memorystatus_do_action: Waking up swap thread. memorystatus_available_pages: %llu\n",
			(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
		os_atomic_store(&vm_swapout_wake_pending, true, relaxed);
		thread_wakeup((event_t)&vm_swapout_thread);
		break;
	case MEMORYSTATUS_PROCESS_SWAPIN_QUEUE:
		/* No kill: drain the late-swapped-in compressor segment queue. */
		memorystatus_log_info(
			"memorystatus_do_action: Processing swapin queue of length: %u memorystatus_available_pages: %llu\n",
			c_late_swappedin_count, (uint64_t) MEMORYSTATUS_LOG_AVAILABLE_PAGES);
		vm_compressor_process_special_swapped_in_segments();
		break;
	case MEMORYSTATUS_KILL_SUSPENDED_SWAPPABLE:
		/* Kill a swap-eligible process strictly below the background band. */
		killed = memstat_do_priority_kill(state, kill_cause, JETSAM_PRIORITY_BACKGROUND - 1, MEMSTAT_ONLY_SWAPPABBLE);
		break;
	case MEMORYSTATUS_KILL_SWAPPABLE:
		/* Kill a swap-eligible process in any band. */
		killed = memstat_do_priority_kill(state, kill_cause, JETSAM_PRIORITY_MAX, MEMSTAT_ONLY_SWAPPABBLE);
		break;
	case MEMORYSTATUS_KILL_IDLE:
		/* Kill within the idle band only. */
		killed = memstat_do_priority_kill(state, kill_cause, JETSAM_PRIORITY_IDLE, 0);
		break;
	case MEMORYSTATUS_KILL_LONG_IDLE:
		/* Kill within the idle band, restricted to long-idle candidates. */
		killed = memstat_do_priority_kill(state, kill_cause, JETSAM_PRIORITY_IDLE, MEMSTAT_ONLY_LONG_IDLE);
		break;
	case MEMORYSTATUS_NO_PAGING_SPACE:
		killed = memstat_perform_no_paging_space_action(kill_cause);
		break;
	case MEMORYSTATUS_PURGE_CACHES:
		memstat_purge_caches(state);
		killed = true;
		break;
	case MEMORYSTATUS_KILL_NONE:
		/* pick_action never returns NONE to this path; callers check first. */
		panic("memorystatus_do_action: Impossible! memorystatus_do_action called with action = NONE\n");
	}
	return killed;
}
4871 
4872 void
memorystatus_post_snapshot()4873 memorystatus_post_snapshot()
4874 {
4875 	proc_list_lock();
4876 	size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
4877 	    sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count);
4878 	uint64_t timestamp_now = mach_absolute_time();
4879 	memorystatus_jetsam_snapshot->notification_time = timestamp_now;
4880 	memorystatus_jetsam_snapshot->js_gencount++;
4881 	if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
4882 	    timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) {
4883 		proc_list_unlock();
4884 		int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
4885 		if (!ret) {
4886 			proc_list_lock();
4887 			memorystatus_jetsam_snapshot_last_timestamp = timestamp_now; proc_list_unlock();
4888 		}
4889 	} else {
4890 		proc_list_unlock();
4891 	}
4892 }
4893 
4894 #if JETSAM_ZPRINT_SNAPSHOT
4895 
/*
 *  Called by memorystatus_update_jetsam_snapshot_entry_locked to take a zprint snapshot.
 */
static void
memorystatus_collect_jetsam_snapshot_zprint(void)
{
	unsigned int new_meminfo_cnt;

	/* Upper bound on the number of zones to sample. */
	jzs_zone_cnt = zone_max_zones();

	/* Grow the cached meminfo buffer if the current estimate no longer fits. */
	new_meminfo_cnt = vm_page_diagnose_estimate();
	if (new_meminfo_cnt > jzs_meminfo_cnt) {
		jzs_meminfo = krealloc_data_tag(jzs_meminfo,
		    jzs_meminfo_cnt * sizeof(mach_memory_info_t),
		    new_meminfo_cnt * sizeof(mach_memory_info_t),
		    Z_WAITOK,
		    VM_KERN_MEMORY_DIAG);
		/*
		 * NOTE(review): the krealloc_data_tag result is not checked for
		 * NULL before jzs_meminfo_cnt is updated — presumably Z_WAITOK
		 * makes failure impossible/acceptable here; confirm.
		 */

		jzs_meminfo_cnt = new_meminfo_cnt;
	}

	mach_memory_info_sample(jzs_names, jzs_info, jzs_coalesce, &jzs_zone_cnt, jzs_meminfo, jzs_meminfo_cnt, true);
}
4919 
4920 #endif /* JETSAM_ZPRINT_SNAPSHOT */
4921 
4922 /*
4923  * Main entrypoint for the memorystatus thread.
4924  * This thread is woken up when we're low on one of the following resources:
4925  * - available pages (free + filebacked)
4926  * - zone memory
4927  * - compressor space
4928  *
4929  * Or when thrashing is detected in the compressor or file cache.
4930  */
static void
memorystatus_thread_internal(jetsam_state_t state)
{
	uint64_t total_memory_reclaimed = 0;
	/* Which classes of candidates may still exist; re-armed after each kill. */
	bool highwater_remaining = true;
	bool swappable_apps_remaining = false;
	bool suspended_swappable_apps_remaining = false;

#if CONFIG_JETSAM
	swappable_apps_remaining = memorystatus_swap_all_apps;
	suspended_swappable_apps_remaining = memorystatus_swap_all_apps;
#endif /* CONFIG_JETSAM */

	/* Reset the per-scan bookkeeping carried in the thread's state block. */
	assert(state != NULL);
	state->jld_idle_kills = 0;
	state->errors = 0;
	state->errors_cleared = false;
	state->hwm_kills = 0;
	state->sort_flag = true;
	state->corpse_list_purged = false;
	state->bg_approached = false;
	state->fg_approached = false;
	state->post_snapshot = false;
	state->memory_reclaimed = 0;

	if (state->inited == FALSE) {
		/*
		 * It's the first time the thread has run, so just mark the thread as privileged and block.
		 */
		memorystatus_thread_init(state);
		sched_cond_wait(&state->jt_wakeup_cond, THREAD_UNINT, memorystatus_thread);
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_SCAN) | DBG_FUNC_START,
	    MEMORYSTATUS_LOG_AVAILABLE_PAGES, memorystatus_jld_enabled, memorystatus_jld_eval_period_msecs, jld_eval_aggressive_count);

	/* Log compressor state and how long ago the last major compaction ran. */
	extern uint32_t c_segment_count;
	extern mach_timespec_t major_compact_ts;
	clock_sec_t now;
	clock_nsec_t nsec;
	clock_get_system_nanotime(&now, &nsec);
	mach_timespec_t major_compact_diff = {.tv_sec = (int)now, .tv_nsec = nsec};
	SUB_MACH_TIMESPEC(&major_compact_diff, &major_compact_ts);
	memorystatus_log_info(
		"memorystatus: c_segment_count=%u major compaction occurred %u seconds ago\n",
		c_segment_count, major_compact_diff.tv_sec);

	/*
	 * Jetsam aware version.
	 *
	 * The VM pressure notification thread is working its way through clients in parallel.
	 *
	 * So, while the pressure notification thread is targeting processes in order of
	 * increasing jetsam priority, we can hopefully reduce / stop its work by killing
	 * any processes that have exceeded their highwater mark.
	 *
	 * If we run out of HWM processes and our available pages drops below the critical threshold, then,
	 * we target the least recently used process in order of increasing jetsam priority (exception: the FG band).
	 */
	while (true) {
		bool killed;
		state->memory_reclaimed = 0;
		uint32_t cause = 0;

		/* Choose the next action based on current pressure and remaining candidates. */
		memorystatus_action_t action = memorystatus_pick_action(state, &cause,
		    highwater_remaining, suspended_swappable_apps_remaining, swappable_apps_remaining,
		    &state->jld_idle_kills);
		if (action == MEMORYSTATUS_KILL_NONE) {
			break;
		}

		if (cause == kMemorystatusKilledVMCompressorThrashing || cause == kMemorystatusKilledVMCompressorSpaceShortage) {
			memorystatus_log("memorystatus: killing due to \"%s\" - compression_ratio=%u\n", memstat_kill_cause_name[cause], vm_compression_ratio());
		}

		killed = memorystatus_do_action(state, action, cause);
		total_memory_reclaimed += state->memory_reclaimed;

		if (!killed && !state->memory_reclaimed) {
			/* The action made no progress; stop considering that candidate class. */
			switch (action) {
			case MEMORYSTATUS_KILL_HIWATER:
				highwater_remaining = false;
				break;
			case MEMORYSTATUS_KILL_SWAPPABLE:
				swappable_apps_remaining = false;
				suspended_swappable_apps_remaining = false;
				break;
			case MEMORYSTATUS_KILL_SUSPENDED_SWAPPABLE:
				suspended_swappable_apps_remaining = false;
				break;
			case MEMORYSTATUS_KILL_TOP_PROCESS:
				memstat_no_victim(state, cause);
				break;
			default:
				memorystatus_log("memorystatus: no victim found (action: %d)\n", action);
				break;
			}
		} else {
			/* We successfully killed a process */
			if (cause == kMemorystatusKilledVMCompressorThrashing || cause == kMemorystatusKilledVMCompressorSpaceShortage) {
				memorystatus_log("memorystatus: post-jetsam compressor fragmentation_level=%u\n", vm_compressor_fragmentation_level());
			}
			/* Always re-check for highwater and swappable kills after doing a kill. */
			highwater_remaining = true;
			swappable_apps_remaining = true;
			suspended_swappable_apps_remaining = true;
		}


		/*
		 * If we did a kill on behalf of another subsystem (compressor or zalloc)
		 * notify them.
		 */
		if (killed && is_reason_thrashing(cause)) {
			os_atomic_store(&memorystatus_compressor_space_shortage, false, release);
#if CONFIG_PHANTOM_CACHE
			os_atomic_store(&memorystatus_phantom_cache_pressure, false, release);
#endif /* CONFIG_PHANTOM_CACHE */
#if CONFIG_JETSAM
			vm_thrashing_jetsam_done();
#endif /* CONFIG_JETSAM */
		} else if (killed && is_reason_zone_map_exhaustion(cause)) {
			os_atomic_store(&memorystatus_zone_map_is_exhausted, false, release);
		} else if (killed && cause == kMemorystatusKilledVMPageoutStarvation) {
			os_atomic_store(&memorystatus_pageout_starved, false, release);
		}
	}

	if (state->errors) {
		memorystatus_clear_errors();
	}

	if (state->post_snapshot) {
		memorystatus_post_snapshot();
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_SCAN) | DBG_FUNC_END,
	    MEMORYSTATUS_LOG_AVAILABLE_PAGES, total_memory_reclaimed);

	/* Release the corpse-generation block this scan may have taken. */
	if (state->corpse_list_purged) {
		os_atomic_dec(&block_corpses, relaxed);
		assert(block_corpses >= 0);
	}
}
5075 
5076 OS_NORETURN
5077 static void
memorystatus_thread(void * param __unused,wait_result_t wr __unused)5078 memorystatus_thread(void *param __unused, wait_result_t wr __unused)
5079 {
5080 	jetsam_state_t jetsam_thread = jetsam_current_thread();
5081 	sched_cond_ack(&(jetsam_thread->jt_wakeup_cond));
5082 	while (1) {
5083 		memorystatus_thread_internal(jetsam_thread);
5084 		sched_cond_wait(&(jetsam_thread->jt_wakeup_cond), THREAD_UNINT, memorystatus_thread);
5085 	}
5086 }
5087 
5088 /*
5089  * Callback invoked when allowable physical memory footprint exceeded
5090  * (dirty pages + IOKit mappings)
5091  *
5092  * This is invoked for both advisory, non-fatal per-task high watermarks,
5093  * as well as the fatal task memory limits.
5094  */
void
memorystatus_on_ledger_footprint_exceeded(boolean_t warning, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal)
{
	os_reason_t jetsam_reason = OS_REASON_NULL;

	proc_t p = current_proc();

#if VM_PRESSURE_EVENTS
	if (warning == TRUE) {
		/*
		 * This is a warning path which implies that the current process is close, but has
		 * not yet exceeded its per-process memory limit.
		 */
		if (memorystatus_warn_process(p, memlimit_is_active, memlimit_is_fatal, FALSE /* not exceeded */) != TRUE) {
			/* Print warning, since it's possible that task has not registered for pressure notifications */
			memorystatus_log_debug(
				"memorystatus_on_ledger_footprint_exceeded: failed to warn %s [%d] (exiting, or no handler registered?).\n",
				proc_best_name(p), proc_getpid(p));
		}
		return;
	}
#endif /* VM_PRESSURE_EVENTS */

	if (memlimit_is_fatal) {
		/*
		 * If this process has no high watermark or has a fatal task limit, then we have been invoked because the task
		 * has violated either the system-wide per-task memory limit OR its own task limit.
		 */
		jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_PERPROCESSLIMIT);
		if (jetsam_reason == NULL) {
			memorystatus_log_error("task_exceeded footprint: failed to allocate jetsam reason\n");
		} else if (corpse_for_fatal_memkill && proc_send_synchronous_EXC_RESOURCE(p) == FALSE) {
			/* Set OS_REASON_FLAG_GENERATE_CRASH_REPORT to generate corpse */
			jetsam_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
		}

		/* Fatal limit violated: synchronously kill the current process. */
		if (memstat_kill_process_sync(proc_getpid(p), kMemorystatusKilledPerProcessLimit, jetsam_reason) != TRUE) {
			memorystatus_log_error("task_exceeded_footprint: failed to kill the current task (exiting?).\n");
		}
	} else {
		/*
		 * HWM offender exists. Done without locks or synchronization.
		 * See comment near its declaration for more details.
		 */
		os_atomic_store(&memorystatus_hwm_candidates, true, release);
		_memstat_consider_waking_jetsam_thread();

#if VM_PRESSURE_EVENTS
		/*
		 * The current process is not in the warning path.
		 * This path implies the current process has exceeded a non-fatal (soft) memory limit.
		 * Failure to send note is ignored here.
		 */
		(void)memorystatus_warn_process(p, memlimit_is_active, memlimit_is_fatal, TRUE /* exceeded */);

#endif /* VM_PRESSURE_EVENTS */
	}
}
5153 
5154 void
memorystatus_log_exception(const int max_footprint_mb,boolean_t memlimit_is_active,boolean_t memlimit_is_fatal)5155 memorystatus_log_exception(const int max_footprint_mb, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal)
5156 {
5157 	proc_t p = current_proc();
5158 
5159 	/*
5160 	 * The limit violation is logged here, but only once per process per limit.
5161 	 * Soft memory limit is a non-fatal high-water-mark
5162 	 * Hard memory limit is a fatal custom-task-limit or system-wide per-task memory limit.
5163 	 */
5164 
5165 	memorystatus_log("memorystatus: %s [%d] exceeded mem limit: %s%s %d MB (%s)\n",
5166 	    ((p && *p->p_name) ? p->p_name : "unknown"), (p ? proc_getpid(p) : -1), (memlimit_is_active ? "Active" : "Inactive"),
5167 	    (memlimit_is_fatal  ? "Hard" : "Soft"), max_footprint_mb,
5168 	    (memlimit_is_fatal  ? "fatal" : "non-fatal"));
5169 }
5170 
5171 void
memorystatus_log_diag_threshold_exception(const int diag_threshold_value)5172 memorystatus_log_diag_threshold_exception(const int diag_threshold_value)
5173 {
5174 	proc_t p = current_proc();
5175 
5176 	/*
5177 	 * The limit violation is logged here, but only once per process per limit.
5178 	 * Soft memory limit is a non-fatal high-water-mark
5179 	 * Hard memory limit is a fatal custom-task-limit or system-wide per-task memory limit.
5180 	 */
5181 
5182 	memorystatus_log("memorystatus: %s [%d] exceeded diag threshold limit: %d MB \n",
5183 	    ((p && *p->p_name) ? p->p_name : "unknown"), (p ? proc_getpid(p) : -1), diag_threshold_value);
5184 }
5185 
void
memorystatus_on_conclave_limit_exceeded(const int max_footprint_mb)
{
	os_reason_t jetsam_reason = OS_REASON_NULL;
	proc_t p = current_proc();

	/*
	 * The limit violation is logged here; it's always fatal.
	 */
	memorystatus_log("memorystatus: %s [%d] exceeded conclave limit: %d MB \n",
	    ((p && *p->p_name) ? p->p_name : "unknown"), (p ? proc_getpid(p) : -1), max_footprint_mb);

	jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_CONCLAVELIMIT);
	if (jetsam_reason == NULL) {
		memorystatus_log_error("task_exceeded_conclave: failed to allocate jetsam reason\n");
	} else if (corpse_for_fatal_memkill && proc_send_synchronous_EXC_RESOURCE(p) == FALSE) {
		/* Set OS_REASON_FLAG_GENERATE_CRASH_REPORT to generate corpse */
		jetsam_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
	}

	/* Conclave limit is always fatal: synchronously kill the current process. */
	if (memstat_kill_process_sync(proc_getpid(p), kMemorystatusKilledConclaveLimit, jetsam_reason) != TRUE) {
		memorystatus_log_error("task_exceeded_conclave: failed to kill the current task (exiting?).\n");
	}
}
5210 
5211 /*
5212  * Description:
5213  *	Evaluates process state to determine which limit
5214  *	should be applied (active vs. inactive limit).
5215  *
5216  *	Return: TRUE if active
5217  *		False if inactive
5218  */
5219 static bool
memstat_proc_is_active_locked(proc_t p)5220 memstat_proc_is_active_locked(proc_t p)
5221 {
5222 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
5223 
5224 	if (_memstat_proc_is_elevated(p) &&
5225 	    (p->p_memstat_effectivepriority == JETSAM_PRIORITY_ELEVATED_INACTIVE ||
5226 	    p->p_memstat_effectivepriority == JETSAM_PRIORITY_FREEZER)) {
5227 		/* This process is sitting in an elevated inactive band. */
5228 		if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_ELEVATED_INACTIVE) {
5229 			/*
5230 			 * This process is in an elevated band and may be doing background
5231 			 * work.
5232 			 */
5233 			return true;
5234 		} else {
5235 			/* This process is frozen. */
5236 			return false;
5237 		}
5238 	} else if (_memstat_proc_is_tracked(p)) {
5239 		/*
5240 		 * Process has enrolled in ActivityTracking. Its limit will be
5241 		 * determined based on whether it is clean or dirty.
5242 		 */
5243 		if (_memstat_proc_is_dirty(p)) {
5244 			/* Dirty processes are always active */
5245 			return true;
5246 		} else if (_memstat_proc_can_idle_exit(p) &&
5247 		    p->p_memstat_effectivepriority > JETSAM_PRIORITY_IDLE) {
5248 			/*
5249 			 * This process is clean and supports idle exit, but has not made
5250 			 * its way to the idle band. It is either aging in the deferred
5251 			 * idle band or has a RunningBoard assertion that is keeping it
5252 			 * from going idle.
5253 			 */
5254 			return true;
5255 		} else {
5256 			/*
5257 			 * This process is clean and either:
5258 			 *   - does not support idle exit
5259 			 *   or
5260 			 *   - does support idle exit and is now idle
5261 			 */
5262 			return false;
5263 		}
5264 	} else if (_memstat_proc_is_managed(p)) {
5265 		/*
5266 		 * RunningBoard-managed processes are active if they have any
5267 		 * outstanding assertions
5268 		 */
5269 		return _memstat_proc_has_priority_assertion(p);
5270 	} else {
5271 		/*
5272 		 * Unmanaged and untracked processes receive an active limit unless
5273 		 * they are completely idle.
5274 		 */
5275 		return p->p_memstat_effectivepriority > JETSAM_PRIORITY_IDLE;
5276 	}
5277 }
5278 
static bool
memstat_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason)
{
	/*
	 * Synchronously kill victim_pid for the given cause, or — for
	 * victim_pid == -1 — the first eligible process in priority order.
	 * Ownership of jetsam_reason passes to the kill helpers. Returns true
	 * if a process was killed; a rate-limited snapshot notification is
	 * posted on success.
	 */
	bool killed;

	uint32_t errors = 0;
	uint64_t memory_reclaimed = 0;

	/* Validate inputs */
	if (victim_pid == 0) {
		return false;
	}

	if (victim_pid == -1) {
		uint32_t max_priority;
#if CONFIG_JETSAM
		max_priority = JETSAM_PRIORITY_MAX;
#else /* !CONFIG_JETSAM */
		/*
		 * On non-jetsam platforms, limit how far up the priority bands
		 * this kill may reach, depending on the cause.
		 */
		if (kill_on_no_paging_space ||
		    cause == kMemorystatusKilledZoneMapExhaustion) {
			max_priority = JETSAM_PRIORITY_MAX;
		} else if (cause == kMemorystatusKilledSustainedPressure) {
			max_priority = memstat_sustained_pressure_max_pri;
		} else {
			max_priority = JETSAM_PRIORITY_IDLE;
		}
#endif /* CONFIG_JETSAM */
		/* No pid, so kill first process */
		killed = memstat_kill_top_process(cause, jetsam_reason,
		    max_priority, MEMSTAT_SORT_BUCKET, NULL, &errors, &memory_reclaimed);
	} else {
		killed = memorystatus_kill_specific_process(victim_pid, cause, jetsam_reason);
	}

	if (errors) {
		memorystatus_clear_errors();
	}

	if (killed) {
		/* Fire off snapshot notification */
		proc_list_lock();
		size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
		    sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_count;
		uint64_t timestamp_now = mach_absolute_time();
		memorystatus_jetsam_snapshot->notification_time = timestamp_now;
		/* Rate-limit: only notify once the previous notification has timed out. */
		if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
		    timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) {
			proc_list_unlock();
			int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
			if (!ret) {
				proc_list_lock();
				memorystatus_jetsam_snapshot_last_timestamp = timestamp_now;
				proc_list_unlock();
			}
		} else {
			proc_list_unlock();
		}
	}

	return killed;
}
5340 
5341 /*
5342  * Jetsam a specific process.
5343  */
static bool
memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason)
{
	/*
	 * Jetsam one specific pid. Frees jetsam_reason on the early-out paths;
	 * otherwise ownership passes to memorystatus_do_kill(). Returns true
	 * if the process was killed.
	 */
	bool killed;
	proc_t p;
	uint64_t killtime = 0;
	uint64_t time_in_priority_band_secs = 0;
	uint64_t footprint_of_killed_proc;
	clock_sec_t     tv_sec;
	clock_usec_t    tv_usec;
	uint32_t        tv_msec;

	/* TODO - add a victim queue and push this into the main jetsam thread */

	p = proc_find(victim_pid);
	if (!p) {
		os_reason_free(jetsam_reason);
		return false;
	}

	proc_list_lock();

	if (_memstat_proc_was_killed(p)) {
		/*
		 * Someone beat us to this kill.
		 * Nothing to do here.
		 */
		proc_list_unlock();
		os_reason_free(jetsam_reason);
		proc_rele(p);
		return false;
	}
	/* Claim the victim so concurrent kill paths skip it. */
	p->p_memstat_state |= P_MEMSTAT_TERMINATED;

	if (memorystatus_jetsam_snapshot_count == 0) {
		memorystatus_init_jetsam_snapshot_locked(NULL, 0);
	}

	killtime = mach_absolute_time();
	absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
	tv_msec = tv_usec / 1000;

	memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);

	proc_list_unlock();

	/* How long the victim sat in its current priority band, in seconds. */
	absolutetime_to_nanoseconds(killtime - p->p_memstat_prio_start, &time_in_priority_band_secs);
	time_in_priority_band_secs /= NSEC_PER_SEC;

	killed = memorystatus_do_kill(p, cause, jetsam_reason, &footprint_of_killed_proc);

	memorystatus_log("%lu.%03d memorystatus: killing_specific_process pid %d [%s] (%s %d %llus rf:%s type:%s) %lluKB - memorystatus_available_pages: %llu\n",
	    (unsigned long)tv_sec, tv_msec, victim_pid, ((p && *p->p_name) ? p->p_name : "unknown"),
	    memstat_kill_cause_name[cause], (p ? p->p_memstat_effectivepriority: -1), time_in_priority_band_secs,
	    (p ? _memstat_relaunch_flags_description(p->p_memstat_relaunch_flags) : 0), _memstat_proc_type_description(p),
	    footprint_of_killed_proc >> 10, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);

	if (!killed) {
		/* Kill failed (victim exiting?): release our claim on it. */
		proc_list_lock();
		p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
		proc_list_unlock();
	}

	proc_rele(p);

	return killed;
}
5411 
5412 
5413 /*
5414  * Toggle the P_MEMSTAT_SKIP bit.
5415  * Takes the proc_list_lock.
5416  */
5417 void
proc_memstat_skip(proc_t p,boolean_t set)5418 proc_memstat_skip(proc_t p, boolean_t set)
5419 {
5420 #if DEVELOPMENT || DEBUG
5421 	if (p) {
5422 		proc_list_lock();
5423 		if (set == TRUE) {
5424 			p->p_memstat_state |= P_MEMSTAT_SKIP;
5425 		} else {
5426 			p->p_memstat_state &= ~P_MEMSTAT_SKIP;
5427 		}
5428 		proc_list_unlock();
5429 	}
5430 #else
5431 #pragma unused(p, set)
5432 	/*
5433 	 * do nothing
5434 	 */
5435 #endif /* DEVELOPMENT || DEBUG */
5436 	return;
5437 }
5438 
5439 
5440 #if CONFIG_JETSAM
5441 /*
5442  * This is invoked when cpulimits have been exceeded while in fatal mode.
5443  * The jetsam_flags do not apply as those are for memory related kills.
5444  * We call this routine so that the offending process is killed with
5445  * a non-zero exit status.
5446  */
void
jetsam_on_ledger_cpulimit_exceeded(void)
{
	int retval = 0;
	int jetsam_flags = 0;  /* make it obvious */
	proc_t p = current_proc();
	os_reason_t jetsam_reason = OS_REASON_NULL;

	memorystatus_log("memorystatus: killing %s [%d] due to cpulimit "
	    "violation\n", proc_best_name(p), proc_getpid(p));

	jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_CPULIMIT);
	if (jetsam_reason == OS_REASON_NULL) {
		/* Proceed with the kill anyway; the reason is best-effort. */
		memorystatus_log_error("memorystatus: unable to allocate memory for jetsam reason\n");
	}

	/* Kill the current process so it exits with a non-zero status. */
	retval = jetsam_do_kill(p, jetsam_flags, jetsam_reason);

	if (retval) {
		memorystatus_log_error("memorystatus: failed to kill current task (exiting?).\n");
	}
}
5469 
5470 #endif /* CONFIG_JETSAM */
5471 
5472 static void
memorystatus_get_task_memory_region_count(task_t task,uint64_t * count)5473 memorystatus_get_task_memory_region_count(task_t task, uint64_t *count)
5474 {
5475 	assert(task);
5476 	assert(count);
5477 
5478 	*count = get_task_memory_region_count(task);
5479 }
5480 
5481 
/*
 * Flag bits OR'd into memorystatus_vm_map_fork_pidwatch_val: the watched
 * pid lives in the low 32 bits, these outcome flags in the upper 32 bits.
 */
#define MEMORYSTATUS_VM_MAP_FORK_ALLOWED     0x100000000
#define MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED 0x200000000
5484 
5485 #if DEVELOPMENT || DEBUG
5486 
5487 /*
5488  * Sysctl only used to test memorystatus_allowed_vm_map_fork() path.
5489  *   set a new pidwatch value
5490  *	or
5491  *   get the current pidwatch value
5492  *
5493  * The pidwatch_val starts out with a PID to watch for in the map_fork path.
5494  * Its value is:
5495  * - OR'd with MEMORYSTATUS_VM_MAP_FORK_ALLOWED if we allow the map_fork.
5496  * - OR'd with MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED if we disallow the map_fork.
5497  * - set to -1ull if the map_fork() is aborted for other reasons.
5498  */
5499 
/* Watched pid (low 32 bits) plus outcome flags (high 32 bits); 0 = disabled. */
uint64_t memorystatus_vm_map_fork_pidwatch_val = 0;

static int sysctl_memorystatus_vm_map_fork_pidwatch SYSCTL_HANDLER_ARGS {
#pragma unused(oidp, arg1, arg2)

	uint64_t new_value = 0;
	uint64_t old_value = 0;
	int error = 0;

	/*
	 * The pid is held in the low 32 bits.
	 * The 'allowed' flags are in the upper 32 bits.
	 */
	old_value = memorystatus_vm_map_fork_pidwatch_val;

	error = sysctl_io_number(req, old_value, sizeof(old_value), &new_value, NULL);

	if (error || !req->newptr) {
		/*
		 * No new value passed in.
		 */
		return error;
	}

	/*
	 * A new pid was passed in via req->newptr.
	 * Ignore any attempt to set the higher order bits.
	 */
	memorystatus_vm_map_fork_pidwatch_val = new_value & 0xFFFFFFFF;
	memorystatus_log_debug("memorystatus: pidwatch old_value = 0x%llx, new_value = 0x%llx\n", old_value, new_value);

	return error;
}

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_map_fork_pidwatch, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, sysctl_memorystatus_vm_map_fork_pidwatch, "Q", "get/set pid watched for in vm_map_fork");
5536 
5537 
5538 /*
5539  * Record if a watched process fails to qualify for a vm_map_fork().
5540  */
5541 void
memorystatus_abort_vm_map_fork(task_t task)5542 memorystatus_abort_vm_map_fork(task_t task)
5543 {
5544 	if (memorystatus_vm_map_fork_pidwatch_val != 0) {
5545 		proc_t p = get_bsdtask_info(task);
5546 		if (p != NULL && memorystatus_vm_map_fork_pidwatch_val == (uint64_t)proc_getpid(p)) {
5547 			memorystatus_vm_map_fork_pidwatch_val = -1ull;
5548 		}
5549 	}
5550 }
5551 
5552 static void
set_vm_map_fork_pidwatch(task_t task,uint64_t x)5553 set_vm_map_fork_pidwatch(task_t task, uint64_t x)
5554 {
5555 	if (memorystatus_vm_map_fork_pidwatch_val != 0) {
5556 		proc_t p = get_bsdtask_info(task);
5557 		if (p && (memorystatus_vm_map_fork_pidwatch_val == (uint64_t)proc_getpid(p))) {
5558 			memorystatus_vm_map_fork_pidwatch_val |= x;
5559 		}
5560 	}
5561 }
5562 
5563 #else /* DEVELOPMENT || DEBUG */
5564 
5565 
5566 static void
set_vm_map_fork_pidwatch(task_t task,uint64_t x)5567 set_vm_map_fork_pidwatch(task_t task, uint64_t x)
5568 {
5569 #pragma unused(task)
5570 #pragma unused(x)
5571 }
5572 
5573 #endif /* DEVELOPMENT || DEBUG */
5574 
5575 /*
5576  * Called during EXC_RESOURCE handling when a process exceeds a soft
5577  * memory limit.  This is the corpse fork path and here we decide if
5578  * vm_map_fork will be allowed when creating the corpse.
5579  * The task being considered is suspended.
5580  *
5581  * By default, a vm_map_fork is allowed to proceed.
5582  *
5583  * A few simple policy assumptions:
5584  *	If the device has a zero system-wide task limit,
5585  *	then the vm_map_fork is allowed. macOS always has a zero
5586  *	system wide task limit (unless overriden by a boot-arg).
5587  *
5588  *	And if a process's memory footprint calculates less
5589  *	than or equal to quarter of the system-wide task limit,
5590  *	then the vm_map_fork is allowed.  This calculation
5591  *	is based on the assumption that a process can
5592  *	munch memory up to the system-wide task limit.
5593  *
5594  *      For watchOS, which has a low task limit, we use a
5595  *      different value. Current task limit has been reduced
5596  *      to 300MB and it's been decided the limit should be 200MB.
5597  */
/* Number of oversized ("large") corpses currently outstanding (watchOS path). */
int large_corpse_count = 0;
boolean_t
memorystatus_allowed_vm_map_fork(task_t task, bool *is_large)
{
	/*
	 * Decide whether a corpse vm_map_fork may proceed for `task`
	 * (see the policy comment above). Sets *is_large when the corpse
	 * is admitted under the larger watchOS one-at-a-time allowance.
	 */
	boolean_t is_allowed = TRUE;   /* default */
	uint64_t footprint_in_bytes;
	uint64_t max_allowed_bytes;
	thread_t self = current_thread();

	*is_large = false;

	/* Jetsam in high bands blocks any new corpse */
	if (os_atomic_load(&block_corpses, relaxed) != 0) {
		memorystatus_log("memorystatus_allowed_vm_map_fork: corpse for pid %d blocked by jetsam).\n", task_pid(task));
		ktriage_record(thread_tid(self), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_CORPSE, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_CORPSE_BLOCKED_JETSAM), 0 /* arg */);
		return FALSE;
	}

	/* No system-wide task limit (e.g. macOS): always allow. */
	if (max_task_footprint_mb == 0) {
		set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_ALLOWED);
		return is_allowed;
	}

	footprint_in_bytes = get_task_phys_footprint(task);

	/*
	 * Maximum is 1/4 of the system-wide task limit by default.
	 */
	max_allowed_bytes = ((uint64_t)max_task_footprint_mb * 1024 * 1024) >> 2;

#if XNU_TARGET_OS_WATCH
	/*
	 * For watches with > 1G, use a limit of 200MB and allow
	 * one corpse at a time of up to 300MB.
	 */
#define LARGE_CORPSE_LIMIT 1
	if (sane_size > 1 * 1024 * 1024 * 1024) {
		int cnt = large_corpse_count;
		/* CAS guards the single large-corpse slot against racing claimants. */
		if (footprint_in_bytes > 200 * 1024 * 1024 &&
		    footprint_in_bytes <= 300 * 1024 * 1024 &&
		    cnt < LARGE_CORPSE_LIMIT &&
		    OSCompareAndSwap(cnt, cnt + 1, &large_corpse_count)) {
			*is_large = true;
			max_allowed_bytes = MAX(max_allowed_bytes, 300 * 1024 * 1024);
		} else {
			max_allowed_bytes = MAX(max_allowed_bytes, 200 * 1024 * 1024);
		}
	}
#endif /* XNU_TARGET_OS_WATCH */

#if DEBUG || DEVELOPMENT
	/* Test override: allow corpses up to the full system-wide task limit. */
	if (corpse_threshold_system_limit) {
		max_allowed_bytes = (uint64_t)max_task_footprint_mb * (1UL << 20);
	}
#endif /* DEBUG || DEVELOPMENT */

	if (footprint_in_bytes > max_allowed_bytes) {
		memorystatus_log("memorystatus disallowed vm_map_fork %lld  %lld\n", footprint_in_bytes, max_allowed_bytes);
		set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED);
		ktriage_record(thread_tid(self), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_CORPSE, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_CORPSE_PROC_TOO_BIG), 0 /* arg */);
		return !is_allowed;
	}

	set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_ALLOWED);
	return is_allowed;
}
5664 
5665 void
memorystatus_get_task_page_counts(task_t task,uint32_t * footprint,uint32_t * max_footprint_lifetime,uint32_t * purgeable_pages)5666 memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages)
5667 {
5668 	assert(task);
5669 	assert(footprint);
5670 
5671 	uint64_t pages;
5672 
5673 	pages = (get_task_phys_footprint(task) / PAGE_SIZE_64);
5674 	assert(((uint32_t)pages) == pages);
5675 	*footprint = (uint32_t)pages;
5676 
5677 	if (max_footprint_lifetime) {
5678 		pages = (get_task_phys_footprint_lifetime_max(task) / PAGE_SIZE_64);
5679 		assert(((uint32_t)pages) == pages);
5680 		*max_footprint_lifetime = (uint32_t)pages;
5681 	}
5682 	if (purgeable_pages) {
5683 		pages = (get_task_purgeable_size(task) / PAGE_SIZE_64);
5684 		assert(((uint32_t)pages) == pages);
5685 		*purgeable_pages = (uint32_t)pages;
5686 	}
5687 }
5688 
5689 static void
memorystatus_get_task_phys_footprint_page_counts(task_t task,uint64_t * internal_pages,uint64_t * internal_compressed_pages,uint64_t * purgeable_nonvolatile_pages,uint64_t * purgeable_nonvolatile_compressed_pages,uint64_t * alternate_accounting_pages,uint64_t * alternate_accounting_compressed_pages,uint64_t * iokit_mapped_pages,uint64_t * page_table_pages,uint64_t * frozen_to_swap_pages,uint64_t * neural_nofootprint_total_pages)5690 memorystatus_get_task_phys_footprint_page_counts(task_t task,
5691     uint64_t *internal_pages, uint64_t *internal_compressed_pages,
5692     uint64_t *purgeable_nonvolatile_pages, uint64_t *purgeable_nonvolatile_compressed_pages,
5693     uint64_t *alternate_accounting_pages, uint64_t *alternate_accounting_compressed_pages,
5694     uint64_t *iokit_mapped_pages, uint64_t *page_table_pages, uint64_t *frozen_to_swap_pages,
5695     uint64_t *neural_nofootprint_total_pages)
5696 {
5697 	assert(task);
5698 
5699 	if (internal_pages) {
5700 		*internal_pages = (get_task_internal(task) / PAGE_SIZE_64);
5701 	}
5702 
5703 	if (internal_compressed_pages) {
5704 		*internal_compressed_pages = (get_task_internal_compressed(task) / PAGE_SIZE_64);
5705 	}
5706 
5707 	if (purgeable_nonvolatile_pages) {
5708 		*purgeable_nonvolatile_pages = (get_task_purgeable_nonvolatile(task) / PAGE_SIZE_64);
5709 	}
5710 
5711 	if (purgeable_nonvolatile_compressed_pages) {
5712 		*purgeable_nonvolatile_compressed_pages = (get_task_purgeable_nonvolatile_compressed(task) / PAGE_SIZE_64);
5713 	}
5714 
5715 	if (alternate_accounting_pages) {
5716 		*alternate_accounting_pages = (get_task_alternate_accounting(task) / PAGE_SIZE_64);
5717 	}
5718 
5719 	if (alternate_accounting_compressed_pages) {
5720 		*alternate_accounting_compressed_pages = (get_task_alternate_accounting_compressed(task) / PAGE_SIZE_64);
5721 	}
5722 
5723 	if (iokit_mapped_pages) {
5724 		*iokit_mapped_pages = (get_task_iokit_mapped(task) / PAGE_SIZE_64);
5725 	}
5726 
5727 	if (page_table_pages) {
5728 		*page_table_pages = (get_task_page_table(task) / PAGE_SIZE_64);
5729 	}
5730 
5731 	if (neural_nofootprint_total_pages) {
5732 		*neural_nofootprint_total_pages = (get_task_neural_nofootprint_total(task) / PAGE_SIZE_64);
5733 	}
5734 
5735 #if CONFIG_FREEZE
5736 	if (frozen_to_swap_pages) {
5737 		*frozen_to_swap_pages = (get_task_frozen_to_swap(task) / PAGE_SIZE_64);
5738 	}
5739 #else /* CONFIG_FREEZE */
5740 #pragma unused(frozen_to_swap_pages)
5741 #endif /* CONFIG_FREEZE */
5742 }
5743 
#if CONFIG_FREEZE
/*
 * Copies the source entry into the destination snapshot.
 * Returns true on success. Fails if the destination snapshot is full.
 * Caller must hold the proc list lock.
 */
static bool
memorystatus_jetsam_snapshot_copy_entry_locked(memorystatus_jetsam_snapshot_t *dst_snapshot, unsigned int dst_snapshot_size, const memorystatus_jetsam_snapshot_entry_t *src_entry)
{
	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
	assert(dst_snapshot);

	if (dst_snapshot->entry_count == dst_snapshot_size) {
		/* Destination snapshot is full. Can not be updated until it is consumed. */
		return false;
	}

	/* First entry into an empty snapshot (re)initializes its header. */
	if (dst_snapshot->entry_count == 0) {
		memorystatus_init_jetsam_snapshot_header(dst_snapshot);
	}

	memorystatus_jetsam_snapshot_entry_t *slot = &dst_snapshot->entries[dst_snapshot->entry_count];
	dst_snapshot->entry_count += 1;
	memcpy(slot, src_entry, sizeof(*slot));
	return true;
}
#endif /* CONFIG_FREEZE */
5768 
5769 static bool
memorystatus_init_jetsam_snapshot_entry_with_kill_locked(memorystatus_jetsam_snapshot_t * snapshot,proc_t p,uint32_t kill_cause,uint64_t killtime,memorystatus_jetsam_snapshot_entry_t ** entry)5770 memorystatus_init_jetsam_snapshot_entry_with_kill_locked(memorystatus_jetsam_snapshot_t *snapshot, proc_t p, uint32_t kill_cause, uint64_t killtime, memorystatus_jetsam_snapshot_entry_t **entry)
5771 {
5772 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
5773 	memorystatus_jetsam_snapshot_entry_t *snapshot_list = snapshot->entries;
5774 	size_t i = snapshot->entry_count;
5775 
5776 	if (memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[i], (snapshot->js_gencount)) == TRUE) {
5777 		*entry = &snapshot_list[i];
5778 		(*entry)->killed       = kill_cause;
5779 		(*entry)->jse_killtime = killtime;
5780 
5781 		snapshot->entry_count = i + 1;
5782 		return true;
5783 	}
5784 	return false;
5785 }
5786 
/*
 * This routine only acts on the global jetsam event snapshot.
 * Updating the process's entry can race when the memorystatus_thread
 * has chosen to kill a process that is racing to exit on another core.
 *
 * Records the kill cause/time for `p` in the active jetsam snapshot,
 * refreshing the entry's page counts as of the kill. If no entry exists
 * (process launched after the snapshot was taken), a new one is appended
 * when space remains. With CONFIG_FREEZE, app kills are mirrored into the
 * freezer snapshot. Caller must hold the proc list lock.
 */
static void
memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime)
{
	memorystatus_jetsam_snapshot_entry_t *entry = NULL;
	memorystatus_jetsam_snapshot_t *snapshot    = NULL;
	memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL;

	unsigned int i;
#if CONFIG_FREEZE
	bool copied_to_freezer_snapshot = false;
#endif /* CONFIG_FREEZE */

	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);

	if (memorystatus_jetsam_snapshot_count == 0) {
		/*
		 * No active snapshot.
		 * Nothing to do.
		 */
		goto exit;
	}

	/*
	 * Sanity check as this routine should only be called
	 * from a jetsam kill path.
	 */
	assert(kill_cause != 0 && killtime != 0);

	snapshot       = memorystatus_jetsam_snapshot;
	snapshot_list  = memorystatus_jetsam_snapshot->entries;

#if JETSAM_ZPRINT_SNAPSHOT
	/*
	 * Collect the snapshot zprint info if we've reached the right priority
	 */
	if (p->p_memstat_effectivepriority >= (int)jzs_trigger_band &&
	    jzs_gencount != snapshot->js_gencount) {
		memorystatus_collect_jetsam_snapshot_zprint();
		jzs_gencount = snapshot->js_gencount;
	}
#endif

	/* Linear scan for an existing entry matching this pid. */
	for (i = 0; i < memorystatus_jetsam_snapshot_count; i++) {
		if (snapshot_list[i].pid == proc_getpid(p)) {
			entry = &snapshot_list[i];

			if (entry->killed || entry->jse_killtime) {
				/*
				 * We apparently raced on the exit path
				 * for this process, as it's snapshot entry
				 * has already recorded a kill.
				 */
				assert(entry->killed && entry->jse_killtime);
				break;
			}

			/*
			 * Update the entry we just found in the snapshot.
			 */

			entry->killed       = kill_cause;
			entry->jse_killtime = killtime;
			entry->jse_gencount = snapshot->js_gencount;
			entry->jse_idle_delta = p->p_memstat_idle_delta;
			entry->jse_prio_start = p->p_memstat_prio_start;
#if CONFIG_FREEZE
			entry->jse_thaw_count = p->p_memstat_thaw_count;
			entry->jse_freeze_skip_reason = p->p_memstat_freeze_skip_reason;
#else /* CONFIG_FREEZE */
			entry->jse_thaw_count = 0;
			entry->jse_freeze_skip_reason = kMemorystatusFreezeSkipReasonNone;
#endif /* CONFIG_FREEZE */

			/*
			 * If a process has moved between bands since snapshot was
			 * initialized, then likely these fields changed too.
			 */
			if (entry->priority != p->p_memstat_effectivepriority) {
				strlcpy(entry->name, p->p_name, sizeof(entry->name));
				entry->priority  = p->p_memstat_effectivepriority;
				entry->state     = _memstat_build_state(p);
				entry->user_data = p->p_memstat_userdata;
				entry->fds       = p->p_fd.fd_nfiles;
			}

			/*
			 * Always update the page counts on a kill.
			 */

			uint32_t pages              = 0;
			uint32_t max_pages_lifetime = 0;
			uint32_t purgeable_pages    = 0;

			memorystatus_get_task_page_counts(proc_task(p), &pages, &max_pages_lifetime, &purgeable_pages);
			entry->pages              = (uint64_t)pages;
			entry->max_pages_lifetime = (uint64_t)max_pages_lifetime;
			entry->purgeable_pages    = (uint64_t)purgeable_pages;

			uint64_t internal_pages                        = 0;
			uint64_t internal_compressed_pages             = 0;
			uint64_t purgeable_nonvolatile_pages           = 0;
			uint64_t purgeable_nonvolatile_compressed_pages = 0;
			uint64_t alternate_accounting_pages            = 0;
			uint64_t alternate_accounting_compressed_pages = 0;
			uint64_t iokit_mapped_pages                    = 0;
			uint64_t page_table_pages                      = 0;
			uint64_t frozen_to_swap_pages                  = 0;
			uint64_t neural_nofootprint_total_pages        = 0;

			memorystatus_get_task_phys_footprint_page_counts(proc_task(p), &internal_pages, &internal_compressed_pages,
			    &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages,
			    &alternate_accounting_pages, &alternate_accounting_compressed_pages,
			    &iokit_mapped_pages, &page_table_pages, &frozen_to_swap_pages, &neural_nofootprint_total_pages);

			entry->jse_internal_pages = internal_pages;
			entry->jse_internal_compressed_pages = internal_compressed_pages;
			entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages;
			entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages;
			entry->jse_alternate_accounting_pages = alternate_accounting_pages;
			entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages;
			entry->jse_iokit_mapped_pages = iokit_mapped_pages;
			entry->jse_page_table_pages = page_table_pages;
			entry->jse_frozen_to_swap_pages = frozen_to_swap_pages;
			entry->jse_neural_nofootprint_total_pages = neural_nofootprint_total_pages;

			uint64_t region_count = 0;
			memorystatus_get_task_memory_region_count(proc_task(p), &region_count);
			entry->jse_memory_region_count = region_count;
			entry->csflags = proc_getcsflags(p);
			goto exit;
		}
	}

	if (entry == NULL) {
		/*
		 * The entry was not found in the snapshot, so the process must have
		 * launched after the snapshot was initialized.
		 * Let's try to append the new entry.
		 */
		if (memorystatus_jetsam_snapshot_count < memorystatus_jetsam_snapshot_max) {
			/*
			 * A populated snapshot buffer exists
			 * and there is room to init a new entry.
			 */
			assert(memorystatus_jetsam_snapshot_count == snapshot->entry_count);

			if (memorystatus_init_jetsam_snapshot_entry_with_kill_locked(snapshot, p, kill_cause, killtime, &entry)) {
				memorystatus_jetsam_snapshot_count++;

				if (memorystatus_jetsam_snapshot_count >= memorystatus_jetsam_snapshot_max) {
					/*
					 * We just used the last slot in the snapshot buffer.
					 * We only want to log it once... so we do it here
					 * when we notice we've hit the max.
					 */
					memorystatus_log_error("memorystatus: WARNING snapshot buffer is full, count %d\n", memorystatus_jetsam_snapshot_count);
				}
			}
		}
	}

exit:
	if (entry) {
#if CONFIG_FREEZE
		if (memorystatus_jetsam_use_freezer_snapshot && isApp(p)) {
			/* This is an app kill. Record it in the freezer snapshot so dasd can incorporate this in its recommendations. */
			copied_to_freezer_snapshot = memorystatus_jetsam_snapshot_copy_entry_locked(memorystatus_jetsam_snapshot_freezer, memorystatus_jetsam_snapshot_freezer_max, entry);
			if (copied_to_freezer_snapshot && memorystatus_jetsam_snapshot_freezer->entry_count == memorystatus_jetsam_snapshot_freezer_max) {
				/*
				 * We just used the last slot in the freezer snapshot buffer.
				 * We only want to log it once... so we do it here
				 * when we notice we've hit the max.
				 */
				memorystatus_log_error("memorystatus: WARNING freezer snapshot buffer is full, count %zu\n",
				    memorystatus_jetsam_snapshot_freezer->entry_count);
			}
		}
#endif /* CONFIG_FREEZE */
	} else {
		/*
		 * If we reach here, the snapshot buffer could not be updated.
		 * Most likely, the buffer is full, in which case we would have
		 * logged a warning in the previous call.
		 *
		 * For now, we will stop appending snapshot entries.
		 * When the buffer is consumed, the snapshot state will reset.
		 */

		memorystatus_log_error(
			"memorystatus_update_jetsam_snapshot_entry_locked: failed to update pid %d, priority %d, count %d\n",
			proc_getpid(p), p->p_memstat_effectivepriority, memorystatus_jetsam_snapshot_count);

#if CONFIG_FREEZE
		/* We still attempt to record this in the freezer snapshot */
		if (memorystatus_jetsam_use_freezer_snapshot && isApp(p)) {
			snapshot = memorystatus_jetsam_snapshot_freezer;
			if (snapshot->entry_count < memorystatus_jetsam_snapshot_freezer_max) {
				copied_to_freezer_snapshot = memorystatus_init_jetsam_snapshot_entry_with_kill_locked(snapshot, p, kill_cause, killtime, &entry);
				if (copied_to_freezer_snapshot && memorystatus_jetsam_snapshot_freezer->entry_count == memorystatus_jetsam_snapshot_freezer_max) {
					/*
					 * We just used the last slot in the freezer snapshot buffer.
					 * We only want to log it once... so we do it here
					 * when we notice we've hit the max.
					 */
					memorystatus_log_error("memorystatus: WARNING freezer snapshot buffer is full, count %zu\n",
					    memorystatus_jetsam_snapshot_freezer->entry_count);
				}
			}
		}
#endif /* CONFIG_FREEZE */
	}

	return;
}
6006 
6007 uint32_t
memorystatus_get_available_page_count(void)6008 memorystatus_get_available_page_count(void)
6009 {
6010 	return os_atomic_load(&memorystatus_available_pages, relaxed);
6011 }
6012 
/*
 * Publish a new available-page count and re-evaluate dependent subsystems
 * (VM pressure, freezer wakeup, jetsam thread wakeup).
 *
 * NOTE(review): may be called with page-queue locks held and preemption
 * disabled (see the freezer comment below) — no mutexes may be taken here.
 */
void
memorystatus_update_available_page_count(uint32_t available_page_count)
{
	os_atomic_store(&memorystatus_available_pages, available_page_count,
	    relaxed);
#if VM_PRESSURE_EVENTS
	/*
	 * Since memorystatus_available_pages changes, we should
	 * re-evaluate the pressure levels on the system and
	 * check if we need to wake the pressure thread.
	 * We also update memorystatus_level in that routine.
	 */
	vm_pressure_response();
#endif /* VM_PRESSURE_EVENTS */
#if CONFIG_FREEZE
	/*
	 * We can't grab the freezer_mutex here even though that synchronization would be correct to inspect
	 * the # of frozen processes and wakeup the freezer thread. Reason being that we come here into this
	 * code with (possibly) the page-queue locks held and preemption disabled. So trying to grab a mutex here
	 * will result in the "mutex with preemption disabled" panic.
	 */

	if (memorystatus_freeze_thread_should_run()) {
		/*
		 * The freezer thread is usually woken up by some user-space call i.e. pid_hibernate(any process).
		 * That trigger isn't invoked often enough and so we are enabling this explicit wakeup here.
		 */
		if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
			thread_wakeup((event_t)&memorystatus_freeze_wakeup);
		}
	}
#endif /* CONFIG_FREEZE */
	_memstat_consider_waking_jetsam_thread();
}
6047 
6048 static boolean_t
memorystatus_init_jetsam_snapshot_entry_locked(proc_t p,memorystatus_jetsam_snapshot_entry_t * entry,uint64_t gencount)6049 memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry, uint64_t gencount)
6050 {
6051 	clock_sec_t                     tv_sec;
6052 	clock_usec_t                    tv_usec;
6053 	uint32_t pages = 0;
6054 	uint32_t max_pages_lifetime = 0;
6055 	uint32_t purgeable_pages = 0;
6056 	uint64_t internal_pages                         = 0;
6057 	uint64_t internal_compressed_pages              = 0;
6058 	uint64_t purgeable_nonvolatile_pages            = 0;
6059 	uint64_t purgeable_nonvolatile_compressed_pages = 0;
6060 	uint64_t alternate_accounting_pages             = 0;
6061 	uint64_t alternate_accounting_compressed_pages  = 0;
6062 	uint64_t iokit_mapped_pages                     = 0;
6063 	uint64_t page_table_pages                       = 0;
6064 	uint64_t frozen_to_swap_pages                   = 0;
6065 	uint64_t neural_nofootprint_total_pages         = 0;
6066 	uint64_t region_count                           = 0;
6067 	uint64_t cids[COALITION_NUM_TYPES];
6068 	uint32_t trust                                  = 0;
6069 	kern_return_t ret                               = 0;
6070 	memset(entry, 0, sizeof(memorystatus_jetsam_snapshot_entry_t));
6071 
6072 	entry->pid = proc_getpid(p);
6073 	strlcpy(&entry->name[0], p->p_name, sizeof(entry->name));
6074 	entry->priority = p->p_memstat_effectivepriority;
6075 
6076 	memorystatus_get_task_page_counts(proc_task(p), &pages, &max_pages_lifetime, &purgeable_pages);
6077 	entry->pages              = (uint64_t)pages;
6078 	entry->max_pages_lifetime = (uint64_t)max_pages_lifetime;
6079 	entry->purgeable_pages    = (uint64_t)purgeable_pages;
6080 
6081 	memorystatus_get_task_phys_footprint_page_counts(proc_task(p), &internal_pages, &internal_compressed_pages,
6082 	    &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages,
6083 	    &alternate_accounting_pages, &alternate_accounting_compressed_pages,
6084 	    &iokit_mapped_pages, &page_table_pages, &frozen_to_swap_pages, &neural_nofootprint_total_pages);
6085 
6086 	entry->jse_internal_pages = internal_pages;
6087 	entry->jse_internal_compressed_pages = internal_compressed_pages;
6088 	entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages;
6089 	entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages;
6090 	entry->jse_alternate_accounting_pages = alternate_accounting_pages;
6091 	entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages;
6092 	entry->jse_iokit_mapped_pages = iokit_mapped_pages;
6093 	entry->jse_page_table_pages = page_table_pages;
6094 	entry->jse_frozen_to_swap_pages = frozen_to_swap_pages;
6095 	entry->jse_neural_nofootprint_total_pages = neural_nofootprint_total_pages;
6096 
6097 	memorystatus_get_task_memory_region_count(proc_task(p), &region_count);
6098 	entry->jse_memory_region_count = region_count;
6099 
6100 	entry->state     = _memstat_build_state(p);
6101 	entry->user_data = p->p_memstat_userdata;
6102 	proc_getexecutableuuid(p, &entry->uuid[0], sizeof(entry->uuid));
6103 	entry->fds       = p->p_fd.fd_nfiles;
6104 
6105 	absolutetime_to_microtime(get_task_cpu_time(proc_task(p)), &tv_sec, &tv_usec);
6106 	entry->cpu_time.tv_sec = (int64_t)tv_sec;
6107 	entry->cpu_time.tv_usec = (int64_t)tv_usec;
6108 
6109 	assert(p->p_stats != NULL);
6110 	entry->jse_starttime =  p->p_stats->ps_start;   /* abstime process started */
6111 	entry->jse_killtime = 0;                        /* abstime jetsam chose to kill process */
6112 	entry->killed       = 0;                        /* the jetsam kill cause */
6113 	entry->jse_gencount = gencount;                 /* indicates a pass through jetsam thread, when process was targeted to be killed */
6114 
6115 	entry->jse_idle_delta = p->p_memstat_idle_delta; /* Most recent timespan spent in idle-band */
6116 	entry->jse_prio_start = p->p_memstat_prio_start; /* Time moved to current band */
6117 
6118 #if CONFIG_FREEZE
6119 	entry->jse_freeze_skip_reason = p->p_memstat_freeze_skip_reason;
6120 	entry->jse_thaw_count = p->p_memstat_thaw_count;
6121 #else /* CONFIG_FREEZE */
6122 	entry->jse_thaw_count = 0;
6123 	entry->jse_freeze_skip_reason = kMemorystatusFreezeSkipReasonNone;
6124 #endif /* CONFIG_FREEZE */
6125 
6126 	proc_coalitionids(p, cids);
6127 	entry->jse_coalition_jetsam_id = cids[COALITION_TYPE_JETSAM];
6128 	entry->csflags = proc_getcsflags(p);
6129 	ret = get_trust_level_kdp(get_task_pmap(proc_task(p)), &trust);
6130 	if (ret != KERN_SUCCESS) {
6131 		trust = KCDATA_INVALID_CS_TRUST_LEVEL;
6132 	}
6133 	entry->cs_trust_level = trust;
6134 	return TRUE;
6135 }
6136 
6137 static void
memorystatus_init_snapshot_vmstats(memorystatus_jetsam_snapshot_t * snapshot)6138 memorystatus_init_snapshot_vmstats(memorystatus_jetsam_snapshot_t *snapshot)
6139 {
6140 	kern_return_t kr = KERN_SUCCESS;
6141 	mach_msg_type_number_t  count = HOST_VM_INFO64_COUNT;
6142 	vm_statistics64_data_t  vm_stat;
6143 
6144 	if ((kr = host_statistics64(host_self(), HOST_VM_INFO64, (host_info64_t)&vm_stat, &count)) != KERN_SUCCESS) {
6145 		memorystatus_log_error("memorystatus_init_jetsam_snapshot_stats: host_statistics64 failed with %d\n", kr);
6146 		memset(&snapshot->stats, 0, sizeof(snapshot->stats));
6147 	} else {
6148 		snapshot->stats.free_pages      = vm_stat.free_count;
6149 		snapshot->stats.active_pages    = vm_stat.active_count;
6150 		snapshot->stats.inactive_pages  = vm_stat.inactive_count;
6151 		snapshot->stats.throttled_pages = vm_stat.throttled_count;
6152 		snapshot->stats.purgeable_pages = vm_stat.purgeable_count;
6153 		snapshot->stats.wired_pages     = vm_stat.wire_count;
6154 
6155 		snapshot->stats.speculative_pages = vm_stat.speculative_count;
6156 		snapshot->stats.filebacked_pages  = vm_stat.external_page_count;
6157 		snapshot->stats.anonymous_pages   = vm_stat.internal_page_count;
6158 		snapshot->stats.compressions      = vm_stat.compressions;
6159 		snapshot->stats.decompressions    = vm_stat.decompressions;
6160 		snapshot->stats.compressor_pages  = vm_stat.compressor_page_count;
6161 		snapshot->stats.total_uncompressed_pages_in_compressor = vm_stat.total_uncompressed_pages_in_compressor;
6162 	}
6163 
6164 	get_zone_map_size(&snapshot->stats.zone_map_size, &snapshot->stats.zone_map_capacity);
6165 
6166 	bzero(snapshot->stats.largest_zone_name, sizeof(snapshot->stats.largest_zone_name));
6167 	get_largest_zone_info(snapshot->stats.largest_zone_name, sizeof(snapshot->stats.largest_zone_name),
6168 	    &snapshot->stats.largest_zone_size);
6169 }
6170 
6171 /*
6172  * Collect vm statistics at boot.
6173  * Called only once (see kern_exec.c)
6174  * Data can be consumed at any time.
6175  */
6176 void
memorystatus_init_at_boot_snapshot()6177 memorystatus_init_at_boot_snapshot()
6178 {
6179 	memorystatus_init_snapshot_vmstats(&memorystatus_at_boot_snapshot);
6180 	memorystatus_at_boot_snapshot.entry_count = 0;
6181 	memorystatus_at_boot_snapshot.notification_time = 0;   /* updated when consumed */
6182 	memorystatus_at_boot_snapshot.snapshot_time = mach_absolute_time();
6183 }
6184 
6185 static void
memorystatus_init_jetsam_snapshot_header(memorystatus_jetsam_snapshot_t * snapshot)6186 memorystatus_init_jetsam_snapshot_header(memorystatus_jetsam_snapshot_t *snapshot)
6187 {
6188 	memorystatus_init_snapshot_vmstats(snapshot);
6189 	snapshot->snapshot_time = mach_absolute_time();
6190 	snapshot->notification_time = 0;
6191 	snapshot->js_gencount = 0;
6192 }
6193 
6194 static void
memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t * od_snapshot,uint32_t ods_list_count)6195 memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snapshot, uint32_t ods_list_count )
6196 {
6197 	proc_t p, next_p;
6198 	unsigned int b = 0, i = 0;
6199 
6200 	memorystatus_jetsam_snapshot_t *snapshot = NULL;
6201 	memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL;
6202 	unsigned int snapshot_max = 0;
6203 
6204 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
6205 
6206 	if (od_snapshot) {
6207 		/*
6208 		 * This is an on_demand snapshot
6209 		 */
6210 		snapshot      = od_snapshot;
6211 		snapshot_list = od_snapshot->entries;
6212 		snapshot_max  = ods_list_count;
6213 	} else {
6214 		/*
6215 		 * This is a jetsam event snapshot
6216 		 */
6217 		snapshot      = memorystatus_jetsam_snapshot;
6218 		snapshot_list = memorystatus_jetsam_snapshot->entries;
6219 		snapshot_max  = memorystatus_jetsam_snapshot_max;
6220 	}
6221 
6222 	memorystatus_init_jetsam_snapshot_header(snapshot);
6223 
6224 	next_p = memorystatus_get_first_proc_locked(&b, TRUE);
6225 	while (next_p) {
6226 		p = next_p;
6227 		next_p = memorystatus_get_next_proc_locked(&b, p, TRUE);
6228 
6229 		if (FALSE == memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[i], snapshot->js_gencount)) {
6230 			continue;
6231 		}
6232 
6233 		if (++i == snapshot_max) {
6234 			break;
6235 		}
6236 	}
6237 
6238 	/* Log launchd and kernel_task as well to see more context, even though jetsam doesn't apply to them. */
6239 	if (i < snapshot_max) {
6240 		memorystatus_init_jetsam_snapshot_entry_locked(initproc, &snapshot_list[i], snapshot->js_gencount);
6241 		i++;
6242 	}
6243 
6244 	if (i < snapshot_max) {
6245 		memorystatus_init_jetsam_snapshot_entry_locked(kernproc, &snapshot_list[i], snapshot->js_gencount);
6246 		i++;
6247 	}
6248 
6249 	snapshot->entry_count = i;
6250 
6251 	if (!od_snapshot) {
6252 		/* update the system buffer count */
6253 		memorystatus_jetsam_snapshot_count = i;
6254 	}
6255 }
6256 
/* Number of entries appended to the end of the jetsam snapshot for context (launchd and kernel_task). */
static const int memorystatus_artificial_snapshot_entry_count = 2;
6259 
6260 #if DEVELOPMENT || DEBUG
6261 
6262 /*
6263  * Fills an array with the given pids in the order they are seen in a
6264  * jetsam band.
6265  */
6266 static int
memorystatus_get_sort_order(unsigned int bucket_index,pid_t * pids,pid_t * order,size_t num_pids)6267 memorystatus_get_sort_order(
6268 	unsigned int bucket_index,
6269 	pid_t *pids,
6270 	pid_t *order,
6271 	size_t num_pids)
6272 {
6273 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
6274 
6275 	proc_t p = NULL;
6276 	size_t i, out_idx = 0;
6277 
6278 	/*
6279 	 * Read out the order of all the pids into the order array.
6280 	 */
6281 	p = memorystatus_get_first_proc_locked(&bucket_index, FALSE);
6282 	while (p) {
6283 		for (i = 0; i < num_pids; i++) {
6284 			if (pids[i] == proc_getpid(p)) {
6285 				if (out_idx >= num_pids) {
6286 					/* Did we somehow see something twice? */
6287 					return EINVAL;
6288 				}
6289 				order[out_idx] = pids[i];
6290 				out_idx++;
6291 			}
6292 		}
6293 		p = memorystatus_get_next_proc_locked(&bucket_index, p, FALSE);
6294 	}
6295 	return 0;
6296 }
6297 
6298 /*
6299  * Triggers a sort_order on a specified jetsam priority band.
6300  * This is for testing only, used to force a path through the sort
6301  * function.
6302  */
6303 static int
memorystatus_cmd_test_jetsam_sort(int priority,int sort_order,user_addr_t expected_order_user,size_t expected_order_user_len)6304 memorystatus_cmd_test_jetsam_sort(int priority,
6305     int sort_order,
6306     user_addr_t expected_order_user,
6307     size_t expected_order_user_len)
6308 {
6309 	pid_t *expected_order, *actual_order;
6310 	int error = 0;
6311 	size_t num_pids = expected_order_user_len / sizeof(pid_t);
6312 
6313 	if (num_pids > 512) { /* Just so we don't allocate some huge buffer */
6314 		return EINVAL;
6315 	}
6316 
6317 	if ((priority < 0) || (priority >= MEMSTAT_BUCKET_COUNT)) {
6318 		return EINVAL;
6319 	}
6320 
6321 	expected_order = kalloc_data_tag(num_pids * sizeof(pid_t), Z_WAITOK, VM_KERN_MEMORY_DIAG);
6322 	actual_order = kalloc_data_tag(num_pids * sizeof(pid_t), Z_WAITOK, VM_KERN_MEMORY_DIAG);
6323 
6324 	error = copyin(expected_order_user, expected_order, expected_order_user_len);
6325 	if (error != 0) {
6326 		goto err;
6327 	}
6328 
6329 	/*
6330 	 * Acquire lock before sorting so we can check the sort order
6331 	 * while still holding the lock.
6332 	 */
6333 	proc_list_lock();
6334 
6335 	memstat_sort_bucket_locked(priority, sort_order);
6336 
6337 	if (expected_order_user != CAST_USER_ADDR_T(NULL) && expected_order_user_len > 0) {
6338 		bzero(actual_order, num_pids * sizeof(pid_t));
6339 		error = memorystatus_get_sort_order(priority, expected_order, actual_order, num_pids);
6340 		/* Even if we get an error, we still want to copyout what we had */
6341 		copyout(actual_order, expected_order_user, num_pids * sizeof(pid_t));
6342 	}
6343 
6344 	proc_list_unlock();
6345 
6346 err:
6347 	kfree_data(expected_order, num_pids * sizeof(pid_t));
6348 	kfree_data(actual_order, num_pids * sizeof(pid_t));
6349 	return error;
6350 }
6351 
6352 #endif /* DEVELOPMENT || DEBUG */
6353 
/*
 * Prepare the process to be killed (set state, update snapshot) and kill it.
 */
/*
 * NOTE(review): counter presumably tracks kills avoided because purging
 * reclaimed enough memory first — incremented outside this view; confirm
 * against memorystatus_kill_proc below.
 */
static uint64_t memorystatus_purge_before_jetsam_success = 0;
6358 
#if SOCKETS
/*
 * Notify networking-related file descriptors of process `p` about a
 * memorystatus event (`status` is the jetsam kill cause). Walks p's fd
 * table and forwards the status to NECP policy fds and Skywalk channels.
 * Always returns 1.
 */
static int
networking_memstatus_callout(proc_t p, uint32_t status)
{
	struct fileproc *fp;

	/*
	 * proc list lock NOT held
	 * proc lock NOT held
	 * a reference on the proc has been held / shall be dropped by the caller.
	 */
	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED);
	LCK_MTX_ASSERT(&p->p_mlock, LCK_MTX_ASSERT_NOTOWNED);

	proc_fdlock(p);

	fdt_foreach(fp, p) {
		switch (FILEGLOB_DTYPE(fp->fp_glob)) {
#if NECP
		case DTYPE_NETPOLICY:
			necp_fd_memstatus(p, status,
			    (struct necp_fd_data *)fp_get_data(fp));
			break;
#endif /* NECP */
#if SKYWALK
		case DTYPE_CHANNEL:
			kern_channel_memstatus(p, status,
			    (struct kern_channel *)fp_get_data(fp));
			break;
#endif /* SKYWALK */
		default:
			/* not a networking fd; nothing to notify */
			break;
		}
	}
	proc_fdunlock(p);

	return 1;
}
#endif /* SOCKETS */
6398 
6399 static bool
memorystatus_kill_proc(proc_t p,uint32_t cause,os_reason_t jetsam_reason,bool * killed,uint64_t * footprint_out)6400 memorystatus_kill_proc(proc_t p, uint32_t cause, os_reason_t jetsam_reason, bool *killed, uint64_t *footprint_out)
6401 {
6402 	pid_t aPid = 0;
6403 	uint32_t aPid_ep = 0;
6404 
6405 	uint64_t        killtime = 0;
6406 	uint64_t        time_in_priority_band_secs = 0;
6407 	clock_sec_t     tv_sec;
6408 	clock_usec_t    tv_usec;
6409 	uint32_t        tv_msec;
6410 	bool retval = false;
6411 
6412 	aPid = proc_getpid(p);
6413 	aPid_ep = p->p_memstat_effectivepriority;
6414 
6415 	if (cause != kMemorystatusKilledVnodes && cause != kMemorystatusKilledZoneMapExhaustion) {
6416 		/*
6417 		 * Genuine memory pressure and not other (vnode/zone) resource exhaustion.
6418 		 */
6419 		boolean_t success = FALSE;
6420 		uint64_t num_pages_purged;
6421 		uint64_t num_pages_reclaimed = 0;
6422 		uint64_t num_pages_unsecluded = 0;
6423 
6424 		networking_memstatus_callout(p, cause);
6425 		num_pages_purged = vm_purgeable_purge_task_owned(proc_task(p));
6426 		num_pages_reclaimed += num_pages_purged;
6427 #if CONFIG_SECLUDED_MEMORY
6428 		if (cause == kMemorystatusKilledVMPageShortage &&
6429 		    vm_page_secluded_count > 0 &&
6430 		    task_can_use_secluded_mem(proc_task(p), FALSE)) {
6431 			/*
6432 			 * We're about to kill a process that has access
6433 			 * to the secluded pool.  Drain that pool into the
6434 			 * free or active queues to make these pages re-appear
6435 			 * as "available", which might make us no longer need
6436 			 * to kill that process.
6437 			 * Since the secluded pool does not get refilled while
6438 			 * a process has access to it, it should remain
6439 			 * drained.
6440 			 */
6441 			num_pages_unsecluded = vm_page_secluded_drain();
6442 			num_pages_reclaimed += num_pages_unsecluded;
6443 		}
6444 #endif /* CONFIG_SECLUDED_MEMORY */
6445 
6446 		if (num_pages_reclaimed) {
6447 			/*
6448 			 * We actually reclaimed something and so let's
6449 			 * check if we need to continue with the kill.
6450 			 */
6451 			if (cause == kMemorystatusKilledHiwat) {
6452 				uint64_t footprint_in_bytes = get_task_phys_footprint(proc_task(p));
6453 				uint64_t memlimit_in_bytes  = (((uint64_t)p->p_memstat_memlimit) * 1024ULL * 1024ULL);  /* convert MB to bytes */
6454 				success = (footprint_in_bytes <= memlimit_in_bytes);
6455 			} else {
6456 				success = !(memorystatus_get_available_page_count() < memorystatus_get_soft_memlimit_page_shortage_threshold());
6457 #if CONFIG_SECLUDED_MEMORY
6458 				if (!success && num_pages_unsecluded) {
6459 					/*
6460 					 * We just drained the secluded pool
6461 					 * because we're about to kill a
6462 					 * process that has access to it.
6463 					 * This is an important process and
6464 					 * we'd rather not kill it unless
6465 					 * absolutely necessary, so declare
6466 					 * success even if draining the pool
6467 					 * did not quite get us out of the
6468 					 * "pressure" level but still got
6469 					 * us out of the "critical" level.
6470 					 */
6471 					success = !(
6472 						memorystatus_get_available_page_count() <
6473 						memorystatus_get_critical_page_shortage_threshold());
6474 				}
6475 #endif /* CONFIG_SECLUDED_MEMORY */
6476 			}
6477 
6478 			if (success) {
6479 				memorystatus_purge_before_jetsam_success++;
6480 
6481 				memorystatus_log_info("memorystatus: reclaimed %llu pages (%llu purged, %llu unsecluded) from pid %d [%s] and avoided %s\n",
6482 				    num_pages_reclaimed, num_pages_purged, num_pages_unsecluded, aPid, ((p && *p->p_name) ? p->p_name : "unknown"), memstat_kill_cause_name[cause]);
6483 
6484 				*killed = false;
6485 				*footprint_out = num_pages_reclaimed + num_pages_purged + num_pages_unsecluded;
6486 
6487 				return true;
6488 			}
6489 		}
6490 	}
6491 
6492 	killtime = mach_absolute_time();
6493 	absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
6494 	tv_msec = tv_usec / 1000;
6495 
6496 	proc_list_lock();
6497 	memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
6498 	proc_list_unlock();
6499 
6500 	char kill_reason_string[128];
6501 
6502 	if (cause == kMemorystatusKilledHiwat) {
6503 		strlcpy(kill_reason_string, "killing_highwater_process", 128);
6504 	} else {
6505 		if (aPid_ep == JETSAM_PRIORITY_IDLE) {
6506 			strlcpy(kill_reason_string, "killing_idle_process", 128);
6507 		} else {
6508 			strlcpy(kill_reason_string, "killing_top_process", 128);
6509 		}
6510 	}
6511 
6512 	/*
6513 	 * memorystatus_do_kill drops a reference, so take another one so we can
6514 	 * continue to use this exit reason even after memorystatus_do_kill()
6515 	 * returns
6516 	 */
6517 	os_reason_ref(jetsam_reason);
6518 
6519 	retval = memorystatus_do_kill(p, cause, jetsam_reason, footprint_out);
6520 	*killed = retval;
6521 
6522 	absolutetime_to_nanoseconds(killtime - p->p_memstat_prio_start, &time_in_priority_band_secs);
6523 	time_in_priority_band_secs /= NSEC_PER_SEC;
6524 
6525 	memorystatus_log("memorystatus: %s pid %d [%s] (%s %d %llus rf:%s type:%s) %lluKB - memorystatus_available_pages: %llu compressor_size:%u\n",
6526 	    kill_reason_string,
6527 	    aPid, proc_best_name(p),
6528 	    memstat_kill_cause_name[cause], aPid_ep, time_in_priority_band_secs,
6529 	    _memstat_relaunch_flags_description(p->p_memstat_relaunch_flags), _memstat_proc_type_description(p),
6530 	    (*footprint_out) >> 10, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
6531 
6532 	return retval;
6533 }
6534 
/*
 * Jetsam the first eligible process in the priority queue.
 *
 * Walks the jetsam bands from lowest priority upward, up to and including
 * 'max_priority', and kills the first process passing the filters encoded
 * in 'options':
 *   - MEMSTAT_ONLY_LONG_IDLE: only reapable processes
 *     (see _memstat_proc_is_reapable()).
 *   - MEMSTAT_ONLY_SWAPPABBLE: only tasks that donate their own pages.
 *   - MEMSTAT_SORT_BUCKET: sort the relevant band before scanning.
 *
 * Out parameters (each may be NULL except errors_out usage below):
 *   priority_out         - victim's effective priority on a kill.
 *   errors_out           - incremented once per failed kill attempt.
 *   memory_reclaimed_out - memory freed by the kill (or by a purge that
 *                          made the kill unnecessary).
 *
 * Consumes one reference on 'jetsam_reason'.
 * Returns true if a process was killed.
 */
static bool
memstat_kill_top_process(uint32_t cause, os_reason_t jetsam_reason,
    int32_t max_priority, memstat_kill_options_t options,
    int32_t *priority_out, uint32_t *errors_out, uint64_t *memory_reclaimed_out)
{
	pid_t aPid;
	proc_t p = PROC_NULL, next_p = PROC_NULL;
	bool new_snapshot = false, force_new_snapshot = false, killed = false, freed_mem = false;
	unsigned int i = 0;
	uint32_t aPid_ep;
	uint64_t footprint_of_killed_proc = 0;

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
	    MEMORYSTATUS_LOG_AVAILABLE_PAGES);

	bool only_long_idle = options & MEMSTAT_ONLY_LONG_IDLE;
	bool only_swappable = options & MEMSTAT_ONLY_SWAPPABBLE;
	bool sort_bucket = options & MEMSTAT_SORT_BUCKET;

#if CONFIG_JETSAM
	if (sort_bucket) {
		(void)memstat_sort_bucket(JETSAM_PRIORITY_FOREGROUND, memstat_jetsam_fg_sort_order);
	}

	if (memory_reclaimed_out) {
		*memory_reclaimed_out = 0;
	}

	force_new_snapshot = false;

#else /* CONFIG_JETSAM */
	if (sort_bucket) {
		(void)memstat_sort_bucket(JETSAM_PRIORITY_IDLE, JETSAM_SORT_IDLE_DEFAULT);
	}

	/*
	 * And, because we are here under extreme circumstances, we force a snapshot even for
	 * IDLE kills.
	 */
	force_new_snapshot = true;

#endif /* CONFIG_JETSAM */

	/* A thread limited to low bands must not kill above JETSAM_PRIORITY_MAIL. */
	if (cause != kMemorystatusKilledZoneMapExhaustion &&
	    jetsam_current_thread() != NULL &&
	    jetsam_current_thread()->limit_to_low_bands &&
	    max_priority > JETSAM_PRIORITY_MAIL) {
		max_priority = JETSAM_PRIORITY_MAIL;
	}

	_memstat_refresh_oldest_reapable_proc_info();

	proc_list_lock();

	next_p = memorystatus_get_first_proc_locked(&i, TRUE);
	while (next_p && (next_p->p_memstat_effectivepriority <= max_priority)) {
		p = next_p;
		next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);


		aPid = proc_getpid(p);
		aPid_ep = p->p_memstat_effectivepriority;

		if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP)) {
			continue;   /* with lock held */
		}

		if (cause == kMemorystatusKilledVnodes) {
			/*
			 * If the system runs out of vnodes, we systematically jetsam
			 * processes in hopes of stumbling onto a vnode gain that helps
			 * the system recover.  The process that happens to trigger
			 * this path has no known relationship to the vnode shortage.
			 * Deadlock avoidance: attempt to safeguard the caller.
			 */

			if (p == current_proc()) {
				/* do not jetsam the current process */
				continue;
			}
		}

		if (only_swappable && !task_donates_own_pages(proc_task(p))) {
			continue;
		}

		if (only_long_idle) {
			if (!_memstat_proc_is_reapable(p)) {
				memorystatus_log_debug("memorystatus: memstat_kill_top_process: skipping non-reapable process %s [%d]\n",
				    proc_best_name(p), p->p_pid);
				continue;
			}
			memorystatus_log_debug("memorystatus: memstat_kill_top_process: found reapable long-idle process %s [%d]\n",
			    proc_best_name(p), p->p_pid);
		}

#if !CONFIG_JETSAM
		if (max_priority == JETSAM_PRIORITY_IDLE &&
		    ((p->p_memstat_dirty & (P_DIRTY_ALLOW_IDLE_EXIT | P_DIRTY_IS_DIRTY | P_DIRTY_TERMINATED)) != (P_DIRTY_ALLOW_IDLE_EXIT))) {
			/*
			 * This process is in the idle band but is not clean+idle-exitable or
			 * managed+assertion-less. Skip it.
			 */
			memorystatus_log_error("memorystatus: skipping idle but not idle-exitable process "
			    "%s [%d] (0x%x)\n", proc_best_name(p), proc_getpid(p), p->p_memstat_state);
			continue;
		}
#endif /* !CONFIG_JETSAM */
#if CONFIG_FREEZE
		/* Skip processes currently locked by the freezer. */
		if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
			continue;
		}
#endif
		if (proc_ref(p, true) == p) {
			/*
			 * Mark as terminated so that if exit1() indicates success, but the process (for example)
			 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
			 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
			 * acquisition of the proc lock.
			 */
			p->p_memstat_state |= P_MEMSTAT_TERMINATED;
		} else {
			/*
			 * We need to restart the search again because
			 * proc_ref _can_ drop the proc_list lock
			 * and we could have lost our stored next_p via
			 * an exit() on another core.
			 */
			i = 0;
			next_p = memorystatus_get_first_proc_locked(&i, TRUE);
			continue;
		}

		/*
		 * Capture a snapshot if none exists and:
		 * - we are forcing a new snapshot creation, either because:
		 *      - on a particular platform we need these snapshots every time, OR
		 *	- a boot-arg/embedded device tree property has been set.
		 * - priority was not requested (this is something other than an ambient kill)
		 * - the priority was requested *and* the targeted process is not at idle priority
		 */
		if ((memorystatus_jetsam_snapshot_count == 0) &&
		    (force_new_snapshot || memorystatus_idle_snapshot || ((!priority_out) || (priority_out && (aPid_ep != JETSAM_PRIORITY_IDLE))))) {
			memorystatus_init_jetsam_snapshot_locked(NULL, 0);
			new_snapshot = true;
		}

		proc_list_unlock();

		freed_mem = memorystatus_kill_proc(p, cause, jetsam_reason, &killed, &footprint_of_killed_proc); /* purged and/or killed 'p' */
		/* Success? */
		if (freed_mem) {
			if (memory_reclaimed_out) {
				*memory_reclaimed_out = footprint_of_killed_proc;
			}
			if (killed) {
				if (priority_out) {
					*priority_out = aPid_ep;
				}
			} else {
				/* purged */
				proc_list_lock();
				p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
				proc_list_unlock();
			}
			proc_rele(p);
			goto exit;
		}

		/*
		 * Failure - first unwind the state,
		 * then fall through to restart the search.
		 */
		proc_list_lock();
		proc_rele(p);
		p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
		p->p_memstat_state |= P_MEMSTAT_ERROR;
		if (errors_out) {
			*errors_out += 1;
		}

		/* Restart from the head; the failed proc is now marked P_MEMSTAT_ERROR. */
		i = 0;
		next_p = memorystatus_get_first_proc_locked(&i, TRUE);
	}

	proc_list_unlock();

exit:
	os_reason_free(jetsam_reason);

	if (!killed) {
		/* Clear snapshot if freshly captured and no target was found */
		if (new_snapshot) {
			proc_list_lock();
			memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
			proc_list_unlock();
		}
#if CONFIG_JETSAM
		if (only_long_idle) {
			_memstat_reaper_end_sweep();
		}
#endif
	}

#if CONFIG_JETSAM
	if (killed && only_long_idle) {
		_memstat_reaper_record_kill(footprint_of_killed_proc);
	}
#endif

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
	    MEMORYSTATUS_LOG_AVAILABLE_PAGES, killed ? aPid : 0, killed, footprint_of_killed_proc);

	return killed;
}
6753 
/*
 * Refresh the cached "oldest reapable process" bookkeeping used by the
 * reaper:
 *   - memstat_oldest_reapable_proc_prio_start: the prio_start timestamp of
 *     the first reapable process found (or ..._NONE if none).
 *   - memstat_oldest_reapable_proc_will_be_reapable_at_ts_matu: when that
 *     process crosses the minimum-age threshold.
 *   - memstat_oldest_reapable_proc_info_expiration_ts_matu: when this cached
 *     data goes stale (memstat_reaper_rescan_secs from now).
 *
 * The walk is rate-limited: if the cached data has not yet expired, this
 * returns without scanning.
 *
 * NOTE(review): the early break below takes the first reapable process
 * encountered in band order; a reapable process in a higher band could have
 * an older prio_start.  Confirm this approximation is intentional.
 */
static
void
_memstat_refresh_oldest_reapable_proc_info()
{
	uint64_t oldest_prio_start = MEMSTAT_OLDEST_REAPABLE_PROC_PRIO_START_NONE;
	proc_t p = PROC_NULL, next_p = PROC_NULL;
	unsigned int i = 0;
	uint64_t mat = mach_absolute_time();
	uint64_t rescan_timeout_duration_matu;


	/* If we're still within 'memstat_reaper_rescan_secs' of the last process-list walk,
	 * don't do another walk yet, and just use the existing information.
	 */
	if (mat < memstat_oldest_reapable_proc_info_expiration_ts_matu) {
		memorystatus_log_debug("memorystatus: _memstat_refresh_oldest_reapable_proc_info: re-using existing data\n");
		return;
	} else {
		memorystatus_log_debug("memorystatus: _memstat_refresh_oldest_reapable_proc_info: rescanning proc list\n");
	}

	proc_list_lock();

	next_p = memorystatus_get_first_proc_locked(&i, TRUE);
	while (next_p) {
		p = next_p;
		next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);

		/* Since the process list is sorted in priority order, once we find the first
		 * process that is out of the reaper's acceptible range, we can skip the rest
		 */
		if (p->p_memstat_effectivepriority > memstat_reaper_max_priority) {
			break;
		}

		if (_memstat_proc_is_reapable(p)) {
			uint64_t proc_prio_start = p->p_memstat_prio_start;
			if (proc_prio_start < oldest_prio_start) {
				oldest_prio_start = proc_prio_start;
				/* Since the process list is sorted in age order within priority bands,
				 * the first process will be the oldest one, and we can bail out and skip the rest
				 */
				break;
			}
		}
	}

	proc_list_unlock();

	memstat_oldest_reapable_proc_prio_start = oldest_prio_start;

	if (memstat_oldest_reapable_proc_prio_start != MEMSTAT_OLDEST_REAPABLE_PROC_PRIO_START_NONE) {
		/* Compute when the oldest candidate will satisfy the minimum-age requirement. */
		uint64_t min_age_matu;
		nanoseconds_to_absolutetime((memstat_reaper_min_age_secs * NSEC_PER_SEC), &min_age_matu);
		memstat_oldest_reapable_proc_will_be_reapable_at_ts_matu = memstat_oldest_reapable_proc_prio_start + min_age_matu;
	}

	nanoseconds_to_absolutetime((memstat_reaper_rescan_secs * NSEC_PER_SEC), &rescan_timeout_duration_matu);
	memstat_oldest_reapable_proc_info_expiration_ts_matu = mat + rescan_timeout_duration_matu;
}
6814 
/*
 * Decide whether 'proc' is eligible for the idle reaper.
 *
 * A process is reapable only if ALL of the following hold:
 *   - its effective priority is at or below memstat_reaper_max_priority;
 *   - its relaunch-probability class is accepted by
 *     memstat_reaper_reap_relaunch_mask (processes with no Low/Med/High
 *     flag are treated as "unknown" via a synthetic mask bit);
 *   - it has sat in its current priority band longer than the applicable
 *     minimum age (memstat_reaper_min_age_apps_secs for applications,
 *     memstat_reaper_min_age_secs otherwise).
 *
 * Returns true when all checks pass; each failing check logs the reason
 * at debug level and returns false.
 */
static bool
_memstat_proc_is_reapable(proc_t proc)
{
	uint32_t priority_band;
	uint64_t time_in_priority_band_secs;
	uint32_t relaunch_probability_acceptable_mask;

	/*
	 *  To be potentially reapable, the process
	 *  - must be in or below the max reapable priority and
	 *  - must not have a relaunch probability of High or Medium (per memstat_reaper_reap_relaunch_mask)
	 *  - must have been in that priority band longer than the reaper minimum age threshold
	 *  - must have been in that priority band longer than the reaper minimum age threshold for applications, if process is an application
	 */
	priority_band = proc->p_memstat_effectivepriority;
	if (priority_band > memstat_reaper_max_priority) {
		memorystatus_log_debug("_memstat_proc_is_reapable: %s [%d] not reapable because priority (%d) is above threshold (%d)\n",
		    proc_best_name(proc), proc->p_pid, priority_band, memstat_reaper_max_priority);
		return false;
	}

	uint32_t relaunch_flags = proc->p_memstat_relaunch_flags;
	// There's no explicit flag for "unknown" relaunch probability, and we need one for our control bitmask.
	// So if none of the Low Medium or High bits are set, we set the next higher bit as the "unknown relaunch probability" bit
	// and then test all the bits at once, below, with a bitwise-and.
	if ((relaunch_flags & (P_MEMSTAT_RELAUNCH_LOW | P_MEMSTAT_RELAUNCH_MED | P_MEMSTAT_RELAUNCH_HIGH)) == 0) {
		relaunch_flags |= MEMORYSTATUS_REAPER_REAP_RELAUNCH_MASK_UNKNOWN; // The bit for 'unknown' is the one just to the left (above) of High, e.g. 0x08
	}
	relaunch_probability_acceptable_mask = relaunch_flags & memstat_reaper_reap_relaunch_mask;

	if (relaunch_probability_acceptable_mask == 0) {
		memorystatus_log_debug("_memstat_proc_is_reapable: %s [%d] not reapable because relaunch probability bitmask (0x%02X) does not match with the memstat_reaper_reap_relaunch_mask (0x%02X).\n",
		    proc_best_name(proc), proc->p_pid, relaunch_flags, memstat_reaper_reap_relaunch_mask);
		return false;
	}

	/* Age = time since the process entered its current priority band. */
	absolutetime_to_nanoseconds(mach_absolute_time() - proc->p_memstat_prio_start, &time_in_priority_band_secs);
	time_in_priority_band_secs /= NSEC_PER_SEC;

	if (_memstat_proc_is_application(proc)) {
		if ((time_in_priority_band_secs < memstat_reaper_min_age_apps_secs)) {
			memorystatus_log_debug("_memstat_proc_is_reapable: %s [%d] not reapable because it is an application and age (%llu) is below min age for apps (%d)\n",
			    proc_best_name(proc), proc->p_pid, time_in_priority_band_secs, memstat_reaper_min_age_apps_secs);
			return false;
		}
	} else {
		if (time_in_priority_band_secs < memstat_reaper_min_age_secs) {
			memorystatus_log_debug("_memstat_proc_is_reapable: %s [%d] not reapable because age (%llu) is below min age (%d)\n",
			    proc_best_name(proc), proc->p_pid, time_in_priority_band_secs, memstat_reaper_min_age_secs);
			return false;
		}
	}

	memorystatus_log_debug("_memstat_proc_is_reapable: %s [%d] is reapable; priority=%d, age=%d, relaunch_probability_acceptable_mask=0x%02X, type=%s\n",
	    proc_best_name(proc), proc->p_pid, priority_band, (uint32_t)(time_in_priority_band_secs), relaunch_probability_acceptable_mask,
	    _memstat_proc_type_description(proc));
	return true;
}
6873 
6874 static bool
_memstat_proc_is_application(proc_t proc)6875 _memstat_proc_is_application(proc_t proc)
6876 {
6877 	bool isApp = false;
6878 
6879 	task_t task = proc_task(proc);
6880 	if (task != NULL) {
6881 		isApp = task_is_app( task);
6882 	}
6883 
6884 	return isApp;
6885 }
6886 
/*
 * Jetsam aggressively: kill multiple processes in one sweep.
 *
 * Walks all bands from JETSAM_PRIORITY_IDLE up to and through
 * 'priority_max', killing eligible processes until 'max_kills' have been
 * killed, the bands are exhausted, or lenient-mode logic stops the spree.
 *
 * Parameters:
 *   cause        - kMemorystatusKilled* cause; also used to build the
 *                  exit reason.
 *   aggr_count   - ordinal of this aggressive pass (logging only).
 *   priority_max - highest band to kill through (inclusive).
 *   max_kills    - upper bound on kills for this sweep.
 *   errors       - out: incremented once per failed kill attempt.
 *   memory_reclaimed - out: total footprint of all killed processes.
 *
 * Returns true if at least one process was killed.
 */
static bool
memorystatus_kill_processes_aggressive(uint32_t cause, int aggr_count,
    int32_t priority_max, int max_kills, uint32_t *errors, uint64_t *memory_reclaimed)
{
	pid_t aPid;
	proc_t p = PROC_NULL, next_p = PROC_NULL;
	boolean_t new_snapshot = FALSE, killed = FALSE;
	int kill_count = 0;
	unsigned int priority_band = JETSAM_PRIORITY_IDLE;
	int32_t aPid_ep = 0;
	unsigned int memorystatus_level_snapshot = 0;
	uint64_t killtime = 0;
	uint64_t time_in_priority_band_secs = 0;
	clock_sec_t     tv_sec;
	clock_usec_t    tv_usec;
	uint32_t        tv_msec;
	os_reason_t jetsam_reason = OS_REASON_NULL;
	uint64_t footprint_of_killed_proc = 0;

	*memory_reclaimed = 0;

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
	    MEMORYSTATUS_LOG_AVAILABLE_PAGES, priority_max);

	if (priority_max >= JETSAM_PRIORITY_FOREGROUND) {
		/*
		 * Check if aggressive jetsam has been asked to kill upto or beyond the
		 * JETSAM_PRIORITY_FOREGROUND bucket. If yes, sort the FG band based on
		 * coalition footprint.
		 */
		memstat_sort_bucket(JETSAM_PRIORITY_FOREGROUND, memstat_jetsam_fg_sort_order);
	}

	jetsam_reason = os_reason_create(OS_REASON_JETSAM, cause);
	if (jetsam_reason == OS_REASON_NULL) {
		/* Proceed anyway; the kill path tolerates a NULL exit reason. */
		memorystatus_log_error("memorystatus_kill_processes_aggressive: failed to allocate exit reason\n");
	}
	memorystatus_log("memorystatus: aggressively killing up to %d processes below band %d.\n", max_kills, priority_max + 1);
	proc_list_lock();

	next_p = memorystatus_get_first_proc_locked(&priority_band, TRUE);
	while (next_p) {
		if (proc_list_exited(next_p) ||
		    ((unsigned int)(next_p->p_memstat_effectivepriority) != priority_band)) {
			/*
			 * We have raced with next_p running on another core.
			 * It may be exiting or it may have moved to a different
			 * jetsam priority band.  This means we have lost our
			 * place in line while traversing the jetsam list.  We
			 * attempt to recover by rewinding to the beginning of the band
			 * we were already traversing.  By doing this, we do not guarantee
			 * that no process escapes this aggressive march, but we can make
			 * skipping an entire range of processes less likely. (PR-21069019)
			 */

			memorystatus_log_debug(
				"memorystatus: aggressive%d: rewinding band %d, %s(%d) moved or exiting.\n",
				aggr_count, priority_band, (*next_p->p_name ? next_p->p_name : "unknown"), proc_getpid(next_p));

			next_p = memorystatus_get_first_proc_locked(&priority_band, TRUE);
			continue;
		}

		p = next_p;
		next_p = memorystatus_get_next_proc_locked(&priority_band, p, TRUE);

		if (p->p_memstat_effectivepriority > priority_max) {
			/*
			 * Bail out of this killing spree if we have
			 * reached beyond the priority_max jetsam band.
			 * That is, we kill up to and through the
			 * priority_max jetsam band.
			 */
			proc_list_unlock();
			goto exit;
		}

		aPid = proc_getpid(p);
		aPid_ep = p->p_memstat_effectivepriority;

		if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP)) {
			continue;
		}

		/*
		 * Capture a snapshot if none exists.
		 */
		if (memorystatus_jetsam_snapshot_count == 0) {
			memorystatus_init_jetsam_snapshot_locked(NULL, 0);
			new_snapshot = TRUE;
		}

		/*
		 * Mark as terminated so that if exit1() indicates success, but the process (for example)
		 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
		 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
		 * acquisition of the proc lock.
		 */
		p->p_memstat_state |= P_MEMSTAT_TERMINATED;

		killtime = mach_absolute_time();
		/* tv_sec/tv_usec/tv_msec: computed for parity with other kill paths; not read below. */
		absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
		tv_msec = tv_usec / 1000;

		/* Shift queue, update stats */
		memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);

		/*
		 * In order to kill the target process, we will drop the proc_list_lock.
		 * To guaranteee that p and next_p don't disappear out from under the lock,
		 * we must take a ref on both.
		 * If we cannot get a reference, then it's likely we've raced with
		 * that process exiting on another core.
		 */
		if (proc_ref(p, true) == p) {
			if (next_p) {
				while (next_p && (proc_ref(next_p, true) != next_p)) {
					proc_t temp_p;

					/*
					 * We must have raced with next_p exiting on another core.
					 * Recover by getting the next eligible process in the band.
					 */

					memorystatus_log_debug(
						"memorystatus: aggressive%d: skipping %d [%s] (exiting?)\n",
						aggr_count, proc_getpid(next_p), (*next_p->p_name ? next_p->p_name : "(unknown)"));

					temp_p = next_p;
					next_p = memorystatus_get_next_proc_locked(&priority_band, temp_p, TRUE);
				}
			}
			proc_list_unlock();

			if (aPid_ep <= system_procs_aging_band &&
			    (p->p_memstat_relaunch_flags & P_MEMSTAT_RELAUNCH_HIGH)) {
				memorystatus_log("memorystatus: killing %s [%d] in band %d "
				    "with high relaunch probability\n",
				    proc_best_name(p), aPid, aPid_ep);
			}
			/* How long the victim sat in its current band, for the kill log. */
			absolutetime_to_nanoseconds(killtime - p->p_memstat_prio_start, &time_in_priority_band_secs);
			time_in_priority_band_secs /= NSEC_PER_SEC;
			memorystatus_log(
				"memorystatus: %s%d pid %d [%s] (%s %d %llus rf:%s type:%s) - memorystatus_available_pages: %llu\n",
				((aPid_ep == JETSAM_PRIORITY_IDLE) ? "killing_idle_process_aggressive" : "killing_top_process_aggressive"),
				aggr_count, aPid, proc_best_name(p),
				memstat_kill_cause_name[cause], aPid_ep,
				time_in_priority_band_secs, _memstat_relaunch_flags_description(p->p_memstat_relaunch_flags), _memstat_proc_type_description(p),
				(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);

			memorystatus_level_snapshot = memorystatus_level;

			/*
			 * memorystatus_do_kill() drops a reference, so take another one so we can
			 * continue to use this exit reason even after memorystatus_do_kill()
			 * returns.
			 */
			os_reason_ref(jetsam_reason);
			killed = memorystatus_do_kill(p, cause, jetsam_reason, &footprint_of_killed_proc);

			/* Success? */
			if (killed) {
				*memory_reclaimed += footprint_of_killed_proc;
				proc_rele(p);
				kill_count++;
				p = NULL;
				killed = FALSE;

				/*
				 * Continue the killing spree.
				 */
				proc_list_lock();
				if (next_p) {
					proc_rele(next_p);
				}

				if (kill_count == max_kills) {
					memorystatus_log_info(
						"memorystatus: giving up aggressive kill after killing "
						"%d processes below band %d.\n",
						max_kills, priority_max + 1);
					break;
				}

				if (aPid_ep == JETSAM_PRIORITY_FOREGROUND && memorystatus_aggressive_jetsam_lenient == TRUE) {
					if (memorystatus_level > memorystatus_level_snapshot && ((memorystatus_level - memorystatus_level_snapshot) >= AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD)) {
#if DEVELOPMENT || DEBUG
						memorystatus_log_info("Disabling Lenient mode after one-time deployment.\n");
#endif /* DEVELOPMENT || DEBUG */
						memorystatus_aggressive_jetsam_lenient = FALSE;
						break;
					}
				}

				continue;
			}

			/*
			 * Failure - first unwind the state,
			 * then fall through to restart the search.
			 */
			proc_list_lock();
			proc_rele(p);
			if (next_p) {
				proc_rele(next_p);
			}
			p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
			p->p_memstat_state |= P_MEMSTAT_ERROR;
			*errors += 1;
			p = NULL;
		}

		/*
		 * Failure - restart the search at the beginning of
		 * the band we were already traversing.
		 *
		 * We might have raced with "p" exiting on another core, resulting in no
		 * ref on "p".  Or, we may have failed to kill "p".
		 *
		 * Either way, we fall thru to here, leaving the proc in the
		 * P_MEMSTAT_TERMINATED or P_MEMSTAT_ERROR state.
		 *
		 * And, we hold the the proc_list_lock at this point.
		 */

		next_p = memorystatus_get_first_proc_locked(&priority_band, TRUE);
	}

	proc_list_unlock();

exit:
	os_reason_free(jetsam_reason);

	/* Clear snapshot if freshly captured and no target was found */
	if (new_snapshot && (kill_count == 0)) {
		proc_list_lock();
		memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
		proc_list_unlock();
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
	    MEMORYSTATUS_LOG_AVAILABLE_PAGES, 0, kill_count, *memory_reclaimed);

	return kill_count > 0;
}
7135 
/*
 * Jetsam the first process found over its high-water-mark memory limit.
 *
 * Scans all bands from lowest priority; a process is a candidate only if it
 * has a positive memlimit (p_memstat_memlimit, in MB) and its physical
 * footprint currently exceeds that limit.  At most one process is purged or
 * killed per call.
 *
 * Out parameters:
 *   errors           - incremented once per failed kill attempt.
 *   purged           - set TRUE if the candidate was purged (memory freed
 *                      without a kill).
 *   memory_reclaimed - bytes freed by a successful kill; 0 otherwise.
 *
 * Returns TRUE if a process was killed.
 */
static boolean_t
memorystatus_kill_hiwat_proc(uint32_t *errors, boolean_t *purged, uint64_t *memory_reclaimed)
{
	pid_t aPid = 0;
	proc_t p = PROC_NULL, next_p = PROC_NULL;
	bool new_snapshot = false, killed = false, freed_mem = false;
	unsigned int i = 0;
	uint32_t aPid_ep;
	os_reason_t jetsam_reason = OS_REASON_NULL;
	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_START,
	    MEMORYSTATUS_LOG_AVAILABLE_PAGES);

	jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_HIGHWATER);
	if (jetsam_reason == OS_REASON_NULL) {
		/* Proceed anyway; the kill path tolerates a NULL exit reason. */
		memorystatus_log_error("memorystatus_kill_hiwat_proc: failed to allocate exit reason\n");
	}

	proc_list_lock();

	next_p = memorystatus_get_first_proc_locked(&i, TRUE);
	while (next_p) {
		uint64_t footprint_in_bytes = 0;
		uint64_t memlimit_in_bytes  = 0;
		boolean_t skip = 0;

		p = next_p;
		next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);

		aPid = proc_getpid(p);
		aPid_ep = p->p_memstat_effectivepriority;

		if (p->p_memstat_state  & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP)) {
			continue;
		}

		/* skip if no limit set */
		if (p->p_memstat_memlimit <= 0) {
			continue;
		}

		footprint_in_bytes = get_task_phys_footprint(proc_task(p));
		memlimit_in_bytes  = (((uint64_t)p->p_memstat_memlimit) * 1024ULL * 1024ULL);   /* convert MB to bytes */
		skip = (footprint_in_bytes <= memlimit_in_bytes);

#if CONFIG_FREEZE
		/* Also skip processes currently locked by the freezer.
		 * (The else branch re-assigns skip's existing value; kept as-is.) */
		if (!skip) {
			if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
				skip = TRUE;
			} else {
				skip = FALSE;
			}
		}
#endif

		if (skip) {
			continue;
		} else {
			if (memorystatus_jetsam_snapshot_count == 0) {
				memorystatus_init_jetsam_snapshot_locked(NULL, 0);
				new_snapshot = true;
			}

			if (proc_ref(p, true) == p) {
				/*
				 * Mark as terminated so that if exit1() indicates success, but the process (for example)
				 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
				 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
				 * acquisition of the proc lock.
				 */
				p->p_memstat_state |= P_MEMSTAT_TERMINATED;

				proc_list_unlock();
			} else {
				/*
				 * We need to restart the search again because
				 * proc_ref _can_ drop the proc_list lock
				 * and we could have lost our stored next_p via
				 * an exit() on another core.
				 */
				i = 0;
				next_p = memorystatus_get_first_proc_locked(&i, TRUE);
				continue;
			}

			/* Reused as an out-parameter: bytes freed by the kill. */
			footprint_in_bytes = 0;
			freed_mem = memorystatus_kill_proc(p, kMemorystatusKilledHiwat, jetsam_reason, &killed, &footprint_in_bytes); /* purged and/or killed 'p' */

			/* Success? */
			if (freed_mem) {
				if (!killed) {
					/* purged 'p'..don't reset HWM candidate count */
					*purged = TRUE;

					proc_list_lock();
					p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
					proc_list_unlock();
				} else {
					*memory_reclaimed = footprint_in_bytes;
				}
				proc_rele(p);
				goto exit;
			}
			/*
			 * Failure - first unwind the state,
			 * then fall through to restart the search.
			 */
			proc_list_lock();
			proc_rele(p);
			p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
			p->p_memstat_state |= P_MEMSTAT_ERROR;
			*errors += 1;

			i = 0;
			next_p = memorystatus_get_first_proc_locked(&i, TRUE);
		}
	}

	proc_list_unlock();

exit:
	os_reason_free(jetsam_reason);

	if (!killed) {
		*memory_reclaimed = 0;

		/* Clear snapshot if freshly captured and no target was found */
		if (new_snapshot) {
			proc_list_lock();
			memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
			proc_list_unlock();
		}
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_END,
	    MEMORYSTATUS_LOG_AVAILABLE_PAGES, killed ? aPid : 0, killed, *memory_reclaimed, 0);

	return killed;
}
7274 
7275 /*
7276  * Jetsam a process pinned in the elevated band.
7277  *
7278  * Return:  true -- a pinned process was jetsammed
7279  *	    false -- no pinned process was jetsammed
7280  */
7281 boolean_t
memorystatus_kill_elevated_process(uint32_t cause,os_reason_t jetsam_reason,unsigned int band,int aggr_count,uint32_t * errors,uint64_t * memory_reclaimed)7282 memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, unsigned int band, int aggr_count, uint32_t *errors, uint64_t *memory_reclaimed)
7283 {
7284 	pid_t aPid = 0;
7285 	proc_t p = PROC_NULL, next_p = PROC_NULL;
7286 	boolean_t new_snapshot = FALSE, killed = FALSE;
7287 	int kill_count = 0;
7288 	uint32_t aPid_ep;
7289 	uint64_t killtime = 0;
7290 	uint64_t time_in_priority_band_secs = 0;
7291 	clock_sec_t     tv_sec;
7292 	clock_usec_t    tv_usec;
7293 	uint32_t        tv_msec;
7294 	uint64_t footprint_of_killed_proc = 0;
7295 
7296 
7297 	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
7298 	    MEMORYSTATUS_LOG_AVAILABLE_PAGES);
7299 
7300 #if CONFIG_FREEZE
7301 	boolean_t consider_frozen_only = FALSE;
7302 
7303 	if (band == (unsigned int) memorystatus_freeze_jetsam_band) {
7304 		consider_frozen_only = TRUE;
7305 	}
7306 #endif /* CONFIG_FREEZE */
7307 
7308 	proc_list_lock();
7309 
7310 	next_p = memorystatus_get_first_proc_locked(&band, FALSE);
7311 	while (next_p) {
7312 		p = next_p;
7313 		next_p = memorystatus_get_next_proc_locked(&band, p, FALSE);
7314 
7315 		aPid = proc_getpid(p);
7316 		aPid_ep = p->p_memstat_effectivepriority;
7317 
7318 		/*
7319 		 * Only pick a process pinned in this elevated band
7320 		 */
7321 		if (!_memstat_proc_is_elevated(p)) {
7322 			continue;
7323 		}
7324 
7325 		if (p->p_memstat_state  & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP)) {
7326 			continue;
7327 		}
7328 
7329 #if CONFIG_FREEZE
7330 		if (consider_frozen_only && !_memstat_proc_is_frozen(p)) {
7331 			continue;
7332 		}
7333 
7334 		if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
7335 			continue;
7336 		}
7337 #endif /* CONFIG_FREEZE */
7338 
7339 #if DEVELOPMENT || DEBUG
7340 		memorystatus_log_info(
7341 			"jetsam: elevated%d process pid %d [%s] - memorystatus_available_pages: %d\n",
7342 			aggr_count, aPid, (*p->p_name ? p->p_name : "unknown"), MEMORYSTATUS_LOG_AVAILABLE_PAGES);
7343 #endif /* DEVELOPMENT || DEBUG */
7344 
7345 		if (memorystatus_jetsam_snapshot_count == 0) {
7346 			memorystatus_init_jetsam_snapshot_locked(NULL, 0);
7347 			new_snapshot = TRUE;
7348 		}
7349 
7350 		p->p_memstat_state |= P_MEMSTAT_TERMINATED;
7351 
7352 		killtime = mach_absolute_time();
7353 		absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
7354 		tv_msec = tv_usec / 1000;
7355 
7356 		memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
7357 
7358 		if (proc_ref(p, true) == p) {
7359 			proc_list_unlock();
7360 
7361 			/*
7362 			 * memorystatus_do_kill drops a reference, so take another one so we can
7363 			 * continue to use this exit reason even after memorystatus_do_kill()
7364 			 * returns
7365 			 */
7366 			os_reason_ref(jetsam_reason);
7367 			killed = memorystatus_do_kill(p, cause, jetsam_reason, &footprint_of_killed_proc);
7368 
7369 			absolutetime_to_nanoseconds(killtime - p->p_memstat_prio_start, &time_in_priority_band_secs);
7370 			time_in_priority_band_secs /= NSEC_PER_SEC;
7371 			memorystatus_log("%lu.%03d memorystatus: killing_top_process_elevated%d pid %d [%s] (%s %d %llus rf:%s type:%s) %lluKB - memorystatus_available_pages: %llu\n",
7372 			    (unsigned long)tv_sec, tv_msec,
7373 			    aggr_count,
7374 			    aPid, ((p && *p->p_name) ? p->p_name : "unknown"),
7375 			    memstat_kill_cause_name[cause], aPid_ep,
7376 			    time_in_priority_band_secs, _memstat_relaunch_flags_description(p->p_memstat_relaunch_flags),
7377 			    _memstat_proc_type_description(p),
7378 			    footprint_of_killed_proc >> 10, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
7379 
7380 			/* Success? */
7381 			if (killed) {
7382 				*memory_reclaimed = footprint_of_killed_proc;
7383 				proc_rele(p);
7384 				kill_count++;
7385 				goto exit;
7386 			}
7387 
7388 			/*
7389 			 * Failure - first unwind the state,
7390 			 * then fall through to restart the search.
7391 			 */
7392 			proc_list_lock();
7393 			proc_rele(p);
7394 			p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
7395 			p->p_memstat_state |= P_MEMSTAT_ERROR;
7396 			*errors += 1;
7397 		}
7398 
7399 		/*
7400 		 * Failure - restart the search.
7401 		 *
7402 		 * We might have raced with "p" exiting on another core, resulting in no
7403 		 * ref on "p".  Or, we may have failed to kill "p".
7404 		 *
7405 		 * Either way, we fall thru to here, leaving the proc in the
7406 		 * P_MEMSTAT_TERMINATED state or P_MEMSTAT_ERROR state.
7407 		 *
7408 		 * And, we hold the the proc_list_lock at this point.
7409 		 */
7410 
7411 		next_p = memorystatus_get_first_proc_locked(&band, FALSE);
7412 	}
7413 
7414 	proc_list_unlock();
7415 
7416 exit:
7417 	os_reason_free(jetsam_reason);
7418 
7419 	if (kill_count == 0) {
7420 		*memory_reclaimed = 0;
7421 
7422 		/* Clear snapshot if freshly captured and no target was found */
7423 		if (new_snapshot) {
7424 			proc_list_lock();
7425 			memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
7426 			proc_list_unlock();
7427 		}
7428 	}
7429 
7430 	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
7431 	    MEMORYSTATUS_LOG_AVAILABLE_PAGES, killed ? aPid : 0, kill_count, *memory_reclaimed);
7432 
7433 	return killed;
7434 }
7435 
7436 bool
memorystatus_kill_on_VM_compressor_space_shortage(bool async)7437 memorystatus_kill_on_VM_compressor_space_shortage(bool async)
7438 {
7439 	if (async) {
7440 		os_atomic_store(&memorystatus_compressor_space_shortage, true, release);
7441 		memorystatus_thread_wake();
7442 		return true;
7443 	} else {
7444 		os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE);
7445 		if (jetsam_reason == OS_REASON_NULL) {
7446 			memorystatus_log_error("memorystatus_kill_on_VM_compressor_space_shortage -- sync: failed to allocate jetsam reason\n");
7447 		}
7448 
7449 		return memstat_kill_process_sync(-1, kMemorystatusKilledVMCompressorSpaceShortage, jetsam_reason);
7450 	}
7451 }
7452 
7453 #if CONFIG_JETSAM
7454 
/*
 * Flag VM pageout-scan starvation and wake the jetsam thread to respond.
 * The store must be visible (release) before the wakeup.
 */
void
memorystatus_kill_on_vps_starvation(void)
{
	os_atomic_store(&memorystatus_pageout_starved, true, release);
	memorystatus_thread_wake();
}
7461 
7462 bool
memorystatus_kill_on_vnode_exhaustion(void)7463 memorystatus_kill_on_vnode_exhaustion(void)
7464 {
7465 	os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_VNODE);
7466 	if (jetsam_reason == OS_REASON_NULL) {
7467 		memorystatus_log_error("memorystatus_kill_on_vnode_limit: failed to allocate jetsam reason\n");
7468 	}
7469 
7470 	return memstat_kill_process_sync(-1, kMemorystatusKilledVnodes, jetsam_reason);
7471 }
7472 
7473 #endif /* CONFIG_JETSAM */
7474 
7475 bool
memorystatus_kill_on_sustained_pressure()7476 memorystatus_kill_on_sustained_pressure()
7477 {
7478 	os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_SUSTAINED_PRESSURE);
7479 	if (jetsam_reason == OS_REASON_NULL) {
7480 		memorystatus_log_error("%s() failed to allocate jetsam reason\n", __func__);
7481 	}
7482 
7483 	return memstat_kill_process_sync(-1, kMemorystatusKilledSustainedPressure, jetsam_reason);
7484 }
7485 
7486 bool
memstat_kill_with_jetsam_reason_sync(pid_t pid,os_reason_t jetsam_reason)7487 memstat_kill_with_jetsam_reason_sync(pid_t pid, os_reason_t jetsam_reason)
7488 {
7489 	uint32_t kill_cause = jetsam_reason->osr_code <= JETSAM_REASON_MEMORYSTATUS_MAX ?
7490 	    (uint32_t) jetsam_reason->osr_code : JETSAM_REASON_INVALID;
7491 	return memstat_kill_process_sync(pid, kill_cause, jetsam_reason);
7492 }
7493 
7494 bool
memorystatus_kill_on_zone_map_exhaustion(pid_t pid)7495 memorystatus_kill_on_zone_map_exhaustion(pid_t pid)
7496 {
7497 	if (pid == -1) {
7498 		os_atomic_store(&memorystatus_zone_map_is_exhausted, true, release);
7499 		memorystatus_thread_wake();
7500 		return true;
7501 	} else {
7502 		os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_ZONE_MAP_EXHAUSTION);
7503 		if (jetsam_reason == OS_REASON_NULL) {
7504 			memorystatus_log_error("memorystatus_kill_on_zone_map_exhaustion: failed to allocate jetsam reason\n");
7505 		}
7506 		return memstat_kill_process_sync(pid, kMemorystatusKilledZoneMapExhaustion, jetsam_reason);
7507 	}
7508 }
7509 
/* Callback invoked at the end of a pageout scan; intentionally empty. */
void
memorystatus_on_pageout_scan_end(void)
{
	/* No-op */
}
7515 
7516 static size_t
memorystatus_priority_list_size(pid_t pid,size_t entry_size)7517 memorystatus_priority_list_size(pid_t pid, size_t entry_size)
7518 {
7519 	assert(
7520 		(entry_size == sizeof(memorystatus_priority_entry_t)) ||
7521 		(entry_size == sizeof(memorystatus_priority_entry_v2_t)));
7522 	uint32_t list_count = (pid == 0) ? memorystatus_list_count : 1;
7523 	return entry_size * list_count;
7524 }
7525 
/* Return both allocated and actual size, since there's a race between allocation and list compilation */
static int
memorystatus_get_priority_list(memorystatus_priority_entry_v2_t **list_ptr, size_t *buffer_size, size_t *list_size, size_t entry_size)
{
	memorystatus_priority_entry_v2_t *entry;
	proc_t p;
	uint32_t i = 0;

	/* Bytes needed for one entry per process currently on the priority list. */
	*list_size = memorystatus_priority_list_size(0, entry_size);

	/* Otherwise, validate the size of the buffer */
	if (*buffer_size < *list_size) {
		return EINVAL;
	}

	/* Caller owns this allocation and must kfree_data() it with *buffer_size. */
	*list_ptr = kalloc_data(*list_size, Z_WAITOK | Z_ZERO);
	if (!*list_ptr) {
		return ENOMEM;
	}

	/* Report the allocation size; *list_size is recomputed below as bytes actually filled. */
	*buffer_size = *list_size;
	*list_size = 0;

	entry = *list_ptr;

	proc_list_lock();

	/* Walk every band; stop early if the process list grew past the allocation. */
	p = memorystatus_get_first_proc_locked(&i, TRUE);
	while (p && (*list_size < *buffer_size)) {
		entry->pid = proc_getpid(p);
		entry->priority = p->p_memstat_effectivepriority;
		entry->user_data = p->p_memstat_userdata;

		/* No memorystatus limit set; fall back to the task's phys footprint limit. */
		if (p->p_memstat_memlimit <= 0) {
			task_get_phys_footprint_limit(proc_task(p), &entry->limit);
		} else {
			entry->limit = p->p_memstat_memlimit;
		}

		entry->state = _memstat_build_state(p);

		/*
		 * Advance by the caller's entry size: legacy (v1) callers packed the
		 * smaller struct, so step using v1 pointer arithmetic in that case.
		 */
		if (entry_size == sizeof(memorystatus_priority_entry_t)) {
			entry = (memorystatus_priority_entry_v2_t *) (((memorystatus_priority_entry_t *)entry) + 1);
		} else {
			/* Only add v2 entries if we're not using the legacy version of this call */
			entry->priority_start_mtime = p->p_memstat_prio_start;

			entry++;
		}

		*list_size += entry_size;
		p = memorystatus_get_next_proc_locked(&i, p, TRUE);
	}

	proc_list_unlock();

	memorystatus_log_debug("memorystatus_get_priority_list: returning %lu for size\n", (unsigned long)*list_size);

	return 0;
}
7586 
/*
 * Fill a single priority entry for 'pid' and copy it out to 'buffer'.
 * Accepts either the legacy (v1) or current (v2) entry size; v1 callers
 * receive the v1-sized prefix of the v2 struct.
 * Returns 0 on success, EINVAL on bad arguments, ESRCH if pid not found.
 */
static int
memorystatus_get_priority_pid(pid_t pid, user_addr_t buffer, size_t buffer_size)
{
	int error = 0;
	bool zombref = false;
	memorystatus_priority_entry_v2_t mp_entry;
	kern_return_t ret;
	boolean_t size_valid =
	    (buffer_size == sizeof(memorystatus_priority_entry_v2_t)) ||
	    (buffer_size == sizeof(memorystatus_priority_entry_t));

	/* Validate inputs */
	if ((pid == 0) || (buffer == USER_ADDR_NULL) || !size_valid) {
		return EINVAL;
	}

	/* Look up a live proc first; fall back to a zombie reference. */
	proc_list_lock();
	proc_t p = proc_find_locked(pid);
	if (!p) {
		zombref = true;
		p = proc_find_zombref_locked(pid);
		if (!p) {
			proc_list_unlock();
			return ESRCH;
		}
	}

	memset(&mp_entry, 0, sizeof(memorystatus_priority_entry_v2_t));

	mp_entry.pid = proc_getpid(p);
	mp_entry.priority = p->p_memstat_effectivepriority;
	mp_entry.user_data = p->p_memstat_userdata;
	/* No memorystatus limit set: query the task directly (not possible for zombies). */
	if (p->p_memstat_memlimit <= 0 && !zombref) {
		task_t task = proc_task(p);
		assert(task);
		ret = task_get_phys_footprint_limit(task, &mp_entry.limit);
		if (ret != KERN_SUCCESS) {
			error = mach_to_bsd_errno(ret);
			proc_list_unlock();
			goto done;
		}
	} else {
		mp_entry.limit = p->p_memstat_memlimit;
	}

	mp_entry.state = _memstat_build_state(p);
	mp_entry.priority_start_mtime = p->p_memstat_prio_start;
	proc_list_unlock();

	/* Copy out only buffer_size bytes — truncates to the v1 layout for legacy callers. */
	error = copyout(&mp_entry, buffer, buffer_size);

done:
	if (zombref) {
		proc_drop_zombref(p);
	} else {
		proc_rele(p);
	}

	return error;
}
7647 
/*
 * Handle the GET_PRIORITY_LIST memorystatus command.
 * pid != 0 selects a single entry; buffer == NULL is a size-only query.
 * On success, *retval carries the list size in bytes.
 */
static int
memorystatus_cmd_get_priority_list(pid_t pid, user_addr_t buffer, size_t buffer_size, int32_t *retval, size_t entry_size)
{
	int error = 0;
	boolean_t size_only;
	size_t list_size;

	static_assert(sizeof(memorystatus_priority_entry_v2_t) == 128);
	assert(
		(entry_size == sizeof(memorystatus_priority_entry_t)) ||
		(entry_size == sizeof(memorystatus_priority_entry_v2_t)));

	/*
	 * When a non-zero pid is provided, the 'list' has only one entry.
	 */

	size_only = ((buffer == USER_ADDR_NULL) ? TRUE: FALSE);

	if (pid != 0) {
		/* One PID */
		list_size = entry_size;
		if (!size_only) {
			error = memorystatus_get_priority_pid(pid, buffer, buffer_size);
		}
	} else if (size_only) {
		/* List size query */
		list_size = memorystatus_priority_list_size(0, entry_size);
	} else {
		/* List */
		memorystatus_priority_entry_v2_t *list = NULL;
		/* buffer_size is updated to the kernel allocation size; list_size to bytes filled. */
		error = memorystatus_get_priority_list(&list, &buffer_size, &list_size, entry_size);
		if (error == 0) {
			error = copyout(list, buffer, list_size);
			kfree_data(list, buffer_size);
		}
	}

	/* On success, report the (byte) size of the result to the caller. */
	if (error == 0) {
		assert(list_size <= INT32_MAX);
		*retval = (int32_t) list_size;
	}

	return error;
}
7692 
7693 static void
memorystatus_clear_errors(void)7694 memorystatus_clear_errors(void)
7695 {
7696 	proc_t p;
7697 	unsigned int i = 0;
7698 
7699 	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_START);
7700 
7701 	proc_list_lock();
7702 
7703 	p = memorystatus_get_first_proc_locked(&i, TRUE);
7704 	while (p) {
7705 		if (p->p_memstat_state & P_MEMSTAT_ERROR) {
7706 			p->p_memstat_state &= ~P_MEMSTAT_ERROR;
7707 		}
7708 		p = memorystatus_get_next_proc_locked(&i, p, TRUE);
7709 	}
7710 
7711 	proc_list_unlock();
7712 
7713 	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_END);
7714 }
7715 
7716 void
memorystatus_fast_jetsam_override(bool enable_override)7717 memorystatus_fast_jetsam_override(bool enable_override)
7718 {
7719 #if CONFIG_JETSAM
7720 	fast_jetsam_enabled = !enable_override;
7721 	if (!fast_jetsam_enabled) {
7722 		/* Disable any pre-configured policies */
7723 		os_atomic_store(&memstat_policy_config, kPolicyDefault, relaxed);
7724 		memorystatus_thread_pool_default();
7725 		_memstat_consider_waking_jetsam_thread();
7726 	}
7727 #else /* CONFIG_JETSAM */
7728 	(void)enable_override;
7729 #endif /* CONFIG_JETSAM */
7730 }
7731 
7732 /*
7733  * Get the at_boot snapshot
7734  */
7735 static int
memorystatus_get_at_boot_snapshot(memorystatus_jetsam_snapshot_t ** snapshot,size_t * snapshot_size,boolean_t size_only)7736 memorystatus_get_at_boot_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only)
7737 {
7738 	size_t input_size = *snapshot_size;
7739 
7740 	/*
7741 	 * The at_boot snapshot has no entry list.
7742 	 */
7743 	*snapshot_size = sizeof(memorystatus_jetsam_snapshot_t);
7744 
7745 	if (size_only) {
7746 		return 0;
7747 	}
7748 
7749 	/*
7750 	 * Validate the size of the snapshot buffer
7751 	 */
7752 	if (input_size < *snapshot_size) {
7753 		return EINVAL;
7754 	}
7755 
7756 	/*
7757 	 * Update the notification_time only
7758 	 */
7759 	memorystatus_at_boot_snapshot.notification_time = mach_absolute_time();
7760 	*snapshot = &memorystatus_at_boot_snapshot;
7761 
7762 	memorystatus_log_debug(
7763 		"memorystatus_get_at_boot_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%d)\n",
7764 		(long)input_size, (long)*snapshot_size, 0);
7765 	return 0;
7766 }
7767 
7768 #if CONFIG_FREEZE
7769 static int
memorystatus_get_jetsam_snapshot_freezer(memorystatus_jetsam_snapshot_t ** snapshot,size_t * snapshot_size,boolean_t size_only)7770 memorystatus_get_jetsam_snapshot_freezer(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only)
7771 {
7772 	size_t input_size = *snapshot_size;
7773 
7774 	if (memorystatus_jetsam_snapshot_freezer->entry_count > 0) {
7775 		*snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_freezer->entry_count));
7776 	} else {
7777 		*snapshot_size = 0;
7778 	}
7779 	assert(*snapshot_size <= memorystatus_jetsam_snapshot_freezer_size);
7780 
7781 	if (size_only) {
7782 		return 0;
7783 	}
7784 
7785 	if (input_size < *snapshot_size) {
7786 		return EINVAL;
7787 	}
7788 
7789 	*snapshot = memorystatus_jetsam_snapshot_freezer;
7790 
7791 	memorystatus_log_debug(
7792 		"memorystatus_get_jetsam_snapshot_freezer: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
7793 		(long)input_size, (long)*snapshot_size, (long)memorystatus_jetsam_snapshot_freezer->entry_count);
7794 
7795 	return 0;
7796 }
7797 #endif /* CONFIG_FREEZE */
7798 
7799 static int
memorystatus_get_on_demand_snapshot(memorystatus_jetsam_snapshot_t ** snapshot,size_t * snapshot_size,boolean_t size_only)7800 memorystatus_get_on_demand_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only)
7801 {
7802 	size_t input_size = *snapshot_size;
7803 	uint32_t ods_list_count = memorystatus_list_count + memorystatus_artificial_snapshot_entry_count;
7804 	memorystatus_jetsam_snapshot_t *ods = NULL;     /* The on_demand snapshot buffer */
7805 
7806 	*snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (ods_list_count));
7807 
7808 	if (size_only) {
7809 		return 0;
7810 	}
7811 
7812 	/*
7813 	 * Validate the size of the snapshot buffer.
7814 	 * This is inherently racey. May want to revisit
7815 	 * this error condition and trim the output when
7816 	 * it doesn't fit.
7817 	 */
7818 	if (input_size < *snapshot_size) {
7819 		return EINVAL;
7820 	}
7821 
7822 	/*
7823 	 * Allocate and initialize a snapshot buffer.
7824 	 */
7825 	ods = kalloc_data(*snapshot_size, Z_WAITOK | Z_ZERO);
7826 	if (!ods) {
7827 		return ENOMEM;
7828 	}
7829 
7830 	proc_list_lock();
7831 	memorystatus_init_jetsam_snapshot_locked(ods, ods_list_count);
7832 	proc_list_unlock();
7833 
7834 	/*
7835 	 * Return the kernel allocated, on_demand buffer.
7836 	 * The caller of this routine will copy the data out
7837 	 * to user space and then free the kernel allocated
7838 	 * buffer.
7839 	 */
7840 	*snapshot = ods;
7841 
7842 	memorystatus_log_debug(
7843 		"memorystatus_get_on_demand_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
7844 		(long)input_size, (long)*snapshot_size, (long)ods_list_count);
7845 
7846 	return 0;
7847 }
7848 
7849 static int
memorystatus_get_jetsam_snapshot(memorystatus_jetsam_snapshot_t ** snapshot,size_t * snapshot_size,boolean_t size_only)7850 memorystatus_get_jetsam_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only)
7851 {
7852 	size_t input_size = *snapshot_size;
7853 
7854 	if (memorystatus_jetsam_snapshot_count > 0) {
7855 		*snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count));
7856 	} else {
7857 		*snapshot_size = 0;
7858 	}
7859 
7860 	if (size_only) {
7861 		return 0;
7862 	}
7863 
7864 	if (input_size < *snapshot_size) {
7865 		return EINVAL;
7866 	}
7867 
7868 	*snapshot = memorystatus_jetsam_snapshot;
7869 
7870 	memorystatus_log_debug(
7871 		"memorystatus_get_jetsam_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
7872 		(long)input_size, (long)*snapshot_size, (long)memorystatus_jetsam_snapshot_count);
7873 
7874 	return 0;
7875 }
7876 
7877 #if JETSAM_ZPRINT_SNAPSHOT
7878 /*
7879  * Utility function to handle copyout of jetsam zprint snapshot data
7880  */
7881 static int
memorystatus_cmd_get_data_buffer(user_addr_t buffer,size_t buffer_size,int32_t * retval,size_t data_size,void * data)7882 memorystatus_cmd_get_data_buffer(
7883 	user_addr_t  buffer,
7884 	size_t       buffer_size,
7885 	int32_t      *retval,
7886 	size_t       data_size,
7887 	void         *data)
7888 {
7889 	boolean_t size_only = (buffer == USER_ADDR_NULL);
7890 	int error;
7891 
7892 	/* Nothing to return if there's no data yet, instruct the caller to try again later. */
7893 	if (data == NULL) {
7894 		*retval = -1;
7895 		return EAGAIN;
7896 	}
7897 
7898 	/* Handle just a size request */
7899 	if (size_only) {
7900 		*retval = (int32_t)data_size;
7901 		return 0;
7902 	}
7903 
7904 	/* buffer needs to be large enough */
7905 	if (buffer_size < data_size) {
7906 		*retval = -1;
7907 		return EINVAL;
7908 	}
7909 
7910 	error = copyout(data, buffer, data_size);
7911 	if (error == 0) {
7912 		*retval = (int32_t)data_size;
7913 	} else {
7914 		*retval = -1;
7915 	}
7916 
7917 	return error;
7918 }
7919 #endif
7920 
/*
 * Handle the GET_JETSAM_SNAPSHOT memorystatus command.
 *
 * flags selects exactly one snapshot flavor (0 = default; otherwise one of
 * ON_DEMAND, AT_BOOT, or FREEZER).  buffer == NULL is a size-only query.
 * On success, *retval carries the snapshot size in bytes.
 */
static int
memorystatus_cmd_get_jetsam_snapshot(int32_t flags, user_addr_t buffer, size_t buffer_size, int32_t *retval)
{
	int error = EINVAL;
	boolean_t size_only;
	boolean_t is_default_snapshot = FALSE;
	boolean_t is_on_demand_snapshot = FALSE;
	boolean_t is_at_boot_snapshot = FALSE;
#if CONFIG_FREEZE
	bool is_freezer_snapshot = false;
#endif /* CONFIG_FREEZE */
	memorystatus_jetsam_snapshot_t *snapshot;

	size_only = ((buffer == USER_ADDR_NULL) ? TRUE : FALSE);

	if (flags == 0) {
		/* Default */
		is_default_snapshot = TRUE;
		error = memorystatus_get_jetsam_snapshot(&snapshot, &buffer_size, size_only);
	} else {
		if (flags & ~(MEMORYSTATUS_SNAPSHOT_ON_DEMAND | MEMORYSTATUS_SNAPSHOT_AT_BOOT | MEMORYSTATUS_FLAGS_SNAPSHOT_FREEZER)) {
			/*
			 * Unsupported bit set in flag.
			 */
			return EINVAL;
		}

		/* (flags & (flags - 1)) is non-zero iff more than one bit is set. */
		if (flags & (flags - 0x1)) {
			/*
			 * Can't have multiple flags set at the same time.
			 */
			return EINVAL;
		}

		if (flags & MEMORYSTATUS_SNAPSHOT_ON_DEMAND) {
			is_on_demand_snapshot = TRUE;
			/*
			 * When not requesting the size only, the following call will allocate
			 * an on_demand snapshot buffer, which is freed below.
			 */
			error = memorystatus_get_on_demand_snapshot(&snapshot, &buffer_size, size_only);
		} else if (flags & MEMORYSTATUS_SNAPSHOT_AT_BOOT) {
			is_at_boot_snapshot = TRUE;
			error = memorystatus_get_at_boot_snapshot(&snapshot, &buffer_size, size_only);
#if CONFIG_FREEZE
		} else if (flags & MEMORYSTATUS_FLAGS_SNAPSHOT_FREEZER) {
			is_freezer_snapshot = true;
			error = memorystatus_get_jetsam_snapshot_freezer(&snapshot, &buffer_size, size_only);
#endif /* CONFIG_FREEZE */
		} else {
			/*
			 * Invalid flag setting.
			 */
			return EINVAL;
		}
	}

	if (error) {
		goto out;
	}

	/*
	 * Copy the data out to user space and clear the snapshot buffer.
	 * If working with the jetsam snapshot,
	 *	clearing the buffer means, reset the count.
	 * If working with an on_demand snapshot
	 *	clearing the buffer means, free it.
	 * If working with the at_boot snapshot
	 *	there is nothing to clear or update.
	 * If working with a copy of the snapshot
	 *	there is nothing to clear or update.
	 * If working with the freezer snapshot
	 *	clearing the buffer means, reset the count.
	 */
	if (!size_only) {
		if ((error = copyout(snapshot, buffer, buffer_size)) == 0) {
#if CONFIG_FREEZE
			if (is_default_snapshot || is_freezer_snapshot) {
#else
			if (is_default_snapshot) {
#endif /* CONFIG_FREEZE */
				/*
				 * The jetsam snapshot is never freed, its count is simply reset.
				 * However, we make a copy for any parties that might be interested
				 * in the previous fully populated snapshot.
				 */
				proc_list_lock();
#if DEVELOPMENT || DEBUG
				if (memorystatus_testing_pid != 0 && memorystatus_testing_pid != proc_getpid(current_proc())) {
					/* Snapshot is currently owned by someone else. Don't consume it. */
					proc_list_unlock();
					goto out;
				}
#endif /* (DEVELOPMENT || DEBUG)*/
				if (is_default_snapshot) {
					snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
				}
#if CONFIG_FREEZE
				else if (is_freezer_snapshot) {
					memorystatus_jetsam_snapshot_freezer->entry_count = 0;
				}
#endif /* CONFIG_FREEZE */
				proc_list_unlock();
			}
		}

		if (is_on_demand_snapshot) {
			/*
			 * The on_demand snapshot is always freed,
			 * even if the copyout failed.
			 */
			kfree_data(snapshot, buffer_size);
		}
	}

out:
	if (error == 0) {
		assert(buffer_size <= INT32_MAX);
		*retval = (int32_t) buffer_size;
	}
	return error;
}
8043 
8044 #if DEVELOPMENT || DEBUG
8045 static int
8046 memorystatus_cmd_set_testing_pid(int32_t flags)
8047 {
8048 	int error = EINVAL;
8049 	proc_t caller = current_proc();
8050 	assert(caller != kernproc);
8051 	proc_list_lock();
8052 	if (flags & MEMORYSTATUS_FLAGS_SET_TESTING_PID) {
8053 		if (memorystatus_testing_pid == 0) {
8054 			memorystatus_testing_pid = proc_getpid(caller);
8055 			error = 0;
8056 		} else if (memorystatus_testing_pid == proc_getpid(caller)) {
8057 			error = 0;
8058 		} else {
8059 			/* We don't allow ownership to be taken from another proc. */
8060 			error = EBUSY;
8061 		}
8062 	} else if (flags & MEMORYSTATUS_FLAGS_UNSET_TESTING_PID) {
8063 		if (memorystatus_testing_pid == proc_getpid(caller)) {
8064 			memorystatus_testing_pid = 0;
8065 			error = 0;
8066 		} else if (memorystatus_testing_pid != 0) {
8067 			/* We don't allow ownership to be taken from another proc. */
8068 			error = EPERM;
8069 		}
8070 	} else if (flags & MEMORYSTATUS_FLAGS_SET_IMP_TESTING_PID) {
8071 		caller->p_memstat_state |= P_MEMSTAT_TEST_IMP_ASSERTION;
8072 		error = 0;
8073 	}
8074 	proc_list_unlock();
8075 
8076 	return error;
8077 }
8078 #endif /* DEVELOPMENT || DEBUG */
8079 
8080 /*
8081  *      Routine:	memorystatus_cmd_grp_set_priorities
8082  *	Purpose:	Update priorities for a group of processes.
8083  *
8084  *	[priority]
8085  *		Move each process out of its effective priority
8086  *		band and into a new priority band.
8087  *		Maintains relative order from lowest to highest priority.
8088  *		In single band, maintains relative order from head to tail.
8089  *
8090  *		eg: before	[effectivepriority | pid]
8091  *				[18 | p101              ]
8092  *				[17 | p55, p67, p19     ]
8093  *				[12 | p103 p10          ]
8094  *				[ 7 | p25               ]
8095  *			        [ 0 | p71, p82,         ]
8096  *
8097  *		after	[ new band | pid]
8098  *			[ xxx | p71, p82, p25, p103, p10, p55, p67, p19, p101]
8099  *
8100  *	Returns:  0 on success, else non-zero.
8101  *
8102  *	Caveat:   We know there is a race window regarding recycled pids.
8103  *		  A process could be killed before the kernel can act on it here.
8104  *		  If a pid cannot be found in any of the jetsam priority bands,
8105  *		  then we simply ignore it.  No harm.
8106  *		  But, if the pid has been recycled then it could be an issue.
8107  *		  In that scenario, we might move an unsuspecting process to the new
8108  *		  priority band. It's not clear how the kernel can safeguard
8109  *		  against this, but it would be an extremely rare case anyway.
8110  *		  The caller of this api might avoid such race conditions by
8111  *		  ensuring that the processes passed in the pid list are suspended.
8112  */
8113 
8114 
static int
memorystatus_cmd_grp_set_priorities(user_addr_t buffer, size_t buffer_size)
{
	/*
	 * We only handle setting priority
	 * per process
	 */
	int error = 0;
	memorystatus_properties_entry_v1_t *entries = NULL;
	size_t entry_count = 0;

	/* This will be the ordered proc list */
	typedef struct memorystatus_internal_properties {
		proc_t proc;
		int32_t priority;
	} memorystatus_internal_properties_t;

	memorystatus_internal_properties_t *table = NULL;
	uint32_t table_count = 0;

	size_t i = 0;
	uint32_t bucket_index = 0;
	int32_t new_priority;

	proc_t p;

	/* Verify inputs */
	if ((buffer == USER_ADDR_NULL) || (buffer_size == 0)) {
		error = EINVAL;
		goto out;
	}

	/* Number of whole entries the user buffer can hold. */
	entry_count = (buffer_size / sizeof(memorystatus_properties_entry_v1_t));
	if (entry_count == 0) {
		/* buffer size was not large enough for a single entry */
		error = EINVAL;
		goto out;
	}

	if ((entries = kalloc_data(buffer_size, Z_WAITOK)) == NULL) {
		error = ENOMEM;
		goto out;
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_START, MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY, entry_count);

	if ((error = copyin(buffer, entries, buffer_size)) != 0) {
		goto out;
	}

	/* Verify sanity of input priorities */
	/* Only v1 entries are accepted, and the buffer must be an exact multiple of the v1 size. */
	if (entries[0].version == MEMORYSTATUS_MPE_VERSION_1) {
		if ((buffer_size % MEMORYSTATUS_MPE_VERSION_1_SIZE) != 0) {
			error = EINVAL;
			goto out;
		}
	} else {
		error = EINVAL;
		goto out;
	}

	/* Normalize/sanity-check each requested priority before touching any proc. */
	for (i = 0; i < entry_count; i++) {
		if (entries[i].priority == -1) {
			/* Use as shorthand for default priority */
			entries[i].priority = JETSAM_PRIORITY_DEFAULT;
		} else if (entries[i].priority > JETSAM_PRIORITY_IDLE && entries[i].priority <= applications_aging_band) {
			/*
			 * Everything between idle and the aging bands are reserved for internal use.
			 * if requested, adjust to JETSAM_PRIORITY_IDLE.
			 * Entitled processes (just munch) can use a subset of this range for testing.
			 */
			if (entries[i].priority > JETSAM_PRIORITY_ENTITLED_MAX ||
			    !current_task_can_use_entitled_range()) {
				entries[i].priority = JETSAM_PRIORITY_IDLE;
			}
		} else if (entries[i].priority == JETSAM_PRIORITY_IDLE_HEAD) {
			/* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle
			 * queue */
			/* Deal with this later */
		} else if ((entries[i].priority < 0) || (entries[i].priority >= MEMSTAT_BUCKET_COUNT)) {
			/* Sanity check */
			error = EINVAL;
			goto out;
		}
	}

	table = kalloc_type(memorystatus_internal_properties_t, entry_count,
	    Z_WAITOK | Z_ZERO);
	if (table == NULL) {
		error = ENOMEM;
		goto out;
	}


	/*
	 * For each jetsam bucket entry, spin through the input property list.
	 * When a matching pid is found, populate an adjacent table with the
	 * appropriate proc pointer and new property values.
	 * This traversal automatically preserves order from lowest
	 * to highest priority.
	 */

	bucket_index = 0;

	proc_list_lock();

	/* Create the ordered table */
	p = memorystatus_get_first_proc_locked(&bucket_index, TRUE);
	while (p && (table_count < entry_count)) {
		for (i = 0; i < entry_count; i++) {
			if (proc_getpid(p) == entries[i].pid) {
				/* Build the table data  */
				table[table_count].proc = p;
				table[table_count].priority = entries[i].priority;
				table_count++;
				break;
			}
		}
		p = memorystatus_get_next_proc_locked(&bucket_index, p, TRUE);
	}

	/* We now have ordered list of procs ready to move */
	for (i = 0; i < table_count; i++) {
		p = table[i].proc;
		assert(p != NULL);
		memstat_priority_options_t priority_options = MEMSTAT_PRIORITY_OPTIONS_NONE;

		/* Allow head inserts -- but relative order is now  */
		if (table[i].priority == JETSAM_PRIORITY_IDLE_HEAD) {
			new_priority = JETSAM_PRIORITY_IDLE;
			priority_options |= (MEMSTAT_PRIORITY_INSERT_HEAD | MEMSTAT_PRIORITY_NO_AGING);
		} else {
			new_priority = table[i].priority;
		}

		/* Not allowed */
		/* Internal procs (see P_MEMSTAT_INTERNAL) are never moved by this interface. */
		if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
			continue;
		}

		memstat_update_priority_locked(p, new_priority, priority_options);
	}

	proc_list_unlock();

	/*
	 * if (table_count != entry_count)
	 * then some pids were not found in a jetsam band.
	 * harmless but interesting...
	 */
out:
	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_END, MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY, entry_count, table_count);

	/* table was sized by entry_count (not table_count); free with the same count. */
	kfree_data(entries, buffer_size);
	kfree_type(memorystatus_internal_properties_t, entry_count, table);

	return error;
}
8273 
8274 memorystatus_internal_probabilities_t *memorystatus_global_probabilities_table = NULL;
8275 size_t memorystatus_global_probabilities_size = 0;
8276 
/*
 * Replace the global use-probability table with entries copied in from
 * userspace.  Each entry maps a process name to a 0/1 probability-of-use
 * hint.  The new table is swapped in under the proc_list lock; the old
 * table is freed after the lock is dropped.
 */
static int
memorystatus_cmd_grp_set_probabilities(user_addr_t buffer, size_t buffer_size)
{
	int error = 0;
	memorystatus_properties_entry_v1_t *entries = NULL;
	size_t entry_count = 0, i = 0;
	memorystatus_internal_probabilities_t *tmp_table_new = NULL, *tmp_table_old = NULL;
	size_t tmp_table_new_size = 0, tmp_table_old_size = 0;
#if DEVELOPMENT || DEBUG
	if (memorystatus_testing_pid != 0 && memorystatus_testing_pid != proc_getpid(current_proc())) {
		/* probabilites are currently owned by someone else. Don't change them. */
		error = EPERM;
		goto out;
	}
#endif /* (DEVELOPMENT || DEBUG)*/

	/* Verify inputs */
	if ((buffer == USER_ADDR_NULL) || (buffer_size == 0)) {
		error = EINVAL;
		goto out;
	}

	/* Number of whole entries the user buffer can hold. */
	entry_count = (buffer_size / sizeof(memorystatus_properties_entry_v1_t));
	if (entry_count == 0) {
		error = EINVAL;
		goto out;
	}

	if ((entries = kalloc_data(buffer_size, Z_WAITOK)) == NULL) {
		error = ENOMEM;
		goto out;
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_START, MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY, entry_count);

	if ((error = copyin(buffer, entries, buffer_size)) != 0) {
		goto out;
	}

	/* Only v1 entries are accepted; the buffer must be a whole multiple of the v1 size. */
	if (entries[0].version == MEMORYSTATUS_MPE_VERSION_1) {
		if ((buffer_size % MEMORYSTATUS_MPE_VERSION_1_SIZE) != 0) {
			error = EINVAL;
			goto out;
		}
	} else {
		error = EINVAL;
		goto out;
	}

	/* Verify sanity of input priorities */
	for (i = 0; i < entry_count; i++) {
		/*
		 * 0 - low probability of use.
		 * 1 - high probability of use.
		 *
		 * Keeping this field an int (& not a bool) to allow
		 * us to experiment with different values/approaches
		 * later on.
		 */
		if (entries[i].use_probability > 1) {
			error = EINVAL;
			goto out;
		}
	}

	tmp_table_new_size = sizeof(memorystatus_internal_probabilities_t) * entry_count;

	if ((tmp_table_new = kalloc_data(tmp_table_new_size, Z_WAITOK | Z_ZERO)) == NULL) {
		error = ENOMEM;
		goto out;
	}

	proc_list_lock();

	/* Stash the current table so it can be freed once the lock is dropped. */
	if (memorystatus_global_probabilities_table) {
		tmp_table_old = memorystatus_global_probabilities_table;
		tmp_table_old_size = memorystatus_global_probabilities_size;
	}

	/* Publish the new (zero-filled) table, then populate it below while still locked. */
	memorystatus_global_probabilities_table = tmp_table_new;
	memorystatus_global_probabilities_size = tmp_table_new_size;
	tmp_table_new = NULL;

	for (i = 0; i < entry_count; i++) {
		/* Build the table data  */
		strlcpy(memorystatus_global_probabilities_table[i].proc_name, entries[i].proc_name, MAXCOMLEN + 1);
		memorystatus_global_probabilities_table[i].use_probability = entries[i].use_probability;
	}

	proc_list_unlock();

out:
	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_END, MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY, entry_count, tmp_table_new_size);

	/* kfree_data(NULL, ...) is a no-op, so the error paths are safe here. */
	kfree_data(entries, buffer_size);
	kfree_data(tmp_table_old, tmp_table_old_size);

	return error;
}
8376 
8377 static int
8378 memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
8379 {
8380 	int error = 0;
8381 
8382 	if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY) == MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY) {
8383 		error = memorystatus_cmd_grp_set_priorities(buffer, buffer_size);
8384 	} else if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY) == MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY) {
8385 		error = memorystatus_cmd_grp_set_probabilities(buffer, buffer_size);
8386 #if CONFIG_FREEZE
8387 	} else if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_FREEZE_PRIORITY) == MEMORYSTATUS_FLAGS_GRP_SET_FREEZE_PRIORITY) {
8388 		error = memorystatus_cmd_grp_set_freeze_list(buffer, buffer_size);
8389 	} else if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_DEMOTE_PRIORITY) == MEMORYSTATUS_FLAGS_GRP_SET_DEMOTE_PRIORITY) {
8390 		error = memorystatus_cmd_grp_set_demote_list(buffer, buffer_size);
8391 #endif /* CONFIG_FREEZE */
8392 	} else {
8393 		error = EINVAL;
8394 	}
8395 
8396 	return error;
8397 }
8398 
8399 /*
8400  * This routine is used to update a process's jetsam priority position and stored user_data.
8401  * It is not used for the setting of memory limits.
8402  *
8403  * Flags passed into this call are used to distinguish the motivation behind a jetsam priority
8404  * transition.  By default, the kernel updates the process's original requested priority when
8405  * no flag is passed.  But when the MEMORYSTATUS_SET_PRIORITY_ASSERTION flag is used, the kernel
8406  * updates the process's assertion driven priority.
8407  *
8408  * The assertion flag was introduced for use by the device's assertion mediator (eg: runningboardd).
8409  * When an assertion is controlling a process's jetsam priority, it may conflict with that process's
8410  * dirty/clean (active/inactive) jetsam state.  The kernel attempts to resolve a priority transition
8411  * conflict by reviewing the process state and then choosing the maximum jetsam band at play,
8412  * eg: requested priority versus assertion priority.
8413  */
8414 
8415 static int
8416 memorystatus_cmd_set_priority_properties(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
8417 {
8418 	int error = 0;
8419 	memorystatus_priority_properties_t mpp_entry;
8420 
8421 	/* Validate inputs */
8422 	if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_priority_properties_t))) {
8423 		return EINVAL;
8424 	}
8425 
8426 	/* Validate flags */
8427 	if (flags & ~(MEMORYSTATUS_SET_PRIORITY_ASSERTION)) {
8428 		/*
8429 		 * Unsupported bit set in flag.
8430 		 */
8431 		return EINVAL;
8432 	}
8433 
8434 	error = copyin(buffer, &mpp_entry, buffer_size);
8435 
8436 	if (error == 0) {
8437 		proc_t p;
8438 
8439 		p = proc_find(pid);
8440 		if (!p) {
8441 			return ESRCH;
8442 		}
8443 
8444 		if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
8445 			proc_rele(p);
8446 			return EPERM;
8447 		}
8448 
8449 		if ((flags & MEMORYSTATUS_SET_PRIORITY_ASSERTION) &&
8450 		    !(p->p_memstat_state & P_MEMSTAT_MANAGED)) {
8451 			/*
8452 			 * Assertion-
8453 			 * processes.
8454 			 */
8455 			proc_rele(p);
8456 			return EPERM;
8457 		}
8458 
8459 		memstat_priority_options_t options = MEMSTAT_PRIORITY_OPTIONS_NONE;
8460 		if (flags & MEMORYSTATUS_SET_PRIORITY_ASSERTION) {
8461 			options |= MEMSTAT_PRIORITY_IS_ASSERTION;
8462 		}
8463 		error = memorystatus_set_priority(p, mpp_entry.priority, mpp_entry.user_data,
8464 		    options);
8465 		proc_rele(p);
8466 	}
8467 
8468 	return error;
8469 }
8470 
8471 static int
8472 memorystatus_cmd_set_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
8473 {
8474 	int error = 0;
8475 	memorystatus_memlimit_properties_t mmp_entry;
8476 
8477 	/* Validate inputs */
8478 	if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_memlimit_properties_t))) {
8479 		return EINVAL;
8480 	}
8481 
8482 	error = copyin(buffer, &mmp_entry, buffer_size);
8483 
8484 	if (error == 0) {
8485 		error = memorystatus_set_memlimit_properties(pid, &mmp_entry);
8486 	}
8487 
8488 	return error;
8489 }
8490 
8491 #if DEBUG || DEVELOPMENT
8492 static int
8493 memorystatus_cmd_set_diag_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
8494 {
8495 	int error = 0;
8496 	memorystatus_diag_memlimit_properties_t mmp_entry;
8497 	proc_t p = proc_find(pid);
8498 	if (!p) {
8499 		return ESRCH;
8500 	}
8501 
8502 	/* Validate inputs */
8503 	if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_diag_memlimit_properties_t))) {
8504 		proc_rele(p);
8505 		return EINVAL;
8506 	}
8507 
8508 	error = copyin(buffer, &mmp_entry, buffer_size);
8509 
8510 	if (error == 0) {
8511 		proc_list_lock();
8512 		error = memorystatus_set_diag_memlimit_properties_internal(p, &mmp_entry);
8513 		proc_list_unlock();
8514 	}
8515 	proc_rele(p);
8516 	return error;
8517 }
8518 
8519 static int
8520 memorystatus_cmd_get_diag_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
8521 {
8522 	int error = 0;
8523 	memorystatus_diag_memlimit_properties_t mmp_entry;
8524 	proc_t p = proc_find(pid);
8525 	if (!p) {
8526 		return ESRCH;
8527 	}
8528 
8529 	/* Validate inputs */
8530 	if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_diag_memlimit_properties_t))) {
8531 		proc_rele(p);
8532 		return EINVAL;
8533 	}
8534 	proc_list_lock();
8535 	error = memorystatus_get_diag_memlimit_properties_internal(p, &mmp_entry);
8536 	proc_list_unlock();
8537 	proc_rele(p);
8538 	if (error == 0) {
8539 		error = copyout(&mmp_entry, buffer, buffer_size);
8540 	}
8541 
8542 
8543 	return error;
8544 }
8545 #endif //DEBUG || DEVELOPMENT
8546 
/*
 * Report a process's conclave memory limit (in MB) through *retval.
 * Returns ESRCH if the pid cannot be found, otherwise the Mach result of
 * task_get_conclave_mem_limit() translated to a BSD errno.
 */
static int
_memstat_get_process_conclave_mem_limit(pid_t pid, int32_t *retval)
{
	kern_return_t error;
	proc_t p = proc_find(pid);
	if (!p) {
		return ESRCH;
	}

	uint64_t conclave_limit;
	error = task_get_conclave_mem_limit(proc_task(p), &conclave_limit);

	if (error == KERN_SUCCESS) {
		/*
		 * NOTE(review): conclave_limit is truncated to 32 bits before
		 * the MB conversion; presumably conclave limits always fit in
		 * 4GB — confirm, otherwise the scaling should precede the cast.
		 */
		*retval = roundToNearestMB((uint32_t)conclave_limit);
	}

	proc_rele(p);
	return mach_to_bsd_errno(error);
}
8566 
8567 static void
8568 memorystatus_get_memlimit_properties_internal(proc_t p, memorystatus_memlimit_properties_t* p_entry)
8569 {
8570 	memset(p_entry, 0, sizeof(memorystatus_memlimit_properties_t));
8571 
8572 	if (p->p_memstat_memlimit_active > 0) {
8573 		p_entry->memlimit_active = p->p_memstat_memlimit_active;
8574 	} else {
8575 		task_convert_phys_footprint_limit(-1, &p_entry->memlimit_active);
8576 	}
8577 
8578 	if (_memstat_proc_active_memlimit_is_fatal(p)) {
8579 		p_entry->memlimit_active_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
8580 	}
8581 
8582 	/*
8583 	 * Get the inactive limit and attributes
8584 	 */
8585 	if (p->p_memstat_memlimit_inactive <= 0) {
8586 		task_convert_phys_footprint_limit(-1, &p_entry->memlimit_inactive);
8587 	} else {
8588 		p_entry->memlimit_inactive = p->p_memstat_memlimit_inactive;
8589 	}
8590 	if (_memstat_proc_inactive_memlimit_is_fatal(p)) {
8591 		p_entry->memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
8592 	}
8593 }
8594 
8595 /*
8596  * When getting the memlimit settings, we can't simply call task_get_phys_footprint_limit().
8597  * That gets the proc's cached memlimit and there is no guarantee that the active/inactive
8598  * limits will be the same in the no-limit case.  Instead we convert limits <= 0 using
8599  * task_convert_phys_footprint_limit(). It computes the same limit value that would be written
8600  * to the task's ledgers via task_set_phys_footprint_limit().
8601  */
8602 static int
8603 memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
8604 {
8605 	memorystatus_memlimit_properties2_t mmp_entry;
8606 
8607 	/* Validate inputs */
8608 	if ((pid == 0) || (buffer == USER_ADDR_NULL) ||
8609 	    ((buffer_size != sizeof(memorystatus_memlimit_properties_t)) &&
8610 	    (buffer_size != sizeof(memorystatus_memlimit_properties2_t)))) {
8611 		return EINVAL;
8612 	}
8613 
8614 	memset(&mmp_entry, 0, sizeof(memorystatus_memlimit_properties2_t));
8615 
8616 	proc_t p = proc_find(pid);
8617 	if (!p) {
8618 		return ESRCH;
8619 	}
8620 
8621 	/*
8622 	 * Get the active limit and attributes.
8623 	 * No locks taken since we hold a reference to the proc.
8624 	 */
8625 
8626 	memorystatus_get_memlimit_properties_internal(p, &mmp_entry.v1);
8627 
8628 #if CONFIG_JETSAM
8629 #if DEVELOPMENT || DEBUG
8630 	/*
8631 	 * Get the limit increased via SPI
8632 	 */
8633 	mmp_entry.memlimit_increase = roundToNearestMB(p->p_memlimit_increase);
8634 	mmp_entry.memlimit_increase_bytes = p->p_memlimit_increase;
8635 #endif /* DEVELOPMENT || DEBUG */
8636 #endif /* CONFIG_JETSAM */
8637 
8638 	proc_rele(p);
8639 
8640 	int error = copyout(&mmp_entry, buffer, buffer_size);
8641 
8642 	return error;
8643 }
8644 
8645 
8646 /*
8647  * SPI for kbd - pr24956468
8648  * This is a very simple snapshot that calculates how much a
8649  * process's phys_footprint exceeds a specific memory limit.
8650  * Only the inactive memory limit is supported for now.
8651  * The delta is returned as bytes in excess or zero.
8652  */
8653 static int
8654 memorystatus_cmd_get_memlimit_excess_np(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
8655 {
8656 	int error = 0;
8657 	uint64_t footprint_in_bytes = 0;
8658 	uint64_t delta_in_bytes = 0;
8659 	int32_t  memlimit_mb = 0;
8660 	uint64_t memlimit_bytes = 0;
8661 
8662 	/* Validate inputs */
8663 	if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(uint64_t)) || (flags != 0)) {
8664 		return EINVAL;
8665 	}
8666 
8667 	proc_t p = proc_find(pid);
8668 	if (!p) {
8669 		return ESRCH;
8670 	}
8671 
8672 	/*
8673 	 * Get the inactive limit.
8674 	 * No locks taken since we hold a reference to the proc.
8675 	 */
8676 
8677 	if (p->p_memstat_memlimit_inactive <= 0) {
8678 		task_convert_phys_footprint_limit(-1, &memlimit_mb);
8679 	} else {
8680 		memlimit_mb = p->p_memstat_memlimit_inactive;
8681 	}
8682 
8683 	footprint_in_bytes = get_task_phys_footprint(proc_task(p));
8684 
8685 	proc_rele(p);
8686 
8687 	memlimit_bytes = memlimit_mb * 1024 * 1024;     /* MB to bytes */
8688 
8689 	/*
8690 	 * Computed delta always returns >= 0 bytes
8691 	 */
8692 	if (footprint_in_bytes > memlimit_bytes) {
8693 		delta_in_bytes = footprint_in_bytes - memlimit_bytes;
8694 	}
8695 
8696 	error = copyout(&delta_in_bytes, buffer, sizeof(delta_in_bytes));
8697 
8698 	return error;
8699 }
8700 
8701 
8702 static int
8703 memorystatus_cmd_get_pressure_status(int32_t *retval)
8704 {
8705 	int error;
8706 
8707 	/* Need privilege for check */
8708 	error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
8709 	if (error) {
8710 		return error;
8711 	}
8712 
8713 	/* Inherently racy, so it's not worth taking a lock here */
8714 	*retval = (kVMPressureNormal != memorystatus_vm_pressure_level) ? 1 : 0;
8715 
8716 	return error;
8717 }
8718 
8719 int
8720 memorystatus_get_pressure_status_kdp()
8721 {
8722 	return (kVMPressureNormal != memorystatus_vm_pressure_level) ? 1 : 0;
8723 }
8724 
8725 /*
8726  * Every process, including a P_MEMSTAT_INTERNAL process (currently only pid 1), is allowed to set a HWM.
8727  *
8728  * This call is inflexible -- it does not distinguish between active/inactive, fatal/non-fatal
8729  * So, with 2-level HWM preserving previous behavior will map as follows.
8730  *      - treat the limit passed in as both an active and inactive limit.
8731  *      - treat the is_fatal_limit flag as though it applies to both active and inactive limits.
8732  *
8733  * When invoked via MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK
8734  *      - the is_fatal_limit is FALSE, meaning the active and inactive limits are non-fatal/soft
8735  *      - so mapping is (active/non-fatal, inactive/non-fatal)
8736  *
8737  * When invoked via MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT
8738  *      - the is_fatal_limit is TRUE, meaning the process's active and inactive limits are fatal/hard
8739  *      - so mapping is (active/fatal, inactive/fatal)
8740  */
8741 
8742 #if CONFIG_JETSAM
8743 static int
8744 memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __unused int32_t *retval, boolean_t is_fatal_limit)
8745 {
8746 	int error = 0;
8747 	memorystatus_memlimit_properties_t entry;
8748 
8749 	entry.memlimit_active = high_water_mark;
8750 	entry.memlimit_active_attr = 0;
8751 	entry.memlimit_inactive = high_water_mark;
8752 	entry.memlimit_inactive_attr = 0;
8753 
8754 	if (is_fatal_limit == TRUE) {
8755 		entry.memlimit_active_attr   |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
8756 		entry.memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
8757 	}
8758 
8759 	error = memorystatus_set_memlimit_properties(pid, &entry);
8760 	return error;
8761 }
8762 
8763 static int
8764 memorystatus_cmd_mark_process_coalition_swappable(pid_t pid, __unused int32_t *retval)
8765 {
8766 	int error = 0;
8767 	proc_t p = PROC_NULL;
8768 	coalition_t coal = COALITION_NULL;
8769 
8770 	if (!memorystatus_swap_all_apps) {
8771 		/* Swap is not supported on this device. */
8772 		return ENOTSUP;
8773 	}
8774 	p = proc_find(pid);
8775 	if (!p) {
8776 		return ESRCH;
8777 	}
8778 	coal = task_get_coalition((task_t) proc_task(p), COALITION_TYPE_JETSAM);
8779 	if (coal && coalition_is_leader((task_t) proc_task(p), coal)) {
8780 		coalition_mark_swappable(coal);
8781 	} else {
8782 		/* This SPI is only supported on coalition leaders. */
8783 		error = EINVAL;
8784 	}
8785 
8786 	proc_rele(p);
8787 	return error;
8788 }
8789 
8790 static int
8791 memorystatus_cmd_get_process_coalition_is_swappable(pid_t pid, int32_t *retval)
8792 {
8793 	int error = 0;
8794 	proc_t p = PROC_NULL;
8795 	coalition_t coal = COALITION_NULL;
8796 
8797 	if (!memorystatus_swap_all_apps) {
8798 		/* Swap is not supported on this device. */
8799 		return ENOTSUP;
8800 	}
8801 	p = proc_find(pid);
8802 	if (!p) {
8803 		return ESRCH;
8804 	}
8805 	coal = task_get_coalition((task_t) proc_task(p), COALITION_TYPE_JETSAM);
8806 	if (coal) {
8807 		*retval = coalition_is_swappable(coal);
8808 	} else {
8809 		error = EINVAL;
8810 	}
8811 
8812 	proc_rele(p);
8813 	return error;
8814 }
8815 
8816 static int
8817 memorystatus_cmd_convert_memlimit_mb(pid_t pid, int32_t limit, int32_t *retval)
8818 {
8819 	int error = 0;
8820 	proc_t p;
8821 	p = proc_find(pid);
8822 	if (!p) {
8823 		return ESRCH;
8824 	}
8825 	if (limit <= 0) {
8826 		/*
8827 		 * A limit of <= 0 implies that the task gets its default limit.
8828 		 */
8829 		limit = memorystatus_get_default_task_active_limit(p);
8830 		if (limit <= 0) {
8831 			/* Task uses system wide default limit */
8832 			limit = max_task_footprint_mb ? max_task_footprint_mb : INT32_MAX;
8833 		}
8834 		*retval = limit;
8835 	} else {
8836 #if DEVELOPMENT || DEBUG
8837 		/* add the current increase to it, for roots */
8838 		limit += roundToNearestMB(p->p_memlimit_increase);
8839 #endif /* DEVELOPMENT || DEBUG */
8840 		*retval = limit;
8841 	}
8842 
8843 	proc_rele(p);
8844 	return error;
8845 }
8846 
8847 static int
8848 _memstat_rearm_proc_memlimit(proc_t proc, void* flagsptr)
8849 {
8850 	task_t task = proc_task(proc);
8851 	uint32_t flags = *((uint32_t *) flagsptr);
8852 
8853 	if (flags & MEMORYSTATUS_FLAGS_REARM_ACTIVE) {
8854 		task_reset_triggered_exc_resource(task, true);
8855 	}
8856 	if (flags & MEMORYSTATUS_FLAGS_REARM_INACTIVE) {
8857 		task_reset_triggered_exc_resource(task, false);
8858 	}
8859 
8860 	return 0;
8861 }
8862 
8863 static int
8864 memorystatus_cmd_rearm_memlimit(pid_t pid, uint32_t flags, __unused int32_t *retval)
8865 {
8866 	if (pid == -1) {
8867 		/* Re-arm all pids */
8868 		proc_iterate(
8869 			PROC_ALLPROCLIST,
8870 			_memstat_rearm_proc_memlimit,
8871 			&flags,
8872 			NULL,
8873 			NULL);
8874 	} else {
8875 		/* Re-arm one pid */
8876 		proc_t p = (pid == proc_selfpid()) ? proc_self() : proc_find(pid);
8877 		if (!p) {
8878 			return ESRCH;
8879 		}
8880 		_memstat_rearm_proc_memlimit(p, &flags);
8881 		proc_rele(p);
8882 	}
8883 
8884 	return 0;
8885 }
8886 #endif /* CONFIG_JETSAM */
8887 
8888 #if DEBUG || DEVELOPMENT
/*
 * Write a diagnostics footprint limit into the task's ledgers.
 * Caller must hold the proc_list lock.  Returns 0 on success or EINVAL
 * if the ledger update fails.
 */
static int
memorystatus_set_diag_memlimit_properties_internal(proc_t p, memorystatus_diag_memlimit_properties_t *p_entry)
{
	int error = 0;
	uint64_t old_limit = 0;

	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
	/* Enforce the limit by writing to the ledgers */
	/* (KERN_SUCCESS is 0, so folding it into a BSD errno here is benign.) */
	error = (task_set_diag_footprint_limit_internal(proc_task(p), p_entry->memlimit, &old_limit) == KERN_SUCCESS) ? KERN_SUCCESS : EINVAL;

	/* NOTE(review): the -1 sentinel below is printed with %llu and will
	 * appear as a huge unsigned value — confirm whether %lld was intended. */
	memorystatus_log_debug( "memorystatus_set_diag_memlimit_properties: new limit on pid %d (%lluMB old %lluMB)\n",
	    proc_getpid(p), (p_entry->memlimit > 0 ? p_entry->memlimit : -1), (old_limit)
	    );
	DTRACE_MEMORYSTATUS2(memorystatus_diag_memlimit_properties_t, proc_t, p, int32_t, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1));
	return error;
}
8905 
/*
 * Read the diagnostics footprint limit and its enabled flag from the
 * task's ledgers into *p_entry.  Returns 0 on success or EINVAL if the
 * ledger query fails (KERN_SUCCESS is 0, so the mixed return is benign).
 */
static int
memorystatus_get_diag_memlimit_properties_internal(proc_t p, memorystatus_diag_memlimit_properties_t *p_entry)
{
	int error = 0;
	/* Enforce the limit by writing to the ledgers */
	error = (task_get_diag_footprint_limit_internal(proc_task(p), &p_entry->memlimit, &p_entry->threshold_enabled) == KERN_SUCCESS) ? KERN_SUCCESS : EINVAL;

	DTRACE_MEMORYSTATUS2(memorystatus_diag_memlimit_properties_t, proc_t, p, int32_t, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1));
	return error;
}
8916 #endif // DEBUG || DEVELOPMENT
8917 
8918 bool
8919 memorystatus_task_has_increased_memory_limit_entitlement(task_t task)
8920 {
8921 	if (memorystatus_entitled_max_task_footprint_mb == 0) {
8922 		// Entitlement is not supported on this device.
8923 		return false;
8924 	}
8925 	return IOTaskHasEntitlement(task,
8926 	           "com.apple.developer.kernel.increased-memory-limit");
8927 }
8928 
8929 bool
8930 memorystatus_task_has_increased_debugging_memory_limit_entitlement(task_t task)
8931 {
8932 	if (memorystatus_entitled_dev_max_task_footprint_mb == 0) {
8933 		// Entitlement is not supported on this device.
8934 		return false;
8935 	}
8936 	return IOTaskHasEntitlement(task,
8937 	           "com.apple.developer.kernel.increased-debugging-memory-limit");
8938 }
8939 
/*
 * True when the task holds the private legacy-footprint entitlement
 * (com.apple.private.memory.legacy_footprint).
 */
bool
memorystatus_task_has_legacy_footprint_entitlement(task_t task)
{
	return IOTaskHasEntitlement(task,
	           "com.apple.private.memory.legacy_footprint");
}
8946 
8947 bool
8948 memorystatus_task_has_ios13extended_footprint_limit(task_t task)
8949 {
8950 	if (max_mem < 1500ULL * 1024 * 1024 ||
8951 	    max_mem > 2ULL * 1024 * 1024 * 1024) {
8952 		/* ios13extended_footprint is only for 2GB devices */
8953 		return false;
8954 	}
8955 	return IOTaskHasEntitlement(task,
8956 	           "com.apple.developer.memory.ios13extended_footprint");
8957 }
8958 
/*
 * Compute the default active memory limit (MB) for a process: start from
 * the system-wide max_task_footprint_mb and raise it for each footprint
 * entitlement the task holds.  With multiple entitlements, the largest
 * resulting limit wins.
 */
static int32_t
memorystatus_get_default_task_active_limit(proc_t p)
{
	int32_t limit = (int32_t)max_task_footprint_mb;
	task_t task = proc_task(p);

	/*
	 * Check for the various entitlement footprint hacks
	 * and try to apply each one. Note that if multiple entitlements are present
	 * whichever results in the largest limit applies.
	 */
	if (memorystatus_task_has_increased_debugging_memory_limit_entitlement(task)) {
		limit = MAX(limit, memorystatus_entitled_dev_max_task_footprint_mb);
	}
	if (memorystatus_task_has_increased_memory_limit_entitlement(task)) {
#if CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT
		/* On visionOS, we want a separate memory limit for iOS (bincompat) apps. */
		if ((proc_platform(p) == PLATFORM_IOS) &&
		    (memorystatus_entitled_bincompat_max_task_footprint_mb != 0)) {
			limit = MAX(limit, memorystatus_entitled_bincompat_max_task_footprint_mb);
		} else {
			limit = MAX(limit, memorystatus_entitled_max_task_footprint_mb);
		}
#else /* CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT */
		limit = MAX(limit, memorystatus_entitled_max_task_footprint_mb);
#endif /* !CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT */
	}
#if __arm64__
	/* Legacy-footprint bonus only applies in the limit-increase mode. */
	if (legacy_footprint_entitlement_mode == LEGACY_FOOTPRINT_ENTITLEMENT_LIMIT_INCREASE &&
	    memorystatus_task_has_legacy_footprint_entitlement(task)) {
		limit = MAX(limit, max_task_footprint_mb + legacy_footprint_bonus_mb);
	}
#endif /* __arm64__ */
	if (memorystatus_task_has_ios13extended_footprint_limit(task)) {
		limit = MAX(limit, memorystatus_ios13extended_footprint_limit_mb);
	}

	return limit;
}
8998 
8999 static int32_t
9000 memorystatus_get_default_task_inactive_limit(proc_t p)
9001 {
9002 	// Currently the default active and inactive limits are always the same.
9003 	return memorystatus_get_default_task_active_limit(p);
9004 }
9005 
9006 static int
9007 memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties_t *entry)
9008 {
9009 	int32_t memlimit_active, memlimit_inactive;
9010 	memlimit_options_t memlimit_options = MEMLIMIT_OPTIONS_NONE;
9011 
9012 	proc_t p = proc_find(pid);
9013 	if (!p) {
9014 		return ESRCH;
9015 	}
9016 
9017 	/*
9018 	 * Check for valid attribute flags.
9019 	 */
9020 	const uint32_t valid_attrs = MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
9021 	if ((entry->memlimit_active_attr & (~valid_attrs)) != 0) {
9022 		proc_rele(p);
9023 		return EINVAL;
9024 	}
9025 	if ((entry->memlimit_inactive_attr & (~valid_attrs)) != 0) {
9026 		proc_rele(p);
9027 		return EINVAL;
9028 	}
9029 
9030 	/*
9031 	 * Setup the active memlimit properties
9032 	 */
9033 	memlimit_active = entry->memlimit_active;
9034 	if (entry->memlimit_active_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL) {
9035 		memlimit_options |= MEMLIMIT_ACTIVE_FATAL;
9036 	}
9037 
9038 	/*
9039 	 * Setup the inactive memlimit properties
9040 	 */
9041 	memlimit_inactive = entry->memlimit_inactive;
9042 	if (entry->memlimit_inactive_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL) {
9043 		memlimit_options |= MEMLIMIT_INACTIVE_FATAL;
9044 	}
9045 
9046 	int error = memorystatus_set_memlimits(p, memlimit_active,
9047 	    memlimit_inactive, memlimit_options);
9048 	proc_rele(p);
9049 	return error;
9050 }
9051 
9052 /*
9053  * Returns the jetsam priority (effective or requested) of the process
9054  * associated with this task.
9055  */
9056 int
9057 proc_get_memstat_priority(proc_t p, boolean_t effective_priority)
9058 {
9059 	if (p) {
9060 		if (effective_priority) {
9061 			return p->p_memstat_effectivepriority;
9062 		} else {
9063 			return p->p_memstat_requestedpriority;
9064 		}
9065 	}
9066 	return 0;
9067 }
9068 
9069 static int
9070 memorystatus_get_process_is_managed(pid_t pid, int *is_managed)
9071 {
9072 	proc_t p = NULL;
9073 
9074 	/* Validate inputs */
9075 	if (pid == 0) {
9076 		return EINVAL;
9077 	}
9078 
9079 	p = proc_find(pid);
9080 	if (!p) {
9081 		return ESRCH;
9082 	}
9083 
9084 	*is_managed = memorystatus_get_proc_is_managed(p) ? 1 : 0;
9085 
9086 	proc_rele(p);
9087 
9088 	return 0;
9089 }
9090 
/*
 * Returns true when the process carries the managed (P_MEMSTAT_MANAGED)
 * state.  The proc_list lock is taken for a consistent read.
 */
bool
memorystatus_get_proc_is_managed(proc_t proc)
{
	proc_list_lock();
	bool is_managed = _memstat_proc_is_managed(proc);
	proc_list_unlock();
	return is_managed;
}
9099 
9100 
9101 static int
9102 memorystatus_set_process_is_managed(pid_t pid, boolean_t set_managed)
9103 {
9104 	proc_t p = NULL;
9105 
9106 	/* Validate inputs */
9107 	if (pid == 0) {
9108 		return EINVAL;
9109 	}
9110 
9111 	p = proc_find(pid);
9112 	if (!p) {
9113 		return ESRCH;
9114 	}
9115 
9116 	proc_list_lock();
9117 
9118 	if (set_managed == TRUE) {
9119 		p->p_memstat_state |= P_MEMSTAT_MANAGED;
9120 		/*
9121 		 * The P_MEMSTAT_MANAGED bit is set by Runningboard for Apps.
9122 		 * Also opt them in to being frozen (they might have started
9123 		 * off with the P_MEMSTAT_FREEZE_DISABLED bit set.)
9124 		 */
9125 		p->p_memstat_state &= ~P_MEMSTAT_FREEZE_DISABLED;
9126 	} else {
9127 		p->p_memstat_state &= ~P_MEMSTAT_MANAGED;
9128 	}
9129 
9130 	if (_memstat_proc_is_tracked(p)) {
9131 		memorystatus_log_error("memorystatus: process %s [%d] opted in to both "
9132 		    "Management and ActivityTracking\n", proc_best_name(p),
9133 		    proc_pid(p));
9134 	}
9135 
9136 	proc_list_unlock();
9137 
9138 	proc_rele(p);
9139 
9140 	return 0;
9141 }
9142 
/*
 * Read (and optionally reset) the kill counter for a given jetsam
 * priority band and kill cause.
 *
 * Idle-exit kills are recorded in a dedicated global counter, and the
 * code below only reports them for the idle band. All other causes live
 * in the memorystatus_kill_counts table; the "cause - 1" / "cause - 2"
 * index offsets below indicate that kMemorystatusInvalid (and, for
 * causes above it, kMemorystatusKilledIdleExit) have no slots in that
 * table -- NOTE(review): inferred from the offsets; confirm against the
 * table's declaration.
 */
static int
_memstat_get_kill_count(int priority, memorystatus_kill_cause_t cause, bool clear)
{
	uint32_t _Atomic *ptr;

	assert(priority >= JETSAM_PRIORITY_IDLE);
	assert(priority <= JETSAM_PRIORITY_MAX);

	/* rdar://141462516 */
	if (cause == kMemorystatusInvalid) {
		return 0;
	} else if (cause == kMemorystatusKilledIdleExit) {
		if (priority == JETSAM_PRIORITY_IDLE) {
			ptr = &memorystatus_idle_exit_kill_count;
		} else {
			return 0; /* This never happens */
		}
	} else {
		/* Squeeze the unrepresented cause values out of the index space. */
		if (cause < kMemorystatusKilledIdleExit) {
			ptr = &memorystatus_kill_counts[priority][cause - 1];
		} else {
			ptr = &memorystatus_kill_counts[priority][cause - 2];
		}
	}

	if (clear) {
		/* Atomically return the previous count and zero the counter. */
		return os_atomic_xchg(ptr, 0, relaxed);
	} else {
		return os_atomic_load(ptr, relaxed);
	}
}
9174 
9175 static int
9176 memorystatus_cmd_get_kill_counts(int priority, user_addr_t buffer, size_t buffer_size, int flags)
9177 {
9178 	memorystatus_kill_cause_t cause;
9179 	uint32_t outbuf[JETSAM_REASON_MEMORYSTATUS_MAX + 1];
9180 	bool clear = flags & MEMORYSTATUS_GET_KILL_COUNTS_CLEAR;
9181 
9182 	if (((buffer_size % sizeof(uint32_t)) != 0) ||
9183 	    (priority < JETSAM_PRIORITY_IDLE) ||
9184 	    (priority > JETSAM_PRIORITY_MAX)) {
9185 		return EINVAL;
9186 	}
9187 
9188 	for (cause = kMemorystatusInvalid; cause <= JETSAM_REASON_MEMORYSTATUS_MAX; cause++) {
9189 		outbuf[cause] = _memstat_get_kill_count(priority, cause, clear);
9190 	}
9191 
9192 	return copyout(outbuf, buffer, MIN(buffer_size, sizeof(outbuf)));
9193 }
9194 
/*
 * memorystatus_control() system call handler.
 *
 * Dispatches the memorystatus command in args->command after performing,
 * in order: an entitlement/root check (with a small set of exempt
 * commands), a buffer-size sanity check, and a MAC policy check.
 * Returns 0 on success or an errno value.
 */
int
memorystatus_control(struct proc *p, struct memorystatus_control_args *args, int *ret)
{
	int error = EINVAL;
	boolean_t skip_auth_check = FALSE;
	os_reason_t jetsam_reason = OS_REASON_NULL;

#if !CONFIG_JETSAM
    #pragma unused(ret)
    #pragma unused(jetsam_reason)
#endif

	/* We don't need entitlements if we're setting / querying the freeze preference or frozen status for a process. */
	if (args->command == MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE ||
	    args->command == MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE ||
	    args->command == MEMORYSTATUS_CMD_GET_PROCESS_IS_FROZEN) {
		skip_auth_check = TRUE;
	}

	/*
	 * On development kernel, we don't need entitlements if we're adjusting the limit.
	 * This required for limit adjustment by dyld when roots are detected, see rdar://99669958
	 * Note: the exemption only applies when the caller targets itself.
	 */
#if DEVELOPMENT || DEBUG
	if (args->command == MEMORYSTATUS_CMD_INCREASE_JETSAM_TASK_LIMIT && proc_getpid(p) == args->pid) {
		skip_auth_check = TRUE;
	}
#endif /* DEVELOPMENT || DEBUG */

#if DEVELOPMENT || DEBUG
	/*
	 * On development kernels, processes should be able to re-arm themselves
	 * without entitlement for testing.
	 */
	if (args->command == MEMORYSTATUS_CMD_REARM_MEMLIMIT && proc_getpid(p) == args->pid) {
		skip_auth_check = TRUE;
	}
#endif

	/* Need to be root or have entitlement. */
	if (!kauth_cred_issuser(kauth_cred_get()) && !IOCurrentTaskHasEntitlement(MEMORYSTATUS_ENTITLEMENT) && !skip_auth_check) {
		error = EPERM;
		goto out;
	}

	/*
	 * Sanity check.
	 * Do not enforce it for snapshots or v2 priority list.
	 * (the latter always allocates an appropriately-sized buffer.)
	 */
	if (args->command != MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT &&
	    args->command != MEMORYSTATUS_CMD_GET_PRIORITY_LIST_V2 &&
	    args->command != MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_NAMES &&
	    args->command != MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_INFO &&
	    args->command != MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_MEMINFO) {
		if (args->buffersize > MEMORYSTATUS_BUFFERSIZE_MAX) {
			error = EINVAL;
			goto out;
		}
	}

#if CONFIG_MACF
	error = mac_proc_check_memorystatus_control(p, args->command, args->pid);
	if (error) {
		goto out;
	}
#endif /* MAC */

	switch (args->command) {
	case MEMORYSTATUS_CMD_GET_PRIORITY_LIST:
		error = memorystatus_cmd_get_priority_list(
			args->pid,
			args->buffer,
			args->buffersize,
			ret,
			sizeof(memorystatus_priority_entry_t));
		break;
	case MEMORYSTATUS_CMD_GET_PRIORITY_LIST_V2:
		/* Same handler as above; only the per-entry record size differs. */
		error = memorystatus_cmd_get_priority_list(
			args->pid,
			args->buffer,
			args->buffersize,
			ret,
			sizeof(memorystatus_priority_entry_v2_t));
		break;
	case MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES:
		error = memorystatus_cmd_set_priority_properties(args->pid, args->flags, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_SET_MEMLIMIT_PROPERTIES:
		error = memorystatus_cmd_set_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GET_MEMLIMIT_PROPERTIES:
		error = memorystatus_cmd_get_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GET_MEMLIMIT_EXCESS:
		error = memorystatus_cmd_get_memlimit_excess_np(args->pid, args->flags, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GRP_SET_PROPERTIES:
		error = memorystatus_cmd_grp_set_properties((int32_t)args->flags, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT:
		error = memorystatus_cmd_get_jetsam_snapshot((int32_t)args->flags, args->buffer, args->buffersize, ret);
		break;
#if JETSAM_ZPRINT_SNAPSHOT
	case MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_NAMES:
		error = memorystatus_cmd_get_data_buffer(args->buffer, args->buffersize, ret,
		    jzs_zone_cnt * sizeof(mach_zone_name_t), jzs_names);
		break;
	case MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_INFO:
		error = memorystatus_cmd_get_data_buffer(args->buffer, args->buffersize, ret,
		    jzs_zone_cnt * sizeof(mach_zone_info_t), jzs_info);
		break;
	case MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_MEMINFO:
		error = memorystatus_cmd_get_data_buffer(args->buffer, args->buffersize, ret,
		    jzs_meminfo_cnt * sizeof(mach_memory_info_t), jzs_meminfo);
		break;
#endif
#if DEVELOPMENT || DEBUG
	case MEMORYSTATUS_CMD_SET_TESTING_PID:
		error = memorystatus_cmd_set_testing_pid((int32_t) args->flags);
		break;
#endif
	case MEMORYSTATUS_CMD_GET_PRESSURE_STATUS:
		error = memorystatus_cmd_get_pressure_status(ret);
		break;
#if CONFIG_JETSAM
	case MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK:
		/*
		 * This call does not distinguish between active and inactive limits.
		 * Default behavior in 2-level HWM world is to set both.
		 * Non-fatal limit is also assumed for both.
		 */
		error = memorystatus_cmd_set_jetsam_memory_limit(args->pid, (int32_t)args->flags, ret, FALSE);
		break;
	case MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT:
		/*
		 * This call does not distinguish between active and inactive limits.
		 * Default behavior in 2-level HWM world is to set both.
		 * Fatal limit is also assumed for both.
		 */
		error = memorystatus_cmd_set_jetsam_memory_limit(args->pid, (int32_t)args->flags, ret, TRUE);
		break;
	case MEMORYSTATUS_CMD_MARK_PROCESS_COALITION_SWAPPABLE:
		error = memorystatus_cmd_mark_process_coalition_swappable(args->pid, ret);
		break;

	case MEMORYSTATUS_CMD_GET_PROCESS_COALITION_IS_SWAPPABLE:
		error = memorystatus_cmd_get_process_coalition_is_swappable(args->pid, ret);
		break;

	case MEMORYSTATUS_CMD_CONVERT_MEMLIMIT_MB:
		error = memorystatus_cmd_convert_memlimit_mb(args->pid, (int32_t) args->flags, ret);
		break;

	case MEMORYSTATUS_CMD_REARM_MEMLIMIT:
		error = memorystatus_cmd_rearm_memlimit(args->pid, args->flags, ret);
		break;
#endif /* CONFIG_JETSAM */
		/* Test commands */
#if DEVELOPMENT || DEBUG
	case MEMORYSTATUS_CMD_TEST_JETSAM:
		jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_GENERIC);
		if (jetsam_reason == OS_REASON_NULL) {
			/* Proceed anyway; the kill is attempted with a NULL reason. */
			memorystatus_log_error("memorystatus_control: failed to allocate jetsam reason\n");
		}

		error = memstat_kill_process_sync(args->pid, kMemorystatusKilled, jetsam_reason) ? 0 : EINVAL;
		break;
	case MEMORYSTATUS_CMD_TEST_JETSAM_SORT:
		error = memorystatus_cmd_test_jetsam_sort(args->pid, (int32_t)args->flags, args->buffer, args->buffersize);
		break;
#else /* DEVELOPMENT || DEBUG */
	#pragma unused(jetsam_reason)
#endif /* DEVELOPMENT || DEBUG */
	case MEMORYSTATUS_CMD_AGGRESSIVE_JETSAM_LENIENT_MODE_ENABLE:
		/* Enabling is a no-op (error stays EINVAL) if lenient mode is already allowed. */
		if (memorystatus_aggressive_jetsam_lenient_allowed == FALSE) {
#if DEVELOPMENT || DEBUG
			memorystatus_log_info("Enabling Lenient Mode\n");
#endif /* DEVELOPMENT || DEBUG */

			memorystatus_aggressive_jetsam_lenient_allowed = TRUE;
			memorystatus_aggressive_jetsam_lenient = TRUE;
			error = 0;
		}
		break;
	case MEMORYSTATUS_CMD_AGGRESSIVE_JETSAM_LENIENT_MODE_DISABLE:
#if DEVELOPMENT || DEBUG
		memorystatus_log_info("Disabling Lenient mode\n");
#endif /* DEVELOPMENT || DEBUG */
		memorystatus_aggressive_jetsam_lenient_allowed = FALSE;
		memorystatus_aggressive_jetsam_lenient = FALSE;
		error = 0;
		break;
	case MEMORYSTATUS_CMD_GET_AGGRESSIVE_JETSAM_LENIENT_MODE:
		*ret = (memorystatus_aggressive_jetsam_lenient ? 1 : 0);
		error = 0;
		break;
	case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE:
	case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE:
		error = memorystatus_low_mem_privileged_listener(args->command);
		break;

	case MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE:
	case MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE:
		error = memorystatus_update_inactive_jetsam_priority_band(args->pid, args->command, JETSAM_PRIORITY_ELEVATED_INACTIVE, args->flags ? TRUE : FALSE);
		break;
	case MEMORYSTATUS_CMD_SET_PROCESS_IS_MANAGED:
		error = memorystatus_set_process_is_managed(args->pid, args->flags);
		break;

	case MEMORYSTATUS_CMD_GET_PROCESS_IS_MANAGED:
		error = memorystatus_get_process_is_managed(args->pid, ret);
		break;

#if CONFIG_FREEZE
	case MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE:
		error = memorystatus_set_process_is_freezable(args->pid, args->flags ? TRUE : FALSE);
		break;

	case MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE:
		error = memorystatus_get_process_is_freezable(args->pid, ret);
		break;
	case MEMORYSTATUS_CMD_GET_PROCESS_IS_FROZEN:
		error = memorystatus_get_process_is_frozen(args->pid, ret);
		break;

	case MEMORYSTATUS_CMD_FREEZER_CONTROL:
		error = memorystatus_freezer_control(args->flags, args->buffer, args->buffersize, ret);
		break;
#endif /* CONFIG_FREEZE */

#if DEVELOPMENT || DEBUG
	case MEMORYSTATUS_CMD_INCREASE_JETSAM_TASK_LIMIT:
		error = memorystatus_cmd_increase_jetsam_task_limit(args->pid, args->flags);
		break;
	case MEMORYSTATUS_CMD_SET_DIAG_LIMIT:
		error = memorystatus_cmd_set_diag_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GET_DIAG_LIMIT:
		error = memorystatus_cmd_get_diag_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
		break;
#endif /* DEVELOPMENT || DEBUG */

	case MEMORYSTATUS_CMD_GET_KILL_COUNTS:
		/* NB: for this command, args->pid carries the jetsam priority band. */
		error = memorystatus_cmd_get_kill_counts(args->pid, args->buffer, args->buffersize, args->flags);
		break;

	case MEMORYSTATUS_CMD_GET_CONCLAVE_LIMIT:
		error = _memstat_get_process_conclave_mem_limit(args->pid, ret);
		break;

	default:
		error = EINVAL;
		break;
	}

out:
	return error;
}
9454 
9455 /* Coalition support */
9456 
9457 /*
9458  * Inserts a list of pids before the given proc in the bucket. If any of the
9459  * pids in the given list are not already in the bucket, they will be ignored.
9460  */
9461 static void
9462 memstat_insert_list_locked(
9463 	proc_t before,
9464 	unsigned int bucket_idx,
9465 	pid_t *pid_list,
9466 	int list_sz)
9467 {
9468 	int i;
9469 	proc_t p;
9470 	memstat_bucket_t *bucket;
9471 
9472 	assert(bucket_idx < MEMSTAT_BUCKET_COUNT);
9473 
9474 	bucket = &memstat_bucket[bucket_idx];
9475 
9476 	if ((pid_list == NULL) || (list_sz <= 0)) {
9477 		return;
9478 	}
9479 
9480 	for (i = list_sz - 1; i >= 0; i--) {
9481 		p = proc_find_locked(pid_list[i]);
9482 
9483 		if (p == NULL) {
9484 			continue;
9485 		}
9486 
9487 		if ((p == before) || (p->p_memstat_effectivepriority != bucket_idx)) {
9488 			/*
9489 			 * We can encounter p == before when we try to sort a coalition with an in-
9490 			 * progress exec of the leader, such that the leader and the exec-ing
9491 			 * member have the same PID. Just skip over it for now, since this member
9492 			 * will soon be removed from the proc list anyway.
9493 			 */
9494 			proc_rele(p);
9495 			continue;
9496 		}
9497 
9498 		TAILQ_REMOVE(&bucket->list, p, p_memstat_list);
9499 		TAILQ_INSERT_BEFORE(before, p, p_memstat_list);
9500 		proc_rele(p);
9501 	}
9502 }
9503 /*
9504  * Return the number of pids rearranged during this sort.
9505  */
static void
memstat_sort_coals_locked(unsigned int bucket_index, memorystatus_jetsam_sort_order_t sort_order)
{
/* Bounded scratch space: at most this many coalition members are moved per role. */
#define MAX_SORT_PIDS           80

	int ntasks = 0;
	proc_t p = NULL;
	coalition_t coal = COALITION_NULL;
	pid_t pid_list[MAX_SORT_PIDS];
	memstat_bucket_t *bucket;

	assert((sort_order == JETSAM_SORT_LRU) || (sort_order == JETSAM_SORT_FOOTPRINT));
	assert(bucket_index < MEMSTAT_BUCKET_COUNT);

	switch (sort_order) {
	case JETSAM_SORT_LRU:
		/* Nothing to do, buckets are already LRU */
		break;
	case JETSAM_SORT_FOOTPRINT:
		/* Sort bucket by footprint first */
		memstat_sort_by_footprint_locked(bucket_index);
		break;
	default:
		panic("Invalid sort order %d passed to memstat_sort_coals", sort_order);
	}

	/*
	 * During coalition sorting, processes in a priority band are rearranged
	 * by being re-inserted at the head of the queue.  So, when handling a
	 * list, the first process that gets moved to the head of the queue,
	 * ultimately gets pushed toward the queue tail, and hence, jetsams last.
	 *
	 * So, for example, the coalition leader is expected to jetsam last,
	 * after its coalition members.  Therefore, the coalition leader is
	 * inserted at the head of the queue first.
	 *
	 * After processing a coalition, the jetsam order is as follows:
	 *   undefs(jetsam first), extensions, xpc services, leader(jetsam last)
	 */

	/*
	 * Coalition members are rearranged in the priority bucket here,
	 * based on their coalition role.
	 */

	bucket = &memstat_bucket[bucket_index];
	p = TAILQ_FIRST(&bucket->list);
	while (p) {
		/* Only coalition leaders anchor a re-sort; members are moved relative to them. */
		coal = task_get_coalition(proc_task(p), COALITION_TYPE_JETSAM);
		if (!coalition_is_leader(proc_task(p), coal)) {
			p = TAILQ_NEXT(p, p_memstat_list);
			continue;
		}

		/* undefined coalition members should be the first to jetsam */
		ntasks = coalition_get_pid_list(coal, COALITION_ROLEMASK_UNDEF,
		    COALITION_SORT_DEFAULT, pid_list, MAX_SORT_PIDS);

		/* ntasks may exceed the buffer capacity; clamp to what pid_list holds. */
		if (ntasks > 0) {
			memstat_insert_list_locked(p, bucket_index, pid_list,
			    (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
		}

		/* extensions should jetsam after unmarked processes */
		ntasks = coalition_get_pid_list(coal, COALITION_ROLEMASK_EXT,
		    COALITION_SORT_DEFAULT, pid_list, MAX_SORT_PIDS);

		if (ntasks > 0) {
			memstat_insert_list_locked(p, bucket_index, pid_list,
			    (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
		}

		/* xpc services should jetsam after extensions */
		ntasks = coalition_get_pid_list(coal, COALITION_ROLEMASK_XPC,
		    COALITION_SORT_DEFAULT, pid_list, MAX_SORT_PIDS);

		if (ntasks > 0) {
			memstat_insert_list_locked(p, bucket_index, pid_list,
			    (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
		}

		/*
		 * And then, the leader will jetsam last since we inserted everyone else
		 * before it in the bucket
		 */

		p = TAILQ_NEXT(p, p_memstat_list);
	} /* end while */
}
9595 
9596 
9597 
9598 uint32_t
9599 memstat_get_idle_proccnt(void)
9600 {
9601 #if CONFIG_JETSAM
9602 	/*
9603 	 * On fully jetsam-enabled systems, all processes on the idle band may
9604 	 * be idle-exited
9605 	 */
9606 	return os_atomic_load(&memstat_bucket[JETSAM_PRIORITY_IDLE].count, relaxed);
9607 #else /* !CONFIG_JETSAM */
9608 	uint32_t count = 0;
9609 	uint32_t bucket = JETSAM_PRIORITY_IDLE;
9610 
9611 	proc_list_lock();
9612 	for (proc_t p = memorystatus_get_first_proc_locked(&bucket, FALSE);
9613 	    p != PROC_NULL;
9614 	    p = memorystatus_get_next_proc_locked(&bucket, p, FALSE)) {
9615 		/*
9616 		 * On macOS, we can only exit clean daemons. In the future, we
9617 		 * should include assertion-less managed daemons. Apps may make
9618 		 * their way into this band as well, and we cannot jetsam those.
9619 		 */
9620 		if (_memstat_proc_can_idle_exit(p) &&
9621 		    !_memstat_proc_is_dirty(p) &&
9622 		    !_memstat_proc_is_terminating(p)) {
9623 			count++;
9624 		}
9625 	}
9626 	proc_list_unlock();
9627 
9628 	return count;
9629 #endif /* CONFIG_JETSAM */
9630 }
9631 
9632 uint32_t
9633 memstat_get_long_idle_proccnt(void)
9634 {
9635 	uint32_t count = 0;
9636 	uint32_t bucket = JETSAM_PRIORITY_IDLE;
9637 
9638 	proc_list_lock();
9639 	for (proc_t p = memorystatus_get_first_proc_locked(&bucket, FALSE);
9640 	    p != PROC_NULL;
9641 	    p = memorystatus_get_next_proc_locked(&bucket, p, FALSE)) {
9642 		if (!_memstat_proc_is_dirty(p) && _memstat_proc_can_idle_exit(p) &&
9643 		    !_memstat_proc_is_terminating(p) && _memstat_proc_is_reapable(p)) {
9644 			count++;
9645 		}
9646 	}
9647 	proc_list_unlock();
9648 
9649 	return count;
9650 }
9651 
9652 uint32_t
9653 memstat_get_proccnt_upto_priority(uint32_t max_bucket_index)
9654 {
9655 	int32_t i = JETSAM_PRIORITY_IDLE;
9656 	int count = 0;
9657 
9658 	assert3u(max_bucket_index, <=, MEMSTAT_BUCKET_COUNT);
9659 
9660 	while (i <= max_bucket_index) {
9661 		/*
9662 		 * NB: We don't hold the proc-list lock here; that's ok b/c this is just an
9663 		 * estimate.
9664 		 */
9665 		count += os_atomic_load(&memstat_bucket[i++].count, relaxed);
9666 	}
9667 
9668 	return count;
9669 }
9670 
/*
 * Move an App Nap'd application to/from the background band (macOS only;
 * returns -1 on CONFIG_JETSAM configurations and for ineligible processes).
 *
 * Returns 0 when the update was applied or intentionally skipped.
 */
int
memorystatus_update_priority_for_appnap(proc_t p)
{
#if !CONFIG_JETSAM
	if (!p || (!isApp(p)) || (p->p_memstat_state & (P_MEMSTAT_INTERNAL | P_MEMSTAT_MANAGED))) {
		/*
		 * Ineligible processes OR system processes e.g. launchd.
		 */
		return -1;
	}

	int32_t priority = 0;

	proc_list_lock();

	if (proc_list_exited(p) ||
	    (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP))) {
		/*
		 * If the process is on its way out OR
		 * jetsam has already tried and failed to kill this process,
		 * let's skip the whole jetsam band transition.
		 */
		proc_list_unlock();
		return 0;
	}

	/*
	 * Update priority. We don't want the aging logic because that's only applicable on
	 * configs with CONFIG_JETSAM.
	 */
	priority = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_SUP_ACTIVE) ?
	    JETSAM_PRIORITY_BACKGROUND :
	    p->p_memstat_requestedpriority;
	/* A priority assertion can only raise the target band, never lower it. */
	if (_memstat_proc_has_priority_assertion(p)) {
		priority = MAX(priority, p->p_memstat_assertionpriority);
	}
	memstat_update_priority_locked(p, priority, MEMSTAT_PRIORITY_NO_AGING);

	proc_list_unlock();

	return 0;

#else /* !CONFIG_JETSAM */
	#pragma unused(p)
	return -1;
#endif /* !CONFIG_JETSAM */
}
9718 
9719 uint64_t
9720 memorystatus_available_memory_internal(struct proc *p)
9721 {
9722 #ifdef XNU_TARGET_OS_OSX
9723 	if (p->p_memstat_memlimit <= 0) {
9724 		return 0;
9725 	}
9726 #endif /* XNU_TARGET_OS_OSX */
9727 	const uint64_t footprint_in_bytes = get_task_phys_footprint(proc_task(p));
9728 	int32_t memlimit_mb;
9729 	int64_t memlimit_bytes;
9730 	int64_t rc;
9731 
9732 	if (isApp(p) == FALSE) {
9733 		return 0;
9734 	}
9735 
9736 	if (p->p_memstat_memlimit > 0) {
9737 		memlimit_mb = p->p_memstat_memlimit;
9738 	} else if (task_convert_phys_footprint_limit(-1, &memlimit_mb) != KERN_SUCCESS) {
9739 		return 0;
9740 	}
9741 
9742 	if (memlimit_mb <= 0) {
9743 		memlimit_bytes = INT_MAX & ~((1 << 20) - 1);
9744 	} else {
9745 		memlimit_bytes = ((int64_t) memlimit_mb) << 20;
9746 	}
9747 
9748 	rc = memlimit_bytes - footprint_in_bytes;
9749 
9750 	return (rc >= 0) ? rc : 0;
9751 }
9752 
9753 int
9754 memorystatus_available_memory(struct proc *p, __unused struct memorystatus_available_memory_args *args, uint64_t *ret)
9755 {
9756 	*ret = memorystatus_available_memory_internal(p);
9757 
9758 	return 0;
9759 }
9760 
9761 #if DEVELOPMENT || DEBUG
/*
 * Raise a process's active and inactive memlimits by byte_increase bytes
 * (DEVELOPMENT/DEBUG only; used e.g. by dyld when roots are detected).
 *
 * The cumulative increase is stored in p->p_memlimit_increase; each call
 * backs out the previously applied increase before applying the new total,
 * so repeated calls accumulate rather than compound.
 *
 * Returns EINVAL for bad inputs, ESRCH if the pid cannot be found, or the
 * result of memstat_set_memlimits_locked().
 */
static int
memorystatus_cmd_increase_jetsam_task_limit(pid_t pid, uint32_t byte_increase)
{
	int32_t memlimit_active, memlimit_inactive;

	/* Validate inputs */
	if ((pid == 0) || (byte_increase == 0)) {
		return EINVAL;
	}

	if (memstat_ignore_task_limit_increase) {
		/* If the bootarg is set, lie and say we did it */
		return 0;
	}

	proc_t p = proc_find(pid);

	if (!p) {
		return ESRCH;
	}

	/*
	 * NOTE(review): p->p_memlimit_increase is read here before
	 * proc_list_lock() is taken below -- confirm no concurrent writer can
	 * race this read.
	 */
	const uint32_t current_memlimit_increase = roundToNearestMB(p->p_memlimit_increase);
	/* round to page */
	const int32_t page_aligned_increase = (int32_t) MIN(round_page(p->p_memlimit_increase + byte_increase), INT32_MAX);

	proc_list_lock();

	/* Replace the old increase with the new cumulative one (active limit). */
	memlimit_active = p->p_memstat_memlimit_active;
	if (memlimit_active > 0) {
		memlimit_active -= current_memlimit_increase;
		memlimit_active += roundToNearestMB(page_aligned_increase);
	}

	/* Same adjustment for the inactive limit. */
	memlimit_inactive = p->p_memstat_memlimit_inactive;
	if (memlimit_inactive > 0) {
		memlimit_inactive -= current_memlimit_increase;
		memlimit_inactive += roundToNearestMB(page_aligned_increase);
	}

	/*
	 * Store the updated delta limit in the proc.
	 */
	p->p_memlimit_increase = page_aligned_increase;

	/* Preserve the existing fatal/non-fatal attributes of both limits. */
	memlimit_options_t memlimit_options = MEMLIMIT_OPTIONS_NONE;
	if (_memstat_proc_inactive_memlimit_is_fatal(p)) {
		memlimit_options |= MEMLIMIT_INACTIVE_FATAL;
	}
	if (_memstat_proc_active_memlimit_is_fatal(p)) {
		memlimit_options |= MEMLIMIT_ACTIVE_FATAL;
	}

	int error = memstat_set_memlimits_locked(p,
	    memlimit_active, memlimit_inactive,
	    memlimit_options);

	proc_list_unlock();
	proc_rele(p);

	return error;
}
#endif /* DEVELOPMENT || DEBUG */
9824