xref: /xnu-11215.81.4/bsd/kern/kern_memorystatus.c (revision d4514f0bc1d3f944c22d92e68b646ac3fb40d452)
1 /*
2  * Copyright (c) 2006-2019 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  *
28  */
29 
30 #include <kern/sched_prim.h>
31 #include <kern/kalloc.h>
32 #include <kern/assert.h>
33 #include <kern/debug.h>
34 #include <kern/locks.h>
35 #include <kern/task.h>
36 #include <kern/thread.h>
37 #include <kern/host.h>
38 #include <kern/policy_internal.h>
39 #include <kern/thread_group.h>
40 #include <kern/zalloc.h>
41 
42 #include <corpses/task_corpse.h>
43 #include <libkern/libkern.h>
44 #include <mach/mach_time.h>
45 #include <mach/task.h>
46 #include <mach/host_priv.h>
47 #include <mach/mach_host.h>
48 #include <pexpert/pexpert.h>
49 #include <sys/coalition.h>
50 #include <sys/code_signing.h>
51 #include <sys/kern_event.h>
52 #include <sys/proc.h>
53 #include <sys/proc_info.h>
54 #include <sys/reason.h>
55 #include <sys/signal.h>
56 #include <sys/signalvar.h>
57 #include <sys/sysctl.h>
58 #include <sys/sysproto.h>
59 #include <sys/spawn_internal.h>
60 #include <sys/wait.h>
61 #include <sys/tree.h>
62 #include <sys/priv.h>
63 #include <vm/pmap.h>
64 #include <vm/vm_reclaim_xnu.h>
65 #include <vm/vm_pageout_xnu.h>
66 #include <vm/vm_protos.h>
67 #include <vm/vm_purgeable_xnu.h>
68 #include <vm/vm_page.h>
69 #include <vm/vm_compressor_xnu.h>
70 #include <mach/machine/sdt.h>
71 #include <libkern/section_keywords.h>
72 #include <os/atomic_private.h>
73 #include <os/overflow.h>
74 
75 #include <IOKit/IOBSD.h>
76 
77 #if CONFIG_MACF
78 #include <security/mac_framework.h>
79 #endif
80 
81 #if CONFIG_FREEZE
82 #include <vm/vm_map.h>
83 #endif /* CONFIG_FREEZE */
84 
85 #include <kern/kern_memorystatus_internal.h>
86 #include <sys/kern_memorystatus.h>
87 #include <sys/kern_memorystatus_xnu.h>
88 #include <sys/kern_memorystatus_freeze.h>
89 #include <sys/kern_memorystatus_notify.h>
90 #include <sys/kdebug_triage.h>
91 #include <sys/file_internal.h>
92 #include <net/necp.h>
93 
94 errno_t mach_to_bsd_errno(kern_return_t mach_err);
95 extern uint32_t vm_compressor_pool_size(void);
96 extern uint32_t vm_compressor_fragmentation_level(void);
97 
98 pid_t memorystatus_freeze_last_pid_thawed = 0;
99 uint64_t memorystatus_freeze_last_pid_thawed_ts = 0;
100 
101 int block_corpses = 0; /* counter to block new corpses if jetsam purges them */
102 
/*
 * Human-readable cause strings used when logging jetsam kills.
 * Indexed directly by the kMemorystatus* kill-cause enum value, so the
 * order and count here must stay in sync with that enum — adding a new
 * cause without a matching entry here would read past the array.
 */
static const char *memorystatus_kill_cause_name[] = {
	"",                                             /* kMemorystatusInvalid							*/
	"jettisoned",                                   /* kMemorystatusKilled							*/
	"highwater",                                    /* kMemorystatusKilledHiwat						*/
	"vnode-limit",                                  /* kMemorystatusKilledVnodes					*/
	"vm-pageshortage",                              /* kMemorystatusKilledVMPageShortage			*/
	"proc-thrashing",                               /* kMemorystatusKilledProcThrashing				*/
	"fc-thrashing",                                 /* kMemorystatusKilledFCThrashing				*/
	"per-process-limit",                            /* kMemorystatusKilledPerProcessLimit			*/
	"disk-space-shortage",                          /* kMemorystatusKilledDiskSpaceShortage			*/
	"idle-exit",                                    /* kMemorystatusKilledIdleExit					*/
	"zone-map-exhaustion",                          /* kMemorystatusKilledZoneMapExhaustion			*/
	"vm-compressor-thrashing",                      /* kMemorystatusKilledVMCompressorThrashing		*/
	"vm-compressor-space-shortage",                 /* kMemorystatusKilledVMCompressorSpaceShortage	*/
	"low-swap",                                     /* kMemorystatusKilledLowSwap                   */
	"sustained-memory-pressure",                    /* kMemorystatusKilledSustainedPressure         */
	"vm-pageout-starvation",                        /* kMemorystatusKilledVMPageoutStarvation       */
};
122 
123 static const char *
memorystatus_priority_band_name(int32_t priority)124 memorystatus_priority_band_name(int32_t priority)
125 {
126 	switch (priority) {
127 	case JETSAM_PRIORITY_FOREGROUND:
128 		return "FOREGROUND";
129 	case JETSAM_PRIORITY_AUDIO_AND_ACCESSORY:
130 		return "AUDIO_AND_ACCESSORY";
131 	case JETSAM_PRIORITY_CONDUCTOR:
132 		return "CONDUCTOR";
133 	case JETSAM_PRIORITY_DRIVER_APPLE:
134 		return "DRIVER_APPLE";
135 	case JETSAM_PRIORITY_HOME:
136 		return "HOME";
137 	case JETSAM_PRIORITY_EXECUTIVE:
138 		return "EXECUTIVE";
139 	case JETSAM_PRIORITY_IMPORTANT:
140 		return "IMPORTANT";
141 	case JETSAM_PRIORITY_CRITICAL:
142 		return "CRITICAL";
143 	}
144 
145 	return "?";
146 }
147 
148 bool
is_reason_thrashing(unsigned cause)149 is_reason_thrashing(unsigned cause)
150 {
151 	switch (cause) {
152 	case kMemorystatusKilledFCThrashing:
153 	case kMemorystatusKilledVMCompressorThrashing:
154 	case kMemorystatusKilledVMCompressorSpaceShortage:
155 		return true;
156 	default:
157 		return false;
158 	}
159 }
160 
161 bool
is_reason_zone_map_exhaustion(unsigned cause)162 is_reason_zone_map_exhaustion(unsigned cause)
163 {
164 	return cause == kMemorystatusKilledZoneMapExhaustion;
165 }
166 
167 /*
168  * Returns the current zone map size and capacity to include in the jetsam snapshot.
169  * Defined in zalloc.c
170  */
171 extern void get_zone_map_size(uint64_t *current_size, uint64_t *capacity);
172 
173 /*
174  * Returns the name of the largest zone and its size to include in the jetsam snapshot.
175  * Defined in zalloc.c
176  */
177 extern void get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size);
178 
179 static int memstat_set_memlimits_locked(proc_t p, int32_t active_limit,
180     int32_t inactive_limit, memlimit_options_t options);
181 static bool _memstat_proc_is_active_locked(proc_t);
182 
183 static int memorystatus_highwater_enabled = 1;  /* Update the cached memlimit data. */
184 
185 /*
186  * Cache this proc's active limit as its current limit before writing it to
187  * the ledger. Returns whether the new limit should be written to the ledger.
188  */
189 static inline bool
_memstat_update_memlimit_locked(proc_t p,bool use_active)190 _memstat_update_memlimit_locked(proc_t p, bool use_active)
191 {
192 	bool ledger_needed = false;
193 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
194 
195 	/* Cache limit value */
196 	if (use_active && p->p_memstat_memlimit != p->p_memstat_memlimit_active) {
197 		p->p_memstat_memlimit = p->p_memstat_memlimit_active;
198 		ledger_needed = true;
199 	} else if (!use_active &&
200 	    p->p_memstat_memlimit != p->p_memstat_memlimit_inactive) {
201 		p->p_memstat_memlimit = p->p_memstat_memlimit_inactive;
202 		ledger_needed = true;
203 	}
204 
205 	/* Cache limit fatality */
206 	if (_memstat_proc_memlimit_is_fatal(p, use_active) &&
207 	    !_memstat_proc_cached_memlimit_is_fatal(p)) {
208 		p->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;
209 		ledger_needed = true;
210 	} else if (!_memstat_proc_memlimit_is_fatal(p, use_active) &&
211 	    _memstat_proc_cached_memlimit_is_fatal(p)) {
212 		p->p_memstat_state &= ~P_MEMSTAT_FATAL_MEMLIMIT;
213 		ledger_needed = true;
214 	}
215 
216 	return ledger_needed;
217 }
218 
/*
 * Write the process' current (cached) memlimit to the ledger for
 * enforcement.
 *
 * Holding the proc_list_lock while writing to the ledgers (where the task
 * lock is taken) can be problematic.  The proc list lock may optionally be
 * dropped and re-taken while writing limits to the ledger. (rdar://21394491)
 *
 * Returns 0 on success, ESRCH if a reference could not be taken on the proc
 * while dropping the lock, or a BSD errno translated from the Mach error
 * returned by the ledger write.
 */
static int
_memstat_write_memlimit_to_ledger_locked(proc_t p, bool is_active, bool drop_lock)
{
	kern_return_t kr;
	bool is_fatal = _memstat_proc_cached_memlimit_is_fatal(p);

	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);

#if MACH_ASSERT
	/*
	 * The cached limit and fatality bit must already agree with the
	 * active/inactive source fields (see _memstat_update_memlimit_locked).
	 */
	if (memorystatus_highwater_enabled) {
		if (is_active) {
			assert3u(is_fatal, ==, _memstat_proc_active_memlimit_is_fatal(p));
			assert3u(p->p_memstat_memlimit, ==, p->p_memstat_memlimit_active);
		} else {
			assert3u(is_fatal, ==, _memstat_proc_inactive_memlimit_is_fatal(p));
			assert3u(p->p_memstat_memlimit, ==, p->p_memstat_memlimit_inactive);
		}
	}
#endif /* MACH_ASSERT */

	if (drop_lock) {
		/* Hold a ref so the proc cannot exit while the lock is dropped. */
		if (proc_ref(p, true) != p) {
			memorystatus_log_error("Unable to take a reference on proc %s [%d]. "
			    "Cannot update memlimit", proc_best_name(p), proc_getpid(p));
			return ESRCH;
		}
		proc_list_unlock();
	}

	memorystatus_log_debug("memorystatus: new limit on pid %d (%dMB %s)\n",
	    proc_getpid(p), (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
	    (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"));

	/* A non-positive cached limit is passed to the ledger as -1 ("no limit"). */
	kr = task_set_phys_footprint_limit_internal(proc_task(p),
	    (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1,
	    NULL, is_active, is_fatal);

	if (drop_lock) {
		proc_list_lock();
		proc_rele(p);
	}

	if (kr != KERN_SUCCESS) {
		memorystatus_log_fault("memorystatus: error (%d) setting memlimit in "
		    "ledger for %s [%d]\n", kr, proc_best_name(p), proc_pid(p));
		return mach_to_bsd_errno(kr);
	}
	return 0;
}
275 
276 #pragma mark General Tunables
277 
278 #define MEMORYSTATUS_SMALL_MEMORY_THRESHOLD (3UL * (1UL << 30))
279 #define MEMORYSTATUS_MEDIUM_MEMORY_THRESHOLD (6UL * (1UL << 30))
280 
281 #define MEMORYSTATUS_CLEAR_THE_DECKS_OFFSET_PERCENTAGE 5UL
282 #define MEMORYSTATUS_BALLAST_OFFSET_PERCENTAGE 5UL
283 #define MEMORYSTATUS_AGGR_SYSPROC_AGING_PERCENTAGE 7UL
284 #define MEMORYSTATUS_DELTA_PERCENTAGE_LARGE 4UL
285 #define MEMORYSTATUS_DELTA_PERCENTAGE_SMALL 5UL
286 
287 /*
288  * Fall back to these percentages/ratios if a mb value is not provided via EDT
289  *  DRAM (GB) | critical | idle | pressure | freeze
290  *  (0,3]     | 5%       | 10%  | 15%      | 50%
291  *  (3,6]     | 4%       | 9%   | 15%      | 50%
292  *  (6,∞)     | 4%       | 8%   | 12%      | 50%
293  */
294 
295 #define MEMORYSTATUS_CRITICAL_PERCENTAGE_SMALL 5UL
296 #define MEMORYSTATUS_CRITICAL_PERCENTAGE_LARGE 4UL
297 
298 #define MEMORYSTATUS_IDLE_RATIO_NUM 2UL
299 #define MEMORYSTATUS_IDLE_RATIO_DENOM 1UL
300 #define MEMORYSTATUS_PRESSURE_RATIO_NUM 3UL
301 #define MEMORYSTATUS_PRESSURE_RATIO_DENOM 1UL
302 
303 /*
304  * For historical reasons, devices with "medium"-sized memory configs have a critical:idle:pressure ratio of
305  * 4:9:15. This ratio is preserved for these devices when a fixed-mb base value has not been provided by EDT/boot-arg;
306  * all other devices use a 1:2:3 ratio.
307  */
308 #define MEMORYSTATUS_IDLE_RATIO_NUM_MEDIUM 9UL
309 #define MEMORYSTATUS_IDLE_RATIO_DENOM_MEDIUM 4UL
310 #define MEMORYSTATUS_PRESSURE_RATIO_NUM_MEDIUM  15UL
311 #define MEMORYSTATUS_PRESSURE_RATIO_DENOM_MEDIUM  4UL
312 
313 static int32_t memorystatus_get_default_task_active_limit(proc_t p);
314 static int32_t memorystatus_get_default_task_inactive_limit(proc_t p);
315 
316 /*
317  * default jetsam snapshot support
318  */
319 memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot;
320 
321 #if CONFIG_FREEZE
322 memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot_freezer;
323 /*
324  * The size of the freezer snapshot is given by memorystatus_jetsam_snapshot_max / JETSAM_SNAPSHOT_FREEZER_MAX_FACTOR
325  * The freezer snapshot can be much smaller than the default snapshot
326  * because it only includes apps that have been killed and dasd consumes it every 30 minutes.
327  * Since the snapshots are always wired we don't want to overallocate too much.
328  */
329 #define JETSAM_SNAPSHOT_FREEZER_MAX_FACTOR 20
330 unsigned int memorystatus_jetsam_snapshot_freezer_max;
331 unsigned int memorystatus_jetsam_snapshot_freezer_size;
332 TUNABLE(bool, memorystatus_jetsam_use_freezer_snapshot, "kern.jetsam_user_freezer_snapshot", true);
333 
334 #define MEMORYSTATUS_FREEZE_THRESHOLD_PERCENTAGE 50UL
335 TUNABLE_DT(uint32_t, memorystatus_freeze_threshold_mb, "/defaults", "kern.memstat_freeze_mb",
336     "memorystatus_freeze_threshold_mb", 0, TUNABLE_DT_NONE);
337 #endif /* CONFIG_FREEZE */
338 
339 unsigned int memorystatus_jetsam_snapshot_count = 0;
340 unsigned int memorystatus_jetsam_snapshot_max = 0;
341 unsigned int memorystatus_jetsam_snapshot_size = 0;
342 uint64_t memorystatus_jetsam_snapshot_last_timestamp = 0;
343 uint64_t memorystatus_jetsam_snapshot_timeout = 0;
344 
345 #if DEVELOPMENT || DEBUG
346 /*
347  * On development and debug kernels, we allow one pid to take ownership
348  * of some memorystatus data structures for testing purposes (via memorystatus_control).
349  * If there's an owner, then only they may consume the jetsam snapshot & set freezer probabilities.
350  * This is used when testing these interface to avoid racing with other
351  * processes on the system that typically use them (namely OSAnalytics & dasd).
352  */
353 static pid_t memorystatus_testing_pid = 0;
354 SYSCTL_INT(_kern, OID_AUTO, memorystatus_testing_pid, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_testing_pid, 0, "");
355 #endif /* DEVELOPMENT || DEBUG */
356 
357 /*
358  * jetsam zprint snapshot data
359  */
360 #if JETSAM_ZPRINT_SNAPSHOT
361 static unsigned int        jzs_trigger_band = JETSAM_PRIORITY_FOREGROUND;
362 static mach_zone_name_t    *jzs_names = NULL;
363 static mach_zone_info_t    *jzs_info = NULL;
364 static int                *jzs_coalesce = NULL;
365 static unsigned int        jzs_zone_cnt = 0;
366 static mach_memory_info_t *jzs_meminfo = NULL;
367 static unsigned int        jzs_meminfo_cnt = 0;
368 static uint64_t            jzs_gencount = (uint64_t) -1ll;
369 
370 #if DEVELOPMENT || DEBUG
371 SYSCTL_UINT(_kern, OID_AUTO, jzs_trigger_band, CTLFLAG_RW | CTLFLAG_LOCKED, &jzs_trigger_band, 0, "Priority band threshold for taking jetsam zprint snapshot");
372 #endif /* DEVELOPMENT || DEBUG */
373 #endif /* JETSAM_ZPRINT_SNAPSHOT */
374 
375 
376 static void memorystatus_init_jetsam_snapshot_header(memorystatus_jetsam_snapshot_t *snapshot);
377 
378 /* General memorystatus stuff */
379 
380 /*
381  * Daemons: The actual idle deferred time for the daemon is based on
382  * the relaunch behavior of the daemon. The relaunch behavior determines
383  * the scaling factor applied to memorystatus_sysprocs_idle_delay_time. See
384  * kJetsamSysProcsIdleDelayTime* ratios defined in kern_memorystatus.c
385  *
386  * Apps: The apps are aged for memorystatus_apps_idle_delay_time factored
387  * by kJetsamAppsIdleDelayTimeRatio.
388  */
389 TUNABLE(uint64_t, memstat_idle_deferral_time_s, "memorystatus_idle_deferral_time_s", 10);
390 uint64_t memorystatus_sysprocs_idle_delay_time = 0;
391 uint64_t memorystatus_apps_idle_delay_time = 0;
392 /* 2GB devices support an entitlement for a higher app memory limit of "almost 2GB". */
393 static int32_t memorystatus_ios13extended_footprint_limit_mb = 1800;
394 
395 /* Some devices give entitled apps a higher memory limit */
396 TUNABLE_DT_WRITEABLE(int32_t, memorystatus_entitled_max_task_footprint_mb,
397     "/defaults", "kern.entitled_max_task_pmem",
398     "entitled_max_task_pmem", 0, TUNABLE_DT_NONE);
399 TUNABLE_DT_WRITEABLE(int32_t, memorystatus_entitled_dev_max_task_footprint_mb,
400     "/defaults", "kern.entitled_dev_max_task_pmem",
401     "entitled_dev_max_task_pmem", 0, TUNABLE_DT_NONE);
402 
403 #if __arm64__
404 #if DEVELOPMENT || DEBUG
405 SYSCTL_INT(_kern, OID_AUTO, ios13extended_footprint_limit_mb,
406     CTLFLAG_RD | CTLFLAG_LOCKED,
407     &memorystatus_ios13extended_footprint_limit_mb, 0, "");
408 SYSCTL_INT(_kern, OID_AUTO, entitled_max_task_pmem,
409     CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_KERN,
410     &memorystatus_entitled_max_task_footprint_mb, 0, "");
411 SYSCTL_INT(_kern, OID_AUTO, entitled_dev_max_task_pmem,
412     CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_KERN,
413     &memorystatus_entitled_dev_max_task_footprint_mb, 0, "");
414 #else /* !(DEVELOPMENT || DEBUG) */
415 SYSCTL_INT(_kern, OID_AUTO, entitled_max_task_pmem,
416     CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_KERN,
417     &memorystatus_entitled_max_task_footprint_mb, 0, "");
418 #endif /* DEVELOPMENT || DEBUG */
419 #endif /* __arm64__ */
420 
421 #pragma mark Logging
422 
423 os_log_t memorystatus_log_handle;
424 
425 TUNABLE_WRITEABLE(memorystatus_log_level_t, memorystatus_log_level, "memorystatus_log_level", MEMORYSTATUS_LOG_LEVEL_DEFAULT);
426 
427 #if DEBUG || DEVELOPMENT
428 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_log_level, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_log_level, MEMORYSTATUS_LOG_LEVEL_DEFAULT, "");
429 #endif
430 
431 #pragma mark Locks
432 
433 static LCK_GRP_DECLARE(memorystatus_lock_group, "memorystatus");
434 
435 /* Synchronizes jetsam pressure broadcasts */
436 LCK_MTX_DECLARE(memorystatus_jetsam_broadcast_lock, &memorystatus_lock_group);
437 
438 #if DEVELOPMENT || DEBUG
439 static LCK_MTX_DECLARE(disconnect_page_mappings_mutex, &memorystatus_lock_group);
440 #endif /* DEVELOPMENT || DEBUG */
441 
442 /* Idle guard handling */
443 
444 static void memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2);
445 static void _memstat_invalidate_idle_demotion_locked(proc_t p);
446 static void _memstat_schedule_idle_demotion_locked(proc_t p);
447 static void _memstat_reschedule_idle_demotion_locked(void);
448 int memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap);
449 vm_pressure_level_t convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);
450 boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t);
451 void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear);
452 void memorystatus_send_low_swap_note(void);
453 boolean_t memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, unsigned int band, int aggr_count,
454     uint32_t *errors, uint64_t *memory_reclaimed);
455 uint64_t memorystatus_available_memory_internal(proc_t p);
456 void memorystatus_thread_wake(void);
457 static bool _memstat_consider_waking_jetsam_thread(void);
458 #if CONFIG_JETSAM
459 static void memorystatus_thread_pool_default(void);
460 static void memorystatus_thread_pool_max(void);
461 #endif /* CONFIG_JETSAM */
462 
463 unsigned int memorystatus_level = 0;
464 static int memorystatus_list_count = 0;
465 memstat_bucket_t memstat_bucket[MEMSTAT_BUCKET_COUNT];
466 static thread_call_t memorystatus_idle_demotion_call;
467 uint64_t memstat_idle_demotion_deadline = 0;
468 #if CONFIG_FREEZE
469 unsigned int memorystatus_suspended_count = 0;
470 #endif /* CONFIG_FREEZE */
471 
472 #ifdef XNU_TARGET_OS_OSX
473 /*
474  * Effectively disable the system process and application demotion
475  * logic on macOS. This means system processes and apps won't get the
476  * 10 second protection before landing in the IDLE band after moving
477  * out of their active band. Reasons:-
478  * - daemons + extensions + apps on macOS don't behave the way they
479  *   do on iOS and so they are confusing the demotion logic. For example,
480  *   not all apps go from FG to IDLE. Some sit in higher bands instead. This
481  *   is causing multiple asserts to fire internally.
482  * - we use the aging bands to protect processes from jetsam. But on macOS,
483  *   we have a very limited jetsam that is only invoked under extreme conditions
484  *   where we have no more swap / compressor space OR are under critical pressure.
485  */
486 int system_procs_aging_band = 0;
487 int system_procs_aging_band_stuck = 0;
488 int applications_aging_band = 0;
489 #else /* XNU_TARGET_OS_OSX */
490 int system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
491 int system_procs_aging_band_stuck = JETSAM_PRIORITY_AGING_BAND1_STUCK;
492 int applications_aging_band = JETSAM_PRIORITY_AGING_BAND2;
493 #endif /* XNU_TARGET_OS_OSX */
494 
495 /* the jetsam band which will contain P_MEMSTAT_FROZEN processes */
496 #if CONFIG_FREEZE
497 int memorystatus_freeze_jetsam_band = JETSAM_PRIORITY_FREEZER;
498 #else /* CONFIG_FREEZE */
499 int memorystatus_freeze_jetsam_band = 0;
500 #endif /* CONFIG_FREEZE */
501 
502 _Atomic bool memorystatus_zone_map_is_exhausted = false;
503 _Atomic bool memorystatus_compressor_space_shortage = false;
504 _Atomic bool memorystatus_pageout_starved = false;
505 #if CONFIG_PHANTOM_CACHE
506 _Atomic bool memorystatus_phantom_cache_pressure = false;
507 #endif /* CONFIG_PHANTOM_CACHE */
508 
509 bool memorystatus_should_issue_fg_band_notify = true;
510 
511 extern void coalition_mark_swappable(coalition_t coal);
512 extern bool coalition_is_swappable(coalition_t coal);
513 boolean_t memorystatus_allowed_vm_map_fork(task_t, bool *);
514 #if DEVELOPMENT || DEBUG
515 void memorystatus_abort_vm_map_fork(task_t);
516 #endif
517 
518 SYSCTL_NODE(_kern, OID_AUTO, memorystatus,
519     CTLFLAG_RW | CTLFLAG_LOCKED, 0, "memorystatus subsystem");
520 
521 /*
522  * Idle delay timeout factors for daemons based on relaunch behavior. Only used in
523  * kJetsamAgingPolicySysProcsReclaimedFirst aging policy.
524  */
525 #define kJetsamSysProcsIdleDelayTimeLowRatio    (5)
526 #define kJetsamSysProcsIdleDelayTimeMedRatio    (2)
527 #define kJetsamSysProcsIdleDelayTimeHighRatio   (1)
528 
529 /*
530  * For the kJetsamAgingPolicySysProcsReclaimedFirst aging policy, treat apps as well
531  * behaved daemons for aging purposes.
532  */
533 #define kJetsamAppsIdleDelayTimeRatio   (kJetsamSysProcsIdleDelayTimeLowRatio)
534 
535 static uint64_t
memorystatus_sysprocs_idle_time(proc_t p)536 memorystatus_sysprocs_idle_time(proc_t p)
537 {
538 	uint64_t idle_delay_time = 0;
539 	/*
540 	 * For system processes, base the idle delay time on the
541 	 * jetsam relaunch behavior specified by launchd. The idea
542 	 * is to provide extra protection to the daemons which would
543 	 * relaunch immediately after jetsam.
544 	 */
545 	switch (p->p_memstat_relaunch_flags) {
546 	case P_MEMSTAT_RELAUNCH_UNKNOWN:
547 	case P_MEMSTAT_RELAUNCH_LOW:
548 		idle_delay_time = memorystatus_sysprocs_idle_delay_time / kJetsamSysProcsIdleDelayTimeLowRatio;
549 		break;
550 	case P_MEMSTAT_RELAUNCH_MED:
551 		idle_delay_time = memorystatus_sysprocs_idle_delay_time / kJetsamSysProcsIdleDelayTimeMedRatio;
552 		break;
553 	case P_MEMSTAT_RELAUNCH_HIGH:
554 		idle_delay_time = memorystatus_sysprocs_idle_delay_time / kJetsamSysProcsIdleDelayTimeHighRatio;
555 		break;
556 	default:
557 		panic("Unknown relaunch flags on process!");
558 		break;
559 	}
560 	return idle_delay_time;
561 }
562 
563 static uint64_t
memorystatus_apps_idle_time(__unused proc_t p)564 memorystatus_apps_idle_time(__unused proc_t p)
565 {
566 	return memorystatus_apps_idle_delay_time / kJetsamAppsIdleDelayTimeRatio;
567 }
568 
569 
570 static int
571 sysctl_jetsam_set_sysprocs_idle_delay_time SYSCTL_HANDLER_ARGS
572 {
573 #pragma unused(oidp, arg1, arg2)
574 
575 	int error = 0, val = 0, old_time_in_secs = 0;
576 	uint64_t old_time_in_ns = 0;
577 
578 	absolutetime_to_nanoseconds(memorystatus_sysprocs_idle_delay_time, &old_time_in_ns);
579 	old_time_in_secs = (int) (old_time_in_ns / NSEC_PER_SEC);
580 
581 	error = sysctl_io_number(req, old_time_in_secs, sizeof(int), &val, NULL);
582 	if (error || !req->newptr) {
583 		return error;
584 	}
585 
586 	if ((val < 0) || (val > INT32_MAX)) {
587 		memorystatus_log_error("jetsam: new idle delay interval has invalid value.\n");
588 		return EINVAL;
589 	}
590 
591 	nanoseconds_to_absolutetime((uint64_t)val * NSEC_PER_SEC, &memorystatus_sysprocs_idle_delay_time);
592 
593 	return 0;
594 }
595 
596 SYSCTL_PROC(_kern_memorystatus, OID_AUTO, sysprocs_idle_delay_time_ns, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
597     0, 0, sysctl_jetsam_set_sysprocs_idle_delay_time, "I", "Aging window for system processes");
598 
599 
600 static int
601 sysctl_jetsam_set_apps_idle_delay_time SYSCTL_HANDLER_ARGS
602 {
603 #pragma unused(oidp, arg1, arg2)
604 
605 	int error = 0, val = 0, old_time_in_secs = 0;
606 	uint64_t old_time_in_ns = 0;
607 
608 	absolutetime_to_nanoseconds(memorystatus_apps_idle_delay_time, &old_time_in_ns);
609 	old_time_in_secs = (int) (old_time_in_ns / NSEC_PER_SEC);
610 
611 	error = sysctl_io_number(req, old_time_in_secs, sizeof(int), &val, NULL);
612 	if (error || !req->newptr) {
613 		return error;
614 	}
615 
616 	if ((val < 0) || (val > INT32_MAX)) {
617 		memorystatus_log_error("jetsam: new idle delay interval has invalid value.\n");
618 		return EINVAL;
619 	}
620 
621 	nanoseconds_to_absolutetime((uint64_t)val * NSEC_PER_SEC, &memorystatus_apps_idle_delay_time);
622 
623 	return 0;
624 }
625 
626 SYSCTL_PROC(_kern_memorystatus, OID_AUTO, apps_idle_delay_time_ns, CTLTYPE_INT | CTLFLAG_RW,
627     0, 0, sysctl_jetsam_set_apps_idle_delay_time, "I", "Aging window for applications");
628 
629 SYSCTL_INT(_kern, OID_AUTO, max_task_pmem, CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_KERN, &max_task_footprint_mb, 0, "");
630 
631 #if __arm64__
632 int legacy_footprint_bonus_mb = 50; /* This value was chosen after looking at the top 30 apps
633                                      * that needed the additional room in their footprint when
634                                      * the 'correct' accounting methods were applied to them.
635                                      */
636 
637 #if DEVELOPMENT || DEBUG
638 SYSCTL_INT(_kern, OID_AUTO, legacy_footprint_bonus_mb, CTLFLAG_RW | CTLFLAG_LOCKED, &legacy_footprint_bonus_mb, 0, "");
639 #endif /* DEVELOPMENT || DEBUG */
640 /*
641  * Raise the inactive and active memory limits to new values.
642  * Will only raise the limits and will do nothing if either of the current
643  * limits are 0.
644  * Caller must hold the proc_list_lock
645  */
646 static void
memorystatus_raise_memlimit_locked(proc_t p,int new_memlimit_active,int new_memlimit_inactive)647 memorystatus_raise_memlimit_locked(proc_t p,
648     int new_memlimit_active,
649     int new_memlimit_inactive)
650 {
651 	int memlimit_mb_active = 0, memlimit_mb_inactive = 0;
652 	memlimit_options_t memlimit_options = MEMLIMIT_OPTIONS_NONE;
653 
654 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
655 
656 	if (p->p_memstat_memlimit_active > 0) {
657 		memlimit_mb_active = p->p_memstat_memlimit_active;
658 	} else if (p->p_memstat_memlimit_active == -1) {
659 		memlimit_mb_active = max_task_footprint_mb;
660 	} else {
661 		/*
662 		 * Nothing to do for '0' which is
663 		 * a special value only used internally
664 		 * to test 'no limits'.
665 		 */
666 		return;
667 	}
668 
669 	if (p->p_memstat_memlimit_inactive > 0) {
670 		memlimit_mb_inactive = p->p_memstat_memlimit_inactive;
671 	} else if (p->p_memstat_memlimit_inactive == -1) {
672 		memlimit_mb_inactive = max_task_footprint_mb;
673 	} else {
674 		/*
675 		 * Nothing to do for '0' which is
676 		 * a special value only used internally
677 		 * to test 'no limits'.
678 		 */
679 		return;
680 	}
681 
682 	memlimit_mb_active = MAX(new_memlimit_active, memlimit_mb_active);
683 	memlimit_mb_inactive = MAX(new_memlimit_inactive, memlimit_mb_inactive);
684 
685 	/* Maintain pre-existing limit fatality */
686 	if (_memstat_proc_active_memlimit_is_fatal(p)) {
687 		memlimit_options |= MEMLIMIT_ACTIVE_FATAL;
688 	}
689 	if (_memstat_proc_inactive_memlimit_is_fatal(p)) {
690 		memlimit_options |= MEMLIMIT_INACTIVE_FATAL;
691 	}
692 
693 	memstat_set_memlimits_locked(p, memlimit_mb_active,
694 	    memlimit_mb_inactive, memlimit_options);
695 }
696 
697 void
memorystatus_act_on_legacy_footprint_entitlement(proc_t p,boolean_t footprint_increase)698 memorystatus_act_on_legacy_footprint_entitlement(proc_t p, boolean_t footprint_increase)
699 {
700 	int memlimit_mb_active = 0, memlimit_mb_inactive = 0;
701 
702 	if (p == NULL) {
703 		return;
704 	}
705 
706 	proc_list_lock();
707 
708 	if (p->p_memstat_memlimit_active > 0) {
709 		memlimit_mb_active = p->p_memstat_memlimit_active;
710 	} else if (p->p_memstat_memlimit_active == -1) {
711 		memlimit_mb_active = max_task_footprint_mb;
712 	} else {
713 		/*
714 		 * Nothing to do for '0' which is
715 		 * a special value only used internally
716 		 * to test 'no limits'.
717 		 */
718 		proc_list_unlock();
719 		return;
720 	}
721 
722 	if (p->p_memstat_memlimit_inactive > 0) {
723 		memlimit_mb_inactive = p->p_memstat_memlimit_inactive;
724 	} else if (p->p_memstat_memlimit_inactive == -1) {
725 		memlimit_mb_inactive = max_task_footprint_mb;
726 	} else {
727 		/*
728 		 * Nothing to do for '0' which is
729 		 * a special value only used internally
730 		 * to test 'no limits'.
731 		 */
732 		proc_list_unlock();
733 		return;
734 	}
735 
736 	if (footprint_increase) {
737 		memlimit_mb_active += legacy_footprint_bonus_mb;
738 		memlimit_mb_inactive += legacy_footprint_bonus_mb;
739 	} else {
740 		memlimit_mb_active -= legacy_footprint_bonus_mb;
741 		if (memlimit_mb_active == max_task_footprint_mb) {
742 			memlimit_mb_active = -1; /* reverting back to default system limit */
743 		}
744 
745 		memlimit_mb_inactive -= legacy_footprint_bonus_mb;
746 		if (memlimit_mb_inactive == max_task_footprint_mb) {
747 			memlimit_mb_inactive = -1; /* reverting back to default system limit */
748 		}
749 	}
750 	memorystatus_raise_memlimit_locked(p, memlimit_mb_active, memlimit_mb_inactive);
751 
752 	proc_list_unlock();
753 }
754 
755 void
memorystatus_act_on_ios13extended_footprint_entitlement(proc_t p)756 memorystatus_act_on_ios13extended_footprint_entitlement(proc_t p)
757 {
758 	proc_list_lock();
759 	memorystatus_raise_memlimit_locked(p,
760 	    memorystatus_ios13extended_footprint_limit_mb,
761 	    memorystatus_ios13extended_footprint_limit_mb);
762 	proc_list_unlock();
763 }
764 
765 void
memorystatus_act_on_entitled_task_limit(proc_t p)766 memorystatus_act_on_entitled_task_limit(proc_t p)
767 {
768 	if (memorystatus_entitled_max_task_footprint_mb == 0) {
769 		// Entitlement is not supported on this device.
770 		return;
771 	}
772 	proc_list_lock();
773 	memorystatus_raise_memlimit_locked(p,
774 	    memorystatus_entitled_max_task_footprint_mb,
775 	    memorystatus_entitled_max_task_footprint_mb);
776 	proc_list_unlock();
777 }
778 
779 void
memorystatus_act_on_entitled_developer_task_limit(proc_t p)780 memorystatus_act_on_entitled_developer_task_limit(proc_t p)
781 {
782 	if (memorystatus_entitled_dev_max_task_footprint_mb == 0) {
783 		// Entitlement not supported on this device
784 		return;
785 	}
786 	memorystatus_log("memorystatus: WARNING %s [%d] is receiving an entitled "
787 	    "debugging memory limit. This is intended only for debugging and "
788 	    "can result in unstable device behavior.",
789 	    proc_best_name(p), proc_getpid(p));
790 	proc_list_lock();
791 	memorystatus_raise_memlimit_locked(p,
792 	    memorystatus_entitled_dev_max_task_footprint_mb,
793 	    memorystatus_entitled_dev_max_task_footprint_mb);
794 	proc_list_unlock();
795 }
796 
797 #endif /* __arm64__ */
798 
799 SYSCTL_INT(_kern, OID_AUTO, memorystatus_level, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_level, 0, "");
800 
801 int
memorystatus_get_level(__unused struct proc * p,struct memorystatus_get_level_args * args,__unused int * ret)802 memorystatus_get_level(__unused struct proc *p, struct memorystatus_get_level_args *args, __unused int *ret)
803 {
804 	user_addr_t     level = 0;
805 
806 	level = args->level;
807 
808 	if (copyout(&memorystatus_level, level, sizeof(memorystatus_level)) != 0) {
809 		return EFAULT;
810 	}
811 
812 	return 0;
813 }
814 
815 static void memorystatus_thread(void *param __unused, wait_result_t wr __unused);
816 
817 /* Memory Limits */
818 
819 static boolean_t memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason);
820 static boolean_t memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason);
821 
822 
823 static int memorystatus_cmd_set_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);
824 
825 #if DEBUG || DEVELOPMENT
826 static int memorystatus_cmd_set_diag_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);
827 static int memorystatus_cmd_get_diag_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);
828 static int memorystatus_set_diag_memlimit_properties_internal(proc_t p, memorystatus_diag_memlimit_properties_t *p_entry);
829 static int memorystatus_get_diag_memlimit_properties_internal(proc_t p, memorystatus_diag_memlimit_properties_t *p_entry);
830 #endif  // DEBUG || DEVELOPMENT
831 static int memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties_t *entry);
832 
833 static int memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);
834 
835 static int memorystatus_cmd_get_memlimit_excess_np(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);
836 
837 static void memorystatus_get_memlimit_properties_internal(proc_t p, memorystatus_memlimit_properties_t *p_entry);
838 
839 int proc_get_memstat_priority(proc_t, boolean_t);
840 
841 static boolean_t memorystatus_idle_snapshot = 0;
842 
843 unsigned int memorystatus_delta = 0;
844 
845 /* Jetsam Loop Detection */
846 boolean_t memorystatus_jld_enabled = FALSE;              /* Enable jetsam loop detection */
847 uint32_t memorystatus_jld_eval_period_msecs = 0;         /* Init pass sets this based on device memory size */
848 int      memorystatus_jld_max_kill_loops = 2;            /* How many times should we try and kill up to the target band */
849 
850 /*
851  * A FG app can request that the aggressive jetsam mechanism display some leniency in the FG band. This 'lenient' mode is described as:
852  * --- if aggressive jetsam kills an app in the FG band and gets back >=AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD memory, it will stop the aggressive march further into and up the jetsam bands.
853  *
854  * RESTRICTIONS:
855  * - Such a request is respected/acknowledged only once while that 'requesting' app is in the FG band i.e. if aggressive jetsam was
856  * needed and the 'lenient' mode was deployed then that's it for this special mode while the app is in the FG band.
857  *
858  * - If the app is still in the FG band and aggressive jetsam is needed again, there will be no stop-and-check the next time around.
859  *
860  * - Also, the transition of the 'requesting' app away from the FG band will void this special behavior.
861  */
862 
863 #define AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD        25
864 boolean_t       memorystatus_aggressive_jetsam_lenient_allowed = FALSE;
865 boolean_t       memorystatus_aggressive_jetsam_lenient = FALSE;
866 
867 #if DEVELOPMENT || DEBUG
868 /*
869  * Jetsam Loop Detection tunables.
870  */
871 
872 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_eval_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_jld_eval_period_msecs, 0, "");
873 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_max_kill_loops, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_jld_max_kill_loops, 0, "");
874 #endif /* DEVELOPMENT || DEBUG */
875 
876 /*
877  * snapshot support for memstats collected at boot.
878  */
879 static memorystatus_jetsam_snapshot_t memorystatus_at_boot_snapshot;
880 
881 static void memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snapshot, uint32_t ods_list_count);
882 static boolean_t memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry, uint64_t gencount);
883 static void memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime);
884 
885 static void memorystatus_clear_errors(void);
886 
887 static void memorystatus_get_task_phys_footprint_page_counts(task_t task,
888     uint64_t *internal_pages, uint64_t *internal_compressed_pages,
889     uint64_t *purgeable_nonvolatile_pages, uint64_t *purgeable_nonvolatile_compressed_pages,
890     uint64_t *alternate_accounting_pages, uint64_t *alternate_accounting_compressed_pages,
891     uint64_t *iokit_mapped_pages, uint64_t *page_table_pages, uint64_t *frozen_to_swap_pages,
892     uint64_t *neural_nofootprint_total_pages);
893 
894 static void memorystatus_get_task_memory_region_count(task_t task, uint64_t *count);
895 
896 static uint32_t memorystatus_build_state(proc_t p);
897 //static boolean_t memorystatus_issue_pressure_kevent(boolean_t pressured);
898 
899 static bool memorystatus_kill_top_process(bool any, bool sort_flag, uint32_t cause, os_reason_t jetsam_reason,
900     int32_t max_priority, bool only_swappable,
901     int32_t *priority, uint32_t *errors, uint64_t *memory_reclaimed);
902 static bool memorystatus_kill_processes_aggressive(uint32_t cause, int aggr_count, int32_t priority_max, int32_t max_kills, uint32_t *errors, uint64_t *memory_reclaimed);
903 static boolean_t memorystatus_kill_hiwat_proc(uint32_t *errors, boolean_t *purged, uint64_t *memory_reclaimed);
904 
905 /* Priority Band Sorting Routines */
906 static int  memorystatus_sort_bucket(unsigned int bucket_index, int sort_order);
907 static int  memorystatus_sort_by_largest_coalition_locked(unsigned int bucket_index, int coal_sort_order);
908 static void memorystatus_sort_by_largest_process_locked(unsigned int bucket_index);
909 static int  memorystatus_move_list_locked(unsigned int bucket_index, pid_t *pid_list, int list_sz);
910 
911 /* qsort routines */
912 typedef int (*cmpfunc_t)(const void *a, const void *b);
913 extern void qsort(void *a, size_t n, size_t es, cmpfunc_t cmp);
914 static int memstat_asc_cmp(const void *a, const void *b);
915 
916 /* VM pressure */
917 
918 #if CONFIG_SECLUDED_MEMORY
919 extern unsigned int     vm_page_secluded_count;
920 extern unsigned int     vm_page_secluded_count_over_target;
921 #endif /* CONFIG_SECLUDED_MEMORY */
922 
923 /* Aggressive jetsam pages threshold for sysproc aging policy */
924 unsigned int memorystatus_sysproc_aging_aggr_pages = 0;
925 
926 uint32_t memorystatus_available_pages = UINT32_MAX;
927 
928 __options_closed_decl(memorystatus_policy_t, uint8_t, {
929 	kPolicyDefault        = 0x00,
930 	kPolicyClearTheDecks  = 0x01,
931 	kPolicyBallastDrain   = 0x02,
932 });
933 
934 static memorystatus_policy_t memstat_policy_config = kPolicyDefault;
935 
936 #define MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX ((uint32_t)(atop_64(max_mem) / 2))
937 
938 /*
939  * Jetsam Page Shortage Thresholds (PSTs):
940  *  - critical: jetsam above the idle band
941  *  - idle: jetsam in the idle band
942  *  - pressure: jetsam soft memory limit violators
943  *  - ballast: offset applied to non-critical thresholds upon request
944  *    from userspace
945  *  - ctd (clear-the-decks): offset applied to non-critical thresholds upon request
946  *    from userspace
947  */
948 uint32_t memstat_critical_threshold = 0;
949 uint32_t memstat_idle_threshold = 0;
950 uint32_t memstat_soft_threshold = 0;
951 uint32_t memstat_ballast_offset = 0;
952 uint32_t memstat_ctd_offset = 0;
953 
954 /*
955  * NB: These MiB thresholds are only read at boot and may become out of sync
956  * with the PSTs above.
957  */
958 TUNABLE_DT(uint32_t, memorystatus_critical_threshold_mb, "/defaults",
959     "kern.memstat_critical_mb", "memorystatus_critical_threshold_mb", 0, TUNABLE_DT_NONE);
960 TUNABLE_DT(uint32_t, memorystatus_idle_threshold_mb, "/defaults",
961     "kern.memstat_idle_mb", "memorystatus_idle_threshold_mb", 0, TUNABLE_DT_NONE);
962 TUNABLE_DT(uint32_t, memorystatus_pressure_threshold_mb, "/defaults",
963     "kern.memstat_pressure_mb", "memorystatus_pressure_threshold_mb", 0, TUNABLE_DT_NONE);
TUNABLE_DT(uint32_t, memstat_ballast_offset_mb, "/defaults",
    "kern.memstat_ballast_mb", "memstat_ballast_offset_mb", 0, TUNABLE_DT_NONE);
/*
 * NOTE(review): this tunable's boot-arg name ("memstat_ballast_offset_mb")
 * is identical to the ballast tunable's boot-arg directly above, so a single
 * boot-arg would initialize both offsets. If that aliasing is unintentional
 * (copy-paste), this should read "memstat_ctd_offset_mb" -- confirm.
 */
TUNABLE(uint32_t, memstat_ctd_offset_mb, "memstat_ballast_offset_mb", 0);
967 
968 #if CONFIG_JETSAM
969 TUNABLE_DT_WRITEABLE(unsigned int, memorystatus_swap_all_apps, "/defaults", "kern.swap_all_apps", "kern.swap_all_apps", false, TUNABLE_DT_NONE);
970 /* Will compact the early swapin queue if there are >= this many csegs on it. */
971 static unsigned int memorystatus_swapin_trigger_segments = 10;
972 unsigned int memorystatus_swapin_trigger_pages = 0;
973 
974 #if DEVELOPMENT || DEBUG
975 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_available_pages, 0, "");
976 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_swapin_trigger_pages, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_swapin_trigger_pages, 0, "");
977 #else
978 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages, CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, &memorystatus_available_pages, 0, "");
979 #endif /* DEVELOPMENT || DEBUG */
980 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_swap_all_apps, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_swap_all_apps, 0, "");
981 
982 static int memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __unused int32_t *retval, boolean_t is_fatal_limit);
983 
984 int32_t max_kill_priority = JETSAM_PRIORITY_MAX;
985 
986 proc_name_t memorystatus_jetsam_proc_name_panic; /* Panic when we are about to jetsam this process. */
987 uint32_t    memorystatus_jetsam_proc_cause_panic = 0; /* If specified, panic only when we are about to jetsam the process above for this cause. */
988 uint32_t    memorystatus_jetsam_proc_size_panic = 0; /* If specified, panic only when we are about to jetsam the process above and its footprint is more than this in MB. */
989 
990 /* If set, kill swappable processes when we're low on swap space. Currently off until we can allocate more swap space (rdar://87800902) */
991 uint32_t jetsam_kill_on_low_swap = 0;
992 
993 /*
994  * Global switch for enabling fast jetsam. Fast jetsam is
995  * hooked up via the system_override() system call. When
996  * enabled, the following features can be toggled:
997  * - clear-the-decks jetsam
998  * - ballast-drain jetsam
999  */
1000 TUNABLE_WRITEABLE(bool, fast_jetsam_enabled, "fast_jetsam_enabled", true);
1001 
1002 #else /* CONFIG_JETSAM */
1003 int32_t max_kill_priority = JETSAM_PRIORITY_IDLE;
1004 #endif /* CONFIG_JETSAM */
1005 
1006 #if DEVELOPMENT || DEBUG
1007 extern bool kill_on_no_paging_space;
1008 #endif /* DEVELOPMENT || DEBUG */
1009 
1010 #if DEVELOPMENT || DEBUG
/*
 * Convert a byte count to MiB. Despite the name, this computes a *ceiling*
 * (rounds up), not round-to-nearest; the name is kept for existing callers.
 *
 * Written as shift-plus-carry rather than `(in + ((1 << 20) - 1)) >> 20`
 * because the latter wraps (unsigned overflow) for inputs within 1 MiB of
 * UINT32_MAX and then returns a bogus small value.
 */
static inline uint32_t
roundToNearestMB(uint32_t in)
{
	return (in >> 20) + ((in & ((1u << 20) - 1)) != 0);
}
1016 
1017 static int memorystatus_cmd_increase_jetsam_task_limit(pid_t pid, uint32_t byte_increase);
1018 #endif
1019 
1020 #if __arm64__
1021 extern int legacy_footprint_entitlement_mode;
1022 #endif /* __arm64__ */
1023 
1024 /* Debug */
1025 
1026 extern struct knote *vm_find_knote_from_pid(pid_t, struct klist *);
1027 
1028 #if DEVELOPMENT || DEBUG
1029 
1030 static unsigned int memorystatus_debug_dump_this_bucket = 0;
1031 
/*
 * Dump one jetsam priority bucket (or all of them) to the debug log, one
 * line per process, showing footprint, state flags, priorities, dirty
 * state, idle deadline, and the various memory limits.
 *
 * bucket_index >= MEMSTAT_BUCKET_COUNT means "dump every bucket, starting
 * from band 0". Caller must hold the proc list lock.
 */
static void
memorystatus_debug_dump_bucket_locked(unsigned int bucket_index)
{
	proc_t p = NULL;
	uint64_t bytes = 0;
	int ledger_limit = 0;
	unsigned int b = bucket_index;
	boolean_t traverse_all_buckets = FALSE;

	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
		traverse_all_buckets = TRUE;
		b = 0;
	} else {
		traverse_all_buckets = FALSE;
		b = bucket_index;
	}

	/*
	 * footprint reported in [pages / MB ]
	 * limits reported as:
	 *      L-limit  proc's Ledger limit
	 *      C-limit  proc's Cached limit, should match Ledger
	 *      A-limit  proc's Active limit
	 *     IA-limit  proc's Inactive limit
	 *	F==Fatal,  NF==NonFatal
	 */

	memorystatus_log_debug("memorystatus_debug_dump ***START*(PAGE_SIZE_64=%llu)**\n", PAGE_SIZE_64);
	memorystatus_log_debug("bucket [pid]       [pages / MB]     [state]      [EP / RP / AP]   dirty     deadline [L-limit / C-limit / A-limit / IA-limit] name\n");
	p = memorystatus_get_first_proc_locked(&b, traverse_all_buckets);
	while (p) {
		/* Footprint and the enforced ledger limit live on the task. */
		bytes = get_task_phys_footprint(proc_task(p));
		task_get_phys_footprint_limit(proc_task(p), &ledger_limit);
		memorystatus_log_debug("%2d     [%5d]     [%5lld /%3lldMB]   0x%-8x   [%2d / %2d / %2d]   0x%-3x   %10lld    [%3d / %3d%s / %3d%s / %3d%s]   %s\n",
		    b, proc_getpid(p),
		    (bytes / PAGE_SIZE_64),             /* task's footprint converted from bytes to pages     */
		    (bytes / (1024ULL * 1024ULL)),      /* task's footprint converted from bytes to MB */
		    p->p_memstat_state, p->p_memstat_effectivepriority, p->p_memstat_requestedpriority, p->p_memstat_assertionpriority,
		    p->p_memstat_dirty, p->p_memstat_idledeadline,
		    ledger_limit,
		    p->p_memstat_memlimit,
		    (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"),
		    p->p_memstat_memlimit_active,
		    (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL ? "F " : "NF"),
		    p->p_memstat_memlimit_inactive,
		    (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL ? "F " : "NF"),
		    (*p->p_name ? p->p_name : "unknown"));
		p = memorystatus_get_next_proc_locked(&b, p, traverse_all_buckets);
	}
	memorystatus_log_debug("memorystatus_debug_dump ***END***\n");
}
1083 
1084 static int
1085 sysctl_memorystatus_debug_dump_bucket SYSCTL_HANDLER_ARGS
1086 {
1087 #pragma unused(oidp, arg2)
1088 	int bucket_index = 0;
1089 	int error;
1090 	error = SYSCTL_OUT(req, arg1, sizeof(int));
1091 	if (error || !req->newptr) {
1092 		return error;
1093 	}
1094 	error = SYSCTL_IN(req, &bucket_index, sizeof(int));
1095 	if (error || !req->newptr) {
1096 		return error;
1097 	}
1098 	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
1099 		/*
1100 		 * All jetsam buckets will be dumped.
1101 		 */
1102 	} else {
1103 		/*
1104 		 * Only a single bucket will be dumped.
1105 		 */
1106 	}
1107 
1108 	proc_list_lock();
1109 	memorystatus_debug_dump_bucket_locked(bucket_index);
1110 	proc_list_unlock();
1111 	memorystatus_debug_dump_this_bucket = bucket_index;
1112 	return error;
1113 }
1114 
1115 /*
1116  * Debug aid to look at jetsam buckets and proc jetsam fields.
1117  *	Use this sysctl to act on a particular jetsam bucket.
1118  *	Writing the sysctl triggers the dump.
1119  *      Usage: sysctl kern.memorystatus_debug_dump_this_bucket=<bucket_index>
1120  */
1121 
1122 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_debug_dump_this_bucket, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_debug_dump_this_bucket, 0, sysctl_memorystatus_debug_dump_bucket, "I", "");
1123 
1124 
1125 /* Debug aid to aid determination of limit */
1126 
/*
 * sysctl handler for kern.memorystatus_highwater_enabled (debug aid).
 *
 * Writing 0/1 toggles enforcement of per-process high-water memory limits
 * and walks the entire jetsam proc list under the proc list lock:
 *  - enable:  recompute each proc's cached limit from its stored
 *             active/inactive variant;
 *  - disable: reset each proc's cached limit to the system default (-1)
 *             and mark it fatal (the stored variants are left untouched).
 * In both cases the resulting cached limit is written to the task ledger.
 */
static int
sysctl_memorystatus_highwater_enable SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	proc_t p;
	unsigned int b = 0;
	int error, enable = 0;
	bool use_active;   /* use the active limit and active limit attributes */

	error = SYSCTL_OUT(req, arg1, sizeof(int));
	if (error || !req->newptr) {
		return error;
	}

	error = SYSCTL_IN(req, &enable, sizeof(int));
	if (error || !req->newptr) {
		return error;
	}

	/* Only exact 0 or 1 are accepted. */
	if (!(enable == 0 || enable == 1)) {
		return EINVAL;
	}

	proc_list_lock();

	memorystatus_highwater_enabled = enable;

	p = memorystatus_get_first_proc_locked(&b, TRUE);
	while (p) {
		use_active = _memstat_proc_is_active_locked(p);

		if (enable) {
			(void)_memstat_update_memlimit_locked(p, use_active);
		} else {
			/*
			 * Disabling limits does not touch the stored variants.
			 * Set the cached limit fields to system_wide defaults.
			 */
			p->p_memstat_memlimit = -1;
			p->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;
		}

		/*
		 * Enforce the cached limit by writing to the ledger.
		 */
		_memstat_write_memlimit_to_ledger_locked(p, use_active, false);

		p = memorystatus_get_next_proc_locked(&b, p, TRUE);
	}


	proc_list_unlock();

	return 0;
}
1182 
1183 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_highwater_enabled, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_highwater_enabled, 0, sysctl_memorystatus_highwater_enable, "I", "");
1184 
1185 SYSCTL_INT(_kern, OID_AUTO, memorystatus_idle_snapshot, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_idle_snapshot, 0, "");
1186 
1187 #endif /* DEVELOPMENT || DEBUG */
1188 
1189 #if CONFIG_JETSAM
1190 #if DEVELOPMENT || DEBUG
/*
 * Read-only sysctl handler reporting a jetsam page-shortage threshold.
 * arg1 identifies which threshold variable the OID was registered with;
 * the reported value comes from the corresponding
 * memorystatus_get_*_threshold() accessor rather than from the raw
 * variable directly (the accessors are authoritative for the effective
 * value). Unknown arg1 values return EINVAL.
 */
static int
memstat_page_shortage_threshold_sysctl_handler SYSCTL_HANDLER_ARGS
{
	uint32_t threshold;
	if (arg1 == &memstat_idle_threshold) {
		threshold = memorystatus_get_idle_exit_page_shortage_threshold();
	} else if (arg1 == &memstat_soft_threshold) {
		threshold = memorystatus_get_soft_memlimit_page_shortage_threshold();
	} else if (arg1 == &memstat_critical_threshold) {
		threshold = memorystatus_get_critical_page_shortage_threshold();
	} else {
		return EINVAL;
	}
	/* Read-only: report by value (arg1 == NULL to sysctl_handle_int). */
	return sysctl_handle_int(oidp, NULL, threshold, req);
}
1206 
1207 SYSCTL_PROC(_kern_memorystatus, OID_AUTO, available_pages_critical,
1208     CTLFLAG_RD | CTLFLAG_LOCKED, &memstat_critical_threshold, 0,
1209     memstat_page_shortage_threshold_sysctl_handler, "IU",
1210     "");
1211 SYSCTL_PROC(_kern_memorystatus, OID_AUTO, available_pages_idle,
1212     CTLFLAG_RD | CTLFLAG_LOCKED, &memstat_idle_threshold, 0,
1213     memstat_page_shortage_threshold_sysctl_handler, "IU",
1214     "");
1215 SYSCTL_PROC(_kern_memorystatus, OID_AUTO, available_pages_soft,
1216     CTLFLAG_RD | CTLFLAG_LOCKED, &memstat_soft_threshold, 0,
1217     memstat_page_shortage_threshold_sysctl_handler, "IU",
1218     "");
1219 
1220 SYSCTL_INT(_kern_memorystatus, OID_AUTO, ballast_offset_pages,
1221     CTLFLAG_RD | CTLFLAG_LOCKED,
1222     &memstat_ballast_offset, 0, "");
1223 SYSCTL_INT(_kern_memorystatus, OID_AUTO, ctd_offset_pages,
1224     CTLFLAG_RD | CTLFLAG_LOCKED,
1225     &memstat_ctd_offset, 0, "");
1226 #endif /* DEVELOPMENT || DEBUG */
1227 
/*
 * Read/write sysctl handler backing the kern.memorystatus.*_mb experiment
 * factors. The backing variables (arg1) store thresholds in *pages*; this
 * handler presents them to userspace in MiB, converting on both read and
 * write. Writes are validated: the MiB value must not overflow uint32_t
 * when converted to bytes, the resulting page count must not exceed
 * MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX (half of max_mem), and the three
 * primary thresholds (soft/idle/critical) may not be zero -- the
 * ballast/ctd offsets may. Unknown arg1 values return EINVAL.
 */
static int
memstat_page_shortage_threshold_experiment_handler SYSCTL_HANDLER_ARGS
{
	uint32_t threshold_mb;
	int error;

	assert3p(arg1, !=, NULL);
	/* pages -> bytes -> MiB for presentation to userspace */
	threshold_mb = ptoa_32(os_atomic_load((uint32_t *)arg1, relaxed)) >> 20;

	error = sysctl_handle_int(oidp, &threshold_mb, 0, req);
	if (error || !req->newptr) {
		return error;
	}

	if (threshold_mb > UINT32_MAX >> 20) {
		/* Converting to bytes would overflow */
		return EINVAL;
	}

	uint32_t new_threshold_pages = atop_32(threshold_mb << 20);
	/*
	 * Page shortage thresholds may not exceed 1/2 max_mem
	 */
	if (new_threshold_pages > MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX) {
		return EINVAL;
	}
	/* Zero is only legal for the ballast/clear-the-decks offsets. */
	if ((arg1 == &memstat_soft_threshold ||
	    arg1 == &memstat_idle_threshold ||
	    arg1 == &memstat_critical_threshold) &&
	    new_threshold_pages == 0) {
		return EINVAL;
	}

	/* Log which threshold is changing; reject unrecognized targets. */
	if (arg1 == &memstat_soft_threshold) {
		memorystatus_log("memorystatus: setting soft memory limit "
		    "page shortage threshold to %u MiB\n", threshold_mb);
	} else if (arg1 == &memstat_idle_threshold) {
		memorystatus_log("memorystatus: setting idle exit page "
		    "shortage threshold to %u MiB\n", threshold_mb);
	} else if (arg1 == &memstat_critical_threshold) {
		memorystatus_log("memorystatus: setting critical page shortage"
		    " threshold to %u MiB\n", threshold_mb);
	} else if (arg1 == &memstat_ctd_offset) {
		memorystatus_log("memorystatus: setting clear-the-decks page shortage"
		    " offset to %u MiB\n", threshold_mb);
	} else if (arg1 == &memstat_ballast_offset) {
		memorystatus_log("memorystatus: setting ballast page shortage"
		    " offset to %u MiB\n", threshold_mb);
	} else {
		return EINVAL;
	}
	os_atomic_store((uint32_t *)arg1, new_threshold_pages, relaxed);

	return 0;
}
1283 
1284 #if DEVELOPMENT || DEBUG
1285 #define MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS (CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED)
1286 #else /* RELEASE */
1287 #define MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS (CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED)
1288 #endif /* DEVELOPMENT || DEBUG */
1289 
1290 EXPERIMENT_FACTOR_PROC(_kern_memorystatus, soft_threshold_mb,
1291     MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
1292     &memstat_soft_threshold, 0, memstat_page_shortage_threshold_experiment_handler,
1293     "IU",
1294     "The minimum amount of available memory to maintain before killing "
1295     "processes which have violated there soft memory limit");
1296 
1297 EXPERIMENT_FACTOR_PROC(_kern_memorystatus, idle_threshold_mb,
1298     MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
1299     &memstat_idle_threshold, 0, memstat_page_shortage_threshold_experiment_handler,
1300     "IU",
1301     "The minimum amount of available memory to maintain before exiting idle "
1302     "processes");
1303 EXPERIMENT_FACTOR_PROC(_kern_memorystatus, critical_threshold_mb,
1304     MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
1305     &memstat_critical_threshold, 0, memstat_page_shortage_threshold_experiment_handler,
1306     "IU",
1307     "The minimum amount of available memory to maintain before killing non-idle "
1308     "processes");
1309 EXPERIMENT_FACTOR_PROC(_kern_memorystatus, ballast_offset_mb,
1310     MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
1311     &memstat_ballast_offset, 0, memstat_page_shortage_threshold_experiment_handler,
1312     "IU",
1313     "An offset to apply to all non-critical page shortage thresholds when "
1314     "ballast is filling");
1315 EXPERIMENT_FACTOR_PROC(_kern_memorystatus, clear_the_decks_offset_mb,
1316     MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
1317     &memstat_ctd_offset, 0, memstat_page_shortage_threshold_experiment_handler,
1318     "IU",
1319     "An offset to apply to all non-critical page shortage thresholds when "
1320     "clear-the-decks is engaged");
1321 
/*
 * Engage (drain == true) or disengage (drain == false) the ballast-drain
 * jetsam policy.
 *
 * Draining sets kPolicyBallastDrain in memstat_policy_config (which adds
 * memstat_ballast_offset pages to the non-critical page shortage
 * thresholds), maxes out the jetsam thread pool, and pokes the jetsam
 * thread. Flooding reverses all of that. Both directions are idempotent.
 *
 * Returns 0 on success (including "already in the requested state"),
 * ENOTSUP when fast jetsam is disabled. A configured offset of 0 makes
 * this a no-op.
 */
int
memorystatus_ballast_control(bool drain)
{
	if (!fast_jetsam_enabled) {
		memorystatus_log_error("memorystatus: fast-jetsam "
		    "has been disabled on this system. denying request to %s ballast\n",
		    drain ? "drain" : "flood");
		return ENOTSUP;
	}
	if (memstat_ballast_offset == 0) {
		/* nothing to do */
		return 0;
	}
	if (drain) {
		/*
		 * Drain the ballast tanks, providing additional buoyancy by requiring that
		 * they only be used to store "available" memory.
		 */
		memorystatus_policy_t orig_policy = os_atomic_or_orig(
			&memstat_policy_config,
			(memorystatus_policy_t)kPolicyBallastDrain, relaxed);
		if (orig_policy & kPolicyBallastDrain) {
			/* Already draining: nothing to change. */
			return 0;
		}
		memorystatus_log("memorystatus: draining ballast "
		    "-- will add %u MiB to non-critical page shortage "
		    "thresholds\n", ptoa_32(memstat_ballast_offset) >> 20);
		memorystatus_thread_pool_max();
		_memstat_consider_waking_jetsam_thread();
	} else {
		/*
		 * Flood the ballast tanks, removing the extra buoyancy by allowing them to be
		 * filled with "unavailable" memory.
		 */
		memorystatus_policy_t orig_policy = os_atomic_andnot_orig(
			&memstat_policy_config,
			(memorystatus_policy_t)kPolicyBallastDrain, relaxed);
		if (!(orig_policy & kPolicyBallastDrain)) {
			/* already disabled */
			return 0;
		}
		assertf(fast_jetsam_enabled, "ballast was drained while fast-jetsam was disabled");
		memorystatus_log("memorystatus: flooding ballast "
		    "-- will subtract %u MiB from non-critical page shortage "
		    "thresholds\n", ptoa_32(memstat_ballast_offset) >> 20);
		memorystatus_thread_pool_default();
		_memstat_consider_waking_jetsam_thread();
	}
	return 0;
}
1372 
1373 static int
1374 sysctl_kern_memorystatus_ballast_drain SYSCTL_HANDLER_ARGS
1375 {
1376 	int error = 0;
1377 
1378 	boolean_t drained = os_atomic_load(&memstat_policy_config, relaxed) & kPolicyBallastDrain ? TRUE : FALSE;
1379 
1380 	error = sysctl_handle_int(oidp, &drained, 0, req);
1381 	if (error || !req->newptr) {
1382 		return error;
1383 	}
1384 
1385 	/*
1386 	 * Writers must be root or have the com.apple.private.kernel.jetsam entitlement
1387 	 */
1388 	error = priv_check_cred(kauth_cred_get(), PRIV_VM_JETSAM, 0);
1389 	if (error) {
1390 		return error;
1391 	}
1392 
1393 	return memorystatus_ballast_control(drained);
1394 }
1395 
1396 SYSCTL_PROC(_kern_memorystatus, OID_AUTO, ballast_drained,
1397     MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS, 0, 0,
1398     sysctl_kern_memorystatus_ballast_drain, "IU",
1399     "If true, apply an offset (kern.memorystatus.ballast_offset_mb) to "
1400     "all non-critical page shortage thresholds");
1401 
1402 #if DEVELOPMENT || DEBUG
1403 /*
1404  * In preparation for a storm, sailors may "clear the decks" of non-essential
1405  * cargo to increase the seaworthiness of a vessel. In our analogy, the
1406  * non-essential cargo is idle processes or processes which have exceeded
1407  * their memory limit. The storm may be any foreseeable user activity that will
1408  * require significant memory demand.
1409  *
1410  * Mechanically, clearing the decks involves adding a configurable offset to
1411  * the idle and soft available page shortage thresholds.
1412  *
 * Readers may note that the clear-the-decks policy is mechanically identical
 * to the ballast-draining policy. Their difference lies in intended use.
1415  * Clear-the-decks is intended to address imminent memory demand and may be
1416  * configured with an offset that wouldn't be sustainable for long-term system
1417  * use. The interface is generally intended to allow clients to hint to the
1418  * system that they will need a significant amount of memory in the near future,
 * and the system should proactively try to free unneeded reserves so that it
 * is better able to satisfy that demand.
1421  *
1422  * This policy is currently only exposed on development kernels for prototyping
1423  * until a productized use case emerges
1424  *
1425  * TODO: If adopted on production systems, this mechanism should use a
1426  * dedicated system-call / memorystatus-command
1427  */
/*
 * Engage (clear == true) or disengage (clear == false) the clear-the-decks
 * jetsam policy described in the comment above.
 *
 * Engaging sets kPolicyClearTheDecks in memstat_policy_config (which adds
 * memstat_ctd_offset pages to the non-critical page shortage thresholds),
 * maxes out the jetsam thread pool, and pokes the jetsam thread;
 * disengaging reverses all of that.
 *
 * Returns 0 on success, EALREADY if the policy is already in the requested
 * state, ENOTSUP when fast jetsam is disabled.
 */
static int
memstat_clear_the_decks(bool clear)
{
	if (!fast_jetsam_enabled) {
		memorystatus_log_error("memorystatus: fast-jetsam "
		    "has been disabled on this system\n");
		return ENOTSUP;
	}
	if (clear) {
		/*
		 * Clear the decks of non-essential cargo.
		 */
		memorystatus_policy_t orig_policy = os_atomic_or_orig(
			&memstat_policy_config,
			(memorystatus_policy_t)kPolicyClearTheDecks, relaxed);
		if (orig_policy & kPolicyClearTheDecks) {
			return EALREADY;
		}
		memorystatus_log("memorystatus: clear-the-decks engaged "
		    "-- will add %u MiB to non-critical page shortage "
		    "thresholds\n", ptoa_32(memstat_ctd_offset) >> 20);
		memorystatus_thread_pool_max();
		_memstat_consider_waking_jetsam_thread();
	} else {
		/*
		 * Allow the decks to be reloaded with non-essential cargo.
		 */
		memorystatus_policy_t orig_policy = os_atomic_andnot_orig(
			&memstat_policy_config,
			(memorystatus_policy_t)kPolicyClearTheDecks, relaxed);
		if (!(orig_policy & kPolicyClearTheDecks)) {
			return EALREADY;
		}
		assertf(fast_jetsam_enabled, "clear the decks was set while fast-jetsam was disabled");
		memorystatus_log("memorystatus: clear-the-decks disengaged "
		    "-- will subtract %u MiB from non-critical page shortage "
		    "thresholds\n", ptoa_32(memstat_ctd_offset) >> 20);
		memorystatus_thread_pool_default();
		_memstat_consider_waking_jetsam_thread();
	}
	return 0;
}
1470 
1471 static int
1472 sysctl_kern_memorystatus_decks_cleared SYSCTL_HANDLER_ARGS
1473 {
1474 	int error = 0;
1475 
1476 	boolean_t cleared = os_atomic_load(&memstat_policy_config, relaxed) & kPolicyClearTheDecks ? TRUE : FALSE;
1477 
1478 	error = sysctl_handle_int(oidp, &cleared, 0, req);
1479 	if (error || !req->newptr) {
1480 		return error;
1481 	}
1482 
1483 	/*
1484 	 * Writers must be root or have the com.apple.private.kernel.jetsam entitlement
1485 	 */
1486 	error = priv_check_cred(kauth_cred_get(), PRIV_VM_JETSAM, 0);
1487 	if (error) {
1488 		return error;
1489 	}
1490 
1491 	return memstat_clear_the_decks(cleared);
1492 }
1493 
1494 SYSCTL_PROC(_kern_memorystatus, OID_AUTO, decks_cleared,
1495     MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
1496     0, 0, sysctl_kern_memorystatus_decks_cleared, "I",
1497     "If true, apply an offset (kern.memorystatus_ctd_offset_mb) to "
1498     "all non-critical page shortage thresholds");
1499 #endif /* DEVELOPMENT || DEBUG */
1500 #endif /* CONFIG_JETSAM */
1501 
1502 extern kern_return_t kernel_thread_start_priority(thread_continue_t continuation,
1503     void *parameter,
1504     integer_t priority,
1505     thread_t *new_thread);
1506 
1507 #if DEVELOPMENT || DEBUG
1508 
/*
 * sysctl handler for kern.memorystatus_disconnect_page_mappings (write-only
 * debug aid). Writing a pid disconnects the page mappings of that process's
 * task; writing -1 disconnects mappings for all pages system-wide. Returns
 * EINVAL for an unknown pid and EIO if the per-task disconnect fails.
 * Serialized by disconnect_page_mappings_mutex.
 */
static int
sysctl_memorystatus_disconnect_page_mappings SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	int     error = 0, pid = 0;
	proc_t  p;

	error = sysctl_handle_int(oidp, &pid, 0, req);
	if (error || !req->newptr) {
		return error;
	}

	lck_mtx_lock(&disconnect_page_mappings_mutex);

	if (pid == -1) {
		/* Special case: operate on every page in the system. */
		vm_pageout_disconnect_all_pages();
	} else {
		p = proc_find(pid);

		if (p != NULL) {
			error = task_disconnect_page_mappings(proc_task(p));

			proc_rele(p);

			if (error) {
				error = EIO;
			}
		} else {
			error = EINVAL;
		}
	}
	lck_mtx_unlock(&disconnect_page_mappings_mutex);

	return error;
}
1544 
/*
 * kern.memorystatus_disconnect_page_mappings: write-only; accepts a pid,
 * or -1 to disconnect every page in the system (see handler above).
 */
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_disconnect_page_mappings, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_memorystatus_disconnect_page_mappings, "I", "");
1547 
1548 #endif /* DEVELOPMENT || DEBUG */
1549 
1550 /*
1551  * Sorts the given bucket.
1552  *
1553  * Input:
1554  *	bucket_index - jetsam priority band to be sorted.
1555  *	sort_order - JETSAM_SORT_xxx from kern_memorystatus.h
1556  *		Currently sort_order is only meaningful when handling
1557  *		coalitions.
1558  *
1559  * proc_list_lock must be held by the caller.
1560  */
1561 static void
memorystatus_sort_bucket_locked(unsigned int bucket_index,int sort_order)1562 memorystatus_sort_bucket_locked(unsigned int bucket_index, int sort_order)
1563 {
1564 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
1565 	if (memstat_bucket[bucket_index].count == 0) {
1566 		return;
1567 	}
1568 
1569 	switch (bucket_index) {
1570 	case JETSAM_PRIORITY_FOREGROUND:
1571 		if (memorystatus_sort_by_largest_coalition_locked(bucket_index, sort_order) == 0) {
1572 			/*
1573 			 * Fall back to per process sorting when zero coalitions are found.
1574 			 */
1575 			memorystatus_sort_by_largest_process_locked(bucket_index);
1576 		}
1577 		break;
1578 	default:
1579 		memorystatus_sort_by_largest_process_locked(bucket_index);
1580 		break;
1581 	}
1582 }
1583 
1584 /*
1585  * Picks the sorting routine for a given jetsam priority band.
1586  *
1587  * Input:
1588  *	bucket_index - jetsam priority band to be sorted.
1589  *	sort_order - JETSAM_SORT_xxx from kern_memorystatus.h
1590  *		Currently sort_order is only meaningful when handling
1591  *		coalitions.
1592  *
1593  * Return:
1594  *	0     on success
1595  *      non-0 on failure
1596  */
1597 static int
memorystatus_sort_bucket(unsigned int bucket_index,int sort_order)1598 memorystatus_sort_bucket(unsigned int bucket_index, int sort_order)
1599 {
1600 	int coal_sort_order;
1601 
1602 	/*
1603 	 * Verify the jetsam priority
1604 	 */
1605 	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
1606 		return EINVAL;
1607 	}
1608 
1609 #if DEVELOPMENT || DEBUG
1610 	if (sort_order == JETSAM_SORT_DEFAULT) {
1611 		coal_sort_order = COALITION_SORT_DEFAULT;
1612 	} else {
1613 		coal_sort_order = sort_order;           /* only used for testing scenarios */
1614 	}
1615 #else
1616 	/* Verify default */
1617 	if (sort_order == JETSAM_SORT_DEFAULT) {
1618 		coal_sort_order = COALITION_SORT_DEFAULT;
1619 	} else {
1620 		return EINVAL;
1621 	}
1622 #endif
1623 
1624 	proc_list_lock();
1625 	memorystatus_sort_bucket_locked(bucket_index, coal_sort_order);
1626 	proc_list_unlock();
1627 
1628 	return 0;
1629 }
1630 
1631 /*
1632  * Sort processes by size for a single jetsam bucket.
1633  */
1634 
static void
memorystatus_sort_by_largest_process_locked(unsigned int bucket_index)
{
	proc_t p = NULL, insert_after_proc = NULL, max_proc = NULL;
	proc_t next_p = NULL, prev_max_proc = NULL;
	uint32_t pages = 0, max_pages = 0;
	memstat_bucket_t *current_bucket;

	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
		return;
	}

	current_bucket = &memstat_bucket[bucket_index];

	/*
	 * In-place selection sort on the bucket's TAILQ, descending by page
	 * count: each outer pass scans the unsorted suffix for its largest
	 * process and splices it in right after the sorted prefix
	 * (insert_after_proc marks the end of the sorted prefix).
	 */
	p = TAILQ_FIRST(&current_bucket->list);

	while (p) {
		/* Candidate max starts at the head of the unsorted suffix */
		memorystatus_get_task_page_counts(proc_task(p), &pages, NULL, NULL);
		max_pages = pages;
		max_proc = p;
		prev_max_proc = p;

		while ((next_p = TAILQ_NEXT(p, p_memstat_list)) != NULL) {
			/* traversing list until we find next largest process */
			p = next_p;
			memorystatus_get_task_page_counts(proc_task(p), &pages, NULL, NULL);
			if (pages > max_pages) {
				max_pages = pages;
				max_proc = p;
			}
		}

		if (prev_max_proc != max_proc) {
			/* found a larger process, place it in the list */
			TAILQ_REMOVE(&current_bucket->list, max_proc, p_memstat_list);
			if (insert_after_proc == NULL) {
				/* sorted prefix is empty: new overall maximum goes first */
				TAILQ_INSERT_HEAD(&current_bucket->list, max_proc, p_memstat_list);
			} else {
				TAILQ_INSERT_AFTER(&current_bucket->list, insert_after_proc, max_proc, p_memstat_list);
			}
			prev_max_proc = max_proc;
		}

		/* max_proc is now the last element of the sorted prefix */
		insert_after_proc = max_proc;

		/* Next pass starts at the element following the sorted prefix */
		p = TAILQ_NEXT(max_proc, p_memstat_list);
	}
}
1683 
1684 proc_t
memorystatus_get_first_proc_locked(unsigned int * bucket_index,boolean_t search)1685 memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search)
1686 {
1687 	memstat_bucket_t *current_bucket;
1688 	proc_t next_p;
1689 
1690 	if ((*bucket_index) >= MEMSTAT_BUCKET_COUNT) {
1691 		return NULL;
1692 	}
1693 
1694 	current_bucket = &memstat_bucket[*bucket_index];
1695 	next_p = TAILQ_FIRST(&current_bucket->list);
1696 	if (!next_p && search) {
1697 		while (!next_p && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
1698 			current_bucket = &memstat_bucket[*bucket_index];
1699 			next_p = TAILQ_FIRST(&current_bucket->list);
1700 		}
1701 	}
1702 
1703 	return next_p;
1704 }
1705 
1706 proc_t
memorystatus_get_next_proc_locked(unsigned int * bucket_index,proc_t p,boolean_t search)1707 memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search)
1708 {
1709 	memstat_bucket_t *current_bucket;
1710 	proc_t next_p;
1711 
1712 	if (!p || ((*bucket_index) >= MEMSTAT_BUCKET_COUNT)) {
1713 		return NULL;
1714 	}
1715 
1716 	next_p = TAILQ_NEXT(p, p_memstat_list);
1717 	while (!next_p && search && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
1718 		current_bucket = &memstat_bucket[*bucket_index];
1719 		next_p = TAILQ_FIRST(&current_bucket->list);
1720 	}
1721 
1722 	return next_p;
1723 }
1724 
/* Per-thread jetsam state array; allocated in memorystatus_init() */
jetsam_state_t jetsam_threads;

/* Maximum number of jetsam threads allowed */
#define JETSAM_THREADS_LIMIT   3

/* Number of active jetsam threads */
_Atomic unsigned int active_jetsam_threads = 1;
/* Number of maximum jetsam threads configured */
unsigned int max_jetsam_threads = 1;
1734 
1735 static jetsam_state_t
jetsam_current_thread()1736 jetsam_current_thread()
1737 {
1738 	for (int thr_id = 0; thr_id < max_jetsam_threads; thr_id++) {
1739 		if (jetsam_threads[thr_id].thread == current_thread()) {
1740 			return &(jetsam_threads[thr_id]);
1741 		}
1742 	}
1743 	return NULL;
1744 }
1745 
1746 #if CONFIG_JETSAM
1747 static void
initialize_entitled_max_task_limit()1748 initialize_entitled_max_task_limit()
1749 {
1750 	/**
1751 	 * We've already stored the potential boot-arg "entitled_max_task_pmem" in
1752 	 * memorystatus_entitled_max_task_footprint_mb as a TUNABLE_DT.  We provide
1753 	 * argptr=NULL and max_len=0 here to check only for existence of the boot-arg.
1754 	 *
1755 	 * The boot-arg takes precedence over memorystatus_swap_all_apps.
1756 	 */
1757 	if (!PE_parse_boot_argn("entitled_max_task_pmem", NULL, 0) && memorystatus_swap_all_apps) {
1758 		/*
1759 		 * When we have swap, we let entitled apps go up to the dram config
1760 		 * regardless of what's set in EDT,
1761 		 * This can still be overriden with the entitled_max_task_pmem boot-arg.
1762 		 */
1763 		memorystatus_entitled_max_task_footprint_mb =
1764 		    (int32_t)(max_mem_actual / (1ULL << 20));
1765 		memorystatus_entitled_dev_max_task_footprint_mb =
1766 		    memorystatus_entitled_max_task_footprint_mb;
1767 	}
1768 
1769 	if (memorystatus_entitled_max_task_footprint_mb < 0) {
1770 		memorystatus_log_error("Invalid value (%d) for entitled_max_task_pmem. "
1771 		    "Setting to 0\n", memorystatus_entitled_max_task_footprint_mb);
1772 		memorystatus_entitled_max_task_footprint_mb = 0;
1773 	}
1774 
1775 	if (memorystatus_entitled_dev_max_task_footprint_mb < -1) {
1776 		memorystatus_log_error("Invalid value (%d) for entitled_max_developer_task_pmem. "
1777 		    "Setting to 0\n", memorystatus_entitled_dev_max_task_footprint_mb);
1778 		memorystatus_entitled_dev_max_task_footprint_mb = 0;
1779 	} else if (memorystatus_entitled_dev_max_task_footprint_mb == -1) {
1780 		memorystatus_entitled_dev_max_task_footprint_mb = (int32_t)
1781 		    (max_mem_actual >> 20);
1782 	}
1783 
1784 	if (memorystatus_entitled_dev_max_task_footprint_mb &&
1785 	    memorystatus_entitled_dev_max_task_footprint_mb <
1786 	    memorystatus_entitled_max_task_footprint_mb) {
1787 		memorystatus_log_error("memorystatus: Entitled developer limit (%d MB) "
1788 		    "must be ≥ entitled task limit (%d MB)\n",
1789 		    memorystatus_entitled_dev_max_task_footprint_mb,
1790 		    memorystatus_entitled_max_task_footprint_mb);
1791 		memorystatus_entitled_dev_max_task_footprint_mb =
1792 		    memorystatus_entitled_max_task_footprint_mb;
1793 	}
1794 }
1795 
1796 #endif /* CONFIG_JETSAM */
1797 
1798 
/*
 * One-time boot initialization of the memorystatus subsystem: freezer
 * tunables, priority buckets, idle-demotion machinery, jetsam thresholds,
 * snapshot buffers, and the jetsam worker threads.
 */
__private_extern__ void
memorystatus_init(void)
{
	kern_return_t result;
	int i;

#if CONFIG_FREEZE
	/* Freezer tunables start at their compile-time defaults */
	memorystatus_frozen_processes_max = FREEZE_PROCESSES_MAX_DEFAULT;
	memorystatus_frozen_shared_mb_max = ((MAX_FROZEN_SHARED_MB_PERCENT * max_task_footprint_mb) / 100); /* 10% of the system wide task limit */
	memorystatus_freeze_shared_mb_per_process_max = (memorystatus_frozen_shared_mb_max / 4);
	memorystatus_freeze_pages_min = FREEZE_PAGES_MIN_DEFAULT;
	memorystatus_freeze_pages_max = FREEZE_PAGES_MAX_DEFAULT;
	memorystatus_max_frozen_demotions_daily = MAX_FROZEN_PROCESS_DEMOTIONS_DEFAULT;
	memorystatus_thaw_count_demotion_threshold = MIN_THAW_DEMOTION_THRESHOLD_DEFAULT;
	memorystatus_min_thaw_refreeze_threshold = MIN_THAW_REFREEZE_THRESHOLD_DEFAULT;
#endif /* CONFIG_FREEZE */

#if DEVELOPMENT || DEBUG
	if (kill_on_no_paging_space) {
		max_kill_priority = JETSAM_PRIORITY_MAX;
	}
#endif
	// Note: no-op pending rdar://27006343 (Custom kernel log handles)
	memorystatus_log_handle = os_log_create("com.apple.xnu", "memorystatus");

	/* Init buckets */
	for (i = 0; i < MEMSTAT_BUCKET_COUNT; i++) {
		TAILQ_INIT(&memstat_bucket[i].list);
		memstat_bucket[i].count = 0;
		memstat_bucket[i].relaunch_high_count = 0;
	}
	memorystatus_idle_demotion_call = thread_call_allocate((thread_call_func_t)memorystatus_perform_idle_demotion, NULL);

	/* Sysprocs and apps currently share the same idle-deferral interval */
	nanoseconds_to_absolutetime(memstat_idle_deferral_time_s * NSEC_PER_SEC, &memorystatus_sysprocs_idle_delay_time);
	nanoseconds_to_absolutetime(memstat_idle_deferral_time_s * NSEC_PER_SEC, &memorystatus_apps_idle_delay_time);
	assert3u(memstat_idle_deferral_time_s, >=, kJetsamSysProcsIdleDelayTimeLowRatio);

#if CONFIG_JETSAM
	/* Debugging aid: panic when a named process is jetsammed */
	bzero(memorystatus_jetsam_proc_name_panic, sizeof(memorystatus_jetsam_proc_name_panic));
	if (PE_parse_boot_argn("jetsam_proc_name_panic", &memorystatus_jetsam_proc_name_panic, sizeof(memorystatus_jetsam_proc_name_panic))) {
		/*
		 * No bounds check to see if this is a valid cause.
		 * This is a debugging aid. The callers should know precisely which cause they wish to track.
		 */
		PE_parse_boot_argn("jetsam_proc_cause_panic", &memorystatus_jetsam_proc_cause_panic, sizeof(memorystatus_jetsam_proc_cause_panic));
		PE_parse_boot_argn("jetsam_proc_size_panic", &memorystatus_jetsam_proc_size_panic, sizeof(memorystatus_jetsam_proc_size_panic));
	}

	if (memorystatus_swap_all_apps && vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
		panic("kern.swap_all_apps is not supported on this platform");
	}

	/*
	 * The aging bands cannot overlap with the JETSAM_PRIORITY_ELEVATED_INACTIVE
	 * band and must be below it in priority. This is so that we don't have to make
	 * our 'aging' code worry about a mix of processes, some of which need to age
	 * and some others that need to stay elevated in the jetsam bands.
	 */
	assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > system_procs_aging_band);
	assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > system_procs_aging_band_stuck);
	assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > applications_aging_band);

	/* Take snapshots for idle-exit kills by default? First check the boot-arg... */
	if (!PE_parse_boot_argn("jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot))) {
		/* ...no boot-arg, so check the device tree */
		PE_get_default("kern.jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot));
	}

	memorystatus_sysproc_aging_aggr_pages = (unsigned int)MEMSTAT_PERCENT_TOTAL_PAGES(MEMORYSTATUS_AGGR_SYSPROC_AGING_PERCENTAGE);

	/* Small-memory devices use a different delta percentage */
	if (max_mem <= MEMORYSTATUS_SMALL_MEMORY_THRESHOLD) {
		memorystatus_delta = (unsigned int)MEMSTAT_PERCENT_TOTAL_PAGES(MEMORYSTATUS_DELTA_PERCENTAGE_SMALL);
	} else {
		memorystatus_delta = (unsigned int)MEMSTAT_PERCENT_TOTAL_PAGES(MEMORYSTATUS_DELTA_PERCENTAGE_LARGE);
	}

	/*
	 * Page-shortage thresholds: explicit MB overrides win, otherwise they
	 * are derived as percentages/ratios of the memory config.
	 *
	 * NOTE(review): the '<< 20' below is performed in the variable's
	 * native (32-bit) width, unlike the freeze-threshold path later which
	 * widens to uint64_t first — confirm large override values cannot
	 * overflow here.
	 */
	if (memorystatus_critical_threshold_mb != 0) {
		memstat_critical_threshold = atop_32(memorystatus_critical_threshold_mb << 20);
	} else {
		if (max_mem <= MEMORYSTATUS_SMALL_MEMORY_THRESHOLD) {
			memstat_critical_threshold = MEMSTAT_PERCENT_TOTAL_PAGES(MEMORYSTATUS_CRITICAL_PERCENTAGE_SMALL);
		} else {
			memstat_critical_threshold = MEMSTAT_PERCENT_TOTAL_PAGES(MEMORYSTATUS_CRITICAL_PERCENTAGE_LARGE);
		}
	}
	assert3u(memstat_critical_threshold, <, MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX);

	if (memorystatus_idle_threshold_mb != 0) {
		memstat_idle_threshold = atop_32(memorystatus_idle_threshold_mb << 20);
	} else {
		/*
		 * For historical reasons, devices with "medium"-sized memory configs have a different critical:idle:pressure ratio
		 */
		if ((max_mem > MEMORYSTATUS_SMALL_MEMORY_THRESHOLD) &&
		    (max_mem <= MEMORYSTATUS_MEDIUM_MEMORY_THRESHOLD)) {
			memstat_idle_threshold = (MEMORYSTATUS_IDLE_RATIO_NUM_MEDIUM * memstat_critical_threshold) /
			    MEMORYSTATUS_IDLE_RATIO_DENOM_MEDIUM;
		} else {
			memstat_idle_threshold = (MEMORYSTATUS_IDLE_RATIO_NUM * memstat_critical_threshold) /
			    MEMORYSTATUS_IDLE_RATIO_DENOM;
		}
	}
	assert3u(memstat_idle_threshold, <, MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX);

	if (memorystatus_pressure_threshold_mb != 0) {
		memstat_soft_threshold = atop_32(memorystatus_pressure_threshold_mb << 20);
	} else {
		if ((max_mem > MEMORYSTATUS_SMALL_MEMORY_THRESHOLD) &&
		    (max_mem <= MEMORYSTATUS_MEDIUM_MEMORY_THRESHOLD)) {
			memstat_soft_threshold = (MEMORYSTATUS_PRESSURE_RATIO_NUM_MEDIUM * memstat_critical_threshold) /
			    MEMORYSTATUS_PRESSURE_RATIO_DENOM_MEDIUM;
		} else {
			memstat_soft_threshold = (MEMORYSTATUS_PRESSURE_RATIO_NUM * memstat_critical_threshold) /
			    MEMORYSTATUS_PRESSURE_RATIO_DENOM;
		}
	}
	assert3u(memstat_soft_threshold, <, MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX);

	if (memstat_ballast_offset_mb != 0) {
		memstat_ballast_offset = atop_32(memstat_ballast_offset_mb << 20);
	}
	assert3u(memstat_ballast_offset, <, MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX);

	if (memstat_ctd_offset_mb != 0) {
		memstat_ctd_offset = atop_32(memstat_ctd_offset_mb << 20);
	}
	assert3u(memstat_ctd_offset, <, MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX);

	/* Set the swapin trigger in pages based on the maximum size allocated for each c_seg */
	memorystatus_swapin_trigger_pages = (unsigned int) atop_64(memorystatus_swapin_trigger_segments * c_seg_allocsize);

	/* Jetsam Loop Detection */
	if (max_mem <= (512 * 1024 * 1024)) {
		/* 512 MB devices */
		memorystatus_jld_eval_period_msecs = 8000;      /* 8000 msecs == 8 second window */
	} else {
		/* 1GB and larger devices */
		memorystatus_jld_eval_period_msecs = 6000;      /* 6000 msecs == 6 second window */
	}

	memorystatus_jld_enabled = TRUE;

	initialize_entitled_max_task_limit();
#endif /* CONFIG_JETSAM */

	/* Jetsam snapshot holds one entry per proc, sized by maxproc */
	memorystatus_jetsam_snapshot_max = maxproc;

	memorystatus_jetsam_snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
	    (sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_max);

	memorystatus_jetsam_snapshot = kalloc_data(memorystatus_jetsam_snapshot_size, Z_WAITOK | Z_ZERO);
	if (!memorystatus_jetsam_snapshot) {
		panic("Could not allocate memorystatus_jetsam_snapshot");
	}

#if CONFIG_FREEZE
	/* The freezer snapshot is a scaled-down copy of the main snapshot */
	memorystatus_jetsam_snapshot_freezer_max = memorystatus_jetsam_snapshot_max / JETSAM_SNAPSHOT_FREEZER_MAX_FACTOR;
	memorystatus_jetsam_snapshot_freezer_size = sizeof(memorystatus_jetsam_snapshot_t) +
	    (sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_freezer_max);

	memorystatus_jetsam_snapshot_freezer =
	    zalloc_permanent(memorystatus_jetsam_snapshot_freezer_size, ZALIGN_PTR);
#endif /* CONFIG_FREEZE */

	nanoseconds_to_absolutetime((uint64_t)JETSAM_SNAPSHOT_TIMEOUT_SECS * NSEC_PER_SEC, &memorystatus_jetsam_snapshot_timeout);

	memset(&memorystatus_at_boot_snapshot, 0, sizeof(memorystatus_jetsam_snapshot_t));

#if CONFIG_FREEZE
	/* Explicit MB override wins here too; note the uint64_t widening */
	if (memorystatus_freeze_threshold_mb != 0) {
		memorystatus_freeze_threshold = (unsigned int)atop_64((uint64_t)memorystatus_freeze_threshold_mb << 20);
	} else {
		memorystatus_freeze_threshold = (unsigned int)MEMSTAT_PERCENT_TOTAL_PAGES(MEMORYSTATUS_FREEZE_THRESHOLD_PERCENTAGE);
	}
	assert(memorystatus_freeze_threshold < (unsigned int)atop_64(max_mem));

	if (memorystatus_swap_all_apps) {
		/*
		 * Swap is enabled, so we expect a larger working set & larger apps.
		 * Adjust thresholds accordingly.
		 */
		memorystatus_freeze_configure_for_swap();
	}
#endif

	/* Check the boot-arg to configure the maximum number of jetsam threads */
	if (!PE_parse_boot_argn("max_jetsam_threads", &max_jetsam_threads, sizeof(max_jetsam_threads))) {
		max_jetsam_threads = JETSAM_THREADS_LIMIT;
	}

	/* Restrict the maximum number of jetsam threads to JETSAM_THREADS_LIMIT */
	if (max_jetsam_threads > JETSAM_THREADS_LIMIT) {
		max_jetsam_threads = JETSAM_THREADS_LIMIT;
	}

#if CONFIG_JETSAM
	/* For low CPU systems disable fast jetsam mechanism */
	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
		max_jetsam_threads = 1;
	}
#endif /* CONFIG_JETSAM */

#if DEVELOPMENT || DEBUG
	if (PE_parse_boot_argn("-memorystatus-skip-fg-notify", &i, sizeof(i))) {
		memorystatus_should_issue_fg_band_notify = false;
	}
#endif /* DEVELOPMENT || DEBUG */

	/* Initialize the jetsam_threads state array */
	jetsam_threads = zalloc_permanent(sizeof(struct jetsam_state_s) *
	    max_jetsam_threads, ZALIGN(struct jetsam_state_s));

	/* Initialize all the jetsam threads */
	for (i = 0; i < max_jetsam_threads; i++) {
		jetsam_threads[i].inited = false;
		jetsam_threads[i].index = i;
		result = kernel_thread_start_priority(memorystatus_thread, NULL, 95 /* MAXPRI_KERNEL */, &jetsam_threads[i].thread);
		if (result != KERN_SUCCESS) {
			panic("Could not create memorystatus_thread %d", i);
		}
		/* drop the creation reference; the thread array keeps the handle */
		thread_deallocate(jetsam_threads[i].thread);
	}

#if VM_PRESSURE_EVENTS
	memorystatus_notify_init();
#endif /* VM_PRESSURE_EVENTS */

#if JETSAM_ZPRINT_SNAPSHOT
	/* Pre-size buffers used to capture zone stats at jetsam time */
	size_t jzs_names_size, jzs_info_size, jzs_meminfo_size;

	jzs_zone_cnt = zone_max_zones();
	jzs_names_size = jzs_zone_cnt * sizeof(mach_zone_name_t);
	jzs_names = zalloc_permanent(jzs_names_size, ZALIGN(mach_zone_name_t));

	jzs_info_size = jzs_zone_cnt * sizeof(mach_zone_info_t);
	jzs_info = zalloc_permanent(jzs_info_size, ZALIGN(mach_zone_info_t));

	jzs_coalesce = zalloc_permanent(jzs_zone_cnt * sizeof(int), ZALIGN(int));

	jzs_meminfo_cnt = vm_page_diagnose_estimate();
	jzs_meminfo_size = jzs_meminfo_cnt * sizeof(mach_memory_info_t);
	jzs_meminfo = kalloc_data_tag(jzs_meminfo_size, Z_WAITOK, VM_KERN_MEMORY_DIAG);
#endif /* JETSAM_ZPRINT_SNAPSHOT */
}
2043 
2044 #if CONFIG_JETSAM
2045 bool
memorystatus_disable_swap(void)2046 memorystatus_disable_swap(void)
2047 {
2048 #if DEVELOPMENT || DEBUG
2049 	int boot_arg_val = 0;
2050 	if (PE_parse_boot_argn("kern.swap_all_apps", &boot_arg_val, sizeof(boot_arg_val))) {
2051 		if (boot_arg_val) {
2052 			/* Can't disable app swap if it was set via a boot-arg */
2053 			return false;
2054 		}
2055 	}
2056 #endif /* DEVELOPMENT || DEBUG */
2057 	memorystatus_swap_all_apps = false;
2058 #if CONFIG_FREEZE
2059 	/* Go back to the smaller freezer thresholds */
2060 	memorystatus_freeze_disable_swap();
2061 #endif /* CONFIG_FREEZE */
2062 	initialize_entitled_max_task_limit();
2063 	return true;
2064 }
2065 #endif /* CONFIG_JETSAM */
2066 
2067 /*
2068  * The jetsam no frills kill call
2069  *      Return: 0 on success
2070  *		error code on failure (EINVAL...)
2071  */
2072 static int
jetsam_do_kill(proc_t p,int jetsam_flags,os_reason_t jetsam_reason)2073 jetsam_do_kill(proc_t p, int jetsam_flags, os_reason_t jetsam_reason)
2074 {
2075 	int error = 0;
2076 	error = exit_with_reason(p, W_EXITCODE(0, SIGKILL), (int *)NULL, FALSE, FALSE, jetsam_flags, jetsam_reason);
2077 	return error;
2078 }
2079 
2080 /*
2081  * Wrapper for processes exiting with memorystatus details
2082  */
/*
 * Kill 'p' for the given jetsam 'cause', logging, tracing, and running the
 * VM compactor afterwards. On success, *footprint_of_killed_proc is set to
 * the victim's physical footprint (0 on failure). Returns TRUE when the kill
 * succeeded. Takes its own reference on jetsam_reason around the kill.
 */
static boolean_t
memorystatus_do_kill(proc_t p, uint32_t cause, os_reason_t jetsam_reason, uint64_t *footprint_of_killed_proc)
{
	int error = 0;
	__unused pid_t victim_pid = proc_getpid(p);
	/* capture footprint before the kill tears the task down */
	uint64_t footprint = get_task_phys_footprint(proc_task(p));
#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD)
	int32_t memstat_effectivepriority = p->p_memstat_effectivepriority;
#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_DO_KILL) | DBG_FUNC_START,
	    victim_pid, cause, vm_page_free_count, footprint);
	DTRACE_MEMORYSTATUS4(memorystatus_do_kill, proc_t, p, os_reason_t, jetsam_reason, uint32_t, cause, uint64_t, footprint);

#if CONFIG_JETSAM
	/* Debugging aid: panic when a boot-arg-named process is jetsammed */
	if (*p->p_name && !strncmp(memorystatus_jetsam_proc_name_panic, p->p_name, sizeof(p->p_name))) { /* name */
		if ((!memorystatus_jetsam_proc_cause_panic || cause == memorystatus_jetsam_proc_cause_panic) && /* cause */
		    (!memorystatus_jetsam_proc_size_panic || (footprint >> 20) >= memorystatus_jetsam_proc_size_panic)) { /* footprint */
			panic("memorystatus_do_kill(): requested panic on jetsam of %s (cause: %d and footprint: %llu mb)",
			    memorystatus_jetsam_proc_name_panic, cause, footprint >> 20);
		}
	}
#else /* CONFIG_JETSAM */
#pragma unused(cause)
#endif /* CONFIG_JETSAM */

	/* Kills at or above FG are unusual enough to always log */
	if (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND) {
		memorystatus_log(
			"memorystatus: killing process %d [%s] in high band %s (%d) - memorystatus_available_pages: %llu\n",
			proc_getpid(p), (*p->p_name ? p->p_name : "unknown"),
			memorystatus_priority_band_name(p->p_memstat_effectivepriority), p->p_memstat_effectivepriority,
			(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
	}

	/*
	 * The jetsam_reason (os_reason_t) has enough information about the kill cause.
	 * We don't really need jetsam_flags anymore, so it's okay that not all possible kill causes have been mapped.
	 */
	int jetsam_flags = P_LTERM_JETSAM;
	switch (cause) {
	case kMemorystatusKilledHiwat:                                          jetsam_flags |= P_JETSAM_HIWAT; break;
	case kMemorystatusKilledVnodes:                                         jetsam_flags |= P_JETSAM_VNODE; break;
	case kMemorystatusKilledVMPageShortage:                         jetsam_flags |= P_JETSAM_VMPAGESHORTAGE; break;
	case kMemorystatusKilledVMCompressorThrashing:
	case kMemorystatusKilledVMCompressorSpaceShortage:      jetsam_flags |= P_JETSAM_VMTHRASHING; break;
	case kMemorystatusKilledFCThrashing:                            jetsam_flags |= P_JETSAM_FCTHRASHING; break;
	case kMemorystatusKilledPerProcessLimit:                        jetsam_flags |= P_JETSAM_PID; break;
	case kMemorystatusKilledIdleExit:                                       jetsam_flags |= P_JETSAM_IDLEEXIT; break;
	}
	/* jetsam_do_kill drops a reference. */
	os_reason_ref(jetsam_reason);
	error = jetsam_do_kill(p, jetsam_flags, jetsam_reason);
	*footprint_of_killed_proc = ((error == 0) ? footprint : 0);

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_DO_KILL) | DBG_FUNC_END,
	    victim_pid, memstat_effectivepriority, vm_page_free_count, error);

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_COMPACTOR_RUN) | DBG_FUNC_START,
	    victim_pid, cause, vm_page_free_count, *footprint_of_killed_proc);

	if (jetsam_reason->osr_code == JETSAM_REASON_VNODE) {
		/*
		 * vnode jetsams are syncronous and not caused by memory pressure.
		 * Running the compactor on this thread adds significant latency to the filesystem operation
		 * that triggered this jetsam.
		 * Kick of compactor thread asyncronously instead.
		 */
		vm_wake_compactor_swapper();
	} else {
		vm_run_compactor();
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_COMPACTOR_RUN) | DBG_FUNC_END,
	    victim_pid, cause, vm_page_free_count);

	/* drop the reference taken above */
	os_reason_free(jetsam_reason);
	return error == 0;
}
2161 
2162 /*
2163  * Pin a process to a particular jetsam band when it is in the background i.e. not doing active work.
2164  * For an application: that means no longer in the FG band
2165  * For a daemon: that means no longer in its 'requested' jetsam priority band
2166  */
2167 
2168 int
memorystatus_update_inactive_jetsam_priority_band(pid_t pid,uint32_t op_flags,int jetsam_prio,boolean_t effective_now)2169 memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t op_flags, int jetsam_prio, boolean_t effective_now)
2170 {
2171 	int error = 0;
2172 	boolean_t enable = FALSE;
2173 	proc_t  p = NULL;
2174 
2175 	if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE) {
2176 		enable = TRUE;
2177 	} else if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE) {
2178 		enable = FALSE;
2179 	} else {
2180 		return EINVAL;
2181 	}
2182 
2183 	p = proc_find(pid);
2184 	if (p != NULL) {
2185 		if ((enable && _memstat_proc_is_elevated(p)) ||
2186 		    (!enable && !_memstat_proc_is_elevated(p))) {
2187 			/*
2188 			 * No change in state.
2189 			 */
2190 		} else {
2191 			proc_list_lock();
2192 
2193 			if (enable) {
2194 				p->p_memstat_state |= P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
2195 
2196 				if (effective_now) {
2197 					if (p->p_memstat_effectivepriority < jetsam_prio) {
2198 						memstat_update_priority_locked(p, jetsam_prio, MEMSTAT_PRIORITY_OPTIONS_NONE);
2199 					}
2200 				} else {
2201 					if (_memstat_proc_is_aging(p)) {
2202 						memstat_update_priority_locked(p, JETSAM_PRIORITY_IDLE, MEMSTAT_PRIORITY_NO_AGING);
2203 					}
2204 				}
2205 			} else {
2206 				p->p_memstat_state &= ~P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
2207 
2208 				if (effective_now) {
2209 					if (p->p_memstat_effectivepriority == jetsam_prio) {
2210 						memstat_update_priority_locked(p, JETSAM_PRIORITY_IDLE, MEMSTAT_PRIORITY_NO_AGING);
2211 					}
2212 				} else {
2213 					if (_memstat_proc_is_aging(p)) {
2214 						memstat_update_priority_locked(p, JETSAM_PRIORITY_IDLE, MEMSTAT_PRIORITY_NO_AGING);
2215 					}
2216 				}
2217 			}
2218 
2219 			proc_list_unlock();
2220 		}
2221 		proc_rele(p);
2222 		error = 0;
2223 	} else {
2224 		error = ESRCH;
2225 	}
2226 
2227 	return error;
2228 }
2229 
/*
 * Thread-call worker: walk each aging band and demote to JETSAM_PRIORITY_IDLE
 * every proc whose idle deadline has passed. Procs with outstanding work
 * (dirty sysprocs, or tasks holding assertions) get their deadline extended
 * instead; sysprocs stuck behind assertions are parked in the
 * JETSAM_PRIORITY_AGING_BAND1_STUCK band. Reschedules itself at the end.
 */
static void
memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2)
{
	proc_t p;
	uint64_t current_time = 0, idle_delay_time = 0;
	int demote_prio_band = 0;
	memstat_bucket_t *demotion_bucket;

	memorystatus_log_debug("memorystatus_perform_idle_demotion()\n");

	/* Nothing to do if no aging bands are configured */
	if (!system_procs_aging_band && !system_procs_aging_band_stuck && !applications_aging_band) {
		return;
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_START);

	current_time = mach_absolute_time();

	proc_list_lock();

	demote_prio_band = JETSAM_PRIORITY_IDLE + 1;

	for (; demote_prio_band < JETSAM_PRIORITY_MAX; demote_prio_band++) {
		/* only the configured aging bands hold demotion candidates */
		if (demote_prio_band != system_procs_aging_band &&
		    demote_prio_band != system_procs_aging_band_stuck &&
		    demote_prio_band != applications_aging_band) {
			continue;
		}

		demotion_bucket = &memstat_bucket[demote_prio_band];
		p = TAILQ_FIRST(&demotion_bucket->list);

		while (p) {
			memorystatus_log_debug("memorystatus_perform_idle_demotion() found %s [%d]\n", proc_best_name(p), proc_getpid(p));

			assert(p->p_memstat_idledeadline);

			assert(_memstat_proc_is_aging(p));

			if (current_time >= p->p_memstat_idledeadline) {
				proc_t next_proc = NULL;

				/* grab the successor before this proc may be re-queued */
				next_proc = TAILQ_NEXT(p, p_memstat_list);

				if ((isSysProc(p) && _memstat_proc_is_dirty(p)) || /* system proc marked dirty*/
				    task_has_assertions(proc_task(p))) {     /* has outstanding assertions which might indicate outstanding work too */
					idle_delay_time = (isSysProc(p)) ? memorystatus_sysprocs_idle_time(p) : memorystatus_apps_idle_time(p);

					if (isSysProc(p) && task_has_assertions(proc_task(p)) && demote_prio_band != system_procs_aging_band_stuck) {
						memorystatus_log_debug("memorystatus_perform_idle_demotion() found stuck process %d [%s], moving to JETSAM_PRIORITY_AGING_BAND1_STUCK\n",
						    proc_getpid(p), (*p->p_name ? p->p_name : "(unknown)"));
						memstat_update_priority_locked(p, JETSAM_PRIORITY_AGING_BAND1_STUCK, MEMSTAT_PRIORITY_NO_AGING);
					}

					/* still busy: push the deadline out by another interval */
					p->p_memstat_idledeadline += idle_delay_time;
				} else {
					memstat_update_priority_locked(p, JETSAM_PRIORITY_IDLE, MEMSTAT_PRIORITY_NO_AGING);
				}
				p = next_proc;
			} else {
				// No further candidates
				break;
			}
		}
	}

	_memstat_reschedule_idle_demotion_locked();

	proc_list_unlock();

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_END);
}
2302 
2303 /*
2304  * Schedule a process for idle demotion. Updates the process' idle deadline
2305  * and marks it as aging. The caller is responsible for rescheduling the idle
2306  * demotion thread
2307  */
2308 static void
_memstat_schedule_idle_demotion_locked(proc_t p)2309 _memstat_schedule_idle_demotion_locked(proc_t p)
2310 {
2311 	uint64_t  idle_delay_time = 0;
2312 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
2313 	assert(system_procs_aging_band || applications_aging_band);
2314 	assert(!_memstat_proc_is_aging(p));
2315 
2316 	memorystatus_log_debug(
2317 		"%s: scheduling demotion to idle band for pid %d (dirty:0x%x).\n",
2318 		__func__, proc_getpid(p), p->p_memstat_dirty);
2319 
2320 	idle_delay_time = isSysProc(p) ? memorystatus_sysprocs_idle_time(p) :
2321 	    memorystatus_apps_idle_time(p);
2322 	p->p_memstat_dirty |= P_DIRTY_AGING_IN_PROGRESS;
2323 	p->p_memstat_idledeadline = mach_absolute_time() + idle_delay_time;
2324 }
2325 
2326 /*
2327  * Cancel a process' idle demotion. The caller must also reschedule the idle
2328  * demotion thread.
2329  */
2330 static void
_memstat_invalidate_idle_demotion_locked(proc_t p)2331 _memstat_invalidate_idle_demotion_locked(proc_t p)
2332 {
2333 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
2334 	assert(system_procs_aging_band || applications_aging_band);
2335 	assert(_memstat_proc_is_aging(p));
2336 
2337 	memorystatus_log_debug(
2338 		"%s: invalidating demotion to idle band for %s [%d]\n",
2339 		__func__, proc_best_name(p), proc_getpid(p));
2340 
2341 	p->p_memstat_idledeadline = 0;
2342 	p->p_memstat_dirty &= ~P_DIRTY_AGING_IN_PROGRESS;
2343 }
2344 
2345 /*
2346  * Return the earliest idle deadline of all aging procs. Returns 0 if there are
2347  * no aging procs.
2348  */
2349 static uint64_t
_memstat_find_earliest_idle_deadline(void)2350 _memstat_find_earliest_idle_deadline(void)
2351 {
2352 	memstat_bucket_t *demotion_bucket;
2353 	proc_t oldest_proc = PROC_NULL;
2354 	uint32_t aging_app_count = 0, aging_sysproc_count = 0, aging_sysproc_count_stuck = 0;
2355 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
2356 	assert(system_procs_aging_band || system_procs_aging_band_stuck || applications_aging_band);
2357 
2358 	if (system_procs_aging_band) {
2359 		aging_sysproc_count = memstat_bucket[system_procs_aging_band].count;
2360 	}
2361 	if (system_procs_aging_band_stuck) {
2362 		aging_sysproc_count_stuck = memstat_bucket[system_procs_aging_band_stuck].count;
2363 	}
2364 	if (applications_aging_band) {
2365 		aging_app_count = memstat_bucket[applications_aging_band].count;
2366 	}
2367 
2368 	if ((aging_app_count + aging_sysproc_count + aging_sysproc_count_stuck) == 0) {
2369 		return 0;
2370 	}
2371 
2372 	if (system_procs_aging_band && aging_sysproc_count > 0) {
2373 		demotion_bucket = &memstat_bucket[system_procs_aging_band];
2374 		oldest_proc = TAILQ_FIRST(&demotion_bucket->list);
2375 	}
2376 
2377 	if (system_procs_aging_band_stuck && aging_sysproc_count_stuck > 0) {
2378 		proc_t oldest_sysproc_stuck;
2379 		demotion_bucket = &memstat_bucket[system_procs_aging_band_stuck];
2380 		oldest_sysproc_stuck = TAILQ_FIRST(&demotion_bucket->list);
2381 
2382 		if (oldest_proc) {
2383 			if (oldest_sysproc_stuck->p_memstat_idledeadline <
2384 			    oldest_proc->p_memstat_idledeadline) {
2385 				oldest_proc = oldest_sysproc_stuck;
2386 			}
2387 		} else {
2388 			oldest_proc = oldest_sysproc_stuck;
2389 		}
2390 	}
2391 
2392 	if (applications_aging_band && aging_app_count > 0) {
2393 		proc_t oldest_app;
2394 		demotion_bucket = &memstat_bucket[applications_aging_band];
2395 		oldest_app = TAILQ_FIRST(&demotion_bucket->list);
2396 
2397 		if (!oldest_proc ||
2398 		    (oldest_app->p_memstat_idledeadline <
2399 		    oldest_proc->p_memstat_idledeadline)) {
2400 			oldest_proc = oldest_app;
2401 		}
2402 	}
2403 
2404 	assert(oldest_proc);
2405 	assert(oldest_proc->p_memstat_idledeadline);
2406 	assert(_memstat_proc_is_aging(oldest_proc));
2407 
2408 	return oldest_proc->p_memstat_idledeadline;
2409 }
2410 
/*
 * Reschedule or cancel a pending wakeup of the idle_demotion thread. If called
 * in response to a process transitioning in/out of the aging band, then
 * rescheduling must occur *after* the new priority is updated.
 */
static void
_memstat_reschedule_idle_demotion_locked(void)
{
	uint64_t idle_deadline;
	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);

	/*
	 * NOTE(review): this early-return checks only the sysproc and app
	 * aging bands, while _memstat_find_earliest_idle_deadline() also
	 * accepts system_procs_aging_band_stuck. Presumably the stuck band is
	 * only ever configured together with system_procs_aging_band --
	 * confirm; otherwise stuck procs would never be rescheduled here.
	 */
	if (!system_procs_aging_band && !applications_aging_band) {
		return;
	}
	idle_deadline = _memstat_find_earliest_idle_deadline();
	if (idle_deadline == 0) {
		/* No aging processes, cancel call to demotion thread */
		thread_call_cancel(memorystatus_idle_demotion_call);
	} else if (memstat_idle_demotion_deadline != idle_deadline) {
		/* Deadline moved: (re)arm the thread call for the new time. */
		thread_call_enter_delayed(memorystatus_idle_demotion_call, idle_deadline);
	}
	/* Cache the armed deadline so redundant re-arms are skipped above. */
	memstat_idle_demotion_deadline = idle_deadline;
}
2434 
/*
 * List manipulation
 */

/*
 * Register a process with the memorystatus (jetsam) subsystem: insert it into
 * the priority bucket matching its current effective priority, start idle
 * aging or idle-time accounting where the target bucket calls for it, and
 * mirror the task's suspension state. Always returns 0.
 *
 * 'locked' indicates whether the caller already holds proc_list_mlock.
 */
int
memorystatus_add(proc_t p, boolean_t locked)
{
	memstat_bucket_t *bucket;
	bool reschedule_demotion = false;

	memorystatus_log_debug("memorystatus_list_add(): adding pid %d with priority %d.\n",
	    proc_getpid(p), p->p_memstat_effectivepriority);

	if (!locked) {
		proc_list_lock();
	}

	DTRACE_MEMORYSTATUS2(memorystatus_add, proc_t, p, int32_t, p->p_memstat_effectivepriority);

	/* Processes marked internal do not have priority tracked */
	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
		goto exit;
	}

	/*
	 * Opt out system processes from being frozen by default.
	 * For coalition-based freezing, we only want to freeze sysprocs that have specifically opted in.
	 */
	if (isSysProc(p)) {
		p->p_memstat_state |= P_MEMSTAT_FREEZE_DISABLED;
	}
#if CONFIG_FREEZE
	memorystatus_freeze_init_proc(p);
#endif

	bucket = &memstat_bucket[p->p_memstat_effectivepriority];

	if ((system_procs_aging_band &&
	    p->p_memstat_effectivepriority == system_procs_aging_band) ||
	    (applications_aging_band &&
	    p->p_memstat_effectivepriority == applications_aging_band)) {
		/* Landing directly in an aging band: arm the demotion deadline. */
		_memstat_schedule_idle_demotion_locked(p);
		reschedule_demotion = true;
	} else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
		/*
		 * Entering the idle band.
		 * Record idle start time.
		 */
		p->p_memstat_idle_start = mach_absolute_time();
	}

	TAILQ_INSERT_TAIL(&bucket->list, p, p_memstat_list);
	bucket->count++;
	if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
		/* Buckets separately count procs expected to relaunch quickly. */
		bucket->relaunch_high_count++;
	}

	memorystatus_list_count++;

	/* Reschedule only after the proc is linked into its bucket. */
	if (reschedule_demotion) {
		_memstat_reschedule_idle_demotion_locked();
	}

	task_t t = proc_task(p);
	if (t && task_is_app_suspended(t)) {
		/* Mirror an already-suspended task into memorystatus state. */
		_memstat_proc_set_suspended(p);
	}

	_memstat_consider_waking_jetsam_thread();

exit:
	if (!locked) {
		proc_list_unlock();
	}

	return 0;
}
2512 
2513 /*
2514  * Record timestamps if process p is transitioning in/out of the IDLE band.
2515  */
2516 static void
_memstat_record_idle_transition(proc_t p,int new_priority)2517 _memstat_record_idle_transition(proc_t p, int new_priority)
2518 {
2519 	if (p->p_memstat_effectivepriority == new_priority) {
2520 		/* no change in priority */
2521 		return;
2522 	}
2523 	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
2524 		uint64_t now;
2525 		/*
2526 		 * Transitioning out of the idle priority bucket.
2527 		 * Record idle delta.
2528 		 */
2529 		assert(p->p_memstat_idle_start != 0);
2530 		now = mach_absolute_time();
2531 		assert3u(now, >, p->p_memstat_idle_start);
2532 		p->p_memstat_idle_delta = now - p->p_memstat_idle_start;
2533 
2534 		/*
2535 		 * About to become active and so memory footprint could change.
2536 		 * So mark it eligible for freeze-considerations next time around.
2537 		 */
2538 		p->p_memstat_state &= ~P_MEMSTAT_FREEZE_IGNORE;
2539 
2540 		_memstat_consider_waking_jetsam_thread();
2541 	} else if (new_priority == JETSAM_PRIORITY_IDLE) {
2542 		/*
2543 		 * Transitioning into the idle priority bucket.
2544 		 * Record idle start.
2545 		 */
2546 		p->p_memstat_idle_start = mach_absolute_time();
2547 	}
2548 }
2549 
/*
 * Description:
 *	Moves a process from one jetsam bucket to another.
 *	which changes the LRU position of the process.
 *
 *	Monitors transition between buckets and if necessary
 *	will update cached memory limits accordingly.
 *
 */
void
memstat_update_priority_locked(proc_t p,
    int priority,
    memstat_priority_options_t options)
{
	memstat_bucket_t *old_bucket, *new_bucket;
	bool reschedule_demotion = false;

	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);

	assert(priority < MEMSTAT_BUCKET_COUNT);

	/* Ensure that exit isn't underway, leaving the proc retained but removed from its bucket */
	if (proc_list_exited(p)) {
		return;
	}

	memorystatus_log_debug("memorystatus: setting %s(%d) to priority %d, inserting at %s\n",
	    (*p->p_name ? p->p_name : "unknown"), proc_getpid(p), priority,
	    (options & MEMSTAT_PRIORITY_INSERT_HEAD) ? "head" : "tail");

	DTRACE_MEMORYSTATUS3(memorystatus_update_priority, proc_t, p, int32_t, p->p_memstat_effectivepriority, int, priority);

	old_bucket = &memstat_bucket[p->p_memstat_effectivepriority];

	/*
	 * Unless the caller explicitly opts out of aging, the requested
	 * priority may be clamped by elevation/aging policy below.
	 */
	if (!(options & MEMSTAT_PRIORITY_NO_AGING)) {
		if (_memstat_proc_is_elevated(p)) {
			/*
			 * 2 types of processes can use the non-standard elevated inactive band:
			 * - Frozen processes that always land in memorystatus_freeze_jetsam_band
			 * OR
			 * - processes that specifically opt-in to the elevated inactive support e.g. docked processes.
			 */
			if (_memstat_proc_is_frozen(p) &&
			    priority <= memorystatus_freeze_jetsam_band) {
				priority = memorystatus_freeze_jetsam_band;
			} else if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE) {
				priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;
			}
		}
		if (_memstat_proc_is_tracked(p)) {
			/* Dirty-tracked (sysproc) procs age through the sysproc bands. */
			if (system_procs_aging_band && priority <= system_procs_aging_band) {
				if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
					/* process has already aged */
					priority = JETSAM_PRIORITY_IDLE;
				} else {
					priority = system_procs_aging_band;
				}
			} else if (system_procs_aging_band_stuck && priority <= system_procs_aging_band_stuck) {
				if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
					/* process has already aged */
					priority = JETSAM_PRIORITY_IDLE;
				} else {
					/* don't let anyone move anything between sysproc and sysproc stuck inclusive */
					priority = system_procs_aging_band;
				}
			}
		} else if (_memstat_proc_is_managed(p)) {
			/* RunningBoard-managed apps age through the applications band. */
			if (applications_aging_band && priority <= applications_aging_band) {
				if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
					/* process has already aged */
					priority = JETSAM_PRIORITY_IDLE;
				} else {
					priority = applications_aging_band;
				}
			}
		}
	}

	/* Unlink from the old bucket and fix up its counters. */
	TAILQ_REMOVE(&old_bucket->list, p, p_memstat_list);
	old_bucket->count--;
	if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
		old_bucket->relaunch_high_count--;
	}

	/* Link into the new bucket (head insertion preserves LRU semantics for idle-head requests). */
	new_bucket = &memstat_bucket[priority];
	if (options & MEMSTAT_PRIORITY_INSERT_HEAD) {
		TAILQ_INSERT_HEAD(&new_bucket->list, p, p_memstat_list);
	} else {
		TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list);
	}
	new_bucket->count++;
	if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
		new_bucket->relaunch_high_count++;
	}

	if (p->p_memstat_effectivepriority != priority) {
		/*
		 * This process is transitioning between
		 * jetsam priority buckets.
		 */
		_memstat_record_idle_transition(p, priority);

		if ((system_procs_aging_band &&
		    p->p_memstat_effectivepriority == system_procs_aging_band) ||
		    (system_procs_aging_band_stuck &&
		    p->p_memstat_effectivepriority == system_procs_aging_band_stuck) ||
		    (applications_aging_band &&
		    p->p_memstat_effectivepriority == applications_aging_band)) {
			/* removing this process from an aging band */
			_memstat_invalidate_idle_demotion_locked(p);
			reschedule_demotion = true;
		}

		if ((system_procs_aging_band &&
		    priority == system_procs_aging_band) ||
		    (system_procs_aging_band_stuck &&
		    priority == system_procs_aging_band_stuck) ||
		    (applications_aging_band &&
		    priority == applications_aging_band)) {
			/* placing this process into an aging band */
			_memstat_schedule_idle_demotion_locked(p);
			reschedule_demotion = true;
		}

		/* Per the reschedule contract, this must follow the bucket move above. */
		if (reschedule_demotion) {
			_memstat_reschedule_idle_demotion_locked();
		}

		KDBG(MEMSTAT_CODE(BSD_MEMSTAT_CHANGE_PRIORITY) | DBG_FUNC_NONE,
		    proc_getpid(p), priority, p->p_memstat_effectivepriority);
		p->p_memstat_effectivepriority = priority;
	}

	/* Activity state may have changed with priority; refresh the ledger memlimit. */
	if (memorystatus_highwater_enabled) {
		const bool use_active = _memstat_proc_is_active_locked(p);
		if (_memstat_update_memlimit_locked(p, use_active)) {
			_memstat_write_memlimit_to_ledger_locked(p, use_active, false);
		}
	}

#if CONFIG_SECLUDED_MEMORY
	/* Only foreground-or-above procs may use secluded memory. */
	if (secluded_for_apps &&
	    task_could_use_secluded_mem(proc_task(p))) {
		task_set_can_use_secluded_mem(
			proc_task(p),
			(priority >= JETSAM_PRIORITY_FOREGROUND));
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	_memstat_consider_waking_jetsam_thread();
}
2701 
/*
 * Record the process' expected relaunch behavior (P_MEMSTAT_RELAUNCH_*),
 * used to weight jetsam candidate selection. Always returns 0.
 *
 * NOTE(review): the per-bucket relaunch_high_count is maintained by
 * memorystatus_add/memstat_update_priority_locked based on this field;
 * presumably this setter runs before the proc joins a bucket -- confirm,
 * otherwise the bucket counters could skew.
 */
int
memorystatus_relaunch_flags_update(proc_t p, int relaunch_flags)
{
	p->p_memstat_relaunch_flags = relaunch_flags;
	KDBG(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_RELAUNCH_FLAGS), proc_getpid(p), relaunch_flags);
	return 0;
}
2709 
#if DEVELOPMENT || DEBUG
/*
 * Debug-only sysctl (kern.memorystatus_relaunch_flags): report the calling
 * process' relaunch flags, translated from the kernel P_MEMSTAT_RELAUNCH_*
 * encoding back to the POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_* values that
 * user space passes to posix_spawn.
 */
static int sysctl_memorystatus_relaunch_flags SYSCTL_HANDLER_ARGS {
#pragma unused(oidp, arg1, arg2)
	proc_t p;
	int relaunch_flags = 0;

	p = current_proc();
	relaunch_flags = p->p_memstat_relaunch_flags;
	switch (relaunch_flags) {
	case P_MEMSTAT_RELAUNCH_LOW:
		relaunch_flags = POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_LOW;
		break;
	case P_MEMSTAT_RELAUNCH_MED:
		relaunch_flags = POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_MED;
		break;
	case P_MEMSTAT_RELAUNCH_HIGH:
		relaunch_flags = POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_HIGH;
		break;
		/* other values (e.g. 0/unset) are reported unchanged */
	}

	return SYSCTL_OUT(req, &relaunch_flags, sizeof(relaunch_flags));
}
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_relaunch_flags, CTLTYPE_INT | CTLFLAG_RD |
    CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, sysctl_memorystatus_relaunch_flags, "I", "get relaunch flags for current process");
#endif /* DEVELOPMENT || DEBUG */
2735 
2736 /*
2737  * Everything between the idle band and the application agining band
2738  * are reserved for internal use. We allow some entitled user space programs
2739  * to use this range for experimentation.
2740  */
2741 static bool
current_task_can_use_entitled_range()2742 current_task_can_use_entitled_range()
2743 {
2744 	static const char kInternalJetsamRangeEntitlement[] = "com.apple.private.internal-jetsam-range";
2745 	task_t task = current_task();
2746 	if (task == kernel_task) {
2747 		return true;
2748 	}
2749 	return IOTaskHasEntitlement(task, kInternalJetsamRangeEntitlement);
2750 }
2751 
/*
 * Set a process' requested priority band. This is the entry point used during
 * spawn and by memorystatus_control.
 *
 * Returns 0 on success; EINVAL for an out-of-range priority; EALREADY if a
 * one-shot "effective" update was already applied; EBUSY if the proc is
 * terminating or being skipped.
 */
int
memorystatus_set_priority(proc_t p, int priority, uint64_t user_data,
    memstat_priority_options_t options)
{
	int ret;

	memorystatus_log_debug("memorystatus: changing (%s) pid %d: priority %d, user_data 0x%llx\n",
	    (*p->p_name ? p->p_name : "unknown"), proc_getpid(p), priority, user_data);

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_UPDATE) | DBG_FUNC_START, proc_getpid(p), priority, user_data, options);

	if (priority == -1) {
		/* Use as shorthand for default priority */
		priority = JETSAM_PRIORITY_DEFAULT;
	} else if (priority > JETSAM_PRIORITY_IDLE && priority <= JETSAM_PRIORITY_AGING_BAND2) {
		/*
		 * Everything between idle and the aging bands are reserved for internal use.
		 * if requested, adjust to JETSAM_PRIORITY_IDLE.
		 * Entitled processes (just munch) can use a subset of this range for testing.
		 */
		if (priority > JETSAM_PRIORITY_ENTITLED_MAX ||
		    !current_task_can_use_entitled_range()) {
			priority = JETSAM_PRIORITY_IDLE;
			options |= MEMSTAT_PRIORITY_NO_AGING;
		}
	} else if (priority == JETSAM_PRIORITY_IDLE_HEAD) {
		/* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle queue */
		priority = JETSAM_PRIORITY_IDLE;
		options |= MEMSTAT_PRIORITY_INSERT_HEAD;
	} else if ((priority < 0) || (priority >= MEMSTAT_BUCKET_COUNT)) {
		/* Sanity check */
		ret = EINVAL;
		goto out;
	}

	proc_list_lock();

	/* Internal procs never take this path (no priority tracking). */
	assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));

	if ((options & MEMSTAT_PRIORITY_IS_EFFECTIVE) &&
	    (p->p_memstat_state & P_MEMSTAT_PRIORITYUPDATED)) {
		/* An "effective" update may only be applied once per proc. */
		ret = EALREADY;
		proc_list_unlock();
		memorystatus_log_error("memorystatus_update: effective change specified for pid %d, but change already occurred.\n",
		    proc_getpid(p));
		goto out;
	}

	if ((p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP)) || proc_list_exited(p)) {
		/*
		 * This could happen when a process calling posix_spawn() is exiting on the jetsam thread.
		 */
		ret = EBUSY;
		proc_list_unlock();
		goto out;
	}

	p->p_memstat_state |= P_MEMSTAT_PRIORITYUPDATED;
	p->p_memstat_userdata = user_data;

	if ((options & MEMSTAT_PRIORITY_IS_ASSERTION)) {
		if (priority != JETSAM_PRIORITY_IDLE) {
			/*
			 * Process is now being managed by assertions,
			 */
			p->p_memstat_state |= P_MEMSTAT_PRIORITY_ASSERTION;
			p->p_memstat_assertionpriority = priority;
		} else if (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION) {
			/*
			 * Assertions relinquish control when the process is heading to IDLE.
			 */
			p->p_memstat_state &= ~P_MEMSTAT_PRIORITY_ASSERTION;
		}

		/*
		 * A dirty (or non-idle-exiting) tracked proc keeps the higher
		 * of its assertion priority and its own requested priority.
		 */
		if (_memstat_proc_is_tracked(p) &&
		    (_memstat_proc_is_dirty(p) || !_memstat_proc_can_idle_exit(p))) {
			priority = MAX(p->p_memstat_assertionpriority,
			    p->p_memstat_requestedpriority);
		}
	} else {
		p->p_memstat_requestedpriority = priority;
	}

	memstat_update_priority_locked(p, priority, options);

	proc_list_unlock();
	ret = 0;

out:
	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_UPDATE) | DBG_FUNC_END, ret);

	return ret;
}
2849 
/*
 * Cache a process' active and inactive memory limits (substituting defaults
 * for non-positive values) and, if high-water monitoring is enabled, push the
 * limit matching the proc's current activity state to the task ledger.
 * Returns 0, or the error from the ledger write.
 */
static int
memstat_set_memlimits_locked(proc_t p, int32_t active_limit,
    int32_t inactive_limit, memlimit_options_t options)
{
	/*
	 * Posix_spawn'd processes and managed processes come through this path to
	 * instantiate ledger limits. Forked processes do not come through this
	 * path and will always receive the default task limit.
	 */

	int err = 0;
	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);

	int32_t default_active_limit = memorystatus_get_default_task_active_limit(p);
	/*
	 * The special value of -1 specifies that this proc wants the default
	 * memory limit
	 */
	if (active_limit <= 0) {
		active_limit = default_active_limit;
	}
	/*
	 * Work around a bug in JetsamProperties whereby processes may mistakenly receive
	 * ActiveSoftMemoryLimit := -1 by forcing the default task limit to be fatal.
	 */
	if (default_active_limit && active_limit == default_active_limit) {
		options |= MEMLIMIT_ACTIVE_FATAL;
	}

	int32_t default_inactive_limit = memorystatus_get_default_task_inactive_limit(p);
	if (inactive_limit <= 0) {
		inactive_limit = default_inactive_limit;
	}
	/* Same default-limit-is-fatal policy applies to the inactive limit. */
	if (default_inactive_limit && inactive_limit == default_inactive_limit) {
		options |= MEMLIMIT_INACTIVE_FATAL;
	}
#if DEVELOPMENT || DEBUG
	if (p->p_memlimit_increase) {
		/* Apply memlimit increase (for testing with overlay roots) */
		int32_t memlimit_increase = roundToNearestMB(p->p_memlimit_increase);
		active_limit = active_limit + memlimit_increase;
		inactive_limit = inactive_limit + memlimit_increase;
	}
#endif /* DEVELOPMENT || DEBUG */

	memorystatus_log_debug(
		"memorystatus: setting memlimit for %s [%d], "
		"Active(%dMB %s), Inactive(%dMB, %s)\n",
		proc_best_name(p), proc_getpid(p),
		active_limit, ((options & MEMLIMIT_ACTIVE_FATAL) ? "F" : "NF"),
		inactive_limit, ((options & MEMLIMIT_INACTIVE_FATAL) ? "F" : "NF"));

	/* Cache the limits and their fatality bits on the proc. */
	p->p_memstat_memlimit_active = active_limit;
	p->p_memstat_memlimit_inactive = inactive_limit;
	if (options & MEMLIMIT_INACTIVE_FATAL) {
		p->p_memstat_state |= P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL;
	} else {
		p->p_memstat_state &= ~P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL;
	}
	if (options & MEMLIMIT_ACTIVE_FATAL) {
		p->p_memstat_state |= P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL;
	} else {
		p->p_memstat_state &= ~P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL;
	}

	/*
	 * Initialize the cached limits for target process.
	 * When the target process is dirty tracked, it's typically
	 * in a clean state.  Non dirty tracked processes are
	 * typically active (Foreground or above).
	 * But just in case, we don't make assumptions...
	 */
	const bool use_active = _memstat_proc_is_active_locked(p);
	if (memorystatus_highwater_enabled &&
	    _memstat_update_memlimit_locked(p, use_active)) {
		err = _memstat_write_memlimit_to_ledger_locked(p, use_active, false);
	}

	return err;
}
2930 
2931 int
memorystatus_set_memlimits(proc_t p,int32_t active_limit,int32_t inactive_limit,memlimit_options_t options)2932 memorystatus_set_memlimits(proc_t p, int32_t active_limit,
2933     int32_t inactive_limit, memlimit_options_t options)
2934 {
2935 	int err;
2936 	proc_list_lock();
2937 	err = memstat_set_memlimits_locked(p, active_limit, inactive_limit,
2938 	    options);
2939 	proc_list_unlock();
2940 	return err;
2941 }
2942 
2943 int
memorystatus_remove(proc_t p)2944 memorystatus_remove(proc_t p)
2945 {
2946 	int ret;
2947 	memstat_bucket_t *bucket;
2948 	bool reschedule = false;
2949 
2950 	memorystatus_log_debug("memorystatus_list_remove: removing pid %d\n", proc_getpid(p));
2951 
2952 	/* Processes marked internal do not have priority tracked */
2953 	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
2954 		return 0;
2955 	}
2956 
2957 	/*
2958 	 * Check if this proc is locked (because we're performing a freeze).
2959 	 * If so, we fail and instruct the caller to try again later.
2960 	 */
2961 	if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
2962 		return EAGAIN;
2963 	}
2964 
2965 	assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));
2966 
2967 	bucket = &memstat_bucket[p->p_memstat_effectivepriority];
2968 
2969 	if ((system_procs_aging_band &&
2970 	    p->p_memstat_effectivepriority == system_procs_aging_band) ||
2971 	    (system_procs_aging_band_stuck &&
2972 	    p->p_memstat_effectivepriority == system_procs_aging_band_stuck) ||
2973 	    (applications_aging_band &&
2974 	    p->p_memstat_effectivepriority == applications_aging_band)) {
2975 		_memstat_invalidate_idle_demotion_locked(p);
2976 		reschedule = true;
2977 	}
2978 
2979 	/*
2980 	 * Record idle delta
2981 	 */
2982 
2983 	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
2984 		uint64_t now = mach_absolute_time();
2985 		if (now > p->p_memstat_idle_start) {
2986 			p->p_memstat_idle_delta = now - p->p_memstat_idle_start;
2987 		}
2988 	}
2989 
2990 	TAILQ_REMOVE(&bucket->list, p, p_memstat_list);
2991 	bucket->count--;
2992 	if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
2993 		bucket->relaunch_high_count--;
2994 	}
2995 
2996 	memorystatus_list_count--;
2997 
2998 	/* If awaiting demotion to the idle band, clean up */
2999 	if (reschedule) {
3000 		_memstat_reschedule_idle_demotion_locked();
3001 	}
3002 
3003 #if CONFIG_FREEZE
3004 	if (_memstat_proc_is_frozen(p)) {
3005 		if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) {
3006 			p->p_memstat_state &= ~P_MEMSTAT_REFREEZE_ELIGIBLE;
3007 			assert(memorystatus_refreeze_eligible_count > 0);
3008 			memorystatus_refreeze_eligible_count--;
3009 		}
3010 
3011 		assert(memorystatus_frozen_count > 0);
3012 		memorystatus_frozen_count--;
3013 		if (p->p_memstat_state & P_MEMSTAT_FROZEN_XPC_SERVICE) {
3014 			assert(memorystatus_frozen_count_xpc_service > 0);
3015 			memorystatus_frozen_count_xpc_service--;
3016 		}
3017 		if (strcmp(p->p_name, "com.apple.WebKit.WebContent") == 0) {
3018 			assert(memorystatus_frozen_count_webcontent > 0);
3019 			memorystatus_frozen_count_webcontent--;
3020 		}
3021 		memorystatus_frozen_shared_mb -= p->p_memstat_freeze_sharedanon_pages;
3022 		p->p_memstat_freeze_sharedanon_pages = 0;
3023 	}
3024 #endif /* CONFIG_FREEZE */
3025 
3026 	_memstat_proc_set_resumed(p);
3027 
3028 #if DEVELOPMENT || DEBUG
3029 	if (proc_getpid(p) == memorystatus_testing_pid) {
3030 		memorystatus_testing_pid = 0;
3031 	}
3032 #endif /* DEVELOPMENT || DEBUG */
3033 
3034 	if (p) {
3035 		ret = 0;
3036 	} else {
3037 		ret = ESRCH;
3038 	}
3039 
3040 	return ret;
3041 }
3042 
3043 /*
3044  * Validate dirty tracking flags with process state.
3045  *
3046  * Return:
3047  *	0     on success
3048  *      non-0 on failure
3049  *
3050  * The proc_list_lock is held by the caller.
3051  */
3052 
3053 static int
memorystatus_validate_track_flags(struct proc * target_p,uint32_t pcontrol)3054 memorystatus_validate_track_flags(struct proc *target_p, uint32_t pcontrol)
3055 {
3056 	/* See that the process isn't marked for termination */
3057 	if (target_p->p_memstat_dirty & P_DIRTY_TERMINATED) {
3058 		return EBUSY;
3059 	}
3060 
3061 	/* Idle exit requires that process be tracked */
3062 	if ((pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) &&
3063 	    !(pcontrol & PROC_DIRTY_TRACK)) {
3064 		return EINVAL;
3065 	}
3066 
3067 	/* 'Launch in progress' tracking requires that process have enabled dirty tracking too. */
3068 	if ((pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) &&
3069 	    !(pcontrol & PROC_DIRTY_TRACK)) {
3070 		return EINVAL;
3071 	}
3072 
3073 	/* Only one type of DEFER behavior is allowed.*/
3074 	if ((pcontrol & PROC_DIRTY_DEFER) &&
3075 	    (pcontrol & PROC_DIRTY_DEFER_ALWAYS)) {
3076 		return EINVAL;
3077 	}
3078 
3079 	/* Deferral is only relevant if idle exit is specified */
3080 	if (((pcontrol & PROC_DIRTY_DEFER) ||
3081 	    (pcontrol & PROC_DIRTY_DEFER_ALWAYS)) &&
3082 	    !(pcontrol & PROC_DIRTY_ALLOWS_IDLE_EXIT)) {
3083 		return EINVAL;
3084 	}
3085 
3086 	return 0;
3087 }
3088 
/*
 * Processes can opt to have their state tracked by the kernel, indicating  when they are busy (dirty) or idle
 * (clean). They may also indicate that they support termination when idle, with the result that they are promoted
 * to their desired, higher, jetsam priority when dirty (and are therefore killed later), and demoted to the low
 * priority idle band when clean (and killed earlier, protecting higher priority procesess).
 *
 * If the deferral flag is set, then newly tracked processes will be protected for an initial period (as determined by
 * memorystatus_sysprocs_idle_delay_time); if they go clean during this time, then they will be moved to a deferred-idle band
 * with a slightly higher priority, guarding against immediate termination under memory pressure and being unable to
 * make forward progress. Finally, when the guard expires, they will be moved to the standard, lowest-priority, idle
 * band. The deferral can be cleared early by clearing the appropriate flag.
 *
 * The deferral timer is active only for the duration that the process is marked as guarded and clean; if the process
 * is marked dirty, the timer will be cancelled. Upon being subsequently marked clean, the deferment will either be
 * re-enabled or the guard state cleared, depending on whether the guard deadline has passed.
 */

/*
 * Enroll process p in dirty (activity) tracking according to pcontrol
 * (PROC_DIRTY_TRACK / _ALLOW_IDLE_EXIT / _LAUNCH_IN_PROGRESS / _DEFER /
 * _DEFER_ALWAYS), then recompute its jetsam priority. Returns 0 on success,
 * or EBUSY (exiting), EPERM (internal proc), or a validation errno.
 */
int
memorystatus_dirty_track(proc_t p, uint32_t pcontrol)
{
	unsigned int old_dirty;
	boolean_t defer_now = FALSE;
	int ret = 0;
	int priority;
	memstat_priority_options_t priority_options =
	    MEMSTAT_PRIORITY_OPTIONS_NONE;

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_DIRTY_TRACK),
	    proc_getpid(p), p->p_memstat_dirty, pcontrol);

	proc_list_lock();

	if (proc_list_exited(p)) {
		/*
		 * Process is on its way out.
		 */
		ret = EBUSY;
		goto exit;
	}

	/* Internal processes are not eligible for dirty tracking. */
	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
		ret = EPERM;
		goto exit;
	}

	if ((ret = memorystatus_validate_track_flags(p, pcontrol)) != 0) {
		/* error  */
		goto exit;
	}

	old_dirty = p->p_memstat_dirty;

	/* These bits are cumulative, as per <rdar://problem/11159924> */
	if (pcontrol & PROC_DIRTY_TRACK) {
		/* Request to turn ON Dirty tracking... */
		if (p->p_memstat_state & P_MEMSTAT_MANAGED) {
			/* on a process managed by RunningBoard or its equivalent...*/
			if (!_memstat_proc_cached_memlimit_is_fatal(p)) {
				/* but this might be an app because there's no fatal limits
				 * NB: This _big_ assumption is not universal. What we really
				 * need is a way to say this is an _APP_ and we can't have dirty
				 * tracking turned ON for it. Lacking that functionality we clump
				 * together some checks and try to do the best detection we can.
				 * Reason we can't allow addition of these flags is because, per the
				 * kernel checks, they change the role of a process from app to daemon. And the
				 * AGING_IN_PROGRESS bits might still be set i.e. it needs to be demoted
				 * correctly from the right aging band (app or sysproc). We can't simply try
				 * to invalidate the demotion here because, owing to assertion priorities, we
				 * might not be in the aging bands.
				 */
				memorystatus_log(
					"memorystatus: Denying dirty-tracking opt-in for managed %s [%d]\n",
					proc_best_name(p), proc_getpid(p));
				/* fail silently to avoid an XPC assertion... */
				ret = 0;
				goto exit;
			}
		}

		p->p_memstat_dirty |= P_DIRTY_TRACK;
	}

	if (pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) {
		p->p_memstat_dirty |= P_DIRTY_ALLOW_IDLE_EXIT;
	}

	if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
		p->p_memstat_dirty |= P_DIRTY_LAUNCH_IN_PROGRESS;
	}

	/*
	 * NB: All processes are now automatically enrolled in idle aging
	 * regardless of whether they request to be deferred.
	 */
	if (pcontrol & (PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) {
		/* Each DEFER variant is latched only on its first request. */
		if ((pcontrol & (PROC_DIRTY_DEFER)) &&
		    !(old_dirty & P_DIRTY_DEFER)) {
			p->p_memstat_dirty |= P_DIRTY_DEFER;
		}

		if ((pcontrol & (PROC_DIRTY_DEFER_ALWAYS)) &&
		    !(old_dirty & P_DIRTY_DEFER_ALWAYS)) {
			p->p_memstat_dirty |= P_DIRTY_DEFER_ALWAYS;
		}

		defer_now = TRUE;
	}

	memorystatus_log_info(
		"%s [%d] enrolled in ActivityTracking tracked %d / idle-exit %d / defer %d / dirty %d",
		proc_best_name(p), proc_getpid(p),
		_memstat_proc_is_tracked(p), _memstat_proc_can_idle_exit(p), defer_now,
		_memstat_proc_is_dirty(p));

	/* A clean, idle-exit-capable tracked proc belongs in the idle band. */
	if (!_memstat_proc_is_dirty(p) && _memstat_proc_is_tracked(p) &&
	    _memstat_proc_can_idle_exit(p)) {
		priority = JETSAM_PRIORITY_IDLE;
		if (!defer_now && _memstat_proc_is_aging(p)) {
			/*
			 * Historically, some processes have tried to use this to opt out
			 * of the 'aging' facility.
			 */
			priority_options |= MEMSTAT_PRIORITY_NO_AGING;
		}
	} else {
		priority = p->p_memstat_requestedpriority;
	}

	/* An active priority assertion always wins a priority raise. */
	if (_memstat_proc_has_priority_assertion(p)) {
		priority = MAX(priority, p->p_memstat_assertionpriority);
	}

	memstat_update_priority_locked(p, priority, priority_options);

exit:
	proc_list_unlock();

	return ret;
}
3228 
3229 int
memorystatus_dirty_set(proc_t p,boolean_t self,uint32_t pcontrol)3230 memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol)
3231 {
3232 	int ret = 0;
3233 	bool kill = false;
3234 	bool was_dirty;
3235 	bool now_dirty = false;
3236 	int priority;
3237 	task_t t = proc_task(p);
3238 
3239 	memorystatus_log_debug("memorystatus_dirty_set(): %d %d 0x%x 0x%x\n", self, proc_getpid(p), pcontrol, p->p_memstat_dirty);
3240 	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_DIRTY_SET), proc_getpid(p), self, pcontrol);
3241 
3242 	proc_list_lock();
3243 
3244 	if (proc_list_exited(p)) {
3245 		/*
3246 		 * Process is on its way out.
3247 		 */
3248 		ret = EBUSY;
3249 		goto exit;
3250 	}
3251 
3252 	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
3253 		ret = EPERM;
3254 		goto exit;
3255 	}
3256 
3257 	was_dirty = _memstat_proc_is_dirty(p);
3258 
3259 	if (!_memstat_proc_is_tracked(p)) {
3260 		/* Dirty tracking not enabled */
3261 		ret = EINVAL;
3262 		goto exit;
3263 	} else if (pcontrol && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
3264 		/*
3265 		 * Process is set to be terminated and we're attempting to mark it dirty.
3266 		 * Set for termination and marking as clean is OK - see <rdar://problem/10594349>.
3267 		 */
3268 		ret = EBUSY;
3269 		goto exit;
3270 	}
3271 
3272 	int flag = (self == TRUE) ? P_DIRTY : P_DIRTY_SHUTDOWN;
3273 	if (pcontrol && !(p->p_memstat_dirty & flag)) {
3274 		/* Mark the process as having been dirtied at some point */
3275 		p->p_memstat_dirty |= (flag | P_DIRTY_MARKED);
3276 	} else if ((pcontrol == 0) && (p->p_memstat_dirty & flag)) {
3277 		if ((flag == P_DIRTY_SHUTDOWN) && (!(p->p_memstat_dirty & P_DIRTY))) {
3278 			/* Clearing the dirty shutdown flag, and the process is otherwise clean - kill */
3279 			p->p_memstat_dirty |= P_DIRTY_TERMINATED;
3280 			kill = true;
3281 		} else if ((flag == P_DIRTY) && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
3282 			/* Kill previously terminated processes if set clean */
3283 			kill = true;
3284 		}
3285 		p->p_memstat_dirty &= ~flag;
3286 	} else {
3287 		/* Already set */
3288 		ret = EALREADY;
3289 		goto exit;
3290 	}
3291 
3292 	now_dirty = _memstat_proc_is_dirty(p);
3293 
3294 	if (was_dirty && !now_dirty) {
3295 		if (_memstat_proc_can_idle_exit(p)) {
3296 			/*
3297 			 * Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band OR it might be heading back
3298 			 * there once it's clean again. For the legacy case, this only applies if it has some protection window left.
3299 			 * P_DIRTY_DEFER: one-time protection window given at launch
3300 			 * P_DIRTY_DEFER_ALWAYS: protection window given for every dirty->clean transition. Like non-legacy mode.
3301 			 *
3302 			 * Non-Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band. It will always stop over
3303 			 * in that band on it's way to IDLE.
3304 			 */
3305 			assert(!_memstat_proc_is_aging(p));
3306 			priority = JETSAM_PRIORITY_IDLE;
3307 		} else {
3308 			priority = p->p_memstat_requestedpriority;
3309 		}
3310 		task_ledger_settle_dirty_time(t);
3311 		task_set_dirty_start(t, 0);
3312 	} else if (!was_dirty && now_dirty) {
3313 		priority = p->p_memstat_requestedpriority;
3314 		task_set_dirty_start(t, mach_absolute_time());
3315 	}
3316 
3317 	if (_memstat_proc_has_priority_assertion(p)) {
3318 		priority = MAX(priority, p->p_memstat_assertionpriority);
3319 	}
3320 
3321 	memstat_update_priority_locked(p, priority, MEMSTAT_PRIORITY_OPTIONS_NONE);
3322 
3323 	if (kill) {
3324 		if (proc_ref(p, true) == p) {
3325 			proc_list_unlock();
3326 			psignal(p, SIGKILL);
3327 			proc_list_lock();
3328 			proc_rele(p);
3329 		}
3330 	}
3331 
3332 exit:
3333 	proc_list_unlock();
3334 
3335 	return ret;
3336 }
3337 
3338 int
memorystatus_dirty_clear(proc_t p,uint32_t pcontrol)3339 memorystatus_dirty_clear(proc_t p, uint32_t pcontrol)
3340 {
3341 	int ret = 0;
3342 
3343 	memorystatus_log_debug("memorystatus_dirty_clear(): %d 0x%x 0x%x\n", proc_getpid(p), pcontrol, p->p_memstat_dirty);
3344 	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_DIRTY_CLEAR), proc_getpid(p), pcontrol);
3345 
3346 	proc_list_lock();
3347 
3348 	if (proc_list_exited(p)) {
3349 		/*
3350 		 * Process is on its way out.
3351 		 */
3352 		ret = EBUSY;
3353 		goto exit;
3354 	}
3355 
3356 	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
3357 		ret = EPERM;
3358 		goto exit;
3359 	}
3360 
3361 	if (!_memstat_proc_is_tracked(p)) {
3362 		/* Dirty tracking not enabled */
3363 		ret = EINVAL;
3364 		goto exit;
3365 	}
3366 
3367 	if (!pcontrol || (pcontrol & (PROC_DIRTY_LAUNCH_IN_PROGRESS | PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) == 0) {
3368 		ret = EINVAL;
3369 		goto exit;
3370 	}
3371 
3372 	if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
3373 		p->p_memstat_dirty &= ~P_DIRTY_LAUNCH_IN_PROGRESS;
3374 	}
3375 
3376 	/* This can be set and cleared exactly once. */
3377 	if (pcontrol & (PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) {
3378 		if (p->p_memstat_dirty & P_DIRTY_DEFER) {
3379 			p->p_memstat_dirty &= ~(P_DIRTY_DEFER);
3380 		}
3381 
3382 		if (p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) {
3383 			p->p_memstat_dirty &= ~(P_DIRTY_DEFER_ALWAYS);
3384 		}
3385 
3386 		if (_memstat_proc_is_aging(p)) {
3387 			memstat_update_priority_locked(p, JETSAM_PRIORITY_IDLE,
3388 			    MEMSTAT_PRIORITY_NO_AGING);
3389 		}
3390 	}
3391 
3392 	ret = 0;
3393 exit:
3394 	proc_list_unlock();
3395 
3396 	return ret;
3397 }
3398 
3399 int
memorystatus_dirty_get(proc_t p,boolean_t locked)3400 memorystatus_dirty_get(proc_t p, boolean_t locked)
3401 {
3402 	int ret = 0;
3403 
3404 	if (!locked) {
3405 		proc_list_lock();
3406 	}
3407 
3408 	if (_memstat_proc_is_tracked(p)) {
3409 		ret |= PROC_DIRTY_TRACKED;
3410 		if (_memstat_proc_can_idle_exit(p)) {
3411 			ret |= PROC_DIRTY_ALLOWS_IDLE_EXIT;
3412 		}
3413 		if (p->p_memstat_dirty & P_DIRTY) {
3414 			ret |= PROC_DIRTY_IS_DIRTY;
3415 		}
3416 		if (p->p_memstat_dirty & P_DIRTY_LAUNCH_IN_PROGRESS) {
3417 			ret |= PROC_DIRTY_LAUNCH_IS_IN_PROGRESS;
3418 		}
3419 	}
3420 
3421 	if (!locked) {
3422 		proc_list_unlock();
3423 	}
3424 
3425 	return ret;
3426 }
3427 
3428 int
memorystatus_on_terminate(proc_t p)3429 memorystatus_on_terminate(proc_t p)
3430 {
3431 	int sig;
3432 
3433 	proc_list_lock();
3434 
3435 	p->p_memstat_dirty |= P_DIRTY_TERMINATED;
3436 
3437 	if ((_memstat_proc_is_tracked(p) && !_memstat_proc_is_dirty(p)) ||
3438 	    (_memstat_proc_is_suspended(p))) {
3439 		/*
3440 		 * Mark as terminated and issue SIGKILL if:-
3441 		 * - process is clean, or,
3442 		 * - if process is dirty but suspended. This case is likely
3443 		 * an extension because apps don't opt into dirty-tracking
3444 		 * and daemons aren't suspended.
3445 		 */
3446 #if DEVELOPMENT || DEBUG
3447 		if (_memstat_proc_is_suspended(p)) {
3448 			memorystatus_log(
3449 				"memorystatus: sending suspended process %s (pid %d) SIGKILL\n",
3450 				(*p->p_name ? p->p_name : "unknown"), proc_getpid(p));
3451 		}
3452 #endif /* DEVELOPMENT || DEBUG */
3453 		sig = SIGKILL;
3454 	} else {
3455 		/* Dirty, terminated, or state tracking is unsupported; issue SIGTERM to allow cleanup */
3456 		sig = SIGTERM;
3457 	}
3458 
3459 	proc_list_unlock();
3460 
3461 	return sig;
3462 }
3463 
/*
 * Called when a process is suspended. Records the suspended state and, if the
 * process was already marked for termination, delivers SIGKILL now.
 */
void
memorystatus_on_suspend(proc_t p)
{
#if CONFIG_FREEZE
	uint32_t pages;
	/*
	 * NOTE(review): `pages` is written but never read in this function;
	 * presumably the call is kept for its internal accounting side
	 * effects — confirm before removing.
	 */
	memorystatus_get_task_page_counts(proc_task(p), &pages, NULL, NULL);
#endif
	proc_list_lock();

	_memstat_proc_set_suspended(p);

	/* Check if proc is marked for termination */
	bool kill_process = !!(p->p_memstat_dirty & P_DIRTY_TERMINATED);
	proc_list_unlock();

	/* Deliver the signal outside the proc_list lock. */
	if (kill_process) {
		psignal(p, SIGKILL);
	}

#if CONFIG_DEFERRED_RECLAIM
	/* Kick off asynchronous reclamation of the task's deferred-free memory. */
	vm_deferred_reclamation_reclaim_from_task_async(proc_task(p));
#endif /* CONFIG_DEFERRED_RECLAIM */
}
3487 
3488 extern uint64_t memorystatus_thaw_count_since_boot;
3489 
/*
 * Called when a process is resumed from suspension. Updates freezer
 * accounting (thaw counts, refreeze eligibility) for frozen processes,
 * clears the suspended state, and posts a freeze note for thawed processes.
 */
void
memorystatus_on_resume(proc_t p)
{
#if CONFIG_FREEZE
	pid_t pid;
#endif

	proc_list_lock();

#if CONFIG_FREEZE
	const bool frozen = _memstat_proc_is_frozen(p);
	if (frozen) {
		/*
		 * Now that we don't _thaw_ a process completely,
		 * resuming it (and having some on-demand swapins)
		 * shouldn't preclude it from being counted as frozen.
		 *
		 * memorystatus_frozen_count--;
		 *
		 * We preserve the P_MEMSTAT_FROZEN state since the process
		 * could have state on disk AND so will deserve some protection
		 * in the jetsam bands.
		 */
		if ((p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) == 0) {
			p->p_memstat_state |= P_MEMSTAT_REFREEZE_ELIGIBLE;
			memorystatus_refreeze_eligible_count++;
		}
		/* Count at most one thaw per process per freezer interval. */
		if (p->p_memstat_thaw_count == 0 || p->p_memstat_last_thaw_interval < memorystatus_freeze_current_interval) {
			os_atomic_inc(&(memorystatus_freezer_stats.mfs_processes_thawed), relaxed);
			if (strcmp(p->p_name, "com.apple.WebKit.WebContent") == 0) {
				os_atomic_inc(&(memorystatus_freezer_stats.mfs_processes_thawed_webcontent), relaxed);
			}
		}
		p->p_memstat_last_thaw_interval = memorystatus_freeze_current_interval;
		p->p_memstat_thaw_count++;

		/* Record the most recent thaw for diagnostics. */
		memorystatus_freeze_last_pid_thawed = p->p_pid;
		memorystatus_freeze_last_pid_thawed_ts = mach_absolute_time();

		memorystatus_thaw_count++;
		memorystatus_thaw_count_since_boot++;
	}

	/* Capture the pid under the lock for the notification below. */
	pid = proc_getpid(p);
#endif

	/*
	 * P_MEMSTAT_FROZEN will remain unchanged. This used to be:
	 * p->p_memstat_state &= ~(P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN);
	 */
	_memstat_proc_set_resumed(p);

	proc_list_unlock();

#if CONFIG_FREEZE
	/* Notify interested clients that a frozen process was thawed. */
	if (frozen) {
		memorystatus_freeze_entry_t data = { pid, FALSE, 0 };
		memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
	}
#endif
}
3551 
/*
 * Called when a process becomes inactive. On freezer-enabled configurations
 * this nudges the freezer thread so it can consider new freeze candidates.
 */
void
memorystatus_on_inactivity(proc_t p)
{
#pragma unused(p)
#if CONFIG_FREEZE
	/* Wake the freeze thread */
	thread_wakeup((event_t)&memorystatus_freeze_wakeup);
#endif
}
3561 
3562 /*
3563  * The proc_list_lock is held by the caller.
3564  */
3565 static uint32_t
memorystatus_build_state(proc_t p)3566 memorystatus_build_state(proc_t p)
3567 {
3568 	uint32_t snapshot_state = 0;
3569 
3570 	/* General */
3571 	if (_memstat_proc_is_suspended(p)) {
3572 		snapshot_state |= kMemorystatusSuspended;
3573 	}
3574 	if (_memstat_proc_is_frozen(p)) {
3575 		snapshot_state |= kMemorystatusFrozen;
3576 	}
3577 	if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) {
3578 		snapshot_state |= kMemorystatusWasThawed;
3579 	}
3580 	if (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION) {
3581 		snapshot_state |= kMemorystatusAssertion;
3582 	}
3583 
3584 	/* Tracking */
3585 	if (_memstat_proc_is_tracked(p)) {
3586 		snapshot_state |= kMemorystatusTracked;
3587 	}
3588 	if (_memstat_proc_can_idle_exit(p)) {
3589 		snapshot_state |= kMemorystatusSupportsIdleExit;
3590 	}
3591 	if (_memstat_proc_is_dirty(p)) {
3592 		snapshot_state |= kMemorystatusDirty;
3593 	}
3594 
3595 	return snapshot_state;
3596 }
3597 
/*
 * Find and kill the first idle-band process that has opted into idle-exit,
 * is currently clean, not already terminated, and whose idle deadline has
 * passed. Returns TRUE if a process was killed. The victim is marked
 * terminated and referenced under the proc_list lock; the kill itself
 * happens after the lock is dropped.
 */
static boolean_t
kill_idle_exit_proc(void)
{
	proc_t p, victim_p = PROC_NULL;
	uint64_t current_time, footprint_of_killed_proc;
	boolean_t killed = FALSE;
	unsigned int i = 0;
	os_reason_t jetsam_reason = OS_REASON_NULL;

	/* Pick next idle exit victim. */
	current_time = mach_absolute_time();

	jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_IDLE_EXIT);
	if (jetsam_reason == OS_REASON_NULL) {
		memorystatus_log_error("kill_idle_exit_proc: failed to allocate jetsam reason\n");
	}

	proc_list_lock();

	p = memorystatus_get_first_proc_locked(&i, FALSE);
	while (p) {
		/* No need to look beyond the idle band */
		if (p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) {
			break;
		}

		/* Candidate: allows idle-exit, is clean, and not yet terminated. */
		if ((p->p_memstat_dirty & (P_DIRTY_ALLOW_IDLE_EXIT | P_DIRTY_IS_DIRTY | P_DIRTY_TERMINATED)) == (P_DIRTY_ALLOW_IDLE_EXIT)) {
			if (current_time >= p->p_memstat_idledeadline) {
				/* Mark terminated under the lock so no one else picks it. */
				p->p_memstat_dirty |= P_DIRTY_TERMINATED;
				victim_p = proc_ref(p, true);
				break;
			}
		}

		p = memorystatus_get_next_proc_locked(&i, p, FALSE);
	}

	proc_list_unlock();

	if (victim_p) {
		memorystatus_log(
			"memorystatus: killing_idle_process pid %d [%s] jetsam_reason->osr_code: %llu\n",
			proc_getpid(victim_p), (*victim_p->p_name ? victim_p->p_name : "unknown"), jetsam_reason->osr_code);
		/* memorystatus_do_kill() consumes the jetsam_reason reference. */
		killed = memorystatus_do_kill(victim_p, kMemorystatusKilledIdleExit, jetsam_reason, &footprint_of_killed_proc);
		proc_rele(victim_p);
	} else {
		/* No victim found: release the unused reason. */
		os_reason_free(jetsam_reason);
	}

	return killed;
}
3649 
3650 /*
3651  * Consider waking the jetsam thread. Returns true if the thread was awoken.
3652  */
3653 static bool
_memstat_consider_waking_jetsam_thread(void)3654 _memstat_consider_waking_jetsam_thread(void)
3655 {
3656 #if CONFIG_JETSAM
3657 	if (memstat_evaluate_page_shortage(NULL, NULL, NULL)) {
3658 		memorystatus_thread_wake();
3659 		return true;
3660 	}
3661 #endif /* CONFIG_JETSAM */
3662 	return false;
3663 }
3664 
3665 void
memorystatus_thread_wake()3666 memorystatus_thread_wake()
3667 {
3668 	int thr_id = 0;
3669 	int active_thr = atomic_load(&active_jetsam_threads);
3670 
3671 	/* Wakeup all the jetsam threads */
3672 	for (thr_id = 0; thr_id < active_thr; thr_id++) {
3673 		jetsam_state_t jetsam_thread = &jetsam_threads[thr_id];
3674 		sched_cond_signal(&(jetsam_thread->jt_wakeup_cond), jetsam_thread->thread);
3675 	}
3676 }
3677 
3678 #if CONFIG_JETSAM
/*
 * Grow the active jetsam thread pool to its configured maximum so all
 * worker threads participate in kills.
 */
static void
memorystatus_thread_pool_max()
{
	/* Increase the jetsam thread pool to max_jetsam_threads */
	int max_threads = max_jetsam_threads;
	memorystatus_log_info("Expanding memorystatus pool to %d\n", max_threads);
	os_atomic_store(&active_jetsam_threads, max_threads, relaxed);
}
3687 
/*
 * Shrink the active jetsam thread pool back to a single worker thread.
 */
static void
memorystatus_thread_pool_default()
{
	/* Restore the jetsam thread pool to a single thread */
	memorystatus_log_info("Reverting memorystatus pool back to 1\n");
	os_atomic_store(&active_jetsam_threads, 1, relaxed);
}
3695 #endif /* CONFIG_JETSAM */
3696 
3697 /*
3698  * An offset applied to non-critical page shortage thresholds.
3699  */
3700 static uint32_t
_memstat_page_shortage_offset(void)3701 _memstat_page_shortage_offset(void)
3702 {
3703 	uint32_t offset = 0;
3704 	if (os_atomic_load(&memstat_policy_config, acquire) & kPolicyClearTheDecks) {
3705 		offset += memstat_ctd_offset;
3706 	}
3707 	if (os_atomic_load(&memstat_policy_config, acquire) & kPolicyBallastDrain) {
3708 		offset += memstat_ballast_offset;
3709 	}
3710 	return offset;
3711 }
3712 
/*
 * Page threshold below which critical jetsam kills begin. Unlike the idle
 * and soft thresholds, this value is never raised by policy offsets.
 */
uint32_t
memorystatus_get_critical_page_shortage_threshold(void)
{
	return memstat_critical_threshold;
}
3718 
3719 uint32_t
memorystatus_get_idle_exit_page_shortage_threshold(void)3720 memorystatus_get_idle_exit_page_shortage_threshold(void)
3721 {
3722 	uint32_t offset = _memstat_page_shortage_offset();
3723 	return memstat_idle_threshold + offset;
3724 }
3725 
3726 uint32_t
memorystatus_get_soft_memlimit_page_shortage_threshold(void)3727 memorystatus_get_soft_memlimit_page_shortage_threshold(void)
3728 {
3729 	uint32_t offset = _memstat_page_shortage_offset();
3730 	return memstat_soft_threshold + offset;
3731 }
3732 
/*
 * Evaluate whether the free-page count has fallen below any memorystatus
 * action threshold.
 *
 * Each out-parameter may be NULL; when non-NULL it is first cleared and then
 * set if the corresponding threshold has been crossed:
 *   should_enforce_memlimits - below the soft-memlimit threshold
 *   should_idle_exit         - below the idle-exit threshold
 *   should_jetsam            - below the critical threshold
 *
 * Returns true only when a crossed threshold can actually be acted upon
 * (HWM violators exist, idle processes exist, or the shortage is critical),
 * so it doubles as a "should the jetsam thread wake" predicate. Always
 * false when CONFIG_JETSAM is not configured.
 */
bool
memstat_evaluate_page_shortage(
	bool *should_enforce_memlimits,
	bool *should_idle_exit,
	bool *should_jetsam)
{
	bool requires_action = false;
	/* Default all requested outputs to "no action". */
	if (should_enforce_memlimits) {
		*should_enforce_memlimits = false;
	}
	if (should_idle_exit) {
		*should_idle_exit = false;
	}
	if (should_jetsam) {
		*should_jetsam = false;
	}
#if CONFIG_JETSAM
	uint32_t available_page_count = os_atomic_load(&memorystatus_available_pages, relaxed);
#if VM_PRESSURE_EVENTS
	if (available_page_count <
	    memorystatus_get_soft_memlimit_page_shortage_threshold()) {
		/*
		 * Only wake the jetsam thread if there are hwm violators to
		 * kill
		 */
		bool hwm_candidates = os_atomic_load(&memorystatus_hwm_candidates, acquire);
		requires_action = requires_action || hwm_candidates;
		if (should_enforce_memlimits) {
			*should_enforce_memlimits = true;
		}
	}
#endif /* VM_PRESSURE_EVENTS */
	if (available_page_count < memorystatus_get_idle_exit_page_shortage_threshold()) {
		/*
		 * Only wake the jetsam thread if there are idle processes that
		 * could exit.
		 */
		uint32_t idle_proc_count = os_atomic_load(
			&memstat_bucket[JETSAM_PRIORITY_IDLE].count, relaxed);
		requires_action = requires_action || (idle_proc_count > 0);
		if (should_idle_exit) {
			*should_idle_exit = true;
		}
	}
	if (available_page_count < memorystatus_get_critical_page_shortage_threshold()) {
		/* Critical shortage always requires action. */
		if (should_jetsam) {
			*should_jetsam = true;
		}
		requires_action = true;
	}
#endif /* CONFIG_JETSAM */
	return requires_action;
}
3786 
3787 #if CONFIG_JETSAM
3788 static uint64_t
memorystatus_swap_trigger_pages(void)3789 memorystatus_swap_trigger_pages(void)
3790 {
3791 	/*
3792 	 * The swapout trigger varies based on the current memorystatus_level.
3793 	 * When available memory is somewhat high (at memorystatus_available_pages_pressure)
3794 	 * we keep more swappable compressor segments in memory.
3795 	 * However, as available memory drops to our idle and eventually critical kill
3796 	 * thresholds we start swapping more aggressively.
3797 	 */
3798 	static uint32_t available_pages_factor[] = {0, 1, 1, 1, 2, 2, 3, 5, 7, 8, 10, 13, 15, 17, 20};
3799 	size_t index = MIN(memorystatus_level, sizeof(available_pages_factor) / sizeof(uint32_t) - 1);
3800 	return available_pages_factor[index] * memorystatus_available_pages / 10;
3801 }
3802 
/*
 * Sysctl handler: report the current dynamic swapout trigger, in pages.
 */
static int
sysctl_memorystatus_swap_trigger_pages SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	uint64_t trigger_pages = memorystatus_swap_trigger_pages();
	return SYSCTL_OUT(req, &trigger_pages, sizeof(trigger_pages));
}
3810 
/* Read-only sysctl exposing the dynamic swapout trigger (in pages). */
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_swap_trigger_pages, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, &sysctl_memorystatus_swap_trigger_pages, "I", "");
3813 
3814 /*
3815  * Check if the number of full swappable csegments is over the trigger
3816  * threshold to start swapping.
3817  * The adjustment_factor is applied to the trigger to raise or lower
3818  * it. For example an adjustement factor of 110 will raise the threshold by 10%.
3819  */
3820 bool
memorystatus_swap_over_trigger(uint64_t adjustment_factor)3821 memorystatus_swap_over_trigger(uint64_t adjustment_factor)
3822 {
3823 	if (!memorystatus_swap_all_apps) {
3824 		return false;
3825 	}
3826 	uint64_t trigger_pages = memorystatus_swap_trigger_pages();
3827 	trigger_pages = trigger_pages * adjustment_factor / 100;
3828 	return atop_64(c_late_swapout_count * c_seg_allocsize) > trigger_pages;
3829 }
3830 
3831 /*
3832  * Check if the number of segments on the early swapin queue
3833  * is over the trigger to start compacting it.
3834  */
/*
 * Return true when the pages held in late-swappedin compressor segments
 * exceed the configured swapin compaction trigger.
 */
bool
memorystatus_swapin_over_trigger(void)
{
	return atop_64(c_late_swappedin_count * c_seg_allocsize) > memorystatus_swapin_trigger_pages;
}
3840 #endif /* CONFIG_JETSAM */
3841 
3842 #if DEVELOPMENT || DEBUG
3843 SYSCTL_UINT(_vm, OID_AUTO, c_late_swapout_count, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, &c_late_swapout_count, 0, "");
3844 SYSCTL_UINT(_vm, OID_AUTO, c_seg_allocsize, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_allocsize, 0, "");
3845 #if CONFIG_FREEZE
3846 extern int32_t c_segment_pages_compressed_incore_late_swapout;
3847 SYSCTL_INT(_vm, OID_AUTO, c_segment_pages_compressed_incore_late_swapout, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_pages_compressed_incore_late_swapout, 0, "");
3848 #endif /* CONFIG_FREEZE */
3849 #endif /* DEVELOPMENT || DEBUG */
3850 
3851 static boolean_t
memorystatus_should_post_snapshot(int32_t priority,uint32_t cause)3852 memorystatus_should_post_snapshot(int32_t priority, uint32_t cause)
3853 {
3854 	boolean_t is_idle_priority;
3855 
3856 	is_idle_priority = (priority == JETSAM_PRIORITY_IDLE || priority == JETSAM_PRIORITY_IDLE_DEFERRED);
3857 #if CONFIG_JETSAM
3858 #pragma unused(cause)
3859 	/*
3860 	 * Don't generate logs for steady-state idle-exit kills,
3861 	 * unless it is overridden for debug or by the device
3862 	 * tree.
3863 	 */
3864 
3865 	return !is_idle_priority || memorystatus_idle_snapshot;
3866 
3867 #else /* CONFIG_JETSAM */
3868 	/*
3869 	 * Don't generate logs for steady-state idle-exit kills,
3870 	 * unless
3871 	 * - it is overridden for debug or by the device
3872 	 * tree.
3873 	 * OR
3874 	 * - the kill causes are important i.e. not kMemorystatusKilledIdleExit
3875 	 */
3876 
3877 	boolean_t snapshot_eligible_kill_cause = (is_reason_thrashing(cause) || is_reason_zone_map_exhaustion(cause));
3878 	return !is_idle_priority || memorystatus_idle_snapshot || snapshot_eligible_kill_cause;
3879 #endif /* CONFIG_JETSAM */
3880 }
3881 
3882 
3883 static boolean_t
memorystatus_act_on_hiwat_processes(uint32_t * errors,uint32_t * hwm_kill,bool * post_snapshot,uint64_t * memory_reclaimed)3884 memorystatus_act_on_hiwat_processes(uint32_t *errors, uint32_t *hwm_kill, bool *post_snapshot, uint64_t *memory_reclaimed)
3885 {
3886 	boolean_t purged = FALSE, killed = FALSE;
3887 
3888 	*memory_reclaimed = 0;
3889 	killed = memorystatus_kill_hiwat_proc(errors, &purged, memory_reclaimed);
3890 
3891 	if (killed) {
3892 		*hwm_kill = *hwm_kill + 1;
3893 		*post_snapshot = TRUE;
3894 		return TRUE;
3895 	} else {
3896 		if (purged == FALSE) {
3897 			/* couldn't purge and couldn't kill */
3898 			os_atomic_store(&memorystatus_hwm_candidates, false, relaxed);
3899 		}
3900 	}
3901 
3902 	return killed;
3903 }
3904 
3905 /*
3906  * Purge kernel memory caches
3907  */
static void
memstat_purge_caches(jetsam_state_t state)
{
	memorystatus_log("memorystatus: purging kernel memory caches\n");

	/* Reclaim page-table pages that the pmap layer can free cheaply. */
	uint64_t pmap_released = pmap_release_pages_fast();
	memorystatus_log("memorystatus: recovered %llu pages from pmap\n",
	    pmap_released);

	/*
	 * Only purge corpses once per jetsam event. No new corpses can be created
	 * after the initial purge (block_corpses)
	 */
	if (!state->corpse_list_purged) {
		memorystatus_log("memorystatus: purging all corpses\n");
		/* Block new corpse creation for the remainder of this event. */
		os_atomic_inc(&block_corpses, relaxed);
		assert(block_corpses > 0);
		if (total_corpses_count() > 0) {
			task_purge_all_corpses();
		} else {
			memorystatus_log("memorystatus: no corpses to purge\n");
		}
		state->corpse_list_purged = true;
	}

#if CONFIG_DEFERRED_RECLAIM
	/* TODO: estimate memory recovered from deferred reclaim */
	memorystatus_log("memorystatus: reclaiming all deferred user memory\n");
	/*
	 * Avoid faulting on the reclaim buffer and avoid blocking waiting for
	 * threads which may be faulting themselves.
	 */
	vm_deferred_reclamation_reclaim_all_memory(
		RECLAIM_NO_WAIT | RECLAIM_NO_FAULT);
#endif /* CONFIG_DEFERRED_RECLAIM */

	/* TODO: estimate wired memory recovered from zone_gc */
	memorystatus_log("memorystatus: trimming kernel zone allocator\n");
	zone_gc_trim();
}
3948 
3949 /*
3950  * Called before jetsamming in the foreground band in the hope that we'll
3951  * avoid a jetsam.
3952  */
static void
memstat_approaching_fg_band(jetsam_state_t state)
{
	memorystatus_log("memorystatus: jetsam is approaching JETSAM_PRIORITY_FOREGROUND\n");
	/* Let interested clients shed memory before foreground kills begin. */
	if (memorystatus_should_issue_fg_band_notify) {
		memorystatus_broadcast_jetsam_pressure(kVMPressureForegroundJetsam);
	}
	/* Reclaim kernel-side caches as a last-ditch alternative to FG kills. */
	memstat_purge_caches(state);
}
3962 
/* Number of aggressive (jetsam-loop) evaluations performed so far. */
unsigned int jld_eval_aggressive_count = 0;
/* NOTE(review): appears to be the start of the current JLD observation window, in msecs — confirm against users. */
uint64_t  jld_timestamp_msecs = 0;
/* NOTE(review): appears to count idle-band kill candidates in the current window — confirm against users. */
int       jld_idle_kill_candidates = 0;
3966 
3967 /*
3968  * Progressively raise the maximum priority to aggressively kill to
3969  * when a jetsam loop is detected. Background work often happens at
3970  * @c JETSAM_PRIORITY_MAIL. Start there and elevate as needed if
3971  * the jetsam loop re-occurs in a short time window.
3972  */
/* Indexed by (clamped) jetsam-loop iteration count; see comment above. */
int jld_max_priority_arr[] = {
	JETSAM_PRIORITY_MAIL,
	JETSAM_PRIORITY_MAIL,
	JETSAM_PRIORITY_UI_SUPPORT,
	JETSAM_PRIORITY_UI_SUPPORT,
	JETSAM_PRIORITY_DRIVER_APPLE,
};
/* Number of entries in jld_max_priority_arr. */
#define JLD_MAX_PRIORITY_ARR_COUNT (sizeof(jld_max_priority_arr) / sizeof(jld_max_priority_arr[0]))
3981 
/*
 * Respond to a detected jetsam loop by killing aggressively: first evict
 * processes pinned in the elevated-inactive band, then march from IDLE up to
 * a cap priority that rises on repeated loop detections (jld_max_priority_arr).
 * Returns true if any process was killed.
 */
static bool
memorystatus_act_aggressive(jetsam_state_t state, uint32_t cause, os_reason_t jetsam_reason)
{
	boolean_t killed;
	uint32_t errors = 0;
	uint64_t footprint_of_killed_proc = 0;
	int elevated_bucket_count = 0, maximum_kills = 0, band = 0;
	state->memory_reclaimed = 0;

	/* Each repeated loop detection raises the kill-priority cap. */
	unsigned int iteration_no = jld_eval_aggressive_count++;
	int max_kill_pri = jld_max_priority_arr[MIN(iteration_no, JLD_MAX_PRIORITY_ARR_COUNT - 1)];
	assert3u(max_kill_pri, <=, MEMSTAT_BUCKET_COUNT);

	if (max_kill_pri >= JETSAM_PRIORITY_FOREGROUND) {
		memstat_approaching_fg_band(state);
	}

	proc_list_lock();
	elevated_bucket_count = memstat_bucket[JETSAM_PRIORITY_ELEVATED_INACTIVE].count;
	proc_list_unlock();

	/* Visit elevated processes first */
	while (elevated_bucket_count) {
		elevated_bucket_count--;

		/*
		 * memorystatus_kill_elevated_process() drops a reference,
		 * so take another one so we can continue to use this exit reason
		 * even after it returns.
		 */

		os_reason_ref(jetsam_reason);
		killed = memorystatus_kill_elevated_process(
			cause,
			jetsam_reason,
			JETSAM_PRIORITY_ELEVATED_INACTIVE,
			jld_eval_aggressive_count,
			&errors, &footprint_of_killed_proc);
		if (killed) {
			state->post_snapshot = true;
			state->memory_reclaimed += footprint_of_killed_proc;
			if (!memstat_evaluate_page_shortage(NULL, NULL, NULL)) {
				/*
				 * System is no longer under pressure --
				 * bail early because the pressure was
				 * coming from an inactive process
				 */
				return true;
			}
		} else {
			/*
			 * No pinned processes left to kill.
			 * Abandon elevated band.
			 */
			break;
		}
	}

	proc_list_lock();
	/* Bound the march by the current population below the priority cap. */
	for (band = JETSAM_PRIORITY_IDLE; band < max_kill_pri; band++) {
		maximum_kills += memstat_bucket[band].count;
	}
	proc_list_unlock();
	maximum_kills *= memorystatus_jld_max_kill_loops;
	/*
	 * memorystatus_kill_processes_aggressive() allocates its own
	 * jetsam_reason so the kMemorystatusKilledProcThrashing cause
	 * is consistent throughout the aggressive march.
	 */
	killed = memorystatus_kill_processes_aggressive(
		kMemorystatusKilledProcThrashing,
		jld_eval_aggressive_count,
		max_kill_pri,
		maximum_kills,
		&errors, &footprint_of_killed_proc);

	if (killed) {
		/* Always generate logs after aggressive kill */
		state->post_snapshot = true;
		state->memory_reclaimed += footprint_of_killed_proc;
		state->jld_idle_kills = 0;
	}

	return killed;
}
4067 
4068 /*
4069  * Sets up a new jetsam thread.
4070  */
4071 static void
memorystatus_thread_init(jetsam_state_t jetsam_thread)4072 memorystatus_thread_init(jetsam_state_t jetsam_thread)
4073 {
4074 	char name[32];
4075 	thread_wire_internal(host_priv_self(), current_thread(), TRUE, NULL);
4076 	snprintf(name, 32, "VM_memorystatus_%d", jetsam_thread->index + 1);
4077 
4078 	/* Limit all but one thread to the lower jetsam bands, as that's where most of the victims are. */
4079 	if (jetsam_thread->index == 0) {
4080 		if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
4081 			thread_vm_bind_group_add();
4082 		}
4083 		jetsam_thread->limit_to_low_bands = false;
4084 	} else {
4085 		jetsam_thread->limit_to_low_bands = true;
4086 	}
4087 #if CONFIG_THREAD_GROUPS
4088 	thread_group_vm_add();
4089 #endif
4090 	thread_set_thread_name(current_thread(), name);
4091 	sched_cond_init(&(jetsam_thread->jt_wakeup_cond));
4092 	jetsam_thread->inited = true;
4093 }
4094 
4095 /*
4096  * Create a new jetsam reason from the given kill cause.
4097  */
4098 static os_reason_t
create_jetsam_reason(memorystatus_kill_cause_t cause)4099 create_jetsam_reason(memorystatus_kill_cause_t cause)
4100 {
4101 	os_reason_t jetsam_reason = OS_REASON_NULL;
4102 
4103 	jetsam_reason_t reason_code = (jetsam_reason_t)cause;
4104 	assert3u(reason_code, <=, JETSAM_REASON_MEMORYSTATUS_MAX);
4105 
4106 	jetsam_reason = os_reason_create(OS_REASON_JETSAM, reason_code);
4107 	if (jetsam_reason == OS_REASON_NULL) {
4108 		memorystatus_log_error("memorystatus: failed to allocate jetsam reason for cause %u\n", cause);
4109 	}
4110 	return jetsam_reason;
4111 }
4112 
4113 /*
4114  * Do one kill as we're marching up the priority bands.
4115  * This is a wrapper around memorystatus_kill_top_process that also
4116  * sets post_snapshot, tracks jld_idle_kills, and notifies if we're appraoching the fg band.
4117  */
static bool
memorystatus_do_priority_kill(jetsam_state_t state,
    uint32_t kill_cause, int32_t max_priority, bool only_swappable)
{
	os_reason_t jetsam_reason = OS_REASON_NULL;
	bool killed = false;
	int priority;

	jetsam_reason = create_jetsam_reason(kill_cause);
	/*
	 * memorystatus_kill_top_process() drops a reference,
	 * so take another one so we can continue to use this exit reason
	 * even after it returns
	 */
	os_reason_ref(jetsam_reason);

	/* LRU */
	killed = memorystatus_kill_top_process(true, state->sort_flag, kill_cause, jetsam_reason, max_priority,
	    only_swappable, &priority, &state->errors, &state->memory_reclaimed);
	/* Candidate-band sorting is only requested for the first pass. */
	state->sort_flag = false;

	if (killed) {
		/* `priority` is the band the victim was killed from. */
		if (memorystatus_should_post_snapshot(priority, kill_cause) == TRUE) {
			state->post_snapshot = true;
		}

		/* Jetsam Loop Detection */
		if (memorystatus_jld_enabled == TRUE) {
			if (priority <= applications_aging_band) {
				state->jld_idle_kills++;
			} else {
				/*
				 * We've reached into bands beyond idle deferred.
				 * We make no attempt to monitor them
				 */
			}
		}

		/* Broadcast pressure as the kills climb the priority bands. */
		if (priority >= JETSAM_PRIORITY_FREEZER) {
			memstat_approaching_fg_band(state);
		} else if (priority >= JETSAM_PRIORITY_BACKGROUND) {
			memorystatus_broadcast_jetsam_pressure(kVMPressureBackgroundJetsam);
		}
	}
	/* Drop the extra reference taken above. */
	os_reason_free(jetsam_reason);

	return killed;
}
4166 
/*
 * Execute one action previously selected by memorystatus_pick_action().
 * Returns true iff the action resulted in a kill.
 */
static bool
memorystatus_do_action(jetsam_state_t state, memorystatus_action_t action, uint32_t kill_cause)
{
	bool killed = false;
	os_reason_t jetsam_reason = OS_REASON_NULL;

	switch (action) {
	case MEMORYSTATUS_KILL_HIWATER:
		/* Kill a process that has exceeded its soft high-water-mark limit. */
		killed = memorystatus_act_on_hiwat_processes(&state->errors, &state->hwm_kills,
		    &state->post_snapshot, &state->memory_reclaimed);
		break;
	case MEMORYSTATUS_KILL_AGGRESSIVE:
		jetsam_reason = create_jetsam_reason(kill_cause);
		killed = memorystatus_act_aggressive(state, kill_cause, jetsam_reason);
		os_reason_free(jetsam_reason);
		break;
	case MEMORYSTATUS_KILL_TOP_PROCESS:
		/* LRU kill across all eligible bands. */
		killed = memorystatus_do_priority_kill(state, kill_cause, max_kill_priority, false);
		break;
	case MEMORYSTATUS_WAKE_SWAPPER:
		memorystatus_log_info(
			"memorystatus_do_action: Waking up swap thread. memorystatus_available_pages: %llu\n",
			(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
		os_atomic_store(&vm_swapout_wake_pending, true, relaxed);
		thread_wakeup((event_t)&vm_swapout_thread);
		break;
	case MEMORYSTATUS_PROCESS_SWAPIN_QUEUE:
		memorystatus_log_info(
			"memorystatus_do_action: Processing swapin queue of length: %u memorystatus_available_pages: %llu\n",
			c_late_swappedin_count, (uint64_t) MEMORYSTATUS_LOG_AVAILABLE_PAGES);
		vm_compressor_process_special_swapped_in_segments();
		break;
	case MEMORYSTATUS_KILL_SUSPENDED_SWAPPABLE:
		/* Only consider swappable processes below the background band. */
		killed = memorystatus_do_priority_kill(state, kill_cause, JETSAM_PRIORITY_BACKGROUND - 1, true);
		break;
	case MEMORYSTATUS_KILL_SWAPPABLE:
		killed = memorystatus_do_priority_kill(state, kill_cause, max_kill_priority, true);
		break;
	case MEMORYSTATUS_KILL_IDLE:
		/* Restrict the kill to the idle band. */
		killed = memorystatus_do_priority_kill(state, kill_cause, JETSAM_PRIORITY_IDLE, false);
		break;
	case MEMORYSTATUS_KILL_NONE:
		panic("memorystatus_do_action: Impossible! memorystatus_do_action called with action = NONE\n");
	}
	return killed;
}
4213 
/*
 * Post a jetsam snapshot notification to user space, throttled so that at
 * most one note is sent per memorystatus_jetsam_snapshot_timeout interval.
 */
void
memorystatus_post_snapshot()
{
	proc_list_lock();
	size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
	    sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count);
	uint64_t timestamp_now = mach_absolute_time();
	memorystatus_jetsam_snapshot->notification_time = timestamp_now;
	memorystatus_jetsam_snapshot->js_gencount++;
	if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
	    timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) {
		/* Drop the proc list lock before calling out to send the note. */
		proc_list_unlock();
		int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
		if (!ret) {
			/* Note delivered: record when so the next one is throttled. */
			proc_list_lock();
			memorystatus_jetsam_snapshot_last_timestamp = timestamp_now;
			proc_list_unlock();
		}
	} else {
		proc_list_unlock();
	}
}
4235 
4236 #if JETSAM_ZPRINT_SNAPSHOT
4237 
4238 /*
4239  *  Called by memorystatus_update_jetsam_snapshot_entry_locked to take a zprint snapshot.
4240  */
static void
memorystatus_collect_jetsam_snapshot_zprint(void)
{
	unsigned int new_meminfo_cnt;

	jzs_zone_cnt = zone_max_zones();

	/* Grow the cached meminfo buffer if the estimate grew; it is never shrunk. */
	new_meminfo_cnt = vm_page_diagnose_estimate();
	if (new_meminfo_cnt > jzs_meminfo_cnt) {
		jzs_meminfo = krealloc_data_tag(jzs_meminfo,
		    jzs_meminfo_cnt * sizeof(mach_memory_info_t),
		    new_meminfo_cnt * sizeof(mach_memory_info_t),
		    Z_WAITOK,
		    VM_KERN_MEMORY_DIAG);

		/*
		 * NOTE(review): jzs_meminfo_cnt is updated without checking the
		 * reallocation result — confirm krealloc_data_tag with Z_WAITOK
		 * cannot return NULL here, or add a failure check.
		 */
		jzs_meminfo_cnt = new_meminfo_cnt;
	}

	mach_memory_info_sample(jzs_names, jzs_info, jzs_coalesce, &jzs_zone_cnt, jzs_meminfo, jzs_meminfo_cnt, true);
}
4261 
4262 #endif /* JETSAM_ZPRINT_SNAPSHOT */
4263 
4264 /*
4265  * Main entrypoint for the memorystatus thread.
4266  * This thread is woken up when we're low on one of the following resources:
4267  * - available pages (free + filebacked)
4268  * - zone memory
4269  * - compressor space
4270  *
4271  * Or when thrashing is detected in the compressor or file cache.
4272  */
static void
memorystatus_thread_internal(jetsam_state_t state)
{
	uint64_t total_memory_reclaimed = 0;
	/* Per-category availability flags; reset to true again after any kill. */
	bool highwater_remaining = true;
	bool swappable_apps_remaining = false;
	bool suspended_swappable_apps_remaining = false;

#if CONFIG_JETSAM
	/* Swap-based actions are only attempted when app swap is enabled. */
	swappable_apps_remaining = memorystatus_swap_all_apps;
	suspended_swappable_apps_remaining = memorystatus_swap_all_apps;
#endif /* CONFIG_JETSAM */

	assert(state != NULL);
	/* Reset per-scan accounting. */
	state->jld_idle_kills = 0;
	state->errors = 0;
	state->hwm_kills = 0;
	state->sort_flag = true;
	state->corpse_list_purged = false;
	state->post_snapshot = false;
	state->memory_reclaimed = 0;

	if (state->inited == FALSE) {
		/*
		 * It's the first time the thread has run, so just mark the thread as privileged and block.
		 */
		memorystatus_thread_init(state);
		sched_cond_wait(&state->jt_wakeup_cond, THREAD_UNINT, memorystatus_thread);
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_SCAN) | DBG_FUNC_START,
	    MEMORYSTATUS_LOG_AVAILABLE_PAGES, memorystatus_jld_enabled, memorystatus_jld_eval_period_msecs, jld_eval_aggressive_count);

	/* Log compressor state to help diagnose why this scan was triggered. */
	extern uint32_t c_segment_count;
	extern mach_timespec_t major_compact_ts;
	clock_sec_t now;
	clock_nsec_t nsec;
	clock_get_system_nanotime(&now, &nsec);
	mach_timespec_t major_compact_diff = {.tv_sec = (int)now, .tv_nsec = nsec};
	SUB_MACH_TIMESPEC(&major_compact_diff, &major_compact_ts);
	memorystatus_log_info(
		"memorystatus: c_segment_count=%u major compaction occurred %u seconds ago\n",
		c_segment_count, major_compact_diff.tv_sec);

	/*
	 * Jetsam aware version.
	 *
	 * The VM pressure notification thread is working its way through clients in parallel.
	 *
	 * So, while the pressure notification thread is targeting processes in order of
	 * increasing jetsam priority, we can hopefully reduce / stop its work by killing
	 * any processes that have exceeded their highwater mark.
	 *
	 * If we run out of HWM processes and our available pages drops below the critical threshold, then,
	 * we target the least recently used process in order of increasing jetsam priority (exception: the FG band).
	 */
	while (true) {
		bool killed;
		state->memory_reclaimed = 0;
		uint32_t cause = 0;

		/* Pick the next action given what categories of victims remain. */
		memorystatus_action_t action = memorystatus_pick_action(state, &cause,
		    highwater_remaining, suspended_swappable_apps_remaining, swappable_apps_remaining,
		    &state->jld_idle_kills);
		if (action == MEMORYSTATUS_KILL_NONE) {
			/* Nothing left to do this scan. */
			break;
		}

		if (cause == kMemorystatusKilledVMCompressorThrashing || cause == kMemorystatusKilledVMCompressorSpaceShortage) {
			memorystatus_log("memorystatus: killing due to \"%s\" - compression_ratio=%u\n", memorystatus_kill_cause_name[cause], vm_compression_ratio());
		}

		killed = memorystatus_do_action(state, action, cause);
		total_memory_reclaimed += state->memory_reclaimed;

		if (!killed) {
			/* Mark the attempted category as exhausted so we don't retry it. */
			if (action == MEMORYSTATUS_KILL_HIWATER) {
				highwater_remaining = false;
			} else if (action == MEMORYSTATUS_KILL_SWAPPABLE) {
				swappable_apps_remaining = false;
				suspended_swappable_apps_remaining = false;
			} else if (action == MEMORYSTATUS_KILL_SUSPENDED_SWAPPABLE) {
				suspended_swappable_apps_remaining = false;
			}
		} else {
			if (cause == kMemorystatusKilledVMCompressorThrashing || cause == kMemorystatusKilledVMCompressorSpaceShortage) {
				memorystatus_log("memorystatus: post-jetsam compressor fragmentation_level=%u\n", vm_compressor_fragmentation_level());
			}
			/* Always re-check for highwater and swappable kills after doing a kill. */
			highwater_remaining = true;
			swappable_apps_remaining = true;
			suspended_swappable_apps_remaining = true;
		}

		if (!killed && total_memory_reclaimed == 0) {
			memorystatus_log("memorystatus: failed to kill a process and no memory was reclaimed\n");
			if ((action == MEMORYSTATUS_KILL_TOP_PROCESS || action == MEMORYSTATUS_KILL_AGGRESSIVE) &&
			    memorystatus_get_available_page_count() < memorystatus_get_critical_page_shortage_threshold()) {
				/*
				 * Still under pressure and unable to kill a process - purge corpse memory
				 * and get everything back from the pmap.
				 */
				memorystatus_log("memorystatus: ran out of %sprocesses to kill but "
				    "system is still in critical condition\n",
				    state->limit_to_low_bands ? "low-band " : "");
				memstat_purge_caches(state);

				if (!state->limit_to_low_bands &&
				    memorystatus_get_available_page_count() < memorystatus_get_critical_page_shortage_threshold()) {
					/*
					 * Still under pressure and unable to kill a process
					 */
					memorystatus_log_fault("memorystatus: attempting full drain of kernel zone allocator\n");
					zone_gc_drain();
					if (memorystatus_get_available_page_count() < memorystatus_get_critical_page_shortage_threshold()) {
						/* Last resort exhausted: the system cannot recover. */
						panic("memorystatus_jetsam_thread: no victim! available pages:%llu", (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
					}
				}
			}
		}

		/*
		 * If we did a kill on behalf of another subsystem (compressor or zalloc)
		 * notify them.
		 */
		if (killed && is_reason_thrashing(cause)) {
			os_atomic_store(&memorystatus_compressor_space_shortage, false, release);
#if CONFIG_PHANTOM_CACHE
			os_atomic_store(&memorystatus_phantom_cache_pressure, false, release);
#endif /* CONFIG_PHANTOM_CACHE */
#if CONFIG_JETSAM
			vm_thrashing_jetsam_done();
#endif /* CONFIG_JETSAM */
		} else if (killed && is_reason_zone_map_exhaustion(cause)) {
			os_atomic_store(&memorystatus_zone_map_is_exhausted, false, release);
		} else if (killed && cause == kMemorystatusKilledVMPageoutStarvation) {
			os_atomic_store(&memorystatus_pageout_starved, false, release);
		}
	}

	if (state->errors) {
		memorystatus_clear_errors();
	}

	if (state->post_snapshot) {
		memorystatus_post_snapshot();
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_SCAN) | DBG_FUNC_END,
	    MEMORYSTATUS_LOG_AVAILABLE_PAGES, total_memory_reclaimed);

	if (state->corpse_list_purged) {
		/* Re-allow corpse creation now that the purge-triggered scan is over. */
		os_atomic_dec(&block_corpses, relaxed);
		assert(block_corpses >= 0);
	}
}
4429 
4430 OS_NORETURN
4431 static void
memorystatus_thread(void * param __unused,wait_result_t wr __unused)4432 memorystatus_thread(void *param __unused, wait_result_t wr __unused)
4433 {
4434 	jetsam_state_t jetsam_thread = jetsam_current_thread();
4435 	sched_cond_ack(&(jetsam_thread->jt_wakeup_cond));
4436 	while (1) {
4437 		memorystatus_thread_internal(jetsam_thread);
4438 		sched_cond_wait(&(jetsam_thread->jt_wakeup_cond), THREAD_UNINT, memorystatus_thread);
4439 	}
4440 }
4441 
4442 /*
4443  * This section defines when we deploy aggressive jetsam.
4444  * Aggressive jetsam kills everything up to the jld_priority_band_max band.
4445  */
4446 
4447 /*
4448  * Returns TRUE:
4449  *      when an idle-exitable proc was killed
4450  * Returns FALSE:
4451  *	when there are no more idle-exitable procs found
4452  *      when the attempt to kill an idle-exitable proc failed
4453  */
boolean_t
memorystatus_idle_exit_from_VM(void)
{
	/*
	 * This routine should no longer be needed since we are
	 * now using jetsam bands on all platforms and so will deal
	 * with IDLE processes within the memorystatus thread itself.
	 *
	 * But we still use it because we observed that macos systems
	 * started heavy compression/swapping with a bunch of
	 * idle-exitable processes alive and doing nothing. We decided
	 * to rather kill those processes than start swapping earlier.
	 */

	/* Returns TRUE iff an idle-exitable process was successfully killed. */
	return kill_idle_exit_proc();
}
4470 
4471 /*
4472  * Callback invoked when allowable physical memory footprint exceeded
4473  * (dirty pages + IOKit mappings)
4474  *
4475  * This is invoked for both advisory, non-fatal per-task high watermarks,
4476  * as well as the fatal task memory limits.
4477  */
4478 void
memorystatus_on_ledger_footprint_exceeded(boolean_t warning,boolean_t memlimit_is_active,boolean_t memlimit_is_fatal)4479 memorystatus_on_ledger_footprint_exceeded(boolean_t warning, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal)
4480 {
4481 	os_reason_t jetsam_reason = OS_REASON_NULL;
4482 
4483 	proc_t p = current_proc();
4484 
4485 #if VM_PRESSURE_EVENTS
4486 	if (warning == TRUE) {
4487 		/*
4488 		 * This is a warning path which implies that the current process is close, but has
4489 		 * not yet exceeded its per-process memory limit.
4490 		 */
4491 		if (memorystatus_warn_process(p, memlimit_is_active, memlimit_is_fatal, FALSE /* not exceeded */) != TRUE) {
4492 			/* Print warning, since it's possible that task has not registered for pressure notifications */
4493 			memorystatus_log_debug(
4494 				"memorystatus_on_ledger_footprint_exceeded: failed to warn %s [%d] (exiting, or no handler registered?).\n",
4495 				proc_best_name(p), proc_getpid(p));
4496 		}
4497 		return;
4498 	}
4499 #endif /* VM_PRESSURE_EVENTS */
4500 
4501 	if (memlimit_is_fatal) {
4502 		/*
4503 		 * If this process has no high watermark or has a fatal task limit, then we have been invoked because the task
4504 		 * has violated either the system-wide per-task memory limit OR its own task limit.
4505 		 */
4506 		jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_PERPROCESSLIMIT);
4507 		if (jetsam_reason == NULL) {
4508 			memorystatus_log_error("task_exceeded footprint: failed to allocate jetsam reason\n");
4509 		} else if (corpse_for_fatal_memkill && proc_send_synchronous_EXC_RESOURCE(p) == FALSE) {
4510 			/* Set OS_REASON_FLAG_GENERATE_CRASH_REPORT to generate corpse */
4511 			jetsam_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
4512 		}
4513 
4514 		if (memorystatus_kill_process_sync(proc_getpid(p), kMemorystatusKilledPerProcessLimit, jetsam_reason) != TRUE) {
4515 			memorystatus_log_error("task_exceeded_footprint: failed to kill the current task (exiting?).\n");
4516 		}
4517 	} else {
4518 		/*
4519 		 * HWM offender exists. Done without locks or synchronization.
4520 		 * See comment near its declaration for more details.
4521 		 */
4522 		os_atomic_store(&memorystatus_hwm_candidates, true, release);
4523 		_memstat_consider_waking_jetsam_thread();
4524 
4525 #if VM_PRESSURE_EVENTS
4526 		/*
4527 		 * The current process is not in the warning path.
4528 		 * This path implies the current process has exceeded a non-fatal (soft) memory limit.
4529 		 * Failure to send note is ignored here.
4530 		 */
4531 		(void)memorystatus_warn_process(p, memlimit_is_active, memlimit_is_fatal, TRUE /* exceeded */);
4532 
4533 #endif /* VM_PRESSURE_EVENTS */
4534 	}
4535 }
4536 
inline void
memorystatus_log_exception(const int max_footprint_mb, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal)
{
	proc_t p = current_proc();

	/*
	 * The limit violation is logged here, but only once per process per limit.
	 * Soft memory limit is a non-fatal high-water-mark
	 * Hard memory limit is a fatal custom-task-limit or system-wide per-task memory limit.
	 */

	memorystatus_log("EXC_RESOURCE -> %s[%d] exceeded mem limit: %s%s %d MB (%s)\n",
	    ((p && *p->p_name) ? p->p_name : "unknown"), (p ? proc_getpid(p) : -1), (memlimit_is_active ? "Active" : "Inactive"),
	    (memlimit_is_fatal  ? "Hard" : "Soft"), max_footprint_mb,
	    (memlimit_is_fatal  ? "fatal" : "non-fatal"));
}
4553 
inline void
memorystatus_log_diag_threshold_exception(const int diag_threshold_value)
{
	proc_t p = current_proc();

	/*
	 * Log a diagnostics-threshold violation for the current process.
	 * The limit violation is logged here, but only once per process per limit.
	 */

	memorystatus_log("EXC_RESOURCE -> %s[%d] exceeded diag threshold limit: %d MB \n",
	    ((p && *p->p_name) ? p->p_name : "unknown"), (p ? proc_getpid(p) : -1), diag_threshold_value);
}
4568 
4569 /*
4570  * Description:
4571  *	Evaluates process state to determine which limit
4572  *	should be applied (active vs. inactive limit).
4573  *
4574  *	Processes that have the 'elevated inactive jetsam band' attribute
4575  *	are first evaluated based on their current priority band.
4576  *	presently elevated ==> active
4577  *
4578  *	Processes that opt into dirty tracking are evaluated
4579  *	based on clean vs dirty state.
4580  *	dirty ==> active
4581  *	clean ==> inactive
4582  *
4583  *	Process that do not opt into dirty tracking are
 *	evaluated based on priority level.
4585  *	Foreground or above ==> active
4586  *	Below Foreground    ==> inactive
4587  *
4588  *	Return: TRUE if active
4589  *		False if inactive
4590  */
4591 static bool
_memstat_proc_is_active_locked(proc_t p)4592 _memstat_proc_is_active_locked(proc_t p)
4593 {
4594 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
4595 
4596 	if (_memstat_proc_is_elevated(p) &&
4597 	    (p->p_memstat_effectivepriority == JETSAM_PRIORITY_ELEVATED_INACTIVE)) {
4598 		/*
4599 		 * process has the 'elevated inactive jetsam band' attribute
4600 		 * and process is present in the elevated band
4601 		 */
4602 		return true;
4603 	} else if (_memstat_proc_is_tracked(p)) {
4604 		/*
4605 		 * process has opted into dirty tracking
4606 		 * active state is based on dirty vs. clean
4607 		 */
4608 		if (_memstat_proc_is_dirty(p)) {
4609 			/* Dirty */
4610 			return true;
4611 		} else if (_memstat_proc_can_idle_exit(p) &&
4612 		    p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) {
4613 			/* Clean and Not Idle */
4614 			return true;
4615 		} else {
4616 			/* Clean and Idle */
4617 			return false;
4618 		}
4619 	} else {
4620 		return p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND;
4621 	}
4622 }
4623 
/*
 * Synchronously kill a process. A victim_pid of -1 kills the best top-process
 * candidate instead of a specific pid. On success, posts a (throttled)
 * jetsam snapshot notification to user space.
 */
static boolean_t
memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason)
{
	boolean_t res;

	uint32_t errors = 0;
	uint64_t memory_reclaimed = 0;

	if (victim_pid == -1) {
		/* No pid, so kill first process */
		res = memorystatus_kill_top_process(true, true, cause, jetsam_reason,
		    max_kill_priority, false, NULL, &errors, &memory_reclaimed);
	} else {
		res = memorystatus_kill_specific_process(victim_pid, cause, jetsam_reason);
	}

	if (errors) {
		memorystatus_clear_errors();
	}

	if (res == TRUE) {
		/* Fire off snapshot notification */
		proc_list_lock();
		size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
		    sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_count;
		uint64_t timestamp_now = mach_absolute_time();
		memorystatus_jetsam_snapshot->notification_time = timestamp_now;
		/* Throttle: only notify if the previous note is older than the timeout. */
		if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
		    timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) {
			/* Drop the proc list lock across the call out to user space. */
			proc_list_unlock();
			int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
			if (!ret) {
				proc_list_lock();
				memorystatus_jetsam_snapshot_last_timestamp = timestamp_now;
				proc_list_unlock();
			}
		} else {
			proc_list_unlock();
		}
	}

	return res;
}
4667 
4668 /*
4669  * Jetsam a specific process.
4670  */
static boolean_t
memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason)
{
	boolean_t killed;
	proc_t p;
	uint64_t killtime = 0;
	uint64_t footprint_of_killed_proc;
	clock_sec_t     tv_sec;
	clock_usec_t    tv_usec;
	uint32_t        tv_msec;

	/* TODO - add a victim queue and push this into the main jetsam thread */

	p = proc_find(victim_pid);
	if (!p) {
		/* Process already gone; consume the caller's reason reference. */
		os_reason_free(jetsam_reason);
		return FALSE;
	}

	proc_list_lock();

	if (p->p_memstat_state & P_MEMSTAT_TERMINATED) {
		/*
		 * Someone beat us to this kill.
		 * Nothing to do here.
		 */
		proc_list_unlock();
		os_reason_free(jetsam_reason);
		proc_rele(p);
		return FALSE;
	}
	/* Claim the kill so concurrent killers back off (checked above). */
	p->p_memstat_state |= P_MEMSTAT_TERMINATED;

	if (memorystatus_jetsam_snapshot_count == 0) {
		memorystatus_init_jetsam_snapshot_locked(NULL, 0);
	}

	killtime = mach_absolute_time();
	absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
	tv_msec = tv_usec / 1000;

	/* Record the victim in the snapshot before dropping the lock. */
	memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);

	proc_list_unlock();

	killed = memorystatus_do_kill(p, cause, jetsam_reason, &footprint_of_killed_proc);

	/* p is known non-NULL here; the (p && ...) guards below are redundant but harmless. */
	memorystatus_log("%lu.%03d memorystatus: killing_specific_process pid %d [%s] (%s %d) %lluKB - memorystatus_available_pages: %llu\n",
	    (unsigned long)tv_sec, tv_msec, victim_pid, ((p && *p->p_name) ? p->p_name : "unknown"),
	    memorystatus_kill_cause_name[cause], (p ? p->p_memstat_effectivepriority: -1),
	    footprint_of_killed_proc >> 10, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);

	if (!killed) {
		/* Kill failed: release our claim so others may try. */
		proc_list_lock();
		p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
		proc_list_unlock();
	}

	proc_rele(p);

	return killed;
}
4733 
4734 
4735 /*
4736  * Toggle the P_MEMSTAT_SKIP bit.
4737  * Takes the proc_list_lock.
4738  */
4739 void
proc_memstat_skip(proc_t p,boolean_t set)4740 proc_memstat_skip(proc_t p, boolean_t set)
4741 {
4742 #if DEVELOPMENT || DEBUG
4743 	if (p) {
4744 		proc_list_lock();
4745 		if (set == TRUE) {
4746 			p->p_memstat_state |= P_MEMSTAT_SKIP;
4747 		} else {
4748 			p->p_memstat_state &= ~P_MEMSTAT_SKIP;
4749 		}
4750 		proc_list_unlock();
4751 	}
4752 #else
4753 #pragma unused(p, set)
4754 	/*
4755 	 * do nothing
4756 	 */
4757 #endif /* DEVELOPMENT || DEBUG */
4758 	return;
4759 }
4760 
4761 
4762 #if CONFIG_JETSAM
4763 /*
4764  * This is invoked when cpulimits have been exceeded while in fatal mode.
4765  * The jetsam_flags do not apply as those are for memory related kills.
4766  * We call this routine so that the offending process is killed with
4767  * a non-zero exit status.
4768  */
4769 void
jetsam_on_ledger_cpulimit_exceeded(void)4770 jetsam_on_ledger_cpulimit_exceeded(void)
4771 {
4772 	int retval = 0;
4773 	int jetsam_flags = 0;  /* make it obvious */
4774 	proc_t p = current_proc();
4775 	os_reason_t jetsam_reason = OS_REASON_NULL;
4776 
4777 	memorystatus_log("task_exceeded_cpulimit: killing pid %d [%s]\n", proc_getpid(p), (*p->p_name ? p->p_name : "(unknown)"));
4778 
4779 	jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_CPULIMIT);
4780 	if (jetsam_reason == OS_REASON_NULL) {
4781 		memorystatus_log_error("task_exceeded_cpulimit: unable to allocate memory for jetsam reason\n");
4782 	}
4783 
4784 	retval = jetsam_do_kill(p, jetsam_flags, jetsam_reason);
4785 
4786 	if (retval) {
4787 		memorystatus_log_error("task_exceeded_cpulimit: failed to kill current task (exiting?).\n");
4788 	}
4789 }
4790 
4791 #endif /* CONFIG_JETSAM */
4792 
/* Report the number of VM map regions in the given task via *count. */
static void
memorystatus_get_task_memory_region_count(task_t task, uint64_t *count)
{
	assert(task);
	assert(count);

	*count = get_task_memory_region_count(task);
}
4801 
4802 
4803 #define MEMORYSTATUS_VM_MAP_FORK_ALLOWED     0x100000000
4804 #define MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED 0x200000000
4805 
4806 #if DEVELOPMENT || DEBUG
4807 
4808 /*
4809  * Sysctl only used to test memorystatus_allowed_vm_map_fork() path.
4810  *   set a new pidwatch value
4811  *	or
4812  *   get the current pidwatch value
4813  *
4814  * The pidwatch_val starts out with a PID to watch for in the map_fork path.
4815  * Its value is:
4816  * - OR'd with MEMORYSTATUS_VM_MAP_FORK_ALLOWED if we allow the map_fork.
4817  * - OR'd with MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED if we disallow the map_fork.
4818  * - set to -1ull if the map_fork() is aborted for other reasons.
4819  */
4820 
4821 uint64_t memorystatus_vm_map_fork_pidwatch_val = 0;
4822 
/* Sysctl handler: read or set the pid watched in the vm_map_fork path. */
static int sysctl_memorystatus_vm_map_fork_pidwatch SYSCTL_HANDLER_ARGS {
#pragma unused(oidp, arg1, arg2)

	uint64_t new_value = 0;
	uint64_t old_value = 0;
	int error = 0;

	/*
	 * The pid is held in the low 32 bits.
	 * The 'allowed' flags are in the upper 32 bits.
	 */
	old_value = memorystatus_vm_map_fork_pidwatch_val;

	error = sysctl_io_number(req, old_value, sizeof(old_value), &new_value, NULL);

	if (error || !req->newptr) {
		/*
		 * No new value passed in.
		 */
		return error;
	}

	/*
	 * A new pid was passed in via req->newptr.
	 * Ignore any attempt to set the higher order bits.
	 */
	memorystatus_vm_map_fork_pidwatch_val = new_value & 0xFFFFFFFF;
	memorystatus_log_debug("memorystatus: pidwatch old_value = 0x%llx, new_value = 0x%llx\n", old_value, new_value);

	return error;
}
4854 
4855 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_map_fork_pidwatch, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_MASKED,
4856     0, 0, sysctl_memorystatus_vm_map_fork_pidwatch, "Q", "get/set pid watched for in vm_map_fork");
4857 
4858 
4859 /*
4860  * Record if a watched process fails to qualify for a vm_map_fork().
4861  */
4862 void
memorystatus_abort_vm_map_fork(task_t task)4863 memorystatus_abort_vm_map_fork(task_t task)
4864 {
4865 	if (memorystatus_vm_map_fork_pidwatch_val != 0) {
4866 		proc_t p = get_bsdtask_info(task);
4867 		if (p != NULL && memorystatus_vm_map_fork_pidwatch_val == (uint64_t)proc_getpid(p)) {
4868 			memorystatus_vm_map_fork_pidwatch_val = -1ull;
4869 		}
4870 	}
4871 }
4872 
4873 static void
set_vm_map_fork_pidwatch(task_t task,uint64_t x)4874 set_vm_map_fork_pidwatch(task_t task, uint64_t x)
4875 {
4876 	if (memorystatus_vm_map_fork_pidwatch_val != 0) {
4877 		proc_t p = get_bsdtask_info(task);
4878 		if (p && (memorystatus_vm_map_fork_pidwatch_val == (uint64_t)proc_getpid(p))) {
4879 			memorystatus_vm_map_fork_pidwatch_val |= x;
4880 		}
4881 	}
4882 }
4883 
4884 #else /* DEVELOPMENT || DEBUG */
4885 
4886 
/* Release-kernel stub: pidwatch instrumentation only exists on DEVELOPMENT || DEBUG. */
static void
set_vm_map_fork_pidwatch(task_t task, uint64_t x)
{
#pragma unused(task)
#pragma unused(x)
}
4893 
4894 #endif /* DEVELOPMENT || DEBUG */
4895 
4896 /*
4897  * Called during EXC_RESOURCE handling when a process exceeds a soft
4898  * memory limit.  This is the corpse fork path and here we decide if
4899  * vm_map_fork will be allowed when creating the corpse.
4900  * The task being considered is suspended.
4901  *
4902  * By default, a vm_map_fork is allowed to proceed.
4903  *
4904  * A few simple policy assumptions:
4905  *	If the device has a zero system-wide task limit,
4906  *	then the vm_map_fork is allowed. macOS always has a zero
 *	system wide task limit (unless overridden by a boot-arg).
4908  *
4909  *	And if a process's memory footprint calculates less
4910  *	than or equal to quarter of the system-wide task limit,
4911  *	then the vm_map_fork is allowed.  This calculation
4912  *	is based on the assumption that a process can
4913  *	munch memory up to the system-wide task limit.
4914  *
4915  *      For watchOS, which has a low task limit, we use a
4916  *      different value. Current task limit has been reduced
4917  *      to 300MB and it's been decided the limit should be 200MB.
4918  */
4919 int large_corpse_count = 0;
boolean_t
memorystatus_allowed_vm_map_fork(task_t task, bool *is_large)
{
	boolean_t is_allowed = TRUE;   /* default */
	uint64_t footprint_in_bytes;
	uint64_t max_allowed_bytes;
	thread_t self = current_thread();

	*is_large = false;

	/* Jetsam in high bands blocks any new corpse */
	if (os_atomic_load(&block_corpses, relaxed) != 0) {
		memorystatus_log("memorystatus_allowed_vm_map_fork: corpse for pid %d blocked by jetsam).\n", task_pid(task));
		ktriage_record(thread_tid(self), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_CORPSE, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_CORPSE_BLOCKED_JETSAM), 0 /* arg */);
		return FALSE;
	}

	/* A zero system-wide task limit (typical on macOS) always allows the fork. */
	if (max_task_footprint_mb == 0) {
		set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_ALLOWED);
		return is_allowed;
	}

	footprint_in_bytes = get_task_phys_footprint(task);

	/*
	 * Maximum is 1/4 of the system-wide task limit by default.
	 */
	max_allowed_bytes = ((uint64_t)max_task_footprint_mb * 1024 * 1024) >> 2;

#if XNU_TARGET_OS_WATCH
	/*
	 * For watches with > 1G, use a limit of 200MB and allow
	 * one corpse at a time of up to 300MB.
	 */
#define LARGE_CORPSE_LIMIT 1
	if (sane_size > 1 * 1024 * 1024 * 1024) {
		int cnt = large_corpse_count;
		/* CAS guards the single large-corpse slot against concurrent claimants. */
		if (footprint_in_bytes > 200 * 1024 * 1024 &&
		    footprint_in_bytes <= 300 * 1024 * 1024 &&
		    cnt < LARGE_CORPSE_LIMIT &&
		    OSCompareAndSwap(cnt, cnt + 1, &large_corpse_count)) {
			*is_large = true;
			max_allowed_bytes = MAX(max_allowed_bytes, 300 * 1024 * 1024);
		} else {
			max_allowed_bytes = MAX(max_allowed_bytes, 200 * 1024 * 1024);
		}
	}
#endif /* XNU_TARGET_OS_WATCH */

#if DEBUG || DEVELOPMENT
	if (corpse_threshold_system_limit) {
		/* Test override: allow corpses all the way up to the system task limit. */
		max_allowed_bytes = (uint64_t)max_task_footprint_mb * (1UL << 20);
	}
#endif /* DEBUG || DEVELOPMENT */

	if (footprint_in_bytes > max_allowed_bytes) {
		memorystatus_log("memorystatus disallowed vm_map_fork %lld  %lld\n", footprint_in_bytes, max_allowed_bytes);
		set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED);
		ktriage_record(thread_tid(self), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_CORPSE, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_CORPSE_PROC_TOO_BIG), 0 /* arg */);
		return !is_allowed;
	}

	set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_ALLOWED);
	return is_allowed;
}
4985 
4986 void
memorystatus_get_task_page_counts(task_t task,uint32_t * footprint,uint32_t * max_footprint_lifetime,uint32_t * purgeable_pages)4987 memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages)
4988 {
4989 	assert(task);
4990 	assert(footprint);
4991 
4992 	uint64_t pages;
4993 
4994 	pages = (get_task_phys_footprint(task) / PAGE_SIZE_64);
4995 	assert(((uint32_t)pages) == pages);
4996 	*footprint = (uint32_t)pages;
4997 
4998 	if (max_footprint_lifetime) {
4999 		pages = (get_task_phys_footprint_lifetime_max(task) / PAGE_SIZE_64);
5000 		assert(((uint32_t)pages) == pages);
5001 		*max_footprint_lifetime = (uint32_t)pages;
5002 	}
5003 	if (purgeable_pages) {
5004 		pages = (get_task_purgeable_size(task) / PAGE_SIZE_64);
5005 		assert(((uint32_t)pages) == pages);
5006 		*purgeable_pages = (uint32_t)pages;
5007 	}
5008 }
5009 
5010 static void
memorystatus_get_task_phys_footprint_page_counts(task_t task,uint64_t * internal_pages,uint64_t * internal_compressed_pages,uint64_t * purgeable_nonvolatile_pages,uint64_t * purgeable_nonvolatile_compressed_pages,uint64_t * alternate_accounting_pages,uint64_t * alternate_accounting_compressed_pages,uint64_t * iokit_mapped_pages,uint64_t * page_table_pages,uint64_t * frozen_to_swap_pages,uint64_t * neural_nofootprint_total_pages)5011 memorystatus_get_task_phys_footprint_page_counts(task_t task,
5012     uint64_t *internal_pages, uint64_t *internal_compressed_pages,
5013     uint64_t *purgeable_nonvolatile_pages, uint64_t *purgeable_nonvolatile_compressed_pages,
5014     uint64_t *alternate_accounting_pages, uint64_t *alternate_accounting_compressed_pages,
5015     uint64_t *iokit_mapped_pages, uint64_t *page_table_pages, uint64_t *frozen_to_swap_pages,
5016     uint64_t *neural_nofootprint_total_pages)
5017 {
5018 	assert(task);
5019 
5020 	if (internal_pages) {
5021 		*internal_pages = (get_task_internal(task) / PAGE_SIZE_64);
5022 	}
5023 
5024 	if (internal_compressed_pages) {
5025 		*internal_compressed_pages = (get_task_internal_compressed(task) / PAGE_SIZE_64);
5026 	}
5027 
5028 	if (purgeable_nonvolatile_pages) {
5029 		*purgeable_nonvolatile_pages = (get_task_purgeable_nonvolatile(task) / PAGE_SIZE_64);
5030 	}
5031 
5032 	if (purgeable_nonvolatile_compressed_pages) {
5033 		*purgeable_nonvolatile_compressed_pages = (get_task_purgeable_nonvolatile_compressed(task) / PAGE_SIZE_64);
5034 	}
5035 
5036 	if (alternate_accounting_pages) {
5037 		*alternate_accounting_pages = (get_task_alternate_accounting(task) / PAGE_SIZE_64);
5038 	}
5039 
5040 	if (alternate_accounting_compressed_pages) {
5041 		*alternate_accounting_compressed_pages = (get_task_alternate_accounting_compressed(task) / PAGE_SIZE_64);
5042 	}
5043 
5044 	if (iokit_mapped_pages) {
5045 		*iokit_mapped_pages = (get_task_iokit_mapped(task) / PAGE_SIZE_64);
5046 	}
5047 
5048 	if (page_table_pages) {
5049 		*page_table_pages = (get_task_page_table(task) / PAGE_SIZE_64);
5050 	}
5051 
5052 	if (neural_nofootprint_total_pages) {
5053 		*neural_nofootprint_total_pages = (get_task_neural_nofootprint_total(task) / PAGE_SIZE_64);
5054 	}
5055 
5056 #if CONFIG_FREEZE
5057 	if (frozen_to_swap_pages) {
5058 		*frozen_to_swap_pages = (get_task_frozen_to_swap(task) / PAGE_SIZE_64);
5059 	}
5060 #else /* CONFIG_FREEZE */
5061 #pragma unused(frozen_to_swap_pages)
5062 #endif /* CONFIG_FREEZE */
5063 }
5064 
5065 #if CONFIG_FREEZE
5066 /*
5067  * Copies the source entry into the destination snapshot.
5068  * Returns true on success. Fails if the destination snapshot is full.
5069  * Caller must hold the proc list lock.
5070  */
5071 static bool
memorystatus_jetsam_snapshot_copy_entry_locked(memorystatus_jetsam_snapshot_t * dst_snapshot,unsigned int dst_snapshot_size,const memorystatus_jetsam_snapshot_entry_t * src_entry)5072 memorystatus_jetsam_snapshot_copy_entry_locked(memorystatus_jetsam_snapshot_t *dst_snapshot, unsigned int dst_snapshot_size, const memorystatus_jetsam_snapshot_entry_t *src_entry)
5073 {
5074 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
5075 	assert(dst_snapshot);
5076 
5077 	if (dst_snapshot->entry_count == dst_snapshot_size) {
5078 		/* Destination snapshot is full. Can not be updated until it is consumed. */
5079 		return false;
5080 	}
5081 	if (dst_snapshot->entry_count == 0) {
5082 		memorystatus_init_jetsam_snapshot_header(dst_snapshot);
5083 	}
5084 	memorystatus_jetsam_snapshot_entry_t *dst_entry = &dst_snapshot->entries[dst_snapshot->entry_count++];
5085 	memcpy(dst_entry, src_entry, sizeof(memorystatus_jetsam_snapshot_entry_t));
5086 	return true;
5087 }
5088 #endif /* CONFIG_FREEZE */
5089 
5090 static bool
memorystatus_init_jetsam_snapshot_entry_with_kill_locked(memorystatus_jetsam_snapshot_t * snapshot,proc_t p,uint32_t kill_cause,uint64_t killtime,memorystatus_jetsam_snapshot_entry_t ** entry)5091 memorystatus_init_jetsam_snapshot_entry_with_kill_locked(memorystatus_jetsam_snapshot_t *snapshot, proc_t p, uint32_t kill_cause, uint64_t killtime, memorystatus_jetsam_snapshot_entry_t **entry)
5092 {
5093 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
5094 	memorystatus_jetsam_snapshot_entry_t *snapshot_list = snapshot->entries;
5095 	size_t i = snapshot->entry_count;
5096 
5097 	if (memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[i], (snapshot->js_gencount)) == TRUE) {
5098 		*entry = &snapshot_list[i];
5099 		(*entry)->killed       = kill_cause;
5100 		(*entry)->jse_killtime = killtime;
5101 
5102 		snapshot->entry_count = i + 1;
5103 		return true;
5104 	}
5105 	return false;
5106 }
5107 
/*
 * This routine only acts on the global jetsam event snapshot.
 * Updating the process's entry can race when the memorystatus_thread
 * has chosen to kill a process that is racing to exit on another core.
 */
/*
 * Record a jetsam kill (cause + time) for proc p in the global snapshot.
 *
 * If p already has a snapshot entry, that entry is updated in place; if it
 * launched after the snapshot was built, a new entry is appended when room
 * remains.  With CONFIG_FREEZE, app kills are additionally mirrored into the
 * freezer snapshot.  Caller must hold the proc list lock.
 */
static void
memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime)
{
	memorystatus_jetsam_snapshot_entry_t *entry = NULL;
	memorystatus_jetsam_snapshot_t *snapshot    = NULL;
	memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL;

	unsigned int i;
#if CONFIG_FREEZE
	bool copied_to_freezer_snapshot = false;
#endif /* CONFIG_FREEZE */

	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);

	if (memorystatus_jetsam_snapshot_count == 0) {
		/*
		 * No active snapshot.
		 * Nothing to do.
		 */
		/* entry stays NULL, so the exit path logs the failure. */
		goto exit;
	}

	/*
	 * Sanity check as this routine should only be called
	 * from a jetsam kill path.
	 */
	assert(kill_cause != 0 && killtime != 0);

	snapshot       = memorystatus_jetsam_snapshot;
	snapshot_list  = memorystatus_jetsam_snapshot->entries;

#if JETSAM_ZPRINT_SNAPSHOT
	/*
	 * Collect the snapshot zprint info if we've reached the right priority
	 */
	/* jzs_gencount tracking ensures zone info is gathered once per snapshot generation. */
	if (p->p_memstat_effectivepriority >= (int)jzs_trigger_band &&
	    jzs_gencount != snapshot->js_gencount) {
		memorystatus_collect_jetsam_snapshot_zprint();
		jzs_gencount = snapshot->js_gencount;
	}
#endif

	/* Linear scan for p's existing entry. */
	for (i = 0; i < memorystatus_jetsam_snapshot_count; i++) {
		if (snapshot_list[i].pid == proc_getpid(p)) {
			entry = &snapshot_list[i];

			if (entry->killed || entry->jse_killtime) {
				/*
				 * We apparently raced on the exit path
				 * for this process, as it's snapshot entry
				 * has already recorded a kill.
				 */
				assert(entry->killed && entry->jse_killtime);
				break;
			}

			/*
			 * Update the entry we just found in the snapshot.
			 */

			entry->killed       = kill_cause;
			entry->jse_killtime = killtime;
			entry->jse_gencount = snapshot->js_gencount;
			entry->jse_idle_delta = p->p_memstat_idle_delta;
#if CONFIG_FREEZE
			entry->jse_thaw_count = p->p_memstat_thaw_count;
			entry->jse_freeze_skip_reason = p->p_memstat_freeze_skip_reason;
#else /* CONFIG_FREEZE */
			entry->jse_thaw_count = 0;
			entry->jse_freeze_skip_reason = kMemorystatusFreezeSkipReasonNone;
#endif /* CONFIG_FREEZE */

			/*
			 * If a process has moved between bands since snapshot was
			 * initialized, then likely these fields changed too.
			 */
			if (entry->priority != p->p_memstat_effectivepriority) {
				strlcpy(entry->name, p->p_name, sizeof(entry->name));
				entry->priority  = p->p_memstat_effectivepriority;
				entry->state     = memorystatus_build_state(p);
				entry->user_data = p->p_memstat_userdata;
				entry->fds       = p->p_fd.fd_nfiles;
			}

			/*
			 * Always update the page counts on a kill.
			 */

			uint32_t pages              = 0;
			uint32_t max_pages_lifetime = 0;
			uint32_t purgeable_pages    = 0;

			memorystatus_get_task_page_counts(proc_task(p), &pages, &max_pages_lifetime, &purgeable_pages);
			entry->pages              = (uint64_t)pages;
			entry->max_pages_lifetime = (uint64_t)max_pages_lifetime;
			entry->purgeable_pages    = (uint64_t)purgeable_pages;

			uint64_t internal_pages                        = 0;
			uint64_t internal_compressed_pages             = 0;
			uint64_t purgeable_nonvolatile_pages           = 0;
			uint64_t purgeable_nonvolatile_compressed_pages = 0;
			uint64_t alternate_accounting_pages            = 0;
			uint64_t alternate_accounting_compressed_pages = 0;
			uint64_t iokit_mapped_pages                    = 0;
			uint64_t page_table_pages                      = 0;
			uint64_t frozen_to_swap_pages                  = 0;
			uint64_t neural_nofootprint_total_pages        = 0;

			memorystatus_get_task_phys_footprint_page_counts(proc_task(p), &internal_pages, &internal_compressed_pages,
			    &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages,
			    &alternate_accounting_pages, &alternate_accounting_compressed_pages,
			    &iokit_mapped_pages, &page_table_pages, &frozen_to_swap_pages, &neural_nofootprint_total_pages);

			entry->jse_internal_pages = internal_pages;
			entry->jse_internal_compressed_pages = internal_compressed_pages;
			entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages;
			entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages;
			entry->jse_alternate_accounting_pages = alternate_accounting_pages;
			entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages;
			entry->jse_iokit_mapped_pages = iokit_mapped_pages;
			entry->jse_page_table_pages = page_table_pages;
			entry->jse_frozen_to_swap_pages = frozen_to_swap_pages;
			entry->jse_neural_nofootprint_total_pages = neural_nofootprint_total_pages;

			uint64_t region_count = 0;
			memorystatus_get_task_memory_region_count(proc_task(p), &region_count);
			entry->jse_memory_region_count = region_count;
			entry->csflags = proc_getcsflags(p);
			goto exit;
		}
	}

	if (entry == NULL) {
		/*
		 * The entry was not found in the snapshot, so the process must have
		 * launched after the snapshot was initialized.
		 * Let's try to append the new entry.
		 */
		if (memorystatus_jetsam_snapshot_count < memorystatus_jetsam_snapshot_max) {
			/*
			 * A populated snapshot buffer exists
			 * and there is room to init a new entry.
			 */
			assert(memorystatus_jetsam_snapshot_count == snapshot->entry_count);

			if (memorystatus_init_jetsam_snapshot_entry_with_kill_locked(snapshot, p, kill_cause, killtime, &entry)) {
				memorystatus_jetsam_snapshot_count++;

				if (memorystatus_jetsam_snapshot_count >= memorystatus_jetsam_snapshot_max) {
					/*
					 * We just used the last slot in the snapshot buffer.
					 * We only want to log it once... so we do it here
					 * when we notice we've hit the max.
					 */
					memorystatus_log_error("memorystatus: WARNING snapshot buffer is full, count %d\n", memorystatus_jetsam_snapshot_count);
				}
			}
		}
	}

exit:
	if (entry) {
#if CONFIG_FREEZE
		if (memorystatus_jetsam_use_freezer_snapshot && isApp(p)) {
			/* This is an app kill. Record it in the freezer snapshot so dasd can incorporate this in its recommendations. */
			copied_to_freezer_snapshot = memorystatus_jetsam_snapshot_copy_entry_locked(memorystatus_jetsam_snapshot_freezer, memorystatus_jetsam_snapshot_freezer_max, entry);
			if (copied_to_freezer_snapshot && memorystatus_jetsam_snapshot_freezer->entry_count == memorystatus_jetsam_snapshot_freezer_max) {
				/*
				 * We just used the last slot in the freezer snapshot buffer.
				 * We only want to log it once... so we do it here
				 * when we notice we've hit the max.
				 */
				memorystatus_log_error("memorystatus: WARNING freezer snapshot buffer is full, count %zu\n",
				    memorystatus_jetsam_snapshot_freezer->entry_count);
			}
		}
#endif /* CONFIG_FREEZE */
	} else {
		/*
		 * If we reach here, the snapshot buffer could not be updated.
		 * Most likely, the buffer is full, in which case we would have
		 * logged a warning in the previous call.
		 *
		 * For now, we will stop appending snapshot entries.
		 * When the buffer is consumed, the snapshot state will reset.
		 */

		memorystatus_log_error(
			"memorystatus_update_jetsam_snapshot_entry_locked: failed to update pid %d, priority %d, count %d\n",
			proc_getpid(p), p->p_memstat_effectivepriority, memorystatus_jetsam_snapshot_count);

#if CONFIG_FREEZE
		/* We still attempt to record this in the freezer snapshot */
		if (memorystatus_jetsam_use_freezer_snapshot && isApp(p)) {
			snapshot = memorystatus_jetsam_snapshot_freezer;
			if (snapshot->entry_count < memorystatus_jetsam_snapshot_freezer_max) {
				copied_to_freezer_snapshot = memorystatus_init_jetsam_snapshot_entry_with_kill_locked(snapshot, p, kill_cause, killtime, &entry);
				if (copied_to_freezer_snapshot && memorystatus_jetsam_snapshot_freezer->entry_count == memorystatus_jetsam_snapshot_freezer_max) {
					/*
					 * We just used the last slot in the freezer snapshot buffer.
					 * We only want to log it once... so we do it here
					 * when we notice we've hit the max.
					 */
					memorystatus_log_error("memorystatus: WARNING freezer snapshot buffer is full, count %zu\n",
					    memorystatus_jetsam_snapshot_freezer->entry_count);
				}
			}
		}
#endif /* CONFIG_FREEZE */
	}

	return;
}
5326 
5327 uint32_t
memorystatus_get_available_page_count(void)5328 memorystatus_get_available_page_count(void)
5329 {
5330 	return os_atomic_load(&memorystatus_available_pages, relaxed);
5331 }
5332 
/*
 * Publish a new available-page count and re-drive the subsystems keyed
 * off it: pressure notifications (VM_PRESSURE_EVENTS), the freezer thread
 * (CONFIG_FREEZE), and the jetsam thread.
 *
 * NOTE(review): per the comment below, callers may hold page-queue locks
 * with preemption disabled, so nothing here may block or take a mutex.
 */
void
memorystatus_update_available_page_count(uint32_t available_page_count)
{
	os_atomic_store(&memorystatus_available_pages, available_page_count,
	    relaxed);
#if VM_PRESSURE_EVENTS
	/*
	 * Since memorystatus_available_pages changes, we should
	 * re-evaluate the pressure levels on the system and
	 * check if we need to wake the pressure thread.
	 * We also update memorystatus_level in that routine.
	 */
	vm_pressure_response();
#endif /* VM_PRESSURE_EVENTS */
#if CONFIG_FREEZE
	/*
	 * We can't grab the freezer_mutex here even though that synchronization would be correct to inspect
	 * the # of frozen processes and wakeup the freezer thread. Reason being that we come here into this
	 * code with (possibly) the page-queue locks held and preemption disabled. So trying to grab a mutex here
	 * will result in the "mutex with preemption disabled" panic.
	 */

	if (memorystatus_freeze_thread_should_run()) {
		/*
		 * The freezer thread is usually woken up by some user-space call i.e. pid_hibernate(any process).
		 * That trigger isn't invoked often enough and so we are enabling this explicit wakeup here.
		 */
		if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
			thread_wakeup((event_t)&memorystatus_freeze_wakeup);
		}
	}
#endif /* CONFIG_FREEZE */
	_memstat_consider_waking_jetsam_thread();
}
5367 
/*
 * Populate one jetsam snapshot entry from proc p.
 *
 * Captures identity (pid, name, priority, uuid), page counts and the
 * physical-footprint breakdown, cpu time, start time, coalition jetsam id,
 * codesigning flags, and codesigning trust level.  The kill fields
 * (killed / jse_killtime) are zeroed here; the kill path stamps them later.
 * gencount marks which pass of the jetsam thread this entry belongs to.
 * Always returns TRUE.  Named "_locked": caller is expected to hold the
 * proc list lock.
 */
static boolean_t
memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry, uint64_t gencount)
{
	clock_sec_t                     tv_sec;
	clock_usec_t                    tv_usec;
	uint32_t pages = 0;
	uint32_t max_pages_lifetime = 0;
	uint32_t purgeable_pages = 0;
	uint64_t internal_pages                         = 0;
	uint64_t internal_compressed_pages              = 0;
	uint64_t purgeable_nonvolatile_pages            = 0;
	uint64_t purgeable_nonvolatile_compressed_pages = 0;
	uint64_t alternate_accounting_pages             = 0;
	uint64_t alternate_accounting_compressed_pages  = 0;
	uint64_t iokit_mapped_pages                     = 0;
	uint64_t page_table_pages                       = 0;
	uint64_t frozen_to_swap_pages                   = 0;
	uint64_t neural_nofootprint_total_pages         = 0;
	uint64_t region_count                           = 0;
	uint64_t cids[COALITION_NUM_TYPES];
	uint32_t trust                                  = 0;
	kern_return_t ret                               = 0;
	/* Start from a clean slate so fields not set below read as zero. */
	memset(entry, 0, sizeof(memorystatus_jetsam_snapshot_entry_t));

	entry->pid = proc_getpid(p);
	strlcpy(&entry->name[0], p->p_name, sizeof(entry->name));
	entry->priority = p->p_memstat_effectivepriority;

	/* Basic page counts: footprint, lifetime max, purgeable. */
	memorystatus_get_task_page_counts(proc_task(p), &pages, &max_pages_lifetime, &purgeable_pages);
	entry->pages              = (uint64_t)pages;
	entry->max_pages_lifetime = (uint64_t)max_pages_lifetime;
	entry->purgeable_pages    = (uint64_t)purgeable_pages;

	/* Detailed footprint breakdown by category. */
	memorystatus_get_task_phys_footprint_page_counts(proc_task(p), &internal_pages, &internal_compressed_pages,
	    &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages,
	    &alternate_accounting_pages, &alternate_accounting_compressed_pages,
	    &iokit_mapped_pages, &page_table_pages, &frozen_to_swap_pages, &neural_nofootprint_total_pages);

	entry->jse_internal_pages = internal_pages;
	entry->jse_internal_compressed_pages = internal_compressed_pages;
	entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages;
	entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages;
	entry->jse_alternate_accounting_pages = alternate_accounting_pages;
	entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages;
	entry->jse_iokit_mapped_pages = iokit_mapped_pages;
	entry->jse_page_table_pages = page_table_pages;
	entry->jse_frozen_to_swap_pages = frozen_to_swap_pages;
	entry->jse_neural_nofootprint_total_pages = neural_nofootprint_total_pages;

	memorystatus_get_task_memory_region_count(proc_task(p), &region_count);
	entry->jse_memory_region_count = region_count;

	entry->state     = memorystatus_build_state(p);
	entry->user_data = p->p_memstat_userdata;
	proc_getexecutableuuid(p, &entry->uuid[0], sizeof(entry->uuid));
	entry->fds       = p->p_fd.fd_nfiles;

	absolutetime_to_microtime(get_task_cpu_time(proc_task(p)), &tv_sec, &tv_usec);
	entry->cpu_time.tv_sec = (int64_t)tv_sec;
	entry->cpu_time.tv_usec = (int64_t)tv_usec;

	assert(p->p_stats != NULL);
	entry->jse_starttime =  p->p_stats->ps_start;   /* abstime process started */
	entry->jse_killtime = 0;                        /* abstime jetsam chose to kill process */
	entry->killed       = 0;                        /* the jetsam kill cause */
	entry->jse_gencount = gencount;                 /* indicates a pass through jetsam thread, when process was targeted to be killed */

	entry->jse_idle_delta = p->p_memstat_idle_delta; /* Most recent timespan spent in idle-band */

#if CONFIG_FREEZE
	entry->jse_freeze_skip_reason = p->p_memstat_freeze_skip_reason;
	entry->jse_thaw_count = p->p_memstat_thaw_count;
#else /* CONFIG_FREEZE */
	entry->jse_thaw_count = 0;
	entry->jse_freeze_skip_reason = kMemorystatusFreezeSkipReasonNone;
#endif /* CONFIG_FREEZE */

	proc_coalitionids(p, cids);
	entry->jse_coalition_jetsam_id = cids[COALITION_TYPE_JETSAM];
	entry->csflags = proc_getcsflags(p);
	/* Trust level lookup can fail; record an explicit invalid marker then. */
	ret = get_trust_level_kdp(get_task_pmap(proc_task(p)), &trust);
	if (ret != KERN_SUCCESS) {
		trust = KCDATA_INVALID_CS_TRUST_LEVEL;
	}
	entry->cs_trust_level = trust;
	return TRUE;
}
5455 
5456 static void
memorystatus_init_snapshot_vmstats(memorystatus_jetsam_snapshot_t * snapshot)5457 memorystatus_init_snapshot_vmstats(memorystatus_jetsam_snapshot_t *snapshot)
5458 {
5459 	kern_return_t kr = KERN_SUCCESS;
5460 	mach_msg_type_number_t  count = HOST_VM_INFO64_COUNT;
5461 	vm_statistics64_data_t  vm_stat;
5462 
5463 	if ((kr = host_statistics64(host_self(), HOST_VM_INFO64, (host_info64_t)&vm_stat, &count)) != KERN_SUCCESS) {
5464 		memorystatus_log_error("memorystatus_init_jetsam_snapshot_stats: host_statistics64 failed with %d\n", kr);
5465 		memset(&snapshot->stats, 0, sizeof(snapshot->stats));
5466 	} else {
5467 		snapshot->stats.free_pages      = vm_stat.free_count;
5468 		snapshot->stats.active_pages    = vm_stat.active_count;
5469 		snapshot->stats.inactive_pages  = vm_stat.inactive_count;
5470 		snapshot->stats.throttled_pages = vm_stat.throttled_count;
5471 		snapshot->stats.purgeable_pages = vm_stat.purgeable_count;
5472 		snapshot->stats.wired_pages     = vm_stat.wire_count;
5473 
5474 		snapshot->stats.speculative_pages = vm_stat.speculative_count;
5475 		snapshot->stats.filebacked_pages  = vm_stat.external_page_count;
5476 		snapshot->stats.anonymous_pages   = vm_stat.internal_page_count;
5477 		snapshot->stats.compressions      = vm_stat.compressions;
5478 		snapshot->stats.decompressions    = vm_stat.decompressions;
5479 		snapshot->stats.compressor_pages  = vm_stat.compressor_page_count;
5480 		snapshot->stats.total_uncompressed_pages_in_compressor = vm_stat.total_uncompressed_pages_in_compressor;
5481 	}
5482 
5483 	get_zone_map_size(&snapshot->stats.zone_map_size, &snapshot->stats.zone_map_capacity);
5484 
5485 	bzero(snapshot->stats.largest_zone_name, sizeof(snapshot->stats.largest_zone_name));
5486 	get_largest_zone_info(snapshot->stats.largest_zone_name, sizeof(snapshot->stats.largest_zone_name),
5487 	    &snapshot->stats.largest_zone_size);
5488 }
5489 
5490 /*
5491  * Collect vm statistics at boot.
5492  * Called only once (see kern_exec.c)
5493  * Data can be consumed at any time.
5494  */
5495 void
memorystatus_init_at_boot_snapshot()5496 memorystatus_init_at_boot_snapshot()
5497 {
5498 	memorystatus_init_snapshot_vmstats(&memorystatus_at_boot_snapshot);
5499 	memorystatus_at_boot_snapshot.entry_count = 0;
5500 	memorystatus_at_boot_snapshot.notification_time = 0;   /* updated when consumed */
5501 	memorystatus_at_boot_snapshot.snapshot_time = mach_absolute_time();
5502 }
5503 
5504 static void
memorystatus_init_jetsam_snapshot_header(memorystatus_jetsam_snapshot_t * snapshot)5505 memorystatus_init_jetsam_snapshot_header(memorystatus_jetsam_snapshot_t *snapshot)
5506 {
5507 	memorystatus_init_snapshot_vmstats(snapshot);
5508 	snapshot->snapshot_time = mach_absolute_time();
5509 	snapshot->notification_time = 0;
5510 	snapshot->js_gencount = 0;
5511 }
5512 
/*
 * Build a fresh jetsam snapshot by walking every proc in priority-band
 * order.
 *
 * When od_snapshot is non-NULL, it is an on-demand snapshot limited to
 * ods_list_count entries; otherwise the global jetsam event snapshot is
 * (re)built and memorystatus_jetsam_snapshot_count is updated.
 * launchd and kernel_task are appended for context when room remains.
 * Caller must hold the proc list lock.
 */
static void
memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snapshot, uint32_t ods_list_count )
{
	proc_t p, next_p;
	unsigned int b = 0, i = 0;

	memorystatus_jetsam_snapshot_t *snapshot = NULL;
	memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL;
	unsigned int snapshot_max = 0;

	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);

	if (od_snapshot) {
		/*
		 * This is an on_demand snapshot
		 */
		snapshot      = od_snapshot;
		snapshot_list = od_snapshot->entries;
		snapshot_max  = ods_list_count;
	} else {
		/*
		 * This is a jetsam event snapshot
		 */
		snapshot      = memorystatus_jetsam_snapshot;
		snapshot_list = memorystatus_jetsam_snapshot->entries;
		snapshot_max  = memorystatus_jetsam_snapshot_max;
	}

	memorystatus_init_jetsam_snapshot_header(snapshot);

	/* Walk all bands; a proc whose entry init fails is simply skipped. */
	next_p = memorystatus_get_first_proc_locked(&b, TRUE);
	while (next_p) {
		p = next_p;
		next_p = memorystatus_get_next_proc_locked(&b, p, TRUE);

		if (FALSE == memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[i], snapshot->js_gencount)) {
			continue;
		}

		if (++i == snapshot_max) {
			break;
		}
	}

	/* Log launchd and kernel_task as well to see more context, even though jetsam doesn't apply to them. */
	if (i < snapshot_max) {
		memorystatus_init_jetsam_snapshot_entry_locked(initproc, &snapshot_list[i], snapshot->js_gencount);
		i++;
	}

	if (i < snapshot_max) {
		memorystatus_init_jetsam_snapshot_entry_locked(kernproc, &snapshot_list[i], snapshot->js_gencount);
		i++;
	}

	snapshot->entry_count = i;

	if (!od_snapshot) {
		/* update the system buffer count */
		memorystatus_jetsam_snapshot_count = i;
	}
}
5575 
5576 #if DEVELOPMENT || DEBUG
5577 
5578 /*
5579  * Verify that the given bucket has been sorted correctly.
5580  *
5581  * Walks through the bucket and verifies that all pids in the
5582  * expected_order buffer are in that bucket and in the same
5583  * relative order.
5584  *
5585  * The proc_list_lock must be held by the caller.
5586  */
5587 static int
memorystatus_verify_sort_order(unsigned int bucket_index,pid_t * expected_order,size_t num_pids)5588 memorystatus_verify_sort_order(unsigned int bucket_index, pid_t *expected_order, size_t num_pids)
5589 {
5590 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
5591 
5592 	int error = 0;
5593 	proc_t p = NULL;
5594 	size_t i = 0;
5595 
5596 	/*
5597 	 * NB: We allow other procs to be mixed in within the expected ones.
5598 	 * We just need the expected procs to be in the right order relative to each other.
5599 	 */
5600 	p = memorystatus_get_first_proc_locked(&bucket_index, FALSE);
5601 	while (p) {
5602 		if (proc_getpid(p) == expected_order[i]) {
5603 			i++;
5604 		}
5605 		if (i == num_pids) {
5606 			break;
5607 		}
5608 		p = memorystatus_get_next_proc_locked(&bucket_index, p, FALSE);
5609 	}
5610 	if (i != num_pids) {
5611 		char buffer[128];
5612 		size_t len = sizeof(buffer);
5613 		size_t buffer_idx = 0;
5614 		memorystatus_log_error("memorystatus_verify_sort_order: Processes in bucket %d were not sorted properly\n", bucket_index);
5615 		for (i = 0; i < num_pids; i++) {
5616 			int num_written = snprintf(buffer + buffer_idx, len - buffer_idx, "%d,", expected_order[i]);
5617 			if (num_written <= 0) {
5618 				break;
5619 			}
5620 			if (buffer_idx + (unsigned int) num_written >= len) {
5621 				break;
5622 			}
5623 			buffer_idx += num_written;
5624 		}
5625 		memorystatus_log_error("memorystatus_verify_sort_order: Expected order [%s]\n", buffer);
5626 		memset(buffer, 0, len);
5627 		buffer_idx = 0;
5628 		p = memorystatus_get_first_proc_locked(&bucket_index, FALSE);
5629 		i = 0;
5630 		memorystatus_log_error("memorystatus_verify_sort_order: Actual order:\n");
5631 		while (p) {
5632 			int num_written;
5633 			if (buffer_idx == 0) {
5634 				num_written = snprintf(buffer + buffer_idx, len - buffer_idx, "%zu: %d,", i, proc_getpid(p));
5635 			} else {
5636 				num_written = snprintf(buffer + buffer_idx, len - buffer_idx, "%d,", proc_getpid(p));
5637 			}
5638 			if (num_written <= 0) {
5639 				break;
5640 			}
5641 			buffer_idx += (unsigned int) num_written;
5642 			assert(buffer_idx <= len);
5643 			if (i % 10 == 0) {
5644 				memorystatus_log_error("memorystatus_verify_sort_order: %s\n", buffer);
5645 				buffer_idx = 0;
5646 			}
5647 			p = memorystatus_get_next_proc_locked(&bucket_index, p, FALSE);
5648 			i++;
5649 		}
5650 		if (buffer_idx != 0) {
5651 			memorystatus_log_error("memorystatus_verify_sort_order: %s\n", buffer);
5652 		}
5653 		error = EINVAL;
5654 	}
5655 	return error;
5656 }
5657 
5658 /*
5659  * Triggers a sort_order on a specified jetsam priority band.
5660  * This is for testing only, used to force a path through the sort
5661  * function.
5662  */
5663 static int
memorystatus_cmd_test_jetsam_sort(int priority,int sort_order,user_addr_t expected_order_user,size_t expected_order_user_len)5664 memorystatus_cmd_test_jetsam_sort(int priority,
5665     int sort_order,
5666     user_addr_t expected_order_user,
5667     size_t expected_order_user_len)
5668 {
5669 	int error = 0;
5670 	unsigned int bucket_index = 0;
5671 	const size_t kMaxPids = 8;
5672 	pid_t expected_order[kMaxPids];
5673 	size_t copy_size = sizeof(expected_order);
5674 	size_t num_pids;
5675 
5676 	if (expected_order_user_len < copy_size) {
5677 		copy_size = expected_order_user_len;
5678 	}
5679 	num_pids = copy_size / sizeof(pid_t);
5680 
5681 	error = copyin(expected_order_user, expected_order, copy_size);
5682 	if (error != 0) {
5683 		return error;
5684 	}
5685 
5686 	if (priority == -1) {
5687 		/* Use as shorthand for default priority */
5688 		bucket_index = JETSAM_PRIORITY_DEFAULT;
5689 	} else {
5690 		bucket_index = (unsigned int)priority;
5691 	}
5692 
5693 	/*
5694 	 * Acquire lock before sorting so we can check the sort order
5695 	 * while still holding the lock.
5696 	 */
5697 	proc_list_lock();
5698 
5699 	memorystatus_sort_bucket_locked(bucket_index, sort_order);
5700 
5701 	if (expected_order_user != CAST_USER_ADDR_T(NULL) && expected_order_user_len > 0) {
5702 		error = memorystatus_verify_sort_order(bucket_index, expected_order, num_pids);
5703 	}
5704 
5705 	proc_list_unlock();
5706 
5707 	return error;
5708 }
5709 
5710 #endif /* DEVELOPMENT || DEBUG */
5711 
5712 /*
5713  * Prepare the process to be killed (set state, update snapshot) and kill it.
5714  */
/*
 * Count of jetsam kills that were avoided because purging the candidate's
 * reclaimable memory freed enough pages first (see memorystatus_kill_proc()).
 */
static uint64_t memorystatus_purge_before_jetsam_success = 0;
5716 
5717 #if SOCKETS
/*
 * Forward the memorystatus event 'status' (a kill cause code) to the
 * networking-related file descriptors owned by 'p': NECP policy fds and
 * Skywalk channels.  This gives those subsystems a chance to react before
 * the process is killed (exact reaction is up to necp_fd_memstatus() /
 * kern_channel_memstatus()).
 *
 * Always returns 1.
 */
static int
networking_memstatus_callout(proc_t p, uint32_t status)
{
	struct fileproc *fp;

	/*
	 * Locking contract:
	 * proc list lock NOT held
	 * proc lock NOT held
	 * a reference on the proc has been held / shall be dropped by the caller.
	 */
	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED);
	LCK_MTX_ASSERT(&p->p_mlock, LCK_MTX_ASSERT_NOTOWNED);

	proc_fdlock(p);

	/* Walk every open file in the process' fd table under the fd lock. */
	fdt_foreach(fp, p) {
		switch (FILEGLOB_DTYPE(fp->fp_glob)) {
#if NECP
		case DTYPE_NETPOLICY:
			necp_fd_memstatus(p, status,
			    (struct necp_fd_data *)fp_get_data(fp));
			break;
#endif /* NECP */
#if SKYWALK
		case DTYPE_CHANNEL:
			kern_channel_memstatus(p, status,
			    (struct kern_channel *)fp_get_data(fp));
			break;
#endif /* SKYWALK */
		default:
			/* Other descriptor types have no memstatus hook. */
			break;
		}
	}
	proc_fdunlock(p);

	return 1;
}
5755 #endif /* SOCKETS */
5756 
5757 static boolean_t
memorystatus_kill_proc(proc_t p,uint32_t cause,os_reason_t jetsam_reason,bool * killed,uint64_t * footprint_of_killed_proc)5758 memorystatus_kill_proc(proc_t p, uint32_t cause, os_reason_t jetsam_reason, bool *killed, uint64_t *footprint_of_killed_proc)
5759 {
5760 	pid_t aPid = 0;
5761 	uint32_t aPid_ep = 0;
5762 
5763 	uint64_t        killtime = 0;
5764 	clock_sec_t     tv_sec;
5765 	clock_usec_t    tv_usec;
5766 	uint32_t        tv_msec;
5767 	boolean_t       retval = FALSE;
5768 
5769 	aPid = proc_getpid(p);
5770 	aPid_ep = p->p_memstat_effectivepriority;
5771 
5772 	if (cause != kMemorystatusKilledVnodes && cause != kMemorystatusKilledZoneMapExhaustion) {
5773 		/*
5774 		 * Genuine memory pressure and not other (vnode/zone) resource exhaustion.
5775 		 */
5776 		boolean_t success = FALSE;
5777 		uint64_t num_pages_purged;
5778 		uint64_t num_pages_reclaimed = 0;
5779 		uint64_t num_pages_unsecluded = 0;
5780 
5781 		networking_memstatus_callout(p, cause);
5782 		num_pages_purged = vm_purgeable_purge_task_owned(proc_task(p));
5783 		num_pages_reclaimed += num_pages_purged;
5784 #if CONFIG_SECLUDED_MEMORY
5785 		if (cause == kMemorystatusKilledVMPageShortage &&
5786 		    vm_page_secluded_count > 0 &&
5787 		    task_can_use_secluded_mem(proc_task(p), FALSE)) {
5788 			/*
5789 			 * We're about to kill a process that has access
5790 			 * to the secluded pool.  Drain that pool into the
5791 			 * free or active queues to make these pages re-appear
5792 			 * as "available", which might make us no longer need
5793 			 * to kill that process.
5794 			 * Since the secluded pool does not get refilled while
5795 			 * a process has access to it, it should remain
5796 			 * drained.
5797 			 */
5798 			num_pages_unsecluded = vm_page_secluded_drain();
5799 			num_pages_reclaimed += num_pages_unsecluded;
5800 		}
5801 #endif /* CONFIG_SECLUDED_MEMORY */
5802 
5803 		if (num_pages_reclaimed) {
5804 			/*
5805 			 * We actually reclaimed something and so let's
5806 			 * check if we need to continue with the kill.
5807 			 */
5808 			if (cause == kMemorystatusKilledHiwat) {
5809 				uint64_t footprint_in_bytes = get_task_phys_footprint(proc_task(p));
5810 				uint64_t memlimit_in_bytes  = (((uint64_t)p->p_memstat_memlimit) * 1024ULL * 1024ULL);  /* convert MB to bytes */
5811 				success = (footprint_in_bytes <= memlimit_in_bytes);
5812 			} else {
5813 				success = !(memorystatus_get_available_page_count() < memorystatus_get_soft_memlimit_page_shortage_threshold());
5814 #if CONFIG_SECLUDED_MEMORY
5815 				if (!success && num_pages_unsecluded) {
5816 					/*
5817 					 * We just drained the secluded pool
5818 					 * because we're about to kill a
5819 					 * process that has access to it.
5820 					 * This is an important process and
5821 					 * we'd rather not kill it unless
5822 					 * absolutely necessary, so declare
5823 					 * success even if draining the pool
5824 					 * did not quite get us out of the
5825 					 * "pressure" level but still got
5826 					 * us out of the "critical" level.
5827 					 */
5828 					success = !(
5829 						memorystatus_get_available_page_count() <
5830 						memorystatus_get_critical_page_shortage_threshold());
5831 				}
5832 #endif /* CONFIG_SECLUDED_MEMORY */
5833 			}
5834 
5835 			if (success) {
5836 				memorystatus_purge_before_jetsam_success++;
5837 
5838 				memorystatus_log_info("memorystatus: reclaimed %llu pages (%llu purged, %llu unsecluded) from pid %d [%s] and avoided %s\n",
5839 				    num_pages_reclaimed, num_pages_purged, num_pages_unsecluded, aPid, ((p && *p->p_name) ? p->p_name : "unknown"), memorystatus_kill_cause_name[cause]);
5840 
5841 				*killed = false;
5842 				*footprint_of_killed_proc = num_pages_reclaimed + num_pages_purged + num_pages_unsecluded;
5843 
5844 				return TRUE;
5845 			}
5846 		}
5847 	}
5848 
5849 	killtime = mach_absolute_time();
5850 	absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
5851 	tv_msec = tv_usec / 1000;
5852 
5853 	proc_list_lock();
5854 	memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
5855 	proc_list_unlock();
5856 
5857 	char kill_reason_string[128];
5858 
5859 	if (cause == kMemorystatusKilledHiwat) {
5860 		strlcpy(kill_reason_string, "killing_highwater_process", 128);
5861 	} else {
5862 		if (aPid_ep == JETSAM_PRIORITY_IDLE) {
5863 			strlcpy(kill_reason_string, "killing_idle_process", 128);
5864 		} else {
5865 			strlcpy(kill_reason_string, "killing_top_process", 128);
5866 		}
5867 	}
5868 
5869 	/*
5870 	 * memorystatus_do_kill drops a reference, so take another one so we can
5871 	 * continue to use this exit reason even after memorystatus_do_kill()
5872 	 * returns
5873 	 */
5874 	os_reason_ref(jetsam_reason);
5875 
5876 	retval = memorystatus_do_kill(p, cause, jetsam_reason, footprint_of_killed_proc);
5877 	*killed = retval;
5878 
5879 	memorystatus_log("memorystatus: %s pid %d [%s] (%s %d) %lluKB - memorystatus_available_pages: %llu compressor_size:%u\n",
5880 	    kill_reason_string,
5881 	    aPid, proc_best_name(p),
5882 	    memorystatus_kill_cause_name[cause], aPid_ep,
5883 	    (*footprint_of_killed_proc) >> 10, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
5884 
5885 	return retval;
5886 }
5887 
5888 /*
5889  * Jetsam the first process in the queue.
5890  */
5891 static bool
memorystatus_kill_top_process(bool any,bool sort_flag,uint32_t cause,os_reason_t jetsam_reason,int32_t max_priority,bool only_swappable,int32_t * priority,uint32_t * errors,uint64_t * memory_reclaimed)5892 memorystatus_kill_top_process(bool any, bool sort_flag, uint32_t cause, os_reason_t jetsam_reason,
5893     int32_t max_priority, bool only_swappable,
5894     int32_t *priority, uint32_t *errors, uint64_t *memory_reclaimed)
5895 {
5896 	pid_t aPid;
5897 	proc_t p = PROC_NULL, next_p = PROC_NULL;
5898 	bool new_snapshot = false, force_new_snapshot = false, killed = false, freed_mem = false;
5899 	unsigned int i = 0;
5900 	uint32_t aPid_ep;
5901 	int32_t local_max_kill_prio = JETSAM_PRIORITY_IDLE;
5902 	uint64_t footprint_of_killed_proc = 0;
5903 
5904 #ifndef CONFIG_FREEZE
5905 #pragma unused(any)
5906 #endif
5907 
5908 	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
5909 	    MEMORYSTATUS_LOG_AVAILABLE_PAGES);
5910 
5911 
5912 #if CONFIG_JETSAM
5913 	if (sort_flag) {
5914 		(void)memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT);
5915 	}
5916 
5917 	*memory_reclaimed = 0;
5918 	local_max_kill_prio = MIN(max_kill_priority, max_priority);
5919 
5920 #if VM_PRESSURE_EVENTS
5921 	if (cause == kMemorystatusKilledSustainedPressure) {
5922 		local_max_kill_prio = memorystatus_sustained_pressure_maximum_band;
5923 	}
5924 #endif /* VM_PRESSURE_EVENTS */
5925 
5926 	force_new_snapshot = false;
5927 
5928 #else /* CONFIG_JETSAM */
5929 	(void) max_priority;
5930 
5931 	if (sort_flag) {
5932 		(void)memorystatus_sort_bucket(JETSAM_PRIORITY_IDLE, JETSAM_SORT_DEFAULT);
5933 	}
5934 
5935 	/*
5936 	 * On macos, we currently only have 2 reasons to be here:
5937 	 *
5938 	 * kMemorystatusKilledZoneMapExhaustion
5939 	 * AND
5940 	 * kMemorystatusKilledVMCompressorSpaceShortage
5941 	 *
5942 	 * If we are here because of kMemorystatusKilledZoneMapExhaustion, we will consider
5943 	 * any and all processes as eligible kill candidates since we need to avoid a panic.
5944 	 *
5945 	 * Since this function can be called async. it is harder to toggle the max_kill_priority
5946 	 * value before and after a call. And so we use this local variable to set the upper band
5947 	 * on the eligible kill bands.
5948 	 */
5949 	if (cause == kMemorystatusKilledZoneMapExhaustion) {
5950 		local_max_kill_prio = JETSAM_PRIORITY_MAX;
5951 	} else {
5952 		local_max_kill_prio = max_kill_priority;
5953 	}
5954 
5955 	/*
5956 	 * And, because we are here under extreme circumstances, we force a snapshot even for
5957 	 * IDLE kills.
5958 	 */
5959 	force_new_snapshot = true;
5960 
5961 #endif /* CONFIG_JETSAM */
5962 
5963 	if (cause != kMemorystatusKilledZoneMapExhaustion &&
5964 	    jetsam_current_thread() != NULL &&
5965 	    jetsam_current_thread()->limit_to_low_bands &&
5966 	    local_max_kill_prio > JETSAM_PRIORITY_MAIL) {
5967 		local_max_kill_prio = JETSAM_PRIORITY_MAIL;
5968 	}
5969 
5970 	proc_list_lock();
5971 
5972 	next_p = memorystatus_get_first_proc_locked(&i, TRUE);
5973 	while (next_p && (next_p->p_memstat_effectivepriority <= local_max_kill_prio)) {
5974 		p = next_p;
5975 		next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
5976 
5977 
5978 		aPid = proc_getpid(p);
5979 		aPid_ep = p->p_memstat_effectivepriority;
5980 
5981 		if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP)) {
5982 			continue;   /* with lock held */
5983 		}
5984 
5985 		if (cause == kMemorystatusKilledVnodes) {
5986 			/*
5987 			 * If the system runs out of vnodes, we systematically jetsam
5988 			 * processes in hopes of stumbling onto a vnode gain that helps
5989 			 * the system recover.  The process that happens to trigger
5990 			 * this path has no known relationship to the vnode shortage.
5991 			 * Deadlock avoidance: attempt to safeguard the caller.
5992 			 */
5993 
5994 			if (p == current_proc()) {
5995 				/* do not jetsam the current process */
5996 				continue;
5997 			}
5998 		}
5999 
6000 		if (only_swappable && !task_donates_own_pages(proc_task(p))) {
6001 			continue;
6002 		}
6003 
6004 #if CONFIG_FREEZE
6005 		boolean_t skip;
6006 		boolean_t reclaim_proc = !(p->p_memstat_state & P_MEMSTAT_LOCKED);
6007 		if (any || reclaim_proc) {
6008 			skip = FALSE;
6009 		} else {
6010 			skip = TRUE;
6011 		}
6012 
6013 		if (skip) {
6014 			continue;
6015 		} else
6016 #endif
6017 		{
6018 			if (proc_ref(p, true) == p) {
6019 				/*
6020 				 * Mark as terminated so that if exit1() indicates success, but the process (for example)
6021 				 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
6022 				 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
6023 				 * acquisition of the proc lock.
6024 				 */
6025 				p->p_memstat_state |= P_MEMSTAT_TERMINATED;
6026 			} else {
6027 				/*
6028 				 * We need to restart the search again because
6029 				 * proc_ref _can_ drop the proc_list lock
6030 				 * and we could have lost our stored next_p via
6031 				 * an exit() on another core.
6032 				 */
6033 				i = 0;
6034 				next_p = memorystatus_get_first_proc_locked(&i, TRUE);
6035 				continue;
6036 			}
6037 
6038 			/*
6039 			 * Capture a snapshot if none exists and:
6040 			 * - we are forcing a new snapshot creation, either because:
6041 			 *      - on a particular platform we need these snapshots every time, OR
6042 			 *	- a boot-arg/embedded device tree property has been set.
6043 			 * - priority was not requested (this is something other than an ambient kill)
6044 			 * - the priority was requested *and* the targeted process is not at idle priority
6045 			 */
6046 			if ((memorystatus_jetsam_snapshot_count == 0) &&
6047 			    (force_new_snapshot || memorystatus_idle_snapshot || ((!priority) || (priority && (aPid_ep != JETSAM_PRIORITY_IDLE))))) {
6048 				memorystatus_init_jetsam_snapshot_locked(NULL, 0);
6049 				new_snapshot = true;
6050 			}
6051 
6052 			proc_list_unlock();
6053 
6054 			freed_mem = memorystatus_kill_proc(p, cause, jetsam_reason, &killed, &footprint_of_killed_proc); /* purged and/or killed 'p' */
6055 			/* Success? */
6056 			if (freed_mem) {
6057 				*memory_reclaimed = footprint_of_killed_proc;
6058 				if (killed) {
6059 					if (priority) {
6060 						*priority = aPid_ep;
6061 					}
6062 				} else {
6063 					/* purged */
6064 					proc_list_lock();
6065 					p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
6066 					proc_list_unlock();
6067 				}
6068 				proc_rele(p);
6069 				goto exit;
6070 			}
6071 
6072 			/*
6073 			 * Failure - first unwind the state,
6074 			 * then fall through to restart the search.
6075 			 */
6076 			proc_list_lock();
6077 			proc_rele(p);
6078 			p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
6079 			p->p_memstat_state |= P_MEMSTAT_ERROR;
6080 			*errors += 1;
6081 
6082 			i = 0;
6083 			next_p = memorystatus_get_first_proc_locked(&i, TRUE);
6084 		}
6085 	}
6086 
6087 	proc_list_unlock();
6088 
6089 exit:
6090 	os_reason_free(jetsam_reason);
6091 
6092 	if (!killed) {
6093 		/* Clear snapshot if freshly captured and no target was found */
6094 		if (new_snapshot) {
6095 			proc_list_lock();
6096 			memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
6097 			proc_list_unlock();
6098 		}
6099 	}
6100 
6101 	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
6102 	    MEMORYSTATUS_LOG_AVAILABLE_PAGES, killed ? aPid : 0, killed, *memory_reclaimed);
6103 
6104 	return killed;
6105 }
6106 
6107 /*
6108  * Jetsam aggressively
6109  */
6110 static bool
memorystatus_kill_processes_aggressive(uint32_t cause,int aggr_count,int32_t priority_max,int max_kills,uint32_t * errors,uint64_t * memory_reclaimed)6111 memorystatus_kill_processes_aggressive(uint32_t cause, int aggr_count,
6112     int32_t priority_max, int max_kills, uint32_t *errors, uint64_t *memory_reclaimed)
6113 {
6114 	pid_t aPid;
6115 	proc_t p = PROC_NULL, next_p = PROC_NULL;
6116 	boolean_t new_snapshot = FALSE, killed = FALSE;
6117 	int kill_count = 0;
6118 	unsigned int priority_band = JETSAM_PRIORITY_IDLE;
6119 	int32_t aPid_ep = 0;
6120 	unsigned int memorystatus_level_snapshot = 0;
6121 	uint64_t killtime = 0;
6122 	clock_sec_t     tv_sec;
6123 	clock_usec_t    tv_usec;
6124 	uint32_t        tv_msec;
6125 	os_reason_t jetsam_reason = OS_REASON_NULL;
6126 	uint64_t footprint_of_killed_proc = 0;
6127 
6128 	*memory_reclaimed = 0;
6129 
6130 	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
6131 	    MEMORYSTATUS_LOG_AVAILABLE_PAGES, priority_max);
6132 
6133 	if (priority_max >= JETSAM_PRIORITY_FOREGROUND) {
6134 		/*
6135 		 * Check if aggressive jetsam has been asked to kill upto or beyond the
6136 		 * JETSAM_PRIORITY_FOREGROUND bucket. If yes, sort the FG band based on
6137 		 * coalition footprint.
6138 		 */
6139 		memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT);
6140 	}
6141 
6142 	jetsam_reason = os_reason_create(OS_REASON_JETSAM, cause);
6143 	if (jetsam_reason == OS_REASON_NULL) {
6144 		memorystatus_log_error("memorystatus_kill_processes_aggressive: failed to allocate exit reason\n");
6145 	}
6146 	memorystatus_log("memorystatus: aggressively killing up to %d processes below band %d.\n", max_kills, priority_max + 1);
6147 	proc_list_lock();
6148 
6149 	next_p = memorystatus_get_first_proc_locked(&priority_band, TRUE);
6150 	while (next_p) {
6151 		if (proc_list_exited(next_p) ||
6152 		    ((unsigned int)(next_p->p_memstat_effectivepriority) != priority_band)) {
6153 			/*
6154 			 * We have raced with next_p running on another core.
6155 			 * It may be exiting or it may have moved to a different
6156 			 * jetsam priority band.  This means we have lost our
6157 			 * place in line while traversing the jetsam list.  We
6158 			 * attempt to recover by rewinding to the beginning of the band
6159 			 * we were already traversing.  By doing this, we do not guarantee
6160 			 * that no process escapes this aggressive march, but we can make
6161 			 * skipping an entire range of processes less likely. (PR-21069019)
6162 			 */
6163 
6164 			memorystatus_log_debug(
6165 				"memorystatus: aggressive%d: rewinding band %d, %s(%d) moved or exiting.\n",
6166 				aggr_count, priority_band, (*next_p->p_name ? next_p->p_name : "unknown"), proc_getpid(next_p));
6167 
6168 			next_p = memorystatus_get_first_proc_locked(&priority_band, TRUE);
6169 			continue;
6170 		}
6171 
6172 		p = next_p;
6173 		next_p = memorystatus_get_next_proc_locked(&priority_band, p, TRUE);
6174 
6175 		if (p->p_memstat_effectivepriority > priority_max) {
6176 			/*
6177 			 * Bail out of this killing spree if we have
6178 			 * reached beyond the priority_max jetsam band.
6179 			 * That is, we kill up to and through the
6180 			 * priority_max jetsam band.
6181 			 */
6182 			proc_list_unlock();
6183 			goto exit;
6184 		}
6185 
6186 		aPid = proc_getpid(p);
6187 		aPid_ep = p->p_memstat_effectivepriority;
6188 
6189 		if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP)) {
6190 			continue;
6191 		}
6192 
6193 		/*
6194 		 * Capture a snapshot if none exists.
6195 		 */
6196 		if (memorystatus_jetsam_snapshot_count == 0) {
6197 			memorystatus_init_jetsam_snapshot_locked(NULL, 0);
6198 			new_snapshot = TRUE;
6199 		}
6200 
6201 		/*
6202 		 * Mark as terminated so that if exit1() indicates success, but the process (for example)
6203 		 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
6204 		 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
6205 		 * acquisition of the proc lock.
6206 		 */
6207 		p->p_memstat_state |= P_MEMSTAT_TERMINATED;
6208 
6209 		killtime = mach_absolute_time();
6210 		absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
6211 		tv_msec = tv_usec / 1000;
6212 
6213 		/* Shift queue, update stats */
6214 		memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
6215 
6216 		/*
6217 		 * In order to kill the target process, we will drop the proc_list_lock.
6218 		 * To guaranteee that p and next_p don't disappear out from under the lock,
6219 		 * we must take a ref on both.
6220 		 * If we cannot get a reference, then it's likely we've raced with
6221 		 * that process exiting on another core.
6222 		 */
6223 		if (proc_ref(p, true) == p) {
6224 			if (next_p) {
6225 				while (next_p && (proc_ref(next_p, true) != next_p)) {
6226 					proc_t temp_p;
6227 
6228 					/*
6229 					 * We must have raced with next_p exiting on another core.
6230 					 * Recover by getting the next eligible process in the band.
6231 					 */
6232 
6233 					memorystatus_log_debug(
6234 						"memorystatus: aggressive%d: skipping %d [%s] (exiting?)\n",
6235 						aggr_count, proc_getpid(next_p), (*next_p->p_name ? next_p->p_name : "(unknown)"));
6236 
6237 					temp_p = next_p;
6238 					next_p = memorystatus_get_next_proc_locked(&priority_band, temp_p, TRUE);
6239 				}
6240 			}
6241 			proc_list_unlock();
6242 
6243 			if (aPid_ep <= system_procs_aging_band &&
6244 			    (p->p_memstat_relaunch_flags & P_MEMSTAT_RELAUNCH_HIGH)) {
6245 				memorystatus_log("memorystatus: killing %s [%d] in band %d "
6246 				    "with high relaunch probability\n",
6247 				    proc_best_name(p), aPid, aPid_ep);
6248 			}
6249 			memorystatus_log(
6250 				"memorystatus: %s%d pid %d [%s] (%s %d) - memorystatus_available_pages: %llu\n",
6251 				((aPid_ep == JETSAM_PRIORITY_IDLE) ? "killing_idle_process_aggressive" : "killing_top_process_aggressive"),
6252 				aggr_count, aPid, proc_best_name(p),
6253 				memorystatus_kill_cause_name[cause], aPid_ep, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
6254 
6255 			memorystatus_level_snapshot = memorystatus_level;
6256 
6257 			/*
6258 			 * memorystatus_do_kill() drops a reference, so take another one so we can
6259 			 * continue to use this exit reason even after memorystatus_do_kill()
6260 			 * returns.
6261 			 */
6262 			os_reason_ref(jetsam_reason);
6263 			killed = memorystatus_do_kill(p, cause, jetsam_reason, &footprint_of_killed_proc);
6264 
6265 			/* Success? */
6266 			if (killed) {
6267 				*memory_reclaimed += footprint_of_killed_proc;
6268 				proc_rele(p);
6269 				kill_count++;
6270 				p = NULL;
6271 				killed = FALSE;
6272 
6273 				/*
6274 				 * Continue the killing spree.
6275 				 */
6276 				proc_list_lock();
6277 				if (next_p) {
6278 					proc_rele(next_p);
6279 				}
6280 
6281 				if (kill_count == max_kills) {
6282 					memorystatus_log_info(
6283 						"memorystatus: giving up aggressive kill after killing "
6284 						"%d processes below band %d.\n",
6285 						max_kills, priority_max + 1);
6286 					break;
6287 				}
6288 
6289 				if (aPid_ep == JETSAM_PRIORITY_FOREGROUND && memorystatus_aggressive_jetsam_lenient == TRUE) {
6290 					if (memorystatus_level > memorystatus_level_snapshot && ((memorystatus_level - memorystatus_level_snapshot) >= AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD)) {
6291 #if DEVELOPMENT || DEBUG
6292 						memorystatus_log_info("Disabling Lenient mode after one-time deployment.\n");
6293 #endif /* DEVELOPMENT || DEBUG */
6294 						memorystatus_aggressive_jetsam_lenient = FALSE;
6295 						break;
6296 					}
6297 				}
6298 
6299 				continue;
6300 			}
6301 
6302 			/*
6303 			 * Failure - first unwind the state,
6304 			 * then fall through to restart the search.
6305 			 */
6306 			proc_list_lock();
6307 			proc_rele(p);
6308 			if (next_p) {
6309 				proc_rele(next_p);
6310 			}
6311 			p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
6312 			p->p_memstat_state |= P_MEMSTAT_ERROR;
6313 			*errors += 1;
6314 			p = NULL;
6315 		}
6316 
6317 		/*
6318 		 * Failure - restart the search at the beginning of
6319 		 * the band we were already traversing.
6320 		 *
6321 		 * We might have raced with "p" exiting on another core, resulting in no
6322 		 * ref on "p".  Or, we may have failed to kill "p".
6323 		 *
6324 		 * Either way, we fall thru to here, leaving the proc in the
6325 		 * P_MEMSTAT_TERMINATED or P_MEMSTAT_ERROR state.
6326 		 *
6327 		 * And, we hold the the proc_list_lock at this point.
6328 		 */
6329 
6330 		next_p = memorystatus_get_first_proc_locked(&priority_band, TRUE);
6331 	}
6332 
6333 	proc_list_unlock();
6334 
6335 exit:
6336 	os_reason_free(jetsam_reason);
6337 
6338 	/* Clear snapshot if freshly captured and no target was found */
6339 	if (new_snapshot && (kill_count == 0)) {
6340 		proc_list_lock();
6341 		memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
6342 		proc_list_unlock();
6343 	}
6344 
6345 	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
6346 	    MEMORYSTATUS_LOG_AVAILABLE_PAGES, 0, kill_count, *memory_reclaimed);
6347 
6348 	return kill_count > 0;
6349 }
6350 
/*
 * Kill (or purge) the first process found to be over its high-water memory
 * limit, scanning from the lowest jetsam band upward.
 *
 * Parameters:
 *   errors           - out: incremented once per failed kill attempt
 *   purged           - out: set TRUE when the candidate was spared because
 *                      purging brought it back under its limit
 *   memory_reclaimed - out: footprint of the killed proc; 0 if none killed
 *
 * Returns TRUE iff a process was killed.
 */
static boolean_t
memorystatus_kill_hiwat_proc(uint32_t *errors, boolean_t *purged, uint64_t *memory_reclaimed)
{
	pid_t aPid = 0;
	proc_t p = PROC_NULL, next_p = PROC_NULL;
	bool new_snapshot = false, killed = false, freed_mem = false;
	unsigned int i = 0;
	uint32_t aPid_ep;
	os_reason_t jetsam_reason = OS_REASON_NULL;
	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_START,
	    MEMORYSTATUS_LOG_AVAILABLE_PAGES);

	jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_HIGHWATER);
	if (jetsam_reason == OS_REASON_NULL) {
		/* Proceed anyway; the kill matters more than the exit reason. */
		memorystatus_log_error("memorystatus_kill_hiwat_proc: failed to allocate exit reason\n");
	}

	proc_list_lock();

	next_p = memorystatus_get_first_proc_locked(&i, TRUE);
	while (next_p) {
		uint64_t footprint_in_bytes = 0;
		uint64_t memlimit_in_bytes  = 0;
		boolean_t skip = 0;

		p = next_p;
		next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);

		aPid = proc_getpid(p);
		aPid_ep = p->p_memstat_effectivepriority;

		/* Skip procs already erroring out, terminating, or marked skip. */
		if (p->p_memstat_state  & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP)) {
			continue;
		}

		/* skip if no limit set */
		if (p->p_memstat_memlimit <= 0) {
			continue;
		}

		/* Candidate only if its physical footprint exceeds its limit. */
		footprint_in_bytes = get_task_phys_footprint(proc_task(p));
		memlimit_in_bytes  = (((uint64_t)p->p_memstat_memlimit) * 1024ULL * 1024ULL);   /* convert MB to bytes */
		skip = (footprint_in_bytes <= memlimit_in_bytes);

#if CONFIG_FREEZE
		/* Never touch procs locked by the freezer. */
		if (!skip) {
			if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
				skip = TRUE;
			} else {
				skip = FALSE;
			}
		}
#endif

		if (skip) {
			continue;
		} else {
			if (memorystatus_jetsam_snapshot_count == 0) {
				memorystatus_init_jetsam_snapshot_locked(NULL, 0);
				new_snapshot = true;
			}

			if (proc_ref(p, true) == p) {
				/*
				 * Mark as terminated so that if exit1() indicates success, but the process (for example)
				 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
				 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
				 * acquisition of the proc lock.
				 */
				p->p_memstat_state |= P_MEMSTAT_TERMINATED;

				proc_list_unlock();
			} else {
				/*
				 * We need to restart the search again because
				 * proc_ref _can_ drop the proc_list lock
				 * and we could have lost our stored next_p via
				 * an exit() on another core.
				 */
				i = 0;
				next_p = memorystatus_get_first_proc_locked(&i, TRUE);
				continue;
			}

			/* Reused as the kill path's out-param: filled with bytes killed. */
			footprint_in_bytes = 0;
			freed_mem = memorystatus_kill_proc(p, kMemorystatusKilledHiwat, jetsam_reason, &killed, &footprint_in_bytes); /* purged and/or killed 'p' */

			/* Success? */
			if (freed_mem) {
				if (!killed) {
					/* purged 'p'..don't reset HWM candidate count */
					*purged = TRUE;

					proc_list_lock();
					p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
					proc_list_unlock();
				} else {
					*memory_reclaimed = footprint_in_bytes;
				}
				proc_rele(p);
				goto exit;
			}
			/*
			 * Failure - first unwind the state,
			 * then fall through to restart the search.
			 */
			proc_list_lock();
			proc_rele(p);
			p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
			p->p_memstat_state |= P_MEMSTAT_ERROR;
			*errors += 1;

			i = 0;
			next_p = memorystatus_get_first_proc_locked(&i, TRUE);
		}
	}

	proc_list_unlock();

exit:
	os_reason_free(jetsam_reason);

	if (!killed) {
		*memory_reclaimed = 0;

		/* Clear snapshot if freshly captured and no target was found */
		if (new_snapshot) {
			proc_list_lock();
			memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
			proc_list_unlock();
		}
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_END,
	    MEMORYSTATUS_LOG_AVAILABLE_PAGES, killed ? aPid : 0, killed, *memory_reclaimed, 0);

	return killed;
}
6489 
6490 /*
6491  * Jetsam a process pinned in the elevated band.
6492  *
6493  * Return:  true -- a pinned process was jetsammed
6494  *	    false -- no pinned process was jetsammed
6495  */
6496 boolean_t
memorystatus_kill_elevated_process(uint32_t cause,os_reason_t jetsam_reason,unsigned int band,int aggr_count,uint32_t * errors,uint64_t * memory_reclaimed)6497 memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, unsigned int band, int aggr_count, uint32_t *errors, uint64_t *memory_reclaimed)
6498 {
6499 	pid_t aPid = 0;
6500 	proc_t p = PROC_NULL, next_p = PROC_NULL;
6501 	boolean_t new_snapshot = FALSE, killed = FALSE;
6502 	int kill_count = 0;
6503 	uint32_t aPid_ep;
6504 	uint64_t killtime = 0;
6505 	clock_sec_t     tv_sec;
6506 	clock_usec_t    tv_usec;
6507 	uint32_t        tv_msec;
6508 	uint64_t footprint_of_killed_proc = 0;
6509 
6510 
6511 	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
6512 	    MEMORYSTATUS_LOG_AVAILABLE_PAGES);
6513 
6514 #if CONFIG_FREEZE
6515 	boolean_t consider_frozen_only = FALSE;
6516 
6517 	if (band == (unsigned int) memorystatus_freeze_jetsam_band) {
6518 		consider_frozen_only = TRUE;
6519 	}
6520 #endif /* CONFIG_FREEZE */
6521 
6522 	proc_list_lock();
6523 
6524 	next_p = memorystatus_get_first_proc_locked(&band, FALSE);
6525 	while (next_p) {
6526 		p = next_p;
6527 		next_p = memorystatus_get_next_proc_locked(&band, p, FALSE);
6528 
6529 		aPid = proc_getpid(p);
6530 		aPid_ep = p->p_memstat_effectivepriority;
6531 
6532 		/*
6533 		 * Only pick a process pinned in this elevated band
6534 		 */
6535 		if (!_memstat_proc_is_elevated(p)) {
6536 			continue;
6537 		}
6538 
6539 		if (p->p_memstat_state  & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP)) {
6540 			continue;
6541 		}
6542 
6543 #if CONFIG_FREEZE
6544 		if (consider_frozen_only && !_memstat_proc_is_frozen(p)) {
6545 			continue;
6546 		}
6547 
6548 		if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
6549 			continue;
6550 		}
6551 #endif /* CONFIG_FREEZE */
6552 
6553 #if DEVELOPMENT || DEBUG
6554 		memorystatus_log_info(
6555 			"jetsam: elevated%d process pid %d [%s] - memorystatus_available_pages: %d\n",
6556 			aggr_count, aPid, (*p->p_name ? p->p_name : "unknown"), MEMORYSTATUS_LOG_AVAILABLE_PAGES);
6557 #endif /* DEVELOPMENT || DEBUG */
6558 
6559 		if (memorystatus_jetsam_snapshot_count == 0) {
6560 			memorystatus_init_jetsam_snapshot_locked(NULL, 0);
6561 			new_snapshot = TRUE;
6562 		}
6563 
6564 		p->p_memstat_state |= P_MEMSTAT_TERMINATED;
6565 
6566 		killtime = mach_absolute_time();
6567 		absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
6568 		tv_msec = tv_usec / 1000;
6569 
6570 		memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
6571 
6572 		if (proc_ref(p, true) == p) {
6573 			proc_list_unlock();
6574 
6575 			/*
6576 			 * memorystatus_do_kill drops a reference, so take another one so we can
6577 			 * continue to use this exit reason even after memorystatus_do_kill()
6578 			 * returns
6579 			 */
6580 			os_reason_ref(jetsam_reason);
6581 			killed = memorystatus_do_kill(p, cause, jetsam_reason, &footprint_of_killed_proc);
6582 
6583 			memorystatus_log("%lu.%03d memorystatus: killing_top_process_elevated%d pid %d [%s] (%s %d) %lluKB - memorystatus_available_pages: %llu\n",
6584 			    (unsigned long)tv_sec, tv_msec,
6585 			    aggr_count,
6586 			    aPid, ((p && *p->p_name) ? p->p_name : "unknown"),
6587 			    memorystatus_kill_cause_name[cause], aPid_ep,
6588 			    footprint_of_killed_proc >> 10, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
6589 
6590 			/* Success? */
6591 			if (killed) {
6592 				*memory_reclaimed = footprint_of_killed_proc;
6593 				proc_rele(p);
6594 				kill_count++;
6595 				goto exit;
6596 			}
6597 
6598 			/*
6599 			 * Failure - first unwind the state,
6600 			 * then fall through to restart the search.
6601 			 */
6602 			proc_list_lock();
6603 			proc_rele(p);
6604 			p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
6605 			p->p_memstat_state |= P_MEMSTAT_ERROR;
6606 			*errors += 1;
6607 		}
6608 
6609 		/*
6610 		 * Failure - restart the search.
6611 		 *
6612 		 * We might have raced with "p" exiting on another core, resulting in no
6613 		 * ref on "p".  Or, we may have failed to kill "p".
6614 		 *
6615 		 * Either way, we fall thru to here, leaving the proc in the
6616 		 * P_MEMSTAT_TERMINATED state or P_MEMSTAT_ERROR state.
6617 		 *
6618 		 * And, we hold the the proc_list_lock at this point.
6619 		 */
6620 
6621 		next_p = memorystatus_get_first_proc_locked(&band, FALSE);
6622 	}
6623 
6624 	proc_list_unlock();
6625 
6626 exit:
6627 	os_reason_free(jetsam_reason);
6628 
6629 	if (kill_count == 0) {
6630 		*memory_reclaimed = 0;
6631 
6632 		/* Clear snapshot if freshly captured and no target was found */
6633 		if (new_snapshot) {
6634 			proc_list_lock();
6635 			memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
6636 			proc_list_unlock();
6637 		}
6638 	}
6639 
6640 	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
6641 	    MEMORYSTATUS_LOG_AVAILABLE_PAGES, killed ? aPid : 0, kill_count, *memory_reclaimed);
6642 
6643 	return killed;
6644 }
6645 
6646 boolean_t
memorystatus_kill_on_VM_compressor_space_shortage(boolean_t async)6647 memorystatus_kill_on_VM_compressor_space_shortage(boolean_t async)
6648 {
6649 	if (async) {
6650 		os_atomic_store(&memorystatus_compressor_space_shortage, true, release);
6651 		memorystatus_thread_wake();
6652 		return true;
6653 	} else {
6654 		os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE);
6655 		if (jetsam_reason == OS_REASON_NULL) {
6656 			memorystatus_log_error("memorystatus_kill_on_VM_compressor_space_shortage -- sync: failed to allocate jetsam reason\n");
6657 		}
6658 
6659 		return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMCompressorSpaceShortage, jetsam_reason);
6660 	}
6661 }
6662 
6663 #if CONFIG_JETSAM
6664 
/*
 * Note that the VM pageout scan daemon (vps) is starved for pages and
 * wake the jetsam thread so memory can be reclaimed asynchronously.
 */
void
memorystatus_kill_on_vps_starvation(void)
{
	os_atomic_store(&memorystatus_pageout_starved, true, release);
	memorystatus_thread_wake();
}
6671 
/*
 * Synchronously kill a process because the system has run out of vnodes.
 * Targets any eligible process (pid -1). Returns TRUE if a kill occurred.
 */
boolean_t
memorystatus_kill_on_vnode_limit(void)
{
	os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_VNODE);
	if (jetsam_reason == OS_REASON_NULL) {
		/* Log and proceed with a NULL reason rather than skipping the kill. */
		memorystatus_log_error("memorystatus_kill_on_vnode_limit: failed to allocate jetsam reason\n");
	}

	return memorystatus_kill_process_sync(-1, kMemorystatusKilledVnodes, jetsam_reason);
}
6682 
6683 boolean_t
memorystatus_kill_on_sustained_pressure()6684 memorystatus_kill_on_sustained_pressure()
6685 {
6686 	os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_SUSTAINED_PRESSURE);
6687 	if (jetsam_reason == OS_REASON_NULL) {
6688 		memorystatus_log_error("memorystatus_kill_on_FC_thrashing -- sync: failed to allocate jetsam reason\n");
6689 	}
6690 
6691 	return memorystatus_kill_process_sync(-1, kMemorystatusKilledSustainedPressure, jetsam_reason);
6692 }
6693 
6694 boolean_t
memorystatus_kill_with_jetsam_reason_sync(pid_t pid,os_reason_t jetsam_reason)6695 memorystatus_kill_with_jetsam_reason_sync(pid_t pid, os_reason_t jetsam_reason)
6696 {
6697 	uint32_t kill_cause = jetsam_reason->osr_code <= JETSAM_REASON_MEMORYSTATUS_MAX ?
6698 	    (uint32_t) jetsam_reason->osr_code : JETSAM_REASON_INVALID;
6699 	return memorystatus_kill_process_sync(pid, kill_cause, jetsam_reason);
6700 }
6701 
6702 #endif /* CONFIG_JETSAM */
6703 
6704 boolean_t
memorystatus_kill_on_zone_map_exhaustion(pid_t pid)6705 memorystatus_kill_on_zone_map_exhaustion(pid_t pid)
6706 {
6707 	boolean_t res = FALSE;
6708 	if (pid == -1) {
6709 		os_atomic_store(&memorystatus_zone_map_is_exhausted, true, release);
6710 		memorystatus_thread_wake();
6711 		return true;
6712 	} else {
6713 		os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_ZONE_MAP_EXHAUSTION);
6714 		if (jetsam_reason == OS_REASON_NULL) {
6715 			memorystatus_log_error("memorystatus_kill_on_zone_map_exhaustion: failed to allocate jetsam reason\n");
6716 		}
6717 
6718 		res = memorystatus_kill_process_sync(pid, kMemorystatusKilledZoneMapExhaustion, jetsam_reason);
6719 	}
6720 	return res;
6721 }
6722 
/*
 * Hook invoked at the end of a VM pageout scan.
 * Intentionally empty in this configuration.
 */
void
memorystatus_on_pageout_scan_end(void)
{
	/* No-op */
}
6728 
/* Return both allocated and actual size, since there's a race between allocation and list compilation */
/*
 * Build an array of memorystatus_priority_entry_t records, one per process
 * in the jetsam priority bands, for copyout by the caller.
 *
 * On entry *buffer_size is the caller's buffer capacity. On success exit,
 * *buffer_size is the allocated size and *list_size the bytes actually
 * populated (processes may exit between sizing and compilation, so these
 * can differ). With size_only, only the estimated size is reported.
 *
 * Returns 0 on success, EINVAL if the caller's buffer is too small, or
 * ENOMEM on allocation failure. On success the caller owns and must free
 * *list_ptr.
 */
static int
memorystatus_get_priority_list(memorystatus_priority_entry_t **list_ptr, size_t *buffer_size, size_t *list_size, boolean_t size_only)
{
	uint32_t list_count, i = 0;
	memorystatus_priority_entry_t *list_entry;
	proc_t p;

	/* Unlocked snapshot of the band population; racy by design (see above). */
	list_count = memorystatus_list_count;
	*list_size = sizeof(memorystatus_priority_entry_t) * list_count;

	/* Just a size check? */
	if (size_only) {
		return 0;
	}

	/* Otherwise, validate the size of the buffer */
	if (*buffer_size < *list_size) {
		return EINVAL;
	}

	*list_ptr = kalloc_data(*list_size, Z_WAITOK | Z_ZERO);
	if (!*list_ptr) {
		return ENOMEM;
	}

	*buffer_size = *list_size;
	*list_size = 0;

	list_entry = *list_ptr;

	proc_list_lock();

	/* Walk every band, filling entries until the allocation is exhausted. */
	p = memorystatus_get_first_proc_locked(&i, TRUE);
	while (p && (*list_size < *buffer_size)) {
		list_entry->pid = proc_getpid(p);
		list_entry->priority = p->p_memstat_effectivepriority;
		list_entry->user_data = p->p_memstat_userdata;

		if (p->p_memstat_memlimit <= 0) {
			/* No explicit memstat limit; report the task's footprint limit instead. */
			task_get_phys_footprint_limit(proc_task(p), &list_entry->limit);
		} else {
			list_entry->limit = p->p_memstat_memlimit;
		}

		list_entry->state = memorystatus_build_state(p);
		list_entry++;

		*list_size += sizeof(memorystatus_priority_entry_t);

		p = memorystatus_get_next_proc_locked(&i, p, TRUE);
	}

	proc_list_unlock();

	memorystatus_log_debug("memorystatus_get_priority_list: returning %lu for size\n", (unsigned long)*list_size);

	return 0;
}
6788 
/*
 * Fill a single memorystatus_priority_entry_t for `pid` and copy it out to
 * `buffer`. If the process is no longer live, fall back to a zombie
 * reference; in that case the stored memstat limit is reported rather than
 * querying the task (which is gone).
 *
 * Returns 0 on success, EINVAL on bad arguments, ESRCH if the pid cannot
 * be found at all, or an errno from the task query / copyout.
 */
static int
memorystatus_get_priority_pid(pid_t pid, user_addr_t buffer, size_t buffer_size)
{
	int error = 0;
	bool zombref = false;
	memorystatus_priority_entry_t mp_entry;
	kern_return_t ret;

	/* Validate inputs */
	if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_priority_entry_t))) {
		return EINVAL;
	}

	proc_t p = proc_find(pid);
	if (!p) {
		/* Not live; try a zombie (exited but not yet reaped). */
		zombref = true;
		p = proc_find_zombref(pid);
		if (!p) {
			return ESRCH;
		}
	}

	memset(&mp_entry, 0, sizeof(memorystatus_priority_entry_t));

	mp_entry.pid = proc_getpid(p);
	mp_entry.priority = p->p_memstat_effectivepriority;
	mp_entry.user_data = p->p_memstat_userdata;
	if (p->p_memstat_memlimit <= 0 && !zombref) {
		/* No explicit memstat limit; ask the live task for its footprint limit. */
		task_t task = proc_task(p);
		assert(task);
		ret = task_get_phys_footprint_limit(task, &mp_entry.limit);
		if (ret != KERN_SUCCESS) {
			error = mach_to_bsd_errno(ret);
			goto done;
		}
	} else {
		mp_entry.limit = p->p_memstat_memlimit;
	}
	mp_entry.state = memorystatus_build_state(p);

	error = copyout(&mp_entry, buffer, buffer_size);

done:
	/* Drop whichever reference type we took above. */
	if (zombref) {
		proc_drop_zombref(p);
	} else {
		proc_rele(p);
	}

	return error;
}
6840 
/*
 * MEMORYSTATUS_CMD_GET_PRIORITY_LIST handler.
 *
 * pid != 0: single-entry query for that pid.
 * pid == 0: full list of every process in the jetsam bands.
 * A NULL buffer makes the call a size-only probe; the required (or
 * copied) byte count is returned via *retval on success.
 */
static int
memorystatus_cmd_get_priority_list(pid_t pid, user_addr_t buffer, size_t buffer_size, int32_t *retval)
{
	int error = 0;
	boolean_t size_only;
	size_t list_size;

	/*
	 * When a non-zero pid is provided, the 'list' has only one entry.
	 */

	size_only = ((buffer == USER_ADDR_NULL) ? TRUE: FALSE);

	if (pid != 0) {
		list_size = sizeof(memorystatus_priority_entry_t) * 1;
		if (!size_only) {
			error = memorystatus_get_priority_pid(pid, buffer, buffer_size);
		}
	} else {
		memorystatus_priority_entry_t *list = NULL;
		error = memorystatus_get_priority_list(&list, &buffer_size, &list_size, size_only);

		if (error == 0) {
			if (!size_only) {
				error = copyout(list, buffer, list_size);
			}

			/* buffer_size was updated to the allocated size by the call above. */
			kfree_data(list, buffer_size);
		}
	}

	if (error == 0) {
		assert(list_size <= INT32_MAX);
		*retval = (int32_t) list_size;
	}

	return error;
}
6879 
6880 static void
memorystatus_clear_errors(void)6881 memorystatus_clear_errors(void)
6882 {
6883 	proc_t p;
6884 	unsigned int i = 0;
6885 
6886 	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_START);
6887 
6888 	proc_list_lock();
6889 
6890 	p = memorystatus_get_first_proc_locked(&i, TRUE);
6891 	while (p) {
6892 		if (p->p_memstat_state & P_MEMSTAT_ERROR) {
6893 			p->p_memstat_state &= ~P_MEMSTAT_ERROR;
6894 		}
6895 		p = memorystatus_get_next_proc_locked(&i, p, TRUE);
6896 	}
6897 
6898 	proc_list_unlock();
6899 
6900 	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_END);
6901 }
6902 
/*
 * Enable or disable the fast-jetsam override.
 *
 * enable_override == true turns fast jetsam off: any pre-configured
 * jetsam policy is reset to the default, the thread pool is returned to
 * its default size, and the jetsam thread is nudged so the change takes
 * effect. No-op on non-jetsam configurations.
 */
void
memorystatus_fast_jetsam_override(bool enable_override)
{
#if CONFIG_JETSAM
	fast_jetsam_enabled = !enable_override;
	if (!fast_jetsam_enabled) {
		/* Disable any pre-configured policies */
		os_atomic_store(&memstat_policy_config, kPolicyDefault, relaxed);
		memorystatus_thread_pool_default();
		_memstat_consider_waking_jetsam_thread();
	}
#else /* CONFIG_JETSAM */
	(void)enable_override;
#endif /* CONFIG_JETSAM */
}
6918 
/*
 * Get the at_boot snapshot
 *
 * Returns (via *snapshot) a pointer to the static at-boot jetsam
 * snapshot header; it carries no entry list. On entry *snapshot_size is
 * the caller's buffer capacity; on exit it is the header size. With
 * size_only, only the size is reported.
 *
 * Returns 0 on success, EINVAL if the caller's buffer is too small.
 */
static int
memorystatus_get_at_boot_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only)
{
	size_t input_size = *snapshot_size;

	/*
	 * The at_boot snapshot has no entry list.
	 */
	*snapshot_size = sizeof(memorystatus_jetsam_snapshot_t);

	if (size_only) {
		return 0;
	}

	/*
	 * Validate the size of the snapshot buffer
	 */
	if (input_size < *snapshot_size) {
		return EINVAL;
	}

	/*
	 * Update the notification_time only
	 */
	memorystatus_at_boot_snapshot.notification_time = mach_absolute_time();
	*snapshot = &memorystatus_at_boot_snapshot;

	memorystatus_log_debug(
		"memorystatus_get_at_boot_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%d)\n",
		(long)input_size, (long)*snapshot_size, 0);
	return 0;
}
6954 
6955 #if CONFIG_FREEZE
/*
 * Return (via *snapshot) a pointer to the freezer jetsam snapshot.
 *
 * On entry *snapshot_size is the caller's buffer capacity; on exit it is
 * the size of the header plus all populated entries (0 when the snapshot
 * is empty). With size_only, only the size is reported.
 *
 * Returns 0 on success, EINVAL if the caller's buffer is too small.
 */
static int
memorystatus_get_jetsam_snapshot_freezer(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only)
{
	size_t input_size = *snapshot_size;

	if (memorystatus_jetsam_snapshot_freezer->entry_count > 0) {
		*snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_freezer->entry_count));
	} else {
		*snapshot_size = 0;
	}
	/* The populated size can never exceed the snapshot's allocation. */
	assert(*snapshot_size <= memorystatus_jetsam_snapshot_freezer_size);

	if (size_only) {
		return 0;
	}

	if (input_size < *snapshot_size) {
		return EINVAL;
	}

	*snapshot = memorystatus_jetsam_snapshot_freezer;

	memorystatus_log_debug(
		"memorystatus_get_jetsam_snapshot_freezer: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
		(long)input_size, (long)*snapshot_size, (long)memorystatus_jetsam_snapshot_freezer->entry_count);

	return 0;
}
6984 #endif /* CONFIG_FREEZE */
6985 
/*
 * Capture a fresh "on demand" jetsam snapshot of the current process list
 * into a newly allocated buffer, returned via *snapshot. The caller is
 * responsible for freeing it (see memorystatus_cmd_get_jetsam_snapshot).
 *
 * On entry *snapshot_size is the caller's buffer capacity; on exit it is
 * the size of the captured snapshot. With size_only, only the estimated
 * size is reported and nothing is allocated.
 *
 * Returns 0 on success, EINVAL if the caller's buffer is too small, or
 * ENOMEM on allocation failure.
 */
static int
memorystatus_get_on_demand_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only)
{
	size_t input_size = *snapshot_size;
	/* Unlocked snapshot of the band population; racy (see below). */
	uint32_t ods_list_count = memorystatus_list_count;
	memorystatus_jetsam_snapshot_t *ods = NULL;     /* The on_demand snapshot buffer */

	*snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (ods_list_count));

	if (size_only) {
		return 0;
	}

	/*
	 * Validate the size of the snapshot buffer.
	 * This is inherently racey. May want to revisit
	 * this error condition and trim the output when
	 * it doesn't fit.
	 */
	if (input_size < *snapshot_size) {
		return EINVAL;
	}

	/*
	 * Allocate and initialize a snapshot buffer.
	 */
	ods = kalloc_data(*snapshot_size, Z_WAITOK | Z_ZERO);
	if (!ods) {
		return ENOMEM;
	}

	proc_list_lock();
	memorystatus_init_jetsam_snapshot_locked(ods, ods_list_count);
	proc_list_unlock();

	/*
	 * Return the kernel allocated, on_demand buffer.
	 * The caller of this routine will copy the data out
	 * to user space and then free the kernel allocated
	 * buffer.
	 */
	*snapshot = ods;

	memorystatus_log_debug(
		"memorystatus_get_on_demand_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
		(long)input_size, (long)*snapshot_size, (long)ods_list_count);

	return 0;
}
7035 
/*
 * Return (via *snapshot) a pointer to the default jetsam snapshot.
 *
 * On entry *snapshot_size is the caller's buffer capacity; on exit it is
 * the size of the header plus all populated entries (0 when the snapshot
 * is empty). With size_only, only the size is reported.
 *
 * Returns 0 on success, EINVAL if the caller's buffer is too small.
 */
static int
memorystatus_get_jetsam_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only)
{
	size_t input_size = *snapshot_size;

	if (memorystatus_jetsam_snapshot_count > 0) {
		*snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count));
	} else {
		*snapshot_size = 0;
	}

	if (size_only) {
		return 0;
	}

	if (input_size < *snapshot_size) {
		return EINVAL;
	}

	*snapshot = memorystatus_jetsam_snapshot;

	memorystatus_log_debug(
		"memorystatus_get_jetsam_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
		(long)input_size, (long)*snapshot_size, (long)memorystatus_jetsam_snapshot_count);

	return 0;
}
7063 
7064 #if JETSAM_ZPRINT_SNAPSHOT
7065 /*
7066  * Utility function to handle copyout of jetsam zprint snapshot data
7067  */
7068 static int
memorystatus_cmd_get_data_buffer(user_addr_t buffer,size_t buffer_size,int32_t * retval,size_t data_size,void * data)7069 memorystatus_cmd_get_data_buffer(
7070 	user_addr_t  buffer,
7071 	size_t       buffer_size,
7072 	int32_t      *retval,
7073 	size_t       data_size,
7074 	void         *data)
7075 {
7076 	boolean_t size_only = (buffer == USER_ADDR_NULL);
7077 	int error;
7078 
7079 	/* Nothing to return if there's no data yet, instruct the caller to try again later. */
7080 	if (data == NULL) {
7081 		*retval = -1;
7082 		return EAGAIN;
7083 	}
7084 
7085 	/* Handle just a size request */
7086 	if (size_only) {
7087 		*retval = (int32_t)data_size;
7088 		return 0;
7089 	}
7090 
7091 	/* buffer needs to be large enough */
7092 	if (buffer_size < data_size) {
7093 		*retval = -1;
7094 		return EINVAL;
7095 	}
7096 
7097 	error = copyout(data, buffer, data_size);
7098 	if (error == 0) {
7099 		*retval = (int32_t)data_size;
7100 	} else {
7101 		*retval = -1;
7102 	}
7103 
7104 	return error;
7105 }
7106 #endif
7107 
/*
 * MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT handler.
 *
 * Selects a snapshot variant via `flags` (0 == default snapshot;
 * otherwise exactly one of ON_DEMAND / AT_BOOT / FREEZER), copies it to
 * user space, then "clears" it: the default and freezer snapshots have
 * their entry counts reset, the on-demand buffer is freed, and the
 * at-boot snapshot is left untouched. A NULL buffer makes this a
 * size-only probe. On success *retval carries the byte count.
 */
static int
memorystatus_cmd_get_jetsam_snapshot(int32_t flags, user_addr_t buffer, size_t buffer_size, int32_t *retval)
{
	int error = EINVAL;
	boolean_t size_only;
	boolean_t is_default_snapshot = FALSE;
	boolean_t is_on_demand_snapshot = FALSE;
	boolean_t is_at_boot_snapshot = FALSE;
#if CONFIG_FREEZE
	bool is_freezer_snapshot = false;
#endif /* CONFIG_FREEZE */
	memorystatus_jetsam_snapshot_t *snapshot;

	size_only = ((buffer == USER_ADDR_NULL) ? TRUE : FALSE);

	if (flags == 0) {
		/* Default */
		is_default_snapshot = TRUE;
		error = memorystatus_get_jetsam_snapshot(&snapshot, &buffer_size, size_only);
	} else {
		if (flags & ~(MEMORYSTATUS_SNAPSHOT_ON_DEMAND | MEMORYSTATUS_SNAPSHOT_AT_BOOT | MEMORYSTATUS_FLAGS_SNAPSHOT_FREEZER)) {
			/*
			 * Unsupported bit set in flag.
			 */
			return EINVAL;
		}

		if (flags & (flags - 0x1)) {
			/*
			 * Can't have multiple flags set at the same time.
			 * (flags & (flags - 1)) clears the lowest set bit, so a
			 * non-zero result means more than one bit was set.
			 */
			return EINVAL;
		}

		if (flags & MEMORYSTATUS_SNAPSHOT_ON_DEMAND) {
			is_on_demand_snapshot = TRUE;
			/*
			 * When not requesting the size only, the following call will allocate
			 * an on_demand snapshot buffer, which is freed below.
			 */
			error = memorystatus_get_on_demand_snapshot(&snapshot, &buffer_size, size_only);
		} else if (flags & MEMORYSTATUS_SNAPSHOT_AT_BOOT) {
			is_at_boot_snapshot = TRUE;
			error = memorystatus_get_at_boot_snapshot(&snapshot, &buffer_size, size_only);
#if CONFIG_FREEZE
		} else if (flags & MEMORYSTATUS_FLAGS_SNAPSHOT_FREEZER) {
			is_freezer_snapshot = true;
			error = memorystatus_get_jetsam_snapshot_freezer(&snapshot, &buffer_size, size_only);
#endif /* CONFIG_FREEZE */
		} else {
			/*
			 * Invalid flag setting.
			 */
			return EINVAL;
		}
	}

	if (error) {
		goto out;
	}

	/*
	 * Copy the data out to user space and clear the snapshot buffer.
	 * If working with the jetsam snapshot,
	 *	clearing the buffer means, reset the count.
	 * If working with an on_demand snapshot
	 *	clearing the buffer means, free it.
	 * If working with the at_boot snapshot
	 *	there is nothing to clear or update.
	 * If working with a copy of the snapshot
	 *	there is nothing to clear or update.
	 * If working with the freezer snapshot
	 *	clearing the buffer means, reset the count.
	 */
	if (!size_only) {
		if ((error = copyout(snapshot, buffer, buffer_size)) == 0) {
#if CONFIG_FREEZE
			if (is_default_snapshot || is_freezer_snapshot) {
#else
			if (is_default_snapshot) {
#endif /* CONFIG_FREEZE */
				/*
				 * The jetsam snapshot is never freed, its count is simply reset.
				 * However, we make a copy for any parties that might be interested
				 * in the previous fully populated snapshot.
				 */
				proc_list_lock();
#if DEVELOPMENT || DEBUG
				if (memorystatus_testing_pid != 0 && memorystatus_testing_pid != proc_getpid(current_proc())) {
					/* Snapshot is currently owned by someone else. Don't consume it. */
					proc_list_unlock();
					goto out;
				}
#endif /* (DEVELOPMENT || DEBUG)*/
				if (is_default_snapshot) {
					snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
					memorystatus_jetsam_snapshot_last_timestamp = 0;
				}
#if CONFIG_FREEZE
				else if (is_freezer_snapshot) {
					memorystatus_jetsam_snapshot_freezer->entry_count = 0;
				}
#endif /* CONFIG_FREEZE */
				proc_list_unlock();
			}
		}

		if (is_on_demand_snapshot) {
			/*
			 * The on_demand snapshot is always freed,
			 * even if the copyout failed.
			 */
			kfree_data(snapshot, buffer_size);
		}
	}

out:
	if (error == 0) {
		assert(buffer_size <= INT32_MAX);
		*retval = (int32_t) buffer_size;
	}
	return error;
}
7231 
7232 #if DEVELOPMENT || DEBUG
7233 static int
7234 memorystatus_cmd_set_testing_pid(int32_t flags)
7235 {
7236 	int error = EINVAL;
7237 	proc_t caller = current_proc();
7238 	assert(caller != kernproc);
7239 	proc_list_lock();
7240 	if (flags & MEMORYSTATUS_FLAGS_SET_TESTING_PID) {
7241 		if (memorystatus_testing_pid == 0) {
7242 			memorystatus_testing_pid = proc_getpid(caller);
7243 			error = 0;
7244 		} else if (memorystatus_testing_pid == proc_getpid(caller)) {
7245 			error = 0;
7246 		} else {
7247 			/* We don't allow ownership to be taken from another proc. */
7248 			error = EBUSY;
7249 		}
7250 	} else if (flags & MEMORYSTATUS_FLAGS_UNSET_TESTING_PID) {
7251 		if (memorystatus_testing_pid == proc_getpid(caller)) {
7252 			memorystatus_testing_pid = 0;
7253 			error = 0;
7254 		} else if (memorystatus_testing_pid != 0) {
7255 			/* We don't allow ownership to be taken from another proc. */
7256 			error = EPERM;
7257 		}
7258 	}
7259 	proc_list_unlock();
7260 
7261 	return error;
7262 }
7263 #endif /* DEVELOPMENT || DEBUG */
7264 
7265 /*
7266  *      Routine:	memorystatus_cmd_grp_set_priorities
7267  *	Purpose:	Update priorities for a group of processes.
7268  *
7269  *	[priority]
7270  *		Move each process out of its effective priority
7271  *		band and into a new priority band.
7272  *		Maintains relative order from lowest to highest priority.
7273  *		In single band, maintains relative order from head to tail.
7274  *
7275  *		eg: before	[effectivepriority | pid]
7276  *				[18 | p101              ]
7277  *				[17 | p55, p67, p19     ]
7278  *				[12 | p103 p10          ]
7279  *				[ 7 | p25               ]
7280  *			        [ 0 | p71, p82,         ]
7281  *
7282  *		after	[ new band | pid]
7283  *			[ xxx | p71, p82, p25, p103, p10, p55, p67, p19, p101]
7284  *
7285  *	Returns:  0 on success, else non-zero.
7286  *
7287  *	Caveat:   We know there is a race window regarding recycled pids.
7288  *		  A process could be killed before the kernel can act on it here.
7289  *		  If a pid cannot be found in any of the jetsam priority bands,
7290  *		  then we simply ignore it.  No harm.
7291  *		  But, if the pid has been recycled then it could be an issue.
7292  *		  In that scenario, we might move an unsuspecting process to the new
7293  *		  priority band. It's not clear how the kernel can safeguard
7294  *		  against this, but it would be an extremely rare case anyway.
7295  *		  The caller of this api might avoid such race conditions by
7296  *		  ensuring that the processes passed in the pid list are suspended.
7297  */
7298 
7299 
/*
 * See the block comment above: move a group of pids into new priority
 * bands while preserving their relative low-to-high ordering.
 * Returns 0 on success; pids not found in any band are silently ignored.
 */
static int
memorystatus_cmd_grp_set_priorities(user_addr_t buffer, size_t buffer_size)
{
	/*
	 * We only handle setting priority
	 * per process
	 */
	int error = 0;
	memorystatus_properties_entry_v1_t *entries = NULL;
	size_t entry_count = 0;

	/* This will be the ordered proc list */
	typedef struct memorystatus_internal_properties {
		proc_t proc;
		int32_t priority;
	} memorystatus_internal_properties_t;

	memorystatus_internal_properties_t *table = NULL;
	uint32_t table_count = 0;

	size_t i = 0;
	uint32_t bucket_index = 0;
	int32_t new_priority;

	proc_t p;

	/* Verify inputs */
	if ((buffer == USER_ADDR_NULL) || (buffer_size == 0)) {
		error = EINVAL;
		goto out;
	}

	entry_count = (buffer_size / sizeof(memorystatus_properties_entry_v1_t));
	if (entry_count == 0) {
		/* buffer size was not large enough for a single entry */
		error = EINVAL;
		goto out;
	}

	if ((entries = kalloc_data(buffer_size, Z_WAITOK)) == NULL) {
		error = ENOMEM;
		goto out;
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_START, MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY, entry_count);

	if ((error = copyin(buffer, entries, buffer_size)) != 0) {
		goto out;
	}

	/* Verify sanity of input priorities */
	if (entries[0].version == MEMORYSTATUS_MPE_VERSION_1) {
		/* Buffer must be an exact multiple of the v1 entry size. */
		if ((buffer_size % MEMORYSTATUS_MPE_VERSION_1_SIZE) != 0) {
			error = EINVAL;
			goto out;
		}
	} else {
		error = EINVAL;
		goto out;
	}

	/* Normalize and range-check each requested priority. */
	for (i = 0; i < entry_count; i++) {
		if (entries[i].priority == -1) {
			/* Use as shorthand for default priority */
			entries[i].priority = JETSAM_PRIORITY_DEFAULT;
		} else if (entries[i].priority > JETSAM_PRIORITY_IDLE && entries[i].priority <= applications_aging_band) {
			/*
			 * Everything between idle and the aging bands are reserved for internal use.
			 * if requested, adjust to JETSAM_PRIORITY_IDLE.
			 * Entitled processes (just munch) can use a subset of this range for testing.
			 */
			if (entries[i].priority > JETSAM_PRIORITY_ENTITLED_MAX ||
			    !current_task_can_use_entitled_range()) {
				entries[i].priority = JETSAM_PRIORITY_IDLE;
			}
		} else if (entries[i].priority == JETSAM_PRIORITY_IDLE_HEAD) {
			/* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle
			 * queue */
			/* Deal with this later */
		} else if ((entries[i].priority < 0) || (entries[i].priority >= MEMSTAT_BUCKET_COUNT)) {
			/* Sanity check */
			error = EINVAL;
			goto out;
		}
	}

	table = kalloc_type(memorystatus_internal_properties_t, entry_count,
	    Z_WAITOK | Z_ZERO);
	if (table == NULL) {
		error = ENOMEM;
		goto out;
	}


	/*
	 * For each jetsam bucket entry, spin through the input property list.
	 * When a matching pid is found, populate an adjacent table with the
	 * appropriate proc pointer and new property values.
	 * This traversal automatically preserves order from lowest
	 * to highest priority.
	 */

	bucket_index = 0;

	proc_list_lock();

	/* Create the ordered table */
	p = memorystatus_get_first_proc_locked(&bucket_index, TRUE);
	while (p && (table_count < entry_count)) {
		for (i = 0; i < entry_count; i++) {
			if (proc_getpid(p) == entries[i].pid) {
				/* Build the table data  */
				table[table_count].proc = p;
				table[table_count].priority = entries[i].priority;
				table_count++;
				break;
			}
		}
		p = memorystatus_get_next_proc_locked(&bucket_index, p, TRUE);
	}

	/* We now have ordered list of procs ready to move */
	for (i = 0; i < table_count; i++) {
		p = table[i].proc;
		assert(p != NULL);
		memstat_priority_options_t priority_options = MEMSTAT_PRIORITY_OPTIONS_NONE;

		/* Allow head inserts -- but relative order is now  */
		if (table[i].priority == JETSAM_PRIORITY_IDLE_HEAD) {
			new_priority = JETSAM_PRIORITY_IDLE;
			priority_options |= MEMSTAT_PRIORITY_INSERT_HEAD;
		} else {
			new_priority = table[i].priority;
		}

		/* Not allowed */
		if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
			continue;
		}

		memstat_update_priority_locked(p, new_priority, priority_options);
	}

	proc_list_unlock();

	/*
	 * if (table_count != entry_count)
	 * then some pids were not found in a jetsam band.
	 * harmless but interesting...
	 */
out:
	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_END, MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY, entry_count, table_count);

	kfree_data(entries, buffer_size);
	kfree_type(memorystatus_internal_properties_t, entry_count, table);

	return error;
}
7458 
/*
 * Per-process-name use-probability table; updated under proc_list_lock
 * by memorystatus_cmd_grp_set_probabilities().
 */
memorystatus_internal_probabilities_t *memorystatus_global_probabilities_table = NULL;
size_t memorystatus_global_probabilities_size = 0;
7461 
/*
 * Replace the global per-process-name use-probability table from a
 * user-supplied list of memorystatus_properties_entry_v1_t records.
 * The new table is built, swapped in under proc_list_lock, and the old
 * table is freed after the lock is dropped.
 *
 * Returns 0 on success, EINVAL on malformed input, ENOMEM on allocation
 * failure, or an errno from copyin. On DEVELOPMENT/DEBUG kernels, EPERM
 * if another process currently owns the probabilities for testing.
 */
static int
memorystatus_cmd_grp_set_probabilities(user_addr_t buffer, size_t buffer_size)
{
	int error = 0;
	memorystatus_properties_entry_v1_t *entries = NULL;
	size_t entry_count = 0, i = 0;
	memorystatus_internal_probabilities_t *tmp_table_new = NULL, *tmp_table_old = NULL;
	size_t tmp_table_new_size = 0, tmp_table_old_size = 0;
#if DEVELOPMENT || DEBUG
	if (memorystatus_testing_pid != 0 && memorystatus_testing_pid != proc_getpid(current_proc())) {
		/* probabilites are currently owned by someone else. Don't change them. */
		error = EPERM;
		goto out;
	}
#endif /* (DEVELOPMENT || DEBUG)*/

	/* Verify inputs */
	if ((buffer == USER_ADDR_NULL) || (buffer_size == 0)) {
		error = EINVAL;
		goto out;
	}

	entry_count = (buffer_size / sizeof(memorystatus_properties_entry_v1_t));
	if (entry_count == 0) {
		error = EINVAL;
		goto out;
	}

	if ((entries = kalloc_data(buffer_size, Z_WAITOK)) == NULL) {
		error = ENOMEM;
		goto out;
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_START, MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY, entry_count);

	if ((error = copyin(buffer, entries, buffer_size)) != 0) {
		goto out;
	}

	/* Buffer must be v1 entries and an exact multiple of the v1 entry size. */
	if (entries[0].version == MEMORYSTATUS_MPE_VERSION_1) {
		if ((buffer_size % MEMORYSTATUS_MPE_VERSION_1_SIZE) != 0) {
			error = EINVAL;
			goto out;
		}
	} else {
		error = EINVAL;
		goto out;
	}

	/* Verify sanity of input priorities */
	for (i = 0; i < entry_count; i++) {
		/*
		 * 0 - low probability of use.
		 * 1 - high probability of use.
		 *
		 * Keeping this field an int (& not a bool) to allow
		 * us to experiment with different values/approaches
		 * later on.
		 */
		if (entries[i].use_probability > 1) {
			error = EINVAL;
			goto out;
		}
	}

	tmp_table_new_size = sizeof(memorystatus_internal_probabilities_t) * entry_count;

	if ((tmp_table_new = kalloc_data(tmp_table_new_size, Z_WAITOK | Z_ZERO)) == NULL) {
		error = ENOMEM;
		goto out;
	}

	proc_list_lock();

	/* Swap the new (still empty) table in; remember the old one for freeing. */
	if (memorystatus_global_probabilities_table) {
		tmp_table_old = memorystatus_global_probabilities_table;
		tmp_table_old_size = memorystatus_global_probabilities_size;
	}

	memorystatus_global_probabilities_table = tmp_table_new;
	memorystatus_global_probabilities_size = tmp_table_new_size;
	tmp_table_new = NULL;

	for (i = 0; i < entry_count; i++) {
		/* Build the table data  */
		strlcpy(memorystatus_global_probabilities_table[i].proc_name, entries[i].proc_name, MAXCOMLEN + 1);
		memorystatus_global_probabilities_table[i].use_probability = entries[i].use_probability;
	}

	proc_list_unlock();

out:
	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_END, MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY, entry_count, tmp_table_new_size);

	kfree_data(entries, buffer_size);
	/* Free the displaced table (no-op if there was none). */
	kfree_data(tmp_table_old, tmp_table_old_size);

	return error;
}
7561 
7562 static int
7563 memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
7564 {
7565 	int error = 0;
7566 
7567 	if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY) == MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY) {
7568 		error = memorystatus_cmd_grp_set_priorities(buffer, buffer_size);
7569 	} else if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY) == MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY) {
7570 		error = memorystatus_cmd_grp_set_probabilities(buffer, buffer_size);
7571 #if CONFIG_FREEZE
7572 	} else if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_FREEZE_PRIORITY) == MEMORYSTATUS_FLAGS_GRP_SET_FREEZE_PRIORITY) {
7573 		error = memorystatus_cmd_grp_set_freeze_list(buffer, buffer_size);
7574 	} else if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_DEMOTE_PRIORITY) == MEMORYSTATUS_FLAGS_GRP_SET_DEMOTE_PRIORITY) {
7575 		error = memorystatus_cmd_grp_set_demote_list(buffer, buffer_size);
7576 #endif /* CONFIG_FREEZE */
7577 	} else {
7578 		error = EINVAL;
7579 	}
7580 
7581 	return error;
7582 }
7583 
7584 /*
7585  * This routine is used to update a process's jetsam priority position and stored user_data.
7586  * It is not used for the setting of memory limits.
7587  *
7588  * Flags passed into this call are used to distinguish the motivation behind a jetsam priority
7589  * transition.  By default, the kernel updates the process's original requested priority when
7590  * no flag is passed.  But when the MEMORYSTATUS_SET_PRIORITY_ASSERTION flag is used, the kernel
7591  * updates the process's assertion driven priority.
7592  *
7593  * The assertion flag was introduced for use by the device's assertion mediator (eg: runningboardd).
7594  * When an assertion is controlling a process's jetsam priority, it may conflict with that process's
7595  * dirty/clean (active/inactive) jetsam state.  The kernel attempts to resolve a priority transition
7596  * conflict by reviewing the process state and then choosing the maximum jetsam band at play,
7597  * eg: requested priority versus assertion priority.
7598  */
7599 
static int
memorystatus_cmd_set_priority_properties(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
{
	int error = 0;
	memorystatus_priority_properties_t mpp_entry;

	/* Validate inputs */
	if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_priority_properties_t))) {
		return EINVAL;
	}

	/* Validate flags */
	if (flags & ~(MEMORYSTATUS_SET_PRIORITY_ASSERTION)) {
		/*
		 * Unsupported bit set in flag.
		 */
		return EINVAL;
	}

	error = copyin(buffer, &mpp_entry, buffer_size);

	if (error == 0) {
		proc_t p;

		p = proc_find(pid);
		if (!p) {
			return ESRCH;
		}

		/* P_MEMSTAT_INTERNAL processes (currently only pid 1) are off limits. */
		if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
			proc_rele(p);
			return EPERM;
		}

		if ((flags & MEMORYSTATUS_SET_PRIORITY_ASSERTION) &&
		    !(p->p_memstat_state & P_MEMSTAT_MANAGED)) {
			/*
			 * Assertion-driven priority changes are only
			 * permitted on managed (P_MEMSTAT_MANAGED)
			 * processes.
			 */
			proc_rele(p);
			return EPERM;
		}

		memstat_priority_options_t options = MEMSTAT_PRIORITY_OPTIONS_NONE;
		if (flags & MEMORYSTATUS_SET_PRIORITY_ASSERTION) {
			options |= MEMSTAT_PRIORITY_IS_ASSERTION;
		}
		error = memorystatus_set_priority(p, mpp_entry.priority, mpp_entry.user_data,
		    options);
		proc_rele(p);
	}

	return error;
}
7655 
7656 static int
7657 memorystatus_cmd_set_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
7658 {
7659 	int error = 0;
7660 	memorystatus_memlimit_properties_t mmp_entry;
7661 
7662 	/* Validate inputs */
7663 	if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_memlimit_properties_t))) {
7664 		return EINVAL;
7665 	}
7666 
7667 	error = copyin(buffer, &mmp_entry, buffer_size);
7668 
7669 	if (error == 0) {
7670 		error = memorystatus_set_memlimit_properties(pid, &mmp_entry);
7671 	}
7672 
7673 	return error;
7674 }
7675 
7676 #if DEBUG || DEVELOPMENT
7677 static int
7678 memorystatus_cmd_set_diag_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
7679 {
7680 	int error = 0;
7681 	memorystatus_diag_memlimit_properties_t mmp_entry;
7682 	proc_t p = proc_find(pid);
7683 	if (!p) {
7684 		return ESRCH;
7685 	}
7686 
7687 	/* Validate inputs */
7688 	if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_diag_memlimit_properties_t))) {
7689 		proc_rele(p);
7690 		return EINVAL;
7691 	}
7692 
7693 	error = copyin(buffer, &mmp_entry, buffer_size);
7694 
7695 	if (error == 0) {
7696 		proc_list_lock();
7697 		error = memorystatus_set_diag_memlimit_properties_internal(p, &mmp_entry);
7698 		proc_list_unlock();
7699 	}
7700 	proc_rele(p);
7701 	return error;
7702 }
7703 
7704 static int
7705 memorystatus_cmd_get_diag_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
7706 {
7707 	int error = 0;
7708 	memorystatus_diag_memlimit_properties_t mmp_entry;
7709 	proc_t p = proc_find(pid);
7710 	if (!p) {
7711 		return ESRCH;
7712 	}
7713 
7714 	/* Validate inputs */
7715 	if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_diag_memlimit_properties_t))) {
7716 		proc_rele(p);
7717 		return EINVAL;
7718 	}
7719 	proc_list_lock();
7720 	error = memorystatus_get_diag_memlimit_properties_internal(p, &mmp_entry);
7721 	proc_list_unlock();
7722 	proc_rele(p);
7723 	if (error == 0) {
7724 		error = copyout(&mmp_entry, buffer, buffer_size);
7725 	}
7726 
7727 
7728 	return error;
7729 }
7730 #endif //DEBUG || DEVELOPMENT
7731 
7732 static void
7733 memorystatus_get_memlimit_properties_internal(proc_t p, memorystatus_memlimit_properties_t* p_entry)
7734 {
7735 	memset(p_entry, 0, sizeof(memorystatus_memlimit_properties_t));
7736 
7737 	if (p->p_memstat_memlimit_active > 0) {
7738 		p_entry->memlimit_active = p->p_memstat_memlimit_active;
7739 	} else {
7740 		task_convert_phys_footprint_limit(-1, &p_entry->memlimit_active);
7741 	}
7742 
7743 	if (_memstat_proc_active_memlimit_is_fatal(p)) {
7744 		p_entry->memlimit_active_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7745 	}
7746 
7747 	/*
7748 	 * Get the inactive limit and attributes
7749 	 */
7750 	if (p->p_memstat_memlimit_inactive <= 0) {
7751 		task_convert_phys_footprint_limit(-1, &p_entry->memlimit_inactive);
7752 	} else {
7753 		p_entry->memlimit_inactive = p->p_memstat_memlimit_inactive;
7754 	}
7755 	if (_memstat_proc_inactive_memlimit_is_fatal(p)) {
7756 		p_entry->memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7757 	}
7758 }
7759 
7760 /*
7761  * When getting the memlimit settings, we can't simply call task_get_phys_footprint_limit().
7762  * That gets the proc's cached memlimit and there is no guarantee that the active/inactive
7763  * limits will be the same in the no-limit case.  Instead we convert limits <= 0 using
7764  * task_convert_phys_footprint_limit(). It computes the same limit value that would be written
7765  * to the task's ledgers via task_set_phys_footprint_limit().
7766  */
7767 static int
7768 memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
7769 {
7770 	memorystatus_memlimit_properties2_t mmp_entry;
7771 
7772 	/* Validate inputs */
7773 	if ((pid == 0) || (buffer == USER_ADDR_NULL) ||
7774 	    ((buffer_size != sizeof(memorystatus_memlimit_properties_t)) &&
7775 	    (buffer_size != sizeof(memorystatus_memlimit_properties2_t)))) {
7776 		return EINVAL;
7777 	}
7778 
7779 	memset(&mmp_entry, 0, sizeof(memorystatus_memlimit_properties2_t));
7780 
7781 	proc_t p = proc_find(pid);
7782 	if (!p) {
7783 		return ESRCH;
7784 	}
7785 
7786 	/*
7787 	 * Get the active limit and attributes.
7788 	 * No locks taken since we hold a reference to the proc.
7789 	 */
7790 
7791 	memorystatus_get_memlimit_properties_internal(p, &mmp_entry.v1);
7792 
7793 #if CONFIG_JETSAM
7794 #if DEVELOPMENT || DEBUG
7795 	/*
7796 	 * Get the limit increased via SPI
7797 	 */
7798 	mmp_entry.memlimit_increase = roundToNearestMB(p->p_memlimit_increase);
7799 	mmp_entry.memlimit_increase_bytes = p->p_memlimit_increase;
7800 #endif /* DEVELOPMENT || DEBUG */
7801 #endif /* CONFIG_JETSAM */
7802 
7803 	proc_rele(p);
7804 
7805 	int error = copyout(&mmp_entry, buffer, buffer_size);
7806 
7807 	return error;
7808 }
7809 
7810 
7811 /*
7812  * SPI for kbd - pr24956468
7813  * This is a very simple snapshot that calculates how much a
7814  * process's phys_footprint exceeds a specific memory limit.
7815  * Only the inactive memory limit is supported for now.
7816  * The delta is returned as bytes in excess or zero.
7817  */
7818 static int
7819 memorystatus_cmd_get_memlimit_excess_np(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
7820 {
7821 	int error = 0;
7822 	uint64_t footprint_in_bytes = 0;
7823 	uint64_t delta_in_bytes = 0;
7824 	int32_t  memlimit_mb = 0;
7825 	uint64_t memlimit_bytes = 0;
7826 
7827 	/* Validate inputs */
7828 	if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(uint64_t)) || (flags != 0)) {
7829 		return EINVAL;
7830 	}
7831 
7832 	proc_t p = proc_find(pid);
7833 	if (!p) {
7834 		return ESRCH;
7835 	}
7836 
7837 	/*
7838 	 * Get the inactive limit.
7839 	 * No locks taken since we hold a reference to the proc.
7840 	 */
7841 
7842 	if (p->p_memstat_memlimit_inactive <= 0) {
7843 		task_convert_phys_footprint_limit(-1, &memlimit_mb);
7844 	} else {
7845 		memlimit_mb = p->p_memstat_memlimit_inactive;
7846 	}
7847 
7848 	footprint_in_bytes = get_task_phys_footprint(proc_task(p));
7849 
7850 	proc_rele(p);
7851 
7852 	memlimit_bytes = memlimit_mb * 1024 * 1024;     /* MB to bytes */
7853 
7854 	/*
7855 	 * Computed delta always returns >= 0 bytes
7856 	 */
7857 	if (footprint_in_bytes > memlimit_bytes) {
7858 		delta_in_bytes = footprint_in_bytes - memlimit_bytes;
7859 	}
7860 
7861 	error = copyout(&delta_in_bytes, buffer, sizeof(delta_in_bytes));
7862 
7863 	return error;
7864 }
7865 
7866 
7867 static int
7868 memorystatus_cmd_get_pressure_status(int32_t *retval)
7869 {
7870 	int error;
7871 
7872 	/* Need privilege for check */
7873 	error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
7874 	if (error) {
7875 		return error;
7876 	}
7877 
7878 	/* Inherently racy, so it's not worth taking a lock here */
7879 	*retval = (kVMPressureNormal != memorystatus_vm_pressure_level) ? 1 : 0;
7880 
7881 	return error;
7882 }
7883 
7884 int
7885 memorystatus_get_pressure_status_kdp()
7886 {
7887 	return (kVMPressureNormal != memorystatus_vm_pressure_level) ? 1 : 0;
7888 }
7889 
7890 /*
7891  * Every process, including a P_MEMSTAT_INTERNAL process (currently only pid 1), is allowed to set a HWM.
7892  *
7893  * This call is inflexible -- it does not distinguish between active/inactive, fatal/non-fatal
7894  * So, with 2-level HWM preserving previous behavior will map as follows.
7895  *      - treat the limit passed in as both an active and inactive limit.
7896  *      - treat the is_fatal_limit flag as though it applies to both active and inactive limits.
7897  *
7898  * When invoked via MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK
7899  *      - the is_fatal_limit is FALSE, meaning the active and inactive limits are non-fatal/soft
7900  *      - so mapping is (active/non-fatal, inactive/non-fatal)
7901  *
7902  * When invoked via MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT
7903  *      - the is_fatal_limit is TRUE, meaning the process's active and inactive limits are fatal/hard
7904  *      - so mapping is (active/fatal, inactive/fatal)
7905  */
7906 
7907 #if CONFIG_JETSAM
7908 static int
7909 memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __unused int32_t *retval, boolean_t is_fatal_limit)
7910 {
7911 	int error = 0;
7912 	memorystatus_memlimit_properties_t entry;
7913 
7914 	entry.memlimit_active = high_water_mark;
7915 	entry.memlimit_active_attr = 0;
7916 	entry.memlimit_inactive = high_water_mark;
7917 	entry.memlimit_inactive_attr = 0;
7918 
7919 	if (is_fatal_limit == TRUE) {
7920 		entry.memlimit_active_attr   |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7921 		entry.memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7922 	}
7923 
7924 	error = memorystatus_set_memlimit_properties(pid, &entry);
7925 	return error;
7926 }
7927 
7928 static int
7929 memorystatus_cmd_mark_process_coalition_swappable(pid_t pid, __unused int32_t *retval)
7930 {
7931 	int error = 0;
7932 	proc_t p = PROC_NULL;
7933 	coalition_t coal = COALITION_NULL;
7934 
7935 	if (!memorystatus_swap_all_apps) {
7936 		/* Swap is not supported on this device. */
7937 		return ENOTSUP;
7938 	}
7939 	p = proc_find(pid);
7940 	if (!p) {
7941 		return ESRCH;
7942 	}
7943 	coal = task_get_coalition((task_t) proc_task(p), COALITION_TYPE_JETSAM);
7944 	if (coal && coalition_is_leader((task_t) proc_task(p), coal)) {
7945 		coalition_mark_swappable(coal);
7946 	} else {
7947 		/* This SPI is only supported on coalition leaders. */
7948 		error = EINVAL;
7949 	}
7950 
7951 	proc_rele(p);
7952 	return error;
7953 }
7954 
7955 static int
7956 memorystatus_cmd_get_process_coalition_is_swappable(pid_t pid, int32_t *retval)
7957 {
7958 	int error = 0;
7959 	proc_t p = PROC_NULL;
7960 	coalition_t coal = COALITION_NULL;
7961 
7962 	if (!memorystatus_swap_all_apps) {
7963 		/* Swap is not supported on this device. */
7964 		return ENOTSUP;
7965 	}
7966 	p = proc_find(pid);
7967 	if (!p) {
7968 		return ESRCH;
7969 	}
7970 	coal = task_get_coalition((task_t) proc_task(p), COALITION_TYPE_JETSAM);
7971 	if (coal) {
7972 		*retval = coalition_is_swappable(coal);
7973 	} else {
7974 		error = EINVAL;
7975 	}
7976 
7977 	proc_rele(p);
7978 	return error;
7979 }
7980 
7981 static int
7982 memorystatus_cmd_convert_memlimit_mb(pid_t pid, int32_t limit, int32_t *retval)
7983 {
7984 	int error = 0;
7985 	proc_t p;
7986 	p = proc_find(pid);
7987 	if (!p) {
7988 		return ESRCH;
7989 	}
7990 	if (limit <= 0) {
7991 		/*
7992 		 * A limit of <= 0 implies that the task gets its default limit.
7993 		 */
7994 		limit = memorystatus_get_default_task_active_limit(p);
7995 		if (limit <= 0) {
7996 			/* Task uses system wide default limit */
7997 			limit = max_task_footprint_mb ? max_task_footprint_mb : INT32_MAX;
7998 		}
7999 		*retval = limit;
8000 	} else {
8001 #if DEVELOPMENT || DEBUG
8002 		/* add the current increase to it, for roots */
8003 		limit += roundToNearestMB(p->p_memlimit_increase);
8004 #endif /* DEVELOPMENT || DEBUG */
8005 		*retval = limit;
8006 	}
8007 
8008 	proc_rele(p);
8009 	return error;
8010 }
8011 #endif /* CONFIG_JETSAM */
8012 
8013 #if DEBUG || DEVELOPMENT
8014 static int
8015 memorystatus_set_diag_memlimit_properties_internal(proc_t p, memorystatus_diag_memlimit_properties_t *p_entry)
8016 {
8017 	int error = 0;
8018 	uint64_t old_limit = 0;
8019 
8020 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
8021 	/* Enforce the limit by writing to the ledgers */
8022 	error = (task_set_diag_footprint_limit_internal(proc_task(p), p_entry->memlimit, &old_limit) == KERN_SUCCESS) ? KERN_SUCCESS : EINVAL;
8023 
8024 	memorystatus_log_debug( "memorystatus_set_diag_memlimit_properties: new limit on pid %d (%lluMB old %lluMB)\n",
8025 	    proc_getpid(p), (p_entry->memlimit > 0 ? p_entry->memlimit : -1), (old_limit)
8026 	    );
8027 	DTRACE_MEMORYSTATUS2(memorystatus_diag_memlimit_properties_t, proc_t, p, int32_t, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1));
8028 	return error;
8029 }
8030 
8031 static int
8032 memorystatus_get_diag_memlimit_properties_internal(proc_t p, memorystatus_diag_memlimit_properties_t *p_entry)
8033 {
8034 	int error = 0;
8035 	/* Enforce the limit by writing to the ledgers */
8036 	error = (task_get_diag_footprint_limit_internal(proc_task(p), &p_entry->memlimit, &p_entry->threshold_enabled) == KERN_SUCCESS) ? KERN_SUCCESS : EINVAL;
8037 
8038 	DTRACE_MEMORYSTATUS2(memorystatus_diag_memlimit_properties_t, proc_t, p, int32_t, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1));
8039 	return error;
8040 }
8041 #endif // DEBUG || DEVELOPMENT
8042 
8043 bool
8044 memorystatus_task_has_increased_memory_limit_entitlement(task_t task)
8045 {
8046 	if (memorystatus_entitled_max_task_footprint_mb == 0) {
8047 		// Entitlement is not supported on this device.
8048 		return false;
8049 	}
8050 	return IOTaskHasEntitlement(task,
8051 	           "com.apple.developer.kernel.increased-memory-limit");
8052 }
8053 
8054 bool
8055 memorystatus_task_has_increased_debugging_memory_limit_entitlement(task_t task)
8056 {
8057 	if (memorystatus_entitled_dev_max_task_footprint_mb == 0) {
8058 		// Entitlement is not supported on this device.
8059 		return false;
8060 	}
8061 	return IOTaskHasEntitlement(task,
8062 	           "com.apple.developer.kernel.increased-debugging-memory-limit");
8063 }
8064 
8065 bool
8066 memorystatus_task_has_legacy_footprint_entitlement(task_t task)
8067 {
8068 	return IOTaskHasEntitlement(task,
8069 	           "com.apple.private.memory.legacy_footprint");
8070 }
8071 
8072 bool
8073 memorystatus_task_has_ios13extended_footprint_limit(task_t task)
8074 {
8075 	if (max_mem < 1500ULL * 1024 * 1024 ||
8076 	    max_mem > 2ULL * 1024 * 1024 * 1024) {
8077 		/* ios13extended_footprint is only for 2GB devices */
8078 		return false;
8079 	}
8080 	return IOTaskHasEntitlement(task,
8081 	           "com.apple.developer.memory.ios13extended_footprint");
8082 }
8083 
8084 static int32_t
8085 memorystatus_get_default_task_active_limit(proc_t p)
8086 {
8087 	int32_t limit = (int32_t)max_task_footprint_mb;
8088 	task_t task = proc_task(p);
8089 
8090 	/*
8091 	 * Check for the various entitlement footprint hacks
8092 	 * and try to apply each one. Note that if multiple entitlements are present
8093 	 * whichever results in the largest limit applies.
8094 	 */
8095 	if (memorystatus_task_has_increased_debugging_memory_limit_entitlement(task)) {
8096 		limit = MAX(limit, memorystatus_entitled_dev_max_task_footprint_mb);
8097 	}
8098 	if (memorystatus_task_has_increased_memory_limit_entitlement(task)) {
8099 		limit = MAX(limit, memorystatus_entitled_max_task_footprint_mb);
8100 	}
8101 #if __arm64__
8102 	if (legacy_footprint_entitlement_mode == LEGACY_FOOTPRINT_ENTITLEMENT_LIMIT_INCREASE &&
8103 	    memorystatus_task_has_legacy_footprint_entitlement(task)) {
8104 		limit = MAX(limit, max_task_footprint_mb + legacy_footprint_bonus_mb);
8105 	}
8106 #endif /* __arm64__ */
8107 	if (memorystatus_task_has_ios13extended_footprint_limit(task)) {
8108 		limit = MAX(limit, memorystatus_ios13extended_footprint_limit_mb);
8109 	}
8110 
8111 	return limit;
8112 }
8113 
8114 static int32_t
8115 memorystatus_get_default_task_inactive_limit(proc_t p)
8116 {
8117 	// Currently the default active and inactive limits are always the same.
8118 	return memorystatus_get_default_task_active_limit(p);
8119 }
8120 
8121 static int
8122 memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties_t *entry)
8123 {
8124 	int32_t memlimit_active, memlimit_inactive;
8125 	memlimit_options_t memlimit_options = MEMLIMIT_OPTIONS_NONE;
8126 
8127 	proc_t p = proc_find(pid);
8128 	if (!p) {
8129 		return ESRCH;
8130 	}
8131 
8132 	/*
8133 	 * Check for valid attribute flags.
8134 	 */
8135 	const uint32_t valid_attrs = MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
8136 	if ((entry->memlimit_active_attr & (~valid_attrs)) != 0) {
8137 		proc_rele(p);
8138 		return EINVAL;
8139 	}
8140 	if ((entry->memlimit_inactive_attr & (~valid_attrs)) != 0) {
8141 		proc_rele(p);
8142 		return EINVAL;
8143 	}
8144 
8145 	/*
8146 	 * Setup the active memlimit properties
8147 	 */
8148 	memlimit_active = entry->memlimit_active;
8149 	if (entry->memlimit_active_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL) {
8150 		memlimit_options |= MEMLIMIT_ACTIVE_FATAL;
8151 	}
8152 
8153 	/*
8154 	 * Setup the inactive memlimit properties
8155 	 */
8156 	memlimit_inactive = entry->memlimit_inactive;
8157 	if (entry->memlimit_inactive_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL) {
8158 		memlimit_options |= MEMLIMIT_INACTIVE_FATAL;
8159 	}
8160 
8161 	int error = memorystatus_set_memlimits(p, memlimit_active,
8162 	    memlimit_inactive, memlimit_options);
8163 	proc_rele(p);
8164 	return error;
8165 }
8166 
8167 /*
8168  * Returns the jetsam priority (effective or requested) of the process
8169  * associated with this task.
8170  */
8171 int
8172 proc_get_memstat_priority(proc_t p, boolean_t effective_priority)
8173 {
8174 	if (p) {
8175 		if (effective_priority) {
8176 			return p->p_memstat_effectivepriority;
8177 		} else {
8178 			return p->p_memstat_requestedpriority;
8179 		}
8180 	}
8181 	return 0;
8182 }
8183 
/*
 * Report (via *is_managed) whether the process identified by @pid has
 * the P_MEMSTAT_MANAGED bit set (set by RunningBoard for apps; see
 * memorystatus_set_process_is_managed()).
 *
 * Returns EINVAL for pid 0, ESRCH when the pid cannot be found,
 * otherwise 0.
 */
static int
memorystatus_get_process_is_managed(pid_t pid, int *is_managed)
{
	proc_t p = NULL;

	/* Validate inputs */
	if (pid == 0) {
		return EINVAL;
	}

	p = proc_find(pid);
	if (!p) {
		return ESRCH;
	}

	proc_list_lock();
	*is_managed = ((p->p_memstat_state & P_MEMSTAT_MANAGED) ? 1 : 0);
	/*
	 * NOTE(review): proc_rele() is issued while proc_list_lock is still
	 * held — confirm proc_rele() does not itself need the list lock.
	 */
	proc_rele(p);
	proc_list_unlock();

	return 0;
}
8206 
/*
 * Set or clear the P_MEMSTAT_MANAGED bit for @pid.  Note that the
 * comparison against TRUE below is exact: the caller passes
 * args->flags through as a boolean_t, so any value other than 1
 * takes the clear path.
 *
 * Returns EINVAL for pid 0, ESRCH when the pid cannot be found,
 * otherwise 0.
 */
static int
memorystatus_set_process_is_managed(pid_t pid, boolean_t set_managed)
{
	proc_t p = NULL;

	/* Validate inputs */
	if (pid == 0) {
		return EINVAL;
	}

	p = proc_find(pid);
	if (!p) {
		return ESRCH;
	}

	proc_list_lock();

	if (set_managed == TRUE) {
		p->p_memstat_state |= P_MEMSTAT_MANAGED;
		/*
		 * The P_MEMSTAT_MANAGED bit is set by Runningboard for Apps.
		 * Also opt them in to being frozen (they might have started
		 * off with the P_MEMSTAT_FREEZE_DISABLED bit set.)
		 */
		p->p_memstat_state &= ~P_MEMSTAT_FREEZE_DISABLED;
	} else {
		p->p_memstat_state &= ~P_MEMSTAT_MANAGED;
	}

	/* A process opted in to both management and activity tracking is
	 * logged as an error (but tolerated). */
	if (_memstat_proc_is_tracked(p)) {
		memorystatus_log_error("memorystatus: process %s [%d] opted in to both "
		    "Management and ActivityTracking\n", proc_best_name(p),
		    proc_pid(p));
	}

	proc_list_unlock();

	proc_rele(p);

	return 0;
}
8248 
/*
 * memorystatus_control() system call entry point.
 *
 * Authenticates the caller (root or MEMORYSTATUS_ENTITLEMENT, with a
 * handful of exempt commands), sanity-checks the buffer size, runs the
 * MAC hook, then dispatches @args->command to the matching handler.
 * Returns the handler's errno result, or EINVAL for unknown commands.
 */
int
memorystatus_control(struct proc *p, struct memorystatus_control_args *args, int *ret)
{
	int error = EINVAL;
	boolean_t skip_auth_check = FALSE;
	os_reason_t jetsam_reason = OS_REASON_NULL;

#if !CONFIG_JETSAM
    #pragma unused(ret)
    #pragma unused(jetsam_reason)
#endif

	/* We don't need entitlements if we're setting / querying the freeze preference or frozen status for a process. */
	if (args->command == MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE ||
	    args->command == MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE ||
	    args->command == MEMORYSTATUS_CMD_GET_PROCESS_IS_FROZEN) {
		skip_auth_check = TRUE;
	}

	/*
	 * On development kernel, we don't need entitlements if we're adjusting the limit.
	 * This required for limit adjustment by dyld when roots are detected, see rdar://99669958
	 */
#if DEVELOPMENT || DEBUG
	/* Only a process adjusting its own limit is exempt. */
	if (args->command == MEMORYSTATUS_CMD_INCREASE_JETSAM_TASK_LIMIT && proc_getpid(p) == args->pid) {
		skip_auth_check = TRUE;
	}
#endif /* DEVELOPMENT || DEBUG */

	/* Need to be root or have entitlement. */
	if (!kauth_cred_issuser(kauth_cred_get()) && !IOCurrentTaskHasEntitlement(MEMORYSTATUS_ENTITLEMENT) && !skip_auth_check) {
		error = EPERM;
		goto out;
	}

	/*
	 * Sanity check.
	 * Do not enforce it for snapshots.
	 */
	if (args->command != MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT &&
	    args->command != MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_NAMES &&
	    args->command != MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_INFO &&
	    args->command != MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_MEMINFO) {
		if (args->buffersize > MEMORYSTATUS_BUFFERSIZE_MAX) {
			error = EINVAL;
			goto out;
		}
	}

#if CONFIG_MACF
	error = mac_proc_check_memorystatus_control(p, args->command, args->pid);
	if (error) {
		goto out;
	}
#endif /* MAC */

	/* Dispatch to the per-command handler. */
	switch (args->command) {
	case MEMORYSTATUS_CMD_GET_PRIORITY_LIST:
		error = memorystatus_cmd_get_priority_list(args->pid, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES:
		error = memorystatus_cmd_set_priority_properties(args->pid, args->flags, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_SET_MEMLIMIT_PROPERTIES:
		error = memorystatus_cmd_set_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GET_MEMLIMIT_PROPERTIES:
		error = memorystatus_cmd_get_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GET_MEMLIMIT_EXCESS:
		error = memorystatus_cmd_get_memlimit_excess_np(args->pid, args->flags, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GRP_SET_PROPERTIES:
		error = memorystatus_cmd_grp_set_properties((int32_t)args->flags, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT:
		error = memorystatus_cmd_get_jetsam_snapshot((int32_t)args->flags, args->buffer, args->buffersize, ret);
		break;
#if JETSAM_ZPRINT_SNAPSHOT
	case MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_NAMES:
		error = memorystatus_cmd_get_data_buffer(args->buffer, args->buffersize, ret,
		    jzs_zone_cnt * sizeof(mach_zone_name_t), jzs_names);
		break;
	case MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_INFO:
		error = memorystatus_cmd_get_data_buffer(args->buffer, args->buffersize, ret,
		    jzs_zone_cnt * sizeof(mach_zone_info_t), jzs_info);
		break;
	case MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_MEMINFO:
		error = memorystatus_cmd_get_data_buffer(args->buffer, args->buffersize, ret,
		    jzs_meminfo_cnt * sizeof(mach_memory_info_t), jzs_meminfo);
		break;
#endif
#if DEVELOPMENT || DEBUG
	case MEMORYSTATUS_CMD_SET_TESTING_PID:
		error = memorystatus_cmd_set_testing_pid((int32_t) args->flags);
		break;
#endif
	case MEMORYSTATUS_CMD_GET_PRESSURE_STATUS:
		error = memorystatus_cmd_get_pressure_status(ret);
		break;
#if CONFIG_JETSAM
	case MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK:
		/*
		 * This call does not distinguish between active and inactive limits.
		 * Default behavior in 2-level HWM world is to set both.
		 * Non-fatal limit is also assumed for both.
		 */
		error = memorystatus_cmd_set_jetsam_memory_limit(args->pid, (int32_t)args->flags, ret, FALSE);
		break;
	case MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT:
		/*
		 * This call does not distinguish between active and inactive limits.
		 * Default behavior in 2-level HWM world is to set both.
		 * Fatal limit is also assumed for both.
		 */
		error = memorystatus_cmd_set_jetsam_memory_limit(args->pid, (int32_t)args->flags, ret, TRUE);
		break;
	case MEMORYSTATUS_CMD_MARK_PROCESS_COALITION_SWAPPABLE:
		error = memorystatus_cmd_mark_process_coalition_swappable(args->pid, ret);
		break;

	case MEMORYSTATUS_CMD_GET_PROCESS_COALITION_IS_SWAPPABLE:
		error = memorystatus_cmd_get_process_coalition_is_swappable(args->pid, ret);
		break;

	case MEMORYSTATUS_CMD_CONVERT_MEMLIMIT_MB:
		error = memorystatus_cmd_convert_memlimit_mb(args->pid, (int32_t) args->flags, ret);
		break;
#endif /* CONFIG_JETSAM */
		/* Test commands */
#if DEVELOPMENT || DEBUG
	case MEMORYSTATUS_CMD_TEST_JETSAM:
		jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_GENERIC);
		if (jetsam_reason == OS_REASON_NULL) {
			memorystatus_log_error("memorystatus_control: failed to allocate jetsam reason\n");
		}

		error = memorystatus_kill_process_sync(args->pid, kMemorystatusKilled, jetsam_reason) ? 0 : EINVAL;
		break;
	case MEMORYSTATUS_CMD_TEST_JETSAM_SORT:
		error = memorystatus_cmd_test_jetsam_sort(args->pid, (int32_t)args->flags, args->buffer, args->buffersize);
		break;
#else /* DEVELOPMENT || DEBUG */
	#pragma unused(jetsam_reason)
#endif /* DEVELOPMENT || DEBUG */
	case MEMORYSTATUS_CMD_AGGRESSIVE_JETSAM_LENIENT_MODE_ENABLE:
		/* Enabling is a no-op (leaves error == EINVAL) if already allowed. */
		if (memorystatus_aggressive_jetsam_lenient_allowed == FALSE) {
#if DEVELOPMENT || DEBUG
			memorystatus_log_info("Enabling Lenient Mode\n");
#endif /* DEVELOPMENT || DEBUG */

			memorystatus_aggressive_jetsam_lenient_allowed = TRUE;
			memorystatus_aggressive_jetsam_lenient = TRUE;
			error = 0;
		}
		break;
	case MEMORYSTATUS_CMD_AGGRESSIVE_JETSAM_LENIENT_MODE_DISABLE:
#if DEVELOPMENT || DEBUG
		memorystatus_log_info("Disabling Lenient mode\n");
#endif /* DEVELOPMENT || DEBUG */
		memorystatus_aggressive_jetsam_lenient_allowed = FALSE;
		memorystatus_aggressive_jetsam_lenient = FALSE;
		error = 0;
		break;
	case MEMORYSTATUS_CMD_GET_AGGRESSIVE_JETSAM_LENIENT_MODE:
		*ret = (memorystatus_aggressive_jetsam_lenient ? 1 : 0);
		error = 0;
		break;
	case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE:
	case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE:
		error = memorystatus_low_mem_privileged_listener(args->command);
		break;

	case MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE:
	case MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE:
		error = memorystatus_update_inactive_jetsam_priority_band(args->pid, args->command, JETSAM_PRIORITY_ELEVATED_INACTIVE, args->flags ? TRUE : FALSE);
		break;
	case MEMORYSTATUS_CMD_SET_PROCESS_IS_MANAGED:
		error = memorystatus_set_process_is_managed(args->pid, args->flags);
		break;

	case MEMORYSTATUS_CMD_GET_PROCESS_IS_MANAGED:
		error = memorystatus_get_process_is_managed(args->pid, ret);
		break;

#if CONFIG_FREEZE
	case MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE:
		error = memorystatus_set_process_is_freezable(args->pid, args->flags ? TRUE : FALSE);
		break;

	case MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE:
		error = memorystatus_get_process_is_freezable(args->pid, ret);
		break;
	case MEMORYSTATUS_CMD_GET_PROCESS_IS_FROZEN:
		error = memorystatus_get_process_is_frozen(args->pid, ret);
		break;

	case MEMORYSTATUS_CMD_FREEZER_CONTROL:
		error = memorystatus_freezer_control(args->flags, args->buffer, args->buffersize, ret);
		break;
#endif /* CONFIG_FREEZE */

#if DEVELOPMENT || DEBUG
	case MEMORYSTATUS_CMD_INCREASE_JETSAM_TASK_LIMIT:
		error = memorystatus_cmd_increase_jetsam_task_limit(args->pid, args->flags);
		break;
	case MEMORYSTATUS_CMD_SET_DIAG_LIMIT:
		error = memorystatus_cmd_set_diag_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GET_DIAG_LIMIT:
		error = memorystatus_cmd_get_diag_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
		break;
#endif /* DEVELOPMENT || DEBUG */
	default:
		error = EINVAL;
		break;
	}

out:
	return error;
}
8469 }
8470 
8471 /* Coalition support */
8472 
8473 /* sorting info for a particular priority bucket */
typedef struct memstat_sort_info {
	coalition_t     msi_coal;           /* jetsam coalition this entry describes */
	uint64_t        msi_page_count;     /* aggregate page count across the whole coalition */
	pid_t           msi_pid;            /* pid of the coalition leader */
	int             msi_ntasks;         /* number of tasks in the coalition */
} memstat_sort_info_t;
8480 
8481 /*
8482  * qsort from smallest page count to largest page count
8483  *
8484  * return < 0 for a < b
8485  *          0 for a == b
8486  *        > 0 for a > b
8487  */
8488 static int
8489 memstat_asc_cmp(const void *a, const void *b)
8490 {
8491 	const memstat_sort_info_t *msA = (const memstat_sort_info_t *)a;
8492 	const memstat_sort_info_t *msB = (const memstat_sort_info_t *)b;
8493 
8494 	return (int)((uint64_t)msA->msi_page_count - (uint64_t)msB->msi_page_count);
8495 }
8496 
8497 /*
8498  * Return the number of pids rearranged during this sort.
8499  */
static int
memorystatus_sort_by_largest_coalition_locked(unsigned int bucket_index, int coal_sort_order)
{
#define MAX_SORT_PIDS           80
#define MAX_COAL_LEADERS        10

	unsigned int b = bucket_index;
	int nleaders = 0;
	int ntasks = 0;
	proc_t p = NULL;
	coalition_t coal = COALITION_NULL;
	int pids_moved = 0;
	int total_pids_moved = 0;
	int i;

	/*
	 * The system is typically under memory pressure when in this
	 * path, hence, we want to avoid dynamic memory allocation.
	 */
	memstat_sort_info_t leaders[MAX_COAL_LEADERS];
	pid_t pid_list[MAX_SORT_PIDS];

	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
		return 0;
	}

	/*
	 * Clear the array that holds coalition leader information
	 */
	for (i = 0; i < MAX_COAL_LEADERS; i++) {
		leaders[i].msi_coal = COALITION_NULL;
		leaders[i].msi_page_count = 0;          /* will hold total coalition page count */
		leaders[i].msi_pid = 0;                 /* will hold coalition leader pid */
		leaders[i].msi_ntasks = 0;              /* will hold the number of tasks in a coalition */
	}

	/*
	 * Pass 1: walk the priority band and record one entry per jetsam
	 * coalition leader found in it (leader pid, coalition-wide page
	 * count, and task count).
	 */
	p = memorystatus_get_first_proc_locked(&b, FALSE);
	while (p) {
		coal = task_get_coalition(proc_task(p), COALITION_TYPE_JETSAM);
		if (coalition_is_leader(proc_task(p), coal)) {
			if (nleaders < MAX_COAL_LEADERS) {
				int coal_ntasks = 0;
				uint64_t coal_page_count = coalition_get_page_count(coal, &coal_ntasks);
				leaders[nleaders].msi_coal = coal;
				leaders[nleaders].msi_page_count = coal_page_count;
				leaders[nleaders].msi_pid = proc_getpid(p);           /* the coalition leader */
				leaders[nleaders].msi_ntasks = coal_ntasks;
				nleaders++;
			} else {
				/*
				 * We've hit MAX_COAL_LEADERS meaning we can handle no more coalitions.
				 * Abandoned coalitions will linger at the tail of the priority band
				 * when this sort session ends.
				 * TODO:  should this be an assert?
				 */
				memorystatus_log_error(
					"%s: WARNING: more than %d leaders in priority band [%d]\n",
					__FUNCTION__, MAX_COAL_LEADERS, bucket_index);
				break;
			}
		}
		p = memorystatus_get_next_proc_locked(&b, p, FALSE);
	}

	if (nleaders == 0) {
		/* Nothing to sort */
		return 0;
	}

	/*
	 * Sort the coalition leader array, from smallest coalition page count
	 * to largest coalition page count.  When inserted in the priority bucket,
	 * smallest coalition is handled first, resulting in the last to be jetsammed.
	 */
	if (nleaders > 1) {
		qsort(leaders, nleaders, sizeof(memstat_sort_info_t), memstat_asc_cmp);
	}

	/*
	 * During coalition sorting, processes in a priority band are rearranged
	 * by being re-inserted at the head of the queue.  So, when handling a
	 * list, the first process that gets moved to the head of the queue,
	 * ultimately gets pushed toward the queue tail, and hence, jetsams last.
	 *
	 * So, for example, the coalition leader is expected to jetsam last,
	 * after its coalition members.  Therefore, the coalition leader is
	 * inserted at the head of the queue first.
	 *
	 * After processing a coalition, the jetsam order is as follows:
	 *   undefs(jetsam first), extensions, xpc services, leader(jetsam last)
	 */

	/*
	 * Pass 2: coalition members are rearranged in the priority bucket here,
	 * based on their coalition role.  pid_list is reused for each role query.
	 */
	total_pids_moved = 0;
	for (i = 0; i < nleaders; i++) {
		/* a bit of bookkeeping */
		pids_moved = 0;

		/* Coalition leaders are jetsammed last, so move into place first */
		pid_list[0] = leaders[i].msi_pid;
		pids_moved += memorystatus_move_list_locked(bucket_index, pid_list, 1);

		/* xpc services should jetsam after extensions */
		ntasks = coalition_get_pid_list(leaders[i].msi_coal, COALITION_ROLEMASK_XPC,
		    coal_sort_order, pid_list, MAX_SORT_PIDS);

		if (ntasks > 0) {
			pids_moved += memorystatus_move_list_locked(bucket_index, pid_list,
			    (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
		}

		/* extensions should jetsam after unmarked processes */
		ntasks = coalition_get_pid_list(leaders[i].msi_coal, COALITION_ROLEMASK_EXT,
		    coal_sort_order, pid_list, MAX_SORT_PIDS);

		if (ntasks > 0) {
			pids_moved += memorystatus_move_list_locked(bucket_index, pid_list,
			    (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
		}

		/* undefined coalition members should be the first to jetsam */
		ntasks = coalition_get_pid_list(leaders[i].msi_coal, COALITION_ROLEMASK_UNDEF,
		    coal_sort_order, pid_list, MAX_SORT_PIDS);

		if (ntasks > 0) {
			pids_moved += memorystatus_move_list_locked(bucket_index, pid_list,
			    (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
		}

		total_pids_moved += pids_moved;
	} /* end for */

	return total_pids_moved;
}
8637 
8638 
8639 /*
8640  * Traverse a list of pids, searching for each within the priority band provided.
8641  * If pid is found, move it to the front of the priority band.
8642  * Never searches outside the priority band provided.
8643  *
8644  * Input:
8645  *	bucket_index - jetsam priority band.
8646  *	pid_list - pointer to a list of pids.
8647  *	list_sz  - number of pids in the list.
8648  *
8649  * Pid list ordering is important in that,
8650  * pid_list[n] is expected to jetsam ahead of pid_list[n+1].
8651  * The sort_order is set by the coalition default.
8652  *
8653  * Return:
8654  *	the number of pids found and hence moved within the priority band.
8655  */
8656 static int
8657 memorystatus_move_list_locked(unsigned int bucket_index, pid_t *pid_list, int list_sz)
8658 {
8659 	memstat_bucket_t *current_bucket;
8660 	int i;
8661 	int found_pids = 0;
8662 
8663 	if ((pid_list == NULL) || (list_sz <= 0)) {
8664 		return 0;
8665 	}
8666 
8667 	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
8668 		return 0;
8669 	}
8670 
8671 	current_bucket = &memstat_bucket[bucket_index];
8672 	for (i = 0; i < list_sz; i++) {
8673 		unsigned int b = bucket_index;
8674 		proc_t p = NULL;
8675 		proc_t aProc = NULL;
8676 		pid_t  aPid;
8677 		int list_index;
8678 
8679 		list_index = ((list_sz - 1) - i);
8680 		aPid = pid_list[list_index];
8681 
8682 		/* never search beyond bucket_index provided */
8683 		p = memorystatus_get_first_proc_locked(&b, FALSE);
8684 		while (p) {
8685 			if (proc_getpid(p) == aPid) {
8686 				aProc = p;
8687 				break;
8688 			}
8689 			p = memorystatus_get_next_proc_locked(&b, p, FALSE);
8690 		}
8691 
8692 		if (aProc == NULL) {
8693 			/* pid not found in this band, just skip it */
8694 			continue;
8695 		} else {
8696 			TAILQ_REMOVE(&current_bucket->list, aProc, p_memstat_list);
8697 			TAILQ_INSERT_HEAD(&current_bucket->list, aProc, p_memstat_list);
8698 			found_pids++;
8699 		}
8700 	}
8701 	return found_pids;
8702 }
8703 
8704 int
8705 memorystatus_get_proccnt_upto_priority(int32_t max_bucket_index)
8706 {
8707 	int32_t i = JETSAM_PRIORITY_IDLE;
8708 	int count = 0;
8709 
8710 	if (max_bucket_index >= MEMSTAT_BUCKET_COUNT) {
8711 		return -1;
8712 	}
8713 
8714 	while (i <= max_bucket_index) {
8715 		count += memstat_bucket[i++].count;
8716 	}
8717 
8718 	return count;
8719 }
8720 
/*
 * App Nap band transition (effective on !CONFIG_JETSAM, i.e. macOS, only).
 *
 * Moves an eligible app between the idle band and its requested band:
 * into JETSAM_PRIORITY_IDLE when is_appnap is TRUE, back to
 * p_memstat_requestedpriority when FALSE.  Entered with the task lock
 * held (see deadlock notes below), so it deliberately bypasses
 * memorystatus_set_priority().
 *
 * Returns 0 on success or benign no-op, -1 for ineligible processes
 * (and always -1 on CONFIG_JETSAM configs).
 */
int
memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap)
{
#if !CONFIG_JETSAM
	if (!p || (!isApp(p)) || (p->p_memstat_state & (P_MEMSTAT_INTERNAL | P_MEMSTAT_MANAGED))) {
		/*
		 * Ineligible processes OR system processes e.g. launchd.
		 *
		 * We also skip processes that have the P_MEMSTAT_MANAGED bit set, i.e.
		 * they're managed by assertiond. These are iOS apps that have been ported
		 * to macOS. assertiond might be in the process of modifying the app's
		 * priority / memory limit - so it might have the proc_list lock, and then try
		 * to take the task lock. Meanwhile we've entered this function with the task lock
		 * held, and we need the proc_list lock below. So we'll deadlock with assertiond.
		 *
		 * It should be fine to read the P_MEMSTAT_MANAGED bit without the proc_list
		 * lock here, since assertiond only sets this bit on process launch.
		 */
		return -1;
	}

	/*
	 * For macOS only:
	 * We would like to use memorystatus_set_priority() here to move the processes
	 * within the bands. Unfortunately memorystatus_set_priority() calls
	 * memorystatus_update_priority_locked() which uses any band transitions
	 * as an indication to modify ledgers. For that it needs the task lock
	 * and since we came into this function with the task lock held, we'll deadlock.
	 *
	 * Unfortunately we can't completely disable ledger updates  because we still
	 * need the ledger updates for a subset of processes i.e. daemons.
	 * When all processes on all platforms support memory limits, we can simply call
	 * memorystatus_set_priority().
	 *
	 * It also has some logic to deal with 'aging' which, currently, is only applicable
	 * on CONFIG_JETSAM configs. So, till every platform has CONFIG_JETSAM we'll need
	 * to do this explicit band transition.
	 */

	memstat_bucket_t *current_bucket, *new_bucket;
	int32_t priority = 0;

	proc_list_lock();

	if (proc_list_exited(p) ||
	    (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP))) {
		/*
		 * If the process is on its way out OR
		 * jetsam has alread tried and failed to kill this process,
		 * let's skip the whole jetsam band transition.
		 */
		proc_list_unlock();
		return 0;
	}

	/* Pick source and destination bands for the transition. */
	if (is_appnap) {
		current_bucket = &memstat_bucket[p->p_memstat_effectivepriority];
		new_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
		priority = JETSAM_PRIORITY_IDLE;
	} else {
		if (p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) {
			/*
			 * It is possible that someone pulled this process
			 * out of the IDLE band without updating its app-nap
			 * parameters.
			 */
			proc_list_unlock();
			return 0;
		}

		current_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
		new_bucket = &memstat_bucket[p->p_memstat_requestedpriority];
		priority = p->p_memstat_requestedpriority;
	}

	/*
	 * Re-queue the process onto the tail of the destination band,
	 * keeping each band's count and relaunch-likelihood counters in sync.
	 */
	TAILQ_REMOVE(&current_bucket->list, p, p_memstat_list);
	current_bucket->count--;
	if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
		current_bucket->relaunch_high_count--;
	}
	TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list);
	new_bucket->count++;
	if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
		new_bucket->relaunch_high_count++;
	}
	/*
	 * Record idle start or idle delta.
	 */
	if (p->p_memstat_effectivepriority == priority) {
		/*
		 * This process is not transitioning between
		 * jetsam priority buckets.  Do nothing.
		 */
	} else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
		uint64_t now;
		/*
		 * Transitioning out of the idle priority bucket.
		 * Record idle delta.
		 */
		assert(p->p_memstat_idle_start != 0);
		now = mach_absolute_time();
		if (now > p->p_memstat_idle_start) {
			p->p_memstat_idle_delta = now - p->p_memstat_idle_start;
		}
	} else if (priority == JETSAM_PRIORITY_IDLE) {
		/*
		 * Transitioning into the idle priority bucket.
		 * Record idle start.
		 */
		p->p_memstat_idle_start = mach_absolute_time();
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_CHANGE_PRIORITY), proc_getpid(p), priority, p->p_memstat_effectivepriority);

	p->p_memstat_effectivepriority = priority;

	proc_list_unlock();

	return 0;

#else /* !CONFIG_JETSAM */
	#pragma unused(p)
	#pragma unused(is_appnap)
	return -1;
#endif /* !CONFIG_JETSAM */
}
8847 
8848 uint64_t
8849 memorystatus_available_memory_internal(struct proc *p)
8850 {
8851 #ifdef XNU_TARGET_OS_OSX
8852 	if (p->p_memstat_memlimit <= 0) {
8853 		return 0;
8854 	}
8855 #endif /* XNU_TARGET_OS_OSX */
8856 	const uint64_t footprint_in_bytes = get_task_phys_footprint(proc_task(p));
8857 	int32_t memlimit_mb;
8858 	int64_t memlimit_bytes;
8859 	int64_t rc;
8860 
8861 	if (isApp(p) == FALSE) {
8862 		return 0;
8863 	}
8864 
8865 	if (p->p_memstat_memlimit > 0) {
8866 		memlimit_mb = p->p_memstat_memlimit;
8867 	} else if (task_convert_phys_footprint_limit(-1, &memlimit_mb) != KERN_SUCCESS) {
8868 		return 0;
8869 	}
8870 
8871 	if (memlimit_mb <= 0) {
8872 		memlimit_bytes = INT_MAX & ~((1 << 20) - 1);
8873 	} else {
8874 		memlimit_bytes = ((int64_t) memlimit_mb) << 20;
8875 	}
8876 
8877 	rc = memlimit_bytes - footprint_in_bytes;
8878 
8879 	return (rc >= 0) ? rc : 0;
8880 }
8881 
8882 int
8883 memorystatus_available_memory(struct proc *p, __unused struct memorystatus_available_memory_args *args, uint64_t *ret)
8884 {
8885 	*ret = memorystatus_available_memory_internal(p);
8886 
8887 	return 0;
8888 }
8889 
/*
 * Log a summary of overall system memory health.  Output is suppressed
 * when the health snapshot is unchanged from the last one logged, so
 * this can be called frequently without spamming the log.
 */
void
memorystatus_log_system_health(const memorystatus_system_health_t *status)
{
	/* Snapshot of the last status we logged; used to detect changes. */
	static struct memorystatus_system_health prev_status = {0};

	bool healthy = memorystatus_is_system_healthy(status);

	/*
	 * Avoid spamming logs by only logging when the system status has changed.
	 * Note: only msh_zone_map_is_exhausted is compared on !CONFIG_JETSAM
	 * configs, since that is the only field those configs report on below.
	 */
	if (prev_status.msh_zone_map_is_exhausted == status->msh_zone_map_is_exhausted
#if CONFIG_JETSAM
	    &&
	    prev_status.msh_available_pages_below_idle == status->msh_available_pages_below_idle &&
	    prev_status.msh_available_pages_below_soft == status->msh_available_pages_below_soft &&
	    prev_status.msh_available_pages_below_critical == status->msh_available_pages_below_critical &&
	    prev_status.msh_compressor_needs_to_swap == status->msh_compressor_needs_to_swap &&
	    prev_status.msh_compressor_is_low_on_space == status->msh_compressor_is_low_on_space &&
	    prev_status.msh_compressor_is_thrashing == status->msh_compressor_is_thrashing &&
	    prev_status.msh_compressed_pages_nearing_limit == status->msh_compressed_pages_nearing_limit &&
	    prev_status.msh_filecache_is_thrashing == status->msh_filecache_is_thrashing &&
	    prev_status.msh_phantom_cache_pressure == status->msh_phantom_cache_pressure &&
	    prev_status.msh_swappable_compressor_segments_over_limit == status->msh_swappable_compressor_segments_over_limit &&
	    prev_status.msh_swapin_queue_over_limit == status->msh_swapin_queue_over_limit &&
	    prev_status.msh_swap_low_on_space == status->msh_swap_low_on_space &&
	    prev_status.msh_swap_out_of_space == status->msh_swap_out_of_space &&
	    prev_status.msh_pageout_starved == status->msh_pageout_starved
#endif /* CONFIG_JETSAM */
	    ) {
		/* No change */
		return;
	}

#if CONFIG_JETSAM
	if (healthy) {
		/* "Healthy" can still mean soft-limit enforcement or idle exits. */
		if (status->msh_available_pages_below_soft) {
			memorystatus_log(
				"memorystatus: System will begin enforcing "
				"soft memory limits. "
				"memorystatus_available_pages: %llu compressor_size: %u\n",
				(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
		} else if (status->msh_available_pages_below_idle) {
			memorystatus_log(
				"memorystatus: System will begin enacting "
				"idle-exits. "
				"memorystatus_available_pages: %llu compressor_size: %u\n",
				(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
		} else {
			memorystatus_log(
				"memorystatus: System is healthy. "
				"memorystatus_available_pages: %llu compressor_size:%u\n",
				(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
		}
	} else {
		/* Unhealthy: dump the full snapshot as one JSON-style record. */
		memorystatus_log("memorystatus: System is unhealthy! memorystatus_available_pages: %llu compressor_size:%u\n",
		    (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
		memorystatus_log(
			"memorystatus: {"
			"\"available_pages_below_critical\": %d, "
			"\"available_pages_below_idle\": %d, "
			"\"available_pages_below_soft\": %d, "
			"\"compressor_needs_to_swap\": %d, "
			"\"compressor_is_low_on_space\": %d, "
			"\"compressor_is_thrashing\": %d, "
			"\"compressed_pages_nearing_limit\": %d, "
			"\"filecache_is_thrashing\": %d, "
			"\"zone_map_is_exhausted\": %d, "
			"\"phantom_cache_pressure\": %d, "
			"\"swappable_compressor_segments_over_limit\": %d, "
			"\"swapin_queue_over_limit\": %d, "
			"\"swap_low\": %d, "
			"\"swap_full\": %d"
			"}\n",
			status->msh_available_pages_below_critical,
			status->msh_available_pages_below_idle,
			status->msh_available_pages_below_soft,
			status->msh_compressor_needs_to_swap,
			status->msh_compressor_is_low_on_space,
			status->msh_compressor_is_thrashing,
			status->msh_compressed_pages_nearing_limit,
			status->msh_filecache_is_thrashing,
			status->msh_zone_map_is_exhausted,
			status->msh_phantom_cache_pressure,
			status->msh_swappable_compressor_segments_over_limit,
			status->msh_swapin_queue_over_limit,
			status->msh_swap_low_on_space,
			status->msh_swap_out_of_space);
	}
#else /* CONFIG_JETSAM */
	memorystatus_log("memorystatus: System is %s. memorystatus_available_pages: %llu compressor_size:%u\n",
	    healthy ? "healthy" : "unhealthy",
	    (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
	if (!healthy) {
		memorystatus_log("memorystatus: zone_map_is_exhausted=%d\n",
		    status->msh_zone_map_is_exhausted);
	}
#endif /* CONFIG_JETSAM */
	prev_status = *status;
}
8990 
8991 uint32_t
8992 memorystatus_pick_kill_cause(const memorystatus_system_health_t *status)
8993 {
8994 	assert(!memorystatus_is_system_healthy(status));
8995 #if CONFIG_JETSAM
8996 	if (status->msh_compressor_is_thrashing) {
8997 		return kMemorystatusKilledVMCompressorThrashing;
8998 	} else if (status->msh_compressor_is_low_on_space) {
8999 		return kMemorystatusKilledVMCompressorSpaceShortage;
9000 	} else if (status->msh_filecache_is_thrashing) {
9001 		return kMemorystatusKilledFCThrashing;
9002 	} else if (status->msh_zone_map_is_exhausted) {
9003 		return kMemorystatusKilledZoneMapExhaustion;
9004 	} else if (status->msh_pageout_starved) {
9005 		return kMemorystatusKilledVMPageoutStarvation;
9006 	} else {
9007 		assert(status->msh_available_pages_below_critical);
9008 		return kMemorystatusKilledVMPageShortage;
9009 	}
9010 #else /* CONFIG_JETSAM */
9011 	assert(status->msh_zone_map_is_exhausted);
9012 	(void) status;
9013 	return kMemorystatusKilledZoneMapExhaustion;
9014 #endif /* CONFIG_JETSAM */
9015 }
9016 
9017 #if DEVELOPMENT || DEBUG
/*
 * Test/debug hook (DEVELOPMENT || DEBUG only): raise a process's active
 * and inactive memory limits by byte_increase bytes (rounded up to a
 * page).  The cumulative increase is remembered in p->p_memlimit_increase,
 * and the previous increase is backed out before the new one is applied,
 * so repeated calls replace rather than stack the adjustment.
 *
 * Returns 0 on success, EINVAL for bad arguments, ESRCH if the pid is
 * not found, or whatever memstat_set_memlimits_locked() returns.
 */
static int
memorystatus_cmd_increase_jetsam_task_limit(pid_t pid, uint32_t byte_increase)
{
	int32_t memlimit_active, memlimit_inactive;

	/* Validate inputs */
	if ((pid == 0) || (byte_increase == 0)) {
		return EINVAL;
	}

	proc_t p = proc_find(pid);

	if (!p) {
		return ESRCH;
	}

	/* The previously-applied increase, at the MB granularity of the limits. */
	const uint32_t current_memlimit_increase = roundToNearestMB(p->p_memlimit_increase);
	/* round to page */
	const int32_t page_aligned_increase = (int32_t) MIN(round_page(p->p_memlimit_increase + byte_increase), INT32_MAX);

	proc_list_lock();

	/* Only adjust limits that are actually set (<= 0 means unlimited). */
	memlimit_active = p->p_memstat_memlimit_active;
	if (memlimit_active > 0) {
		memlimit_active -= current_memlimit_increase;
		memlimit_active += roundToNearestMB(page_aligned_increase);
	}

	memlimit_inactive = p->p_memstat_memlimit_inactive;
	if (memlimit_inactive > 0) {
		memlimit_inactive -= current_memlimit_increase;
		memlimit_inactive += roundToNearestMB(page_aligned_increase);
	}

	/*
	 * Store the updated delta limit in the proc.
	 */
	p->p_memlimit_increase = page_aligned_increase;

	/* Preserve the fatal/non-fatal nature of the existing limits. */
	memlimit_options_t memlimit_options = MEMLIMIT_OPTIONS_NONE;
	if (_memstat_proc_inactive_memlimit_is_fatal(p)) {
		memlimit_options |= MEMLIMIT_INACTIVE_FATAL;
	}
	if (_memstat_proc_active_memlimit_is_fatal(p)) {
		memlimit_options |= MEMLIMIT_ACTIVE_FATAL;
	}

	int error = memstat_set_memlimits_locked(p,
	    memlimit_active, memlimit_inactive,
	    memlimit_options);

	proc_list_unlock();
	proc_rele(p);

	return error;
}
#endif /* DEVELOPMENT || DEBUG */
9075