1 /*
2 * Copyright (c) 2006-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 *
28 */
29
30 #include <kern/sched_prim.h>
31 #include <kern/kalloc.h>
32 #include <kern/assert.h>
33 #include <kern/debug.h>
34 #include <kern/locks.h>
35 #include <kern/task.h>
36 #include <kern/thread.h>
37 #include <kern/host.h>
38 #include <kern/policy_internal.h>
39 #include <kern/thread_group.h>
40 #include <kern/zalloc.h>
41
42 #include <corpses/task_corpse.h>
43 #include <libkern/libkern.h>
44 #include <mach/mach_time.h>
45 #include <mach/task.h>
46 #include <mach/host_priv.h>
47 #include <mach/mach_host.h>
48 #include <pexpert/pexpert.h>
49 #include <sys/coalition.h>
50 #include <sys/code_signing.h>
51 #include <sys/kern_event.h>
52 #include <sys/proc.h>
53 #include <sys/proc_info.h>
54 #include <sys/reason.h>
55 #include <sys/signal.h>
56 #include <sys/signalvar.h>
57 #include <sys/sysctl.h>
58 #include <sys/sysproto.h>
59 #include <sys/spawn_internal.h>
60 #include <sys/wait.h>
61 #include <sys/tree.h>
62 #include <sys/priv.h>
63 #include <vm/pmap.h>
64 #include <vm/vm_reclaim_xnu.h>
65 #include <vm/vm_pageout_xnu.h>
66 #include <vm/vm_protos.h>
67 #include <vm/vm_purgeable_xnu.h>
68 #include <vm/vm_page.h>
69 #include <vm/vm_compressor_xnu.h>
70 #include <mach/machine/sdt.h>
71 #include <libkern/section_keywords.h>
72 #include <os/atomic_private.h>
73 #include <os/overflow.h>
74 #include <mach/mach_time.h>
75
76 #include <IOKit/IOBSD.h>
77
78 #if CONFIG_MACF
79 #include <security/mac_framework.h>
80 #endif
81
82 #if CONFIG_FREEZE
83 #include <vm/vm_map.h>
84 #endif /* CONFIG_FREEZE */
85
86 #include <kern/kern_memorystatus_internal.h>
87 #include <sys/kern_memorystatus.h>
88 #include <sys/kern_memorystatus_xnu.h>
89 #include <sys/kern_memorystatus_freeze.h>
90 #include <sys/kern_memorystatus_notify.h>
91 #include <sys/kdebug_triage.h>
92 #include <sys/file_internal.h>
93 #include <net/necp.h>
94
95 errno_t mach_to_bsd_errno(kern_return_t mach_err);
96 extern uint32_t vm_compressor_pool_size(void);
97 extern uint32_t vm_compressor_fragmentation_level(void);
98
99 int block_corpses = 0; /* counter to block new corpses if jetsam purges them */
100
/*
 * Human-readable names for jetsam kill causes, used when logging kills.
 * Indexed positionally by the kMemorystatus* cause value noted on each
 * entry, so the order here must stay in sync with the kill-cause enum.
 */
static const char *memstat_kill_cause_name[] = {
	"",                              /* kMemorystatusInvalid */
	"jettisoned",                    /* kMemorystatusKilled */
	"highwater",                     /* kMemorystatusKilledHiwat */
	"vnode-limit",                   /* kMemorystatusKilledVnodes */
	"vm-pageshortage",               /* kMemorystatusKilledVMPageShortage */
	"proc-thrashing",                /* kMemorystatusKilledProcThrashing */
	"fc-thrashing",                  /* kMemorystatusKilledFCThrashing */
	"per-process-limit",             /* kMemorystatusKilledPerProcessLimit */
	"disk-space-shortage",           /* kMemorystatusKilledDiskSpaceShortage */
	"idle-exit",                     /* kMemorystatusKilledIdleExit */
	"zone-map-exhaustion",           /* kMemorystatusKilledZoneMapExhaustion */
	"vm-compressor-thrashing",       /* kMemorystatusKilledVMCompressorThrashing */
	"vm-compressor-space-shortage",  /* kMemorystatusKilledVMCompressorSpaceShortage */
	"low-swap",                      /* kMemorystatusKilledLowSwap */
	"sustained-memory-pressure",     /* kMemorystatusKilledSustainedPressure */
	"vm-pageout-starvation",         /* kMemorystatusKilledVMPageoutStarvation */
	"conclave-limit",                /* kMemorystatusKilledConclaveLimit */
	"long-idle-exit",                /* kMemorystatusKilledLongIdleExit */
};
122
123 static const char *
memorystatus_priority_band_name(int32_t priority)124 memorystatus_priority_band_name(int32_t priority)
125 {
126 switch (priority) {
127 case JETSAM_PRIORITY_FOREGROUND:
128 return "FOREGROUND";
129 case JETSAM_PRIORITY_AUDIO_AND_ACCESSORY:
130 return "AUDIO_AND_ACCESSORY";
131 case JETSAM_PRIORITY_CONDUCTOR:
132 return "CONDUCTOR";
133 case JETSAM_PRIORITY_DRIVER_APPLE:
134 return "DRIVER_APPLE";
135 case JETSAM_PRIORITY_HOME:
136 return "HOME";
137 case JETSAM_PRIORITY_EXECUTIVE:
138 return "EXECUTIVE";
139 case JETSAM_PRIORITY_IMPORTANT:
140 return "IMPORTANT";
141 case JETSAM_PRIORITY_CRITICAL:
142 return "CRITICAL";
143 }
144
145 return "?";
146 }
147
148 bool
is_reason_thrashing(unsigned cause)149 is_reason_thrashing(unsigned cause)
150 {
151 switch (cause) {
152 case kMemorystatusKilledFCThrashing:
153 case kMemorystatusKilledVMCompressorThrashing:
154 case kMemorystatusKilledVMCompressorSpaceShortage:
155 return true;
156 default:
157 return false;
158 }
159 }
160
161 bool
is_reason_zone_map_exhaustion(unsigned cause)162 is_reason_zone_map_exhaustion(unsigned cause)
163 {
164 return cause == kMemorystatusKilledZoneMapExhaustion;
165 }
166
167 /*
168 * Returns the current zone map size and capacity to include in the jetsam snapshot.
169 * Defined in zalloc.c
170 */
171 extern void get_zone_map_size(uint64_t *current_size, uint64_t *capacity);
172
173 /*
174 * Returns the name of the largest zone and its size to include in the jetsam snapshot.
175 * Defined in zalloc.c
176 */
177 extern void get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size);
178
179 static int memstat_set_memlimits_locked(proc_t p, int32_t active_limit,
180 int32_t inactive_limit, memlimit_options_t options);
181 static bool memstat_proc_is_active_locked(proc_t);
182
183 static int memorystatus_highwater_enabled = 1; /* Update the cached memlimit data. */
184
185 /*
186 * Cache this proc's active limit as its current limit before writing it to
187 * the ledger. Returns whether the new limit should be written to the ledger.
188 */
189 static inline bool
memstat_update_memlimit_locked(proc_t p,bool use_active)190 memstat_update_memlimit_locked(proc_t p, bool use_active)
191 {
192 bool ledger_needed = false;
193 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
194
195 /* Cache limit value */
196 if (use_active && p->p_memstat_memlimit != p->p_memstat_memlimit_active) {
197 p->p_memstat_memlimit = p->p_memstat_memlimit_active;
198 ledger_needed = true;
199 } else if (!use_active &&
200 p->p_memstat_memlimit != p->p_memstat_memlimit_inactive) {
201 p->p_memstat_memlimit = p->p_memstat_memlimit_inactive;
202 ledger_needed = true;
203 }
204
205 /* Cache limit fatality */
206 if (_memstat_proc_memlimit_is_fatal(p, use_active) &&
207 !_memstat_proc_cached_memlimit_is_fatal(p)) {
208 p->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;
209 ledger_needed = true;
210 } else if (!_memstat_proc_memlimit_is_fatal(p, use_active) &&
211 _memstat_proc_cached_memlimit_is_fatal(p)) {
212 p->p_memstat_state &= ~P_MEMSTAT_FATAL_MEMLIMIT;
213 ledger_needed = true;
214 }
215
216 return ledger_needed;
217 }
218
/*
 * Write the process' current memlimit to the ledger for enforcement.
 *
 * Holding the proc_list_lock while writing to the ledgers (where the task
 * lock is taken) can be problematic. The proc list lock may optionally be
 * dropped and re-taken while writing limits to the ledger. (rdar://21394491)
 *
 * p         - target proc; caller must hold proc_list_lock
 * is_active - whether the cached limit being written is the active limit
 * drop_lock - if true, take a proc ref and drop/re-take proc_list_lock
 *             around the ledger write
 *
 * Returns 0 on success, ESRCH if a proc reference could not be taken, or
 * the BSD errno translation of the Mach error from the ledger write.
 */
static int
_memstat_write_memlimit_to_ledger_locked(proc_t p, bool is_active, bool drop_lock)
{
	kern_return_t kr;
	bool is_fatal = _memstat_proc_cached_memlimit_is_fatal(p);

	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);

#if MACH_ASSERT
	/*
	 * When highwater limits are enabled, the cached limit/fatality must
	 * already agree with the active/inactive state being written.
	 */
	if (memorystatus_highwater_enabled) {
		if (is_active) {
			assert3u(is_fatal, ==, _memstat_proc_active_memlimit_is_fatal(p));
			assert3u(p->p_memstat_memlimit, ==, p->p_memstat_memlimit_active);
		} else {
			assert3u(is_fatal, ==, _memstat_proc_inactive_memlimit_is_fatal(p));
			assert3u(p->p_memstat_memlimit, ==, p->p_memstat_memlimit_inactive);
		}
	}
#endif /* MACH_ASSERT */

	if (drop_lock) {
		/* Ref the proc before unlocking so it cannot exit underneath us. */
		if (proc_ref(p, true) != p) {
			memorystatus_log_error("Unable to take a reference on proc %s [%d]. "
			    "Cannot update memlimit", proc_best_name(p), proc_getpid(p));
			return ESRCH;
		}
		proc_list_unlock();
	}

	memorystatus_log_debug("memorystatus: new limit on pid %d (%dMB %s)\n",
	    proc_getpid(p), (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
	    (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"));

	/* A non-positive cached limit is written as -1 ("no limit"). */
	kr = task_set_phys_footprint_limit_internal(proc_task(p),
	    (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1,
	    NULL, is_active, is_fatal);

	if (drop_lock) {
		proc_list_lock();
		proc_rele(p);
	}

	if (kr != KERN_SUCCESS) {
		memorystatus_log_fault("memorystatus: error (%d) setting memlimit in "
		    "ledger for %s [%d]\n", kr, proc_best_name(p), proc_pid(p));
		return mach_to_bsd_errno(kr);
	}
	return 0;
}
275
276 #pragma mark General Tunables
277
278 #define MEMORYSTATUS_SMALL_MEMORY_THRESHOLD (3UL * (1UL << 30))
279 #define MEMORYSTATUS_MEDIUM_MEMORY_THRESHOLD (6UL * (1UL << 30))
280
281 #define MEMORYSTATUS_CLEAR_THE_DECKS_OFFSET_PERCENTAGE 5UL
282 #define MEMORYSTATUS_BALLAST_OFFSET_PERCENTAGE 5UL
283 #define MEMORYSTATUS_AGGR_SYSPROC_AGING_PERCENTAGE 7UL
284 #define MEMORYSTATUS_DELTA_PERCENTAGE_LARGE 4UL
285 #define MEMORYSTATUS_DELTA_PERCENTAGE_SMALL 5UL
286
287 /*
288 * Fall back to these percentages/ratios if a mb value is not provided via EDT
289 * DRAM (GB) | critical | idle | pressure | reaper | freeze
290 * (0,3] | 5% | 10% | 15% | 20% | 50%
291 * (3,6] | 4% | 9% | 15% | 18% | 50%
292 * (6,∞) | 4% | 8% | 12% | 16% | 50%
293 */
294
295 #define MEMORYSTATUS_CRITICAL_PERCENTAGE_SMALL 5UL
296 #define MEMORYSTATUS_CRITICAL_PERCENTAGE_LARGE 4UL
297
298 #define MEMORYSTATUS_IDLE_RATIO_NUM 2UL
299 #define MEMORYSTATUS_IDLE_RATIO_DENOM 1UL
300 #define MEMORYSTATUS_PRESSURE_RATIO_NUM 3UL
301 #define MEMORYSTATUS_PRESSURE_RATIO_DENOM 1UL
302 #define MEMORYSTATUS_REAPER_RATIO_NUM 4UL
303 #define MEMORYSTATUS_REAPER_RATIO_DENOM 1UL
304
305 #if (XNU_TARGET_OS_IOS && !XNU_TARGET_OS_XR) || XNU_TARGET_OS_WATCH
306 #define MEMORYSTATUS_REAPER_ENABLED_DEFAULT TRUE
307 #else
308 #define MEMORYSTATUS_REAPER_ENABLED_DEFAULT FALSE
309 #endif /* (XNU_TARGET_OS_IOS && !XNU_TARGET_OS_XR) || XNU_TARGET_OS_WATCH */
310 #define MEMORYSTATUS_REAPER_MIN_AGE_SECS_DEFAULT 300
311 #define MEMORYSTATUS_REAPER_MIN_AGE_APPS_SECS_DEFAULT 300
312 #define MEMORYSTATUS_REAPER_MAX_PRIORITY_DEFAULT JETSAM_PRIORITY_IDLE
313 #define MEMORYSTATUS_REAPER_RESCAN_SECS_DEFAULT 30
314 #define MEMORYSTATUS_REAPER_SENTINAL_VALUE_MEANING_USE_DEFAULT -1
315
316 #define MEMORYSTATUS_REAPER_REAP_RELAUNCH_MASK_UNKNOWN (P_MEMSTAT_RELAUNCH_HIGH << 1)
317 #define MEMORYSTATUS_REAPER_REAP_RELAUNCH_MASK_DEFAULT (P_MEMSTAT_RELAUNCH_LOW | MEMORYSTATUS_REAPER_REAP_RELAUNCH_MASK_UNKNOWN)
318
319 /*
320 * For historical reasons, devices with "medium"-sized memory configs have a critical:idle:pressure ratio of
321 * 4:9:15. This ratio is preserved for these devices when a fixed-mb base value has not been provided by EDT/boot-arg;
322 * all other devices use a 1:2:3 ratio.
323 */
324 #define MEMORYSTATUS_IDLE_RATIO_NUM_MEDIUM 9UL
325 #define MEMORYSTATUS_IDLE_RATIO_DENOM_MEDIUM 4UL
326 #define MEMORYSTATUS_PRESSURE_RATIO_NUM_MEDIUM 15UL
327 #define MEMORYSTATUS_PRESSURE_RATIO_DENOM_MEDIUM 4UL
328 #define MEMORYSTATUS_REAPER_RATIO_NUM_MEDIUM 20UL
329 #define MEMORYSTATUS_REAPER_RATIO_DENOM_MEDIUM 4UL
330 #define MEMORYSTATUS_REAPER_MIN_AGE_SECS_DEFAULT_MEDIUM 240
331 #define MEMORYSTATUS_REAPER_MIN_AGE_APPS_SECS_DEFAULT_MEDIUM 240
332
/*
 * For large-config devices, set the reaper threshold to 19% of the memory size.
 */
336 #define MEMORYSTATUS_REAPER_RATIO_NUM_LARGE 19UL
337 #define MEMORYSTATUS_REAPER_RATIO_DENOM_LARGE 4UL
338
339 static int32_t memorystatus_get_default_task_active_limit(proc_t p);
340 static int32_t memorystatus_get_default_task_inactive_limit(proc_t p);
341
342 /*
343 * default jetsam snapshot support
344 */
345 memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot;
346
347 #if CONFIG_FREEZE
348 memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot_freezer;
349 /*
350 * The size of the freezer snapshot is given by memorystatus_jetsam_snapshot_max / JETSAM_SNAPSHOT_FREEZER_MAX_FACTOR
351 * The freezer snapshot can be much smaller than the default snapshot
352 * because it only includes apps that have been killed and dasd consumes it every 30 minutes.
353 * Since the snapshots are always wired we don't want to overallocate too much.
354 */
355 #define JETSAM_SNAPSHOT_FREEZER_MAX_FACTOR 20
356 unsigned int memorystatus_jetsam_snapshot_freezer_max;
357 unsigned int memorystatus_jetsam_snapshot_freezer_size;
358 TUNABLE(bool, memorystatus_jetsam_use_freezer_snapshot, "kern.jetsam_user_freezer_snapshot", true);
359
360 #define MEMORYSTATUS_FREEZE_THRESHOLD_PERCENTAGE 50UL
361 TUNABLE_DT(uint32_t, memorystatus_freeze_threshold_mb, "/defaults", "kern.memstat_freeze_mb",
362 "memorystatus_freeze_threshold_mb", 0, TUNABLE_DT_NONE);
363 #endif /* CONFIG_FREEZE */
364
365 unsigned int memorystatus_jetsam_snapshot_count = 0;
366 unsigned int memorystatus_jetsam_snapshot_max = 0;
367 unsigned int memorystatus_jetsam_snapshot_size = 0;
368 uint64_t memorystatus_jetsam_snapshot_last_timestamp = 0;
369 uint64_t memorystatus_jetsam_snapshot_timeout = 0;
370
371 #define JETSAM_SNAPSHOT_TIMEOUT_SECS 30
372
373 #if DEVELOPMENT || DEBUG
/*
 * On development and debug kernels, we allow one pid to take ownership
 * of some memorystatus data structures for testing purposes (via memorystatus_control).
 * If there's an owner, then only they may consume the jetsam snapshot & set freezer probabilities.
 * This is used when testing these interfaces to avoid racing with other
 * processes on the system that typically use them (namely OSAnalytics & dasd).
 */
381 static pid_t memorystatus_testing_pid = 0;
382 SYSCTL_INT(_kern, OID_AUTO, memorystatus_testing_pid, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_testing_pid, 0, "");
383 #endif /* DEVELOPMENT || DEBUG */
384
385 /*
386 * jetsam zprint snapshot data
387 */
388 #if JETSAM_ZPRINT_SNAPSHOT
389 static unsigned int jzs_trigger_band = JETSAM_PRIORITY_FOREGROUND;
390 static mach_zone_name_t *jzs_names = NULL;
391 static mach_zone_info_t *jzs_info = NULL;
392 static int *jzs_coalesce = NULL;
393 static unsigned int jzs_zone_cnt = 0;
394 static mach_memory_info_t *jzs_meminfo = NULL;
395 static unsigned int jzs_meminfo_cnt = 0;
396 static uint64_t jzs_gencount = (uint64_t) -1ll;
397
398 #if DEVELOPMENT || DEBUG
399 SYSCTL_UINT(_kern, OID_AUTO, jzs_trigger_band, CTLFLAG_RW | CTLFLAG_LOCKED, &jzs_trigger_band, 0, "Priority band threshold for taking jetsam zprint snapshot");
400 #endif /* DEVELOPMENT || DEBUG */
401 #endif /* JETSAM_ZPRINT_SNAPSHOT */
402
403
404 static void memorystatus_init_jetsam_snapshot_header(memorystatus_jetsam_snapshot_t *snapshot);
405
406 /* General memorystatus stuff */
407
408 /*
409 * Daemons: The actual idle deferred time for the daemon is based on
410 * the relaunch behavior of the daemon. The relaunch behavior determines
411 * the scaling factor applied to memorystatus_sysprocs_idle_delay_time. See
412 * kJetsamSysProcsIdleDelayTime* ratios defined in kern_memorystatus.c
413 *
414 * Apps: The apps are aged for memorystatus_apps_idle_delay_time factored
415 * by kJetsamAppsIdleDelayTimeRatio.
416 */
417 TUNABLE(uint64_t, memstat_idle_deferral_time_s, "memorystatus_idle_deferral_time_s", 10);
418 TUNABLE(uint64_t, memstat_aging_stuck_time_s, "memorystatus_aging_stuck_time_s", 30);
419 uint64_t memorystatus_sysprocs_idle_delay_time = 0;
420 uint64_t memorystatus_apps_idle_delay_time = 0;
421 uint64_t memorystatus_aging_stuck_delay_time = 0;
422 /* 2GB devices support an entitlement for a higher app memory limit of "almost 2GB". */
423 static int32_t memorystatus_ios13extended_footprint_limit_mb = 1800;
424
425 #define CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT XNU_TARGET_OS_XR
426
427 /* Some devices give entitled apps a higher memory limit */
428 TUNABLE_DT_WRITEABLE(int32_t, memorystatus_entitled_max_task_footprint_mb,
429 "/defaults", "kern.entitled_max_task_pmem",
430 "entitled_max_task_pmem", 0, TUNABLE_DT_NONE);
431 TUNABLE_DT_WRITEABLE(int32_t, memorystatus_entitled_dev_max_task_footprint_mb,
432 "/defaults", "kern.entitled_dev_max_task_pmem",
433 "entitled_dev_max_task_pmem", 0, TUNABLE_DT_NONE);
434 #if CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT
435 /* On visionOS, we want a separate high memory limit for bincompat (iOS) apps. */
436 TUNABLE_DT_WRITEABLE(int32_t, memorystatus_entitled_bincompat_max_task_footprint_mb,
437 "/defaults", "kern.entitled_bc_max_task_pmem",
438 "entitled_bincompat_max_task_pmem", 0, TUNABLE_DT_NONE);
439 #endif // CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT
440 #if DEBUG || DEVELOPMENT
441 TUNABLE(bool, memstat_ignore_task_limit_increase, "memstat_no_task_limit_increase", false);
442 #endif /* DEBUG || DEVELOPMENT */
443
444 #if __arm64__
445 #if DEVELOPMENT || DEBUG
446 SYSCTL_INT(_kern, OID_AUTO, ios13extended_footprint_limit_mb,
447 CTLFLAG_RD | CTLFLAG_LOCKED,
448 &memorystatus_ios13extended_footprint_limit_mb, 0, "");
449 SYSCTL_INT(_kern, OID_AUTO, entitled_max_task_pmem,
450 CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_KERN,
451 &memorystatus_entitled_max_task_footprint_mb, 0, "");
452 SYSCTL_INT(_kern, OID_AUTO, entitled_dev_max_task_pmem,
453 CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_KERN,
454 &memorystatus_entitled_dev_max_task_footprint_mb, 0, "");
455 #if CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT
456 SYSCTL_INT(_kern, OID_AUTO, entitled_bincompat_max_task_pmem,
457 CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_KERN,
458 &memorystatus_entitled_bincompat_max_task_footprint_mb, 0, "");
459 #endif /* CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT */
460 #else /* !(DEVELOPMENT || DEBUG) */
461 SYSCTL_INT(_kern, OID_AUTO, entitled_max_task_pmem,
462 CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_KERN,
463 &memorystatus_entitled_max_task_footprint_mb, 0, "");
464 #if CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT
465 SYSCTL_INT(_kern, OID_AUTO, entitled_bincompat_max_task_pmem,
466 CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_KERN,
467 &memorystatus_entitled_bincompat_max_task_footprint_mb, 0, "");
468 #endif /* CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT */
469 #endif /* DEVELOPMENT || DEBUG */
470 #endif /* __arm64__ */
471
472 #pragma mark Logging
473
474 os_log_t memorystatus_log_handle;
475
476 TUNABLE_WRITEABLE(memorystatus_log_level_t, memorystatus_log_level, "memorystatus_log_level", MEMORYSTATUS_LOG_LEVEL_DEFAULT);
477
478 #if DEBUG || DEVELOPMENT
479 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_log_level, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_log_level, MEMORYSTATUS_LOG_LEVEL_DEFAULT, "");
480 #endif
481
482 #pragma mark Locks
483
484 static LCK_GRP_DECLARE(memorystatus_lock_group, "memorystatus");
485
486 /* Synchronizes jetsam pressure broadcasts */
487 LCK_MTX_DECLARE(memorystatus_jetsam_broadcast_lock, &memorystatus_lock_group);
488
489 #if DEVELOPMENT || DEBUG
490 static LCK_MTX_DECLARE(disconnect_page_mappings_mutex, &memorystatus_lock_group);
491 #endif /* DEVELOPMENT || DEBUG */
492
493 /* Idle guard handling */
494
495 static void memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2);
496 static void _memstat_invalidate_idle_demotion_locked(proc_t p);
497 static void _memstat_schedule_idle_demotion_locked(proc_t p);
498 static void _memstat_reschedule_idle_demotion_locked(void);
499 int memorystatus_update_priority_for_appnap(proc_t p);
500 vm_pressure_level_t convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);
501 boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t);
502 void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear);
503 void memorystatus_send_low_swap_note(void);
504 boolean_t memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, unsigned int band, int aggr_count,
505 uint32_t *errors, uint64_t *memory_reclaimed);
506 static bool memorystatus_kill_proc(proc_t p, uint32_t cause,
507 os_reason_t jetsam_reason, bool *killed, uint64_t *footprint_of_killed_proc);
508 /* Synchronously kill a process in priority order */
509 static bool memstat_kill_top_process(uint32_t cause, os_reason_t jetsam_reason,
510 int32_t max_priority, memstat_kill_options_t options,
511 int32_t *priority, uint32_t *errors, uint64_t *memory_reclaimed);
512
513 uint64_t memorystatus_available_memory_internal(proc_t p);
514 void memorystatus_thread_wake(void);
515 static bool _memstat_consider_waking_jetsam_thread(void);
516 #if CONFIG_JETSAM
517 static void memorystatus_thread_pool_default(void);
518 static void memorystatus_thread_pool_max(void);
519 #endif /* CONFIG_JETSAM */
520
521 unsigned int memorystatus_level = 0;
522 static int memorystatus_list_count = 0;
523 memstat_bucket_t memstat_bucket[MEMSTAT_BUCKET_COUNT];
524 static thread_call_t memorystatus_idle_demotion_call;
525 uint64_t memstat_idle_demotion_deadline = 0;
526 #if CONFIG_FREEZE
527 unsigned int memorystatus_suspended_count = 0;
528 #endif /* CONFIG_FREEZE */
529
530 #if XNU_TARGET_OS_OSX
531 /*
532 * Effectively disable the system process and application demotion
533 * logic on macOS. This means system processes and apps won't get the
534 * 10 second protection before landing in the IDLE band after moving
535 * out of their active band. Reasons:-
536 * - daemons + extensions + apps on macOS don't behave the way they
537 * do on iOS and so they are confusing the demotion logic. For example,
538 * not all apps go from FG to IDLE. Some sit in higher bands instead. This
539 * is causing multiple asserts to fire internally.
540 * - we use the aging bands to protect processes from jetsam. But on macOS,
541 * we have a very limited jetsam that is only invoked under extreme conditions
542 * where we have no more swap / compressor space OR are under critical pressure.
543 */
544 int system_procs_aging_band = 0;
545 int system_procs_aging_band_stuck = 0;
546 int applications_aging_band = 0;
547 #else /* XNU_TARGET_OS_OSX */
548 int system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
549 int system_procs_aging_band_stuck = JETSAM_PRIORITY_AGING_BAND1_STUCK;
550 int applications_aging_band = JETSAM_PRIORITY_AGING_BAND2;
551 #endif /* XNU_TARGET_OS_OSX */
552
553 /* the jetsam band which will contain P_MEMSTAT_FROZEN processes */
554 #if CONFIG_FREEZE
555 int memorystatus_freeze_jetsam_band = JETSAM_PRIORITY_FREEZER;
556 #else /* CONFIG_FREEZE */
557 int memorystatus_freeze_jetsam_band = 0;
558 #endif /* CONFIG_FREEZE */
559
560 _Atomic bool memorystatus_zone_map_is_exhausted = false;
561 _Atomic bool memorystatus_compressor_space_shortage = false;
562 _Atomic bool memorystatus_pageout_starved = false;
563 #if CONFIG_PHANTOM_CACHE
564 _Atomic bool memorystatus_phantom_cache_pressure = false;
565 #endif /* CONFIG_PHANTOM_CACHE */
566
567 bool memorystatus_should_issue_fg_band_notify = true;
568
569 extern void coalition_mark_swappable(coalition_t coal);
570 extern bool coalition_is_swappable(coalition_t coal);
571 boolean_t memorystatus_allowed_vm_map_fork(task_t, bool *);
572 #if DEVELOPMENT || DEBUG
573 void memorystatus_abort_vm_map_fork(task_t);
574 #endif
575
576 SYSCTL_NODE(_kern, OID_AUTO, memorystatus,
577 CTLFLAG_RW | CTLFLAG_LOCKED, 0, "memorystatus subsystem");
578
579 /*
580 * Idle delay timeout factors for daemons based on relaunch behavior. Only used in
581 * kJetsamAgingPolicySysProcsReclaimedFirst aging policy.
582 */
583 #define kJetsamSysProcsIdleDelayTimeLowRatio (5)
584 #define kJetsamSysProcsIdleDelayTimeMedRatio (2)
585 #define kJetsamSysProcsIdleDelayTimeHighRatio (1)
586
587 /*
588 * For the kJetsamAgingPolicySysProcsReclaimedFirst aging policy, treat apps as well
589 * behaved daemons for aging purposes.
590 */
591 #define kJetsamAppsIdleDelayTimeRatio (kJetsamSysProcsIdleDelayTimeLowRatio)
592
593 static uint64_t
memorystatus_sysprocs_idle_time(proc_t p)594 memorystatus_sysprocs_idle_time(proc_t p)
595 {
596 uint64_t idle_delay_time = 0;
597 /*
598 * For system processes, base the idle delay time on the
599 * jetsam relaunch behavior specified by launchd. The idea
600 * is to provide extra protection to the daemons which would
601 * relaunch immediately after jetsam.
602 */
603 switch (p->p_memstat_relaunch_flags) {
604 case P_MEMSTAT_RELAUNCH_UNKNOWN:
605 case P_MEMSTAT_RELAUNCH_LOW:
606 idle_delay_time = memorystatus_sysprocs_idle_delay_time / kJetsamSysProcsIdleDelayTimeLowRatio;
607 break;
608 case P_MEMSTAT_RELAUNCH_MED:
609 idle_delay_time = memorystatus_sysprocs_idle_delay_time / kJetsamSysProcsIdleDelayTimeMedRatio;
610 break;
611 case P_MEMSTAT_RELAUNCH_HIGH:
612 idle_delay_time = memorystatus_sysprocs_idle_delay_time / kJetsamSysProcsIdleDelayTimeHighRatio;
613 break;
614 default:
615 panic("Unknown relaunch flags on process!");
616 break;
617 }
618 return idle_delay_time;
619 }
620
621 static uint64_t
memorystatus_apps_idle_time(__unused proc_t p)622 memorystatus_apps_idle_time(__unused proc_t p)
623 {
624 return memorystatus_apps_idle_delay_time / kJetsamAppsIdleDelayTimeRatio;
625 }
626
627 static uint64_t
_memstat_sysprocs_aging_stuck_delay_time(__unused proc_t p)628 _memstat_sysprocs_aging_stuck_delay_time(__unused proc_t p)
629 {
630 return memorystatus_aging_stuck_delay_time;
631 }
632
633
634 static int
635 sysctl_jetsam_set_sysprocs_idle_delay_time SYSCTL_HANDLER_ARGS
636 {
637 #pragma unused(oidp, arg1, arg2)
638
639 int error = 0, val = 0, old_time_in_secs = 0;
640 uint64_t old_time_in_ns = 0;
641
642 absolutetime_to_nanoseconds(memorystatus_sysprocs_idle_delay_time, &old_time_in_ns);
643 old_time_in_secs = (int) (old_time_in_ns / NSEC_PER_SEC);
644
645 error = sysctl_io_number(req, old_time_in_secs, sizeof(int), &val, NULL);
646 if (error || !req->newptr) {
647 return error;
648 }
649
650 if ((val < 0) || (val > INT32_MAX)) {
651 memorystatus_log_error("jetsam: new idle delay interval has invalid value.\n");
652 return EINVAL;
653 }
654
655 nanoseconds_to_absolutetime((uint64_t)val * NSEC_PER_SEC, &memorystatus_sysprocs_idle_delay_time);
656
657 return 0;
658 }
659
660 SYSCTL_PROC(_kern_memorystatus, OID_AUTO, sysprocs_idle_delay_time_ns, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
661 0, 0, sysctl_jetsam_set_sysprocs_idle_delay_time, "I", "Aging window for system processes");
662
663
664 static int
665 sysctl_jetsam_set_apps_idle_delay_time SYSCTL_HANDLER_ARGS
666 {
667 #pragma unused(oidp, arg1, arg2)
668
669 int error = 0, val = 0, old_time_in_secs = 0;
670 uint64_t old_time_in_ns = 0;
671
672 absolutetime_to_nanoseconds(memorystatus_apps_idle_delay_time, &old_time_in_ns);
673 old_time_in_secs = (int) (old_time_in_ns / NSEC_PER_SEC);
674
675 error = sysctl_io_number(req, old_time_in_secs, sizeof(int), &val, NULL);
676 if (error || !req->newptr) {
677 return error;
678 }
679
680 if ((val < 0) || (val > INT32_MAX)) {
681 memorystatus_log_error("jetsam: new idle delay interval has invalid value.\n");
682 return EINVAL;
683 }
684
685 nanoseconds_to_absolutetime((uint64_t)val * NSEC_PER_SEC, &memorystatus_apps_idle_delay_time);
686
687 return 0;
688 }
689
690 SYSCTL_PROC(_kern_memorystatus, OID_AUTO, apps_idle_delay_time_ns, CTLTYPE_INT | CTLFLAG_RW,
691 0, 0, sysctl_jetsam_set_apps_idle_delay_time, "I", "Aging window for applications");
692
693 SYSCTL_INT(_kern, OID_AUTO, max_task_pmem, CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_KERN, &max_task_footprint_mb, 0, "");
694
695 #if __arm64__
696 int legacy_footprint_bonus_mb = 50; /* This value was chosen after looking at the top 30 apps
697 * that needed the additional room in their footprint when
698 * the 'correct' accounting methods were applied to them.
699 */
700
701 #if DEVELOPMENT || DEBUG
702 SYSCTL_INT(_kern, OID_AUTO, legacy_footprint_bonus_mb, CTLFLAG_RW | CTLFLAG_LOCKED, &legacy_footprint_bonus_mb, 0, "");
703 #endif /* DEVELOPMENT || DEBUG */
704 /*
705 * Raise the inactive and active memory limits to new values.
706 * Will only raise the limits and will do nothing if either of the current
707 * limits are 0.
708 * Caller must hold the proc_list_lock
709 */
710 static void
memorystatus_raise_memlimit_locked(proc_t p,int new_memlimit_active,int new_memlimit_inactive)711 memorystatus_raise_memlimit_locked(proc_t p,
712 int new_memlimit_active,
713 int new_memlimit_inactive)
714 {
715 int memlimit_mb_active = 0, memlimit_mb_inactive = 0;
716 memlimit_options_t memlimit_options = MEMLIMIT_OPTIONS_NONE;
717
718 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
719
720 if (p->p_memstat_memlimit_active > 0) {
721 memlimit_mb_active = p->p_memstat_memlimit_active;
722 } else if (p->p_memstat_memlimit_active == -1) {
723 memlimit_mb_active = max_task_footprint_mb;
724 } else {
725 /*
726 * Nothing to do for '0' which is
727 * a special value only used internally
728 * to test 'no limits'.
729 */
730 return;
731 }
732
733 if (p->p_memstat_memlimit_inactive > 0) {
734 memlimit_mb_inactive = p->p_memstat_memlimit_inactive;
735 } else if (p->p_memstat_memlimit_inactive == -1) {
736 memlimit_mb_inactive = max_task_footprint_mb;
737 } else {
738 /*
739 * Nothing to do for '0' which is
740 * a special value only used internally
741 * to test 'no limits'.
742 */
743 return;
744 }
745
746 memlimit_mb_active = MAX(new_memlimit_active, memlimit_mb_active);
747 memlimit_mb_inactive = MAX(new_memlimit_inactive, memlimit_mb_inactive);
748
749 /* Maintain pre-existing limit fatality */
750 if (_memstat_proc_active_memlimit_is_fatal(p)) {
751 memlimit_options |= MEMLIMIT_ACTIVE_FATAL;
752 }
753 if (_memstat_proc_inactive_memlimit_is_fatal(p)) {
754 memlimit_options |= MEMLIMIT_INACTIVE_FATAL;
755 }
756
757 memstat_set_memlimits_locked(p, memlimit_mb_active,
758 memlimit_mb_inactive, memlimit_options);
759 }
760
761 void
memorystatus_act_on_legacy_footprint_entitlement(proc_t p,boolean_t footprint_increase)762 memorystatus_act_on_legacy_footprint_entitlement(proc_t p, boolean_t footprint_increase)
763 {
764 int memlimit_mb_active = 0, memlimit_mb_inactive = 0;
765
766 if (p == NULL) {
767 return;
768 }
769
770 proc_list_lock();
771
772 if (p->p_memstat_memlimit_active > 0) {
773 memlimit_mb_active = p->p_memstat_memlimit_active;
774 } else if (p->p_memstat_memlimit_active == -1) {
775 memlimit_mb_active = max_task_footprint_mb;
776 } else {
777 /*
778 * Nothing to do for '0' which is
779 * a special value only used internally
780 * to test 'no limits'.
781 */
782 proc_list_unlock();
783 return;
784 }
785
786 if (p->p_memstat_memlimit_inactive > 0) {
787 memlimit_mb_inactive = p->p_memstat_memlimit_inactive;
788 } else if (p->p_memstat_memlimit_inactive == -1) {
789 memlimit_mb_inactive = max_task_footprint_mb;
790 } else {
791 /*
792 * Nothing to do for '0' which is
793 * a special value only used internally
794 * to test 'no limits'.
795 */
796 proc_list_unlock();
797 return;
798 }
799
800 if (footprint_increase) {
801 memlimit_mb_active += legacy_footprint_bonus_mb;
802 memlimit_mb_inactive += legacy_footprint_bonus_mb;
803 } else {
804 memlimit_mb_active -= legacy_footprint_bonus_mb;
805 if (memlimit_mb_active == max_task_footprint_mb) {
806 memlimit_mb_active = -1; /* reverting back to default system limit */
807 }
808
809 memlimit_mb_inactive -= legacy_footprint_bonus_mb;
810 if (memlimit_mb_inactive == max_task_footprint_mb) {
811 memlimit_mb_inactive = -1; /* reverting back to default system limit */
812 }
813 }
814 memorystatus_raise_memlimit_locked(p, memlimit_mb_active, memlimit_mb_inactive);
815
816 proc_list_unlock();
817 }
818
819 void
memorystatus_act_on_ios13extended_footprint_entitlement(proc_t p)820 memorystatus_act_on_ios13extended_footprint_entitlement(proc_t p)
821 {
822 proc_list_lock();
823 memorystatus_raise_memlimit_locked(p,
824 memorystatus_ios13extended_footprint_limit_mb,
825 memorystatus_ios13extended_footprint_limit_mb);
826 proc_list_unlock();
827 }
828
829 void
memorystatus_act_on_entitled_task_limit(proc_t p)830 memorystatus_act_on_entitled_task_limit(proc_t p)
831 {
832 int memlimit;
833 if (memorystatus_entitled_max_task_footprint_mb == 0) {
834 // Entitlement is not supported on this device.
835 return;
836 }
837 proc_list_lock();
838 #if CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT
839 // We want a separate memory limit for bincompat (iPad) apps on visionOS.
840 switch (proc_platform(p)) {
841 case PLATFORM_XROS:
842 case PLATFORM_XROSSIMULATOR:
843 memlimit = memorystatus_entitled_max_task_footprint_mb;
844 break;
845 default:
846 if (memorystatus_entitled_bincompat_max_task_footprint_mb != 0) {
847 memlimit = memorystatus_entitled_bincompat_max_task_footprint_mb;
848 } else {
849 memlimit = memorystatus_entitled_max_task_footprint_mb;
850 }
851 break;
852 }
853 #else // CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT
854 memlimit = memorystatus_entitled_max_task_footprint_mb;
855 #endif // !CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT
856 memorystatus_raise_memlimit_locked(p, memlimit, memlimit);
857 proc_list_unlock();
858 }
859
860 void
memorystatus_act_on_entitled_developer_task_limit(proc_t p)861 memorystatus_act_on_entitled_developer_task_limit(proc_t p)
862 {
863 if (memorystatus_entitled_dev_max_task_footprint_mb == 0) {
864 // Entitlement not supported on this device
865 return;
866 }
867 memorystatus_log("memorystatus: WARNING %s [%d] is receiving an entitled "
868 "debugging memory limit. This is intended only for debugging and "
869 "can result in unstable device behavior.",
870 proc_best_name(p), proc_getpid(p));
871 proc_list_lock();
872 memorystatus_raise_memlimit_locked(p,
873 memorystatus_entitled_dev_max_task_footprint_mb,
874 memorystatus_entitled_dev_max_task_footprint_mb);
875 proc_list_unlock();
876 }
877
878 #endif /* __arm64__ */
879
/* Current memorystatus (memory pressure) level, read-only via sysctl. */
SYSCTL_INT(_kern, OID_AUTO, memorystatus_level, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_level, 0, "");
881
882 int
memorystatus_get_level(__unused struct proc * p,struct memorystatus_get_level_args * args,__unused int * ret)883 memorystatus_get_level(__unused struct proc *p, struct memorystatus_get_level_args *args, __unused int *ret)
884 {
885 user_addr_t level = 0;
886
887 level = args->level;
888
889 if (copyout(&memorystatus_level, level, sizeof(memorystatus_level)) != 0) {
890 return EFAULT;
891 }
892
893 return 0;
894 }
895
/* Forward declarations for the jetsam worker thread and memlimit plumbing. */
static void memorystatus_thread(void *param __unused, wait_result_t wr __unused);

/* Memory Limits */

static bool memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason);
static bool memstat_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason);


static int memorystatus_cmd_set_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);

#if DEBUG || DEVELOPMENT
static int memorystatus_cmd_set_diag_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);
static int memorystatus_cmd_get_diag_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);
static int memorystatus_set_diag_memlimit_properties_internal(proc_t p, memorystatus_diag_memlimit_properties_t *p_entry);
static int memorystatus_get_diag_memlimit_properties_internal(proc_t p, memorystatus_diag_memlimit_properties_t *p_entry);
#endif // DEBUG || DEVELOPMENT
static int memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties_t *entry);

static int memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);

static int memorystatus_cmd_get_memlimit_excess_np(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);

static void memorystatus_get_memlimit_properties_internal(proc_t p, memorystatus_memlimit_properties_t *p_entry);

int proc_get_memstat_priority(proc_t, boolean_t);

/* When set, jetsam snapshots are also taken for idle-band kills. */
static boolean_t memorystatus_idle_snapshot = 0;

unsigned int memorystatus_delta = 0;

/* Jetsam Loop Detection */
boolean_t memorystatus_jld_enabled = FALSE; /* Enable jetsam loop detection */
uint32_t memorystatus_jld_eval_period_msecs = 0; /* Init pass sets this based on device memory size */
int memorystatus_jld_max_kill_loops = 2; /* How many times should we try and kill up to the target band */

/*
 * A FG app can request that the aggressive jetsam mechanism display some leniency in the FG band. This 'lenient' mode is described as:
 * --- if aggressive jetsam kills an app in the FG band and gets back >=AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD memory, it will stop the aggressive march further into and up the jetsam bands.
 *
 * RESTRICTIONS:
 * - Such a request is respected/acknowledged only once while that 'requesting' app is in the FG band i.e. if aggressive jetsam was
 * needed and the 'lenient' mode was deployed then that's it for this special mode while the app is in the FG band.
 *
 * - If the app is still in the FG band and aggressive jetsam is needed again, there will be no stop-and-check the next time around.
 *
 * - Also, the transition of the 'requesting' app away from the FG band will void this special behavior.
 */

#define AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD 25
boolean_t memorystatus_aggressive_jetsam_lenient_allowed = FALSE;
boolean_t memorystatus_aggressive_jetsam_lenient = FALSE;

#if DEVELOPMENT || DEBUG
/*
 * Jetsam Loop Detection tunables.
 */

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_eval_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_jld_eval_period_msecs, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_max_kill_loops, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_jld_max_kill_loops, 0, "");
#endif /* DEVELOPMENT || DEBUG */

/*
 * snapshot support for memstats collected at boot.
 */
static memorystatus_jetsam_snapshot_t memorystatus_at_boot_snapshot;

static void memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snapshot, uint32_t ods_list_count);
static boolean_t memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry, uint64_t gencount);
static void memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime);

static void memorystatus_clear_errors(void);

static void memorystatus_get_task_phys_footprint_page_counts(task_t task,
    uint64_t *internal_pages, uint64_t *internal_compressed_pages,
    uint64_t *purgeable_nonvolatile_pages, uint64_t *purgeable_nonvolatile_compressed_pages,
    uint64_t *alternate_accounting_pages, uint64_t *alternate_accounting_compressed_pages,
    uint64_t *iokit_mapped_pages, uint64_t *page_table_pages, uint64_t *frozen_to_swap_pages,
    uint64_t *neural_nofootprint_total_pages);

static void memorystatus_get_task_memory_region_count(task_t task, uint64_t *count);

static memorystatus_proc_state_t _memstat_build_state(proc_t p);
//static boolean_t memorystatus_issue_pressure_kevent(boolean_t pressured);

static bool memorystatus_kill_processes_aggressive(uint32_t cause, int aggr_count, int32_t priority_max, int32_t max_kills, uint32_t *errors, uint64_t *memory_reclaimed);
static boolean_t memorystatus_kill_hiwat_proc(uint32_t *errors, boolean_t *purged, uint64_t *memory_reclaimed);
static bool _memstat_proc_is_reapable(proc_t p);
static void _memstat_refresh_oldest_reapable_proc_info(void);
static bool _memstat_proc_is_application(proc_t p);

#if CONFIG_JETSAM
static void _memstat_reaper_check_oldest_reapable_proc_info_timeout(void);
static void _memstat_reaper_start_sweep(void);
static void _memstat_reaper_end_sweep(void);
static void _memstat_reaper_record_kill(uint64_t bytes_freed);
#endif /* CONFIG_JETSAM */
static const char* _memstat_relaunch_flags_description(uint32_t flags);
static const char* _memstat_proc_type_description(proc_t p);


/* Priority Band Sorting Routines */
static int memstat_sort_bucket(unsigned int bucket_index, memorystatus_jetsam_sort_order_t sort_order);
static void memstat_sort_coals_locked(unsigned int bucket_index, memorystatus_jetsam_sort_order_t sort_order);
static void memstat_sort_by_footprint_locked(unsigned int bucket_index);

/* Default sort orders for the idle and foreground bands. */
#define JETSAM_SORT_IDLE_DEFAULT JETSAM_SORT_FOOTPRINT_NOCOAL
#if XNU_TARGET_OS_IOS && !XNU_TARGET_OS_XR
#define JETSAM_SORT_FG_DEFAULT JETSAM_SORT_LRU
#else /* XNU_TARGET_OS_IOS && !XNU_TARGET_OS_XR */
#define JETSAM_SORT_FG_DEFAULT JETSAM_SORT_FOOTPRINT
#endif /* !(XNU_TARGET_OS_IOS && !XNU_TARGET_OS_XR) */

TUNABLE_DT(memorystatus_jetsam_sort_order_t, memstat_jetsam_fg_sort_order, "/defaults",
    "kern.memstat_fg_sort_order", "memstat_fg_sort_order", JETSAM_SORT_FG_DEFAULT, TUNABLE_DT_NONE);

/* qsort routines */
typedef int (*cmpfunc_t)(const void *a, const void *b);
extern void qsort(void *a, size_t n, size_t es, cmpfunc_t cmp);
1014
/* VM pressure */

#if CONFIG_SECLUDED_MEMORY
extern unsigned int vm_page_secluded_count;
extern unsigned int vm_page_secluded_count_over_target;
#endif /* CONFIG_SECLUDED_MEMORY */

/* Aggressive jetsam pages threshold for sysproc aging policy */
unsigned int memorystatus_sysproc_aging_aggr_pages = 0;

uint32_t memorystatus_available_pages = UINT32_MAX;

/* Bitmask of optional jetsam policies requested from userspace. */
__options_closed_decl(memorystatus_policy_t, uint8_t, {
	kPolicyDefault = 0x00,
	kPolicyClearTheDecks = 0x01,
	kPolicyBallastDrain = 0x02,
});

static memorystatus_policy_t memstat_policy_config = kPolicyDefault;

#define MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX ((uint32_t)(atop_64(max_mem) / 2))

/*
 * Jetsam Page Shortage Thresholds (PSTs):
 * - critical: jetsam above the idle band
 * - idle: jetsam in the idle band
 * - pressure: jetsam soft memory limit violators
 * - reaper: jetsam long-idle processes
 * - ballast: offset applied to non-critical thresholds upon request
 *   from userspace
 * - ctd (clear-the-decks): offset applied to non-critical thresholds upon request
 *   from userspace
 */
uint32_t memstat_critical_threshold = 0;
uint32_t memstat_idle_threshold = 0;
uint32_t memstat_soft_threshold = 0;
uint32_t memstat_reaper_threshold = 0;
uint32_t memstat_ballast_offset = 0;
uint32_t memstat_ctd_offset = 0;

/* Reaper policy state (effective values; defaults may be overridden below). */
int32_t memstat_reaper_min_age_secs = MEMORYSTATUS_REAPER_MIN_AGE_SECS_DEFAULT;
int32_t memstat_reaper_min_age_apps_secs = MEMORYSTATUS_REAPER_MIN_AGE_APPS_SECS_DEFAULT;
boolean_t memstat_reaper_enabled = MEMORYSTATUS_REAPER_ENABLED_DEFAULT;
uint32_t memstat_reaper_max_priority = MEMORYSTATUS_REAPER_MAX_PRIORITY_DEFAULT;
int32_t memstat_reaper_rescan_secs = MEMORYSTATUS_REAPER_RESCAN_SECS_DEFAULT;
bool memstat_reaper_is_currently_sweeping = false;
uint64_t memstat_reaper_can_run_after_ts_matu = 0;
uint64_t memstat_reaper_start_ts_matu = 0;

uint32_t memstat_reaper_reap_relaunch_mask = MEMORYSTATUS_REAPER_REAP_RELAUNCH_MASK_DEFAULT;

/* Sentinels for memstat_oldest_reapable_proc_prio_start. */
#define MEMSTAT_OLDEST_REAPABLE_PROC_PRIO_START_UNKNOWN UINT64_MAX
#define MEMSTAT_OLDEST_REAPABLE_PROC_PRIO_START_NONE (UINT64_MAX-1)
uint64_t memstat_oldest_reapable_proc_prio_start = MEMSTAT_OLDEST_REAPABLE_PROC_PRIO_START_UNKNOWN;
uint64_t memstat_oldest_reapable_proc_info_expiration_ts_matu = 0;

uint64_t memstat_oldest_reapable_proc_will_be_reapable_at_ts_matu = 0;

/* Per-sweep and since-boot reaper kill statistics. */
typedef struct memstat_reaper_stats {
	uint32_t sweep_count;
	uint32_t kill_count;
	uint64_t memory_freed_bytes;
} memstat_reaper_stats_t;

memstat_reaper_stats_t memstat_reaper_current_sweep_stats;
memstat_reaper_stats_t memstat_reaper_cumulative_stats;
uint32_t memstat_reaper_cumulative_memory_freed_mb = 0;
/*
 * NB: These MiB thresholds are only read at boot and may become out of sync
 * with the PSTs above.
 */
TUNABLE_DT(uint32_t, memorystatus_critical_threshold_mb, "/defaults",
    "kern.memstat_critical_mb", "memorystatus_critical_threshold_mb", 0, TUNABLE_DT_NONE);
TUNABLE_DT(uint32_t, memorystatus_idle_threshold_mb, "/defaults",
    "kern.memstat_idle_mb", "memorystatus_idle_threshold_mb", 0, TUNABLE_DT_NONE);
TUNABLE_DT(uint32_t, memorystatus_pressure_threshold_mb, "/defaults",
    "kern.memstat_pressure_mb", "memorystatus_pressure_threshold_mb", 0, TUNABLE_DT_NONE);
TUNABLE_DT(uint32_t, memorystatus_reaper_threshold_mb, "/defaults",
    "kern.memstat_reaper_mb", "memorystatus_reaper_threshold_mb", 0, TUNABLE_DT_NONE);
TUNABLE_DT(uint32_t, memstat_ballast_offset_mb, "/defaults",
    "kern.memstat_ballast_mb", "memstat_ballast_offset_mb", 0, TUNABLE_DT_NONE);
/*
 * NOTE(review): this boot-arg name duplicates the ballast tunable directly
 * above -- confirm it should not be "memstat_ctd_offset_mb".
 */
TUNABLE(uint32_t, memstat_ctd_offset_mb, "memstat_ballast_offset_mb", 0);

/*
 * Kill count tracking
 *
 * Since idle exit is only applicable to processes in the idle band, track it
 * separately to save space. We also don't care about kMemorystatusInvalid.
 */
uint32_t _Atomic memorystatus_kill_counts[JETSAM_PRIORITY_MAX + 1][JETSAM_REASON_MEMORYSTATUS_MAX - 1];
uint32_t _Atomic memorystatus_idle_exit_kill_count = 0;

/* Boot-time (device-tree / boot-arg) overrides for the reaper defaults. */
TUNABLE_DT(int32_t, memorystatus_reaper_minimum_age_seconds, "/defaults",
    "kern.memstat_reaper_minage_secs", "memorystatus_reaper_minimum_age_seconds", MEMORYSTATUS_REAPER_SENTINAL_VALUE_MEANING_USE_DEFAULT, TUNABLE_DT_NONE);
TUNABLE_DT(int32_t, memorystatus_reaper_minimum_age_apps_seconds, "/defaults",
    "kern.memstat_reaper_minapp_secs", "memorystatus_reaper_minimum_age_apps_seconds", MEMORYSTATUS_REAPER_SENTINAL_VALUE_MEANING_USE_DEFAULT, TUNABLE_DT_NONE);
TUNABLE_DT(uint32_t, memorystatus_reaper_rescan_delay_seconds, "/defaults",
    "kern.memstat_reaper_rescan_secs", "memorystatus_reaper_rescan_delay_seconds", MEMORYSTATUS_REAPER_RESCAN_SECS_DEFAULT, TUNABLE_DT_NONE);
TUNABLE_DT(boolean_t, memorystatus_reaper_enabled, "/defaults",
    "kern.memstat_reaper_enabled", "memorystatus_reaper_enabled", MEMORYSTATUS_REAPER_ENABLED_DEFAULT, TUNABLE_DT_NONE);
#if CONFIG_JETSAM
TUNABLE_DT_WRITEABLE(unsigned int, memorystatus_swap_all_apps, "/defaults", "kern.swap_all_apps", "kern.swap_all_apps", false, TUNABLE_DT_NONE);
/* Will compact the early swapin queue if there are >= this many csegs on it. */
static unsigned int memorystatus_swapin_trigger_segments = 10;
unsigned int memorystatus_swapin_trigger_pages = 0;

#if DEVELOPMENT || DEBUG
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_available_pages, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_swapin_trigger_pages, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_swapin_trigger_pages, 0, "");
#else
/* On RELEASE, available_pages is still readable but masked from listings. */
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages, CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, &memorystatus_available_pages, 0, "");
#endif /* DEVELOPMENT || DEBUG */
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_swap_all_apps, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_swap_all_apps, 0, "");

static int memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __unused int32_t *retval, boolean_t is_fatal_limit);

proc_name_t memorystatus_jetsam_proc_name_panic; /* Panic when we are about to jetsam this process. */
uint32_t memorystatus_jetsam_proc_cause_panic = 0; /* If specified, panic only when we are about to jetsam the process above for this cause. */
uint32_t memorystatus_jetsam_proc_size_panic = 0; /* If specified, panic only when we are about to jetsam the process above and its footprint is more than this in MB. */

/* If set, kill swappable processes when we're low on swap space. Currently off until we can allocate more swap space (rdar://87800902) */
TUNABLE(bool, jetsam_kill_on_low_swap, "jetsam_kill_on_low_swap", false);

/*
 * Global switch for enabling fast jetsam. Fast jetsam is
 * hooked up via the system_override() system call. When
 * enabled, the following features can be toggled:
 * - clear-the-decks jetsam
 * - ballast-drain jetsam
 */
TUNABLE_WRITEABLE(bool, fast_jetsam_enabled, "fast_jetsam_enabled", true);

#else /* !CONFIG_JETSAM */

/*
 * On compressor/swap exhaustion, kill the largest process regardless of
 * its chosen process policy.
 */
#if DEVELOPMENT || DEBUG
TUNABLE(bool, kill_on_no_paging_space, "-kill_on_no_paging_space", false);
#else /* !(DEVELOPMENT || DEBUG) */
bool kill_on_no_paging_space = false;
#endif /* DEVELOPMENT || DEBUG */

/* The timestamp (MAS) of the last no paging space action */
_Atomic uint64_t last_no_space_action_ts = 0;
/* The minimum duration between no paging space actions */
TUNABLE(uint64_t, no_paging_space_action_throttle_delay_ns,
    "no_paging_space_throttle_delay_ns", 5 * NSEC_PER_SEC);

#endif /* CONFIG_JETSAM */
1168
/*
 * Convert a byte count to MiB, rounding up (despite the name, this is a
 * ceiling, not round-to-nearest).
 *
 * Written as quotient-plus-remainder so the computation cannot overflow:
 * the previous form, (in + (1 << 20) - 1) >> 20, wrapped around for byte
 * counts above UINT32_MAX - ((1 << 20) - 1) and returned 0.
 */
static inline uint32_t
roundToNearestMB(uint32_t in)
{
	return (in >> 20) + ((in & ((1u << 20) - 1)) != 0 ? 1 : 0);
}
1174
#if DEVELOPMENT || DEBUG
static int memorystatus_cmd_increase_jetsam_task_limit(pid_t pid, uint32_t byte_increase);
#endif

#if __arm64__
extern int legacy_footprint_entitlement_mode;
#endif /* __arm64__ */

/* Debug */

extern struct knote *vm_find_knote_from_pid(pid_t, struct klist *);

#if DEVELOPMENT || DEBUG

/* Bucket index most recently dumped via the debug sysctl below. */
static unsigned int memorystatus_debug_dump_this_bucket = 0;
1190
1191 static void
memorystatus_debug_dump_bucket_locked(unsigned int bucket_index)1192 memorystatus_debug_dump_bucket_locked(unsigned int bucket_index)
1193 {
1194 proc_t p = NULL;
1195 uint64_t bytes = 0;
1196 int ledger_limit = 0;
1197 unsigned int b = bucket_index;
1198 boolean_t traverse_all_buckets = FALSE;
1199
1200 if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
1201 traverse_all_buckets = TRUE;
1202 b = 0;
1203 } else {
1204 traverse_all_buckets = FALSE;
1205 b = bucket_index;
1206 }
1207
1208 /*
1209 * footprint reported in [pages / MB ]
1210 * limits reported as:
1211 * L-limit proc's Ledger limit
1212 * C-limit proc's Cached limit, should match Ledger
1213 * A-limit proc's Active limit
1214 * IA-limit proc's Inactive limit
1215 * F==Fatal, NF==NonFatal
1216 */
1217
1218 memorystatus_log_debug("memorystatus_debug_dump ***START*(PAGE_SIZE_64=%llu)**\n", PAGE_SIZE_64);
1219 memorystatus_log_debug("bucket [pid] [pages / MB] [state] [EP / RP / AP] dirty deadline [L-limit / C-limit / A-limit / IA-limit] name\n");
1220 p = memorystatus_get_first_proc_locked(&b, traverse_all_buckets);
1221 while (p) {
1222 bytes = get_task_phys_footprint(proc_task(p));
1223 task_get_phys_footprint_limit(proc_task(p), &ledger_limit);
1224 memorystatus_log_debug("%2d [%5d] [%5lld /%3lldMB] 0x%-8x [%2d / %2d / %2d] 0x%-3x %10lld [%3d / %3d%s / %3d%s / %3d%s] %s\n",
1225 b, proc_getpid(p),
1226 (bytes / PAGE_SIZE_64), /* task's footprint converted from bytes to pages */
1227 (bytes / (1024ULL * 1024ULL)), /* task's footprint converted from bytes to MB */
1228 p->p_memstat_state, p->p_memstat_effectivepriority, p->p_memstat_requestedpriority, p->p_memstat_assertionpriority,
1229 p->p_memstat_dirty, p->p_memstat_idledeadline,
1230 ledger_limit,
1231 p->p_memstat_memlimit,
1232 (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"),
1233 p->p_memstat_memlimit_active,
1234 (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL ? "F " : "NF"),
1235 p->p_memstat_memlimit_inactive,
1236 (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL ? "F " : "NF"),
1237 (*p->p_name ? p->p_name : "unknown"));
1238 p = memorystatus_get_next_proc_locked(&b, p, traverse_all_buckets);
1239 }
1240 memorystatus_log_debug("memorystatus_debug_dump ***END***\n");
1241 }
1242
1243 static int
1244 sysctl_memorystatus_debug_dump_bucket SYSCTL_HANDLER_ARGS
1245 {
1246 #pragma unused(oidp, arg2)
1247 int bucket_index = 0;
1248 int error;
1249 error = SYSCTL_OUT(req, arg1, sizeof(int));
1250 if (error || !req->newptr) {
1251 return error;
1252 }
1253 error = SYSCTL_IN(req, &bucket_index, sizeof(int));
1254 if (error || !req->newptr) {
1255 return error;
1256 }
1257 if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
1258 /*
1259 * All jetsam buckets will be dumped.
1260 */
1261 } else {
1262 /*
1263 * Only a single bucket will be dumped.
1264 */
1265 }
1266
1267 proc_list_lock();
1268 memorystatus_debug_dump_bucket_locked(bucket_index);
1269 proc_list_unlock();
1270 memorystatus_debug_dump_this_bucket = bucket_index;
1271 return error;
1272 }
1273
1274 /*
1275 * Debug aid to look at jetsam buckets and proc jetsam fields.
1276 * Use this sysctl to act on a particular jetsam bucket.
1277 * Writing the sysctl triggers the dump.
1278 * Usage: sysctl kern.memorystatus_debug_dump_this_bucket=<bucket_index>
1279 */
1280
1281 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_debug_dump_this_bucket, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_debug_dump_this_bucket, 0, sysctl_memorystatus_debug_dump_bucket, "I", "");
1282
1283
1284 /* Debug aid to aid determination of limit */
1285
/*
 * sysctl kern.memorystatus_highwater_enabled handler.
 *
 * Toggles enforcement of per-process high-watermark limits. On enable, each
 * proc's cached limit is recomputed from its stored active/inactive variants;
 * on disable, the cached limit is reset to the system default (-1, fatal).
 * In both cases the resulting cached limit is pushed into the task ledger.
 */
static int
sysctl_memorystatus_highwater_enable SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	proc_t p;
	unsigned int b = 0;
	int error, enable = 0;
	bool use_active; /* use the active limit and active limit attributes */

	/* Report the current value; bail out on read-only access. */
	error = SYSCTL_OUT(req, arg1, sizeof(int));
	if (error || !req->newptr) {
		return error;
	}

	error = SYSCTL_IN(req, &enable, sizeof(int));
	if (error || !req->newptr) {
		return error;
	}

	/* Only 0 and 1 are meaningful. */
	if (!(enable == 0 || enable == 1)) {
		return EINVAL;
	}

	proc_list_lock();

	memorystatus_highwater_enabled = enable;

	/* Walk every jetsam bucket, refreshing each proc's cached limit. */
	p = memorystatus_get_first_proc_locked(&b, TRUE);
	while (p) {
		use_active = memstat_proc_is_active_locked(p);

		if (enable) {
			(void)memstat_update_memlimit_locked(p, use_active);
		} else {
			/*
			 * Disabling limits does not touch the stored variants.
			 * Set the cached limit fields to system_wide defaults.
			 */
			p->p_memstat_memlimit = -1;
			p->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;
		}

		/*
		 * Enforce the cached limit by writing to the ledger.
		 */
		_memstat_write_memlimit_to_ledger_locked(p, use_active, false);

		p = memorystatus_get_next_proc_locked(&b, p, TRUE);
	}


	proc_list_unlock();

	return 0;
}
1341
/* Writable toggle for high-watermark limit enforcement (see handler above). */
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_highwater_enabled, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_highwater_enabled, 0, sysctl_memorystatus_highwater_enable, "I", "");

SYSCTL_INT(_kern, OID_AUTO, memorystatus_idle_snapshot, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_idle_snapshot, 0, "");

#endif /* DEVELOPMENT || DEBUG */
1347
1348 #if CONFIG_JETSAM
1349 #if DEVELOPMENT || DEBUG
1350 static int
1351 memstat_page_shortage_threshold_sysctl_handler SYSCTL_HANDLER_ARGS
1352 {
1353 uint32_t threshold;
1354 if (arg1 == &memstat_idle_threshold) {
1355 threshold = memorystatus_get_idle_exit_page_shortage_threshold();
1356 } else if (arg1 == &memstat_soft_threshold) {
1357 threshold = memorystatus_get_soft_memlimit_page_shortage_threshold();
1358 } else if (arg1 == &memstat_critical_threshold) {
1359 threshold = memorystatus_get_critical_page_shortage_threshold();
1360 } else if (arg1 == &memstat_reaper_threshold) {
1361 threshold = memorystatus_get_reaper_page_shortage_threshold();
1362 } else {
1363 return EINVAL;
1364 }
1365 return sysctl_handle_int(oidp, NULL, threshold, req);
1366 }
1367
/* Effective page-shortage thresholds, read-only (values include offsets). */
SYSCTL_PROC(_kern_memorystatus, OID_AUTO, available_pages_critical,
    CTLFLAG_RD | CTLFLAG_LOCKED, &memstat_critical_threshold, 0,
    memstat_page_shortage_threshold_sysctl_handler, "IU",
    "");
SYSCTL_PROC(_kern_memorystatus, OID_AUTO, available_pages_idle,
    CTLFLAG_RD | CTLFLAG_LOCKED, &memstat_idle_threshold, 0,
    memstat_page_shortage_threshold_sysctl_handler, "IU",
    "");
SYSCTL_PROC(_kern_memorystatus, OID_AUTO, available_pages_soft,
    CTLFLAG_RD | CTLFLAG_LOCKED, &memstat_soft_threshold, 0,
    memstat_page_shortage_threshold_sysctl_handler, "IU",
    "");
SYSCTL_PROC(_kern_memorystatus, OID_AUTO, available_pages_reaper,
    CTLFLAG_RD | CTLFLAG_LOCKED, &memstat_reaper_threshold, 0,
    memstat_page_shortage_threshold_sysctl_handler, "IU",
    "");

SYSCTL_INT(_kern_memorystatus, OID_AUTO, ballast_offset_pages,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &memstat_ballast_offset, 0, "");
SYSCTL_INT(_kern_memorystatus, OID_AUTO, ctd_offset_pages,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &memstat_ctd_offset, 0, "");
#endif /* DEBUG || DEVELOPMENT */

/* Reaper policy knobs, exposed as experiment factors. */
EXPERIMENT_FACTOR_LEGACY_UINT(_kern_memorystatus, reaper_enabled, &memstat_reaper_enabled, FALSE, TRUE, "");
EXPERIMENT_FACTOR_LEGACY_UINT(_kern_memorystatus, reaper_min_age_secs, &memstat_reaper_min_age_secs, 0, UINT32_MAX, "");
EXPERIMENT_FACTOR_LEGACY_UINT(_kern_memorystatus, reaper_min_age_apps_secs, &memstat_reaper_min_age_apps_secs, 0, UINT32_MAX, "");
EXPERIMENT_FACTOR_LEGACY_UINT(_kern_memorystatus, reaper_max_priority, &memstat_reaper_max_priority, 0, JETSAM_PRIORITY_MAX, "");
EXPERIMENT_FACTOR_LEGACY_UINT(_kern_memorystatus, reaper_reap_relaunch_mask, &memstat_reaper_reap_relaunch_mask, 0, UINT32_MAX, "");
EXPERIMENT_FACTOR_LEGACY_UINT(_kern_memorystatus, reaper_rescan_secs, &memstat_reaper_rescan_secs, 0, UINT32_MAX, "");

/* Cumulative (since boot) reaper statistics. */
SYSCTL_INT(_kern_memorystatus, OID_AUTO, reaper_stats_sweep_count,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &(memstat_reaper_cumulative_stats.sweep_count), 0, "");
SYSCTL_INT(_kern_memorystatus, OID_AUTO, reaper_stats_total_kills,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &(memstat_reaper_cumulative_stats.kill_count), 0, "");
SYSCTL_INT(_kern_memorystatus, OID_AUTO, reaper_stats_total_freed_mb,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &memstat_reaper_cumulative_memory_freed_mb, 0, "");
1409
/*
 * Experiment-factor sysctl handler for the page-shortage thresholds/offsets.
 *
 * Values are exposed to userspace in MiB but stored internally in pages;
 * this handler converts on both read and write. Writes of zero are rejected
 * for the four jetsam thresholds (but permitted for the ballast/ctd offsets),
 * and no value may exceed half of physical memory.
 */
static int
memstat_page_shortage_threshold_experiment_handler SYSCTL_HANDLER_ARGS
{
	uint32_t threshold_mb;
	int error;

	assert3p(arg1, !=, NULL);
	/* Report the current value: pages -> bytes -> MiB. */
	threshold_mb = ptoa_32(os_atomic_load((uint32_t *)arg1, relaxed)) >> 20;

	error = sysctl_handle_int(oidp, &threshold_mb, 0, req);
	if (error || !req->newptr) {
		return error;
	}

	if (threshold_mb > UINT32_MAX >> 20) {
		/* Converting to bytes would overflow */
		return EINVAL;
	}

	uint32_t new_threshold_pages = atop_32(threshold_mb << 20);
	/*
	 * Page shortage thresholds may not exceed 1/2 max_mem
	 */
	if (new_threshold_pages > MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX) {
		return EINVAL;
	}
	/* A zero threshold would disable the corresponding jetsam trigger. */
	if ((arg1 == &memstat_soft_threshold ||
	    arg1 == &memstat_idle_threshold ||
	    arg1 == &memstat_critical_threshold ||
	    arg1 == &memstat_reaper_threshold) &&
	    new_threshold_pages == 0) {
		return EINVAL;
	}

	/* Log which knob changed, then publish the new value atomically. */
	if (arg1 == &memstat_soft_threshold) {
		memorystatus_log("memorystatus: setting soft memory limit "
		    "page shortage threshold to %u MiB\n", threshold_mb);
	} else if (arg1 == &memstat_idle_threshold) {
		memorystatus_log("memorystatus: setting idle exit page "
		    "shortage threshold to %u MiB\n", threshold_mb);
	} else if (arg1 == &memstat_critical_threshold) {
		memorystatus_log("memorystatus: setting critical page shortage"
		    " threshold to %u MiB\n", threshold_mb);
	} else if (arg1 == &memstat_reaper_threshold) {
		memorystatus_log("memorystatus: setting reaper page shortage"
		    " threshold to %u MiB\n", threshold_mb);
	} else if (arg1 == &memstat_ctd_offset) {
		memorystatus_log("memorystatus: setting clear-the-decks page shortage"
		    " offset to %u MiB\n", threshold_mb);
	} else if (arg1 == &memstat_ballast_offset) {
		memorystatus_log("memorystatus: setting ballast page shortage"
		    " offset to %u MiB\n", threshold_mb);
	} else {
		return EINVAL;
	}
	os_atomic_store((uint32_t *)arg1, new_threshold_pages, relaxed);

	return 0;
}
1469
/*
 * sysctl flags for the page-shortage experiment factors: writable by anybody
 * on all builds; on RELEASE the node is additionally masked (hidden from
 * sysctl listings).
 */
#if DEVELOPMENT || DEBUG
#define MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS (CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED)
#else /* RELEASE */
#define MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS (CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED)
#endif /* DEVELOPMENT || DEBUG */
1475
1476 EXPERIMENT_FACTOR_LEGACY_PROC(_kern_memorystatus, soft_threshold_mb,
1477 MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
1478 &memstat_soft_threshold, 0, memstat_page_shortage_threshold_experiment_handler,
1479 "IU",
1480 "The minimum amount of available memory to maintain before killing "
1481 "processes which have violated there soft memory limit");
1482
/* Remaining page-shortage thresholds and offsets, exposed in MiB. */
EXPERIMENT_FACTOR_LEGACY_PROC(_kern_memorystatus, idle_threshold_mb,
    MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
    &memstat_idle_threshold, 0, memstat_page_shortage_threshold_experiment_handler,
    "IU",
    "The minimum amount of available memory to maintain before exiting idle "
    "processes");
EXPERIMENT_FACTOR_LEGACY_PROC(_kern_memorystatus, critical_threshold_mb,
    MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
    &memstat_critical_threshold, 0, memstat_page_shortage_threshold_experiment_handler,
    "IU",
    "The minimum amount of available memory to maintain before killing non-idle "
    "processes");
EXPERIMENT_FACTOR_LEGACY_PROC(_kern_memorystatus, reaper_threshold_mb,
    MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
    &memstat_reaper_threshold, 0, memstat_page_shortage_threshold_experiment_handler,
    "IU",
    "The minimum amount of available memory to maintain before killing long-idle "
    "processes");
EXPERIMENT_FACTOR_LEGACY_PROC(_kern_memorystatus, ballast_offset_mb,
    MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
    &memstat_ballast_offset, 0, memstat_page_shortage_threshold_experiment_handler,
    "IU",
    "An offset to apply to all non-critical page shortage thresholds when "
    "ballast is filling");
EXPERIMENT_FACTOR_LEGACY_PROC(_kern_memorystatus, clear_the_decks_offset_mb,
    MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
    &memstat_ctd_offset, 0, memstat_page_shortage_threshold_experiment_handler,
    "IU",
    "An offset to apply to all non-critical page shortage thresholds when "
    "clear-the-decks is engaged");
1513
/*
 * Engage ("drain") or disengage ("flood") the ballast policy.
 *
 * Draining sets kPolicyBallastDrain in memstat_policy_config, which (per the
 * descriptions of the factors above) adds memstat_ballast_offset to the
 * non-critical page-shortage thresholds; flooding clears the bit again.
 *
 * Returns 0 on success (including the no-op cases), ENOTSUP if fast-jetsam
 * is disabled on this system.
 */
int
memorystatus_ballast_control(bool drain)
{
	if (!fast_jetsam_enabled) {
		memorystatus_log_error("memorystatus: fast-jetsam "
		    "has been disabled on this system. denying request to %s ballast\n",
		    drain ? "drain" : "flood");
		return ENOTSUP;
	}
	if (memstat_ballast_offset == 0) {
		/* nothing to do */
		return 0;
	}
	if (drain) {
		/*
		 * Drain the ballast tanks, providing additional buoyancy by requiring that
		 * they only be used to store "available" memory.
		 */
		memorystatus_policy_t orig_policy = os_atomic_or_orig(
			&memstat_policy_config,
			(memorystatus_policy_t)kPolicyBallastDrain, relaxed);
		if (orig_policy & kPolicyBallastDrain) {
			/* Bit was already set by a racing/earlier caller; idempotent. */
			return 0;
		}
		memorystatus_log("memorystatus: draining ballast "
		    "-- will add %u MiB to non-critical page shortage "
		    "thresholds\n", ptoa_32(memstat_ballast_offset) >> 20);
		memorystatus_thread_pool_max();
		_memstat_consider_waking_jetsam_thread();
	} else {
		/*
		 * Flood the ballast tanks, removing the extra buoyancy by allowing them to be
		 * filled with "unavailable" memory.
		 */
		memorystatus_policy_t orig_policy = os_atomic_andnot_orig(
			&memstat_policy_config,
			(memorystatus_policy_t)kPolicyBallastDrain, relaxed);
		if (!(orig_policy & kPolicyBallastDrain)) {
			/* already disabled */
			return 0;
		}
		assertf(fast_jetsam_enabled, "ballast was drained while fast-jetsam was disabled");
		memorystatus_log("memorystatus: flooding ballast "
		    "-- will subtract %u MiB from non-critical page shortage "
		    "thresholds\n", ptoa_32(memstat_ballast_offset) >> 20);
		memorystatus_thread_pool_default();
		_memstat_consider_waking_jetsam_thread();
	}
	return 0;
}
1564
1565 static int
1566 sysctl_kern_memorystatus_ballast_drain SYSCTL_HANDLER_ARGS
1567 {
1568 int error = 0;
1569
1570 boolean_t drained = os_atomic_load(&memstat_policy_config, relaxed) & kPolicyBallastDrain ? TRUE : FALSE;
1571
1572 error = sysctl_handle_int(oidp, &drained, 0, req);
1573 if (error || !req->newptr) {
1574 return error;
1575 }
1576
1577 /*
1578 * Writers must be root or have the com.apple.private.kernel.jetsam entitlement
1579 */
1580 error = priv_check_cred(kauth_cred_get(), PRIV_VM_JETSAM, 0);
1581 if (error) {
1582 return error;
1583 }
1584
1585 return memorystatus_ballast_control(drained);
1586 }
1587
/* kern.memorystatus.ballast_drained: read/write toggle for the ballast policy. */
SYSCTL_PROC(_kern_memorystatus, OID_AUTO, ballast_drained,
    MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS, 0, 0,
    sysctl_kern_memorystatus_ballast_drain, "IU",
    "If true, apply an offset (kern.memorystatus.ballast_offset_mb) to "
    "all non-critical page shortage thresholds");
1593
1594 #if DEVELOPMENT || DEBUG
1595 /*
1596 * In preparation for a storm, sailors may "clear the decks" of non-essential
1597 * cargo to increase the seaworthiness of a vessel. In our analogy, the
1598 * non-essential cargo is idle processes or processes which have exceeded
1599 * their memory limit. The storm may be any foreseeable user activity that will
1600 * require significant memory demand.
1601 *
1602 * Mechanically, clearing the decks involves adding a configurable offset to
1603 * the idle and soft available page shortage thresholds.
1604 *
1605 * Readers may note that the clear-the-decks policy is mechanically identical
 * to the ballast-draining policy. Their difference lies in intended use.
1607 * Clear-the-decks is intended to address imminent memory demand and may be
1608 * configured with an offset that wouldn't be sustainable for long-term system
1609 * use. The interface is generally intended to allow clients to hint to the
1610 * system that they will need a significant amount of memory in the near future,
 * and the system should proactively try to free unneeded reserves to
 * better satisfy the demand.
1613 *
1614 * This policy is currently only exposed on development kernels for prototyping
 * until a productized use case emerges.
1616 *
1617 * TODO: If adopted on production systems, this mechanism should use a
1618 * dedicated system-call / memorystatus-command
1619 */
/*
 * Engage (clear == true) or disengage the clear-the-decks policy bit.
 * Mechanically mirrors memorystatus_ballast_control(), but uses
 * kPolicyClearTheDecks / memstat_ctd_offset and returns EALREADY when the
 * requested state is already in effect (ballast returns 0 in that case).
 *
 * Returns 0 on success, ENOTSUP if fast-jetsam is disabled, EALREADY if
 * the policy bit was already in the requested state.
 */
static int
memstat_clear_the_decks(bool clear)
{
	if (!fast_jetsam_enabled) {
		memorystatus_log_error("memorystatus: fast-jetsam "
		    "has been disabled on this system\n");
		return ENOTSUP;
	}
	if (clear) {
		/*
		 * Clear the decks of non-essential cargo.
		 */
		memorystatus_policy_t orig_policy = os_atomic_or_orig(
			&memstat_policy_config,
			(memorystatus_policy_t)kPolicyClearTheDecks, relaxed);
		if (orig_policy & kPolicyClearTheDecks) {
			/* Bit was already set; report the redundant request. */
			return EALREADY;
		}
		memorystatus_log("memorystatus: clear-the-decks engaged "
		    "-- will add %u MiB to non-critical page shortage "
		    "thresholds\n", ptoa_32(memstat_ctd_offset) >> 20);
		memorystatus_thread_pool_max();
		_memstat_consider_waking_jetsam_thread();
	} else {
		/*
		 * Allow the decks to be reloaded with non-essential cargo.
		 */
		memorystatus_policy_t orig_policy = os_atomic_andnot_orig(
			&memstat_policy_config,
			(memorystatus_policy_t)kPolicyClearTheDecks, relaxed);
		if (!(orig_policy & kPolicyClearTheDecks)) {
			/* Bit was already clear. */
			return EALREADY;
		}
		assertf(fast_jetsam_enabled, "clear the decks was set while fast-jetsam was disabled");
		memorystatus_log("memorystatus: clear-the-decks disengaged "
		    "-- will subtract %u MiB from non-critical page shortage "
		    "thresholds\n", ptoa_32(memstat_ctd_offset) >> 20);
		memorystatus_thread_pool_default();
		_memstat_consider_waking_jetsam_thread();
	}
	return 0;
}
1662
1663 static int
1664 sysctl_kern_memorystatus_decks_cleared SYSCTL_HANDLER_ARGS
1665 {
1666 int error = 0;
1667
1668 boolean_t cleared = os_atomic_load(&memstat_policy_config, relaxed) & kPolicyClearTheDecks ? TRUE : FALSE;
1669
1670 error = sysctl_handle_int(oidp, &cleared, 0, req);
1671 if (error || !req->newptr) {
1672 return error;
1673 }
1674
1675 /*
1676 * Writers must be root or have the com.apple.private.kernel.jetsam entitlement
1677 */
1678 error = priv_check_cred(kauth_cred_get(), PRIV_VM_JETSAM, 0);
1679 if (error) {
1680 return error;
1681 }
1682
1683 return memstat_clear_the_decks(cleared);
1684 }
1685
1686 SYSCTL_PROC(_kern_memorystatus, OID_AUTO, decks_cleared,
1687 MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
1688 0, 0, sysctl_kern_memorystatus_decks_cleared, "I",
1689 "If true, apply an offset (kern.memorystatus_ctd_offset_mb) to "
1690 "all non-critical page shortage thresholds");
1691 #endif /* DEVELOPMENT || DEBUG */
1692 #endif /* CONFIG_JETSAM */
1693
1694 extern kern_return_t kernel_thread_start_priority(thread_continue_t continuation,
1695 void *parameter,
1696 integer_t priority,
1697 thread_t *new_thread);
1698
1699 #if DEVELOPMENT || DEBUG
1700
1701 static int
1702 sysctl_memorystatus_disconnect_page_mappings SYSCTL_HANDLER_ARGS
1703 {
1704 #pragma unused(arg1, arg2)
1705 int error = 0, pid = 0;
1706 proc_t p;
1707
1708 error = sysctl_handle_int(oidp, &pid, 0, req);
1709 if (error || !req->newptr) {
1710 return error;
1711 }
1712
1713 lck_mtx_lock(&disconnect_page_mappings_mutex);
1714
1715 if (pid == -1) {
1716 vm_pageout_disconnect_all_pages();
1717 } else {
1718 p = proc_find(pid);
1719
1720 if (p != NULL) {
1721 error = task_disconnect_page_mappings(proc_task(p));
1722
1723 proc_rele(p);
1724
1725 if (error) {
1726 error = EIO;
1727 }
1728 } else {
1729 error = EINVAL;
1730 }
1731 }
1732 lck_mtx_unlock(&disconnect_page_mappings_mutex);
1733
1734 return error;
1735 }
1736
/* kern.memorystatus_disconnect_page_mappings: write-only debug trigger (pid or -1). */
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_disconnect_page_mappings, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_memorystatus_disconnect_page_mappings, "I", "");
1739
1740 #endif /* DEVELOPMENT || DEBUG */
1741
1742 /*
1743 * Sorts the given bucket.
1744 *
1745 * Input:
1746 * bucket_index - jetsam priority band to be sorted.
1747 * sort_order - JETSAM_SORT_xxx from kern_memorystatus.h
1748 *
1749 * proc_list_lock must be held by the caller.
1750 */
1751 static void
memstat_sort_bucket_locked(unsigned int bucket_index,memorystatus_jetsam_sort_order_t sort_order)1752 memstat_sort_bucket_locked(
1753 unsigned int bucket_index,
1754 memorystatus_jetsam_sort_order_t sort_order)
1755 {
1756 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
1757 assert(bucket_index < MEMSTAT_BUCKET_COUNT);
1758 if (memstat_bucket[bucket_index].count == 0) {
1759 return;
1760 }
1761
1762 switch (sort_order) {
1763 case JETSAM_SORT_NONE:
1764 break;
1765 case JETSAM_SORT_LRU:
1766 case JETSAM_SORT_FOOTPRINT:
1767 memstat_sort_coals_locked(bucket_index, sort_order);
1768 break;
1769 case JETSAM_SORT_FOOTPRINT_NOCOAL:
1770 memstat_sort_by_footprint_locked(bucket_index);
1771 }
1772 }
1773
1774 /*
1775 * Picks the sorting routine for a given jetsam priority band.
1776 *
1777 * Input:
1778 * bucket_index - jetsam priority band to be sorted.
1779 * sort_order - sort order to use
1780 *
1781 * Return:
1782 * 0 on success
1783 * non-0 on failure
1784 */
1785 static int
memstat_sort_bucket(unsigned int bucket_index,memorystatus_jetsam_sort_order_t sort_order)1786 memstat_sort_bucket(
1787 unsigned int bucket_index,
1788 memorystatus_jetsam_sort_order_t sort_order)
1789 {
1790 assert(bucket_index < MEMSTAT_BUCKET_COUNT);
1791
1792 proc_list_lock();
1793 memstat_sort_bucket_locked(bucket_index, sort_order);
1794 proc_list_unlock();
1795
1796 return 0;
1797 }
1798
1799 /*
1800 * Sort processes by size for a single jetsam bucket.
1801 */
1802
1803 static void
memstat_sort_by_footprint_locked(unsigned int bucket_index)1804 memstat_sort_by_footprint_locked(unsigned int bucket_index)
1805 {
1806 proc_t p = NULL, insert_after_proc = NULL, max_proc = NULL;
1807 proc_t next_p = NULL, prev_max_proc = NULL;
1808 uint32_t pages = 0, max_pages = 0;
1809 memstat_bucket_t *current_bucket;
1810
1811 if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
1812 return;
1813 }
1814
1815 current_bucket = &memstat_bucket[bucket_index];
1816
1817 p = TAILQ_FIRST(¤t_bucket->list);
1818
1819 while (p) {
1820 memorystatus_get_task_page_counts(proc_task(p), &pages, NULL, NULL);
1821 max_pages = pages;
1822 max_proc = p;
1823 prev_max_proc = p;
1824
1825 while ((next_p = TAILQ_NEXT(p, p_memstat_list)) != NULL) {
1826 /* traversing list until we find next largest process */
1827 p = next_p;
1828 memorystatus_get_task_page_counts(proc_task(p), &pages, NULL, NULL);
1829 if (pages > max_pages) {
1830 max_pages = pages;
1831 max_proc = p;
1832 }
1833 }
1834
1835 if (prev_max_proc != max_proc) {
1836 /* found a larger process, place it in the list */
1837 TAILQ_REMOVE(¤t_bucket->list, max_proc, p_memstat_list);
1838 if (insert_after_proc == NULL) {
1839 TAILQ_INSERT_HEAD(¤t_bucket->list, max_proc, p_memstat_list);
1840 } else {
1841 TAILQ_INSERT_AFTER(¤t_bucket->list, insert_after_proc, max_proc, p_memstat_list);
1842 }
1843 prev_max_proc = max_proc;
1844 }
1845
1846 insert_after_proc = max_proc;
1847
1848 p = TAILQ_NEXT(max_proc, p_memstat_list);
1849 }
1850 }
1851
1852 proc_t
memorystatus_get_first_proc_locked(unsigned int * bucket_index,boolean_t search)1853 memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search)
1854 {
1855 memstat_bucket_t *current_bucket;
1856 proc_t next_p;
1857
1858 if ((*bucket_index) >= MEMSTAT_BUCKET_COUNT) {
1859 return NULL;
1860 }
1861
1862 current_bucket = &memstat_bucket[*bucket_index];
1863 next_p = TAILQ_FIRST(¤t_bucket->list);
1864 if (!next_p && search) {
1865 while (!next_p && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
1866 current_bucket = &memstat_bucket[*bucket_index];
1867 next_p = TAILQ_FIRST(¤t_bucket->list);
1868 }
1869 }
1870
1871 return next_p;
1872 }
1873
1874 proc_t
memorystatus_get_next_proc_locked(unsigned int * bucket_index,proc_t p,boolean_t search)1875 memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search)
1876 {
1877 memstat_bucket_t *current_bucket;
1878 proc_t next_p;
1879
1880 if (!p || ((*bucket_index) >= MEMSTAT_BUCKET_COUNT)) {
1881 return NULL;
1882 }
1883
1884 next_p = TAILQ_NEXT(p, p_memstat_list);
1885 while (!next_p && search && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
1886 current_bucket = &memstat_bucket[*bucket_index];
1887 next_p = TAILQ_FIRST(¤t_bucket->list);
1888 }
1889
1890 return next_p;
1891 }
1892
/* Per-thread state array for the jetsam threads; allocated in memorystatus_init(). */
jetsam_state_t jetsam_threads;

/* Maximum number of jetsam threads allowed */
#define JETSAM_THREADS_LIMIT 3

/* Number of active jetsam threads */
_Atomic unsigned int active_jetsam_threads = 1;
/* Number of maximum jetsam threads configured */
unsigned int max_jetsam_threads = 1;
1902
1903 static jetsam_state_t
jetsam_current_thread()1904 jetsam_current_thread()
1905 {
1906 for (int thr_id = 0; thr_id < max_jetsam_threads; thr_id++) {
1907 if (jetsam_threads[thr_id].thread == current_thread()) {
1908 return &(jetsam_threads[thr_id]);
1909 }
1910 }
1911 return NULL;
1912 }
1913
1914 #if CONFIG_JETSAM
1915 static void
initialize_entitled_max_task_limit()1916 initialize_entitled_max_task_limit()
1917 {
1918 #if !XNU_TARGET_OS_XR
1919 /**
1920 * We've already stored the potential boot-arg "entitled_max_task_pmem" in
1921 * memorystatus_entitled_max_task_footprint_mb as a TUNABLE_DT. We provide
1922 * argptr=NULL and max_len=0 here to check only for existence of the boot-arg.
1923 *
1924 * The boot-arg takes precedence over memorystatus_swap_all_apps.
1925 */
1926 if (!PE_parse_boot_argn("entitled_max_task_pmem", NULL, 0) && memorystatus_swap_all_apps) {
1927 /*
1928 * When we have swap, we let entitled apps go up to the dram config
1929 * regardless of what's set in EDT,
1930 * This can still be overriden with the entitled_max_task_pmem boot-arg.
1931 *
1932 * We do not want to do this on visionOS, since we can have an effectively
1933 * infinite number of apps open at a time, and cannot swap our way to safety.
1934 */
1935 memorystatus_entitled_max_task_footprint_mb =
1936 (int32_t)(max_mem_actual / (1ULL << 20));
1937 memorystatus_entitled_dev_max_task_footprint_mb =
1938 memorystatus_entitled_max_task_footprint_mb;
1939 }
1940 #endif
1941
1942 if (memorystatus_entitled_max_task_footprint_mb < 0) {
1943 memorystatus_log_error("Invalid value (%d) for entitled_max_task_pmem. "
1944 "Setting to 0\n", memorystatus_entitled_max_task_footprint_mb);
1945 memorystatus_entitled_max_task_footprint_mb = 0;
1946 }
1947
1948 #if CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT
1949 if (memorystatus_entitled_bincompat_max_task_footprint_mb < 0) {
1950 memorystatus_log_error("Invalid value (%d) for entitled_bincompat_max_task_pmem. "
1951 "Setting to 0\n", memorystatus_entitled_bincompat_max_task_footprint_mb);
1952 memorystatus_entitled_bincompat_max_task_footprint_mb = 0;
1953 }
1954 #endif /* CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT */
1955
1956 if (memorystatus_entitled_dev_max_task_footprint_mb < -1) {
1957 memorystatus_log_error("Invalid value (%d) for entitled_max_developer_task_pmem. "
1958 "Setting to 0\n", memorystatus_entitled_dev_max_task_footprint_mb);
1959 memorystatus_entitled_dev_max_task_footprint_mb = 0;
1960 } else if (memorystatus_entitled_dev_max_task_footprint_mb == -1) {
1961 memorystatus_entitled_dev_max_task_footprint_mb = (int32_t)
1962 (max_mem_actual >> 20);
1963 }
1964
1965 if (memorystatus_entitled_dev_max_task_footprint_mb &&
1966 memorystatus_entitled_dev_max_task_footprint_mb <
1967 memorystatus_entitled_max_task_footprint_mb) {
1968 memorystatus_log_error("memorystatus: Entitled developer limit (%d MB) "
1969 "must be ≥ entitled task limit (%d MB)\n",
1970 memorystatus_entitled_dev_max_task_footprint_mb,
1971 memorystatus_entitled_max_task_footprint_mb);
1972 memorystatus_entitled_dev_max_task_footprint_mb =
1973 memorystatus_entitled_max_task_footprint_mb;
1974 }
1975 }
1976
1977 #endif /* CONFIG_JETSAM */
1978
1979
/*
 * One-time boot initialization of the memorystatus subsystem: sets up the
 * priority buckets, idle-demotion timers, page-shortage thresholds, reaper
 * tunables, jetsam snapshots, and spawns the jetsam thread pool.
 * Ordering within this function matters: the critical threshold must be
 * computed before the idle/soft/reaper thresholds, which are derived from it
 * when no explicit boot-arg/MiB override is present.
 */
__private_extern__ void
memorystatus_init(void)
{
	kern_return_t result;
	int i;

#if CONFIG_FREEZE
	/* Freezer sizing defaults; may be reconfigured below if swap is enabled. */
	memorystatus_frozen_processes_max = FREEZE_PROCESSES_MAX_DEFAULT;
	memorystatus_frozen_shared_mb_max = ((MAX_FROZEN_SHARED_MB_PERCENT * max_task_footprint_mb) / 100); /* 10% of the system wide task limit */
	memorystatus_freeze_shared_mb_per_process_max = (memorystatus_frozen_shared_mb_max / 4);
	memorystatus_freeze_pages_min = FREEZE_PAGES_MIN_DEFAULT;
	memorystatus_freeze_pages_max = FREEZE_PAGES_MAX_DEFAULT;
	memorystatus_max_frozen_demotions_daily = MAX_FROZEN_PROCESS_DEMOTIONS_DEFAULT;
	memorystatus_thaw_count_demotion_threshold = MIN_THAW_DEMOTION_THRESHOLD_DEFAULT;
	memorystatus_min_thaw_refreeze_threshold = MIN_THAW_REFREEZE_THRESHOLD_DEFAULT;
#endif /* CONFIG_FREEZE */

	// Note: no-op pending rdar://27006343 (Custom kernel log handles)
	memorystatus_log_handle = os_log_create("com.apple.xnu", "memorystatus");

	/* Init buckets */
	for (i = 0; i < MEMSTAT_BUCKET_COUNT; i++) {
		TAILQ_INIT(&memstat_bucket[i].list);
		memstat_bucket[i].count = 0;
		memstat_bucket[i].relaunch_high_count = 0;
	}
	memorystatus_idle_demotion_call = thread_call_allocate((thread_call_func_t)memorystatus_perform_idle_demotion, NULL);

	/* Convert the idle-deferral / aging-stuck tunables (seconds) to mach absolute time. */
	nanoseconds_to_absolutetime(memstat_idle_deferral_time_s * NSEC_PER_SEC, &memorystatus_sysprocs_idle_delay_time);
	nanoseconds_to_absolutetime(memstat_idle_deferral_time_s * NSEC_PER_SEC, &memorystatus_apps_idle_delay_time);
	nanoseconds_to_absolutetime(memstat_aging_stuck_time_s * NSEC_PER_SEC, &memorystatus_aging_stuck_delay_time);
	assert3u(memstat_idle_deferral_time_s, >=, kJetsamSysProcsIdleDelayTimeLowRatio);

#if CONFIG_JETSAM
	/* Debug aid: panic when a specific named process is jetsammed. */
	bzero(memorystatus_jetsam_proc_name_panic, sizeof(memorystatus_jetsam_proc_name_panic));
	if (PE_parse_boot_argn("jetsam_proc_name_panic", &memorystatus_jetsam_proc_name_panic, sizeof(memorystatus_jetsam_proc_name_panic))) {
		/*
		 * No bounds check to see if this is a valid cause.
		 * This is a debugging aid. The callers should know precisely which cause they wish to track.
		 */
		PE_parse_boot_argn("jetsam_proc_cause_panic", &memorystatus_jetsam_proc_cause_panic, sizeof(memorystatus_jetsam_proc_cause_panic));
		PE_parse_boot_argn("jetsam_proc_size_panic", &memorystatus_jetsam_proc_size_panic, sizeof(memorystatus_jetsam_proc_size_panic));
	}

	if (memorystatus_swap_all_apps && vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
		panic("kern.swap_all_apps is not supported on this platform");
	}

	/*
	 * The aging bands cannot overlap with the JETSAM_PRIORITY_ELEVATED_INACTIVE
	 * band and must be below it in priority. This is so that we don't have to make
	 * our 'aging' code worry about a mix of processes, some of which need to age
	 * and some others that need to stay elevated in the jetsam bands.
	 */
	assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > system_procs_aging_band);
	assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > system_procs_aging_band_stuck);
	assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > applications_aging_band);

	/* Take snapshots for idle-exit kills by default? First check the boot-arg... */
	if (!PE_parse_boot_argn("jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot))) {
		/* ...no boot-arg, so check the device tree */
		PE_get_default("kern.jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot));
	}

	memorystatus_sysproc_aging_aggr_pages = (unsigned int)MEMSTAT_PERCENT_TOTAL_PAGES(MEMORYSTATUS_AGGR_SYSPROC_AGING_PERCENTAGE);

	/* Small-memory devices use a different delta percentage. */
	if (max_mem <= MEMORYSTATUS_SMALL_MEMORY_THRESHOLD) {
		memorystatus_delta = (unsigned int)MEMSTAT_PERCENT_TOTAL_PAGES(MEMORYSTATUS_DELTA_PERCENTAGE_SMALL);
	} else {
		memorystatus_delta = (unsigned int)MEMSTAT_PERCENT_TOTAL_PAGES(MEMORYSTATUS_DELTA_PERCENTAGE_LARGE);
	}

	/*
	 * Critical threshold: explicit MiB override, else a percentage of total
	 * pages based on the memory config.
	 * NOTE(review): unlike the freeze threshold below, this MiB->bytes
	 * conversion (`mb << 20`) is not widened to 64-bit first — presumably
	 * safe because overrides are far below 4 GiB; confirm.
	 */
	if (memorystatus_critical_threshold_mb != 0) {
		memstat_critical_threshold = atop_32(memorystatus_critical_threshold_mb << 20);
	} else {
		if (max_mem <= MEMORYSTATUS_SMALL_MEMORY_THRESHOLD) {
			memstat_critical_threshold = MEMSTAT_PERCENT_TOTAL_PAGES(MEMORYSTATUS_CRITICAL_PERCENTAGE_SMALL);
		} else {
			memstat_critical_threshold = MEMSTAT_PERCENT_TOTAL_PAGES(MEMORYSTATUS_CRITICAL_PERCENTAGE_LARGE);
		}
	}
	assert3u(memstat_critical_threshold, <, MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX);

	/* Idle threshold: explicit MiB override, else a ratio of the critical threshold. */
	if (memorystatus_idle_threshold_mb != 0) {
		memstat_idle_threshold = atop_32(memorystatus_idle_threshold_mb << 20);
	} else {
		/*
		 * For historical reasons, devices with "medium"-sized memory configs have a different critical:idle:pressure ratio
		 */
		if ((max_mem > MEMORYSTATUS_SMALL_MEMORY_THRESHOLD) &&
		    (max_mem <= MEMORYSTATUS_MEDIUM_MEMORY_THRESHOLD)) {
			memstat_idle_threshold = (MEMORYSTATUS_IDLE_RATIO_NUM_MEDIUM * memstat_critical_threshold) /
			    MEMORYSTATUS_IDLE_RATIO_DENOM_MEDIUM;
		} else {
			memstat_idle_threshold = (MEMORYSTATUS_IDLE_RATIO_NUM * memstat_critical_threshold) /
			    MEMORYSTATUS_IDLE_RATIO_DENOM;
		}
	}
	assert3u(memstat_idle_threshold, <, MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX);

	/* Soft (pressure) threshold: same override-or-ratio pattern. */
	if (memorystatus_pressure_threshold_mb != 0) {
		memstat_soft_threshold = atop_32(memorystatus_pressure_threshold_mb << 20);
	} else {
		if ((max_mem > MEMORYSTATUS_SMALL_MEMORY_THRESHOLD) &&
		    (max_mem <= MEMORYSTATUS_MEDIUM_MEMORY_THRESHOLD)) {
			memstat_soft_threshold = (MEMORYSTATUS_PRESSURE_RATIO_NUM_MEDIUM * memstat_critical_threshold) /
			    MEMORYSTATUS_PRESSURE_RATIO_DENOM_MEDIUM;
		} else {
			memstat_soft_threshold = (MEMORYSTATUS_PRESSURE_RATIO_NUM * memstat_critical_threshold) /
			    MEMORYSTATUS_PRESSURE_RATIO_DENOM;
		}
	}
	assert3u(memstat_soft_threshold, <, MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX);

	memstat_reaper_max_priority = MEMORYSTATUS_REAPER_MAX_PRIORITY_DEFAULT;

	/* Reaper threshold: override, else ratio; large configs get their own ratio. */
	if (memorystatus_reaper_threshold_mb != 0) {
		memstat_reaper_threshold = atop_32(memorystatus_reaper_threshold_mb << 20);
	} else {
		if ((max_mem > MEMORYSTATUS_SMALL_MEMORY_THRESHOLD) &&
		    (max_mem <= MEMORYSTATUS_MEDIUM_MEMORY_THRESHOLD)) {
			memstat_reaper_threshold = (MEMORYSTATUS_REAPER_RATIO_NUM_MEDIUM * memstat_critical_threshold) /
			    MEMORYSTATUS_REAPER_RATIO_DENOM_MEDIUM;
		} else if (max_mem > MEMORYSTATUS_MEDIUM_MEMORY_THRESHOLD) {
			memstat_reaper_threshold = (MEMORYSTATUS_REAPER_RATIO_NUM_LARGE * memstat_critical_threshold) /
			    MEMORYSTATUS_REAPER_RATIO_DENOM_LARGE;
		} else {
			memstat_reaper_threshold = (MEMORYSTATUS_REAPER_RATIO_NUM * memstat_critical_threshold) /
			    MEMORYSTATUS_REAPER_RATIO_DENOM;
		}
	}
	assert3u(memstat_reaper_threshold, <, MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX);

	/* Reaper minimum-idle-age (sysprocs): sentinel means "use per-config default". */
	if (memorystatus_reaper_minimum_age_seconds != MEMORYSTATUS_REAPER_SENTINAL_VALUE_MEANING_USE_DEFAULT) {
		memstat_reaper_min_age_secs = memorystatus_reaper_minimum_age_seconds;
	} else {
		if ((max_mem > MEMORYSTATUS_SMALL_MEMORY_THRESHOLD) &&
		    (max_mem <= MEMORYSTATUS_MEDIUM_MEMORY_THRESHOLD)) {
			memstat_reaper_min_age_secs = MEMORYSTATUS_REAPER_MIN_AGE_SECS_DEFAULT_MEDIUM;
		} else {
			memstat_reaper_min_age_secs = MEMORYSTATUS_REAPER_MIN_AGE_SECS_DEFAULT;
		}
	}

	/* Reaper minimum-idle-age (apps): same sentinel convention. */
	if (memorystatus_reaper_minimum_age_apps_seconds != MEMORYSTATUS_REAPER_SENTINAL_VALUE_MEANING_USE_DEFAULT) {
		memstat_reaper_min_age_apps_secs = memorystatus_reaper_minimum_age_apps_seconds;
	} else {
		if ((max_mem > MEMORYSTATUS_SMALL_MEMORY_THRESHOLD) &&
		    (max_mem <= MEMORYSTATUS_MEDIUM_MEMORY_THRESHOLD)) {
			memstat_reaper_min_age_apps_secs = MEMORYSTATUS_REAPER_MIN_AGE_APPS_SECS_DEFAULT_MEDIUM;
		} else {
			memstat_reaper_min_age_apps_secs = MEMORYSTATUS_REAPER_MIN_AGE_APPS_SECS_DEFAULT;
		}
	}

	if (memorystatus_reaper_rescan_delay_seconds != MEMORYSTATUS_REAPER_SENTINAL_VALUE_MEANING_USE_DEFAULT) {
		memstat_reaper_rescan_secs = memorystatus_reaper_rescan_delay_seconds;
	} else {
		memstat_reaper_rescan_secs = MEMORYSTATUS_REAPER_RESCAN_SECS_DEFAULT;
	}

	memstat_reaper_enabled = memorystatus_reaper_enabled;

	/* Optional ballast / clear-the-decks offsets (MiB -> pages). */
	if (memstat_ballast_offset_mb != 0) {
		memstat_ballast_offset = atop_32(memstat_ballast_offset_mb << 20);
	}
	assert3u(memstat_ballast_offset, <, MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX);

	if (memstat_ctd_offset_mb != 0) {
		memstat_ctd_offset = atop_32(memstat_ctd_offset_mb << 20);
	}
	assert3u(memstat_ctd_offset, <, MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX);

	/* Set the swapin trigger in pages based on the maximum size allocated for each c_seg */
	memorystatus_swapin_trigger_pages = (unsigned int) atop_64(memorystatus_swapin_trigger_segments * c_seg_allocsize);

	/* Jetsam Loop Detection */
	if (max_mem <= (512 * 1024 * 1024)) {
		/* 512 MB devices */
		memorystatus_jld_eval_period_msecs = 8000; /* 8000 msecs == 8 second window */
	} else {
		/* 1GB and larger devices */
		memorystatus_jld_eval_period_msecs = 6000; /* 6000 msecs == 6 second window */
	}

	memorystatus_jld_enabled = TRUE;

	initialize_entitled_max_task_limit();
#endif /* CONFIG_JETSAM */

	/* Snapshot buffer sized to hold one entry per possible process. */
	memorystatus_jetsam_snapshot_max = maxproc;

	memorystatus_jetsam_snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
	    (sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_max);

	memorystatus_jetsam_snapshot = kalloc_data(memorystatus_jetsam_snapshot_size, Z_WAITOK | Z_ZERO);
	if (!memorystatus_jetsam_snapshot) {
		panic("Could not allocate memorystatus_jetsam_snapshot");
	}

#if CONFIG_FREEZE
	/* Separate (smaller) snapshot buffer dedicated to freezer kills. */
	memorystatus_jetsam_snapshot_freezer_max = memorystatus_jetsam_snapshot_max / JETSAM_SNAPSHOT_FREEZER_MAX_FACTOR;
	memorystatus_jetsam_snapshot_freezer_size = sizeof(memorystatus_jetsam_snapshot_t) +
	    (sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_freezer_max);

	memorystatus_jetsam_snapshot_freezer =
	    zalloc_permanent(memorystatus_jetsam_snapshot_freezer_size, ZALIGN_PTR);
#endif /* CONFIG_FREEZE */

	nanoseconds_to_absolutetime((uint64_t)JETSAM_SNAPSHOT_TIMEOUT_SECS * NSEC_PER_SEC, &memorystatus_jetsam_snapshot_timeout);

	memset(&memorystatus_at_boot_snapshot, 0, sizeof(memorystatus_jetsam_snapshot_t));

#if CONFIG_FREEZE
	/* Freeze threshold: explicit MiB override (widened to 64-bit), else a percentage. */
	if (memorystatus_freeze_threshold_mb != 0) {
		memorystatus_freeze_threshold = (unsigned int)atop_64((uint64_t)memorystatus_freeze_threshold_mb << 20);
	} else {
		memorystatus_freeze_threshold = (unsigned int)MEMSTAT_PERCENT_TOTAL_PAGES(MEMORYSTATUS_FREEZE_THRESHOLD_PERCENTAGE);
	}
	assert(memorystatus_freeze_threshold < (unsigned int)atop_64(max_mem));

	if (memorystatus_swap_all_apps) {
		/*
		 * Swap is enabled, so we expect a larger working set & larger apps.
		 * Adjust thresholds accordingly.
		 */
		memorystatus_freeze_configure_for_swap();
	}
#endif

	/* Check the boot-arg to configure the maximum number of jetsam threads */
	if (!PE_parse_boot_argn("max_jetsam_threads", &max_jetsam_threads, sizeof(max_jetsam_threads))) {
		max_jetsam_threads = JETSAM_THREADS_LIMIT;
	}

	/* Restrict the maximum number of jetsam threads to JETSAM_THREADS_LIMIT */
	if (max_jetsam_threads > JETSAM_THREADS_LIMIT) {
		max_jetsam_threads = JETSAM_THREADS_LIMIT;
	}

#if CONFIG_JETSAM
	/* For low CPU systems disable fast jetsam mechanism */
	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
		max_jetsam_threads = 1;
	}
#endif /* CONFIG_JETSAM */

#if DEVELOPMENT || DEBUG
	/* `i` is reused here as scratch for boot-arg parsing. */
	if (PE_parse_boot_argn("-memorystatus-skip-fg-notify", &i, sizeof(i))) {
		memorystatus_should_issue_fg_band_notify = false;
	}

	if (PE_parse_boot_argn("memorystatus_kill_on_sustained_pressure", &i, sizeof(i))) {
		if (i) {
			memstat_pressure_config |= MEMSTAT_WARNING_KILL_SUSTAINED;
		} else {
			memstat_pressure_config &= ~MEMSTAT_WARNING_KILL_SUSTAINED;
		}
	}
#endif /* DEVELOPMENT || DEBUG */

	/* Initialize the jetsam_threads state array */
	jetsam_threads = zalloc_permanent(sizeof(struct jetsam_state_s) *
	    max_jetsam_threads, ZALIGN(struct jetsam_state_s));

	/* Initialize all the jetsam threads */
	for (i = 0; i < max_jetsam_threads; i++) {
		jetsam_threads[i].inited = false;
		jetsam_threads[i].index = i;
		result = kernel_thread_start_priority(memorystatus_thread, NULL, 95 /* MAXPRI_KERNEL */, &jetsam_threads[i].thread);
		if (result != KERN_SUCCESS) {
			panic("Could not create memorystatus_thread %d", i);
		}
		/* Drop the creation reference; the thread struct keeps its own. */
		thread_deallocate(jetsam_threads[i].thread);
	}

#if VM_PRESSURE_EVENTS
	memorystatus_notify_init();
#endif /* VM_PRESSURE_EVENTS */

#if JETSAM_ZPRINT_SNAPSHOT
	/* Pre-allocate buffers for the zone-print snapshot taken at jetsam time. */
	size_t jzs_names_size, jzs_info_size, jzs_meminfo_size;

	jzs_zone_cnt = zone_max_zones();
	jzs_names_size = jzs_zone_cnt * sizeof(mach_zone_name_t);
	jzs_names = zalloc_permanent(jzs_names_size, ZALIGN(mach_zone_name_t));

	jzs_info_size = jzs_zone_cnt * sizeof(mach_zone_info_t);
	jzs_info = zalloc_permanent(jzs_info_size, ZALIGN(mach_zone_info_t));

	jzs_coalesce = zalloc_permanent(jzs_zone_cnt * sizeof(int), ZALIGN(int));

	jzs_meminfo_cnt = vm_page_diagnose_estimate();
	jzs_meminfo_size = jzs_meminfo_cnt * sizeof(mach_memory_info_t);
	jzs_meminfo = kalloc_data_tag(jzs_meminfo_size, Z_WAITOK, VM_KERN_MEMORY_DIAG);
#endif /* JETSAM_ZPRINT_SNAPSHOT */

	bzero(memorystatus_kill_counts, sizeof(memorystatus_kill_counts));
}
2279
2280 #if CONFIG_JETSAM
2281 bool
memorystatus_disable_swap(void)2282 memorystatus_disable_swap(void)
2283 {
2284 #if DEVELOPMENT || DEBUG
2285 int boot_arg_val = 0;
2286 if (PE_parse_boot_argn("kern.swap_all_apps", &boot_arg_val, sizeof(boot_arg_val))) {
2287 if (boot_arg_val) {
2288 /* Can't disable app swap if it was set via a boot-arg */
2289 return false;
2290 }
2291 }
2292 #endif /* DEVELOPMENT || DEBUG */
2293 memorystatus_swap_all_apps = false;
2294 #if CONFIG_FREEZE
2295 /* Go back to the smaller freezer thresholds */
2296 memorystatus_freeze_disable_swap();
2297 #endif /* CONFIG_FREEZE */
2298 initialize_entitled_max_task_limit();
2299 return true;
2300 }
2301 #endif /* CONFIG_JETSAM */
2302
/*
 * Record one jetsam kill in the per-(priority, cause) statistics.
 *
 * Idle-exit kills at JETSAM_PRIORITY_IDLE are counted in a dedicated
 * counter (rdar://141462516); all other valid kills index into
 * memorystatus_kill_counts, whose second dimension is the cause value
 * compacted to skip kMemorystatusInvalid and kMemorystatusKilledIdleExit
 * (hence the "- 1" / "- 2" index adjustments below, which depend on the
 * exact enum ordering). Counts saturate at UINT32_MAX rather than wrapping.
 */
static void
_memstat_record_kill(int32_t priority, memorystatus_kill_cause_t cause)
{
	uint32_t _Atomic *count;
	uint32_t orig;

	/* Check validity of reason / cause */
	if ((priority < JETSAM_PRIORITY_IDLE) ||
	    (priority > JETSAM_PRIORITY_MAX) ||
	    (cause <= kMemorystatusInvalid) ||
	    (cause > JETSAM_REASON_MEMORYSTATUS_MAX)) {
		memorystatus_log_error("memorystatus: not tracking kill with invalid priority %d / cause %d\n",
		    priority, cause);
		return;
	}

	if ((priority == JETSAM_PRIORITY_IDLE) && (cause == kMemorystatusKilledIdleExit)) {
		/* rdar://141462516 */
		count = &memorystatus_idle_exit_kill_count;
	} else {
		if (cause == kMemorystatusKilledIdleExit) {
			/* Idle-exit at a non-idle priority is unexpected; don't count it. */
			memorystatus_log_error("memorystatus: not tracking idle exit kill for priority %d\n", priority);
			return;
		}

		/* kMemorystatusKilledIdleExit and kMemorystatusInvalid are not in the array */
		if (cause < kMemorystatusKilledIdleExit) {
			count = &memorystatus_kill_counts[priority][cause - 1];
		} else {
			count = &memorystatus_kill_counts[priority][cause - 2];
		}
	}

	orig = os_atomic_inc_orig(count, relaxed);
	if (orig == UINT32_MAX) {
		/* Undo the wrap so the counter saturates instead of rolling over. */
		os_atomic_dec(count, relaxed);
		memorystatus_log_error("memorystatus: overflowed kill count for priority %d + cause %d\n", priority, cause);
	}
}
2342
2343 /*
2344 * The jetsam no frills kill call
2345 * Return: 0 on success
2346 * error code on failure (EINVAL...)
2347 */
2348 static int
jetsam_do_kill(proc_t p,int jetsam_flags,os_reason_t jetsam_reason)2349 jetsam_do_kill(proc_t p, int jetsam_flags, os_reason_t jetsam_reason)
2350 {
2351 int error = 0;
2352 error = exit_with_reason(p, W_EXITCODE(0, SIGKILL), (int *)NULL, FALSE, FALSE, jetsam_flags, jetsam_reason);
2353 return error;
2354 }
2355
2356 /*
2357 * Wrapper for processes exiting with memorystatus details
2358 */
static bool
memorystatus_do_kill(proc_t p, uint32_t cause, os_reason_t jetsam_reason, uint64_t *footprint_out)
{
	int error = 0;
	__unused pid_t victim_pid = proc_getpid(p);
	/* Snapshot the physical footprint before the kill so it can be reported. */
	uint64_t footprint = get_task_phys_footprint(proc_task(p));
#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD)
	int32_t memstat_effectivepriority = p->p_memstat_effectivepriority;
#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_DO_KILL) | DBG_FUNC_START,
	    victim_pid, cause, vm_page_free_count, footprint);
	DTRACE_MEMORYSTATUS4(memorystatus_do_kill, proc_t, p, os_reason_t, jetsam_reason, uint32_t, cause, uint64_t, footprint);

#if CONFIG_JETSAM
	/*
	 * Debugging aid: panic when a specifically-named process is jetsammed,
	 * optionally filtered by cause and by minimum footprint (in MB).
	 */
	if (*p->p_name && !strncmp(memorystatus_jetsam_proc_name_panic, p->p_name, sizeof(p->p_name))) { /* name */
		if ((!memorystatus_jetsam_proc_cause_panic || cause == memorystatus_jetsam_proc_cause_panic) && /* cause */
		    (!memorystatus_jetsam_proc_size_panic || (footprint >> 20) >= memorystatus_jetsam_proc_size_panic)) { /* footprint */
			panic("memorystatus_do_kill(): requested panic on jetsam of %s (cause: %d and footprint: %llu mb)",
			    memorystatus_jetsam_proc_name_panic, cause, footprint >> 20);
		}
	}
#else /* CONFIG_JETSAM */
#pragma unused(cause)
#endif /* CONFIG_JETSAM */

	/* Kills at or above FOREGROUND are unusual; log them at default level. */
	if (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND) {
		memorystatus_log(
			"memorystatus: killing process %d [%s] in high band %s (%d) - memorystatus_available_pages: %llu\n",
			proc_getpid(p), (*p->p_name ? p->p_name : "unknown"),
			memorystatus_priority_band_name(p->p_memstat_effectivepriority), p->p_memstat_effectivepriority,
			(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
	}

	_memstat_record_kill(p->p_memstat_effectivepriority, cause);

	/*
	 * The jetsam_reason (os_reason_t) has enough information about the kill cause.
	 * We don't really need jetsam_flags anymore, so it's okay that not all possible kill causes have been mapped.
	 */
	int jetsam_flags = P_LTERM_JETSAM;
	switch (cause) {
	case kMemorystatusKilledHiwat: jetsam_flags |= P_JETSAM_HIWAT; break;
	case kMemorystatusKilledVnodes: jetsam_flags |= P_JETSAM_VNODE; break;
	case kMemorystatusKilledVMPageShortage: jetsam_flags |= P_JETSAM_VMPAGESHORTAGE; break;
	case kMemorystatusKilledVMCompressorThrashing:
	case kMemorystatusKilledVMCompressorSpaceShortage: jetsam_flags |= P_JETSAM_VMTHRASHING; break;
	case kMemorystatusKilledFCThrashing: jetsam_flags |= P_JETSAM_FCTHRASHING; break;
	case kMemorystatusKilledPerProcessLimit: jetsam_flags |= P_JETSAM_PID; break;
	case kMemorystatusKilledIdleExit: jetsam_flags |= P_JETSAM_IDLEEXIT; break;
	case kMemorystatusKilledConclaveLimit: jetsam_flags |= P_JETSAM_PID; break;
	}
	/* jetsam_do_kill drops a reference. */
	os_reason_ref(jetsam_reason);
	error = jetsam_do_kill(p, jetsam_flags, jetsam_reason);
	if (footprint_out) {
		/* Report the pre-kill footprint only on a successful kill. */
		*footprint_out = ((error == 0) ? footprint : 0);
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_DO_KILL) | DBG_FUNC_END,
	    victim_pid, memstat_effectivepriority, vm_page_free_count, error);

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_COMPACTOR_RUN) | DBG_FUNC_START,
	    victim_pid, cause, vm_page_free_count, footprint);

	if (jetsam_reason->osr_code == JETSAM_REASON_VNODE) {
		/*
		 * vnode jetsams are synchronous and not caused by memory pressure.
		 * Running the compactor on this thread adds significant latency to the filesystem operation
		 * that triggered this jetsam.
		 * Kick off the compactor thread asynchronously instead.
		 */
		vm_wake_compactor_swapper();
	} else {
		/* compact now, except for idle reaper kills.
		 * idle reaper kills are done in batches, so we defer compaction until the end of the batch.
		 */
		if (jetsam_reason->osr_code != JETSAM_REASON_MEMORY_LONGIDLE_EXIT) {
			vm_run_compactor();
		}
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_COMPACTOR_RUN) | DBG_FUNC_END,
	    victim_pid, cause, vm_page_free_count);

	/* Drop the caller's reference on the reason (paired with the ref above). */
	os_reason_free(jetsam_reason);
	return error == 0;
}
2447
2448 static int
memstat_update_inactive_priority(proc_t p,boolean_t enable,int jetsam_prio,boolean_t effective_now)2449 memstat_update_inactive_priority(proc_t p, boolean_t enable, int jetsam_prio, boolean_t effective_now)
2450 {
2451 if (_memstat_proc_is_internal(p)) {
2452 return EINVAL;
2453 }
2454
2455 if ((enable && _memstat_proc_is_elevated(p)) ||
2456 (!enable && !_memstat_proc_is_elevated(p))) {
2457 /*
2458 * No change in state.
2459 */
2460 } else {
2461 proc_list_lock();
2462
2463 if (enable) {
2464 p->p_memstat_state |= P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
2465
2466 if (effective_now) {
2467 if (p->p_memstat_effectivepriority < jetsam_prio) {
2468 memstat_update_priority_locked(p, jetsam_prio, MEMSTAT_PRIORITY_OPTIONS_NONE);
2469 }
2470 } else {
2471 if (_memstat_proc_is_aging(p)) {
2472 memstat_update_priority_locked(p, JETSAM_PRIORITY_IDLE, MEMSTAT_PRIORITY_NO_AGING);
2473 }
2474 }
2475 } else {
2476 p->p_memstat_state &= ~P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
2477
2478 if (effective_now) {
2479 if (p->p_memstat_effectivepriority == jetsam_prio) {
2480 memstat_update_priority_locked(p, JETSAM_PRIORITY_IDLE, MEMSTAT_PRIORITY_NO_AGING);
2481 }
2482 } else {
2483 if (_memstat_proc_is_aging(p)) {
2484 memstat_update_priority_locked(p, JETSAM_PRIORITY_IDLE, MEMSTAT_PRIORITY_NO_AGING);
2485 }
2486 }
2487 }
2488 proc_list_unlock();
2489 }
2490 return 0;
2491 }
2492
2493 /*
2494 * Pin a process to a particular jetsam band when it is in the background i.e. not doing active work.
2495 * For an application: that means no longer in the FG band
2496 * For a daemon: that means no longer in its 'requested' jetsam priority band
2497 */
2498
2499 int
memorystatus_update_inactive_jetsam_priority_band(pid_t pid,uint32_t op_flags,int jetsam_prio,boolean_t effective_now)2500 memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t op_flags, int jetsam_prio, boolean_t effective_now)
2501 {
2502 int error = 0;
2503 boolean_t enable = FALSE;
2504 proc_t p = NULL;
2505
2506 /* Validate inputs */
2507 if (pid == 0) {
2508 return EINVAL;
2509 }
2510
2511 if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE) {
2512 enable = TRUE;
2513 } else if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE) {
2514 enable = FALSE;
2515 } else {
2516 return EINVAL;
2517 }
2518
2519 p = proc_find(pid);
2520 if (p != NULL) {
2521 error = memstat_update_inactive_priority(p, enable, jetsam_prio, effective_now);
2522 proc_rele(p);
2523 } else {
2524 error = ESRCH;
2525 }
2526 return error;
2527 }
2528
2529 static bool
_memstat_proc_has_importance_assertion(proc_t p)2530 _memstat_proc_has_importance_assertion(proc_t p)
2531 {
2532 return (p->p_memstat_state & P_MEMSTAT_TEST_IMP_ASSERTION) || task_has_assertions(proc_task(p));
2533 }
2534
2535 static void
_memstat_perform_idle_demotion_for_band(unsigned int demote_prio_band)2536 _memstat_perform_idle_demotion_for_band(unsigned int demote_prio_band)
2537 {
2538 proc_t p;
2539 uint64_t current_time = 0, idle_delay_time = 0;
2540 memstat_bucket_t *demotion_bucket;
2541
2542 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
2543
2544 current_time = mach_absolute_time();
2545
2546 demotion_bucket = &memstat_bucket[demote_prio_band];
2547 p = memorystatus_get_first_proc_locked(&demote_prio_band, FALSE);
2548
2549 while (p) {
2550 memorystatus_log_debug("memorystatus_perform_idle_demotion() found %s [%d]\n", proc_best_name(p), proc_getpid(p));
2551
2552 assert(p->p_memstat_idledeadline);
2553
2554 assert(_memstat_proc_is_aging(p));
2555
2556 if (current_time >= p->p_memstat_idledeadline) {
2557 proc_t next_proc = NULL;
2558
2559 next_proc = memorystatus_get_next_proc_locked(&demote_prio_band, p, FALSE);
2560
2561 if ((isSysProc(p) && _memstat_proc_is_dirty(p)) || /* system proc marked dirty*/
2562 _memstat_proc_has_importance_assertion(p)) { /* has outstanding assertions which might indicate outstanding work too */
2563 idle_delay_time = (isSysProc(p)) ? memorystatus_sysprocs_idle_time(p) : memorystatus_apps_idle_time(p);
2564
2565 if (isSysProc(p) && _memstat_proc_has_importance_assertion(p)) {
2566 if (demote_prio_band != system_procs_aging_band_stuck) {
2567 memorystatus_log_debug("memorystatus_perform_idle_demotion() found stuck process %d [%s], moving to JETSAM_PRIORITY_AGING_BAND1_STUCK\n",
2568 proc_getpid(p), (*p->p_name ? p->p_name : "(unknown)"));
2569 memstat_update_priority_locked(p, JETSAM_PRIORITY_AGING_BAND1_STUCK, MEMSTAT_PRIORITY_NO_AGING);
2570 idle_delay_time = _memstat_sysprocs_aging_stuck_delay_time(p);
2571 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_NONE,
2572 proc_pid(p), JETSAM_PRIORITY_AGING_BAND1_STUCK, p->p_memstat_idledeadline + idle_delay_time);
2573 } else {
2574 memorystatus_log("memorystatus_perform_idle_demotion() timed out stuck process %d [%s], moving to idle band\n",
2575 proc_getpid(p), proc_best_name(p));
2576 memstat_update_priority_locked(p, JETSAM_PRIORITY_IDLE, MEMSTAT_PRIORITY_NO_AGING);
2577 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_NONE,
2578 proc_pid(p), JETSAM_PRIORITY_IDLE, p->p_memstat_idledeadline);
2579 idle_delay_time = 0;
2580 }
2581 }
2582
2583 p->p_memstat_idledeadline += idle_delay_time;
2584 } else {
2585 memstat_update_priority_locked(p, JETSAM_PRIORITY_IDLE, MEMSTAT_PRIORITY_NO_AGING);
2586 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_NONE,
2587 proc_pid(p), JETSAM_PRIORITY_IDLE, p->p_memstat_idledeadline);
2588 }
2589 p = next_proc;
2590 } else {
2591 // No further candidates
2592 break;
2593 }
2594 }
2595 }
2596
2597 static void
memorystatus_perform_idle_demotion(__unused void * spare1,__unused void * spare2)2598 memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2)
2599 {
2600 memorystatus_log_debug("memorystatus_perform_idle_demotion()\n");
2601
2602 if (!system_procs_aging_band && !system_procs_aging_band_stuck && !applications_aging_band) {
2603 return;
2604 }
2605
2606 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_START);
2607
2608 proc_list_lock();
2609
2610 _memstat_perform_idle_demotion_for_band(system_procs_aging_band);
2611 _memstat_perform_idle_demotion_for_band(system_procs_aging_band_stuck);
2612 _memstat_perform_idle_demotion_for_band(applications_aging_band);
2613
2614 _memstat_reschedule_idle_demotion_locked();
2615
2616 proc_list_unlock();
2617
2618 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_END);
2619 }
2620
2621 /*
2622 * Schedule a process for idle demotion. Updates the process' idle deadline
2623 * and marks it as aging. The caller is responsible for rescheduling the idle
2624 * demotion thread
2625 */
2626 static void
_memstat_schedule_idle_demotion_locked(proc_t p)2627 _memstat_schedule_idle_demotion_locked(proc_t p)
2628 {
2629 uint64_t idle_delay_time = 0;
2630 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
2631 assert(system_procs_aging_band || applications_aging_band);
2632 assert(!_memstat_proc_is_aging(p));
2633
2634 memorystatus_log_debug(
2635 "%s: scheduling demotion to idle band for pid %d (dirty:0x%x).\n",
2636 __func__, proc_getpid(p), p->p_memstat_dirty);
2637
2638 idle_delay_time = isSysProc(p) ? memorystatus_sysprocs_idle_time(p) :
2639 memorystatus_apps_idle_time(p);
2640 p->p_memstat_dirty |= P_DIRTY_AGING_IN_PROGRESS;
2641 p->p_memstat_idledeadline = mach_absolute_time() + idle_delay_time;
2642 }
2643
2644 /*
2645 * Cancel a process' idle demotion. The caller must also reschedule the idle
2646 * demotion thread.
2647 */
2648 static void
_memstat_invalidate_idle_demotion_locked(proc_t p)2649 _memstat_invalidate_idle_demotion_locked(proc_t p)
2650 {
2651 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
2652 assert(system_procs_aging_band || applications_aging_band);
2653 assert(_memstat_proc_is_aging(p));
2654
2655 memorystatus_log_debug(
2656 "%s: invalidating demotion to idle band for %s [%d]\n",
2657 __func__, proc_best_name(p), proc_getpid(p));
2658
2659 p->p_memstat_idledeadline = 0;
2660 p->p_memstat_dirty &= ~P_DIRTY_AGING_IN_PROGRESS;
2661 }
2662
2663 /*
2664 * Return the earliest idle deadline of all aging procs. Returns 0 if there are
2665 * no aging procs.
2666 */
2667 static uint64_t
_memstat_find_earliest_idle_deadline(void)2668 _memstat_find_earliest_idle_deadline(void)
2669 {
2670 memstat_bucket_t *demotion_bucket;
2671 proc_t oldest_proc = PROC_NULL;
2672 uint32_t aging_app_count = 0, aging_sysproc_count = 0, aging_sysproc_count_stuck = 0;
2673 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
2674 assert(system_procs_aging_band || system_procs_aging_band_stuck || applications_aging_band);
2675
2676 if (system_procs_aging_band) {
2677 aging_sysproc_count = memstat_bucket[system_procs_aging_band].count;
2678 }
2679 if (system_procs_aging_band_stuck) {
2680 aging_sysproc_count_stuck = memstat_bucket[system_procs_aging_band_stuck].count;
2681 }
2682 if (applications_aging_band) {
2683 aging_app_count = memstat_bucket[applications_aging_band].count;
2684 }
2685
2686 if ((aging_app_count + aging_sysproc_count + aging_sysproc_count_stuck) == 0) {
2687 return 0;
2688 }
2689
2690 if (system_procs_aging_band && aging_sysproc_count > 0) {
2691 demotion_bucket = &memstat_bucket[system_procs_aging_band];
2692 oldest_proc = TAILQ_FIRST(&demotion_bucket->list);
2693 }
2694
2695 if (system_procs_aging_band_stuck && aging_sysproc_count_stuck > 0) {
2696 proc_t oldest_sysproc_stuck;
2697 demotion_bucket = &memstat_bucket[system_procs_aging_band_stuck];
2698 oldest_sysproc_stuck = TAILQ_FIRST(&demotion_bucket->list);
2699
2700 if (oldest_proc) {
2701 if (oldest_sysproc_stuck->p_memstat_idledeadline <
2702 oldest_proc->p_memstat_idledeadline) {
2703 oldest_proc = oldest_sysproc_stuck;
2704 }
2705 } else {
2706 oldest_proc = oldest_sysproc_stuck;
2707 }
2708 }
2709
2710 if (applications_aging_band && aging_app_count > 0) {
2711 proc_t oldest_app;
2712 demotion_bucket = &memstat_bucket[applications_aging_band];
2713 oldest_app = TAILQ_FIRST(&demotion_bucket->list);
2714
2715 if (!oldest_proc ||
2716 (oldest_app->p_memstat_idledeadline <
2717 oldest_proc->p_memstat_idledeadline)) {
2718 oldest_proc = oldest_app;
2719 }
2720 }
2721
2722 assert(oldest_proc);
2723 assert(oldest_proc->p_memstat_idledeadline);
2724 assert(_memstat_proc_is_aging(oldest_proc));
2725
2726 return oldest_proc->p_memstat_idledeadline;
2727 }
2728
2729 /*
2730 * Reschedule or cancel a pending wakeup of the idle_demotion thread. If called
2731 * in response to a process transitioning in/out of the aging band, then
2732 * rescheduling must occur *after* the new priority is updated.
2733 */
2734 static void
_memstat_reschedule_idle_demotion_locked(void)2735 _memstat_reschedule_idle_demotion_locked(void)
2736 {
2737 uint64_t idle_deadline;
2738 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
2739
2740 if (!system_procs_aging_band && !applications_aging_band) {
2741 return;
2742 }
2743 idle_deadline = _memstat_find_earliest_idle_deadline();
2744 if (idle_deadline == 0) {
2745 /* No aging processes, cancel call to demotion thread */
2746 thread_call_cancel(memorystatus_idle_demotion_call);
2747 } else if (memstat_idle_demotion_deadline != idle_deadline) {
2748 thread_call_enter_delayed(memorystatus_idle_demotion_call, idle_deadline);
2749 }
2750 memstat_idle_demotion_deadline = idle_deadline;
2751 }
2752
2753 /*
2754 * List manipulation
2755 */
2756
/*
 * Add a newly-created process to the memorystatus bucket matching its
 * effective priority. Takes proc_list_lock unless `locked` says the caller
 * already holds it. Always returns 0.
 */
int
memorystatus_add(proc_t p, boolean_t locked)
{
	memstat_bucket_t *bucket;
	bool reschedule_demotion = false;

	memorystatus_log_debug("memorystatus_list_add(): adding pid %d with priority %d.\n",
	    proc_getpid(p), p->p_memstat_effectivepriority);

	if (!locked) {
		proc_list_lock();
	}

	DTRACE_MEMORYSTATUS2(memorystatus_add, proc_t, p, int32_t, p->p_memstat_effectivepriority);

	/* Processes marked internal do not have priority tracked */
	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
		goto exit;
	}

	/*
	 * Opt out system processes from being frozen by default.
	 * For coalition-based freezing, we only want to freeze sysprocs that have specifically opted in.
	 */
	if (isSysProc(p)) {
		p->p_memstat_state |= P_MEMSTAT_FREEZE_DISABLED;
	}
#if CONFIG_FREEZE
	memorystatus_freeze_init_proc(p);
#endif

	bucket = &memstat_bucket[p->p_memstat_effectivepriority];

	/* A proc born into an aging band needs an idle deadline stamped. */
	if ((system_procs_aging_band &&
	    p->p_memstat_effectivepriority == system_procs_aging_band) ||
	    (applications_aging_band &&
	    p->p_memstat_effectivepriority == applications_aging_band)) {
		_memstat_schedule_idle_demotion_locked(p);
		reschedule_demotion = true;
	}

	/* Start the clock on time-in-band accounting. */
	p->p_memstat_prio_start = mach_absolute_time();

	TAILQ_INSERT_TAIL(&bucket->list, p, p_memstat_list);
	bucket->count++;
	if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
		bucket->relaunch_high_count++;
	}

	memorystatus_list_count++;

	if (reschedule_demotion) {
		/* Reschedule only after the proc is linked into its bucket. */
		_memstat_reschedule_idle_demotion_locked();
	}

	/* Inherit the suspended state if the task is already app-suspended. */
	task_t t = proc_task(p);
	if (t && task_is_app_suspended(t)) {
		_memstat_proc_set_suspended(p);
	}

	_memstat_consider_waking_jetsam_thread();

exit:
	if (!locked) {
		proc_list_unlock();
	}

	return 0;
}
2826
2827 /*
2828 * Record timestamps if process p is transitioning in/out of the IDLE band.
2829 */
2830 static void
_memstat_record_prio_transition(proc_t p,int new_priority)2831 _memstat_record_prio_transition(proc_t p, int new_priority)
2832 {
2833 uint64_t now;
2834
2835 if (p->p_memstat_effectivepriority == new_priority) {
2836 /* no change in priority */
2837 return;
2838 }
2839
2840 now = mach_absolute_time();
2841
2842 if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
2843 /*
2844 * Transitioning out of the idle priority bucket.
2845 * Record idle delta.
2846 */
2847 assert(p->p_memstat_prio_start != 0);
2848 if (now < p->p_memstat_prio_start) {
2849 // rdar://139660508
2850 memorystatus_log_error("memorystatus: prio_start > mach_absolute_time "
2851 "for %s(%d)? Using delta of 0.\n",
2852 proc_best_name(p), proc_getpid(p));
2853 p->p_memstat_prio_start = now;
2854 }
2855 p->p_memstat_idle_delta = now - p->p_memstat_prio_start;
2856
2857 /*
2858 * About to become active and so memory footprint could change.
2859 * So mark it eligible for freeze-considerations next time around.
2860 */
2861 p->p_memstat_state &= ~P_MEMSTAT_FREEZE_IGNORE;
2862
2863 _memstat_consider_waking_jetsam_thread();
2864 }
2865 p->p_memstat_prio_start = now;
2866 }
2867
2868 /*
2869 * Description:
2870 * Moves a process from one jetsam bucket to another.
2871 * which changes the LRU position of the process.
2872 *
2873 * Monitors transition between buckets and if necessary
2874 * will update cached memory limits accordingly.
2875 *
2876 */
void
memstat_update_priority_locked(proc_t p,
    int priority,
    memstat_priority_options_t options)
{
	memstat_bucket_t *old_bucket, *new_bucket;
	bool reschedule_demotion = false;

	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);

	assert(priority < MEMSTAT_BUCKET_COUNT);
	/* Not allowed */
	assert(!_memstat_proc_is_internal(p));

	/* Ensure that exit isn't underway, leaving the proc retained but removed from its bucket */
	if (proc_list_exited(p)) {
		return;
	}

	memorystatus_log_debug("memorystatus: setting %s(%d) to priority %d, inserting at %s\n",
	    (*p->p_name ? p->p_name : "unknown"), proc_getpid(p), priority,
	    (options & MEMSTAT_PRIORITY_INSERT_HEAD) ? "head" : "tail");

	DTRACE_MEMORYSTATUS3(memorystatus_update_priority, proc_t, p, int32_t, p->p_memstat_effectivepriority, int, priority);

	old_bucket = &memstat_bucket[p->p_memstat_effectivepriority];

	/*
	 * Only clean idle-exit-capable procs and assertion-less managed procs
	 * may actually sit at IDLE; anything else is pushed to BACKGROUND.
	 */
	if (priority == JETSAM_PRIORITY_IDLE &&
	    !(_memstat_proc_can_idle_exit(p) && !_memstat_proc_is_dirty(p)) &&
	    !(_memstat_proc_is_managed(p) && !_memstat_proc_has_priority_assertion(p))) {
		priority = JETSAM_PRIORITY_BACKGROUND;
		memorystatus_log_error("memorystatus: %s [%d] is neither "
		    "clean (0x%x) nor assertion-less (0x%x) and cannot "
		    "therefore be idle - overriding to pri %d\n",
		    proc_best_name(p), proc_getpid(p), p->p_memstat_dirty,
		    p->p_memstat_state, priority);
	}

	/* Unless aging is suppressed, redirect the target band as needed. */
	if (!(options & MEMSTAT_PRIORITY_NO_AGING)) {
		if (_memstat_proc_is_elevated(p)) {
			/*
			 * 2 types of processes can use the non-standard elevated inactive band:
			 * - Frozen processes that always land in memorystatus_freeze_jetsam_band
			 * OR
			 * - processes that specifically opt-in to the elevated inactive support e.g. docked processes.
			 */
			if (_memstat_proc_is_frozen(p) &&
			    priority <= memorystatus_freeze_jetsam_band) {
				priority = memorystatus_freeze_jetsam_band;
			} else if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE) {
				priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;
			}
		}
		if (_memstat_proc_is_tracked(p)) {
			/* Dirty-tracked (sysproc) procs route through the sysproc aging band(s). */
			if (system_procs_aging_band && priority <= system_procs_aging_band) {
				if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
					/* process has already aged */
					priority = JETSAM_PRIORITY_IDLE;
				} else {
					priority = system_procs_aging_band;
				}
			} else if (system_procs_aging_band_stuck && priority <= system_procs_aging_band_stuck) {
				if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
					/* process has already aged */
					priority = JETSAM_PRIORITY_IDLE;
				} else {
					/* don't let anyone move anything between sysproc and sysproc stuck inclusive */
					priority = system_procs_aging_band;
				}
			}
		} else if (_memstat_proc_is_managed(p)) {
			/* Managed (app) procs route through the application aging band. */
			if (applications_aging_band && priority <= applications_aging_band) {
				if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
					/* process has already aged */
					priority = JETSAM_PRIORITY_IDLE;
				} else {
					priority = applications_aging_band;
				}
			}
		}
	}

	/* Unlink from the old bucket, keeping its counters in sync. */
	TAILQ_REMOVE(&old_bucket->list, p, p_memstat_list);
	old_bucket->count--;
	if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
		old_bucket->relaunch_high_count--;
	}

	new_bucket = &memstat_bucket[priority];
	if (options & MEMSTAT_PRIORITY_INSERT_HEAD) {
		TAILQ_INSERT_HEAD(&new_bucket->list, p, p_memstat_list);
	} else {
		TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list);
	}
	new_bucket->count++;
	if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
		new_bucket->relaunch_high_count++;
	}

	if (p->p_memstat_effectivepriority != priority) {
		/*
		 * This process is transitioning between
		 * jetsam priority buckets.
		 */
		_memstat_record_prio_transition(p, priority);

		if ((system_procs_aging_band &&
		    p->p_memstat_effectivepriority == system_procs_aging_band) ||
		    (system_procs_aging_band_stuck &&
		    p->p_memstat_effectivepriority == system_procs_aging_band_stuck) ||
		    (applications_aging_band &&
		    p->p_memstat_effectivepriority == applications_aging_band)) {
			/* removing this process from an aging band */
			_memstat_invalidate_idle_demotion_locked(p);
			reschedule_demotion = true;
		}

		if ((system_procs_aging_band &&
		    priority == system_procs_aging_band) ||
		    (system_procs_aging_band_stuck &&
		    priority == system_procs_aging_band_stuck) ||
		    (applications_aging_band &&
		    priority == applications_aging_band)) {
			/* placing this process into an aging band */
			_memstat_schedule_idle_demotion_locked(p);
			reschedule_demotion = true;
		}

		if (reschedule_demotion) {
			/* Must happen after the bucket move so the earliest deadline is accurate. */
			_memstat_reschedule_idle_demotion_locked();
		}

		KDBG(MEMSTAT_CODE(BSD_MEMSTAT_CHANGE_PRIORITY) | DBG_FUNC_NONE,
		    proc_getpid(p), priority, p->p_memstat_effectivepriority);
		p->p_memstat_effectivepriority = priority;
	}

	/* Active/inactive state may have changed; refresh the cached memlimit. */
	if (memorystatus_highwater_enabled) {
		const bool use_active = memstat_proc_is_active_locked(p);
		if (memstat_update_memlimit_locked(p, use_active)) {
			_memstat_write_memlimit_to_ledger_locked(p, use_active, false);
		}
	}

#if CONFIG_SECLUDED_MEMORY
	/* Only FG-and-above eligible tasks may use secluded memory. */
	if (secluded_for_apps &&
	    task_could_use_secluded_mem(proc_task(p))) {
		task_set_can_use_secluded_mem(
			proc_task(p),
			(priority >= JETSAM_PRIORITY_FOREGROUND));
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	_memstat_consider_waking_jetsam_thread();
}
3032
3033 int
memorystatus_relaunch_flags_update(proc_t p,int relaunch_flags)3034 memorystatus_relaunch_flags_update(proc_t p, int relaunch_flags)
3035 {
3036 p->p_memstat_relaunch_flags = relaunch_flags;
3037 KDBG(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_RELAUNCH_FLAGS), proc_getpid(p), relaunch_flags);
3038 return 0;
3039 }
3040
#if DEVELOPMENT || DEBUG
/* Report the current process' relaunch flags, mapped to posix_spawn constants. */
static int sysctl_memorystatus_relaunch_flags SYSCTL_HANDLER_ARGS {
#pragma unused(oidp, arg1, arg2)
	proc_t p = current_proc();
	int flags = p->p_memstat_relaunch_flags;

	switch (flags) {
	case P_MEMSTAT_RELAUNCH_LOW:
		flags = POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_LOW;
		break;
	case P_MEMSTAT_RELAUNCH_MED:
		flags = POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_MED;
		break;
	case P_MEMSTAT_RELAUNCH_HIGH:
		flags = POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_HIGH;
		break;
	default:
		/* Unrecognized values are reported as-is. */
		break;
	}

	return SYSCTL_OUT(req, &flags, sizeof(flags));
}
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_relaunch_flags, CTLTYPE_INT | CTLFLAG_RD |
    CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, sysctl_memorystatus_relaunch_flags, "I", "get relaunch flags for current process");
#endif /* DEVELOPMENT || DEBUG */
3066
/*
 * Everything between the idle band and the application aging band
 * is reserved for internal use. We allow some entitled user space programs
 * to use this range for experimentation.
 */
3072 static bool
current_task_can_use_entitled_range()3073 current_task_can_use_entitled_range()
3074 {
3075 static const char kInternalJetsamRangeEntitlement[] = "com.apple.private.internal-jetsam-range";
3076 task_t task = current_task();
3077 if (task == kernel_task) {
3078 return true;
3079 }
3080 return IOTaskHasEntitlement(task, kInternalJetsamRangeEntitlement);
3081 }
3082
3083 /*
3084 * Set a process' requested priority band. This is the entry point used during
3085 * spawn and by memorystatus_control.
3086 */
int
memorystatus_set_priority(proc_t p, int priority, uint64_t user_data,
    memstat_priority_options_t options)
{
	int ret;

	memorystatus_log_debug("memorystatus: changing (%s) pid %d: priority %d, user_data 0x%llx\n",
	    (*p->p_name ? p->p_name : "unknown"), proc_getpid(p), priority, user_data);

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_UPDATE) | DBG_FUNC_START, proc_getpid(p), priority, user_data, options);

	if (priority == -1) {
		/* Use as shorthand for default priority */
		priority = JETSAM_PRIORITY_DEFAULT;
	} else if (priority > JETSAM_PRIORITY_IDLE && priority <= JETSAM_PRIORITY_AGING_BAND2) {
		/*
		 * Everything between idle and the aging bands are reserved for internal use.
		 * if requested, adjust to JETSAM_PRIORITY_IDLE.
		 * Entitled processes (just munch) can use a subset of this range for testing.
		 */
		if (priority > JETSAM_PRIORITY_ENTITLED_MAX ||
		    !current_task_can_use_entitled_range()) {
			priority = JETSAM_PRIORITY_IDLE;
			options |= MEMSTAT_PRIORITY_NO_AGING;
		}
	} else if (priority == JETSAM_PRIORITY_IDLE_HEAD) {
		/* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle queue */
		priority = JETSAM_PRIORITY_IDLE;
		options |= (MEMSTAT_PRIORITY_INSERT_HEAD | MEMSTAT_PRIORITY_NO_AGING);
	} else if ((priority < 0) || (priority >= MEMSTAT_BUCKET_COUNT)) {
		/* Sanity check */
		ret = EINVAL;
		goto out;
	}

	proc_list_lock();

	/* Internal procs must never come through this path. */
	assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));

	/* An "effective" priority change may be applied at most once per process. */
	if ((options & MEMSTAT_PRIORITY_IS_EFFECTIVE) &&
	    (p->p_memstat_state & P_MEMSTAT_PRIORITYUPDATED)) {
		ret = EALREADY;
		proc_list_unlock();
		memorystatus_log_error("memorystatus_update: effective change specified for pid %d, but change already occurred.\n",
		    proc_getpid(p));
		goto out;
	}

	if ((p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP)) || proc_list_exited(p)) {
		/*
		 * This could happen when a process calling posix_spawn() is exiting on the jetsam thread.
		 */
		ret = EBUSY;
		proc_list_unlock();
		goto out;
	}

	p->p_memstat_state |= P_MEMSTAT_PRIORITYUPDATED;
	p->p_memstat_userdata = user_data;

	if ((options & MEMSTAT_PRIORITY_IS_ASSERTION)) {
		if (priority != JETSAM_PRIORITY_IDLE) {
			/*
			 * Process is now being managed by assertions,
			 */
			p->p_memstat_state |= P_MEMSTAT_PRIORITY_ASSERTION;
			p->p_memstat_assertionpriority = priority;
		} else if (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION) {
			/*
			 * Assertions relinquish control when the process is heading to IDLE.
			 */
			p->p_memstat_state &= ~P_MEMSTAT_PRIORITY_ASSERTION;
		}

		/* Busy dirty-tracked procs keep the higher of assertion/requested priority. */
		if (_memstat_proc_is_tracked(p) &&
		    (_memstat_proc_is_dirty(p) || !_memstat_proc_can_idle_exit(p))) {
			priority = MAX(p->p_memstat_assertionpriority,
			    p->p_memstat_requestedpriority);
		}
	} else {
		p->p_memstat_requestedpriority = priority;
	}

	memstat_update_priority_locked(p, priority, options);

	proc_list_unlock();
	ret = 0;

out:
	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_UPDATE) | DBG_FUNC_END, ret);

	return ret;
}
3180
3181 #if DEVELOPMENT || DEBUG
3182 static int32_t
memstat_increased_limit(int32_t limit,int32_t increase)3183 memstat_increased_limit(int32_t limit, int32_t increase)
3184 {
3185 int32_t offset_limit;
3186 if (limit <= 0) {
3187 return 0;
3188 }
3189 if (os_add_overflow(limit, increase, &offset_limit)) {
3190 return INT32_MAX;
3191 }
3192 return offset_limit;
3193 }
3194 #endif /* DEVELOPMENT || DEBUG */
3195
/*
 * Set the active and inactive memory limits (in MB) for a process and, when
 * highwater limits are enabled, push the currently-applicable limit into the
 * task ledger.
 *
 * active_limit / inactive_limit: requested limits in MB; a value <= 0
 *   requests the default task limit for that state.
 * options: MEMLIMIT_{ACTIVE,INACTIVE}_FATAL mark the corresponding limit
 *   as fatal (exceeding it kills the process).
 *
 * Returns 0 on success, or the error from the ledger write.
 * The proc_list lock must be held by the caller.
 */
static int
memstat_set_memlimits_locked(proc_t p, int32_t active_limit,
    int32_t inactive_limit, memlimit_options_t options)
{
	/*
	 * Posix_spawn'd processes and managed processes come through this path to
	 * instantiate ledger limits. Forked processes do not come through this
	 * path and will always receive the default task limit.
	 */

	int err = 0;
	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);

	int32_t default_active_limit = memorystatus_get_default_task_active_limit(p);
	int32_t default_inactive_limit = memorystatus_get_default_task_inactive_limit(p);

	/*
	 * The special value of -1 specifies that this proc wants the default
	 * memory limit
	 */
	if (active_limit <= 0) {
		active_limit = default_active_limit;
	}
	if (inactive_limit <= 0) {
		inactive_limit = default_inactive_limit;
	}

#if DEVELOPMENT || DEBUG
	if (p->p_memlimit_increase) {
		/* Apply memlimit increase (for testing with overlay roots) */
		int32_t memlimit_increase = roundToNearestMB(p->p_memlimit_increase);
		active_limit = memstat_increased_limit(active_limit, memlimit_increase);
		inactive_limit = memstat_increased_limit(inactive_limit, memlimit_increase);
	}
#endif /* DEVELOPMENT || DEBUG */

	/*
	 * Work around a bug in JetsamProperties whereby processes may mistakenly receive
	 * ActiveSoftMemoryLimit := -1 by forcing the default task limit to be fatal.
	 * (Only applies when the limit actually resolved to the default.)
	 */
	if (default_active_limit && active_limit == default_active_limit) {
		options |= MEMLIMIT_ACTIVE_FATAL;
	}

	if (default_inactive_limit && inactive_limit == default_inactive_limit) {
		options |= MEMLIMIT_INACTIVE_FATAL;
	}

	memorystatus_log_debug(
		"memorystatus: setting memlimit for %s [%d], "
		"Active(%dMB %s), Inactive(%dMB, %s)\n",
		proc_best_name(p), proc_getpid(p),
		active_limit, ((options & MEMLIMIT_ACTIVE_FATAL) ? "F" : "NF"),
		inactive_limit, ((options & MEMLIMIT_INACTIVE_FATAL) ? "F" : "NF"));

	/* Cache the resolved limits and fatality bits on the proc */
	p->p_memstat_memlimit_active = active_limit;
	p->p_memstat_memlimit_inactive = inactive_limit;
	if (options & MEMLIMIT_INACTIVE_FATAL) {
		p->p_memstat_state |= P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL;
	} else {
		p->p_memstat_state &= ~P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL;
	}
	if (options & MEMLIMIT_ACTIVE_FATAL) {
		p->p_memstat_state |= P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL;
	} else {
		p->p_memstat_state &= ~P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL;
	}

	/*
	 * Initialize the cached limits for target process.
	 * When the target process is dirty tracked, it's typically
	 * in a clean state. Non dirty tracked processes are
	 * typically active (Foreground or above).
	 * But just in case, we don't make assumptions...
	 */
	const bool use_active = memstat_proc_is_active_locked(p);
	if (memorystatus_highwater_enabled &&
	    memstat_update_memlimit_locked(p, use_active)) {
		/* The effective limit changed; write it through to the ledger */
		err = _memstat_write_memlimit_to_ledger_locked(p, use_active, false);
	}

	return err;
}
3279
3280 int
memorystatus_set_memlimits(proc_t p,int32_t active_limit,int32_t inactive_limit,memlimit_options_t options)3281 memorystatus_set_memlimits(proc_t p, int32_t active_limit,
3282 int32_t inactive_limit, memlimit_options_t options)
3283 {
3284 int err;
3285 proc_list_lock();
3286 err = memstat_set_memlimits_locked(p, active_limit, inactive_limit,
3287 options);
3288 proc_list_unlock();
3289 return err;
3290 }
3291
3292 int
memorystatus_remove(proc_t p)3293 memorystatus_remove(proc_t p)
3294 {
3295 int ret;
3296 memstat_bucket_t *bucket;
3297 bool reschedule = false;
3298
3299 memorystatus_log_debug("memorystatus_list_remove: removing pid %d\n", proc_getpid(p));
3300
3301 /* Processes marked internal do not have priority tracked */
3302 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
3303 return 0;
3304 }
3305
3306 /*
3307 * Check if this proc is locked (because we're performing a freeze).
3308 * If so, we fail and instruct the caller to try again later.
3309 */
3310 if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
3311 return EAGAIN;
3312 }
3313
3314 assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));
3315
3316 bucket = &memstat_bucket[p->p_memstat_effectivepriority];
3317
3318 if ((system_procs_aging_band &&
3319 p->p_memstat_effectivepriority == system_procs_aging_band) ||
3320 (system_procs_aging_band_stuck &&
3321 p->p_memstat_effectivepriority == system_procs_aging_band_stuck) ||
3322 (applications_aging_band &&
3323 p->p_memstat_effectivepriority == applications_aging_band)) {
3324 _memstat_invalidate_idle_demotion_locked(p);
3325 reschedule = true;
3326 }
3327
3328 /*
3329 * Record idle delta
3330 */
3331
3332 if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
3333 uint64_t now = mach_absolute_time();
3334 if (now > p->p_memstat_prio_start) {
3335 p->p_memstat_idle_delta = now - p->p_memstat_prio_start;
3336 }
3337 }
3338
3339 TAILQ_REMOVE(&bucket->list, p, p_memstat_list);
3340 bucket->count--;
3341 if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
3342 bucket->relaunch_high_count--;
3343 }
3344
3345 memorystatus_list_count--;
3346
3347 /* If awaiting demotion to the idle band, clean up */
3348 if (reschedule) {
3349 _memstat_reschedule_idle_demotion_locked();
3350 }
3351
3352 #if CONFIG_FREEZE
3353 if (_memstat_proc_is_frozen(p)) {
3354 if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) {
3355 p->p_memstat_state &= ~P_MEMSTAT_REFREEZE_ELIGIBLE;
3356 assert(memorystatus_refreeze_eligible_count > 0);
3357 memorystatus_refreeze_eligible_count--;
3358 }
3359
3360 assert(memorystatus_frozen_count > 0);
3361 memorystatus_frozen_count--;
3362 if (p->p_memstat_state & P_MEMSTAT_FROZEN_XPC_SERVICE) {
3363 assert(memorystatus_frozen_count_xpc_service > 0);
3364 memorystatus_frozen_count_xpc_service--;
3365 }
3366 if (strcmp(p->p_name, "com.apple.WebKit.WebContent") == 0) {
3367 assert(memorystatus_frozen_count_webcontent > 0);
3368 memorystatus_frozen_count_webcontent--;
3369 }
3370 memorystatus_frozen_shared_mb -= p->p_memstat_freeze_sharedanon_pages;
3371 p->p_memstat_freeze_sharedanon_pages = 0;
3372 }
3373 #endif /* CONFIG_FREEZE */
3374
3375 _memstat_proc_set_resumed(p);
3376
3377 #if DEVELOPMENT || DEBUG
3378 if (proc_getpid(p) == memorystatus_testing_pid) {
3379 memorystatus_testing_pid = 0;
3380 }
3381 #endif /* DEVELOPMENT || DEBUG */
3382
3383 if (p) {
3384 ret = 0;
3385 } else {
3386 ret = ESRCH;
3387 }
3388
3389 return ret;
3390 }
3391
3392 /*
3393 * Validate dirty tracking flags with process state.
3394 *
3395 * Return:
3396 * 0 on success
3397 * non-0 on failure
3398 *
3399 * The proc_list_lock is held by the caller.
3400 */
3401
3402 static int
memorystatus_validate_track_flags(struct proc * target_p,uint32_t pcontrol)3403 memorystatus_validate_track_flags(struct proc *target_p, uint32_t pcontrol)
3404 {
3405 /* See that the process isn't marked for termination */
3406 if (_memstat_proc_is_terminating(target_p)) {
3407 return EBUSY;
3408 }
3409
3410 /* Idle exit requires that process be tracked */
3411 if ((pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) &&
3412 !(pcontrol & PROC_DIRTY_TRACK)) {
3413 return EINVAL;
3414 }
3415
3416 /* 'Launch in progress' tracking requires that process have enabled dirty tracking too. */
3417 if ((pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) &&
3418 !(pcontrol & PROC_DIRTY_TRACK)) {
3419 return EINVAL;
3420 }
3421
3422 /* Only one type of DEFER behavior is allowed.*/
3423 if ((pcontrol & PROC_DIRTY_DEFER) &&
3424 (pcontrol & PROC_DIRTY_DEFER_ALWAYS)) {
3425 return EINVAL;
3426 }
3427
3428 /* Deferral is only relevant if idle exit is specified */
3429 if (((pcontrol & PROC_DIRTY_DEFER) ||
3430 (pcontrol & PROC_DIRTY_DEFER_ALWAYS)) &&
3431 !(pcontrol & PROC_DIRTY_ALLOWS_IDLE_EXIT)) {
3432 return EINVAL;
3433 }
3434
3435 return 0;
3436 }
3437
3438 /*
3439 * Processes can opt to have their state tracked by the kernel, indicating when they are busy (dirty) or idle
3440 * (clean). They may also indicate that they support termination when idle, with the result that they are promoted
3441 * to their desired, higher, jetsam priority when dirty (and are therefore killed later), and demoted to the low
 * priority idle band when clean (and killed earlier, protecting higher priority processes).
3443 *
3444 * If the deferral flag is set, then newly tracked processes will be protected for an initial period (as determined by
3445 * memorystatus_sysprocs_idle_delay_time); if they go clean during this time, then they will be moved to a deferred-idle band
3446 * with a slightly higher priority, guarding against immediate termination under memory pressure and being unable to
3447 * make forward progress. Finally, when the guard expires, they will be moved to the standard, lowest-priority, idle
3448 * band. The deferral can be cleared early by clearing the appropriate flag.
3449 *
3450 * The deferral timer is active only for the duration that the process is marked as guarded and clean; if the process
3451 * is marked dirty, the timer will be cancelled. Upon being subsequently marked clean, the deferment will either be
3452 * re-enabled or the guard state cleared, depending on whether the guard deadline has passed.
3453 */
3454
/*
 * Enroll a process in dirty ("activity") tracking; see the discussion above.
 * Flags in pcontrol are cumulative and are validated by
 * memorystatus_validate_track_flags().  On success the process's jetsam
 * priority is re-evaluated, and a tracked-clean process that requested
 * PROC_DIRTY_SHUTDOWN_ON_CLEAN is killed immediately.
 *
 * Returns 0 on success (including the silent managed-process denial below),
 * EBUSY if the process is exiting, EPERM for internal processes, or a
 * validation error.
 */
int
memorystatus_dirty_track(proc_t p, uint32_t pcontrol)
{
	unsigned int old_dirty;
	boolean_t defer_now = FALSE;
	int ret = 0;
	int priority;
	bool kill = false;
	memstat_priority_options_t priority_options =
	    MEMSTAT_PRIORITY_OPTIONS_NONE;

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_DIRTY_TRACK),
	    proc_getpid(p), p->p_memstat_dirty, pcontrol);

	proc_list_lock();

	if (proc_list_exited(p)) {
		/*
		 * Process is on its way out.
		 */
		ret = EBUSY;
		goto exit;
	}

	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
		ret = EPERM;
		goto exit;
	}

	if ((ret = memorystatus_validate_track_flags(p, pcontrol)) != 0) {
		/* error */
		goto exit;
	}

	old_dirty = p->p_memstat_dirty;

	/* These bits are cumulative, as per <rdar://problem/11159924> */
	if (pcontrol & PROC_DIRTY_TRACK) {
		/* Request to turn ON Dirty tracking... */
		if (p->p_memstat_state & P_MEMSTAT_MANAGED) {
			/* on a process managed by RunningBoard or its equivalent...*/
			if (!_memstat_proc_cached_memlimit_is_fatal(p)) {
				/* but this might be an app because there's no fatal limits
				 * NB: This _big_ assumption is not universal. What we really
				 * need is a way to say this is an _APP_ and we can't have dirty
				 * tracking turned ON for it. Lacking that functionality we clump
				 * together some checks and try to do the best detection we can.
				 * Reason we can't allow addition of these flags is because, per the
				 * kernel checks, they change the role of a process from app to daemon. And the
				 * AGING_IN_PROGRESS bits might still be set i.e. it needs to be demoted
				 * correctly from the right aging band (app or sysproc). We can't simply try
				 * to invalidate the demotion here because, owing to assertion priorities, we
				 * might not be in the aging bands.
				 */
				memorystatus_log(
					"memorystatus: Denying dirty-tracking opt-in for managed %s [%d]\n",
					proc_best_name(p), proc_getpid(p));
				/* fail silently to avoid an XPC assertion... */
				ret = 0;
				goto exit;
			}
		}

		p->p_memstat_dirty |= P_DIRTY_TRACK;
	}

	if (pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) {
		p->p_memstat_dirty |= P_DIRTY_ALLOW_IDLE_EXIT;
	}

	if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
		p->p_memstat_dirty |= P_DIRTY_LAUNCH_IN_PROGRESS;
	}

	/*
	 * NB: All processes are now automatically enrolled in idle aging
	 * regardless of whether they request to be deferred.
	 */
	if (pcontrol & (PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) {
		/* Each defer flag latches on first request only */
		if ((pcontrol & (PROC_DIRTY_DEFER)) &&
		    !(old_dirty & P_DIRTY_DEFER)) {
			p->p_memstat_dirty |= P_DIRTY_DEFER;
		}

		if ((pcontrol & (PROC_DIRTY_DEFER_ALWAYS)) &&
		    !(old_dirty & P_DIRTY_DEFER_ALWAYS)) {
			p->p_memstat_dirty |= P_DIRTY_DEFER_ALWAYS;
		}

		defer_now = TRUE;
	}

	if (pcontrol & PROC_DIRTY_SHUTDOWN_ON_CLEAN) {
		p->p_memstat_dirty |= P_DIRTY_SHUTDOWN_ON_CLEAN;

		/* Already tracked and clean: the kill happens on lock drop below */
		if (_memstat_proc_is_tracked(p) && !_memstat_proc_is_dirty(p)) {
			kill = true;
		}
	}

	memorystatus_log_info(
		"%s [%d] enrolled in ActivityTracking tracked %d / idle-exit %d / defer %d / dirty %d",
		proc_best_name(p), proc_getpid(p),
		_memstat_proc_is_tracked(p), _memstat_proc_can_idle_exit(p), defer_now,
		_memstat_proc_is_dirty(p));

	/* A clean, tracked, idle-exit-capable process belongs in the idle band */
	if (!_memstat_proc_is_dirty(p) && _memstat_proc_is_tracked(p) &&
	    _memstat_proc_can_idle_exit(p)) {
		priority = JETSAM_PRIORITY_IDLE;
		if (!defer_now && _memstat_proc_is_aging(p)) {
			/*
			 * Historically, some processes have tried to use this to opt out
			 * of the 'aging' facility.
			 */
			priority_options |= MEMSTAT_PRIORITY_NO_AGING;
		}
	} else {
		priority = p->p_memstat_requestedpriority;
	}

	/* An active priority assertion overrides a lower computed priority */
	if (_memstat_proc_has_priority_assertion(p)) {
		priority = MAX(priority, p->p_memstat_assertionpriority);
	}

	memstat_update_priority_locked(p, priority, priority_options);

exit:
	/* Take a proc ref before dropping the lock so the kill target can't vanish */
	if (kill && proc_ref(p, true) == p) {
		proc_list_unlock();
		psignal(p, SIGKILL);
		proc_rele(p);
	} else {
		proc_list_unlock();
	}

	return ret;
}
3592
3593 int
memorystatus_dirty_set(proc_t p,boolean_t self,uint32_t pcontrol)3594 memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol)
3595 {
3596 int ret = 0;
3597 bool kill = false;
3598 bool was_dirty;
3599 bool now_dirty = false;
3600 int priority;
3601 task_t t = proc_task(p);
3602
3603 memorystatus_log_debug("memorystatus_dirty_set(): %d %d 0x%x 0x%x\n", self, proc_getpid(p), pcontrol, p->p_memstat_dirty);
3604 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_DIRTY_SET), proc_getpid(p), self, pcontrol);
3605
3606 proc_list_lock();
3607
3608 if (proc_list_exited(p)) {
3609 /*
3610 * Process is on its way out.
3611 */
3612 ret = EBUSY;
3613 goto exit;
3614 }
3615
3616 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
3617 ret = EPERM;
3618 goto exit;
3619 }
3620
3621 was_dirty = _memstat_proc_is_dirty(p);
3622
3623 if (!_memstat_proc_is_tracked(p)) {
3624 /* Dirty tracking not enabled */
3625 ret = EINVAL;
3626 goto exit;
3627 } else if (pcontrol && _memstat_proc_is_terminating(p)) {
3628 /*
3629 * Process is set to be terminated and we're attempting to mark it dirty.
3630 * Set for termination and marking as clean is OK - see <rdar://problem/10594349>.
3631 */
3632 ret = EBUSY;
3633 goto exit;
3634 }
3635
3636 int flag = (self == TRUE) ? P_DIRTY : P_DIRTY_SHUTDOWN;
3637 if (pcontrol && !(p->p_memstat_dirty & flag)) {
3638 /* Mark the process as having been dirtied at some point */
3639 p->p_memstat_dirty |= (flag | P_DIRTY_MARKED);
3640 } else if ((pcontrol == 0) && (p->p_memstat_dirty & flag)) {
3641 if ((flag == P_DIRTY_SHUTDOWN) && (!(p->p_memstat_dirty & P_DIRTY))) {
3642 /* Clearing the dirty shutdown flag, and the process is otherwise clean - kill */
3643 p->p_memstat_dirty |= P_DIRTY_TERMINATED;
3644 kill = true;
3645 } else if ((flag == P_DIRTY) && _memstat_proc_is_terminating(p)) {
3646 /* Kill previously terminated processes if set clean */
3647 kill = true;
3648 }
3649 p->p_memstat_dirty &= ~flag;
3650 } else {
3651 /* Already set */
3652 ret = EALREADY;
3653 goto exit;
3654 }
3655
3656 now_dirty = _memstat_proc_is_dirty(p);
3657
3658 if (was_dirty && !now_dirty) {
3659 if (_memstat_proc_can_idle_exit(p)) {
3660 /*
3661 * Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band OR it might be heading back
3662 * there once it's clean again. For the legacy case, this only applies if it has some protection window left.
3663 * P_DIRTY_DEFER: one-time protection window given at launch
3664 * P_DIRTY_DEFER_ALWAYS: protection window given for every dirty->clean transition. Like non-legacy mode.
3665 *
3666 * Non-Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band. It will always stop over
3667 * in that band on it's way to IDLE.
3668 */
3669 assert(!_memstat_proc_is_aging(p));
3670 priority = JETSAM_PRIORITY_IDLE;
3671 } else {
3672 priority = p->p_memstat_requestedpriority;
3673 }
3674 task_ledger_settle_dirty_time(t);
3675 task_set_dirty_start(t, 0);
3676 if (_memstat_proc_shutdown_on_clean(p)) {
3677 kill = true;
3678 }
3679 } else if (!was_dirty && now_dirty) {
3680 priority = p->p_memstat_requestedpriority;
3681 task_set_dirty_start(t, mach_absolute_time());
3682 }
3683
3684 if (_memstat_proc_has_priority_assertion(p)) {
3685 priority = MAX(priority, p->p_memstat_assertionpriority);
3686 }
3687
3688 memstat_update_priority_locked(p, priority, MEMSTAT_PRIORITY_OPTIONS_NONE);
3689
3690 exit:
3691 if (kill && proc_ref(p, true) == p) {
3692 proc_list_unlock();
3693 psignal(p, SIGKILL);
3694 proc_rele(p);
3695 } else {
3696 proc_list_unlock();
3697 }
3698
3699 return ret;
3700 }
3701
3702 int
memorystatus_dirty_clear(proc_t p,uint32_t pcontrol)3703 memorystatus_dirty_clear(proc_t p, uint32_t pcontrol)
3704 {
3705 int ret = 0;
3706
3707 memorystatus_log_debug("memorystatus_dirty_clear(): %d 0x%x 0x%x\n", proc_getpid(p), pcontrol, p->p_memstat_dirty);
3708 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_DIRTY_CLEAR), proc_getpid(p), pcontrol);
3709
3710 proc_list_lock();
3711
3712 if (proc_list_exited(p)) {
3713 /*
3714 * Process is on its way out.
3715 */
3716 ret = EBUSY;
3717 goto exit;
3718 }
3719
3720 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
3721 ret = EPERM;
3722 goto exit;
3723 }
3724
3725 if (!_memstat_proc_is_tracked(p)) {
3726 /* Dirty tracking not enabled */
3727 ret = EINVAL;
3728 goto exit;
3729 }
3730
3731 if (!pcontrol || (pcontrol & (PROC_DIRTY_LAUNCH_IN_PROGRESS | PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) == 0) {
3732 ret = EINVAL;
3733 goto exit;
3734 }
3735
3736 if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
3737 p->p_memstat_dirty &= ~P_DIRTY_LAUNCH_IN_PROGRESS;
3738 }
3739
3740 /* This can be set and cleared exactly once. */
3741 if (pcontrol & (PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) {
3742 if (p->p_memstat_dirty & P_DIRTY_DEFER) {
3743 p->p_memstat_dirty &= ~(P_DIRTY_DEFER);
3744 }
3745
3746 if (p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) {
3747 p->p_memstat_dirty &= ~(P_DIRTY_DEFER_ALWAYS);
3748 }
3749
3750 if (_memstat_proc_is_aging(p)) {
3751 memstat_update_priority_locked(p, JETSAM_PRIORITY_IDLE,
3752 MEMSTAT_PRIORITY_NO_AGING);
3753 }
3754 }
3755
3756 ret = 0;
3757 exit:
3758 proc_list_unlock();
3759
3760 return ret;
3761 }
3762
3763 int
memorystatus_dirty_get(proc_t p,boolean_t locked)3764 memorystatus_dirty_get(proc_t p, boolean_t locked)
3765 {
3766 int ret = 0;
3767
3768 if (!locked) {
3769 proc_list_lock();
3770 }
3771
3772 if (_memstat_proc_is_tracked(p)) {
3773 ret |= PROC_DIRTY_TRACKED;
3774 if (_memstat_proc_can_idle_exit(p)) {
3775 ret |= PROC_DIRTY_ALLOWS_IDLE_EXIT;
3776 }
3777 if (p->p_memstat_dirty & P_DIRTY) {
3778 ret |= PROC_DIRTY_IS_DIRTY;
3779 }
3780 if (p->p_memstat_dirty & P_DIRTY_LAUNCH_IN_PROGRESS) {
3781 ret |= PROC_DIRTY_LAUNCH_IS_IN_PROGRESS;
3782 }
3783 }
3784
3785 if (!locked) {
3786 proc_list_unlock();
3787 }
3788
3789 return ret;
3790 }
3791
3792 int
memorystatus_on_terminate(proc_t p)3793 memorystatus_on_terminate(proc_t p)
3794 {
3795 int sig;
3796
3797 proc_list_lock();
3798
3799 p->p_memstat_dirty |= P_DIRTY_TERMINATED;
3800
3801 if ((_memstat_proc_is_tracked(p) && !_memstat_proc_is_dirty(p)) ||
3802 (_memstat_proc_is_suspended(p))) {
3803 /*
3804 * Mark as terminated and issue SIGKILL if:-
3805 * - process is clean, or,
3806 * - if process is dirty but suspended. This case is likely
3807 * an extension because apps don't opt into dirty-tracking
3808 * and daemons aren't suspended.
3809 */
3810 #if DEVELOPMENT || DEBUG
3811 if (_memstat_proc_is_suspended(p)) {
3812 memorystatus_log(
3813 "memorystatus: sending suspended process %s (pid %d) SIGKILL\n",
3814 (*p->p_name ? p->p_name : "unknown"), proc_getpid(p));
3815 }
3816 #endif /* DEVELOPMENT || DEBUG */
3817 sig = SIGKILL;
3818 } else {
3819 /* Dirty, terminated, or state tracking is unsupported; issue SIGTERM to allow cleanup */
3820 sig = SIGTERM;
3821 }
3822
3823 proc_list_unlock();
3824
3825 return sig;
3826 }
3827
3828 void
memorystatus_on_suspend(proc_t p)3829 memorystatus_on_suspend(proc_t p)
3830 {
3831 #if CONFIG_FREEZE
3832 uint32_t pages;
3833 memorystatus_get_task_page_counts(proc_task(p), &pages, NULL, NULL);
3834 #endif
3835 proc_list_lock();
3836
3837 _memstat_proc_set_suspended(p);
3838
3839 /* Check if proc is marked for termination */
3840 bool kill_process = _memstat_proc_is_terminating(p);
3841 proc_list_unlock();
3842
3843 if (kill_process) {
3844 psignal(p, SIGKILL);
3845 }
3846 }
3847
3848 extern uint64_t memorystatus_thaw_count_since_boot;
3849
/*
 * Called when a suspended process is resumed.  On freezer configurations a
 * frozen process being resumed counts as a "thaw": it becomes eligible for
 * re-freezing, per-interval thaw statistics are updated, and a freeze
 * notification is posted after the proc_list lock is dropped.  The
 * P_MEMSTAT_FROZEN bit is deliberately left set (state may remain on disk).
 */
void
memorystatus_on_resume(proc_t p)
{
#if CONFIG_FREEZE
	pid_t pid;
#endif

	proc_list_lock();

#if CONFIG_FREEZE
	const bool frozen = _memstat_proc_is_frozen(p);
	if (frozen) {
		/*
		 * Now that we don't _thaw_ a process completely,
		 * resuming it (and having some on-demand swapins)
		 * shouldn't preclude it from being counted as frozen.
		 *
		 * memorystatus_frozen_count--;
		 *
		 * We preserve the P_MEMSTAT_FROZEN state since the process
		 * could have state on disk AND so will deserve some protection
		 * in the jetsam bands.
		 */
		if ((p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) == 0) {
			p->p_memstat_state |= P_MEMSTAT_REFREEZE_ELIGIBLE;
			memorystatus_refreeze_eligible_count++;
		}
		/* Count each process toward the thaw stats at most once per freezer interval */
		if (p->p_memstat_thaw_count == 0 || p->p_memstat_last_thaw_interval < memorystatus_freeze_current_interval) {
			os_atomic_inc(&(memorystatus_freezer_stats.mfs_processes_thawed), relaxed);
			if (strcmp(p->p_name, "com.apple.WebKit.WebContent") == 0) {
				os_atomic_inc(&(memorystatus_freezer_stats.mfs_processes_thawed_webcontent), relaxed);
			}
		}
		p->p_memstat_last_thaw_interval = memorystatus_freeze_current_interval;
		p->p_memstat_thaw_count++;

		memorystatus_log("memorystatus: resuming/thawing pid %d [%s]\n", p->p_pid, proc_best_name(p));
		memorystatus_freeze_record_process_thawed(p);

		memorystatus_thaw_count++;
		memorystatus_thaw_count_since_boot++;
	}

	/* Capture the pid while the lock is held; used for the note below */
	pid = proc_getpid(p);
#endif

	/*
	 * P_MEMSTAT_FROZEN will remain unchanged. This used to be:
	 * p->p_memstat_state &= ~(P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN);
	 */
	_memstat_proc_set_resumed(p);

	proc_list_unlock();

#if CONFIG_FREEZE
	/* Notify interested parties of the thaw, outside the lock */
	if (frozen) {
		memorystatus_freeze_entry_t data = { pid, FALSE, 0 };
		memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
	}
#endif
}
3911
/*
 * Called when a process becomes inactive.  On freezer configurations this
 * nudges the freezer thread to reconsider candidates; otherwise a no-op.
 */
void
memorystatus_on_inactivity(proc_t p)
{
#pragma unused(p)
#if CONFIG_FREEZE
	/* Wake the freeze thread */
	thread_wakeup((event_t)&memorystatus_freeze_wakeup);
#endif
}
3921
3922 /*
3923 * The proc_list_lock is held by the caller.
3924 */
3925 static memorystatus_proc_state_t
_memstat_build_state(proc_t p)3926 _memstat_build_state(proc_t p)
3927 {
3928 uint32_t snapshot_state = 0;
3929
3930 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
3931
3932 /* General */
3933 if (_memstat_proc_is_suspended(p)) {
3934 snapshot_state |= kMemorystatusSuspended;
3935 }
3936 if (_memstat_proc_is_frozen(p)) {
3937 snapshot_state |= kMemorystatusFrozen;
3938 }
3939 if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) {
3940 snapshot_state |= kMemorystatusWasThawed;
3941 }
3942 if (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION) {
3943 snapshot_state |= kMemorystatusAssertion;
3944 }
3945
3946 /* Tracking */
3947 if (_memstat_proc_is_tracked(p)) {
3948 snapshot_state |= kMemorystatusTracked;
3949 }
3950 if (_memstat_proc_can_idle_exit(p)) {
3951 snapshot_state |= kMemorystatusSupportsIdleExit;
3952 }
3953 if (_memstat_proc_is_dirty(p)) {
3954 snapshot_state |= kMemorystatusDirty;
3955 }
3956 if (memstat_proc_is_active_locked(p)) {
3957 snapshot_state |= kMemorystatusActive;
3958 }
3959
3960 /* Probable relaunch behavior */
3961 if (p->p_memstat_relaunch_flags & P_MEMSTAT_RELAUNCH_LOW) {
3962 snapshot_state |= kMemorystatusRelaunchLow;
3963 }
3964 if (p->p_memstat_relaunch_flags & P_MEMSTAT_RELAUNCH_MED) {
3965 snapshot_state |= kMemorystatusRelaunchMed;
3966 }
3967 if (p->p_memstat_relaunch_flags & P_MEMSTAT_RELAUNCH_HIGH) {
3968 snapshot_state |= kMemorystatusRelaunchHigh;
3969 }
3970
3971 return snapshot_state;
3972 }
3973
/*
 * Scan the idle band for the first process whose idle-exit deadline has
 * passed and kill it with the given jetsam cause.  On a kill, the victim's
 * footprint is reported through footprint_out (if non-NULL) by
 * memorystatus_kill_proc().
 *
 * Returns true if a process was killed.
 */
bool
memstat_kill_idle_process(memorystatus_kill_cause_t cause,
    uint64_t *footprint_out)
{
	proc_t p = PROC_NULL;
	uint64_t current_time;
	bool killed = FALSE;
	unsigned int i = 0;
	os_reason_t jetsam_reason = OS_REASON_NULL;

	/* Pick next idle exit victim. */
	current_time = mach_absolute_time();

	jetsam_reason = os_reason_create(OS_REASON_JETSAM, (jetsam_reason_t)cause);
	if (jetsam_reason == OS_REASON_NULL) {
		memorystatus_log_error("memorystatus: failed to allocate jetsam reason\n");
	}

	proc_list_lock();

	p = memorystatus_get_first_proc_locked(&i, FALSE);
	while (p) {
		/* No need to look beyond the idle band */
		if (p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) {
			break;
		}

		/* Candidate: idle-exit capable, clean, and not already terminated */
		if ((p->p_memstat_dirty & (P_DIRTY_ALLOW_IDLE_EXIT | P_DIRTY_IS_DIRTY | P_DIRTY_TERMINATED)) == (P_DIRTY_ALLOW_IDLE_EXIT)) {
			if (current_time >= p->p_memstat_idledeadline) {
				/* Mark terminated and take a ref before dropping the lock */
				p->p_memstat_dirty |= P_DIRTY_TERMINATED;
				p = proc_ref(p, true);
				break;
			}
		}

		p = memorystatus_get_next_proc_locked(&i, p, FALSE);
	}

	proc_list_unlock();

	if (p) {
		memorystatus_log(
			"memorystatus: killing (idle) %s [%d] due to %s (%u)\n",
			proc_best_name(p), proc_getpid(p), memstat_kill_cause_name[cause], cause);
		memorystatus_kill_proc(p, cause, jetsam_reason, &killed, footprint_out);
		proc_rele(p);
	} else {
		/* No victim found; release the unused jetsam reason */
		os_reason_free(jetsam_reason);
	}

	return killed;
}
4026
4027 /*
4028 * Consider waking the jetsam thread. Returns true if the thread was awoken.
4029 */
/*
 * Consider waking the jetsam thread. Returns true if the thread was awoken.
 */
static bool
_memstat_consider_waking_jetsam_thread(void)
{
	bool woke = false;
#if CONFIG_JETSAM
	/* Only wake when a page shortage is actually in effect */
	if (memstat_evaluate_page_shortage(NULL, NULL, NULL, NULL)) {
		memorystatus_thread_wake();
		woke = true;
	}
#endif /* CONFIG_JETSAM */
	return woke;
}
4041
4042 void
memorystatus_thread_wake()4043 memorystatus_thread_wake()
4044 {
4045 int thr_id = 0;
4046 int active_thr = atomic_load(&active_jetsam_threads);
4047
4048 /* Wakeup all the jetsam threads */
4049 for (thr_id = 0; thr_id < active_thr; thr_id++) {
4050 jetsam_state_t jetsam_thread = &jetsam_threads[thr_id];
4051 sched_cond_signal(&(jetsam_thread->jt_wakeup_cond), jetsam_thread->thread);
4052 }
4053 }
4054
/*
 * Respond to compressor-space exhaustion.  With jetsam configured, simply
 * wake the jetsam thread.  Without jetsam, wake it immediately only when
 * kill-on-no-paging-space is enabled; otherwise throttle wakeups to at most
 * one per no_paging_space_action_throttle_delay_ns.
 */
void
memorystatus_respond_to_compressor_exhaustion(void)
{
#if CONFIG_JETSAM
	memorystatus_thread_wake();
#else /* !CONFIG_JETSAM */
	if (kill_on_no_paging_space) {
		memorystatus_thread_wake();
	} else {
		/*
		 * Throttle how often the jetsam thread is woken due to
		 * compressor/swap exhaustion
		 */
		uint64_t now = mach_absolute_time();
		uint64_t delta_since_last_no_space_ns;
		uint64_t last_action_ts = os_atomic_load(&last_no_space_action_ts, relaxed);
		if (now < last_action_ts) {
			/* Raced with a concurrent no-paging-space action */
			return;
		}
		absolutetime_to_nanoseconds(now - last_action_ts, &delta_since_last_no_space_ns);
		if (delta_since_last_no_space_ns > no_paging_space_action_throttle_delay_ns) {
			memorystatus_thread_wake();
		}
	}
#endif /* CONFIG_JETSAM */
}
4082
/*
 * Respond to swap-space exhaustion.  With jetsam, wake the jetsam thread
 * (when configured to kill on low swap); otherwise fall back to the
 * throttled compressor-exhaustion path.
 */
void
memorystatus_respond_to_swap_exhaustion(void)
{
#if CONFIG_JETSAM
	/*
	 * On systems with both swap and jetsam,
	 * just wake up the jetsam thread and have it handle the low swap condition
	 * by killing apps.
	 */
	if (jetsam_kill_on_low_swap) {
		memorystatus_thread_wake();
	}
#else /* !CONFIG_JETSAM */
	memorystatus_respond_to_compressor_exhaustion();
#endif /* CONFIG_JETSAM */
}
4099
4100 #if CONFIG_JETSAM
4101 static void
memorystatus_thread_pool_max()4102 memorystatus_thread_pool_max()
4103 {
4104 /* Increase the jetsam thread pool to max_jetsam_threads */
4105 int max_threads = max_jetsam_threads;
4106 memorystatus_log_info("Expanding memorystatus pool to %d\n", max_threads);
4107 os_atomic_store(&active_jetsam_threads, max_threads, relaxed);
4108 }
4109
4110 static void
memorystatus_thread_pool_default()4111 memorystatus_thread_pool_default()
4112 {
4113 /* Restore the jetsam thread pool to a single thread */
4114 memorystatus_log_info("Reverting memorystatus pool back to 1\n");
4115 os_atomic_store(&active_jetsam_threads, 1, relaxed);
4116 }
4117 #endif /* CONFIG_JETSAM */
4118
4119 /*
4120 * An offset applied to non-critical page shortage thresholds.
4121 */
4122 static uint32_t
_memstat_page_shortage_offset(void)4123 _memstat_page_shortage_offset(void)
4124 {
4125 uint32_t offset = 0;
4126 if (os_atomic_load(&memstat_policy_config, acquire) & kPolicyClearTheDecks) {
4127 offset += memstat_ctd_offset;
4128 }
4129 if (os_atomic_load(&memstat_policy_config, acquire) & kPolicyBallastDrain) {
4130 offset += memstat_ballast_offset;
4131 }
4132 return offset;
4133 }
4134
/*
 * Page shortage threshold at which memory pressure is considered critical.
 * Unlike the non-critical thresholds below, no policy offset is applied.
 */
uint32_t
memorystatus_get_critical_page_shortage_threshold(void)
{
	return memstat_critical_threshold;
}
4140
4141 uint32_t
memorystatus_get_idle_exit_page_shortage_threshold(void)4142 memorystatus_get_idle_exit_page_shortage_threshold(void)
4143 {
4144 uint32_t offset = _memstat_page_shortage_offset();
4145 return memstat_idle_threshold + offset;
4146 }
4147
4148 uint32_t
memorystatus_get_soft_memlimit_page_shortage_threshold(void)4149 memorystatus_get_soft_memlimit_page_shortage_threshold(void)
4150 {
4151 uint32_t offset = _memstat_page_shortage_offset();
4152 return memstat_soft_threshold + offset;
4153 }
4154
4155 uint32_t
memorystatus_get_reaper_page_shortage_threshold(void)4156 memorystatus_get_reaper_page_shortage_threshold(void)
4157 {
4158 uint32_t offset = _memstat_page_shortage_offset();
4159 return memstat_reaper_threshold + offset;
4160 }
4161
4162 #if CONFIG_JETSAM
4163 void
_memstat_reaper_check_oldest_reapable_proc_info_timeout(void)4164 _memstat_reaper_check_oldest_reapable_proc_info_timeout(void)
4165 {
4166 if (memstat_oldest_reapable_proc_prio_start != MEMSTAT_OLDEST_REAPABLE_PROC_PRIO_START_UNKNOWN) {
4167 uint64_t curr_ts_matu = mach_absolute_time();
4168 if (curr_ts_matu > memstat_oldest_reapable_proc_info_expiration_ts_matu) {
4169 memstat_oldest_reapable_proc_prio_start = MEMSTAT_OLDEST_REAPABLE_PROC_PRIO_START_UNKNOWN;
4170 memstat_oldest_reapable_proc_will_be_reapable_at_ts_matu = 0;
4171 }
4172 }
4173 }
4174
4175 void
_memstat_reaper_start_sweep(void)4176 _memstat_reaper_start_sweep(void)
4177 {
4178 if (!memstat_reaper_is_currently_sweeping) {
4179 memorystatus_log("memorystatus: _memstat_reaper_start_sweep: reaper sweep starting\n");
4180 memstat_reaper_is_currently_sweeping = true;
4181 memstat_reaper_start_ts_matu = mach_absolute_time();
4182 memstat_reaper_current_sweep_stats.kill_count = 0;
4183 memstat_reaper_current_sweep_stats.memory_freed_bytes = 0;
4184 }
4185 }
4186
4187 void
_memstat_reaper_end_sweep(void)4188 _memstat_reaper_end_sweep(void)
4189 {
4190 uint64_t rescan_delta_matu;
4191 if (memstat_reaper_is_currently_sweeping) {
4192 uint64_t delta_time_nsec;
4193
4194 /* For idle reaper kills, we skip the normal compaction after each kill,
4195 * and do one compaction here at the end of the sweep.
4196 */
4197 vm_run_compactor();
4198
4199 absolutetime_to_nanoseconds(mach_absolute_time() - memstat_reaper_start_ts_matu, &delta_time_nsec);
4200 memstat_reaper_cumulative_stats.sweep_count++;
4201 memstat_reaper_cumulative_memory_freed_mb = (uint32_t)(memstat_reaper_cumulative_stats.memory_freed_bytes >> 20);
4202 memorystatus_log("memorystatus: _memstat_reaper_end_sweep: reaper sweep ended, %d processes killed, %lluMB freed, %llums elapsed, %lluus/process\n",
4203 memstat_reaper_current_sweep_stats.kill_count,
4204 memstat_reaper_current_sweep_stats.memory_freed_bytes >> 20,
4205 (delta_time_nsec / NSEC_PER_MSEC),
4206 memstat_reaper_current_sweep_stats.kill_count ? ((delta_time_nsec / NSEC_PER_USEC) / memstat_reaper_current_sweep_stats.kill_count) : 0);
4207 memorystatus_log("memorystatus: _memstat_reaper_end_sweep: reaper totals: %d sweeps, %d processes killed, %dMB freed\n",
4208 memstat_reaper_cumulative_stats.sweep_count,
4209 memstat_reaper_cumulative_stats.kill_count,
4210 memstat_reaper_cumulative_memory_freed_mb);
4211 memstat_reaper_is_currently_sweeping = false;
4212 nanoseconds_to_absolutetime((memstat_reaper_rescan_secs * NSEC_PER_SEC), &rescan_delta_matu);
4213 memstat_reaper_can_run_after_ts_matu = mach_absolute_time() + rescan_delta_matu;
4214 }
4215 }
4216
4217 void
_memstat_reaper_record_kill(uint64_t bytes_freed)4218 _memstat_reaper_record_kill(uint64_t bytes_freed)
4219 {
4220 memstat_reaper_current_sweep_stats.kill_count++;
4221 memstat_reaper_current_sweep_stats.memory_freed_bytes += bytes_freed;
4222 memstat_reaper_cumulative_stats.kill_count++;
4223 memstat_reaper_cumulative_stats.memory_freed_bytes += bytes_freed;
4224 }
4225 #endif /* CONFIG_JETSAM */
4226
4227 const char*
_memstat_relaunch_flags_description(uint32_t flags)4228 _memstat_relaunch_flags_description(uint32_t flags)
4229 {
4230 switch (flags) {
4231 case P_MEMSTAT_RELAUNCH_UNKNOWN:
4232 return "-";
4233 case P_MEMSTAT_RELAUNCH_LOW:
4234 return "low";
4235 case P_MEMSTAT_RELAUNCH_MED:
4236 return "med";
4237 case P_MEMSTAT_RELAUNCH_HIGH:
4238 return "high";
4239 default:
4240 return "??";
4241 }
4242 }
4243
4244 const char*
_memstat_proc_type_description(proc_t p)4245 _memstat_proc_type_description(proc_t p)
4246 {
4247 if (_memstat_proc_is_application(p)) {
4248 return "app";
4249 } else {
4250 return "daemon";
4251 }
4252 }
4253
/*
 * Evaluate the current page shortage and report which memorystatus actions
 * are warranted.
 *
 * Each out-parameter is optional (may be NULL); when non-NULL it is set to
 * true if the corresponding action applies:
 *  - should_enforce_memlimits: soft-memlimit (HWM) kills are warranted
 *  - should_idle_exit:         idle-exit kills are warranted
 *  - should_jetsam:            critical-shortage priority kills are warranted
 *  - should_reap:              the idle-process reaper should run/continue
 *
 * Returns true when the jetsam thread has actionable work. Note a flag may
 * be set even when the return stays false (e.g. a soft-limit shortage with
 * no known HWM violators to kill).
 *
 * On !CONFIG_JETSAM configurations, always reports no action.
 */
bool
memstat_evaluate_page_shortage(
	bool *should_enforce_memlimits,
	bool *should_idle_exit,
	bool *should_jetsam,
	bool *should_reap)
{
	bool requires_action = false;
	/* Default every requested flag to false. */
	if (should_enforce_memlimits) {
		*should_enforce_memlimits = false;
	}
	if (should_idle_exit) {
		*should_idle_exit = false;
	}
	if (should_jetsam) {
		*should_jetsam = false;
	}
	if (should_reap) {
		*should_reap = false;
	}
#if CONFIG_JETSAM
	uint32_t available_page_count = os_atomic_load(&memorystatus_available_pages, relaxed);
#if VM_PRESSURE_EVENTS
	if (available_page_count <
	    memorystatus_get_soft_memlimit_page_shortage_threshold()) {
		/*
		 * Only wake the jetsam thread if there are hwm violators to
		 * kill
		 */
		bool hwm_candidates = os_atomic_load(&memorystatus_hwm_candidates, acquire);
		requires_action = requires_action || hwm_candidates;
		if (should_enforce_memlimits) {
			*should_enforce_memlimits = true;
		}
	}
#endif /* VM_PRESSURE_EVENTS */

	if (memstat_reaper_enabled) {
		/*
		 * Only wake the jetsam thread to do reaper kills if the reaper is currently already running a sweep
		 * OR if other conditions suggest that we should start a sweep
		 */

		// if we are already in the middle of a reaper sweep, continue it
		if (memstat_reaper_is_currently_sweeping) {
			requires_action = true;
			if (should_reap) {
				*should_reap = true;
			}
		} else {
			uint64_t curr_ts_matu = mach_absolute_time();
			// if we are not already in the middle of a reaper sweep, do very quick tests to see if we should possibly start one:
			// - the minimum rescan time has passed since the end of the last sweep
			// - we are below the page threshold
			// - the oldest reapable process is old enough to be a reaper candidate now

			if ((curr_ts_matu > memstat_reaper_can_run_after_ts_matu)
			    && (available_page_count < memorystatus_get_reaper_page_shortage_threshold())) {
				/* Drop the cached oldest-reapable info if it has expired. */
				_memstat_reaper_check_oldest_reapable_proc_info_timeout();

				if (memstat_oldest_reapable_proc_prio_start == MEMSTAT_OLDEST_REAPABLE_PROC_PRIO_START_NONE) {
					memorystatus_log_debug("memorystatus: memstat_evaluate_page_shortage: no known-reapable processes\n");
				} else {
					if (curr_ts_matu >= memstat_oldest_reapable_proc_will_be_reapable_at_ts_matu) {
						requires_action = true;
						if (should_reap) {
							*should_reap = true;
							memorystatus_log_debug("memorystatus: memstat_evaluate_page_shortage: should start reaping long-idle processes\n");
						}
						/* Mark the sweep as started so subsequent evaluations continue it. */
						_memstat_reaper_start_sweep();
					}
				}
			}
		}
	}
	if (available_page_count < memorystatus_get_idle_exit_page_shortage_threshold()) {
		/*
		 * Only wake the jetsam thread if there are idle processes that
		 * could exit.
		 */
		uint32_t idle_proc_count = os_atomic_load(
			&memstat_bucket[JETSAM_PRIORITY_IDLE].count, relaxed);
		requires_action = requires_action || (idle_proc_count > 0);
		if (should_idle_exit) {
			*should_idle_exit = true;
		}
	}
	if (available_page_count < memorystatus_get_critical_page_shortage_threshold()) {
		/* Critical shortage always demands action. */
		if (should_jetsam) {
			*should_jetsam = true;
		}
		requires_action = true;
	}
#endif /* CONFIG_JETSAM */
	return requires_action;
}
4350
4351 #if CONFIG_JETSAM
4352 static uint64_t
memorystatus_swap_trigger_pages(void)4353 memorystatus_swap_trigger_pages(void)
4354 {
4355 /*
4356 * The swapout trigger varies based on the current memorystatus_level.
4357 * When available memory is somewhat high (at memorystatus_available_pages_pressure)
4358 * we keep more swappable compressor segments in memory.
4359 * However, as available memory drops to our idle and eventually critical kill
4360 * thresholds we start swapping more aggressively.
4361 */
4362 static uint32_t available_pages_factor[] = {0, 1, 1, 1, 2, 2, 3, 5, 7, 8, 10, 13, 15, 17, 20};
4363 size_t index = MIN(memorystatus_level, sizeof(available_pages_factor) / sizeof(uint32_t) - 1);
4364 return available_pages_factor[index] * memorystatus_available_pages / 10;
4365 }
4366
4367 static int
4368 sysctl_memorystatus_swap_trigger_pages SYSCTL_HANDLER_ARGS
4369 {
4370 #pragma unused(arg1, arg2)
4371 uint64_t trigger_pages = memorystatus_swap_trigger_pages();
4372 return SYSCTL_OUT(req, &trigger_pages, sizeof(trigger_pages));
4373 }
4374
/* Expose the computed swap trigger via kern.memorystatus_swap_trigger_pages. */
/* NOTE(review): the handler emits a uint64_t but the OID is declared CTLTYPE_INT with "I" format — confirm consumers handle the 8-byte payload. */
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_swap_trigger_pages, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, &sysctl_memorystatus_swap_trigger_pages, "I", "");
4377
4378 /*
4379 * Check if the number of full swappable csegments is over the trigger
4380 * threshold to start swapping.
4381 * The adjustment_factor is applied to the trigger to raise or lower
4382 * it. For example an adjustement factor of 110 will raise the threshold by 10%.
4383 */
4384 bool
memorystatus_swap_over_trigger(uint64_t adjustment_factor)4385 memorystatus_swap_over_trigger(uint64_t adjustment_factor)
4386 {
4387 if (!memorystatus_swap_all_apps) {
4388 return false;
4389 }
4390 uint64_t trigger_pages = memorystatus_swap_trigger_pages();
4391 trigger_pages = trigger_pages * adjustment_factor / 100;
4392 return atop_64(c_late_swapout_count * c_seg_allocsize) > trigger_pages;
4393 }
4394
4395 /*
4396 * Check if the number of segments on the early swapin queue
4397 * is over the trigger to start compacting it.
4398 */
4399 bool
memorystatus_swapin_over_trigger(void)4400 memorystatus_swapin_over_trigger(void)
4401 {
4402 return atop_64(c_late_swappedin_count * c_seg_allocsize) > memorystatus_swapin_trigger_pages;
4403 }
4404 #endif /* CONFIG_JETSAM */
4405
#if DEVELOPMENT || DEBUG
/* Debug-only read-only visibility into compressor swap accounting. */
SYSCTL_UINT(_vm, OID_AUTO, c_late_swapout_count, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, &c_late_swapout_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, c_seg_allocsize, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_allocsize, 0, "");
#if CONFIG_FREEZE
extern int32_t c_segment_pages_compressed_incore_late_swapout;
SYSCTL_INT(_vm, OID_AUTO, c_segment_pages_compressed_incore_late_swapout, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_pages_compressed_incore_late_swapout, 0, "");
#endif /* CONFIG_FREEZE */
#endif /* DEVELOPMENT || DEBUG */
4414
4415 static boolean_t
memorystatus_should_post_snapshot(int32_t priority,uint32_t cause)4416 memorystatus_should_post_snapshot(int32_t priority, uint32_t cause)
4417 {
4418 boolean_t is_idle_priority;
4419
4420 is_idle_priority = (priority == JETSAM_PRIORITY_IDLE || priority == JETSAM_PRIORITY_IDLE_DEFERRED);
4421 #if CONFIG_JETSAM
4422 #pragma unused(cause)
4423 /*
4424 * Don't generate logs for steady-state idle-exit kills,
4425 * unless it is overridden for debug or by the device
4426 * tree.
4427 */
4428
4429 return !is_idle_priority || memorystatus_idle_snapshot;
4430
4431 #else /* CONFIG_JETSAM */
4432 /*
4433 * Don't generate logs for steady-state idle-exit kills,
4434 * unless
4435 * - it is overridden for debug or by the device
4436 * tree.
4437 * OR
4438 * - the kill causes are important i.e. not kMemorystatusKilledIdleExit
4439 */
4440
4441 boolean_t snapshot_eligible_kill_cause = (is_reason_thrashing(cause) || is_reason_zone_map_exhaustion(cause));
4442 return !is_idle_priority || memorystatus_idle_snapshot || snapshot_eligible_kill_cause;
4443 #endif /* CONFIG_JETSAM */
4444 }
4445
4446
4447 static boolean_t
memorystatus_act_on_hiwat_processes(uint32_t * errors,uint32_t * hwm_kill,bool * post_snapshot,uint64_t * memory_reclaimed)4448 memorystatus_act_on_hiwat_processes(uint32_t *errors, uint32_t *hwm_kill, bool *post_snapshot, uint64_t *memory_reclaimed)
4449 {
4450 boolean_t purged = FALSE, killed = FALSE;
4451
4452 *memory_reclaimed = 0;
4453 killed = memorystatus_kill_hiwat_proc(errors, &purged, memory_reclaimed);
4454
4455 if (killed) {
4456 *hwm_kill = *hwm_kill + 1;
4457 *post_snapshot = TRUE;
4458 return TRUE;
4459 } else {
4460 if (purged == FALSE) {
4461 /* couldn't purge and couldn't kill */
4462 os_atomic_store(&memorystatus_hwm_candidates, false, relaxed);
4463 }
4464 }
4465
4466 return killed;
4467 }
4468
4469 /*
4470 * Purge kernel memory caches
4471 */
4472 static void
memstat_purge_caches(jetsam_state_t state)4473 memstat_purge_caches(jetsam_state_t state)
4474 {
4475 memorystatus_log("memorystatus: purging kernel memory caches\n");
4476
4477 uint64_t pmap_released = pmap_release_pages_fast();
4478 memorystatus_log("memorystatus: recovered %llu pages from pmap\n",
4479 pmap_released);
4480
4481 /*
4482 * Only purge corpses once per jetsam event. No new corpses can be created
4483 * after the initial purge (block_corpses)
4484 */
4485 if (!state->corpse_list_purged) {
4486 memorystatus_log("memorystatus: purging all corpses\n");
4487 os_atomic_inc(&block_corpses, relaxed);
4488 assert(block_corpses > 0);
4489 if (total_corpses_count() > 0) {
4490 task_purge_all_corpses();
4491 } else {
4492 memorystatus_log("memorystatus: no corpses to purge\n");
4493 }
4494 state->corpse_list_purged = true;
4495 }
4496
4497 #if CONFIG_DEFERRED_RECLAIM
4498 /* TODO: estimate memory recovered from deferred reclaim */
4499 memorystatus_log("memorystatus: reclaiming all deferred user memory\n");
4500 mach_vm_size_t vmdr_bytes_reclaimed;
4501 vm_deferred_reclamation_gc(RECLAIM_GC_DRAIN, &vmdr_bytes_reclaimed,
4502 RECLAIM_NO_FAULT | RECLAIM_NO_WAIT);
4503 memorystatus_log("memorystatus: purged %llu KiB of deferred user memory\n",
4504 vmdr_bytes_reclaimed);
4505 #endif /* CONFIG_DEFERRED_RECLAIM */
4506
4507 /* TODO: estimate wired memory recovered from zone_gc */
4508 memorystatus_log("memorystatus: trimming kernel zone allocator\n");
4509 zone_gc_trim();
4510 }
4511
/*
 * Handle a kill attempt that found no victim.
 *
 * Escalation order: (1) if some candidates had their error bit set, clear
 * the error bits and let the caller retry; (2) otherwise purge kernel
 * caches; (3) if the system is still unhealthy, fully drain the zone
 * allocator; (4) if that still doesn't help, panic — everything killable
 * has been killed and every cache purged, so the shortage is kernel-caused.
 */
static void
memstat_no_victim(jetsam_state_t state,
    memorystatus_kill_cause_t cause)
{
	/*
	 * We tried to kill a process, but failed to find anyone to kill. It's
	 * possible we chose not to because we reclaimed some purgeable memory or
	 * hit this thread's priority limit.
	 */
	assert3u(state->memory_reclaimed, ==, 0);
	if (state->limit_to_low_bands) {
		/*
		 * This thread isn't allowed to reach the high bands -- no need to overreact.
		 */
		return;
	}
	/*
	 * We should have found someone to kill. Either we failed because of a transient
	 * error or we've run out of candidates and the issue is caused by the kernel.
	 */
	memorystatus_log("memorystatus: failed to find a %s victim!\n", memstat_kill_cause_name[cause]);
	if (state->errors && !state->errors_cleared) {
		/*
		 * It's possible that all of the kill candidates had the error bit set
		 * (e.g. because we caught them in exec()). Clear all the error bits and
		 * try to kill them one more time in the hopes that they are now killable.
		 */
		memorystatus_log("memorystatus: clearing kill errors and retrying\n");
		memorystatus_clear_errors();
		/* Only attempt the clear-and-retry path once per pass. */
		state->errors_cleared = true;
	} else {
		/* The memory may be held by a corpse or zalloc. */
		memstat_purge_caches(state);
		struct memorystatus_system_health_s health_status;
		bool is_system_healthy = memstat_check_system_health(&health_status);
		if (!is_system_healthy) {
			memorystatus_log("memorystatus: system still unhealthy after cache purge!\n");
			/*
			 * We trimmed the zones above but it's possible there is a bug with
			 * working set estimation and we needed a full drain.
			 */
			memorystatus_log_fault("memorystatus: fully draining kernel zone allocator\n");
			zone_gc_drain();
			is_system_healthy = memstat_check_system_health(&health_status);
			if (!is_system_healthy) {
				/*
				 * We've killed everything and purged all xnu caches. There is nothing
				 * left to do but panic.
				 */
				panic("memorystatus: all %s victims exhausted", memstat_kill_cause_name[cause]);
			}
		}
	}
}
4566
4567 /*
4568 * Called before jetsamming in the foreground band in the hope that we'll
4569 * avoid a jetsam.
4570 */
4571 static void
memstat_approaching_fg_band(jetsam_state_t state)4572 memstat_approaching_fg_band(jetsam_state_t state)
4573 {
4574 memorystatus_log("memorystatus: jetsam is approaching JETSAM_PRIORITY_FOREGROUND\n");
4575 if (memorystatus_should_issue_fg_band_notify) {
4576 memorystatus_broadcast_jetsam_pressure(kVMPressureForegroundJetsam);
4577 }
4578 memstat_purge_caches(state);
4579 }
4580
/* Jetsam Loop Detection (JLD) state. */
unsigned int jld_eval_aggressive_count = 0;     /* Number of aggressive-kill evaluations performed so far. */
uint64_t jld_timestamp_msecs = 0;               /* Start of the current JLD evaluation window, in msecs — TODO confirm against users */
int jld_idle_kill_candidates = 0;               /* Idle-kill candidate count sampled at window start — TODO confirm against users */
4584
4585 /*
4586 * Progressively raise the maximum priority to aggressively kill to
4587 * when a jetsam loop is detected. Background work often happens at
4588 * @c JETSAM_PRIORITY_MAIL. Start there and elevate as needed if
4589 * the jetsam loop re-occurs in a short time window.
4590 */
4591 int jld_max_priority_arr[] = {
4592 JETSAM_PRIORITY_MAIL,
4593 JETSAM_PRIORITY_MAIL,
4594 JETSAM_PRIORITY_UI_SUPPORT,
4595 JETSAM_PRIORITY_UI_SUPPORT,
4596 JETSAM_PRIORITY_DRIVER_APPLE,
4597 };
4598 #define JLD_MAX_PRIORITY_ARR_COUNT (sizeof(jld_max_priority_arr) / sizeof(jld_max_priority_arr[0]))
4599
/*
 * Perform an aggressive ("jetsam loop detected") kill pass.
 *
 * First kills pinned processes in the elevated-inactive band, bailing early
 * if that relieves the page shortage; then marches from the idle band up to
 * the escalating max_kill_pri derived from jld_max_priority_arr.
 *
 * The passed jetsam_reason is only consumed by the elevated-band kills; the
 * aggressive march allocates its own reason. Returns true iff a process was
 * killed in the aggressive march (elevated-band-only success returns early
 * with true).
 */
static bool
memorystatus_act_aggressive(jetsam_state_t state, uint32_t cause, os_reason_t jetsam_reason)
{
	boolean_t killed;
	uint32_t errors = 0;
	uint64_t footprint_of_killed_proc = 0;
	int elevated_bucket_count = 0, maximum_kills = 0, band = 0;
	state->memory_reclaimed = 0;

	/* Escalate the priority ceiling on each successive aggressive episode. */
	unsigned int iteration_no = jld_eval_aggressive_count++;
	int max_kill_pri = jld_max_priority_arr[MIN(iteration_no, JLD_MAX_PRIORITY_ARR_COUNT - 1)];
	assert3u(max_kill_pri, <=, MEMSTAT_BUCKET_COUNT);

	if (max_kill_pri >= JETSAM_PRIORITY_FOREGROUND) {
		memstat_approaching_fg_band(state);
	}

	proc_list_lock();
	elevated_bucket_count = memstat_bucket[JETSAM_PRIORITY_ELEVATED_INACTIVE].count;
	proc_list_unlock();

	/* Visit elevated processes first */
	while (elevated_bucket_count) {
		elevated_bucket_count--;

		/*
		 * memorystatus_kill_elevated_process() drops a reference,
		 * so take another one so we can continue to use this exit reason
		 * even after it returns.
		 */

		os_reason_ref(jetsam_reason);
		killed = memorystatus_kill_elevated_process(
			cause,
			jetsam_reason,
			JETSAM_PRIORITY_ELEVATED_INACTIVE,
			jld_eval_aggressive_count,
			&errors, &footprint_of_killed_proc);
		if (killed) {
			state->post_snapshot = true;
			state->memory_reclaimed += footprint_of_killed_proc;
			if (!memstat_evaluate_page_shortage(NULL, NULL, NULL, NULL)) {
				/*
				 * System is no longer under pressure --
				 * bail early because the pressure was
				 * coming from an inactive process
				 */
				return true;
			}
		} else {
			/*
			 * No pinned processes left to kill.
			 * Abandon elevated band.
			 */
			break;
		}
	}

	/* Bound the march: total candidates below the ceiling, times the loop cap. */
	proc_list_lock();
	for (band = JETSAM_PRIORITY_IDLE; band < max_kill_pri; band++) {
		maximum_kills += memstat_bucket[band].count;
	}
	proc_list_unlock();
	maximum_kills *= memorystatus_jld_max_kill_loops;
	/*
	 * memorystatus_kill_processes_aggressive() allocates its own
	 * jetsam_reason so the kMemorystatusKilledProcThrashing cause
	 * is consistent throughout the aggressive march.
	 */
	killed = memorystatus_kill_processes_aggressive(
		kMemorystatusKilledProcThrashing,
		jld_eval_aggressive_count,
		max_kill_pri,
		maximum_kills,
		&errors, &footprint_of_killed_proc);

	if (killed) {
		/* Always generate logs after aggressive kill */
		state->post_snapshot = true;
		state->memory_reclaimed += footprint_of_killed_proc;
		state->jld_idle_kills = 0;
	}

	return killed;
}
4685
4686 /*
4687 * Sets up a new jetsam thread.
4688 */
4689 static void
memorystatus_thread_init(jetsam_state_t jetsam_thread)4690 memorystatus_thread_init(jetsam_state_t jetsam_thread)
4691 {
4692 char name[32];
4693 thread_wire_internal(host_priv_self(), current_thread(), TRUE, NULL);
4694 snprintf(name, 32, "VM_memorystatus_%d", jetsam_thread->index + 1);
4695
4696 /* Limit all but one thread to the lower jetsam bands, as that's where most of the victims are. */
4697 if (jetsam_thread->index == 0) {
4698 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
4699 thread_vm_bind_group_add();
4700 }
4701 jetsam_thread->limit_to_low_bands = false;
4702 } else {
4703 jetsam_thread->limit_to_low_bands = true;
4704 }
4705 #if CONFIG_THREAD_GROUPS
4706 thread_group_vm_add();
4707 #endif
4708 thread_set_thread_name(current_thread(), name);
4709 sched_cond_init(&(jetsam_thread->jt_wakeup_cond));
4710 jetsam_thread->inited = true;
4711 }
4712
4713 /*
4714 * Create a new jetsam reason from the given kill cause.
4715 */
4716 static os_reason_t
create_jetsam_reason(memorystatus_kill_cause_t cause)4717 create_jetsam_reason(memorystatus_kill_cause_t cause)
4718 {
4719 os_reason_t jetsam_reason = OS_REASON_NULL;
4720
4721 jetsam_reason_t reason_code = (jetsam_reason_t)cause;
4722 assert3u(reason_code, <=, JETSAM_REASON_MEMORYSTATUS_MAX);
4723
4724 jetsam_reason = os_reason_create(OS_REASON_JETSAM, reason_code);
4725 if (jetsam_reason == OS_REASON_NULL) {
4726 memorystatus_log_error("memorystatus: failed to allocate jetsam reason for cause %u\n", cause);
4727 }
4728 return jetsam_reason;
4729 }
4730
4731 /*
4732 * Do one kill as we're marching up the priority bands.
4733 * This is a wrapper around memstat_kill_top_process that also
4734 * sets post_snapshot, tracks jld_idle_kills, and notifies if we're appraoching the fg band.
4735 */
4736 static bool
memstat_do_priority_kill(jetsam_state_t state,uint32_t kill_cause,int32_t max_priority,memstat_kill_options_t options)4737 memstat_do_priority_kill(jetsam_state_t state,
4738 uint32_t kill_cause, int32_t max_priority, memstat_kill_options_t options)
4739 {
4740 os_reason_t jetsam_reason = OS_REASON_NULL;
4741 bool killed = false;
4742 int priority;
4743
4744 jetsam_reason = create_jetsam_reason(kill_cause);
4745 /*
4746 * memstat_kill_top_process() drops a reference,
4747 * so take another one so we can continue to use this exit reason
4748 * even after it returns
4749 */
4750 os_reason_ref(jetsam_reason);
4751
4752 if (state->sort_flag) {
4753 options |= MEMSTAT_SORT_BUCKET;
4754 }
4755 /* LRU */
4756 killed = memstat_kill_top_process(kill_cause, jetsam_reason, max_priority,
4757 options, &priority, &state->errors, &state->memory_reclaimed);
4758 state->sort_flag = false;
4759
4760 if (killed) {
4761 if (memorystatus_should_post_snapshot(priority, kill_cause) == TRUE) {
4762 state->post_snapshot = true;
4763 }
4764
4765 /* Jetsam Loop Detection */
4766 if (memorystatus_jld_enabled == TRUE) {
4767 if (priority <= applications_aging_band) {
4768 state->jld_idle_kills++;
4769 } else {
4770 /*
4771 * We've reached into bands beyond idle deferred.
4772 * We make no attempt to monitor them
4773 */
4774 }
4775 }
4776
4777 if (priority >= JETSAM_PRIORITY_FREEZER && !state->fg_approached) {
4778 state->fg_approached = true;
4779 memstat_approaching_fg_band(state);
4780 }
4781 if (priority >= JETSAM_PRIORITY_BACKGROUND && !state->bg_approached) {
4782 state->bg_approached = true;
4783 memorystatus_broadcast_jetsam_pressure(kVMPressureBackgroundJetsam);
4784 }
4785 }
4786 os_reason_free(jetsam_reason);
4787
4788 return killed;
4789 }
4790
/*
 * Respond to a no-paging-space condition on non-CONFIG_JETSAM platforms.
 *
 * Records the action timestamp, then either takes the no-paging-space
 * action or notifies the user with the "Out of Application Memory" dialog
 * (low-swap knote). Returns false when the user was notified instead of a
 * kill being taken, true otherwise — presumably interpreted as "killed" by
 * memorystatus_do_action(); TODO confirm against no_paging_space_action().
 *
 * Unsupported (panics) when CONFIG_JETSAM is set.
 */
static bool
memstat_perform_no_paging_space_action(memorystatus_kill_cause_t cause)
{
#if !CONFIG_JETSAM
	uint64_t now = mach_absolute_time();
	/* Remember when we last acted, for rate limiting by readers of this timestamp. */
	os_atomic_store(&last_no_space_action_ts, now, relaxed);

	bool should_notify = no_paging_space_action(cause);
	if (should_notify) {
		/*
		 * Put up the "Out of Application Memory" dialogue. The user will be
		 * prompted to select applications to Force Quit.
		 */
		memorystatus_log("memorystatus: sending out-of-application memory knote\n");
		memorystatus_send_low_swap_note();
		return false;
	}
	return true;
#else /* CONFIG_JETSAM */
	(void)cause;
	panic("No-Paging-Space Action unsupported on this platform");
#endif /* !CONFIG_JETSAM */
}
4814
/*
 * Dispatch a single memorystatus action (as chosen by
 * memorystatus_pick_action) for the given kill cause.
 *
 * Returns true when the action made progress: a process was killed, caches
 * were purged, or the no-paging-space action resulted in a kill. Returns
 * false for the pure wakeup actions (swapper wake, swapin-queue processing)
 * and when no victim was found.
 */
static bool
memorystatus_do_action(jetsam_state_t state, memorystatus_action_t action, memorystatus_kill_cause_t kill_cause)
{
	bool killed = false;
	os_reason_t jetsam_reason = OS_REASON_NULL;

	switch (action) {
	case MEMORYSTATUS_KILL_HIWATER:
		/* Kill one process over its soft memory limit. */
		killed = memorystatus_act_on_hiwat_processes(&state->errors, &state->hwm_kills,
		    &state->post_snapshot, &state->memory_reclaimed);
		break;
	case MEMORYSTATUS_KILL_AGGRESSIVE:
		/* Jetsam-loop-detected: kill broadly up the bands. */
		jetsam_reason = create_jetsam_reason(kill_cause);
		killed = memorystatus_act_aggressive(state, kill_cause, jetsam_reason);
		os_reason_free(jetsam_reason);
		break;
	case MEMORYSTATUS_KILL_TOP_PROCESS:
		/* LRU kill, any band. */
		killed = memstat_do_priority_kill(state, kill_cause, JETSAM_PRIORITY_MAX, 0);
		break;
	case MEMORYSTATUS_WAKE_SWAPPER:
		memorystatus_log_info(
			"memorystatus_do_action: Waking up swap thread. memorystatus_available_pages: %llu\n",
			(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
		os_atomic_store(&vm_swapout_wake_pending, true, relaxed);
		thread_wakeup((event_t)&vm_swapout_thread);
		break;
	case MEMORYSTATUS_PROCESS_SWAPIN_QUEUE:
		memorystatus_log_info(
			"memorystatus_do_action: Processing swapin queue of length: %u memorystatus_available_pages: %llu\n",
			c_late_swappedin_count, (uint64_t) MEMORYSTATUS_LOG_AVAILABLE_PAGES);
		vm_compressor_process_special_swapped_in_segments();
		break;
	case MEMORYSTATUS_KILL_SUSPENDED_SWAPPABLE:
		/* Kill a suspended, swap-eligible process below the background band. */
		killed = memstat_do_priority_kill(state, kill_cause, JETSAM_PRIORITY_BACKGROUND - 1, MEMSTAT_ONLY_SWAPPABBLE);
		break;
	case MEMORYSTATUS_KILL_SWAPPABLE:
		killed = memstat_do_priority_kill(state, kill_cause, JETSAM_PRIORITY_MAX, MEMSTAT_ONLY_SWAPPABBLE);
		break;
	case MEMORYSTATUS_KILL_IDLE:
		killed = memstat_do_priority_kill(state, kill_cause, JETSAM_PRIORITY_IDLE, 0);
		break;
	case MEMORYSTATUS_KILL_LONG_IDLE:
		killed = memstat_do_priority_kill(state, kill_cause, JETSAM_PRIORITY_IDLE, MEMSTAT_ONLY_LONG_IDLE);
		break;
	case MEMORYSTATUS_NO_PAGING_SPACE:
		killed = memstat_perform_no_paging_space_action(kill_cause);
		break;
	case MEMORYSTATUS_PURGE_CACHES:
		memstat_purge_caches(state);
		/* Treated as progress even though nothing was killed. */
		killed = true;
		break;
	case MEMORYSTATUS_KILL_NONE:
		panic("memorystatus_do_action: Impossible! memorystatus_do_action called with action = NONE\n");
	}
	return killed;
}
4871
4872 void
memorystatus_post_snapshot()4873 memorystatus_post_snapshot()
4874 {
4875 proc_list_lock();
4876 size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
4877 sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count);
4878 uint64_t timestamp_now = mach_absolute_time();
4879 memorystatus_jetsam_snapshot->notification_time = timestamp_now;
4880 memorystatus_jetsam_snapshot->js_gencount++;
4881 if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
4882 timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) {
4883 proc_list_unlock();
4884 int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
4885 if (!ret) {
4886 proc_list_lock();
4887 memorystatus_jetsam_snapshot_last_timestamp = timestamp_now; proc_list_unlock();
4888 }
4889 } else {
4890 proc_list_unlock();
4891 }
4892 }
4893
4894 #if JETSAM_ZPRINT_SNAPSHOT
4895
4896 /*
4897 * Called by memorystatus_update_jetsam_snapshot_entry_locked to take a zprint snapshot.
4898 */
4899 static void
memorystatus_collect_jetsam_snapshot_zprint(void)4900 memorystatus_collect_jetsam_snapshot_zprint(void)
4901 {
4902 unsigned int new_meminfo_cnt;
4903
4904 jzs_zone_cnt = zone_max_zones();
4905
4906 new_meminfo_cnt = vm_page_diagnose_estimate();
4907 if (new_meminfo_cnt > jzs_meminfo_cnt) {
4908 jzs_meminfo = krealloc_data_tag(jzs_meminfo,
4909 jzs_meminfo_cnt * sizeof(mach_memory_info_t),
4910 new_meminfo_cnt * sizeof(mach_memory_info_t),
4911 Z_WAITOK,
4912 VM_KERN_MEMORY_DIAG);
4913
4914 jzs_meminfo_cnt = new_meminfo_cnt;
4915 }
4916
4917 mach_memory_info_sample(jzs_names, jzs_info, jzs_coalesce, &jzs_zone_cnt, jzs_meminfo, jzs_meminfo_cnt, true);
4918 }
4919
4920 #endif /* JETSAM_ZPRINT_SNAPSHOT */
4921
4922 /*
4923 * Main entrypoint for the memorystatus thread.
4924 * This thread is woken up when we're low on one of the following resources:
4925 * - available pages (free + filebacked)
4926 * - zone memory
4927 * - compressor space
4928 *
4929 * Or when thrashing is detected in the compressor or file cache.
4930 */
4931 static void
memorystatus_thread_internal(jetsam_state_t state)4932 memorystatus_thread_internal(jetsam_state_t state)
4933 {
4934 uint64_t total_memory_reclaimed = 0;
4935 bool highwater_remaining = true;
4936 bool swappable_apps_remaining = false;
4937 bool suspended_swappable_apps_remaining = false;
4938
4939 #if CONFIG_JETSAM
4940 swappable_apps_remaining = memorystatus_swap_all_apps;
4941 suspended_swappable_apps_remaining = memorystatus_swap_all_apps;
4942 #endif /* CONFIG_JETSAM */
4943
4944 assert(state != NULL);
4945 state->jld_idle_kills = 0;
4946 state->errors = 0;
4947 state->errors_cleared = false;
4948 state->hwm_kills = 0;
4949 state->sort_flag = true;
4950 state->corpse_list_purged = false;
4951 state->bg_approached = false;
4952 state->fg_approached = false;
4953 state->post_snapshot = false;
4954 state->memory_reclaimed = 0;
4955
4956 if (state->inited == FALSE) {
4957 /*
4958 * It's the first time the thread has run, so just mark the thread as privileged and block.
4959 */
4960 memorystatus_thread_init(state);
4961 sched_cond_wait(&state->jt_wakeup_cond, THREAD_UNINT, memorystatus_thread);
4962 }
4963
4964 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_SCAN) | DBG_FUNC_START,
4965 MEMORYSTATUS_LOG_AVAILABLE_PAGES, memorystatus_jld_enabled, memorystatus_jld_eval_period_msecs, jld_eval_aggressive_count);
4966
4967 extern uint32_t c_segment_count;
4968 extern mach_timespec_t major_compact_ts;
4969 clock_sec_t now;
4970 clock_nsec_t nsec;
4971 clock_get_system_nanotime(&now, &nsec);
4972 mach_timespec_t major_compact_diff = {.tv_sec = (int)now, .tv_nsec = nsec};
4973 SUB_MACH_TIMESPEC(&major_compact_diff, &major_compact_ts);
4974 memorystatus_log_info(
4975 "memorystatus: c_segment_count=%u major compaction occurred %u seconds ago\n",
4976 c_segment_count, major_compact_diff.tv_sec);
4977
4978 /*
4979 * Jetsam aware version.
4980 *
4981 * The VM pressure notification thread is working its way through clients in parallel.
4982 *
4983 * So, while the pressure notification thread is targeting processes in order of
4984 * increasing jetsam priority, we can hopefully reduce / stop its work by killing
4985 * any processes that have exceeded their highwater mark.
4986 *
4987 * If we run out of HWM processes and our available pages drops below the critical threshold, then,
4988 * we target the least recently used process in order of increasing jetsam priority (exception: the FG band).
4989 */
4990 while (true) {
4991 bool killed;
4992 state->memory_reclaimed = 0;
4993 uint32_t cause = 0;
4994
4995 memorystatus_action_t action = memorystatus_pick_action(state, &cause,
4996 highwater_remaining, suspended_swappable_apps_remaining, swappable_apps_remaining,
4997 &state->jld_idle_kills);
4998 if (action == MEMORYSTATUS_KILL_NONE) {
4999 break;
5000 }
5001
5002 if (cause == kMemorystatusKilledVMCompressorThrashing || cause == kMemorystatusKilledVMCompressorSpaceShortage) {
5003 memorystatus_log("memorystatus: killing due to \"%s\" - compression_ratio=%u\n", memstat_kill_cause_name[cause], vm_compression_ratio());
5004 }
5005
5006 killed = memorystatus_do_action(state, action, cause);
5007 total_memory_reclaimed += state->memory_reclaimed;
5008
5009 if (!killed && !state->memory_reclaimed) {
5010 switch (action) {
5011 case MEMORYSTATUS_KILL_HIWATER:
5012 highwater_remaining = false;
5013 break;
5014 case MEMORYSTATUS_KILL_SWAPPABLE:
5015 swappable_apps_remaining = false;
5016 suspended_swappable_apps_remaining = false;
5017 break;
5018 case MEMORYSTATUS_KILL_SUSPENDED_SWAPPABLE:
5019 suspended_swappable_apps_remaining = false;
5020 break;
5021 case MEMORYSTATUS_KILL_TOP_PROCESS:
5022 memstat_no_victim(state, cause);
5023 break;
5024 default:
5025 memorystatus_log("memorystatus: no victim found (action: %d)\n", action);
5026 break;
5027 }
5028 } else {
5029 /* We successfully killed a process */
5030 if (cause == kMemorystatusKilledVMCompressorThrashing || cause == kMemorystatusKilledVMCompressorSpaceShortage) {
5031 memorystatus_log("memorystatus: post-jetsam compressor fragmentation_level=%u\n", vm_compressor_fragmentation_level());
5032 }
5033 /* Always re-check for highwater and swappable kills after doing a kill. */
5034 highwater_remaining = true;
5035 swappable_apps_remaining = true;
5036 suspended_swappable_apps_remaining = true;
5037 }
5038
5039
5040 /*
5041 * If we did a kill on behalf of another subsystem (compressor or zalloc)
5042 * notify them.
5043 */
5044 if (killed && is_reason_thrashing(cause)) {
5045 os_atomic_store(&memorystatus_compressor_space_shortage, false, release);
5046 #if CONFIG_PHANTOM_CACHE
5047 os_atomic_store(&memorystatus_phantom_cache_pressure, false, release);
5048 #endif /* CONFIG_PHANTOM_CACHE */
5049 #if CONFIG_JETSAM
5050 vm_thrashing_jetsam_done();
5051 #endif /* CONFIG_JETSAM */
5052 } else if (killed && is_reason_zone_map_exhaustion(cause)) {
5053 os_atomic_store(&memorystatus_zone_map_is_exhausted, false, release);
5054 } else if (killed && cause == kMemorystatusKilledVMPageoutStarvation) {
5055 os_atomic_store(&memorystatus_pageout_starved, false, release);
5056 }
5057 }
5058
5059 if (state->errors) {
5060 memorystatus_clear_errors();
5061 }
5062
5063 if (state->post_snapshot) {
5064 memorystatus_post_snapshot();
5065 }
5066
5067 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_SCAN) | DBG_FUNC_END,
5068 MEMORYSTATUS_LOG_AVAILABLE_PAGES, total_memory_reclaimed);
5069
5070 if (state->corpse_list_purged) {
5071 os_atomic_dec(&block_corpses, relaxed);
5072 assert(block_corpses >= 0);
5073 }
5074 }
5075
5076 OS_NORETURN
5077 static void
memorystatus_thread(void * param __unused,wait_result_t wr __unused)5078 memorystatus_thread(void *param __unused, wait_result_t wr __unused)
5079 {
5080 jetsam_state_t jetsam_thread = jetsam_current_thread();
5081 sched_cond_ack(&(jetsam_thread->jt_wakeup_cond));
5082 while (1) {
5083 memorystatus_thread_internal(jetsam_thread);
5084 sched_cond_wait(&(jetsam_thread->jt_wakeup_cond), THREAD_UNINT, memorystatus_thread);
5085 }
5086 }
5087
5088 /*
5089 * Callback invoked when allowable physical memory footprint exceeded
5090 * (dirty pages + IOKit mappings)
5091 *
5092 * This is invoked for both advisory, non-fatal per-task high watermarks,
5093 * as well as the fatal task memory limits.
5094 */
5095 void
memorystatus_on_ledger_footprint_exceeded(boolean_t warning,boolean_t memlimit_is_active,boolean_t memlimit_is_fatal)5096 memorystatus_on_ledger_footprint_exceeded(boolean_t warning, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal)
5097 {
5098 os_reason_t jetsam_reason = OS_REASON_NULL;
5099
5100 proc_t p = current_proc();
5101
5102 #if VM_PRESSURE_EVENTS
5103 if (warning == TRUE) {
5104 /*
5105 * This is a warning path which implies that the current process is close, but has
5106 * not yet exceeded its per-process memory limit.
5107 */
5108 if (memorystatus_warn_process(p, memlimit_is_active, memlimit_is_fatal, FALSE /* not exceeded */) != TRUE) {
5109 /* Print warning, since it's possible that task has not registered for pressure notifications */
5110 memorystatus_log_debug(
5111 "memorystatus_on_ledger_footprint_exceeded: failed to warn %s [%d] (exiting, or no handler registered?).\n",
5112 proc_best_name(p), proc_getpid(p));
5113 }
5114 return;
5115 }
5116 #endif /* VM_PRESSURE_EVENTS */
5117
5118 if (memlimit_is_fatal) {
5119 /*
5120 * If this process has no high watermark or has a fatal task limit, then we have been invoked because the task
5121 * has violated either the system-wide per-task memory limit OR its own task limit.
5122 */
5123 jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_PERPROCESSLIMIT);
5124 if (jetsam_reason == NULL) {
5125 memorystatus_log_error("task_exceeded footprint: failed to allocate jetsam reason\n");
5126 } else if (corpse_for_fatal_memkill && proc_send_synchronous_EXC_RESOURCE(p) == FALSE) {
5127 /* Set OS_REASON_FLAG_GENERATE_CRASH_REPORT to generate corpse */
5128 jetsam_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
5129 }
5130
5131 if (memstat_kill_process_sync(proc_getpid(p), kMemorystatusKilledPerProcessLimit, jetsam_reason) != TRUE) {
5132 memorystatus_log_error("task_exceeded_footprint: failed to kill the current task (exiting?).\n");
5133 }
5134 } else {
5135 /*
5136 * HWM offender exists. Done without locks or synchronization.
5137 * See comment near its declaration for more details.
5138 */
5139 os_atomic_store(&memorystatus_hwm_candidates, true, release);
5140 _memstat_consider_waking_jetsam_thread();
5141
5142 #if VM_PRESSURE_EVENTS
5143 /*
5144 * The current process is not in the warning path.
5145 * This path implies the current process has exceeded a non-fatal (soft) memory limit.
5146 * Failure to send note is ignored here.
5147 */
5148 (void)memorystatus_warn_process(p, memlimit_is_active, memlimit_is_fatal, TRUE /* exceeded */);
5149
5150 #endif /* VM_PRESSURE_EVENTS */
5151 }
5152 }
5153
5154 void
memorystatus_log_exception(const int max_footprint_mb,boolean_t memlimit_is_active,boolean_t memlimit_is_fatal)5155 memorystatus_log_exception(const int max_footprint_mb, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal)
5156 {
5157 proc_t p = current_proc();
5158
5159 /*
5160 * The limit violation is logged here, but only once per process per limit.
5161 * Soft memory limit is a non-fatal high-water-mark
5162 * Hard memory limit is a fatal custom-task-limit or system-wide per-task memory limit.
5163 */
5164
5165 memorystatus_log("memorystatus: %s [%d] exceeded mem limit: %s%s %d MB (%s)\n",
5166 ((p && *p->p_name) ? p->p_name : "unknown"), (p ? proc_getpid(p) : -1), (memlimit_is_active ? "Active" : "Inactive"),
5167 (memlimit_is_fatal ? "Hard" : "Soft"), max_footprint_mb,
5168 (memlimit_is_fatal ? "fatal" : "non-fatal"));
5169 }
5170
5171 void
memorystatus_log_diag_threshold_exception(const int diag_threshold_value)5172 memorystatus_log_diag_threshold_exception(const int diag_threshold_value)
5173 {
5174 proc_t p = current_proc();
5175
5176 /*
5177 * The limit violation is logged here, but only once per process per limit.
5178 * Soft memory limit is a non-fatal high-water-mark
5179 * Hard memory limit is a fatal custom-task-limit or system-wide per-task memory limit.
5180 */
5181
5182 memorystatus_log("memorystatus: %s [%d] exceeded diag threshold limit: %d MB \n",
5183 ((p && *p->p_name) ? p->p_name : "unknown"), (p ? proc_getpid(p) : -1), diag_threshold_value);
5184 }
5185
5186 void
memorystatus_on_conclave_limit_exceeded(const int max_footprint_mb)5187 memorystatus_on_conclave_limit_exceeded(const int max_footprint_mb)
5188 {
5189 os_reason_t jetsam_reason = OS_REASON_NULL;
5190 proc_t p = current_proc();
5191
5192 /*
5193 * The limit violation is logged here; it's always fatal.
5194 */
5195 memorystatus_log("memorystatus: %s [%d] exceeded conclave limit: %d MB \n",
5196 ((p && *p->p_name) ? p->p_name : "unknown"), (p ? proc_getpid(p) : -1), max_footprint_mb);
5197
5198 jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_CONCLAVELIMIT);
5199 if (jetsam_reason == NULL) {
5200 memorystatus_log_error("task_exceeded_conclave: failed to allocate jetsam reason\n");
5201 } else if (corpse_for_fatal_memkill && proc_send_synchronous_EXC_RESOURCE(p) == FALSE) {
5202 /* Set OS_REASON_FLAG_GENERATE_CRASH_REPORT to generate corpse */
5203 jetsam_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
5204 }
5205
5206 if (memstat_kill_process_sync(proc_getpid(p), kMemorystatusKilledConclaveLimit, jetsam_reason) != TRUE) {
5207 memorystatus_log_error("task_exceeded_conclave: failed to kill the current task (exiting?).\n");
5208 }
5209 }
5210
5211 /*
5212 * Description:
5213 * Evaluates process state to determine which limit
5214 * should be applied (active vs. inactive limit).
5215 *
5216 * Return: TRUE if active
5217 * False if inactive
5218 */
5219 static bool
memstat_proc_is_active_locked(proc_t p)5220 memstat_proc_is_active_locked(proc_t p)
5221 {
5222 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
5223
5224 if (_memstat_proc_is_elevated(p) &&
5225 (p->p_memstat_effectivepriority == JETSAM_PRIORITY_ELEVATED_INACTIVE ||
5226 p->p_memstat_effectivepriority == JETSAM_PRIORITY_FREEZER)) {
5227 /* This process is sitting in an elevated inactive band. */
5228 if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_ELEVATED_INACTIVE) {
5229 /*
5230 * This process is in an elevated band and may be doing background
5231 * work.
5232 */
5233 return true;
5234 } else {
5235 /* This process is frozen. */
5236 return false;
5237 }
5238 } else if (_memstat_proc_is_tracked(p)) {
5239 /*
5240 * Process has enrolled in ActivityTracking. Its limit will be
5241 * determined based on whether it is clean or dirty.
5242 */
5243 if (_memstat_proc_is_dirty(p)) {
5244 /* Dirty processes are always active */
5245 return true;
5246 } else if (_memstat_proc_can_idle_exit(p) &&
5247 p->p_memstat_effectivepriority > JETSAM_PRIORITY_IDLE) {
5248 /*
5249 * This process is clean and supports idle exit, but has not made
5250 * its way to the idle band. It is either aging in the deferred
5251 * idle band or has a RunningBoard assertion that is keeping it
5252 * from going idle.
5253 */
5254 return true;
5255 } else {
5256 /*
5257 * This process is clean and either:
5258 * - does not support idle exit
5259 * or
5260 * - does support idle exit and is now idle
5261 */
5262 return false;
5263 }
5264 } else if (_memstat_proc_is_managed(p)) {
5265 /*
5266 * RunningBoard-managed processes are active if they have any
5267 * outstanding assertions
5268 */
5269 return _memstat_proc_has_priority_assertion(p);
5270 } else {
5271 /*
5272 * Unmanaged and untracked processes receive an active limit unless
5273 * they are completely idle.
5274 */
5275 return p->p_memstat_effectivepriority > JETSAM_PRIORITY_IDLE;
5276 }
5277 }
5278
/*
 * Synchronously kill a process for the given jetsam cause.
 *
 * victim_pid == 0  : rejected (never kill the kernel task's pid).
 * victim_pid == -1 : no specific victim; pick and kill the first eligible
 *                    process up to a cause-dependent max priority band.
 * otherwise        : kill exactly that pid.
 *
 * On a successful kill, fires the jetsam snapshot notification (rate limited
 * by memorystatus_jetsam_snapshot_timeout).
 *
 * Returns true if a process was killed.
 */
static bool
memstat_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason)
{
	bool killed;

	uint32_t errors = 0;
	uint64_t memory_reclaimed = 0;

	/* Validate inputs */
	if (victim_pid == 0) {
		return false;
	}

	if (victim_pid == -1) {
		/* Cap how high in the priority bands this kill may reach. */
		uint32_t max_priority;
#if CONFIG_JETSAM
		max_priority = JETSAM_PRIORITY_MAX;
#else /* !CONFIG_JETSAM */
		if (kill_on_no_paging_space ||
		    cause == kMemorystatusKilledZoneMapExhaustion) {
			max_priority = JETSAM_PRIORITY_MAX;
		} else if (cause == kMemorystatusKilledSustainedPressure) {
			max_priority = memstat_sustained_pressure_max_pri;
		} else {
			max_priority = JETSAM_PRIORITY_IDLE;
		}
#endif /* CONFIG_JETSAM */
		/* No pid, so kill first process */
		killed = memstat_kill_top_process(cause, jetsam_reason,
		    max_priority, MEMSTAT_SORT_BUCKET, NULL, &errors, &memory_reclaimed);
	} else {
		killed = memorystatus_kill_specific_process(victim_pid, cause, jetsam_reason);
	}

	if (errors) {
		memorystatus_clear_errors();
	}

	if (killed) {
		/* Fire off snapshot notification */
		proc_list_lock();
		size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
		    sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_count;
		uint64_t timestamp_now = mach_absolute_time();
		memorystatus_jetsam_snapshot->notification_time = timestamp_now;
		/*
		 * Only post the note if there are entries and the previous note is
		 * older than the snapshot timeout (or none has been posted yet).
		 */
		if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
		    timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) {
			/* Drop the lock across the note send; retake it only to record success. */
			proc_list_unlock();
			int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
			if (!ret) {
				proc_list_lock();
				memorystatus_jetsam_snapshot_last_timestamp = timestamp_now;
				proc_list_unlock();
			}
		} else {
			proc_list_unlock();
		}
	}

	return killed;
}
5340
5341 /*
5342 * Jetsam a specific process.
5343 */
5344 static bool
memorystatus_kill_specific_process(pid_t victim_pid,uint32_t cause,os_reason_t jetsam_reason)5345 memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason)
5346 {
5347 bool killed;
5348 proc_t p;
5349 uint64_t killtime = 0;
5350 uint64_t time_in_priority_band_secs = 0;
5351 uint64_t footprint_of_killed_proc;
5352 clock_sec_t tv_sec;
5353 clock_usec_t tv_usec;
5354 uint32_t tv_msec;
5355
5356 /* TODO - add a victim queue and push this into the main jetsam thread */
5357
5358 p = proc_find(victim_pid);
5359 if (!p) {
5360 os_reason_free(jetsam_reason);
5361 return false;
5362 }
5363
5364 proc_list_lock();
5365
5366 if (_memstat_proc_was_killed(p)) {
5367 /*
5368 * Someone beat us to this kill.
5369 * Nothing to do here.
5370 */
5371 proc_list_unlock();
5372 os_reason_free(jetsam_reason);
5373 proc_rele(p);
5374 return false;
5375 }
5376 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
5377
5378 if (memorystatus_jetsam_snapshot_count == 0) {
5379 memorystatus_init_jetsam_snapshot_locked(NULL, 0);
5380 }
5381
5382 killtime = mach_absolute_time();
5383 absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
5384 tv_msec = tv_usec / 1000;
5385
5386 memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
5387
5388 proc_list_unlock();
5389
5390 absolutetime_to_nanoseconds(killtime - p->p_memstat_prio_start, &time_in_priority_band_secs);
5391 time_in_priority_band_secs /= NSEC_PER_SEC;
5392
5393 killed = memorystatus_do_kill(p, cause, jetsam_reason, &footprint_of_killed_proc);
5394
5395 memorystatus_log("%lu.%03d memorystatus: killing_specific_process pid %d [%s] (%s %d %llus rf:%s type:%s) %lluKB - memorystatus_available_pages: %llu\n",
5396 (unsigned long)tv_sec, tv_msec, victim_pid, ((p && *p->p_name) ? p->p_name : "unknown"),
5397 memstat_kill_cause_name[cause], (p ? p->p_memstat_effectivepriority: -1), time_in_priority_band_secs,
5398 (p ? _memstat_relaunch_flags_description(p->p_memstat_relaunch_flags) : 0), _memstat_proc_type_description(p),
5399 footprint_of_killed_proc >> 10, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
5400
5401 if (!killed) {
5402 proc_list_lock();
5403 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
5404 proc_list_unlock();
5405 }
5406
5407 proc_rele(p);
5408
5409 return killed;
5410 }
5411
5412
5413 /*
5414 * Toggle the P_MEMSTAT_SKIP bit.
5415 * Takes the proc_list_lock.
5416 */
5417 void
proc_memstat_skip(proc_t p,boolean_t set)5418 proc_memstat_skip(proc_t p, boolean_t set)
5419 {
5420 #if DEVELOPMENT || DEBUG
5421 if (p) {
5422 proc_list_lock();
5423 if (set == TRUE) {
5424 p->p_memstat_state |= P_MEMSTAT_SKIP;
5425 } else {
5426 p->p_memstat_state &= ~P_MEMSTAT_SKIP;
5427 }
5428 proc_list_unlock();
5429 }
5430 #else
5431 #pragma unused(p, set)
5432 /*
5433 * do nothing
5434 */
5435 #endif /* DEVELOPMENT || DEBUG */
5436 return;
5437 }
5438
5439
5440 #if CONFIG_JETSAM
5441 /*
5442 * This is invoked when cpulimits have been exceeded while in fatal mode.
5443 * The jetsam_flags do not apply as those are for memory related kills.
5444 * We call this routine so that the offending process is killed with
5445 * a non-zero exit status.
5446 */
5447 void
jetsam_on_ledger_cpulimit_exceeded(void)5448 jetsam_on_ledger_cpulimit_exceeded(void)
5449 {
5450 int retval = 0;
5451 int jetsam_flags = 0; /* make it obvious */
5452 proc_t p = current_proc();
5453 os_reason_t jetsam_reason = OS_REASON_NULL;
5454
5455 memorystatus_log("memorystatus: killing %s [%d] due to cpulimit "
5456 "violation\n", proc_best_name(p), proc_getpid(p));
5457
5458 jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_CPULIMIT);
5459 if (jetsam_reason == OS_REASON_NULL) {
5460 memorystatus_log_error("memorystatus: unable to allocate memory for jetsam reason\n");
5461 }
5462
5463 retval = jetsam_do_kill(p, jetsam_flags, jetsam_reason);
5464
5465 if (retval) {
5466 memorystatus_log_error("memorystatus: failed to kill current task (exiting?).\n");
5467 }
5468 }
5469
5470 #endif /* CONFIG_JETSAM */
5471
5472 static void
memorystatus_get_task_memory_region_count(task_t task,uint64_t * count)5473 memorystatus_get_task_memory_region_count(task_t task, uint64_t *count)
5474 {
5475 assert(task);
5476 assert(count);
5477
5478 *count = get_task_memory_region_count(task);
5479 }
5480
5481
5482 #define MEMORYSTATUS_VM_MAP_FORK_ALLOWED 0x100000000
5483 #define MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED 0x200000000
5484
5485 #if DEVELOPMENT || DEBUG
5486
5487 /*
5488 * Sysctl only used to test memorystatus_allowed_vm_map_fork() path.
5489 * set a new pidwatch value
5490 * or
5491 * get the current pidwatch value
5492 *
5493 * The pidwatch_val starts out with a PID to watch for in the map_fork path.
5494 * Its value is:
5495 * - OR'd with MEMORYSTATUS_VM_MAP_FORK_ALLOWED if we allow the map_fork.
5496 * - OR'd with MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED if we disallow the map_fork.
5497 * - set to -1ull if the map_fork() is aborted for other reasons.
5498 */
5499
5500 uint64_t memorystatus_vm_map_fork_pidwatch_val = 0;
5501
/*
 * Sysctl handler for kern.memorystatus_vm_map_fork_pidwatch.
 * Reads return the current watch value (pid in the low 32 bits, allowed/
 * not-allowed flags in the high 32); writes install a new pid to watch,
 * with any high-order bits stripped.
 */
static int sysctl_memorystatus_vm_map_fork_pidwatch SYSCTL_HANDLER_ARGS {
#pragma unused(oidp, arg1, arg2)

	uint64_t new_value = 0;
	uint64_t old_value = 0;
	int error = 0;

	/*
	 * The pid is held in the low 32 bits.
	 * The 'allowed' flags are in the upper 32 bits.
	 */
	old_value = memorystatus_vm_map_fork_pidwatch_val;

	error = sysctl_io_number(req, old_value, sizeof(old_value), &new_value, NULL);

	if (error || !req->newptr) {
		/*
		 * No new value passed in.
		 */
		return error;
	}

	/*
	 * A new pid was passed in via req->newptr.
	 * Ignore any attempt to set the higher order bits.
	 */
	memorystatus_vm_map_fork_pidwatch_val = new_value & 0xFFFFFFFF;
	memorystatus_log_debug("memorystatus: pidwatch old_value = 0x%llx, new_value = 0x%llx\n", old_value, new_value);

	return error;
}
5533
5534 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_map_fork_pidwatch, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_MASKED,
5535 0, 0, sysctl_memorystatus_vm_map_fork_pidwatch, "Q", "get/set pid watched for in vm_map_fork");
5536
5537
5538 /*
5539 * Record if a watched process fails to qualify for a vm_map_fork().
5540 */
5541 void
memorystatus_abort_vm_map_fork(task_t task)5542 memorystatus_abort_vm_map_fork(task_t task)
5543 {
5544 if (memorystatus_vm_map_fork_pidwatch_val != 0) {
5545 proc_t p = get_bsdtask_info(task);
5546 if (p != NULL && memorystatus_vm_map_fork_pidwatch_val == (uint64_t)proc_getpid(p)) {
5547 memorystatus_vm_map_fork_pidwatch_val = -1ull;
5548 }
5549 }
5550 }
5551
5552 static void
set_vm_map_fork_pidwatch(task_t task,uint64_t x)5553 set_vm_map_fork_pidwatch(task_t task, uint64_t x)
5554 {
5555 if (memorystatus_vm_map_fork_pidwatch_val != 0) {
5556 proc_t p = get_bsdtask_info(task);
5557 if (p && (memorystatus_vm_map_fork_pidwatch_val == (uint64_t)proc_getpid(p))) {
5558 memorystatus_vm_map_fork_pidwatch_val |= x;
5559 }
5560 }
5561 }
5562
5563 #else /* DEVELOPMENT || DEBUG */
5564
5565
/* Release-kernel stub: pidwatch instrumentation exists only on DEVELOPMENT/DEBUG. */
static void
set_vm_map_fork_pidwatch(task_t task, uint64_t x)
{
#pragma unused(task)
#pragma unused(x)
}
5572
5573 #endif /* DEVELOPMENT || DEBUG */
5574
5575 /*
5576 * Called during EXC_RESOURCE handling when a process exceeds a soft
5577 * memory limit. This is the corpse fork path and here we decide if
5578 * vm_map_fork will be allowed when creating the corpse.
5579 * The task being considered is suspended.
5580 *
5581 * By default, a vm_map_fork is allowed to proceed.
5582 *
5583 * A few simple policy assumptions:
5584 * If the device has a zero system-wide task limit,
5585 * then the vm_map_fork is allowed. macOS always has a zero
5586 * system wide task limit (unless overriden by a boot-arg).
5587 *
5588 * And if a process's memory footprint calculates less
5589 * than or equal to quarter of the system-wide task limit,
5590 * then the vm_map_fork is allowed. This calculation
5591 * is based on the assumption that a process can
5592 * munch memory up to the system-wide task limit.
5593 *
5594 * For watchOS, which has a low task limit, we use a
5595 * different value. Current task limit has been reduced
5596 * to 300MB and it's been decided the limit should be 200MB.
5597 */
/* Number of in-flight "large" watchOS corpses (see LARGE_CORPSE_LIMIT below). */
int large_corpse_count = 0;
boolean_t
memorystatus_allowed_vm_map_fork(task_t task, bool *is_large)
{
	boolean_t is_allowed = TRUE; /* default */
	uint64_t footprint_in_bytes;
	uint64_t max_allowed_bytes;
	thread_t self = current_thread();

	*is_large = false;

	/* Jetsam in high bands blocks any new corpse */
	if (os_atomic_load(&block_corpses, relaxed) != 0) {
		memorystatus_log("memorystatus_allowed_vm_map_fork: corpse for pid %d blocked by jetsam).\n", task_pid(task));
		ktriage_record(thread_tid(self), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_CORPSE, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_CORPSE_BLOCKED_JETSAM), 0 /* arg */);
		return FALSE;
	}

	/* No system-wide task limit configured => always allow (macOS default). */
	if (max_task_footprint_mb == 0) {
		set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_ALLOWED);
		return is_allowed;
	}

	footprint_in_bytes = get_task_phys_footprint(task);

	/*
	 * Maximum is 1/4 of the system-wide task limit by default.
	 */
	max_allowed_bytes = ((uint64_t)max_task_footprint_mb * 1024 * 1024) >> 2;

#if XNU_TARGET_OS_WATCH
	/*
	 * For watches with > 1G, use a limit of 200MB and allow
	 * one corpse at a time of up to 300MB.
	 */
#define LARGE_CORPSE_LIMIT 1
	if (sane_size > 1 * 1024 * 1024 * 1024) {
		int cnt = large_corpse_count;
		/*
		 * The CAS both checks the in-flight count and reserves the large
		 * slot atomically; on CAS failure we fall back to the 200MB limit.
		 */
		if (footprint_in_bytes > 200 * 1024 * 1024 &&
		    footprint_in_bytes <= 300 * 1024 * 1024 &&
		    cnt < LARGE_CORPSE_LIMIT &&
		    OSCompareAndSwap(cnt, cnt + 1, &large_corpse_count)) {
			*is_large = true;
			max_allowed_bytes = MAX(max_allowed_bytes, 300 * 1024 * 1024);
		} else {
			max_allowed_bytes = MAX(max_allowed_bytes, 200 * 1024 * 1024);
		}
	}
#endif /* XNU_TARGET_OS_WATCH */

#if DEBUG || DEVELOPMENT
	/* Test override: allow corpses up to the full system-wide task limit. */
	if (corpse_threshold_system_limit) {
		max_allowed_bytes = (uint64_t)max_task_footprint_mb * (1UL << 20);
	}
#endif /* DEBUG || DEVELOPMENT */

	if (footprint_in_bytes > max_allowed_bytes) {
		memorystatus_log("memorystatus disallowed vm_map_fork %lld %lld\n", footprint_in_bytes, max_allowed_bytes);
		set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED);
		ktriage_record(thread_tid(self), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_CORPSE, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_CORPSE_PROC_TOO_BIG), 0 /* arg */);
		return !is_allowed;
	}

	set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_ALLOWED);
	return is_allowed;
}
5664
5665 void
memorystatus_get_task_page_counts(task_t task,uint32_t * footprint,uint32_t * max_footprint_lifetime,uint32_t * purgeable_pages)5666 memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages)
5667 {
5668 assert(task);
5669 assert(footprint);
5670
5671 uint64_t pages;
5672
5673 pages = (get_task_phys_footprint(task) / PAGE_SIZE_64);
5674 assert(((uint32_t)pages) == pages);
5675 *footprint = (uint32_t)pages;
5676
5677 if (max_footprint_lifetime) {
5678 pages = (get_task_phys_footprint_lifetime_max(task) / PAGE_SIZE_64);
5679 assert(((uint32_t)pages) == pages);
5680 *max_footprint_lifetime = (uint32_t)pages;
5681 }
5682 if (purgeable_pages) {
5683 pages = (get_task_purgeable_size(task) / PAGE_SIZE_64);
5684 assert(((uint32_t)pages) == pages);
5685 *purgeable_pages = (uint32_t)pages;
5686 }
5687 }
5688
5689 static void
memorystatus_get_task_phys_footprint_page_counts(task_t task,uint64_t * internal_pages,uint64_t * internal_compressed_pages,uint64_t * purgeable_nonvolatile_pages,uint64_t * purgeable_nonvolatile_compressed_pages,uint64_t * alternate_accounting_pages,uint64_t * alternate_accounting_compressed_pages,uint64_t * iokit_mapped_pages,uint64_t * page_table_pages,uint64_t * frozen_to_swap_pages,uint64_t * neural_nofootprint_total_pages)5690 memorystatus_get_task_phys_footprint_page_counts(task_t task,
5691 uint64_t *internal_pages, uint64_t *internal_compressed_pages,
5692 uint64_t *purgeable_nonvolatile_pages, uint64_t *purgeable_nonvolatile_compressed_pages,
5693 uint64_t *alternate_accounting_pages, uint64_t *alternate_accounting_compressed_pages,
5694 uint64_t *iokit_mapped_pages, uint64_t *page_table_pages, uint64_t *frozen_to_swap_pages,
5695 uint64_t *neural_nofootprint_total_pages)
5696 {
5697 assert(task);
5698
5699 if (internal_pages) {
5700 *internal_pages = (get_task_internal(task) / PAGE_SIZE_64);
5701 }
5702
5703 if (internal_compressed_pages) {
5704 *internal_compressed_pages = (get_task_internal_compressed(task) / PAGE_SIZE_64);
5705 }
5706
5707 if (purgeable_nonvolatile_pages) {
5708 *purgeable_nonvolatile_pages = (get_task_purgeable_nonvolatile(task) / PAGE_SIZE_64);
5709 }
5710
5711 if (purgeable_nonvolatile_compressed_pages) {
5712 *purgeable_nonvolatile_compressed_pages = (get_task_purgeable_nonvolatile_compressed(task) / PAGE_SIZE_64);
5713 }
5714
5715 if (alternate_accounting_pages) {
5716 *alternate_accounting_pages = (get_task_alternate_accounting(task) / PAGE_SIZE_64);
5717 }
5718
5719 if (alternate_accounting_compressed_pages) {
5720 *alternate_accounting_compressed_pages = (get_task_alternate_accounting_compressed(task) / PAGE_SIZE_64);
5721 }
5722
5723 if (iokit_mapped_pages) {
5724 *iokit_mapped_pages = (get_task_iokit_mapped(task) / PAGE_SIZE_64);
5725 }
5726
5727 if (page_table_pages) {
5728 *page_table_pages = (get_task_page_table(task) / PAGE_SIZE_64);
5729 }
5730
5731 if (neural_nofootprint_total_pages) {
5732 *neural_nofootprint_total_pages = (get_task_neural_nofootprint_total(task) / PAGE_SIZE_64);
5733 }
5734
5735 #if CONFIG_FREEZE
5736 if (frozen_to_swap_pages) {
5737 *frozen_to_swap_pages = (get_task_frozen_to_swap(task) / PAGE_SIZE_64);
5738 }
5739 #else /* CONFIG_FREEZE */
5740 #pragma unused(frozen_to_swap_pages)
5741 #endif /* CONFIG_FREEZE */
5742 }
5743
5744 #if CONFIG_FREEZE
5745 /*
5746 * Copies the source entry into the destination snapshot.
5747 * Returns true on success. Fails if the destination snapshot is full.
5748 * Caller must hold the proc list lock.
5749 */
5750 static bool
memorystatus_jetsam_snapshot_copy_entry_locked(memorystatus_jetsam_snapshot_t * dst_snapshot,unsigned int dst_snapshot_size,const memorystatus_jetsam_snapshot_entry_t * src_entry)5751 memorystatus_jetsam_snapshot_copy_entry_locked(memorystatus_jetsam_snapshot_t *dst_snapshot, unsigned int dst_snapshot_size, const memorystatus_jetsam_snapshot_entry_t *src_entry)
5752 {
5753 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
5754 assert(dst_snapshot);
5755
5756 if (dst_snapshot->entry_count == dst_snapshot_size) {
5757 /* Destination snapshot is full. Can not be updated until it is consumed. */
5758 return false;
5759 }
5760 if (dst_snapshot->entry_count == 0) {
5761 memorystatus_init_jetsam_snapshot_header(dst_snapshot);
5762 }
5763 memorystatus_jetsam_snapshot_entry_t *dst_entry = &dst_snapshot->entries[dst_snapshot->entry_count++];
5764 memcpy(dst_entry, src_entry, sizeof(memorystatus_jetsam_snapshot_entry_t));
5765 return true;
5766 }
5767 #endif /* CONFIG_FREEZE */
5768
5769 static bool
memorystatus_init_jetsam_snapshot_entry_with_kill_locked(memorystatus_jetsam_snapshot_t * snapshot,proc_t p,uint32_t kill_cause,uint64_t killtime,memorystatus_jetsam_snapshot_entry_t ** entry)5770 memorystatus_init_jetsam_snapshot_entry_with_kill_locked(memorystatus_jetsam_snapshot_t *snapshot, proc_t p, uint32_t kill_cause, uint64_t killtime, memorystatus_jetsam_snapshot_entry_t **entry)
5771 {
5772 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
5773 memorystatus_jetsam_snapshot_entry_t *snapshot_list = snapshot->entries;
5774 size_t i = snapshot->entry_count;
5775
5776 if (memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[i], (snapshot->js_gencount)) == TRUE) {
5777 *entry = &snapshot_list[i];
5778 (*entry)->killed = kill_cause;
5779 (*entry)->jse_killtime = killtime;
5780
5781 snapshot->entry_count = i + 1;
5782 return true;
5783 }
5784 return false;
5785 }
5786
5787 /*
5788 * This routine only acts on the global jetsam event snapshot.
5789 * Updating the process's entry can race when the memorystatus_thread
5790 * has chosen to kill a process that is racing to exit on another core.
5791 */
5792 static void
memorystatus_update_jetsam_snapshot_entry_locked(proc_t p,uint32_t kill_cause,uint64_t killtime)5793 memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime)
5794 {
5795 memorystatus_jetsam_snapshot_entry_t *entry = NULL;
5796 memorystatus_jetsam_snapshot_t *snapshot = NULL;
5797 memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL;
5798
5799 unsigned int i;
5800 #if CONFIG_FREEZE
5801 bool copied_to_freezer_snapshot = false;
5802 #endif /* CONFIG_FREEZE */
5803
5804 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
5805
5806 if (memorystatus_jetsam_snapshot_count == 0) {
5807 /*
5808 * No active snapshot.
5809 * Nothing to do.
5810 */
5811 goto exit;
5812 }
5813
5814 /*
5815 * Sanity check as this routine should only be called
5816 * from a jetsam kill path.
5817 */
5818 assert(kill_cause != 0 && killtime != 0);
5819
5820 snapshot = memorystatus_jetsam_snapshot;
5821 snapshot_list = memorystatus_jetsam_snapshot->entries;
5822
5823 #if JETSAM_ZPRINT_SNAPSHOT
5824 /*
5825 * Collect the snapshot zprint info if we've reached the right priority
5826 */
5827 if (p->p_memstat_effectivepriority >= (int)jzs_trigger_band &&
5828 jzs_gencount != snapshot->js_gencount) {
5829 memorystatus_collect_jetsam_snapshot_zprint();
5830 jzs_gencount = snapshot->js_gencount;
5831 }
5832 #endif
5833
5834 for (i = 0; i < memorystatus_jetsam_snapshot_count; i++) {
5835 if (snapshot_list[i].pid == proc_getpid(p)) {
5836 entry = &snapshot_list[i];
5837
5838 if (entry->killed || entry->jse_killtime) {
5839 /*
5840 * We apparently raced on the exit path
5841 * for this process, as it's snapshot entry
5842 * has already recorded a kill.
5843 */
5844 assert(entry->killed && entry->jse_killtime);
5845 break;
5846 }
5847
5848 /*
5849 * Update the entry we just found in the snapshot.
5850 */
5851
5852 entry->killed = kill_cause;
5853 entry->jse_killtime = killtime;
5854 entry->jse_gencount = snapshot->js_gencount;
5855 entry->jse_idle_delta = p->p_memstat_idle_delta;
5856 entry->jse_prio_start = p->p_memstat_prio_start;
5857 #if CONFIG_FREEZE
5858 entry->jse_thaw_count = p->p_memstat_thaw_count;
5859 entry->jse_freeze_skip_reason = p->p_memstat_freeze_skip_reason;
5860 #else /* CONFIG_FREEZE */
5861 entry->jse_thaw_count = 0;
5862 entry->jse_freeze_skip_reason = kMemorystatusFreezeSkipReasonNone;
5863 #endif /* CONFIG_FREEZE */
5864
5865 /*
5866 * If a process has moved between bands since snapshot was
5867 * initialized, then likely these fields changed too.
5868 */
5869 if (entry->priority != p->p_memstat_effectivepriority) {
5870 strlcpy(entry->name, p->p_name, sizeof(entry->name));
5871 entry->priority = p->p_memstat_effectivepriority;
5872 entry->state = _memstat_build_state(p);
5873 entry->user_data = p->p_memstat_userdata;
5874 entry->fds = p->p_fd.fd_nfiles;
5875 }
5876
5877 /*
5878 * Always update the page counts on a kill.
5879 */
5880
5881 uint32_t pages = 0;
5882 uint32_t max_pages_lifetime = 0;
5883 uint32_t purgeable_pages = 0;
5884
5885 memorystatus_get_task_page_counts(proc_task(p), &pages, &max_pages_lifetime, &purgeable_pages);
5886 entry->pages = (uint64_t)pages;
5887 entry->max_pages_lifetime = (uint64_t)max_pages_lifetime;
5888 entry->purgeable_pages = (uint64_t)purgeable_pages;
5889
5890 uint64_t internal_pages = 0;
5891 uint64_t internal_compressed_pages = 0;
5892 uint64_t purgeable_nonvolatile_pages = 0;
5893 uint64_t purgeable_nonvolatile_compressed_pages = 0;
5894 uint64_t alternate_accounting_pages = 0;
5895 uint64_t alternate_accounting_compressed_pages = 0;
5896 uint64_t iokit_mapped_pages = 0;
5897 uint64_t page_table_pages = 0;
5898 uint64_t frozen_to_swap_pages = 0;
5899 uint64_t neural_nofootprint_total_pages = 0;
5900
5901 memorystatus_get_task_phys_footprint_page_counts(proc_task(p), &internal_pages, &internal_compressed_pages,
5902 &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages,
5903 &alternate_accounting_pages, &alternate_accounting_compressed_pages,
5904 &iokit_mapped_pages, &page_table_pages, &frozen_to_swap_pages, &neural_nofootprint_total_pages);
5905
5906 entry->jse_internal_pages = internal_pages;
5907 entry->jse_internal_compressed_pages = internal_compressed_pages;
5908 entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages;
5909 entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages;
5910 entry->jse_alternate_accounting_pages = alternate_accounting_pages;
5911 entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages;
5912 entry->jse_iokit_mapped_pages = iokit_mapped_pages;
5913 entry->jse_page_table_pages = page_table_pages;
5914 entry->jse_frozen_to_swap_pages = frozen_to_swap_pages;
5915 entry->jse_neural_nofootprint_total_pages = neural_nofootprint_total_pages;
5916
5917 uint64_t region_count = 0;
5918 memorystatus_get_task_memory_region_count(proc_task(p), ®ion_count);
5919 entry->jse_memory_region_count = region_count;
5920 entry->csflags = proc_getcsflags(p);
5921 goto exit;
5922 }
5923 }
5924
5925 if (entry == NULL) {
5926 /*
5927 * The entry was not found in the snapshot, so the process must have
5928 * launched after the snapshot was initialized.
5929 * Let's try to append the new entry.
5930 */
5931 if (memorystatus_jetsam_snapshot_count < memorystatus_jetsam_snapshot_max) {
5932 /*
5933 * A populated snapshot buffer exists
5934 * and there is room to init a new entry.
5935 */
5936 assert(memorystatus_jetsam_snapshot_count == snapshot->entry_count);
5937
5938 if (memorystatus_init_jetsam_snapshot_entry_with_kill_locked(snapshot, p, kill_cause, killtime, &entry)) {
5939 memorystatus_jetsam_snapshot_count++;
5940
5941 if (memorystatus_jetsam_snapshot_count >= memorystatus_jetsam_snapshot_max) {
5942 /*
5943 * We just used the last slot in the snapshot buffer.
5944 * We only want to log it once... so we do it here
5945 * when we notice we've hit the max.
5946 */
5947 memorystatus_log_error("memorystatus: WARNING snapshot buffer is full, count %d\n", memorystatus_jetsam_snapshot_count);
5948 }
5949 }
5950 }
5951 }
5952
5953 exit:
5954 if (entry) {
5955 #if CONFIG_FREEZE
5956 if (memorystatus_jetsam_use_freezer_snapshot && isApp(p)) {
5957 /* This is an app kill. Record it in the freezer snapshot so dasd can incorporate this in its recommendations. */
5958 copied_to_freezer_snapshot = memorystatus_jetsam_snapshot_copy_entry_locked(memorystatus_jetsam_snapshot_freezer, memorystatus_jetsam_snapshot_freezer_max, entry);
5959 if (copied_to_freezer_snapshot && memorystatus_jetsam_snapshot_freezer->entry_count == memorystatus_jetsam_snapshot_freezer_max) {
5960 /*
5961 * We just used the last slot in the freezer snapshot buffer.
5962 * We only want to log it once... so we do it here
5963 * when we notice we've hit the max.
5964 */
5965 memorystatus_log_error("memorystatus: WARNING freezer snapshot buffer is full, count %zu\n",
5966 memorystatus_jetsam_snapshot_freezer->entry_count);
5967 }
5968 }
5969 #endif /* CONFIG_FREEZE */
5970 } else {
5971 /*
5972 * If we reach here, the snapshot buffer could not be updated.
5973 * Most likely, the buffer is full, in which case we would have
5974 * logged a warning in the previous call.
5975 *
5976 * For now, we will stop appending snapshot entries.
5977 * When the buffer is consumed, the snapshot state will reset.
5978 */
5979
5980 memorystatus_log_error(
5981 "memorystatus_update_jetsam_snapshot_entry_locked: failed to update pid %d, priority %d, count %d\n",
5982 proc_getpid(p), p->p_memstat_effectivepriority, memorystatus_jetsam_snapshot_count);
5983
5984 #if CONFIG_FREEZE
5985 /* We still attempt to record this in the freezer snapshot */
5986 if (memorystatus_jetsam_use_freezer_snapshot && isApp(p)) {
5987 snapshot = memorystatus_jetsam_snapshot_freezer;
5988 if (snapshot->entry_count < memorystatus_jetsam_snapshot_freezer_max) {
5989 copied_to_freezer_snapshot = memorystatus_init_jetsam_snapshot_entry_with_kill_locked(snapshot, p, kill_cause, killtime, &entry);
5990 if (copied_to_freezer_snapshot && memorystatus_jetsam_snapshot_freezer->entry_count == memorystatus_jetsam_snapshot_freezer_max) {
5991 /*
5992 * We just used the last slot in the freezer snapshot buffer.
5993 * We only want to log it once... so we do it here
5994 * when we notice we've hit the max.
5995 */
5996 memorystatus_log_error("memorystatus: WARNING freezer snapshot buffer is full, count %zu\n",
5997 memorystatus_jetsam_snapshot_freezer->entry_count);
5998 }
5999 }
6000 }
6001 #endif /* CONFIG_FREEZE */
6002 }
6003
6004 return;
6005 }
6006
6007 uint32_t
memorystatus_get_available_page_count(void)6008 memorystatus_get_available_page_count(void)
6009 {
6010 return os_atomic_load(&memorystatus_available_pages, relaxed);
6011 }
6012
6013 void
memorystatus_update_available_page_count(uint32_t available_page_count)6014 memorystatus_update_available_page_count(uint32_t available_page_count)
6015 {
6016 os_atomic_store(&memorystatus_available_pages, available_page_count,
6017 relaxed);
6018 #if VM_PRESSURE_EVENTS
6019 /*
6020 * Since memorystatus_available_pages changes, we should
6021 * re-evaluate the pressure levels on the system and
6022 * check if we need to wake the pressure thread.
6023 * We also update memorystatus_level in that routine.
6024 */
6025 vm_pressure_response();
6026 #endif /* VM_PRESSURE_EVENTS */
6027 #if CONFIG_FREEZE
6028 /*
6029 * We can't grab the freezer_mutex here even though that synchronization would be correct to inspect
6030 * the # of frozen processes and wakeup the freezer thread. Reason being that we come here into this
6031 * code with (possibly) the page-queue locks held and preemption disabled. So trying to grab a mutex here
6032 * will result in the "mutex with preemption disabled" panic.
6033 */
6034
6035 if (memorystatus_freeze_thread_should_run()) {
6036 /*
6037 * The freezer thread is usually woken up by some user-space call i.e. pid_hibernate(any process).
6038 * That trigger isn't invoked often enough and so we are enabling this explicit wakeup here.
6039 */
6040 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
6041 thread_wakeup((event_t)&memorystatus_freeze_wakeup);
6042 }
6043 }
6044 #endif /* CONFIG_FREEZE */
6045 _memstat_consider_waking_jetsam_thread();
6046 }
6047
6048 static boolean_t
memorystatus_init_jetsam_snapshot_entry_locked(proc_t p,memorystatus_jetsam_snapshot_entry_t * entry,uint64_t gencount)6049 memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry, uint64_t gencount)
6050 {
6051 clock_sec_t tv_sec;
6052 clock_usec_t tv_usec;
6053 uint32_t pages = 0;
6054 uint32_t max_pages_lifetime = 0;
6055 uint32_t purgeable_pages = 0;
6056 uint64_t internal_pages = 0;
6057 uint64_t internal_compressed_pages = 0;
6058 uint64_t purgeable_nonvolatile_pages = 0;
6059 uint64_t purgeable_nonvolatile_compressed_pages = 0;
6060 uint64_t alternate_accounting_pages = 0;
6061 uint64_t alternate_accounting_compressed_pages = 0;
6062 uint64_t iokit_mapped_pages = 0;
6063 uint64_t page_table_pages = 0;
6064 uint64_t frozen_to_swap_pages = 0;
6065 uint64_t neural_nofootprint_total_pages = 0;
6066 uint64_t region_count = 0;
6067 uint64_t cids[COALITION_NUM_TYPES];
6068 uint32_t trust = 0;
6069 kern_return_t ret = 0;
6070 memset(entry, 0, sizeof(memorystatus_jetsam_snapshot_entry_t));
6071
6072 entry->pid = proc_getpid(p);
6073 strlcpy(&entry->name[0], p->p_name, sizeof(entry->name));
6074 entry->priority = p->p_memstat_effectivepriority;
6075
6076 memorystatus_get_task_page_counts(proc_task(p), &pages, &max_pages_lifetime, &purgeable_pages);
6077 entry->pages = (uint64_t)pages;
6078 entry->max_pages_lifetime = (uint64_t)max_pages_lifetime;
6079 entry->purgeable_pages = (uint64_t)purgeable_pages;
6080
6081 memorystatus_get_task_phys_footprint_page_counts(proc_task(p), &internal_pages, &internal_compressed_pages,
6082 &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages,
6083 &alternate_accounting_pages, &alternate_accounting_compressed_pages,
6084 &iokit_mapped_pages, &page_table_pages, &frozen_to_swap_pages, &neural_nofootprint_total_pages);
6085
6086 entry->jse_internal_pages = internal_pages;
6087 entry->jse_internal_compressed_pages = internal_compressed_pages;
6088 entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages;
6089 entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages;
6090 entry->jse_alternate_accounting_pages = alternate_accounting_pages;
6091 entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages;
6092 entry->jse_iokit_mapped_pages = iokit_mapped_pages;
6093 entry->jse_page_table_pages = page_table_pages;
6094 entry->jse_frozen_to_swap_pages = frozen_to_swap_pages;
6095 entry->jse_neural_nofootprint_total_pages = neural_nofootprint_total_pages;
6096
6097 memorystatus_get_task_memory_region_count(proc_task(p), ®ion_count);
6098 entry->jse_memory_region_count = region_count;
6099
6100 entry->state = _memstat_build_state(p);
6101 entry->user_data = p->p_memstat_userdata;
6102 proc_getexecutableuuid(p, &entry->uuid[0], sizeof(entry->uuid));
6103 entry->fds = p->p_fd.fd_nfiles;
6104
6105 absolutetime_to_microtime(get_task_cpu_time(proc_task(p)), &tv_sec, &tv_usec);
6106 entry->cpu_time.tv_sec = (int64_t)tv_sec;
6107 entry->cpu_time.tv_usec = (int64_t)tv_usec;
6108
6109 assert(p->p_stats != NULL);
6110 entry->jse_starttime = p->p_stats->ps_start; /* abstime process started */
6111 entry->jse_killtime = 0; /* abstime jetsam chose to kill process */
6112 entry->killed = 0; /* the jetsam kill cause */
6113 entry->jse_gencount = gencount; /* indicates a pass through jetsam thread, when process was targeted to be killed */
6114
6115 entry->jse_idle_delta = p->p_memstat_idle_delta; /* Most recent timespan spent in idle-band */
6116 entry->jse_prio_start = p->p_memstat_prio_start; /* Time moved to current band */
6117
6118 #if CONFIG_FREEZE
6119 entry->jse_freeze_skip_reason = p->p_memstat_freeze_skip_reason;
6120 entry->jse_thaw_count = p->p_memstat_thaw_count;
6121 #else /* CONFIG_FREEZE */
6122 entry->jse_thaw_count = 0;
6123 entry->jse_freeze_skip_reason = kMemorystatusFreezeSkipReasonNone;
6124 #endif /* CONFIG_FREEZE */
6125
6126 proc_coalitionids(p, cids);
6127 entry->jse_coalition_jetsam_id = cids[COALITION_TYPE_JETSAM];
6128 entry->csflags = proc_getcsflags(p);
6129 ret = get_trust_level_kdp(get_task_pmap(proc_task(p)), &trust);
6130 if (ret != KERN_SUCCESS) {
6131 trust = KCDATA_INVALID_CS_TRUST_LEVEL;
6132 }
6133 entry->cs_trust_level = trust;
6134 return TRUE;
6135 }
6136
6137 static void
memorystatus_init_snapshot_vmstats(memorystatus_jetsam_snapshot_t * snapshot)6138 memorystatus_init_snapshot_vmstats(memorystatus_jetsam_snapshot_t *snapshot)
6139 {
6140 kern_return_t kr = KERN_SUCCESS;
6141 mach_msg_type_number_t count = HOST_VM_INFO64_COUNT;
6142 vm_statistics64_data_t vm_stat;
6143
6144 if ((kr = host_statistics64(host_self(), HOST_VM_INFO64, (host_info64_t)&vm_stat, &count)) != KERN_SUCCESS) {
6145 memorystatus_log_error("memorystatus_init_jetsam_snapshot_stats: host_statistics64 failed with %d\n", kr);
6146 memset(&snapshot->stats, 0, sizeof(snapshot->stats));
6147 } else {
6148 snapshot->stats.free_pages = vm_stat.free_count;
6149 snapshot->stats.active_pages = vm_stat.active_count;
6150 snapshot->stats.inactive_pages = vm_stat.inactive_count;
6151 snapshot->stats.throttled_pages = vm_stat.throttled_count;
6152 snapshot->stats.purgeable_pages = vm_stat.purgeable_count;
6153 snapshot->stats.wired_pages = vm_stat.wire_count;
6154
6155 snapshot->stats.speculative_pages = vm_stat.speculative_count;
6156 snapshot->stats.filebacked_pages = vm_stat.external_page_count;
6157 snapshot->stats.anonymous_pages = vm_stat.internal_page_count;
6158 snapshot->stats.compressions = vm_stat.compressions;
6159 snapshot->stats.decompressions = vm_stat.decompressions;
6160 snapshot->stats.compressor_pages = vm_stat.compressor_page_count;
6161 snapshot->stats.total_uncompressed_pages_in_compressor = vm_stat.total_uncompressed_pages_in_compressor;
6162 }
6163
6164 get_zone_map_size(&snapshot->stats.zone_map_size, &snapshot->stats.zone_map_capacity);
6165
6166 bzero(snapshot->stats.largest_zone_name, sizeof(snapshot->stats.largest_zone_name));
6167 get_largest_zone_info(snapshot->stats.largest_zone_name, sizeof(snapshot->stats.largest_zone_name),
6168 &snapshot->stats.largest_zone_size);
6169 }
6170
6171 /*
6172 * Collect vm statistics at boot.
6173 * Called only once (see kern_exec.c)
6174 * Data can be consumed at any time.
6175 */
6176 void
memorystatus_init_at_boot_snapshot()6177 memorystatus_init_at_boot_snapshot()
6178 {
6179 memorystatus_init_snapshot_vmstats(&memorystatus_at_boot_snapshot);
6180 memorystatus_at_boot_snapshot.entry_count = 0;
6181 memorystatus_at_boot_snapshot.notification_time = 0; /* updated when consumed */
6182 memorystatus_at_boot_snapshot.snapshot_time = mach_absolute_time();
6183 }
6184
6185 static void
memorystatus_init_jetsam_snapshot_header(memorystatus_jetsam_snapshot_t * snapshot)6186 memorystatus_init_jetsam_snapshot_header(memorystatus_jetsam_snapshot_t *snapshot)
6187 {
6188 memorystatus_init_snapshot_vmstats(snapshot);
6189 snapshot->snapshot_time = mach_absolute_time();
6190 snapshot->notification_time = 0;
6191 snapshot->js_gencount = 0;
6192 }
6193
/*
 * (Re)build a jetsam snapshot by walking every process in the memorystatus
 * priority buckets.
 *
 * od_snapshot:    if non-NULL, an on-demand snapshot buffer to fill, with
 *                 room for 'ods_list_count' entries. If NULL, the global
 *                 jetsam-event snapshot buffer is (re)initialized and
 *                 memorystatus_jetsam_snapshot_count is updated.
 *
 * launchd (initproc) and kernel_task (kernproc) are appended at the end for
 * extra context, even though jetsam never targets them.
 *
 * Caller holds the proc list lock.
 */
static void
memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snapshot, uint32_t ods_list_count )
{
	proc_t p, next_p;
	unsigned int b = 0, i = 0;

	memorystatus_jetsam_snapshot_t *snapshot = NULL;
	memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL;
	unsigned int snapshot_max = 0;

	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);

	if (od_snapshot) {
		/*
		 * This is an on_demand snapshot
		 */
		snapshot = od_snapshot;
		snapshot_list = od_snapshot->entries;
		snapshot_max = ods_list_count;
	} else {
		/*
		 * This is a jetsam event snapshot
		 */
		snapshot = memorystatus_jetsam_snapshot;
		snapshot_list = memorystatus_jetsam_snapshot->entries;
		snapshot_max = memorystatus_jetsam_snapshot_max;
	}

	memorystatus_init_jetsam_snapshot_header(snapshot);

	/* Walk all buckets; processes beyond snapshot_max are silently dropped. */
	next_p = memorystatus_get_first_proc_locked(&b, TRUE);
	while (next_p) {
		p = next_p;
		next_p = memorystatus_get_next_proc_locked(&b, p, TRUE);

		if (FALSE == memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[i], snapshot->js_gencount)) {
			continue;
		}

		if (++i == snapshot_max) {
			break;
		}
	}

	/* Log launchd and kernel_task as well to see more context, even though jetsam doesn't apply to them. */
	if (i < snapshot_max) {
		memorystatus_init_jetsam_snapshot_entry_locked(initproc, &snapshot_list[i], snapshot->js_gencount);
		i++;
	}

	if (i < snapshot_max) {
		memorystatus_init_jetsam_snapshot_entry_locked(kernproc, &snapshot_list[i], snapshot->js_gencount);
		i++;
	}

	snapshot->entry_count = i;

	if (!od_snapshot) {
		/* update the system buffer count */
		memorystatus_jetsam_snapshot_count = i;
	}
}
6256
/*
 * Number of entries appended to the end of a jetsam snapshot for context
 * (launchd and kernel_task); jetsam itself never targets those processes.
 */
static const int memorystatus_artificial_snapshot_entry_count = 2;
6259
6260 #if DEVELOPMENT || DEBUG
6261
6262 /*
6263 * Fills an array with the given pids in the order they are seen in a
6264 * jetsam band.
6265 */
6266 static int
memorystatus_get_sort_order(unsigned int bucket_index,pid_t * pids,pid_t * order,size_t num_pids)6267 memorystatus_get_sort_order(
6268 unsigned int bucket_index,
6269 pid_t *pids,
6270 pid_t *order,
6271 size_t num_pids)
6272 {
6273 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
6274
6275 proc_t p = NULL;
6276 size_t i, out_idx = 0;
6277
6278 /*
6279 * Read out the order of all the pids into the order array.
6280 */
6281 p = memorystatus_get_first_proc_locked(&bucket_index, FALSE);
6282 while (p) {
6283 for (i = 0; i < num_pids; i++) {
6284 if (pids[i] == proc_getpid(p)) {
6285 if (out_idx >= num_pids) {
6286 /* Did we somehow see something twice? */
6287 return EINVAL;
6288 }
6289 order[out_idx] = pids[i];
6290 out_idx++;
6291 }
6292 }
6293 p = memorystatus_get_next_proc_locked(&bucket_index, p, FALSE);
6294 }
6295 return 0;
6296 }
6297
6298 /*
6299 * Triggers a sort_order on a specified jetsam priority band.
6300 * This is for testing only, used to force a path through the sort
6301 * function.
6302 */
6303 static int
memorystatus_cmd_test_jetsam_sort(int priority,int sort_order,user_addr_t expected_order_user,size_t expected_order_user_len)6304 memorystatus_cmd_test_jetsam_sort(int priority,
6305 int sort_order,
6306 user_addr_t expected_order_user,
6307 size_t expected_order_user_len)
6308 {
6309 pid_t *expected_order, *actual_order;
6310 int error = 0;
6311 size_t num_pids = expected_order_user_len / sizeof(pid_t);
6312
6313 if (num_pids > 512) { /* Just so we don't allocate some huge buffer */
6314 return EINVAL;
6315 }
6316
6317 if ((priority < 0) || (priority >= MEMSTAT_BUCKET_COUNT)) {
6318 return EINVAL;
6319 }
6320
6321 expected_order = kalloc_data_tag(num_pids * sizeof(pid_t), Z_WAITOK, VM_KERN_MEMORY_DIAG);
6322 actual_order = kalloc_data_tag(num_pids * sizeof(pid_t), Z_WAITOK, VM_KERN_MEMORY_DIAG);
6323
6324 error = copyin(expected_order_user, expected_order, expected_order_user_len);
6325 if (error != 0) {
6326 goto err;
6327 }
6328
6329 /*
6330 * Acquire lock before sorting so we can check the sort order
6331 * while still holding the lock.
6332 */
6333 proc_list_lock();
6334
6335 memstat_sort_bucket_locked(priority, sort_order);
6336
6337 if (expected_order_user != CAST_USER_ADDR_T(NULL) && expected_order_user_len > 0) {
6338 bzero(actual_order, num_pids * sizeof(pid_t));
6339 error = memorystatus_get_sort_order(priority, expected_order, actual_order, num_pids);
6340 /* Even if we get an error, we still want to copyout what we had */
6341 copyout(actual_order, expected_order_user, num_pids * sizeof(pid_t));
6342 }
6343
6344 proc_list_unlock();
6345
6346 err:
6347 kfree_data(expected_order, num_pids * sizeof(pid_t));
6348 kfree_data(actual_order, num_pids * sizeof(pid_t));
6349 return error;
6350 }
6351
6352 #endif /* DEVELOPMENT || DEBUG */
6353
6354 /*
6355 * Prepare the process to be killed (set state, update snapshot) and kill it.
6356 */
6357 static uint64_t memorystatus_purge_before_jetsam_success = 0;
6358
6359 #if SOCKETS
/*
 * Notify networking subsystems that hold state on behalf of 'p' of a
 * memorystatus event ('status' is the cause being delivered). Walks the
 * process's open file descriptors and dispatches to NECP and Skywalk for
 * their respective descriptor types.
 *
 * Always returns 1.
 */
static int
networking_memstatus_callout(proc_t p, uint32_t status)
{
	struct fileproc *fp;

	/*
	 * proc list lock NOT held
	 * proc lock NOT held
	 * a reference on the proc has been held / shall be dropped by the caller.
	 */
	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED);
	LCK_MTX_ASSERT(&p->p_mlock, LCK_MTX_ASSERT_NOTOWNED);

	proc_fdlock(p);

	fdt_foreach(fp, p) {
		switch (FILEGLOB_DTYPE(fp->fp_glob)) {
#if NECP
		case DTYPE_NETPOLICY:
			necp_fd_memstatus(p, status,
			    (struct necp_fd_data *)fp_get_data(fp));
			break;
#endif /* NECP */
#if SKYWALK
		case DTYPE_CHANNEL:
			kern_channel_memstatus(p, status,
			    (struct kern_channel *)fp_get_data(fp));
			break;
#endif /* SKYWALK */
		default:
			break;
		}
	}
	proc_fdunlock(p);

	return 1;
}
6397 #endif /* SOCKETS */
6398
6399 static bool
memorystatus_kill_proc(proc_t p,uint32_t cause,os_reason_t jetsam_reason,bool * killed,uint64_t * footprint_out)6400 memorystatus_kill_proc(proc_t p, uint32_t cause, os_reason_t jetsam_reason, bool *killed, uint64_t *footprint_out)
6401 {
6402 pid_t aPid = 0;
6403 uint32_t aPid_ep = 0;
6404
6405 uint64_t killtime = 0;
6406 uint64_t time_in_priority_band_secs = 0;
6407 clock_sec_t tv_sec;
6408 clock_usec_t tv_usec;
6409 uint32_t tv_msec;
6410 bool retval = false;
6411
6412 aPid = proc_getpid(p);
6413 aPid_ep = p->p_memstat_effectivepriority;
6414
6415 if (cause != kMemorystatusKilledVnodes && cause != kMemorystatusKilledZoneMapExhaustion) {
6416 /*
6417 * Genuine memory pressure and not other (vnode/zone) resource exhaustion.
6418 */
6419 boolean_t success = FALSE;
6420 uint64_t num_pages_purged;
6421 uint64_t num_pages_reclaimed = 0;
6422 uint64_t num_pages_unsecluded = 0;
6423
6424 networking_memstatus_callout(p, cause);
6425 num_pages_purged = vm_purgeable_purge_task_owned(proc_task(p));
6426 num_pages_reclaimed += num_pages_purged;
6427 #if CONFIG_SECLUDED_MEMORY
6428 if (cause == kMemorystatusKilledVMPageShortage &&
6429 vm_page_secluded_count > 0 &&
6430 task_can_use_secluded_mem(proc_task(p), FALSE)) {
6431 /*
6432 * We're about to kill a process that has access
6433 * to the secluded pool. Drain that pool into the
6434 * free or active queues to make these pages re-appear
6435 * as "available", which might make us no longer need
6436 * to kill that process.
6437 * Since the secluded pool does not get refilled while
6438 * a process has access to it, it should remain
6439 * drained.
6440 */
6441 num_pages_unsecluded = vm_page_secluded_drain();
6442 num_pages_reclaimed += num_pages_unsecluded;
6443 }
6444 #endif /* CONFIG_SECLUDED_MEMORY */
6445
6446 if (num_pages_reclaimed) {
6447 /*
6448 * We actually reclaimed something and so let's
6449 * check if we need to continue with the kill.
6450 */
6451 if (cause == kMemorystatusKilledHiwat) {
6452 uint64_t footprint_in_bytes = get_task_phys_footprint(proc_task(p));
6453 uint64_t memlimit_in_bytes = (((uint64_t)p->p_memstat_memlimit) * 1024ULL * 1024ULL); /* convert MB to bytes */
6454 success = (footprint_in_bytes <= memlimit_in_bytes);
6455 } else {
6456 success = !(memorystatus_get_available_page_count() < memorystatus_get_soft_memlimit_page_shortage_threshold());
6457 #if CONFIG_SECLUDED_MEMORY
6458 if (!success && num_pages_unsecluded) {
6459 /*
6460 * We just drained the secluded pool
6461 * because we're about to kill a
6462 * process that has access to it.
6463 * This is an important process and
6464 * we'd rather not kill it unless
6465 * absolutely necessary, so declare
6466 * success even if draining the pool
6467 * did not quite get us out of the
6468 * "pressure" level but still got
6469 * us out of the "critical" level.
6470 */
6471 success = !(
6472 memorystatus_get_available_page_count() <
6473 memorystatus_get_critical_page_shortage_threshold());
6474 }
6475 #endif /* CONFIG_SECLUDED_MEMORY */
6476 }
6477
6478 if (success) {
6479 memorystatus_purge_before_jetsam_success++;
6480
6481 memorystatus_log_info("memorystatus: reclaimed %llu pages (%llu purged, %llu unsecluded) from pid %d [%s] and avoided %s\n",
6482 num_pages_reclaimed, num_pages_purged, num_pages_unsecluded, aPid, ((p && *p->p_name) ? p->p_name : "unknown"), memstat_kill_cause_name[cause]);
6483
6484 *killed = false;
6485 *footprint_out = num_pages_reclaimed + num_pages_purged + num_pages_unsecluded;
6486
6487 return true;
6488 }
6489 }
6490 }
6491
6492 killtime = mach_absolute_time();
6493 absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
6494 tv_msec = tv_usec / 1000;
6495
6496 proc_list_lock();
6497 memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
6498 proc_list_unlock();
6499
6500 char kill_reason_string[128];
6501
6502 if (cause == kMemorystatusKilledHiwat) {
6503 strlcpy(kill_reason_string, "killing_highwater_process", 128);
6504 } else {
6505 if (aPid_ep == JETSAM_PRIORITY_IDLE) {
6506 strlcpy(kill_reason_string, "killing_idle_process", 128);
6507 } else {
6508 strlcpy(kill_reason_string, "killing_top_process", 128);
6509 }
6510 }
6511
6512 /*
6513 * memorystatus_do_kill drops a reference, so take another one so we can
6514 * continue to use this exit reason even after memorystatus_do_kill()
6515 * returns
6516 */
6517 os_reason_ref(jetsam_reason);
6518
6519 retval = memorystatus_do_kill(p, cause, jetsam_reason, footprint_out);
6520 *killed = retval;
6521
6522 absolutetime_to_nanoseconds(killtime - p->p_memstat_prio_start, &time_in_priority_band_secs);
6523 time_in_priority_band_secs /= NSEC_PER_SEC;
6524
6525 memorystatus_log("memorystatus: %s pid %d [%s] (%s %d %llus rf:%s type:%s) %lluKB - memorystatus_available_pages: %llu compressor_size:%u\n",
6526 kill_reason_string,
6527 aPid, proc_best_name(p),
6528 memstat_kill_cause_name[cause], aPid_ep, time_in_priority_band_secs,
6529 _memstat_relaunch_flags_description(p->p_memstat_relaunch_flags), _memstat_proc_type_description(p),
6530 (*footprint_out) >> 10, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
6531
6532 return retval;
6533 }
6534
6535 /*
6536 * Jetsam the first process in the queue.
6537 */
6538 static bool
memstat_kill_top_process(uint32_t cause,os_reason_t jetsam_reason,int32_t max_priority,memstat_kill_options_t options,int32_t * priority_out,uint32_t * errors_out,uint64_t * memory_reclaimed_out)6539 memstat_kill_top_process(uint32_t cause, os_reason_t jetsam_reason,
6540 int32_t max_priority, memstat_kill_options_t options,
6541 int32_t *priority_out, uint32_t *errors_out, uint64_t *memory_reclaimed_out)
6542 {
6543 pid_t aPid;
6544 proc_t p = PROC_NULL, next_p = PROC_NULL;
6545 bool new_snapshot = false, force_new_snapshot = false, killed = false, freed_mem = false;
6546 unsigned int i = 0;
6547 uint32_t aPid_ep;
6548 uint64_t footprint_of_killed_proc = 0;
6549
6550 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
6551 MEMORYSTATUS_LOG_AVAILABLE_PAGES);
6552
6553 bool only_long_idle = options & MEMSTAT_ONLY_LONG_IDLE;
6554 bool only_swappable = options & MEMSTAT_ONLY_SWAPPABBLE;
6555 bool sort_bucket = options & MEMSTAT_SORT_BUCKET;
6556
6557 #if CONFIG_JETSAM
6558 if (sort_bucket) {
6559 (void)memstat_sort_bucket(JETSAM_PRIORITY_FOREGROUND, memstat_jetsam_fg_sort_order);
6560 }
6561
6562 if (memory_reclaimed_out) {
6563 *memory_reclaimed_out = 0;
6564 }
6565
6566 force_new_snapshot = false;
6567
6568 #else /* CONFIG_JETSAM */
6569 if (sort_bucket) {
6570 (void)memstat_sort_bucket(JETSAM_PRIORITY_IDLE, JETSAM_SORT_IDLE_DEFAULT);
6571 }
6572
6573 /*
6574 * And, because we are here under extreme circumstances, we force a snapshot even for
6575 * IDLE kills.
6576 */
6577 force_new_snapshot = true;
6578
6579 #endif /* CONFIG_JETSAM */
6580
6581 if (cause != kMemorystatusKilledZoneMapExhaustion &&
6582 jetsam_current_thread() != NULL &&
6583 jetsam_current_thread()->limit_to_low_bands &&
6584 max_priority > JETSAM_PRIORITY_MAIL) {
6585 max_priority = JETSAM_PRIORITY_MAIL;
6586 }
6587
6588 _memstat_refresh_oldest_reapable_proc_info();
6589
6590 proc_list_lock();
6591
6592 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
6593 while (next_p && (next_p->p_memstat_effectivepriority <= max_priority)) {
6594 p = next_p;
6595 next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
6596
6597
6598 aPid = proc_getpid(p);
6599 aPid_ep = p->p_memstat_effectivepriority;
6600
6601 if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP)) {
6602 continue; /* with lock held */
6603 }
6604
6605 if (cause == kMemorystatusKilledVnodes) {
6606 /*
6607 * If the system runs out of vnodes, we systematically jetsam
6608 * processes in hopes of stumbling onto a vnode gain that helps
6609 * the system recover. The process that happens to trigger
6610 * this path has no known relationship to the vnode shortage.
6611 * Deadlock avoidance: attempt to safeguard the caller.
6612 */
6613
6614 if (p == current_proc()) {
6615 /* do not jetsam the current process */
6616 continue;
6617 }
6618 }
6619
6620 if (only_swappable && !task_donates_own_pages(proc_task(p))) {
6621 continue;
6622 }
6623
6624 if (only_long_idle) {
6625 if (!_memstat_proc_is_reapable(p)) {
6626 memorystatus_log_debug("memorystatus: memstat_kill_top_process: skipping non-reapable process %s [%d]\n",
6627 proc_best_name(p), p->p_pid);
6628 continue;
6629 }
6630 memorystatus_log_debug("memorystatus: memstat_kill_top_process: found reapable long-idle process %s [%d]\n",
6631 proc_best_name(p), p->p_pid);
6632 }
6633
6634 #if !CONFIG_JETSAM
6635 if (max_priority == JETSAM_PRIORITY_IDLE &&
6636 ((p->p_memstat_dirty & (P_DIRTY_ALLOW_IDLE_EXIT | P_DIRTY_IS_DIRTY | P_DIRTY_TERMINATED)) != (P_DIRTY_ALLOW_IDLE_EXIT))) {
6637 /*
6638 * This process is in the idle band but is not clean+idle-exitable or
6639 * managed+assertion-less. Skip it.
6640 */
6641 memorystatus_log_error("memorystatus: skipping idle but not idle-exitable process "
6642 "%s [%d] (0x%x)\n", proc_best_name(p), proc_getpid(p), p->p_memstat_state);
6643 continue;
6644 }
6645 #endif /* !CONFIG_JETSAM */
6646 #if CONFIG_FREEZE
6647 if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
6648 continue;
6649 }
6650 #endif
6651 if (proc_ref(p, true) == p) {
6652 /*
6653 * Mark as terminated so that if exit1() indicates success, but the process (for example)
6654 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
6655 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
6656 * acquisition of the proc lock.
6657 */
6658 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
6659 } else {
6660 /*
6661 * We need to restart the search again because
6662 * proc_ref _can_ drop the proc_list lock
6663 * and we could have lost our stored next_p via
6664 * an exit() on another core.
6665 */
6666 i = 0;
6667 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
6668 continue;
6669 }
6670
6671 /*
6672 * Capture a snapshot if none exists and:
6673 * - we are forcing a new snapshot creation, either because:
6674 * - on a particular platform we need these snapshots every time, OR
6675 * - a boot-arg/embedded device tree property has been set.
6676 * - priority was not requested (this is something other than an ambient kill)
6677 * - the priority was requested *and* the targeted process is not at idle priority
6678 */
6679 if ((memorystatus_jetsam_snapshot_count == 0) &&
6680 (force_new_snapshot || memorystatus_idle_snapshot || ((!priority_out) || (priority_out && (aPid_ep != JETSAM_PRIORITY_IDLE))))) {
6681 memorystatus_init_jetsam_snapshot_locked(NULL, 0);
6682 new_snapshot = true;
6683 }
6684
6685 proc_list_unlock();
6686
6687 freed_mem = memorystatus_kill_proc(p, cause, jetsam_reason, &killed, &footprint_of_killed_proc); /* purged and/or killed 'p' */
6688 /* Success? */
6689 if (freed_mem) {
6690 if (memory_reclaimed_out) {
6691 *memory_reclaimed_out = footprint_of_killed_proc;
6692 }
6693 if (killed) {
6694 if (priority_out) {
6695 *priority_out = aPid_ep;
6696 }
6697 } else {
6698 /* purged */
6699 proc_list_lock();
6700 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
6701 proc_list_unlock();
6702 }
6703 proc_rele(p);
6704 goto exit;
6705 }
6706
6707 /*
6708 * Failure - first unwind the state,
6709 * then fall through to restart the search.
6710 */
6711 proc_list_lock();
6712 proc_rele(p);
6713 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
6714 p->p_memstat_state |= P_MEMSTAT_ERROR;
6715 if (errors_out) {
6716 *errors_out += 1;
6717 }
6718
6719 i = 0;
6720 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
6721 }
6722
6723 proc_list_unlock();
6724
6725 exit:
6726 os_reason_free(jetsam_reason);
6727
6728 if (!killed) {
6729 /* Clear snapshot if freshly captured and no target was found */
6730 if (new_snapshot) {
6731 proc_list_lock();
6732 memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
6733 proc_list_unlock();
6734 }
6735 #if CONFIG_JETSAM
6736 if (only_long_idle) {
6737 _memstat_reaper_end_sweep();
6738 }
6739 #endif
6740 }
6741
6742 #if CONFIG_JETSAM
6743 if (killed && only_long_idle) {
6744 _memstat_reaper_record_kill(footprint_of_killed_proc);
6745 }
6746 #endif
6747
6748 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
6749 MEMORYSTATUS_LOG_AVAILABLE_PAGES, killed ? aPid : 0, killed, footprint_of_killed_proc);
6750
6751 return killed;
6752 }
6753
6754 static
6755 void
_memstat_refresh_oldest_reapable_proc_info()6756 _memstat_refresh_oldest_reapable_proc_info()
6757 {
6758 uint64_t oldest_prio_start = MEMSTAT_OLDEST_REAPABLE_PROC_PRIO_START_NONE;
6759 proc_t p = PROC_NULL, next_p = PROC_NULL;
6760 unsigned int i = 0;
6761 uint64_t mat = mach_absolute_time();
6762 uint64_t rescan_timeout_duration_matu;
6763
6764
6765 /* If we're still within 'memstat_reaper_rescan_secs' of the last process-list walk,
6766 * don't do another walk yet, and just use the existing information.
6767 */
6768 if (mat < memstat_oldest_reapable_proc_info_expiration_ts_matu) {
6769 memorystatus_log_debug("memorystatus: _memstat_refresh_oldest_reapable_proc_info: re-using existing data\n");
6770 return;
6771 } else {
6772 memorystatus_log_debug("memorystatus: _memstat_refresh_oldest_reapable_proc_info: rescanning proc list\n");
6773 }
6774
6775 proc_list_lock();
6776
6777 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
6778 while (next_p) {
6779 p = next_p;
6780 next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
6781
6782 /* Since the process list is sorted in priority order, once we find the first
6783 * process that is out of the reaper's acceptible range, we can skip the rest
6784 */
6785 if (p->p_memstat_effectivepriority > memstat_reaper_max_priority) {
6786 break;
6787 }
6788
6789 if (_memstat_proc_is_reapable(p)) {
6790 uint64_t proc_prio_start = p->p_memstat_prio_start;
6791 if (proc_prio_start < oldest_prio_start) {
6792 oldest_prio_start = proc_prio_start;
6793 /* Since the process list is sorted in age order within priority bands,
6794 * the first process will be the oldest one, and we can bail out and skip the rest
6795 */
6796 break;
6797 }
6798 }
6799 }
6800
6801 proc_list_unlock();
6802
6803 memstat_oldest_reapable_proc_prio_start = oldest_prio_start;
6804
6805 if (memstat_oldest_reapable_proc_prio_start != MEMSTAT_OLDEST_REAPABLE_PROC_PRIO_START_NONE) {
6806 uint64_t min_age_matu;
6807 nanoseconds_to_absolutetime((memstat_reaper_min_age_secs * NSEC_PER_SEC), &min_age_matu);
6808 memstat_oldest_reapable_proc_will_be_reapable_at_ts_matu = memstat_oldest_reapable_proc_prio_start + min_age_matu;
6809 }
6810
6811 nanoseconds_to_absolutetime((memstat_reaper_rescan_secs * NSEC_PER_SEC), &rescan_timeout_duration_matu);
6812 memstat_oldest_reapable_proc_info_expiration_ts_matu = mat + rescan_timeout_duration_matu;
6813 }
6814
6815 static bool
_memstat_proc_is_reapable(proc_t proc)6816 _memstat_proc_is_reapable(proc_t proc)
6817 {
6818 uint32_t priority_band;
6819 uint64_t time_in_priority_band_secs;
6820 uint32_t relaunch_probability_acceptable_mask;
6821
6822 /*
6823 * To be potentially reapable, the process
6824 * - must be in or below the max reapable priority and
6825 * - must not have a relaunch probability of High or Medium (per memstat_reaper_reap_relaunch_mask)
6826 * - must have been in that priority band longer than the reaper minimum age threshold
6827 * - must have been in that priority band longer than the reaper minimum age threshold for applications, if process is an application
6828 */
6829 priority_band = proc->p_memstat_effectivepriority;
6830 if (priority_band > memstat_reaper_max_priority) {
6831 memorystatus_log_debug("_memstat_proc_is_reapable: %s [%d] not reapable because priority (%d) is above threshold (%d)\n",
6832 proc_best_name(proc), proc->p_pid, priority_band, memstat_reaper_max_priority);
6833 return false;
6834 }
6835
6836 uint32_t relaunch_flags = proc->p_memstat_relaunch_flags;
6837 // There's no explicit flag for "unknown" relaunch probability, and we need one for our control bitmask.
6838 // So if none of the Low Medium or High bits are set, we set the next higher bit as the "unknown relaunch probability" bit
6839 // and then test all the bits at once, below, with a bitwise-and.
6840 if ((relaunch_flags & (P_MEMSTAT_RELAUNCH_LOW | P_MEMSTAT_RELAUNCH_MED | P_MEMSTAT_RELAUNCH_HIGH)) == 0) {
6841 relaunch_flags |= MEMORYSTATUS_REAPER_REAP_RELAUNCH_MASK_UNKNOWN; // The bit for 'unknown' is the one just to the left (above) of High, e.g. 0x08
6842 }
6843 relaunch_probability_acceptable_mask = relaunch_flags & memstat_reaper_reap_relaunch_mask;
6844
6845 if (relaunch_probability_acceptable_mask == 0) {
6846 memorystatus_log_debug("_memstat_proc_is_reapable: %s [%d] not reapable because relaunch probability bitmask (0x%02X) does not match with the memstat_reaper_reap_relaunch_mask (0x%02X).\n",
6847 proc_best_name(proc), proc->p_pid, relaunch_flags, memstat_reaper_reap_relaunch_mask);
6848 return false;
6849 }
6850
6851 absolutetime_to_nanoseconds(mach_absolute_time() - proc->p_memstat_prio_start, &time_in_priority_band_secs);
6852 time_in_priority_band_secs /= NSEC_PER_SEC;
6853
6854 if (_memstat_proc_is_application(proc)) {
6855 if ((time_in_priority_band_secs < memstat_reaper_min_age_apps_secs)) {
6856 memorystatus_log_debug("_memstat_proc_is_reapable: %s [%d] not reapable because it is an application and age (%llu) is below min age for apps (%d)\n",
6857 proc_best_name(proc), proc->p_pid, time_in_priority_band_secs, memstat_reaper_min_age_apps_secs);
6858 return false;
6859 }
6860 } else {
6861 if (time_in_priority_band_secs < memstat_reaper_min_age_secs) {
6862 memorystatus_log_debug("_memstat_proc_is_reapable: %s [%d] not reapable because age (%llu) is below min age (%d)\n",
6863 proc_best_name(proc), proc->p_pid, time_in_priority_band_secs, memstat_reaper_min_age_secs);
6864 return false;
6865 }
6866 }
6867
6868 memorystatus_log_debug("_memstat_proc_is_reapable: %s [%d] is reapable; priority=%d, age=%d, relaunch_probability_acceptable_mask=0x%02X, type=%s\n",
6869 proc_best_name(proc), proc->p_pid, priority_band, (uint32_t)(time_in_priority_band_secs), relaunch_probability_acceptable_mask,
6870 _memstat_proc_type_description(proc));
6871 return true;
6872 }
6873
6874 static bool
_memstat_proc_is_application(proc_t proc)6875 _memstat_proc_is_application(proc_t proc)
6876 {
6877 bool isApp = false;
6878
6879 task_t task = proc_task(proc);
6880 if (task != NULL) {
6881 isApp = task_is_app( task);
6882 }
6883
6884 return isApp;
6885 }
6886
/*
 * Jetsam aggressively
 *
 * Walk the jetsam priority bands from JETSAM_PRIORITY_IDLE up to and
 * *including* priority_max, killing eligible processes until either
 * max_kills processes have been killed or the bands are exhausted.
 *
 * cause            - jetsam kill cause, recorded in snapshot/exit reason
 * aggr_count       - ordinal of this aggressive sweep (used for logging)
 * priority_max     - highest priority band to kill through
 * max_kills        - cap on the number of kills performed in this sweep
 * errors           - out: incremented once per failed kill attempt
 * memory_reclaimed - out: total footprint (bytes) of all killed processes
 *
 * Returns true if at least one process was killed.
 */
static bool
memorystatus_kill_processes_aggressive(uint32_t cause, int aggr_count,
    int32_t priority_max, int max_kills, uint32_t *errors, uint64_t *memory_reclaimed)
{
	pid_t aPid;
	proc_t p = PROC_NULL, next_p = PROC_NULL;
	boolean_t new_snapshot = FALSE, killed = FALSE;
	int kill_count = 0;
	unsigned int priority_band = JETSAM_PRIORITY_IDLE;
	int32_t aPid_ep = 0;
	unsigned int memorystatus_level_snapshot = 0;
	uint64_t killtime = 0;
	uint64_t time_in_priority_band_secs = 0;
	clock_sec_t tv_sec;
	clock_usec_t tv_usec;
	uint32_t tv_msec;
	os_reason_t jetsam_reason = OS_REASON_NULL;
	uint64_t footprint_of_killed_proc = 0;

	*memory_reclaimed = 0;

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
	    MEMORYSTATUS_LOG_AVAILABLE_PAGES, priority_max);

	if (priority_max >= JETSAM_PRIORITY_FOREGROUND) {
		/*
		 * Check if aggressive jetsam has been asked to kill upto or beyond the
		 * JETSAM_PRIORITY_FOREGROUND bucket. If yes, sort the FG band based on
		 * coalition footprint.
		 */
		memstat_sort_bucket(JETSAM_PRIORITY_FOREGROUND, memstat_jetsam_fg_sort_order);
	}

	jetsam_reason = os_reason_create(OS_REASON_JETSAM, cause);
	if (jetsam_reason == OS_REASON_NULL) {
		/* Non-fatal: the kill path below tolerates a NULL exit reason. */
		memorystatus_log_error("memorystatus_kill_processes_aggressive: failed to allocate exit reason\n");
	}
	memorystatus_log("memorystatus: aggressively killing up to %d processes below band %d.\n", max_kills, priority_max + 1);
	proc_list_lock();

	next_p = memorystatus_get_first_proc_locked(&priority_band, TRUE);
	while (next_p) {
		if (proc_list_exited(next_p) ||
		    ((unsigned int)(next_p->p_memstat_effectivepriority) != priority_band)) {
			/*
			 * We have raced with next_p running on another core.
			 * It may be exiting or it may have moved to a different
			 * jetsam priority band. This means we have lost our
			 * place in line while traversing the jetsam list. We
			 * attempt to recover by rewinding to the beginning of the band
			 * we were already traversing. By doing this, we do not guarantee
			 * that no process escapes this aggressive march, but we can make
			 * skipping an entire range of processes less likely. (PR-21069019)
			 */

			memorystatus_log_debug(
				"memorystatus: aggressive%d: rewinding band %d, %s(%d) moved or exiting.\n",
				aggr_count, priority_band, (*next_p->p_name ? next_p->p_name : "unknown"), proc_getpid(next_p));

			next_p = memorystatus_get_first_proc_locked(&priority_band, TRUE);
			continue;
		}

		p = next_p;
		next_p = memorystatus_get_next_proc_locked(&priority_band, p, TRUE);

		if (p->p_memstat_effectivepriority > priority_max) {
			/*
			 * Bail out of this killing spree if we have
			 * reached beyond the priority_max jetsam band.
			 * That is, we kill up to and through the
			 * priority_max jetsam band.
			 */
			proc_list_unlock();
			goto exit;
		}

		aPid = proc_getpid(p);
		aPid_ep = p->p_memstat_effectivepriority;

		/* Skip procs already being killed, already failed, or explicitly exempted. */
		if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP)) {
			continue;
		}

		/*
		 * Capture a snapshot if none exists.
		 */
		if (memorystatus_jetsam_snapshot_count == 0) {
			memorystatus_init_jetsam_snapshot_locked(NULL, 0);
			new_snapshot = TRUE;
		}

		/*
		 * Mark as terminated so that if exit1() indicates success, but the process (for example)
		 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
		 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
		 * acquisition of the proc lock.
		 */
		p->p_memstat_state |= P_MEMSTAT_TERMINATED;

		killtime = mach_absolute_time();
		absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
		tv_msec = tv_usec / 1000;

		/* Shift queue, update stats */
		memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);

		/*
		 * In order to kill the target process, we will drop the proc_list_lock.
		 * To guaranteee that p and next_p don't disappear out from under the lock,
		 * we must take a ref on both.
		 * If we cannot get a reference, then it's likely we've raced with
		 * that process exiting on another core.
		 */
		if (proc_ref(p, true) == p) {
			if (next_p) {
				/*
				 * Keep advancing next_p until we can take a ref on it;
				 * any proc we cannot ref is assumed to be exiting.
				 */
				while (next_p && (proc_ref(next_p, true) != next_p)) {
					proc_t temp_p;

					/*
					 * We must have raced with next_p exiting on another core.
					 * Recover by getting the next eligible process in the band.
					 */

					memorystatus_log_debug(
						"memorystatus: aggressive%d: skipping %d [%s] (exiting?)\n",
						aggr_count, proc_getpid(next_p), (*next_p->p_name ? next_p->p_name : "(unknown)"));

					temp_p = next_p;
					next_p = memorystatus_get_next_proc_locked(&priority_band, temp_p, TRUE);
				}
			}
			proc_list_unlock();

			if (aPid_ep <= system_procs_aging_band &&
			    (p->p_memstat_relaunch_flags & P_MEMSTAT_RELAUNCH_HIGH)) {
				memorystatus_log("memorystatus: killing %s [%d] in band %d "
				    "with high relaunch probability\n",
				    proc_best_name(p), aPid, aPid_ep);
			}
			absolutetime_to_nanoseconds(killtime - p->p_memstat_prio_start, &time_in_priority_band_secs);
			time_in_priority_band_secs /= NSEC_PER_SEC;
			memorystatus_log(
				"memorystatus: %s%d pid %d [%s] (%s %d %llus rf:%s type:%s) - memorystatus_available_pages: %llu\n",
				((aPid_ep == JETSAM_PRIORITY_IDLE) ? "killing_idle_process_aggressive" : "killing_top_process_aggressive"),
				aggr_count, aPid, proc_best_name(p),
				memstat_kill_cause_name[cause], aPid_ep,
				time_in_priority_band_secs, _memstat_relaunch_flags_description(p->p_memstat_relaunch_flags), _memstat_proc_type_description(p),
				(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);

			/* Remember the pre-kill memory level so we can measure recovery below. */
			memorystatus_level_snapshot = memorystatus_level;

			/*
			 * memorystatus_do_kill() drops a reference, so take another one so we can
			 * continue to use this exit reason even after memorystatus_do_kill()
			 * returns.
			 */
			os_reason_ref(jetsam_reason);
			killed = memorystatus_do_kill(p, cause, jetsam_reason, &footprint_of_killed_proc);

			/* Success? */
			if (killed) {
				*memory_reclaimed += footprint_of_killed_proc;
				proc_rele(p);
				kill_count++;
				p = NULL;
				killed = FALSE;

				/*
				 * Continue the killing spree.
				 */
				proc_list_lock();
				if (next_p) {
					proc_rele(next_p);
				}

				if (kill_count == max_kills) {
					memorystatus_log_info(
						"memorystatus: giving up aggressive kill after killing "
						"%d processes below band %d.\n",
						max_kills, priority_max + 1);
					break;
				}

				/*
				 * In lenient mode, stop the spree early once the memory level has
				 * recovered by at least the lenient-mode threshold; lenient mode
				 * is a one-shot and is disabled after triggering.
				 */
				if (aPid_ep == JETSAM_PRIORITY_FOREGROUND && memorystatus_aggressive_jetsam_lenient == TRUE) {
					if (memorystatus_level > memorystatus_level_snapshot && ((memorystatus_level - memorystatus_level_snapshot) >= AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD)) {
#if DEVELOPMENT || DEBUG
						memorystatus_log_info("Disabling Lenient mode after one-time deployment.\n");
#endif /* DEVELOPMENT || DEBUG */
						memorystatus_aggressive_jetsam_lenient = FALSE;
						break;
					}
				}

				continue;
			}

			/*
			 * Failure - first unwind the state,
			 * then fall through to restart the search.
			 */
			proc_list_lock();
			proc_rele(p);
			if (next_p) {
				proc_rele(next_p);
			}
			p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
			p->p_memstat_state |= P_MEMSTAT_ERROR;
			*errors += 1;
			p = NULL;
		}

		/*
		 * Failure - restart the search at the beginning of
		 * the band we were already traversing.
		 *
		 * We might have raced with "p" exiting on another core, resulting in no
		 * ref on "p". Or, we may have failed to kill "p".
		 *
		 * Either way, we fall thru to here, leaving the proc in the
		 * P_MEMSTAT_TERMINATED or P_MEMSTAT_ERROR state.
		 *
		 * And, we hold the the proc_list_lock at this point.
		 */

		next_p = memorystatus_get_first_proc_locked(&priority_band, TRUE);
	}

	proc_list_unlock();

exit:
	os_reason_free(jetsam_reason);

	/* Clear snapshot if freshly captured and no target was found */
	if (new_snapshot && (kill_count == 0)) {
		proc_list_lock();
		memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
		proc_list_unlock();
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
	    MEMORYSTATUS_LOG_AVAILABLE_PAGES, 0, kill_count, *memory_reclaimed);

	return kill_count > 0;
}
7135
/*
 * Kill the first process found (in band order) whose physical footprint
 * exceeds its configured high-water-mark memory limit.
 *
 * errors           - out: incremented once per failed kill attempt
 * purged           - out: set TRUE if a candidate was purged instead of killed
 * memory_reclaimed - out: footprint (bytes) of the killed process, 0 if none
 *
 * Returns TRUE if a process was killed.
 */
static boolean_t
memorystatus_kill_hiwat_proc(uint32_t *errors, boolean_t *purged, uint64_t *memory_reclaimed)
{
	pid_t aPid = 0;
	proc_t p = PROC_NULL, next_p = PROC_NULL;
	bool new_snapshot = false, killed = false, freed_mem = false;
	unsigned int i = 0;
	uint32_t aPid_ep;
	os_reason_t jetsam_reason = OS_REASON_NULL;
	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_START,
	    MEMORYSTATUS_LOG_AVAILABLE_PAGES);

	jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_HIGHWATER);
	if (jetsam_reason == OS_REASON_NULL) {
		/* Non-fatal: the kill path below tolerates a NULL exit reason. */
		memorystatus_log_error("memorystatus_kill_hiwat_proc: failed to allocate exit reason\n");
	}

	proc_list_lock();

	next_p = memorystatus_get_first_proc_locked(&i, TRUE);
	while (next_p) {
		uint64_t footprint_in_bytes = 0;
		uint64_t memlimit_in_bytes = 0;
		boolean_t skip = 0;

		p = next_p;
		next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);

		aPid = proc_getpid(p);
		aPid_ep = p->p_memstat_effectivepriority;

		/* Skip procs already being killed, already failed, or explicitly exempted. */
		if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP)) {
			continue;
		}

		/* skip if no limit set */
		if (p->p_memstat_memlimit <= 0) {
			continue;
		}

		footprint_in_bytes = get_task_phys_footprint(proc_task(p));
		memlimit_in_bytes = (((uint64_t)p->p_memstat_memlimit) * 1024ULL * 1024ULL);   /* convert MB to bytes */
		skip = (footprint_in_bytes <= memlimit_in_bytes);

#if CONFIG_FREEZE
		/* Never target a proc that is locked for freezing/thawing. */
		if (!skip) {
			if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
				skip = TRUE;
			} else {
				skip = FALSE;
			}
		}
#endif

		if (skip) {
			continue;
		} else {
			if (memorystatus_jetsam_snapshot_count == 0) {
				memorystatus_init_jetsam_snapshot_locked(NULL, 0);
				new_snapshot = true;
			}

			if (proc_ref(p, true) == p) {
				/*
				 * Mark as terminated so that if exit1() indicates success, but the process (for example)
				 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
				 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
				 * acquisition of the proc lock.
				 */
				p->p_memstat_state |= P_MEMSTAT_TERMINATED;

				proc_list_unlock();
			} else {
				/*
				 * We need to restart the search again because
				 * proc_ref _can_ drop the proc_list lock
				 * and we could have lost our stored next_p via
				 * an exit() on another core.
				 */
				i = 0;
				next_p = memorystatus_get_first_proc_locked(&i, TRUE);
				continue;
			}

			/* Reuse footprint_in_bytes as the out-param for the actual kill. */
			footprint_in_bytes = 0;
			freed_mem = memorystatus_kill_proc(p, kMemorystatusKilledHiwat, jetsam_reason, &killed, &footprint_in_bytes); /* purged and/or killed 'p' */

			/* Success? */
			if (freed_mem) {
				if (!killed) {
					/* purged 'p'..don't reset HWM candidate count */
					*purged = TRUE;

					proc_list_lock();
					p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
					proc_list_unlock();
				} else {
					*memory_reclaimed = footprint_in_bytes;
				}
				proc_rele(p);
				goto exit;
			}
			/*
			 * Failure - first unwind the state,
			 * then fall through to restart the search.
			 */
			proc_list_lock();
			proc_rele(p);
			p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
			p->p_memstat_state |= P_MEMSTAT_ERROR;
			*errors += 1;

			i = 0;
			next_p = memorystatus_get_first_proc_locked(&i, TRUE);
		}
	}

	proc_list_unlock();

exit:
	os_reason_free(jetsam_reason);

	if (!killed) {
		*memory_reclaimed = 0;

		/* Clear snapshot if freshly captured and no target was found */
		if (new_snapshot) {
			proc_list_lock();
			memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
			proc_list_unlock();
		}
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_END,
	    MEMORYSTATUS_LOG_AVAILABLE_PAGES, killed ? aPid : 0, killed, *memory_reclaimed, 0);

	return killed;
}
7274
/*
 * Jetsam a process pinned in the elevated band.
 *
 * cause            - jetsam kill cause, recorded in snapshot/exit reason
 * jetsam_reason    - exit reason (consumed: freed before returning)
 * band             - the elevated band to search
 * aggr_count       - ordinal of the enclosing aggressive sweep (logging only)
 * errors           - out: incremented once per failed kill attempt
 * memory_reclaimed - out: footprint (bytes) of the killed process, 0 if none
 *
 * Return: true -- a pinned process was jetsammed
 *         false -- no pinned process was jetsammed
 */
boolean_t
memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, unsigned int band, int aggr_count, uint32_t *errors, uint64_t *memory_reclaimed)
{
	pid_t aPid = 0;
	proc_t p = PROC_NULL, next_p = PROC_NULL;
	boolean_t new_snapshot = FALSE, killed = FALSE;
	int kill_count = 0;
	uint32_t aPid_ep;
	uint64_t killtime = 0;
	uint64_t time_in_priority_band_secs = 0;
	clock_sec_t tv_sec;
	clock_usec_t tv_usec;
	uint32_t tv_msec;
	uint64_t footprint_of_killed_proc = 0;


	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
	    MEMORYSTATUS_LOG_AVAILABLE_PAGES);

#if CONFIG_FREEZE
	/* When targeting the freezer band, only frozen processes are candidates. */
	boolean_t consider_frozen_only = FALSE;

	if (band == (unsigned int) memorystatus_freeze_jetsam_band) {
		consider_frozen_only = TRUE;
	}
#endif /* CONFIG_FREEZE */

	proc_list_lock();

	next_p = memorystatus_get_first_proc_locked(&band, FALSE);
	while (next_p) {
		p = next_p;
		next_p = memorystatus_get_next_proc_locked(&band, p, FALSE);

		aPid = proc_getpid(p);
		aPid_ep = p->p_memstat_effectivepriority;

		/*
		 * Only pick a process pinned in this elevated band
		 */
		if (!_memstat_proc_is_elevated(p)) {
			continue;
		}

		/* Skip procs already being killed, already failed, or explicitly exempted. */
		if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP)) {
			continue;
		}

#if CONFIG_FREEZE
		if (consider_frozen_only && !_memstat_proc_is_frozen(p)) {
			continue;
		}

		/* Never target a proc that is locked for freezing/thawing. */
		if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
			continue;
		}
#endif /* CONFIG_FREEZE */

#if DEVELOPMENT || DEBUG
		memorystatus_log_info(
			"jetsam: elevated%d process pid %d [%s] - memorystatus_available_pages: %d\n",
			aggr_count, aPid, (*p->p_name ? p->p_name : "unknown"), MEMORYSTATUS_LOG_AVAILABLE_PAGES);
#endif /* DEVELOPMENT || DEBUG */

		if (memorystatus_jetsam_snapshot_count == 0) {
			memorystatus_init_jetsam_snapshot_locked(NULL, 0);
			new_snapshot = TRUE;
		}

		/*
		 * Mark as terminated so that this proc is skipped if
		 * encountered again before its exit completes.
		 */
		p->p_memstat_state |= P_MEMSTAT_TERMINATED;

		killtime = mach_absolute_time();
		absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
		tv_msec = tv_usec / 1000;

		memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);

		/* Take a ref so 'p' survives dropping the proc_list lock for the kill. */
		if (proc_ref(p, true) == p) {
			proc_list_unlock();

			/*
			 * memorystatus_do_kill drops a reference, so take another one so we can
			 * continue to use this exit reason even after memorystatus_do_kill()
			 * returns
			 */
			os_reason_ref(jetsam_reason);
			killed = memorystatus_do_kill(p, cause, jetsam_reason, &footprint_of_killed_proc);

			absolutetime_to_nanoseconds(killtime - p->p_memstat_prio_start, &time_in_priority_band_secs);
			time_in_priority_band_secs /= NSEC_PER_SEC;
			memorystatus_log("%lu.%03d memorystatus: killing_top_process_elevated%d pid %d [%s] (%s %d %llus rf:%s type:%s) %lluKB - memorystatus_available_pages: %llu\n",
			    (unsigned long)tv_sec, tv_msec,
			    aggr_count,
			    aPid, ((p && *p->p_name) ? p->p_name : "unknown"),
			    memstat_kill_cause_name[cause], aPid_ep,
			    time_in_priority_band_secs, _memstat_relaunch_flags_description(p->p_memstat_relaunch_flags),
			    _memstat_proc_type_description(p),
			    footprint_of_killed_proc >> 10, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);

			/* Success? */
			if (killed) {
				*memory_reclaimed = footprint_of_killed_proc;
				proc_rele(p);
				kill_count++;
				goto exit;
			}

			/*
			 * Failure - first unwind the state,
			 * then fall through to restart the search.
			 */
			proc_list_lock();
			proc_rele(p);
			p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
			p->p_memstat_state |= P_MEMSTAT_ERROR;
			*errors += 1;
		}

		/*
		 * Failure - restart the search.
		 *
		 * We might have raced with "p" exiting on another core, resulting in no
		 * ref on "p". Or, we may have failed to kill "p".
		 *
		 * Either way, we fall thru to here, leaving the proc in the
		 * P_MEMSTAT_TERMINATED state or P_MEMSTAT_ERROR state.
		 *
		 * And, we hold the the proc_list_lock at this point.
		 */

		next_p = memorystatus_get_first_proc_locked(&band, FALSE);
	}

	proc_list_unlock();

exit:
	os_reason_free(jetsam_reason);

	if (kill_count == 0) {
		*memory_reclaimed = 0;

		/* Clear snapshot if freshly captured and no target was found */
		if (new_snapshot) {
			proc_list_lock();
			memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
			proc_list_unlock();
		}
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
	    MEMORYSTATUS_LOG_AVAILABLE_PAGES, killed ? aPid : 0, kill_count, *memory_reclaimed);

	return killed;
}
7435
7436 bool
memorystatus_kill_on_VM_compressor_space_shortage(bool async)7437 memorystatus_kill_on_VM_compressor_space_shortage(bool async)
7438 {
7439 if (async) {
7440 os_atomic_store(&memorystatus_compressor_space_shortage, true, release);
7441 memorystatus_thread_wake();
7442 return true;
7443 } else {
7444 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE);
7445 if (jetsam_reason == OS_REASON_NULL) {
7446 memorystatus_log_error("memorystatus_kill_on_VM_compressor_space_shortage -- sync: failed to allocate jetsam reason\n");
7447 }
7448
7449 return memstat_kill_process_sync(-1, kMemorystatusKilledVMCompressorSpaceShortage, jetsam_reason);
7450 }
7451 }
7452
7453 #if CONFIG_JETSAM
7454
/*
 * Asynchronously respond to pageout-scan (VPS) starvation: publish the
 * starvation flag (release ordering so it is visible before the wakeup),
 * then wake the memorystatus thread to select and kill a victim.
 */
void
memorystatus_kill_on_vps_starvation(void)
{
	os_atomic_store(&memorystatus_pageout_starved, true, release);
	memorystatus_thread_wake();
}
7461
7462 bool
memorystatus_kill_on_vnode_exhaustion(void)7463 memorystatus_kill_on_vnode_exhaustion(void)
7464 {
7465 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_VNODE);
7466 if (jetsam_reason == OS_REASON_NULL) {
7467 memorystatus_log_error("memorystatus_kill_on_vnode_limit: failed to allocate jetsam reason\n");
7468 }
7469
7470 return memstat_kill_process_sync(-1, kMemorystatusKilledVnodes, jetsam_reason);
7471 }
7472
7473 #endif /* CONFIG_JETSAM */
7474
7475 bool
memorystatus_kill_on_sustained_pressure()7476 memorystatus_kill_on_sustained_pressure()
7477 {
7478 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_SUSTAINED_PRESSURE);
7479 if (jetsam_reason == OS_REASON_NULL) {
7480 memorystatus_log_error("%s() failed to allocate jetsam reason\n", __func__);
7481 }
7482
7483 return memstat_kill_process_sync(-1, kMemorystatusKilledSustainedPressure, jetsam_reason);
7484 }
7485
7486 bool
memstat_kill_with_jetsam_reason_sync(pid_t pid,os_reason_t jetsam_reason)7487 memstat_kill_with_jetsam_reason_sync(pid_t pid, os_reason_t jetsam_reason)
7488 {
7489 uint32_t kill_cause = jetsam_reason->osr_code <= JETSAM_REASON_MEMORYSTATUS_MAX ?
7490 (uint32_t) jetsam_reason->osr_code : JETSAM_REASON_INVALID;
7491 return memstat_kill_process_sync(pid, kill_cause, jetsam_reason);
7492 }
7493
7494 bool
memorystatus_kill_on_zone_map_exhaustion(pid_t pid)7495 memorystatus_kill_on_zone_map_exhaustion(pid_t pid)
7496 {
7497 if (pid == -1) {
7498 os_atomic_store(&memorystatus_zone_map_is_exhausted, true, release);
7499 memorystatus_thread_wake();
7500 return true;
7501 } else {
7502 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_ZONE_MAP_EXHAUSTION);
7503 if (jetsam_reason == OS_REASON_NULL) {
7504 memorystatus_log_error("memorystatus_kill_on_zone_map_exhaustion: failed to allocate jetsam reason\n");
7505 }
7506 return memstat_kill_process_sync(pid, kMemorystatusKilledZoneMapExhaustion, jetsam_reason);
7507 }
7508 }
7509
/*
 * Callback hook fired at the end of a pageout scan; intentionally empty
 * on this configuration.
 */
void
memorystatus_on_pageout_scan_end(void)
{
	/* No-op */
}
7515
7516 static size_t
memorystatus_priority_list_size(pid_t pid,size_t entry_size)7517 memorystatus_priority_list_size(pid_t pid, size_t entry_size)
7518 {
7519 assert(
7520 (entry_size == sizeof(memorystatus_priority_entry_t)) ||
7521 (entry_size == sizeof(memorystatus_priority_entry_v2_t)));
7522 uint32_t list_count = (pid == 0) ? memorystatus_list_count : 1;
7523 return entry_size * list_count;
7524 }
7525
/* Return both allocated and actual size, since there's a race between allocation and list compilation */
/*
 * Allocate and fill a priority list covering every process on the
 * memorystatus list. entry_size selects the v1 (legacy) or v2 layout;
 * the storage is always typed as v2 and strided by entry_size.
 *
 * list_ptr    - out: kalloc'd buffer (caller frees with kfree_data)
 * buffer_size - in: caller's buffer capacity; out: allocated size
 * list_size   - out: bytes actually filled (may be < allocated; see race note)
 *
 * Returns 0, EINVAL (caller's buffer too small), or ENOMEM.
 */
static int
memorystatus_get_priority_list(memorystatus_priority_entry_v2_t **list_ptr, size_t *buffer_size, size_t *list_size, size_t entry_size)
{
	memorystatus_priority_entry_v2_t *entry;
	proc_t p;
	uint32_t i = 0;

	*list_size = memorystatus_priority_list_size(0, entry_size);

	/* Otherwise, validate the size of the buffer */
	if (*buffer_size < *list_size) {
		return EINVAL;
	}

	*list_ptr = kalloc_data(*list_size, Z_WAITOK | Z_ZERO);
	if (!*list_ptr) {
		return ENOMEM;
	}

	*buffer_size = *list_size;
	*list_size = 0;

	entry = *list_ptr;

	proc_list_lock();

	p = memorystatus_get_first_proc_locked(&i, TRUE);
	while (p && (*list_size < *buffer_size)) {
		entry->pid = proc_getpid(p);
		entry->priority = p->p_memstat_effectivepriority;
		entry->user_data = p->p_memstat_userdata;

		/* No explicit memstat limit: report the task's footprint limit instead. */
		if (p->p_memstat_memlimit <= 0) {
			task_get_phys_footprint_limit(proc_task(p), &entry->limit);
		} else {
			entry->limit = p->p_memstat_memlimit;
		}

		entry->state = _memstat_build_state(p);

		/* Advance by the *requested* entry layout, not sizeof(*entry). */
		if (entry_size == sizeof(memorystatus_priority_entry_t)) {
			entry = (memorystatus_priority_entry_v2_t *) (((memorystatus_priority_entry_t *)entry) + 1);
		} else {
			/* Only add v2 entries if we're not using the legacy version of this call */
			entry->priority_start_mtime = p->p_memstat_prio_start;

			entry++;
		}

		*list_size += entry_size;
		p = memorystatus_get_next_proc_locked(&i, p, TRUE);
	}

	proc_list_unlock();

	memorystatus_log_debug("memorystatus_get_priority_list: returning %lu for size\n", (unsigned long)*list_size);

	return 0;
}
7586
/*
 * Copy a single process's priority entry out to user space.
 *
 * pid         - target pid (must be non-zero)
 * buffer      - user-space destination
 * buffer_size - must be exactly the v1 or v2 entry size; a v1-sized
 *               buffer receives a truncated copy of the v2 entry
 *
 * Returns 0, EINVAL (bad args), ESRCH (no such proc, live or zombie),
 * or an errno from the footprint query / copyout.
 */
static int
memorystatus_get_priority_pid(pid_t pid, user_addr_t buffer, size_t buffer_size)
{
	int error = 0;
	bool zombref = false;
	memorystatus_priority_entry_v2_t mp_entry;
	kern_return_t ret;
	boolean_t size_valid =
	    (buffer_size == sizeof(memorystatus_priority_entry_v2_t)) ||
	    (buffer_size == sizeof(memorystatus_priority_entry_t));

	/* Validate inputs */
	if ((pid == 0) || (buffer == USER_ADDR_NULL) || !size_valid) {
		return EINVAL;
	}

	proc_list_lock();
	/* Fall back to a zombie ref so recently-exited procs can still be queried. */
	proc_t p = proc_find_locked(pid);
	if (!p) {
		zombref = true;
		p = proc_find_zombref_locked(pid);
		if (!p) {
			proc_list_unlock();
			return ESRCH;
		}
	}

	memset(&mp_entry, 0, sizeof(memorystatus_priority_entry_v2_t));

	mp_entry.pid = proc_getpid(p);
	mp_entry.priority = p->p_memstat_effectivepriority;
	mp_entry.user_data = p->p_memstat_userdata;
	/* No memstat limit and still live: report the task's footprint limit instead. */
	if (p->p_memstat_memlimit <= 0 && !zombref) {
		task_t task = proc_task(p);
		assert(task);
		ret = task_get_phys_footprint_limit(task, &mp_entry.limit);
		if (ret != KERN_SUCCESS) {
			error = mach_to_bsd_errno(ret);
			proc_list_unlock();
			goto done;
		}
	} else {
		mp_entry.limit = p->p_memstat_memlimit;
	}

	mp_entry.state = _memstat_build_state(p);
	mp_entry.priority_start_mtime = p->p_memstat_prio_start;
	proc_list_unlock();

	/* buffer_size may be the smaller v1 size; copyout is bounded by it. */
	error = copyout(&mp_entry, buffer, buffer_size);

done:
	/* Drop whichever kind of reference we took above. */
	if (zombref) {
		proc_drop_zombref(p);
	} else {
		proc_rele(p);
	}

	return error;
}
7647
7648 static int
memorystatus_cmd_get_priority_list(pid_t pid,user_addr_t buffer,size_t buffer_size,int32_t * retval,size_t entry_size)7649 memorystatus_cmd_get_priority_list(pid_t pid, user_addr_t buffer, size_t buffer_size, int32_t *retval, size_t entry_size)
7650 {
7651 int error = 0;
7652 boolean_t size_only;
7653 size_t list_size;
7654
7655 static_assert(sizeof(memorystatus_priority_entry_v2_t) == 128);
7656 assert(
7657 (entry_size == sizeof(memorystatus_priority_entry_t)) ||
7658 (entry_size == sizeof(memorystatus_priority_entry_v2_t)));
7659
7660 /*
7661 * When a non-zero pid is provided, the 'list' has only one entry.
7662 */
7663
7664 size_only = ((buffer == USER_ADDR_NULL) ? TRUE: FALSE);
7665
7666 if (pid != 0) {
7667 /* One PID */
7668 list_size = entry_size;
7669 if (!size_only) {
7670 error = memorystatus_get_priority_pid(pid, buffer, buffer_size);
7671 }
7672 } else if (size_only) {
7673 /* List size query */
7674 list_size = memorystatus_priority_list_size(0, entry_size);
7675 } else {
7676 /* List */
7677 memorystatus_priority_entry_v2_t *list = NULL;
7678 error = memorystatus_get_priority_list(&list, &buffer_size, &list_size, entry_size);
7679 if (error == 0) {
7680 error = copyout(list, buffer, list_size);
7681 kfree_data(list, buffer_size);
7682 }
7683 }
7684
7685 if (error == 0) {
7686 assert(list_size <= INT32_MAX);
7687 *retval = (int32_t) list_size;
7688 }
7689
7690 return error;
7691 }
7692
7693 static void
memorystatus_clear_errors(void)7694 memorystatus_clear_errors(void)
7695 {
7696 proc_t p;
7697 unsigned int i = 0;
7698
7699 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_START);
7700
7701 proc_list_lock();
7702
7703 p = memorystatus_get_first_proc_locked(&i, TRUE);
7704 while (p) {
7705 if (p->p_memstat_state & P_MEMSTAT_ERROR) {
7706 p->p_memstat_state &= ~P_MEMSTAT_ERROR;
7707 }
7708 p = memorystatus_get_next_proc_locked(&i, p, TRUE);
7709 }
7710
7711 proc_list_unlock();
7712
7713 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_END);
7714 }
7715
/*
 * Enable or disable the fast-jetsam override.  The stored flag is the
 * inverse of the override: enabling the override turns fast jetsam OFF.
 * When fast jetsam is turned off, any pre-configured jetsam policy is
 * reset to the default and the jetsam thread pool is returned to its
 * default configuration.  No-op on non-CONFIG_JETSAM kernels.
 */
void
memorystatus_fast_jetsam_override(bool enable_override)
{
#if CONFIG_JETSAM
	fast_jetsam_enabled = !enable_override;
	if (!fast_jetsam_enabled) {
		/* Disable any pre-configured policies */
		os_atomic_store(&memstat_policy_config, kPolicyDefault, relaxed);
		memorystatus_thread_pool_default();
		/* The policy change may make work available for the jetsam thread. */
		_memstat_consider_waking_jetsam_thread();
	}
#else /* CONFIG_JETSAM */
	(void)enable_override;
#endif /* CONFIG_JETSAM */
}
7731
7732 /*
7733 * Get the at_boot snapshot
7734 */
7735 static int
memorystatus_get_at_boot_snapshot(memorystatus_jetsam_snapshot_t ** snapshot,size_t * snapshot_size,boolean_t size_only)7736 memorystatus_get_at_boot_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only)
7737 {
7738 size_t input_size = *snapshot_size;
7739
7740 /*
7741 * The at_boot snapshot has no entry list.
7742 */
7743 *snapshot_size = sizeof(memorystatus_jetsam_snapshot_t);
7744
7745 if (size_only) {
7746 return 0;
7747 }
7748
7749 /*
7750 * Validate the size of the snapshot buffer
7751 */
7752 if (input_size < *snapshot_size) {
7753 return EINVAL;
7754 }
7755
7756 /*
7757 * Update the notification_time only
7758 */
7759 memorystatus_at_boot_snapshot.notification_time = mach_absolute_time();
7760 *snapshot = &memorystatus_at_boot_snapshot;
7761
7762 memorystatus_log_debug(
7763 "memorystatus_get_at_boot_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%d)\n",
7764 (long)input_size, (long)*snapshot_size, 0);
7765 return 0;
7766 }
7767
7768 #if CONFIG_FREEZE
7769 static int
memorystatus_get_jetsam_snapshot_freezer(memorystatus_jetsam_snapshot_t ** snapshot,size_t * snapshot_size,boolean_t size_only)7770 memorystatus_get_jetsam_snapshot_freezer(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only)
7771 {
7772 size_t input_size = *snapshot_size;
7773
7774 if (memorystatus_jetsam_snapshot_freezer->entry_count > 0) {
7775 *snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_freezer->entry_count));
7776 } else {
7777 *snapshot_size = 0;
7778 }
7779 assert(*snapshot_size <= memorystatus_jetsam_snapshot_freezer_size);
7780
7781 if (size_only) {
7782 return 0;
7783 }
7784
7785 if (input_size < *snapshot_size) {
7786 return EINVAL;
7787 }
7788
7789 *snapshot = memorystatus_jetsam_snapshot_freezer;
7790
7791 memorystatus_log_debug(
7792 "memorystatus_get_jetsam_snapshot_freezer: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
7793 (long)input_size, (long)*snapshot_size, (long)memorystatus_jetsam_snapshot_freezer->entry_count);
7794
7795 return 0;
7796 }
7797 #endif /* CONFIG_FREEZE */
7798
7799 static int
memorystatus_get_on_demand_snapshot(memorystatus_jetsam_snapshot_t ** snapshot,size_t * snapshot_size,boolean_t size_only)7800 memorystatus_get_on_demand_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only)
7801 {
7802 size_t input_size = *snapshot_size;
7803 uint32_t ods_list_count = memorystatus_list_count + memorystatus_artificial_snapshot_entry_count;
7804 memorystatus_jetsam_snapshot_t *ods = NULL; /* The on_demand snapshot buffer */
7805
7806 *snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (ods_list_count));
7807
7808 if (size_only) {
7809 return 0;
7810 }
7811
7812 /*
7813 * Validate the size of the snapshot buffer.
7814 * This is inherently racey. May want to revisit
7815 * this error condition and trim the output when
7816 * it doesn't fit.
7817 */
7818 if (input_size < *snapshot_size) {
7819 return EINVAL;
7820 }
7821
7822 /*
7823 * Allocate and initialize a snapshot buffer.
7824 */
7825 ods = kalloc_data(*snapshot_size, Z_WAITOK | Z_ZERO);
7826 if (!ods) {
7827 return ENOMEM;
7828 }
7829
7830 proc_list_lock();
7831 memorystatus_init_jetsam_snapshot_locked(ods, ods_list_count);
7832 proc_list_unlock();
7833
7834 /*
7835 * Return the kernel allocated, on_demand buffer.
7836 * The caller of this routine will copy the data out
7837 * to user space and then free the kernel allocated
7838 * buffer.
7839 */
7840 *snapshot = ods;
7841
7842 memorystatus_log_debug(
7843 "memorystatus_get_on_demand_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
7844 (long)input_size, (long)*snapshot_size, (long)ods_list_count);
7845
7846 return 0;
7847 }
7848
7849 static int
memorystatus_get_jetsam_snapshot(memorystatus_jetsam_snapshot_t ** snapshot,size_t * snapshot_size,boolean_t size_only)7850 memorystatus_get_jetsam_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only)
7851 {
7852 size_t input_size = *snapshot_size;
7853
7854 if (memorystatus_jetsam_snapshot_count > 0) {
7855 *snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count));
7856 } else {
7857 *snapshot_size = 0;
7858 }
7859
7860 if (size_only) {
7861 return 0;
7862 }
7863
7864 if (input_size < *snapshot_size) {
7865 return EINVAL;
7866 }
7867
7868 *snapshot = memorystatus_jetsam_snapshot;
7869
7870 memorystatus_log_debug(
7871 "memorystatus_get_jetsam_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
7872 (long)input_size, (long)*snapshot_size, (long)memorystatus_jetsam_snapshot_count);
7873
7874 return 0;
7875 }
7876
7877 #if JETSAM_ZPRINT_SNAPSHOT
7878 /*
7879 * Utility function to handle copyout of jetsam zprint snapshot data
7880 */
7881 static int
memorystatus_cmd_get_data_buffer(user_addr_t buffer,size_t buffer_size,int32_t * retval,size_t data_size,void * data)7882 memorystatus_cmd_get_data_buffer(
7883 user_addr_t buffer,
7884 size_t buffer_size,
7885 int32_t *retval,
7886 size_t data_size,
7887 void *data)
7888 {
7889 boolean_t size_only = (buffer == USER_ADDR_NULL);
7890 int error;
7891
7892 /* Nothing to return if there's no data yet, instruct the caller to try again later. */
7893 if (data == NULL) {
7894 *retval = -1;
7895 return EAGAIN;
7896 }
7897
7898 /* Handle just a size request */
7899 if (size_only) {
7900 *retval = (int32_t)data_size;
7901 return 0;
7902 }
7903
7904 /* buffer needs to be large enough */
7905 if (buffer_size < data_size) {
7906 *retval = -1;
7907 return EINVAL;
7908 }
7909
7910 error = copyout(data, buffer, data_size);
7911 if (error == 0) {
7912 *retval = (int32_t)data_size;
7913 } else {
7914 *retval = -1;
7915 }
7916
7917 return error;
7918 }
7919 #endif
7920
/*
 * Handler for the get-jetsam-snapshot memorystatus command.
 *
 * flags selects exactly one snapshot flavor: 0 for the default snapshot,
 * or one of MEMORYSTATUS_SNAPSHOT_ON_DEMAND, MEMORYSTATUS_SNAPSHOT_AT_BOOT,
 * MEMORYSTATUS_FLAGS_SNAPSHOT_FREEZER.  A NULL buffer is a size-only probe;
 * on success *retval carries the snapshot size.
 */
static int
memorystatus_cmd_get_jetsam_snapshot(int32_t flags, user_addr_t buffer, size_t buffer_size, int32_t *retval)
{
	int error = EINVAL;
	boolean_t size_only;
	boolean_t is_default_snapshot = FALSE;
	boolean_t is_on_demand_snapshot = FALSE;
	boolean_t is_at_boot_snapshot = FALSE;
#if CONFIG_FREEZE
	bool is_freezer_snapshot = false;
#endif /* CONFIG_FREEZE */
	memorystatus_jetsam_snapshot_t *snapshot;

	size_only = ((buffer == USER_ADDR_NULL) ? TRUE : FALSE);

	if (flags == 0) {
		/* Default */
		is_default_snapshot = TRUE;
		error = memorystatus_get_jetsam_snapshot(&snapshot, &buffer_size, size_only);
	} else {
		if (flags & ~(MEMORYSTATUS_SNAPSHOT_ON_DEMAND | MEMORYSTATUS_SNAPSHOT_AT_BOOT | MEMORYSTATUS_FLAGS_SNAPSHOT_FREEZER)) {
			/*
			 * Unsupported bit set in flag.
			 */
			return EINVAL;
		}

		if (flags & (flags - 0x1)) {
			/*
			 * Can't have multiple flags set at the same time.
			 * (x & (x - 1)) is non-zero iff x has more than one bit set.
			 */
			return EINVAL;
		}

		if (flags & MEMORYSTATUS_SNAPSHOT_ON_DEMAND) {
			is_on_demand_snapshot = TRUE;
			/*
			 * When not requesting the size only, the following call will allocate
			 * an on_demand snapshot buffer, which is freed below.
			 */
			error = memorystatus_get_on_demand_snapshot(&snapshot, &buffer_size, size_only);
		} else if (flags & MEMORYSTATUS_SNAPSHOT_AT_BOOT) {
			is_at_boot_snapshot = TRUE;
			error = memorystatus_get_at_boot_snapshot(&snapshot, &buffer_size, size_only);
#if CONFIG_FREEZE
		} else if (flags & MEMORYSTATUS_FLAGS_SNAPSHOT_FREEZER) {
			is_freezer_snapshot = true;
			error = memorystatus_get_jetsam_snapshot_freezer(&snapshot, &buffer_size, size_only);
#endif /* CONFIG_FREEZE */
		} else {
			/*
			 * Invalid flag setting.
			 */
			return EINVAL;
		}
	}

	if (error) {
		goto out;
	}

	/*
	 * Copy the data out to user space and clear the snapshot buffer.
	 * If working with the jetsam snapshot,
	 * clearing the buffer means, reset the count.
	 * If working with an on_demand snapshot
	 * clearing the buffer means, free it.
	 * If working with the at_boot snapshot
	 * there is nothing to clear or update.
	 * If working with a copy of the snapshot
	 * there is nothing to clear or update.
	 * If working with the freezer snapshot
	 * clearing the buffer means, reset the count.
	 */
	if (!size_only) {
		if ((error = copyout(snapshot, buffer, buffer_size)) == 0) {
#if CONFIG_FREEZE
			if (is_default_snapshot || is_freezer_snapshot) {
#else
			if (is_default_snapshot) {
#endif /* CONFIG_FREEZE */
				/*
				 * The jetsam snapshot is never freed, its count is simply reset.
				 * However, we make a copy for any parties that might be interested
				 * in the previous fully populated snapshot.
				 */
				proc_list_lock();
#if DEVELOPMENT || DEBUG
				if (memorystatus_testing_pid != 0 && memorystatus_testing_pid != proc_getpid(current_proc())) {
					/* Snapshot is currently owned by someone else. Don't consume it. */
					proc_list_unlock();
					goto out;
				}
#endif /* (DEVELOPMENT || DEBUG)*/
				if (is_default_snapshot) {
					snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
				}
#if CONFIG_FREEZE
				else if (is_freezer_snapshot) {
					memorystatus_jetsam_snapshot_freezer->entry_count = 0;
				}
#endif /* CONFIG_FREEZE */
				proc_list_unlock();
			}
		}

		if (is_on_demand_snapshot) {
			/*
			 * The on_demand snapshot is always freed,
			 * even if the copyout failed.
			 */
			kfree_data(snapshot, buffer_size);
		}
	}

out:
	if (error == 0) {
		assert(buffer_size <= INT32_MAX);
		*retval = (int32_t) buffer_size;
	}
	return error;
}
8043
8044 #if DEVELOPMENT || DEBUG
8045 static int
8046 memorystatus_cmd_set_testing_pid(int32_t flags)
8047 {
8048 int error = EINVAL;
8049 proc_t caller = current_proc();
8050 assert(caller != kernproc);
8051 proc_list_lock();
8052 if (flags & MEMORYSTATUS_FLAGS_SET_TESTING_PID) {
8053 if (memorystatus_testing_pid == 0) {
8054 memorystatus_testing_pid = proc_getpid(caller);
8055 error = 0;
8056 } else if (memorystatus_testing_pid == proc_getpid(caller)) {
8057 error = 0;
8058 } else {
8059 /* We don't allow ownership to be taken from another proc. */
8060 error = EBUSY;
8061 }
8062 } else if (flags & MEMORYSTATUS_FLAGS_UNSET_TESTING_PID) {
8063 if (memorystatus_testing_pid == proc_getpid(caller)) {
8064 memorystatus_testing_pid = 0;
8065 error = 0;
8066 } else if (memorystatus_testing_pid != 0) {
8067 /* We don't allow ownership to be taken from another proc. */
8068 error = EPERM;
8069 }
8070 } else if (flags & MEMORYSTATUS_FLAGS_SET_IMP_TESTING_PID) {
8071 caller->p_memstat_state |= P_MEMSTAT_TEST_IMP_ASSERTION;
8072 error = 0;
8073 }
8074 proc_list_unlock();
8075
8076 return error;
8077 }
8078 #endif /* DEVELOPMENT || DEBUG */
8079
8080 /*
8081 * Routine: memorystatus_cmd_grp_set_priorities
8082 * Purpose: Update priorities for a group of processes.
8083 *
8084 * [priority]
8085 * Move each process out of its effective priority
8086 * band and into a new priority band.
8087 * Maintains relative order from lowest to highest priority.
8088 * In single band, maintains relative order from head to tail.
8089 *
8090 * eg: before [effectivepriority | pid]
8091 * [18 | p101 ]
8092 * [17 | p55, p67, p19 ]
8093 * [12 | p103 p10 ]
8094 * [ 7 | p25 ]
8095 * [ 0 | p71, p82, ]
8096 *
8097 * after [ new band | pid]
8098 * [ xxx | p71, p82, p25, p103, p10, p55, p67, p19, p101]
8099 *
8100 * Returns: 0 on success, else non-zero.
8101 *
8102 * Caveat: We know there is a race window regarding recycled pids.
8103 * A process could be killed before the kernel can act on it here.
8104 * If a pid cannot be found in any of the jetsam priority bands,
8105 * then we simply ignore it. No harm.
8106 * But, if the pid has been recycled then it could be an issue.
8107 * In that scenario, we might move an unsuspecting process to the new
8108 * priority band. It's not clear how the kernel can safeguard
8109 * against this, but it would be an extremely rare case anyway.
8110 * The caller of this api might avoid such race conditions by
8111 * ensuring that the processes passed in the pid list are suspended.
8112 */
8113
8114
static int
memorystatus_cmd_grp_set_priorities(user_addr_t buffer, size_t buffer_size)
{
	/*
	 * We only handle setting priority
	 * per process
	 */
	int error = 0;
	memorystatus_properties_entry_v1_t *entries = NULL;
	size_t entry_count = 0;

	/* This will be the ordered proc list */
	typedef struct memorystatus_internal_properties {
		proc_t proc;
		int32_t priority;
	} memorystatus_internal_properties_t;

	memorystatus_internal_properties_t *table = NULL;
	uint32_t table_count = 0;

	size_t i = 0;
	uint32_t bucket_index = 0;
	int32_t new_priority;

	proc_t p;

	/* Verify inputs */
	if ((buffer == USER_ADDR_NULL) || (buffer_size == 0)) {
		error = EINVAL;
		goto out;
	}

	entry_count = (buffer_size / sizeof(memorystatus_properties_entry_v1_t));
	if (entry_count == 0) {
		/* buffer size was not large enough for a single entry */
		error = EINVAL;
		goto out;
	}

	if ((entries = kalloc_data(buffer_size, Z_WAITOK)) == NULL) {
		error = ENOMEM;
		goto out;
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_START, MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY, entry_count);

	if ((error = copyin(buffer, entries, buffer_size)) != 0) {
		goto out;
	}

	/*
	 * Verify sanity of input priorities.
	 * Only v1 entries are accepted, and the buffer must be an exact
	 * multiple of the v1 entry size.
	 */
	if (entries[0].version == MEMORYSTATUS_MPE_VERSION_1) {
		if ((buffer_size % MEMORYSTATUS_MPE_VERSION_1_SIZE) != 0) {
			error = EINVAL;
			goto out;
		}
	} else {
		error = EINVAL;
		goto out;
	}

	for (i = 0; i < entry_count; i++) {
		if (entries[i].priority == -1) {
			/* Use as shorthand for default priority */
			entries[i].priority = JETSAM_PRIORITY_DEFAULT;
		} else if (entries[i].priority > JETSAM_PRIORITY_IDLE && entries[i].priority <= applications_aging_band) {
			/*
			 * Everything between idle and the aging bands are reserved for internal use.
			 * if requested, adjust to JETSAM_PRIORITY_IDLE.
			 * Entitled processes (just munch) can use a subset of this range for testing.
			 */
			if (entries[i].priority > JETSAM_PRIORITY_ENTITLED_MAX ||
			    !current_task_can_use_entitled_range()) {
				entries[i].priority = JETSAM_PRIORITY_IDLE;
			}
		} else if (entries[i].priority == JETSAM_PRIORITY_IDLE_HEAD) {
			/* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle
			 * queue */
			/* Deal with this later */
		} else if ((entries[i].priority < 0) || (entries[i].priority >= MEMSTAT_BUCKET_COUNT)) {
			/* Sanity check */
			error = EINVAL;
			goto out;
		}
	}

	table = kalloc_type(memorystatus_internal_properties_t, entry_count,
	    Z_WAITOK | Z_ZERO);
	if (table == NULL) {
		error = ENOMEM;
		goto out;
	}


	/*
	 * For each jetsam bucket entry, spin through the input property list.
	 * When a matching pid is found, populate an adjacent table with the
	 * appropriate proc pointer and new property values.
	 * This traversal automatically preserves order from lowest
	 * to highest priority.
	 */

	bucket_index = 0;

	proc_list_lock();

	/* Create the ordered table */
	p = memorystatus_get_first_proc_locked(&bucket_index, TRUE);
	while (p && (table_count < entry_count)) {
		for (i = 0; i < entry_count; i++) {
			if (proc_getpid(p) == entries[i].pid) {
				/* Build the table data */
				table[table_count].proc = p;
				table[table_count].priority = entries[i].priority;
				table_count++;
				break;
			}
		}
		p = memorystatus_get_next_proc_locked(&bucket_index, p, TRUE);
	}

	/* We now have ordered list of procs ready to move */
	for (i = 0; i < table_count; i++) {
		p = table[i].proc;
		assert(p != NULL);
		memstat_priority_options_t priority_options = MEMSTAT_PRIORITY_OPTIONS_NONE;

		/*
		 * Allow head inserts: JETSAM_PRIORITY_IDLE_HEAD maps to the
		 * idle band with a head-insert, no-aging placement.
		 */
		if (table[i].priority == JETSAM_PRIORITY_IDLE_HEAD) {
			new_priority = JETSAM_PRIORITY_IDLE;
			priority_options |= (MEMSTAT_PRIORITY_INSERT_HEAD | MEMSTAT_PRIORITY_NO_AGING);
		} else {
			new_priority = table[i].priority;
		}

		/* Not allowed: internal processes are never moved. */
		if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
			continue;
		}

		memstat_update_priority_locked(p, new_priority, priority_options);
	}

	proc_list_unlock();

	/*
	 * if (table_count != entry_count)
	 * then some pids were not found in a jetsam band.
	 * harmless but interesting...
	 */
out:
	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_END, MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY, entry_count, table_count);

	kfree_data(entries, buffer_size);
	kfree_type(memorystatus_internal_properties_t, entry_count, table);

	return error;
}
8273
/*
 * Global process-name -> use-probability table, replaced wholesale by
 * memorystatus_cmd_grp_set_probabilities (swapped under proc_list_lock).
 */
memorystatus_internal_probabilities_t *memorystatus_global_probabilities_table = NULL;
size_t memorystatus_global_probabilities_size = 0;
8276
/*
 * Replace the global process-name -> use-probability table from a
 * user-supplied array of v1 property entries.  The new table is swapped
 * in under proc_list_lock; the old table is freed after the swap.
 * On DEVELOPMENT/DEBUG kernels the table is protected by
 * memorystatus_testing_pid ownership.
 */
static int
memorystatus_cmd_grp_set_probabilities(user_addr_t buffer, size_t buffer_size)
{
	int error = 0;
	memorystatus_properties_entry_v1_t *entries = NULL;
	size_t entry_count = 0, i = 0;
	memorystatus_internal_probabilities_t *tmp_table_new = NULL, *tmp_table_old = NULL;
	size_t tmp_table_new_size = 0, tmp_table_old_size = 0;
#if DEVELOPMENT || DEBUG
	if (memorystatus_testing_pid != 0 && memorystatus_testing_pid != proc_getpid(current_proc())) {
		/* probabilities are currently owned by someone else. Don't change them. */
		error = EPERM;
		goto out;
	}
#endif /* (DEVELOPMENT || DEBUG)*/

	/* Verify inputs */
	if ((buffer == USER_ADDR_NULL) || (buffer_size == 0)) {
		error = EINVAL;
		goto out;
	}

	entry_count = (buffer_size / sizeof(memorystatus_properties_entry_v1_t));
	if (entry_count == 0) {
		/* Buffer too small to hold even a single entry. */
		error = EINVAL;
		goto out;
	}

	if ((entries = kalloc_data(buffer_size, Z_WAITOK)) == NULL) {
		error = ENOMEM;
		goto out;
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_START, MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY, entry_count);

	if ((error = copyin(buffer, entries, buffer_size)) != 0) {
		goto out;
	}

	/* Only v1 entries are accepted; the buffer must divide evenly. */
	if (entries[0].version == MEMORYSTATUS_MPE_VERSION_1) {
		if ((buffer_size % MEMORYSTATUS_MPE_VERSION_1_SIZE) != 0) {
			error = EINVAL;
			goto out;
		}
	} else {
		error = EINVAL;
		goto out;
	}

	/* Verify sanity of input priorities */
	for (i = 0; i < entry_count; i++) {
		/*
		 * 0 - low probability of use.
		 * 1 - high probability of use.
		 *
		 * Keeping this field an int (& not a bool) to allow
		 * us to experiment with different values/approaches
		 * later on.
		 */
		if (entries[i].use_probability > 1) {
			error = EINVAL;
			goto out;
		}
	}

	tmp_table_new_size = sizeof(memorystatus_internal_probabilities_t) * entry_count;

	if ((tmp_table_new = kalloc_data(tmp_table_new_size, Z_WAITOK | Z_ZERO)) == NULL) {
		error = ENOMEM;
		goto out;
	}

	proc_list_lock();

	/* Stash the old table so it can be freed after dropping the lock. */
	if (memorystatus_global_probabilities_table) {
		tmp_table_old = memorystatus_global_probabilities_table;
		tmp_table_old_size = memorystatus_global_probabilities_size;
	}

	memorystatus_global_probabilities_table = tmp_table_new;
	memorystatus_global_probabilities_size = tmp_table_new_size;
	tmp_table_new = NULL;

	for (i = 0; i < entry_count; i++) {
		/* Build the table data */
		strlcpy(memorystatus_global_probabilities_table[i].proc_name, entries[i].proc_name, MAXCOMLEN + 1);
		memorystatus_global_probabilities_table[i].use_probability = entries[i].use_probability;
	}

	proc_list_unlock();

out:
	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_END, MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY, entry_count, tmp_table_new_size);

	kfree_data(entries, buffer_size);
	kfree_data(tmp_table_old, tmp_table_old_size);

	return error;
}
8376
8377 static int
8378 memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
8379 {
8380 int error = 0;
8381
8382 if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY) == MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY) {
8383 error = memorystatus_cmd_grp_set_priorities(buffer, buffer_size);
8384 } else if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY) == MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY) {
8385 error = memorystatus_cmd_grp_set_probabilities(buffer, buffer_size);
8386 #if CONFIG_FREEZE
8387 } else if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_FREEZE_PRIORITY) == MEMORYSTATUS_FLAGS_GRP_SET_FREEZE_PRIORITY) {
8388 error = memorystatus_cmd_grp_set_freeze_list(buffer, buffer_size);
8389 } else if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_DEMOTE_PRIORITY) == MEMORYSTATUS_FLAGS_GRP_SET_DEMOTE_PRIORITY) {
8390 error = memorystatus_cmd_grp_set_demote_list(buffer, buffer_size);
8391 #endif /* CONFIG_FREEZE */
8392 } else {
8393 error = EINVAL;
8394 }
8395
8396 return error;
8397 }
8398
8399 /*
8400 * This routine is used to update a process's jetsam priority position and stored user_data.
8401 * It is not used for the setting of memory limits.
8402 *
8403 * Flags passed into this call are used to distinguish the motivation behind a jetsam priority
8404 * transition. By default, the kernel updates the process's original requested priority when
8405 * no flag is passed. But when the MEMORYSTATUS_SET_PRIORITY_ASSERTION flag is used, the kernel
8406 * updates the process's assertion driven priority.
8407 *
8408 * The assertion flag was introduced for use by the device's assertion mediator (eg: runningboardd).
8409 * When an assertion is controlling a process's jetsam priority, it may conflict with that process's
8410 * dirty/clean (active/inactive) jetsam state. The kernel attempts to resolve a priority transition
8411 * conflict by reviewing the process state and then choosing the maximum jetsam band at play,
8412 * eg: requested priority versus assertion priority.
8413 */
8414
8415 static int
8416 memorystatus_cmd_set_priority_properties(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
8417 {
8418 int error = 0;
8419 memorystatus_priority_properties_t mpp_entry;
8420
8421 /* Validate inputs */
8422 if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_priority_properties_t))) {
8423 return EINVAL;
8424 }
8425
8426 /* Validate flags */
8427 if (flags & ~(MEMORYSTATUS_SET_PRIORITY_ASSERTION)) {
8428 /*
8429 * Unsupported bit set in flag.
8430 */
8431 return EINVAL;
8432 }
8433
8434 error = copyin(buffer, &mpp_entry, buffer_size);
8435
8436 if (error == 0) {
8437 proc_t p;
8438
8439 p = proc_find(pid);
8440 if (!p) {
8441 return ESRCH;
8442 }
8443
8444 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
8445 proc_rele(p);
8446 return EPERM;
8447 }
8448
8449 if ((flags & MEMORYSTATUS_SET_PRIORITY_ASSERTION) &&
8450 !(p->p_memstat_state & P_MEMSTAT_MANAGED)) {
8451 /*
8452 * Assertion-
8453 * processes.
8454 */
8455 proc_rele(p);
8456 return EPERM;
8457 }
8458
8459 memstat_priority_options_t options = MEMSTAT_PRIORITY_OPTIONS_NONE;
8460 if (flags & MEMORYSTATUS_SET_PRIORITY_ASSERTION) {
8461 options |= MEMSTAT_PRIORITY_IS_ASSERTION;
8462 }
8463 error = memorystatus_set_priority(p, mpp_entry.priority, mpp_entry.user_data,
8464 options);
8465 proc_rele(p);
8466 }
8467
8468 return error;
8469 }
8470
8471 static int
8472 memorystatus_cmd_set_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
8473 {
8474 int error = 0;
8475 memorystatus_memlimit_properties_t mmp_entry;
8476
8477 /* Validate inputs */
8478 if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_memlimit_properties_t))) {
8479 return EINVAL;
8480 }
8481
8482 error = copyin(buffer, &mmp_entry, buffer_size);
8483
8484 if (error == 0) {
8485 error = memorystatus_set_memlimit_properties(pid, &mmp_entry);
8486 }
8487
8488 return error;
8489 }
8490
8491 #if DEBUG || DEVELOPMENT
8492 static int
8493 memorystatus_cmd_set_diag_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
8494 {
8495 int error = 0;
8496 memorystatus_diag_memlimit_properties_t mmp_entry;
8497 proc_t p = proc_find(pid);
8498 if (!p) {
8499 return ESRCH;
8500 }
8501
8502 /* Validate inputs */
8503 if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_diag_memlimit_properties_t))) {
8504 proc_rele(p);
8505 return EINVAL;
8506 }
8507
8508 error = copyin(buffer, &mmp_entry, buffer_size);
8509
8510 if (error == 0) {
8511 proc_list_lock();
8512 error = memorystatus_set_diag_memlimit_properties_internal(p, &mmp_entry);
8513 proc_list_unlock();
8514 }
8515 proc_rele(p);
8516 return error;
8517 }
8518
8519 static int
8520 memorystatus_cmd_get_diag_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
8521 {
8522 int error = 0;
8523 memorystatus_diag_memlimit_properties_t mmp_entry;
8524 proc_t p = proc_find(pid);
8525 if (!p) {
8526 return ESRCH;
8527 }
8528
8529 /* Validate inputs */
8530 if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_diag_memlimit_properties_t))) {
8531 proc_rele(p);
8532 return EINVAL;
8533 }
8534 proc_list_lock();
8535 error = memorystatus_get_diag_memlimit_properties_internal(p, &mmp_entry);
8536 proc_list_unlock();
8537 proc_rele(p);
8538 if (error == 0) {
8539 error = copyout(&mmp_entry, buffer, buffer_size);
8540 }
8541
8542
8543 return error;
8544 }
8545 #endif //DEBUG || DEVELOPMENT
8546
/*
 * Query the conclave memory limit of the given pid's task and return it
 * (rounded to the nearest MB) through *retval.  Returns a BSD errno
 * translated from the Mach result, or ESRCH if the pid is not found.
 */
static int
_memstat_get_process_conclave_mem_limit(pid_t pid, int32_t *retval)
{
	kern_return_t error;
	proc_t p = proc_find(pid);
	if (!p) {
		return ESRCH;
	}

	uint64_t conclave_limit;
	error = task_get_conclave_mem_limit(proc_task(p), &conclave_limit);

	if (error == KERN_SUCCESS) {
		/*
		 * NOTE(review): the 64-bit limit is truncated to 32 bits before
		 * rounding -- presumably conclave limits are always < 4GB;
		 * confirm against task_get_conclave_mem_limit's contract.
		 */
		*retval = roundToNearestMB((uint32_t)conclave_limit);
	}

	proc_rele(p);
	return mach_to_bsd_errno(error);
}
8566
8567 static void
8568 memorystatus_get_memlimit_properties_internal(proc_t p, memorystatus_memlimit_properties_t* p_entry)
8569 {
8570 memset(p_entry, 0, sizeof(memorystatus_memlimit_properties_t));
8571
8572 if (p->p_memstat_memlimit_active > 0) {
8573 p_entry->memlimit_active = p->p_memstat_memlimit_active;
8574 } else {
8575 task_convert_phys_footprint_limit(-1, &p_entry->memlimit_active);
8576 }
8577
8578 if (_memstat_proc_active_memlimit_is_fatal(p)) {
8579 p_entry->memlimit_active_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
8580 }
8581
8582 /*
8583 * Get the inactive limit and attributes
8584 */
8585 if (p->p_memstat_memlimit_inactive <= 0) {
8586 task_convert_phys_footprint_limit(-1, &p_entry->memlimit_inactive);
8587 } else {
8588 p_entry->memlimit_inactive = p->p_memstat_memlimit_inactive;
8589 }
8590 if (_memstat_proc_inactive_memlimit_is_fatal(p)) {
8591 p_entry->memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
8592 }
8593 }
8594
8595 /*
8596 * When getting the memlimit settings, we can't simply call task_get_phys_footprint_limit().
8597 * That gets the proc's cached memlimit and there is no guarantee that the active/inactive
8598 * limits will be the same in the no-limit case. Instead we convert limits <= 0 using
8599 * task_convert_phys_footprint_limit(). It computes the same limit value that would be written
8600 * to the task's ledgers via task_set_phys_footprint_limit().
8601 */
8602 static int
8603 memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
8604 {
8605 memorystatus_memlimit_properties2_t mmp_entry;
8606
8607 /* Validate inputs */
8608 if ((pid == 0) || (buffer == USER_ADDR_NULL) ||
8609 ((buffer_size != sizeof(memorystatus_memlimit_properties_t)) &&
8610 (buffer_size != sizeof(memorystatus_memlimit_properties2_t)))) {
8611 return EINVAL;
8612 }
8613
8614 memset(&mmp_entry, 0, sizeof(memorystatus_memlimit_properties2_t));
8615
8616 proc_t p = proc_find(pid);
8617 if (!p) {
8618 return ESRCH;
8619 }
8620
8621 /*
8622 * Get the active limit and attributes.
8623 * No locks taken since we hold a reference to the proc.
8624 */
8625
8626 memorystatus_get_memlimit_properties_internal(p, &mmp_entry.v1);
8627
8628 #if CONFIG_JETSAM
8629 #if DEVELOPMENT || DEBUG
8630 /*
8631 * Get the limit increased via SPI
8632 */
8633 mmp_entry.memlimit_increase = roundToNearestMB(p->p_memlimit_increase);
8634 mmp_entry.memlimit_increase_bytes = p->p_memlimit_increase;
8635 #endif /* DEVELOPMENT || DEBUG */
8636 #endif /* CONFIG_JETSAM */
8637
8638 proc_rele(p);
8639
8640 int error = copyout(&mmp_entry, buffer, buffer_size);
8641
8642 return error;
8643 }
8644
8645
8646 /*
8647 * SPI for kbd - pr24956468
8648 * This is a very simple snapshot that calculates how much a
8649 * process's phys_footprint exceeds a specific memory limit.
8650 * Only the inactive memory limit is supported for now.
8651 * The delta is returned as bytes in excess or zero.
8652 */
8653 static int
8654 memorystatus_cmd_get_memlimit_excess_np(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
8655 {
8656 int error = 0;
8657 uint64_t footprint_in_bytes = 0;
8658 uint64_t delta_in_bytes = 0;
8659 int32_t memlimit_mb = 0;
8660 uint64_t memlimit_bytes = 0;
8661
8662 /* Validate inputs */
8663 if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(uint64_t)) || (flags != 0)) {
8664 return EINVAL;
8665 }
8666
8667 proc_t p = proc_find(pid);
8668 if (!p) {
8669 return ESRCH;
8670 }
8671
8672 /*
8673 * Get the inactive limit.
8674 * No locks taken since we hold a reference to the proc.
8675 */
8676
8677 if (p->p_memstat_memlimit_inactive <= 0) {
8678 task_convert_phys_footprint_limit(-1, &memlimit_mb);
8679 } else {
8680 memlimit_mb = p->p_memstat_memlimit_inactive;
8681 }
8682
8683 footprint_in_bytes = get_task_phys_footprint(proc_task(p));
8684
8685 proc_rele(p);
8686
8687 memlimit_bytes = memlimit_mb * 1024 * 1024; /* MB to bytes */
8688
8689 /*
8690 * Computed delta always returns >= 0 bytes
8691 */
8692 if (footprint_in_bytes > memlimit_bytes) {
8693 delta_in_bytes = footprint_in_bytes - memlimit_bytes;
8694 }
8695
8696 error = copyout(&delta_in_bytes, buffer, sizeof(delta_in_bytes));
8697
8698 return error;
8699 }
8700
8701
8702 static int
8703 memorystatus_cmd_get_pressure_status(int32_t *retval)
8704 {
8705 int error;
8706
8707 /* Need privilege for check */
8708 error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
8709 if (error) {
8710 return error;
8711 }
8712
8713 /* Inherently racy, so it's not worth taking a lock here */
8714 *retval = (kVMPressureNormal != memorystatus_vm_pressure_level) ? 1 : 0;
8715
8716 return error;
8717 }
8718
8719 int
8720 memorystatus_get_pressure_status_kdp()
8721 {
8722 return (kVMPressureNormal != memorystatus_vm_pressure_level) ? 1 : 0;
8723 }
8724
8725 /*
8726 * Every process, including a P_MEMSTAT_INTERNAL process (currently only pid 1), is allowed to set a HWM.
8727 *
8728 * This call is inflexible -- it does not distinguish between active/inactive, fatal/non-fatal
8729 * So, with 2-level HWM preserving previous behavior will map as follows.
8730 * - treat the limit passed in as both an active and inactive limit.
8731 * - treat the is_fatal_limit flag as though it applies to both active and inactive limits.
8732 *
8733 * When invoked via MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK
8734 * - the is_fatal_limit is FALSE, meaning the active and inactive limits are non-fatal/soft
8735 * - so mapping is (active/non-fatal, inactive/non-fatal)
8736 *
8737 * When invoked via MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT
8738 * - the is_fatal_limit is TRUE, meaning the process's active and inactive limits are fatal/hard
8739 * - so mapping is (active/fatal, inactive/fatal)
8740 */
8741
8742 #if CONFIG_JETSAM
8743 static int
8744 memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __unused int32_t *retval, boolean_t is_fatal_limit)
8745 {
8746 int error = 0;
8747 memorystatus_memlimit_properties_t entry;
8748
8749 entry.memlimit_active = high_water_mark;
8750 entry.memlimit_active_attr = 0;
8751 entry.memlimit_inactive = high_water_mark;
8752 entry.memlimit_inactive_attr = 0;
8753
8754 if (is_fatal_limit == TRUE) {
8755 entry.memlimit_active_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
8756 entry.memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
8757 }
8758
8759 error = memorystatus_set_memlimit_properties(pid, &entry);
8760 return error;
8761 }
8762
8763 static int
8764 memorystatus_cmd_mark_process_coalition_swappable(pid_t pid, __unused int32_t *retval)
8765 {
8766 int error = 0;
8767 proc_t p = PROC_NULL;
8768 coalition_t coal = COALITION_NULL;
8769
8770 if (!memorystatus_swap_all_apps) {
8771 /* Swap is not supported on this device. */
8772 return ENOTSUP;
8773 }
8774 p = proc_find(pid);
8775 if (!p) {
8776 return ESRCH;
8777 }
8778 coal = task_get_coalition((task_t) proc_task(p), COALITION_TYPE_JETSAM);
8779 if (coal && coalition_is_leader((task_t) proc_task(p), coal)) {
8780 coalition_mark_swappable(coal);
8781 } else {
8782 /* This SPI is only supported on coalition leaders. */
8783 error = EINVAL;
8784 }
8785
8786 proc_rele(p);
8787 return error;
8788 }
8789
8790 static int
8791 memorystatus_cmd_get_process_coalition_is_swappable(pid_t pid, int32_t *retval)
8792 {
8793 int error = 0;
8794 proc_t p = PROC_NULL;
8795 coalition_t coal = COALITION_NULL;
8796
8797 if (!memorystatus_swap_all_apps) {
8798 /* Swap is not supported on this device. */
8799 return ENOTSUP;
8800 }
8801 p = proc_find(pid);
8802 if (!p) {
8803 return ESRCH;
8804 }
8805 coal = task_get_coalition((task_t) proc_task(p), COALITION_TYPE_JETSAM);
8806 if (coal) {
8807 *retval = coalition_is_swappable(coal);
8808 } else {
8809 error = EINVAL;
8810 }
8811
8812 proc_rele(p);
8813 return error;
8814 }
8815
8816 static int
8817 memorystatus_cmd_convert_memlimit_mb(pid_t pid, int32_t limit, int32_t *retval)
8818 {
8819 int error = 0;
8820 proc_t p;
8821 p = proc_find(pid);
8822 if (!p) {
8823 return ESRCH;
8824 }
8825 if (limit <= 0) {
8826 /*
8827 * A limit of <= 0 implies that the task gets its default limit.
8828 */
8829 limit = memorystatus_get_default_task_active_limit(p);
8830 if (limit <= 0) {
8831 /* Task uses system wide default limit */
8832 limit = max_task_footprint_mb ? max_task_footprint_mb : INT32_MAX;
8833 }
8834 *retval = limit;
8835 } else {
8836 #if DEVELOPMENT || DEBUG
8837 /* add the current increase to it, for roots */
8838 limit += roundToNearestMB(p->p_memlimit_increase);
8839 #endif /* DEVELOPMENT || DEBUG */
8840 *retval = limit;
8841 }
8842
8843 proc_rele(p);
8844 return error;
8845 }
8846
8847 static int
8848 _memstat_rearm_proc_memlimit(proc_t proc, void* flagsptr)
8849 {
8850 task_t task = proc_task(proc);
8851 uint32_t flags = *((uint32_t *) flagsptr);
8852
8853 if (flags & MEMORYSTATUS_FLAGS_REARM_ACTIVE) {
8854 task_reset_triggered_exc_resource(task, true);
8855 }
8856 if (flags & MEMORYSTATUS_FLAGS_REARM_INACTIVE) {
8857 task_reset_triggered_exc_resource(task, false);
8858 }
8859
8860 return 0;
8861 }
8862
8863 static int
8864 memorystatus_cmd_rearm_memlimit(pid_t pid, uint32_t flags, __unused int32_t *retval)
8865 {
8866 if (pid == -1) {
8867 /* Re-arm all pids */
8868 proc_iterate(
8869 PROC_ALLPROCLIST,
8870 _memstat_rearm_proc_memlimit,
8871 &flags,
8872 NULL,
8873 NULL);
8874 } else {
8875 /* Re-arm one pid */
8876 proc_t p = (pid == proc_selfpid()) ? proc_self() : proc_find(pid);
8877 if (!p) {
8878 return ESRCH;
8879 }
8880 _memstat_rearm_proc_memlimit(p, &flags);
8881 proc_rele(p);
8882 }
8883
8884 return 0;
8885 }
8886 #endif /* CONFIG_JETSAM */
8887
8888 #if DEBUG || DEVELOPMENT
/*
 * Apply a diagnostics footprint limit to p's task by writing directly to the
 * task ledgers; the previous limit is captured in old_limit for logging.
 * Returns 0 on success, EINVAL on ledger-write failure.
 *
 * NOTE(review): the success value is spelled KERN_SUCCESS while the failure
 * value is the BSD errno EINVAL — both are 0/nonzero ints, but the mixed
 * error domains are worth confirming against callers.
 */
static int
memorystatus_set_diag_memlimit_properties_internal(proc_t p, memorystatus_diag_memlimit_properties_t *p_entry)
{
	int error = 0;
	uint64_t old_limit = 0;

	/* Caller must already hold the proc list lock. */
	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
	/* Enforce the limit by writing to the ledgers */
	error = (task_set_diag_footprint_limit_internal(proc_task(p), p_entry->memlimit, &old_limit) == KERN_SUCCESS) ? KERN_SUCCESS : EINVAL;

	memorystatus_log_debug( "memorystatus_set_diag_memlimit_properties: new limit on pid %d (%lluMB old %lluMB)\n",
	    proc_getpid(p), (p_entry->memlimit > 0 ? p_entry->memlimit : -1), (old_limit)
	    );
	DTRACE_MEMORYSTATUS2(memorystatus_diag_memlimit_properties_t, proc_t, p, int32_t, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1));
	return error;
}
8905
/*
 * Read back the diagnostics footprint limit and its enabled state from
 * p's task ledgers into *p_entry. Returns 0 on success, EINVAL on failure.
 * (Same mixed KERN_SUCCESS/EINVAL convention as the setter above.)
 */
static int
memorystatus_get_diag_memlimit_properties_internal(proc_t p, memorystatus_diag_memlimit_properties_t *p_entry)
{
	int error = 0;
	/* Read the limit from the ledgers (comment inherited from the setter). */
	error = (task_get_diag_footprint_limit_internal(proc_task(p), &p_entry->memlimit, &p_entry->threshold_enabled) == KERN_SUCCESS) ? KERN_SUCCESS : EINVAL;

	DTRACE_MEMORYSTATUS2(memorystatus_diag_memlimit_properties_t, proc_t, p, int32_t, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1));
	return error;
}
8916 #endif // DEBUG || DEVELOPMENT
8917
8918 bool
8919 memorystatus_task_has_increased_memory_limit_entitlement(task_t task)
8920 {
8921 if (memorystatus_entitled_max_task_footprint_mb == 0) {
8922 // Entitlement is not supported on this device.
8923 return false;
8924 }
8925 return IOTaskHasEntitlement(task,
8926 "com.apple.developer.kernel.increased-memory-limit");
8927 }
8928
8929 bool
8930 memorystatus_task_has_increased_debugging_memory_limit_entitlement(task_t task)
8931 {
8932 if (memorystatus_entitled_dev_max_task_footprint_mb == 0) {
8933 // Entitlement is not supported on this device.
8934 return false;
8935 }
8936 return IOTaskHasEntitlement(task,
8937 "com.apple.developer.kernel.increased-debugging-memory-limit");
8938 }
8939
8940 bool
8941 memorystatus_task_has_legacy_footprint_entitlement(task_t task)
8942 {
8943 return IOTaskHasEntitlement(task,
8944 "com.apple.private.memory.legacy_footprint");
8945 }
8946
8947 bool
8948 memorystatus_task_has_ios13extended_footprint_limit(task_t task)
8949 {
8950 if (max_mem < 1500ULL * 1024 * 1024 ||
8951 max_mem > 2ULL * 1024 * 1024 * 1024) {
8952 /* ios13extended_footprint is only for 2GB devices */
8953 return false;
8954 }
8955 return IOTaskHasEntitlement(task,
8956 "com.apple.developer.memory.ios13extended_footprint");
8957 }
8958
/*
 * Compute the default active memory limit (in MB) for p: start from the
 * system-wide max_task_footprint_mb and raise it for each entitlement-based
 * footprint increase the task qualifies for. The result can still be <= 0
 * when no system-wide limit is configured (callers handle that case).
 */
static int32_t
memorystatus_get_default_task_active_limit(proc_t p)
{
	int32_t limit = (int32_t)max_task_footprint_mb;
	task_t task = proc_task(p);

	/*
	 * Check for the various entitlement footprint hacks
	 * and try to apply each one. Note that if multiple entitlements are present
	 * whichever results in the largest limit applies.
	 */
	if (memorystatus_task_has_increased_debugging_memory_limit_entitlement(task)) {
		limit = MAX(limit, memorystatus_entitled_dev_max_task_footprint_mb);
	}
	if (memorystatus_task_has_increased_memory_limit_entitlement(task)) {
#if CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT
		/* On visionOS, we want a separate memory limit for iOS (bincompat) apps. */
		if ((proc_platform(p) == PLATFORM_IOS) &&
		    (memorystatus_entitled_bincompat_max_task_footprint_mb != 0)) {
			limit = MAX(limit, memorystatus_entitled_bincompat_max_task_footprint_mb);
		} else {
			limit = MAX(limit, memorystatus_entitled_max_task_footprint_mb);
		}
#else /* CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT */
		limit = MAX(limit, memorystatus_entitled_max_task_footprint_mb);
#endif /* !CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT */
	}
#if __arm64__
	/* Legacy-footprint mode grants a fixed bonus on top of the default. */
	if (legacy_footprint_entitlement_mode == LEGACY_FOOTPRINT_ENTITLEMENT_LIMIT_INCREASE &&
	    memorystatus_task_has_legacy_footprint_entitlement(task)) {
		limit = MAX(limit, max_task_footprint_mb + legacy_footprint_bonus_mb);
	}
#endif /* __arm64__ */
	if (memorystatus_task_has_ios13extended_footprint_limit(task)) {
		limit = MAX(limit, memorystatus_ios13extended_footprint_limit_mb);
	}

	return limit;
}
8998
8999 static int32_t
9000 memorystatus_get_default_task_inactive_limit(proc_t p)
9001 {
9002 // Currently the default active and inactive limits are always the same.
9003 return memorystatus_get_default_task_active_limit(p);
9004 }
9005
9006 static int
9007 memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties_t *entry)
9008 {
9009 int32_t memlimit_active, memlimit_inactive;
9010 memlimit_options_t memlimit_options = MEMLIMIT_OPTIONS_NONE;
9011
9012 proc_t p = proc_find(pid);
9013 if (!p) {
9014 return ESRCH;
9015 }
9016
9017 /*
9018 * Check for valid attribute flags.
9019 */
9020 const uint32_t valid_attrs = MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
9021 if ((entry->memlimit_active_attr & (~valid_attrs)) != 0) {
9022 proc_rele(p);
9023 return EINVAL;
9024 }
9025 if ((entry->memlimit_inactive_attr & (~valid_attrs)) != 0) {
9026 proc_rele(p);
9027 return EINVAL;
9028 }
9029
9030 /*
9031 * Setup the active memlimit properties
9032 */
9033 memlimit_active = entry->memlimit_active;
9034 if (entry->memlimit_active_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL) {
9035 memlimit_options |= MEMLIMIT_ACTIVE_FATAL;
9036 }
9037
9038 /*
9039 * Setup the inactive memlimit properties
9040 */
9041 memlimit_inactive = entry->memlimit_inactive;
9042 if (entry->memlimit_inactive_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL) {
9043 memlimit_options |= MEMLIMIT_INACTIVE_FATAL;
9044 }
9045
9046 int error = memorystatus_set_memlimits(p, memlimit_active,
9047 memlimit_inactive, memlimit_options);
9048 proc_rele(p);
9049 return error;
9050 }
9051
9052 /*
9053 * Returns the jetsam priority (effective or requested) of the process
9054 * associated with this task.
9055 */
9056 int
9057 proc_get_memstat_priority(proc_t p, boolean_t effective_priority)
9058 {
9059 if (p) {
9060 if (effective_priority) {
9061 return p->p_memstat_effectivepriority;
9062 } else {
9063 return p->p_memstat_requestedpriority;
9064 }
9065 }
9066 return 0;
9067 }
9068
9069 static int
9070 memorystatus_get_process_is_managed(pid_t pid, int *is_managed)
9071 {
9072 proc_t p = NULL;
9073
9074 /* Validate inputs */
9075 if (pid == 0) {
9076 return EINVAL;
9077 }
9078
9079 p = proc_find(pid);
9080 if (!p) {
9081 return ESRCH;
9082 }
9083
9084 *is_managed = memorystatus_get_proc_is_managed(p) ? 1 : 0;
9085
9086 proc_rele(p);
9087
9088 return 0;
9089 }
9090
9091 bool
9092 memorystatus_get_proc_is_managed(proc_t proc)
9093 {
9094 proc_list_lock();
9095 bool is_managed = _memstat_proc_is_managed(proc);
9096 proc_list_unlock();
9097 return is_managed;
9098 }
9099
9100
9101 static int
9102 memorystatus_set_process_is_managed(pid_t pid, boolean_t set_managed)
9103 {
9104 proc_t p = NULL;
9105
9106 /* Validate inputs */
9107 if (pid == 0) {
9108 return EINVAL;
9109 }
9110
9111 p = proc_find(pid);
9112 if (!p) {
9113 return ESRCH;
9114 }
9115
9116 proc_list_lock();
9117
9118 if (set_managed == TRUE) {
9119 p->p_memstat_state |= P_MEMSTAT_MANAGED;
9120 /*
9121 * The P_MEMSTAT_MANAGED bit is set by Runningboard for Apps.
9122 * Also opt them in to being frozen (they might have started
9123 * off with the P_MEMSTAT_FREEZE_DISABLED bit set.)
9124 */
9125 p->p_memstat_state &= ~P_MEMSTAT_FREEZE_DISABLED;
9126 } else {
9127 p->p_memstat_state &= ~P_MEMSTAT_MANAGED;
9128 }
9129
9130 if (_memstat_proc_is_tracked(p)) {
9131 memorystatus_log_error("memorystatus: process %s [%d] opted in to both "
9132 "Management and ActivityTracking\n", proc_best_name(p),
9133 proc_pid(p));
9134 }
9135
9136 proc_list_unlock();
9137
9138 proc_rele(p);
9139
9140 return 0;
9141 }
9142
/*
 * Return (and optionally clear) the kill count recorded for a given
 * (priority band, kill cause) pair.
 *
 * Idle-exit kills live in a dedicated counter, valid only for the idle band.
 * Every other cause indexes memorystatus_kill_counts; the column index is
 * compacted so the table has no slots for kMemorystatusInvalid or
 * kMemorystatusKilledIdleExit — hence "- 1" for causes below idle-exit and
 * "- 2" for causes above it.
 */
static int
_memstat_get_kill_count(int priority, memorystatus_kill_cause_t cause, bool clear)
{
	uint32_t _Atomic *ptr;

	assert(priority >= JETSAM_PRIORITY_IDLE);
	assert(priority <= JETSAM_PRIORITY_MAX);

	/* rdar://141462516 */
	if (cause == kMemorystatusInvalid) {
		return 0;
	} else if (cause == kMemorystatusKilledIdleExit) {
		if (priority == JETSAM_PRIORITY_IDLE) {
			ptr = &memorystatus_idle_exit_kill_count;
		} else {
			return 0; /* This never happens */
		}
	} else {
		if (cause < kMemorystatusKilledIdleExit) {
			/* Skip the invalid slot only. */
			ptr = &memorystatus_kill_counts[priority][cause - 1];
		} else {
			/* Skip both the invalid and idle-exit slots. */
			ptr = &memorystatus_kill_counts[priority][cause - 2];
		}
	}

	if (clear) {
		/* Atomically read and reset the counter. */
		return os_atomic_xchg(ptr, 0, relaxed);
	} else {
		return os_atomic_load(ptr, relaxed);
	}
}
9174
/*
 * MEMORYSTATUS_CMD_GET_KILL_COUNTS handler: copy out one uint32_t kill
 * count per cause for the given priority band. If
 * MEMORYSTATUS_GET_KILL_COUNTS_CLEAR is set in flags, each counter is reset
 * as it is read. The copyout is truncated to the caller's buffer size.
 */
static int
memorystatus_cmd_get_kill_counts(int priority, user_addr_t buffer, size_t buffer_size, int flags)
{
	memorystatus_kill_cause_t cause;
	uint32_t outbuf[JETSAM_REASON_MEMORYSTATUS_MAX + 1];
	bool clear = flags & MEMORYSTATUS_GET_KILL_COUNTS_CLEAR;

	/* Buffer must be a whole number of counters; priority must be a valid band. */
	if (((buffer_size % sizeof(uint32_t)) != 0) ||
	    (priority < JETSAM_PRIORITY_IDLE) ||
	    (priority > JETSAM_PRIORITY_MAX)) {
		return EINVAL;
	}

	for (cause = kMemorystatusInvalid; cause <= JETSAM_REASON_MEMORYSTATUS_MAX; cause++) {
		outbuf[cause] = _memstat_get_kill_count(priority, cause, clear);
	}

	return copyout(outbuf, buffer, MIN(buffer_size, sizeof(outbuf)));
}
9194
/*
 * Entry point for the memorystatus_control(2) syscall.
 *
 * Authenticates the caller (root, MEMORYSTATUS_ENTITLEMENT, or one of a
 * small set of commands exempt from the check), bounds the user buffer
 * where required, runs the MACF hook, then dispatches on args->command.
 * Returns a BSD errno; per-command results come back through *ret and/or
 * the user buffer.
 */
int
memorystatus_control(struct proc *p, struct memorystatus_control_args *args, int *ret)
{
	int error = EINVAL;
	boolean_t skip_auth_check = FALSE;
	os_reason_t jetsam_reason = OS_REASON_NULL;

#if !CONFIG_JETSAM
#pragma unused(ret)
#pragma unused(jetsam_reason)
#endif

	/* We don't need entitlements if we're setting / querying the freeze preference or frozen status for a process. */
	if (args->command == MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE ||
	    args->command == MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE ||
	    args->command == MEMORYSTATUS_CMD_GET_PROCESS_IS_FROZEN) {
		skip_auth_check = TRUE;
	}

	/*
	 * On development kernel, we don't need entitlements if we're adjusting the limit.
	 * This required for limit adjustment by dyld when roots are detected, see rdar://99669958
	 */
#if DEVELOPMENT || DEBUG
	if (args->command == MEMORYSTATUS_CMD_INCREASE_JETSAM_TASK_LIMIT && proc_getpid(p) == args->pid) {
		skip_auth_check = TRUE;
	}
#endif /* DEVELOPMENT || DEBUG */

#if DEVELOPMENT || DEBUG
	/*
	 * On development kernels, processes should be able to re-arm themselves
	 * without entitlement for testing.
	 */
	if (args->command == MEMORYSTATUS_CMD_REARM_MEMLIMIT && proc_getpid(p) == args->pid) {
		skip_auth_check = TRUE;
	}
#endif

	/* Need to be root or have entitlement. */
	if (!kauth_cred_issuser(kauth_cred_get()) && !IOCurrentTaskHasEntitlement(MEMORYSTATUS_ENTITLEMENT) && !skip_auth_check) {
		error = EPERM;
		goto out;
	}

	/*
	 * Sanity check.
	 * Do not enforce it for snapshots or v2 priority list.
	 * (the latter always allocates an appropriately-sized buffer.)
	 */
	if (args->command != MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT &&
	    args->command != MEMORYSTATUS_CMD_GET_PRIORITY_LIST_V2 &&
	    args->command != MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_NAMES &&
	    args->command != MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_INFO &&
	    args->command != MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_MEMINFO) {
		if (args->buffersize > MEMORYSTATUS_BUFFERSIZE_MAX) {
			error = EINVAL;
			goto out;
		}
	}

#if CONFIG_MACF
	/* Give MAC policies a chance to veto the command. */
	error = mac_proc_check_memorystatus_control(p, args->command, args->pid);
	if (error) {
		goto out;
	}
#endif /* MAC */

	/* Per-command dispatch. */
	switch (args->command) {
	case MEMORYSTATUS_CMD_GET_PRIORITY_LIST:
		error = memorystatus_cmd_get_priority_list(
			args->pid,
			args->buffer,
			args->buffersize,
			ret,
			sizeof(memorystatus_priority_entry_t));
		break;
	case MEMORYSTATUS_CMD_GET_PRIORITY_LIST_V2:
		error = memorystatus_cmd_get_priority_list(
			args->pid,
			args->buffer,
			args->buffersize,
			ret,
			sizeof(memorystatus_priority_entry_v2_t));
		break;
	case MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES:
		error = memorystatus_cmd_set_priority_properties(args->pid, args->flags, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_SET_MEMLIMIT_PROPERTIES:
		error = memorystatus_cmd_set_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GET_MEMLIMIT_PROPERTIES:
		error = memorystatus_cmd_get_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GET_MEMLIMIT_EXCESS:
		error = memorystatus_cmd_get_memlimit_excess_np(args->pid, args->flags, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GRP_SET_PROPERTIES:
		error = memorystatus_cmd_grp_set_properties((int32_t)args->flags, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT:
		error = memorystatus_cmd_get_jetsam_snapshot((int32_t)args->flags, args->buffer, args->buffersize, ret);
		break;
#if JETSAM_ZPRINT_SNAPSHOT
	case MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_NAMES:
		error = memorystatus_cmd_get_data_buffer(args->buffer, args->buffersize, ret,
		    jzs_zone_cnt * sizeof(mach_zone_name_t), jzs_names);
		break;
	case MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_INFO:
		error = memorystatus_cmd_get_data_buffer(args->buffer, args->buffersize, ret,
		    jzs_zone_cnt * sizeof(mach_zone_info_t), jzs_info);
		break;
	case MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_MEMINFO:
		error = memorystatus_cmd_get_data_buffer(args->buffer, args->buffersize, ret,
		    jzs_meminfo_cnt * sizeof(mach_memory_info_t), jzs_meminfo);
		break;
#endif
#if DEVELOPMENT || DEBUG
	case MEMORYSTATUS_CMD_SET_TESTING_PID:
		error = memorystatus_cmd_set_testing_pid((int32_t) args->flags);
		break;
#endif
	case MEMORYSTATUS_CMD_GET_PRESSURE_STATUS:
		error = memorystatus_cmd_get_pressure_status(ret);
		break;
#if CONFIG_JETSAM
	case MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK:
		/*
		 * This call does not distinguish between active and inactive limits.
		 * Default behavior in 2-level HWM world is to set both.
		 * Non-fatal limit is also assumed for both.
		 */
		error = memorystatus_cmd_set_jetsam_memory_limit(args->pid, (int32_t)args->flags, ret, FALSE);
		break;
	case MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT:
		/*
		 * This call does not distinguish between active and inactive limits.
		 * Default behavior in 2-level HWM world is to set both.
		 * Fatal limit is also assumed for both.
		 */
		error = memorystatus_cmd_set_jetsam_memory_limit(args->pid, (int32_t)args->flags, ret, TRUE);
		break;
	case MEMORYSTATUS_CMD_MARK_PROCESS_COALITION_SWAPPABLE:
		error = memorystatus_cmd_mark_process_coalition_swappable(args->pid, ret);
		break;

	case MEMORYSTATUS_CMD_GET_PROCESS_COALITION_IS_SWAPPABLE:
		error = memorystatus_cmd_get_process_coalition_is_swappable(args->pid, ret);
		break;

	case MEMORYSTATUS_CMD_CONVERT_MEMLIMIT_MB:
		error = memorystatus_cmd_convert_memlimit_mb(args->pid, (int32_t) args->flags, ret);
		break;

	case MEMORYSTATUS_CMD_REARM_MEMLIMIT:
		error = memorystatus_cmd_rearm_memlimit(args->pid, args->flags, ret);
		break;
#endif /* CONFIG_JETSAM */
	/* Test commands */
#if DEVELOPMENT || DEBUG
	case MEMORYSTATUS_CMD_TEST_JETSAM:
		jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_GENERIC);
		if (jetsam_reason == OS_REASON_NULL) {
			memorystatus_log_error("memorystatus_control: failed to allocate jetsam reason\n");
		}

		error = memstat_kill_process_sync(args->pid, kMemorystatusKilled, jetsam_reason) ? 0 : EINVAL;
		break;
	case MEMORYSTATUS_CMD_TEST_JETSAM_SORT:
		error = memorystatus_cmd_test_jetsam_sort(args->pid, (int32_t)args->flags, args->buffer, args->buffersize);
		break;
#else /* DEVELOPMENT || DEBUG */
#pragma unused(jetsam_reason)
#endif /* DEVELOPMENT || DEBUG */
	case MEMORYSTATUS_CMD_AGGRESSIVE_JETSAM_LENIENT_MODE_ENABLE:
		if (memorystatus_aggressive_jetsam_lenient_allowed == FALSE) {
#if DEVELOPMENT || DEBUG
			memorystatus_log_info("Enabling Lenient Mode\n");
#endif /* DEVELOPMENT || DEBUG */

			memorystatus_aggressive_jetsam_lenient_allowed = TRUE;
			memorystatus_aggressive_jetsam_lenient = TRUE;
			error = 0;
		}
		break;
	case MEMORYSTATUS_CMD_AGGRESSIVE_JETSAM_LENIENT_MODE_DISABLE:
#if DEVELOPMENT || DEBUG
		memorystatus_log_info("Disabling Lenient mode\n");
#endif /* DEVELOPMENT || DEBUG */
		memorystatus_aggressive_jetsam_lenient_allowed = FALSE;
		memorystatus_aggressive_jetsam_lenient = FALSE;
		error = 0;
		break;
	case MEMORYSTATUS_CMD_GET_AGGRESSIVE_JETSAM_LENIENT_MODE:
		*ret = (memorystatus_aggressive_jetsam_lenient ? 1 : 0);
		error = 0;
		break;
	case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE:
	case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE:
		error = memorystatus_low_mem_privileged_listener(args->command);
		break;

	case MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE:
	case MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE:
		error = memorystatus_update_inactive_jetsam_priority_band(args->pid, args->command, JETSAM_PRIORITY_ELEVATED_INACTIVE, args->flags ? TRUE : FALSE);
		break;
	case MEMORYSTATUS_CMD_SET_PROCESS_IS_MANAGED:
		error = memorystatus_set_process_is_managed(args->pid, args->flags);
		break;

	case MEMORYSTATUS_CMD_GET_PROCESS_IS_MANAGED:
		error = memorystatus_get_process_is_managed(args->pid, ret);
		break;

#if CONFIG_FREEZE
	case MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE:
		error = memorystatus_set_process_is_freezable(args->pid, args->flags ? TRUE : FALSE);
		break;

	case MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE:
		error = memorystatus_get_process_is_freezable(args->pid, ret);
		break;
	case MEMORYSTATUS_CMD_GET_PROCESS_IS_FROZEN:
		error = memorystatus_get_process_is_frozen(args->pid, ret);
		break;

	case MEMORYSTATUS_CMD_FREEZER_CONTROL:
		error = memorystatus_freezer_control(args->flags, args->buffer, args->buffersize, ret);
		break;
#endif /* CONFIG_FREEZE */

#if DEVELOPMENT || DEBUG
	case MEMORYSTATUS_CMD_INCREASE_JETSAM_TASK_LIMIT:
		error = memorystatus_cmd_increase_jetsam_task_limit(args->pid, args->flags);
		break;
	case MEMORYSTATUS_CMD_SET_DIAG_LIMIT:
		error = memorystatus_cmd_set_diag_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GET_DIAG_LIMIT:
		error = memorystatus_cmd_get_diag_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
		break;
#endif /* DEVELOPMENT || DEBUG */

	case MEMORYSTATUS_CMD_GET_KILL_COUNTS:
		error = memorystatus_cmd_get_kill_counts(args->pid, args->buffer, args->buffersize, args->flags);
		break;

	case MEMORYSTATUS_CMD_GET_CONCLAVE_LIMIT:
		error = _memstat_get_process_conclave_mem_limit(args->pid, ret);
		break;

	default:
		error = EINVAL;
		break;
	}

out:
	return error;
}
9454
9455 /* Coalition support */
9456
9457 /*
9458 * Inserts a list of pids before the given proc in the bucket. If any of the
9459 * pids in the given list are not already in the bucket, they will be ignored.
9460 */
9461 static void
9462 memstat_insert_list_locked(
9463 proc_t before,
9464 unsigned int bucket_idx,
9465 pid_t *pid_list,
9466 int list_sz)
9467 {
9468 int i;
9469 proc_t p;
9470 memstat_bucket_t *bucket;
9471
9472 assert(bucket_idx < MEMSTAT_BUCKET_COUNT);
9473
9474 bucket = &memstat_bucket[bucket_idx];
9475
9476 if ((pid_list == NULL) || (list_sz <= 0)) {
9477 return;
9478 }
9479
9480 for (i = list_sz - 1; i >= 0; i--) {
9481 p = proc_find_locked(pid_list[i]);
9482
9483 if (p == NULL) {
9484 continue;
9485 }
9486
9487 if ((p == before) || (p->p_memstat_effectivepriority != bucket_idx)) {
9488 /*
9489 * We can encounter p == before when we try to sort a coalition with an in-
9490 * progress exec of the leader, such that the leader and the exec-ing
9491 * member have the same PID. Just skip over it for now, since this member
9492 * will soon be removed from the proc list anyway.
9493 */
9494 proc_rele(p);
9495 continue;
9496 }
9497
9498 TAILQ_REMOVE(&bucket->list, p, p_memstat_list);
9499 TAILQ_INSERT_BEFORE(before, p, p_memstat_list);
9500 proc_rele(p);
9501 }
9502 }
9503 /*
9504 * Return the number of pids rearranged during this sort.
9505 */
static void
memstat_sort_coals_locked(unsigned int bucket_index, memorystatus_jetsam_sort_order_t sort_order)
{
#define MAX_SORT_PIDS 80

	int ntasks = 0;
	proc_t p = NULL;
	coalition_t coal = COALITION_NULL;
	pid_t pid_list[MAX_SORT_PIDS];
	memstat_bucket_t *bucket;

	assert((sort_order == JETSAM_SORT_LRU) || (sort_order == JETSAM_SORT_FOOTPRINT));
	assert(bucket_index < MEMSTAT_BUCKET_COUNT);

	/* First establish the base ordering of the bucket. */
	switch (sort_order) {
	case JETSAM_SORT_LRU:
		/* Nothing to do, buckets are already LRU */
		break;
	case JETSAM_SORT_FOOTPRINT:
		/* Sort bucket by footprint first */
		memstat_sort_by_footprint_locked(bucket_index);
		break;
	default:
		panic("Invalid sort order %d passed to memstat_sort_coals", sort_order);
	}

	/*
	 * During coalition sorting, processes in a priority band are rearranged
	 * by being re-inserted at the head of the queue. So, when handling a
	 * list, the first process that gets moved to the head of the queue,
	 * ultimately gets pushed toward the queue tail, and hence, jetsams last.
	 *
	 * So, for example, the coalition leader is expected to jetsam last,
	 * after its coalition members. Therefore, the coalition leader is
	 * inserted at the head of the queue first.
	 *
	 * After processing a coalition, the jetsam order is as follows:
	 * undefs(jetsam first), extensions, xpc services, leader(jetsam last)
	 */

	/*
	 * Coalition members are rearranged in the priority bucket here,
	 * based on their coalition role.
	 */

	bucket = &memstat_bucket[bucket_index];
	p = TAILQ_FIRST(&bucket->list);
	while (p) {
		/* Only coalition leaders anchor a rearrangement pass. */
		coal = task_get_coalition(proc_task(p), COALITION_TYPE_JETSAM);
		if (!coalition_is_leader(proc_task(p), coal)) {
			p = TAILQ_NEXT(p, p_memstat_list);
			continue;
		}

		/* undefined coalition members should be the first to jetsam */
		ntasks = coalition_get_pid_list(coal, COALITION_ROLEMASK_UNDEF,
		    COALITION_SORT_DEFAULT, pid_list, MAX_SORT_PIDS);

		if (ntasks > 0) {
			/* ntasks may exceed the array capacity; clamp to what we captured. */
			memstat_insert_list_locked(p, bucket_index, pid_list,
			    (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
		}

		/* extensions should jetsam after unmarked processes */
		ntasks = coalition_get_pid_list(coal, COALITION_ROLEMASK_EXT,
		    COALITION_SORT_DEFAULT, pid_list, MAX_SORT_PIDS);

		if (ntasks > 0) {
			memstat_insert_list_locked(p, bucket_index, pid_list,
			    (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
		}

		/* xpc services should jetsam after extensions */
		ntasks = coalition_get_pid_list(coal, COALITION_ROLEMASK_XPC,
		    COALITION_SORT_DEFAULT, pid_list, MAX_SORT_PIDS);

		if (ntasks > 0) {
			memstat_insert_list_locked(p, bucket_index, pid_list,
			    (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
		}

		/*
		 * And then, the leader will jetsam last since we inserted everyone else
		 * before it in the bucket
		 */

		p = TAILQ_NEXT(p, p_memstat_list);
	} /* end while */
}
9595
9596
9597
9598 uint32_t
9599 memstat_get_idle_proccnt(void)
9600 {
9601 #if CONFIG_JETSAM
9602 /*
9603 * On fully jetsam-enabled systems, all processes on the idle band may
9604 * be idle-exited
9605 */
9606 return os_atomic_load(&memstat_bucket[JETSAM_PRIORITY_IDLE].count, relaxed);
9607 #else /* !CONFIG_JETSAM */
9608 uint32_t count = 0;
9609 uint32_t bucket = JETSAM_PRIORITY_IDLE;
9610
9611 proc_list_lock();
9612 for (proc_t p = memorystatus_get_first_proc_locked(&bucket, FALSE);
9613 p != PROC_NULL;
9614 p = memorystatus_get_next_proc_locked(&bucket, p, FALSE)) {
9615 /*
9616 * On macOS, we can only exit clean daemons. In the future, we
9617 * should include assertion-less managed daemons. Apps may make
9618 * their way into this band as well, and we cannot jetsam those.
9619 */
9620 if (_memstat_proc_can_idle_exit(p) &&
9621 !_memstat_proc_is_dirty(p) &&
9622 !_memstat_proc_is_terminating(p)) {
9623 count++;
9624 }
9625 }
9626 proc_list_unlock();
9627
9628 return count;
9629 #endif /* CONFIG_JETSAM */
9630 }
9631
9632 uint32_t
9633 memstat_get_long_idle_proccnt(void)
9634 {
9635 uint32_t count = 0;
9636 uint32_t bucket = JETSAM_PRIORITY_IDLE;
9637
9638 proc_list_lock();
9639 for (proc_t p = memorystatus_get_first_proc_locked(&bucket, FALSE);
9640 p != PROC_NULL;
9641 p = memorystatus_get_next_proc_locked(&bucket, p, FALSE)) {
9642 if (!_memstat_proc_is_dirty(p) && _memstat_proc_can_idle_exit(p) &&
9643 !_memstat_proc_is_terminating(p) && _memstat_proc_is_reapable(p)) {
9644 count++;
9645 }
9646 }
9647 proc_list_unlock();
9648
9649 return count;
9650 }
9651
9652 uint32_t
9653 memstat_get_proccnt_upto_priority(uint32_t max_bucket_index)
9654 {
9655 int32_t i = JETSAM_PRIORITY_IDLE;
9656 int count = 0;
9657
9658 assert3u(max_bucket_index, <=, MEMSTAT_BUCKET_COUNT);
9659
9660 while (i <= max_bucket_index) {
9661 /*
9662 * NB: We don't hold the proc-list lock here; that's ok b/c this is just an
9663 * estimate.
9664 */
9665 count += os_atomic_load(&memstat_bucket[i++].count, relaxed);
9666 }
9667
9668 return count;
9669 }
9670
/*
 * Recompute an app's jetsam band in response to an App Nap state change.
 *
 * macOS-only (!CONFIG_JETSAM): on jetsam-enabled configs, and for
 * ineligible processes (non-apps, internal/managed processes), returns -1.
 * Returns 0 once the priority has been updated, or when the process is
 * exiting / already being handled by jetsam.
 */
int
memorystatus_update_priority_for_appnap(proc_t p)
{
#if !CONFIG_JETSAM
	if (!p || (!isApp(p)) || (p->p_memstat_state & (P_MEMSTAT_INTERNAL | P_MEMSTAT_MANAGED))) {
		/*
		 * Ineligible processes OR system processes e.g. launchd.
		 */
		return -1;
	}

	int32_t priority = 0;

	proc_list_lock();

	if (proc_list_exited(p) ||
	    (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP))) {
		/*
		 * If the process is on its way out OR
		 * jetsam has already tried and failed to kill this process,
		 * let's skip the whole jetsam band transition.
		 */
		proc_list_unlock();
		return 0;
	}

	/*
	 * Update priority. We don't want the aging logic because that's only applicable on
	 * configs with CONFIG_JETSAM.
	 *
	 * When task suppression (App Nap) is active, park the app at
	 * JETSAM_PRIORITY_BACKGROUND; otherwise restore its requested
	 * priority, raised to any outstanding priority assertion.
	 */
	priority = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_SUP_ACTIVE) ?
	    JETSAM_PRIORITY_BACKGROUND :
	    p->p_memstat_requestedpriority;
	if (_memstat_proc_has_priority_assertion(p)) {
		priority = MAX(priority, p->p_memstat_assertionpriority);
	}
	memstat_update_priority_locked(p, priority, MEMSTAT_PRIORITY_NO_AGING);

	proc_list_unlock();

	return 0;

#else /* !CONFIG_JETSAM */
#pragma unused(p)
	return -1;
#endif /* !CONFIG_JETSAM */
}
9718
9719 uint64_t
9720 memorystatus_available_memory_internal(struct proc *p)
9721 {
9722 #ifdef XNU_TARGET_OS_OSX
9723 if (p->p_memstat_memlimit <= 0) {
9724 return 0;
9725 }
9726 #endif /* XNU_TARGET_OS_OSX */
9727 const uint64_t footprint_in_bytes = get_task_phys_footprint(proc_task(p));
9728 int32_t memlimit_mb;
9729 int64_t memlimit_bytes;
9730 int64_t rc;
9731
9732 if (isApp(p) == FALSE) {
9733 return 0;
9734 }
9735
9736 if (p->p_memstat_memlimit > 0) {
9737 memlimit_mb = p->p_memstat_memlimit;
9738 } else if (task_convert_phys_footprint_limit(-1, &memlimit_mb) != KERN_SUCCESS) {
9739 return 0;
9740 }
9741
9742 if (memlimit_mb <= 0) {
9743 memlimit_bytes = INT_MAX & ~((1 << 20) - 1);
9744 } else {
9745 memlimit_bytes = ((int64_t) memlimit_mb) << 20;
9746 }
9747
9748 rc = memlimit_bytes - footprint_in_bytes;
9749
9750 return (rc >= 0) ? rc : 0;
9751 }
9752
9753 int
9754 memorystatus_available_memory(struct proc *p, __unused struct memorystatus_available_memory_args *args, uint64_t *ret)
9755 {
9756 *ret = memorystatus_available_memory_internal(p);
9757
9758 return 0;
9759 }
9760
9761 #if DEVELOPMENT || DEBUG
/*
 * Development/debug-only command: grow a process' active and inactive
 * memory limits to account for an accumulated, page-rounded increase of
 * byte_increase bytes on top of any increase previously granted.
 *
 * Returns 0 on success (or when the ignore bootarg is set), EINVAL for a
 * zero pid/increase, ESRCH if the pid cannot be found, or an error from
 * memstat_set_memlimits_locked().
 */
static int
memorystatus_cmd_increase_jetsam_task_limit(pid_t pid, uint32_t byte_increase)
{
	int32_t memlimit_active, memlimit_inactive;

	/* Validate inputs */
	if ((pid == 0) || (byte_increase == 0)) {
		return EINVAL;
	}

	if (memstat_ignore_task_limit_increase) {
		/* If the bootarg is set, lie and say we did it */
		return 0;
	}

	proc_t p = proc_find(pid);

	if (!p) {
		return ESRCH;
	}

	/*
	 * The increase already baked into the current limits, in MB
	 * granularity. NOTE(review): p_memlimit_increase is read here before
	 * taking the proc_list lock but written below under it — confirm no
	 * concurrent caller can race this path.
	 */
	const uint32_t current_memlimit_increase = roundToNearestMB(p->p_memlimit_increase);
	/* round to page */
	const int32_t page_aligned_increase = (int32_t) MIN(round_page(p->p_memlimit_increase + byte_increase), INT32_MAX);

	proc_list_lock();

	/* Back out the old increase and apply the new one; <= 0 means no limit set. */
	memlimit_active = p->p_memstat_memlimit_active;
	if (memlimit_active > 0) {
		memlimit_active -= current_memlimit_increase;
		memlimit_active += roundToNearestMB(page_aligned_increase);
	}

	memlimit_inactive = p->p_memstat_memlimit_inactive;
	if (memlimit_inactive > 0) {
		memlimit_inactive -= current_memlimit_increase;
		memlimit_inactive += roundToNearestMB(page_aligned_increase);
	}

	/*
	 * Store the updated delta limit in the proc.
	 */
	p->p_memlimit_increase = page_aligned_increase;

	/* Preserve the fatal/non-fatal disposition of the existing limits. */
	memlimit_options_t memlimit_options = MEMLIMIT_OPTIONS_NONE;
	if (_memstat_proc_inactive_memlimit_is_fatal(p)) {
		memlimit_options |= MEMLIMIT_INACTIVE_FATAL;
	}
	if (_memstat_proc_active_memlimit_is_fatal(p)) {
		memlimit_options |= MEMLIMIT_ACTIVE_FATAL;
	}

	int error = memstat_set_memlimits_locked(p,
	    memlimit_active, memlimit_inactive,
	    memlimit_options);

	proc_list_unlock();
	proc_rele(p);

	return error;
}
#endif /* DEVELOPMENT || DEBUG */
9824