1 /*
2 * Copyright (c) 2006-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 *
28 */
29
30 #include <kern/sched_prim.h>
31 #include <kern/kalloc.h>
32 #include <kern/assert.h>
33 #include <kern/debug.h>
34 #include <kern/locks.h>
35 #include <kern/task.h>
36 #include <kern/thread.h>
37 #include <kern/host.h>
38 #include <kern/policy_internal.h>
39 #include <kern/thread_group.h>
40 #include <kern/zalloc.h>
41
42 #include <corpses/task_corpse.h>
43 #include <libkern/libkern.h>
44 #include <mach/mach_time.h>
45 #include <mach/task.h>
46 #include <mach/host_priv.h>
47 #include <mach/mach_host.h>
48 #include <pexpert/pexpert.h>
49 #include <sys/coalition.h>
50 #include <sys/code_signing.h>
51 #include <sys/kern_event.h>
52 #include <sys/proc.h>
53 #include <sys/proc_info.h>
54 #include <sys/reason.h>
55 #include <sys/signal.h>
56 #include <sys/signalvar.h>
57 #include <sys/sysctl.h>
58 #include <sys/sysproto.h>
59 #include <sys/spawn_internal.h>
60 #include <sys/wait.h>
61 #include <sys/tree.h>
62 #include <sys/priv.h>
63 #include <vm/pmap.h>
64 #include <vm/vm_reclaim_xnu.h>
65 #include <vm/vm_pageout_xnu.h>
66 #include <vm/vm_protos.h>
67 #include <vm/vm_purgeable_xnu.h>
68 #include <vm/vm_page.h>
69 #include <vm/vm_compressor_xnu.h>
70 #include <mach/machine/sdt.h>
71 #include <libkern/section_keywords.h>
72 #include <os/atomic_private.h>
73 #include <os/overflow.h>
74
75 #include <IOKit/IOBSD.h>
76
77 #if CONFIG_MACF
78 #include <security/mac_framework.h>
79 #endif
80
81 #if CONFIG_FREEZE
82 #include <vm/vm_map.h>
83 #endif /* CONFIG_FREEZE */
84
85 #include <kern/kern_memorystatus_internal.h>
86 #include <sys/kern_memorystatus.h>
87 #include <sys/kern_memorystatus_xnu.h>
88 #include <sys/kern_memorystatus_freeze.h>
89 #include <sys/kern_memorystatus_notify.h>
90 #include <sys/kdebug_triage.h>
91 #include <sys/file_internal.h>
92 #include <net/necp.h>
93
94 errno_t mach_to_bsd_errno(kern_return_t mach_err);
95 extern uint32_t vm_compressor_pool_size(void);
96 extern uint32_t vm_compressor_fragmentation_level(void);
97
98 pid_t memorystatus_freeze_last_pid_thawed = 0;
99 uint64_t memorystatus_freeze_last_pid_thawed_ts = 0;
100
101 int block_corpses = 0; /* counter to block new corpses if jetsam purges them */
102
/*
 * For logging clarity
 *
 * Human-readable names for jetsam kill causes, indexed directly by the
 * kMemorystatus* cause value (see the trailing comment on each entry).
 * NOTE(review): entry order and count must stay in sync with the
 * memorystatus kill-cause enum — confirm when adding a new cause.
 */
static const char *memorystatus_kill_cause_name[] = {
	"",                                 /* kMemorystatusInvalid */
	"jettisoned",                       /* kMemorystatusKilled */
	"highwater",                        /* kMemorystatusKilledHiwat */
	"vnode-limit",                      /* kMemorystatusKilledVnodes */
	"vm-pageshortage",                  /* kMemorystatusKilledVMPageShortage */
	"proc-thrashing",                   /* kMemorystatusKilledProcThrashing */
	"fc-thrashing",                     /* kMemorystatusKilledFCThrashing */
	"per-process-limit",                /* kMemorystatusKilledPerProcessLimit */
	"disk-space-shortage",              /* kMemorystatusKilledDiskSpaceShortage */
	"idle-exit",                        /* kMemorystatusKilledIdleExit */
	"zone-map-exhaustion",              /* kMemorystatusKilledZoneMapExhaustion */
	"vm-compressor-thrashing",          /* kMemorystatusKilledVMCompressorThrashing */
	"vm-compressor-space-shortage",     /* kMemorystatusKilledVMCompressorSpaceShortage */
	"low-swap",                         /* kMemorystatusKilledLowSwap */
	"sustained-memory-pressure",        /* kMemorystatusKilledSustainedPressure */
	"vm-pageout-starvation",            /* kMemorystatusKilledVMPageoutStarvation */
};
122
123 static const char *
memorystatus_priority_band_name(int32_t priority)124 memorystatus_priority_band_name(int32_t priority)
125 {
126 switch (priority) {
127 case JETSAM_PRIORITY_FOREGROUND:
128 return "FOREGROUND";
129 case JETSAM_PRIORITY_AUDIO_AND_ACCESSORY:
130 return "AUDIO_AND_ACCESSORY";
131 case JETSAM_PRIORITY_CONDUCTOR:
132 return "CONDUCTOR";
133 case JETSAM_PRIORITY_DRIVER_APPLE:
134 return "DRIVER_APPLE";
135 case JETSAM_PRIORITY_HOME:
136 return "HOME";
137 case JETSAM_PRIORITY_EXECUTIVE:
138 return "EXECUTIVE";
139 case JETSAM_PRIORITY_IMPORTANT:
140 return "IMPORTANT";
141 case JETSAM_PRIORITY_CRITICAL:
142 return "CRITICAL";
143 }
144
145 return "?";
146 }
147
148 bool
is_reason_thrashing(unsigned cause)149 is_reason_thrashing(unsigned cause)
150 {
151 switch (cause) {
152 case kMemorystatusKilledFCThrashing:
153 case kMemorystatusKilledVMCompressorThrashing:
154 case kMemorystatusKilledVMCompressorSpaceShortage:
155 return true;
156 default:
157 return false;
158 }
159 }
160
161 bool
is_reason_zone_map_exhaustion(unsigned cause)162 is_reason_zone_map_exhaustion(unsigned cause)
163 {
164 return cause == kMemorystatusKilledZoneMapExhaustion;
165 }
166
167 /*
168 * Returns the current zone map size and capacity to include in the jetsam snapshot.
169 * Defined in zalloc.c
170 */
171 extern void get_zone_map_size(uint64_t *current_size, uint64_t *capacity);
172
173 /*
174 * Returns the name of the largest zone and its size to include in the jetsam snapshot.
175 * Defined in zalloc.c
176 */
177 extern void get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size);
178
179 static int memstat_set_memlimits_locked(proc_t p, int32_t active_limit,
180 int32_t inactive_limit, memlimit_options_t options);
181 static bool _memstat_proc_is_active_locked(proc_t);
182
183 static int memorystatus_highwater_enabled = 1; /* Update the cached memlimit data. */
184
185 /*
186 * Cache this proc's active limit as its current limit before writing it to
187 * the ledger. Returns whether the new limit should be written to the ledger.
188 */
189 static inline bool
_memstat_update_memlimit_locked(proc_t p,bool use_active)190 _memstat_update_memlimit_locked(proc_t p, bool use_active)
191 {
192 bool ledger_needed = false;
193 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
194
195 /* Cache limit value */
196 if (use_active && p->p_memstat_memlimit != p->p_memstat_memlimit_active) {
197 p->p_memstat_memlimit = p->p_memstat_memlimit_active;
198 ledger_needed = true;
199 } else if (!use_active &&
200 p->p_memstat_memlimit != p->p_memstat_memlimit_inactive) {
201 p->p_memstat_memlimit = p->p_memstat_memlimit_inactive;
202 ledger_needed = true;
203 }
204
205 /* Cache limit fatality */
206 if (_memstat_proc_memlimit_is_fatal(p, use_active) &&
207 !_memstat_proc_cached_memlimit_is_fatal(p)) {
208 p->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;
209 ledger_needed = true;
210 } else if (!_memstat_proc_memlimit_is_fatal(p, use_active) &&
211 _memstat_proc_cached_memlimit_is_fatal(p)) {
212 p->p_memstat_state &= ~P_MEMSTAT_FATAL_MEMLIMIT;
213 ledger_needed = true;
214 }
215
216 return ledger_needed;
217 }
218
/*
 * Write the process' current memlimit to the ledger for enforcement.
 *
 * Holding the proc_list_lock while writing to the ledgers (where the task
 * lock is taken) can be problematic. The proc list lock may optionally be
 * dropped and re-taken while writing limits to the ledger. (rdar://21394491)
 *
 * Returns 0 on success, ESRCH if the proc could not be referenced
 * (drop_lock path only), or the BSD errno translation of a ledger-write
 * failure. Caller must hold the proc_list_lock; it is held again on return
 * on every path.
 */
static int
_memstat_write_memlimit_to_ledger_locked(proc_t p, bool is_active, bool drop_lock)
{
	kern_return_t kr;
	bool is_fatal = _memstat_proc_cached_memlimit_is_fatal(p);

	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);

#if MACH_ASSERT
	/*
	 * Sanity: the cached limit/fatality must already reflect the
	 * active/inactive state we are about to write.
	 */
	if (memorystatus_highwater_enabled) {
		if (is_active) {
			assert3u(is_fatal, ==, _memstat_proc_active_memlimit_is_fatal(p));
			assert3u(p->p_memstat_memlimit, ==, p->p_memstat_memlimit_active);
		} else {
			assert3u(is_fatal, ==, _memstat_proc_inactive_memlimit_is_fatal(p));
			assert3u(p->p_memstat_memlimit, ==, p->p_memstat_memlimit_inactive);
		}
	}
#endif /* MACH_ASSERT */

	if (drop_lock) {
		/*
		 * Take a ref so the proc cannot exit while the list lock is
		 * dropped. On failure the lock is still held when we return.
		 */
		if (proc_ref(p, true) != p) {
			memorystatus_log_error("Unable to take a reference on proc %s [%d]. "
			    "Cannot update memlimit", proc_best_name(p), proc_getpid(p));
			return ESRCH;
		}
		proc_list_unlock();
	}

	memorystatus_log_debug("memorystatus: new limit on pid %d (%dMB %s)\n",
	    proc_getpid(p), (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
	    (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"));

	/* <= 0 is translated to -1, the ledger's "no limit" sentinel */
	kr = task_set_phys_footprint_limit_internal(proc_task(p),
	    (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1,
	    NULL, is_active, is_fatal);

	if (drop_lock) {
		/* Re-take the list lock before dropping our proc ref */
		proc_list_lock();
		proc_rele(p);
	}

	if (kr != KERN_SUCCESS) {
		memorystatus_log_fault("memorystatus: error (%d) setting memlimit in "
		    "ledger for %s [%d]\n", kr, proc_best_name(p), proc_pid(p));
		return mach_to_bsd_errno(kr);
	}
	return 0;
}
275
276 #pragma mark General Tunables
277
278 #define MEMORYSTATUS_SMALL_MEMORY_THRESHOLD (3UL * (1UL << 30))
279 #define MEMORYSTATUS_MEDIUM_MEMORY_THRESHOLD (6UL * (1UL << 30))
280
281 #define MEMORYSTATUS_CLEAR_THE_DECKS_OFFSET_PERCENTAGE 5UL
282 #define MEMORYSTATUS_BALLAST_OFFSET_PERCENTAGE 5UL
283 #define MEMORYSTATUS_AGGR_SYSPROC_AGING_PERCENTAGE 7UL
284 #define MEMORYSTATUS_DELTA_PERCENTAGE_LARGE 4UL
285 #define MEMORYSTATUS_DELTA_PERCENTAGE_SMALL 5UL
286
287 /*
288 * Fall back to these percentages/ratios if a mb value is not provided via EDT
289 * DRAM (GB) | critical | idle | pressure | freeze
290 * (0,3] | 5% | 10% | 15% | 50%
291 * (3,6] | 4% | 9% | 15% | 50%
292 * (6,∞) | 4% | 8% | 12% | 50%
293 */
294
295 #define MEMORYSTATUS_CRITICAL_PERCENTAGE_SMALL 5UL
296 #define MEMORYSTATUS_CRITICAL_PERCENTAGE_LARGE 4UL
297
298 #define MEMORYSTATUS_IDLE_RATIO_NUM 2UL
299 #define MEMORYSTATUS_IDLE_RATIO_DENOM 1UL
300 #define MEMORYSTATUS_PRESSURE_RATIO_NUM 3UL
301 #define MEMORYSTATUS_PRESSURE_RATIO_DENOM 1UL
302
303 /*
304 * For historical reasons, devices with "medium"-sized memory configs have a critical:idle:pressure ratio of
305 * 4:9:15. This ratio is preserved for these devices when a fixed-mb base value has not been provided by EDT/boot-arg;
306 * all other devices use a 1:2:3 ratio.
307 */
308 #define MEMORYSTATUS_IDLE_RATIO_NUM_MEDIUM 9UL
309 #define MEMORYSTATUS_IDLE_RATIO_DENOM_MEDIUM 4UL
310 #define MEMORYSTATUS_PRESSURE_RATIO_NUM_MEDIUM 15UL
311 #define MEMORYSTATUS_PRESSURE_RATIO_DENOM_MEDIUM 4UL
312
313 static int32_t memorystatus_get_default_task_active_limit(proc_t p);
314 static int32_t memorystatus_get_default_task_inactive_limit(proc_t p);
315
316 /*
317 * default jetsam snapshot support
318 */
319 memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot;
320
321 #if CONFIG_FREEZE
322 memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot_freezer;
323 /*
324 * The size of the freezer snapshot is given by memorystatus_jetsam_snapshot_max / JETSAM_SNAPSHOT_FREEZER_MAX_FACTOR
325 * The freezer snapshot can be much smaller than the default snapshot
326 * because it only includes apps that have been killed and dasd consumes it every 30 minutes.
327 * Since the snapshots are always wired we don't want to overallocate too much.
328 */
329 #define JETSAM_SNAPSHOT_FREEZER_MAX_FACTOR 20
330 unsigned int memorystatus_jetsam_snapshot_freezer_max;
331 unsigned int memorystatus_jetsam_snapshot_freezer_size;
332 TUNABLE(bool, memorystatus_jetsam_use_freezer_snapshot, "kern.jetsam_user_freezer_snapshot", true);
333
334 #define MEMORYSTATUS_FREEZE_THRESHOLD_PERCENTAGE 50UL
335 TUNABLE_DT(uint32_t, memorystatus_freeze_threshold_mb, "/defaults", "kern.memstat_freeze_mb",
336 "memorystatus_freeze_threshold_mb", 0, TUNABLE_DT_NONE);
337 #endif /* CONFIG_FREEZE */
338
339 unsigned int memorystatus_jetsam_snapshot_count = 0;
340 unsigned int memorystatus_jetsam_snapshot_max = 0;
341 unsigned int memorystatus_jetsam_snapshot_size = 0;
342 uint64_t memorystatus_jetsam_snapshot_last_timestamp = 0;
343 uint64_t memorystatus_jetsam_snapshot_timeout = 0;
344
345 #if DEVELOPMENT || DEBUG
346 /*
347 * On development and debug kernels, we allow one pid to take ownership
348 * of some memorystatus data structures for testing purposes (via memorystatus_control).
349 * If there's an owner, then only they may consume the jetsam snapshot & set freezer probabilities.
350 * This is used when testing these interface to avoid racing with other
351 * processes on the system that typically use them (namely OSAnalytics & dasd).
352 */
353 static pid_t memorystatus_testing_pid = 0;
354 SYSCTL_INT(_kern, OID_AUTO, memorystatus_testing_pid, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_testing_pid, 0, "");
355 #endif /* DEVELOPMENT || DEBUG */
356
357 /*
358 * jetsam zprint snapshot data
359 */
360 #if JETSAM_ZPRINT_SNAPSHOT
361 static unsigned int jzs_trigger_band = JETSAM_PRIORITY_FOREGROUND;
362 static mach_zone_name_t *jzs_names = NULL;
363 static mach_zone_info_t *jzs_info = NULL;
364 static int *jzs_coalesce = NULL;
365 static unsigned int jzs_zone_cnt = 0;
366 static mach_memory_info_t *jzs_meminfo = NULL;
367 static unsigned int jzs_meminfo_cnt = 0;
368 static uint64_t jzs_gencount = (uint64_t) -1ll;
369
370 #if DEVELOPMENT || DEBUG
371 SYSCTL_UINT(_kern, OID_AUTO, jzs_trigger_band, CTLFLAG_RW | CTLFLAG_LOCKED, &jzs_trigger_band, 0, "Priority band threshold for taking jetsam zprint snapshot");
372 #endif /* DEVELOPMENT || DEBUG */
373 #endif /* JETSAM_ZPRINT_SNAPSHOT */
374
375
376 static void memorystatus_init_jetsam_snapshot_header(memorystatus_jetsam_snapshot_t *snapshot);
377
378 /* General memorystatus stuff */
379
380 /*
381 * Daemons: The actual idle deferred time for the daemon is based on
382 * the relaunch behavior of the daemon. The relaunch behavior determines
383 * the scaling factor applied to memorystatus_sysprocs_idle_delay_time. See
384 * kJetsamSysProcsIdleDelayTime* ratios defined in kern_memorystatus.c
385 *
386 * Apps: The apps are aged for memorystatus_apps_idle_delay_time factored
387 * by kJetsamAppsIdleDelayTimeRatio.
388 */
389 TUNABLE(uint64_t, memstat_idle_deferral_time_s, "memorystatus_idle_deferral_time_s", 10);
390 uint64_t memorystatus_sysprocs_idle_delay_time = 0;
391 uint64_t memorystatus_apps_idle_delay_time = 0;
392 /* 2GB devices support an entitlement for a higher app memory limit of "almost 2GB". */
393 static int32_t memorystatus_ios13extended_footprint_limit_mb = 1800;
394
395 /* Some devices give entitled apps a higher memory limit */
396 TUNABLE_DT_WRITEABLE(int32_t, memorystatus_entitled_max_task_footprint_mb,
397 "/defaults", "kern.entitled_max_task_pmem",
398 "entitled_max_task_pmem", 0, TUNABLE_DT_NONE);
399 TUNABLE_DT_WRITEABLE(int32_t, memorystatus_entitled_dev_max_task_footprint_mb,
400 "/defaults", "kern.entitled_dev_max_task_pmem",
401 "entitled_dev_max_task_pmem", 0, TUNABLE_DT_NONE);
402
403 #if __arm64__
404 #if DEVELOPMENT || DEBUG
405 SYSCTL_INT(_kern, OID_AUTO, ios13extended_footprint_limit_mb,
406 CTLFLAG_RD | CTLFLAG_LOCKED,
407 &memorystatus_ios13extended_footprint_limit_mb, 0, "");
408 SYSCTL_INT(_kern, OID_AUTO, entitled_max_task_pmem,
409 CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_KERN,
410 &memorystatus_entitled_max_task_footprint_mb, 0, "");
411 SYSCTL_INT(_kern, OID_AUTO, entitled_dev_max_task_pmem,
412 CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_KERN,
413 &memorystatus_entitled_dev_max_task_footprint_mb, 0, "");
414 #else /* !(DEVELOPMENT || DEBUG) */
415 SYSCTL_INT(_kern, OID_AUTO, entitled_max_task_pmem,
416 CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_KERN,
417 &memorystatus_entitled_max_task_footprint_mb, 0, "");
418 #endif /* DEVELOPMENT || DEBUG */
419 #endif /* __arm64__ */
420
421 #pragma mark Logging
422
423 os_log_t memorystatus_log_handle;
424
425 TUNABLE_WRITEABLE(memorystatus_log_level_t, memorystatus_log_level, "memorystatus_log_level", MEMORYSTATUS_LOG_LEVEL_DEFAULT);
426
427 #if DEBUG || DEVELOPMENT
428 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_log_level, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_log_level, MEMORYSTATUS_LOG_LEVEL_DEFAULT, "");
429 #endif
430
431 #pragma mark Locks
432
433 static LCK_GRP_DECLARE(memorystatus_lock_group, "memorystatus");
434
435 /* Synchronizes jetsam pressure broadcasts */
436 LCK_MTX_DECLARE(memorystatus_jetsam_broadcast_lock, &memorystatus_lock_group);
437
438 #if DEVELOPMENT || DEBUG
439 static LCK_MTX_DECLARE(disconnect_page_mappings_mutex, &memorystatus_lock_group);
440 #endif /* DEVELOPMENT || DEBUG */
441
442 /* Idle guard handling */
443
444 static void memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2);
445 static void _memstat_invalidate_idle_demotion_locked(proc_t p);
446 static void _memstat_schedule_idle_demotion_locked(proc_t p);
447 static void _memstat_reschedule_idle_demotion_locked(void);
448 int memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap);
449 vm_pressure_level_t convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);
450 boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t);
451 void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear);
452 void memorystatus_send_low_swap_note(void);
453 boolean_t memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, unsigned int band, int aggr_count,
454 uint32_t *errors, uint64_t *memory_reclaimed);
455 uint64_t memorystatus_available_memory_internal(proc_t p);
456 void memorystatus_thread_wake(void);
457 static bool _memstat_consider_waking_jetsam_thread(void);
458 #if CONFIG_JETSAM
459 static void memorystatus_thread_pool_default(void);
460 static void memorystatus_thread_pool_max(void);
461 #endif /* CONFIG_JETSAM */
462
463 unsigned int memorystatus_level = 0;
464 static int memorystatus_list_count = 0;
465 memstat_bucket_t memstat_bucket[MEMSTAT_BUCKET_COUNT];
466 static thread_call_t memorystatus_idle_demotion_call;
467 uint64_t memstat_idle_demotion_deadline = 0;
468 #if CONFIG_FREEZE
469 unsigned int memorystatus_suspended_count = 0;
470 #endif /* CONFIG_FREEZE */
471
472 #ifdef XNU_TARGET_OS_OSX
473 /*
474 * Effectively disable the system process and application demotion
475 * logic on macOS. This means system processes and apps won't get the
476 * 10 second protection before landing in the IDLE band after moving
477 * out of their active band. Reasons:-
478 * - daemons + extensions + apps on macOS don't behave the way they
479 * do on iOS and so they are confusing the demotion logic. For example,
480 * not all apps go from FG to IDLE. Some sit in higher bands instead. This
481 * is causing multiple asserts to fire internally.
482 * - we use the aging bands to protect processes from jetsam. But on macOS,
483 * we have a very limited jetsam that is only invoked under extreme conditions
484 * where we have no more swap / compressor space OR are under critical pressure.
485 */
486 int system_procs_aging_band = 0;
487 int system_procs_aging_band_stuck = 0;
488 int applications_aging_band = 0;
489 #else /* XNU_TARGET_OS_OSX */
490 int system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
491 int system_procs_aging_band_stuck = JETSAM_PRIORITY_AGING_BAND1_STUCK;
492 int applications_aging_band = JETSAM_PRIORITY_AGING_BAND2;
493 #endif /* XNU_TARGET_OS_OSX */
494
495 /* the jetsam band which will contain P_MEMSTAT_FROZEN processes */
496 #if CONFIG_FREEZE
497 int memorystatus_freeze_jetsam_band = JETSAM_PRIORITY_FREEZER;
498 #else /* CONFIG_FREEZE */
499 int memorystatus_freeze_jetsam_band = 0;
500 #endif /* CONFIG_FREEZE */
501
502 _Atomic bool memorystatus_zone_map_is_exhausted = false;
503 _Atomic bool memorystatus_compressor_space_shortage = false;
504 _Atomic bool memorystatus_pageout_starved = false;
505 #if CONFIG_PHANTOM_CACHE
506 _Atomic bool memorystatus_phantom_cache_pressure = false;
507 #endif /* CONFIG_PHANTOM_CACHE */
508
509 bool memorystatus_should_issue_fg_band_notify = true;
510
511 extern void coalition_mark_swappable(coalition_t coal);
512 extern bool coalition_is_swappable(coalition_t coal);
513 boolean_t memorystatus_allowed_vm_map_fork(task_t, bool *);
514 #if DEVELOPMENT || DEBUG
515 void memorystatus_abort_vm_map_fork(task_t);
516 #endif
517
518 SYSCTL_NODE(_kern, OID_AUTO, memorystatus,
519 CTLFLAG_RW | CTLFLAG_LOCKED, 0, "memorystatus subsystem");
520
521 /*
522 * Idle delay timeout factors for daemons based on relaunch behavior. Only used in
523 * kJetsamAgingPolicySysProcsReclaimedFirst aging policy.
524 */
525 #define kJetsamSysProcsIdleDelayTimeLowRatio (5)
526 #define kJetsamSysProcsIdleDelayTimeMedRatio (2)
527 #define kJetsamSysProcsIdleDelayTimeHighRatio (1)
528
529 /*
530 * For the kJetsamAgingPolicySysProcsReclaimedFirst aging policy, treat apps as well
531 * behaved daemons for aging purposes.
532 */
533 #define kJetsamAppsIdleDelayTimeRatio (kJetsamSysProcsIdleDelayTimeLowRatio)
534
535 static uint64_t
memorystatus_sysprocs_idle_time(proc_t p)536 memorystatus_sysprocs_idle_time(proc_t p)
537 {
538 uint64_t idle_delay_time = 0;
539 /*
540 * For system processes, base the idle delay time on the
541 * jetsam relaunch behavior specified by launchd. The idea
542 * is to provide extra protection to the daemons which would
543 * relaunch immediately after jetsam.
544 */
545 switch (p->p_memstat_relaunch_flags) {
546 case P_MEMSTAT_RELAUNCH_UNKNOWN:
547 case P_MEMSTAT_RELAUNCH_LOW:
548 idle_delay_time = memorystatus_sysprocs_idle_delay_time / kJetsamSysProcsIdleDelayTimeLowRatio;
549 break;
550 case P_MEMSTAT_RELAUNCH_MED:
551 idle_delay_time = memorystatus_sysprocs_idle_delay_time / kJetsamSysProcsIdleDelayTimeMedRatio;
552 break;
553 case P_MEMSTAT_RELAUNCH_HIGH:
554 idle_delay_time = memorystatus_sysprocs_idle_delay_time / kJetsamSysProcsIdleDelayTimeHighRatio;
555 break;
556 default:
557 panic("Unknown relaunch flags on process!");
558 break;
559 }
560 return idle_delay_time;
561 }
562
563 static uint64_t
memorystatus_apps_idle_time(__unused proc_t p)564 memorystatus_apps_idle_time(__unused proc_t p)
565 {
566 return memorystatus_apps_idle_delay_time / kJetsamAppsIdleDelayTimeRatio;
567 }
568
569
570 static int
571 sysctl_jetsam_set_sysprocs_idle_delay_time SYSCTL_HANDLER_ARGS
572 {
573 #pragma unused(oidp, arg1, arg2)
574
575 int error = 0, val = 0, old_time_in_secs = 0;
576 uint64_t old_time_in_ns = 0;
577
578 absolutetime_to_nanoseconds(memorystatus_sysprocs_idle_delay_time, &old_time_in_ns);
579 old_time_in_secs = (int) (old_time_in_ns / NSEC_PER_SEC);
580
581 error = sysctl_io_number(req, old_time_in_secs, sizeof(int), &val, NULL);
582 if (error || !req->newptr) {
583 return error;
584 }
585
586 if ((val < 0) || (val > INT32_MAX)) {
587 memorystatus_log_error("jetsam: new idle delay interval has invalid value.\n");
588 return EINVAL;
589 }
590
591 nanoseconds_to_absolutetime((uint64_t)val * NSEC_PER_SEC, &memorystatus_sysprocs_idle_delay_time);
592
593 return 0;
594 }
595
596 SYSCTL_PROC(_kern_memorystatus, OID_AUTO, sysprocs_idle_delay_time_ns, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
597 0, 0, sysctl_jetsam_set_sysprocs_idle_delay_time, "I", "Aging window for system processes");
598
599
600 static int
601 sysctl_jetsam_set_apps_idle_delay_time SYSCTL_HANDLER_ARGS
602 {
603 #pragma unused(oidp, arg1, arg2)
604
605 int error = 0, val = 0, old_time_in_secs = 0;
606 uint64_t old_time_in_ns = 0;
607
608 absolutetime_to_nanoseconds(memorystatus_apps_idle_delay_time, &old_time_in_ns);
609 old_time_in_secs = (int) (old_time_in_ns / NSEC_PER_SEC);
610
611 error = sysctl_io_number(req, old_time_in_secs, sizeof(int), &val, NULL);
612 if (error || !req->newptr) {
613 return error;
614 }
615
616 if ((val < 0) || (val > INT32_MAX)) {
617 memorystatus_log_error("jetsam: new idle delay interval has invalid value.\n");
618 return EINVAL;
619 }
620
621 nanoseconds_to_absolutetime((uint64_t)val * NSEC_PER_SEC, &memorystatus_apps_idle_delay_time);
622
623 return 0;
624 }
625
626 SYSCTL_PROC(_kern_memorystatus, OID_AUTO, apps_idle_delay_time_ns, CTLTYPE_INT | CTLFLAG_RW,
627 0, 0, sysctl_jetsam_set_apps_idle_delay_time, "I", "Aging window for applications");
628
629 SYSCTL_INT(_kern, OID_AUTO, max_task_pmem, CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_KERN, &max_task_footprint_mb, 0, "");
630
631 #if __arm64__
632 int legacy_footprint_bonus_mb = 50; /* This value was chosen after looking at the top 30 apps
633 * that needed the additional room in their footprint when
634 * the 'correct' accounting methods were applied to them.
635 */
636
637 #if DEVELOPMENT || DEBUG
638 SYSCTL_INT(_kern, OID_AUTO, legacy_footprint_bonus_mb, CTLFLAG_RW | CTLFLAG_LOCKED, &legacy_footprint_bonus_mb, 0, "");
639 #endif /* DEVELOPMENT || DEBUG */
640 /*
641 * Raise the inactive and active memory limits to new values.
642 * Will only raise the limits and will do nothing if either of the current
643 * limits are 0.
644 * Caller must hold the proc_list_lock
645 */
646 static void
memorystatus_raise_memlimit_locked(proc_t p,int new_memlimit_active,int new_memlimit_inactive)647 memorystatus_raise_memlimit_locked(proc_t p,
648 int new_memlimit_active,
649 int new_memlimit_inactive)
650 {
651 int memlimit_mb_active = 0, memlimit_mb_inactive = 0;
652 memlimit_options_t memlimit_options = MEMLIMIT_OPTIONS_NONE;
653
654 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
655
656 if (p->p_memstat_memlimit_active > 0) {
657 memlimit_mb_active = p->p_memstat_memlimit_active;
658 } else if (p->p_memstat_memlimit_active == -1) {
659 memlimit_mb_active = max_task_footprint_mb;
660 } else {
661 /*
662 * Nothing to do for '0' which is
663 * a special value only used internally
664 * to test 'no limits'.
665 */
666 return;
667 }
668
669 if (p->p_memstat_memlimit_inactive > 0) {
670 memlimit_mb_inactive = p->p_memstat_memlimit_inactive;
671 } else if (p->p_memstat_memlimit_inactive == -1) {
672 memlimit_mb_inactive = max_task_footprint_mb;
673 } else {
674 /*
675 * Nothing to do for '0' which is
676 * a special value only used internally
677 * to test 'no limits'.
678 */
679 return;
680 }
681
682 memlimit_mb_active = MAX(new_memlimit_active, memlimit_mb_active);
683 memlimit_mb_inactive = MAX(new_memlimit_inactive, memlimit_mb_inactive);
684
685 /* Maintain pre-existing limit fatality */
686 if (_memstat_proc_active_memlimit_is_fatal(p)) {
687 memlimit_options |= MEMLIMIT_ACTIVE_FATAL;
688 }
689 if (_memstat_proc_inactive_memlimit_is_fatal(p)) {
690 memlimit_options |= MEMLIMIT_INACTIVE_FATAL;
691 }
692
693 memstat_set_memlimits_locked(p, memlimit_mb_active,
694 memlimit_mb_inactive, memlimit_options);
695 }
696
/*
 * Add or remove the legacy-footprint bonus (legacy_footprint_bonus_mb) to
 * this process' active and inactive memory limits, then apply the raised
 * limits via memorystatus_raise_memlimit_locked(). Processes whose limit
 * is the internal 'no limit' value (0) are left untouched. When removing
 * the bonus, a limit that lands back exactly on max_task_footprint_mb is
 * restored to -1 (the "system default" sentinel).
 *
 * Takes and releases the proc_list_lock; safe to call with p == NULL.
 */
void
memorystatus_act_on_legacy_footprint_entitlement(proc_t p, boolean_t footprint_increase)
{
	int memlimit_mb_active = 0, memlimit_mb_inactive = 0;

	if (p == NULL) {
		return;
	}

	proc_list_lock();

	if (p->p_memstat_memlimit_active > 0) {
		memlimit_mb_active = p->p_memstat_memlimit_active;
	} else if (p->p_memstat_memlimit_active == -1) {
		/* -1 selects the system-wide default limit */
		memlimit_mb_active = max_task_footprint_mb;
	} else {
		/*
		 * Nothing to do for '0' which is
		 * a special value only used internally
		 * to test 'no limits'.
		 */
		proc_list_unlock();
		return;
	}

	if (p->p_memstat_memlimit_inactive > 0) {
		memlimit_mb_inactive = p->p_memstat_memlimit_inactive;
	} else if (p->p_memstat_memlimit_inactive == -1) {
		/* -1 selects the system-wide default limit */
		memlimit_mb_inactive = max_task_footprint_mb;
	} else {
		/*
		 * Nothing to do for '0' which is
		 * a special value only used internally
		 * to test 'no limits'.
		 */
		proc_list_unlock();
		return;
	}

	if (footprint_increase) {
		memlimit_mb_active += legacy_footprint_bonus_mb;
		memlimit_mb_inactive += legacy_footprint_bonus_mb;
	} else {
		memlimit_mb_active -= legacy_footprint_bonus_mb;
		if (memlimit_mb_active == max_task_footprint_mb) {
			memlimit_mb_active = -1; /* reverting back to default system limit */
		}

		memlimit_mb_inactive -= legacy_footprint_bonus_mb;
		if (memlimit_mb_inactive == max_task_footprint_mb) {
			memlimit_mb_inactive = -1; /* reverting back to default system limit */
		}
	}
	memorystatus_raise_memlimit_locked(p, memlimit_mb_active, memlimit_mb_inactive);

	proc_list_unlock();
}
754
755 void
memorystatus_act_on_ios13extended_footprint_entitlement(proc_t p)756 memorystatus_act_on_ios13extended_footprint_entitlement(proc_t p)
757 {
758 proc_list_lock();
759 memorystatus_raise_memlimit_locked(p,
760 memorystatus_ios13extended_footprint_limit_mb,
761 memorystatus_ios13extended_footprint_limit_mb);
762 proc_list_unlock();
763 }
764
765 void
memorystatus_act_on_entitled_task_limit(proc_t p)766 memorystatus_act_on_entitled_task_limit(proc_t p)
767 {
768 if (memorystatus_entitled_max_task_footprint_mb == 0) {
769 // Entitlement is not supported on this device.
770 return;
771 }
772 proc_list_lock();
773 memorystatus_raise_memlimit_locked(p,
774 memorystatus_entitled_max_task_footprint_mb,
775 memorystatus_entitled_max_task_footprint_mb);
776 proc_list_unlock();
777 }
778
779 void
memorystatus_act_on_entitled_developer_task_limit(proc_t p)780 memorystatus_act_on_entitled_developer_task_limit(proc_t p)
781 {
782 if (memorystatus_entitled_dev_max_task_footprint_mb == 0) {
783 // Entitlement not supported on this device
784 return;
785 }
786 memorystatus_log("memorystatus: WARNING %s [%d] is receiving an entitled "
787 "debugging memory limit. This is intended only for debugging and "
788 "can result in unstable device behavior.",
789 proc_best_name(p), proc_getpid(p));
790 proc_list_lock();
791 memorystatus_raise_memlimit_locked(p,
792 memorystatus_entitled_dev_max_task_footprint_mb,
793 memorystatus_entitled_dev_max_task_footprint_mb);
794 proc_list_unlock();
795 }
796
797 #endif /* __arm64__ */
798
799 SYSCTL_INT(_kern, OID_AUTO, memorystatus_level, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_level, 0, "");
800
801 int
memorystatus_get_level(__unused struct proc * p,struct memorystatus_get_level_args * args,__unused int * ret)802 memorystatus_get_level(__unused struct proc *p, struct memorystatus_get_level_args *args, __unused int *ret)
803 {
804 user_addr_t level = 0;
805
806 level = args->level;
807
808 if (copyout(&memorystatus_level, level, sizeof(memorystatus_level)) != 0) {
809 return EFAULT;
810 }
811
812 return 0;
813 }
814
/* Forward declaration of the memorystatus thread continuation */
static void memorystatus_thread(void *param __unused, wait_result_t wr __unused);

/* Memory Limits */

static boolean_t memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason);
static boolean_t memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason);


static int memorystatus_cmd_set_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);

#if DEBUG || DEVELOPMENT
/* Diagnostic (non-fatal, debug-only) memory limit plumbing */
static int memorystatus_cmd_set_diag_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);
static int memorystatus_cmd_get_diag_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);
static int memorystatus_set_diag_memlimit_properties_internal(proc_t p, memorystatus_diag_memlimit_properties_t *p_entry);
static int memorystatus_get_diag_memlimit_properties_internal(proc_t p, memorystatus_diag_memlimit_properties_t *p_entry);
#endif // DEBUG || DEVELOPMENT
static int memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties_t *entry);

static int memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);

static int memorystatus_cmd_get_memlimit_excess_np(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);

static void memorystatus_get_memlimit_properties_internal(proc_t p, memorystatus_memlimit_properties_t *p_entry);

int proc_get_memstat_priority(proc_t, boolean_t);

/* NOTE(review): presumably controls whether idle-band kills are snapshotted — confirm at use sites */
static boolean_t memorystatus_idle_snapshot = 0;

/* NOTE(review): page-count delta — value is assigned elsewhere; confirm semantics at init site */
unsigned int memorystatus_delta = 0;

/* Jetsam Loop Detection */
boolean_t memorystatus_jld_enabled = FALSE; /* Enable jetsam loop detection */
uint32_t memorystatus_jld_eval_period_msecs = 0; /* Init pass sets this based on device memory size */
int memorystatus_jld_max_kill_loops = 2; /* How many times should we try and kill up to the target band */
849
850 /*
851 * A FG app can request that the aggressive jetsam mechanism display some leniency in the FG band. This 'lenient' mode is described as:
852 * --- if aggressive jetsam kills an app in the FG band and gets back >=AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD memory, it will stop the aggressive march further into and up the jetsam bands.
853 *
854 * RESTRICTIONS:
855 * - Such a request is respected/acknowledged only once while that 'requesting' app is in the FG band i.e. if aggressive jetsam was
856 * needed and the 'lenient' mode was deployed then that's it for this special mode while the app is in the FG band.
857 *
858 * - If the app is still in the FG band and aggressive jetsam is needed again, there will be no stop-and-check the next time around.
859 *
860 * - Also, the transition of the 'requesting' app away from the FG band will void this special behavior.
861 */
862
863 #define AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD 25
864 boolean_t memorystatus_aggressive_jetsam_lenient_allowed = FALSE;
865 boolean_t memorystatus_aggressive_jetsam_lenient = FALSE;
866
867 #if DEVELOPMENT || DEBUG
868 /*
869 * Jetsam Loop Detection tunables.
870 */
871
872 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_eval_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_jld_eval_period_msecs, 0, "");
873 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_max_kill_loops, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_jld_max_kill_loops, 0, "");
874 #endif /* DEVELOPMENT || DEBUG */
875
876 /*
877 * snapshot support for memstats collected at boot.
878 */
879 static memorystatus_jetsam_snapshot_t memorystatus_at_boot_snapshot;
880
881 static void memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snapshot, uint32_t ods_list_count);
882 static boolean_t memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry, uint64_t gencount);
883 static void memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime);
884
885 static void memorystatus_clear_errors(void);
886
887 static void memorystatus_get_task_phys_footprint_page_counts(task_t task,
888 uint64_t *internal_pages, uint64_t *internal_compressed_pages,
889 uint64_t *purgeable_nonvolatile_pages, uint64_t *purgeable_nonvolatile_compressed_pages,
890 uint64_t *alternate_accounting_pages, uint64_t *alternate_accounting_compressed_pages,
891 uint64_t *iokit_mapped_pages, uint64_t *page_table_pages, uint64_t *frozen_to_swap_pages,
892 uint64_t *neural_nofootprint_total_pages);
893
894 static void memorystatus_get_task_memory_region_count(task_t task, uint64_t *count);
895
896 static uint32_t memorystatus_build_state(proc_t p);
897 //static boolean_t memorystatus_issue_pressure_kevent(boolean_t pressured);
898
899 static bool memorystatus_kill_top_process(bool any, bool sort_flag, uint32_t cause, os_reason_t jetsam_reason,
900 int32_t max_priority, bool only_swappable,
901 int32_t *priority, uint32_t *errors, uint64_t *memory_reclaimed);
902 static bool memorystatus_kill_processes_aggressive(uint32_t cause, int aggr_count, int32_t priority_max, int32_t max_kills, uint32_t *errors, uint64_t *memory_reclaimed);
903 static boolean_t memorystatus_kill_hiwat_proc(uint32_t *errors, boolean_t *purged, uint64_t *memory_reclaimed);
904
905 /* Priority Band Sorting Routines */
906 static int memorystatus_sort_bucket(unsigned int bucket_index, int sort_order);
907 static int memorystatus_sort_by_largest_coalition_locked(unsigned int bucket_index, int coal_sort_order);
908 static void memorystatus_sort_by_largest_process_locked(unsigned int bucket_index);
909 static int memorystatus_move_list_locked(unsigned int bucket_index, pid_t *pid_list, int list_sz);
910
911 /* qsort routines */
912 typedef int (*cmpfunc_t)(const void *a, const void *b);
913 extern void qsort(void *a, size_t n, size_t es, cmpfunc_t cmp);
914 static int memstat_asc_cmp(const void *a, const void *b);
915
916 /* VM pressure */
917
918 #if CONFIG_SECLUDED_MEMORY
919 extern unsigned int vm_page_secluded_count;
920 extern unsigned int vm_page_secluded_count_over_target;
921 #endif /* CONFIG_SECLUDED_MEMORY */
922
923 /* Aggressive jetsam pages threshold for sysproc aging policy */
924 unsigned int memorystatus_sysproc_aging_aggr_pages = 0;
925
926 uint32_t memorystatus_available_pages = UINT32_MAX;
927
928 __options_closed_decl(memorystatus_policy_t, uint8_t, {
929 kPolicyDefault = 0x00,
930 kPolicyClearTheDecks = 0x01,
931 kPolicyBallastDrain = 0x02,
932 });
933
934 static memorystatus_policy_t memstat_policy_config = kPolicyDefault;
935
/* Upper bound for any page shortage threshold: half of physical memory, in pages */
#define MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX ((uint32_t)(atop_64(max_mem) / 2))

/*
 * Jetsam Page Shortage Thresholds (PSTs):
 * - critical: jetsam above the idle band
 * - idle: jetsam in the idle band
 * - pressure: jetsam soft memory limit violators
 * - ballast: offset applied to non-critical thresholds upon request
 * from userspace
 * - ctd (clear-the-decks): offset applied to non-critical thresholds upon request
 * from userspace
 *
 * All values are in pages.
 */
uint32_t memstat_critical_threshold = 0;
uint32_t memstat_idle_threshold = 0;
uint32_t memstat_soft_threshold = 0;
uint32_t memstat_ballast_offset = 0;
uint32_t memstat_ctd_offset = 0;

/*
 * NB: These MiB thresholds are only read at boot and may become out of sync
 * with the PSTs above.
 */
TUNABLE_DT(uint32_t, memorystatus_critical_threshold_mb, "/defaults",
    "kern.memstat_critical_mb", "memorystatus_critical_threshold_mb", 0, TUNABLE_DT_NONE);
TUNABLE_DT(uint32_t, memorystatus_idle_threshold_mb, "/defaults",
    "kern.memstat_idle_mb", "memorystatus_idle_threshold_mb", 0, TUNABLE_DT_NONE);
TUNABLE_DT(uint32_t, memorystatus_pressure_threshold_mb, "/defaults",
    "kern.memstat_pressure_mb", "memorystatus_pressure_threshold_mb", 0, TUNABLE_DT_NONE);
TUNABLE_DT(uint32_t, memstat_ballast_offset_mb, "/defaults",
    "kern.memstat_ballast_mb", "memstat_ballast_offset_mb", 0, TUNABLE_DT_NONE);
/*
 * NOTE(review): the boot-arg name below is "memstat_ballast_offset_mb", which
 * collides with the boot-arg of the tunable directly above — confirm whether
 * it should instead be "memstat_ctd_offset_mb".
 */
TUNABLE(uint32_t, memstat_ctd_offset_mb, "memstat_ballast_offset_mb", 0);
967
#if CONFIG_JETSAM
TUNABLE_DT_WRITEABLE(unsigned int, memorystatus_swap_all_apps, "/defaults", "kern.swap_all_apps", "kern.swap_all_apps", false, TUNABLE_DT_NONE);
/* Will compact the early swapin queue if there are >= this many csegs on it. */
static unsigned int memorystatus_swapin_trigger_segments = 10;
unsigned int memorystatus_swapin_trigger_pages = 0;

#if DEVELOPMENT || DEBUG
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_available_pages, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_swapin_trigger_pages, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_swapin_trigger_pages, 0, "");
#else
/* On RELEASE, available_pages is still exported but masked from sysctl listings */
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages, CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, &memorystatus_available_pages, 0, "");
#endif /* DEVELOPMENT || DEBUG */
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_swap_all_apps, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_swap_all_apps, 0, "");

static int memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __unused int32_t *retval, boolean_t is_fatal_limit);

int32_t max_kill_priority = JETSAM_PRIORITY_MAX;

/* Debug aids: panic instead of killing when a matching process is about to be jetsammed */
proc_name_t memorystatus_jetsam_proc_name_panic; /* Panic when we are about to jetsam this process. */
uint32_t memorystatus_jetsam_proc_cause_panic = 0; /* If specified, panic only when we are about to jetsam the process above for this cause. */
uint32_t memorystatus_jetsam_proc_size_panic = 0; /* If specified, panic only when we are about to jetsam the process above and its footprint is more than this in MB. */

/* If set, kill swappable processes when we're low on swap space. Currently off until we can allocate more swap space (rdar://87800902) */
uint32_t jetsam_kill_on_low_swap = 0;

/*
 * Global switch for enabling fast jetsam. Fast jetsam is
 * hooked up via the system_override() system call. When
 * enabled, the following features can be toggled:
 * - clear-the-decks jetsam
 * - ballast-drain jetsam
 */
TUNABLE_WRITEABLE(bool, fast_jetsam_enabled, "fast_jetsam_enabled", true);

#else /* CONFIG_JETSAM */
/* Without CONFIG_JETSAM, only idle-band kills are permitted */
int32_t max_kill_priority = JETSAM_PRIORITY_IDLE;
#endif /* CONFIG_JETSAM */

#if DEVELOPMENT || DEBUG
extern bool kill_on_no_paging_space;
#endif /* DEVELOPMENT || DEBUG */
1009
1010 #if DEVELOPMENT || DEBUG
/*
 * Convert a byte count to whole MiB.
 *
 * NB: despite the name, this is a ceiling conversion — any partial MiB
 * rounds *up* to the next whole MiB (0 stays 0).
 */
static inline uint32_t
roundToNearestMB(uint32_t in)
{
	return (in + ((1u << 20) - 1)) / (1u << 20);
}
1016
static int memorystatus_cmd_increase_jetsam_task_limit(pid_t pid, uint32_t byte_increase);
#endif

#if __arm64__
extern int legacy_footprint_entitlement_mode;
#endif /* __arm64__ */

/* Debug */

extern struct knote *vm_find_knote_from_pid(pid_t, struct klist *);

#if DEVELOPMENT || DEBUG

/* Bucket index most recently dumped via kern.memorystatus_debug_dump_this_bucket */
static unsigned int memorystatus_debug_dump_this_bucket = 0;
1031
/*
 * Dump one jetsam bucket (or all of them) to the memorystatus debug log.
 *
 * A bucket_index >= MEMSTAT_BUCKET_COUNT means "walk every bucket".
 * Caller must hold the proc list lock.
 */
static void
memorystatus_debug_dump_bucket_locked(unsigned int bucket_index)
{
	proc_t p = NULL;
	uint64_t bytes = 0;
	int ledger_limit = 0;
	unsigned int b = bucket_index;
	boolean_t traverse_all_buckets = FALSE;

	/*
	 * NOTE(review): this if/else re-derives b and traverse_all_buckets,
	 * making the initializers above partially redundant — harmless.
	 */
	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
		traverse_all_buckets = TRUE;
		b = 0;
	} else {
		traverse_all_buckets = FALSE;
		b = bucket_index;
	}

	/*
	 * footprint reported in [pages / MB ]
	 * limits reported as:
	 * L-limit proc's Ledger limit
	 * C-limit proc's Cached limit, should match Ledger
	 * A-limit proc's Active limit
	 * IA-limit proc's Inactive limit
	 * F==Fatal, NF==NonFatal
	 */

	memorystatus_log_debug("memorystatus_debug_dump ***START*(PAGE_SIZE_64=%llu)**\n", PAGE_SIZE_64);
	memorystatus_log_debug("bucket [pid] [pages / MB] [state] [EP / RP / AP] dirty deadline [L-limit / C-limit / A-limit / IA-limit] name\n");
	p = memorystatus_get_first_proc_locked(&b, traverse_all_buckets);
	while (p) {
		bytes = get_task_phys_footprint(proc_task(p));
		task_get_phys_footprint_limit(proc_task(p), &ledger_limit);
		memorystatus_log_debug("%2d [%5d] [%5lld /%3lldMB] 0x%-8x [%2d / %2d / %2d] 0x%-3x %10lld [%3d / %3d%s / %3d%s / %3d%s] %s\n",
		    b, proc_getpid(p),
		    (bytes / PAGE_SIZE_64), /* task's footprint converted from bytes to pages */
		    (bytes / (1024ULL * 1024ULL)), /* task's footprint converted from bytes to MB */
		    p->p_memstat_state, p->p_memstat_effectivepriority, p->p_memstat_requestedpriority, p->p_memstat_assertionpriority,
		    p->p_memstat_dirty, p->p_memstat_idledeadline,
		    ledger_limit,
		    p->p_memstat_memlimit,
		    (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"),
		    p->p_memstat_memlimit_active,
		    (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL ? "F " : "NF"),
		    p->p_memstat_memlimit_inactive,
		    (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL ? "F " : "NF"),
		    (*p->p_name ? p->p_name : "unknown"));
		p = memorystatus_get_next_proc_locked(&b, p, traverse_all_buckets);
	}
	memorystatus_log_debug("memorystatus_debug_dump ***END***\n");
}
1083
1084 static int
1085 sysctl_memorystatus_debug_dump_bucket SYSCTL_HANDLER_ARGS
1086 {
1087 #pragma unused(oidp, arg2)
1088 int bucket_index = 0;
1089 int error;
1090 error = SYSCTL_OUT(req, arg1, sizeof(int));
1091 if (error || !req->newptr) {
1092 return error;
1093 }
1094 error = SYSCTL_IN(req, &bucket_index, sizeof(int));
1095 if (error || !req->newptr) {
1096 return error;
1097 }
1098 if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
1099 /*
1100 * All jetsam buckets will be dumped.
1101 */
1102 } else {
1103 /*
1104 * Only a single bucket will be dumped.
1105 */
1106 }
1107
1108 proc_list_lock();
1109 memorystatus_debug_dump_bucket_locked(bucket_index);
1110 proc_list_unlock();
1111 memorystatus_debug_dump_this_bucket = bucket_index;
1112 return error;
1113 }
1114
1115 /*
1116 * Debug aid to look at jetsam buckets and proc jetsam fields.
1117 * Use this sysctl to act on a particular jetsam bucket.
1118 * Writing the sysctl triggers the dump.
1119 * Usage: sysctl kern.memorystatus_debug_dump_this_bucket=<bucket_index>
1120 */
1121
1122 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_debug_dump_this_bucket, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_debug_dump_this_bucket, 0, sysctl_memorystatus_debug_dump_bucket, "I", "");
1123
1124
1125 /* Debug aid to aid determination of limit */
1126
/*
 * Sysctl handler backing kern.memorystatus_highwater_enabled.
 *
 * Toggles high-watermark (soft memory limit) enforcement globally: walks
 * every process in every jetsam bucket under the proc list lock. On enable,
 * each process's cached limit is recomputed via
 * _memstat_update_memlimit_locked(); on disable, the cached limit fields are
 * reset to the system-wide defaults. Either way the resulting cached limit
 * is pushed into the task ledger.
 */
static int
sysctl_memorystatus_highwater_enable SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	proc_t p;
	unsigned int b = 0;
	int error, enable = 0;
	bool use_active; /* use the active limit and active limit attributes */

	error = SYSCTL_OUT(req, arg1, sizeof(int));
	if (error || !req->newptr) {
		return error;
	}

	error = SYSCTL_IN(req, &enable, sizeof(int));
	if (error || !req->newptr) {
		return error;
	}

	/* only 0 (disable) and 1 (enable) are accepted */
	if (!(enable == 0 || enable == 1)) {
		return EINVAL;
	}

	proc_list_lock();

	memorystatus_highwater_enabled = enable;

	p = memorystatus_get_first_proc_locked(&b, TRUE);
	while (p) {
		use_active = _memstat_proc_is_active_locked(p);

		if (enable) {
			(void)_memstat_update_memlimit_locked(p, use_active);
		} else {
			/*
			 * Disabling limits does not touch the stored variants.
			 * Set the cached limit fields to system_wide defaults.
			 */
			p->p_memstat_memlimit = -1;
			p->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;
		}

		/*
		 * Enforce the cached limit by writing to the ledger.
		 */
		_memstat_write_memlimit_to_ledger_locked(p, use_active, false);

		p = memorystatus_get_next_proc_locked(&b, p, TRUE);
	}


	proc_list_unlock();

	return 0;
}
1182
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_highwater_enabled, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_highwater_enabled, 0, sysctl_memorystatus_highwater_enable, "I", "");

SYSCTL_INT(_kern, OID_AUTO, memorystatus_idle_snapshot, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_idle_snapshot, 0, "");

#endif /* DEVELOPMENT || DEBUG */

#if CONFIG_JETSAM
#if DEVELOPMENT || DEBUG
1191 static int
1192 memstat_page_shortage_threshold_sysctl_handler SYSCTL_HANDLER_ARGS
1193 {
1194 uint32_t threshold;
1195 if (arg1 == &memstat_idle_threshold) {
1196 threshold = memorystatus_get_idle_exit_page_shortage_threshold();
1197 } else if (arg1 == &memstat_soft_threshold) {
1198 threshold = memorystatus_get_soft_memlimit_page_shortage_threshold();
1199 } else if (arg1 == &memstat_critical_threshold) {
1200 threshold = memorystatus_get_critical_page_shortage_threshold();
1201 } else {
1202 return EINVAL;
1203 }
1204 return sysctl_handle_int(oidp, NULL, threshold, req);
1205 }
1206
/* Read-only views of the effective page shortage thresholds (in pages) */
SYSCTL_PROC(_kern_memorystatus, OID_AUTO, available_pages_critical,
    CTLFLAG_RD | CTLFLAG_LOCKED, &memstat_critical_threshold, 0,
    memstat_page_shortage_threshold_sysctl_handler, "IU",
    "");
SYSCTL_PROC(_kern_memorystatus, OID_AUTO, available_pages_idle,
    CTLFLAG_RD | CTLFLAG_LOCKED, &memstat_idle_threshold, 0,
    memstat_page_shortage_threshold_sysctl_handler, "IU",
    "");
SYSCTL_PROC(_kern_memorystatus, OID_AUTO, available_pages_soft,
    CTLFLAG_RD | CTLFLAG_LOCKED, &memstat_soft_threshold, 0,
    memstat_page_shortage_threshold_sysctl_handler, "IU",
    "");

SYSCTL_INT(_kern_memorystatus, OID_AUTO, ballast_offset_pages,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &memstat_ballast_offset, 0, "");
SYSCTL_INT(_kern_memorystatus, OID_AUTO, ctd_offset_pages,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    &memstat_ctd_offset, 0, "");
1227
/*
 * Shared sysctl/experiment-factor handler for the MiB-denominated page
 * shortage threshold knobs (kern.memorystatus.*_mb).
 *
 * arg1 points at one of the page-denominated globals. Reads convert
 * pages -> MiB; writes validate the new value (overflow, upper bound,
 * non-zero for the three primary thresholds), convert MiB -> pages, log
 * the change, and store the result atomically.
 */
static int
memstat_page_shortage_threshold_experiment_handler SYSCTL_HANDLER_ARGS
{
	uint32_t threshold_mb;
	int error;

	assert3p(arg1, !=, NULL);
	/* Report the current value in MiB (the stored value is in pages) */
	threshold_mb = ptoa_32(os_atomic_load((uint32_t *)arg1, relaxed)) >> 20;

	error = sysctl_handle_int(oidp, &threshold_mb, 0, req);
	if (error || !req->newptr) {
		return error;
	}

	if (threshold_mb > UINT32_MAX >> 20) {
		/* Converting to bytes would overflow */
		return EINVAL;
	}

	uint32_t new_threshold_pages = atop_32(threshold_mb << 20);
	/*
	 * Page shortage thresholds may not exceed 1/2 max_mem
	 */
	if (new_threshold_pages > MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX) {
		return EINVAL;
	}
	/* The three primary thresholds must stay non-zero; the offsets may be zero */
	if ((arg1 == &memstat_soft_threshold ||
	    arg1 == &memstat_idle_threshold ||
	    arg1 == &memstat_critical_threshold) &&
	    new_threshold_pages == 0) {
		return EINVAL;
	}

	if (arg1 == &memstat_soft_threshold) {
		memorystatus_log("memorystatus: setting soft memory limit "
		    "page shortage threshold to %u MiB\n", threshold_mb);
	} else if (arg1 == &memstat_idle_threshold) {
		memorystatus_log("memorystatus: setting idle exit page "
		    "shortage threshold to %u MiB\n", threshold_mb);
	} else if (arg1 == &memstat_critical_threshold) {
		memorystatus_log("memorystatus: setting critical page shortage"
		    " threshold to %u MiB\n", threshold_mb);
	} else if (arg1 == &memstat_ctd_offset) {
		memorystatus_log("memorystatus: setting clear-the-decks page shortage"
		    " offset to %u MiB\n", threshold_mb);
	} else if (arg1 == &memstat_ballast_offset) {
		memorystatus_log("memorystatus: setting ballast page shortage"
		    " offset to %u MiB\n", threshold_mb);
	} else {
		return EINVAL;
	}
	os_atomic_store((uint32_t *)arg1, new_threshold_pages, relaxed);

	return 0;
}
1283
1284 #if DEVELOPMENT || DEBUG
1285 #define MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS (CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED)
1286 #else /* RELEASE */
1287 #define MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS (CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED)
1288 #endif /* DEVELOPMENT || DEBUG */
1289
1290 EXPERIMENT_FACTOR_PROC(_kern_memorystatus, soft_threshold_mb,
1291 MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
1292 &memstat_soft_threshold, 0, memstat_page_shortage_threshold_experiment_handler,
1293 "IU",
1294 "The minimum amount of available memory to maintain before killing "
1295 "processes which have violated there soft memory limit");
1296
1297 EXPERIMENT_FACTOR_PROC(_kern_memorystatus, idle_threshold_mb,
1298 MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
1299 &memstat_idle_threshold, 0, memstat_page_shortage_threshold_experiment_handler,
1300 "IU",
1301 "The minimum amount of available memory to maintain before exiting idle "
1302 "processes");
1303 EXPERIMENT_FACTOR_PROC(_kern_memorystatus, critical_threshold_mb,
1304 MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
1305 &memstat_critical_threshold, 0, memstat_page_shortage_threshold_experiment_handler,
1306 "IU",
1307 "The minimum amount of available memory to maintain before killing non-idle "
1308 "processes");
1309 EXPERIMENT_FACTOR_PROC(_kern_memorystatus, ballast_offset_mb,
1310 MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
1311 &memstat_ballast_offset, 0, memstat_page_shortage_threshold_experiment_handler,
1312 "IU",
1313 "An offset to apply to all non-critical page shortage thresholds when "
1314 "ballast is filling");
1315 EXPERIMENT_FACTOR_PROC(_kern_memorystatus, clear_the_decks_offset_mb,
1316 MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
1317 &memstat_ctd_offset, 0, memstat_page_shortage_threshold_experiment_handler,
1318 "IU",
1319 "An offset to apply to all non-critical page shortage thresholds when "
1320 "clear-the-decks is engaged");
1321
/*
 * Engage ("drain") or disengage ("flood") the ballast-drain jetsam policy.
 *
 * Draining sets kPolicyBallastDrain in memstat_policy_config, which adds
 * memstat_ballast_offset pages to the non-critical page shortage thresholds;
 * flooding clears the bit again. Both paths bump the jetsam thread pool and
 * poke the jetsam thread so the new thresholds take effect promptly.
 *
 * Returns 0 on success (including when already in the requested state or
 * when no ballast offset is configured); ENOTSUP if fast-jetsam is disabled.
 */
int
memorystatus_ballast_control(bool drain)
{
	if (!fast_jetsam_enabled) {
		memorystatus_log_error("memorystatus: fast-jetsam "
		    "has been disabled on this system. denying request to %s ballast\n",
		    drain ? "drain" : "flood");
		return ENOTSUP;
	}
	if (memstat_ballast_offset == 0) {
		/* nothing to do */
		return 0;
	}
	if (drain) {
		/*
		 * Drain the ballast tanks, providing additional buoyancy by requiring that
		 * they only be used to store "available" memory.
		 */
		memorystatus_policy_t orig_policy = os_atomic_or_orig(
			&memstat_policy_config,
			(memorystatus_policy_t)kPolicyBallastDrain, relaxed);
		if (orig_policy & kPolicyBallastDrain) {
			/* already draining — idempotent success */
			return 0;
		}
		memorystatus_log("memorystatus: draining ballast "
		    "-- will add %u MiB to non-critical page shortage "
		    "thresholds\n", ptoa_32(memstat_ballast_offset) >> 20);
		memorystatus_thread_pool_max();
		_memstat_consider_waking_jetsam_thread();
	} else {
		/*
		 * Flood the ballast tanks, removing the extra buoyancy by allowing them to be
		 * filled with "unavailable" memory.
		 */
		memorystatus_policy_t orig_policy = os_atomic_andnot_orig(
			&memstat_policy_config,
			(memorystatus_policy_t)kPolicyBallastDrain, relaxed);
		if (!(orig_policy & kPolicyBallastDrain)) {
			/* already disabled */
			return 0;
		}
		assertf(fast_jetsam_enabled, "ballast was drained while fast-jetsam was disabled");
		memorystatus_log("memorystatus: flooding ballast "
		    "-- will subtract %u MiB from non-critical page shortage "
		    "thresholds\n", ptoa_32(memstat_ballast_offset) >> 20);
		memorystatus_thread_pool_default();
		_memstat_consider_waking_jetsam_thread();
	}
	return 0;
}
1372
1373 static int
1374 sysctl_kern_memorystatus_ballast_drain SYSCTL_HANDLER_ARGS
1375 {
1376 int error = 0;
1377
1378 boolean_t drained = os_atomic_load(&memstat_policy_config, relaxed) & kPolicyBallastDrain ? TRUE : FALSE;
1379
1380 error = sysctl_handle_int(oidp, &drained, 0, req);
1381 if (error || !req->newptr) {
1382 return error;
1383 }
1384
1385 /*
1386 * Writers must be root or have the com.apple.private.kernel.jetsam entitlement
1387 */
1388 error = priv_check_cred(kauth_cred_get(), PRIV_VM_JETSAM, 0);
1389 if (error) {
1390 return error;
1391 }
1392
1393 return memorystatus_ballast_control(drained);
1394 }
1395
/* Userspace toggle for the ballast-drain policy (privileged writes only) */
SYSCTL_PROC(_kern_memorystatus, OID_AUTO, ballast_drained,
    MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS, 0, 0,
    sysctl_kern_memorystatus_ballast_drain, "IU",
    "If true, apply an offset (kern.memorystatus.ballast_offset_mb) to "
    "all non-critical page shortage thresholds");
1401
1402 #if DEVELOPMENT || DEBUG
1403 /*
1404 * In preparation for a storm, sailors may "clear the decks" of non-essential
1405 * cargo to increase the seaworthiness of a vessel. In our analogy, the
1406 * non-essential cargo is idle processes or processes which have exceeded
1407 * their memory limit. The storm may be any foreseeable user activity that will
1408 * require significant memory demand.
1409 *
1410 * Mechanically, clearing the decks involves adding a configurable offset to
1411 * the idle and soft available page shortage thresholds.
1412 *
 * Readers may note that the clear-the-decks policy is mechanically identical
 * to the ballast-draining policy. Their difference lies in intended use.
1415 * Clear-the-decks is intended to address imminent memory demand and may be
1416 * configured with an offset that wouldn't be sustainable for long-term system
1417 * use. The interface is generally intended to allow clients to hint to the
1418 * system that they will need a significant amount of memory in the near future,
 * and the system should proactively try to free unneeded reserves so as to
 * better satisfy the demand.
1421 *
 * This policy is currently only exposed on development kernels for prototyping
 * until a productized use case emerges.
1424 *
1425 * TODO: If adopted on production systems, this mechanism should use a
1426 * dedicated system-call / memorystatus-command
1427 */
/*
 * Engage or disengage the clear-the-decks jetsam policy.
 *
 * Setting kPolicyClearTheDecks in memstat_policy_config adds
 * memstat_ctd_offset pages to the non-critical page shortage thresholds;
 * clearing it removes the offset. Both paths bump the jetsam thread pool
 * and poke the jetsam thread.
 *
 * Returns 0 on a state change, EALREADY if already in the requested state,
 * ENOTSUP if fast-jetsam is disabled.
 */
static int
memstat_clear_the_decks(bool clear)
{
	if (!fast_jetsam_enabled) {
		memorystatus_log_error("memorystatus: fast-jetsam "
		    "has been disabled on this system\n");
		return ENOTSUP;
	}
	if (clear) {
		/*
		 * Clear the decks of non-essential cargo.
		 */
		memorystatus_policy_t orig_policy = os_atomic_or_orig(
			&memstat_policy_config,
			(memorystatus_policy_t)kPolicyClearTheDecks, relaxed);
		if (orig_policy & kPolicyClearTheDecks) {
			return EALREADY;
		}
		memorystatus_log("memorystatus: clear-the-decks engaged "
		    "-- will add %u MiB to non-critical page shortage "
		    "thresholds\n", ptoa_32(memstat_ctd_offset) >> 20);
		memorystatus_thread_pool_max();
		_memstat_consider_waking_jetsam_thread();
	} else {
		/*
		 * Allow the decks to be reloaded with non-essential cargo.
		 */
		memorystatus_policy_t orig_policy = os_atomic_andnot_orig(
			&memstat_policy_config,
			(memorystatus_policy_t)kPolicyClearTheDecks, relaxed);
		if (!(orig_policy & kPolicyClearTheDecks)) {
			return EALREADY;
		}
		assertf(fast_jetsam_enabled, "clear the decks was set while fast-jetsam was disabled");
		memorystatus_log("memorystatus: clear-the-decks disengaged "
		    "-- will subtract %u MiB from non-critical page shortage "
		    "thresholds\n", ptoa_32(memstat_ctd_offset) >> 20);
		memorystatus_thread_pool_default();
		_memstat_consider_waking_jetsam_thread();
	}
	return 0;
}
1470
1471 static int
1472 sysctl_kern_memorystatus_decks_cleared SYSCTL_HANDLER_ARGS
1473 {
1474 int error = 0;
1475
1476 boolean_t cleared = os_atomic_load(&memstat_policy_config, relaxed) & kPolicyClearTheDecks ? TRUE : FALSE;
1477
1478 error = sysctl_handle_int(oidp, &cleared, 0, req);
1479 if (error || !req->newptr) {
1480 return error;
1481 }
1482
1483 /*
1484 * Writers must be root or have the com.apple.private.kernel.jetsam entitlement
1485 */
1486 error = priv_check_cred(kauth_cred_get(), PRIV_VM_JETSAM, 0);
1487 if (error) {
1488 return error;
1489 }
1490
1491 return memstat_clear_the_decks(cleared);
1492 }
1493
1494 SYSCTL_PROC(_kern_memorystatus, OID_AUTO, decks_cleared,
1495 MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS,
1496 0, 0, sysctl_kern_memorystatus_decks_cleared, "I",
1497 "If true, apply an offset (kern.memorystatus_ctd_offset_mb) to "
1498 "all non-critical page shortage thresholds");
#endif /* DEVELOPMENT || DEBUG */
#endif /* CONFIG_JETSAM */

/* Kernel-private thread creation with an explicit scheduling priority */
extern kern_return_t kernel_thread_start_priority(thread_continue_t continuation,
    void *parameter,
    integer_t priority,
    thread_t *new_thread);

#if DEVELOPMENT || DEBUG
1508
1509 static int
1510 sysctl_memorystatus_disconnect_page_mappings SYSCTL_HANDLER_ARGS
1511 {
1512 #pragma unused(arg1, arg2)
1513 int error = 0, pid = 0;
1514 proc_t p;
1515
1516 error = sysctl_handle_int(oidp, &pid, 0, req);
1517 if (error || !req->newptr) {
1518 return error;
1519 }
1520
1521 lck_mtx_lock(&disconnect_page_mappings_mutex);
1522
1523 if (pid == -1) {
1524 vm_pageout_disconnect_all_pages();
1525 } else {
1526 p = proc_find(pid);
1527
1528 if (p != NULL) {
1529 error = task_disconnect_page_mappings(proc_task(p));
1530
1531 proc_rele(p);
1532
1533 if (error) {
1534 error = EIO;
1535 }
1536 } else {
1537 error = EINVAL;
1538 }
1539 }
1540 lck_mtx_unlock(&disconnect_page_mappings_mutex);
1541
1542 return error;
1543 }
1544
/* Write-only debug knob; masked from sysctl listings */
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_disconnect_page_mappings, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_memorystatus_disconnect_page_mappings, "I", "");

#endif /* DEVELOPMENT || DEBUG */
1549
1550 /*
1551 * Sorts the given bucket.
1552 *
1553 * Input:
1554 * bucket_index - jetsam priority band to be sorted.
1555 * sort_order - JETSAM_SORT_xxx from kern_memorystatus.h
1556 * Currently sort_order is only meaningful when handling
1557 * coalitions.
1558 *
1559 * proc_list_lock must be held by the caller.
1560 */
1561 static void
memorystatus_sort_bucket_locked(unsigned int bucket_index,int sort_order)1562 memorystatus_sort_bucket_locked(unsigned int bucket_index, int sort_order)
1563 {
1564 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
1565 if (memstat_bucket[bucket_index].count == 0) {
1566 return;
1567 }
1568
1569 switch (bucket_index) {
1570 case JETSAM_PRIORITY_FOREGROUND:
1571 if (memorystatus_sort_by_largest_coalition_locked(bucket_index, sort_order) == 0) {
1572 /*
1573 * Fall back to per process sorting when zero coalitions are found.
1574 */
1575 memorystatus_sort_by_largest_process_locked(bucket_index);
1576 }
1577 break;
1578 default:
1579 memorystatus_sort_by_largest_process_locked(bucket_index);
1580 break;
1581 }
1582 }
1583
1584 /*
1585 * Picks the sorting routine for a given jetsam priority band.
1586 *
1587 * Input:
1588 * bucket_index - jetsam priority band to be sorted.
1589 * sort_order - JETSAM_SORT_xxx from kern_memorystatus.h
1590 * Currently sort_order is only meaningful when handling
1591 * coalitions.
1592 *
1593 * Return:
1594 * 0 on success
1595 * non-0 on failure
1596 */
1597 static int
memorystatus_sort_bucket(unsigned int bucket_index,int sort_order)1598 memorystatus_sort_bucket(unsigned int bucket_index, int sort_order)
1599 {
1600 int coal_sort_order;
1601
1602 /*
1603 * Verify the jetsam priority
1604 */
1605 if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
1606 return EINVAL;
1607 }
1608
1609 #if DEVELOPMENT || DEBUG
1610 if (sort_order == JETSAM_SORT_DEFAULT) {
1611 coal_sort_order = COALITION_SORT_DEFAULT;
1612 } else {
1613 coal_sort_order = sort_order; /* only used for testing scenarios */
1614 }
1615 #else
1616 /* Verify default */
1617 if (sort_order == JETSAM_SORT_DEFAULT) {
1618 coal_sort_order = COALITION_SORT_DEFAULT;
1619 } else {
1620 return EINVAL;
1621 }
1622 #endif
1623
1624 proc_list_lock();
1625 memorystatus_sort_bucket_locked(bucket_index, coal_sort_order);
1626 proc_list_unlock();
1627
1628 return 0;
1629 }
1630
1631 /*
1632 * Sort processes by size for a single jetsam bucket.
1633 */
1634
/*
 * Implementation: in-place selection sort over the bucket's TAILQ. Each
 * outer pass scans the unsorted remainder for the process with the largest
 * resident page count (memorystatus_get_task_page_counts) and splices it in
 * immediately after the previously-placed entry. Caller holds the proc
 * list lock.
 */
static void
memorystatus_sort_by_largest_process_locked(unsigned int bucket_index)
{
	proc_t p = NULL, insert_after_proc = NULL, max_proc = NULL;
	proc_t next_p = NULL, prev_max_proc = NULL;
	uint32_t pages = 0, max_pages = 0;
	memstat_bucket_t *current_bucket;

	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
		return;
	}

	current_bucket = &memstat_bucket[bucket_index];

	p = TAILQ_FIRST(&current_bucket->list);

	while (p) {
		/* Start a pass: assume the current head of the unsorted tail is largest */
		memorystatus_get_task_page_counts(proc_task(p), &pages, NULL, NULL);
		max_pages = pages;
		max_proc = p;
		prev_max_proc = p;

		while ((next_p = TAILQ_NEXT(p, p_memstat_list)) != NULL) {
			/* traversing list until we find next largest process */
			p = next_p;
			memorystatus_get_task_page_counts(proc_task(p), &pages, NULL, NULL);
			if (pages > max_pages) {
				max_pages = pages;
				max_proc = p;
			}
		}

		if (prev_max_proc != max_proc) {
			/* found a larger process, place it in the list */
			TAILQ_REMOVE(&current_bucket->list, max_proc, p_memstat_list);
			if (insert_after_proc == NULL) {
				TAILQ_INSERT_HEAD(&current_bucket->list, max_proc, p_memstat_list);
			} else {
				TAILQ_INSERT_AFTER(&current_bucket->list, insert_after_proc, max_proc, p_memstat_list);
			}
			prev_max_proc = max_proc;
		}

		/* The sorted prefix now ends at max_proc; resume after it */
		insert_after_proc = max_proc;

		p = TAILQ_NEXT(max_proc, p_memstat_list);
	}
}
1683
1684 proc_t
memorystatus_get_first_proc_locked(unsigned int * bucket_index,boolean_t search)1685 memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search)
1686 {
1687 memstat_bucket_t *current_bucket;
1688 proc_t next_p;
1689
1690 if ((*bucket_index) >= MEMSTAT_BUCKET_COUNT) {
1691 return NULL;
1692 }
1693
1694 current_bucket = &memstat_bucket[*bucket_index];
1695 next_p = TAILQ_FIRST(¤t_bucket->list);
1696 if (!next_p && search) {
1697 while (!next_p && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
1698 current_bucket = &memstat_bucket[*bucket_index];
1699 next_p = TAILQ_FIRST(¤t_bucket->list);
1700 }
1701 }
1702
1703 return next_p;
1704 }
1705
1706 proc_t
memorystatus_get_next_proc_locked(unsigned int * bucket_index,proc_t p,boolean_t search)1707 memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search)
1708 {
1709 memstat_bucket_t *current_bucket;
1710 proc_t next_p;
1711
1712 if (!p || ((*bucket_index) >= MEMSTAT_BUCKET_COUNT)) {
1713 return NULL;
1714 }
1715
1716 next_p = TAILQ_NEXT(p, p_memstat_list);
1717 while (!next_p && search && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
1718 current_bucket = &memstat_bucket[*bucket_index];
1719 next_p = TAILQ_FIRST(¤t_bucket->list);
1720 }
1721
1722 return next_p;
1723 }
1724
/* Per-thread jetsam state array; allocated in memorystatus_init() */
jetsam_state_t jetsam_threads;

/* Maximum number of jetsam threads allowed */
#define JETSAM_THREADS_LIMIT 3

/* Number of active jetsam threads */
_Atomic unsigned int active_jetsam_threads = 1;
/* Number of maximum jetsam threads configured */
unsigned int max_jetsam_threads = 1;
1734
1735 static jetsam_state_t
jetsam_current_thread()1736 jetsam_current_thread()
1737 {
1738 for (int thr_id = 0; thr_id < max_jetsam_threads; thr_id++) {
1739 if (jetsam_threads[thr_id].thread == current_thread()) {
1740 return &(jetsam_threads[thr_id]);
1741 }
1742 }
1743 return NULL;
1744 }
1745
1746 #if CONFIG_JETSAM
/*
 * Compute the entitled (and entitled-developer) max task footprint limits,
 * reconciling the "entitled_max_task_pmem" boot-arg, device-tree defaults,
 * and the app-swap configuration. Values are in MB; 0 disables the limit.
 */
static void
initialize_entitled_max_task_limit(void)
{
	/**
	 * We've already stored the potential boot-arg "entitled_max_task_pmem" in
	 * memorystatus_entitled_max_task_footprint_mb as a TUNABLE_DT. We provide
	 * argptr=NULL and max_len=0 here to check only for existence of the boot-arg.
	 *
	 * The boot-arg takes precedence over memorystatus_swap_all_apps.
	 */
	if (!PE_parse_boot_argn("entitled_max_task_pmem", NULL, 0) && memorystatus_swap_all_apps) {
		/*
		 * When we have swap, we let entitled apps go up to the dram config
		 * regardless of what's set in EDT,
		 * This can still be overriden with the entitled_max_task_pmem boot-arg.
		 */
		memorystatus_entitled_max_task_footprint_mb =
		    (int32_t)(max_mem_actual / (1ULL << 20));
		memorystatus_entitled_dev_max_task_footprint_mb =
		    memorystatus_entitled_max_task_footprint_mb;
	}

	/* Negative entitled limits are invalid; clamp to 0 (no limit). */
	if (memorystatus_entitled_max_task_footprint_mb < 0) {
		memorystatus_log_error("Invalid value (%d) for entitled_max_task_pmem. "
		    "Setting to 0\n", memorystatus_entitled_max_task_footprint_mb);
		memorystatus_entitled_max_task_footprint_mb = 0;
	}

	/* For the developer limit, -1 is a sentinel meaning "all of DRAM". */
	if (memorystatus_entitled_dev_max_task_footprint_mb < -1) {
		memorystatus_log_error("Invalid value (%d) for entitled_max_developer_task_pmem. "
		    "Setting to 0\n", memorystatus_entitled_dev_max_task_footprint_mb);
		memorystatus_entitled_dev_max_task_footprint_mb = 0;
	} else if (memorystatus_entitled_dev_max_task_footprint_mb == -1) {
		memorystatus_entitled_dev_max_task_footprint_mb = (int32_t)
		    (max_mem_actual >> 20);
	}

	/*
	 * A non-zero developer limit below the entitled limit makes no sense;
	 * raise it to match the entitled limit.
	 */
	if (memorystatus_entitled_dev_max_task_footprint_mb &&
	    memorystatus_entitled_dev_max_task_footprint_mb <
	    memorystatus_entitled_max_task_footprint_mb) {
		memorystatus_log_error("memorystatus: Entitled developer limit (%d MB) "
		    "must be ≥ entitled task limit (%d MB)\n",
		    memorystatus_entitled_dev_max_task_footprint_mb,
		    memorystatus_entitled_max_task_footprint_mb);
		memorystatus_entitled_dev_max_task_footprint_mb =
		    memorystatus_entitled_max_task_footprint_mb;
	}
}
1795
1796 #endif /* CONFIG_JETSAM */
1797
1798
/*
 * One-time initialization of the memorystatus subsystem, called at boot.
 * Configures freezer and jetsam tunables, initializes the priority buckets,
 * computes the memory-pressure thresholds, allocates the jetsam snapshot
 * buffers, and starts the jetsam thread pool.
 */
__private_extern__ void
memorystatus_init(void)
{
	kern_return_t result;
	int i;

#if CONFIG_FREEZE
	/* Seed freezer tunables with their compile-time defaults. */
	memorystatus_frozen_processes_max = FREEZE_PROCESSES_MAX_DEFAULT;
	memorystatus_frozen_shared_mb_max = ((MAX_FROZEN_SHARED_MB_PERCENT * max_task_footprint_mb) / 100); /* 10% of the system wide task limit */
	memorystatus_freeze_shared_mb_per_process_max = (memorystatus_frozen_shared_mb_max / 4);
	memorystatus_freeze_pages_min = FREEZE_PAGES_MIN_DEFAULT;
	memorystatus_freeze_pages_max = FREEZE_PAGES_MAX_DEFAULT;
	memorystatus_max_frozen_demotions_daily = MAX_FROZEN_PROCESS_DEMOTIONS_DEFAULT;
	memorystatus_thaw_count_demotion_threshold = MIN_THAW_DEMOTION_THRESHOLD_DEFAULT;
	memorystatus_min_thaw_refreeze_threshold = MIN_THAW_REFREEZE_THRESHOLD_DEFAULT;
#endif /* CONFIG_FREEZE */

#if DEVELOPMENT || DEBUG
	if (kill_on_no_paging_space) {
		max_kill_priority = JETSAM_PRIORITY_MAX;
	}
#endif
	// Note: no-op pending rdar://27006343 (Custom kernel log handles)
	memorystatus_log_handle = os_log_create("com.apple.xnu", "memorystatus");

	/* Init buckets */
	for (i = 0; i < MEMSTAT_BUCKET_COUNT; i++) {
		TAILQ_INIT(&memstat_bucket[i].list);
		memstat_bucket[i].count = 0;
		memstat_bucket[i].relaunch_high_count = 0;
	}
	memorystatus_idle_demotion_call = thread_call_allocate((thread_call_func_t)memorystatus_perform_idle_demotion, NULL);

	/* Both sysprocs and apps currently share the same idle deferral time. */
	nanoseconds_to_absolutetime(memstat_idle_deferral_time_s * NSEC_PER_SEC, &memorystatus_sysprocs_idle_delay_time);
	nanoseconds_to_absolutetime(memstat_idle_deferral_time_s * NSEC_PER_SEC, &memorystatus_apps_idle_delay_time);
	assert3u(memstat_idle_deferral_time_s, >=, kJetsamSysProcsIdleDelayTimeLowRatio);

#if CONFIG_JETSAM
	bzero(memorystatus_jetsam_proc_name_panic, sizeof(memorystatus_jetsam_proc_name_panic));
	/* Debug aid: panic when a specific process is jetsammed (optionally filtered by cause/size). */
	if (PE_parse_boot_argn("jetsam_proc_name_panic", &memorystatus_jetsam_proc_name_panic, sizeof(memorystatus_jetsam_proc_name_panic))) {
		/*
		 * No bounds check to see if this is a valid cause.
		 * This is a debugging aid. The callers should know precisely which cause they wish to track.
		 */
		PE_parse_boot_argn("jetsam_proc_cause_panic", &memorystatus_jetsam_proc_cause_panic, sizeof(memorystatus_jetsam_proc_cause_panic));
		PE_parse_boot_argn("jetsam_proc_size_panic", &memorystatus_jetsam_proc_size_panic, sizeof(memorystatus_jetsam_proc_size_panic));
	}

	if (memorystatus_swap_all_apps && vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
		panic("kern.swap_all_apps is not supported on this platform");
	}

	/*
	 * The aging bands cannot overlap with the JETSAM_PRIORITY_ELEVATED_INACTIVE
	 * band and must be below it in priority. This is so that we don't have to make
	 * our 'aging' code worry about a mix of processes, some of which need to age
	 * and some others that need to stay elevated in the jetsam bands.
	 */
	assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > system_procs_aging_band);
	assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > system_procs_aging_band_stuck);
	assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > applications_aging_band);

	/* Take snapshots for idle-exit kills by default? First check the boot-arg... */
	if (!PE_parse_boot_argn("jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot))) {
		/* ...no boot-arg, so check the device tree */
		PE_get_default("kern.jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot));
	}

	memorystatus_sysproc_aging_aggr_pages = (unsigned int)MEMSTAT_PERCENT_TOTAL_PAGES(MEMORYSTATUS_AGGR_SYSPROC_AGING_PERCENTAGE);

	/* Small-memory devices use a larger delta percentage. */
	if (max_mem <= MEMORYSTATUS_SMALL_MEMORY_THRESHOLD) {
		memorystatus_delta = (unsigned int)MEMSTAT_PERCENT_TOTAL_PAGES(MEMORYSTATUS_DELTA_PERCENTAGE_SMALL);
	} else {
		memorystatus_delta = (unsigned int)MEMSTAT_PERCENT_TOTAL_PAGES(MEMORYSTATUS_DELTA_PERCENTAGE_LARGE);
	}

	/* Critical threshold: explicit MB override, else a percentage of total pages. */
	if (memorystatus_critical_threshold_mb != 0) {
		memstat_critical_threshold = atop_32(memorystatus_critical_threshold_mb << 20);
	} else {
		if (max_mem <= MEMORYSTATUS_SMALL_MEMORY_THRESHOLD) {
			memstat_critical_threshold = MEMSTAT_PERCENT_TOTAL_PAGES(MEMORYSTATUS_CRITICAL_PERCENTAGE_SMALL);
		} else {
			memstat_critical_threshold = MEMSTAT_PERCENT_TOTAL_PAGES(MEMORYSTATUS_CRITICAL_PERCENTAGE_LARGE);
		}
	}
	assert3u(memstat_critical_threshold, <, MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX);

	/* Idle threshold: explicit MB override, else a ratio of the critical threshold. */
	if (memorystatus_idle_threshold_mb != 0) {
		memstat_idle_threshold = atop_32(memorystatus_idle_threshold_mb << 20);
	} else {
		/*
		 * For historical reasons, devices with "medium"-sized memory configs have a different critical:idle:pressure ratio
		 */
		if ((max_mem > MEMORYSTATUS_SMALL_MEMORY_THRESHOLD) &&
		    (max_mem <= MEMORYSTATUS_MEDIUM_MEMORY_THRESHOLD)) {
			memstat_idle_threshold = (MEMORYSTATUS_IDLE_RATIO_NUM_MEDIUM * memstat_critical_threshold) /
			    MEMORYSTATUS_IDLE_RATIO_DENOM_MEDIUM;
		} else {
			memstat_idle_threshold = (MEMORYSTATUS_IDLE_RATIO_NUM * memstat_critical_threshold) /
			    MEMORYSTATUS_IDLE_RATIO_DENOM;
		}
	}
	assert3u(memstat_idle_threshold, <, MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX);

	/* Soft (pressure) threshold: explicit MB override, else a ratio of the critical threshold. */
	if (memorystatus_pressure_threshold_mb != 0) {
		memstat_soft_threshold = atop_32(memorystatus_pressure_threshold_mb << 20);
	} else {
		if ((max_mem > MEMORYSTATUS_SMALL_MEMORY_THRESHOLD) &&
		    (max_mem <= MEMORYSTATUS_MEDIUM_MEMORY_THRESHOLD)) {
			memstat_soft_threshold = (MEMORYSTATUS_PRESSURE_RATIO_NUM_MEDIUM * memstat_critical_threshold) /
			    MEMORYSTATUS_PRESSURE_RATIO_DENOM_MEDIUM;
		} else {
			memstat_soft_threshold = (MEMORYSTATUS_PRESSURE_RATIO_NUM * memstat_critical_threshold) /
			    MEMORYSTATUS_PRESSURE_RATIO_DENOM;
		}
	}
	assert3u(memstat_soft_threshold, <, MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX);

	if (memstat_ballast_offset_mb != 0) {
		memstat_ballast_offset = atop_32(memstat_ballast_offset_mb << 20);
	}
	assert3u(memstat_ballast_offset, <, MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX);

	if (memstat_ctd_offset_mb != 0) {
		memstat_ctd_offset = atop_32(memstat_ctd_offset_mb << 20);
	}
	assert3u(memstat_ctd_offset, <, MEMSTAT_PAGE_SHORTAGE_THRESHOLD_MAX);

	/* Set the swapin trigger in pages based on the maximum size allocated for each c_seg */
	memorystatus_swapin_trigger_pages = (unsigned int) atop_64(memorystatus_swapin_trigger_segments * c_seg_allocsize);

	/* Jetsam Loop Detection */
	if (max_mem <= (512 * 1024 * 1024)) {
		/* 512 MB devices */
		memorystatus_jld_eval_period_msecs = 8000; /* 8000 msecs == 8 second window */
	} else {
		/* 1GB and larger devices */
		memorystatus_jld_eval_period_msecs = 6000; /* 6000 msecs == 6 second window */
	}

	memorystatus_jld_enabled = TRUE;

	initialize_entitled_max_task_limit();
#endif /* CONFIG_JETSAM */

	/* Size the snapshot buffer for the worst case: one entry per process. */
	memorystatus_jetsam_snapshot_max = maxproc;

	memorystatus_jetsam_snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
	    (sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_max);

	memorystatus_jetsam_snapshot = kalloc_data(memorystatus_jetsam_snapshot_size, Z_WAITOK | Z_ZERO);
	if (!memorystatus_jetsam_snapshot) {
		panic("Could not allocate memorystatus_jetsam_snapshot");
	}

#if CONFIG_FREEZE
	/* The freezer snapshot is a scaled-down version of the main snapshot. */
	memorystatus_jetsam_snapshot_freezer_max = memorystatus_jetsam_snapshot_max / JETSAM_SNAPSHOT_FREEZER_MAX_FACTOR;
	memorystatus_jetsam_snapshot_freezer_size = sizeof(memorystatus_jetsam_snapshot_t) +
	    (sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_freezer_max);

	memorystatus_jetsam_snapshot_freezer =
	    zalloc_permanent(memorystatus_jetsam_snapshot_freezer_size, ZALIGN_PTR);
#endif /* CONFIG_FREEZE */

	nanoseconds_to_absolutetime((uint64_t)JETSAM_SNAPSHOT_TIMEOUT_SECS * NSEC_PER_SEC, &memorystatus_jetsam_snapshot_timeout);

	memset(&memorystatus_at_boot_snapshot, 0, sizeof(memorystatus_jetsam_snapshot_t));

#if CONFIG_FREEZE
	/* Freeze threshold: explicit MB override, else a percentage of total pages. */
	if (memorystatus_freeze_threshold_mb != 0) {
		memorystatus_freeze_threshold = (unsigned int)atop_64((uint64_t)memorystatus_freeze_threshold_mb << 20);
	} else {
		memorystatus_freeze_threshold = (unsigned int)MEMSTAT_PERCENT_TOTAL_PAGES(MEMORYSTATUS_FREEZE_THRESHOLD_PERCENTAGE);
	}
	assert(memorystatus_freeze_threshold < (unsigned int)atop_64(max_mem));

	if (memorystatus_swap_all_apps) {
		/*
		 * Swap is enabled, so we expect a larger working set & larger apps.
		 * Adjust thresholds accordingly.
		 */
		memorystatus_freeze_configure_for_swap();
	}
#endif

	/* Check the boot-arg to configure the maximum number of jetsam threads */
	if (!PE_parse_boot_argn("max_jetsam_threads", &max_jetsam_threads, sizeof(max_jetsam_threads))) {
		max_jetsam_threads = JETSAM_THREADS_LIMIT;
	}

	/* Restrict the maximum number of jetsam threads to JETSAM_THREADS_LIMIT */
	if (max_jetsam_threads > JETSAM_THREADS_LIMIT) {
		max_jetsam_threads = JETSAM_THREADS_LIMIT;
	}

#if CONFIG_JETSAM
	/* For low CPU systems disable fast jetsam mechanism */
	if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
		max_jetsam_threads = 1;
	}
#endif /* CONFIG_JETSAM */

#if DEVELOPMENT || DEBUG
	if (PE_parse_boot_argn("-memorystatus-skip-fg-notify", &i, sizeof(i))) {
		memorystatus_should_issue_fg_band_notify = false;
	}
#endif /* DEVELOPMENT || DEBUG */

	/* Initialize the jetsam_threads state array */
	jetsam_threads = zalloc_permanent(sizeof(struct jetsam_state_s) *
	    max_jetsam_threads, ZALIGN(struct jetsam_state_s));

	/* Initialize all the jetsam threads */
	for (i = 0; i < max_jetsam_threads; i++) {
		jetsam_threads[i].inited = false;
		jetsam_threads[i].index = i;
		result = kernel_thread_start_priority(memorystatus_thread, NULL, 95 /* MAXPRI_KERNEL */, &jetsam_threads[i].thread);
		if (result != KERN_SUCCESS) {
			panic("Could not create memorystatus_thread %d", i);
		}
		/* Drop the creation reference; jetsam_threads[i].thread stays valid for lookup. */
		thread_deallocate(jetsam_threads[i].thread);
	}

#if VM_PRESSURE_EVENTS
	memorystatus_notify_init();
#endif /* VM_PRESSURE_EVENTS */

#if JETSAM_ZPRINT_SNAPSHOT
	/* Pre-size buffers for zone-print snapshots taken at jetsam time. */
	size_t jzs_names_size, jzs_info_size, jzs_meminfo_size;

	jzs_zone_cnt = zone_max_zones();
	jzs_names_size = jzs_zone_cnt * sizeof(mach_zone_name_t);
	jzs_names = zalloc_permanent(jzs_names_size, ZALIGN(mach_zone_name_t));

	jzs_info_size = jzs_zone_cnt * sizeof(mach_zone_info_t);
	jzs_info = zalloc_permanent(jzs_info_size, ZALIGN(mach_zone_info_t));

	jzs_coalesce = zalloc_permanent(jzs_zone_cnt * sizeof(int), ZALIGN(int));

	jzs_meminfo_cnt = vm_page_diagnose_estimate();
	jzs_meminfo_size = jzs_meminfo_cnt * sizeof(mach_memory_info_t);
	jzs_meminfo = kalloc_data_tag(jzs_meminfo_size, Z_WAITOK, VM_KERN_MEMORY_DIAG);
#endif /* JETSAM_ZPRINT_SNAPSHOT */
}
2043
2044 #if CONFIG_JETSAM
2045 bool
memorystatus_disable_swap(void)2046 memorystatus_disable_swap(void)
2047 {
2048 #if DEVELOPMENT || DEBUG
2049 int boot_arg_val = 0;
2050 if (PE_parse_boot_argn("kern.swap_all_apps", &boot_arg_val, sizeof(boot_arg_val))) {
2051 if (boot_arg_val) {
2052 /* Can't disable app swap if it was set via a boot-arg */
2053 return false;
2054 }
2055 }
2056 #endif /* DEVELOPMENT || DEBUG */
2057 memorystatus_swap_all_apps = false;
2058 #if CONFIG_FREEZE
2059 /* Go back to the smaller freezer thresholds */
2060 memorystatus_freeze_disable_swap();
2061 #endif /* CONFIG_FREEZE */
2062 initialize_entitled_max_task_limit();
2063 return true;
2064 }
2065 #endif /* CONFIG_JETSAM */
2066
2067 /*
2068 * The jetsam no frills kill call
2069 * Return: 0 on success
2070 * error code on failure (EINVAL...)
2071 */
2072 static int
jetsam_do_kill(proc_t p,int jetsam_flags,os_reason_t jetsam_reason)2073 jetsam_do_kill(proc_t p, int jetsam_flags, os_reason_t jetsam_reason)
2074 {
2075 int error = 0;
2076 error = exit_with_reason(p, W_EXITCODE(0, SIGKILL), (int *)NULL, FALSE, FALSE, jetsam_flags, jetsam_reason);
2077 return error;
2078 }
2079
2080 /*
2081 * Wrapper for processes exiting with memorystatus details
2082 */
2083 static boolean_t
memorystatus_do_kill(proc_t p,uint32_t cause,os_reason_t jetsam_reason,uint64_t * footprint_of_killed_proc)2084 memorystatus_do_kill(proc_t p, uint32_t cause, os_reason_t jetsam_reason, uint64_t *footprint_of_killed_proc)
2085 {
2086 int error = 0;
2087 __unused pid_t victim_pid = proc_getpid(p);
2088 uint64_t footprint = get_task_phys_footprint(proc_task(p));
2089 #if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD)
2090 int32_t memstat_effectivepriority = p->p_memstat_effectivepriority;
2091 #endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */
2092
2093 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_DO_KILL) | DBG_FUNC_START,
2094 victim_pid, cause, vm_page_free_count, footprint);
2095 DTRACE_MEMORYSTATUS4(memorystatus_do_kill, proc_t, p, os_reason_t, jetsam_reason, uint32_t, cause, uint64_t, footprint);
2096
2097 #if CONFIG_JETSAM
2098 if (*p->p_name && !strncmp(memorystatus_jetsam_proc_name_panic, p->p_name, sizeof(p->p_name))) { /* name */
2099 if ((!memorystatus_jetsam_proc_cause_panic || cause == memorystatus_jetsam_proc_cause_panic) && /* cause */
2100 (!memorystatus_jetsam_proc_size_panic || (footprint >> 20) >= memorystatus_jetsam_proc_size_panic)) { /* footprint */
2101 panic("memorystatus_do_kill(): requested panic on jetsam of %s (cause: %d and footprint: %llu mb)",
2102 memorystatus_jetsam_proc_name_panic, cause, footprint >> 20);
2103 }
2104 }
2105 #else /* CONFIG_JETSAM */
2106 #pragma unused(cause)
2107 #endif /* CONFIG_JETSAM */
2108
2109 if (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND) {
2110 memorystatus_log(
2111 "memorystatus: killing process %d [%s] in high band %s (%d) - memorystatus_available_pages: %llu\n",
2112 proc_getpid(p), (*p->p_name ? p->p_name : "unknown"),
2113 memorystatus_priority_band_name(p->p_memstat_effectivepriority), p->p_memstat_effectivepriority,
2114 (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
2115 }
2116
2117 /*
2118 * The jetsam_reason (os_reason_t) has enough information about the kill cause.
2119 * We don't really need jetsam_flags anymore, so it's okay that not all possible kill causes have been mapped.
2120 */
2121 int jetsam_flags = P_LTERM_JETSAM;
2122 switch (cause) {
2123 case kMemorystatusKilledHiwat: jetsam_flags |= P_JETSAM_HIWAT; break;
2124 case kMemorystatusKilledVnodes: jetsam_flags |= P_JETSAM_VNODE; break;
2125 case kMemorystatusKilledVMPageShortage: jetsam_flags |= P_JETSAM_VMPAGESHORTAGE; break;
2126 case kMemorystatusKilledVMCompressorThrashing:
2127 case kMemorystatusKilledVMCompressorSpaceShortage: jetsam_flags |= P_JETSAM_VMTHRASHING; break;
2128 case kMemorystatusKilledFCThrashing: jetsam_flags |= P_JETSAM_FCTHRASHING; break;
2129 case kMemorystatusKilledPerProcessLimit: jetsam_flags |= P_JETSAM_PID; break;
2130 case kMemorystatusKilledIdleExit: jetsam_flags |= P_JETSAM_IDLEEXIT; break;
2131 }
2132 /* jetsam_do_kill drops a reference. */
2133 os_reason_ref(jetsam_reason);
2134 error = jetsam_do_kill(p, jetsam_flags, jetsam_reason);
2135 *footprint_of_killed_proc = ((error == 0) ? footprint : 0);
2136
2137 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_DO_KILL) | DBG_FUNC_END,
2138 victim_pid, memstat_effectivepriority, vm_page_free_count, error);
2139
2140 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_COMPACTOR_RUN) | DBG_FUNC_START,
2141 victim_pid, cause, vm_page_free_count, *footprint_of_killed_proc);
2142
2143 if (jetsam_reason->osr_code == JETSAM_REASON_VNODE) {
2144 /*
2145 * vnode jetsams are syncronous and not caused by memory pressure.
2146 * Running the compactor on this thread adds significant latency to the filesystem operation
2147 * that triggered this jetsam.
2148 * Kick of compactor thread asyncronously instead.
2149 */
2150 vm_wake_compactor_swapper();
2151 } else {
2152 vm_run_compactor();
2153 }
2154
2155 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_COMPACTOR_RUN) | DBG_FUNC_END,
2156 victim_pid, cause, vm_page_free_count);
2157
2158 os_reason_free(jetsam_reason);
2159 return error == 0;
2160 }
2161
2162 /*
2163 * Pin a process to a particular jetsam band when it is in the background i.e. not doing active work.
2164 * For an application: that means no longer in the FG band
2165 * For a daemon: that means no longer in its 'requested' jetsam priority band
2166 */
2167
2168 int
memorystatus_update_inactive_jetsam_priority_band(pid_t pid,uint32_t op_flags,int jetsam_prio,boolean_t effective_now)2169 memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t op_flags, int jetsam_prio, boolean_t effective_now)
2170 {
2171 int error = 0;
2172 boolean_t enable = FALSE;
2173 proc_t p = NULL;
2174
2175 if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE) {
2176 enable = TRUE;
2177 } else if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE) {
2178 enable = FALSE;
2179 } else {
2180 return EINVAL;
2181 }
2182
2183 p = proc_find(pid);
2184 if (p != NULL) {
2185 if ((enable && _memstat_proc_is_elevated(p)) ||
2186 (!enable && !_memstat_proc_is_elevated(p))) {
2187 /*
2188 * No change in state.
2189 */
2190 } else {
2191 proc_list_lock();
2192
2193 if (enable) {
2194 p->p_memstat_state |= P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
2195
2196 if (effective_now) {
2197 if (p->p_memstat_effectivepriority < jetsam_prio) {
2198 memstat_update_priority_locked(p, jetsam_prio, MEMSTAT_PRIORITY_OPTIONS_NONE);
2199 }
2200 } else {
2201 if (_memstat_proc_is_aging(p)) {
2202 memstat_update_priority_locked(p, JETSAM_PRIORITY_IDLE, MEMSTAT_PRIORITY_NO_AGING);
2203 }
2204 }
2205 } else {
2206 p->p_memstat_state &= ~P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
2207
2208 if (effective_now) {
2209 if (p->p_memstat_effectivepriority == jetsam_prio) {
2210 memstat_update_priority_locked(p, JETSAM_PRIORITY_IDLE, MEMSTAT_PRIORITY_NO_AGING);
2211 }
2212 } else {
2213 if (_memstat_proc_is_aging(p)) {
2214 memstat_update_priority_locked(p, JETSAM_PRIORITY_IDLE, MEMSTAT_PRIORITY_NO_AGING);
2215 }
2216 }
2217 }
2218
2219 proc_list_unlock();
2220 }
2221 proc_rele(p);
2222 error = 0;
2223 } else {
2224 error = ESRCH;
2225 }
2226
2227 return error;
2228 }
2229
/*
 * Thread-call handler that demotes aging processes whose idle deadline has
 * passed down to JETSAM_PRIORITY_IDLE. Processes that still look busy (dirty
 * sysprocs, or tasks holding assertions) have their deadline pushed out
 * instead; busy sysprocs may be parked in the "stuck" aging band.
 */
static void
memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2)
{
	proc_t p;
	uint64_t current_time = 0, idle_delay_time = 0;
	int demote_prio_band = 0;
	memstat_bucket_t *demotion_bucket;

	memorystatus_log_debug("memorystatus_perform_idle_demotion()\n");

	/* Nothing to do if no aging bands are configured. */
	if (!system_procs_aging_band && !system_procs_aging_band_stuck && !applications_aging_band) {
		return;
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_START);

	current_time = mach_absolute_time();

	proc_list_lock();

	demote_prio_band = JETSAM_PRIORITY_IDLE + 1;

	for (; demote_prio_band < JETSAM_PRIORITY_MAX; demote_prio_band++) {
		/* Only the three aging bands participate in demotion. */
		if (demote_prio_band != system_procs_aging_band &&
		    demote_prio_band != system_procs_aging_band_stuck &&
		    demote_prio_band != applications_aging_band) {
			continue;
		}

		demotion_bucket = &memstat_bucket[demote_prio_band];
		p = TAILQ_FIRST(&demotion_bucket->list);

		while (p) {
			memorystatus_log_debug("memorystatus_perform_idle_demotion() found %s [%d]\n", proc_best_name(p), proc_getpid(p));

			assert(p->p_memstat_idledeadline);

			assert(_memstat_proc_is_aging(p));

			if (current_time >= p->p_memstat_idledeadline) {
				/* Capture the successor before this proc may be moved to another band. */
				proc_t next_proc = NULL;

				next_proc = TAILQ_NEXT(p, p_memstat_list);

				if ((isSysProc(p) && _memstat_proc_is_dirty(p)) || /* system proc marked dirty*/
				    task_has_assertions(proc_task(p))) {     /* has outstanding assertions which might indicate outstanding work too */
					idle_delay_time = (isSysProc(p)) ? memorystatus_sysprocs_idle_time(p) : memorystatus_apps_idle_time(p);

					if (isSysProc(p) && task_has_assertions(proc_task(p)) && demote_prio_band != system_procs_aging_band_stuck) {
						memorystatus_log_debug("memorystatus_perform_idle_demotion() found stuck process %d [%s], moving to JETSAM_PRIORITY_AGING_BAND1_STUCK\n",
						    proc_getpid(p), (*p->p_name ? p->p_name : "(unknown)"));
						memstat_update_priority_locked(p, JETSAM_PRIORITY_AGING_BAND1_STUCK, MEMSTAT_PRIORITY_NO_AGING);
					}

					/* Still busy: defer the demotion by one more idle interval. */
					p->p_memstat_idledeadline += idle_delay_time;
				} else {
					/* Truly idle: demote to the idle band. */
					memstat_update_priority_locked(p, JETSAM_PRIORITY_IDLE, MEMSTAT_PRIORITY_NO_AGING);
				}
				p = next_proc;
			} else {
				// No further candidates
				break;
			}
		}
	}

	/* Arm the thread call for the next earliest deadline (or cancel if none). */
	_memstat_reschedule_idle_demotion_locked();

	proc_list_unlock();

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_END);
}
2302
2303 /*
2304 * Schedule a process for idle demotion. Updates the process' idle deadline
2305 * and marks it as aging. The caller is responsible for rescheduling the idle
2306 * demotion thread
2307 */
2308 static void
_memstat_schedule_idle_demotion_locked(proc_t p)2309 _memstat_schedule_idle_demotion_locked(proc_t p)
2310 {
2311 uint64_t idle_delay_time = 0;
2312 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
2313 assert(system_procs_aging_band || applications_aging_band);
2314 assert(!_memstat_proc_is_aging(p));
2315
2316 memorystatus_log_debug(
2317 "%s: scheduling demotion to idle band for pid %d (dirty:0x%x).\n",
2318 __func__, proc_getpid(p), p->p_memstat_dirty);
2319
2320 idle_delay_time = isSysProc(p) ? memorystatus_sysprocs_idle_time(p) :
2321 memorystatus_apps_idle_time(p);
2322 p->p_memstat_dirty |= P_DIRTY_AGING_IN_PROGRESS;
2323 p->p_memstat_idledeadline = mach_absolute_time() + idle_delay_time;
2324 }
2325
2326 /*
2327 * Cancel a process' idle demotion. The caller must also reschedule the idle
2328 * demotion thread.
2329 */
2330 static void
_memstat_invalidate_idle_demotion_locked(proc_t p)2331 _memstat_invalidate_idle_demotion_locked(proc_t p)
2332 {
2333 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
2334 assert(system_procs_aging_band || applications_aging_band);
2335 assert(_memstat_proc_is_aging(p));
2336
2337 memorystatus_log_debug(
2338 "%s: invalidating demotion to idle band for %s [%d]\n",
2339 __func__, proc_best_name(p), proc_getpid(p));
2340
2341 p->p_memstat_idledeadline = 0;
2342 p->p_memstat_dirty &= ~P_DIRTY_AGING_IN_PROGRESS;
2343 }
2344
2345 /*
2346 * Return the earliest idle deadline of all aging procs. Returns 0 if there are
2347 * no aging procs.
2348 */
2349 static uint64_t
_memstat_find_earliest_idle_deadline(void)2350 _memstat_find_earliest_idle_deadline(void)
2351 {
2352 memstat_bucket_t *demotion_bucket;
2353 proc_t oldest_proc = PROC_NULL;
2354 uint32_t aging_app_count = 0, aging_sysproc_count = 0, aging_sysproc_count_stuck = 0;
2355 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
2356 assert(system_procs_aging_band || system_procs_aging_band_stuck || applications_aging_band);
2357
2358 if (system_procs_aging_band) {
2359 aging_sysproc_count = memstat_bucket[system_procs_aging_band].count;
2360 }
2361 if (system_procs_aging_band_stuck) {
2362 aging_sysproc_count_stuck = memstat_bucket[system_procs_aging_band_stuck].count;
2363 }
2364 if (applications_aging_band) {
2365 aging_app_count = memstat_bucket[applications_aging_band].count;
2366 }
2367
2368 if ((aging_app_count + aging_sysproc_count + aging_sysproc_count_stuck) == 0) {
2369 return 0;
2370 }
2371
2372 if (system_procs_aging_band && aging_sysproc_count > 0) {
2373 demotion_bucket = &memstat_bucket[system_procs_aging_band];
2374 oldest_proc = TAILQ_FIRST(&demotion_bucket->list);
2375 }
2376
2377 if (system_procs_aging_band_stuck && aging_sysproc_count_stuck > 0) {
2378 proc_t oldest_sysproc_stuck;
2379 demotion_bucket = &memstat_bucket[system_procs_aging_band_stuck];
2380 oldest_sysproc_stuck = TAILQ_FIRST(&demotion_bucket->list);
2381
2382 if (oldest_proc) {
2383 if (oldest_sysproc_stuck->p_memstat_idledeadline <
2384 oldest_proc->p_memstat_idledeadline) {
2385 oldest_proc = oldest_sysproc_stuck;
2386 }
2387 } else {
2388 oldest_proc = oldest_sysproc_stuck;
2389 }
2390 }
2391
2392 if (applications_aging_band && aging_app_count > 0) {
2393 proc_t oldest_app;
2394 demotion_bucket = &memstat_bucket[applications_aging_band];
2395 oldest_app = TAILQ_FIRST(&demotion_bucket->list);
2396
2397 if (!oldest_proc ||
2398 (oldest_app->p_memstat_idledeadline <
2399 oldest_proc->p_memstat_idledeadline)) {
2400 oldest_proc = oldest_app;
2401 }
2402 }
2403
2404 assert(oldest_proc);
2405 assert(oldest_proc->p_memstat_idledeadline);
2406 assert(_memstat_proc_is_aging(oldest_proc));
2407
2408 return oldest_proc->p_memstat_idledeadline;
2409 }
2410
2411 /*
2412 * Reschedule or cancel a pending wakeup of the idle_demotion thread. If called
2413 * in response to a process transitioning in/out of the aging band, then
2414 * rescheduling must occur *after* the new priority is updated.
2415 */
2416 static void
_memstat_reschedule_idle_demotion_locked(void)2417 _memstat_reschedule_idle_demotion_locked(void)
2418 {
2419 uint64_t idle_deadline;
2420 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
2421
2422 if (!system_procs_aging_band && !applications_aging_band) {
2423 return;
2424 }
2425 idle_deadline = _memstat_find_earliest_idle_deadline();
2426 if (idle_deadline == 0) {
2427 /* No aging processes, cancel call to demotion thread */
2428 thread_call_cancel(memorystatus_idle_demotion_call);
2429 } else if (memstat_idle_demotion_deadline != idle_deadline) {
2430 thread_call_enter_delayed(memorystatus_idle_demotion_call, idle_deadline);
2431 }
2432 memstat_idle_demotion_deadline = idle_deadline;
2433 }
2434
2435 /*
2436 * List manipulation
2437 */
2438
/*
 * Insert process p into the jetsam bucket matching its effective priority.
 * 'locked' indicates whether the caller already holds the proc list lock.
 * Processes landing in an aging band get an idle-demotion deadline; processes
 * landing in the idle band get an idle start timestamp. Always returns 0.
 */
int
memorystatus_add(proc_t p, boolean_t locked)
{
	memstat_bucket_t *bucket;
	bool reschedule_demotion = false;

	/* NOTE(review): log message uses the legacy name "memorystatus_list_add" */
	memorystatus_log_debug("memorystatus_list_add(): adding pid %d with priority %d.\n",
	    proc_getpid(p), p->p_memstat_effectivepriority);

	if (!locked) {
		proc_list_lock();
	}

	DTRACE_MEMORYSTATUS2(memorystatus_add, proc_t, p, int32_t, p->p_memstat_effectivepriority);

	/* Processes marked internal do not have priority tracked */
	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
		goto exit;
	}

	/*
	 * Opt out system processes from being frozen by default.
	 * For coalition-based freezing, we only want to freeze sysprocs that have specifically opted in.
	 */
	if (isSysProc(p)) {
		p->p_memstat_state |= P_MEMSTAT_FREEZE_DISABLED;
	}
#if CONFIG_FREEZE
	memorystatus_freeze_init_proc(p);
#endif

	bucket = &memstat_bucket[p->p_memstat_effectivepriority];

	if ((system_procs_aging_band &&
	    p->p_memstat_effectivepriority == system_procs_aging_band) ||
	    (applications_aging_band &&
	    p->p_memstat_effectivepriority == applications_aging_band)) {
		/* Entering an aging band: arm an idle-demotion deadline for p. */
		_memstat_schedule_idle_demotion_locked(p);
		reschedule_demotion = true;
	} else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
		/*
		 * Entering the idle band.
		 * Record idle start time.
		 */
		p->p_memstat_idle_start = mach_absolute_time();
	}

	TAILQ_INSERT_TAIL(&bucket->list, p, p_memstat_list);
	bucket->count++;
	if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
		bucket->relaunch_high_count++;
	}

	memorystatus_list_count++;

	/* Reschedule only after the bucket insertion above is visible. */
	if (reschedule_demotion) {
		_memstat_reschedule_idle_demotion_locked();
	}

	task_t t = proc_task(p);
	if (t && task_is_app_suspended(t)) {
		_memstat_proc_set_suspended(p);
	}

	_memstat_consider_waking_jetsam_thread();

exit:
	if (!locked) {
		proc_list_unlock();
	}

	return 0;
}
2512
2513 /*
2514 * Record timestamps if process p is transitioning in/out of the IDLE band.
2515 */
2516 static void
_memstat_record_idle_transition(proc_t p,int new_priority)2517 _memstat_record_idle_transition(proc_t p, int new_priority)
2518 {
2519 if (p->p_memstat_effectivepriority == new_priority) {
2520 /* no change in priority */
2521 return;
2522 }
2523 if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
2524 uint64_t now;
2525 /*
2526 * Transitioning out of the idle priority bucket.
2527 * Record idle delta.
2528 */
2529 assert(p->p_memstat_idle_start != 0);
2530 now = mach_absolute_time();
2531 assert3u(now, >, p->p_memstat_idle_start);
2532 p->p_memstat_idle_delta = now - p->p_memstat_idle_start;
2533
2534 /*
2535 * About to become active and so memory footprint could change.
2536 * So mark it eligible for freeze-considerations next time around.
2537 */
2538 p->p_memstat_state &= ~P_MEMSTAT_FREEZE_IGNORE;
2539
2540 _memstat_consider_waking_jetsam_thread();
2541 } else if (new_priority == JETSAM_PRIORITY_IDLE) {
2542 /*
2543 * Transitioning into the idle priority bucket.
2544 * Record idle start.
2545 */
2546 p->p_memstat_idle_start = mach_absolute_time();
2547 }
2548 }
2549
2550 /*
2551 * Description:
2552 * Moves a process from one jetsam bucket to another.
2553 * which changes the LRU position of the process.
2554 *
2555 * Monitors transition between buckets and if necessary
2556 * will update cached memory limits accordingly.
2557 *
2558 */
/*
 * p        - target process (caller holds proc_list_mlock; p must not have exited).
 * priority - requested destination band; may be overridden below by
 *            elevation, aging, or assertion policy.
 * options  - MEMSTAT_PRIORITY_NO_AGING skips the elevation/aging override;
 *            MEMSTAT_PRIORITY_INSERT_HEAD inserts at the head of the new bucket.
 */
void
memstat_update_priority_locked(proc_t p,
    int priority,
    memstat_priority_options_t options)
{
	memstat_bucket_t *old_bucket, *new_bucket;
	bool reschedule_demotion = false;

	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);

	assert(priority < MEMSTAT_BUCKET_COUNT);

	/* Ensure that exit isn't underway, leaving the proc retained but removed from its bucket */
	if (proc_list_exited(p)) {
		return;
	}

	memorystatus_log_debug("memorystatus: setting %s(%d) to priority %d, inserting at %s\n",
	    (*p->p_name ? p->p_name : "unknown"), proc_getpid(p), priority,
	    (options & MEMSTAT_PRIORITY_INSERT_HEAD) ? "head" : "tail");

	DTRACE_MEMORYSTATUS3(memorystatus_update_priority, proc_t, p, int32_t, p->p_memstat_effectivepriority, int, priority);

	old_bucket = &memstat_bucket[p->p_memstat_effectivepriority];

	/* Apply elevation/aging overrides unless the caller opted out */
	if (!(options & MEMSTAT_PRIORITY_NO_AGING)) {
		if (_memstat_proc_is_elevated(p)) {
			/*
			 * 2 types of processes can use the non-standard elevated inactive band:
			 * - Frozen processes that always land in memorystatus_freeze_jetsam_band
			 * OR
			 * - processes that specifically opt-in to the elevated inactive support e.g. docked processes.
			 */
			if (_memstat_proc_is_frozen(p) &&
			    priority <= memorystatus_freeze_jetsam_band) {
				priority = memorystatus_freeze_jetsam_band;
			} else if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE) {
				priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;
			}
		}
		/* Dirty-tracked sysprocs and managed apps stop over in their aging band on the way down */
		if (_memstat_proc_is_tracked(p)) {
			if (system_procs_aging_band && priority <= system_procs_aging_band) {
				if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
					/* process has already aged */
					priority = JETSAM_PRIORITY_IDLE;
				} else {
					priority = system_procs_aging_band;
				}
			} else if (system_procs_aging_band_stuck && priority <= system_procs_aging_band_stuck) {
				if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
					/* process has already aged */
					priority = JETSAM_PRIORITY_IDLE;
				} else {
					/* don't let anyone move anything between sysproc and sysproc stuck inclusive */
					priority = system_procs_aging_band;
				}
			}
		} else if (_memstat_proc_is_managed(p)) {
			if (applications_aging_band && priority <= applications_aging_band) {
				if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
					/* process has already aged */
					priority = JETSAM_PRIORITY_IDLE;
				} else {
					priority = applications_aging_band;
				}
			}
		}
	}

	/* Unlink from the old bucket, maintaining its counters */
	TAILQ_REMOVE(&old_bucket->list, p, p_memstat_list);
	old_bucket->count--;
	if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
		old_bucket->relaunch_high_count--;
	}

	/* Link into the (possibly adjusted) new bucket */
	new_bucket = &memstat_bucket[priority];
	if (options & MEMSTAT_PRIORITY_INSERT_HEAD) {
		TAILQ_INSERT_HEAD(&new_bucket->list, p, p_memstat_list);
	} else {
		TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list);
	}
	new_bucket->count++;
	if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
		new_bucket->relaunch_high_count++;
	}

	if (p->p_memstat_effectivepriority != priority) {
		/*
		 * This process is transitioning between
		 * jetsam priority buckets.
		 */
		_memstat_record_idle_transition(p, priority);

		if ((system_procs_aging_band &&
		    p->p_memstat_effectivepriority == system_procs_aging_band) ||
		    (system_procs_aging_band_stuck &&
		    p->p_memstat_effectivepriority == system_procs_aging_band_stuck) ||
		    (applications_aging_band &&
		    p->p_memstat_effectivepriority == applications_aging_band)) {
			/* removing this process from an aging band */
			_memstat_invalidate_idle_demotion_locked(p);
			reschedule_demotion = true;
		}

		if ((system_procs_aging_band &&
		    priority == system_procs_aging_band) ||
		    (system_procs_aging_band_stuck &&
		    priority == system_procs_aging_band_stuck) ||
		    (applications_aging_band &&
		    priority == applications_aging_band)) {
			/* placing this process into an aging band */
			_memstat_schedule_idle_demotion_locked(p);
			reschedule_demotion = true;
		}

		/* Re-arm/cancel the demotion timer after the deadlines changed above */
		if (reschedule_demotion) {
			_memstat_reschedule_idle_demotion_locked();
		}

		KDBG(MEMSTAT_CODE(BSD_MEMSTAT_CHANGE_PRIORITY) | DBG_FUNC_NONE,
		    proc_getpid(p), priority, p->p_memstat_effectivepriority);
		p->p_memstat_effectivepriority = priority;
	}

	/* Activity state may have changed; refresh the cached/ledger memlimit */
	if (memorystatus_highwater_enabled) {
		const bool use_active = _memstat_proc_is_active_locked(p);
		if (_memstat_update_memlimit_locked(p, use_active)) {
			_memstat_write_memlimit_to_ledger_locked(p, use_active, false);
		}
	}

#if CONFIG_SECLUDED_MEMORY
	if (secluded_for_apps &&
	    task_could_use_secluded_mem(proc_task(p))) {
		task_set_can_use_secluded_mem(
			proc_task(p),
			(priority >= JETSAM_PRIORITY_FOREGROUND));
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	_memstat_consider_waking_jetsam_thread();
}
2701
/*
 * Record the relaunch-likelihood hint for a process.
 *
 * NOTE(review): this writes p_memstat_relaunch_flags without adjusting any
 * bucket's relaunch_high_count; presumably it runs before memorystatus_add()
 * places the proc in a bucket -- confirm with callers.
 *
 * Returns 0 unconditionally.
 */
int
memorystatus_relaunch_flags_update(proc_t p, int relaunch_flags)
{
	p->p_memstat_relaunch_flags = relaunch_flags;
	KDBG(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_RELAUNCH_FLAGS), proc_getpid(p), relaunch_flags);
	return 0;
}
2709
#if DEVELOPMENT || DEBUG
/*
 * sysctl kern.memorystatus_relaunch_flags (read-only, current process):
 * translate the kernel-internal P_MEMSTAT_RELAUNCH_* value into the
 * corresponding POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_* constant and copy
 * it out. Unmatched values (e.g. 0 / unset) are copied out unchanged.
 */
static int sysctl_memorystatus_relaunch_flags SYSCTL_HANDLER_ARGS {
#pragma unused(oidp, arg1, arg2)
	proc_t p;
	int relaunch_flags = 0;

	p = current_proc();
	relaunch_flags = p->p_memstat_relaunch_flags;
	/* Map kernel flag values to the user-visible spawn-attribute values */
	switch (relaunch_flags) {
	case P_MEMSTAT_RELAUNCH_LOW:
		relaunch_flags = POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_LOW;
		break;
	case P_MEMSTAT_RELAUNCH_MED:
		relaunch_flags = POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_MED;
		break;
	case P_MEMSTAT_RELAUNCH_HIGH:
		relaunch_flags = POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_HIGH;
		break;
	}

	return SYSCTL_OUT(req, &relaunch_flags, sizeof(relaunch_flags));
}
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_relaunch_flags, CTLTYPE_INT | CTLFLAG_RD |
    CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, sysctl_memorystatus_relaunch_flags, "I", "get relaunch flags for current process");
#endif /* DEVELOPMENT || DEBUG */
2735
2736 /*
2737 * Everything between the idle band and the application agining band
2738 * are reserved for internal use. We allow some entitled user space programs
2739 * to use this range for experimentation.
2740 */
2741 static bool
current_task_can_use_entitled_range()2742 current_task_can_use_entitled_range()
2743 {
2744 static const char kInternalJetsamRangeEntitlement[] = "com.apple.private.internal-jetsam-range";
2745 task_t task = current_task();
2746 if (task == kernel_task) {
2747 return true;
2748 }
2749 return IOTaskHasEntitlement(task, kInternalJetsamRangeEntitlement);
2750 }
2751
2752 /*
2753 * Set a process' requested priority band. This is the entry point used during
2754 * spawn and by memorystatus_control.
2755 */
/*
 * p         - target process
 * priority  - requested band; -1 means JETSAM_PRIORITY_DEFAULT, and
 *             JETSAM_PRIORITY_IDLE_HEAD means idle band, head insertion.
 * user_data - opaque value stored in p_memstat_userdata.
 * options   - MEMSTAT_PRIORITY_IS_EFFECTIVE / IS_ASSERTION / etc.
 *
 * Returns 0, or EINVAL (bad priority), EALREADY (effective change repeated),
 * or EBUSY (proc terminating/skipped/exited).
 */
int
memorystatus_set_priority(proc_t p, int priority, uint64_t user_data,
    memstat_priority_options_t options)
{
	int ret;

	memorystatus_log_debug("memorystatus: changing (%s) pid %d: priority %d, user_data 0x%llx\n",
	    (*p->p_name ? p->p_name : "unknown"), proc_getpid(p), priority, user_data);

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_UPDATE) | DBG_FUNC_START, proc_getpid(p), priority, user_data, options);

	/* Normalize the requested priority before taking the list lock */
	if (priority == -1) {
		/* Use as shorthand for default priority */
		priority = JETSAM_PRIORITY_DEFAULT;
	} else if (priority > JETSAM_PRIORITY_IDLE && priority <= JETSAM_PRIORITY_AGING_BAND2) {
		/*
		 * Everything between idle and the aging bands are reserved for internal use.
		 * if requested, adjust to JETSAM_PRIORITY_IDLE.
		 * Entitled processes (just munch) can use a subset of this range for testing.
		 */
		if (priority > JETSAM_PRIORITY_ENTITLED_MAX ||
		    !current_task_can_use_entitled_range()) {
			priority = JETSAM_PRIORITY_IDLE;
			options |= MEMSTAT_PRIORITY_NO_AGING;
		}
	} else if (priority == JETSAM_PRIORITY_IDLE_HEAD) {
		/* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle queue */
		priority = JETSAM_PRIORITY_IDLE;
		options |= MEMSTAT_PRIORITY_INSERT_HEAD;
	} else if ((priority < 0) || (priority >= MEMSTAT_BUCKET_COUNT)) {
		/* Sanity check */
		ret = EINVAL;
		goto out;
	}

	proc_list_lock();

	assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));

	/* An "effective" update may only be applied once per process */
	if ((options & MEMSTAT_PRIORITY_IS_EFFECTIVE) &&
	    (p->p_memstat_state & P_MEMSTAT_PRIORITYUPDATED)) {
		ret = EALREADY;
		proc_list_unlock();
		memorystatus_log_error("memorystatus_update: effective change specified for pid %d, but change already occurred.\n",
		    proc_getpid(p));
		goto out;
	}

	if ((p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP)) || proc_list_exited(p)) {
		/*
		 * This could happen when a process calling posix_spawn() is exiting on the jetsam thread.
		 */
		ret = EBUSY;
		proc_list_unlock();
		goto out;
	}

	p->p_memstat_state |= P_MEMSTAT_PRIORITYUPDATED;
	p->p_memstat_userdata = user_data;

	if ((options & MEMSTAT_PRIORITY_IS_ASSERTION)) {
		if (priority != JETSAM_PRIORITY_IDLE) {
			/*
			 * Process is now being managed by assertions,
			 */
			p->p_memstat_state |= P_MEMSTAT_PRIORITY_ASSERTION;
			p->p_memstat_assertionpriority = priority;
		} else if (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION) {
			/*
			 * Assertions relinquish control when the process is heading to IDLE.
			 */
			p->p_memstat_state &= ~P_MEMSTAT_PRIORITY_ASSERTION;
		}

		/* A dirty (or non-idle-exit) tracked proc stays at the higher of
		 * its assertion and requested priorities. */
		if (_memstat_proc_is_tracked(p) &&
		    (_memstat_proc_is_dirty(p) || !_memstat_proc_can_idle_exit(p))) {
			priority = MAX(p->p_memstat_assertionpriority,
			    p->p_memstat_requestedpriority);
		}
	} else {
		p->p_memstat_requestedpriority = priority;
	}

	memstat_update_priority_locked(p, priority, options);

	proc_list_unlock();
	ret = 0;

out:
	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_UPDATE) | DBG_FUNC_END, ret);

	return ret;
}
2849
/*
 * Install active/inactive memory limits for p and, when highwater limits
 * are enabled, push the applicable one to the task ledger.
 * Caller holds proc_list_mlock. Returns 0 or a ledger-write error.
 */
static int
memstat_set_memlimits_locked(proc_t p, int32_t active_limit,
    int32_t inactive_limit, memlimit_options_t options)
{
	/*
	 * Posix_spawn'd processes and managed processes come through this path to
	 * instantiate ledger limits. Forked processes do not come through this
	 * path and will always receive the default task limit.
	 */

	int err = 0;
	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);

	int32_t default_active_limit = memorystatus_get_default_task_active_limit(p);
	/*
	 * The special value of -1 specifies that this proc wants the default
	 * memory limit. (NB: the check below actually treats any value <= 0
	 * as a request for the default.)
	 */
	if (active_limit <= 0) {
		active_limit = default_active_limit;
	}
	/*
	 * Work around a bug in JetsamProperties whereby processes may mistakenly receive
	 * ActiveSoftMemoryLimit := -1 by forcing the default task limit to be fatal.
	 */
	if (default_active_limit && active_limit == default_active_limit) {
		options |= MEMLIMIT_ACTIVE_FATAL;
	}

	/* Same defaulting and forced-fatal treatment for the inactive limit */
	int32_t default_inactive_limit = memorystatus_get_default_task_inactive_limit(p);
	if (inactive_limit <= 0) {
		inactive_limit = default_inactive_limit;
	}
	if (default_inactive_limit && inactive_limit == default_inactive_limit) {
		options |= MEMLIMIT_INACTIVE_FATAL;
	}
#if DEVELOPMENT || DEBUG
	if (p->p_memlimit_increase) {
		/* Apply memlimit increase (for testing with overlay roots) */
		int32_t memlimit_increase = roundToNearestMB(p->p_memlimit_increase);
		active_limit = active_limit + memlimit_increase;
		inactive_limit = inactive_limit + memlimit_increase;
	}
#endif /* DEVELOPMENT || DEBUG */

	memorystatus_log_debug(
		"memorystatus: setting memlimit for %s [%d], "
		"Active(%dMB %s), Inactive(%dMB, %s)\n",
		proc_best_name(p), proc_getpid(p),
		active_limit, ((options & MEMLIMIT_ACTIVE_FATAL) ? "F" : "NF"),
		inactive_limit, ((options & MEMLIMIT_INACTIVE_FATAL) ? "F" : "NF"));

	/* Persist the resolved limits and fatality bits in the proc */
	p->p_memstat_memlimit_active = active_limit;
	p->p_memstat_memlimit_inactive = inactive_limit;
	if (options & MEMLIMIT_INACTIVE_FATAL) {
		p->p_memstat_state |= P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL;
	} else {
		p->p_memstat_state &= ~P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL;
	}
	if (options & MEMLIMIT_ACTIVE_FATAL) {
		p->p_memstat_state |= P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL;
	} else {
		p->p_memstat_state &= ~P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL;
	}

	/*
	 * Initialize the cached limits for target process.
	 * When the target process is dirty tracked, it's typically
	 * in a clean state. Non dirty tracked processes are
	 * typically active (Foreground or above).
	 * But just in case, we don't make assumptions...
	 */
	const bool use_active = _memstat_proc_is_active_locked(p);
	if (memorystatus_highwater_enabled &&
	    _memstat_update_memlimit_locked(p, use_active)) {
		err = _memstat_write_memlimit_to_ledger_locked(p, use_active, false);
	}

	return err;
}
2930
2931 int
memorystatus_set_memlimits(proc_t p,int32_t active_limit,int32_t inactive_limit,memlimit_options_t options)2932 memorystatus_set_memlimits(proc_t p, int32_t active_limit,
2933 int32_t inactive_limit, memlimit_options_t options)
2934 {
2935 int err;
2936 proc_list_lock();
2937 err = memstat_set_memlimits_locked(p, active_limit, inactive_limit,
2938 options);
2939 proc_list_unlock();
2940 return err;
2941 }
2942
2943 int
memorystatus_remove(proc_t p)2944 memorystatus_remove(proc_t p)
2945 {
2946 int ret;
2947 memstat_bucket_t *bucket;
2948 bool reschedule = false;
2949
2950 memorystatus_log_debug("memorystatus_list_remove: removing pid %d\n", proc_getpid(p));
2951
2952 /* Processes marked internal do not have priority tracked */
2953 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
2954 return 0;
2955 }
2956
2957 /*
2958 * Check if this proc is locked (because we're performing a freeze).
2959 * If so, we fail and instruct the caller to try again later.
2960 */
2961 if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
2962 return EAGAIN;
2963 }
2964
2965 assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));
2966
2967 bucket = &memstat_bucket[p->p_memstat_effectivepriority];
2968
2969 if ((system_procs_aging_band &&
2970 p->p_memstat_effectivepriority == system_procs_aging_band) ||
2971 (system_procs_aging_band_stuck &&
2972 p->p_memstat_effectivepriority == system_procs_aging_band_stuck) ||
2973 (applications_aging_band &&
2974 p->p_memstat_effectivepriority == applications_aging_band)) {
2975 _memstat_invalidate_idle_demotion_locked(p);
2976 reschedule = true;
2977 }
2978
2979 /*
2980 * Record idle delta
2981 */
2982
2983 if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
2984 uint64_t now = mach_absolute_time();
2985 if (now > p->p_memstat_idle_start) {
2986 p->p_memstat_idle_delta = now - p->p_memstat_idle_start;
2987 }
2988 }
2989
2990 TAILQ_REMOVE(&bucket->list, p, p_memstat_list);
2991 bucket->count--;
2992 if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
2993 bucket->relaunch_high_count--;
2994 }
2995
2996 memorystatus_list_count--;
2997
2998 /* If awaiting demotion to the idle band, clean up */
2999 if (reschedule) {
3000 _memstat_reschedule_idle_demotion_locked();
3001 }
3002
3003 #if CONFIG_FREEZE
3004 if (_memstat_proc_is_frozen(p)) {
3005 if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) {
3006 p->p_memstat_state &= ~P_MEMSTAT_REFREEZE_ELIGIBLE;
3007 assert(memorystatus_refreeze_eligible_count > 0);
3008 memorystatus_refreeze_eligible_count--;
3009 }
3010
3011 assert(memorystatus_frozen_count > 0);
3012 memorystatus_frozen_count--;
3013 if (p->p_memstat_state & P_MEMSTAT_FROZEN_XPC_SERVICE) {
3014 assert(memorystatus_frozen_count_xpc_service > 0);
3015 memorystatus_frozen_count_xpc_service--;
3016 }
3017 if (strcmp(p->p_name, "com.apple.WebKit.WebContent") == 0) {
3018 assert(memorystatus_frozen_count_webcontent > 0);
3019 memorystatus_frozen_count_webcontent--;
3020 }
3021 memorystatus_frozen_shared_mb -= p->p_memstat_freeze_sharedanon_pages;
3022 p->p_memstat_freeze_sharedanon_pages = 0;
3023 }
3024 #endif /* CONFIG_FREEZE */
3025
3026 _memstat_proc_set_resumed(p);
3027
3028 #if DEVELOPMENT || DEBUG
3029 if (proc_getpid(p) == memorystatus_testing_pid) {
3030 memorystatus_testing_pid = 0;
3031 }
3032 #endif /* DEVELOPMENT || DEBUG */
3033
3034 if (p) {
3035 ret = 0;
3036 } else {
3037 ret = ESRCH;
3038 }
3039
3040 return ret;
3041 }
3042
3043 /*
3044 * Validate dirty tracking flags with process state.
3045 *
3046 * Return:
3047 * 0 on success
3048 * non-0 on failure
3049 *
3050 * The proc_list_lock is held by the caller.
3051 */
3052
3053 static int
memorystatus_validate_track_flags(struct proc * target_p,uint32_t pcontrol)3054 memorystatus_validate_track_flags(struct proc *target_p, uint32_t pcontrol)
3055 {
3056 /* See that the process isn't marked for termination */
3057 if (target_p->p_memstat_dirty & P_DIRTY_TERMINATED) {
3058 return EBUSY;
3059 }
3060
3061 /* Idle exit requires that process be tracked */
3062 if ((pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) &&
3063 !(pcontrol & PROC_DIRTY_TRACK)) {
3064 return EINVAL;
3065 }
3066
3067 /* 'Launch in progress' tracking requires that process have enabled dirty tracking too. */
3068 if ((pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) &&
3069 !(pcontrol & PROC_DIRTY_TRACK)) {
3070 return EINVAL;
3071 }
3072
3073 /* Only one type of DEFER behavior is allowed.*/
3074 if ((pcontrol & PROC_DIRTY_DEFER) &&
3075 (pcontrol & PROC_DIRTY_DEFER_ALWAYS)) {
3076 return EINVAL;
3077 }
3078
3079 /* Deferral is only relevant if idle exit is specified */
3080 if (((pcontrol & PROC_DIRTY_DEFER) ||
3081 (pcontrol & PROC_DIRTY_DEFER_ALWAYS)) &&
3082 !(pcontrol & PROC_DIRTY_ALLOWS_IDLE_EXIT)) {
3083 return EINVAL;
3084 }
3085
3086 return 0;
3087 }
3088
3089 /*
3090 * Processes can opt to have their state tracked by the kernel, indicating when they are busy (dirty) or idle
3091 * (clean). They may also indicate that they support termination when idle, with the result that they are promoted
3092 * to their desired, higher, jetsam priority when dirty (and are therefore killed later), and demoted to the low
 * priority idle band when clean (and killed earlier, protecting higher priority processes).
3094 *
3095 * If the deferral flag is set, then newly tracked processes will be protected for an initial period (as determined by
3096 * memorystatus_sysprocs_idle_delay_time); if they go clean during this time, then they will be moved to a deferred-idle band
3097 * with a slightly higher priority, guarding against immediate termination under memory pressure and being unable to
3098 * make forward progress. Finally, when the guard expires, they will be moved to the standard, lowest-priority, idle
3099 * band. The deferral can be cleared early by clearing the appropriate flag.
3100 *
3101 * The deferral timer is active only for the duration that the process is marked as guarded and clean; if the process
3102 * is marked dirty, the timer will be cancelled. Upon being subsequently marked clean, the deferment will either be
3103 * re-enabled or the guard state cleared, depending on whether the guard deadline has passed.
3104 */
3105
/*
 * Enroll a process in dirty tracking and/or adjust its tracking flags
 * (PROC_DIRTY_TRACK / ALLOW_IDLE_EXIT / LAUNCH_IN_PROGRESS / DEFER*);
 * see the block comment above for the deferral model.
 *
 * Returns 0 on success; EBUSY if the proc is exiting, EPERM for internal
 * procs, or EINVAL for inconsistent flag combinations. Also returns 0
 * (deliberately, to avoid an XPC assertion) when denying opt-in for a
 * managed, non-fatal-limit process.
 */
int
memorystatus_dirty_track(proc_t p, uint32_t pcontrol)
{
	unsigned int old_dirty;
	boolean_t defer_now = FALSE;
	int ret = 0;
	int priority;
	memstat_priority_options_t priority_options =
	    MEMSTAT_PRIORITY_OPTIONS_NONE;

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_DIRTY_TRACK),
	    proc_getpid(p), p->p_memstat_dirty, pcontrol);

	proc_list_lock();

	if (proc_list_exited(p)) {
		/*
		 * Process is on its way out.
		 */
		ret = EBUSY;
		goto exit;
	}

	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
		ret = EPERM;
		goto exit;
	}

	if ((ret = memorystatus_validate_track_flags(p, pcontrol)) != 0) {
		/* error */
		goto exit;
	}

	old_dirty = p->p_memstat_dirty;

	/* These bits are cumulative, as per <rdar://problem/11159924> */
	if (pcontrol & PROC_DIRTY_TRACK) {
		/* Request to turn ON Dirty tracking... */
		if (p->p_memstat_state & P_MEMSTAT_MANAGED) {
			/* on a process managed by RunningBoard or its equivalent...*/
			if (!_memstat_proc_cached_memlimit_is_fatal(p)) {
				/* but this might be an app because there's no fatal limits
				 * NB: This _big_ assumption is not universal. What we really
				 * need is a way to say this is an _APP_ and we can't have dirty
				 * tracking turned ON for it. Lacking that functionality we clump
				 * together some checks and try to do the best detection we can.
				 * Reason we can't allow addition of these flags is because, per the
				 * kernel checks, they change the role of a process from app to daemon. And the
				 * AGING_IN_PROGRESS bits might still be set i.e. it needs to be demoted
				 * correctly from the right aging band (app or sysproc). We can't simply try
				 * to invalidate the demotion here because, owing to assertion priorities, we
				 * might not be in the aging bands.
				 */
				memorystatus_log(
					"memorystatus: Denying dirty-tracking opt-in for managed %s [%d]\n",
					proc_best_name(p), proc_getpid(p));
				/* fail silently to avoid an XPC assertion... */
				ret = 0;
				goto exit;
			}
		}

		p->p_memstat_dirty |= P_DIRTY_TRACK;
	}

	if (pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) {
		p->p_memstat_dirty |= P_DIRTY_ALLOW_IDLE_EXIT;
	}

	if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
		p->p_memstat_dirty |= P_DIRTY_LAUNCH_IN_PROGRESS;
	}

	/*
	 * NB: All processes are now automatically enrolled in idle aging
	 * regardless of whether they request to be deferred.
	 */
	if (pcontrol & (PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) {
		/* Each DEFER flavor can only be turned on once */
		if ((pcontrol & (PROC_DIRTY_DEFER)) &&
		    !(old_dirty & P_DIRTY_DEFER)) {
			p->p_memstat_dirty |= P_DIRTY_DEFER;
		}

		if ((pcontrol & (PROC_DIRTY_DEFER_ALWAYS)) &&
		    !(old_dirty & P_DIRTY_DEFER_ALWAYS)) {
			p->p_memstat_dirty |= P_DIRTY_DEFER_ALWAYS;
		}

		defer_now = TRUE;
	}

	memorystatus_log_info(
		"%s [%d] enrolled in ActivityTracking tracked %d / idle-exit %d / defer %d / dirty %d",
		proc_best_name(p), proc_getpid(p),
		_memstat_proc_is_tracked(p), _memstat_proc_can_idle_exit(p), defer_now,
		_memstat_proc_is_dirty(p));

	/* A clean, idle-exit-capable tracked proc heads toward the idle band */
	if (!_memstat_proc_is_dirty(p) && _memstat_proc_is_tracked(p) &&
	    _memstat_proc_can_idle_exit(p)) {
		priority = JETSAM_PRIORITY_IDLE;
		if (!defer_now && _memstat_proc_is_aging(p)) {
			/*
			 * Historically, some processes have tried to use this to opt out
			 * of the 'aging' facility.
			 */
			priority_options |= MEMSTAT_PRIORITY_NO_AGING;
		}
	} else {
		priority = p->p_memstat_requestedpriority;
	}

	/* Assertion-held priority always wins over the computed target */
	if (_memstat_proc_has_priority_assertion(p)) {
		priority = MAX(priority, p->p_memstat_assertionpriority);
	}

	memstat_update_priority_locked(p, priority, priority_options);

exit:
	proc_list_unlock();

	return ret;
}
3228
3229 int
memorystatus_dirty_set(proc_t p,boolean_t self,uint32_t pcontrol)3230 memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol)
3231 {
3232 int ret = 0;
3233 bool kill = false;
3234 bool was_dirty;
3235 bool now_dirty = false;
3236 int priority;
3237 task_t t = proc_task(p);
3238
3239 memorystatus_log_debug("memorystatus_dirty_set(): %d %d 0x%x 0x%x\n", self, proc_getpid(p), pcontrol, p->p_memstat_dirty);
3240 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_DIRTY_SET), proc_getpid(p), self, pcontrol);
3241
3242 proc_list_lock();
3243
3244 if (proc_list_exited(p)) {
3245 /*
3246 * Process is on its way out.
3247 */
3248 ret = EBUSY;
3249 goto exit;
3250 }
3251
3252 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
3253 ret = EPERM;
3254 goto exit;
3255 }
3256
3257 was_dirty = _memstat_proc_is_dirty(p);
3258
3259 if (!_memstat_proc_is_tracked(p)) {
3260 /* Dirty tracking not enabled */
3261 ret = EINVAL;
3262 goto exit;
3263 } else if (pcontrol && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
3264 /*
3265 * Process is set to be terminated and we're attempting to mark it dirty.
3266 * Set for termination and marking as clean is OK - see <rdar://problem/10594349>.
3267 */
3268 ret = EBUSY;
3269 goto exit;
3270 }
3271
3272 int flag = (self == TRUE) ? P_DIRTY : P_DIRTY_SHUTDOWN;
3273 if (pcontrol && !(p->p_memstat_dirty & flag)) {
3274 /* Mark the process as having been dirtied at some point */
3275 p->p_memstat_dirty |= (flag | P_DIRTY_MARKED);
3276 } else if ((pcontrol == 0) && (p->p_memstat_dirty & flag)) {
3277 if ((flag == P_DIRTY_SHUTDOWN) && (!(p->p_memstat_dirty & P_DIRTY))) {
3278 /* Clearing the dirty shutdown flag, and the process is otherwise clean - kill */
3279 p->p_memstat_dirty |= P_DIRTY_TERMINATED;
3280 kill = true;
3281 } else if ((flag == P_DIRTY) && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
3282 /* Kill previously terminated processes if set clean */
3283 kill = true;
3284 }
3285 p->p_memstat_dirty &= ~flag;
3286 } else {
3287 /* Already set */
3288 ret = EALREADY;
3289 goto exit;
3290 }
3291
3292 now_dirty = _memstat_proc_is_dirty(p);
3293
3294 if (was_dirty && !now_dirty) {
3295 if (_memstat_proc_can_idle_exit(p)) {
3296 /*
3297 * Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band OR it might be heading back
3298 * there once it's clean again. For the legacy case, this only applies if it has some protection window left.
3299 * P_DIRTY_DEFER: one-time protection window given at launch
3300 * P_DIRTY_DEFER_ALWAYS: protection window given for every dirty->clean transition. Like non-legacy mode.
3301 *
3302 * Non-Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band. It will always stop over
3303 * in that band on it's way to IDLE.
3304 */
3305 assert(!_memstat_proc_is_aging(p));
3306 priority = JETSAM_PRIORITY_IDLE;
3307 } else {
3308 priority = p->p_memstat_requestedpriority;
3309 }
3310 task_ledger_settle_dirty_time(t);
3311 task_set_dirty_start(t, 0);
3312 } else if (!was_dirty && now_dirty) {
3313 priority = p->p_memstat_requestedpriority;
3314 task_set_dirty_start(t, mach_absolute_time());
3315 }
3316
3317 if (_memstat_proc_has_priority_assertion(p)) {
3318 priority = MAX(priority, p->p_memstat_assertionpriority);
3319 }
3320
3321 memstat_update_priority_locked(p, priority, MEMSTAT_PRIORITY_OPTIONS_NONE);
3322
3323 if (kill) {
3324 if (proc_ref(p, true) == p) {
3325 proc_list_unlock();
3326 psignal(p, SIGKILL);
3327 proc_list_lock();
3328 proc_rele(p);
3329 }
3330 }
3331
3332 exit:
3333 proc_list_unlock();
3334
3335 return ret;
3336 }
3337
/*
 * Clear dirty-tracking control flags (LAUNCH_IN_PROGRESS and/or the DEFER
 * flavors) for a tracked process. Clearing a DEFER flag while the process
 * is aging demotes it straight to the idle band.
 *
 * Returns 0 on success; EBUSY if the proc is exiting, EPERM for internal
 * procs, EINVAL if untracked or no clearable flag was specified.
 */
int
memorystatus_dirty_clear(proc_t p, uint32_t pcontrol)
{
	int ret = 0;

	memorystatus_log_debug("memorystatus_dirty_clear(): %d 0x%x 0x%x\n", proc_getpid(p), pcontrol, p->p_memstat_dirty);
	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_DIRTY_CLEAR), proc_getpid(p), pcontrol);

	proc_list_lock();

	if (proc_list_exited(p)) {
		/*
		 * Process is on its way out.
		 */
		ret = EBUSY;
		goto exit;
	}

	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
		ret = EPERM;
		goto exit;
	}

	if (!_memstat_proc_is_tracked(p)) {
		/* Dirty tracking not enabled */
		ret = EINVAL;
		goto exit;
	}

	/* Only the launch-in-progress and defer flags may be cleared here */
	if (!pcontrol || (pcontrol & (PROC_DIRTY_LAUNCH_IN_PROGRESS | PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) == 0) {
		ret = EINVAL;
		goto exit;
	}

	if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
		p->p_memstat_dirty &= ~P_DIRTY_LAUNCH_IN_PROGRESS;
	}

	/* This can be set and cleared exactly once. */
	if (pcontrol & (PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) {
		if (p->p_memstat_dirty & P_DIRTY_DEFER) {
			p->p_memstat_dirty &= ~(P_DIRTY_DEFER);
		}

		if (p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) {
			p->p_memstat_dirty &= ~(P_DIRTY_DEFER_ALWAYS);
		}

		/* With deferral gone, an aging proc drops straight to idle */
		if (_memstat_proc_is_aging(p)) {
			memstat_update_priority_locked(p, JETSAM_PRIORITY_IDLE,
			    MEMSTAT_PRIORITY_NO_AGING);
		}
	}

	ret = 0;
exit:
	proc_list_unlock();

	return ret;
}
3398
3399 int
memorystatus_dirty_get(proc_t p,boolean_t locked)3400 memorystatus_dirty_get(proc_t p, boolean_t locked)
3401 {
3402 int ret = 0;
3403
3404 if (!locked) {
3405 proc_list_lock();
3406 }
3407
3408 if (_memstat_proc_is_tracked(p)) {
3409 ret |= PROC_DIRTY_TRACKED;
3410 if (_memstat_proc_can_idle_exit(p)) {
3411 ret |= PROC_DIRTY_ALLOWS_IDLE_EXIT;
3412 }
3413 if (p->p_memstat_dirty & P_DIRTY) {
3414 ret |= PROC_DIRTY_IS_DIRTY;
3415 }
3416 if (p->p_memstat_dirty & P_DIRTY_LAUNCH_IN_PROGRESS) {
3417 ret |= PROC_DIRTY_LAUNCH_IS_IN_PROGRESS;
3418 }
3419 }
3420
3421 if (!locked) {
3422 proc_list_unlock();
3423 }
3424
3425 return ret;
3426 }
3427
3428 int
memorystatus_on_terminate(proc_t p)3429 memorystatus_on_terminate(proc_t p)
3430 {
3431 int sig;
3432
3433 proc_list_lock();
3434
3435 p->p_memstat_dirty |= P_DIRTY_TERMINATED;
3436
3437 if ((_memstat_proc_is_tracked(p) && !_memstat_proc_is_dirty(p)) ||
3438 (_memstat_proc_is_suspended(p))) {
3439 /*
3440 * Mark as terminated and issue SIGKILL if:-
3441 * - process is clean, or,
3442 * - if process is dirty but suspended. This case is likely
3443 * an extension because apps don't opt into dirty-tracking
3444 * and daemons aren't suspended.
3445 */
3446 #if DEVELOPMENT || DEBUG
3447 if (_memstat_proc_is_suspended(p)) {
3448 memorystatus_log(
3449 "memorystatus: sending suspended process %s (pid %d) SIGKILL\n",
3450 (*p->p_name ? p->p_name : "unknown"), proc_getpid(p));
3451 }
3452 #endif /* DEVELOPMENT || DEBUG */
3453 sig = SIGKILL;
3454 } else {
3455 /* Dirty, terminated, or state tracking is unsupported; issue SIGTERM to allow cleanup */
3456 sig = SIGTERM;
3457 }
3458
3459 proc_list_unlock();
3460
3461 return sig;
3462 }
3463
/*
 * Called when a process is suspended. Marks the process suspended and, if
 * it was previously flagged for termination, delivers the pending SIGKILL
 * (outside the proc_list lock).
 */
void
memorystatus_on_suspend(proc_t p)
{
#if CONFIG_FREEZE
	uint32_t pages;
	/*
	 * NOTE(review): 'pages' is sampled but not used in this function —
	 * confirm whether the sample is still needed or is a leftover.
	 */
	memorystatus_get_task_page_counts(proc_task(p), &pages, NULL, NULL);
#endif
	proc_list_lock();

	_memstat_proc_set_suspended(p);

	/* Check if proc is marked for termination */
	bool kill_process = !!(p->p_memstat_dirty & P_DIRTY_TERMINATED);
	proc_list_unlock();

	if (kill_process) {
		psignal(p, SIGKILL);
	}

#if CONFIG_DEFERRED_RECLAIM
	/* Kick off asynchronous reclamation of the task's deferred memory. */
	vm_deferred_reclamation_reclaim_from_task_async(proc_task(p));
#endif /* CONFIG_DEFERRED_RECLAIM */
}
3487
3488 extern uint64_t memorystatus_thaw_count_since_boot;
3489
/*
 * Called when a suspended process is resumed. Updates freezer accounting
 * (thaw counters, re-freeze eligibility) for frozen processes, clears the
 * suspended state, and posts a freeze-note for thawed processes.
 */
void
memorystatus_on_resume(proc_t p)
{
#if CONFIG_FREEZE
	pid_t pid;
#endif

	proc_list_lock();

#if CONFIG_FREEZE
	const bool frozen = _memstat_proc_is_frozen(p);
	if (frozen) {
		/*
		 * Now that we don't _thaw_ a process completely,
		 * resuming it (and having some on-demand swapins)
		 * shouldn't preclude it from being counted as frozen.
		 *
		 * memorystatus_frozen_count--;
		 *
		 * We preserve the P_MEMSTAT_FROZEN state since the process
		 * could have state on disk AND so will deserve some protection
		 * in the jetsam bands.
		 */
		if ((p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) == 0) {
			/* First resume since the last freeze: becomes a re-freeze candidate. */
			p->p_memstat_state |= P_MEMSTAT_REFREEZE_ELIGIBLE;
			memorystatus_refreeze_eligible_count++;
		}
		if (p->p_memstat_thaw_count == 0 || p->p_memstat_last_thaw_interval < memorystatus_freeze_current_interval) {
			/* First thaw within the current freezer interval: bump interval stats. */
			os_atomic_inc(&(memorystatus_freezer_stats.mfs_processes_thawed), relaxed);
			if (strcmp(p->p_name, "com.apple.WebKit.WebContent") == 0) {
				os_atomic_inc(&(memorystatus_freezer_stats.mfs_processes_thawed_webcontent), relaxed);
			}
		}
		p->p_memstat_last_thaw_interval = memorystatus_freeze_current_interval;
		p->p_memstat_thaw_count++;

		/* Record the most recent thaw for diagnostics. */
		memorystatus_freeze_last_pid_thawed = p->p_pid;
		memorystatus_freeze_last_pid_thawed_ts = mach_absolute_time();

		memorystatus_thaw_count++;
		memorystatus_thaw_count_since_boot++;
	}

	pid = proc_getpid(p);
#endif

	/*
	 * P_MEMSTAT_FROZEN will remain unchanged. This used to be:
	 * p->p_memstat_state &= ~(P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN);
	 */
	_memstat_proc_set_resumed(p);

	proc_list_unlock();

#if CONFIG_FREEZE
	if (frozen) {
		/* Notify listeners (outside the lock) that a frozen process thawed. */
		memorystatus_freeze_entry_t data = { pid, FALSE, 0 };
		memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
	}
#endif
}
3551
/*
 * Called when a process goes inactive; gives the freezer thread a chance
 * to evaluate new freeze candidates.
 */
void
memorystatus_on_inactivity(proc_t p)
{
#pragma unused(p)
#if CONFIG_FREEZE
	/* Wake the freeze thread */
	thread_wakeup((event_t)&memorystatus_freeze_wakeup);
#endif
}
3561
3562 /*
3563 * The proc_list_lock is held by the caller.
3564 */
3565 static uint32_t
memorystatus_build_state(proc_t p)3566 memorystatus_build_state(proc_t p)
3567 {
3568 uint32_t snapshot_state = 0;
3569
3570 /* General */
3571 if (_memstat_proc_is_suspended(p)) {
3572 snapshot_state |= kMemorystatusSuspended;
3573 }
3574 if (_memstat_proc_is_frozen(p)) {
3575 snapshot_state |= kMemorystatusFrozen;
3576 }
3577 if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) {
3578 snapshot_state |= kMemorystatusWasThawed;
3579 }
3580 if (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION) {
3581 snapshot_state |= kMemorystatusAssertion;
3582 }
3583
3584 /* Tracking */
3585 if (_memstat_proc_is_tracked(p)) {
3586 snapshot_state |= kMemorystatusTracked;
3587 }
3588 if (_memstat_proc_can_idle_exit(p)) {
3589 snapshot_state |= kMemorystatusSupportsIdleExit;
3590 }
3591 if (_memstat_proc_is_dirty(p)) {
3592 snapshot_state |= kMemorystatusDirty;
3593 }
3594
3595 return snapshot_state;
3596 }
3597
/*
 * Scan the idle band (LRU order) for a clean, idle-exit-enabled process
 * whose idle deadline has passed and jetsam it with cause
 * kMemorystatusKilledIdleExit. Returns TRUE if a process was killed.
 */
static boolean_t
kill_idle_exit_proc(void)
{
	proc_t p, victim_p = PROC_NULL;
	uint64_t current_time, footprint_of_killed_proc;
	boolean_t killed = FALSE;
	unsigned int i = 0;
	os_reason_t jetsam_reason = OS_REASON_NULL;

	/* Pick next idle exit victim. */
	current_time = mach_absolute_time();

	jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_IDLE_EXIT);
	if (jetsam_reason == OS_REASON_NULL) {
		memorystatus_log_error("kill_idle_exit_proc: failed to allocate jetsam reason\n");
	}

	proc_list_lock();

	p = memorystatus_get_first_proc_locked(&i, FALSE);
	while (p) {
		/* No need to look beyond the idle band */
		if (p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) {
			break;
		}

		/* Candidate must allow idle-exit and be neither dirty nor already terminated. */
		if ((p->p_memstat_dirty & (P_DIRTY_ALLOW_IDLE_EXIT | P_DIRTY_IS_DIRTY | P_DIRTY_TERMINATED)) == (P_DIRTY_ALLOW_IDLE_EXIT)) {
			if (current_time >= p->p_memstat_idledeadline) {
				/* Take a ref so the victim survives dropping the proc_list lock. */
				p->p_memstat_dirty |= P_DIRTY_TERMINATED;
				victim_p = proc_ref(p, true);
				break;
			}
		}

		p = memorystatus_get_next_proc_locked(&i, p, FALSE);
	}

	proc_list_unlock();

	if (victim_p) {
		memorystatus_log(
			"memorystatus: killing_idle_process pid %d [%s] jetsam_reason->osr_code: %llu\n",
			proc_getpid(victim_p), (*victim_p->p_name ? victim_p->p_name : "unknown"), jetsam_reason->osr_code);
		/* memorystatus_do_kill() consumes the jetsam_reason reference. */
		killed = memorystatus_do_kill(victim_p, kMemorystatusKilledIdleExit, jetsam_reason, &footprint_of_killed_proc);
		proc_rele(victim_p);
	} else {
		/* No victim found; release the unused exit reason. */
		os_reason_free(jetsam_reason);
	}

	return killed;
}
3649
3650 /*
3651 * Consider waking the jetsam thread. Returns true if the thread was awoken.
3652 */
3653 static bool
_memstat_consider_waking_jetsam_thread(void)3654 _memstat_consider_waking_jetsam_thread(void)
3655 {
3656 #if CONFIG_JETSAM
3657 if (memstat_evaluate_page_shortage(NULL, NULL, NULL)) {
3658 memorystatus_thread_wake();
3659 return true;
3660 }
3661 #endif /* CONFIG_JETSAM */
3662 return false;
3663 }
3664
3665 void
memorystatus_thread_wake()3666 memorystatus_thread_wake()
3667 {
3668 int thr_id = 0;
3669 int active_thr = atomic_load(&active_jetsam_threads);
3670
3671 /* Wakeup all the jetsam threads */
3672 for (thr_id = 0; thr_id < active_thr; thr_id++) {
3673 jetsam_state_t jetsam_thread = &jetsam_threads[thr_id];
3674 sched_cond_signal(&(jetsam_thread->jt_wakeup_cond), jetsam_thread->thread);
3675 }
3676 }
3677
3678 #if CONFIG_JETSAM
3679 static void
memorystatus_thread_pool_max()3680 memorystatus_thread_pool_max()
3681 {
3682 /* Increase the jetsam thread pool to max_jetsam_threads */
3683 int max_threads = max_jetsam_threads;
3684 memorystatus_log_info("Expanding memorystatus pool to %d\n", max_threads);
3685 os_atomic_store(&active_jetsam_threads, max_threads, relaxed);
3686 }
3687
3688 static void
memorystatus_thread_pool_default()3689 memorystatus_thread_pool_default()
3690 {
3691 /* Restore the jetsam thread pool to a single thread */
3692 memorystatus_log_info("Reverting memorystatus pool back to 1\n");
3693 os_atomic_store(&active_jetsam_threads, 1, relaxed);
3694 }
3695 #endif /* CONFIG_JETSAM */
3696
3697 /*
3698 * An offset applied to non-critical page shortage thresholds.
3699 */
3700 static uint32_t
_memstat_page_shortage_offset(void)3701 _memstat_page_shortage_offset(void)
3702 {
3703 uint32_t offset = 0;
3704 if (os_atomic_load(&memstat_policy_config, acquire) & kPolicyClearTheDecks) {
3705 offset += memstat_ctd_offset;
3706 }
3707 if (os_atomic_load(&memstat_policy_config, acquire) & kPolicyBallastDrain) {
3708 offset += memstat_ballast_offset;
3709 }
3710 return offset;
3711 }
3712
/*
 * Return the critical page-shortage threshold (in pages). Unlike the
 * idle-exit and soft-memlimit thresholds below, this one is not raised by
 * the policy offset.
 */
uint32_t
memorystatus_get_critical_page_shortage_threshold(void)
{
	return memstat_critical_threshold;
}
3718
3719 uint32_t
memorystatus_get_idle_exit_page_shortage_threshold(void)3720 memorystatus_get_idle_exit_page_shortage_threshold(void)
3721 {
3722 uint32_t offset = _memstat_page_shortage_offset();
3723 return memstat_idle_threshold + offset;
3724 }
3725
3726 uint32_t
memorystatus_get_soft_memlimit_page_shortage_threshold(void)3727 memorystatus_get_soft_memlimit_page_shortage_threshold(void)
3728 {
3729 uint32_t offset = _memstat_page_shortage_offset();
3730 return memstat_soft_threshold + offset;
3731 }
3732
/*
 * Evaluate the current page shortage against the soft-memlimit, idle-exit,
 * and critical thresholds. Each out-parameter is optional; when supplied it
 * is set to whether that response is warranted. Returns true if any action
 * is required. On !CONFIG_JETSAM configurations this always reports no
 * action.
 */
bool
memstat_evaluate_page_shortage(
	bool *should_enforce_memlimits,
	bool *should_idle_exit,
	bool *should_jetsam)
{
	bool requires_action = false;
	/* Default all requested outputs to "no action required". */
	if (should_enforce_memlimits) {
		*should_enforce_memlimits = false;
	}
	if (should_idle_exit) {
		*should_idle_exit = false;
	}
	if (should_jetsam) {
		*should_jetsam = false;
	}
#if CONFIG_JETSAM
	uint32_t available_page_count = os_atomic_load(&memorystatus_available_pages, relaxed);
#if VM_PRESSURE_EVENTS
	if (available_page_count <
	    memorystatus_get_soft_memlimit_page_shortage_threshold()) {
		/*
		 * Only wake the jetsam thread if there are hwm violators to
		 * kill
		 */
		bool hwm_candidates = os_atomic_load(&memorystatus_hwm_candidates, acquire);
		requires_action = requires_action || hwm_candidates;
		if (should_enforce_memlimits) {
			*should_enforce_memlimits = true;
		}
	}
#endif /* VM_PRESSURE_EVENTS */
	if (available_page_count < memorystatus_get_idle_exit_page_shortage_threshold()) {
		/*
		 * Only wake the jetsam thread if there are idle processes that
		 * could exit.
		 */
		uint32_t idle_proc_count = os_atomic_load(
			&memstat_bucket[JETSAM_PRIORITY_IDLE].count, relaxed);
		requires_action = requires_action || (idle_proc_count > 0);
		if (should_idle_exit) {
			*should_idle_exit = true;
		}
	}
	if (available_page_count < memorystatus_get_critical_page_shortage_threshold()) {
		/* A critical shortage always demands action. */
		if (should_jetsam) {
			*should_jetsam = true;
		}
		requires_action = true;
	}
#endif /* CONFIG_JETSAM */
	return requires_action;
}
3786
3787 #if CONFIG_JETSAM
3788 static uint64_t
memorystatus_swap_trigger_pages(void)3789 memorystatus_swap_trigger_pages(void)
3790 {
3791 /*
3792 * The swapout trigger varies based on the current memorystatus_level.
3793 * When available memory is somewhat high (at memorystatus_available_pages_pressure)
3794 * we keep more swappable compressor segments in memory.
3795 * However, as available memory drops to our idle and eventually critical kill
3796 * thresholds we start swapping more aggressively.
3797 */
3798 static uint32_t available_pages_factor[] = {0, 1, 1, 1, 2, 2, 3, 5, 7, 8, 10, 13, 15, 17, 20};
3799 size_t index = MIN(memorystatus_level, sizeof(available_pages_factor) / sizeof(uint32_t) - 1);
3800 return available_pages_factor[index] * memorystatus_available_pages / 10;
3801 }
3802
/*
 * Sysctl handler: report the current swapout trigger, in pages, as derived
 * from memorystatus_level.
 */
static int
sysctl_memorystatus_swap_trigger_pages SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	uint64_t trigger_pages = memorystatus_swap_trigger_pages();
	return SYSCTL_OUT(req, &trigger_pages, sizeof(trigger_pages));
}
3810
/* kern.memorystatus_swap_trigger_pages: read-only view of the current swapout trigger. */
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_swap_trigger_pages, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, &sysctl_memorystatus_swap_trigger_pages, "I", "");
3813
3814 /*
3815 * Check if the number of full swappable csegments is over the trigger
3816 * threshold to start swapping.
3817 * The adjustment_factor is applied to the trigger to raise or lower
3818 * it. For example an adjustement factor of 110 will raise the threshold by 10%.
3819 */
3820 bool
memorystatus_swap_over_trigger(uint64_t adjustment_factor)3821 memorystatus_swap_over_trigger(uint64_t adjustment_factor)
3822 {
3823 if (!memorystatus_swap_all_apps) {
3824 return false;
3825 }
3826 uint64_t trigger_pages = memorystatus_swap_trigger_pages();
3827 trigger_pages = trigger_pages * adjustment_factor / 100;
3828 return atop_64(c_late_swapout_count * c_seg_allocsize) > trigger_pages;
3829 }
3830
3831 /*
3832 * Check if the number of segments on the early swapin queue
3833 * is over the trigger to start compacting it.
3834 */
bool
memorystatus_swapin_over_trigger(void)
{
	/* Compare the early-swapin queue size (in pages) against the configured trigger. */
	return atop_64(c_late_swappedin_count * c_seg_allocsize) > memorystatus_swapin_trigger_pages;
}
3840 #endif /* CONFIG_JETSAM */
3841
3842 #if DEVELOPMENT || DEBUG
3843 SYSCTL_UINT(_vm, OID_AUTO, c_late_swapout_count, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, &c_late_swapout_count, 0, "");
3844 SYSCTL_UINT(_vm, OID_AUTO, c_seg_allocsize, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_allocsize, 0, "");
3845 #if CONFIG_FREEZE
3846 extern int32_t c_segment_pages_compressed_incore_late_swapout;
3847 SYSCTL_INT(_vm, OID_AUTO, c_segment_pages_compressed_incore_late_swapout, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_pages_compressed_incore_late_swapout, 0, "");
3848 #endif /* CONFIG_FREEZE */
3849 #endif /* DEVELOPMENT || DEBUG */
3850
3851 static boolean_t
memorystatus_should_post_snapshot(int32_t priority,uint32_t cause)3852 memorystatus_should_post_snapshot(int32_t priority, uint32_t cause)
3853 {
3854 boolean_t is_idle_priority;
3855
3856 is_idle_priority = (priority == JETSAM_PRIORITY_IDLE || priority == JETSAM_PRIORITY_IDLE_DEFERRED);
3857 #if CONFIG_JETSAM
3858 #pragma unused(cause)
3859 /*
3860 * Don't generate logs for steady-state idle-exit kills,
3861 * unless it is overridden for debug or by the device
3862 * tree.
3863 */
3864
3865 return !is_idle_priority || memorystatus_idle_snapshot;
3866
3867 #else /* CONFIG_JETSAM */
3868 /*
3869 * Don't generate logs for steady-state idle-exit kills,
3870 * unless
3871 * - it is overridden for debug or by the device
3872 * tree.
3873 * OR
3874 * - the kill causes are important i.e. not kMemorystatusKilledIdleExit
3875 */
3876
3877 boolean_t snapshot_eligible_kill_cause = (is_reason_thrashing(cause) || is_reason_zone_map_exhaustion(cause));
3878 return !is_idle_priority || memorystatus_idle_snapshot || snapshot_eligible_kill_cause;
3879 #endif /* CONFIG_JETSAM */
3880 }
3881
3882
3883 static boolean_t
memorystatus_act_on_hiwat_processes(uint32_t * errors,uint32_t * hwm_kill,bool * post_snapshot,uint64_t * memory_reclaimed)3884 memorystatus_act_on_hiwat_processes(uint32_t *errors, uint32_t *hwm_kill, bool *post_snapshot, uint64_t *memory_reclaimed)
3885 {
3886 boolean_t purged = FALSE, killed = FALSE;
3887
3888 *memory_reclaimed = 0;
3889 killed = memorystatus_kill_hiwat_proc(errors, &purged, memory_reclaimed);
3890
3891 if (killed) {
3892 *hwm_kill = *hwm_kill + 1;
3893 *post_snapshot = TRUE;
3894 return TRUE;
3895 } else {
3896 if (purged == FALSE) {
3897 /* couldn't purge and couldn't kill */
3898 os_atomic_store(&memorystatus_hwm_candidates, false, relaxed);
3899 }
3900 }
3901
3902 return killed;
3903 }
3904
3905 /*
3906 * Purge kernel memory caches
3907 */
static void
memstat_purge_caches(jetsam_state_t state)
{
	memorystatus_log("memorystatus: purging kernel memory caches\n");

	/* Release page tables that pmap can reconstruct on demand. */
	uint64_t pmap_released = pmap_release_pages_fast();
	memorystatus_log("memorystatus: recovered %llu pages from pmap\n",
	    pmap_released);

	/*
	 * Only purge corpses once per jetsam event. No new corpses can be created
	 * after the initial purge (block_corpses)
	 */
	if (!state->corpse_list_purged) {
		memorystatus_log("memorystatus: purging all corpses\n");
		/* Raise the block count first so no new corpses race in. */
		os_atomic_inc(&block_corpses, relaxed);
		assert(block_corpses > 0);
		if (total_corpses_count() > 0) {
			task_purge_all_corpses();
		} else {
			memorystatus_log("memorystatus: no corpses to purge\n");
		}
		state->corpse_list_purged = true;
	}

#if CONFIG_DEFERRED_RECLAIM
	/* TODO: estimate memory recovered from deferred reclaim */
	memorystatus_log("memorystatus: reclaiming all deferred user memory\n");
	/*
	 * Avoid faulting on the reclaim buffer and avoid blocking waiting for
	 * threads which may be faulting themselves.
	 */
	vm_deferred_reclamation_reclaim_all_memory(
		RECLAIM_NO_WAIT | RECLAIM_NO_FAULT);
#endif /* CONFIG_DEFERRED_RECLAIM */

	/* TODO: estimate wired memory recovered from zone_gc */
	memorystatus_log("memorystatus: trimming kernel zone allocator\n");
	zone_gc_trim();
}
3948
3949 /*
3950 * Called before jetsamming in the foreground band in the hope that we'll
3951 * avoid a jetsam.
3952 */
static void
memstat_approaching_fg_band(jetsam_state_t state)
{
	memorystatus_log("memorystatus: jetsam is approaching JETSAM_PRIORITY_FOREGROUND\n");
	/* Give user space a chance to shed memory before foreground kills begin. */
	if (memorystatus_should_issue_fg_band_notify) {
		memorystatus_broadcast_jetsam_pressure(kVMPressureForegroundJetsam);
	}
	/* Reclaim kernel-side caches (pmap, corpses, deferred reclaim, zones). */
	memstat_purge_caches(state);
}
3962
/* Number of aggressive-kill (jetsam-loop) evaluations performed. */
unsigned int jld_eval_aggressive_count = 0;
/* Start of the current jetsam-loop-detection observation window (msecs, per the name). */
uint64_t jld_timestamp_msecs = 0;
/* Idle-band kill candidates counted at the start of the window. */
int jld_idle_kill_candidates = 0;

/*
 * Progressively raise the maximum priority to aggressively kill to
 * when a jetsam loop is detected. Background work often happens at
 * @c JETSAM_PRIORITY_MAIL. Start there and elevate as needed if
 * the jetsam loop re-occurs in a short time window.
 */
int jld_max_priority_arr[] = {
	JETSAM_PRIORITY_MAIL,
	JETSAM_PRIORITY_MAIL,
	JETSAM_PRIORITY_UI_SUPPORT,
	JETSAM_PRIORITY_UI_SUPPORT,
	JETSAM_PRIORITY_DRIVER_APPLE,
};
#define JLD_MAX_PRIORITY_ARR_COUNT (sizeof(jld_max_priority_arr) / sizeof(jld_max_priority_arr[0]))
3981
/*
 * Respond to a detected jetsam loop: first kill pinned processes in the
 * elevated-inactive band, then march the lower bands aggressively up to a
 * ceiling that rises with repeated loop detections. Returns true if
 * anything was killed.
 */
static bool
memorystatus_act_aggressive(jetsam_state_t state, uint32_t cause, os_reason_t jetsam_reason)
{
	boolean_t killed;
	uint32_t errors = 0;
	uint64_t footprint_of_killed_proc = 0;
	int elevated_bucket_count = 0, maximum_kills = 0, band = 0;
	state->memory_reclaimed = 0;

	/* Each successive loop detection may raise the kill-priority ceiling. */
	unsigned int iteration_no = jld_eval_aggressive_count++;
	int max_kill_pri = jld_max_priority_arr[MIN(iteration_no, JLD_MAX_PRIORITY_ARR_COUNT - 1)];
	assert3u(max_kill_pri, <=, MEMSTAT_BUCKET_COUNT);

	if (max_kill_pri >= JETSAM_PRIORITY_FOREGROUND) {
		memstat_approaching_fg_band(state);
	}

	proc_list_lock();
	elevated_bucket_count = memstat_bucket[JETSAM_PRIORITY_ELEVATED_INACTIVE].count;
	proc_list_unlock();

	/* Visit elevated processes first */
	while (elevated_bucket_count) {
		elevated_bucket_count--;

		/*
		 * memorystatus_kill_elevated_process() drops a reference,
		 * so take another one so we can continue to use this exit reason
		 * even after it returns.
		 */

		os_reason_ref(jetsam_reason);
		killed = memorystatus_kill_elevated_process(
			cause,
			jetsam_reason,
			JETSAM_PRIORITY_ELEVATED_INACTIVE,
			jld_eval_aggressive_count,
			&errors, &footprint_of_killed_proc);
		if (killed) {
			state->post_snapshot = true;
			state->memory_reclaimed += footprint_of_killed_proc;
			if (!memstat_evaluate_page_shortage(NULL, NULL, NULL)) {
				/*
				 * System is no longer under pressure --
				 * bail early because the pressure was
				 * coming from an inactive process
				 */
				return true;
			}
		} else {
			/*
			 * No pinned processes left to kill.
			 * Abandon elevated band.
			 */
			break;
		}
	}

	/* Bound the aggressive march by the current population of the target bands. */
	proc_list_lock();
	for (band = JETSAM_PRIORITY_IDLE; band < max_kill_pri; band++) {
		maximum_kills += memstat_bucket[band].count;
	}
	proc_list_unlock();
	maximum_kills *= memorystatus_jld_max_kill_loops;
	/*
	 * memorystatus_kill_processes_aggressive() allocates its own
	 * jetsam_reason so the kMemorystatusKilledProcThrashing cause
	 * is consistent throughout the aggressive march.
	 */
	killed = memorystatus_kill_processes_aggressive(
		kMemorystatusKilledProcThrashing,
		jld_eval_aggressive_count,
		max_kill_pri,
		maximum_kills,
		&errors, &footprint_of_killed_proc);

	if (killed) {
		/* Always generate logs after aggressive kill */
		state->post_snapshot = true;
		state->memory_reclaimed += footprint_of_killed_proc;
		state->jld_idle_kills = 0;
	}

	return killed;
}
4067
4068 /*
4069 * Sets up a new jetsam thread.
4070 */
4071 static void
memorystatus_thread_init(jetsam_state_t jetsam_thread)4072 memorystatus_thread_init(jetsam_state_t jetsam_thread)
4073 {
4074 char name[32];
4075 thread_wire_internal(host_priv_self(), current_thread(), TRUE, NULL);
4076 snprintf(name, 32, "VM_memorystatus_%d", jetsam_thread->index + 1);
4077
4078 /* Limit all but one thread to the lower jetsam bands, as that's where most of the victims are. */
4079 if (jetsam_thread->index == 0) {
4080 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
4081 thread_vm_bind_group_add();
4082 }
4083 jetsam_thread->limit_to_low_bands = false;
4084 } else {
4085 jetsam_thread->limit_to_low_bands = true;
4086 }
4087 #if CONFIG_THREAD_GROUPS
4088 thread_group_vm_add();
4089 #endif
4090 thread_set_thread_name(current_thread(), name);
4091 sched_cond_init(&(jetsam_thread->jt_wakeup_cond));
4092 jetsam_thread->inited = true;
4093 }
4094
4095 /*
4096 * Create a new jetsam reason from the given kill cause.
4097 */
4098 static os_reason_t
create_jetsam_reason(memorystatus_kill_cause_t cause)4099 create_jetsam_reason(memorystatus_kill_cause_t cause)
4100 {
4101 os_reason_t jetsam_reason = OS_REASON_NULL;
4102
4103 jetsam_reason_t reason_code = (jetsam_reason_t)cause;
4104 assert3u(reason_code, <=, JETSAM_REASON_MEMORYSTATUS_MAX);
4105
4106 jetsam_reason = os_reason_create(OS_REASON_JETSAM, reason_code);
4107 if (jetsam_reason == OS_REASON_NULL) {
4108 memorystatus_log_error("memorystatus: failed to allocate jetsam reason for cause %u\n", cause);
4109 }
4110 return jetsam_reason;
4111 }
4112
4113 /*
4114 * Do one kill as we're marching up the priority bands.
4115 * This is a wrapper around memorystatus_kill_top_process that also
4116 * sets post_snapshot, tracks jld_idle_kills, and notifies if we're appraoching the fg band.
4117 */
static bool
memorystatus_do_priority_kill(jetsam_state_t state,
    uint32_t kill_cause, int32_t max_priority, bool only_swappable)
{
	os_reason_t jetsam_reason = OS_REASON_NULL;
	bool killed = false;
	int priority;

	jetsam_reason = create_jetsam_reason(kill_cause);
	/*
	 * memorystatus_kill_top_process() drops a reference,
	 * so take another one so we can continue to use this exit reason
	 * even after it returns
	 */
	os_reason_ref(jetsam_reason);

	/* LRU */
	killed = memorystatus_kill_top_process(true, state->sort_flag, kill_cause, jetsam_reason, max_priority,
	    only_swappable, &priority, &state->errors, &state->memory_reclaimed);
	/* Sorting is only requested for the first kill of a scan. */
	state->sort_flag = false;

	if (killed) {
		if (memorystatus_should_post_snapshot(priority, kill_cause) == TRUE) {
			state->post_snapshot = true;
		}

		/* Jetsam Loop Detection */
		if (memorystatus_jld_enabled == TRUE) {
			if (priority <= applications_aging_band) {
				state->jld_idle_kills++;
			} else {
				/*
				 * We've reached into bands beyond idle deferred.
				 * We make no attempt to monitor them
				 */
			}
		}

		/* Escalating notifications as kills climb the bands. */
		if (priority >= JETSAM_PRIORITY_FREEZER) {
			memstat_approaching_fg_band(state);
		} else if (priority >= JETSAM_PRIORITY_BACKGROUND) {
			memorystatus_broadcast_jetsam_pressure(kVMPressureBackgroundJetsam);
		}
	}
	/* Drop the extra reference taken above. */
	os_reason_free(jetsam_reason);

	return killed;
}
4166
/*
 * Dispatch one memorystatus action chosen by memorystatus_pick_action().
 * Returns true if the action killed a process (non-kill actions such as
 * waking the swapper always return false).
 */
static bool
memorystatus_do_action(jetsam_state_t state, memorystatus_action_t action, uint32_t kill_cause)
{
	bool killed = false;
	os_reason_t jetsam_reason = OS_REASON_NULL;

	switch (action) {
	case MEMORYSTATUS_KILL_HIWATER:
		/* Kill a process exceeding its high-water-mark memory limit. */
		killed = memorystatus_act_on_hiwat_processes(&state->errors, &state->hwm_kills,
		    &state->post_snapshot, &state->memory_reclaimed);
		break;
	case MEMORYSTATUS_KILL_AGGRESSIVE:
		/* Jetsam-loop response: sweep multiple bands aggressively. */
		jetsam_reason = create_jetsam_reason(kill_cause);
		killed = memorystatus_act_aggressive(state, kill_cause, jetsam_reason);
		os_reason_free(jetsam_reason);
		break;
	case MEMORYSTATUS_KILL_TOP_PROCESS:
		/* Single LRU kill, marching up to the configured max band. */
		killed = memorystatus_do_priority_kill(state, kill_cause, max_kill_priority, false);
		break;
	case MEMORYSTATUS_WAKE_SWAPPER:
		memorystatus_log_info(
			"memorystatus_do_action: Waking up swap thread. memorystatus_available_pages: %llu\n",
			(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
		os_atomic_store(&vm_swapout_wake_pending, true, relaxed);
		thread_wakeup((event_t)&vm_swapout_thread);
		break;
	case MEMORYSTATUS_PROCESS_SWAPIN_QUEUE:
		memorystatus_log_info(
			"memorystatus_do_action: Processing swapin queue of length: %u memorystatus_available_pages: %llu\n",
			c_late_swappedin_count, (uint64_t) MEMORYSTATUS_LOG_AVAILABLE_PAGES);
		vm_compressor_process_special_swapped_in_segments();
		break;
	case MEMORYSTATUS_KILL_SUSPENDED_SWAPPABLE:
		/* Kill a suspended swap-eligible process below the background band. */
		killed = memorystatus_do_priority_kill(state, kill_cause, JETSAM_PRIORITY_BACKGROUND - 1, true);
		break;
	case MEMORYSTATUS_KILL_SWAPPABLE:
		/* Kill any swap-eligible process up to the configured max band. */
		killed = memorystatus_do_priority_kill(state, kill_cause, max_kill_priority, true);
		break;
	case MEMORYSTATUS_KILL_IDLE:
		/* Kill only within the idle band. */
		killed = memorystatus_do_priority_kill(state, kill_cause, JETSAM_PRIORITY_IDLE, false);
		break;
	case MEMORYSTATUS_KILL_NONE:
		panic("memorystatus_do_action: Impossible! memorystatus_do_action called with action = NONE\n");
	}
	return killed;
}
4213
4214 void
memorystatus_post_snapshot()4215 memorystatus_post_snapshot()
4216 {
4217 proc_list_lock();
4218 size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
4219 sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count);
4220 uint64_t timestamp_now = mach_absolute_time();
4221 memorystatus_jetsam_snapshot->notification_time = timestamp_now;
4222 memorystatus_jetsam_snapshot->js_gencount++;
4223 if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
4224 timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) {
4225 proc_list_unlock();
4226 int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
4227 if (!ret) {
4228 proc_list_lock();
4229 memorystatus_jetsam_snapshot_last_timestamp = timestamp_now; proc_list_unlock();
4230 }
4231 } else {
4232 proc_list_unlock();
4233 }
4234 }
4235
4236 #if JETSAM_ZPRINT_SNAPSHOT
4237
4238 /*
4239 * Called by memorystatus_update_jetsam_snapshot_entry_locked to take a zprint snapshot.
4240 */
static void
memorystatus_collect_jetsam_snapshot_zprint(void)
{
	unsigned int new_meminfo_cnt;

	jzs_zone_cnt = zone_max_zones();

	/* Grow the meminfo buffer if the diagnostic estimate no longer fits. */
	new_meminfo_cnt = vm_page_diagnose_estimate();
	if (new_meminfo_cnt > jzs_meminfo_cnt) {
		jzs_meminfo = krealloc_data_tag(jzs_meminfo,
		    jzs_meminfo_cnt * sizeof(mach_memory_info_t),
		    new_meminfo_cnt * sizeof(mach_memory_info_t),
		    Z_WAITOK,
		    VM_KERN_MEMORY_DIAG);

		jzs_meminfo_cnt = new_meminfo_cnt;
	}

	/*
	 * NOTE(review): the krealloc_data_tag() result is not checked for
	 * failure before use — confirm Z_WAITOK guarantees success here.
	 */
	mach_memory_info_sample(jzs_names, jzs_info, jzs_coalesce, &jzs_zone_cnt, jzs_meminfo, jzs_meminfo_cnt, true);
}
4261
4262 #endif /* JETSAM_ZPRINT_SNAPSHOT */
4263
4264 /*
4265 * Main entrypoint for the memorystatus thread.
4266 * This thread is woken up when we're low on one of the following resources:
4267 * - available pages (free + filebacked)
4268 * - zone memory
4269 * - compressor space
4270 *
4271 * Or when thrashing is detected in the compressor or file cache.
4272 */
static void
memorystatus_thread_internal(jetsam_state_t state)
{
	uint64_t total_memory_reclaimed = 0;
	bool highwater_remaining = true;
	bool swappable_apps_remaining = false;
	bool suspended_swappable_apps_remaining = false;

#if CONFIG_JETSAM
	/* When app swap is enabled, assume swappable candidates exist until a kill attempt fails. */
	swappable_apps_remaining = memorystatus_swap_all_apps;
	suspended_swappable_apps_remaining = memorystatus_swap_all_apps;
#endif /* CONFIG_JETSAM */

	assert(state != NULL);
	/* Reset per-wakeup bookkeeping before scanning for victims. */
	state->jld_idle_kills = 0;
	state->errors = 0;
	state->hwm_kills = 0;
	state->sort_flag = true;
	state->corpse_list_purged = false;
	state->post_snapshot = false;
	state->memory_reclaimed = 0;

	if (state->inited == FALSE) {
		/*
		 * It's the first time the thread has run, so just mark the thread as privileged and block.
		 */
		memorystatus_thread_init(state);
		sched_cond_wait(&state->jt_wakeup_cond, THREAD_UNINT, memorystatus_thread);
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_SCAN) | DBG_FUNC_START,
	    MEMORYSTATUS_LOG_AVAILABLE_PAGES, memorystatus_jld_enabled, memorystatus_jld_eval_period_msecs, jld_eval_aggressive_count);

	/* Log compressor segment count and time since the last major compaction, for triage. */
	extern uint32_t c_segment_count;
	extern mach_timespec_t major_compact_ts;
	clock_sec_t now;
	clock_nsec_t nsec;
	clock_get_system_nanotime(&now, &nsec);
	mach_timespec_t major_compact_diff = {.tv_sec = (int)now, .tv_nsec = nsec};
	SUB_MACH_TIMESPEC(&major_compact_diff, &major_compact_ts);
	memorystatus_log_info(
		"memorystatus: c_segment_count=%u major compaction occurred %u seconds ago\n",
		c_segment_count, major_compact_diff.tv_sec);

	/*
	 * Jetsam aware version.
	 *
	 * The VM pressure notification thread is working its way through clients in parallel.
	 *
	 * So, while the pressure notification thread is targeting processes in order of
	 * increasing jetsam priority, we can hopefully reduce / stop its work by killing
	 * any processes that have exceeded their highwater mark.
	 *
	 * If we run out of HWM processes and our available pages drops below the critical threshold, then,
	 * we target the least recently used process in order of increasing jetsam priority (exception: the FG band).
	 */
	while (true) {
		bool killed;
		state->memory_reclaimed = 0;
		uint32_t cause = 0;

		/* Decide what kind of kill (if any) to attempt on this iteration. */
		memorystatus_action_t action = memorystatus_pick_action(state, &cause,
		    highwater_remaining, suspended_swappable_apps_remaining, swappable_apps_remaining,
		    &state->jld_idle_kills);
		if (action == MEMORYSTATUS_KILL_NONE) {
			break;
		}

		if (cause == kMemorystatusKilledVMCompressorThrashing || cause == kMemorystatusKilledVMCompressorSpaceShortage) {
			memorystatus_log("memorystatus: killing due to \"%s\" - compression_ratio=%u\n", memorystatus_kill_cause_name[cause], vm_compression_ratio());
		}

		killed = memorystatus_do_action(state, action, cause);
		total_memory_reclaimed += state->memory_reclaimed;

		if (!killed) {
			/*
			 * A failed kill of a given kind means that category of candidate is
			 * exhausted; remember that so memorystatus_pick_action can move on.
			 */
			if (action == MEMORYSTATUS_KILL_HIWATER) {
				highwater_remaining = false;
			} else if (action == MEMORYSTATUS_KILL_SWAPPABLE) {
				swappable_apps_remaining = false;
				suspended_swappable_apps_remaining = false;
			} else if (action == MEMORYSTATUS_KILL_SUSPENDED_SWAPPABLE) {
				suspended_swappable_apps_remaining = false;
			}
		} else {
			if (cause == kMemorystatusKilledVMCompressorThrashing || cause == kMemorystatusKilledVMCompressorSpaceShortage) {
				memorystatus_log("memorystatus: post-jetsam compressor fragmentation_level=%u\n", vm_compressor_fragmentation_level());
			}
			/* Always re-check for highwater and swappable kills after doing a kill. */
			highwater_remaining = true;
			swappable_apps_remaining = true;
			suspended_swappable_apps_remaining = true;
		}

		if (!killed && total_memory_reclaimed == 0) {
			memorystatus_log("memorystatus: failed to kill a process and no memory was reclaimed\n");
			if ((action == MEMORYSTATUS_KILL_TOP_PROCESS || action == MEMORYSTATUS_KILL_AGGRESSIVE) &&
			    memorystatus_get_available_page_count() < memorystatus_get_critical_page_shortage_threshold()) {
				/*
				 * Still under pressure and unable to kill a process - purge corpse memory
				 * and get everything back from the pmap.
				 */
				memorystatus_log("memorystatus: ran out of %sprocesses to kill but "
				    "system is still in critical condition\n",
				    state->limit_to_low_bands ? "low-band " : "");
				memstat_purge_caches(state);

				if (!state->limit_to_low_bands &&
				    memorystatus_get_available_page_count() < memorystatus_get_critical_page_shortage_threshold()) {
					/*
					 * Still under pressure and unable to kill a process
					 */
					memorystatus_log_fault("memorystatus: attempting full drain of kernel zone allocator\n");
					zone_gc_drain();
					/* Last resort: if nothing freed enough memory, the system cannot make progress. */
					if (memorystatus_get_available_page_count() < memorystatus_get_critical_page_shortage_threshold()) {
						panic("memorystatus_jetsam_thread: no victim! available pages:%llu", (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
					}
				}
			}
		}

		/*
		 * If we did a kill on behalf of another subsystem (compressor or zalloc)
		 * notify them.
		 */
		if (killed && is_reason_thrashing(cause)) {
			os_atomic_store(&memorystatus_compressor_space_shortage, false, release);
#if CONFIG_PHANTOM_CACHE
			os_atomic_store(&memorystatus_phantom_cache_pressure, false, release);
#endif /* CONFIG_PHANTOM_CACHE */
#if CONFIG_JETSAM
			vm_thrashing_jetsam_done();
#endif /* CONFIG_JETSAM */
		} else if (killed && is_reason_zone_map_exhaustion(cause)) {
			os_atomic_store(&memorystatus_zone_map_is_exhausted, false, release);
		} else if (killed && cause == kMemorystatusKilledVMPageoutStarvation) {
			os_atomic_store(&memorystatus_pageout_starved, false, release);
		}
	}

	/* Post-scan cleanup: clear accumulated per-proc error state and publish the snapshot. */
	if (state->errors) {
		memorystatus_clear_errors();
	}

	if (state->post_snapshot) {
		memorystatus_post_snapshot();
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_SCAN) | DBG_FUNC_END,
	    MEMORYSTATUS_LOG_AVAILABLE_PAGES, total_memory_reclaimed);

	/* Re-allow corpse creation if this pass had blocked it while purging the corpse list. */
	if (state->corpse_list_purged) {
		os_atomic_dec(&block_corpses, relaxed);
		assert(block_corpses >= 0);
	}
}
4429
/*
 * Continuation/entry point for the memorystatus (jetsam) thread.
 * Acknowledges the wakeup that scheduled it, then loops forever:
 * run one scan pass, then block on the wakeup condition until the
 * next memory-pressure event. Never returns.
 */
OS_NORETURN
static void
memorystatus_thread(void *param __unused, wait_result_t wr __unused)
{
	jetsam_state_t jetsam_thread = jetsam_current_thread();
	sched_cond_ack(&(jetsam_thread->jt_wakeup_cond));
	while (1) {
		memorystatus_thread_internal(jetsam_thread);
		sched_cond_wait(&(jetsam_thread->jt_wakeup_cond), THREAD_UNINT, memorystatus_thread);
	}
}
4441
4442 /*
4443 * This section defines when we deploy aggressive jetsam.
4444 * Aggressive jetsam kills everything up to the jld_priority_band_max band.
4445 */
4446
4447 /*
4448 * Returns TRUE:
4449 * when an idle-exitable proc was killed
4450 * Returns FALSE:
4451 * when there are no more idle-exitable procs found
4452 * when the attempt to kill an idle-exitable proc failed
4453 */
boolean_t
memorystatus_idle_exit_from_VM(void)
{
	/*
	 * This routine should no longer be needed since we are
	 * now using jetsam bands on all platforms and so will deal
	 * with IDLE processes within the memorystatus thread itself.
	 *
	 * But we still use it because we observed that macos systems
	 * started heavy compression/swapping with a bunch of
	 * idle-exitable processes alive and doing nothing. We decided
	 * to rather kill those processes than start swapping earlier.
	 */

	/* NOTE(review): presumably invoked from the VM layer (per the name) — confirm call sites. */
	return kill_idle_exit_proc();
}
4470
4471 /*
4472 * Callback invoked when allowable physical memory footprint exceeded
4473 * (dirty pages + IOKit mappings)
4474 *
4475 * This is invoked for both advisory, non-fatal per-task high watermarks,
4476 * as well as the fatal task memory limits.
4477 */
void
memorystatus_on_ledger_footprint_exceeded(boolean_t warning, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal)
{
	os_reason_t jetsam_reason = OS_REASON_NULL;

	proc_t p = current_proc();

#if VM_PRESSURE_EVENTS
	if (warning == TRUE) {
		/*
		 * This is a warning path which implies that the current process is close, but has
		 * not yet exceeded its per-process memory limit.
		 */
		if (memorystatus_warn_process(p, memlimit_is_active, memlimit_is_fatal, FALSE /* not exceeded */) != TRUE) {
			/* Print warning, since it's possible that task has not registered for pressure notifications */
			memorystatus_log_debug(
				"memorystatus_on_ledger_footprint_exceeded: failed to warn %s [%d] (exiting, or no handler registered?).\n",
				proc_best_name(p), proc_getpid(p));
		}
		return;
	}
#endif /* VM_PRESSURE_EVENTS */

	if (memlimit_is_fatal) {
		/*
		 * If this process has no high watermark or has a fatal task limit, then we have been invoked because the task
		 * has violated either the system-wide per-task memory limit OR its own task limit.
		 */
		jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_PERPROCESSLIMIT);
		if (jetsam_reason == NULL) {
			/* Proceed with the kill anyway; only the exit-reason payload is lost. */
			memorystatus_log_error("task_exceeded footprint: failed to allocate jetsam reason\n");
		} else if (corpse_for_fatal_memkill && proc_send_synchronous_EXC_RESOURCE(p) == FALSE) {
			/* Set OS_REASON_FLAG_GENERATE_CRASH_REPORT to generate corpse */
			jetsam_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
		}

		/* Fatal limit: synchronously kill the current process. */
		if (memorystatus_kill_process_sync(proc_getpid(p), kMemorystatusKilledPerProcessLimit, jetsam_reason) != TRUE) {
			memorystatus_log_error("task_exceeded_footprint: failed to kill the current task (exiting?).\n");
		}
	} else {
		/*
		 * HWM offender exists. Done without locks or synchronization.
		 * See comment near its declaration for more details.
		 */
		os_atomic_store(&memorystatus_hwm_candidates, true, release);
		_memstat_consider_waking_jetsam_thread();

#if VM_PRESSURE_EVENTS
		/*
		 * The current process is not in the warning path.
		 * This path implies the current process has exceeded a non-fatal (soft) memory limit.
		 * Failure to send note is ignored here.
		 */
		(void)memorystatus_warn_process(p, memlimit_is_active, memlimit_is_fatal, TRUE /* exceeded */);

#endif /* VM_PRESSURE_EVENTS */
	}
}
4536
4537 inline void
memorystatus_log_exception(const int max_footprint_mb,boolean_t memlimit_is_active,boolean_t memlimit_is_fatal)4538 memorystatus_log_exception(const int max_footprint_mb, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal)
4539 {
4540 proc_t p = current_proc();
4541
4542 /*
4543 * The limit violation is logged here, but only once per process per limit.
4544 * Soft memory limit is a non-fatal high-water-mark
4545 * Hard memory limit is a fatal custom-task-limit or system-wide per-task memory limit.
4546 */
4547
4548 memorystatus_log("EXC_RESOURCE -> %s[%d] exceeded mem limit: %s%s %d MB (%s)\n",
4549 ((p && *p->p_name) ? p->p_name : "unknown"), (p ? proc_getpid(p) : -1), (memlimit_is_active ? "Active" : "Inactive"),
4550 (memlimit_is_fatal ? "Hard" : "Soft"), max_footprint_mb,
4551 (memlimit_is_fatal ? "fatal" : "non-fatal"));
4552 }
4553
4554 inline void
memorystatus_log_diag_threshold_exception(const int diag_threshold_value)4555 memorystatus_log_diag_threshold_exception(const int diag_threshold_value)
4556 {
4557 proc_t p = current_proc();
4558
4559 /*
4560 * The limit violation is logged here, but only once per process per limit.
4561 * Soft memory limit is a non-fatal high-water-mark
4562 * Hard memory limit is a fatal custom-task-limit or system-wide per-task memory limit.
4563 */
4564
4565 memorystatus_log("EXC_RESOURCE -> %s[%d] exceeded diag threshold limit: %d MB \n",
4566 ((p && *p->p_name) ? p->p_name : "unknown"), (p ? proc_getpid(p) : -1), diag_threshold_value);
4567 }
4568
4569 /*
4570 * Description:
4571 * Evaluates process state to determine which limit
4572 * should be applied (active vs. inactive limit).
4573 *
4574 * Processes that have the 'elevated inactive jetsam band' attribute
4575 * are first evaluated based on their current priority band.
4576 * presently elevated ==> active
4577 *
4578 * Processes that opt into dirty tracking are evaluated
4579 * based on clean vs dirty state.
4580 * dirty ==> active
4581 * clean ==> inactive
4582 *
4583 * Process that do not opt into dirty tracking are
 * evaluated based on priority level.
4585 * Foreground or above ==> active
4586 * Below Foreground ==> inactive
4587 *
4588 * Return: TRUE if active
4589 * False if inactive
4590 */
4591 static bool
_memstat_proc_is_active_locked(proc_t p)4592 _memstat_proc_is_active_locked(proc_t p)
4593 {
4594 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
4595
4596 if (_memstat_proc_is_elevated(p) &&
4597 (p->p_memstat_effectivepriority == JETSAM_PRIORITY_ELEVATED_INACTIVE)) {
4598 /*
4599 * process has the 'elevated inactive jetsam band' attribute
4600 * and process is present in the elevated band
4601 */
4602 return true;
4603 } else if (_memstat_proc_is_tracked(p)) {
4604 /*
4605 * process has opted into dirty tracking
4606 * active state is based on dirty vs. clean
4607 */
4608 if (_memstat_proc_is_dirty(p)) {
4609 /* Dirty */
4610 return true;
4611 } else if (_memstat_proc_can_idle_exit(p) &&
4612 p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) {
4613 /* Clean and Not Idle */
4614 return true;
4615 } else {
4616 /* Clean and Idle */
4617 return false;
4618 }
4619 } else {
4620 return p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND;
4621 }
4622 }
4623
/*
 * Synchronously jetsam a process (or, for victim_pid == -1, the best
 * candidate up to max_kill_priority). On a successful kill, posts the
 * jetsam-snapshot notification, rate-limited by
 * memorystatus_jetsam_snapshot_timeout.
 * Returns TRUE iff a process was killed.
 */
static boolean_t
memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason)
{
	boolean_t res;

	uint32_t errors = 0;
	uint64_t memory_reclaimed = 0;

	if (victim_pid == -1) {
		/* No pid, so kill first process */
		res = memorystatus_kill_top_process(true, true, cause, jetsam_reason,
		    max_kill_priority, false, NULL, &errors, &memory_reclaimed);
	} else {
		res = memorystatus_kill_specific_process(victim_pid, cause, jetsam_reason);
	}

	if (errors) {
		memorystatus_clear_errors();
	}

	if (res == TRUE) {
		/* Fire off snapshot notification */
		proc_list_lock();
		size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
		    sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_count;
		uint64_t timestamp_now = mach_absolute_time();
		memorystatus_jetsam_snapshot->notification_time = timestamp_now;
		/* Only notify if the previous snapshot was consumed or has timed out. */
		if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
		    timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) {
			/* Drop the lock across the note send; retake it to update the timestamp. */
			proc_list_unlock();
			int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
			if (!ret) {
				proc_list_lock();
				memorystatus_jetsam_snapshot_last_timestamp = timestamp_now;
				proc_list_unlock();
			}
		} else {
			proc_list_unlock();
		}
	}

	return res;
}
4667
4668 /*
4669 * Jetsam a specific process.
4670 */
static boolean_t
memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason)
{
	boolean_t killed;
	proc_t p;
	uint64_t killtime = 0;
	uint64_t footprint_of_killed_proc;
	clock_sec_t tv_sec;
	clock_usec_t tv_usec;
	uint32_t tv_msec;

	/* TODO - add a victim queue and push this into the main jetsam thread */

	/* Takes a ref on the proc; released on every exit path below. */
	p = proc_find(victim_pid);
	if (!p) {
		os_reason_free(jetsam_reason);
		return FALSE;
	}

	proc_list_lock();

	if (p->p_memstat_state & P_MEMSTAT_TERMINATED) {
		/*
		 * Someone beat us to this kill.
		 * Nothing to do here.
		 */
		proc_list_unlock();
		os_reason_free(jetsam_reason);
		proc_rele(p);
		return FALSE;
	}
	/* Claim the kill so concurrent killers back off (cleared below if the kill fails). */
	p->p_memstat_state |= P_MEMSTAT_TERMINATED;

	if (memorystatus_jetsam_snapshot_count == 0) {
		memorystatus_init_jetsam_snapshot_locked(NULL, 0);
	}

	killtime = mach_absolute_time();
	absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
	tv_msec = tv_usec / 1000;

	memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);

	proc_list_unlock();

	killed = memorystatus_do_kill(p, cause, jetsam_reason, &footprint_of_killed_proc);

	memorystatus_log("%lu.%03d memorystatus: killing_specific_process pid %d [%s] (%s %d) %lluKB - memorystatus_available_pages: %llu\n",
	    (unsigned long)tv_sec, tv_msec, victim_pid, ((p && *p->p_name) ? p->p_name : "unknown"),
	    memorystatus_kill_cause_name[cause], (p ? p->p_memstat_effectivepriority: -1),
	    footprint_of_killed_proc >> 10, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);

	if (!killed) {
		/* Kill failed: release our claim so another killer may try. */
		proc_list_lock();
		p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
		proc_list_unlock();
	}

	proc_rele(p);

	return killed;
}
4733
4734
4735 /*
4736 * Toggle the P_MEMSTAT_SKIP bit.
4737 * Takes the proc_list_lock.
4738 */
4739 void
proc_memstat_skip(proc_t p,boolean_t set)4740 proc_memstat_skip(proc_t p, boolean_t set)
4741 {
4742 #if DEVELOPMENT || DEBUG
4743 if (p) {
4744 proc_list_lock();
4745 if (set == TRUE) {
4746 p->p_memstat_state |= P_MEMSTAT_SKIP;
4747 } else {
4748 p->p_memstat_state &= ~P_MEMSTAT_SKIP;
4749 }
4750 proc_list_unlock();
4751 }
4752 #else
4753 #pragma unused(p, set)
4754 /*
4755 * do nothing
4756 */
4757 #endif /* DEVELOPMENT || DEBUG */
4758 return;
4759 }
4760
4761
4762 #if CONFIG_JETSAM
4763 /*
4764 * This is invoked when cpulimits have been exceeded while in fatal mode.
4765 * The jetsam_flags do not apply as those are for memory related kills.
4766 * We call this routine so that the offending process is killed with
4767 * a non-zero exit status.
4768 */
4769 void
jetsam_on_ledger_cpulimit_exceeded(void)4770 jetsam_on_ledger_cpulimit_exceeded(void)
4771 {
4772 int retval = 0;
4773 int jetsam_flags = 0; /* make it obvious */
4774 proc_t p = current_proc();
4775 os_reason_t jetsam_reason = OS_REASON_NULL;
4776
4777 memorystatus_log("task_exceeded_cpulimit: killing pid %d [%s]\n", proc_getpid(p), (*p->p_name ? p->p_name : "(unknown)"));
4778
4779 jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_CPULIMIT);
4780 if (jetsam_reason == OS_REASON_NULL) {
4781 memorystatus_log_error("task_exceeded_cpulimit: unable to allocate memory for jetsam reason\n");
4782 }
4783
4784 retval = jetsam_do_kill(p, jetsam_flags, jetsam_reason);
4785
4786 if (retval) {
4787 memorystatus_log_error("task_exceeded_cpulimit: failed to kill current task (exiting?).\n");
4788 }
4789 }
4790
4791 #endif /* CONFIG_JETSAM */
4792
4793 static void
memorystatus_get_task_memory_region_count(task_t task,uint64_t * count)4794 memorystatus_get_task_memory_region_count(task_t task, uint64_t *count)
4795 {
4796 assert(task);
4797 assert(count);
4798
4799 *count = get_task_memory_region_count(task);
4800 }
4801
4802
/* Flags OR'd into memorystatus_vm_map_fork_pidwatch_val to record the map_fork decision. */
#define MEMORYSTATUS_VM_MAP_FORK_ALLOWED 0x100000000
#define MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED 0x200000000
4805
4806 #if DEVELOPMENT || DEBUG
4807
4808 /*
4809 * Sysctl only used to test memorystatus_allowed_vm_map_fork() path.
4810 * set a new pidwatch value
4811 * or
4812 * get the current pidwatch value
4813 *
4814 * The pidwatch_val starts out with a PID to watch for in the map_fork path.
4815 * Its value is:
4816 * - OR'd with MEMORYSTATUS_VM_MAP_FORK_ALLOWED if we allow the map_fork.
4817 * - OR'd with MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED if we disallow the map_fork.
4818 * - set to -1ull if the map_fork() is aborted for other reasons.
4819 */
4820
4821 uint64_t memorystatus_vm_map_fork_pidwatch_val = 0;
4822
/*
 * Handler for kern.memorystatus_vm_map_fork_pidwatch.
 * Read: returns the current pidwatch value (pid + decision flags).
 * Write: arms the watch with a new pid; decision flags are cleared.
 */
static int sysctl_memorystatus_vm_map_fork_pidwatch SYSCTL_HANDLER_ARGS {
#pragma unused(oidp, arg1, arg2)

	uint64_t new_value = 0;
	uint64_t old_value = 0;
	int error = 0;

	/*
	 * The pid is held in the low 32 bits.
	 * The 'allowed' flags are in the upper 32 bits.
	 */
	old_value = memorystatus_vm_map_fork_pidwatch_val;

	error = sysctl_io_number(req, old_value, sizeof(old_value), &new_value, NULL);

	if (error || !req->newptr) {
		/*
		 * No new value passed in.
		 */
		return error;
	}

	/*
	 * A new pid was passed in via req->newptr.
	 * Ignore any attempt to set the higher order bits.
	 */
	memorystatus_vm_map_fork_pidwatch_val = new_value & 0xFFFFFFFF;
	memorystatus_log_debug("memorystatus: pidwatch old_value = 0x%llx, new_value = 0x%llx\n", old_value, new_value);

	return error;
}
4854
/* Registers the test-only pidwatch sysctl (DEVELOPMENT || DEBUG kernels only). */
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_map_fork_pidwatch, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, sysctl_memorystatus_vm_map_fork_pidwatch, "Q", "get/set pid watched for in vm_map_fork");
4857
4858
4859 /*
4860 * Record if a watched process fails to qualify for a vm_map_fork().
4861 */
4862 void
memorystatus_abort_vm_map_fork(task_t task)4863 memorystatus_abort_vm_map_fork(task_t task)
4864 {
4865 if (memorystatus_vm_map_fork_pidwatch_val != 0) {
4866 proc_t p = get_bsdtask_info(task);
4867 if (p != NULL && memorystatus_vm_map_fork_pidwatch_val == (uint64_t)proc_getpid(p)) {
4868 memorystatus_vm_map_fork_pidwatch_val = -1ull;
4869 }
4870 }
4871 }
4872
4873 static void
set_vm_map_fork_pidwatch(task_t task,uint64_t x)4874 set_vm_map_fork_pidwatch(task_t task, uint64_t x)
4875 {
4876 if (memorystatus_vm_map_fork_pidwatch_val != 0) {
4877 proc_t p = get_bsdtask_info(task);
4878 if (p && (memorystatus_vm_map_fork_pidwatch_val == (uint64_t)proc_getpid(p))) {
4879 memorystatus_vm_map_fork_pidwatch_val |= x;
4880 }
4881 }
4882 }
4883
4884 #else /* DEVELOPMENT || DEBUG */
4885
4886
4887 static void
set_vm_map_fork_pidwatch(task_t task,uint64_t x)4888 set_vm_map_fork_pidwatch(task_t task, uint64_t x)
4889 {
4890 #pragma unused(task)
4891 #pragma unused(x)
4892 }
4893
4894 #endif /* DEVELOPMENT || DEBUG */
4895
4896 /*
4897 * Called during EXC_RESOURCE handling when a process exceeds a soft
4898 * memory limit. This is the corpse fork path and here we decide if
4899 * vm_map_fork will be allowed when creating the corpse.
4900 * The task being considered is suspended.
4901 *
4902 * By default, a vm_map_fork is allowed to proceed.
4903 *
4904 * A few simple policy assumptions:
4905 * If the device has a zero system-wide task limit,
4906 * then the vm_map_fork is allowed. macOS always has a zero
 * system wide task limit (unless overridden by a boot-arg).
4908 *
4909 * And if a process's memory footprint calculates less
4910 * than or equal to quarter of the system-wide task limit,
4911 * then the vm_map_fork is allowed. This calculation
4912 * is based on the assumption that a process can
4913 * munch memory up to the system-wide task limit.
4914 *
4915 * For watchOS, which has a low task limit, we use a
4916 * different value. Current task limit has been reduced
4917 * to 300MB and it's been decided the limit should be 200MB.
4918 */
/* Number of in-flight "large" (watchOS >200MB) corpses; bounded by LARGE_CORPSE_LIMIT. */
int large_corpse_count = 0;
boolean_t
memorystatus_allowed_vm_map_fork(task_t task, bool *is_large)
{
	boolean_t is_allowed = TRUE; /* default */
	uint64_t footprint_in_bytes;
	uint64_t max_allowed_bytes;
	thread_t self = current_thread();

	*is_large = false;

	/* Jetsam in high bands blocks any new corpse */
	if (os_atomic_load(&block_corpses, relaxed) != 0) {
		memorystatus_log("memorystatus_allowed_vm_map_fork: corpse for pid %d blocked by jetsam).\n", task_pid(task));
		ktriage_record(thread_tid(self), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_CORPSE, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_CORPSE_BLOCKED_JETSAM), 0 /* arg */);
		return FALSE;
	}

	/* Zero system-wide task limit (the macOS default): always allow. */
	if (max_task_footprint_mb == 0) {
		set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_ALLOWED);
		return is_allowed;
	}

	footprint_in_bytes = get_task_phys_footprint(task);

	/*
	 * Maximum is 1/4 of the system-wide task limit by default.
	 */
	max_allowed_bytes = ((uint64_t)max_task_footprint_mb * 1024 * 1024) >> 2;

#if XNU_TARGET_OS_WATCH
	/*
	 * For watches with > 1G, use a limit of 200MB and allow
	 * one corpse at a time of up to 300MB.
	 */
#define LARGE_CORPSE_LIMIT 1
	if (sane_size > 1 * 1024 * 1024 * 1024) {
		int cnt = large_corpse_count;
		/* CAS on large_corpse_count reserves the single large-corpse slot atomically. */
		if (footprint_in_bytes > 200 * 1024 * 1024 &&
		    footprint_in_bytes <= 300 * 1024 * 1024 &&
		    cnt < LARGE_CORPSE_LIMIT &&
		    OSCompareAndSwap(cnt, cnt + 1, &large_corpse_count)) {
			*is_large = true;
			max_allowed_bytes = MAX(max_allowed_bytes, 300 * 1024 * 1024);
		} else {
			max_allowed_bytes = MAX(max_allowed_bytes, 200 * 1024 * 1024);
		}
	}
#endif /* XNU_TARGET_OS_WATCH */

#if DEBUG || DEVELOPMENT
	/* Test override: allow corpses up to the full system-wide task limit. */
	if (corpse_threshold_system_limit) {
		max_allowed_bytes = (uint64_t)max_task_footprint_mb * (1UL << 20);
	}
#endif /* DEBUG || DEVELOPMENT */

	if (footprint_in_bytes > max_allowed_bytes) {
		memorystatus_log("memorystatus disallowed vm_map_fork %lld %lld\n", footprint_in_bytes, max_allowed_bytes);
		set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED);
		ktriage_record(thread_tid(self), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_CORPSE, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_CORPSE_PROC_TOO_BIG), 0 /* arg */);
		return !is_allowed;
	}

	set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_ALLOWED);
	return is_allowed;
}
4985
4986 void
memorystatus_get_task_page_counts(task_t task,uint32_t * footprint,uint32_t * max_footprint_lifetime,uint32_t * purgeable_pages)4987 memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages)
4988 {
4989 assert(task);
4990 assert(footprint);
4991
4992 uint64_t pages;
4993
4994 pages = (get_task_phys_footprint(task) / PAGE_SIZE_64);
4995 assert(((uint32_t)pages) == pages);
4996 *footprint = (uint32_t)pages;
4997
4998 if (max_footprint_lifetime) {
4999 pages = (get_task_phys_footprint_lifetime_max(task) / PAGE_SIZE_64);
5000 assert(((uint32_t)pages) == pages);
5001 *max_footprint_lifetime = (uint32_t)pages;
5002 }
5003 if (purgeable_pages) {
5004 pages = (get_task_purgeable_size(task) / PAGE_SIZE_64);
5005 assert(((uint32_t)pages) == pages);
5006 *purgeable_pages = (uint32_t)pages;
5007 }
5008 }
5009
5010 static void
memorystatus_get_task_phys_footprint_page_counts(task_t task,uint64_t * internal_pages,uint64_t * internal_compressed_pages,uint64_t * purgeable_nonvolatile_pages,uint64_t * purgeable_nonvolatile_compressed_pages,uint64_t * alternate_accounting_pages,uint64_t * alternate_accounting_compressed_pages,uint64_t * iokit_mapped_pages,uint64_t * page_table_pages,uint64_t * frozen_to_swap_pages,uint64_t * neural_nofootprint_total_pages)5011 memorystatus_get_task_phys_footprint_page_counts(task_t task,
5012 uint64_t *internal_pages, uint64_t *internal_compressed_pages,
5013 uint64_t *purgeable_nonvolatile_pages, uint64_t *purgeable_nonvolatile_compressed_pages,
5014 uint64_t *alternate_accounting_pages, uint64_t *alternate_accounting_compressed_pages,
5015 uint64_t *iokit_mapped_pages, uint64_t *page_table_pages, uint64_t *frozen_to_swap_pages,
5016 uint64_t *neural_nofootprint_total_pages)
5017 {
5018 assert(task);
5019
5020 if (internal_pages) {
5021 *internal_pages = (get_task_internal(task) / PAGE_SIZE_64);
5022 }
5023
5024 if (internal_compressed_pages) {
5025 *internal_compressed_pages = (get_task_internal_compressed(task) / PAGE_SIZE_64);
5026 }
5027
5028 if (purgeable_nonvolatile_pages) {
5029 *purgeable_nonvolatile_pages = (get_task_purgeable_nonvolatile(task) / PAGE_SIZE_64);
5030 }
5031
5032 if (purgeable_nonvolatile_compressed_pages) {
5033 *purgeable_nonvolatile_compressed_pages = (get_task_purgeable_nonvolatile_compressed(task) / PAGE_SIZE_64);
5034 }
5035
5036 if (alternate_accounting_pages) {
5037 *alternate_accounting_pages = (get_task_alternate_accounting(task) / PAGE_SIZE_64);
5038 }
5039
5040 if (alternate_accounting_compressed_pages) {
5041 *alternate_accounting_compressed_pages = (get_task_alternate_accounting_compressed(task) / PAGE_SIZE_64);
5042 }
5043
5044 if (iokit_mapped_pages) {
5045 *iokit_mapped_pages = (get_task_iokit_mapped(task) / PAGE_SIZE_64);
5046 }
5047
5048 if (page_table_pages) {
5049 *page_table_pages = (get_task_page_table(task) / PAGE_SIZE_64);
5050 }
5051
5052 if (neural_nofootprint_total_pages) {
5053 *neural_nofootprint_total_pages = (get_task_neural_nofootprint_total(task) / PAGE_SIZE_64);
5054 }
5055
5056 #if CONFIG_FREEZE
5057 if (frozen_to_swap_pages) {
5058 *frozen_to_swap_pages = (get_task_frozen_to_swap(task) / PAGE_SIZE_64);
5059 }
5060 #else /* CONFIG_FREEZE */
5061 #pragma unused(frozen_to_swap_pages)
5062 #endif /* CONFIG_FREEZE */
5063 }
5064
5065 #if CONFIG_FREEZE
5066 /*
5067 * Copies the source entry into the destination snapshot.
5068 * Returns true on success. Fails if the destination snapshot is full.
5069 * Caller must hold the proc list lock.
5070 */
5071 static bool
memorystatus_jetsam_snapshot_copy_entry_locked(memorystatus_jetsam_snapshot_t * dst_snapshot,unsigned int dst_snapshot_size,const memorystatus_jetsam_snapshot_entry_t * src_entry)5072 memorystatus_jetsam_snapshot_copy_entry_locked(memorystatus_jetsam_snapshot_t *dst_snapshot, unsigned int dst_snapshot_size, const memorystatus_jetsam_snapshot_entry_t *src_entry)
5073 {
5074 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
5075 assert(dst_snapshot);
5076
5077 if (dst_snapshot->entry_count == dst_snapshot_size) {
5078 /* Destination snapshot is full. Can not be updated until it is consumed. */
5079 return false;
5080 }
5081 if (dst_snapshot->entry_count == 0) {
5082 memorystatus_init_jetsam_snapshot_header(dst_snapshot);
5083 }
5084 memorystatus_jetsam_snapshot_entry_t *dst_entry = &dst_snapshot->entries[dst_snapshot->entry_count++];
5085 memcpy(dst_entry, src_entry, sizeof(memorystatus_jetsam_snapshot_entry_t));
5086 return true;
5087 }
5088 #endif /* CONFIG_FREEZE */
5089
5090 static bool
memorystatus_init_jetsam_snapshot_entry_with_kill_locked(memorystatus_jetsam_snapshot_t * snapshot,proc_t p,uint32_t kill_cause,uint64_t killtime,memorystatus_jetsam_snapshot_entry_t ** entry)5091 memorystatus_init_jetsam_snapshot_entry_with_kill_locked(memorystatus_jetsam_snapshot_t *snapshot, proc_t p, uint32_t kill_cause, uint64_t killtime, memorystatus_jetsam_snapshot_entry_t **entry)
5092 {
5093 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
5094 memorystatus_jetsam_snapshot_entry_t *snapshot_list = snapshot->entries;
5095 size_t i = snapshot->entry_count;
5096
5097 if (memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[i], (snapshot->js_gencount)) == TRUE) {
5098 *entry = &snapshot_list[i];
5099 (*entry)->killed = kill_cause;
5100 (*entry)->jse_killtime = killtime;
5101
5102 snapshot->entry_count = i + 1;
5103 return true;
5104 }
5105 return false;
5106 }
5107
5108 /*
5109 * This routine only acts on the global jetsam event snapshot.
5110 * Updating the process's entry can race when the memorystatus_thread
5111 * has chosen to kill a process that is racing to exit on another core.
5112 */
5113 static void
memorystatus_update_jetsam_snapshot_entry_locked(proc_t p,uint32_t kill_cause,uint64_t killtime)5114 memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime)
5115 {
5116 memorystatus_jetsam_snapshot_entry_t *entry = NULL;
5117 memorystatus_jetsam_snapshot_t *snapshot = NULL;
5118 memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL;
5119
5120 unsigned int i;
5121 #if CONFIG_FREEZE
5122 bool copied_to_freezer_snapshot = false;
5123 #endif /* CONFIG_FREEZE */
5124
5125 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
5126
5127 if (memorystatus_jetsam_snapshot_count == 0) {
5128 /*
5129 * No active snapshot.
5130 * Nothing to do.
5131 */
5132 goto exit;
5133 }
5134
5135 /*
5136 * Sanity check as this routine should only be called
5137 * from a jetsam kill path.
5138 */
5139 assert(kill_cause != 0 && killtime != 0);
5140
5141 snapshot = memorystatus_jetsam_snapshot;
5142 snapshot_list = memorystatus_jetsam_snapshot->entries;
5143
5144 #if JETSAM_ZPRINT_SNAPSHOT
5145 /*
5146 * Collect the snapshot zprint info if we've reached the right priority
5147 */
5148 if (p->p_memstat_effectivepriority >= (int)jzs_trigger_band &&
5149 jzs_gencount != snapshot->js_gencount) {
5150 memorystatus_collect_jetsam_snapshot_zprint();
5151 jzs_gencount = snapshot->js_gencount;
5152 }
5153 #endif
5154
5155 for (i = 0; i < memorystatus_jetsam_snapshot_count; i++) {
5156 if (snapshot_list[i].pid == proc_getpid(p)) {
5157 entry = &snapshot_list[i];
5158
5159 if (entry->killed || entry->jse_killtime) {
5160 /*
5161 * We apparently raced on the exit path
5162 * for this process, as it's snapshot entry
5163 * has already recorded a kill.
5164 */
5165 assert(entry->killed && entry->jse_killtime);
5166 break;
5167 }
5168
5169 /*
5170 * Update the entry we just found in the snapshot.
5171 */
5172
5173 entry->killed = kill_cause;
5174 entry->jse_killtime = killtime;
5175 entry->jse_gencount = snapshot->js_gencount;
5176 entry->jse_idle_delta = p->p_memstat_idle_delta;
5177 #if CONFIG_FREEZE
5178 entry->jse_thaw_count = p->p_memstat_thaw_count;
5179 entry->jse_freeze_skip_reason = p->p_memstat_freeze_skip_reason;
5180 #else /* CONFIG_FREEZE */
5181 entry->jse_thaw_count = 0;
5182 entry->jse_freeze_skip_reason = kMemorystatusFreezeSkipReasonNone;
5183 #endif /* CONFIG_FREEZE */
5184
5185 /*
5186 * If a process has moved between bands since snapshot was
5187 * initialized, then likely these fields changed too.
5188 */
5189 if (entry->priority != p->p_memstat_effectivepriority) {
5190 strlcpy(entry->name, p->p_name, sizeof(entry->name));
5191 entry->priority = p->p_memstat_effectivepriority;
5192 entry->state = memorystatus_build_state(p);
5193 entry->user_data = p->p_memstat_userdata;
5194 entry->fds = p->p_fd.fd_nfiles;
5195 }
5196
5197 /*
5198 * Always update the page counts on a kill.
5199 */
5200
5201 uint32_t pages = 0;
5202 uint32_t max_pages_lifetime = 0;
5203 uint32_t purgeable_pages = 0;
5204
5205 memorystatus_get_task_page_counts(proc_task(p), &pages, &max_pages_lifetime, &purgeable_pages);
5206 entry->pages = (uint64_t)pages;
5207 entry->max_pages_lifetime = (uint64_t)max_pages_lifetime;
5208 entry->purgeable_pages = (uint64_t)purgeable_pages;
5209
5210 uint64_t internal_pages = 0;
5211 uint64_t internal_compressed_pages = 0;
5212 uint64_t purgeable_nonvolatile_pages = 0;
5213 uint64_t purgeable_nonvolatile_compressed_pages = 0;
5214 uint64_t alternate_accounting_pages = 0;
5215 uint64_t alternate_accounting_compressed_pages = 0;
5216 uint64_t iokit_mapped_pages = 0;
5217 uint64_t page_table_pages = 0;
5218 uint64_t frozen_to_swap_pages = 0;
5219 uint64_t neural_nofootprint_total_pages = 0;
5220
5221 memorystatus_get_task_phys_footprint_page_counts(proc_task(p), &internal_pages, &internal_compressed_pages,
5222 &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages,
5223 &alternate_accounting_pages, &alternate_accounting_compressed_pages,
5224 &iokit_mapped_pages, &page_table_pages, &frozen_to_swap_pages, &neural_nofootprint_total_pages);
5225
5226 entry->jse_internal_pages = internal_pages;
5227 entry->jse_internal_compressed_pages = internal_compressed_pages;
5228 entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages;
5229 entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages;
5230 entry->jse_alternate_accounting_pages = alternate_accounting_pages;
5231 entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages;
5232 entry->jse_iokit_mapped_pages = iokit_mapped_pages;
5233 entry->jse_page_table_pages = page_table_pages;
5234 entry->jse_frozen_to_swap_pages = frozen_to_swap_pages;
5235 entry->jse_neural_nofootprint_total_pages = neural_nofootprint_total_pages;
5236
5237 uint64_t region_count = 0;
5238 memorystatus_get_task_memory_region_count(proc_task(p), ®ion_count);
5239 entry->jse_memory_region_count = region_count;
5240 entry->csflags = proc_getcsflags(p);
5241 goto exit;
5242 }
5243 }
5244
5245 if (entry == NULL) {
5246 /*
5247 * The entry was not found in the snapshot, so the process must have
5248 * launched after the snapshot was initialized.
5249 * Let's try to append the new entry.
5250 */
5251 if (memorystatus_jetsam_snapshot_count < memorystatus_jetsam_snapshot_max) {
5252 /*
5253 * A populated snapshot buffer exists
5254 * and there is room to init a new entry.
5255 */
5256 assert(memorystatus_jetsam_snapshot_count == snapshot->entry_count);
5257
5258 if (memorystatus_init_jetsam_snapshot_entry_with_kill_locked(snapshot, p, kill_cause, killtime, &entry)) {
5259 memorystatus_jetsam_snapshot_count++;
5260
5261 if (memorystatus_jetsam_snapshot_count >= memorystatus_jetsam_snapshot_max) {
5262 /*
5263 * We just used the last slot in the snapshot buffer.
5264 * We only want to log it once... so we do it here
5265 * when we notice we've hit the max.
5266 */
5267 memorystatus_log_error("memorystatus: WARNING snapshot buffer is full, count %d\n", memorystatus_jetsam_snapshot_count);
5268 }
5269 }
5270 }
5271 }
5272
5273 exit:
5274 if (entry) {
5275 #if CONFIG_FREEZE
5276 if (memorystatus_jetsam_use_freezer_snapshot && isApp(p)) {
5277 /* This is an app kill. Record it in the freezer snapshot so dasd can incorporate this in its recommendations. */
5278 copied_to_freezer_snapshot = memorystatus_jetsam_snapshot_copy_entry_locked(memorystatus_jetsam_snapshot_freezer, memorystatus_jetsam_snapshot_freezer_max, entry);
5279 if (copied_to_freezer_snapshot && memorystatus_jetsam_snapshot_freezer->entry_count == memorystatus_jetsam_snapshot_freezer_max) {
5280 /*
5281 * We just used the last slot in the freezer snapshot buffer.
5282 * We only want to log it once... so we do it here
5283 * when we notice we've hit the max.
5284 */
5285 memorystatus_log_error("memorystatus: WARNING freezer snapshot buffer is full, count %zu\n",
5286 memorystatus_jetsam_snapshot_freezer->entry_count);
5287 }
5288 }
5289 #endif /* CONFIG_FREEZE */
5290 } else {
5291 /*
5292 * If we reach here, the snapshot buffer could not be updated.
5293 * Most likely, the buffer is full, in which case we would have
5294 * logged a warning in the previous call.
5295 *
5296 * For now, we will stop appending snapshot entries.
5297 * When the buffer is consumed, the snapshot state will reset.
5298 */
5299
5300 memorystatus_log_error(
5301 "memorystatus_update_jetsam_snapshot_entry_locked: failed to update pid %d, priority %d, count %d\n",
5302 proc_getpid(p), p->p_memstat_effectivepriority, memorystatus_jetsam_snapshot_count);
5303
5304 #if CONFIG_FREEZE
5305 /* We still attempt to record this in the freezer snapshot */
5306 if (memorystatus_jetsam_use_freezer_snapshot && isApp(p)) {
5307 snapshot = memorystatus_jetsam_snapshot_freezer;
5308 if (snapshot->entry_count < memorystatus_jetsam_snapshot_freezer_max) {
5309 copied_to_freezer_snapshot = memorystatus_init_jetsam_snapshot_entry_with_kill_locked(snapshot, p, kill_cause, killtime, &entry);
5310 if (copied_to_freezer_snapshot && memorystatus_jetsam_snapshot_freezer->entry_count == memorystatus_jetsam_snapshot_freezer_max) {
5311 /*
5312 * We just used the last slot in the freezer snapshot buffer.
5313 * We only want to log it once... so we do it here
5314 * when we notice we've hit the max.
5315 */
5316 memorystatus_log_error("memorystatus: WARNING freezer snapshot buffer is full, count %zu\n",
5317 memorystatus_jetsam_snapshot_freezer->entry_count);
5318 }
5319 }
5320 }
5321 #endif /* CONFIG_FREEZE */
5322 }
5323
5324 return;
5325 }
5326
5327 uint32_t
memorystatus_get_available_page_count(void)5328 memorystatus_get_available_page_count(void)
5329 {
5330 return os_atomic_load(&memorystatus_available_pages, relaxed);
5331 }
5332
/*
 * Publish a new available-page count and react to the change:
 * re-evaluate VM pressure, consider waking the freezer thread, and
 * consider waking the jetsam thread.
 *
 * NOTE(review): callers may arrive with page-queue locks held and
 * preemption disabled (see comment below), so this path must not block.
 */
void
memorystatus_update_available_page_count(uint32_t available_page_count)
{
	/* Relaxed store pairs with the relaxed load in memorystatus_get_available_page_count(). */
	os_atomic_store(&memorystatus_available_pages, available_page_count,
	    relaxed);
#if VM_PRESSURE_EVENTS
	/*
	 * Since memorystatus_available_pages changes, we should
	 * re-evaluate the pressure levels on the system and
	 * check if we need to wake the pressure thread.
	 * We also update memorystatus_level in that routine.
	 */
	vm_pressure_response();
#endif /* VM_PRESSURE_EVENTS */
#if CONFIG_FREEZE
	/*
	 * We can't grab the freezer_mutex here even though that synchronization would be correct to inspect
	 * the # of frozen processes and wakeup the freezer thread. Reason being that we come here into this
	 * code with (possibly) the page-queue locks held and preemption disabled. So trying to grab a mutex here
	 * will result in the "mutex with preemption disabled" panic.
	 */

	if (memorystatus_freeze_thread_should_run()) {
		/*
		 * The freezer thread is usually woken up by some user-space call i.e. pid_hibernate(any process).
		 * That trigger isn't invoked often enough and so we are enabling this explicit wakeup here.
		 */
		if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
			thread_wakeup((event_t)&memorystatus_freeze_wakeup);
		}
	}
#endif /* CONFIG_FREEZE */
	/* Jetsam thread wakeup is safe here (no mutex acquisition). */
	_memstat_consider_waking_jetsam_thread();
}
5367
5368 static boolean_t
memorystatus_init_jetsam_snapshot_entry_locked(proc_t p,memorystatus_jetsam_snapshot_entry_t * entry,uint64_t gencount)5369 memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry, uint64_t gencount)
5370 {
5371 clock_sec_t tv_sec;
5372 clock_usec_t tv_usec;
5373 uint32_t pages = 0;
5374 uint32_t max_pages_lifetime = 0;
5375 uint32_t purgeable_pages = 0;
5376 uint64_t internal_pages = 0;
5377 uint64_t internal_compressed_pages = 0;
5378 uint64_t purgeable_nonvolatile_pages = 0;
5379 uint64_t purgeable_nonvolatile_compressed_pages = 0;
5380 uint64_t alternate_accounting_pages = 0;
5381 uint64_t alternate_accounting_compressed_pages = 0;
5382 uint64_t iokit_mapped_pages = 0;
5383 uint64_t page_table_pages = 0;
5384 uint64_t frozen_to_swap_pages = 0;
5385 uint64_t neural_nofootprint_total_pages = 0;
5386 uint64_t region_count = 0;
5387 uint64_t cids[COALITION_NUM_TYPES];
5388 uint32_t trust = 0;
5389 kern_return_t ret = 0;
5390 memset(entry, 0, sizeof(memorystatus_jetsam_snapshot_entry_t));
5391
5392 entry->pid = proc_getpid(p);
5393 strlcpy(&entry->name[0], p->p_name, sizeof(entry->name));
5394 entry->priority = p->p_memstat_effectivepriority;
5395
5396 memorystatus_get_task_page_counts(proc_task(p), &pages, &max_pages_lifetime, &purgeable_pages);
5397 entry->pages = (uint64_t)pages;
5398 entry->max_pages_lifetime = (uint64_t)max_pages_lifetime;
5399 entry->purgeable_pages = (uint64_t)purgeable_pages;
5400
5401 memorystatus_get_task_phys_footprint_page_counts(proc_task(p), &internal_pages, &internal_compressed_pages,
5402 &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages,
5403 &alternate_accounting_pages, &alternate_accounting_compressed_pages,
5404 &iokit_mapped_pages, &page_table_pages, &frozen_to_swap_pages, &neural_nofootprint_total_pages);
5405
5406 entry->jse_internal_pages = internal_pages;
5407 entry->jse_internal_compressed_pages = internal_compressed_pages;
5408 entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages;
5409 entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages;
5410 entry->jse_alternate_accounting_pages = alternate_accounting_pages;
5411 entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages;
5412 entry->jse_iokit_mapped_pages = iokit_mapped_pages;
5413 entry->jse_page_table_pages = page_table_pages;
5414 entry->jse_frozen_to_swap_pages = frozen_to_swap_pages;
5415 entry->jse_neural_nofootprint_total_pages = neural_nofootprint_total_pages;
5416
5417 memorystatus_get_task_memory_region_count(proc_task(p), ®ion_count);
5418 entry->jse_memory_region_count = region_count;
5419
5420 entry->state = memorystatus_build_state(p);
5421 entry->user_data = p->p_memstat_userdata;
5422 proc_getexecutableuuid(p, &entry->uuid[0], sizeof(entry->uuid));
5423 entry->fds = p->p_fd.fd_nfiles;
5424
5425 absolutetime_to_microtime(get_task_cpu_time(proc_task(p)), &tv_sec, &tv_usec);
5426 entry->cpu_time.tv_sec = (int64_t)tv_sec;
5427 entry->cpu_time.tv_usec = (int64_t)tv_usec;
5428
5429 assert(p->p_stats != NULL);
5430 entry->jse_starttime = p->p_stats->ps_start; /* abstime process started */
5431 entry->jse_killtime = 0; /* abstime jetsam chose to kill process */
5432 entry->killed = 0; /* the jetsam kill cause */
5433 entry->jse_gencount = gencount; /* indicates a pass through jetsam thread, when process was targeted to be killed */
5434
5435 entry->jse_idle_delta = p->p_memstat_idle_delta; /* Most recent timespan spent in idle-band */
5436
5437 #if CONFIG_FREEZE
5438 entry->jse_freeze_skip_reason = p->p_memstat_freeze_skip_reason;
5439 entry->jse_thaw_count = p->p_memstat_thaw_count;
5440 #else /* CONFIG_FREEZE */
5441 entry->jse_thaw_count = 0;
5442 entry->jse_freeze_skip_reason = kMemorystatusFreezeSkipReasonNone;
5443 #endif /* CONFIG_FREEZE */
5444
5445 proc_coalitionids(p, cids);
5446 entry->jse_coalition_jetsam_id = cids[COALITION_TYPE_JETSAM];
5447 entry->csflags = proc_getcsflags(p);
5448 ret = get_trust_level_kdp(get_task_pmap(proc_task(p)), &trust);
5449 if (ret != KERN_SUCCESS) {
5450 trust = KCDATA_INVALID_CS_TRUST_LEVEL;
5451 }
5452 entry->cs_trust_level = trust;
5453 return TRUE;
5454 }
5455
5456 static void
memorystatus_init_snapshot_vmstats(memorystatus_jetsam_snapshot_t * snapshot)5457 memorystatus_init_snapshot_vmstats(memorystatus_jetsam_snapshot_t *snapshot)
5458 {
5459 kern_return_t kr = KERN_SUCCESS;
5460 mach_msg_type_number_t count = HOST_VM_INFO64_COUNT;
5461 vm_statistics64_data_t vm_stat;
5462
5463 if ((kr = host_statistics64(host_self(), HOST_VM_INFO64, (host_info64_t)&vm_stat, &count)) != KERN_SUCCESS) {
5464 memorystatus_log_error("memorystatus_init_jetsam_snapshot_stats: host_statistics64 failed with %d\n", kr);
5465 memset(&snapshot->stats, 0, sizeof(snapshot->stats));
5466 } else {
5467 snapshot->stats.free_pages = vm_stat.free_count;
5468 snapshot->stats.active_pages = vm_stat.active_count;
5469 snapshot->stats.inactive_pages = vm_stat.inactive_count;
5470 snapshot->stats.throttled_pages = vm_stat.throttled_count;
5471 snapshot->stats.purgeable_pages = vm_stat.purgeable_count;
5472 snapshot->stats.wired_pages = vm_stat.wire_count;
5473
5474 snapshot->stats.speculative_pages = vm_stat.speculative_count;
5475 snapshot->stats.filebacked_pages = vm_stat.external_page_count;
5476 snapshot->stats.anonymous_pages = vm_stat.internal_page_count;
5477 snapshot->stats.compressions = vm_stat.compressions;
5478 snapshot->stats.decompressions = vm_stat.decompressions;
5479 snapshot->stats.compressor_pages = vm_stat.compressor_page_count;
5480 snapshot->stats.total_uncompressed_pages_in_compressor = vm_stat.total_uncompressed_pages_in_compressor;
5481 }
5482
5483 get_zone_map_size(&snapshot->stats.zone_map_size, &snapshot->stats.zone_map_capacity);
5484
5485 bzero(snapshot->stats.largest_zone_name, sizeof(snapshot->stats.largest_zone_name));
5486 get_largest_zone_info(snapshot->stats.largest_zone_name, sizeof(snapshot->stats.largest_zone_name),
5487 &snapshot->stats.largest_zone_size);
5488 }
5489
5490 /*
5491 * Collect vm statistics at boot.
5492 * Called only once (see kern_exec.c)
5493 * Data can be consumed at any time.
5494 */
5495 void
memorystatus_init_at_boot_snapshot()5496 memorystatus_init_at_boot_snapshot()
5497 {
5498 memorystatus_init_snapshot_vmstats(&memorystatus_at_boot_snapshot);
5499 memorystatus_at_boot_snapshot.entry_count = 0;
5500 memorystatus_at_boot_snapshot.notification_time = 0; /* updated when consumed */
5501 memorystatus_at_boot_snapshot.snapshot_time = mach_absolute_time();
5502 }
5503
5504 static void
memorystatus_init_jetsam_snapshot_header(memorystatus_jetsam_snapshot_t * snapshot)5505 memorystatus_init_jetsam_snapshot_header(memorystatus_jetsam_snapshot_t *snapshot)
5506 {
5507 memorystatus_init_snapshot_vmstats(snapshot);
5508 snapshot->snapshot_time = mach_absolute_time();
5509 snapshot->notification_time = 0;
5510 snapshot->js_gencount = 0;
5511 }
5512
/*
 * (Re)build a jetsam snapshot by walking every process in priority order.
 *
 * If 'od_snapshot' is non-NULL this fills an on-demand snapshot capped at
 * 'ods_list_count' entries; otherwise it rebuilds the global jetsam event
 * snapshot (and updates memorystatus_jetsam_snapshot_count).
 * Caller must hold the proc_list lock.
 */
static void
memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snapshot, uint32_t ods_list_count)
{
	proc_t p, next_p;
	unsigned int b = 0, i = 0;

	memorystatus_jetsam_snapshot_t *snapshot = NULL;
	memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL;
	unsigned int snapshot_max = 0;

	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);

	if (od_snapshot) {
		/*
		 * This is an on_demand snapshot
		 */
		snapshot = od_snapshot;
		snapshot_list = od_snapshot->entries;
		snapshot_max = ods_list_count;
	} else {
		/*
		 * This is a jetsam event snapshot
		 */
		snapshot = memorystatus_jetsam_snapshot;
		snapshot_list = memorystatus_jetsam_snapshot->entries;
		snapshot_max = memorystatus_jetsam_snapshot_max;
	}

	memorystatus_init_jetsam_snapshot_header(snapshot);

	/* Walk all memorystatus buckets; 'b' tracks the current bucket. */
	next_p = memorystatus_get_first_proc_locked(&b, TRUE);
	while (next_p) {
		p = next_p;
		next_p = memorystatus_get_next_proc_locked(&b, p, TRUE);

		if (FALSE == memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[i], snapshot->js_gencount)) {
			continue;
		}

		if (++i == snapshot_max) {
			break;
		}
	}

	/* Log launchd and kernel_task as well to see more context, even though jetsam doesn't apply to them. */
	if (i < snapshot_max) {
		memorystatus_init_jetsam_snapshot_entry_locked(initproc, &snapshot_list[i], snapshot->js_gencount);
		i++;
	}

	if (i < snapshot_max) {
		memorystatus_init_jetsam_snapshot_entry_locked(kernproc, &snapshot_list[i], snapshot->js_gencount);
		i++;
	}

	snapshot->entry_count = i;

	if (!od_snapshot) {
		/* update the system buffer count */
		memorystatus_jetsam_snapshot_count = i;
	}
}
5575
5576 #if DEVELOPMENT || DEBUG
5577
5578 /*
5579 * Verify that the given bucket has been sorted correctly.
5580 *
5581 * Walks through the bucket and verifies that all pids in the
5582 * expected_order buffer are in that bucket and in the same
5583 * relative order.
5584 *
5585 * The proc_list_lock must be held by the caller.
5586 */
5587 static int
memorystatus_verify_sort_order(unsigned int bucket_index,pid_t * expected_order,size_t num_pids)5588 memorystatus_verify_sort_order(unsigned int bucket_index, pid_t *expected_order, size_t num_pids)
5589 {
5590 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
5591
5592 int error = 0;
5593 proc_t p = NULL;
5594 size_t i = 0;
5595
5596 /*
5597 * NB: We allow other procs to be mixed in within the expected ones.
5598 * We just need the expected procs to be in the right order relative to each other.
5599 */
5600 p = memorystatus_get_first_proc_locked(&bucket_index, FALSE);
5601 while (p) {
5602 if (proc_getpid(p) == expected_order[i]) {
5603 i++;
5604 }
5605 if (i == num_pids) {
5606 break;
5607 }
5608 p = memorystatus_get_next_proc_locked(&bucket_index, p, FALSE);
5609 }
5610 if (i != num_pids) {
5611 char buffer[128];
5612 size_t len = sizeof(buffer);
5613 size_t buffer_idx = 0;
5614 memorystatus_log_error("memorystatus_verify_sort_order: Processes in bucket %d were not sorted properly\n", bucket_index);
5615 for (i = 0; i < num_pids; i++) {
5616 int num_written = snprintf(buffer + buffer_idx, len - buffer_idx, "%d,", expected_order[i]);
5617 if (num_written <= 0) {
5618 break;
5619 }
5620 if (buffer_idx + (unsigned int) num_written >= len) {
5621 break;
5622 }
5623 buffer_idx += num_written;
5624 }
5625 memorystatus_log_error("memorystatus_verify_sort_order: Expected order [%s]\n", buffer);
5626 memset(buffer, 0, len);
5627 buffer_idx = 0;
5628 p = memorystatus_get_first_proc_locked(&bucket_index, FALSE);
5629 i = 0;
5630 memorystatus_log_error("memorystatus_verify_sort_order: Actual order:\n");
5631 while (p) {
5632 int num_written;
5633 if (buffer_idx == 0) {
5634 num_written = snprintf(buffer + buffer_idx, len - buffer_idx, "%zu: %d,", i, proc_getpid(p));
5635 } else {
5636 num_written = snprintf(buffer + buffer_idx, len - buffer_idx, "%d,", proc_getpid(p));
5637 }
5638 if (num_written <= 0) {
5639 break;
5640 }
5641 buffer_idx += (unsigned int) num_written;
5642 assert(buffer_idx <= len);
5643 if (i % 10 == 0) {
5644 memorystatus_log_error("memorystatus_verify_sort_order: %s\n", buffer);
5645 buffer_idx = 0;
5646 }
5647 p = memorystatus_get_next_proc_locked(&bucket_index, p, FALSE);
5648 i++;
5649 }
5650 if (buffer_idx != 0) {
5651 memorystatus_log_error("memorystatus_verify_sort_order: %s\n", buffer);
5652 }
5653 error = EINVAL;
5654 }
5655 return error;
5656 }
5657
5658 /*
5659 * Triggers a sort_order on a specified jetsam priority band.
5660 * This is for testing only, used to force a path through the sort
5661 * function.
5662 */
5663 static int
memorystatus_cmd_test_jetsam_sort(int priority,int sort_order,user_addr_t expected_order_user,size_t expected_order_user_len)5664 memorystatus_cmd_test_jetsam_sort(int priority,
5665 int sort_order,
5666 user_addr_t expected_order_user,
5667 size_t expected_order_user_len)
5668 {
5669 int error = 0;
5670 unsigned int bucket_index = 0;
5671 const size_t kMaxPids = 8;
5672 pid_t expected_order[kMaxPids];
5673 size_t copy_size = sizeof(expected_order);
5674 size_t num_pids;
5675
5676 if (expected_order_user_len < copy_size) {
5677 copy_size = expected_order_user_len;
5678 }
5679 num_pids = copy_size / sizeof(pid_t);
5680
5681 error = copyin(expected_order_user, expected_order, copy_size);
5682 if (error != 0) {
5683 return error;
5684 }
5685
5686 if (priority == -1) {
5687 /* Use as shorthand for default priority */
5688 bucket_index = JETSAM_PRIORITY_DEFAULT;
5689 } else {
5690 bucket_index = (unsigned int)priority;
5691 }
5692
5693 /*
5694 * Acquire lock before sorting so we can check the sort order
5695 * while still holding the lock.
5696 */
5697 proc_list_lock();
5698
5699 memorystatus_sort_bucket_locked(bucket_index, sort_order);
5700
5701 if (expected_order_user != CAST_USER_ADDR_T(NULL) && expected_order_user_len > 0) {
5702 error = memorystatus_verify_sort_order(bucket_index, expected_order, num_pids);
5703 }
5704
5705 proc_list_unlock();
5706
5707 return error;
5708 }
5709
5710 #endif /* DEVELOPMENT || DEBUG */
5711
5712 /*
5713 * Prepare the process to be killed (set state, update snapshot) and kill it.
5714 */
5715 static uint64_t memorystatus_purge_before_jetsam_success = 0;
5716
5717 #if SOCKETS
/*
 * Notify networking subsystems (NECP fds and Skywalk channels) held by 'p'
 * of a memorystatus event 'status', so they can shed memory before a kill.
 * Iterates the process's open files under the fd lock. Always returns 1.
 */
static int
networking_memstatus_callout(proc_t p, uint32_t status)
{
	struct fileproc *fp;

	/*
	 * proc list lock NOT held
	 * proc lock NOT held
	 * a reference on the proc has been held / shall be dropped by the caller.
	 */
	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED);
	LCK_MTX_ASSERT(&p->p_mlock, LCK_MTX_ASSERT_NOTOWNED);

	proc_fdlock(p);

	fdt_foreach(fp, p) {
		/* Only network-policy and channel fds care about memstatus events. */
		switch (FILEGLOB_DTYPE(fp->fp_glob)) {
#if NECP
		case DTYPE_NETPOLICY:
			necp_fd_memstatus(p, status,
			    (struct necp_fd_data *)fp_get_data(fp));
			break;
#endif /* NECP */
#if SKYWALK
		case DTYPE_CHANNEL:
			kern_channel_memstatus(p, status,
			    (struct kern_channel *)fp_get_data(fp));
			break;
#endif /* SKYWALK */
		default:
			break;
		}
	}
	proc_fdunlock(p);

	return 1;
}
5755 #endif /* SOCKETS */
5756
/*
 * Attempt to relieve memory pressure by reclaiming from 'p' and, if that is
 * insufficient, kill it with the given cause/reason.
 *
 * For genuine memory-pressure causes, first purges the task's purgeable
 * memory (and drains the secluded pool where applicable); if enough was
 * reclaimed, the kill is skipped (*killed = false, returns TRUE, and
 * *footprint_of_killed_proc holds the reclaimed page total instead of a
 * footprint). Otherwise the snapshot is updated and the kill proceeds.
 *
 * Returns TRUE on success (kill or successful avoidance); *killed reports
 * whether the process was actually killed. Caller holds a proc ref;
 * proc_list lock must NOT be held on entry.
 */
static boolean_t
memorystatus_kill_proc(proc_t p, uint32_t cause, os_reason_t jetsam_reason, bool *killed, uint64_t *footprint_of_killed_proc)
{
	pid_t aPid = 0;
	uint32_t aPid_ep = 0;

	uint64_t killtime = 0;
	clock_sec_t tv_sec;
	clock_usec_t tv_usec;
	uint32_t tv_msec;
	boolean_t retval = FALSE;

	aPid = proc_getpid(p);
	aPid_ep = p->p_memstat_effectivepriority;

	if (cause != kMemorystatusKilledVnodes && cause != kMemorystatusKilledZoneMapExhaustion) {
		/*
		 * Genuine memory pressure and not other (vnode/zone) resource exhaustion.
		 */
		boolean_t success = FALSE;
		uint64_t num_pages_purged;
		uint64_t num_pages_reclaimed = 0;
		uint64_t num_pages_unsecluded = 0;

		/* Give networking a chance to shed memory before purging/killing. */
		networking_memstatus_callout(p, cause);
		num_pages_purged = vm_purgeable_purge_task_owned(proc_task(p));
		num_pages_reclaimed += num_pages_purged;
#if CONFIG_SECLUDED_MEMORY
		if (cause == kMemorystatusKilledVMPageShortage &&
		    vm_page_secluded_count > 0 &&
		    task_can_use_secluded_mem(proc_task(p), FALSE)) {
			/*
			 * We're about to kill a process that has access
			 * to the secluded pool. Drain that pool into the
			 * free or active queues to make these pages re-appear
			 * as "available", which might make us no longer need
			 * to kill that process.
			 * Since the secluded pool does not get refilled while
			 * a process has access to it, it should remain
			 * drained.
			 */
			num_pages_unsecluded = vm_page_secluded_drain();
			num_pages_reclaimed += num_pages_unsecluded;
		}
#endif /* CONFIG_SECLUDED_MEMORY */

		if (num_pages_reclaimed) {
			/*
			 * We actually reclaimed something and so let's
			 * check if we need to continue with the kill.
			 */
			if (cause == kMemorystatusKilledHiwat) {
				/* Highwater kill: avoided if footprint is back under the limit. */
				uint64_t footprint_in_bytes = get_task_phys_footprint(proc_task(p));
				uint64_t memlimit_in_bytes = (((uint64_t)p->p_memstat_memlimit) * 1024ULL * 1024ULL); /* convert MB to bytes */
				success = (footprint_in_bytes <= memlimit_in_bytes);
			} else {
				/* Other causes: avoided if available pages recovered above the soft threshold. */
				success = !(memorystatus_get_available_page_count() < memorystatus_get_soft_memlimit_page_shortage_threshold());
#if CONFIG_SECLUDED_MEMORY
				if (!success && num_pages_unsecluded) {
					/*
					 * We just drained the secluded pool
					 * because we're about to kill a
					 * process that has access to it.
					 * This is an important process and
					 * we'd rather not kill it unless
					 * absolutely necessary, so declare
					 * success even if draining the pool
					 * did not quite get us out of the
					 * "pressure" level but still got
					 * us out of the "critical" level.
					 */
					success = !(
						memorystatus_get_available_page_count() <
						memorystatus_get_critical_page_shortage_threshold());
				}
#endif /* CONFIG_SECLUDED_MEMORY */
			}

			if (success) {
				memorystatus_purge_before_jetsam_success++;

				memorystatus_log_info("memorystatus: reclaimed %llu pages (%llu purged, %llu unsecluded) from pid %d [%s] and avoided %s\n",
				    num_pages_reclaimed, num_pages_purged, num_pages_unsecluded, aPid, ((p && *p->p_name) ? p->p_name : "unknown"), memorystatus_kill_cause_name[cause]);

				/* Kill avoided: report reclaimed pages through the footprint out-param. */
				*killed = false;
				*footprint_of_killed_proc = num_pages_reclaimed + num_pages_purged + num_pages_unsecluded;

				return TRUE;
			}
		}
	}

	/* Record the kill in the snapshot before actually killing. */
	killtime = mach_absolute_time();
	absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
	tv_msec = tv_usec / 1000; /* NOTE(review): tv_msec appears unused below -- confirm */

	proc_list_lock();
	memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
	proc_list_unlock();

	char kill_reason_string[128];

	if (cause == kMemorystatusKilledHiwat) {
		strlcpy(kill_reason_string, "killing_highwater_process", 128);
	} else {
		if (aPid_ep == JETSAM_PRIORITY_IDLE) {
			strlcpy(kill_reason_string, "killing_idle_process", 128);
		} else {
			strlcpy(kill_reason_string, "killing_top_process", 128);
		}
	}

	/*
	 * memorystatus_do_kill drops a reference, so take another one so we can
	 * continue to use this exit reason even after memorystatus_do_kill()
	 * returns
	 */
	os_reason_ref(jetsam_reason);

	retval = memorystatus_do_kill(p, cause, jetsam_reason, footprint_of_killed_proc);
	*killed = retval;

	memorystatus_log("memorystatus: %s pid %d [%s] (%s %d) %lluKB - memorystatus_available_pages: %llu compressor_size:%u\n",
	    kill_reason_string,
	    aPid, proc_best_name(p),
	    memorystatus_kill_cause_name[cause], aPid_ep,
	    (*footprint_of_killed_proc) >> 10, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());

	return retval;
}
5887
5888 /*
5889 * Jetsam the first process in the queue.
5890 */
5891 static bool
memorystatus_kill_top_process(bool any,bool sort_flag,uint32_t cause,os_reason_t jetsam_reason,int32_t max_priority,bool only_swappable,int32_t * priority,uint32_t * errors,uint64_t * memory_reclaimed)5892 memorystatus_kill_top_process(bool any, bool sort_flag, uint32_t cause, os_reason_t jetsam_reason,
5893 int32_t max_priority, bool only_swappable,
5894 int32_t *priority, uint32_t *errors, uint64_t *memory_reclaimed)
5895 {
5896 pid_t aPid;
5897 proc_t p = PROC_NULL, next_p = PROC_NULL;
5898 bool new_snapshot = false, force_new_snapshot = false, killed = false, freed_mem = false;
5899 unsigned int i = 0;
5900 uint32_t aPid_ep;
5901 int32_t local_max_kill_prio = JETSAM_PRIORITY_IDLE;
5902 uint64_t footprint_of_killed_proc = 0;
5903
5904 #ifndef CONFIG_FREEZE
5905 #pragma unused(any)
5906 #endif
5907
5908 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
5909 MEMORYSTATUS_LOG_AVAILABLE_PAGES);
5910
5911
5912 #if CONFIG_JETSAM
5913 if (sort_flag) {
5914 (void)memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT);
5915 }
5916
5917 *memory_reclaimed = 0;
5918 local_max_kill_prio = MIN(max_kill_priority, max_priority);
5919
5920 #if VM_PRESSURE_EVENTS
5921 if (cause == kMemorystatusKilledSustainedPressure) {
5922 local_max_kill_prio = memorystatus_sustained_pressure_maximum_band;
5923 }
5924 #endif /* VM_PRESSURE_EVENTS */
5925
5926 force_new_snapshot = false;
5927
5928 #else /* CONFIG_JETSAM */
5929 (void) max_priority;
5930
5931 if (sort_flag) {
5932 (void)memorystatus_sort_bucket(JETSAM_PRIORITY_IDLE, JETSAM_SORT_DEFAULT);
5933 }
5934
5935 /*
5936 * On macos, we currently only have 2 reasons to be here:
5937 *
5938 * kMemorystatusKilledZoneMapExhaustion
5939 * AND
5940 * kMemorystatusKilledVMCompressorSpaceShortage
5941 *
5942 * If we are here because of kMemorystatusKilledZoneMapExhaustion, we will consider
5943 * any and all processes as eligible kill candidates since we need to avoid a panic.
5944 *
5945 * Since this function can be called async. it is harder to toggle the max_kill_priority
5946 * value before and after a call. And so we use this local variable to set the upper band
5947 * on the eligible kill bands.
5948 */
5949 if (cause == kMemorystatusKilledZoneMapExhaustion) {
5950 local_max_kill_prio = JETSAM_PRIORITY_MAX;
5951 } else {
5952 local_max_kill_prio = max_kill_priority;
5953 }
5954
5955 /*
5956 * And, because we are here under extreme circumstances, we force a snapshot even for
5957 * IDLE kills.
5958 */
5959 force_new_snapshot = true;
5960
5961 #endif /* CONFIG_JETSAM */
5962
5963 if (cause != kMemorystatusKilledZoneMapExhaustion &&
5964 jetsam_current_thread() != NULL &&
5965 jetsam_current_thread()->limit_to_low_bands &&
5966 local_max_kill_prio > JETSAM_PRIORITY_MAIL) {
5967 local_max_kill_prio = JETSAM_PRIORITY_MAIL;
5968 }
5969
5970 proc_list_lock();
5971
5972 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
5973 while (next_p && (next_p->p_memstat_effectivepriority <= local_max_kill_prio)) {
5974 p = next_p;
5975 next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
5976
5977
5978 aPid = proc_getpid(p);
5979 aPid_ep = p->p_memstat_effectivepriority;
5980
5981 if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP)) {
5982 continue; /* with lock held */
5983 }
5984
5985 if (cause == kMemorystatusKilledVnodes) {
5986 /*
5987 * If the system runs out of vnodes, we systematically jetsam
5988 * processes in hopes of stumbling onto a vnode gain that helps
5989 * the system recover. The process that happens to trigger
5990 * this path has no known relationship to the vnode shortage.
5991 * Deadlock avoidance: attempt to safeguard the caller.
5992 */
5993
5994 if (p == current_proc()) {
5995 /* do not jetsam the current process */
5996 continue;
5997 }
5998 }
5999
6000 if (only_swappable && !task_donates_own_pages(proc_task(p))) {
6001 continue;
6002 }
6003
6004 #if CONFIG_FREEZE
6005 boolean_t skip;
6006 boolean_t reclaim_proc = !(p->p_memstat_state & P_MEMSTAT_LOCKED);
6007 if (any || reclaim_proc) {
6008 skip = FALSE;
6009 } else {
6010 skip = TRUE;
6011 }
6012
6013 if (skip) {
6014 continue;
6015 } else
6016 #endif
6017 {
6018 if (proc_ref(p, true) == p) {
6019 /*
6020 * Mark as terminated so that if exit1() indicates success, but the process (for example)
6021 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
6022 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
6023 * acquisition of the proc lock.
6024 */
6025 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
6026 } else {
6027 /*
6028 * We need to restart the search again because
6029 * proc_ref _can_ drop the proc_list lock
6030 * and we could have lost our stored next_p via
6031 * an exit() on another core.
6032 */
6033 i = 0;
6034 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
6035 continue;
6036 }
6037
6038 /*
6039 * Capture a snapshot if none exists and:
6040 * - we are forcing a new snapshot creation, either because:
6041 * - on a particular platform we need these snapshots every time, OR
6042 * - a boot-arg/embedded device tree property has been set.
6043 * - priority was not requested (this is something other than an ambient kill)
6044 * - the priority was requested *and* the targeted process is not at idle priority
6045 */
6046 if ((memorystatus_jetsam_snapshot_count == 0) &&
6047 (force_new_snapshot || memorystatus_idle_snapshot || ((!priority) || (priority && (aPid_ep != JETSAM_PRIORITY_IDLE))))) {
6048 memorystatus_init_jetsam_snapshot_locked(NULL, 0);
6049 new_snapshot = true;
6050 }
6051
6052 proc_list_unlock();
6053
6054 freed_mem = memorystatus_kill_proc(p, cause, jetsam_reason, &killed, &footprint_of_killed_proc); /* purged and/or killed 'p' */
6055 /* Success? */
6056 if (freed_mem) {
6057 *memory_reclaimed = footprint_of_killed_proc;
6058 if (killed) {
6059 if (priority) {
6060 *priority = aPid_ep;
6061 }
6062 } else {
6063 /* purged */
6064 proc_list_lock();
6065 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
6066 proc_list_unlock();
6067 }
6068 proc_rele(p);
6069 goto exit;
6070 }
6071
6072 /*
6073 * Failure - first unwind the state,
6074 * then fall through to restart the search.
6075 */
6076 proc_list_lock();
6077 proc_rele(p);
6078 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
6079 p->p_memstat_state |= P_MEMSTAT_ERROR;
6080 *errors += 1;
6081
6082 i = 0;
6083 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
6084 }
6085 }
6086
6087 proc_list_unlock();
6088
6089 exit:
6090 os_reason_free(jetsam_reason);
6091
6092 if (!killed) {
6093 /* Clear snapshot if freshly captured and no target was found */
6094 if (new_snapshot) {
6095 proc_list_lock();
6096 memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
6097 proc_list_unlock();
6098 }
6099 }
6100
6101 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
6102 MEMORYSTATUS_LOG_AVAILABLE_PAGES, killed ? aPid : 0, killed, *memory_reclaimed);
6103
6104 return killed;
6105 }
6106
6107 /*
6108 * Jetsam aggressively
6109 */
6110 static bool
memorystatus_kill_processes_aggressive(uint32_t cause,int aggr_count,int32_t priority_max,int max_kills,uint32_t * errors,uint64_t * memory_reclaimed)6111 memorystatus_kill_processes_aggressive(uint32_t cause, int aggr_count,
6112 int32_t priority_max, int max_kills, uint32_t *errors, uint64_t *memory_reclaimed)
6113 {
6114 pid_t aPid;
6115 proc_t p = PROC_NULL, next_p = PROC_NULL;
6116 boolean_t new_snapshot = FALSE, killed = FALSE;
6117 int kill_count = 0;
6118 unsigned int priority_band = JETSAM_PRIORITY_IDLE;
6119 int32_t aPid_ep = 0;
6120 unsigned int memorystatus_level_snapshot = 0;
6121 uint64_t killtime = 0;
6122 clock_sec_t tv_sec;
6123 clock_usec_t tv_usec;
6124 uint32_t tv_msec;
6125 os_reason_t jetsam_reason = OS_REASON_NULL;
6126 uint64_t footprint_of_killed_proc = 0;
6127
6128 *memory_reclaimed = 0;
6129
6130 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
6131 MEMORYSTATUS_LOG_AVAILABLE_PAGES, priority_max);
6132
6133 if (priority_max >= JETSAM_PRIORITY_FOREGROUND) {
6134 /*
6135 * Check if aggressive jetsam has been asked to kill upto or beyond the
6136 * JETSAM_PRIORITY_FOREGROUND bucket. If yes, sort the FG band based on
6137 * coalition footprint.
6138 */
6139 memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT);
6140 }
6141
6142 jetsam_reason = os_reason_create(OS_REASON_JETSAM, cause);
6143 if (jetsam_reason == OS_REASON_NULL) {
6144 memorystatus_log_error("memorystatus_kill_processes_aggressive: failed to allocate exit reason\n");
6145 }
6146 memorystatus_log("memorystatus: aggressively killing up to %d processes below band %d.\n", max_kills, priority_max + 1);
6147 proc_list_lock();
6148
6149 next_p = memorystatus_get_first_proc_locked(&priority_band, TRUE);
6150 while (next_p) {
6151 if (proc_list_exited(next_p) ||
6152 ((unsigned int)(next_p->p_memstat_effectivepriority) != priority_band)) {
6153 /*
6154 * We have raced with next_p running on another core.
6155 * It may be exiting or it may have moved to a different
6156 * jetsam priority band. This means we have lost our
6157 * place in line while traversing the jetsam list. We
6158 * attempt to recover by rewinding to the beginning of the band
6159 * we were already traversing. By doing this, we do not guarantee
6160 * that no process escapes this aggressive march, but we can make
6161 * skipping an entire range of processes less likely. (PR-21069019)
6162 */
6163
6164 memorystatus_log_debug(
6165 "memorystatus: aggressive%d: rewinding band %d, %s(%d) moved or exiting.\n",
6166 aggr_count, priority_band, (*next_p->p_name ? next_p->p_name : "unknown"), proc_getpid(next_p));
6167
6168 next_p = memorystatus_get_first_proc_locked(&priority_band, TRUE);
6169 continue;
6170 }
6171
6172 p = next_p;
6173 next_p = memorystatus_get_next_proc_locked(&priority_band, p, TRUE);
6174
6175 if (p->p_memstat_effectivepriority > priority_max) {
6176 /*
6177 * Bail out of this killing spree if we have
6178 * reached beyond the priority_max jetsam band.
6179 * That is, we kill up to and through the
6180 * priority_max jetsam band.
6181 */
6182 proc_list_unlock();
6183 goto exit;
6184 }
6185
6186 aPid = proc_getpid(p);
6187 aPid_ep = p->p_memstat_effectivepriority;
6188
6189 if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP)) {
6190 continue;
6191 }
6192
6193 /*
6194 * Capture a snapshot if none exists.
6195 */
6196 if (memorystatus_jetsam_snapshot_count == 0) {
6197 memorystatus_init_jetsam_snapshot_locked(NULL, 0);
6198 new_snapshot = TRUE;
6199 }
6200
6201 /*
6202 * Mark as terminated so that if exit1() indicates success, but the process (for example)
6203 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
6204 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
6205 * acquisition of the proc lock.
6206 */
6207 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
6208
6209 killtime = mach_absolute_time();
6210 absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
6211 tv_msec = tv_usec / 1000;
6212
6213 /* Shift queue, update stats */
6214 memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
6215
6216 /*
6217 * In order to kill the target process, we will drop the proc_list_lock.
6218 * To guaranteee that p and next_p don't disappear out from under the lock,
6219 * we must take a ref on both.
6220 * If we cannot get a reference, then it's likely we've raced with
6221 * that process exiting on another core.
6222 */
6223 if (proc_ref(p, true) == p) {
6224 if (next_p) {
6225 while (next_p && (proc_ref(next_p, true) != next_p)) {
6226 proc_t temp_p;
6227
6228 /*
6229 * We must have raced with next_p exiting on another core.
6230 * Recover by getting the next eligible process in the band.
6231 */
6232
6233 memorystatus_log_debug(
6234 "memorystatus: aggressive%d: skipping %d [%s] (exiting?)\n",
6235 aggr_count, proc_getpid(next_p), (*next_p->p_name ? next_p->p_name : "(unknown)"));
6236
6237 temp_p = next_p;
6238 next_p = memorystatus_get_next_proc_locked(&priority_band, temp_p, TRUE);
6239 }
6240 }
6241 proc_list_unlock();
6242
6243 if (aPid_ep <= system_procs_aging_band &&
6244 (p->p_memstat_relaunch_flags & P_MEMSTAT_RELAUNCH_HIGH)) {
6245 memorystatus_log("memorystatus: killing %s [%d] in band %d "
6246 "with high relaunch probability\n",
6247 proc_best_name(p), aPid, aPid_ep);
6248 }
6249 memorystatus_log(
6250 "memorystatus: %s%d pid %d [%s] (%s %d) - memorystatus_available_pages: %llu\n",
6251 ((aPid_ep == JETSAM_PRIORITY_IDLE) ? "killing_idle_process_aggressive" : "killing_top_process_aggressive"),
6252 aggr_count, aPid, proc_best_name(p),
6253 memorystatus_kill_cause_name[cause], aPid_ep, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
6254
6255 memorystatus_level_snapshot = memorystatus_level;
6256
6257 /*
6258 * memorystatus_do_kill() drops a reference, so take another one so we can
6259 * continue to use this exit reason even after memorystatus_do_kill()
6260 * returns.
6261 */
6262 os_reason_ref(jetsam_reason);
6263 killed = memorystatus_do_kill(p, cause, jetsam_reason, &footprint_of_killed_proc);
6264
6265 /* Success? */
6266 if (killed) {
6267 *memory_reclaimed += footprint_of_killed_proc;
6268 proc_rele(p);
6269 kill_count++;
6270 p = NULL;
6271 killed = FALSE;
6272
6273 /*
6274 * Continue the killing spree.
6275 */
6276 proc_list_lock();
6277 if (next_p) {
6278 proc_rele(next_p);
6279 }
6280
6281 if (kill_count == max_kills) {
6282 memorystatus_log_info(
6283 "memorystatus: giving up aggressive kill after killing "
6284 "%d processes below band %d.\n",
6285 max_kills, priority_max + 1);
6286 break;
6287 }
6288
6289 if (aPid_ep == JETSAM_PRIORITY_FOREGROUND && memorystatus_aggressive_jetsam_lenient == TRUE) {
6290 if (memorystatus_level > memorystatus_level_snapshot && ((memorystatus_level - memorystatus_level_snapshot) >= AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD)) {
6291 #if DEVELOPMENT || DEBUG
6292 memorystatus_log_info("Disabling Lenient mode after one-time deployment.\n");
6293 #endif /* DEVELOPMENT || DEBUG */
6294 memorystatus_aggressive_jetsam_lenient = FALSE;
6295 break;
6296 }
6297 }
6298
6299 continue;
6300 }
6301
6302 /*
6303 * Failure - first unwind the state,
6304 * then fall through to restart the search.
6305 */
6306 proc_list_lock();
6307 proc_rele(p);
6308 if (next_p) {
6309 proc_rele(next_p);
6310 }
6311 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
6312 p->p_memstat_state |= P_MEMSTAT_ERROR;
6313 *errors += 1;
6314 p = NULL;
6315 }
6316
6317 /*
6318 * Failure - restart the search at the beginning of
6319 * the band we were already traversing.
6320 *
6321 * We might have raced with "p" exiting on another core, resulting in no
6322 * ref on "p". Or, we may have failed to kill "p".
6323 *
6324 * Either way, we fall thru to here, leaving the proc in the
6325 * P_MEMSTAT_TERMINATED or P_MEMSTAT_ERROR state.
6326 *
6327 * And, we hold the the proc_list_lock at this point.
6328 */
6329
6330 next_p = memorystatus_get_first_proc_locked(&priority_band, TRUE);
6331 }
6332
6333 proc_list_unlock();
6334
6335 exit:
6336 os_reason_free(jetsam_reason);
6337
6338 /* Clear snapshot if freshly captured and no target was found */
6339 if (new_snapshot && (kill_count == 0)) {
6340 proc_list_lock();
6341 memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
6342 proc_list_unlock();
6343 }
6344
6345 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
6346 MEMORYSTATUS_LOG_AVAILABLE_PAGES, 0, kill_count, *memory_reclaimed);
6347
6348 return kill_count > 0;
6349 }
6350
/*
 * Kill the first process found (walking up from the idle band) whose physical
 * footprint exceeds its configured high-water-mark memory limit.
 *
 * errors (out)           - incremented for each candidate we failed to kill
 * purged (out)           - set TRUE if memory was reclaimed by purging rather
 *                          than killing (the candidate survives)
 * memory_reclaimed (out) - footprint (bytes) of the killed process
 *
 * Returns TRUE iff a process was actually killed.
 */
static boolean_t
memorystatus_kill_hiwat_proc(uint32_t *errors, boolean_t *purged, uint64_t *memory_reclaimed)
{
	pid_t aPid = 0;
	proc_t p = PROC_NULL, next_p = PROC_NULL;
	bool new_snapshot = false, killed = false, freed_mem = false;
	unsigned int i = 0;
	uint32_t aPid_ep;
	os_reason_t jetsam_reason = OS_REASON_NULL;
	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_START,
	    MEMORYSTATUS_LOG_AVAILABLE_PAGES);

	jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_HIGHWATER);
	if (jetsam_reason == OS_REASON_NULL) {
		/* Proceed anyway: a NULL reason is tolerated down the kill path. */
		memorystatus_log_error("memorystatus_kill_hiwat_proc: failed to allocate exit reason\n");
	}

	proc_list_lock();

	next_p = memorystatus_get_first_proc_locked(&i, TRUE);
	while (next_p) {
		uint64_t footprint_in_bytes = 0;
		uint64_t memlimit_in_bytes = 0;
		boolean_t skip = 0;

		p = next_p;
		next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);

		aPid = proc_getpid(p);
		aPid_ep = p->p_memstat_effectivepriority;

		/* Skip candidates already being torn down, errored, or explicitly exempted. */
		if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP)) {
			continue;
		}

		/* skip if no limit set */
		if (p->p_memstat_memlimit <= 0) {
			continue;
		}

		/* Only processes currently over their high-water mark are eligible. */
		footprint_in_bytes = get_task_phys_footprint(proc_task(p));
		memlimit_in_bytes = (((uint64_t)p->p_memstat_memlimit) * 1024ULL * 1024ULL); /* convert MB to bytes */
		skip = (footprint_in_bytes <= memlimit_in_bytes);

#if CONFIG_FREEZE
		/* Never kill a process whose pages are locked for freeze/thaw work. */
		if (!skip) {
			if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
				skip = TRUE;
			} else {
				skip = FALSE;
			}
		}
#endif

		if (skip) {
			continue;
		} else {
			if (memorystatus_jetsam_snapshot_count == 0) {
				memorystatus_init_jetsam_snapshot_locked(NULL, 0);
				new_snapshot = true;
			}

			if (proc_ref(p, true) == p) {
				/*
				 * Mark as terminated so that if exit1() indicates success, but the process (for example)
				 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
				 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
				 * acquisition of the proc lock.
				 */
				p->p_memstat_state |= P_MEMSTAT_TERMINATED;

				proc_list_unlock();
			} else {
				/*
				 * We need to restart the search again because
				 * proc_ref _can_ drop the proc_list lock
				 * and we could have lost our stored next_p via
				 * an exit() on another core.
				 */
				i = 0;
				next_p = memorystatus_get_first_proc_locked(&i, TRUE);
				continue;
			}

			footprint_in_bytes = 0;
			freed_mem = memorystatus_kill_proc(p, kMemorystatusKilledHiwat, jetsam_reason, &killed, &footprint_in_bytes); /* purged and/or killed 'p' */

			/* Success? */
			if (freed_mem) {
				if (!killed) {
					/* purged 'p'..don't reset HWM candidate count */
					*purged = TRUE;

					proc_list_lock();
					p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
					proc_list_unlock();
				} else {
					*memory_reclaimed = footprint_in_bytes;
				}
				proc_rele(p);
				goto exit;
			}
			/*
			 * Failure - first unwind the state,
			 * then fall through to restart the search.
			 */
			proc_list_lock();
			proc_rele(p);
			p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
			p->p_memstat_state |= P_MEMSTAT_ERROR;
			*errors += 1;

			i = 0;
			next_p = memorystatus_get_first_proc_locked(&i, TRUE);
		}
	}

	proc_list_unlock();

exit:
	os_reason_free(jetsam_reason);

	if (!killed) {
		*memory_reclaimed = 0;

		/* Clear snapshot if freshly captured and no target was found */
		if (new_snapshot) {
			proc_list_lock();
			memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
			proc_list_unlock();
		}
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_END,
	    MEMORYSTATUS_LOG_AVAILABLE_PAGES, killed ? aPid : 0, killed, *memory_reclaimed, 0);

	return killed;
}
6489
6490 /*
6491 * Jetsam a process pinned in the elevated band.
6492 *
6493 * Return: true -- a pinned process was jetsammed
6494 * false -- no pinned process was jetsammed
6495 */
6496 boolean_t
memorystatus_kill_elevated_process(uint32_t cause,os_reason_t jetsam_reason,unsigned int band,int aggr_count,uint32_t * errors,uint64_t * memory_reclaimed)6497 memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, unsigned int band, int aggr_count, uint32_t *errors, uint64_t *memory_reclaimed)
6498 {
6499 pid_t aPid = 0;
6500 proc_t p = PROC_NULL, next_p = PROC_NULL;
6501 boolean_t new_snapshot = FALSE, killed = FALSE;
6502 int kill_count = 0;
6503 uint32_t aPid_ep;
6504 uint64_t killtime = 0;
6505 clock_sec_t tv_sec;
6506 clock_usec_t tv_usec;
6507 uint32_t tv_msec;
6508 uint64_t footprint_of_killed_proc = 0;
6509
6510
6511 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
6512 MEMORYSTATUS_LOG_AVAILABLE_PAGES);
6513
6514 #if CONFIG_FREEZE
6515 boolean_t consider_frozen_only = FALSE;
6516
6517 if (band == (unsigned int) memorystatus_freeze_jetsam_band) {
6518 consider_frozen_only = TRUE;
6519 }
6520 #endif /* CONFIG_FREEZE */
6521
6522 proc_list_lock();
6523
6524 next_p = memorystatus_get_first_proc_locked(&band, FALSE);
6525 while (next_p) {
6526 p = next_p;
6527 next_p = memorystatus_get_next_proc_locked(&band, p, FALSE);
6528
6529 aPid = proc_getpid(p);
6530 aPid_ep = p->p_memstat_effectivepriority;
6531
6532 /*
6533 * Only pick a process pinned in this elevated band
6534 */
6535 if (!_memstat_proc_is_elevated(p)) {
6536 continue;
6537 }
6538
6539 if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP)) {
6540 continue;
6541 }
6542
6543 #if CONFIG_FREEZE
6544 if (consider_frozen_only && !_memstat_proc_is_frozen(p)) {
6545 continue;
6546 }
6547
6548 if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
6549 continue;
6550 }
6551 #endif /* CONFIG_FREEZE */
6552
6553 #if DEVELOPMENT || DEBUG
6554 memorystatus_log_info(
6555 "jetsam: elevated%d process pid %d [%s] - memorystatus_available_pages: %d\n",
6556 aggr_count, aPid, (*p->p_name ? p->p_name : "unknown"), MEMORYSTATUS_LOG_AVAILABLE_PAGES);
6557 #endif /* DEVELOPMENT || DEBUG */
6558
6559 if (memorystatus_jetsam_snapshot_count == 0) {
6560 memorystatus_init_jetsam_snapshot_locked(NULL, 0);
6561 new_snapshot = TRUE;
6562 }
6563
6564 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
6565
6566 killtime = mach_absolute_time();
6567 absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
6568 tv_msec = tv_usec / 1000;
6569
6570 memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
6571
6572 if (proc_ref(p, true) == p) {
6573 proc_list_unlock();
6574
6575 /*
6576 * memorystatus_do_kill drops a reference, so take another one so we can
6577 * continue to use this exit reason even after memorystatus_do_kill()
6578 * returns
6579 */
6580 os_reason_ref(jetsam_reason);
6581 killed = memorystatus_do_kill(p, cause, jetsam_reason, &footprint_of_killed_proc);
6582
6583 memorystatus_log("%lu.%03d memorystatus: killing_top_process_elevated%d pid %d [%s] (%s %d) %lluKB - memorystatus_available_pages: %llu\n",
6584 (unsigned long)tv_sec, tv_msec,
6585 aggr_count,
6586 aPid, ((p && *p->p_name) ? p->p_name : "unknown"),
6587 memorystatus_kill_cause_name[cause], aPid_ep,
6588 footprint_of_killed_proc >> 10, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
6589
6590 /* Success? */
6591 if (killed) {
6592 *memory_reclaimed = footprint_of_killed_proc;
6593 proc_rele(p);
6594 kill_count++;
6595 goto exit;
6596 }
6597
6598 /*
6599 * Failure - first unwind the state,
6600 * then fall through to restart the search.
6601 */
6602 proc_list_lock();
6603 proc_rele(p);
6604 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
6605 p->p_memstat_state |= P_MEMSTAT_ERROR;
6606 *errors += 1;
6607 }
6608
6609 /*
6610 * Failure - restart the search.
6611 *
6612 * We might have raced with "p" exiting on another core, resulting in no
6613 * ref on "p". Or, we may have failed to kill "p".
6614 *
6615 * Either way, we fall thru to here, leaving the proc in the
6616 * P_MEMSTAT_TERMINATED state or P_MEMSTAT_ERROR state.
6617 *
6618 * And, we hold the the proc_list_lock at this point.
6619 */
6620
6621 next_p = memorystatus_get_first_proc_locked(&band, FALSE);
6622 }
6623
6624 proc_list_unlock();
6625
6626 exit:
6627 os_reason_free(jetsam_reason);
6628
6629 if (kill_count == 0) {
6630 *memory_reclaimed = 0;
6631
6632 /* Clear snapshot if freshly captured and no target was found */
6633 if (new_snapshot) {
6634 proc_list_lock();
6635 memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
6636 proc_list_unlock();
6637 }
6638 }
6639
6640 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
6641 MEMORYSTATUS_LOG_AVAILABLE_PAGES, killed ? aPid : 0, kill_count, *memory_reclaimed);
6642
6643 return killed;
6644 }
6645
6646 boolean_t
memorystatus_kill_on_VM_compressor_space_shortage(boolean_t async)6647 memorystatus_kill_on_VM_compressor_space_shortage(boolean_t async)
6648 {
6649 if (async) {
6650 os_atomic_store(&memorystatus_compressor_space_shortage, true, release);
6651 memorystatus_thread_wake();
6652 return true;
6653 } else {
6654 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE);
6655 if (jetsam_reason == OS_REASON_NULL) {
6656 memorystatus_log_error("memorystatus_kill_on_VM_compressor_space_shortage -- sync: failed to allocate jetsam reason\n");
6657 }
6658
6659 return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMCompressorSpaceShortage, jetsam_reason);
6660 }
6661 }
6662
6663 #if CONFIG_JETSAM
6664
/*
 * Note that the vnode pageout system is starved and wake the jetsam thread
 * to handle it asynchronously.
 */
void
memorystatus_kill_on_vps_starvation(void)
{
	os_atomic_store(&memorystatus_pageout_starved, true, release);
	memorystatus_thread_wake();
}
6671
6672 boolean_t
memorystatus_kill_on_vnode_limit(void)6673 memorystatus_kill_on_vnode_limit(void)
6674 {
6675 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_VNODE);
6676 if (jetsam_reason == OS_REASON_NULL) {
6677 memorystatus_log_error("memorystatus_kill_on_vnode_limit: failed to allocate jetsam reason\n");
6678 }
6679
6680 return memorystatus_kill_process_sync(-1, kMemorystatusKilledVnodes, jetsam_reason);
6681 }
6682
6683 boolean_t
memorystatus_kill_on_sustained_pressure()6684 memorystatus_kill_on_sustained_pressure()
6685 {
6686 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_SUSTAINED_PRESSURE);
6687 if (jetsam_reason == OS_REASON_NULL) {
6688 memorystatus_log_error("memorystatus_kill_on_FC_thrashing -- sync: failed to allocate jetsam reason\n");
6689 }
6690
6691 return memorystatus_kill_process_sync(-1, kMemorystatusKilledSustainedPressure, jetsam_reason);
6692 }
6693
6694 boolean_t
memorystatus_kill_with_jetsam_reason_sync(pid_t pid,os_reason_t jetsam_reason)6695 memorystatus_kill_with_jetsam_reason_sync(pid_t pid, os_reason_t jetsam_reason)
6696 {
6697 uint32_t kill_cause = jetsam_reason->osr_code <= JETSAM_REASON_MEMORYSTATUS_MAX ?
6698 (uint32_t) jetsam_reason->osr_code : JETSAM_REASON_INVALID;
6699 return memorystatus_kill_process_sync(pid, kill_cause, jetsam_reason);
6700 }
6701
6702 #endif /* CONFIG_JETSAM */
6703
6704 boolean_t
memorystatus_kill_on_zone_map_exhaustion(pid_t pid)6705 memorystatus_kill_on_zone_map_exhaustion(pid_t pid)
6706 {
6707 boolean_t res = FALSE;
6708 if (pid == -1) {
6709 os_atomic_store(&memorystatus_zone_map_is_exhausted, true, release);
6710 memorystatus_thread_wake();
6711 return true;
6712 } else {
6713 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_ZONE_MAP_EXHAUSTION);
6714 if (jetsam_reason == OS_REASON_NULL) {
6715 memorystatus_log_error("memorystatus_kill_on_zone_map_exhaustion: failed to allocate jetsam reason\n");
6716 }
6717
6718 res = memorystatus_kill_process_sync(pid, kMemorystatusKilledZoneMapExhaustion, jetsam_reason);
6719 }
6720 return res;
6721 }
6722
/*
 * Hook called when a pageout scan completes.  Intentionally empty.
 */
void
memorystatus_on_pageout_scan_end(void)
{
	/* No-op */
}
6728
/* Return both allocated and actual size, since there's a race between allocation and list compilation */
/*
 * Build an array of memorystatus_priority_entry_t, one per process on the
 * memorystatus list.
 *
 * list_ptr (out)   - receives a kalloc'd array; caller owns and must free it
 *                    with kfree_data(list, *buffer_size)
 * buffer_size (in/out) - caller's buffer capacity in; actual allocation out
 * list_size (out)  - bytes of valid entries actually written
 * size_only        - if TRUE, only compute *list_size and return
 *
 * Returns 0, EINVAL (caller buffer too small), or ENOMEM.
 */
static int
memorystatus_get_priority_list(memorystatus_priority_entry_t **list_ptr, size_t *buffer_size, size_t *list_size, boolean_t size_only)
{
	uint32_t list_count, i = 0;
	memorystatus_priority_entry_t *list_entry;
	proc_t p;

	/* Snapshot the count now; the list can change before we take the lock. */
	list_count = memorystatus_list_count;
	*list_size = sizeof(memorystatus_priority_entry_t) * list_count;

	/* Just a size check? */
	if (size_only) {
		return 0;
	}

	/* Otherwise, validate the size of the buffer */
	if (*buffer_size < *list_size) {
		return EINVAL;
	}

	*list_ptr = kalloc_data(*list_size, Z_WAITOK | Z_ZERO);
	if (!*list_ptr) {
		return ENOMEM;
	}

	*buffer_size = *list_size;
	*list_size = 0;

	list_entry = *list_ptr;

	proc_list_lock();

	/* Fill entries under the lock, stopping if the list grew past our buffer. */
	p = memorystatus_get_first_proc_locked(&i, TRUE);
	while (p && (*list_size < *buffer_size)) {
		list_entry->pid = proc_getpid(p);
		list_entry->priority = p->p_memstat_effectivepriority;
		list_entry->user_data = p->p_memstat_userdata;

		/* No memorystatus limit set: report the task-level footprint limit instead. */
		if (p->p_memstat_memlimit <= 0) {
			task_get_phys_footprint_limit(proc_task(p), &list_entry->limit);
		} else {
			list_entry->limit = p->p_memstat_memlimit;
		}

		list_entry->state = memorystatus_build_state(p);
		list_entry++;

		*list_size += sizeof(memorystatus_priority_entry_t);

		p = memorystatus_get_next_proc_locked(&i, p, TRUE);
	}

	proc_list_unlock();

	memorystatus_log_debug("memorystatus_get_priority_list: returning %lu for size\n", (unsigned long)*list_size);

	return 0;
}
6788
/*
 * Copy a single memorystatus_priority_entry_t for `pid` out to user space.
 * Falls back to a zombie reference if the process is already exiting.
 *
 * Returns 0, EINVAL (bad args), ESRCH (no such process), a mach-derived
 * errno if the footprint-limit query fails, or a copyout error.
 */
static int
memorystatus_get_priority_pid(pid_t pid, user_addr_t buffer, size_t buffer_size)
{
	int error = 0;
	bool zombref = false;
	memorystatus_priority_entry_t mp_entry;
	kern_return_t ret;

	/* Validate inputs */
	if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_priority_entry_t))) {
		return EINVAL;
	}

	proc_t p = proc_find(pid);
	if (!p) {
		/* Not findable live; try the zombie list so exiting procs are still reported. */
		zombref = true;
		p = proc_find_zombref(pid);
		if (!p) {
			return ESRCH;
		}
	}

	memset(&mp_entry, 0, sizeof(memorystatus_priority_entry_t));

	mp_entry.pid = proc_getpid(p);
	mp_entry.priority = p->p_memstat_effectivepriority;
	mp_entry.user_data = p->p_memstat_userdata;
	/* No memorystatus limit set (and not a zombie): query the task footprint limit. */
	if (p->p_memstat_memlimit <= 0 && !zombref) {
		task_t task = proc_task(p);
		assert(task);
		ret = task_get_phys_footprint_limit(task, &mp_entry.limit);
		if (ret != KERN_SUCCESS) {
			error = mach_to_bsd_errno(ret);
			goto done;
		}
	} else {
		mp_entry.limit = p->p_memstat_memlimit;
	}
	mp_entry.state = memorystatus_build_state(p);

	error = copyout(&mp_entry, buffer, buffer_size);

done:
	/* Drop whichever kind of reference we took above. */
	if (zombref) {
		proc_drop_zombref(p);
	} else {
		proc_rele(p);
	}

	return error;
}
6840
6841 static int
memorystatus_cmd_get_priority_list(pid_t pid,user_addr_t buffer,size_t buffer_size,int32_t * retval)6842 memorystatus_cmd_get_priority_list(pid_t pid, user_addr_t buffer, size_t buffer_size, int32_t *retval)
6843 {
6844 int error = 0;
6845 boolean_t size_only;
6846 size_t list_size;
6847
6848 /*
6849 * When a non-zero pid is provided, the 'list' has only one entry.
6850 */
6851
6852 size_only = ((buffer == USER_ADDR_NULL) ? TRUE: FALSE);
6853
6854 if (pid != 0) {
6855 list_size = sizeof(memorystatus_priority_entry_t) * 1;
6856 if (!size_only) {
6857 error = memorystatus_get_priority_pid(pid, buffer, buffer_size);
6858 }
6859 } else {
6860 memorystatus_priority_entry_t *list = NULL;
6861 error = memorystatus_get_priority_list(&list, &buffer_size, &list_size, size_only);
6862
6863 if (error == 0) {
6864 if (!size_only) {
6865 error = copyout(list, buffer, list_size);
6866 }
6867
6868 kfree_data(list, buffer_size);
6869 }
6870 }
6871
6872 if (error == 0) {
6873 assert(list_size <= INT32_MAX);
6874 *retval = (int32_t) list_size;
6875 }
6876
6877 return error;
6878 }
6879
6880 static void
memorystatus_clear_errors(void)6881 memorystatus_clear_errors(void)
6882 {
6883 proc_t p;
6884 unsigned int i = 0;
6885
6886 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_START);
6887
6888 proc_list_lock();
6889
6890 p = memorystatus_get_first_proc_locked(&i, TRUE);
6891 while (p) {
6892 if (p->p_memstat_state & P_MEMSTAT_ERROR) {
6893 p->p_memstat_state &= ~P_MEMSTAT_ERROR;
6894 }
6895 p = memorystatus_get_next_proc_locked(&i, p, TRUE);
6896 }
6897
6898 proc_list_unlock();
6899
6900 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_END);
6901 }
6902
/*
 * Enable or disable the fast-jetsam override.  Enabling the override turns
 * fast jetsam off and resets the jetsam policy/thread-pool configuration to
 * the defaults.  No-op when CONFIG_JETSAM is not built in.
 */
void
memorystatus_fast_jetsam_override(bool enable_override)
{
#if CONFIG_JETSAM
	fast_jetsam_enabled = !enable_override;
	if (!fast_jetsam_enabled) {
		/* Disable any pre-configured policies */
		os_atomic_store(&memstat_policy_config, kPolicyDefault, relaxed);
		memorystatus_thread_pool_default();
		_memstat_consider_waking_jetsam_thread();
	}
#else /* CONFIG_JETSAM */
	(void)enable_override;
#endif /* CONFIG_JETSAM */
}
6918
6919 /*
6920 * Get the at_boot snapshot
6921 */
6922 static int
memorystatus_get_at_boot_snapshot(memorystatus_jetsam_snapshot_t ** snapshot,size_t * snapshot_size,boolean_t size_only)6923 memorystatus_get_at_boot_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only)
6924 {
6925 size_t input_size = *snapshot_size;
6926
6927 /*
6928 * The at_boot snapshot has no entry list.
6929 */
6930 *snapshot_size = sizeof(memorystatus_jetsam_snapshot_t);
6931
6932 if (size_only) {
6933 return 0;
6934 }
6935
6936 /*
6937 * Validate the size of the snapshot buffer
6938 */
6939 if (input_size < *snapshot_size) {
6940 return EINVAL;
6941 }
6942
6943 /*
6944 * Update the notification_time only
6945 */
6946 memorystatus_at_boot_snapshot.notification_time = mach_absolute_time();
6947 *snapshot = &memorystatus_at_boot_snapshot;
6948
6949 memorystatus_log_debug(
6950 "memorystatus_get_at_boot_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%d)\n",
6951 (long)input_size, (long)*snapshot_size, 0);
6952 return 0;
6953 }
6954
6955 #if CONFIG_FREEZE
/*
 * Return the global freezer jetsam snapshot.
 *
 * On entry *snapshot_size holds the caller's buffer size; on return it holds
 * the size actually required (0 when the snapshot has no entries).  When
 * size_only is TRUE only the size is computed.  Returns 0 on success, or
 * EINVAL if the caller's buffer is too small.  The returned pointer aliases
 * the global buffer; it is not a copy.
 */
static int
memorystatus_get_jetsam_snapshot_freezer(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only)
{
	size_t input_size = *snapshot_size;

	/* Required size: header plus one entry per recorded process. */
	if (memorystatus_jetsam_snapshot_freezer->entry_count > 0) {
		*snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_freezer->entry_count));
	} else {
		*snapshot_size = 0;
	}
	/* The computed size must never exceed the preallocated buffer. */
	assert(*snapshot_size <= memorystatus_jetsam_snapshot_freezer_size);

	if (size_only) {
		return 0;
	}

	if (input_size < *snapshot_size) {
		return EINVAL;
	}

	*snapshot = memorystatus_jetsam_snapshot_freezer;

	memorystatus_log_debug(
		"memorystatus_get_jetsam_snapshot_freezer: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
		(long)input_size, (long)*snapshot_size, (long)memorystatus_jetsam_snapshot_freezer->entry_count);

	return 0;
}
6984 #endif /* CONFIG_FREEZE */
6985
/*
 * Build a freshly-allocated "on demand" jetsam snapshot covering the current
 * process list.
 *
 * On entry *snapshot_size holds the caller's buffer size; on return it holds
 * the size of the populated snapshot.  When size_only is TRUE only the size
 * is computed and no allocation occurs.  On success the caller owns the
 * returned buffer and must free it (see memorystatus_cmd_get_jetsam_snapshot,
 * which kfree_data()s it after copyout).
 */
static int
memorystatus_get_on_demand_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only)
{
	size_t input_size = *snapshot_size;
	/* Sampled without the lock; see the race note below. */
	uint32_t ods_list_count = memorystatus_list_count;
	memorystatus_jetsam_snapshot_t *ods = NULL; /* The on_demand snapshot buffer */

	*snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (ods_list_count));

	if (size_only) {
		return 0;
	}

	/*
	 * Validate the size of the snapshot buffer.
	 * This is inherently racey. May want to revisit
	 * this error condition and trim the output when
	 * it doesn't fit.
	 */
	if (input_size < *snapshot_size) {
		return EINVAL;
	}

	/*
	 * Allocate and initialize a snapshot buffer.
	 */
	ods = kalloc_data(*snapshot_size, Z_WAITOK | Z_ZERO);
	if (!ods) {
		return ENOMEM;
	}

	/* Populate under the proc list lock so the entry set is consistent. */
	proc_list_lock();
	memorystatus_init_jetsam_snapshot_locked(ods, ods_list_count);
	proc_list_unlock();

	/*
	 * Return the kernel allocated, on_demand buffer.
	 * The caller of this routine will copy the data out
	 * to user space and then free the kernel allocated
	 * buffer.
	 */
	*snapshot = ods;

	memorystatus_log_debug(
		"memorystatus_get_on_demand_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
		(long)input_size, (long)*snapshot_size, (long)ods_list_count);

	return 0;
}
7035
7036 static int
memorystatus_get_jetsam_snapshot(memorystatus_jetsam_snapshot_t ** snapshot,size_t * snapshot_size,boolean_t size_only)7037 memorystatus_get_jetsam_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only)
7038 {
7039 size_t input_size = *snapshot_size;
7040
7041 if (memorystatus_jetsam_snapshot_count > 0) {
7042 *snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count));
7043 } else {
7044 *snapshot_size = 0;
7045 }
7046
7047 if (size_only) {
7048 return 0;
7049 }
7050
7051 if (input_size < *snapshot_size) {
7052 return EINVAL;
7053 }
7054
7055 *snapshot = memorystatus_jetsam_snapshot;
7056
7057 memorystatus_log_debug(
7058 "memorystatus_get_jetsam_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
7059 (long)input_size, (long)*snapshot_size, (long)memorystatus_jetsam_snapshot_count);
7060
7061 return 0;
7062 }
7063
7064 #if JETSAM_ZPRINT_SNAPSHOT
7065 /*
7066 * Utility function to handle copyout of jetsam zprint snapshot data
7067 */
7068 static int
memorystatus_cmd_get_data_buffer(user_addr_t buffer,size_t buffer_size,int32_t * retval,size_t data_size,void * data)7069 memorystatus_cmd_get_data_buffer(
7070 user_addr_t buffer,
7071 size_t buffer_size,
7072 int32_t *retval,
7073 size_t data_size,
7074 void *data)
7075 {
7076 boolean_t size_only = (buffer == USER_ADDR_NULL);
7077 int error;
7078
7079 /* Nothing to return if there's no data yet, instruct the caller to try again later. */
7080 if (data == NULL) {
7081 *retval = -1;
7082 return EAGAIN;
7083 }
7084
7085 /* Handle just a size request */
7086 if (size_only) {
7087 *retval = (int32_t)data_size;
7088 return 0;
7089 }
7090
7091 /* buffer needs to be large enough */
7092 if (buffer_size < data_size) {
7093 *retval = -1;
7094 return EINVAL;
7095 }
7096
7097 error = copyout(data, buffer, data_size);
7098 if (error == 0) {
7099 *retval = (int32_t)data_size;
7100 } else {
7101 *retval = -1;
7102 }
7103
7104 return error;
7105 }
7106 #endif
7107
/*
 * Command handler: return one of the jetsam snapshots to user space.
 *
 * flags selects the snapshot: 0 = default, MEMORYSTATUS_SNAPSHOT_ON_DEMAND,
 * MEMORYSTATUS_SNAPSHOT_AT_BOOT, or (with CONFIG_FREEZE)
 * MEMORYSTATUS_FLAGS_SNAPSHOT_FREEZER.  Exactly one flag may be set.
 * A NULL buffer requests the required size only; on success *retval is the
 * number of bytes the caller should use.
 */
static int
memorystatus_cmd_get_jetsam_snapshot(int32_t flags, user_addr_t buffer, size_t buffer_size, int32_t *retval)
{
	int error = EINVAL;
	boolean_t size_only;
	boolean_t is_default_snapshot = FALSE;
	boolean_t is_on_demand_snapshot = FALSE;
	boolean_t is_at_boot_snapshot = FALSE;
#if CONFIG_FREEZE
	bool is_freezer_snapshot = false;
#endif /* CONFIG_FREEZE */
	memorystatus_jetsam_snapshot_t *snapshot;

	size_only = ((buffer == USER_ADDR_NULL) ? TRUE : FALSE);

	if (flags == 0) {
		/* Default */
		is_default_snapshot = TRUE;
		error = memorystatus_get_jetsam_snapshot(&snapshot, &buffer_size, size_only);
	} else {
		if (flags & ~(MEMORYSTATUS_SNAPSHOT_ON_DEMAND | MEMORYSTATUS_SNAPSHOT_AT_BOOT | MEMORYSTATUS_FLAGS_SNAPSHOT_FREEZER)) {
			/*
			 * Unsupported bit set in flag.
			 */
			return EINVAL;
		}

		if (flags & (flags - 0x1)) {
			/*
			 * Can't have multiple flags set at the same time.
			 * (x & (x - 1)) is non-zero iff more than one bit of x is set.
			 */
			return EINVAL;
		}

		if (flags & MEMORYSTATUS_SNAPSHOT_ON_DEMAND) {
			is_on_demand_snapshot = TRUE;
			/*
			 * When not requesting the size only, the following call will allocate
			 * an on_demand snapshot buffer, which is freed below.
			 */
			error = memorystatus_get_on_demand_snapshot(&snapshot, &buffer_size, size_only);
		} else if (flags & MEMORYSTATUS_SNAPSHOT_AT_BOOT) {
			is_at_boot_snapshot = TRUE;
			error = memorystatus_get_at_boot_snapshot(&snapshot, &buffer_size, size_only);
#if CONFIG_FREEZE
		} else if (flags & MEMORYSTATUS_FLAGS_SNAPSHOT_FREEZER) {
			is_freezer_snapshot = true;
			error = memorystatus_get_jetsam_snapshot_freezer(&snapshot, &buffer_size, size_only);
#endif /* CONFIG_FREEZE */
		} else {
			/*
			 * Invalid flag setting.
			 */
			return EINVAL;
		}
	}

	if (error) {
		goto out;
	}

	/*
	 * Copy the data out to user space and clear the snapshot buffer.
	 * If working with the jetsam snapshot,
	 * clearing the buffer means, reset the count.
	 * If working with an on_demand snapshot
	 * clearing the buffer means, free it.
	 * If working with the at_boot snapshot
	 * there is nothing to clear or update.
	 * If working with a copy of the snapshot
	 * there is nothing to clear or update.
	 * If working with the freezer snapshot
	 * clearing the buffer means, reset the count.
	 */
	if (!size_only) {
		if ((error = copyout(snapshot, buffer, buffer_size)) == 0) {
#if CONFIG_FREEZE
			if (is_default_snapshot || is_freezer_snapshot) {
#else
			if (is_default_snapshot) {
#endif /* CONFIG_FREEZE */
				/*
				 * The jetsam snapshot is never freed, its count is simply reset.
				 * However, we make a copy for any parties that might be interested
				 * in the previous fully populated snapshot.
				 */
				proc_list_lock();
#if DEVELOPMENT || DEBUG
				if (memorystatus_testing_pid != 0 && memorystatus_testing_pid != proc_getpid(current_proc())) {
					/* Snapshot is currently owned by someone else. Don't consume it. */
					proc_list_unlock();
					goto out;
				}
#endif /* (DEVELOPMENT || DEBUG)*/
				if (is_default_snapshot) {
					snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
					memorystatus_jetsam_snapshot_last_timestamp = 0;
				}
#if CONFIG_FREEZE
				else if (is_freezer_snapshot) {
					memorystatus_jetsam_snapshot_freezer->entry_count = 0;
				}
#endif /* CONFIG_FREEZE */
				proc_list_unlock();
			}
		}

		if (is_on_demand_snapshot) {
			/*
			 * The on_demand snapshot is always freed,
			 * even if the copyout failed.
			 */
			kfree_data(snapshot, buffer_size);
		}
	}

out:
	if (error == 0) {
		assert(buffer_size <= INT32_MAX);
		*retval = (int32_t) buffer_size;
	}
	return error;
}
7231
7232 #if DEVELOPMENT || DEBUG
7233 static int
7234 memorystatus_cmd_set_testing_pid(int32_t flags)
7235 {
7236 int error = EINVAL;
7237 proc_t caller = current_proc();
7238 assert(caller != kernproc);
7239 proc_list_lock();
7240 if (flags & MEMORYSTATUS_FLAGS_SET_TESTING_PID) {
7241 if (memorystatus_testing_pid == 0) {
7242 memorystatus_testing_pid = proc_getpid(caller);
7243 error = 0;
7244 } else if (memorystatus_testing_pid == proc_getpid(caller)) {
7245 error = 0;
7246 } else {
7247 /* We don't allow ownership to be taken from another proc. */
7248 error = EBUSY;
7249 }
7250 } else if (flags & MEMORYSTATUS_FLAGS_UNSET_TESTING_PID) {
7251 if (memorystatus_testing_pid == proc_getpid(caller)) {
7252 memorystatus_testing_pid = 0;
7253 error = 0;
7254 } else if (memorystatus_testing_pid != 0) {
7255 /* We don't allow ownership to be taken from another proc. */
7256 error = EPERM;
7257 }
7258 }
7259 proc_list_unlock();
7260
7261 return error;
7262 }
7263 #endif /* DEVELOPMENT || DEBUG */
7264
7265 /*
7266 * Routine: memorystatus_cmd_grp_set_priorities
7267 * Purpose: Update priorities for a group of processes.
7268 *
7269 * [priority]
7270 * Move each process out of its effective priority
7271 * band and into a new priority band.
7272 * Maintains relative order from lowest to highest priority.
7273 * In single band, maintains relative order from head to tail.
7274 *
7275 * eg: before [effectivepriority | pid]
7276 * [18 | p101 ]
7277 * [17 | p55, p67, p19 ]
7278 * [12 | p103 p10 ]
7279 * [ 7 | p25 ]
7280 * [ 0 | p71, p82, ]
7281 *
7282 * after [ new band | pid]
7283 * [ xxx | p71, p82, p25, p103, p10, p55, p67, p19, p101]
7284 *
7285 * Returns: 0 on success, else non-zero.
7286 *
7287 * Caveat: We know there is a race window regarding recycled pids.
7288 * A process could be killed before the kernel can act on it here.
7289 * If a pid cannot be found in any of the jetsam priority bands,
7290 * then we simply ignore it. No harm.
7291 * But, if the pid has been recycled then it could be an issue.
7292 * In that scenario, we might move an unsuspecting process to the new
7293 * priority band. It's not clear how the kernel can safeguard
7294 * against this, but it would be an extremely rare case anyway.
7295 * The caller of this api might avoid such race conditions by
7296 * ensuring that the processes passed in the pid list are suspended.
7297 */
7298
7299
static int
memorystatus_cmd_grp_set_priorities(user_addr_t buffer, size_t buffer_size)
{
	/*
	 * We only handle setting priority
	 * per process
	 */
	int error = 0;
	memorystatus_properties_entry_v1_t *entries = NULL;  /* copyin'd user entries */
	size_t entry_count = 0;

	/* This will be the ordered proc list */
	typedef struct memorystatus_internal_properties {
		proc_t proc;
		int32_t priority;
	} memorystatus_internal_properties_t;

	memorystatus_internal_properties_t *table = NULL;
	uint32_t table_count = 0;  /* number of pids actually found in a band */

	size_t i = 0;
	uint32_t bucket_index = 0;
	int32_t new_priority;

	proc_t p;

	/* Verify inputs */
	if ((buffer == USER_ADDR_NULL) || (buffer_size == 0)) {
		error = EINVAL;
		goto out;
	}

	entry_count = (buffer_size / sizeof(memorystatus_properties_entry_v1_t));
	if (entry_count == 0) {
		/* buffer size was not large enough for a single entry */
		error = EINVAL;
		goto out;
	}

	if ((entries = kalloc_data(buffer_size, Z_WAITOK)) == NULL) {
		error = ENOMEM;
		goto out;
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_START, MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY, entry_count);

	if ((error = copyin(buffer, entries, buffer_size)) != 0) {
		goto out;
	}

	/*
	 * Verify sanity of input priorities.
	 * Only v1 entries are accepted, and the buffer must be an exact
	 * multiple of the v1 entry size.
	 */
	if (entries[0].version == MEMORYSTATUS_MPE_VERSION_1) {
		if ((buffer_size % MEMORYSTATUS_MPE_VERSION_1_SIZE) != 0) {
			error = EINVAL;
			goto out;
		}
	} else {
		error = EINVAL;
		goto out;
	}

	for (i = 0; i < entry_count; i++) {
		if (entries[i].priority == -1) {
			/* Use as shorthand for default priority */
			entries[i].priority = JETSAM_PRIORITY_DEFAULT;
		} else if (entries[i].priority > JETSAM_PRIORITY_IDLE && entries[i].priority <= applications_aging_band) {
			/*
			 * Everything between idle and the aging bands are reserved for internal use.
			 * if requested, adjust to JETSAM_PRIORITY_IDLE.
			 * Entitled processes (just munch) can use a subset of this range for testing.
			 */
			if (entries[i].priority > JETSAM_PRIORITY_ENTITLED_MAX ||
			    !current_task_can_use_entitled_range()) {
				entries[i].priority = JETSAM_PRIORITY_IDLE;
			}
		} else if (entries[i].priority == JETSAM_PRIORITY_IDLE_HEAD) {
			/* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle
			 * queue */
			/* Deal with this later */
		} else if ((entries[i].priority < 0) || (entries[i].priority >= MEMSTAT_BUCKET_COUNT)) {
			/* Sanity check */
			error = EINVAL;
			goto out;
		}
	}

	table = kalloc_type(memorystatus_internal_properties_t, entry_count,
	    Z_WAITOK | Z_ZERO);
	if (table == NULL) {
		error = ENOMEM;
		goto out;
	}


	/*
	 * For each jetsam bucket entry, spin through the input property list.
	 * When a matching pid is found, populate an adjacent table with the
	 * appropriate proc pointer and new property values.
	 * This traversal automatically preserves order from lowest
	 * to highest priority.
	 *
	 * NOTE(review): this is O(list_count * entry_count); presumably
	 * acceptable because entry_count is small — confirm if callers pass
	 * large lists.
	 */

	bucket_index = 0;

	proc_list_lock();

	/* Create the ordered table */
	p = memorystatus_get_first_proc_locked(&bucket_index, TRUE);
	while (p && (table_count < entry_count)) {
		for (i = 0; i < entry_count; i++) {
			if (proc_getpid(p) == entries[i].pid) {
				/* Build the table data */
				table[table_count].proc = p;
				table[table_count].priority = entries[i].priority;
				table_count++;
				break;
			}
		}
		p = memorystatus_get_next_proc_locked(&bucket_index, p, TRUE);
	}

	/* We now have ordered list of procs ready to move */
	for (i = 0; i < table_count; i++) {
		p = table[i].proc;
		assert(p != NULL);
		memstat_priority_options_t priority_options = MEMSTAT_PRIORITY_OPTIONS_NONE;

		/*
		 * Allow head inserts -- but note that head insertion presumably
		 * does not preserve relative order among head-inserted entries.
		 */
		if (table[i].priority == JETSAM_PRIORITY_IDLE_HEAD) {
			new_priority = JETSAM_PRIORITY_IDLE;
			priority_options |= MEMSTAT_PRIORITY_INSERT_HEAD;
		} else {
			new_priority = table[i].priority;
		}

		/* Not allowed: internal (kernel-managed) procs are never moved. */
		if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
			continue;
		}

		memstat_update_priority_locked(p, new_priority, priority_options);
	}

	proc_list_unlock();

	/*
	 * if (table_count != entry_count)
	 * then some pids were not found in a jetsam band.
	 * harmless but interesting...
	 */
out:
	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_END, MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY, entry_count, table_count);

	/* kfree_data/kfree_type are NULL-safe, so early-exit paths are fine. */
	kfree_data(entries, buffer_size);
	kfree_type(memorystatus_internal_properties_t, entry_count, table);

	return error;
}
7458
/*
 * Per-process-name "probability of use" table, installed by
 * memorystatus_cmd_grp_set_probabilities().  The table pointer and size are
 * swapped under proc_list_lock; NULL/0 until first set.
 */
memorystatus_internal_probabilities_t *memorystatus_global_probabilities_table = NULL;
size_t memorystatus_global_probabilities_size = 0;
7461
/*
 * Replace the global per-process-name probability table with the entries
 * supplied by user space (v1 format only).  The old table is swapped out
 * under proc_list_lock and freed after the lock is dropped.
 */
static int
memorystatus_cmd_grp_set_probabilities(user_addr_t buffer, size_t buffer_size)
{
	int error = 0;
	memorystatus_properties_entry_v1_t *entries = NULL;
	size_t entry_count = 0, i = 0;
	memorystatus_internal_probabilities_t *tmp_table_new = NULL, *tmp_table_old = NULL;
	size_t tmp_table_new_size = 0, tmp_table_old_size = 0;
#if DEVELOPMENT || DEBUG
	if (memorystatus_testing_pid != 0 && memorystatus_testing_pid != proc_getpid(current_proc())) {
		/* probabilities are currently owned by someone else. Don't change them. */
		error = EPERM;
		goto out;
	}
#endif /* (DEVELOPMENT || DEBUG)*/

	/* Verify inputs */
	if ((buffer == USER_ADDR_NULL) || (buffer_size == 0)) {
		error = EINVAL;
		goto out;
	}

	entry_count = (buffer_size / sizeof(memorystatus_properties_entry_v1_t));
	if (entry_count == 0) {
		error = EINVAL;
		goto out;
	}

	if ((entries = kalloc_data(buffer_size, Z_WAITOK)) == NULL) {
		error = ENOMEM;
		goto out;
	}

	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_START, MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY, entry_count);

	if ((error = copyin(buffer, entries, buffer_size)) != 0) {
		goto out;
	}

	/* Only v1 entries are accepted, in an exact multiple of the v1 size. */
	if (entries[0].version == MEMORYSTATUS_MPE_VERSION_1) {
		if ((buffer_size % MEMORYSTATUS_MPE_VERSION_1_SIZE) != 0) {
			error = EINVAL;
			goto out;
		}
	} else {
		error = EINVAL;
		goto out;
	}

	/* Verify sanity of input priorities */
	for (i = 0; i < entry_count; i++) {
		/*
		 * 0 - low probability of use.
		 * 1 - high probability of use.
		 *
		 * Keeping this field an int (& not a bool) to allow
		 * us to experiment with different values/approaches
		 * later on.
		 */
		if (entries[i].use_probability > 1) {
			error = EINVAL;
			goto out;
		}
	}

	tmp_table_new_size = sizeof(memorystatus_internal_probabilities_t) * entry_count;

	if ((tmp_table_new = kalloc_data(tmp_table_new_size, Z_WAITOK | Z_ZERO)) == NULL) {
		error = ENOMEM;
		goto out;
	}

	proc_list_lock();

	/* Stash the old table so it can be freed outside the lock. */
	if (memorystatus_global_probabilities_table) {
		tmp_table_old = memorystatus_global_probabilities_table;
		tmp_table_old_size = memorystatus_global_probabilities_size;
	}

	memorystatus_global_probabilities_table = tmp_table_new;
	memorystatus_global_probabilities_size = tmp_table_new_size;
	tmp_table_new = NULL;  /* ownership transferred; don't free at out: */

	for (i = 0; i < entry_count; i++) {
		/* Build the table data */
		strlcpy(memorystatus_global_probabilities_table[i].proc_name, entries[i].proc_name, MAXCOMLEN + 1);
		memorystatus_global_probabilities_table[i].use_probability = entries[i].use_probability;
	}

	proc_list_unlock();

out:
	KDBG(MEMSTAT_CODE(BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_END, MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY, entry_count, tmp_table_new_size);

	/* Both are NULL-safe; tmp_table_old is only set once replaced. */
	kfree_data(entries, buffer_size);
	kfree_data(tmp_table_old, tmp_table_old_size);

	return error;
}
7561
7562 static int
7563 memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
7564 {
7565 int error = 0;
7566
7567 if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY) == MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY) {
7568 error = memorystatus_cmd_grp_set_priorities(buffer, buffer_size);
7569 } else if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY) == MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY) {
7570 error = memorystatus_cmd_grp_set_probabilities(buffer, buffer_size);
7571 #if CONFIG_FREEZE
7572 } else if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_FREEZE_PRIORITY) == MEMORYSTATUS_FLAGS_GRP_SET_FREEZE_PRIORITY) {
7573 error = memorystatus_cmd_grp_set_freeze_list(buffer, buffer_size);
7574 } else if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_DEMOTE_PRIORITY) == MEMORYSTATUS_FLAGS_GRP_SET_DEMOTE_PRIORITY) {
7575 error = memorystatus_cmd_grp_set_demote_list(buffer, buffer_size);
7576 #endif /* CONFIG_FREEZE */
7577 } else {
7578 error = EINVAL;
7579 }
7580
7581 return error;
7582 }
7583
7584 /*
7585 * This routine is used to update a process's jetsam priority position and stored user_data.
7586 * It is not used for the setting of memory limits.
7587 *
7588 * Flags passed into this call are used to distinguish the motivation behind a jetsam priority
7589 * transition. By default, the kernel updates the process's original requested priority when
7590 * no flag is passed. But when the MEMORYSTATUS_SET_PRIORITY_ASSERTION flag is used, the kernel
7591 * updates the process's assertion driven priority.
7592 *
7593 * The assertion flag was introduced for use by the device's assertion mediator (eg: runningboardd).
7594 * When an assertion is controlling a process's jetsam priority, it may conflict with that process's
7595 * dirty/clean (active/inactive) jetsam state. The kernel attempts to resolve a priority transition
7596 * conflict by reviewing the process state and then choosing the maximum jetsam band at play,
7597 * eg: requested priority versus assertion priority.
7598 */
7599
static int
memorystatus_cmd_set_priority_properties(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
{
	int error = 0;
	memorystatus_priority_properties_t mpp_entry;

	/* Validate inputs */
	if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_priority_properties_t))) {
		return EINVAL;
	}

	/* Validate flags */
	if (flags & ~(MEMORYSTATUS_SET_PRIORITY_ASSERTION)) {
		/*
		 * Unsupported bit set in flag.
		 */
		return EINVAL;
	}

	error = copyin(buffer, &mpp_entry, buffer_size);

	if (error == 0) {
		proc_t p;

		p = proc_find(pid);
		if (!p) {
			return ESRCH;
		}

		/* Internal (kernel-managed) processes may not be retargeted. */
		if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
			proc_rele(p);
			return EPERM;
		}

		if ((flags & MEMORYSTATUS_SET_PRIORITY_ASSERTION) &&
		    !(p->p_memstat_state & P_MEMSTAT_MANAGED)) {
			/*
			 * Assertion-driven priority updates are only honored for
			 * managed (P_MEMSTAT_MANAGED) processes.
			 */
			proc_rele(p);
			return EPERM;
		}

		memstat_priority_options_t options = MEMSTAT_PRIORITY_OPTIONS_NONE;
		if (flags & MEMORYSTATUS_SET_PRIORITY_ASSERTION) {
			options |= MEMSTAT_PRIORITY_IS_ASSERTION;
		}
		error = memorystatus_set_priority(p, mpp_entry.priority, mpp_entry.user_data,
		    options);
		proc_rele(p);
	}

	return error;
}
7655
7656 static int
7657 memorystatus_cmd_set_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
7658 {
7659 int error = 0;
7660 memorystatus_memlimit_properties_t mmp_entry;
7661
7662 /* Validate inputs */
7663 if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_memlimit_properties_t))) {
7664 return EINVAL;
7665 }
7666
7667 error = copyin(buffer, &mmp_entry, buffer_size);
7668
7669 if (error == 0) {
7670 error = memorystatus_set_memlimit_properties(pid, &mmp_entry);
7671 }
7672
7673 return error;
7674 }
7675
7676 #if DEBUG || DEVELOPMENT
/*
 * DEBUG/DEVELOPMENT only: set the diagnostics memory limit for pid from a
 * user-supplied memorystatus_diag_memlimit_properties_t.
 */
static int
memorystatus_cmd_set_diag_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
{
	int error = 0;
	memorystatus_diag_memlimit_properties_t mmp_entry;
	proc_t p = proc_find(pid);
	if (!p) {
		return ESRCH;
	}

	/* Validate inputs (after proc_find so the single release path below suffices) */
	if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_diag_memlimit_properties_t))) {
		proc_rele(p);
		return EINVAL;
	}

	error = copyin(buffer, &mmp_entry, buffer_size);

	if (error == 0) {
		/* The internal setter asserts proc_list_mlock is held. */
		proc_list_lock();
		error = memorystatus_set_diag_memlimit_properties_internal(p, &mmp_entry);
		proc_list_unlock();
	}
	proc_rele(p);
	return error;
}
7703
/*
 * DEBUG/DEVELOPMENT only: read back the diagnostics memory limit for pid and
 * copy it out to the caller's buffer.
 */
static int
memorystatus_cmd_get_diag_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
{
	int error = 0;
	memorystatus_diag_memlimit_properties_t mmp_entry;
	proc_t p = proc_find(pid);
	if (!p) {
		return ESRCH;
	}

	/* Validate inputs */
	if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_diag_memlimit_properties_t))) {
		proc_rele(p);
		return EINVAL;
	}
	/* Query under the proc list lock, then copy out without it. */
	proc_list_lock();
	error = memorystatus_get_diag_memlimit_properties_internal(p, &mmp_entry);
	proc_list_unlock();
	proc_rele(p);
	if (error == 0) {
		error = copyout(&mmp_entry, buffer, buffer_size);
	}


	return error;
}
7730 #endif //DEBUG || DEVELOPMENT
7731
7732 static void
7733 memorystatus_get_memlimit_properties_internal(proc_t p, memorystatus_memlimit_properties_t* p_entry)
7734 {
7735 memset(p_entry, 0, sizeof(memorystatus_memlimit_properties_t));
7736
7737 if (p->p_memstat_memlimit_active > 0) {
7738 p_entry->memlimit_active = p->p_memstat_memlimit_active;
7739 } else {
7740 task_convert_phys_footprint_limit(-1, &p_entry->memlimit_active);
7741 }
7742
7743 if (_memstat_proc_active_memlimit_is_fatal(p)) {
7744 p_entry->memlimit_active_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7745 }
7746
7747 /*
7748 * Get the inactive limit and attributes
7749 */
7750 if (p->p_memstat_memlimit_inactive <= 0) {
7751 task_convert_phys_footprint_limit(-1, &p_entry->memlimit_inactive);
7752 } else {
7753 p_entry->memlimit_inactive = p->p_memstat_memlimit_inactive;
7754 }
7755 if (_memstat_proc_inactive_memlimit_is_fatal(p)) {
7756 p_entry->memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7757 }
7758 }
7759
7760 /*
7761 * When getting the memlimit settings, we can't simply call task_get_phys_footprint_limit().
7762 * That gets the proc's cached memlimit and there is no guarantee that the active/inactive
7763 * limits will be the same in the no-limit case. Instead we convert limits <= 0 using
7764 * task_convert_phys_footprint_limit(). It computes the same limit value that would be written
7765 * to the task's ledgers via task_set_phys_footprint_limit().
7766 */
static int
memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
{
	memorystatus_memlimit_properties2_t mmp_entry;

	/*
	 * Validate inputs.  Both the v1 and v2 struct sizes are accepted;
	 * the copyout below uses buffer_size, so a v1 caller only receives
	 * the v1 prefix of the v2 record.
	 */
	if ((pid == 0) || (buffer == USER_ADDR_NULL) ||
	    ((buffer_size != sizeof(memorystatus_memlimit_properties_t)) &&
	    (buffer_size != sizeof(memorystatus_memlimit_properties2_t)))) {
		return EINVAL;
	}

	memset(&mmp_entry, 0, sizeof(memorystatus_memlimit_properties2_t));

	proc_t p = proc_find(pid);
	if (!p) {
		return ESRCH;
	}

	/*
	 * Get the active limit and attributes.
	 * No locks taken since we hold a reference to the proc.
	 */

	memorystatus_get_memlimit_properties_internal(p, &mmp_entry.v1);

#if CONFIG_JETSAM
#if DEVELOPMENT || DEBUG
	/*
	 * Get the limit increased via SPI
	 */
	mmp_entry.memlimit_increase = roundToNearestMB(p->p_memlimit_increase);
	mmp_entry.memlimit_increase_bytes = p->p_memlimit_increase;
#endif /* DEVELOPMENT || DEBUG */
#endif /* CONFIG_JETSAM */

	proc_rele(p);

	int error = copyout(&mmp_entry, buffer, buffer_size);

	return error;
}
7809
7810
7811 /*
7812 * SPI for kbd - pr24956468
7813 * This is a very simple snapshot that calculates how much a
7814 * process's phys_footprint exceeds a specific memory limit.
7815 * Only the inactive memory limit is supported for now.
7816 * The delta is returned as bytes in excess or zero.
7817 */
7818 static int
7819 memorystatus_cmd_get_memlimit_excess_np(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
7820 {
7821 int error = 0;
7822 uint64_t footprint_in_bytes = 0;
7823 uint64_t delta_in_bytes = 0;
7824 int32_t memlimit_mb = 0;
7825 uint64_t memlimit_bytes = 0;
7826
7827 /* Validate inputs */
7828 if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(uint64_t)) || (flags != 0)) {
7829 return EINVAL;
7830 }
7831
7832 proc_t p = proc_find(pid);
7833 if (!p) {
7834 return ESRCH;
7835 }
7836
7837 /*
7838 * Get the inactive limit.
7839 * No locks taken since we hold a reference to the proc.
7840 */
7841
7842 if (p->p_memstat_memlimit_inactive <= 0) {
7843 task_convert_phys_footprint_limit(-1, &memlimit_mb);
7844 } else {
7845 memlimit_mb = p->p_memstat_memlimit_inactive;
7846 }
7847
7848 footprint_in_bytes = get_task_phys_footprint(proc_task(p));
7849
7850 proc_rele(p);
7851
7852 memlimit_bytes = memlimit_mb * 1024 * 1024; /* MB to bytes */
7853
7854 /*
7855 * Computed delta always returns >= 0 bytes
7856 */
7857 if (footprint_in_bytes > memlimit_bytes) {
7858 delta_in_bytes = footprint_in_bytes - memlimit_bytes;
7859 }
7860
7861 error = copyout(&delta_in_bytes, buffer, sizeof(delta_in_bytes));
7862
7863 return error;
7864 }
7865
7866
7867 static int
7868 memorystatus_cmd_get_pressure_status(int32_t *retval)
7869 {
7870 int error;
7871
7872 /* Need privilege for check */
7873 error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
7874 if (error) {
7875 return error;
7876 }
7877
7878 /* Inherently racy, so it's not worth taking a lock here */
7879 *retval = (kVMPressureNormal != memorystatus_vm_pressure_level) ? 1 : 0;
7880
7881 return error;
7882 }
7883
7884 int
7885 memorystatus_get_pressure_status_kdp()
7886 {
7887 return (kVMPressureNormal != memorystatus_vm_pressure_level) ? 1 : 0;
7888 }
7889
7890 /*
7891 * Every process, including a P_MEMSTAT_INTERNAL process (currently only pid 1), is allowed to set a HWM.
7892 *
7893 * This call is inflexible -- it does not distinguish between active/inactive, fatal/non-fatal
7894 * So, with 2-level HWM preserving previous behavior will map as follows.
7895 * - treat the limit passed in as both an active and inactive limit.
7896 * - treat the is_fatal_limit flag as though it applies to both active and inactive limits.
7897 *
7898 * When invoked via MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK
7899 * - the is_fatal_limit is FALSE, meaning the active and inactive limits are non-fatal/soft
7900 * - so mapping is (active/non-fatal, inactive/non-fatal)
7901 *
7902 * When invoked via MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT
7903 * - the is_fatal_limit is TRUE, meaning the process's active and inactive limits are fatal/hard
7904 * - so mapping is (active/fatal, inactive/fatal)
7905 */
7906
7907 #if CONFIG_JETSAM
7908 static int
7909 memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __unused int32_t *retval, boolean_t is_fatal_limit)
7910 {
7911 int error = 0;
7912 memorystatus_memlimit_properties_t entry;
7913
7914 entry.memlimit_active = high_water_mark;
7915 entry.memlimit_active_attr = 0;
7916 entry.memlimit_inactive = high_water_mark;
7917 entry.memlimit_inactive_attr = 0;
7918
7919 if (is_fatal_limit == TRUE) {
7920 entry.memlimit_active_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7921 entry.memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7922 }
7923
7924 error = memorystatus_set_memlimit_properties(pid, &entry);
7925 return error;
7926 }
7927
7928 static int
7929 memorystatus_cmd_mark_process_coalition_swappable(pid_t pid, __unused int32_t *retval)
7930 {
7931 int error = 0;
7932 proc_t p = PROC_NULL;
7933 coalition_t coal = COALITION_NULL;
7934
7935 if (!memorystatus_swap_all_apps) {
7936 /* Swap is not supported on this device. */
7937 return ENOTSUP;
7938 }
7939 p = proc_find(pid);
7940 if (!p) {
7941 return ESRCH;
7942 }
7943 coal = task_get_coalition((task_t) proc_task(p), COALITION_TYPE_JETSAM);
7944 if (coal && coalition_is_leader((task_t) proc_task(p), coal)) {
7945 coalition_mark_swappable(coal);
7946 } else {
7947 /* This SPI is only supported on coalition leaders. */
7948 error = EINVAL;
7949 }
7950
7951 proc_rele(p);
7952 return error;
7953 }
7954
7955 static int
7956 memorystatus_cmd_get_process_coalition_is_swappable(pid_t pid, int32_t *retval)
7957 {
7958 int error = 0;
7959 proc_t p = PROC_NULL;
7960 coalition_t coal = COALITION_NULL;
7961
7962 if (!memorystatus_swap_all_apps) {
7963 /* Swap is not supported on this device. */
7964 return ENOTSUP;
7965 }
7966 p = proc_find(pid);
7967 if (!p) {
7968 return ESRCH;
7969 }
7970 coal = task_get_coalition((task_t) proc_task(p), COALITION_TYPE_JETSAM);
7971 if (coal) {
7972 *retval = coalition_is_swappable(coal);
7973 } else {
7974 error = EINVAL;
7975 }
7976
7977 proc_rele(p);
7978 return error;
7979 }
7980
7981 static int
7982 memorystatus_cmd_convert_memlimit_mb(pid_t pid, int32_t limit, int32_t *retval)
7983 {
7984 int error = 0;
7985 proc_t p;
7986 p = proc_find(pid);
7987 if (!p) {
7988 return ESRCH;
7989 }
7990 if (limit <= 0) {
7991 /*
7992 * A limit of <= 0 implies that the task gets its default limit.
7993 */
7994 limit = memorystatus_get_default_task_active_limit(p);
7995 if (limit <= 0) {
7996 /* Task uses system wide default limit */
7997 limit = max_task_footprint_mb ? max_task_footprint_mb : INT32_MAX;
7998 }
7999 *retval = limit;
8000 } else {
8001 #if DEVELOPMENT || DEBUG
8002 /* add the current increase to it, for roots */
8003 limit += roundToNearestMB(p->p_memlimit_increase);
8004 #endif /* DEVELOPMENT || DEBUG */
8005 *retval = limit;
8006 }
8007
8008 proc_rele(p);
8009 return error;
8010 }
8011 #endif /* CONFIG_JETSAM */
8012
8013 #if DEBUG || DEVELOPMENT
/*
 * Apply a diagnostics memory limit to a process by writing the new limit
 * into the task's ledgers.
 *
 * p       - target process; caller holds a reference.
 * p_entry - requested diag limit (units per
 *           task_set_diag_footprint_limit_internal() -- the log below
 *           formats it as MB; TODO confirm against that routine).
 *
 * Returns 0 (KERN_SUCCESS) on success, EINVAL if the ledger update failed.
 * NOTE(review): this mixes kern_return_t and errno domains; it works only
 * because KERN_SUCCESS == 0.
 *
 * Caller must hold proc_list_mlock (asserted below).
 */
static int
memorystatus_set_diag_memlimit_properties_internal(proc_t p, memorystatus_diag_memlimit_properties_t *p_entry)
{
	int error = 0;
	uint64_t old_limit = 0;

	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
	/* Enforce the limit by writing to the ledgers */
	error = (task_set_diag_footprint_limit_internal(proc_task(p), p_entry->memlimit, &old_limit) == KERN_SUCCESS) ? KERN_SUCCESS : EINVAL;

	/*
	 * NOTE(review): the ternary below feeds a signed -1 through a %llu
	 * conversion, so an "unlimited" memlimit logs as a huge unsigned value.
	 */
	memorystatus_log_debug( "memorystatus_set_diag_memlimit_properties: new limit on pid %d (%lluMB old %lluMB)\n",
	    proc_getpid(p), (p_entry->memlimit > 0 ? p_entry->memlimit : -1), (old_limit)
	    );
	DTRACE_MEMORYSTATUS2(memorystatus_diag_memlimit_properties_t, proc_t, p, int32_t, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1));
	return error;
}
8030
/*
 * Read back a process's diagnostics memory limit and whether the diag
 * threshold is enabled, straight from the task's ledgers.
 *
 * p       - target process; caller holds a reference.
 * p_entry - out: memlimit and threshold_enabled are filled in.
 *
 * Returns 0 (KERN_SUCCESS) on success, EINVAL if the ledger query failed.
 * NOTE(review): like its setter counterpart, this mixes kern_return_t and
 * errno domains; it works only because KERN_SUCCESS == 0.
 */
static int
memorystatus_get_diag_memlimit_properties_internal(proc_t p, memorystatus_diag_memlimit_properties_t *p_entry)
{
	int error = 0;
	/* Query the limit by reading from the ledgers */
	error = (task_get_diag_footprint_limit_internal(proc_task(p), &p_entry->memlimit, &p_entry->threshold_enabled) == KERN_SUCCESS) ? KERN_SUCCESS : EINVAL;

	DTRACE_MEMORYSTATUS2(memorystatus_diag_memlimit_properties_t, proc_t, p, int32_t, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1));
	return error;
}
8041 #endif // DEBUG || DEVELOPMENT
8042
8043 bool
8044 memorystatus_task_has_increased_memory_limit_entitlement(task_t task)
8045 {
8046 if (memorystatus_entitled_max_task_footprint_mb == 0) {
8047 // Entitlement is not supported on this device.
8048 return false;
8049 }
8050 return IOTaskHasEntitlement(task,
8051 "com.apple.developer.kernel.increased-memory-limit");
8052 }
8053
8054 bool
8055 memorystatus_task_has_increased_debugging_memory_limit_entitlement(task_t task)
8056 {
8057 if (memorystatus_entitled_dev_max_task_footprint_mb == 0) {
8058 // Entitlement is not supported on this device.
8059 return false;
8060 }
8061 return IOTaskHasEntitlement(task,
8062 "com.apple.developer.kernel.increased-debugging-memory-limit");
8063 }
8064
8065 bool
8066 memorystatus_task_has_legacy_footprint_entitlement(task_t task)
8067 {
8068 return IOTaskHasEntitlement(task,
8069 "com.apple.private.memory.legacy_footprint");
8070 }
8071
8072 bool
8073 memorystatus_task_has_ios13extended_footprint_limit(task_t task)
8074 {
8075 if (max_mem < 1500ULL * 1024 * 1024 ||
8076 max_mem > 2ULL * 1024 * 1024 * 1024) {
8077 /* ios13extended_footprint is only for 2GB devices */
8078 return false;
8079 }
8080 return IOTaskHasEntitlement(task,
8081 "com.apple.developer.memory.ios13extended_footprint");
8082 }
8083
8084 static int32_t
8085 memorystatus_get_default_task_active_limit(proc_t p)
8086 {
8087 int32_t limit = (int32_t)max_task_footprint_mb;
8088 task_t task = proc_task(p);
8089
8090 /*
8091 * Check for the various entitlement footprint hacks
8092 * and try to apply each one. Note that if multiple entitlements are present
8093 * whichever results in the largest limit applies.
8094 */
8095 if (memorystatus_task_has_increased_debugging_memory_limit_entitlement(task)) {
8096 limit = MAX(limit, memorystatus_entitled_dev_max_task_footprint_mb);
8097 }
8098 if (memorystatus_task_has_increased_memory_limit_entitlement(task)) {
8099 limit = MAX(limit, memorystatus_entitled_max_task_footprint_mb);
8100 }
8101 #if __arm64__
8102 if (legacy_footprint_entitlement_mode == LEGACY_FOOTPRINT_ENTITLEMENT_LIMIT_INCREASE &&
8103 memorystatus_task_has_legacy_footprint_entitlement(task)) {
8104 limit = MAX(limit, max_task_footprint_mb + legacy_footprint_bonus_mb);
8105 }
8106 #endif /* __arm64__ */
8107 if (memorystatus_task_has_ios13extended_footprint_limit(task)) {
8108 limit = MAX(limit, memorystatus_ios13extended_footprint_limit_mb);
8109 }
8110
8111 return limit;
8112 }
8113
8114 static int32_t
8115 memorystatus_get_default_task_inactive_limit(proc_t p)
8116 {
8117 // Currently the default active and inactive limits are always the same.
8118 return memorystatus_get_default_task_active_limit(p);
8119 }
8120
8121 static int
8122 memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties_t *entry)
8123 {
8124 int32_t memlimit_active, memlimit_inactive;
8125 memlimit_options_t memlimit_options = MEMLIMIT_OPTIONS_NONE;
8126
8127 proc_t p = proc_find(pid);
8128 if (!p) {
8129 return ESRCH;
8130 }
8131
8132 /*
8133 * Check for valid attribute flags.
8134 */
8135 const uint32_t valid_attrs = MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
8136 if ((entry->memlimit_active_attr & (~valid_attrs)) != 0) {
8137 proc_rele(p);
8138 return EINVAL;
8139 }
8140 if ((entry->memlimit_inactive_attr & (~valid_attrs)) != 0) {
8141 proc_rele(p);
8142 return EINVAL;
8143 }
8144
8145 /*
8146 * Setup the active memlimit properties
8147 */
8148 memlimit_active = entry->memlimit_active;
8149 if (entry->memlimit_active_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL) {
8150 memlimit_options |= MEMLIMIT_ACTIVE_FATAL;
8151 }
8152
8153 /*
8154 * Setup the inactive memlimit properties
8155 */
8156 memlimit_inactive = entry->memlimit_inactive;
8157 if (entry->memlimit_inactive_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL) {
8158 memlimit_options |= MEMLIMIT_INACTIVE_FATAL;
8159 }
8160
8161 int error = memorystatus_set_memlimits(p, memlimit_active,
8162 memlimit_inactive, memlimit_options);
8163 proc_rele(p);
8164 return error;
8165 }
8166
8167 /*
8168 * Returns the jetsam priority (effective or requested) of the process
8169 * associated with this task.
8170 */
8171 int
8172 proc_get_memstat_priority(proc_t p, boolean_t effective_priority)
8173 {
8174 if (p) {
8175 if (effective_priority) {
8176 return p->p_memstat_effectivepriority;
8177 } else {
8178 return p->p_memstat_requestedpriority;
8179 }
8180 }
8181 return 0;
8182 }
8183
/*
 * Report whether a process carries the P_MEMSTAT_MANAGED bit (set by
 * RunningBoard for Apps whose jetsam properties it manages).
 *
 * pid        - target process id; 0 is rejected.
 * is_managed - out: 1 if managed, 0 otherwise.
 *
 * Returns 0 on success, EINVAL for pid 0, ESRCH if no such process.
 */
static int
memorystatus_get_process_is_managed(pid_t pid, int *is_managed)
{
	proc_t p = NULL;

	/* Validate inputs */
	if (pid == 0) {
		return EINVAL;
	}

	p = proc_find(pid);
	if (!p) {
		return ESRCH;
	}

	proc_list_lock();
	*is_managed = ((p->p_memstat_state & P_MEMSTAT_MANAGED) ? 1 : 0);
	/*
	 * NOTE(review): proc_rele() is invoked while proc_list_mlock is held;
	 * confirm proc_rele() never needs the proc list lock itself, otherwise
	 * the release should happen after proc_list_unlock().
	 */
	proc_rele(p);
	proc_list_unlock();

	return 0;
}
8206
8207 static int
8208 memorystatus_set_process_is_managed(pid_t pid, boolean_t set_managed)
8209 {
8210 proc_t p = NULL;
8211
8212 /* Validate inputs */
8213 if (pid == 0) {
8214 return EINVAL;
8215 }
8216
8217 p = proc_find(pid);
8218 if (!p) {
8219 return ESRCH;
8220 }
8221
8222 proc_list_lock();
8223
8224 if (set_managed == TRUE) {
8225 p->p_memstat_state |= P_MEMSTAT_MANAGED;
8226 /*
8227 * The P_MEMSTAT_MANAGED bit is set by Runningboard for Apps.
8228 * Also opt them in to being frozen (they might have started
8229 * off with the P_MEMSTAT_FREEZE_DISABLED bit set.)
8230 */
8231 p->p_memstat_state &= ~P_MEMSTAT_FREEZE_DISABLED;
8232 } else {
8233 p->p_memstat_state &= ~P_MEMSTAT_MANAGED;
8234 }
8235
8236 if (_memstat_proc_is_tracked(p)) {
8237 memorystatus_log_error("memorystatus: process %s [%d] opted in to both "
8238 "Management and ActivityTracking\n", proc_best_name(p),
8239 proc_pid(p));
8240 }
8241
8242 proc_list_unlock();
8243
8244 proc_rele(p);
8245
8246 return 0;
8247 }
8248
/*
 * memorystatus_control(2) syscall entry point.
 *
 * Dispatches all userspace memorystatus commands: priority and memory-limit
 * management, jetsam snapshots, coalition swappability, freezer control,
 * lenient-mode toggles, and various DEVELOPMENT/DEBUG test hooks.
 *
 * Authorization: the caller must be root or hold MEMORYSTATUS_ENTITLEMENT,
 * except for the freeze-preference/frozen-status queries (and, on
 * DEVELOPMENT/DEBUG kernels, a process raising its own task limit).
 *
 * Returns 0 on success or an errno; some commands also write a result
 * through *ret.
 */
int
memorystatus_control(struct proc *p, struct memorystatus_control_args *args, int *ret)
{
	int error = EINVAL;
	boolean_t skip_auth_check = FALSE;
	os_reason_t jetsam_reason = OS_REASON_NULL;

#if !CONFIG_JETSAM
    #pragma unused(ret)
    #pragma unused(jetsam_reason)
#endif

	/* We don't need entitlements if we're setting / querying the freeze preference or frozen status for a process. */
	if (args->command == MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE ||
	    args->command == MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE ||
	    args->command == MEMORYSTATUS_CMD_GET_PROCESS_IS_FROZEN) {
		skip_auth_check = TRUE;
	}

	/*
	 * On development kernel, we don't need entitlements if we're adjusting the limit.
	 * This is required for limit adjustment by dyld when roots are detected, see rdar://99669958
	 */
#if DEVELOPMENT || DEBUG
	if (args->command == MEMORYSTATUS_CMD_INCREASE_JETSAM_TASK_LIMIT && proc_getpid(p) == args->pid) {
		skip_auth_check = TRUE;
	}
#endif /* DEVELOPMENT || DEBUG */

	/* Need to be root or have entitlement. */
	if (!kauth_cred_issuser(kauth_cred_get()) && !IOCurrentTaskHasEntitlement(MEMORYSTATUS_ENTITLEMENT) && !skip_auth_check) {
		error = EPERM;
		goto out;
	}

	/*
	 * Sanity check on the buffer size.
	 * Do not enforce it for snapshots, which may legitimately be larger.
	 */
	if (args->command != MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT &&
	    args->command != MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_NAMES &&
	    args->command != MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_INFO &&
	    args->command != MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_MEMINFO) {
		if (args->buffersize > MEMORYSTATUS_BUFFERSIZE_MAX) {
			error = EINVAL;
			goto out;
		}
	}

#if CONFIG_MACF
	/* Give the MAC framework a chance to veto the command. */
	error = mac_proc_check_memorystatus_control(p, args->command, args->pid);
	if (error) {
		goto out;
	}
#endif /* CONFIG_MACF */

	switch (args->command) {
	case MEMORYSTATUS_CMD_GET_PRIORITY_LIST:
		error = memorystatus_cmd_get_priority_list(args->pid, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES:
		error = memorystatus_cmd_set_priority_properties(args->pid, args->flags, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_SET_MEMLIMIT_PROPERTIES:
		error = memorystatus_cmd_set_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GET_MEMLIMIT_PROPERTIES:
		error = memorystatus_cmd_get_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GET_MEMLIMIT_EXCESS:
		error = memorystatus_cmd_get_memlimit_excess_np(args->pid, args->flags, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GRP_SET_PROPERTIES:
		error = memorystatus_cmd_grp_set_properties((int32_t)args->flags, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT:
		error = memorystatus_cmd_get_jetsam_snapshot((int32_t)args->flags, args->buffer, args->buffersize, ret);
		break;
#if JETSAM_ZPRINT_SNAPSHOT
	case MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_NAMES:
		error = memorystatus_cmd_get_data_buffer(args->buffer, args->buffersize, ret,
		    jzs_zone_cnt * sizeof(mach_zone_name_t), jzs_names);
		break;
	case MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_INFO:
		error = memorystatus_cmd_get_data_buffer(args->buffer, args->buffersize, ret,
		    jzs_zone_cnt * sizeof(mach_zone_info_t), jzs_info);
		break;
	case MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_MEMINFO:
		error = memorystatus_cmd_get_data_buffer(args->buffer, args->buffersize, ret,
		    jzs_meminfo_cnt * sizeof(mach_memory_info_t), jzs_meminfo);
		break;
#endif
#if DEVELOPMENT || DEBUG
	case MEMORYSTATUS_CMD_SET_TESTING_PID:
		error = memorystatus_cmd_set_testing_pid((int32_t) args->flags);
		break;
#endif
	case MEMORYSTATUS_CMD_GET_PRESSURE_STATUS:
		error = memorystatus_cmd_get_pressure_status(ret);
		break;
#if CONFIG_JETSAM
	case MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK:
		/*
		 * This call does not distinguish between active and inactive limits.
		 * Default behavior in 2-level HWM world is to set both.
		 * Non-fatal limit is also assumed for both.
		 */
		error = memorystatus_cmd_set_jetsam_memory_limit(args->pid, (int32_t)args->flags, ret, FALSE);
		break;
	case MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT:
		/*
		 * This call does not distinguish between active and inactive limits.
		 * Default behavior in 2-level HWM world is to set both.
		 * Fatal limit is also assumed for both.
		 */
		error = memorystatus_cmd_set_jetsam_memory_limit(args->pid, (int32_t)args->flags, ret, TRUE);
		break;
	case MEMORYSTATUS_CMD_MARK_PROCESS_COALITION_SWAPPABLE:
		error = memorystatus_cmd_mark_process_coalition_swappable(args->pid, ret);
		break;

	case MEMORYSTATUS_CMD_GET_PROCESS_COALITION_IS_SWAPPABLE:
		error = memorystatus_cmd_get_process_coalition_is_swappable(args->pid, ret);
		break;

	case MEMORYSTATUS_CMD_CONVERT_MEMLIMIT_MB:
		error = memorystatus_cmd_convert_memlimit_mb(args->pid, (int32_t) args->flags, ret);
		break;
#endif /* CONFIG_JETSAM */
	/* Test commands */
#if DEVELOPMENT || DEBUG
	case MEMORYSTATUS_CMD_TEST_JETSAM:
		jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_GENERIC);
		if (jetsam_reason == OS_REASON_NULL) {
			memorystatus_log_error("memorystatus_control: failed to allocate jetsam reason\n");
		}

		error = memorystatus_kill_process_sync(args->pid, kMemorystatusKilled, jetsam_reason) ? 0 : EINVAL;
		break;
	case MEMORYSTATUS_CMD_TEST_JETSAM_SORT:
		error = memorystatus_cmd_test_jetsam_sort(args->pid, (int32_t)args->flags, args->buffer, args->buffersize);
		break;
#else /* DEVELOPMENT || DEBUG */
    #pragma unused(jetsam_reason)
#endif /* DEVELOPMENT || DEBUG */
	case MEMORYSTATUS_CMD_AGGRESSIVE_JETSAM_LENIENT_MODE_ENABLE:
		/* Enabling is one-way per boot until explicitly disabled below. */
		if (memorystatus_aggressive_jetsam_lenient_allowed == FALSE) {
#if DEVELOPMENT || DEBUG
			memorystatus_log_info("Enabling Lenient Mode\n");
#endif /* DEVELOPMENT || DEBUG */

			memorystatus_aggressive_jetsam_lenient_allowed = TRUE;
			memorystatus_aggressive_jetsam_lenient = TRUE;
			error = 0;
		}
		break;
	case MEMORYSTATUS_CMD_AGGRESSIVE_JETSAM_LENIENT_MODE_DISABLE:
#if DEVELOPMENT || DEBUG
		memorystatus_log_info("Disabling Lenient mode\n");
#endif /* DEVELOPMENT || DEBUG */
		memorystatus_aggressive_jetsam_lenient_allowed = FALSE;
		memorystatus_aggressive_jetsam_lenient = FALSE;
		error = 0;
		break;
	case MEMORYSTATUS_CMD_GET_AGGRESSIVE_JETSAM_LENIENT_MODE:
		*ret = (memorystatus_aggressive_jetsam_lenient ? 1 : 0);
		error = 0;
		break;
	case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE:
	case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE:
		error = memorystatus_low_mem_privileged_listener(args->command);
		break;

	case MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE:
	case MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE:
		error = memorystatus_update_inactive_jetsam_priority_band(args->pid, args->command, JETSAM_PRIORITY_ELEVATED_INACTIVE, args->flags ? TRUE : FALSE);
		break;
	case MEMORYSTATUS_CMD_SET_PROCESS_IS_MANAGED:
		error = memorystatus_set_process_is_managed(args->pid, args->flags);
		break;

	case MEMORYSTATUS_CMD_GET_PROCESS_IS_MANAGED:
		error = memorystatus_get_process_is_managed(args->pid, ret);
		break;

#if CONFIG_FREEZE
	case MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE:
		error = memorystatus_set_process_is_freezable(args->pid, args->flags ? TRUE : FALSE);
		break;

	case MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE:
		error = memorystatus_get_process_is_freezable(args->pid, ret);
		break;
	case MEMORYSTATUS_CMD_GET_PROCESS_IS_FROZEN:
		error = memorystatus_get_process_is_frozen(args->pid, ret);
		break;

	case MEMORYSTATUS_CMD_FREEZER_CONTROL:
		error = memorystatus_freezer_control(args->flags, args->buffer, args->buffersize, ret);
		break;
#endif /* CONFIG_FREEZE */

#if DEVELOPMENT || DEBUG
	case MEMORYSTATUS_CMD_INCREASE_JETSAM_TASK_LIMIT:
		error = memorystatus_cmd_increase_jetsam_task_limit(args->pid, args->flags);
		break;
	case MEMORYSTATUS_CMD_SET_DIAG_LIMIT:
		error = memorystatus_cmd_set_diag_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GET_DIAG_LIMIT:
		error = memorystatus_cmd_get_diag_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
		break;
#endif /* DEVELOPMENT || DEBUG */
	default:
		error = EINVAL;
		break;
	}

out:
	return error;
}
8470
8471 /* Coalition support */
8472
8473 /* sorting info for a particular priority bucket */
typedef struct memstat_sort_info {
	coalition_t msi_coal;     /* the jetsam coalition */
	uint64_t msi_page_count;  /* total coalition page count */
	pid_t msi_pid;            /* coalition leader pid */
	int msi_ntasks;           /* number of tasks in the coalition */
} memstat_sort_info_t;
8480
8481 /*
8482 * qsort from smallest page count to largest page count
8483 *
8484 * return < 0 for a < b
8485 * 0 for a == b
8486 * > 0 for a > b
8487 */
8488 static int
8489 memstat_asc_cmp(const void *a, const void *b)
8490 {
8491 const memstat_sort_info_t *msA = (const memstat_sort_info_t *)a;
8492 const memstat_sort_info_t *msB = (const memstat_sort_info_t *)b;
8493
8494 return (int)((uint64_t)msA->msi_page_count - (uint64_t)msB->msi_page_count);
8495 }
8496
8497 /*
8498 * Return the number of pids rearranged during this sort.
8499 */
/*
 * Rearrange the processes of a priority band so that jetsam coalitions are
 * killed in a sensible order: within each coalition, undefined members
 * jetsam first, then extensions, then XPC services, and the leader last.
 * Coalitions themselves are ordered so that the largest (by page count)
 * jetsams first.
 *
 * bucket_index    - jetsam priority band to sort.
 * coal_sort_order - ordering passed through to coalition_get_pid_list().
 *
 * Returns the number of pids rearranged during this sort.
 * Caller must hold the proc list lock (operates via the *_locked iterators).
 */
static int
memorystatus_sort_by_largest_coalition_locked(unsigned int bucket_index, int coal_sort_order)
{
#define MAX_SORT_PIDS           80
#define MAX_COAL_LEADERS        10

	unsigned int b = bucket_index;
	int nleaders = 0;
	int ntasks = 0;
	proc_t p = NULL;
	coalition_t coal = COALITION_NULL;
	int pids_moved = 0;
	int total_pids_moved = 0;
	int i;

	/*
	 * The system is typically under memory pressure when in this
	 * path, hence, we want to avoid dynamic memory allocation.
	 */
	memstat_sort_info_t leaders[MAX_COAL_LEADERS];
	pid_t pid_list[MAX_SORT_PIDS];

	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
		return 0;
	}

	/*
	 * Clear the array that holds coalition leader information
	 */
	for (i = 0; i < MAX_COAL_LEADERS; i++) {
		leaders[i].msi_coal = COALITION_NULL;
		leaders[i].msi_page_count = 0;          /* will hold total coalition page count */
		leaders[i].msi_pid = 0;                 /* will hold coalition leader pid */
		leaders[i].msi_ntasks = 0;              /* will hold the number of tasks in a coalition */
	}

	/* Pass 1: walk the band and collect each jetsam-coalition leader. */
	p = memorystatus_get_first_proc_locked(&b, FALSE);
	while (p) {
		coal = task_get_coalition(proc_task(p), COALITION_TYPE_JETSAM);
		if (coalition_is_leader(proc_task(p), coal)) {
			if (nleaders < MAX_COAL_LEADERS) {
				int coal_ntasks = 0;
				uint64_t coal_page_count = coalition_get_page_count(coal, &coal_ntasks);
				leaders[nleaders].msi_coal = coal;
				leaders[nleaders].msi_page_count = coal_page_count;
				leaders[nleaders].msi_pid = proc_getpid(p);          /* the coalition leader */
				leaders[nleaders].msi_ntasks = coal_ntasks;
				nleaders++;
			} else {
				/*
				 * We've hit MAX_COAL_LEADERS meaning we can handle no more coalitions.
				 * Abandoned coalitions will linger at the tail of the priority band
				 * when this sort session ends.
				 * TODO: should this be an assert?
				 */
				memorystatus_log_error(
					"%s: WARNING: more than %d leaders in priority band [%d]\n",
					__FUNCTION__, MAX_COAL_LEADERS, bucket_index);
				break;
			}
		}
		p = memorystatus_get_next_proc_locked(&b, p, FALSE);
	}

	if (nleaders == 0) {
		/* Nothing to sort */
		return 0;
	}

	/*
	 * Sort the coalition leader array, from smallest coalition page count
	 * to largest coalition page count. When inserted in the priority bucket,
	 * smallest coalition is handled first, resulting in the last to be jetsammed.
	 */
	if (nleaders > 1) {
		qsort(leaders, nleaders, sizeof(memstat_sort_info_t), memstat_asc_cmp);
	}

	/*
	 * During coalition sorting, processes in a priority band are rearranged
	 * by being re-inserted at the head of the queue. So, when handling a
	 * list, the first process that gets moved to the head of the queue,
	 * ultimately gets pushed toward the queue tail, and hence, jetsams last.
	 *
	 * So, for example, the coalition leader is expected to jetsam last,
	 * after its coalition members. Therefore, the coalition leader is
	 * inserted at the head of the queue first.
	 *
	 * After processing a coalition, the jetsam order is as follows:
	 * undefs(jetsam first), extensions, xpc services, leader(jetsam last)
	 */

	/*
	 * Pass 2: coalition members are rearranged in the priority bucket here,
	 * based on their coalition role.
	 */
	total_pids_moved = 0;
	for (i = 0; i < nleaders; i++) {
		/* a bit of bookkeeping */
		pids_moved = 0;

		/* Coalition leaders are jetsammed last, so move into place first */
		pid_list[0] = leaders[i].msi_pid;
		pids_moved += memorystatus_move_list_locked(bucket_index, pid_list, 1);

		/* xpc services should jetsam after extensions */
		ntasks = coalition_get_pid_list(leaders[i].msi_coal, COALITION_ROLEMASK_XPC,
		    coal_sort_order, pid_list, MAX_SORT_PIDS);

		if (ntasks > 0) {
			pids_moved += memorystatus_move_list_locked(bucket_index, pid_list,
			    (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
		}

		/* extensions should jetsam after unmarked processes */
		ntasks = coalition_get_pid_list(leaders[i].msi_coal, COALITION_ROLEMASK_EXT,
		    coal_sort_order, pid_list, MAX_SORT_PIDS);

		if (ntasks > 0) {
			pids_moved += memorystatus_move_list_locked(bucket_index, pid_list,
			    (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
		}

		/* undefined coalition members should be the first to jetsam */
		ntasks = coalition_get_pid_list(leaders[i].msi_coal, COALITION_ROLEMASK_UNDEF,
		    coal_sort_order, pid_list, MAX_SORT_PIDS);

		if (ntasks > 0) {
			pids_moved += memorystatus_move_list_locked(bucket_index, pid_list,
			    (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
		}

		total_pids_moved += pids_moved;
	} /* end for */

	return total_pids_moved;
}
8637
8638
8639 /*
8640 * Traverse a list of pids, searching for each within the priority band provided.
8641 * If pid is found, move it to the front of the priority band.
8642 * Never searches outside the priority band provided.
8643 *
8644 * Input:
8645 * bucket_index - jetsam priority band.
8646 * pid_list - pointer to a list of pids.
8647 * list_sz - number of pids in the list.
8648 *
8649 * Pid list ordering is important in that,
8650 * pid_list[n] is expected to jetsam ahead of pid_list[n+1].
8651 * The sort_order is set by the coalition default.
8652 *
8653 * Return:
8654 * the number of pids found and hence moved within the priority band.
8655 */
8656 static int
8657 memorystatus_move_list_locked(unsigned int bucket_index, pid_t *pid_list, int list_sz)
8658 {
8659 memstat_bucket_t *current_bucket;
8660 int i;
8661 int found_pids = 0;
8662
8663 if ((pid_list == NULL) || (list_sz <= 0)) {
8664 return 0;
8665 }
8666
8667 if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
8668 return 0;
8669 }
8670
8671 current_bucket = &memstat_bucket[bucket_index];
8672 for (i = 0; i < list_sz; i++) {
8673 unsigned int b = bucket_index;
8674 proc_t p = NULL;
8675 proc_t aProc = NULL;
8676 pid_t aPid;
8677 int list_index;
8678
8679 list_index = ((list_sz - 1) - i);
8680 aPid = pid_list[list_index];
8681
8682 /* never search beyond bucket_index provided */
8683 p = memorystatus_get_first_proc_locked(&b, FALSE);
8684 while (p) {
8685 if (proc_getpid(p) == aPid) {
8686 aProc = p;
8687 break;
8688 }
8689 p = memorystatus_get_next_proc_locked(&b, p, FALSE);
8690 }
8691
8692 if (aProc == NULL) {
8693 /* pid not found in this band, just skip it */
8694 continue;
8695 } else {
8696 TAILQ_REMOVE(¤t_bucket->list, aProc, p_memstat_list);
8697 TAILQ_INSERT_HEAD(¤t_bucket->list, aProc, p_memstat_list);
8698 found_pids++;
8699 }
8700 }
8701 return found_pids;
8702 }
8703
8704 int
8705 memorystatus_get_proccnt_upto_priority(int32_t max_bucket_index)
8706 {
8707 int32_t i = JETSAM_PRIORITY_IDLE;
8708 int count = 0;
8709
8710 if (max_bucket_index >= MEMSTAT_BUCKET_COUNT) {
8711 return -1;
8712 }
8713
8714 while (i <= max_bucket_index) {
8715 count += memstat_bucket[i++].count;
8716 }
8717
8718 return count;
8719 }
8720
8721 int
8722 memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap)
8723 {
8724 #if !CONFIG_JETSAM
8725 if (!p || (!isApp(p)) || (p->p_memstat_state & (P_MEMSTAT_INTERNAL | P_MEMSTAT_MANAGED))) {
8726 /*
8727 * Ineligible processes OR system processes e.g. launchd.
8728 *
8729 * We also skip processes that have the P_MEMSTAT_MANAGED bit set, i.e.
8730 * they're managed by assertiond. These are iOS apps that have been ported
8731 * to macOS. assertiond might be in the process of modifying the app's
8732 * priority / memory limit - so it might have the proc_list lock, and then try
8733 * to take the task lock. Meanwhile we've entered this function with the task lock
8734 * held, and we need the proc_list lock below. So we'll deadlock with assertiond.
8735 *
8736 * It should be fine to read the P_MEMSTAT_MANAGED bit without the proc_list
8737 * lock here, since assertiond only sets this bit on process launch.
8738 */
8739 return -1;
8740 }
8741
8742 /*
8743 * For macOS only:
8744 * We would like to use memorystatus_set_priority() here to move the processes
8745 * within the bands. Unfortunately memorystatus_set_priority() calls
8746 * memorystatus_update_priority_locked() which uses any band transitions
8747 * as an indication to modify ledgers. For that it needs the task lock
8748 * and since we came into this function with the task lock held, we'll deadlock.
8749 *
8750 * Unfortunately we can't completely disable ledger updates because we still
8751 * need the ledger updates for a subset of processes i.e. daemons.
8752 * When all processes on all platforms support memory limits, we can simply call
8753 * memorystatus_set_priority().
8754 *
8755 * It also has some logic to deal with 'aging' which, currently, is only applicable
8756 * on CONFIG_JETSAM configs. So, till every platform has CONFIG_JETSAM we'll need
8757 * to do this explicit band transition.
8758 */
8759
8760 memstat_bucket_t *current_bucket, *new_bucket;
8761 int32_t priority = 0;
8762
8763 proc_list_lock();
8764
8765 if (proc_list_exited(p) ||
8766 (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP))) {
8767 /*
8768 * If the process is on its way out OR
8769 * jetsam has alread tried and failed to kill this process,
8770 * let's skip the whole jetsam band transition.
8771 */
8772 proc_list_unlock();
8773 return 0;
8774 }
8775
8776 if (is_appnap) {
8777 current_bucket = &memstat_bucket[p->p_memstat_effectivepriority];
8778 new_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
8779 priority = JETSAM_PRIORITY_IDLE;
8780 } else {
8781 if (p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) {
8782 /*
8783 * It is possible that someone pulled this process
8784 * out of the IDLE band without updating its app-nap
8785 * parameters.
8786 */
8787 proc_list_unlock();
8788 return 0;
8789 }
8790
8791 current_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
8792 new_bucket = &memstat_bucket[p->p_memstat_requestedpriority];
8793 priority = p->p_memstat_requestedpriority;
8794 }
8795
8796 TAILQ_REMOVE(¤t_bucket->list, p, p_memstat_list);
8797 current_bucket->count--;
8798 if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
8799 current_bucket->relaunch_high_count--;
8800 }
8801 TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list);
8802 new_bucket->count++;
8803 if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
8804 new_bucket->relaunch_high_count++;
8805 }
8806 /*
8807 * Record idle start or idle delta.
8808 */
8809 if (p->p_memstat_effectivepriority == priority) {
8810 /*
8811 * This process is not transitioning between
8812 * jetsam priority buckets. Do nothing.
8813 */
8814 } else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
8815 uint64_t now;
8816 /*
8817 * Transitioning out of the idle priority bucket.
8818 * Record idle delta.
8819 */
8820 assert(p->p_memstat_idle_start != 0);
8821 now = mach_absolute_time();
8822 if (now > p->p_memstat_idle_start) {
8823 p->p_memstat_idle_delta = now - p->p_memstat_idle_start;
8824 }
8825 } else if (priority == JETSAM_PRIORITY_IDLE) {
8826 /*
8827 * Transitioning into the idle priority bucket.
8828 * Record idle start.
8829 */
8830 p->p_memstat_idle_start = mach_absolute_time();
8831 }
8832
8833 KDBG(MEMSTAT_CODE(BSD_MEMSTAT_CHANGE_PRIORITY), proc_getpid(p), priority, p->p_memstat_effectivepriority);
8834
8835 p->p_memstat_effectivepriority = priority;
8836
8837 proc_list_unlock();
8838
8839 return 0;
8840
8841 #else /* !CONFIG_JETSAM */
8842 #pragma unused(p)
8843 #pragma unused(is_appnap)
8844 return -1;
8845 #endif /* !CONFIG_JETSAM */
8846 }
8847
8848 uint64_t
8849 memorystatus_available_memory_internal(struct proc *p)
8850 {
8851 #ifdef XNU_TARGET_OS_OSX
8852 if (p->p_memstat_memlimit <= 0) {
8853 return 0;
8854 }
8855 #endif /* XNU_TARGET_OS_OSX */
8856 const uint64_t footprint_in_bytes = get_task_phys_footprint(proc_task(p));
8857 int32_t memlimit_mb;
8858 int64_t memlimit_bytes;
8859 int64_t rc;
8860
8861 if (isApp(p) == FALSE) {
8862 return 0;
8863 }
8864
8865 if (p->p_memstat_memlimit > 0) {
8866 memlimit_mb = p->p_memstat_memlimit;
8867 } else if (task_convert_phys_footprint_limit(-1, &memlimit_mb) != KERN_SUCCESS) {
8868 return 0;
8869 }
8870
8871 if (memlimit_mb <= 0) {
8872 memlimit_bytes = INT_MAX & ~((1 << 20) - 1);
8873 } else {
8874 memlimit_bytes = ((int64_t) memlimit_mb) << 20;
8875 }
8876
8877 rc = memlimit_bytes - footprint_in_bytes;
8878
8879 return (rc >= 0) ? rc : 0;
8880 }
8881
8882 int
8883 memorystatus_available_memory(struct proc *p, __unused struct memorystatus_available_memory_args *args, uint64_t *ret)
8884 {
8885 *ret = memorystatus_available_memory_internal(p);
8886
8887 return 0;
8888 }
8889
/*
 * Log the current memorystatus system-health verdict together with the
 * individual pressure indicators that produced it.
 *
 * To avoid flooding the kernel log, output is emitted only when the
 * indicator set differs from the one seen on the previous call; the last
 * observed state is cached in a function-local static.
 *
 * NOTE(review): prev_status is unsynchronized static state; this assumes
 * calls are serialized (e.g. from a single health-check/jetsam thread) --
 * confirm against callers.
 */
void
memorystatus_log_system_health(const memorystatus_system_health_t *status)
{
	/* Snapshot of the status logged on the previous invocation; zeroed at boot. */
	static struct memorystatus_system_health prev_status = {0};

	bool healthy = memorystatus_is_system_healthy(status);

	/*
	 * Avoid spamming logs by only logging when the system status has changed.
	 * Field-by-field comparison; the jetsam-specific indicators exist only
	 * when CONFIG_JETSAM is compiled in.
	 */
	if (prev_status.msh_zone_map_is_exhausted == status->msh_zone_map_is_exhausted
#if CONFIG_JETSAM
	    &&
	    prev_status.msh_available_pages_below_idle == status->msh_available_pages_below_idle &&
	    prev_status.msh_available_pages_below_soft == status->msh_available_pages_below_soft &&
	    prev_status.msh_available_pages_below_critical == status->msh_available_pages_below_critical &&
	    prev_status.msh_compressor_needs_to_swap == status->msh_compressor_needs_to_swap &&
	    prev_status.msh_compressor_is_low_on_space == status->msh_compressor_is_low_on_space &&
	    prev_status.msh_compressor_is_thrashing == status->msh_compressor_is_thrashing &&
	    prev_status.msh_compressed_pages_nearing_limit == status->msh_compressed_pages_nearing_limit &&
	    prev_status.msh_filecache_is_thrashing == status->msh_filecache_is_thrashing &&
	    prev_status.msh_phantom_cache_pressure == status->msh_phantom_cache_pressure &&
	    prev_status.msh_swappable_compressor_segments_over_limit == status->msh_swappable_compressor_segments_over_limit &&
	    prev_status.msh_swapin_queue_over_limit == status->msh_swapin_queue_over_limit &&
	    prev_status.msh_swap_low_on_space == status->msh_swap_low_on_space &&
	    prev_status.msh_swap_out_of_space == status->msh_swap_out_of_space &&
	    prev_status.msh_pageout_starved == status->msh_pageout_starved
#endif /* CONFIG_JETSAM */
	    ) {
		/* No change */
		return;
	}

#if CONFIG_JETSAM
	if (healthy) {
		/*
		 * System is overall healthy, but still advise when page counts
		 * cross the soft-limit or idle-exit thresholds.
		 */
		if (status->msh_available_pages_below_soft) {
			memorystatus_log(
				"memorystatus: System will begin enforcing "
				"soft memory limits. "
				"memorystatus_available_pages: %llu compressor_size: %u\n",
				(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
		} else if (status->msh_available_pages_below_idle) {
			memorystatus_log(
				"memorystatus: System will begin enacting "
				"idle-exits. "
				"memorystatus_available_pages: %llu compressor_size: %u\n",
				(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
		} else {
			memorystatus_log(
				"memorystatus: System is healthy. "
				"memorystatus_available_pages: %llu compressor_size:%u\n",
				(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
		}
	} else {
		/* Unhealthy: dump every indicator as a single JSON-style log line. */
		memorystatus_log("memorystatus: System is unhealthy! memorystatus_available_pages: %llu compressor_size:%u\n",
		    (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
		memorystatus_log(
			"memorystatus: {"
			"\"available_pages_below_critical\": %d, "
			"\"available_pages_below_idle\": %d, "
			"\"available_pages_below_soft\": %d, "
			"\"compressor_needs_to_swap\": %d, "
			"\"compressor_is_low_on_space\": %d, "
			"\"compressor_is_thrashing\": %d, "
			"\"compressed_pages_nearing_limit\": %d, "
			"\"filecache_is_thrashing\": %d, "
			"\"zone_map_is_exhausted\": %d, "
			"\"phantom_cache_pressure\": %d, "
			"\"swappable_compressor_segments_over_limit\": %d, "
			"\"swapin_queue_over_limit\": %d, "
			"\"swap_low\": %d, "
			"\"swap_full\": %d"
			"}\n",
			status->msh_available_pages_below_critical,
			status->msh_available_pages_below_idle,
			status->msh_available_pages_below_soft,
			status->msh_compressor_needs_to_swap,
			status->msh_compressor_is_low_on_space,
			status->msh_compressor_is_thrashing,
			status->msh_compressed_pages_nearing_limit,
			status->msh_filecache_is_thrashing,
			status->msh_zone_map_is_exhausted,
			status->msh_phantom_cache_pressure,
			status->msh_swappable_compressor_segments_over_limit,
			status->msh_swapin_queue_over_limit,
			status->msh_swap_low_on_space,
			status->msh_swap_out_of_space);
	}
#else /* CONFIG_JETSAM */
	/* Without jetsam, only the zone-map indicator contributes to health. */
	memorystatus_log("memorystatus: System is %s. memorystatus_available_pages: %llu compressor_size:%u\n",
	    healthy ? "healthy" : "unhealthy",
	    (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
	if (!healthy) {
		memorystatus_log("memorystatus: zone_map_is_exhausted=%d\n",
		    status->msh_zone_map_is_exhausted);
	}
#endif /* CONFIG_JETSAM */
	/* Remember this state so the next call only logs on a change. */
	prev_status = *status;
}
8990
8991 uint32_t
8992 memorystatus_pick_kill_cause(const memorystatus_system_health_t *status)
8993 {
8994 assert(!memorystatus_is_system_healthy(status));
8995 #if CONFIG_JETSAM
8996 if (status->msh_compressor_is_thrashing) {
8997 return kMemorystatusKilledVMCompressorThrashing;
8998 } else if (status->msh_compressor_is_low_on_space) {
8999 return kMemorystatusKilledVMCompressorSpaceShortage;
9000 } else if (status->msh_filecache_is_thrashing) {
9001 return kMemorystatusKilledFCThrashing;
9002 } else if (status->msh_zone_map_is_exhausted) {
9003 return kMemorystatusKilledZoneMapExhaustion;
9004 } else if (status->msh_pageout_starved) {
9005 return kMemorystatusKilledVMPageoutStarvation;
9006 } else {
9007 assert(status->msh_available_pages_below_critical);
9008 return kMemorystatusKilledVMPageShortage;
9009 }
9010 #else /* CONFIG_JETSAM */
9011 assert(status->msh_zone_map_is_exhausted);
9012 (void) status;
9013 return kMemorystatusKilledZoneMapExhaustion;
9014 #endif /* CONFIG_JETSAM */
9015 }
9016
#if DEVELOPMENT || DEBUG
/*
 * Debug/development-only: grow a process's jetsam task limit by at least
 * byte_increase bytes (the increase is accumulated across calls in
 * p->p_memlimit_increase and applied on top of the base active/inactive
 * limits).
 *
 * Returns 0 on success, EINVAL for a zero pid or zero increase, ESRCH if
 * the pid cannot be found, or the error from memstat_set_memlimits_locked().
 */
static int
memorystatus_cmd_increase_jetsam_task_limit(pid_t pid, uint32_t byte_increase)
{
	int32_t memlimit_active, memlimit_inactive;

	/* Validate inputs */
	if ((pid == 0) || (byte_increase == 0)) {
		return EINVAL;
	}

	proc_t p = proc_find(pid);

	if (!p) {
		return ESRCH;
	}

	/*
	 * The previously-applied increase, in MB granularity, so it can be
	 * backed out of the current limits before the new increase is added.
	 * NOTE(review): p_memlimit_increase is read here before proc_list_lock
	 * is taken -- presumably concurrent updates are not expected; confirm.
	 */
	const uint32_t current_memlimit_increase = roundToNearestMB(p->p_memlimit_increase);
	/* round to page */
	const int32_t page_aligned_increase = (int32_t) MIN(round_page(p->p_memlimit_increase + byte_increase), INT32_MAX);

	proc_list_lock();

	/* Rebase the active limit: remove the old increase, add the new one.
	 * Non-positive limits (presumably "unlimited") are left untouched. */
	memlimit_active = p->p_memstat_memlimit_active;
	if (memlimit_active > 0) {
		memlimit_active -= current_memlimit_increase;
		memlimit_active += roundToNearestMB(page_aligned_increase);
	}

	/* Same rebasing for the inactive limit. */
	memlimit_inactive = p->p_memstat_memlimit_inactive;
	if (memlimit_inactive > 0) {
		memlimit_inactive -= current_memlimit_increase;
		memlimit_inactive += roundToNearestMB(page_aligned_increase);
	}

	/*
	 * Store the updated delta limit in the proc.
	 */
	p->p_memlimit_increase = page_aligned_increase;

	/* Preserve the existing fatal/non-fatal attributes of both limits. */
	memlimit_options_t memlimit_options = MEMLIMIT_OPTIONS_NONE;
	if (_memstat_proc_inactive_memlimit_is_fatal(p)) {
		memlimit_options |= MEMLIMIT_INACTIVE_FATAL;
	}
	if (_memstat_proc_active_memlimit_is_fatal(p)) {
		memlimit_options |= MEMLIMIT_ACTIVE_FATAL;
	}

	int error = memstat_set_memlimits_locked(p,
	    memlimit_active, memlimit_inactive,
	    memlimit_options);

	proc_list_unlock();
	proc_rele(p);   /* drop the reference taken by proc_find() */

	return error;
}
#endif /* DEVELOPMENT || DEBUG */
9075