1 /*
2 * Copyright (c) 2006-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 *
28 */
29
30 #include <kern/sched_prim.h>
31 #include <kern/kalloc.h>
32 #include <kern/assert.h>
33 #include <kern/debug.h>
34 #include <kern/locks.h>
35 #include <kern/task.h>
36 #include <kern/thread.h>
37 #include <kern/host.h>
38 #include <kern/policy_internal.h>
39 #include <kern/thread_group.h>
40
41 #include <corpses/task_corpse.h>
42 #include <libkern/libkern.h>
43 #include <mach/coalition.h>
44 #include <mach/mach_time.h>
45 #include <mach/task.h>
46 #include <mach/host_priv.h>
47 #include <mach/mach_host.h>
48 #include <os/log.h>
49 #include <pexpert/pexpert.h>
50 #include <sys/coalition.h>
51 #include <sys/kern_event.h>
52 #include <sys/proc.h>
53 #include <sys/proc_info.h>
54 #include <sys/reason.h>
55 #include <sys/signal.h>
56 #include <sys/signalvar.h>
57 #include <sys/sysctl.h>
58 #include <sys/sysproto.h>
59 #include <sys/wait.h>
60 #include <sys/tree.h>
61 #include <sys/priv.h>
62 #include <vm/pmap.h>
63 #include <vm/vm_pageout.h>
64 #include <vm/vm_protos.h>
65 #include <mach/machine/sdt.h>
66 #include <libkern/section_keywords.h>
67 #include <stdatomic.h>
68
69 #include <IOKit/IOBSD.h>
70
71 #if CONFIG_MACF
72 #include <security/mac_framework.h>
73 #endif
74
75 #if CONFIG_FREEZE
76 #include <vm/vm_map.h>
77 #endif /* CONFIG_FREEZE */
78
79 #include <sys/kern_memorystatus.h>
80 #include <sys/kern_memorystatus_freeze.h>
81 #include <sys/kern_memorystatus_notify.h>
82
83 extern uint32_t vm_compressor_pool_size(void);
84
85 static int block_corpses = 0; /* counter to block new corpses if jetsam purges them */
86
/*
 * Human-readable names for jetsam kill causes, used for logging clarity.
 * The table is indexed by kill-cause value; each entry's trailing comment
 * names the kMemorystatus* constant it is meant to correspond to, so the
 * order here must be kept in step with that enumeration.
 */
static const char *memorystatus_kill_cause_name[] = {
	"",                                             /* kMemorystatusInvalid							*/
	"jettisoned",                                   /* kMemorystatusKilled							*/
	"highwater",                                    /* kMemorystatusKilledHiwat						*/
	"vnode-limit",                                  /* kMemorystatusKilledVnodes					*/
	"vm-pageshortage",                              /* kMemorystatusKilledVMPageShortage			*/
	"proc-thrashing",                               /* kMemorystatusKilledProcThrashing				*/
	"fc-thrashing",                                 /* kMemorystatusKilledFCThrashing				*/
	"per-process-limit",                            /* kMemorystatusKilledPerProcessLimit			*/
	"disk-space-shortage",                          /* kMemorystatusKilledDiskSpaceShortage			*/
	"idle-exit",                                    /* kMemorystatusKilledIdleExit					*/
	"zone-map-exhaustion",                          /* kMemorystatusKilledZoneMapExhaustion			*/
	"vm-compressor-thrashing",                      /* kMemorystatusKilledVMCompressorThrashing		*/
	"vm-compressor-space-shortage",                 /* kMemorystatusKilledVMCompressorSpaceShortage	*/
	"low-swap",                                     /* kMemorystatusKilledLowSwap                   */
	"sustained-memory-pressure",                    /* kMemorystatusKilledSustainedPressure         */
};
105
106 static const char *
memorystatus_priority_band_name(int32_t priority)107 memorystatus_priority_band_name(int32_t priority)
108 {
109 switch (priority) {
110 case JETSAM_PRIORITY_FOREGROUND:
111 return "FOREGROUND";
112 case JETSAM_PRIORITY_AUDIO_AND_ACCESSORY:
113 return "AUDIO_AND_ACCESSORY";
114 case JETSAM_PRIORITY_CONDUCTOR:
115 return "CONDUCTOR";
116 case JETSAM_PRIORITY_DRIVER_APPLE:
117 return "DRIVER_APPLE";
118 case JETSAM_PRIORITY_HOME:
119 return "HOME";
120 case JETSAM_PRIORITY_EXECUTIVE:
121 return "EXECUTIVE";
122 case JETSAM_PRIORITY_IMPORTANT:
123 return "IMPORTANT";
124 case JETSAM_PRIORITY_CRITICAL:
125 return "CRITICAL";
126 }
127
128 return "?";
129 }
130
131 /* Does cause indicate vm or fc thrashing? */
132 static boolean_t
is_reason_thrashing(unsigned cause)133 is_reason_thrashing(unsigned cause)
134 {
135 switch (cause) {
136 case kMemorystatusKilledFCThrashing:
137 case kMemorystatusKilledVMCompressorThrashing:
138 case kMemorystatusKilledVMCompressorSpaceShortage:
139 return TRUE;
140 default:
141 return FALSE;
142 }
143 }
144
145 /* Is the zone map almost full? */
146 static boolean_t
is_reason_zone_map_exhaustion(unsigned cause)147 is_reason_zone_map_exhaustion(unsigned cause)
148 {
149 if (cause == kMemorystatusKilledZoneMapExhaustion) {
150 return TRUE;
151 }
152 return FALSE;
153 }
154
155 /*
156 * Returns the current zone map size and capacity to include in the jetsam snapshot.
157 * Defined in zalloc.c
158 */
159 extern void get_zone_map_size(uint64_t *current_size, uint64_t *capacity);
160
161 /*
162 * Returns the name of the largest zone and its size to include in the jetsam snapshot.
163 * Defined in zalloc.c
164 */
165 extern void get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size);
166
167 /*
168 * Active / Inactive limit support
169 * proc list must be locked
170 *
171 * The SET_*** macros are used to initialize a limit
172 * for the first time.
173 *
 * The CACHE_*** macros are used to cache the limit that will
175 * soon be in effect down in the ledgers.
176 */
177
/*
 * Initialize the active memory limit of proc `p`.
 *
 * Stores `limit` in p_memstat_memlimit_active and sets or clears
 * P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL in p_memstat_state according to
 * `is_fatal`. `p` is evaluated more than once, so it must not have
 * side effects.
 */
#define SET_ACTIVE_LIMITS_LOCKED(p, limit, is_fatal)                    \
MACRO_BEGIN                                                             \
(p)->p_memstat_memlimit_active = (limit);                               \
   if (is_fatal) {                                                      \
	   (p)->p_memstat_state |= P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL;     \
   } else {                                                             \
	   (p)->p_memstat_state &= ~P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL;    \
   }                                                                    \
MACRO_END
187
/*
 * Initialize the inactive memory limit of proc `p`.
 *
 * Stores `limit` in p_memstat_memlimit_inactive and sets or clears
 * P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL in p_memstat_state according to
 * `is_fatal`. `p` is evaluated more than once, so it must not have
 * side effects.
 */
#define SET_INACTIVE_LIMITS_LOCKED(p, limit, is_fatal)                  \
MACRO_BEGIN                                                             \
(p)->p_memstat_memlimit_inactive = (limit);                             \
   if (is_fatal) {                                                      \
	   (p)->p_memstat_state |= P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL;   \
   } else {                                                             \
	   (p)->p_memstat_state &= ~P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL;  \
   }                                                                    \
MACRO_END
197
/*
 * Make the active limit of proc `p` the cached (current) limit.
 *
 * Copies p_memstat_memlimit_active into p_memstat_memlimit and mirrors
 * the P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL bit into P_MEMSTAT_FATAL_MEMLIMIT.
 * `is_fatal` is an output: it is assigned TRUE or FALSE to match, so the
 * caller must pass an assignable boolean lvalue.
 */
#define CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal)                         \
MACRO_BEGIN                                                             \
(p)->p_memstat_memlimit = (p)->p_memstat_memlimit_active;               \
   if ((p)->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL) {        \
	   (p)->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;            \
	   is_fatal = TRUE;                                             \
   } else {                                                             \
	   (p)->p_memstat_state &= ~P_MEMSTAT_FATAL_MEMLIMIT;           \
	   is_fatal = FALSE;                                            \
   }                                                                    \
MACRO_END
209
/*
 * Make the inactive limit of proc `p` the cached (current) limit.
 *
 * Copies p_memstat_memlimit_inactive into p_memstat_memlimit and mirrors
 * the P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL bit into P_MEMSTAT_FATAL_MEMLIMIT.
 * `is_fatal` is an output: it is assigned TRUE or FALSE to match, so the
 * caller must pass an assignable boolean lvalue.
 */
#define CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal)                       \
MACRO_BEGIN                                                             \
(p)->p_memstat_memlimit = (p)->p_memstat_memlimit_inactive;             \
   if ((p)->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL) {      \
	   (p)->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;            \
	   is_fatal = TRUE;                                             \
   } else {                                                             \
	   (p)->p_memstat_state &= ~P_MEMSTAT_FATAL_MEMLIMIT;           \
	   is_fatal = FALSE;                                            \
   }                                                                    \
MACRO_END
221
222
223 /* General tunables */
224
225 unsigned long delta_percentage = 5;
226 unsigned long critical_threshold_percentage = 5;
227 // On embedded devices with more than 3GB of memory we lower the critical percentage.
228 uint64_t config_jetsam_large_memory_cutoff = 3UL * (1UL << 30);
229 unsigned long critical_threshold_percentage_larger_devices = 4;
230 unsigned long delta_percentage_larger_devices = 4;
231 unsigned long idle_offset_percentage = 5;
232 unsigned long pressure_threshold_percentage = 15;
233 unsigned long policy_more_free_offset_percentage = 5;
234 unsigned long sysproc_aging_aggr_threshold_percentage = 7;
235
236 /*
237 * default jetsam snapshot support
238 */
239 memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot;
240
241 #if CONFIG_FREEZE
242 memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot_freezer;
243 /*
244 * The size of the freezer snapshot is given by memorystatus_jetsam_snapshot_max / JETSAM_SNAPSHOT_FREEZER_MAX_FACTOR
245 * The freezer snapshot can be much smaller than the default snapshot
246 * because it only includes apps that have been killed and dasd consumes it every 30 minutes.
247 * Since the snapshots are always wired we don't want to overallocate too much.
248 */
249 #define JETSAM_SNAPSHOT_FREEZER_MAX_FACTOR 20
250 unsigned int memorystatus_jetsam_snapshot_freezer_max;
251 unsigned int memorystatus_jetsam_snapshot_freezer_size;
252 TUNABLE(bool, memorystatus_jetsam_use_freezer_snapshot, "kern.jetsam_user_freezer_snapshot", true);
253 #endif /* CONFIG_FREEZE */
254
255 unsigned int memorystatus_jetsam_snapshot_count = 0;
256 unsigned int memorystatus_jetsam_snapshot_max = 0;
257 unsigned int memorystatus_jetsam_snapshot_size = 0;
258 uint64_t memorystatus_jetsam_snapshot_last_timestamp = 0;
259 uint64_t memorystatus_jetsam_snapshot_timeout = 0;
260
261 #if DEVELOPMENT || DEBUG
262 /*
263 * On development and debug kernels, we allow one pid to take ownership
264 * of some memorystatus data structures for testing purposes (via memorystatus_control).
265 * If there's an owner, then only they may consume the jetsam snapshot & set freezer probabilities.
 * This is used when testing these interfaces to avoid racing with other
 * processes on the system that typically use them (namely OSAnalytics & dasd).
268 */
269 static pid_t memorystatus_testing_pid = 0;
270 SYSCTL_INT(_kern, OID_AUTO, memorystatus_testing_pid, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_testing_pid, 0, "");
271 #endif /* DEVELOPMENT || DEBUG */
272 static void memorystatus_init_jetsam_snapshot_header(memorystatus_jetsam_snapshot_t *snapshot);
273
274 /* General memorystatus stuff */
275
276 uint64_t memorystatus_sysprocs_idle_delay_time = 0;
277 uint64_t memorystatus_apps_idle_delay_time = 0;
278 /* Some devices give entitled apps a higher memory limit */
279 #if __arm64__
280 int32_t memorystatus_entitled_max_task_footprint_mb = 0;
281
282 #if DEVELOPMENT || DEBUG
283 SYSCTL_INT(_kern, OID_AUTO, entitled_max_task_pmem, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_entitled_max_task_footprint_mb, 0, "");
284 #endif /* DEVELOPMENT || DEBUG */
285 #endif /* __arm64__ */
286
287 static LCK_GRP_DECLARE(memorystatus_jetsam_fg_band_lock_grp,
288 "memorystatus_jetsam_fg_band");
289 LCK_MTX_DECLARE(memorystatus_jetsam_fg_band_lock,
290 &memorystatus_jetsam_fg_band_lock_grp);
291
292 /* Idle guard handling */
293
294 static int32_t memorystatus_scheduled_idle_demotions_sysprocs = 0;
295 static int32_t memorystatus_scheduled_idle_demotions_apps = 0;
296
297 static void memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2);
298 static void memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_state);
299 static void memorystatus_reschedule_idle_demotion_locked(void);
300 int memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap);
301 vm_pressure_level_t convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);
302 boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t);
303 void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear);
304 void memorystatus_send_low_swap_note(void);
305 boolean_t memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, unsigned int band, int aggr_count,
306 uint32_t *errors, uint64_t *memory_reclaimed);
307 uint64_t memorystatus_available_memory_internal(proc_t p);
308
309 unsigned int memorystatus_level = 0;
310 static int memorystatus_list_count = 0;
311 memstat_bucket_t memstat_bucket[MEMSTAT_BUCKET_COUNT];
312 static thread_call_t memorystatus_idle_demotion_call;
313 uint64_t memstat_idle_demotion_deadline = 0;
314 int system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
315 int applications_aging_band = JETSAM_PRIORITY_IDLE;
316
317 #define isProcessInAgingBands(p) ((isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) || (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band)))
318
319 #define kJetsamAgingPolicyNone (0)
320 #define kJetsamAgingPolicyLegacy (1)
321 #define kJetsamAgingPolicySysProcsReclaimedFirst (2)
322 #define kJetsamAgingPolicyAppsReclaimedFirst (3)
323 #define kJetsamAgingPolicyMax kJetsamAgingPolicyAppsReclaimedFirst
324
325 unsigned int jetsam_aging_policy = kJetsamAgingPolicySysProcsReclaimedFirst;
326
327 extern uint64_t vm_purgeable_purge_task_owned(task_t task);
328 boolean_t memorystatus_allowed_vm_map_fork(task_t);
329 #if DEVELOPMENT || DEBUG
330 void memorystatus_abort_vm_map_fork(task_t);
331 #endif
332
333 /*
334 * Idle delay timeout factors for daemons based on relaunch behavior. Only used in
335 * kJetsamAgingPolicySysProcsReclaimedFirst aging policy.
336 */
337 #define kJetsamSysProcsIdleDelayTimeLowRatio (5)
338 #define kJetsamSysProcsIdleDelayTimeMedRatio (2)
339 #define kJetsamSysProcsIdleDelayTimeHighRatio (1)
340 static_assert(kJetsamSysProcsIdleDelayTimeLowRatio <= DEFERRED_IDLE_EXIT_TIME_SECS, "sysproc idle delay time for low relaunch daemons would be 0");
341
342 /*
343 * For the kJetsamAgingPolicySysProcsReclaimedFirst aging policy, treat apps as well
344 * behaved daemons for aging purposes.
345 */
346 #define kJetsamAppsIdleDelayTimeRatio (kJetsamSysProcsIdleDelayTimeLowRatio)
347
348 static uint64_t
memorystatus_sysprocs_idle_time(proc_t p)349 memorystatus_sysprocs_idle_time(proc_t p)
350 {
351 /*
352 * The kJetsamAgingPolicySysProcsReclaimedFirst aging policy uses the relaunch behavior to
353 * determine the exact idle deferred time provided to the daemons. For all other aging
354 * policies, simply return the default aging idle time.
355 */
356 if (jetsam_aging_policy != kJetsamAgingPolicySysProcsReclaimedFirst) {
357 return memorystatus_sysprocs_idle_delay_time;
358 }
359
360 uint64_t idle_delay_time = 0;
361 /*
362 * For system processes, base the idle delay time on the
363 * jetsam relaunch behavior specified by launchd. The idea
364 * is to provide extra protection to the daemons which would
365 * relaunch immediately after jetsam.
366 */
367 switch (p->p_memstat_relaunch_flags) {
368 case P_MEMSTAT_RELAUNCH_UNKNOWN:
369 case P_MEMSTAT_RELAUNCH_LOW:
370 idle_delay_time = memorystatus_sysprocs_idle_delay_time / kJetsamSysProcsIdleDelayTimeLowRatio;
371 break;
372 case P_MEMSTAT_RELAUNCH_MED:
373 idle_delay_time = memorystatus_sysprocs_idle_delay_time / kJetsamSysProcsIdleDelayTimeMedRatio;
374 break;
375 case P_MEMSTAT_RELAUNCH_HIGH:
376 idle_delay_time = memorystatus_sysprocs_idle_delay_time / kJetsamSysProcsIdleDelayTimeHighRatio;
377 break;
378 default:
379 panic("Unknown relaunch flags on process!");
380 break;
381 }
382 return idle_delay_time;
383 }
384
385 static uint64_t
memorystatus_apps_idle_time(__unused proc_t p)386 memorystatus_apps_idle_time(__unused proc_t p)
387 {
388 /*
389 * For kJetsamAgingPolicySysProcsReclaimedFirst, the Apps are considered as low
390 * relaunch candidates. So only provide limited protection to them. In the other
391 * aging policies, return the default aging idle time.
392 */
393 if (jetsam_aging_policy != kJetsamAgingPolicySysProcsReclaimedFirst) {
394 return memorystatus_apps_idle_delay_time;
395 }
396
397 return memorystatus_apps_idle_delay_time / kJetsamAppsIdleDelayTimeRatio;
398 }
399
400
401 #if 0
402
403 /* Keeping around for future use if we need a utility that can do this OR an app that needs a dynamic adjustment. */
404
405 static int
406 sysctl_set_jetsam_aging_policy SYSCTL_HANDLER_ARGS
407 {
408 #pragma unused(oidp, arg1, arg2)
409
410 int error = 0, val = 0;
411 memstat_bucket_t *old_bucket = 0;
412 int old_system_procs_aging_band = 0, new_system_procs_aging_band = 0;
413 int old_applications_aging_band = 0, new_applications_aging_band = 0;
414 proc_t p = NULL, next_proc = NULL;
415
416
417 error = sysctl_io_number(req, jetsam_aging_policy, sizeof(int), &val, NULL);
418 if (error || !req->newptr) {
419 return error;
420 }
421
422 if ((val < 0) || (val > kJetsamAgingPolicyMax)) {
423 printf("jetsam: ordering policy sysctl has invalid value - %d\n", val);
424 return EINVAL;
425 }
426
427 /*
428 * We need to synchronize with any potential adding/removal from aging bands
429 * that might be in progress currently. We use the proc_list_lock() just for
430 * consistency with all the routines dealing with 'aging' processes. We need
431 * a lighterweight lock.
432 */
433 proc_list_lock();
434
435 old_system_procs_aging_band = system_procs_aging_band;
436 old_applications_aging_band = applications_aging_band;
437
438 switch (val) {
439 case kJetsamAgingPolicyNone:
440 new_system_procs_aging_band = JETSAM_PRIORITY_IDLE;
441 new_applications_aging_band = JETSAM_PRIORITY_IDLE;
442 break;
443
444 case kJetsamAgingPolicyLegacy:
445 /*
446 * Legacy behavior where some daemons get a 10s protection once and only before the first clean->dirty->clean transition before going into IDLE band.
447 */
448 new_system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
449 new_applications_aging_band = JETSAM_PRIORITY_IDLE;
450 break;
451
452 case kJetsamAgingPolicySysProcsReclaimedFirst:
453 new_system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
454 new_applications_aging_band = JETSAM_PRIORITY_AGING_BAND2;
455 break;
456
457 case kJetsamAgingPolicyAppsReclaimedFirst:
458 new_system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND2;
459 new_applications_aging_band = JETSAM_PRIORITY_AGING_BAND1;
460 break;
461
462 default:
463 break;
464 }
465
466 if (old_system_procs_aging_band && (old_system_procs_aging_band != new_system_procs_aging_band)) {
467 old_bucket = &memstat_bucket[old_system_procs_aging_band];
468 p = TAILQ_FIRST(&old_bucket->list);
469
470 while (p) {
471 next_proc = TAILQ_NEXT(p, p_memstat_list);
472
473 if (isSysProc(p)) {
474 if (new_system_procs_aging_band == JETSAM_PRIORITY_IDLE) {
475 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
476 }
477
478 memorystatus_update_priority_locked(p, new_system_procs_aging_band, false, true);
479 }
480
481 p = next_proc;
482 continue;
483 }
484 }
485
486 if (old_applications_aging_band && (old_applications_aging_band != new_applications_aging_band)) {
487 old_bucket = &memstat_bucket[old_applications_aging_band];
488 p = TAILQ_FIRST(&old_bucket->list);
489
490 while (p) {
491 next_proc = TAILQ_NEXT(p, p_memstat_list);
492
493 if (isApp(p)) {
494 if (new_applications_aging_band == JETSAM_PRIORITY_IDLE) {
495 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
496 }
497
498 memorystatus_update_priority_locked(p, new_applications_aging_band, false, true);
499 }
500
501 p = next_proc;
502 continue;
503 }
504 }
505
506 jetsam_aging_policy = val;
507 system_procs_aging_band = new_system_procs_aging_band;
508 applications_aging_band = new_applications_aging_band;
509
510 proc_list_unlock();
511
512 return 0;
513 }
514
515 SYSCTL_PROC(_kern, OID_AUTO, set_jetsam_aging_policy, CTLTYPE_INT | CTLFLAG_RW,
516 0, 0, sysctl_set_jetsam_aging_policy, "I", "Jetsam Aging Policy");
517 #endif /*0*/
518
519 static int
520 sysctl_jetsam_set_sysprocs_idle_delay_time SYSCTL_HANDLER_ARGS
521 {
522 #pragma unused(oidp, arg1, arg2)
523
524 int error = 0, val = 0, old_time_in_secs = 0;
525 uint64_t old_time_in_ns = 0;
526
527 absolutetime_to_nanoseconds(memorystatus_sysprocs_idle_delay_time, &old_time_in_ns);
528 old_time_in_secs = (int) (old_time_in_ns / NSEC_PER_SEC);
529
530 error = sysctl_io_number(req, old_time_in_secs, sizeof(int), &val, NULL);
531 if (error || !req->newptr) {
532 return error;
533 }
534
535 if ((val < 0) || (val > INT32_MAX)) {
536 printf("jetsam: new idle delay interval has invalid value.\n");
537 return EINVAL;
538 }
539
540 nanoseconds_to_absolutetime((uint64_t)val * NSEC_PER_SEC, &memorystatus_sysprocs_idle_delay_time);
541
542 return 0;
543 }
544
545 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_sysprocs_idle_delay_time, CTLTYPE_INT | CTLFLAG_RW,
546 0, 0, sysctl_jetsam_set_sysprocs_idle_delay_time, "I", "Aging window for system processes");
547
548
549 static int
550 sysctl_jetsam_set_apps_idle_delay_time SYSCTL_HANDLER_ARGS
551 {
552 #pragma unused(oidp, arg1, arg2)
553
554 int error = 0, val = 0, old_time_in_secs = 0;
555 uint64_t old_time_in_ns = 0;
556
557 absolutetime_to_nanoseconds(memorystatus_apps_idle_delay_time, &old_time_in_ns);
558 old_time_in_secs = (int) (old_time_in_ns / NSEC_PER_SEC);
559
560 error = sysctl_io_number(req, old_time_in_secs, sizeof(int), &val, NULL);
561 if (error || !req->newptr) {
562 return error;
563 }
564
565 if ((val < 0) || (val > INT32_MAX)) {
566 printf("jetsam: new idle delay interval has invalid value.\n");
567 return EINVAL;
568 }
569
570 nanoseconds_to_absolutetime((uint64_t)val * NSEC_PER_SEC, &memorystatus_apps_idle_delay_time);
571
572 return 0;
573 }
574
575 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_apps_idle_delay_time, CTLTYPE_INT | CTLFLAG_RW,
576 0, 0, sysctl_jetsam_set_apps_idle_delay_time, "I", "Aging window for applications");
577
578 SYSCTL_INT(_kern, OID_AUTO, jetsam_aging_policy, CTLTYPE_INT | CTLFLAG_RD, &jetsam_aging_policy, 0, "");
579
580 static unsigned int memorystatus_dirty_count = 0;
581
582 SYSCTL_INT(_kern, OID_AUTO, max_task_pmem, CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED, &max_task_footprint_mb, 0, "");
583
584 static int memorystatus_highwater_enabled = 1; /* Update the cached memlimit data. */
585 static boolean_t proc_jetsam_state_is_active_locked(proc_t);
586
587 #if __arm64__
588 int legacy_footprint_bonus_mb = 50; /* This value was chosen after looking at the top 30 apps
589 * that needed the additional room in their footprint when
590 * the 'correct' accounting methods were applied to them.
591 */
592
593 #if DEVELOPMENT || DEBUG
594 SYSCTL_INT(_kern, OID_AUTO, legacy_footprint_bonus_mb, CTLFLAG_RW | CTLFLAG_LOCKED, &legacy_footprint_bonus_mb, 0, "");
595 #endif /* DEVELOPMENT || DEBUG */
596 /*
597 * Raise the inactive and active memory limits to new values.
598 * Will only raise the limits and will do nothing if either of the current
599 * limits are 0.
600 * Caller must hold the proc_list_lock
601 */
602 static void
memorystatus_raise_memlimit(proc_t p,int new_memlimit_active,int new_memlimit_inactive)603 memorystatus_raise_memlimit(proc_t p, int new_memlimit_active, int new_memlimit_inactive)
604 {
605 int memlimit_mb_active = 0, memlimit_mb_inactive = 0;
606 boolean_t memlimit_active_is_fatal = FALSE, memlimit_inactive_is_fatal = FALSE, use_active_limit = FALSE;
607
608 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
609
610 if (p->p_memstat_memlimit_active > 0) {
611 memlimit_mb_active = p->p_memstat_memlimit_active;
612 } else if (p->p_memstat_memlimit_active == -1) {
613 memlimit_mb_active = max_task_footprint_mb;
614 } else {
615 /*
616 * Nothing to do for '0' which is
617 * a special value only used internally
618 * to test 'no limits'.
619 */
620 return;
621 }
622
623 if (p->p_memstat_memlimit_inactive > 0) {
624 memlimit_mb_inactive = p->p_memstat_memlimit_inactive;
625 } else if (p->p_memstat_memlimit_inactive == -1) {
626 memlimit_mb_inactive = max_task_footprint_mb;
627 } else {
628 /*
629 * Nothing to do for '0' which is
630 * a special value only used internally
631 * to test 'no limits'.
632 */
633 return;
634 }
635
636 memlimit_mb_active = MAX(new_memlimit_active, memlimit_mb_active);
637 memlimit_mb_inactive = MAX(new_memlimit_inactive, memlimit_mb_inactive);
638
639 memlimit_active_is_fatal = (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL);
640 memlimit_inactive_is_fatal = (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL);
641
642 SET_ACTIVE_LIMITS_LOCKED(p, memlimit_mb_active, memlimit_active_is_fatal);
643 SET_INACTIVE_LIMITS_LOCKED(p, memlimit_mb_inactive, memlimit_inactive_is_fatal);
644
645 if (proc_jetsam_state_is_active_locked(p) == TRUE) {
646 use_active_limit = TRUE;
647 CACHE_ACTIVE_LIMITS_LOCKED(p, memlimit_active_is_fatal);
648 } else {
649 CACHE_INACTIVE_LIMITS_LOCKED(p, memlimit_inactive_is_fatal);
650 }
651
652 if (memorystatus_highwater_enabled) {
653 task_set_phys_footprint_limit_internal(p->task,
654 (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1,
655 NULL, /*return old value */
656 use_active_limit, /*active limit?*/
657 (use_active_limit ? memlimit_active_is_fatal : memlimit_inactive_is_fatal));
658 }
659 }
660
661 void
memorystatus_act_on_legacy_footprint_entitlement(proc_t p,boolean_t footprint_increase)662 memorystatus_act_on_legacy_footprint_entitlement(proc_t p, boolean_t footprint_increase)
663 {
664 int memlimit_mb_active = 0, memlimit_mb_inactive = 0;
665
666 if (p == NULL) {
667 return;
668 }
669
670 proc_list_lock();
671
672 if (p->p_memstat_memlimit_active > 0) {
673 memlimit_mb_active = p->p_memstat_memlimit_active;
674 } else if (p->p_memstat_memlimit_active == -1) {
675 memlimit_mb_active = max_task_footprint_mb;
676 } else {
677 /*
678 * Nothing to do for '0' which is
679 * a special value only used internally
680 * to test 'no limits'.
681 */
682 proc_list_unlock();
683 return;
684 }
685
686 if (p->p_memstat_memlimit_inactive > 0) {
687 memlimit_mb_inactive = p->p_memstat_memlimit_inactive;
688 } else if (p->p_memstat_memlimit_inactive == -1) {
689 memlimit_mb_inactive = max_task_footprint_mb;
690 } else {
691 /*
692 * Nothing to do for '0' which is
693 * a special value only used internally
694 * to test 'no limits'.
695 */
696 proc_list_unlock();
697 return;
698 }
699
700 if (footprint_increase) {
701 memlimit_mb_active += legacy_footprint_bonus_mb;
702 memlimit_mb_inactive += legacy_footprint_bonus_mb;
703 } else {
704 memlimit_mb_active -= legacy_footprint_bonus_mb;
705 if (memlimit_mb_active == max_task_footprint_mb) {
706 memlimit_mb_active = -1; /* reverting back to default system limit */
707 }
708
709 memlimit_mb_inactive -= legacy_footprint_bonus_mb;
710 if (memlimit_mb_inactive == max_task_footprint_mb) {
711 memlimit_mb_inactive = -1; /* reverting back to default system limit */
712 }
713 }
714 memorystatus_raise_memlimit(p, memlimit_mb_active, memlimit_mb_inactive);
715
716 proc_list_unlock();
717 }
718
719 void
memorystatus_act_on_ios13extended_footprint_entitlement(proc_t p)720 memorystatus_act_on_ios13extended_footprint_entitlement(proc_t p)
721 {
722 if (max_mem < 1500ULL * 1024 * 1024 ||
723 max_mem > 2ULL * 1024 * 1024 * 1024) {
724 /* ios13extended_footprint is only for 2GB devices */
725 return;
726 }
727 /* limit to "almost 2GB" */
728 proc_list_lock();
729 memorystatus_raise_memlimit(p, 1800, 1800);
730 proc_list_unlock();
731 }
732
733 void
memorystatus_act_on_entitled_task_limit(proc_t p)734 memorystatus_act_on_entitled_task_limit(proc_t p)
735 {
736 if (memorystatus_entitled_max_task_footprint_mb == 0) {
737 // Entitlement is not supported on this device.
738 return;
739 }
740 proc_list_lock();
741 memorystatus_raise_memlimit(p, memorystatus_entitled_max_task_footprint_mb, memorystatus_entitled_max_task_footprint_mb);
742 proc_list_unlock();
743 }
744 #endif /* __arm64__ */
745
746 SYSCTL_INT(_kern, OID_AUTO, memorystatus_level, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_level, 0, "");
747
748 int
memorystatus_get_level(__unused struct proc * p,struct memorystatus_get_level_args * args,__unused int * ret)749 memorystatus_get_level(__unused struct proc *p, struct memorystatus_get_level_args *args, __unused int *ret)
750 {
751 user_addr_t level = 0;
752
753 level = args->level;
754
755 if (copyout(&memorystatus_level, level, sizeof(memorystatus_level)) != 0) {
756 return EFAULT;
757 }
758
759 return 0;
760 }
761
762 static void memorystatus_thread(void *param __unused, wait_result_t wr __unused);
763
764 /* Memory Limits */
765
766 static boolean_t memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason);
767 static boolean_t memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason);
768
769
770 static int memorystatus_cmd_set_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);
771
772 static int memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties_t *entry);
773
774 static int memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);
775
776 static int memorystatus_cmd_get_memlimit_excess_np(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval);
777
778 static void memorystatus_get_memlimit_properties_internal(proc_t p, memorystatus_memlimit_properties_t *p_entry);
779 static int memorystatus_set_memlimit_properties_internal(proc_t p, memorystatus_memlimit_properties_t *p_entry);
780
781 int proc_get_memstat_priority(proc_t, boolean_t);
782
783 static boolean_t memorystatus_idle_snapshot = 0;
784
785 unsigned int memorystatus_delta = 0;
786
787 /* Jetsam Loop Detection */
788 static boolean_t memorystatus_jld_enabled = FALSE; /* Enable jetsam loop detection */
789 static uint32_t memorystatus_jld_eval_period_msecs = 0; /* Init pass sets this based on device memory size */
790 static int memorystatus_jld_eval_aggressive_count = 3; /* Raise the priority max after 'n' aggressive loops */
791 static int memorystatus_jld_eval_aggressive_priority_band_max = 15; /* Kill aggressively up through this band */
792 static int memorystatus_jld_max_kill_loops = 2; /* How many times should we try and kill up to the target band */
793
794 /*
795 * A FG app can request that the aggressive jetsam mechanism display some leniency in the FG band. This 'lenient' mode is described as:
796 * --- if aggressive jetsam kills an app in the FG band and gets back >=AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD memory, it will stop the aggressive march further into and up the jetsam bands.
797 *
798 * RESTRICTIONS:
799 * - Such a request is respected/acknowledged only once while that 'requesting' app is in the FG band i.e. if aggressive jetsam was
800 * needed and the 'lenient' mode was deployed then that's it for this special mode while the app is in the FG band.
801 *
802 * - If the app is still in the FG band and aggressive jetsam is needed again, there will be no stop-and-check the next time around.
803 *
804 * - Also, the transition of the 'requesting' app away from the FG band will void this special behavior.
805 */
806
807 #define AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD 25
808 boolean_t memorystatus_aggressive_jetsam_lenient_allowed = FALSE;
809 boolean_t memorystatus_aggressive_jetsam_lenient = FALSE;
810
811 #if DEVELOPMENT || DEBUG
812 /*
813 * Jetsam Loop Detection tunables.
814 */
815
816 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_eval_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_jld_eval_period_msecs, 0, "");
817 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_eval_aggressive_count, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_jld_eval_aggressive_count, 0, "");
818 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_eval_aggressive_priority_band_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_jld_eval_aggressive_priority_band_max, 0, "");
819 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jld_max_kill_loops, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_jld_max_kill_loops, 0, "");
820 #endif /* DEVELOPMENT || DEBUG */
821
822 static uint32_t kill_under_pressure_cause = 0;
823
824 /*
825 * snapshot support for memstats collected at boot.
826 */
827 static memorystatus_jetsam_snapshot_t memorystatus_at_boot_snapshot;
828
829 static void memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snapshot, uint32_t ods_list_count);
830 static boolean_t memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry, uint64_t gencount);
831 static void memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime);
832
833 static void memorystatus_clear_errors(void);
834 static void memorystatus_get_task_phys_footprint_page_counts(task_t task,
835 uint64_t *internal_pages, uint64_t *internal_compressed_pages,
836 uint64_t *purgeable_nonvolatile_pages, uint64_t *purgeable_nonvolatile_compressed_pages,
837 uint64_t *alternate_accounting_pages, uint64_t *alternate_accounting_compressed_pages,
838 uint64_t *iokit_mapped_pages, uint64_t *page_table_pages, uint64_t *frozen_to_swap_pages);
839
840 static void memorystatus_get_task_memory_region_count(task_t task, uint64_t *count);
841
842 static uint32_t memorystatus_build_state(proc_t p);
843 //static boolean_t memorystatus_issue_pressure_kevent(boolean_t pressured);
844
845 static boolean_t memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, os_reason_t jetsam_reason, int32_t *priority,
846 uint32_t *errors, uint64_t *memory_reclaimed);
847 static boolean_t memorystatus_kill_processes_aggressive(uint32_t cause, int aggr_count, int32_t priority_max, int32_t max_kills, uint32_t *errors, uint64_t *memory_reclaimed);
848 static boolean_t memorystatus_kill_hiwat_proc(uint32_t *errors, boolean_t *purged, uint64_t *memory_reclaimed);
849
850 static boolean_t memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause);
851
852 /* Priority Band Sorting Routines */
853 static int memorystatus_sort_bucket(unsigned int bucket_index, int sort_order);
854 static int memorystatus_sort_by_largest_coalition_locked(unsigned int bucket_index, int coal_sort_order);
855 static void memorystatus_sort_by_largest_process_locked(unsigned int bucket_index);
856 static int memorystatus_move_list_locked(unsigned int bucket_index, pid_t *pid_list, int list_sz);
857
858 /* qsort routines */
859 typedef int (*cmpfunc_t)(const void *a, const void *b);
860 extern void qsort(void *a, size_t n, size_t es, cmpfunc_t cmp);
861 static int memstat_asc_cmp(const void *a, const void *b);
862
863 /* VM pressure */
864
865 extern unsigned int vm_page_free_count;
866 extern unsigned int vm_page_active_count;
867 extern unsigned int vm_page_inactive_count;
868 extern unsigned int vm_page_throttled_count;
869 extern unsigned int vm_page_purgeable_count;
870 extern unsigned int vm_page_wire_count;
871 extern unsigned int vm_page_speculative_count;
872
873 #if CONFIG_JETSAM
874 #define MEMORYSTATUS_LOG_AVAILABLE_PAGES memorystatus_available_pages
875 #else /* CONFIG_JETSAM */
876 #define MEMORYSTATUS_LOG_AVAILABLE_PAGES (vm_page_active_count + vm_page_inactive_count + vm_page_free_count + vm_page_speculative_count)
877 #endif /* CONFIG_JETSAM */
878 #if CONFIG_SECLUDED_MEMORY
879 extern unsigned int vm_page_secluded_count;
880 extern unsigned int vm_page_secluded_count_over_target;
881 #endif /* CONFIG_SECLUDED_MEMORY */
882
883 /* Aggressive jetsam pages threshold for sysproc aging policy */
884 unsigned int memorystatus_sysproc_aging_aggr_pages = 0;
885
886 #if CONFIG_JETSAM
887 unsigned int memorystatus_available_pages = (unsigned int)-1;
888 unsigned int memorystatus_available_pages_pressure = 0;
889 unsigned int memorystatus_available_pages_critical = 0;
890 unsigned int memorystatus_available_pages_critical_base = 0;
891 unsigned int memorystatus_available_pages_critical_idle_offset = 0;
892
893 #if DEVELOPMENT || DEBUG
894 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_available_pages, 0, "");
895 #else
896 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages, CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, &memorystatus_available_pages, 0, "");
897 #endif /* DEVELOPMENT || DEBUG */
898
899 static unsigned int memorystatus_jetsam_policy = kPolicyDefault;
900 unsigned int memorystatus_policy_more_free_offset_pages = 0;
901 static void memorystatus_update_levels_locked(boolean_t critical_only);
902 static unsigned int memorystatus_thread_wasted_wakeup = 0;
903
904 /* Callback into vm_compressor.c to signal that thrashing has been mitigated. */
905 extern void vm_thrashing_jetsam_done(void);
906 static int memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __unused int32_t *retval, boolean_t is_fatal_limit);
907
908 int32_t max_kill_priority = JETSAM_PRIORITY_MAX;
909
910 char memorystatus_jetsam_proc_name_panic[MAXCOMLEN + 1]; /* Panic when we are about to jetsam this process. */
911 uint32_t memorystatus_jetsam_proc_cause_panic = 0; /* If specified, panic only when we are about to jetsam the process above for this cause. */
912 uint32_t memorystatus_jetsam_proc_size_panic = 0; /* If specified, panic only when we are about to jetsam the process above and its footprint is more than this in MB. */
913
914 #else /* CONFIG_JETSAM */
915
916 uint64_t memorystatus_available_pages = (uint64_t)-1;
917 uint64_t memorystatus_available_pages_pressure = (uint64_t)-1;
918 uint64_t memorystatus_available_pages_critical = (uint64_t)-1;
919
920 int32_t max_kill_priority = JETSAM_PRIORITY_IDLE;
921 #endif /* CONFIG_JETSAM */
922
923 #if DEVELOPMENT || DEBUG
924
925 static LCK_GRP_DECLARE(disconnect_page_mappings_lck_grp, "disconnect_page_mappings");
926 static LCK_MTX_DECLARE(disconnect_page_mappings_mutex, &disconnect_page_mappings_lck_grp);
927
928 extern bool kill_on_no_paging_space;
929 #endif /* DEVELOPMENT || DEBUG */
930
931 #if DEVELOPMENT || DEBUG
/*
 * Convert a byte count into megabytes, rounding any partial megabyte up.
 *
 * The whole-megabyte part and the remainder are evaluated separately so
 * that inputs within 1MB of UINT32_MAX cannot wrap around; adding
 * (1MB - 1) before shifting overflows 32 bits for those inputs and
 * yields 0.
 *
 * Input:
 *	in - size in bytes
 *
 * Return:
 *	size in MB, rounded up (0 only when in == 0)
 */
static inline uint32_t
roundToNearestMB(uint32_t in)
{
	const uint32_t mb_shift = 20;
	const uint32_t mb_mask = ((uint32_t)1 << mb_shift) - 1;

	return (in >> mb_shift) + ((in & mb_mask) ? 1u : 0u);
}
937
938 static int memorystatus_cmd_increase_jetsam_task_limit(pid_t pid, uint32_t byte_increase);
939 #endif
940
941 /* Debug */
942
943 extern struct knote *vm_find_knote_from_pid(pid_t, struct klist *);
944
945 #if DEVELOPMENT || DEBUG
946
947 static unsigned int memorystatus_debug_dump_this_bucket = 0;
948
949 static void
memorystatus_debug_dump_bucket_locked(unsigned int bucket_index)950 memorystatus_debug_dump_bucket_locked(unsigned int bucket_index)
951 {
952 proc_t p = NULL;
953 uint64_t bytes = 0;
954 int ledger_limit = 0;
955 unsigned int b = bucket_index;
956 boolean_t traverse_all_buckets = FALSE;
957
958 if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
959 traverse_all_buckets = TRUE;
960 b = 0;
961 } else {
962 traverse_all_buckets = FALSE;
963 b = bucket_index;
964 }
965
966 /*
967 * footprint reported in [pages / MB ]
968 * limits reported as:
969 * L-limit proc's Ledger limit
970 * C-limit proc's Cached limit, should match Ledger
971 * A-limit proc's Active limit
972 * IA-limit proc's Inactive limit
973 * F==Fatal, NF==NonFatal
974 */
975
976 printf("memorystatus_debug_dump ***START*(PAGE_SIZE_64=%llu)**\n", PAGE_SIZE_64);
977 printf("bucket [pid] [pages / MB] [state] [EP / RP / AP] dirty deadline [L-limit / C-limit / A-limit / IA-limit] name\n");
978 p = memorystatus_get_first_proc_locked(&b, traverse_all_buckets);
979 while (p) {
980 bytes = get_task_phys_footprint(p->task);
981 task_get_phys_footprint_limit(p->task, &ledger_limit);
982 printf("%2d [%5d] [%5lld /%3lldMB] 0x%-8x [%2d / %2d / %2d] 0x%-3x %10lld [%3d / %3d%s / %3d%s / %3d%s] %s\n",
983 b, proc_getpid(p),
984 (bytes / PAGE_SIZE_64), /* task's footprint converted from bytes to pages */
985 (bytes / (1024ULL * 1024ULL)), /* task's footprint converted from bytes to MB */
986 p->p_memstat_state, p->p_memstat_effectivepriority, p->p_memstat_requestedpriority, p->p_memstat_assertionpriority,
987 p->p_memstat_dirty, p->p_memstat_idledeadline,
988 ledger_limit,
989 p->p_memstat_memlimit,
990 (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"),
991 p->p_memstat_memlimit_active,
992 (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL ? "F " : "NF"),
993 p->p_memstat_memlimit_inactive,
994 (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL ? "F " : "NF"),
995 (*p->p_name ? p->p_name : "unknown"));
996 p = memorystatus_get_next_proc_locked(&b, p, traverse_all_buckets);
997 }
998 printf("memorystatus_debug_dump ***END***\n");
999 }
1000
1001 static int
1002 sysctl_memorystatus_debug_dump_bucket SYSCTL_HANDLER_ARGS
1003 {
1004 #pragma unused(oidp, arg2)
1005 int bucket_index = 0;
1006 int error;
1007 error = SYSCTL_OUT(req, arg1, sizeof(int));
1008 if (error || !req->newptr) {
1009 return error;
1010 }
1011 error = SYSCTL_IN(req, &bucket_index, sizeof(int));
1012 if (error || !req->newptr) {
1013 return error;
1014 }
1015 if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
1016 /*
1017 * All jetsam buckets will be dumped.
1018 */
1019 } else {
1020 /*
1021 * Only a single bucket will be dumped.
1022 */
1023 }
1024
1025 proc_list_lock();
1026 memorystatus_debug_dump_bucket_locked(bucket_index);
1027 proc_list_unlock();
1028 memorystatus_debug_dump_this_bucket = bucket_index;
1029 return error;
1030 }
1031
1032 /*
1033 * Debug aid to look at jetsam buckets and proc jetsam fields.
1034 * Use this sysctl to act on a particular jetsam bucket.
1035 * Writing the sysctl triggers the dump.
1036 * Usage: sysctl kern.memorystatus_debug_dump_this_bucket=<bucket_index>
1037 */
1038
1039 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_debug_dump_this_bucket, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_debug_dump_this_bucket, 0, sysctl_memorystatus_debug_dump_bucket, "I", "");
1040
1041
1042 /* Debug aid to aid determination of limit */
1043
1044 static int
1045 sysctl_memorystatus_highwater_enable SYSCTL_HANDLER_ARGS
1046 {
1047 #pragma unused(oidp, arg2)
1048 proc_t p;
1049 unsigned int b = 0;
1050 int error, enable = 0;
1051 boolean_t use_active; /* use the active limit and active limit attributes */
1052 boolean_t is_fatal;
1053
1054 error = SYSCTL_OUT(req, arg1, sizeof(int));
1055 if (error || !req->newptr) {
1056 return error;
1057 }
1058
1059 error = SYSCTL_IN(req, &enable, sizeof(int));
1060 if (error || !req->newptr) {
1061 return error;
1062 }
1063
1064 if (!(enable == 0 || enable == 1)) {
1065 return EINVAL;
1066 }
1067
1068 proc_list_lock();
1069
1070 p = memorystatus_get_first_proc_locked(&b, TRUE);
1071 while (p) {
1072 use_active = proc_jetsam_state_is_active_locked(p);
1073
1074 if (enable) {
1075 if (use_active == TRUE) {
1076 CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
1077 } else {
1078 CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
1079 }
1080 } else {
1081 /*
1082 * Disabling limits does not touch the stored variants.
1083 * Set the cached limit fields to system_wide defaults.
1084 */
1085 p->p_memstat_memlimit = -1;
1086 p->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;
1087 is_fatal = TRUE;
1088 }
1089
1090 /*
1091 * Enforce the cached limit by writing to the ledger.
1092 */
1093 task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit: -1, NULL, use_active, is_fatal);
1094
1095 p = memorystatus_get_next_proc_locked(&b, p, TRUE);
1096 }
1097
1098 memorystatus_highwater_enabled = enable;
1099
1100 proc_list_unlock();
1101
1102 return 0;
1103 }
1104
1105 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_highwater_enabled, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_highwater_enabled, 0, sysctl_memorystatus_highwater_enable, "I", "");
1106
1107 SYSCTL_INT(_kern, OID_AUTO, memorystatus_idle_snapshot, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_idle_snapshot, 0, "");
1108
1109 #if CONFIG_JETSAM
1110 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_available_pages_critical, 0, "");
1111 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_base, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_available_pages_critical_base, 0, "");
1112 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_idle_offset, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_available_pages_critical_idle_offset, 0, "");
1113 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_policy_more_free_offset_pages, CTLFLAG_RW, &memorystatus_policy_more_free_offset_pages, 0, "");
1114
1115 #if VM_PRESSURE_EVENTS
1116
1117 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_pressure, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_available_pages_pressure, 0, "");
1118
1119 #endif /* VM_PRESSURE_EVENTS */
1120
1121 #endif /* CONFIG_JETSAM */
1122
1123 #endif /* DEVELOPMENT || DEBUG */
1124
1125 extern kern_return_t kernel_thread_start_priority(thread_continue_t continuation,
1126 void *parameter,
1127 integer_t priority,
1128 thread_t *new_thread);
1129
1130 #if DEVELOPMENT || DEBUG
1131
1132 static int
1133 sysctl_memorystatus_disconnect_page_mappings SYSCTL_HANDLER_ARGS
1134 {
1135 #pragma unused(arg1, arg2)
1136 int error = 0, pid = 0;
1137 proc_t p;
1138
1139 error = sysctl_handle_int(oidp, &pid, 0, req);
1140 if (error || !req->newptr) {
1141 return error;
1142 }
1143
1144 lck_mtx_lock(&disconnect_page_mappings_mutex);
1145
1146 if (pid == -1) {
1147 vm_pageout_disconnect_all_pages();
1148 } else {
1149 p = proc_find(pid);
1150
1151 if (p != NULL) {
1152 error = task_disconnect_page_mappings(p->task);
1153
1154 proc_rele(p);
1155
1156 if (error) {
1157 error = EIO;
1158 }
1159 } else {
1160 error = EINVAL;
1161 }
1162 }
1163 lck_mtx_unlock(&disconnect_page_mappings_mutex);
1164
1165 return error;
1166 }
1167
1168 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_disconnect_page_mappings, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
1169 0, 0, &sysctl_memorystatus_disconnect_page_mappings, "I", "");
1170
1171 #endif /* DEVELOPMENT || DEBUG */
1172
1173 /*
1174 * Sorts the given bucket.
1175 *
1176 * Input:
1177 * bucket_index - jetsam priority band to be sorted.
1178 * sort_order - JETSAM_SORT_xxx from kern_memorystatus.h
1179 * Currently sort_order is only meaningful when handling
1180 * coalitions.
1181 *
1182 * proc_list_lock must be held by the caller.
1183 */
1184 static void
memorystatus_sort_bucket_locked(unsigned int bucket_index,int sort_order)1185 memorystatus_sort_bucket_locked(unsigned int bucket_index, int sort_order)
1186 {
1187 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
1188 if (memstat_bucket[bucket_index].count == 0) {
1189 return;
1190 }
1191
1192 switch (bucket_index) {
1193 case JETSAM_PRIORITY_FOREGROUND:
1194 if (memorystatus_sort_by_largest_coalition_locked(bucket_index, sort_order) == 0) {
1195 /*
1196 * Fall back to per process sorting when zero coalitions are found.
1197 */
1198 memorystatus_sort_by_largest_process_locked(bucket_index);
1199 }
1200 break;
1201 default:
1202 memorystatus_sort_by_largest_process_locked(bucket_index);
1203 break;
1204 }
1205 }
1206
1207 /*
1208 * Picks the sorting routine for a given jetsam priority band.
1209 *
1210 * Input:
1211 * bucket_index - jetsam priority band to be sorted.
1212 * sort_order - JETSAM_SORT_xxx from kern_memorystatus.h
1213 * Currently sort_order is only meaningful when handling
1214 * coalitions.
1215 *
1216 * Return:
1217 * 0 on success
1218 * non-0 on failure
1219 */
1220 static int
memorystatus_sort_bucket(unsigned int bucket_index,int sort_order)1221 memorystatus_sort_bucket(unsigned int bucket_index, int sort_order)
1222 {
1223 int coal_sort_order;
1224
1225 /*
1226 * Verify the jetsam priority
1227 */
1228 if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
1229 return EINVAL;
1230 }
1231
1232 #if DEVELOPMENT || DEBUG
1233 if (sort_order == JETSAM_SORT_DEFAULT) {
1234 coal_sort_order = COALITION_SORT_DEFAULT;
1235 } else {
1236 coal_sort_order = sort_order; /* only used for testing scenarios */
1237 }
1238 #else
1239 /* Verify default */
1240 if (sort_order == JETSAM_SORT_DEFAULT) {
1241 coal_sort_order = COALITION_SORT_DEFAULT;
1242 } else {
1243 return EINVAL;
1244 }
1245 #endif
1246
1247 proc_list_lock();
1248 memorystatus_sort_bucket_locked(bucket_index, coal_sort_order);
1249 proc_list_unlock();
1250
1251 return 0;
1252 }
1253
1254 /*
1255 * Sort processes by size for a single jetsam bucket.
1256 */
1257
1258 static void
memorystatus_sort_by_largest_process_locked(unsigned int bucket_index)1259 memorystatus_sort_by_largest_process_locked(unsigned int bucket_index)
1260 {
1261 proc_t p = NULL, insert_after_proc = NULL, max_proc = NULL;
1262 proc_t next_p = NULL, prev_max_proc = NULL;
1263 uint32_t pages = 0, max_pages = 0;
1264 memstat_bucket_t *current_bucket;
1265
1266 if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
1267 return;
1268 }
1269
1270 current_bucket = &memstat_bucket[bucket_index];
1271
1272 p = TAILQ_FIRST(¤t_bucket->list);
1273
1274 while (p) {
1275 memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL);
1276 max_pages = pages;
1277 max_proc = p;
1278 prev_max_proc = p;
1279
1280 while ((next_p = TAILQ_NEXT(p, p_memstat_list)) != NULL) {
1281 /* traversing list until we find next largest process */
1282 p = next_p;
1283 memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL);
1284 if (pages > max_pages) {
1285 max_pages = pages;
1286 max_proc = p;
1287 }
1288 }
1289
1290 if (prev_max_proc != max_proc) {
1291 /* found a larger process, place it in the list */
1292 TAILQ_REMOVE(¤t_bucket->list, max_proc, p_memstat_list);
1293 if (insert_after_proc == NULL) {
1294 TAILQ_INSERT_HEAD(¤t_bucket->list, max_proc, p_memstat_list);
1295 } else {
1296 TAILQ_INSERT_AFTER(¤t_bucket->list, insert_after_proc, max_proc, p_memstat_list);
1297 }
1298 prev_max_proc = max_proc;
1299 }
1300
1301 insert_after_proc = max_proc;
1302
1303 p = TAILQ_NEXT(max_proc, p_memstat_list);
1304 }
1305 }
1306
1307 proc_t
memorystatus_get_first_proc_locked(unsigned int * bucket_index,boolean_t search)1308 memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search)
1309 {
1310 memstat_bucket_t *current_bucket;
1311 proc_t next_p;
1312
1313 if ((*bucket_index) >= MEMSTAT_BUCKET_COUNT) {
1314 return NULL;
1315 }
1316
1317 current_bucket = &memstat_bucket[*bucket_index];
1318 next_p = TAILQ_FIRST(¤t_bucket->list);
1319 if (!next_p && search) {
1320 while (!next_p && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
1321 current_bucket = &memstat_bucket[*bucket_index];
1322 next_p = TAILQ_FIRST(¤t_bucket->list);
1323 }
1324 }
1325
1326 return next_p;
1327 }
1328
1329 proc_t
memorystatus_get_next_proc_locked(unsigned int * bucket_index,proc_t p,boolean_t search)1330 memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search)
1331 {
1332 memstat_bucket_t *current_bucket;
1333 proc_t next_p;
1334
1335 if (!p || ((*bucket_index) >= MEMSTAT_BUCKET_COUNT)) {
1336 return NULL;
1337 }
1338
1339 next_p = TAILQ_NEXT(p, p_memstat_list);
1340 while (!next_p && search && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
1341 current_bucket = &memstat_bucket[*bucket_index];
1342 next_p = TAILQ_FIRST(¤t_bucket->list);
1343 }
1344
1345 return next_p;
1346 }
1347
1348 /*
1349 * Structure to hold state for a jetsam thread.
1350 * Typically there should be a single jetsam thread
1351 * unless parallel jetsam is enabled.
1352 */
1353 struct jetsam_thread_state {
1354 uint8_t inited; /* boolean - if the thread is initialized */
1355 uint8_t limit_to_low_bands; /* boolean */
1356 int memorystatus_wakeup; /* wake channel */
1357 int index; /* jetsam thread index */
1358 thread_t thread; /* jetsam thread pointer */
1359 } *jetsam_threads;
1360
1361 /* Maximum number of jetsam threads allowed */
1362 #define JETSAM_THREADS_LIMIT 3
1363
1364 /* Number of active jetsam threads */
1365 _Atomic int active_jetsam_threads = 1;
1366
1367 /* Number of maximum jetsam threads configured */
1368 int max_jetsam_threads = JETSAM_THREADS_LIMIT;
1369
1370 /*
1371 * Global switch for enabling fast jetsam. Fast jetsam is
1372 * hooked up via the system_override() system call. It has the
1373 * following effects:
1374 * - Raise the jetsam threshold ("clear-the-deck")
 * - Enable parallel jetsam on eligible devices
1376 */
1377 #if __AMP__
1378 int fast_jetsam_enabled = 1;
1379 #else /* __AMP__ */
1380 int fast_jetsam_enabled = 0;
1381 #endif /* __AMP__ */
1382
1383 /* Routine to find the jetsam state structure for the current jetsam thread */
1384 static inline struct jetsam_thread_state *
jetsam_current_thread(void)1385 jetsam_current_thread(void)
1386 {
1387 for (int thr_id = 0; thr_id < max_jetsam_threads; thr_id++) {
1388 if (jetsam_threads[thr_id].thread == current_thread()) {
1389 return &(jetsam_threads[thr_id]);
1390 }
1391 }
1392 return NULL;
1393 }
1394
1395 __private_extern__ void
memorystatus_init(void)1396 memorystatus_init(void)
1397 {
1398 kern_return_t result;
1399 int i;
1400
1401 #if CONFIG_FREEZE
1402 memorystatus_freeze_jetsam_band = JETSAM_PRIORITY_UI_SUPPORT;
1403 memorystatus_frozen_processes_max = FREEZE_PROCESSES_MAX;
1404 memorystatus_frozen_shared_mb_max = ((MAX_FROZEN_SHARED_MB_PERCENT * max_task_footprint_mb) / 100); /* 10% of the system wide task limit */
1405 memorystatus_freeze_shared_mb_per_process_max = (memorystatus_frozen_shared_mb_max / 4);
1406 memorystatus_freeze_pages_min = FREEZE_PAGES_MIN;
1407 memorystatus_freeze_pages_max = FREEZE_PAGES_MAX;
1408 memorystatus_max_frozen_demotions_daily = MAX_FROZEN_PROCESS_DEMOTIONS;
1409 memorystatus_thaw_count_demotion_threshold = MIN_THAW_DEMOTION_THRESHOLD;
1410 #endif /* CONFIG_FREEZE */
1411
1412 #if DEVELOPMENT || DEBUG
1413 if (kill_on_no_paging_space) {
1414 max_kill_priority = JETSAM_PRIORITY_MAX;
1415 }
1416 #endif
1417
1418 /* Init buckets */
1419 for (i = 0; i < MEMSTAT_BUCKET_COUNT; i++) {
1420 TAILQ_INIT(&memstat_bucket[i].list);
1421 memstat_bucket[i].count = 0;
1422 memstat_bucket[i].relaunch_high_count = 0;
1423 }
1424 memorystatus_idle_demotion_call = thread_call_allocate((thread_call_func_t)memorystatus_perform_idle_demotion, NULL);
1425
1426 nanoseconds_to_absolutetime((uint64_t)DEFERRED_IDLE_EXIT_TIME_SECS * NSEC_PER_SEC, &memorystatus_sysprocs_idle_delay_time);
1427 nanoseconds_to_absolutetime((uint64_t)DEFERRED_IDLE_EXIT_TIME_SECS * NSEC_PER_SEC, &memorystatus_apps_idle_delay_time);
1428
1429 #if CONFIG_JETSAM
1430 bzero(memorystatus_jetsam_proc_name_panic, MAXCOMLEN + 1);
1431 if (PE_parse_boot_argn("jetsam_proc_name_panic", &memorystatus_jetsam_proc_name_panic, MAXCOMLEN + 1)) {
1432 printf("Enabling panic on jetsam for process %s ", memorystatus_jetsam_proc_name_panic);
1433
1434 /*
1435 * No bounds check to see if this is a valid cause.
1436 * This is a debugging aid. The callers should know precisely which cause they wish to track.
1437 */
1438 if (PE_parse_boot_argn("jetsam_proc_cause_panic", &memorystatus_jetsam_proc_cause_panic, sizeof(memorystatus_jetsam_proc_cause_panic))) {
1439 printf("when cause is %d ", memorystatus_jetsam_proc_cause_panic);
1440 } else {
1441 printf("for any cause ");
1442 }
1443
1444 if (PE_parse_boot_argn("jetsam_proc_size_panic", &memorystatus_jetsam_proc_size_panic, sizeof(memorystatus_jetsam_proc_size_panic))) {
1445 printf("and when its footprint is >= %d MB.\n", memorystatus_jetsam_proc_size_panic);
1446 } else {
1447 printf("and for any footprint.\n");
1448 }
1449 }
1450
1451 /* Apply overrides */
1452 if (!PE_parse_boot_argn("kern.jetsam_delta", &delta_percentage, sizeof(delta_percentage))) {
1453 PE_get_default("kern.jetsam_delta", &delta_percentage, sizeof(delta_percentage));
1454 }
1455 if (delta_percentage == 0) {
1456 delta_percentage = 5;
1457 }
1458 if (max_mem > config_jetsam_large_memory_cutoff) {
1459 critical_threshold_percentage = critical_threshold_percentage_larger_devices;
1460 delta_percentage = delta_percentage_larger_devices;
1461 }
1462 assert(delta_percentage < 100);
1463 if (!PE_parse_boot_argn("kern.jetsam_critical_threshold", &critical_threshold_percentage, sizeof(critical_threshold_percentage))) {
1464 PE_get_default("kern.jetsam_critical_threshold", &critical_threshold_percentage, sizeof(critical_threshold_percentage));
1465 }
1466 assert(critical_threshold_percentage < 100);
1467 PE_get_default("kern.jetsam_idle_offset", &idle_offset_percentage, sizeof(idle_offset_percentage));
1468 assert(idle_offset_percentage < 100);
1469 PE_get_default("kern.jetsam_pressure_threshold", &pressure_threshold_percentage, sizeof(pressure_threshold_percentage));
1470 assert(pressure_threshold_percentage < 100);
1471 PE_get_default("kern.jetsam_freeze_threshold", &freeze_threshold_percentage, sizeof(freeze_threshold_percentage));
1472 assert(freeze_threshold_percentage < 100);
1473
1474
1475 if (!PE_parse_boot_argn("jetsam_aging_policy", &jetsam_aging_policy,
1476 sizeof(jetsam_aging_policy))) {
1477 if (!PE_get_default("kern.jetsam_aging_policy", &jetsam_aging_policy,
1478 sizeof(jetsam_aging_policy))) {
1479 jetsam_aging_policy = kJetsamAgingPolicySysProcsReclaimedFirst;
1480 }
1481 }
1482
1483 if (jetsam_aging_policy > kJetsamAgingPolicyMax) {
1484 jetsam_aging_policy = kJetsamAgingPolicySysProcsReclaimedFirst;
1485 }
1486
1487 switch (jetsam_aging_policy) {
1488 case kJetsamAgingPolicyNone:
1489 system_procs_aging_band = JETSAM_PRIORITY_IDLE;
1490 applications_aging_band = JETSAM_PRIORITY_IDLE;
1491 break;
1492
1493 case kJetsamAgingPolicyLegacy:
1494 /*
1495 * Legacy behavior where some daemons get a 10s protection once
1496 * AND only before the first clean->dirty->clean transition before
1497 * going into IDLE band.
1498 */
1499 system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
1500 applications_aging_band = JETSAM_PRIORITY_IDLE;
1501 break;
1502
1503 case kJetsamAgingPolicySysProcsReclaimedFirst:
1504 system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
1505 applications_aging_band = JETSAM_PRIORITY_AGING_BAND2;
1506 break;
1507
1508 case kJetsamAgingPolicyAppsReclaimedFirst:
1509 system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND2;
1510 applications_aging_band = JETSAM_PRIORITY_AGING_BAND1;
1511 break;
1512
1513 default:
1514 break;
1515 }
1516
1517 /*
1518 * The aging bands cannot overlap with the JETSAM_PRIORITY_ELEVATED_INACTIVE
1519 * band and must be below it in priority. This is so that we don't have to make
1520 * our 'aging' code worry about a mix of processes, some of which need to age
1521 * and some others that need to stay elevated in the jetsam bands.
1522 */
1523 assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > system_procs_aging_band);
1524 assert(JETSAM_PRIORITY_ELEVATED_INACTIVE > applications_aging_band);
1525
1526 /* Take snapshots for idle-exit kills by default? First check the boot-arg... */
1527 if (!PE_parse_boot_argn("jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot))) {
1528 /* ...no boot-arg, so check the device tree */
1529 PE_get_default("kern.jetsam_idle_snapshot", &memorystatus_idle_snapshot, sizeof(memorystatus_idle_snapshot));
1530 }
1531
1532 memorystatus_delta = (unsigned int) (delta_percentage * atop_64(max_mem) / 100);
1533 memorystatus_available_pages_critical_idle_offset = (unsigned int) (idle_offset_percentage * atop_64(max_mem) / 100);
1534 memorystatus_available_pages_critical_base = (unsigned int) ((critical_threshold_percentage / delta_percentage) * memorystatus_delta);
1535 memorystatus_policy_more_free_offset_pages = (unsigned int) ((policy_more_free_offset_percentage / delta_percentage) * memorystatus_delta);
1536 memorystatus_sysproc_aging_aggr_pages = (unsigned int) (sysproc_aging_aggr_threshold_percentage * atop_64(max_mem) / 100);
1537
1538 /* Jetsam Loop Detection */
1539 if (max_mem <= (512 * 1024 * 1024)) {
1540 /* 512 MB devices */
1541 memorystatus_jld_eval_period_msecs = 8000; /* 8000 msecs == 8 second window */
1542 } else {
1543 /* 1GB and larger devices */
1544 memorystatus_jld_eval_period_msecs = 6000; /* 6000 msecs == 6 second window */
1545 }
1546
1547 memorystatus_jld_enabled = TRUE;
1548
1549 /* No contention at this point */
1550 memorystatus_update_levels_locked(FALSE);
1551
1552 #endif /* CONFIG_JETSAM */
1553
1554 #if __arm64__
1555 if (!PE_parse_boot_argn("entitled_max_task_pmem", &memorystatus_entitled_max_task_footprint_mb,
1556 sizeof(memorystatus_entitled_max_task_footprint_mb))) {
1557 if (!PE_get_default("kern.entitled_max_task_pmem", &memorystatus_entitled_max_task_footprint_mb,
1558 sizeof(memorystatus_entitled_max_task_footprint_mb))) {
1559 // entitled_max_task_pmem is not supported on this system.
1560 memorystatus_entitled_max_task_footprint_mb = 0;
1561 }
1562 }
1563 if (memorystatus_entitled_max_task_footprint_mb > max_mem / (1UL << 20) || memorystatus_entitled_max_task_footprint_mb < 0) {
1564 os_log_with_startup_serial(OS_LOG_DEFAULT, "Invalid value (%d) for entitled_max_task_pmem. Setting to 0",
1565 memorystatus_entitled_max_task_footprint_mb);
1566 }
1567 #endif /* __arm64__ */
1568
1569 memorystatus_jetsam_snapshot_max = maxproc;
1570
1571 memorystatus_jetsam_snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
1572 (sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_max);
1573
1574 memorystatus_jetsam_snapshot = kalloc_data(memorystatus_jetsam_snapshot_size, Z_WAITOK | Z_ZERO);
1575 if (!memorystatus_jetsam_snapshot) {
1576 panic("Could not allocate memorystatus_jetsam_snapshot");
1577 }
1578
1579 #if CONFIG_FREEZE
1580 memorystatus_jetsam_snapshot_freezer_max = memorystatus_jetsam_snapshot_max / JETSAM_SNAPSHOT_FREEZER_MAX_FACTOR;
1581 memorystatus_jetsam_snapshot_freezer_size = sizeof(memorystatus_jetsam_snapshot_t) +
1582 (sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_freezer_max);
1583
1584 memorystatus_jetsam_snapshot_freezer =
1585 zalloc_permanent(memorystatus_jetsam_snapshot_freezer_size, ZALIGN_PTR);
1586 #endif /* CONFIG_FREEZE */
1587
1588 nanoseconds_to_absolutetime((uint64_t)JETSAM_SNAPSHOT_TIMEOUT_SECS * NSEC_PER_SEC, &memorystatus_jetsam_snapshot_timeout);
1589
1590 memset(&memorystatus_at_boot_snapshot, 0, sizeof(memorystatus_jetsam_snapshot_t));
1591
1592 #if CONFIG_FREEZE
1593 memorystatus_freeze_threshold = (unsigned int) ((freeze_threshold_percentage / delta_percentage) * memorystatus_delta);
1594 #endif
1595
1596 /* Check the boot-arg to see if fast jetsam is allowed */
1597 if (!PE_parse_boot_argn("fast_jetsam_enabled", &fast_jetsam_enabled, sizeof(fast_jetsam_enabled))) {
1598 fast_jetsam_enabled = 0;
1599 }
1600
1601 /* Check the boot-arg to configure the maximum number of jetsam threads */
1602 if (!PE_parse_boot_argn("max_jetsam_threads", &max_jetsam_threads, sizeof(max_jetsam_threads))) {
1603 max_jetsam_threads = JETSAM_THREADS_LIMIT;
1604 }
1605
1606 /* Restrict the maximum number of jetsam threads to JETSAM_THREADS_LIMIT */
1607 if (max_jetsam_threads > JETSAM_THREADS_LIMIT) {
1608 max_jetsam_threads = JETSAM_THREADS_LIMIT;
1609 }
1610
1611 /* For low CPU systems disable fast jetsam mechanism */
1612 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
1613 max_jetsam_threads = 1;
1614 fast_jetsam_enabled = 0;
1615 }
1616
1617 /* Initialize the jetsam_threads state array */
1618 jetsam_threads = zalloc_permanent(sizeof(struct jetsam_thread_state) *
1619 max_jetsam_threads, ZALIGN(struct jetsam_thread_state));
1620
1621 /* Initialize all the jetsam threads */
1622 for (i = 0; i < max_jetsam_threads; i++) {
1623 jetsam_threads[i].inited = FALSE;
1624 jetsam_threads[i].index = i;
1625 result = kernel_thread_start_priority(memorystatus_thread, NULL, 95 /* MAXPRI_KERNEL */, &jetsam_threads[i].thread);
1626 if (result != KERN_SUCCESS) {
1627 panic("Could not create memorystatus_thread %d", i);
1628 }
1629 thread_deallocate(jetsam_threads[i].thread);
1630 }
1631
1632 #if VM_PRESSURE_EVENTS
1633 memorystatus_notify_init();
1634 #endif /* VM_PRESSURE_EVENTS */
1635 }
1636
1637 /* Centralised for the purposes of allowing panic-on-jetsam */
1638 extern void
1639 vm_run_compactor(void);
1640 extern void
1641 vm_wake_compactor_swapper(void);
1642
1643 /*
1644 * The jetsam no frills kill call
1645 * Return: 0 on success
1646 * error code on failure (EINVAL...)
1647 */
1648 static int
jetsam_do_kill(proc_t p,int jetsam_flags,os_reason_t jetsam_reason)1649 jetsam_do_kill(proc_t p, int jetsam_flags, os_reason_t jetsam_reason)
1650 {
1651 int error = 0;
1652 error = exit_with_reason(p, W_EXITCODE(0, SIGKILL), (int *)NULL, FALSE, FALSE, jetsam_flags, jetsam_reason);
1653 return error;
1654 }
1655
/*
 * Wrapper for processes exiting with memorystatus details.
 *
 * Samples the victim's physical footprint, emits the DO_KILL tracepoints,
 * honours the panic-on-jetsam debugging knobs (CONFIG_JETSAM only), maps
 * 'cause' onto the legacy jetsam_flags bits and kills the process through
 * jetsam_do_kill(). Afterwards the compactor is either run on this thread
 * or, for vnode jetsams, woken asynchronously.
 *
 * Parameters:
 *	p				process to kill
 *	cause				kill cause (kMemorystatusKilled* value)
 *	jetsam_reason			exit reason forwarded to jetsam_do_kill(); it is
 *					also released with os_reason_free() before returning
 *	footprint_of_killed_proc	out: footprint sampled before the kill,
 *					or 0 when the kill failed
 *
 * Returns: TRUE when jetsam_do_kill() returned 0, FALSE otherwise.
 */
static boolean_t
memorystatus_do_kill(proc_t p, uint32_t cause, os_reason_t jetsam_reason, uint64_t *footprint_of_killed_proc)
{
	int error = 0;
	__unused pid_t victim_pid = proc_getpid(p);
	/* Sample the footprint up front, before the kill is issued. */
	uint64_t footprint = get_task_phys_footprint(p->task);
#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD)
	/* Only consumed by the DBG_FUNC_END tracepoint below. */
	int32_t memstat_effectivepriority = p->p_memstat_effectivepriority;
#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */

	KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_START,
	    victim_pid, cause, vm_page_free_count, footprint, 0);
	DTRACE_MEMORYSTATUS4(memorystatus_do_kill, proc_t, p, os_reason_t, jetsam_reason, uint32_t, cause, uint64_t, footprint);

#if CONFIG_JETSAM
	/*
	 * Panic-on-jetsam: panic instead of killing when the victim's name matches
	 * memorystatus_jetsam_proc_name_panic and the optional cause / size (in MB)
	 * filters, when set, match as well.
	 */
	if (*p->p_name && !strncmp(memorystatus_jetsam_proc_name_panic, p->p_name, sizeof(p->p_name))) { /* name */
		if ((!memorystatus_jetsam_proc_cause_panic || cause == memorystatus_jetsam_proc_cause_panic) && /* cause */
		    (!memorystatus_jetsam_proc_size_panic || (footprint >> 20) >= memorystatus_jetsam_proc_size_panic)) { /* footprint */
			panic("memorystatus_do_kill(): requested panic on jetsam of %s (cause: %d and footprint: %llu mb)",
			    memorystatus_jetsam_proc_name_panic, cause, footprint >> 20);
		}
	}
#else /* CONFIG_JETSAM */
#pragma unused(cause)
#endif /* CONFIG_JETSAM */

	/* Kills at or above the foreground band are logged. */
	if (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND) {
		printf("memorystatus: killing process %d [%s] in high band %s (%d) - memorystatus_available_pages: %llu\n", proc_getpid(p),
		    (*p->p_name ? p->p_name : "unknown"),
		    memorystatus_priority_band_name(p->p_memstat_effectivepriority), p->p_memstat_effectivepriority,
		    (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
	}

	/*
	 * The jetsam_reason (os_reason_t) has enough information about the kill cause.
	 * We don't really need jetsam_flags anymore, so it's okay that not all possible kill causes have been mapped.
	 */
	int jetsam_flags = P_LTERM_JETSAM;
	switch (cause) {
	case kMemorystatusKilledHiwat:                          jetsam_flags |= P_JETSAM_HIWAT; break;
	case kMemorystatusKilledVnodes:                         jetsam_flags |= P_JETSAM_VNODE; break;
	case kMemorystatusKilledVMPageShortage:                 jetsam_flags |= P_JETSAM_VMPAGESHORTAGE; break;
	case kMemorystatusKilledVMCompressorThrashing:
	case kMemorystatusKilledVMCompressorSpaceShortage:      jetsam_flags |= P_JETSAM_VMTHRASHING; break;
	case kMemorystatusKilledFCThrashing:                    jetsam_flags |= P_JETSAM_FCTHRASHING; break;
	case kMemorystatusKilledPerProcessLimit:                jetsam_flags |= P_JETSAM_PID; break;
	case kMemorystatusKilledIdleExit:                       jetsam_flags |= P_JETSAM_IDLEEXIT; break;
	}
	/*
	 * jetsam_do_kill drops a reference, so take an extra one first:
	 * jetsam_reason->osr_code is still read after the kill below.
	 */
	os_reason_ref(jetsam_reason);
	error = jetsam_do_kill(p, jetsam_flags, jetsam_reason);
	/* Report the footprint only if the kill actually succeeded. */
	*footprint_of_killed_proc = ((error == 0) ? footprint : 0);

	KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_END,
	    victim_pid, memstat_effectivepriority, vm_page_free_count, error, 0);

	KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_COMPACTOR_RUN)) | DBG_FUNC_START,
	    victim_pid, cause, vm_page_free_count, *footprint_of_killed_proc, 0);

	if (jetsam_reason->osr_code == JETSAM_REASON_VNODE) {
		/*
		 * vnode jetsams are synchronous and not caused by memory pressure.
		 * Running the compactor on this thread adds significant latency to the filesystem operation
		 * that triggered this jetsam.
		 * Kick off the compactor thread asynchronously instead.
		 */
		vm_wake_compactor_swapper();
	} else {
		vm_run_compactor();
	}

	KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_COMPACTOR_RUN)) | DBG_FUNC_END,
	    victim_pid, cause, vm_page_free_count, 0, 0);

	/* Release the reason now that osr_code has been consulted. */
	os_reason_free(jetsam_reason);
	return error == 0;
}
1736
1737 /*
1738 * Node manipulation
1739 */
1740
/*
 * Re-evaluate the memorystatus levels. Within this file it is called after
 * a process has been added to, or moved between, the priority buckets.
 * Only CONFIG_JETSAM kernels have any work to do.
 */
static void
memorystatus_check_levels_locked(void)
{
#if !CONFIG_JETSAM
	/*
	 * Nothing to do here currently since we update
	 * memorystatus_available_pages in vm_pressure_response.
	 */
#else /* CONFIG_JETSAM */
	/* Update levels */
	memorystatus_update_levels_locked(TRUE);
#endif /* !CONFIG_JETSAM */
}
1754
1755 /*
1756 * Pin a process to a particular jetsam band when it is in the background i.e. not doing active work.
1757 * For an application: that means no longer in the FG band
1758 * For a daemon: that means no longer in its 'requested' jetsam priority band
1759 */
1760
1761 int
memorystatus_update_inactive_jetsam_priority_band(pid_t pid,uint32_t op_flags,int jetsam_prio,boolean_t effective_now)1762 memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t op_flags, int jetsam_prio, boolean_t effective_now)
1763 {
1764 int error = 0;
1765 boolean_t enable = FALSE;
1766 proc_t p = NULL;
1767
1768 if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE) {
1769 enable = TRUE;
1770 } else if (op_flags == MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE) {
1771 enable = FALSE;
1772 } else {
1773 return EINVAL;
1774 }
1775
1776 p = proc_find(pid);
1777 if (p != NULL) {
1778 if ((enable && ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) == P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) ||
1779 (!enable && ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) == 0))) {
1780 /*
1781 * No change in state.
1782 */
1783 } else {
1784 proc_list_lock();
1785
1786 if (enable) {
1787 p->p_memstat_state |= P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
1788 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
1789
1790 if (effective_now) {
1791 if (p->p_memstat_effectivepriority < jetsam_prio) {
1792 if (memorystatus_highwater_enabled) {
1793 /*
1794 * Process is about to transition from
1795 * inactive --> active
1796 * assign active state
1797 */
1798 boolean_t is_fatal;
1799 boolean_t use_active = TRUE;
1800 CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
1801 task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1, NULL, use_active, is_fatal);
1802 }
1803 memorystatus_update_priority_locked(p, jetsam_prio, FALSE, FALSE);
1804 }
1805 } else {
1806 if (isProcessInAgingBands(p)) {
1807 memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE);
1808 }
1809 }
1810 } else {
1811 p->p_memstat_state &= ~P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
1812 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
1813
1814 if (effective_now) {
1815 if (p->p_memstat_effectivepriority == jetsam_prio) {
1816 memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE);
1817 }
1818 } else {
1819 if (isProcessInAgingBands(p)) {
1820 memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE);
1821 }
1822 }
1823 }
1824
1825 proc_list_unlock();
1826 }
1827 proc_rele(p);
1828 error = 0;
1829 } else {
1830 error = ESRCH;
1831 }
1832
1833 return error;
1834 }
1835
1836 static void
memorystatus_perform_idle_demotion(__unused void * spare1,__unused void * spare2)1837 memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2)
1838 {
1839 proc_t p;
1840 uint64_t current_time = 0, idle_delay_time = 0;
1841 int demote_prio_band = 0;
1842 memstat_bucket_t *demotion_bucket;
1843
1844 MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion()\n");
1845
1846 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_START, 0, 0, 0, 0, 0);
1847
1848 current_time = mach_absolute_time();
1849
1850 proc_list_lock();
1851
1852 demote_prio_band = JETSAM_PRIORITY_IDLE + 1;
1853
1854 for (; demote_prio_band < JETSAM_PRIORITY_MAX; demote_prio_band++) {
1855 if (demote_prio_band != system_procs_aging_band && demote_prio_band != applications_aging_band) {
1856 continue;
1857 }
1858
1859 demotion_bucket = &memstat_bucket[demote_prio_band];
1860 p = TAILQ_FIRST(&demotion_bucket->list);
1861
1862 while (p) {
1863 MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion() found %d\n", proc_getpid(p));
1864
1865 assert(p->p_memstat_idledeadline);
1866
1867 assert(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS);
1868
1869 if (current_time >= p->p_memstat_idledeadline) {
1870 if ((isSysProc(p) &&
1871 ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED | P_DIRTY_IS_DIRTY)) != P_DIRTY_IDLE_EXIT_ENABLED)) || /* system proc marked dirty*/
1872 task_has_assertions((struct task *)(p->task))) { /* has outstanding assertions which might indicate outstanding work too */
1873 idle_delay_time = (isSysProc(p)) ? memorystatus_sysprocs_idle_time(p) : memorystatus_apps_idle_time(p);
1874
1875 p->p_memstat_idledeadline += idle_delay_time;
1876 p = TAILQ_NEXT(p, p_memstat_list);
1877 } else {
1878 proc_t next_proc = NULL;
1879
1880 next_proc = TAILQ_NEXT(p, p_memstat_list);
1881 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
1882
1883 memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, false, true);
1884
1885 p = next_proc;
1886 continue;
1887 }
1888 } else {
1889 // No further candidates
1890 break;
1891 }
1892 }
1893 }
1894
1895 memorystatus_reschedule_idle_demotion_locked();
1896
1897 proc_list_unlock();
1898
1899 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
1900 }
1901
1902 static void
memorystatus_schedule_idle_demotion_locked(proc_t p,boolean_t set_state)1903 memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_state)
1904 {
1905 boolean_t present_in_sysprocs_aging_bucket = FALSE;
1906 boolean_t present_in_apps_aging_bucket = FALSE;
1907 uint64_t idle_delay_time = 0;
1908
1909 if (jetsam_aging_policy == kJetsamAgingPolicyNone) {
1910 return;
1911 }
1912
1913 if ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) ||
1914 (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION)) {
1915 /*
1916 * This process isn't going to be making the trip to the lower bands.
1917 */
1918 return;
1919 }
1920
1921 if (isProcessInAgingBands(p)) {
1922 if (jetsam_aging_policy != kJetsamAgingPolicyLegacy) {
1923 assert((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) != P_DIRTY_AGING_IN_PROGRESS);
1924 }
1925
1926 if (isSysProc(p) && system_procs_aging_band) {
1927 present_in_sysprocs_aging_bucket = TRUE;
1928 } else if (isApp(p) && applications_aging_band) {
1929 present_in_apps_aging_bucket = TRUE;
1930 }
1931 }
1932
1933 assert(!present_in_sysprocs_aging_bucket);
1934 assert(!present_in_apps_aging_bucket);
1935
1936 MEMORYSTATUS_DEBUG(1, "memorystatus_schedule_idle_demotion_locked: scheduling demotion to idle band for pid %d (dirty:0x%x, set_state %d, demotions %d).\n",
1937 proc_getpid(p), p->p_memstat_dirty, set_state, (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps));
1938
1939 if (isSysProc(p)) {
1940 assert((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED);
1941 }
1942
1943 idle_delay_time = (isSysProc(p)) ? memorystatus_sysprocs_idle_time(p) : memorystatus_apps_idle_time(p);
1944 if (set_state) {
1945 p->p_memstat_dirty |= P_DIRTY_AGING_IN_PROGRESS;
1946 p->p_memstat_idledeadline = mach_absolute_time() + idle_delay_time;
1947 }
1948
1949 assert(p->p_memstat_idledeadline);
1950
1951 if (isSysProc(p) && present_in_sysprocs_aging_bucket == FALSE) {
1952 memorystatus_scheduled_idle_demotions_sysprocs++;
1953 } else if (isApp(p) && present_in_apps_aging_bucket == FALSE) {
1954 memorystatus_scheduled_idle_demotions_apps++;
1955 }
1956 }
1957
1958 void
memorystatus_invalidate_idle_demotion_locked(proc_t p,boolean_t clear_state)1959 memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clear_state)
1960 {
1961 boolean_t present_in_sysprocs_aging_bucket = FALSE;
1962 boolean_t present_in_apps_aging_bucket = FALSE;
1963
1964 if (!system_procs_aging_band && !applications_aging_band) {
1965 return;
1966 }
1967
1968 if ((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) == 0) {
1969 return;
1970 }
1971
1972 if (isProcessInAgingBands(p)) {
1973 if (jetsam_aging_policy != kJetsamAgingPolicyLegacy) {
1974 assert((p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) == P_DIRTY_AGING_IN_PROGRESS);
1975 }
1976
1977 if (isSysProc(p) && system_procs_aging_band) {
1978 assert(p->p_memstat_effectivepriority == system_procs_aging_band);
1979 assert(p->p_memstat_idledeadline);
1980 present_in_sysprocs_aging_bucket = TRUE;
1981 } else if (isApp(p) && applications_aging_band) {
1982 assert(p->p_memstat_effectivepriority == applications_aging_band);
1983 assert(p->p_memstat_idledeadline);
1984 present_in_apps_aging_bucket = TRUE;
1985 }
1986 }
1987
1988 MEMORYSTATUS_DEBUG(1, "memorystatus_invalidate_idle_demotion(): invalidating demotion to idle band for pid %d (clear_state %d, demotions %d).\n",
1989 proc_getpid(p), clear_state, (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps));
1990
1991
1992 if (clear_state) {
1993 p->p_memstat_idledeadline = 0;
1994 p->p_memstat_dirty &= ~P_DIRTY_AGING_IN_PROGRESS;
1995 }
1996
1997 if (isSysProc(p) && present_in_sysprocs_aging_bucket == TRUE) {
1998 memorystatus_scheduled_idle_demotions_sysprocs--;
1999 assert(memorystatus_scheduled_idle_demotions_sysprocs >= 0);
2000 } else if (isApp(p) && present_in_apps_aging_bucket == TRUE) {
2001 memorystatus_scheduled_idle_demotions_apps--;
2002 assert(memorystatus_scheduled_idle_demotions_apps >= 0);
2003 }
2004
2005 assert((memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps) >= 0);
2006 }
2007
2008 static void
memorystatus_reschedule_idle_demotion_locked(void)2009 memorystatus_reschedule_idle_demotion_locked(void)
2010 {
2011 if (0 == (memorystatus_scheduled_idle_demotions_sysprocs + memorystatus_scheduled_idle_demotions_apps)) {
2012 if (memstat_idle_demotion_deadline) {
2013 /* Transitioned 1->0, so cancel next call */
2014 thread_call_cancel(memorystatus_idle_demotion_call);
2015 memstat_idle_demotion_deadline = 0;
2016 }
2017 } else {
2018 memstat_bucket_t *demotion_bucket;
2019 proc_t p = NULL, p1 = NULL, p2 = NULL;
2020
2021 if (system_procs_aging_band) {
2022 demotion_bucket = &memstat_bucket[system_procs_aging_band];
2023 p1 = TAILQ_FIRST(&demotion_bucket->list);
2024
2025 p = p1;
2026 }
2027
2028 if (applications_aging_band) {
2029 demotion_bucket = &memstat_bucket[applications_aging_band];
2030 p2 = TAILQ_FIRST(&demotion_bucket->list);
2031
2032 if (p1 && p2) {
2033 p = (p1->p_memstat_idledeadline > p2->p_memstat_idledeadline) ? p2 : p1;
2034 } else {
2035 p = (p1 == NULL) ? p2 : p1;
2036 }
2037 }
2038
2039 assert(p);
2040
2041 if (p != NULL) {
2042 assert(p && p->p_memstat_idledeadline);
2043 if (memstat_idle_demotion_deadline != p->p_memstat_idledeadline) {
2044 thread_call_enter_delayed(memorystatus_idle_demotion_call, p->p_memstat_idledeadline);
2045 memstat_idle_demotion_deadline = p->p_memstat_idledeadline;
2046 }
2047 }
2048 }
2049 }
2050
2051 /*
2052 * List manipulation
2053 */
2054
2055 int
memorystatus_add(proc_t p,boolean_t locked)2056 memorystatus_add(proc_t p, boolean_t locked)
2057 {
2058 memstat_bucket_t *bucket;
2059
2060 MEMORYSTATUS_DEBUG(1, "memorystatus_list_add(): adding pid %d with priority %d.\n", proc_getpid(p), p->p_memstat_effectivepriority);
2061
2062 if (!locked) {
2063 proc_list_lock();
2064 }
2065
2066 DTRACE_MEMORYSTATUS2(memorystatus_add, proc_t, p, int32_t, p->p_memstat_effectivepriority);
2067
2068 /* Processes marked internal do not have priority tracked */
2069 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
2070 goto exit;
2071 }
2072
2073 /*
2074 * Opt out system processes from being frozen by default.
2075 * For coalition-based freezing, we only want to freeze sysprocs that have specifically opted in.
2076 */
2077 if (isSysProc(p)) {
2078 p->p_memstat_state |= P_MEMSTAT_FREEZE_DISABLED;
2079 }
2080 #if CONFIG_FREEZE
2081 memorystatus_freeze_init_proc(p);
2082 #endif
2083
2084 bucket = &memstat_bucket[p->p_memstat_effectivepriority];
2085
2086 if (isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) {
2087 assert(bucket->count == memorystatus_scheduled_idle_demotions_sysprocs - 1);
2088 } else if (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band)) {
2089 assert(bucket->count == memorystatus_scheduled_idle_demotions_apps - 1);
2090 } else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
2091 /*
2092 * Entering the idle band.
2093 * Record idle start time.
2094 */
2095 p->p_memstat_idle_start = mach_absolute_time();
2096 }
2097
2098 TAILQ_INSERT_TAIL(&bucket->list, p, p_memstat_list);
2099 bucket->count++;
2100 if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
2101 bucket->relaunch_high_count++;
2102 }
2103
2104 memorystatus_list_count++;
2105
2106 memorystatus_check_levels_locked();
2107
2108 exit:
2109 if (!locked) {
2110 proc_list_unlock();
2111 }
2112
2113 return 0;
2114 }
2115
/*
 * Description:
 *	Moves a process from one jetsam bucket to another,
 *	which changes the LRU position of the process.
 *
 *	Monitors transition between buckets and if necessary
 *	will update cached memory limits accordingly.
 *
 * Parameters:
 *	p			process to move; nothing is done if proc_list_exited(p)
 *	priority		requested destination band (asserted < MEMSTAT_BUCKET_COUNT);
 *				when skip_demotion_check is FALSE it may be raised to the
 *				freezer band, the elevated inactive band or the
 *				applications aging band before the move
 *	head_insert		TRUE inserts at the head of the destination bucket,
 *				otherwise at the tail
 *	skip_demotion_check	see below
 *
 * skip_demotion_check:
 *	- if the 'jetsam aging policy' is NOT 'legacy':
 *		When this flag is TRUE, it means we are going
 *		to age the ripe processes out of the aging bands and into the
 *		IDLE band and apply their inactive memory limits.
 *
 *	- if the 'jetsam aging policy' is 'legacy':
 *		When this flag is TRUE, it might mean the above aging mechanism
 *		OR
 *		It might be that we have a process that has used up its 'idle deferral'
 *		stay that is given to it once per lifetime. And in this case, the process
 *		won't be going through any aging codepaths. But we still need to apply
 *		the right inactive limits and so we explicitly set this to TRUE if the
 *		new priority for the process is the IDLE band.
 */
void
memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert, boolean_t skip_demotion_check)
{
	memstat_bucket_t *old_bucket, *new_bucket;

	assert(priority < MEMSTAT_BUCKET_COUNT);

	/* Ensure that exit isn't underway, leaving the proc retained but removed from its bucket */
	if (proc_list_exited(p)) {
		return;
	}

	MEMORYSTATUS_DEBUG(1, "memorystatus_update_priority_locked(): setting %s(%d) to priority %d, inserting at %s\n",
	    (*p->p_name ? p->p_name : "unknown"), proc_getpid(p), priority, head_insert ? "head" : "tail");

	DTRACE_MEMORYSTATUS3(memorystatus_update_priority, proc_t, p, int32_t, p->p_memstat_effectivepriority, int, priority);

	/* Bucket the process currently lives in; it is unlinked from here further down. */
	old_bucket = &memstat_bucket[p->p_memstat_effectivepriority];

	/*
	 * Unless the caller asked to skip it, adjust the requested priority for
	 * processes that must land in the freezer / elevated inactive / aging band.
	 */
	if (skip_demotion_check == FALSE) {
		if (isSysProc(p)) {
			/*
			 * For system processes, the memorystatus_dirty_* routines take care of adding/removing
			 * the processes from the aging bands and balancing the demotion counts.
			 * We can, however, override that if the process has an 'elevated inactive jetsam band' attribute.
			 */

			if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) {
				/*
				 * 2 types of processes can use the non-standard elevated inactive band:
				 * - Frozen processes that always land in memorystatus_freeze_jetsam_band
				 * OR
				 * - processes that specifically opt-in to the elevated inactive support e.g. docked processes.
				 */
#if CONFIG_FREEZE
				if (p->p_memstat_state & P_MEMSTAT_FROZEN) {
					if (priority <= memorystatus_freeze_jetsam_band) {
						priority = memorystatus_freeze_jetsam_band;
					}
				} else
#endif /* CONFIG_FREEZE */
				{
					if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE) {
						priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;
					}
				}
				assert(!(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS));
			}
		} else if (isApp(p)) {
			/*
			 * Check to see if the application is being lowered in jetsam priority. If so, and:
			 * - it has an 'elevated inactive jetsam band' attribute, then put it in the appropriate band.
			 * - it is a normal application, then let it age in the aging band if that policy is in effect.
			 */

			if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) {
#if CONFIG_FREEZE
				if (p->p_memstat_state & P_MEMSTAT_FROZEN) {
					if (priority <= memorystatus_freeze_jetsam_band) {
						priority = memorystatus_freeze_jetsam_band;
					}
				} else
#endif /* CONFIG_FREEZE */
				{
					if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE) {
						priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;
					}
				}
			} else {
				if (applications_aging_band) {
					if (p->p_memstat_effectivepriority == applications_aging_band) {
						assert(old_bucket->count == (memorystatus_scheduled_idle_demotions_apps + 1));
					}

					/* Non-legacy policy: divert the app into the aging band and start its idle timer. */
					if ((jetsam_aging_policy != kJetsamAgingPolicyLegacy) && (priority <= applications_aging_band)) {
						assert(!(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS));
						priority = applications_aging_band;
						memorystatus_schedule_idle_demotion_locked(p, TRUE);
					}
				}
			}
		}
	}

	/* Anything headed into an aging band must already be marked as aging. */
	if ((system_procs_aging_band && (priority == system_procs_aging_band)) || (applications_aging_band && (priority == applications_aging_band))) {
		assert(p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS);
	}

#if DEVELOPMENT || DEBUG
	if (priority == JETSAM_PRIORITY_IDLE &&                         /* if the process is on its way into the IDLE band */
	    skip_demotion_check == FALSE &&                             /* and it isn't via the path that will set the INACTIVE memlimits */
	    (p->p_memstat_dirty & P_DIRTY_TRACK) &&                     /* and it has 'DIRTY' tracking enabled */
	    ((p->p_memstat_memlimit != p->p_memstat_memlimit_inactive) || /* and we notice that the current limit isn't the right value (inactive) */
	    ((p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL) ? (!(p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT)) : (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT)))) { /* OR type (fatal vs non-fatal) */
		printf("memorystatus_update_priority_locked: on %s with 0x%x, prio: %d and %d\n", p->p_name, p->p_memstat_state, priority, p->p_memstat_memlimit); /* then we must catch this */
	}
#endif /* DEVELOPMENT || DEBUG */

	/* Unlink from the old bucket and fix up its counters. */
	TAILQ_REMOVE(&old_bucket->list, p, p_memstat_list);
	old_bucket->count--;
	if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
		old_bucket->relaunch_high_count--;
	}

	/* Link into the destination bucket at the requested end. */
	new_bucket = &memstat_bucket[priority];
	if (head_insert) {
		TAILQ_INSERT_HEAD(&new_bucket->list, p, p_memstat_list);
	} else {
		TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list);
	}
	new_bucket->count++;
	if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
		new_bucket->relaunch_high_count++;
	}

	if (memorystatus_highwater_enabled) {
		/* Both are only read when ledger_update_needed stays TRUE, and every such path assigns them. */
		boolean_t is_fatal;
		boolean_t use_active;

		/*
		 * If cached limit data is updated, then the limits
		 * will be enforced by writing to the ledgers.
		 */
		boolean_t ledger_update_needed = TRUE;

		/*
		 * Here, we must update the cached memory limit if the task
		 * is transitioning between:
		 *      active <--> inactive
		 *	FG     <-->       BG
		 * but:
		 *	dirty  <-->    clean   is ignored
		 *
		 * We bypass non-idle processes that have opted into dirty tracking because
		 * a move between buckets does not imply a transition between the
		 * dirty <--> clean state.
		 */

		if (p->p_memstat_dirty & P_DIRTY_TRACK) {
			if (skip_demotion_check == TRUE && priority == JETSAM_PRIORITY_IDLE) {
				CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
				use_active = FALSE;
			} else {
				ledger_update_needed = FALSE;
			}
		} else if ((priority >= JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority < JETSAM_PRIORITY_FOREGROUND)) {
			/*
			 *      inactive --> active
			 *	BG       -->     FG
			 *      assign active state
			 */
			CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
			use_active = TRUE;
		} else if ((priority < JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND)) {
			/*
			 *      active --> inactive
			 *	FG     -->       BG
			 *      assign inactive state
			 */
			CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
			use_active = FALSE;
		} else {
			/*
			 * The transition between jetsam priority buckets apparently did
			 * not affect active/inactive state.
			 * This is not unusual... especially during startup when
			 * processes are getting established in their respective bands.
			 */
			ledger_update_needed = FALSE;
		}

		/*
		 * Enforce the new limits by writing to the ledger
		 */
		if (ledger_update_needed) {
			task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1, NULL, use_active, is_fatal);

			MEMORYSTATUS_DEBUG(3, "memorystatus_update_priority_locked: new limit on pid %d (%dMB %s) priority old --> new (%d --> %d) dirty?=0x%x %s\n",
			    proc_getpid(p), (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
			    (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, priority, p->p_memstat_dirty,
			    (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
		}
	}

	/*
	 * Record idle start or idle delta.
	 * p->p_memstat_effectivepriority still holds the old band at this point.
	 */
	if (p->p_memstat_effectivepriority == priority) {
		/*
		 * This process is not transitioning between
		 * jetsam priority buckets.  Do nothing.
		 */
	} else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
		uint64_t now;
		/*
		 * Transitioning out of the idle priority bucket.
		 * Record idle delta.
		 */
		assert(p->p_memstat_idle_start != 0);
		now = mach_absolute_time();
		if (now > p->p_memstat_idle_start) {
			p->p_memstat_idle_delta = now - p->p_memstat_idle_start;
		}

		/*
		 * About to become active and so memory footprint could change.
		 * So mark it eligible for freeze-considerations next time around.
		 */
		if (p->p_memstat_state & P_MEMSTAT_FREEZE_IGNORE) {
			p->p_memstat_state &= ~P_MEMSTAT_FREEZE_IGNORE;
		}
	} else if (priority == JETSAM_PRIORITY_IDLE) {
		/*
		 * Transitioning into the idle priority bucket.
		 * Record idle start.
		 */
		p->p_memstat_idle_start = mach_absolute_time();
	}

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CHANGE_PRIORITY), proc_getpid(p), priority, p->p_memstat_effectivepriority, 0, 0);

	/* Commit the new band only now; everything above compares against the old value. */
	p->p_memstat_effectivepriority = priority;

#if CONFIG_SECLUDED_MEMORY
	/* Secluded memory is permitted only while the task is at or above the foreground band. */
	if (secluded_for_apps &&
	    task_could_use_secluded_mem(p->task)) {
		task_set_can_use_secluded_mem(
			p->task,
			(priority >= JETSAM_PRIORITY_FOREGROUND));
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	memorystatus_check_levels_locked();
}
2373
2374 int
memorystatus_relaunch_flags_update(proc_t p,int relaunch_flags)2375 memorystatus_relaunch_flags_update(proc_t p, int relaunch_flags)
2376 {
2377 p->p_memstat_relaunch_flags = relaunch_flags;
2378 KDBG(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_RELAUNCH_FLAGS), proc_getpid(p), relaunch_flags, 0, 0, 0);
2379 return 0;
2380 }
2381
2382 /*
2383 *
2384 * Description: Update the jetsam priority and memory limit attributes for a given process.
2385 *
2386 * Parameters:
2387 * p init this process's jetsam information.
2388 * priority The jetsam priority band
2389 * user_data user specific data, unused by the kernel
2390 * is_assertion When true, a priority update is driven by an assertion.
2391 * effective guards against race if process's update already occurred
2392 * update_memlimit When true we know this is the init step via the posix_spawn path.
2393 *
 *      memlimit_active            Value in megabytes; The monitored footprint level while the
 *                                 process is active.  Exceeding it may result in termination
 *                                 based on its associated fatal flag.
2397 *
2398 * memlimit_active_is_fatal When a process is active and exceeds its memory footprint,
2399 * this describes whether or not it should be immediately fatal.
2400 *
 *      memlimit_inactive          Value in megabytes; The monitored footprint level while the
 *                                 process is inactive.  Exceeding it may result in termination
 *                                 based on its associated fatal flag.
2404 *
 *      memlimit_inactive_is_fatal  When a process is inactive and exceeds its memory footprint,
 *                                  this describes whether or not it should be immediately fatal.
2407 *
2408 * Returns: 0 Success
2409 * non-0 Failure
2410 */
2411
2412 int
memorystatus_update(proc_t p,int priority,uint64_t user_data,boolean_t is_assertion,boolean_t effective,boolean_t update_memlimit,int32_t memlimit_active,boolean_t memlimit_active_is_fatal,int32_t memlimit_inactive,boolean_t memlimit_inactive_is_fatal)2413 memorystatus_update(proc_t p, int priority, uint64_t user_data, boolean_t is_assertion, boolean_t effective, boolean_t update_memlimit,
2414 int32_t memlimit_active, boolean_t memlimit_active_is_fatal,
2415 int32_t memlimit_inactive, boolean_t memlimit_inactive_is_fatal)
2416 {
2417 int ret;
2418 boolean_t head_insert = false;
2419
2420 MEMORYSTATUS_DEBUG(1, "memorystatus_update: changing (%s) pid %d: priority %d, user_data 0x%llx\n", (*p->p_name ? p->p_name : "unknown"), proc_getpid(p), priority, user_data);
2421
2422 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_START, proc_getpid(p), priority, user_data, effective, 0);
2423
2424 if (priority == -1) {
2425 /* Use as shorthand for default priority */
2426 priority = JETSAM_PRIORITY_DEFAULT;
2427 } else if ((priority == system_procs_aging_band) || (priority == applications_aging_band)) {
2428 /* Both the aging bands are reserved for internal use; if requested, adjust to JETSAM_PRIORITY_IDLE. */
2429 priority = JETSAM_PRIORITY_IDLE;
2430 } else if (priority == JETSAM_PRIORITY_IDLE_HEAD) {
2431 /* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle queue */
2432 priority = JETSAM_PRIORITY_IDLE;
2433 head_insert = TRUE;
2434 } else if ((priority < 0) || (priority >= MEMSTAT_BUCKET_COUNT)) {
2435 /* Sanity check */
2436 ret = EINVAL;
2437 goto out;
2438 }
2439
2440 proc_list_lock();
2441
2442 assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));
2443
2444 if (effective && (p->p_memstat_state & P_MEMSTAT_PRIORITYUPDATED)) {
2445 ret = EALREADY;
2446 proc_list_unlock();
2447 MEMORYSTATUS_DEBUG(1, "memorystatus_update: effective change specified for pid %d, but change already occurred.\n", proc_getpid(p));
2448 goto out;
2449 }
2450
2451 if ((p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP)) || proc_list_exited(p)) {
2452 /*
2453 * This could happen when a process calling posix_spawn() is exiting on the jetsam thread.
2454 */
2455 ret = EBUSY;
2456 proc_list_unlock();
2457 goto out;
2458 }
2459
2460 p->p_memstat_state |= P_MEMSTAT_PRIORITYUPDATED;
2461 p->p_memstat_userdata = user_data;
2462
2463 if (is_assertion) {
2464 if (priority == JETSAM_PRIORITY_IDLE) {
2465 /*
2466 * Assertions relinquish control when the process is heading to IDLE.
2467 */
2468 if (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION) {
2469 /*
2470 * Mark the process as no longer being managed by assertions.
2471 */
2472 p->p_memstat_state &= ~P_MEMSTAT_PRIORITY_ASSERTION;
2473 } else {
2474 /*
2475 * Ignore an idle priority transition if the process is not
2476 * already managed by assertions. We won't treat this as
2477 * an error, but we will log the unexpected behavior and bail.
2478 */
2479 os_log(OS_LOG_DEFAULT, "memorystatus: Ignore assertion driven idle priority. Process not previously controlled %s:%d\n",
2480 (*p->p_name ? p->p_name : "unknown"), proc_getpid(p));
2481
2482 ret = 0;
2483 proc_list_unlock();
2484 goto out;
2485 }
2486 } else {
2487 /*
2488 * Process is now being managed by assertions,
2489 */
2490 p->p_memstat_state |= P_MEMSTAT_PRIORITY_ASSERTION;
2491 }
2492
2493 /* Always update the assertion priority in this path */
2494
2495 p->p_memstat_assertionpriority = priority;
2496
2497 int memstat_dirty_flags = memorystatus_dirty_get(p, TRUE); /* proc_list_lock is held */
2498
2499 if (memstat_dirty_flags != 0) {
2500 /*
2501 * Calculate maximum priority only when dirty tracking processes are involved.
2502 */
2503 int maxpriority;
2504 if (memstat_dirty_flags & PROC_DIRTY_IS_DIRTY) {
2505 maxpriority = MAX(p->p_memstat_assertionpriority, p->p_memstat_requestedpriority);
2506 } else {
2507 /* clean */
2508
2509 if (memstat_dirty_flags & PROC_DIRTY_ALLOWS_IDLE_EXIT) {
2510 /*
2511 * The aging policy must be evaluated and applied here because runnningboardd
2512 * has relinquished its hold on the jetsam priority by attempting to move a
2513 * clean process to the idle band.
2514 */
2515
2516 int newpriority = JETSAM_PRIORITY_IDLE;
2517 if ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED | P_DIRTY_IS_DIRTY)) == P_DIRTY_IDLE_EXIT_ENABLED) {
2518 newpriority = (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) ? system_procs_aging_band : JETSAM_PRIORITY_IDLE;
2519 }
2520
2521 maxpriority = MAX(p->p_memstat_assertionpriority, newpriority );
2522
2523 if (newpriority == system_procs_aging_band) {
2524 memorystatus_schedule_idle_demotion_locked(p, FALSE);
2525 }
2526 } else {
2527 /*
2528 * Preserves requestedpriority when the process does not support pressured exit.
2529 */
2530 maxpriority = MAX(p->p_memstat_assertionpriority, p->p_memstat_requestedpriority);
2531 }
2532 }
2533 priority = maxpriority;
2534 }
2535 } else {
2536 p->p_memstat_requestedpriority = priority;
2537 }
2538
2539 if (update_memlimit) {
2540 boolean_t is_fatal;
2541 boolean_t use_active;
2542
2543 /*
2544 * Posix_spawn'd processes come through this path to instantiate ledger limits.
2545 * Forked processes do not come through this path, so no ledger limits exist.
2546 * (That's why forked processes can consume unlimited memory.)
2547 */
2548
2549 MEMORYSTATUS_DEBUG(3, "memorystatus_update(enter): pid %d, priority %d, dirty=0x%x, Active(%dMB %s), Inactive(%dMB, %s)\n",
2550 proc_getpid(p), priority, p->p_memstat_dirty,
2551 memlimit_active, (memlimit_active_is_fatal ? "F " : "NF"),
2552 memlimit_inactive, (memlimit_inactive_is_fatal ? "F " : "NF"));
2553
2554 if (memlimit_active <= 0) {
2555 /*
2556 * This process will have a system_wide task limit when active.
2557 * System_wide task limit is always fatal.
2558 * It's quite common to see non-fatal flag passed in here.
2559 * It's not an error, we just ignore it.
2560 */
2561
2562 /*
2563 * For backward compatibility with some unexplained launchd behavior,
2564 * we allow a zero sized limit. But we still enforce system_wide limit
2565 * when written to the ledgers.
2566 */
2567
2568 if (memlimit_active < 0) {
2569 memlimit_active = -1; /* enforces system_wide task limit */
2570 }
2571 memlimit_active_is_fatal = TRUE;
2572 }
2573
2574 if (memlimit_inactive <= 0) {
2575 /*
2576 * This process will have a system_wide task limit when inactive.
2577 * System_wide task limit is always fatal.
2578 */
2579
2580 memlimit_inactive = -1;
2581 memlimit_inactive_is_fatal = TRUE;
2582 }
2583
2584 /*
2585 * Initialize the active limit variants for this process.
2586 */
2587 SET_ACTIVE_LIMITS_LOCKED(p, memlimit_active, memlimit_active_is_fatal);
2588
2589 /*
2590 * Initialize the inactive limit variants for this process.
2591 */
2592 SET_INACTIVE_LIMITS_LOCKED(p, memlimit_inactive, memlimit_inactive_is_fatal);
2593
2594 /*
2595 * Initialize the cached limits for target process.
2596 * When the target process is dirty tracked, it's typically
2597 * in a clean state. Non dirty tracked processes are
2598 * typically active (Foreground or above).
2599 * But just in case, we don't make assumptions...
2600 */
2601
2602 if (proc_jetsam_state_is_active_locked(p) == TRUE) {
2603 CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
2604 use_active = TRUE;
2605 } else {
2606 CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
2607 use_active = FALSE;
2608 }
2609
2610 /*
2611 * Enforce the cached limit by writing to the ledger.
2612 */
2613 if (memorystatus_highwater_enabled) {
2614 /* apply now */
2615 task_set_phys_footprint_limit_internal(p->task, ((p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1), NULL, use_active, is_fatal);
2616
2617 MEMORYSTATUS_DEBUG(3, "memorystatus_update: init: limit on pid %d (%dMB %s) targeting priority(%d) dirty?=0x%x %s\n",
2618 proc_getpid(p), (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
2619 (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), priority, p->p_memstat_dirty,
2620 (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
2621 }
2622 }
2623
2624 /*
2625 * We can't add to the aging bands buckets here.
2626 * But, we could be removing it from those buckets.
2627 * Check and take appropriate steps if so.
2628 */
2629
2630 if (isProcessInAgingBands(p)) {
2631 if ((jetsam_aging_policy != kJetsamAgingPolicyLegacy) && isApp(p) && (priority > applications_aging_band)) {
2632 /*
2633 * Runningboardd is pulling up an application that is in the aging band.
2634 * We reset the app's state here so that it'll get a fresh stay in the
2635 * aging band on the way back.
2636 *
2637 * We always handled the app 'aging' in the memorystatus_update_priority_locked()
2638 * function. Daemons used to be handled via the dirty 'set/clear/track' path.
2639 * But with extensions (daemon-app hybrid), runningboardd is now going through
2640 * this routine for daemons too and things have gotten a bit tangled. This should
2641 * be simplified/untangled at some point and might require some assistance from
2642 * runningboardd.
2643 */
2644 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
2645 } else {
2646 memorystatus_invalidate_idle_demotion_locked(p, FALSE);
2647 }
2648 memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE);
2649 } else {
2650 if (jetsam_aging_policy == kJetsamAgingPolicyLegacy && priority == JETSAM_PRIORITY_IDLE) {
2651 /*
2652 * Daemons with 'inactive' limits will go through the dirty tracking codepath.
2653 * This path deals with apps that may have 'inactive' limits e.g. WebContent processes.
2654 * If this is the legacy aging policy we explicitly need to apply those limits. If it
2655 * is any other aging policy, then we don't need to worry because all processes
2656 * will go through the aging bands and then the demotion thread will take care to
2657 * move them into the IDLE band and apply the required limits.
2658 */
2659 memorystatus_update_priority_locked(p, priority, head_insert, TRUE);
2660 }
2661 }
2662
2663 memorystatus_update_priority_locked(p, priority, head_insert, FALSE);
2664
2665 proc_list_unlock();
2666 ret = 0;
2667
2668 out:
2669 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_END, ret, 0, 0, 0, 0);
2670
2671 return ret;
2672 }
2673
2674 int
memorystatus_remove(proc_t p)2675 memorystatus_remove(proc_t p)
2676 {
2677 int ret;
2678 memstat_bucket_t *bucket;
2679 boolean_t reschedule = FALSE;
2680
2681 MEMORYSTATUS_DEBUG(1, "memorystatus_list_remove: removing pid %d\n", proc_getpid(p));
2682
2683 /*
2684 * Check if this proc is locked (because we're performing a freeze).
2685 * If so, we fail and instruct the caller to try again later.
2686 */
2687 if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
2688 return EAGAIN;
2689 }
2690
2691 assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));
2692
2693 bucket = &memstat_bucket[p->p_memstat_effectivepriority];
2694
2695 if (isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) {
2696 assert(bucket->count == memorystatus_scheduled_idle_demotions_sysprocs);
2697 reschedule = TRUE;
2698 } else if (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band)) {
2699 assert(bucket->count == memorystatus_scheduled_idle_demotions_apps);
2700 reschedule = TRUE;
2701 }
2702
2703 /*
2704 * Record idle delta
2705 */
2706
2707 if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
2708 uint64_t now = mach_absolute_time();
2709 if (now > p->p_memstat_idle_start) {
2710 p->p_memstat_idle_delta = now - p->p_memstat_idle_start;
2711 }
2712 }
2713
2714 TAILQ_REMOVE(&bucket->list, p, p_memstat_list);
2715 bucket->count--;
2716 if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
2717 bucket->relaunch_high_count--;
2718 }
2719
2720 memorystatus_list_count--;
2721
2722 /* If awaiting demotion to the idle band, clean up */
2723 if (reschedule) {
2724 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
2725 memorystatus_reschedule_idle_demotion_locked();
2726 }
2727
2728 memorystatus_check_levels_locked();
2729
2730 #if CONFIG_FREEZE
2731 if (p->p_memstat_state & (P_MEMSTAT_FROZEN)) {
2732 if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) {
2733 p->p_memstat_state &= ~P_MEMSTAT_REFREEZE_ELIGIBLE;
2734 memorystatus_refreeze_eligible_count--;
2735 }
2736
2737 memorystatus_frozen_count--;
2738 if (p->p_memstat_state & P_MEMSTAT_FROZEN_XPC_SERVICE) {
2739 memorystatus_frozen_count_xpc_service--;
2740 }
2741 if (strcmp(p->p_name, "com.apple.WebKit.WebContent") == 0) {
2742 memorystatus_frozen_count_webcontent--;
2743 }
2744 memorystatus_frozen_shared_mb -= p->p_memstat_freeze_sharedanon_pages;
2745 p->p_memstat_freeze_sharedanon_pages = 0;
2746 }
2747
2748 if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) {
2749 memorystatus_suspended_count--;
2750 }
2751 #endif
2752
2753 #if DEVELOPMENT || DEBUG
2754 if (proc_getpid(p) == memorystatus_testing_pid) {
2755 memorystatus_testing_pid = 0;
2756 }
2757 #endif /* DEVELOPMENT || DEBUG */
2758
2759 if (p) {
2760 ret = 0;
2761 } else {
2762 ret = ESRCH;
2763 }
2764
2765 return ret;
2766 }
2767
2768 /*
2769 * Validate dirty tracking flags with process state.
2770 *
2771 * Return:
2772 * 0 on success
2773 * non-0 on failure
2774 *
2775 * The proc_list_lock is held by the caller.
2776 */
2777
2778 static int
memorystatus_validate_track_flags(struct proc * target_p,uint32_t pcontrol)2779 memorystatus_validate_track_flags(struct proc *target_p, uint32_t pcontrol)
2780 {
2781 /* See that the process isn't marked for termination */
2782 if (target_p->p_memstat_dirty & P_DIRTY_TERMINATED) {
2783 return EBUSY;
2784 }
2785
2786 /* Idle exit requires that process be tracked */
2787 if ((pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) &&
2788 !(pcontrol & PROC_DIRTY_TRACK)) {
2789 return EINVAL;
2790 }
2791
2792 /* 'Launch in progress' tracking requires that process have enabled dirty tracking too. */
2793 if ((pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) &&
2794 !(pcontrol & PROC_DIRTY_TRACK)) {
2795 return EINVAL;
2796 }
2797
2798 /* Only one type of DEFER behavior is allowed.*/
2799 if ((pcontrol & PROC_DIRTY_DEFER) &&
2800 (pcontrol & PROC_DIRTY_DEFER_ALWAYS)) {
2801 return EINVAL;
2802 }
2803
2804 /* Deferral is only relevant if idle exit is specified */
2805 if (((pcontrol & PROC_DIRTY_DEFER) ||
2806 (pcontrol & PROC_DIRTY_DEFER_ALWAYS)) &&
2807 !(pcontrol & PROC_DIRTY_ALLOWS_IDLE_EXIT)) {
2808 return EINVAL;
2809 }
2810
2811 return 0;
2812 }
2813
2814 static void
memorystatus_update_idle_priority_locked(proc_t p)2815 memorystatus_update_idle_priority_locked(proc_t p)
2816 {
2817 int32_t priority;
2818
2819 MEMORYSTATUS_DEBUG(1, "memorystatus_update_idle_priority_locked(): pid %d dirty 0x%X\n", proc_getpid(p), p->p_memstat_dirty);
2820
2821 assert(isSysProc(p));
2822
2823 if ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED | P_DIRTY_IS_DIRTY)) == P_DIRTY_IDLE_EXIT_ENABLED) {
2824 priority = (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) ? system_procs_aging_band : JETSAM_PRIORITY_IDLE;
2825 } else {
2826 priority = p->p_memstat_requestedpriority;
2827 }
2828
2829 if (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION) {
2830 /*
2831 * This process has a jetsam priority managed by an assertion.
2832 * Policy is to choose the max priority.
2833 */
2834 if (p->p_memstat_assertionpriority > priority) {
2835 os_log(OS_LOG_DEFAULT, "memorystatus: assertion priority %d overrides priority %d for %s:%d\n",
2836 p->p_memstat_assertionpriority, priority,
2837 (*p->p_name ? p->p_name : "unknown"), proc_getpid(p));
2838 priority = p->p_memstat_assertionpriority;
2839 }
2840 }
2841
2842 if (priority != p->p_memstat_effectivepriority) {
2843 if ((jetsam_aging_policy == kJetsamAgingPolicyLegacy) &&
2844 (priority == JETSAM_PRIORITY_IDLE)) {
2845 /*
2846 * This process is on its way into the IDLE band. The system is
2847 * using 'legacy' jetsam aging policy. That means, this process
2848 * has already used up its idle-deferral aging time that is given
2849 * once per its lifetime. So we need to set the INACTIVE limits
2850 * explicitly because it won't be going through the demotion paths
2851 * that take care to apply the limits appropriately.
2852 */
2853
2854 if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) {
2855 /*
2856 * This process has the 'elevated inactive jetsam band' attribute.
2857 * So, there will be no trip to IDLE after all.
2858 * Instead, we pin the process in the elevated band,
2859 * where its ACTIVE limits will apply.
2860 */
2861
2862 priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;
2863 }
2864
2865 memorystatus_update_priority_locked(p, priority, false, true);
2866 } else {
2867 memorystatus_update_priority_locked(p, priority, false, false);
2868 }
2869 }
2870 }
2871
2872 /*
2873 * Processes can opt to have their state tracked by the kernel, indicating when they are busy (dirty) or idle
2874 * (clean). They may also indicate that they support termination when idle, with the result that they are promoted
2875 * to their desired, higher, jetsam priority when dirty (and are therefore killed later), and demoted to the low
2876 * priority idle band when clean (and killed earlier, protecting higher priority procesess).
2877 *
2878 * If the deferral flag is set, then newly tracked processes will be protected for an initial period (as determined by
2879 * memorystatus_sysprocs_idle_delay_time); if they go clean during this time, then they will be moved to a deferred-idle band
2880 * with a slightly higher priority, guarding against immediate termination under memory pressure and being unable to
2881 * make forward progress. Finally, when the guard expires, they will be moved to the standard, lowest-priority, idle
2882 * band. The deferral can be cleared early by clearing the appropriate flag.
2883 *
2884 * The deferral timer is active only for the duration that the process is marked as guarded and clean; if the process
2885 * is marked dirty, the timer will be cancelled. Upon being subsequently marked clean, the deferment will either be
2886 * re-enabled or the guard state cleared, depending on whether the guard deadline has passed.
2887 */
2888
2889 int
memorystatus_dirty_track(proc_t p,uint32_t pcontrol)2890 memorystatus_dirty_track(proc_t p, uint32_t pcontrol)
2891 {
2892 unsigned int old_dirty;
2893 boolean_t reschedule = FALSE;
2894 boolean_t already_deferred = FALSE;
2895 boolean_t defer_now = FALSE;
2896 int ret = 0;
2897
2898 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_TRACK),
2899 proc_getpid(p), p->p_memstat_dirty, pcontrol, 0, 0);
2900
2901 proc_list_lock();
2902
2903 if (proc_list_exited(p)) {
2904 /*
2905 * Process is on its way out.
2906 */
2907 ret = EBUSY;
2908 goto exit;
2909 }
2910
2911 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
2912 ret = EPERM;
2913 goto exit;
2914 }
2915
2916 if ((ret = memorystatus_validate_track_flags(p, pcontrol)) != 0) {
2917 /* error */
2918 goto exit;
2919 }
2920
2921 old_dirty = p->p_memstat_dirty;
2922
2923 /* These bits are cumulative, as per <rdar://problem/11159924> */
2924 if (pcontrol & PROC_DIRTY_TRACK) {
2925 p->p_memstat_dirty |= P_DIRTY_TRACK;
2926 }
2927
2928 if (pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) {
2929 p->p_memstat_dirty |= P_DIRTY_ALLOW_IDLE_EXIT;
2930 }
2931
2932 if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
2933 p->p_memstat_dirty |= P_DIRTY_LAUNCH_IN_PROGRESS;
2934 }
2935
2936 if (old_dirty & P_DIRTY_AGING_IN_PROGRESS) {
2937 already_deferred = TRUE;
2938 }
2939
2940
2941 /* This can be set and cleared exactly once. */
2942 if (pcontrol & (PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) {
2943 if ((pcontrol & (PROC_DIRTY_DEFER)) &&
2944 !(old_dirty & P_DIRTY_DEFER)) {
2945 p->p_memstat_dirty |= P_DIRTY_DEFER;
2946 }
2947
2948 if ((pcontrol & (PROC_DIRTY_DEFER_ALWAYS)) &&
2949 !(old_dirty & P_DIRTY_DEFER_ALWAYS)) {
2950 p->p_memstat_dirty |= P_DIRTY_DEFER_ALWAYS;
2951 }
2952
2953 defer_now = TRUE;
2954 }
2955
2956 MEMORYSTATUS_DEBUG(1, "memorystatus_on_track_dirty(): set idle-exit %s / defer %s / dirty %s for pid %d\n",
2957 ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) ? "Y" : "N",
2958 defer_now ? "Y" : "N",
2959 p->p_memstat_dirty & P_DIRTY ? "Y" : "N",
2960 proc_getpid(p));
2961
2962 /* Kick off or invalidate the idle exit deferment if there's a state transition. */
2963 if (!(p->p_memstat_dirty & P_DIRTY_IS_DIRTY)) {
2964 if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) {
2965 if (defer_now && !already_deferred) {
2966 /*
2967 * Request to defer a clean process that's idle-exit enabled
2968 * and not already in the jetsam deferred band. Most likely a
2969 * new launch.
2970 */
2971 memorystatus_schedule_idle_demotion_locked(p, TRUE);
2972 reschedule = TRUE;
2973 } else if (!defer_now) {
2974 /*
2975 * The process isn't asking for the 'aging' facility.
2976 * Could be that it is:
2977 */
2978
2979 if (already_deferred) {
2980 /*
2981 * already in the aging bands. Traditionally,
2982 * some processes have tried to use this to
2983 * opt out of the 'aging' facility.
2984 */
2985
2986 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
2987 } else {
2988 /*
2989 * agnostic to the 'aging' facility. In that case,
2990 * we'll go ahead and opt it in because this is likely
2991 * a new launch (clean process, dirty tracking enabled)
2992 */
2993
2994 memorystatus_schedule_idle_demotion_locked(p, TRUE);
2995 }
2996
2997 reschedule = TRUE;
2998 }
2999 }
3000 } else {
3001 /*
3002 * We are trying to operate on a dirty process. Dirty processes have to
3003 * be removed from the deferred band. The question is do we reset the
3004 * deferred state or not?
3005 *
3006 * This could be a legal request like:
3007 * - this process had opted into the 'aging' band
3008 * - but it's now dirty and requests to opt out.
3009 * In this case, we remove the process from the band and reset its
3010 * state too. It'll opt back in properly when needed.
3011 *
3012 * OR, this request could be a user-space bug. E.g.:
3013 * - this process had opted into the 'aging' band when clean
3014 * - and, then issues another request to again put it into the band except
3015 * this time the process is dirty.
3016 * The process going dirty, as a transition in memorystatus_dirty_set(), will pull the process out of
3017 * the deferred band with its state intact. So our request below is no-op.
3018 * But we do it here anyways for coverage.
3019 *
3020 * memorystatus_update_idle_priority_locked()
3021 * single-mindedly treats a dirty process as "cannot be in the aging band".
3022 */
3023
3024 if (!defer_now && already_deferred) {
3025 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
3026 reschedule = TRUE;
3027 } else {
3028 boolean_t reset_state = (jetsam_aging_policy != kJetsamAgingPolicyLegacy) ? TRUE : FALSE;
3029
3030 memorystatus_invalidate_idle_demotion_locked(p, reset_state);
3031 reschedule = TRUE;
3032 }
3033 }
3034
3035 memorystatus_update_idle_priority_locked(p);
3036
3037 if (reschedule) {
3038 memorystatus_reschedule_idle_demotion_locked();
3039 }
3040
3041 ret = 0;
3042
3043 exit:
3044 proc_list_unlock();
3045
3046 return ret;
3047 }
3048
3049 int
memorystatus_dirty_set(proc_t p,boolean_t self,uint32_t pcontrol)3050 memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol)
3051 {
3052 int ret;
3053 boolean_t kill = false;
3054 boolean_t reschedule = FALSE;
3055 boolean_t was_dirty = FALSE;
3056 boolean_t now_dirty = FALSE;
3057
3058 MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_set(): %d %d 0x%x 0x%x\n", self, proc_getpid(p), pcontrol, p->p_memstat_dirty);
3059 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_SET), proc_getpid(p), self, pcontrol, 0, 0);
3060
3061 proc_list_lock();
3062
3063 if (proc_list_exited(p)) {
3064 /*
3065 * Process is on its way out.
3066 */
3067 ret = EBUSY;
3068 goto exit;
3069 }
3070
3071 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
3072 ret = EPERM;
3073 goto exit;
3074 }
3075
3076 if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
3077 was_dirty = TRUE;
3078 }
3079
3080 if (!(p->p_memstat_dirty & P_DIRTY_TRACK)) {
3081 /* Dirty tracking not enabled */
3082 ret = EINVAL;
3083 } else if (pcontrol && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
3084 /*
3085 * Process is set to be terminated and we're attempting to mark it dirty.
3086 * Set for termination and marking as clean is OK - see <rdar://problem/10594349>.
3087 */
3088 ret = EBUSY;
3089 } else {
3090 int flag = (self == TRUE) ? P_DIRTY : P_DIRTY_SHUTDOWN;
3091 if (pcontrol && !(p->p_memstat_dirty & flag)) {
3092 /* Mark the process as having been dirtied at some point */
3093 p->p_memstat_dirty |= (flag | P_DIRTY_MARKED);
3094 memorystatus_dirty_count++;
3095 ret = 0;
3096 } else if ((pcontrol == 0) && (p->p_memstat_dirty & flag)) {
3097 if ((flag == P_DIRTY_SHUTDOWN) && (!(p->p_memstat_dirty & P_DIRTY))) {
3098 /* Clearing the dirty shutdown flag, and the process is otherwise clean - kill */
3099 p->p_memstat_dirty |= P_DIRTY_TERMINATED;
3100 kill = true;
3101 } else if ((flag == P_DIRTY) && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
3102 /* Kill previously terminated processes if set clean */
3103 kill = true;
3104 }
3105 p->p_memstat_dirty &= ~flag;
3106 memorystatus_dirty_count--;
3107 ret = 0;
3108 } else {
3109 /* Already set */
3110 ret = EALREADY;
3111 }
3112 }
3113
3114 if (ret != 0) {
3115 goto exit;
3116 }
3117
3118 if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
3119 now_dirty = TRUE;
3120 }
3121
3122 if ((was_dirty == TRUE && now_dirty == FALSE) ||
3123 (was_dirty == FALSE && now_dirty == TRUE)) {
3124 /* Manage idle exit deferral, if applied */
3125 if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) {
3126 /*
3127 * Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band OR it might be heading back
3128 * there once it's clean again. For the legacy case, this only applies if it has some protection window left.
3129 * P_DIRTY_DEFER: one-time protection window given at launch
3130 * P_DIRTY_DEFER_ALWAYS: protection window given for every dirty->clean transition. Like non-legacy mode.
3131 *
3132 * Non-Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band. It will always stop over
3133 * in that band on it's way to IDLE.
3134 */
3135
3136 if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
3137 /*
3138 * New dirty process i.e. "was_dirty == FALSE && now_dirty == TRUE"
3139 *
3140 * The process will move from its aging band to its higher requested
3141 * jetsam band.
3142 */
3143 boolean_t reset_state = (jetsam_aging_policy != kJetsamAgingPolicyLegacy) ? TRUE : FALSE;
3144
3145 memorystatus_invalidate_idle_demotion_locked(p, reset_state);
3146 reschedule = TRUE;
3147 } else {
3148 /*
3149 * Process is back from "dirty" to "clean".
3150 */
3151
3152 if (jetsam_aging_policy == kJetsamAgingPolicyLegacy) {
3153 if (((p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) == FALSE) &&
3154 (mach_absolute_time() >= p->p_memstat_idledeadline)) {
3155 /*
3156 * The process' hasn't enrolled in the "always defer after dirty"
3157 * mode and its deadline has expired. It currently
3158 * does not reside in any of the aging buckets.
3159 *
3160 * It's on its way to the JETSAM_PRIORITY_IDLE
3161 * bucket via memorystatus_update_idle_priority_locked()
3162 * below.
3163 *
3164 * So all we need to do is reset all the state on the
3165 * process that's related to the aging bucket i.e.
3166 * the AGING_IN_PROGRESS flag and the timer deadline.
3167 */
3168
3169 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
3170 reschedule = TRUE;
3171 } else {
3172 /*
3173 * Process enrolled in "always stop in deferral band after dirty" OR
3174 * it still has some protection window left and so
3175 * we just re-arm the timer without modifying any
3176 * state on the process iff it still wants into that band.
3177 */
3178
3179 if (p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) {
3180 memorystatus_schedule_idle_demotion_locked(p, TRUE);
3181 reschedule = TRUE;
3182 } else if (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) {
3183 memorystatus_schedule_idle_demotion_locked(p, FALSE);
3184 reschedule = TRUE;
3185 }
3186 }
3187 } else {
3188 memorystatus_schedule_idle_demotion_locked(p, TRUE);
3189 reschedule = TRUE;
3190 }
3191 }
3192 }
3193
3194 memorystatus_update_idle_priority_locked(p);
3195
3196 if (memorystatus_highwater_enabled) {
3197 boolean_t ledger_update_needed = TRUE;
3198 boolean_t use_active;
3199 boolean_t is_fatal;
3200 /*
3201 * We are in this path because this process transitioned between
3202 * dirty <--> clean state. Update the cached memory limits.
3203 */
3204
3205 if (proc_jetsam_state_is_active_locked(p) == TRUE) {
3206 /*
3207 * process is pinned in elevated band
3208 * or
3209 * process is dirty
3210 */
3211 CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
3212 use_active = TRUE;
3213 ledger_update_needed = TRUE;
3214 } else {
3215 /*
3216 * process is clean...but if it has opted into pressured-exit
3217 * we don't apply the INACTIVE limit till the process has aged
3218 * out and is entering the IDLE band.
3219 * See memorystatus_update_priority_locked() for that.
3220 */
3221
3222 if (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) {
3223 ledger_update_needed = FALSE;
3224 } else {
3225 CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
3226 use_active = FALSE;
3227 ledger_update_needed = TRUE;
3228 }
3229 }
3230
3231 /*
3232 * Enforce the new limits by writing to the ledger.
3233 *
3234 * This is a hot path and holding the proc_list_lock while writing to the ledgers,
3235 * (where the task lock is taken) is bad. So, we temporarily drop the proc_list_lock.
3236 * We aren't traversing the jetsam bucket list here, so we should be safe.
3237 * See rdar://21394491.
3238 */
3239
3240 if (ledger_update_needed && proc_ref(p, true) == p) {
3241 int ledger_limit;
3242 if (p->p_memstat_memlimit > 0) {
3243 ledger_limit = p->p_memstat_memlimit;
3244 } else {
3245 ledger_limit = -1;
3246 }
3247 proc_list_unlock();
3248 task_set_phys_footprint_limit_internal(p->task, ledger_limit, NULL, use_active, is_fatal);
3249 proc_list_lock();
3250 proc_rele(p);
3251
3252 MEMORYSTATUS_DEBUG(3, "memorystatus_dirty_set: new limit on pid %d (%dMB %s) priority(%d) dirty?=0x%x %s\n",
3253 proc_getpid(p), (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
3254 (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, p->p_memstat_dirty,
3255 (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
3256 }
3257 }
3258
3259 /* If the deferral state changed, reschedule the demotion timer */
3260 if (reschedule) {
3261 memorystatus_reschedule_idle_demotion_locked();
3262 }
3263
3264 /* Settle dirty time in ledger, and update transition timestamp */
3265 task_t t = proc_task(p);
3266 if (was_dirty) {
3267 task_ledger_settle_dirty_time(t);
3268 task_set_dirty_start(t, 0);
3269 } else {
3270 task_set_dirty_start(t, mach_absolute_time());
3271 }
3272 }
3273
3274 if (kill) {
3275 if (proc_ref(p, true) == p) {
3276 proc_list_unlock();
3277 psignal(p, SIGKILL);
3278 proc_list_lock();
3279 proc_rele(p);
3280 }
3281 }
3282
3283 exit:
3284 proc_list_unlock();
3285
3286 return ret;
3287 }
3288
3289 int
memorystatus_dirty_clear(proc_t p,uint32_t pcontrol)3290 memorystatus_dirty_clear(proc_t p, uint32_t pcontrol)
3291 {
3292 int ret = 0;
3293
3294 MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_clear(): %d 0x%x 0x%x\n", proc_getpid(p), pcontrol, p->p_memstat_dirty);
3295
3296 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_CLEAR), proc_getpid(p), pcontrol, 0, 0, 0);
3297
3298 proc_list_lock();
3299
3300 if (proc_list_exited(p)) {
3301 /*
3302 * Process is on its way out.
3303 */
3304 ret = EBUSY;
3305 goto exit;
3306 }
3307
3308 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
3309 ret = EPERM;
3310 goto exit;
3311 }
3312
3313 if (!(p->p_memstat_dirty & P_DIRTY_TRACK)) {
3314 /* Dirty tracking not enabled */
3315 ret = EINVAL;
3316 goto exit;
3317 }
3318
3319 if (!pcontrol || (pcontrol & (PROC_DIRTY_LAUNCH_IN_PROGRESS | PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) == 0) {
3320 ret = EINVAL;
3321 goto exit;
3322 }
3323
3324 if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
3325 p->p_memstat_dirty &= ~P_DIRTY_LAUNCH_IN_PROGRESS;
3326 }
3327
3328 /* This can be set and cleared exactly once. */
3329 if (pcontrol & (PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) {
3330 if (p->p_memstat_dirty & P_DIRTY_DEFER) {
3331 p->p_memstat_dirty &= ~(P_DIRTY_DEFER);
3332 }
3333
3334 if (p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) {
3335 p->p_memstat_dirty &= ~(P_DIRTY_DEFER_ALWAYS);
3336 }
3337
3338 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
3339 memorystatus_update_idle_priority_locked(p);
3340 memorystatus_reschedule_idle_demotion_locked();
3341 }
3342
3343 ret = 0;
3344 exit:
3345 proc_list_unlock();
3346
3347 return ret;
3348 }
3349
3350 int
memorystatus_dirty_get(proc_t p,boolean_t locked)3351 memorystatus_dirty_get(proc_t p, boolean_t locked)
3352 {
3353 int ret = 0;
3354
3355 if (!locked) {
3356 proc_list_lock();
3357 }
3358
3359 if (p->p_memstat_dirty & P_DIRTY_TRACK) {
3360 ret |= PROC_DIRTY_TRACKED;
3361 if (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) {
3362 ret |= PROC_DIRTY_ALLOWS_IDLE_EXIT;
3363 }
3364 if (p->p_memstat_dirty & P_DIRTY) {
3365 ret |= PROC_DIRTY_IS_DIRTY;
3366 }
3367 if (p->p_memstat_dirty & P_DIRTY_LAUNCH_IN_PROGRESS) {
3368 ret |= PROC_DIRTY_LAUNCH_IS_IN_PROGRESS;
3369 }
3370 }
3371
3372 if (!locked) {
3373 proc_list_unlock();
3374 }
3375
3376 return ret;
3377 }
3378
3379 int
memorystatus_on_terminate(proc_t p)3380 memorystatus_on_terminate(proc_t p)
3381 {
3382 int sig;
3383
3384 proc_list_lock();
3385
3386 p->p_memstat_dirty |= P_DIRTY_TERMINATED;
3387
3388 if (((p->p_memstat_dirty & (P_DIRTY_TRACK | P_DIRTY_IS_DIRTY)) == P_DIRTY_TRACK) ||
3389 (p->p_memstat_state & P_MEMSTAT_SUSPENDED)) {
3390 /*
3391 * Mark as terminated and issue SIGKILL if:-
3392 * - process is clean, or,
3393 * - if process is dirty but suspended. This case is likely
3394 * an extension because apps don't opt into dirty-tracking
3395 * and daemons aren't suspended.
3396 */
3397 #if DEVELOPMENT || DEBUG
3398 if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) {
3399 os_log(OS_LOG_DEFAULT, "memorystatus: sending suspended process %s (pid %d) SIGKILL",
3400 (*p->p_name ? p->p_name : "unknown"), proc_getpid(p));
3401 }
3402 #endif /* DEVELOPMENT || DEBUG */
3403 sig = SIGKILL;
3404 } else {
3405 /* Dirty, terminated, or state tracking is unsupported; issue SIGTERM to allow cleanup */
3406 sig = SIGTERM;
3407 }
3408
3409 proc_list_unlock();
3410
3411 return sig;
3412 }
3413
3414 void
memorystatus_on_suspend(proc_t p)3415 memorystatus_on_suspend(proc_t p)
3416 {
3417 #if CONFIG_FREEZE
3418 uint32_t pages;
3419 memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL);
3420 #endif
3421 proc_list_lock();
3422 #if CONFIG_FREEZE
3423 memorystatus_suspended_count++;
3424 #endif
3425 p->p_memstat_state |= P_MEMSTAT_SUSPENDED;
3426
3427 /* Check if proc is marked for termination */
3428 bool kill_process = !!(p->p_memstat_dirty & P_DIRTY_TERMINATED);
3429 proc_list_unlock();
3430
3431 if (kill_process) {
3432 psignal(p, SIGKILL);
3433 }
3434 }
3435
3436 extern uint64_t memorystatus_thaw_count_since_boot;
3437
3438 void
memorystatus_on_resume(proc_t p)3439 memorystatus_on_resume(proc_t p)
3440 {
3441 #if CONFIG_FREEZE
3442 boolean_t frozen;
3443 pid_t pid;
3444 #endif
3445
3446 proc_list_lock();
3447
3448 #if CONFIG_FREEZE
3449 frozen = (p->p_memstat_state & P_MEMSTAT_FROZEN);
3450 if (frozen) {
3451 /*
3452 * Now that we don't _thaw_ a process completely,
3453 * resuming it (and having some on-demand swapins)
3454 * shouldn't preclude it from being counted as frozen.
3455 *
3456 * memorystatus_frozen_count--;
3457 *
3458 * We preserve the P_MEMSTAT_FROZEN state since the process
3459 * could have state on disk AND so will deserve some protection
3460 * in the jetsam bands.
3461 */
3462 if ((p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) == 0) {
3463 p->p_memstat_state |= P_MEMSTAT_REFREEZE_ELIGIBLE;
3464 memorystatus_refreeze_eligible_count++;
3465 }
3466 if (p->p_memstat_thaw_count == 0 || p->p_memstat_last_thaw_interval < memorystatus_freeze_current_interval) {
3467 os_atomic_inc(&(memorystatus_freezer_stats.mfs_processes_thawed), relaxed);
3468 if (strcmp(p->p_name, "com.apple.WebKit.WebContent") == 0) {
3469 os_atomic_inc(&(memorystatus_freezer_stats.mfs_processes_thawed_webcontent), relaxed);
3470 }
3471 }
3472 p->p_memstat_last_thaw_interval = memorystatus_freeze_current_interval;
3473 p->p_memstat_thaw_count++;
3474
3475 memorystatus_thaw_count++;
3476 memorystatus_thaw_count_since_boot++;
3477 }
3478
3479 memorystatus_suspended_count--;
3480
3481 pid = proc_getpid(p);
3482 #endif
3483
3484 /*
3485 * P_MEMSTAT_FROZEN will remain unchanged. This used to be:
3486 * p->p_memstat_state &= ~(P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN);
3487 */
3488 p->p_memstat_state &= ~P_MEMSTAT_SUSPENDED;
3489
3490 proc_list_unlock();
3491
3492 #if CONFIG_FREEZE
3493 if (frozen) {
3494 memorystatus_freeze_entry_t data = { pid, FALSE, 0 };
3495 memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
3496 }
3497 #endif
3498 }
3499
3500 void
memorystatus_on_inactivity(proc_t p)3501 memorystatus_on_inactivity(proc_t p)
3502 {
3503 #pragma unused(p)
3504 #if CONFIG_FREEZE
3505 /* Wake the freeze thread */
3506 thread_wakeup((event_t)&memorystatus_freeze_wakeup);
3507 #endif
3508 }
3509
3510 /*
3511 * The proc_list_lock is held by the caller.
3512 */
3513 static uint32_t
memorystatus_build_state(proc_t p)3514 memorystatus_build_state(proc_t p)
3515 {
3516 uint32_t snapshot_state = 0;
3517
3518 /* General */
3519 if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) {
3520 snapshot_state |= kMemorystatusSuspended;
3521 }
3522 if (p->p_memstat_state & P_MEMSTAT_FROZEN) {
3523 snapshot_state |= kMemorystatusFrozen;
3524 }
3525 if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) {
3526 snapshot_state |= kMemorystatusWasThawed;
3527 }
3528 if (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION) {
3529 snapshot_state |= kMemorystatusAssertion;
3530 }
3531
3532 /* Tracking */
3533 if (p->p_memstat_dirty & P_DIRTY_TRACK) {
3534 snapshot_state |= kMemorystatusTracked;
3535 }
3536 if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) {
3537 snapshot_state |= kMemorystatusSupportsIdleExit;
3538 }
3539 if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
3540 snapshot_state |= kMemorystatusDirty;
3541 }
3542
3543 return snapshot_state;
3544 }
3545
3546 static boolean_t
kill_idle_exit_proc(void)3547 kill_idle_exit_proc(void)
3548 {
3549 proc_t p, victim_p = PROC_NULL;
3550 uint64_t current_time, footprint_of_killed_proc;
3551 boolean_t killed = FALSE;
3552 unsigned int i = 0;
3553 os_reason_t jetsam_reason = OS_REASON_NULL;
3554
3555 /* Pick next idle exit victim. */
3556 current_time = mach_absolute_time();
3557
3558 jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_IDLE_EXIT);
3559 if (jetsam_reason == OS_REASON_NULL) {
3560 printf("kill_idle_exit_proc: failed to allocate jetsam reason\n");
3561 }
3562
3563 proc_list_lock();
3564
3565 p = memorystatus_get_first_proc_locked(&i, FALSE);
3566 while (p) {
3567 /* No need to look beyond the idle band */
3568 if (p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) {
3569 break;
3570 }
3571
3572 if ((p->p_memstat_dirty & (P_DIRTY_ALLOW_IDLE_EXIT | P_DIRTY_IS_DIRTY | P_DIRTY_TERMINATED)) == (P_DIRTY_ALLOW_IDLE_EXIT)) {
3573 if (current_time >= p->p_memstat_idledeadline) {
3574 p->p_memstat_dirty |= P_DIRTY_TERMINATED;
3575 victim_p = proc_ref(p, true);
3576 break;
3577 }
3578 }
3579
3580 p = memorystatus_get_next_proc_locked(&i, p, FALSE);
3581 }
3582
3583 proc_list_unlock();
3584
3585 if (victim_p) {
3586 printf("memorystatus: killing_idle_process pid %d [%s] jetsam_reason->osr_code: %llu\n", proc_getpid(victim_p), (*victim_p->p_name ? victim_p->p_name : "unknown"), jetsam_reason->osr_code);
3587 killed = memorystatus_do_kill(victim_p, kMemorystatusKilledIdleExit, jetsam_reason, &footprint_of_killed_proc);
3588 proc_rele(victim_p);
3589 } else {
3590 os_reason_free(jetsam_reason);
3591 }
3592
3593 return killed;
3594 }
3595
3596 static void
memorystatus_thread_wake(void)3597 memorystatus_thread_wake(void)
3598 {
3599 int thr_id = 0;
3600 int active_thr = atomic_load(&active_jetsam_threads);
3601
3602 /* Wakeup all the jetsam threads */
3603 for (thr_id = 0; thr_id < active_thr; thr_id++) {
3604 thread_wakeup((event_t)&jetsam_threads[thr_id].memorystatus_wakeup);
3605 }
3606 }
3607
3608 #if CONFIG_JETSAM
3609
3610 static void
memorystatus_thread_pool_max()3611 memorystatus_thread_pool_max()
3612 {
3613 /* Increase the jetsam thread pool to max_jetsam_threads */
3614 int max_threads = max_jetsam_threads;
3615 printf("Expanding memorystatus pool to %d!\n", max_threads);
3616 atomic_store(&active_jetsam_threads, max_threads);
3617 }
3618
3619 static void
memorystatus_thread_pool_default()3620 memorystatus_thread_pool_default()
3621 {
3622 /* Restore the jetsam thread pool to a single thread */
3623 printf("Reverting memorystatus pool back to 1\n");
3624 atomic_store(&active_jetsam_threads, 1);
3625 }
3626
3627 #endif /* CONFIG_JETSAM */
3628
3629 extern void vm_pressure_response(void);
3630
3631 static int
memorystatus_thread_block(uint32_t interval_ms,thread_continue_t continuation)3632 memorystatus_thread_block(uint32_t interval_ms, thread_continue_t continuation)
3633 {
3634 struct jetsam_thread_state *jetsam_thread = jetsam_current_thread();
3635
3636 assert(jetsam_thread != NULL);
3637 if (interval_ms) {
3638 assert_wait_timeout(&jetsam_thread->memorystatus_wakeup, THREAD_UNINT, interval_ms, NSEC_PER_MSEC);
3639 } else {
3640 assert_wait(&jetsam_thread->memorystatus_wakeup, THREAD_UNINT);
3641 }
3642
3643 return thread_block(continuation);
3644 }
3645
3646 static boolean_t
memorystatus_avail_pages_below_pressure(void)3647 memorystatus_avail_pages_below_pressure(void)
3648 {
3649 #if CONFIG_JETSAM
3650 return memorystatus_available_pages <= memorystatus_available_pages_pressure;
3651 #else /* CONFIG_JETSAM */
3652 return FALSE;
3653 #endif /* CONFIG_JETSAM */
3654 }
3655
3656 static boolean_t
memorystatus_avail_pages_below_critical(void)3657 memorystatus_avail_pages_below_critical(void)
3658 {
3659 #if CONFIG_JETSAM
3660 return memorystatus_available_pages <= memorystatus_available_pages_critical;
3661 #else /* CONFIG_JETSAM */
3662 return FALSE;
3663 #endif /* CONFIG_JETSAM */
3664 }
3665
3666 static boolean_t
memorystatus_post_snapshot(int32_t priority,uint32_t cause)3667 memorystatus_post_snapshot(int32_t priority, uint32_t cause)
3668 {
3669 boolean_t is_idle_priority;
3670
3671 if (jetsam_aging_policy == kJetsamAgingPolicyLegacy) {
3672 is_idle_priority = (priority == JETSAM_PRIORITY_IDLE);
3673 } else {
3674 is_idle_priority = (priority == JETSAM_PRIORITY_IDLE || priority == JETSAM_PRIORITY_IDLE_DEFERRED);
3675 }
3676 #if CONFIG_JETSAM
3677 #pragma unused(cause)
3678 /*
3679 * Don't generate logs for steady-state idle-exit kills,
3680 * unless it is overridden for debug or by the device
3681 * tree.
3682 */
3683
3684 return !is_idle_priority || memorystatus_idle_snapshot;
3685
3686 #else /* CONFIG_JETSAM */
3687 /*
3688 * Don't generate logs for steady-state idle-exit kills,
3689 * unless
3690 * - it is overridden for debug or by the device
3691 * tree.
3692 * OR
3693 * - the kill causes are important i.e. not kMemorystatusKilledIdleExit
3694 */
3695
3696 boolean_t snapshot_eligible_kill_cause = (is_reason_thrashing(cause) || is_reason_zone_map_exhaustion(cause));
3697 return !is_idle_priority || memorystatus_idle_snapshot || snapshot_eligible_kill_cause;
3698 #endif /* CONFIG_JETSAM */
3699 }
3700
3701 static boolean_t
memorystatus_action_needed(void)3702 memorystatus_action_needed(void)
3703 {
3704 #if CONFIG_JETSAM
3705 return is_reason_thrashing(kill_under_pressure_cause) ||
3706 is_reason_zone_map_exhaustion(kill_under_pressure_cause) ||
3707 memorystatus_available_pages <= memorystatus_available_pages_pressure;
3708 #else /* CONFIG_JETSAM */
3709 return is_reason_thrashing(kill_under_pressure_cause) ||
3710 is_reason_zone_map_exhaustion(kill_under_pressure_cause);
3711 #endif /* CONFIG_JETSAM */
3712 }
3713
3714 static boolean_t
memorystatus_act_on_hiwat_processes(uint32_t * errors,uint32_t * hwm_kill,boolean_t * post_snapshot,__unused boolean_t * is_critical,uint64_t * memory_reclaimed)3715 memorystatus_act_on_hiwat_processes(uint32_t *errors, uint32_t *hwm_kill, boolean_t *post_snapshot, __unused boolean_t *is_critical, uint64_t *memory_reclaimed)
3716 {
3717 boolean_t purged = FALSE, killed = FALSE;
3718
3719 *memory_reclaimed = 0;
3720 killed = memorystatus_kill_hiwat_proc(errors, &purged, memory_reclaimed);
3721
3722 if (killed) {
3723 *hwm_kill = *hwm_kill + 1;
3724 *post_snapshot = TRUE;
3725 return TRUE;
3726 } else {
3727 if (purged == FALSE) {
3728 /* couldn't purge and couldn't kill */
3729 memorystatus_hwm_candidates = FALSE;
3730 }
3731 }
3732
3733 #if CONFIG_JETSAM
3734 /* No highwater processes to kill. Continue or stop for now? */
3735 if (!is_reason_thrashing(kill_under_pressure_cause) &&
3736 !is_reason_zone_map_exhaustion(kill_under_pressure_cause) &&
3737 (memorystatus_available_pages > memorystatus_available_pages_critical)) {
3738 /*
3739 * We are _not_ out of pressure but we are above the critical threshold and there's:
3740 * - no compressor thrashing
3741 * - enough zone memory
3742 * - no more HWM processes left.
3743 * For now, don't kill any other processes.
3744 */
3745
3746 if (*hwm_kill == 0) {
3747 memorystatus_thread_wasted_wakeup++;
3748 }
3749
3750 *is_critical = FALSE;
3751
3752 return TRUE;
3753 }
3754 #endif /* CONFIG_JETSAM */
3755
3756 return FALSE;
3757 }
3758
/*
 * kJetsamHighRelaunchCandidatesThreshold defines the percentage of candidates
 * in the idle & deferred bands that need to be bad candidates (i.e. counted in
 * the buckets' relaunch_high_count) in order to trigger aggressive jetsam.
 * The computed percentage is compared against this value with ">=".
 */
#define kJetsamHighRelaunchCandidatesThreshold (100)

/*
 * kJetsamMinCandidatesThreshold defines the minimum number of candidates in the
 * idle/deferred bands to trigger aggressive jetsam. This value basically decides
 * how much memory the system is ready to hold in the lower bands without triggering
 * aggressive jetsam. This number should ideally be tuned based on the memory config
 * of the device. memorystatus_act_aggressive() declines to act aggressively when
 * the total candidate count is below this value.
 */
#define kJetsamMinCandidatesThreshold (5)
3773
3774 static boolean_t
memorystatus_aggressive_jetsam_needed_sysproc_aging(__unused int jld_eval_aggressive_count,__unused int * jld_idle_kills,__unused int jld_idle_kill_candidates,int * total_candidates,int * elevated_bucket_count)3775 memorystatus_aggressive_jetsam_needed_sysproc_aging(__unused int jld_eval_aggressive_count, __unused int *jld_idle_kills, __unused int jld_idle_kill_candidates, int *total_candidates, int *elevated_bucket_count)
3776 {
3777 boolean_t aggressive_jetsam_needed = false;
3778
3779 /*
3780 * For the kJetsamAgingPolicySysProcsReclaimedFirst aging policy, we maintain the jetsam
3781 * relaunch behavior for all daemons. Also, daemons and apps are aged in deferred bands on
3782 * every dirty->clean transition. For this aging policy, the best way to determine if
3783 * aggressive jetsam is needed, is to see if the kill candidates are mostly bad candidates.
3784 * If yes, then we need to go to higher bands to reclaim memory.
3785 */
3786 proc_list_lock();
3787 /* Get total candidate counts for idle and idle deferred bands */
3788 *total_candidates = memstat_bucket[JETSAM_PRIORITY_IDLE].count + memstat_bucket[system_procs_aging_band].count;
3789 /* Get counts of bad kill candidates in idle and idle deferred bands */
3790 int bad_candidates = memstat_bucket[JETSAM_PRIORITY_IDLE].relaunch_high_count + memstat_bucket[system_procs_aging_band].relaunch_high_count;
3791
3792 *elevated_bucket_count = memstat_bucket[JETSAM_PRIORITY_ELEVATED_INACTIVE].count;
3793
3794 proc_list_unlock();
3795
3796 /* Check if the number of bad candidates is greater than kJetsamHighRelaunchCandidatesThreshold % */
3797 aggressive_jetsam_needed = (((bad_candidates * 100) / *total_candidates) >= kJetsamHighRelaunchCandidatesThreshold);
3798
3799 /*
3800 * Since the new aging policy bases the aggressive jetsam trigger on percentage of
3801 * bad candidates, it is prone to being overly aggressive. In order to mitigate that,
3802 * make sure the system is really under memory pressure before triggering aggressive
3803 * jetsam.
3804 */
3805 if (memorystatus_available_pages > memorystatus_sysproc_aging_aggr_pages) {
3806 aggressive_jetsam_needed = false;
3807 }
3808
3809 #if DEVELOPMENT || DEBUG
3810 printf("memorystatus: aggressive%d: [%s] Bad Candidate Threshold Check (total: %d, bad: %d, threshold: %d %%); Memory Pressure Check (available_pgs: %llu, threshold_pgs: %llu)\n",
3811 jld_eval_aggressive_count, aggressive_jetsam_needed ? "PASSED" : "FAILED", *total_candidates, bad_candidates,
3812 kJetsamHighRelaunchCandidatesThreshold, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, (uint64_t)memorystatus_sysproc_aging_aggr_pages);
3813 #endif /* DEVELOPMENT || DEBUG */
3814 return aggressive_jetsam_needed;
3815 }
3816
3817 /*
3818 * Gets memory back from various system caches.
3819 * Called before jetsamming in the foreground band in the hope that we'll
3820 * avoid a jetsam.
3821 */
3822 static void
memorystatus_approaching_fg_band(boolean_t * corpse_list_purged)3823 memorystatus_approaching_fg_band(boolean_t *corpse_list_purged)
3824 {
3825 assert(corpse_list_purged != NULL);
3826 pmap_release_pages_fast();
3827 memorystatus_issue_fg_band_notify();
3828 if (total_corpses_count() > 0 && !*corpse_list_purged) {
3829 os_atomic_inc(&block_corpses, relaxed);
3830 assert(block_corpses > 0);
3831 task_purge_all_corpses();
3832 *corpse_list_purged = TRUE;
3833 }
3834 }
3835
3836 static boolean_t
memorystatus_aggressive_jetsam_needed_default(__unused int jld_eval_aggressive_count,int * jld_idle_kills,int jld_idle_kill_candidates,int * total_candidates,int * elevated_bucket_count)3837 memorystatus_aggressive_jetsam_needed_default(__unused int jld_eval_aggressive_count, int *jld_idle_kills, int jld_idle_kill_candidates, int *total_candidates, int *elevated_bucket_count)
3838 {
3839 boolean_t aggressive_jetsam_needed = false;
3840 /* Jetsam Loop Detection - locals */
3841 memstat_bucket_t *bucket;
3842 int jld_bucket_count = 0;
3843
3844 proc_list_lock();
3845 switch (jetsam_aging_policy) {
3846 case kJetsamAgingPolicyLegacy:
3847 bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
3848 jld_bucket_count = bucket->count;
3849 bucket = &memstat_bucket[JETSAM_PRIORITY_AGING_BAND1];
3850 jld_bucket_count += bucket->count;
3851 break;
3852 case kJetsamAgingPolicyAppsReclaimedFirst:
3853 bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
3854 jld_bucket_count = bucket->count;
3855 bucket = &memstat_bucket[system_procs_aging_band];
3856 jld_bucket_count += bucket->count;
3857 bucket = &memstat_bucket[applications_aging_band];
3858 jld_bucket_count += bucket->count;
3859 break;
3860 case kJetsamAgingPolicyNone:
3861 default:
3862 bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
3863 jld_bucket_count = bucket->count;
3864 break;
3865 }
3866
3867 bucket = &memstat_bucket[JETSAM_PRIORITY_ELEVATED_INACTIVE];
3868 *elevated_bucket_count = bucket->count;
3869 *total_candidates = jld_bucket_count;
3870 proc_list_unlock();
3871
3872 aggressive_jetsam_needed = (*jld_idle_kills > jld_idle_kill_candidates);
3873
3874 #if DEVELOPMENT || DEBUG
3875 if (aggressive_jetsam_needed) {
3876 printf("memorystatus: aggressive%d: idle candidates: %d, idle kills: %d\n",
3877 jld_eval_aggressive_count,
3878 jld_idle_kill_candidates,
3879 *jld_idle_kills);
3880 }
3881 #endif /* DEVELOPMENT || DEBUG */
3882 return aggressive_jetsam_needed;
3883 }
3884
/*
 * Evaluate whether aggressive jetsam is warranted and, if so, carry it out.
 *
 * cause              - kill cause passed to memorystatus_kill_elevated_process().
 * jetsam_reason      - exit reason used for elevated-band kills; an extra
 *                      reference is taken before each such kill.
 * jld_idle_kills     - caller's idle-kill counter; reset to 0 when the
 *                      evaluation window is refreshed and after a successful
 *                      aggressive kill.
 * corpse_list_purged - forwarded to memorystatus_approaching_fg_band().
 * post_snapshot      - set to TRUE whenever a kill succeeds here.
 * memory_reclaimed   - zeroed on entry; accumulates the footprint of every
 *                      process killed by this call.
 *
 * Returns TRUE when this call killed something and the caller should stop
 * for this iteration; FALSE when aggressive jetsam is disabled, judged
 * unnecessary, or nothing could be killed.
 *
 * NOTE(review): the evaluation state lives in function-static variables with
 * no locking visible here; this presumably relies on only one jetsam thread
 * ever calling this function - confirm against the caller.
 */
static boolean_t
memorystatus_act_aggressive(uint32_t cause, os_reason_t jetsam_reason, int *jld_idle_kills, boolean_t *corpse_list_purged, boolean_t *post_snapshot, uint64_t *memory_reclaimed)
{
	boolean_t aggressive_jetsam_needed = false;
	boolean_t killed;
	uint32_t errors = 0;
	uint64_t footprint_of_killed_proc = 0;
	int elevated_bucket_count = 0, maximum_kills = 0, band = 0;
	int total_candidates = 0;
	*memory_reclaimed = 0;

	/*
	 * The aggressive jetsam logic looks at the number of times it has been in the
	 * aggressive loop to determine the max priority band it should kill upto. The
	 * static variables below are used to track that property.
	 *
	 * To reset those values, the implementation checks if it has been
	 * memorystatus_jld_eval_period_msecs since the parameters were reset.
	 */
	static int jld_eval_aggressive_count = 0;
	static int32_t jld_priority_band_max = JETSAM_PRIORITY_UI_SUPPORT;
	static uint64_t jld_timestamp_msecs = 0;
	static int jld_idle_kill_candidates = 0;

	if (memorystatus_jld_enabled == FALSE) {
		/* If aggressive jetsam is disabled, nothing to do here */
		return FALSE;
	}

	/*
	 * Get current timestamp (msecs only). Only tv_sec is used, so the
	 * value has one-second granularity.
	 */
	struct timeval jld_now_tstamp = {0, 0};
	uint64_t jld_now_msecs = 0;
	microuptime(&jld_now_tstamp);
	jld_now_msecs = (jld_now_tstamp.tv_sec * 1000);

	/*
	 * The aggressive jetsam logic looks at the number of candidates and their
	 * properties to decide if aggressive jetsam should be engaged.
	 */
	if (jetsam_aging_policy == kJetsamAgingPolicySysProcsReclaimedFirst) {
		/*
		 * For the kJetsamAgingPolicySysProcsReclaimedFirst aging policy, the logic looks at the number of
		 * candidates in the idle and deferred band and how many out of them are marked as high relaunch
		 * probability.
		 */
		aggressive_jetsam_needed = memorystatus_aggressive_jetsam_needed_sysproc_aging(jld_eval_aggressive_count,
		    jld_idle_kills, jld_idle_kill_candidates, &total_candidates, &elevated_bucket_count);
	} else {
		/*
		 * The other aging policies look at number of candidate processes over a specific time window and
		 * evaluate if the system is in a jetsam loop. If yes, aggressive jetsam is triggered.
		 */
		aggressive_jetsam_needed = memorystatus_aggressive_jetsam_needed_default(jld_eval_aggressive_count,
		    jld_idle_kills, jld_idle_kill_candidates, &total_candidates, &elevated_bucket_count);
	}

	/*
	 * Check if its been really long since the aggressive jetsam evaluation
	 * parameters have been refreshed. This logic also resets the jld_eval_aggressive_count
	 * counter to make sure we reset the aggressive jetsam severity.
	 */
	boolean_t param_reval = false;

	if ((total_candidates == 0) ||
	    (jld_now_msecs > (jld_timestamp_msecs + memorystatus_jld_eval_period_msecs))) {
		jld_timestamp_msecs = jld_now_msecs;
		jld_idle_kill_candidates = total_candidates;
		*jld_idle_kills = 0;
		jld_eval_aggressive_count = 0;
		jld_priority_band_max = JETSAM_PRIORITY_UI_SUPPORT;
		param_reval = true;
	}

	/*
	 * If the parameters have been updated, re-evaluate the aggressive_jetsam_needed condition for
	 * the non kJetsamAgingPolicySysProcsReclaimedFirst policy since its based on jld_idle_kill_candidates etc.
	 */
	if ((param_reval == true) && (jetsam_aging_policy != kJetsamAgingPolicySysProcsReclaimedFirst)) {
		aggressive_jetsam_needed = (*jld_idle_kills > jld_idle_kill_candidates);
	}

	/*
	 * It is also possible that the system is down to a very small number of processes in the candidate
	 * bands. In that case, the decisions made by the memorystatus_aggressive_jetsam_needed_* routines
	 * would not be useful. In that case, do not trigger aggressive jetsam.
	 */
	if (total_candidates < kJetsamMinCandidatesThreshold) {
#if DEVELOPMENT || DEBUG
		printf("memorystatus: aggressive: [FAILED] Low Candidate Count (current: %d, threshold: %d)\n", total_candidates, kJetsamMinCandidatesThreshold);
#endif /* DEVELOPMENT || DEBUG */
		aggressive_jetsam_needed = false;
	}

	if (aggressive_jetsam_needed == false) {
		/* Either the aging policy or the candidate count decided that aggressive jetsam is not needed. Nothing more to do here. */
		return FALSE;
	}

	/* Looks like aggressive jetsam is needed */
	jld_eval_aggressive_count++;

	if (jld_eval_aggressive_count == memorystatus_jld_eval_aggressive_count) {
		/* Exactly at the configured count: reclaim caches before going any higher */
		memorystatus_approaching_fg_band(corpse_list_purged);
	} else if (jld_eval_aggressive_count > memorystatus_jld_eval_aggressive_count) {
		/*
		 * Bump up the jetsam priority limit (eg: the bucket index)
		 * Enforce bucket index sanity.
		 */
		if ((memorystatus_jld_eval_aggressive_priority_band_max < 0) ||
		    (memorystatus_jld_eval_aggressive_priority_band_max >= MEMSTAT_BUCKET_COUNT)) {
			/*
			 * Do nothing. Stick with the default level.
			 */
		} else {
			jld_priority_band_max = memorystatus_jld_eval_aggressive_priority_band_max;
		}
	}

	/* Visit elevated processes first; at most one attempt per process counted in the elevated bucket */
	while (elevated_bucket_count) {
		elevated_bucket_count--;

		/*
		 * memorystatus_kill_elevated_process() drops a reference,
		 * so take another one so we can continue to use this exit reason
		 * even after it returns.
		 */

		os_reason_ref(jetsam_reason);
		killed = memorystatus_kill_elevated_process(
			cause,
			jetsam_reason,
			JETSAM_PRIORITY_ELEVATED_INACTIVE,
			jld_eval_aggressive_count,
			&errors, &footprint_of_killed_proc);
		if (killed) {
			*post_snapshot = TRUE;
			*memory_reclaimed += footprint_of_killed_proc;
			if (memorystatus_avail_pages_below_pressure()) {
				/*
				 * Still under pressure.
				 * Find another pinned processes.
				 */
				continue;
			} else {
				return TRUE;
			}
		} else {
			/*
			 * No pinned processes left to kill.
			 * Abandon elevated band.
			 */
			break;
		}
	}

	/* Upper bound on kills: population of all bands below the limit, times the loop factor */
	proc_list_lock();
	for (band = 0; band < jld_priority_band_max; band++) {
		maximum_kills += memstat_bucket[band].count;
	}
	proc_list_unlock();
	maximum_kills *= memorystatus_jld_max_kill_loops;
	/*
	 * memorystatus_kill_processes_aggressive() allocates its own
	 * jetsam_reason so the kMemorystatusKilledProcThrashing cause
	 * is consistent throughout the aggressive march.
	 */
	killed = memorystatus_kill_processes_aggressive(
		kMemorystatusKilledProcThrashing,
		jld_eval_aggressive_count,
		jld_priority_band_max,
		maximum_kills,
		&errors, &footprint_of_killed_proc);

	if (killed) {
		/* Always generate logs after aggressive kill */
		*post_snapshot = TRUE;
		*memory_reclaimed += footprint_of_killed_proc;
		*jld_idle_kills = 0;
		return TRUE;
	}

	return FALSE;
}
4069
4070
4071 static void
memorystatus_thread(void * param __unused,wait_result_t wr __unused)4072 memorystatus_thread(void *param __unused, wait_result_t wr __unused)
4073 {
4074 boolean_t post_snapshot = FALSE;
4075 uint32_t errors = 0;
4076 uint32_t hwm_kill = 0;
4077 boolean_t sort_flag = TRUE;
4078 boolean_t corpse_list_purged = FALSE;
4079 int jld_idle_kills = 0;
4080 struct jetsam_thread_state *jetsam_thread = jetsam_current_thread();
4081 uint64_t total_memory_reclaimed = 0;
4082
4083 assert(jetsam_thread != NULL);
4084 if (jetsam_thread->inited == FALSE) {
4085 /*
4086 * It's the first time the thread has run, so just mark the thread as privileged and block.
4087 * This avoids a spurious pass with unset variables, as set out in <rdar://problem/9609402>.
4088 */
4089
4090 char name[32];
4091 thread_wire(host_priv_self(), current_thread(), TRUE);
4092 snprintf(name, 32, "VM_memorystatus_%d", jetsam_thread->index + 1);
4093
4094 /* Limit all but one thread to the lower jetsam bands, as that's where most of the victims are. */
4095 if (jetsam_thread->index == 0) {
4096 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
4097 thread_vm_bind_group_add();
4098 }
4099 jetsam_thread->limit_to_low_bands = FALSE;
4100 } else {
4101 jetsam_thread->limit_to_low_bands = TRUE;
4102 }
4103 #if CONFIG_THREAD_GROUPS
4104 thread_group_vm_add();
4105 #endif
4106 thread_set_thread_name(current_thread(), name);
4107 jetsam_thread->inited = TRUE;
4108 memorystatus_thread_block(0, memorystatus_thread);
4109 }
4110
4111 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_START,
4112 MEMORYSTATUS_LOG_AVAILABLE_PAGES, memorystatus_jld_enabled, memorystatus_jld_eval_period_msecs, memorystatus_jld_eval_aggressive_count, 0);
4113
4114 /*
4115 * Jetsam aware version.
4116 *
4117 * The VM pressure notification thread is working it's way through clients in parallel.
4118 *
4119 * So, while the pressure notification thread is targeting processes in order of
4120 * increasing jetsam priority, we can hopefully reduce / stop it's work by killing
4121 * any processes that have exceeded their highwater mark.
4122 *
4123 * If we run out of HWM processes and our available pages drops below the critical threshold, then,
4124 * we target the least recently used process in order of increasing jetsam priority (exception: the FG band).
4125 */
4126 while (memorystatus_action_needed()) {
4127 boolean_t killed;
4128 int32_t priority;
4129 uint32_t cause;
4130 uint64_t memory_reclaimed = 0;
4131 uint64_t jetsam_reason_code = JETSAM_REASON_INVALID;
4132 os_reason_t jetsam_reason = OS_REASON_NULL;
4133
4134 cause = kill_under_pressure_cause;
4135 switch (cause) {
4136 case kMemorystatusKilledFCThrashing:
4137 jetsam_reason_code = JETSAM_REASON_MEMORY_FCTHRASHING;
4138 break;
4139 case kMemorystatusKilledVMCompressorThrashing:
4140 jetsam_reason_code = JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING;
4141 break;
4142 case kMemorystatusKilledVMCompressorSpaceShortage:
4143 jetsam_reason_code = JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE;
4144 break;
4145 case kMemorystatusKilledZoneMapExhaustion:
4146 jetsam_reason_code = JETSAM_REASON_ZONE_MAP_EXHAUSTION;
4147 break;
4148 case kMemorystatusKilledSustainedPressure:
4149 jetsam_reason_code = JETSAM_REASON_MEMORY_SUSTAINED_PRESSURE;
4150 break;
4151 case kMemorystatusKilledVMPageShortage:
4152 /* falls through */
4153 default:
4154 jetsam_reason_code = JETSAM_REASON_MEMORY_VMPAGESHORTAGE;
4155 cause = kMemorystatusKilledVMPageShortage;
4156 break;
4157 }
4158
4159 /* Highwater */
4160 boolean_t is_critical = TRUE;
4161 if (memorystatus_act_on_hiwat_processes(&errors, &hwm_kill, &post_snapshot, &is_critical, &memory_reclaimed)) {
4162 total_memory_reclaimed += memory_reclaimed;
4163 if (is_critical == FALSE) {
4164 /*
4165 * For now, don't kill any other processes.
4166 */
4167 break;
4168 } else {
4169 goto done;
4170 }
4171 }
4172
4173 jetsam_reason = os_reason_create(OS_REASON_JETSAM, jetsam_reason_code);
4174 if (jetsam_reason == OS_REASON_NULL) {
4175 printf("memorystatus_thread: failed to allocate jetsam reason\n");
4176 }
4177
4178 /* Only unlimited jetsam threads should act aggressive */
4179 if (!jetsam_thread->limit_to_low_bands &&
4180 memorystatus_act_aggressive(cause, jetsam_reason, &jld_idle_kills, &corpse_list_purged, &post_snapshot, &memory_reclaimed)) {
4181 total_memory_reclaimed += memory_reclaimed;
4182 goto done;
4183 }
4184
4185 /*
4186 * memorystatus_kill_top_process() drops a reference,
4187 * so take another one so we can continue to use this exit reason
4188 * even after it returns
4189 */
4190 os_reason_ref(jetsam_reason);
4191
4192 /* LRU */
4193 killed = memorystatus_kill_top_process(TRUE, sort_flag, cause, jetsam_reason, &priority, &errors, &memory_reclaimed);
4194 sort_flag = FALSE;
4195
4196 if (killed) {
4197 total_memory_reclaimed += memory_reclaimed;
4198 if (memorystatus_post_snapshot(priority, cause) == TRUE) {
4199 post_snapshot = TRUE;
4200 }
4201
4202 /* Jetsam Loop Detection */
4203 if (memorystatus_jld_enabled == TRUE) {
4204 if ((priority == JETSAM_PRIORITY_IDLE) || (priority == system_procs_aging_band) || (priority == applications_aging_band)) {
4205 jld_idle_kills++;
4206 } else {
4207 /*
4208 * We've reached into bands beyond idle deferred.
4209 * We make no attempt to monitor them
4210 */
4211 }
4212 }
4213
4214 /*
4215 * If we have jetsammed a process in or above JETSAM_PRIORITY_UI_SUPPORT
4216 * then we attempt to relieve pressure by purging corpse memory and notifying
4217 * anybody wanting to know this.
4218 */
4219 if (priority >= JETSAM_PRIORITY_UI_SUPPORT) {
4220 memorystatus_approaching_fg_band(&corpse_list_purged);
4221 }
4222 goto done;
4223 }
4224
4225 if (memorystatus_avail_pages_below_critical()) {
4226 /*
4227 * Still under pressure and unable to kill a process - purge corpse memory
4228 * and get everything back from the pmap.
4229 */
4230 pmap_release_pages_fast();
4231 if (total_corpses_count() > 0) {
4232 os_atomic_inc(&block_corpses, relaxed);
4233 assert(block_corpses > 0);
4234 task_purge_all_corpses();
4235 corpse_list_purged = TRUE;
4236 }
4237
4238 if (!jetsam_thread->limit_to_low_bands && memorystatus_avail_pages_below_critical()) {
4239 /*
4240 * Still under pressure and unable to kill a process - panic
4241 */
4242 panic("memorystatus_jetsam_thread: no victim! available pages:%llu", (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
4243 }
4244 }
4245
4246 done:
4247
4248 /*
4249 * We do not want to over-kill when thrashing has been detected.
4250 * To avoid that, we reset the flag here and notify the
4251 * compressor.
4252 */
4253 if (is_reason_thrashing(kill_under_pressure_cause)) {
4254 kill_under_pressure_cause = 0;
4255 #if CONFIG_JETSAM
4256 vm_thrashing_jetsam_done();
4257 #endif /* CONFIG_JETSAM */
4258 } else if (is_reason_zone_map_exhaustion(kill_under_pressure_cause)) {
4259 kill_under_pressure_cause = 0;
4260 }
4261
4262 os_reason_free(jetsam_reason);
4263 }
4264
4265 kill_under_pressure_cause = 0;
4266
4267 if (errors) {
4268 memorystatus_clear_errors();
4269 }
4270
4271 if (post_snapshot) {
4272 proc_list_lock();
4273 size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
4274 sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count);
4275 uint64_t timestamp_now = mach_absolute_time();
4276 memorystatus_jetsam_snapshot->notification_time = timestamp_now;
4277 memorystatus_jetsam_snapshot->js_gencount++;
4278 if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
4279 timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) {
4280 proc_list_unlock();
4281 int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
4282 if (!ret) {
4283 proc_list_lock();
4284 memorystatus_jetsam_snapshot_last_timestamp = timestamp_now;
4285 proc_list_unlock();
4286 }
4287 } else {
4288 proc_list_unlock();
4289 }
4290 }
4291
4292 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_END,
4293 MEMORYSTATUS_LOG_AVAILABLE_PAGES, total_memory_reclaimed, 0, 0, 0);
4294
4295 if (corpse_list_purged) {
4296 os_atomic_dec(&block_corpses, relaxed);
4297 assert(block_corpses >= 0);
4298 }
4299 memorystatus_thread_block(0, memorystatus_thread);
4300 }
4301
4302 /*
4303 * Returns TRUE:
4304 * when an idle-exitable proc was killed
4305 * Returns FALSE:
4306 * when there are no more idle-exitable procs found
4307 * when the attempt to kill an idle-exitable proc failed
4308 */
4309 boolean_t
memorystatus_idle_exit_from_VM(void)4310 memorystatus_idle_exit_from_VM(void)
4311 {
4312 /*
4313 * This routine should no longer be needed since we are
4314 * now using jetsam bands on all platforms and so will deal
4315 * with IDLE processes within the memorystatus thread itself.
4316 *
4317 * But we still use it because we observed that macos systems
4318 * started heavy compression/swapping with a bunch of
4319 * idle-exitable processes alive and doing nothing. We decided
4320 * to rather kill those processes than start swapping earlier.
4321 */
4322
4323 return kill_idle_exit_proc();
4324 }
4325
4326 /*
4327 * Callback invoked when allowable physical memory footprint exceeded
4328 * (dirty pages + IOKit mappings)
4329 *
4330 * This is invoked for both advisory, non-fatal per-task high watermarks,
4331 * as well as the fatal task memory limits.
4332 */
4333 void
memorystatus_on_ledger_footprint_exceeded(boolean_t warning,boolean_t memlimit_is_active,boolean_t memlimit_is_fatal)4334 memorystatus_on_ledger_footprint_exceeded(boolean_t warning, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal)
4335 {
4336 os_reason_t jetsam_reason = OS_REASON_NULL;
4337
4338 proc_t p = current_proc();
4339
4340 #if VM_PRESSURE_EVENTS
4341 if (warning == TRUE) {
4342 /*
4343 * This is a warning path which implies that the current process is close, but has
4344 * not yet exceeded its per-process memory limit.
4345 */
4346 if (memorystatus_warn_process(p, memlimit_is_active, memlimit_is_fatal, FALSE /* not exceeded */) != TRUE) {
4347 /* Print warning, since it's possible that task has not registered for pressure notifications */
4348 os_log(OS_LOG_DEFAULT, "memorystatus_on_ledger_footprint_exceeded: failed to warn the current task (%d exiting, or no handler registered?).\n", proc_getpid(p));
4349 }
4350 return;
4351 }
4352 #endif /* VM_PRESSURE_EVENTS */
4353
4354 if (memlimit_is_fatal) {
4355 /*
4356 * If this process has no high watermark or has a fatal task limit, then we have been invoked because the task
4357 * has violated either the system-wide per-task memory limit OR its own task limit.
4358 */
4359 jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_PERPROCESSLIMIT);
4360 if (jetsam_reason == NULL) {
4361 printf("task_exceeded footprint: failed to allocate jetsam reason\n");
4362 } else if (corpse_for_fatal_memkill && proc_send_synchronous_EXC_RESOURCE(p) == FALSE) {
4363 /* Set OS_REASON_FLAG_GENERATE_CRASH_REPORT to generate corpse */
4364 jetsam_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
4365 }
4366
4367 if (memorystatus_kill_process_sync(proc_getpid(p), kMemorystatusKilledPerProcessLimit, jetsam_reason) != TRUE) {
4368 printf("task_exceeded_footprint: failed to kill the current task (exiting?).\n");
4369 }
4370 } else {
4371 /*
4372 * HWM offender exists. Done without locks or synchronization.
4373 * See comment near its declaration for more details.
4374 */
4375 memorystatus_hwm_candidates = TRUE;
4376
4377 #if VM_PRESSURE_EVENTS
4378 /*
4379 * The current process is not in the warning path.
4380 * This path implies the current process has exceeded a non-fatal (soft) memory limit.
4381 * Failure to send note is ignored here.
4382 */
4383 (void)memorystatus_warn_process(p, memlimit_is_active, memlimit_is_fatal, TRUE /* exceeded */);
4384
4385 #endif /* VM_PRESSURE_EVENTS */
4386 }
4387 }
4388
4389 void
memorystatus_log_exception(const int max_footprint_mb,boolean_t memlimit_is_active,boolean_t memlimit_is_fatal)4390 memorystatus_log_exception(const int max_footprint_mb, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal)
4391 {
4392 proc_t p = current_proc();
4393
4394 /*
4395 * The limit violation is logged here, but only once per process per limit.
4396 * Soft memory limit is a non-fatal high-water-mark
4397 * Hard memory limit is a fatal custom-task-limit or system-wide per-task memory limit.
4398 */
4399
4400 os_log_with_startup_serial(OS_LOG_DEFAULT, "EXC_RESOURCE -> %s[%d] exceeded mem limit: %s%s %d MB (%s)\n",
4401 ((p && *p->p_name) ? p->p_name : "unknown"), (p ? proc_getpid(p) : -1), (memlimit_is_active ? "Active" : "Inactive"),
4402 (memlimit_is_fatal ? "Hard" : "Soft"), max_footprint_mb,
4403 (memlimit_is_fatal ? "fatal" : "non-fatal"));
4404
4405 return;
4406 }
4407
4408
4409 /*
4410 * Description:
4411 * Evaluates process state to determine which limit
4412 * should be applied (active vs. inactive limit).
4413 *
4414 * Processes that have the 'elevated inactive jetsam band' attribute
4415 * are first evaluated based on their current priority band.
4416 * presently elevated ==> active
4417 *
4418 * Processes that opt into dirty tracking are evaluated
4419 * based on clean vs dirty state.
4420 * dirty ==> active
4421 * clean ==> inactive
4422 *
4423 * Process that do not opt into dirty tracking are
4424 * evalulated based on priority level.
4425 * Foreground or above ==> active
4426 * Below Foreground ==> inactive
4427 *
4428 * Return: TRUE if active
4429 * False if inactive
4430 */
4431
4432 static boolean_t
proc_jetsam_state_is_active_locked(proc_t p)4433 proc_jetsam_state_is_active_locked(proc_t p)
4434 {
4435 if ((p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) &&
4436 (p->p_memstat_effectivepriority == JETSAM_PRIORITY_ELEVATED_INACTIVE)) {
4437 /*
4438 * process has the 'elevated inactive jetsam band' attribute
4439 * and process is present in the elevated band
4440 * implies active state
4441 */
4442 return TRUE;
4443 } else if (p->p_memstat_dirty & P_DIRTY_TRACK) {
4444 /*
4445 * process has opted into dirty tracking
4446 * active state is based on dirty vs. clean
4447 */
4448 if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
4449 /*
4450 * process is dirty
4451 * implies active state
4452 */
4453 return TRUE;
4454 } else {
4455 /*
4456 * process is clean
4457 * implies inactive state
4458 */
4459 return FALSE;
4460 }
4461 } else if (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND) {
4462 /*
4463 * process is Foreground or higher
4464 * implies active state
4465 */
4466 return TRUE;
4467 } else {
4468 /*
4469 * process found below Foreground
4470 * implies inactive state
4471 */
4472 return FALSE;
4473 }
4474 }
4475
4476 static boolean_t
memorystatus_kill_process_sync(pid_t victim_pid,uint32_t cause,os_reason_t jetsam_reason)4477 memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason)
4478 {
4479 boolean_t res;
4480
4481 uint32_t errors = 0;
4482 uint64_t memory_reclaimed = 0;
4483
4484 if (victim_pid == -1) {
4485 /* No pid, so kill first process */
4486 res = memorystatus_kill_top_process(TRUE, TRUE, cause, jetsam_reason, NULL, &errors, &memory_reclaimed);
4487 } else {
4488 res = memorystatus_kill_specific_process(victim_pid, cause, jetsam_reason);
4489 }
4490
4491 if (errors) {
4492 memorystatus_clear_errors();
4493 }
4494
4495 if (res == TRUE) {
4496 /* Fire off snapshot notification */
4497 proc_list_lock();
4498 size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
4499 sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_count;
4500 uint64_t timestamp_now = mach_absolute_time();
4501 memorystatus_jetsam_snapshot->notification_time = timestamp_now;
4502 if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
4503 timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) {
4504 proc_list_unlock();
4505 int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
4506 if (!ret) {
4507 proc_list_lock();
4508 memorystatus_jetsam_snapshot_last_timestamp = timestamp_now;
4509 proc_list_unlock();
4510 }
4511 } else {
4512 proc_list_unlock();
4513 }
4514 }
4515
4516 return res;
4517 }
4518
4519 /*
4520 * Jetsam a specific process.
4521 */
4522 static boolean_t
memorystatus_kill_specific_process(pid_t victim_pid,uint32_t cause,os_reason_t jetsam_reason)4523 memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason)
4524 {
4525 boolean_t killed;
4526 proc_t p;
4527 uint64_t killtime = 0;
4528 uint64_t footprint_of_killed_proc;
4529 clock_sec_t tv_sec;
4530 clock_usec_t tv_usec;
4531 uint32_t tv_msec;
4532
4533 /* TODO - add a victim queue and push this into the main jetsam thread */
4534
4535 p = proc_find(victim_pid);
4536 if (!p) {
4537 os_reason_free(jetsam_reason);
4538 return FALSE;
4539 }
4540
4541 proc_list_lock();
4542
4543 if (memorystatus_jetsam_snapshot_count == 0) {
4544 memorystatus_init_jetsam_snapshot_locked(NULL, 0);
4545 }
4546
4547 killtime = mach_absolute_time();
4548 absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
4549 tv_msec = tv_usec / 1000;
4550
4551 memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
4552
4553 proc_list_unlock();
4554
4555 killed = memorystatus_do_kill(p, cause, jetsam_reason, &footprint_of_killed_proc);
4556
4557 os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: killing_specific_process pid %d [%s] (%s %d) %lluKB - memorystatus_available_pages: %llu\n",
4558 (unsigned long)tv_sec, tv_msec, victim_pid, ((p && *p->p_name) ? p->p_name : "unknown"),
4559 memorystatus_kill_cause_name[cause], (p ? p->p_memstat_effectivepriority: -1),
4560 footprint_of_killed_proc >> 10, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
4561
4562 proc_rele(p);
4563
4564 return killed;
4565 }
4566
4567
4568 /*
4569 * Toggle the P_MEMSTAT_SKIP bit.
4570 * Takes the proc_list_lock.
4571 */
4572 void
proc_memstat_skip(proc_t p,boolean_t set)4573 proc_memstat_skip(proc_t p, boolean_t set)
4574 {
4575 #if DEVELOPMENT || DEBUG
4576 if (p) {
4577 proc_list_lock();
4578 if (set == TRUE) {
4579 p->p_memstat_state |= P_MEMSTAT_SKIP;
4580 } else {
4581 p->p_memstat_state &= ~P_MEMSTAT_SKIP;
4582 }
4583 proc_list_unlock();
4584 }
4585 #else
4586 #pragma unused(p, set)
4587 /*
4588 * do nothing
4589 */
4590 #endif /* DEVELOPMENT || DEBUG */
4591 return;
4592 }
4593
4594
4595 #if CONFIG_JETSAM
4596 /*
4597 * This is invoked when cpulimits have been exceeded while in fatal mode.
4598 * The jetsam_flags do not apply as those are for memory related kills.
4599 * We call this routine so that the offending process is killed with
4600 * a non-zero exit status.
4601 */
4602 void
jetsam_on_ledger_cpulimit_exceeded(void)4603 jetsam_on_ledger_cpulimit_exceeded(void)
4604 {
4605 int retval = 0;
4606 int jetsam_flags = 0; /* make it obvious */
4607 proc_t p = current_proc();
4608 os_reason_t jetsam_reason = OS_REASON_NULL;
4609
4610 printf("task_exceeded_cpulimit: killing pid %d [%s]\n",
4611 proc_getpid(p), (*p->p_name ? p->p_name : "(unknown)"));
4612
4613 jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_CPULIMIT);
4614 if (jetsam_reason == OS_REASON_NULL) {
4615 printf("task_exceeded_cpulimit: unable to allocate memory for jetsam reason\n");
4616 }
4617
4618 retval = jetsam_do_kill(p, jetsam_flags, jetsam_reason);
4619
4620 if (retval) {
4621 printf("task_exceeded_cpulimit: failed to kill current task (exiting?).\n");
4622 }
4623 }
4624
4625 #endif /* CONFIG_JETSAM */
4626
4627 static void
memorystatus_get_task_memory_region_count(task_t task,uint64_t * count)4628 memorystatus_get_task_memory_region_count(task_t task, uint64_t *count)
4629 {
4630 assert(task);
4631 assert(count);
4632
4633 *count = get_task_memory_region_count(task);
4634 }
4635
4636
/*
 * Outcome flags for the vm_map_fork decision.  The values lie just above
 * the low 32 bits, so they do not collide with a 32-bit pid value when
 * OR'd into a 64-bit word.
 */
#define MEMORYSTATUS_VM_MAP_FORK_ALLOWED        0x100000000
#define MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED    0x200000000
4639
4640 #if DEVELOPMENT || DEBUG
4641
4642 /*
4643 * Sysctl only used to test memorystatus_allowed_vm_map_fork() path.
4644 * set a new pidwatch value
4645 * or
4646 * get the current pidwatch value
4647 *
4648 * The pidwatch_val starts out with a PID to watch for in the map_fork path.
4649 * Its value is:
4650 * - OR'd with MEMORYSTATUS_VM_MAP_FORK_ALLOWED if we allow the map_fork.
4651 * - OR'd with MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED if we disallow the map_fork.
4652 * - set to -1ull if the map_fork() is aborted for other reasons.
4653 */
4654
4655 uint64_t memorystatus_vm_map_fork_pidwatch_val = 0;
4656
4657 static int sysctl_memorystatus_vm_map_fork_pidwatch SYSCTL_HANDLER_ARGS {
4658 #pragma unused(oidp, arg1, arg2)
4659
4660 uint64_t new_value = 0;
4661 uint64_t old_value = 0;
4662 int error = 0;
4663
4664 /*
4665 * The pid is held in the low 32 bits.
4666 * The 'allowed' flags are in the upper 32 bits.
4667 */
4668 old_value = memorystatus_vm_map_fork_pidwatch_val;
4669
4670 error = sysctl_io_number(req, old_value, sizeof(old_value), &new_value, NULL);
4671
4672 if (error || !req->newptr) {
4673 /*
4674 * No new value passed in.
4675 */
4676 return error;
4677 }
4678
4679 /*
4680 * A new pid was passed in via req->newptr.
4681 * Ignore any attempt to set the higher order bits.
4682 */
4683 memorystatus_vm_map_fork_pidwatch_val = new_value & 0xFFFFFFFF;
4684 printf("memorystatus: pidwatch old_value = 0x%llx, new_value = 0x%llx \n", old_value, new_value);
4685
4686 return error;
4687 }
4688
4689 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_map_fork_pidwatch, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_MASKED,
4690 0, 0, sysctl_memorystatus_vm_map_fork_pidwatch, "Q", "get/set pid watched for in vm_map_fork");
4691
4692
4693 /*
4694 * Record if a watched process fails to qualify for a vm_map_fork().
4695 */
4696 void
memorystatus_abort_vm_map_fork(task_t task)4697 memorystatus_abort_vm_map_fork(task_t task)
4698 {
4699 if (memorystatus_vm_map_fork_pidwatch_val != 0) {
4700 proc_t p = get_bsdtask_info(task);
4701 if (p != NULL && memorystatus_vm_map_fork_pidwatch_val == (uint64_t)proc_getpid(p)) {
4702 memorystatus_vm_map_fork_pidwatch_val = -1ull;
4703 }
4704 }
4705 }
4706
4707 static void
set_vm_map_fork_pidwatch(task_t task,uint64_t x)4708 set_vm_map_fork_pidwatch(task_t task, uint64_t x)
4709 {
4710 if (memorystatus_vm_map_fork_pidwatch_val != 0) {
4711 proc_t p = get_bsdtask_info(task);
4712 if (p && (memorystatus_vm_map_fork_pidwatch_val == (uint64_t)proc_getpid(p))) {
4713 memorystatus_vm_map_fork_pidwatch_val |= x;
4714 }
4715 }
4716 }
4717
4718 #else /* DEVELOPMENT || DEBUG */
4719
4720
4721 static void
set_vm_map_fork_pidwatch(task_t task,uint64_t x)4722 set_vm_map_fork_pidwatch(task_t task, uint64_t x)
4723 {
4724 #pragma unused(task)
4725 #pragma unused(x)
4726 }
4727
4728 #endif /* DEVELOPMENT || DEBUG */
4729
4730 /*
4731 * Called during EXC_RESOURCE handling when a process exceeds a soft
4732 * memory limit. This is the corpse fork path and here we decide if
4733 * vm_map_fork will be allowed when creating the corpse.
4734 * The task being considered is suspended.
4735 *
4736 * By default, a vm_map_fork is allowed to proceed.
4737 *
4738 * A few simple policy assumptions:
4739 * If the device has a zero system-wide task limit,
4740 * then the vm_map_fork is allowed. macOS always has a zero
4741 * system wide task limit (unless overriden by a boot-arg).
4742 *
4743 * And if a process's memory footprint calculates less
4744 * than or equal to quarter of the system-wide task limit,
4745 * then the vm_map_fork is allowed. This calculation
4746 * is based on the assumption that a process can
4747 * munch memory up to the system-wide task limit.
4748 *
4749 * For watchOS, which has a low task limit, we use a
4750 * different value. Current task limit has been reduced
4751 * to 300MB and it's been decided the limit should be 200MB.
4752 */
4753 boolean_t
memorystatus_allowed_vm_map_fork(task_t task)4754 memorystatus_allowed_vm_map_fork(task_t task)
4755 {
4756 boolean_t is_allowed = TRUE; /* default */
4757
4758 uint64_t footprint_in_bytes;
4759 uint64_t max_allowed_bytes;
4760
4761 /* Jetsam in high bands blocks any new corpse */
4762 if (os_atomic_load(&block_corpses, relaxed) != 0) {
4763 os_log(OS_LOG_DEFAULT, "memorystatus_allowed_vm_map_fork: corpse for pid %d blocked by jetsam).\n", task_pid(task));
4764 return FALSE;
4765 }
4766
4767 if (max_task_footprint_mb == 0) {
4768 set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_ALLOWED);
4769 return is_allowed;
4770 }
4771
4772 footprint_in_bytes = get_task_phys_footprint(task);
4773
4774 /*
4775 * Maximum is 1/4 of the system-wide task limit by default.
4776 */
4777 max_allowed_bytes = ((uint64_t)max_task_footprint_mb * 1024 * 1024) >> 2;
4778
4779 #if XNU_TARGET_OS_WATCH
4780 /*
4781 * For watches with > 1G, raise the limit to 200MB.
4782 */
4783 if (sane_size > 1 * 1024 * 1024 * 1024) {
4784 max_allowed_bytes = MAX(max_allowed_bytes, 200 * 1024 * 1024);
4785 }
4786 #endif /* XNU_TARGET_OS_WATCH */
4787
4788 #if DEBUG || DEVELOPMENT
4789 if (corpse_threshold_system_limit) {
4790 max_allowed_bytes = (uint64_t)max_task_footprint_mb * (1UL << 20);
4791 }
4792 #endif /* DEBUG || DEVELOPMENT */
4793
4794 if (footprint_in_bytes > max_allowed_bytes) {
4795 printf("memorystatus disallowed vm_map_fork %lld %lld\n", footprint_in_bytes, max_allowed_bytes);
4796 set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED);
4797 return !is_allowed;
4798 }
4799
4800 set_vm_map_fork_pidwatch(task, MEMORYSTATUS_VM_MAP_FORK_ALLOWED);
4801 return is_allowed;
4802 }
4803
4804 void
memorystatus_get_task_page_counts(task_t task,uint32_t * footprint,uint32_t * max_footprint_lifetime,uint32_t * purgeable_pages)4805 memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages)
4806 {
4807 assert(task);
4808 assert(footprint);
4809
4810 uint64_t pages;
4811
4812 pages = (get_task_phys_footprint(task) / PAGE_SIZE_64);
4813 assert(((uint32_t)pages) == pages);
4814 *footprint = (uint32_t)pages;
4815
4816 if (max_footprint_lifetime) {
4817 pages = (get_task_phys_footprint_lifetime_max(task) / PAGE_SIZE_64);
4818 assert(((uint32_t)pages) == pages);
4819 *max_footprint_lifetime = (uint32_t)pages;
4820 }
4821 if (purgeable_pages) {
4822 pages = (get_task_purgeable_size(task) / PAGE_SIZE_64);
4823 assert(((uint32_t)pages) == pages);
4824 *purgeable_pages = (uint32_t)pages;
4825 }
4826 }
4827
4828 static void
memorystatus_get_task_phys_footprint_page_counts(task_t task,uint64_t * internal_pages,uint64_t * internal_compressed_pages,uint64_t * purgeable_nonvolatile_pages,uint64_t * purgeable_nonvolatile_compressed_pages,uint64_t * alternate_accounting_pages,uint64_t * alternate_accounting_compressed_pages,uint64_t * iokit_mapped_pages,uint64_t * page_table_pages,uint64_t * frozen_to_swap_pages)4829 memorystatus_get_task_phys_footprint_page_counts(task_t task,
4830 uint64_t *internal_pages, uint64_t *internal_compressed_pages,
4831 uint64_t *purgeable_nonvolatile_pages, uint64_t *purgeable_nonvolatile_compressed_pages,
4832 uint64_t *alternate_accounting_pages, uint64_t *alternate_accounting_compressed_pages,
4833 uint64_t *iokit_mapped_pages, uint64_t *page_table_pages, uint64_t *frozen_to_swap_pages)
4834 {
4835 assert(task);
4836
4837 if (internal_pages) {
4838 *internal_pages = (get_task_internal(task) / PAGE_SIZE_64);
4839 }
4840
4841 if (internal_compressed_pages) {
4842 *internal_compressed_pages = (get_task_internal_compressed(task) / PAGE_SIZE_64);
4843 }
4844
4845 if (purgeable_nonvolatile_pages) {
4846 *purgeable_nonvolatile_pages = (get_task_purgeable_nonvolatile(task) / PAGE_SIZE_64);
4847 }
4848
4849 if (purgeable_nonvolatile_compressed_pages) {
4850 *purgeable_nonvolatile_compressed_pages = (get_task_purgeable_nonvolatile_compressed(task) / PAGE_SIZE_64);
4851 }
4852
4853 if (alternate_accounting_pages) {
4854 *alternate_accounting_pages = (get_task_alternate_accounting(task) / PAGE_SIZE_64);
4855 }
4856
4857 if (alternate_accounting_compressed_pages) {
4858 *alternate_accounting_compressed_pages = (get_task_alternate_accounting_compressed(task) / PAGE_SIZE_64);
4859 }
4860
4861 if (iokit_mapped_pages) {
4862 *iokit_mapped_pages = (get_task_iokit_mapped(task) / PAGE_SIZE_64);
4863 }
4864
4865 if (page_table_pages) {
4866 *page_table_pages = (get_task_page_table(task) / PAGE_SIZE_64);
4867 }
4868
4869 #if CONFIG_FREEZE
4870 if (frozen_to_swap_pages) {
4871 *frozen_to_swap_pages = (get_task_frozen_to_swap(task) / PAGE_SIZE_64);
4872 }
4873 #else /* CONFIG_FREEZE */
4874 #pragma unused(frozen_to_swap_pages)
4875 #endif /* CONFIG_FREEZE */
4876 }
4877
4878 #if CONFIG_FREEZE
4879 /*
4880 * Copies the source entry into the destination snapshot.
4881 * Returns true on success. Fails if the destination snapshot is full.
4882 * Caller must hold the proc list lock.
4883 */
4884 static bool
memorystatus_jetsam_snapshot_copy_entry_locked(memorystatus_jetsam_snapshot_t * dst_snapshot,unsigned int dst_snapshot_size,const memorystatus_jetsam_snapshot_entry_t * src_entry)4885 memorystatus_jetsam_snapshot_copy_entry_locked(memorystatus_jetsam_snapshot_t *dst_snapshot, unsigned int dst_snapshot_size, const memorystatus_jetsam_snapshot_entry_t *src_entry)
4886 {
4887 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
4888 assert(dst_snapshot);
4889
4890 if (dst_snapshot->entry_count == dst_snapshot_size) {
4891 /* Destination snapshot is full. Can not be updated until it is consumed. */
4892 return false;
4893 }
4894 if (dst_snapshot->entry_count == 0) {
4895 memorystatus_init_jetsam_snapshot_header(dst_snapshot);
4896 }
4897 memorystatus_jetsam_snapshot_entry_t *dst_entry = &dst_snapshot->entries[dst_snapshot->entry_count++];
4898 memcpy(dst_entry, src_entry, sizeof(memorystatus_jetsam_snapshot_entry_t));
4899 return true;
4900 }
4901 #endif /* CONFIG_FREEZE */
4902
4903 static bool
memorystatus_init_jetsam_snapshot_entry_with_kill_locked(memorystatus_jetsam_snapshot_t * snapshot,proc_t p,uint32_t kill_cause,uint64_t killtime,memorystatus_jetsam_snapshot_entry_t ** entry)4904 memorystatus_init_jetsam_snapshot_entry_with_kill_locked(memorystatus_jetsam_snapshot_t *snapshot, proc_t p, uint32_t kill_cause, uint64_t killtime, memorystatus_jetsam_snapshot_entry_t **entry)
4905 {
4906 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
4907 memorystatus_jetsam_snapshot_entry_t *snapshot_list = snapshot->entries;
4908 size_t i = snapshot->entry_count;
4909
4910 if (memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[i], (snapshot->js_gencount)) == TRUE) {
4911 *entry = &snapshot_list[i];
4912 (*entry)->killed = kill_cause;
4913 (*entry)->jse_killtime = killtime;
4914
4915 snapshot->entry_count = i + 1;
4916 return true;
4917 }
4918 return false;
4919 }
4920
4921 /*
4922 * This routine only acts on the global jetsam event snapshot.
4923 * Updating the process's entry can race when the memorystatus_thread
4924 * has chosen to kill a process that is racing to exit on another core.
4925 */
4926 static void
memorystatus_update_jetsam_snapshot_entry_locked(proc_t p,uint32_t kill_cause,uint64_t killtime)4927 memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime)
4928 {
4929 memorystatus_jetsam_snapshot_entry_t *entry = NULL;
4930 memorystatus_jetsam_snapshot_t *snapshot = NULL;
4931 memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL;
4932
4933 unsigned int i;
4934 #if CONFIG_FREEZE
4935 bool copied_to_freezer_snapshot = false;
4936 #endif /* CONFIG_FREEZE */
4937
4938 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
4939
4940 if (memorystatus_jetsam_snapshot_count == 0) {
4941 /*
4942 * No active snapshot.
4943 * Nothing to do.
4944 */
4945 goto exit;
4946 }
4947
4948 /*
4949 * Sanity check as this routine should only be called
4950 * from a jetsam kill path.
4951 */
4952 assert(kill_cause != 0 && killtime != 0);
4953
4954 snapshot = memorystatus_jetsam_snapshot;
4955 snapshot_list = memorystatus_jetsam_snapshot->entries;
4956
4957 for (i = 0; i < memorystatus_jetsam_snapshot_count; i++) {
4958 if (snapshot_list[i].pid == proc_getpid(p)) {
4959 entry = &snapshot_list[i];
4960
4961 if (entry->killed || entry->jse_killtime) {
4962 /*
4963 * We apparently raced on the exit path
4964 * for this process, as it's snapshot entry
4965 * has already recorded a kill.
4966 */
4967 assert(entry->killed && entry->jse_killtime);
4968 break;
4969 }
4970
4971 /*
4972 * Update the entry we just found in the snapshot.
4973 */
4974
4975 entry->killed = kill_cause;
4976 entry->jse_killtime = killtime;
4977 entry->jse_gencount = snapshot->js_gencount;
4978 entry->jse_idle_delta = p->p_memstat_idle_delta;
4979 #if CONFIG_FREEZE
4980 entry->jse_thaw_count = p->p_memstat_thaw_count;
4981 entry->jse_freeze_skip_reason = p->p_memstat_freeze_skip_reason;
4982 #else /* CONFIG_FREEZE */
4983 entry->jse_thaw_count = 0;
4984 entry->jse_freeze_skip_reason = kMemorystatusFreezeSkipReasonNone;
4985 #endif /* CONFIG_FREEZE */
4986
4987 /*
4988 * If a process has moved between bands since snapshot was
4989 * initialized, then likely these fields changed too.
4990 */
4991 if (entry->priority != p->p_memstat_effectivepriority) {
4992 strlcpy(entry->name, p->p_name, sizeof(entry->name));
4993 entry->priority = p->p_memstat_effectivepriority;
4994 entry->state = memorystatus_build_state(p);
4995 entry->user_data = p->p_memstat_userdata;
4996 entry->fds = p->p_fd.fd_nfiles;
4997 }
4998
4999 /*
5000 * Always update the page counts on a kill.
5001 */
5002
5003 uint32_t pages = 0;
5004 uint32_t max_pages_lifetime = 0;
5005 uint32_t purgeable_pages = 0;
5006
5007 memorystatus_get_task_page_counts(p->task, &pages, &max_pages_lifetime, &purgeable_pages);
5008 entry->pages = (uint64_t)pages;
5009 entry->max_pages_lifetime = (uint64_t)max_pages_lifetime;
5010 entry->purgeable_pages = (uint64_t)purgeable_pages;
5011
5012 uint64_t internal_pages = 0;
5013 uint64_t internal_compressed_pages = 0;
5014 uint64_t purgeable_nonvolatile_pages = 0;
5015 uint64_t purgeable_nonvolatile_compressed_pages = 0;
5016 uint64_t alternate_accounting_pages = 0;
5017 uint64_t alternate_accounting_compressed_pages = 0;
5018 uint64_t iokit_mapped_pages = 0;
5019 uint64_t page_table_pages = 0;
5020 uint64_t frozen_to_swap_pages = 0;
5021
5022 memorystatus_get_task_phys_footprint_page_counts(p->task, &internal_pages, &internal_compressed_pages,
5023 &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages,
5024 &alternate_accounting_pages, &alternate_accounting_compressed_pages,
5025 &iokit_mapped_pages, &page_table_pages, &frozen_to_swap_pages);
5026
5027 entry->jse_internal_pages = internal_pages;
5028 entry->jse_internal_compressed_pages = internal_compressed_pages;
5029 entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages;
5030 entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages;
5031 entry->jse_alternate_accounting_pages = alternate_accounting_pages;
5032 entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages;
5033 entry->jse_iokit_mapped_pages = iokit_mapped_pages;
5034 entry->jse_page_table_pages = page_table_pages;
5035 entry->jse_frozen_to_swap_pages = frozen_to_swap_pages;
5036
5037 uint64_t region_count = 0;
5038 memorystatus_get_task_memory_region_count(p->task, ®ion_count);
5039 entry->jse_memory_region_count = region_count;
5040
5041 goto exit;
5042 }
5043 }
5044
5045 if (entry == NULL) {
5046 /*
5047 * The entry was not found in the snapshot, so the process must have
5048 * launched after the snapshot was initialized.
5049 * Let's try to append the new entry.
5050 */
5051 if (memorystatus_jetsam_snapshot_count < memorystatus_jetsam_snapshot_max) {
5052 /*
5053 * A populated snapshot buffer exists
5054 * and there is room to init a new entry.
5055 */
5056 assert(memorystatus_jetsam_snapshot_count == snapshot->entry_count);
5057
5058 if (memorystatus_init_jetsam_snapshot_entry_with_kill_locked(snapshot, p, kill_cause, killtime, &entry)) {
5059 memorystatus_jetsam_snapshot_count++;
5060
5061 if (memorystatus_jetsam_snapshot_count >= memorystatus_jetsam_snapshot_max) {
5062 /*
5063 * We just used the last slot in the snapshot buffer.
5064 * We only want to log it once... so we do it here
5065 * when we notice we've hit the max.
5066 */
5067 printf("memorystatus: WARNING snapshot buffer is full, count %d\n",
5068 memorystatus_jetsam_snapshot_count);
5069 }
5070 }
5071 }
5072 }
5073
5074 exit:
5075 if (entry) {
5076 #if CONFIG_FREEZE
5077 if (memorystatus_jetsam_use_freezer_snapshot && isApp(p)) {
5078 /* This is an app kill. Record it in the freezer snapshot so dasd can incorporate this in its recommendations. */
5079 copied_to_freezer_snapshot = memorystatus_jetsam_snapshot_copy_entry_locked(memorystatus_jetsam_snapshot_freezer, memorystatus_jetsam_snapshot_freezer_max, entry);
5080 if (copied_to_freezer_snapshot && memorystatus_jetsam_snapshot_freezer->entry_count == memorystatus_jetsam_snapshot_freezer_max) {
5081 /*
5082 * We just used the last slot in the freezer snapshot buffer.
5083 * We only want to log it once... so we do it here
5084 * when we notice we've hit the max.
5085 */
5086 os_log_error(OS_LOG_DEFAULT, "memorystatus: WARNING freezer snapshot buffer is full, count %zu",
5087 memorystatus_jetsam_snapshot_freezer->entry_count);
5088 }
5089 }
5090 #endif /* CONFIG_FREEZE */
5091 } else {
5092 /*
5093 * If we reach here, the snapshot buffer could not be updated.
5094 * Most likely, the buffer is full, in which case we would have
5095 * logged a warning in the previous call.
5096 *
5097 * For now, we will stop appending snapshot entries.
5098 * When the buffer is consumed, the snapshot state will reset.
5099 */
5100
5101 MEMORYSTATUS_DEBUG(4, "memorystatus_update_jetsam_snapshot_entry_locked: failed to update pid %d, priority %d, count %d\n",
5102 proc_getpid(p), p->p_memstat_effectivepriority, memorystatus_jetsam_snapshot_count);
5103
5104 #if CONFIG_FREEZE
5105 /* We still attempt to record this in the freezer snapshot */
5106 if (memorystatus_jetsam_use_freezer_snapshot && isApp(p)) {
5107 snapshot = memorystatus_jetsam_snapshot_freezer;
5108 if (snapshot->entry_count < memorystatus_jetsam_snapshot_freezer_max) {
5109 copied_to_freezer_snapshot = memorystatus_init_jetsam_snapshot_entry_with_kill_locked(snapshot, p, kill_cause, killtime, &entry);
5110 if (copied_to_freezer_snapshot && memorystatus_jetsam_snapshot_freezer->entry_count == memorystatus_jetsam_snapshot_freezer_max) {
5111 /*
5112 * We just used the last slot in the freezer snapshot buffer.
5113 * We only want to log it once... so we do it here
5114 * when we notice we've hit the max.
5115 */
5116 os_log_error(OS_LOG_DEFAULT, "memorystatus: WARNING freezer snapshot buffer is full, count %zu",
5117 memorystatus_jetsam_snapshot_freezer->entry_count);
5118 }
5119 }
5120 }
5121 #endif /* CONFIG_FREEZE */
5122 }
5123
5124 return;
5125 }
5126
5127 #if CONFIG_JETSAM
5128 void
memorystatus_pages_update(unsigned int pages_avail)5129 memorystatus_pages_update(unsigned int pages_avail)
5130 {
5131 memorystatus_available_pages = pages_avail;
5132
5133 #if VM_PRESSURE_EVENTS
5134 /*
5135 * Since memorystatus_available_pages changes, we should
5136 * re-evaluate the pressure levels on the system and
5137 * check if we need to wake the pressure thread.
5138 * We also update memorystatus_level in that routine.
5139 */
5140 vm_pressure_response();
5141
5142 if (memorystatus_available_pages <= memorystatus_available_pages_pressure) {
5143 if (memorystatus_hwm_candidates || (memorystatus_available_pages <= memorystatus_available_pages_critical)) {
5144 memorystatus_thread_wake();
5145 }
5146 }
5147 #if CONFIG_FREEZE
5148 /*
5149 * We can't grab the freezer_mutex here even though that synchronization would be correct to inspect
5150 * the # of frozen processes and wakeup the freezer thread. Reason being that we come here into this
5151 * code with (possibly) the page-queue locks held and preemption disabled. So trying to grab a mutex here
5152 * will result in the "mutex with preemption disabled" panic.
5153 */
5154
5155 if (memorystatus_freeze_thread_should_run() == TRUE) {
5156 /*
5157 * The freezer thread is usually woken up by some user-space call i.e. pid_hibernate(any process).
5158 * That trigger isn't invoked often enough and so we are enabling this explicit wakeup here.
5159 */
5160 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
5161 thread_wakeup((event_t)&memorystatus_freeze_wakeup);
5162 }
5163 }
5164 #endif /* CONFIG_FREEZE */
5165
5166 #else /* VM_PRESSURE_EVENTS */
5167
5168 boolean_t critical, delta;
5169
5170 if (!memorystatus_delta) {
5171 return;
5172 }
5173
5174 critical = (pages_avail < memorystatus_available_pages_critical) ? TRUE : FALSE;
5175 delta = ((pages_avail >= (memorystatus_available_pages + memorystatus_delta))
5176 || (memorystatus_available_pages >= (pages_avail + memorystatus_delta))) ? TRUE : FALSE;
5177
5178 if (critical || delta) {
5179 unsigned int total_pages;
5180
5181 total_pages = (unsigned int) atop_64(max_mem);
5182 #if CONFIG_SECLUDED_MEMORY
5183 total_pages -= vm_page_secluded_count;
5184 #endif /* CONFIG_SECLUDED_MEMORY */
5185 memorystatus_level = memorystatus_available_pages * 100 / total_pages;
5186 memorystatus_thread_wake();
5187 }
5188 #endif /* VM_PRESSURE_EVENTS */
5189 }
5190 #endif /* CONFIG_JETSAM */
5191
5192 static boolean_t
memorystatus_init_jetsam_snapshot_entry_locked(proc_t p,memorystatus_jetsam_snapshot_entry_t * entry,uint64_t gencount)5193 memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry, uint64_t gencount)
5194 {
5195 clock_sec_t tv_sec;
5196 clock_usec_t tv_usec;
5197 uint32_t pages = 0;
5198 uint32_t max_pages_lifetime = 0;
5199 uint32_t purgeable_pages = 0;
5200 uint64_t internal_pages = 0;
5201 uint64_t internal_compressed_pages = 0;
5202 uint64_t purgeable_nonvolatile_pages = 0;
5203 uint64_t purgeable_nonvolatile_compressed_pages = 0;
5204 uint64_t alternate_accounting_pages = 0;
5205 uint64_t alternate_accounting_compressed_pages = 0;
5206 uint64_t iokit_mapped_pages = 0;
5207 uint64_t page_table_pages = 0;
5208 uint64_t frozen_to_swap_pages = 0;
5209 uint64_t region_count = 0;
5210 uint64_t cids[COALITION_NUM_TYPES];
5211
5212 memset(entry, 0, sizeof(memorystatus_jetsam_snapshot_entry_t));
5213
5214 entry->pid = proc_getpid(p);
5215 strlcpy(&entry->name[0], p->p_name, sizeof(entry->name));
5216 entry->priority = p->p_memstat_effectivepriority;
5217
5218 memorystatus_get_task_page_counts(p->task, &pages, &max_pages_lifetime, &purgeable_pages);
5219 entry->pages = (uint64_t)pages;
5220 entry->max_pages_lifetime = (uint64_t)max_pages_lifetime;
5221 entry->purgeable_pages = (uint64_t)purgeable_pages;
5222
5223 memorystatus_get_task_phys_footprint_page_counts(p->task, &internal_pages, &internal_compressed_pages,
5224 &purgeable_nonvolatile_pages, &purgeable_nonvolatile_compressed_pages,
5225 &alternate_accounting_pages, &alternate_accounting_compressed_pages,
5226 &iokit_mapped_pages, &page_table_pages, &frozen_to_swap_pages);
5227
5228 entry->jse_internal_pages = internal_pages;
5229 entry->jse_internal_compressed_pages = internal_compressed_pages;
5230 entry->jse_purgeable_nonvolatile_pages = purgeable_nonvolatile_pages;
5231 entry->jse_purgeable_nonvolatile_compressed_pages = purgeable_nonvolatile_compressed_pages;
5232 entry->jse_alternate_accounting_pages = alternate_accounting_pages;
5233 entry->jse_alternate_accounting_compressed_pages = alternate_accounting_compressed_pages;
5234 entry->jse_iokit_mapped_pages = iokit_mapped_pages;
5235 entry->jse_page_table_pages = page_table_pages;
5236 entry->jse_frozen_to_swap_pages = frozen_to_swap_pages;
5237
5238 memorystatus_get_task_memory_region_count(p->task, ®ion_count);
5239 entry->jse_memory_region_count = region_count;
5240
5241 entry->state = memorystatus_build_state(p);
5242 entry->user_data = p->p_memstat_userdata;
5243 proc_getexecutableuuid(p, &entry->uuid[0], sizeof(entry->uuid));
5244 entry->fds = p->p_fd.fd_nfiles;
5245
5246 absolutetime_to_microtime(get_task_cpu_time(p->task), &tv_sec, &tv_usec);
5247 entry->cpu_time.tv_sec = (int64_t)tv_sec;
5248 entry->cpu_time.tv_usec = (int64_t)tv_usec;
5249
5250 assert(p->p_stats != NULL);
5251 entry->jse_starttime = p->p_stats->ps_start; /* abstime process started */
5252 entry->jse_killtime = 0; /* abstime jetsam chose to kill process */
5253 entry->killed = 0; /* the jetsam kill cause */
5254 entry->jse_gencount = gencount; /* indicates a pass through jetsam thread, when process was targeted to be killed */
5255
5256 entry->jse_idle_delta = p->p_memstat_idle_delta; /* Most recent timespan spent in idle-band */
5257
5258 #if CONFIG_FREEZE
5259 entry->jse_freeze_skip_reason = p->p_memstat_freeze_skip_reason;
5260 entry->jse_thaw_count = p->p_memstat_thaw_count;
5261 #else /* CONFIG_FREEZE */
5262 entry->jse_thaw_count = 0;
5263 entry->jse_freeze_skip_reason = kMemorystatusFreezeSkipReasonNone;
5264 #endif /* CONFIG_FREEZE */
5265
5266 proc_coalitionids(p, cids);
5267 entry->jse_coalition_jetsam_id = cids[COALITION_TYPE_JETSAM];
5268
5269 return TRUE;
5270 }
5271
5272 static void
memorystatus_init_snapshot_vmstats(memorystatus_jetsam_snapshot_t * snapshot)5273 memorystatus_init_snapshot_vmstats(memorystatus_jetsam_snapshot_t *snapshot)
5274 {
5275 kern_return_t kr = KERN_SUCCESS;
5276 mach_msg_type_number_t count = HOST_VM_INFO64_COUNT;
5277 vm_statistics64_data_t vm_stat;
5278
5279 if ((kr = host_statistics64(host_self(), HOST_VM_INFO64, (host_info64_t)&vm_stat, &count)) != KERN_SUCCESS) {
5280 printf("memorystatus_init_jetsam_snapshot_stats: host_statistics64 failed with %d\n", kr);
5281 memset(&snapshot->stats, 0, sizeof(snapshot->stats));
5282 } else {
5283 snapshot->stats.free_pages = vm_stat.free_count;
5284 snapshot->stats.active_pages = vm_stat.active_count;
5285 snapshot->stats.inactive_pages = vm_stat.inactive_count;
5286 snapshot->stats.throttled_pages = vm_stat.throttled_count;
5287 snapshot->stats.purgeable_pages = vm_stat.purgeable_count;
5288 snapshot->stats.wired_pages = vm_stat.wire_count;
5289
5290 snapshot->stats.speculative_pages = vm_stat.speculative_count;
5291 snapshot->stats.filebacked_pages = vm_stat.external_page_count;
5292 snapshot->stats.anonymous_pages = vm_stat.internal_page_count;
5293 snapshot->stats.compressions = vm_stat.compressions;
5294 snapshot->stats.decompressions = vm_stat.decompressions;
5295 snapshot->stats.compressor_pages = vm_stat.compressor_page_count;
5296 snapshot->stats.total_uncompressed_pages_in_compressor = vm_stat.total_uncompressed_pages_in_compressor;
5297 }
5298
5299 get_zone_map_size(&snapshot->stats.zone_map_size, &snapshot->stats.zone_map_capacity);
5300
5301 bzero(snapshot->stats.largest_zone_name, sizeof(snapshot->stats.largest_zone_name));
5302 get_largest_zone_info(snapshot->stats.largest_zone_name, sizeof(snapshot->stats.largest_zone_name),
5303 &snapshot->stats.largest_zone_size);
5304 }
5305
5306 /*
5307 * Collect vm statistics at boot.
5308 * Called only once (see kern_exec.c)
5309 * Data can be consumed at any time.
5310 */
5311 void
memorystatus_init_at_boot_snapshot()5312 memorystatus_init_at_boot_snapshot()
5313 {
5314 memorystatus_init_snapshot_vmstats(&memorystatus_at_boot_snapshot);
5315 memorystatus_at_boot_snapshot.entry_count = 0;
5316 memorystatus_at_boot_snapshot.notification_time = 0; /* updated when consumed */
5317 memorystatus_at_boot_snapshot.snapshot_time = mach_absolute_time();
5318 }
5319
5320 static void
memorystatus_init_jetsam_snapshot_header(memorystatus_jetsam_snapshot_t * snapshot)5321 memorystatus_init_jetsam_snapshot_header(memorystatus_jetsam_snapshot_t *snapshot)
5322 {
5323 memorystatus_init_snapshot_vmstats(snapshot);
5324 snapshot->snapshot_time = mach_absolute_time();
5325 snapshot->notification_time = 0;
5326 snapshot->js_gencount = 0;
5327 }
5328
5329 static void
memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t * od_snapshot,uint32_t ods_list_count)5330 memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snapshot, uint32_t ods_list_count )
5331 {
5332 proc_t p, next_p;
5333 unsigned int b = 0, i = 0;
5334
5335 memorystatus_jetsam_snapshot_t *snapshot = NULL;
5336 memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL;
5337 unsigned int snapshot_max = 0;
5338 #if MEMORYSTATUS_DEBUG_LOG
5339 uuid_t uuid;
5340 #endif
5341
5342 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
5343
5344 if (od_snapshot) {
5345 /*
5346 * This is an on_demand snapshot
5347 */
5348 snapshot = od_snapshot;
5349 snapshot_list = od_snapshot->entries;
5350 snapshot_max = ods_list_count;
5351 } else {
5352 /*
5353 * This is a jetsam event snapshot
5354 */
5355 snapshot = memorystatus_jetsam_snapshot;
5356 snapshot_list = memorystatus_jetsam_snapshot->entries;
5357 snapshot_max = memorystatus_jetsam_snapshot_max;
5358 }
5359
5360 memorystatus_init_jetsam_snapshot_header(snapshot);
5361
5362 next_p = memorystatus_get_first_proc_locked(&b, TRUE);
5363 while (next_p) {
5364 p = next_p;
5365 next_p = memorystatus_get_next_proc_locked(&b, p, TRUE);
5366
5367 if (FALSE == memorystatus_init_jetsam_snapshot_entry_locked(p, &snapshot_list[i], snapshot->js_gencount)) {
5368 continue;
5369 }
5370
5371 #if MEMORYSTATUS_DEBUG_LOG
5372 proc_getexecutableuuid(p, &uuid[0], sizeof(uuid));
5373
5374 MEMORYSTATUS_DEBUG(0, "jetsam snapshot pid %d, uuid = %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
5375 proc_getpid(p),
5376 uuid[0], uuid[1], uuid[2], uuid[3], uuid[4], uuid[5], uuid[6], uuid[7],
5377 uuid[8], uuid[9], uuid[10], uuid[11], uuid[12], uuid[13], uuid[14], uuid[15]);
5378 #endif
5379
5380 if (++i == snapshot_max) {
5381 break;
5382 }
5383 }
5384
5385 snapshot->entry_count = i;
5386
5387 if (!od_snapshot) {
5388 /* update the system buffer count */
5389 memorystatus_jetsam_snapshot_count = i;
5390 }
5391 }
5392
5393 #if DEVELOPMENT || DEBUG
5394
5395 /*
5396 * Verify that the given bucket has been sorted correctly.
5397 *
5398 * Walks through the bucket and verifies that all pids in the
5399 * expected_order buffer are in that bucket and in the same
5400 * relative order.
5401 *
5402 * The proc_list_lock must be held by the caller.
5403 */
5404 static int
memorystatus_verify_sort_order(unsigned int bucket_index,pid_t * expected_order,size_t num_pids)5405 memorystatus_verify_sort_order(unsigned int bucket_index, pid_t *expected_order, size_t num_pids)
5406 {
5407 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
5408
5409 int error = 0;
5410 proc_t p = NULL;
5411 size_t i = 0;
5412
5413 /*
5414 * NB: We allow other procs to be mixed in within the expected ones.
5415 * We just need the expected procs to be in the right order relative to each other.
5416 */
5417 p = memorystatus_get_first_proc_locked(&bucket_index, FALSE);
5418 while (p) {
5419 if (proc_getpid(p) == expected_order[i]) {
5420 i++;
5421 }
5422 if (i == num_pids) {
5423 break;
5424 }
5425 p = memorystatus_get_next_proc_locked(&bucket_index, p, FALSE);
5426 }
5427 if (i != num_pids) {
5428 char buffer[128];
5429 size_t len = sizeof(buffer);
5430 size_t buffer_idx = 0;
5431 os_log_error(OS_LOG_DEFAULT, "memorystatus_verify_sort_order: Processes in bucket %d were not sorted properly\n", bucket_index);
5432 for (i = 0; i < num_pids; i++) {
5433 int num_written = snprintf(buffer + buffer_idx, len - buffer_idx, "%d,", expected_order[i]);
5434 if (num_written <= 0) {
5435 break;
5436 }
5437 if (buffer_idx + (unsigned int) num_written >= len) {
5438 break;
5439 }
5440 buffer_idx += num_written;
5441 }
5442 os_log_error(OS_LOG_DEFAULT, "memorystatus_verify_sort_order: Expected order [%s]", buffer);
5443 memset(buffer, 0, len);
5444 buffer_idx = 0;
5445 p = memorystatus_get_first_proc_locked(&bucket_index, FALSE);
5446 i = 0;
5447 os_log_error(OS_LOG_DEFAULT, "memorystatus_verify_sort_order: Actual order:");
5448 while (p) {
5449 int num_written;
5450 if (buffer_idx == 0) {
5451 num_written = snprintf(buffer + buffer_idx, len - buffer_idx, "%zu: %d,", i, proc_getpid(p));
5452 } else {
5453 num_written = snprintf(buffer + buffer_idx, len - buffer_idx, "%d,", proc_getpid(p));
5454 }
5455 if (num_written <= 0) {
5456 break;
5457 }
5458 buffer_idx += (unsigned int) num_written;
5459 assert(buffer_idx <= len);
5460 if (i % 10 == 0) {
5461 os_log_error(OS_LOG_DEFAULT, "memorystatus_verify_sort_order: %s", buffer);
5462 buffer_idx = 0;
5463 }
5464 p = memorystatus_get_next_proc_locked(&bucket_index, p, FALSE);
5465 i++;
5466 }
5467 if (buffer_idx != 0) {
5468 os_log_error(OS_LOG_DEFAULT, "memorystatus_verify_sort_order: %s", buffer);
5469 }
5470 error = EINVAL;
5471 }
5472 return error;
5473 }
5474
5475 /*
5476 * Triggers a sort_order on a specified jetsam priority band.
5477 * This is for testing only, used to force a path through the sort
5478 * function.
5479 */
5480 static int
memorystatus_cmd_test_jetsam_sort(int priority,int sort_order,user_addr_t expected_order_user,size_t expected_order_user_len)5481 memorystatus_cmd_test_jetsam_sort(int priority,
5482 int sort_order,
5483 user_addr_t expected_order_user,
5484 size_t expected_order_user_len)
5485 {
5486 int error = 0;
5487 unsigned int bucket_index = 0;
5488 static size_t kMaxPids = 8;
5489 pid_t expected_order[kMaxPids];
5490 size_t copy_size = sizeof(expected_order);
5491 size_t num_pids;
5492
5493 if (expected_order_user_len < copy_size) {
5494 copy_size = expected_order_user_len;
5495 }
5496 num_pids = copy_size / sizeof(pid_t);
5497
5498 error = copyin(expected_order_user, expected_order, copy_size);
5499 if (error != 0) {
5500 return error;
5501 }
5502
5503 if (priority == -1) {
5504 /* Use as shorthand for default priority */
5505 bucket_index = JETSAM_PRIORITY_DEFAULT;
5506 } else {
5507 bucket_index = (unsigned int)priority;
5508 }
5509
5510 /*
5511 * Acquire lock before sorting so we can check the sort order
5512 * while still holding the lock.
5513 */
5514 proc_list_lock();
5515
5516 memorystatus_sort_bucket_locked(bucket_index, sort_order);
5517
5518 if (expected_order_user != CAST_USER_ADDR_T(NULL) && expected_order_user_len > 0) {
5519 error = memorystatus_verify_sort_order(bucket_index, expected_order, num_pids);
5520 }
5521
5522 proc_list_unlock();
5523
5524 return error;
5525 }
5526
5527 #endif /* DEVELOPMENT || DEBUG */
5528
5529 /*
5530 * Prepare the process to be killed (set state, update snapshot) and kill it.
5531 */
5532 static uint64_t memorystatus_purge_before_jetsam_success = 0;
5533
5534 static boolean_t
memorystatus_kill_proc(proc_t p,uint32_t cause,os_reason_t jetsam_reason,boolean_t * killed,uint64_t * footprint_of_killed_proc)5535 memorystatus_kill_proc(proc_t p, uint32_t cause, os_reason_t jetsam_reason, boolean_t *killed, uint64_t *footprint_of_killed_proc)
5536 {
5537 pid_t aPid = 0;
5538 uint32_t aPid_ep = 0;
5539
5540 uint64_t killtime = 0;
5541 clock_sec_t tv_sec;
5542 clock_usec_t tv_usec;
5543 uint32_t tv_msec;
5544 boolean_t retval = FALSE;
5545
5546 aPid = proc_getpid(p);
5547 aPid_ep = p->p_memstat_effectivepriority;
5548
5549 if (cause != kMemorystatusKilledVnodes && cause != kMemorystatusKilledZoneMapExhaustion) {
5550 /*
5551 * Genuine memory pressure and not other (vnode/zone) resource exhaustion.
5552 */
5553 boolean_t success = FALSE;
5554 uint64_t num_pages_purged;
5555 uint64_t num_pages_reclaimed = 0;
5556 uint64_t num_pages_unsecluded = 0;
5557
5558 networking_memstatus_callout(p, cause);
5559 num_pages_purged = vm_purgeable_purge_task_owned(p->task);
5560 num_pages_reclaimed += num_pages_purged;
5561 #if CONFIG_SECLUDED_MEMORY
5562 if (cause == kMemorystatusKilledVMPageShortage &&
5563 vm_page_secluded_count > 0 &&
5564 task_can_use_secluded_mem(p->task, FALSE)) {
5565 /*
5566 * We're about to kill a process that has access
5567 * to the secluded pool. Drain that pool into the
5568 * free or active queues to make these pages re-appear
5569 * as "available", which might make us no longer need
5570 * to kill that process.
5571 * Since the secluded pool does not get refilled while
5572 * a process has access to it, it should remain
5573 * drained.
5574 */
5575 num_pages_unsecluded = vm_page_secluded_drain();
5576 num_pages_reclaimed += num_pages_unsecluded;
5577 }
5578 #endif /* CONFIG_SECLUDED_MEMORY */
5579
5580 if (num_pages_reclaimed) {
5581 /*
5582 * We actually reclaimed something and so let's
5583 * check if we need to continue with the kill.
5584 */
5585 if (cause == kMemorystatusKilledHiwat) {
5586 uint64_t footprint_in_bytes = get_task_phys_footprint(p->task);
5587 uint64_t memlimit_in_bytes = (((uint64_t)p->p_memstat_memlimit) * 1024ULL * 1024ULL); /* convert MB to bytes */
5588 success = (footprint_in_bytes <= memlimit_in_bytes);
5589 } else {
5590 success = (memorystatus_avail_pages_below_pressure() == FALSE);
5591 #if CONFIG_SECLUDED_MEMORY
5592 if (!success && num_pages_unsecluded) {
5593 /*
5594 * We just drained the secluded pool
5595 * because we're about to kill a
5596 * process that has access to it.
5597 * This is an important process and
5598 * we'd rather not kill it unless
5599 * absolutely necessary, so declare
5600 * success even if draining the pool
5601 * did not quite get us out of the
5602 * "pressure" level but still got
5603 * us out of the "critical" level.
5604 */
5605 success = (memorystatus_avail_pages_below_critical() == FALSE);
5606 }
5607 #endif /* CONFIG_SECLUDED_MEMORY */
5608 }
5609
5610 if (success) {
5611 memorystatus_purge_before_jetsam_success++;
5612
5613 os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: reclaimed %llu pages (%llu purged, %llu unsecluded) from pid %d [%s] and avoided %s\n",
5614 num_pages_reclaimed, num_pages_purged, num_pages_unsecluded, aPid, ((p && *p->p_name) ? p->p_name : "unknown"), memorystatus_kill_cause_name[cause]);
5615
5616 *killed = FALSE;
5617
5618 return TRUE;
5619 }
5620 }
5621 }
5622
5623 #if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
5624 MEMORYSTATUS_DEBUG(1, "jetsam: killing pid %d [%s] - %lld Mb > 1 (%d Mb)\n",
5625 aPid, (*p->p_name ? p->p_name : "unknown"),
5626 (footprint_in_bytes / (1024ULL * 1024ULL)), /* converted bytes to MB */
5627 p->p_memstat_memlimit);
5628 #endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */
5629
5630 killtime = mach_absolute_time();
5631 absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
5632 tv_msec = tv_usec / 1000;
5633
5634 proc_list_lock();
5635 memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
5636 proc_list_unlock();
5637
5638 char kill_reason_string[128];
5639
5640 if (cause == kMemorystatusKilledHiwat) {
5641 strlcpy(kill_reason_string, "killing_highwater_process", 128);
5642 } else {
5643 if (aPid_ep == JETSAM_PRIORITY_IDLE) {
5644 strlcpy(kill_reason_string, "killing_idle_process", 128);
5645 } else {
5646 strlcpy(kill_reason_string, "killing_top_process", 128);
5647 }
5648 }
5649
5650 /*
5651 * memorystatus_do_kill drops a reference, so take another one so we can
5652 * continue to use this exit reason even after memorystatus_do_kill()
5653 * returns
5654 */
5655 os_reason_ref(jetsam_reason);
5656
5657 retval = memorystatus_do_kill(p, cause, jetsam_reason, footprint_of_killed_proc);
5658 *killed = retval;
5659
5660 os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: %s pid %d [%s] (%s %d) %lluKB - memorystatus_available_pages: %llu compressor_size:%u",
5661 (unsigned long)tv_sec, tv_msec, kill_reason_string,
5662 aPid, ((p && *p->p_name) ? p->p_name : "unknown"),
5663 memorystatus_kill_cause_name[cause], aPid_ep,
5664 (*footprint_of_killed_proc) >> 10, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
5665
5666 return retval;
5667 }
5668
/*
 * Jetsam the first process in the queue.
 *
 * Walks the memorystatus proc list from the lowest band upwards, up to a
 * locally computed maximum kill band, and tries to reclaim memory from the
 * first eligible process via memorystatus_kill_proc() (which may purge
 * instead of kill). A process that fails to be killed is flagged
 * P_MEMSTAT_ERROR and the walk restarts from the beginning.
 *
 * any              - only used with CONFIG_FREEZE: when FALSE, processes
 *                    with P_MEMSTAT_LOCKED set are skipped.
 * sort_flag        - when TRUE, sort a band before scanning (FOREGROUND
 *                    with CONFIG_JETSAM, IDLE otherwise).
 * cause            - kMemorystatusKilled* kill cause.
 * jetsam_reason    - exit reason; released with os_reason_free() before
 *                    returning.
 * priority         - optional out: effective priority of the killed
 *                    process. Whether it is NULL also influences whether a
 *                    new snapshot is captured for idle-band targets.
 * errors           - in/out: incremented for every process that could not
 *                    be killed.
 * memory_reclaimed - out: footprint of the killed process, or 0 if nothing
 *                    was killed.
 *
 * Returns TRUE if a process was killed, FALSE otherwise (including the
 * case where purging alone reclaimed memory).
 */
static boolean_t
memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, os_reason_t jetsam_reason,
    int32_t *priority, uint32_t *errors, uint64_t *memory_reclaimed)
{
	pid_t aPid;
	proc_t p = PROC_NULL, next_p = PROC_NULL;
	boolean_t new_snapshot = FALSE, force_new_snapshot = FALSE, killed = FALSE, freed_mem = FALSE;
	unsigned int i = 0;
	uint32_t aPid_ep;
	int32_t         local_max_kill_prio = JETSAM_PRIORITY_IDLE;
	uint64_t footprint_of_killed_proc = 0;

#ifndef CONFIG_FREEZE
#pragma unused(any)
#endif

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
	    MEMORYSTATUS_LOG_AVAILABLE_PAGES, 0, 0, 0, 0);


#if CONFIG_JETSAM
	if (sort_flag == TRUE) {
		(void)memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT);
	}

	local_max_kill_prio = max_kill_priority;

#if VM_PRESSURE_EVENTS
	/* Sustained-pressure kills use their own upper band. */
	if (cause == kMemorystatusKilledSustainedPressure) {
		local_max_kill_prio = memorystatus_sustained_pressure_maximum_band;
	}
#endif /* VM_PRESSURE_EVENTS */

	force_new_snapshot = FALSE;

#else /* CONFIG_JETSAM */

	if (sort_flag == TRUE) {
		(void)memorystatus_sort_bucket(JETSAM_PRIORITY_IDLE, JETSAM_SORT_DEFAULT);
	}

	/*
	 * On macos, we currently only have 2 reasons to be here:
	 *
	 * kMemorystatusKilledZoneMapExhaustion
	 * AND
	 * kMemorystatusKilledVMCompressorSpaceShortage
	 *
	 * If we are here because of kMemorystatusKilledZoneMapExhaustion, we will consider
	 * any and all processes as eligible kill candidates since we need to avoid a panic.
	 *
	 * Since this function can be called async. it is harder to toggle the max_kill_priority
	 * value before and after a call. And so we use this local variable to set the upper band
	 * on the eligible kill bands.
	 */
	if (cause == kMemorystatusKilledZoneMapExhaustion) {
		local_max_kill_prio = JETSAM_PRIORITY_MAX;
	} else {
		local_max_kill_prio = max_kill_priority;
	}

	/*
	 * And, because we are here under extreme circumstances, we force a snapshot even for
	 * IDLE kills.
	 */
	force_new_snapshot = TRUE;

#endif /* CONFIG_JETSAM */

	/*
	 * A jetsam thread flagged limit_to_low_bands never kills above
	 * JETSAM_PRIORITY_MAIL, except for zone-map exhaustion.
	 */
	if (cause != kMemorystatusKilledZoneMapExhaustion &&
	    jetsam_current_thread() != NULL &&
	    jetsam_current_thread()->limit_to_low_bands &&
	    local_max_kill_prio > JETSAM_PRIORITY_MAIL) {
		local_max_kill_prio = JETSAM_PRIORITY_MAIL;
	}

	proc_list_lock();

	next_p = memorystatus_get_first_proc_locked(&i, TRUE);
	while (next_p && (next_p->p_memstat_effectivepriority <= local_max_kill_prio)) {
		p = next_p;
		next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);


		aPid = proc_getpid(p);
		aPid_ep = p->p_memstat_effectivepriority;

		/* Skip processes that already failed, are already being terminated, or are marked to be skipped. */
		if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP)) {
			continue;   /* with lock held */
		}

		if (cause == kMemorystatusKilledVnodes) {
			/*
			 * If the system runs out of vnodes, we systematically jetsam
			 * processes in hopes of stumbling onto a vnode gain that helps
			 * the system recover.  The process that happens to trigger
			 * this path has no known relationship to the vnode shortage.
			 * Deadlock avoidance: attempt to safeguard the caller.
			 */

			if (p == current_proc()) {
				/* do not jetsam the current process */
				continue;
			}
		}

#if CONFIG_FREEZE
		/* A locked process is only eligible when the caller asked for 'any'. */
		boolean_t skip;
		boolean_t reclaim_proc = !(p->p_memstat_state & P_MEMSTAT_LOCKED);
		if (any || reclaim_proc) {
			skip = FALSE;
		} else {
			skip = TRUE;
		}

		if (skip) {
			continue;
		} else
#endif
		{
			/* Without CONFIG_FREEZE this brace block is unconditional; with it, it is the 'else' arm above. */
			if (proc_ref(p, true) == p) {
				/*
				 * Mark as terminated so that if exit1() indicates success, but the process (for example)
				 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
				 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
				 * acquisition of the proc lock.
				 */
				p->p_memstat_state |= P_MEMSTAT_TERMINATED;
			} else {
				/*
				 * We need to restart the search again because
				 * proc_ref _can_ drop the proc_list lock
				 * and we could have lost our stored next_p via
				 * an exit() on another core.
				 */
				i = 0;
				next_p = memorystatus_get_first_proc_locked(&i, TRUE);
				continue;
			}

			/*
			 * Capture a snapshot if none exists and:
			 * - we are forcing a new snapshot creation, either because:
			 *      - on a particular platform we need these snapshots every time, OR
			 *	- a boot-arg/embedded device tree property has been set.
			 * - priority was not requested (this is something other than an ambient kill)
			 * - the priority was requested *and* the targeted process is not at idle priority
			 */
			if ((memorystatus_jetsam_snapshot_count == 0) &&
			    (force_new_snapshot || memorystatus_idle_snapshot || ((!priority) || (priority && (aPid_ep != JETSAM_PRIORITY_IDLE))))) {
				memorystatus_init_jetsam_snapshot_locked(NULL, 0);
				new_snapshot = TRUE;
			}

			/* The kill itself runs without the proc list lock; the reference taken above keeps p usable. */
			proc_list_unlock();

			freed_mem = memorystatus_kill_proc(p, cause, jetsam_reason, &killed, &footprint_of_killed_proc); /* purged and/or killed 'p' */
			/* Success? */
			if (freed_mem) {
				if (killed) {
					*memory_reclaimed = footprint_of_killed_proc;
					if (priority) {
						*priority = aPid_ep;
					}
				} else {
					/* purged */
					/* The process survives, so undo the TERMINATED mark set above. */
					proc_list_lock();
					p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
					proc_list_unlock();
				}
				proc_rele(p);
				/* Leave with the proc list lock already dropped. */
				goto exit;
			}

			/*
			 * Failure - first unwind the state,
			 * then fall through to restart the search.
			 */
			proc_list_lock();
			proc_rele(p);
			p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
			p->p_memstat_state |= P_MEMSTAT_ERROR;
			*errors += 1;

			i = 0;
			next_p = memorystatus_get_first_proc_locked(&i, TRUE);
		}
	}

	/* No (further) candidate within the allowed bands. */
	proc_list_unlock();

exit:
	/* Release the exit reason handed in by the caller. */
	os_reason_free(jetsam_reason);

	if (!killed) {
		*memory_reclaimed = 0;

		/* Clear snapshot if freshly captured and no target was found */
		if (new_snapshot) {
			proc_list_lock();
			memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
			proc_list_unlock();
		}
	}

	/* aPid is only read here when 'killed' is TRUE, i.e. after the loop assigned it. */
	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
	    MEMORYSTATUS_LOG_AVAILABLE_PAGES, killed ? aPid : 0, killed, *memory_reclaimed, 0);

	return killed;
}
5882
5883 /*
5884 * Jetsam aggressively
5885 */
5886 static boolean_t
memorystatus_kill_processes_aggressive(uint32_t cause,int aggr_count,int32_t priority_max,int max_kills,uint32_t * errors,uint64_t * memory_reclaimed)5887 memorystatus_kill_processes_aggressive(uint32_t cause, int aggr_count,
5888 int32_t priority_max, int max_kills, uint32_t *errors, uint64_t *memory_reclaimed)
5889 {
5890 pid_t aPid;
5891 proc_t p = PROC_NULL, next_p = PROC_NULL;
5892 boolean_t new_snapshot = FALSE, killed = FALSE;
5893 int kill_count = 0;
5894 unsigned int i = 0;
5895 int32_t aPid_ep = 0;
5896 unsigned int memorystatus_level_snapshot = 0;
5897 uint64_t killtime = 0;
5898 clock_sec_t tv_sec;
5899 clock_usec_t tv_usec;
5900 uint32_t tv_msec;
5901 os_reason_t jetsam_reason = OS_REASON_NULL;
5902 uint64_t footprint_of_killed_proc = 0;
5903
5904 *memory_reclaimed = 0;
5905
5906 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
5907 MEMORYSTATUS_LOG_AVAILABLE_PAGES, priority_max, 0, 0, 0);
5908
5909 if (priority_max >= JETSAM_PRIORITY_FOREGROUND) {
5910 /*
5911 * Check if aggressive jetsam has been asked to kill upto or beyond the
5912 * JETSAM_PRIORITY_FOREGROUND bucket. If yes, sort the FG band based on
5913 * coalition footprint.
5914 */
5915 memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT);
5916 }
5917
5918 jetsam_reason = os_reason_create(OS_REASON_JETSAM, cause);
5919 if (jetsam_reason == OS_REASON_NULL) {
5920 printf("memorystatus_kill_processes_aggressive: failed to allocate exit reason\n");
5921 }
5922 os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: aggressively killing up to %d processes below band %d.", max_kills, priority_max + 1);
5923 proc_list_lock();
5924
5925 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
5926 while (next_p) {
5927 if (proc_list_exited(next_p) ||
5928 ((unsigned int)(next_p->p_memstat_effectivepriority) != i)) {
5929 /*
5930 * We have raced with next_p running on another core.
5931 * It may be exiting or it may have moved to a different
5932 * jetsam priority band. This means we have lost our
5933 * place in line while traversing the jetsam list. We
5934 * attempt to recover by rewinding to the beginning of the band
5935 * we were already traversing. By doing this, we do not guarantee
5936 * that no process escapes this aggressive march, but we can make
5937 * skipping an entire range of processes less likely. (PR-21069019)
5938 */
5939
5940 MEMORYSTATUS_DEBUG(1, "memorystatus: aggressive%d: rewinding band %d, %s(%d) moved or exiting.\n",
5941 aggr_count, i, (*next_p->p_name ? next_p->p_name : "unknown"), proc_getpid(next_p));
5942
5943 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
5944 continue;
5945 }
5946
5947 p = next_p;
5948 next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
5949
5950 if (p->p_memstat_effectivepriority > priority_max) {
5951 /*
5952 * Bail out of this killing spree if we have
5953 * reached beyond the priority_max jetsam band.
5954 * That is, we kill up to and through the
5955 * priority_max jetsam band.
5956 */
5957 proc_list_unlock();
5958 goto exit;
5959 }
5960
5961 aPid = proc_getpid(p);
5962 aPid_ep = p->p_memstat_effectivepriority;
5963
5964 if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP)) {
5965 continue;
5966 }
5967
5968 /*
5969 * Capture a snapshot if none exists.
5970 */
5971 if (memorystatus_jetsam_snapshot_count == 0) {
5972 memorystatus_init_jetsam_snapshot_locked(NULL, 0);
5973 new_snapshot = TRUE;
5974 }
5975
5976 /*
5977 * Mark as terminated so that if exit1() indicates success, but the process (for example)
5978 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
5979 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
5980 * acquisition of the proc lock.
5981 */
5982 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
5983
5984 killtime = mach_absolute_time();
5985 absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
5986 tv_msec = tv_usec / 1000;
5987
5988 /* Shift queue, update stats */
5989 memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
5990
5991 /*
5992 * In order to kill the target process, we will drop the proc_list_lock.
5993 * To guaranteee that p and next_p don't disappear out from under the lock,
5994 * we must take a ref on both.
5995 * If we cannot get a reference, then it's likely we've raced with
5996 * that process exiting on another core.
5997 */
5998 if (proc_ref(p, true) == p) {
5999 if (next_p) {
6000 while (next_p && (proc_ref(next_p, true) != next_p)) {
6001 proc_t temp_p;
6002
6003 /*
6004 * We must have raced with next_p exiting on another core.
6005 * Recover by getting the next eligible process in the band.
6006 */
6007
6008 MEMORYSTATUS_DEBUG(1, "memorystatus: aggressive%d: skipping %d [%s] (exiting?)\n",
6009 aggr_count, proc_getpid(next_p), (*next_p->p_name ? next_p->p_name : "(unknown)"));
6010
6011 temp_p = next_p;
6012 next_p = memorystatus_get_next_proc_locked(&i, temp_p, TRUE);
6013 }
6014 }
6015 proc_list_unlock();
6016
6017 printf("%lu.%03d memorystatus: %s%d pid %d [%s] (%s %d) - memorystatus_available_pages: %llu\n",
6018 (unsigned long)tv_sec, tv_msec,
6019 ((aPid_ep == JETSAM_PRIORITY_IDLE) ? "killing_idle_process_aggressive" : "killing_top_process_aggressive"),
6020 aggr_count, aPid, (*p->p_name ? p->p_name : "unknown"),
6021 memorystatus_kill_cause_name[cause], aPid_ep, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
6022
6023 memorystatus_level_snapshot = memorystatus_level;
6024
6025 /*
6026 * memorystatus_do_kill() drops a reference, so take another one so we can
6027 * continue to use this exit reason even after memorystatus_do_kill()
6028 * returns.
6029 */
6030 os_reason_ref(jetsam_reason);
6031 killed = memorystatus_do_kill(p, cause, jetsam_reason, &footprint_of_killed_proc);
6032
6033 /* Success? */
6034 if (killed) {
6035 *memory_reclaimed += footprint_of_killed_proc;
6036 proc_rele(p);
6037 kill_count++;
6038 p = NULL;
6039 killed = FALSE;
6040
6041 /*
6042 * Continue the killing spree.
6043 */
6044 proc_list_lock();
6045 if (next_p) {
6046 proc_rele(next_p);
6047 }
6048
6049 if (kill_count == max_kills) {
6050 os_log_with_startup_serial(OS_LOG_DEFAULT,
6051 "memorystatus: giving up aggressive kill after killing %d processes below band %d.", max_kills, priority_max + 1);
6052 break;
6053 }
6054
6055 if (aPid_ep == JETSAM_PRIORITY_FOREGROUND && memorystatus_aggressive_jetsam_lenient == TRUE) {
6056 if (memorystatus_level > memorystatus_level_snapshot && ((memorystatus_level - memorystatus_level_snapshot) >= AGGRESSIVE_JETSAM_LENIENT_MODE_THRESHOLD)) {
6057 #if DEVELOPMENT || DEBUG
6058 printf("Disabling Lenient mode after one-time deployment.\n");
6059 #endif /* DEVELOPMENT || DEBUG */
6060 memorystatus_aggressive_jetsam_lenient = FALSE;
6061 break;
6062 }
6063 }
6064
6065 continue;
6066 }
6067
6068 /*
6069 * Failure - first unwind the state,
6070 * then fall through to restart the search.
6071 */
6072 proc_list_lock();
6073 proc_rele(p);
6074 if (next_p) {
6075 proc_rele(next_p);
6076 }
6077 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
6078 p->p_memstat_state |= P_MEMSTAT_ERROR;
6079 *errors += 1;
6080 p = NULL;
6081 }
6082
6083 /*
6084 * Failure - restart the search at the beginning of
6085 * the band we were already traversing.
6086 *
6087 * We might have raced with "p" exiting on another core, resulting in no
6088 * ref on "p". Or, we may have failed to kill "p".
6089 *
6090 * Either way, we fall thru to here, leaving the proc in the
6091 * P_MEMSTAT_TERMINATED or P_MEMSTAT_ERROR state.
6092 *
6093 * And, we hold the proc_list_lock at this point.
6094 */
6095
6096 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
6097 }
6098
6099 proc_list_unlock();
6100
6101 exit:
6102 os_reason_free(jetsam_reason);
6103
6104 /* Clear snapshot if freshly captured and no target was found */
6105 if (new_snapshot && (kill_count == 0)) {
6106 proc_list_lock();
6107 memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
6108 proc_list_unlock();
6109 }
6110
6111 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
6112 MEMORYSTATUS_LOG_AVAILABLE_PAGES, 0, kill_count, *memory_reclaimed, 0);
6113
6114 if (kill_count > 0) {
6115 return TRUE;
6116 } else {
6117 return FALSE;
6118 }
6119 }
6120
6121 static boolean_t
memorystatus_kill_hiwat_proc(uint32_t * errors,boolean_t * purged,uint64_t * memory_reclaimed)6122 memorystatus_kill_hiwat_proc(uint32_t *errors, boolean_t *purged, uint64_t *memory_reclaimed)
6123 {
6124 pid_t aPid = 0;
6125 proc_t p = PROC_NULL, next_p = PROC_NULL;
6126 boolean_t new_snapshot = FALSE, killed = FALSE, freed_mem = FALSE;
6127 unsigned int i = 0;
6128 uint32_t aPid_ep;
6129 os_reason_t jetsam_reason = OS_REASON_NULL;
6130 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_START,
6131 MEMORYSTATUS_LOG_AVAILABLE_PAGES, 0, 0, 0, 0);
6132
6133 jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_HIGHWATER);
6134 if (jetsam_reason == OS_REASON_NULL) {
6135 printf("memorystatus_kill_hiwat_proc: failed to allocate exit reason\n");
6136 }
6137
6138 proc_list_lock();
6139
6140 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
6141 while (next_p) {
6142 uint64_t footprint_in_bytes = 0;
6143 uint64_t memlimit_in_bytes = 0;
6144 boolean_t skip = 0;
6145
6146 p = next_p;
6147 next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
6148
6149 aPid = proc_getpid(p);
6150 aPid_ep = p->p_memstat_effectivepriority;
6151
6152 if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP)) {
6153 continue;
6154 }
6155
6156 /* skip if no limit set */
6157 if (p->p_memstat_memlimit <= 0) {
6158 continue;
6159 }
6160
6161 footprint_in_bytes = get_task_phys_footprint(p->task);
6162 memlimit_in_bytes = (((uint64_t)p->p_memstat_memlimit) * 1024ULL * 1024ULL); /* convert MB to bytes */
6163 skip = (footprint_in_bytes <= memlimit_in_bytes);
6164
6165 #if CONFIG_FREEZE
6166 if (!skip) {
6167 if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
6168 skip = TRUE;
6169 } else {
6170 skip = FALSE;
6171 }
6172 }
6173 #endif
6174
6175 if (skip) {
6176 continue;
6177 } else {
6178 if (memorystatus_jetsam_snapshot_count == 0) {
6179 memorystatus_init_jetsam_snapshot_locked(NULL, 0);
6180 new_snapshot = TRUE;
6181 }
6182
6183 if (proc_ref(p, true) == p) {
6184 /*
6185 * Mark as terminated so that if exit1() indicates success, but the process (for example)
6186 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
6187 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
6188 * acquisition of the proc lock.
6189 */
6190 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
6191
6192 proc_list_unlock();
6193 } else {
6194 /*
6195 * We need to restart the search again because
6196 * proc_ref _can_ drop the proc_list lock
6197 * and we could have lost our stored next_p via
6198 * an exit() on another core.
6199 */
6200 i = 0;
6201 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
6202 continue;
6203 }
6204
6205 footprint_in_bytes = 0;
6206 freed_mem = memorystatus_kill_proc(p, kMemorystatusKilledHiwat, jetsam_reason, &killed, &footprint_in_bytes); /* purged and/or killed 'p' */
6207
6208 /* Success? */
6209 if (freed_mem) {
6210 if (killed == FALSE) {
6211 /* purged 'p'..don't reset HWM candidate count */
6212 *purged = TRUE;
6213
6214 proc_list_lock();
6215 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
6216 proc_list_unlock();
6217 } else {
6218 *memory_reclaimed = footprint_in_bytes;
6219 }
6220 proc_rele(p);
6221 goto exit;
6222 }
6223 /*
6224 * Failure - first unwind the state,
6225 * then fall through to restart the search.
6226 */
6227 proc_list_lock();
6228 proc_rele(p);
6229 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
6230 p->p_memstat_state |= P_MEMSTAT_ERROR;
6231 *errors += 1;
6232
6233 i = 0;
6234 next_p = memorystatus_get_first_proc_locked(&i, TRUE);
6235 }
6236 }
6237
6238 proc_list_unlock();
6239
6240 exit:
6241 os_reason_free(jetsam_reason);
6242
6243 if (!killed) {
6244 *memory_reclaimed = 0;
6245
6246 /* Clear snapshot if freshly captured and no target was found */
6247 if (new_snapshot) {
6248 proc_list_lock();
6249 memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
6250 proc_list_unlock();
6251 }
6252 }
6253
6254 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_END,
6255 MEMORYSTATUS_LOG_AVAILABLE_PAGES, killed ? aPid : 0, killed, *memory_reclaimed, 0);
6256
6257 return killed;
6258 }
6259
6260 /*
6261 * Jetsam a process pinned in the elevated band.
6262 *
6263 * Return: true -- a pinned process was jetsammed
6264 * false -- no pinned process was jetsammed
6265 */
6266 boolean_t
memorystatus_kill_elevated_process(uint32_t cause,os_reason_t jetsam_reason,unsigned int band,int aggr_count,uint32_t * errors,uint64_t * memory_reclaimed)6267 memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, unsigned int band, int aggr_count, uint32_t *errors, uint64_t *memory_reclaimed)
6268 {
6269 pid_t aPid = 0;
6270 proc_t p = PROC_NULL, next_p = PROC_NULL;
6271 boolean_t new_snapshot = FALSE, killed = FALSE;
6272 int kill_count = 0;
6273 uint32_t aPid_ep;
6274 uint64_t killtime = 0;
6275 clock_sec_t tv_sec;
6276 clock_usec_t tv_usec;
6277 uint32_t tv_msec;
6278 uint64_t footprint_of_killed_proc = 0;
6279
6280
6281 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
6282 MEMORYSTATUS_LOG_AVAILABLE_PAGES, 0, 0, 0, 0);
6283
6284 #if CONFIG_FREEZE
6285 boolean_t consider_frozen_only = FALSE;
6286
6287 if (band == (unsigned int) memorystatus_freeze_jetsam_band) {
6288 consider_frozen_only = TRUE;
6289 }
6290 #endif /* CONFIG_FREEZE */
6291
6292 proc_list_lock();
6293
6294 next_p = memorystatus_get_first_proc_locked(&band, FALSE);
6295 while (next_p) {
6296 p = next_p;
6297 next_p = memorystatus_get_next_proc_locked(&band, p, FALSE);
6298
6299 aPid = proc_getpid(p);
6300 aPid_ep = p->p_memstat_effectivepriority;
6301
6302 /*
6303 * Only pick a process pinned in this elevated band
6304 */
6305 if (!(p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) {
6306 continue;
6307 }
6308
6309 if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP)) {
6310 continue;
6311 }
6312
6313 #if CONFIG_FREEZE
6314 if (consider_frozen_only && !(p->p_memstat_state & P_MEMSTAT_FROZEN)) {
6315 continue;
6316 }
6317
6318 if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
6319 continue;
6320 }
6321 #endif /* CONFIG_FREEZE */
6322
6323 #if DEVELOPMENT || DEBUG
6324 MEMORYSTATUS_DEBUG(1, "jetsam: elevated%d process pid %d [%s] - memorystatus_available_pages: %d\n",
6325 aggr_count,
6326 aPid, (*p->p_name ? p->p_name : "unknown"),
6327 MEMORYSTATUS_LOG_AVAILABLE_PAGES);
6328 #endif /* DEVELOPMENT || DEBUG */
6329
6330 if (memorystatus_jetsam_snapshot_count == 0) {
6331 memorystatus_init_jetsam_snapshot_locked(NULL, 0);
6332 new_snapshot = TRUE;
6333 }
6334
6335 p->p_memstat_state |= P_MEMSTAT_TERMINATED;
6336
6337 killtime = mach_absolute_time();
6338 absolutetime_to_microtime(killtime, &tv_sec, &tv_usec);
6339 tv_msec = tv_usec / 1000;
6340
6341 memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
6342
6343 if (proc_ref(p, true) == p) {
6344 proc_list_unlock();
6345
6346 /*
6347 * memorystatus_do_kill drops a reference, so take another one so we can
6348 * continue to use this exit reason even after memorystatus_do_kill()
6349 * returns
6350 */
6351 os_reason_ref(jetsam_reason);
6352 killed = memorystatus_do_kill(p, cause, jetsam_reason, &footprint_of_killed_proc);
6353
6354 os_log_with_startup_serial(OS_LOG_DEFAULT, "%lu.%03d memorystatus: killing_top_process_elevated%d pid %d [%s] (%s %d) %lluKB - memorystatus_available_pages: %llu\n",
6355 (unsigned long)tv_sec, tv_msec,
6356 aggr_count,
6357 aPid, ((p && *p->p_name) ? p->p_name : "unknown"),
6358 memorystatus_kill_cause_name[cause], aPid_ep,
6359 footprint_of_killed_proc >> 10, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES);
6360
6361 /* Success? */
6362 if (killed) {
6363 *memory_reclaimed = footprint_of_killed_proc;
6364 proc_rele(p);
6365 kill_count++;
6366 goto exit;
6367 }
6368
6369 /*
6370 * Failure - first unwind the state,
6371 * then fall through to restart the search.
6372 */
6373 proc_list_lock();
6374 proc_rele(p);
6375 p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
6376 p->p_memstat_state |= P_MEMSTAT_ERROR;
6377 *errors += 1;
6378 }
6379
6380 /*
6381 * Failure - restart the search.
6382 *
6383 * We might have raced with "p" exiting on another core, resulting in no
6384 * ref on "p". Or, we may have failed to kill "p".
6385 *
6386 * Either way, we fall thru to here, leaving the proc in the
6387 * P_MEMSTAT_TERMINATED state or P_MEMSTAT_ERROR state.
6388 *
6389 * And, we hold the the proc_list_lock at this point.
6390 */
6391
6392 next_p = memorystatus_get_first_proc_locked(&band, FALSE);
6393 }
6394
6395 proc_list_unlock();
6396
6397 exit:
6398 os_reason_free(jetsam_reason);
6399
6400 if (kill_count == 0) {
6401 *memory_reclaimed = 0;
6402
6403 /* Clear snapshot if freshly captured and no target was found */
6404 if (new_snapshot) {
6405 proc_list_lock();
6406 memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
6407 proc_list_unlock();
6408 }
6409 }
6410
6411 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
6412 MEMORYSTATUS_LOG_AVAILABLE_PAGES, killed ? aPid : 0, kill_count, *memory_reclaimed, 0);
6413
6414 return killed;
6415 }
6416
6417 static boolean_t
memorystatus_kill_process_async(pid_t victim_pid,uint32_t cause)6418 memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause)
6419 {
6420 /*
6421 * TODO: allow a general async path
6422 *
6423 * NOTE: If a new async kill cause is added, make sure to update memorystatus_thread() to
6424 * add the appropriate exit reason code mapping.
6425 */
6426 if ((victim_pid != -1) ||
6427 (cause != kMemorystatusKilledVMPageShortage &&
6428 cause != kMemorystatusKilledVMCompressorThrashing &&
6429 cause != kMemorystatusKilledVMCompressorSpaceShortage &&
6430 cause != kMemorystatusKilledFCThrashing &&
6431 cause != kMemorystatusKilledZoneMapExhaustion &&
6432 cause != kMemorystatusKilledSustainedPressure)) {
6433 return FALSE;
6434 }
6435
6436 kill_under_pressure_cause = cause;
6437 memorystatus_thread_wake();
6438 return TRUE;
6439 }
6440
6441 boolean_t
memorystatus_kill_on_VM_compressor_space_shortage(boolean_t async)6442 memorystatus_kill_on_VM_compressor_space_shortage(boolean_t async)
6443 {
6444 if (async) {
6445 return memorystatus_kill_process_async(-1, kMemorystatusKilledVMCompressorSpaceShortage);
6446 } else {
6447 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE);
6448 if (jetsam_reason == OS_REASON_NULL) {
6449 printf("memorystatus_kill_on_VM_compressor_space_shortage -- sync: failed to allocate jetsam reason\n");
6450 }
6451
6452 return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMCompressorSpaceShortage, jetsam_reason);
6453 }
6454 }
6455
6456 #if CONFIG_JETSAM
6457 boolean_t
memorystatus_kill_on_VM_compressor_thrashing(boolean_t async)6458 memorystatus_kill_on_VM_compressor_thrashing(boolean_t async)
6459 {
6460 if (async) {
6461 return memorystatus_kill_process_async(-1, kMemorystatusKilledVMCompressorThrashing);
6462 } else {
6463 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING);
6464 if (jetsam_reason == OS_REASON_NULL) {
6465 printf("memorystatus_kill_on_VM_compressor_thrashing -- sync: failed to allocate jetsam reason\n");
6466 }
6467
6468 return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMCompressorThrashing, jetsam_reason);
6469 }
6470 }
6471
6472 boolean_t
memorystatus_kill_on_VM_page_shortage(boolean_t async)6473 memorystatus_kill_on_VM_page_shortage(boolean_t async)
6474 {
6475 if (async) {
6476 return memorystatus_kill_process_async(-1, kMemorystatusKilledVMPageShortage);
6477 } else {
6478 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMPAGESHORTAGE);
6479 if (jetsam_reason == OS_REASON_NULL) {
6480 printf("memorystatus_kill_on_VM_page_shortage -- sync: failed to allocate jetsam reason\n");
6481 }
6482
6483 return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMPageShortage, jetsam_reason);
6484 }
6485 }
6486
6487 boolean_t
memorystatus_kill_on_FC_thrashing(boolean_t async)6488 memorystatus_kill_on_FC_thrashing(boolean_t async)
6489 {
6490 if (async) {
6491 return memorystatus_kill_process_async(-1, kMemorystatusKilledFCThrashing);
6492 } else {
6493 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_FCTHRASHING);
6494 if (jetsam_reason == OS_REASON_NULL) {
6495 printf("memorystatus_kill_on_FC_thrashing -- sync: failed to allocate jetsam reason\n");
6496 }
6497
6498 return memorystatus_kill_process_sync(-1, kMemorystatusKilledFCThrashing, jetsam_reason);
6499 }
6500 }
6501
6502 boolean_t
memorystatus_kill_on_vnode_limit(void)6503 memorystatus_kill_on_vnode_limit(void)
6504 {
6505 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_VNODE);
6506 if (jetsam_reason == OS_REASON_NULL) {
6507 printf("memorystatus_kill_on_vnode_limit: failed to allocate jetsam reason\n");
6508 }
6509
6510 return memorystatus_kill_process_sync(-1, kMemorystatusKilledVnodes, jetsam_reason);
6511 }
6512
6513 boolean_t
memorystatus_kill_on_sustained_pressure(boolean_t async)6514 memorystatus_kill_on_sustained_pressure(boolean_t async)
6515 {
6516 if (async) {
6517 return memorystatus_kill_process_async(-1, kMemorystatusKilledSustainedPressure);
6518 } else {
6519 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_SUSTAINED_PRESSURE);
6520 if (jetsam_reason == OS_REASON_NULL) {
6521 printf("memorystatus_kill_on_FC_thrashing -- sync: failed to allocate jetsam reason\n");
6522 }
6523
6524 return memorystatus_kill_process_sync(-1, kMemorystatusKilledSustainedPressure, jetsam_reason);
6525 }
6526 }
6527
6528 boolean_t
memorystatus_kill_with_jetsam_reason_sync(pid_t pid,os_reason_t jetsam_reason)6529 memorystatus_kill_with_jetsam_reason_sync(pid_t pid, os_reason_t jetsam_reason)
6530 {
6531 return memorystatus_kill_process_sync(pid, kMemorystatusKilled, jetsam_reason);
6532 }
6533
6534 #endif /* CONFIG_JETSAM */
6535
6536 boolean_t
memorystatus_kill_on_zone_map_exhaustion(pid_t pid)6537 memorystatus_kill_on_zone_map_exhaustion(pid_t pid)
6538 {
6539 boolean_t res = FALSE;
6540 if (pid == -1) {
6541 res = memorystatus_kill_process_async(-1, kMemorystatusKilledZoneMapExhaustion);
6542 } else {
6543 os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_ZONE_MAP_EXHAUSTION);
6544 if (jetsam_reason == OS_REASON_NULL) {
6545 printf("memorystatus_kill_on_zone_map_exhaustion: failed to allocate jetsam reason\n");
6546 }
6547
6548 res = memorystatus_kill_process_sync(pid, kMemorystatusKilledZoneMapExhaustion, jetsam_reason);
6549 }
6550 return res;
6551 }
6552
/*
 * Notification hook that intentionally does nothing.
 * NOTE(review): presumably invoked when a pageout scan completes - confirm
 * against the callers.
 */
void
memorystatus_on_pageout_scan_end(void)
{
	/* No-op */
	return;
}
6558
6559 /* Return both allocated and actual size, since there's a race between allocation and list compilation */
6560 static int
memorystatus_get_priority_list(memorystatus_priority_entry_t ** list_ptr,size_t * buffer_size,size_t * list_size,boolean_t size_only)6561 memorystatus_get_priority_list(memorystatus_priority_entry_t **list_ptr, size_t *buffer_size, size_t *list_size, boolean_t size_only)
6562 {
6563 uint32_t list_count, i = 0;
6564 memorystatus_priority_entry_t *list_entry;
6565 proc_t p;
6566
6567 list_count = memorystatus_list_count;
6568 *list_size = sizeof(memorystatus_priority_entry_t) * list_count;
6569
6570 /* Just a size check? */
6571 if (size_only) {
6572 return 0;
6573 }
6574
6575 /* Otherwise, validate the size of the buffer */
6576 if (*buffer_size < *list_size) {
6577 return EINVAL;
6578 }
6579
6580 *list_ptr = kalloc_data(*list_size, Z_WAITOK | Z_ZERO);
6581 if (!*list_ptr) {
6582 return ENOMEM;
6583 }
6584
6585 *buffer_size = *list_size;
6586 *list_size = 0;
6587
6588 list_entry = *list_ptr;
6589
6590 proc_list_lock();
6591
6592 p = memorystatus_get_first_proc_locked(&i, TRUE);
6593 while (p && (*list_size < *buffer_size)) {
6594 list_entry->pid = proc_getpid(p);
6595 list_entry->priority = p->p_memstat_effectivepriority;
6596 list_entry->user_data = p->p_memstat_userdata;
6597
6598 if (p->p_memstat_memlimit <= 0) {
6599 task_get_phys_footprint_limit(p->task, &list_entry->limit);
6600 } else {
6601 list_entry->limit = p->p_memstat_memlimit;
6602 }
6603
6604 list_entry->state = memorystatus_build_state(p);
6605 list_entry++;
6606
6607 *list_size += sizeof(memorystatus_priority_entry_t);
6608
6609 p = memorystatus_get_next_proc_locked(&i, p, TRUE);
6610 }
6611
6612 proc_list_unlock();
6613
6614 MEMORYSTATUS_DEBUG(1, "memorystatus_get_priority_list: returning %lu for size\n", (unsigned long)*list_size);
6615
6616 return 0;
6617 }
6618
6619 static int
memorystatus_get_priority_pid(pid_t pid,user_addr_t buffer,size_t buffer_size)6620 memorystatus_get_priority_pid(pid_t pid, user_addr_t buffer, size_t buffer_size)
6621 {
6622 int error = 0;
6623 memorystatus_priority_entry_t mp_entry;
6624 kern_return_t ret;
6625
6626 /* Validate inputs */
6627 if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_priority_entry_t))) {
6628 return EINVAL;
6629 }
6630
6631 proc_t p = proc_find(pid);
6632 if (!p) {
6633 return ESRCH;
6634 }
6635
6636 memset(&mp_entry, 0, sizeof(memorystatus_priority_entry_t));
6637
6638 mp_entry.pid = proc_getpid(p);
6639 mp_entry.priority = p->p_memstat_effectivepriority;
6640 mp_entry.user_data = p->p_memstat_userdata;
6641 if (p->p_memstat_memlimit <= 0) {
6642 ret = task_get_phys_footprint_limit(p->task, &mp_entry.limit);
6643 if (ret != KERN_SUCCESS) {
6644 proc_rele(p);
6645 return EINVAL;
6646 }
6647 } else {
6648 mp_entry.limit = p->p_memstat_memlimit;
6649 }
6650 mp_entry.state = memorystatus_build_state(p);
6651
6652 proc_rele(p);
6653
6654 error = copyout(&mp_entry, buffer, buffer_size);
6655
6656 return error;
6657 }
6658
6659 static int
memorystatus_cmd_get_priority_list(pid_t pid,user_addr_t buffer,size_t buffer_size,int32_t * retval)6660 memorystatus_cmd_get_priority_list(pid_t pid, user_addr_t buffer, size_t buffer_size, int32_t *retval)
6661 {
6662 int error = 0;
6663 boolean_t size_only;
6664 size_t list_size;
6665
6666 /*
6667 * When a non-zero pid is provided, the 'list' has only one entry.
6668 */
6669
6670 size_only = ((buffer == USER_ADDR_NULL) ? TRUE: FALSE);
6671
6672 if (pid != 0) {
6673 list_size = sizeof(memorystatus_priority_entry_t) * 1;
6674 if (!size_only) {
6675 error = memorystatus_get_priority_pid(pid, buffer, buffer_size);
6676 }
6677 } else {
6678 memorystatus_priority_entry_t *list = NULL;
6679 error = memorystatus_get_priority_list(&list, &buffer_size, &list_size, size_only);
6680
6681 if (error == 0) {
6682 if (!size_only) {
6683 error = copyout(list, buffer, list_size);
6684 }
6685
6686 kfree_data(list, buffer_size);
6687 }
6688 }
6689
6690 if (error == 0) {
6691 assert(list_size <= INT32_MAX);
6692 *retval = (int32_t) list_size;
6693 }
6694
6695 return error;
6696 }
6697
6698 static void
memorystatus_clear_errors(void)6699 memorystatus_clear_errors(void)
6700 {
6701 proc_t p;
6702 unsigned int i = 0;
6703
6704 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_START, 0, 0, 0, 0, 0);
6705
6706 proc_list_lock();
6707
6708 p = memorystatus_get_first_proc_locked(&i, TRUE);
6709 while (p) {
6710 if (p->p_memstat_state & P_MEMSTAT_ERROR) {
6711 p->p_memstat_state &= ~P_MEMSTAT_ERROR;
6712 }
6713 p = memorystatus_get_next_proc_locked(&i, p, TRUE);
6714 }
6715
6716 proc_list_unlock();
6717
6718 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_END, 0, 0, 0, 0, 0);
6719 }
6720
6721 #if CONFIG_JETSAM
6722 static void
memorystatus_update_levels_locked(boolean_t critical_only)6723 memorystatus_update_levels_locked(boolean_t critical_only)
6724 {
6725 memorystatus_available_pages_critical = memorystatus_available_pages_critical_base;
6726
6727 /*
6728 * If there's an entry in the first bucket, we have idle processes.
6729 */
6730
6731 memstat_bucket_t *first_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
6732 if (first_bucket->count) {
6733 memorystatus_available_pages_critical += memorystatus_available_pages_critical_idle_offset;
6734
6735 if (memorystatus_available_pages_critical > memorystatus_available_pages_pressure) {
6736 /*
6737 * The critical threshold must never exceed the pressure threshold
6738 */
6739 memorystatus_available_pages_critical = memorystatus_available_pages_pressure;
6740 }
6741 }
6742
6743 if (memorystatus_jetsam_policy & kPolicyMoreFree) {
6744 memorystatus_available_pages_critical += memorystatus_policy_more_free_offset_pages;
6745 }
6746
6747 if (critical_only) {
6748 return;
6749 }
6750
6751 #if VM_PRESSURE_EVENTS
6752 memorystatus_available_pages_pressure = (int32_t)(pressure_threshold_percentage * (atop_64(max_mem) / 100));
6753 #endif
6754 }
6755
6756 void
memorystatus_fast_jetsam_override(boolean_t enable_override)6757 memorystatus_fast_jetsam_override(boolean_t enable_override)
6758 {
6759 /* If fast jetsam is not enabled, simply return */
6760 if (!fast_jetsam_enabled) {
6761 return;
6762 }
6763
6764 if (enable_override) {
6765 if ((memorystatus_jetsam_policy & kPolicyMoreFree) == kPolicyMoreFree) {
6766 return;
6767 }
6768 proc_list_lock();
6769 memorystatus_jetsam_policy |= kPolicyMoreFree;
6770 memorystatus_thread_pool_max();
6771 memorystatus_update_levels_locked(TRUE);
6772 proc_list_unlock();
6773 } else {
6774 if ((memorystatus_jetsam_policy & kPolicyMoreFree) == 0) {
6775 return;
6776 }
6777 proc_list_lock();
6778 memorystatus_jetsam_policy &= ~kPolicyMoreFree;
6779 memorystatus_thread_pool_default();
6780 memorystatus_update_levels_locked(TRUE);
6781 proc_list_unlock();
6782 }
6783 }
6784
6785
6786 static int
6787 sysctl_kern_memorystatus_policy_more_free SYSCTL_HANDLER_ARGS
6788 {
6789 #pragma unused(arg1, arg2, oidp)
6790 int error = 0, more_free = 0;
6791
6792 /*
6793 * TODO: Enable this privilege check?
6794 *
6795 * error = priv_check_cred(kauth_cred_get(), PRIV_VM_JETSAM, 0);
6796 * if (error)
6797 * return (error);
6798 */
6799
6800 error = sysctl_handle_int(oidp, &more_free, 0, req);
6801 if (error || !req->newptr) {
6802 return error;
6803 }
6804
6805 if (more_free) {
6806 memorystatus_fast_jetsam_override(true);
6807 } else {
6808 memorystatus_fast_jetsam_override(false);
6809 }
6810
6811 return 0;
6812 }
6813 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_policy_more_free, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
6814 0, 0, &sysctl_kern_memorystatus_policy_more_free, "I", "");
6815
6816 #endif /* CONFIG_JETSAM */
6817
6818 /*
6819 * Get the at_boot snapshot
6820 */
6821 static int
memorystatus_get_at_boot_snapshot(memorystatus_jetsam_snapshot_t ** snapshot,size_t * snapshot_size,boolean_t size_only)6822 memorystatus_get_at_boot_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only)
6823 {
6824 size_t input_size = *snapshot_size;
6825
6826 /*
6827 * The at_boot snapshot has no entry list.
6828 */
6829 *snapshot_size = sizeof(memorystatus_jetsam_snapshot_t);
6830
6831 if (size_only) {
6832 return 0;
6833 }
6834
6835 /*
6836 * Validate the size of the snapshot buffer
6837 */
6838 if (input_size < *snapshot_size) {
6839 return EINVAL;
6840 }
6841
6842 /*
6843 * Update the notification_time only
6844 */
6845 memorystatus_at_boot_snapshot.notification_time = mach_absolute_time();
6846 *snapshot = &memorystatus_at_boot_snapshot;
6847
6848 MEMORYSTATUS_DEBUG(7, "memorystatus_get_at_boot_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%d)\n",
6849 (long)input_size, (long)*snapshot_size, 0);
6850 return 0;
6851 }
6852
6853 #if CONFIG_FREEZE
6854 static int
memorystatus_get_jetsam_snapshot_freezer(memorystatus_jetsam_snapshot_t ** snapshot,size_t * snapshot_size,boolean_t size_only)6855 memorystatus_get_jetsam_snapshot_freezer(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only)
6856 {
6857 size_t input_size = *snapshot_size;
6858
6859 if (memorystatus_jetsam_snapshot_freezer->entry_count > 0) {
6860 *snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_freezer->entry_count));
6861 } else {
6862 *snapshot_size = 0;
6863 }
6864 assert(*snapshot_size <= memorystatus_jetsam_snapshot_freezer_size);
6865
6866 if (size_only) {
6867 return 0;
6868 }
6869
6870 if (input_size < *snapshot_size) {
6871 return EINVAL;
6872 }
6873
6874 *snapshot = memorystatus_jetsam_snapshot_freezer;
6875
6876 MEMORYSTATUS_DEBUG(7, "memorystatus_get_jetsam_snapshot_freezer: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
6877 (long)input_size, (long)*snapshot_size, (long)memorystatus_jetsam_snapshot_freezer->entry_count);
6878
6879 return 0;
6880 }
6881 #endif /* CONFIG_FREEZE */
6882
6883 static int
memorystatus_get_on_demand_snapshot(memorystatus_jetsam_snapshot_t ** snapshot,size_t * snapshot_size,boolean_t size_only)6884 memorystatus_get_on_demand_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only)
6885 {
6886 size_t input_size = *snapshot_size;
6887 uint32_t ods_list_count = memorystatus_list_count;
6888 memorystatus_jetsam_snapshot_t *ods = NULL; /* The on_demand snapshot buffer */
6889
6890 *snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (ods_list_count));
6891
6892 if (size_only) {
6893 return 0;
6894 }
6895
6896 /*
6897 * Validate the size of the snapshot buffer.
6898 * This is inherently racey. May want to revisit
6899 * this error condition and trim the output when
6900 * it doesn't fit.
6901 */
6902 if (input_size < *snapshot_size) {
6903 return EINVAL;
6904 }
6905
6906 /*
6907 * Allocate and initialize a snapshot buffer.
6908 */
6909 ods = kalloc_data(*snapshot_size, Z_WAITOK | Z_ZERO);
6910 if (!ods) {
6911 return ENOMEM;
6912 }
6913
6914 proc_list_lock();
6915 memorystatus_init_jetsam_snapshot_locked(ods, ods_list_count);
6916 proc_list_unlock();
6917
6918 /*
6919 * Return the kernel allocated, on_demand buffer.
6920 * The caller of this routine will copy the data out
6921 * to user space and then free the kernel allocated
6922 * buffer.
6923 */
6924 *snapshot = ods;
6925
6926 MEMORYSTATUS_DEBUG(7, "memorystatus_get_on_demand_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
6927 (long)input_size, (long)*snapshot_size, (long)ods_list_count);
6928
6929 return 0;
6930 }
6931
6932 static int
memorystatus_get_jetsam_snapshot(memorystatus_jetsam_snapshot_t ** snapshot,size_t * snapshot_size,boolean_t size_only)6933 memorystatus_get_jetsam_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only)
6934 {
6935 size_t input_size = *snapshot_size;
6936
6937 if (memorystatus_jetsam_snapshot_count > 0) {
6938 *snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count));
6939 } else {
6940 *snapshot_size = 0;
6941 }
6942
6943 if (size_only) {
6944 return 0;
6945 }
6946
6947 if (input_size < *snapshot_size) {
6948 return EINVAL;
6949 }
6950
6951 *snapshot = memorystatus_jetsam_snapshot;
6952
6953 MEMORYSTATUS_DEBUG(7, "memorystatus_get_jetsam_snapshot: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
6954 (long)input_size, (long)*snapshot_size, (long)memorystatus_jetsam_snapshot_count);
6955
6956 return 0;
6957 }
6958
6959
/*
 * Handler for MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT.
 *
 * flags       - 0 selects the default jetsam snapshot; otherwise exactly one
 *               of MEMORYSTATUS_SNAPSHOT_ON_DEMAND, MEMORYSTATUS_SNAPSHOT_AT_BOOT
 *               or (CONFIG_FREEZE only) MEMORYSTATUS_FLAGS_SNAPSHOT_FREEZER.
 * buffer      - user buffer to copy the snapshot into; USER_ADDR_NULL means
 *               the caller only wants the required size.
 * buffer_size - size of the user buffer; overwritten by the selected getter
 *               with the snapshot's size.
 * retval      - on success, receives the snapshot size in bytes.
 *
 * Returns 0 on success, EINVAL for bad flags, or whatever error the selected
 * getter or copyout() reports.
 */
static int
memorystatus_cmd_get_jetsam_snapshot(int32_t flags, user_addr_t buffer, size_t buffer_size, int32_t *retval)
{
	int error = EINVAL;
	boolean_t size_only;
	boolean_t is_default_snapshot = FALSE;
	boolean_t is_on_demand_snapshot = FALSE;
	boolean_t is_at_boot_snapshot = FALSE;
#if CONFIG_FREEZE
	bool is_freezer_snapshot = false;
#endif /* CONFIG_FREEZE */
	memorystatus_jetsam_snapshot_t *snapshot;

	/* A NULL user buffer is a size query. */
	size_only = ((buffer == USER_ADDR_NULL) ? TRUE : FALSE);

	if (flags == 0) {
		/* Default */
		is_default_snapshot = TRUE;
		error = memorystatus_get_jetsam_snapshot(&snapshot, &buffer_size, size_only);
	} else {
		if (flags & ~(MEMORYSTATUS_SNAPSHOT_ON_DEMAND | MEMORYSTATUS_SNAPSHOT_AT_BOOT | MEMORYSTATUS_FLAGS_SNAPSHOT_FREEZER)) {
			/*
			 * Unsupported bit set in flag.
			 */
			return EINVAL;
		}

		/* x & (x - 1) is non-zero whenever more than one bit is set. */
		if (flags & (flags - 0x1)) {
			/*
			 * Can't have multiple flags set at the same time.
			 */
			return EINVAL;
		}

		if (flags & MEMORYSTATUS_SNAPSHOT_ON_DEMAND) {
			is_on_demand_snapshot = TRUE;
			/*
			 * When not requesting the size only, the following call will allocate
			 * an on_demand snapshot buffer, which is freed below.
			 */
			error = memorystatus_get_on_demand_snapshot(&snapshot, &buffer_size, size_only);
		} else if (flags & MEMORYSTATUS_SNAPSHOT_AT_BOOT) {
			is_at_boot_snapshot = TRUE;
			error = memorystatus_get_at_boot_snapshot(&snapshot, &buffer_size, size_only);
#if CONFIG_FREEZE
		} else if (flags & MEMORYSTATUS_FLAGS_SNAPSHOT_FREEZER) {
			is_freezer_snapshot = true;
			error = memorystatus_get_jetsam_snapshot_freezer(&snapshot, &buffer_size, size_only);
#endif /* CONFIG_FREEZE */
		} else {
			/*
			 * Invalid flag setting.
			 * (Without CONFIG_FREEZE the freezer flag also lands here.)
			 */
			return EINVAL;
		}
	}

	if (error) {
		goto out;
	}

	/*
	 * Copy the data out to user space and clear the snapshot buffer.
	 * If working with the jetsam snapshot,
	 *	clearing the buffer means, reset the count.
	 * If working with an on_demand snapshot
	 *	clearing the buffer means, free it.
	 * If working with the at_boot snapshot
	 *	there is nothing to clear or update.
	 * If working with a copy of the snapshot
	 *	there is nothing to clear or update.
	 * If working with the freezer snapshot
	 *	clearing the buffer means, reset the count.
	 */
	if (!size_only) {
		if ((error = copyout(snapshot, buffer, buffer_size)) == 0) {
			/* The opening "if" differs by configuration; the closing brace is shared. */
#if CONFIG_FREEZE
			if (is_default_snapshot || is_freezer_snapshot) {
#else
			if (is_default_snapshot) {
#endif /* CONFIG_FREEZE */
				/*
				 * The jetsam snapshot is never freed, its count is simply reset.
				 * However, we make a copy for any parties that might be interested
				 * in the previous fully populated snapshot.
				 */
				proc_list_lock();
#if DEVELOPMENT || DEBUG
				if (memorystatus_testing_pid != 0 && memorystatus_testing_pid != proc_getpid(current_proc())) {
					/* Snapshot is currently owned by someone else. Don't consume it. */
					proc_list_unlock();
					/* error is still 0 here, so the caller gets the data and the size. */
					goto out;
				}
#endif /* (DEVELOPMENT || DEBUG)*/
				if (is_default_snapshot) {
					snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
					memorystatus_jetsam_snapshot_last_timestamp = 0;
				}
#if CONFIG_FREEZE
				else if (is_freezer_snapshot) {
					memorystatus_jetsam_snapshot_freezer->entry_count = 0;
				}
#endif /* CONFIG_FREEZE */
				proc_list_unlock();
			}
		}

		if (is_on_demand_snapshot) {
			/*
			 * The on_demand snapshot is always freed,
			 * even if the copyout failed.
			 */
			kfree_data(snapshot, buffer_size);
		}
	}

out:
	if (error == 0) {
		/* The size is reported through a 32-bit return slot. */
		assert(buffer_size <= INT32_MAX);
		*retval = (int32_t) buffer_size;
	}
	return error;
}
7083
#if DEVELOPMENT || DEBUG
/*
 * Claim or release ownership of memorystatus_testing_pid on behalf of the
 * calling process.
 *
 * MEMORYSTATUS_FLAGS_SET_TESTING_PID:
 *	succeeds if nobody owns it or the caller already does; EBUSY otherwise.
 * MEMORYSTATUS_FLAGS_UNSET_TESTING_PID:
 *	succeeds if the caller owns it; EPERM if another process does;
 *	EINVAL if nobody does.
 * Any other flags value yields EINVAL.
 */
static int
memorystatus_cmd_set_testing_pid(int32_t flags)
{
	int error = EINVAL;
	proc_t caller = current_proc();
	assert(caller != kernproc);

	const int caller_pid = proc_getpid(caller);

	proc_list_lock();

	if (flags & MEMORYSTATUS_FLAGS_SET_TESTING_PID) {
		if (memorystatus_testing_pid == 0) {
			/* Unowned: the caller becomes the owner. */
			memorystatus_testing_pid = caller_pid;
			error = 0;
		} else if (memorystatus_testing_pid != caller_pid) {
			/* We don't allow ownership to be taken from another proc. */
			error = EBUSY;
		} else {
			/* The caller already owns it. */
			error = 0;
		}
	} else if (flags & MEMORYSTATUS_FLAGS_UNSET_TESTING_PID) {
		if (memorystatus_testing_pid == caller_pid) {
			memorystatus_testing_pid = 0;
			error = 0;
		} else if (memorystatus_testing_pid != 0) {
			/* We don't allow ownership to be taken from another proc. */
			error = EPERM;
		}
	}

	proc_list_unlock();

	return error;
}
#endif /* DEVELOPMENT || DEBUG */
7116
7117 /*
7118 * Routine: memorystatus_cmd_grp_set_priorities
7119 * Purpose: Update priorities for a group of processes.
7120 *
7121 * [priority]
7122 * Move each process out of its effective priority
7123 * band and into a new priority band.
7124 * Maintains relative order from lowest to highest priority.
7125 * In single band, maintains relative order from head to tail.
7126 *
7127 * eg: before [effectivepriority | pid]
7128 * [18 | p101 ]
7129 * [17 | p55, p67, p19 ]
7130 * [12 | p103 p10 ]
7131 * [ 7 | p25 ]
7132 * [ 0 | p71, p82, ]
7133 *
7134 * after [ new band | pid]
7135 * [ xxx | p71, p82, p25, p103, p10, p55, p67, p19, p101]
7136 *
7137 * Returns: 0 on success, else non-zero.
7138 *
7139 * Caveat: We know there is a race window regarding recycled pids.
7140 * A process could be killed before the kernel can act on it here.
7141 * If a pid cannot be found in any of the jetsam priority bands,
7142 * then we simply ignore it. No harm.
7143 * But, if the pid has been recycled then it could be an issue.
7144 * In that scenario, we might move an unsuspecting process to the new
7145 * priority band. It's not clear how the kernel can safeguard
7146 * against this, but it would be an extremely rare case anyway.
7147 * The caller of this api might avoid such race conditions by
7148 * ensuring that the processes passed in the pid list are suspended.
7149 */
7150
7151
7152 static int
7153 memorystatus_cmd_grp_set_priorities(user_addr_t buffer, size_t buffer_size)
7154 {
7155 /*
7156 * We only handle setting priority
7157 * per process
7158 */
7159
7160 int error = 0;
7161 memorystatus_properties_entry_v1_t *entries = NULL;
7162 size_t entry_count = 0;
7163
7164 /* This will be the ordered proc list */
7165 typedef struct memorystatus_internal_properties {
7166 proc_t proc;
7167 int32_t priority;
7168 } memorystatus_internal_properties_t;
7169
7170 memorystatus_internal_properties_t *table = NULL;
7171 uint32_t table_count = 0;
7172
7173 size_t i = 0;
7174 uint32_t bucket_index = 0;
7175 boolean_t head_insert;
7176 int32_t new_priority;
7177
7178 proc_t p;
7179
7180 /* Verify inputs */
7181 if ((buffer == USER_ADDR_NULL) || (buffer_size == 0)) {
7182 error = EINVAL;
7183 goto out;
7184 }
7185
7186 entry_count = (buffer_size / sizeof(memorystatus_properties_entry_v1_t));
7187 if (entry_count == 0) {
7188 /* buffer size was not large enough for a single entry */
7189 error = EINVAL;
7190 goto out;
7191 }
7192
7193 if ((entries = kalloc_data(buffer_size, Z_WAITOK)) == NULL) {
7194 error = ENOMEM;
7195 goto out;
7196 }
7197
7198 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_START, MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY, entry_count, 0, 0, 0);
7199
7200 if ((error = copyin(buffer, entries, buffer_size)) != 0) {
7201 goto out;
7202 }
7203
7204 /* Verify sanity of input priorities */
7205 if (entries[0].version == MEMORYSTATUS_MPE_VERSION_1) {
7206 if ((buffer_size % MEMORYSTATUS_MPE_VERSION_1_SIZE) != 0) {
7207 error = EINVAL;
7208 goto out;
7209 }
7210 } else {
7211 error = EINVAL;
7212 goto out;
7213 }
7214
7215 for (i = 0; i < entry_count; i++) {
7216 if (entries[i].priority == -1) {
7217 /* Use as shorthand for default priority */
7218 entries[i].priority = JETSAM_PRIORITY_DEFAULT;
7219 } else if ((entries[i].priority == system_procs_aging_band) || (entries[i].priority == applications_aging_band)) {
7220 /* Both the aging bands are reserved for internal use;
7221 * if requested, adjust to JETSAM_PRIORITY_IDLE. */
7222 entries[i].priority = JETSAM_PRIORITY_IDLE;
7223 } else if (entries[i].priority == JETSAM_PRIORITY_IDLE_HEAD) {
7224 /* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle
7225 * queue */
7226 /* Deal with this later */
7227 } else if ((entries[i].priority < 0) || (entries[i].priority >= MEMSTAT_BUCKET_COUNT)) {
7228 /* Sanity check */
7229 error = EINVAL;
7230 goto out;
7231 }
7232 }
7233
7234 table = kalloc_type(memorystatus_internal_properties_t, entry_count,
7235 Z_WAITOK | Z_ZERO);
7236 if (table == NULL) {
7237 error = ENOMEM;
7238 goto out;
7239 }
7240
7241
7242 /*
7243 * For each jetsam bucket entry, spin through the input property list.
7244 * When a matching pid is found, populate an adjacent table with the
7245 * appropriate proc pointer and new property values.
7246 * This traversal automatically preserves order from lowest
7247 * to highest priority.
7248 */
7249
7250 bucket_index = 0;
7251
7252 proc_list_lock();
7253
7254 /* Create the ordered table */
7255 p = memorystatus_get_first_proc_locked(&bucket_index, TRUE);
7256 while (p && (table_count < entry_count)) {
7257 for (i = 0; i < entry_count; i++) {
7258 if (proc_getpid(p) == entries[i].pid) {
7259 /* Build the table data */
7260 table[table_count].proc = p;
7261 table[table_count].priority = entries[i].priority;
7262 table_count++;
7263 break;
7264 }
7265 }
7266 p = memorystatus_get_next_proc_locked(&bucket_index, p, TRUE);
7267 }
7268
7269 /* We now have ordered list of procs ready to move */
7270 for (i = 0; i < table_count; i++) {
7271 p = table[i].proc;
7272 assert(p != NULL);
7273
7274 /* Allow head inserts -- but relative order is now */
7275 if (table[i].priority == JETSAM_PRIORITY_IDLE_HEAD) {
7276 new_priority = JETSAM_PRIORITY_IDLE;
7277 head_insert = true;
7278 } else {
7279 new_priority = table[i].priority;
7280 head_insert = false;
7281 }
7282
7283 /* Not allowed */
7284 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
7285 continue;
7286 }
7287
7288 /*
7289 * Take appropriate steps if moving proc out of
7290 * either of the aging bands.
7291 */
7292 if ((p->p_memstat_effectivepriority == system_procs_aging_band) || (p->p_memstat_effectivepriority == applications_aging_band)) {
7293 memorystatus_invalidate_idle_demotion_locked(p, TRUE);
7294 }
7295
7296 memorystatus_update_priority_locked(p, new_priority, head_insert, false);
7297 }
7298
7299 proc_list_unlock();
7300
7301 /*
7302 * if (table_count != entry_count)
7303 * then some pids were not found in a jetsam band.
7304 * harmless but interesting...
7305 */
7306 out:
7307 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_END, MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY, entry_count, table_count, 0, 0);
7308
7309 kfree_data(entries, buffer_size);
7310 kfree_type(memorystatus_internal_properties_t, entry_count, table);
7311
7312 return error;
7313 }
7314
7315 memorystatus_internal_probabilities_t *memorystatus_global_probabilities_table = NULL;
7316 size_t memorystatus_global_probabilities_size = 0;
7317
7318 static int
7319 memorystatus_cmd_grp_set_probabilities(user_addr_t buffer, size_t buffer_size)
7320 {
7321 int error = 0;
7322 memorystatus_properties_entry_v1_t *entries = NULL;
7323 size_t entry_count = 0, i = 0;
7324 memorystatus_internal_probabilities_t *tmp_table_new = NULL, *tmp_table_old = NULL;
7325 size_t tmp_table_new_size = 0, tmp_table_old_size = 0;
7326 #if DEVELOPMENT || DEBUG
7327 if (memorystatus_testing_pid != 0 && memorystatus_testing_pid != proc_getpid(current_proc())) {
7328 /* probabilites are currently owned by someone else. Don't change them. */
7329 error = EPERM;
7330 goto out;
7331 }
7332 #endif /* (DEVELOPMENT || DEBUG)*/
7333
7334 /* Verify inputs */
7335 if ((buffer == USER_ADDR_NULL) || (buffer_size == 0)) {
7336 error = EINVAL;
7337 goto out;
7338 }
7339
7340 entry_count = (buffer_size / sizeof(memorystatus_properties_entry_v1_t));
7341
7342 if ((entries = kalloc_data(buffer_size, Z_WAITOK)) == NULL) {
7343 error = ENOMEM;
7344 goto out;
7345 }
7346
7347 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_START, MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY, entry_count, 0, 0, 0);
7348
7349 if ((error = copyin(buffer, entries, buffer_size)) != 0) {
7350 goto out;
7351 }
7352
7353 if (entries[0].version == MEMORYSTATUS_MPE_VERSION_1) {
7354 if ((buffer_size % MEMORYSTATUS_MPE_VERSION_1_SIZE) != 0) {
7355 error = EINVAL;
7356 goto out;
7357 }
7358 } else {
7359 error = EINVAL;
7360 goto out;
7361 }
7362
7363 /* Verify sanity of input priorities */
7364 for (i = 0; i < entry_count; i++) {
7365 /*
7366 * 0 - low probability of use.
7367 * 1 - high probability of use.
7368 *
7369 * Keeping this field an int (& not a bool) to allow
7370 * us to experiment with different values/approaches
7371 * later on.
7372 */
7373 if (entries[i].use_probability > 1) {
7374 error = EINVAL;
7375 goto out;
7376 }
7377 }
7378
7379 tmp_table_new_size = sizeof(memorystatus_internal_probabilities_t) * entry_count;
7380
7381 if ((tmp_table_new = kalloc_data(tmp_table_new_size, Z_WAITOK | Z_ZERO)) == NULL) {
7382 error = ENOMEM;
7383 goto out;
7384 }
7385
7386 proc_list_lock();
7387
7388 if (memorystatus_global_probabilities_table) {
7389 tmp_table_old = memorystatus_global_probabilities_table;
7390 tmp_table_old_size = memorystatus_global_probabilities_size;
7391 }
7392
7393 memorystatus_global_probabilities_table = tmp_table_new;
7394 memorystatus_global_probabilities_size = tmp_table_new_size;
7395 tmp_table_new = NULL;
7396
7397 for (i = 0; i < entry_count; i++) {
7398 /* Build the table data */
7399 strlcpy(memorystatus_global_probabilities_table[i].proc_name, entries[i].proc_name, MAXCOMLEN + 1);
7400 memorystatus_global_probabilities_table[i].use_probability = entries[i].use_probability;
7401 }
7402
7403 proc_list_unlock();
7404
7405 out:
7406 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_END, MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY, entry_count, tmp_table_new_size, 0, 0);
7407
7408 kfree_data(entries, buffer_size);
7409 kfree_data(tmp_table_old, tmp_table_old_size);
7410
7411 return error;
7412 }
7413
7414 static int
7415 memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
7416 {
7417 int error = 0;
7418
7419 if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY) == MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY) {
7420 error = memorystatus_cmd_grp_set_priorities(buffer, buffer_size);
7421 } else if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY) == MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY) {
7422 error = memorystatus_cmd_grp_set_probabilities(buffer, buffer_size);
7423 #if CONFIG_FREEZE
7424 } else if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_FREEZE_PRIORITY) == MEMORYSTATUS_FLAGS_GRP_SET_FREEZE_PRIORITY) {
7425 error = memorystatus_cmd_grp_set_freeze_list(buffer, buffer_size);
7426 } else if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_DEMOTE_PRIORITY) == MEMORYSTATUS_FLAGS_GRP_SET_DEMOTE_PRIORITY) {
7427 error = memorystatus_cmd_grp_set_demote_list(buffer, buffer_size);
7428 #endif /* CONFIG_FREEZE */
7429 } else {
7430 error = EINVAL;
7431 }
7432
7433 return error;
7434 }
7435
7436 /*
7437 * This routine is used to update a process's jetsam priority position and stored user_data.
7438 * It is not used for the setting of memory limits, which is why the last 6 args to the
7439 * memorystatus_update() call are 0 or FALSE.
7440 *
7441 * Flags passed into this call are used to distinguish the motivation behind a jetsam priority
7442 * transition. By default, the kernel updates the process's original requested priority when
7443 * no flag is passed. But when the MEMORYSTATUS_SET_PRIORITY_ASSERTION flag is used, the kernel
7444 * updates the process's assertion driven priority.
7445 *
7446 * The assertion flag was introduced for use by the device's assertion mediator (eg: runningboardd).
7447 * When an assertion is controlling a process's jetsam priority, it may conflict with that process's
7448 * dirty/clean (active/inactive) jetsam state. The kernel attempts to resolve a priority transition
7449 * conflict by reviewing the process state and then choosing the maximum jetsam band at play,
7450 * eg: requested priority versus assertion priority.
7451 */
7452
7453 static int
7454 memorystatus_cmd_set_priority_properties(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
7455 {
7456 int error = 0;
7457 boolean_t is_assertion = FALSE; /* priority is driven by an assertion */
7458 memorystatus_priority_properties_t mpp_entry;
7459
7460 /* Validate inputs */
7461 if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_priority_properties_t))) {
7462 return EINVAL;
7463 }
7464
7465 /* Validate flags */
7466 if (flags == 0) {
7467 /*
7468 * Default. This path updates requestedpriority.
7469 */
7470 } else {
7471 if (flags & ~(MEMORYSTATUS_SET_PRIORITY_ASSERTION)) {
7472 /*
7473 * Unsupported bit set in flag.
7474 */
7475 return EINVAL;
7476 } else if (flags & MEMORYSTATUS_SET_PRIORITY_ASSERTION) {
7477 is_assertion = TRUE;
7478 }
7479 }
7480
7481 error = copyin(buffer, &mpp_entry, buffer_size);
7482
7483 if (error == 0) {
7484 proc_t p;
7485
7486 p = proc_find(pid);
7487 if (!p) {
7488 return ESRCH;
7489 }
7490
7491 if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
7492 proc_rele(p);
7493 return EPERM;
7494 }
7495
7496 if (is_assertion) {
7497 os_log(OS_LOG_DEFAULT, "memorystatus: set assertion priority(%d) target %s:%d\n",
7498 mpp_entry.priority, (*p->p_name ? p->p_name : "unknown"), proc_getpid(p));
7499 }
7500
7501 error = memorystatus_update(p, mpp_entry.priority, mpp_entry.user_data, is_assertion, FALSE, FALSE, 0, 0, FALSE, FALSE);
7502 proc_rele(p);
7503 }
7504
7505 return error;
7506 }
7507
7508 static int
7509 memorystatus_cmd_set_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
7510 {
7511 int error = 0;
7512 memorystatus_memlimit_properties_t mmp_entry;
7513
7514 /* Validate inputs */
7515 if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(memorystatus_memlimit_properties_t))) {
7516 return EINVAL;
7517 }
7518
7519 error = copyin(buffer, &mmp_entry, buffer_size);
7520
7521 if (error == 0) {
7522 error = memorystatus_set_memlimit_properties(pid, &mmp_entry);
7523 }
7524
7525 return error;
7526 }
7527
7528 static void
7529 memorystatus_get_memlimit_properties_internal(proc_t p, memorystatus_memlimit_properties_t* p_entry)
7530 {
7531 memset(p_entry, 0, sizeof(memorystatus_memlimit_properties_t));
7532
7533 if (p->p_memstat_memlimit_active > 0) {
7534 p_entry->memlimit_active = p->p_memstat_memlimit_active;
7535 } else {
7536 task_convert_phys_footprint_limit(-1, &p_entry->memlimit_active);
7537 }
7538
7539 if (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL) {
7540 p_entry->memlimit_active_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7541 }
7542
7543 /*
7544 * Get the inactive limit and attributes
7545 */
7546 if (p->p_memstat_memlimit_inactive <= 0) {
7547 task_convert_phys_footprint_limit(-1, &p_entry->memlimit_inactive);
7548 } else {
7549 p_entry->memlimit_inactive = p->p_memstat_memlimit_inactive;
7550 }
7551 if (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL) {
7552 p_entry->memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7553 }
7554 }
7555
7556 /*
7557 * When getting the memlimit settings, we can't simply call task_get_phys_footprint_limit().
7558 * That gets the proc's cached memlimit and there is no guarantee that the active/inactive
7559 * limits will be the same in the no-limit case. Instead we convert limits <= 0 using
7560 * task_convert_phys_footprint_limit(). It computes the same limit value that would be written
7561 * to the task's ledgers via task_set_phys_footprint_limit().
7562 */
7563 static int
7564 memorystatus_cmd_get_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
7565 {
7566 memorystatus_memlimit_properties2_t mmp_entry;
7567
7568 /* Validate inputs */
7569 if ((pid == 0) || (buffer == USER_ADDR_NULL) ||
7570 ((buffer_size != sizeof(memorystatus_memlimit_properties_t)) &&
7571 (buffer_size != sizeof(memorystatus_memlimit_properties2_t)))) {
7572 return EINVAL;
7573 }
7574
7575 memset(&mmp_entry, 0, sizeof(memorystatus_memlimit_properties2_t));
7576
7577 proc_t p = proc_find(pid);
7578 if (!p) {
7579 return ESRCH;
7580 }
7581
7582 /*
7583 * Get the active limit and attributes.
7584 * No locks taken since we hold a reference to the proc.
7585 */
7586
7587 memorystatus_get_memlimit_properties_internal(p, &mmp_entry.v1);
7588
7589 #if CONFIG_JETSAM
7590 #if DEVELOPMENT || DEBUG
7591 /*
7592 * Get the limit increased via SPI
7593 */
7594 mmp_entry.memlimit_increase = roundToNearestMB(p->p_memlimit_increase);
7595 mmp_entry.memlimit_increase_bytes = p->p_memlimit_increase;
7596 #endif /* DEVELOPMENT || DEBUG */
7597 #endif /* CONFIG_JETSAM */
7598
7599 proc_rele(p);
7600
7601 int error = copyout(&mmp_entry, buffer, buffer_size);
7602
7603 return error;
7604 }
7605
7606
7607 /*
7608 * SPI for kbd - pr24956468
7609 * This is a very simple snapshot that calculates how much a
7610 * process's phys_footprint exceeds a specific memory limit.
7611 * Only the inactive memory limit is supported for now.
7612 * The delta is returned as bytes in excess or zero.
7613 */
7614 static int
7615 memorystatus_cmd_get_memlimit_excess_np(pid_t pid, uint32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
7616 {
7617 int error = 0;
7618 uint64_t footprint_in_bytes = 0;
7619 uint64_t delta_in_bytes = 0;
7620 int32_t memlimit_mb = 0;
7621 uint64_t memlimit_bytes = 0;
7622
7623 /* Validate inputs */
7624 if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size != sizeof(uint64_t)) || (flags != 0)) {
7625 return EINVAL;
7626 }
7627
7628 proc_t p = proc_find(pid);
7629 if (!p) {
7630 return ESRCH;
7631 }
7632
7633 /*
7634 * Get the inactive limit.
7635 * No locks taken since we hold a reference to the proc.
7636 */
7637
7638 if (p->p_memstat_memlimit_inactive <= 0) {
7639 task_convert_phys_footprint_limit(-1, &memlimit_mb);
7640 } else {
7641 memlimit_mb = p->p_memstat_memlimit_inactive;
7642 }
7643
7644 footprint_in_bytes = get_task_phys_footprint(p->task);
7645
7646 proc_rele(p);
7647
7648 memlimit_bytes = memlimit_mb * 1024 * 1024; /* MB to bytes */
7649
7650 /*
7651 * Computed delta always returns >= 0 bytes
7652 */
7653 if (footprint_in_bytes > memlimit_bytes) {
7654 delta_in_bytes = footprint_in_bytes - memlimit_bytes;
7655 }
7656
7657 error = copyout(&delta_in_bytes, buffer, sizeof(delta_in_bytes));
7658
7659 return error;
7660 }
7661
7662
7663 static int
7664 memorystatus_cmd_get_pressure_status(int32_t *retval)
7665 {
7666 int error;
7667
7668 /* Need privilege for check */
7669 error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
7670 if (error) {
7671 return error;
7672 }
7673
7674 /* Inherently racy, so it's not worth taking a lock here */
7675 *retval = (kVMPressureNormal != memorystatus_vm_pressure_level) ? 1 : 0;
7676
7677 return error;
7678 }
7679
7680 int
7681 memorystatus_get_pressure_status_kdp()
7682 {
7683 return (kVMPressureNormal != memorystatus_vm_pressure_level) ? 1 : 0;
7684 }
7685
/*
 * Every process, including a P_MEMSTAT_INTERNAL process (currently only pid 1), is allowed to set a HWM.
 *
 * This call is inflexible -- it does not distinguish between active/inactive, fatal/non-fatal.
 * So, with the 2-level HWM, preserving the previous behavior maps as follows:
 *      - treat the limit passed in as both an active and inactive limit.
 *      - treat the is_fatal_limit flag as though it applies to both active and inactive limits.
 *
 * When invoked via MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK
 *      - the is_fatal_limit is FALSE, meaning the active and inactive limits are non-fatal/soft
 *      - so mapping is (active/non-fatal, inactive/non-fatal)
 *
 * When invoked via MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT
 *      - the is_fatal_limit is TRUE, meaning the process's active and inactive limits are fatal/hard
 *      - so mapping is (active/fatal, inactive/fatal)
 */

#if CONFIG_JETSAM
static int
memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __unused int32_t *retval, boolean_t is_fatal_limit)
{
	memorystatus_memlimit_properties_t entry;

	/* The same limit applies whether the process is active or inactive. */
	entry.memlimit_active = high_water_mark;
	entry.memlimit_inactive = high_water_mark;

	/* Both attribute words start clear; the fatal bit is added to both or to neither. */
	entry.memlimit_active_attr = 0;
	entry.memlimit_inactive_attr = 0;
	if (is_fatal_limit == TRUE) {
		entry.memlimit_active_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
		entry.memlimit_inactive_attr |= MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
	}

	return memorystatus_set_memlimit_properties(pid, &entry);
}
#endif /* CONFIG_JETSAM */
7724
7725 static int
7726 memorystatus_set_memlimit_properties_internal(proc_t p, memorystatus_memlimit_properties_t *p_entry)
7727 {
7728 int error = 0;
7729
7730 LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
7731
7732 /*
7733 * Store the active limit variants in the proc.
7734 */
7735 SET_ACTIVE_LIMITS_LOCKED(p, p_entry->memlimit_active, p_entry->memlimit_active_attr);
7736
7737 /*
7738 * Store the inactive limit variants in the proc.
7739 */
7740 SET_INACTIVE_LIMITS_LOCKED(p, p_entry->memlimit_inactive, p_entry->memlimit_inactive_attr);
7741
7742 /*
7743 * Enforce appropriate limit variant by updating the cached values
7744 * and writing the ledger.
7745 * Limit choice is based on process active/inactive state.
7746 */
7747
7748 if (memorystatus_highwater_enabled) {
7749 boolean_t is_fatal;
7750 boolean_t use_active;
7751
7752 if (proc_jetsam_state_is_active_locked(p) == TRUE) {
7753 CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
7754 use_active = TRUE;
7755 } else {
7756 CACHE_INACTIVE_LIMITS_LOCKED(p, is_fatal);
7757 use_active = FALSE;
7758 }
7759
7760 /* Enforce the limit by writing to the ledgers */
7761 error = (task_set_phys_footprint_limit_internal(p->task, ((p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1), NULL, use_active, is_fatal) == 0) ? 0 : EINVAL;
7762
7763 MEMORYSTATUS_DEBUG(3, "memorystatus_set_memlimit_properties: new limit on pid %d (%dMB %s) current priority (%d) dirty_state?=0x%x %s\n",
7764 proc_getpid(p), (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1),
7765 (p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT ? "F " : "NF"), p->p_memstat_effectivepriority, p->p_memstat_dirty,
7766 (p->p_memstat_dirty ? ((p->p_memstat_dirty & P_DIRTY) ? "isdirty" : "isclean") : ""));
7767 DTRACE_MEMORYSTATUS2(memorystatus_set_memlimit, proc_t, p, int32_t, (p->p_memstat_memlimit > 0 ? p->p_memstat_memlimit : -1));
7768 }
7769
7770 return error;
7771 }
7772
7773 static int
7774 memorystatus_set_memlimit_properties(pid_t pid, memorystatus_memlimit_properties_t *entry)
7775 {
7776 memorystatus_memlimit_properties_t set_entry;
7777
7778 proc_t p = proc_find(pid);
7779 if (!p) {
7780 return ESRCH;
7781 }
7782
7783 /*
7784 * Check for valid attribute flags.
7785 */
7786 const uint32_t valid_attrs = MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7787 if ((entry->memlimit_active_attr & (~valid_attrs)) != 0) {
7788 proc_rele(p);
7789 return EINVAL;
7790 }
7791 if ((entry->memlimit_inactive_attr & (~valid_attrs)) != 0) {
7792 proc_rele(p);
7793 return EINVAL;
7794 }
7795
7796 /*
7797 * Setup the active memlimit properties
7798 */
7799 set_entry.memlimit_active = entry->memlimit_active;
7800 set_entry.memlimit_active_attr = entry->memlimit_active_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7801
7802 /*
7803 * Setup the inactive memlimit properties
7804 */
7805 set_entry.memlimit_inactive = entry->memlimit_inactive;
7806 set_entry.memlimit_inactive_attr = entry->memlimit_inactive_attr & MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7807
7808 /*
7809 * Setting a limit of <= 0 implies that the process has no
7810 * high-water-mark and has no per-task-limit. That means
7811 * the system_wide task limit is in place, which by the way,
7812 * is always fatal.
7813 */
7814
7815 if (set_entry.memlimit_active <= 0) {
7816 /*
7817 * Enforce the fatal system_wide task limit while process is active.
7818 */
7819 set_entry.memlimit_active = -1;
7820 set_entry.memlimit_active_attr = MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7821 }
7822 #if CONFIG_JETSAM
7823 #if DEVELOPMENT || DEBUG
7824 else {
7825 /* add the current increase to it, for roots */
7826 set_entry.memlimit_active += roundToNearestMB(p->p_memlimit_increase);
7827 }
7828 #endif /* DEVELOPMENT || DEBUG */
7829 #endif /* CONFIG_JETSAM */
7830
7831 if (set_entry.memlimit_inactive <= 0) {
7832 /*
7833 * Enforce the fatal system_wide task limit while process is inactive.
7834 */
7835 set_entry.memlimit_inactive = -1;
7836 set_entry.memlimit_inactive_attr = MEMORYSTATUS_MEMLIMIT_ATTR_FATAL;
7837 }
7838 #if CONFIG_JETSAM
7839 #if DEVELOPMENT || DEBUG
7840 else {
7841 /* add the current increase to it, for roots */
7842 set_entry.memlimit_inactive += roundToNearestMB(p->p_memlimit_increase);
7843 }
7844 #endif /* DEVELOPMENT || DEBUG */
7845 #endif /* CONFIG_JETSAM */
7846
7847 proc_list_lock();
7848
7849 int error = memorystatus_set_memlimit_properties_internal(p, &set_entry);
7850
7851 proc_list_unlock();
7852 proc_rele(p);
7853
7854 return error;
7855 }
7856
7857 /*
7858 * Returns the jetsam priority (effective or requested) of the process
7859 * associated with this task.
7860 */
7861 int
7862 proc_get_memstat_priority(proc_t p, boolean_t effective_priority)
7863 {
7864 if (p) {
7865 if (effective_priority) {
7866 return p->p_memstat_effectivepriority;
7867 } else {
7868 return p->p_memstat_requestedpriority;
7869 }
7870 }
7871 return 0;
7872 }
7873
7874 static int
7875 memorystatus_get_process_is_managed(pid_t pid, int *is_managed)
7876 {
7877 proc_t p = NULL;
7878
7879 /* Validate inputs */
7880 if (pid == 0) {
7881 return EINVAL;
7882 }
7883
7884 p = proc_find(pid);
7885 if (!p) {
7886 return ESRCH;
7887 }
7888
7889 proc_list_lock();
7890 *is_managed = ((p->p_memstat_state & P_MEMSTAT_MANAGED) ? 1 : 0);
7891 proc_rele(p);
7892 proc_list_unlock();
7893
7894 return 0;
7895 }
7896
7897 static int
7898 memorystatus_set_process_is_managed(pid_t pid, boolean_t set_managed)
7899 {
7900 proc_t p = NULL;
7901
7902 /* Validate inputs */
7903 if (pid == 0) {
7904 return EINVAL;
7905 }
7906
7907 p = proc_find(pid);
7908 if (!p) {
7909 return ESRCH;
7910 }
7911
7912 proc_list_lock();
7913 if (set_managed == TRUE) {
7914 p->p_memstat_state |= P_MEMSTAT_MANAGED;
7915 /*
7916 * The P_MEMSTAT_MANAGED bit is set by assertiond for Apps.
7917 * Also opt them in to being frozen (they might have started
7918 * off with the P_MEMSTAT_FREEZE_DISABLED bit set.)
7919 */
7920 p->p_memstat_state &= ~P_MEMSTAT_FREEZE_DISABLED;
7921 } else {
7922 p->p_memstat_state &= ~P_MEMSTAT_MANAGED;
7923 }
7924 proc_rele(p);
7925 proc_list_unlock();
7926
7927 return 0;
7928 }
7929
7930 int
7931 memorystatus_control(struct proc *p, struct memorystatus_control_args *args, int *ret)
7932 {
7933 int error = EINVAL;
7934 boolean_t skip_auth_check = FALSE;
7935 os_reason_t jetsam_reason = OS_REASON_NULL;
7936
7937 #if !CONFIG_JETSAM
7938 #pragma unused(ret)
7939 #pragma unused(jetsam_reason)
7940 #endif
7941
7942 /* We don't need entitlements if we're setting / querying the freeze preference or frozen status for a process. */
7943 if (args->command == MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE ||
7944 args->command == MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE ||
7945 args->command == MEMORYSTATUS_CMD_GET_PROCESS_IS_FROZEN) {
7946 skip_auth_check = TRUE;
7947 }
7948
7949 /* Need to be root or have entitlement. */
7950 if (!kauth_cred_issuser(kauth_cred_get()) && !IOCurrentTaskHasEntitlement(MEMORYSTATUS_ENTITLEMENT) && !skip_auth_check) {
7951 error = EPERM;
7952 goto out;
7953 }
7954
7955 /*
7956 * Sanity check.
7957 * Do not enforce it for snapshots.
7958 */
7959 if (args->command != MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT) {
7960 if (args->buffersize > MEMORYSTATUS_BUFFERSIZE_MAX) {
7961 error = EINVAL;
7962 goto out;
7963 }
7964 }
7965
7966 #if CONFIG_MACF
7967 error = mac_proc_check_memorystatus_control(p, args->command, args->pid);
7968 if (error) {
7969 goto out;
7970 }
7971 #endif /* MAC */
7972
7973 switch (args->command) {
7974 case MEMORYSTATUS_CMD_GET_PRIORITY_LIST:
7975 error = memorystatus_cmd_get_priority_list(args->pid, args->buffer, args->buffersize, ret);
7976 break;
7977 case MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES:
7978 error = memorystatus_cmd_set_priority_properties(args->pid, args->flags, args->buffer, args->buffersize, ret);
7979 break;
7980 case MEMORYSTATUS_CMD_SET_MEMLIMIT_PROPERTIES:
7981 error = memorystatus_cmd_set_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
7982 break;
7983 case MEMORYSTATUS_CMD_GET_MEMLIMIT_PROPERTIES:
7984 error = memorystatus_cmd_get_memlimit_properties(args->pid, args->buffer, args->buffersize, ret);
7985 break;
7986 case MEMORYSTATUS_CMD_GET_MEMLIMIT_EXCESS:
7987 error = memorystatus_cmd_get_memlimit_excess_np(args->pid, args->flags, args->buffer, args->buffersize, ret);
7988 break;
7989 case MEMORYSTATUS_CMD_GRP_SET_PROPERTIES:
7990 error = memorystatus_cmd_grp_set_properties((int32_t)args->flags, args->buffer, args->buffersize, ret);
7991 break;
7992 case MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT:
7993 error = memorystatus_cmd_get_jetsam_snapshot((int32_t)args->flags, args->buffer, args->buffersize, ret);
7994 break;
7995 #if DEVELOPMENT || DEBUG
7996 case MEMORYSTATUS_CMD_SET_TESTING_PID:
7997 error = memorystatus_cmd_set_testing_pid((int32_t) args->flags);
7998 break;
7999 #endif
8000 case MEMORYSTATUS_CMD_GET_PRESSURE_STATUS:
8001 error = memorystatus_cmd_get_pressure_status(ret);
8002 break;
8003 #if CONFIG_JETSAM
8004 case MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK:
8005 /*
8006 * This call does not distinguish between active and inactive limits.
8007 * Default behavior in 2-level HWM world is to set both.
8008 * Non-fatal limit is also assumed for both.
8009 */
8010 error = memorystatus_cmd_set_jetsam_memory_limit(args->pid, (int32_t)args->flags, ret, FALSE);
8011 break;
8012 case MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT:
8013 /*
8014 * This call does not distinguish between active and inactive limits.
8015 * Default behavior in 2-level HWM world is to set both.
8016 * Fatal limit is also assumed for both.
8017 */
8018 error = memorystatus_cmd_set_jetsam_memory_limit(args->pid, (int32_t)args->flags, ret, TRUE);
8019 break;
8020 #endif /* CONFIG_JETSAM */
8021 /* Test commands */
8022 #if DEVELOPMENT || DEBUG
8023 case MEMORYSTATUS_CMD_TEST_JETSAM:
8024 jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_GENERIC);
8025 if (jetsam_reason == OS_REASON_NULL) {
8026 printf("memorystatus_control: failed to allocate jetsam reason\n");
8027 }
8028
8029 error = memorystatus_kill_process_sync(args->pid, kMemorystatusKilled, jetsam_reason) ? 0 : EINVAL;
8030 break;
8031 case MEMORYSTATUS_CMD_TEST_JETSAM_SORT:
8032 error = memorystatus_cmd_test_jetsam_sort(args->pid, (int32_t)args->flags, args->buffer, args->buffersize);
8033 break;
8034 #else /* DEVELOPMENT || DEBUG */
8035 #pragma unused(jetsam_reason)
8036 #endif /* DEVELOPMENT || DEBUG */
8037 case MEMORYSTATUS_CMD_AGGRESSIVE_JETSAM_LENIENT_MODE_ENABLE:
8038 if (memorystatus_aggressive_jetsam_lenient_allowed == FALSE) {
8039 #if DEVELOPMENT || DEBUG
8040 printf("Enabling Lenient Mode\n");
8041 #endif /* DEVELOPMENT || DEBUG */
8042
8043 memorystatus_aggressive_jetsam_lenient_allowed = TRUE;
8044 memorystatus_aggressive_jetsam_lenient = TRUE;
8045 error = 0;
8046 }
8047 break;
8048 case MEMORYSTATUS_CMD_AGGRESSIVE_JETSAM_LENIENT_MODE_DISABLE:
8049 #if DEVELOPMENT || DEBUG
8050 printf("Disabling Lenient mode\n");
8051 #endif /* DEVELOPMENT || DEBUG */
8052 memorystatus_aggressive_jetsam_lenient_allowed = FALSE;
8053 memorystatus_aggressive_jetsam_lenient = FALSE;
8054 error = 0;
8055 break;
8056 case MEMORYSTATUS_CMD_GET_AGGRESSIVE_JETSAM_LENIENT_MODE:
8057 *ret = (memorystatus_aggressive_jetsam_lenient ? 1 : 0);
8058 error = 0;
8059 break;
8060 case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE:
8061 case MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE:
8062 error = memorystatus_low_mem_privileged_listener(args->command);
8063 break;
8064
8065 case MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE:
8066 case MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE:
8067 error = memorystatus_update_inactive_jetsam_priority_band(args->pid, args->command, JETSAM_PRIORITY_ELEVATED_INACTIVE, args->flags ? TRUE : FALSE);
8068 break;
8069 case MEMORYSTATUS_CMD_SET_PROCESS_IS_MANAGED:
8070 error = memorystatus_set_process_is_managed(args->pid, args->flags);
8071 break;
8072
8073 case MEMORYSTATUS_CMD_GET_PROCESS_IS_MANAGED:
8074 error = memorystatus_get_process_is_managed(args->pid, ret);
8075 break;
8076
8077 #if CONFIG_FREEZE
8078 case MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE:
8079 error = memorystatus_set_process_is_freezable(args->pid, args->flags ? TRUE : FALSE);
8080 break;
8081
8082 case MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE:
8083 error = memorystatus_get_process_is_freezable(args->pid, ret);
8084 break;
8085 case MEMORYSTATUS_CMD_GET_PROCESS_IS_FROZEN:
8086 error = memorystatus_get_process_is_frozen(args->pid, ret);
8087 break;
8088
8089 case MEMORYSTATUS_CMD_FREEZER_CONTROL:
8090 error = memorystatus_freezer_control(args->flags, args->buffer, args->buffersize, ret);
8091 break;
8092 #endif /* CONFIG_FREEZE */
8093
8094 #if DEVELOPMENT || DEBUG
8095 case MEMORYSTATUS_CMD_INCREASE_JETSAM_TASK_LIMIT:
8096 error = memorystatus_cmd_increase_jetsam_task_limit(args->pid, args->flags);
8097 break;
8098 #endif /* DEVELOPMENT || DEBUG */
8099
8100 default:
8101 error = EINVAL;
8102 break;
8103 }
8104
8105 out:
8106 return error;
8107 }
8108
8109 /* Coalition support */
8110
/*
 * sorting info for a particular priority bucket
 *
 * One entry is filled in per coalition leader found while walking the bucket.
 */
typedef struct memstat_sort_info {
	coalition_t msi_coal;           /* jetsam coalition of the leader process */
	uint64_t msi_page_count;        /* total coalition page count (the sort key) */
	pid_t msi_pid;                  /* pid of the coalition leader */
	int msi_ntasks;                 /* number of tasks in the coalition */
} memstat_sort_info_t;
8118
8119 /*
8120 * qsort from smallest page count to largest page count
8121 *
8122 * return < 0 for a < b
8123 * 0 for a == b
8124 * > 0 for a > b
8125 */
8126 static int
8127 memstat_asc_cmp(const void *a, const void *b)
8128 {
8129 const memstat_sort_info_t *msA = (const memstat_sort_info_t *)a;
8130 const memstat_sort_info_t *msB = (const memstat_sort_info_t *)b;
8131
8132 return (int)((uint64_t)msA->msi_page_count - (uint64_t)msB->msi_page_count);
8133 }
8134
8135 /*
8136 * Return the number of pids rearranged during this sort.
8137 */
8138 static int
8139 memorystatus_sort_by_largest_coalition_locked(unsigned int bucket_index, int coal_sort_order)
8140 {
8141 #define MAX_SORT_PIDS 80
8142 #define MAX_COAL_LEADERS 10
8143
8144 unsigned int b = bucket_index;
8145 int nleaders = 0;
8146 int ntasks = 0;
8147 proc_t p = NULL;
8148 coalition_t coal = COALITION_NULL;
8149 int pids_moved = 0;
8150 int total_pids_moved = 0;
8151 int i;
8152
8153 /*
8154 * The system is typically under memory pressure when in this
8155 * path, hence, we want to avoid dynamic memory allocation.
8156 */
8157 memstat_sort_info_t leaders[MAX_COAL_LEADERS];
8158 pid_t pid_list[MAX_SORT_PIDS];
8159
8160 if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
8161 return 0;
8162 }
8163
8164 /*
8165 * Clear the array that holds coalition leader information
8166 */
8167 for (i = 0; i < MAX_COAL_LEADERS; i++) {
8168 leaders[i].msi_coal = COALITION_NULL;
8169 leaders[i].msi_page_count = 0; /* will hold total coalition page count */
8170 leaders[i].msi_pid = 0; /* will hold coalition leader pid */
8171 leaders[i].msi_ntasks = 0; /* will hold the number of tasks in a coalition */
8172 }
8173
8174 p = memorystatus_get_first_proc_locked(&b, FALSE);
8175 while (p) {
8176 coal = task_get_coalition(p->task, COALITION_TYPE_JETSAM);
8177 if (coalition_is_leader(p->task, coal)) {
8178 if (nleaders < MAX_COAL_LEADERS) {
8179 int coal_ntasks = 0;
8180 uint64_t coal_page_count = coalition_get_page_count(coal, &coal_ntasks);
8181 leaders[nleaders].msi_coal = coal;
8182 leaders[nleaders].msi_page_count = coal_page_count;
8183 leaders[nleaders].msi_pid = proc_getpid(p); /* the coalition leader */
8184 leaders[nleaders].msi_ntasks = coal_ntasks;
8185 nleaders++;
8186 } else {
8187 /*
8188 * We've hit MAX_COAL_LEADERS meaning we can handle no more coalitions.
8189 * Abandoned coalitions will linger at the tail of the priority band
8190 * when this sort session ends.
8191 * TODO: should this be an assert?
8192 */
8193 printf("%s: WARNING: more than %d leaders in priority band [%d]\n",
8194 __FUNCTION__, MAX_COAL_LEADERS, bucket_index);
8195 break;
8196 }
8197 }
8198 p = memorystatus_get_next_proc_locked(&b, p, FALSE);
8199 }
8200
8201 if (nleaders == 0) {
8202 /* Nothing to sort */
8203 return 0;
8204 }
8205
8206 /*
8207 * Sort the coalition leader array, from smallest coalition page count
8208 * to largest coalition page count. When inserted in the priority bucket,
8209 * smallest coalition is handled first, resulting in the last to be jetsammed.
8210 */
8211 if (nleaders > 1) {
8212 qsort(leaders, nleaders, sizeof(memstat_sort_info_t), memstat_asc_cmp);
8213 }
8214
8215 #if 0
8216 for (i = 0; i < nleaders; i++) {
8217 printf("%s: coal_leader[%d of %d] pid[%d] pages[%llu] ntasks[%d]\n",
8218 __FUNCTION__, i, nleaders, leaders[i].msi_pid, leaders[i].msi_page_count,
8219 leaders[i].msi_ntasks);
8220 }
8221 #endif
8222
8223 /*
8224 * During coalition sorting, processes in a priority band are rearranged
8225 * by being re-inserted at the head of the queue. So, when handling a
8226 * list, the first process that gets moved to the head of the queue,
8227 * ultimately gets pushed toward the queue tail, and hence, jetsams last.
8228 *
8229 * So, for example, the coalition leader is expected to jetsam last,
8230 * after its coalition members. Therefore, the coalition leader is
8231 * inserted at the head of the queue first.
8232 *
8233 * After processing a coalition, the jetsam order is as follows:
8234 * undefs(jetsam first), extensions, xpc services, leader(jetsam last)
8235 */
8236
8237 /*
8238 * Coalition members are rearranged in the priority bucket here,
8239 * based on their coalition role.
8240 */
8241 total_pids_moved = 0;
8242 for (i = 0; i < nleaders; i++) {
8243 /* a bit of bookkeeping */
8244 pids_moved = 0;
8245
8246 /* Coalition leaders are jetsammed last, so move into place first */
8247 pid_list[0] = leaders[i].msi_pid;
8248 pids_moved += memorystatus_move_list_locked(bucket_index, pid_list, 1);
8249
8250 /* xpc services should jetsam after extensions */
8251 ntasks = coalition_get_pid_list(leaders[i].msi_coal, COALITION_ROLEMASK_XPC,
8252 coal_sort_order, pid_list, MAX_SORT_PIDS);
8253
8254 if (ntasks > 0) {
8255 pids_moved += memorystatus_move_list_locked(bucket_index, pid_list,
8256 (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
8257 }
8258
8259 /* extensions should jetsam after unmarked processes */
8260 ntasks = coalition_get_pid_list(leaders[i].msi_coal, COALITION_ROLEMASK_EXT,
8261 coal_sort_order, pid_list, MAX_SORT_PIDS);
8262
8263 if (ntasks > 0) {
8264 pids_moved += memorystatus_move_list_locked(bucket_index, pid_list,
8265 (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
8266 }
8267
8268 /* undefined coalition members should be the first to jetsam */
8269 ntasks = coalition_get_pid_list(leaders[i].msi_coal, COALITION_ROLEMASK_UNDEF,
8270 coal_sort_order, pid_list, MAX_SORT_PIDS);
8271
8272 if (ntasks > 0) {
8273 pids_moved += memorystatus_move_list_locked(bucket_index, pid_list,
8274 (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS));
8275 }
8276
8277 #if 0
8278 if (pids_moved == leaders[i].msi_ntasks) {
8279 /*
8280 * All the pids in the coalition were found in this band.
8281 */
8282 printf("%s: pids_moved[%d] equal total coalition ntasks[%d] \n", __FUNCTION__,
8283 pids_moved, leaders[i].msi_ntasks);
8284 } else if (pids_moved > leaders[i].msi_ntasks) {
8285 /*
8286 * Apparently new coalition members showed up during the sort?
8287 */
8288 printf("%s: pids_moved[%d] were greater than expected coalition ntasks[%d] \n", __FUNCTION__,
8289 pids_moved, leaders[i].msi_ntasks);
8290 } else {
8291 /*
8292 * Apparently not all the pids in the coalition were found in this band?
8293 */
8294 printf("%s: pids_moved[%d] were less than expected coalition ntasks[%d] \n", __FUNCTION__,
8295 pids_moved, leaders[i].msi_ntasks);
8296 }
8297 #endif
8298
8299 total_pids_moved += pids_moved;
8300 } /* end for */
8301
8302 return total_pids_moved;
8303 }
8304
8305
8306 /*
8307 * Traverse a list of pids, searching for each within the priority band provided.
8308 * If pid is found, move it to the front of the priority band.
8309 * Never searches outside the priority band provided.
8310 *
8311 * Input:
8312 * bucket_index - jetsam priority band.
8313 * pid_list - pointer to a list of pids.
8314 * list_sz - number of pids in the list.
8315 *
8316 * Pid list ordering is important in that,
8317 * pid_list[n] is expected to jetsam ahead of pid_list[n+1].
8318 * The sort_order is set by the coalition default.
8319 *
8320 * Return:
8321 * the number of pids found and hence moved within the priority band.
8322 */
8323 static int
8324 memorystatus_move_list_locked(unsigned int bucket_index, pid_t *pid_list, int list_sz)
8325 {
8326 memstat_bucket_t *current_bucket;
8327 int i;
8328 int found_pids = 0;
8329
8330 if ((pid_list == NULL) || (list_sz <= 0)) {
8331 return 0;
8332 }
8333
8334 if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
8335 return 0;
8336 }
8337
8338 current_bucket = &memstat_bucket[bucket_index];
8339 for (i = 0; i < list_sz; i++) {
8340 unsigned int b = bucket_index;
8341 proc_t p = NULL;
8342 proc_t aProc = NULL;
8343 pid_t aPid;
8344 int list_index;
8345
8346 list_index = ((list_sz - 1) - i);
8347 aPid = pid_list[list_index];
8348
8349 /* never search beyond bucket_index provided */
8350 p = memorystatus_get_first_proc_locked(&b, FALSE);
8351 while (p) {
8352 if (proc_getpid(p) == aPid) {
8353 aProc = p;
8354 break;
8355 }
8356 p = memorystatus_get_next_proc_locked(&b, p, FALSE);
8357 }
8358
8359 if (aProc == NULL) {
8360 /* pid not found in this band, just skip it */
8361 continue;
8362 } else {
8363 TAILQ_REMOVE(¤t_bucket->list, aProc, p_memstat_list);
8364 TAILQ_INSERT_HEAD(¤t_bucket->list, aProc, p_memstat_list);
8365 found_pids++;
8366 }
8367 }
8368 return found_pids;
8369 }
8370
8371 int
8372 memorystatus_get_proccnt_upto_priority(int32_t max_bucket_index)
8373 {
8374 int32_t i = JETSAM_PRIORITY_IDLE;
8375 int count = 0;
8376
8377 if (max_bucket_index >= MEMSTAT_BUCKET_COUNT) {
8378 return -1;
8379 }
8380
8381 while (i <= max_bucket_index) {
8382 count += memstat_bucket[i++].count;
8383 }
8384
8385 return count;
8386 }
8387
8388 int
8389 memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap)
8390 {
8391 #if !CONFIG_JETSAM
8392 if (!p || (!isApp(p)) || (p->p_memstat_state & (P_MEMSTAT_INTERNAL | P_MEMSTAT_MANAGED))) {
8393 /*
8394 * Ineligible processes OR system processes e.g. launchd.
8395 *
8396 * We also skip processes that have the P_MEMSTAT_MANAGED bit set, i.e.
8397 * they're managed by assertiond. These are iOS apps that have been ported
8398 * to macOS. assertiond might be in the process of modifying the app's
8399 * priority / memory limit - so it might have the proc_list lock, and then try
8400 * to take the task lock. Meanwhile we've entered this function with the task lock
8401 * held, and we need the proc_list lock below. So we'll deadlock with assertiond.
8402 *
8403 * It should be fine to read the P_MEMSTAT_MANAGED bit without the proc_list
8404 * lock here, since assertiond only sets this bit on process launch.
8405 */
8406 return -1;
8407 }
8408
8409 /*
8410 * For macOS only:
8411 * We would like to use memorystatus_update() here to move the processes
8412 * within the bands. Unfortunately memorystatus_update() calls
8413 * memorystatus_update_priority_locked() which uses any band transitions
8414 * as an indication to modify ledgers. For that it needs the task lock
8415 * and since we came into this function with the task lock held, we'll deadlock.
8416 *
8417 * Unfortunately we can't completely disable ledger updates because we still
8418 * need the ledger updates for a subset of processes i.e. daemons.
8419 * When all processes on all platforms support memory limits, we can simply call
8420 * memorystatus_update().
8421 *
8422 * It also has some logic to deal with 'aging' which, currently, is only applicable
8423 * on CONFIG_JETSAM configs. So, till every platform has CONFIG_JETSAM we'll need
8424 * to do this explicit band transition.
8425 */
8426
8427 memstat_bucket_t *current_bucket, *new_bucket;
8428 int32_t priority = 0;
8429
8430 proc_list_lock();
8431
8432 if (proc_list_exited(p) ||
8433 (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED | P_MEMSTAT_SKIP))) {
8434 /*
8435 * If the process is on its way out OR
8436 * jetsam has alread tried and failed to kill this process,
8437 * let's skip the whole jetsam band transition.
8438 */
8439 proc_list_unlock();
8440 return 0;
8441 }
8442
8443 if (is_appnap) {
8444 current_bucket = &memstat_bucket[p->p_memstat_effectivepriority];
8445 new_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
8446 priority = JETSAM_PRIORITY_IDLE;
8447 } else {
8448 if (p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) {
8449 /*
8450 * It is possible that someone pulled this process
8451 * out of the IDLE band without updating its app-nap
8452 * parameters.
8453 */
8454 proc_list_unlock();
8455 return 0;
8456 }
8457
8458 current_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
8459 new_bucket = &memstat_bucket[p->p_memstat_requestedpriority];
8460 priority = p->p_memstat_requestedpriority;
8461 }
8462
8463 TAILQ_REMOVE(¤t_bucket->list, p, p_memstat_list);
8464 current_bucket->count--;
8465 if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
8466 current_bucket->relaunch_high_count--;
8467 }
8468 TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list);
8469 new_bucket->count++;
8470 if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) {
8471 new_bucket->relaunch_high_count++;
8472 }
8473 /*
8474 * Record idle start or idle delta.
8475 */
8476 if (p->p_memstat_effectivepriority == priority) {
8477 /*
8478 * This process is not transitioning between
8479 * jetsam priority buckets. Do nothing.
8480 */
8481 } else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) {
8482 uint64_t now;
8483 /*
8484 * Transitioning out of the idle priority bucket.
8485 * Record idle delta.
8486 */
8487 assert(p->p_memstat_idle_start != 0);
8488 now = mach_absolute_time();
8489 if (now > p->p_memstat_idle_start) {
8490 p->p_memstat_idle_delta = now - p->p_memstat_idle_start;
8491 }
8492 } else if (priority == JETSAM_PRIORITY_IDLE) {
8493 /*
8494 * Transitioning into the idle priority bucket.
8495 * Record idle start.
8496 */
8497 p->p_memstat_idle_start = mach_absolute_time();
8498 }
8499
8500 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CHANGE_PRIORITY), proc_getpid(p), priority, p->p_memstat_effectivepriority, 0, 0);
8501
8502 p->p_memstat_effectivepriority = priority;
8503
8504 proc_list_unlock();
8505
8506 return 0;
8507
8508 #else /* !CONFIG_JETSAM */
8509 #pragma unused(p)
8510 #pragma unused(is_appnap)
8511 return -1;
8512 #endif /* !CONFIG_JETSAM */
8513 }
8514
8515 uint64_t
8516 memorystatus_available_memory_internal(struct proc *p)
8517 {
8518 #ifdef XNU_TARGET_OS_OSX
8519 if (p->p_memstat_memlimit <= 0) {
8520 return 0;
8521 }
8522 #endif /* XNU_TARGET_OS_OSX */
8523 const uint64_t footprint_in_bytes = get_task_phys_footprint(p->task);
8524 int32_t memlimit_mb;
8525 int64_t memlimit_bytes;
8526 int64_t rc;
8527
8528 if (isApp(p) == FALSE) {
8529 return 0;
8530 }
8531
8532 if (p->p_memstat_memlimit > 0) {
8533 memlimit_mb = p->p_memstat_memlimit;
8534 } else if (task_convert_phys_footprint_limit(-1, &memlimit_mb) != KERN_SUCCESS) {
8535 return 0;
8536 }
8537
8538 if (memlimit_mb <= 0) {
8539 memlimit_bytes = INT_MAX & ~((1 << 20) - 1);
8540 } else {
8541 memlimit_bytes = ((int64_t) memlimit_mb) << 20;
8542 }
8543
8544 rc = memlimit_bytes - footprint_in_bytes;
8545
8546 return (rc >= 0) ? rc : 0;
8547 }
8548
8549 int
8550 memorystatus_available_memory(struct proc *p, __unused struct memorystatus_available_memory_args *args, uint64_t *ret)
8551 {
8552 *ret = memorystatus_available_memory_internal(p);
8553
8554 return 0;
8555 }
8556
#if DEVELOPMENT || DEBUG
/*
 * Grow the memory limit increase recorded for a process and re-apply its
 * memory limits accordingly.
 *
 * The new increase is p->p_memlimit_increase + byte_increase, rounded with
 * round_page() and capped at INT32_MAX. Active and inactive limits that are
 * currently positive have the previous increase removed and the new one
 * added, both converted through roundToNearestMB().
 *
 * p->p_memlimit_increase is written with the proc list lock held, so it is
 * also read with that lock held. Otherwise two concurrent callers could both
 * start from the same stale value and one update would be lost.
 *
 * Input:
 *	pid           - process to update; 0 is rejected.
 *	byte_increase - number of bytes to add to the increase; 0 is rejected.
 *
 * Return:
 *	EINVAL - pid or byte_increase is 0.
 *	ESRCH  - proc_find() returned no process for pid.
 *	Otherwise the result of memorystatus_set_memlimit_properties_internal().
 */
static int
memorystatus_cmd_increase_jetsam_task_limit(pid_t pid, uint32_t byte_increase)
{
	memorystatus_memlimit_properties_t mmp_entry;

	/* Validate inputs */
	if ((pid == 0) || (byte_increase == 0)) {
		return EINVAL;
	}

	proc_t p = proc_find(pid);

	if (!p) {
		return ESRCH;
	}

	proc_list_lock();

	const uint32_t current_memlimit_increase = roundToNearestMB(p->p_memlimit_increase);
	/* round to page */
	const int32_t page_aligned_increase = (int32_t) MIN(round_page(p->p_memlimit_increase + byte_increase), INT32_MAX);

	memorystatus_get_memlimit_properties_internal(p, &mmp_entry);

	/* Limits that are not positive are left as they are. */
	if (mmp_entry.memlimit_active > 0) {
		mmp_entry.memlimit_active -= current_memlimit_increase;
		mmp_entry.memlimit_active += roundToNearestMB(page_aligned_increase);
	}

	if (mmp_entry.memlimit_inactive > 0) {
		mmp_entry.memlimit_inactive -= current_memlimit_increase;
		mmp_entry.memlimit_inactive += roundToNearestMB(page_aligned_increase);
	}

	/*
	 * Store the updated delta limit in the proc.
	 */
	p->p_memlimit_increase = page_aligned_increase;

	int error = memorystatus_set_memlimit_properties_internal(p, &mmp_entry);

	proc_list_unlock();
	/* Balances the proc_find() above. */
	proc_rele(p);

	return error;
}
#endif /* DEVELOPMENT || DEBUG */
8605