1 /*
2 * Copyright (c) 2006-2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 *
28 */
29
30 #include <sys/kern_event.h>
31 #include <kern/sched_prim.h>
32 #include <kern/assert.h>
33 #include <kern/debug.h>
34 #include <kern/locks.h>
35 #include <kern/task.h>
36 #include <kern/thread.h>
37 #include <kern/thread_call.h>
38 #include <kern/host.h>
39 #include <kern/policy_internal.h>
40 #include <kern/thread_group.h>
41
42 #include <IOKit/IOBSD.h>
43
44 #include <libkern/libkern.h>
45 #include <libkern/coreanalytics/coreanalytics.h>
46 #include <mach/coalition.h>
47 #include <mach/clock_types.h>
48 #include <mach/mach_time.h>
49 #include <mach/task.h>
50 #include <mach/host_priv.h>
51 #include <mach/mach_host.h>
52 #include <os/log.h>
53 #include <pexpert/pexpert.h>
54 #include <sys/coalition.h>
55 #include <sys/kern_event.h>
56 #include <sys/proc.h>
57 #include <sys/proc_info.h>
58 #include <sys/reason.h>
59 #include <sys/signal.h>
60 #include <sys/signalvar.h>
61 #include <sys/sysctl.h>
62 #include <sys/sysproto.h>
63 #include <sys/time.h>
64 #include <sys/wait.h>
65 #include <sys/tree.h>
66 #include <sys/priv.h>
67 #include <vm/vm_pageout_xnu.h>
68 #include <vm/vm_protos.h>
69 #include <vm/vm_purgeable_xnu.h>
70 #include <mach/machine/sdt.h>
71 #include <libkern/section_keywords.h>
72 #include <stdatomic.h>
73
74 #if CONFIG_FREEZE
75 #include <vm/vm_map.h>
76 #endif /* CONFIG_FREEZE */
77
78 #include <kern/kern_memorystatus_internal.h>
79 #include <sys/kern_memorystatus.h>
80 #include <sys/kern_memorystatus_notify.h>
81 #include <sys/kern_memorystatus_xnu.h>
82
83 /*
84 * Memorystatus klist structures
85 */
86 struct klist memorystatus_klist;
87 static lck_mtx_t memorystatus_klist_mutex;
88 static void memorystatus_klist_lock(void);
89 static void memorystatus_klist_unlock(void);
90
91 /*
92 * Memorystatus kevent filter routines
93 */
94 static int filt_memorystatusattach(struct knote *kn, struct kevent_qos_s *kev);
95 static void filt_memorystatusdetach(struct knote *kn);
96 static int filt_memorystatus(struct knote *kn, long hint);
97 static int filt_memorystatustouch(struct knote *kn, struct kevent_qos_s *kev);
98 static int filt_memorystatusprocess(struct knote *kn, struct kevent_qos_s *kev);
99
/*
 * Filter operations backing EVFILT_MEMORYSTATUS kqueue events.
 * Placed in read-only memory early in boot (SECURITY_READ_ONLY_EARLY)
 * so the function-pointer table cannot be tampered with at runtime.
 */
SECURITY_READ_ONLY_EARLY(struct filterops) memorystatus_filtops = {
	.f_attach = filt_memorystatusattach,
	.f_detach = filt_memorystatusdetach,
	.f_event = filt_memorystatus,
	.f_touch = filt_memorystatustouch,
	.f_process = filt_memorystatusprocess,
};
107
108 /*
109 * Memorystatus notification events
110 */
/*
 * Hint values posted via KNOTE() on memorystatus_klist; filt_memorystatus()
 * translates each into the NOTE_MEMORYSTATUS_* fflags a knote registered for.
 */
enum {
	kMemorystatusNoPressure = 0x1,        /* pressure returned to normal */
	kMemorystatusPressure = 0x2,          /* warn or critical, per memorystatus_vm_pressure_level */
	kMemorystatusLowSwap = 0x4,           /* swap space is running low */
	kMemorystatusProcLimitWarn = 0x8,     /* a process is approaching its memory limit */
	kMemorystatusProcLimitCritical = 0x10 /* a process has exceeded its (soft) memory limit */
};
118
119 #define INTER_NOTIFICATION_DELAY (250000) /* .25 second */
120 #define VM_PRESSURE_DECREASED_SMOOTHING_PERIOD 5000 /* milliseconds */
121 #define WARNING_NOTIFICATION_RESTING_PERIOD 25 /* seconds */
122 #define CRITICAL_NOTIFICATION_RESTING_PERIOD 25 /* seconds */
123
124 /*
125 * Memorystatus notification helper routines
126 */
127 static vm_pressure_level_t convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);
128 static boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t);
129 static void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear);
130 static struct knote *vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process, uint64_t *next_telemetry_update);
131 static void vm_dispatch_memory_pressure(void);
132 kern_return_t memorystatus_update_vm_pressure(boolean_t target_foreground_process);
133
134 #if VM_PRESSURE_EVENTS
135
136 /*
137 * This value is the threshold that a process must meet to be considered for scavenging.
138 */
139 #if XNU_TARGET_OS_OSX
140 #define VM_PRESSURE_MINIMUM_RSIZE 10 /* MB */
141 #else /* XNU_TARGET_OS_OSX */
142 #define VM_PRESSURE_MINIMUM_RSIZE 6 /* MB */
143 #endif /* XNU_TARGET_OS_OSX */
144
145 static uint32_t vm_pressure_task_footprint_min = VM_PRESSURE_MINIMUM_RSIZE;
146
147 #if DEVELOPMENT || DEBUG
148 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_task_footprint_min, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_task_footprint_min, 0, "");
149 #endif /* DEVELOPMENT || DEBUG */
150
151 vm_pressure_level_t memorystatus_vm_pressure_level = kVMPressureNormal;
152
153 /*
154 * We use this flag to signal if we have any HWM offenders
155 * on the system. This way we can reduce the number of wakeups
156 * of the memorystatus_thread when the system is between the
157 * "pressure" and "critical" threshold.
158 *
159 * The (re-)setting of this variable is done without any locks
160 * or synchronization simply because it is not possible (currently)
161 * to keep track of HWM offenders that drop down below their memory
162 * limit and/or exit. So, we choose to burn a couple of wasted wakeups
163 * by allowing the unguarded modification of this variable.
164 *
165 * TODO: this should be a count of number of hwm candidates
166 */
167 _Atomic bool memorystatus_hwm_candidates = false;
168
169 #endif /* VM_PRESSURE_EVENTS */
170
171 uint32_t memorystatus_jetsam_fg_band_waiters = 0;
172 uint32_t memorystatus_jetsam_bg_band_waiters = 0;
173 static uint64_t memorystatus_jetsam_fg_band_timestamp_ns = 0; /* nanosec */
174 static uint64_t memorystatus_jetsam_bg_band_timestamp_ns = 0; /* nanosec */
175 static uint64_t memorystatus_jetsam_notification_delay_ns = 5ull * 1000 * 1000 * 1000; /* nanosec */
176
177 #if DEVELOPMENT || DEBUG
178 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_jetsam_notification_delay_ns, CTLFLAG_RW | CTLFLAG_LOCKED,
179 &memorystatus_jetsam_notification_delay_ns, "");
180 #endif
181
182 static int
filt_memorystatusattach(struct knote * kn,__unused struct kevent_qos_s * kev)183 filt_memorystatusattach(struct knote *kn, __unused struct kevent_qos_s *kev)
184 {
185 int error;
186
187 kn->kn_flags |= EV_CLEAR; /* automatically set */
188 kn->kn_sdata = 0; /* incoming data is ignored */
189 memset(&kn->kn_ext, 0, sizeof(kn->kn_ext));
190
191 error = memorystatus_knote_register(kn);
192 if (error) {
193 knote_set_error(kn, error);
194 }
195 return 0;
196 }
197
/*
 * EVFILT_MEMORYSTATUS detach: remove the knote from the global memorystatus
 * klist (memorystatus_knote_unregister takes the klist lock internally).
 */
static void
filt_memorystatusdetach(struct knote *kn)
{
	memorystatus_knote_unregister(kn);
}
203
204 static int
filt_memorystatus(struct knote * kn __unused,long hint)205 filt_memorystatus(struct knote *kn __unused, long hint)
206 {
207 if (hint) {
208 switch (hint) {
209 case kMemorystatusNoPressure:
210 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
211 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
212 }
213 break;
214 case kMemorystatusPressure:
215 if (memorystatus_vm_pressure_level == kVMPressureWarning || memorystatus_vm_pressure_level == kVMPressureUrgent) {
216 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
217 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
218 }
219 } else if (memorystatus_vm_pressure_level == kVMPressureCritical) {
220 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
221 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
222 }
223 }
224 break;
225 case kMemorystatusLowSwap:
226 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_LOW_SWAP) {
227 kn->kn_fflags = NOTE_MEMORYSTATUS_LOW_SWAP;
228 }
229 break;
230
231 case kMemorystatusProcLimitWarn:
232 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
233 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
234 }
235 break;
236
237 case kMemorystatusProcLimitCritical:
238 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
239 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
240 }
241 break;
242
243 default:
244 break;
245 }
246 }
247
248 #if 0
249 if (kn->kn_fflags != 0) {
250 proc_t knote_proc = knote_get_kq(kn)->kq_p;
251 pid_t knote_pid = proc_getpid(knote_proc);
252
253 printf("filt_memorystatus: sending kn 0x%lx (event 0x%x) for pid (%d)\n",
254 (unsigned long)kn, kn->kn_fflags, knote_pid);
255 }
256 #endif
257
258 return kn->kn_fflags != 0;
259 }
260
261 static int
filt_memorystatustouch(struct knote * kn,struct kevent_qos_s * kev)262 filt_memorystatustouch(struct knote *kn, struct kevent_qos_s *kev)
263 {
264 int res;
265 int prev_kn_sfflags = 0;
266
267 memorystatus_klist_lock();
268
269 /*
270 * copy in new kevent settings
271 * (saving the "desired" data and fflags).
272 */
273
274 prev_kn_sfflags = kn->kn_sfflags;
275 kn->kn_sfflags = (kev->fflags & EVFILT_MEMORYSTATUS_ALL_MASK);
276
277 #if XNU_TARGET_OS_OSX
278 /*
279 * Only on desktop do we restrict notifications to
280 * one per active/inactive state (soft limits only).
281 */
282 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
283 /*
284 * Is there previous state to preserve?
285 */
286 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
287 /*
288 * This knote was previously interested in proc_limit_warn,
289 * so yes, preserve previous state.
290 */
291 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
292 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
293 }
294 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
295 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
296 }
297 } else {
298 /*
299 * This knote was not previously interested in proc_limit_warn,
300 * but it is now. Set both states.
301 */
302 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
303 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
304 }
305 }
306
307 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
308 /*
309 * Is there previous state to preserve?
310 */
311 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
312 /*
313 * This knote was previously interested in proc_limit_critical,
314 * so yes, preserve previous state.
315 */
316 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
317 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
318 }
319 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
320 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
321 }
322 } else {
323 /*
324 * This knote was not previously interested in proc_limit_critical,
325 * but it is now. Set both states.
326 */
327 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
328 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
329 }
330 }
331 #endif /* XNU_TARGET_OS_OSX */
332
333 /*
334 * reset the output flags based on a
335 * combination of the old events and
336 * the new desired event list.
337 */
338 //kn->kn_fflags &= kn->kn_sfflags;
339
340 res = (kn->kn_fflags != 0);
341
342 memorystatus_klist_unlock();
343
344 return res;
345 }
346
347 static int
filt_memorystatusprocess(struct knote * kn,struct kevent_qos_s * kev)348 filt_memorystatusprocess(struct knote *kn, struct kevent_qos_s *kev)
349 {
350 int res = 0;
351
352 memorystatus_klist_lock();
353 if (kn->kn_fflags) {
354 knote_fill_kevent(kn, kev, 0);
355 res = 1;
356 }
357 memorystatus_klist_unlock();
358
359 return res;
360 }
361
/* Serialize access to memorystatus_klist and per-knote flag state. */
static void
memorystatus_klist_lock(void)
{
	lck_mtx_lock(&memorystatus_klist_mutex);
}
367
/* Release the memorystatus klist mutex taken by memorystatus_klist_lock(). */
static void
memorystatus_klist_unlock(void)
{
	lck_mtx_unlock(&memorystatus_klist_mutex);
}
373
/*
 * One-time setup of the memorystatus klist and its mutex, using the
 * caller-supplied lock group and attributes.
 */
void
memorystatus_kevent_init(lck_grp_t *grp, lck_attr_t *attr)
{
	lck_mtx_init(&memorystatus_klist_mutex, grp, attr);
	klist_init(&memorystatus_klist);
}
380
381 int
memorystatus_knote_register(struct knote * kn)382 memorystatus_knote_register(struct knote *kn)
383 {
384 int error = 0;
385
386 memorystatus_klist_lock();
387
388 /*
389 * Support only userspace visible flags.
390 */
391 if ((kn->kn_sfflags & EVFILT_MEMORYSTATUS_ALL_MASK) == (unsigned int) kn->kn_sfflags) {
392 #if XNU_TARGET_OS_OSX
393 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
394 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
395 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
396 }
397
398 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
399 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
400 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
401 }
402 #endif /* XNU_TARGET_OS_OSX */
403
404 KNOTE_ATTACH(&memorystatus_klist, kn);
405 } else {
406 error = ENOTSUP;
407 }
408
409 memorystatus_klist_unlock();
410
411 return error;
412 }
413
414 void
memorystatus_knote_unregister(struct knote * kn __unused)415 memorystatus_knote_unregister(struct knote *kn __unused)
416 {
417 memorystatus_klist_lock();
418 KNOTE_DETACH(&memorystatus_klist, kn);
419 memorystatus_klist_unlock();
420 }
421
422 #if VM_PRESSURE_EVENTS
423
424 #if CONFIG_JETSAM
425
426 static thread_call_t sustained_pressure_handler_thread_call;
427 int memorystatus_should_kill_on_sustained_pressure = 1;
428 /* Count the number of sustained pressure kills we've done since boot. */
429 uint64_t memorystatus_kill_on_sustained_pressure_count = 0;
430 uint64_t memorystatus_kill_on_sustained_pressure_window_s = 60 * 10; /* 10 Minutes */
431 uint64_t memorystatus_kill_on_sustained_pressure_delay_ms = 500; /* .5 seconds */
432
433 #if DEVELOPMENT || DEBUG
434 SYSCTL_INT(_kern, OID_AUTO, memorystatus_should_kill_on_sustained_pressure, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_should_kill_on_sustained_pressure, 0, "");
435 #endif /* DEVELOPMENT || DEBUG */
436 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_count, "");
437 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_window_s, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_window_s, "");
438 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_delay_ms, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_delay_ms, "");
439
440 static void sustained_pressure_handler(void*, void*);
441 #endif /* CONFIG_JETSAM */
442 static thread_call_t memorystatus_notify_update_telemetry_thread_call;
443 static void update_footprints_for_telemetry(void*, void*);
444
445
446 void
memorystatus_notify_init()447 memorystatus_notify_init()
448 {
449 #if CONFIG_JETSAM
450 sustained_pressure_handler_thread_call = thread_call_allocate_with_options(sustained_pressure_handler, NULL, THREAD_CALL_PRIORITY_KERNEL_HIGH, THREAD_CALL_OPTIONS_ONCE);
451 #endif /* CONFIG_JETSAM */
452 memorystatus_notify_update_telemetry_thread_call = thread_call_allocate_with_options(update_footprints_for_telemetry, NULL, THREAD_CALL_PRIORITY_USER, THREAD_CALL_OPTIONS_ONCE);
453 }
454
455 #if CONFIG_MEMORYSTATUS
456
457 inline int
memorystatus_send_note(int event_code,void * data,uint32_t data_length)458 memorystatus_send_note(int event_code, void *data, uint32_t data_length)
459 {
460 int ret;
461 struct kev_msg ev_msg;
462
463 ev_msg.vendor_code = KEV_VENDOR_APPLE;
464 ev_msg.kev_class = KEV_SYSTEM_CLASS;
465 ev_msg.kev_subclass = KEV_MEMORYSTATUS_SUBCLASS;
466
467 ev_msg.event_code = event_code;
468
469 ev_msg.dv[0].data_length = data_length;
470 ev_msg.dv[0].data_ptr = data;
471 ev_msg.dv[1].data_length = 0;
472
473 ret = kev_post_msg(&ev_msg);
474 if (ret) {
475 memorystatus_log_error("%s: kev_post_msg() failed, err %d\n", __func__, ret);
476 }
477
478 return ret;
479 }
480
/*
 * Deliver a per-process memory-limit notification to `p`, which must be the
 * current process (asserted below; no reference is taken and p is not locked).
 *
 * The notification type depends on platform and limit state:
 *  - macOS-family processes get PROC_LIMIT_WARN when approaching the limit
 *    and PROC_LIMIT_CRITICAL when a soft limit is exceeded, each one-shot
 *    per active/inactive state for soft limits.
 *  - Other (embedded) processes get PROC_LIMIT_WARN or a system pressure
 *    note when approaching, and PROC_LIMIT_CRITICAL when exceeded.
 *
 * Returns TRUE if at least one matching knote was found for the process.
 *
 * NOTE(review): is_active and is_fatal are annotated __unused but are read
 * in the macOS branch below — the annotations appear stale; confirm before
 * relying on them.
 */
boolean_t
memorystatus_warn_process(const proc_t p, __unused boolean_t is_active, __unused boolean_t is_fatal, boolean_t limit_exceeded)
{
	/*
	 * This function doesn't take a reference to p or lock it. So it better be the current process.
	 */
	assert(p == current_proc());
	pid_t pid = proc_getpid(p);
	boolean_t ret = FALSE;
	boolean_t found_knote = FALSE;
	struct knote *kn = NULL;
	int send_knote_count = 0;
	uint32_t platform;
	platform = proc_platform(p);

	/*
	 * See comment in sysctl_memorystatus_vm_pressure_send.
	 */

	memorystatus_klist_lock();

	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
		proc_t knote_proc = knote_get_kq(kn)->kq_p;
		pid_t knote_pid = proc_getpid(knote_proc);

		if (knote_pid == pid) {
			/*
			 * By setting the "fflags" here, we are forcing
			 * a process to deal with the case where it's
			 * bumping up into its memory limits. If we don't
			 * do this here, we will end up depending on the
			 * system pressure snapshot evaluation in
			 * filt_memorystatus().
			 */

			/*
			 * The type of notification and the frequency are different between
			 * embedded and desktop.
			 *
			 * Embedded processes register for global pressure notifications
			 * (NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) via UIKit
			 * (see applicationDidReceiveMemoryWarning in UIKit). We'll warn them here if
			 * they are near their memory limit. filt_memorystatus() will warn them based
			 * on the system pressure level.
			 *
			 * On desktop, (NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL)
			 * are only expected to fire for system level warnings. Desktop processes
			 * register for NOTE_MEMORYSTATUS_PROC_LIMIT_WARN
			 * if they want to be warned when they approach their limit
			 * and for NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL to be warned when they
			 * exceed their limit.
			 *
			 * On embedded we continuously warn processes that are approaching their
			 * memory limit. However on desktop, we only send one warning while
			 * the process is active/inactive if the limit is soft.
			 */
			if (platform == PLATFORM_MACOS || platform == PLATFORM_MACCATALYST || platform == PLATFORM_DRIVERKIT) {
				if (!limit_exceeded) {
					if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
						found_knote = TRUE;
						if (!is_fatal) {
							/*
							 * Restrict proc_limit_warn notifications when
							 * non-fatal (soft) limit is at play.
							 */
							if (is_active) {
								if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
									/*
									 * Mark this knote for delivery.
									 */
									kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
									/*
									 * And suppress it from future notifications.
									 */
									kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
									send_knote_count++;
								}
							} else {
								if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
									/*
									 * Mark this knote for delivery.
									 */
									kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
									/*
									 * And suppress it from future notifications.
									 */
									kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
									send_knote_count++;
								}
							}
						} else {
							/*
							 * No restriction on proc_limit_warn notifications when
							 * fatal (hard) limit is at play.
							 */
							kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
							send_knote_count++;
						}
					}
				} else {
					/*
					 * Send this notification when a process has exceeded a soft limit,
					 */

					if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
						found_knote = TRUE;
						if (!is_fatal) {
							/*
							 * Restrict critical notifications for soft limits.
							 */

							if (is_active) {
								if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
									/*
									 * Suppress future proc_limit_critical notifications
									 * for the active soft limit.
									 */
									kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
									kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
									send_knote_count++;
								}
							} else {
								if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
									/*
									 * Suppress future proc_limit_critical_notifications
									 * for the inactive soft limit.
									 */
									kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
									kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
									send_knote_count++;
								}
							}
						} else {
							/*
							 * We should never be trying to send a critical notification for
							 * a hard limit... the process would be killed before it could be
							 * received.
							 */
							panic("Caught sending pid %d a critical warning for a fatal limit.", pid);
						}
					}
				}
			} else {
				if (!limit_exceeded) {
					/*
					 * Intentionally set either the unambiguous limit warning,
					 * the system-wide critical or the system-wide warning
					 * notification bit.
					 */

					if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
						kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
						found_knote = TRUE;
						send_knote_count++;
					} else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
						kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
						found_knote = TRUE;
						send_knote_count++;
					} else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
						kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
						found_knote = TRUE;
						send_knote_count++;
					}
				} else {
					/*
					 * Send this notification when a process has exceeded a soft limit.
					 */
					if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
						kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
						found_knote = TRUE;
						send_knote_count++;
					}
				}
			}
		}
	}

	if (found_knote) {
		if (send_knote_count > 0) {
			KNOTE(&memorystatus_klist, 0);
		}
		ret = TRUE;
	}

	memorystatus_klist_unlock();

	return ret;
}
670
671 /*
672 * Can only be set by the current task on itself.
673 */
674 int
memorystatus_low_mem_privileged_listener(uint32_t op_flags)675 memorystatus_low_mem_privileged_listener(uint32_t op_flags)
676 {
677 boolean_t set_privilege = FALSE;
678 /*
679 * Need an entitlement check here?
680 */
681 if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE) {
682 set_privilege = TRUE;
683 } else if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE) {
684 set_privilege = FALSE;
685 } else {
686 return EINVAL;
687 }
688
689 return task_low_mem_privileged_listener(current_task(), set_privilege, NULL);
690 }
691
/*
 * Post a kMemorystatusPressureNote kernel event carrying the given pid.
 * Returns the result of memorystatus_send_note().
 */
int
memorystatus_send_pressure_note(pid_t pid)
{
	memorystatus_log_debug("memorystatus_send_pressure_note(): pid %d\n", pid);
	return memorystatus_send_note(kMemorystatusPressureNote, &pid, sizeof(pid));
}
698
699 boolean_t
memorystatus_is_foreground_locked(proc_t p)700 memorystatus_is_foreground_locked(proc_t p)
701 {
702 return (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND) ||
703 (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND_SUPPORT);
704 }
705
706 /*
707 * This is meant for stackshot and kperf -- it does not take the proc_list_lock
708 * to access the p_memstat_dirty field.
709 */
710 void
memorystatus_proc_flags_unsafe(void * v,boolean_t * is_dirty,boolean_t * is_dirty_tracked,boolean_t * allow_idle_exit)711 memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit)
712 {
713 if (!v) {
714 *is_dirty = FALSE;
715 *is_dirty_tracked = FALSE;
716 *allow_idle_exit = FALSE;
717 } else {
718 proc_t p = (proc_t)v;
719 *is_dirty = (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) != 0;
720 *is_dirty_tracked = (p->p_memstat_dirty & P_DIRTY_TRACK) != 0;
721 *allow_idle_exit = (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) != 0;
722 }
723 }
724
725 boolean_t
memorystatus_bg_pressure_eligible(proc_t p)726 memorystatus_bg_pressure_eligible(proc_t p)
727 {
728 boolean_t eligible = FALSE;
729
730 proc_list_lock();
731
732 memorystatus_log_debug("memorystatus_bg_pressure_eligible: pid %d, state 0x%x\n", proc_getpid(p), p->p_memstat_state);
733
734 /* Foreground processes have already been dealt with at this point, so just test for eligibility */
735 if (!(p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN))) {
736 eligible = TRUE;
737 }
738
739 if (p->p_memstat_effectivepriority < JETSAM_PRIORITY_BACKGROUND_OPPORTUNISTIC) {
740 /*
741 * IDLE and IDLE_DEFERRED bands contain processes
742 * that have dropped memory to be under their inactive
743 * memory limits. And so they can't really give back
744 * anything.
745 */
746 eligible = FALSE;
747 }
748
749 proc_list_unlock();
750
751 return eligible;
752 }
753
754 void
memorystatus_send_low_swap_note(void)755 memorystatus_send_low_swap_note(void)
756 {
757 struct knote *kn = NULL;
758
759 memorystatus_klist_lock();
760 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
761 /* We call is_knote_registered_modify_task_pressure_bits to check if the sfflags for the
762 * current note contain NOTE_MEMORYSTATUS_LOW_SWAP. Once we find one note in the memorystatus_klist
763 * that has the NOTE_MEMORYSTATUS_LOW_SWAP flags in its sfflags set, we call KNOTE with
764 * kMemoryStatusLowSwap as the hint to process and update all knotes on the memorystatus_klist accordingly. */
765 if (is_knote_registered_modify_task_pressure_bits(kn, NOTE_MEMORYSTATUS_LOW_SWAP, NULL, 0, 0) == TRUE) {
766 KNOTE(&memorystatus_klist, kMemorystatusLowSwap);
767 break;
768 }
769 }
770
771 memorystatus_klist_unlock();
772 }
773
774 #endif /* CONFIG_MEMORYSTATUS */
775
776 /*
777 * Notification telemetry
778 */
779 CA_EVENT(memorystatus_pressure_interval,
780 CA_INT, num_processes_registered,
781 CA_INT, num_notifications_sent,
782 CA_INT, max_level,
783 CA_INT, num_transitions,
784 CA_INT, num_kills,
785 CA_INT, duration);
786 static CA_EVENT_TYPE(memorystatus_pressure_interval) memorystatus_pressure_interval_telemetry;
787
788 CA_EVENT(memorystatus_proc_notification,
789 CA_INT, footprint_before_notification,
790 CA_INT, footprint_1_min_after_first_warning,
791 CA_INT, footprint_5_min_after_first_warning,
792 CA_INT, footprint_20_min_after_first_warning,
793 CA_INT, footprint_1_min_after_first_critical,
794 CA_INT, footprint_5_min_after_first_critical,
795 CA_INT, footprint_20_min_after_first_critical,
796 CA_INT, order_within_list,
797 CA_INT, num_notifications_sent,
798 CA_INT, time_between_warning_and_critical,
799 CA_STATIC_STRING(CA_PROCNAME_LEN), proc_name);
800
801 /* The send timestamps for the first notifications are stored in the knote's kn_sdata field */
802 #define KNOTE_SEND_TIMESTAMP_WARNING_INDEX 0
803 #define KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX 1
804
805 /* The footprint history for this task is stored in the knote's kn_ext array. */
/*
 * Per-knote footprint telemetry, overlaid onto the knote's kn_ext array
 * (hence packed and size-checked below). Footprint samples are captured at
 * fixed intervals after the first warning/critical notification by
 * update_knote_footprint_history().
 */
struct knote_footprint_history {
	uint32_t kfh_starting_footprint;          /* footprint when the first notification was sent */
	uint32_t kfh_footprint_after_warn_1;      /* 1 minute after first warning notification */
	uint32_t kfh_footprint_after_warn_5;      /* 5 minutes after first warning notification */
	uint32_t kfh_footprint_after_warn_20;     /* 20 minutes after first warning notification */
	uint32_t kfh_footprint_after_critical_1;  /* 1 minute after first critical notification */
	uint32_t kfh_footprint_after_critical_5;  /* 5 minutes after first critical notification */
	uint32_t kfh_footprint_after_critical_20; /* 20 minutes after first critical notification */
	uint16_t kfh_num_notifications;           /* running count of notifications sent to this knote */
	uint16_t kfh_notification_order;          /* position in the notification list at first send */
} __attribute__((packed));


static_assert(sizeof(struct knote_footprint_history) <= sizeof(uint64_t) * 4, "footprint history fits in knote extensions");
820
821 static void
mark_knote_send_time(struct knote * kn,task_t task,int knote_pressure_level,uint16_t order_within_list)822 mark_knote_send_time(struct knote *kn, task_t task, int knote_pressure_level, uint16_t order_within_list)
823 {
824 uint32_t *timestamps;
825 uint32_t index;
826 uint64_t curr_ts, curr_ts_seconds;
827 struct knote_footprint_history *footprint_history = (struct knote_footprint_history *)kn->kn_ext;
828 if (knote_pressure_level != NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
829 timestamps = (uint32_t *)&(kn->kn_sdata);
830 index = knote_pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN ?
831 KNOTE_SEND_TIMESTAMP_WARNING_INDEX : KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX;
832 if (timestamps[index] == 0) {
833 /* First notification for this level since pressure elevated from normal. */
834 curr_ts = mach_absolute_time();
835 curr_ts_seconds = 0;
836 absolutetime_to_nanoseconds(curr_ts, &curr_ts_seconds);
837 curr_ts_seconds /= NSEC_PER_SEC;
838
839 timestamps[index] = (uint32_t)MIN(UINT32_MAX, curr_ts_seconds);
840
841 /* Record task initial footprint */
842 if (timestamps[index == KNOTE_SEND_TIMESTAMP_WARNING_INDEX ? KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX : KNOTE_SEND_TIMESTAMP_WARNING_INDEX] == 0) {
843 /*
844 * First notification at any level since pressure elevated from normal.
845 * Record the footprint and our order in the notification list.
846 */
847 footprint_history->kfh_starting_footprint = (uint32_t) MIN(UINT32_MAX, get_task_phys_footprint(task) / (2UL << 20));
848 footprint_history->kfh_notification_order = order_within_list;
849 }
850 }
851 }
852 footprint_history->kfh_num_notifications++;
853 }
854
855 /*
856 * Records the current footprint for this task in the knote telemetry.
857 *
858 * Returns the soonest absolutetime when this footprint history should be updated again.
859 */
860 static uint64_t
update_knote_footprint_history(struct knote * kn,task_t task,uint64_t curr_ts)861 update_knote_footprint_history(struct knote *kn, task_t task, uint64_t curr_ts)
862 {
863 uint32_t *timestamps = (uint32_t *)&(kn->kn_sdata);
864 struct knote_footprint_history *footprint_history = (struct knote_footprint_history *)kn->kn_ext;
865 uint64_t warning_send_time, critical_send_time, minutes_since_warning = UINT64_MAX, minutes_since_critical = UINT64_MAX;
866 warning_send_time = timestamps[KNOTE_SEND_TIMESTAMP_WARNING_INDEX];
867 critical_send_time = timestamps[KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX];
868 uint32_t task_phys_footprint_mb = (uint32_t) MIN(UINT32_MAX, get_task_phys_footprint(task) / (2UL << 20));
869 uint64_t next_run = UINT64_MAX, absolutetime_in_minute = 0, minutes_since_last_notification = 0, curr_ts_s;
870 absolutetime_to_nanoseconds(curr_ts, &curr_ts_s);
871 nanoseconds_to_absolutetime(60 * NSEC_PER_SEC, &absolutetime_in_minute);
872 curr_ts_s /= NSEC_PER_SEC;
873
874 if (warning_send_time != 0) {
875 /* This task received a warning notification. */
876 minutes_since_warning = (curr_ts_s - warning_send_time) / 60;
877 if (footprint_history->kfh_footprint_after_warn_1 == 0 && minutes_since_warning >= 1) {
878 footprint_history->kfh_footprint_after_warn_1 = task_phys_footprint_mb;
879 }
880 if (footprint_history->kfh_footprint_after_warn_5 == 0 && minutes_since_warning >= 5) {
881 footprint_history->kfh_footprint_after_warn_5 = task_phys_footprint_mb;
882 }
883 if (footprint_history->kfh_footprint_after_warn_20 == 0 && minutes_since_warning >= 20) {
884 footprint_history->kfh_footprint_after_warn_20 = task_phys_footprint_mb;
885 }
886 }
887 if (critical_send_time != 0) {
888 /* This task received a critical notification. */
889 minutes_since_critical = (curr_ts_s - critical_send_time) / 60;
890 if (footprint_history->kfh_footprint_after_critical_1 == 0 && minutes_since_critical >= 1) {
891 footprint_history->kfh_footprint_after_critical_1 = task_phys_footprint_mb;
892 }
893 if (footprint_history->kfh_footprint_after_critical_5 == 0 && minutes_since_critical >= 5) {
894 footprint_history->kfh_footprint_after_critical_5 = task_phys_footprint_mb;
895 }
896 if (footprint_history->kfh_footprint_after_critical_20 == 0 && minutes_since_critical >= 20) {
897 footprint_history->kfh_footprint_after_critical_20 = task_phys_footprint_mb;
898 }
899 }
900
901 minutes_since_last_notification = MIN(minutes_since_warning, minutes_since_critical);
902 if (minutes_since_last_notification < 20) {
903 if (minutes_since_last_notification < 5) {
904 if (minutes_since_last_notification < 1) {
905 next_run = curr_ts + absolutetime_in_minute;
906 } else {
907 next_run = curr_ts + (absolutetime_in_minute * 5);
908 }
909 } else {
910 next_run = curr_ts + (absolutetime_in_minute * 20);
911 }
912 }
913
914 return next_run;
915 }
916
917 extern char *proc_name_address(void *p);
918 /*
919 * Attempt to send the given level telemetry event.
920 * Finalizes the duration.
921 * Clears the src_event struct.
922 */
923 static void
memorystatus_pressure_interval_send(CA_EVENT_TYPE (memorystatus_pressure_interval)* src_event)924 memorystatus_pressure_interval_send(CA_EVENT_TYPE(memorystatus_pressure_interval) *src_event)
925 {
926 uint64_t duration_nanoseconds = 0;
927 uint64_t curr_ts = mach_absolute_time();
928 src_event->duration = curr_ts - src_event->duration;
929 absolutetime_to_nanoseconds(src_event->duration, &duration_nanoseconds);
930 src_event->duration = (int64_t) (duration_nanoseconds / NSEC_PER_SEC);
931
932 /*
933 * Drop the event rather than block for memory. We should be in a normal pressure level now,
934 * but we don't want to end up blocked in page_wait if there's a sudden spike in pressure.
935 */
936 ca_event_t event_wrapper = CA_EVENT_ALLOCATE_FLAGS(memorystatus_pressure_interval, Z_NOWAIT);
937 if (event_wrapper) {
938 memcpy(event_wrapper->data, src_event, sizeof(CA_EVENT_TYPE(memorystatus_pressure_interval)));
939 CA_EVENT_SEND(event_wrapper);
940 }
941 src_event->num_processes_registered = 0;
942 src_event->num_notifications_sent = 0;
943 src_event->max_level = 0;
944 src_event->num_transitions = 0;
945 src_event->num_kills = 0;
946 src_event->duration = 0;
947 }
948
949
950 /*
951 * Attempt to send the per-proc telemetry events.
952 * Clears the footprint histories on the knotes.
953 */
static void
memorystatus_pressure_proc_telemetry_send(void)
{
	struct knote *kn = NULL;
	memorystatus_klist_lock();
	/* Walk every registered listener; holding the klist lock keeps knotes stable. */
	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
		proc_t p = PROC_NULL;
		/* Per-knote telemetry lives in kn_ext; send timestamps are packed in kn_sdata. */
		struct knote_footprint_history *footprint_history = (struct knote_footprint_history *)kn->kn_ext;
		uint32_t *timestamps = (uint32_t *)&(kn->kn_sdata);
		uint32_t warning_send_time = timestamps[KNOTE_SEND_TIMESTAMP_WARNING_INDEX];
		uint32_t critical_send_time = timestamps[KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX];
		CA_EVENT_TYPE(memorystatus_proc_notification) * event = NULL;
		/* Only procs that actually received a notification this interval get an event. */
		if (warning_send_time != 0 || critical_send_time != 0) {
			/*
			 * Drop the event rather than block for memory. We should be in a normal pressure level now,
			 * but we don't want to end up blocked in page_wait if there's a sudden spike in pressure.
			 */
			ca_event_t event_wrapper = CA_EVENT_ALLOCATE_FLAGS(memorystatus_proc_notification, Z_NOWAIT | Z_ZERO);
			if (event_wrapper) {
				event = event_wrapper->data;

				/* Copy the sampled footprint history into the CoreAnalytics payload. */
				event->footprint_before_notification = footprint_history->kfh_starting_footprint;
				event->footprint_1_min_after_first_warning = footprint_history->kfh_footprint_after_warn_1;
				event->footprint_5_min_after_first_warning = footprint_history->kfh_footprint_after_warn_5;
				event->footprint_20_min_after_first_warning = footprint_history->kfh_footprint_after_warn_20;
				event->footprint_1_min_after_first_critical = footprint_history->kfh_footprint_after_critical_1;
				event->footprint_5_min_after_first_critical = footprint_history->kfh_footprint_after_critical_5;
				event->footprint_20_min_after_first_critical = footprint_history->kfh_footprint_after_critical_20;
				event->num_notifications_sent = footprint_history->kfh_num_notifications;
				if (warning_send_time != 0 && critical_send_time != 0) {
					event->time_between_warning_and_critical = (critical_send_time - warning_send_time) / 60; // Minutes
				}
				event->order_within_list = footprint_history->kfh_notification_order;

				/* Need a proc ref to safely read the process name. */
				p = proc_ref(knote_get_kq(kn)->kq_p, false);
				if (p == NULL) {
					/*
					 * Proc is exiting; free the wrapper and move on.
					 * NOTE(review): this path skips the history reset below —
					 * presumably fine since the knote is going away; confirm.
					 */
					CA_EVENT_DEALLOCATE(event_wrapper);
					continue;
				}
				strlcpy(event->proc_name, proc_name_address(p), sizeof(event->proc_name));

				proc_rele(p);
				CA_EVENT_SEND(event_wrapper);
			}
		}
		/* Clear this knote's telemetry so the next pressure interval starts fresh. */
		memset(footprint_history, 0, sizeof(*footprint_history));
		timestamps[KNOTE_SEND_TIMESTAMP_WARNING_INDEX] = 0;
		timestamps[KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX] = 0;
	}
	memorystatus_klist_unlock();
}
1005
1006 /*
1007 * Send all telemetry associated with the increased pressure interval.
1008 */
static void
memorystatus_pressure_telemetry_send(void)
{
	/* Both callees take the klist lock themselves, so it must not be held here. */
	LCK_MTX_ASSERT(&memorystatus_klist_mutex, LCK_MTX_ASSERT_NOTOWNED);
	/* Finalize and emit the interval-wide event first, then the per-proc events. */
	memorystatus_pressure_interval_send(&memorystatus_pressure_interval_telemetry);
	memorystatus_pressure_proc_telemetry_send();
}
1016
1017
1018 /*
1019 * kn_max - knote
1020 *
1021 * knote_pressure_level - to check if the knote is registered for this notification level.
1022 *
1023 * task - task whose bits we'll be modifying
1024 *
1025 * pressure_level_to_clear - if the task has been notified of this past level, clear that notification bit so that if/when we revert to that level, the task will be notified again.
1026 *
1027 * pressure_level_to_set - the task is about to be notified of this new level. Update the task's bit notification information appropriately.
1028 *
1029 */
1030
1031 static boolean_t
is_knote_registered_modify_task_pressure_bits(struct knote * kn_max,int knote_pressure_level,task_t task,vm_pressure_level_t pressure_level_to_clear,vm_pressure_level_t pressure_level_to_set)1032 is_knote_registered_modify_task_pressure_bits(struct knote *kn_max, int knote_pressure_level, task_t task, vm_pressure_level_t pressure_level_to_clear, vm_pressure_level_t pressure_level_to_set)
1033 {
1034 if (kn_max->kn_sfflags & knote_pressure_level) {
1035 if (pressure_level_to_clear && task_has_been_notified(task, pressure_level_to_clear) == TRUE) {
1036 task_clear_has_been_notified(task, pressure_level_to_clear);
1037 }
1038
1039 task_mark_has_been_notified(task, pressure_level_to_set);
1040 return TRUE;
1041 }
1042
1043 return FALSE;
1044 }
1045
1046 static void
memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear)1047 memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear)
1048 {
1049 struct knote *kn = NULL;
1050
1051 memorystatus_klist_lock();
1052
1053 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1054 proc_t p = knote_get_kq(kn)->kq_p;
1055
1056 if (p == proc_ref(p, false)) {
1057 task_clear_has_been_notified(proc_task(p), pressure_level_to_clear);
1058 proc_rele(p);
1059 }
1060 }
1061
1062 memorystatus_klist_unlock();
1063 }
1064
1065 /*
1066 * Used by the vm_pressure_thread which is
1067 * signalled from within vm_pageout_scan().
1068 */
1069
void
consider_vm_pressure_events(void)
{
	/* Entry point for the vm_pressure thread: fan out pressure notifications. */
	vm_dispatch_memory_pressure();
}
1075
static void
vm_dispatch_memory_pressure(void)
{
	/* FALSE: consider all eligible listeners, not just foreground processes. */
	memorystatus_update_vm_pressure(FALSE);
}
1081
/*
 * Pick the single best knote in candidate_list to notify for the given level.
 * Privileged listeners win immediately on increasing pressure; otherwise the
 * largest task at the most appropriate importance is chosen. Also refreshes
 * per-knote footprint telemetry and reports (via *next_telemetry_update) the
 * soonest absolutetime at which telemetry should be sampled again.
 * Called with the memorystatus klist lock held.
 */
static struct knote *
vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process, uint64_t *next_telemetry_update)
{
	struct knote *kn = NULL, *kn_max = NULL;
	uint64_t resident_max = 0; /* MB */
	int selected_task_importance = 0;
	/* Persists across calls so we can tell rising from falling pressure. */
	static int pressure_snapshot = -1;
	boolean_t pressure_increase = FALSE;
	uint64_t curr_ts = mach_absolute_time();
	*next_telemetry_update = UINT64_MAX;

	if (pressure_snapshot == -1) {
		/*
		 * Initial snapshot.
		 */
		pressure_snapshot = level;
		pressure_increase = TRUE;
	} else {
		if (level && (level >= pressure_snapshot)) {
			pressure_increase = TRUE;
		} else {
			pressure_increase = FALSE;
		}

		pressure_snapshot = level;
	}

	if (pressure_increase == TRUE) {
		/*
		 * We'll start by considering the largest
		 * unimportant task in our list.
		 */
		selected_task_importance = INT_MAX;
	} else {
		/*
		 * We'll start by considering the largest
		 * important task in our list.
		 */
		selected_task_importance = 0;
	}

	SLIST_FOREACH(kn, candidate_list, kn_selnext) {
		uint64_t resident_size = 0; /* MB */
		proc_t p = PROC_NULL;
		struct task* t = TASK_NULL;
		int curr_task_importance = 0;
		uint64_t telemetry_update = 0;
		boolean_t consider_knote = FALSE;
		boolean_t privileged_listener = FALSE;

		p = proc_ref(knote_get_kq(kn)->kq_p, false);
		if (p == PROC_NULL) {
			/* Proc is exiting; nothing to notify. */
			continue;
		}

#if CONFIG_MEMORYSTATUS
		if (target_foreground_process == TRUE && !memorystatus_is_foreground_locked(p)) {
			/*
			 * Skip process not marked foreground.
			 */
			proc_rele(p);
			continue;
		}
#endif /* CONFIG_MEMORYSTATUS */

		/* Refresh this knote's footprint telemetry while we hold the proc ref. */
		t = (struct task *)(proc_task(p));
		telemetry_update = update_knote_footprint_history(kn, t, curr_ts);
		*next_telemetry_update = MIN(*next_telemetry_update, telemetry_update);

		vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(level);

		if ((kn->kn_sfflags & dispatch_level) == 0) {
			/* Listener is not registered for this level. */
			proc_rele(p);
			continue;
		}

#if CONFIG_MEMORYSTATUS
		if (target_foreground_process == FALSE && !memorystatus_bg_pressure_eligible(p)) {
			VM_PRESSURE_DEBUG(1, "[vm_pressure] skipping process %d\n", proc_getpid(p));
			proc_rele(p);
			continue;
		}
#endif /* CONFIG_MEMORYSTATUS */

#if XNU_TARGET_OS_OSX
		curr_task_importance = task_importance_estimate(t);
#else /* XNU_TARGET_OS_OSX */
		curr_task_importance = p->p_memstat_effectivepriority;
#endif /* XNU_TARGET_OS_OSX */

		/*
		 * Privileged listeners are only considered in the multi-level pressure scheme
		 * AND only if the pressure is increasing.
		 */
		if (level > 0) {
			if (task_has_been_notified(t, level) == FALSE) {
				/*
				 * Is this a privileged listener?
				 */
				if (task_low_mem_privileged_listener(t, FALSE, &privileged_listener) == 0) {
					if (privileged_listener) {
						/* Privileged listeners short-circuit the whole scan. */
						kn_max = kn;
						proc_rele(p);
						goto done_scanning;
					}
				}
			} else {
				/* Already notified at this level; don't re-notify. */
				proc_rele(p);
				continue;
			}
		} else if (level == 0) {
			/*
			 * Task wasn't notified when the pressure was increasing and so
			 * no need to notify it that the pressure is decreasing.
			 */
			if ((task_has_been_notified(t, kVMPressureWarning) == FALSE) && (task_has_been_notified(t, kVMPressureCritical) == FALSE)) {
				proc_rele(p);
				continue;
			}
		}

		/*
		 * We don't want a small process to block large processes from
		 * being notified again. <rdar://problem/7955532>
		 */
		resident_size = (get_task_phys_footprint(t)) / (1024 * 1024ULL); /* MB */

		if (resident_size >= vm_pressure_task_footprint_min) {
			if (level > 0) {
				/*
				 * Warning or Critical Pressure.
				 */
				if (pressure_increase) {
					if ((curr_task_importance < selected_task_importance) ||
					    ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
						/*
						 * We have found a candidate process which is:
						 * a) at a lower importance than the current selected process
						 * OR
						 * b) has importance equal to that of the current selected process but is larger
						 */

						consider_knote = TRUE;
					}
				} else {
					if ((curr_task_importance > selected_task_importance) ||
					    ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
						/*
						 * We have found a candidate process which is:
						 * a) at a higher importance than the current selected process
						 * OR
						 * b) has importance equal to that of the current selected process but is larger
						 */

						consider_knote = TRUE;
					}
				}
			} else if (level == 0) {
				/*
				 * Pressure back to normal.
				 */
				if ((curr_task_importance > selected_task_importance) ||
				    ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
					consider_knote = TRUE;
				}
			}

			if (consider_knote) {
				resident_max = resident_size;
				kn_max = kn;
				selected_task_importance = curr_task_importance;
				consider_knote = FALSE; /* reset for the next candidate */
			}
		} else {
			/* There was no candidate with enough resident memory to scavenge */
			VM_PRESSURE_DEBUG(0, "[vm_pressure] threshold failed for pid %d with %llu resident...\n", proc_getpid(p), resident_size);
		}
		proc_rele(p);
	}

done_scanning:
	if (kn_max) {
		VM_DEBUG_CONSTANT_EVENT(vm_pressure_event, DBG_VM_PRESSURE_EVENT, DBG_FUNC_NONE, proc_getpid(knote_get_kq(kn_max)->kq_p), resident_max, 0, 0);
		VM_PRESSURE_DEBUG(1, "[vm_pressure] sending event to pid %d with %llu resident\n", proc_getpid(knote_get_kq(kn_max)->kq_p), resident_max);
	}

	return kn_max;
}
1270
1271 /*
1272 * To avoid notification storms in a system with sawtooth behavior of pressure levels eg:
1273 * Normal -> warning (notify clients) -> critical (notify) -> warning (notify) -> critical (notify) -> warning (notify)...
1274 *
1275 * We have 'resting' periods: WARNING_NOTIFICATION_RESTING_PERIOD and CRITICAL_NOTIFICATION_RESTING_PERIOD
1276 *
1277 * So it would look like:-
1278 * Normal -> warning (notify) -> critical (notify) -> warning (notify if it has been RestPeriod since last warning) -> critical (notify if it has been RestPeriod since last critical) -> ...
1279 *
1280 * That's what these 2 timestamps below signify.
1281 */
1282
/* Absolutetime before which no new warning/critical notification will be sent. */
uint64_t next_warning_notification_sent_at_ts = 0;
uint64_t next_critical_notification_sent_at_ts = 0;

/* Set via sysctl when exercising the notification scheme; skips delays/resting. */
boolean_t memorystatus_manual_testing_on = FALSE;
vm_pressure_level_t memorystatus_manual_testing_level = kVMPressureNormal;

/* Highest priority band eligible for sustained-pressure kills (idle by default). */
unsigned int memorystatus_sustained_pressure_maximum_band = JETSAM_PRIORITY_IDLE;
#if DEVELOPMENT || DEBUG
SYSCTL_INT(_kern, OID_AUTO, memorystatus_sustained_pressure_maximum_band, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_sustained_pressure_maximum_band, 0, "");
#endif /* DEVELOPMENT || DEBUG */
1293
1294 #if CONFIG_JETSAM
1295
1296 /*
1297 * TODO(jason): The memorystatus thread should be responsible for this
1298 * It can just check how long the pressure level has been at warning and the timestamp
1299 * of the last sustained pressure kill.
1300 */
1301 static void
sustained_pressure_handler(void * arg0 __unused,void * arg1 __unused)1302 sustained_pressure_handler(void* arg0 __unused, void* arg1 __unused)
1303 {
1304 int max_kills = 0, kill_count = 0;
1305 /*
1306 * Pressure has been elevated for too long.
1307 * We don't want to leave the system in this state as it can delay background
1308 * work indefinitely & drain battery.
1309 *
1310 * Try to return the system to normal via jetsam.
1311 * We'll run through the idle band up to 2 times.
1312 * If the pressure hasn't been relieved by then, the problem is memory
1313 * consumption in a higher band and this churn is probably doing more harm than good.
1314 */
1315 max_kills = memstat_get_proccnt_upto_priority(memorystatus_sustained_pressure_maximum_band) * 2;
1316 memorystatus_log("memorystatus: Pressure level has been elevated for too long. killing up to %d idle processes\n", max_kills);
1317 while (memorystatus_vm_pressure_level != kVMPressureNormal && kill_count < max_kills) {
1318 bool killed = memorystatus_kill_on_sustained_pressure();
1319 if (killed) {
1320 /*
1321 * Pause before our next kill & see if pressure reduces.
1322 */
1323 delay((int)(memorystatus_kill_on_sustained_pressure_delay_ms * NSEC_PER_MSEC / NSEC_PER_USEC));
1324 kill_count++;
1325 memorystatus_kill_on_sustained_pressure_count++;
1326 /* TODO(jason): Should use os_atomic but requires rdar://76310894. */
1327 memorystatus_pressure_interval_telemetry.num_kills++;
1328 } else {
1329 /* Nothing left to kill */
1330 break;
1331 }
1332 }
1333 if (memorystatus_vm_pressure_level != kVMPressureNormal) {
1334 memorystatus_log("memorystatus: Killed %d idle processes due to sustained pressure, but device didn't quiesce. Giving up.\n", kill_count);
1335 }
1336 }
1337
1338 #endif /* CONFIG_JETSAM */
1339
1340 /*
1341 * Returns the number of processes registered for notifications at this level.
1342 */
1343 static size_t
memorystatus_klist_length(int level)1344 memorystatus_klist_length(int level)
1345 {
1346 LCK_MTX_ASSERT(&memorystatus_klist_mutex, LCK_MTX_ASSERT_OWNED);
1347 struct knote *kn;
1348 size_t count = 0;
1349 int knote_pressure_level = convert_internal_pressure_level_to_dispatch_level(level);
1350 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1351 if (kn->kn_sfflags & knote_pressure_level) {
1352 count++;
1353 }
1354 }
1355 return count;
1356 }
1357
1358 /*
1359 * Updates the footprint telemetry for procs that have received notifications.
1360 */
1361 static void
update_footprints_for_telemetry(void * arg0 __unused,void * arg1 __unused)1362 update_footprints_for_telemetry(void* arg0 __unused, void* arg1 __unused)
1363 {
1364 uint64_t curr_ts = mach_absolute_time(), next_telemetry_update = UINT64_MAX;
1365 struct knote *kn;
1366
1367 memorystatus_klist_lock();
1368 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1369 proc_t p = PROC_NULL;
1370 struct task* t = TASK_NULL;
1371 uint64_t telemetry_update;
1372
1373 p = proc_ref(knote_get_kq(kn)->kq_p, false);
1374 if (p == PROC_NULL) {
1375 continue;
1376 }
1377 t = (struct task *)(proc_task(p));
1378 proc_rele(p);
1379 p = PROC_NULL;
1380 telemetry_update = update_knote_footprint_history(kn, t, curr_ts);
1381 next_telemetry_update = MIN(next_telemetry_update, telemetry_update);
1382 }
1383 memorystatus_klist_unlock();
1384 if (next_telemetry_update != UINT64_MAX) {
1385 uint64_t next_update_seconds;
1386 absolutetime_to_nanoseconds(next_telemetry_update, &next_update_seconds);
1387 next_update_seconds /= NSEC_PER_SEC;
1388 thread_call_enter_delayed(memorystatus_notify_update_telemetry_thread_call, next_telemetry_update);
1389 }
1390 }
1391
/*
 * Core notification loop: repeatedly selects the best candidate listener for
 * the current pressure level and fires its knote, honoring resting periods,
 * a smoothing window for decreasing pressure, and per-interval telemetry.
 * Returns KERN_FAILURE when no more candidates remain (normal exhaustion),
 * KERN_SUCCESS when interrupted by a resting period or manual-test break.
 */
kern_return_t
memorystatus_update_vm_pressure(boolean_t target_foreground_process)
{
	struct knote *kn_max = NULL;
	struct knote *kn_cur = NULL, *kn_temp = NULL;  /* for safe list traversal */
	pid_t target_pid = -1;
	struct klist dispatch_klist = { NULL };
	proc_t target_proc = PROC_NULL;
	struct task *task = NULL;
	boolean_t found_candidate = FALSE;

	/* Static: level state persists across invocations of this function. */
	static vm_pressure_level_t level_snapshot = kVMPressureNormal;
	static vm_pressure_level_t prev_level_snapshot = kVMPressureNormal;
	boolean_t smoothing_window_started = FALSE;
	struct timeval smoothing_window_start_tstamp = {0, 0};
	struct timeval curr_tstamp = {0, 0};
	int64_t elapsed_msecs = 0;
	uint64_t curr_ts = mach_absolute_time(), next_telemetry_update = UINT64_MAX;


	uint64_t logging_now;
	absolutetime_to_nanoseconds(curr_ts, &logging_now);
#if !CONFIG_JETSAM
#define MAX_IDLE_KILLS 100      /* limit the number of idle kills allowed */

	int idle_kill_counter = 0;

	/*
	 * On desktop we take this opportunity to free up memory pressure
	 * by immediately killing idle exitable processes. We use a delay
	 * to avoid overkill. And we impose a max counter as a fail safe
	 * in case daemons re-launch too fast.
	 *
	 * TODO: These jetsams should be performed on the memorystatus thread. We can
	 * provide the similar false-idle mitigation by skipping processes with med/high
	 * relaunch probability and/or using the sustained-pressure mechanism.
	 * (rdar://134075608)
	 */
	while ((memorystatus_vm_pressure_level != kVMPressureNormal) && (idle_kill_counter < MAX_IDLE_KILLS)) {
		if (!memstat_kill_idle_process(kMemorystatusKilledIdleExit, NULL)) {
			/* No idle exitable processes left to kill */
			break;
		}
		idle_kill_counter++;

		if (memorystatus_manual_testing_on == TRUE) {
			/*
			 * Skip the delay when testing
			 * the pressure notification scheme.
			 */
		} else {
			delay(1000000); /* 1 second */
		}
	}
#endif /* !CONFIG_JETSAM */

	if (level_snapshot != kVMPressureNormal) {
		/*
		 * Check to see if we are still in the 'resting' period
		 * after having notified all clients interested in
		 * a particular pressure level.
		 */

		level_snapshot = memorystatus_vm_pressure_level;

		if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
			if (next_warning_notification_sent_at_ts) {
				if (curr_ts < next_warning_notification_sent_at_ts) {
					/* Still resting: back off and try later. */
					delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
					return KERN_SUCCESS;
				}

				/* Resting window elapsed: allow warning re-notification. */
				next_warning_notification_sent_at_ts = 0;
				memorystatus_klist_reset_all_for_level(kVMPressureWarning);
			}
		} else if (level_snapshot == kVMPressureCritical) {
			if (next_critical_notification_sent_at_ts) {
				if (curr_ts < next_critical_notification_sent_at_ts) {
					delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
					return KERN_SUCCESS;
				}
				next_critical_notification_sent_at_ts = 0;
				memorystatus_klist_reset_all_for_level(kVMPressureCritical);
			}
		}
	}

#if CONFIG_JETSAM
	/* Arm/cancel the sustained-pressure fail-safe on normal<->elevated transitions. */
	if (memorystatus_vm_pressure_level == kVMPressureNormal && prev_level_snapshot != kVMPressureNormal) {
		if (memorystatus_should_kill_on_sustained_pressure) {
			memorystatus_log("memorystatus: Pressure has returned to level %d. Cancelling scheduled jetsam\n", memorystatus_vm_pressure_level);
			thread_call_cancel(sustained_pressure_handler_thread_call);
		}
	} else if (memorystatus_should_kill_on_sustained_pressure && memorystatus_vm_pressure_level != kVMPressureNormal && prev_level_snapshot == kVMPressureNormal) {
		/*
		 * Pressure has increased from normal.
		 * Hopefully the notifications will relieve it,
		 * but as a fail-safe we'll trigger jetsam
		 * after a configurable amount of time.
		 */
		memorystatus_log("memorystatus: Pressure level has increased from %d to %d. Scheduling jetsam.\n", prev_level_snapshot, memorystatus_vm_pressure_level);
		uint64_t kill_time;
		nanoseconds_to_absolutetime(memorystatus_kill_on_sustained_pressure_window_s * NSEC_PER_SEC, &kill_time);
		kill_time += mach_absolute_time();
		thread_call_enter_delayed(sustained_pressure_handler_thread_call, kill_time);
	}
#endif /* CONFIG_JETSAM */

	while (1) {
		/*
		 * There is a race window here. But it's not clear
		 * how much we benefit from having extra synchronization.
		 */
		level_snapshot = memorystatus_vm_pressure_level;

		if (prev_level_snapshot > level_snapshot) {
			/*
			 * Pressure decreased? Let's take a little breather
			 * and see if this condition stays.
			 */
			if (smoothing_window_started == FALSE) {
				smoothing_window_started = TRUE;
				microuptime(&smoothing_window_start_tstamp);
			}

			microuptime(&curr_tstamp);
			timevalsub(&curr_tstamp, &smoothing_window_start_tstamp);
			elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000;

			if (elapsed_msecs < VM_PRESSURE_DECREASED_SMOOTHING_PERIOD) {
				delay(INTER_NOTIFICATION_DELAY);
				continue;
			}
		}
		if (level_snapshot == kVMPressureNormal) {
			/* Interval over: flush the accumulated telemetry. */
			memorystatus_pressure_telemetry_send();
		}
		prev_level_snapshot = level_snapshot;
		smoothing_window_started = FALSE;
		memorystatus_klist_lock();

		if (level_snapshot > memorystatus_pressure_interval_telemetry.max_level) {
			/* New high-water mark for this pressure interval. */
			memorystatus_pressure_interval_telemetry.num_processes_registered = memorystatus_klist_length(level_snapshot);
			memorystatus_pressure_interval_telemetry.max_level = level_snapshot;
			memorystatus_pressure_interval_telemetry.num_transitions++;
			if (memorystatus_pressure_interval_telemetry.duration == 0) {
				/* Set the start timestamp. Duration will be finalized when we send the event. */
				memorystatus_pressure_interval_telemetry.duration = curr_ts;
			}
		}

		kn_max = vm_pressure_select_optimal_candidate_to_notify(&memorystatus_klist, level_snapshot, target_foreground_process, &next_telemetry_update);

		if (kn_max == NULL) {
			memorystatus_klist_unlock();

			/*
			 * No more level-based clients to notify.
			 *
			 * Start the 'resting' window within which clients will not be re-notified.
			 */

			if (level_snapshot != kVMPressureNormal) {
				if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
					nanoseconds_to_absolutetime(WARNING_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);

					/* Next warning notification (if nothing changes) won't be sent before...*/
					next_warning_notification_sent_at_ts = mach_absolute_time() + curr_ts;
				}

				if (level_snapshot == kVMPressureCritical) {
					nanoseconds_to_absolutetime(CRITICAL_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);

					/* Next critical notification (if nothing changes) won't be sent before...*/
					next_critical_notification_sent_at_ts = mach_absolute_time() + curr_ts;
				}
			}
			absolutetime_to_nanoseconds(mach_absolute_time(), &logging_now);
			/* Schedule (or cancel) the follow-up telemetry footprint sampling. */
			if (next_telemetry_update != UINT64_MAX) {
				thread_call_enter_delayed(memorystatus_notify_update_telemetry_thread_call, next_telemetry_update);
			} else {
				thread_call_cancel(memorystatus_notify_update_telemetry_thread_call);
			}
			return KERN_FAILURE;
		}

		target_proc = proc_ref(knote_get_kq(kn_max)->kq_p, false);
		if (target_proc == PROC_NULL) {
			/* Candidate vanished; pick again. */
			memorystatus_klist_unlock();
			continue;
		}

		target_pid = proc_getpid(target_proc);

		task = (struct task *)(proc_task(target_proc));

		if (level_snapshot != kVMPressureNormal) {
			if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
				if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_WARN, task, 0, kVMPressureWarning) == TRUE) {
					found_candidate = TRUE;
				}
			} else {
				if (level_snapshot == kVMPressureCritical) {
					if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, task, 0, kVMPressureCritical) == TRUE) {
						found_candidate = TRUE;
					}
				}
			}
		} else {
			if (kn_max->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
				/* Back to normal: forget prior warning/critical notifications. */
				task_clear_has_been_notified(task, kVMPressureWarning);
				task_clear_has_been_notified(task, kVMPressureCritical);

				found_candidate = TRUE;
			}
		}

		if (found_candidate == FALSE) {
			proc_rele(target_proc);
			memorystatus_klist_unlock();
			continue;
		}

		/*
		 * Move every knote belonging to the target pid (registered for this
		 * level) onto a private list so KNOTE() only fires that process.
		 */
		SLIST_FOREACH_SAFE(kn_cur, &memorystatus_klist, kn_selnext, kn_temp) {
			int knote_pressure_level = convert_internal_pressure_level_to_dispatch_level(level_snapshot);

			if (is_knote_registered_modify_task_pressure_bits(kn_cur, knote_pressure_level, task, 0, level_snapshot) == TRUE) {
				proc_t knote_proc = knote_get_kq(kn_cur)->kq_p;
				pid_t knote_pid = proc_getpid(knote_proc);
				if (knote_pid == target_pid) {
					KNOTE_DETACH(&memorystatus_klist, kn_cur);
					KNOTE_ATTACH(&dispatch_klist, kn_cur);
				}
			}
		}
		if (level_snapshot != kVMPressureNormal) {
			/* Record send time and this proc's order in the interval for telemetry. */
			mark_knote_send_time(kn_max, task, convert_internal_pressure_level_to_dispatch_level(level_snapshot),
			    (uint16_t) MIN(UINT16_MAX, memorystatus_pressure_interval_telemetry.num_notifications_sent));
			memorystatus_pressure_interval_telemetry.num_notifications_sent++;
		}

		KNOTE(&dispatch_klist, (level_snapshot != kVMPressureNormal) ? kMemorystatusPressure : kMemorystatusNoPressure);

		/* Put the dispatched knotes back on the main list. */
		SLIST_FOREACH_SAFE(kn_cur, &dispatch_klist, kn_selnext, kn_temp) {
			KNOTE_DETACH(&dispatch_klist, kn_cur);
			KNOTE_ATTACH(&memorystatus_klist, kn_cur);
		}

		memorystatus_klist_unlock();

		microuptime(&target_proc->vm_pressure_last_notify_tstamp);
		proc_rele(target_proc);

		if (memorystatus_manual_testing_on == TRUE && target_foreground_process == TRUE) {
			break;
		}

		if (memorystatus_manual_testing_on == TRUE) {
			/*
			 * Testing out the pressure notification scheme.
			 * No need for delays etc.
			 */
		} else {
			uint32_t sleep_interval = INTER_NOTIFICATION_DELAY;
#if CONFIG_JETSAM

			uint32_t critical_threshold = memorystatus_get_critical_page_shortage_threshold();
			uint32_t soft_threshold = memorystatus_get_soft_memlimit_page_shortage_threshold();
			assert(soft_threshold >= critical_threshold);

			/* Midpoint between the soft and critical page-shortage thresholds. */
			uint32_t backoff_threshold = soft_threshold -
			    ((soft_threshold - critical_threshold) / 2);

			if (memorystatus_get_available_page_count() <= backoff_threshold) {
				/*
				 * We are nearing the critcal mark fast and can't afford to wait between
				 * notifications.
				 */
				sleep_interval = 0;
			}
#endif /* CONFIG_JETSAM */

			if (sleep_interval) {
				delay(sleep_interval);
			}
		}
	}

	return KERN_SUCCESS;
}
1682
1683 static uint32_t
convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level)1684 convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level)
1685 {
1686 uint32_t dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
1687
1688 switch (internal_pressure_level) {
1689 case kVMPressureNormal:
1690 {
1691 dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
1692 break;
1693 }
1694
1695 case kVMPressureWarning:
1696 case kVMPressureUrgent:
1697 {
1698 dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_WARN;
1699 break;
1700 }
1701
1702 case kVMPressureCritical:
1703 {
1704 dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
1705 break;
1706 }
1707
1708 default:
1709 break;
1710 }
1711
1712 return dispatch_level;
1713 }
1714
1715 /*
1716 * Issue a wakeup to any threads listening for jetsam pressure via
1717 * `mach_vm_pressure_level_monitor`. Subscribers should respond to these
1718 * notifications by freeing cached memory.
1719 */
void
memorystatus_broadcast_jetsam_pressure(vm_pressure_level_t pressure_level)
{
	uint64_t now;
	uint32_t *waiters = NULL;
	uint64_t *last_notification_ns = NULL;

	/* Pick the waiter count / rate-limit timestamp pair for this band. */
	switch (pressure_level) {
	case kVMPressureForegroundJetsam:
		waiters = &memorystatus_jetsam_fg_band_waiters;
		last_notification_ns = &memorystatus_jetsam_fg_band_timestamp_ns;
		break;
	case kVMPressureBackgroundJetsam:
		waiters = &memorystatus_jetsam_bg_band_waiters;
		last_notification_ns = &memorystatus_jetsam_bg_band_timestamp_ns;
		break;
	default:
		panic("Unexpected non-jetsam pressure level %d", pressure_level);
	}

	/* The broadcast lock protects both the waiter count and the timestamp. */
	lck_mtx_lock(&memorystatus_jetsam_broadcast_lock);
	absolutetime_to_nanoseconds(mach_absolute_time(), &now);

	/* Rate-limit: skip the wakeup if the last broadcast was too recent. */
	if (now - *last_notification_ns < memorystatus_jetsam_notification_delay_ns) {
		lck_mtx_unlock(&memorystatus_jetsam_broadcast_lock);
		return;
	}

	if (*waiters > 0) {
		memorystatus_log("memorystatus: issuing %s jetsam pressure notification to %d waiters",
		    pressure_level == kVMPressureForegroundJetsam ?
		    "foreground" : "background", *waiters);
		/* The waiter count's address doubles as the wait event. */
		thread_wakeup((event_t)waiters);
		*waiters = 0;
		*last_notification_ns = now;
	}
	lck_mtx_unlock(&memorystatus_jetsam_broadcast_lock);
}
1758
1759 /*
1760 * Memorystatus notification debugging support
1761 */
1762
1763 #if DEVELOPMENT || DEBUG
1764
1765 static int
1766 sysctl_memorystatus_broadcast_jetsam_pressure SYSCTL_HANDLER_ARGS
1767 {
1768 int error = 0;
1769 vm_pressure_level_t pressure_level;
1770
1771 error = SYSCTL_IN(req, &pressure_level, sizeof(pressure_level));
1772 if (error) {
1773 return error;
1774 }
1775
1776 if (pressure_level == kVMPressureForegroundJetsam ||
1777 pressure_level == kVMPressureBackgroundJetsam) {
1778 memorystatus_broadcast_jetsam_pressure(pressure_level);
1779 } else {
1780 return EINVAL;
1781 }
1782
1783 return SYSCTL_OUT(req, &pressure_level, sizeof(pressure_level));
1784 }
1785
/* DEVELOPMENT/DEBUG-only knob; MASKED hides it from sysctl listings. */
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_broadcast_jetsam_pressure,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED,
    0, 0, &sysctl_memorystatus_broadcast_jetsam_pressure, "I", "");
1789
1790 #endif /* DEVELOPMENT || DEBUG */
1791
1792 static int
1793 sysctl_memorystatus_vm_pressure_level SYSCTL_HANDLER_ARGS
1794 {
1795 #pragma unused(arg1, arg2, oidp)
1796 #if !XNU_TARGET_OS_OSX
1797 int error = 0;
1798
1799 error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
1800 if (error) {
1801 return error;
1802 }
1803
1804 #endif /* !XNU_TARGET_OS_OSX */
1805 uint32_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(memorystatus_vm_pressure_level);
1806
1807 return SYSCTL_OUT(req, &dispatch_level, sizeof(dispatch_level));
1808 }
1809
#if DEBUG || DEVELOPMENT

/* Debug kernels: node is visible in sysctl listings. */
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");

#else /* DEBUG || DEVELOPMENT */

/* Release kernels: same node, additionally MASKED (hidden from listings). */
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");

#endif /* DEBUG || DEVELOPMENT */
1821
1822 /*
1823 * Trigger levels to test the mechanism.
1824 * Can be used via a sysctl.
1825 */
1826 #define TEST_LOW_MEMORY_TRIGGER_ONE 1
1827 #define TEST_LOW_MEMORY_TRIGGER_ALL 2
1828 #define TEST_PURGEABLE_TRIGGER_ONE 3
1829 #define TEST_PURGEABLE_TRIGGER_ALL 4
1830 #define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE 5
1831 #define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL 6
1832
1833 static int
1834 sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS
1835 {
1836 #pragma unused(arg1, arg2)
1837
1838 int level = 0;
1839 int error = 0;
1840 int pressure_level = 0;
1841 int trigger_request = 0;
1842 int force_purge;
1843
1844 error = sysctl_handle_int(oidp, &level, 0, req);
1845 if (error || !req->newptr) {
1846 return error;
1847 }
1848
1849 memorystatus_manual_testing_on = TRUE;
1850
1851 trigger_request = (level >> 16) & 0xFFFF;
1852 pressure_level = (level & 0xFFFF);
1853
1854 if (trigger_request < TEST_LOW_MEMORY_TRIGGER_ONE ||
1855 trigger_request > TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL) {
1856 return EINVAL;
1857 }
1858 switch (pressure_level) {
1859 case NOTE_MEMORYSTATUS_PRESSURE_NORMAL:
1860 case NOTE_MEMORYSTATUS_PRESSURE_WARN:
1861 case NOTE_MEMORYSTATUS_PRESSURE_CRITICAL:
1862 break;
1863 default:
1864 return EINVAL;
1865 }
1866
1867 /*
1868 * The pressure level is being set from user-space.
1869 * And user-space uses the constants in sys/event.h
1870 * So we translate those events to our internal levels here.
1871 */
1872 if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
1873 memorystatus_manual_testing_level = kVMPressureNormal;
1874 force_purge = 0;
1875 } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN) {
1876 memorystatus_manual_testing_level = kVMPressureWarning;
1877 force_purge = vm_pageout_state.memorystatus_purge_on_warning;
1878 } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
1879 memorystatus_manual_testing_level = kVMPressureCritical;
1880 force_purge = vm_pageout_state.memorystatus_purge_on_critical;
1881 }
1882
1883 memorystatus_vm_pressure_level = memorystatus_manual_testing_level;
1884
1885 /* purge according to the new pressure level */
1886 switch (trigger_request) {
1887 case TEST_PURGEABLE_TRIGGER_ONE:
1888 case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE:
1889 if (force_purge == 0) {
1890 /* no purging requested */
1891 break;
1892 }
1893 vm_purgeable_object_purge_one_unlocked(force_purge);
1894 break;
1895 case TEST_PURGEABLE_TRIGGER_ALL:
1896 case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL:
1897 if (force_purge == 0) {
1898 /* no purging requested */
1899 break;
1900 }
1901 while (vm_purgeable_object_purge_one_unlocked(force_purge)) {
1902 ;
1903 }
1904 break;
1905 }
1906
1907 if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ONE) ||
1908 (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE)) {
1909 memorystatus_update_vm_pressure(TRUE);
1910 }
1911
1912 if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ALL) ||
1913 (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL)) {
1914 while (memorystatus_update_vm_pressure(FALSE) == KERN_SUCCESS) {
1915 continue;
1916 }
1917 }
1918
1919 if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
1920 memorystatus_manual_testing_on = FALSE;
1921 }
1922
1923 return 0;
1924 }
1925
/* Write-only test knob; input encoding documented on the handler above. */
SYSCTL_PROC(_kern, OID_AUTO, memorypressure_manual_trigger, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_memorypressure_manual_trigger, "I", "");


/*
 * Purge token counts applied when warning/urgent/critical notifications fire.
 * NOTE(review): the first entry omits CTLTYPE_INT while the next two include
 * it; SYSCTL_INT normally supplies CTLTYPE_INT itself, so the explicit flag
 * looks redundant — confirm before normalizing.
 */
SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_warning, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_warning, 0, "");
SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_urgent, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_urgent, 0, "");
SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_critical, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_critical, 0, "");

/* Tunable defined elsewhere; exposed read/write here. */
extern int vm_pressure_level_transition_threshold;
SYSCTL_INT(_kern, OID_AUTO, vm_pressure_level_transition_threshold, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_level_transition_threshold, 0, "");
1936
#if DEBUG || DEVELOPMENT
/* Test knob: globally enables/disables VM pressure event delivery. */
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_events_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_events_enabled, 0, "");

#if 0
#if CONFIG_JETSAM && VM_PRESSURE_EVENTS
/*
 * Dead code (compiled out by the #if 0 above): broadcast a pressure or
 * no-pressure kevent to every EVFILT_MEMORYSTATUS subscriber.
 */
static boolean_t
memorystatus_issue_pressure_kevent(boolean_t pressured)
{
	memorystatus_klist_lock();
	KNOTE(&memorystatus_klist, pressured ? kMemorystatusPressure : kMemorystatusNoPressure);
	memorystatus_klist_unlock();
	return TRUE;
}
#endif /* CONFIG_JETSAM && VM_PRESSURE_EVENTS */
#endif /* 0 */
1952
1953 /*
1954 * This routine is used for targeted notifications regardless of system memory pressure
1955 * and regardless of whether or not the process has already been notified.
1956 * It bypasses and has no effect on the only-one-notification per soft-limit policy.
1957 *
1958 * "memnote" is the current user.
1959 */
1960
1961 static int
1962 sysctl_memorystatus_vm_pressure_send SYSCTL_HANDLER_ARGS
1963 {
1964 #pragma unused(arg1, arg2)
1965 /* Need to be root or have memorystatus entitlement */
1966 if (!kauth_cred_issuser(kauth_cred_get()) && !IOCurrentTaskHasEntitlement(MEMORYSTATUS_ENTITLEMENT)) {
1967 return EPERM;
1968 }
1969
1970 int error = 0, pid = 0;
1971 struct knote *kn = NULL;
1972 boolean_t found_knote = FALSE;
1973 int fflags = 0; /* filter flags for EVFILT_MEMORYSTATUS */
1974 uint64_t value = 0;
1975
1976 error = sysctl_handle_quad(oidp, &value, 0, req);
1977 if (error || !req->newptr) {
1978 return error;
1979 }
1980
1981 /*
1982 * Find the pid in the low 32 bits of value passed in.
1983 */
1984 pid = (int)(value & 0xFFFFFFFF);
1985
1986 /*
1987 * Find notification in the high 32 bits of the value passed in.
1988 */
1989 fflags = (int)((value >> 32) & 0xFFFFFFFF);
1990
1991 /*
1992 * For backwards compatibility, when no notification is
1993 * passed in, default to the NOTE_MEMORYSTATUS_PRESSURE_WARN
1994 */
1995 if (fflags == 0) {
1996 fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
1997 // printf("memorystatus_vm_pressure_send: using default notification [0x%x]\n", fflags);
1998 }
1999
2000 /* wake up everybody waiting for kVMPressureForegroundJetsam */
2001 if (fflags == NOTE_MEMORYSTATUS_JETSAM_FG_BAND) {
2002 memorystatus_broadcast_jetsam_pressure(kVMPressureForegroundJetsam);
2003 return error;
2004 }
2005
2006 /*
2007 * See event.h ... fflags for EVFILT_MEMORYSTATUS
2008 */
2009 if (!((fflags == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) ||
2010 (fflags == NOTE_MEMORYSTATUS_PRESSURE_WARN) ||
2011 (fflags == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) ||
2012 (fflags == NOTE_MEMORYSTATUS_LOW_SWAP) ||
2013 (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) ||
2014 (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) ||
2015 (((fflags & NOTE_MEMORYSTATUS_MSL_STATUS) != 0 &&
2016 ((fflags & ~NOTE_MEMORYSTATUS_MSL_STATUS) == 0))))) {
2017 memorystatus_log_error("memorystatus_vm_pressure_send: notification [0x%x] not supported\n", fflags);
2018 error = 1;
2019 return error;
2020 }
2021
2022 /*
2023 * Forcibly send pid a memorystatus notification.
2024 */
2025
2026 memorystatus_klist_lock();
2027
2028 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
2029 proc_t knote_proc = knote_get_kq(kn)->kq_p;
2030 pid_t knote_pid = proc_getpid(knote_proc);
2031
2032 if (knote_pid == pid) {
2033 /*
2034 * Forcibly send this pid a memorystatus notification.
2035 */
2036 kn->kn_fflags = fflags;
2037 found_knote = TRUE;
2038 }
2039 }
2040
2041 if (found_knote) {
2042 KNOTE(&memorystatus_klist, 0);
2043 memorystatus_log_debug("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] sent to process [%d]\n", value, fflags, pid);
2044 error = 0;
2045 } else {
2046 memorystatus_log_error("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] not sent to process [%d] (none registered?)\n", value, fflags, pid);
2047 error = 1;
2048 }
2049
2050 memorystatus_klist_unlock();
2051
2052 return error;
2053 }
2054
/* Quad input: low 32 bits = pid, high 32 bits = fflags (handler above). */
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_send, CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_ANYBODY,
    0, 0, &sysctl_memorystatus_vm_pressure_send, "Q", "");
2057
2058 #endif /* DEBUG || DEVELOPMENT */
2059
2060 #endif /* VM_PRESSURE_EVENTS */
2061