1 /*
2 * Copyright (c) 2006-2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 *
28 */
29
30 #include <sys/kern_event.h>
31 #include <kern/sched_prim.h>
32 #include <kern/assert.h>
33 #include <kern/debug.h>
34 #include <kern/locks.h>
35 #include <kern/task.h>
36 #include <kern/thread.h>
37 #include <kern/thread_call.h>
38 #include <kern/host.h>
39 #include <kern/policy_internal.h>
40 #include <kern/thread_group.h>
41
42 #include <IOKit/IOBSD.h>
43
44 #include <libkern/libkern.h>
45 #include <libkern/coreanalytics/coreanalytics.h>
46 #include <mach/coalition.h>
47 #include <mach/clock_types.h>
48 #include <mach/mach_time.h>
49 #include <mach/task.h>
50 #include <mach/host_priv.h>
51 #include <mach/mach_host.h>
52 #include <os/log.h>
53 #include <pexpert/pexpert.h>
54 #include <sys/coalition.h>
55 #include <sys/kern_event.h>
56 #include <sys/proc.h>
57 #include <sys/proc_info.h>
58 #include <sys/reason.h>
59 #include <sys/signal.h>
60 #include <sys/signalvar.h>
61 #include <sys/sysctl.h>
62 #include <sys/sysproto.h>
63 #include <sys/time.h>
64 #include <sys/wait.h>
65 #include <sys/tree.h>
66 #include <sys/priv.h>
67 #include <vm/vm_pageout.h>
68 #include <vm/vm_protos.h>
69 #include <mach/machine/sdt.h>
70 #include <libkern/section_keywords.h>
71 #include <stdatomic.h>
72
73 #if CONFIG_FREEZE
74 #include <vm/vm_map.h>
75 #endif /* CONFIG_FREEZE */
76
77 #include <sys/kern_memorystatus.h>
78 #include <sys/kern_memorystatus_notify.h>
79
80 /*
81 * Memorystatus klist structures
82 */
83 struct klist memorystatus_klist;
84 static lck_mtx_t memorystatus_klist_mutex;
85 static void memorystatus_klist_lock(void);
86 static void memorystatus_klist_unlock(void);
87
88 /*
89 * Memorystatus kevent filter routines
90 */
91 static int filt_memorystatusattach(struct knote *kn, struct kevent_qos_s *kev);
92 static void filt_memorystatusdetach(struct knote *kn);
93 static int filt_memorystatus(struct knote *kn, long hint);
94 static int filt_memorystatustouch(struct knote *kn, struct kevent_qos_s *kev);
95 static int filt_memorystatusprocess(struct knote *kn, struct kevent_qos_s *kev);
96
97 SECURITY_READ_ONLY_EARLY(struct filterops) memorystatus_filtops = {
98 .f_attach = filt_memorystatusattach,
99 .f_detach = filt_memorystatusdetach,
100 .f_event = filt_memorystatus,
101 .f_touch = filt_memorystatustouch,
102 .f_process = filt_memorystatusprocess,
103 };
104
/*
 * Memorystatus notification events.
 *
 * These are the hint values handled by filt_memorystatus(); each event
 * occupies its own bit.
 */
enum {
	kMemorystatusNoPressure        = 1 << 0,
	kMemorystatusPressure          = 1 << 1,
	kMemorystatusLowSwap           = 1 << 2,
	kMemorystatusProcLimitWarn     = 1 << 3,
	kMemorystatusProcLimitCritical = 1 << 4
};
115
/* Notification pacing constants; the unit of each value is given alongside it. */
#define INTER_NOTIFICATION_DELAY                (250000)  /* .25 second */
#define VM_PRESSURE_DECREASED_SMOOTHING_PERIOD  (5000)    /* milliseconds */
#define WARNING_NOTIFICATION_RESTING_PERIOD     (25)      /* seconds */
#define CRITICAL_NOTIFICATION_RESTING_PERIOD    (25)      /* seconds */
120
121 /*
122 * Memorystatus notification helper routines
123 */
124 static vm_pressure_level_t convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);
125 static boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t);
126 static void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear);
127 static struct knote *vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process, uint64_t *next_telemetry_update);
128 static void vm_dispatch_memory_pressure(void);
129 kern_return_t memorystatus_update_vm_pressure(boolean_t target_foreground_process);
130
#if VM_PRESSURE_EVENTS

/*
 * Threshold (in MB) that a process must meet to be considered for
 * scavenging. The default differs between macOS and other targets.
 */
#if XNU_TARGET_OS_OSX
#define VM_PRESSURE_MINIMUM_RSIZE        10    /* MB */
#else /* XNU_TARGET_OS_OSX */
#define VM_PRESSURE_MINIMUM_RSIZE        6     /* MB */
#endif /* XNU_TARGET_OS_OSX */

static uint32_t vm_pressure_task_footprint_min = VM_PRESSURE_MINIMUM_RSIZE;

#if DEVELOPMENT || DEBUG
/* Read/write knob for the threshold above, on DEVELOPMENT/DEBUG kernels only. */
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_task_footprint_min, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_task_footprint_min, 0, "");
#endif /* DEVELOPMENT || DEBUG */

/* Current system pressure level; starts out at normal. */
vm_pressure_level_t memorystatus_vm_pressure_level = kVMPressureNormal;

/*
 * Flag signalling whether there are any HWM offenders on the system.
 * It lets us cut down on wakeups of the memorystatus_thread while the
 * system sits between the "pressure" and "critical" thresholds.
 *
 * The flag is (re-)set without any lock or other synchronization: there
 * is (currently) no way to track HWM offenders that drop back below
 * their memory limit and/or exit, so a couple of wasted wakeups are
 * accepted in exchange for unguarded modification of this variable.
 */
boolean_t memorystatus_hwm_candidates = FALSE;

#endif /* VM_PRESSURE_EVENTS */
165
166 #if CONFIG_JETSAM
167
168 extern unsigned int memorystatus_available_pages;
169 extern unsigned int memorystatus_available_pages_pressure;
170 extern unsigned int memorystatus_available_pages_critical;
171 extern unsigned int memorystatus_available_pages_critical_base;
172 extern unsigned int memorystatus_available_pages_critical_idle_offset;
173
174 #else /* CONFIG_JETSAM */
175
176 extern uint64_t memorystatus_available_pages;
177 extern uint64_t memorystatus_available_pages_pressure;
178 extern uint64_t memorystatus_available_pages_critical;
179
180 #endif /* CONFIG_JETSAM */
181
182 extern lck_mtx_t memorystatus_jetsam_fg_band_lock;
183 uint32_t memorystatus_jetsam_fg_band_waiters = 0;
184 static uint64_t memorystatus_jetsam_fg_band_timestamp_ns = 0; /* nanosec */
185 static uint64_t memorystatus_jetsam_fg_band_delay_ns = 5ull * 1000 * 1000 * 1000; /* nanosec */
186
187 extern boolean_t(*volatile consider_buffer_cache_collect)(int);
188
189 #if DEVELOPMENT || DEBUG
190 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_jetsam_fg_band_delay_ns, CTLFLAG_RW | CTLFLAG_LOCKED,
191 &memorystatus_jetsam_fg_band_delay_ns, "");
192 #endif
193
194 static int
filt_memorystatusattach(struct knote * kn,__unused struct kevent_qos_s * kev)195 filt_memorystatusattach(struct knote *kn, __unused struct kevent_qos_s *kev)
196 {
197 int error;
198
199 kn->kn_flags |= EV_CLEAR; /* automatically set */
200 kn->kn_sdata = 0; /* incoming data is ignored */
201 memset(&kn->kn_ext, 0, sizeof(kn->kn_ext));
202
203 error = memorystatus_knote_register(kn);
204 if (error) {
205 knote_set_error(kn, error);
206 }
207 return 0;
208 }
209
/*
 * Detach handler for the memorystatus filter: removes the knote from
 * the memorystatus klist.
 */
static void
filt_memorystatusdetach(struct knote *kn)
{
	memorystatus_knote_unregister(kn);
}
215
216 static int
filt_memorystatus(struct knote * kn __unused,long hint)217 filt_memorystatus(struct knote *kn __unused, long hint)
218 {
219 if (hint) {
220 switch (hint) {
221 case kMemorystatusNoPressure:
222 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
223 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
224 }
225 break;
226 case kMemorystatusPressure:
227 if (memorystatus_vm_pressure_level == kVMPressureWarning || memorystatus_vm_pressure_level == kVMPressureUrgent) {
228 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
229 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
230 }
231 } else if (memorystatus_vm_pressure_level == kVMPressureCritical) {
232 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
233 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
234 }
235 }
236 break;
237 case kMemorystatusLowSwap:
238 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_LOW_SWAP) {
239 kn->kn_fflags = NOTE_MEMORYSTATUS_LOW_SWAP;
240 }
241 break;
242
243 case kMemorystatusProcLimitWarn:
244 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
245 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
246 }
247 break;
248
249 case kMemorystatusProcLimitCritical:
250 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
251 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
252 }
253 break;
254
255 default:
256 break;
257 }
258 }
259
260 #if 0
261 if (kn->kn_fflags != 0) {
262 proc_t knote_proc = knote_get_kq(kn)->kq_p;
263 pid_t knote_pid = proc_getpid(knote_proc);
264
265 printf("filt_memorystatus: sending kn 0x%lx (event 0x%x) for pid (%d)\n",
266 (unsigned long)kn, kn->kn_fflags, knote_pid);
267 }
268 #endif
269
270 return kn->kn_fflags != 0;
271 }
272
273 static int
filt_memorystatustouch(struct knote * kn,struct kevent_qos_s * kev)274 filt_memorystatustouch(struct knote *kn, struct kevent_qos_s *kev)
275 {
276 int res;
277 int prev_kn_sfflags = 0;
278
279 memorystatus_klist_lock();
280
281 /*
282 * copy in new kevent settings
283 * (saving the "desired" data and fflags).
284 */
285
286 prev_kn_sfflags = kn->kn_sfflags;
287 kn->kn_sfflags = (kev->fflags & EVFILT_MEMORYSTATUS_ALL_MASK);
288
289 #if XNU_TARGET_OS_OSX
290 /*
291 * Only on desktop do we restrict notifications to
292 * one per active/inactive state (soft limits only).
293 */
294 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
295 /*
296 * Is there previous state to preserve?
297 */
298 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
299 /*
300 * This knote was previously interested in proc_limit_warn,
301 * so yes, preserve previous state.
302 */
303 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
304 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
305 }
306 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
307 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
308 }
309 } else {
310 /*
311 * This knote was not previously interested in proc_limit_warn,
312 * but it is now. Set both states.
313 */
314 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
315 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
316 }
317 }
318
319 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
320 /*
321 * Is there previous state to preserve?
322 */
323 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
324 /*
325 * This knote was previously interested in proc_limit_critical,
326 * so yes, preserve previous state.
327 */
328 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
329 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
330 }
331 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
332 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
333 }
334 } else {
335 /*
336 * This knote was not previously interested in proc_limit_critical,
337 * but it is now. Set both states.
338 */
339 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
340 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
341 }
342 }
343 #endif /* XNU_TARGET_OS_OSX */
344
345 /*
346 * reset the output flags based on a
347 * combination of the old events and
348 * the new desired event list.
349 */
350 //kn->kn_fflags &= kn->kn_sfflags;
351
352 res = (kn->kn_fflags != 0);
353
354 memorystatus_klist_unlock();
355
356 return res;
357 }
358
359 static int
filt_memorystatusprocess(struct knote * kn,struct kevent_qos_s * kev)360 filt_memorystatusprocess(struct knote *kn, struct kevent_qos_s *kev)
361 {
362 int res = 0;
363
364 memorystatus_klist_lock();
365 if (kn->kn_fflags) {
366 knote_fill_kevent(kn, kev, 0);
367 res = 1;
368 }
369 memorystatus_klist_unlock();
370
371 return res;
372 }
373
374 static void
memorystatus_klist_lock(void)375 memorystatus_klist_lock(void)
376 {
377 lck_mtx_lock(&memorystatus_klist_mutex);
378 }
379
380 static void
memorystatus_klist_unlock(void)381 memorystatus_klist_unlock(void)
382 {
383 lck_mtx_unlock(&memorystatus_klist_mutex);
384 }
385
386 void
memorystatus_kevent_init(lck_grp_t * grp,lck_attr_t * attr)387 memorystatus_kevent_init(lck_grp_t *grp, lck_attr_t *attr)
388 {
389 lck_mtx_init(&memorystatus_klist_mutex, grp, attr);
390 klist_init(&memorystatus_klist);
391 }
392
393 int
memorystatus_knote_register(struct knote * kn)394 memorystatus_knote_register(struct knote *kn)
395 {
396 int error = 0;
397
398 memorystatus_klist_lock();
399
400 /*
401 * Support only userspace visible flags.
402 */
403 if ((kn->kn_sfflags & EVFILT_MEMORYSTATUS_ALL_MASK) == (unsigned int) kn->kn_sfflags) {
404 #if XNU_TARGET_OS_OSX
405 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
406 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
407 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
408 }
409
410 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
411 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
412 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
413 }
414 #endif /* XNU_TARGET_OS_OSX */
415
416 KNOTE_ATTACH(&memorystatus_klist, kn);
417 } else {
418 error = ENOTSUP;
419 }
420
421 memorystatus_klist_unlock();
422
423 return error;
424 }
425
426 void
memorystatus_knote_unregister(struct knote * kn __unused)427 memorystatus_knote_unregister(struct knote *kn __unused)
428 {
429 memorystatus_klist_lock();
430 KNOTE_DETACH(&memorystatus_klist, kn);
431 memorystatus_klist_unlock();
432 }
433
434 #if VM_PRESSURE_EVENTS
435
436 #if CONFIG_JETSAM
437
438 static thread_call_t sustained_pressure_handler_thread_call;
439 int memorystatus_should_kill_on_sustained_pressure = 1;
440 /* Count the number of sustained pressure kills we've done since boot. */
441 uint64_t memorystatus_kill_on_sustained_pressure_count = 0;
442 uint64_t memorystatus_kill_on_sustained_pressure_window_s = 60 * 10; /* 10 Minutes */
443 uint64_t memorystatus_kill_on_sustained_pressure_delay_ms = 500; /* .5 seconds */
444
445 #if DEVELOPMENT || DEBUG
446 SYSCTL_INT(_kern, OID_AUTO, memorystatus_should_kill_on_sustained_pressure, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_should_kill_on_sustained_pressure, 0, "");
447 #endif /* DEVELOPMENT || DEBUG */
448 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_count, "");
449 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_window_s, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_window_s, "");
450 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_delay_ms, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_delay_ms, "");
451
452 static void sustained_pressure_handler(void*, void*);
453 #endif /* CONFIG_JETSAM */
454 static thread_call_t memorystatus_notify_update_telemetry_thread_call;
455 static void update_footprints_for_telemetry(void*, void*);
456
457
458 void
memorystatus_notify_init()459 memorystatus_notify_init()
460 {
461 #if CONFIG_JETSAM
462 sustained_pressure_handler_thread_call = thread_call_allocate_with_options(sustained_pressure_handler, NULL, THREAD_CALL_PRIORITY_KERNEL_HIGH, THREAD_CALL_OPTIONS_ONCE);
463 #endif /* CONFIG_JETSAM */
464 memorystatus_notify_update_telemetry_thread_call = thread_call_allocate_with_options(update_footprints_for_telemetry, NULL, THREAD_CALL_PRIORITY_USER, THREAD_CALL_OPTIONS_ONCE);
465 }
466
467 #if CONFIG_MEMORYSTATUS
468
469 inline int
memorystatus_send_note(int event_code,void * data,uint32_t data_length)470 memorystatus_send_note(int event_code, void *data, uint32_t data_length)
471 {
472 int ret;
473 struct kev_msg ev_msg;
474
475 ev_msg.vendor_code = KEV_VENDOR_APPLE;
476 ev_msg.kev_class = KEV_SYSTEM_CLASS;
477 ev_msg.kev_subclass = KEV_MEMORYSTATUS_SUBCLASS;
478
479 ev_msg.event_code = event_code;
480
481 ev_msg.dv[0].data_length = data_length;
482 ev_msg.dv[0].data_ptr = data;
483 ev_msg.dv[1].data_length = 0;
484
485 ret = kev_post_msg(&ev_msg);
486 if (ret) {
487 printf("%s: kev_post_msg() failed, err %d\n", __func__, ret);
488 }
489
490 return ret;
491 }
492
493 boolean_t
memorystatus_warn_process(const proc_t p,__unused boolean_t is_active,__unused boolean_t is_fatal,boolean_t limit_exceeded)494 memorystatus_warn_process(const proc_t p, __unused boolean_t is_active, __unused boolean_t is_fatal, boolean_t limit_exceeded)
495 {
496 /*
497 * This function doesn't take a reference to p or lock it. So it better be the current process.
498 */
499 assert(p == current_proc());
500 pid_t pid = proc_getpid(p);
501 boolean_t ret = FALSE;
502 boolean_t found_knote = FALSE;
503 struct knote *kn = NULL;
504 int send_knote_count = 0;
505 uint32_t platform;
506 platform = proc_platform(p);
507
508 /*
509 * See comment in sysctl_memorystatus_vm_pressure_send.
510 */
511
512 memorystatus_klist_lock();
513
514 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
515 proc_t knote_proc = knote_get_kq(kn)->kq_p;
516 pid_t knote_pid = proc_getpid(knote_proc);
517
518 if (knote_pid == pid) {
519 /*
520 * By setting the "fflags" here, we are forcing
521 * a process to deal with the case where it's
522 * bumping up into its memory limits. If we don't
523 * do this here, we will end up depending on the
524 * system pressure snapshot evaluation in
525 * filt_memorystatus().
526 */
527
528 /*
529 * The type of notification and the frequency are different between
530 * embedded and desktop.
531 *
532 * Embedded processes register for global pressure notifications
533 * (NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) via UIKit
534 * (see applicationDidReceiveMemoryWarning in UIKit). We'll warn them here if
535 * they are near there memory limit. filt_memorystatus() will warn them based
536 * on the system pressure level.
537 *
538 * On desktop, (NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL)
539 * are only expected to fire for system level warnings. Desktop procesess
540 * register for NOTE_MEMORYSTATUS_PROC_LIMIT_WARN
541 * if they want to be warned when they approach their limit
542 * and for NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL to be warned when they
543 * exceed their limit.
544 *
545 * On embedded we continuously warn processes that are approaching their
546 * memory limit. However on desktop, we only send one warning while
547 * the process is active/inactive if the limit is soft..
548 *
549 */
550 if (platform == PLATFORM_MACOS || platform == PLATFORM_MACCATALYST || platform == PLATFORM_DRIVERKIT) {
551 if (!limit_exceeded) {
552 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
553 found_knote = TRUE;
554 if (!is_fatal) {
555 /*
556 * Restrict proc_limit_warn notifications when
557 * non-fatal (soft) limit is at play.
558 */
559 if (is_active) {
560 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
561 /*
562 * Mark this knote for delivery.
563 */
564 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
565 /*
566 * And suppress it from future notifications.
567 */
568 kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
569 send_knote_count++;
570 }
571 } else {
572 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
573 /*
574 * Mark this knote for delivery.
575 */
576 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
577 /*
578 * And suppress it from future notifications.
579 */
580 kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
581 send_knote_count++;
582 }
583 }
584 } else {
585 /*
586 * No restriction on proc_limit_warn notifications when
587 * fatal (hard) limit is at play.
588 */
589 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
590 send_knote_count++;
591 }
592 }
593 } else {
594 /*
595 * Send this notification when a process has exceeded a soft limit,
596 */
597
598 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
599 found_knote = TRUE;
600 if (!is_fatal) {
601 /*
602 * Restrict critical notifications for soft limits.
603 */
604
605 if (is_active) {
606 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
607 /*
608 * Suppress future proc_limit_critical notifications
609 * for the active soft limit.
610 */
611 kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
612 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
613 send_knote_count++;
614 }
615 } else {
616 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
617 /*
618 * Suppress future proc_limit_critical_notifications
619 * for the inactive soft limit.
620 */
621 kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
622 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
623 send_knote_count++;
624 }
625 }
626 } else {
627 /*
628 * We should never be trying to send a critical notification for
629 * a hard limit... the process would be killed before it could be
630 * received.
631 */
632 panic("Caught sending pid %d a critical warning for a fatal limit.", pid);
633 }
634 }
635 }
636 } else {
637 if (!limit_exceeded) {
638 /*
639 * Intentionally set either the unambiguous limit warning,
640 * the system-wide critical or the system-wide warning
641 * notification bit.
642 */
643
644 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
645 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
646 found_knote = TRUE;
647 send_knote_count++;
648 } else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
649 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
650 found_knote = TRUE;
651 send_knote_count++;
652 } else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
653 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
654 found_knote = TRUE;
655 send_knote_count++;
656 }
657 } else {
658 /*
659 * Send this notification when a process has exceeded a soft limit.
660 */
661 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
662 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
663 found_knote = TRUE;
664 send_knote_count++;
665 }
666 }
667 }
668 }
669 }
670
671 if (found_knote) {
672 if (send_knote_count > 0) {
673 KNOTE(&memorystatus_klist, 0);
674 }
675 ret = TRUE;
676 }
677
678 memorystatus_klist_unlock();
679
680 return ret;
681 }
682
683 /*
684 * Can only be set by the current task on itself.
685 */
686 int
memorystatus_low_mem_privileged_listener(uint32_t op_flags)687 memorystatus_low_mem_privileged_listener(uint32_t op_flags)
688 {
689 boolean_t set_privilege = FALSE;
690 /*
691 * Need an entitlement check here?
692 */
693 if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE) {
694 set_privilege = TRUE;
695 } else if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE) {
696 set_privilege = FALSE;
697 } else {
698 return EINVAL;
699 }
700
701 return task_low_mem_privileged_listener(current_task(), set_privilege, NULL);
702 }
703
704 int
memorystatus_send_pressure_note(pid_t pid)705 memorystatus_send_pressure_note(pid_t pid)
706 {
707 MEMORYSTATUS_DEBUG(1, "memorystatus_send_pressure_note(): pid %d\n", pid);
708 return memorystatus_send_note(kMemorystatusPressureNote, &pid, sizeof(pid));
709 }
710
711 boolean_t
memorystatus_is_foreground_locked(proc_t p)712 memorystatus_is_foreground_locked(proc_t p)
713 {
714 return (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND) ||
715 (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND_SUPPORT);
716 }
717
718 /*
719 * This is meant for stackshot and kperf -- it does not take the proc_list_lock
720 * to access the p_memstat_dirty field.
721 */
722 void
memorystatus_proc_flags_unsafe(void * v,boolean_t * is_dirty,boolean_t * is_dirty_tracked,boolean_t * allow_idle_exit)723 memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit)
724 {
725 if (!v) {
726 *is_dirty = FALSE;
727 *is_dirty_tracked = FALSE;
728 *allow_idle_exit = FALSE;
729 } else {
730 proc_t p = (proc_t)v;
731 *is_dirty = (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) != 0;
732 *is_dirty_tracked = (p->p_memstat_dirty & P_DIRTY_TRACK) != 0;
733 *allow_idle_exit = (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) != 0;
734 }
735 }
736
737 boolean_t
memorystatus_bg_pressure_eligible(proc_t p)738 memorystatus_bg_pressure_eligible(proc_t p)
739 {
740 boolean_t eligible = FALSE;
741
742 proc_list_lock();
743
744 MEMORYSTATUS_DEBUG(1, "memorystatus_bg_pressure_eligible: pid %d, state 0x%x\n", proc_getpid(p), p->p_memstat_state);
745
746 /* Foreground processes have already been dealt with at this point, so just test for eligibility */
747 if (!(p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN))) {
748 eligible = TRUE;
749 }
750
751 if (p->p_memstat_effectivepriority < JETSAM_PRIORITY_BACKGROUND_OPPORTUNISTIC) {
752 /*
753 * IDLE and IDLE_DEFERRED bands contain processes
754 * that have dropped memory to be under their inactive
755 * memory limits. And so they can't really give back
756 * anything.
757 */
758 eligible = FALSE;
759 }
760
761 proc_list_unlock();
762
763 return eligible;
764 }
765
766 void
memorystatus_send_low_swap_note(void)767 memorystatus_send_low_swap_note(void)
768 {
769 struct knote *kn = NULL;
770
771 memorystatus_klist_lock();
772 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
773 /* We call is_knote_registered_modify_task_pressure_bits to check if the sfflags for the
774 * current note contain NOTE_MEMORYSTATUS_LOW_SWAP. Once we find one note in the memorystatus_klist
775 * that has the NOTE_MEMORYSTATUS_LOW_SWAP flags in its sfflags set, we call KNOTE with
776 * kMemoryStatusLowSwap as the hint to process and update all knotes on the memorystatus_klist accordingly. */
777 if (is_knote_registered_modify_task_pressure_bits(kn, NOTE_MEMORYSTATUS_LOW_SWAP, NULL, 0, 0) == TRUE) {
778 KNOTE(&memorystatus_klist, kMemorystatusLowSwap);
779 break;
780 }
781 }
782
783 memorystatus_klist_unlock();
784 }
785
786 #endif /* CONFIG_MEMORYSTATUS */
787
788 /*
789 * Notification telemetry
790 */
791 CA_EVENT(memorystatus_pressure_interval,
792 CA_INT, num_processes_registered,
793 CA_INT, num_notifications_sent,
794 CA_INT, max_level,
795 CA_INT, num_transitions,
796 CA_INT, num_kills,
797 CA_INT, duration);
798 static CA_EVENT_TYPE(memorystatus_pressure_interval) memorystatus_pressure_interval_telemetry;
799
800 CA_EVENT(memorystatus_proc_notification,
801 CA_INT, footprint_before_notification,
802 CA_INT, footprint_1_min_after_first_warning,
803 CA_INT, footprint_5_min_after_first_warning,
804 CA_INT, footprint_20_min_after_first_warning,
805 CA_INT, footprint_1_min_after_first_critical,
806 CA_INT, footprint_5_min_after_first_critical,
807 CA_INT, footprint_20_min_after_first_critical,
808 CA_INT, order_within_list,
809 CA_INT, num_notifications_sent,
810 CA_INT, time_between_warning_and_critical,
811 CA_STATIC_STRING(CA_PROCNAME_LEN), proc_name);
812
/*
 * The send timestamps for the first notifications are stored in the
 * knote's kn_sdata field, one slot per level.
 */
#define KNOTE_SEND_TIMESTAMP_WARNING_INDEX  0
#define KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX 1

/*
 * The footprint history for this task is stored in the knote's kn_ext
 * array. The struct is packed, and the static_assert below checks that
 * it is no larger than four 64-bit words.
 */
struct knote_footprint_history {
	uint32_t kfh_starting_footprint;
	uint32_t kfh_footprint_after_warn_1;      /* 1 minute after first warning notification */
	uint32_t kfh_footprint_after_warn_5;      /* 5 minutes after first warning notification */
	uint32_t kfh_footprint_after_warn_20;     /* 20 minutes after first warning notification */
	uint32_t kfh_footprint_after_critical_1;  /* 1 minute after first critical notification */
	uint32_t kfh_footprint_after_critical_5;  /* 5 minutes after first critical notification */
	uint32_t kfh_footprint_after_critical_20; /* 20 minutes after first critical notification */
	uint16_t kfh_num_notifications;
	uint16_t kfh_notification_order;
} __attribute__((packed));

static_assert(sizeof(struct knote_footprint_history) <= sizeof(uint64_t) * 4, "footprint history fits in knote extensions");
832
833 static void
mark_knote_send_time(struct knote * kn,task_t task,int knote_pressure_level,uint16_t order_within_list)834 mark_knote_send_time(struct knote *kn, task_t task, int knote_pressure_level, uint16_t order_within_list)
835 {
836 uint32_t *timestamps;
837 uint32_t index;
838 uint64_t curr_ts, curr_ts_seconds;
839 struct knote_footprint_history *footprint_history = (struct knote_footprint_history *)kn->kn_ext;
840 if (knote_pressure_level != NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
841 timestamps = (uint32_t *)&(kn->kn_sdata);
842 index = knote_pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN ?
843 KNOTE_SEND_TIMESTAMP_WARNING_INDEX : KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX;
844 if (timestamps[index] == 0) {
845 /* First notification for this level since pressure elevated from normal. */
846 curr_ts = mach_absolute_time();
847 curr_ts_seconds = 0;
848 absolutetime_to_nanoseconds(curr_ts, &curr_ts_seconds);
849 curr_ts_seconds /= NSEC_PER_SEC;
850
851 timestamps[index] = (uint32_t)MIN(UINT32_MAX, curr_ts_seconds);
852
853 /* Record task initial footprint */
854 if (timestamps[index == KNOTE_SEND_TIMESTAMP_WARNING_INDEX ? KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX : KNOTE_SEND_TIMESTAMP_WARNING_INDEX] == 0) {
855 /*
856 * First notification at any level since pressure elevated from normal.
857 * Record the footprint and our order in the notification list.
858 */
859 footprint_history->kfh_starting_footprint = (uint32_t) MIN(UINT32_MAX, get_task_phys_footprint(task) / (2UL << 20));
860 footprint_history->kfh_notification_order = order_within_list;
861 }
862 }
863 }
864 footprint_history->kfh_num_notifications++;
865 }
866
867 /*
868 * Records the current footprint for this task in the knote telemetry.
869 *
870 * Returns the soonest absolutetime when this footprint history should be updated again.
871 */
872 static uint64_t
update_knote_footprint_history(struct knote * kn,task_t task,uint64_t curr_ts)873 update_knote_footprint_history(struct knote *kn, task_t task, uint64_t curr_ts)
874 {
875 uint32_t *timestamps = (uint32_t *)&(kn->kn_sdata);
876 struct knote_footprint_history *footprint_history = (struct knote_footprint_history *)kn->kn_ext;
877 uint64_t warning_send_time, critical_send_time, minutes_since_warning = UINT64_MAX, minutes_since_critical = UINT64_MAX;
878 warning_send_time = timestamps[KNOTE_SEND_TIMESTAMP_WARNING_INDEX];
879 critical_send_time = timestamps[KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX];
880 uint32_t task_phys_footprint_mb = (uint32_t) MIN(UINT32_MAX, get_task_phys_footprint(task) / (2UL << 20));
881 uint64_t next_run = UINT64_MAX, absolutetime_in_minute = 0, minutes_since_last_notification = 0, curr_ts_s;
882 absolutetime_to_nanoseconds(curr_ts, &curr_ts_s);
883 nanoseconds_to_absolutetime(60 * NSEC_PER_SEC, &absolutetime_in_minute);
884 curr_ts_s /= NSEC_PER_SEC;
885
886 if (warning_send_time != 0) {
887 /* This task received a warning notification. */
888 minutes_since_warning = (curr_ts_s - warning_send_time) / 60;
889 if (footprint_history->kfh_footprint_after_warn_1 == 0 && minutes_since_warning >= 1) {
890 footprint_history->kfh_footprint_after_warn_1 = task_phys_footprint_mb;
891 }
892 if (footprint_history->kfh_footprint_after_warn_5 == 0 && minutes_since_warning >= 5) {
893 footprint_history->kfh_footprint_after_warn_5 = task_phys_footprint_mb;
894 }
895 if (footprint_history->kfh_footprint_after_warn_20 == 0 && minutes_since_warning >= 20) {
896 footprint_history->kfh_footprint_after_warn_20 = task_phys_footprint_mb;
897 }
898 }
899 if (critical_send_time != 0) {
900 /* This task received a critical notification. */
901 minutes_since_critical = (curr_ts_s - critical_send_time) / 60;
902 if (footprint_history->kfh_footprint_after_critical_1 == 0 && minutes_since_critical >= 1) {
903 footprint_history->kfh_footprint_after_critical_1 = task_phys_footprint_mb;
904 }
905 if (footprint_history->kfh_footprint_after_critical_5 == 0 && minutes_since_critical >= 5) {
906 footprint_history->kfh_footprint_after_critical_5 = task_phys_footprint_mb;
907 }
908 if (footprint_history->kfh_footprint_after_critical_20 == 0 && minutes_since_critical >= 20) {
909 footprint_history->kfh_footprint_after_critical_20 = task_phys_footprint_mb;
910 }
911 }
912
913 minutes_since_last_notification = MIN(minutes_since_warning, minutes_since_critical);
914 if (minutes_since_last_notification < 20) {
915 if (minutes_since_last_notification < 5) {
916 if (minutes_since_last_notification < 1) {
917 next_run = curr_ts + absolutetime_in_minute;
918 } else {
919 next_run = curr_ts + (absolutetime_in_minute * 5);
920 }
921 } else {
922 next_run = curr_ts + (absolutetime_in_minute * 20);
923 }
924 }
925
926 return next_run;
927 }
928
929 extern char *proc_name_address(void *p);
930 /*
931 * Attempt to send the given level telemetry event.
932 * Finalizes the duration.
933 * Clears the src_event struct.
934 */
935 static void
memorystatus_pressure_interval_send(CA_EVENT_TYPE (memorystatus_pressure_interval)* src_event)936 memorystatus_pressure_interval_send(CA_EVENT_TYPE(memorystatus_pressure_interval) *src_event)
937 {
938 uint64_t duration_nanoseconds = 0;
939 uint64_t curr_ts = mach_absolute_time();
940 src_event->duration = curr_ts - src_event->duration;
941 absolutetime_to_nanoseconds(src_event->duration, &duration_nanoseconds);
942 src_event->duration = (int64_t) (duration_nanoseconds / NSEC_PER_SEC);
943
944 /*
945 * Drop the event rather than block for memory. We should be in a normal pressure level now,
946 * but we don't want to end up blocked in page_wait if there's a sudden spike in pressure.
947 */
948 ca_event_t event_wrapper = CA_EVENT_ALLOCATE_FLAGS(memorystatus_pressure_interval, Z_NOWAIT);
949 if (event_wrapper) {
950 memcpy(event_wrapper->data, src_event, sizeof(CA_EVENT_TYPE(memorystatus_pressure_interval)));
951 CA_EVENT_SEND(event_wrapper);
952 }
953 src_event->num_processes_registered = 0;
954 src_event->num_notifications_sent = 0;
955 src_event->max_level = 0;
956 src_event->num_transitions = 0;
957 src_event->num_kills = 0;
958 src_event->duration = 0;
959 }
960
961
962 /*
963 * Attempt to send the per-proc telemetry events.
964 * Clears the footprint histories on the knotes.
965 */
966 static void
memorystatus_pressure_proc_telemetry_send(void)967 memorystatus_pressure_proc_telemetry_send(void)
968 {
969 struct knote *kn = NULL;
970 memorystatus_klist_lock();
971 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
972 proc_t p = PROC_NULL;
973 struct knote_footprint_history *footprint_history = (struct knote_footprint_history *)kn->kn_ext;
974 uint32_t *timestamps = (uint32_t *)&(kn->kn_sdata);
975 uint32_t warning_send_time = timestamps[KNOTE_SEND_TIMESTAMP_WARNING_INDEX];
976 uint32_t critical_send_time = timestamps[KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX];
977 CA_EVENT_TYPE(memorystatus_proc_notification) * event = NULL;
978 if (warning_send_time != 0 || critical_send_time != 0) {
979 /*
980 * Drop the event rather than block for memory. We should be in a normal pressure level now,
981 * but we don't want to end up blocked in page_wait if there's a sudden spike in pressure.
982 */
983 ca_event_t event_wrapper = CA_EVENT_ALLOCATE_FLAGS(memorystatus_proc_notification, Z_NOWAIT | Z_ZERO);
984 if (event_wrapper) {
985 event = event_wrapper->data;
986
987 event->footprint_before_notification = footprint_history->kfh_starting_footprint;
988 event->footprint_1_min_after_first_warning = footprint_history->kfh_footprint_after_warn_1;
989 event->footprint_5_min_after_first_warning = footprint_history->kfh_footprint_after_warn_5;
990 event->footprint_20_min_after_first_warning = footprint_history->kfh_footprint_after_warn_20;
991 event->footprint_1_min_after_first_critical = footprint_history->kfh_footprint_after_critical_1;
992 event->footprint_5_min_after_first_critical = footprint_history->kfh_footprint_after_critical_5;
993 event->footprint_20_min_after_first_critical = footprint_history->kfh_footprint_after_critical_20;
994 event->num_notifications_sent = footprint_history->kfh_num_notifications;
995 if (warning_send_time != 0 && critical_send_time != 0) {
996 event->time_between_warning_and_critical = (critical_send_time - warning_send_time) / 60; // Minutes
997 }
998 event->order_within_list = footprint_history->kfh_notification_order;
999
1000 p = proc_ref(knote_get_kq(kn)->kq_p, false);
1001 if (p == NULL) {
1002 CA_EVENT_DEALLOCATE(event_wrapper);
1003 continue;
1004 }
1005 strlcpy(event->proc_name, proc_name_address(p), sizeof(event->proc_name));
1006
1007 proc_rele(p);
1008 CA_EVENT_SEND(event_wrapper);
1009 }
1010 }
1011 memset(footprint_history, 0, sizeof(*footprint_history));
1012 timestamps[KNOTE_SEND_TIMESTAMP_WARNING_INDEX] = 0;
1013 timestamps[KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX] = 0;
1014 }
1015 memorystatus_klist_unlock();
1016 }
1017
1018 /*
1019 * Send all telemetry associated with the increased pressure interval.
1020 */
1021 static void
memorystatus_pressure_telemetry_send(void)1022 memorystatus_pressure_telemetry_send(void)
1023 {
1024 LCK_MTX_ASSERT(&memorystatus_klist_mutex, LCK_MTX_ASSERT_NOTOWNED);
1025 memorystatus_pressure_interval_send(&memorystatus_pressure_interval_telemetry);
1026 memorystatus_pressure_proc_telemetry_send();
1027 }
1028
1029
1030 /*
1031 * kn_max - knote
1032 *
1033 * knote_pressure_level - to check if the knote is registered for this notification level.
1034 *
1035 * task - task whose bits we'll be modifying
1036 *
1037 * pressure_level_to_clear - if the task has been notified of this past level, clear that notification bit so that if/when we revert to that level, the task will be notified again.
1038 *
1039 * pressure_level_to_set - the task is about to be notified of this new level. Update the task's bit notification information appropriately.
1040 *
1041 */
1042
1043 static boolean_t
is_knote_registered_modify_task_pressure_bits(struct knote * kn_max,int knote_pressure_level,task_t task,vm_pressure_level_t pressure_level_to_clear,vm_pressure_level_t pressure_level_to_set)1044 is_knote_registered_modify_task_pressure_bits(struct knote *kn_max, int knote_pressure_level, task_t task, vm_pressure_level_t pressure_level_to_clear, vm_pressure_level_t pressure_level_to_set)
1045 {
1046 if (kn_max->kn_sfflags & knote_pressure_level) {
1047 if (pressure_level_to_clear && task_has_been_notified(task, pressure_level_to_clear) == TRUE) {
1048 task_clear_has_been_notified(task, pressure_level_to_clear);
1049 }
1050
1051 task_mark_has_been_notified(task, pressure_level_to_set);
1052 return TRUE;
1053 }
1054
1055 return FALSE;
1056 }
1057
1058 static void
memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear)1059 memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear)
1060 {
1061 struct knote *kn = NULL;
1062
1063 memorystatus_klist_lock();
1064
1065 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1066 proc_t p = knote_get_kq(kn)->kq_p;
1067
1068 if (p == proc_ref(p, false)) {
1069 task_clear_has_been_notified(p->task, pressure_level_to_clear);
1070 proc_rele(p);
1071 }
1072 }
1073
1074 memorystatus_klist_unlock();
1075 }
1076
/*
 * Entry point used by the vm_pressure_thread, which is
 * signalled from within vm_pageout_scan().
 *
 * Simply forwards to vm_dispatch_memory_pressure().
 */

void
consider_vm_pressure_events(void)
{
	vm_dispatch_memory_pressure();
}
1087
1088 static void
vm_dispatch_memory_pressure(void)1089 vm_dispatch_memory_pressure(void)
1090 {
1091 memorystatus_update_vm_pressure(FALSE);
1092 }
1093
1094 static struct knote *
vm_pressure_select_optimal_candidate_to_notify(struct klist * candidate_list,int level,boolean_t target_foreground_process,uint64_t * next_telemetry_update)1095 vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process, uint64_t *next_telemetry_update)
1096 {
1097 struct knote *kn = NULL, *kn_max = NULL;
1098 uint64_t resident_max = 0;/* MB */
1099 int selected_task_importance = 0;
1100 static int pressure_snapshot = -1;
1101 boolean_t pressure_increase = FALSE;
1102 uint64_t curr_ts = mach_absolute_time();
1103 *next_telemetry_update = UINT64_MAX;
1104
1105 if (pressure_snapshot == -1) {
1106 /*
1107 * Initial snapshot.
1108 */
1109 pressure_snapshot = level;
1110 pressure_increase = TRUE;
1111 } else {
1112 if (level && (level >= pressure_snapshot)) {
1113 pressure_increase = TRUE;
1114 } else {
1115 pressure_increase = FALSE;
1116 }
1117
1118 pressure_snapshot = level;
1119 }
1120
1121 if (pressure_increase == TRUE) {
1122 /*
1123 * We'll start by considering the largest
1124 * unimportant task in our list.
1125 */
1126 selected_task_importance = INT_MAX;
1127 } else {
1128 /*
1129 * We'll start by considering the largest
1130 * important task in our list.
1131 */
1132 selected_task_importance = 0;
1133 }
1134
1135 SLIST_FOREACH(kn, candidate_list, kn_selnext) {
1136 uint64_t resident_size = 0;/* MB */
1137 proc_t p = PROC_NULL;
1138 struct task* t = TASK_NULL;
1139 int curr_task_importance = 0;
1140 uint64_t telemetry_update = 0;
1141 boolean_t consider_knote = FALSE;
1142 boolean_t privileged_listener = FALSE;
1143
1144 p = proc_ref(knote_get_kq(kn)->kq_p, false);
1145 if (p == PROC_NULL) {
1146 continue;
1147 }
1148
1149 #if CONFIG_MEMORYSTATUS
1150 if (target_foreground_process == TRUE && !memorystatus_is_foreground_locked(p)) {
1151 /*
1152 * Skip process not marked foreground.
1153 */
1154 proc_rele(p);
1155 continue;
1156 }
1157 #endif /* CONFIG_MEMORYSTATUS */
1158
1159 t = (struct task *)(p->task);
1160 telemetry_update = update_knote_footprint_history(kn, t, curr_ts);
1161 *next_telemetry_update = MIN(*next_telemetry_update, telemetry_update);
1162
1163 vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(level);
1164
1165 if ((kn->kn_sfflags & dispatch_level) == 0) {
1166 proc_rele(p);
1167 continue;
1168 }
1169
1170 #if CONFIG_MEMORYSTATUS
1171 if (target_foreground_process == FALSE && !memorystatus_bg_pressure_eligible(p)) {
1172 VM_PRESSURE_DEBUG(1, "[vm_pressure] skipping process %d\n", proc_getpid(p));
1173 proc_rele(p);
1174 continue;
1175 }
1176 #endif /* CONFIG_MEMORYSTATUS */
1177
1178 #if XNU_TARGET_OS_OSX
1179 curr_task_importance = task_importance_estimate(t);
1180 #else /* XNU_TARGET_OS_OSX */
1181 curr_task_importance = p->p_memstat_effectivepriority;
1182 #endif /* XNU_TARGET_OS_OSX */
1183
1184 /*
1185 * Privileged listeners are only considered in the multi-level pressure scheme
1186 * AND only if the pressure is increasing.
1187 */
1188 if (level > 0) {
1189 if (task_has_been_notified(t, level) == FALSE) {
1190 /*
1191 * Is this a privileged listener?
1192 */
1193 if (task_low_mem_privileged_listener(t, FALSE, &privileged_listener) == 0) {
1194 if (privileged_listener) {
1195 kn_max = kn;
1196 proc_rele(p);
1197 goto done_scanning;
1198 }
1199 }
1200 } else {
1201 proc_rele(p);
1202 continue;
1203 }
1204 } else if (level == 0) {
1205 /*
1206 * Task wasn't notified when the pressure was increasing and so
1207 * no need to notify it that the pressure is decreasing.
1208 */
1209 if ((task_has_been_notified(t, kVMPressureWarning) == FALSE) && (task_has_been_notified(t, kVMPressureCritical) == FALSE)) {
1210 proc_rele(p);
1211 continue;
1212 }
1213 }
1214
1215 /*
1216 * We don't want a small process to block large processes from
1217 * being notified again. <rdar://problem/7955532>
1218 */
1219 resident_size = (get_task_phys_footprint(t)) / (1024 * 1024ULL); /* MB */
1220
1221 if (resident_size >= vm_pressure_task_footprint_min) {
1222 if (level > 0) {
1223 /*
1224 * Warning or Critical Pressure.
1225 */
1226 if (pressure_increase) {
1227 if ((curr_task_importance < selected_task_importance) ||
1228 ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
1229 /*
1230 * We have found a candidate process which is:
1231 * a) at a lower importance than the current selected process
1232 * OR
1233 * b) has importance equal to that of the current selected process but is larger
1234 */
1235
1236 consider_knote = TRUE;
1237 }
1238 } else {
1239 if ((curr_task_importance > selected_task_importance) ||
1240 ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
1241 /*
1242 * We have found a candidate process which is:
1243 * a) at a higher importance than the current selected process
1244 * OR
1245 * b) has importance equal to that of the current selected process but is larger
1246 */
1247
1248 consider_knote = TRUE;
1249 }
1250 }
1251 } else if (level == 0) {
1252 /*
1253 * Pressure back to normal.
1254 */
1255 if ((curr_task_importance > selected_task_importance) ||
1256 ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
1257 consider_knote = TRUE;
1258 }
1259 }
1260
1261 if (consider_knote) {
1262 resident_max = resident_size;
1263 kn_max = kn;
1264 selected_task_importance = curr_task_importance;
1265 consider_knote = FALSE; /* reset for the next candidate */
1266 }
1267 } else {
1268 /* There was no candidate with enough resident memory to scavenge */
1269 VM_PRESSURE_DEBUG(0, "[vm_pressure] threshold failed for pid %d with %llu resident...\n", proc_getpid(p), resident_size);
1270 }
1271 proc_rele(p);
1272 }
1273
1274 done_scanning:
1275 if (kn_max) {
1276 VM_DEBUG_CONSTANT_EVENT(vm_pressure_event, VM_PRESSURE_EVENT, DBG_FUNC_NONE, proc_getpid(knote_get_kq(kn_max)->kq_p), resident_max, 0, 0);
1277 VM_PRESSURE_DEBUG(1, "[vm_pressure] sending event to pid %d with %llu resident\n", proc_getpid(knote_get_kq(kn_max)->kq_p), resident_max);
1278 }
1279
1280 return kn_max;
1281 }
1282
1283 /*
1284 * To avoid notification storms in a system with sawtooth behavior of pressure levels eg:
1285 * Normal -> warning (notify clients) -> critical (notify) -> warning (notify) -> critical (notify) -> warning (notify)...
1286 *
1287 * We have 'resting' periods: WARNING_NOTIFICATION_RESTING_PERIOD and CRITICAL_NOTIFICATION_RESTING_PERIOD
1288 *
1289 * So it would look like:-
1290 * Normal -> warning (notify) -> critical (notify) -> warning (notify if it has been RestPeriod since last warning) -> critical (notify if it has been RestPeriod since last critical) -> ...
1291 *
1292 * That's what these 2 timestamps below signify.
1293 */
1294
1295 uint64_t next_warning_notification_sent_at_ts = 0;
1296 uint64_t next_critical_notification_sent_at_ts = 0;
1297
1298 boolean_t memorystatus_manual_testing_on = FALSE;
1299 vm_pressure_level_t memorystatus_manual_testing_level = kVMPressureNormal;
1300
1301 unsigned int memorystatus_sustained_pressure_maximum_band = JETSAM_PRIORITY_IDLE;
1302 #if DEVELOPMENT || DEBUG
1303 SYSCTL_INT(_kern, OID_AUTO, memorystatus_sustained_pressure_maximum_band, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_sustained_pressure_maximum_band, 0, "");
1304 #endif /* DEVELOPMENT || DEBUG */
1305
1306 #if CONFIG_JETSAM
1307
1308 static void
sustained_pressure_handler(void * arg0 __unused,void * arg1 __unused)1309 sustained_pressure_handler(void* arg0 __unused, void* arg1 __unused)
1310 {
1311 int max_kills = 0, kill_count = 0;
1312 /*
1313 * Pressure has been elevated for too long.
1314 * We don't want to leave the system in this state as it can delay background
1315 * work indefinitely & drain battery.
1316 *
1317 * Try to return the system to normal via jetsam.
1318 * We'll run through the idle band up to 2 times.
1319 * If the pressure hasn't been relieved by then, the problem is memory
1320 * consumption in a higher band and this churn is probably doing more harm than good.
1321 */
1322 max_kills = memorystatus_get_proccnt_upto_priority(memorystatus_sustained_pressure_maximum_band) * 2;
1323 os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: Pressure level has been elevated for too long. killing up to %d idle processes", max_kills);
1324 while (memorystatus_vm_pressure_level != kVMPressureNormal && kill_count < max_kills) {
1325 boolean_t killed = memorystatus_kill_on_sustained_pressure(false);
1326 if (killed) {
1327 /*
1328 * Pause before our next kill & see if pressure reduces.
1329 */
1330 delay((int)(memorystatus_kill_on_sustained_pressure_delay_ms * NSEC_PER_MSEC / NSEC_PER_USEC));
1331 kill_count++;
1332 memorystatus_kill_on_sustained_pressure_count++;
1333 /* TODO(jason): Should use os_atomic but requires rdar://76310894. */
1334 memorystatus_pressure_interval_telemetry.num_kills++;
1335 } else {
1336 /* Nothing left to kill */
1337 break;
1338 }
1339 }
1340 if (memorystatus_vm_pressure_level != kVMPressureNormal) {
1341 os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: Killed %d idle processes due to sustained pressure, but device didn't quiesce. Giving up.", kill_count);
1342 }
1343 }
1344
1345 #endif /* CONFIG_JETSAM */
1346
1347 /*
1348 * Returns the number of processes registered for notifications at this level.
1349 */
1350 static size_t
memorystatus_klist_length(int level)1351 memorystatus_klist_length(int level)
1352 {
1353 LCK_MTX_ASSERT(&memorystatus_klist_mutex, LCK_MTX_ASSERT_OWNED);
1354 struct knote *kn;
1355 size_t count = 0;
1356 int knote_pressure_level = convert_internal_pressure_level_to_dispatch_level(level);
1357 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1358 if (kn->kn_sfflags & knote_pressure_level) {
1359 count++;
1360 }
1361 }
1362 return count;
1363 }
1364
1365 /*
1366 * Updates the footprint telemetry for procs that have received notifications.
1367 */
1368 static void
update_footprints_for_telemetry(void * arg0 __unused,void * arg1 __unused)1369 update_footprints_for_telemetry(void* arg0 __unused, void* arg1 __unused)
1370 {
1371 uint64_t curr_ts = mach_absolute_time(), next_telemetry_update = UINT64_MAX;
1372 struct knote *kn;
1373
1374 memorystatus_klist_lock();
1375 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1376 proc_t p = PROC_NULL;
1377 struct task* t = TASK_NULL;
1378 uint64_t telemetry_update;
1379
1380 p = proc_ref(knote_get_kq(kn)->kq_p, false);
1381 if (p == PROC_NULL) {
1382 continue;
1383 }
1384 t = (struct task *)(p->task);
1385 proc_rele(p);
1386 p = PROC_NULL;
1387 telemetry_update = update_knote_footprint_history(kn, t, curr_ts);
1388 next_telemetry_update = MIN(next_telemetry_update, telemetry_update);
1389 }
1390 memorystatus_klist_unlock();
1391 if (next_telemetry_update != UINT64_MAX) {
1392 uint64_t next_update_seconds;
1393 absolutetime_to_nanoseconds(next_telemetry_update, &next_update_seconds);
1394 next_update_seconds /= NSEC_PER_SEC;
1395 thread_call_enter_delayed(memorystatus_notify_update_telemetry_thread_call, next_telemetry_update);
1396 }
1397 }
1398
1399 kern_return_t
memorystatus_update_vm_pressure(boolean_t target_foreground_process)1400 memorystatus_update_vm_pressure(boolean_t target_foreground_process)
1401 {
1402 struct knote *kn_max = NULL;
1403 struct knote *kn_cur = NULL, *kn_temp = NULL;/* for safe list traversal */
1404 pid_t target_pid = -1;
1405 struct klist dispatch_klist = { NULL };
1406 proc_t target_proc = PROC_NULL;
1407 struct task *task = NULL;
1408 boolean_t found_candidate = FALSE;
1409
1410 static vm_pressure_level_t level_snapshot = kVMPressureNormal;
1411 static vm_pressure_level_t prev_level_snapshot = kVMPressureNormal;
1412 boolean_t smoothing_window_started = FALSE;
1413 struct timeval smoothing_window_start_tstamp = {0, 0};
1414 struct timeval curr_tstamp = {0, 0};
1415 int64_t elapsed_msecs = 0;
1416 uint64_t curr_ts = mach_absolute_time(), next_telemetry_update = UINT64_MAX;
1417
1418
1419 uint64_t logging_now;
1420 absolutetime_to_nanoseconds(curr_ts, &logging_now);
1421 #if !CONFIG_JETSAM
1422 #define MAX_IDLE_KILLS 100 /* limit the number of idle kills allowed */
1423
1424 int idle_kill_counter = 0;
1425
1426 /*
1427 * On desktop we take this opportunity to free up memory pressure
1428 * by immediately killing idle exitable processes. We use a delay
1429 * to avoid overkill. And we impose a max counter as a fail safe
1430 * in case daemons re-launch too fast.
1431 */
1432 while ((memorystatus_vm_pressure_level != kVMPressureNormal) && (idle_kill_counter < MAX_IDLE_KILLS)) {
1433 if (memorystatus_idle_exit_from_VM() == FALSE) {
1434 /* No idle exitable processes left to kill */
1435 break;
1436 }
1437 idle_kill_counter++;
1438
1439 if (memorystatus_manual_testing_on == TRUE) {
1440 /*
1441 * Skip the delay when testing
1442 * the pressure notification scheme.
1443 */
1444 } else {
1445 delay(1000000); /* 1 second */
1446 }
1447 }
1448 #endif /* !CONFIG_JETSAM */
1449
1450 if (level_snapshot != kVMPressureNormal) {
1451 /*
1452 * Check to see if we are still in the 'resting' period
1453 * after having notified all clients interested in
1454 * a particular pressure level.
1455 */
1456
1457 level_snapshot = memorystatus_vm_pressure_level;
1458
1459 if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
1460 if (next_warning_notification_sent_at_ts) {
1461 if (curr_ts < next_warning_notification_sent_at_ts) {
1462 delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
1463 return KERN_SUCCESS;
1464 }
1465
1466 next_warning_notification_sent_at_ts = 0;
1467 memorystatus_klist_reset_all_for_level(kVMPressureWarning);
1468 }
1469 } else if (level_snapshot == kVMPressureCritical) {
1470 if (next_critical_notification_sent_at_ts) {
1471 if (curr_ts < next_critical_notification_sent_at_ts) {
1472 delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
1473 return KERN_SUCCESS;
1474 }
1475 next_critical_notification_sent_at_ts = 0;
1476 memorystatus_klist_reset_all_for_level(kVMPressureCritical);
1477 }
1478 }
1479 }
1480
1481 #if CONFIG_JETSAM
1482 if (memorystatus_vm_pressure_level == kVMPressureNormal && prev_level_snapshot != kVMPressureNormal) {
1483 if (memorystatus_should_kill_on_sustained_pressure) {
1484 os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: Pressure has returned to level %d. Cancelling scheduled jetsam", memorystatus_vm_pressure_level);
1485 thread_call_cancel(sustained_pressure_handler_thread_call);
1486 }
1487 } else if (memorystatus_should_kill_on_sustained_pressure && memorystatus_vm_pressure_level != kVMPressureNormal && prev_level_snapshot == kVMPressureNormal) {
1488 /*
1489 * Pressure has increased from normal.
1490 * Hopefully the notifications will relieve it,
1491 * but as a fail-safe we'll trigger jetsam
1492 * after a configurable amount of time.
1493 */
1494 os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: Pressure level has increased from %d to %d. Scheduling jetsam.", prev_level_snapshot, memorystatus_vm_pressure_level);
1495 uint64_t kill_time;
1496 nanoseconds_to_absolutetime(memorystatus_kill_on_sustained_pressure_window_s * NSEC_PER_SEC, &kill_time);
1497 kill_time += mach_absolute_time();
1498 thread_call_enter_delayed(sustained_pressure_handler_thread_call, kill_time);
1499 }
1500 #endif /* CONFIG_JETSAM */
1501
1502 while (1) {
1503 /*
1504 * There is a race window here. But it's not clear
1505 * how much we benefit from having extra synchronization.
1506 */
1507 level_snapshot = memorystatus_vm_pressure_level;
1508
1509 if (prev_level_snapshot > level_snapshot) {
1510 /*
1511 * Pressure decreased? Let's take a little breather
1512 * and see if this condition stays.
1513 */
1514 if (smoothing_window_started == FALSE) {
1515 smoothing_window_started = TRUE;
1516 microuptime(&smoothing_window_start_tstamp);
1517 }
1518
1519 microuptime(&curr_tstamp);
1520 timevalsub(&curr_tstamp, &smoothing_window_start_tstamp);
1521 elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000;
1522
1523 if (elapsed_msecs < VM_PRESSURE_DECREASED_SMOOTHING_PERIOD) {
1524 delay(INTER_NOTIFICATION_DELAY);
1525 continue;
1526 }
1527 }
1528 if (level_snapshot == kVMPressureNormal) {
1529 memorystatus_pressure_telemetry_send();
1530 }
1531 prev_level_snapshot = level_snapshot;
1532 smoothing_window_started = FALSE;
1533 memorystatus_klist_lock();
1534
1535 if (level_snapshot > memorystatus_pressure_interval_telemetry.max_level) {
1536 memorystatus_pressure_interval_telemetry.num_processes_registered = memorystatus_klist_length(level_snapshot);
1537 memorystatus_pressure_interval_telemetry.max_level = level_snapshot;
1538 memorystatus_pressure_interval_telemetry.num_transitions++;
1539 if (memorystatus_pressure_interval_telemetry.duration == 0) {
1540 /* Set the start timestamp. Duration will be finalized when we send the event. */
1541 memorystatus_pressure_interval_telemetry.duration = curr_ts;
1542 }
1543 }
1544
1545 kn_max = vm_pressure_select_optimal_candidate_to_notify(&memorystatus_klist, level_snapshot, target_foreground_process, &next_telemetry_update);
1546
1547 if (kn_max == NULL) {
1548 memorystatus_klist_unlock();
1549
1550 /*
1551 * No more level-based clients to notify.
1552 *
1553 * Start the 'resting' window within which clients will not be re-notified.
1554 */
1555
1556 if (level_snapshot != kVMPressureNormal) {
1557 if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
1558 nanoseconds_to_absolutetime(WARNING_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);
1559
1560 /* Next warning notification (if nothing changes) won't be sent before...*/
1561 next_warning_notification_sent_at_ts = mach_absolute_time() + curr_ts;
1562 }
1563
1564 if (level_snapshot == kVMPressureCritical) {
1565 nanoseconds_to_absolutetime(CRITICAL_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);
1566
1567 /* Next critical notification (if nothing changes) won't be sent before...*/
1568 next_critical_notification_sent_at_ts = mach_absolute_time() + curr_ts;
1569 }
1570 }
1571 absolutetime_to_nanoseconds(mach_absolute_time(), &logging_now);
1572 if (next_telemetry_update != UINT64_MAX) {
1573 thread_call_enter_delayed(memorystatus_notify_update_telemetry_thread_call, next_telemetry_update);
1574 } else {
1575 thread_call_cancel(memorystatus_notify_update_telemetry_thread_call);
1576 }
1577 return KERN_FAILURE;
1578 }
1579
1580 target_proc = proc_ref(knote_get_kq(kn_max)->kq_p, false);
1581 if (target_proc == PROC_NULL) {
1582 memorystatus_klist_unlock();
1583 continue;
1584 }
1585
1586 target_pid = proc_getpid(target_proc);
1587
1588 task = (struct task *)(target_proc->task);
1589
1590 if (level_snapshot != kVMPressureNormal) {
1591 if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
1592 if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_WARN, task, 0, kVMPressureWarning) == TRUE) {
1593 found_candidate = TRUE;
1594 }
1595 } else {
1596 if (level_snapshot == kVMPressureCritical) {
1597 if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, task, 0, kVMPressureCritical) == TRUE) {
1598 found_candidate = TRUE;
1599 }
1600 }
1601 }
1602 } else {
1603 if (kn_max->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
1604 task_clear_has_been_notified(task, kVMPressureWarning);
1605 task_clear_has_been_notified(task, kVMPressureCritical);
1606
1607 found_candidate = TRUE;
1608 }
1609 }
1610
1611 if (found_candidate == FALSE) {
1612 proc_rele(target_proc);
1613 memorystatus_klist_unlock();
1614 continue;
1615 }
1616
1617 SLIST_FOREACH_SAFE(kn_cur, &memorystatus_klist, kn_selnext, kn_temp) {
1618 int knote_pressure_level = convert_internal_pressure_level_to_dispatch_level(level_snapshot);
1619
1620 if (is_knote_registered_modify_task_pressure_bits(kn_cur, knote_pressure_level, task, 0, level_snapshot) == TRUE) {
1621 proc_t knote_proc = knote_get_kq(kn_cur)->kq_p;
1622 pid_t knote_pid = proc_getpid(knote_proc);
1623 if (knote_pid == target_pid) {
1624 KNOTE_DETACH(&memorystatus_klist, kn_cur);
1625 KNOTE_ATTACH(&dispatch_klist, kn_cur);
1626 }
1627 }
1628 }
1629 if (level_snapshot != kVMPressureNormal) {
1630 mark_knote_send_time(kn_max, task, convert_internal_pressure_level_to_dispatch_level(level_snapshot),
1631 (uint16_t) MIN(UINT16_MAX, memorystatus_pressure_interval_telemetry.num_notifications_sent));
1632 memorystatus_pressure_interval_telemetry.num_notifications_sent++;
1633 }
1634
1635 KNOTE(&dispatch_klist, (level_snapshot != kVMPressureNormal) ? kMemorystatusPressure : kMemorystatusNoPressure);
1636
1637 SLIST_FOREACH_SAFE(kn_cur, &dispatch_klist, kn_selnext, kn_temp) {
1638 KNOTE_DETACH(&dispatch_klist, kn_cur);
1639 KNOTE_ATTACH(&memorystatus_klist, kn_cur);
1640 }
1641
1642 memorystatus_klist_unlock();
1643
1644 microuptime(&target_proc->vm_pressure_last_notify_tstamp);
1645 proc_rele(target_proc);
1646
1647 if (memorystatus_manual_testing_on == TRUE && target_foreground_process == TRUE) {
1648 break;
1649 }
1650
1651 if (memorystatus_manual_testing_on == TRUE) {
1652 /*
1653 * Testing out the pressure notification scheme.
1654 * No need for delays etc.
1655 */
1656 } else {
1657 uint32_t sleep_interval = INTER_NOTIFICATION_DELAY;
1658 #if CONFIG_JETSAM
1659 unsigned int page_delta = 0;
1660 unsigned int skip_delay_page_threshold = 0;
1661
1662 assert(memorystatus_available_pages_pressure >= memorystatus_available_pages_critical_base);
1663
1664 page_delta = (memorystatus_available_pages_pressure - memorystatus_available_pages_critical_base) / 2;
1665 skip_delay_page_threshold = memorystatus_available_pages_pressure - page_delta;
1666
1667 if (memorystatus_available_pages <= skip_delay_page_threshold) {
1668 /*
1669 * We are nearing the critcal mark fast and can't afford to wait between
1670 * notifications.
1671 */
1672 sleep_interval = 0;
1673 }
1674 #endif /* CONFIG_JETSAM */
1675
1676 if (sleep_interval) {
1677 delay(sleep_interval);
1678 }
1679 }
1680 }
1681
1682 return KERN_SUCCESS;
1683 }
1684
1685 static uint32_t
convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level)1686 convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level)
1687 {
1688 uint32_t dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
1689
1690 switch (internal_pressure_level) {
1691 case kVMPressureNormal:
1692 {
1693 dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
1694 break;
1695 }
1696
1697 case kVMPressureWarning:
1698 case kVMPressureUrgent:
1699 {
1700 dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_WARN;
1701 break;
1702 }
1703
1704 case kVMPressureCritical:
1705 {
1706 dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
1707 break;
1708 }
1709
1710 default:
1711 break;
1712 }
1713
1714 return dispatch_level;
1715 }
1716
1717 /*
1718 * Notify any kexts that are waiting for notification that jetsam
1719 * is approaching the foreground bands. They should use this notification
1720 * to free cached memory.
1721 */
1722 void
memorystatus_issue_fg_band_notify(void)1723 memorystatus_issue_fg_band_notify(void)
1724 {
1725 uint64_t now;
1726
1727 lck_mtx_lock(&memorystatus_jetsam_fg_band_lock);
1728 absolutetime_to_nanoseconds(mach_absolute_time(), &now);
1729 if (now - memorystatus_jetsam_fg_band_timestamp_ns < memorystatus_jetsam_fg_band_delay_ns) {
1730 lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
1731 return;
1732 }
1733
1734 if (memorystatus_jetsam_fg_band_waiters > 0) {
1735 thread_wakeup(&memorystatus_jetsam_fg_band_waiters);
1736 memorystatus_jetsam_fg_band_waiters = 0;
1737 memorystatus_jetsam_fg_band_timestamp_ns = now;
1738 }
1739 lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
1740
1741 /* Notify the buffer cache, file systems, etc. to jetison everything they can. */
1742 if (consider_buffer_cache_collect != NULL) {
1743 (void)(*consider_buffer_cache_collect)(1);
1744 }
1745 }
1746
1747
1748 /*
1749 * Memorystatus notification debugging support
1750 */
1751
1752 static int
1753 sysctl_memorystatus_vm_pressure_level SYSCTL_HANDLER_ARGS
1754 {
1755 #pragma unused(arg1, arg2, oidp)
1756 #if !XNU_TARGET_OS_OSX
1757 int error = 0;
1758
1759 error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
1760 if (error) {
1761 return error;
1762 }
1763
1764 #endif /* !XNU_TARGET_OS_OSX */
1765 uint32_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(memorystatus_vm_pressure_level);
1766
1767 return SYSCTL_OUT(req, &dispatch_level, sizeof(dispatch_level));
1768 }
1769
1770 #if DEBUG || DEVELOPMENT
1771
1772 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
1773 0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");
1774
1775 #else /* DEBUG || DEVELOPMENT */
1776
1777 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
1778 0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");
1779
1780 #endif /* DEBUG || DEVELOPMENT */
1781
1782 /*
1783 * Trigger levels to test the mechanism.
1784 * Can be used via a sysctl.
1785 */
1786 #define TEST_LOW_MEMORY_TRIGGER_ONE 1
1787 #define TEST_LOW_MEMORY_TRIGGER_ALL 2
1788 #define TEST_PURGEABLE_TRIGGER_ONE 3
1789 #define TEST_PURGEABLE_TRIGGER_ALL 4
1790 #define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE 5
1791 #define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL 6
1792
1793 static int
1794 sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS
1795 {
1796 #pragma unused(arg1, arg2)
1797
1798 int level = 0;
1799 int error = 0;
1800 int pressure_level = 0;
1801 int trigger_request = 0;
1802 int force_purge;
1803
1804 error = sysctl_handle_int(oidp, &level, 0, req);
1805 if (error || !req->newptr) {
1806 return error;
1807 }
1808
1809 memorystatus_manual_testing_on = TRUE;
1810
1811 trigger_request = (level >> 16) & 0xFFFF;
1812 pressure_level = (level & 0xFFFF);
1813
1814 if (trigger_request < TEST_LOW_MEMORY_TRIGGER_ONE ||
1815 trigger_request > TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL) {
1816 return EINVAL;
1817 }
1818 switch (pressure_level) {
1819 case NOTE_MEMORYSTATUS_PRESSURE_NORMAL:
1820 case NOTE_MEMORYSTATUS_PRESSURE_WARN:
1821 case NOTE_MEMORYSTATUS_PRESSURE_CRITICAL:
1822 break;
1823 default:
1824 return EINVAL;
1825 }
1826
1827 /*
1828 * The pressure level is being set from user-space.
1829 * And user-space uses the constants in sys/event.h
1830 * So we translate those events to our internal levels here.
1831 */
1832 if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
1833 memorystatus_manual_testing_level = kVMPressureNormal;
1834 force_purge = 0;
1835 } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN) {
1836 memorystatus_manual_testing_level = kVMPressureWarning;
1837 force_purge = vm_pageout_state.memorystatus_purge_on_warning;
1838 } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
1839 memorystatus_manual_testing_level = kVMPressureCritical;
1840 force_purge = vm_pageout_state.memorystatus_purge_on_critical;
1841 }
1842
1843 memorystatus_vm_pressure_level = memorystatus_manual_testing_level;
1844
1845 /* purge according to the new pressure level */
1846 switch (trigger_request) {
1847 case TEST_PURGEABLE_TRIGGER_ONE:
1848 case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE:
1849 if (force_purge == 0) {
1850 /* no purging requested */
1851 break;
1852 }
1853 vm_purgeable_object_purge_one_unlocked(force_purge);
1854 break;
1855 case TEST_PURGEABLE_TRIGGER_ALL:
1856 case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL:
1857 if (force_purge == 0) {
1858 /* no purging requested */
1859 break;
1860 }
1861 while (vm_purgeable_object_purge_one_unlocked(force_purge)) {
1862 ;
1863 }
1864 break;
1865 }
1866
1867 if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ONE) ||
1868 (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE)) {
1869 memorystatus_update_vm_pressure(TRUE);
1870 }
1871
1872 if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ALL) ||
1873 (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL)) {
1874 while (memorystatus_update_vm_pressure(FALSE) == KERN_SUCCESS) {
1875 continue;
1876 }
1877 }
1878
1879 if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
1880 memorystatus_manual_testing_on = FALSE;
1881 }
1882
1883 return 0;
1884 }
1885
1886 SYSCTL_PROC(_kern, OID_AUTO, memorypressure_manual_trigger, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
1887 0, 0, &sysctl_memorypressure_manual_trigger, "I", "");
1888
1889
/*
 * Read-write integer sysctls under "kern" backed directly by fields of
 * vm_pageout_state. sysctl_memorypressure_manual_trigger() passes the
 * purge_on_warning / purge_on_critical values to
 * vm_purgeable_object_purge_one_unlocked(); a value of 0 means no purge
 * is performed at that level.
 * NOTE(review): purge_on_urgent is presumably the analogous setting for
 * kVMPressureUrgent -- it is not read anywhere in this part of the file.
 */
SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_warning, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_warning, 0, "");
SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_urgent, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_urgent, 0, "");
SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_critical, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_critical, 0, "");

/*
 * Read-write sysctl for vm_pressure_level_transition_threshold, which is
 * defined outside this file (hence the extern declaration).
 */
extern int vm_pressure_level_transition_threshold;
SYSCTL_INT(_kern, OID_AUTO, vm_pressure_level_transition_threshold, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_level_transition_threshold, 0, "");
1896
1897 #if DEBUG || DEVELOPMENT
/* Read-write sysctl exposing vm_pressure_events_enabled; registered on DEBUG/DEVELOPMENT kernels only. */
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_events_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_events_enabled, 0, "");
1899
1900 #if 0
1901 #if CONFIG_JETSAM && VM_PRESSURE_EVENTS
1902 static boolean_t
1903 memorystatus_issue_pressure_kevent(boolean_t pressured)
1904 {
1905 memorystatus_klist_lock();
1906 KNOTE(&memorystatus_klist, pressured ? kMemorystatusPressure : kMemorystatusNoPressure);
1907 memorystatus_klist_unlock();
1908 return TRUE;
1909 }
1910 #endif /* CONFIG_JETSAM && VM_PRESSURE_EVENTS */
1911 #endif /* 0 */
1912
1913 /*
1914 * This routine is used for targeted notifications regardless of system memory pressure
1915 * and regardless of whether or not the process has already been notified.
1916 * It bypasses and has no effect on the only-one-notification per soft-limit policy.
1917 *
1918 * "memnote" is the current user.
1919 */
1920
1921 static int
1922 sysctl_memorystatus_vm_pressure_send SYSCTL_HANDLER_ARGS
1923 {
1924 #pragma unused(arg1, arg2)
1925 /* Need to be root or have memorystatus entitlement */
1926 if (!kauth_cred_issuser(kauth_cred_get()) && !IOCurrentTaskHasEntitlement(MEMORYSTATUS_ENTITLEMENT)) {
1927 return EPERM;
1928 }
1929
1930 int error = 0, pid = 0;
1931 struct knote *kn = NULL;
1932 boolean_t found_knote = FALSE;
1933 int fflags = 0; /* filter flags for EVFILT_MEMORYSTATUS */
1934 uint64_t value = 0;
1935
1936 error = sysctl_handle_quad(oidp, &value, 0, req);
1937 if (error || !req->newptr) {
1938 return error;
1939 }
1940
1941 /*
1942 * Find the pid in the low 32 bits of value passed in.
1943 */
1944 pid = (int)(value & 0xFFFFFFFF);
1945
1946 /*
1947 * Find notification in the high 32 bits of the value passed in.
1948 */
1949 fflags = (int)((value >> 32) & 0xFFFFFFFF);
1950
1951 /*
1952 * For backwards compatibility, when no notification is
1953 * passed in, default to the NOTE_MEMORYSTATUS_PRESSURE_WARN
1954 */
1955 if (fflags == 0) {
1956 fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
1957 // printf("memorystatus_vm_pressure_send: using default notification [0x%x]\n", fflags);
1958 }
1959
1960 /* wake up everybody waiting for kVMPressureJetsam */
1961 if (fflags == NOTE_MEMORYSTATUS_JETSAM_FG_BAND) {
1962 memorystatus_issue_fg_band_notify();
1963 return error;
1964 }
1965
1966 /*
1967 * See event.h ... fflags for EVFILT_MEMORYSTATUS
1968 */
1969 if (!((fflags == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) ||
1970 (fflags == NOTE_MEMORYSTATUS_PRESSURE_WARN) ||
1971 (fflags == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) ||
1972 (fflags == NOTE_MEMORYSTATUS_LOW_SWAP) ||
1973 (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) ||
1974 (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) ||
1975 (((fflags & NOTE_MEMORYSTATUS_MSL_STATUS) != 0 &&
1976 ((fflags & ~NOTE_MEMORYSTATUS_MSL_STATUS) == 0))))) {
1977 printf("memorystatus_vm_pressure_send: notification [0x%x] not supported \n", fflags);
1978 error = 1;
1979 return error;
1980 }
1981
1982 /*
1983 * Forcibly send pid a memorystatus notification.
1984 */
1985
1986 memorystatus_klist_lock();
1987
1988 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1989 proc_t knote_proc = knote_get_kq(kn)->kq_p;
1990 pid_t knote_pid = proc_getpid(knote_proc);
1991
1992 if (knote_pid == pid) {
1993 /*
1994 * Forcibly send this pid a memorystatus notification.
1995 */
1996 kn->kn_fflags = fflags;
1997 found_knote = TRUE;
1998 }
1999 }
2000
2001 if (found_knote) {
2002 KNOTE(&memorystatus_klist, 0);
2003 printf("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] sent to process [%d] \n", value, fflags, pid);
2004 error = 0;
2005 } else {
2006 printf("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] not sent to process [%d] (none registered?)\n", value, fflags, pid);
2007 error = 1;
2008 }
2009
2010 memorystatus_klist_unlock();
2011
2012 return error;
2013 }
2014
2015 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_send, CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_ANYBODY,
2016 0, 0, &sysctl_memorystatus_vm_pressure_send, "Q", "");
2017
2018 #endif /* DEBUG || DEVELOPMENT */
2019
2020 #endif /* VM_PRESSURE_EVENTS */
2021