xref: /xnu-10002.1.13/bsd/kern/kern_memorystatus_notify.c (revision 1031c584a5e37aff177559b9f69dbd3c8c3fd30a)
1 /*
2  * Copyright (c) 2006-2018 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  *
28  */
29 
30 #include <sys/kern_event.h>
31 #include <kern/sched_prim.h>
32 #include <kern/assert.h>
33 #include <kern/debug.h>
34 #include <kern/locks.h>
35 #include <kern/task.h>
36 #include <kern/thread.h>
37 #include <kern/thread_call.h>
38 #include <kern/host.h>
39 #include <kern/policy_internal.h>
40 #include <kern/thread_group.h>
41 
42 #include <IOKit/IOBSD.h>
43 
44 #include <libkern/libkern.h>
45 #include <libkern/coreanalytics/coreanalytics.h>
46 #include <mach/coalition.h>
47 #include <mach/clock_types.h>
48 #include <mach/mach_time.h>
49 #include <mach/task.h>
50 #include <mach/host_priv.h>
51 #include <mach/mach_host.h>
52 #include <os/log.h>
53 #include <pexpert/pexpert.h>
54 #include <sys/coalition.h>
55 #include <sys/kern_event.h>
56 #include <sys/proc.h>
57 #include <sys/proc_info.h>
58 #include <sys/reason.h>
59 #include <sys/signal.h>
60 #include <sys/signalvar.h>
61 #include <sys/sysctl.h>
62 #include <sys/sysproto.h>
63 #include <sys/time.h>
64 #include <sys/wait.h>
65 #include <sys/tree.h>
66 #include <sys/priv.h>
67 #include <vm/vm_pageout.h>
68 #include <vm/vm_protos.h>
69 #include <mach/machine/sdt.h>
70 #include <libkern/section_keywords.h>
71 #include <stdatomic.h>
72 
73 #if CONFIG_FREEZE
74 #include <vm/vm_map.h>
75 #endif /* CONFIG_FREEZE */
76 
77 #include <kern/kern_memorystatus_internal.h>
78 #include <sys/kern_memorystatus.h>
79 #include <sys/kern_memorystatus_notify.h>
80 
81 /*
82  * Memorystatus klist structures
83  */
84 struct klist memorystatus_klist;
85 static lck_mtx_t memorystatus_klist_mutex;
86 static void memorystatus_klist_lock(void);
87 static void memorystatus_klist_unlock(void);
88 
89 /*
90  * Memorystatus kevent filter routines
91  */
92 static int filt_memorystatusattach(struct knote *kn, struct kevent_qos_s *kev);
93 static void filt_memorystatusdetach(struct knote *kn);
94 static int filt_memorystatus(struct knote *kn, long hint);
95 static int filt_memorystatustouch(struct knote *kn, struct kevent_qos_s *kev);
96 static int filt_memorystatusprocess(struct knote *kn, struct kevent_qos_s *kev);
97 
98 SECURITY_READ_ONLY_EARLY(struct filterops) memorystatus_filtops = {
99 	.f_attach = filt_memorystatusattach,
100 	.f_detach = filt_memorystatusdetach,
101 	.f_event = filt_memorystatus,
102 	.f_touch = filt_memorystatustouch,
103 	.f_process = filt_memorystatusprocess,
104 };
105 
/*
 * Memorystatus notification events
 *
 * These values are handed to KNOTE() as the hint and decoded by
 * filt_memorystatus(). Each event occupies its own bit.
 */
enum {
	kMemorystatusNoPressure        = 1 << 0,
	kMemorystatusPressure          = 1 << 1,
	kMemorystatusLowSwap           = 1 << 2,
	kMemorystatusProcLimitWarn     = 1 << 3,
	kMemorystatusProcLimitCritical = 1 << 4
};
116 
/* Pacing constants for pressure notifications (units noted per constant). */
#define INTER_NOTIFICATION_DELAY                  (250 * 1000)  /* .25 second, i.e. expressed in microseconds */
#define VM_PRESSURE_DECREASED_SMOOTHING_PERIOD    (5 * 1000)    /* milliseconds */
#define WARNING_NOTIFICATION_RESTING_PERIOD       (25)          /* seconds */
#define CRITICAL_NOTIFICATION_RESTING_PERIOD      (25)          /* seconds */
121 
122 /*
123  * Memorystatus notification helper routines
124  */
125 static vm_pressure_level_t convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);
126 static boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t);
127 static void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear);
128 static struct knote *vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process, uint64_t *next_telemetry_update);
129 static void vm_dispatch_memory_pressure(void);
130 kern_return_t memorystatus_update_vm_pressure(boolean_t target_foreground_process);
131 
#if VM_PRESSURE_EVENTS

/*
 * This value is the threshold that a process must meet to be considered for scavenging.
 * The default differs between macOS and the other targets.
 */
#if !XNU_TARGET_OS_OSX
#define VM_PRESSURE_MINIMUM_RSIZE        6    /* MB */
#else /* XNU_TARGET_OS_OSX */
#define VM_PRESSURE_MINIMUM_RSIZE        10    /* MB */
#endif /* !XNU_TARGET_OS_OSX */

static uint32_t vm_pressure_task_footprint_min = VM_PRESSURE_MINIMUM_RSIZE;

#if DEVELOPMENT || DEBUG
/* Development/debug kernels expose the threshold as a writable sysctl. */
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_task_footprint_min, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_task_footprint_min, 0, "");
#endif /* DEVELOPMENT || DEBUG */

/* Current pressure level; filt_memorystatus() consults it for pressure hints. */
vm_pressure_level_t memorystatus_vm_pressure_level = kVMPressureNormal;

/*
 * We use this flag to signal if we have any HWM offenders
 * on the system. This way we can reduce the number of wakeups
 * of the memorystatus_thread when the system is between the
 * "pressure" and "critical" threshold.
 *
 * The (re-)setting of this variable is done without any locks
 * or synchronization simply because it is not possible (currently)
 * to keep track of HWM offenders that drop down below their memory
 * limit and/or exit. So, we choose to burn a couple of wasted wakeups
 * by allowing the unguarded modification of this variable.
 */
boolean_t memorystatus_hwm_candidates = 0;

#endif /* VM_PRESSURE_EVENTS */
166 
167 #if CONFIG_JETSAM
168 
169 extern unsigned int memorystatus_available_pages;
170 extern unsigned int memorystatus_available_pages_pressure;
171 extern unsigned int memorystatus_available_pages_critical;
172 extern unsigned int memorystatus_available_pages_critical_base;
173 extern unsigned int memorystatus_available_pages_critical_idle_offset;
174 
175 #else /* CONFIG_JETSAM */
176 
177 extern uint64_t memorystatus_available_pages;
178 extern uint64_t memorystatus_available_pages_pressure;
179 extern uint64_t memorystatus_available_pages_critical;
180 
181 #endif /* CONFIG_JETSAM */
182 
183 extern lck_mtx_t memorystatus_jetsam_fg_band_lock;
184 uint32_t memorystatus_jetsam_fg_band_waiters = 0;
185 static uint64_t memorystatus_jetsam_fg_band_timestamp_ns = 0; /* nanosec */
186 static uint64_t memorystatus_jetsam_fg_band_delay_ns = 5ull * 1000 * 1000 * 1000; /* nanosec */
187 
188 extern boolean_t(*volatile consider_buffer_cache_collect)(int);
189 
190 #if DEVELOPMENT || DEBUG
191 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_jetsam_fg_band_delay_ns, CTLFLAG_RW | CTLFLAG_LOCKED,
192     &memorystatus_jetsam_fg_band_delay_ns, "");
193 #endif
194 
195 static int
filt_memorystatusattach(struct knote * kn,__unused struct kevent_qos_s * kev)196 filt_memorystatusattach(struct knote *kn, __unused struct kevent_qos_s *kev)
197 {
198 	int error;
199 
200 	kn->kn_flags |= EV_CLEAR; /* automatically set */
201 	kn->kn_sdata = 0;         /* incoming data is ignored */
202 	memset(&kn->kn_ext, 0, sizeof(kn->kn_ext));
203 
204 	error = memorystatus_knote_register(kn);
205 	if (error) {
206 		knote_set_error(kn, error);
207 	}
208 	return 0;
209 }
210 
/*
 * f_detach handler for the memorystatus filter: detaching is nothing more
 * than removing the knote from the global memorystatus klist.
 */
static void
filt_memorystatusdetach(struct knote *kn)
{
	memorystatus_knote_unregister(kn);
}
216 
217 static int
filt_memorystatus(struct knote * kn __unused,long hint)218 filt_memorystatus(struct knote *kn __unused, long hint)
219 {
220 	if (hint) {
221 		switch (hint) {
222 		case kMemorystatusNoPressure:
223 			if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
224 				kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
225 			}
226 			break;
227 		case kMemorystatusPressure:
228 			if (memorystatus_vm_pressure_level == kVMPressureWarning || memorystatus_vm_pressure_level == kVMPressureUrgent) {
229 				if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
230 					kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
231 				}
232 			} else if (memorystatus_vm_pressure_level == kVMPressureCritical) {
233 				if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
234 					kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
235 				}
236 			}
237 			break;
238 		case kMemorystatusLowSwap:
239 			if (kn->kn_sfflags & NOTE_MEMORYSTATUS_LOW_SWAP) {
240 				kn->kn_fflags = NOTE_MEMORYSTATUS_LOW_SWAP;
241 			}
242 			break;
243 
244 		case kMemorystatusProcLimitWarn:
245 			if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
246 				kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
247 			}
248 			break;
249 
250 		case kMemorystatusProcLimitCritical:
251 			if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
252 				kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
253 			}
254 			break;
255 
256 		default:
257 			break;
258 		}
259 	}
260 
261 #if 0
262 	if (kn->kn_fflags != 0) {
263 		proc_t knote_proc = knote_get_kq(kn)->kq_p;
264 		pid_t knote_pid = proc_getpid(knote_proc);
265 
266 		printf("filt_memorystatus: sending kn 0x%lx (event 0x%x) for pid (%d)\n",
267 		    (unsigned long)kn, kn->kn_fflags, knote_pid);
268 	}
269 #endif
270 
271 	return kn->kn_fflags != 0;
272 }
273 
274 static int
filt_memorystatustouch(struct knote * kn,struct kevent_qos_s * kev)275 filt_memorystatustouch(struct knote *kn, struct kevent_qos_s *kev)
276 {
277 	int res;
278 	int prev_kn_sfflags = 0;
279 
280 	memorystatus_klist_lock();
281 
282 	/*
283 	 * copy in new kevent settings
284 	 * (saving the "desired" data and fflags).
285 	 */
286 
287 	prev_kn_sfflags = kn->kn_sfflags;
288 	kn->kn_sfflags = (kev->fflags & EVFILT_MEMORYSTATUS_ALL_MASK);
289 
290 #if XNU_TARGET_OS_OSX
291 	/*
292 	 * Only on desktop do we restrict notifications to
293 	 * one per active/inactive state (soft limits only).
294 	 */
295 	if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
296 		/*
297 		 * Is there previous state to preserve?
298 		 */
299 		if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
300 			/*
301 			 * This knote was previously interested in proc_limit_warn,
302 			 * so yes, preserve previous state.
303 			 */
304 			if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
305 				kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
306 			}
307 			if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
308 				kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
309 			}
310 		} else {
311 			/*
312 			 * This knote was not previously interested in proc_limit_warn,
313 			 * but it is now.  Set both states.
314 			 */
315 			kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
316 			kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
317 		}
318 	}
319 
320 	if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
321 		/*
322 		 * Is there previous state to preserve?
323 		 */
324 		if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
325 			/*
326 			 * This knote was previously interested in proc_limit_critical,
327 			 * so yes, preserve previous state.
328 			 */
329 			if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
330 				kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
331 			}
332 			if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
333 				kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
334 			}
335 		} else {
336 			/*
337 			 * This knote was not previously interested in proc_limit_critical,
338 			 * but it is now.  Set both states.
339 			 */
340 			kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
341 			kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
342 		}
343 	}
344 #endif /* XNU_TARGET_OS_OSX */
345 
346 	/*
347 	 * reset the output flags based on a
348 	 * combination of the old events and
349 	 * the new desired event list.
350 	 */
351 	//kn->kn_fflags &= kn->kn_sfflags;
352 
353 	res = (kn->kn_fflags != 0);
354 
355 	memorystatus_klist_unlock();
356 
357 	return res;
358 }
359 
360 static int
filt_memorystatusprocess(struct knote * kn,struct kevent_qos_s * kev)361 filt_memorystatusprocess(struct knote *kn, struct kevent_qos_s *kev)
362 {
363 	int res = 0;
364 
365 	memorystatus_klist_lock();
366 	if (kn->kn_fflags) {
367 		knote_fill_kevent(kn, kev, 0);
368 		res = 1;
369 	}
370 	memorystatus_klist_unlock();
371 
372 	return res;
373 }
374 
375 static void
memorystatus_klist_lock(void)376 memorystatus_klist_lock(void)
377 {
378 	lck_mtx_lock(&memorystatus_klist_mutex);
379 }
380 
381 static void
memorystatus_klist_unlock(void)382 memorystatus_klist_unlock(void)
383 {
384 	lck_mtx_unlock(&memorystatus_klist_mutex);
385 }
386 
387 void
memorystatus_kevent_init(lck_grp_t * grp,lck_attr_t * attr)388 memorystatus_kevent_init(lck_grp_t *grp, lck_attr_t *attr)
389 {
390 	lck_mtx_init(&memorystatus_klist_mutex, grp, attr);
391 	klist_init(&memorystatus_klist);
392 }
393 
394 int
memorystatus_knote_register(struct knote * kn)395 memorystatus_knote_register(struct knote *kn)
396 {
397 	int error = 0;
398 
399 	memorystatus_klist_lock();
400 
401 	/*
402 	 * Support only userspace visible flags.
403 	 */
404 	if ((kn->kn_sfflags & EVFILT_MEMORYSTATUS_ALL_MASK) == (unsigned int) kn->kn_sfflags) {
405 #if XNU_TARGET_OS_OSX
406 		if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
407 			kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
408 			kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
409 		}
410 
411 		if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
412 			kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
413 			kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
414 		}
415 #endif /* XNU_TARGET_OS_OSX */
416 
417 		KNOTE_ATTACH(&memorystatus_klist, kn);
418 	} else {
419 		error = ENOTSUP;
420 	}
421 
422 	memorystatus_klist_unlock();
423 
424 	return error;
425 }
426 
427 void
memorystatus_knote_unregister(struct knote * kn __unused)428 memorystatus_knote_unregister(struct knote *kn __unused)
429 {
430 	memorystatus_klist_lock();
431 	KNOTE_DETACH(&memorystatus_klist, kn);
432 	memorystatus_klist_unlock();
433 }
434 
435 #if VM_PRESSURE_EVENTS
436 
437 #if CONFIG_JETSAM
438 
439 static thread_call_t sustained_pressure_handler_thread_call;
440 int memorystatus_should_kill_on_sustained_pressure = 1;
441 /* Count the number of sustained pressure kills we've done since boot. */
442 uint64_t memorystatus_kill_on_sustained_pressure_count = 0;
443 uint64_t memorystatus_kill_on_sustained_pressure_window_s = 60 * 10; /* 10 Minutes */
444 uint64_t memorystatus_kill_on_sustained_pressure_delay_ms = 500; /* .5 seconds */
445 
446 #if DEVELOPMENT || DEBUG
447 SYSCTL_INT(_kern, OID_AUTO, memorystatus_should_kill_on_sustained_pressure, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_should_kill_on_sustained_pressure, 0, "");
448 #endif /* DEVELOPMENT || DEBUG */
449 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_count, "");
450 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_window_s, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_window_s, "");
451 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_delay_ms, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_delay_ms, "");
452 
453 static void sustained_pressure_handler(void*, void*);
454 #endif /* CONFIG_JETSAM */
455 static thread_call_t memorystatus_notify_update_telemetry_thread_call;
456 static void update_footprints_for_telemetry(void*, void*);
457 
458 
459 void
memorystatus_notify_init()460 memorystatus_notify_init()
461 {
462 #if CONFIG_JETSAM
463 	sustained_pressure_handler_thread_call = thread_call_allocate_with_options(sustained_pressure_handler, NULL, THREAD_CALL_PRIORITY_KERNEL_HIGH, THREAD_CALL_OPTIONS_ONCE);
464 #endif /* CONFIG_JETSAM */
465 	memorystatus_notify_update_telemetry_thread_call = thread_call_allocate_with_options(update_footprints_for_telemetry, NULL, THREAD_CALL_PRIORITY_USER, THREAD_CALL_OPTIONS_ONCE);
466 }
467 
468 #if CONFIG_MEMORYSTATUS
469 
470 inline int
memorystatus_send_note(int event_code,void * data,uint32_t data_length)471 memorystatus_send_note(int event_code, void *data, uint32_t data_length)
472 {
473 	int ret;
474 	struct kev_msg ev_msg;
475 
476 	ev_msg.vendor_code    = KEV_VENDOR_APPLE;
477 	ev_msg.kev_class      = KEV_SYSTEM_CLASS;
478 	ev_msg.kev_subclass   = KEV_MEMORYSTATUS_SUBCLASS;
479 
480 	ev_msg.event_code     = event_code;
481 
482 	ev_msg.dv[0].data_length = data_length;
483 	ev_msg.dv[0].data_ptr = data;
484 	ev_msg.dv[1].data_length = 0;
485 
486 	ret = kev_post_msg(&ev_msg);
487 	if (ret) {
488 		memorystatus_log_error("%s: kev_post_msg() failed, err %d\n", __func__, ret);
489 	}
490 
491 	return ret;
492 }
493 
494 boolean_t
memorystatus_warn_process(const proc_t p,__unused boolean_t is_active,__unused boolean_t is_fatal,boolean_t limit_exceeded)495 memorystatus_warn_process(const proc_t p, __unused boolean_t is_active, __unused boolean_t is_fatal, boolean_t limit_exceeded)
496 {
497 	/*
498 	 * This function doesn't take a reference to p or lock it. So it better be the current process.
499 	 */
500 	assert(p == current_proc());
501 	pid_t pid = proc_getpid(p);
502 	boolean_t ret = FALSE;
503 	boolean_t found_knote = FALSE;
504 	struct knote *kn = NULL;
505 	int send_knote_count = 0;
506 	uint32_t platform;
507 	platform = proc_platform(p);
508 
509 	/*
510 	 * See comment in sysctl_memorystatus_vm_pressure_send.
511 	 */
512 
513 	memorystatus_klist_lock();
514 
515 	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
516 		proc_t knote_proc = knote_get_kq(kn)->kq_p;
517 		pid_t knote_pid = proc_getpid(knote_proc);
518 
519 		if (knote_pid == pid) {
520 			/*
521 			 * By setting the "fflags" here, we are forcing
522 			 * a process to deal with the case where it's
523 			 * bumping up into its memory limits. If we don't
524 			 * do this here, we will end up depending on the
525 			 * system pressure snapshot evaluation in
526 			 * filt_memorystatus().
527 			 */
528 
529 			/*
530 			 * The type of notification and the frequency are different between
531 			 * embedded and desktop.
532 			 *
533 			 * Embedded processes register for global pressure notifications
534 			 * (NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) via UIKit
535 			 * (see applicationDidReceiveMemoryWarning in UIKit). We'll warn them here if
536 			 * they are near there memory limit. filt_memorystatus() will warn them based
537 			 * on the system pressure level.
538 			 *
539 			 * On desktop, (NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL)
540 			 * are only expected to fire for system level warnings. Desktop procesess
541 			 * register for NOTE_MEMORYSTATUS_PROC_LIMIT_WARN
542 			 * if they want to be warned when they approach their limit
543 			 * and for NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL to be warned when they
544 			 * exceed their limit.
545 			 *
546 			 * On embedded we continuously warn processes that are approaching their
547 			 * memory limit. However on desktop, we only send one warning while
548 			 * the process is active/inactive if the limit is soft..
549 			 *
550 			 */
551 			if (platform == PLATFORM_MACOS || platform == PLATFORM_MACCATALYST || platform == PLATFORM_DRIVERKIT) {
552 				if (!limit_exceeded) {
553 					if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
554 						found_knote = TRUE;
555 						if (!is_fatal) {
556 							/*
557 							 * Restrict proc_limit_warn notifications when
558 							 * non-fatal (soft) limit is at play.
559 							 */
560 							if (is_active) {
561 								if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
562 									/*
563 									 * Mark this knote for delivery.
564 									 */
565 									kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
566 									/*
567 									 * And suppress it from future notifications.
568 									 */
569 									kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
570 									send_knote_count++;
571 								}
572 							} else {
573 								if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
574 									/*
575 									 * Mark this knote for delivery.
576 									 */
577 									kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
578 									/*
579 									 * And suppress it from future notifications.
580 									 */
581 									kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
582 									send_knote_count++;
583 								}
584 							}
585 						} else {
586 							/*
587 							 * No restriction on proc_limit_warn notifications when
588 							 * fatal (hard) limit is at play.
589 							 */
590 							kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
591 							send_knote_count++;
592 						}
593 					}
594 				} else {
595 					/*
596 					 * Send this notification when a process has exceeded a soft limit,
597 					 */
598 
599 					if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
600 						found_knote = TRUE;
601 						if (!is_fatal) {
602 							/*
603 							 * Restrict critical notifications for soft limits.
604 							 */
605 
606 							if (is_active) {
607 								if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
608 									/*
609 									 * Suppress future proc_limit_critical notifications
610 									 * for the active soft limit.
611 									 */
612 									kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
613 									kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
614 									send_knote_count++;
615 								}
616 							} else {
617 								if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
618 									/*
619 									 * Suppress future proc_limit_critical_notifications
620 									 * for the inactive soft limit.
621 									 */
622 									kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
623 									kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
624 									send_knote_count++;
625 								}
626 							}
627 						} else {
628 							/*
629 							 * We should never be trying to send a critical notification for
630 							 * a hard limit... the process would be killed before it could be
631 							 * received.
632 							 */
633 							panic("Caught sending pid %d a critical warning for a fatal limit.", pid);
634 						}
635 					}
636 				}
637 			} else {
638 				if (!limit_exceeded) {
639 					/*
640 					 * Intentionally set either the unambiguous limit warning,
641 					 * the system-wide critical or the system-wide warning
642 					 * notification bit.
643 					 */
644 
645 					if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
646 						kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
647 						found_knote = TRUE;
648 						send_knote_count++;
649 					} else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
650 						kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
651 						found_knote = TRUE;
652 						send_knote_count++;
653 					} else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
654 						kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
655 						found_knote = TRUE;
656 						send_knote_count++;
657 					}
658 				} else {
659 					/*
660 					 * Send this notification when a process has exceeded a soft limit.
661 					 */
662 					if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
663 						kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
664 						found_knote = TRUE;
665 						send_knote_count++;
666 					}
667 				}
668 			}
669 		}
670 	}
671 
672 	if (found_knote) {
673 		if (send_knote_count > 0) {
674 			KNOTE(&memorystatus_klist, 0);
675 		}
676 		ret = TRUE;
677 	}
678 
679 	memorystatus_klist_unlock();
680 
681 	return ret;
682 }
683 
684 /*
685  * Can only be set by the current task on itself.
686  */
687 int
memorystatus_low_mem_privileged_listener(uint32_t op_flags)688 memorystatus_low_mem_privileged_listener(uint32_t op_flags)
689 {
690 	boolean_t set_privilege = FALSE;
691 	/*
692 	 * Need an entitlement check here?
693 	 */
694 	if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE) {
695 		set_privilege = TRUE;
696 	} else if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE) {
697 		set_privilege = FALSE;
698 	} else {
699 		return EINVAL;
700 	}
701 
702 	return task_low_mem_privileged_listener(current_task(), set_privilege, NULL);
703 }
704 
705 int
memorystatus_send_pressure_note(pid_t pid)706 memorystatus_send_pressure_note(pid_t pid)
707 {
708 	memorystatus_log_debug("memorystatus_send_pressure_note(): pid %d\n", pid);
709 	return memorystatus_send_note(kMemorystatusPressureNote, &pid, sizeof(pid));
710 }
711 
712 boolean_t
memorystatus_is_foreground_locked(proc_t p)713 memorystatus_is_foreground_locked(proc_t p)
714 {
715 	return (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND) ||
716 	       (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND_SUPPORT);
717 }
718 
719 /*
720  * This is meant for stackshot and kperf -- it does not take the proc_list_lock
721  * to access the p_memstat_dirty field.
722  */
723 void
memorystatus_proc_flags_unsafe(void * v,boolean_t * is_dirty,boolean_t * is_dirty_tracked,boolean_t * allow_idle_exit)724 memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit)
725 {
726 	if (!v) {
727 		*is_dirty = FALSE;
728 		*is_dirty_tracked = FALSE;
729 		*allow_idle_exit = FALSE;
730 	} else {
731 		proc_t p = (proc_t)v;
732 		*is_dirty = (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) != 0;
733 		*is_dirty_tracked = (p->p_memstat_dirty & P_DIRTY_TRACK) != 0;
734 		*allow_idle_exit = (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) != 0;
735 	}
736 }
737 
738 boolean_t
memorystatus_bg_pressure_eligible(proc_t p)739 memorystatus_bg_pressure_eligible(proc_t p)
740 {
741 	boolean_t eligible = FALSE;
742 
743 	proc_list_lock();
744 
745 	memorystatus_log_debug("memorystatus_bg_pressure_eligible: pid %d, state 0x%x\n", proc_getpid(p), p->p_memstat_state);
746 
747 	/* Foreground processes have already been dealt with at this point, so just test for eligibility */
748 	if (!(p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN))) {
749 		eligible = TRUE;
750 	}
751 
752 	if (p->p_memstat_effectivepriority < JETSAM_PRIORITY_BACKGROUND_OPPORTUNISTIC) {
753 		/*
754 		 * IDLE and IDLE_DEFERRED bands contain processes
755 		 * that have dropped memory to be under their inactive
756 		 * memory limits. And so they can't really give back
757 		 * anything.
758 		 */
759 		eligible = FALSE;
760 	}
761 
762 	proc_list_unlock();
763 
764 	return eligible;
765 }
766 
767 void
memorystatus_send_low_swap_note(void)768 memorystatus_send_low_swap_note(void)
769 {
770 	struct knote *kn = NULL;
771 
772 	memorystatus_klist_lock();
773 	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
774 		/* We call is_knote_registered_modify_task_pressure_bits to check if the sfflags for the
775 		 * current note contain NOTE_MEMORYSTATUS_LOW_SWAP. Once we find one note in the memorystatus_klist
776 		 * that has the NOTE_MEMORYSTATUS_LOW_SWAP flags in its sfflags set, we call KNOTE with
777 		 * kMemoryStatusLowSwap as the hint to process and update all knotes on the memorystatus_klist accordingly. */
778 		if (is_knote_registered_modify_task_pressure_bits(kn, NOTE_MEMORYSTATUS_LOW_SWAP, NULL, 0, 0) == TRUE) {
779 			KNOTE(&memorystatus_klist, kMemorystatusLowSwap);
780 			break;
781 		}
782 	}
783 
784 	memorystatus_klist_unlock();
785 }
786 
787 #endif /* CONFIG_MEMORYSTATUS */
788 
789 /*
790  * Notification telemetry
791  */
792 CA_EVENT(memorystatus_pressure_interval,
793     CA_INT, num_processes_registered,
794     CA_INT, num_notifications_sent,
795     CA_INT, max_level,
796     CA_INT, num_transitions,
797     CA_INT, num_kills,
798     CA_INT, duration);
799 static CA_EVENT_TYPE(memorystatus_pressure_interval) memorystatus_pressure_interval_telemetry;
800 
801 CA_EVENT(memorystatus_proc_notification,
802     CA_INT, footprint_before_notification,
803     CA_INT, footprint_1_min_after_first_warning,
804     CA_INT, footprint_5_min_after_first_warning,
805     CA_INT, footprint_20_min_after_first_warning,
806     CA_INT, footprint_1_min_after_first_critical,
807     CA_INT, footprint_5_min_after_first_critical,
808     CA_INT, footprint_20_min_after_first_critical,
809     CA_INT, order_within_list,
810     CA_INT, num_notifications_sent,
811     CA_INT, time_between_warning_and_critical,
812     CA_STATIC_STRING(CA_PROCNAME_LEN), proc_name);
813 
/*
 * The send timestamps for the first notifications are stored in the knote's
 * kn_sdata field, viewed as an array of two uint32_t slots. These are the
 * slot indices for the warning and critical levels.
 */
#define KNOTE_SEND_TIMESTAMP_WARNING_INDEX  (0)
#define KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX (1)
817 
/*
 * The footprint history for this task is stored in the knote's kn_ext array.
 *
 * The struct is packed and must not outgrow four uint64_t words; the
 * static_assert below enforces that bound. Field order is part of the
 * layout and must not change.
 */
struct knote_footprint_history {
	/* Footprint recorded at the first notification of any level. */
	uint32_t kfh_starting_footprint;

	/* Footprint samples taken after the first warning notification. */
	uint32_t kfh_footprint_after_warn_1;      /* 1 minute later */
	uint32_t kfh_footprint_after_warn_5;      /* 5 minutes later */
	uint32_t kfh_footprint_after_warn_20;     /* 20 minutes later */

	/* Footprint samples taken after the first critical notification. */
	uint32_t kfh_footprint_after_critical_1;  /* 1 minute later */
	uint32_t kfh_footprint_after_critical_5;  /* 5 minutes later */
	uint32_t kfh_footprint_after_critical_20; /* 20 minutes later */

	/* Number of notifications sent, and position in the notification list. */
	uint16_t kfh_num_notifications;
	uint16_t kfh_notification_order;
} __attribute__((packed));


static_assert(sizeof(struct knote_footprint_history) <= sizeof(uint64_t) * 4, "footprint history fits in knote extensions");
833 
834 static void
mark_knote_send_time(struct knote * kn,task_t task,int knote_pressure_level,uint16_t order_within_list)835 mark_knote_send_time(struct knote *kn, task_t task, int knote_pressure_level, uint16_t order_within_list)
836 {
837 	uint32_t *timestamps;
838 	uint32_t index;
839 	uint64_t curr_ts, curr_ts_seconds;
840 	struct knote_footprint_history *footprint_history = (struct knote_footprint_history *)kn->kn_ext;
841 	if (knote_pressure_level != NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
842 		timestamps = (uint32_t *)&(kn->kn_sdata);
843 		index = knote_pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN ?
844 		    KNOTE_SEND_TIMESTAMP_WARNING_INDEX : KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX;
845 		if (timestamps[index] == 0) {
846 			/* First notification for this level since pressure elevated from normal. */
847 			curr_ts = mach_absolute_time();
848 			curr_ts_seconds = 0;
849 			absolutetime_to_nanoseconds(curr_ts, &curr_ts_seconds);
850 			curr_ts_seconds /= NSEC_PER_SEC;
851 
852 			timestamps[index] = (uint32_t)MIN(UINT32_MAX, curr_ts_seconds);
853 
854 			/* Record task initial footprint */
855 			if (timestamps[index == KNOTE_SEND_TIMESTAMP_WARNING_INDEX ? KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX : KNOTE_SEND_TIMESTAMP_WARNING_INDEX] == 0) {
856 				/*
857 				 * First notification at any level since pressure elevated from normal.
858 				 * Record the footprint and our order in the notification list.
859 				 */
860 				footprint_history->kfh_starting_footprint = (uint32_t) MIN(UINT32_MAX, get_task_phys_footprint(task) / (2UL << 20));
861 				footprint_history->kfh_notification_order = order_within_list;
862 			}
863 		}
864 	}
865 	footprint_history->kfh_num_notifications++;
866 }
867 
868 /*
869  * Records the current footprint for this task in the knote telemetry.
870  *
871  * Returns the soonest absolutetime when this footprint history should be updated again.
872  */
873 static uint64_t
update_knote_footprint_history(struct knote * kn,task_t task,uint64_t curr_ts)874 update_knote_footprint_history(struct knote *kn, task_t task, uint64_t curr_ts)
875 {
876 	uint32_t *timestamps = (uint32_t *)&(kn->kn_sdata);
877 	struct knote_footprint_history *footprint_history = (struct knote_footprint_history *)kn->kn_ext;
878 	uint64_t warning_send_time, critical_send_time, minutes_since_warning = UINT64_MAX, minutes_since_critical = UINT64_MAX;
879 	warning_send_time = timestamps[KNOTE_SEND_TIMESTAMP_WARNING_INDEX];
880 	critical_send_time = timestamps[KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX];
881 	uint32_t task_phys_footprint_mb = (uint32_t) MIN(UINT32_MAX, get_task_phys_footprint(task) / (2UL << 20));
882 	uint64_t next_run = UINT64_MAX, absolutetime_in_minute = 0, minutes_since_last_notification = 0, curr_ts_s;
883 	absolutetime_to_nanoseconds(curr_ts, &curr_ts_s);
884 	nanoseconds_to_absolutetime(60 * NSEC_PER_SEC, &absolutetime_in_minute);
885 	curr_ts_s /= NSEC_PER_SEC;
886 
887 	if (warning_send_time != 0) {
888 		/* This task received a warning notification. */
889 		minutes_since_warning = (curr_ts_s - warning_send_time) / 60;
890 		if (footprint_history->kfh_footprint_after_warn_1 == 0 && minutes_since_warning >= 1) {
891 			footprint_history->kfh_footprint_after_warn_1 = task_phys_footprint_mb;
892 		}
893 		if (footprint_history->kfh_footprint_after_warn_5 == 0 && minutes_since_warning >= 5) {
894 			footprint_history->kfh_footprint_after_warn_5 = task_phys_footprint_mb;
895 		}
896 		if (footprint_history->kfh_footprint_after_warn_20 == 0 && minutes_since_warning >= 20) {
897 			footprint_history->kfh_footprint_after_warn_20 = task_phys_footprint_mb;
898 		}
899 	}
900 	if (critical_send_time != 0) {
901 		/* This task received a critical notification. */
902 		minutes_since_critical = (curr_ts_s - critical_send_time) / 60;
903 		if (footprint_history->kfh_footprint_after_critical_1 == 0 && minutes_since_critical >= 1) {
904 			footprint_history->kfh_footprint_after_critical_1 = task_phys_footprint_mb;
905 		}
906 		if (footprint_history->kfh_footprint_after_critical_5 == 0 && minutes_since_critical >= 5) {
907 			footprint_history->kfh_footprint_after_critical_5 = task_phys_footprint_mb;
908 		}
909 		if (footprint_history->kfh_footprint_after_critical_20 == 0 && minutes_since_critical >= 20) {
910 			footprint_history->kfh_footprint_after_critical_20 = task_phys_footprint_mb;
911 		}
912 	}
913 
914 	minutes_since_last_notification = MIN(minutes_since_warning, minutes_since_critical);
915 	if (minutes_since_last_notification < 20) {
916 		if (minutes_since_last_notification < 5) {
917 			if (minutes_since_last_notification < 1) {
918 				next_run = curr_ts + absolutetime_in_minute;
919 			} else {
920 				next_run = curr_ts + (absolutetime_in_minute * 5);
921 			}
922 		} else {
923 			next_run = curr_ts + (absolutetime_in_minute * 20);
924 		}
925 	}
926 
927 	return next_run;
928 }
929 
930 extern char *proc_name_address(void *p);
931 /*
932  * Attempt to send the given level telemetry event.
933  * Finalizes the duration.
934  * Clears the src_event struct.
935  */
936 static void
memorystatus_pressure_interval_send(CA_EVENT_TYPE (memorystatus_pressure_interval)* src_event)937 memorystatus_pressure_interval_send(CA_EVENT_TYPE(memorystatus_pressure_interval) *src_event)
938 {
939 	uint64_t duration_nanoseconds = 0;
940 	uint64_t             curr_ts = mach_absolute_time();
941 	src_event->duration = curr_ts - src_event->duration;
942 	absolutetime_to_nanoseconds(src_event->duration, &duration_nanoseconds);
943 	src_event->duration = (int64_t) (duration_nanoseconds / NSEC_PER_SEC);
944 
945 	/*
946 	 * Drop the event rather than block for memory. We should be in a normal pressure level now,
947 	 * but we don't want to end up blocked in page_wait if there's a sudden spike in pressure.
948 	 */
949 	ca_event_t event_wrapper = CA_EVENT_ALLOCATE_FLAGS(memorystatus_pressure_interval, Z_NOWAIT);
950 	if (event_wrapper) {
951 		memcpy(event_wrapper->data, src_event, sizeof(CA_EVENT_TYPE(memorystatus_pressure_interval)));
952 		CA_EVENT_SEND(event_wrapper);
953 	}
954 	src_event->num_processes_registered = 0;
955 	src_event->num_notifications_sent = 0;
956 	src_event->max_level = 0;
957 	src_event->num_transitions = 0;
958 	src_event->num_kills = 0;
959 	src_event->duration = 0;
960 }
961 
962 
963 /*
964  * Attempt to send the per-proc telemetry events.
965  * Clears the footprint histories on the knotes.
966  */
967 static void
memorystatus_pressure_proc_telemetry_send(void)968 memorystatus_pressure_proc_telemetry_send(void)
969 {
970 	struct knote *kn = NULL;
971 	memorystatus_klist_lock();
972 	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
973 		proc_t            p = PROC_NULL;
974 		struct knote_footprint_history *footprint_history = (struct knote_footprint_history *)kn->kn_ext;
975 		uint32_t *timestamps = (uint32_t *)&(kn->kn_sdata);
976 		uint32_t warning_send_time = timestamps[KNOTE_SEND_TIMESTAMP_WARNING_INDEX];
977 		uint32_t critical_send_time = timestamps[KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX];
978 		CA_EVENT_TYPE(memorystatus_proc_notification) * event = NULL;
979 		if (warning_send_time != 0 || critical_send_time != 0) {
980 			/*
981 			 * Drop the event rather than block for memory. We should be in a normal pressure level now,
982 			 * but we don't want to end up blocked in page_wait if there's a sudden spike in pressure.
983 			 */
984 			ca_event_t event_wrapper = CA_EVENT_ALLOCATE_FLAGS(memorystatus_proc_notification, Z_NOWAIT | Z_ZERO);
985 			if (event_wrapper) {
986 				event = event_wrapper->data;
987 
988 				event->footprint_before_notification = footprint_history->kfh_starting_footprint;
989 				event->footprint_1_min_after_first_warning = footprint_history->kfh_footprint_after_warn_1;
990 				event->footprint_5_min_after_first_warning = footprint_history->kfh_footprint_after_warn_5;
991 				event->footprint_20_min_after_first_warning = footprint_history->kfh_footprint_after_warn_20;
992 				event->footprint_1_min_after_first_critical = footprint_history->kfh_footprint_after_critical_1;
993 				event->footprint_5_min_after_first_critical = footprint_history->kfh_footprint_after_critical_5;
994 				event->footprint_20_min_after_first_critical = footprint_history->kfh_footprint_after_critical_20;
995 				event->num_notifications_sent = footprint_history->kfh_num_notifications;
996 				if (warning_send_time != 0 && critical_send_time != 0) {
997 					event->time_between_warning_and_critical = (critical_send_time - warning_send_time) / 60; // Minutes
998 				}
999 				event->order_within_list = footprint_history->kfh_notification_order;
1000 
1001 				p = proc_ref(knote_get_kq(kn)->kq_p, false);
1002 				if (p == NULL) {
1003 					CA_EVENT_DEALLOCATE(event_wrapper);
1004 					continue;
1005 				}
1006 				strlcpy(event->proc_name, proc_name_address(p), sizeof(event->proc_name));
1007 
1008 				proc_rele(p);
1009 				CA_EVENT_SEND(event_wrapper);
1010 			}
1011 		}
1012 		memset(footprint_history, 0, sizeof(*footprint_history));
1013 		timestamps[KNOTE_SEND_TIMESTAMP_WARNING_INDEX] = 0;
1014 		timestamps[KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX] = 0;
1015 	}
1016 	memorystatus_klist_unlock();
1017 }
1018 
1019 /*
1020  * Send all telemetry associated with the increased pressure interval.
1021  */
1022 static void
memorystatus_pressure_telemetry_send(void)1023 memorystatus_pressure_telemetry_send(void)
1024 {
1025 	LCK_MTX_ASSERT(&memorystatus_klist_mutex, LCK_MTX_ASSERT_NOTOWNED);
1026 	memorystatus_pressure_interval_send(&memorystatus_pressure_interval_telemetry);
1027 	memorystatus_pressure_proc_telemetry_send();
1028 }
1029 
1030 
1031 /*
1032  * kn_max - knote
1033  *
1034  * knote_pressure_level - to check if the knote is registered for this notification level.
1035  *
1036  * task    - task whose bits we'll be modifying
1037  *
1038  * pressure_level_to_clear - if the task has been notified of this past level, clear that notification bit so that if/when we revert to that level, the task will be notified again.
1039  *
1040  * pressure_level_to_set - the task is about to be notified of this new level. Update the task's bit notification information appropriately.
1041  *
1042  */
1043 
1044 static boolean_t
is_knote_registered_modify_task_pressure_bits(struct knote * kn_max,int knote_pressure_level,task_t task,vm_pressure_level_t pressure_level_to_clear,vm_pressure_level_t pressure_level_to_set)1045 is_knote_registered_modify_task_pressure_bits(struct knote *kn_max, int knote_pressure_level, task_t task, vm_pressure_level_t pressure_level_to_clear, vm_pressure_level_t pressure_level_to_set)
1046 {
1047 	if (kn_max->kn_sfflags & knote_pressure_level) {
1048 		if (pressure_level_to_clear && task_has_been_notified(task, pressure_level_to_clear) == TRUE) {
1049 			task_clear_has_been_notified(task, pressure_level_to_clear);
1050 		}
1051 
1052 		task_mark_has_been_notified(task, pressure_level_to_set);
1053 		return TRUE;
1054 	}
1055 
1056 	return FALSE;
1057 }
1058 
1059 static void
memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear)1060 memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear)
1061 {
1062 	struct knote *kn = NULL;
1063 
1064 	memorystatus_klist_lock();
1065 
1066 	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1067 		proc_t p = knote_get_kq(kn)->kq_p;
1068 
1069 		if (p == proc_ref(p, false)) {
1070 			task_clear_has_been_notified(proc_task(p), pressure_level_to_clear);
1071 			proc_rele(p);
1072 		}
1073 	}
1074 
1075 	memorystatus_klist_unlock();
1076 }
1077 
/*
 * Used by the vm_pressure_thread which is
 * signalled from within vm_pageout_scan().
 *
 * Simply forwards to vm_dispatch_memory_pressure().
 */

void
consider_vm_pressure_events(void)
{
	vm_dispatch_memory_pressure();
}
1088 
1089 static void
vm_dispatch_memory_pressure(void)1090 vm_dispatch_memory_pressure(void)
1091 {
1092 	memorystatus_update_vm_pressure(FALSE);
1093 }
1094 
1095 static struct knote *
vm_pressure_select_optimal_candidate_to_notify(struct klist * candidate_list,int level,boolean_t target_foreground_process,uint64_t * next_telemetry_update)1096 vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process, uint64_t *next_telemetry_update)
1097 {
1098 	struct knote    *kn = NULL, *kn_max = NULL;
1099 	uint64_t    resident_max = 0;/* MB */
1100 	int        selected_task_importance = 0;
1101 	static int    pressure_snapshot = -1;
1102 	boolean_t    pressure_increase = FALSE;
1103 	uint64_t     curr_ts = mach_absolute_time();
1104 	*next_telemetry_update = UINT64_MAX;
1105 
1106 	if (pressure_snapshot == -1) {
1107 		/*
1108 		 * Initial snapshot.
1109 		 */
1110 		pressure_snapshot = level;
1111 		pressure_increase = TRUE;
1112 	} else {
1113 		if (level && (level >= pressure_snapshot)) {
1114 			pressure_increase = TRUE;
1115 		} else {
1116 			pressure_increase = FALSE;
1117 		}
1118 
1119 		pressure_snapshot = level;
1120 	}
1121 
1122 	if (pressure_increase == TRUE) {
1123 		/*
1124 		 * We'll start by considering the largest
1125 		 * unimportant task in our list.
1126 		 */
1127 		selected_task_importance = INT_MAX;
1128 	} else {
1129 		/*
1130 		 * We'll start by considering the largest
1131 		 * important task in our list.
1132 		 */
1133 		selected_task_importance = 0;
1134 	}
1135 
1136 	SLIST_FOREACH(kn, candidate_list, kn_selnext) {
1137 		uint64_t        resident_size = 0;/* MB */
1138 		proc_t            p = PROC_NULL;
1139 		struct task*        t = TASK_NULL;
1140 		int            curr_task_importance = 0;
1141 		uint64_t         telemetry_update = 0;
1142 		boolean_t        consider_knote = FALSE;
1143 		boolean_t        privileged_listener = FALSE;
1144 
1145 		p = proc_ref(knote_get_kq(kn)->kq_p, false);
1146 		if (p == PROC_NULL) {
1147 			continue;
1148 		}
1149 
1150 #if CONFIG_MEMORYSTATUS
1151 		if (target_foreground_process == TRUE && !memorystatus_is_foreground_locked(p)) {
1152 			/*
1153 			 * Skip process not marked foreground.
1154 			 */
1155 			proc_rele(p);
1156 			continue;
1157 		}
1158 #endif /* CONFIG_MEMORYSTATUS */
1159 
1160 		t = (struct task *)(proc_task(p));
1161 		telemetry_update = update_knote_footprint_history(kn, t, curr_ts);
1162 		*next_telemetry_update = MIN(*next_telemetry_update, telemetry_update);
1163 
1164 		vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(level);
1165 
1166 		if ((kn->kn_sfflags & dispatch_level) == 0) {
1167 			proc_rele(p);
1168 			continue;
1169 		}
1170 
1171 #if CONFIG_MEMORYSTATUS
1172 		if (target_foreground_process == FALSE && !memorystatus_bg_pressure_eligible(p)) {
1173 			VM_PRESSURE_DEBUG(1, "[vm_pressure] skipping process %d\n", proc_getpid(p));
1174 			proc_rele(p);
1175 			continue;
1176 		}
1177 #endif /* CONFIG_MEMORYSTATUS */
1178 
1179 #if XNU_TARGET_OS_OSX
1180 		curr_task_importance = task_importance_estimate(t);
1181 #else /* XNU_TARGET_OS_OSX */
1182 		curr_task_importance = p->p_memstat_effectivepriority;
1183 #endif /* XNU_TARGET_OS_OSX */
1184 
1185 		/*
1186 		 * Privileged listeners are only considered in the multi-level pressure scheme
1187 		 * AND only if the pressure is increasing.
1188 		 */
1189 		if (level > 0) {
1190 			if (task_has_been_notified(t, level) == FALSE) {
1191 				/*
1192 				 * Is this a privileged listener?
1193 				 */
1194 				if (task_low_mem_privileged_listener(t, FALSE, &privileged_listener) == 0) {
1195 					if (privileged_listener) {
1196 						kn_max = kn;
1197 						proc_rele(p);
1198 						goto done_scanning;
1199 					}
1200 				}
1201 			} else {
1202 				proc_rele(p);
1203 				continue;
1204 			}
1205 		} else if (level == 0) {
1206 			/*
1207 			 * Task wasn't notified when the pressure was increasing and so
1208 			 * no need to notify it that the pressure is decreasing.
1209 			 */
1210 			if ((task_has_been_notified(t, kVMPressureWarning) == FALSE) && (task_has_been_notified(t, kVMPressureCritical) == FALSE)) {
1211 				proc_rele(p);
1212 				continue;
1213 			}
1214 		}
1215 
1216 		/*
1217 		 * We don't want a small process to block large processes from
1218 		 * being notified again. <rdar://problem/7955532>
1219 		 */
1220 		resident_size = (get_task_phys_footprint(t)) / (1024 * 1024ULL); /* MB */
1221 
1222 		if (resident_size >= vm_pressure_task_footprint_min) {
1223 			if (level > 0) {
1224 				/*
1225 				 * Warning or Critical Pressure.
1226 				 */
1227 				if (pressure_increase) {
1228 					if ((curr_task_importance < selected_task_importance) ||
1229 					    ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
1230 						/*
1231 						 * We have found a candidate process which is:
1232 						 * a) at a lower importance than the current selected process
1233 						 * OR
1234 						 * b) has importance equal to that of the current selected process but is larger
1235 						 */
1236 
1237 						consider_knote = TRUE;
1238 					}
1239 				} else {
1240 					if ((curr_task_importance > selected_task_importance) ||
1241 					    ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
1242 						/*
1243 						 * We have found a candidate process which is:
1244 						 * a) at a higher importance than the current selected process
1245 						 * OR
1246 						 * b) has importance equal to that of the current selected process but is larger
1247 						 */
1248 
1249 						consider_knote = TRUE;
1250 					}
1251 				}
1252 			} else if (level == 0) {
1253 				/*
1254 				 * Pressure back to normal.
1255 				 */
1256 				if ((curr_task_importance > selected_task_importance) ||
1257 				    ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
1258 					consider_knote = TRUE;
1259 				}
1260 			}
1261 
1262 			if (consider_knote) {
1263 				resident_max = resident_size;
1264 				kn_max = kn;
1265 				selected_task_importance = curr_task_importance;
1266 				consider_knote = FALSE; /* reset for the next candidate */
1267 			}
1268 		} else {
1269 			/* There was no candidate with enough resident memory to scavenge */
1270 			VM_PRESSURE_DEBUG(0, "[vm_pressure] threshold failed for pid %d with %llu resident...\n", proc_getpid(p), resident_size);
1271 		}
1272 		proc_rele(p);
1273 	}
1274 
1275 done_scanning:
1276 	if (kn_max) {
1277 		VM_DEBUG_CONSTANT_EVENT(vm_pressure_event, VM_PRESSURE_EVENT, DBG_FUNC_NONE, proc_getpid(knote_get_kq(kn_max)->kq_p), resident_max, 0, 0);
1278 		VM_PRESSURE_DEBUG(1, "[vm_pressure] sending event to pid %d with %llu resident\n", proc_getpid(knote_get_kq(kn_max)->kq_p), resident_max);
1279 	}
1280 
1281 	return kn_max;
1282 }
1283 
/*
 * To avoid notification storms in a system with sawtooth behavior of pressure levels, e.g.:
 * Normal -> warning (notify clients) -> critical (notify) -> warning (notify) -> critical (notify) -> warning (notify)...
 *
 * we have 'resting' periods: WARNING_NOTIFICATION_RESTING_PERIOD and CRITICAL_NOTIFICATION_RESTING_PERIOD.
 *
 * So it would look like:
 * Normal -> warning (notify) -> critical (notify) -> warning (notify if it has been RestPeriod since last warning) -> critical (notify if it has been RestPeriod since last critical) -> ...
 *
 * The two timestamps below hold the absolutetime before which the next warning /
 * critical notification will not be sent; 0 means no resting period is in effect.
 */

uint64_t next_warning_notification_sent_at_ts = 0;
uint64_t next_critical_notification_sent_at_ts = 0;
1298 
1299 boolean_t        memorystatus_manual_testing_on = FALSE;
1300 vm_pressure_level_t    memorystatus_manual_testing_level = kVMPressureNormal;
1301 
1302 unsigned int memorystatus_sustained_pressure_maximum_band = JETSAM_PRIORITY_IDLE;
1303 #if DEVELOPMENT || DEBUG
1304 SYSCTL_INT(_kern, OID_AUTO, memorystatus_sustained_pressure_maximum_band, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_sustained_pressure_maximum_band, 0, "");
1305 #endif /* DEVELOPMENT || DEBUG */
1306 
#if CONFIG_JETSAM

/*
 * Thread-call handler run when pressure has been elevated for too long.
 *
 * We don't want to leave the system in this state as it can delay background
 * work indefinitely & drain battery, so try to return the system to normal via jetsam.
 *
 * The kill budget is twice the number of processes at or below
 * memorystatus_sustained_pressure_maximum_band, i.e. we'll run through that band
 * up to 2 times. If the pressure hasn't been relieved by then, the problem is memory
 * consumption in a higher band and this churn is probably doing more harm than good.
 *
 * TODO(jason): The memorystatus thread should be responsible for this
 * It can just check how long the pressure level has been at warning and the timestamp
 * of the last sustained pressure kill.
 */
static void
sustained_pressure_handler(void* arg0 __unused, void* arg1 __unused)
{
	int kill_count = 0;
	int max_kills = memorystatus_get_proccnt_upto_priority(memorystatus_sustained_pressure_maximum_band) * 2;

	memorystatus_log("memorystatus: Pressure level has been elevated for too long. killing up to %d idle processes\n", max_kills);

	for (;;) {
		if (memorystatus_vm_pressure_level == kVMPressureNormal || kill_count >= max_kills) {
			break;
		}
		if (!memorystatus_kill_on_sustained_pressure()) {
			/* Nothing left to kill */
			break;
		}

		/*
		 * Pause before our next kill & see if pressure reduces.
		 */
		delay((int)(memorystatus_kill_on_sustained_pressure_delay_ms * NSEC_PER_MSEC / NSEC_PER_USEC));
		kill_count++;
		memorystatus_kill_on_sustained_pressure_count++;
		/* TODO(jason): Should use os_atomic but requires rdar://76310894. */
		memorystatus_pressure_interval_telemetry.num_kills++;
	}

	if (memorystatus_vm_pressure_level != kVMPressureNormal) {
		memorystatus_log("memorystatus: Killed %d idle processes due to sustained pressure, but device didn't quiesce. Giving up.\n", kill_count);
	}
}

#endif /* CONFIG_JETSAM */
1352 
1353 /*
1354  * Returns the number of processes registered for notifications at this level.
1355  */
1356 static size_t
memorystatus_klist_length(int level)1357 memorystatus_klist_length(int level)
1358 {
1359 	LCK_MTX_ASSERT(&memorystatus_klist_mutex, LCK_MTX_ASSERT_OWNED);
1360 	struct knote *kn;
1361 	size_t count = 0;
1362 	int knote_pressure_level = convert_internal_pressure_level_to_dispatch_level(level);
1363 	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1364 		if (kn->kn_sfflags & knote_pressure_level) {
1365 			count++;
1366 		}
1367 	}
1368 	return count;
1369 }
1370 
1371 /*
1372  * Updates the footprint telemetry for procs that have received notifications.
1373  */
1374 static void
update_footprints_for_telemetry(void * arg0 __unused,void * arg1 __unused)1375 update_footprints_for_telemetry(void* arg0 __unused, void* arg1 __unused)
1376 {
1377 	uint64_t curr_ts = mach_absolute_time(), next_telemetry_update = UINT64_MAX;
1378 	struct knote *kn;
1379 
1380 	memorystatus_klist_lock();
1381 	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1382 		proc_t            p = PROC_NULL;
1383 		struct task*      t = TASK_NULL;
1384 		uint64_t telemetry_update;
1385 
1386 		p = proc_ref(knote_get_kq(kn)->kq_p, false);
1387 		if (p == PROC_NULL) {
1388 			continue;
1389 		}
1390 		t = (struct task *)(proc_task(p));
1391 		proc_rele(p);
1392 		p = PROC_NULL;
1393 		telemetry_update = update_knote_footprint_history(kn, t, curr_ts);
1394 		next_telemetry_update = MIN(next_telemetry_update, telemetry_update);
1395 	}
1396 	memorystatus_klist_unlock();
1397 	if (next_telemetry_update != UINT64_MAX) {
1398 		uint64_t next_update_seconds;
1399 		absolutetime_to_nanoseconds(next_telemetry_update, &next_update_seconds);
1400 		next_update_seconds /= NSEC_PER_SEC;
1401 		thread_call_enter_delayed(memorystatus_notify_update_telemetry_thread_call, next_telemetry_update);
1402 	}
1403 }
1404 
1405 kern_return_t
memorystatus_update_vm_pressure(boolean_t target_foreground_process)1406 memorystatus_update_vm_pressure(boolean_t target_foreground_process)
1407 {
1408 	struct knote            *kn_max = NULL;
1409 	struct knote            *kn_cur = NULL, *kn_temp = NULL;/* for safe list traversal */
1410 	pid_t                target_pid = -1;
1411 	struct klist            dispatch_klist = { NULL };
1412 	proc_t                target_proc = PROC_NULL;
1413 	struct task            *task = NULL;
1414 	boolean_t            found_candidate = FALSE;
1415 
1416 	static vm_pressure_level_t     level_snapshot = kVMPressureNormal;
1417 	static vm_pressure_level_t    prev_level_snapshot = kVMPressureNormal;
1418 	boolean_t            smoothing_window_started = FALSE;
1419 	struct timeval            smoothing_window_start_tstamp = {0, 0};
1420 	struct timeval            curr_tstamp = {0, 0};
1421 	int64_t              elapsed_msecs = 0;
1422 	uint64_t             curr_ts = mach_absolute_time(), next_telemetry_update = UINT64_MAX;
1423 
1424 
1425 	uint64_t logging_now;
1426 	absolutetime_to_nanoseconds(curr_ts, &logging_now);
1427 #if !CONFIG_JETSAM
1428 #define MAX_IDLE_KILLS 100    /* limit the number of idle kills allowed */
1429 
1430 	int    idle_kill_counter = 0;
1431 
1432 	/*
1433 	 * On desktop we take this opportunity to free up memory pressure
1434 	 * by immediately killing idle exitable processes. We use a delay
1435 	 * to avoid overkill.  And we impose a max counter as a fail safe
1436 	 * in case daemons re-launch too fast.
1437 	 */
1438 	while ((memorystatus_vm_pressure_level != kVMPressureNormal) && (idle_kill_counter < MAX_IDLE_KILLS)) {
1439 		if (memorystatus_idle_exit_from_VM() == FALSE) {
1440 			/* No idle exitable processes left to kill */
1441 			break;
1442 		}
1443 		idle_kill_counter++;
1444 
1445 		if (memorystatus_manual_testing_on == TRUE) {
1446 			/*
1447 			 * Skip the delay when testing
1448 			 * the pressure notification scheme.
1449 			 */
1450 		} else {
1451 			delay(1000000); /* 1 second */
1452 		}
1453 	}
1454 #endif /* !CONFIG_JETSAM */
1455 
1456 	if (level_snapshot != kVMPressureNormal) {
1457 		/*
1458 		 * Check to see if we are still in the 'resting' period
1459 		 * after having notified all clients interested in
1460 		 * a particular pressure level.
1461 		 */
1462 
1463 		level_snapshot = memorystatus_vm_pressure_level;
1464 
1465 		if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
1466 			if (next_warning_notification_sent_at_ts) {
1467 				if (curr_ts < next_warning_notification_sent_at_ts) {
1468 					delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
1469 					return KERN_SUCCESS;
1470 				}
1471 
1472 				next_warning_notification_sent_at_ts = 0;
1473 				memorystatus_klist_reset_all_for_level(kVMPressureWarning);
1474 			}
1475 		} else if (level_snapshot == kVMPressureCritical) {
1476 			if (next_critical_notification_sent_at_ts) {
1477 				if (curr_ts < next_critical_notification_sent_at_ts) {
1478 					delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
1479 					return KERN_SUCCESS;
1480 				}
1481 				next_critical_notification_sent_at_ts = 0;
1482 				memorystatus_klist_reset_all_for_level(kVMPressureCritical);
1483 			}
1484 		}
1485 	}
1486 
1487 #if CONFIG_JETSAM
1488 	if (memorystatus_vm_pressure_level == kVMPressureNormal && prev_level_snapshot != kVMPressureNormal) {
1489 		if (memorystatus_should_kill_on_sustained_pressure) {
1490 			memorystatus_log("memorystatus: Pressure has returned to level %d. Cancelling scheduled jetsam\n", memorystatus_vm_pressure_level);
1491 			thread_call_cancel(sustained_pressure_handler_thread_call);
1492 		}
1493 	} else if (memorystatus_should_kill_on_sustained_pressure && memorystatus_vm_pressure_level != kVMPressureNormal && prev_level_snapshot == kVMPressureNormal) {
1494 		/*
1495 		 * Pressure has increased from normal.
1496 		 * Hopefully the notifications will relieve it,
1497 		 * but as a fail-safe we'll trigger jetsam
1498 		 * after a configurable amount of time.
1499 		 */
1500 		memorystatus_log("memorystatus: Pressure level has increased from %d to %d. Scheduling jetsam.\n", prev_level_snapshot, memorystatus_vm_pressure_level);
1501 		uint64_t kill_time;
1502 		nanoseconds_to_absolutetime(memorystatus_kill_on_sustained_pressure_window_s * NSEC_PER_SEC, &kill_time);
1503 		kill_time += mach_absolute_time();
1504 		thread_call_enter_delayed(sustained_pressure_handler_thread_call, kill_time);
1505 	}
1506 #endif /* CONFIG_JETSAM */
1507 
1508 	while (1) {
1509 		/*
1510 		 * There is a race window here. But it's not clear
1511 		 * how much we benefit from having extra synchronization.
1512 		 */
1513 		level_snapshot = memorystatus_vm_pressure_level;
1514 
1515 		if (prev_level_snapshot > level_snapshot) {
1516 			/*
1517 			 * Pressure decreased? Let's take a little breather
1518 			 * and see if this condition stays.
1519 			 */
1520 			if (smoothing_window_started == FALSE) {
1521 				smoothing_window_started = TRUE;
1522 				microuptime(&smoothing_window_start_tstamp);
1523 			}
1524 
1525 			microuptime(&curr_tstamp);
1526 			timevalsub(&curr_tstamp, &smoothing_window_start_tstamp);
1527 			elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000;
1528 
1529 			if (elapsed_msecs < VM_PRESSURE_DECREASED_SMOOTHING_PERIOD) {
1530 				delay(INTER_NOTIFICATION_DELAY);
1531 				continue;
1532 			}
1533 		}
1534 		if (level_snapshot == kVMPressureNormal) {
1535 			memorystatus_pressure_telemetry_send();
1536 		}
1537 		prev_level_snapshot = level_snapshot;
1538 		smoothing_window_started = FALSE;
1539 		memorystatus_klist_lock();
1540 
1541 		if (level_snapshot > memorystatus_pressure_interval_telemetry.max_level) {
1542 			memorystatus_pressure_interval_telemetry.num_processes_registered = memorystatus_klist_length(level_snapshot);
1543 			memorystatus_pressure_interval_telemetry.max_level = level_snapshot;
1544 			memorystatus_pressure_interval_telemetry.num_transitions++;
1545 			if (memorystatus_pressure_interval_telemetry.duration == 0) {
1546 				/* Set the start timestamp. Duration will be finalized when we send the event. */
1547 				memorystatus_pressure_interval_telemetry.duration = curr_ts;
1548 			}
1549 		}
1550 
1551 		kn_max = vm_pressure_select_optimal_candidate_to_notify(&memorystatus_klist, level_snapshot, target_foreground_process, &next_telemetry_update);
1552 
1553 		if (kn_max == NULL) {
1554 			memorystatus_klist_unlock();
1555 
1556 			/*
1557 			 * No more level-based clients to notify.
1558 			 *
1559 			 * Start the 'resting' window within which clients will not be re-notified.
1560 			 */
1561 
1562 			if (level_snapshot != kVMPressureNormal) {
1563 				if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
1564 					nanoseconds_to_absolutetime(WARNING_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);
1565 
1566 					/* Next warning notification (if nothing changes) won't be sent before...*/
1567 					next_warning_notification_sent_at_ts = mach_absolute_time() + curr_ts;
1568 				}
1569 
1570 				if (level_snapshot == kVMPressureCritical) {
1571 					nanoseconds_to_absolutetime(CRITICAL_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);
1572 
1573 					/* Next critical notification (if nothing changes) won't be sent before...*/
1574 					next_critical_notification_sent_at_ts = mach_absolute_time() + curr_ts;
1575 				}
1576 			}
1577 			absolutetime_to_nanoseconds(mach_absolute_time(), &logging_now);
1578 			if (next_telemetry_update != UINT64_MAX) {
1579 				thread_call_enter_delayed(memorystatus_notify_update_telemetry_thread_call, next_telemetry_update);
1580 			} else {
1581 				thread_call_cancel(memorystatus_notify_update_telemetry_thread_call);
1582 			}
1583 			return KERN_FAILURE;
1584 		}
1585 
1586 		target_proc = proc_ref(knote_get_kq(kn_max)->kq_p, false);
1587 		if (target_proc == PROC_NULL) {
1588 			memorystatus_klist_unlock();
1589 			continue;
1590 		}
1591 
1592 		target_pid = proc_getpid(target_proc);
1593 
1594 		task = (struct task *)(proc_task(target_proc));
1595 
1596 		if (level_snapshot != kVMPressureNormal) {
1597 			if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
1598 				if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_WARN, task, 0, kVMPressureWarning) == TRUE) {
1599 					found_candidate = TRUE;
1600 				}
1601 			} else {
1602 				if (level_snapshot == kVMPressureCritical) {
1603 					if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, task, 0, kVMPressureCritical) == TRUE) {
1604 						found_candidate = TRUE;
1605 					}
1606 				}
1607 			}
1608 		} else {
1609 			if (kn_max->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
1610 				task_clear_has_been_notified(task, kVMPressureWarning);
1611 				task_clear_has_been_notified(task, kVMPressureCritical);
1612 
1613 				found_candidate = TRUE;
1614 			}
1615 		}
1616 
1617 		if (found_candidate == FALSE) {
1618 			proc_rele(target_proc);
1619 			memorystatus_klist_unlock();
1620 			continue;
1621 		}
1622 
1623 		SLIST_FOREACH_SAFE(kn_cur, &memorystatus_klist, kn_selnext, kn_temp) {
1624 			int knote_pressure_level = convert_internal_pressure_level_to_dispatch_level(level_snapshot);
1625 
1626 			if (is_knote_registered_modify_task_pressure_bits(kn_cur, knote_pressure_level, task, 0, level_snapshot) == TRUE) {
1627 				proc_t knote_proc = knote_get_kq(kn_cur)->kq_p;
1628 				pid_t knote_pid = proc_getpid(knote_proc);
1629 				if (knote_pid == target_pid) {
1630 					KNOTE_DETACH(&memorystatus_klist, kn_cur);
1631 					KNOTE_ATTACH(&dispatch_klist, kn_cur);
1632 				}
1633 			}
1634 		}
1635 		if (level_snapshot != kVMPressureNormal) {
1636 			mark_knote_send_time(kn_max, task, convert_internal_pressure_level_to_dispatch_level(level_snapshot),
1637 			    (uint16_t) MIN(UINT16_MAX, memorystatus_pressure_interval_telemetry.num_notifications_sent));
1638 			memorystatus_pressure_interval_telemetry.num_notifications_sent++;
1639 		}
1640 
1641 		KNOTE(&dispatch_klist, (level_snapshot != kVMPressureNormal) ? kMemorystatusPressure : kMemorystatusNoPressure);
1642 
1643 		SLIST_FOREACH_SAFE(kn_cur, &dispatch_klist, kn_selnext, kn_temp) {
1644 			KNOTE_DETACH(&dispatch_klist, kn_cur);
1645 			KNOTE_ATTACH(&memorystatus_klist, kn_cur);
1646 		}
1647 
1648 		memorystatus_klist_unlock();
1649 
1650 		microuptime(&target_proc->vm_pressure_last_notify_tstamp);
1651 		proc_rele(target_proc);
1652 
1653 		if (memorystatus_manual_testing_on == TRUE && target_foreground_process == TRUE) {
1654 			break;
1655 		}
1656 
1657 		if (memorystatus_manual_testing_on == TRUE) {
1658 			/*
1659 			 * Testing out the pressure notification scheme.
1660 			 * No need for delays etc.
1661 			 */
1662 		} else {
1663 			uint32_t sleep_interval = INTER_NOTIFICATION_DELAY;
1664 #if CONFIG_JETSAM
1665 			unsigned int page_delta = 0;
1666 			unsigned int skip_delay_page_threshold = 0;
1667 
1668 			assert(memorystatus_available_pages_pressure >= memorystatus_available_pages_critical_base);
1669 
1670 			page_delta = (memorystatus_available_pages_pressure - memorystatus_available_pages_critical_base) / 2;
1671 			skip_delay_page_threshold = memorystatus_available_pages_pressure - page_delta;
1672 
1673 			if (memorystatus_available_pages <= skip_delay_page_threshold) {
1674 				/*
1675 				 * We are nearing the critical mark fast and can't afford to wait between
1676 				 * notifications.
1677 				 */
1678 				sleep_interval = 0;
1679 			}
1680 #endif /* CONFIG_JETSAM */
1681 
1682 			if (sleep_interval) {
1683 				delay(sleep_interval);
1684 			}
1685 		}
1686 	}
1687 
1688 	return KERN_SUCCESS;
1689 }
1690 
1691 static uint32_t
convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level)1692 convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level)
1693 {
1694 	uint32_t dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
1695 
1696 	switch (internal_pressure_level) {
1697 	case kVMPressureNormal:
1698 	{
1699 		dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
1700 		break;
1701 	}
1702 
1703 	case kVMPressureWarning:
1704 	case kVMPressureUrgent:
1705 	{
1706 		dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_WARN;
1707 		break;
1708 	}
1709 
1710 	case kVMPressureCritical:
1711 	{
1712 		dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
1713 		break;
1714 	}
1715 
1716 	default:
1717 		break;
1718 	}
1719 
1720 	return dispatch_level;
1721 }
1722 
1723 /*
1724  * Notify any kexts that are waiting for notification that jetsam
1725  * is approaching the foreground bands. They should use this notification
1726  * to free cached memory.
1727  */
1728 void
memorystatus_issue_fg_band_notify(void)1729 memorystatus_issue_fg_band_notify(void)
1730 {
1731 	uint64_t now;
1732 
1733 	lck_mtx_lock(&memorystatus_jetsam_fg_band_lock);
1734 	absolutetime_to_nanoseconds(mach_absolute_time(), &now);
1735 	if (now - memorystatus_jetsam_fg_band_timestamp_ns < memorystatus_jetsam_fg_band_delay_ns) {
1736 		lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
1737 		return;
1738 	}
1739 
1740 	if (memorystatus_jetsam_fg_band_waiters > 0) {
1741 		thread_wakeup(&memorystatus_jetsam_fg_band_waiters);
1742 		memorystatus_jetsam_fg_band_waiters = 0;
1743 		memorystatus_jetsam_fg_band_timestamp_ns = now;
1744 	}
1745 	lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
1746 
1747 	/* Notify the buffer cache, file systems, etc. to jetison everything they can. */
1748 	if (consider_buffer_cache_collect != NULL) {
1749 		(void)(*consider_buffer_cache_collect)(1);
1750 	}
1751 }
1752 
1753 
1754 /*
1755  * Memorystatus notification debugging support
1756  */
1757 
1758 static int
1759 sysctl_memorystatus_vm_pressure_level SYSCTL_HANDLER_ARGS
1760 {
1761 #pragma unused(arg1, arg2, oidp)
1762 #if !XNU_TARGET_OS_OSX
1763 	int error = 0;
1764 
1765 	error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
1766 	if (error) {
1767 		return error;
1768 	}
1769 
1770 #endif /* !XNU_TARGET_OS_OSX */
1771 	uint32_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(memorystatus_vm_pressure_level);
1772 
1773 	return SYSCTL_OUT(req, &dispatch_level, sizeof(dispatch_level));
1774 }
1775 
1776 #if DEBUG || DEVELOPMENT
1777 
1778 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
1779     0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");
1780 
1781 #else /* DEBUG || DEVELOPMENT */
1782 
1783 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
1784     0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");
1785 
1786 #endif /* DEBUG || DEVELOPMENT */
1787 
1788 /*
1789  * Trigger levels to test the mechanism.
1790  * Can be used via a sysctl.
1791  */
1792 #define TEST_LOW_MEMORY_TRIGGER_ONE        1
1793 #define TEST_LOW_MEMORY_TRIGGER_ALL        2
1794 #define TEST_PURGEABLE_TRIGGER_ONE        3
1795 #define TEST_PURGEABLE_TRIGGER_ALL        4
1796 #define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE    5
1797 #define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL    6
1798 
1799 static int
1800 sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS
1801 {
1802 #pragma unused(arg1, arg2)
1803 
1804 	int level = 0;
1805 	int error = 0;
1806 	int pressure_level = 0;
1807 	int trigger_request = 0;
1808 	int force_purge;
1809 
1810 	error = sysctl_handle_int(oidp, &level, 0, req);
1811 	if (error || !req->newptr) {
1812 		return error;
1813 	}
1814 
1815 	memorystatus_manual_testing_on = TRUE;
1816 
1817 	trigger_request = (level >> 16) & 0xFFFF;
1818 	pressure_level = (level & 0xFFFF);
1819 
1820 	if (trigger_request < TEST_LOW_MEMORY_TRIGGER_ONE ||
1821 	    trigger_request > TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL) {
1822 		return EINVAL;
1823 	}
1824 	switch (pressure_level) {
1825 	case NOTE_MEMORYSTATUS_PRESSURE_NORMAL:
1826 	case NOTE_MEMORYSTATUS_PRESSURE_WARN:
1827 	case NOTE_MEMORYSTATUS_PRESSURE_CRITICAL:
1828 		break;
1829 	default:
1830 		return EINVAL;
1831 	}
1832 
1833 	/*
1834 	 * The pressure level is being set from user-space.
1835 	 * And user-space uses the constants in sys/event.h
1836 	 * So we translate those events to our internal levels here.
1837 	 */
1838 	if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
1839 		memorystatus_manual_testing_level = kVMPressureNormal;
1840 		force_purge = 0;
1841 	} else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN) {
1842 		memorystatus_manual_testing_level = kVMPressureWarning;
1843 		force_purge = vm_pageout_state.memorystatus_purge_on_warning;
1844 	} else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
1845 		memorystatus_manual_testing_level = kVMPressureCritical;
1846 		force_purge = vm_pageout_state.memorystatus_purge_on_critical;
1847 	}
1848 
1849 	memorystatus_vm_pressure_level = memorystatus_manual_testing_level;
1850 
1851 	/* purge according to the new pressure level */
1852 	switch (trigger_request) {
1853 	case TEST_PURGEABLE_TRIGGER_ONE:
1854 	case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE:
1855 		if (force_purge == 0) {
1856 			/* no purging requested */
1857 			break;
1858 		}
1859 		vm_purgeable_object_purge_one_unlocked(force_purge);
1860 		break;
1861 	case TEST_PURGEABLE_TRIGGER_ALL:
1862 	case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL:
1863 		if (force_purge == 0) {
1864 			/* no purging requested */
1865 			break;
1866 		}
1867 		while (vm_purgeable_object_purge_one_unlocked(force_purge)) {
1868 			;
1869 		}
1870 		break;
1871 	}
1872 
1873 	if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ONE) ||
1874 	    (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE)) {
1875 		memorystatus_update_vm_pressure(TRUE);
1876 	}
1877 
1878 	if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ALL) ||
1879 	    (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL)) {
1880 		while (memorystatus_update_vm_pressure(FALSE) == KERN_SUCCESS) {
1881 			continue;
1882 		}
1883 	}
1884 
1885 	if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
1886 		memorystatus_manual_testing_on = FALSE;
1887 	}
1888 
1889 	return 0;
1890 }
1891 
1892 SYSCTL_PROC(_kern, OID_AUTO, memorypressure_manual_trigger, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
1893     0, 0, &sysctl_memorypressure_manual_trigger, "I", "");
1894 
1895 
1896 SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_warning, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_warning, 0, "");
1897 SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_urgent, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_urgent, 0, "");
1898 SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_critical, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_critical, 0, "");
1899 
1900 extern int vm_pressure_level_transition_threshold;
1901 SYSCTL_INT(_kern, OID_AUTO, vm_pressure_level_transition_threshold, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_level_transition_threshold, 0, "");
1902 
1903 #if DEBUG || DEVELOPMENT
1904 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_events_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_events_enabled, 0, "");
1905 
#if 0
#if CONFIG_JETSAM && VM_PRESSURE_EVENTS
/*
 * Compiled out by the enclosing "#if 0".
 *
 * Posts kMemorystatusPressure (pressured != 0) or kMemorystatusNoPressure to
 * memorystatus_klist while holding the klist lock. Always returns TRUE.
 */
static boolean_t
memorystatus_issue_pressure_kevent(boolean_t pressured)
{
	memorystatus_klist_lock();

	if (pressured) {
		KNOTE(&memorystatus_klist, kMemorystatusPressure);
	} else {
		KNOTE(&memorystatus_klist, kMemorystatusNoPressure);
	}

	memorystatus_klist_unlock();

	return TRUE;
}
#endif /* CONFIG_JETSAM && VM_PRESSURE_EVENTS */
#endif /* 0 */
1918 
1919 /*
1920  * This routine is used for targeted notifications regardless of system memory pressure
1921  * and regardless of whether or not the process has already been notified.
1922  * It bypasses and has no effect on the only-one-notification per soft-limit policy.
1923  *
1924  * "memnote" is the current user.
1925  */
1926 
1927 static int
1928 sysctl_memorystatus_vm_pressure_send SYSCTL_HANDLER_ARGS
1929 {
1930 #pragma unused(arg1, arg2)
1931 	/* Need to be root or have memorystatus entitlement */
1932 	if (!kauth_cred_issuser(kauth_cred_get()) && !IOCurrentTaskHasEntitlement(MEMORYSTATUS_ENTITLEMENT)) {
1933 		return EPERM;
1934 	}
1935 
1936 	int error = 0, pid = 0;
1937 	struct knote *kn = NULL;
1938 	boolean_t found_knote = FALSE;
1939 	int fflags = 0;    /* filter flags for EVFILT_MEMORYSTATUS */
1940 	uint64_t value = 0;
1941 
1942 	error = sysctl_handle_quad(oidp, &value, 0, req);
1943 	if (error || !req->newptr) {
1944 		return error;
1945 	}
1946 
1947 	/*
1948 	 * Find the pid in the low 32 bits of value passed in.
1949 	 */
1950 	pid = (int)(value & 0xFFFFFFFF);
1951 
1952 	/*
1953 	 * Find notification in the high 32 bits of the value passed in.
1954 	 */
1955 	fflags = (int)((value >> 32) & 0xFFFFFFFF);
1956 
1957 	/*
1958 	 * For backwards compatibility, when no notification is
1959 	 * passed in, default to the NOTE_MEMORYSTATUS_PRESSURE_WARN
1960 	 */
1961 	if (fflags == 0) {
1962 		fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
1963 		// printf("memorystatus_vm_pressure_send: using default notification [0x%x]\n", fflags);
1964 	}
1965 
1966 	/* wake up everybody waiting for kVMPressureJetsam */
1967 	if (fflags == NOTE_MEMORYSTATUS_JETSAM_FG_BAND) {
1968 		memorystatus_issue_fg_band_notify();
1969 		return error;
1970 	}
1971 
1972 	/*
1973 	 * See event.h ... fflags for EVFILT_MEMORYSTATUS
1974 	 */
1975 	if (!((fflags == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) ||
1976 	    (fflags == NOTE_MEMORYSTATUS_PRESSURE_WARN) ||
1977 	    (fflags == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) ||
1978 	    (fflags == NOTE_MEMORYSTATUS_LOW_SWAP) ||
1979 	    (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) ||
1980 	    (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) ||
1981 	    (((fflags & NOTE_MEMORYSTATUS_MSL_STATUS) != 0 &&
1982 	    ((fflags & ~NOTE_MEMORYSTATUS_MSL_STATUS) == 0))))) {
1983 		memorystatus_log_error("memorystatus_vm_pressure_send: notification [0x%x] not supported\n", fflags);
1984 		error = 1;
1985 		return error;
1986 	}
1987 
1988 	/*
1989 	 * Forcibly send pid a memorystatus notification.
1990 	 */
1991 
1992 	memorystatus_klist_lock();
1993 
1994 	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1995 		proc_t knote_proc = knote_get_kq(kn)->kq_p;
1996 		pid_t knote_pid = proc_getpid(knote_proc);
1997 
1998 		if (knote_pid == pid) {
1999 			/*
2000 			 * Forcibly send this pid a memorystatus notification.
2001 			 */
2002 			kn->kn_fflags = fflags;
2003 			found_knote = TRUE;
2004 		}
2005 	}
2006 
2007 	if (found_knote) {
2008 		KNOTE(&memorystatus_klist, 0);
2009 		memorystatus_log_debug("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] sent to process [%d]\n", value, fflags, pid);
2010 		error = 0;
2011 	} else {
2012 		memorystatus_log_error("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] not sent to process [%d] (none registered?)\n", value, fflags, pid);
2013 		error = 1;
2014 	}
2015 
2016 	memorystatus_klist_unlock();
2017 
2018 	return error;
2019 }
2020 
2021 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_send, CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_ANYBODY,
2022     0, 0, &sysctl_memorystatus_vm_pressure_send, "Q", "");
2023 
2024 #endif /* DEBUG || DEVELOPMENT */
2025 
2026 #endif /* VM_PRESSURE_EVENTS */
2027