xref: /xnu-8792.81.2/bsd/kern/kern_memorystatus_notify.c (revision 19c3b8c28c31cb8130e034cfb5df6bf9ba342d90)
1 /*
2  * Copyright (c) 2006-2018 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  *
28  */
29 
30 #include <sys/kern_event.h>
31 #include <kern/sched_prim.h>
32 #include <kern/assert.h>
33 #include <kern/debug.h>
34 #include <kern/locks.h>
35 #include <kern/task.h>
36 #include <kern/thread.h>
37 #include <kern/thread_call.h>
38 #include <kern/host.h>
39 #include <kern/policy_internal.h>
40 #include <kern/thread_group.h>
41 
42 #include <IOKit/IOBSD.h>
43 
44 #include <libkern/libkern.h>
45 #include <libkern/coreanalytics/coreanalytics.h>
46 #include <mach/coalition.h>
47 #include <mach/clock_types.h>
48 #include <mach/mach_time.h>
49 #include <mach/task.h>
50 #include <mach/host_priv.h>
51 #include <mach/mach_host.h>
52 #include <os/log.h>
53 #include <pexpert/pexpert.h>
54 #include <sys/coalition.h>
55 #include <sys/kern_event.h>
56 #include <sys/proc.h>
57 #include <sys/proc_info.h>
58 #include <sys/reason.h>
59 #include <sys/signal.h>
60 #include <sys/signalvar.h>
61 #include <sys/sysctl.h>
62 #include <sys/sysproto.h>
63 #include <sys/time.h>
64 #include <sys/wait.h>
65 #include <sys/tree.h>
66 #include <sys/priv.h>
67 #include <vm/vm_pageout.h>
68 #include <vm/vm_protos.h>
69 #include <mach/machine/sdt.h>
70 #include <libkern/section_keywords.h>
71 #include <stdatomic.h>
72 
73 #if CONFIG_FREEZE
74 #include <vm/vm_map.h>
75 #endif /* CONFIG_FREEZE */
76 
77 #include <sys/kern_memorystatus.h>
78 #include <sys/kern_memorystatus_notify.h>
79 
80 /*
81  * Memorystatus klist structures
82  */
83 struct klist memorystatus_klist;
84 static lck_mtx_t memorystatus_klist_mutex;
85 static void memorystatus_klist_lock(void);
86 static void memorystatus_klist_unlock(void);
87 
88 /*
89  * Memorystatus kevent filter routines
90  */
91 static int filt_memorystatusattach(struct knote *kn, struct kevent_qos_s *kev);
92 static void filt_memorystatusdetach(struct knote *kn);
93 static int filt_memorystatus(struct knote *kn, long hint);
94 static int filt_memorystatustouch(struct knote *kn, struct kevent_qos_s *kev);
95 static int filt_memorystatusprocess(struct knote *kn, struct kevent_qos_s *kev);
96 
97 SECURITY_READ_ONLY_EARLY(struct filterops) memorystatus_filtops = {
98 	.f_attach = filt_memorystatusattach,
99 	.f_detach = filt_memorystatusdetach,
100 	.f_event = filt_memorystatus,
101 	.f_touch = filt_memorystatustouch,
102 	.f_process = filt_memorystatusprocess,
103 };
104 
/*
 * Memorystatus notification events.
 *
 * These are the non-zero 'hint' values that filt_memorystatus() switches on.
 * Each one occupies its own bit.
 */
enum {
	kMemorystatusNoPressure        = 1 << 0,
	kMemorystatusPressure          = 1 << 1,
	kMemorystatusLowSwap           = 1 << 2,
	kMemorystatusProcLimitWarn     = 1 << 3,
	kMemorystatusProcLimitCritical = 1 << 4,
};
115 
/*
 * Pacing constants for pressure notifications.
 * NOTE(review): INTER_NOTIFICATION_DELAY is annotated as ".25 second", which
 * suggests the value is in microseconds -- confirm at the point of use.
 */
#define INTER_NOTIFICATION_DELAY                 (250000) /* .25 second */
#define VM_PRESSURE_DECREASED_SMOOTHING_PERIOD   5000     /* milliseconds */
#define WARNING_NOTIFICATION_RESTING_PERIOD      25       /* seconds */
#define CRITICAL_NOTIFICATION_RESTING_PERIOD     25       /* seconds */
120 
121 /*
122  * Memorystatus notification helper routines
123  */
124 static vm_pressure_level_t convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);
125 static boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t);
126 static void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear);
127 static struct knote *vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process, uint64_t *next_telemetry_update);
128 static void vm_dispatch_memory_pressure(void);
129 kern_return_t memorystatus_update_vm_pressure(boolean_t target_foreground_process);
130 
#if VM_PRESSURE_EVENTS

/*
 * Threshold (in MB) that a process must meet to be considered for
 * scavenging. The default differs between macOS and other targets.
 */
#if XNU_TARGET_OS_OSX
#define VM_PRESSURE_MINIMUM_RSIZE        10    /* MB */
#else /* XNU_TARGET_OS_OSX */
#define VM_PRESSURE_MINIMUM_RSIZE        6    /* MB */
#endif /* XNU_TARGET_OS_OSX */

/* Runtime copy of the threshold; writable via sysctl on DEVELOPMENT/DEBUG kernels. */
static uint32_t vm_pressure_task_footprint_min = VM_PRESSURE_MINIMUM_RSIZE;

#if DEVELOPMENT || DEBUG
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_task_footprint_min,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_task_footprint_min, 0, "");
#endif /* DEVELOPMENT || DEBUG */

/* Current system pressure level; consulted by filt_memorystatus(). */
vm_pressure_level_t memorystatus_vm_pressure_level = kVMPressureNormal;

/*
 * Flag signalling whether there are any HWM offenders on the system.
 * It lets us cut down on wakeups of the memorystatus_thread while the
 * system sits between the "pressure" and "critical" thresholds.
 *
 * The flag is (re-)set without locks or other synchronization because
 * it is not (currently) possible to track HWM offenders that drop back
 * below their memory limit and/or exit. We accept a few wasted wakeups
 * in exchange for leaving this variable unguarded.
 */
boolean_t memorystatus_hwm_candidates = FALSE;

#endif /* VM_PRESSURE_EVENTS */
165 
166 #if CONFIG_JETSAM
167 
168 extern unsigned int memorystatus_available_pages;
169 extern unsigned int memorystatus_available_pages_pressure;
170 extern unsigned int memorystatus_available_pages_critical;
171 extern unsigned int memorystatus_available_pages_critical_base;
172 extern unsigned int memorystatus_available_pages_critical_idle_offset;
173 
174 #else /* CONFIG_JETSAM */
175 
176 extern uint64_t memorystatus_available_pages;
177 extern uint64_t memorystatus_available_pages_pressure;
178 extern uint64_t memorystatus_available_pages_critical;
179 
180 #endif /* CONFIG_JETSAM */
181 
182 extern lck_mtx_t memorystatus_jetsam_fg_band_lock;
183 uint32_t memorystatus_jetsam_fg_band_waiters = 0;
184 static uint64_t memorystatus_jetsam_fg_band_timestamp_ns = 0; /* nanosec */
185 static uint64_t memorystatus_jetsam_fg_band_delay_ns = 5ull * 1000 * 1000 * 1000; /* nanosec */
186 
187 extern boolean_t(*volatile consider_buffer_cache_collect)(int);
188 
189 #if DEVELOPMENT || DEBUG
190 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_jetsam_fg_band_delay_ns, CTLFLAG_RW | CTLFLAG_LOCKED,
191     &memorystatus_jetsam_fg_band_delay_ns, "");
192 #endif
193 
194 static int
filt_memorystatusattach(struct knote * kn,__unused struct kevent_qos_s * kev)195 filt_memorystatusattach(struct knote *kn, __unused struct kevent_qos_s *kev)
196 {
197 	int error;
198 
199 	kn->kn_flags |= EV_CLEAR; /* automatically set */
200 	kn->kn_sdata = 0;         /* incoming data is ignored */
201 	memset(&kn->kn_ext, 0, sizeof(kn->kn_ext));
202 
203 	error = memorystatus_knote_register(kn);
204 	if (error) {
205 		knote_set_error(kn, error);
206 	}
207 	return 0;
208 }
209 
/*
 * f_detach handler for the memorystatus filter: removes the knote from
 * memorystatus_klist via memorystatus_knote_unregister().
 */
static void
filt_memorystatusdetach(struct knote *kn)
{
	memorystatus_knote_unregister(kn);
}
215 
216 static int
filt_memorystatus(struct knote * kn __unused,long hint)217 filt_memorystatus(struct knote *kn __unused, long hint)
218 {
219 	if (hint) {
220 		switch (hint) {
221 		case kMemorystatusNoPressure:
222 			if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
223 				kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
224 			}
225 			break;
226 		case kMemorystatusPressure:
227 			if (memorystatus_vm_pressure_level == kVMPressureWarning || memorystatus_vm_pressure_level == kVMPressureUrgent) {
228 				if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
229 					kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
230 				}
231 			} else if (memorystatus_vm_pressure_level == kVMPressureCritical) {
232 				if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
233 					kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
234 				}
235 			}
236 			break;
237 		case kMemorystatusLowSwap:
238 			if (kn->kn_sfflags & NOTE_MEMORYSTATUS_LOW_SWAP) {
239 				kn->kn_fflags = NOTE_MEMORYSTATUS_LOW_SWAP;
240 			}
241 			break;
242 
243 		case kMemorystatusProcLimitWarn:
244 			if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
245 				kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
246 			}
247 			break;
248 
249 		case kMemorystatusProcLimitCritical:
250 			if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
251 				kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
252 			}
253 			break;
254 
255 		default:
256 			break;
257 		}
258 	}
259 
260 #if 0
261 	if (kn->kn_fflags != 0) {
262 		proc_t knote_proc = knote_get_kq(kn)->kq_p;
263 		pid_t knote_pid = proc_getpid(knote_proc);
264 
265 		printf("filt_memorystatus: sending kn 0x%lx (event 0x%x) for pid (%d)\n",
266 		    (unsigned long)kn, kn->kn_fflags, knote_pid);
267 	}
268 #endif
269 
270 	return kn->kn_fflags != 0;
271 }
272 
273 static int
filt_memorystatustouch(struct knote * kn,struct kevent_qos_s * kev)274 filt_memorystatustouch(struct knote *kn, struct kevent_qos_s *kev)
275 {
276 	int res;
277 	int prev_kn_sfflags = 0;
278 
279 	memorystatus_klist_lock();
280 
281 	/*
282 	 * copy in new kevent settings
283 	 * (saving the "desired" data and fflags).
284 	 */
285 
286 	prev_kn_sfflags = kn->kn_sfflags;
287 	kn->kn_sfflags = (kev->fflags & EVFILT_MEMORYSTATUS_ALL_MASK);
288 
289 #if XNU_TARGET_OS_OSX
290 	/*
291 	 * Only on desktop do we restrict notifications to
292 	 * one per active/inactive state (soft limits only).
293 	 */
294 	if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
295 		/*
296 		 * Is there previous state to preserve?
297 		 */
298 		if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
299 			/*
300 			 * This knote was previously interested in proc_limit_warn,
301 			 * so yes, preserve previous state.
302 			 */
303 			if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
304 				kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
305 			}
306 			if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
307 				kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
308 			}
309 		} else {
310 			/*
311 			 * This knote was not previously interested in proc_limit_warn,
312 			 * but it is now.  Set both states.
313 			 */
314 			kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
315 			kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
316 		}
317 	}
318 
319 	if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
320 		/*
321 		 * Is there previous state to preserve?
322 		 */
323 		if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
324 			/*
325 			 * This knote was previously interested in proc_limit_critical,
326 			 * so yes, preserve previous state.
327 			 */
328 			if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
329 				kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
330 			}
331 			if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
332 				kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
333 			}
334 		} else {
335 			/*
336 			 * This knote was not previously interested in proc_limit_critical,
337 			 * but it is now.  Set both states.
338 			 */
339 			kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
340 			kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
341 		}
342 	}
343 #endif /* XNU_TARGET_OS_OSX */
344 
345 	/*
346 	 * reset the output flags based on a
347 	 * combination of the old events and
348 	 * the new desired event list.
349 	 */
350 	//kn->kn_fflags &= kn->kn_sfflags;
351 
352 	res = (kn->kn_fflags != 0);
353 
354 	memorystatus_klist_unlock();
355 
356 	return res;
357 }
358 
359 static int
filt_memorystatusprocess(struct knote * kn,struct kevent_qos_s * kev)360 filt_memorystatusprocess(struct knote *kn, struct kevent_qos_s *kev)
361 {
362 	int res = 0;
363 
364 	memorystatus_klist_lock();
365 	if (kn->kn_fflags) {
366 		knote_fill_kevent(kn, kev, 0);
367 		res = 1;
368 	}
369 	memorystatus_klist_unlock();
370 
371 	return res;
372 }
373 
374 static void
memorystatus_klist_lock(void)375 memorystatus_klist_lock(void)
376 {
377 	lck_mtx_lock(&memorystatus_klist_mutex);
378 }
379 
380 static void
memorystatus_klist_unlock(void)381 memorystatus_klist_unlock(void)
382 {
383 	lck_mtx_unlock(&memorystatus_klist_mutex);
384 }
385 
386 void
memorystatus_kevent_init(lck_grp_t * grp,lck_attr_t * attr)387 memorystatus_kevent_init(lck_grp_t *grp, lck_attr_t *attr)
388 {
389 	lck_mtx_init(&memorystatus_klist_mutex, grp, attr);
390 	klist_init(&memorystatus_klist);
391 }
392 
393 int
memorystatus_knote_register(struct knote * kn)394 memorystatus_knote_register(struct knote *kn)
395 {
396 	int error = 0;
397 
398 	memorystatus_klist_lock();
399 
400 	/*
401 	 * Support only userspace visible flags.
402 	 */
403 	if ((kn->kn_sfflags & EVFILT_MEMORYSTATUS_ALL_MASK) == (unsigned int) kn->kn_sfflags) {
404 #if XNU_TARGET_OS_OSX
405 		if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
406 			kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
407 			kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
408 		}
409 
410 		if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
411 			kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
412 			kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
413 		}
414 #endif /* XNU_TARGET_OS_OSX */
415 
416 		KNOTE_ATTACH(&memorystatus_klist, kn);
417 	} else {
418 		error = ENOTSUP;
419 	}
420 
421 	memorystatus_klist_unlock();
422 
423 	return error;
424 }
425 
426 void
memorystatus_knote_unregister(struct knote * kn __unused)427 memorystatus_knote_unregister(struct knote *kn __unused)
428 {
429 	memorystatus_klist_lock();
430 	KNOTE_DETACH(&memorystatus_klist, kn);
431 	memorystatus_klist_unlock();
432 }
433 
434 #if VM_PRESSURE_EVENTS
435 
436 #if CONFIG_JETSAM
437 
438 static thread_call_t sustained_pressure_handler_thread_call;
439 int memorystatus_should_kill_on_sustained_pressure = 1;
440 /* Count the number of sustained pressure kills we've done since boot. */
441 uint64_t memorystatus_kill_on_sustained_pressure_count = 0;
442 uint64_t memorystatus_kill_on_sustained_pressure_window_s = 60 * 10; /* 10 Minutes */
443 uint64_t memorystatus_kill_on_sustained_pressure_delay_ms = 500; /* .5 seconds */
444 
445 #if DEVELOPMENT || DEBUG
446 SYSCTL_INT(_kern, OID_AUTO, memorystatus_should_kill_on_sustained_pressure, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_should_kill_on_sustained_pressure, 0, "");
447 #endif /* DEVELOPMENT || DEBUG */
448 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_count, "");
449 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_window_s, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_window_s, "");
450 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_delay_ms, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_delay_ms, "");
451 
452 static void sustained_pressure_handler(void*, void*);
453 #endif /* CONFIG_JETSAM */
454 static thread_call_t memorystatus_notify_update_telemetry_thread_call;
455 static void update_footprints_for_telemetry(void*, void*);
456 
457 
458 void
memorystatus_notify_init()459 memorystatus_notify_init()
460 {
461 #if CONFIG_JETSAM
462 	sustained_pressure_handler_thread_call = thread_call_allocate_with_options(sustained_pressure_handler, NULL, THREAD_CALL_PRIORITY_KERNEL_HIGH, THREAD_CALL_OPTIONS_ONCE);
463 #endif /* CONFIG_JETSAM */
464 	memorystatus_notify_update_telemetry_thread_call = thread_call_allocate_with_options(update_footprints_for_telemetry, NULL, THREAD_CALL_PRIORITY_USER, THREAD_CALL_OPTIONS_ONCE);
465 }
466 
467 #if CONFIG_MEMORYSTATUS
468 
469 inline int
memorystatus_send_note(int event_code,void * data,uint32_t data_length)470 memorystatus_send_note(int event_code, void *data, uint32_t data_length)
471 {
472 	int ret;
473 	struct kev_msg ev_msg;
474 
475 	ev_msg.vendor_code    = KEV_VENDOR_APPLE;
476 	ev_msg.kev_class      = KEV_SYSTEM_CLASS;
477 	ev_msg.kev_subclass   = KEV_MEMORYSTATUS_SUBCLASS;
478 
479 	ev_msg.event_code     = event_code;
480 
481 	ev_msg.dv[0].data_length = data_length;
482 	ev_msg.dv[0].data_ptr = data;
483 	ev_msg.dv[1].data_length = 0;
484 
485 	ret = kev_post_msg(&ev_msg);
486 	if (ret) {
487 		printf("%s: kev_post_msg() failed, err %d\n", __func__, ret);
488 	}
489 
490 	return ret;
491 }
492 
493 boolean_t
memorystatus_warn_process(const proc_t p,__unused boolean_t is_active,__unused boolean_t is_fatal,boolean_t limit_exceeded)494 memorystatus_warn_process(const proc_t p, __unused boolean_t is_active, __unused boolean_t is_fatal, boolean_t limit_exceeded)
495 {
496 	/*
497 	 * This function doesn't take a reference to p or lock it. So it better be the current process.
498 	 */
499 	assert(p == current_proc());
500 	pid_t pid = proc_getpid(p);
501 	boolean_t ret = FALSE;
502 	boolean_t found_knote = FALSE;
503 	struct knote *kn = NULL;
504 	int send_knote_count = 0;
505 	uint32_t platform;
506 	platform = proc_platform(p);
507 
508 	/*
509 	 * See comment in sysctl_memorystatus_vm_pressure_send.
510 	 */
511 
512 	memorystatus_klist_lock();
513 
514 	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
515 		proc_t knote_proc = knote_get_kq(kn)->kq_p;
516 		pid_t knote_pid = proc_getpid(knote_proc);
517 
518 		if (knote_pid == pid) {
519 			/*
520 			 * By setting the "fflags" here, we are forcing
521 			 * a process to deal with the case where it's
522 			 * bumping up into its memory limits. If we don't
523 			 * do this here, we will end up depending on the
524 			 * system pressure snapshot evaluation in
525 			 * filt_memorystatus().
526 			 */
527 
528 			/*
529 			 * The type of notification and the frequency are different between
530 			 * embedded and desktop.
531 			 *
532 			 * Embedded processes register for global pressure notifications
533 			 * (NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) via UIKit
534 			 * (see applicationDidReceiveMemoryWarning in UIKit). We'll warn them here if
535 			 * they are near there memory limit. filt_memorystatus() will warn them based
536 			 * on the system pressure level.
537 			 *
538 			 * On desktop, (NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL)
539 			 * are only expected to fire for system level warnings. Desktop procesess
540 			 * register for NOTE_MEMORYSTATUS_PROC_LIMIT_WARN
541 			 * if they want to be warned when they approach their limit
542 			 * and for NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL to be warned when they
543 			 * exceed their limit.
544 			 *
545 			 * On embedded we continuously warn processes that are approaching their
546 			 * memory limit. However on desktop, we only send one warning while
547 			 * the process is active/inactive if the limit is soft..
548 			 *
549 			 */
550 			if (platform == PLATFORM_MACOS || platform == PLATFORM_MACCATALYST || platform == PLATFORM_DRIVERKIT) {
551 				if (!limit_exceeded) {
552 					if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
553 						found_knote = TRUE;
554 						if (!is_fatal) {
555 							/*
556 							 * Restrict proc_limit_warn notifications when
557 							 * non-fatal (soft) limit is at play.
558 							 */
559 							if (is_active) {
560 								if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
561 									/*
562 									 * Mark this knote for delivery.
563 									 */
564 									kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
565 									/*
566 									 * And suppress it from future notifications.
567 									 */
568 									kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
569 									send_knote_count++;
570 								}
571 							} else {
572 								if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
573 									/*
574 									 * Mark this knote for delivery.
575 									 */
576 									kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
577 									/*
578 									 * And suppress it from future notifications.
579 									 */
580 									kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
581 									send_knote_count++;
582 								}
583 							}
584 						} else {
585 							/*
586 							 * No restriction on proc_limit_warn notifications when
587 							 * fatal (hard) limit is at play.
588 							 */
589 							kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
590 							send_knote_count++;
591 						}
592 					}
593 				} else {
594 					/*
595 					 * Send this notification when a process has exceeded a soft limit,
596 					 */
597 
598 					if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
599 						found_knote = TRUE;
600 						if (!is_fatal) {
601 							/*
602 							 * Restrict critical notifications for soft limits.
603 							 */
604 
605 							if (is_active) {
606 								if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
607 									/*
608 									 * Suppress future proc_limit_critical notifications
609 									 * for the active soft limit.
610 									 */
611 									kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
612 									kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
613 									send_knote_count++;
614 								}
615 							} else {
616 								if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
617 									/*
618 									 * Suppress future proc_limit_critical_notifications
619 									 * for the inactive soft limit.
620 									 */
621 									kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
622 									kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
623 									send_knote_count++;
624 								}
625 							}
626 						} else {
627 							/*
628 							 * We should never be trying to send a critical notification for
629 							 * a hard limit... the process would be killed before it could be
630 							 * received.
631 							 */
632 							panic("Caught sending pid %d a critical warning for a fatal limit.", pid);
633 						}
634 					}
635 				}
636 			} else {
637 				if (!limit_exceeded) {
638 					/*
639 					 * Intentionally set either the unambiguous limit warning,
640 					 * the system-wide critical or the system-wide warning
641 					 * notification bit.
642 					 */
643 
644 					if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
645 						kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
646 						found_knote = TRUE;
647 						send_knote_count++;
648 					} else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
649 						kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
650 						found_knote = TRUE;
651 						send_knote_count++;
652 					} else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
653 						kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
654 						found_knote = TRUE;
655 						send_knote_count++;
656 					}
657 				} else {
658 					/*
659 					 * Send this notification when a process has exceeded a soft limit.
660 					 */
661 					if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
662 						kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
663 						found_knote = TRUE;
664 						send_knote_count++;
665 					}
666 				}
667 			}
668 		}
669 	}
670 
671 	if (found_knote) {
672 		if (send_knote_count > 0) {
673 			KNOTE(&memorystatus_klist, 0);
674 		}
675 		ret = TRUE;
676 	}
677 
678 	memorystatus_klist_unlock();
679 
680 	return ret;
681 }
682 
683 /*
684  * Can only be set by the current task on itself.
685  */
686 int
memorystatus_low_mem_privileged_listener(uint32_t op_flags)687 memorystatus_low_mem_privileged_listener(uint32_t op_flags)
688 {
689 	boolean_t set_privilege = FALSE;
690 	/*
691 	 * Need an entitlement check here?
692 	 */
693 	if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE) {
694 		set_privilege = TRUE;
695 	} else if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE) {
696 		set_privilege = FALSE;
697 	} else {
698 		return EINVAL;
699 	}
700 
701 	return task_low_mem_privileged_listener(current_task(), set_privilege, NULL);
702 }
703 
704 int
memorystatus_send_pressure_note(pid_t pid)705 memorystatus_send_pressure_note(pid_t pid)
706 {
707 	MEMORYSTATUS_DEBUG(1, "memorystatus_send_pressure_note(): pid %d\n", pid);
708 	return memorystatus_send_note(kMemorystatusPressureNote, &pid, sizeof(pid));
709 }
710 
711 boolean_t
memorystatus_is_foreground_locked(proc_t p)712 memorystatus_is_foreground_locked(proc_t p)
713 {
714 	return (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND) ||
715 	       (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND_SUPPORT);
716 }
717 
718 /*
719  * This is meant for stackshot and kperf -- it does not take the proc_list_lock
720  * to access the p_memstat_dirty field.
721  */
722 void
memorystatus_proc_flags_unsafe(void * v,boolean_t * is_dirty,boolean_t * is_dirty_tracked,boolean_t * allow_idle_exit)723 memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit)
724 {
725 	if (!v) {
726 		*is_dirty = FALSE;
727 		*is_dirty_tracked = FALSE;
728 		*allow_idle_exit = FALSE;
729 	} else {
730 		proc_t p = (proc_t)v;
731 		*is_dirty = (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) != 0;
732 		*is_dirty_tracked = (p->p_memstat_dirty & P_DIRTY_TRACK) != 0;
733 		*allow_idle_exit = (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) != 0;
734 	}
735 }
736 
737 boolean_t
memorystatus_bg_pressure_eligible(proc_t p)738 memorystatus_bg_pressure_eligible(proc_t p)
739 {
740 	boolean_t eligible = FALSE;
741 
742 	proc_list_lock();
743 
744 	MEMORYSTATUS_DEBUG(1, "memorystatus_bg_pressure_eligible: pid %d, state 0x%x\n", proc_getpid(p), p->p_memstat_state);
745 
746 	/* Foreground processes have already been dealt with at this point, so just test for eligibility */
747 	if (!(p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN))) {
748 		eligible = TRUE;
749 	}
750 
751 	if (p->p_memstat_effectivepriority < JETSAM_PRIORITY_BACKGROUND_OPPORTUNISTIC) {
752 		/*
753 		 * IDLE and IDLE_DEFERRED bands contain processes
754 		 * that have dropped memory to be under their inactive
755 		 * memory limits. And so they can't really give back
756 		 * anything.
757 		 */
758 		eligible = FALSE;
759 	}
760 
761 	proc_list_unlock();
762 
763 	return eligible;
764 }
765 
766 void
memorystatus_send_low_swap_note(void)767 memorystatus_send_low_swap_note(void)
768 {
769 	struct knote *kn = NULL;
770 
771 	memorystatus_klist_lock();
772 	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
773 		/* We call is_knote_registered_modify_task_pressure_bits to check if the sfflags for the
774 		 * current note contain NOTE_MEMORYSTATUS_LOW_SWAP. Once we find one note in the memorystatus_klist
775 		 * that has the NOTE_MEMORYSTATUS_LOW_SWAP flags in its sfflags set, we call KNOTE with
776 		 * kMemoryStatusLowSwap as the hint to process and update all knotes on the memorystatus_klist accordingly. */
777 		if (is_knote_registered_modify_task_pressure_bits(kn, NOTE_MEMORYSTATUS_LOW_SWAP, NULL, 0, 0) == TRUE) {
778 			KNOTE(&memorystatus_klist, kMemorystatusLowSwap);
779 			break;
780 		}
781 	}
782 
783 	memorystatus_klist_unlock();
784 }
785 
786 #endif /* CONFIG_MEMORYSTATUS */
787 
788 /*
789  * Notification telemetry
790  */
791 CA_EVENT(memorystatus_pressure_interval,
792     CA_INT, num_processes_registered,
793     CA_INT, num_notifications_sent,
794     CA_INT, max_level,
795     CA_INT, num_transitions,
796     CA_INT, num_kills,
797     CA_INT, duration);
798 static CA_EVENT_TYPE(memorystatus_pressure_interval) memorystatus_pressure_interval_telemetry;
799 
800 CA_EVENT(memorystatus_proc_notification,
801     CA_INT, footprint_before_notification,
802     CA_INT, footprint_1_min_after_first_warning,
803     CA_INT, footprint_5_min_after_first_warning,
804     CA_INT, footprint_20_min_after_first_warning,
805     CA_INT, footprint_1_min_after_first_critical,
806     CA_INT, footprint_5_min_after_first_critical,
807     CA_INT, footprint_20_min_after_first_critical,
808     CA_INT, order_within_list,
809     CA_INT, num_notifications_sent,
810     CA_INT, time_between_warning_and_critical,
811     CA_STATIC_STRING(CA_PROCNAME_LEN), proc_name);
812 
813 /* The send timestamps for the first notifications are stored in the knote's kn_sdata field */
814 #define KNOTE_SEND_TIMESTAMP_WARNING_INDEX 0
815 #define KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX 1
816 
817 /* The footprint history for this task is stored in the knote's kn_ext array. */
818 struct knote_footprint_history {
819 	uint32_t kfh_starting_footprint;
820 	uint32_t kfh_footprint_after_warn_1; /* 1 minute after first warning notification */
821 	uint32_t kfh_footprint_after_warn_5; /* 5 minutes after first warning notification */
822 	uint32_t kfh_footprint_after_warn_20; /* 20 minutes after first warning notification */
823 	uint32_t kfh_footprint_after_critical_1; /* 1 minute after first critical notification */
824 	uint32_t kfh_footprint_after_critical_5; /* 5 minutes after first critical notification */
825 	uint32_t kfh_footprint_after_critical_20; /* 20 minutes after first critical notification */
826 	uint16_t kfh_num_notifications;
827 	uint16_t kfh_notification_order;
828 } __attribute__((packed));
829 
830 
831 static_assert(sizeof(struct knote_footprint_history) <= sizeof(uint64_t) * 4, "footprint history fits in knote extensions");
832 
833 static void
mark_knote_send_time(struct knote * kn,task_t task,int knote_pressure_level,uint16_t order_within_list)834 mark_knote_send_time(struct knote *kn, task_t task, int knote_pressure_level, uint16_t order_within_list)
835 {
836 	uint32_t *timestamps;
837 	uint32_t index;
838 	uint64_t curr_ts, curr_ts_seconds;
839 	struct knote_footprint_history *footprint_history = (struct knote_footprint_history *)kn->kn_ext;
840 	if (knote_pressure_level != NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
841 		timestamps = (uint32_t *)&(kn->kn_sdata);
842 		index = knote_pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN ?
843 		    KNOTE_SEND_TIMESTAMP_WARNING_INDEX : KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX;
844 		if (timestamps[index] == 0) {
845 			/* First notification for this level since pressure elevated from normal. */
846 			curr_ts = mach_absolute_time();
847 			curr_ts_seconds = 0;
848 			absolutetime_to_nanoseconds(curr_ts, &curr_ts_seconds);
849 			curr_ts_seconds /= NSEC_PER_SEC;
850 
851 			timestamps[index] = (uint32_t)MIN(UINT32_MAX, curr_ts_seconds);
852 
853 			/* Record task initial footprint */
854 			if (timestamps[index == KNOTE_SEND_TIMESTAMP_WARNING_INDEX ? KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX : KNOTE_SEND_TIMESTAMP_WARNING_INDEX] == 0) {
855 				/*
856 				 * First notification at any level since pressure elevated from normal.
857 				 * Record the footprint and our order in the notification list.
858 				 */
859 				footprint_history->kfh_starting_footprint = (uint32_t) MIN(UINT32_MAX, get_task_phys_footprint(task) / (2UL << 20));
860 				footprint_history->kfh_notification_order = order_within_list;
861 			}
862 		}
863 	}
864 	footprint_history->kfh_num_notifications++;
865 }
866 
867 /*
868  * Records the current footprint for this task in the knote telemetry.
869  *
870  * Returns the soonest absolutetime when this footprint history should be updated again.
871  */
872 static uint64_t
update_knote_footprint_history(struct knote * kn,task_t task,uint64_t curr_ts)873 update_knote_footprint_history(struct knote *kn, task_t task, uint64_t curr_ts)
874 {
875 	uint32_t *timestamps = (uint32_t *)&(kn->kn_sdata);
876 	struct knote_footprint_history *footprint_history = (struct knote_footprint_history *)kn->kn_ext;
877 	uint64_t warning_send_time, critical_send_time, minutes_since_warning = UINT64_MAX, minutes_since_critical = UINT64_MAX;
878 	warning_send_time = timestamps[KNOTE_SEND_TIMESTAMP_WARNING_INDEX];
879 	critical_send_time = timestamps[KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX];
880 	uint32_t task_phys_footprint_mb = (uint32_t) MIN(UINT32_MAX, get_task_phys_footprint(task) / (2UL << 20));
881 	uint64_t next_run = UINT64_MAX, absolutetime_in_minute = 0, minutes_since_last_notification = 0, curr_ts_s;
882 	absolutetime_to_nanoseconds(curr_ts, &curr_ts_s);
883 	nanoseconds_to_absolutetime(60 * NSEC_PER_SEC, &absolutetime_in_minute);
884 	curr_ts_s /= NSEC_PER_SEC;
885 
886 	if (warning_send_time != 0) {
887 		/* This task received a warning notification. */
888 		minutes_since_warning = (curr_ts_s - warning_send_time) / 60;
889 		if (footprint_history->kfh_footprint_after_warn_1 == 0 && minutes_since_warning >= 1) {
890 			footprint_history->kfh_footprint_after_warn_1 = task_phys_footprint_mb;
891 		}
892 		if (footprint_history->kfh_footprint_after_warn_5 == 0 && minutes_since_warning >= 5) {
893 			footprint_history->kfh_footprint_after_warn_5 = task_phys_footprint_mb;
894 		}
895 		if (footprint_history->kfh_footprint_after_warn_20 == 0 && minutes_since_warning >= 20) {
896 			footprint_history->kfh_footprint_after_warn_20 = task_phys_footprint_mb;
897 		}
898 	}
899 	if (critical_send_time != 0) {
900 		/* This task received a critical notification. */
901 		minutes_since_critical = (curr_ts_s - critical_send_time) / 60;
902 		if (footprint_history->kfh_footprint_after_critical_1 == 0 && minutes_since_critical >= 1) {
903 			footprint_history->kfh_footprint_after_critical_1 = task_phys_footprint_mb;
904 		}
905 		if (footprint_history->kfh_footprint_after_critical_5 == 0 && minutes_since_critical >= 5) {
906 			footprint_history->kfh_footprint_after_critical_5 = task_phys_footprint_mb;
907 		}
908 		if (footprint_history->kfh_footprint_after_critical_20 == 0 && minutes_since_critical >= 20) {
909 			footprint_history->kfh_footprint_after_critical_20 = task_phys_footprint_mb;
910 		}
911 	}
912 
913 	minutes_since_last_notification = MIN(minutes_since_warning, minutes_since_critical);
914 	if (minutes_since_last_notification < 20) {
915 		if (minutes_since_last_notification < 5) {
916 			if (minutes_since_last_notification < 1) {
917 				next_run = curr_ts + absolutetime_in_minute;
918 			} else {
919 				next_run = curr_ts + (absolutetime_in_minute * 5);
920 			}
921 		} else {
922 			next_run = curr_ts + (absolutetime_in_minute * 20);
923 		}
924 	}
925 
926 	return next_run;
927 }
928 
929 extern char *proc_name_address(void *p);
930 /*
931  * Attempt to send the given level telemetry event.
932  * Finalizes the duration.
933  * Clears the src_event struct.
934  */
935 static void
memorystatus_pressure_interval_send(CA_EVENT_TYPE (memorystatus_pressure_interval)* src_event)936 memorystatus_pressure_interval_send(CA_EVENT_TYPE(memorystatus_pressure_interval) *src_event)
937 {
938 	uint64_t duration_nanoseconds = 0;
939 	uint64_t             curr_ts = mach_absolute_time();
940 	src_event->duration = curr_ts - src_event->duration;
941 	absolutetime_to_nanoseconds(src_event->duration, &duration_nanoseconds);
942 	src_event->duration = (int64_t) (duration_nanoseconds / NSEC_PER_SEC);
943 
944 	/*
945 	 * Drop the event rather than block for memory. We should be in a normal pressure level now,
946 	 * but we don't want to end up blocked in page_wait if there's a sudden spike in pressure.
947 	 */
948 	ca_event_t event_wrapper = CA_EVENT_ALLOCATE_FLAGS(memorystatus_pressure_interval, Z_NOWAIT);
949 	if (event_wrapper) {
950 		memcpy(event_wrapper->data, src_event, sizeof(CA_EVENT_TYPE(memorystatus_pressure_interval)));
951 		CA_EVENT_SEND(event_wrapper);
952 	}
953 	src_event->num_processes_registered = 0;
954 	src_event->num_notifications_sent = 0;
955 	src_event->max_level = 0;
956 	src_event->num_transitions = 0;
957 	src_event->num_kills = 0;
958 	src_event->duration = 0;
959 }
960 
961 
962 /*
963  * Attempt to send the per-proc telemetry events.
964  * Clears the footprint histories on the knotes.
965  */
966 static void
memorystatus_pressure_proc_telemetry_send(void)967 memorystatus_pressure_proc_telemetry_send(void)
968 {
969 	struct knote *kn = NULL;
970 	memorystatus_klist_lock();
971 	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
972 		proc_t            p = PROC_NULL;
973 		struct knote_footprint_history *footprint_history = (struct knote_footprint_history *)kn->kn_ext;
974 		uint32_t *timestamps = (uint32_t *)&(kn->kn_sdata);
975 		uint32_t warning_send_time = timestamps[KNOTE_SEND_TIMESTAMP_WARNING_INDEX];
976 		uint32_t critical_send_time = timestamps[KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX];
977 		CA_EVENT_TYPE(memorystatus_proc_notification) * event = NULL;
978 		if (warning_send_time != 0 || critical_send_time != 0) {
979 			/*
980 			 * Drop the event rather than block for memory. We should be in a normal pressure level now,
981 			 * but we don't want to end up blocked in page_wait if there's a sudden spike in pressure.
982 			 */
983 			ca_event_t event_wrapper = CA_EVENT_ALLOCATE_FLAGS(memorystatus_proc_notification, Z_NOWAIT | Z_ZERO);
984 			if (event_wrapper) {
985 				event = event_wrapper->data;
986 
987 				event->footprint_before_notification = footprint_history->kfh_starting_footprint;
988 				event->footprint_1_min_after_first_warning = footprint_history->kfh_footprint_after_warn_1;
989 				event->footprint_5_min_after_first_warning = footprint_history->kfh_footprint_after_warn_5;
990 				event->footprint_20_min_after_first_warning = footprint_history->kfh_footprint_after_warn_20;
991 				event->footprint_1_min_after_first_critical = footprint_history->kfh_footprint_after_critical_1;
992 				event->footprint_5_min_after_first_critical = footprint_history->kfh_footprint_after_critical_5;
993 				event->footprint_20_min_after_first_critical = footprint_history->kfh_footprint_after_critical_20;
994 				event->num_notifications_sent = footprint_history->kfh_num_notifications;
995 				if (warning_send_time != 0 && critical_send_time != 0) {
996 					event->time_between_warning_and_critical = (critical_send_time - warning_send_time) / 60; // Minutes
997 				}
998 				event->order_within_list = footprint_history->kfh_notification_order;
999 
1000 				p = proc_ref(knote_get_kq(kn)->kq_p, false);
1001 				if (p == NULL) {
1002 					CA_EVENT_DEALLOCATE(event_wrapper);
1003 					continue;
1004 				}
1005 				strlcpy(event->proc_name, proc_name_address(p), sizeof(event->proc_name));
1006 
1007 				proc_rele(p);
1008 				CA_EVENT_SEND(event_wrapper);
1009 			}
1010 		}
1011 		memset(footprint_history, 0, sizeof(*footprint_history));
1012 		timestamps[KNOTE_SEND_TIMESTAMP_WARNING_INDEX] = 0;
1013 		timestamps[KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX] = 0;
1014 	}
1015 	memorystatus_klist_unlock();
1016 }
1017 
1018 /*
1019  * Send all telemetry associated with the increased pressure interval.
1020  */
1021 static void
memorystatus_pressure_telemetry_send(void)1022 memorystatus_pressure_telemetry_send(void)
1023 {
1024 	LCK_MTX_ASSERT(&memorystatus_klist_mutex, LCK_MTX_ASSERT_NOTOWNED);
1025 	memorystatus_pressure_interval_send(&memorystatus_pressure_interval_telemetry);
1026 	memorystatus_pressure_proc_telemetry_send();
1027 }
1028 
1029 
1030 /*
1031  * kn_max - knote
1032  *
1033  * knote_pressure_level - to check if the knote is registered for this notification level.
1034  *
1035  * task    - task whose bits we'll be modifying
1036  *
1037  * pressure_level_to_clear - if the task has been notified of this past level, clear that notification bit so that if/when we revert to that level, the task will be notified again.
1038  *
1039  * pressure_level_to_set - the task is about to be notified of this new level. Update the task's bit notification information appropriately.
1040  *
1041  */
1042 
1043 static boolean_t
is_knote_registered_modify_task_pressure_bits(struct knote * kn_max,int knote_pressure_level,task_t task,vm_pressure_level_t pressure_level_to_clear,vm_pressure_level_t pressure_level_to_set)1044 is_knote_registered_modify_task_pressure_bits(struct knote *kn_max, int knote_pressure_level, task_t task, vm_pressure_level_t pressure_level_to_clear, vm_pressure_level_t pressure_level_to_set)
1045 {
1046 	if (kn_max->kn_sfflags & knote_pressure_level) {
1047 		if (pressure_level_to_clear && task_has_been_notified(task, pressure_level_to_clear) == TRUE) {
1048 			task_clear_has_been_notified(task, pressure_level_to_clear);
1049 		}
1050 
1051 		task_mark_has_been_notified(task, pressure_level_to_set);
1052 		return TRUE;
1053 	}
1054 
1055 	return FALSE;
1056 }
1057 
1058 static void
memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear)1059 memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear)
1060 {
1061 	struct knote *kn = NULL;
1062 
1063 	memorystatus_klist_lock();
1064 
1065 	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1066 		proc_t p = knote_get_kq(kn)->kq_p;
1067 
1068 		if (p == proc_ref(p, false)) {
1069 			task_clear_has_been_notified(proc_task(p), pressure_level_to_clear);
1070 			proc_rele(p);
1071 		}
1072 	}
1073 
1074 	memorystatus_klist_unlock();
1075 }
1076 
/*
 * Entry point used by the vm_pressure_thread, which is signalled from
 * within vm_pageout_scan(). It only forwards to
 * vm_dispatch_memory_pressure().
 */

void
consider_vm_pressure_events(void)
{
	vm_dispatch_memory_pressure();
}
1087 
1088 static void
vm_dispatch_memory_pressure(void)1089 vm_dispatch_memory_pressure(void)
1090 {
1091 	memorystatus_update_vm_pressure(FALSE);
1092 }
1093 
1094 static struct knote *
vm_pressure_select_optimal_candidate_to_notify(struct klist * candidate_list,int level,boolean_t target_foreground_process,uint64_t * next_telemetry_update)1095 vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process, uint64_t *next_telemetry_update)
1096 {
1097 	struct knote    *kn = NULL, *kn_max = NULL;
1098 	uint64_t    resident_max = 0;/* MB */
1099 	int        selected_task_importance = 0;
1100 	static int    pressure_snapshot = -1;
1101 	boolean_t    pressure_increase = FALSE;
1102 	uint64_t     curr_ts = mach_absolute_time();
1103 	*next_telemetry_update = UINT64_MAX;
1104 
1105 	if (pressure_snapshot == -1) {
1106 		/*
1107 		 * Initial snapshot.
1108 		 */
1109 		pressure_snapshot = level;
1110 		pressure_increase = TRUE;
1111 	} else {
1112 		if (level && (level >= pressure_snapshot)) {
1113 			pressure_increase = TRUE;
1114 		} else {
1115 			pressure_increase = FALSE;
1116 		}
1117 
1118 		pressure_snapshot = level;
1119 	}
1120 
1121 	if (pressure_increase == TRUE) {
1122 		/*
1123 		 * We'll start by considering the largest
1124 		 * unimportant task in our list.
1125 		 */
1126 		selected_task_importance = INT_MAX;
1127 	} else {
1128 		/*
1129 		 * We'll start by considering the largest
1130 		 * important task in our list.
1131 		 */
1132 		selected_task_importance = 0;
1133 	}
1134 
1135 	SLIST_FOREACH(kn, candidate_list, kn_selnext) {
1136 		uint64_t        resident_size = 0;/* MB */
1137 		proc_t            p = PROC_NULL;
1138 		struct task*        t = TASK_NULL;
1139 		int            curr_task_importance = 0;
1140 		uint64_t         telemetry_update = 0;
1141 		boolean_t        consider_knote = FALSE;
1142 		boolean_t        privileged_listener = FALSE;
1143 
1144 		p = proc_ref(knote_get_kq(kn)->kq_p, false);
1145 		if (p == PROC_NULL) {
1146 			continue;
1147 		}
1148 
1149 #if CONFIG_MEMORYSTATUS
1150 		if (target_foreground_process == TRUE && !memorystatus_is_foreground_locked(p)) {
1151 			/*
1152 			 * Skip process not marked foreground.
1153 			 */
1154 			proc_rele(p);
1155 			continue;
1156 		}
1157 #endif /* CONFIG_MEMORYSTATUS */
1158 
1159 		t = (struct task *)(proc_task(p));
1160 		telemetry_update = update_knote_footprint_history(kn, t, curr_ts);
1161 		*next_telemetry_update = MIN(*next_telemetry_update, telemetry_update);
1162 
1163 		vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(level);
1164 
1165 		if ((kn->kn_sfflags & dispatch_level) == 0) {
1166 			proc_rele(p);
1167 			continue;
1168 		}
1169 
1170 #if CONFIG_MEMORYSTATUS
1171 		if (target_foreground_process == FALSE && !memorystatus_bg_pressure_eligible(p)) {
1172 			VM_PRESSURE_DEBUG(1, "[vm_pressure] skipping process %d\n", proc_getpid(p));
1173 			proc_rele(p);
1174 			continue;
1175 		}
1176 #endif /* CONFIG_MEMORYSTATUS */
1177 
1178 #if XNU_TARGET_OS_OSX
1179 		curr_task_importance = task_importance_estimate(t);
1180 #else /* XNU_TARGET_OS_OSX */
1181 		curr_task_importance = p->p_memstat_effectivepriority;
1182 #endif /* XNU_TARGET_OS_OSX */
1183 
1184 		/*
1185 		 * Privileged listeners are only considered in the multi-level pressure scheme
1186 		 * AND only if the pressure is increasing.
1187 		 */
1188 		if (level > 0) {
1189 			if (task_has_been_notified(t, level) == FALSE) {
1190 				/*
1191 				 * Is this a privileged listener?
1192 				 */
1193 				if (task_low_mem_privileged_listener(t, FALSE, &privileged_listener) == 0) {
1194 					if (privileged_listener) {
1195 						kn_max = kn;
1196 						proc_rele(p);
1197 						goto done_scanning;
1198 					}
1199 				}
1200 			} else {
1201 				proc_rele(p);
1202 				continue;
1203 			}
1204 		} else if (level == 0) {
1205 			/*
1206 			 * Task wasn't notified when the pressure was increasing and so
1207 			 * no need to notify it that the pressure is decreasing.
1208 			 */
1209 			if ((task_has_been_notified(t, kVMPressureWarning) == FALSE) && (task_has_been_notified(t, kVMPressureCritical) == FALSE)) {
1210 				proc_rele(p);
1211 				continue;
1212 			}
1213 		}
1214 
1215 		/*
1216 		 * We don't want a small process to block large processes from
1217 		 * being notified again. <rdar://problem/7955532>
1218 		 */
1219 		resident_size = (get_task_phys_footprint(t)) / (1024 * 1024ULL); /* MB */
1220 
1221 		if (resident_size >= vm_pressure_task_footprint_min) {
1222 			if (level > 0) {
1223 				/*
1224 				 * Warning or Critical Pressure.
1225 				 */
1226 				if (pressure_increase) {
1227 					if ((curr_task_importance < selected_task_importance) ||
1228 					    ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
1229 						/*
1230 						 * We have found a candidate process which is:
1231 						 * a) at a lower importance than the current selected process
1232 						 * OR
1233 						 * b) has importance equal to that of the current selected process but is larger
1234 						 */
1235 
1236 						consider_knote = TRUE;
1237 					}
1238 				} else {
1239 					if ((curr_task_importance > selected_task_importance) ||
1240 					    ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
1241 						/*
1242 						 * We have found a candidate process which is:
1243 						 * a) at a higher importance than the current selected process
1244 						 * OR
1245 						 * b) has importance equal to that of the current selected process but is larger
1246 						 */
1247 
1248 						consider_knote = TRUE;
1249 					}
1250 				}
1251 			} else if (level == 0) {
1252 				/*
1253 				 * Pressure back to normal.
1254 				 */
1255 				if ((curr_task_importance > selected_task_importance) ||
1256 				    ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
1257 					consider_knote = TRUE;
1258 				}
1259 			}
1260 
1261 			if (consider_knote) {
1262 				resident_max = resident_size;
1263 				kn_max = kn;
1264 				selected_task_importance = curr_task_importance;
1265 				consider_knote = FALSE; /* reset for the next candidate */
1266 			}
1267 		} else {
1268 			/* There was no candidate with enough resident memory to scavenge */
1269 			VM_PRESSURE_DEBUG(0, "[vm_pressure] threshold failed for pid %d with %llu resident...\n", proc_getpid(p), resident_size);
1270 		}
1271 		proc_rele(p);
1272 	}
1273 
1274 done_scanning:
1275 	if (kn_max) {
1276 		VM_DEBUG_CONSTANT_EVENT(vm_pressure_event, VM_PRESSURE_EVENT, DBG_FUNC_NONE, proc_getpid(knote_get_kq(kn_max)->kq_p), resident_max, 0, 0);
1277 		VM_PRESSURE_DEBUG(1, "[vm_pressure] sending event to pid %d with %llu resident\n", proc_getpid(knote_get_kq(kn_max)->kq_p), resident_max);
1278 	}
1279 
1280 	return kn_max;
1281 }
1282 
1283 /*
1284  * To avoid notification storms in a system with sawtooth behavior of pressure levels eg:
1285  * Normal -> warning (notify clients) -> critical (notify) -> warning (notify) -> critical (notify) -> warning (notify)...
1286  *
1287  * We have 'resting' periods: WARNING_NOTIFICATION_RESTING_PERIOD and CRITICAL_NOTIFICATION_RESTING_PERIOD
1288  *
1289  * So it would look like:-
1290  * Normal -> warning (notify) -> critical (notify) -> warning (notify if it has been RestPeriod since last warning) -> critical (notify if it has been RestPeriod since last critical) -> ...
1291  *
1292  * That's what these 2 timestamps below signify.
1293  */
1294 
1295 uint64_t next_warning_notification_sent_at_ts = 0;
1296 uint64_t next_critical_notification_sent_at_ts = 0;
1297 
1298 boolean_t        memorystatus_manual_testing_on = FALSE;
1299 vm_pressure_level_t    memorystatus_manual_testing_level = kVMPressureNormal;
1300 
1301 unsigned int memorystatus_sustained_pressure_maximum_band = JETSAM_PRIORITY_IDLE;
1302 #if DEVELOPMENT || DEBUG
1303 SYSCTL_INT(_kern, OID_AUTO, memorystatus_sustained_pressure_maximum_band, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_sustained_pressure_maximum_band, 0, "");
1304 #endif /* DEVELOPMENT || DEBUG */
1305 
1306 #if CONFIG_JETSAM
1307 
1308 /*
1309  * TODO(jason): The memorystatus thread should be responsible for this
1310  * It can just check how long the pressure level has been at warning and the timestamp
1311  * of the last sustained pressure kill.
1312  */
1313 static void
sustained_pressure_handler(void * arg0 __unused,void * arg1 __unused)1314 sustained_pressure_handler(void* arg0 __unused, void* arg1 __unused)
1315 {
1316 	int max_kills = 0, kill_count = 0;
1317 	/*
1318 	 * Pressure has been elevated for too long.
1319 	 * We don't want to leave the system in this state as it can delay background
1320 	 * work indefinitely & drain battery.
1321 	 *
1322 	 * Try to return the system to normal via jetsam.
1323 	 * We'll run through the idle band up to 2 times.
1324 	 * If the pressure hasn't been relieved by then, the problem is memory
1325 	 * consumption in a higher band and this churn is probably doing more harm than good.
1326 	 */
1327 	max_kills = memorystatus_get_proccnt_upto_priority(memorystatus_sustained_pressure_maximum_band) * 2;
1328 	os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: Pressure level has been elevated for too long. killing up to %d idle processes", max_kills);
1329 	while (memorystatus_vm_pressure_level != kVMPressureNormal && kill_count < max_kills) {
1330 		boolean_t killed = memorystatus_kill_on_sustained_pressure();
1331 		if (killed) {
1332 			/*
1333 			 * Pause before our next kill & see if pressure reduces.
1334 			 */
1335 			delay((int)(memorystatus_kill_on_sustained_pressure_delay_ms * NSEC_PER_MSEC / NSEC_PER_USEC));
1336 			kill_count++;
1337 			memorystatus_kill_on_sustained_pressure_count++;
1338 			/* TODO(jason): Should use os_atomic but requires rdar://76310894. */
1339 			memorystatus_pressure_interval_telemetry.num_kills++;
1340 		} else {
1341 			/* Nothing left to kill */
1342 			break;
1343 		}
1344 	}
1345 	if (memorystatus_vm_pressure_level != kVMPressureNormal) {
1346 		os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: Killed %d idle processes due to sustained pressure, but device didn't quiesce. Giving up.", kill_count);
1347 	}
1348 }
1349 
1350 #endif /* CONFIG_JETSAM */
1351 
1352 /*
1353  * Returns the number of processes registered for notifications at this level.
1354  */
1355 static size_t
memorystatus_klist_length(int level)1356 memorystatus_klist_length(int level)
1357 {
1358 	LCK_MTX_ASSERT(&memorystatus_klist_mutex, LCK_MTX_ASSERT_OWNED);
1359 	struct knote *kn;
1360 	size_t count = 0;
1361 	int knote_pressure_level = convert_internal_pressure_level_to_dispatch_level(level);
1362 	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1363 		if (kn->kn_sfflags & knote_pressure_level) {
1364 			count++;
1365 		}
1366 	}
1367 	return count;
1368 }
1369 
1370 /*
1371  * Updates the footprint telemetry for procs that have received notifications.
1372  */
1373 static void
update_footprints_for_telemetry(void * arg0 __unused,void * arg1 __unused)1374 update_footprints_for_telemetry(void* arg0 __unused, void* arg1 __unused)
1375 {
1376 	uint64_t curr_ts = mach_absolute_time(), next_telemetry_update = UINT64_MAX;
1377 	struct knote *kn;
1378 
1379 	memorystatus_klist_lock();
1380 	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1381 		proc_t            p = PROC_NULL;
1382 		struct task*      t = TASK_NULL;
1383 		uint64_t telemetry_update;
1384 
1385 		p = proc_ref(knote_get_kq(kn)->kq_p, false);
1386 		if (p == PROC_NULL) {
1387 			continue;
1388 		}
1389 		t = (struct task *)(proc_task(p));
1390 		proc_rele(p);
1391 		p = PROC_NULL;
1392 		telemetry_update = update_knote_footprint_history(kn, t, curr_ts);
1393 		next_telemetry_update = MIN(next_telemetry_update, telemetry_update);
1394 	}
1395 	memorystatus_klist_unlock();
1396 	if (next_telemetry_update != UINT64_MAX) {
1397 		uint64_t next_update_seconds;
1398 		absolutetime_to_nanoseconds(next_telemetry_update, &next_update_seconds);
1399 		next_update_seconds /= NSEC_PER_SEC;
1400 		thread_call_enter_delayed(memorystatus_notify_update_telemetry_thread_call, next_telemetry_update);
1401 	}
1402 }
1403 
1404 kern_return_t
memorystatus_update_vm_pressure(boolean_t target_foreground_process)1405 memorystatus_update_vm_pressure(boolean_t target_foreground_process)
1406 {
1407 	struct knote            *kn_max = NULL;
1408 	struct knote            *kn_cur = NULL, *kn_temp = NULL;/* for safe list traversal */
1409 	pid_t                target_pid = -1;
1410 	struct klist            dispatch_klist = { NULL };
1411 	proc_t                target_proc = PROC_NULL;
1412 	struct task            *task = NULL;
1413 	boolean_t            found_candidate = FALSE;
1414 
1415 	static vm_pressure_level_t     level_snapshot = kVMPressureNormal;
1416 	static vm_pressure_level_t    prev_level_snapshot = kVMPressureNormal;
1417 	boolean_t            smoothing_window_started = FALSE;
1418 	struct timeval            smoothing_window_start_tstamp = {0, 0};
1419 	struct timeval            curr_tstamp = {0, 0};
1420 	int64_t              elapsed_msecs = 0;
1421 	uint64_t             curr_ts = mach_absolute_time(), next_telemetry_update = UINT64_MAX;
1422 
1423 
1424 	uint64_t logging_now;
1425 	absolutetime_to_nanoseconds(curr_ts, &logging_now);
1426 #if !CONFIG_JETSAM
1427 #define MAX_IDLE_KILLS 100    /* limit the number of idle kills allowed */
1428 
1429 	int    idle_kill_counter = 0;
1430 
1431 	/*
1432 	 * On desktop we take this opportunity to free up memory pressure
1433 	 * by immediately killing idle exitable processes. We use a delay
1434 	 * to avoid overkill.  And we impose a max counter as a fail safe
1435 	 * in case daemons re-launch too fast.
1436 	 */
1437 	while ((memorystatus_vm_pressure_level != kVMPressureNormal) && (idle_kill_counter < MAX_IDLE_KILLS)) {
1438 		if (memorystatus_idle_exit_from_VM() == FALSE) {
1439 			/* No idle exitable processes left to kill */
1440 			break;
1441 		}
1442 		idle_kill_counter++;
1443 
1444 		if (memorystatus_manual_testing_on == TRUE) {
1445 			/*
1446 			 * Skip the delay when testing
1447 			 * the pressure notification scheme.
1448 			 */
1449 		} else {
1450 			delay(1000000); /* 1 second */
1451 		}
1452 	}
1453 #endif /* !CONFIG_JETSAM */
1454 
1455 	if (level_snapshot != kVMPressureNormal) {
1456 		/*
1457 		 * Check to see if we are still in the 'resting' period
1458 		 * after having notified all clients interested in
1459 		 * a particular pressure level.
1460 		 */
1461 
1462 		level_snapshot = memorystatus_vm_pressure_level;
1463 
1464 		if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
1465 			if (next_warning_notification_sent_at_ts) {
1466 				if (curr_ts < next_warning_notification_sent_at_ts) {
1467 					delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
1468 					return KERN_SUCCESS;
1469 				}
1470 
1471 				next_warning_notification_sent_at_ts = 0;
1472 				memorystatus_klist_reset_all_for_level(kVMPressureWarning);
1473 			}
1474 		} else if (level_snapshot == kVMPressureCritical) {
1475 			if (next_critical_notification_sent_at_ts) {
1476 				if (curr_ts < next_critical_notification_sent_at_ts) {
1477 					delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
1478 					return KERN_SUCCESS;
1479 				}
1480 				next_critical_notification_sent_at_ts = 0;
1481 				memorystatus_klist_reset_all_for_level(kVMPressureCritical);
1482 			}
1483 		}
1484 	}
1485 
1486 #if CONFIG_JETSAM
1487 	if (memorystatus_vm_pressure_level == kVMPressureNormal && prev_level_snapshot != kVMPressureNormal) {
1488 		if (memorystatus_should_kill_on_sustained_pressure) {
1489 			os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: Pressure has returned to level %d. Cancelling scheduled jetsam", memorystatus_vm_pressure_level);
1490 			thread_call_cancel(sustained_pressure_handler_thread_call);
1491 		}
1492 	} else if (memorystatus_should_kill_on_sustained_pressure && memorystatus_vm_pressure_level != kVMPressureNormal && prev_level_snapshot == kVMPressureNormal) {
1493 		/*
1494 		 * Pressure has increased from normal.
1495 		 * Hopefully the notifications will relieve it,
1496 		 * but as a fail-safe we'll trigger jetsam
1497 		 * after a configurable amount of time.
1498 		 */
1499 		os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: Pressure level has increased from %d to %d. Scheduling jetsam.", prev_level_snapshot, memorystatus_vm_pressure_level);
1500 		uint64_t kill_time;
1501 		nanoseconds_to_absolutetime(memorystatus_kill_on_sustained_pressure_window_s * NSEC_PER_SEC, &kill_time);
1502 		kill_time += mach_absolute_time();
1503 		thread_call_enter_delayed(sustained_pressure_handler_thread_call, kill_time);
1504 	}
1505 #endif /* CONFIG_JETSAM */
1506 
1507 	while (1) {
1508 		/*
1509 		 * There is a race window here. But it's not clear
1510 		 * how much we benefit from having extra synchronization.
1511 		 */
1512 		level_snapshot = memorystatus_vm_pressure_level;
1513 
1514 		if (prev_level_snapshot > level_snapshot) {
1515 			/*
1516 			 * Pressure decreased? Let's take a little breather
1517 			 * and see if this condition stays.
1518 			 */
1519 			if (smoothing_window_started == FALSE) {
1520 				smoothing_window_started = TRUE;
1521 				microuptime(&smoothing_window_start_tstamp);
1522 			}
1523 
1524 			microuptime(&curr_tstamp);
1525 			timevalsub(&curr_tstamp, &smoothing_window_start_tstamp);
1526 			elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000;
1527 
1528 			if (elapsed_msecs < VM_PRESSURE_DECREASED_SMOOTHING_PERIOD) {
1529 				delay(INTER_NOTIFICATION_DELAY);
1530 				continue;
1531 			}
1532 		}
1533 		if (level_snapshot == kVMPressureNormal) {
1534 			memorystatus_pressure_telemetry_send();
1535 		}
1536 		prev_level_snapshot = level_snapshot;
1537 		smoothing_window_started = FALSE;
1538 		memorystatus_klist_lock();
1539 
1540 		if (level_snapshot > memorystatus_pressure_interval_telemetry.max_level) {
1541 			memorystatus_pressure_interval_telemetry.num_processes_registered = memorystatus_klist_length(level_snapshot);
1542 			memorystatus_pressure_interval_telemetry.max_level = level_snapshot;
1543 			memorystatus_pressure_interval_telemetry.num_transitions++;
1544 			if (memorystatus_pressure_interval_telemetry.duration == 0) {
1545 				/* Set the start timestamp. Duration will be finalized when we send the event. */
1546 				memorystatus_pressure_interval_telemetry.duration = curr_ts;
1547 			}
1548 		}
1549 
1550 		kn_max = vm_pressure_select_optimal_candidate_to_notify(&memorystatus_klist, level_snapshot, target_foreground_process, &next_telemetry_update);
1551 
1552 		if (kn_max == NULL) {
1553 			memorystatus_klist_unlock();
1554 
1555 			/*
1556 			 * No more level-based clients to notify.
1557 			 *
1558 			 * Start the 'resting' window within which clients will not be re-notified.
1559 			 */
1560 
1561 			if (level_snapshot != kVMPressureNormal) {
1562 				if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
1563 					nanoseconds_to_absolutetime(WARNING_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);
1564 
1565 					/* Next warning notification (if nothing changes) won't be sent before...*/
1566 					next_warning_notification_sent_at_ts = mach_absolute_time() + curr_ts;
1567 				}
1568 
1569 				if (level_snapshot == kVMPressureCritical) {
1570 					nanoseconds_to_absolutetime(CRITICAL_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);
1571 
1572 					/* Next critical notification (if nothing changes) won't be sent before...*/
1573 					next_critical_notification_sent_at_ts = mach_absolute_time() + curr_ts;
1574 				}
1575 			}
1576 			absolutetime_to_nanoseconds(mach_absolute_time(), &logging_now);
1577 			if (next_telemetry_update != UINT64_MAX) {
1578 				thread_call_enter_delayed(memorystatus_notify_update_telemetry_thread_call, next_telemetry_update);
1579 			} else {
1580 				thread_call_cancel(memorystatus_notify_update_telemetry_thread_call);
1581 			}
1582 			return KERN_FAILURE;
1583 		}
1584 
1585 		target_proc = proc_ref(knote_get_kq(kn_max)->kq_p, false);
1586 		if (target_proc == PROC_NULL) {
1587 			memorystatus_klist_unlock();
1588 			continue;
1589 		}
1590 
1591 		target_pid = proc_getpid(target_proc);
1592 
1593 		task = (struct task *)(proc_task(target_proc));
1594 
1595 		if (level_snapshot != kVMPressureNormal) {
1596 			if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
1597 				if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_WARN, task, 0, kVMPressureWarning) == TRUE) {
1598 					found_candidate = TRUE;
1599 				}
1600 			} else {
1601 				if (level_snapshot == kVMPressureCritical) {
1602 					if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, task, 0, kVMPressureCritical) == TRUE) {
1603 						found_candidate = TRUE;
1604 					}
1605 				}
1606 			}
1607 		} else {
1608 			if (kn_max->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
1609 				task_clear_has_been_notified(task, kVMPressureWarning);
1610 				task_clear_has_been_notified(task, kVMPressureCritical);
1611 
1612 				found_candidate = TRUE;
1613 			}
1614 		}
1615 
1616 		if (found_candidate == FALSE) {
1617 			proc_rele(target_proc);
1618 			memorystatus_klist_unlock();
1619 			continue;
1620 		}
1621 
1622 		SLIST_FOREACH_SAFE(kn_cur, &memorystatus_klist, kn_selnext, kn_temp) {
1623 			int knote_pressure_level = convert_internal_pressure_level_to_dispatch_level(level_snapshot);
1624 
1625 			if (is_knote_registered_modify_task_pressure_bits(kn_cur, knote_pressure_level, task, 0, level_snapshot) == TRUE) {
1626 				proc_t knote_proc = knote_get_kq(kn_cur)->kq_p;
1627 				pid_t knote_pid = proc_getpid(knote_proc);
1628 				if (knote_pid == target_pid) {
1629 					KNOTE_DETACH(&memorystatus_klist, kn_cur);
1630 					KNOTE_ATTACH(&dispatch_klist, kn_cur);
1631 				}
1632 			}
1633 		}
1634 		if (level_snapshot != kVMPressureNormal) {
1635 			mark_knote_send_time(kn_max, task, convert_internal_pressure_level_to_dispatch_level(level_snapshot),
1636 			    (uint16_t) MIN(UINT16_MAX, memorystatus_pressure_interval_telemetry.num_notifications_sent));
1637 			memorystatus_pressure_interval_telemetry.num_notifications_sent++;
1638 		}
1639 
1640 		KNOTE(&dispatch_klist, (level_snapshot != kVMPressureNormal) ? kMemorystatusPressure : kMemorystatusNoPressure);
1641 
1642 		SLIST_FOREACH_SAFE(kn_cur, &dispatch_klist, kn_selnext, kn_temp) {
1643 			KNOTE_DETACH(&dispatch_klist, kn_cur);
1644 			KNOTE_ATTACH(&memorystatus_klist, kn_cur);
1645 		}
1646 
1647 		memorystatus_klist_unlock();
1648 
1649 		microuptime(&target_proc->vm_pressure_last_notify_tstamp);
1650 		proc_rele(target_proc);
1651 
1652 		if (memorystatus_manual_testing_on == TRUE && target_foreground_process == TRUE) {
1653 			break;
1654 		}
1655 
1656 		if (memorystatus_manual_testing_on == TRUE) {
1657 			/*
1658 			 * Testing out the pressure notification scheme.
1659 			 * No need for delays etc.
1660 			 */
1661 		} else {
1662 			uint32_t sleep_interval = INTER_NOTIFICATION_DELAY;
1663 #if CONFIG_JETSAM
1664 			unsigned int page_delta = 0;
1665 			unsigned int skip_delay_page_threshold = 0;
1666 
1667 			assert(memorystatus_available_pages_pressure >= memorystatus_available_pages_critical_base);
1668 
1669 			page_delta = (memorystatus_available_pages_pressure - memorystatus_available_pages_critical_base) / 2;
1670 			skip_delay_page_threshold = memorystatus_available_pages_pressure - page_delta;
1671 
1672 			if (memorystatus_available_pages <= skip_delay_page_threshold) {
1673 				/*
1674 				 * We are nearing the critcal mark fast and can't afford to wait between
1675 				 * notifications.
1676 				 */
1677 				sleep_interval = 0;
1678 			}
1679 #endif /* CONFIG_JETSAM */
1680 
1681 			if (sleep_interval) {
1682 				delay(sleep_interval);
1683 			}
1684 		}
1685 	}
1686 
1687 	return KERN_SUCCESS;
1688 }
1689 
1690 static uint32_t
convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level)1691 convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level)
1692 {
1693 	uint32_t dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
1694 
1695 	switch (internal_pressure_level) {
1696 	case kVMPressureNormal:
1697 	{
1698 		dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
1699 		break;
1700 	}
1701 
1702 	case kVMPressureWarning:
1703 	case kVMPressureUrgent:
1704 	{
1705 		dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_WARN;
1706 		break;
1707 	}
1708 
1709 	case kVMPressureCritical:
1710 	{
1711 		dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
1712 		break;
1713 	}
1714 
1715 	default:
1716 		break;
1717 	}
1718 
1719 	return dispatch_level;
1720 }
1721 
1722 /*
1723  * Notify any kexts that are waiting for notification that jetsam
1724  * is approaching the foreground bands. They should use this notification
1725  * to free cached memory.
1726  */
1727 void
memorystatus_issue_fg_band_notify(void)1728 memorystatus_issue_fg_band_notify(void)
1729 {
1730 	uint64_t now;
1731 
1732 	lck_mtx_lock(&memorystatus_jetsam_fg_band_lock);
1733 	absolutetime_to_nanoseconds(mach_absolute_time(), &now);
1734 	if (now - memorystatus_jetsam_fg_band_timestamp_ns < memorystatus_jetsam_fg_band_delay_ns) {
1735 		lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
1736 		return;
1737 	}
1738 
1739 	if (memorystatus_jetsam_fg_band_waiters > 0) {
1740 		thread_wakeup(&memorystatus_jetsam_fg_band_waiters);
1741 		memorystatus_jetsam_fg_band_waiters = 0;
1742 		memorystatus_jetsam_fg_band_timestamp_ns = now;
1743 	}
1744 	lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
1745 
1746 	/* Notify the buffer cache, file systems, etc. to jetison everything they can. */
1747 	if (consider_buffer_cache_collect != NULL) {
1748 		(void)(*consider_buffer_cache_collect)(1);
1749 	}
1750 }
1751 
1752 
1753 /*
1754  * Memorystatus notification debugging support
1755  */
1756 
1757 static int
1758 sysctl_memorystatus_vm_pressure_level SYSCTL_HANDLER_ARGS
1759 {
1760 #pragma unused(arg1, arg2, oidp)
1761 #if !XNU_TARGET_OS_OSX
1762 	int error = 0;
1763 
1764 	error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
1765 	if (error) {
1766 		return error;
1767 	}
1768 
1769 #endif /* !XNU_TARGET_OS_OSX */
1770 	uint32_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(memorystatus_vm_pressure_level);
1771 
1772 	return SYSCTL_OUT(req, &dispatch_level, sizeof(dispatch_level));
1773 }
1774 
1775 #if DEBUG || DEVELOPMENT
1776 
1777 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
1778     0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");
1779 
1780 #else /* DEBUG || DEVELOPMENT */
1781 
1782 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
1783     0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");
1784 
1785 #endif /* DEBUG || DEVELOPMENT */
1786 
1787 /*
1788  * Trigger levels to test the mechanism.
1789  * Can be used via a sysctl.
1790  */
1791 #define TEST_LOW_MEMORY_TRIGGER_ONE        1
1792 #define TEST_LOW_MEMORY_TRIGGER_ALL        2
1793 #define TEST_PURGEABLE_TRIGGER_ONE        3
1794 #define TEST_PURGEABLE_TRIGGER_ALL        4
1795 #define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE    5
1796 #define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL    6
1797 
1798 static int
1799 sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS
1800 {
1801 #pragma unused(arg1, arg2)
1802 
1803 	int level = 0;
1804 	int error = 0;
1805 	int pressure_level = 0;
1806 	int trigger_request = 0;
1807 	int force_purge;
1808 
1809 	error = sysctl_handle_int(oidp, &level, 0, req);
1810 	if (error || !req->newptr) {
1811 		return error;
1812 	}
1813 
1814 	memorystatus_manual_testing_on = TRUE;
1815 
1816 	trigger_request = (level >> 16) & 0xFFFF;
1817 	pressure_level = (level & 0xFFFF);
1818 
1819 	if (trigger_request < TEST_LOW_MEMORY_TRIGGER_ONE ||
1820 	    trigger_request > TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL) {
1821 		return EINVAL;
1822 	}
1823 	switch (pressure_level) {
1824 	case NOTE_MEMORYSTATUS_PRESSURE_NORMAL:
1825 	case NOTE_MEMORYSTATUS_PRESSURE_WARN:
1826 	case NOTE_MEMORYSTATUS_PRESSURE_CRITICAL:
1827 		break;
1828 	default:
1829 		return EINVAL;
1830 	}
1831 
1832 	/*
1833 	 * The pressure level is being set from user-space.
1834 	 * And user-space uses the constants in sys/event.h
1835 	 * So we translate those events to our internal levels here.
1836 	 */
1837 	if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
1838 		memorystatus_manual_testing_level = kVMPressureNormal;
1839 		force_purge = 0;
1840 	} else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN) {
1841 		memorystatus_manual_testing_level = kVMPressureWarning;
1842 		force_purge = vm_pageout_state.memorystatus_purge_on_warning;
1843 	} else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
1844 		memorystatus_manual_testing_level = kVMPressureCritical;
1845 		force_purge = vm_pageout_state.memorystatus_purge_on_critical;
1846 	}
1847 
1848 	memorystatus_vm_pressure_level = memorystatus_manual_testing_level;
1849 
1850 	/* purge according to the new pressure level */
1851 	switch (trigger_request) {
1852 	case TEST_PURGEABLE_TRIGGER_ONE:
1853 	case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE:
1854 		if (force_purge == 0) {
1855 			/* no purging requested */
1856 			break;
1857 		}
1858 		vm_purgeable_object_purge_one_unlocked(force_purge);
1859 		break;
1860 	case TEST_PURGEABLE_TRIGGER_ALL:
1861 	case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL:
1862 		if (force_purge == 0) {
1863 			/* no purging requested */
1864 			break;
1865 		}
1866 		while (vm_purgeable_object_purge_one_unlocked(force_purge)) {
1867 			;
1868 		}
1869 		break;
1870 	}
1871 
1872 	if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ONE) ||
1873 	    (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE)) {
1874 		memorystatus_update_vm_pressure(TRUE);
1875 	}
1876 
1877 	if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ALL) ||
1878 	    (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL)) {
1879 		while (memorystatus_update_vm_pressure(FALSE) == KERN_SUCCESS) {
1880 			continue;
1881 		}
1882 	}
1883 
1884 	if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
1885 		memorystatus_manual_testing_on = FALSE;
1886 	}
1887 
1888 	return 0;
1889 }
1890 
1891 SYSCTL_PROC(_kern, OID_AUTO, memorypressure_manual_trigger, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
1892     0, 0, &sysctl_memorypressure_manual_trigger, "I", "");
1893 
1894 
1895 SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_warning, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_warning, 0, "");
1896 SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_urgent, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_urgent, 0, "");
1897 SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_critical, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_critical, 0, "");
1898 
1899 extern int vm_pressure_level_transition_threshold;
1900 SYSCTL_INT(_kern, OID_AUTO, vm_pressure_level_transition_threshold, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_level_transition_threshold, 0, "");
1901 
1902 #if DEBUG || DEVELOPMENT
1903 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_events_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_events_enabled, 0, "");
1904 
#if 0
#if CONFIG_JETSAM && VM_PRESSURE_EVENTS
/*
 * Compiled out (#if 0).
 * Fires every knote on memorystatus_klist, under the klist lock, with
 * kMemorystatusPressure when 'pressured' is true and
 * kMemorystatusNoPressure otherwise. Always returns TRUE.
 */
static boolean_t
memorystatus_issue_pressure_kevent(boolean_t pressured)
{
	memorystatus_klist_lock();
	if (pressured) {
		KNOTE(&memorystatus_klist, kMemorystatusPressure);
	} else {
		KNOTE(&memorystatus_klist, kMemorystatusNoPressure);
	}
	memorystatus_klist_unlock();

	return TRUE;
}
#endif /* CONFIG_JETSAM && VM_PRESSURE_EVENTS */
#endif /* 0 */
1917 
1918 /*
1919  * This routine is used for targeted notifications regardless of system memory pressure
1920  * and regardless of whether or not the process has already been notified.
1921  * It bypasses and has no effect on the only-one-notification per soft-limit policy.
1922  *
1923  * "memnote" is the current user.
1924  */
1925 
1926 static int
1927 sysctl_memorystatus_vm_pressure_send SYSCTL_HANDLER_ARGS
1928 {
1929 #pragma unused(arg1, arg2)
1930 	/* Need to be root or have memorystatus entitlement */
1931 	if (!kauth_cred_issuser(kauth_cred_get()) && !IOCurrentTaskHasEntitlement(MEMORYSTATUS_ENTITLEMENT)) {
1932 		return EPERM;
1933 	}
1934 
1935 	int error = 0, pid = 0;
1936 	struct knote *kn = NULL;
1937 	boolean_t found_knote = FALSE;
1938 	int fflags = 0;    /* filter flags for EVFILT_MEMORYSTATUS */
1939 	uint64_t value = 0;
1940 
1941 	error = sysctl_handle_quad(oidp, &value, 0, req);
1942 	if (error || !req->newptr) {
1943 		return error;
1944 	}
1945 
1946 	/*
1947 	 * Find the pid in the low 32 bits of value passed in.
1948 	 */
1949 	pid = (int)(value & 0xFFFFFFFF);
1950 
1951 	/*
1952 	 * Find notification in the high 32 bits of the value passed in.
1953 	 */
1954 	fflags = (int)((value >> 32) & 0xFFFFFFFF);
1955 
1956 	/*
1957 	 * For backwards compatibility, when no notification is
1958 	 * passed in, default to the NOTE_MEMORYSTATUS_PRESSURE_WARN
1959 	 */
1960 	if (fflags == 0) {
1961 		fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
1962 		// printf("memorystatus_vm_pressure_send: using default notification [0x%x]\n", fflags);
1963 	}
1964 
1965 	/* wake up everybody waiting for kVMPressureJetsam */
1966 	if (fflags == NOTE_MEMORYSTATUS_JETSAM_FG_BAND) {
1967 		memorystatus_issue_fg_band_notify();
1968 		return error;
1969 	}
1970 
1971 	/*
1972 	 * See event.h ... fflags for EVFILT_MEMORYSTATUS
1973 	 */
1974 	if (!((fflags == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) ||
1975 	    (fflags == NOTE_MEMORYSTATUS_PRESSURE_WARN) ||
1976 	    (fflags == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) ||
1977 	    (fflags == NOTE_MEMORYSTATUS_LOW_SWAP) ||
1978 	    (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) ||
1979 	    (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) ||
1980 	    (((fflags & NOTE_MEMORYSTATUS_MSL_STATUS) != 0 &&
1981 	    ((fflags & ~NOTE_MEMORYSTATUS_MSL_STATUS) == 0))))) {
1982 		printf("memorystatus_vm_pressure_send: notification [0x%x] not supported \n", fflags);
1983 		error = 1;
1984 		return error;
1985 	}
1986 
1987 	/*
1988 	 * Forcibly send pid a memorystatus notification.
1989 	 */
1990 
1991 	memorystatus_klist_lock();
1992 
1993 	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1994 		proc_t knote_proc = knote_get_kq(kn)->kq_p;
1995 		pid_t knote_pid = proc_getpid(knote_proc);
1996 
1997 		if (knote_pid == pid) {
1998 			/*
1999 			 * Forcibly send this pid a memorystatus notification.
2000 			 */
2001 			kn->kn_fflags = fflags;
2002 			found_knote = TRUE;
2003 		}
2004 	}
2005 
2006 	if (found_knote) {
2007 		KNOTE(&memorystatus_klist, 0);
2008 		printf("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] sent to process [%d] \n", value, fflags, pid);
2009 		error = 0;
2010 	} else {
2011 		printf("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] not sent to process [%d] (none registered?)\n", value, fflags, pid);
2012 		error = 1;
2013 	}
2014 
2015 	memorystatus_klist_unlock();
2016 
2017 	return error;
2018 }
2019 
2020 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_send, CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_ANYBODY,
2021     0, 0, &sysctl_memorystatus_vm_pressure_send, "Q", "");
2022 
2023 #endif /* DEBUG || DEVELOPMENT */
2024 
2025 #endif /* VM_PRESSURE_EVENTS */
2026