xref: /xnu-11417.121.6/bsd/kern/kern_memorystatus_notify.c (revision a1e26a70f38d1d7daa7b49b258e2f8538ad81650)
1 /*
2  * Copyright (c) 2006-2018 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  *
28  */
29 
30 #include <sys/kern_event.h>
31 #include <kern/sched_prim.h>
32 #include <kern/assert.h>
33 #include <kern/debug.h>
34 #include <kern/locks.h>
35 #include <kern/task.h>
36 #include <kern/thread.h>
37 #include <kern/thread_call.h>
38 #include <kern/host.h>
39 #include <kern/policy_internal.h>
40 #include <kern/thread_group.h>
41 
42 #include <IOKit/IOBSD.h>
43 
44 #include <libkern/libkern.h>
45 #include <libkern/coreanalytics/coreanalytics.h>
46 #include <mach/coalition.h>
47 #include <mach/clock_types.h>
48 #include <mach/mach_time.h>
49 #include <mach/task.h>
50 #include <mach/host_priv.h>
51 #include <mach/mach_host.h>
52 #include <os/log.h>
53 #include <pexpert/pexpert.h>
54 #include <sys/coalition.h>
55 #include <sys/kern_event.h>
56 #include <sys/proc.h>
57 #include <sys/proc_info.h>
58 #include <sys/reason.h>
59 #include <sys/signal.h>
60 #include <sys/signalvar.h>
61 #include <sys/sysctl.h>
62 #include <sys/sysproto.h>
63 #include <sys/time.h>
64 #include <sys/wait.h>
65 #include <sys/tree.h>
66 #include <sys/priv.h>
67 #include <vm/vm_pageout_xnu.h>
68 #include <vm/vm_protos.h>
69 #include <vm/vm_purgeable_xnu.h>
70 #include <mach/machine/sdt.h>
71 #include <libkern/section_keywords.h>
72 #include <stdatomic.h>
73 
74 #if CONFIG_FREEZE
75 #include <vm/vm_map.h>
76 #endif /* CONFIG_FREEZE */
77 
78 #include <kern/kern_memorystatus_internal.h>
79 #include <sys/kern_memorystatus.h>
80 #include <sys/kern_memorystatus_notify.h>
81 #include <sys/kern_memorystatus_xnu.h>
82 
83 /*
84  * Memorystatus klist structures
85  */
86 struct klist memorystatus_klist;
87 static lck_mtx_t memorystatus_klist_mutex;
88 static void memorystatus_klist_lock(void);
89 static void memorystatus_klist_unlock(void);
90 
91 /*
92  * Memorystatus kevent filter routines
93  */
94 static int filt_memorystatusattach(struct knote *kn, struct kevent_qos_s *kev);
95 static void filt_memorystatusdetach(struct knote *kn);
96 static int filt_memorystatus(struct knote *kn, long hint);
97 static int filt_memorystatustouch(struct knote *kn, struct kevent_qos_s *kev);
98 static int filt_memorystatusprocess(struct knote *kn, struct kevent_qos_s *kev);
99 
/*
 * Filter operations vector for EVFILT_MEMORYSTATUS kevents.
 * Placed in read-only memory early in boot; each hook dispatches to the
 * corresponding filt_memorystatus* routine declared above.
 */
SECURITY_READ_ONLY_EARLY(struct filterops) memorystatus_filtops = {
	.f_attach = filt_memorystatusattach,
	.f_detach = filt_memorystatusdetach,
	.f_event = filt_memorystatus,
	.f_touch = filt_memorystatustouch,
	.f_process = filt_memorystatusprocess,
};
107 
108 /*
109  * Memorystatus notification events
110  */
/* Hint values passed to filt_memorystatus() via KNOTE(). */
enum {
	kMemorystatusNoPressure = 0x1,        /* pressure returned to normal */
	kMemorystatusPressure = 0x2,          /* warn/critical; level read from memorystatus_vm_pressure_level */
	kMemorystatusLowSwap = 0x4,           /* swap space is running low */
	kMemorystatusProcLimitWarn = 0x8,     /* a process is approaching its memory limit */
	kMemorystatusProcLimitCritical = 0x10 /* a process has exceeded its (soft) memory limit */
};
118 
119 #define INTER_NOTIFICATION_DELAY    (250000)    /* .25 second */
120 #define VM_PRESSURE_DECREASED_SMOOTHING_PERIOD        5000    /* milliseconds */
121 #define WARNING_NOTIFICATION_RESTING_PERIOD        25    /* seconds */
122 #define CRITICAL_NOTIFICATION_RESTING_PERIOD        25    /* seconds */
123 
124 /*
125  * Memorystatus notification helper routines
126  */
127 static vm_pressure_level_t convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);
128 static boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t);
129 static void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear);
130 static struct knote *vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process, uint64_t *next_telemetry_update);
131 static void vm_dispatch_memory_pressure(void);
132 kern_return_t memorystatus_update_vm_pressure(boolean_t target_foreground_process);
133 
134 #if VM_PRESSURE_EVENTS
135 
136 /*
137  * This value is the threshold that a process must meet to be considered for scavenging.
138  */
139 #if XNU_TARGET_OS_OSX
140 #define VM_PRESSURE_MINIMUM_RSIZE        10    /* MB */
141 #else /* XNU_TARGET_OS_OSX */
142 #define VM_PRESSURE_MINIMUM_RSIZE        6    /* MB */
143 #endif /* XNU_TARGET_OS_OSX */
144 
145 static uint32_t vm_pressure_task_footprint_min = VM_PRESSURE_MINIMUM_RSIZE;
146 
147 #if DEVELOPMENT || DEBUG
148 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_task_footprint_min, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_task_footprint_min, 0, "");
149 #endif /* DEVELOPMENT || DEBUG */
150 
151 vm_pressure_level_t memorystatus_vm_pressure_level = kVMPressureNormal;
152 
153 /*
154  * We use this flag to signal if we have any HWM offenders
155  * on the system. This way we can reduce the number of wakeups
156  * of the memorystatus_thread when the system is between the
157  * "pressure" and "critical" threshold.
158  *
159  * The (re-)setting of this variable is done without any locks
160  * or synchronization simply because it is not possible (currently)
161  * to keep track of HWM offenders that drop down below their memory
162  * limit and/or exit. So, we choose to burn a couple of wasted wakeups
163  * by allowing the unguarded modification of this variable.
164  *
165  * TODO: this should be a count of number of hwm candidates
166  */
167 _Atomic bool memorystatus_hwm_candidates = false;
168 
169 #endif /* VM_PRESSURE_EVENTS */
170 
171 uint32_t memorystatus_jetsam_fg_band_waiters = 0;
172 uint32_t memorystatus_jetsam_bg_band_waiters = 0;
173 static uint64_t memorystatus_jetsam_fg_band_timestamp_ns = 0; /* nanosec */
174 static uint64_t memorystatus_jetsam_bg_band_timestamp_ns = 0; /* nanosec */
175 static uint64_t memorystatus_jetsam_notification_delay_ns = 5ull * 1000 * 1000 * 1000; /* nanosec */
176 
177 #if DEVELOPMENT || DEBUG
178 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_jetsam_notification_delay_ns, CTLFLAG_RW | CTLFLAG_LOCKED,
179     &memorystatus_jetsam_notification_delay_ns, "");
180 #endif
181 
182 static int
filt_memorystatusattach(struct knote * kn,__unused struct kevent_qos_s * kev)183 filt_memorystatusattach(struct knote *kn, __unused struct kevent_qos_s *kev)
184 {
185 	int error;
186 
187 	kn->kn_flags |= EV_CLEAR; /* automatically set */
188 	kn->kn_sdata = 0;         /* incoming data is ignored */
189 	memset(&kn->kn_ext, 0, sizeof(kn->kn_ext));
190 
191 	error = memorystatus_knote_register(kn);
192 	if (error) {
193 		knote_set_error(kn, error);
194 	}
195 	return 0;
196 }
197 
/*
 * kevent detach routine: drop this knote from the global memorystatus klist.
 */
static void
filt_memorystatusdetach(struct knote *kn)
{
	memorystatus_knote_unregister(kn);
}
203 
204 static int
filt_memorystatus(struct knote * kn __unused,long hint)205 filt_memorystatus(struct knote *kn __unused, long hint)
206 {
207 	if (hint) {
208 		switch (hint) {
209 		case kMemorystatusNoPressure:
210 			if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
211 				kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
212 			}
213 			break;
214 		case kMemorystatusPressure:
215 			if (memorystatus_vm_pressure_level == kVMPressureWarning || memorystatus_vm_pressure_level == kVMPressureUrgent) {
216 				if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
217 					kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
218 				}
219 			} else if (memorystatus_vm_pressure_level == kVMPressureCritical) {
220 				if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
221 					kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
222 				}
223 			}
224 			break;
225 		case kMemorystatusLowSwap:
226 			if (kn->kn_sfflags & NOTE_MEMORYSTATUS_LOW_SWAP) {
227 				kn->kn_fflags = NOTE_MEMORYSTATUS_LOW_SWAP;
228 			}
229 			break;
230 
231 		case kMemorystatusProcLimitWarn:
232 			if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
233 				kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
234 			}
235 			break;
236 
237 		case kMemorystatusProcLimitCritical:
238 			if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
239 				kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
240 			}
241 			break;
242 
243 		default:
244 			break;
245 		}
246 	}
247 
248 #if 0
249 	if (kn->kn_fflags != 0) {
250 		proc_t knote_proc = knote_get_kq(kn)->kq_p;
251 		pid_t knote_pid = proc_getpid(knote_proc);
252 
253 		printf("filt_memorystatus: sending kn 0x%lx (event 0x%x) for pid (%d)\n",
254 		    (unsigned long)kn, kn->kn_fflags, knote_pid);
255 	}
256 #endif
257 
258 	return kn->kn_fflags != 0;
259 }
260 
261 static int
filt_memorystatustouch(struct knote * kn,struct kevent_qos_s * kev)262 filt_memorystatustouch(struct knote *kn, struct kevent_qos_s *kev)
263 {
264 	int res;
265 	int prev_kn_sfflags = 0;
266 
267 	memorystatus_klist_lock();
268 
269 	/*
270 	 * copy in new kevent settings
271 	 * (saving the "desired" data and fflags).
272 	 */
273 
274 	prev_kn_sfflags = kn->kn_sfflags;
275 	kn->kn_sfflags = (kev->fflags & EVFILT_MEMORYSTATUS_ALL_MASK);
276 
277 #if XNU_TARGET_OS_OSX
278 	/*
279 	 * Only on desktop do we restrict notifications to
280 	 * one per active/inactive state (soft limits only).
281 	 */
282 	if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
283 		/*
284 		 * Is there previous state to preserve?
285 		 */
286 		if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
287 			/*
288 			 * This knote was previously interested in proc_limit_warn,
289 			 * so yes, preserve previous state.
290 			 */
291 			if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
292 				kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
293 			}
294 			if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
295 				kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
296 			}
297 		} else {
298 			/*
299 			 * This knote was not previously interested in proc_limit_warn,
300 			 * but it is now.  Set both states.
301 			 */
302 			kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
303 			kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
304 		}
305 	}
306 
307 	if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
308 		/*
309 		 * Is there previous state to preserve?
310 		 */
311 		if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
312 			/*
313 			 * This knote was previously interested in proc_limit_critical,
314 			 * so yes, preserve previous state.
315 			 */
316 			if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
317 				kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
318 			}
319 			if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
320 				kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
321 			}
322 		} else {
323 			/*
324 			 * This knote was not previously interested in proc_limit_critical,
325 			 * but it is now.  Set both states.
326 			 */
327 			kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
328 			kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
329 		}
330 	}
331 #endif /* XNU_TARGET_OS_OSX */
332 
333 	/*
334 	 * reset the output flags based on a
335 	 * combination of the old events and
336 	 * the new desired event list.
337 	 */
338 	//kn->kn_fflags &= kn->kn_sfflags;
339 
340 	res = (kn->kn_fflags != 0);
341 
342 	memorystatus_klist_unlock();
343 
344 	return res;
345 }
346 
347 static int
filt_memorystatusprocess(struct knote * kn,struct kevent_qos_s * kev)348 filt_memorystatusprocess(struct knote *kn, struct kevent_qos_s *kev)
349 {
350 	int res = 0;
351 
352 	memorystatus_klist_lock();
353 	if (kn->kn_fflags) {
354 		knote_fill_kevent(kn, kev, 0);
355 		res = 1;
356 	}
357 	memorystatus_klist_unlock();
358 
359 	return res;
360 }
361 
/* Acquire the mutex guarding memorystatus_klist. */
static void
memorystatus_klist_lock(void)
{
	lck_mtx_lock(&memorystatus_klist_mutex);
}
367 
/* Release the mutex guarding memorystatus_klist. */
static void
memorystatus_klist_unlock(void)
{
	lck_mtx_unlock(&memorystatus_klist_mutex);
}
373 
/*
 * One-time initialization of the memorystatus klist and its mutex,
 * using the lock group/attributes supplied by the caller.
 */
void
memorystatus_kevent_init(lck_grp_t *grp, lck_attr_t *attr)
{
	lck_mtx_init(&memorystatus_klist_mutex, grp, attr);
	klist_init(&memorystatus_klist);
}
380 
381 int
memorystatus_knote_register(struct knote * kn)382 memorystatus_knote_register(struct knote *kn)
383 {
384 	int error = 0;
385 
386 	memorystatus_klist_lock();
387 
388 	/*
389 	 * Support only userspace visible flags.
390 	 */
391 	if ((kn->kn_sfflags & EVFILT_MEMORYSTATUS_ALL_MASK) == (unsigned int) kn->kn_sfflags) {
392 #if XNU_TARGET_OS_OSX
393 		if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
394 			kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
395 			kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
396 		}
397 
398 		if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
399 			kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
400 			kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
401 		}
402 #endif /* XNU_TARGET_OS_OSX */
403 
404 		KNOTE_ATTACH(&memorystatus_klist, kn);
405 	} else {
406 		error = ENOTSUP;
407 	}
408 
409 	memorystatus_klist_unlock();
410 
411 	return error;
412 }
413 
414 void
memorystatus_knote_unregister(struct knote * kn __unused)415 memorystatus_knote_unregister(struct knote *kn __unused)
416 {
417 	memorystatus_klist_lock();
418 	KNOTE_DETACH(&memorystatus_klist, kn);
419 	memorystatus_klist_unlock();
420 }
421 
422 #if VM_PRESSURE_EVENTS
423 
424 #if CONFIG_JETSAM
425 
426 static thread_call_t sustained_pressure_handler_thread_call;
427 int memorystatus_should_kill_on_sustained_pressure = 1;
428 /* Count the number of sustained pressure kills we've done since boot. */
429 uint64_t memorystatus_kill_on_sustained_pressure_count = 0;
430 uint64_t memorystatus_kill_on_sustained_pressure_window_s = 60 * 10; /* 10 Minutes */
431 uint64_t memorystatus_kill_on_sustained_pressure_delay_ms = 500; /* .5 seconds */
432 
433 #if DEVELOPMENT || DEBUG
434 SYSCTL_INT(_kern, OID_AUTO, memorystatus_should_kill_on_sustained_pressure, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_should_kill_on_sustained_pressure, 0, "");
435 #endif /* DEVELOPMENT || DEBUG */
436 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_count, "");
437 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_window_s, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_window_s, "");
438 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_delay_ms, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_delay_ms, "");
439 
440 static void sustained_pressure_handler(void*, void*);
441 #endif /* CONFIG_JETSAM */
442 static thread_call_t memorystatus_notify_update_telemetry_thread_call;
443 static void update_footprints_for_telemetry(void*, void*);
444 
445 
446 void
memorystatus_notify_init()447 memorystatus_notify_init()
448 {
449 #if CONFIG_JETSAM
450 	sustained_pressure_handler_thread_call = thread_call_allocate_with_options(sustained_pressure_handler, NULL, THREAD_CALL_PRIORITY_KERNEL_HIGH, THREAD_CALL_OPTIONS_ONCE);
451 #endif /* CONFIG_JETSAM */
452 	memorystatus_notify_update_telemetry_thread_call = thread_call_allocate_with_options(update_footprints_for_telemetry, NULL, THREAD_CALL_PRIORITY_USER, THREAD_CALL_OPTIONS_ONCE);
453 }
454 
455 #if CONFIG_MEMORYSTATUS
456 
457 inline int
memorystatus_send_note(int event_code,void * data,uint32_t data_length)458 memorystatus_send_note(int event_code, void *data, uint32_t data_length)
459 {
460 	int ret;
461 	struct kev_msg ev_msg;
462 
463 	ev_msg.vendor_code    = KEV_VENDOR_APPLE;
464 	ev_msg.kev_class      = KEV_SYSTEM_CLASS;
465 	ev_msg.kev_subclass   = KEV_MEMORYSTATUS_SUBCLASS;
466 
467 	ev_msg.event_code     = event_code;
468 
469 	ev_msg.dv[0].data_length = data_length;
470 	ev_msg.dv[0].data_ptr = data;
471 	ev_msg.dv[1].data_length = 0;
472 
473 	ret = kev_post_msg(&ev_msg);
474 	if (ret) {
475 		memorystatus_log_error("%s: kev_post_msg() failed, err %d\n", __func__, ret);
476 	}
477 
478 	return ret;
479 }
480 
481 boolean_t
memorystatus_warn_process(const proc_t p,__unused boolean_t is_active,__unused boolean_t is_fatal,boolean_t limit_exceeded)482 memorystatus_warn_process(const proc_t p, __unused boolean_t is_active, __unused boolean_t is_fatal, boolean_t limit_exceeded)
483 {
484 	/*
485 	 * This function doesn't take a reference to p or lock it. So it better be the current process.
486 	 */
487 	assert(p == current_proc());
488 	pid_t pid = proc_getpid(p);
489 	boolean_t ret = FALSE;
490 	boolean_t found_knote = FALSE;
491 	struct knote *kn = NULL;
492 	int send_knote_count = 0;
493 	uint32_t platform;
494 	platform = proc_platform(p);
495 
496 	/*
497 	 * See comment in sysctl_memorystatus_vm_pressure_send.
498 	 */
499 
500 	memorystatus_klist_lock();
501 
502 	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
503 		proc_t knote_proc = knote_get_kq(kn)->kq_p;
504 		pid_t knote_pid = proc_getpid(knote_proc);
505 
506 		if (knote_pid == pid) {
507 			/*
508 			 * By setting the "fflags" here, we are forcing
509 			 * a process to deal with the case where it's
510 			 * bumping up into its memory limits. If we don't
511 			 * do this here, we will end up depending on the
512 			 * system pressure snapshot evaluation in
513 			 * filt_memorystatus().
514 			 */
515 
516 			/*
517 			 * The type of notification and the frequency are different between
518 			 * embedded and desktop.
519 			 *
520 			 * Embedded processes register for global pressure notifications
521 			 * (NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) via UIKit
522 			 * (see applicationDidReceiveMemoryWarning in UIKit). We'll warn them here if
523 			 * they are near there memory limit. filt_memorystatus() will warn them based
524 			 * on the system pressure level.
525 			 *
526 			 * On desktop, (NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL)
527 			 * are only expected to fire for system level warnings. Desktop procesess
528 			 * register for NOTE_MEMORYSTATUS_PROC_LIMIT_WARN
529 			 * if they want to be warned when they approach their limit
530 			 * and for NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL to be warned when they
531 			 * exceed their limit.
532 			 *
533 			 * On embedded we continuously warn processes that are approaching their
534 			 * memory limit. However on desktop, we only send one warning while
535 			 * the process is active/inactive if the limit is soft..
536 			 *
537 			 */
538 			if (platform == PLATFORM_MACOS || platform == PLATFORM_MACCATALYST || platform == PLATFORM_DRIVERKIT) {
539 				if (!limit_exceeded) {
540 					if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
541 						found_knote = TRUE;
542 						if (!is_fatal) {
543 							/*
544 							 * Restrict proc_limit_warn notifications when
545 							 * non-fatal (soft) limit is at play.
546 							 */
547 							if (is_active) {
548 								if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
549 									/*
550 									 * Mark this knote for delivery.
551 									 */
552 									kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
553 									/*
554 									 * And suppress it from future notifications.
555 									 */
556 									kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
557 									send_knote_count++;
558 								}
559 							} else {
560 								if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
561 									/*
562 									 * Mark this knote for delivery.
563 									 */
564 									kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
565 									/*
566 									 * And suppress it from future notifications.
567 									 */
568 									kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
569 									send_knote_count++;
570 								}
571 							}
572 						} else {
573 							/*
574 							 * No restriction on proc_limit_warn notifications when
575 							 * fatal (hard) limit is at play.
576 							 */
577 							kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
578 							send_knote_count++;
579 						}
580 					}
581 				} else {
582 					/*
583 					 * Send this notification when a process has exceeded a soft limit,
584 					 */
585 
586 					if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
587 						found_knote = TRUE;
588 						if (!is_fatal) {
589 							/*
590 							 * Restrict critical notifications for soft limits.
591 							 */
592 
593 							if (is_active) {
594 								if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
595 									/*
596 									 * Suppress future proc_limit_critical notifications
597 									 * for the active soft limit.
598 									 */
599 									kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
600 									kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
601 									send_knote_count++;
602 								}
603 							} else {
604 								if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
605 									/*
606 									 * Suppress future proc_limit_critical_notifications
607 									 * for the inactive soft limit.
608 									 */
609 									kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
610 									kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
611 									send_knote_count++;
612 								}
613 							}
614 						} else {
615 							/*
616 							 * We should never be trying to send a critical notification for
617 							 * a hard limit... the process would be killed before it could be
618 							 * received.
619 							 */
620 							panic("Caught sending pid %d a critical warning for a fatal limit.", pid);
621 						}
622 					}
623 				}
624 			} else {
625 				if (!limit_exceeded) {
626 					/*
627 					 * Intentionally set either the unambiguous limit warning,
628 					 * the system-wide critical or the system-wide warning
629 					 * notification bit.
630 					 */
631 
632 					if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
633 						kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
634 						found_knote = TRUE;
635 						send_knote_count++;
636 					} else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
637 						kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
638 						found_knote = TRUE;
639 						send_knote_count++;
640 					} else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
641 						kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
642 						found_knote = TRUE;
643 						send_knote_count++;
644 					}
645 				} else {
646 					/*
647 					 * Send this notification when a process has exceeded a soft limit.
648 					 */
649 					if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
650 						kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
651 						found_knote = TRUE;
652 						send_knote_count++;
653 					}
654 				}
655 			}
656 		}
657 	}
658 
659 	if (found_knote) {
660 		if (send_knote_count > 0) {
661 			KNOTE(&memorystatus_klist, 0);
662 		}
663 		ret = TRUE;
664 	}
665 
666 	memorystatus_klist_unlock();
667 
668 	return ret;
669 }
670 
671 /*
672  * Can only be set by the current task on itself.
673  */
674 int
memorystatus_low_mem_privileged_listener(uint32_t op_flags)675 memorystatus_low_mem_privileged_listener(uint32_t op_flags)
676 {
677 	boolean_t set_privilege = FALSE;
678 	/*
679 	 * Need an entitlement check here?
680 	 */
681 	if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE) {
682 		set_privilege = TRUE;
683 	} else if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE) {
684 		set_privilege = FALSE;
685 	} else {
686 		return EINVAL;
687 	}
688 
689 	return task_low_mem_privileged_listener(current_task(), set_privilege, NULL);
690 }
691 
/*
 * Post a kMemorystatusPressureNote kernel event carrying the pid of the
 * process selected during pressure handling.
 */
int
memorystatus_send_pressure_note(pid_t pid)
{
	memorystatus_log_debug("memorystatus_send_pressure_note(): pid %d\n", pid);
	return memorystatus_send_note(kMemorystatusPressureNote, &pid, sizeof(pid));
}
698 
699 boolean_t
memorystatus_is_foreground_locked(proc_t p)700 memorystatus_is_foreground_locked(proc_t p)
701 {
702 	return (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND) ||
703 	       (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND_SUPPORT);
704 }
705 
706 /*
707  * This is meant for stackshot and kperf -- it does not take the proc_list_lock
708  * to access the p_memstat_dirty field.
709  */
710 void
memorystatus_proc_flags_unsafe(void * v,boolean_t * is_dirty,boolean_t * is_dirty_tracked,boolean_t * allow_idle_exit)711 memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit)
712 {
713 	if (!v) {
714 		*is_dirty = FALSE;
715 		*is_dirty_tracked = FALSE;
716 		*allow_idle_exit = FALSE;
717 	} else {
718 		proc_t p = (proc_t)v;
719 		*is_dirty = (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) != 0;
720 		*is_dirty_tracked = (p->p_memstat_dirty & P_DIRTY_TRACK) != 0;
721 		*allow_idle_exit = (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) != 0;
722 	}
723 }
724 
725 boolean_t
memorystatus_bg_pressure_eligible(proc_t p)726 memorystatus_bg_pressure_eligible(proc_t p)
727 {
728 	boolean_t eligible = FALSE;
729 
730 	proc_list_lock();
731 
732 	memorystatus_log_debug("memorystatus_bg_pressure_eligible: pid %d, state 0x%x\n", proc_getpid(p), p->p_memstat_state);
733 
734 	/* Foreground processes have already been dealt with at this point, so just test for eligibility */
735 	if (!(p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN))) {
736 		eligible = TRUE;
737 	}
738 
739 	if (p->p_memstat_effectivepriority < JETSAM_PRIORITY_BACKGROUND_OPPORTUNISTIC) {
740 		/*
741 		 * IDLE and IDLE_DEFERRED bands contain processes
742 		 * that have dropped memory to be under their inactive
743 		 * memory limits. And so they can't really give back
744 		 * anything.
745 		 */
746 		eligible = FALSE;
747 	}
748 
749 	proc_list_unlock();
750 
751 	return eligible;
752 }
753 
754 void
memorystatus_send_low_swap_note(void)755 memorystatus_send_low_swap_note(void)
756 {
757 	struct knote *kn = NULL;
758 
759 	memorystatus_klist_lock();
760 	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
761 		/* We call is_knote_registered_modify_task_pressure_bits to check if the sfflags for the
762 		 * current note contain NOTE_MEMORYSTATUS_LOW_SWAP. Once we find one note in the memorystatus_klist
763 		 * that has the NOTE_MEMORYSTATUS_LOW_SWAP flags in its sfflags set, we call KNOTE with
764 		 * kMemoryStatusLowSwap as the hint to process and update all knotes on the memorystatus_klist accordingly. */
765 		if (is_knote_registered_modify_task_pressure_bits(kn, NOTE_MEMORYSTATUS_LOW_SWAP, NULL, 0, 0) == TRUE) {
766 			KNOTE(&memorystatus_klist, kMemorystatusLowSwap);
767 			break;
768 		}
769 	}
770 
771 	memorystatus_klist_unlock();
772 }
773 
774 #endif /* CONFIG_MEMORYSTATUS */
775 
776 /*
777  * Notification telemetry
778  */
779 CA_EVENT(memorystatus_pressure_interval,
780     CA_INT, num_processes_registered,
781     CA_INT, num_notifications_sent,
782     CA_INT, max_level,
783     CA_INT, num_transitions,
784     CA_INT, num_kills,
785     CA_INT, duration);
786 static CA_EVENT_TYPE(memorystatus_pressure_interval) memorystatus_pressure_interval_telemetry;
787 
788 CA_EVENT(memorystatus_proc_notification,
789     CA_INT, footprint_before_notification,
790     CA_INT, footprint_1_min_after_first_warning,
791     CA_INT, footprint_5_min_after_first_warning,
792     CA_INT, footprint_20_min_after_first_warning,
793     CA_INT, footprint_1_min_after_first_critical,
794     CA_INT, footprint_5_min_after_first_critical,
795     CA_INT, footprint_20_min_after_first_critical,
796     CA_INT, order_within_list,
797     CA_INT, num_notifications_sent,
798     CA_INT, time_between_warning_and_critical,
799     CA_STATIC_STRING(CA_PROCNAME_LEN), proc_name);
800 
801 /* The send timestamps for the first notifications are stored in the knote's kn_sdata field */
802 #define KNOTE_SEND_TIMESTAMP_WARNING_INDEX 0
803 #define KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX 1
804 
805 /* The footprint history for this task is stored in the knote's kn_ext array. */
/*
 * Per-knote footprint telemetry, overlaid on the knote's kn_ext array
 * (see the cast in mark_knote_send_time()); packed so it fits the four
 * 64-bit extension slots, as checked by the static_assert below.
 * Footprint fields are 0 until the corresponding sample is taken.
 */
struct knote_footprint_history {
	uint32_t kfh_starting_footprint;      /* footprint at first notification (see mark_knote_send_time) */
	uint32_t kfh_footprint_after_warn_1;  /* 1 minute after first warning notification */
	uint32_t kfh_footprint_after_warn_5;  /* 5 minutes after first warning notification */
	uint32_t kfh_footprint_after_warn_20; /* 20 minutes after first warning notification */
	uint32_t kfh_footprint_after_critical_1; /* 1 minute after first critical notification */
	uint32_t kfh_footprint_after_critical_5; /* 5 minutes after first critical notification */
	uint32_t kfh_footprint_after_critical_20; /* 20 minutes after first critical notification */
	uint16_t kfh_num_notifications;       /* total notifications sent to this knote */
	uint16_t kfh_notification_order;      /* position of this knote in the notification pass */
} __attribute__((packed));
817 
818 
819 static_assert(sizeof(struct knote_footprint_history) <= sizeof(uint64_t) * 4, "footprint history fits in knote extensions");
820 
/*
 * Record notification-send telemetry on a knote: bump the per-knote
 * notification count and, for the FIRST warning/critical send since
 * pressure left normal, stamp the send time (seconds, stored in the two
 * uint32 halves of kn_sdata) and capture the task's starting footprint
 * plus its order in the notification list (stored in kn_ext via
 * struct knote_footprint_history).
 */
static void
mark_knote_send_time(struct knote *kn, task_t task, int knote_pressure_level, uint16_t order_within_list)
{
	uint32_t *timestamps;
	uint32_t index;
	uint64_t curr_ts, curr_ts_seconds;
	struct knote_footprint_history *footprint_history = (struct knote_footprint_history *)kn->kn_ext;
	if (knote_pressure_level != NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
		/* kn_sdata holds two 32-bit send timestamps: [0]=warning, [1]=critical. */
		timestamps = (uint32_t *)&(kn->kn_sdata);
		index = knote_pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN ?
		    KNOTE_SEND_TIMESTAMP_WARNING_INDEX : KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX;
		if (timestamps[index] == 0) {
			/* First notification for this level since pressure elevated from normal. */
			curr_ts = mach_absolute_time();
			curr_ts_seconds = 0;
			absolutetime_to_nanoseconds(curr_ts, &curr_ts_seconds);
			curr_ts_seconds /= NSEC_PER_SEC;

			/* Saturate to UINT32_MAX rather than truncate. */
			timestamps[index] = (uint32_t)MIN(UINT32_MAX, curr_ts_seconds);

			/* Record task initial footprint */
			if (timestamps[index == KNOTE_SEND_TIMESTAMP_WARNING_INDEX ? KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX : KNOTE_SEND_TIMESTAMP_WARNING_INDEX] == 0) {
				/*
				 * First notification at any level since pressure elevated from normal.
				 * Record the footprint and our order in the notification list.
				 *
				 * NOTE(review): the divisor (2UL << 20) is 2 MiB, not 1 MiB --
				 * if this field is meant to be megabytes, (1UL << 20) looks
				 * intended; the same divisor appears in
				 * update_knote_footprint_history(), so verify before changing.
				 */
				footprint_history->kfh_starting_footprint = (uint32_t) MIN(UINT32_MAX, get_task_phys_footprint(task) / (2UL << 20));
				footprint_history->kfh_notification_order = order_within_list;
			}
		}
	}
	footprint_history->kfh_num_notifications++;
}
854 
855 /*
856  * Records the current footprint for this task in the knote telemetry.
857  *
858  * Returns the soonest absolutetime when this footprint history should be updated again.
859  */
860 static uint64_t
update_knote_footprint_history(struct knote * kn,task_t task,uint64_t curr_ts)861 update_knote_footprint_history(struct knote *kn, task_t task, uint64_t curr_ts)
862 {
863 	uint32_t *timestamps = (uint32_t *)&(kn->kn_sdata);
864 	struct knote_footprint_history *footprint_history = (struct knote_footprint_history *)kn->kn_ext;
865 	uint64_t warning_send_time, critical_send_time, minutes_since_warning = UINT64_MAX, minutes_since_critical = UINT64_MAX;
866 	warning_send_time = timestamps[KNOTE_SEND_TIMESTAMP_WARNING_INDEX];
867 	critical_send_time = timestamps[KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX];
868 	uint32_t task_phys_footprint_mb = (uint32_t) MIN(UINT32_MAX, get_task_phys_footprint(task) / (2UL << 20));
869 	uint64_t next_run = UINT64_MAX, absolutetime_in_minute = 0, minutes_since_last_notification = 0, curr_ts_s;
870 	absolutetime_to_nanoseconds(curr_ts, &curr_ts_s);
871 	nanoseconds_to_absolutetime(60 * NSEC_PER_SEC, &absolutetime_in_minute);
872 	curr_ts_s /= NSEC_PER_SEC;
873 
874 	if (warning_send_time != 0) {
875 		/* This task received a warning notification. */
876 		minutes_since_warning = (curr_ts_s - warning_send_time) / 60;
877 		if (footprint_history->kfh_footprint_after_warn_1 == 0 && minutes_since_warning >= 1) {
878 			footprint_history->kfh_footprint_after_warn_1 = task_phys_footprint_mb;
879 		}
880 		if (footprint_history->kfh_footprint_after_warn_5 == 0 && minutes_since_warning >= 5) {
881 			footprint_history->kfh_footprint_after_warn_5 = task_phys_footprint_mb;
882 		}
883 		if (footprint_history->kfh_footprint_after_warn_20 == 0 && minutes_since_warning >= 20) {
884 			footprint_history->kfh_footprint_after_warn_20 = task_phys_footprint_mb;
885 		}
886 	}
887 	if (critical_send_time != 0) {
888 		/* This task received a critical notification. */
889 		minutes_since_critical = (curr_ts_s - critical_send_time) / 60;
890 		if (footprint_history->kfh_footprint_after_critical_1 == 0 && minutes_since_critical >= 1) {
891 			footprint_history->kfh_footprint_after_critical_1 = task_phys_footprint_mb;
892 		}
893 		if (footprint_history->kfh_footprint_after_critical_5 == 0 && minutes_since_critical >= 5) {
894 			footprint_history->kfh_footprint_after_critical_5 = task_phys_footprint_mb;
895 		}
896 		if (footprint_history->kfh_footprint_after_critical_20 == 0 && minutes_since_critical >= 20) {
897 			footprint_history->kfh_footprint_after_critical_20 = task_phys_footprint_mb;
898 		}
899 	}
900 
901 	minutes_since_last_notification = MIN(minutes_since_warning, minutes_since_critical);
902 	if (minutes_since_last_notification < 20) {
903 		if (minutes_since_last_notification < 5) {
904 			if (minutes_since_last_notification < 1) {
905 				next_run = curr_ts + absolutetime_in_minute;
906 			} else {
907 				next_run = curr_ts + (absolutetime_in_minute * 5);
908 			}
909 		} else {
910 			next_run = curr_ts + (absolutetime_in_minute * 20);
911 		}
912 	}
913 
914 	return next_run;
915 }
916 
917 extern char *proc_name_address(void *p);
918 /*
919  * Attempt to send the given level telemetry event.
920  * Finalizes the duration.
921  * Clears the src_event struct.
922  */
923 static void
memorystatus_pressure_interval_send(CA_EVENT_TYPE (memorystatus_pressure_interval)* src_event)924 memorystatus_pressure_interval_send(CA_EVENT_TYPE(memorystatus_pressure_interval) *src_event)
925 {
926 	uint64_t duration_nanoseconds = 0;
927 	uint64_t             curr_ts = mach_absolute_time();
928 	src_event->duration = curr_ts - src_event->duration;
929 	absolutetime_to_nanoseconds(src_event->duration, &duration_nanoseconds);
930 	src_event->duration = (int64_t) (duration_nanoseconds / NSEC_PER_SEC);
931 
932 	/*
933 	 * Drop the event rather than block for memory. We should be in a normal pressure level now,
934 	 * but we don't want to end up blocked in page_wait if there's a sudden spike in pressure.
935 	 */
936 	ca_event_t event_wrapper = CA_EVENT_ALLOCATE_FLAGS(memorystatus_pressure_interval, Z_NOWAIT);
937 	if (event_wrapper) {
938 		memcpy(event_wrapper->data, src_event, sizeof(CA_EVENT_TYPE(memorystatus_pressure_interval)));
939 		CA_EVENT_SEND(event_wrapper);
940 	}
941 	src_event->num_processes_registered = 0;
942 	src_event->num_notifications_sent = 0;
943 	src_event->max_level = 0;
944 	src_event->num_transitions = 0;
945 	src_event->num_kills = 0;
946 	src_event->duration = 0;
947 }
948 
949 
950 /*
951  * Attempt to send the per-proc telemetry events.
952  * Clears the footprint histories on the knotes.
953  */
954 static void
memorystatus_pressure_proc_telemetry_send(void)955 memorystatus_pressure_proc_telemetry_send(void)
956 {
957 	struct knote *kn = NULL;
958 	memorystatus_klist_lock();
959 	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
960 		proc_t            p = PROC_NULL;
961 		struct knote_footprint_history *footprint_history = (struct knote_footprint_history *)kn->kn_ext;
962 		uint32_t *timestamps = (uint32_t *)&(kn->kn_sdata);
963 		uint32_t warning_send_time = timestamps[KNOTE_SEND_TIMESTAMP_WARNING_INDEX];
964 		uint32_t critical_send_time = timestamps[KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX];
965 		CA_EVENT_TYPE(memorystatus_proc_notification) * event = NULL;
966 		if (warning_send_time != 0 || critical_send_time != 0) {
967 			/*
968 			 * Drop the event rather than block for memory. We should be in a normal pressure level now,
969 			 * but we don't want to end up blocked in page_wait if there's a sudden spike in pressure.
970 			 */
971 			ca_event_t event_wrapper = CA_EVENT_ALLOCATE_FLAGS(memorystatus_proc_notification, Z_NOWAIT | Z_ZERO);
972 			if (event_wrapper) {
973 				event = event_wrapper->data;
974 
975 				event->footprint_before_notification = footprint_history->kfh_starting_footprint;
976 				event->footprint_1_min_after_first_warning = footprint_history->kfh_footprint_after_warn_1;
977 				event->footprint_5_min_after_first_warning = footprint_history->kfh_footprint_after_warn_5;
978 				event->footprint_20_min_after_first_warning = footprint_history->kfh_footprint_after_warn_20;
979 				event->footprint_1_min_after_first_critical = footprint_history->kfh_footprint_after_critical_1;
980 				event->footprint_5_min_after_first_critical = footprint_history->kfh_footprint_after_critical_5;
981 				event->footprint_20_min_after_first_critical = footprint_history->kfh_footprint_after_critical_20;
982 				event->num_notifications_sent = footprint_history->kfh_num_notifications;
983 				if (warning_send_time != 0 && critical_send_time != 0) {
984 					event->time_between_warning_and_critical = (critical_send_time - warning_send_time) / 60; // Minutes
985 				}
986 				event->order_within_list = footprint_history->kfh_notification_order;
987 
988 				p = proc_ref(knote_get_kq(kn)->kq_p, false);
989 				if (p == NULL) {
990 					CA_EVENT_DEALLOCATE(event_wrapper);
991 					continue;
992 				}
993 				strlcpy(event->proc_name, proc_name_address(p), sizeof(event->proc_name));
994 
995 				proc_rele(p);
996 				CA_EVENT_SEND(event_wrapper);
997 			}
998 		}
999 		memset(footprint_history, 0, sizeof(*footprint_history));
1000 		timestamps[KNOTE_SEND_TIMESTAMP_WARNING_INDEX] = 0;
1001 		timestamps[KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX] = 0;
1002 	}
1003 	memorystatus_klist_unlock();
1004 }
1005 
/*
 * Send all telemetry associated with the increased pressure interval:
 * the interval-wide summary event followed by the per-process events.
 */
static void
memorystatus_pressure_telemetry_send(void)
{
	/* The per-proc sender takes the klist lock itself; it must not be held here. */
	LCK_MTX_ASSERT(&memorystatus_klist_mutex, LCK_MTX_ASSERT_NOTOWNED);
	memorystatus_pressure_interval_send(&memorystatus_pressure_interval_telemetry);
	memorystatus_pressure_proc_telemetry_send();
}
1016 
1017 
1018 /*
1019  * kn_max - knote
1020  *
1021  * knote_pressure_level - to check if the knote is registered for this notification level.
1022  *
1023  * task    - task whose bits we'll be modifying
1024  *
1025  * pressure_level_to_clear - if the task has been notified of this past level, clear that notification bit so that if/when we revert to that level, the task will be notified again.
1026  *
1027  * pressure_level_to_set - the task is about to be notified of this new level. Update the task's bit notification information appropriately.
1028  *
1029  */
1030 
1031 static boolean_t
is_knote_registered_modify_task_pressure_bits(struct knote * kn_max,int knote_pressure_level,task_t task,vm_pressure_level_t pressure_level_to_clear,vm_pressure_level_t pressure_level_to_set)1032 is_knote_registered_modify_task_pressure_bits(struct knote *kn_max, int knote_pressure_level, task_t task, vm_pressure_level_t pressure_level_to_clear, vm_pressure_level_t pressure_level_to_set)
1033 {
1034 	if (kn_max->kn_sfflags & knote_pressure_level) {
1035 		if (pressure_level_to_clear && task_has_been_notified(task, pressure_level_to_clear) == TRUE) {
1036 			task_clear_has_been_notified(task, pressure_level_to_clear);
1037 		}
1038 
1039 		task_mark_has_been_notified(task, pressure_level_to_set);
1040 		return TRUE;
1041 	}
1042 
1043 	return FALSE;
1044 }
1045 
1046 static void
memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear)1047 memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear)
1048 {
1049 	struct knote *kn = NULL;
1050 
1051 	memorystatus_klist_lock();
1052 
1053 	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1054 		proc_t p = knote_get_kq(kn)->kq_p;
1055 
1056 		if (p == proc_ref(p, false)) {
1057 			task_clear_has_been_notified(proc_task(p), pressure_level_to_clear);
1058 			proc_rele(p);
1059 		}
1060 	}
1061 
1062 	memorystatus_klist_unlock();
1063 }
1064 
/*
 * Used by the vm_pressure_thread which is
 * signalled from within vm_pageout_scan().
 */

/* Thin entry point: forwards to the dispatcher below. */
void
consider_vm_pressure_events(void)
{
	vm_dispatch_memory_pressure();
}
1075 
/* Run one round of pressure evaluation, not targeting foreground processes. */
static void
vm_dispatch_memory_pressure(void)
{
	memorystatus_update_vm_pressure(FALSE);
}
1081 
/*
 * Scan 'candidate_list' and pick the single best knote to notify about
 * pressure 'level'.
 *
 * Selection policy:
 *  - A privileged listener that has not yet been notified at this level wins
 *    immediately (level > 0 only).
 *  - Otherwise, among eligible processes whose footprint is at least
 *    vm_pressure_task_footprint_min MB, select by task importance — least
 *    important first when pressure is rising, most important first when
 *    falling — breaking ties by larger footprint.
 *
 * While scanning, also refreshes each knote's footprint telemetry and writes
 * the earliest absolutetime the telemetry needs updating again into
 * *next_telemetry_update (UINT64_MAX when no update is pending).
 *
 * Returns the chosen knote, or NULL when nobody should be notified.
 * Called with the memorystatus klist lock held (see memorystatus_update_vm_pressure).
 */
static struct knote *
vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process, uint64_t *next_telemetry_update)
{
	struct knote    *kn = NULL, *kn_max = NULL;
	uint64_t    resident_max = 0;/* MB */
	int        selected_task_importance = 0;
	/* Persists across calls: previous level seen; -1 until the first call. */
	static int    pressure_snapshot = -1;
	boolean_t    pressure_increase = FALSE;
	uint64_t     curr_ts = mach_absolute_time();
	*next_telemetry_update = UINT64_MAX;

	if (pressure_snapshot == -1) {
		/*
		 * Initial snapshot.
		 */
		pressure_snapshot = level;
		pressure_increase = TRUE;
	} else {
		if (level && (level >= pressure_snapshot)) {
			pressure_increase = TRUE;
		} else {
			pressure_increase = FALSE;
		}

		pressure_snapshot = level;
	}

	if (pressure_increase == TRUE) {
		/*
		 * We'll start by considering the largest
		 * unimportant task in our list.
		 */
		selected_task_importance = INT_MAX;
	} else {
		/*
		 * We'll start by considering the largest
		 * important task in our list.
		 */
		selected_task_importance = 0;
	}

	SLIST_FOREACH(kn, candidate_list, kn_selnext) {
		uint64_t        resident_size = 0;/* MB */
		proc_t            p = PROC_NULL;
		struct task*        t = TASK_NULL;
		int            curr_task_importance = 0;
		uint64_t         telemetry_update = 0;
		boolean_t        consider_knote = FALSE;
		boolean_t        privileged_listener = FALSE;

		p = proc_ref(knote_get_kq(kn)->kq_p, false);
		if (p == PROC_NULL) {
			/* Could not take a reference on the proc; skip it. */
			continue;
		}

#if CONFIG_MEMORYSTATUS
		if (target_foreground_process == TRUE && !memorystatus_is_foreground_locked(p)) {
			/*
			 * Skip process not marked foreground.
			 */
			proc_rele(p);
			continue;
		}
#endif /* CONFIG_MEMORYSTATUS */

		t = (struct task *)(proc_task(p));
		/* Refresh this knote's footprint telemetry while we have the task in hand. */
		telemetry_update = update_knote_footprint_history(kn, t, curr_ts);
		*next_telemetry_update = MIN(*next_telemetry_update, telemetry_update);

		vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(level);

		if ((kn->kn_sfflags & dispatch_level) == 0) {
			/* Knote is not registered for this pressure level. */
			proc_rele(p);
			continue;
		}

#if CONFIG_MEMORYSTATUS
		if (target_foreground_process == FALSE && !memorystatus_bg_pressure_eligible(p)) {
			VM_PRESSURE_DEBUG(1, "[vm_pressure] skipping process %d\n", proc_getpid(p));
			proc_rele(p);
			continue;
		}
#endif /* CONFIG_MEMORYSTATUS */

#if XNU_TARGET_OS_OSX
		curr_task_importance = task_importance_estimate(t);
#else /* XNU_TARGET_OS_OSX */
		curr_task_importance = p->p_memstat_effectivepriority;
#endif /* XNU_TARGET_OS_OSX */

		/*
		 * Privileged listeners are only considered in the multi-level pressure scheme
		 * AND only if the pressure is increasing.
		 */
		if (level > 0) {
			if (task_has_been_notified(t, level) == FALSE) {
				/*
				 * Is this a privileged listener?
				 */
				if (task_low_mem_privileged_listener(t, FALSE, &privileged_listener) == 0) {
					if (privileged_listener) {
						/* A privileged listener short-circuits the scan. */
						kn_max = kn;
						proc_rele(p);
						goto done_scanning;
					}
				}
			} else {
				/* Already notified at this level; skip. */
				proc_rele(p);
				continue;
			}
		} else if (level == 0) {
			/*
			 * Task wasn't notified when the pressure was increasing and so
			 * no need to notify it that the pressure is decreasing.
			 */
			if ((task_has_been_notified(t, kVMPressureWarning) == FALSE) && (task_has_been_notified(t, kVMPressureCritical) == FALSE)) {
				proc_rele(p);
				continue;
			}
		}

		/*
		 * We don't want a small process to block large processes from
		 * being notified again. <rdar://problem/7955532>
		 */
		resident_size = (get_task_phys_footprint(t)) / (1024 * 1024ULL); /* MB */

		if (resident_size >= vm_pressure_task_footprint_min) {
			if (level > 0) {
				/*
				 * Warning or Critical Pressure.
				 */
				if (pressure_increase) {
					if ((curr_task_importance < selected_task_importance) ||
					    ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
						/*
						 * We have found a candidate process which is:
						 * a) at a lower importance than the current selected process
						 * OR
						 * b) has importance equal to that of the current selected process but is larger
						 */

						consider_knote = TRUE;
					}
				} else {
					if ((curr_task_importance > selected_task_importance) ||
					    ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
						/*
						 * We have found a candidate process which is:
						 * a) at a higher importance than the current selected process
						 * OR
						 * b) has importance equal to that of the current selected process but is larger
						 */

						consider_knote = TRUE;
					}
				}
			} else if (level == 0) {
				/*
				 * Pressure back to normal.
				 */
				if ((curr_task_importance > selected_task_importance) ||
				    ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
					consider_knote = TRUE;
				}
			}

			if (consider_knote) {
				resident_max = resident_size;
				kn_max = kn;
				selected_task_importance = curr_task_importance;
				consider_knote = FALSE; /* reset for the next candidate */
			}
		} else {
			/* There was no candidate with enough resident memory to scavenge */
			VM_PRESSURE_DEBUG(0, "[vm_pressure] threshold failed for pid %d with %llu resident...\n", proc_getpid(p), resident_size);
		}
		proc_rele(p);
	}

done_scanning:
	if (kn_max) {
		VM_DEBUG_CONSTANT_EVENT(vm_pressure_event, DBG_VM_PRESSURE_EVENT, DBG_FUNC_NONE, proc_getpid(knote_get_kq(kn_max)->kq_p), resident_max, 0, 0);
		VM_PRESSURE_DEBUG(1, "[vm_pressure] sending event to pid %d with %llu resident\n", proc_getpid(knote_get_kq(kn_max)->kq_p), resident_max);
	}

	return kn_max;
}
1270 
1271 /*
1272  * To avoid notification storms in a system with sawtooth behavior of pressure levels eg:
1273  * Normal -> warning (notify clients) -> critical (notify) -> warning (notify) -> critical (notify) -> warning (notify)...
1274  *
1275  * We have 'resting' periods: WARNING_NOTIFICATION_RESTING_PERIOD and CRITICAL_NOTIFICATION_RESTING_PERIOD
1276  *
1277  * So it would look like:-
1278  * Normal -> warning (notify) -> critical (notify) -> warning (notify if it has been RestPeriod since last warning) -> critical (notify if it has been RestPeriod since last critical) -> ...
1279  *
1280  * That's what these 2 timestamps below signify.
1281  */
1282 
/* Absolutetime before which no further warning-level notifications are sent (0 = no rest period active). */
uint64_t next_warning_notification_sent_at_ts = 0;
/* Absolutetime before which no further critical-level notifications are sent (0 = no rest period active). */
uint64_t next_critical_notification_sent_at_ts = 0;

/* When TRUE, the notification scheme is being exercised manually; inter-notification delays are skipped. */
boolean_t        memorystatus_manual_testing_on = FALSE;
vm_pressure_level_t    memorystatus_manual_testing_level = kVMPressureNormal;

/* Bounds the sustained-pressure kill budget: max kills = 2x the proc count up to this band. */
unsigned int memorystatus_sustained_pressure_maximum_band = JETSAM_PRIORITY_IDLE;
#if DEVELOPMENT || DEBUG
SYSCTL_INT(_kern, OID_AUTO, memorystatus_sustained_pressure_maximum_band, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_sustained_pressure_maximum_band, 0, "");
#endif /* DEVELOPMENT || DEBUG */
1293 
1294 #if CONFIG_JETSAM
1295 
1296 /*
1297  * TODO(jason): The memorystatus thread should be responsible for this
1298  * It can just check how long the pressure level has been at warning and the timestamp
1299  * of the last sustained pressure kill.
1300  */
1301 static void
sustained_pressure_handler(void * arg0 __unused,void * arg1 __unused)1302 sustained_pressure_handler(void* arg0 __unused, void* arg1 __unused)
1303 {
1304 	int max_kills = 0, kill_count = 0;
1305 	/*
1306 	 * Pressure has been elevated for too long.
1307 	 * We don't want to leave the system in this state as it can delay background
1308 	 * work indefinitely & drain battery.
1309 	 *
1310 	 * Try to return the system to normal via jetsam.
1311 	 * We'll run through the idle band up to 2 times.
1312 	 * If the pressure hasn't been relieved by then, the problem is memory
1313 	 * consumption in a higher band and this churn is probably doing more harm than good.
1314 	 */
1315 	max_kills = memstat_get_proccnt_upto_priority(memorystatus_sustained_pressure_maximum_band) * 2;
1316 	memorystatus_log("memorystatus: Pressure level has been elevated for too long. killing up to %d idle processes\n", max_kills);
1317 	while (memorystatus_vm_pressure_level != kVMPressureNormal && kill_count < max_kills) {
1318 		bool killed = memorystatus_kill_on_sustained_pressure();
1319 		if (killed) {
1320 			/*
1321 			 * Pause before our next kill & see if pressure reduces.
1322 			 */
1323 			delay((int)(memorystatus_kill_on_sustained_pressure_delay_ms * NSEC_PER_MSEC / NSEC_PER_USEC));
1324 			kill_count++;
1325 			memorystatus_kill_on_sustained_pressure_count++;
1326 			/* TODO(jason): Should use os_atomic but requires rdar://76310894. */
1327 			memorystatus_pressure_interval_telemetry.num_kills++;
1328 		} else {
1329 			/* Nothing left to kill */
1330 			break;
1331 		}
1332 	}
1333 	if (memorystatus_vm_pressure_level != kVMPressureNormal) {
1334 		memorystatus_log("memorystatus: Killed %d idle processes due to sustained pressure, but device didn't quiesce. Giving up.\n", kill_count);
1335 	}
1336 }
1337 
1338 #endif /* CONFIG_JETSAM */
1339 
1340 /*
1341  * Returns the number of processes registered for notifications at this level.
1342  */
1343 static size_t
memorystatus_klist_length(int level)1344 memorystatus_klist_length(int level)
1345 {
1346 	LCK_MTX_ASSERT(&memorystatus_klist_mutex, LCK_MTX_ASSERT_OWNED);
1347 	struct knote *kn;
1348 	size_t count = 0;
1349 	int knote_pressure_level = convert_internal_pressure_level_to_dispatch_level(level);
1350 	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1351 		if (kn->kn_sfflags & knote_pressure_level) {
1352 			count++;
1353 		}
1354 	}
1355 	return count;
1356 }
1357 
1358 /*
1359  * Updates the footprint telemetry for procs that have received notifications.
1360  */
1361 static void
update_footprints_for_telemetry(void * arg0 __unused,void * arg1 __unused)1362 update_footprints_for_telemetry(void* arg0 __unused, void* arg1 __unused)
1363 {
1364 	uint64_t curr_ts = mach_absolute_time(), next_telemetry_update = UINT64_MAX;
1365 	struct knote *kn;
1366 
1367 	memorystatus_klist_lock();
1368 	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1369 		proc_t            p = PROC_NULL;
1370 		struct task*      t = TASK_NULL;
1371 		uint64_t telemetry_update;
1372 
1373 		p = proc_ref(knote_get_kq(kn)->kq_p, false);
1374 		if (p == PROC_NULL) {
1375 			continue;
1376 		}
1377 		t = (struct task *)(proc_task(p));
1378 		proc_rele(p);
1379 		p = PROC_NULL;
1380 		telemetry_update = update_knote_footprint_history(kn, t, curr_ts);
1381 		next_telemetry_update = MIN(next_telemetry_update, telemetry_update);
1382 	}
1383 	memorystatus_klist_unlock();
1384 	if (next_telemetry_update != UINT64_MAX) {
1385 		uint64_t next_update_seconds;
1386 		absolutetime_to_nanoseconds(next_telemetry_update, &next_update_seconds);
1387 		next_update_seconds /= NSEC_PER_SEC;
1388 		thread_call_enter_delayed(memorystatus_notify_update_telemetry_thread_call, next_telemetry_update);
1389 	}
1390 }
1391 
1392 kern_return_t
memorystatus_update_vm_pressure(boolean_t target_foreground_process)1393 memorystatus_update_vm_pressure(boolean_t target_foreground_process)
1394 {
1395 	struct knote            *kn_max = NULL;
1396 	struct knote            *kn_cur = NULL, *kn_temp = NULL;/* for safe list traversal */
1397 	pid_t                target_pid = -1;
1398 	struct klist            dispatch_klist = { NULL };
1399 	proc_t                target_proc = PROC_NULL;
1400 	struct task            *task = NULL;
1401 	boolean_t            found_candidate = FALSE;
1402 
1403 	static vm_pressure_level_t     level_snapshot = kVMPressureNormal;
1404 	static vm_pressure_level_t    prev_level_snapshot = kVMPressureNormal;
1405 	boolean_t            smoothing_window_started = FALSE;
1406 	struct timeval            smoothing_window_start_tstamp = {0, 0};
1407 	struct timeval            curr_tstamp = {0, 0};
1408 	int64_t              elapsed_msecs = 0;
1409 	uint64_t             curr_ts = mach_absolute_time(), next_telemetry_update = UINT64_MAX;
1410 
1411 
1412 	uint64_t logging_now;
1413 	absolutetime_to_nanoseconds(curr_ts, &logging_now);
1414 #if !CONFIG_JETSAM
1415 #define MAX_IDLE_KILLS 100    /* limit the number of idle kills allowed */
1416 
1417 	int    idle_kill_counter = 0;
1418 
1419 	/*
1420 	 * On desktop we take this opportunity to free up memory pressure
1421 	 * by immediately killing idle exitable processes. We use a delay
1422 	 * to avoid overkill.  And we impose a max counter as a fail safe
1423 	 * in case daemons re-launch too fast.
1424 	 *
1425 	 * TODO: These jetsams should be performed on the memorystatus thread. We can
1426 	 * provide the similar false-idle mitigation by skipping processes with med/high
1427 	 * relaunch probability and/or using the sustained-pressure mechanism.
1428 	 * (rdar://134075608)
1429 	 */
1430 	while ((memorystatus_vm_pressure_level != kVMPressureNormal) && (idle_kill_counter < MAX_IDLE_KILLS)) {
1431 		if (!memstat_kill_idle_process(kMemorystatusKilledIdleExit, NULL)) {
1432 			/* No idle exitable processes left to kill */
1433 			break;
1434 		}
1435 		idle_kill_counter++;
1436 
1437 		if (memorystatus_manual_testing_on == TRUE) {
1438 			/*
1439 			 * Skip the delay when testing
1440 			 * the pressure notification scheme.
1441 			 */
1442 		} else {
1443 			delay(1000000); /* 1 second */
1444 		}
1445 	}
1446 #endif /* !CONFIG_JETSAM */
1447 
1448 	if (level_snapshot != kVMPressureNormal) {
1449 		/*
1450 		 * Check to see if we are still in the 'resting' period
1451 		 * after having notified all clients interested in
1452 		 * a particular pressure level.
1453 		 */
1454 
1455 		level_snapshot = memorystatus_vm_pressure_level;
1456 
1457 		if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
1458 			if (next_warning_notification_sent_at_ts) {
1459 				if (curr_ts < next_warning_notification_sent_at_ts) {
1460 					delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
1461 					return KERN_SUCCESS;
1462 				}
1463 
1464 				next_warning_notification_sent_at_ts = 0;
1465 				memorystatus_klist_reset_all_for_level(kVMPressureWarning);
1466 			}
1467 		} else if (level_snapshot == kVMPressureCritical) {
1468 			if (next_critical_notification_sent_at_ts) {
1469 				if (curr_ts < next_critical_notification_sent_at_ts) {
1470 					delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
1471 					return KERN_SUCCESS;
1472 				}
1473 				next_critical_notification_sent_at_ts = 0;
1474 				memorystatus_klist_reset_all_for_level(kVMPressureCritical);
1475 			}
1476 		}
1477 	}
1478 
1479 #if CONFIG_JETSAM
1480 	if (memorystatus_vm_pressure_level == kVMPressureNormal && prev_level_snapshot != kVMPressureNormal) {
1481 		if (memorystatus_should_kill_on_sustained_pressure) {
1482 			memorystatus_log("memorystatus: Pressure has returned to level %d. Cancelling scheduled jetsam\n", memorystatus_vm_pressure_level);
1483 			thread_call_cancel(sustained_pressure_handler_thread_call);
1484 		}
1485 	} else if (memorystatus_should_kill_on_sustained_pressure && memorystatus_vm_pressure_level != kVMPressureNormal && prev_level_snapshot == kVMPressureNormal) {
1486 		/*
1487 		 * Pressure has increased from normal.
1488 		 * Hopefully the notifications will relieve it,
1489 		 * but as a fail-safe we'll trigger jetsam
1490 		 * after a configurable amount of time.
1491 		 */
1492 		memorystatus_log("memorystatus: Pressure level has increased from %d to %d. Scheduling jetsam.\n", prev_level_snapshot, memorystatus_vm_pressure_level);
1493 		uint64_t kill_time;
1494 		nanoseconds_to_absolutetime(memorystatus_kill_on_sustained_pressure_window_s * NSEC_PER_SEC, &kill_time);
1495 		kill_time += mach_absolute_time();
1496 		thread_call_enter_delayed(sustained_pressure_handler_thread_call, kill_time);
1497 	}
1498 #endif /* CONFIG_JETSAM */
1499 
1500 	while (1) {
1501 		/*
1502 		 * There is a race window here. But it's not clear
1503 		 * how much we benefit from having extra synchronization.
1504 		 */
1505 		level_snapshot = memorystatus_vm_pressure_level;
1506 
1507 		if (prev_level_snapshot > level_snapshot) {
1508 			/*
1509 			 * Pressure decreased? Let's take a little breather
1510 			 * and see if this condition stays.
1511 			 */
1512 			if (smoothing_window_started == FALSE) {
1513 				smoothing_window_started = TRUE;
1514 				microuptime(&smoothing_window_start_tstamp);
1515 			}
1516 
1517 			microuptime(&curr_tstamp);
1518 			timevalsub(&curr_tstamp, &smoothing_window_start_tstamp);
1519 			elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000;
1520 
1521 			if (elapsed_msecs < VM_PRESSURE_DECREASED_SMOOTHING_PERIOD) {
1522 				delay(INTER_NOTIFICATION_DELAY);
1523 				continue;
1524 			}
1525 		}
1526 		if (level_snapshot == kVMPressureNormal) {
1527 			memorystatus_pressure_telemetry_send();
1528 		}
1529 		prev_level_snapshot = level_snapshot;
1530 		smoothing_window_started = FALSE;
1531 		memorystatus_klist_lock();
1532 
1533 		if (level_snapshot > memorystatus_pressure_interval_telemetry.max_level) {
1534 			memorystatus_pressure_interval_telemetry.num_processes_registered = memorystatus_klist_length(level_snapshot);
1535 			memorystatus_pressure_interval_telemetry.max_level = level_snapshot;
1536 			memorystatus_pressure_interval_telemetry.num_transitions++;
1537 			if (memorystatus_pressure_interval_telemetry.duration == 0) {
1538 				/* Set the start timestamp. Duration will be finalized when we send the event. */
1539 				memorystatus_pressure_interval_telemetry.duration = curr_ts;
1540 			}
1541 		}
1542 
1543 		kn_max = vm_pressure_select_optimal_candidate_to_notify(&memorystatus_klist, level_snapshot, target_foreground_process, &next_telemetry_update);
1544 
1545 		if (kn_max == NULL) {
1546 			memorystatus_klist_unlock();
1547 
1548 			/*
1549 			 * No more level-based clients to notify.
1550 			 *
1551 			 * Start the 'resting' window within which clients will not be re-notified.
1552 			 */
1553 
1554 			if (level_snapshot != kVMPressureNormal) {
1555 				if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
1556 					nanoseconds_to_absolutetime(WARNING_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);
1557 
1558 					/* Next warning notification (if nothing changes) won't be sent before...*/
1559 					next_warning_notification_sent_at_ts = mach_absolute_time() + curr_ts;
1560 				}
1561 
1562 				if (level_snapshot == kVMPressureCritical) {
1563 					nanoseconds_to_absolutetime(CRITICAL_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);
1564 
1565 					/* Next critical notification (if nothing changes) won't be sent before...*/
1566 					next_critical_notification_sent_at_ts = mach_absolute_time() + curr_ts;
1567 				}
1568 			}
1569 			absolutetime_to_nanoseconds(mach_absolute_time(), &logging_now);
1570 			if (next_telemetry_update != UINT64_MAX) {
1571 				thread_call_enter_delayed(memorystatus_notify_update_telemetry_thread_call, next_telemetry_update);
1572 			} else {
1573 				thread_call_cancel(memorystatus_notify_update_telemetry_thread_call);
1574 			}
1575 			return KERN_FAILURE;
1576 		}
1577 
1578 		target_proc = proc_ref(knote_get_kq(kn_max)->kq_p, false);
1579 		if (target_proc == PROC_NULL) {
1580 			memorystatus_klist_unlock();
1581 			continue;
1582 		}
1583 
1584 		target_pid = proc_getpid(target_proc);
1585 
1586 		task = (struct task *)(proc_task(target_proc));
1587 
1588 		if (level_snapshot != kVMPressureNormal) {
1589 			if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
1590 				if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_WARN, task, 0, kVMPressureWarning) == TRUE) {
1591 					found_candidate = TRUE;
1592 				}
1593 			} else {
1594 				if (level_snapshot == kVMPressureCritical) {
1595 					if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, task, 0, kVMPressureCritical) == TRUE) {
1596 						found_candidate = TRUE;
1597 					}
1598 				}
1599 			}
1600 		} else {
1601 			if (kn_max->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
1602 				task_clear_has_been_notified(task, kVMPressureWarning);
1603 				task_clear_has_been_notified(task, kVMPressureCritical);
1604 
1605 				found_candidate = TRUE;
1606 			}
1607 		}
1608 
1609 		if (found_candidate == FALSE) {
1610 			proc_rele(target_proc);
1611 			memorystatus_klist_unlock();
1612 			continue;
1613 		}
1614 
1615 		SLIST_FOREACH_SAFE(kn_cur, &memorystatus_klist, kn_selnext, kn_temp) {
1616 			int knote_pressure_level = convert_internal_pressure_level_to_dispatch_level(level_snapshot);
1617 
1618 			if (is_knote_registered_modify_task_pressure_bits(kn_cur, knote_pressure_level, task, 0, level_snapshot) == TRUE) {
1619 				proc_t knote_proc = knote_get_kq(kn_cur)->kq_p;
1620 				pid_t knote_pid = proc_getpid(knote_proc);
1621 				if (knote_pid == target_pid) {
1622 					KNOTE_DETACH(&memorystatus_klist, kn_cur);
1623 					KNOTE_ATTACH(&dispatch_klist, kn_cur);
1624 				}
1625 			}
1626 		}
1627 		if (level_snapshot != kVMPressureNormal) {
1628 			mark_knote_send_time(kn_max, task, convert_internal_pressure_level_to_dispatch_level(level_snapshot),
1629 			    (uint16_t) MIN(UINT16_MAX, memorystatus_pressure_interval_telemetry.num_notifications_sent));
1630 			memorystatus_pressure_interval_telemetry.num_notifications_sent++;
1631 		}
1632 
1633 		KNOTE(&dispatch_klist, (level_snapshot != kVMPressureNormal) ? kMemorystatusPressure : kMemorystatusNoPressure);
1634 
1635 		SLIST_FOREACH_SAFE(kn_cur, &dispatch_klist, kn_selnext, kn_temp) {
1636 			KNOTE_DETACH(&dispatch_klist, kn_cur);
1637 			KNOTE_ATTACH(&memorystatus_klist, kn_cur);
1638 		}
1639 
1640 		memorystatus_klist_unlock();
1641 
1642 		microuptime(&target_proc->vm_pressure_last_notify_tstamp);
1643 		proc_rele(target_proc);
1644 
1645 		if (memorystatus_manual_testing_on == TRUE && target_foreground_process == TRUE) {
1646 			break;
1647 		}
1648 
1649 		if (memorystatus_manual_testing_on == TRUE) {
1650 			/*
1651 			 * Testing out the pressure notification scheme.
1652 			 * No need for delays etc.
1653 			 */
1654 		} else {
1655 			uint32_t sleep_interval = INTER_NOTIFICATION_DELAY;
1656 #if CONFIG_JETSAM
1657 
1658 			uint32_t critical_threshold = memorystatus_get_critical_page_shortage_threshold();
1659 			uint32_t soft_threshold = memorystatus_get_soft_memlimit_page_shortage_threshold();
1660 			assert(soft_threshold >= critical_threshold);
1661 
1662 			uint32_t backoff_threshold = soft_threshold -
1663 			    ((soft_threshold - critical_threshold) / 2);
1664 
1665 			if (memorystatus_get_available_page_count() <= backoff_threshold) {
1666 				/*
1667 				 * We are nearing the critcal mark fast and can't afford to wait between
1668 				 * notifications.
1669 				 */
1670 				sleep_interval = 0;
1671 			}
1672 #endif /* CONFIG_JETSAM */
1673 
1674 			if (sleep_interval) {
1675 				delay(sleep_interval);
1676 			}
1677 		}
1678 	}
1679 
1680 	return KERN_SUCCESS;
1681 }
1682 
1683 static uint32_t
convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level)1684 convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level)
1685 {
1686 	uint32_t dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
1687 
1688 	switch (internal_pressure_level) {
1689 	case kVMPressureNormal:
1690 	{
1691 		dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
1692 		break;
1693 	}
1694 
1695 	case kVMPressureWarning:
1696 	case kVMPressureUrgent:
1697 	{
1698 		dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_WARN;
1699 		break;
1700 	}
1701 
1702 	case kVMPressureCritical:
1703 	{
1704 		dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
1705 		break;
1706 	}
1707 
1708 	default:
1709 		break;
1710 	}
1711 
1712 	return dispatch_level;
1713 }
1714 
1715 /*
1716  * Issue a wakeup to any threads listening for jetsam pressure via
1717  * `mach_vm_pressure_level_monitor`. Subscribers should respond to these
1718  * notifications by freeing cached memory.
1719  */
1720 void
memorystatus_broadcast_jetsam_pressure(vm_pressure_level_t pressure_level)1721 memorystatus_broadcast_jetsam_pressure(vm_pressure_level_t pressure_level)
1722 {
1723 	uint64_t now;
1724 	uint32_t *waiters = NULL;
1725 	uint64_t *last_notification_ns = NULL;
1726 
1727 	switch (pressure_level) {
1728 	case kVMPressureForegroundJetsam:
1729 		waiters = &memorystatus_jetsam_fg_band_waiters;
1730 		last_notification_ns = &memorystatus_jetsam_fg_band_timestamp_ns;
1731 		break;
1732 	case kVMPressureBackgroundJetsam:
1733 		waiters = &memorystatus_jetsam_bg_band_waiters;
1734 		last_notification_ns = &memorystatus_jetsam_bg_band_timestamp_ns;
1735 		break;
1736 	default:
1737 		panic("Unexpected non-jetsam pressure level %d", pressure_level);
1738 	}
1739 
1740 	lck_mtx_lock(&memorystatus_jetsam_broadcast_lock);
1741 	absolutetime_to_nanoseconds(mach_absolute_time(), &now);
1742 
1743 	if (now - *last_notification_ns < memorystatus_jetsam_notification_delay_ns) {
1744 		lck_mtx_unlock(&memorystatus_jetsam_broadcast_lock);
1745 		return;
1746 	}
1747 
1748 	if (*waiters > 0) {
1749 		memorystatus_log("memorystatus: issuing %s jetsam pressure notification to %d waiters",
1750 		    pressure_level == kVMPressureForegroundJetsam ?
1751 		    "foreground" : "background", *waiters);
1752 		thread_wakeup((event_t)waiters);
1753 		*waiters = 0;
1754 		*last_notification_ns = now;
1755 	}
1756 	lck_mtx_unlock(&memorystatus_jetsam_broadcast_lock);
1757 }
1758 
1759 /*
1760  * Memorystatus notification debugging support
1761  */
1762 
1763 #if DEVELOPMENT || DEBUG
1764 
1765 static int
1766 sysctl_memorystatus_broadcast_jetsam_pressure SYSCTL_HANDLER_ARGS
1767 {
1768 	int error = 0;
1769 	vm_pressure_level_t pressure_level;
1770 
1771 	error = SYSCTL_IN(req, &pressure_level, sizeof(pressure_level));
1772 	if (error) {
1773 		return error;
1774 	}
1775 
1776 	if (pressure_level == kVMPressureForegroundJetsam ||
1777 	    pressure_level == kVMPressureBackgroundJetsam) {
1778 		memorystatus_broadcast_jetsam_pressure(pressure_level);
1779 	} else {
1780 		return EINVAL;
1781 	}
1782 
1783 	return SYSCTL_OUT(req, &pressure_level, sizeof(pressure_level));
1784 }
1785 
/* Write a pressure level to force a test jetsam-pressure broadcast (dev/debug only). */
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_broadcast_jetsam_pressure,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED,
    0, 0, &sysctl_memorystatus_broadcast_jetsam_pressure, "I", "");
1789 
1790 #endif /* DEVELOPMENT || DEBUG */
1791 
1792 static int
1793 sysctl_memorystatus_vm_pressure_level SYSCTL_HANDLER_ARGS
1794 {
1795 #pragma unused(arg1, arg2, oidp)
1796 #if !XNU_TARGET_OS_OSX
1797 	int error = 0;
1798 
1799 	error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
1800 	if (error) {
1801 		return error;
1802 	}
1803 
1804 #endif /* !XNU_TARGET_OS_OSX */
1805 	uint32_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(memorystatus_vm_pressure_level);
1806 
1807 	return SYSCTL_OUT(req, &dispatch_level, sizeof(dispatch_level));
1808 }
1809 
#if DEBUG || DEVELOPMENT

/* Debug/development kernels: pressure level visible in sysctl listings. */
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");

#else /* DEBUG || DEVELOPMENT */

/* Release kernels: same sysctl, but CTLFLAG_MASKED hides it from enumeration. */
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");

#endif /* DEBUG || DEVELOPMENT */
1821 
1822 /*
1823  * Trigger levels to test the mechanism.
1824  * Can be used via a sysctl.
1825  */
1826 #define TEST_LOW_MEMORY_TRIGGER_ONE        1
1827 #define TEST_LOW_MEMORY_TRIGGER_ALL        2
1828 #define TEST_PURGEABLE_TRIGGER_ONE        3
1829 #define TEST_PURGEABLE_TRIGGER_ALL        4
1830 #define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE    5
1831 #define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL    6
1832 
1833 static int
1834 sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS
1835 {
1836 #pragma unused(arg1, arg2)
1837 
1838 	int level = 0;
1839 	int error = 0;
1840 	int pressure_level = 0;
1841 	int trigger_request = 0;
1842 	int force_purge;
1843 
1844 	error = sysctl_handle_int(oidp, &level, 0, req);
1845 	if (error || !req->newptr) {
1846 		return error;
1847 	}
1848 
1849 	memorystatus_manual_testing_on = TRUE;
1850 
1851 	trigger_request = (level >> 16) & 0xFFFF;
1852 	pressure_level = (level & 0xFFFF);
1853 
1854 	if (trigger_request < TEST_LOW_MEMORY_TRIGGER_ONE ||
1855 	    trigger_request > TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL) {
1856 		return EINVAL;
1857 	}
1858 	switch (pressure_level) {
1859 	case NOTE_MEMORYSTATUS_PRESSURE_NORMAL:
1860 	case NOTE_MEMORYSTATUS_PRESSURE_WARN:
1861 	case NOTE_MEMORYSTATUS_PRESSURE_CRITICAL:
1862 		break;
1863 	default:
1864 		return EINVAL;
1865 	}
1866 
1867 	/*
1868 	 * The pressure level is being set from user-space.
1869 	 * And user-space uses the constants in sys/event.h
1870 	 * So we translate those events to our internal levels here.
1871 	 */
1872 	if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
1873 		memorystatus_manual_testing_level = kVMPressureNormal;
1874 		force_purge = 0;
1875 	} else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN) {
1876 		memorystatus_manual_testing_level = kVMPressureWarning;
1877 		force_purge = vm_pageout_state.memorystatus_purge_on_warning;
1878 	} else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
1879 		memorystatus_manual_testing_level = kVMPressureCritical;
1880 		force_purge = vm_pageout_state.memorystatus_purge_on_critical;
1881 	}
1882 
1883 	memorystatus_vm_pressure_level = memorystatus_manual_testing_level;
1884 
1885 	/* purge according to the new pressure level */
1886 	switch (trigger_request) {
1887 	case TEST_PURGEABLE_TRIGGER_ONE:
1888 	case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE:
1889 		if (force_purge == 0) {
1890 			/* no purging requested */
1891 			break;
1892 		}
1893 		vm_purgeable_object_purge_one_unlocked(force_purge);
1894 		break;
1895 	case TEST_PURGEABLE_TRIGGER_ALL:
1896 	case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL:
1897 		if (force_purge == 0) {
1898 			/* no purging requested */
1899 			break;
1900 		}
1901 		while (vm_purgeable_object_purge_one_unlocked(force_purge)) {
1902 			;
1903 		}
1904 		break;
1905 	}
1906 
1907 	if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ONE) ||
1908 	    (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE)) {
1909 		memorystatus_update_vm_pressure(TRUE);
1910 	}
1911 
1912 	if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ALL) ||
1913 	    (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL)) {
1914 		while (memorystatus_update_vm_pressure(FALSE) == KERN_SUCCESS) {
1915 			continue;
1916 		}
1917 	}
1918 
1919 	if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
1920 		memorystatus_manual_testing_on = FALSE;
1921 	}
1922 
1923 	return 0;
1924 }
1925 
/* Write-only test knob driving sysctl_memorypressure_manual_trigger above. */
SYSCTL_PROC(_kern, OID_AUTO, memorypressure_manual_trigger, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_memorypressure_manual_trigger, "I", "");


/*
 * Per-level purge forces; the manual trigger passes these to
 * vm_purgeable_object_purge_one_unlocked() when simulating pressure.
 */
SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_warning, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_warning, 0, "");
SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_urgent, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_urgent, 0, "");
SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_critical, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_critical, 0, "");

extern int vm_pressure_level_transition_threshold;
SYSCTL_INT(_kern, OID_AUTO, vm_pressure_level_transition_threshold, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_level_transition_threshold, 0, "");

#if DEBUG || DEVELOPMENT
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_events_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_events_enabled, 0, "");
1939 
#if 0
#if CONFIG_JETSAM && VM_PRESSURE_EVENTS
/*
 * Compiled out (#if 0); retained for reference only.
 * Posts a pressure/no-pressure kevent to every knote on the
 * memorystatus klist under the klist lock.
 */
static boolean_t
memorystatus_issue_pressure_kevent(boolean_t pressured)
{
	memorystatus_klist_lock();
	KNOTE(&memorystatus_klist, pressured ? kMemorystatusPressure : kMemorystatusNoPressure);
	memorystatus_klist_unlock();
	return TRUE;
}
#endif /* CONFIG_JETSAM && VM_PRESSURE_EVENTS */
#endif /* 0 */
1952 
1953 /*
1954  * This routine is used for targeted notifications regardless of system memory pressure
1955  * and regardless of whether or not the process has already been notified.
1956  * It bypasses and has no effect on the only-one-notification per soft-limit policy.
1957  *
1958  * "memnote" is the current user.
1959  */
1960 
/*
 * sysctl handler: forcibly deliver a memorystatus notification to one pid,
 * bypassing system pressure state. The 64-bit written value packs the
 * target pid in the low 32 bits and the EVFILT_MEMORYSTATUS fflags in the
 * high 32 bits.
 */
static int
sysctl_memorystatus_vm_pressure_send SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	/* Need to be root or have memorystatus entitlement */
	if (!kauth_cred_issuser(kauth_cred_get()) && !IOCurrentTaskHasEntitlement(MEMORYSTATUS_ENTITLEMENT)) {
		return EPERM;
	}

	int error = 0, pid = 0;
	struct knote *kn = NULL;
	boolean_t found_knote = FALSE;
	int fflags = 0;    /* filter flags for EVFILT_MEMORYSTATUS */
	uint64_t value = 0;

	error = sysctl_handle_quad(oidp, &value, 0, req);
	if (error || !req->newptr) {
		/* Read-only access (no newptr) just returns without side effects. */
		return error;
	}

	/*
	 * Find the pid in the low 32 bits of value passed in.
	 */
	pid = (int)(value & 0xFFFFFFFF);

	/*
	 * Find notification in the high 32 bits of the value passed in.
	 */
	fflags = (int)((value >> 32) & 0xFFFFFFFF);

	/*
	 * For backwards compatibility, when no notification is
	 * passed in, default to the NOTE_MEMORYSTATUS_PRESSURE_WARN
	 */
	if (fflags == 0) {
		fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
		// printf("memorystatus_vm_pressure_send: using default notification [0x%x]\n", fflags);
	}

	/* wake up everybody waiting for kVMPressureForegroundJetsam */
	if (fflags == NOTE_MEMORYSTATUS_JETSAM_FG_BAND) {
		memorystatus_broadcast_jetsam_pressure(kVMPressureForegroundJetsam);
		return error;
	}

	/*
	 * See event.h ... fflags for EVFILT_MEMORYSTATUS
	 *
	 * Accept exactly one supported notification constant, or any
	 * combination composed solely of NOTE_MEMORYSTATUS_MSL_STATUS bits.
	 */
	if (!((fflags == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) ||
	    (fflags == NOTE_MEMORYSTATUS_PRESSURE_WARN) ||
	    (fflags == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) ||
	    (fflags == NOTE_MEMORYSTATUS_LOW_SWAP) ||
	    (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) ||
	    (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) ||
	    (((fflags & NOTE_MEMORYSTATUS_MSL_STATUS) != 0 &&
	    ((fflags & ~NOTE_MEMORYSTATUS_MSL_STATUS) == 0))))) {
		memorystatus_log_error("memorystatus_vm_pressure_send: notification [0x%x] not supported\n", fflags);
		/* NOTE(review): error = 1 reaches user space as errno 1 (EPERM) — EINVAL may have been intended; verify */
		error = 1;
		return error;
	}

	/*
	 * Forcibly send pid a memorystatus notification.
	 */

	memorystatus_klist_lock();

	/* Mark every knote registered by the target pid with the requested fflags. */
	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
		proc_t knote_proc = knote_get_kq(kn)->kq_p;
		pid_t knote_pid = proc_getpid(knote_proc);

		if (knote_pid == pid) {
			/*
			 * Forcibly send this pid a memorystatus notification.
			 */
			kn->kn_fflags = fflags;
			found_knote = TRUE;
		}
	}

	if (found_knote) {
		/* Fire the klist; the marked knotes carry the forced fflags. */
		KNOTE(&memorystatus_klist, 0);
		memorystatus_log_debug("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] sent to process [%d]\n", value, fflags, pid);
		error = 0;
	} else {
		memorystatus_log_error("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] not sent to process [%d] (none registered?)\n", value, fflags, pid);
		error = 1;
	}

	memorystatus_klist_unlock();

	return error;
}
2054 
/* Quad write: pid in low 32 bits, EVFILT_MEMORYSTATUS fflags in high 32 bits. */
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_send, CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_ANYBODY,
    0, 0, &sysctl_memorystatus_vm_pressure_send, "Q", "");
2057 
2058 #endif /* DEBUG || DEVELOPMENT */
2059 
2060 #endif /* VM_PRESSURE_EVENTS */
2061