xref: /xnu-11215.61.5/osfmk/arm/preemption_disable.c (revision 4f1223e81cd707a65cc109d0b8ad6653699da3c4)
/*
 * Copyright (c) 2007-2023 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Routines for preemption disablement,
 * which prevents the current thread from giving up its current CPU.
 */

#include <arm/cpu_data.h>
#include <arm/cpu_data_internal.h>
#include <arm/preemption_disable_internal.h>
#include <kern/cpu_data.h>
#include <kern/percpu.h>
#include <kern/thread.h>
#include <mach/machine/sdt.h>
#include <os/base.h>
#include <stdint.h>
#include <sys/kdebug.h>

#if SCHED_HYGIENE_DEBUG
static void
_do_disable_preemption_without_measurements(void);
#endif

/*
 * This function checks whether an AST_URGENT has been pended.
 *
 * It is called once preemption has been reenabled, which means the thread
 * may have been preempted right before this was called, and by the time this
 * function actually performs the check, we may have changed CPUs.
 *
 * This race is however benign: the point of AST_URGENT is to trigger a context
 * switch, so if one happened, there's nothing left to check for, and AST_URGENT
 * was cleared in the process.
 *
 * It follows that this check cannot have false negatives, which allows us
 * to avoid fiddling with interrupt state in the vast majority of cases,
 * when the check will actually be negative.
 */
static OS_NOINLINE
void
kernel_preempt_check(void)
{
	uint64_t state;

	/* If interrupts are masked, we can't take an AST here */
	state = __builtin_arm_rsr64("DAIF");
	if (state & DAIF_IRQF) {
		return;
	}

	/* disable interrupts (IRQ FIQ ASYNCF) */
	__builtin_arm_wsr64("DAIFSet", DAIFSC_STANDARD_DISABLE);

	/*
	 * Reload cpu_pending_ast: a context switch would cause it to change.
	 * Now that interrupts are disabled, this will debounce false positives.
	 */
	if (current_thread()->machine.CpuDatap->cpu_pending_ast & AST_URGENT) {
		ast_taken_kernel();
	}

	/* restore the original interrupt mask */
	__builtin_arm_wsr64("DAIF", state);
}

static inline void
_enable_preemption_write_count(thread_t thread, unsigned int count)
{
	os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel);

	/*
	 * This check is racy and could load from another CPU's pending_ast mask,
	 * but as described above, this can't have false negatives.
	 */
	if (count == 0) {
		if (__improbable(thread->machine.CpuDatap->cpu_pending_ast & AST_URGENT)) {
			return kernel_preempt_check();
		}
	}
}
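
/*
 * A note on the ordering above: preemption_count is only ever written by its
 * owning thread, so the os_atomic_store() uses compiler_acq_rel, which only
 * constrains compiler reordering around the count update rather than emitting
 * a hardware barrier.
 */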

/*
 * This function is written so that its codegen is extremely short.
 *
 * LTO isn't smart enough to inline it on its own, yet inlining is profitable
 * because the vast majority of callers use current_thread() already.
 *
 * TODO: It is unfortunate that we have to load
 *       sched_preemption_disable_debug_mode
 *
 * /!\ Breaking inlining causes zalloc to be roughly 10% slower /!\
 */
OS_ALWAYS_INLINE
void
_disable_preemption(void)
{
	thread_t thread = current_thread();
	unsigned int count = thread->machine.preemption_count;

	os_atomic_store(&thread->machine.preemption_count,
	    count + 1, compiler_acq_rel);

#if SCHED_HYGIENE_DEBUG
	/*
	 * Note that this is not the only place preemption gets disabled:
	 * the count is also modified on ISR and PPL entry/exit. Both of those
	 * events are treated specially, however, and because their
	 * increments/decrements are paired around entry/exit, they do not
	 * desync the collection done here.
	 */

	if (__improbable(count == 0 && sched_preemption_disable_debug_mode)) {
		__attribute__((musttail))
		return _prepare_preemption_disable_measurement();
	}
#endif /* SCHED_HYGIENE_DEBUG */
}
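
/*
 * The musttail return in _disable_preemption() above forces the compiler to
 * emit a tail call, so taking the rare debug path doesn't add frame setup to
 * the otherwise very short fast path.
 */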

/*
 * This variant of disable_preemption() allows disabling preemption
 * without taking measurements (and later potentially triggering
 * actions on those).
 */
OS_ALWAYS_INLINE
void
_disable_preemption_without_measurements(void)
{
	thread_t thread = current_thread();
	unsigned int count = thread->machine.preemption_count;

#if SCHED_HYGIENE_DEBUG
	_do_disable_preemption_without_measurements();
#endif /* SCHED_HYGIENE_DEBUG */

	os_atomic_store(&thread->machine.preemption_count,
	    count + 1, compiler_acq_rel);
}

/*
 * To help _enable_preemption() inline everywhere with LTO,
 * we keep this nice non-inlinable function separate, as the panic()
 * codegen setup is quite large and for weird reasons causes a frame.
 */
__abortlike
static void
_enable_preemption_underflow(void)
{
	panic("Preemption count underflow");
}

/*
 * This function is written so that its codegen is extremely short.
 *
 * LTO isn't smart enough to inline it on its own, yet inlining is profitable
 * because the vast majority of callers use current_thread() already.
 *
 * The SCHED_HYGIENE_MARKER trick is used so that we do not have to load
 * unrelated fields of current_thread().
 *
 * /!\ Breaking inlining causes zalloc to be roughly 10% slower /!\
 */
OS_ALWAYS_INLINE
void
_enable_preemption(void)
{
	thread_t thread = current_thread();
	unsigned int count = thread->machine.preemption_count;

	if (__improbable(count == 0)) {
		_enable_preemption_underflow();
	}

#if SCHED_HYGIENE_DEBUG
	if (__improbable(count == SCHED_HYGIENE_MARKER + 1)) {
		return _collect_preemption_disable_measurement();
	}
#endif /* SCHED_HYGIENE_DEBUG */

	_enable_preemption_write_count(thread, count - 1);
}
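
/*
 * Illustrative usage of the pair above (a sketch, not an actual call site):
 *
 *	_disable_preemption();
 *	// ... touch per-CPU state that must not migrate ...
 *	_enable_preemption();
 *
 * The count nests, so only the outermost _enable_preemption() drops the count
 * back to zero and checks for a pending AST_URGENT via kernel_preempt_check().
 */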

OS_ALWAYS_INLINE
unsigned int
get_preemption_level_for_thread(thread_t thread)
{
	unsigned int count = thread->machine.preemption_count;

#if SCHED_HYGIENE_DEBUG
	/*
	 * Hide this "flag" from callers; it would make the count look
	 * negative anyway, which some people dislike.
	 */
	count &= ~SCHED_HYGIENE_MARKER;
#endif
	return (int)count;
}
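
/*
 * SCHED_HYGIENE_MARKER is a flag bit OR-ed into preemption_count by
 * _prepare_preemption_disable_measurement() below; masking it out here means
 * callers always see the plain nesting depth.
 */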

OS_ALWAYS_INLINE
int
get_preemption_level(void)
{
	return get_preemption_level_for_thread(current_thread());
}

#if SCHED_HYGIENE_DEBUG

uint64_t _Atomic PERCPU_DATA_HACK_78750602(preemption_disable_max_mt);

#if XNU_PLATFORM_iPhoneOS
#define DEFAULT_PREEMPTION_TIMEOUT 120000 /* 5ms */
#define DEFAULT_PREEMPTION_MODE SCHED_HYGIENE_MODE_PANIC
#elif XNU_PLATFORM_XROS
#define DEFAULT_PREEMPTION_TIMEOUT 24000  /* 1ms */
#define DEFAULT_PREEMPTION_MODE SCHED_HYGIENE_MODE_PANIC
#else
#define DEFAULT_PREEMPTION_TIMEOUT 0      /* Disabled */
#define DEFAULT_PREEMPTION_MODE SCHED_HYGIENE_MODE_OFF
#endif /* XNU_PLATFORM_iPhoneOS */
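
/*
 * The timeouts above are in timebase ticks (MACHINE_TIMEOUT_UNIT_TIMEBASE
 * below). Assuming the usual 24 MHz ARM timebase, 24000 ticks is 1 ms and
 * 120000 ticks is 5 ms, matching the per-platform comments.
 */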

MACHINE_TIMEOUT_DEV_WRITEABLE(sched_preemption_disable_threshold_mt, "sched-preemption",
    DEFAULT_PREEMPTION_TIMEOUT, MACHINE_TIMEOUT_UNIT_TIMEBASE, kprintf_spam_mt_pred);
TUNABLE_DT_WRITEABLE(sched_hygiene_mode_t, sched_preemption_disable_debug_mode,
    "machine-timeouts",
    "sched-preemption-disable-mode", /* DT property names have to be 31 chars max */
    "sched_preemption_disable_debug_mode",
    DEFAULT_PREEMPTION_MODE,
    TUNABLE_DT_CHECK_CHOSEN);

struct _preemption_disable_pcpu PERCPU_DATA(_preemption_disable_pcpu_data);

/**
 * Start a measurement window for the current CPU's preemption disable timeout.
 *
 * Interrupts must be disabled when calling this function,
 * but the assertion has been elided as this is on the fast path.
 */
static void
_preemption_disable_snap_start(void)
{
	struct _preemption_disable_pcpu *pcpu = PERCPU_GET(_preemption_disable_pcpu_data);
	pcpu->pdp_abandon = false;
	pcpu->pdp_start.pds_mach_time = ml_get_sched_hygiene_timebase();
	pcpu->pdp_start.pds_int_mach_time = recount_current_processor_interrupt_duration_mach();
#if CONFIG_CPU_COUNTERS
	if (__probable(sched_hygiene_debug_pmc)) {
		mt_cur_cpu_cycles_instrs_speculative(&pcpu->pdp_start.pds_cycles,
		    &pcpu->pdp_start.pds_instrs);
	}
#endif /* CONFIG_CPU_COUNTERS */
}

/**
 * End a measurement window for the current CPU's preemption disable timeout,
 * using the snapshot started by _preemption_disable_snap_start().
 *
 * @param start An out-parameter for the starting snapshot,
 * captured while interrupts are disabled.
 *
 * @param now An out-parameter for the current times,
 * captured at the same time as the start and with interrupts disabled.
 * This is meant for computing a delta.
 * Even with @link sched_hygiene_debug_pmc, the PMCs will not be read.
 * This allows their (relatively expensive) reads to happen only if the
 * time threshold has been violated.
 *
 * @return Whether to abandon the current measurement due to a call to
 * abandon_preemption_disable_measurement().
 */
static bool
_preemption_disable_snap_end(
	struct _preemption_disable_snap *start,
	struct _preemption_disable_snap *now)
{
	struct _preemption_disable_pcpu *pcpu = PERCPU_GET(_preemption_disable_pcpu_data);

	const bool int_masked_debug = false;
	const bool istate = ml_set_interrupts_enabled_with_debug(false, int_masked_debug);
	/*
	 * Collect start time and current time with interrupts disabled.
	 * Otherwise an interrupt coming in after grabbing the timestamp
	 * could spuriously inflate the measurement, because it will
	 * adjust preemption_disable_mt only after we already grabbed
	 * it.
	 *
	 * (Even worse if we collected the current time first: Then a
	 * subsequent interrupt could adjust preemption_disable_mt to
	 * make the duration go negative after subtracting the already
	 * grabbed time. With interrupts disabled we don't care much about
	 * the order.)
	 */

	*start = pcpu->pdp_start;
	uint64_t now_time = ml_get_sched_hygiene_timebase();
	now->pds_mach_time = now_time;
	now->pds_int_mach_time = recount_current_processor_interrupt_duration_mach();
	const bool abandon = pcpu->pdp_abandon;
	const uint64_t max_duration = os_atomic_load(&pcpu->pdp_max_mach_duration, relaxed);

	pcpu->pdp_start.pds_mach_time = 0;

	/*
	 * Don't need to reset (or even save) pdp_abandon here:
	 * abandon_preemption_disable_measurement is a no-op anyway
	 * if pdp_start.pds_mach_time == 0 (which we just set), and it
	 * will stay that way until the next call to
	 * _collect_preemption_disable_measurement.
	 */
	ml_set_interrupts_enabled_with_debug(istate, int_masked_debug);
	if (__probable(!abandon)) {
		const int64_t gross_duration = now_time - start->pds_mach_time;
		if (__improbable(gross_duration > max_duration)) {
			os_atomic_store(&pcpu->pdp_max_mach_duration, gross_duration, relaxed);
		}
	}
	return abandon;
}

OS_NOINLINE
void
_prepare_preemption_disable_measurement(void)
{
	thread_t thread = current_thread();

	if (thread->machine.inthandler_timestamp == 0) {
		/*
		 * Only prepare a measurement if not currently in an interrupt
		 * handler.
		 *
		 * We are only interested in the net duration of disabled
		 * preemption, that is: the time in which preemption was
		 * disabled, minus the intervals in which any (likely
		 * unrelated) interrupts were handled.
		 * recount_current_thread_interrupt_time_mach() will remove
		 * those intervals; however, we also do not even start
		 * measuring preemption disablement if we were already
		 * handling an interrupt when preemption was disabled (the
		 * resulting net time would be 0).
		 *
		 * Interrupt handling duration is handled separately, and any
		 * long intervals of preemption disablement are counted
		 * towards that.
		 */

		bool const int_masked_debug = false;
		bool istate = ml_set_interrupts_enabled_with_debug(false, int_masked_debug);
		thread->machine.preemption_count |= SCHED_HYGIENE_MARKER;
		_preemption_disable_snap_start();
		ml_set_interrupts_enabled_with_debug(istate, int_masked_debug);
	}
}

OS_NOINLINE
void
_collect_preemption_disable_measurement(void)
{
	struct _preemption_disable_snap start = { 0 };
	struct _preemption_disable_snap now = { 0 };
	const bool abandon = _preemption_disable_snap_end(&start, &now);

	if (__improbable(abandon)) {
		goto out;
	}

	int64_t const gross_duration = now.pds_mach_time - start.pds_mach_time;
	uint64_t const threshold = os_atomic_load(&sched_preemption_disable_threshold_mt, relaxed);
	if (__improbable(threshold > 0 && gross_duration >= threshold)) {
		/*
		 * Double check that the time spent not handling interrupts is over the threshold.
		 */
		int64_t const interrupt_duration = now.pds_int_mach_time - start.pds_int_mach_time;
		int64_t const net_duration = gross_duration - interrupt_duration;
		assert3u(net_duration, >=, 0);
		if (net_duration < threshold) {
			goto out;
		}

		uint64_t average_freq = 0;
		uint64_t average_cpi_whole = 0;
		uint64_t average_cpi_fractional = 0;

#if CONFIG_CPU_COUNTERS
		if (__probable(sched_hygiene_debug_pmc)) {
			/*
			 * We're getting these values a bit late, but getting them
			 * is a bit expensive, so we take the slight hit in
			 * accuracy for the reported values (which aren't very
			 * stable anyway).
			 */
			const bool int_masked_debug = false;
			const bool istate = ml_set_interrupts_enabled_with_debug(false, int_masked_debug);
			mt_cur_cpu_cycles_instrs_speculative(&now.pds_cycles, &now.pds_instrs);
			ml_set_interrupts_enabled_with_debug(istate, int_masked_debug);
			const uint64_t cycles_elapsed = now.pds_cycles - start.pds_cycles;
			const uint64_t instrs_retired = now.pds_instrs - start.pds_instrs;

			uint64_t duration_ns;
			absolutetime_to_nanoseconds(gross_duration, &duration_ns);

			average_freq = cycles_elapsed / (duration_ns / 1000);
			average_cpi_whole = cycles_elapsed / instrs_retired;
			average_cpi_fractional =
			    ((cycles_elapsed * 100) / instrs_retired) % 100;
		}
#endif /* CONFIG_CPU_COUNTERS */
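
		/*
		 * Illustrative arithmetic for the derived values above (made-up
		 * numbers): with cycles_elapsed = 12,000,000, instrs_retired =
		 * 8,000,000 and a gross duration of 5,000,000 ns, average_freq
		 * is 12,000,000 / 5,000 = 2400 MHz, average_cpi_whole is 1 and
		 * average_cpi_fractional is 50, i.e. a CPI of 1.50.
		 */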

		if (__probable(sched_preemption_disable_debug_mode == SCHED_HYGIENE_MODE_PANIC)) {
			panic("preemption disable timeout exceeded: %llu >= %llu mt ticks (start: %llu, now: %llu, gross: %llu, inttime: %llu), "
			    "freq = %llu MHz, CPI = %llu.%llu",
			    net_duration, threshold, start.pds_mach_time, now.pds_mach_time,
			    gross_duration, interrupt_duration,
			    average_freq, average_cpi_whole, average_cpi_fractional);
		}

		DTRACE_SCHED4(mach_preemption_expired, uint64_t, net_duration, uint64_t, gross_duration,
		    uint64_t, average_cpi_whole, uint64_t, average_cpi_fractional);
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PREEMPTION_EXPIRED), net_duration, gross_duration, average_cpi_whole, average_cpi_fractional);
	}

out:
	/*
	 * The preemption count is SCHED_HYGIENE_MARKER; we need to clear it.
	 */
	_enable_preemption_write_count(current_thread(), 0);
}

/*
 * Abandon a potential preemption disable measurement. Useful, for
 * example, for the idle thread, which would just spuriously
 * trigger the threshold while actually idling, which we don't
 * care about.
 */
void
abandon_preemption_disable_measurement(void)
{
	const bool int_masked_debug = false;
	bool istate = ml_set_interrupts_enabled_with_debug(false, int_masked_debug);
	struct _preemption_disable_pcpu *pcpu = PERCPU_GET(_preemption_disable_pcpu_data);
	if (pcpu->pdp_start.pds_mach_time != 0) {
		pcpu->pdp_abandon = true;
	}
	ml_set_interrupts_enabled_with_debug(istate, int_masked_debug);
}

/* Inner part of disable_preemption_without_measurements() */
OS_ALWAYS_INLINE
static void
_do_disable_preemption_without_measurements(void)
{
	/*
	 * Inform _collect_preemption_disable_measurement()
	 * that we didn't really care.
	 */
	struct _preemption_disable_pcpu *pcpu = PERCPU_GET(_preemption_disable_pcpu_data);
	pcpu->pdp_abandon = true;
}

/**
 * Reset the max preemption-disable durations of all CPUs.
 */
void preemption_disable_reset_max_durations(void);
void
preemption_disable_reset_max_durations(void)
{
	percpu_foreach(pcpu, _preemption_disable_pcpu_data) {
		os_atomic_store(&pcpu->pdp_max_mach_duration, 0, relaxed);
	}
}

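/**
 * Copy each CPU's maximum preemption-disable duration into the caller's
 * `durations` array (up to `count` entries) and return the number of
 * entries written.
 */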
unsigned int preemption_disable_get_max_durations(uint64_t *durations, size_t count);
unsigned int
preemption_disable_get_max_durations(uint64_t *durations, size_t count)
{
	int cpu = 0;
	percpu_foreach(pcpu, _preemption_disable_pcpu_data) {
		if (cpu < count) {
			durations[cpu++] = os_atomic_load(&pcpu->pdp_max_mach_duration, relaxed);
		}
	}
	return cpu;
}

/*
 * Skip predicate for sched_preemption_disable, which would trigger
 * spuriously when kprintf spam is enabled.
 */
bool
kprintf_spam_mt_pred(struct machine_timeout_spec const __unused *spec)
{
	bool const kprintf_spam_enabled = !(disable_kprintf_output || disable_serial_output);
	return kprintf_spam_enabled;
}

/*
 * Abandon function exported for AppleCLPC, as a workaround to rdar://91668370.
 *
 * Only for AppleCLPC!
 */
void
sched_perfcontrol_abandon_preemption_disable_measurement(void)
{
	abandon_preemption_disable_measurement();
}

#else /* SCHED_HYGIENE_DEBUG */

void
sched_perfcontrol_abandon_preemption_disable_measurement(void)
{
	// No-op. The function is exported, so it needs to be defined.
}

#endif /* SCHED_HYGIENE_DEBUG */
539