/*
 * Copyright (c) 2007-2023 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Routines for preemption disablement,
 * which prevents the current thread from giving up its current CPU.
 */

#include <arm/cpu_data.h>
#include <arm/cpu_data_internal.h>
#include <arm/preemption_disable_internal.h>
#include <kern/cpu_data.h>
#include <kern/percpu.h>
#include <kern/thread.h>
#include <mach/machine/sdt.h>
#include <os/base.h>
#include <stdint.h>
#include <sys/kdebug.h>

#if SCHED_HYGIENE_DEBUG
static void
_do_disable_preemption_without_measurements(void);
#endif

/*
 * This function checks whether an AST_URGENT has been pended.
 *
 * It is called once preemption has been reenabled, which means the thread
 * may have been preempted right before this was called, and by the time this
 * function actually performs the check, we may have changed CPUs.
 *
 * This race is however benign: the point of AST_URGENT is to trigger a context
 * switch, so if one happened, there's nothing left to check for, and AST_URGENT
 * was cleared in the process.
 *
 * It follows that this check cannot have false negatives, which allows us
 * to avoid fiddling with interrupt state for the vast majority of cases
 * when the check will actually be negative.
 */
static OS_NOINLINE
void
kernel_preempt_check(void)
{
	uint64_t state;

	/* If interrupts are masked, we can't take an AST here */
	state = __builtin_arm_rsr64("DAIF");
	if (state & DAIF_IRQF) {
		return;
	}

	/* disable interrupts (IRQ FIQ ASYNCF) */
	__builtin_arm_wsr64("DAIFSet", DAIFSC_STANDARD_DISABLE);

	/*
	 * Reload cpu_pending_ast: a context switch would cause it to change.
	 * Now that interrupts are disabled, this will debounce false positives.
	 */
	if (current_thread()->machine.CpuDatap->cpu_pending_ast & AST_URGENT) {
		ast_taken_kernel();
	}

	/* restore the original interrupt mask */
	__builtin_arm_wsr64("DAIF", state);
}

static inline void
_enable_preemption_write_count(thread_t thread, unsigned int count)
{
	os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel);

	/*
	 * This check is racy and could load from another CPU's pending_ast mask,
	 * but as described above, this can't have false negatives.
	 */
	if (count == 0) {
		if (__improbable(thread->machine.CpuDatap->cpu_pending_ast & AST_URGENT)) {
			return kernel_preempt_check();
		}
	}
}

/*
 * This function is written so that its generated code is extremely short.
 *
 * LTO isn't smart enough to inline it, yet inlining it is profitable because
 * the vast majority of callers use current_thread() already.
 *
 * /!\ Breaking inlining causes zalloc to be roughly 10% slower /!\
 */
OS_ALWAYS_INLINE __mockable
void
_disable_preemption(void)
{
	thread_t thread = current_thread();
	unsigned int count = thread->machine.preemption_count;

	os_atomic_store(&thread->machine.preemption_count,
	    count + 1, compiler_acq_rel);

#if SCHED_HYGIENE_DEBUG
	/*
	 * Note that this is not the only place preemption gets disabled:
	 * it also gets modified on ISR and PPL entry/exit. Both of those
	 * events are treated specially, however, and because the
	 * increments/decrements are paired around their entry and exit,
	 * the collection here does not otherwise become desynchronized.
	 */
	if (improbable_static_if(sched_debug_preemption_disable)) {
		if (__improbable(count == 0 &&
		    sched_preemption_disable_debug_mode)) {
			__attribute__((musttail))
			return _prepare_preemption_disable_measurement();
		}
	}
#endif /* SCHED_HYGIENE_DEBUG */
}
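
/*
 * Illustrative sketch (not part of this file's build): how a caller would
 * typically bracket a per-CPU critical section using the usual
 * disable_preemption()/enable_preemption() entry points, which route to the
 * primitives above. The helper name and body are purely hypothetical.
 */
#if 0 /* example only */
static void
example_touch_percpu_state(void)
{
	disable_preemption();
	/* Safe to use per-CPU state here: the thread cannot migrate. */
	/* ... */
	enable_preemption();
}
#endif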

/*
 * This variant of disable_preemption() allows disabling preemption
 * without taking measurements (and later potentially triggering
 * actions on those).
 */
OS_ALWAYS_INLINE __mockable
void
_disable_preemption_without_measurements(void)
{
	thread_t thread = current_thread();
	unsigned int count = thread->machine.preemption_count;

#if SCHED_HYGIENE_DEBUG
	_do_disable_preemption_without_measurements();
#endif /* SCHED_HYGIENE_DEBUG */

	os_atomic_store(&thread->machine.preemption_count,
	    count + 1, compiler_acq_rel);
}

/*
 * To help _enable_preemption() inline everywhere with LTO,
 * we keep this non-inlinable function out of line, as the panic()
 * codegen setup is quite large and for weird reasons causes a frame.
 */
__abortlike
static void
_enable_preemption_underflow(void)
{
	panic("Preemption count underflow");
}

/*
 * This function is written so that its generated code is extremely short.
 *
 * LTO isn't smart enough to inline it, yet inlining it is profitable because
 * the vast majority of callers use current_thread() already.
 *
 * The SCHED_HYGIENE_MARKER trick is used so that we do not have to load
 * unrelated fields of current_thread().
 *
 * /!\ Breaking inlining causes zalloc to be roughly 10% slower /!\
 */
OS_ALWAYS_INLINE __mockable
void
_enable_preemption(void)
{
	thread_t thread = current_thread();
	unsigned int count = thread->machine.preemption_count;

	if (__improbable(count == 0)) {
		_enable_preemption_underflow();
	}

#if SCHED_HYGIENE_DEBUG
	if (improbable_static_if(sched_debug_preemption_disable)) {
		if (__improbable(count == SCHED_HYGIENE_MARKER + 1)) {
			return _collect_preemption_disable_measurement();
		}
	}
#endif /* SCHED_HYGIENE_DEBUG */

	_enable_preemption_write_count(thread, count - 1);
}
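
/*
 * Worked trace (illustrative assumption, not taken from the original source)
 * of how the SCHED_HYGIENE_MARKER bit folds "a measurement is in flight" into
 * the preemption count itself, so _enable_preemption() only has to look at
 * the count:
 *
 *   count == 0                            preemption enabled
 *   count == 1                            outermost _disable_preemption()
 *   count == SCHED_HYGIENE_MARKER | 1     _prepare_preemption_disable_measurement()
 *                                         ORs in the marker
 *   count == SCHED_HYGIENE_MARKER + 1     seen by the matching outermost
 *                                         _enable_preemption(), which collects
 *                                         the measurement and stores 0
 *
 * Nested disables in between only change the low bits and therefore never
 * match the SCHED_HYGIENE_MARKER + 1 test.
 */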

OS_ALWAYS_INLINE
unsigned int
get_preemption_level_for_thread(thread_t thread)
{
	unsigned int count = thread->machine.preemption_count;

#if SCHED_HYGIENE_DEBUG
	/*
	 * Hide this "flag" from callers; it would make the count look
	 * negative anyway, which some people dislike.
	 */
	count &= ~SCHED_HYGIENE_MARKER;
#endif
	return (int)count;
}

OS_ALWAYS_INLINE
int
get_preemption_level(void)
{
	return get_preemption_level_for_thread(current_thread());
}
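
/*
 * Illustrative sketch (not part of this file's build): the reported level
 * tracks the nesting depth of the disable/enable wrappers, with the
 * SCHED_HYGIENE_MARKER bit masked out. The helper is purely hypothetical.
 */
#if 0 /* example only */
static void
example_nesting_level(void)
{
	assert(get_preemption_level() == 0);
	disable_preemption();
	disable_preemption();
	assert(get_preemption_level() == 2);
	enable_preemption();
	enable_preemption();
	assert(get_preemption_level() == 0);
}
#endif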

#if SCHED_HYGIENE_DEBUG

uint64_t _Atomic PERCPU_DATA_HACK_78750602(preemption_disable_max_mt);

#if XNU_PLATFORM_iPhoneOS
#define DEFAULT_PREEMPTION_TIMEOUT 120000 /* 5ms */
#define DEFAULT_PREEMPTION_MODE SCHED_HYGIENE_MODE_PANIC
#elif XNU_PLATFORM_XROS
#define DEFAULT_PREEMPTION_TIMEOUT 24000 /* 1ms */
#define DEFAULT_PREEMPTION_MODE SCHED_HYGIENE_MODE_PANIC
#else
#define DEFAULT_PREEMPTION_TIMEOUT 0 /* Disabled */
#define DEFAULT_PREEMPTION_MODE SCHED_HYGIENE_MODE_OFF
#endif /* XNU_PLATFORM_iPhoneOS */
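
/*
 * For reference, the timeout values above are in Mach timebase ticks; with
 * the 24 MHz timebase the inline comments assume, the conversion is:
 *
 *   120000 ticks / 24,000,000 ticks-per-second = 5 ms
 *    24000 ticks / 24,000,000 ticks-per-second = 1 ms
 */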

MACHINE_TIMEOUT_DEV_WRITEABLE(sched_preemption_disable_threshold_mt, "sched-preemption",
    DEFAULT_PREEMPTION_TIMEOUT, MACHINE_TIMEOUT_UNIT_TIMEBASE, kprintf_spam_mt_pred);
TUNABLE_DT_WRITEABLE(sched_hygiene_mode_t, sched_preemption_disable_debug_mode,
    "machine-timeouts",
    "sched-preemption-disable-mode", /* DT property names have to be 31 chars max */
    "sched_preemption_disable_debug_mode",
    DEFAULT_PREEMPTION_MODE,
    TUNABLE_DT_CHECK_CHOSEN);

struct _preemption_disable_pcpu PERCPU_DATA(_preemption_disable_pcpu_data);

/**
 * Start a measurement window for the current CPU's preemption disable timeout.
 *
 * Interrupts must be disabled when calling this function,
 * but the assertion has been elided as this is on the fast path.
 */
OS_ALWAYS_INLINE
static void
_preemption_disable_snap_start(void)
{
	struct _preemption_disable_pcpu *pcpu = PERCPU_GET(_preemption_disable_pcpu_data);
	const timeout_flags_t flags = ML_TIMEOUT_TIMEBASE_FLAGS | ML_TIMEOUT_PMC_FLAGS | TF_SAMPLE_INTERRUPT_TIME | TF_BACKTRACE;

	kern_timeout_start(&pcpu->pdp_timeout, flags);
}

/**
 * End a measurement window for the current CPU's preemption disable timeout,
 * using the snapshot started by _preemption_disable_snap_start().
 *
 * @param top An out-parameter for the current times,
 * captured at the same time as the start and with interrupts disabled.
 *
 * This is meant for computing a delta.
 * Even with @link sched_hygiene_debug_pmc, the PMCs will not be read.
 * This allows their (relatively expensive) reads to happen only if the time
 * threshold has been violated.
 *
 * @return Whether to abandon the current measurement due to a call to
 * abandon_preemption_disable_measurement().
 */
OS_ALWAYS_INLINE
static bool
_preemption_disable_snap_end(kern_timeout_t *top)
{
	struct _preemption_disable_pcpu *pcpu = PERCPU_GET(_preemption_disable_pcpu_data);
	const timeout_flags_t flags = ML_TIMEOUT_TIMEBASE_FLAGS | TF_SAMPLE_INTERRUPT_TIME;
	const bool int_masked_debug = false;
	const bool istate = ml_set_interrupts_enabled_with_debug(false, int_masked_debug);
	/*
	 * Collect start time and current time with interrupts disabled.
	 * Otherwise an interrupt coming in after grabbing the timestamp
	 * could spuriously inflate the measurement, because it will
	 * adjust preemption_disable_mt only after we already grabbed
	 * it.
	 *
	 * (Even worse if we collected the current time first: Then a
	 * subsequent interrupt could adjust preemption_disable_mt to
	 * make the duration go negative after subtracting the already
	 * grabbed time. With interrupts disabled we don't care much about
	 * the order.)
	 */
	kern_timeout_end(&pcpu->pdp_timeout, flags);

	const uint64_t max_duration = os_atomic_load(&pcpu->pdp_max_mach_duration, relaxed);
	const uint64_t gross_duration = kern_timeout_gross_duration(&pcpu->pdp_timeout);
	if (__improbable(gross_duration > max_duration)) {
		os_atomic_store(&pcpu->pdp_max_mach_duration, gross_duration, relaxed);
	}

	*top = pcpu->pdp_timeout;
	ml_set_interrupts_enabled_with_debug(istate, int_masked_debug);

	return gross_duration == 0;
}

OS_NOINLINE
void
_prepare_preemption_disable_measurement(void)
{
	thread_t thread = current_thread();

	if (thread->machine.int_handler_addr == 0) {
		/*
		 * Only prepare a measurement if not currently in an interrupt
		 * handler.
		 *
		 * We are only interested in the net duration of disabled
		 * preemption, that is: the time in which preemption was
		 * disabled, minus the intervals in which any (likely
		 * unrelated) interrupts were handled.
		 * recount_current_thread_interrupt_time_mach() will remove
		 * those intervals; however, we also do not even start
		 * measuring preemption disablement if we are already within
		 * handling of an interrupt when preemption was disabled (the
		 * resulting net time would be 0).
		 *
		 * Interrupt handling duration is handled separately, and any
		 * long intervals of preemption disablement are counted
		 * towards that.
		 */

		bool const int_masked_debug = false;
		bool istate = ml_set_interrupts_enabled_with_debug(false, int_masked_debug);
		thread->machine.preemption_count |= SCHED_HYGIENE_MARKER;
		_preemption_disable_snap_start();
		ml_set_interrupts_enabled_with_debug(istate, int_masked_debug);
	}
}
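
/*
 * Worked example (illustrative numbers, not from the source): if preemption
 * stayed disabled for 6 ms of wall time, of which 4 ms were spent handling
 * interrupts, then
 *
 *   gross duration = 6 ms
 *   net duration   = 6 ms - 4 ms = 2 ms
 *
 * In _collect_preemption_disable_measurement() below, the gross figure acts
 * as a cheap first filter, and the 2 ms net figure must also exceed the
 * threshold before any panic/tracing action is taken; the interrupt portion
 * is attributed to the separate interrupt-duration accounting mentioned above.
 */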

OS_NOINLINE
void
_collect_preemption_disable_measurement(void)
{
	kern_timeout_t to;
	const bool abandon = _preemption_disable_snap_end(&to);

	if (__improbable(abandon)) {
		goto out;
	}

	const uint64_t gross_duration = kern_timeout_gross_duration(&to);
	const uint64_t threshold = os_atomic_load(&sched_preemption_disable_threshold_mt, relaxed);
	if (__improbable(threshold > 0 && gross_duration >= threshold)) {
		/*
		 * Double-check that the time spent not handling interrupts is
		 * over the threshold.
		 */
		const int64_t net_duration = kern_timeout_net_duration(&to);
		uint64_t average_cpi_whole, average_cpi_fractional;

		assert3u(net_duration, >=, 0);
		if (net_duration < threshold) {
			goto out;
		}

		if (__probable(sched_preemption_disable_debug_mode == SCHED_HYGIENE_MODE_PANIC)) {
			kern_timeout_try_panic(KERN_TIMEOUT_PREEMPTION, 0, &to,
			    "preemption disable timeout exceeded:", threshold);
		}

		kern_timeout_cpi(&to, &average_cpi_whole, &average_cpi_fractional);

		DTRACE_SCHED4(mach_preemption_expired, uint64_t, net_duration, uint64_t, gross_duration,
		    uint64_t, average_cpi_whole, uint64_t, average_cpi_fractional);
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PREEMPTION_EXPIRED), net_duration, gross_duration, average_cpi_whole, average_cpi_fractional);
	}

out:
	/*
	 * The preemption count is SCHED_HYGIENE_MARKER here; we need to clear it.
	 */
	_enable_preemption_write_count(current_thread(), 0);
}

/*
 * Abandon a potential preemption disable measurement. Useful, for
 * example, for the idle thread, which would just spuriously
 * trigger the threshold while actually idling, something we don't
 * care about.
 */
void
abandon_preemption_disable_measurement(void)
{
	struct _preemption_disable_pcpu *pcpu = PERCPU_GET(_preemption_disable_pcpu_data);

	kern_timeout_override(&pcpu->pdp_timeout);
}
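
/*
 * Illustrative sketch (hypothetical caller, not part of this file): code that
 * deliberately keeps preemption disabled for a long, uninteresting stretch
 * can opt out of the measurement before re-enabling preemption, so the final
 * enable collects nothing.
 */
#if 0 /* example only */
	disable_preemption();
	while (still_idling()) {        /* hypothetical predicate */
		/* ... long, uninteresting spinning we don't want reported ... */
	}
	abandon_preemption_disable_measurement();
	enable_preemption();
#endif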

/* Inner part of disable_preemption_without_measurements() */
OS_ALWAYS_INLINE
static void
_do_disable_preemption_without_measurements(void)
{
	/*
	 * Inform _collect_preemption_disable_measurement()
	 * that we didn't really care.
	 */
	struct _preemption_disable_pcpu *pcpu = PERCPU_GET(_preemption_disable_pcpu_data);
	kern_timeout_override(&pcpu->pdp_timeout);
}

/**
 * Reset the maximum preemption-disable durations of all CPUs.
 */
void preemption_disable_reset_max_durations(void);
void
preemption_disable_reset_max_durations(void)
{
	percpu_foreach(pcpu, _preemption_disable_pcpu_data) {
		os_atomic_store(&pcpu->pdp_max_mach_duration, 0, relaxed);
	}
}

unsigned int preemption_disable_get_max_durations(uint64_t *durations, size_t count);
unsigned int
preemption_disable_get_max_durations(uint64_t *durations, size_t count)
{
	int cpu = 0;
	percpu_foreach(pcpu, _preemption_disable_pcpu_data) {
		if (cpu < count) {
			durations[cpu++] = os_atomic_load(&pcpu->pdp_max_mach_duration, relaxed);
		}
	}
	return cpu;
}
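
/*
 * Illustrative sketch (not part of this file's build): a diagnostic caller,
 * e.g. a sysctl handler, could snapshot the per-CPU maxima like this. The
 * MAX_CPUS-sized buffer and the helper name are assumptions made for the
 * example only.
 */
#if 0 /* example only */
static void
example_report_max_durations(void)
{
	uint64_t durations[MAX_CPUS];
	unsigned int n = preemption_disable_get_max_durations(durations, MAX_CPUS);

	for (unsigned int i = 0; i < n; i++) {
		kprintf("cpu %u: max preemption-disabled time %llu ticks\n",
		    i, durations[i]);
	}
}
#endif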

/*
 * Skip predicate for sched_preemption_disable, which would trigger
 * spuriously when kprintf spam is enabled.
 */
bool
kprintf_spam_mt_pred(struct machine_timeout_spec const __unused *spec)
{
	bool const kprintf_spam_enabled = !(disable_kprintf_output || disable_serial_output);
	return kprintf_spam_enabled;
}

/*
 * Abandon function exported for AppleCLPC, as a workaround for rdar://91668370.
 *
 * Only for AppleCLPC!
 */
void
sched_perfcontrol_abandon_preemption_disable_measurement(void)
{
	abandon_preemption_disable_measurement();
}

#else /* SCHED_HYGIENE_DEBUG */

void
abandon_preemption_disable_measurement(void)
{
	// No-op. The function is exported, so it needs to be defined.
}

void
sched_perfcontrol_abandon_preemption_disable_measurement(void)
{
	// No-op. The function is exported, so it needs to be defined.
}

#endif /* SCHED_HYGIENE_DEBUG */