1 /*
2 * Copyright (c) 2000-2025 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: kern/machine.c
60 * Author: Avadis Tevanian, Jr.
61 * Date: 1987
62 *
63 * Support for machine independent machine abstraction.
64 */
65
66 #include <string.h>
67
68 #include <mach/mach_types.h>
69 #include <mach/boolean.h>
70 #include <mach/kern_return.h>
71 #include <mach/machine.h>
72 #include <mach/host_info.h>
73 #include <mach/host_reboot.h>
74 #include <mach/host_priv_server.h>
75 #include <mach/processor_server.h>
76 #include <mach/sdt.h>
77
78 #include <kern/kern_types.h>
79 #include <kern/cpu_data.h>
80 #include <kern/ipc_host.h>
81 #include <kern/host.h>
82 #include <kern/machine.h>
83 #include <kern/misc_protos.h>
84 #include <kern/percpu.h>
85 #include <kern/processor.h>
86 #include <kern/queue.h>
87 #include <kern/sched.h>
88 #include <kern/startup.h>
89 #include <kern/task.h>
90 #include <kern/thread.h>
91 #include <kern/iotrace.h>
92
93 #include <libkern/OSDebug.h>
94 #if ML_IO_TIMEOUTS_ENABLED
95 #include <libkern/tree.h>
96 #endif
97
98 #include <pexpert/device_tree.h>
99
100 #include <machine/commpage.h>
101 #include <machine/machine_routines.h>
102
103 #if HIBERNATION
104 #include <IOKit/IOHibernatePrivate.h>
105 #endif
106 #include <IOKit/IOPlatformExpert.h>
107
108 #if CONFIG_DTRACE
109 extern void (*dtrace_cpu_state_changed_hook)(int, boolean_t);
110 #endif
111
112 #if defined(__arm64__)
113 extern void wait_while_mp_kdp_trap(bool check_SIGPdebug);
114 #if CONFIG_SPTM
115 #include <arm64/sptm/pmap/pmap_data.h>
116 #else
117 #include <arm/pmap/pmap_data.h>
118 #endif /* CONFIG_SPTM */
119 #endif /* defined(__arm64__) */
120
121 #if defined(__x86_64__)
122 #include <i386/panic_notify.h>
123 #endif
124
#if ML_IO_TIMEOUTS_ENABLED
/*
 * ml_io_timestamp names the timestamp routine used when timing I/O
 * accesses: ml_get_timebase everywhere except x86_64, which uses
 * mach_absolute_time.
 */
#if !defined(__x86_64__)
#define ml_io_timestamp ml_get_timebase
#else
#define ml_io_timestamp mach_absolute_time
#endif /* !__x86_64__ */
#endif /* ML_IO_TIMEOUTS_ENABLED */
132
133 /*
134 * Exported variables:
135 */
136
137 TUNABLE(long, wdt, "wdt", 0);
138
139 struct machine_info machine_info;
140
141 /* Forwards */
142 static void
143 processor_offline(void * parameter, __unused wait_result_t result);
144
145 static void
146 processor_offline_intstack(processor_t processor) __dead2;
147
148
149 /*
150 * processor_up:
151 *
152 * Flag processor as up and running, and available
153 * for scheduling.
154 */
155 void
processor_up(processor_t processor)156 processor_up(
157 processor_t processor)
158 {
159 spl_t s = splsched();
160 init_ast_check(processor);
161
162 #if defined(__arm64__)
163 /*
164 * A processor coming online won't have received a SIGPdebug signal
165 * to cause it to spin while a stackshot or panic is taking place,
166 * so spin here on mp_kdp_trap.
167 *
168 * However, since cpu_signal() is not yet enabled for this processor,
169 * there is a race if we have just passed this when a cpu_signal()
170 * is attempted. The sender will assume the cpu is offline, so it will
171 * not end up spinning anywhere. See processor_cpu_reinit() for the fix
172 * for this race.
173 */
174 wait_while_mp_kdp_trap(false);
175 #endif
176
177 /* Boot CPU coming online for the first time, either at boot or after sleep */
178 __assert_only bool is_first_online_processor;
179
180 is_first_online_processor = sched_mark_processor_online(processor,
181 processor->last_startup_reason);
182
183 simple_lock(&processor_start_state_lock, LCK_GRP_NULL);
184 assert(processor->processor_instartup == true || is_first_online_processor);
185 simple_unlock(&processor_start_state_lock);
186
187 splx(s);
188
189 #if defined(__x86_64__)
190 ml_cpu_up();
191 #endif /* defined(__x86_64__) */
192
193 #if CONFIG_DTRACE
194 if (dtrace_cpu_state_changed_hook) {
195 (*dtrace_cpu_state_changed_hook)(processor->cpu_id, TRUE);
196 }
197 #endif
198 }
199
200 #include <atm/atm_internal.h>
201
202 kern_return_t
host_reboot(host_priv_t host_priv,int options)203 host_reboot(
204 host_priv_t host_priv,
205 int options)
206 {
207 if (host_priv == HOST_PRIV_NULL) {
208 return KERN_INVALID_HOST;
209 }
210
211 #if DEVELOPMENT || DEBUG
212 if (options & HOST_REBOOT_DEBUGGER) {
213 Debugger("Debugger");
214 return KERN_SUCCESS;
215 }
216 #endif
217
218 if (options & HOST_REBOOT_UPSDELAY) {
219 // UPS power cutoff path
220 PEHaltRestart( kPEUPSDelayHaltCPU );
221 } else {
222 halt_all_cpus(!(options & HOST_REBOOT_HALT));
223 }
224
225 return KERN_SUCCESS;
226 }
227
228 kern_return_t
processor_assign(__unused processor_t processor,__unused processor_set_t new_pset,__unused boolean_t wait)229 processor_assign(
230 __unused processor_t processor,
231 __unused processor_set_t new_pset,
232 __unused boolean_t wait)
233 {
234 return KERN_FAILURE;
235 }
236
237 void
processor_doshutdown(processor_t processor,bool is_final_system_sleep)238 processor_doshutdown(
239 processor_t processor,
240 bool is_final_system_sleep)
241 {
242 lck_mtx_assert(&cluster_powerdown_lock, LCK_MTX_ASSERT_OWNED);
243 lck_mtx_assert(&processor_updown_lock, LCK_MTX_ASSERT_OWNED);
244
245 if (!processor->processor_booted) {
246 panic("processor %d not booted", processor->cpu_id);
247 }
248
249 if (is_final_system_sleep) {
250 assert(processor == current_processor());
251 assert(processor == master_processor);
252 assert(processor_avail_count == 1);
253 }
254
255 processor_set_t pset = processor->processor_set;
256
257 ml_cpu_begin_state_transition(processor->cpu_id);
258
259 ml_broadcast_cpu_event(CPU_EXIT_REQUESTED, processor->cpu_id);
260
261 #if HIBERNATION
262 if (is_final_system_sleep) {
263 /*
264 * Ensure the page queues are in a state where the hibernation
265 * code can manipulate them without requiring other threads
266 * to be scheduled.
267 *
268 * This operation can block,
269 * and unlock must be done from the same thread.
270 */
271 assert(processor_avail_count < 2);
272 hibernate_vm_lock();
273 }
274 #endif
275
276 spl_t s = splsched();
277 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
278 pset_lock(pset);
279
280 assert(processor->state != PROCESSOR_START);
281 assert(processor->state != PROCESSOR_PENDING_OFFLINE);
282 assert(processor->state != PROCESSOR_OFF_LINE);
283
284 assert(!processor->processor_inshutdown);
285 processor->processor_inshutdown = true;
286
287 assert(processor->processor_offline_state == PROCESSOR_OFFLINE_RUNNING);
288 processor_update_offline_state_locked(processor, PROCESSOR_OFFLINE_BEGIN_SHUTDOWN);
289
290 if (!is_final_system_sleep) {
291 sched_assert_not_last_online_cpu(processor->cpu_id);
292 }
293
294 pset_unlock(pset);
295 simple_unlock(&sched_available_cores_lock);
296
297 if (is_final_system_sleep) {
298 assert(processor == current_processor());
299
300 #if HIBERNATION
301 /*
302 * After this point, the system is now
303 * committed to hibernation and must
304 * not run any other thread that could take this lock.
305 */
306 hibernate_vm_unlock();
307 #endif
308 } else {
309 /*
310 * Get onto the processor to shut down.
311 * The scheduler picks this thread naturally according to its
312 * priority.
313 * The processor can run any other thread if this one blocks.
314 * So, don't block.
315 */
316 processor_t prev = thread_bind(processor);
317 thread_block(THREAD_CONTINUE_NULL);
318
319 /* interrupts still disabled */
320 assert(ml_get_interrupts_enabled() == FALSE);
321
322 assert(processor == current_processor());
323 assert(processor->processor_inshutdown);
324
325 thread_bind(prev);
326 /* interrupts still disabled */
327 }
328
329 /*
330 * Continue processor shutdown on the processor's idle thread.
331 * The handoff won't fail because the idle thread has a reserved stack.
332 * Switching to the idle thread leaves interrupts disabled,
333 * so we can't accidentally take an interrupt after the context switch.
334 */
335 thread_t shutdown_thread = processor->idle_thread;
336 shutdown_thread->continuation = processor_offline;
337 shutdown_thread->parameter = (void*)is_final_system_sleep;
338
339 thread_run(current_thread(), THREAD_CONTINUE_NULL, NULL, shutdown_thread);
340
341 /*
342 * After this point, we are in regular scheduled context on a remaining
343 * available CPU. Interrupts are still disabled.
344 */
345
346 if (is_final_system_sleep) {
347 /*
348 * We are coming out of system sleep here, so there won't be a
349 * corresponding processor_startup for this processor, so we
350 * need to put it back in the correct running state.
351 *
352 * There's nowhere to execute a call to CPU_EXITED during system
353 * sleep for the boot processor, and it's already been CPU_BOOTED
354 * by this point anyways, so skip the call.
355 */
356 assert(current_processor() == master_processor);
357 assert(processor->state == PROCESSOR_RUNNING);
358 assert(processor->processor_inshutdown);
359 assert(processor->processor_offline_state == PROCESSOR_OFFLINE_STARTED_NOT_WAITED);
360 processor->processor_inshutdown = false;
361 processor_update_offline_state(processor, PROCESSOR_OFFLINE_RUNNING);
362
363 splx(s);
364 } else {
365 splx(s);
366
367 cpu_exit_wait(processor->cpu_id);
368
369 s = splsched();
370 simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
371 pset_lock(pset);
372 assert(processor->processor_inshutdown);
373 assert(processor->processor_offline_state == PROCESSOR_OFFLINE_PENDING_OFFLINE);
374 assert(processor->state == PROCESSOR_PENDING_OFFLINE);
375 pset_update_processor_state(pset, processor, PROCESSOR_OFF_LINE);
376 processor_update_offline_state_locked(processor, PROCESSOR_OFFLINE_CPU_OFFLINE);
377 pset_unlock(pset);
378 simple_unlock(&sched_available_cores_lock);
379 splx(s);
380
381 ml_broadcast_cpu_event(CPU_EXITED, processor->cpu_id);
382 ml_cpu_power_disable(processor->cpu_id);
383
384 assert(processor->processor_offline_state == PROCESSOR_OFFLINE_CPU_OFFLINE);
385 processor_update_offline_state(processor, PROCESSOR_OFFLINE_FULLY_OFFLINE);
386 }
387
388 ml_cpu_end_state_transition(processor->cpu_id);
389 }
390
391 /*
392 * Called in the context of the idle thread to shut down the processor
393 *
394 * A shut-down processor looks like it's 'running' the idle thread parked
395 * in this routine, but it's actually been powered off and has no hardware state.
396 */
397 static void
processor_offline(void * parameter,__unused wait_result_t result)398 processor_offline(
399 void * parameter,
400 __unused wait_result_t result)
401 {
402 bool is_final_system_sleep = (bool) parameter;
403 processor_t processor = current_processor();
404 thread_t self = current_thread();
405 __assert_only thread_t old_thread = THREAD_NULL;
406
407 assert(self->state & TH_IDLE);
408 assert(processor->idle_thread == self);
409 assert(ml_get_interrupts_enabled() == FALSE);
410 assert(self->continuation == NULL);
411 assert(processor->processor_online == true);
412 assert(processor->running_timers_active == false);
413
414 if (is_final_system_sleep) {
415 assert(processor == current_processor());
416 assert(processor == master_processor);
417 assert(processor_avail_count == 1);
418 }
419
420 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROCESSOR_SHUTDOWN) | DBG_FUNC_START, processor->cpu_id);
421
422 bool enforce_quiesce_safety = gEnforcePlatformActionSafety;
423
424 /*
425 * Scheduling is now disabled for this processor.
426 * Ensure that primitives that need scheduling (like mutexes) know this.
427 */
428 if (enforce_quiesce_safety) {
429 disable_preemption_without_measurements();
430 }
431
432 #if CONFIG_DTRACE
433 if (dtrace_cpu_state_changed_hook) {
434 (*dtrace_cpu_state_changed_hook)(processor->cpu_id, FALSE);
435 }
436 #endif
437
438 smr_cpu_down(processor, SMR_CPU_REASON_OFFLINE);
439
440 /* Drain pending IPIs for the last time here. */
441 ml_cpu_down();
442
443 sched_mark_processor_offline(processor, is_final_system_sleep);
444
445 /*
446 * Switch to the interrupt stack and shut down the processor.
447 *
448 * When the processor comes back, it will eventually call load_context which
449 * restores the context saved by machine_processor_shutdown, returning here.
450 */
451 old_thread = machine_processor_shutdown(self, processor_offline_intstack, processor);
452
453 /*
454 * The processor is back. sched_mark_processor_online and
455 * friends have already run via processor_up.
456 */
457
458 /* old_thread should be NULL because we got here through Load_context */
459 assert(old_thread == THREAD_NULL);
460
461 assert(processor == current_processor());
462 assert(processor->idle_thread == current_thread());
463 assert(processor->processor_online == true);
464
465 assert(ml_get_interrupts_enabled() == FALSE);
466 assert(self->continuation == NULL);
467
468 /* Extract the machine_param value stashed by secondary_cpu_main */
469 void * machine_param = self->parameter;
470 self->parameter = NULL;
471
472 processor_cpu_reinit(machine_param, true, is_final_system_sleep);
473
474 if (enforce_quiesce_safety) {
475 enable_preemption();
476 }
477
478 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROCESSOR_SHUTDOWN) | DBG_FUNC_END, processor->cpu_id);
479
480 /*
481 * Now that the processor is back, invoke the idle thread to find out what to do next.
482 * idle_thread will enable interrupts.
483 */
484 thread_block(idle_thread);
485 /*NOTREACHED*/
486 }
487
488 /*
489 * Complete the shutdown and place the processor offline.
490 *
491 * Called at splsched in the shutdown context
492 * (i.e. on the idle thread, on the interrupt stack)
493 *
494 * The onlining half of this is done in load_context().
495 */
496 static void
processor_offline_intstack(processor_t processor)497 processor_offline_intstack(
498 processor_t processor)
499 {
500 assert(processor == current_processor());
501 assert(processor->active_thread == current_thread());
502
503 struct recount_snap snap = { 0 };
504 recount_snapshot(&snap);
505 recount_processor_idle(&processor->pr_recount, &snap);
506
507 smr_cpu_leave(processor, processor->last_dispatch);
508
509 PMAP_DEACTIVATE_KERNEL(processor->cpu_id);
510
511 cpu_sleep();
512 panic("zombie processor");
513 /*NOTREACHED*/
514 }
515
516 /*
517 * Called on the idle thread with interrupts disabled to initialize a
518 * secondary processor on boot or to reinitialize any processor on resume
519 * from processor offline.
520 */
521 void
processor_cpu_reinit(void * machine_param,__unused bool wait_for_cpu_signal,__assert_only bool is_final_system_sleep)522 processor_cpu_reinit(void* machine_param,
523 __unused bool wait_for_cpu_signal,
524 __assert_only bool is_final_system_sleep)
525 {
526 /* Re-initialize the processor */
527 machine_cpu_reinit(machine_param);
528
529 #if defined(__arm64__)
530 /*
531 * See the comments for wait_while_mp_kdp_trap in processor_up().
532 *
533 * SIGPdisabled is cleared (to enable cpu_signal() to succeed with this processor)
534 * the first time we take an IPI. This is triggered by machine_cpu_reinit(), above,
535 * which calls cpu_machine_init()->PE_cpu_machine_init()->PE_cpu_signal() which sends
536 * a self-IPI to ensure that happens when we enable interrupts. So enable interrupts
537 * here so that cpu_signal() can succeed before we spin on mp_kdp_trap.
538 */
539 assert_ml_cpu_signal_is_enabled(false);
540
541 ml_set_interrupts_enabled(TRUE);
542
543 if (wait_for_cpu_signal) {
544 ml_wait_for_cpu_signal_to_enable();
545 }
546
547 ml_set_interrupts_enabled(FALSE);
548
549 wait_while_mp_kdp_trap(true);
550
551 /*
552 * At this point,
553 * if a stackshot or panic is in progress, we either spin on mp_kdp_trap
554 * or we sucessfully received a SIGPdebug signal which will cause us to
555 * break out of the spin on mp_kdp_trap and instead
556 * spin next time interrupts are enabled in idle_thread().
557 */
558 if (wait_for_cpu_signal) {
559 assert_ml_cpu_signal_is_enabled(true);
560 }
561
562 /*
563 * Now that we know SIGPdisabled is cleared, we can publish that
564 * this CPU has fully come out of offline state.
565 *
566 * Without wait_for_cpu_signal, we'll publish this earlier than
567 * cpu_signal is actually ready, but as long as it's ready by next S2R,
568 * it will be good enough.
569 */
570 ml_cpu_up();
571 #endif
572
573 /*
574 * Interrupts must be disabled while processor_start_state_lock is
575 * held to prevent a deadlock with CPU startup of other CPUs that
576 * may be proceeding in parallel to this CPU's reinitialization.
577 */
578 spl_t s = splsched();
579 processor_t processor = current_processor();
580
581 simple_lock(&processor_start_state_lock, LCK_GRP_NULL);
582 assert(processor->processor_instartup == true || is_final_system_sleep);
583 processor->processor_instartup = false;
584 simple_unlock(&processor_start_state_lock);
585
586 splx(s);
587
588 thread_wakeup((event_t)&processor->processor_instartup);
589 }
590
591 kern_return_t
host_get_boot_info(host_priv_t host_priv,kernel_boot_info_t boot_info)592 host_get_boot_info(
593 host_priv_t host_priv,
594 kernel_boot_info_t boot_info)
595 {
596 const char *src = "";
597 if (host_priv == HOST_PRIV_NULL) {
598 return KERN_INVALID_HOST;
599 }
600
601 /*
602 * Copy first operator string terminated by '\0' followed by
603 * standardized strings generated from boot string.
604 */
605 src = machine_boot_info(boot_info, KERNEL_BOOT_INFO_MAX);
606 if (src != boot_info) {
607 (void) strncpy(boot_info, src, KERNEL_BOOT_INFO_MAX);
608 }
609
610 return KERN_SUCCESS;
611 }
612
/*
 * Physical I/O read/write panic switches.
 * These are configured through sysctls.
 *
 * DEVELOPMENT and DEBUG kernels start with both switches set to 1 and
 * additionally define simulate_stretched_io; other kernels start with
 * both switches set to 0.
 */
#if DEVELOPMENT || DEBUG
uint32_t phy_read_panic         = 1;
uint32_t phy_write_panic        = 1;
uint64_t simulate_stretched_io  = 0;
#else /* !(DEVELOPMENT || DEBUG) */
uint32_t phy_read_panic         = 0;
uint32_t phy_write_panic        = 0;
#endif /* DEVELOPMENT || DEBUG */
622
623 #if !defined(__x86_64__)
624
625 #if DEVELOPMENT || DEBUG
626 static const uint64_t TIMEBASE_TICKS_PER_USEC = 24000000ULL / USEC_PER_SEC;
627 static const uint64_t DEFAULT_TRACE_PHY_TIMEOUT = 100 * TIMEBASE_TICKS_PER_USEC;
628 #else
629 static const uint64_t DEFAULT_TRACE_PHY_TIMEOUT = 0;
630 #endif
631
632 // The MACHINE_TIMEOUT facility only exists on ARM.
633 MACHINE_TIMEOUT_DEV_WRITEABLE(report_phy_read_delay_to, "report-phy-read-delay", 0, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
634 MACHINE_TIMEOUT_DEV_WRITEABLE(report_phy_write_delay_to, "report-phy-write-delay", 0, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
635 MACHINE_TIMEOUT_DEV_WRITEABLE(trace_phy_read_delay_to, "trace-phy-read-delay", DEFAULT_TRACE_PHY_TIMEOUT, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
636 MACHINE_TIMEOUT_DEV_WRITEABLE(trace_phy_write_delay_to, "trace-phy-write-delay", DEFAULT_TRACE_PHY_TIMEOUT, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
637
638 #if SCHED_HYGIENE_DEBUG
639 /*
640 * Note: The interrupt-masked timeout goes through two initializations - one
641 * early in boot and one later. Thus this function is also called twice and
642 * can't be marked '__startup_func'.
643 */
644 static void
ml_io_init_timeouts(void)645 ml_io_init_timeouts(void)
646 {
647 /*
648 * The timeouts may be completely disabled via an override.
649 */
650 if (kern_feature_override(KF_IO_TIMEOUT_OVRD)) {
651 os_atomic_store(&report_phy_write_delay_to, 0, relaxed);
652 os_atomic_store(&report_phy_read_delay_to, 0, relaxed);
653 return;
654 }
655
656 /*
657 * There may be no interrupt masked timeout set.
658 */
659 const uint64_t interrupt_masked_to = os_atomic_load(&interrupt_masked_timeout, relaxed);
660 if (interrupt_masked_timeout == 0) {
661 return;
662 }
663
664 /*
665 * Inherit from the interrupt masked timeout if smaller and the timeout
666 * hasn't been explicitly set via boot-arg.
667 */
668 uint64_t arg = 0;
669
670 if (!PE_parse_boot_argn("ml-timeout-report-phy-read-delay", &arg, sizeof(arg))) {
671 uint64_t report_phy_read_delay = os_atomic_load(&report_phy_read_delay_to, relaxed);
672 report_phy_read_delay = report_phy_read_delay == 0 ?
673 interrupt_masked_to :
674 MIN(report_phy_read_delay, interrupt_masked_to);
675 os_atomic_store(&report_phy_read_delay_to, report_phy_read_delay, relaxed);
676 }
677
678 if (!PE_parse_boot_argn("ml-timeout-report-phy-write-delay", &arg, sizeof(arg))) {
679 uint64_t report_phy_write_delay = os_atomic_load(&report_phy_write_delay_to, relaxed);
680 report_phy_write_delay = report_phy_write_delay == 0 ?
681 interrupt_masked_to :
682 MIN(report_phy_write_delay, interrupt_masked_to);
683 os_atomic_store(&report_phy_write_delay_to, report_phy_write_delay, relaxed);
684 }
685 }
686
687 /*
688 * It's important that this happens after machine timeouts have initialized so
689 * the correct timeouts can be inherited.
690 */
691 STARTUP(TIMEOUTS, STARTUP_RANK_SECOND, ml_io_init_timeouts);
692 #endif /* SCHED_HYGIENE_DEBUG */
693
694 extern pmap_paddr_t kvtophys(vm_offset_t va);
695 #endif /* !defined(__x86_64__) */
696
697 #if ML_IO_TIMEOUTS_ENABLED
698
699 static LCK_GRP_DECLARE(io_timeout_override_lock_grp, "io_timeout_override");
700 static LCK_SPIN_DECLARE(io_timeout_override_lock, &io_timeout_override_lock_grp);
701
702 struct io_timeout_override_entry {
703 RB_ENTRY(io_timeout_override_entry) tree;
704
705 uintptr_t ioaddr_base;
706 unsigned int size;
707 uint32_t read_timeout;
708 uint32_t write_timeout;
709 };
710
711 static inline int
io_timeout_override_cmp(const struct io_timeout_override_entry * a,const struct io_timeout_override_entry * b)712 io_timeout_override_cmp(const struct io_timeout_override_entry *a, const struct io_timeout_override_entry *b)
713 {
714 if (a->ioaddr_base < b->ioaddr_base) {
715 return -1;
716 } else if (a->ioaddr_base > b->ioaddr_base) {
717 return 1;
718 } else {
719 return 0;
720 }
721 }
722
723 static RB_HEAD(io_timeout_override, io_timeout_override_entry)
724 io_timeout_override_root_pa, io_timeout_override_root_va;
725
726 RB_PROTOTYPE_PREV(io_timeout_override, io_timeout_override_entry, tree, io_timeout_override_cmp);
727 RB_GENERATE_PREV(io_timeout_override, io_timeout_override_entry, tree, io_timeout_override_cmp);
728
729 static int
io_increase_timeouts(struct io_timeout_override * root,uintptr_t ioaddr_base,unsigned int size,uint32_t read_timeout_us,uint32_t write_timeout_us)730 io_increase_timeouts(struct io_timeout_override *root, uintptr_t ioaddr_base,
731 unsigned int size, uint32_t read_timeout_us, uint32_t write_timeout_us)
732 {
733 const uint64_t MAX_TIMEOUT_ABS = UINT32_MAX;
734
735 assert(preemption_enabled());
736
737 int ret = KERN_SUCCESS;
738
739 if (size == 0) {
740 return KERN_INVALID_ARGUMENT;
741 }
742
743 uintptr_t ioaddr_end;
744 if (os_add_overflow(ioaddr_base, size - 1, &ioaddr_end)) {
745 return KERN_INVALID_ARGUMENT;
746 }
747
748 uint64_t read_timeout_abs, write_timeout_abs;
749 nanoseconds_to_absolutetime(NSEC_PER_USEC * read_timeout_us, &read_timeout_abs);
750 nanoseconds_to_absolutetime(NSEC_PER_USEC * write_timeout_us, &write_timeout_abs);
751 if (read_timeout_abs > MAX_TIMEOUT_ABS || write_timeout_abs > MAX_TIMEOUT_ABS) {
752 return KERN_INVALID_ARGUMENT;
753 }
754
755 struct io_timeout_override_entry *node = kalloc_type(struct io_timeout_override_entry, Z_WAITOK | Z_ZERO | Z_NOFAIL);
756 node->ioaddr_base = ioaddr_base;
757 node->size = size;
758 node->read_timeout = (uint32_t)read_timeout_abs;
759 node->write_timeout = (uint32_t)write_timeout_abs;
760
761 /*
762 * Interrupt handlers are allowed to call ml_io_{read,write}*, so
763 * interrupts must be disabled any time io_timeout_override_lock is
764 * held. Otherwise the CPU could take an interrupt while holding the
765 * lock, invoke an ISR that calls ml_io_{read,write}*, and deadlock
766 * trying to acquire the lock again.
767 */
768 boolean_t istate = ml_set_interrupts_enabled(FALSE);
769 lck_spin_lock(&io_timeout_override_lock);
770 if (RB_INSERT(io_timeout_override, root, node)) {
771 ret = KERN_INVALID_ARGUMENT;
772 goto out;
773 }
774
775 /* Check that this didn't create any new overlaps */
776 struct io_timeout_override_entry *prev = RB_PREV(io_timeout_override, root, node);
777 if (prev && (prev->ioaddr_base + prev->size) > node->ioaddr_base) {
778 RB_REMOVE(io_timeout_override, root, node);
779 ret = KERN_INVALID_ARGUMENT;
780 goto out;
781 }
782 struct io_timeout_override_entry *next = RB_NEXT(io_timeout_override, root, node);
783 if (next && (node->ioaddr_base + node->size) > next->ioaddr_base) {
784 RB_REMOVE(io_timeout_override, root, node);
785 ret = KERN_INVALID_ARGUMENT;
786 goto out;
787 }
788
789 out:
790 lck_spin_unlock(&io_timeout_override_lock);
791 ml_set_interrupts_enabled(istate);
792 if (ret != KERN_SUCCESS) {
793 kfree_type(struct io_timeout_override_entry, node);
794 }
795 return ret;
796 }
797
798 static int
io_reset_timeouts(struct io_timeout_override * root,uintptr_t ioaddr_base,unsigned int size)799 io_reset_timeouts(struct io_timeout_override *root, uintptr_t ioaddr_base, unsigned int size)
800 {
801 assert(preemption_enabled());
802
803 struct io_timeout_override_entry key = { .ioaddr_base = ioaddr_base };
804
805 boolean_t istate = ml_set_interrupts_enabled(FALSE);
806 lck_spin_lock(&io_timeout_override_lock);
807 struct io_timeout_override_entry *node = RB_FIND(io_timeout_override, root, &key);
808 if (node) {
809 if (node->size == size) {
810 RB_REMOVE(io_timeout_override, root, node);
811 } else {
812 node = NULL;
813 }
814 }
815 lck_spin_unlock(&io_timeout_override_lock);
816 ml_set_interrupts_enabled(istate);
817
818 if (!node) {
819 return KERN_NOT_FOUND;
820 }
821
822 kfree_type(struct io_timeout_override_entry, node);
823 return KERN_SUCCESS;
824 }
825
826 static bool
io_override_timeout(struct io_timeout_override * root,uintptr_t addr,uint64_t * read_timeout,uint64_t * write_timeout)827 io_override_timeout(struct io_timeout_override *root, uintptr_t addr,
828 uint64_t *read_timeout, uint64_t *write_timeout)
829 {
830 assert(!ml_get_interrupts_enabled());
831 assert3p(read_timeout, !=, NULL);
832 assert3p(write_timeout, !=, NULL);
833
834 struct io_timeout_override_entry *node = RB_ROOT(root);
835
836 lck_spin_lock(&io_timeout_override_lock);
837 /* RB_FIND() doesn't support custom cmp functions, so we have to open-code our own */
838 while (node) {
839 if (node->ioaddr_base <= addr && addr < node->ioaddr_base + node->size) {
840 *read_timeout = node->read_timeout;
841 *write_timeout = node->write_timeout;
842 lck_spin_unlock(&io_timeout_override_lock);
843 return true;
844 } else if (addr < node->ioaddr_base) {
845 node = RB_LEFT(node, tree);
846 } else {
847 node = RB_RIGHT(node, tree);
848 }
849 }
850 lck_spin_unlock(&io_timeout_override_lock);
851
852 return false;
853 }
854
/*
 * io_override_timeout_ss:
 *
 * Strong-sync timeout override for a physical address.
 *
 * On arm64, when `paddr` falls in an I/O range whose attributes include
 * PMAP_IO_RANGE_STRONG_SYNC, both timeouts are set to
 * STRONG_SYNC_TIMEOUT and true is returned. In every other case (and
 * always on other architectures) false is returned and the outputs are
 * left untouched.
 */
static bool
io_override_timeout_ss(uint64_t paddr, uint64_t *read_timeout, uint64_t *write_timeout)
{
#if defined(__arm64__)

	/*
	 * PCIe regions are marked with PMAP_IO_RANGE_STRONG_SYNC. Apply a
	 * timeout greater than two PCIe completion timeouts (90ms) as they can
	 * stack.
	 */
#define STRONG_SYNC_TIMEOUT 2160000 /* 90ms */

	pmap_io_range_t *io_range = pmap_find_io_attr(paddr);
	if (io_range == NULL || (io_range->wimg & PMAP_IO_RANGE_STRONG_SYNC) == 0) {
		return false;
	}

	*read_timeout = STRONG_SYNC_TIMEOUT;
	*write_timeout = STRONG_SYNC_TIMEOUT;
	return true;
#else
	(void)paddr;
	(void)read_timeout;
	(void)write_timeout;
	return false;
#endif /* __arm64__ */
}
880
881 /*
882 * Return timeout override values for the read/write timeout for a given
883 * address.
884 * A virtual address (vaddr), physical address (paddr) or both may be passed.
885 * Up to three separate timeout overrides can be found
886 * - A virtual address override
887 * - A physical address override
888 * - A strong sync override
889 * The largest override found is returned.
890 */
891 void
override_io_timeouts(uintptr_t vaddr,uint64_t paddr,uint64_t * read_timeout,uint64_t * write_timeout)892 override_io_timeouts(uintptr_t vaddr, uint64_t paddr, uint64_t *read_timeout,
893 uint64_t *write_timeout)
894 {
895 uint64_t rt_va = 0, wt_va = 0, rt_pa = 0, wt_pa = 0, rt_ss = 0, wt_ss = 0;
896
897 if (vaddr != 0) {
898 /* Override from virtual address. */
899 io_override_timeout(&io_timeout_override_root_va, vaddr, &rt_va, &wt_va);
900 }
901
902 if (paddr != 0) {
903 /* Override from physical address. */
904 io_override_timeout(&io_timeout_override_root_pa, paddr, &rt_pa, &wt_pa);
905
906 /* Override from strong sync range. */
907 io_override_timeout_ss(paddr, &rt_ss, &wt_ss);
908 }
909
910 if (read_timeout != NULL) {
911 *read_timeout = MAX(MAX(rt_va, rt_pa), rt_ss);
912 }
913
914 if (write_timeout != NULL) {
915 *write_timeout = MAX(MAX(wt_va, wt_pa), wt_ss);
916 }
917 }
918
919 #endif /* ML_IO_TIMEOUTS_ENABLED */
920
921 int
ml_io_increase_timeouts(uintptr_t ioaddr_base,unsigned int size,uint32_t read_timeout_us,uint32_t write_timeout_us)922 ml_io_increase_timeouts(uintptr_t ioaddr_base, unsigned int size,
923 uint32_t read_timeout_us, uint32_t write_timeout_us)
924 {
925 #if ML_IO_TIMEOUTS_ENABLED
926 const size_t MAX_SIZE = 4096;
927
928 if (size > MAX_SIZE) {
929 return KERN_INVALID_ARGUMENT;
930 }
931
932 return io_increase_timeouts(&io_timeout_override_root_va, ioaddr_base,
933 size, read_timeout_us, write_timeout_us);
934 #else
935 #pragma unused(ioaddr_base, size, read_timeout_us, write_timeout_us)
936 return KERN_SUCCESS;
937 #endif /* ML_IO_TIMEOUTS_ENABLED */
938 }
939
940 int
ml_io_increase_timeouts_phys(vm_offset_t ioaddr_base,unsigned int size,uint32_t read_timeout_us,uint32_t write_timeout_us)941 ml_io_increase_timeouts_phys(vm_offset_t ioaddr_base, unsigned int size,
942 uint32_t read_timeout_us, uint32_t write_timeout_us)
943 {
944 #if ML_IO_TIMEOUTS_ENABLED
945 return io_increase_timeouts(&io_timeout_override_root_pa, ioaddr_base,
946 size, read_timeout_us, write_timeout_us);
947 #else
948 #pragma unused(ioaddr_base, size, read_timeout_us, write_timeout_us)
949 return KERN_SUCCESS;
950 #endif /* ML_IO_TIMEOUTS_ENABLED */
951 }
952
953 int
ml_io_reset_timeouts(uintptr_t ioaddr_base,unsigned int size)954 ml_io_reset_timeouts(uintptr_t ioaddr_base, unsigned int size)
955 {
956 #if ML_IO_TIMEOUTS_ENABLED
957 return io_reset_timeouts(&io_timeout_override_root_va, ioaddr_base, size);
958 #else
959 #pragma unused(ioaddr_base, size)
960 return KERN_SUCCESS;
961 #endif /* ML_IO_TIMEOUTS_ENABLED */
962 }
963
964 int
ml_io_reset_timeouts_phys(vm_offset_t ioaddr_base,unsigned int size)965 ml_io_reset_timeouts_phys(vm_offset_t ioaddr_base, unsigned int size)
966 {
967 #if ML_IO_TIMEOUTS_ENABLED
968 return io_reset_timeouts(&io_timeout_override_root_pa, ioaddr_base, size);
969 #else
970 #pragma unused(ioaddr_base, size)
971 return KERN_SUCCESS;
972 #endif /* ML_IO_TIMEOUTS_ENABLED */
973 }
974
/*
 * Read `size` bytes (1, 2, 4 or 8) from the I/O address `vaddr` with a single
 * volatile load and return the value widened to unsigned long long.  Any
 * other size panics.
 *
 * When ML_IO_TIMEOUTS_ENABLED is defined and the read report threshold is
 * non-zero, the load is timed between two ml_io_timestamp() calls after
 * ml_set_interrupts_enabled_with_debug(false, false); the previous interrupt
 * state is restored before returning.  A load that exceeds the threshold
 * triggers the physioread DTRACE_PHYSLAT5 probe and, unless an address
 * override raises the threshold above the measured duration, panics when
 * phy_read_panic is set and machine timeouts are not suspended.  A load that
 * exceeds the (non-zero) trace threshold also emits a kdebug event.
 */
unsigned long long
ml_io_read(uintptr_t vaddr, int size)
{
	unsigned long long result = 0;
	/* Temporaries for the 1- and 2-byte loads before widening into result. */
	unsigned char s1;
	unsigned short s2;

#ifdef ML_IO_VERIFY_UNCACHEABLE
	/* NOTE(review): presumably validates the mapping and yields its physical address -- confirm. */
	uintptr_t paddr = pmap_verify_noncacheable(vaddr);
#elif defined(ML_IO_TIMEOUTS_ENABLED)
	/* 0 means "not resolved yet"; kvtophys() is only called for slow accesses. */
	uintptr_t paddr = 0;
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	/* Timestamps taken before (sabs) and after (eabs) the load. */
	uint64_t sabs, eabs;
	/* istate and sabs are only assigned, and only used, when timeread is TRUE. */
	boolean_t istate, timeread = FALSE;
	uint64_t report_read_delay;
#if __x86_64__
	report_read_delay = report_phy_read_delay;
#else
	report_read_delay = os_atomic_load(&report_phy_read_delay_to, relaxed);
	uint64_t const trace_phy_read_delay = os_atomic_load(&trace_phy_read_delay_to, relaxed);
#endif /* __x86_64__ */

	/* A zero report threshold disables the measurement entirely. */
	if (__improbable(report_read_delay != 0)) {
		istate = ml_set_interrupts_enabled_with_debug(false, false);
		sabs = ml_io_timestamp();
		timeread = TRUE;
	}

#ifdef ML_IO_SIMULATE_STRETCHED_ENABLED
	/* Move the start timestamp back so the access appears that much longer. */
	if (__improbable(timeread && simulate_stretched_io)) {
		sabs -= simulate_stretched_io;
	}
#endif /* ML_IO_SIMULATE_STRETCHED_ENABLED */
#endif /* ML_IO_TIMEOUTS_ENABLED */

#if DEVELOPMENT || DEBUG
	/*
	 * Fences bracket the load unless KF_IO_TIMEOUT_OVRD is overridden.
	 * NOTE(review): presumably they order the timestamp reads against the
	 * memory access -- confirm against the fence implementations.
	 */
	boolean_t use_fences = !kern_feature_override(KF_IO_TIMEOUT_OVRD);
	if (use_fences) {
		ml_timebase_to_memory_fence();
	}
#endif

	/* The actual device access: exactly one volatile load of the requested width. */
	switch (size) {
	case 1:
		s1 = *(volatile unsigned char *)vaddr;
		result = s1;
		break;
	case 2:
		s2 = *(volatile unsigned short *)vaddr;
		result = s2;
		break;
	case 4:
		result = *(volatile unsigned int *)vaddr;
		break;
	case 8:
		result = *(volatile unsigned long long *)vaddr;
		break;
	default:
		panic("Invalid size %d for ml_io_read(%p)", size, (void *)vaddr);
		break;
	}

#if DEVELOPMENT || DEBUG
	if (use_fences) {
		ml_memory_to_timebase_fence();
	}
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	if (__improbable(timeread == TRUE)) {
		eabs = ml_io_timestamp();


		/* Prevent the processor from calling iotrace during its
		 * initialization procedure. */
		if (current_processor()->state == PROCESSOR_RUNNING) {
			iotrace(IOTRACE_IO_READ, vaddr, paddr, size, result, sabs, eabs - sabs);
		}

		/* First check, against the default threshold. */
		if (__improbable((eabs - sabs) > report_read_delay)) {
			/* Slow access: resolve the physical address if it is still unknown. */
			if (paddr == 0) {
				paddr = kvtophys(vaddr);
			}

			DTRACE_PHYSLAT5(physioread, uint64_t, (eabs - sabs),
			    uint64_t, vaddr, uint32_t, size, uint64_t, paddr, uint64_t, result);

			/* A non-zero address-specific override replaces the threshold. */
			uint64_t override = 0;
			override_io_timeouts(vaddr, paddr, &override, NULL);

			if (override != 0) {
#if SCHED_HYGIENE_DEBUG
				/*
				 * The IO timeout was overridden. If we were called in an
				 * interrupt handler context, that can lead to a timeout
				 * panic, so we need to abandon the measurement.
				 */
				if (interrupt_masked_debug_mode == SCHED_HYGIENE_MODE_PANIC) {
					ml_irq_debug_abandon();
				}
#endif
				report_read_delay = override;
			}
		}

		/* Second check, against the possibly overridden threshold. */
		if (__improbable((eabs - sabs) > report_read_delay)) {
			if (phy_read_panic && (machine_timeout_suspended() == FALSE)) {
#if defined(__x86_64__)
				panic_notify();
#endif /* defined(__x86_64__) */
				uint64_t nsec = 0;
				absolutetime_to_nanoseconds(eabs - sabs, &nsec);
				panic("Read from IO vaddr 0x%lx paddr 0x%lx took %llu ns, "
				    "result: 0x%llx (start: %llu, end: %llu), ceiling: %llu",
				    vaddr, paddr, nsec, result, sabs, eabs,
				    report_read_delay);
			}
		}

		/*
		 * Emit a kdebug event for accesses slower than the trace threshold.
		 * On x86_64 trace_phy_read_delay is not declared in this function,
		 * so it has to come from a declaration outside it.
		 */
		if (__improbable(trace_phy_read_delay > 0 && (eabs - sabs) > trace_phy_read_delay)) {
			KDBG(MACHDBG_CODE(DBG_MACH_IO, DBC_MACH_IO_MMIO_READ),
			    (eabs - sabs), VM_KERNEL_UNSLIDE_OR_PERM(vaddr), paddr, result);
		}

		/* Restore the interrupt state saved when timing started. */
		(void)ml_set_interrupts_enabled_with_debug(istate, false);
	}
#endif /* ML_IO_TIMEOUTS_ENABLED */
	return result;
}
1106
/* Read one byte from the I/O address vaddr via ml_io_read(). */
unsigned int
ml_io_read8(uintptr_t vaddr)
{
	const unsigned long long raw = ml_io_read(vaddr, 1);

	return (unsigned int)raw;
}
1112
/* Read two bytes from the I/O address vaddr via ml_io_read(). */
unsigned int
ml_io_read16(uintptr_t vaddr)
{
	const unsigned long long raw = ml_io_read(vaddr, 2);

	return (unsigned int)raw;
}
1118
/* Read four bytes from the I/O address vaddr via ml_io_read(). */
unsigned int
ml_io_read32(uintptr_t vaddr)
{
	const unsigned long long raw = ml_io_read(vaddr, 4);

	return (unsigned int)raw;
}
1124
/* Read eight bytes from the I/O address vaddr via ml_io_read(). */
unsigned long long
ml_io_read64(uintptr_t vaddr)
{
	const unsigned long long raw = ml_io_read(vaddr, 8);

	return raw;
}
1130
1131 /* ml_io_write* */
1132
/*
 * Write the low `size` bytes (1, 2, 4 or 8) of `val` to the I/O address
 * `vaddr` with a single volatile store.  Any other size panics.
 *
 * When ML_IO_TIMEOUTS_ENABLED is defined and the write report threshold is
 * non-zero, the store is timed between two ml_io_timestamp() calls after
 * ml_set_interrupts_enabled_with_debug(false, false); the previous interrupt
 * state is restored before returning.  A store that exceeds the threshold
 * triggers the physiowrite DTRACE_PHYSLAT5 probe and, unless an address
 * override raises the threshold above the measured duration, panics when
 * phy_write_panic is set and machine timeouts are not suspended.  A store
 * that exceeds the (non-zero) trace threshold also emits a kdebug event.
 */
void
ml_io_write(uintptr_t vaddr, uint64_t val, int size)
{
#ifdef ML_IO_VERIFY_UNCACHEABLE
	/* NOTE(review): presumably validates the mapping and yields its physical address -- confirm. */
	uintptr_t paddr = pmap_verify_noncacheable(vaddr);
#elif defined(ML_IO_TIMEOUTS_ENABLED)
	/* 0 means "not resolved yet"; kvtophys() is only called for slow accesses. */
	uintptr_t paddr = 0;
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	/* Timestamps taken before (sabs) and after (eabs) the store. */
	uint64_t sabs, eabs;
	/* istate and sabs are only assigned, and only used, when timewrite is TRUE. */
	boolean_t istate, timewrite = FALSE;
	uint64_t report_write_delay;
#if __x86_64__
	report_write_delay = report_phy_write_delay;
#else
	report_write_delay = os_atomic_load(&report_phy_write_delay_to, relaxed);
	uint64_t trace_phy_write_delay = os_atomic_load(&trace_phy_write_delay_to, relaxed);
#endif /* __x86_64__ */
	/* A zero report threshold disables the measurement entirely. */
	if (__improbable(report_write_delay != 0)) {
		istate = ml_set_interrupts_enabled_with_debug(false, false);
		sabs = ml_io_timestamp();
		timewrite = TRUE;
	}

#ifdef ML_IO_SIMULATE_STRETCHED_ENABLED
	/* Move the start timestamp back so the access appears that much longer. */
	if (__improbable(timewrite && simulate_stretched_io)) {
		sabs -= simulate_stretched_io;
	}
#endif /* ML_IO_SIMULATE_STRETCHED_ENABLED */
#endif /* ML_IO_TIMEOUTS_ENABLED */

#if DEVELOPMENT || DEBUG
	/*
	 * Fences bracket the store unless KF_IO_TIMEOUT_OVRD is overridden.
	 * NOTE(review): presumably they order the timestamp reads against the
	 * memory access -- confirm against the fence implementations.
	 */
	boolean_t use_fences = !kern_feature_override(KF_IO_TIMEOUT_OVRD);
	if (use_fences) {
		ml_timebase_to_memory_fence();
	}
#endif

	/* The actual device access: exactly one volatile store of the requested width. */
	switch (size) {
	case 1:
		*(volatile uint8_t *)vaddr = (uint8_t)val;
		break;
	case 2:
		*(volatile uint16_t *)vaddr = (uint16_t)val;
		break;
	case 4:
		*(volatile uint32_t *)vaddr = (uint32_t)val;
		break;
	case 8:
		*(volatile uint64_t *)vaddr = (uint64_t)val;
		break;
	default:
		panic("Invalid size %d for ml_io_write(%p, 0x%llx)", size, (void *)vaddr, val);
		break;
	}

#if DEVELOPMENT || DEBUG
	if (use_fences) {
		ml_memory_to_timebase_fence();
	}
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	if (__improbable(timewrite == TRUE)) {
		eabs = ml_io_timestamp();

		/* Prevent the processor from calling iotrace during its
		 * initialization procedure. */
		if (current_processor()->state == PROCESSOR_RUNNING) {
			iotrace(IOTRACE_IO_WRITE, vaddr, paddr, size, val, sabs, eabs - sabs);
		}


		/* First check, against the default threshold. */
		if (__improbable((eabs - sabs) > report_write_delay)) {
			/* Slow access: resolve the physical address if it is still unknown. */
			if (paddr == 0) {
				paddr = kvtophys(vaddr);
			}

			DTRACE_PHYSLAT5(physiowrite, uint64_t, (eabs - sabs),
			    uint64_t, vaddr, uint32_t, size, uint64_t, paddr, uint64_t, val);

			/* A non-zero address-specific override replaces the threshold. */
			uint64_t override = 0;
			override_io_timeouts(vaddr, paddr, NULL, &override);

			if (override != 0) {
#if SCHED_HYGIENE_DEBUG
				/*
				 * The IO timeout was overridden. If we were called in an
				 * interrupt handler context, that can lead to a timeout
				 * panic, so we need to abandon the measurement.
				 */
				if (interrupt_masked_debug_mode == SCHED_HYGIENE_MODE_PANIC) {
					ml_irq_debug_abandon();
				}
#endif
				report_write_delay = override;
			}
		}

		/* Second check, against the possibly overridden threshold. */
		if (__improbable((eabs - sabs) > report_write_delay)) {
			if (phy_write_panic && (machine_timeout_suspended() == FALSE)) {
#if defined(__x86_64__)
				panic_notify();
#endif /* defined(__x86_64__) */

				uint64_t nsec = 0;
				absolutetime_to_nanoseconds(eabs - sabs, &nsec);
				panic("Write to IO vaddr %p paddr %p val 0x%llx took %llu ns,"
				    " (start: %llu, end: %llu), ceiling: %llu",
				    (void *)vaddr, (void *)paddr, val, nsec, sabs, eabs,
				    report_write_delay);
			}
		}

		/*
		 * Emit a kdebug event for accesses slower than the trace threshold.
		 * On x86_64 trace_phy_write_delay is not declared in this function,
		 * so it has to come from a declaration outside it.
		 */
		if (__improbable(trace_phy_write_delay > 0 && (eabs - sabs) > trace_phy_write_delay)) {
			KDBG(MACHDBG_CODE(DBG_MACH_IO, DBC_MACH_IO_MMIO_WRITE),
			    (eabs - sabs), VM_KERNEL_UNSLIDE_OR_PERM(vaddr), paddr, val);
		}

		/* Restore the interrupt state saved when timing started. */
		(void)ml_set_interrupts_enabled_with_debug(istate, false);
	}
#endif /* ML_IO_TIMEOUTS_ENABLED */
}
1257
/* Write the one-byte value val to the I/O address vaddr via ml_io_write(). */
void
ml_io_write8(uintptr_t vaddr, uint8_t val)
{
	const uint64_t widened = val;

	ml_io_write(vaddr, widened, 1);
}
1263
/* Write the two-byte value val to the I/O address vaddr via ml_io_write(). */
void
ml_io_write16(uintptr_t vaddr, uint16_t val)
{
	const uint64_t widened = val;

	ml_io_write(vaddr, widened, 2);
}
1269
/* Write the four-byte value val to the I/O address vaddr via ml_io_write(). */
void
ml_io_write32(uintptr_t vaddr, uint32_t val)
{
	const uint64_t widened = val;

	ml_io_write(vaddr, widened, 4);
}
1275
/* Write the eight-byte value val to the I/O address vaddr via ml_io_write(). */
void
ml_io_write64(uintptr_t vaddr, uint64_t val)
{
	const uint64_t value = val;

	ml_io_write(vaddr, value, 8);
}
1281
/*
 * One registered CPU event callback.  Elements form a singly linked list
 * headed by cpu_callback_chain; new elements are pushed at the head.
 */
struct cpu_callback_chain_elem {
	cpu_callback_t                  fn;     /* invoked as fn(param, event, cpu_or_cluster) on broadcast */
	void                            *param; /* opaque value handed back to fn as its first argument */
	struct cpu_callback_chain_elem  *next;  /* next (older) registration, or NULL at the tail */
};
1287
/*
 * Head of the callback list.  Registration updates it with a release store
 * while holding cpu_callback_chain_lock; ml_broadcast_cpu_event() reads it
 * with an atomic load and does not take the lock.
 */
static struct cpu_callback_chain_elem *cpu_callback_chain;
static LCK_GRP_DECLARE(cpu_callback_chain_lock_grp, "cpu_callback_chain");
/* Spin lock held while a new element is linked into the list. */
static LCK_SPIN_DECLARE(cpu_callback_chain_lock, &cpu_callback_chain_lock_grp);
1291
1292 void
cpu_event_register_callback(cpu_callback_t fn,void * param)1293 cpu_event_register_callback(cpu_callback_t fn, void *param)
1294 {
1295 struct cpu_callback_chain_elem *new_elem;
1296
1297 new_elem = zalloc_permanent_type(struct cpu_callback_chain_elem);
1298 if (!new_elem) {
1299 panic("can't allocate cpu_callback_chain_elem");
1300 }
1301
1302 lck_spin_lock(&cpu_callback_chain_lock);
1303 new_elem->next = cpu_callback_chain;
1304 new_elem->fn = fn;
1305 new_elem->param = param;
1306 os_atomic_store(&cpu_callback_chain, new_elem, release);
1307 lck_spin_unlock(&cpu_callback_chain_lock);
1308 }
1309
/*
 * Unregistering a CPU event callback is not supported: this function ignores
 * fn, always panics, and never returns.
 */
__attribute__((noreturn))
void
cpu_event_unregister_callback(__unused cpu_callback_t fn)
{
	panic("Unfortunately, cpu_event_unregister_callback is unimplemented.");
}
1316
1317 void
ml_broadcast_cpu_event(enum cpu_event event,unsigned int cpu_or_cluster)1318 ml_broadcast_cpu_event(enum cpu_event event, unsigned int cpu_or_cluster)
1319 {
1320 struct cpu_callback_chain_elem *cursor;
1321
1322 cursor = os_atomic_load(&cpu_callback_chain, dependency);
1323 for (; cursor != NULL; cursor = cursor->next) {
1324 cursor->fn(cursor->param, event, cpu_or_cluster);
1325 }
1326 }
1327
1328 // Initialize Machine Timeouts (see the MACHINE_TIMEOUT macro
1329 // definition)
1330
1331 void
machine_timeout_init_with_suffix(const struct machine_timeout_spec * spec,char const * suffix)1332 machine_timeout_init_with_suffix(const struct machine_timeout_spec *spec, char const *suffix)
1333 {
1334 if (wdt == -1 || (spec->skip_predicate != NULL && spec->skip_predicate(spec))) {
1335 // This timeout should be disabled.
1336 os_atomic_store_wide((uint64_t*)spec->ptr, 0, relaxed);
1337 return;
1338 }
1339
1340 assert(suffix != NULL);
1341 assert(strlen(spec->name) <= MACHINE_TIMEOUT_MAX_NAME_LEN);
1342
1343 size_t const suffix_len = strlen(suffix);
1344
1345 size_t const dt_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + suffix_len + 1;
1346 char dt_name[dt_name_size];
1347
1348 strlcpy(dt_name, spec->name, dt_name_size);
1349 strlcat(dt_name, suffix, dt_name_size);
1350
1351 size_t const scale_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + suffix_len + strlen("-scale") + 1;
1352 char scale_name[scale_name_size];
1353
1354 strlcpy(scale_name, spec->name, scale_name_size);
1355 strlcat(scale_name, suffix, scale_name_size);
1356 strlcat(scale_name, "-scale", scale_name_size);
1357
1358 size_t const boot_arg_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + strlen("ml-timeout-") + suffix_len + 1;
1359 char boot_arg_name[boot_arg_name_size];
1360
1361 strlcpy(boot_arg_name, "ml-timeout-", boot_arg_name_size);
1362 strlcat(boot_arg_name, spec->name, boot_arg_name_size);
1363 strlcat(boot_arg_name, suffix, boot_arg_name_size);
1364
1365 size_t const boot_arg_scale_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN +
1366 strlen("ml-timeout-") + strlen("-scale") + suffix_len + 1;
1367 char boot_arg_scale_name[boot_arg_scale_name_size];
1368
1369 strlcpy(boot_arg_scale_name, "ml-timeout-", boot_arg_scale_name_size);
1370 strlcat(boot_arg_scale_name, spec->name, boot_arg_scale_name_size);
1371 strlcat(boot_arg_scale_name, suffix, boot_arg_name_size);
1372 strlcat(boot_arg_scale_name, "-scale", boot_arg_scale_name_size);
1373
1374
1375 /*
1376 * Determine base value from DT and boot-args.
1377 */
1378
1379 DTEntry base, chosen;
1380
1381 if (SecureDTLookupEntry(NULL, "/machine-timeouts", &base) != kSuccess) {
1382 base = NULL;
1383 }
1384
1385 if (SecureDTLookupEntry(NULL, "/chosen/machine-timeouts", &chosen) != kSuccess) {
1386 chosen = NULL;
1387 }
1388
1389 uint64_t timeout = spec->default_value;
1390 bool found = false;
1391
1392 uint64_t const *data = NULL;
1393 unsigned int data_size = sizeof(*data);
1394
1395 /* First look in /machine-timeouts/<name> */
1396 if (base != NULL && SecureDTGetProperty(base, dt_name, (const void **)&data, &data_size) == kSuccess) {
1397 if (data_size != sizeof(*data)) {
1398 panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/%s", __func__, data_size, dt_name);
1399 }
1400
1401 timeout = *data;
1402 found = true;
1403 }
1404
1405 /* A value in /chosen/machine-timeouts/<name> overrides */
1406 if (chosen != NULL && SecureDTGetProperty(chosen, dt_name, (const void **)&data, &data_size) == kSuccess) {
1407 if (data_size != sizeof(*data)) {
1408 panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/%s", __func__, data_size, dt_name);
1409 }
1410
1411 timeout = *data;
1412 found = true;
1413 }
1414
1415 /* A boot-arg ml-timeout-<name> overrides */
1416 uint64_t boot_arg = 0;
1417
1418 if (PE_parse_boot_argn(boot_arg_name, &boot_arg, sizeof(boot_arg))) {
1419 timeout = boot_arg;
1420 found = true;
1421 }
1422
1423
1424 /*
1425 * Determine scale value from DT and boot-args.
1426 */
1427
1428 uint64_t scale = 1;
1429 uint32_t const *scale_data;
1430 unsigned int scale_size = sizeof(scale_data);
1431
1432 /* If there is a scale factor /machine-timeouts/<name>-scale, apply it. */
1433 if (base != NULL && SecureDTGetProperty(base, scale_name, (const void **)&scale_data, &scale_size) == kSuccess) {
1434 if (scale_size != sizeof(*scale_data)) {
1435 panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/%s-scale", __func__, scale_size, dt_name);
1436 }
1437
1438 scale = *scale_data;
1439 }
1440
1441 /* If there is a scale factor /chosen/machine-timeouts/<name>-scale, use that. */
1442 if (chosen != NULL && SecureDTGetProperty(chosen, scale_name, (const void **)&scale_data, &scale_size) == kSuccess) {
1443 if (scale_size != sizeof(*scale_data)) {
1444 panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/%s-scale", __func__,
1445 scale_size, dt_name);
1446 }
1447
1448 scale = *scale_data;
1449 }
1450
1451 /* Finally, a boot-arg ml-timeout-<name>-scale takes precedence. */
1452 if (PE_parse_boot_argn(boot_arg_scale_name, &boot_arg, sizeof(boot_arg))) {
1453 scale = boot_arg;
1454 }
1455
1456 static bool global_scale_set;
1457 static uint64_t global_scale;
1458
1459 if (!global_scale_set) {
1460 /* Apply /machine-timeouts/global-scale if present */
1461 if (SecureDTGetProperty(base, "global-scale", (const void **)&scale_data, &scale_size) == kSuccess) {
1462 if (scale_size != sizeof(*scale_data)) {
1463 panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/global-scale", __func__,
1464 scale_size);
1465 }
1466
1467 global_scale = *scale_data;
1468 global_scale_set = true;
1469 }
1470
1471 /* Use /chosen/machine-timeouts/global-scale if present */
1472 if (SecureDTGetProperty(chosen, "global-scale", (const void **)&scale_data, &scale_size) == kSuccess) {
1473 if (scale_size != sizeof(*scale_data)) {
1474 panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/global-scale", __func__,
1475 scale_size);
1476 }
1477
1478 global_scale = *scale_data;
1479 global_scale_set = true;
1480 }
1481
1482 /* Finally, the boot-arg ml-timeout-global-scale takes precedence. */
1483 if (PE_parse_boot_argn("ml-timeout-global-scale", &boot_arg, sizeof(boot_arg))) {
1484 global_scale = boot_arg;
1485 global_scale_set = true;
1486 }
1487 }
1488
1489 if (global_scale_set) {
1490 scale *= global_scale;
1491 }
1492
1493 /* Compute the final timeout, and done. */
1494 if (found && timeout > 0) {
1495 /* Only apply inherent unit scale if the value came in
1496 * externally. */
1497
1498 if (spec->unit_scale == MACHINE_TIMEOUT_UNIT_TIMEBASE) {
1499 uint64_t nanoseconds = timeout / 1000;
1500 nanoseconds_to_absolutetime(nanoseconds, &timeout);
1501 } else {
1502 timeout /= spec->unit_scale;
1503 }
1504
1505 if (timeout == 0) {
1506 /* Ensure unit scaling did not disable the timeout. */
1507 timeout = 1;
1508 }
1509 }
1510
1511 if (os_mul_overflow(timeout, scale, &timeout)) {
1512 timeout = UINT64_MAX; // clamp
1513 }
1514
1515 os_atomic_store_wide((uint64_t*)spec->ptr, timeout, relaxed);
1516 }
1517
/*
 * Initialize the machine timeout described by spec using the plain,
 * unsuffixed device tree and boot-arg names.
 */
void
machine_timeout_init(const struct machine_timeout_spec *spec)
{
	char const * const no_suffix = "";

	machine_timeout_init_with_suffix(spec, no_suffix);
}
1523
1524 #if DEVELOPMENT || DEBUG
1525 /*
1526 * Late timeout (re-)initialization, at the end of bsd_init()
1527 */
1528 void
machine_timeout_bsd_init(void)1529 machine_timeout_bsd_init(void)
1530 {
1531 char const * const __unused mt_suffix = "-b";
1532 #if SCHED_HYGIENE_DEBUG
1533 machine_timeout_init_with_suffix(MACHINE_TIMEOUT_SPEC_REF(interrupt_masked_timeout), mt_suffix);
1534 machine_timeout_init_with_suffix(MACHINE_TIMEOUT_SPEC_REF(sched_preemption_disable_threshold_mt), mt_suffix);
1535
1536 /*
1537 * The io timeouts can inherit from interrupt_masked_timeout.
1538 * Re-initialize, as interrupt_masked_timeout may have changed.
1539 */
1540 ml_io_init_timeouts();
1541
1542 extern void preemption_disable_reset_max_durations(void);
1543 /*
1544 * Reset the preemption disable stats, so that they are not
1545 * polluted by long early boot code.
1546 */
1547 preemption_disable_reset_max_durations();
1548 #endif /* SCHED_HYGIENE_DEBUG */
1549 }
1550 #endif /* DEVELOPMENT || DEBUG */
1551
1552 #if ML_IO_TIMEOUTS_ENABLED && CONFIG_XNUPOST
1553 #include <tests/xnupost.h>
1554
1555 extern kern_return_t ml_io_timeout_test(void);
1556
1557 static inline void
ml_io_timeout_test_get_timeouts(uintptr_t vaddr,uint64_t * read_timeout,uint64_t * write_timeout)1558 ml_io_timeout_test_get_timeouts(uintptr_t vaddr, uint64_t *read_timeout, uint64_t *write_timeout)
1559 {
1560 *read_timeout = 0;
1561 *write_timeout = 0;
1562
1563 vm_offset_t paddr = kvtophys(vaddr);
1564
1565 boolean_t istate = ml_set_interrupts_enabled(FALSE);
1566 override_io_timeouts(vaddr, paddr, read_timeout, write_timeout);
1567 ml_set_interrupts_enabled(istate);
1568 }
1569
1570 static inline void
ml_io_timeout_test_get_timeouts_phys(vm_offset_t paddr,uint64_t * read_timeout,uint64_t * write_timeout)1571 ml_io_timeout_test_get_timeouts_phys(vm_offset_t paddr, uint64_t *read_timeout, uint64_t *write_timeout)
1572 {
1573 *read_timeout = 0;
1574 *write_timeout = 0;
1575
1576 boolean_t istate = ml_set_interrupts_enabled(FALSE);
1577 override_io_timeouts(0, paddr, read_timeout, write_timeout);
1578 ml_set_interrupts_enabled(istate);
1579 }
1580
1581 kern_return_t
ml_io_timeout_test(void)1582 ml_io_timeout_test(void)
1583 {
1584 const size_t SIZE = 16;
1585 /*
1586 * Page align the base address to ensure that the regions are physically
1587 * contiguous.
1588 */
1589 const uintptr_t iovaddr_base1 = (uintptr_t)kernel_pmap & ~PAGE_MASK;
1590
1591 const uintptr_t iovaddr_base2 = iovaddr_base1 + SIZE;
1592 const uintptr_t vaddr1 = iovaddr_base1 + SIZE / 2;
1593 const uintptr_t vaddr2 = iovaddr_base2 + SIZE / 2;
1594
1595 const vm_offset_t iopaddr_base1 = kvtophys(iovaddr_base1);
1596 const vm_offset_t iopaddr_base2 = kvtophys(iovaddr_base2);
1597 const vm_offset_t paddr1 = iopaddr_base1 + SIZE / 2;
1598 const vm_offset_t paddr2 = iopaddr_base2 + SIZE / 2;
1599
1600 const uint64_t READ_TIMEOUT1_US = 50000, WRITE_TIMEOUT1_US = 50001;
1601 const uint64_t READ_TIMEOUT2_US = 50002, WRITE_TIMEOUT2_US = 50003;
1602 uint64_t read_timeout1_abs, write_timeout1_abs;
1603 uint64_t read_timeout2_abs, write_timeout2_abs;
1604 nanoseconds_to_absolutetime(NSEC_PER_USEC * READ_TIMEOUT1_US, &read_timeout1_abs);
1605 nanoseconds_to_absolutetime(NSEC_PER_USEC * WRITE_TIMEOUT1_US, &write_timeout1_abs);
1606 nanoseconds_to_absolutetime(NSEC_PER_USEC * READ_TIMEOUT2_US, &read_timeout2_abs);
1607 nanoseconds_to_absolutetime(NSEC_PER_USEC * WRITE_TIMEOUT2_US, &write_timeout2_abs);
1608
1609 int err = ml_io_increase_timeouts(iovaddr_base1, 0, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1610 T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for empty region");
1611
1612 err = ml_io_increase_timeouts(iovaddr_base1, 4097, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1613 T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for region > 4096 bytes");
1614
1615 err = ml_io_increase_timeouts(UINTPTR_MAX, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1616 T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for overflowed region");
1617
1618 err = ml_io_increase_timeouts(iovaddr_base1, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1619 T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for first VA region should succeed");
1620
1621 err = ml_io_increase_timeouts(iovaddr_base2, SIZE, READ_TIMEOUT2_US, WRITE_TIMEOUT2_US);
1622 T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for second VA region should succeed");
1623
1624 err = ml_io_increase_timeouts(iovaddr_base1, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1625 T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for same region twice");
1626
1627 err = ml_io_increase_timeouts(vaddr1, (uint32_t)(vaddr2 - vaddr1), READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1628 T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for overlapping regions");
1629
1630 uint64_t read_timeout, write_timeout;
1631 ml_io_timeout_test_get_timeouts(vaddr1, &read_timeout, &write_timeout);
1632 T_EXPECT_EQ_ULLONG(read_timeout, read_timeout1_abs, "Read timeout for first region");
1633 T_EXPECT_EQ_ULLONG(write_timeout, write_timeout1_abs, "Write timeout for first region");
1634
1635 ml_io_timeout_test_get_timeouts(vaddr2, &read_timeout, &write_timeout);
1636 T_EXPECT_EQ_ULLONG(read_timeout, read_timeout2_abs, "Read timeout for first region");
1637 T_EXPECT_EQ_ULLONG(write_timeout, write_timeout2_abs, "Write timeout for first region");
1638
1639 ml_io_timeout_test_get_timeouts(iovaddr_base2 + SIZE, &read_timeout, &write_timeout);
1640 T_EXPECT_EQ_ULLONG(read_timeout, 0, "Read timeout without override");
1641 T_EXPECT_EQ_ULLONG(write_timeout, 0, "Write timeout without override");
1642
1643 err = ml_io_reset_timeouts(iovaddr_base1 + 1, SIZE - 1);
1644 T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for subregion");
1645
1646 err = ml_io_reset_timeouts(iovaddr_base2 + SIZE, SIZE);
1647 T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for non-existent region");
1648
1649 err = ml_io_reset_timeouts(iovaddr_base1, SIZE);
1650 T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for first VA region should succeed");
1651
1652 ml_io_timeout_test_get_timeouts(vaddr1, &read_timeout, &write_timeout);
1653 T_EXPECT_EQ_ULLONG(read_timeout, 0, "Read timeout for reset region");
1654 T_EXPECT_EQ_ULLONG(write_timeout, 0, "Write timeout for reset region");
1655
1656 err = ml_io_reset_timeouts(iovaddr_base1, SIZE);
1657 T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for same region twice");
1658
1659 err = ml_io_reset_timeouts(iovaddr_base2, SIZE);
1660 T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for second VA region should succeed");
1661
1662 err = ml_io_increase_timeouts_phys(iopaddr_base1, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1663 T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for first PA region should succeed");
1664
1665 err = ml_io_increase_timeouts_phys(iopaddr_base2, SIZE, READ_TIMEOUT2_US, WRITE_TIMEOUT2_US);
1666 T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for second PA region should succeed");
1667
1668 ml_io_timeout_test_get_timeouts(vaddr1, &read_timeout, &write_timeout);
1669 T_EXPECT_EQ_ULLONG(read_timeout, read_timeout1_abs, "Read timeout for first region");
1670 T_EXPECT_EQ_ULLONG(write_timeout, write_timeout1_abs, "Write timeout for first region");
1671
1672 ml_io_timeout_test_get_timeouts(vaddr2, &read_timeout, &write_timeout);
1673 T_EXPECT_EQ_ULLONG(read_timeout, read_timeout2_abs, "Read timeout for first region");
1674 T_EXPECT_EQ_ULLONG(write_timeout, write_timeout2_abs, "Write timeout for first region");
1675
1676 ml_io_timeout_test_get_timeouts_phys(paddr1, &read_timeout, &write_timeout);
1677 T_EXPECT_EQ_ULLONG(read_timeout, read_timeout1_abs, "Read timeout for first region");
1678 T_EXPECT_EQ_ULLONG(write_timeout, write_timeout1_abs, "Write timeout for first region");
1679
1680 ml_io_timeout_test_get_timeouts_phys(paddr2, &read_timeout, &write_timeout);
1681 T_EXPECT_EQ_ULLONG(read_timeout, read_timeout2_abs, "Read timeout for first physical region");
1682 T_EXPECT_EQ_ULLONG(write_timeout, write_timeout2_abs, "Write timeout for first physical region");
1683
1684 err = ml_io_reset_timeouts_phys(iopaddr_base1, SIZE);
1685 T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for first PA region should succeed");
1686
1687 err = ml_io_reset_timeouts_phys(iopaddr_base2, SIZE);
1688 T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for second PA region should succeed");
1689
1690 ml_io_timeout_test_get_timeouts_phys(paddr1, &read_timeout, &write_timeout);
1691 T_EXPECT_EQ_ULLONG(read_timeout, 0, "Read timeout for reset region");
1692 T_EXPECT_EQ_ULLONG(write_timeout, 0, "Write timeout for reset region");
1693
1694 return KERN_SUCCESS;
1695 }
1696 #endif /* CONFIG_XNUPOST */
1697