1 /*
2 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: kern/machine.c
60 * Author: Avadis Tevanian, Jr.
61 * Date: 1987
62 *
63 * Support for machine independent machine abstraction.
64 */
65
66 #include <string.h>
67
68 #include <mach/mach_types.h>
69 #include <mach/boolean.h>
70 #include <mach/kern_return.h>
71 #include <mach/machine.h>
72 #include <mach/host_info.h>
73 #include <mach/host_reboot.h>
74 #include <mach/host_priv_server.h>
75 #include <mach/processor_server.h>
76 #include <mach/sdt.h>
77
78 #include <kern/kern_types.h>
79 #include <kern/cpu_data.h>
80 #include <kern/ipc_host.h>
81 #include <kern/host.h>
82 #include <kern/machine.h>
83 #include <kern/misc_protos.h>
84 #include <kern/percpu.h>
85 #include <kern/processor.h>
86 #include <kern/queue.h>
87 #include <kern/sched.h>
88 #include <kern/startup.h>
89 #include <kern/task.h>
90 #include <kern/thread.h>
91 #include <kern/iotrace.h>
92
93 #include <libkern/OSDebug.h>
94 #if ML_IO_TIMEOUTS_ENABLED
95 #include <libkern/tree.h>
96 #endif
97
98 #include <pexpert/device_tree.h>
99
100 #include <machine/commpage.h>
101 #include <machine/machine_routines.h>
102
103 #if HIBERNATION
104 #include <IOKit/IOHibernatePrivate.h>
105 #endif
106 #include <IOKit/IOPlatformExpert.h>
107
108 #if CONFIG_DTRACE
109 extern void (*dtrace_cpu_state_changed_hook)(int, boolean_t);
110 #endif
111
112 #if defined(__arm64__)
113 extern void wait_while_mp_kdp_trap(bool check_SIGPdebug);
114 #if CONFIG_SPTM
115 #include <arm64/sptm/pmap/pmap_data.h>
116 #else
117 #include <arm/pmap/pmap_data.h>
118 #endif /* CONFIG_SPTM */
119 #endif /* defined(__arm64__) */
120
121 #if defined(__x86_64__)
122 #include <i386/panic_notify.h>
123 #endif
124
125 #if ML_IO_TIMEOUTS_ENABLED
126 #if defined(__x86_64__)
127 #define ml_io_timestamp mach_absolute_time
128 #else
129 #define ml_io_timestamp ml_get_timebase
130 #endif /* __x86_64__ */
131 #endif /* ML_IO_TIMEOUTS_ENABLED */
132
133 /*
134 * Exported variables:
135 */
136
137 struct machine_info machine_info;
138
139 /* Forwards */
140 static void
141 processor_offline(void * parameter, __unused wait_result_t result);
142
143 static void
144 processor_offline_intstack(processor_t processor) __dead2;
145
146
147 /*
148 * processor_up:
149 *
150 * Flag processor as up and running, and available
151 * for scheduling.
152 */
void
processor_up(
	processor_t processor)
{
	/* Interrupts/preemption must be off while the CPU is marked online. */
	spl_t s = splsched();
	init_ast_check(processor);

#if defined(__arm64__)
	/*
	 * A processor coming online won't have received a SIGPdebug signal
	 * to cause it to spin while a stackshot or panic is taking place,
	 * so spin here on mp_kdp_trap.
	 *
	 * However, since cpu_signal() is not yet enabled for this processor,
	 * there is a race if we have just passed this when a cpu_signal()
	 * is attempted. The sender will assume the cpu is offline, so it will
	 * not end up spinning anywhere. See processor_cpu_reinit() for the fix
	 * for this race.
	 */
	wait_while_mp_kdp_trap(false);
#endif

	/* Boot CPU coming online for the first time, either at boot or after sleep */
	__assert_only bool is_first_online_processor;

	is_first_online_processor = sched_mark_processor_online(processor,
	    processor->last_startup_reason);

	/*
	 * Sanity check: unless this is the very first CPU to come online,
	 * the processor must still be inside its startup sequence.
	 */
	simple_lock(&processor_start_state_lock, LCK_GRP_NULL);
	assert(processor->processor_instartup == true || is_first_online_processor);
	simple_unlock(&processor_start_state_lock);

	splx(s);

#if defined(__x86_64__)
	/* x86 publishes CPU-up here; arm64 does it in processor_cpu_reinit(). */
	ml_cpu_up();
#endif /* defined(__x86_64__) */

#if CONFIG_DTRACE
	/* Tell DTrace this CPU is now available. */
	if (dtrace_cpu_state_changed_hook) {
		(*dtrace_cpu_state_changed_hook)(processor->cpu_id, TRUE);
	}
#endif
}
197
198 #include <atm/atm_internal.h>
199
200 kern_return_t
host_reboot(host_priv_t host_priv,int options)201 host_reboot(
202 host_priv_t host_priv,
203 int options)
204 {
205 if (host_priv == HOST_PRIV_NULL) {
206 return KERN_INVALID_HOST;
207 }
208
209 #if DEVELOPMENT || DEBUG
210 if (options & HOST_REBOOT_DEBUGGER) {
211 Debugger("Debugger");
212 return KERN_SUCCESS;
213 }
214 #endif
215
216 if (options & HOST_REBOOT_UPSDELAY) {
217 // UPS power cutoff path
218 PEHaltRestart( kPEUPSDelayHaltCPU );
219 } else {
220 halt_all_cpus(!(options & HOST_REBOOT_HALT));
221 }
222
223 return KERN_SUCCESS;
224 }
225
kern_return_t
processor_assign(
	__unused processor_t processor,
	__unused processor_set_t new_pset,
	__unused boolean_t wait)
{
	/* Reassigning processors between processor sets is not supported. */
	return KERN_FAILURE;
}
234
/*
 * processor_doshutdown:
 *
 * Shut down one processor, or (with is_final_system_sleep) put the boot
 * processor down for full-system sleep.  The actual power-off runs on the
 * target CPU's idle thread via processor_offline().
 *
 * Caller must hold cluster_powerdown_lock and processor_updown_lock.
 */
void
processor_doshutdown(
	processor_t processor,
	bool is_final_system_sleep)
{
	lck_mtx_assert(&cluster_powerdown_lock, LCK_MTX_ASSERT_OWNED);
	lck_mtx_assert(&processor_updown_lock, LCK_MTX_ASSERT_OWNED);

	if (!processor->processor_booted) {
		panic("processor %d not booted", processor->cpu_id);
	}

	/* Final sleep happens on the boot CPU, with every other CPU already off. */
	if (is_final_system_sleep) {
		assert(processor == current_processor());
		assert(processor == master_processor);
		assert(processor_avail_count == 1);
	}

	processor_set_t pset = processor->processor_set;

	ml_cpu_begin_state_transition(processor->cpu_id);

	/* Notify platform listeners before the CPU goes away. */
	ml_broadcast_cpu_event(CPU_EXIT_REQUESTED, processor->cpu_id);

#if HIBERNATION
	if (is_final_system_sleep) {
		/*
		 * Ensure the page queues are in a state where the hibernation
		 * code can manipulate them without requiring other threads
		 * to be scheduled.
		 *
		 * This operation can block,
		 * and unlock must be done from the same thread.
		 */
		assert(processor_avail_count < 2);
		hibernate_vm_lock();
	}
#endif

	/* Lock order: splsched -> sched_available_cores_lock -> pset lock. */
	spl_t s = splsched();
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
	pset_lock(pset);

	/* The processor must be fully up before it can be shut down. */
	assert(processor->state != PROCESSOR_START);
	assert(processor->state != PROCESSOR_PENDING_OFFLINE);
	assert(processor->state != PROCESSOR_OFF_LINE);

	assert(!processor->processor_inshutdown);
	processor->processor_inshutdown = true;

	assert(processor->processor_offline_state == PROCESSOR_OFFLINE_RUNNING);
	processor_update_offline_state_locked(processor, PROCESSOR_OFFLINE_BEGIN_SHUTDOWN);

	if (!is_final_system_sleep) {
		/* Never take the last schedulable CPU offline. */
		sched_assert_not_last_online_cpu(processor->cpu_id);
	}

	pset_unlock(pset);
	simple_unlock(&sched_available_cores_lock);

	if (is_final_system_sleep) {
		assert(processor == current_processor());

#if HIBERNATION
		/*
		 * After this point, the system is now
		 * committed to hibernation and must
		 * not run any other thread that could take this lock.
		 */
		hibernate_vm_unlock();
#endif
	} else {
		/*
		 * Get onto the processor to shut down.
		 * The scheduler picks this thread naturally according to its
		 * priority.
		 * The processor can run any other thread if this one blocks.
		 * So, don't block.
		 */
		processor_t prev = thread_bind(processor);
		thread_block(THREAD_CONTINUE_NULL);

		/* interrupts still disabled */
		assert(ml_get_interrupts_enabled() == FALSE);

		/* We are now executing on the processor being shut down. */
		assert(processor == current_processor());
		assert(processor->processor_inshutdown);

		thread_bind(prev);
		/* interrupts still disabled */
	}

	/*
	 * Continue processor shutdown on the processor's idle thread.
	 * The handoff won't fail because the idle thread has a reserved stack.
	 * Switching to the idle thread leaves interrupts disabled,
	 * so we can't accidentally take an interrupt after the context switch.
	 */
	thread_t shutdown_thread = processor->idle_thread;
	shutdown_thread->continuation = processor_offline;
	shutdown_thread->parameter = (void*)is_final_system_sleep;

	thread_run(current_thread(), THREAD_CONTINUE_NULL, NULL, shutdown_thread);

	/*
	 * After this point, we are in regular scheduled context on a remaining
	 * available CPU. Interrupts are still disabled.
	 */

	if (is_final_system_sleep) {
		/*
		 * We are coming out of system sleep here, so there won't be a
		 * corresponding processor_startup for this processor, so we
		 * need to put it back in the correct running state.
		 *
		 * There's nowhere to execute a call to CPU_EXITED during system
		 * sleep for the boot processor, and it's already been CPU_BOOTED
		 * by this point anyways, so skip the call.
		 */
		assert(current_processor() == master_processor);
		assert(processor->state == PROCESSOR_RUNNING);
		assert(processor->processor_inshutdown);
		assert(processor->processor_offline_state == PROCESSOR_OFFLINE_STARTED_NOT_WAITED);
		processor->processor_inshutdown = false;
		processor_update_offline_state(processor, PROCESSOR_OFFLINE_RUNNING);

		splx(s);
	} else {
		splx(s);

		/* Wait for the target CPU to actually finish powering down. */
		cpu_exit_wait(processor->cpu_id);

		/* Publish the final offline state under the usual lock order. */
		s = splsched();
		simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
		pset_lock(pset);
		assert(processor->processor_inshutdown);
		assert(processor->processor_offline_state == PROCESSOR_OFFLINE_PENDING_OFFLINE);
		assert(processor->state == PROCESSOR_PENDING_OFFLINE);
		pset_update_processor_state(pset, processor, PROCESSOR_OFF_LINE);
		processor_update_offline_state_locked(processor, PROCESSOR_OFFLINE_CPU_OFFLINE);
		pset_unlock(pset);
		simple_unlock(&sched_available_cores_lock);
		splx(s);

		ml_broadcast_cpu_event(CPU_EXITED, processor->cpu_id);
		ml_cpu_power_disable(processor->cpu_id);

		assert(processor->processor_offline_state == PROCESSOR_OFFLINE_CPU_OFFLINE);
		processor_update_offline_state(processor, PROCESSOR_OFFLINE_FULLY_OFFLINE);
	}

	ml_cpu_end_state_transition(processor->cpu_id);
}
388
389 /*
390 * Called in the context of the idle thread to shut down the processor
391 *
392 * A shut-down processor looks like it's 'running' the idle thread parked
393 * in this routine, but it's actually been powered off and has no hardware state.
394 */
static void
processor_offline(
	void * parameter,
	__unused wait_result_t result)
{
	/* 'parameter' carries the flag stashed by processor_doshutdown(). */
	bool is_final_system_sleep = (bool) parameter;
	processor_t processor = current_processor();
	thread_t self = current_thread();
	__assert_only thread_t old_thread = THREAD_NULL;

	/* Must be on this processor's idle thread, interrupts disabled. */
	assert(self->state & TH_IDLE);
	assert(processor->idle_thread == self);
	assert(ml_get_interrupts_enabled() == FALSE);
	assert(self->continuation == NULL);
	assert(processor->processor_online == true);
	assert(processor->running_timers_active == false);

	if (is_final_system_sleep) {
		assert(processor == current_processor());
		assert(processor == master_processor);
		assert(processor_avail_count == 1);
	}

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROCESSOR_SHUTDOWN) | DBG_FUNC_START, processor->cpu_id);

	/* Snapshot the policy so disable/enable below stay paired. */
	bool enforce_quiesce_safety = gEnforcePlatformActionSafety;

	/*
	 * Scheduling is now disabled for this processor.
	 * Ensure that primitives that need scheduling (like mutexes) know this.
	 */
	if (enforce_quiesce_safety) {
		disable_preemption_without_measurements();
	}

#if CONFIG_DTRACE
	/* Tell DTrace this CPU is going away. */
	if (dtrace_cpu_state_changed_hook) {
		(*dtrace_cpu_state_changed_hook)(processor->cpu_id, FALSE);
	}
#endif

	smr_cpu_down(processor, SMR_CPU_REASON_OFFLINE);

	/* Drain pending IPIs for the last time here. */
	ml_cpu_down();

	sched_mark_processor_offline(processor, is_final_system_sleep);

	/*
	 * Switch to the interrupt stack and shut down the processor.
	 *
	 * When the processor comes back, it will eventually call load_context which
	 * restores the context saved by machine_processor_shutdown, returning here.
	 */
	old_thread = machine_processor_shutdown(self, processor_offline_intstack, processor);

	/*
	 * The processor is back. sched_mark_processor_online and
	 * friends have already run via processor_up.
	 */

	/* old_thread should be NULL because we got here through Load_context */
	assert(old_thread == THREAD_NULL);

	assert(processor == current_processor());
	assert(processor->idle_thread == current_thread());
	assert(processor->processor_online == true);

	assert(ml_get_interrupts_enabled() == FALSE);
	assert(self->continuation == NULL);

	/* Extract the machine_param value stashed by secondary_cpu_main */
	void * machine_param = self->parameter;
	self->parameter = NULL;

	processor_cpu_reinit(machine_param, true, is_final_system_sleep);

	if (enforce_quiesce_safety) {
		enable_preemption();
	}

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROCESSOR_SHUTDOWN) | DBG_FUNC_END, processor->cpu_id);

	/*
	 * Now that the processor is back, invoke the idle thread to find out what to do next.
	 * idle_thread will enable interrupts.
	 */
	thread_block(idle_thread);
	/*NOTREACHED*/
}
485
486 /*
487 * Complete the shutdown and place the processor offline.
488 *
489 * Called at splsched in the shutdown context
490 * (i.e. on the idle thread, on the interrupt stack)
491 *
492 * The onlining half of this is done in load_context().
493 */
static void
processor_offline_intstack(
	processor_t processor)
{
	assert(processor == current_processor());
	assert(processor->active_thread == current_thread());

	/* Account the elapsed time as idle time for this processor. */
	struct recount_snap snap = { 0 };
	recount_snapshot(&snap);
	recount_processor_idle(&processor->pr_recount, &snap);

	smr_cpu_leave(processor, processor->last_dispatch);

	PMAP_DEACTIVATE_KERNEL(processor->cpu_id);

	/* Power down; cpu_sleep() must never return. */
	cpu_sleep();
	panic("zombie processor");
	/*NOTREACHED*/
}
513
514 /*
515 * Called on the idle thread with interrupts disabled to initialize a
516 * secondary processor on boot or to reinitialize any processor on resume
517 * from processor offline.
518 */
void
processor_cpu_reinit(void* machine_param,
    __unused bool wait_for_cpu_signal,
    __assert_only bool is_final_system_sleep)
{
	/* Re-initialize the processor */
	machine_cpu_reinit(machine_param);

#if defined(__arm64__)
	/*
	 * See the comments for wait_while_mp_kdp_trap in processor_up().
	 *
	 * SIGPdisabled is cleared (to enable cpu_signal() to succeed with this processor)
	 * the first time we take an IPI. This is triggered by machine_cpu_reinit(), above,
	 * which calls cpu_machine_init()->PE_cpu_machine_init()->PE_cpu_signal() which sends
	 * a self-IPI to ensure that happens when we enable interrupts. So enable interrupts
	 * here so that cpu_signal() can succeed before we spin on mp_kdp_trap.
	 */
	assert_ml_cpu_signal_is_enabled(false);

	/* Briefly open the interrupt window so the pending self-IPI can land. */
	ml_set_interrupts_enabled(TRUE);

	if (wait_for_cpu_signal) {
		ml_wait_for_cpu_signal_to_enable();
	}

	ml_set_interrupts_enabled(FALSE);

	wait_while_mp_kdp_trap(true);

	/*
	 * At this point,
	 * if a stackshot or panic is in progress, we either spin on mp_kdp_trap
	 * or we successfully received a SIGPdebug signal which will cause us to
	 * break out of the spin on mp_kdp_trap and instead
	 * spin next time interrupts are enabled in idle_thread().
	 */
	if (wait_for_cpu_signal) {
		assert_ml_cpu_signal_is_enabled(true);
	}

	/*
	 * Now that we know SIGPdisabled is cleared, we can publish that
	 * this CPU has fully come out of offline state.
	 *
	 * Without wait_for_cpu_signal, we'll publish this earlier than
	 * cpu_signal is actually ready, but as long as it's ready by next S2R,
	 * it will be good enough.
	 */
	ml_cpu_up();
#endif

	processor_t processor = current_processor();

	/* Startup is complete: clear the flag and wake anyone waiting on it. */
	simple_lock(&processor_start_state_lock, LCK_GRP_NULL);
	assert(processor->processor_instartup == true || is_final_system_sleep);
	processor->processor_instartup = false;
	simple_unlock(&processor_start_state_lock);

	thread_wakeup((event_t)&processor->processor_instartup);
}
580
581 kern_return_t
host_get_boot_info(host_priv_t host_priv,kernel_boot_info_t boot_info)582 host_get_boot_info(
583 host_priv_t host_priv,
584 kernel_boot_info_t boot_info)
585 {
586 const char *src = "";
587 if (host_priv == HOST_PRIV_NULL) {
588 return KERN_INVALID_HOST;
589 }
590
591 /*
592 * Copy first operator string terminated by '\0' followed by
593 * standardized strings generated from boot string.
594 */
595 src = machine_boot_info(boot_info, KERNEL_BOOT_INFO_MAX);
596 if (src != boot_info) {
597 (void) strncpy(boot_info, src, KERNEL_BOOT_INFO_MAX);
598 }
599
600 return KERN_SUCCESS;
601 }
602
// These are configured through sysctls.
#if DEVELOPMENT || DEBUG
uint32_t phy_read_panic = 1;    /* nonzero => panic on slow physical reads (consumer not in this view — confirm) */
uint32_t phy_write_panic = 1;   /* nonzero => panic on slow physical writes (consumer not in this view — confirm) */
uint64_t simulate_stretched_io = 0; /* test knob: subtracted from the I/O start timestamp to fake slow I/O (see ml_io_read()) */
#else
uint32_t phy_read_panic = 0;
uint32_t phy_write_panic = 0;
#endif
612
613 #if !defined(__x86_64__)
614
#if DEVELOPMENT || DEBUG
/* Assumes a 24MHz timebase: 24 ticks per microsecond. */
static const uint64_t TIMEBASE_TICKS_PER_USEC = 24000000ULL / USEC_PER_SEC;
/* On DEV/DEBUG kernels, trace physical accesses slower than 100us by default. */
static const uint64_t DEFAULT_TRACE_PHY_TIMEOUT = 100 * TIMEBASE_TICKS_PER_USEC;
#else
/* Tracing of slow physical accesses is off by default on RELEASE kernels. */
static const uint64_t DEFAULT_TRACE_PHY_TIMEOUT = 0;
#endif

// The MACHINE_TIMEOUT facility only exists on ARM.
MACHINE_TIMEOUT_DEV_WRITEABLE(report_phy_read_delay_to, "report-phy-read-delay", 0, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
MACHINE_TIMEOUT_DEV_WRITEABLE(report_phy_write_delay_to, "report-phy-write-delay", 0, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
MACHINE_TIMEOUT_DEV_WRITEABLE(trace_phy_read_delay_to, "trace-phy-read-delay", DEFAULT_TRACE_PHY_TIMEOUT, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
MACHINE_TIMEOUT_DEV_WRITEABLE(trace_phy_write_delay_to, "trace-phy-write-delay", DEFAULT_TRACE_PHY_TIMEOUT, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
627
628 #if SCHED_HYGIENE_DEBUG
629 /*
630 * Note: The interrupt-masked timeout goes through two initializations - one
631 * early in boot and one later. Thus this function is also called twice and
632 * can't be marked '__startup_func'.
633 */
634 static void
ml_io_init_timeouts(void)635 ml_io_init_timeouts(void)
636 {
637 /*
638 * The timeouts may be completely disabled via an override.
639 */
640 if (kern_feature_override(KF_IO_TIMEOUT_OVRD)) {
641 os_atomic_store(&report_phy_write_delay_to, 0, relaxed);
642 os_atomic_store(&report_phy_read_delay_to, 0, relaxed);
643 return;
644 }
645
646 /*
647 * There may be no interrupt masked timeout set.
648 */
649 const uint64_t interrupt_masked_to = os_atomic_load(&interrupt_masked_timeout, relaxed);
650 if (interrupt_masked_timeout == 0) {
651 return;
652 }
653
654 /*
655 * Inherit from the interrupt masked timeout if smaller and the timeout
656 * hasn't been explicitly set via boot-arg.
657 */
658 uint64_t arg = 0;
659
660 if (!PE_parse_boot_argn("ml-timeout-report-phy-read-delay", &arg, sizeof(arg))) {
661 uint64_t report_phy_read_delay = os_atomic_load(&report_phy_read_delay_to, relaxed);
662 report_phy_read_delay = report_phy_read_delay == 0 ?
663 interrupt_masked_to :
664 MIN(report_phy_read_delay, interrupt_masked_to);
665 os_atomic_store(&report_phy_read_delay_to, report_phy_read_delay, relaxed);
666 }
667
668 if (!PE_parse_boot_argn("ml-timeout-report-phy-write-delay", &arg, sizeof(arg))) {
669 uint64_t report_phy_write_delay = os_atomic_load(&report_phy_write_delay_to, relaxed);
670 report_phy_write_delay = report_phy_write_delay == 0 ?
671 interrupt_masked_to :
672 MIN(report_phy_write_delay, interrupt_masked_to);
673 os_atomic_store(&report_phy_write_delay_to, report_phy_write_delay, relaxed);
674 }
675 }
676
677 /*
678 * It's important that this happens after machine timeouts have initialized so
679 * the correct timeouts can be inherited.
680 */
681 STARTUP(TIMEOUTS, STARTUP_RANK_SECOND, ml_io_init_timeouts);
682 #endif /* SCHED_HYGIENE_DEBUG */
683
684 extern pmap_paddr_t kvtophys(vm_offset_t va);
685 #endif /* !defined(__x86_64__) */
686
687 #if ML_IO_TIMEOUTS_ENABLED
688
/* Protects both override trees; taken only with interrupts disabled (see io_increase_timeouts()). */
static LCK_GRP_DECLARE(io_timeout_override_lock_grp, "io_timeout_override");
static LCK_SPIN_DECLARE(io_timeout_override_lock, &io_timeout_override_lock_grp);

/* One registered override: range [ioaddr_base, ioaddr_base + size) with per-direction timeouts. */
struct io_timeout_override_entry {
	RB_ENTRY(io_timeout_override_entry) tree; /* red-black tree linkage, keyed by ioaddr_base */

	uintptr_t ioaddr_base;   /* first address covered */
	unsigned int size;       /* range length in bytes */
	uint32_t read_timeout;   /* read timeout, absolute-time units (converted in io_increase_timeouts()) */
	uint32_t write_timeout;  /* write timeout, absolute-time units */
};
700
701 static inline int
io_timeout_override_cmp(const struct io_timeout_override_entry * a,const struct io_timeout_override_entry * b)702 io_timeout_override_cmp(const struct io_timeout_override_entry *a, const struct io_timeout_override_entry *b)
703 {
704 if (a->ioaddr_base < b->ioaddr_base) {
705 return -1;
706 } else if (a->ioaddr_base > b->ioaddr_base) {
707 return 1;
708 } else {
709 return 0;
710 }
711 }
712
713 static RB_HEAD(io_timeout_override, io_timeout_override_entry)
714 io_timeout_override_root_pa, io_timeout_override_root_va;
715
716 RB_PROTOTYPE_PREV(io_timeout_override, io_timeout_override_entry, tree, io_timeout_override_cmp);
717 RB_GENERATE_PREV(io_timeout_override, io_timeout_override_entry, tree, io_timeout_override_cmp);
718
/*
 * io_increase_timeouts:
 *
 * Register a read/write timeout override for the address range
 * [ioaddr_base, ioaddr_base + size) in the given tree.  Fails with
 * KERN_INVALID_ARGUMENT on a zero-length range, address wrap, a timeout
 * that doesn't fit in 32 bits of absolute time, or overlap with an
 * existing entry.
 */
static int
io_increase_timeouts(struct io_timeout_override *root, uintptr_t ioaddr_base,
    unsigned int size, uint32_t read_timeout_us, uint32_t write_timeout_us)
{
	/* Timeouts are stored in 32-bit fields of absolute time. */
	const uint64_t MAX_TIMEOUT_ABS = UINT32_MAX;

	assert(preemption_enabled());

	int ret = KERN_SUCCESS;

	if (size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	/* Reject ranges whose last byte would wrap the address space. */
	uintptr_t ioaddr_end;
	if (os_add_overflow(ioaddr_base, size - 1, &ioaddr_end)) {
		return KERN_INVALID_ARGUMENT;
	}

	/* Convert microseconds to absolute timebase units and range-check. */
	uint64_t read_timeout_abs, write_timeout_abs;
	nanoseconds_to_absolutetime(NSEC_PER_USEC * read_timeout_us, &read_timeout_abs);
	nanoseconds_to_absolutetime(NSEC_PER_USEC * write_timeout_us, &write_timeout_abs);
	if (read_timeout_abs > MAX_TIMEOUT_ABS || write_timeout_abs > MAX_TIMEOUT_ABS) {
		return KERN_INVALID_ARGUMENT;
	}

	/* Allocate before taking the spinlock; Z_NOFAIL means no NULL check. */
	struct io_timeout_override_entry *node = kalloc_type(struct io_timeout_override_entry, Z_WAITOK | Z_ZERO | Z_NOFAIL);
	node->ioaddr_base = ioaddr_base;
	node->size = size;
	node->read_timeout = (uint32_t)read_timeout_abs;
	node->write_timeout = (uint32_t)write_timeout_abs;

	/*
	 * Interrupt handlers are allowed to call ml_io_{read,write}*, so
	 * interrupts must be disabled any time io_timeout_override_lock is
	 * held. Otherwise the CPU could take an interrupt while holding the
	 * lock, invoke an ISR that calls ml_io_{read,write}*, and deadlock
	 * trying to acquire the lock again.
	 */
	boolean_t istate = ml_set_interrupts_enabled(FALSE);
	lck_spin_lock(&io_timeout_override_lock);
	/* RB_INSERT returns non-NULL when an entry with the same base exists. */
	if (RB_INSERT(io_timeout_override, root, node)) {
		ret = KERN_INVALID_ARGUMENT;
		goto out;
	}

	/* Check that this didn't create any new overlaps */
	struct io_timeout_override_entry *prev = RB_PREV(io_timeout_override, root, node);
	if (prev && (prev->ioaddr_base + prev->size) > node->ioaddr_base) {
		RB_REMOVE(io_timeout_override, root, node);
		ret = KERN_INVALID_ARGUMENT;
		goto out;
	}
	struct io_timeout_override_entry *next = RB_NEXT(io_timeout_override, root, node);
	if (next && (node->ioaddr_base + node->size) > next->ioaddr_base) {
		RB_REMOVE(io_timeout_override, root, node);
		ret = KERN_INVALID_ARGUMENT;
		goto out;
	}

out:
	lck_spin_unlock(&io_timeout_override_lock);
	ml_set_interrupts_enabled(istate);
	if (ret != KERN_SUCCESS) {
		/* Insertion failed or overlapped; free the unused node. */
		kfree_type(struct io_timeout_override_entry, node);
	}
	return ret;
}
787
788 static int
io_reset_timeouts(struct io_timeout_override * root,uintptr_t ioaddr_base,unsigned int size)789 io_reset_timeouts(struct io_timeout_override *root, uintptr_t ioaddr_base, unsigned int size)
790 {
791 assert(preemption_enabled());
792
793 struct io_timeout_override_entry key = { .ioaddr_base = ioaddr_base };
794
795 boolean_t istate = ml_set_interrupts_enabled(FALSE);
796 lck_spin_lock(&io_timeout_override_lock);
797 struct io_timeout_override_entry *node = RB_FIND(io_timeout_override, root, &key);
798 if (node) {
799 if (node->size == size) {
800 RB_REMOVE(io_timeout_override, root, node);
801 } else {
802 node = NULL;
803 }
804 }
805 lck_spin_unlock(&io_timeout_override_lock);
806 ml_set_interrupts_enabled(istate);
807
808 if (!node) {
809 return KERN_NOT_FOUND;
810 }
811
812 kfree_type(struct io_timeout_override_entry, node);
813 return KERN_SUCCESS;
814 }
815
816 static bool
io_override_timeout(struct io_timeout_override * root,uintptr_t addr,uint64_t * read_timeout,uint64_t * write_timeout)817 io_override_timeout(struct io_timeout_override *root, uintptr_t addr,
818 uint64_t *read_timeout, uint64_t *write_timeout)
819 {
820 assert(!ml_get_interrupts_enabled());
821 assert3p(read_timeout, !=, NULL);
822 assert3p(write_timeout, !=, NULL);
823
824 struct io_timeout_override_entry *node = RB_ROOT(root);
825
826 lck_spin_lock(&io_timeout_override_lock);
827 /* RB_FIND() doesn't support custom cmp functions, so we have to open-code our own */
828 while (node) {
829 if (node->ioaddr_base <= addr && addr < node->ioaddr_base + node->size) {
830 *read_timeout = node->read_timeout;
831 *write_timeout = node->write_timeout;
832 lck_spin_unlock(&io_timeout_override_lock);
833 return true;
834 } else if (addr < node->ioaddr_base) {
835 node = RB_LEFT(node, tree);
836 } else {
837 node = RB_RIGHT(node, tree);
838 }
839 }
840 lck_spin_unlock(&io_timeout_override_lock);
841
842 return false;
843 }
844
/*
 * io_override_timeout_ss:
 *
 * On arm64, apply a fixed timeout override for physical addresses that
 * fall in a strong-sync (PCIe) pmap I/O range.  Returns true and stores
 * both timeouts on a match; false otherwise (and always on non-arm64).
 */
static bool
io_override_timeout_ss(uint64_t paddr, uint64_t *read_timeout, uint64_t *write_timeout)
{
#if defined(__arm64__)

	/*
	 * PCIe regions are marked with PMAP_IO_RANGE_STRONG_SYNC. Apply a
	 * timeout greater than two PCIe completion timeouts (90ms) as they can
	 * stack.
	 */
#define STRONG_SYNC_TIMEOUT     2160000 /* 90ms at the 24MHz timebase */

	/* Look up the pmap I/O attributes for this physical address, if any. */
	pmap_io_range_t *range = pmap_find_io_attr(paddr);
	if (range != NULL && (range->wimg & PMAP_IO_RANGE_STRONG_SYNC) != 0) {
		*read_timeout = STRONG_SYNC_TIMEOUT;
		*write_timeout = STRONG_SYNC_TIMEOUT;
		return true;
	}
#else
	/* Strong-sync I/O ranges only exist on arm64. */
	(void)paddr;
	(void)read_timeout;
	(void)write_timeout;
#endif /* __arm64__ */
	return false;
}
870
871 /*
872 * Return timeout override values for the read/write timeout for a given
873 * address.
874 * A virtual address (vaddr), physical address (paddr) or both may be passed.
875 * Up to three separate timeout overrides can be found
876 * - A virtual address override
877 * - A physical address override
878 * - A strong sync override
879 * The largest override found is returned.
880 */
881 void
override_io_timeouts(uintptr_t vaddr,uint64_t paddr,uint64_t * read_timeout,uint64_t * write_timeout)882 override_io_timeouts(uintptr_t vaddr, uint64_t paddr, uint64_t *read_timeout,
883 uint64_t *write_timeout)
884 {
885 uint64_t rt_va = 0, wt_va = 0, rt_pa = 0, wt_pa = 0, rt_ss = 0, wt_ss = 0;
886
887 if (vaddr != 0) {
888 /* Override from virtual address. */
889 io_override_timeout(&io_timeout_override_root_va, vaddr, &rt_va, &wt_va);
890 }
891
892 if (paddr != 0) {
893 /* Override from physical address. */
894 io_override_timeout(&io_timeout_override_root_pa, paddr, &rt_pa, &wt_pa);
895
896 /* Override from strong sync range. */
897 io_override_timeout_ss(paddr, &rt_ss, &wt_ss);
898 }
899
900 if (read_timeout != NULL) {
901 *read_timeout = MAX(MAX(rt_va, rt_pa), rt_ss);
902 }
903
904 if (write_timeout != NULL) {
905 *write_timeout = MAX(MAX(wt_va, wt_pa), wt_ss);
906 }
907 }
908
909 #endif /* ML_IO_TIMEOUTS_ENABLED */
910
911 int
ml_io_increase_timeouts(uintptr_t ioaddr_base,unsigned int size,uint32_t read_timeout_us,uint32_t write_timeout_us)912 ml_io_increase_timeouts(uintptr_t ioaddr_base, unsigned int size,
913 uint32_t read_timeout_us, uint32_t write_timeout_us)
914 {
915 #if ML_IO_TIMEOUTS_ENABLED
916 const size_t MAX_SIZE = 4096;
917
918 if (size > MAX_SIZE) {
919 return KERN_INVALID_ARGUMENT;
920 }
921
922 return io_increase_timeouts(&io_timeout_override_root_va, ioaddr_base,
923 size, read_timeout_us, write_timeout_us);
924 #else
925 #pragma unused(ioaddr_base, size, read_timeout_us, write_timeout_us)
926 return KERN_SUCCESS;
927 #endif /* ML_IO_TIMEOUTS_ENABLED */
928 }
929
930 int
ml_io_increase_timeouts_phys(vm_offset_t ioaddr_base,unsigned int size,uint32_t read_timeout_us,uint32_t write_timeout_us)931 ml_io_increase_timeouts_phys(vm_offset_t ioaddr_base, unsigned int size,
932 uint32_t read_timeout_us, uint32_t write_timeout_us)
933 {
934 #if ML_IO_TIMEOUTS_ENABLED
935 return io_increase_timeouts(&io_timeout_override_root_pa, ioaddr_base,
936 size, read_timeout_us, write_timeout_us);
937 #else
938 #pragma unused(ioaddr_base, size, read_timeout_us, write_timeout_us)
939 return KERN_SUCCESS;
940 #endif /* ML_IO_TIMEOUTS_ENABLED */
941 }
942
943 int
ml_io_reset_timeouts(uintptr_t ioaddr_base,unsigned int size)944 ml_io_reset_timeouts(uintptr_t ioaddr_base, unsigned int size)
945 {
946 #if ML_IO_TIMEOUTS_ENABLED
947 return io_reset_timeouts(&io_timeout_override_root_va, ioaddr_base, size);
948 #else
949 #pragma unused(ioaddr_base, size)
950 return KERN_SUCCESS;
951 #endif /* ML_IO_TIMEOUTS_ENABLED */
952 }
953
954 int
ml_io_reset_timeouts_phys(vm_offset_t ioaddr_base,unsigned int size)955 ml_io_reset_timeouts_phys(vm_offset_t ioaddr_base, unsigned int size)
956 {
957 #if ML_IO_TIMEOUTS_ENABLED
958 return io_reset_timeouts(&io_timeout_override_root_pa, ioaddr_base, size);
959 #else
960 #pragma unused(ioaddr_base, size)
961 return KERN_SUCCESS;
962 #endif /* ML_IO_TIMEOUTS_ENABLED */
963 }
964
/*
 * ml_io_read: width-dispatched MMIO read of `size` (1, 2, 4 or 8) bytes
 * from `vaddr`; panics on any other size.  When IO timeouts are enabled
 * the access is timed with interrupts disabled: slow reads are recorded
 * via iotrace/DTrace/kdebug, may have their reporting threshold raised
 * by a per-address override (override_io_timeouts), and can panic when
 * phy_read_panic is set.  Returns the value read, zero-extended.
 */
unsigned long long
ml_io_read(uintptr_t vaddr, int size)
{
	unsigned long long result = 0;
	unsigned char s1;
	unsigned short s2;

#ifdef ML_IO_VERIFY_UNCACHEABLE
	/* Verifies the mapping is uncacheable and yields its physical address. */
	uintptr_t paddr = pmap_verify_noncacheable(vaddr);
#elif defined(ML_IO_TIMEOUTS_ENABLED)
	/* Resolved lazily via kvtophys() only if the access turns out slow. */
	uintptr_t paddr = 0;
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	uint64_t sabs, eabs;
	boolean_t istate, timeread = FALSE;
	uint64_t report_read_delay;
#if __x86_64__
	report_read_delay = report_phy_read_delay;
#else
	report_read_delay = os_atomic_load(&report_phy_read_delay_to, relaxed);
	uint64_t const trace_phy_read_delay = os_atomic_load(&trace_phy_read_delay_to, relaxed);
#endif /* __x86_64__ */

	/* Only pay for interrupt-masking and timestamps when reporting is on. */
	if (__improbable(report_read_delay != 0)) {
		istate = ml_set_interrupts_enabled_with_debug(false, false);
		sabs = ml_io_timestamp();
		timeread = TRUE;
	}

#ifdef ML_IO_SIMULATE_STRETCHED_ENABLED
	/* Test hook: back-date the start timestamp to fake a slow access. */
	if (__improbable(timeread && simulate_stretched_io)) {
		sabs -= simulate_stretched_io;
	}
#endif /* ML_IO_SIMULATE_STRETCHED_ENABLED */
#endif /* ML_IO_TIMEOUTS_ENABLED */

#if DEVELOPMENT || DEBUG
	/* Fence so the timebase read cannot be reordered past the device access. */
	boolean_t use_fences = !kern_feature_override(KF_IO_TIMEOUT_OVRD);
	if (use_fences) {
		ml_timebase_to_memory_fence();
	}
#endif

	/* The actual device access, dispatched on access width. */
	switch (size) {
	case 1:
		s1 = *(volatile unsigned char *)vaddr;
		result = s1;
		break;
	case 2:
		s2 = *(volatile unsigned short *)vaddr;
		result = s2;
		break;
	case 4:
		result = *(volatile unsigned int *)vaddr;
		break;
	case 8:
		result = *(volatile unsigned long long *)vaddr;
		break;
	default:
		panic("Invalid size %d for ml_io_read(%p)", size, (void *)vaddr);
		break;
	}

#if DEVELOPMENT || DEBUG
	if (use_fences) {
		ml_memory_to_timebase_fence();
	}
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	if (__improbable(timeread == TRUE)) {
		eabs = ml_io_timestamp();


		/* Prevent the processor from calling iotrace during its
		 * initialization procedure. */
		if (current_processor()->state == PROCESSOR_RUNNING) {
			iotrace(IOTRACE_IO_READ, vaddr, paddr, size, result, sabs, eabs - sabs);
		}

		if (__improbable((eabs - sabs) > report_read_delay)) {
			if (paddr == 0) {
				paddr = kvtophys(vaddr);
			}

			DTRACE_PHYSLAT5(physioread, uint64_t, (eabs - sabs),
			    uint64_t, vaddr, uint32_t, size, uint64_t, paddr, uint64_t, result);

			/* A registered per-range override may raise the threshold. */
			uint64_t override = 0;
			override_io_timeouts(vaddr, paddr, &override, NULL);

			if (override != 0) {
#if SCHED_HYGIENE_DEBUG
				/*
				 * The IO timeout was overridden. If we were called in an
				 * interrupt handler context, that can lead to a timeout
				 * panic, so we need to abandon the measurement.
				 */
				if (interrupt_masked_debug_mode == SCHED_HYGIENE_MODE_PANIC) {
					ml_irq_debug_abandon();
				}
#endif
				report_read_delay = override;
			}
		}

		/* Re-check against the (possibly overridden) threshold. */
		if (__improbable((eabs - sabs) > report_read_delay)) {
			if (phy_read_panic && (machine_timeout_suspended() == FALSE)) {
#if defined(__x86_64__)
				panic_notify();
#endif /* defined(__x86_64__) */
				uint64_t nsec = 0;
				absolutetime_to_nanoseconds(eabs - sabs, &nsec);
				panic("Read from IO vaddr 0x%lx paddr 0x%lx took %llu ns, "
				    "result: 0x%llx (start: %llu, end: %llu), ceiling: %llu",
				    vaddr, paddr, nsec, result, sabs, eabs,
				    report_read_delay);
			}
		}

		/* Separate (lower) threshold for kdebug tracing only. */
		if (__improbable(trace_phy_read_delay > 0 && (eabs - sabs) > trace_phy_read_delay)) {
			KDBG(MACHDBG_CODE(DBG_MACH_IO, DBC_MACH_IO_MMIO_READ),
			    (eabs - sabs), VM_KERNEL_UNSLIDE_OR_PERM(vaddr), paddr, result);
		}

		(void)ml_set_interrupts_enabled_with_debug(istate, false);
	}
#endif /* ML_IO_TIMEOUTS_ENABLED */
	return result;
}
1096
1097 unsigned int
ml_io_read8(uintptr_t vaddr)1098 ml_io_read8(uintptr_t vaddr)
1099 {
1100 return (unsigned) ml_io_read(vaddr, 1);
1101 }
1102
1103 unsigned int
ml_io_read16(uintptr_t vaddr)1104 ml_io_read16(uintptr_t vaddr)
1105 {
1106 return (unsigned) ml_io_read(vaddr, 2);
1107 }
1108
1109 unsigned int
ml_io_read32(uintptr_t vaddr)1110 ml_io_read32(uintptr_t vaddr)
1111 {
1112 return (unsigned) ml_io_read(vaddr, 4);
1113 }
1114
1115 unsigned long long
ml_io_read64(uintptr_t vaddr)1116 ml_io_read64(uintptr_t vaddr)
1117 {
1118 return ml_io_read(vaddr, 8);
1119 }
1120
1121 /* ml_io_write* */
1122
/*
 * ml_io_write: width-dispatched MMIO write of `val` (`size` = 1, 2, 4 or
 * 8 bytes) to `vaddr`; panics on any other size.  When IO timeouts are
 * enabled the access is timed with interrupts disabled: slow writes are
 * recorded via iotrace/DTrace/kdebug, may have their reporting threshold
 * raised by a per-address override (override_io_timeouts), and can panic
 * when phy_write_panic is set.
 */
void
ml_io_write(uintptr_t vaddr, uint64_t val, int size)
{
#ifdef ML_IO_VERIFY_UNCACHEABLE
	/* Verifies the mapping is uncacheable and yields its physical address. */
	uintptr_t paddr = pmap_verify_noncacheable(vaddr);
#elif defined(ML_IO_TIMEOUTS_ENABLED)
	/* Resolved lazily via kvtophys() only if the access turns out slow. */
	uintptr_t paddr = 0;
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	uint64_t sabs, eabs;
	boolean_t istate, timewrite = FALSE;
	uint64_t report_write_delay;
#if __x86_64__
	report_write_delay = report_phy_write_delay;
#else
	report_write_delay = os_atomic_load(&report_phy_write_delay_to, relaxed);
	uint64_t trace_phy_write_delay = os_atomic_load(&trace_phy_write_delay_to, relaxed);
#endif /* !defined(__x86_64__) */
	/* Only pay for interrupt-masking and timestamps when reporting is on. */
	if (__improbable(report_write_delay != 0)) {
		istate = ml_set_interrupts_enabled_with_debug(false, false);
		sabs = ml_io_timestamp();
		timewrite = TRUE;
	}

#ifdef ML_IO_SIMULATE_STRETCHED_ENABLED
	/* Test hook: back-date the start timestamp to fake a slow access. */
	if (__improbable(timewrite && simulate_stretched_io)) {
		sabs -= simulate_stretched_io;
	}
#endif /* ML_IO_SIMULATE_STRETCHED_ENABLED */
#endif /* ML_IO_TIMEOUTS_ENABLED */

#if DEVELOPMENT || DEBUG
	/* Fence so the timebase read cannot be reordered past the device access. */
	boolean_t use_fences = !kern_feature_override(KF_IO_TIMEOUT_OVRD);
	if (use_fences) {
		ml_timebase_to_memory_fence();
	}
#endif

	/* The actual device access, dispatched on access width. */
	switch (size) {
	case 1:
		*(volatile uint8_t *)vaddr = (uint8_t)val;
		break;
	case 2:
		*(volatile uint16_t *)vaddr = (uint16_t)val;
		break;
	case 4:
		*(volatile uint32_t *)vaddr = (uint32_t)val;
		break;
	case 8:
		*(volatile uint64_t *)vaddr = (uint64_t)val;
		break;
	default:
		panic("Invalid size %d for ml_io_write(%p, 0x%llx)", size, (void *)vaddr, val);
		break;
	}

#if DEVELOPMENT || DEBUG
	if (use_fences) {
		ml_memory_to_timebase_fence();
	}
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	if (__improbable(timewrite == TRUE)) {
		eabs = ml_io_timestamp();

		/* Prevent the processor from calling iotrace during its
		 * initialization procedure. */
		if (current_processor()->state == PROCESSOR_RUNNING) {
			iotrace(IOTRACE_IO_WRITE, vaddr, paddr, size, val, sabs, eabs - sabs);
		}


		if (__improbable((eabs - sabs) > report_write_delay)) {
			if (paddr == 0) {
				paddr = kvtophys(vaddr);
			}

			DTRACE_PHYSLAT5(physiowrite, uint64_t, (eabs - sabs),
			    uint64_t, vaddr, uint32_t, size, uint64_t, paddr, uint64_t, val);

			/* A registered per-range override may raise the threshold. */
			uint64_t override = 0;
			override_io_timeouts(vaddr, paddr, NULL, &override);

			if (override != 0) {
#if SCHED_HYGIENE_DEBUG
				/*
				 * The IO timeout was overridden. If we were called in an
				 * interrupt handler context, that can lead to a timeout
				 * panic, so we need to abandon the measurement.
				 */
				if (interrupt_masked_debug_mode == SCHED_HYGIENE_MODE_PANIC) {
					ml_irq_debug_abandon();
				}
#endif
				report_write_delay = override;
			}
		}

		/* Re-check against the (possibly overridden) threshold. */
		if (__improbable((eabs - sabs) > report_write_delay)) {
			if (phy_write_panic && (machine_timeout_suspended() == FALSE)) {
#if defined(__x86_64__)
				panic_notify();
#endif /* defined(__x86_64__) */

				uint64_t nsec = 0;
				absolutetime_to_nanoseconds(eabs - sabs, &nsec);
				panic("Write to IO vaddr %p paddr %p val 0x%llx took %llu ns,"
				    " (start: %llu, end: %llu), ceiling: %llu",
				    (void *)vaddr, (void *)paddr, val, nsec, sabs, eabs,
				    report_write_delay);
			}
		}

		/* Separate (lower) threshold for kdebug tracing only. */
		if (__improbable(trace_phy_write_delay > 0 && (eabs - sabs) > trace_phy_write_delay)) {
			KDBG(MACHDBG_CODE(DBG_MACH_IO, DBC_MACH_IO_MMIO_WRITE),
			    (eabs - sabs), VM_KERNEL_UNSLIDE_OR_PERM(vaddr), paddr, val);
		}

		(void)ml_set_interrupts_enabled_with_debug(istate, false);
	}
#endif /* ML_IO_TIMEOUTS_ENABLED */
}
1247
1248 void
ml_io_write8(uintptr_t vaddr,uint8_t val)1249 ml_io_write8(uintptr_t vaddr, uint8_t val)
1250 {
1251 ml_io_write(vaddr, val, 1);
1252 }
1253
1254 void
ml_io_write16(uintptr_t vaddr,uint16_t val)1255 ml_io_write16(uintptr_t vaddr, uint16_t val)
1256 {
1257 ml_io_write(vaddr, val, 2);
1258 }
1259
1260 void
ml_io_write32(uintptr_t vaddr,uint32_t val)1261 ml_io_write32(uintptr_t vaddr, uint32_t val)
1262 {
1263 ml_io_write(vaddr, val, 4);
1264 }
1265
1266 void
ml_io_write64(uintptr_t vaddr,uint64_t val)1267 ml_io_write64(uintptr_t vaddr, uint64_t val)
1268 {
1269 ml_io_write(vaddr, val, 8);
1270 }
1271
/* One registered CPU-event callback; elements form a singly-linked chain. */
struct cpu_callback_chain_elem {
	cpu_callback_t fn;                     /* callback to invoke */
	void *param;                           /* opaque argument passed back to fn */
	struct cpu_callback_chain_elem *next;  /* next element, or NULL at the tail */
};

/* Head of the chain: read lock-free by ml_broadcast_cpu_event(),
 * updated under cpu_callback_chain_lock by cpu_event_register_callback(). */
static struct cpu_callback_chain_elem *cpu_callback_chain;
static LCK_GRP_DECLARE(cpu_callback_chain_lock_grp, "cpu_callback_chain");
static LCK_SPIN_DECLARE(cpu_callback_chain_lock, &cpu_callback_chain_lock_grp);
1281
/*
 * Register a callback to be invoked by ml_broadcast_cpu_event() for every
 * CPU event.  The element is permanently allocated and prepended to the
 * chain; there is no way to unregister (see cpu_event_unregister_callback).
 */
void
cpu_event_register_callback(cpu_callback_t fn, void *param)
{
	struct cpu_callback_chain_elem *new_elem;

	new_elem = zalloc_permanent_type(struct cpu_callback_chain_elem);
	if (!new_elem) {
		panic("can't allocate cpu_callback_chain_elem");
	}

	lck_spin_lock(&cpu_callback_chain_lock);
	new_elem->next = cpu_callback_chain;
	new_elem->fn = fn;
	new_elem->param = param;
	/* Release store publishes the fully-initialized element to the
	 * lock-free readers in ml_broadcast_cpu_event(). */
	os_atomic_store(&cpu_callback_chain, new_elem, release);
	lck_spin_unlock(&cpu_callback_chain_lock);
}
1299
/*
 * Unregistering is intentionally unsupported: chain elements are
 * permanently allocated and traversed lock-free, so they can never be
 * safely unlinked or freed.  Always panics.
 */
__attribute__((noreturn))
void
cpu_event_unregister_callback(__unused cpu_callback_t fn)
{
	panic("Unfortunately, cpu_event_unregister_callback is unimplemented.");
}
1306
1307 void
ml_broadcast_cpu_event(enum cpu_event event,unsigned int cpu_or_cluster)1308 ml_broadcast_cpu_event(enum cpu_event event, unsigned int cpu_or_cluster)
1309 {
1310 struct cpu_callback_chain_elem *cursor;
1311
1312 cursor = os_atomic_load(&cpu_callback_chain, dependency);
1313 for (; cursor != NULL; cursor = cursor->next) {
1314 cursor->fn(cursor->param, event, cpu_or_cluster);
1315 }
1316 }
1317
1318 // Initialize Machine Timeouts (see the MACHINE_TIMEOUT macro
1319 // definition)
1320
1321 void
machine_timeout_init_with_suffix(const struct machine_timeout_spec * spec,char const * suffix)1322 machine_timeout_init_with_suffix(const struct machine_timeout_spec *spec, char const *suffix)
1323 {
1324 if (spec->skip_predicate != NULL && spec->skip_predicate(spec)) {
1325 // This timeout should be disabled.
1326 os_atomic_store_wide((uint64_t*)spec->ptr, 0, relaxed);
1327 return;
1328 }
1329
1330 assert(suffix != NULL);
1331 assert(strlen(spec->name) <= MACHINE_TIMEOUT_MAX_NAME_LEN);
1332
1333 size_t const suffix_len = strlen(suffix);
1334
1335 size_t const dt_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + suffix_len + 1;
1336 char dt_name[dt_name_size];
1337
1338 strlcpy(dt_name, spec->name, dt_name_size);
1339 strlcat(dt_name, suffix, dt_name_size);
1340
1341 size_t const scale_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + suffix_len + strlen("-scale") + 1;
1342 char scale_name[scale_name_size];
1343
1344 strlcpy(scale_name, spec->name, scale_name_size);
1345 strlcat(scale_name, suffix, scale_name_size);
1346 strlcat(scale_name, "-scale", scale_name_size);
1347
1348 size_t const boot_arg_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + strlen("ml-timeout-") + suffix_len + 1;
1349 char boot_arg_name[boot_arg_name_size];
1350
1351 strlcpy(boot_arg_name, "ml-timeout-", boot_arg_name_size);
1352 strlcat(boot_arg_name, spec->name, boot_arg_name_size);
1353 strlcat(boot_arg_name, suffix, boot_arg_name_size);
1354
1355 size_t const boot_arg_scale_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN +
1356 strlen("ml-timeout-") + strlen("-scale") + suffix_len + 1;
1357 char boot_arg_scale_name[boot_arg_scale_name_size];
1358
1359 strlcpy(boot_arg_scale_name, "ml-timeout-", boot_arg_scale_name_size);
1360 strlcat(boot_arg_scale_name, spec->name, boot_arg_scale_name_size);
1361 strlcat(boot_arg_scale_name, suffix, boot_arg_name_size);
1362 strlcat(boot_arg_scale_name, "-scale", boot_arg_scale_name_size);
1363
1364
1365 /*
1366 * Determine base value from DT and boot-args.
1367 */
1368
1369 DTEntry base, chosen;
1370
1371 if (SecureDTLookupEntry(NULL, "/machine-timeouts", &base) != kSuccess) {
1372 base = NULL;
1373 }
1374
1375 if (SecureDTLookupEntry(NULL, "/chosen/machine-timeouts", &chosen) != kSuccess) {
1376 chosen = NULL;
1377 }
1378
1379 uint64_t timeout = spec->default_value;
1380 bool found = false;
1381
1382 uint64_t const *data = NULL;
1383 unsigned int data_size = sizeof(*data);
1384
1385 /* First look in /machine-timeouts/<name> */
1386 if (base != NULL && SecureDTGetProperty(base, dt_name, (const void **)&data, &data_size) == kSuccess) {
1387 if (data_size != sizeof(*data)) {
1388 panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/%s", __func__, data_size, dt_name);
1389 }
1390
1391 timeout = *data;
1392 found = true;
1393 }
1394
1395 /* A value in /chosen/machine-timeouts/<name> overrides */
1396 if (chosen != NULL && SecureDTGetProperty(chosen, dt_name, (const void **)&data, &data_size) == kSuccess) {
1397 if (data_size != sizeof(*data)) {
1398 panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/%s", __func__, data_size, dt_name);
1399 }
1400
1401 timeout = *data;
1402 found = true;
1403 }
1404
1405 /* A boot-arg ml-timeout-<name> overrides */
1406 uint64_t boot_arg = 0;
1407
1408 if (PE_parse_boot_argn(boot_arg_name, &boot_arg, sizeof(boot_arg))) {
1409 timeout = boot_arg;
1410 found = true;
1411 }
1412
1413
1414 /*
1415 * Determine scale value from DT and boot-args.
1416 */
1417
1418 uint64_t scale = 1;
1419 uint32_t const *scale_data;
1420 unsigned int scale_size = sizeof(scale_data);
1421
1422 /* If there is a scale factor /machine-timeouts/<name>-scale, apply it. */
1423 if (base != NULL && SecureDTGetProperty(base, scale_name, (const void **)&scale_data, &scale_size) == kSuccess) {
1424 if (scale_size != sizeof(*scale_data)) {
1425 panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/%s-scale", __func__, scale_size, dt_name);
1426 }
1427
1428 scale = *scale_data;
1429 }
1430
1431 /* If there is a scale factor /chosen/machine-timeouts/<name>-scale, use that. */
1432 if (chosen != NULL && SecureDTGetProperty(chosen, scale_name, (const void **)&scale_data, &scale_size) == kSuccess) {
1433 if (scale_size != sizeof(*scale_data)) {
1434 panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/%s-scale", __func__,
1435 scale_size, dt_name);
1436 }
1437
1438 scale = *scale_data;
1439 }
1440
1441 /* Finally, a boot-arg ml-timeout-<name>-scale takes precedence. */
1442 if (PE_parse_boot_argn(boot_arg_scale_name, &boot_arg, sizeof(boot_arg))) {
1443 scale = boot_arg;
1444 }
1445
1446 static bool global_scale_set;
1447 static uint64_t global_scale;
1448
1449 if (!global_scale_set) {
1450 /* Apply /machine-timeouts/global-scale if present */
1451 if (SecureDTGetProperty(base, "global-scale", (const void **)&scale_data, &scale_size) == kSuccess) {
1452 if (scale_size != sizeof(*scale_data)) {
1453 panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/global-scale", __func__,
1454 scale_size);
1455 }
1456
1457 global_scale = *scale_data;
1458 global_scale_set = true;
1459 }
1460
1461 /* Use /chosen/machine-timeouts/global-scale if present */
1462 if (SecureDTGetProperty(chosen, "global-scale", (const void **)&scale_data, &scale_size) == kSuccess) {
1463 if (scale_size != sizeof(*scale_data)) {
1464 panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/global-scale", __func__,
1465 scale_size);
1466 }
1467
1468 global_scale = *scale_data;
1469 global_scale_set = true;
1470 }
1471
1472 /* Finally, the boot-arg ml-timeout-global-scale takes precedence. */
1473 if (PE_parse_boot_argn("ml-timeout-global-scale", &boot_arg, sizeof(boot_arg))) {
1474 global_scale = boot_arg;
1475 global_scale_set = true;
1476 }
1477 }
1478
1479 if (global_scale_set) {
1480 scale *= global_scale;
1481 }
1482
1483 /* Compute the final timeout, and done. */
1484 if (found && timeout > 0) {
1485 /* Only apply inherent unit scale if the value came in
1486 * externally. */
1487
1488 if (spec->unit_scale == MACHINE_TIMEOUT_UNIT_TIMEBASE) {
1489 uint64_t nanoseconds = timeout / 1000;
1490 nanoseconds_to_absolutetime(nanoseconds, &timeout);
1491 } else {
1492 timeout /= spec->unit_scale;
1493 }
1494
1495 if (timeout == 0) {
1496 /* Ensure unit scaling did not disable the timeout. */
1497 timeout = 1;
1498 }
1499 }
1500
1501 if (os_mul_overflow(timeout, scale, &timeout)) {
1502 timeout = UINT64_MAX; // clamp
1503 }
1504
1505 os_atomic_store_wide((uint64_t*)spec->ptr, timeout, relaxed);
1506 }
1507
/*
 * Initialize a machine timeout from its spec using the plain (un-suffixed)
 * DT property and boot-arg names — the common early-boot path.
 */
void
machine_timeout_init(const struct machine_timeout_spec *spec)
{
	machine_timeout_init_with_suffix(spec, "");
}
1513
#if DEVELOPMENT || DEBUG
/*
 * Late timeout (re-)initialization, at the end of bsd_init().
 * Re-reads the sched-hygiene timeouts using the "-b" suffixed DT
 * properties and boot-args, then re-derives the IO timeouts that may
 * inherit from interrupt_masked_timeout.
 */
void
machine_timeout_bsd_init(void)
{
	/* "-b" selects the late-boot variants of the DT/boot-arg names. */
	char const * const __unused mt_suffix = "-b";
#if SCHED_HYGIENE_DEBUG
	machine_timeout_init_with_suffix(MACHINE_TIMEOUT_SPEC_REF(interrupt_masked_timeout), mt_suffix);
	machine_timeout_init_with_suffix(MACHINE_TIMEOUT_SPEC_REF(sched_preemption_disable_threshold_mt), mt_suffix);

	/*
	 * The io timeouts can inherit from interrupt_masked_timeout.
	 * Re-initialize, as interrupt_masked_timeout may have changed.
	 */
	ml_io_init_timeouts();

	extern void preemption_disable_reset_max_durations(void);
	/*
	 * Reset the preemption disable stats, so that they are not
	 * polluted by long early boot code.
	 */
	preemption_disable_reset_max_durations();
#endif /* SCHED_HYGIENE_DEBUG */
}
#endif /* DEVELOPMENT || DEBUG */
1541
1542 #if ML_IO_TIMEOUTS_ENABLED && CONFIG_XNUPOST
1543 #include <tests/xnupost.h>
1544
1545 extern kern_return_t ml_io_timeout_test(void);
1546
1547 static inline void
ml_io_timeout_test_get_timeouts(uintptr_t vaddr,uint64_t * read_timeout,uint64_t * write_timeout)1548 ml_io_timeout_test_get_timeouts(uintptr_t vaddr, uint64_t *read_timeout, uint64_t *write_timeout)
1549 {
1550 *read_timeout = 0;
1551 *write_timeout = 0;
1552
1553 vm_offset_t paddr = kvtophys(vaddr);
1554
1555 boolean_t istate = ml_set_interrupts_enabled(FALSE);
1556 override_io_timeouts(vaddr, paddr, read_timeout, write_timeout);
1557 ml_set_interrupts_enabled(istate);
1558 }
1559
1560 static inline void
ml_io_timeout_test_get_timeouts_phys(vm_offset_t paddr,uint64_t * read_timeout,uint64_t * write_timeout)1561 ml_io_timeout_test_get_timeouts_phys(vm_offset_t paddr, uint64_t *read_timeout, uint64_t *write_timeout)
1562 {
1563 *read_timeout = 0;
1564 *write_timeout = 0;
1565
1566 boolean_t istate = ml_set_interrupts_enabled(FALSE);
1567 override_io_timeouts(0, paddr, read_timeout, write_timeout);
1568 ml_set_interrupts_enabled(istate);
1569 }
1570
1571 kern_return_t
ml_io_timeout_test(void)1572 ml_io_timeout_test(void)
1573 {
1574 const size_t SIZE = 16;
1575 /*
1576 * Page align the base address to ensure that the regions are physically
1577 * contiguous.
1578 */
1579 const uintptr_t iovaddr_base1 = (uintptr_t)kernel_pmap & ~PAGE_MASK;
1580
1581 const uintptr_t iovaddr_base2 = iovaddr_base1 + SIZE;
1582 const uintptr_t vaddr1 = iovaddr_base1 + SIZE / 2;
1583 const uintptr_t vaddr2 = iovaddr_base2 + SIZE / 2;
1584
1585 const vm_offset_t iopaddr_base1 = kvtophys(iovaddr_base1);
1586 const vm_offset_t iopaddr_base2 = kvtophys(iovaddr_base2);
1587 const vm_offset_t paddr1 = iopaddr_base1 + SIZE / 2;
1588 const vm_offset_t paddr2 = iopaddr_base2 + SIZE / 2;
1589
1590 const uint64_t READ_TIMEOUT1_US = 50000, WRITE_TIMEOUT1_US = 50001;
1591 const uint64_t READ_TIMEOUT2_US = 50002, WRITE_TIMEOUT2_US = 50003;
1592 uint64_t read_timeout1_abs, write_timeout1_abs;
1593 uint64_t read_timeout2_abs, write_timeout2_abs;
1594 nanoseconds_to_absolutetime(NSEC_PER_USEC * READ_TIMEOUT1_US, &read_timeout1_abs);
1595 nanoseconds_to_absolutetime(NSEC_PER_USEC * WRITE_TIMEOUT1_US, &write_timeout1_abs);
1596 nanoseconds_to_absolutetime(NSEC_PER_USEC * READ_TIMEOUT2_US, &read_timeout2_abs);
1597 nanoseconds_to_absolutetime(NSEC_PER_USEC * WRITE_TIMEOUT2_US, &write_timeout2_abs);
1598
1599 int err = ml_io_increase_timeouts(iovaddr_base1, 0, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1600 T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for empty region");
1601
1602 err = ml_io_increase_timeouts(iovaddr_base1, 4097, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1603 T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for region > 4096 bytes");
1604
1605 err = ml_io_increase_timeouts(UINTPTR_MAX, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1606 T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for overflowed region");
1607
1608 err = ml_io_increase_timeouts(iovaddr_base1, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1609 T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for first VA region should succeed");
1610
1611 err = ml_io_increase_timeouts(iovaddr_base2, SIZE, READ_TIMEOUT2_US, WRITE_TIMEOUT2_US);
1612 T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for second VA region should succeed");
1613
1614 err = ml_io_increase_timeouts(iovaddr_base1, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1615 T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for same region twice");
1616
1617 err = ml_io_increase_timeouts(vaddr1, (uint32_t)(vaddr2 - vaddr1), READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1618 T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for overlapping regions");
1619
1620 uint64_t read_timeout, write_timeout;
1621 ml_io_timeout_test_get_timeouts(vaddr1, &read_timeout, &write_timeout);
1622 T_EXPECT_EQ_ULLONG(read_timeout, read_timeout1_abs, "Read timeout for first region");
1623 T_EXPECT_EQ_ULLONG(write_timeout, write_timeout1_abs, "Write timeout for first region");
1624
1625 ml_io_timeout_test_get_timeouts(vaddr2, &read_timeout, &write_timeout);
1626 T_EXPECT_EQ_ULLONG(read_timeout, read_timeout2_abs, "Read timeout for first region");
1627 T_EXPECT_EQ_ULLONG(write_timeout, write_timeout2_abs, "Write timeout for first region");
1628
1629 ml_io_timeout_test_get_timeouts(iovaddr_base2 + SIZE, &read_timeout, &write_timeout);
1630 T_EXPECT_EQ_ULLONG(read_timeout, 0, "Read timeout without override");
1631 T_EXPECT_EQ_ULLONG(write_timeout, 0, "Write timeout without override");
1632
1633 err = ml_io_reset_timeouts(iovaddr_base1 + 1, SIZE - 1);
1634 T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for subregion");
1635
1636 err = ml_io_reset_timeouts(iovaddr_base2 + SIZE, SIZE);
1637 T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for non-existent region");
1638
1639 err = ml_io_reset_timeouts(iovaddr_base1, SIZE);
1640 T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for first VA region should succeed");
1641
1642 ml_io_timeout_test_get_timeouts(vaddr1, &read_timeout, &write_timeout);
1643 T_EXPECT_EQ_ULLONG(read_timeout, 0, "Read timeout for reset region");
1644 T_EXPECT_EQ_ULLONG(write_timeout, 0, "Write timeout for reset region");
1645
1646 err = ml_io_reset_timeouts(iovaddr_base1, SIZE);
1647 T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for same region twice");
1648
1649 err = ml_io_reset_timeouts(iovaddr_base2, SIZE);
1650 T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for second VA region should succeed");
1651
1652 err = ml_io_increase_timeouts_phys(iopaddr_base1, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1653 T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for first PA region should succeed");
1654
1655 err = ml_io_increase_timeouts_phys(iopaddr_base2, SIZE, READ_TIMEOUT2_US, WRITE_TIMEOUT2_US);
1656 T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for second PA region should succeed");
1657
1658 ml_io_timeout_test_get_timeouts(vaddr1, &read_timeout, &write_timeout);
1659 T_EXPECT_EQ_ULLONG(read_timeout, read_timeout1_abs, "Read timeout for first region");
1660 T_EXPECT_EQ_ULLONG(write_timeout, write_timeout1_abs, "Write timeout for first region");
1661
1662 ml_io_timeout_test_get_timeouts(vaddr2, &read_timeout, &write_timeout);
1663 T_EXPECT_EQ_ULLONG(read_timeout, read_timeout2_abs, "Read timeout for first region");
1664 T_EXPECT_EQ_ULLONG(write_timeout, write_timeout2_abs, "Write timeout for first region");
1665
1666 ml_io_timeout_test_get_timeouts_phys(paddr1, &read_timeout, &write_timeout);
1667 T_EXPECT_EQ_ULLONG(read_timeout, read_timeout1_abs, "Read timeout for first region");
1668 T_EXPECT_EQ_ULLONG(write_timeout, write_timeout1_abs, "Write timeout for first region");
1669
1670 ml_io_timeout_test_get_timeouts_phys(paddr2, &read_timeout, &write_timeout);
1671 T_EXPECT_EQ_ULLONG(read_timeout, read_timeout2_abs, "Read timeout for first physical region");
1672 T_EXPECT_EQ_ULLONG(write_timeout, write_timeout2_abs, "Write timeout for first physical region");
1673
1674 err = ml_io_reset_timeouts_phys(iopaddr_base1, SIZE);
1675 T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for first PA region should succeed");
1676
1677 err = ml_io_reset_timeouts_phys(iopaddr_base2, SIZE);
1678 T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for second PA region should succeed");
1679
1680 ml_io_timeout_test_get_timeouts_phys(paddr1, &read_timeout, &write_timeout);
1681 T_EXPECT_EQ_ULLONG(read_timeout, 0, "Read timeout for reset region");
1682 T_EXPECT_EQ_ULLONG(write_timeout, 0, "Write timeout for reset region");
1683
1684 return KERN_SUCCESS;
1685 }
#endif /* ML_IO_TIMEOUTS_ENABLED && CONFIG_XNUPOST */
1687