/*
 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  [email protected]
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	File:	kern/machine.c
 *	Author:	Avadis Tevanian, Jr.
 *	Date:	1987
 *
 *	Support for machine independent machine abstraction.
 */

#include <string.h>

#include <mach/mach_types.h>
#include <mach/boolean.h>
#include <mach/kern_return.h>
#include <mach/machine.h>
#include <mach/host_info.h>
#include <mach/host_reboot.h>
#include <mach/host_priv_server.h>
#include <mach/processor_server.h>
#include <mach/sdt.h>

#include <kern/kern_types.h>
#include <kern/cpu_data.h>
#include <kern/ipc_host.h>
#include <kern/host.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/percpu.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/sched.h>
#include <kern/startup.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/iotrace.h>

#include <libkern/OSDebug.h>
#if ML_IO_TIMEOUTS_ENABLED
#include <libkern/tree.h>
#endif

#include <pexpert/device_tree.h>

#include <machine/commpage.h>
#include <machine/machine_routines.h>

#if HIBERNATION
#include <IOKit/IOHibernatePrivate.h>
#endif
#include <IOKit/IOPlatformExpert.h>

#if CONFIG_DTRACE
extern void (*dtrace_cpu_state_changed_hook)(int, boolean_t);
#endif

#if defined(__arm64__)
extern void wait_while_mp_kdp_trap(bool check_SIGPdebug);
#include <arm/pmap/pmap_data.h>
#endif

#if defined(__x86_64__)
#include <i386/panic_notify.h>
#endif

/*
 * Exported variables:
 */

struct machine_info machine_info;

/* Forwards */
static void
processor_doshutdown(processor_t processor);

static void
processor_offline(void * parameter, __unused wait_result_t result);

static void
processor_offline_intstack(processor_t processor) __dead2;

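/*
 * processor_up_update_counts:
 *
 * Bump the available-CPU counters (global, user-visible, and primary)
 * for a processor coming online, then refresh the commpage copy.
 * Called with the pset lock held.
 */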
static void
processor_up_update_counts(processor_t processor)
{
	ml_cpu_up_update_counts(processor->cpu_id);

	os_atomic_inc(&processor_avail_count, relaxed);
	if (processor->is_recommended) {
		os_atomic_inc(&processor_avail_count_user, relaxed);
	}
	if (processor->processor_primary == processor) {
		os_atomic_inc(&primary_processor_avail_count, relaxed);
		if (processor->is_recommended) {
			os_atomic_inc(&primary_processor_avail_count_user, relaxed);
		}
	}
	commpage_update_active_cpus();
}

/*
 *	processor_up:
 *
 *	Flag processor as up and running, and available
 *	for scheduling.
 */
void
processor_up(
	processor_t processor)
{
	processor_set_t pset;
	spl_t s;

	s = splsched();
	init_ast_check(processor);

#if defined(__arm64__)
	/*
	 * A processor coming online won't have received a SIGPdebug signal
	 * to cause it to spin while a stackshot or panic is taking place,
	 * so spin here on mp_kdp_trap.
	 *
	 * However, since cpu_signal() is not yet enabled for this processor,
	 * there is a race if we have just passed this when a cpu_signal()
	 * is attempted. The sender will assume the cpu is offline, so it will
	 * not end up spinning anywhere. See processor_offline() for the fix
	 * for this race.
	 */
	wait_while_mp_kdp_trap(false);
#endif

	pset = processor->processor_set;
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
	pset_lock(pset);

	++pset->online_processor_count;
	simple_lock(&processor->start_state_lock, LCK_GRP_NULL);
	pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
	simple_unlock(&processor->start_state_lock);
	bool temporary = processor->shutdown_temporary;
	if (temporary) {
		processor->shutdown_temporary = false;
	} else {
		processor_up_update_counts(processor);
	}
	if (processor->is_recommended) {
		SCHED(pset_made_schedulable)(processor, pset, false);
	}
	pset_unlock(pset);
	ml_cpu_up();
	smr_cpu_up(processor, SMR_CPU_REASON_OFFLINE);
	sched_mark_processor_online_locked(processor, processor->last_startup_reason);
	simple_unlock(&sched_available_cores_lock);
	splx(s);

	thread_wakeup((event_t)&processor->state);

#if CONFIG_DTRACE
	if (dtrace_cpu_state_changed_hook) {
		(*dtrace_cpu_state_changed_hook)(processor->cpu_id, TRUE);
	}
#endif
}
#include <atm/atm_internal.h>

kern_return_t
host_reboot(
	host_priv_t host_priv,
	int options)
{
	if (host_priv == HOST_PRIV_NULL) {
		return KERN_INVALID_HOST;
	}

#if DEVELOPMENT || DEBUG
	if (options & HOST_REBOOT_DEBUGGER) {
		Debugger("Debugger");
		return KERN_SUCCESS;
	}
#endif

	if (options & HOST_REBOOT_UPSDELAY) {
		// UPS power cutoff path
		PEHaltRestart( kPEUPSDelayHaltCPU );
	} else {
		halt_all_cpus(!(options & HOST_REBOOT_HALT));
	}

	return KERN_SUCCESS;
}

kern_return_t
processor_assign(
	__unused processor_t processor,
	__unused processor_set_t new_pset,
	__unused boolean_t wait)
{
	return KERN_FAILURE;
}

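/*
 * processor_down_update_counts:
 *
 * Decrement the available-CPU counters (global, user-visible, and
 * primary) for a processor going offline, then refresh the commpage
 * copy. Called with the pset lock held.
 */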
static void
processor_down_update_counts(processor_t processor)
{
	ml_cpu_down_update_counts(processor->cpu_id);

	os_atomic_dec(&processor_avail_count, relaxed);
	if (processor->is_recommended) {
		os_atomic_dec(&processor_avail_count_user, relaxed);
	}
	if (processor->processor_primary == processor) {
		os_atomic_dec(&primary_processor_avail_count, relaxed);
		if (processor->is_recommended) {
			os_atomic_dec(&primary_processor_avail_count_user, relaxed);
		}
	}
	commpage_update_active_cpus();
}

extern lck_mtx_t processor_updown_lock;

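/*
 * processor_shutdown:
 *
 * Shut down a processor and mark it offline, waiting for any in-flight
 * startup or dispatch to complete first. A shutdown may be flagged
 * SHUTDOWN_TEMPORARY, in which case the availability counts are left
 * untouched and a later processor_up() restores the CPU without
 * re-counting it. Returns KERN_NOT_SUPPORTED if the platform code
 * disallows offlining this CPU, KERN_FAILURE if the processor is locked
 * against shutdown or cannot be marked offline, and KERN_SUCCESS
 * otherwise (including when it was already shut down compatibly).
 */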
kern_return_t
processor_shutdown(
	processor_t processor,
	processor_reason_t reason,
	uint32_t flags)
{
	if (!ml_cpu_can_exit(processor->cpu_id, reason)) {
		/*
		 * Failure if disallowed by arch code.
		 */
		return KERN_NOT_SUPPORTED;
	}

	lck_mtx_lock(&processor_updown_lock);

	spl_t s = splsched();
	processor_set_t pset = processor->processor_set;

	pset_lock(pset);

	if (processor->state == PROCESSOR_START) {
		pset_unlock(pset);
		splx(s);

		processor_wait_for_start(processor);

		s = splsched();
		pset_lock(pset);
	}

	/*
	 * If the processor is dispatching, let it finish.
	 */
	while (processor->state == PROCESSOR_DISPATCHING) {
		pset_unlock(pset);
		splx(s);
		delay(1);
		s = splsched();
		pset_lock(pset);
	}
	pset_unlock(pset);
	splx(s);

	kern_return_t mark_ret = sched_mark_processor_offline(processor, reason);
	if (mark_ret != KERN_SUCCESS) {
		/* Must fail or we deadlock */
		lck_mtx_unlock(&processor_updown_lock);
		return KERN_FAILURE;
	}

	ml_cpu_begin_state_transition(processor->cpu_id);
	s = splsched();

	pset_lock(pset);
	if (processor->state == PROCESSOR_OFF_LINE) {
		/*
		 * Success if already shutdown.
		 */
		if (processor->shutdown_temporary && !(flags & SHUTDOWN_TEMPORARY)) {
			/* Convert a temporary shutdown into a permanent shutdown */
			processor->shutdown_temporary = false;
			processor_down_update_counts(processor);
		}
		pset_unlock(pset);
		splx(s);
		ml_cpu_end_state_transition(processor->cpu_id);

		lck_mtx_unlock(&processor_updown_lock);
		return KERN_SUCCESS;
	}

	if (processor->shutdown_locked && (reason != REASON_SYSTEM)) {
		/*
		 * Failure if processor is locked against shutdown.
		 */
		pset_unlock(pset);
		splx(s);

		lck_mtx_unlock(&processor_updown_lock);
		return KERN_FAILURE;
	}

	/*
	 * If the processor is dispatching, let it finish.
	 */
	while (processor->state == PROCESSOR_DISPATCHING) {
		pset_unlock(pset);
		splx(s);
		delay(1);
		s = splsched();
		pset_lock(pset);
	}

	/*
	 * Success if already being shutdown with matching SHUTDOWN_TEMPORARY flag.
	 */
	if ((processor->state == PROCESSOR_SHUTDOWN) || (processor->state == PROCESSOR_PENDING_OFFLINE)) {
		bool success = (flags & SHUTDOWN_TEMPORARY) ? processor->shutdown_temporary : !processor->shutdown_temporary;

		pset_unlock(pset);
		splx(s);
		ml_cpu_end_state_transition(processor->cpu_id);

		lck_mtx_unlock(&processor_updown_lock);
		return success ? KERN_SUCCESS : KERN_FAILURE;
	}

	ml_broadcast_cpu_event(CPU_EXIT_REQUESTED, processor->cpu_id);
	pset_update_processor_state(pset, processor, PROCESSOR_SHUTDOWN);
	processor->last_shutdown_reason = reason;
	if (flags & SHUTDOWN_TEMPORARY) {
		processor->shutdown_temporary = true;
	}
	pset_unlock(pset);

	processor_doshutdown(processor);
	splx(s);

	cpu_exit_wait(processor->cpu_id);

	if (processor != master_processor) {
		s = splsched();
		pset_lock(pset);
		pset_update_processor_state(pset, processor, PROCESSOR_OFF_LINE);
		pset_unlock(pset);
		splx(s);
	}

	ml_cpu_end_state_transition(processor->cpu_id);
	ml_broadcast_cpu_event(CPU_EXITED, processor->cpu_id);
	ml_cpu_power_disable(processor->cpu_id);

	lck_mtx_unlock(&processor_updown_lock);
	return KERN_SUCCESS;
}

/*
 * Called with interrupts disabled.
 */
static void
processor_doshutdown(
	processor_t processor)
{
	thread_t self = current_thread();

	/*
	 * Get onto the processor to shutdown
	 */
	processor_t prev = thread_bind(processor);
	thread_block(THREAD_CONTINUE_NULL);

	/* interrupts still disabled */
	assert(ml_get_interrupts_enabled() == FALSE);

	assert(processor == current_processor());
	assert(processor->state == PROCESSOR_SHUTDOWN);

#if CONFIG_DTRACE
	if (dtrace_cpu_state_changed_hook) {
		(*dtrace_cpu_state_changed_hook)(processor->cpu_id, FALSE);
	}
#endif

#if defined(__arm64__)
	/*
	 * Catch a processor going offline
	 * while a panic or stackshot is in progress, as it won't
	 * receive a SIGPdebug now that interrupts are disabled.
	 */
	wait_while_mp_kdp_trap(false);
#endif

	smr_cpu_down(processor, SMR_CPU_REASON_OFFLINE);
	ml_cpu_down();

#if HIBERNATION
	if (processor_avail_count < 2) {
		hibernate_vm_lock();
		hibernate_vm_unlock();
	}
#endif

	processor_set_t pset = processor->processor_set;

	pset_lock(pset);
	pset_update_processor_state(pset, processor, PROCESSOR_PENDING_OFFLINE);
	--pset->online_processor_count;
	if (!processor->shutdown_temporary) {
		processor_down_update_counts(processor);
	}
	SCHED(processor_queue_shutdown)(processor);
	/* pset lock dropped */
	SCHED(rt_queue_shutdown)(processor);

	thread_bind(prev);

	/* interrupts still disabled */

	/*
	 * Continue processor shutdown on the processor's idle thread.
	 * The handoff won't fail because the idle thread has a reserved stack.
	 * Switching to the idle thread leaves interrupts disabled,
	 * so we can't accidentally take an interrupt after the context switch.
	 */
	thread_t shutdown_thread = processor->idle_thread;
	shutdown_thread->continuation = processor_offline;
	shutdown_thread->parameter = processor;

	thread_run(self, NULL, NULL, shutdown_thread);
}

/*
 * Called in the context of the idle thread to shut down the processor
 *
 * A shut-down processor looks like it's 'running' the idle thread parked
 * in this routine, but it's actually been powered off and has no hardware state.
 */
static void
processor_offline(
	void * parameter,
	__unused wait_result_t result)
{
	processor_t processor = (processor_t) parameter;
	thread_t self = current_thread();
	__assert_only thread_t old_thread = THREAD_NULL;

	assert(processor == current_processor());
	assert(self->state & TH_IDLE);
	assert(processor->idle_thread == self);
	assert(ml_get_interrupts_enabled() == FALSE);
	assert(self->continuation == NULL);
	assert(processor->processor_offlined == false);
	assert(processor->running_timers_active == false);

	bool enforce_quiesce_safety = gEnforcePlatformActionSafety;

	/*
	 * Scheduling is now disabled for this processor.
	 * Ensure that primitives that need scheduling (like mutexes) know this.
	 */
	if (enforce_quiesce_safety) {
		disable_preemption_without_measurements();
	}

	/* convince slave_main to come back here */
	processor->processor_offlined = true;

	/*
	 * Switch to the interrupt stack and shut down the processor.
	 *
	 * When the processor comes back, it will eventually call load_context which
	 * restores the context saved by machine_processor_shutdown, returning here.
	 */
	old_thread = machine_processor_shutdown(self, processor_offline_intstack, processor);

	/* old_thread should be NULL because we got here through Load_context */
	assert(old_thread == THREAD_NULL);

	assert(processor == current_processor());
	assert(processor->idle_thread == current_thread());

	assert(ml_get_interrupts_enabled() == FALSE);
	assert(self->continuation == NULL);

	/* Extract the machine_param value stashed by slave_main */
	void * machine_param = self->parameter;
	self->parameter = NULL;

	/* Re-initialize the processor */
	slave_machine_init(machine_param);

	assert(processor->processor_offlined == true);
	processor->processor_offlined = false;

	if (enforce_quiesce_safety) {
		enable_preemption();
	}

#if defined(__arm64__)
	/*
	 * See the comments about mp_kdp_trap in processor_up().
	 *
	 * SIGPdisabled is cleared (to enable cpu_signal() to succeed with this processor)
	 * the first time we take an IPI. This is triggered by slave_machine_init(), above,
	 * which calls cpu_machine_init()->PE_cpu_machine_init()->PE_cpu_signal() which sends
	 * a self-IPI to ensure that happens when we enable interrupts. So enable interrupts
	 * here so that cpu_signal() can succeed before we spin on mp_kdp_trap.
	 */
	ml_set_interrupts_enabled(TRUE);

	ml_set_interrupts_enabled(FALSE);

	wait_while_mp_kdp_trap(true);

	/*
	 * At this point,
	 * if a stackshot or panic is in progress, we either spin on mp_kdp_trap
	 * or we successfully received a SIGPdebug signal which will cause us to
	 * break out of the spin on mp_kdp_trap and instead
	 * spin next time interrupts are enabled in idle_thread().
	 */
#endif

	/*
	 * Now that the processor is back, invoke the idle thread to find out what to do next.
	 * idle_thread will enable interrupts.
	 */
	thread_block(idle_thread);
	/*NOTREACHED*/
}

/*
 * Complete the shutdown and place the processor offline.
 *
 * Called at splsched in the shutdown context
 * (i.e. on the idle thread, on the interrupt stack)
 *
 * The onlining half of this is done in load_context().
 */
static void
processor_offline_intstack(
	processor_t processor)
{
	assert(processor == current_processor());
	assert(processor->active_thread == current_thread());

	struct recount_snap snap = { 0 };
	recount_snapshot(&snap);
	recount_processor_idle(&processor->pr_recount, &snap);

	smr_cpu_leave(processor, processor->last_dispatch);

	PMAP_DEACTIVATE_KERNEL(processor->cpu_id);

	cpu_sleep();
	panic("zombie processor");
	/*NOTREACHED*/
}

kern_return_t
host_get_boot_info(
	host_priv_t host_priv,
	kernel_boot_info_t boot_info)
{
	const char *src = "";
	if (host_priv == HOST_PRIV_NULL) {
		return KERN_INVALID_HOST;
	}

	/*
	 * Copy first operator string terminated by '\0' followed by
	 * standardized strings generated from boot string.
	 */
	src = machine_boot_info(boot_info, KERNEL_BOOT_INFO_MAX);
	if (src != boot_info) {
		(void) strncpy(boot_info, src, KERNEL_BOOT_INFO_MAX);
	}

	return KERN_SUCCESS;
}

// These are configured through sysctls.
#if DEVELOPMENT || DEBUG
uint32_t phy_read_panic = 1;
uint32_t phy_write_panic = 1;
uint64_t simulate_stretched_io = 0;
#else
uint32_t phy_read_panic = 0;
uint32_t phy_write_panic = 0;
#endif

#if !defined(__x86_64__)

#if DEVELOPMENT || DEBUG
static const uint64_t TIMEBASE_TICKS_PER_USEC = 24000000ULL / USEC_PER_SEC;
static const uint64_t DEFAULT_TRACE_PHY_TIMEOUT = 100 * TIMEBASE_TICKS_PER_USEC;
#else
static const uint64_t DEFAULT_TRACE_PHY_TIMEOUT = 0;
#endif

// The MACHINE_TIMEOUT facility only exists on ARM.
MACHINE_TIMEOUT_DEV_WRITEABLE(report_phy_read_delay_to, "report-phy-read-delay", 0, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
MACHINE_TIMEOUT_DEV_WRITEABLE(report_phy_write_delay_to, "report-phy-write-delay", 0, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
MACHINE_TIMEOUT_DEV_WRITEABLE(trace_phy_read_delay_to, "trace-phy-read-delay", DEFAULT_TRACE_PHY_TIMEOUT, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
MACHINE_TIMEOUT_DEV_WRITEABLE(trace_phy_write_delay_to, "trace-phy-write-delay", DEFAULT_TRACE_PHY_TIMEOUT, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);

#if SCHED_HYGIENE_DEBUG
/*
 * Note: The interrupt-masked timeout goes through two initializations - one
 * early in boot and one later. Thus this function is also called twice and
 * can't be marked '__startup_func'.
 */
static void
ml_io_init_timeouts(void)
{
	/*
	 * The timeouts may be completely disabled via an override. Check that
	 * last and set the timeouts to zero (disabling) if that's the case.
	 */
	if (kern_feature_override(KF_IO_TIMEOUT_OVRD)) {
		os_atomic_store(&report_phy_write_delay_to, 0, relaxed);
		os_atomic_store(&report_phy_read_delay_to, 0, relaxed);
	}
}

/*
 * It's important that this happens after machine timeouts have initialized so
 * the correct timeouts can be inherited.
 */
STARTUP(TIMEOUTS, STARTUP_RANK_SECOND, ml_io_init_timeouts);
#endif /* SCHED_HYGIENE_DEBUG */

extern pmap_paddr_t kvtophys(vm_offset_t va);
#endif

#if ML_IO_TIMEOUTS_ENABLED

static LCK_GRP_DECLARE(io_timeout_override_lock_grp, "io_timeout_override");
static LCK_SPIN_DECLARE(io_timeout_override_lock, &io_timeout_override_lock_grp);

struct io_timeout_override_entry {
	RB_ENTRY(io_timeout_override_entry) tree;

	uintptr_t iovaddr_base;
	unsigned int size;
	uint32_t read_timeout;
	uint32_t write_timeout;
};

static inline int
io_timeout_override_cmp(const struct io_timeout_override_entry *a, const struct io_timeout_override_entry *b)
{
	if (a->iovaddr_base < b->iovaddr_base) {
		return -1;
	} else if (a->iovaddr_base > b->iovaddr_base) {
		return 1;
	} else {
		return 0;
	}
}

static RB_HEAD(io_timeout_override, io_timeout_override_entry) io_timeout_override_root;
RB_PROTOTYPE_PREV(io_timeout_override, io_timeout_override_entry, tree, io_timeout_override_cmp);
RB_GENERATE_PREV(io_timeout_override, io_timeout_override_entry, tree, io_timeout_override_cmp);

#endif /* ML_IO_TIMEOUTS_ENABLED */

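/*
 * ml_io_increase_timeouts:
 *
 * Register increased read/write timeouts (in microseconds) for the MMIO
 * region [iovaddr_base, iovaddr_base + size). The region must be no
 * larger than 4096 bytes and must not overlap a previously registered
 * region; violations return KERN_INVALID_ARGUMENT.
 *
 * For illustration only (the device names below are hypothetical):
 *
 *	if (ml_io_increase_timeouts(regs_base, regs_size, 500, 500) == KERN_SUCCESS) {
 *		// ... access the slow device ...
 *		ml_io_reset_timeouts(regs_base, regs_size);
 *	}
 */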
int
ml_io_increase_timeouts(uintptr_t iovaddr_base, unsigned int size, uint32_t read_timeout_us, uint32_t write_timeout_us)
{
#if ML_IO_TIMEOUTS_ENABLED
	const size_t MAX_SIZE = 4096;
	const uint64_t MAX_TIMEOUT_ABS = UINT32_MAX;

	assert(preemption_enabled());

	int ret = KERN_SUCCESS;

	if (size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	uintptr_t iovaddr_end;
	if (size > MAX_SIZE || os_add_overflow(iovaddr_base, size - 1, &iovaddr_end)) {
		return KERN_INVALID_ARGUMENT;
	}

	uint64_t read_timeout_abs, write_timeout_abs;
	nanoseconds_to_absolutetime(NSEC_PER_USEC * read_timeout_us, &read_timeout_abs);
	nanoseconds_to_absolutetime(NSEC_PER_USEC * write_timeout_us, &write_timeout_abs);
	if (read_timeout_abs > MAX_TIMEOUT_ABS || write_timeout_abs > MAX_TIMEOUT_ABS) {
		return KERN_INVALID_ARGUMENT;
	}

	struct io_timeout_override_entry *node = kalloc_type(struct io_timeout_override_entry, Z_WAITOK | Z_ZERO | Z_NOFAIL);
	node->iovaddr_base = iovaddr_base;
	node->size = size;
	node->read_timeout = (uint32_t)read_timeout_abs;
	node->write_timeout = (uint32_t)write_timeout_abs;

	/*
	 * Interrupt handlers are allowed to call ml_io_{read,write}*, so
	 * interrupts must be disabled any time io_timeout_override_lock is
	 * held. Otherwise the CPU could take an interrupt while holding the
	 * lock, invoke an ISR that calls ml_io_{read,write}*, and deadlock
	 * trying to acquire the lock again.
	 */
	boolean_t istate = ml_set_interrupts_enabled(FALSE);
	lck_spin_lock(&io_timeout_override_lock);
	if (RB_INSERT(io_timeout_override, &io_timeout_override_root, node)) {
		ret = KERN_INVALID_ARGUMENT;
		goto out;
	}

	/* Check that this didn't create any new overlaps */
	struct io_timeout_override_entry *prev = RB_PREV(io_timeout_override, &io_timeout_override_root, node);
	if (prev && (prev->iovaddr_base + prev->size) > node->iovaddr_base) {
		RB_REMOVE(io_timeout_override, &io_timeout_override_root, node);
		ret = KERN_INVALID_ARGUMENT;
		goto out;
	}
	struct io_timeout_override_entry *next = RB_NEXT(io_timeout_override, &io_timeout_override_root, node);
	if (next && (node->iovaddr_base + node->size) > next->iovaddr_base) {
		RB_REMOVE(io_timeout_override, &io_timeout_override_root, node);
		ret = KERN_INVALID_ARGUMENT;
		goto out;
	}

out:
	lck_spin_unlock(&io_timeout_override_lock);
	ml_set_interrupts_enabled(istate);
	if (ret != KERN_SUCCESS) {
		kfree_type(struct io_timeout_override_entry, node);
	}
	return ret;
#else /* !ML_IO_TIMEOUTS_ENABLED */
#pragma unused(iovaddr_base, size, read_timeout_us, write_timeout_us)
	return KERN_SUCCESS;
#endif
}

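/*
 * ml_io_reset_timeouts:
 *
 * Remove a timeout override previously registered with
 * ml_io_increase_timeouts(). Both the base address and the size must
 * match the original registration exactly; returns KERN_NOT_FOUND
 * otherwise.
 */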
int
ml_io_reset_timeouts(uintptr_t iovaddr_base, unsigned int size)
{
#if ML_IO_TIMEOUTS_ENABLED
	assert(preemption_enabled());

	struct io_timeout_override_entry key = { .iovaddr_base = iovaddr_base };

	boolean_t istate = ml_set_interrupts_enabled(FALSE);
	lck_spin_lock(&io_timeout_override_lock);
	struct io_timeout_override_entry *node = RB_FIND(io_timeout_override, &io_timeout_override_root, &key);
	if (node) {
		if (node->size == size) {
			RB_REMOVE(io_timeout_override, &io_timeout_override_root, node);
		} else {
			node = NULL;
		}
	}
	lck_spin_unlock(&io_timeout_override_lock);
	ml_set_interrupts_enabled(istate);

	if (!node) {
		return KERN_NOT_FOUND;
	}

	kfree_type(struct io_timeout_override_entry, node);
#else /* !ML_IO_TIMEOUTS_ENABLED */
#pragma unused(iovaddr_base, size)
#endif
	return KERN_SUCCESS;
}

#if ML_IO_TIMEOUTS_ENABLED

static bool
override_io_timeouts_va(uintptr_t vaddr, uint64_t *read_timeout, uint64_t *write_timeout)
{
	assert(!ml_get_interrupts_enabled());

	struct io_timeout_override_entry *node;

	lck_spin_lock(&io_timeout_override_lock);
	/* Read the tree root only under the lock; concurrent inserts may rotate it. */
	node = RB_ROOT(&io_timeout_override_root);
	/*
	 * RB_FIND() only does exact-key lookups, but we need a range
	 * (containment) match, so open-code the search.
	 */
	while (node) {
		if (node->iovaddr_base <= vaddr && vaddr < node->iovaddr_base + node->size) {
			if (read_timeout) {
				*read_timeout = node->read_timeout;
			}
			if (write_timeout) {
				*write_timeout = node->write_timeout;
			}
			lck_spin_unlock(&io_timeout_override_lock);
			return true;
		} else if (vaddr < node->iovaddr_base) {
			node = RB_LEFT(node, tree);
		} else {
			node = RB_RIGHT(node, tree);
		}
	}
	lck_spin_unlock(&io_timeout_override_lock);

	return false;
}

static bool
override_io_timeouts_pa(uint64_t paddr, uint64_t *read_timeout, uint64_t *write_timeout)
{
#if defined(__arm64__)
	/*
	 * PCIe regions are marked with PMAP_IO_RANGE_STRONG_SYNC. Apply a
	 * timeout greater than the PCIe completion timeout (50ms). In some
	 * cases those timeouts can stack so make the timeout significantly
	 * higher.
	 */
#define STRONG_SYNC_TIMEOUT 1800000 /* 75ms in timebase ticks */

	pmap_io_range_t *range = pmap_find_io_attr(paddr);
	if (range != NULL && (range->wimg & PMAP_IO_RANGE_STRONG_SYNC) != 0) {
		if (read_timeout) {
			*read_timeout = STRONG_SYNC_TIMEOUT;
		}
		if (write_timeout) {
			*write_timeout = STRONG_SYNC_TIMEOUT;
		}

		return true;
	}
#else
	(void)paddr;
	(void)read_timeout;
	(void)write_timeout;
#endif /* __arm64__ */
	return false;
}

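/*
 * override_io_timeouts:
 *
 * Look up a timeout override for an MMIO access, first by virtual
 * address (explicit registrations), then by physical address
 * (strongly-synchronized ranges). The first match wins; *read_timeout
 * and *write_timeout are left untouched if there is no override.
 */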
void
override_io_timeouts(uintptr_t vaddr, uint64_t paddr, uint64_t *read_timeout, uint64_t *write_timeout)
{
	if (vaddr != 0 &&
	    override_io_timeouts_va(vaddr, read_timeout, write_timeout)) {
		return;
	}

	if (paddr != 0 &&
	    override_io_timeouts_pa(paddr, read_timeout, write_timeout)) {
		return;
	}
}
#endif /* ML_IO_TIMEOUTS_ENABLED */

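/*
 * ml_io_read:
 *
 * Perform a volatile MMIO read of the given width (1, 2, 4, or 8
 * bytes). When a reporting timeout is configured, the access is timed
 * with interrupts disabled; accesses exceeding the (possibly
 * per-region overridden) threshold are traced and may panic.
 */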
unsigned long long
ml_io_read(uintptr_t vaddr, int size)
{
	unsigned long long result = 0;
	unsigned char s1;
	unsigned short s2;

#ifdef ML_IO_VERIFY_UNCACHEABLE
	uintptr_t const paddr = pmap_verify_noncacheable(vaddr);
#elif defined(ML_IO_TIMEOUTS_ENABLED)
	uintptr_t const paddr = kvtophys(vaddr);
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	uint64_t sabs, eabs;
	boolean_t istate, timeread = FALSE;
	uint64_t report_read_delay;
#if __x86_64__
	report_read_delay = report_phy_read_delay;
#else
	report_read_delay = os_atomic_load(&report_phy_read_delay_to, relaxed);
	uint64_t const trace_phy_read_delay = os_atomic_load(&trace_phy_read_delay_to, relaxed);
#endif /* __x86_64__ */

	if (__improbable(report_read_delay != 0)) {
		istate = ml_set_interrupts_enabled(FALSE);
		sabs = mach_absolute_time();
		timeread = TRUE;
	}

#ifdef ML_IO_SIMULATE_STRETCHED_ENABLED
	if (__improbable(timeread && simulate_stretched_io)) {
		sabs -= simulate_stretched_io;
	}
#endif /* ML_IO_SIMULATE_STRETCHED_ENABLED */
#endif /* ML_IO_TIMEOUTS_ENABLED */

#if DEVELOPMENT || DEBUG
	boolean_t use_fences = !kern_feature_override(KF_IO_TIMEOUT_OVRD);
	if (use_fences) {
		ml_timebase_to_memory_fence();
	}
#endif

	switch (size) {
	case 1:
		s1 = *(volatile unsigned char *)vaddr;
		result = s1;
		break;
	case 2:
		s2 = *(volatile unsigned short *)vaddr;
		result = s2;
		break;
	case 4:
		result = *(volatile unsigned int *)vaddr;
		break;
	case 8:
		result = *(volatile unsigned long long *)vaddr;
		break;
	default:
		panic("Invalid size %d for ml_io_read(%p)", size, (void *)vaddr);
		break;
	}

#if DEVELOPMENT || DEBUG
	if (use_fences) {
		ml_memory_to_timebase_fence();
	}
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	if (__improbable(timeread == TRUE)) {
		eabs = mach_absolute_time();

		/* Prevent the processor from calling iotrace during its
		 * initialization procedure. */
		if (current_processor()->state == PROCESSOR_RUNNING) {
			iotrace(IOTRACE_IO_READ, vaddr, paddr, size, result, sabs, eabs - sabs);
		}

		if (__improbable((eabs - sabs) > report_read_delay)) {
			DTRACE_PHYSLAT5(physioread, uint64_t, (eabs - sabs),
			    uint64_t, vaddr, uint32_t, size, uint64_t, paddr, uint64_t, result);

			uint64_t override = 0;
			override_io_timeouts(vaddr, paddr, &override, NULL);

			if (override != 0) {
#if SCHED_HYGIENE_DEBUG
				/*
				 * The IO timeout was overridden. As interrupts are disabled in
				 * order to accurately measure IO time this can cause the
				 * interrupt masked timeout threshold to be exceeded. If the
				 * interrupt masked debug mode is set to panic, abandon the
				 * measurement. If in trace mode leave it as-is for
				 * observability.
				 */
				if (interrupt_masked_debug_mode == SCHED_HYGIENE_MODE_PANIC) {
					ml_spin_debug_clear(current_thread());
					ml_irq_debug_abandon();
				}
#endif
				report_read_delay = override;
			}
		}

		if (__improbable((eabs - sabs) > report_read_delay)) {
			if (phy_read_panic && (machine_timeout_suspended() == FALSE)) {
#if defined(__x86_64__)
				panic_notify();
#endif /* defined(__x86_64__) */
				uint64_t nsec = 0;
				absolutetime_to_nanoseconds(eabs - sabs, &nsec);
				panic("Read from IO vaddr 0x%lx paddr 0x%lx took %llu ns, "
				    "result: 0x%llx (start: %llu, end: %llu), ceiling: %llu",
				    vaddr, paddr, nsec, result, sabs, eabs,
				    report_read_delay);
			}
		}

		if (__improbable(trace_phy_read_delay > 0 && (eabs - sabs) > trace_phy_read_delay)) {
			KDBG(MACHDBG_CODE(DBG_MACH_IO, DBC_MACH_IO_MMIO_READ),
			    (eabs - sabs), VM_KERNEL_UNSLIDE_OR_PERM(vaddr), paddr, result);
		}

		(void)ml_set_interrupts_enabled(istate);
	}
#endif /* ML_IO_TIMEOUTS_ENABLED */
	return result;
}

unsigned int
ml_io_read8(uintptr_t vaddr)
{
	return (unsigned) ml_io_read(vaddr, 1);
}

unsigned int
ml_io_read16(uintptr_t vaddr)
{
	return (unsigned) ml_io_read(vaddr, 2);
}

unsigned int
ml_io_read32(uintptr_t vaddr)
{
	return (unsigned) ml_io_read(vaddr, 4);
}

unsigned long long
ml_io_read64(uintptr_t vaddr)
{
	return ml_io_read(vaddr, 8);
}

/* ml_io_write* */

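/*
 * ml_io_write:
 *
 * Perform a volatile MMIO write of the given width (1, 2, 4, or 8
 * bytes), with the same timing, tracing, and timeout-reporting
 * behavior as ml_io_read().
 */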
void
ml_io_write(uintptr_t vaddr, uint64_t val, int size)
{
#ifdef ML_IO_VERIFY_UNCACHEABLE
	uintptr_t const paddr = pmap_verify_noncacheable(vaddr);
#elif defined(ML_IO_TIMEOUTS_ENABLED)
	uintptr_t const paddr = kvtophys(vaddr);
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	uint64_t sabs, eabs;
	boolean_t istate, timewrite = FALSE;
	uint64_t report_write_delay;
#if __x86_64__
	report_write_delay = report_phy_write_delay;
#else
	report_write_delay = os_atomic_load(&report_phy_write_delay_to, relaxed);
	uint64_t trace_phy_write_delay = os_atomic_load(&trace_phy_write_delay_to, relaxed);
#endif /* __x86_64__ */
	if (__improbable(report_write_delay != 0)) {
		istate = ml_set_interrupts_enabled(FALSE);
		sabs = mach_absolute_time();
		timewrite = TRUE;
	}

#ifdef ML_IO_SIMULATE_STRETCHED_ENABLED
	if (__improbable(timewrite && simulate_stretched_io)) {
		sabs -= simulate_stretched_io;
	}
#endif /* ML_IO_SIMULATE_STRETCHED_ENABLED */
#endif /* ML_IO_TIMEOUTS_ENABLED */

#if DEVELOPMENT || DEBUG
	boolean_t use_fences = !kern_feature_override(KF_IO_TIMEOUT_OVRD);
	if (use_fences) {
		ml_timebase_to_memory_fence();
	}
#endif

	switch (size) {
	case 1:
		*(volatile uint8_t *)vaddr = (uint8_t)val;
		break;
	case 2:
		*(volatile uint16_t *)vaddr = (uint16_t)val;
		break;
	case 4:
		*(volatile uint32_t *)vaddr = (uint32_t)val;
		break;
	case 8:
		*(volatile uint64_t *)vaddr = (uint64_t)val;
		break;
	default:
		panic("Invalid size %d for ml_io_write(%p, 0x%llx)", size, (void *)vaddr, val);
		break;
	}

#if DEVELOPMENT || DEBUG
	if (use_fences) {
		ml_memory_to_timebase_fence();
	}
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	if (__improbable(timewrite == TRUE)) {
		eabs = mach_absolute_time();

		/* Prevent the processor from calling iotrace during its
		 * initialization procedure. */
		if (current_processor()->state == PROCESSOR_RUNNING) {
			iotrace(IOTRACE_IO_WRITE, vaddr, paddr, size, val, sabs, eabs - sabs);
		}

		if (__improbable((eabs - sabs) > report_write_delay)) {
			DTRACE_PHYSLAT5(physiowrite, uint64_t, (eabs - sabs),
			    uint64_t, vaddr, uint32_t, size, uint64_t, paddr, uint64_t, val);

			uint64_t override = 0;
			override_io_timeouts(vaddr, paddr, NULL, &override);

			if (override != 0) {
#if SCHED_HYGIENE_DEBUG
				/*
				 * The IO timeout was overridden. As interrupts are disabled in
				 * order to accurately measure IO time this can cause the
				 * interrupt masked timeout threshold to be exceeded. If the
				 * interrupt masked debug mode is set to panic, abandon the
				 * measurement. If in trace mode leave it as-is for
				 * observability.
				 */
				if (interrupt_masked_debug_mode == SCHED_HYGIENE_MODE_PANIC) {
					ml_spin_debug_clear(current_thread());
					ml_irq_debug_abandon();
				}
#endif
				report_write_delay = override;
			}
		}

		if (__improbable((eabs - sabs) > report_write_delay)) {
			if (phy_write_panic && (machine_timeout_suspended() == FALSE)) {
#if defined(__x86_64__)
				panic_notify();
#endif /* defined(__x86_64__) */

				uint64_t nsec = 0;
				absolutetime_to_nanoseconds(eabs - sabs, &nsec);
				panic("Write to IO vaddr %p paddr %p val 0x%llx took %llu ns,"
				    " (start: %llu, end: %llu), ceiling: %llu",
				    (void *)vaddr, (void *)paddr, val, nsec, sabs, eabs,
				    report_write_delay);
			}
		}

		if (__improbable(trace_phy_write_delay > 0 && (eabs - sabs) > trace_phy_write_delay)) {
			KDBG(MACHDBG_CODE(DBG_MACH_IO, DBC_MACH_IO_MMIO_WRITE),
			    (eabs - sabs), VM_KERNEL_UNSLIDE_OR_PERM(vaddr), paddr, val);
		}

		(void)ml_set_interrupts_enabled(istate);
	}
#endif /* ML_IO_TIMEOUTS_ENABLED */
}

void
ml_io_write8(uintptr_t vaddr, uint8_t val)
{
	ml_io_write(vaddr, val, 1);
}

void
ml_io_write16(uintptr_t vaddr, uint16_t val)
{
	ml_io_write(vaddr, val, 2);
}

void
ml_io_write32(uintptr_t vaddr, uint32_t val)
{
	ml_io_write(vaddr, val, 4);
}

void
ml_io_write64(uintptr_t vaddr, uint64_t val)
{
	ml_io_write(vaddr, val, 8);
}

struct cpu_callback_chain_elem {
	cpu_callback_t fn;
	void *param;
	struct cpu_callback_chain_elem *next;
};

static struct cpu_callback_chain_elem *cpu_callback_chain;
static LCK_GRP_DECLARE(cpu_callback_chain_lock_grp, "cpu_callback_chain");
static LCK_SPIN_DECLARE(cpu_callback_chain_lock, &cpu_callback_chain_lock_grp);

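/*
 * cpu_event_register_callback:
 *
 * Prepend a callback to the CPU event notification chain. The chain is
 * append-only (entries come from permanent zone memory), which lets
 * ml_broadcast_cpu_event() walk it without the lock; the release store
 * publishes the fully initialized element.
 */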
void
cpu_event_register_callback(cpu_callback_t fn, void *param)
{
	struct cpu_callback_chain_elem *new_elem;

	new_elem = zalloc_permanent_type(struct cpu_callback_chain_elem);
	if (!new_elem) {
		panic("can't allocate cpu_callback_chain_elem");
	}

	lck_spin_lock(&cpu_callback_chain_lock);
	new_elem->next = cpu_callback_chain;
	new_elem->fn = fn;
	new_elem->param = param;
	os_atomic_store(&cpu_callback_chain, new_elem, release);
	lck_spin_unlock(&cpu_callback_chain_lock);
}

__attribute__((noreturn))
void
cpu_event_unregister_callback(__unused cpu_callback_t fn)
{
	panic("Unfortunately, cpu_event_unregister_callback is unimplemented.");
}

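/*
 * ml_broadcast_cpu_event:
 *
 * Deliver a CPU state-change event to every registered callback. The
 * chain is traversed locklessly; the dependency-ordered load pairs with
 * the release store in cpu_event_register_callback().
 */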
void
ml_broadcast_cpu_event(enum cpu_event event, unsigned int cpu_or_cluster)
{
	struct cpu_callback_chain_elem *cursor;

	cursor = os_atomic_load(&cpu_callback_chain, dependency);
	for (; cursor != NULL; cursor = cursor->next) {
		cursor->fn(cursor->param, event, cpu_or_cluster);
	}
}

// Initialize Machine Timeouts (see the MACHINE_TIMEOUT macro
// definition)

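/*
 * For a timeout named <name> (optionally suffixed), the base value and
 * the scale factor are each resolved in increasing order of precedence:
 *
 *   1. the compiled-in default (spec->default_value),
 *   2. the DT property /machine-timeouts/<name>,
 *   3. the DT property /chosen/machine-timeouts/<name>,
 *   4. the boot-arg ml-timeout-<name>.
 *
 * A per-timeout scale (<name>-scale / ml-timeout-<name>-scale) and a
 * one-time global scale (global-scale / ml-timeout-global-scale) are
 * then multiplied in.
 */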
void
machine_timeout_init_with_suffix(const struct machine_timeout_spec *spec, char const *suffix)
{
	if (spec->skip_predicate != NULL && spec->skip_predicate(spec)) {
		// This timeout should be disabled.
		os_atomic_store_wide((uint64_t*)spec->ptr, 0, relaxed);
		return;
	}

	assert(suffix != NULL);
	assert(strlen(spec->name) <= MACHINE_TIMEOUT_MAX_NAME_LEN);

	size_t const suffix_len = strlen(suffix);

	size_t const dt_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + suffix_len + 1;
	char dt_name[dt_name_size];

	strlcpy(dt_name, spec->name, dt_name_size);
	strlcat(dt_name, suffix, dt_name_size);

	size_t const scale_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + suffix_len + strlen("-scale") + 1;
	char scale_name[scale_name_size];

	strlcpy(scale_name, spec->name, scale_name_size);
	strlcat(scale_name, suffix, scale_name_size);
	strlcat(scale_name, "-scale", scale_name_size);

	size_t const boot_arg_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + strlen("ml-timeout-") + suffix_len + 1;
	char boot_arg_name[boot_arg_name_size];

	strlcpy(boot_arg_name, "ml-timeout-", boot_arg_name_size);
	strlcat(boot_arg_name, spec->name, boot_arg_name_size);
	strlcat(boot_arg_name, suffix, boot_arg_name_size);

	size_t const boot_arg_scale_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN +
	    strlen("ml-timeout-") + strlen("-scale") + suffix_len + 1;
	char boot_arg_scale_name[boot_arg_scale_name_size];

	strlcpy(boot_arg_scale_name, "ml-timeout-", boot_arg_scale_name_size);
	strlcat(boot_arg_scale_name, spec->name, boot_arg_scale_name_size);
	strlcat(boot_arg_scale_name, suffix, boot_arg_scale_name_size);
	strlcat(boot_arg_scale_name, "-scale", boot_arg_scale_name_size);

	/*
	 * Determine base value from DT and boot-args.
	 */

	DTEntry base, chosen;

	if (SecureDTLookupEntry(NULL, "/machine-timeouts", &base) != kSuccess) {
		base = NULL;
	}

	if (SecureDTLookupEntry(NULL, "/chosen/machine-timeouts", &chosen) != kSuccess) {
		chosen = NULL;
	}

	uint64_t timeout = spec->default_value;
	bool found = false;

	uint64_t const *data = NULL;
	unsigned int data_size = sizeof(*data);

	/* First look in /machine-timeouts/<name> */
	if (base != NULL && SecureDTGetProperty(base, dt_name, (const void **)&data, &data_size) == kSuccess) {
		if (data_size != sizeof(*data)) {
			panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/%s", __func__, data_size, dt_name);
		}

		timeout = *data;
		found = true;
	}

	/* A value in /chosen/machine-timeouts/<name> overrides */
	if (chosen != NULL && SecureDTGetProperty(chosen, dt_name, (const void **)&data, &data_size) == kSuccess) {
		if (data_size != sizeof(*data)) {
			panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/%s", __func__, data_size, dt_name);
		}

		timeout = *data;
		found = true;
	}

	/* A boot-arg ml-timeout-<name> overrides */
	uint64_t boot_arg = 0;

	if (PE_parse_boot_argn(boot_arg_name, &boot_arg, sizeof(boot_arg))) {
		timeout = boot_arg;
		found = true;
	}

	/*
	 * Determine scale value from DT and boot-args.
	 */

	uint64_t scale = 1;
	uint32_t const *scale_data;
	unsigned int scale_size = sizeof(*scale_data);

	/* If there is a scale factor /machine-timeouts/<name>-scale, apply it. */
	if (base != NULL && SecureDTGetProperty(base, scale_name, (const void **)&scale_data, &scale_size) == kSuccess) {
		if (scale_size != sizeof(*scale_data)) {
			panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/%s-scale", __func__, scale_size, dt_name);
		}

		scale = *scale_data;
	}

	/* If there is a scale factor /chosen/machine-timeouts/<name>-scale, use that. */
	if (chosen != NULL && SecureDTGetProperty(chosen, scale_name, (const void **)&scale_data, &scale_size) == kSuccess) {
		if (scale_size != sizeof(*scale_data)) {
			panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/%s-scale", __func__,
			    scale_size, dt_name);
		}

		scale = *scale_data;
	}

	/* Finally, a boot-arg ml-timeout-<name>-scale takes precedence. */
	if (PE_parse_boot_argn(boot_arg_scale_name, &boot_arg, sizeof(boot_arg))) {
		scale = boot_arg;
	}

	static bool global_scale_set;
	static uint64_t global_scale;

	if (!global_scale_set) {
		/* Apply /machine-timeouts/global-scale if present */
		if (base != NULL && SecureDTGetProperty(base, "global-scale", (const void **)&scale_data, &scale_size) == kSuccess) {
			if (scale_size != sizeof(*scale_data)) {
				panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/global-scale", __func__,
				    scale_size);
			}

			global_scale = *scale_data;
			global_scale_set = true;
		}

		/* Use /chosen/machine-timeouts/global-scale if present */
		if (chosen != NULL && SecureDTGetProperty(chosen, "global-scale", (const void **)&scale_data, &scale_size) == kSuccess) {
			if (scale_size != sizeof(*scale_data)) {
				panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/global-scale", __func__,
				    scale_size);
			}

			global_scale = *scale_data;
			global_scale_set = true;
		}

		/* Finally, the boot-arg ml-timeout-global-scale takes precedence. */
		if (PE_parse_boot_argn("ml-timeout-global-scale", &boot_arg, sizeof(boot_arg))) {
			global_scale = boot_arg;
			global_scale_set = true;
		}
	}

	if (global_scale_set) {
		scale *= global_scale;
	}

	/* Compute the final timeout, and done. */
	if (found && timeout > 0) {
		/* Only apply inherent unit scale if the value came in
		 * externally. */

		if (spec->unit_scale == MACHINE_TIMEOUT_UNIT_TIMEBASE) {
			uint64_t nanoseconds = timeout / 1000;
			nanoseconds_to_absolutetime(nanoseconds, &timeout);
		} else {
			timeout /= spec->unit_scale;
		}

		if (timeout == 0) {
			/* Ensure unit scaling did not disable the timeout. */
			timeout = 1;
		}
	}

	if (os_mul_overflow(timeout, scale, &timeout)) {
		timeout = UINT64_MAX; // clamp
	}

	os_atomic_store_wide((uint64_t*)spec->ptr, timeout, relaxed);
}

void
machine_timeout_init(const struct machine_timeout_spec *spec)
{
	machine_timeout_init_with_suffix(spec, "");
}

#if DEVELOPMENT || DEBUG
/*
 * Late timeout (re-)initialization, at the end of bsd_init()
 */
void
machine_timeout_bsd_init(void)
{
	char const * const __unused mt_suffix = "-b";
#if SCHED_HYGIENE_DEBUG
	machine_timeout_init_with_suffix(MACHINE_TIMEOUT_SPEC_REF(interrupt_masked_timeout), mt_suffix);
	machine_timeout_init_with_suffix(MACHINE_TIMEOUT_SPEC_REF(sched_preemption_disable_threshold_mt), mt_suffix);

	/*
	 * The io timeouts can inherit from interrupt_masked_timeout.
	 * Re-initialize, as interrupt_masked_timeout may have changed.
	 */
	ml_io_init_timeouts();

	extern void preemption_disable_reset_max_durations(void);
	/*
	 * Reset the preemption disable stats, so that they are not
	 * polluted by long early boot code.
	 */
	preemption_disable_reset_max_durations();
#endif /* SCHED_HYGIENE_DEBUG */
}
#endif /* DEVELOPMENT || DEBUG */

#if ML_IO_TIMEOUTS_ENABLED && CONFIG_XNUPOST
#include <tests/xnupost.h>

extern kern_return_t ml_io_timeout_test(void);

static inline void
ml_io_timeout_test_get_timeouts(uintptr_t vaddr, uint64_t *read_timeout, uint64_t *write_timeout)
{
	*read_timeout = 0;
	*write_timeout = 0;

	boolean_t istate = ml_set_interrupts_enabled(FALSE);
	override_io_timeouts(vaddr, 0, read_timeout, write_timeout);
	ml_set_interrupts_enabled(istate);
}

kern_return_t
ml_io_timeout_test(void)
{
	const size_t SIZE = 16;
	uintptr_t iovaddr_base1 = (uintptr_t)&ml_io_timeout_test;
	uintptr_t iovaddr_base2 = iovaddr_base1 + SIZE;
	uintptr_t vaddr1 = iovaddr_base1 + SIZE / 2;
	uintptr_t vaddr2 = iovaddr_base2 + SIZE / 2;

	const uint64_t READ_TIMEOUT1_US = 50000, WRITE_TIMEOUT1_US = 50001;
	const uint64_t READ_TIMEOUT2_US = 50002, WRITE_TIMEOUT2_US = 50003;
	uint64_t read_timeout1_abs, write_timeout1_abs;
	uint64_t read_timeout2_abs, write_timeout2_abs;
	nanoseconds_to_absolutetime(NSEC_PER_USEC * READ_TIMEOUT1_US, &read_timeout1_abs);
	nanoseconds_to_absolutetime(NSEC_PER_USEC * WRITE_TIMEOUT1_US, &write_timeout1_abs);
	nanoseconds_to_absolutetime(NSEC_PER_USEC * READ_TIMEOUT2_US, &read_timeout2_abs);
	nanoseconds_to_absolutetime(NSEC_PER_USEC * WRITE_TIMEOUT2_US, &write_timeout2_abs);

	int err = ml_io_increase_timeouts(iovaddr_base1, 0, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
	T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for empty region");

	err = ml_io_increase_timeouts(iovaddr_base1, 4097, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
	T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for region > 4096 bytes");

	err = ml_io_increase_timeouts(UINTPTR_MAX, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
	T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for overflowed region");

	err = ml_io_increase_timeouts(iovaddr_base1, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
	T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for first VA region should succeed");

	err = ml_io_increase_timeouts(iovaddr_base2, SIZE, READ_TIMEOUT2_US, WRITE_TIMEOUT2_US);
	T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for second VA region should succeed");

	err = ml_io_increase_timeouts(iovaddr_base1, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
	T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for same region twice");

	err = ml_io_increase_timeouts(vaddr1, (uint32_t)(vaddr2 - vaddr1), READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
	T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for overlapping regions");

	uint64_t read_timeout, write_timeout;
	ml_io_timeout_test_get_timeouts(vaddr1, &read_timeout, &write_timeout);
	T_EXPECT_EQ_ULLONG(read_timeout, read_timeout1_abs, "Read timeout for first region");
	T_EXPECT_EQ_ULLONG(write_timeout, write_timeout1_abs, "Write timeout for first region");

	ml_io_timeout_test_get_timeouts(vaddr2, &read_timeout, &write_timeout);
	T_EXPECT_EQ_ULLONG(read_timeout, read_timeout2_abs, "Read timeout for second region");
	T_EXPECT_EQ_ULLONG(write_timeout, write_timeout2_abs, "Write timeout for second region");

	ml_io_timeout_test_get_timeouts(iovaddr_base2 + SIZE, &read_timeout, &write_timeout);
	T_EXPECT_EQ_ULLONG(read_timeout, 0, "Read timeout without override");
	T_EXPECT_EQ_ULLONG(write_timeout, 0, "Write timeout without override");

	err = ml_io_reset_timeouts(iovaddr_base1 + 1, SIZE - 1);
	T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for subregion");

	err = ml_io_reset_timeouts(iovaddr_base2 + SIZE, SIZE);
	T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for non-existent region");

	err = ml_io_reset_timeouts(iovaddr_base1, SIZE);
	T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for first VA region should succeed");

	ml_io_timeout_test_get_timeouts(vaddr1, &read_timeout, &write_timeout);
	T_EXPECT_EQ_ULLONG(read_timeout, 0, "Read timeout for reset region");
	T_EXPECT_EQ_ULLONG(write_timeout, 0, "Write timeout for reset region");

	err = ml_io_reset_timeouts(iovaddr_base1, SIZE);
	T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for same region twice");

	err = ml_io_reset_timeouts(iovaddr_base2, SIZE);
	T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for second VA region should succeed");

	return KERN_SUCCESS;
}
#endif /* ML_IO_TIMEOUTS_ENABLED && CONFIG_XNUPOST */