1 /*
2 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: kern/machine.c
60 * Author: Avadis Tevanian, Jr.
61 * Date: 1987
62 *
63 * Support for machine independent machine abstraction.
64 */
65
66 #include <string.h>
67
68 #include <mach/mach_types.h>
69 #include <mach/boolean.h>
70 #include <mach/kern_return.h>
71 #include <mach/machine.h>
72 #include <mach/host_info.h>
73 #include <mach/host_reboot.h>
74 #include <mach/host_priv_server.h>
75 #include <mach/processor_server.h>
76 #include <mach/sdt.h>
77
78 #include <kern/kern_types.h>
79 #include <kern/cpu_data.h>
80 #include <kern/cpu_quiesce.h>
81 #include <kern/ipc_host.h>
82 #include <kern/host.h>
83 #include <kern/machine.h>
84 #include <kern/misc_protos.h>
85 #include <kern/percpu.h>
86 #include <kern/processor.h>
87 #include <kern/queue.h>
88 #include <kern/sched.h>
89 #include <kern/startup.h>
90 #include <kern/task.h>
91 #include <kern/thread.h>
92 #include <kern/iotrace.h>
93
94 #include <libkern/OSDebug.h>
95 #if ML_IO_TIMEOUTS_ENABLED
96 #include <libkern/tree.h>
97 #endif
98
99 #include <pexpert/device_tree.h>
100
101 #include <machine/commpage.h>
102 #include <machine/machine_routines.h>
103
104 #if HIBERNATION
105 #include <IOKit/IOHibernatePrivate.h>
106 #endif
107 #include <IOKit/IOPlatformExpert.h>
108
109 #if CONFIG_DTRACE
110 extern void (*dtrace_cpu_state_changed_hook)(int, boolean_t);
111 #endif
112
113 #if defined(__arm64__)
114 extern void wait_while_mp_kdp_trap(bool check_SIGPdebug);
115 #endif
116
117 #if defined(__x86_64__)
118 #include <i386/panic_notify.h>
119 #endif
120
121 /*
122 * Exported variables:
123 */
124
125 struct machine_info machine_info;
126
127 /* Forwards */
128 static void
129 processor_doshutdown(processor_t processor);
130
131 static void
132 processor_offline(void * parameter, __unused wait_result_t result);
133
134 static void
135 processor_offline_intstack(processor_t processor) __dead2;
136
137 static void
processor_up_update_counts(processor_t processor)138 processor_up_update_counts(processor_t processor)
139 {
140 ml_cpu_up_update_counts(processor->cpu_id);
141
142 os_atomic_inc(&processor_avail_count, relaxed);
143 if (processor->is_recommended) {
144 os_atomic_inc(&processor_avail_count_user, relaxed);
145 }
146 if (processor->processor_primary == processor) {
147 os_atomic_inc(&primary_processor_avail_count, relaxed);
148 if (processor->is_recommended) {
149 os_atomic_inc(&primary_processor_avail_count_user, relaxed);
150 }
151 }
152 commpage_update_active_cpus();
153 }
154
155 /*
156 * processor_up:
157 *
158 * Flag processor as up and running, and available
159 * for scheduling.
160 */
void
processor_up(
	processor_t processor)
{
	processor_set_t pset;
	spl_t s;

	/* Block preemption and raise to scheduler interrupt level. */
	s = splsched();
	init_ast_check(processor);

#if defined(__arm64__)
	/*
	 * A processor coming online won't have received a SIGPdebug signal
	 * to cause it to spin while a stackshot or panic is taking place,
	 * so spin here on mp_kdp_trap.
	 *
	 * However, since cpu_signal() is not yet enabled for this processor,
	 * there is a race if we have just passed this when a cpu_signal()
	 * is attempted. The sender will assume the cpu is offline, so it will
	 * not end up spinning anywhere. See processor_offline() for the fix
	 * for this race.
	 */
	wait_while_mp_kdp_trap(false);
#endif

	pset = processor->processor_set;
	/* Lock ordering: sched_available_cores_lock before the pset lock. */
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
	pset_lock(pset);

	++pset->online_processor_count;
	/*
	 * start_state_lock serializes this state transition against
	 * processor_wait_for_start() in processor_shutdown().
	 */
	simple_lock(&processor->start_state_lock, LCK_GRP_NULL);
	pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
	simple_unlock(&processor->start_state_lock);
	/*
	 * A temporary shutdown never decremented the availability counts on
	 * the way down (see processor_doshutdown()), so only re-increment
	 * them for a non-temporary bring-up.
	 */
	bool temporary = processor->shutdown_temporary;
	if (temporary) {
		processor->shutdown_temporary = false;
	} else {
		processor_up_update_counts(processor);
	}
	if (processor->is_recommended) {
		SCHED(pset_made_schedulable)(processor, pset, false);
	}
	pset_unlock(pset);
	ml_cpu_up();
	sched_mark_processor_online_locked(processor, processor->last_startup_reason);
	simple_unlock(&sched_available_cores_lock);
	splx(s);

	/* Wake any threads sleeping on this processor's state. */
	thread_wakeup((event_t)&processor->state);

#if CONFIG_DTRACE
	if (dtrace_cpu_state_changed_hook) {
		(*dtrace_cpu_state_changed_hook)(processor->cpu_id, TRUE);
	}
#endif
}
217 #include <atm/atm_internal.h>
218
219 kern_return_t
host_reboot(host_priv_t host_priv,int options)220 host_reboot(
221 host_priv_t host_priv,
222 int options)
223 {
224 if (host_priv == HOST_PRIV_NULL) {
225 return KERN_INVALID_HOST;
226 }
227
228 #if DEVELOPMENT || DEBUG
229 if (options & HOST_REBOOT_DEBUGGER) {
230 Debugger("Debugger");
231 return KERN_SUCCESS;
232 }
233 #endif
234
235 if (options & HOST_REBOOT_UPSDELAY) {
236 // UPS power cutoff path
237 PEHaltRestart( kPEUPSDelayHaltCPU );
238 } else {
239 halt_all_cpus(!(options & HOST_REBOOT_HALT));
240 }
241
242 return KERN_SUCCESS;
243 }
244
245 kern_return_t
processor_assign(__unused processor_t processor,__unused processor_set_t new_pset,__unused boolean_t wait)246 processor_assign(
247 __unused processor_t processor,
248 __unused processor_set_t new_pset,
249 __unused boolean_t wait)
250 {
251 return KERN_FAILURE;
252 }
253
254 static void
processor_down_update_counts(processor_t processor)255 processor_down_update_counts(processor_t processor)
256 {
257 ml_cpu_down_update_counts(processor->cpu_id);
258
259 os_atomic_dec(&processor_avail_count, relaxed);
260 if (processor->is_recommended) {
261 os_atomic_dec(&processor_avail_count_user, relaxed);
262 }
263 if (processor->processor_primary == processor) {
264 os_atomic_dec(&primary_processor_avail_count, relaxed);
265 if (processor->is_recommended) {
266 os_atomic_dec(&primary_processor_avail_count_user, relaxed);
267 }
268 }
269 commpage_update_active_cpus();
270 }
271
272 extern lck_mtx_t processor_updown_lock;
273
/*
 * processor_shutdown:
 *
 * Take a processor offline.  Returns KERN_SUCCESS once the processor is
 * (or already was) shut down, KERN_NOT_SUPPORTED if the platform code
 * disallows exit for this reason, and KERN_FAILURE otherwise.
 * SHUTDOWN_TEMPORARY in `flags` marks the shutdown as transient so that
 * the availability counts are preserved across the down/up cycle.
 */
kern_return_t
processor_shutdown(
	processor_t processor,
	processor_reason_t reason,
	uint32_t flags)
{
	if (!ml_cpu_can_exit(processor->cpu_id, reason)) {
		/*
		 * Failure if disallowed by arch code.
		 */
		return KERN_NOT_SUPPORTED;
	}

	lck_mtx_lock(&processor_updown_lock);

	kern_return_t mark_ret = sched_mark_processor_offline(processor, reason);
	if (mark_ret != KERN_SUCCESS) {
		/* Must fail or we deadlock */
		lck_mtx_unlock(&processor_updown_lock);
		return KERN_FAILURE;
	}

	ml_cpu_begin_state_transition(processor->cpu_id);
	spl_t s = splsched();
	processor_set_t pset = processor->processor_set;

	pset_lock(pset);
	if (processor->state == PROCESSOR_OFF_LINE) {
		/*
		 * Success if already shutdown.
		 */
		if (processor->shutdown_temporary && !(flags & SHUTDOWN_TEMPORARY)) {
			/* Convert a temporary shutdown into a permanent shutdown */
			processor->shutdown_temporary = false;
			processor_down_update_counts(processor);
		}
		pset_unlock(pset);
		splx(s);
		ml_cpu_end_state_transition(processor->cpu_id);

		lck_mtx_unlock(&processor_updown_lock);
		return KERN_SUCCESS;
	}

	if (processor->shutdown_locked && (reason != REASON_SYSTEM)) {
		/*
		 * Failure if processor is locked against shutdown.
		 */
		pset_unlock(pset);
		splx(s);

		lck_mtx_unlock(&processor_updown_lock);
		return KERN_FAILURE;
	}

	if (processor->state == PROCESSOR_START) {
		/* Let an in-flight start complete before shutting down. */
		pset_unlock(pset);
		splx(s);

		processor_wait_for_start(processor);

		s = splsched();
		pset_lock(pset);
	}

	/*
	 * If the processor is dispatching, let it finish.
	 */
	while (processor->state == PROCESSOR_DISPATCHING) {
		pset_unlock(pset);
		splx(s);
		delay(1);
		s = splsched();
		pset_lock(pset);
	}

	/*
	 * Success if already being shutdown with matching SHUTDOWN_TEMPORARY flag.
	 */
	if ((processor->state == PROCESSOR_SHUTDOWN) || (processor->state == PROCESSOR_PENDING_OFFLINE)) {
		bool success = (flags & SHUTDOWN_TEMPORARY) ? processor->shutdown_temporary : !processor->shutdown_temporary;

		pset_unlock(pset);
		splx(s);
		ml_cpu_end_state_transition(processor->cpu_id);

		lck_mtx_unlock(&processor_updown_lock);
		return success ? KERN_SUCCESS : KERN_FAILURE;
	}

	ml_broadcast_cpu_event(CPU_EXIT_REQUESTED, processor->cpu_id);
	pset_update_processor_state(pset, processor, PROCESSOR_SHUTDOWN);
	processor->last_shutdown_reason = reason;
	if (flags & SHUTDOWN_TEMPORARY) {
		processor->shutdown_temporary = true;
	}
	pset_unlock(pset);

	/* Bind to the target processor and run the shutdown sequence there. */
	processor_doshutdown(processor);
	splx(s);

	/* Wait for the target CPU to actually stop executing. */
	cpu_exit_wait(processor->cpu_id);

	if (processor != master_processor) {
		s = splsched();
		pset_lock(pset);
		pset_update_processor_state(pset, processor, PROCESSOR_OFF_LINE);
		pset_unlock(pset);
		splx(s);
	}

	ml_cpu_end_state_transition(processor->cpu_id);
	ml_broadcast_cpu_event(CPU_EXITED, processor->cpu_id);
	/* Power down the core now that it has exited. */
	ml_cpu_power_disable(processor->cpu_id);

	lck_mtx_unlock(&processor_updown_lock);
	return KERN_SUCCESS;
}
392
393 /*
394 * Called with interrupts disabled.
395 */
/*
 * processor_doshutdown:
 *
 * Migrate the calling thread onto the target processor, mark it
 * PENDING_OFFLINE, drain its run queues, and hand off to its idle
 * thread which completes the shutdown (see processor_offline()).
 */
static void
processor_doshutdown(
	processor_t processor)
{
	thread_t self = current_thread();

	/*
	 * Get onto the processor to shutdown
	 */
	processor_t prev = thread_bind(processor);
	thread_block(THREAD_CONTINUE_NULL);

	/* interrupts still disabled */
	assert(ml_get_interrupts_enabled() == FALSE);

	assert(processor == current_processor());
	assert(processor->state == PROCESSOR_SHUTDOWN);

#if CONFIG_DTRACE
	if (dtrace_cpu_state_changed_hook) {
		(*dtrace_cpu_state_changed_hook)(processor->cpu_id, FALSE);
	}
#endif

#if defined(__arm64__)
	/*
	 * Catch a processor going offline
	 * while a panic or stackshot is in progress, as it won't
	 * receive a SIGPdebug now that interrupts are disabled.
	 */
	wait_while_mp_kdp_trap(false);
#endif

	ml_cpu_down();

#if HIBERNATION
	/*
	 * When fewer than two processors remain available, briefly take and
	 * release the hibernation VM lock to serialize with a hibernate in
	 * progress before this CPU goes away.
	 */
	if (processor_avail_count < 2) {
		hibernate_vm_lock();
		hibernate_vm_unlock();
	}
#endif

	processor_set_t pset = processor->processor_set;

	pset_lock(pset);
	pset_update_processor_state(pset, processor, PROCESSOR_PENDING_OFFLINE);
	--pset->online_processor_count;
	/* Temporary shutdowns leave the availability counts unchanged. */
	if (!processor->shutdown_temporary) {
		processor_down_update_counts(processor);
	}
	SCHED(processor_queue_shutdown)(processor);
	/* pset lock dropped */
	SCHED(rt_queue_shutdown)(processor);

	/* Restore the calling thread's previous processor binding. */
	thread_bind(prev);

	/* interrupts still disabled */

	/*
	 * Continue processor shutdown on the processor's idle thread.
	 * The handoff won't fail because the idle thread has a reserved stack.
	 * Switching to the idle thread leaves interrupts disabled,
	 * so we can't accidentally take an interrupt after the context switch.
	 */
	thread_t shutdown_thread = processor->idle_thread;
	shutdown_thread->continuation = processor_offline;
	shutdown_thread->parameter = processor;

	thread_run(self, NULL, NULL, shutdown_thread);
}
466
467 /*
468 * Called in the context of the idle thread to shut down the processor
469 *
470 * A shut-down processor looks like it's 'running' the idle thread parked
471 * in this routine, but it's actually been powered off and has no hardware state.
472 */
static void
processor_offline(
	void * parameter,
	__unused wait_result_t result)
{
	processor_t processor = (processor_t) parameter;
	thread_t self = current_thread();
	__assert_only thread_t old_thread = THREAD_NULL;

	assert(processor == current_processor());
	assert(self->state & TH_IDLE);
	assert(processor->idle_thread == self);
	assert(ml_get_interrupts_enabled() == FALSE);
	assert(self->continuation == NULL);
	assert(processor->processor_offlined == false);
	assert(processor->running_timers_active == false);

	bool enforce_quiesce_safety = gEnforcePlatformActionSafety;

	/*
	 * Scheduling is now disabled for this processor.
	 * Ensure that primitives that need scheduling (like mutexes) know this.
	 */
	if (enforce_quiesce_safety) {
		disable_preemption_without_measurements();
	}

	/* convince slave_main to come back here */
	processor->processor_offlined = true;

	/*
	 * Switch to the interrupt stack and shut down the processor.
	 *
	 * When the processor comes back, it will eventually call load_context which
	 * restores the context saved by machine_processor_shutdown, returning here.
	 */
	old_thread = machine_processor_shutdown(self, processor_offline_intstack, processor);

	/* old_thread should be NULL because we got here through Load_context */
	assert(old_thread == THREAD_NULL);

	/* Execution resumes here after the processor is powered back on. */
	assert(processor == current_processor());
	assert(processor->idle_thread == current_thread());

	assert(ml_get_interrupts_enabled() == FALSE);
	assert(self->continuation == NULL);

	/* Extract the machine_param value stashed by slave_main */
	void * machine_param = self->parameter;
	self->parameter = NULL;

	/* Re-initialize the processor */
	slave_machine_init(machine_param);

	assert(processor->processor_offlined == true);
	processor->processor_offlined = false;

	if (enforce_quiesce_safety) {
		enable_preemption();
	}

#if defined(__arm64__)
	/*
	 * See the comments for DebuggerLock in processor_up().
	 *
	 * SIGPdisabled is cleared (to enable cpu_signal() to succeed with this processor)
	 * the first time we take an IPI. This is triggered by slave_machine_init(), above,
	 * which calls cpu_machine_init()->PE_cpu_machine_init()->PE_cpu_signal() which sends
	 * a self-IPI to ensure that happens when we enable interrupts. So enable interrupts
	 * here so that cpu_signal() can succeed before we spin on mp_kdp_trap.
	 */
	/* Briefly open an interrupt window so the pending self-IPI is taken. */
	ml_set_interrupts_enabled(TRUE);

	ml_set_interrupts_enabled(FALSE);

	wait_while_mp_kdp_trap(true);

	/*
	 * At this point,
	 * if a stackshot or panic is in progress, we either spin on mp_kdp_trap
	 * or we sucessfully received a SIGPdebug signal which will cause us to
	 * break out of the spin on mp_kdp_trap and instead
	 * spin next time interrupts are enabled in idle_thread().
	 */
#endif

	/*
	 * Now that the processor is back, invoke the idle thread to find out what to do next.
	 * idle_thread will enable interrupts.
	 */
	thread_block(idle_thread);
	/*NOTREACHED*/
}
566
567 /*
568 * Complete the shutdown and place the processor offline.
569 *
570 * Called at splsched in the shutdown context
571 * (i.e. on the idle thread, on the interrupt stack)
572 *
573 * The onlining half of this is done in load_context().
574 */
static void
processor_offline_intstack(
	processor_t processor)
{
	assert(processor == current_processor());
	assert(processor->active_thread == current_thread());

	/* Take a final usage snapshot and account the trailing idle period. */
	struct recount_snap snap = { 0 };
	recount_snapshot(&snap);
	recount_processor_idle(&processor->pr_recount, &snap);

	/* This CPU no longer participates in the quiescent-state counter. */
	cpu_quiescent_counter_leave(processor->last_dispatch);

	PMAP_DEACTIVATE_KERNEL(processor->cpu_id);

	/* cpu_sleep() powers the core off and does not return. */
	cpu_sleep();
	panic("zombie processor");
	/*NOTREACHED*/
}
594
595 kern_return_t
host_get_boot_info(host_priv_t host_priv,kernel_boot_info_t boot_info)596 host_get_boot_info(
597 host_priv_t host_priv,
598 kernel_boot_info_t boot_info)
599 {
600 const char *src = "";
601 if (host_priv == HOST_PRIV_NULL) {
602 return KERN_INVALID_HOST;
603 }
604
605 /*
606 * Copy first operator string terminated by '\0' followed by
607 * standardized strings generated from boot string.
608 */
609 src = machine_boot_info(boot_info, KERNEL_BOOT_INFO_MAX);
610 if (src != boot_info) {
611 (void) strncpy(boot_info, src, KERNEL_BOOT_INFO_MAX);
612 }
613
614 return KERN_SUCCESS;
615 }
616
// These are configured through sysctls.
#if DEVELOPMENT || DEBUG
uint32_t phy_read_panic = 1;        /* panic when an MMIO read exceeds the report threshold */
uint32_t phy_write_panic = 1;       /* panic when an MMIO write exceeds the report threshold */
uint64_t simulate_stretched_io = 0; /* test hook: fake extra MMIO latency (absolute-time units) */
#else
uint32_t phy_read_panic = 0;
uint32_t phy_write_panic = 0;
#endif

#if !defined(__x86_64__)
// The MACHINE_TIMEOUT facility only exists on ARM.
/* Latency thresholds (timebase units) above which MMIO accesses are reported or traced. */
MACHINE_TIMEOUT_WRITEABLE(report_phy_read_delay_to, "report-phy-read-delay", 0, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
MACHINE_TIMEOUT_WRITEABLE(report_phy_write_delay_to, "report-phy-write-delay", 0, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
MACHINE_TIMEOUT_WRITEABLE(trace_phy_read_delay_to, "trace-phy-read-delay", 0, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
MACHINE_TIMEOUT_WRITEABLE(trace_phy_write_delay_to, "trace-phy-write-delay", 0, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
633
634 #if SCHED_HYGIENE_DEBUG
635 /*
636 * Note: The interrupt-masked timeout goes through two initializations - one
637 * early in boot and one later. Thus this function is also called twice and
638 * can't be marked '__startup_func'.
639 */
640 static void
ml_io_init_timeouts(void)641 ml_io_init_timeouts(void)
642 {
643 /*
644 * The timeouts may be completely disabled via an override. Check that
645 * last and set the timeouts to zero (disabling) if that's the case.
646 */
647 if (kern_feature_override(KF_IO_TIMEOUT_OVRD)) {
648 os_atomic_store(&report_phy_write_delay_to, 0, relaxed);
649 os_atomic_store(&report_phy_read_delay_to, 0, relaxed);
650 }
651 }
652
653 /*
654 * It's important that this happens after machine timeouts have initialized so
655 * the correct timeouts can be inherited.
656 */
657 STARTUP(TIMEOUTS, STARTUP_RANK_SECOND, ml_io_init_timeouts);
658 #endif /* SCHED_HYGIENE_DEBUG */
659
/* When set, emit an OSReportWithBacktrace for over-threshold MMIO accesses. */
unsigned int report_phy_read_osbt;
unsigned int report_phy_write_osbt;

/* Translate a kernel virtual address to its physical address. */
extern pmap_paddr_t kvtophys(vm_offset_t va);
#endif
665
666 #if ML_IO_TIMEOUTS_ENABLED
667
668 static LCK_GRP_DECLARE(io_timeout_override_lock_grp, "io_timeout_override");
669 static LCK_SPIN_DECLARE(io_timeout_override_lock, &io_timeout_override_lock_grp);
670
/* One registered per-address-range I/O timeout override (see ml_io_increase_timeouts()). */
struct io_timeout_override_entry {
	RB_ENTRY(io_timeout_override_entry) tree; /* red-black tree linkage, keyed on iovaddr_base */

	uintptr_t iovaddr_base;   /* first I/O virtual address covered by this override */
	unsigned int size;        /* length of the covered range, in bytes */
	uint32_t read_timeout;    /* read latency ceiling, absolute-time units */
	uint32_t write_timeout;   /* write latency ceiling, absolute-time units */
};
679
680 static inline int
io_timeout_override_cmp(const struct io_timeout_override_entry * a,const struct io_timeout_override_entry * b)681 io_timeout_override_cmp(const struct io_timeout_override_entry *a, const struct io_timeout_override_entry *b)
682 {
683 if (a->iovaddr_base < b->iovaddr_base) {
684 return -1;
685 } else if (a->iovaddr_base > b->iovaddr_base) {
686 return 1;
687 } else {
688 return 0;
689 }
690 }
691
692 static RB_HEAD(io_timeout_override, io_timeout_override_entry) io_timeout_override_root;
693 RB_PROTOTYPE_PREV(io_timeout_override, io_timeout_override_entry, tree, io_timeout_override_cmp);
694 RB_GENERATE_PREV(io_timeout_override, io_timeout_override_entry, tree, io_timeout_override_cmp);
695
696 #endif /* ML_IO_TIMEOUTS_ENABLED */
697
/*
 * ml_io_increase_timeouts:
 *
 * Register a timeout override that raises the MMIO latency report
 * thresholds for the I/O virtual range [iovaddr_base, iovaddr_base + size).
 * Timeouts are given in microseconds and stored in absolute-time units
 * (which must fit in 32 bits).  Returns KERN_INVALID_ARGUMENT if size is
 * 0 or exceeds 4096 bytes, the range wraps, a converted timeout is too
 * large, or the range overlaps an existing registration.  Note the
 * return type is int but the values are kern_return_t codes.
 */
int
ml_io_increase_timeouts(uintptr_t iovaddr_base, unsigned int size, uint32_t read_timeout_us, uint32_t write_timeout_us)
{
#if ML_IO_TIMEOUTS_ENABLED
	const size_t MAX_SIZE = 4096;
	const uint64_t MAX_TIMEOUT_ABS = UINT32_MAX;

	assert(preemption_enabled());

	int ret = KERN_SUCCESS;

	if (size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	/* Reject over-large ranges and ranges that wrap the address space. */
	uintptr_t iovaddr_end;
	if (size > MAX_SIZE || os_add_overflow(iovaddr_base, size - 1, &iovaddr_end)) {
		return KERN_INVALID_ARGUMENT;
	}

	/* Convert to absolute time; entries store the values in 32 bits. */
	uint64_t read_timeout_abs, write_timeout_abs;
	nanoseconds_to_absolutetime(NSEC_PER_USEC * read_timeout_us, &read_timeout_abs);
	nanoseconds_to_absolutetime(NSEC_PER_USEC * write_timeout_us, &write_timeout_abs);
	if (read_timeout_abs > MAX_TIMEOUT_ABS || write_timeout_abs > MAX_TIMEOUT_ABS) {
		return KERN_INVALID_ARGUMENT;
	}

	/* Allocate outside the lock; Z_NOFAIL means no NULL check is needed. */
	struct io_timeout_override_entry *node = kalloc_type(struct io_timeout_override_entry, Z_WAITOK | Z_ZERO | Z_NOFAIL);
	node->iovaddr_base = iovaddr_base;
	node->size = size;
	node->read_timeout = (uint32_t)read_timeout_abs;
	node->write_timeout = (uint32_t)write_timeout_abs;

	/*
	 * Interrupt handlers are allowed to call ml_io_{read,write}*, so
	 * interrupts must be disabled any time io_timeout_override_lock is
	 * held. Otherwise the CPU could take an interrupt while holding the
	 * lock, invoke an ISR that calls ml_io_{read,write}*, and deadlock
	 * trying to acquire the lock again.
	 */
	boolean_t istate = ml_set_interrupts_enabled(FALSE);
	lck_spin_lock(&io_timeout_override_lock);
	/* RB_INSERT returns non-NULL on a duplicate key (same base address). */
	if (RB_INSERT(io_timeout_override, &io_timeout_override_root, node)) {
		ret = KERN_INVALID_ARGUMENT;
		goto out;
	}

	/* Check that this didn't create any new overlaps */
	struct io_timeout_override_entry *prev = RB_PREV(io_timeout_override, &io_timeout_override_root, node);
	if (prev && (prev->iovaddr_base + prev->size) > node->iovaddr_base) {
		/* Overlaps the preceding range: roll the insert back. */
		RB_REMOVE(io_timeout_override, &io_timeout_override_root, node);
		ret = KERN_INVALID_ARGUMENT;
		goto out;
	}
	struct io_timeout_override_entry *next = RB_NEXT(io_timeout_override, &io_timeout_override_root, node);
	if (next && (node->iovaddr_base + node->size) > next->iovaddr_base) {
		/* Overlaps the following range: roll the insert back. */
		RB_REMOVE(io_timeout_override, &io_timeout_override_root, node);
		ret = KERN_INVALID_ARGUMENT;
		goto out;
	}

out:
	lck_spin_unlock(&io_timeout_override_lock);
	ml_set_interrupts_enabled(istate);
	if (ret != KERN_SUCCESS) {
		/* The node never made it into (or was removed from) the tree. */
		kfree_type(struct io_timeout_override_entry, node);
	}
	return ret;
#else /* !ML_IO_TIMEOUTS_ENABLED */
#pragma unused(iovaddr_base, size, read_timeout_us, write_timeout_us)
	return KERN_SUCCESS;
#endif
}
771
772 int
ml_io_reset_timeouts(uintptr_t iovaddr_base,unsigned int size)773 ml_io_reset_timeouts(uintptr_t iovaddr_base, unsigned int size)
774 {
775 #if ML_IO_TIMEOUTS_ENABLED
776 assert(preemption_enabled());
777
778 struct io_timeout_override_entry key = { .iovaddr_base = iovaddr_base };
779
780 boolean_t istate = ml_set_interrupts_enabled(FALSE);
781 lck_spin_lock(&io_timeout_override_lock);
782 struct io_timeout_override_entry *node = RB_FIND(io_timeout_override, &io_timeout_override_root, &key);
783 if (node) {
784 if (node->size == size) {
785 RB_REMOVE(io_timeout_override, &io_timeout_override_root, node);
786 } else {
787 node = NULL;
788 }
789 }
790 lck_spin_unlock(&io_timeout_override_lock);
791 ml_set_interrupts_enabled(istate);
792
793 if (!node) {
794 return KERN_NOT_FOUND;
795 }
796
797 kfree_type(struct io_timeout_override_entry, node);
798 #else /* !ML_IO_TIMEOUTS_ENABLED */
799 #pragma unused(iovaddr_base, size)
800 #endif
801 return KERN_SUCCESS;
802 }
803
804 #if ML_IO_TIMEOUTS_ENABLED
805 static void
override_io_timeouts(uintptr_t vaddr,uint64_t * read_timeout,uint64_t * write_timeout)806 override_io_timeouts(uintptr_t vaddr, uint64_t *read_timeout, uint64_t *write_timeout)
807 {
808 assert(!ml_get_interrupts_enabled());
809
810 struct io_timeout_override_entry *node = RB_ROOT(&io_timeout_override_root);
811
812 lck_spin_lock(&io_timeout_override_lock);
813 /* RB_FIND() doesn't support custom cmp functions, so we have to open-code our own */
814 while (node) {
815 if (node->iovaddr_base <= vaddr && vaddr < node->iovaddr_base + node->size) {
816 if (read_timeout) {
817 *read_timeout = node->read_timeout;
818 }
819 if (write_timeout) {
820 *write_timeout = node->write_timeout;
821 }
822 break;
823 } else if (vaddr < node->iovaddr_base) {
824 node = RB_LEFT(node, tree);
825 } else {
826 node = RB_RIGHT(node, tree);
827 }
828 }
829 lck_spin_unlock(&io_timeout_override_lock);
830 }
831 #endif /* ML_IO_TIMEOUTS_ENABLED */
832
/*
 * ml_io_read:
 *
 * Read `size` bytes (1, 2, 4 or 8) from the MMIO mapping at `vaddr` and
 * return the value zero-extended to 64 bits.  When the
 * report-phy-read-delay timeout is armed, the access is timed with
 * interrupts disabled; excessive latency is traced via kdebug, reported
 * with a backtrace, and/or panicked on, depending on configuration and
 * any per-range override (see override_io_timeouts()).
 */
unsigned long long
ml_io_read(uintptr_t vaddr, int size)
{
	unsigned long long result = 0;
	unsigned char s1;
	unsigned short s2;

#ifdef ML_IO_VERIFY_UNCACHEABLE
	/* Also validates that the mapping is uncacheable. */
	uintptr_t const paddr = pmap_verify_noncacheable(vaddr);
#elif defined(ML_IO_TIMEOUTS_ENABLED)
	uintptr_t const paddr = kvtophys(vaddr);
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	uint64_t sabs, eabs;
	boolean_t istate, timeread = FALSE;
	uint64_t report_read_delay;
#if __x86_64__
	report_read_delay = report_phy_read_delay;
#else
	report_read_delay = os_atomic_load(&report_phy_read_delay_to, relaxed);
	uint64_t const trace_phy_read_delay = os_atomic_load(&trace_phy_read_delay_to, relaxed);
#endif /* __x86_64__ */

	if (__improbable(report_read_delay != 0)) {
		/* Disable interrupts so the measurement covers only the access itself. */
		istate = ml_set_interrupts_enabled(FALSE);
		sabs = mach_absolute_time();
		timeread = TRUE;
	}

#ifdef ML_IO_SIMULATE_STRETCHED_ENABLED
	/* Test hook: back-date the start time to fake a slow access. */
	if (__improbable(timeread && simulate_stretched_io)) {
		sabs -= simulate_stretched_io;
	}
#endif /* ML_IO_SIMULATE_STRETCHED_ENABLED */
#endif /* ML_IO_TIMEOUTS_ENABLED */

	/* The actual device access, via a volatile pointer of the exact width. */
	switch (size) {
	case 1:
		s1 = *(volatile unsigned char *)vaddr;
		result = s1;
		break;
	case 2:
		s2 = *(volatile unsigned short *)vaddr;
		result = s2;
		break;
	case 4:
		result = *(volatile unsigned int *)vaddr;
		break;
	case 8:
		result = *(volatile unsigned long long *)vaddr;
		break;
	default:
		panic("Invalid size %d for ml_io_read(%p)", size, (void *)vaddr);
		break;
	}

#ifdef ML_IO_TIMEOUTS_ENABLED
	if (__improbable(timeread == TRUE)) {
		eabs = mach_absolute_time();

		/* Prevent the processor from calling iotrace during its
		 * initialization procedure. */
		if (current_processor()->state == PROCESSOR_RUNNING) {
			iotrace(IOTRACE_IO_READ, vaddr, paddr, size, result, sabs, eabs - sabs);
		}

		if (__improbable((eabs - sabs) > report_read_delay)) {
			/* Over the global threshold: check for a per-range override first. */
			uint64_t override = 0;
			override_io_timeouts(vaddr, &override, NULL);

			if (override != 0) {
#if SCHED_HYGIENE_DEBUG
				/*
				 * The IO timeout was overridden. As interrupts are disabled in
				 * order to accurately measure IO time this can cause the
				 * interrupt masked timeout threshold to be exceeded. If the
				 * interrupt masked debug mode is set to panic, abandon the
				 * measurement. If in trace mode leave it as-is for
				 * observability.
				 */
				if (interrupt_masked_debug_mode == SCHED_HYGIENE_MODE_PANIC) {
					ml_spin_debug_clear(current_thread());
				}
#endif
				report_read_delay = override;
			}
		}

		/* Re-test: the override may have raised the ceiling above the latency. */
		if (__improbable((eabs - sabs) > report_read_delay)) {
			if (phy_read_panic && (machine_timeout_suspended() == FALSE)) {
#if defined(__x86_64__)
				panic_notify();
#endif /* defined(__x86_64__) */
				uint64_t nsec = 0;
				absolutetime_to_nanoseconds(eabs - sabs, &nsec);
				panic("Read from IO vaddr 0x%lx paddr 0x%lx took %llu ns, "
				    "result: 0x%llx (start: %llu, end: %llu), ceiling: %llu",
				    vaddr, paddr, nsec, result, sabs, eabs,
				    report_read_delay);
			}

			(void)ml_set_interrupts_enabled(istate);

			if (report_phy_read_osbt) {
				uint64_t nsec = 0;
				absolutetime_to_nanoseconds(eabs - sabs, &nsec);
				OSReportWithBacktrace("ml_io_read(v=%p, p=%p) size %d result 0x%llx "
				    "took %lluus",
				    (void *)vaddr, (void *)paddr, size, result,
				    nsec / NSEC_PER_USEC);
			}
			DTRACE_PHYSLAT5(physioread, uint64_t, (eabs - sabs),
			    uint64_t, vaddr, uint32_t, size, uint64_t, paddr, uint64_t, result);
		} else if (__improbable(trace_phy_read_delay > 0 && (eabs - sabs) > trace_phy_read_delay)) {
			/* Below the report threshold but above the trace threshold: kdebug only. */
			KDBG(MACHDBG_CODE(DBG_MACH_IO, DBC_MACH_IO_MMIO_READ),
			    (eabs - sabs), VM_KERNEL_UNSLIDE_OR_PERM(vaddr), paddr, result);

			(void)ml_set_interrupts_enabled(istate);
		} else {
			(void)ml_set_interrupts_enabled(istate);
		}
	}
#endif /* ML_IO_TIMEOUTS_ENABLED */
	return result;
}
959
960 unsigned int
ml_io_read8(uintptr_t vaddr)961 ml_io_read8(uintptr_t vaddr)
962 {
963 return (unsigned) ml_io_read(vaddr, 1);
964 }
965
966 unsigned int
ml_io_read16(uintptr_t vaddr)967 ml_io_read16(uintptr_t vaddr)
968 {
969 return (unsigned) ml_io_read(vaddr, 2);
970 }
971
972 unsigned int
ml_io_read32(uintptr_t vaddr)973 ml_io_read32(uintptr_t vaddr)
974 {
975 return (unsigned) ml_io_read(vaddr, 4);
976 }
977
978 unsigned long long
ml_io_read64(uintptr_t vaddr)979 ml_io_read64(uintptr_t vaddr)
980 {
981 return ml_io_read(vaddr, 8);
982 }
983
984 /* ml_io_write* */
985
/*
 * Perform a timed, volatile MMIO write of `size` bytes (1/2/4/8) of `val`
 * to virtual address `vaddr`.
 *
 * When ML_IO_TIMEOUTS_ENABLED is configured, the write is timed with
 * interrupts disabled; writes exceeding the configured report threshold
 * may panic (if phy_write_panic is set), log a backtrace, and/or emit
 * DTrace/kdebug events. Writes exceeding only the trace threshold emit a
 * kdebug trace record.
 */
void
ml_io_write(uintptr_t vaddr, uint64_t val, int size)
{
#ifdef ML_IO_VERIFY_UNCACHEABLE
	/* Also asserts the mapping is non-cacheable while translating. */
	uintptr_t const paddr = pmap_verify_noncacheable(vaddr);
#elif defined(ML_IO_TIMEOUTS_ENABLED)
	/* Physical address is only needed for reporting/tracing below. */
	uintptr_t const paddr = kvtophys(vaddr);
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	uint64_t sabs, eabs;
	boolean_t istate, timewrite = FALSE;
	uint64_t report_write_delay;
#if __x86_64__
	report_write_delay = report_phy_write_delay;
#else
	report_write_delay = os_atomic_load(&report_phy_write_delay_to, relaxed);
	uint64_t trace_phy_write_delay = os_atomic_load(&trace_phy_write_delay_to, relaxed);
#endif /* !defined(__x86_64__) */
	/*
	 * Only pay for the measurement (interrupts off + timestamps) when a
	 * report threshold is configured. `istate` is only initialized here,
	 * and only consumed below when `timewrite` is TRUE.
	 */
	if (__improbable(report_write_delay != 0)) {
		istate = ml_set_interrupts_enabled(FALSE);
		sabs = mach_absolute_time();
		timewrite = TRUE;
	}

#ifdef ML_IO_SIMULATE_STRETCHED_ENABLED
	/* Test hook: back-date the start time to fake a slow write. */
	if (__improbable(timewrite && simulate_stretched_io)) {
		sabs -= simulate_stretched_io;
	}
#endif /* ML_IO_SIMULATE_STRETCHED_ENABLED */
#endif /* ML_IO_TIMEOUTS_ENABLED */

	/* The actual MMIO store, via a volatile access of the exact width. */
	switch (size) {
	case 1:
		*(volatile uint8_t *)vaddr = (uint8_t)val;
		break;
	case 2:
		*(volatile uint16_t *)vaddr = (uint16_t)val;
		break;
	case 4:
		*(volatile uint32_t *)vaddr = (uint32_t)val;
		break;
	case 8:
		*(volatile uint64_t *)vaddr = (uint64_t)val;
		break;
	default:
		panic("Invalid size %d for ml_io_write(%p, 0x%llx)", size, (void *)vaddr, val);
		break;
	}

#ifdef ML_IO_TIMEOUTS_ENABLED
	if (__improbable(timewrite == TRUE)) {
		eabs = mach_absolute_time();


		/* Prevent the processor from calling iotrace during its
		 * initialization procedure. */
		if (current_processor()->state == PROCESSOR_RUNNING) {
			iotrace(IOTRACE_IO_WRITE, vaddr, paddr, size, val, sabs, eabs - sabs);
		}


		if (__improbable((eabs - sabs) > report_write_delay)) {
			/* Check for a per-region timeout override before reporting. */
			uint64_t override = 0;
			override_io_timeouts(vaddr, NULL, &override);

			if (override != 0) {
#if SCHED_HYGIENE_DEBUG
				/*
				 * The IO timeout was overridden. As interrupts are disabled in
				 * order to accurately measure IO time this can cause the
				 * interrupt masked timeout threshold to be exceeded. If the
				 * interrupt masked debug mode is set to panic, abandon the
				 * measurement. If in trace mode leave it as-is for
				 * observability.
				 */
				if (interrupt_masked_debug_mode == SCHED_HYGIENE_MODE_PANIC) {
					ml_spin_debug_clear(current_thread());
				}
#endif
				report_write_delay = override;
			}
		}

		/* Re-check against the (possibly overridden) threshold. */
		if (__improbable((eabs - sabs) > report_write_delay)) {
			if (phy_write_panic && (machine_timeout_suspended() == FALSE)) {
#if defined(__x86_64__)
				panic_notify();
#endif /* defined(__x86_64__) */

				uint64_t nsec = 0;
				absolutetime_to_nanoseconds(eabs - sabs, &nsec);
				panic("Write to IO vaddr %p paddr %p val 0x%llx took %llu ns,"
				    " (start: %llu, end: %llu), ceiling: %llu",
				    (void *)vaddr, (void *)paddr, val, nsec, sabs, eabs,
				    report_write_delay);
			}

			/* Re-enable interrupts before the (slow) backtrace report. */
			(void)ml_set_interrupts_enabled(istate);

			if (report_phy_write_osbt) {
				uint64_t nsec = 0;
				absolutetime_to_nanoseconds(eabs - sabs, &nsec);
				OSReportWithBacktrace("ml_io_write size %d (v=%p, p=%p, 0x%llx) "
				    "took %lluus",
				    size, (void *)vaddr, (void *)paddr, val, nsec / NSEC_PER_USEC);
			}
			DTRACE_PHYSLAT5(physiowrite, uint64_t, (eabs - sabs),
			    uint64_t, vaddr, uint32_t, size, uint64_t, paddr, uint64_t, val);
		} else if (__improbable(trace_phy_write_delay > 0 && (eabs - sabs) > trace_phy_write_delay)) {
			/* Slower than the trace threshold only: emit a kdebug record. */
			KDBG(MACHDBG_CODE(DBG_MACH_IO, DBC_MACH_IO_MMIO_WRITE),
			    (eabs - sabs), VM_KERNEL_UNSLIDE_OR_PERM(vaddr), paddr, val);

			(void)ml_set_interrupts_enabled(istate);
		} else {
			(void)ml_set_interrupts_enabled(istate);
		}
	}
#endif /* ML_IO_TIMEOUTS_ENABLED */
}
1106
1107 void
ml_io_write8(uintptr_t vaddr,uint8_t val)1108 ml_io_write8(uintptr_t vaddr, uint8_t val)
1109 {
1110 ml_io_write(vaddr, val, 1);
1111 }
1112
1113 void
ml_io_write16(uintptr_t vaddr,uint16_t val)1114 ml_io_write16(uintptr_t vaddr, uint16_t val)
1115 {
1116 ml_io_write(vaddr, val, 2);
1117 }
1118
1119 void
ml_io_write32(uintptr_t vaddr,uint32_t val)1120 ml_io_write32(uintptr_t vaddr, uint32_t val)
1121 {
1122 ml_io_write(vaddr, val, 4);
1123 }
1124
/*
 * Convenience wrapper around ml_io_write(): 8-byte MMIO write of `val`
 * to virtual address `vaddr`.
 */
void
ml_io_write64(uintptr_t vaddr, uint64_t val)
{
	ml_io_write(vaddr, val, 8);
}
1130
/*
 * One registered CPU event callback. Elements are allocated from
 * permanent zone memory and are never freed (unregistration is
 * unimplemented; see cpu_event_unregister_callback).
 */
struct cpu_callback_chain_elem {
	cpu_callback_t fn;      /* callback to invoke on a CPU event */
	void *param;            /* opaque argument passed back to fn */
	struct cpu_callback_chain_elem *next; /* next registered callback */
};

/*
 * Head of the callback list. Writers serialize on
 * cpu_callback_chain_lock; readers (ml_broadcast_cpu_event) traverse
 * lock-free via a dependency-ordered atomic load.
 */
static struct cpu_callback_chain_elem *cpu_callback_chain;
static LCK_GRP_DECLARE(cpu_callback_chain_lock_grp, "cpu_callback_chain");
static LCK_SPIN_DECLARE(cpu_callback_chain_lock, &cpu_callback_chain_lock_grp);
1140
/*
 * Register `fn` to be invoked (with `param`) for every CPU event
 * broadcast via ml_broadcast_cpu_event(). The registration is permanent:
 * the element is allocated from permanent zone memory and there is no
 * working unregistration path.
 */
void
cpu_event_register_callback(cpu_callback_t fn, void *param)
{
	struct cpu_callback_chain_elem *new_elem;

	new_elem = zalloc_permanent_type(struct cpu_callback_chain_elem);
	if (!new_elem) {
		panic("can't allocate cpu_callback_chain_elem");
	}

	lck_spin_lock(&cpu_callback_chain_lock);
	/*
	 * Fully initialize the element before publishing it: the release
	 * store pairs with the dependency-ordered load in
	 * ml_broadcast_cpu_event, which reads the list without the lock.
	 */
	new_elem->next = cpu_callback_chain;
	new_elem->fn = fn;
	new_elem->param = param;
	os_atomic_store(&cpu_callback_chain, new_elem, release);
	lck_spin_unlock(&cpu_callback_chain_lock);
}
1158
/*
 * Unregistration is intentionally unsupported: elements come from
 * permanent zone memory and readers traverse the chain lock-free, so
 * removal would require additional synchronization. Always panics.
 */
__attribute__((noreturn))
void
cpu_event_unregister_callback(__unused cpu_callback_t fn)
{
	panic("Unfortunately, cpu_event_unregister_callback is unimplemented.");
}
1165
1166 void
ml_broadcast_cpu_event(enum cpu_event event,unsigned int cpu_or_cluster)1167 ml_broadcast_cpu_event(enum cpu_event event, unsigned int cpu_or_cluster)
1168 {
1169 struct cpu_callback_chain_elem *cursor;
1170
1171 cursor = os_atomic_load(&cpu_callback_chain, dependency);
1172 for (; cursor != NULL; cursor = cursor->next) {
1173 cursor->fn(cursor->param, event, cpu_or_cluster);
1174 }
1175 }
1176
1177 // Initialize Machine Timeouts (see the MACHINE_TIMEOUT macro
1178 // definition)
1179
1180 void
machine_timeout_init_with_suffix(const struct machine_timeout_spec * spec,char const * suffix)1181 machine_timeout_init_with_suffix(const struct machine_timeout_spec *spec, char const *suffix)
1182 {
1183 if (spec->skip_predicate != NULL && spec->skip_predicate(spec)) {
1184 // This timeout should be disabled.
1185 os_atomic_store_wide((uint64_t*)spec->ptr, 0, relaxed);
1186 return;
1187 }
1188
1189 assert(suffix != NULL);
1190 assert(strlen(spec->name) <= MACHINE_TIMEOUT_MAX_NAME_LEN);
1191
1192 size_t const suffix_len = strlen(suffix);
1193
1194 size_t const dt_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + suffix_len + 1;
1195 char dt_name[dt_name_size];
1196
1197 strlcpy(dt_name, spec->name, dt_name_size);
1198 strlcat(dt_name, suffix, dt_name_size);
1199
1200 size_t const scale_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + suffix_len + strlen("-scale") + 1;
1201 char scale_name[scale_name_size];
1202
1203 strlcpy(scale_name, spec->name, scale_name_size);
1204 strlcat(scale_name, suffix, scale_name_size);
1205 strlcat(scale_name, "-scale", scale_name_size);
1206
1207 size_t const boot_arg_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + strlen("ml-timeout-") + suffix_len + 1;
1208 char boot_arg_name[boot_arg_name_size];
1209
1210 strlcpy(boot_arg_name, "ml-timeout-", boot_arg_name_size);
1211 strlcat(boot_arg_name, spec->name, boot_arg_name_size);
1212 strlcat(boot_arg_name, suffix, boot_arg_name_size);
1213
1214 size_t const boot_arg_scale_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN +
1215 strlen("ml-timeout-") + strlen("-scale") + suffix_len + 1;
1216 char boot_arg_scale_name[boot_arg_scale_name_size];
1217
1218 strlcpy(boot_arg_scale_name, "ml-timeout-", boot_arg_scale_name_size);
1219 strlcat(boot_arg_scale_name, spec->name, boot_arg_scale_name_size);
1220 strlcat(boot_arg_scale_name, suffix, boot_arg_name_size);
1221 strlcat(boot_arg_scale_name, "-scale", boot_arg_scale_name_size);
1222
1223
1224 /*
1225 * Determine base value from DT and boot-args.
1226 */
1227
1228 DTEntry base, chosen;
1229
1230 if (SecureDTLookupEntry(NULL, "/machine-timeouts", &base) != kSuccess) {
1231 base = NULL;
1232 }
1233
1234 if (SecureDTLookupEntry(NULL, "/chosen/machine-timeouts", &chosen) != kSuccess) {
1235 chosen = NULL;
1236 }
1237
1238 uint64_t timeout = spec->default_value;
1239 bool found = false;
1240
1241 uint64_t const *data = NULL;
1242 unsigned int data_size = sizeof(*data);
1243
1244 /* First look in /machine-timeouts/<name> */
1245 if (base != NULL && SecureDTGetProperty(base, dt_name, (const void **)&data, &data_size) == kSuccess) {
1246 if (data_size != sizeof(*data)) {
1247 panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/%s", __func__, data_size, dt_name);
1248 }
1249
1250 timeout = *data;
1251 found = true;
1252 }
1253
1254 /* A value in /chosen/machine-timeouts/<name> overrides */
1255 if (chosen != NULL && SecureDTGetProperty(chosen, dt_name, (const void **)&data, &data_size) == kSuccess) {
1256 if (data_size != sizeof(*data)) {
1257 panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/%s", __func__, data_size, dt_name);
1258 }
1259
1260 timeout = *data;
1261 found = true;
1262 }
1263
1264 /* A boot-arg ml-timeout-<name> overrides */
1265 uint64_t boot_arg = 0;
1266
1267 if (PE_parse_boot_argn(boot_arg_name, &boot_arg, sizeof(boot_arg))) {
1268 timeout = boot_arg;
1269 found = true;
1270 }
1271
1272
1273 /*
1274 * Determine scale value from DT and boot-args.
1275 */
1276
1277 uint32_t scale = 1;
1278 uint32_t const *scale_data;
1279 unsigned int scale_size = sizeof(scale_data);
1280
1281 /* If there is a scale factor /machine-timeouts/<name>-scale,
1282 * apply it. */
1283 if (base != NULL && SecureDTGetProperty(base, scale_name, (const void **)&scale_data, &scale_size) == kSuccess) {
1284 if (scale_size != sizeof(*scale_data)) {
1285 panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/%s-scale", __func__, scale_size, dt_name);
1286 }
1287
1288 scale *= *scale_data;
1289 }
1290
1291 /* If there is a scale factor /chosen/machine-timeouts/<name>-scale,
1292 * apply it as well. */
1293 if (chosen != NULL && SecureDTGetProperty(chosen, scale_name, (const void **)&scale_data, &scale_size) == kSuccess) {
1294 if (scale_size != sizeof(*scale_data)) {
1295 panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/%s-scale", __func__,
1296 scale_size, dt_name);
1297 }
1298
1299 scale *= *scale_data;
1300 }
1301
1302 /* Finally, a boot-arg ml-timeout-<name>-scale applies as well. */
1303 if (PE_parse_boot_argn(boot_arg_scale_name, &boot_arg, sizeof(boot_arg))) {
1304 scale *= boot_arg;
1305 }
1306
1307 static bool global_scale_set;
1308 static uint32_t global_scale;
1309
1310 if (!global_scale_set) {
1311 /* Apply /machine-timeouts/global-scale if present */
1312 if (SecureDTGetProperty(base, "global-scale", (const void **)&scale_data, &scale_size) == kSuccess) {
1313 if (scale_size != sizeof(*scale_data)) {
1314 panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/global-scale", __func__,
1315 scale_size);
1316 }
1317
1318 global_scale *= *scale_data;
1319 global_scale_set = true;
1320 }
1321
1322 /* Apply /chosen/machine-timeouts/global-scale if present */
1323 if (SecureDTGetProperty(chosen, "global-scale", (const void **)&scale_data, &scale_size) == kSuccess) {
1324 if (scale_size != sizeof(*scale_data)) {
1325 panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/global-scale", __func__,
1326 scale_size);
1327 }
1328
1329 global_scale *= *scale_data;
1330 global_scale_set = true;
1331 }
1332
1333 /* Finally, the boot-arg ml-timeout-global-scale applies */
1334 if (PE_parse_boot_argn("ml-timeout-global-scale", &boot_arg, sizeof(boot_arg))) {
1335 global_scale *= boot_arg;
1336 global_scale_set = true;
1337 }
1338 }
1339
1340 if (global_scale_set) {
1341 scale *= global_scale;
1342 }
1343
1344 /* Compute the final timeout, and done. */
1345 if (found && timeout > 0) {
1346 /* Only apply inherent unit scale if the value came in
1347 * externally. */
1348
1349 if (spec->unit_scale == MACHINE_TIMEOUT_UNIT_TIMEBASE) {
1350 uint64_t nanoseconds = timeout / 1000;
1351 nanoseconds_to_absolutetime(nanoseconds, &timeout);
1352 } else {
1353 timeout /= spec->unit_scale;
1354 }
1355
1356 if (timeout == 0) {
1357 /* Ensure unit scaling did not disable the timeout. */
1358 timeout = 1;
1359 }
1360 }
1361
1362 if (os_mul_overflow(timeout, scale, &timeout)) {
1363 timeout = UINT64_MAX; // clamp
1364 }
1365
1366 os_atomic_store_wide((uint64_t*)spec->ptr, timeout, relaxed);
1367 }
1368
/*
 * Initialize a machine timeout from its spec using the default (empty)
 * name suffix; see machine_timeout_init_with_suffix for the lookup and
 * scaling rules.
 */
void
machine_timeout_init(const struct machine_timeout_spec *spec)
{
	machine_timeout_init_with_suffix(spec, "");
}
1374
1375 /*
1376 * Late timeout (re-)initialization, at the end of bsd_init()
1377 */
/*
 * Re-initialize selected timeouts with the "-b" (late boot) suffix once
 * bsd_init() has finished, and reset the per-CPU preemption-disable
 * statistics so early-boot noise does not pollute them. A no-op unless
 * SCHED_HYGIENE_DEBUG is configured.
 */
void
machine_timeout_bsd_init(void)
{
	char const * const __unused mt_suffix = "-b";
#if SCHED_HYGIENE_DEBUG
	machine_timeout_init_with_suffix(MACHINE_TIMEOUT_SPEC_REF(interrupt_masked_timeout), mt_suffix);
	machine_timeout_init_with_suffix(MACHINE_TIMEOUT_SPEC_REF(sched_preemption_disable_threshold_mt), mt_suffix);

	/*
	 * The io timeouts can inherit from interrupt_masked_timeout.
	 * Re-initialize, as interrupt_masked_timeout may have changed.
	 */
	ml_io_init_timeouts();

	PERCPU_DECL(uint64_t _Atomic, preemption_disable_max_mt);

	/*
	 * Reset the preemption disable stats, so that they are not
	 * polluted by long early boot code.
	 */
	percpu_foreach(max_stat, preemption_disable_max_mt) {
		os_atomic_store(max_stat, 0, relaxed);

		/*
		 * No additional synchronization needed. The time when we
		 * switch to late boot timeouts is relatively arbitrary
		 * anyway: By now we don't expect any long preemption
		 * disabling anymore. While that is still a clear delineation
		 * for the boot CPU, other CPUs can be in the middle of doing
		 * whatever. So if the missing synchronization causes a new
		 * maximum to be missed on a secondary CPU, it could just as
		 * well have been missed by racing with this function.
		 */
	}

#endif
}
1415
1416 #if ML_IO_TIMEOUTS_ENABLED && CONFIG_XNUPOST
1417 #include <tests/xnupost.h>
1418
1419 extern kern_return_t ml_io_timeout_test(void);
1420
1421 static inline void
ml_io_timeout_test_get_timeouts(uintptr_t vaddr,uint64_t * read_timeout,uint64_t * write_timeout)1422 ml_io_timeout_test_get_timeouts(uintptr_t vaddr, uint64_t *read_timeout, uint64_t *write_timeout)
1423 {
1424 *read_timeout = 0;
1425 *write_timeout = 0;
1426
1427 boolean_t istate = ml_set_interrupts_enabled(FALSE);
1428 override_io_timeouts(vaddr, read_timeout, write_timeout);
1429 ml_set_interrupts_enabled(istate);
1430 }
1431
1432 kern_return_t
ml_io_timeout_test(void)1433 ml_io_timeout_test(void)
1434 {
1435 const size_t SIZE = 16;
1436 uintptr_t iovaddr_base1 = (uintptr_t)&ml_io_timeout_test;
1437 uintptr_t iovaddr_base2 = iovaddr_base1 + SIZE;
1438 uintptr_t vaddr1 = iovaddr_base1 + SIZE / 2;
1439 uintptr_t vaddr2 = iovaddr_base2 + SIZE / 2;
1440
1441 const uint64_t READ_TIMEOUT1_US = 50000, WRITE_TIMEOUT1_US = 50001;
1442 const uint64_t READ_TIMEOUT2_US = 50002, WRITE_TIMEOUT2_US = 50003;
1443 uint64_t read_timeout1_abs, write_timeout1_abs;
1444 uint64_t read_timeout2_abs, write_timeout2_abs;
1445 nanoseconds_to_absolutetime(NSEC_PER_USEC * READ_TIMEOUT1_US, &read_timeout1_abs);
1446 nanoseconds_to_absolutetime(NSEC_PER_USEC * WRITE_TIMEOUT1_US, &write_timeout1_abs);
1447 nanoseconds_to_absolutetime(NSEC_PER_USEC * READ_TIMEOUT2_US, &read_timeout2_abs);
1448 nanoseconds_to_absolutetime(NSEC_PER_USEC * WRITE_TIMEOUT2_US, &write_timeout2_abs);
1449
1450 int err = ml_io_increase_timeouts(iovaddr_base1, 0, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1451 T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for empty region");
1452
1453 err = ml_io_increase_timeouts(iovaddr_base1, 4097, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1454 T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for region > 4096 bytes");
1455
1456 err = ml_io_increase_timeouts(UINTPTR_MAX, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1457 T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for overflowed region");
1458
1459 err = ml_io_increase_timeouts(iovaddr_base1, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1460 T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for first VA region should succeed");
1461
1462 err = ml_io_increase_timeouts(iovaddr_base2, SIZE, READ_TIMEOUT2_US, WRITE_TIMEOUT2_US);
1463 T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for second VA region should succeed");
1464
1465 err = ml_io_increase_timeouts(iovaddr_base1, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1466 T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for same region twice");
1467
1468 err = ml_io_increase_timeouts(vaddr1, (uint32_t)(vaddr2 - vaddr1), READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1469 T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for overlapping regions");
1470
1471 uint64_t read_timeout, write_timeout;
1472 ml_io_timeout_test_get_timeouts(vaddr1, &read_timeout, &write_timeout);
1473 T_EXPECT_EQ_ULLONG(read_timeout, read_timeout1_abs, "Read timeout for first region");
1474 T_EXPECT_EQ_ULLONG(write_timeout, write_timeout1_abs, "Write timeout for first region");
1475
1476 ml_io_timeout_test_get_timeouts(vaddr2, &read_timeout, &write_timeout);
1477 T_EXPECT_EQ_ULLONG(read_timeout, read_timeout2_abs, "Read timeout for first region");
1478 T_EXPECT_EQ_ULLONG(write_timeout, write_timeout2_abs, "Write timeout for first region");
1479
1480 ml_io_timeout_test_get_timeouts(iovaddr_base2 + SIZE, &read_timeout, &write_timeout);
1481 T_EXPECT_EQ_ULLONG(read_timeout, 0, "Read timeout without override");
1482 T_EXPECT_EQ_ULLONG(write_timeout, 0, "Write timeout without override");
1483
1484 err = ml_io_reset_timeouts(iovaddr_base1 + 1, SIZE - 1);
1485 T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for subregion");
1486
1487 err = ml_io_reset_timeouts(iovaddr_base2 + SIZE, SIZE);
1488 T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for non-existent region");
1489
1490 err = ml_io_reset_timeouts(iovaddr_base1, SIZE);
1491 T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for first VA region should succeed");
1492
1493 ml_io_timeout_test_get_timeouts(vaddr1, &read_timeout, &write_timeout);
1494 T_EXPECT_EQ_ULLONG(read_timeout, 0, "Read timeout for reset region");
1495 T_EXPECT_EQ_ULLONG(write_timeout, 0, "Write timeout for reset region");
1496
1497 err = ml_io_reset_timeouts(iovaddr_base1, SIZE);
1498 T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for same region twice");
1499
1500 err = ml_io_reset_timeouts(iovaddr_base2, SIZE);
1501 T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for second VA region should succeed");
1502
1503 return KERN_SUCCESS;
1504 }
1505 #endif /* CONFIG_XNUPOST */
1506