xref: /xnu-10063.121.3/osfmk/kern/machine.c (revision 2c2f96dc2b9a4408a43d3150ae9c105355ca3daa)
1 /*
2  * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	kern/machine.c
60  *	Author:	Avadis Tevanian, Jr.
61  *	Date:	1987
62  *
63  *	Support for machine independent machine abstraction.
64  */
65 
66 #include <string.h>
67 
68 #include <mach/mach_types.h>
69 #include <mach/boolean.h>
70 #include <mach/kern_return.h>
71 #include <mach/machine.h>
72 #include <mach/host_info.h>
73 #include <mach/host_reboot.h>
74 #include <mach/host_priv_server.h>
75 #include <mach/processor_server.h>
76 #include <mach/sdt.h>
77 
78 #include <kern/kern_types.h>
79 #include <kern/cpu_data.h>
80 #include <kern/ipc_host.h>
81 #include <kern/host.h>
82 #include <kern/machine.h>
83 #include <kern/misc_protos.h>
84 #include <kern/percpu.h>
85 #include <kern/processor.h>
86 #include <kern/queue.h>
87 #include <kern/sched.h>
88 #include <kern/startup.h>
89 #include <kern/task.h>
90 #include <kern/thread.h>
91 #include <kern/iotrace.h>
92 
93 #include <libkern/OSDebug.h>
94 #if ML_IO_TIMEOUTS_ENABLED
95 #include <libkern/tree.h>
96 #endif
97 
98 #include <pexpert/device_tree.h>
99 
100 #include <machine/commpage.h>
101 #include <machine/machine_routines.h>
102 
103 #if HIBERNATION
104 #include <IOKit/IOHibernatePrivate.h>
105 #endif
106 #include <IOKit/IOPlatformExpert.h>
107 
108 #if CONFIG_DTRACE
109 extern void (*dtrace_cpu_state_changed_hook)(int, boolean_t);
110 #endif
111 
112 #if defined(__arm64__)
113 extern void wait_while_mp_kdp_trap(bool check_SIGPdebug);
114 #if CONFIG_SPTM
115 #include <arm64/sptm/pmap/pmap_data.h>
116 #else
117 #include <arm/pmap/pmap_data.h>
118 #endif /* CONFIG_SPTM */
119 #endif /* defined(__arm64__) */
120 
121 #if defined(__x86_64__)
122 #include <i386/panic_notify.h>
123 #endif
124 
125 #if ML_IO_TIMEOUTS_ENABLED
126 #if defined(__x86_64__)
127 #define ml_io_timestamp mach_absolute_time
128 #else
129 #define ml_io_timestamp ml_get_timebase
130 #endif /* __x86_64__ */
131 #endif /* ML_IO_TIMEOUTS_ENABLED */
132 
133 /*
134  *	Exported variables:
135  */
136 
137 struct machine_info     machine_info;
138 
139 /* Forwards */
140 static void
141 processor_doshutdown(processor_t processor);
142 
143 static void
144 processor_offline(void * parameter, __unused wait_result_t result);
145 
146 static void
147 processor_offline_intstack(processor_t processor) __dead2;
148 
149 static void
150 processor_up_update_counts(processor_t processor)
151 {
152 	ml_cpu_up_update_counts(processor->cpu_id);
153 
154 	os_atomic_inc(&processor_avail_count, relaxed);
155 	if (processor->is_recommended) {
156 		os_atomic_inc(&processor_avail_count_user, relaxed);
157 	}
158 	if (processor->processor_primary == processor) {
159 		os_atomic_inc(&primary_processor_avail_count, relaxed);
160 		if (processor->is_recommended) {
161 			os_atomic_inc(&primary_processor_avail_count_user, relaxed);
162 		}
163 	}
164 	commpage_update_active_cpus();
165 }
166 
167 /*
168  *	processor_up:
169  *
170  *	Flag processor as up and running, and available
171  *	for scheduling.
172  */
173 void
174 processor_up(
175 	processor_t                     processor)
176 {
177 	processor_set_t         pset;
178 	spl_t                           s;
179 
180 	s = splsched();
181 	init_ast_check(processor);
182 
183 #if defined(__arm64__)
184 	/*
185 	 * A processor coming online won't have received a SIGPdebug signal
186 	 * to cause it to spin while a stackshot or panic is taking place,
187 	 * so spin here on mp_kdp_trap.
188 	 *
189 	 * However, since cpu_signal() is not yet enabled for this processor,
190 	 * there is a race if we have just passed this when a cpu_signal()
191 	 * is attempted.  The sender will assume the cpu is offline, so it will
192 	 * not end up spinning anywhere.  See processor_offline() for the fix
193 	 * for this race.
194 	 */
195 	wait_while_mp_kdp_trap(false);
196 #endif
197 
198 	pset = processor->processor_set;
199 	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
200 	pset_lock(pset);
201 
202 	++pset->online_processor_count;
203 	simple_lock(&processor->start_state_lock, LCK_GRP_NULL);
204 	pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
205 	simple_unlock(&processor->start_state_lock);
206 	bool temporary = processor->shutdown_temporary;
207 	if (temporary) {
208 		processor->shutdown_temporary = false;
209 	} else {
210 		processor_up_update_counts(processor);
211 	}
212 	if (processor->is_recommended) {
213 		SCHED(pset_made_schedulable)(processor, pset, false);
214 	}
215 	pset_unlock(pset);
216 	ml_cpu_up();
217 	smr_cpu_up(processor, SMR_CPU_REASON_OFFLINE);
218 	sched_mark_processor_online_locked(processor, processor->last_startup_reason);
219 	simple_unlock(&sched_available_cores_lock);
220 	splx(s);
221 
222 	thread_wakeup((event_t)&processor->state);
223 
224 #if CONFIG_DTRACE
225 	if (dtrace_cpu_state_changed_hook) {
226 		(*dtrace_cpu_state_changed_hook)(processor->cpu_id, TRUE);
227 	}
228 #endif
229 }
230 #include <atm/atm_internal.h>
231 
232 kern_return_t
233 host_reboot(
234 	host_priv_t             host_priv,
235 	int                             options)
236 {
237 	if (host_priv == HOST_PRIV_NULL) {
238 		return KERN_INVALID_HOST;
239 	}
240 
241 #if DEVELOPMENT || DEBUG
242 	if (options & HOST_REBOOT_DEBUGGER) {
243 		Debugger("Debugger");
244 		return KERN_SUCCESS;
245 	}
246 #endif
247 
248 	if (options & HOST_REBOOT_UPSDELAY) {
249 		// UPS power cutoff path
250 		PEHaltRestart( kPEUPSDelayHaltCPU );
251 	} else {
252 		halt_all_cpus(!(options & HOST_REBOOT_HALT));
253 	}
254 
255 	return KERN_SUCCESS;
256 }
257 
258 kern_return_t
259 processor_assign(
260 	__unused processor_t            processor,
261 	__unused processor_set_t        new_pset,
262 	__unused boolean_t              wait)
263 {
264 	return KERN_FAILURE;
265 }
266 
267 static void
268 processor_down_update_counts(processor_t processor)
269 {
270 	ml_cpu_down_update_counts(processor->cpu_id);
271 
272 	os_atomic_dec(&processor_avail_count, relaxed);
273 	if (processor->is_recommended) {
274 		os_atomic_dec(&processor_avail_count_user, relaxed);
275 	}
276 	if (processor->processor_primary == processor) {
277 		os_atomic_dec(&primary_processor_avail_count, relaxed);
278 		if (processor->is_recommended) {
279 			os_atomic_dec(&primary_processor_avail_count_user, relaxed);
280 		}
281 	}
282 	commpage_update_active_cpus();
283 }
284 
285 extern lck_mtx_t processor_updown_lock;
286 
287 kern_return_t
288 processor_shutdown(
289 	processor_t                     processor,
290 	processor_reason_t              reason,
291 	uint32_t                        flags)
292 {
293 	if (!ml_cpu_can_exit(processor->cpu_id, reason)) {
294 		/*
295 		 * Failure if disallowed by arch code.
296 		 */
297 		return KERN_NOT_SUPPORTED;
298 	}
299 
300 	lck_mtx_lock(&processor_updown_lock);
301 
302 	spl_t s = splsched();
303 	processor_set_t pset = processor->processor_set;
304 
305 	pset_lock(pset);
306 
307 	if (processor->state == PROCESSOR_START) {
308 		pset_unlock(pset);
309 		splx(s);
310 
311 		processor_wait_for_start(processor);
312 
313 		s = splsched();
314 		pset_lock(pset);
315 	}
316 
317 	/*
318 	 * If the processor is dispatching, let it finish.
319 	 */
320 	while (processor->state == PROCESSOR_DISPATCHING) {
321 		pset_unlock(pset);
322 		splx(s);
323 		delay(1);
324 		s = splsched();
325 		pset_lock(pset);
326 	}
327 	pset_unlock(pset);
328 	splx(s);
329 
330 	kern_return_t mark_ret = sched_mark_processor_offline(processor, reason);
331 	if (mark_ret != KERN_SUCCESS) {
332 		/* Must fail or we deadlock */
333 		lck_mtx_unlock(&processor_updown_lock);
334 		return KERN_FAILURE;
335 	}
336 
337 	ml_cpu_begin_state_transition(processor->cpu_id);
338 	s = splsched();
339 
340 	pset_lock(pset);
341 	if (processor->state == PROCESSOR_OFF_LINE) {
342 		/*
343 		 * Success if already shutdown.
344 		 */
345 		if (processor->shutdown_temporary && !(flags & SHUTDOWN_TEMPORARY)) {
346 			/* Convert a temporary shutdown into a permanent shutdown */
347 			processor->shutdown_temporary = false;
348 			processor_down_update_counts(processor);
349 		}
350 		pset_unlock(pset);
351 		splx(s);
352 		ml_cpu_end_state_transition(processor->cpu_id);
353 
354 		lck_mtx_unlock(&processor_updown_lock);
355 		return KERN_SUCCESS;
356 	}
357 
358 	if (processor->shutdown_locked && (reason != REASON_SYSTEM)) {
359 		/*
360 		 * Failure if processor is locked against shutdown.
361 		 */
362 		pset_unlock(pset);
363 		splx(s);
364 
365 		lck_mtx_unlock(&processor_updown_lock);
366 		return KERN_FAILURE;
367 	}
368 
369 	/*
370 	 * If the processor is dispatching, let it finish.
371 	 */
372 	while (processor->state == PROCESSOR_DISPATCHING) {
373 		pset_unlock(pset);
374 		splx(s);
375 		delay(1);
376 		s = splsched();
377 		pset_lock(pset);
378 	}
379 
380 	/*
381 	 * Success if already being shutdown with matching SHUTDOWN_TEMPORARY flag.
382 	 */
383 	if ((processor->state == PROCESSOR_SHUTDOWN) || (processor->state == PROCESSOR_PENDING_OFFLINE)) {
384 		bool success = (flags & SHUTDOWN_TEMPORARY) ? processor->shutdown_temporary : !processor->shutdown_temporary;
385 
386 		pset_unlock(pset);
387 		splx(s);
388 		ml_cpu_end_state_transition(processor->cpu_id);
389 
390 		lck_mtx_unlock(&processor_updown_lock);
391 		return success ? KERN_SUCCESS : KERN_FAILURE;
392 	}
393 
394 	ml_broadcast_cpu_event(CPU_EXIT_REQUESTED, processor->cpu_id);
395 	pset_update_processor_state(pset, processor, PROCESSOR_SHUTDOWN);
396 	processor->last_shutdown_reason = reason;
397 	if (flags & SHUTDOWN_TEMPORARY) {
398 		processor->shutdown_temporary = true;
399 	}
400 	pset_unlock(pset);
401 
402 	processor_doshutdown(processor);
403 	splx(s);
404 
405 	cpu_exit_wait(processor->cpu_id);
406 
407 	if (processor != master_processor) {
408 		s = splsched();
409 		pset_lock(pset);
410 		pset_update_processor_state(pset, processor, PROCESSOR_OFF_LINE);
411 		pset_unlock(pset);
412 		splx(s);
413 	}
414 
415 	ml_cpu_end_state_transition(processor->cpu_id);
416 	ml_broadcast_cpu_event(CPU_EXITED, processor->cpu_id);
417 	ml_cpu_power_disable(processor->cpu_id);
418 
419 	lck_mtx_unlock(&processor_updown_lock);
420 	return KERN_SUCCESS;
421 }
422 
423 /*
424  * Called with interrupts disabled.
425  */
426 static void
427 processor_doshutdown(
428 	processor_t processor)
429 {
430 	thread_t self = current_thread();
431 
432 	/*
433 	 *	Get onto the processor to shutdown
434 	 */
435 	processor_t prev = thread_bind(processor);
436 	thread_block(THREAD_CONTINUE_NULL);
437 
438 	/* interrupts still disabled */
439 	assert(ml_get_interrupts_enabled() == FALSE);
440 
441 	assert(processor == current_processor());
442 	assert(processor->state == PROCESSOR_SHUTDOWN);
443 
444 #if CONFIG_DTRACE
445 	if (dtrace_cpu_state_changed_hook) {
446 		(*dtrace_cpu_state_changed_hook)(processor->cpu_id, FALSE);
447 	}
448 #endif
449 
450 #if defined(__arm64__)
451 	/*
452 	 * Catch a processor going offline
453 	 * while a panic or stackshot is in progress, as it won't
454 	 * receive a SIGPdebug now that interrupts are disabled.
455 	 */
456 	wait_while_mp_kdp_trap(false);
457 #endif
458 
459 	smr_cpu_down(processor, SMR_CPU_REASON_OFFLINE);
460 	ml_cpu_down();
461 
462 #if HIBERNATION
463 	if (processor_avail_count < 2) {
464 		hibernate_vm_lock();
465 		hibernate_vm_unlock();
466 	}
467 #endif
468 
469 	processor_set_t pset = processor->processor_set;
470 
471 	pset_lock(pset);
472 	pset_update_processor_state(pset, processor, PROCESSOR_PENDING_OFFLINE);
473 	--pset->online_processor_count;
474 	if (!processor->shutdown_temporary) {
475 		processor_down_update_counts(processor);
476 	}
477 	SCHED(processor_queue_shutdown)(processor);
478 	/* pset lock dropped */
479 	SCHED(rt_queue_shutdown)(processor);
480 
481 	thread_bind(prev);
482 
483 	/* interrupts still disabled */
484 
485 	/*
486 	 * Continue processor shutdown on the processor's idle thread.
487 	 * The handoff won't fail because the idle thread has a reserved stack.
488 	 * Switching to the idle thread leaves interrupts disabled,
489 	 * so we can't accidentally take an interrupt after the context switch.
490 	 */
491 	thread_t shutdown_thread = processor->idle_thread;
492 	shutdown_thread->continuation = processor_offline;
493 	shutdown_thread->parameter = processor;
494 
495 	thread_run(self, NULL, NULL, shutdown_thread);
496 }
497 
498 /*
499  * Called in the context of the idle thread to shut down the processor
500  *
501  * A shut-down processor looks like it's 'running' the idle thread parked
502  * in this routine, but it's actually been powered off and has no hardware state.
503  */
504 static void
505 processor_offline(
506 	void * parameter,
507 	__unused wait_result_t result)
508 {
509 	processor_t processor = (processor_t) parameter;
510 	thread_t self = current_thread();
511 	__assert_only thread_t old_thread = THREAD_NULL;
512 
513 	assert(processor == current_processor());
514 	assert(self->state & TH_IDLE);
515 	assert(processor->idle_thread == self);
516 	assert(ml_get_interrupts_enabled() == FALSE);
517 	assert(self->continuation == NULL);
518 	assert(processor->processor_offlined == false);
519 	assert(processor->running_timers_active == false);
520 
521 	bool enforce_quiesce_safety = gEnforcePlatformActionSafety;
522 
523 	/*
524 	 * Scheduling is now disabled for this processor.
525 	 * Ensure that primitives that need scheduling (like mutexes) know this.
526 	 */
527 	if (enforce_quiesce_safety) {
528 		disable_preemption_without_measurements();
529 	}
530 
531 	/* convince slave_main to come back here */
532 	processor->processor_offlined = true;
533 
534 	/*
535 	 * Switch to the interrupt stack and shut down the processor.
536 	 *
537 	 * When the processor comes back, it will eventually call load_context which
538 	 * restores the context saved by machine_processor_shutdown, returning here.
539 	 */
540 	old_thread = machine_processor_shutdown(self, processor_offline_intstack, processor);
541 
542 	/* old_thread should be NULL because we got here through Load_context */
543 	assert(old_thread == THREAD_NULL);
544 
545 	assert(processor == current_processor());
546 	assert(processor->idle_thread == current_thread());
547 
548 	assert(ml_get_interrupts_enabled() == FALSE);
549 	assert(self->continuation == NULL);
550 
551 	/* Extract the machine_param value stashed by slave_main */
552 	void * machine_param = self->parameter;
553 	self->parameter = NULL;
554 
555 	/* Re-initialize the processor */
556 	slave_machine_init(machine_param);
557 
558 	assert(processor->processor_offlined == true);
559 	processor->processor_offlined = false;
560 
561 	if (enforce_quiesce_safety) {
562 		enable_preemption();
563 	}
564 
565 #if defined(__arm64__)
566 	/*
567 	 * See the comments about the mp_kdp_trap / cpu_signal() race in processor_up().
568 	 *
569 	 * SIGPdisabled is cleared (to enable cpu_signal() to succeed with this processor)
570 	 * the first time we take an IPI.  This is triggered by slave_machine_init(), above,
571 	 * which calls cpu_machine_init()->PE_cpu_machine_init()->PE_cpu_signal() which sends
572 	 * a self-IPI to ensure that happens when we enable interrupts.  So enable interrupts
573 	 * here so that cpu_signal() can succeed before we spin on mp_kdp_trap.
574 	 */
575 	ml_set_interrupts_enabled(TRUE);
576 
577 	ml_set_interrupts_enabled(FALSE);
578 
579 	wait_while_mp_kdp_trap(true);
580 
581 	/*
582 	 * At this point,
583 	 * if a stackshot or panic is in progress, we either spin on mp_kdp_trap
584 	 * or we successfully received a SIGPdebug signal which will cause us to
585 	 * break out of the spin on mp_kdp_trap and instead
586 	 * spin next time interrupts are enabled in idle_thread().
587 	 */
588 #endif
589 
590 	/*
591 	 * Now that the processor is back, invoke the idle thread to find out what to do next.
592 	 * idle_thread will enable interrupts.
593 	 */
594 	thread_block(idle_thread);
595 	/*NOTREACHED*/
596 }
597 
598 /*
599  * Complete the shutdown and place the processor offline.
600  *
601  * Called at splsched in the shutdown context
602  * (i.e. on the idle thread, on the interrupt stack)
603  *
604  * The onlining half of this is done in load_context().
605  */
606 static void
607 processor_offline_intstack(
608 	processor_t processor)
609 {
610 	assert(processor == current_processor());
611 	assert(processor->active_thread == current_thread());
612 
613 	struct recount_snap snap = { 0 };
614 	recount_snapshot(&snap);
615 	recount_processor_idle(&processor->pr_recount, &snap);
616 
617 	smr_cpu_leave(processor, processor->last_dispatch);
618 
619 	PMAP_DEACTIVATE_KERNEL(processor->cpu_id);
620 
621 	cpu_sleep();
622 	panic("zombie processor");
623 	/*NOTREACHED*/
624 }
625 
626 kern_return_t
627 host_get_boot_info(
628 	host_priv_t         host_priv,
629 	kernel_boot_info_t  boot_info)
630 {
631 	const char *src = "";
632 	if (host_priv == HOST_PRIV_NULL) {
633 		return KERN_INVALID_HOST;
634 	}
635 
636 	/*
637 	 * Copy the operator-supplied boot string, terminated by '\0', followed
638 	 * by standardized strings generated from the boot string.
639 	 */
640 	src = machine_boot_info(boot_info, KERNEL_BOOT_INFO_MAX);
641 	if (src != boot_info) {
642 		(void) strncpy(boot_info, src, KERNEL_BOOT_INFO_MAX);
643 	}
644 
645 	return KERN_SUCCESS;
646 }
647 
648 // These are configured through sysctls.
649 #if DEVELOPMENT || DEBUG
650 uint32_t phy_read_panic = 1;
651 uint32_t phy_write_panic = 1;
652 uint64_t simulate_stretched_io = 0;
653 #else
654 uint32_t phy_read_panic = 0;
655 uint32_t phy_write_panic = 0;
656 #endif
657 
658 #if !defined(__x86_64__)
659 
660 #if DEVELOPMENT || DEBUG
661 static const uint64_t TIMEBASE_TICKS_PER_USEC = 24000000ULL / USEC_PER_SEC;
662 static const uint64_t DEFAULT_TRACE_PHY_TIMEOUT = 100 * TIMEBASE_TICKS_PER_USEC;
663 #else
664 static const uint64_t DEFAULT_TRACE_PHY_TIMEOUT = 0;
665 #endif
666 
667 // The MACHINE_TIMEOUT facility only exists on ARM.
668 MACHINE_TIMEOUT_DEV_WRITEABLE(report_phy_read_delay_to, "report-phy-read-delay", 0, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
669 MACHINE_TIMEOUT_DEV_WRITEABLE(report_phy_write_delay_to, "report-phy-write-delay", 0, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
670 MACHINE_TIMEOUT_DEV_WRITEABLE(trace_phy_read_delay_to, "trace-phy-read-delay", DEFAULT_TRACE_PHY_TIMEOUT, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
671 MACHINE_TIMEOUT_DEV_WRITEABLE(trace_phy_write_delay_to, "trace-phy-write-delay", DEFAULT_TRACE_PHY_TIMEOUT, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
672 
673 #if SCHED_HYGIENE_DEBUG
674 /*
675  * Note: The interrupt-masked timeout goes through two initializations - one
676  * early in boot and one later. Thus this function is also called twice and
677  * can't be marked '__startup_func'.
678  */
679 static void
680 ml_io_init_timeouts(void)
681 {
682 	/*
683 	 * The timeouts may be completely disabled via an override.
684 	 */
685 	if (kern_feature_override(KF_IO_TIMEOUT_OVRD)) {
686 		os_atomic_store(&report_phy_write_delay_to, 0, relaxed);
687 		os_atomic_store(&report_phy_read_delay_to, 0, relaxed);
688 		return;
689 	}
690 
691 	/*
692 	 * There may be no interrupt masked timeout set.
693 	 */
694 	const uint64_t interrupt_masked_to = os_atomic_load(&interrupt_masked_timeout, relaxed);
695 	if (interrupt_masked_to == 0) {
696 		return;
697 	}
698 
699 	/*
700 	 * Inherit from the interrupt masked timeout if smaller and the timeout
701 	 * hasn't been explicitly set via boot-arg.
702 	 */
703 	uint64_t arg = 0;
704 
705 	if (!PE_parse_boot_argn("ml-timeout-report-phy-read-delay", &arg, sizeof(arg))) {
706 		uint64_t report_phy_read_delay = os_atomic_load(&report_phy_read_delay_to, relaxed);
707 		report_phy_read_delay = report_phy_read_delay == 0 ?
708 		    interrupt_masked_to :
709 		    MIN(report_phy_read_delay, interrupt_masked_to);
710 		os_atomic_store(&report_phy_read_delay_to, report_phy_read_delay, relaxed);
711 	}
712 
713 	if (!PE_parse_boot_argn("ml-timeout-report-phy-write-delay", &arg, sizeof(arg))) {
714 		uint64_t report_phy_write_delay = os_atomic_load(&report_phy_write_delay_to, relaxed);
715 		report_phy_write_delay = report_phy_write_delay == 0 ?
716 		    interrupt_masked_to :
717 		    MIN(report_phy_write_delay, interrupt_masked_to);
718 		os_atomic_store(&report_phy_write_delay_to, report_phy_write_delay, relaxed);
719 	}
720 }
721 
722 /*
723  * It's important that this happens after machine timeouts have initialized so
724  * the correct timeouts can be inherited.
725  */
726 STARTUP(TIMEOUTS, STARTUP_RANK_SECOND, ml_io_init_timeouts);
727 #endif /* SCHED_HYGIENE_DEBUG */
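/*
 * Example: because ml_io_init_timeouts() only inherits from the
 * interrupt-masked timeout when no explicit value was supplied, a boot-arg
 * such as
 *
 *	ml-timeout-report-phy-read-delay=2400000
 *
 * pins the read-report threshold (a hypothetical 100ms at a 24MHz timebase)
 * and suppresses the inheritance above.  The write-side equivalent is
 * ml-timeout-report-phy-write-delay.
 */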
728 
729 extern pmap_paddr_t kvtophys(vm_offset_t va);
730 #endif
731 
732 #if ML_IO_TIMEOUTS_ENABLED
733 
734 static LCK_GRP_DECLARE(io_timeout_override_lock_grp, "io_timeout_override");
735 static LCK_SPIN_DECLARE(io_timeout_override_lock, &io_timeout_override_lock_grp);
736 
737 struct io_timeout_override_entry {
738 	RB_ENTRY(io_timeout_override_entry) tree;
739 
740 	uintptr_t iovaddr_base;
741 	unsigned int size;
742 	uint32_t read_timeout;
743 	uint32_t write_timeout;
744 };
745 
746 static inline int
747 io_timeout_override_cmp(const struct io_timeout_override_entry *a, const struct io_timeout_override_entry *b)
748 {
749 	if (a->iovaddr_base < b->iovaddr_base) {
750 		return -1;
751 	} else if (a->iovaddr_base > b->iovaddr_base) {
752 		return 1;
753 	} else {
754 		return 0;
755 	}
756 }
757 
758 static RB_HEAD(io_timeout_override, io_timeout_override_entry) io_timeout_override_root;
759 RB_PROTOTYPE_PREV(io_timeout_override, io_timeout_override_entry, tree, io_timeout_override_cmp);
760 RB_GENERATE_PREV(io_timeout_override, io_timeout_override_entry, tree, io_timeout_override_cmp);
761 
762 #endif /* ML_IO_TIMEOUTS_ENABLED */
763 
764 int
765 ml_io_increase_timeouts(uintptr_t iovaddr_base, unsigned int size, uint32_t read_timeout_us, uint32_t write_timeout_us)
766 {
767 #if ML_IO_TIMEOUTS_ENABLED
768 	const size_t MAX_SIZE = 4096;
769 	const uint64_t MAX_TIMEOUT_ABS = UINT32_MAX;
770 
771 	assert(preemption_enabled());
772 
773 	int ret = KERN_SUCCESS;
774 
775 	if (size == 0) {
776 		return KERN_INVALID_ARGUMENT;
777 	}
778 
779 	uintptr_t iovaddr_end;
780 	if (size > MAX_SIZE || os_add_overflow(iovaddr_base, size - 1, &iovaddr_end)) {
781 		return KERN_INVALID_ARGUMENT;
782 	}
783 
784 	uint64_t read_timeout_abs, write_timeout_abs;
785 	nanoseconds_to_absolutetime(NSEC_PER_USEC * read_timeout_us, &read_timeout_abs);
786 	nanoseconds_to_absolutetime(NSEC_PER_USEC * write_timeout_us, &write_timeout_abs);
787 	if (read_timeout_abs > MAX_TIMEOUT_ABS || write_timeout_abs > MAX_TIMEOUT_ABS) {
788 		return KERN_INVALID_ARGUMENT;
789 	}
790 
791 	struct io_timeout_override_entry *node = kalloc_type(struct io_timeout_override_entry, Z_WAITOK | Z_ZERO | Z_NOFAIL);
792 	node->iovaddr_base = iovaddr_base;
793 	node->size = size;
794 	node->read_timeout = (uint32_t)read_timeout_abs;
795 	node->write_timeout = (uint32_t)write_timeout_abs;
796 
797 	/*
798 	 * Interrupt handlers are allowed to call ml_io_{read,write}*, so
799 	 * interrupts must be disabled any time io_timeout_override_lock is
800 	 * held.  Otherwise the CPU could take an interrupt while holding the
801 	 * lock, invoke an ISR that calls ml_io_{read,write}*, and deadlock
802 	 * trying to acquire the lock again.
803 	 */
804 	boolean_t istate = ml_set_interrupts_enabled(FALSE);
805 	lck_spin_lock(&io_timeout_override_lock);
806 	if (RB_INSERT(io_timeout_override, &io_timeout_override_root, node)) {
807 		ret = KERN_INVALID_ARGUMENT;
808 		goto out;
809 	}
810 
811 	/* Check that this didn't create any new overlaps */
812 	struct io_timeout_override_entry *prev = RB_PREV(io_timeout_override, &io_timeout_override_root, node);
813 	if (prev && (prev->iovaddr_base + prev->size) > node->iovaddr_base) {
814 		RB_REMOVE(io_timeout_override, &io_timeout_override_root, node);
815 		ret = KERN_INVALID_ARGUMENT;
816 		goto out;
817 	}
818 	struct io_timeout_override_entry *next = RB_NEXT(io_timeout_override, &io_timeout_override_root, node);
819 	if (next && (node->iovaddr_base + node->size) > next->iovaddr_base) {
820 		RB_REMOVE(io_timeout_override, &io_timeout_override_root, node);
821 		ret = KERN_INVALID_ARGUMENT;
822 		goto out;
823 	}
824 
825 out:
826 	lck_spin_unlock(&io_timeout_override_lock);
827 	ml_set_interrupts_enabled(istate);
828 	if (ret != KERN_SUCCESS) {
829 		kfree_type(struct io_timeout_override_entry, node);
830 	}
831 	return ret;
832 #else /* !ML_IO_TIMEOUTS_ENABLED */
833 #pragma unused(iovaddr_base, size, read_timeout_us, write_timeout_us)
834 	return KERN_SUCCESS;
835 #endif
836 }
837 
838 int
839 ml_io_reset_timeouts(uintptr_t iovaddr_base, unsigned int size)
840 {
841 #if ML_IO_TIMEOUTS_ENABLED
842 	assert(preemption_enabled());
843 
844 	struct io_timeout_override_entry key = { .iovaddr_base = iovaddr_base };
845 
846 	boolean_t istate = ml_set_interrupts_enabled(FALSE);
847 	lck_spin_lock(&io_timeout_override_lock);
848 	struct io_timeout_override_entry *node = RB_FIND(io_timeout_override, &io_timeout_override_root, &key);
849 	if (node) {
850 		if (node->size == size) {
851 			RB_REMOVE(io_timeout_override, &io_timeout_override_root, node);
852 		} else {
853 			node = NULL;
854 		}
855 	}
856 	lck_spin_unlock(&io_timeout_override_lock);
857 	ml_set_interrupts_enabled(istate);
858 
859 	if (!node) {
860 		return KERN_NOT_FOUND;
861 	}
862 
863 	kfree_type(struct io_timeout_override_entry, node);
864 #else /* !ML_IO_TIMEOUTS_ENABLED */
865 #pragma unused(iovaddr_base, size)
866 #endif
867 	return KERN_SUCCESS;
868 }
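/*
 * Usage sketch for the two calls above (the device base, window size and
 * timeout values are hypothetical):
 *
 *	// Raise the read/write report thresholds to 200us for one 4KiB window.
 *	int kr = ml_io_increase_timeouts(dev_regs_base, 4096, 200, 200);
 *	assert(kr == KERN_SUCCESS);
 *	...
 *	// Later, drop the override again; base and size must match exactly.
 *	kr = ml_io_reset_timeouts(dev_regs_base, 4096);
 *	assert(kr == KERN_SUCCESS);
 */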
869 
870 #if ML_IO_TIMEOUTS_ENABLED
871 
872 static bool
873 override_io_timeouts_va(uintptr_t vaddr, uint64_t *read_timeout, uint64_t *write_timeout)
874 {
875 	assert(!ml_get_interrupts_enabled());
876 
877 	lck_spin_lock(&io_timeout_override_lock);
878 
879 	struct io_timeout_override_entry *node = RB_ROOT(&io_timeout_override_root);
880 	/* RB_FIND() doesn't support custom cmp functions, so we have to open-code our own */
881 	while (node) {
882 		if (node->iovaddr_base <= vaddr && vaddr < node->iovaddr_base + node->size) {
883 			if (read_timeout) {
884 				*read_timeout = node->read_timeout;
885 			}
886 			if (write_timeout) {
887 				*write_timeout = node->write_timeout;
888 			}
889 			lck_spin_unlock(&io_timeout_override_lock);
890 			return true;
891 		} else if (vaddr < node->iovaddr_base) {
892 			node = RB_LEFT(node, tree);
893 		} else {
894 			node = RB_RIGHT(node, tree);
895 		}
896 	}
897 	lck_spin_unlock(&io_timeout_override_lock);
898 
899 	return false;
900 }
901 
902 static bool
903 override_io_timeouts_pa(uint64_t paddr, uint64_t *read_timeout, uint64_t *write_timeout)
904 {
905 #if defined(__arm64__)
906 	/*
907 	 * PCIe regions are marked with PMAP_IO_RANGE_STRONG_SYNC. Apply a
908 	 * timeout greater than the PCIe completion timeout (50ms). In some
909 	 * cases those timeouts can stack so make the timeout significantly
910 	 * higher.
911 	 */
912 	#define STRONG_SYNC_TIMEOUT 1800000 /* 75ms */
913 
914 	pmap_io_range_t *range = pmap_find_io_attr(paddr);
915 	if (range != NULL && (range->wimg & PMAP_IO_RANGE_STRONG_SYNC) != 0) {
916 		if (read_timeout) {
917 			*read_timeout = STRONG_SYNC_TIMEOUT;
918 		}
919 		if (write_timeout) {
920 			*write_timeout = STRONG_SYNC_TIMEOUT;
921 		}
922 
923 		return true;
924 	}
925 #else
926 	(void)paddr;
927 	(void)read_timeout;
928 	(void)write_timeout;
929 #endif /* __arm64__ */
930 	return false;
931 }
932 
933 void
934 override_io_timeouts(uintptr_t vaddr, uint64_t paddr, uint64_t *read_timeout, uint64_t *write_timeout)
935 {
936 	if (vaddr != 0 &&
937 	    override_io_timeouts_va(vaddr, read_timeout, write_timeout)) {
938 		return;
939 	}
940 
941 	if (paddr != 0 &&
942 	    override_io_timeouts_pa(paddr, read_timeout, write_timeout)) {
943 		return;
944 	}
945 }
946 #endif /* ML_IO_TIMEOUTS_ENABLED */
947 
948 unsigned long long
949 ml_io_read(uintptr_t vaddr, int size)
950 {
951 	unsigned long long result = 0;
952 	unsigned char s1;
953 	unsigned short s2;
954 
955 #ifdef ML_IO_VERIFY_UNCACHEABLE
956 	uintptr_t const paddr = pmap_verify_noncacheable(vaddr);
957 #elif defined(ML_IO_TIMEOUTS_ENABLED)
958 	uintptr_t const paddr = kvtophys(vaddr);
959 #endif
960 
961 #ifdef ML_IO_TIMEOUTS_ENABLED
962 	uint64_t sabs, eabs;
963 	boolean_t istate, timeread = FALSE;
964 	uint64_t report_read_delay;
965 #if __x86_64__
966 	report_read_delay = report_phy_read_delay;
967 #else
968 	report_read_delay = os_atomic_load(&report_phy_read_delay_to, relaxed);
969 	uint64_t const trace_phy_read_delay = os_atomic_load(&trace_phy_read_delay_to, relaxed);
970 #endif /* __x86_64__ */
971 
972 	if (__improbable(report_read_delay != 0)) {
973 		istate = ml_set_interrupts_enabled(FALSE);
974 		sabs = ml_io_timestamp();
975 		timeread = TRUE;
976 	}
977 
978 #ifdef ML_IO_SIMULATE_STRETCHED_ENABLED
979 	if (__improbable(timeread && simulate_stretched_io)) {
980 		sabs -= simulate_stretched_io;
981 	}
982 #endif /* ML_IO_SIMULATE_STRETCHED_ENABLED */
983 #endif /* ML_IO_TIMEOUTS_ENABLED */
984 
985 #if DEVELOPMENT || DEBUG
986 	boolean_t use_fences = !kern_feature_override(KF_IO_TIMEOUT_OVRD);
987 	if (use_fences) {
988 		ml_timebase_to_memory_fence();
989 	}
990 #endif
991 
992 	switch (size) {
993 	case 1:
994 		s1 = *(volatile unsigned char *)vaddr;
995 		result = s1;
996 		break;
997 	case 2:
998 		s2 = *(volatile unsigned short *)vaddr;
999 		result = s2;
1000 		break;
1001 	case 4:
1002 		result = *(volatile unsigned int *)vaddr;
1003 		break;
1004 	case 8:
1005 		result = *(volatile unsigned long long *)vaddr;
1006 		break;
1007 	default:
1008 		panic("Invalid size %d for ml_io_read(%p)", size, (void *)vaddr);
1009 		break;
1010 	}
1011 
1012 #if DEVELOPMENT || DEBUG
1013 	if (use_fences) {
1014 		ml_memory_to_timebase_fence();
1015 	}
1016 #endif
1017 
1018 #ifdef ML_IO_TIMEOUTS_ENABLED
1019 	if (__improbable(timeread == TRUE)) {
1020 		eabs = ml_io_timestamp();
1021 
1022 		/* Prevent the processor from calling iotrace during its
1023 		 * initialization procedure. */
1024 		if (current_processor()->state == PROCESSOR_RUNNING) {
1025 			iotrace(IOTRACE_IO_READ, vaddr, paddr, size, result, sabs, eabs - sabs);
1026 		}
1027 
1028 		if (__improbable((eabs - sabs) > report_read_delay)) {
1029 			DTRACE_PHYSLAT5(physioread, uint64_t, (eabs - sabs),
1030 			    uint64_t, vaddr, uint32_t, size, uint64_t, paddr, uint64_t, result);
1031 
1032 			uint64_t override = 0;
1033 			override_io_timeouts(vaddr, paddr, &override, NULL);
1034 
1035 			if (override != 0) {
1036 #if SCHED_HYGIENE_DEBUG
1037 				/*
1038 				 * The IO timeout was overridden. As interrupts are disabled in
1039 				 * order to accurately measure IO time this can cause the
1040 				 * interrupt masked timeout threshold to be exceeded.  If the
1041 				 * interrupt masked debug mode is set to panic, abandon the
1042 				 * measurement. If in trace mode leave it as-is for
1043 				 * observability.
1044 				 */
1045 				if (interrupt_masked_debug_mode == SCHED_HYGIENE_MODE_PANIC) {
1046 					ml_spin_debug_clear(current_thread());
1047 					ml_irq_debug_abandon();
1048 				}
1049 #endif
1050 				report_read_delay = override;
1051 			}
1052 		}
1053 
1054 		if (__improbable((eabs - sabs) > report_read_delay)) {
1055 			if (phy_read_panic && (machine_timeout_suspended() == FALSE)) {
1056 #if defined(__x86_64__)
1057 				panic_notify();
1058 #endif /* defined(__x86_64__) */
1059 				uint64_t nsec = 0;
1060 				absolutetime_to_nanoseconds(eabs - sabs, &nsec);
1061 				panic("Read from IO vaddr 0x%lx paddr 0x%lx took %llu ns, "
1062 				    "result: 0x%llx (start: %llu, end: %llu), ceiling: %llu",
1063 				    vaddr, paddr, nsec, result, sabs, eabs,
1064 				    report_read_delay);
1065 			}
1066 		}
1067 
1068 		if (__improbable(trace_phy_read_delay > 0 && (eabs - sabs) > trace_phy_read_delay)) {
1069 			KDBG(MACHDBG_CODE(DBG_MACH_IO, DBC_MACH_IO_MMIO_READ),
1070 			    (eabs - sabs), VM_KERNEL_UNSLIDE_OR_PERM(vaddr), paddr, result);
1071 		}
1072 
1073 		(void)ml_set_interrupts_enabled(istate);
1074 	}
1075 #endif /*  ML_IO_TIMEOUTS_ENABLED */
1076 	return result;
1077 }
1078 
1079 unsigned int
1080 ml_io_read8(uintptr_t vaddr)
1081 {
1082 	return (unsigned) ml_io_read(vaddr, 1);
1083 }
1084 
1085 unsigned int
1086 ml_io_read16(uintptr_t vaddr)
1087 {
1088 	return (unsigned) ml_io_read(vaddr, 2);
1089 }
1090 
1091 unsigned int
1092 ml_io_read32(uintptr_t vaddr)
1093 {
1094 	return (unsigned) ml_io_read(vaddr, 4);
1095 }
1096 
1097 unsigned long long
1098 ml_io_read64(uintptr_t vaddr)
1099 {
1100 	return ml_io_read(vaddr, 8);
1101 }
1102 
1103 /* ml_io_write* */
1104 
1105 void
1106 ml_io_write(uintptr_t vaddr, uint64_t val, int size)
1107 {
1108 #ifdef ML_IO_VERIFY_UNCACHEABLE
1109 	uintptr_t const paddr = pmap_verify_noncacheable(vaddr);
1110 #elif defined(ML_IO_TIMEOUTS_ENABLED)
1111 	uintptr_t const paddr = kvtophys(vaddr);
1112 #endif
1113 
1114 #ifdef ML_IO_TIMEOUTS_ENABLED
1115 	uint64_t sabs, eabs;
1116 	boolean_t istate, timewrite = FALSE;
1117 	uint64_t report_write_delay;
1118 #if __x86_64__
1119 	report_write_delay = report_phy_write_delay;
1120 #else
1121 	report_write_delay = os_atomic_load(&report_phy_write_delay_to, relaxed);
1122 	uint64_t trace_phy_write_delay = os_atomic_load(&trace_phy_write_delay_to, relaxed);
1123 #endif /* !defined(__x86_64__) */
1124 	if (__improbable(report_write_delay != 0)) {
1125 		istate = ml_set_interrupts_enabled(FALSE);
1126 		sabs = ml_io_timestamp();
1127 		timewrite = TRUE;
1128 	}
1129 
1130 #ifdef ML_IO_SIMULATE_STRETCHED_ENABLED
1131 	if (__improbable(timewrite && simulate_stretched_io)) {
1132 		sabs -= simulate_stretched_io;
1133 	}
1134 #endif /* ML_IO_SIMULATE_STRETCHED_ENABLED */
1135 #endif /* ML_IO_TIMEOUTS_ENABLED */
1136 
1137 #if DEVELOPMENT || DEBUG
1138 	boolean_t use_fences = !kern_feature_override(KF_IO_TIMEOUT_OVRD);
1139 	if (use_fences) {
1140 		ml_timebase_to_memory_fence();
1141 	}
1142 #endif
1143 
1144 	switch (size) {
1145 	case 1:
1146 		*(volatile uint8_t *)vaddr = (uint8_t)val;
1147 		break;
1148 	case 2:
1149 		*(volatile uint16_t *)vaddr = (uint16_t)val;
1150 		break;
1151 	case 4:
1152 		*(volatile uint32_t *)vaddr = (uint32_t)val;
1153 		break;
1154 	case 8:
1155 		*(volatile uint64_t *)vaddr = (uint64_t)val;
1156 		break;
1157 	default:
1158 		panic("Invalid size %d for ml_io_write(%p, 0x%llx)", size, (void *)vaddr, val);
1159 		break;
1160 	}
1161 
1162 #if DEVELOPMENT || DEBUG
1163 	if (use_fences) {
1164 		ml_memory_to_timebase_fence();
1165 	}
1166 #endif
1167 
1168 #ifdef ML_IO_TIMEOUTS_ENABLED
1169 	if (__improbable(timewrite == TRUE)) {
1170 		eabs = ml_io_timestamp();
1171 
1172 
1173 		/* Prevent the processor from calling iotrace during its
1174 		 * initialization procedure. */
1175 		if (current_processor()->state == PROCESSOR_RUNNING) {
1176 			iotrace(IOTRACE_IO_WRITE, vaddr, paddr, size, val, sabs, eabs - sabs);
1177 		}
1178 
1179 
1180 		if (__improbable((eabs - sabs) > report_write_delay)) {
1181 			DTRACE_PHYSLAT5(physiowrite, uint64_t, (eabs - sabs),
1182 			    uint64_t, vaddr, uint32_t, size, uint64_t, paddr, uint64_t, val);
1183 
1184 			uint64_t override = 0;
1185 			override_io_timeouts(vaddr, paddr, NULL, &override);
1186 
1187 			if (override != 0) {
1188 #if SCHED_HYGIENE_DEBUG
1189 				/*
1190 				 * The IO timeout was overridden. As interrupts are disabled in
1191 				 * order to accurately measure IO time this can cause the
1192 				 * interrupt masked timeout threshold to be exceeded.  If the
1193 				 * interrupt masked debug mode is set to panic, abandon the
1194 				 * measurement. If in trace mode leave it as-is for
1195 				 * observability.
1196 				 */
1197 				if (interrupt_masked_debug_mode == SCHED_HYGIENE_MODE_PANIC) {
1198 					ml_spin_debug_clear(current_thread());
1199 					ml_irq_debug_abandon();
1200 				}
1201 #endif
1202 				report_write_delay = override;
1203 			}
1204 		}
1205 
1206 		if (__improbable((eabs - sabs) > report_write_delay)) {
1207 			if (phy_write_panic && (machine_timeout_suspended() == FALSE)) {
1208 #if defined(__x86_64__)
1209 				panic_notify();
1210 #endif /*  defined(__x86_64__) */
1211 
1212 				uint64_t nsec = 0;
1213 				absolutetime_to_nanoseconds(eabs - sabs, &nsec);
1214 				panic("Write to IO vaddr %p paddr %p val 0x%llx took %llu ns,"
1215 				    " (start: %llu, end: %llu), ceiling: %llu",
1216 				    (void *)vaddr, (void *)paddr, val, nsec, sabs, eabs,
1217 				    report_write_delay);
1218 			}
1219 		}
1220 
1221 		if (__improbable(trace_phy_write_delay > 0 && (eabs - sabs) > trace_phy_write_delay)) {
1222 			KDBG(MACHDBG_CODE(DBG_MACH_IO, DBC_MACH_IO_MMIO_WRITE),
1223 			    (eabs - sabs), VM_KERNEL_UNSLIDE_OR_PERM(vaddr), paddr, val);
1224 		}
1225 
1226 		(void)ml_set_interrupts_enabled(istate);
1227 	}
1228 #endif /* ML_IO_TIMEOUTS_ENABLED */
1229 }
1230 
1231 void
1232 ml_io_write8(uintptr_t vaddr, uint8_t val)
1233 {
1234 	ml_io_write(vaddr, val, 1);
1235 }
1236 
1237 void
1238 ml_io_write16(uintptr_t vaddr, uint16_t val)
1239 {
1240 	ml_io_write(vaddr, val, 2);
1241 }
1242 
1243 void
1244 ml_io_write32(uintptr_t vaddr, uint32_t val)
1245 {
1246 	ml_io_write(vaddr, val, 4);
1247 }
1248 
1249 void
1250 ml_io_write64(uintptr_t vaddr, uint64_t val)
1251 {
1252 	ml_io_write(vaddr, val, 8);
1253 }
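/*
 * Usage sketch for the sized accessors above (the mapping and register
 * offsets are hypothetical):
 *
 *	uint32_t status = ml_io_read32(dev_regs_base + STATUS_REG_OFFSET);
 *	ml_io_write32(dev_regs_base + CONTROL_REG_OFFSET, status | ENABLE_BIT);
 *
 * Going through these wrappers rather than dereferencing the mapping directly
 * provides the latency reporting, tracing and (where ML_IO_VERIFY_UNCACHEABLE
 * is defined) cacheability verification implemented in ml_io_read()/ml_io_write().
 */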
1254 
1255 struct cpu_callback_chain_elem {
1256 	cpu_callback_t                  fn;
1257 	void                            *param;
1258 	struct cpu_callback_chain_elem  *next;
1259 };
1260 
1261 static struct cpu_callback_chain_elem *cpu_callback_chain;
1262 static LCK_GRP_DECLARE(cpu_callback_chain_lock_grp, "cpu_callback_chain");
1263 static LCK_SPIN_DECLARE(cpu_callback_chain_lock, &cpu_callback_chain_lock_grp);
1264 
1265 void
1266 cpu_event_register_callback(cpu_callback_t fn, void *param)
1267 {
1268 	struct cpu_callback_chain_elem *new_elem;
1269 
1270 	new_elem = zalloc_permanent_type(struct cpu_callback_chain_elem);
1271 	if (!new_elem) {
1272 		panic("can't allocate cpu_callback_chain_elem");
1273 	}
1274 
1275 	lck_spin_lock(&cpu_callback_chain_lock);
1276 	new_elem->next = cpu_callback_chain;
1277 	new_elem->fn = fn;
1278 	new_elem->param = param;
1279 	os_atomic_store(&cpu_callback_chain, new_elem, release);
1280 	lck_spin_unlock(&cpu_callback_chain_lock);
1281 }
1282 
1283 __attribute__((noreturn))
1284 void
1285 cpu_event_unregister_callback(__unused cpu_callback_t fn)
1286 {
1287 	panic("Unfortunately, cpu_event_unregister_callback is unimplemented.");
1288 }
1289 
1290 void
1291 ml_broadcast_cpu_event(enum cpu_event event, unsigned int cpu_or_cluster)
1292 {
1293 	struct cpu_callback_chain_elem *cursor;
1294 
1295 	cursor = os_atomic_load(&cpu_callback_chain, dependency);
1296 	for (; cursor != NULL; cursor = cursor->next) {
1297 		cursor->fn(cursor->param, event, cpu_or_cluster);
1298 	}
1299 }
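/*
 * Registration sketch (the callback name and its body are hypothetical; note
 * that callbacks can never be unregistered, see cpu_event_unregister_callback()):
 *
 *	static void
 *	example_cpu_event_cb(void *param, enum cpu_event event, unsigned int cpu_or_cluster)
 *	{
 *		if (event == CPU_EXIT_REQUESTED) {
 *			// Quiesce per-CPU state before the CPU powers off.
 *		}
 *	}
 *
 *	cpu_event_register_callback(example_cpu_event_cb, NULL);
 */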
1300 
1301 // Initialize Machine Timeouts (see the MACHINE_TIMEOUT macro
1302 // definition)
1303 
1304 void
1305 machine_timeout_init_with_suffix(const struct machine_timeout_spec *spec, char const *suffix)
1306 {
1307 	if (spec->skip_predicate != NULL && spec->skip_predicate(spec)) {
1308 		// This timeout should be disabled.
1309 		os_atomic_store_wide((uint64_t*)spec->ptr, 0, relaxed);
1310 		return;
1311 	}
1312 
1313 	assert(suffix != NULL);
1314 	assert(strlen(spec->name) <= MACHINE_TIMEOUT_MAX_NAME_LEN);
1315 
1316 	size_t const suffix_len = strlen(suffix);
1317 
1318 	size_t const dt_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + suffix_len + 1;
1319 	char dt_name[dt_name_size];
1320 
1321 	strlcpy(dt_name, spec->name, dt_name_size);
1322 	strlcat(dt_name, suffix, dt_name_size);
1323 
1324 	size_t const scale_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + suffix_len + strlen("-scale") + 1;
1325 	char scale_name[scale_name_size];
1326 
1327 	strlcpy(scale_name, spec->name, scale_name_size);
1328 	strlcat(scale_name, suffix, scale_name_size);
1329 	strlcat(scale_name, "-scale", scale_name_size);
1330 
1331 	size_t const boot_arg_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + strlen("ml-timeout-") + suffix_len + 1;
1332 	char boot_arg_name[boot_arg_name_size];
1333 
1334 	strlcpy(boot_arg_name, "ml-timeout-", boot_arg_name_size);
1335 	strlcat(boot_arg_name, spec->name, boot_arg_name_size);
1336 	strlcat(boot_arg_name, suffix, boot_arg_name_size);
1337 
1338 	size_t const boot_arg_scale_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN +
1339 	    strlen("ml-timeout-") + strlen("-scale") + suffix_len + 1;
1340 	char boot_arg_scale_name[boot_arg_scale_name_size];
1341 
1342 	strlcpy(boot_arg_scale_name, "ml-timeout-", boot_arg_scale_name_size);
1343 	strlcat(boot_arg_scale_name, spec->name, boot_arg_scale_name_size);
1344 	strlcat(boot_arg_scale_name, suffix, boot_arg_scale_name_size);
1345 	strlcat(boot_arg_scale_name, "-scale", boot_arg_scale_name_size);
1346 
1347 
1348 	/*
1349 	 * Determine base value from DT and boot-args.
1350 	 */
1351 
1352 	DTEntry base, chosen;
1353 
1354 	if (SecureDTLookupEntry(NULL, "/machine-timeouts", &base) != kSuccess) {
1355 		base = NULL;
1356 	}
1357 
1358 	if (SecureDTLookupEntry(NULL, "/chosen/machine-timeouts", &chosen) != kSuccess) {
1359 		chosen = NULL;
1360 	}
1361 
1362 	uint64_t timeout = spec->default_value;
1363 	bool found = false;
1364 
1365 	uint64_t const *data = NULL;
1366 	unsigned int data_size = sizeof(*data);
1367 
1368 	/* First look in /machine-timeouts/<name> */
1369 	if (base != NULL && SecureDTGetProperty(base, dt_name, (const void **)&data, &data_size) == kSuccess) {
1370 		if (data_size != sizeof(*data)) {
1371 			panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/%s", __func__, data_size, dt_name);
1372 		}
1373 
1374 		timeout = *data;
1375 		found = true;
1376 	}
1377 
1378 	/* A value in /chosen/machine-timeouts/<name> overrides */
1379 	if (chosen != NULL && SecureDTGetProperty(chosen, dt_name, (const void **)&data, &data_size) == kSuccess) {
1380 		if (data_size != sizeof(*data)) {
1381 			panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/%s", __func__, data_size, dt_name);
1382 		}
1383 
1384 		timeout = *data;
1385 		found = true;
1386 	}
1387 
1388 	/* A boot-arg ml-timeout-<name> overrides */
1389 	uint64_t boot_arg = 0;
1390 
1391 	if (PE_parse_boot_argn(boot_arg_name, &boot_arg, sizeof(boot_arg))) {
1392 		timeout = boot_arg;
1393 		found = true;
1394 	}
1395 
1396 
1397 	/*
1398 	 * Determine scale value from DT and boot-args.
1399 	 */
1400 
1401 	uint64_t scale = 1;
1402 	uint32_t const *scale_data;
1403 	unsigned int scale_size = sizeof(*scale_data);
1404 
1405 	/* If there is a scale factor /machine-timeouts/<name>-scale, apply it. */
1406 	if (base != NULL && SecureDTGetProperty(base, scale_name, (const void **)&scale_data, &scale_size) == kSuccess) {
1407 		if (scale_size != sizeof(*scale_data)) {
1408 			panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/%s-scale", __func__, scale_size, dt_name);
1409 		}
1410 
1411 		scale = *scale_data;
1412 	}
1413 
1414 	/* If there is a scale factor /chosen/machine-timeouts/<name>-scale, use that. */
1415 	if (chosen != NULL && SecureDTGetProperty(chosen, scale_name, (const void **)&scale_data, &scale_size) == kSuccess) {
1416 		if (scale_size != sizeof(*scale_data)) {
1417 			panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/%s-scale", __func__,
1418 			    scale_size, dt_name);
1419 		}
1420 
1421 		scale = *scale_data;
1422 	}
1423 
1424 	/* Finally, a boot-arg ml-timeout-<name>-scale takes precedence. */
1425 	if (PE_parse_boot_argn(boot_arg_scale_name, &boot_arg, sizeof(boot_arg))) {
1426 		scale = boot_arg;
1427 	}
1428 
1429 	static bool global_scale_set;
1430 	static uint64_t global_scale;
1431 
1432 	if (!global_scale_set) {
1433 		/* Apply /machine-timeouts/global-scale if present */
1434 		if (base != NULL && SecureDTGetProperty(base, "global-scale", (const void **)&scale_data, &scale_size) == kSuccess) {
1435 			if (scale_size != sizeof(*scale_data)) {
1436 				panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/global-scale", __func__,
1437 				    scale_size);
1438 			}
1439 
1440 			global_scale = *scale_data;
1441 			global_scale_set = true;
1442 		}
1443 
1444 		/* Use /chosen/machine-timeouts/global-scale if present */
1445 		if (chosen != NULL && SecureDTGetProperty(chosen, "global-scale", (const void **)&scale_data, &scale_size) == kSuccess) {
1446 			if (scale_size != sizeof(*scale_data)) {
1447 				panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/global-scale", __func__,
1448 				    scale_size);
1449 			}
1450 
1451 			global_scale = *scale_data;
1452 			global_scale_set = true;
1453 		}
1454 
1455 		/* Finally, the boot-arg ml-timeout-global-scale takes precedence. */
1456 		if (PE_parse_boot_argn("ml-timeout-global-scale", &boot_arg, sizeof(boot_arg))) {
1457 			global_scale = boot_arg;
1458 			global_scale_set = true;
1459 		}
1460 	}
1461 
1462 	if (global_scale_set) {
1463 		scale *= global_scale;
1464 	}
1465 
1466 	/* Compute the final timeout, and done. */
1467 	if (found && timeout > 0) {
1468 		/* Only apply inherent unit scale if the value came in
1469 		 * externally. */
1470 
1471 		if (spec->unit_scale == MACHINE_TIMEOUT_UNIT_TIMEBASE) {
1472 			uint64_t nanoseconds = timeout / 1000;
1473 			nanoseconds_to_absolutetime(nanoseconds, &timeout);
1474 		} else {
1475 			timeout /= spec->unit_scale;
1476 		}
1477 
1478 		if (timeout == 0) {
1479 			/* Ensure unit scaling did not disable the timeout. */
1480 			timeout = 1;
1481 		}
1482 	}
1483 
1484 	if (os_mul_overflow(timeout, scale, &timeout)) {
1485 		timeout = UINT64_MAX; // clamp
1486 	}
1487 
1488 	os_atomic_store_wide((uint64_t*)spec->ptr, timeout, relaxed);
1489 }
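/*
 * Example: for a timeout declared with the name "report-phy-read-delay" (and
 * an empty suffix), the sources consulted above are, from lowest to highest
 * precedence:
 *
 *	/machine-timeouts/report-phy-read-delay           (device tree)
 *	/chosen/machine-timeouts/report-phy-read-delay    (device tree override)
 *	ml-timeout-report-phy-read-delay=<value>          (boot-arg)
 *
 * The scale factors "<name>-scale" and "global-scale" are resolved the same
 * way and multiplied into the final value.
 */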
1490 
1491 void
1492 machine_timeout_init(const struct machine_timeout_spec *spec)
1493 {
1494 	machine_timeout_init_with_suffix(spec, "");
1495 }
1496 
1497 #if DEVELOPMENT || DEBUG
1498 /*
1499  * Late timeout (re-)initialization, at the end of bsd_init()
1500  */
1501 void
1502 machine_timeout_bsd_init(void)
1503 {
1504 	char const * const __unused mt_suffix = "-b";
1505 #if SCHED_HYGIENE_DEBUG
1506 	machine_timeout_init_with_suffix(MACHINE_TIMEOUT_SPEC_REF(interrupt_masked_timeout), mt_suffix);
1507 	machine_timeout_init_with_suffix(MACHINE_TIMEOUT_SPEC_REF(sched_preemption_disable_threshold_mt), mt_suffix);
1508 
1509 	/*
1510 	 * The io timeouts can inherit from interrupt_masked_timeout.
1511 	 * Re-initialize, as interrupt_masked_timeout may have changed.
1512 	 */
1513 	ml_io_init_timeouts();
1514 
1515 	extern void preemption_disable_reset_max_durations(void);
1516 	/*
1517 	 * Reset the preemption disable stats, so that they are not
1518 	 * polluted by long early boot code.
1519 	 */
1520 	preemption_disable_reset_max_durations();
1521 #endif /* SCHED_HYGIENE_DEBUG */
1522 }
1523 #endif /* DEVELOPMENT || DEBUG */
1524 
1525 #if ML_IO_TIMEOUTS_ENABLED && CONFIG_XNUPOST
1526 #include <tests/xnupost.h>
1527 
1528 extern kern_return_t ml_io_timeout_test(void);
1529 
1530 static inline void
1531 ml_io_timeout_test_get_timeouts(uintptr_t vaddr, uint64_t *read_timeout, uint64_t *write_timeout)
1532 {
1533 	*read_timeout = 0;
1534 	*write_timeout = 0;
1535 
1536 	boolean_t istate = ml_set_interrupts_enabled(FALSE);
1537 	override_io_timeouts(vaddr, 0, read_timeout, write_timeout);
1538 	ml_set_interrupts_enabled(istate);
1539 }
1540 
1541 kern_return_t
1542 ml_io_timeout_test(void)
1543 {
1544 	const size_t SIZE = 16;
1545 	uintptr_t iovaddr_base1 = (uintptr_t)&ml_io_timeout_test;
1546 	uintptr_t iovaddr_base2 = iovaddr_base1 + SIZE;
1547 	uintptr_t vaddr1 = iovaddr_base1 + SIZE / 2;
1548 	uintptr_t vaddr2 = iovaddr_base2 + SIZE / 2;
1549 
1550 	const uint64_t READ_TIMEOUT1_US = 50000, WRITE_TIMEOUT1_US = 50001;
1551 	const uint64_t READ_TIMEOUT2_US = 50002, WRITE_TIMEOUT2_US = 50003;
1552 	uint64_t read_timeout1_abs, write_timeout1_abs;
1553 	uint64_t read_timeout2_abs, write_timeout2_abs;
1554 	nanoseconds_to_absolutetime(NSEC_PER_USEC * READ_TIMEOUT1_US, &read_timeout1_abs);
1555 	nanoseconds_to_absolutetime(NSEC_PER_USEC * WRITE_TIMEOUT1_US, &write_timeout1_abs);
1556 	nanoseconds_to_absolutetime(NSEC_PER_USEC * READ_TIMEOUT2_US, &read_timeout2_abs);
1557 	nanoseconds_to_absolutetime(NSEC_PER_USEC * WRITE_TIMEOUT2_US, &write_timeout2_abs);
1558 
1559 	int err = ml_io_increase_timeouts(iovaddr_base1, 0, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1560 	T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for empty region");
1561 
1562 	err = ml_io_increase_timeouts(iovaddr_base1, 4097, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1563 	T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for region > 4096 bytes");
1564 
1565 	err = ml_io_increase_timeouts(UINTPTR_MAX, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1566 	T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for overflowed region");
1567 
1568 	err = ml_io_increase_timeouts(iovaddr_base1, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1569 	T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for first VA region should succeed");
1570 
1571 	err = ml_io_increase_timeouts(iovaddr_base2, SIZE, READ_TIMEOUT2_US, WRITE_TIMEOUT2_US);
1572 	T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for second VA region should succeed");
1573 
1574 	err = ml_io_increase_timeouts(iovaddr_base1, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1575 	T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for same region twice");
1576 
1577 	err = ml_io_increase_timeouts(vaddr1, (uint32_t)(vaddr2 - vaddr1), READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1578 	T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for overlapping regions");
1579 
1580 	uint64_t read_timeout, write_timeout;
1581 	ml_io_timeout_test_get_timeouts(vaddr1, &read_timeout, &write_timeout);
1582 	T_EXPECT_EQ_ULLONG(read_timeout, read_timeout1_abs, "Read timeout for first region");
1583 	T_EXPECT_EQ_ULLONG(write_timeout, write_timeout1_abs, "Write timeout for first region");
1584 
1585 	ml_io_timeout_test_get_timeouts(vaddr2, &read_timeout, &write_timeout);
1586 	T_EXPECT_EQ_ULLONG(read_timeout, read_timeout2_abs, "Read timeout for second region");
1587 	T_EXPECT_EQ_ULLONG(write_timeout, write_timeout2_abs, "Write timeout for second region");
1588 
1589 	ml_io_timeout_test_get_timeouts(iovaddr_base2 + SIZE, &read_timeout, &write_timeout);
1590 	T_EXPECT_EQ_ULLONG(read_timeout, 0, "Read timeout without override");
1591 	T_EXPECT_EQ_ULLONG(write_timeout, 0, "Write timeout without override");
1592 
1593 	err = ml_io_reset_timeouts(iovaddr_base1 + 1, SIZE - 1);
1594 	T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for subregion");
1595 
1596 	err = ml_io_reset_timeouts(iovaddr_base2 + SIZE, SIZE);
1597 	T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for non-existent region");
1598 
1599 	err = ml_io_reset_timeouts(iovaddr_base1, SIZE);
1600 	T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for first VA region should succeed");
1601 
1602 	ml_io_timeout_test_get_timeouts(vaddr1, &read_timeout, &write_timeout);
1603 	T_EXPECT_EQ_ULLONG(read_timeout, 0, "Read timeout for reset region");
1604 	T_EXPECT_EQ_ULLONG(write_timeout, 0, "Write timeout for reset region");
1605 
1606 	err = ml_io_reset_timeouts(iovaddr_base1, SIZE);
1607 	T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for same region twice");
1608 
1609 	err = ml_io_reset_timeouts(iovaddr_base2, SIZE);
1610 	T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for second VA region should succeed");
1611 
1612 	return KERN_SUCCESS;
1613 }
1614 #endif /* ML_IO_TIMEOUTS_ENABLED && CONFIG_XNUPOST */
1615