xref: /xnu-8796.121.2/osfmk/kern/machine.c (revision c54f35ca767986246321eb901baf8f5ff7923f6a)
1 /*
2  * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	kern/machine.c
60  *	Author:	Avadis Tevanian, Jr.
61  *	Date:	1987
62  *
63  *	Support for machine independent machine abstraction.
64  */
65 
66 #include <string.h>
67 
68 #include <mach/mach_types.h>
69 #include <mach/boolean.h>
70 #include <mach/kern_return.h>
71 #include <mach/machine.h>
72 #include <mach/host_info.h>
73 #include <mach/host_reboot.h>
74 #include <mach/host_priv_server.h>
75 #include <mach/processor_server.h>
76 #include <mach/sdt.h>
77 
78 #include <kern/kern_types.h>
79 #include <kern/cpu_data.h>
80 #include <kern/cpu_quiesce.h>
81 #include <kern/ipc_host.h>
82 #include <kern/host.h>
83 #include <kern/machine.h>
84 #include <kern/misc_protos.h>
85 #include <kern/percpu.h>
86 #include <kern/processor.h>
87 #include <kern/queue.h>
88 #include <kern/sched.h>
89 #include <kern/startup.h>
90 #include <kern/task.h>
91 #include <kern/thread.h>
92 #include <kern/iotrace.h>
93 
94 #include <libkern/OSDebug.h>
95 #if ML_IO_TIMEOUTS_ENABLED
96 #include <libkern/tree.h>
97 #endif
98 
99 #include <pexpert/device_tree.h>
100 
101 #include <machine/commpage.h>
102 #include <machine/machine_routines.h>
103 
104 #if HIBERNATION
105 #include <IOKit/IOHibernatePrivate.h>
106 #endif
107 #include <IOKit/IOPlatformExpert.h>
108 
109 #if CONFIG_DTRACE
110 extern void (*dtrace_cpu_state_changed_hook)(int, boolean_t);
111 #endif
112 
113 #if defined(__arm64__)
114 extern void wait_while_mp_kdp_trap(bool check_SIGPdebug);
115 #endif
116 
117 #if defined(__x86_64__)
118 #include <i386/panic_notify.h>
119 #endif
120 
121 /*
122  *	Exported variables:
123  */
124 
125 struct machine_info     machine_info;
126 
127 /* Forwards */
128 static void
129 processor_doshutdown(processor_t processor);
130 
131 static void
132 processor_offline(void * parameter, __unused wait_result_t result);
133 
134 static void
135 processor_offline_intstack(processor_t processor) __dead2;
136 
/*
 * Account for a processor becoming available: bump the machine-layer
 * per-cluster counts and the scheduler's global availability counters,
 * then refresh the commpage so userspace sees the new CPU count.
 *
 * Caller serializes against the down path (see processor_up()).
 */
static void
processor_up_update_counts(processor_t processor)
{
	ml_cpu_up_update_counts(processor->cpu_id);

	os_atomic_inc(&processor_avail_count, relaxed);
	if (processor->is_recommended) {
		os_atomic_inc(&processor_avail_count_user, relaxed);
	}
	/* Primary (boot-cluster-equivalent) CPUs are tracked separately. */
	if (processor->processor_primary == processor) {
		os_atomic_inc(&primary_processor_avail_count, relaxed);
		if (processor->is_recommended) {
			os_atomic_inc(&primary_processor_avail_count_user, relaxed);
		}
	}
	commpage_update_active_cpus();
}
154 
/*
 *	processor_up:
 *
 *	Flag processor as up and running, and available
 *	for scheduling.
 */
void
processor_up(
	processor_t                     processor)
{
	processor_set_t         pset;
	spl_t                           s;

	s = splsched();
	init_ast_check(processor);

#if defined(__arm64__)
	/*
	 * A processor coming online won't have received a SIGPdebug signal
	 * to cause it to spin while a stackshot or panic is taking place,
	 * so spin here on mp_kdp_trap.
	 *
	 * However, since cpu_signal() is not yet enabled for this processor,
	 * there is a race if we have just passed this when a cpu_signal()
	 * is attempted.  The sender will assume the cpu is offline, so it will
	 * not end up spinning anywhere.  See processor_offline() for the fix
	 * for this race.
	 */
	wait_while_mp_kdp_trap(false);
#endif

	pset = processor->processor_set;
	/* Lock order: sched_available_cores_lock, then the pset lock. */
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
	pset_lock(pset);

	++pset->online_processor_count;
	/* start_state_lock serializes with processor_wait_for_start() waiters. */
	simple_lock(&processor->start_state_lock, LCK_GRP_NULL);
	pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
	simple_unlock(&processor->start_state_lock);
	bool temporary = processor->shutdown_temporary;
	if (temporary) {
		/*
		 * A temporary shutdown never decremented the availability
		 * counts (see processor_doshutdown()), so don't re-increment.
		 */
		processor->shutdown_temporary = false;
	} else {
		processor_up_update_counts(processor);
	}
	if (processor->is_recommended) {
		SCHED(pset_made_schedulable)(processor, pset, false);
	}
	pset_unlock(pset);
	ml_cpu_up();
	/* sched_available_cores_lock is still held, as the name requires. */
	sched_mark_processor_online_locked(processor, processor->last_startup_reason);
	simple_unlock(&sched_available_cores_lock);
	splx(s);

	/* Wake anyone blocked in processor_wait_for_start(). */
	thread_wakeup((event_t)&processor->state);

#if CONFIG_DTRACE
	if (dtrace_cpu_state_changed_hook) {
		(*dtrace_cpu_state_changed_hook)(processor->cpu_id, TRUE);
	}
#endif
}
217 #include <atm/atm_internal.h>
218 
219 kern_return_t
host_reboot(host_priv_t host_priv,int options)220 host_reboot(
221 	host_priv_t             host_priv,
222 	int                             options)
223 {
224 	if (host_priv == HOST_PRIV_NULL) {
225 		return KERN_INVALID_HOST;
226 	}
227 
228 #if DEVELOPMENT || DEBUG
229 	if (options & HOST_REBOOT_DEBUGGER) {
230 		Debugger("Debugger");
231 		return KERN_SUCCESS;
232 	}
233 #endif
234 
235 	if (options & HOST_REBOOT_UPSDELAY) {
236 		// UPS power cutoff path
237 		PEHaltRestart( kPEUPSDelayHaltCPU );
238 	} else {
239 		halt_all_cpus(!(options & HOST_REBOOT_HALT));
240 	}
241 
242 	return KERN_SUCCESS;
243 }
244 
/*
 * Reassigning a processor to a different processor set is not
 * supported; this MIG entry point always fails.
 */
kern_return_t
processor_assign(
	__unused processor_t            processor,
	__unused processor_set_t        new_pset,
	__unused boolean_t              wait)
{
	return KERN_FAILURE;
}
253 
/*
 * Account for a processor going away: decrement the machine-layer
 * per-cluster counts and the scheduler's global availability counters,
 * then refresh the commpage.  Exact mirror of processor_up_update_counts().
 */
static void
processor_down_update_counts(processor_t processor)
{
	ml_cpu_down_update_counts(processor->cpu_id);

	os_atomic_dec(&processor_avail_count, relaxed);
	if (processor->is_recommended) {
		os_atomic_dec(&processor_avail_count_user, relaxed);
	}
	if (processor->processor_primary == processor) {
		os_atomic_dec(&primary_processor_avail_count, relaxed);
		if (processor->is_recommended) {
			os_atomic_dec(&primary_processor_avail_count_user, relaxed);
		}
	}
	commpage_update_active_cpus();
}
271 
272 extern lck_mtx_t processor_updown_lock;
273 
/*
 * Take a processor offline.
 *
 * reason records why the shutdown happened; flags may include
 * SHUTDOWN_TEMPORARY, which preserves the availability counts so a later
 * processor_up() doesn't double-increment them.
 *
 * Returns KERN_SUCCESS if the processor is (or ends up) offline in a
 * compatible mode, KERN_NOT_SUPPORTED if the platform forbids this CPU
 * from exiting, and KERN_FAILURE otherwise.
 *
 * Serialized by processor_updown_lock against concurrent up/down requests.
 */
kern_return_t
processor_shutdown(
	processor_t                     processor,
	processor_reason_t              reason,
	uint32_t                        flags)
{
	if (!ml_cpu_can_exit(processor->cpu_id, reason)) {
		/*
		 * Failure if disallowed by arch code.
		 */
		return KERN_NOT_SUPPORTED;
	}

	lck_mtx_lock(&processor_updown_lock);

	kern_return_t mark_ret = sched_mark_processor_offline(processor, reason);
	if (mark_ret != KERN_SUCCESS) {
		/* Must fail or we deadlock */
		lck_mtx_unlock(&processor_updown_lock);
		return KERN_FAILURE;
	}

	ml_cpu_begin_state_transition(processor->cpu_id);
	spl_t s = splsched();
	processor_set_t pset = processor->processor_set;

	pset_lock(pset);
	if (processor->state == PROCESSOR_OFF_LINE) {
		/*
		 * Success if already shutdown.
		 */
		if (processor->shutdown_temporary && !(flags & SHUTDOWN_TEMPORARY)) {
			/* Convert a temporary shutdown into a permanent shutdown */
			processor->shutdown_temporary = false;
			processor_down_update_counts(processor);
		}
		pset_unlock(pset);
		splx(s);
		ml_cpu_end_state_transition(processor->cpu_id);

		lck_mtx_unlock(&processor_updown_lock);
		return KERN_SUCCESS;
	}

	if (processor->shutdown_locked && (reason != REASON_SYSTEM)) {
		/*
		 * Failure if processor is locked against shutdown.
		 */
		pset_unlock(pset);
		splx(s);

		lck_mtx_unlock(&processor_updown_lock);
		return KERN_FAILURE;
	}

	if (processor->state == PROCESSOR_START) {
		/* Let an in-flight startup finish before tearing it down. */
		pset_unlock(pset);
		splx(s);

		processor_wait_for_start(processor);

		s = splsched();
		pset_lock(pset);
	}

	/*
	 * If the processor is dispatching, let it finish.
	 */
	while (processor->state == PROCESSOR_DISPATCHING) {
		pset_unlock(pset);
		splx(s);
		delay(1);
		s = splsched();
		pset_lock(pset);
	}

	/*
	 * Success if already being shutdown with matching SHUTDOWN_TEMPORARY flag.
	 */
	if ((processor->state == PROCESSOR_SHUTDOWN) || (processor->state == PROCESSOR_PENDING_OFFLINE)) {
		bool success = (flags & SHUTDOWN_TEMPORARY) ? processor->shutdown_temporary : !processor->shutdown_temporary;

		pset_unlock(pset);
		splx(s);
		ml_cpu_end_state_transition(processor->cpu_id);

		lck_mtx_unlock(&processor_updown_lock);
		return success ? KERN_SUCCESS : KERN_FAILURE;
	}

	ml_broadcast_cpu_event(CPU_EXIT_REQUESTED, processor->cpu_id);
	pset_update_processor_state(pset, processor, PROCESSOR_SHUTDOWN);
	processor->last_shutdown_reason = reason;
	if (flags & SHUTDOWN_TEMPORARY) {
		processor->shutdown_temporary = true;
	}
	pset_unlock(pset);

	/* Binds to the target CPU and runs the shutdown there. */
	processor_doshutdown(processor);
	splx(s);

	/* Wait for the target CPU to actually leave the kernel. */
	cpu_exit_wait(processor->cpu_id);

	if (processor != master_processor) {
		s = splsched();
		pset_lock(pset);
		pset_update_processor_state(pset, processor, PROCESSOR_OFF_LINE);
		pset_unlock(pset);
		splx(s);
	}

	ml_cpu_end_state_transition(processor->cpu_id);
	ml_broadcast_cpu_event(CPU_EXITED, processor->cpu_id);
	ml_cpu_power_disable(processor->cpu_id);

	lck_mtx_unlock(&processor_updown_lock);
	return KERN_SUCCESS;
}
392 
/*
 * Perform the shutdown on the target processor itself: bind the calling
 * thread there, drain its run queues, then hand off to the idle thread
 * which completes the power-down (see processor_offline()).
 *
 * Called with interrupts disabled.
 */
static void
processor_doshutdown(
	processor_t processor)
{
	thread_t self = current_thread();

	/*
	 *	Get onto the processor to shutdown
	 */
	processor_t prev = thread_bind(processor);
	thread_block(THREAD_CONTINUE_NULL);

	/* interrupts still disabled */
	assert(ml_get_interrupts_enabled() == FALSE);

	assert(processor == current_processor());
	assert(processor->state == PROCESSOR_SHUTDOWN);

#if CONFIG_DTRACE
	if (dtrace_cpu_state_changed_hook) {
		(*dtrace_cpu_state_changed_hook)(processor->cpu_id, FALSE);
	}
#endif

#if defined(__arm64__)
	/*
	 * Catch a processor going offline
	 * while a panic or stackshot is in progress, as it won't
	 * receive a SIGPdebug now that interrupts are disabled.
	 */
	wait_while_mp_kdp_trap(false);
#endif

	ml_cpu_down();

#if HIBERNATION
	/*
	 * If this leaves fewer than two available CPUs, cycle the
	 * hibernation VM lock to avoid racing an in-progress hibernate.
	 */
	if (processor_avail_count < 2) {
		hibernate_vm_lock();
		hibernate_vm_unlock();
	}
#endif

	processor_set_t pset = processor->processor_set;

	pset_lock(pset);
	pset_update_processor_state(pset, processor, PROCESSOR_PENDING_OFFLINE);
	--pset->online_processor_count;
	if (!processor->shutdown_temporary) {
		/* Temporary shutdowns keep the counts; see processor_up(). */
		processor_down_update_counts(processor);
	}
	SCHED(processor_queue_shutdown)(processor);
	/* pset lock dropped */
	SCHED(rt_queue_shutdown)(processor);

	/* Restore the caller's previous binding. */
	thread_bind(prev);

	/* interrupts still disabled */

	/*
	 * Continue processor shutdown on the processor's idle thread.
	 * The handoff won't fail because the idle thread has a reserved stack.
	 * Switching to the idle thread leaves interrupts disabled,
	 * so we can't accidentally take an interrupt after the context switch.
	 */
	thread_t shutdown_thread = processor->idle_thread;
	shutdown_thread->continuation = processor_offline;
	shutdown_thread->parameter = processor;

	thread_run(self, NULL, NULL, shutdown_thread);
}
466 
/*
 * Called in the context of the idle thread to shut down the processor
 *
 * A shut-down processor looks like it's 'running' the idle thread parked
 * in this routine, but it's actually been powered off and has no hardware state.
 */
static void
processor_offline(
	void * parameter,
	__unused wait_result_t result)
{
	processor_t processor = (processor_t) parameter;
	thread_t self = current_thread();
	__assert_only thread_t old_thread = THREAD_NULL;

	assert(processor == current_processor());
	assert(self->state & TH_IDLE);
	assert(processor->idle_thread == self);
	assert(ml_get_interrupts_enabled() == FALSE);
	assert(self->continuation == NULL);
	assert(processor->processor_offlined == false);
	assert(processor->running_timers_active == false);

	/* Sample once so enable matches the earlier disable even if the global flips. */
	bool enforce_quiesce_safety = gEnforcePlatformActionSafety;

	/*
	 * Scheduling is now disabled for this processor.
	 * Ensure that primitives that need scheduling (like mutexes) know this.
	 */
	if (enforce_quiesce_safety) {
		disable_preemption_without_measurements();
	}

	/* convince slave_main to come back here */
	processor->processor_offlined = true;

	/*
	 * Switch to the interrupt stack and shut down the processor.
	 *
	 * When the processor comes back, it will eventually call load_context which
	 * restores the context saved by machine_processor_shutdown, returning here.
	 */
	old_thread = machine_processor_shutdown(self, processor_offline_intstack, processor);

	/* old_thread should be NULL because we got here through Load_context */
	assert(old_thread == THREAD_NULL);

	assert(processor == current_processor());
	assert(processor->idle_thread == current_thread());

	assert(ml_get_interrupts_enabled() == FALSE);
	assert(self->continuation == NULL);

	/* Extract the machine_param value stashed by slave_main */
	void * machine_param = self->parameter;
	self->parameter = NULL;

	/* Re-initialize the processor */
	slave_machine_init(machine_param);

	assert(processor->processor_offlined == true);
	processor->processor_offlined = false;

	if (enforce_quiesce_safety) {
		enable_preemption();
	}

#if defined(__arm64__)
	/*
	 * See the comments for DebuggerLock in processor_up().
	 *
	 * SIGPdisabled is cleared (to enable cpu_signal() to succeed with this processor)
	 * the first time we take an IPI.  This is triggered by slave_machine_init(), above,
	 * which calls cpu_machine_init()->PE_cpu_machine_init()->PE_cpu_signal() which sends
	 * a self-IPI to ensure that happens when we enable interrupts.  So enable interrupts
	 * here so that cpu_signal() can succeed before we spin on mp_kdp_trap.
	 */
	ml_set_interrupts_enabled(TRUE);

	ml_set_interrupts_enabled(FALSE);

	wait_while_mp_kdp_trap(true);

	/*
	 * At this point,
	 * if a stackshot or panic is in progress, we either spin on mp_kdp_trap
	 * or we successfully received a SIGPdebug signal which will cause us to
	 * break out of the spin on mp_kdp_trap and instead
	 * spin next time interrupts are enabled in idle_thread().
	 */
#endif

	/*
	 * Now that the processor is back, invoke the idle thread to find out what to do next.
	 * idle_thread will enable interrupts.
	 */
	thread_block(idle_thread);
	/*NOTREACHED*/
}
566 
/*
 * Complete the shutdown and place the processor offline.
 *
 * Called at splsched in the shutdown context
 * (i.e. on the idle thread, on the interrupt stack)
 *
 * The onlining half of this is done in load_context().
 */
static void
processor_offline_intstack(
	processor_t processor)
{
	assert(processor == current_processor());
	assert(processor->active_thread == current_thread());

	/* Final CPU-time accounting snapshot before the CPU disappears. */
	struct recount_snap snap = { 0 };
	recount_snapshot(&snap);
	recount_processor_idle(&processor->pr_recount, &snap);

	/* Leave the quiescent-state counter so others don't wait on this CPU. */
	cpu_quiescent_counter_leave(processor->last_dispatch);

	PMAP_DEACTIVATE_KERNEL(processor->cpu_id);

	/* Powers the CPU down; control never returns here. */
	cpu_sleep();
	panic("zombie processor");
	/*NOTREACHED*/
}
594 
595 kern_return_t
host_get_boot_info(host_priv_t host_priv,kernel_boot_info_t boot_info)596 host_get_boot_info(
597 	host_priv_t         host_priv,
598 	kernel_boot_info_t  boot_info)
599 {
600 	const char *src = "";
601 	if (host_priv == HOST_PRIV_NULL) {
602 		return KERN_INVALID_HOST;
603 	}
604 
605 	/*
606 	 * Copy first operator string terminated by '\0' followed by
607 	 *	standardized strings generated from boot string.
608 	 */
609 	src = machine_boot_info(boot_info, KERNEL_BOOT_INFO_MAX);
610 	if (src != boot_info) {
611 		(void) strncpy(boot_info, src, KERNEL_BOOT_INFO_MAX);
612 	}
613 
614 	return KERN_SUCCESS;
615 }
616 
617 // These are configured through sysctls.
618 #if DEVELOPMENT || DEBUG
619 uint32_t phy_read_panic = 1;
620 uint32_t phy_write_panic = 1;
621 uint64_t simulate_stretched_io = 0;
622 #else
623 uint32_t phy_read_panic = 0;
624 uint32_t phy_write_panic = 0;
625 #endif
626 
627 #if !defined(__x86_64__)
628 
629 #if DEVELOPMENT || DEBUG
630 static const uint64_t TIMEBASE_TICKS_PER_USEC = 24000000ULL / USEC_PER_SEC;
631 static const uint64_t DEFAULT_TRACE_PHY_TIMEOUT = 100 * TIMEBASE_TICKS_PER_USEC;
632 #else
633 static const uint64_t DEFAULT_TRACE_PHY_TIMEOUT = 0;
634 #endif
635 
636 // The MACHINE_TIMEOUT facility only exists on ARM.
637 MACHINE_TIMEOUT_DEV_WRITEABLE(report_phy_read_delay_to, "report-phy-read-delay", 0, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
638 MACHINE_TIMEOUT_DEV_WRITEABLE(report_phy_write_delay_to, "report-phy-write-delay", 0, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
639 MACHINE_TIMEOUT_DEV_WRITEABLE(trace_phy_read_delay_to, "trace-phy-read-delay", DEFAULT_TRACE_PHY_TIMEOUT, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
640 MACHINE_TIMEOUT_DEV_WRITEABLE(trace_phy_write_delay_to, "trace-phy-write-delay", DEFAULT_TRACE_PHY_TIMEOUT, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
641 
642 #if SCHED_HYGIENE_DEBUG
/*
 * Note: The interrupt-masked timeout goes through two initializations - one
 * early in boot and one later. Thus this function is also called twice and
 * can't be marked '__startup_func'.
 */
static void
ml_io_init_timeouts(void)
{
	/*
	 * The timeouts may be completely disabled via an override. Check that
	 * last and set the timeouts to zero (disabling) if that's the case.
	 */
	if (kern_feature_override(KF_IO_TIMEOUT_OVRD)) {
		/* Zero means "never report" for these machine timeouts. */
		os_atomic_store(&report_phy_write_delay_to, 0, relaxed);
		os_atomic_store(&report_phy_read_delay_to, 0, relaxed);
	}
}
660 
661 /*
662  * It's important that this happens after machine timeouts have initialized so
663  * the correct timeouts can be inherited.
664  */
665 STARTUP(TIMEOUTS, STARTUP_RANK_SECOND, ml_io_init_timeouts);
666 #endif /* SCHED_HYGIENE_DEBUG */
667 
668 unsigned int report_phy_read_osbt;
669 unsigned int report_phy_write_osbt;
670 
671 extern pmap_paddr_t kvtophys(vm_offset_t va);
672 #endif
673 
674 #if ML_IO_TIMEOUTS_ENABLED
675 
676 static LCK_GRP_DECLARE(io_timeout_override_lock_grp, "io_timeout_override");
677 static LCK_SPIN_DECLARE(io_timeout_override_lock, &io_timeout_override_lock_grp);
678 
679 struct io_timeout_override_entry {
680 	RB_ENTRY(io_timeout_override_entry) tree;
681 
682 	uintptr_t iovaddr_base;
683 	unsigned int size;
684 	uint32_t read_timeout;
685 	uint32_t write_timeout;
686 };
687 
688 static inline int
io_timeout_override_cmp(const struct io_timeout_override_entry * a,const struct io_timeout_override_entry * b)689 io_timeout_override_cmp(const struct io_timeout_override_entry *a, const struct io_timeout_override_entry *b)
690 {
691 	if (a->iovaddr_base < b->iovaddr_base) {
692 		return -1;
693 	} else if (a->iovaddr_base > b->iovaddr_base) {
694 		return 1;
695 	} else {
696 		return 0;
697 	}
698 }
699 
700 static RB_HEAD(io_timeout_override, io_timeout_override_entry) io_timeout_override_root;
701 RB_PROTOTYPE_PREV(io_timeout_override, io_timeout_override_entry, tree, io_timeout_override_cmp);
702 RB_GENERATE_PREV(io_timeout_override, io_timeout_override_entry, tree, io_timeout_override_cmp);
703 
704 #endif /* ML_IO_TIMEOUTS_ENABLED */
705 
/*
 * Register a per-range override that raises the MMIO read/write timeout
 * for [iovaddr_base, iovaddr_base + size).  size must be 1..4096 and the
 * range must not wrap or overlap an existing override.
 *
 * Timeouts are given in microseconds and converted to absolute time; the
 * converted value must fit in 32 bits.
 *
 * Returns KERN_SUCCESS or KERN_INVALID_ARGUMENT.  (Declared int, but the
 * values are kern_return_t codes.)
 */
int
ml_io_increase_timeouts(uintptr_t iovaddr_base, unsigned int size, uint32_t read_timeout_us, uint32_t write_timeout_us)
{
#if ML_IO_TIMEOUTS_ENABLED
	const size_t MAX_SIZE = 4096;
	const uint64_t MAX_TIMEOUT_ABS = UINT32_MAX;

	assert(preemption_enabled());

	int ret = KERN_SUCCESS;

	if (size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	/* Reject ranges that are too large or that wrap the address space. */
	uintptr_t iovaddr_end;
	if (size > MAX_SIZE || os_add_overflow(iovaddr_base, size - 1, &iovaddr_end)) {
		return KERN_INVALID_ARGUMENT;
	}

	uint64_t read_timeout_abs, write_timeout_abs;
	nanoseconds_to_absolutetime(NSEC_PER_USEC * read_timeout_us, &read_timeout_abs);
	nanoseconds_to_absolutetime(NSEC_PER_USEC * write_timeout_us, &write_timeout_abs);
	if (read_timeout_abs > MAX_TIMEOUT_ABS || write_timeout_abs > MAX_TIMEOUT_ABS) {
		return KERN_INVALID_ARGUMENT;
	}

	/* Allocate outside the spinlock; Z_NOFAIL means no NULL check needed. */
	struct io_timeout_override_entry *node = kalloc_type(struct io_timeout_override_entry, Z_WAITOK | Z_ZERO | Z_NOFAIL);
	node->iovaddr_base = iovaddr_base;
	node->size = size;
	node->read_timeout = (uint32_t)read_timeout_abs;
	node->write_timeout = (uint32_t)write_timeout_abs;

	/*
	 * Interrupt handlers are allowed to call ml_io_{read,write}*, so
	 * interrupts must be disabled any time io_timeout_override_lock is
	 * held.  Otherwise the CPU could take an interrupt while holding the
	 * lock, invoke an ISR that calls ml_io_{read,write}*, and deadlock
	 * trying to acquire the lock again.
	 */
	boolean_t istate = ml_set_interrupts_enabled(FALSE);
	lck_spin_lock(&io_timeout_override_lock);
	/* RB_INSERT returns non-NULL on a duplicate base address. */
	if (RB_INSERT(io_timeout_override, &io_timeout_override_root, node)) {
		ret = KERN_INVALID_ARGUMENT;
		goto out;
	}

	/* Check that this didn't create any new overlaps */
	struct io_timeout_override_entry *prev = RB_PREV(io_timeout_override, &io_timeout_override_root, node);
	if (prev && (prev->iovaddr_base + prev->size) > node->iovaddr_base) {
		RB_REMOVE(io_timeout_override, &io_timeout_override_root, node);
		ret = KERN_INVALID_ARGUMENT;
		goto out;
	}
	struct io_timeout_override_entry *next = RB_NEXT(io_timeout_override, &io_timeout_override_root, node);
	if (next && (node->iovaddr_base + node->size) > next->iovaddr_base) {
		RB_REMOVE(io_timeout_override, &io_timeout_override_root, node);
		ret = KERN_INVALID_ARGUMENT;
		goto out;
	}

out:
	lck_spin_unlock(&io_timeout_override_lock);
	ml_set_interrupts_enabled(istate);
	if (ret != KERN_SUCCESS) {
		/* Node never made it into (or was removed from) the tree. */
		kfree_type(struct io_timeout_override_entry, node);
	}
	return ret;
#else /* !ML_IO_TIMEOUTS_ENABLED */
#pragma unused(iovaddr_base, size, read_timeout_us, write_timeout_us)
	return KERN_SUCCESS;
#endif
}
779 
780 int
ml_io_reset_timeouts(uintptr_t iovaddr_base,unsigned int size)781 ml_io_reset_timeouts(uintptr_t iovaddr_base, unsigned int size)
782 {
783 #if ML_IO_TIMEOUTS_ENABLED
784 	assert(preemption_enabled());
785 
786 	struct io_timeout_override_entry key = { .iovaddr_base = iovaddr_base };
787 
788 	boolean_t istate = ml_set_interrupts_enabled(FALSE);
789 	lck_spin_lock(&io_timeout_override_lock);
790 	struct io_timeout_override_entry *node = RB_FIND(io_timeout_override, &io_timeout_override_root, &key);
791 	if (node) {
792 		if (node->size == size) {
793 			RB_REMOVE(io_timeout_override, &io_timeout_override_root, node);
794 		} else {
795 			node = NULL;
796 		}
797 	}
798 	lck_spin_unlock(&io_timeout_override_lock);
799 	ml_set_interrupts_enabled(istate);
800 
801 	if (!node) {
802 		return KERN_NOT_FOUND;
803 	}
804 
805 	kfree_type(struct io_timeout_override_entry, node);
806 #else /* !ML_IO_TIMEOUTS_ENABLED */
807 #pragma unused(iovaddr_base, size)
808 #endif
809 	return KERN_SUCCESS;
810 }
811 
812 #if ML_IO_TIMEOUTS_ENABLED
813 static void
override_io_timeouts(uintptr_t vaddr,uint64_t * read_timeout,uint64_t * write_timeout)814 override_io_timeouts(uintptr_t vaddr, uint64_t *read_timeout, uint64_t *write_timeout)
815 {
816 	assert(!ml_get_interrupts_enabled());
817 
818 	struct io_timeout_override_entry *node = RB_ROOT(&io_timeout_override_root);
819 
820 	lck_spin_lock(&io_timeout_override_lock);
821 	/* RB_FIND() doesn't support custom cmp functions, so we have to open-code our own */
822 	while (node) {
823 		if (node->iovaddr_base <= vaddr && vaddr < node->iovaddr_base + node->size) {
824 			if (read_timeout) {
825 				*read_timeout = node->read_timeout;
826 			}
827 			if (write_timeout) {
828 				*write_timeout = node->write_timeout;
829 			}
830 			break;
831 		} else if (vaddr < node->iovaddr_base) {
832 			node = RB_LEFT(node, tree);
833 		} else {
834 			node = RB_RIGHT(node, tree);
835 		}
836 	}
837 	lck_spin_unlock(&io_timeout_override_lock);
838 }
839 #endif /* ML_IO_TIMEOUTS_ENABLED */
840 
/*
 * Read `size` bytes (1, 2, 4, or 8) from the memory-mapped IO address
 * vaddr and return the value zero-extended to 64 bits.
 *
 * When IO timeouts are enabled, the access is timed with interrupts
 * disabled; reads that exceed the report threshold panic (if
 * phy_read_panic is set), log a backtrace, and/or emit dtrace/kdebug
 * events.  Panics on an unsupported size.
 */
unsigned long long
ml_io_read(uintptr_t vaddr, int size)
{
	unsigned long long result = 0;
	unsigned char s1;
	unsigned short s2;

#ifdef ML_IO_VERIFY_UNCACHEABLE
	uintptr_t const paddr = pmap_verify_noncacheable(vaddr);
#elif defined(ML_IO_TIMEOUTS_ENABLED)
	uintptr_t const paddr = kvtophys(vaddr);
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	uint64_t sabs, eabs;
	boolean_t istate, timeread = FALSE;
	uint64_t report_read_delay;
#if __x86_64__
	report_read_delay = report_phy_read_delay;
#else
	report_read_delay = os_atomic_load(&report_phy_read_delay_to, relaxed);
	uint64_t const trace_phy_read_delay = os_atomic_load(&trace_phy_read_delay_to, relaxed);
#endif /* __x86_64__ */

	/* A zero threshold disables timing entirely. */
	if (__improbable(report_read_delay != 0)) {
		istate = ml_set_interrupts_enabled(FALSE);
		sabs = mach_absolute_time();
		timeread = TRUE;
	}

#ifdef ML_IO_SIMULATE_STRETCHED_ENABLED
	/* Test hook: pretend the access started earlier to force a "slow IO". */
	if (__improbable(timeread && simulate_stretched_io)) {
		sabs -= simulate_stretched_io;
	}
#endif /* ML_IO_SIMULATE_STRETCHED_ENABLED */
#endif /* ML_IO_TIMEOUTS_ENABLED */

#if DEVELOPMENT || DEBUG
	/* Fences keep the timebase reads ordered around the MMIO access. */
	boolean_t use_fences = !kern_feature_override(KF_IO_TIMEOUT_OVRD);
	if (use_fences) {
		ml_timebase_to_memory_fence();
	}
#endif

	switch (size) {
	case 1:
		s1 = *(volatile unsigned char *)vaddr;
		result = s1;
		break;
	case 2:
		s2 = *(volatile unsigned short *)vaddr;
		result = s2;
		break;
	case 4:
		result = *(volatile unsigned int *)vaddr;
		break;
	case 8:
		result = *(volatile unsigned long long *)vaddr;
		break;
	default:
		panic("Invalid size %d for ml_io_read(%p)", size, (void *)vaddr);
		break;
	}

#if DEVELOPMENT || DEBUG
	if (use_fences) {
		ml_memory_to_timebase_fence();
	}
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	if (__improbable(timeread == TRUE)) {
		eabs = mach_absolute_time();

		/* Prevent the processor from calling iotrace during its
		 * initialization procedure. */
		if (current_processor()->state == PROCESSOR_RUNNING) {
			iotrace(IOTRACE_IO_READ, vaddr, paddr, size, result, sabs, eabs - sabs);
		}

		if (__improbable((eabs - sabs) > report_read_delay)) {
			/* Over threshold: see if a per-range override raises the bar. */
			uint64_t override = 0;
			override_io_timeouts(vaddr, &override, NULL);

			if (override != 0) {
#if SCHED_HYGIENE_DEBUG
				/*
				 * The IO timeout was overridden. As interrupts are disabled in
				 * order to accurately measure IO time this can cause the
				 * interrupt masked timeout threshold to be exceeded.  If the
				 * interrupt masked debug mode is set to panic, abandon the
				 * measurement. If in trace mode leave it as-is for
				 * observability.
				 */
				if (interrupt_masked_debug_mode == SCHED_HYGIENE_MODE_PANIC) {
					ml_spin_debug_clear(current_thread());
				}
#endif
				report_read_delay = override;
			}
		}

		/* Re-check against the (possibly overridden) threshold. */
		if (__improbable((eabs - sabs) > report_read_delay)) {
			if (phy_read_panic && (machine_timeout_suspended() == FALSE)) {
#if defined(__x86_64__)
				panic_notify();
#endif /* defined(__x86_64__) */
				uint64_t nsec = 0;
				absolutetime_to_nanoseconds(eabs - sabs, &nsec);
				panic("Read from IO vaddr 0x%lx paddr 0x%lx took %llu ns, "
				    "result: 0x%llx (start: %llu, end: %llu), ceiling: %llu",
				    vaddr, paddr, nsec, result, sabs, eabs,
				    report_read_delay);
			}

			(void)ml_set_interrupts_enabled(istate);

			if (report_phy_read_osbt) {
				uint64_t nsec = 0;
				absolutetime_to_nanoseconds(eabs - sabs, &nsec);
				OSReportWithBacktrace("ml_io_read(v=%p, p=%p) size %d result 0x%llx "
				    "took %lluus",
				    (void *)vaddr, (void *)paddr, size, result,
				    nsec / NSEC_PER_USEC);
			}
			DTRACE_PHYSLAT5(physioread, uint64_t, (eabs - sabs),
			    uint64_t, vaddr, uint32_t, size, uint64_t, paddr, uint64_t, result);
		} else if (__improbable(trace_phy_read_delay > 0 && (eabs - sabs) > trace_phy_read_delay)) {
			/* Slow but under the report threshold: kdebug-trace only. */
			KDBG(MACHDBG_CODE(DBG_MACH_IO, DBC_MACH_IO_MMIO_READ),
			    (eabs - sabs), VM_KERNEL_UNSLIDE_OR_PERM(vaddr), paddr, result);

			(void)ml_set_interrupts_enabled(istate);
		} else {
			(void)ml_set_interrupts_enabled(istate);
		}
	}
#endif /*  ML_IO_TIMEOUTS_ENABLED */
	return result;
}
980 
/* Read a single byte from the MMIO register mapped at vaddr. */
unsigned int
ml_io_read8(uintptr_t vaddr)
{
	uint64_t const value = ml_io_read(vaddr, 1);
	return (unsigned int)value;
}
986 
/* Read a 16-bit value from the MMIO register mapped at vaddr. */
unsigned int
ml_io_read16(uintptr_t vaddr)
{
	uint64_t const value = ml_io_read(vaddr, 2);
	return (unsigned int)value;
}
992 
/* Read a 32-bit value from the MMIO register mapped at vaddr. */
unsigned int
ml_io_read32(uintptr_t vaddr)
{
	uint64_t const value = ml_io_read(vaddr, 4);
	return (unsigned int)value;
}
998 
/* Read a 64-bit value from the MMIO register mapped at vaddr. */
unsigned long long
ml_io_read64(uintptr_t vaddr)
{
	uint64_t const value = ml_io_read(vaddr, 8);
	return value;
}
1004 
1005 /* ml_io_write* */
1006 
/*
 * Write val (width `size` bytes: 1, 2, 4 or 8) to the memory-mapped IO
 * register at vaddr.  When IO timeouts are enabled, the access is timed
 * with interrupts disabled; slow writes are traced (iotrace/kdebug/dtrace)
 * and may panic or log a backtrace once the report threshold is exceeded.
 * Panics on any other `size`.
 */
void
ml_io_write(uintptr_t vaddr, uint64_t val, int size)
{
#ifdef ML_IO_VERIFY_UNCACHEABLE
	/* Translates to physical and asserts the mapping is non-cacheable. */
	uintptr_t const paddr = pmap_verify_noncacheable(vaddr);
#elif defined(ML_IO_TIMEOUTS_ENABLED)
	uintptr_t const paddr = kvtophys(vaddr);
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	uint64_t sabs, eabs;
	boolean_t istate, timewrite = FALSE;
	uint64_t report_write_delay;
#if __x86_64__
	report_write_delay = report_phy_write_delay;
#else
	report_write_delay = os_atomic_load(&report_phy_write_delay_to, relaxed);
	uint64_t trace_phy_write_delay = os_atomic_load(&trace_phy_write_delay_to, relaxed);
#endif /* !defined(__x86_64__) */
	/* Only pay for timing (and interrupt masking) when a threshold is set. */
	if (__improbable(report_write_delay != 0)) {
		istate = ml_set_interrupts_enabled(FALSE);
		sabs = mach_absolute_time();
		timewrite = TRUE;
	}

#ifdef ML_IO_SIMULATE_STRETCHED_ENABLED
	/* Test hook: pretend the write started earlier to exercise timeout paths. */
	if (__improbable(timewrite && simulate_stretched_io)) {
		sabs -= simulate_stretched_io;
	}
#endif /* ML_IO_SIMULATE_STRETCHED_ENABLED */
#endif /* ML_IO_TIMEOUTS_ENABLED */

#if DEVELOPMENT || DEBUG
	/* Fence timebase reads against the device access so the measurement
	 * brackets the write itself. */
	boolean_t use_fences = !kern_feature_override(KF_IO_TIMEOUT_OVRD);
	if (use_fences) {
		ml_timebase_to_memory_fence();
	}
#endif

	switch (size) {
	case 1:
		*(volatile uint8_t *)vaddr = (uint8_t)val;
		break;
	case 2:
		*(volatile uint16_t *)vaddr = (uint16_t)val;
		break;
	case 4:
		*(volatile uint32_t *)vaddr = (uint32_t)val;
		break;
	case 8:
		*(volatile uint64_t *)vaddr = (uint64_t)val;
		break;
	default:
		panic("Invalid size %d for ml_io_write(%p, 0x%llx)", size, (void *)vaddr, val);
		break;
	}

#if DEVELOPMENT || DEBUG
	if (use_fences) {
		ml_memory_to_timebase_fence();
	}
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	if (__improbable(timewrite == TRUE)) {
		eabs = mach_absolute_time();


		/* Prevent the processor from calling iotrace during its
		 * initialization procedure. */
		if (current_processor()->state == PROCESSOR_RUNNING) {
			iotrace(IOTRACE_IO_WRITE, vaddr, paddr, size, val, sabs, eabs - sabs);
		}


		/* Check for a per-region override before deciding to report. */
		if (__improbable((eabs - sabs) > report_write_delay)) {
			uint64_t override = 0;
			override_io_timeouts(vaddr, NULL, &override);

			if (override != 0) {
#if SCHED_HYGIENE_DEBUG
				/*
				 * The IO timeout was overridden. As interrupts are disabled in
				 * order to accurately measure IO time this can cause the
				 * interrupt masked timeout threshold to be exceeded.  If the
				 * interrupt masked debug mode is set to panic, abandon the
				 * measurement. If in trace mode leave it as-is for
				 * observability.
				 */
				if (interrupt_masked_debug_mode == SCHED_HYGIENE_MODE_PANIC) {
					ml_spin_debug_clear(current_thread());
				}
#endif
				report_write_delay = override;
			}
		}

		/* Re-test against the (possibly raised) threshold. */
		if (__improbable((eabs - sabs) > report_write_delay)) {
			if (phy_write_panic && (machine_timeout_suspended() == FALSE)) {
#if defined(__x86_64__)
				panic_notify();
#endif /*  defined(__x86_64__) */

				uint64_t nsec = 0;
				absolutetime_to_nanoseconds(eabs - sabs, &nsec);
				panic("Write to IO vaddr %p paddr %p val 0x%llx took %llu ns,"
				    " (start: %llu, end: %llu), ceiling: %llu",
				    (void *)vaddr, (void *)paddr, val, nsec, sabs, eabs,
				    report_write_delay);
			}

			/* Restore interrupts before the (potentially slow) report. */
			(void)ml_set_interrupts_enabled(istate);

			if (report_phy_write_osbt) {
				uint64_t nsec = 0;
				absolutetime_to_nanoseconds(eabs - sabs, &nsec);
				OSReportWithBacktrace("ml_io_write size %d (v=%p, p=%p, 0x%llx) "
				    "took %lluus",
				    size, (void *)vaddr, (void *)paddr, val, nsec / NSEC_PER_USEC);
			}
			DTRACE_PHYSLAT5(physiowrite, uint64_t, (eabs - sabs),
			    uint64_t, vaddr, uint32_t, size, uint64_t, paddr, uint64_t, val);
		} else if (__improbable(trace_phy_write_delay > 0 && (eabs - sabs) > trace_phy_write_delay)) {
			/* Below the report threshold but above the trace threshold. */
			KDBG(MACHDBG_CODE(DBG_MACH_IO, DBC_MACH_IO_MMIO_WRITE),
			    (eabs - sabs), VM_KERNEL_UNSLIDE_OR_PERM(vaddr), paddr, val);

			(void)ml_set_interrupts_enabled(istate);
		} else {
			(void)ml_set_interrupts_enabled(istate);
		}
	}
#endif /* ML_IO_TIMEOUTS_ENABLED */
}
1140 
/* Write a single byte to the MMIO register mapped at vaddr. */
void
ml_io_write8(uintptr_t vaddr, uint8_t val)
{
	ml_io_write(vaddr, (uint64_t)val, 1);
}
1146 
/* Write a 16-bit value to the MMIO register mapped at vaddr. */
void
ml_io_write16(uintptr_t vaddr, uint16_t val)
{
	ml_io_write(vaddr, (uint64_t)val, 2);
}
1152 
/* Write a 32-bit value to the MMIO register mapped at vaddr. */
void
ml_io_write32(uintptr_t vaddr, uint32_t val)
{
	ml_io_write(vaddr, (uint64_t)val, 4);
}
1158 
/* Write a 64-bit value to the MMIO register mapped at vaddr. */
void
ml_io_write64(uintptr_t vaddr, uint64_t val)
{
	ml_io_write(vaddr, val, 8);
}
1164 
/*
 * Node in the singly-linked list of callbacks that are invoked on CPU
 * state-change events (see ml_broadcast_cpu_event()).  Nodes come from
 * permanent zone memory and are never freed.
 */
struct cpu_callback_chain_elem {
	cpu_callback_t                  fn;     /* callback to invoke */
	void                            *param; /* opaque argument handed back to fn */
	struct cpu_callback_chain_elem  *next;  /* next registered callback, or NULL */
};

/* Head of the callback chain; readers traverse lock-free (dependency order),
 * writers serialize on cpu_callback_chain_lock. */
static struct cpu_callback_chain_elem *cpu_callback_chain;
static LCK_GRP_DECLARE(cpu_callback_chain_lock_grp, "cpu_callback_chain");
static LCK_SPIN_DECLARE(cpu_callback_chain_lock, &cpu_callback_chain_lock_grp);
1174 
1175 void
cpu_event_register_callback(cpu_callback_t fn,void * param)1176 cpu_event_register_callback(cpu_callback_t fn, void *param)
1177 {
1178 	struct cpu_callback_chain_elem *new_elem;
1179 
1180 	new_elem = zalloc_permanent_type(struct cpu_callback_chain_elem);
1181 	if (!new_elem) {
1182 		panic("can't allocate cpu_callback_chain_elem");
1183 	}
1184 
1185 	lck_spin_lock(&cpu_callback_chain_lock);
1186 	new_elem->next = cpu_callback_chain;
1187 	new_elem->fn = fn;
1188 	new_elem->param = param;
1189 	os_atomic_store(&cpu_callback_chain, new_elem, release);
1190 	lck_spin_unlock(&cpu_callback_chain_lock);
1191 }
1192 
/*
 * Unregistering is intentionally unsupported: chain nodes are permanent
 * allocations traversed lock-free, so safe removal would require
 * additional synchronization that has not been implemented.  Always panics.
 */
__attribute__((noreturn))
void
cpu_event_unregister_callback(__unused cpu_callback_t fn)
{
	panic("Unfortunately, cpu_event_unregister_callback is unimplemented.");
}
1199 
1200 void
ml_broadcast_cpu_event(enum cpu_event event,unsigned int cpu_or_cluster)1201 ml_broadcast_cpu_event(enum cpu_event event, unsigned int cpu_or_cluster)
1202 {
1203 	struct cpu_callback_chain_elem *cursor;
1204 
1205 	cursor = os_atomic_load(&cpu_callback_chain, dependency);
1206 	for (; cursor != NULL; cursor = cursor->next) {
1207 		cursor->fn(cursor->param, event, cpu_or_cluster);
1208 	}
1209 }
1210 
1211 // Initialize Machine Timeouts (see the MACHINE_TIMEOUT macro
1212 // definition)
1213 
1214 void
machine_timeout_init_with_suffix(const struct machine_timeout_spec * spec,char const * suffix)1215 machine_timeout_init_with_suffix(const struct machine_timeout_spec *spec, char const *suffix)
1216 {
1217 	if (spec->skip_predicate != NULL && spec->skip_predicate(spec)) {
1218 		// This timeout should be disabled.
1219 		os_atomic_store_wide((uint64_t*)spec->ptr, 0, relaxed);
1220 		return;
1221 	}
1222 
1223 	assert(suffix != NULL);
1224 	assert(strlen(spec->name) <= MACHINE_TIMEOUT_MAX_NAME_LEN);
1225 
1226 	size_t const suffix_len = strlen(suffix);
1227 
1228 	size_t const dt_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + suffix_len + 1;
1229 	char dt_name[dt_name_size];
1230 
1231 	strlcpy(dt_name, spec->name, dt_name_size);
1232 	strlcat(dt_name, suffix, dt_name_size);
1233 
1234 	size_t const scale_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + suffix_len + strlen("-scale") + 1;
1235 	char scale_name[scale_name_size];
1236 
1237 	strlcpy(scale_name, spec->name, scale_name_size);
1238 	strlcat(scale_name, suffix, scale_name_size);
1239 	strlcat(scale_name, "-scale", scale_name_size);
1240 
1241 	size_t const boot_arg_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + strlen("ml-timeout-") + suffix_len + 1;
1242 	char boot_arg_name[boot_arg_name_size];
1243 
1244 	strlcpy(boot_arg_name, "ml-timeout-", boot_arg_name_size);
1245 	strlcat(boot_arg_name, spec->name, boot_arg_name_size);
1246 	strlcat(boot_arg_name, suffix, boot_arg_name_size);
1247 
1248 	size_t const boot_arg_scale_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN +
1249 	    strlen("ml-timeout-") + strlen("-scale") + suffix_len + 1;
1250 	char boot_arg_scale_name[boot_arg_scale_name_size];
1251 
1252 	strlcpy(boot_arg_scale_name, "ml-timeout-", boot_arg_scale_name_size);
1253 	strlcat(boot_arg_scale_name, spec->name, boot_arg_scale_name_size);
1254 	strlcat(boot_arg_scale_name, suffix, boot_arg_name_size);
1255 	strlcat(boot_arg_scale_name, "-scale", boot_arg_scale_name_size);
1256 
1257 
1258 	/*
1259 	 * Determine base value from DT and boot-args.
1260 	 */
1261 
1262 	DTEntry base, chosen;
1263 
1264 	if (SecureDTLookupEntry(NULL, "/machine-timeouts", &base) != kSuccess) {
1265 		base = NULL;
1266 	}
1267 
1268 	if (SecureDTLookupEntry(NULL, "/chosen/machine-timeouts", &chosen) != kSuccess) {
1269 		chosen = NULL;
1270 	}
1271 
1272 	uint64_t timeout = spec->default_value;
1273 	bool found = false;
1274 
1275 	uint64_t const *data = NULL;
1276 	unsigned int data_size = sizeof(*data);
1277 
1278 	/* First look in /machine-timeouts/<name> */
1279 	if (base != NULL && SecureDTGetProperty(base, dt_name, (const void **)&data, &data_size) == kSuccess) {
1280 		if (data_size != sizeof(*data)) {
1281 			panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/%s", __func__, data_size, dt_name);
1282 		}
1283 
1284 		timeout = *data;
1285 		found = true;
1286 	}
1287 
1288 	/* A value in /chosen/machine-timeouts/<name> overrides */
1289 	if (chosen != NULL && SecureDTGetProperty(chosen, dt_name, (const void **)&data, &data_size) == kSuccess) {
1290 		if (data_size != sizeof(*data)) {
1291 			panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/%s", __func__, data_size, dt_name);
1292 		}
1293 
1294 		timeout = *data;
1295 		found = true;
1296 	}
1297 
1298 	/* A boot-arg ml-timeout-<name> overrides */
1299 	uint64_t boot_arg = 0;
1300 
1301 	if (PE_parse_boot_argn(boot_arg_name, &boot_arg, sizeof(boot_arg))) {
1302 		timeout = boot_arg;
1303 		found = true;
1304 	}
1305 
1306 
1307 	/*
1308 	 * Determine scale value from DT and boot-args.
1309 	 */
1310 
1311 	uint64_t scale = 1;
1312 	uint32_t const *scale_data;
1313 	unsigned int scale_size = sizeof(scale_data);
1314 
1315 	/* If there is a scale factor /machine-timeouts/<name>-scale, apply it. */
1316 	if (base != NULL && SecureDTGetProperty(base, scale_name, (const void **)&scale_data, &scale_size) == kSuccess) {
1317 		if (scale_size != sizeof(*scale_data)) {
1318 			panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/%s-scale", __func__, scale_size, dt_name);
1319 		}
1320 
1321 		scale = *scale_data;
1322 	}
1323 
1324 	/* If there is a scale factor /chosen/machine-timeouts/<name>-scale, use that. */
1325 	if (chosen != NULL && SecureDTGetProperty(chosen, scale_name, (const void **)&scale_data, &scale_size) == kSuccess) {
1326 		if (scale_size != sizeof(*scale_data)) {
1327 			panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/%s-scale", __func__,
1328 			    scale_size, dt_name);
1329 		}
1330 
1331 		scale = *scale_data;
1332 	}
1333 
1334 	/* Finally, a boot-arg ml-timeout-<name>-scale takes precedence. */
1335 	if (PE_parse_boot_argn(boot_arg_scale_name, &boot_arg, sizeof(boot_arg))) {
1336 		scale = boot_arg;
1337 	}
1338 
1339 	static bool global_scale_set;
1340 	static uint64_t global_scale;
1341 
1342 	if (!global_scale_set) {
1343 		/* Apply /machine-timeouts/global-scale if present */
1344 		if (SecureDTGetProperty(base, "global-scale", (const void **)&scale_data, &scale_size) == kSuccess) {
1345 			if (scale_size != sizeof(*scale_data)) {
1346 				panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/global-scale", __func__,
1347 				    scale_size);
1348 			}
1349 
1350 			global_scale = *scale_data;
1351 			global_scale_set = true;
1352 		}
1353 
1354 		/* Use /chosen/machine-timeouts/global-scale if present */
1355 		if (SecureDTGetProperty(chosen, "global-scale", (const void **)&scale_data, &scale_size) == kSuccess) {
1356 			if (scale_size != sizeof(*scale_data)) {
1357 				panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/global-scale", __func__,
1358 				    scale_size);
1359 			}
1360 
1361 			global_scale = *scale_data;
1362 			global_scale_set = true;
1363 		}
1364 
1365 		/* Finally, the boot-arg ml-timeout-global-scale takes precedence. */
1366 		if (PE_parse_boot_argn("ml-timeout-global-scale", &boot_arg, sizeof(boot_arg))) {
1367 			global_scale = boot_arg;
1368 			global_scale_set = true;
1369 		}
1370 	}
1371 
1372 	if (global_scale_set) {
1373 		scale *= global_scale;
1374 	}
1375 
1376 	/* Compute the final timeout, and done. */
1377 	if (found && timeout > 0) {
1378 		/* Only apply inherent unit scale if the value came in
1379 		 * externally. */
1380 
1381 		if (spec->unit_scale == MACHINE_TIMEOUT_UNIT_TIMEBASE) {
1382 			uint64_t nanoseconds = timeout / 1000;
1383 			nanoseconds_to_absolutetime(nanoseconds, &timeout);
1384 		} else {
1385 			timeout /= spec->unit_scale;
1386 		}
1387 
1388 		if (timeout == 0) {
1389 			/* Ensure unit scaling did not disable the timeout. */
1390 			timeout = 1;
1391 		}
1392 	}
1393 
1394 	if (os_mul_overflow(timeout, scale, &timeout)) {
1395 		timeout = UINT64_MAX; // clamp
1396 	}
1397 
1398 	os_atomic_store_wide((uint64_t*)spec->ptr, timeout, relaxed);
1399 }
1400 
/*
 * Initialize a machine timeout using its base (unsuffixed) property and
 * boot-arg names.
 */
void
machine_timeout_init(const struct machine_timeout_spec *spec)
{
	char const *const no_suffix = "";

	machine_timeout_init_with_suffix(spec, no_suffix);
}
1406 
1407 #if DEVELOPMENT || DEBUG
1408 /*
1409  * Late timeout (re-)initialization, at the end of bsd_init()
1410  */
/*
 * Re-initialize selected machine timeouts at the end of bsd_init(), using
 * the "-b" (late-boot) property/boot-arg variants, and reset per-CPU
 * preemption-disable statistics accumulated during early boot.
 */
void
machine_timeout_bsd_init(void)
{
	char const * const __unused mt_suffix = "-b";
#if SCHED_HYGIENE_DEBUG
	machine_timeout_init_with_suffix(MACHINE_TIMEOUT_SPEC_REF(interrupt_masked_timeout), mt_suffix);
	machine_timeout_init_with_suffix(MACHINE_TIMEOUT_SPEC_REF(sched_preemption_disable_threshold_mt), mt_suffix);

	/*
	 * The io timeouts can inherit from interrupt_masked_timeout.
	 * Re-initialize, as interrupt_masked_timeout may have changed.
	 */
	ml_io_init_timeouts();

	PERCPU_DECL(uint64_t _Atomic, preemption_disable_max_mt);

	/*
	 * Reset the preemption disable stats, so that they are not
	 * polluted by long early boot code.
	 */
	percpu_foreach(max_stat, preemption_disable_max_mt) {
		os_atomic_store(max_stat, 0, relaxed);

		/*
		 * No additional synchronization needed.  The time when we
		 * switch to late boot timeouts is relatively arbitrary
		 * anyway: By now we don't expect any long preemption
		 * disabling anymore. While that is still a clear delineation
		 * for the boot CPU, other CPUs can be in the middle of doing
		 * whatever. So if the missing synchronization causes a new
		 * maximum to be missed on a secondary CPU, it could just as
		 * well have been missed by racing with this function.
		 */
	}

#endif
}
1448 #endif /* DEVELOPMENT || DEBUG */
1449 
1450 #if ML_IO_TIMEOUTS_ENABLED && CONFIG_XNUPOST
1451 #include <tests/xnupost.h>
1452 
1453 extern kern_return_t ml_io_timeout_test(void);
1454 
1455 static inline void
ml_io_timeout_test_get_timeouts(uintptr_t vaddr,uint64_t * read_timeout,uint64_t * write_timeout)1456 ml_io_timeout_test_get_timeouts(uintptr_t vaddr, uint64_t *read_timeout, uint64_t *write_timeout)
1457 {
1458 	*read_timeout = 0;
1459 	*write_timeout = 0;
1460 
1461 	boolean_t istate = ml_set_interrupts_enabled(FALSE);
1462 	override_io_timeouts(vaddr, read_timeout, write_timeout);
1463 	ml_set_interrupts_enabled(istate);
1464 }
1465 
1466 kern_return_t
ml_io_timeout_test(void)1467 ml_io_timeout_test(void)
1468 {
1469 	const size_t SIZE = 16;
1470 	uintptr_t iovaddr_base1 = (uintptr_t)&ml_io_timeout_test;
1471 	uintptr_t iovaddr_base2 = iovaddr_base1 + SIZE;
1472 	uintptr_t vaddr1 = iovaddr_base1 + SIZE / 2;
1473 	uintptr_t vaddr2 = iovaddr_base2 + SIZE / 2;
1474 
1475 	const uint64_t READ_TIMEOUT1_US = 50000, WRITE_TIMEOUT1_US = 50001;
1476 	const uint64_t READ_TIMEOUT2_US = 50002, WRITE_TIMEOUT2_US = 50003;
1477 	uint64_t read_timeout1_abs, write_timeout1_abs;
1478 	uint64_t read_timeout2_abs, write_timeout2_abs;
1479 	nanoseconds_to_absolutetime(NSEC_PER_USEC * READ_TIMEOUT1_US, &read_timeout1_abs);
1480 	nanoseconds_to_absolutetime(NSEC_PER_USEC * WRITE_TIMEOUT1_US, &write_timeout1_abs);
1481 	nanoseconds_to_absolutetime(NSEC_PER_USEC * READ_TIMEOUT2_US, &read_timeout2_abs);
1482 	nanoseconds_to_absolutetime(NSEC_PER_USEC * WRITE_TIMEOUT2_US, &write_timeout2_abs);
1483 
1484 	int err = ml_io_increase_timeouts(iovaddr_base1, 0, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1485 	T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for empty region");
1486 
1487 	err = ml_io_increase_timeouts(iovaddr_base1, 4097, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1488 	T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for region > 4096 bytes");
1489 
1490 	err = ml_io_increase_timeouts(UINTPTR_MAX, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1491 	T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for overflowed region");
1492 
1493 	err = ml_io_increase_timeouts(iovaddr_base1, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1494 	T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for first VA region should succeed");
1495 
1496 	err = ml_io_increase_timeouts(iovaddr_base2, SIZE, READ_TIMEOUT2_US, WRITE_TIMEOUT2_US);
1497 	T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for second VA region should succeed");
1498 
1499 	err = ml_io_increase_timeouts(iovaddr_base1, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1500 	T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for same region twice");
1501 
1502 	err = ml_io_increase_timeouts(vaddr1, (uint32_t)(vaddr2 - vaddr1), READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
1503 	T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for overlapping regions");
1504 
1505 	uint64_t read_timeout, write_timeout;
1506 	ml_io_timeout_test_get_timeouts(vaddr1, &read_timeout, &write_timeout);
1507 	T_EXPECT_EQ_ULLONG(read_timeout, read_timeout1_abs, "Read timeout for first region");
1508 	T_EXPECT_EQ_ULLONG(write_timeout, write_timeout1_abs, "Write timeout for first region");
1509 
1510 	ml_io_timeout_test_get_timeouts(vaddr2, &read_timeout, &write_timeout);
1511 	T_EXPECT_EQ_ULLONG(read_timeout, read_timeout2_abs, "Read timeout for first region");
1512 	T_EXPECT_EQ_ULLONG(write_timeout, write_timeout2_abs, "Write timeout for first region");
1513 
1514 	ml_io_timeout_test_get_timeouts(iovaddr_base2 + SIZE, &read_timeout, &write_timeout);
1515 	T_EXPECT_EQ_ULLONG(read_timeout, 0, "Read timeout without override");
1516 	T_EXPECT_EQ_ULLONG(write_timeout, 0, "Write timeout without override");
1517 
1518 	err = ml_io_reset_timeouts(iovaddr_base1 + 1, SIZE - 1);
1519 	T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for subregion");
1520 
1521 	err = ml_io_reset_timeouts(iovaddr_base2 + SIZE, SIZE);
1522 	T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for non-existent region");
1523 
1524 	err = ml_io_reset_timeouts(iovaddr_base1, SIZE);
1525 	T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for first VA region should succeed");
1526 
1527 	ml_io_timeout_test_get_timeouts(vaddr1, &read_timeout, &write_timeout);
1528 	T_EXPECT_EQ_ULLONG(read_timeout, 0, "Read timeout for reset region");
1529 	T_EXPECT_EQ_ULLONG(write_timeout, 0, "Write timeout for reset region");
1530 
1531 	err = ml_io_reset_timeouts(iovaddr_base1, SIZE);
1532 	T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for same region twice");
1533 
1534 	err = ml_io_reset_timeouts(iovaddr_base2, SIZE);
1535 	T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for second VA region should succeed");
1536 
1537 	return KERN_SUCCESS;
1538 }
#endif /* ML_IO_TIMEOUTS_ENABLED && CONFIG_XNUPOST */
1540