/*
 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	File:	kern/machine.c
 *	Author:	Avadis Tevanian, Jr.
 *	Date:	1987
 *
 *	Support for machine independent machine abstraction.
 */

#include <string.h>

#include <mach/mach_types.h>
#include <mach/boolean.h>
#include <mach/kern_return.h>
#include <mach/machine.h>
#include <mach/host_info.h>
#include <mach/host_reboot.h>
#include <mach/host_priv_server.h>
#include <mach/processor_server.h>
#include <mach/sdt.h>

#include <kern/kern_types.h>
#include <kern/cpu_data.h>
#include <kern/ipc_host.h>
#include <kern/host.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/percpu.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/sched.h>
#include <kern/startup.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/iotrace.h>

#include <libkern/OSDebug.h>
#if ML_IO_TIMEOUTS_ENABLED
#include <libkern/tree.h>
#endif

#include <pexpert/device_tree.h>

#include <machine/commpage.h>
#include <machine/machine_routines.h>

#if HIBERNATION
#include <IOKit/IOHibernatePrivate.h>
#endif
#include <IOKit/IOPlatformExpert.h>

#if CONFIG_DTRACE
extern void (*dtrace_cpu_state_changed_hook)(int, boolean_t);
#endif

#if defined(__arm64__)
extern void wait_while_mp_kdp_trap(bool check_SIGPdebug);
#include <arm/pmap/pmap_data.h>
#endif

#if defined(__x86_64__)
#include <i386/panic_notify.h>
#endif

/*
 *	Exported variables:
 */

struct machine_info     machine_info;

/* Forwards */
static void
processor_doshutdown(processor_t processor);

static void
processor_offline(void * parameter, __unused wait_result_t result);

static void
processor_offline_intstack(processor_t processor) __dead2;

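/*
 *	processor_up_update_counts:
 *
 *	Bump the processor availability counters (total, user-recommended,
 *	and the primary-core variants) and refresh the commpage's active
 *	CPU count.  Called under the pset lock as a processor comes online.
 */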
static void
processor_up_update_counts(processor_t processor)
{
	ml_cpu_up_update_counts(processor->cpu_id);

	os_atomic_inc(&processor_avail_count, relaxed);
	if (processor->is_recommended) {
		os_atomic_inc(&processor_avail_count_user, relaxed);
	}
	if (processor->processor_primary == processor) {
		os_atomic_inc(&primary_processor_avail_count, relaxed);
		if (processor->is_recommended) {
			os_atomic_inc(&primary_processor_avail_count_user, relaxed);
		}
	}
	commpage_update_active_cpus();
}

/*
 *	processor_up:
 *
 *	Flag processor as up and running, and available
 *	for scheduling.
 */
void
processor_up(
	processor_t                     processor)
{
	processor_set_t         pset;
	spl_t                           s;

	s = splsched();
	init_ast_check(processor);

#if defined(__arm64__)
	/*
	 * A processor coming online won't have received a SIGPdebug signal
	 * to cause it to spin while a stackshot or panic is taking place,
	 * so spin here on mp_kdp_trap.
	 *
	 * However, since cpu_signal() is not yet enabled for this processor,
	 * there is a race if we have just passed this when a cpu_signal()
	 * is attempted.  The sender will assume the cpu is offline, so it will
	 * not end up spinning anywhere.  See processor_offline() for the fix
	 * for this race.
	 */
	wait_while_mp_kdp_trap(false);
#endif

	pset = processor->processor_set;
	simple_lock(&sched_available_cores_lock, LCK_GRP_NULL);
	pset_lock(pset);

	++pset->online_processor_count;
	simple_lock(&processor->start_state_lock, LCK_GRP_NULL);
	pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
	simple_unlock(&processor->start_state_lock);
	bool temporary = processor->shutdown_temporary;
	if (temporary) {
		processor->shutdown_temporary = false;
	} else {
		processor_up_update_counts(processor);
	}
	if (processor->is_recommended) {
		SCHED(pset_made_schedulable)(processor, pset, false);
	}
	pset_unlock(pset);
	ml_cpu_up();
	smr_cpu_up(processor, SMR_CPU_REASON_OFFLINE);
	sched_mark_processor_online_locked(processor, processor->last_startup_reason);
	simple_unlock(&sched_available_cores_lock);
	splx(s);

	thread_wakeup((event_t)&processor->state);

#if CONFIG_DTRACE
	if (dtrace_cpu_state_changed_hook) {
		(*dtrace_cpu_state_changed_hook)(processor->cpu_id, TRUE);
	}
#endif
}
#include <atm/atm_internal.h>

kern_return_t
host_reboot(
	host_priv_t             host_priv,
	int                             options)
{
	if (host_priv == HOST_PRIV_NULL) {
		return KERN_INVALID_HOST;
	}

#if DEVELOPMENT || DEBUG
	if (options & HOST_REBOOT_DEBUGGER) {
		Debugger("Debugger");
		return KERN_SUCCESS;
	}
#endif

	if (options & HOST_REBOOT_UPSDELAY) {
		// UPS power cutoff path
		PEHaltRestart( kPEUPSDelayHaltCPU );
	} else {
		halt_all_cpus(!(options & HOST_REBOOT_HALT));
	}

	return KERN_SUCCESS;
}

kern_return_t
processor_assign(
	__unused processor_t            processor,
	__unused processor_set_t        new_pset,
	__unused boolean_t              wait)
{
	return KERN_FAILURE;
}

static void
processor_down_update_counts(processor_t processor)
{
	ml_cpu_down_update_counts(processor->cpu_id);

	os_atomic_dec(&processor_avail_count, relaxed);
	if (processor->is_recommended) {
		os_atomic_dec(&processor_avail_count_user, relaxed);
	}
	if (processor->processor_primary == processor) {
		os_atomic_dec(&primary_processor_avail_count, relaxed);
		if (processor->is_recommended) {
			os_atomic_dec(&primary_processor_avail_count_user, relaxed);
		}
	}
	commpage_update_active_cpus();
}

extern lck_mtx_t processor_updown_lock;

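/*
 *	processor_shutdown:
 *
 *	Take a processor offline.  The `reason' argument records why the
 *	shutdown was requested; a processor marked shutdown_locked refuses
 *	every reason except REASON_SYSTEM.  Passing SHUTDOWN_TEMPORARY in
 *	`flags' leaves the availability counts untouched so that a later
 *	processor_up() can bring the processor back without re-counting it.
 */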
kern_return_t
processor_shutdown(
	processor_t                     processor,
	processor_reason_t              reason,
	uint32_t                        flags)
{
	if (!ml_cpu_can_exit(processor->cpu_id, reason)) {
		/*
		 * Failure if disallowed by arch code.
		 */
		return KERN_NOT_SUPPORTED;
	}

	lck_mtx_lock(&processor_updown_lock);

	kern_return_t mark_ret = sched_mark_processor_offline(processor, reason);
	if (mark_ret != KERN_SUCCESS) {
		/* Must fail or we deadlock */
		lck_mtx_unlock(&processor_updown_lock);
		return KERN_FAILURE;
	}

	ml_cpu_begin_state_transition(processor->cpu_id);
	spl_t s = splsched();
	processor_set_t pset = processor->processor_set;

	pset_lock(pset);
	if (processor->state == PROCESSOR_OFF_LINE) {
		/*
		 * Success if already shutdown.
		 */
		if (processor->shutdown_temporary && !(flags & SHUTDOWN_TEMPORARY)) {
			/* Convert a temporary shutdown into a permanent shutdown */
			processor->shutdown_temporary = false;
			processor_down_update_counts(processor);
		}
		pset_unlock(pset);
		splx(s);
		ml_cpu_end_state_transition(processor->cpu_id);

		lck_mtx_unlock(&processor_updown_lock);
		return KERN_SUCCESS;
	}

	if (processor->shutdown_locked && (reason != REASON_SYSTEM)) {
		/*
		 * Failure if processor is locked against shutdown.
		 */
		pset_unlock(pset);
		splx(s);

		lck_mtx_unlock(&processor_updown_lock);
		return KERN_FAILURE;
	}

	if (processor->state == PROCESSOR_START) {
		pset_unlock(pset);
		splx(s);

		processor_wait_for_start(processor);

		s = splsched();
		pset_lock(pset);
	}

	/*
	 * If the processor is dispatching, let it finish.
	 */
	while (processor->state == PROCESSOR_DISPATCHING) {
		pset_unlock(pset);
		splx(s);
		delay(1);
		s = splsched();
		pset_lock(pset);
	}

	/*
	 * Success if already being shutdown with matching SHUTDOWN_TEMPORARY flag.
	 */
	if ((processor->state == PROCESSOR_SHUTDOWN) || (processor->state == PROCESSOR_PENDING_OFFLINE)) {
		bool success = (flags & SHUTDOWN_TEMPORARY) ? processor->shutdown_temporary : !processor->shutdown_temporary;

		pset_unlock(pset);
		splx(s);
		ml_cpu_end_state_transition(processor->cpu_id);

		lck_mtx_unlock(&processor_updown_lock);
		return success ? KERN_SUCCESS : KERN_FAILURE;
	}

	ml_broadcast_cpu_event(CPU_EXIT_REQUESTED, processor->cpu_id);
	pset_update_processor_state(pset, processor, PROCESSOR_SHUTDOWN);
	processor->last_shutdown_reason = reason;
	if (flags & SHUTDOWN_TEMPORARY) {
		processor->shutdown_temporary = true;
	}
	pset_unlock(pset);

	processor_doshutdown(processor);
	splx(s);

	cpu_exit_wait(processor->cpu_id);

	if (processor != master_processor) {
		s = splsched();
		pset_lock(pset);
		pset_update_processor_state(pset, processor, PROCESSOR_OFF_LINE);
		pset_unlock(pset);
		splx(s);
	}

	ml_cpu_end_state_transition(processor->cpu_id);
	ml_broadcast_cpu_event(CPU_EXITED, processor->cpu_id);
	ml_cpu_power_disable(processor->cpu_id);

	lck_mtx_unlock(&processor_updown_lock);
	return KERN_SUCCESS;
}

/*
 * Called with interrupts disabled.
 */
static void
processor_doshutdown(
	processor_t processor)
{
	thread_t self = current_thread();

	/*
	 *	Get onto the processor to shutdown
	 */
	processor_t prev = thread_bind(processor);
	thread_block(THREAD_CONTINUE_NULL);

	/* interrupts still disabled */
	assert(ml_get_interrupts_enabled() == FALSE);

	assert(processor == current_processor());
	assert(processor->state == PROCESSOR_SHUTDOWN);

#if CONFIG_DTRACE
	if (dtrace_cpu_state_changed_hook) {
		(*dtrace_cpu_state_changed_hook)(processor->cpu_id, FALSE);
	}
#endif

#if defined(__arm64__)
	/*
	 * Catch a processor going offline
	 * while a panic or stackshot is in progress, as it won't
	 * receive a SIGPdebug now that interrupts are disabled.
	 */
	wait_while_mp_kdp_trap(false);
#endif

	smr_cpu_down(processor, SMR_CPU_REASON_OFFLINE);
	ml_cpu_down();

#if HIBERNATION
	if (processor_avail_count < 2) {
		hibernate_vm_lock();
		hibernate_vm_unlock();
	}
#endif

	processor_set_t pset = processor->processor_set;

	pset_lock(pset);
	pset_update_processor_state(pset, processor, PROCESSOR_PENDING_OFFLINE);
	--pset->online_processor_count;
	if (!processor->shutdown_temporary) {
		processor_down_update_counts(processor);
	}
	SCHED(processor_queue_shutdown)(processor);
	/* pset lock dropped */
	SCHED(rt_queue_shutdown)(processor);

	thread_bind(prev);

	/* interrupts still disabled */

	/*
	 * Continue processor shutdown on the processor's idle thread.
	 * The handoff won't fail because the idle thread has a reserved stack.
	 * Switching to the idle thread leaves interrupts disabled,
	 * so we can't accidentally take an interrupt after the context switch.
	 */
	thread_t shutdown_thread = processor->idle_thread;
	shutdown_thread->continuation = processor_offline;
	shutdown_thread->parameter = processor;

	thread_run(self, NULL, NULL, shutdown_thread);
}

/*
 * Called in the context of the idle thread to shut down the processor
 *
 * A shut-down processor looks like it's 'running' the idle thread parked
 * in this routine, but it's actually been powered off and has no hardware state.
 */
static void
processor_offline(
	void * parameter,
	__unused wait_result_t result)
{
	processor_t processor = (processor_t) parameter;
	thread_t self = current_thread();
	__assert_only thread_t old_thread = THREAD_NULL;

	assert(processor == current_processor());
	assert(self->state & TH_IDLE);
	assert(processor->idle_thread == self);
	assert(ml_get_interrupts_enabled() == FALSE);
	assert(self->continuation == NULL);
	assert(processor->processor_offlined == false);
	assert(processor->running_timers_active == false);

	bool enforce_quiesce_safety = gEnforcePlatformActionSafety;

	/*
	 * Scheduling is now disabled for this processor.
	 * Ensure that primitives that need scheduling (like mutexes) know this.
	 */
	if (enforce_quiesce_safety) {
		disable_preemption_without_measurements();
	}

	/* convince slave_main to come back here */
	processor->processor_offlined = true;

	/*
	 * Switch to the interrupt stack and shut down the processor.
	 *
	 * When the processor comes back, it will eventually call load_context which
	 * restores the context saved by machine_processor_shutdown, returning here.
	 */
	old_thread = machine_processor_shutdown(self, processor_offline_intstack, processor);

	/* old_thread should be NULL because we got here through Load_context */
	assert(old_thread == THREAD_NULL);

	assert(processor == current_processor());
	assert(processor->idle_thread == current_thread());

	assert(ml_get_interrupts_enabled() == FALSE);
	assert(self->continuation == NULL);

	/* Extract the machine_param value stashed by slave_main */
	void * machine_param = self->parameter;
	self->parameter = NULL;

	/* Re-initialize the processor */
	slave_machine_init(machine_param);

	assert(processor->processor_offlined == true);
	processor->processor_offlined = false;

	if (enforce_quiesce_safety) {
		enable_preemption();
	}

#if defined(__arm64__)
	/*
	 * See the comments about mp_kdp_trap in processor_up().
	 *
	 * SIGPdisabled is cleared (to enable cpu_signal() to succeed with this processor)
	 * the first time we take an IPI.  This is triggered by slave_machine_init(), above,
	 * which calls cpu_machine_init()->PE_cpu_machine_init()->PE_cpu_signal() which sends
	 * a self-IPI to ensure that happens when we enable interrupts.  So enable interrupts
	 * here so that cpu_signal() can succeed before we spin on mp_kdp_trap.
	 */
	ml_set_interrupts_enabled(TRUE);

	ml_set_interrupts_enabled(FALSE);

	wait_while_mp_kdp_trap(true);

	/*
	 * At this point, if a stackshot or panic is in progress, we either
	 * spin on mp_kdp_trap or we successfully received a SIGPdebug signal,
	 * which will cause us to break out of the spin on mp_kdp_trap and
	 * instead spin the next time interrupts are enabled in idle_thread().
	 */
#endif

	/*
	 * Now that the processor is back, invoke the idle thread to find out what to do next.
	 * idle_thread will enable interrupts.
	 */
	thread_block(idle_thread);
	/*NOTREACHED*/
}

/*
 * Complete the shutdown and place the processor offline.
 *
 * Called at splsched in the shutdown context
 * (i.e. on the idle thread, on the interrupt stack)
 *
 * The onlining half of this is done in load_context().
 */
static void
processor_offline_intstack(
	processor_t processor)
{
	assert(processor == current_processor());
	assert(processor->active_thread == current_thread());

	struct recount_snap snap = { 0 };
	recount_snapshot(&snap);
	recount_processor_idle(&processor->pr_recount, &snap);

	smr_cpu_leave(processor, processor->last_dispatch);

	PMAP_DEACTIVATE_KERNEL(processor->cpu_id);

	cpu_sleep();
	panic("zombie processor");
	/*NOTREACHED*/
}

kern_return_t
host_get_boot_info(
	host_priv_t         host_priv,
	kernel_boot_info_t  boot_info)
{
	const char *src = "";
	if (host_priv == HOST_PRIV_NULL) {
		return KERN_INVALID_HOST;
	}

	/*
	 * Copy first operator string terminated by '\0' followed by
	 *	standardized strings generated from boot string.
	 */
	src = machine_boot_info(boot_info, KERNEL_BOOT_INFO_MAX);
	if (src != boot_info) {
		(void) strncpy(boot_info, src, KERNEL_BOOT_INFO_MAX);
	}

	return KERN_SUCCESS;
}

// These are configured through sysctls.
#if DEVELOPMENT || DEBUG
uint32_t phy_read_panic = 1;
uint32_t phy_write_panic = 1;
uint64_t simulate_stretched_io = 0;
#else
uint32_t phy_read_panic = 0;
uint32_t phy_write_panic = 0;
#endif

#if !defined(__x86_64__)

#if DEVELOPMENT || DEBUG
static const uint64_t TIMEBASE_TICKS_PER_USEC = 24000000ULL / USEC_PER_SEC;
static const uint64_t DEFAULT_TRACE_PHY_TIMEOUT = 100 * TIMEBASE_TICKS_PER_USEC;
#else
static const uint64_t DEFAULT_TRACE_PHY_TIMEOUT = 0;
#endif

// The MACHINE_TIMEOUT facility only exists on ARM.
MACHINE_TIMEOUT_DEV_WRITEABLE(report_phy_read_delay_to, "report-phy-read-delay", 0, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
MACHINE_TIMEOUT_DEV_WRITEABLE(report_phy_write_delay_to, "report-phy-write-delay", 0, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
MACHINE_TIMEOUT_DEV_WRITEABLE(trace_phy_read_delay_to, "trace-phy-read-delay", DEFAULT_TRACE_PHY_TIMEOUT, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
MACHINE_TIMEOUT_DEV_WRITEABLE(trace_phy_write_delay_to, "trace-phy-write-delay", DEFAULT_TRACE_PHY_TIMEOUT, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
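// The four timeouts above follow the standard MACHINE_TIMEOUT override
// scheme: a /machine-timeouts/<name> device tree entry or an
// ml-timeout-<name> boot-arg (e.g. "ml-timeout-report-phy-read-delay")
// replaces the default; see machine_timeout_init_with_suffix() below
// for the exact lookup order.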

#if SCHED_HYGIENE_DEBUG
/*
 * Note: The interrupt-masked timeout goes through two initializations - one
 * early in boot and one later. Thus this function is also called twice and
 * can't be marked '__startup_func'.
 */
static void
ml_io_init_timeouts(void)
{
	/*
	 * The timeouts may be completely disabled via an override. Check that
	 * last and set the timeouts to zero (disabling) if that's the case.
	 */
	if (kern_feature_override(KF_IO_TIMEOUT_OVRD)) {
		os_atomic_store(&report_phy_write_delay_to, 0, relaxed);
		os_atomic_store(&report_phy_read_delay_to, 0, relaxed);
	}
}

/*
 * It's important that this happens after machine timeouts have initialized so
 * the correct timeouts can be inherited.
 */
STARTUP(TIMEOUTS, STARTUP_RANK_SECOND, ml_io_init_timeouts);
#endif /* SCHED_HYGIENE_DEBUG */

extern pmap_paddr_t kvtophys(vm_offset_t va);
#endif

#if ML_IO_TIMEOUTS_ENABLED

static LCK_GRP_DECLARE(io_timeout_override_lock_grp, "io_timeout_override");
static LCK_SPIN_DECLARE(io_timeout_override_lock, &io_timeout_override_lock_grp);

struct io_timeout_override_entry {
	RB_ENTRY(io_timeout_override_entry) tree;

	uintptr_t iovaddr_base;
	unsigned int size;
	uint32_t read_timeout;
	uint32_t write_timeout;
};

static inline int
io_timeout_override_cmp(const struct io_timeout_override_entry *a, const struct io_timeout_override_entry *b)
{
	if (a->iovaddr_base < b->iovaddr_base) {
		return -1;
	} else if (a->iovaddr_base > b->iovaddr_base) {
		return 1;
	} else {
		return 0;
	}
}

static RB_HEAD(io_timeout_override, io_timeout_override_entry) io_timeout_override_root;
RB_PROTOTYPE_PREV(io_timeout_override, io_timeout_override_entry, tree, io_timeout_override_cmp);
RB_GENERATE_PREV(io_timeout_override, io_timeout_override_entry, tree, io_timeout_override_cmp);

#endif /* ML_IO_TIMEOUTS_ENABLED */

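/*
 * ml_io_increase_timeouts:
 *
 * Register an override that raises the MMIO read/write timeouts for the
 * virtual address range [iovaddr_base, iovaddr_base + size).  The range
 * must be at most 4096 bytes, must not overlap a previously registered
 * range, and the requested timeouts (in microseconds) must fit in 32
 * bits once converted to absolute time.  Pair with
 * ml_io_reset_timeouts() to remove the override.
 *
 * A minimal usage sketch (`regs_base', `regs_size', and
 * `STATUS_REG_OFFSET' are hypothetical driver values, not part of this
 * API):
 *
 *	if (ml_io_increase_timeouts(regs_base, regs_size, 500, 500) == KERN_SUCCESS) {
 *		(void) ml_io_read32(regs_base + STATUS_REG_OFFSET);
 *		ml_io_reset_timeouts(regs_base, regs_size);
 *	}
 */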
int
ml_io_increase_timeouts(uintptr_t iovaddr_base, unsigned int size, uint32_t read_timeout_us, uint32_t write_timeout_us)
{
#if ML_IO_TIMEOUTS_ENABLED
	const size_t MAX_SIZE = 4096;
	const uint64_t MAX_TIMEOUT_ABS = UINT32_MAX;

	assert(preemption_enabled());

	int ret = KERN_SUCCESS;

	if (size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	uintptr_t iovaddr_end;
	if (size > MAX_SIZE || os_add_overflow(iovaddr_base, size - 1, &iovaddr_end)) {
		return KERN_INVALID_ARGUMENT;
	}

	uint64_t read_timeout_abs, write_timeout_abs;
	nanoseconds_to_absolutetime(NSEC_PER_USEC * read_timeout_us, &read_timeout_abs);
	nanoseconds_to_absolutetime(NSEC_PER_USEC * write_timeout_us, &write_timeout_abs);
	if (read_timeout_abs > MAX_TIMEOUT_ABS || write_timeout_abs > MAX_TIMEOUT_ABS) {
		return KERN_INVALID_ARGUMENT;
	}

	struct io_timeout_override_entry *node = kalloc_type(struct io_timeout_override_entry, Z_WAITOK | Z_ZERO | Z_NOFAIL);
	node->iovaddr_base = iovaddr_base;
	node->size = size;
	node->read_timeout = (uint32_t)read_timeout_abs;
	node->write_timeout = (uint32_t)write_timeout_abs;

	/*
	 * Interrupt handlers are allowed to call ml_io_{read,write}*, so
	 * interrupts must be disabled any time io_timeout_override_lock is
	 * held.  Otherwise the CPU could take an interrupt while holding the
	 * lock, invoke an ISR that calls ml_io_{read,write}*, and deadlock
	 * trying to acquire the lock again.
	 */
	boolean_t istate = ml_set_interrupts_enabled(FALSE);
	lck_spin_lock(&io_timeout_override_lock);
	if (RB_INSERT(io_timeout_override, &io_timeout_override_root, node)) {
		ret = KERN_INVALID_ARGUMENT;
		goto out;
	}

	/* Check that this didn't create any new overlaps */
	struct io_timeout_override_entry *prev = RB_PREV(io_timeout_override, &io_timeout_override_root, node);
	if (prev && (prev->iovaddr_base + prev->size) > node->iovaddr_base) {
		RB_REMOVE(io_timeout_override, &io_timeout_override_root, node);
		ret = KERN_INVALID_ARGUMENT;
		goto out;
	}
	struct io_timeout_override_entry *next = RB_NEXT(io_timeout_override, &io_timeout_override_root, node);
	if (next && (node->iovaddr_base + node->size) > next->iovaddr_base) {
		RB_REMOVE(io_timeout_override, &io_timeout_override_root, node);
		ret = KERN_INVALID_ARGUMENT;
		goto out;
	}

out:
	lck_spin_unlock(&io_timeout_override_lock);
	ml_set_interrupts_enabled(istate);
	if (ret != KERN_SUCCESS) {
		kfree_type(struct io_timeout_override_entry, node);
	}
	return ret;
#else /* !ML_IO_TIMEOUTS_ENABLED */
#pragma unused(iovaddr_base, size, read_timeout_us, write_timeout_us)
	return KERN_SUCCESS;
#endif
}

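/*
 * ml_io_reset_timeouts:
 *
 * Remove an override previously installed by ml_io_increase_timeouts().
 * Both the base address and the size must match the original
 * registration exactly, otherwise KERN_NOT_FOUND is returned.
 */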
int
ml_io_reset_timeouts(uintptr_t iovaddr_base, unsigned int size)
{
#if ML_IO_TIMEOUTS_ENABLED
	assert(preemption_enabled());

	struct io_timeout_override_entry key = { .iovaddr_base = iovaddr_base };

	boolean_t istate = ml_set_interrupts_enabled(FALSE);
	lck_spin_lock(&io_timeout_override_lock);
	struct io_timeout_override_entry *node = RB_FIND(io_timeout_override, &io_timeout_override_root, &key);
	if (node) {
		if (node->size == size) {
			RB_REMOVE(io_timeout_override, &io_timeout_override_root, node);
		} else {
			node = NULL;
		}
	}
	lck_spin_unlock(&io_timeout_override_lock);
	ml_set_interrupts_enabled(istate);

	if (!node) {
		return KERN_NOT_FOUND;
	}

	kfree_type(struct io_timeout_override_entry, node);
#else /* !ML_IO_TIMEOUTS_ENABLED */
#pragma unused(iovaddr_base, size)
#endif
	return KERN_SUCCESS;
}

#if ML_IO_TIMEOUTS_ENABLED

static bool
override_io_timeouts_va(uintptr_t vaddr, uint64_t *read_timeout, uint64_t *write_timeout)
{
	assert(!ml_get_interrupts_enabled());

	lck_spin_lock(&io_timeout_override_lock);

	/*
	 * RB_FIND() doesn't support custom cmp functions, so we have to
	 * open-code our own.  Read the root only after taking the lock so
	 * the traversal can't start from a stale node.
	 */
	struct io_timeout_override_entry *node = RB_ROOT(&io_timeout_override_root);
	while (node) {
		if (node->iovaddr_base <= vaddr && vaddr < node->iovaddr_base + node->size) {
			if (read_timeout) {
				*read_timeout = node->read_timeout;
			}
			if (write_timeout) {
				*write_timeout = node->write_timeout;
			}
			lck_spin_unlock(&io_timeout_override_lock);
			return true;
		} else if (vaddr < node->iovaddr_base) {
			node = RB_LEFT(node, tree);
		} else {
			node = RB_RIGHT(node, tree);
		}
	}
	lck_spin_unlock(&io_timeout_override_lock);

	return false;
}

static bool
override_io_timeouts_pa(uint64_t paddr, uint64_t *read_timeout, uint64_t *write_timeout)
{
#if defined(__arm64__)
	/*
	 * PCIe regions are marked with PMAP_IO_RANGE_STRONG_SYNC. Apply a
	 * timeout greater than the PCIe completion timeout (50ms). In some
	 * cases those timeouts can stack so make the timeout significantly
	 * higher.
	 */
	#define STRONG_SYNC_TIMEOUT 1800000 /* 75ms */

	pmap_io_range_t *range = pmap_find_io_attr(paddr);
	if (range != NULL && (range->wimg & PMAP_IO_RANGE_STRONG_SYNC) != 0) {
		if (read_timeout) {
			*read_timeout = STRONG_SYNC_TIMEOUT;
		}
		if (write_timeout) {
			*write_timeout = STRONG_SYNC_TIMEOUT;
		}

		return true;
	}
#else
	(void)paddr;
	(void)read_timeout;
	(void)write_timeout;
#endif /* __arm64__ */
	return false;
}

void
override_io_timeouts(uintptr_t vaddr, uint64_t paddr, uint64_t *read_timeout, uint64_t *write_timeout)
{
	if (vaddr != 0 &&
	    override_io_timeouts_va(vaddr, read_timeout, write_timeout)) {
		return;
	}

	if (paddr != 0 &&
	    override_io_timeouts_pa(paddr, read_timeout, write_timeout)) {
		return;
	}
}
#endif /* ML_IO_TIMEOUTS_ENABLED */

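/*
 * ml_io_read:
 *
 * Issue a `size'-byte wide MMIO read of `vaddr'.  When a reporting
 * timeout is configured, interrupts are disabled around the access so
 * that its duration can be measured precisely: slow accesses are
 * reported via DTrace and kdebug, and accesses that exceed the report
 * threshold (after any per-range override from override_io_timeouts())
 * panic when phy_read_panic is set.
 */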
unsigned long long
ml_io_read(uintptr_t vaddr, int size)
{
	unsigned long long result = 0;
	unsigned char s1;
	unsigned short s2;

#ifdef ML_IO_VERIFY_UNCACHEABLE
	uintptr_t const paddr = pmap_verify_noncacheable(vaddr);
#elif defined(ML_IO_TIMEOUTS_ENABLED)
	uintptr_t const paddr = kvtophys(vaddr);
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	uint64_t sabs, eabs;
	boolean_t istate, timeread = FALSE;
	uint64_t report_read_delay;
#if __x86_64__
	report_read_delay = report_phy_read_delay;
#else
	report_read_delay = os_atomic_load(&report_phy_read_delay_to, relaxed);
	uint64_t const trace_phy_read_delay = os_atomic_load(&trace_phy_read_delay_to, relaxed);
#endif /* __x86_64__ */

	if (__improbable(report_read_delay != 0)) {
		istate = ml_set_interrupts_enabled(FALSE);
		sabs = mach_absolute_time();
		timeread = TRUE;
	}

#ifdef ML_IO_SIMULATE_STRETCHED_ENABLED
	if (__improbable(timeread && simulate_stretched_io)) {
		sabs -= simulate_stretched_io;
	}
#endif /* ML_IO_SIMULATE_STRETCHED_ENABLED */
#endif /* ML_IO_TIMEOUTS_ENABLED */

#if DEVELOPMENT || DEBUG
	boolean_t use_fences = !kern_feature_override(KF_IO_TIMEOUT_OVRD);
	if (use_fences) {
		ml_timebase_to_memory_fence();
	}
#endif

	switch (size) {
	case 1:
		s1 = *(volatile unsigned char *)vaddr;
		result = s1;
		break;
	case 2:
		s2 = *(volatile unsigned short *)vaddr;
		result = s2;
		break;
	case 4:
		result = *(volatile unsigned int *)vaddr;
		break;
	case 8:
		result = *(volatile unsigned long long *)vaddr;
		break;
	default:
		panic("Invalid size %d for ml_io_read(%p)", size, (void *)vaddr);
		break;
	}

#if DEVELOPMENT || DEBUG
	if (use_fences) {
		ml_memory_to_timebase_fence();
	}
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	if (__improbable(timeread == TRUE)) {
		eabs = mach_absolute_time();

		/* Prevent the processor from calling iotrace during its
		 * initialization procedure. */
		if (current_processor()->state == PROCESSOR_RUNNING) {
			iotrace(IOTRACE_IO_READ, vaddr, paddr, size, result, sabs, eabs - sabs);
		}

		if (__improbable((eabs - sabs) > report_read_delay)) {
			DTRACE_PHYSLAT5(physioread, uint64_t, (eabs - sabs),
			    uint64_t, vaddr, uint32_t, size, uint64_t, paddr, uint64_t, result);

			uint64_t override = 0;
			override_io_timeouts(vaddr, paddr, &override, NULL);

			if (override != 0) {
#if SCHED_HYGIENE_DEBUG
				/*
				 * The IO timeout was overridden. As interrupts are disabled in
				 * order to accurately measure IO time this can cause the
				 * interrupt masked timeout threshold to be exceeded.  If the
				 * interrupt masked debug mode is set to panic, abandon the
				 * measurement. If in trace mode leave it as-is for
				 * observability.
				 */
				if (interrupt_masked_debug_mode == SCHED_HYGIENE_MODE_PANIC) {
					ml_spin_debug_clear(current_thread());
					ml_irq_debug_abandon();
				}
#endif
				report_read_delay = override;
			}
		}

		if (__improbable((eabs - sabs) > report_read_delay)) {
			if (phy_read_panic && (machine_timeout_suspended() == FALSE)) {
#if defined(__x86_64__)
				panic_notify();
#endif /* defined(__x86_64__) */
				uint64_t nsec = 0;
				absolutetime_to_nanoseconds(eabs - sabs, &nsec);
				panic("Read from IO vaddr 0x%lx paddr 0x%lx took %llu ns, "
				    "result: 0x%llx (start: %llu, end: %llu), ceiling: %llu",
				    vaddr, paddr, nsec, result, sabs, eabs,
				    report_read_delay);
			}
		}

		if (__improbable(trace_phy_read_delay > 0 && (eabs - sabs) > trace_phy_read_delay)) {
			KDBG(MACHDBG_CODE(DBG_MACH_IO, DBC_MACH_IO_MMIO_READ),
			    (eabs - sabs), VM_KERNEL_UNSLIDE_OR_PERM(vaddr), paddr, result);
		}

		(void)ml_set_interrupts_enabled(istate);
	}
#endif /* ML_IO_TIMEOUTS_ENABLED */
	return result;
}

unsigned int
ml_io_read8(uintptr_t vaddr)
{
	return (unsigned) ml_io_read(vaddr, 1);
}

unsigned int
ml_io_read16(uintptr_t vaddr)
{
	return (unsigned) ml_io_read(vaddr, 2);
}

unsigned int
ml_io_read32(uintptr_t vaddr)
{
	return (unsigned) ml_io_read(vaddr, 4);
}

unsigned long long
ml_io_read64(uintptr_t vaddr)
{
	return ml_io_read(vaddr, 8);
}

/* ml_io_write* */

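/*
 * ml_io_write:
 *
 * Issue a `size'-byte wide MMIO write of `val' to `vaddr'.  Mirrors
 * ml_io_read(): the access is timed when a reporting timeout is
 * configured, traced past the trace threshold, and panics past the
 * (possibly overridden) report threshold when phy_write_panic is set.
 */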
void
ml_io_write(uintptr_t vaddr, uint64_t val, int size)
{
#ifdef ML_IO_VERIFY_UNCACHEABLE
	uintptr_t const paddr = pmap_verify_noncacheable(vaddr);
#elif defined(ML_IO_TIMEOUTS_ENABLED)
	uintptr_t const paddr = kvtophys(vaddr);
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	uint64_t sabs, eabs;
	boolean_t istate, timewrite = FALSE;
	uint64_t report_write_delay;
#if __x86_64__
	report_write_delay = report_phy_write_delay;
#else
	report_write_delay = os_atomic_load(&report_phy_write_delay_to, relaxed);
	uint64_t trace_phy_write_delay = os_atomic_load(&trace_phy_write_delay_to, relaxed);
#endif /* !defined(__x86_64__) */
	if (__improbable(report_write_delay != 0)) {
		istate = ml_set_interrupts_enabled(FALSE);
		sabs = mach_absolute_time();
		timewrite = TRUE;
	}

#ifdef ML_IO_SIMULATE_STRETCHED_ENABLED
	if (__improbable(timewrite && simulate_stretched_io)) {
		sabs -= simulate_stretched_io;
	}
#endif /* ML_IO_SIMULATE_STRETCHED_ENABLED */
#endif /* ML_IO_TIMEOUTS_ENABLED */

#if DEVELOPMENT || DEBUG
	boolean_t use_fences = !kern_feature_override(KF_IO_TIMEOUT_OVRD);
	if (use_fences) {
		ml_timebase_to_memory_fence();
	}
#endif

	switch (size) {
	case 1:
		*(volatile uint8_t *)vaddr = (uint8_t)val;
		break;
	case 2:
		*(volatile uint16_t *)vaddr = (uint16_t)val;
		break;
	case 4:
		*(volatile uint32_t *)vaddr = (uint32_t)val;
		break;
	case 8:
		*(volatile uint64_t *)vaddr = (uint64_t)val;
		break;
	default:
		panic("Invalid size %d for ml_io_write(%p, 0x%llx)", size, (void *)vaddr, val);
		break;
	}

#if DEVELOPMENT || DEBUG
	if (use_fences) {
		ml_memory_to_timebase_fence();
	}
#endif

#ifdef ML_IO_TIMEOUTS_ENABLED
	if (__improbable(timewrite == TRUE)) {
		eabs = mach_absolute_time();

		/* Prevent the processor from calling iotrace during its
		 * initialization procedure. */
		if (current_processor()->state == PROCESSOR_RUNNING) {
			iotrace(IOTRACE_IO_WRITE, vaddr, paddr, size, val, sabs, eabs - sabs);
		}

		if (__improbable((eabs - sabs) > report_write_delay)) {
			DTRACE_PHYSLAT5(physiowrite, uint64_t, (eabs - sabs),
			    uint64_t, vaddr, uint32_t, size, uint64_t, paddr, uint64_t, val);

			uint64_t override = 0;
			override_io_timeouts(vaddr, paddr, NULL, &override);

			if (override != 0) {
#if SCHED_HYGIENE_DEBUG
				/*
				 * The IO timeout was overridden. As interrupts are disabled in
				 * order to accurately measure IO time this can cause the
				 * interrupt masked timeout threshold to be exceeded.  If the
				 * interrupt masked debug mode is set to panic, abandon the
				 * measurement. If in trace mode leave it as-is for
				 * observability.
				 */
				if (interrupt_masked_debug_mode == SCHED_HYGIENE_MODE_PANIC) {
					ml_spin_debug_clear(current_thread());
					ml_irq_debug_abandon();
				}
#endif
				report_write_delay = override;
			}
		}

		if (__improbable((eabs - sabs) > report_write_delay)) {
			if (phy_write_panic && (machine_timeout_suspended() == FALSE)) {
#if defined(__x86_64__)
				panic_notify();
#endif /* defined(__x86_64__) */

				uint64_t nsec = 0;
				absolutetime_to_nanoseconds(eabs - sabs, &nsec);
				panic("Write to IO vaddr %p paddr %p val 0x%llx took %llu ns,"
				    " (start: %llu, end: %llu), ceiling: %llu",
				    (void *)vaddr, (void *)paddr, val, nsec, sabs, eabs,
				    report_write_delay);
			}
		}

		if (__improbable(trace_phy_write_delay > 0 && (eabs - sabs) > trace_phy_write_delay)) {
			KDBG(MACHDBG_CODE(DBG_MACH_IO, DBC_MACH_IO_MMIO_WRITE),
			    (eabs - sabs), VM_KERNEL_UNSLIDE_OR_PERM(vaddr), paddr, val);
		}

		(void)ml_set_interrupts_enabled(istate);
	}
#endif /* ML_IO_TIMEOUTS_ENABLED */
}

void
ml_io_write8(uintptr_t vaddr, uint8_t val)
{
	ml_io_write(vaddr, val, 1);
}

void
ml_io_write16(uintptr_t vaddr, uint16_t val)
{
	ml_io_write(vaddr, val, 2);
}

void
ml_io_write32(uintptr_t vaddr, uint32_t val)
{
	ml_io_write(vaddr, val, 4);
}

void
ml_io_write64(uintptr_t vaddr, uint64_t val)
{
	ml_io_write(vaddr, val, 8);
}

struct cpu_callback_chain_elem {
	cpu_callback_t                  fn;
	void                            *param;
	struct cpu_callback_chain_elem  *next;
};

static struct cpu_callback_chain_elem *cpu_callback_chain;
static LCK_GRP_DECLARE(cpu_callback_chain_lock_grp, "cpu_callback_chain");
static LCK_SPIN_DECLARE(cpu_callback_chain_lock, &cpu_callback_chain_lock_grp);

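/*
 * cpu_event_register_callback:
 *
 * Prepend a callback to the CPU-event notification chain.  Readers
 * (ml_broadcast_cpu_event()) traverse the chain without taking the
 * lock, so new elements are published with a release store and are
 * never unlinked or freed -- which is also why
 * cpu_event_unregister_callback() panics below.
 *
 * A hypothetical registration, for illustration only:
 *
 *	static void
 *	my_cpu_event_cb(void *param, enum cpu_event event, unsigned int cpu)
 *	{
 *		if (event == CPU_EXITED) {
 *			// react to `cpu' going offline
 *		}
 *	}
 *	...
 *	cpu_event_register_callback(my_cpu_event_cb, NULL);
 */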
void
cpu_event_register_callback(cpu_callback_t fn, void *param)
{
	struct cpu_callback_chain_elem *new_elem;

	new_elem = zalloc_permanent_type(struct cpu_callback_chain_elem);
	if (!new_elem) {
		panic("can't allocate cpu_callback_chain_elem");
	}

	lck_spin_lock(&cpu_callback_chain_lock);
	new_elem->next = cpu_callback_chain;
	new_elem->fn = fn;
	new_elem->param = param;
	os_atomic_store(&cpu_callback_chain, new_elem, release);
	lck_spin_unlock(&cpu_callback_chain_lock);
}

__attribute__((noreturn))
void
cpu_event_unregister_callback(__unused cpu_callback_t fn)
{
	panic("Unfortunately, cpu_event_unregister_callback is unimplemented.");
}

void
ml_broadcast_cpu_event(enum cpu_event event, unsigned int cpu_or_cluster)
{
	struct cpu_callback_chain_elem *cursor;

	cursor = os_atomic_load(&cpu_callback_chain, dependency);
	for (; cursor != NULL; cursor = cursor->next) {
		cursor->fn(cursor->param, event, cpu_or_cluster);
	}
}

// Initialize Machine Timeouts (see the MACHINE_TIMEOUT macro
// definition)

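// The effective value of a timeout named <name> is resolved in
// increasing order of precedence from:
//   /machine-timeouts/<name><suffix>          (device tree)
//   /chosen/machine-timeouts/<name><suffix>   (device tree)
//   ml-timeout-<name><suffix>                 (boot-arg)
// and is then multiplied by the analogous "-scale" entries plus any
// "global-scale" value before being stored through spec->ptr.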
void
machine_timeout_init_with_suffix(const struct machine_timeout_spec *spec, char const *suffix)
{
	if (spec->skip_predicate != NULL && spec->skip_predicate(spec)) {
		// This timeout should be disabled.
		os_atomic_store_wide((uint64_t*)spec->ptr, 0, relaxed);
		return;
	}

	assert(suffix != NULL);
	assert(strlen(spec->name) <= MACHINE_TIMEOUT_MAX_NAME_LEN);

	size_t const suffix_len = strlen(suffix);

	size_t const dt_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + suffix_len + 1;
	char dt_name[dt_name_size];

	strlcpy(dt_name, spec->name, dt_name_size);
	strlcat(dt_name, suffix, dt_name_size);

	size_t const scale_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + suffix_len + strlen("-scale") + 1;
	char scale_name[scale_name_size];

	strlcpy(scale_name, spec->name, scale_name_size);
	strlcat(scale_name, suffix, scale_name_size);
	strlcat(scale_name, "-scale", scale_name_size);

	size_t const boot_arg_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN + strlen("ml-timeout-") + suffix_len + 1;
	char boot_arg_name[boot_arg_name_size];

	strlcpy(boot_arg_name, "ml-timeout-", boot_arg_name_size);
	strlcat(boot_arg_name, spec->name, boot_arg_name_size);
	strlcat(boot_arg_name, suffix, boot_arg_name_size);

	size_t const boot_arg_scale_name_size = MACHINE_TIMEOUT_MAX_NAME_LEN +
	    strlen("ml-timeout-") + strlen("-scale") + suffix_len + 1;
	char boot_arg_scale_name[boot_arg_scale_name_size];

	strlcpy(boot_arg_scale_name, "ml-timeout-", boot_arg_scale_name_size);
	strlcat(boot_arg_scale_name, spec->name, boot_arg_scale_name_size);
	strlcat(boot_arg_scale_name, suffix, boot_arg_scale_name_size);
	strlcat(boot_arg_scale_name, "-scale", boot_arg_scale_name_size);

	/*
	 * Determine base value from DT and boot-args.
	 */

	DTEntry base, chosen;

	if (SecureDTLookupEntry(NULL, "/machine-timeouts", &base) != kSuccess) {
		base = NULL;
	}

	if (SecureDTLookupEntry(NULL, "/chosen/machine-timeouts", &chosen) != kSuccess) {
		chosen = NULL;
	}

	uint64_t timeout = spec->default_value;
	bool found = false;

	uint64_t const *data = NULL;
	unsigned int data_size = sizeof(*data);

	/* First look in /machine-timeouts/<name> */
	if (base != NULL && SecureDTGetProperty(base, dt_name, (const void **)&data, &data_size) == kSuccess) {
		if (data_size != sizeof(*data)) {
			panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/%s", __func__, data_size, dt_name);
		}

		timeout = *data;
		found = true;
	}

	/* A value in /chosen/machine-timeouts/<name> overrides */
	if (chosen != NULL && SecureDTGetProperty(chosen, dt_name, (const void **)&data, &data_size) == kSuccess) {
		if (data_size != sizeof(*data)) {
			panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/%s", __func__, data_size, dt_name);
		}

		timeout = *data;
		found = true;
	}

	/* A boot-arg ml-timeout-<name> overrides */
	uint64_t boot_arg = 0;

	if (PE_parse_boot_argn(boot_arg_name, &boot_arg, sizeof(boot_arg))) {
		timeout = boot_arg;
		found = true;
	}

	/*
	 * Determine scale value from DT and boot-args.
	 */

	uint64_t scale = 1;
	uint32_t const *scale_data;
	unsigned int scale_size = sizeof(scale_data);

	/* If there is a scale factor /machine-timeouts/<name>-scale, apply it. */
	if (base != NULL && SecureDTGetProperty(base, scale_name, (const void **)&scale_data, &scale_size) == kSuccess) {
		if (scale_size != sizeof(*scale_data)) {
			panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/%s-scale", __func__, scale_size, dt_name);
		}

		scale = *scale_data;
	}

	/* If there is a scale factor /chosen/machine-timeouts/<name>-scale, use that. */
	if (chosen != NULL && SecureDTGetProperty(chosen, scale_name, (const void **)&scale_data, &scale_size) == kSuccess) {
		if (scale_size != sizeof(*scale_data)) {
			panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/%s-scale", __func__,
			    scale_size, dt_name);
		}

		scale = *scale_data;
	}

	/* Finally, a boot-arg ml-timeout-<name>-scale takes precedence. */
	if (PE_parse_boot_argn(boot_arg_scale_name, &boot_arg, sizeof(boot_arg))) {
		scale = boot_arg;
	}

	static bool global_scale_set;
	static uint64_t global_scale;

	if (!global_scale_set) {
		/* Apply /machine-timeouts/global-scale if present */
		if (SecureDTGetProperty(base, "global-scale", (const void **)&scale_data, &scale_size) == kSuccess) {
			if (scale_size != sizeof(*scale_data)) {
				panic("%s: unexpected machine timeout data_size %u for /machine-timeouts/global-scale", __func__,
				    scale_size);
			}

			global_scale = *scale_data;
			global_scale_set = true;
		}

		/* Use /chosen/machine-timeouts/global-scale if present */
		if (SecureDTGetProperty(chosen, "global-scale", (const void **)&scale_data, &scale_size) == kSuccess) {
			if (scale_size != sizeof(*scale_data)) {
				panic("%s: unexpected machine timeout data_size %u for /chosen/machine-timeouts/global-scale", __func__,
				    scale_size);
			}

			global_scale = *scale_data;
			global_scale_set = true;
		}

		/* Finally, the boot-arg ml-timeout-global-scale takes precedence. */
		if (PE_parse_boot_argn("ml-timeout-global-scale", &boot_arg, sizeof(boot_arg))) {
			global_scale = boot_arg;
			global_scale_set = true;
		}
	}

	if (global_scale_set) {
		scale *= global_scale;
	}

	/* Compute the final timeout, and done. */
	if (found && timeout > 0) {
		/* Only apply inherent unit scale if the value came in
		 * externally. */

		if (spec->unit_scale == MACHINE_TIMEOUT_UNIT_TIMEBASE) {
			uint64_t nanoseconds = timeout / 1000;
			nanoseconds_to_absolutetime(nanoseconds, &timeout);
		} else {
			timeout /= spec->unit_scale;
		}

		if (timeout == 0) {
			/* Ensure unit scaling did not disable the timeout. */
			timeout = 1;
		}
	}

	if (os_mul_overflow(timeout, scale, &timeout)) {
		timeout = UINT64_MAX; // clamp
	}

	os_atomic_store_wide((uint64_t*)spec->ptr, timeout, relaxed);
}

void
machine_timeout_init(const struct machine_timeout_spec *spec)
{
	machine_timeout_init_with_suffix(spec, "");
}

#if DEVELOPMENT || DEBUG
/*
 * Late timeout (re-)initialization, at the end of bsd_init()
 */
void
machine_timeout_bsd_init(void)
{
	char const * const __unused mt_suffix = "-b";
#if SCHED_HYGIENE_DEBUG
	machine_timeout_init_with_suffix(MACHINE_TIMEOUT_SPEC_REF(interrupt_masked_timeout), mt_suffix);
	machine_timeout_init_with_suffix(MACHINE_TIMEOUT_SPEC_REF(sched_preemption_disable_threshold_mt), mt_suffix);

	/*
	 * The io timeouts can inherit from interrupt_masked_timeout.
	 * Re-initialize, as interrupt_masked_timeout may have changed.
	 */
	ml_io_init_timeouts();

	PERCPU_DECL(uint64_t _Atomic, preemption_disable_max_mt);

	/*
	 * Reset the preemption disable stats, so that they are not
	 * polluted by long early boot code.
	 */
	percpu_foreach(max_stat, preemption_disable_max_mt) {
		os_atomic_store(max_stat, 0, relaxed);

		/*
		 * No additional synchronization needed.  The time when we
		 * switch to late boot timeouts is relatively arbitrary
		 * anyway: By now we don't expect any long preemption
		 * disabling anymore. While that is still a clear delineation
		 * for the boot CPU, other CPUs can be in the middle of doing
		 * whatever. So if the missing synchronization causes a new
		 * maximum to be missed on a secondary CPU, it could just as
		 * well have been missed by racing with this function.
		 */
	}

#endif
}
#endif /* DEVELOPMENT || DEBUG */

#if ML_IO_TIMEOUTS_ENABLED && CONFIG_XNUPOST
#include <tests/xnupost.h>

extern kern_return_t ml_io_timeout_test(void);

static inline void
ml_io_timeout_test_get_timeouts(uintptr_t vaddr, uint64_t *read_timeout, uint64_t *write_timeout)
{
	*read_timeout = 0;
	*write_timeout = 0;

	boolean_t istate = ml_set_interrupts_enabled(FALSE);
	override_io_timeouts(vaddr, 0, read_timeout, write_timeout);
	ml_set_interrupts_enabled(istate);
}

kern_return_t
ml_io_timeout_test(void)
{
	const size_t SIZE = 16;
	uintptr_t iovaddr_base1 = (uintptr_t)&ml_io_timeout_test;
	uintptr_t iovaddr_base2 = iovaddr_base1 + SIZE;
	uintptr_t vaddr1 = iovaddr_base1 + SIZE / 2;
	uintptr_t vaddr2 = iovaddr_base2 + SIZE / 2;

	const uint64_t READ_TIMEOUT1_US = 50000, WRITE_TIMEOUT1_US = 50001;
	const uint64_t READ_TIMEOUT2_US = 50002, WRITE_TIMEOUT2_US = 50003;
	uint64_t read_timeout1_abs, write_timeout1_abs;
	uint64_t read_timeout2_abs, write_timeout2_abs;
	nanoseconds_to_absolutetime(NSEC_PER_USEC * READ_TIMEOUT1_US, &read_timeout1_abs);
	nanoseconds_to_absolutetime(NSEC_PER_USEC * WRITE_TIMEOUT1_US, &write_timeout1_abs);
	nanoseconds_to_absolutetime(NSEC_PER_USEC * READ_TIMEOUT2_US, &read_timeout2_abs);
	nanoseconds_to_absolutetime(NSEC_PER_USEC * WRITE_TIMEOUT2_US, &write_timeout2_abs);

	int err = ml_io_increase_timeouts(iovaddr_base1, 0, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
	T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for empty region");

	err = ml_io_increase_timeouts(iovaddr_base1, 4097, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
	T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for region > 4096 bytes");

	err = ml_io_increase_timeouts(UINTPTR_MAX, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
	T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for overflowed region");

	err = ml_io_increase_timeouts(iovaddr_base1, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
	T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for first VA region should succeed");

	err = ml_io_increase_timeouts(iovaddr_base2, SIZE, READ_TIMEOUT2_US, WRITE_TIMEOUT2_US);
	T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Setting timeout for second VA region should succeed");

	err = ml_io_increase_timeouts(iovaddr_base1, SIZE, READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
	T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for same region twice");

	err = ml_io_increase_timeouts(vaddr1, (uint32_t)(vaddr2 - vaddr1), READ_TIMEOUT1_US, WRITE_TIMEOUT1_US);
	T_EXPECT_EQ_INT(err, KERN_INVALID_ARGUMENT, "Can't set timeout for overlapping regions");

	uint64_t read_timeout, write_timeout;
	ml_io_timeout_test_get_timeouts(vaddr1, &read_timeout, &write_timeout);
	T_EXPECT_EQ_ULLONG(read_timeout, read_timeout1_abs, "Read timeout for first region");
	T_EXPECT_EQ_ULLONG(write_timeout, write_timeout1_abs, "Write timeout for first region");

	ml_io_timeout_test_get_timeouts(vaddr2, &read_timeout, &write_timeout);
	T_EXPECT_EQ_ULLONG(read_timeout, read_timeout2_abs, "Read timeout for second region");
	T_EXPECT_EQ_ULLONG(write_timeout, write_timeout2_abs, "Write timeout for second region");

	ml_io_timeout_test_get_timeouts(iovaddr_base2 + SIZE, &read_timeout, &write_timeout);
	T_EXPECT_EQ_ULLONG(read_timeout, 0, "Read timeout without override");
	T_EXPECT_EQ_ULLONG(write_timeout, 0, "Write timeout without override");

	err = ml_io_reset_timeouts(iovaddr_base1 + 1, SIZE - 1);
	T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for subregion");

	err = ml_io_reset_timeouts(iovaddr_base2 + SIZE, SIZE);
	T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for non-existent region");

	err = ml_io_reset_timeouts(iovaddr_base1, SIZE);
	T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for first VA region should succeed");

	ml_io_timeout_test_get_timeouts(vaddr1, &read_timeout, &write_timeout);
	T_EXPECT_EQ_ULLONG(read_timeout, 0, "Read timeout for reset region");
	T_EXPECT_EQ_ULLONG(write_timeout, 0, "Write timeout for reset region");

	err = ml_io_reset_timeouts(iovaddr_base1, SIZE);
	T_EXPECT_EQ_INT(err, KERN_NOT_FOUND, "Can't reset timeout for same region twice");

	err = ml_io_reset_timeouts(iovaddr_base2, SIZE);
	T_EXPECT_EQ_INT(err, KERN_SUCCESS, "Resetting timeout for second VA region should succeed");

	return KERN_SUCCESS;
}
#endif /* ML_IO_TIMEOUTS_ENABLED && CONFIG_XNUPOST */