xref: /xnu-11215.61.5/osfmk/i386/mp.c (revision 4f1223e81cd707a65cc109d0b8ad6653699da3c4)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 
32 #include <mach_kdp.h>
33 #include <kdp/kdp_internal.h>
34 #include <mach_ldebug.h>
35 
36 #include <mach/mach_types.h>
37 #include <mach/kern_return.h>
38 
39 #include <kern/kern_types.h>
40 #include <kern/startup.h>
41 #include <kern/timer_queue.h>
42 #include <kern/processor.h>
43 #include <kern/cpu_number.h>
44 #include <kern/cpu_data.h>
45 #include <kern/assert.h>
46 #include <kern/lock_group.h>
47 #include <kern/machine.h>
48 #include <kern/pms.h>
49 #include <kern/misc_protos.h>
50 #include <kern/timer_call.h>
51 #include <kern/zalloc.h>
52 #include <kern/queue.h>
53 #include <kern/monotonic.h>
54 #include <kern/kern_stackshot.h>
55 #include <prng/random.h>
56 
57 #include <vm/vm_map.h>
58 #include <vm/vm_kern.h>
59 
60 #include <i386/bit_routines.h>
61 #include <i386/proc_reg.h>
62 #include <i386/cpu_threads.h>
63 #include <i386/mp_desc.h>
64 #include <i386/misc_protos.h>
65 #include <i386/trap_internal.h>
66 #include <i386/postcode.h>
67 #include <i386/machine_routines.h>
68 #include <i386/mp.h>
69 #include <i386/mp_events.h>
70 #include <i386/lapic.h>
71 #include <i386/cpuid.h>
72 #include <i386/fpu.h>
73 #include <i386/machine_cpu.h>
74 #include <i386/pmCPU.h>
75 #if CONFIG_MCA
76 #include <i386/machine_check.h>
77 #endif
78 #include <i386/acpi.h>
79 
80 #include <sys/kdebug.h>
81 
82 #include <console/serial_protos.h>
83 
84 #if KPERF
85 #include <kperf/kptimer.h>
86 #endif /* KPERF */
87 
88 #if     MP_DEBUG
89 #define PAUSE           delay(1000000)
90 #define DBG(x...)       kprintf(x)
91 #else
92 #define DBG(x...)
93 #define PAUSE
94 #endif  /* MP_DEBUG */
95 
96 /* Debugging/test trace events: */
97 #define TRACE_MP_TLB_FLUSH              MACHDBG_CODE(DBG_MACH_MP, 0)
98 #define TRACE_MP_CPUS_CALL              MACHDBG_CODE(DBG_MACH_MP, 1)
99 #define TRACE_MP_CPUS_CALL_LOCAL        MACHDBG_CODE(DBG_MACH_MP, 2)
100 #define TRACE_MP_CPUS_CALL_ACTION       MACHDBG_CODE(DBG_MACH_MP, 3)
101 #define TRACE_MP_CPUS_CALL_NOBUF        MACHDBG_CODE(DBG_MACH_MP, 4)
102 #define TRACE_MP_CPU_FAST_START         MACHDBG_CODE(DBG_MACH_MP, 5)
103 #define TRACE_MP_CPU_START              MACHDBG_CODE(DBG_MACH_MP, 6)
104 #define TRACE_MP_CPU_DEACTIVATE         MACHDBG_CODE(DBG_MACH_MP, 7)
105 
106 #define ABS(v)          (((v) > 0)?(v):-(v))
107 
108 void            slave_boot_init(void);
109 void            i386_cpu_IPI(int cpu);
110 
111 #if MACH_KDP
112 static void     mp_kdp_wait(boolean_t flush, boolean_t isNMI);
113 #endif /* MACH_KDP */
114 
115 #if MACH_KDP
116 static boolean_t        cpu_signal_pending(int cpu, mp_event_t event);
117 #endif /* MACH_KDP */
118 static int              NMIInterruptHandler(x86_saved_state_t *regs);
119 
120 boolean_t               smp_initialized = FALSE;
121 uint32_t                TSC_sync_margin = 0xFFF;
122 volatile boolean_t      force_immediate_debugger_NMI = FALSE;
123 volatile boolean_t      pmap_tlb_flush_timeout = FALSE;
124 #if DEBUG || DEVELOPMENT
125 boolean_t               mp_interrupt_watchdog_enabled = TRUE;
126 uint32_t                mp_interrupt_watchdog_events = 0;
127 #endif
128 
129 SIMPLE_LOCK_DECLARE(debugger_callback_lock, 0);
130 struct debugger_callback *debugger_callback = NULL;
131 
132 static LCK_GRP_DECLARE(smp_lck_grp, "i386_smp");
133 static LCK_MTX_DECLARE(mp_cpu_boot_lock, &smp_lck_grp);
134 
135 /* Variables needed for MP rendezvous. */
136 SIMPLE_LOCK_DECLARE(mp_rv_lock, 0);
137 static void     (*mp_rv_setup_func)(void *arg);
138 static void     (*mp_rv_action_func)(void *arg);
139 static void     (*mp_rv_teardown_func)(void *arg);
140 static void     *mp_rv_func_arg;
141 static volatile int     mp_rv_ncpus;
142 /* Cache-aligned barriers: */
143 static volatile long    mp_rv_entry    __attribute__((aligned(64)));
144 static volatile long    mp_rv_exit     __attribute__((aligned(64)));
145 static volatile long    mp_rv_complete __attribute__((aligned(64)));
146 
147 volatile        uint64_t        debugger_entry_time;
148 volatile        uint64_t        debugger_exit_time;
149 #if MACH_KDP
150 #include <kdp/kdp.h>
151 extern int kdp_snapshot;
152 static struct _kdp_xcpu_call_func {
153 	kdp_x86_xcpu_func_t func;
154 	void     *arg0, *arg1;
155 	volatile long     ret;
156 	volatile uint16_t cpu;
157 } kdp_xcpu_call_func = {
158 	.cpu  = KDP_XCPU_NONE
159 };
160 
161 #endif
162 
163 /* Variables needed for MP broadcast. */
164 static void        (*mp_bc_action_func)(void *arg);
165 static void        *mp_bc_func_arg;
166 static int      mp_bc_ncpus;
167 static volatile long   mp_bc_count;
168 static LCK_MTX_DECLARE(mp_bc_lock, &smp_lck_grp);
169 static  volatile int    debugger_cpu = -1;
170 volatile long    NMIPI_acks = 0;
171 volatile long    NMI_count = 0;
172 static int              vector_timed_out;
173 
174 NMI_reason_t    NMI_panic_reason = NONE;
175 extern void     NMI_cpus(void);
176 
177 static void     mp_cpus_call_init(void);
178 static void     mp_cpus_call_action(void);
179 static void     mp_call_PM(void);
180 
181 char            mp_slave_stack[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); // Temp stack for slave init
182 
183 /* PAL-related routines */
184 boolean_t i386_smp_init(int nmi_vector, i386_intr_func_t nmi_handler,
185     int ipi_vector, i386_intr_func_t ipi_handler);
186 void i386_start_cpu(int lapic_id, int cpu_num);
187 void i386_send_NMI(int cpu);
188 void NMIPI_enable(boolean_t);
189 
190 #define NUM_CPU_WARM_CALLS      20
191 struct timer_call       cpu_warm_call_arr[NUM_CPU_WARM_CALLS];
192 queue_head_t            cpu_warm_call_list;
193 decl_simple_lock_data(static, cpu_warm_lock);
194 
195 typedef struct cpu_warm_data {
196 	timer_call_t    cwd_call;
197 	uint64_t        cwd_deadline;
198 	int             cwd_result;
199 } *cpu_warm_data_t;
200 
201 static void             cpu_prewarm_init(void);
202 static void             cpu_warm_timer_call_func(timer_call_param_t p0, timer_call_param_t p1);
203 static void             _cpu_warm_setup(void *arg);
204 static timer_call_t     grab_warm_timer_call(void);
205 static void             free_warm_timer_call(timer_call_t call);
206 
207 void
smp_init(void)208 smp_init(void)
209 {
210 	console_init();
211 
212 	if (!i386_smp_init(LAPIC_NMI_INTERRUPT, NMIInterruptHandler,
213 	    LAPIC_VECTOR(INTERPROCESSOR), cpu_signal_handler)) {
214 		return;
215 	}
216 
217 	cpu_thread_init();
218 
219 	DBGLOG_CPU_INIT(master_cpu);
220 
221 	mp_cpus_call_init();
222 	mp_cpus_call_cpu_init(master_cpu);
223 
224 #if DEBUG || DEVELOPMENT
225 	if (PE_parse_boot_argn("interrupt_watchdog",
226 	    &mp_interrupt_watchdog_enabled,
227 	    sizeof(mp_interrupt_watchdog_enabled))) {
228 		kprintf("Interrupt watchdog %sabled\n",
229 		    mp_interrupt_watchdog_enabled ? "en" : "dis");
230 	}
231 #endif
232 
233 	if (PE_parse_boot_argn("TSC_sync_margin",
234 	    &TSC_sync_margin, sizeof(TSC_sync_margin))) {
235 		kprintf("TSC sync Margin 0x%x\n", TSC_sync_margin);
236 	} else if (cpuid_vmm_present()) {
237 		kprintf("TSC sync margin disabled\n");
238 		TSC_sync_margin = 0;
239 	}
240 	smp_initialized = TRUE;
241 
242 	cpu_prewarm_init();
243 
244 	return;
245 }
246 
247 typedef struct {
248 	int                     target_cpu;
249 	int                     target_lapic;
250 	int                     starter_cpu;
251 } processor_start_info_t;
252 static processor_start_info_t   start_info        __attribute__((aligned(64)));
253 
254 /*
255  * Cache-alignment is to avoid cross-cpu false-sharing interference.
256  */
257 static volatile long            tsc_entry_barrier __attribute__((aligned(64)));
258 static volatile long            tsc_exit_barrier  __attribute__((aligned(64)));
259 static volatile uint64_t        tsc_target        __attribute__((aligned(64)));
260 
261 /*
262  * Poll a CPU to see when it has marked itself as running.
263  */
264 static void
mp_wait_for_cpu_up(int slot_num,unsigned int iters,unsigned int usecdelay)265 mp_wait_for_cpu_up(int slot_num, unsigned int iters, unsigned int usecdelay)
266 {
267 	while (iters-- > 0) {
268 		if (cpu_datap(slot_num)->cpu_running) {
269 			break;
270 		}
271 		delay(usecdelay);
272 	}
273 }
274 
275 /*
276  * Quickly bring a CPU back online which has been halted.
277  */
278 kern_return_t
intel_startCPU_fast(int slot_num)279 intel_startCPU_fast(int slot_num)
280 {
281 	kern_return_t   rc;
282 
283 	/*
284 	 * Try to perform a fast restart
285 	 */
286 	rc = pmCPUExitHalt(slot_num);
287 	if (rc != KERN_SUCCESS) {
288 		/*
289 		 * The CPU was not eligible for a fast restart.
290 		 */
291 		return rc;
292 	}
293 
294 	KERNEL_DEBUG_CONSTANT(
295 		TRACE_MP_CPU_FAST_START | DBG_FUNC_START,
296 		slot_num, 0, 0, 0, 0);
297 
298 	/*
299 	 * Wait until the CPU is back online.
300 	 */
301 	mp_disable_preemption();
302 
303 	/*
304 	 * We use short pauses (1us) for low latency.  30,000 iterations is
305 	 * longer than a full restart would require so it should be more
306 	 * than long enough.
307 	 */
308 
309 	mp_wait_for_cpu_up(slot_num, 30000, 1);
310 	mp_enable_preemption();
311 
312 	KERNEL_DEBUG_CONSTANT(
313 		TRACE_MP_CPU_FAST_START | DBG_FUNC_END,
314 		slot_num, cpu_datap(slot_num)->cpu_running, 0, 0, 0);
315 
316 	/*
317 	 * Check to make sure that the CPU is really running.  If not,
318 	 * go through the slow path.
319 	 */
320 	if (cpu_datap(slot_num)->cpu_running) {
321 		return KERN_SUCCESS;
322 	} else {
323 		return KERN_FAILURE;
324 	}
325 }
326 
/*
 * Runs on a newly-started (target) cpu once cpu_running is TRUE.
 * If TSC cross-checking is enabled, performs the target side of the
 * two-barrier handshake with the starter cpu in start_cpu():
 * both decrement tsc_entry_barrier (initialized to 2 by
 * intel_startCPU) and spin until it reaches zero, so the subsequent
 * rdtsc64() reads on the two cpus happen as close together as
 * possible; the target then publishes its TSC in tsc_target and
 * releases the starter via tsc_exit_barrier.
 */
static void
started_cpu(void)
{
	/* Here on the started cpu with cpu_running set TRUE */

	if (TSC_sync_margin &&
	    start_info.target_cpu == cpu_number()) {
		/*
		 * I've just started-up, synchronize again with the starter cpu
		 * and then snap my TSC.
		 */
		tsc_target   = 0;       /* cleared before the barrier so the starter never sees stale data */
		atomic_decl(&tsc_entry_barrier, 1);
		while (tsc_entry_barrier != 0) {
			;       /* spin for starter and target at barrier */
		}
		tsc_target = rdtsc64();
		atomic_decl(&tsc_exit_barrier, 1);      /* tell the starter tsc_target is valid */
	}
}
347 
/*
 * Rendezvous action used by intel_startCPU(): executed on all running
 * cpus, but only the designated starter cpu does any work.  It sends
 * the INIT/SIPI startup sequence to the target's local APIC, waits for
 * the target to mark itself running, and (if TSC_sync_margin is
 * non-zero) performs the starter side of the TSC cross-check handshake
 * with started_cpu() on the target.
 */
static void
start_cpu(void *arg)
{
	int                     i = 1000;
	processor_start_info_t  *psip = (processor_start_info_t *) arg;

	/* Ignore this if the current processor is not the starter */
	if (cpu_number() != psip->starter_cpu) {
		return;
	}

	DBG("start_cpu(%p) about to start cpu %d, lapic %d\n",
	    arg, psip->target_cpu, psip->target_lapic);

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_START | DBG_FUNC_START,
		psip->target_cpu,
		psip->target_lapic, 0, 0, 0);

	i386_start_cpu(psip->target_lapic, psip->target_cpu);

#ifdef  POSTCODE_DELAY
	/* Wait much longer if postcodes are displayed for a delay period. */
	i *= 10000;
#endif
	DBG("start_cpu(%p) about to wait for cpu %d\n",
	    arg, psip->target_cpu);

	/* Poll (100us per iteration) until the target reports in. */
	mp_wait_for_cpu_up(psip->target_cpu, i * 100, 100);

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_START | DBG_FUNC_END,
		psip->target_cpu,
		cpu_datap(psip->target_cpu)->cpu_running, 0, 0, 0);

	if (TSC_sync_margin &&
	    cpu_datap(psip->target_cpu)->cpu_running) {
		/*
		 * Compare the TSC from the started processor with ours.
		 * Report and log/panic if it diverges by more than
		 * TSC_sync_margin (TSC_SYNC_MARGIN) ticks. This margin
		 * can be overridden by boot-arg (with 0 meaning no checking).
		 */
		uint64_t        tsc_starter;
		int64_t         tsc_delta;
		atomic_decl(&tsc_entry_barrier, 1);
		while (tsc_entry_barrier != 0) {
			;       /* spin for both processors at barrier */
		}
		tsc_starter = rdtsc64();
		atomic_decl(&tsc_exit_barrier, 1);
		while (tsc_exit_barrier != 0) {
			;       /* spin for target to store its TSC */
		}
		tsc_delta = tsc_target - tsc_starter;
		kprintf("TSC sync for cpu %d: 0x%016llx delta 0x%llx (%lld)\n",
		    psip->target_cpu, tsc_target, tsc_delta, tsc_delta);
#if DEBUG || DEVELOPMENT
		/*
		 * Stash the delta for inspection later, since we can no
		 * longer print/log it with interrupts disabled.
		 */
		cpu_datap(psip->target_cpu)->tsc_sync_delta = tsc_delta;
#endif
		if (ABS(tsc_delta) > (int64_t) TSC_sync_margin) {
			/* Fatal on DEBUG kernels, advisory otherwise. */
#if DEBUG
			panic(
#else
			kprintf(
#endif
				"Unsynchronized  TSC for cpu %d: "
				"0x%016llx, delta 0x%llx\n",
				psip->target_cpu, tsc_target, tsc_delta);
		}
	}
}
424 
/*
 * Slow-path startup of a secondary processor: (re)initialize its
 * descriptor tables, then perform the APIC startup sequence under an
 * all-cpu rendezvous with interrupts disabled (required while the
 * cache-disable bit may be set for MTRR/PAT programming).
 *
 * Serialized by mp_cpu_boot_lock since the slave boot stack is shared.
 * Returns KERN_SUCCESS; if the target fails to come up the machine is
 * halted rather than returning an error.
 */
kern_return_t
intel_startCPU(
	int     slot_num)
{
	int             lapic = cpu_to_lapic[slot_num];
	boolean_t       istate;

	assert(lapic != -1);

	DBGLOG_CPU_INIT(slot_num);

	DBG("intel_startCPU(%d) lapic_id=%d\n", slot_num, lapic);
	DBG("IdlePTD(%p): 0x%x\n", &IdlePTD, (int) (uintptr_t)IdlePTD);

	/*
	 * Initialize (or re-initialize) the descriptor tables for this cpu.
	 * Propagate processor mode to slave.
	 */
	cpu_desc_init(cpu_datap(slot_num));

	/* Serialize use of the slave boot stack, etc. */
	lck_mtx_lock(&mp_cpu_boot_lock);

	istate = ml_set_interrupts_enabled(FALSE);
	if (slot_num == get_cpu_number()) {
		/* Asked to start ourselves: nothing to do. */
		ml_set_interrupts_enabled(istate);
		lck_mtx_unlock(&mp_cpu_boot_lock);
		return KERN_SUCCESS;
	}

	/* Publish the startup parameters read by start_cpu()/started_cpu(). */
	start_info.starter_cpu  = cpu_number();
	start_info.target_cpu   = slot_num;
	start_info.target_lapic = lapic;
	tsc_entry_barrier = 2;          /* starter + target must both arrive */
	tsc_exit_barrier = 2;

	/*
	 * Perform the processor startup sequence with all running
	 * processors rendezvous'ed. This is required during periods when
	 * the cache-disable bit is set for MTRR/PAT initialization.
	 */
	mp_rendezvous_no_intrs(start_cpu, (void *) &start_info);

	start_info.target_cpu = 0;      /* so started_cpu() on later boots won't mis-match */

	ml_set_interrupts_enabled(istate);
	lck_mtx_unlock(&mp_cpu_boot_lock);

	if (!cpu_datap(slot_num)->cpu_running) {
		/* Startup failed: log, then halt the machine. */
		kprintf("Failed to start CPU %02d\n", slot_num);
		printf("Failed to start CPU %02d, rebooting...\n", slot_num);
		delay(1000000);
		halt_cpu();
		return KERN_SUCCESS;
	} else {
		kprintf("Started cpu %d (lapic id %08x)\n", slot_num, lapic);
		return KERN_SUCCESS;
	}
}
484 
485 #if     MP_DEBUG
486 cpu_signal_event_log_t  *cpu_signal[MAX_CPUS];
487 cpu_signal_event_log_t  *cpu_handle[MAX_CPUS];
488 
489 MP_EVENT_NAME_DECL();
490 
491 #endif  /* MP_DEBUG */
492 
493 /*
494  * Note: called with NULL state when polling for TLB flush and cross-calls.
495  */
/*
 * Drain and dispatch this cpu's pending IPI signal bits.  Invoked from
 * the inter-processor interrupt vector, and also called directly with
 * regs == NULL by spinning code (e.g. mp_safe_spin_lock) to poll for
 * TLB flushes and cross-calls while interrupts are masked.  Loops
 * until cpu_signals is empty, except that in the polled (NULL regs)
 * case only KDP/TLB/CALL/CALL_PM events are serviced and AST checks
 * are skipped.  Always returns 0.
 */
int
cpu_signal_handler(x86_saved_state_t *regs)
{
#if     !MACH_KDP
#pragma unused (regs)
#endif /* !MACH_KDP */
	int             my_cpu;
	volatile int    *my_word;

	SCHED_STATS_INC(ipi_count);

	my_cpu = cpu_number();
	my_word = &cpu_data_ptr[my_cpu]->cpu_signals;
	/* Store the initial set of signals for diagnostics. New
	 * signals could arrive while these are being processed
	 * so it's no more than a hint.
	 */

	cpu_data_ptr[my_cpu]->cpu_prior_signals = *my_word;

	do {
#if     MACH_KDP
		/* Debugger entry takes priority over all other events. */
		if (i_bit(MP_KDP, my_word)) {
			DBGLOG(cpu_handle, my_cpu, MP_KDP);
			i_bit_clear(MP_KDP, my_word);
/* Ensure that the i386_kernel_state at the base of the
 * current thread's stack (if any) is synchronized with the
 * context at the moment of the interrupt, to facilitate
 * access through the debugger.
 */
			sync_iss_to_iks(regs);
			if (pmsafe_debug && !kdp_snapshot) {
				pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
			}
			mp_kdp_wait(TRUE, FALSE);
			if (pmsafe_debug && !kdp_snapshot) {
				pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
			}
		} else
#endif  /* MACH_KDP */
		if (i_bit(MP_TLB_FLUSH, my_word)) {
			DBGLOG(cpu_handle, my_cpu, MP_TLB_FLUSH);
			i_bit_clear(MP_TLB_FLUSH, my_word);
			pmap_update_interrupt();
		} else if (i_bit(MP_CALL, my_word)) {
			DBGLOG(cpu_handle, my_cpu, MP_CALL);
			i_bit_clear(MP_CALL, my_word);
			mp_cpus_call_action();
		} else if (i_bit(MP_CALL_PM, my_word)) {
			DBGLOG(cpu_handle, my_cpu, MP_CALL_PM);
			i_bit_clear(MP_CALL_PM, my_word);
			mp_call_PM();
		}
		if (regs == NULL) {
			/* Called to poll only for cross-calls and TLB flush */
			break;
		} else if (i_bit(MP_AST, my_word)) {
			DBGLOG(cpu_handle, my_cpu, MP_AST);
			i_bit_clear(MP_AST, my_word);
			ast_check(cpu_to_processor(my_cpu));
		}
	} while (*my_word);

	return 0;
}
561 
/*
 * KDP cross-cpu callback invoked when PTE corruption has been detected:
 * formats a panic string identifying the corrupt PTE and initiates a
 * backtrace/panic from this cpu's current frame.  Always returns 0.
 */
long
NMI_pte_corruption_callback(__unused void *arg0, __unused void *arg1, uint16_t lcpu)
{
	static char     pstr[256];      /* global since this callback is serialized */
	void            *stackptr;
	/* Capture the current frame pointer so the backtrace starts here. */
	__asm__ volatile ("movq %%rbp, %0" : "=m" (stackptr));

	snprintf(&pstr[0], sizeof(pstr),
	    "Panic(CPU %d): PTE corruption detected on PTEP 0x%llx VAL 0x%llx\n",
	    lcpu, (unsigned long long)(uintptr_t)PTE_corrupted_ptr, *(uint64_t *)PTE_corrupted_ptr);
	panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, current_cpu_datap()->cpu_int_state);
	return 0;
}
575 
576 extern void kprintf_break_lock(void);
/*
 * Handler for NMIs delivered to this cpu.  NMIs arrive here for three
 * broad reasons:
 *   - another cpu is panicking and is pulling all cpus into a known
 *     state (NMI_panic_reason set, or panic already active);
 *   - the debugger is being entered (drop into mp_kdp_wait, or, under
 *     a VMM with DB_NMI set, enter kdp directly);
 *   - a user-requested debugger NMI.
 * Always returns 1 (interrupt handled).
 */
int
NMIInterruptHandler(x86_saved_state_t *regs)
{
	void            *stackptr;
	char            pstr[256];
	uint64_t        now = mach_absolute_time();

	if (panic_active() && !panicDebugging) {
		/* A panic without debugging: park this cpu forever. */
		if (pmsafe_debug) {
			pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
		}
		for (;;) {
			cpu_pause();
		}
	}

	atomic_incl(&NMIPI_acks, 1);
	atomic_incl(&NMI_count, 1);
	/* Snapshot register state for the debugger, then grab our frame pointer. */
	sync_iss_to_iks_unconditionally(regs);
	__asm__ volatile ("movq %%rbp, %0" : "=m" (stackptr));

	if (cpu_number() == debugger_cpu) {
		/* The cpu driving the debugger ignores its own NMI. */
		goto NMExit;
	}

	/*
	 * If another cpu NMI'ed us with a declared panic reason, build the
	 * matching panic string and backtrace from this cpu's context.
	 */
	if (NMI_panic_reason == SPINLOCK_TIMEOUT) {
		lck_spinlock_to_info_t lsti;

		lsti = os_atomic_load(&lck_spinlock_timeout_in_progress, acquire);
		snprintf(&pstr[0], sizeof(pstr),
		    "Panic(CPU %d, time %llu): NMIPI for spinlock acquisition timeout, spinlock: %p, "
		    "spinlock owner: %p, current_thread: %p, spinlock_owner_cpu: 0x%x\n",
		    cpu_number(), now, lsti->lock, (void *)lsti->owner_thread_cur,
		    current_thread(), lsti->owner_cpu);
		panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs);
	} else if (NMI_panic_reason == TLB_FLUSH_TIMEOUT) {
		snprintf(&pstr[0], sizeof(pstr),
		    "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: TLB flush timeout, TLB state:0x%x\n",
		    cpu_number(), now, current_cpu_datap()->cpu_tlb_invalid);
		panic_i386_backtrace(stackptr, 48, &pstr[0], TRUE, regs);
	} else if (NMI_panic_reason == CROSSCALL_TIMEOUT) {
		snprintf(&pstr[0], sizeof(pstr),
		    "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: cross-call timeout\n",
		    cpu_number(), now);
		panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs);
	} else if (NMI_panic_reason == INTERRUPT_WATCHDOG) {
		snprintf(&pstr[0], sizeof(pstr),
		    "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: interrupt watchdog for vector 0x%x\n",
		    cpu_number(), now, vector_timed_out);
		panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs);
	}

#if MACH_KDP
	if (pmsafe_debug && !kdp_snapshot) {
		pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
	}
	/* Acknowledge the NMI (observed by NMI_cpus()) and cancel any pending MP_KDP signal. */
	current_cpu_datap()->cpu_NMI_acknowledged = TRUE;
	i_bit_clear(MP_KDP, &current_cpu_datap()->cpu_signals);
	if (panic_active() || NMI_panic_reason != NONE) {
		mp_kdp_wait(FALSE, TRUE);
	} else if (!mp_kdp_trap &&
	    !mp_kdp_is_NMI &&
	    virtualized && (debug_boot_arg & DB_NMI)) {
		/*
		 * Under a VMM with the debug boot-arg set, drop into kdp.
		 * Since an NMI is involved, there's a risk of contending with
		 * a panic. And side-effects of NMIs may result in entry into,
		 * and continuing from, the debugger being unreliable.
		 */
		if (__sync_bool_compare_and_swap(&mp_kdp_is_NMI, FALSE, TRUE)) {
			kprintf_break_lock();

			DebuggerWithContext(EXC_BREAKPOINT, saved_state64(regs),
			    "requested by NMI", DEBUGGER_OPTION_NONE,
			    (unsigned long)(char *)__builtin_return_address(0));

			mp_kdp_is_NMI = FALSE;
		} else {
			/* Lost the race to become the debugger cpu: wait like the rest. */
			mp_kdp_wait(FALSE, FALSE);
		}
	} else {
		mp_kdp_wait(FALSE, FALSE);
	}
	if (pmsafe_debug && !kdp_snapshot) {
		pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
	}
#endif
NMExit:
	return 1;
}
667 
668 /*
669  * cpu_interrupt is really just to be used by the scheduler to
670  * get a CPU's attention it may not always issue an IPI.  If an
671  * IPI is always needed then use i386_cpu_IPI.
672  */
673 void
cpu_interrupt(int cpu)674 cpu_interrupt(int cpu)
675 {
676 	boolean_t did_IPI = FALSE;
677 
678 	if (smp_initialized
679 	    && pmCPUExitIdle(cpu_datap(cpu))) {
680 		i386_cpu_IPI(cpu);
681 		did_IPI = TRUE;
682 	}
683 
684 	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), cpu, did_IPI, 0, 0, 0);
685 }
686 
687 /*
688  * Send a true NMI via the local APIC to the specified CPU.
689  */
690 void
cpu_NMI_interrupt(int cpu)691 cpu_NMI_interrupt(int cpu)
692 {
693 	if (smp_initialized) {
694 		i386_send_NMI(cpu);
695 	}
696 }
697 
/*
 * Exercise the NMI path on every running cpu, one at a time: send each
 * an NMI and spin (servicing any pending TLB flushes locally) until
 * the target's handler sets cpu_NMI_acknowledged.  Panics if a cpu
 * fails to acknowledge within ~10^10 TSC ticks (unless machine
 * timeouts are suspended).  Runs with interrupts disabled and NMIPIs
 * enabled for the duration.
 */
void
NMI_cpus(void)
{
	unsigned int    cpu;
	boolean_t       intrs_enabled;
	uint64_t        tsc_timeout;

	intrs_enabled = ml_set_interrupts_enabled(FALSE);
	NMIPI_enable(TRUE);
	for (cpu = 0; cpu < real_ncpus; cpu++) {
		if (!cpu_is_running(cpu)) {
			continue;
		}
		/* Clear the ack flag before sending so we observe this NMI only. */
		cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
		cpu_NMI_interrupt(cpu);
		tsc_timeout = !machine_timeout_suspended() ?
		    rdtsc64() + (1000 * 1000 * 1000 * 10ULL) :
		    ~0ULL;
		while (!cpu_datap(cpu)->cpu_NMI_acknowledged) {
			handle_pending_TLB_flushes();
			cpu_pause();
			if (rdtsc64() > tsc_timeout) {
				panic("NMI_cpus() timeout cpu %d", cpu);
			}
		}
		cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
	}
	NMIPI_enable(FALSE);

	ml_set_interrupts_enabled(intrs_enabled);
}
729 
730 static void(*volatile mp_PM_func)(void) = NULL;
731 
732 static void
mp_call_PM(void)733 mp_call_PM(void)
734 {
735 	assert(!ml_get_interrupts_enabled());
736 
737 	if (mp_PM_func != NULL) {
738 		mp_PM_func();
739 	}
740 }
741 
742 void
cpu_PM_interrupt(int cpu)743 cpu_PM_interrupt(int cpu)
744 {
745 	assert(!ml_get_interrupts_enabled());
746 
747 	if (mp_PM_func != NULL) {
748 		if (cpu == cpu_number()) {
749 			mp_PM_func();
750 		} else {
751 			i386_signal_cpu(cpu, MP_CALL_PM, ASYNC);
752 		}
753 	}
754 }
755 
/*
 * Register the power-management callout invoked via MP_CALL_PM
 * (see mp_call_PM() and cpu_PM_interrupt()).  Replaces any
 * previously registered handler.
 */
void
PM_interrupt_register(void (*fn)(void))
{
	mp_PM_func = fn;
}
761 
/*
 * Post an event bit in the target cpu's signal word and send it an IPI.
 * In SYNC mode, spin until the target clears the bit; if it hasn't
 * done so within ~10^9 TSC ticks, log (under MP_DEBUG) and retry
 * indefinitely rather than giving up.  Signals to non-running cpus are
 * silently dropped.
 */
void
i386_signal_cpu(int cpu, mp_event_t event, mp_sync_t mode)
{
	volatile int    *signals = &cpu_datap(cpu)->cpu_signals;
	uint64_t        tsc_timeout;


	if (!cpu_datap(cpu)->cpu_running) {
		return;
	}

	if (event == MP_TLB_FLUSH) {
		KERNEL_DEBUG(TRACE_MP_TLB_FLUSH | DBG_FUNC_START, cpu, 0, 0, 0, 0);
	}

	DBGLOG(cpu_signal, cpu, event);

	i_bit_set(event, signals);
	i386_cpu_IPI(cpu);
	if (mode == SYNC) {
again:
		/* Timeout is unbounded when machine timeouts are suspended. */
		tsc_timeout = !machine_timeout_suspended() ?
		    rdtsc64() + (1000 * 1000 * 1000) :
		    ~0ULL;
		while (i_bit(event, signals) && rdtsc64() < tsc_timeout) {
			cpu_pause();
		}
		if (i_bit(event, signals)) {
			/* Target still hasn't serviced the event: start a new timeout window. */
			DBG("i386_signal_cpu(%d, 0x%x, SYNC) timed out\n",
			    cpu, event);
			goto again;
		}
	}
	if (event == MP_TLB_FLUSH) {
		KERNEL_DEBUG(TRACE_MP_TLB_FLUSH | DBG_FUNC_END, cpu, 0, 0, 0, 0);
	}
}
799 
800 /*
801  * Helper function called when busy-waiting: panic if too long
802  * a TSC-based time has elapsed since the start of the spin.
803  */
804 static boolean_t
mp_spin_timeout(uint64_t tsc_start)805 mp_spin_timeout(uint64_t tsc_start)
806 {
807 	uint64_t        tsc_timeout;
808 
809 	cpu_pause();
810 	if (machine_timeout_suspended()) {
811 		return FALSE;
812 	}
813 
814 	/*
815 	 * The timeout is 4 * the spinlock timeout period
816 	 * unless we have serial console printing (kprintf) enabled
817 	 * in which case we allow an even greater margin.
818 	 */
819 	tsc_timeout = disable_serial_output ? LockTimeOutTSC << 2
820 	        : LockTimeOutTSC << 4;
821 	return rdtsc64() > tsc_start + tsc_timeout;
822 }
823 
824 /*
825  * Helper function to take a spinlock while ensuring that incoming IPIs
826  * are still serviced if interrupts are masked while we spin.
827  * Returns current interrupt state.
828  */
/*
 * Take a spinlock while ensuring that incoming IPIs are still serviced
 * if interrupts are masked while we spin: with interrupts disabled,
 * each failed lock attempt polls cpu_signal_handler(NULL) so pending
 * TLB flushes and cross-calls can't deadlock against the lock holder.
 * On spin timeout, NMIs the owner's cpu and panics.
 * Returns the interrupt state on entry (TRUE if interrupts were enabled).
 */
boolean_t
mp_safe_spin_lock(usimple_lock_t lock)
{
	if (ml_get_interrupts_enabled()) {
		/* Interrupts enabled: plain blocking acquire is safe. */
		simple_lock(lock, LCK_GRP_NULL);
		return TRUE;
	}

	lck_spinlock_to_info_t lsti;
	uint64_t tsc_spin_start = rdtsc64();

	while (!simple_lock_try(lock, LCK_GRP_NULL)) {
		/* Service cross-calls/TLB flushes we'd otherwise never see. */
		cpu_signal_handler(NULL);
		if (mp_spin_timeout(tsc_spin_start)) {
			uintptr_t lowner = (uintptr_t)lock->interlock.lock_data;

			lsti = lck_spinlock_timeout_hit(lock, lowner);
			/* NMI the owning cpu so its state appears in the panic log. */
			NMIPI_panic(cpu_to_cpumask(lsti->owner_cpu), SPINLOCK_TIMEOUT);
			panic("mp_safe_spin_lock() timed out, lock: %p, "
			    "owner thread: 0x%lx, current_thread: %p, "
			    "owner on CPU 0x%x, time: %llu",
			    lock, lowner, current_thread(),
			    lsti->owner_cpu, mach_absolute_time());
		}
	}

	return FALSE;
}
857 
858 /*
859  * All-CPU rendezvous:
860  *      - CPUs are signalled,
861  *	- all execute the setup function (if specified),
862  *	- rendezvous (i.e. all cpus reach a barrier),
863  *	- all execute the action function (if specified),
864  *	- rendezvous again,
865  *	- execute the teardown function (if specified), and then
866  *	- resume.
867  *
868  * Note that the supplied external functions _must_ be reentrant and aware
869  * that they are running in parallel and in an unknown lock context.
870  */
871 
/*
 * Per-cpu body of the rendezvous: run setup, meet all participants at
 * the entry barrier, run the action, meet again at the exit barrier,
 * run teardown, then bump the completion count that releases the
 * initiator in mp_rendezvous().  While spinning with interrupts
 * disabled, pending TLB flushes are polled to avoid deadlock.  Each
 * barrier spin is bounded by mp_spin_timeout() and panics on expiry.
 */
static void
mp_rendezvous_action(__unused void *null)
{
	boolean_t       intrs_enabled;
	uint64_t        tsc_spin_start;

	/*
	 * Note that mp_rv_lock was acquired by the thread that initiated the
	 * rendezvous and must have been acquired before we enter
	 * mp_rendezvous_action().
	 */
	current_cpu_datap()->cpu_rendezvous_in_progress = TRUE;

	/* setup function */
	if (mp_rv_setup_func != NULL) {
		mp_rv_setup_func(mp_rv_func_arg);
	}

	/* Sampled after setup, which may itself change the interrupt state. */
	intrs_enabled = ml_get_interrupts_enabled();

	/* spin on entry rendezvous */
	atomic_incl(&mp_rv_entry, 1);
	tsc_spin_start = rdtsc64();

	while (mp_rv_entry < mp_rv_ncpus) {
		/* poll for pesky tlb flushes if interrupts disabled */
		if (!intrs_enabled) {
			handle_pending_TLB_flushes();
		}
		if (mp_spin_timeout(tsc_spin_start)) {
			panic("mp_rv_action() entry: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_entry, mp_rv_ncpus, tsc_spin_start, rdtsc64());
		}
	}

	/* action function */
	if (mp_rv_action_func != NULL) {
		mp_rv_action_func(mp_rv_func_arg);
	}

	/* spin on exit rendezvous */
	atomic_incl(&mp_rv_exit, 1);
	tsc_spin_start = rdtsc64();
	while (mp_rv_exit < mp_rv_ncpus) {
		if (!intrs_enabled) {
			handle_pending_TLB_flushes();
		}
		if (mp_spin_timeout(tsc_spin_start)) {
			panic("mp_rv_action() exit: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_exit, mp_rv_ncpus, tsc_spin_start, rdtsc64());
		}
	}

	/* teardown function */
	if (mp_rv_teardown_func != NULL) {
		mp_rv_teardown_func(mp_rv_func_arg);
	}

	current_cpu_datap()->cpu_rendezvous_in_progress = FALSE;

	/* Bump completion count */
	atomic_incl(&mp_rv_complete, 1);
}
933 
/*
 * Initiate an all-cpu rendezvous: under mp_rv_lock, publish the
 * setup/action/teardown functions and reset the barrier counters, then
 * cross-call every other cpu into mp_rendezvous_action() and run it
 * locally as well.  The initiator then spins (bounded by
 * mp_spin_timeout(), panicking on expiry) until all participants have
 * bumped mp_rv_complete, so the shared rendezvous state can be safely
 * torn down before the lock is released.  Before SMP is up, the three
 * functions are simply run locally in order.
 */
void
mp_rendezvous(void (*setup_func)(void *),
    void (*action_func)(void *),
    void (*teardown_func)(void *),
    void *arg)
{
	uint64_t        tsc_spin_start;

	if (!smp_initialized) {
		/* Uniprocessor: no rendezvous needed, just run the callbacks. */
		if (setup_func != NULL) {
			setup_func(arg);
		}
		if (action_func != NULL) {
			action_func(arg);
		}
		if (teardown_func != NULL) {
			teardown_func(arg);
		}
		return;
	}

	/* obtain rendezvous lock */
	mp_rendezvous_lock();

	/* set static function pointers */
	mp_rv_setup_func = setup_func;
	mp_rv_action_func = action_func;
	mp_rv_teardown_func = teardown_func;
	mp_rv_func_arg = arg;

	mp_rv_entry    = 0;
	mp_rv_exit     = 0;
	mp_rv_complete = 0;

	/*
	 * signal other processors, which will call mp_rendezvous_action()
	 * with interrupts disabled
	 */
	mp_rv_ncpus = mp_cpus_call(CPUMASK_OTHERS, NOSYNC, &mp_rendezvous_action, NULL) + 1;

	/* call executor function on this cpu */
	mp_rendezvous_action(NULL);

	/*
	 * Spin for everyone to complete.
	 * This is necessary to ensure that all processors have proceeded
	 * from the exit barrier before we release the rendezvous structure.
	 */
	tsc_spin_start = rdtsc64();
	while (mp_rv_complete < mp_rv_ncpus) {
		if (mp_spin_timeout(tsc_spin_start)) {
			panic("mp_rendezvous() timeout: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_complete, mp_rv_ncpus, tsc_spin_start, rdtsc64());
		}
	}

	/* Tidy up */
	mp_rv_setup_func = NULL;
	mp_rv_action_func = NULL;
	mp_rv_teardown_func = NULL;
	mp_rv_func_arg = NULL;

	/* release lock */
	mp_rendezvous_unlock();
}
998 
/* Acquire the global rendezvous lock (via the panic-safe spin primitive). */
void
mp_rendezvous_lock(void)
{
	(void) mp_safe_spin_lock(&mp_rv_lock);
}
1004 
/* Release the global rendezvous lock. */
void
mp_rendezvous_unlock(void)
{
	simple_unlock(&mp_rv_lock);
}
1010 
/*
 * Forcibly re-initialize the rendezvous lock, discarding any holder.
 * NOTE(review): presumably used on debugger/panic paths when the holder
 * may never release -- callers are outside this view, confirm before reuse.
 */
void
mp_rendezvous_break_lock(void)
{
	simple_lock_init(&mp_rv_lock, 0);
}
1016 
1017 static void
setup_disable_intrs(__unused void * param_not_used)1018 setup_disable_intrs(__unused void * param_not_used)
1019 {
1020 	/* disable interrupts before the first barrier */
1021 	boolean_t intr = ml_set_interrupts_enabled(FALSE);
1022 
1023 	current_cpu_datap()->cpu_iflag = intr;
1024 	DBG("CPU%d: %s\n", get_cpu_number(), __FUNCTION__);
1025 }
1026 
1027 static void
teardown_restore_intrs(__unused void * param_not_used)1028 teardown_restore_intrs(__unused void * param_not_used)
1029 {
1030 	/* restore interrupt flag following MTRR changes */
1031 	ml_set_interrupts_enabled(current_cpu_datap()->cpu_iflag);
1032 	DBG("CPU%d: %s\n", get_cpu_number(), __FUNCTION__);
1033 }
1034 
/*
 * A wrapper to mp_rendezvous() to call action_func() with interrupts disabled.
 * Interrupts are masked in the setup phase and the prior state restored in
 * teardown.  This is exported for use by kexts.
 */
void
mp_rendezvous_no_intrs(
	void (*action_func)(void *),
	void *arg)
{
	mp_rendezvous(setup_disable_intrs,
	    action_func,
	    teardown_restore_intrs,
	    arg);
}
1049 
1050 
/*
 * A single queued cross-cpu call request.  Buffers circulate between the
 * global free list (mp_cpus_call_freelist) and the target cpu's pending
 * queue in mp_cpus_call_head[].
 */
typedef struct {
	queue_chain_t   link;                   /* queue linkage */
	void            (*func)(void *, void *); /* routine to call */
	void            *arg0;                  /* routine's 1st arg */
	void            *arg1;                  /* routine's 2nd arg */
	cpumask_t       *maskp;                 /* completion response mask; NULL for NOSYNC */
} mp_call_t;
1058 
1059 
/*
 * A simple-locked queue of mp_call_t buffers, used both for the global
 * free list and for each cpu's incoming-call queue.
 */
typedef struct {
	queue_head_t            queue;
	decl_simple_lock_data(, lock);
} mp_call_queue_t;
/* Number of call buffers each registered cpu contributes to the free list. */
#define MP_CPUS_CALL_BUFS_PER_CPU       MAX_CPUS
static mp_call_queue_t  mp_cpus_call_freelist;          /* free buffers, shared */
static mp_call_queue_t  mp_cpus_call_head[MAX_CPUS];    /* per-cpu pending calls */
1067 
1068 static inline boolean_t
mp_call_head_lock(mp_call_queue_t * cqp)1069 mp_call_head_lock(mp_call_queue_t *cqp)
1070 {
1071 	boolean_t       intrs_enabled;
1072 
1073 	intrs_enabled = ml_set_interrupts_enabled(FALSE);
1074 	simple_lock(&cqp->lock, LCK_GRP_NULL);
1075 
1076 	return intrs_enabled;
1077 }
1078 
1079 /*
1080  * Deliver an NMIPI to a set of processors to cause them to panic .
1081  */
1082 void
NMIPI_panic(cpumask_t cpu_mask,NMI_reason_t why)1083 NMIPI_panic(cpumask_t cpu_mask, NMI_reason_t why)
1084 {
1085 	unsigned int cpu;
1086 	cpumask_t cpu_bit;
1087 	uint64_t deadline;
1088 
1089 	NMIPI_enable(TRUE);
1090 	NMI_panic_reason = why;
1091 
1092 	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
1093 		if ((cpu_mask & cpu_bit) == 0) {
1094 			continue;
1095 		}
1096 		cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
1097 		cpu_NMI_interrupt(cpu);
1098 	}
1099 
1100 	/* Wait (only so long) for NMi'ed cpus to respond */
1101 	deadline = mach_absolute_time() + LockTimeOut;
1102 	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
1103 		if ((cpu_mask & cpu_bit) == 0) {
1104 			continue;
1105 		}
1106 		while (!cpu_datap(cpu)->cpu_NMI_acknowledged &&
1107 		    mach_absolute_time() < deadline) {
1108 			cpu_pause();
1109 		}
1110 	}
1111 }
1112 
#if MACH_ASSERT
/* A call-queue is "locked" iff interrupts are masked AND its spinlock is held. */
static inline boolean_t
mp_call_head_is_locked(mp_call_queue_t *cqp)
{
	if (ml_get_interrupts_enabled()) {
		return FALSE;
	}
	return hw_lock_held((hw_lock_t)&cqp->lock) ? TRUE : FALSE;
}
#endif
1121 
/*
 * Drop a call-queue lock taken by mp_call_head_lock() and restore the
 * caller's saved interrupt state.  Order matters: unlock first, then
 * re-enable interrupts.
 */
static inline void
mp_call_head_unlock(mp_call_queue_t *cqp, boolean_t intrs_enabled)
{
	simple_unlock(&cqp->lock);
	ml_set_interrupts_enabled(intrs_enabled);
}
1128 
/*
 * Take one call buffer off the global free list; returns NULL if none
 * are currently available.
 */
static inline mp_call_t *
mp_call_alloc(void)
{
	mp_call_t       *callp = NULL;
	boolean_t       intrs_enabled;
	mp_call_queue_t *cqp = &mp_cpus_call_freelist;

	intrs_enabled = mp_call_head_lock(cqp);
	if (!queue_empty(&cqp->queue)) {
		queue_remove_first(&cqp->queue, callp, typeof(callp), link);
	}
	mp_call_head_unlock(cqp, intrs_enabled);

	return callp;
}
1144 
/*
 * Return a call buffer to the head of the global free list.
 */
static inline void
mp_call_free(mp_call_t *callp)
{
	boolean_t       intrs_enabled;
	mp_call_queue_t *cqp = &mp_cpus_call_freelist;

	intrs_enabled = mp_call_head_lock(cqp);
	queue_enter_first(&cqp->queue, callp, typeof(callp), link);
	mp_call_head_unlock(cqp, intrs_enabled);
}
1155 
/*
 * Pop the oldest pending call from a cpu's queue, or NULL if empty.
 * Caller must hold the queue lock (asserted under MACH_ASSERT).
 */
static inline mp_call_t *
mp_call_dequeue_locked(mp_call_queue_t *cqp)
{
	mp_call_t       *callp = NULL;

	assert(mp_call_head_is_locked(cqp));
	if (!queue_empty(&cqp->queue)) {
		queue_remove_first(&cqp->queue, callp, typeof(callp), link);
	}
	return callp;
}
1167 
/*
 * Append a call buffer to a cpu's pending queue.
 * Caller must hold the queue lock (via mp_call_head_lock()).
 */
static inline void
mp_call_enqueue_locked(
	mp_call_queue_t *cqp,
	mp_call_t       *callp)
{
	queue_enter(&cqp->queue, callp, typeof(callp), link);
}
1175 
/* Called on the boot processor to initialize global structures */
static void
mp_cpus_call_init(void)
{
	mp_call_queue_t *cqp = &mp_cpus_call_freelist;

	DBG("mp_cpus_call_init()\n");
	/* Free list starts empty; cpus seed it in mp_cpus_call_cpu_init(). */
	simple_lock_init(&cqp->lock, 0);
	queue_init(&cqp->queue);
}
1186 
1187 /*
1188  * Called at processor registration to add call buffers to the free list
1189  * and to initialize the per-cpu call queue.
1190  */
1191 void
mp_cpus_call_cpu_init(int cpu)1192 mp_cpus_call_cpu_init(int cpu)
1193 {
1194 	int             i;
1195 	mp_call_queue_t *cqp = &mp_cpus_call_head[cpu];
1196 	mp_call_t       *callp;
1197 
1198 	simple_lock_init(&cqp->lock, 0);
1199 	queue_init(&cqp->queue);
1200 	for (i = 0; i < MP_CPUS_CALL_BUFS_PER_CPU; i++) {
1201 		callp = zalloc_permanent_type(mp_call_t);
1202 		mp_call_free(callp);
1203 	}
1204 
1205 	DBG("mp_cpus_call_init(%d) done\n", cpu);
1206 }
1207 
/*
 * This is called from cpu_signal_handler() to process an MP_CALL signal.
 * And also from i386_deactivate_cpu() when a cpu is being taken offline.
 * Drains this cpu's pending-call queue: each request is copied to the stack
 * and its buffer freed before invocation, and the queue lock is dropped
 * around the callout so the called function may itself queue cross-calls.
 */
static void
mp_cpus_call_action(void)
{
	mp_call_queue_t *cqp;
	boolean_t       intrs_enabled;
	mp_call_t       *callp;
	mp_call_t       call;

	assert(!ml_get_interrupts_enabled());
	cqp = &mp_cpus_call_head[cpu_number()];
	intrs_enabled = mp_call_head_lock(cqp);
	while ((callp = mp_call_dequeue_locked(cqp)) != NULL) {
		/* Copy call request to the stack to free buffer */
		call = *callp;
		mp_call_free(callp);
		if (call.func != NULL) {
			mp_call_head_unlock(cqp, intrs_enabled);
			KERNEL_DEBUG_CONSTANT(
				TRACE_MP_CPUS_CALL_ACTION,
				VM_KERNEL_UNSLIDE(call.func), VM_KERNEL_UNSLIDE_OR_PERM(call.arg0),
				VM_KERNEL_UNSLIDE_OR_PERM(call.arg1), VM_KERNEL_ADDRPERM(call.maskp), 0);
			call.func(call.arg0, call.arg1);
			(void) mp_call_head_lock(cqp);
		}
		/* Acknowledge completion to a SYNC/ASYNC caller waiting on the mask. */
		if (call.maskp != NULL) {
			i_bit_set(cpu_number(), call.maskp);
		}
	}
	mp_call_head_unlock(cqp, intrs_enabled);
}
1242 
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-function-type"

/*
 * mp_cpus_call() runs a given function on cpus specified in a given cpu mask.
 * Possible modes are:
 *  SYNC:   function is called serially on target cpus in logical cpu order
 *	    waiting for each call to be acknowledged before proceeding
 *  ASYNC:  function call is queued to the specified cpus
 *	    waiting for all calls to complete in parallel before returning
 *  NOSYNC: function calls are queued
 *	    but we return before confirmation of calls completing.
 * The action function may be NULL.
 * The cpu mask may include the local cpu. Offline cpus are ignored.
 * The return value is the number of cpus on which the call was made or queued.
 */
cpu_t
mp_cpus_call(
	cpumask_t       cpus,
	mp_sync_t       mode,
	void            (*action_func)(void *),
	void            *arg)
{
	/*
	 * Widen the one-arg callout to the two-arg form used internally;
	 * arg1 is passed as NULL and never consumed by the one-arg callee
	 * (hence the suppressed -Wcast-function-type above).
	 */
	return mp_cpus_call1(
		cpus,
		mode,
		(void (*)(void *, void *))action_func,
		arg,
		NULL,
		NULL);
}

#pragma clang diagnostic pop
1276 
/*
 * Spin until every cpu in cpus_called has acknowledged in *cpus_responded.
 * While waiting with interrupts masked, service this cpu's own incoming
 * cross-calls and signals to avoid deadlocking with peers waiting on us.
 * On timeout, NMIs the stragglers and panics.
 */
static void
mp_cpus_call_wait(boolean_t     intrs_enabled,
    cpumask_t     cpus_called,
    cpumask_t     *cpus_responded)
{
	mp_call_queue_t         *cqp;
	uint64_t                tsc_spin_start;

	/* Spinning is only safe if we cannot be preempted or migrated. */
	assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
	cqp = &mp_cpus_call_head[cpu_number()];

	tsc_spin_start = rdtsc64();
	while (*cpus_responded != cpus_called) {
		if (!intrs_enabled) {
			/* Sniffing w/o locking */
			if (!queue_empty(&cqp->queue)) {
				mp_cpus_call_action();
			}
			cpu_signal_handler(NULL);
		}
		if (mp_spin_timeout(tsc_spin_start)) {
			cpumask_t       cpus_unresponsive;

			cpus_unresponsive = cpus_called & ~(*cpus_responded);
			NMIPI_panic(cpus_unresponsive, CROSSCALL_TIMEOUT);
			panic("mp_cpus_call_wait() timeout, cpus: 0x%llx",
			    cpus_unresponsive);
		}
	}
}
1307 
/*
 * Two-argument cross-call primitive underlying mp_cpus_call().
 * Queues (and, per 'mode', waits for) action_func(arg0, arg1) on every
 * running cpu in 'cpus'.  Optionally reports the set of cpus actually
 * called via *cpus_calledp.  Returns the number of cpus called/queued.
 */
cpu_t
mp_cpus_call1(
	cpumask_t       cpus,
	mp_sync_t       mode,
	void            (*action_func)(void *, void *),
	void            *arg0,
	void            *arg1,
	cpumask_t       *cpus_calledp)
{
	cpu_t           cpu = 0;
	boolean_t       intrs_enabled = FALSE;
	boolean_t       call_self = FALSE;
	cpumask_t       cpus_called = 0;
	cpumask_t       cpus_responded = 0;
	long            cpus_call_count = 0;
	uint64_t        tsc_spin_start;
	boolean_t       topo_lock;

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPUS_CALL | DBG_FUNC_START,
		cpus, mode, VM_KERNEL_UNSLIDE(action_func), VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1));

	/* Pre-SMP: only a self-call is possible. */
	if (!smp_initialized) {
		if ((cpus & CPUMASK_SELF) == 0) {
			goto out;
		}
		if (action_func != NULL) {
			intrs_enabled = ml_set_interrupts_enabled(FALSE);
			action_func(arg0, arg1);
			ml_set_interrupts_enabled(intrs_enabled);
		}
		call_self = TRUE;
		goto out;
	}

	/*
	 * Queue the call for each non-local requested cpu.
	 * This is performed under the topo lock to prevent changes to
	 * cpus online state and to prevent concurrent rendezvouses --
	 * although an exception is made if we're calling only the master
	 * processor since that always remains active. Note: this exception
	 * is expected for longterm timer nosync cross-calls to the master cpu.
	 */
	mp_disable_preemption();
	intrs_enabled = ml_get_interrupts_enabled();
	topo_lock = (cpus != cpu_to_cpumask(master_cpu));
	if (topo_lock) {
		ml_set_interrupts_enabled(FALSE);
		(void) mp_safe_spin_lock(&x86_topo_lock);
	}
	for (cpu = 0; cpu < (cpu_t) real_ncpus; cpu++) {
		if (((cpu_to_cpumask(cpu) & cpus) == 0) ||
		    !cpu_is_running(cpu)) {
			continue;
		}
		tsc_spin_start = rdtsc64();
		if (cpu == (cpu_t) cpu_number()) {
			/*
			 * We don't IPI ourself and if calling asynchronously,
			 * we defer our call until we have signalled all others.
			 */
			call_self = TRUE;
			if (mode == SYNC && action_func != NULL) {
				KERNEL_DEBUG_CONSTANT(
					TRACE_MP_CPUS_CALL_LOCAL,
					VM_KERNEL_UNSLIDE(action_func),
					VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1), 0, 0);
				action_func(arg0, arg1);
			}
		} else {
			/*
			 * Here to queue a call to cpu and IPI.
			 */
			mp_call_t       *callp = NULL;
			mp_call_queue_t *cqp = &mp_cpus_call_head[cpu];
			boolean_t       intrs_inner;

queue_call:
			if (callp == NULL) {
				callp = mp_call_alloc();
			}
			intrs_inner = mp_call_head_lock(cqp);
			if (callp == NULL) {
				/*
				 * No free buffers: drop the lock and service
				 * our own pending work while waiting for one.
				 */
				mp_call_head_unlock(cqp, intrs_inner);
				KERNEL_DEBUG_CONSTANT(
					TRACE_MP_CPUS_CALL_NOBUF,
					cpu, 0, 0, 0, 0);
				if (!intrs_inner) {
					/* Sniffing w/o locking */
					if (!queue_empty(&cqp->queue)) {
						mp_cpus_call_action();
					}
					handle_pending_TLB_flushes();
				}
				if (mp_spin_timeout(tsc_spin_start)) {
					panic("mp_cpus_call1() timeout start: 0x%llx, cur: 0x%llx",
					    tsc_spin_start, rdtsc64());
				}
				goto queue_call;
			}
			/* NOSYNC callers don't wait, so no response mask. */
			callp->maskp = (mode == NOSYNC) ? NULL : &cpus_responded;
			callp->func = action_func;
			callp->arg0 = arg0;
			callp->arg1 = arg1;
			mp_call_enqueue_locked(cqp, callp);
			cpus_call_count++;
			cpus_called |= cpu_to_cpumask(cpu);
			i386_signal_cpu(cpu, MP_CALL, ASYNC);
			mp_call_head_unlock(cqp, intrs_inner);
			if (mode == SYNC) {
				mp_cpus_call_wait(intrs_inner, cpus_called, &cpus_responded);
			}
		}
	}
	if (topo_lock) {
		simple_unlock(&x86_topo_lock);
		ml_set_interrupts_enabled(intrs_enabled);
	}

	/* Call locally if mode not SYNC (the SYNC self-call ran inline above) */
	if (mode != SYNC && call_self) {
		KERNEL_DEBUG_CONSTANT(
			TRACE_MP_CPUS_CALL_LOCAL,
			VM_KERNEL_UNSLIDE(action_func), VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1), 0, 0);
		if (action_func != NULL) {
			ml_set_interrupts_enabled(FALSE);
			action_func(arg0, arg1);
			ml_set_interrupts_enabled(intrs_enabled);
		}
	}

	/* For ASYNC, now wait for all signaled cpus to complete their calls */
	if (mode == ASYNC) {
		mp_cpus_call_wait(intrs_enabled, cpus_called, &cpus_responded);
	}

	/* Safe to allow pre-emption now */
	mp_enable_preemption();

out:
	if (call_self) {
		cpus_called |= cpu_to_cpumask(cpu);
		cpus_call_count++;
	}

	if (cpus_calledp) {
		*cpus_calledp = cpus_called;
	}

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPUS_CALL | DBG_FUNC_END,
		cpus_call_count, cpus_called, 0, 0, 0);

	return (cpu_t) cpus_call_count;
}
1463 
1464 
/*
 * Per-cpu half of mp_broadcast(): invoke the broadcast function, then
 * decrement the outstanding-cpu count; the last cpu through wakes the
 * instigator blocked in mp_broadcast().
 */
static void
mp_broadcast_action(__unused void *null)
{
	/* call action function */
	if (mp_bc_action_func != NULL) {
		mp_bc_action_func(mp_bc_func_arg);
	}

	/* if we're the last one through, wake up the instigator */
	if (atomic_decl_and_test(&mp_bc_count, 1)) {
		thread_wakeup(((event_t)(uintptr_t) &mp_bc_count));
	}
}
1478 
/*
 * mp_broadcast() runs a given function on all active cpus.
 * The caller blocks until the function has run on all cpus.
 * The caller will also block if there is another pending broadcast.
 */
void
mp_broadcast(
	void (*action_func)(void *),
	void *arg)
{
	/* Pre-SMP: just run it locally. */
	if (!smp_initialized) {
		if (action_func != NULL) {
			action_func(arg);
		}
		return;
	}

	/* obtain broadcast lock - serializes concurrent broadcasts */
	lck_mtx_lock(&mp_bc_lock);

	/* set static function pointers */
	mp_bc_action_func = action_func;
	mp_bc_func_arg = arg;

	/* Prepare to sleep until the last cpu checks in (mp_broadcast_action). */
	assert_wait((event_t)(uintptr_t)&mp_bc_count, THREAD_UNINT);

	/*
	 * signal other processors, which will call mp_broadcast_action()
	 */
	mp_bc_count = real_ncpus;                       /* assume max possible active */
	mp_bc_ncpus = mp_cpus_call(CPUMASK_ALL, NOSYNC, *mp_broadcast_action, NULL);
	atomic_decl(&mp_bc_count, real_ncpus - mp_bc_ncpus); /* subtract inactive */

	/* block for other cpus to have run action_func */
	if (mp_bc_ncpus > 1) {
		thread_block(THREAD_CONTINUE_NULL);
	} else {
		/* Only this cpu ran: cancel the wait prepared above. */
		clear_wait(current_thread(), THREAD_AWAKENED);
	}

	/* release lock */
	lck_mtx_unlock(&mp_bc_lock);
}
1522 
1523 void
mp_cpus_kick(cpumask_t cpus)1524 mp_cpus_kick(cpumask_t cpus)
1525 {
1526 	cpu_t           cpu;
1527 	boolean_t       intrs_enabled = FALSE;
1528 
1529 	intrs_enabled = ml_set_interrupts_enabled(FALSE);
1530 	mp_safe_spin_lock(&x86_topo_lock);
1531 
1532 	for (cpu = 0; cpu < (cpu_t) real_ncpus; cpu++) {
1533 		if (((cpu_to_cpumask(cpu) & cpus) == 0)
1534 		    || !cpu_is_running(cpu)) {
1535 			continue;
1536 		}
1537 
1538 		lapic_send_ipi(cpu, LAPIC_VECTOR(KICK));
1539 	}
1540 
1541 	simple_unlock(&x86_topo_lock);
1542 	ml_set_interrupts_enabled(intrs_enabled);
1543 }
1544 
/*
 * Mark the calling cpu online/running.  Called with interrupts disabled
 * during cpu start; flushes the full local TLB range since mappings may
 * have changed while this cpu was offline.
 */
void
i386_activate_cpu(void)
{
	cpu_data_t      *cdp = current_cpu_datap();

	assert(!ml_get_interrupts_enabled());

	if (!smp_initialized) {
		cdp->cpu_running = TRUE;
		return;
	}

	mp_safe_spin_lock(&x86_topo_lock);
	cdp->cpu_running = TRUE;
	started_cpu();
	pmap_tlbi_range(0, ~0ULL, true, 0);
	simple_unlock(&x86_topo_lock);
}
1563 
/*
 * Take the calling cpu offline: mark it not-running, migrate its timers
 * to the master cpu, stop per-cpu counters, and drain any pending IPIs
 * and timer deadlines before returning with interrupts still disabled.
 */
void
i386_deactivate_cpu(void)
{
	cpu_data_t      *cdp = current_cpu_datap();

	assert(!ml_get_interrupts_enabled());

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_START,
		0, 0, 0, 0, 0);

	mp_safe_spin_lock(&x86_topo_lock);
	cdp->cpu_running = FALSE;
	simple_unlock(&x86_topo_lock);

	/*
	 * Move all of this cpu's timers to the master/boot cpu,
	 * and poke it in case there's a sooner deadline for it to schedule.
	 * We don't need to wait for it to ack the IPI.
	 */
	timer_queue_shutdown(master_cpu,
	    &cdp->rtclock_timer.queue,
	    &cpu_datap(master_cpu)->rtclock_timer.queue);

	mp_cpus_call(cpu_to_cpumask(master_cpu), NOSYNC, timer_queue_expire_local, NULL);

#if CONFIG_CPU_COUNTERS
	mt_cpu_down(cdp);
#endif /* CONFIG_CPU_COUNTERS */
#if KPERF
	kptimer_stop_curcpu();
#endif /* KPERF */

	/*
	 * Open an interrupt window
	 * and ensure any pending IPI or timer is serviced
	 */
	mp_disable_preemption();
	ml_set_interrupts_enabled(TRUE);

	while (cdp->cpu_signals && x86_lcpu()->rtcDeadline != EndOfAllTime) {
		cpu_pause();
	}
	/*
	 * Ensure there's no remaining timer deadline set
	 * - AICPM may have left one active.
	 */
	setPop(0);

	ml_set_interrupts_enabled(FALSE);
	mp_enable_preemption();

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_END,
		0, 0, 0, 0, 0);
}
1620 
int     pmsafe_debug    = 1;    /* use power-management-safe mode around debugger entry */

#if     MACH_KDP
volatile boolean_t      mp_kdp_trap = FALSE;    /* TRUE while cpus are held in mp_kdp_wait() */
volatile boolean_t      mp_kdp_is_NMI = FALSE;  /* NOTE(review): presumably set on NMI entry; not written in this section */
volatile unsigned long  mp_kdp_ncpus;           /* count of cpus checked in to the debugger */
boolean_t               mp_kdp_state;           /* saved interrupt state of the debugger cpu */
bool                    mp_kdp_is_stackshot = false; /* current entry is on behalf of a stackshot */
1629 
/*
 * Bring all other cpus to a halt so this cpu can own the kernel debugger
 * (or take a stackshot, per 'is_stackshot').  Only one cpu wins the race
 * to become debugger_cpu; losers park in mp_kdp_wait().  If
 * 'proceed_on_failure', bounded waits are used and we press on rather
 * than panic when the topo lock or stragglers don't cooperate.
 */
void
mp_kdp_enter(boolean_t proceed_on_failure, bool is_stackshot)
{
	unsigned int    cpu;
	unsigned int    ncpus = 0;
	unsigned int    my_cpu;
	uint64_t        tsc_timeout;

	DBG("mp_kdp_enter()\n");

	/*
	 * Here to enter the debugger.
	 * In case of races, only one cpu is allowed to enter kdp after
	 * stopping others.
	 */
	mp_kdp_state = ml_set_interrupts_enabled(FALSE);
	my_cpu = cpu_number();
	mp_kdp_is_stackshot = is_stackshot;

	if (my_cpu == (unsigned) debugger_cpu) {
		kprintf("\n\nRECURSIVE DEBUGGER ENTRY DETECTED\n\n");
		kdp_reset();
		return;
	}

	uint64_t start_time = cpu_datap(my_cpu)->debugger_entry_time = mach_absolute_time();
	int locked = 0;
	/*
	 * Contend for the topo lock; if another cpu already owns the
	 * debugger (mp_kdp_trap set), drop the lock and wait our turn.
	 */
	while (!locked || mp_kdp_trap) {
		if (locked) {
			simple_unlock(&x86_topo_lock);
		}
		if (proceed_on_failure) {
			/* Bounded (~0.5s) try-lock loop: debugging beats deadlock. */
			if (mach_absolute_time() - start_time > 500000000ll) {
				paniclog_append_noflush("mp_kdp_enter() can't get x86_topo_lock! Debugging anyway! #YOLO\n");
				break;
			}
			locked = simple_lock_try(&x86_topo_lock, LCK_GRP_NULL);
			if (!locked) {
				cpu_pause();
			}
		} else {
			mp_safe_spin_lock(&x86_topo_lock);
			locked = TRUE;
		}

		if (locked && mp_kdp_trap) {
			simple_unlock(&x86_topo_lock);
			DBG("mp_kdp_enter() race lost\n");
#if MACH_KDP
			mp_kdp_wait(TRUE, FALSE);
#endif
			locked = FALSE;
		}
	}

	/* Switch to power-management-safe mode while halted, if configured. */
	if (pmsafe_debug && !kdp_snapshot) {
		pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
	}

	/* We are the debugger cpu; publish that before signalling others. */
	debugger_cpu = my_cpu;
	ncpus = 1;
	atomic_incl((volatile long *)&mp_kdp_ncpus, 1);
	mp_kdp_trap = TRUE;
	debugger_entry_time = cpu_datap(my_cpu)->debugger_entry_time;

	/*
	 * Deliver a nudge to other cpus, counting how many
	 */
	DBG("mp_kdp_enter() signaling other processors\n");
	if (force_immediate_debugger_NMI == FALSE) {
		for (cpu = 0; cpu < real_ncpus; cpu++) {
			if (cpu == my_cpu || !cpu_is_running(cpu)) {
				continue;
			}
			ncpus++;
			i386_signal_cpu(cpu, MP_KDP, ASYNC);
		}
		/*
		 * Wait for the other processors to synchronize
		 */
		DBG("mp_kdp_enter() waiting for (%d) processors to suspend\n", ncpus);

		/*
		 * This timeout is rather arbitrary; we don't want to NMI
		 * processors that are executing at potentially
		 * "unsafe-to-interrupt" points such as the trampolines,
		 * but neither do we want to lose state by waiting too long.
		 */
		tsc_timeout = rdtsc64() + (LockTimeOutTSC);

		while (mp_kdp_ncpus != ncpus && rdtsc64() < tsc_timeout) {
			/*
			 * A TLB shootdown request may be pending--this would
			 * result in the requesting processor waiting in
			 * PMAP_UPDATE_TLBS() until this processor deals with it.
			 * Process it, so it can now enter mp_kdp_wait()
			 */
			handle_pending_TLB_flushes();
			cpu_pause();
		}
		/* If we've timed out, and some processor(s) are still unresponsive,
		 * interrupt them with an NMI via the local APIC, iff a panic is
		 * in progress.
		 */
		if (panic_active()) {
			NMIPI_enable(TRUE);
		}
		if (mp_kdp_ncpus != ncpus) {
			unsigned int wait_cycles = 0;
			if (proceed_on_failure) {
				paniclog_append_noflush("mp_kdp_enter() timed-out on cpu %d, NMI-ing\n", my_cpu);
			} else {
				DBG("mp_kdp_enter() timed-out on cpu %d, NMI-ing\n", my_cpu);
			}
			/* NMI only those cpus whose MP_KDP signal is still pending. */
			for (cpu = 0; cpu < real_ncpus; cpu++) {
				if (cpu == my_cpu || !cpu_is_running(cpu)) {
					continue;
				}
				if (cpu_signal_pending(cpu, MP_KDP)) {
					cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
					cpu_NMI_interrupt(cpu);
				}
			}
			/* Wait again for the same timeout */
			tsc_timeout = rdtsc64() + (LockTimeOutTSC);
			while (mp_kdp_ncpus != ncpus && rdtsc64() < tsc_timeout) {
				handle_pending_TLB_flushes();
				cpu_pause();
				++wait_cycles;
			}
			if (mp_kdp_ncpus != ncpus) {
				/* Record the holdouts; panic unless proceed_on_failure. */
				paniclog_append_noflush("mp_kdp_enter() NMI pending on cpus:");
				for (cpu = 0; cpu < real_ncpus; cpu++) {
					if (cpu_is_running(cpu) && !cpu_datap(cpu)->cpu_NMI_acknowledged) {
						paniclog_append_noflush(" %d", cpu);
					}
				}
				paniclog_append_noflush("\n");
				if (proceed_on_failure) {
					paniclog_append_noflush("mp_kdp_enter() timed-out during %s wait after NMI;"
					    "expected %u acks but received %lu after %u loops in %llu ticks\n",
					    (locked ? "locked" : "unlocked"), ncpus, mp_kdp_ncpus, wait_cycles, LockTimeOutTSC);
				} else {
					panic("mp_kdp_enter() timed-out during %s wait after NMI;"
					    "expected %u acks but received %lu after %u loops in %llu ticks",
					    (locked ? "locked" : "unlocked"), ncpus, mp_kdp_ncpus, wait_cycles, LockTimeOutTSC);
				}
			}
		}
	} else if (NMI_panic_reason != PTE_CORRUPTION) {  /* In the pte corruption case, the detecting CPU has already NMIed other CPUs */
		for (cpu = 0; cpu < real_ncpus; cpu++) {
			if (cpu == my_cpu || !cpu_is_running(cpu)) {
				continue;
			}
			cpu_NMI_interrupt(cpu);
		}
	}

	if (locked) {
		simple_unlock(&x86_topo_lock);
	}

	DBG("mp_kdp_enter() %d processors done %s\n",
	    (int)mp_kdp_ncpus, (mp_kdp_ncpus == ncpus) ? "OK" : "timed out");

	postcode(MP_KDP_ENTER);
}
1797 
1798 boolean_t
mp_kdp_all_cpus_halted()1799 mp_kdp_all_cpus_halted()
1800 {
1801 	unsigned int ncpus = 0, cpu = 0, my_cpu = 0;
1802 
1803 	my_cpu = cpu_number();
1804 	ncpus = 1; /* current CPU */
1805 	for (cpu = 0; cpu < real_ncpus; cpu++) {
1806 		if (cpu == my_cpu || !cpu_is_running(cpu)) {
1807 			continue;
1808 		}
1809 		ncpus++;
1810 	}
1811 
1812 	return mp_kdp_ncpus == ncpus;
1813 }
1814 
1815 static boolean_t
cpu_signal_pending(int cpu,mp_event_t event)1816 cpu_signal_pending(int cpu, mp_event_t event)
1817 {
1818 	volatile int    *signals = &cpu_datap(cpu)->cpu_signals;
1819 	boolean_t retval = FALSE;
1820 
1821 	if (i_bit(event, signals)) {
1822 		retval = TRUE;
1823 	}
1824 	return retval;
1825 }
1826 
/*
 * Debugger-context remote invocation: ask logical cpu 'lcpu' (spinning in
 * mp_kdp_wait()) to run func(arg0, arg1, lcpu) on our behalf.
 * 'timeout' is in absolute-time units; 0 means wait indefinitely.
 * Returns the function's result, or -1 for a bad cpu number or NULL func,
 * or if the call never completed (the default ret is -1).
 */
long
kdp_x86_xcpu_invoke(const uint16_t lcpu, kdp_x86_xcpu_func_t func,
    void *arg0, void *arg1, uint64_t timeout)
{
	uint64_t now;

	if (lcpu > (real_ncpus - 1)) {
		return -1;
	}

	if (func == NULL) {
		return -1;
	}

	/* Publish the request; .cpu is written last since the target polls it. */
	kdp_xcpu_call_func.func = func;
	kdp_xcpu_call_func.ret  = -1;
	kdp_xcpu_call_func.arg0 = arg0;
	kdp_xcpu_call_func.arg1 = arg1;
	kdp_xcpu_call_func.cpu  = lcpu;
	DBG("Invoking function %p on CPU %d\n", func, (int32_t)lcpu);
	now = mach_absolute_time();
	/* Target resets .cpu to KDP_XCPU_NONE when it has run the function. */
	while (kdp_xcpu_call_func.cpu != KDP_XCPU_NONE &&
	    (timeout == 0 || (mach_absolute_time() - now) < timeout)) {
		cpu_pause();
	}
	return kdp_xcpu_call_func.ret;
}
1854 
/*
 * Called by cpus spinning in mp_kdp_wait(): if this cpu is the target of a
 * pending kdp_x86_xcpu_invoke() request, run it and post the result.
 * Clearing .cpu (last) releases the waiting invoker.
 */
static void
kdp_x86_xcpu_poll(void)
{
	if ((uint16_t)cpu_number() == kdp_xcpu_call_func.cpu) {
		kdp_xcpu_call_func.ret =
		    kdp_xcpu_call_func.func(kdp_xcpu_call_func.arg0,
		    kdp_xcpu_call_func.arg1,
		    cpu_number());
		kdp_xcpu_call_func.cpu = KDP_XCPU_NONE;
	}
}
1866 
/*
 * Parking loop for non-debugger cpus while the debugger (or stackshot) is
 * active.  'flush': service pending TLB shootdowns while spinning.
 * 'isNMI': entered via NMI, in which case the loop never exits.
 */
static void
mp_kdp_wait(boolean_t flush, boolean_t isNMI)
{
	DBG("mp_kdp_wait()\n");

	current_cpu_datap()->debugger_ipi_time = mach_absolute_time();
#if CONFIG_MCA
	/* If we've trapped due to a machine-check, save MCA registers */
	mca_check_save();
#endif

	/* If this is a stackshot, setup the CPU state before signalling we've entered the debugger. */
	if (mp_kdp_is_stackshot) {
		stackshot_cpu_preflight();
	}

	/* Check in: the debugger cpu counts us via mp_kdp_ncpus. */
	atomic_incl((volatile long *)&mp_kdp_ncpus, 1);

	/* If this is a stackshot, join in on the fun. */
	if (mp_kdp_is_stackshot) {
		stackshot_aux_cpu_entry();
	}

	while (mp_kdp_trap || (isNMI == TRUE)) {
		/*
		 * A TLB shootdown request may be pending--this would result
		 * in the requesting processor waiting in PMAP_UPDATE_TLBS()
		 * until this processor handles it.
		 * Process it, so it can now enter mp_kdp_wait()
		 */
		if (flush) {
			handle_pending_TLB_flushes();
		}

		/* Service any cross-cpu request from the debugger cpu. */
		kdp_x86_xcpu_poll();
		cpu_pause();
	}

	/* Check out so mp_kdp_exit() knows we've resumed. */
	atomic_decl((volatile long *)&mp_kdp_ncpus, 1);
	DBG("mp_kdp_wait() done\n");
}
1908 
/*
 * Release all cpus held in mp_kdp_wait() and restore this (debugger) cpu's
 * pre-entry state: clears the trap flag, waits for peers to resume, exits
 * power-management-safe mode and restores the saved interrupt state.
 */
void
mp_kdp_exit(void)
{
	DBG("mp_kdp_exit()\n");
	debugger_cpu = -1;
	atomic_decl((volatile long *)&mp_kdp_ncpus, 1);

	debugger_exit_time = mach_absolute_time();

	mp_kdp_is_stackshot = false;
	mp_kdp_trap = FALSE;
	mfence();       /* make the trap-flag clear visible before waiting on peers */

	/* Wait other processors to stop spinning. XXX needs timeout */
	DBG("mp_kdp_exit() waiting for processors to resume\n");
	while (mp_kdp_ncpus > 0) {
		/*
		 * a TLB shootdown request may be pending... this would result in the requesting
		 * processor waiting in PMAP_UPDATE_TLBS() until this processor deals with it.
		 * Process it, so it can now enter mp_kdp_wait()
		 */
		handle_pending_TLB_flushes();

		cpu_pause();
	}

	if (pmsafe_debug && !kdp_snapshot) {
		pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
	}

	/* NOTE(review): second store to debugger_exit_time, overwriting the
	 * stamp taken above before the resume wait -- confirm which is intended. */
	debugger_exit_time = mach_absolute_time();

	DBG("mp_kdp_exit() done\n");
	(void) ml_set_interrupts_enabled(mp_kdp_state);
	postcode(MP_KDP_EXIT);
}
1945 
1946 #endif  /* MACH_KDP */
1947 
1948 boolean_t
mp_recent_debugger_activity(void)1949 mp_recent_debugger_activity(void)
1950 {
1951 	uint64_t abstime = mach_absolute_time();
1952 	return ((abstime - debugger_entry_time) < LastDebuggerEntryAllowance) ||
1953 	       ((abstime - debugger_exit_time) < LastDebuggerEntryAllowance);
1954 }
1955 
/*ARGSUSED*/
/* No per-processor AST-check initialization is needed on x86. */
void
init_ast_check(
	__unused processor_t    processor)
{
}
1962 
1963 void
cause_ast_check(processor_t processor)1964 cause_ast_check(
1965 	processor_t     processor)
1966 {
1967 	assert(processor != PROCESSOR_NULL);
1968 
1969 	int     cpu = processor->cpu_id;
1970 
1971 	if (cpu != cpu_number()) {
1972 		i386_signal_cpu(cpu, MP_AST, ASYNC);
1973 		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), cpu, 1, 0, 0, 0);
1974 	}
1975 }
1976 
/*
 * Re-initialize a cpu's machine state as it comes (back) up.
 * Runs in process context with interrupts disabled; cpu_machine_init()
 * re-enables them.
 */
void
machine_cpu_reinit(void *param)
{
	/*
	 * Here in process context, but with interrupts disabled.
	 */
	DBG("machine_cpu_reinit() CPU%d\n", get_cpu_number());

	if (param == FULL_SLAVE_INIT) {
		/*
		 * Cold start
		 */
		clock_init();
	}
	cpu_machine_init();     /* Interrupts enabled hereafter */
}
1993 
#undef cpu_number
/*
 * Out-of-line cpu_number() (the macro is #undef'd above so this definition
 * can exist): returns the current cpu's logical number.
 */
int
cpu_number(void)
{
	return get_cpu_number();
}
2000 
/* Per-cpu data base address for the current cpu. */
vm_offset_t
current_percpu_base(void)
{
	return get_current_percpu_base();
}
2006 
/* Per-cpu data base address for an arbitrary cpu. */
vm_offset_t
other_percpu_base(int cpu)
{
	return cpu_datap(cpu)->cpu_pcpu_base;
}
2012 
2013 static void
cpu_prewarm_init()2014 cpu_prewarm_init()
2015 {
2016 	int i;
2017 
2018 	simple_lock_init(&cpu_warm_lock, 0);
2019 	queue_init(&cpu_warm_call_list);
2020 	for (i = 0; i < NUM_CPU_WARM_CALLS; i++) {
2021 		enqueue_head(&cpu_warm_call_list, (queue_entry_t)&cpu_warm_call_arr[i]);
2022 	}
2023 }
2024 
2025 static timer_call_t
grab_warm_timer_call()2026 grab_warm_timer_call()
2027 {
2028 	spl_t x;
2029 	timer_call_t call = NULL;
2030 
2031 	x = splsched();
2032 	simple_lock(&cpu_warm_lock, LCK_GRP_NULL);
2033 	if (!queue_empty(&cpu_warm_call_list)) {
2034 		call = (timer_call_t) dequeue_head(&cpu_warm_call_list);
2035 	}
2036 	simple_unlock(&cpu_warm_lock);
2037 	splx(x);
2038 
2039 	return call;
2040 }
2041 
2042 static void
free_warm_timer_call(timer_call_t call)2043 free_warm_timer_call(timer_call_t call)
2044 {
2045 	spl_t x;
2046 
2047 	x = splsched();
2048 	simple_lock(&cpu_warm_lock, LCK_GRP_NULL);
2049 	enqueue_head(&cpu_warm_call_list, (queue_entry_t)call);
2050 	simple_unlock(&cpu_warm_lock);
2051 	splx(x);
2052 }
2053 
2054 /*
2055  * Runs in timer call context (interrupts disabled).
2056  */
2057 static void
cpu_warm_timer_call_func(timer_call_param_t p0,__unused timer_call_param_t p1)2058 cpu_warm_timer_call_func(
2059 	timer_call_param_t p0,
2060 	__unused timer_call_param_t p1)
2061 {
2062 	free_warm_timer_call((timer_call_t)p0);
2063 	return;
2064 }
2065 
2066 /*
2067  * Runs with interrupts disabled on the CPU we wish to warm (i.e. CPU 0).
2068  */
2069 static void
_cpu_warm_setup(void * arg)2070 _cpu_warm_setup(
2071 	void *arg)
2072 {
2073 	cpu_warm_data_t cwdp = (cpu_warm_data_t)arg;
2074 
2075 	timer_call_enter(cwdp->cwd_call, cwdp->cwd_deadline, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL);
2076 	cwdp->cwd_result = 0;
2077 
2078 	return;
2079 }
2080 
2081 /*
2082  * Not safe to call with interrupts disabled.
2083  */
2084 kern_return_t
ml_interrupt_prewarm(uint64_t deadline)2085 ml_interrupt_prewarm(
2086 	uint64_t        deadline)
2087 {
2088 	struct cpu_warm_data cwd;
2089 	timer_call_t call;
2090 	cpu_t ct;
2091 
2092 	if (ml_get_interrupts_enabled() == FALSE) {
2093 		panic("%s: Interrupts disabled?", __FUNCTION__);
2094 	}
2095 
2096 	/*
2097 	 * If the platform doesn't need our help, say that we succeeded.
2098 	 */
2099 	if (!ml_get_interrupt_prewake_applicable()) {
2100 		return KERN_SUCCESS;
2101 	}
2102 
2103 	/*
2104 	 * Grab a timer call to use.
2105 	 */
2106 	call = grab_warm_timer_call();
2107 	if (call == NULL) {
2108 		return KERN_RESOURCE_SHORTAGE;
2109 	}
2110 
2111 	timer_call_setup(call, cpu_warm_timer_call_func, call);
2112 	cwd.cwd_call = call;
2113 	cwd.cwd_deadline = deadline;
2114 	cwd.cwd_result = 0;
2115 
2116 	/*
2117 	 * For now, non-local interrupts happen on the master processor.
2118 	 */
2119 	ct = mp_cpus_call(cpu_to_cpumask(master_cpu), SYNC, _cpu_warm_setup, &cwd);
2120 	if (ct == 0) {
2121 		free_warm_timer_call(call);
2122 		return KERN_FAILURE;
2123 	} else {
2124 		return cwd.cwd_result;
2125 	}
2126 }
2127 
2128 #if DEBUG || DEVELOPMENT
2129 void
kernel_spin(uint64_t spin_ns)2130 kernel_spin(uint64_t spin_ns)
2131 {
2132 	boolean_t       istate;
2133 	uint64_t        spin_abs;
2134 	uint64_t        deadline;
2135 	cpu_data_t      *cdp;
2136 
2137 	kprintf("kernel_spin(%llu) spinning uninterruptibly\n", spin_ns);
2138 	istate = ml_set_interrupts_enabled(FALSE);
2139 	cdp = current_cpu_datap();
2140 	nanoseconds_to_absolutetime(spin_ns, &spin_abs);
2141 
2142 	/* Fake interrupt handler entry for testing mp_interrupt_watchdog() */
2143 	cdp->cpu_int_event_time = mach_absolute_time();
2144 	cdp->cpu_int_state = (void *) USER_STATE(current_thread());
2145 
2146 	deadline = mach_absolute_time() + spin_ns;
2147 	while (mach_absolute_time() < deadline) {
2148 		cpu_pause();
2149 	}
2150 
2151 	cdp->cpu_int_event_time = 0;
2152 	cdp->cpu_int_state = NULL;
2153 
2154 	ml_set_interrupts_enabled(istate);
2155 	kprintf("kernel_spin() continuing\n");
2156 }
2157 
2158 /*
2159  * Called from the scheduler's maintenance thread,
2160  * scan running processors for long-running ISRs and:
2161  *  - panic if longer than LockTimeOut, or
2162  *  - log if more than a quantum.
2163  */
2164 void
mp_interrupt_watchdog(void)2165 mp_interrupt_watchdog(void)
2166 {
2167 	cpu_t                   cpu;
2168 	boolean_t               intrs_enabled = FALSE;
2169 	uint16_t                cpu_int_num;
2170 	uint64_t                cpu_int_event_time;
2171 	uint64_t                cpu_rip;
2172 	uint64_t                cpu_int_duration;
2173 	uint64_t                now;
2174 	x86_saved_state_t       *cpu_int_state;
2175 
2176 	if (__improbable(!mp_interrupt_watchdog_enabled)) {
2177 		return;
2178 	}
2179 
2180 	intrs_enabled = ml_set_interrupts_enabled(FALSE);
2181 	now = mach_absolute_time();
2182 	/*
2183 	 * While timeouts are not suspended,
2184 	 * check all other processors for long outstanding interrupt handling.
2185 	 */
2186 	for (cpu = 0;
2187 	    cpu < (cpu_t) real_ncpus && !machine_timeout_suspended();
2188 	    cpu++) {
2189 		if ((cpu == (cpu_t) cpu_number()) ||
2190 		    (!cpu_is_running(cpu))) {
2191 			continue;
2192 		}
2193 		cpu_int_event_time = cpu_datap(cpu)->cpu_int_event_time;
2194 		if (cpu_int_event_time == 0) {
2195 			continue;
2196 		}
2197 		if (__improbable(now < cpu_int_event_time)) {
2198 			continue;       /* skip due to inter-processor skew */
2199 		}
2200 		cpu_int_state = cpu_datap(cpu)->cpu_int_state;
2201 		if (__improbable(cpu_int_state == NULL)) {
2202 			/* The interrupt may have been dismissed */
2203 			continue;
2204 		}
2205 
2206 		/* Here with a cpu handling an interrupt */
2207 
2208 		cpu_int_duration = now - cpu_int_event_time;
2209 		if (__improbable(cpu_int_duration > LockTimeOut)) {
2210 			cpu_int_num = saved_state64(cpu_int_state)->isf.trapno;
2211 			cpu_rip = saved_state64(cpu_int_state)->isf.rip;
2212 			vector_timed_out = cpu_int_num;
2213 			NMIPI_panic(cpu_to_cpumask(cpu), INTERRUPT_WATCHDOG);
2214 			panic("Interrupt watchdog, "
2215 			    "cpu: %d interrupt: 0x%x time: %llu..%llu state: %p RIP: 0x%llx",
2216 			    cpu, cpu_int_num, cpu_int_event_time, now, cpu_int_state, cpu_rip);
2217 			/* NOT REACHED */
2218 		} else if (__improbable(cpu_int_duration > (uint64_t) std_quantum)) {
2219 			mp_interrupt_watchdog_events++;
2220 			cpu_int_num = saved_state64(cpu_int_state)->isf.trapno;
2221 			cpu_rip = saved_state64(cpu_int_state)->isf.rip;
2222 			ml_set_interrupts_enabled(intrs_enabled);
2223 			printf("Interrupt watchdog, "
2224 			    "cpu: %d interrupt: 0x%x time: %llu..%llu RIP: 0x%llx\n",
2225 			    cpu, cpu_int_num, cpu_int_event_time, now, cpu_rip);
2226 			return;
2227 		}
2228 	}
2229 
2230 	ml_set_interrupts_enabled(intrs_enabled);
2231 }
2232 #endif
2233