xref: /xnu-8796.121.2/osfmk/i386/mp.c (revision c54f35ca767986246321eb901baf8f5ff7923f6a)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 
32 #include <mach_kdp.h>
33 #include <kdp/kdp_internal.h>
34 #include <mach_ldebug.h>
35 
36 #include <mach/mach_types.h>
37 #include <mach/kern_return.h>
38 
39 #include <kern/kern_types.h>
40 #include <kern/startup.h>
41 #include <kern/timer_queue.h>
42 #include <kern/processor.h>
43 #include <kern/cpu_number.h>
44 #include <kern/cpu_data.h>
45 #include <kern/assert.h>
46 #include <kern/lock_group.h>
47 #include <kern/machine.h>
48 #include <kern/pms.h>
49 #include <kern/misc_protos.h>
50 #include <kern/timer_call.h>
51 #include <kern/zalloc.h>
52 #include <kern/queue.h>
53 #include <prng/random.h>
54 
55 #include <vm/vm_map.h>
56 #include <vm/vm_kern.h>
57 
58 #include <i386/bit_routines.h>
59 #include <i386/proc_reg.h>
60 #include <i386/cpu_threads.h>
61 #include <i386/mp_desc.h>
62 #include <i386/misc_protos.h>
63 #include <i386/trap.h>
64 #include <i386/postcode.h>
65 #include <i386/machine_routines.h>
66 #include <i386/mp.h>
67 #include <i386/mp_events.h>
68 #include <i386/lapic.h>
69 #include <i386/cpuid.h>
70 #include <i386/fpu.h>
71 #include <i386/machine_cpu.h>
72 #include <i386/pmCPU.h>
73 #if CONFIG_MCA
74 #include <i386/machine_check.h>
75 #endif
76 #include <i386/acpi.h>
77 
78 #include <sys/kdebug.h>
79 
80 #include <console/serial_protos.h>
81 
82 #if MONOTONIC
83 #include <kern/monotonic.h>
84 #endif /* MONOTONIC */
85 
86 #if KPERF
87 #include <kperf/kptimer.h>
88 #endif /* KPERF */
89 
90 #if     MP_DEBUG
91 #define PAUSE           delay(1000000)
92 #define DBG(x...)       kprintf(x)
93 #else
94 #define DBG(x...)
95 #define PAUSE
96 #endif  /* MP_DEBUG */
97 
98 /* Debugging/test trace events: */
99 #define TRACE_MP_TLB_FLUSH              MACHDBG_CODE(DBG_MACH_MP, 0)
100 #define TRACE_MP_CPUS_CALL              MACHDBG_CODE(DBG_MACH_MP, 1)
101 #define TRACE_MP_CPUS_CALL_LOCAL        MACHDBG_CODE(DBG_MACH_MP, 2)
102 #define TRACE_MP_CPUS_CALL_ACTION       MACHDBG_CODE(DBG_MACH_MP, 3)
103 #define TRACE_MP_CPUS_CALL_NOBUF        MACHDBG_CODE(DBG_MACH_MP, 4)
104 #define TRACE_MP_CPU_FAST_START         MACHDBG_CODE(DBG_MACH_MP, 5)
105 #define TRACE_MP_CPU_START              MACHDBG_CODE(DBG_MACH_MP, 6)
106 #define TRACE_MP_CPU_DEACTIVATE         MACHDBG_CODE(DBG_MACH_MP, 7)
107 
108 #define ABS(v)          (((v) > 0)?(v):-(v))
109 
110 void            slave_boot_init(void);
111 void            i386_cpu_IPI(int cpu);
112 
113 #if MACH_KDP
114 static void     mp_kdp_wait(boolean_t flush, boolean_t isNMI);
115 #endif /* MACH_KDP */
116 
117 #if MACH_KDP
118 static boolean_t        cpu_signal_pending(int cpu, mp_event_t event);
119 #endif /* MACH_KDP */
120 static int              NMIInterruptHandler(x86_saved_state_t *regs);
121 
122 boolean_t               smp_initialized = FALSE;
123 uint32_t                TSC_sync_margin = 0xFFF;
124 volatile boolean_t      force_immediate_debugger_NMI = FALSE;
125 volatile boolean_t      pmap_tlb_flush_timeout = FALSE;
126 #if DEBUG || DEVELOPMENT
127 boolean_t               mp_interrupt_watchdog_enabled = TRUE;
128 uint32_t                mp_interrupt_watchdog_events = 0;
129 #endif
130 
131 SIMPLE_LOCK_DECLARE(debugger_callback_lock, 0);
132 struct debugger_callback *debugger_callback = NULL;
133 
134 static LCK_GRP_DECLARE(smp_lck_grp, "i386_smp");
135 static LCK_MTX_DECLARE(mp_cpu_boot_lock, &smp_lck_grp);
136 
137 /* Variables needed for MP rendezvous. */
138 SIMPLE_LOCK_DECLARE(mp_rv_lock, 0);
139 static void     (*mp_rv_setup_func)(void *arg);
140 static void     (*mp_rv_action_func)(void *arg);
141 static void     (*mp_rv_teardown_func)(void *arg);
142 static void     *mp_rv_func_arg;
143 static volatile int     mp_rv_ncpus;
144 /* Cache-aligned barriers: */
145 static volatile long    mp_rv_entry    __attribute__((aligned(64)));
146 static volatile long    mp_rv_exit     __attribute__((aligned(64)));
147 static volatile long    mp_rv_complete __attribute__((aligned(64)));
148 
149 volatile        uint64_t        debugger_entry_time;
150 volatile        uint64_t        debugger_exit_time;
151 #if MACH_KDP
152 #include <kdp/kdp.h>
153 extern int kdp_snapshot;
154 static struct _kdp_xcpu_call_func {
155 	kdp_x86_xcpu_func_t func;
156 	void     *arg0, *arg1;
157 	volatile long     ret;
158 	volatile uint16_t cpu;
159 } kdp_xcpu_call_func = {
160 	.cpu  = KDP_XCPU_NONE
161 };
162 
163 #endif
164 
165 /* Variables needed for MP broadcast. */
166 static void        (*mp_bc_action_func)(void *arg);
167 static void        *mp_bc_func_arg;
168 static int      mp_bc_ncpus;
169 static volatile long   mp_bc_count;
170 static LCK_MTX_DECLARE(mp_bc_lock, &smp_lck_grp);
171 static  volatile int    debugger_cpu = -1;
172 volatile long    NMIPI_acks = 0;
173 volatile long    NMI_count = 0;
174 static int              vector_timed_out;
175 
176 NMI_reason_t    NMI_panic_reason = NONE;
177 extern void     NMI_cpus(void);
178 
179 static void     mp_cpus_call_init(void);
180 static void     mp_cpus_call_action(void);
181 static void     mp_call_PM(void);
182 
183 char            mp_slave_stack[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); // Temp stack for slave init
184 
185 /* PAL-related routines */
186 boolean_t i386_smp_init(int nmi_vector, i386_intr_func_t nmi_handler,
187     int ipi_vector, i386_intr_func_t ipi_handler);
188 void i386_start_cpu(int lapic_id, int cpu_num);
189 void i386_send_NMI(int cpu);
190 void NMIPI_enable(boolean_t);
191 
/*
 * Pool of timer_call structures and bookkeeping for the CPU "prewarm"
 * mechanism (routines declared below, defined later in this file).
 * The free list (cpu_warm_call_list) is guarded by cpu_warm_lock.
 */
#define NUM_CPU_WARM_CALLS      20
struct timer_call       cpu_warm_call_arr[NUM_CPU_WARM_CALLS];
queue_head_t            cpu_warm_call_list;
decl_simple_lock_data(static, cpu_warm_lock);

/* Parameter block handed to the warm-timer setup cross-call. */
typedef struct cpu_warm_data {
	timer_call_t    cwd_call;       /* timer_call taken from the pool */
	uint64_t        cwd_deadline;   /* absolute deadline to arm */
	int             cwd_result;     /* status reported back to caller */
} *cpu_warm_data_t;

static void             cpu_prewarm_init(void);
static void             cpu_warm_timer_call_func(timer_call_param_t p0, timer_call_param_t p1);
static void             _cpu_warm_setup(void *arg);
static timer_call_t     grab_warm_timer_call(void);
static void             free_warm_timer_call(timer_call_t call);
208 
/*
 * One-time SMP bring-up for the boot processor: installs the NMI and
 * interprocessor interrupt handlers, initializes per-cpu threading and
 * cross-call state, and processes related boot-args.  If the low-level
 * PAL hook fails, the system simply stays uniprocessor.
 */
void
smp_init(void)
{
	console_init();

	/* Install NMI and IPI vectors; bail (uniprocessor) on failure. */
	if (!i386_smp_init(LAPIC_NMI_INTERRUPT, NMIInterruptHandler,
	    LAPIC_VECTOR(INTERPROCESSOR), cpu_signal_handler)) {
		return;
	}

	cpu_thread_init();

	DBGLOG_CPU_INIT(master_cpu);

	/* Set up the cross-call (mp_cpus_call) machinery for the boot cpu. */
	mp_cpus_call_init();
	mp_cpus_call_cpu_init(master_cpu);

#if DEBUG || DEVELOPMENT
	/* Boot-arg override for the interrupt watchdog. */
	if (PE_parse_boot_argn("interrupt_watchdog",
	    &mp_interrupt_watchdog_enabled,
	    sizeof(mp_interrupt_watchdog_enabled))) {
		kprintf("Interrupt watchdog %sabled\n",
		    mp_interrupt_watchdog_enabled ? "en" : "dis");
	}
#endif

	/*
	 * TSC_sync_margin may be overridden by boot-arg; when running
	 * under a VMM (and no boot-arg is given) the check is disabled.
	 */
	if (PE_parse_boot_argn("TSC_sync_margin",
	    &TSC_sync_margin, sizeof(TSC_sync_margin))) {
		kprintf("TSC sync Margin 0x%x\n", TSC_sync_margin);
	} else if (cpuid_vmm_present()) {
		kprintf("TSC sync margin disabled\n");
		TSC_sync_margin = 0;
	}
	smp_initialized = TRUE;

	cpu_prewarm_init();

	return;
}
248 
249 typedef struct {
250 	int                     target_cpu;
251 	int                     target_lapic;
252 	int                     starter_cpu;
253 } processor_start_info_t;
254 static processor_start_info_t   start_info        __attribute__((aligned(64)));
255 
256 /*
257  * Cache-alignment is to avoid cross-cpu false-sharing interference.
258  */
259 static volatile long            tsc_entry_barrier __attribute__((aligned(64)));
260 static volatile long            tsc_exit_barrier  __attribute__((aligned(64)));
261 static volatile uint64_t        tsc_target        __attribute__((aligned(64)));
262 
263 /*
264  * Poll a CPU to see when it has marked itself as running.
265  */
266 static void
mp_wait_for_cpu_up(int slot_num,unsigned int iters,unsigned int usecdelay)267 mp_wait_for_cpu_up(int slot_num, unsigned int iters, unsigned int usecdelay)
268 {
269 	while (iters-- > 0) {
270 		if (cpu_datap(slot_num)->cpu_running) {
271 			break;
272 		}
273 		delay(usecdelay);
274 	}
275 }
276 
/*
 * Quickly bring a CPU back online which has been halted.
 * Returns the pmCPUExitHalt() error if the CPU was not eligible for a
 * fast restart; otherwise KERN_SUCCESS if the CPU is observed running
 * again within the polling window, KERN_FAILURE if not.
 */
kern_return_t
intel_startCPU_fast(int slot_num)
{
	kern_return_t   rc;

	/*
	 * Try to perform a fast restart
	 */
	rc = pmCPUExitHalt(slot_num);
	if (rc != KERN_SUCCESS) {
		/*
		 * The CPU was not eligible for a fast restart.
		 */
		return rc;
	}

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_FAST_START | DBG_FUNC_START,
		slot_num, 0, 0, 0, 0);

	/*
	 * Wait until the CPU is back online.
	 */
	mp_disable_preemption();

	/*
	 * We use short pauses (1us) for low latency.  30,000 iterations is
	 * longer than a full restart would require so it should be more
	 * than long enough.
	 */

	mp_wait_for_cpu_up(slot_num, 30000, 1);
	mp_enable_preemption();

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_FAST_START | DBG_FUNC_END,
		slot_num, cpu_datap(slot_num)->cpu_running, 0, 0, 0);

	/*
	 * Check to make sure that the CPU is really running.  If not,
	 * go through the slow path.
	 */
	if (cpu_datap(slot_num)->cpu_running) {
		return KERN_SUCCESS;
	} else {
		return KERN_FAILURE;
	}
}
328 
/*
 * Target-side half of the TSC synchronization check (the starter side
 * lives in start_cpu()).  Runs on the newly-started cpu once it has set
 * cpu_running; meets the starter at tsc_entry_barrier, snaps its TSC
 * into tsc_target, then signals completion via tsc_exit_barrier.
 */
static void
started_cpu(void)
{
	/* Here on the started cpu with cpu_running set TRUE */

	if (TSC_sync_margin &&
	    start_info.target_cpu == cpu_number()) {
		/*
		 * I've just started-up, synchronize again with the starter cpu
		 * and then snap my TSC.
		 */
		tsc_target   = 0;
		atomic_decl(&tsc_entry_barrier, 1);
		while (tsc_entry_barrier != 0) {
			;       /* spin for starter and target at barrier */
		}
		tsc_target = rdtsc64();
		atomic_decl(&tsc_exit_barrier, 1);
	}
}
349 
/*
 * Rendezvous action used by intel_startCPU(): the designated starter cpu
 * kicks the target via its lapic, waits for it to come up, and then (if
 * TSC_sync_margin is non-zero) rendezvous with it through the tsc_*
 * barriers to compare TSC values.  All other cpus return immediately.
 */
static void
start_cpu(void *arg)
{
	int                     i = 1000;
	processor_start_info_t  *psip = (processor_start_info_t *) arg;

	/* Ignore this if the current processor is not the starter */
	if (cpu_number() != psip->starter_cpu) {
		return;
	}

	DBG("start_cpu(%p) about to start cpu %d, lapic %d\n",
	    arg, psip->target_cpu, psip->target_lapic);

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_START | DBG_FUNC_START,
		psip->target_cpu,
		psip->target_lapic, 0, 0, 0);

	i386_start_cpu(psip->target_lapic, psip->target_cpu);

#ifdef  POSTCODE_DELAY
	/* Wait much longer if postcodes are displayed for a delay period. */
	i *= 10000;
#endif
	DBG("start_cpu(%p) about to wait for cpu %d\n",
	    arg, psip->target_cpu);

	/* Poll for cpu_running: i*100 probes, 100us apart. */
	mp_wait_for_cpu_up(psip->target_cpu, i * 100, 100);

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_START | DBG_FUNC_END,
		psip->target_cpu,
		cpu_datap(psip->target_cpu)->cpu_running, 0, 0, 0);

	if (TSC_sync_margin &&
	    cpu_datap(psip->target_cpu)->cpu_running) {
		/*
		 * Compare the TSC from the started processor with ours.
		 * Report and log/panic if it diverges by more than
		 * TSC_sync_margin (TSC_SYNC_MARGIN) ticks. This margin
		 * can be overridden by boot-arg (with 0 meaning no checking).
		 */
		uint64_t        tsc_starter;
		int64_t         tsc_delta;
		atomic_decl(&tsc_entry_barrier, 1);
		while (tsc_entry_barrier != 0) {
			;       /* spin for both processors at barrier */
		}
		tsc_starter = rdtsc64();
		atomic_decl(&tsc_exit_barrier, 1);
		while (tsc_exit_barrier != 0) {
			;       /* spin for target to store its TSC */
		}
		tsc_delta = tsc_target - tsc_starter;
		kprintf("TSC sync for cpu %d: 0x%016llx delta 0x%llx (%lld)\n",
		    psip->target_cpu, tsc_target, tsc_delta, tsc_delta);
#if DEBUG || DEVELOPMENT
		/*
		 * Stash the delta for inspection later, since we can no
		 * longer print/log it with interrupts disabled.
		 */
		cpu_datap(psip->target_cpu)->tsc_sync_delta = tsc_delta;
#endif
		/* Out-of-margin delta: panic on DEBUG, just log otherwise. */
		if (ABS(tsc_delta) > (int64_t) TSC_sync_margin) {
#if DEBUG
			panic(
#else
			kprintf(
#endif
				"Unsynchronized  TSC for cpu %d: "
				"0x%016llx, delta 0x%llx\n",
				psip->target_cpu, tsc_target, tsc_delta);
		}
	}
}
426 
/*
 * Slow-path processor start: (re)initialize the target cpu's descriptor
 * tables, then perform the lapic startup sequence under an all-cpu
 * rendezvous.  If the target never marks itself running, this cpu logs
 * the failure and halts itself.
 */
kern_return_t
intel_startCPU(
	int     slot_num)
{
	int             lapic = cpu_to_lapic[slot_num];
	boolean_t       istate;

	assert(lapic != -1);

	DBGLOG_CPU_INIT(slot_num);

	DBG("intel_startCPU(%d) lapic_id=%d\n", slot_num, lapic);
	DBG("IdlePTD(%p): 0x%x\n", &IdlePTD, (int) (uintptr_t)IdlePTD);

	/*
	 * Initialize (or re-initialize) the descriptor tables for this cpu.
	 * Propagate processor mode to slave.
	 */
	cpu_desc_init(cpu_datap(slot_num));

	/* Serialize use of the slave boot stack, etc. */
	lck_mtx_lock(&mp_cpu_boot_lock);

	istate = ml_set_interrupts_enabled(FALSE);
	/* Starting the cpu we're already on is a no-op. */
	if (slot_num == get_cpu_number()) {
		ml_set_interrupts_enabled(istate);
		lck_mtx_unlock(&mp_cpu_boot_lock);
		return KERN_SUCCESS;
	}

	start_info.starter_cpu  = cpu_number();
	start_info.target_cpu   = slot_num;
	start_info.target_lapic = lapic;
	/* Two participants (starter + target) at each TSC sync barrier. */
	tsc_entry_barrier = 2;
	tsc_exit_barrier = 2;

	/*
	 * Perform the processor startup sequence with all running
	 * processors rendezvous'ed. This is required during periods when
	 * the cache-disable bit is set for MTRR/PAT initialization.
	 */
	mp_rendezvous_no_intrs(start_cpu, (void *) &start_info);

	start_info.target_cpu = 0;

	ml_set_interrupts_enabled(istate);
	lck_mtx_unlock(&mp_cpu_boot_lock);

	if (!cpu_datap(slot_num)->cpu_running) {
		kprintf("Failed to start CPU %02d\n", slot_num);
		printf("Failed to start CPU %02d, rebooting...\n", slot_num);
		delay(1000000);
		halt_cpu();
		return KERN_SUCCESS;
	} else {
		kprintf("Started cpu %d (lapic id %08x)\n", slot_num, lapic);
		return KERN_SUCCESS;
	}
}
486 
487 #if     MP_DEBUG
488 cpu_signal_event_log_t  *cpu_signal[MAX_CPUS];
489 cpu_signal_event_log_t  *cpu_handle[MAX_CPUS];
490 
491 MP_EVENT_NAME_DECL();
492 
493 #endif  /* MP_DEBUG */
494 
/*
 * Service pending cross-cpu signal bits for this cpu: debugger entry
 * (MP_KDP), TLB shootdowns, cross-calls, PM calls and ASTs.  Loops until
 * this cpu's signal word drains.
 * Note: called with NULL state when polling for TLB flush and cross-calls;
 * in that mode only one pass is made and MP_AST is not handled.
 * Always returns 0.
 */
int
cpu_signal_handler(x86_saved_state_t *regs)
{
#if     !MACH_KDP
#pragma unused (regs)
#endif /* !MACH_KDP */
	int             my_cpu;
	volatile int    *my_word;

	SCHED_STATS_INC(ipi_count);

	my_cpu = cpu_number();
	my_word = &cpu_data_ptr[my_cpu]->cpu_signals;
	/* Store the initial set of signals for diagnostics. New
	 * signals could arrive while these are being processed
	 * so it's no more than a hint.
	 */

	cpu_data_ptr[my_cpu]->cpu_prior_signals = *my_word;

	do {
#if     MACH_KDP
		/* Debugger entry takes priority over all other signals. */
		if (i_bit(MP_KDP, my_word)) {
			DBGLOG(cpu_handle, my_cpu, MP_KDP);
			i_bit_clear(MP_KDP, my_word);
/* Ensure that the i386_kernel_state at the base of the
 * current thread's stack (if any) is synchronized with the
 * context at the moment of the interrupt, to facilitate
 * access through the debugger.
 */
			sync_iss_to_iks(regs);
			if (pmsafe_debug && !kdp_snapshot) {
				pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
			}
			mp_kdp_wait(TRUE, FALSE);
			if (pmsafe_debug && !kdp_snapshot) {
				pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
			}
		} else
#endif  /* MACH_KDP */
		if (i_bit(MP_TLB_FLUSH, my_word)) {
			DBGLOG(cpu_handle, my_cpu, MP_TLB_FLUSH);
			i_bit_clear(MP_TLB_FLUSH, my_word);
			pmap_update_interrupt();
		} else if (i_bit(MP_CALL, my_word)) {
			DBGLOG(cpu_handle, my_cpu, MP_CALL);
			i_bit_clear(MP_CALL, my_word);
			mp_cpus_call_action();
		} else if (i_bit(MP_CALL_PM, my_word)) {
			DBGLOG(cpu_handle, my_cpu, MP_CALL_PM);
			i_bit_clear(MP_CALL_PM, my_word);
			mp_call_PM();
		}
		if (regs == NULL) {
			/* Called to poll only for cross-calls and TLB flush */
			break;
		} else if (i_bit(MP_AST, my_word)) {
			DBGLOG(cpu_handle, my_cpu, MP_AST);
			i_bit_clear(MP_AST, my_word);
			ast_check(cpu_to_processor(my_cpu));
		}
	} while (*my_word);

	return 0;
}
563 
/*
 * Cross-cpu NMI callback invoked when PTE corruption has been detected:
 * panics with a backtrace that reports the corrupted PTE's address and
 * current value.  Always "returns" 0 (panic does not return).
 */
long
NMI_pte_corruption_callback(__unused void *arg0, __unused void *arg1, uint16_t lcpu)
{
	static char     pstr[256];      /* global since this callback is serialized */
	void            *stackptr;
	/* Capture this frame's base pointer for the backtracer. */
	__asm__ volatile ("movq %%rbp, %0" : "=m" (stackptr));

	snprintf(&pstr[0], sizeof(pstr),
	    "Panic(CPU %d): PTE corruption detected on PTEP 0x%llx VAL 0x%llx\n",
	    lcpu, (unsigned long long)(uintptr_t)PTE_corrupted_ptr, *(uint64_t *)PTE_corrupted_ptr);
	panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, current_cpu_datap()->cpu_int_state);
	return 0;
}
577 
extern void kprintf_break_lock(void);
/*
 * NMI handler for all cpus.  NMIPIs are used to panic unresponsive
 * processors (spinlock, TLB-flush and cross-call timeouts, interrupt
 * watchdog) and to gather cpus when entering the debugger.
 * Always returns 1 (handled).
 */
int
NMIInterruptHandler(x86_saved_state_t *regs)
{
	void            *stackptr;
	char            pstr[256];
	uint64_t        now = mach_absolute_time();

	/* Panic in progress with debugging disabled: just park this cpu. */
	if (panic_active() && !panicDebugging) {
		if (pmsafe_debug) {
			pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
		}
		for (;;) {
			cpu_pause();
		}
	}

	atomic_incl(&NMIPI_acks, 1);
	atomic_incl(&NMI_count, 1);
	/* Save this cpu's register context for the debugger, then grab %rbp
	 * so the backtracer can walk this stack. */
	sync_iss_to_iks_unconditionally(regs);
	__asm__ volatile ("movq %%rbp, %0" : "=m" (stackptr));

	/* The cpu driving the debugger ignores its own NMI. */
	if (cpu_number() == debugger_cpu) {
		goto NMExit;
	}

	/* If this NMIPI was sent to flag a hang, panic with a
	 * reason-specific message and backtrace. */
	if (NMI_panic_reason == SPINLOCK_TIMEOUT) {
		lck_spinlock_to_info_t lsti;

		lsti = os_atomic_load(&lck_spinlock_timeout_in_progress, acquire);
		snprintf(&pstr[0], sizeof(pstr),
		    "Panic(CPU %d, time %llu): NMIPI for spinlock acquisition timeout, spinlock: %p, "
		    "spinlock owner: %p, current_thread: %p, spinlock_owner_cpu: 0x%x\n",
		    cpu_number(), now, lsti->lock, (void *)lsti->owner_thread_cur,
		    current_thread(), lsti->owner_cpu);
		panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs);
	} else if (NMI_panic_reason == TLB_FLUSH_TIMEOUT) {
		snprintf(&pstr[0], sizeof(pstr),
		    "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: TLB flush timeout, TLB state:0x%x\n",
		    cpu_number(), now, current_cpu_datap()->cpu_tlb_invalid);
		panic_i386_backtrace(stackptr, 48, &pstr[0], TRUE, regs);
	} else if (NMI_panic_reason == CROSSCALL_TIMEOUT) {
		snprintf(&pstr[0], sizeof(pstr),
		    "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: cross-call timeout\n",
		    cpu_number(), now);
		panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs);
	} else if (NMI_panic_reason == INTERRUPT_WATCHDOG) {
		snprintf(&pstr[0], sizeof(pstr),
		    "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: interrupt watchdog for vector 0x%x\n",
		    cpu_number(), now, vector_timed_out);
		panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs);
	}

#if MACH_KDP
	if (pmsafe_debug && !kdp_snapshot) {
		pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
	}
	/* Acknowledge the NMI and cancel any pending MP_KDP signal,
	 * since we're about to wait for the debugger anyway. */
	current_cpu_datap()->cpu_NMI_acknowledged = TRUE;
	i_bit_clear(MP_KDP, &current_cpu_datap()->cpu_signals);
	if (panic_active() || NMI_panic_reason != NONE) {
		mp_kdp_wait(FALSE, TRUE);
	} else if (!mp_kdp_trap &&
	    !mp_kdp_is_NMI &&
	    virtualized && (debug_boot_arg & DB_NMI)) {
		/*
		 * Under a VMM with the debug boot-arg set, drop into kdp.
		 * Since an NMI is involved, there's a risk of contending with
		 * a panic. And side-effects of NMIs may result in entry into,
		 * and continuing from, the debugger being unreliable.
		 */
		if (__sync_bool_compare_and_swap(&mp_kdp_is_NMI, FALSE, TRUE)) {
			kprintf_break_lock();

			DebuggerWithContext(EXC_BREAKPOINT, saved_state64(regs),
			    "requested by NMI", DEBUGGER_OPTION_NONE,
			    (unsigned long)(char *)__builtin_return_address(0));

			mp_kdp_is_NMI = FALSE;
		} else {
			mp_kdp_wait(FALSE, FALSE);
		}
	} else {
		mp_kdp_wait(FALSE, FALSE);
	}
	if (pmsafe_debug && !kdp_snapshot) {
		pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
	}
#endif
NMExit:
	return 1;
}
669 
670 /*
671  * cpu_interrupt is really just to be used by the scheduler to
672  * get a CPU's attention it may not always issue an IPI.  If an
673  * IPI is always needed then use i386_cpu_IPI.
674  */
675 void
cpu_interrupt(int cpu)676 cpu_interrupt(int cpu)
677 {
678 	boolean_t did_IPI = FALSE;
679 
680 	if (smp_initialized
681 	    && pmCPUExitIdle(cpu_datap(cpu))) {
682 		i386_cpu_IPI(cpu);
683 		did_IPI = TRUE;
684 	}
685 
686 	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), cpu, did_IPI, 0, 0, 0);
687 }
688 
689 /*
690  * Send a true NMI via the local APIC to the specified CPU.
691  */
692 void
cpu_NMI_interrupt(int cpu)693 cpu_NMI_interrupt(int cpu)
694 {
695 	if (smp_initialized) {
696 		i386_send_NMI(cpu);
697 	}
698 }
699 
/*
 * Send an NMI to every running cpu in turn and wait for each to
 * acknowledge it (cpu_NMI_acknowledged is set by NMIInterruptHandler).
 * Panics if an acknowledgement does not arrive within ~10^10 TSC ticks
 * (no timeout when machine timeouts are suspended).  Runs with
 * interrupts disabled; NMIPIs are enabled only for the duration.
 */
void
NMI_cpus(void)
{
	unsigned int    cpu;
	boolean_t       intrs_enabled;
	uint64_t        tsc_timeout;

	intrs_enabled = ml_set_interrupts_enabled(FALSE);
	NMIPI_enable(TRUE);
	for (cpu = 0; cpu < real_ncpus; cpu++) {
		if (!cpu_is_running(cpu)) {
			continue;
		}
		cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
		cpu_NMI_interrupt(cpu);
		tsc_timeout = !machine_timeout_suspended() ?
		    rdtsc64() + (1000 * 1000 * 1000 * 10ULL) :
		    ~0ULL;
		while (!cpu_datap(cpu)->cpu_NMI_acknowledged) {
			/* Keep servicing shootdowns while we spin. */
			handle_pending_TLB_flushes();
			cpu_pause();
			if (rdtsc64() > tsc_timeout) {
				panic("NMI_cpus() timeout cpu %d", cpu);
			}
		}
		cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
	}
	NMIPI_enable(FALSE);

	ml_set_interrupts_enabled(intrs_enabled);
}
731 
732 static void(*volatile mp_PM_func)(void) = NULL;
733 
734 static void
mp_call_PM(void)735 mp_call_PM(void)
736 {
737 	assert(!ml_get_interrupts_enabled());
738 
739 	if (mp_PM_func != NULL) {
740 		mp_PM_func();
741 	}
742 }
743 
744 void
cpu_PM_interrupt(int cpu)745 cpu_PM_interrupt(int cpu)
746 {
747 	assert(!ml_get_interrupts_enabled());
748 
749 	if (mp_PM_func != NULL) {
750 		if (cpu == cpu_number()) {
751 			mp_PM_func();
752 		} else {
753 			i386_signal_cpu(cpu, MP_CALL_PM, ASYNC);
754 		}
755 	}
756 }
757 
/*
 * Register the callback invoked for power-management interprocessor
 * calls (see mp_call_PM() and cpu_PM_interrupt()).
 */
void
PM_interrupt_register(void (*fn)(void))
{
	mp_PM_func = fn;
}
763 
/*
 * Set the given event bit in the target cpu's signal word and send it
 * an IPI.  In SYNC mode, spin until the target clears the bit,
 * re-arming the TSC-based wait after each timeout (with a debug log);
 * in ASYNC mode return immediately.  No-op if the target isn't running.
 */
void
i386_signal_cpu(int cpu, mp_event_t event, mp_sync_t mode)
{
	volatile int    *signals = &cpu_datap(cpu)->cpu_signals;
	uint64_t        tsc_timeout;


	if (!cpu_datap(cpu)->cpu_running) {
		return;
	}

	if (event == MP_TLB_FLUSH) {
		KERNEL_DEBUG(TRACE_MP_TLB_FLUSH | DBG_FUNC_START, cpu, 0, 0, 0, 0);
	}

	DBGLOG(cpu_signal, cpu, event);

	i_bit_set(event, signals);
	i386_cpu_IPI(cpu);
	if (mode == SYNC) {
again:
		/* Wait up to 10^9 TSC ticks per attempt (forever if
		 * machine timeouts are suspended). */
		tsc_timeout = !machine_timeout_suspended() ?
		    rdtsc64() + (1000 * 1000 * 1000) :
		    ~0ULL;
		while (i_bit(event, signals) && rdtsc64() < tsc_timeout) {
			cpu_pause();
		}
		/* Not acknowledged in time: log and keep waiting. */
		if (i_bit(event, signals)) {
			DBG("i386_signal_cpu(%d, 0x%x, SYNC) timed out\n",
			    cpu, event);
			goto again;
		}
	}
	if (event == MP_TLB_FLUSH) {
		KERNEL_DEBUG(TRACE_MP_TLB_FLUSH | DBG_FUNC_END, cpu, 0, 0, 0, 0);
	}
}
801 
802 /*
803  * Helper function called when busy-waiting: panic if too long
804  * a TSC-based time has elapsed since the start of the spin.
805  */
806 static boolean_t
mp_spin_timeout(uint64_t tsc_start)807 mp_spin_timeout(uint64_t tsc_start)
808 {
809 	uint64_t        tsc_timeout;
810 
811 	cpu_pause();
812 	if (machine_timeout_suspended()) {
813 		return FALSE;
814 	}
815 
816 	/*
817 	 * The timeout is 4 * the spinlock timeout period
818 	 * unless we have serial console printing (kprintf) enabled
819 	 * in which case we allow an even greater margin.
820 	 */
821 	tsc_timeout = disable_serial_output ? LockTimeOutTSC << 2
822 	        : LockTimeOutTSC << 4;
823 	return rdtsc64() > tsc_start + tsc_timeout;
824 }
825 
/*
 * Helper function to take a spinlock while ensuring that incoming IPIs
 * are still serviced if interrupts are masked while we spin.
 * Panics (after NMI'ing the owner) if the lock cannot be taken within
 * the mp_spin_timeout() allowance.
 * Returns the interrupt state at entry (TRUE if interrupts were enabled).
 */
boolean_t
mp_safe_spin_lock(usimple_lock_t lock)
{
	/* Interrupts enabled: IPIs will be serviced normally, so a plain
	 * blocking acquisition is safe. */
	if (ml_get_interrupts_enabled()) {
		simple_lock(lock, LCK_GRP_NULL);
		return TRUE;
	}

	lck_spinlock_to_info_t lsti;
	uint64_t tsc_spin_start = rdtsc64();

	while (!simple_lock_try(lock, LCK_GRP_NULL)) {
		/* Poll for cross-calls/TLB flushes while we can't take IPIs. */
		cpu_signal_handler(NULL);
		if (mp_spin_timeout(tsc_spin_start)) {
			uintptr_t lowner = (uintptr_t)lock->interlock.lock_data;

			lsti = lck_spinlock_timeout_hit(lock, lowner);
			/* NMI the owner so its state is captured, then panic. */
			NMIPI_panic(cpu_to_cpumask(lsti->owner_cpu), SPINLOCK_TIMEOUT);
			panic("mp_safe_spin_lock() timed out, lock: %p, "
			    "owner thread: 0x%lx, current_thread: %p, "
			    "owner on CPU 0x%x, time: %llu",
			    lock, lowner, current_thread(),
			    lsti->owner_cpu, mach_absolute_time());
		}
	}

	return FALSE;
}
859 
860 /*
861  * All-CPU rendezvous:
862  *      - CPUs are signalled,
863  *	- all execute the setup function (if specified),
864  *	- rendezvous (i.e. all cpus reach a barrier),
865  *	- all execute the action function (if specified),
866  *	- rendezvous again,
867  *	- execute the teardown function (if specified), and then
868  *	- resume.
869  *
870  * Note that the supplied external functions _must_ be reentrant and aware
871  * that they are running in parallel and in an unknown lock context.
872  */
873 
/*
 * Per-cpu body of the rendezvous: run setup, meet everyone at the entry
 * barrier, run the action, meet at the exit barrier, run teardown, and
 * finally bump the completion count the initiator is waiting on.
 * Panics if either barrier is not reached by all cpus within the
 * mp_spin_timeout() allowance.
 */
static void
mp_rendezvous_action(__unused void *null)
{
	boolean_t       intrs_enabled;
	uint64_t        tsc_spin_start;

	/*
	 * Note that mp_rv_lock was acquired by the thread that initiated the
	 * rendezvous and must have been acquired before we enter
	 * mp_rendezvous_action().
	 */
	current_cpu_datap()->cpu_rendezvous_in_progress = TRUE;

	/* setup function */
	if (mp_rv_setup_func != NULL) {
		mp_rv_setup_func(mp_rv_func_arg);
	}

	intrs_enabled = ml_get_interrupts_enabled();

	/* spin on entry rendezvous */
	atomic_incl(&mp_rv_entry, 1);
	tsc_spin_start = rdtsc64();

	while (mp_rv_entry < mp_rv_ncpus) {
		/* poll for pesky tlb flushes if interrupts disabled */
		if (!intrs_enabled) {
			handle_pending_TLB_flushes();
		}
		if (mp_spin_timeout(tsc_spin_start)) {
			panic("mp_rv_action() entry: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_entry, mp_rv_ncpus, tsc_spin_start, rdtsc64());
		}
	}

	/* action function */
	if (mp_rv_action_func != NULL) {
		mp_rv_action_func(mp_rv_func_arg);
	}

	/* spin on exit rendezvous */
	atomic_incl(&mp_rv_exit, 1);
	tsc_spin_start = rdtsc64();
	while (mp_rv_exit < mp_rv_ncpus) {
		if (!intrs_enabled) {
			handle_pending_TLB_flushes();
		}
		if (mp_spin_timeout(tsc_spin_start)) {
			panic("mp_rv_action() exit: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_exit, mp_rv_ncpus, tsc_spin_start, rdtsc64());
		}
	}

	/* teardown function */
	if (mp_rv_teardown_func != NULL) {
		mp_rv_teardown_func(mp_rv_func_arg);
	}

	current_cpu_datap()->cpu_rendezvous_in_progress = FALSE;

	/* Bump completion count */
	atomic_incl(&mp_rv_complete, 1);
}
935 
/*
 * Initiate an all-cpu rendezvous (see the block comment above): every
 * running cpu executes setup_func, action_func and teardown_func (any
 * of which may be NULL) with barriers between the phases.  Before SMP
 * is initialized, the three functions are simply run locally.
 * The caller's arg is passed to all three functions on every cpu.
 */
void
mp_rendezvous(void (*setup_func)(void *),
    void (*action_func)(void *),
    void (*teardown_func)(void *),
    void *arg)
{
	uint64_t        tsc_spin_start;

	/* Uniprocessor (pre-SMP) fast path: run the phases in-line. */
	if (!smp_initialized) {
		if (setup_func != NULL) {
			setup_func(arg);
		}
		if (action_func != NULL) {
			action_func(arg);
		}
		if (teardown_func != NULL) {
			teardown_func(arg);
		}
		return;
	}

	/* obtain rendezvous lock */
	mp_rendezvous_lock();

	/* set static function pointers */
	mp_rv_setup_func = setup_func;
	mp_rv_action_func = action_func;
	mp_rv_teardown_func = teardown_func;
	mp_rv_func_arg = arg;

	mp_rv_entry    = 0;
	mp_rv_exit     = 0;
	mp_rv_complete = 0;

	/*
	 * signal other processors, which will call mp_rendezvous_action()
	 * with interrupts disabled
	 */
	mp_rv_ncpus = mp_cpus_call(CPUMASK_OTHERS, NOSYNC, &mp_rendezvous_action, NULL) + 1;

	/* call executor function on this cpu */
	mp_rendezvous_action(NULL);

	/*
	 * Spin for everyone to complete.
	 * This is necessary to ensure that all processors have proceeded
	 * from the exit barrier before we release the rendezvous structure.
	 */
	tsc_spin_start = rdtsc64();
	while (mp_rv_complete < mp_rv_ncpus) {
		if (mp_spin_timeout(tsc_spin_start)) {
			panic("mp_rendezvous() timeout: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_complete, mp_rv_ncpus, tsc_spin_start, rdtsc64());
		}
	}

	/* Tidy up */
	mp_rv_setup_func = NULL;
	mp_rv_action_func = NULL;
	mp_rv_teardown_func = NULL;
	mp_rv_func_arg = NULL;

	/* release lock */
	mp_rendezvous_unlock();
}
1000 
/* Acquire the global rendezvous lock via the timeout-checked spinlock path. */
void
mp_rendezvous_lock(void)
{
	(void) mp_safe_spin_lock(&mp_rv_lock);
}
1006 
/* Release the global rendezvous lock. */
void
mp_rendezvous_unlock(void)
{
	simple_unlock(&mp_rv_lock);
}
1012 
/*
 * Forcibly re-initialize the rendezvous lock, abandoning any holder.
 * NOTE(review): presumably used on debugger/panic recovery paths where
 * the holder can no longer release it -- confirm with callers.
 */
void
mp_rendezvous_break_lock(void)
{
	simple_lock_init(&mp_rv_lock, 0);
}
1018 
1019 static void
setup_disable_intrs(__unused void * param_not_used)1020 setup_disable_intrs(__unused void * param_not_used)
1021 {
1022 	/* disable interrupts before the first barrier */
1023 	boolean_t intr = ml_set_interrupts_enabled(FALSE);
1024 
1025 	current_cpu_datap()->cpu_iflag = intr;
1026 	DBG("CPU%d: %s\n", get_cpu_number(), __FUNCTION__);
1027 }
1028 
1029 static void
teardown_restore_intrs(__unused void * param_not_used)1030 teardown_restore_intrs(__unused void * param_not_used)
1031 {
1032 	/* restore interrupt flag following MTRR changes */
1033 	ml_set_interrupts_enabled(current_cpu_datap()->cpu_iflag);
1034 	DBG("CPU%d: %s\n", get_cpu_number(), __FUNCTION__);
1035 }
1036 
1037 /*
1038  * A wrapper to mp_rendezvous() to call action_func() with interrupts disabled.
1039  * This is exported for use by kexts.
1040  */
1041 void
mp_rendezvous_no_intrs(void (* action_func)(void *),void * arg)1042 mp_rendezvous_no_intrs(
1043 	void (*action_func)(void *),
1044 	void *arg)
1045 {
1046 	mp_rendezvous(setup_disable_intrs,
1047 	    action_func,
1048 	    teardown_restore_intrs,
1049 	    arg);
1050 }
1051 
1052 
/*
 * A cross-call request buffer: queued on a target cpu's call queue by
 * mp_cpus_call1() and drained by mp_cpus_call_action().
 */
typedef struct {
	queue_chain_t   link;                   /* queue linkage */
	void            (*func)(void *, void *); /* routine to call */
	void            *arg0;                  /* routine's 1st arg */
	void            *arg1;                  /* routine's 2nd arg */
	cpumask_t       *maskp;                 /* completion response mask; NULL for NOSYNC calls */
} mp_call_t;
1060 
1061 
/*
 * A spinlock-protected queue of mp_call_t buffers: one global free
 * list plus a pending-call queue per cpu.
 */
typedef struct {
	queue_head_t            queue;
	decl_simple_lock_data(, lock);
} mp_call_queue_t;
#define MP_CPUS_CALL_BUFS_PER_CPU       MAX_CPUS
static mp_call_queue_t  mp_cpus_call_freelist;
static mp_call_queue_t  mp_cpus_call_head[MAX_CPUS];
1069 
1070 static inline boolean_t
mp_call_head_lock(mp_call_queue_t * cqp)1071 mp_call_head_lock(mp_call_queue_t *cqp)
1072 {
1073 	boolean_t       intrs_enabled;
1074 
1075 	intrs_enabled = ml_set_interrupts_enabled(FALSE);
1076 	simple_lock(&cqp->lock, LCK_GRP_NULL);
1077 
1078 	return intrs_enabled;
1079 }
1080 
/*
 * Deliver an NMIPI to a set of processors to cause them to panic.
 * Sends the NMI to each cpu in cpu_mask, then spins (bounded by
 * LockTimeOut) for each target to acknowledge.
 */
void
NMIPI_panic(cpumask_t cpu_mask, NMI_reason_t why)
{
	unsigned int cpu;
	cpumask_t cpu_bit;
	uint64_t deadline;

	NMIPI_enable(TRUE);
	NMI_panic_reason = why;

	/* Clear each target's ack flag and send it an NMI. */
	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
		if ((cpu_mask & cpu_bit) == 0) {
			continue;
		}
		cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
		cpu_NMI_interrupt(cpu);
	}

	/* Wait (only so long) for NMI'ed cpus to respond */
	deadline = mach_absolute_time() + LockTimeOut;
	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
		if ((cpu_mask & cpu_bit) == 0) {
			continue;
		}
		while (!cpu_datap(cpu)->cpu_NMI_acknowledged &&
		    mach_absolute_time() < deadline) {
			cpu_pause();
		}
	}
}
1114 
#if MACH_ASSERT
/*
 * Debug-only check: TRUE iff interrupts are disabled and the queue's
 * spinlock is held (by some cpu -- ownership is not verified).
 */
static inline boolean_t
mp_call_head_is_locked(mp_call_queue_t *cqp)
{
	return !ml_get_interrupts_enabled() &&
	       hw_lock_held((hw_lock_t)&cqp->lock);
}
#endif
1123 
/*
 * Drop a call queue's lock, then restore the interrupt state saved by
 * the matching mp_call_head_lock() (order matters: unlock first).
 */
static inline void
mp_call_head_unlock(mp_call_queue_t *cqp, boolean_t intrs_enabled)
{
	simple_unlock(&cqp->lock);
	ml_set_interrupts_enabled(intrs_enabled);
}
1130 
1131 static inline mp_call_t *
mp_call_alloc(void)1132 mp_call_alloc(void)
1133 {
1134 	mp_call_t       *callp = NULL;
1135 	boolean_t       intrs_enabled;
1136 	mp_call_queue_t *cqp = &mp_cpus_call_freelist;
1137 
1138 	intrs_enabled = mp_call_head_lock(cqp);
1139 	if (!queue_empty(&cqp->queue)) {
1140 		queue_remove_first(&cqp->queue, callp, typeof(callp), link);
1141 	}
1142 	mp_call_head_unlock(cqp, intrs_enabled);
1143 
1144 	return callp;
1145 }
1146 
1147 static inline void
mp_call_free(mp_call_t * callp)1148 mp_call_free(mp_call_t *callp)
1149 {
1150 	boolean_t       intrs_enabled;
1151 	mp_call_queue_t *cqp = &mp_cpus_call_freelist;
1152 
1153 	intrs_enabled = mp_call_head_lock(cqp);
1154 	queue_enter_first(&cqp->queue, callp, typeof(callp), link);
1155 	mp_call_head_unlock(cqp, intrs_enabled);
1156 }
1157 
1158 static inline mp_call_t *
mp_call_dequeue_locked(mp_call_queue_t * cqp)1159 mp_call_dequeue_locked(mp_call_queue_t *cqp)
1160 {
1161 	mp_call_t       *callp = NULL;
1162 
1163 	assert(mp_call_head_is_locked(cqp));
1164 	if (!queue_empty(&cqp->queue)) {
1165 		queue_remove_first(&cqp->queue, callp, typeof(callp), link);
1166 	}
1167 	return callp;
1168 }
1169 
/* Append a call to a cpu's pending queue; caller holds the queue lock. */
static inline void
mp_call_enqueue_locked(
	mp_call_queue_t *cqp,
	mp_call_t       *callp)
{
	queue_enter(&cqp->queue, callp, typeof(callp), link);
}
1177 
1178 /* Called on the boot processor to initialize global structures */
1179 static void
mp_cpus_call_init(void)1180 mp_cpus_call_init(void)
1181 {
1182 	mp_call_queue_t *cqp = &mp_cpus_call_freelist;
1183 
1184 	DBG("mp_cpus_call_init()\n");
1185 	simple_lock_init(&cqp->lock, 0);
1186 	queue_init(&cqp->queue);
1187 }
1188 
1189 /*
1190  * Called at processor registration to add call buffers to the free list
1191  * and to initialize the per-cpu call queue.
1192  */
1193 void
mp_cpus_call_cpu_init(int cpu)1194 mp_cpus_call_cpu_init(int cpu)
1195 {
1196 	int             i;
1197 	mp_call_queue_t *cqp = &mp_cpus_call_head[cpu];
1198 	mp_call_t       *callp;
1199 
1200 	simple_lock_init(&cqp->lock, 0);
1201 	queue_init(&cqp->queue);
1202 	for (i = 0; i < MP_CPUS_CALL_BUFS_PER_CPU; i++) {
1203 		callp = zalloc_permanent_type(mp_call_t);
1204 		mp_call_free(callp);
1205 	}
1206 
1207 	DBG("mp_cpus_call_init(%d) done\n", cpu);
1208 }
1209 
/*
 * This is called from cpu_signal_handler() to process an MP_CALL signal.
 * And also from i386_deactivate_cpu() when a cpu is being taken offline.
 * Drains this cpu's call queue, running each queued function and then
 * acknowledging completion in the requester's response mask.
 */
static void
mp_cpus_call_action(void)
{
	mp_call_queue_t *cqp;
	boolean_t       intrs_enabled;
	mp_call_t       *callp;
	mp_call_t       call;

	assert(!ml_get_interrupts_enabled());
	cqp = &mp_cpus_call_head[cpu_number()];
	intrs_enabled = mp_call_head_lock(cqp);
	while ((callp = mp_call_dequeue_locked(cqp)) != NULL) {
		/* Copy call request to the stack to free buffer */
		call = *callp;
		mp_call_free(callp);
		if (call.func != NULL) {
			/* Drop the queue lock while the function runs, so it
			 * may itself issue or service cross-calls. */
			mp_call_head_unlock(cqp, intrs_enabled);
			KERNEL_DEBUG_CONSTANT(
				TRACE_MP_CPUS_CALL_ACTION,
				VM_KERNEL_UNSLIDE(call.func), VM_KERNEL_UNSLIDE_OR_PERM(call.arg0),
				VM_KERNEL_UNSLIDE_OR_PERM(call.arg1), VM_KERNEL_ADDRPERM(call.maskp), 0);
			call.func(call.arg0, call.arg1);
			(void) mp_call_head_lock(cqp);
		}
		/* A NULL maskp means a NOSYNC call: nobody is waiting. */
		if (call.maskp != NULL) {
			i_bit_set(cpu_number(), call.maskp);
		}
	}
	mp_call_head_unlock(cqp, intrs_enabled);
}
1244 
1245 #pragma clang diagnostic push
1246 #pragma clang diagnostic ignored "-Wcast-function-type"
1247 
1248 /*
1249  * mp_cpus_call() runs a given function on cpus specified in a given cpu mask.
1250  * Possible modes are:
1251  *  SYNC:   function is called serially on target cpus in logical cpu order
1252  *	    waiting for each call to be acknowledged before proceeding
1253  *  ASYNC:  function call is queued to the specified cpus
1254  *	    waiting for all calls to complete in parallel before returning
1255  *  NOSYNC: function calls are queued
1256  *	    but we return before confirmation of calls completing.
1257  * The action function may be NULL.
1258  * The cpu mask may include the local cpu. Offline cpus are ignored.
1259  * The return value is the number of cpus on which the call was made or queued.
1260  */
1261 cpu_t
mp_cpus_call(cpumask_t cpus,mp_sync_t mode,void (* action_func)(void *),void * arg)1262 mp_cpus_call(
1263 	cpumask_t       cpus,
1264 	mp_sync_t       mode,
1265 	void            (*action_func)(void *),
1266 	void            *arg)
1267 {
1268 	return mp_cpus_call1(
1269 		cpus,
1270 		mode,
1271 		(void (*)(void *, void *))action_func,
1272 		arg,
1273 		NULL,
1274 		NULL);
1275 }
1276 
1277 #pragma clang diagnostic pop
1278 
/*
 * Spin until every cpu in cpus_called has set its bit in
 * *cpus_responded.  While interrupts are disabled we service our own
 * pending cross-calls and signals to avoid deadlocking against other
 * cpus doing the same.  On timeout, NMI the stragglers and panic.
 */
static void
mp_cpus_call_wait(boolean_t     intrs_enabled,
    cpumask_t     cpus_called,
    cpumask_t     *cpus_responded)
{
	mp_call_queue_t         *cqp;
	uint64_t                tsc_spin_start;

	assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
	cqp = &mp_cpus_call_head[cpu_number()];

	tsc_spin_start = rdtsc64();
	while (*cpus_responded != cpus_called) {
		if (!intrs_enabled) {
			/* Sniffing w/o locking */
			if (!queue_empty(&cqp->queue)) {
				mp_cpus_call_action();
			}
			cpu_signal_handler(NULL);
		}
		if (mp_spin_timeout(tsc_spin_start)) {
			cpumask_t       cpus_unresponsive;

			cpus_unresponsive = cpus_called & ~(*cpus_responded);
			NMIPI_panic(cpus_unresponsive, CROSSCALL_TIMEOUT);
			panic("mp_cpus_call_wait() timeout, cpus: 0x%llx",
			    cpus_unresponsive);
		}
	}
}
1309 
/*
 * Workhorse for mp_cpus_call(): queue (and IPI) a two-argument call to
 * each running cpu in 'cpus' according to SYNC/ASYNC/NOSYNC mode.
 * Optionally reports the mask of cpus actually reached via
 * cpus_calledp.  Returns the number of cpus called or queued.
 */
cpu_t
mp_cpus_call1(
	cpumask_t       cpus,
	mp_sync_t       mode,
	void            (*action_func)(void *, void *),
	void            *arg0,
	void            *arg1,
	cpumask_t       *cpus_calledp)
{
	cpu_t           cpu = 0;
	boolean_t       intrs_enabled = FALSE;
	boolean_t       call_self = FALSE;
	cpumask_t       cpus_called = 0;
	cpumask_t       cpus_responded = 0;
	long            cpus_call_count = 0;
	uint64_t        tsc_spin_start;
	boolean_t       topo_lock;

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPUS_CALL | DBG_FUNC_START,
		cpus, mode, VM_KERNEL_UNSLIDE(action_func), VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1));

	/* Pre-SMP: only a self-call is possible. */
	if (!smp_initialized) {
		if ((cpus & CPUMASK_SELF) == 0) {
			goto out;
		}
		if (action_func != NULL) {
			intrs_enabled = ml_set_interrupts_enabled(FALSE);
			action_func(arg0, arg1);
			ml_set_interrupts_enabled(intrs_enabled);
		}
		call_self = TRUE;
		goto out;
	}

	/*
	 * Queue the call for each non-local requested cpu.
	 * This is performed under the topo lock to prevent changes to
	 * cpus online state and to prevent concurrent rendezvouses --
	 * although an exception is made if we're calling only the master
	 * processor since that always remains active. Note: this exception
	 * is expected for longterm timer nosync cross-calls to the master cpu.
	 */
	mp_disable_preemption();
	intrs_enabled = ml_get_interrupts_enabled();
	topo_lock = (cpus != cpu_to_cpumask(master_cpu));
	if (topo_lock) {
		ml_set_interrupts_enabled(FALSE);
		(void) mp_safe_spin_lock(&x86_topo_lock);
	}
	for (cpu = 0; cpu < (cpu_t) real_ncpus; cpu++) {
		if (((cpu_to_cpumask(cpu) & cpus) == 0) ||
		    !cpu_is_running(cpu)) {
			continue;
		}
		tsc_spin_start = rdtsc64();
		if (cpu == (cpu_t) cpu_number()) {
			/*
			 * We don't IPI ourself and if calling asynchronously,
			 * we defer our call until we have signalled all others.
			 */
			call_self = TRUE;
			if (mode == SYNC && action_func != NULL) {
				KERNEL_DEBUG_CONSTANT(
					TRACE_MP_CPUS_CALL_LOCAL,
					VM_KERNEL_UNSLIDE(action_func),
					VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1), 0, 0);
				action_func(arg0, arg1);
			}
		} else {
			/*
			 * Here to queue a call to cpu and IPI.
			 */
			mp_call_t       *callp = NULL;
			mp_call_queue_t *cqp = &mp_cpus_call_head[cpu];
			boolean_t       intrs_inner;

queue_call:
			if (callp == NULL) {
				callp = mp_call_alloc();
			}
			intrs_inner = mp_call_head_lock(cqp);
			if (callp == NULL) {
				/* No free call buffer: service our own work
				 * while we wait for one, then retry. */
				mp_call_head_unlock(cqp, intrs_inner);
				KERNEL_DEBUG_CONSTANT(
					TRACE_MP_CPUS_CALL_NOBUF,
					cpu, 0, 0, 0, 0);
				if (!intrs_inner) {
					/* Sniffing w/o locking */
					if (!queue_empty(&cqp->queue)) {
						mp_cpus_call_action();
					}
					handle_pending_TLB_flushes();
				}
				if (mp_spin_timeout(tsc_spin_start)) {
					panic("mp_cpus_call1() timeout start: 0x%llx, cur: 0x%llx",
					    tsc_spin_start, rdtsc64());
				}
				goto queue_call;
			}
			callp->maskp = (mode == NOSYNC) ? NULL : &cpus_responded;
			callp->func = action_func;
			callp->arg0 = arg0;
			callp->arg1 = arg1;
			mp_call_enqueue_locked(cqp, callp);
			cpus_call_count++;
			cpus_called |= cpu_to_cpumask(cpu);
			i386_signal_cpu(cpu, MP_CALL, ASYNC);
			mp_call_head_unlock(cqp, intrs_inner);
			if (mode == SYNC) {
				mp_cpus_call_wait(intrs_inner, cpus_called, &cpus_responded);
			}
		}
	}
	if (topo_lock) {
		simple_unlock(&x86_topo_lock);
		ml_set_interrupts_enabled(intrs_enabled);
	}

	/* Call locally if mode not SYNC */
	if (mode != SYNC && call_self) {
		KERNEL_DEBUG_CONSTANT(
			TRACE_MP_CPUS_CALL_LOCAL,
			VM_KERNEL_UNSLIDE(action_func), VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1), 0, 0);
		if (action_func != NULL) {
			ml_set_interrupts_enabled(FALSE);
			action_func(arg0, arg1);
			ml_set_interrupts_enabled(intrs_enabled);
		}
	}

	/* For ASYNC, now wait for all signaled cpus to complete their calls */
	if (mode == ASYNC) {
		mp_cpus_call_wait(intrs_enabled, cpus_called, &cpus_responded);
	}

	/* Safe to allow pre-emption now */
	mp_enable_preemption();

out:
	/* Account for the local call, if any, in the count and mask. */
	if (call_self) {
		cpus_called |= cpu_to_cpumask(cpu);
		cpus_call_count++;
	}

	if (cpus_calledp) {
		*cpus_calledp = cpus_called;
	}

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPUS_CALL | DBG_FUNC_END,
		cpus_call_count, cpus_called, 0, 0, 0);

	return (cpu_t) cpus_call_count;
}
1465 
1466 
1467 static void
mp_broadcast_action(__unused void * null)1468 mp_broadcast_action(__unused void *null)
1469 {
1470 	/* call action function */
1471 	if (mp_bc_action_func != NULL) {
1472 		mp_bc_action_func(mp_bc_func_arg);
1473 	}
1474 
1475 	/* if we're the last one through, wake up the instigator */
1476 	if (atomic_decl_and_test(&mp_bc_count, 1)) {
1477 		thread_wakeup(((event_t)(uintptr_t) &mp_bc_count));
1478 	}
1479 }
1480 
/*
 * mp_broadcast() runs a given function on all active cpus.
 * The caller blocks until the functions has run on all cpus.
 * The caller will also block if there is another pending broadcast.
 */
void
mp_broadcast(
	void (*action_func)(void *),
	void *arg)
{
	/* Pre-SMP: just run it locally. */
	if (!smp_initialized) {
		if (action_func != NULL) {
			action_func(arg);
		}
		return;
	}

	/* obtain broadcast lock -- serializes broadcasts and protects
	 * the mp_bc_* globals */
	lck_mtx_lock(&mp_bc_lock);

	/* set static function pointers */
	mp_bc_action_func = action_func;
	mp_bc_func_arg = arg;

	/* Arm the wait before signalling, so the last cpu's wakeup
	 * cannot be lost. */
	assert_wait((event_t)(uintptr_t)&mp_bc_count, THREAD_UNINT);

	/*
	 * signal other processors, which will call mp_broadcast_action()
	 */
	mp_bc_count = real_ncpus;                       /* assume max possible active */
	mp_bc_ncpus = mp_cpus_call(CPUMASK_ALL, NOSYNC, *mp_broadcast_action, NULL);
	atomic_decl(&mp_bc_count, real_ncpus - mp_bc_ncpus); /* subtract inactive */

	/* block for other cpus to have run action_func */
	if (mp_bc_ncpus > 1) {
		thread_block(THREAD_CONTINUE_NULL);
	} else {
		/* Only this cpu ran: cancel the armed wait. */
		clear_wait(current_thread(), THREAD_AWAKENED);
	}

	/* release lock */
	lck_mtx_unlock(&mp_bc_lock);
}
1524 
1525 void
mp_cpus_kick(cpumask_t cpus)1526 mp_cpus_kick(cpumask_t cpus)
1527 {
1528 	cpu_t           cpu;
1529 	boolean_t       intrs_enabled = FALSE;
1530 
1531 	intrs_enabled = ml_set_interrupts_enabled(FALSE);
1532 	mp_safe_spin_lock(&x86_topo_lock);
1533 
1534 	for (cpu = 0; cpu < (cpu_t) real_ncpus; cpu++) {
1535 		if (((cpu_to_cpumask(cpu) & cpus) == 0)
1536 		    || !cpu_is_running(cpu)) {
1537 			continue;
1538 		}
1539 
1540 		lapic_send_ipi(cpu, LAPIC_VECTOR(KICK));
1541 	}
1542 
1543 	simple_unlock(&x86_topo_lock);
1544 	ml_set_interrupts_enabled(intrs_enabled);
1545 }
1546 
/*
 * Mark the current cpu as running.  Called with interrupts disabled.
 */
void
i386_activate_cpu(void)
{
	cpu_data_t      *cdp = current_cpu_datap();

	assert(!ml_get_interrupts_enabled());

	/* Pre-SMP there is no topo lock to take. */
	if (!smp_initialized) {
		cdp->cpu_running = TRUE;
		return;
	}

	mp_safe_spin_lock(&x86_topo_lock);
	cdp->cpu_running = TRUE;
	started_cpu();
	/* Flush the whole TLB -- presumably because mappings may have
	 * changed while this cpu was offline; confirm with pmap layer. */
	pmap_tlbi_range(0, ~0ULL, true, 0);
	simple_unlock(&x86_topo_lock);
}
1565 
/*
 * Take the current cpu offline: clear cpu_running, migrate its timers
 * to the master cpu, and drain any pending signals and deadlines.
 * Called with interrupts disabled.
 */
void
i386_deactivate_cpu(void)
{
	cpu_data_t      *cdp = current_cpu_datap();

	assert(!ml_get_interrupts_enabled());

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_START,
		0, 0, 0, 0, 0);

	mp_safe_spin_lock(&x86_topo_lock);
	cdp->cpu_running = FALSE;
	simple_unlock(&x86_topo_lock);

	/*
	 * Move all of this cpu's timers to the master/boot cpu,
	 * and poke it in case there's a sooner deadline for it to schedule.
	 */
	timer_queue_shutdown(&cdp->rtclock_timer.queue);
	mp_cpus_call(cpu_to_cpumask(master_cpu), ASYNC, timer_queue_expire_local, NULL);

#if MONOTONIC
	mt_cpu_down(cdp);
#endif /* MONOTONIC */
#if KPERF
	kptimer_stop_curcpu();
#endif /* KPERF */

	/*
	 * Open an interrupt window
	 * and ensure any pending IPI or timer is serviced
	 */
	mp_disable_preemption();
	ml_set_interrupts_enabled(TRUE);

	while (cdp->cpu_signals && x86_lcpu()->rtcDeadline != EndOfAllTime) {
		cpu_pause();
	}
	/*
	 * Ensure there's no remaining timer deadline set
	 * - AICPM may have left one active.
	 */
	setPop(0);

	ml_set_interrupts_enabled(FALSE);
	mp_enable_preemption();

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_END,
		0, 0, 0, 0, 0);
}
1618 
int     pmsafe_debug    = 1;    /* non-zero: use PM safe mode around debugger entry */

#if     MACH_KDP
volatile boolean_t      mp_kdp_trap = FALSE;    /* TRUE while cpus must park in mp_kdp_wait() */
volatile boolean_t      mp_kdp_is_NMI = FALSE;
volatile unsigned long  mp_kdp_ncpus;           /* count of cpus currently parked in the debugger */
boolean_t               mp_kdp_state;           /* interrupt state saved by mp_kdp_enter() */
1626 
1627 
/*
 * Enter the kernel debugger: win the race for debugger ownership, stop
 * all other running cpus (by IPI, escalating to NMI on timeout), and
 * record entry state.  If proceed_on_failure, degrade gracefully when
 * cpus cannot be stopped instead of panicking.
 */
void
mp_kdp_enter(boolean_t proceed_on_failure)
{
	unsigned int    cpu;
	unsigned int    ncpus = 0;
	unsigned int    my_cpu;
	uint64_t        tsc_timeout;

	DBG("mp_kdp_enter()\n");

	/*
	 * Here to enter the debugger.
	 * In case of races, only one cpu is allowed to enter kdp after
	 * stopping others.
	 */
	mp_kdp_state = ml_set_interrupts_enabled(FALSE);
	my_cpu = cpu_number();

	if (my_cpu == (unsigned) debugger_cpu) {
		kprintf("\n\nRECURSIVE DEBUGGER ENTRY DETECTED\n\n");
		kdp_reset();
		return;
	}

	uint64_t start_time = cpu_datap(my_cpu)->debugger_entry_time = mach_absolute_time();
	int locked = 0;
	/*
	 * Contend for the topo lock; if another cpu is already running the
	 * debugger (mp_kdp_trap set), yield and park in mp_kdp_wait() first.
	 */
	while (!locked || mp_kdp_trap) {
		if (locked) {
			simple_unlock(&x86_topo_lock);
		}
		if (proceed_on_failure) {
			/* Give up on the lock after ~0.5s and debug anyway. */
			if (mach_absolute_time() - start_time > 500000000ll) {
				paniclog_append_noflush("mp_kdp_enter() can't get x86_topo_lock! Debugging anyway! #YOLO\n");
				break;
			}
			locked = simple_lock_try(&x86_topo_lock, LCK_GRP_NULL);
			if (!locked) {
				cpu_pause();
			}
		} else {
			mp_safe_spin_lock(&x86_topo_lock);
			locked = TRUE;
		}

		if (locked && mp_kdp_trap) {
			simple_unlock(&x86_topo_lock);
			DBG("mp_kdp_enter() race lost\n");
#if MACH_KDP
			mp_kdp_wait(TRUE, FALSE);
#endif
			locked = FALSE;
		}
	}

	if (pmsafe_debug && !kdp_snapshot) {
		pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
	}

	/* We own the debugger now: publish state for other cpus. */
	debugger_cpu = my_cpu;
	ncpus = 1;
	atomic_incl((volatile long *)&mp_kdp_ncpus, 1);
	mp_kdp_trap = TRUE;
	debugger_entry_time = cpu_datap(my_cpu)->debugger_entry_time;

	/*
	 * Deliver a nudge to other cpus, counting how many
	 */
	DBG("mp_kdp_enter() signaling other processors\n");
	if (force_immediate_debugger_NMI == FALSE) {
		for (cpu = 0; cpu < real_ncpus; cpu++) {
			if (cpu == my_cpu || !cpu_is_running(cpu)) {
				continue;
			}
			ncpus++;
			i386_signal_cpu(cpu, MP_KDP, ASYNC);
		}
		/*
		 * Wait other processors to synchronize
		 */
		DBG("mp_kdp_enter() waiting for (%d) processors to suspend\n", ncpus);

		/*
		 * This timeout is rather arbitrary; we don't want to NMI
		 * processors that are executing at potentially
		 * "unsafe-to-interrupt" points such as the trampolines,
		 * but neither do we want to lose state by waiting too long.
		 */
		tsc_timeout = rdtsc64() + (LockTimeOutTSC);

		while (mp_kdp_ncpus != ncpus && rdtsc64() < tsc_timeout) {
			/*
			 * A TLB shootdown request may be pending--this would
			 * result in the requesting processor waiting in
			 * PMAP_UPDATE_TLBS() until this processor deals with it.
			 * Process it, so it can now enter mp_kdp_wait()
			 */
			handle_pending_TLB_flushes();
			cpu_pause();
		}
		/* If we've timed out, and some processor(s) are still unresponsive,
		 * interrupt them with an NMI via the local APIC, iff a panic is
		 * in progress.
		 */
		if (panic_active()) {
			NMIPI_enable(TRUE);
		}
		if (mp_kdp_ncpus != ncpus) {
			unsigned int wait_cycles = 0;
			if (proceed_on_failure) {
				paniclog_append_noflush("mp_kdp_enter() timed-out on cpu %d, NMI-ing\n", my_cpu);
			} else {
				DBG("mp_kdp_enter() timed-out on cpu %d, NMI-ing\n", my_cpu);
			}
			/* NMI only cpus whose MP_KDP signal is still pending. */
			for (cpu = 0; cpu < real_ncpus; cpu++) {
				if (cpu == my_cpu || !cpu_is_running(cpu)) {
					continue;
				}
				if (cpu_signal_pending(cpu, MP_KDP)) {
					cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
					cpu_NMI_interrupt(cpu);
				}
			}
			/* Wait again for the same timeout */
			tsc_timeout = rdtsc64() + (LockTimeOutTSC);
			while (mp_kdp_ncpus != ncpus && rdtsc64() < tsc_timeout) {
				handle_pending_TLB_flushes();
				cpu_pause();
				++wait_cycles;
			}
			if (mp_kdp_ncpus != ncpus) {
				/* Still unresponsive: log (or panic on) the holdouts. */
				paniclog_append_noflush("mp_kdp_enter() NMI pending on cpus:");
				for (cpu = 0; cpu < real_ncpus; cpu++) {
					if (cpu_is_running(cpu) && !cpu_datap(cpu)->cpu_NMI_acknowledged) {
						paniclog_append_noflush(" %d", cpu);
					}
				}
				paniclog_append_noflush("\n");
				if (proceed_on_failure) {
					paniclog_append_noflush("mp_kdp_enter() timed-out during %s wait after NMI;"
					    "expected %u acks but received %lu after %u loops in %llu ticks\n",
					    (locked ? "locked" : "unlocked"), ncpus, mp_kdp_ncpus, wait_cycles, LockTimeOutTSC);
				} else {
					panic("mp_kdp_enter() timed-out during %s wait after NMI;"
					    "expected %u acks but received %lu after %u loops in %llu ticks",
					    (locked ? "locked" : "unlocked"), ncpus, mp_kdp_ncpus, wait_cycles, LockTimeOutTSC);
				}
			}
		}
	} else if (NMI_panic_reason != PTE_CORRUPTION) {  /* In the pte corruption case, the detecting CPU has already NMIed other CPUs */
		for (cpu = 0; cpu < real_ncpus; cpu++) {
			if (cpu == my_cpu || !cpu_is_running(cpu)) {
				continue;
			}
			cpu_NMI_interrupt(cpu);
		}
	}

	if (locked) {
		simple_unlock(&x86_topo_lock);
	}

	DBG("mp_kdp_enter() %d processors done %s\n",
	    (int)mp_kdp_ncpus, (mp_kdp_ncpus == ncpus) ? "OK" : "timed out");

	postcode(MP_KDP_ENTER);
}
1794 
1795 boolean_t
mp_kdp_all_cpus_halted()1796 mp_kdp_all_cpus_halted()
1797 {
1798 	unsigned int ncpus = 0, cpu = 0, my_cpu = 0;
1799 
1800 	my_cpu = cpu_number();
1801 	ncpus = 1; /* current CPU */
1802 	for (cpu = 0; cpu < real_ncpus; cpu++) {
1803 		if (cpu == my_cpu || !cpu_is_running(cpu)) {
1804 			continue;
1805 		}
1806 		ncpus++;
1807 	}
1808 
1809 	return mp_kdp_ncpus == ncpus;
1810 }
1811 
1812 static boolean_t
cpu_signal_pending(int cpu,mp_event_t event)1813 cpu_signal_pending(int cpu, mp_event_t event)
1814 {
1815 	volatile int    *signals = &cpu_datap(cpu)->cpu_signals;
1816 	boolean_t retval = FALSE;
1817 
1818 	if (i_bit(event, signals)) {
1819 		retval = TRUE;
1820 	}
1821 	return retval;
1822 }
1823 
/*
 * Debugger cross-cpu call: ask logical cpu 'lcpu' (parked in
 * mp_kdp_wait) to run func(arg0, arg1, lcpu), then spin for the
 * result.  'timeout' is in mach absolute-time units; 0 waits forever.
 * Returns -1 for bad arguments, or on timeout (ret is preset to -1).
 */
long
kdp_x86_xcpu_invoke(const uint16_t lcpu, kdp_x86_xcpu_func_t func,
    void *arg0, void *arg1, uint64_t timeout)
{
	uint64_t now;

	if (lcpu > (real_ncpus - 1)) {
		return -1;
	}

	if (func == NULL) {
		return -1;
	}

	kdp_xcpu_call_func.func = func;
	kdp_xcpu_call_func.ret  = -1;
	kdp_xcpu_call_func.arg0 = arg0;
	kdp_xcpu_call_func.arg1 = arg1;
	/* Storing .cpu last publishes the request to kdp_x86_xcpu_poll(). */
	kdp_xcpu_call_func.cpu  = lcpu;
	DBG("Invoking function %p on CPU %d\n", func, (int32_t)lcpu);
	now = mach_absolute_time();
	while (kdp_xcpu_call_func.cpu != KDP_XCPU_NONE &&
	    (timeout == 0 || (mach_absolute_time() - now) < timeout)) {
		cpu_pause();
	}
	return kdp_xcpu_call_func.ret;
}
1851 
/*
 * Polled from mp_kdp_wait(): if a debugger cross-call is addressed to
 * this cpu, run it, store the result, and then clear .cpu -- which
 * signals completion back to kdp_x86_xcpu_invoke().
 */
static void
kdp_x86_xcpu_poll(void)
{
	if ((uint16_t)cpu_number() == kdp_xcpu_call_func.cpu) {
		kdp_xcpu_call_func.ret =
		    kdp_xcpu_call_func.func(kdp_xcpu_call_func.arg0,
		    kdp_xcpu_call_func.arg1,
		    cpu_number());
		kdp_xcpu_call_func.cpu = KDP_XCPU_NONE;
	}
}
1863 
/*
 * Park this cpu while the debugger is active: increment mp_kdp_ncpus,
 * spin until mp_kdp_trap clears (an NMI entry, isNMI == TRUE, never
 * leaves this loop), servicing TLB flushes (if 'flush') and debugger
 * cross-calls, then decrement the count on exit.
 */
static void
mp_kdp_wait(boolean_t flush, boolean_t isNMI)
{
	DBG("mp_kdp_wait()\n");

	current_cpu_datap()->debugger_ipi_time = mach_absolute_time();
#if CONFIG_MCA
	/* If we've trapped due to a machine-check, save MCA registers */
	mca_check_save();
#endif

	atomic_incl((volatile long *)&mp_kdp_ncpus, 1);
	while (mp_kdp_trap || (isNMI == TRUE)) {
		/*
		 * A TLB shootdown request may be pending--this would result
		 * in the requesting processor waiting in PMAP_UPDATE_TLBS()
		 * until this processor handles it.
		 * Process it, so it can now enter mp_kdp_wait()
		 */
		if (flush) {
			handle_pending_TLB_flushes();
		}

		kdp_x86_xcpu_poll();
		cpu_pause();
	}

	atomic_decl((volatile long *)&mp_kdp_ncpus, 1);
	DBG("mp_kdp_wait() done\n");
}
1894 
/*
 * Leave the debugger: relinquish debugger ownership, release the
 * other cpus from mp_kdp_wait(), and wait for them all to resume
 * before restoring power-management mode and interrupt state.
 */
void
mp_kdp_exit(void)
{
	DBG("mp_kdp_exit()\n");
	debugger_cpu = -1;
	atomic_decl((volatile long *)&mp_kdp_ncpus, 1);

	debugger_exit_time = mach_absolute_time();

	/* Clearing mp_kdp_trap releases the spinners in mp_kdp_wait(). */
	mp_kdp_trap = FALSE;
	mfence();

	/* Wait other processors to stop spinning. XXX needs timeout */
	DBG("mp_kdp_exit() waiting for processors to resume\n");
	while (mp_kdp_ncpus > 0) {
		/*
		 * a TLB shootdown request may be pending... this would result in the requesting
		 * processor waiting in PMAP_UPDATE_TLBS() until this processor deals with it.
		 * Process it, so it can now enter mp_kdp_wait()
		 */
		handle_pending_TLB_flushes();

		cpu_pause();
	}

	if (pmsafe_debug && !kdp_snapshot) {
		pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
	}

	/* NOTE(review): debugger_exit_time is stored a second time here,
	 * after the wait above; the earlier store looks redundant -- confirm. */
	debugger_exit_time = mach_absolute_time();

	DBG("mp_kdp_exit() done\n");
	(void) ml_set_interrupts_enabled(mp_kdp_state);
	postcode(MP_KDP_EXIT);
}
1930 
1931 #endif  /* MACH_KDP */
1932 
1933 boolean_t
mp_recent_debugger_activity(void)1934 mp_recent_debugger_activity(void)
1935 {
1936 	uint64_t abstime = mach_absolute_time();
1937 	return ((abstime - debugger_entry_time) < LastDebuggerEntryAllowance) ||
1938 	       ((abstime - debugger_exit_time) < LastDebuggerEntryAllowance);
1939 }
1940 
/*ARGSUSED*/
/* No per-processor AST-check initialization is required on x86. */
void
init_ast_check(
	__unused processor_t    processor)
{
}
1947 
1948 void
cause_ast_check(processor_t processor)1949 cause_ast_check(
1950 	processor_t     processor)
1951 {
1952 	int     cpu = processor->cpu_id;
1953 
1954 	if (cpu != cpu_number()) {
1955 		i386_signal_cpu(cpu, MP_AST, ASYNC);
1956 		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), cpu, 1, 0, 0, 0);
1957 	}
1958 }
1959 
1960 void
slave_machine_init(void * param)1961 slave_machine_init(void *param)
1962 {
1963 	/*
1964 	 * Here in process context, but with interrupts disabled.
1965 	 */
1966 	DBG("slave_machine_init() CPU%d\n", get_cpu_number());
1967 
1968 	if (param == FULL_SLAVE_INIT) {
1969 		/*
1970 		 * Cold start
1971 		 */
1972 		clock_init();
1973 	}
1974 	cpu_machine_init();     /* Interrupts enabled hereafter */
1975 }
1976 
#undef cpu_number
/* Out-of-line cpu_number() for consumers of the non-macro form. */
int
cpu_number(void)
{
	return get_cpu_number();
}
1983 
/* Return the per-cpu data base address of the current cpu. */
vm_offset_t
current_percpu_base(void)
{
	return get_current_percpu_base();
}
1989 
/* Return the per-cpu data base address of the given cpu. */
vm_offset_t
other_percpu_base(int cpu)
{
	return cpu_datap(cpu)->cpu_pcpu_base;
}
1995 
1996 static void
cpu_prewarm_init()1997 cpu_prewarm_init()
1998 {
1999 	int i;
2000 
2001 	simple_lock_init(&cpu_warm_lock, 0);
2002 	queue_init(&cpu_warm_call_list);
2003 	for (i = 0; i < NUM_CPU_WARM_CALLS; i++) {
2004 		enqueue_head(&cpu_warm_call_list, (queue_entry_t)&cpu_warm_call_arr[i]);
2005 	}
2006 }
2007 
2008 static timer_call_t
grab_warm_timer_call()2009 grab_warm_timer_call()
2010 {
2011 	spl_t x;
2012 	timer_call_t call = NULL;
2013 
2014 	x = splsched();
2015 	simple_lock(&cpu_warm_lock, LCK_GRP_NULL);
2016 	if (!queue_empty(&cpu_warm_call_list)) {
2017 		call = (timer_call_t) dequeue_head(&cpu_warm_call_list);
2018 	}
2019 	simple_unlock(&cpu_warm_lock);
2020 	splx(x);
2021 
2022 	return call;
2023 }
2024 
2025 static void
free_warm_timer_call(timer_call_t call)2026 free_warm_timer_call(timer_call_t call)
2027 {
2028 	spl_t x;
2029 
2030 	x = splsched();
2031 	simple_lock(&cpu_warm_lock, LCK_GRP_NULL);
2032 	enqueue_head(&cpu_warm_call_list, (queue_entry_t)call);
2033 	simple_unlock(&cpu_warm_lock);
2034 	splx(x);
2035 }
2036 
2037 /*
2038  * Runs in timer call context (interrupts disabled).
2039  */
2040 static void
cpu_warm_timer_call_func(timer_call_param_t p0,__unused timer_call_param_t p1)2041 cpu_warm_timer_call_func(
2042 	timer_call_param_t p0,
2043 	__unused timer_call_param_t p1)
2044 {
2045 	free_warm_timer_call((timer_call_t)p0);
2046 	return;
2047 }
2048 
2049 /*
2050  * Runs with interrupts disabled on the CPU we wish to warm (i.e. CPU 0).
2051  */
2052 static void
_cpu_warm_setup(void * arg)2053 _cpu_warm_setup(
2054 	void *arg)
2055 {
2056 	cpu_warm_data_t cwdp = (cpu_warm_data_t)arg;
2057 
2058 	timer_call_enter(cwdp->cwd_call, cwdp->cwd_deadline, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL);
2059 	cwdp->cwd_result = 0;
2060 
2061 	return;
2062 }
2063 
2064 /*
2065  * Not safe to call with interrupts disabled.
2066  */
2067 kern_return_t
ml_interrupt_prewarm(uint64_t deadline)2068 ml_interrupt_prewarm(
2069 	uint64_t        deadline)
2070 {
2071 	struct cpu_warm_data cwd;
2072 	timer_call_t call;
2073 	cpu_t ct;
2074 
2075 	if (ml_get_interrupts_enabled() == FALSE) {
2076 		panic("%s: Interrupts disabled?", __FUNCTION__);
2077 	}
2078 
2079 	/*
2080 	 * If the platform doesn't need our help, say that we succeeded.
2081 	 */
2082 	if (!ml_get_interrupt_prewake_applicable()) {
2083 		return KERN_SUCCESS;
2084 	}
2085 
2086 	/*
2087 	 * Grab a timer call to use.
2088 	 */
2089 	call = grab_warm_timer_call();
2090 	if (call == NULL) {
2091 		return KERN_RESOURCE_SHORTAGE;
2092 	}
2093 
2094 	timer_call_setup(call, cpu_warm_timer_call_func, call);
2095 	cwd.cwd_call = call;
2096 	cwd.cwd_deadline = deadline;
2097 	cwd.cwd_result = 0;
2098 
2099 	/*
2100 	 * For now, non-local interrupts happen on the master processor.
2101 	 */
2102 	ct = mp_cpus_call(cpu_to_cpumask(master_cpu), SYNC, _cpu_warm_setup, &cwd);
2103 	if (ct == 0) {
2104 		free_warm_timer_call(call);
2105 		return KERN_FAILURE;
2106 	} else {
2107 		return cwd.cwd_result;
2108 	}
2109 }
2110 
2111 #if DEBUG || DEVELOPMENT
2112 void
kernel_spin(uint64_t spin_ns)2113 kernel_spin(uint64_t spin_ns)
2114 {
2115 	boolean_t       istate;
2116 	uint64_t        spin_abs;
2117 	uint64_t        deadline;
2118 	cpu_data_t      *cdp;
2119 
2120 	kprintf("kernel_spin(%llu) spinning uninterruptibly\n", spin_ns);
2121 	istate = ml_set_interrupts_enabled(FALSE);
2122 	cdp = current_cpu_datap();
2123 	nanoseconds_to_absolutetime(spin_ns, &spin_abs);
2124 
2125 	/* Fake interrupt handler entry for testing mp_interrupt_watchdog() */
2126 	cdp->cpu_int_event_time = mach_absolute_time();
2127 	cdp->cpu_int_state = (void *) USER_STATE(current_thread());
2128 
2129 	deadline = mach_absolute_time() + spin_ns;
2130 	while (mach_absolute_time() < deadline) {
2131 		cpu_pause();
2132 	}
2133 
2134 	cdp->cpu_int_event_time = 0;
2135 	cdp->cpu_int_state = NULL;
2136 
2137 	ml_set_interrupts_enabled(istate);
2138 	kprintf("kernel_spin() continuing\n");
2139 }
2140 
2141 /*
2142  * Called from the scheduler's maintenance thread,
2143  * scan running processors for long-running ISRs and:
2144  *  - panic if longer than LockTimeOut, or
2145  *  - log if more than a quantum.
2146  */
void
mp_interrupt_watchdog(void)
{
	cpu_t                   cpu;
	boolean_t               intrs_enabled = FALSE;
	uint16_t                cpu_int_num;
	uint64_t                cpu_int_event_time;
	uint64_t                cpu_rip;
	uint64_t                cpu_int_duration;
	uint64_t                now;
	x86_saved_state_t       *cpu_int_state;

	if (__improbable(!mp_interrupt_watchdog_enabled)) {
		return;
	}

	/* Scan with interrupts off so our own snapshot of "now" stays valid. */
	intrs_enabled = ml_set_interrupts_enabled(FALSE);
	now = mach_absolute_time();
	/*
	 * While timeouts are not suspended,
	 * check all other processors for long outstanding interrupt handling.
	 */
	for (cpu = 0;
	    cpu < (cpu_t) real_ncpus && !machine_timeout_suspended();
	    cpu++) {
		/* Skip ourselves and CPUs that are not running. */
		if ((cpu == (cpu_t) cpu_number()) ||
		    (!cpu_is_running(cpu))) {
			continue;
		}
		/* A zero event time means the CPU is not in an interrupt handler. */
		cpu_int_event_time = cpu_datap(cpu)->cpu_int_event_time;
		if (cpu_int_event_time == 0) {
			continue;
		}
		if (__improbable(now < cpu_int_event_time)) {
			continue;       /* skip due to inter-processor skew */
		}
		cpu_int_state = cpu_datap(cpu)->cpu_int_state;
		if (__improbable(cpu_int_state == NULL)) {
			/* The interrupt may have been dismissed */
			continue;
		}

		/* Here with a cpu handling an interrupt */

		cpu_int_duration = now - cpu_int_event_time;
		if (__improbable(cpu_int_duration > LockTimeOut)) {
			/*
			 * ISR outstanding longer than LockTimeOut: NMI the
			 * offending CPU so its state is captured, then panic.
			 */
			cpu_int_num = saved_state64(cpu_int_state)->isf.trapno;
			cpu_rip = saved_state64(cpu_int_state)->isf.rip;
			vector_timed_out = cpu_int_num;
			NMIPI_panic(cpu_to_cpumask(cpu), INTERRUPT_WATCHDOG);
			panic("Interrupt watchdog, "
			    "cpu: %d interrupt: 0x%x time: %llu..%llu state: %p RIP: 0x%llx",
			    cpu, cpu_int_num, cpu_int_event_time, now, cpu_int_state, cpu_rip);
			/* NOT REACHED */
		} else if (__improbable(cpu_int_duration > (uint64_t) std_quantum)) {
			/*
			 * Longer than a scheduler quantum but not fatal:
			 * count it, log it, and return (interrupts restored
			 * before the printf, which may block).
			 */
			mp_interrupt_watchdog_events++;
			cpu_int_num = saved_state64(cpu_int_state)->isf.trapno;
			cpu_rip = saved_state64(cpu_int_state)->isf.rip;
			ml_set_interrupts_enabled(intrs_enabled);
			printf("Interrupt watchdog, "
			    "cpu: %d interrupt: 0x%x time: %llu..%llu RIP: 0x%llx\n",
			    cpu, cpu_int_num, cpu_int_event_time, now, cpu_rip);
			return;
		}
	}

	ml_set_interrupts_enabled(intrs_enabled);
}
2215 #endif
2216