xref: /xnu-10063.121.3/osfmk/i386/mp.c (revision 2c2f96dc2b9a4408a43d3150ae9c105355ca3daa)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 
32 #include <mach_kdp.h>
33 #include <kdp/kdp_internal.h>
34 #include <mach_ldebug.h>
35 
36 #include <mach/mach_types.h>
37 #include <mach/kern_return.h>
38 
39 #include <kern/kern_types.h>
40 #include <kern/startup.h>
41 #include <kern/timer_queue.h>
42 #include <kern/processor.h>
43 #include <kern/cpu_number.h>
44 #include <kern/cpu_data.h>
45 #include <kern/assert.h>
46 #include <kern/lock_group.h>
47 #include <kern/machine.h>
48 #include <kern/pms.h>
49 #include <kern/misc_protos.h>
50 #include <kern/timer_call.h>
51 #include <kern/zalloc.h>
52 #include <kern/queue.h>
53 #include <kern/monotonic.h>
54 #include <prng/random.h>
55 
56 #include <vm/vm_map.h>
57 #include <vm/vm_kern.h>
58 
59 #include <i386/bit_routines.h>
60 #include <i386/proc_reg.h>
61 #include <i386/cpu_threads.h>
62 #include <i386/mp_desc.h>
63 #include <i386/misc_protos.h>
64 #include <i386/trap_internal.h>
65 #include <i386/postcode.h>
66 #include <i386/machine_routines.h>
67 #include <i386/mp.h>
68 #include <i386/mp_events.h>
69 #include <i386/lapic.h>
70 #include <i386/cpuid.h>
71 #include <i386/fpu.h>
72 #include <i386/machine_cpu.h>
73 #include <i386/pmCPU.h>
74 #if CONFIG_MCA
75 #include <i386/machine_check.h>
76 #endif
77 #include <i386/acpi.h>
78 
79 #include <sys/kdebug.h>
80 
81 #include <console/serial_protos.h>
82 
83 #if KPERF
84 #include <kperf/kptimer.h>
85 #endif /* KPERF */
86 
87 #if     MP_DEBUG
88 #define PAUSE           delay(1000000)
89 #define DBG(x...)       kprintf(x)
90 #else
91 #define DBG(x...)
92 #define PAUSE
93 #endif  /* MP_DEBUG */
94 
95 /* Debugging/test trace events: */
96 #define TRACE_MP_TLB_FLUSH              MACHDBG_CODE(DBG_MACH_MP, 0)
97 #define TRACE_MP_CPUS_CALL              MACHDBG_CODE(DBG_MACH_MP, 1)
98 #define TRACE_MP_CPUS_CALL_LOCAL        MACHDBG_CODE(DBG_MACH_MP, 2)
99 #define TRACE_MP_CPUS_CALL_ACTION       MACHDBG_CODE(DBG_MACH_MP, 3)
100 #define TRACE_MP_CPUS_CALL_NOBUF        MACHDBG_CODE(DBG_MACH_MP, 4)
101 #define TRACE_MP_CPU_FAST_START         MACHDBG_CODE(DBG_MACH_MP, 5)
102 #define TRACE_MP_CPU_START              MACHDBG_CODE(DBG_MACH_MP, 6)
103 #define TRACE_MP_CPU_DEACTIVATE         MACHDBG_CODE(DBG_MACH_MP, 7)
104 
105 #define ABS(v)          (((v) > 0)?(v):-(v))
106 
107 void            slave_boot_init(void);
108 void            i386_cpu_IPI(int cpu);
109 
110 #if MACH_KDP
111 static void     mp_kdp_wait(boolean_t flush, boolean_t isNMI);
112 #endif /* MACH_KDP */
113 
114 #if MACH_KDP
115 static boolean_t        cpu_signal_pending(int cpu, mp_event_t event);
116 #endif /* MACH_KDP */
117 static int              NMIInterruptHandler(x86_saved_state_t *regs);
118 
119 boolean_t               smp_initialized = FALSE;
120 uint32_t                TSC_sync_margin = 0xFFF;
121 volatile boolean_t      force_immediate_debugger_NMI = FALSE;
122 volatile boolean_t      pmap_tlb_flush_timeout = FALSE;
123 #if DEBUG || DEVELOPMENT
124 boolean_t               mp_interrupt_watchdog_enabled = TRUE;
125 uint32_t                mp_interrupt_watchdog_events = 0;
126 #endif
127 
128 SIMPLE_LOCK_DECLARE(debugger_callback_lock, 0);
129 struct debugger_callback *debugger_callback = NULL;
130 
131 static LCK_GRP_DECLARE(smp_lck_grp, "i386_smp");
132 static LCK_MTX_DECLARE(mp_cpu_boot_lock, &smp_lck_grp);
133 
134 /* Variables needed for MP rendezvous. */
135 SIMPLE_LOCK_DECLARE(mp_rv_lock, 0);
136 static void     (*mp_rv_setup_func)(void *arg);
137 static void     (*mp_rv_action_func)(void *arg);
138 static void     (*mp_rv_teardown_func)(void *arg);
139 static void     *mp_rv_func_arg;
140 static volatile int     mp_rv_ncpus;
141 /* Cache-aligned barriers: */
142 static volatile long    mp_rv_entry    __attribute__((aligned(64)));
143 static volatile long    mp_rv_exit     __attribute__((aligned(64)));
144 static volatile long    mp_rv_complete __attribute__((aligned(64)));
145 
146 volatile        uint64_t        debugger_entry_time;
147 volatile        uint64_t        debugger_exit_time;
148 #if MACH_KDP
149 #include <kdp/kdp.h>
150 extern int kdp_snapshot;
151 static struct _kdp_xcpu_call_func {
152 	kdp_x86_xcpu_func_t func;
153 	void     *arg0, *arg1;
154 	volatile long     ret;
155 	volatile uint16_t cpu;
156 } kdp_xcpu_call_func = {
157 	.cpu  = KDP_XCPU_NONE
158 };
159 
160 #endif
161 
162 /* Variables needed for MP broadcast. */
163 static void        (*mp_bc_action_func)(void *arg);
164 static void        *mp_bc_func_arg;
165 static int      mp_bc_ncpus;
166 static volatile long   mp_bc_count;
167 static LCK_MTX_DECLARE(mp_bc_lock, &smp_lck_grp);
168 static  volatile int    debugger_cpu = -1;
169 volatile long    NMIPI_acks = 0;
170 volatile long    NMI_count = 0;
171 static int              vector_timed_out;
172 
173 NMI_reason_t    NMI_panic_reason = NONE;
174 extern void     NMI_cpus(void);
175 
176 static void     mp_cpus_call_init(void);
177 static void     mp_cpus_call_action(void);
178 static void     mp_call_PM(void);
179 
180 char            mp_slave_stack[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); // Temp stack for slave init
181 
182 /* PAL-related routines */
183 boolean_t i386_smp_init(int nmi_vector, i386_intr_func_t nmi_handler,
184     int ipi_vector, i386_intr_func_t ipi_handler);
185 void i386_start_cpu(int lapic_id, int cpu_num);
186 void i386_send_NMI(int cpu);
187 void NMIPI_enable(boolean_t);
188 
189 #define NUM_CPU_WARM_CALLS      20
190 struct timer_call       cpu_warm_call_arr[NUM_CPU_WARM_CALLS];
191 queue_head_t            cpu_warm_call_list;
192 decl_simple_lock_data(static, cpu_warm_lock);
193 
194 typedef struct cpu_warm_data {
195 	timer_call_t    cwd_call;
196 	uint64_t        cwd_deadline;
197 	int             cwd_result;
198 } *cpu_warm_data_t;
199 
200 static void             cpu_prewarm_init(void);
201 static void             cpu_warm_timer_call_func(timer_call_param_t p0, timer_call_param_t p1);
202 static void             _cpu_warm_setup(void *arg);
203 static timer_call_t     grab_warm_timer_call(void);
204 static void             free_warm_timer_call(timer_call_t call);
205 
/*
 * Initialize SMP support: install the NMI and inter-processor interrupt
 * handlers, initialize the cpu topology and the boot cpu's cross-call
 * state, and consume related boot-args.  Returns quietly (leaving
 * smp_initialized FALSE) if the platform layer declines SMP.
 */
void
smp_init(void)
{
	console_init();

	if (!i386_smp_init(LAPIC_NMI_INTERRUPT, NMIInterruptHandler,
	    LAPIC_VECTOR(INTERPROCESSOR), cpu_signal_handler)) {
		return;
	}

	cpu_thread_init();

	DBGLOG_CPU_INIT(master_cpu);

	mp_cpus_call_init();
	mp_cpus_call_cpu_init(master_cpu);

#if DEBUG || DEVELOPMENT
	/* "interrupt_watchdog" boot-arg overrides the default (enabled). */
	if (PE_parse_boot_argn("interrupt_watchdog",
	    &mp_interrupt_watchdog_enabled,
	    sizeof(mp_interrupt_watchdog_enabled))) {
		kprintf("Interrupt watchdog %sabled\n",
		    mp_interrupt_watchdog_enabled ? "en" : "dis");
	}
#endif

	/*
	 * "TSC_sync_margin" boot-arg overrides the default margin;
	 * under a VMM the check is disabled entirely (margin 0).
	 */
	if (PE_parse_boot_argn("TSC_sync_margin",
	    &TSC_sync_margin, sizeof(TSC_sync_margin))) {
		kprintf("TSC sync Margin 0x%x\n", TSC_sync_margin);
	} else if (cpuid_vmm_present()) {
		kprintf("TSC sync margin disabled\n");
		TSC_sync_margin = 0;
	}
	smp_initialized = TRUE;

	cpu_prewarm_init();

	return;
}
245 
246 typedef struct {
247 	int                     target_cpu;
248 	int                     target_lapic;
249 	int                     starter_cpu;
250 } processor_start_info_t;
251 static processor_start_info_t   start_info        __attribute__((aligned(64)));
252 
253 /*
254  * Cache-alignment is to avoid cross-cpu false-sharing interference.
255  */
256 static volatile long            tsc_entry_barrier __attribute__((aligned(64)));
257 static volatile long            tsc_exit_barrier  __attribute__((aligned(64)));
258 static volatile uint64_t        tsc_target        __attribute__((aligned(64)));
259 
260 /*
261  * Poll a CPU to see when it has marked itself as running.
262  */
263 static void
mp_wait_for_cpu_up(int slot_num,unsigned int iters,unsigned int usecdelay)264 mp_wait_for_cpu_up(int slot_num, unsigned int iters, unsigned int usecdelay)
265 {
266 	while (iters-- > 0) {
267 		if (cpu_datap(slot_num)->cpu_running) {
268 			break;
269 		}
270 		delay(usecdelay);
271 	}
272 }
273 
274 /*
275  * Quickly bring a CPU back online which has been halted.
276  */
277 kern_return_t
intel_startCPU_fast(int slot_num)278 intel_startCPU_fast(int slot_num)
279 {
280 	kern_return_t   rc;
281 
282 	/*
283 	 * Try to perform a fast restart
284 	 */
285 	rc = pmCPUExitHalt(slot_num);
286 	if (rc != KERN_SUCCESS) {
287 		/*
288 		 * The CPU was not eligible for a fast restart.
289 		 */
290 		return rc;
291 	}
292 
293 	KERNEL_DEBUG_CONSTANT(
294 		TRACE_MP_CPU_FAST_START | DBG_FUNC_START,
295 		slot_num, 0, 0, 0, 0);
296 
297 	/*
298 	 * Wait until the CPU is back online.
299 	 */
300 	mp_disable_preemption();
301 
302 	/*
303 	 * We use short pauses (1us) for low latency.  30,000 iterations is
304 	 * longer than a full restart would require so it should be more
305 	 * than long enough.
306 	 */
307 
308 	mp_wait_for_cpu_up(slot_num, 30000, 1);
309 	mp_enable_preemption();
310 
311 	KERNEL_DEBUG_CONSTANT(
312 		TRACE_MP_CPU_FAST_START | DBG_FUNC_END,
313 		slot_num, cpu_datap(slot_num)->cpu_running, 0, 0, 0);
314 
315 	/*
316 	 * Check to make sure that the CPU is really running.  If not,
317 	 * go through the slow path.
318 	 */
319 	if (cpu_datap(slot_num)->cpu_running) {
320 		return KERN_SUCCESS;
321 	} else {
322 		return KERN_FAILURE;
323 	}
324 }
325 
/*
 * Second-phase startup run on a newly-started cpu, after it has set
 * cpu_running.  When TSC_sync_margin checking is enabled, rendezvous
 * with the starter cpu at tsc_entry_barrier and publish this cpu's TSC
 * in tsc_target for comparison (see start_cpu()).
 */
static void
started_cpu(void)
{
	/* Here on the started cpu with cpu_running set TRUE */

	if (TSC_sync_margin &&
	    start_info.target_cpu == cpu_number()) {
		/*
		 * I've just started-up, synchronize again with the starter cpu
		 * and then snap my TSC.
		 */
		tsc_target   = 0;
		atomic_decl(&tsc_entry_barrier, 1);
		while (tsc_entry_barrier != 0) {
			;       /* spin for starter and target at barrier */
		}
		tsc_target = rdtsc64();
		atomic_decl(&tsc_exit_barrier, 1);
	}
}
346 
/*
 * Rendezvous action used by intel_startCPU(): every cpu runs it, but
 * only the designated starter does the work.  The starter kicks the
 * target via i386_start_cpu(), waits for it to mark itself running,
 * and (when enabled) cross-checks the target's TSC against its own
 * within TSC_sync_margin ticks.
 */
static void
start_cpu(void *arg)
{
	int                     i = 1000;
	processor_start_info_t  *psip = (processor_start_info_t *) arg;

	/* Ignore this if the current processor is not the starter */
	if (cpu_number() != psip->starter_cpu) {
		return;
	}

	DBG("start_cpu(%p) about to start cpu %d, lapic %d\n",
	    arg, psip->target_cpu, psip->target_lapic);

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_START | DBG_FUNC_START,
		psip->target_cpu,
		psip->target_lapic, 0, 0, 0);

	i386_start_cpu(psip->target_lapic, psip->target_cpu);

#ifdef  POSTCODE_DELAY
	/* Wait much longer if postcodes are displayed for a delay period. */
	i *= 10000;
#endif
	DBG("start_cpu(%p) about to wait for cpu %d\n",
	    arg, psip->target_cpu);

	mp_wait_for_cpu_up(psip->target_cpu, i * 100, 100);

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_START | DBG_FUNC_END,
		psip->target_cpu,
		cpu_datap(psip->target_cpu)->cpu_running, 0, 0, 0);

	if (TSC_sync_margin &&
	    cpu_datap(psip->target_cpu)->cpu_running) {
		/*
		 * Compare the TSC from the started processor with ours.
		 * Report and log/panic if it diverges by more than
		 * TSC_sync_margin (TSC_SYNC_MARGIN) ticks. This margin
		 * can be overridden by boot-arg (with 0 meaning no checking).
		 */
		uint64_t        tsc_starter;
		int64_t         tsc_delta;
		/* Meet the target at the entry barrier (see started_cpu()). */
		atomic_decl(&tsc_entry_barrier, 1);
		while (tsc_entry_barrier != 0) {
			;       /* spin for both processors at barrier */
		}
		tsc_starter = rdtsc64();
		atomic_decl(&tsc_exit_barrier, 1);
		while (tsc_exit_barrier != 0) {
			;       /* spin for target to store its TSC */
		}
		tsc_delta = tsc_target - tsc_starter;
		kprintf("TSC sync for cpu %d: 0x%016llx delta 0x%llx (%lld)\n",
		    psip->target_cpu, tsc_target, tsc_delta, tsc_delta);
#if DEBUG || DEVELOPMENT
		/*
		 * Stash the delta for inspection later, since we can no
		 * longer print/log it with interrupts disabled.
		 */
		cpu_datap(psip->target_cpu)->tsc_sync_delta = tsc_delta;
#endif
		if (ABS(tsc_delta) > (int64_t) TSC_sync_margin) {
#if DEBUG
			panic(
#else
			kprintf(
#endif
				"Unsynchronized  TSC for cpu %d: "
				"0x%016llx, delta 0x%llx\n",
				psip->target_cpu, tsc_target, tsc_delta);
		}
	}
}
423 
/*
 * Start (or restart) the processor in the given slot via the full slow
 * path: (re)initialize its descriptor tables, then run the startup
 * sequence under an all-cpu rendezvous with interrupts disabled.  If
 * the cpu fails to come up, this cpu announces the failure and halts.
 */
kern_return_t
intel_startCPU(
	int     slot_num)
{
	int             lapic = cpu_to_lapic[slot_num];
	boolean_t       istate;

	assert(lapic != -1);

	DBGLOG_CPU_INIT(slot_num);

	DBG("intel_startCPU(%d) lapic_id=%d\n", slot_num, lapic);
	DBG("IdlePTD(%p): 0x%x\n", &IdlePTD, (int) (uintptr_t)IdlePTD);

	/*
	 * Initialize (or re-initialize) the descriptor tables for this cpu.
	 * Propagate processor mode to slave.
	 */
	cpu_desc_init(cpu_datap(slot_num));

	/* Serialize use of the slave boot stack, etc. */
	lck_mtx_lock(&mp_cpu_boot_lock);

	istate = ml_set_interrupts_enabled(FALSE);
	if (slot_num == get_cpu_number()) {
		/* Asked to start the cpu we're running on: it's already up. */
		ml_set_interrupts_enabled(istate);
		lck_mtx_unlock(&mp_cpu_boot_lock);
		return KERN_SUCCESS;
	}

	start_info.starter_cpu  = cpu_number();
	start_info.target_cpu   = slot_num;
	start_info.target_lapic = lapic;
	tsc_entry_barrier = 2;
	tsc_exit_barrier = 2;

	/*
	 * Perform the processor startup sequence with all running
	 * processors rendezvous'ed. This is required during periods when
	 * the cache-disable bit is set for MTRR/PAT initialization.
	 */
	mp_rendezvous_no_intrs(start_cpu, (void *) &start_info);

	start_info.target_cpu = 0;

	ml_set_interrupts_enabled(istate);
	lck_mtx_unlock(&mp_cpu_boot_lock);

	if (!cpu_datap(slot_num)->cpu_running) {
		kprintf("Failed to start CPU %02d\n", slot_num);
		printf("Failed to start CPU %02d, rebooting...\n", slot_num);
		delay(1000000);
		halt_cpu();
		/* NOTE(review): assumes halt_cpu() does not return — confirm. */
		return KERN_SUCCESS;
	} else {
		kprintf("Started cpu %d (lapic id %08x)\n", slot_num, lapic);
		return KERN_SUCCESS;
	}
}
483 
484 #if     MP_DEBUG
485 cpu_signal_event_log_t  *cpu_signal[MAX_CPUS];
486 cpu_signal_event_log_t  *cpu_handle[MAX_CPUS];
487 
488 MP_EVENT_NAME_DECL();
489 
490 #endif  /* MP_DEBUG */
491 
492 /*
493  * Note: called with NULL state when polling for TLB flush and cross-calls.
494  */
/*
 * Service the signal bits posted in this cpu's cpu_signals word,
 * looping until the word drains.  When called with a NULL register
 * state (polling mode) only the KDP/TLB-flush/cross-call/PM events are
 * serviced and the loop exits after a single pass; AST checks require
 * a real interrupt frame.  Always returns 0.
 */
int
cpu_signal_handler(x86_saved_state_t *regs)
{
#if     !MACH_KDP
#pragma unused (regs)
#endif /* !MACH_KDP */
	int             my_cpu;
	volatile int    *my_word;

	SCHED_STATS_INC(ipi_count);

	my_cpu = cpu_number();
	my_word = &cpu_data_ptr[my_cpu]->cpu_signals;
	/* Store the initial set of signals for diagnostics. New
	 * signals could arrive while these are being processed
	 * so it's no more than a hint.
	 */

	cpu_data_ptr[my_cpu]->cpu_prior_signals = *my_word;

	do {
#if     MACH_KDP
		if (i_bit(MP_KDP, my_word)) {
			DBGLOG(cpu_handle, my_cpu, MP_KDP);
			i_bit_clear(MP_KDP, my_word);
/* Ensure that the i386_kernel_state at the base of the
 * current thread's stack (if any) is synchronized with the
 * context at the moment of the interrupt, to facilitate
 * access through the debugger.
 */
			sync_iss_to_iks(regs);
			if (pmsafe_debug && !kdp_snapshot) {
				pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
			}
			/* Park here until the debugger session ends. */
			mp_kdp_wait(TRUE, FALSE);
			if (pmsafe_debug && !kdp_snapshot) {
				pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
			}
		} else
#endif  /* MACH_KDP */
		if (i_bit(MP_TLB_FLUSH, my_word)) {
			DBGLOG(cpu_handle, my_cpu, MP_TLB_FLUSH);
			i_bit_clear(MP_TLB_FLUSH, my_word);
			pmap_update_interrupt();
		} else if (i_bit(MP_CALL, my_word)) {
			DBGLOG(cpu_handle, my_cpu, MP_CALL);
			i_bit_clear(MP_CALL, my_word);
			mp_cpus_call_action();
		} else if (i_bit(MP_CALL_PM, my_word)) {
			DBGLOG(cpu_handle, my_cpu, MP_CALL_PM);
			i_bit_clear(MP_CALL_PM, my_word);
			mp_call_PM();
		}
		if (regs == NULL) {
			/* Called to poll only for cross-calls and TLB flush */
			break;
		} else if (i_bit(MP_AST, my_word)) {
			DBGLOG(cpu_handle, my_cpu, MP_AST);
			i_bit_clear(MP_AST, my_word);
			ast_check(cpu_to_processor(my_cpu));
		}
	} while (*my_word);

	return 0;
}
560 
/*
 * Cross-cpu NMI callback run when PTE corruption has been detected:
 * capture this cpu's frame pointer, format a panic string naming the
 * corrupted PTE pointer and its value, and panic with a backtrace.
 */
long
NMI_pte_corruption_callback(__unused void *arg0, __unused void *arg1, uint16_t lcpu)
{
	static char     pstr[256];      /* global since this callback is serialized */
	void            *stackptr;
	/* Grab the current frame pointer for the backtrace below. */
	__asm__ volatile ("movq %%rbp, %0" : "=m" (stackptr));

	snprintf(&pstr[0], sizeof(pstr),
	    "Panic(CPU %d): PTE corruption detected on PTEP 0x%llx VAL 0x%llx\n",
	    lcpu, (unsigned long long)(uintptr_t)PTE_corrupted_ptr, *(uint64_t *)PTE_corrupted_ptr);
	panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, current_cpu_datap()->cpu_int_state);
	return 0;
}
574 
575 extern void kprintf_break_lock(void);
/*
 * Handler for NMIs delivered to this cpu (installed by smp_init()).
 * NMIs arrive here either because another cpu is panicking an
 * unresponsive processor (spinlock, TLB-flush, cross-call or
 * interrupt-watchdog timeouts, distinguished by NMI_panic_reason) or
 * as part of debugger entry; cpus other than the one driving the
 * debugger are parked in mp_kdp_wait().
 */
int
NMIInterruptHandler(x86_saved_state_t *regs)
{
	void            *stackptr;
	char            pstr[256];
	uint64_t        now = mach_absolute_time();

	/* Panic already underway with debugging disabled: just spin here. */
	if (panic_active() && !panicDebugging) {
		if (pmsafe_debug) {
			pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
		}
		for (;;) {
			cpu_pause();
		}
	}

	atomic_incl(&NMIPI_acks, 1);
	atomic_incl(&NMI_count, 1);
	/* Save register context for the debugger; grab our frame pointer. */
	sync_iss_to_iks_unconditionally(regs);
	__asm__ volatile ("movq %%rbp, %0" : "=m" (stackptr));

	/* The cpu running the debugger does not park itself below. */
	if (cpu_number() == debugger_cpu) {
		goto NMExit;
	}

	/* If this NMI was sent to panic an unresponsive cpu, do so now. */
	if (NMI_panic_reason == SPINLOCK_TIMEOUT) {
		lck_spinlock_to_info_t lsti;

		lsti = os_atomic_load(&lck_spinlock_timeout_in_progress, acquire);
		snprintf(&pstr[0], sizeof(pstr),
		    "Panic(CPU %d, time %llu): NMIPI for spinlock acquisition timeout, spinlock: %p, "
		    "spinlock owner: %p, current_thread: %p, spinlock_owner_cpu: 0x%x\n",
		    cpu_number(), now, lsti->lock, (void *)lsti->owner_thread_cur,
		    current_thread(), lsti->owner_cpu);
		panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs);
	} else if (NMI_panic_reason == TLB_FLUSH_TIMEOUT) {
		snprintf(&pstr[0], sizeof(pstr),
		    "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: TLB flush timeout, TLB state:0x%x\n",
		    cpu_number(), now, current_cpu_datap()->cpu_tlb_invalid);
		panic_i386_backtrace(stackptr, 48, &pstr[0], TRUE, regs);
	} else if (NMI_panic_reason == CROSSCALL_TIMEOUT) {
		snprintf(&pstr[0], sizeof(pstr),
		    "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: cross-call timeout\n",
		    cpu_number(), now);
		panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs);
	} else if (NMI_panic_reason == INTERRUPT_WATCHDOG) {
		snprintf(&pstr[0], sizeof(pstr),
		    "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: interrupt watchdog for vector 0x%x\n",
		    cpu_number(), now, vector_timed_out);
		panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs);
	}

#if MACH_KDP
	if (pmsafe_debug && !kdp_snapshot) {
		pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
	}
	/* Acknowledge the NMI and clear any pending KDP signal bit. */
	current_cpu_datap()->cpu_NMI_acknowledged = TRUE;
	i_bit_clear(MP_KDP, &current_cpu_datap()->cpu_signals);
	if (panic_active() || NMI_panic_reason != NONE) {
		mp_kdp_wait(FALSE, TRUE);
	} else if (!mp_kdp_trap &&
	    !mp_kdp_is_NMI &&
	    virtualized && (debug_boot_arg & DB_NMI)) {
		/*
		 * Under a VMM with the debug boot-arg set, drop into kdp.
		 * Since an NMI is involved, there's a risk of contending with
		 * a panic. And side-effects of NMIs may result in entry into,
		 * and continuing from, the debugger being unreliable.
		 */
		if (__sync_bool_compare_and_swap(&mp_kdp_is_NMI, FALSE, TRUE)) {
			kprintf_break_lock();

			DebuggerWithContext(EXC_BREAKPOINT, saved_state64(regs),
			    "requested by NMI", DEBUGGER_OPTION_NONE,
			    (unsigned long)(char *)__builtin_return_address(0));

			mp_kdp_is_NMI = FALSE;
		} else {
			mp_kdp_wait(FALSE, FALSE);
		}
	} else {
		mp_kdp_wait(FALSE, FALSE);
	}
	if (pmsafe_debug && !kdp_snapshot) {
		pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
	}
#endif
NMExit:
	return 1;
}
666 
667 /*
668  * cpu_interrupt is really just to be used by the scheduler to
669  * get a CPU's attention it may not always issue an IPI.  If an
670  * IPI is always needed then use i386_cpu_IPI.
671  */
672 void
cpu_interrupt(int cpu)673 cpu_interrupt(int cpu)
674 {
675 	boolean_t did_IPI = FALSE;
676 
677 	if (smp_initialized
678 	    && pmCPUExitIdle(cpu_datap(cpu))) {
679 		i386_cpu_IPI(cpu);
680 		did_IPI = TRUE;
681 	}
682 
683 	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), cpu, did_IPI, 0, 0, 0);
684 }
685 
686 /*
687  * Send a true NMI via the local APIC to the specified CPU.
688  */
689 void
cpu_NMI_interrupt(int cpu)690 cpu_NMI_interrupt(int cpu)
691 {
692 	if (smp_initialized) {
693 		i386_send_NMI(cpu);
694 	}
695 }
696 
/*
 * With interrupts disabled, NMI each running cpu in turn (including
 * this one) and spin until it acknowledges, servicing pending TLB
 * flushes while waiting.  Panics if a cpu fails to acknowledge within
 * ~10^10 TSC ticks (the timeout is disabled while machine timeouts
 * are suspended).
 */
void
NMI_cpus(void)
{
	unsigned int    cpu;
	boolean_t       intrs_enabled;
	uint64_t        tsc_timeout;

	intrs_enabled = ml_set_interrupts_enabled(FALSE);
	NMIPI_enable(TRUE);
	for (cpu = 0; cpu < real_ncpus; cpu++) {
		if (!cpu_is_running(cpu)) {
			continue;
		}
		cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
		cpu_NMI_interrupt(cpu);
		tsc_timeout = !machine_timeout_suspended() ?
		    rdtsc64() + (1000 * 1000 * 1000 * 10ULL) :
		    ~0ULL;
		while (!cpu_datap(cpu)->cpu_NMI_acknowledged) {
			handle_pending_TLB_flushes();
			cpu_pause();
			if (rdtsc64() > tsc_timeout) {
				panic("NMI_cpus() timeout cpu %d", cpu);
			}
		}
		/* Reset the flag for the next use of this mechanism. */
		cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
	}
	NMIPI_enable(FALSE);

	ml_set_interrupts_enabled(intrs_enabled);
}
728 
729 static void(*volatile mp_PM_func)(void) = NULL;
730 
731 static void
mp_call_PM(void)732 mp_call_PM(void)
733 {
734 	assert(!ml_get_interrupts_enabled());
735 
736 	if (mp_PM_func != NULL) {
737 		mp_PM_func();
738 	}
739 }
740 
741 void
cpu_PM_interrupt(int cpu)742 cpu_PM_interrupt(int cpu)
743 {
744 	assert(!ml_get_interrupts_enabled());
745 
746 	if (mp_PM_func != NULL) {
747 		if (cpu == cpu_number()) {
748 			mp_PM_func();
749 		} else {
750 			i386_signal_cpu(cpu, MP_CALL_PM, ASYNC);
751 		}
752 	}
753 }
754 
755 void
PM_interrupt_register(void (* fn)(void))756 PM_interrupt_register(void (*fn)(void))
757 {
758 	mp_PM_func = fn;
759 }
760 
/*
 * Post 'event' in the target cpu's signal word and send it an IPI.
 * ASYNC mode returns immediately.  SYNC mode spins until the target
 * clears the bit, logging (DBG) and retrying indefinitely each time a
 * ~10^9 TSC-tick window elapses without acknowledgment.  No-op if the
 * target cpu is not running.
 */
void
i386_signal_cpu(int cpu, mp_event_t event, mp_sync_t mode)
{
	volatile int    *signals = &cpu_datap(cpu)->cpu_signals;
	uint64_t        tsc_timeout;


	if (!cpu_datap(cpu)->cpu_running) {
		return;
	}

	if (event == MP_TLB_FLUSH) {
		KERNEL_DEBUG(TRACE_MP_TLB_FLUSH | DBG_FUNC_START, cpu, 0, 0, 0, 0);
	}

	DBGLOG(cpu_signal, cpu, event);

	i_bit_set(event, signals);
	i386_cpu_IPI(cpu);
	if (mode == SYNC) {
again:
		/* Timeout is effectively infinite while timeouts are suspended. */
		tsc_timeout = !machine_timeout_suspended() ?
		    rdtsc64() + (1000 * 1000 * 1000) :
		    ~0ULL;
		while (i_bit(event, signals) && rdtsc64() < tsc_timeout) {
			cpu_pause();
		}
		if (i_bit(event, signals)) {
			DBG("i386_signal_cpu(%d, 0x%x, SYNC) timed out\n",
			    cpu, event);
			goto again;
		}
	}
	if (event == MP_TLB_FLUSH) {
		KERNEL_DEBUG(TRACE_MP_TLB_FLUSH | DBG_FUNC_END, cpu, 0, 0, 0, 0);
	}
}
798 
799 /*
800  * Helper function called when busy-waiting: panic if too long
801  * a TSC-based time has elapsed since the start of the spin.
802  */
803 static boolean_t
mp_spin_timeout(uint64_t tsc_start)804 mp_spin_timeout(uint64_t tsc_start)
805 {
806 	uint64_t        tsc_timeout;
807 
808 	cpu_pause();
809 	if (machine_timeout_suspended()) {
810 		return FALSE;
811 	}
812 
813 	/*
814 	 * The timeout is 4 * the spinlock timeout period
815 	 * unless we have serial console printing (kprintf) enabled
816 	 * in which case we allow an even greater margin.
817 	 */
818 	tsc_timeout = disable_serial_output ? LockTimeOutTSC << 2
819 	        : LockTimeOutTSC << 4;
820 	return rdtsc64() > tsc_start + tsc_timeout;
821 }
822 
823 /*
824  * Helper function to take a spinlock while ensuring that incoming IPIs
825  * are still serviced if interrupts are masked while we spin.
826  * Returns current interrupt state.
827  */
boolean_t
mp_safe_spin_lock(usimple_lock_t lock)
{
	/* With interrupts enabled, IPIs are serviced normally: just lock. */
	if (ml_get_interrupts_enabled()) {
		simple_lock(lock, LCK_GRP_NULL);
		return TRUE;
	}

	lck_spinlock_to_info_t lsti;
	uint64_t tsc_spin_start = rdtsc64();

	/*
	 * Interrupts are masked: poll pending cross-calls/TLB flushes by
	 * hand on each failed acquisition attempt so we can't deadlock
	 * with a cpu that is waiting on us.
	 */
	while (!simple_lock_try(lock, LCK_GRP_NULL)) {
		cpu_signal_handler(NULL);
		if (mp_spin_timeout(tsc_spin_start)) {
			uintptr_t lowner = (uintptr_t)lock->interlock.lock_data;

			/* NMI the owner cpu for its backtrace, then panic. */
			lsti = lck_spinlock_timeout_hit(lock, lowner);
			NMIPI_panic(cpu_to_cpumask(lsti->owner_cpu), SPINLOCK_TIMEOUT);
			panic("mp_safe_spin_lock() timed out, lock: %p, "
			    "owner thread: 0x%lx, current_thread: %p, "
			    "owner on CPU 0x%x, time: %llu",
			    lock, lowner, current_thread(),
			    lsti->owner_cpu, mach_absolute_time());
		}
	}

	return FALSE;
}
856 
857 /*
858  * All-CPU rendezvous:
859  *      - CPUs are signalled,
860  *	- all execute the setup function (if specified),
861  *	- rendezvous (i.e. all cpus reach a barrier),
862  *	- all execute the action function (if specified),
863  *	- rendezvous again,
864  *	- execute the teardown function (if specified), and then
865  *	- resume.
866  *
867  * Note that the supplied external functions _must_ be reentrant and aware
868  * that they are running in parallel and in an unknown lock context.
869  */
870 
/*
 * Per-cpu executor for mp_rendezvous(): run setup, meet at the entry
 * barrier, run the action, meet at the exit barrier, run teardown,
 * then bump the completion count.  Barrier spins panic if they exceed
 * mp_spin_timeout()'s allowance.
 */
static void
mp_rendezvous_action(__unused void *null)
{
	boolean_t       intrs_enabled;
	uint64_t        tsc_spin_start;

	/*
	 * Note that mp_rv_lock was acquired by the thread that initiated the
	 * rendezvous and must have been acquired before we enter
	 * mp_rendezvous_action().
	 */
	current_cpu_datap()->cpu_rendezvous_in_progress = TRUE;

	/* setup function */
	if (mp_rv_setup_func != NULL) {
		mp_rv_setup_func(mp_rv_func_arg);
	}

	intrs_enabled = ml_get_interrupts_enabled();

	/* spin on entry rendezvous */
	atomic_incl(&mp_rv_entry, 1);
	tsc_spin_start = rdtsc64();

	while (mp_rv_entry < mp_rv_ncpus) {
		/* poll for pesky tlb flushes if interrupts disabled */
		if (!intrs_enabled) {
			handle_pending_TLB_flushes();
		}
		if (mp_spin_timeout(tsc_spin_start)) {
			panic("mp_rv_action() entry: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_entry, mp_rv_ncpus, tsc_spin_start, rdtsc64());
		}
	}

	/* action function */
	if (mp_rv_action_func != NULL) {
		mp_rv_action_func(mp_rv_func_arg);
	}

	/* spin on exit rendezvous */
	atomic_incl(&mp_rv_exit, 1);
	tsc_spin_start = rdtsc64();
	while (mp_rv_exit < mp_rv_ncpus) {
		if (!intrs_enabled) {
			handle_pending_TLB_flushes();
		}
		if (mp_spin_timeout(tsc_spin_start)) {
			panic("mp_rv_action() exit: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_exit, mp_rv_ncpus, tsc_spin_start, rdtsc64());
		}
	}

	/* teardown function */
	if (mp_rv_teardown_func != NULL) {
		mp_rv_teardown_func(mp_rv_func_arg);
	}

	current_cpu_datap()->cpu_rendezvous_in_progress = FALSE;

	/* Bump completion count */
	atomic_incl(&mp_rv_complete, 1);
}
932 
/*
 * Rendezvous all running cpus and have each run the supplied setup /
 * action / teardown functions (any may be NULL) as described in the
 * comment above.  Before SMP is initialized the functions are simply
 * run locally.  The caller's cpu participates; the static mp_rv_*
 * state is protected by mp_rv_lock for the duration.
 */
void
mp_rendezvous(void (*setup_func)(void *),
    void (*action_func)(void *),
    void (*teardown_func)(void *),
    void *arg)
{
	uint64_t        tsc_spin_start;

	/* Uniprocessor (pre-SMP) case: just run the functions here. */
	if (!smp_initialized) {
		if (setup_func != NULL) {
			setup_func(arg);
		}
		if (action_func != NULL) {
			action_func(arg);
		}
		if (teardown_func != NULL) {
			teardown_func(arg);
		}
		return;
	}

	/* obtain rendezvous lock */
	mp_rendezvous_lock();

	/* set static function pointers */
	mp_rv_setup_func = setup_func;
	mp_rv_action_func = action_func;
	mp_rv_teardown_func = teardown_func;
	mp_rv_func_arg = arg;

	mp_rv_entry    = 0;
	mp_rv_exit     = 0;
	mp_rv_complete = 0;

	/*
	 * signal other processors, which will call mp_rendezvous_action()
	 * with interrupts disabled
	 */
	mp_rv_ncpus = mp_cpus_call(CPUMASK_OTHERS, NOSYNC, &mp_rendezvous_action, NULL) + 1;

	/* call executor function on this cpu */
	mp_rendezvous_action(NULL);

	/*
	 * Spin for everyone to complete.
	 * This is necessary to ensure that all processors have proceeded
	 * from the exit barrier before we release the rendezvous structure.
	 */
	tsc_spin_start = rdtsc64();
	while (mp_rv_complete < mp_rv_ncpus) {
		if (mp_spin_timeout(tsc_spin_start)) {
			panic("mp_rendezvous() timeout: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_complete, mp_rv_ncpus, tsc_spin_start, rdtsc64());
		}
	}

	/* Tidy up */
	mp_rv_setup_func = NULL;
	mp_rv_action_func = NULL;
	mp_rv_teardown_func = NULL;
	mp_rv_func_arg = NULL;

	/* release lock */
	mp_rendezvous_unlock();
}
997 
/* Acquire the global rendezvous spinlock (timeout-checked acquire). */
void
mp_rendezvous_lock(void)
{
	(void) mp_safe_spin_lock(&mp_rv_lock);
}
1003 
/* Release the global rendezvous spinlock. */
void
mp_rendezvous_unlock(void)
{
	simple_unlock(&mp_rv_lock);
}
1009 
/*
 * Forcibly re-initialize the rendezvous lock; used to break a lock that
 * may be held by a stopped cpu (e.g. on the debugger/panic path).
 */
void
mp_rendezvous_break_lock(void)
{
	simple_lock_init(&mp_rv_lock, 0);
}
1015 
1016 static void
setup_disable_intrs(__unused void * param_not_used)1017 setup_disable_intrs(__unused void * param_not_used)
1018 {
1019 	/* disable interrupts before the first barrier */
1020 	boolean_t intr = ml_set_interrupts_enabled(FALSE);
1021 
1022 	current_cpu_datap()->cpu_iflag = intr;
1023 	DBG("CPU%d: %s\n", get_cpu_number(), __FUNCTION__);
1024 }
1025 
1026 static void
teardown_restore_intrs(__unused void * param_not_used)1027 teardown_restore_intrs(__unused void * param_not_used)
1028 {
1029 	/* restore interrupt flag following MTRR changes */
1030 	ml_set_interrupts_enabled(current_cpu_datap()->cpu_iflag);
1031 	DBG("CPU%d: %s\n", get_cpu_number(), __FUNCTION__);
1032 }
1033 
1034 /*
1035  * A wrapper to mp_rendezvous() to call action_func() with interrupts disabled.
1036  * This is exported for use by kexts.
1037  */
1038 void
mp_rendezvous_no_intrs(void (* action_func)(void *),void * arg)1039 mp_rendezvous_no_intrs(
1040 	void (*action_func)(void *),
1041 	void *arg)
1042 {
1043 	mp_rendezvous(setup_disable_intrs,
1044 	    action_func,
1045 	    teardown_restore_intrs,
1046 	    arg);
1047 }
1048 
1049 
/* A single queued cross-call request. */
typedef struct {
	queue_chain_t   link;                   /* queue linkage */
	void            (*func)(void *, void *); /* routine to call */
	void            *arg0;                  /* routine's 1st arg */
	void            *arg1;                  /* routine's 2nd arg */
	cpumask_t       *maskp;                 /* completion response mask */
} mp_call_t;
1057 
1058 
/*
 * A locked queue of mp_call_t buffers: one per cpu (mp_cpus_call_head)
 * plus a shared freelist (mp_cpus_call_freelist).  The lock is always
 * taken with interrupts disabled (see mp_call_head_lock()).
 */
typedef struct {
	queue_head_t            queue;
	decl_simple_lock_data(, lock);
} mp_call_queue_t;
#define MP_CPUS_CALL_BUFS_PER_CPU       MAX_CPUS
static mp_call_queue_t  mp_cpus_call_freelist;
static mp_call_queue_t  mp_cpus_call_head[MAX_CPUS];
1066 
1067 static inline boolean_t
mp_call_head_lock(mp_call_queue_t * cqp)1068 mp_call_head_lock(mp_call_queue_t *cqp)
1069 {
1070 	boolean_t       intrs_enabled;
1071 
1072 	intrs_enabled = ml_set_interrupts_enabled(FALSE);
1073 	simple_lock(&cqp->lock, LCK_GRP_NULL);
1074 
1075 	return intrs_enabled;
1076 }
1077 
/*
 * Deliver an NMIPI to a set of processors to cause them to panic.
 */
void
NMIPI_panic(cpumask_t cpu_mask, NMI_reason_t why)
{
	unsigned int cpu;
	cpumask_t cpu_bit;
	uint64_t deadline;

	/* Record the reason before sending the NMIs */
	NMIPI_enable(TRUE);
	NMI_panic_reason = why;

	/* Send an NMI to each cpu in the mask, clearing its ack flag first */
	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
		if ((cpu_mask & cpu_bit) == 0) {
			continue;
		}
		cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
		cpu_NMI_interrupt(cpu);
	}

	/* Wait (only so long) for NMI'ed cpus to respond */
	deadline = mach_absolute_time() + LockTimeOut;
	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
		if ((cpu_mask & cpu_bit) == 0) {
			continue;
		}
		while (!cpu_datap(cpu)->cpu_NMI_acknowledged &&
		    mach_absolute_time() < deadline) {
			cpu_pause();
		}
	}
}
1111 
#if MACH_ASSERT
/* Debug-only: TRUE iff the queue lock is held and interrupts are masked. */
static inline boolean_t
mp_call_head_is_locked(mp_call_queue_t *cqp)
{
	if (ml_get_interrupts_enabled()) {
		return FALSE;
	}
	return hw_lock_held((hw_lock_t)&cqp->lock);
}
#endif
1120 
1121 static inline void
mp_call_head_unlock(mp_call_queue_t * cqp,boolean_t intrs_enabled)1122 mp_call_head_unlock(mp_call_queue_t *cqp, boolean_t intrs_enabled)
1123 {
1124 	simple_unlock(&cqp->lock);
1125 	ml_set_interrupts_enabled(intrs_enabled);
1126 }
1127 
1128 static inline mp_call_t *
mp_call_alloc(void)1129 mp_call_alloc(void)
1130 {
1131 	mp_call_t       *callp = NULL;
1132 	boolean_t       intrs_enabled;
1133 	mp_call_queue_t *cqp = &mp_cpus_call_freelist;
1134 
1135 	intrs_enabled = mp_call_head_lock(cqp);
1136 	if (!queue_empty(&cqp->queue)) {
1137 		queue_remove_first(&cqp->queue, callp, typeof(callp), link);
1138 	}
1139 	mp_call_head_unlock(cqp, intrs_enabled);
1140 
1141 	return callp;
1142 }
1143 
1144 static inline void
mp_call_free(mp_call_t * callp)1145 mp_call_free(mp_call_t *callp)
1146 {
1147 	boolean_t       intrs_enabled;
1148 	mp_call_queue_t *cqp = &mp_cpus_call_freelist;
1149 
1150 	intrs_enabled = mp_call_head_lock(cqp);
1151 	queue_enter_first(&cqp->queue, callp, typeof(callp), link);
1152 	mp_call_head_unlock(cqp, intrs_enabled);
1153 }
1154 
1155 static inline mp_call_t *
mp_call_dequeue_locked(mp_call_queue_t * cqp)1156 mp_call_dequeue_locked(mp_call_queue_t *cqp)
1157 {
1158 	mp_call_t       *callp = NULL;
1159 
1160 	assert(mp_call_head_is_locked(cqp));
1161 	if (!queue_empty(&cqp->queue)) {
1162 		queue_remove_first(&cqp->queue, callp, typeof(callp), link);
1163 	}
1164 	return callp;
1165 }
1166 
/* Append a call to a queue; the caller must hold the queue lock. */
static inline void
mp_call_enqueue_locked(
	mp_call_queue_t *cqp,
	mp_call_t       *callp)
{
	queue_enter(&cqp->queue, callp, typeof(callp), link);
}
1174 
1175 /* Called on the boot processor to initialize global structures */
1176 static void
mp_cpus_call_init(void)1177 mp_cpus_call_init(void)
1178 {
1179 	mp_call_queue_t *cqp = &mp_cpus_call_freelist;
1180 
1181 	DBG("mp_cpus_call_init()\n");
1182 	simple_lock_init(&cqp->lock, 0);
1183 	queue_init(&cqp->queue);
1184 }
1185 
1186 /*
1187  * Called at processor registration to add call buffers to the free list
1188  * and to initialize the per-cpu call queue.
1189  */
1190 void
mp_cpus_call_cpu_init(int cpu)1191 mp_cpus_call_cpu_init(int cpu)
1192 {
1193 	int             i;
1194 	mp_call_queue_t *cqp = &mp_cpus_call_head[cpu];
1195 	mp_call_t       *callp;
1196 
1197 	simple_lock_init(&cqp->lock, 0);
1198 	queue_init(&cqp->queue);
1199 	for (i = 0; i < MP_CPUS_CALL_BUFS_PER_CPU; i++) {
1200 		callp = zalloc_permanent_type(mp_call_t);
1201 		mp_call_free(callp);
1202 	}
1203 
1204 	DBG("mp_cpus_call_init(%d) done\n", cpu);
1205 }
1206 
/*
 * This is called from cpu_signal_handler() to process an MP_CALL signal.
 * And also from i386_deactivate_cpu() when a cpu is being taken offline.
 */
static void
mp_cpus_call_action(void)
{
	mp_call_queue_t *cqp;
	boolean_t       intrs_enabled;
	mp_call_t       *callp;
	mp_call_t       call;

	assert(!ml_get_interrupts_enabled());
	cqp = &mp_cpus_call_head[cpu_number()];
	intrs_enabled = mp_call_head_lock(cqp);
	while ((callp = mp_call_dequeue_locked(cqp)) != NULL) {
		/* Copy call request to the stack to free buffer */
		call = *callp;
		mp_call_free(callp);
		if (call.func != NULL) {
			/*
			 * Drop the queue lock while running the action so
			 * further calls can be queued to this cpu meanwhile.
			 */
			mp_call_head_unlock(cqp, intrs_enabled);
			KERNEL_DEBUG_CONSTANT(
				TRACE_MP_CPUS_CALL_ACTION,
				VM_KERNEL_UNSLIDE(call.func), VM_KERNEL_UNSLIDE_OR_PERM(call.arg0),
				VM_KERNEL_UNSLIDE_OR_PERM(call.arg1), VM_KERNEL_ADDRPERM(call.maskp), 0);
			call.func(call.arg0, call.arg1);
			(void) mp_call_head_lock(cqp);
		}
		/* Acknowledge completion to a waiting caller, if any */
		if (call.maskp != NULL) {
			i_bit_set(cpu_number(), call.maskp);
		}
	}
	mp_call_head_unlock(cqp, intrs_enabled);
}
1241 
1242 #pragma clang diagnostic push
1243 #pragma clang diagnostic ignored "-Wcast-function-type"
1244 
/*
 * mp_cpus_call() runs a given function on cpus specified in a given cpu mask.
 * Possible modes are:
 *  SYNC:   function is called serially on target cpus in logical cpu order
 *	    waiting for each call to be acknowledged before proceeding
 *  ASYNC:  function call is queued to the specified cpus
 *	    waiting for all calls to complete in parallel before returning
 *  NOSYNC: function calls are queued
 *	    but we return before confirmation of calls completing.
 * The action function may be NULL.
 * The cpu mask may include the local cpu. Offline cpus are ignored.
 * The return value is the number of cpus on which the call was made or queued.
 */
cpu_t
mp_cpus_call(
	cpumask_t       cpus,
	mp_sync_t       mode,
	void            (*action_func)(void *),
	void            *arg)
{
	/*
	 * Adapt the one-argument action to the two-argument form expected by
	 * mp_cpus_call1(); the cast is why this block is wrapped in the
	 * -Wcast-function-type pragma.
	 */
	return mp_cpus_call1(
		cpus,
		mode,
		(void (*)(void *, void *))action_func,
		arg,
		NULL,
		NULL);
}
1273 
1274 #pragma clang diagnostic pop
1275 
/*
 * Spin until every cpu in cpus_called has set its bit in *cpus_responded.
 * While spinning with interrupts disabled, service our own pending
 * cross-calls and signals to avoid deadlocking against cpus that may be
 * waiting on us.  Panics (after NMI-ing the stragglers) on timeout.
 */
static void
mp_cpus_call_wait(boolean_t     intrs_enabled,
    cpumask_t     cpus_called,
    cpumask_t     *cpus_responded)
{
	mp_call_queue_t         *cqp;
	uint64_t                tsc_spin_start;

	assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
	cqp = &mp_cpus_call_head[cpu_number()];

	tsc_spin_start = rdtsc64();
	while (*cpus_responded != cpus_called) {
		if (!intrs_enabled) {
			/* Sniffing w/o locking */
			if (!queue_empty(&cqp->queue)) {
				mp_cpus_call_action();
			}
			cpu_signal_handler(NULL);
		}
		if (mp_spin_timeout(tsc_spin_start)) {
			cpumask_t       cpus_unresponsive;

			cpus_unresponsive = cpus_called & ~(*cpus_responded);
			NMIPI_panic(cpus_unresponsive, CROSSCALL_TIMEOUT);
			panic("mp_cpus_call_wait() timeout, cpus: 0x%llx",
			    cpus_unresponsive);
		}
	}
}
1306 
/*
 * Core of the cross-call mechanism (see mp_cpus_call() for mode semantics).
 * Queues action_func(arg0, arg1) to each requested, running cpu and IPIs it;
 * runs locally when the mask includes this cpu.  If cpus_calledp is non-NULL
 * it receives the mask of cpus actually called.  Returns the number of cpus
 * on which the call was made or queued.
 */
cpu_t
mp_cpus_call1(
	cpumask_t       cpus,
	mp_sync_t       mode,
	void            (*action_func)(void *, void *),
	void            *arg0,
	void            *arg1,
	cpumask_t       *cpus_calledp)
{
	cpu_t           cpu = 0;
	boolean_t       intrs_enabled = FALSE;
	boolean_t       call_self = FALSE;
	cpumask_t       cpus_called = 0;
	cpumask_t       cpus_responded = 0;
	long            cpus_call_count = 0;
	uint64_t        tsc_spin_start;
	boolean_t       topo_lock;

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPUS_CALL | DBG_FUNC_START,
		cpus, mode, VM_KERNEL_UNSLIDE(action_func), VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1));

	/* Pre-SMP: only a local call is possible. */
	if (!smp_initialized) {
		if ((cpus & CPUMASK_SELF) == 0) {
			goto out;
		}
		if (action_func != NULL) {
			intrs_enabled = ml_set_interrupts_enabled(FALSE);
			action_func(arg0, arg1);
			ml_set_interrupts_enabled(intrs_enabled);
		}
		call_self = TRUE;
		goto out;
	}

	/*
	 * Queue the call for each non-local requested cpu.
	 * This is performed under the topo lock to prevent changes to
	 * cpus online state and to prevent concurrent rendezvouses --
	 * although an exception is made if we're calling only the master
	 * processor since that always remains active. Note: this exception
	 * is expected for longterm timer nosync cross-calls to the master cpu.
	 */
	mp_disable_preemption();
	intrs_enabled = ml_get_interrupts_enabled();
	topo_lock = (cpus != cpu_to_cpumask(master_cpu));
	if (topo_lock) {
		ml_set_interrupts_enabled(FALSE);
		(void) mp_safe_spin_lock(&x86_topo_lock);
	}
	for (cpu = 0; cpu < (cpu_t) real_ncpus; cpu++) {
		/* Skip cpus not in the mask and cpus that are offline */
		if (((cpu_to_cpumask(cpu) & cpus) == 0) ||
		    !cpu_is_running(cpu)) {
			continue;
		}
		tsc_spin_start = rdtsc64();
		if (cpu == (cpu_t) cpu_number()) {
			/*
			 * We don't IPI ourself and if calling asynchronously,
			 * we defer our call until we have signalled all others.
			 */
			call_self = TRUE;
			if (mode == SYNC && action_func != NULL) {
				KERNEL_DEBUG_CONSTANT(
					TRACE_MP_CPUS_CALL_LOCAL,
					VM_KERNEL_UNSLIDE(action_func),
					VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1), 0, 0);
				action_func(arg0, arg1);
			}
		} else {
			/*
			 * Here to queue a call to cpu and IPI.
			 */
			mp_call_t       *callp = NULL;
			mp_call_queue_t *cqp = &mp_cpus_call_head[cpu];
			boolean_t       intrs_inner;

queue_call:
			if (callp == NULL) {
				callp = mp_call_alloc();
			}
			intrs_inner = mp_call_head_lock(cqp);
			if (callp == NULL) {
				/*
				 * No free call buffer: drop the lock, service
				 * our own pending work, and retry (with timeout).
				 */
				mp_call_head_unlock(cqp, intrs_inner);
				KERNEL_DEBUG_CONSTANT(
					TRACE_MP_CPUS_CALL_NOBUF,
					cpu, 0, 0, 0, 0);
				if (!intrs_inner) {
					/* Sniffing w/o locking */
					if (!queue_empty(&cqp->queue)) {
						mp_cpus_call_action();
					}
					handle_pending_TLB_flushes();
				}
				if (mp_spin_timeout(tsc_spin_start)) {
					panic("mp_cpus_call1() timeout start: 0x%llx, cur: 0x%llx",
					    tsc_spin_start, rdtsc64());
				}
				goto queue_call;
			}
			/* NOSYNC callers don't wait, so no response mask */
			callp->maskp = (mode == NOSYNC) ? NULL : &cpus_responded;
			callp->func = action_func;
			callp->arg0 = arg0;
			callp->arg1 = arg1;
			mp_call_enqueue_locked(cqp, callp);
			cpus_call_count++;
			cpus_called |= cpu_to_cpumask(cpu);
			i386_signal_cpu(cpu, MP_CALL, ASYNC);
			mp_call_head_unlock(cqp, intrs_inner);
			if (mode == SYNC) {
				/* Serialize: wait for this cpu before the next */
				mp_cpus_call_wait(intrs_inner, cpus_called, &cpus_responded);
			}
		}
	}
	if (topo_lock) {
		simple_unlock(&x86_topo_lock);
		ml_set_interrupts_enabled(intrs_enabled);
	}

	/* Call locally if mode not SYNC */
	if (mode != SYNC && call_self) {
		KERNEL_DEBUG_CONSTANT(
			TRACE_MP_CPUS_CALL_LOCAL,
			VM_KERNEL_UNSLIDE(action_func), VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1), 0, 0);
		if (action_func != NULL) {
			ml_set_interrupts_enabled(FALSE);
			action_func(arg0, arg1);
			ml_set_interrupts_enabled(intrs_enabled);
		}
	}

	/* For ASYNC, now wait for all signaled cpus to complete their calls */
	if (mode == ASYNC) {
		mp_cpus_call_wait(intrs_enabled, cpus_called, &cpus_responded);
	}

	/* Safe to allow pre-emption now */
	mp_enable_preemption();

out:
	if (call_self) {
		cpus_called |= cpu_to_cpumask(cpu);
		cpus_call_count++;
	}

	if (cpus_calledp) {
		*cpus_calledp = cpus_called;
	}

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPUS_CALL | DBG_FUNC_END,
		cpus_call_count, cpus_called, 0, 0, 0);

	return (cpu_t) cpus_call_count;
}
1462 
1463 
1464 static void
mp_broadcast_action(__unused void * null)1465 mp_broadcast_action(__unused void *null)
1466 {
1467 	/* call action function */
1468 	if (mp_bc_action_func != NULL) {
1469 		mp_bc_action_func(mp_bc_func_arg);
1470 	}
1471 
1472 	/* if we're the last one through, wake up the instigator */
1473 	if (atomic_decl_and_test(&mp_bc_count, 1)) {
1474 		thread_wakeup(((event_t)(uintptr_t) &mp_bc_count));
1475 	}
1476 }
1477 
/*
 * mp_broadcast() runs a given function on all active cpus.
 * The caller blocks until the functions has run on all cpus.
 * The caller will also block if there is another pending broadcast.
 */
void
mp_broadcast(
	void (*action_func)(void *),
	void *arg)
{
	/* Pre-SMP: just run locally. */
	if (!smp_initialized) {
		if (action_func != NULL) {
			action_func(arg);
		}
		return;
	}

	/* obtain broadcast lock */
	lck_mtx_lock(&mp_bc_lock);

	/* set static function pointers */
	mp_bc_action_func = action_func;
	mp_bc_func_arg = arg;

	/* Must assert the wait before signaling, to avoid a lost wakeup */
	assert_wait((event_t)(uintptr_t)&mp_bc_count, THREAD_UNINT);

	/*
	 * signal other processors, which will call mp_broadcast_action()
	 */
	mp_bc_count = real_ncpus;                       /* assume max possible active */
	mp_bc_ncpus = mp_cpus_call(CPUMASK_ALL, NOSYNC, *mp_broadcast_action, NULL);
	atomic_decl(&mp_bc_count, real_ncpus - mp_bc_ncpus); /* subtract inactive */

	/* block for other cpus to have run action_func */
	if (mp_bc_ncpus > 1) {
		thread_block(THREAD_CONTINUE_NULL);
	} else {
		/* only this cpu ran it: cancel the wait instead of blocking */
		clear_wait(current_thread(), THREAD_AWAKENED);
	}

	/* release lock */
	lck_mtx_unlock(&mp_bc_lock);
}
1521 
1522 void
mp_cpus_kick(cpumask_t cpus)1523 mp_cpus_kick(cpumask_t cpus)
1524 {
1525 	cpu_t           cpu;
1526 	boolean_t       intrs_enabled = FALSE;
1527 
1528 	intrs_enabled = ml_set_interrupts_enabled(FALSE);
1529 	mp_safe_spin_lock(&x86_topo_lock);
1530 
1531 	for (cpu = 0; cpu < (cpu_t) real_ncpus; cpu++) {
1532 		if (((cpu_to_cpumask(cpu) & cpus) == 0)
1533 		    || !cpu_is_running(cpu)) {
1534 			continue;
1535 		}
1536 
1537 		lapic_send_ipi(cpu, LAPIC_VECTOR(KICK));
1538 	}
1539 
1540 	simple_unlock(&x86_topo_lock);
1541 	ml_set_interrupts_enabled(intrs_enabled);
1542 }
1543 
/*
 * Mark the current cpu online.  Under the topo lock we flag the cpu as
 * running, report startup, and flush the TLB before letting the cpu
 * participate in cross-calls.  Must be called with interrupts disabled.
 */
void
i386_activate_cpu(void)
{
	cpu_data_t      *cdp = current_cpu_datap();

	assert(!ml_get_interrupts_enabled());

	if (!smp_initialized) {
		cdp->cpu_running = TRUE;
		return;
	}

	mp_safe_spin_lock(&x86_topo_lock);
	cdp->cpu_running = TRUE;
	started_cpu();
	pmap_tlbi_range(0, ~0ULL, true, 0);
	simple_unlock(&x86_topo_lock);
}
1562 
/*
 * Take the current cpu offline: mark it not-running (under the topo lock
 * so no new cross-calls target it), migrate its timers to the master cpu,
 * then open an interrupt window to drain any already-pending signals and
 * clear any residual timer deadline.  Called with interrupts disabled.
 */
void
i386_deactivate_cpu(void)
{
	cpu_data_t      *cdp = current_cpu_datap();

	assert(!ml_get_interrupts_enabled());

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_START,
		0, 0, 0, 0, 0);

	mp_safe_spin_lock(&x86_topo_lock);
	cdp->cpu_running = FALSE;
	simple_unlock(&x86_topo_lock);

	/*
	 * Move all of this cpu's timers to the master/boot cpu,
	 * and poke it in case there's a sooner deadline for it to schedule.
	 */
	timer_queue_shutdown(&cdp->rtclock_timer.queue);
	mp_cpus_call(cpu_to_cpumask(master_cpu), ASYNC, timer_queue_expire_local, NULL);

#if CONFIG_CPU_COUNTERS
	mt_cpu_down(cdp);
#endif /* CONFIG_CPU_COUNTERS */
#if KPERF
	kptimer_stop_curcpu();
#endif /* KPERF */

	/*
	 * Open an interrupt window
	 * and ensure any pending IPI or timer is serviced
	 */
	mp_disable_preemption();
	ml_set_interrupts_enabled(TRUE);

	while (cdp->cpu_signals && x86_lcpu()->rtcDeadline != EndOfAllTime) {
		cpu_pause();
	}
	/*
	 * Ensure there's no remaining timer deadline set
	 * - AICPM may have left one active.
	 */
	setPop(0);

	ml_set_interrupts_enabled(FALSE);
	mp_enable_preemption();

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_END,
		0, 0, 0, 0, 0);
}
1615 
int     pmsafe_debug    = 1;    /* use power-management safe mode in the debugger */

#if     MACH_KDP
volatile boolean_t      mp_kdp_trap = FALSE;    /* debugger entry in progress */
volatile boolean_t      mp_kdp_is_NMI = FALSE;  /* entry was via NMI */
volatile unsigned long  mp_kdp_ncpus;           /* cpus checked into the debugger */
boolean_t               mp_kdp_state;           /* saved interrupt state on entry */
1623 
1624 
1625 void
mp_kdp_enter(boolean_t proceed_on_failure)1626 mp_kdp_enter(boolean_t proceed_on_failure)
1627 {
1628 	unsigned int    cpu;
1629 	unsigned int    ncpus = 0;
1630 	unsigned int    my_cpu;
1631 	uint64_t        tsc_timeout;
1632 
1633 	DBG("mp_kdp_enter()\n");
1634 
1635 	/*
1636 	 * Here to enter the debugger.
1637 	 * In case of races, only one cpu is allowed to enter kdp after
1638 	 * stopping others.
1639 	 */
1640 	mp_kdp_state = ml_set_interrupts_enabled(FALSE);
1641 	my_cpu = cpu_number();
1642 
1643 	if (my_cpu == (unsigned) debugger_cpu) {
1644 		kprintf("\n\nRECURSIVE DEBUGGER ENTRY DETECTED\n\n");
1645 		kdp_reset();
1646 		return;
1647 	}
1648 
1649 	uint64_t start_time = cpu_datap(my_cpu)->debugger_entry_time = mach_absolute_time();
1650 	int locked = 0;
1651 	while (!locked || mp_kdp_trap) {
1652 		if (locked) {
1653 			simple_unlock(&x86_topo_lock);
1654 		}
1655 		if (proceed_on_failure) {
1656 			if (mach_absolute_time() - start_time > 500000000ll) {
1657 				paniclog_append_noflush("mp_kdp_enter() can't get x86_topo_lock! Debugging anyway! #YOLO\n");
1658 				break;
1659 			}
1660 			locked = simple_lock_try(&x86_topo_lock, LCK_GRP_NULL);
1661 			if (!locked) {
1662 				cpu_pause();
1663 			}
1664 		} else {
1665 			mp_safe_spin_lock(&x86_topo_lock);
1666 			locked = TRUE;
1667 		}
1668 
1669 		if (locked && mp_kdp_trap) {
1670 			simple_unlock(&x86_topo_lock);
1671 			DBG("mp_kdp_enter() race lost\n");
1672 #if MACH_KDP
1673 			mp_kdp_wait(TRUE, FALSE);
1674 #endif
1675 			locked = FALSE;
1676 		}
1677 	}
1678 
1679 	if (pmsafe_debug && !kdp_snapshot) {
1680 		pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
1681 	}
1682 
1683 	debugger_cpu = my_cpu;
1684 	ncpus = 1;
1685 	atomic_incl((volatile long *)&mp_kdp_ncpus, 1);
1686 	mp_kdp_trap = TRUE;
1687 	debugger_entry_time = cpu_datap(my_cpu)->debugger_entry_time;
1688 
1689 	/*
1690 	 * Deliver a nudge to other cpus, counting how many
1691 	 */
1692 	DBG("mp_kdp_enter() signaling other processors\n");
1693 	if (force_immediate_debugger_NMI == FALSE) {
1694 		for (cpu = 0; cpu < real_ncpus; cpu++) {
1695 			if (cpu == my_cpu || !cpu_is_running(cpu)) {
1696 				continue;
1697 			}
1698 			ncpus++;
1699 			i386_signal_cpu(cpu, MP_KDP, ASYNC);
1700 		}
1701 		/*
1702 		 * Wait other processors to synchronize
1703 		 */
1704 		DBG("mp_kdp_enter() waiting for (%d) processors to suspend\n", ncpus);
1705 
1706 		/*
1707 		 * This timeout is rather arbitrary; we don't want to NMI
1708 		 * processors that are executing at potentially
1709 		 * "unsafe-to-interrupt" points such as the trampolines,
1710 		 * but neither do we want to lose state by waiting too long.
1711 		 */
1712 		tsc_timeout = rdtsc64() + (LockTimeOutTSC);
1713 
1714 		while (mp_kdp_ncpus != ncpus && rdtsc64() < tsc_timeout) {
1715 			/*
1716 			 * A TLB shootdown request may be pending--this would
1717 			 * result in the requesting processor waiting in
1718 			 * PMAP_UPDATE_TLBS() until this processor deals with it.
1719 			 * Process it, so it can now enter mp_kdp_wait()
1720 			 */
1721 			handle_pending_TLB_flushes();
1722 			cpu_pause();
1723 		}
1724 		/* If we've timed out, and some processor(s) are still unresponsive,
1725 		 * interrupt them with an NMI via the local APIC, iff a panic is
1726 		 * in progress.
1727 		 */
1728 		if (panic_active()) {
1729 			NMIPI_enable(TRUE);
1730 		}
1731 		if (mp_kdp_ncpus != ncpus) {
1732 			unsigned int wait_cycles = 0;
1733 			if (proceed_on_failure) {
1734 				paniclog_append_noflush("mp_kdp_enter() timed-out on cpu %d, NMI-ing\n", my_cpu);
1735 			} else {
1736 				DBG("mp_kdp_enter() timed-out on cpu %d, NMI-ing\n", my_cpu);
1737 			}
1738 			for (cpu = 0; cpu < real_ncpus; cpu++) {
1739 				if (cpu == my_cpu || !cpu_is_running(cpu)) {
1740 					continue;
1741 				}
1742 				if (cpu_signal_pending(cpu, MP_KDP)) {
1743 					cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
1744 					cpu_NMI_interrupt(cpu);
1745 				}
1746 			}
1747 			/* Wait again for the same timeout */
1748 			tsc_timeout = rdtsc64() + (LockTimeOutTSC);
1749 			while (mp_kdp_ncpus != ncpus && rdtsc64() < tsc_timeout) {
1750 				handle_pending_TLB_flushes();
1751 				cpu_pause();
1752 				++wait_cycles;
1753 			}
1754 			if (mp_kdp_ncpus != ncpus) {
1755 				paniclog_append_noflush("mp_kdp_enter() NMI pending on cpus:");
1756 				for (cpu = 0; cpu < real_ncpus; cpu++) {
1757 					if (cpu_is_running(cpu) && !cpu_datap(cpu)->cpu_NMI_acknowledged) {
1758 						paniclog_append_noflush(" %d", cpu);
1759 					}
1760 				}
1761 				paniclog_append_noflush("\n");
1762 				if (proceed_on_failure) {
1763 					paniclog_append_noflush("mp_kdp_enter() timed-out during %s wait after NMI;"
1764 					    "expected %u acks but received %lu after %u loops in %llu ticks\n",
1765 					    (locked ? "locked" : "unlocked"), ncpus, mp_kdp_ncpus, wait_cycles, LockTimeOutTSC);
1766 				} else {
1767 					panic("mp_kdp_enter() timed-out during %s wait after NMI;"
1768 					    "expected %u acks but received %lu after %u loops in %llu ticks",
1769 					    (locked ? "locked" : "unlocked"), ncpus, mp_kdp_ncpus, wait_cycles, LockTimeOutTSC);
1770 				}
1771 			}
1772 		}
1773 	} else if (NMI_panic_reason != PTE_CORRUPTION) {  /* In the pte corruption case, the detecting CPU has already NMIed other CPUs */
1774 		for (cpu = 0; cpu < real_ncpus; cpu++) {
1775 			if (cpu == my_cpu || !cpu_is_running(cpu)) {
1776 				continue;
1777 			}
1778 			cpu_NMI_interrupt(cpu);
1779 		}
1780 	}
1781 
1782 	if (locked) {
1783 		simple_unlock(&x86_topo_lock);
1784 	}
1785 
1786 	DBG("mp_kdp_enter() %d processors done %s\n",
1787 	    (int)mp_kdp_ncpus, (mp_kdp_ncpus == ncpus) ? "OK" : "timed out");
1788 
1789 	postcode(MP_KDP_ENTER);
1790 }
1791 
1792 boolean_t
mp_kdp_all_cpus_halted()1793 mp_kdp_all_cpus_halted()
1794 {
1795 	unsigned int ncpus = 0, cpu = 0, my_cpu = 0;
1796 
1797 	my_cpu = cpu_number();
1798 	ncpus = 1; /* current CPU */
1799 	for (cpu = 0; cpu < real_ncpus; cpu++) {
1800 		if (cpu == my_cpu || !cpu_is_running(cpu)) {
1801 			continue;
1802 		}
1803 		ncpus++;
1804 	}
1805 
1806 	return mp_kdp_ncpus == ncpus;
1807 }
1808 
1809 static boolean_t
cpu_signal_pending(int cpu,mp_event_t event)1810 cpu_signal_pending(int cpu, mp_event_t event)
1811 {
1812 	volatile int    *signals = &cpu_datap(cpu)->cpu_signals;
1813 	boolean_t retval = FALSE;
1814 
1815 	if (i_bit(event, signals)) {
1816 		retval = TRUE;
1817 	}
1818 	return retval;
1819 }
1820 
/*
 * Debugger cross-cpu call: ask logical cpu 'lcpu' (which is spinning in
 * mp_kdp_wait() and polling kdp_x86_xcpu_poll()) to run func(arg0, arg1).
 * Spins until the target clears the request or 'timeout' absolute-time
 * units elapse (timeout == 0 means wait forever).  Returns the function's
 * result, or -1 on bad arguments (or if the call never completed).
 */
long
kdp_x86_xcpu_invoke(const uint16_t lcpu, kdp_x86_xcpu_func_t func,
    void *arg0, void *arg1, uint64_t timeout)
{
	uint64_t now;

	if (lcpu > (real_ncpus - 1)) {
		return -1;
	}

	if (func == NULL) {
		return -1;
	}

	/* Fill in the request; .cpu is written last as it publishes the call */
	kdp_xcpu_call_func.func = func;
	kdp_xcpu_call_func.ret  = -1;
	kdp_xcpu_call_func.arg0 = arg0;
	kdp_xcpu_call_func.arg1 = arg1;
	kdp_xcpu_call_func.cpu  = lcpu;
	DBG("Invoking function %p on CPU %d\n", func, (int32_t)lcpu);
	now = mach_absolute_time();
	while (kdp_xcpu_call_func.cpu != KDP_XCPU_NONE &&
	    (timeout == 0 || (mach_absolute_time() - now) < timeout)) {
		cpu_pause();
	}
	return kdp_xcpu_call_func.ret;
}
1848 
/*
 * Polled from mp_kdp_wait(): if a debugger cross-cpu call targets this
 * cpu, run it and store the result before clearing .cpu, which signals
 * completion to kdp_x86_xcpu_invoke().
 */
static void
kdp_x86_xcpu_poll(void)
{
	if ((uint16_t)cpu_number() == kdp_xcpu_call_func.cpu) {
		kdp_xcpu_call_func.ret =
		    kdp_xcpu_call_func.func(kdp_xcpu_call_func.arg0,
		    kdp_xcpu_call_func.arg1,
		    cpu_number());
		kdp_xcpu_call_func.cpu = KDP_XCPU_NONE;
	}
}
1860 
/*
 * Park this cpu while the debugger is active on another cpu.
 * Checks in by bumping mp_kdp_ncpus, spins servicing TLB flushes (if
 * 'flush') and debugger cross-calls until mp_kdp_trap clears, then
 * checks out again.  An NMI entry (isNMI) spins indefinitely.
 */
static void
mp_kdp_wait(boolean_t flush, boolean_t isNMI)
{
	DBG("mp_kdp_wait()\n");

	current_cpu_datap()->debugger_ipi_time = mach_absolute_time();
#if CONFIG_MCA
	/* If we've trapped due to a machine-check, save MCA registers */
	mca_check_save();
#endif

	atomic_incl((volatile long *)&mp_kdp_ncpus, 1);
	while (mp_kdp_trap || (isNMI == TRUE)) {
		/*
		 * A TLB shootdown request may be pending--this would result
		 * in the requesting processor waiting in PMAP_UPDATE_TLBS()
		 * until this processor handles it.
		 * Process it, so it can now enter mp_kdp_wait()
		 */
		if (flush) {
			handle_pending_TLB_flushes();
		}

		kdp_x86_xcpu_poll();
		cpu_pause();
	}

	atomic_decl((volatile long *)&mp_kdp_ncpus, 1);
	DBG("mp_kdp_wait() done\n");
}
1891 
/*
 * Leave the debugger: relinquish debugger-cpu status, clear mp_kdp_trap
 * to release the cpus spinning in mp_kdp_wait(), wait for them all to
 * resume, and restore this cpu's pre-entry interrupt state.
 */
void
mp_kdp_exit(void)
{
	DBG("mp_kdp_exit()\n");
	debugger_cpu = -1;
	atomic_decl((volatile long *)&mp_kdp_ncpus, 1);

	debugger_exit_time = mach_absolute_time();

	mp_kdp_trap = FALSE;
	mfence();	/* make the release visible before we start waiting */

	/* Wait other processors to stop spinning. XXX needs timeout */
	DBG("mp_kdp_exit() waiting for processors to resume\n");
	while (mp_kdp_ncpus > 0) {
		/*
		 * a TLB shootdown request may be pending... this would result in the requesting
		 * processor waiting in PMAP_UPDATE_TLBS() until this processor deals with it.
		 * Process it, so it can now enter mp_kdp_wait()
		 */
		handle_pending_TLB_flushes();

		cpu_pause();
	}

	if (pmsafe_debug && !kdp_snapshot) {
		pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
	}

	debugger_exit_time = mach_absolute_time();

	DBG("mp_kdp_exit() done\n");
	(void) ml_set_interrupts_enabled(mp_kdp_state);
	postcode(MP_KDP_EXIT);
}
1927 
1928 #endif  /* MACH_KDP */
1929 
1930 boolean_t
mp_recent_debugger_activity(void)1931 mp_recent_debugger_activity(void)
1932 {
1933 	uint64_t abstime = mach_absolute_time();
1934 	return ((abstime - debugger_entry_time) < LastDebuggerEntryAllowance) ||
1935 	       ((abstime - debugger_exit_time) < LastDebuggerEntryAllowance);
1936 }
1937 
/*ARGSUSED*/
void
init_ast_check(
	__unused processor_t    processor)
{
	/* No per-processor AST-check initialization is required on x86. */
}
1944 
1945 void
cause_ast_check(processor_t processor)1946 cause_ast_check(
1947 	processor_t     processor)
1948 {
1949 	int     cpu = processor->cpu_id;
1950 
1951 	if (cpu != cpu_number()) {
1952 		i386_signal_cpu(cpu, MP_AST, ASYNC);
1953 		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), cpu, 1, 0, 0, 0);
1954 	}
1955 }
1956 
1957 void
slave_machine_init(void * param)1958 slave_machine_init(void *param)
1959 {
1960 	/*
1961 	 * Here in process context, but with interrupts disabled.
1962 	 */
1963 	DBG("slave_machine_init() CPU%d\n", get_cpu_number());
1964 
1965 	if (param == FULL_SLAVE_INIT) {
1966 		/*
1967 		 * Cold start
1968 		 */
1969 		clock_init();
1970 	}
1971 	cpu_machine_init();     /* Interrupts enabled hereafter */
1972 }
1973 
/* Out-of-line cpu_number(); the macro form is #undef'd here on purpose. */
#undef cpu_number
int
cpu_number(void)
{
	return get_cpu_number();
}
1980 
/* Per-cpu data base address for the current cpu. */
vm_offset_t
current_percpu_base(void)
{
	return get_current_percpu_base();
}
1986 
/* Per-cpu data base address for an arbitrary cpu. */
vm_offset_t
other_percpu_base(int cpu)
{
	return cpu_datap(cpu)->cpu_pcpu_base;
}
1992 
1993 static void
cpu_prewarm_init()1994 cpu_prewarm_init()
1995 {
1996 	int i;
1997 
1998 	simple_lock_init(&cpu_warm_lock, 0);
1999 	queue_init(&cpu_warm_call_list);
2000 	for (i = 0; i < NUM_CPU_WARM_CALLS; i++) {
2001 		enqueue_head(&cpu_warm_call_list, (queue_entry_t)&cpu_warm_call_arr[i]);
2002 	}
2003 }
2004 
2005 static timer_call_t
grab_warm_timer_call()2006 grab_warm_timer_call()
2007 {
2008 	spl_t x;
2009 	timer_call_t call = NULL;
2010 
2011 	x = splsched();
2012 	simple_lock(&cpu_warm_lock, LCK_GRP_NULL);
2013 	if (!queue_empty(&cpu_warm_call_list)) {
2014 		call = (timer_call_t) dequeue_head(&cpu_warm_call_list);
2015 	}
2016 	simple_unlock(&cpu_warm_lock);
2017 	splx(x);
2018 
2019 	return call;
2020 }
2021 
2022 static void
free_warm_timer_call(timer_call_t call)2023 free_warm_timer_call(timer_call_t call)
2024 {
2025 	spl_t x;
2026 
2027 	x = splsched();
2028 	simple_lock(&cpu_warm_lock, LCK_GRP_NULL);
2029 	enqueue_head(&cpu_warm_call_list, (queue_entry_t)call);
2030 	simple_unlock(&cpu_warm_lock);
2031 	splx(x);
2032 }
2033 
2034 /*
2035  * Runs in timer call context (interrupts disabled).
2036  */
2037 static void
cpu_warm_timer_call_func(timer_call_param_t p0,__unused timer_call_param_t p1)2038 cpu_warm_timer_call_func(
2039 	timer_call_param_t p0,
2040 	__unused timer_call_param_t p1)
2041 {
2042 	free_warm_timer_call((timer_call_t)p0);
2043 	return;
2044 }
2045 
2046 /*
2047  * Runs with interrupts disabled on the CPU we wish to warm (i.e. CPU 0).
2048  */
2049 static void
_cpu_warm_setup(void * arg)2050 _cpu_warm_setup(
2051 	void *arg)
2052 {
2053 	cpu_warm_data_t cwdp = (cpu_warm_data_t)arg;
2054 
2055 	timer_call_enter(cwdp->cwd_call, cwdp->cwd_deadline, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL);
2056 	cwdp->cwd_result = 0;
2057 
2058 	return;
2059 }
2060 
2061 /*
2062  * Not safe to call with interrupts disabled.
2063  */
2064 kern_return_t
ml_interrupt_prewarm(uint64_t deadline)2065 ml_interrupt_prewarm(
2066 	uint64_t        deadline)
2067 {
2068 	struct cpu_warm_data cwd;
2069 	timer_call_t call;
2070 	cpu_t ct;
2071 
2072 	if (ml_get_interrupts_enabled() == FALSE) {
2073 		panic("%s: Interrupts disabled?", __FUNCTION__);
2074 	}
2075 
2076 	/*
2077 	 * If the platform doesn't need our help, say that we succeeded.
2078 	 */
2079 	if (!ml_get_interrupt_prewake_applicable()) {
2080 		return KERN_SUCCESS;
2081 	}
2082 
2083 	/*
2084 	 * Grab a timer call to use.
2085 	 */
2086 	call = grab_warm_timer_call();
2087 	if (call == NULL) {
2088 		return KERN_RESOURCE_SHORTAGE;
2089 	}
2090 
2091 	timer_call_setup(call, cpu_warm_timer_call_func, call);
2092 	cwd.cwd_call = call;
2093 	cwd.cwd_deadline = deadline;
2094 	cwd.cwd_result = 0;
2095 
2096 	/*
2097 	 * For now, non-local interrupts happen on the master processor.
2098 	 */
2099 	ct = mp_cpus_call(cpu_to_cpumask(master_cpu), SYNC, _cpu_warm_setup, &cwd);
2100 	if (ct == 0) {
2101 		free_warm_timer_call(call);
2102 		return KERN_FAILURE;
2103 	} else {
2104 		return cwd.cwd_result;
2105 	}
2106 }
2107 
2108 #if DEBUG || DEVELOPMENT
2109 void
kernel_spin(uint64_t spin_ns)2110 kernel_spin(uint64_t spin_ns)
2111 {
2112 	boolean_t       istate;
2113 	uint64_t        spin_abs;
2114 	uint64_t        deadline;
2115 	cpu_data_t      *cdp;
2116 
2117 	kprintf("kernel_spin(%llu) spinning uninterruptibly\n", spin_ns);
2118 	istate = ml_set_interrupts_enabled(FALSE);
2119 	cdp = current_cpu_datap();
2120 	nanoseconds_to_absolutetime(spin_ns, &spin_abs);
2121 
2122 	/* Fake interrupt handler entry for testing mp_interrupt_watchdog() */
2123 	cdp->cpu_int_event_time = mach_absolute_time();
2124 	cdp->cpu_int_state = (void *) USER_STATE(current_thread());
2125 
2126 	deadline = mach_absolute_time() + spin_ns;
2127 	while (mach_absolute_time() < deadline) {
2128 		cpu_pause();
2129 	}
2130 
2131 	cdp->cpu_int_event_time = 0;
2132 	cdp->cpu_int_state = NULL;
2133 
2134 	ml_set_interrupts_enabled(istate);
2135 	kprintf("kernel_spin() continuing\n");
2136 }
2137 
2138 /*
2139  * Called from the scheduler's maintenance thread,
2140  * scan running processors for long-running ISRs and:
2141  *  - panic if longer than LockTimeOut, or
2142  *  - log if more than a quantum.
2143  */
2144 void
mp_interrupt_watchdog(void)2145 mp_interrupt_watchdog(void)
2146 {
2147 	cpu_t                   cpu;
2148 	boolean_t               intrs_enabled = FALSE;
2149 	uint16_t                cpu_int_num;
2150 	uint64_t                cpu_int_event_time;
2151 	uint64_t                cpu_rip;
2152 	uint64_t                cpu_int_duration;
2153 	uint64_t                now;
2154 	x86_saved_state_t       *cpu_int_state;
2155 
2156 	if (__improbable(!mp_interrupt_watchdog_enabled)) {
2157 		return;
2158 	}
2159 
2160 	intrs_enabled = ml_set_interrupts_enabled(FALSE);
2161 	now = mach_absolute_time();
2162 	/*
2163 	 * While timeouts are not suspended,
2164 	 * check all other processors for long outstanding interrupt handling.
2165 	 */
2166 	for (cpu = 0;
2167 	    cpu < (cpu_t) real_ncpus && !machine_timeout_suspended();
2168 	    cpu++) {
2169 		if ((cpu == (cpu_t) cpu_number()) ||
2170 		    (!cpu_is_running(cpu))) {
2171 			continue;
2172 		}
2173 		cpu_int_event_time = cpu_datap(cpu)->cpu_int_event_time;
2174 		if (cpu_int_event_time == 0) {
2175 			continue;
2176 		}
2177 		if (__improbable(now < cpu_int_event_time)) {
2178 			continue;       /* skip due to inter-processor skew */
2179 		}
2180 		cpu_int_state = cpu_datap(cpu)->cpu_int_state;
2181 		if (__improbable(cpu_int_state == NULL)) {
2182 			/* The interrupt may have been dismissed */
2183 			continue;
2184 		}
2185 
2186 		/* Here with a cpu handling an interrupt */
2187 
2188 		cpu_int_duration = now - cpu_int_event_time;
2189 		if (__improbable(cpu_int_duration > LockTimeOut)) {
2190 			cpu_int_num = saved_state64(cpu_int_state)->isf.trapno;
2191 			cpu_rip = saved_state64(cpu_int_state)->isf.rip;
2192 			vector_timed_out = cpu_int_num;
2193 			NMIPI_panic(cpu_to_cpumask(cpu), INTERRUPT_WATCHDOG);
2194 			panic("Interrupt watchdog, "
2195 			    "cpu: %d interrupt: 0x%x time: %llu..%llu state: %p RIP: 0x%llx",
2196 			    cpu, cpu_int_num, cpu_int_event_time, now, cpu_int_state, cpu_rip);
2197 			/* NOT REACHED */
2198 		} else if (__improbable(cpu_int_duration > (uint64_t) std_quantum)) {
2199 			mp_interrupt_watchdog_events++;
2200 			cpu_int_num = saved_state64(cpu_int_state)->isf.trapno;
2201 			cpu_rip = saved_state64(cpu_int_state)->isf.rip;
2202 			ml_set_interrupts_enabled(intrs_enabled);
2203 			printf("Interrupt watchdog, "
2204 			    "cpu: %d interrupt: 0x%x time: %llu..%llu RIP: 0x%llx\n",
2205 			    cpu, cpu_int_num, cpu_int_event_time, now, cpu_rip);
2206 			return;
2207 		}
2208 	}
2209 
2210 	ml_set_interrupts_enabled(intrs_enabled);
2211 }
2212 #endif
2213