xref: /xnu-8020.121.3/osfmk/i386/mp.c (revision fdd8201d7b966f0c3ea610489d29bd841d358941)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 
32 #include <mach_kdp.h>
33 #include <kdp/kdp_internal.h>
34 #include <mach_ldebug.h>
35 
36 #include <mach/mach_types.h>
37 #include <mach/kern_return.h>
38 
39 #include <kern/kern_types.h>
40 #include <kern/startup.h>
41 #include <kern/timer_queue.h>
42 #include <kern/processor.h>
43 #include <kern/cpu_number.h>
44 #include <kern/cpu_data.h>
45 #include <kern/assert.h>
46 #include <kern/lock_group.h>
47 #include <kern/machine.h>
48 #include <kern/pms.h>
49 #include <kern/misc_protos.h>
50 #include <kern/timer_call.h>
51 #include <kern/zalloc.h>
52 #include <kern/queue.h>
53 #include <prng/random.h>
54 
55 #include <vm/vm_map.h>
56 #include <vm/vm_kern.h>
57 
58 #include <i386/bit_routines.h>
59 #include <i386/proc_reg.h>
60 #include <i386/cpu_threads.h>
61 #include <i386/mp_desc.h>
62 #include <i386/misc_protos.h>
63 #include <i386/trap.h>
64 #include <i386/postcode.h>
65 #include <i386/machine_routines.h>
66 #include <i386/mp.h>
67 #include <i386/mp_events.h>
68 #include <i386/lapic.h>
69 #include <i386/cpuid.h>
70 #include <i386/fpu.h>
71 #include <i386/machine_cpu.h>
72 #include <i386/pmCPU.h>
73 #if CONFIG_MCA
74 #include <i386/machine_check.h>
75 #endif
76 #include <i386/acpi.h>
77 
78 #include <sys/kdebug.h>
79 
80 #include <console/serial_protos.h>
81 
82 #if MONOTONIC
83 #include <kern/monotonic.h>
84 #endif /* MONOTONIC */
85 
86 #if KPERF
87 #include <kperf/kptimer.h>
88 #endif /* KPERF */
89 
90 #if     MP_DEBUG
91 #define PAUSE           delay(1000000)
92 #define DBG(x...)       kprintf(x)
93 #else
94 #define DBG(x...)
95 #define PAUSE
96 #endif  /* MP_DEBUG */
97 
98 /* Debugging/test trace events: */
99 #define TRACE_MP_TLB_FLUSH              MACHDBG_CODE(DBG_MACH_MP, 0)
100 #define TRACE_MP_CPUS_CALL              MACHDBG_CODE(DBG_MACH_MP, 1)
101 #define TRACE_MP_CPUS_CALL_LOCAL        MACHDBG_CODE(DBG_MACH_MP, 2)
102 #define TRACE_MP_CPUS_CALL_ACTION       MACHDBG_CODE(DBG_MACH_MP, 3)
103 #define TRACE_MP_CPUS_CALL_NOBUF        MACHDBG_CODE(DBG_MACH_MP, 4)
104 #define TRACE_MP_CPU_FAST_START         MACHDBG_CODE(DBG_MACH_MP, 5)
105 #define TRACE_MP_CPU_START              MACHDBG_CODE(DBG_MACH_MP, 6)
106 #define TRACE_MP_CPU_DEACTIVATE         MACHDBG_CODE(DBG_MACH_MP, 7)
107 
108 #define ABS(v)          (((v) > 0)?(v):-(v))
109 
110 void            slave_boot_init(void);
111 void            i386_cpu_IPI(int cpu);
112 
113 #if MACH_KDP
114 static void     mp_kdp_wait(boolean_t flush, boolean_t isNMI);
115 #endif /* MACH_KDP */
116 
117 #if MACH_KDP
118 static boolean_t        cpu_signal_pending(int cpu, mp_event_t event);
119 #endif /* MACH_KDP */
120 static int              NMIInterruptHandler(x86_saved_state_t *regs);
121 
122 boolean_t               smp_initialized = FALSE;
123 uint32_t                TSC_sync_margin = 0xFFF;
124 volatile boolean_t      force_immediate_debugger_NMI = FALSE;
125 volatile boolean_t      pmap_tlb_flush_timeout = FALSE;
126 #if DEBUG || DEVELOPMENT
127 boolean_t               mp_interrupt_watchdog_enabled = TRUE;
128 uint32_t                mp_interrupt_watchdog_events = 0;
129 #endif
130 
131 SIMPLE_LOCK_DECLARE(debugger_callback_lock, 0);
132 struct debugger_callback *debugger_callback = NULL;
133 
134 static LCK_GRP_DECLARE(smp_lck_grp, "i386_smp");
135 static LCK_MTX_EARLY_DECLARE(mp_cpu_boot_lock, &smp_lck_grp);
136 
137 /* Variables needed for MP rendezvous. */
138 SIMPLE_LOCK_DECLARE(mp_rv_lock, 0);
139 static void     (*mp_rv_setup_func)(void *arg);
140 static void     (*mp_rv_action_func)(void *arg);
141 static void     (*mp_rv_teardown_func)(void *arg);
142 static void     *mp_rv_func_arg;
143 static volatile int     mp_rv_ncpus;
144 /* Cache-aligned barriers: */
145 static volatile long    mp_rv_entry    __attribute__((aligned(64)));
146 static volatile long    mp_rv_exit     __attribute__((aligned(64)));
147 static volatile long    mp_rv_complete __attribute__((aligned(64)));
148 
149 volatile        uint64_t        debugger_entry_time;
150 volatile        uint64_t        debugger_exit_time;
151 #if MACH_KDP
152 #include <kdp/kdp.h>
153 extern int kdp_snapshot;
154 static struct _kdp_xcpu_call_func {
155 	kdp_x86_xcpu_func_t func;
156 	void     *arg0, *arg1;
157 	volatile long     ret;
158 	volatile uint16_t cpu;
159 } kdp_xcpu_call_func = {
160 	.cpu  = KDP_XCPU_NONE
161 };
162 
163 #endif
164 
165 /* Variables needed for MP broadcast. */
166 static void        (*mp_bc_action_func)(void *arg);
167 static void        *mp_bc_func_arg;
168 static int      mp_bc_ncpus;
169 static volatile long   mp_bc_count;
170 static LCK_MTX_EARLY_DECLARE(mp_bc_lock, &smp_lck_grp);
171 static  volatile int    debugger_cpu = -1;
172 volatile long    NMIPI_acks = 0;
173 volatile long    NMI_count = 0;
174 static int              vector_timed_out;
175 
176 NMI_reason_t    NMI_panic_reason = NONE;
177 extern void     NMI_cpus(void);
178 
179 static void     mp_cpus_call_init(void);
180 static void     mp_cpus_call_action(void);
181 static void     mp_call_PM(void);
182 
183 char            mp_slave_stack[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); // Temp stack for slave init
184 
185 /* PAL-related routines */
186 boolean_t i386_smp_init(int nmi_vector, i386_intr_func_t nmi_handler,
187     int ipi_vector, i386_intr_func_t ipi_handler);
188 void i386_start_cpu(int lapic_id, int cpu_num);
189 void i386_send_NMI(int cpu);
190 void NMIPI_enable(boolean_t);
191 
192 #define NUM_CPU_WARM_CALLS      20
193 struct timer_call       cpu_warm_call_arr[NUM_CPU_WARM_CALLS];
194 queue_head_t            cpu_warm_call_list;
195 decl_simple_lock_data(static, cpu_warm_lock);
196 
197 typedef struct cpu_warm_data {
198 	timer_call_t    cwd_call;
199 	uint64_t        cwd_deadline;
200 	int             cwd_result;
201 } *cpu_warm_data_t;
202 
203 static void             cpu_prewarm_init(void);
204 static void             cpu_warm_timer_call_func(timer_call_param_t p0, timer_call_param_t p1);
205 static void             _cpu_warm_setup(void *arg);
206 static timer_call_t     grab_warm_timer_call(void);
207 static void             free_warm_timer_call(timer_call_t call);
208 
/*
 * Bring up SMP support on the boot processor: install the NMI and
 * inter-processor interrupt handlers via the PAL layer, initialize the
 * cross-call machinery, and read the optional boot-args controlling the
 * interrupt watchdog and TSC synchronization checking.
 * On success, sets smp_initialized TRUE; on PAL failure, returns with
 * SMP left uninitialized.
 */
void
smp_init(void)
{
	console_init();

	/* If the PAL layer can't install our vectors, leave SMP disabled. */
	if (!i386_smp_init(LAPIC_NMI_INTERRUPT, NMIInterruptHandler,
	    LAPIC_VECTOR(INTERPROCESSOR), cpu_signal_handler)) {
		return;
	}

	cpu_thread_init();

	DBGLOG_CPU_INIT(master_cpu);

	mp_cpus_call_init();
	mp_cpus_call_cpu_init(master_cpu);

#if DEBUG || DEVELOPMENT
	/* Boot-arg override for the interrupt watchdog (default enabled). */
	if (PE_parse_boot_argn("interrupt_watchdog",
	    &mp_interrupt_watchdog_enabled,
	    sizeof(mp_interrupt_watchdog_enabled))) {
		kprintf("Interrupt watchdog %sabled\n",
		    mp_interrupt_watchdog_enabled ? "en" : "dis");
	}
#endif

	/*
	 * TSC_sync_margin may be overridden by boot-arg (0 disables the
	 * check); under a VMM the check is disabled by default.
	 */
	if (PE_parse_boot_argn("TSC_sync_margin",
	    &TSC_sync_margin, sizeof(TSC_sync_margin))) {
		kprintf("TSC sync Margin 0x%x\n", TSC_sync_margin);
	} else if (cpuid_vmm_present()) {
		kprintf("TSC sync margin disabled\n");
		TSC_sync_margin = 0;
	}
	smp_initialized = TRUE;

	cpu_prewarm_init();

	return;
}
248 
/*
 * Startup parameters shared between the starter cpu and the cpu being
 * started; written by intel_startCPU() before the start_cpu() rendezvous
 * and read by start_cpu()/started_cpu().
 */
typedef struct {
	int                     target_cpu;     /* logical cpu being started */
	int                     target_lapic;   /* its local APIC id */
	int                     starter_cpu;    /* cpu driving the startup */
} processor_start_info_t;
static processor_start_info_t   start_info        __attribute__((aligned(64)));

/*
 * Cache-alignment is to avoid cross-cpu false-sharing interference.
 */
/* Two-party countdown barriers for the TSC synchronization check. */
static volatile long            tsc_entry_barrier __attribute__((aligned(64)));
static volatile long            tsc_exit_barrier  __attribute__((aligned(64)));
/* TSC snapped by the started cpu, read by the starter after the exit barrier. */
static volatile uint64_t        tsc_target        __attribute__((aligned(64)));
262 
263 /*
264  * Poll a CPU to see when it has marked itself as running.
265  */
266 static void
mp_wait_for_cpu_up(int slot_num,unsigned int iters,unsigned int usecdelay)267 mp_wait_for_cpu_up(int slot_num, unsigned int iters, unsigned int usecdelay)
268 {
269 	while (iters-- > 0) {
270 		if (cpu_datap(slot_num)->cpu_running) {
271 			break;
272 		}
273 		delay(usecdelay);
274 	}
275 }
276 
277 /*
278  * Quickly bring a CPU back online which has been halted.
279  */
280 kern_return_t
intel_startCPU_fast(int slot_num)281 intel_startCPU_fast(int slot_num)
282 {
283 	kern_return_t   rc;
284 
285 	/*
286 	 * Try to perform a fast restart
287 	 */
288 	rc = pmCPUExitHalt(slot_num);
289 	if (rc != KERN_SUCCESS) {
290 		/*
291 		 * The CPU was not eligible for a fast restart.
292 		 */
293 		return rc;
294 	}
295 
296 	KERNEL_DEBUG_CONSTANT(
297 		TRACE_MP_CPU_FAST_START | DBG_FUNC_START,
298 		slot_num, 0, 0, 0, 0);
299 
300 	/*
301 	 * Wait until the CPU is back online.
302 	 */
303 	mp_disable_preemption();
304 
305 	/*
306 	 * We use short pauses (1us) for low latency.  30,000 iterations is
307 	 * longer than a full restart would require so it should be more
308 	 * than long enough.
309 	 */
310 
311 	mp_wait_for_cpu_up(slot_num, 30000, 1);
312 	mp_enable_preemption();
313 
314 	KERNEL_DEBUG_CONSTANT(
315 		TRACE_MP_CPU_FAST_START | DBG_FUNC_END,
316 		slot_num, cpu_datap(slot_num)->cpu_running, 0, 0, 0);
317 
318 	/*
319 	 * Check to make sure that the CPU is really running.  If not,
320 	 * go through the slow path.
321 	 */
322 	if (cpu_datap(slot_num)->cpu_running) {
323 		return KERN_SUCCESS;
324 	} else {
325 		return KERN_FAILURE;
326 	}
327 }
328 
/*
 * Run on a newly-started cpu (cpu_running already TRUE) to participate
 * in the optional TSC synchronization check with the starter cpu.  The
 * starter side of this two-party barrier protocol is in start_cpu().
 */
static void
started_cpu(void)
{
	/* Here on the started cpu with cpu_running set TRUE */

	if (TSC_sync_margin &&
	    start_info.target_cpu == cpu_number()) {
		/*
		 * I've just started-up, synchronize again with the starter cpu
		 * and then snap my TSC.
		 */
		tsc_target   = 0;
		atomic_decl(&tsc_entry_barrier, 1);
		while (tsc_entry_barrier != 0) {
			;       /* spin for starter and target at barrier */
		}
		/* Snap the TSC immediately after the barrier release. */
		tsc_target = rdtsc64();
		/* Exit barrier tells the starter that tsc_target is valid. */
		atomic_decl(&tsc_exit_barrier, 1);
	}
}
349 
/*
 * Rendezvous action used by intel_startCPU(): executed on every cpu, but
 * only the designated starter does any work.  It kicks the target cpu
 * via its lapic, waits for it to mark itself running, and then (if
 * TSC_sync_margin is non-zero) runs the starter side of the TSC
 * synchronization check against started_cpu().
 */
static void
start_cpu(void *arg)
{
	int                     i = 1000;
	processor_start_info_t  *psip = (processor_start_info_t *) arg;

	/* Ignore this if the current processor is not the starter */
	if (cpu_number() != psip->starter_cpu) {
		return;
	}

	DBG("start_cpu(%p) about to start cpu %d, lapic %d\n",
	    arg, psip->target_cpu, psip->target_lapic);

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_START | DBG_FUNC_START,
		psip->target_cpu,
		psip->target_lapic, 0, 0, 0);

	i386_start_cpu(psip->target_lapic, psip->target_cpu);

#ifdef  POSTCODE_DELAY
	/* Wait much longer if postcodes are displayed for a delay period. */
	i *= 10000;
#endif
	DBG("start_cpu(%p) about to wait for cpu %d\n",
	    arg, psip->target_cpu);

	/* Poll for the target to come up: i*100 probes of 100us each. */
	mp_wait_for_cpu_up(psip->target_cpu, i * 100, 100);

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_START | DBG_FUNC_END,
		psip->target_cpu,
		cpu_datap(psip->target_cpu)->cpu_running, 0, 0, 0);

	if (TSC_sync_margin &&
	    cpu_datap(psip->target_cpu)->cpu_running) {
		/*
		 * Compare the TSC from the started processor with ours.
		 * Report and log/panic if it diverges by more than
		 * TSC_sync_margin (TSC_SYNC_MARGIN) ticks. This margin
		 * can be overriden by boot-arg (with 0 meaning no checking).
		 */
		uint64_t        tsc_starter;
		int64_t         tsc_delta;
		/* Meet the target at the entry barrier, then snap our TSC. */
		atomic_decl(&tsc_entry_barrier, 1);
		while (tsc_entry_barrier != 0) {
			;       /* spin for both processors at barrier */
		}
		tsc_starter = rdtsc64();
		atomic_decl(&tsc_exit_barrier, 1);
		while (tsc_exit_barrier != 0) {
			;       /* spin for target to store its TSC */
		}
		tsc_delta = tsc_target - tsc_starter;
		kprintf("TSC sync for cpu %d: 0x%016llx delta 0x%llx (%lld)\n",
		    psip->target_cpu, tsc_target, tsc_delta, tsc_delta);
#if DEBUG || DEVELOPMENT
		/*
		 * Stash the delta for inspection later, since we can no
		 * longer print/log it with interrupts disabled.
		 */
		cpu_datap(psip->target_cpu)->tsc_sync_delta = tsc_delta;
#endif
		/* Divergence beyond the margin: panic on DEBUG, log otherwise. */
		if (ABS(tsc_delta) > (int64_t) TSC_sync_margin) {
#if DEBUG
			panic(
#else
			kprintf(
#endif
				"Unsynchronized  TSC for cpu %d: "
				"0x%016llx, delta 0x%llx\n",
				psip->target_cpu, tsc_target, tsc_delta);
		}
	}
}
426 
/*
 * Start (or restart) the processor in the given slot via the full
 * startup path: (re)initialize its descriptor tables, then drive the
 * lapic startup sequence from start_cpu() with all running processors
 * rendezvous'ed, and wait for the target to mark itself running.
 */
kern_return_t
intel_startCPU(
	int     slot_num)
{
	int             lapic = cpu_to_lapic[slot_num];
	boolean_t       istate;

	assert(lapic != -1);

	DBGLOG_CPU_INIT(slot_num);

	DBG("intel_startCPU(%d) lapic_id=%d\n", slot_num, lapic);
	DBG("IdlePTD(%p): 0x%x\n", &IdlePTD, (int) (uintptr_t)IdlePTD);

	/*
	 * Initialize (or re-initialize) the descriptor tables for this cpu.
	 * Propagate processor mode to slave.
	 */
	cpu_desc_init(cpu_datap(slot_num));

	/* Serialize use of the slave boot stack, etc. */
	lck_mtx_lock(&mp_cpu_boot_lock);

	istate = ml_set_interrupts_enabled(FALSE);
	if (slot_num == get_cpu_number()) {
		/* Asked to start the cpu we're running on: nothing to do. */
		ml_set_interrupts_enabled(istate);
		lck_mtx_unlock(&mp_cpu_boot_lock);
		return KERN_SUCCESS;
	}

	/* Publish startup parameters for start_cpu()/started_cpu(). */
	start_info.starter_cpu  = cpu_number();
	start_info.target_cpu   = slot_num;
	start_info.target_lapic = lapic;
	/* Both starter and target must arrive at each TSC barrier. */
	tsc_entry_barrier = 2;
	tsc_exit_barrier = 2;

	/*
	 * Perform the processor startup sequence with all running
	 * processors rendezvous'ed. This is required during periods when
	 * the cache-disable bit is set for MTRR/PAT initialization.
	 */
	mp_rendezvous_no_intrs(start_cpu, (void *) &start_info);

	start_info.target_cpu = 0;

	ml_set_interrupts_enabled(istate);
	lck_mtx_unlock(&mp_cpu_boot_lock);

	if (!cpu_datap(slot_num)->cpu_running) {
		/*
		 * Target never came up: report it and halt.  halt_cpu()
		 * presumably does not return here, making the KERN_SUCCESS
		 * below unreachable — TODO confirm.
		 */
		kprintf("Failed to start CPU %02d\n", slot_num);
		printf("Failed to start CPU %02d, rebooting...\n", slot_num);
		delay(1000000);
		halt_cpu();
		return KERN_SUCCESS;
	} else {
		kprintf("Started cpu %d (lapic id %08x)\n", slot_num, lapic);
		return KERN_SUCCESS;
	}
}
486 
487 #if     MP_DEBUG
488 cpu_signal_event_log_t  *cpu_signal[MAX_CPUS];
489 cpu_signal_event_log_t  *cpu_handle[MAX_CPUS];
490 
491 MP_EVENT_NAME_DECL();
492 
493 #endif  /* MP_DEBUG */
494 
495 /*
496  * Note: called with NULL state when polling for TLB flush and cross-calls.
497  */
/*
 * Service pending inter-processor signals for this cpu, dispatching on
 * the bits set in the per-cpu cpu_signals word: debugger entry (MP_KDP),
 * TLB shootdown (MP_TLB_FLUSH), cross-calls (MP_CALL), power-management
 * calls (MP_CALL_PM) and ASTs (MP_AST).  Loops until no bits remain set.
 */
int
cpu_signal_handler(x86_saved_state_t *regs)
{
#if     !MACH_KDP
#pragma unused (regs)
#endif /* !MACH_KDP */
	int             my_cpu;
	volatile int    *my_word;

	SCHED_STATS_INC(ipi_count);

	my_cpu = cpu_number();
	my_word = &cpu_data_ptr[my_cpu]->cpu_signals;
	/* Store the initial set of signals for diagnostics. New
	 * signals could arrive while these are being processed
	 * so it's no more than a hint.
	 */

	cpu_data_ptr[my_cpu]->cpu_prior_signals = *my_word;

	/* Drain signal bits; each iteration handles at most one of the
	 * mutually-exclusive events, then rechecks the whole word. */
	do {
#if     MACH_KDP
		if (i_bit(MP_KDP, my_word)) {
			DBGLOG(cpu_handle, my_cpu, MP_KDP);
			i_bit_clear(MP_KDP, my_word);
/* Ensure that the i386_kernel_state at the base of the
 * current thread's stack (if any) is synchronized with the
 * context at the moment of the interrupt, to facilitate
 * access through the debugger.
 */
			sync_iss_to_iks(regs);
			if (pmsafe_debug && !kdp_snapshot) {
				pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
			}
			/* Park here until the debugger session ends. */
			mp_kdp_wait(TRUE, FALSE);
			if (pmsafe_debug && !kdp_snapshot) {
				pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
			}
		} else
#endif  /* MACH_KDP */
		if (i_bit(MP_TLB_FLUSH, my_word)) {
			DBGLOG(cpu_handle, my_cpu, MP_TLB_FLUSH);
			i_bit_clear(MP_TLB_FLUSH, my_word);
			pmap_update_interrupt();
		} else if (i_bit(MP_CALL, my_word)) {
			DBGLOG(cpu_handle, my_cpu, MP_CALL);
			i_bit_clear(MP_CALL, my_word);
			mp_cpus_call_action();
		} else if (i_bit(MP_CALL_PM, my_word)) {
			DBGLOG(cpu_handle, my_cpu, MP_CALL_PM);
			i_bit_clear(MP_CALL_PM, my_word);
			mp_call_PM();
		}
		if (regs == NULL) {
			/* Called to poll only for cross-calls and TLB flush */
			break;
		} else if (i_bit(MP_AST, my_word)) {
			DBGLOG(cpu_handle, my_cpu, MP_AST);
			i_bit_clear(MP_AST, my_word);
			ast_check(cpu_to_processor(my_cpu));
		}
	} while (*my_word);

	return 0;
}
563 
564 long
NMI_pte_corruption_callback(__unused void * arg0,__unused void * arg1,uint16_t lcpu)565 NMI_pte_corruption_callback(__unused void *arg0, __unused void *arg1, uint16_t lcpu)
566 {
567 	static char     pstr[256];      /* global since this callback is serialized */
568 	void            *stackptr;
569 	__asm__ volatile ("movq %%rbp, %0" : "=m" (stackptr));
570 
571 	snprintf(&pstr[0], sizeof(pstr),
572 	    "Panic(CPU %d): PTE corruption detected on PTEP 0x%llx VAL 0x%llx\n",
573 	    lcpu, (unsigned long long)(uintptr_t)PTE_corrupted_ptr, *(uint64_t *)PTE_corrupted_ptr);
574 	panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, current_cpu_datap()->cpu_int_state);
575 	return 0;
576 }
577 
578 extern void kprintf_break_lock(void);
/*
 * Handler for NMIs delivered to this cpu.  Depending on global state it:
 * spins forever if a panic is already in progress without debugging;
 * panics with a reason-specific message when an NMIPI was sent for a
 * spinlock/TLB/cross-call/watchdog timeout; or parks the cpu in
 * mp_kdp_wait() (optionally entering the debugger under a VMM with the
 * DB_NMI boot-arg) until the debugger session completes.
 */
int
NMIInterruptHandler(x86_saved_state_t *regs)
{
	void            *stackptr;
	char            pstr[256];
	uint64_t        now = mach_absolute_time();

	if (panic_active() && !panicDebugging) {
		/* Panic in progress and no debugging: park this cpu forever. */
		if (pmsafe_debug) {
			pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
		}
		for (;;) {
			cpu_pause();
		}
	}

	atomic_incl(&NMIPI_acks, 1);
	atomic_incl(&NMI_count, 1);
	sync_iss_to_iks_unconditionally(regs);
	__asm__ volatile ("movq %%rbp, %0" : "=m" (stackptr));

	/* The cpu running the debugger ignores NMIs aimed at the others. */
	if (cpu_number() == debugger_cpu) {
		goto NMExit;
	}

	/* NMIPI panic paths: build a reason-specific message and panic. */
	if (NMI_panic_reason == SPINLOCK_TIMEOUT) {
		lck_spinlock_to_info_t lsti;

		lsti = os_atomic_load(&lck_spinlock_timeout_in_progress, acquire);
		snprintf(&pstr[0], sizeof(pstr),
		    "Panic(CPU %d, time %llu): NMIPI for spinlock acquisition timeout, spinlock: %p, "
		    "spinlock owner: %p, current_thread: %p, spinlock_owner_cpu: 0x%x\n",
		    cpu_number(), now, lsti->lock, (void *)lsti->owner_thread_cur,
		    current_thread(), lsti->owner_cpu);
		panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs);
	} else if (NMI_panic_reason == TLB_FLUSH_TIMEOUT) {
		snprintf(&pstr[0], sizeof(pstr),
		    "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: TLB flush timeout, TLB state:0x%x\n",
		    cpu_number(), now, current_cpu_datap()->cpu_tlb_invalid);
		panic_i386_backtrace(stackptr, 48, &pstr[0], TRUE, regs);
	} else if (NMI_panic_reason == CROSSCALL_TIMEOUT) {
		snprintf(&pstr[0], sizeof(pstr),
		    "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: cross-call timeout\n",
		    cpu_number(), now);
		panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs);
	} else if (NMI_panic_reason == INTERRUPT_WATCHDOG) {
		snprintf(&pstr[0], sizeof(pstr),
		    "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: interrupt watchdog for vector 0x%x\n",
		    cpu_number(), now, vector_timed_out);
		panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs);
	}

#if MACH_KDP
	if (pmsafe_debug && !kdp_snapshot) {
		pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
	}
	/* Acknowledge the NMI and clear any pending debugger signal bit. */
	current_cpu_datap()->cpu_NMI_acknowledged = TRUE;
	i_bit_clear(MP_KDP, &current_cpu_datap()->cpu_signals);
	if (panic_active() || NMI_panic_reason != NONE) {
		mp_kdp_wait(FALSE, TRUE);
	} else if (!mp_kdp_trap &&
	    !mp_kdp_is_NMI &&
	    virtualized && (debug_boot_arg & DB_NMI)) {
		/*
		 * Under a VMM with the debug boot-arg set, drop into kdp.
		 * Since an NMI is involved, there's a risk of contending with
		 * a panic. And side-effects of NMIs may result in entry into,
		 * and continuing from, the debugger being unreliable.
		 */
		if (__sync_bool_compare_and_swap(&mp_kdp_is_NMI, FALSE, TRUE)) {
			kprintf_break_lock();

			DebuggerWithContext(EXC_BREAKPOINT, saved_state64(regs),
			    "requested by NMI", DEBUGGER_OPTION_NONE,
			    (unsigned long)(char *)__builtin_return_address(0));

			mp_kdp_is_NMI = FALSE;
		} else {
			/* Lost the race to be the debugger cpu: just wait. */
			mp_kdp_wait(FALSE, FALSE);
		}
	} else {
		mp_kdp_wait(FALSE, FALSE);
	}
	if (pmsafe_debug && !kdp_snapshot) {
		pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
	}
#endif
NMExit:
	return 1;
}
669 
670 /*
671  * cpu_interrupt is really just to be used by the scheduler to
672  * get a CPU's attention it may not always issue an IPI.  If an
673  * IPI is always needed then use i386_cpu_IPI.
674  */
675 void
cpu_interrupt(int cpu)676 cpu_interrupt(int cpu)
677 {
678 	boolean_t did_IPI = FALSE;
679 
680 	if (smp_initialized
681 	    && pmCPUExitIdle(cpu_datap(cpu))) {
682 		i386_cpu_IPI(cpu);
683 		did_IPI = TRUE;
684 	}
685 
686 	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), cpu, did_IPI, 0, 0, 0);
687 }
688 
689 /*
690  * Send a true NMI via the local APIC to the specified CPU.
691  */
692 void
cpu_NMI_interrupt(int cpu)693 cpu_NMI_interrupt(int cpu)
694 {
695 	if (smp_initialized) {
696 		i386_send_NMI(cpu);
697 	}
698 }
699 
/*
 * Send an NMI to every running cpu in turn and spin (interrupts masked)
 * until each acknowledges, panicking if any cpu fails to respond within
 * roughly 10^10 TSC ticks.
 */
void
NMI_cpus(void)
{
	unsigned int    cpu;
	boolean_t       intrs_enabled;
	uint64_t        tsc_timeout;

	intrs_enabled = ml_set_interrupts_enabled(FALSE);
	NMIPI_enable(TRUE);
	for (cpu = 0; cpu < real_ncpus; cpu++) {
		if (!cpu_is_running(cpu)) {
			continue;
		}
		cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
		cpu_NMI_interrupt(cpu);
		/* No deadline if machine timeouts are suspended. */
		tsc_timeout = !machine_timeout_suspended() ?
		    rdtsc64() + (1000 * 1000 * 1000 * 10ULL) :
		    ~0ULL;
		while (!cpu_datap(cpu)->cpu_NMI_acknowledged) {
			/* Keep servicing TLB shootdowns while we spin. */
			handle_pending_TLB_flushes();
			cpu_pause();
			if (rdtsc64() > tsc_timeout) {
				panic("NMI_cpus() timeout cpu %d", cpu);
			}
		}
		cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
	}
	NMIPI_enable(FALSE);

	ml_set_interrupts_enabled(intrs_enabled);
}
731 
732 static void(*volatile mp_PM_func)(void) = NULL;
733 
734 static void
mp_call_PM(void)735 mp_call_PM(void)
736 {
737 	assert(!ml_get_interrupts_enabled());
738 
739 	if (mp_PM_func != NULL) {
740 		mp_PM_func();
741 	}
742 }
743 
744 void
cpu_PM_interrupt(int cpu)745 cpu_PM_interrupt(int cpu)
746 {
747 	assert(!ml_get_interrupts_enabled());
748 
749 	if (mp_PM_func != NULL) {
750 		if (cpu == cpu_number()) {
751 			mp_PM_func();
752 		} else {
753 			i386_signal_cpu(cpu, MP_CALL_PM, ASYNC);
754 		}
755 	}
756 }
757 
758 void
PM_interrupt_register(void (* fn)(void))759 PM_interrupt_register(void (*fn)(void))
760 {
761 	mp_PM_func = fn;
762 }
763 
/*
 * Post 'event' in the target cpu's signal word and send it an IPI.
 * In SYNC mode, spin until the target clears the bit, retrying (with a
 * debug message) every ~10^9 TSC ticks; no-op if the target isn't running.
 */
void
i386_signal_cpu(int cpu, mp_event_t event, mp_sync_t mode)
{
	volatile int    *signals = &cpu_datap(cpu)->cpu_signals;
	uint64_t        tsc_timeout;


	if (!cpu_datap(cpu)->cpu_running) {
		return;
	}

	if (event == MP_TLB_FLUSH) {
		KERNEL_DEBUG(TRACE_MP_TLB_FLUSH | DBG_FUNC_START, cpu, 0, 0, 0, 0);
	}

	DBGLOG(cpu_signal, cpu, event);

	i_bit_set(event, signals);
	i386_cpu_IPI(cpu);
	if (mode == SYNC) {
again:
		/* No deadline if machine timeouts are suspended. */
		tsc_timeout = !machine_timeout_suspended() ?
		    rdtsc64() + (1000 * 1000 * 1000) :
		    ~0ULL;
		while (i_bit(event, signals) && rdtsc64() < tsc_timeout) {
			cpu_pause();
		}
		/* Bit still set after the deadline: log and keep waiting. */
		if (i_bit(event, signals)) {
			DBG("i386_signal_cpu(%d, 0x%x, SYNC) timed out\n",
			    cpu, event);
			goto again;
		}
	}
	if (event == MP_TLB_FLUSH) {
		KERNEL_DEBUG(TRACE_MP_TLB_FLUSH | DBG_FUNC_END, cpu, 0, 0, 0, 0);
	}
}
801 
802 /*
803  * Helper function called when busy-waiting: panic if too long
804  * a TSC-based time has elapsed since the start of the spin.
805  */
806 static boolean_t
mp_spin_timeout(uint64_t tsc_start)807 mp_spin_timeout(uint64_t tsc_start)
808 {
809 	uint64_t        tsc_timeout;
810 
811 	cpu_pause();
812 	if (machine_timeout_suspended()) {
813 		return FALSE;
814 	}
815 
816 	/*
817 	 * The timeout is 4 * the spinlock timeout period
818 	 * unless we have serial console printing (kprintf) enabled
819 	 * in which case we allow an even greater margin.
820 	 */
821 	tsc_timeout = disable_serial_output ? LockTimeOutTSC << 2
822 	        : LockTimeOutTSC << 4;
823 	return rdtsc64() > tsc_start + tsc_timeout;
824 }
825 
826 /*
827  * Helper function to take a spinlock while ensuring that incoming IPIs
828  * are still serviced if interrupts are masked while we spin.
829  * Returns current interrupt state.
830  */
/*
 * Take 'lock', continuing to service incoming IPIs (via polling
 * cpu_signal_handler(NULL)) if interrupts are masked while we spin,
 * and panicking with owner diagnostics if the spin times out.
 * Returns the interrupt state at entry (TRUE if interrupts enabled).
 */
boolean_t
mp_safe_spin_lock(usimple_lock_t lock)
{
	if (ml_get_interrupts_enabled()) {
		/* Interrupts are on: IPIs get serviced normally, just lock. */
		simple_lock(lock, LCK_GRP_NULL);
		return TRUE;
	}

	lck_spinlock_to_info_t lsti;
	uint64_t tsc_spin_start = rdtsc64();

	while (!simple_lock_try(lock, LCK_GRP_NULL)) {
		/* Poll for cross-calls/TLB flushes since interrupts are off. */
		cpu_signal_handler(NULL);
		if (mp_spin_timeout(tsc_spin_start)) {
			uintptr_t lowner = (uintptr_t)lock->interlock.lock_data;

			/* Record the timeout, NMI the owner cpu, then panic. */
			lsti = lck_spinlock_timeout_hit(lock, lowner);
			NMIPI_panic(cpu_to_cpumask(lsti->owner_cpu), SPINLOCK_TIMEOUT);
			panic("mp_safe_spin_lock() timed out, lock: %p, "
			    "owner thread: 0x%lx, current_thread: %p, "
			    "owner on CPU 0x%x, time: %llu",
			    lock, lowner, current_thread(),
			    lsti->owner_cpu, mach_absolute_time());
		}
	}

	return FALSE;
}
859 
860 /*
861  * All-CPU rendezvous:
862  *      - CPUs are signalled,
863  *	- all execute the setup function (if specified),
864  *	- rendezvous (i.e. all cpus reach a barrier),
865  *	- all execute the action function (if specified),
866  *	- rendezvous again,
867  *	- execute the teardown function (if specified), and then
868  *	- resume.
869  *
870  * Note that the supplied external functions _must_ be reentrant and aware
871  * that they are running in parallel and in an unknown lock context.
872  */
873 
/*
 * Per-cpu executor for mp_rendezvous(): runs setup, spins at the entry
 * barrier until all mp_rv_ncpus participants arrive, runs the action,
 * spins at the exit barrier, runs teardown, and finally bumps the
 * completion count that the initiator waits on.  Panics via
 * mp_spin_timeout() if any barrier stalls.
 */
static void
mp_rendezvous_action(__unused void *null)
{
	boolean_t       intrs_enabled;
	uint64_t        tsc_spin_start;

	/*
	 * Note that mp_rv_lock was acquired by the thread that initiated the
	 * rendezvous and must have been acquired before we enter
	 * mp_rendezvous_action().
	 */
	current_cpu_datap()->cpu_rendezvous_in_progress = TRUE;

	/* setup function */
	if (mp_rv_setup_func != NULL) {
		mp_rv_setup_func(mp_rv_func_arg);
	}

	intrs_enabled = ml_get_interrupts_enabled();

	/* spin on entry rendezvous */
	atomic_incl(&mp_rv_entry, 1);
	tsc_spin_start = rdtsc64();

	while (mp_rv_entry < mp_rv_ncpus) {
		/* poll for pesky tlb flushes if interrupts disabled */
		if (!intrs_enabled) {
			handle_pending_TLB_flushes();
		}
		if (mp_spin_timeout(tsc_spin_start)) {
			panic("mp_rv_action() entry: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_entry, mp_rv_ncpus, tsc_spin_start, rdtsc64());
		}
	}

	/* action function */
	if (mp_rv_action_func != NULL) {
		mp_rv_action_func(mp_rv_func_arg);
	}

	/* spin on exit rendezvous */
	atomic_incl(&mp_rv_exit, 1);
	tsc_spin_start = rdtsc64();
	while (mp_rv_exit < mp_rv_ncpus) {
		if (!intrs_enabled) {
			handle_pending_TLB_flushes();
		}
		if (mp_spin_timeout(tsc_spin_start)) {
			panic("mp_rv_action() exit: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_exit, mp_rv_ncpus, tsc_spin_start, rdtsc64());
		}
	}

	/* teardown function */
	if (mp_rv_teardown_func != NULL) {
		mp_rv_teardown_func(mp_rv_func_arg);
	}

	current_cpu_datap()->cpu_rendezvous_in_progress = FALSE;

	/* Bump completion count */
	atomic_incl(&mp_rv_complete, 1);
}
935 
/*
 * Initiate an all-cpu rendezvous (see the protocol description above):
 * publish the three phase functions and their argument, cross-call
 * every other cpu into mp_rendezvous_action(), participate locally, and
 * wait for all participants to complete before releasing the shared
 * rendezvous state.  Before SMP is up, the phases simply run locally.
 */
void
mp_rendezvous(void (*setup_func)(void *),
    void (*action_func)(void *),
    void (*teardown_func)(void *),
    void *arg)
{
	uint64_t        tsc_spin_start;

	if (!smp_initialized) {
		/* Single-cpu/early boot: run the phases directly. */
		if (setup_func != NULL) {
			setup_func(arg);
		}
		if (action_func != NULL) {
			action_func(arg);
		}
		if (teardown_func != NULL) {
			teardown_func(arg);
		}
		return;
	}

	/* obtain rendezvous lock */
	mp_rendezvous_lock();

	/* set static function pointers */
	mp_rv_setup_func = setup_func;
	mp_rv_action_func = action_func;
	mp_rv_teardown_func = teardown_func;
	mp_rv_func_arg = arg;

	/* Reset the three barrier counters for this rendezvous. */
	mp_rv_entry    = 0;
	mp_rv_exit     = 0;
	mp_rv_complete = 0;

	/*
	 * signal other processors, which will call mp_rendezvous_action()
	 * with interrupts disabled
	 */
	mp_rv_ncpus = mp_cpus_call(CPUMASK_OTHERS, NOSYNC, &mp_rendezvous_action, NULL) + 1;

	/* call executor function on this cpu */
	mp_rendezvous_action(NULL);

	/*
	 * Spin for everyone to complete.
	 * This is necessary to ensure that all processors have proceeded
	 * from the exit barrier before we release the rendezvous structure.
	 */
	tsc_spin_start = rdtsc64();
	while (mp_rv_complete < mp_rv_ncpus) {
		if (mp_spin_timeout(tsc_spin_start)) {
			panic("mp_rendezvous() timeout: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_complete, mp_rv_ncpus, tsc_spin_start, rdtsc64());
		}
	}

	/* Tidy up */
	mp_rv_setup_func = NULL;
	mp_rv_action_func = NULL;
	mp_rv_teardown_func = NULL;
	mp_rv_func_arg = NULL;

	/* release lock */
	mp_rendezvous_unlock();
}
1000 
/* Acquire the global rendezvous lock (deadlock-safe spin acquire). */
void
mp_rendezvous_lock(void)
{
	(void) mp_safe_spin_lock(&mp_rv_lock);
}
1006 
/* Release the global rendezvous lock. */
void
mp_rendezvous_unlock(void)
{
	simple_unlock(&mp_rv_lock);
}
1012 
/*
 * Forcibly re-initialize the rendezvous lock.
 * Used to break a possibly-held lock (e.g. on debugger/panic paths)
 * rather than deadlocking on it.
 */
void
mp_rendezvous_break_lock(void)
{
	simple_lock_init(&mp_rv_lock, 0);
}
1018 
/*
 * Rendezvous setup phase: disable interrupts on this cpu, saving the
 * prior interrupt state in per-cpu data for teardown_restore_intrs().
 */
static void
setup_disable_intrs(__unused void * param_not_used)
{
	/* disable interrupts before the first barrier */
	boolean_t intr = ml_set_interrupts_enabled(FALSE);

	current_cpu_datap()->cpu_iflag = intr;
	DBG("CPU%d: %s\n", get_cpu_number(), __FUNCTION__);
}
1028 
/*
 * Rendezvous teardown phase: restore the interrupt state saved by
 * setup_disable_intrs() on this cpu.
 */
static void
teardown_restore_intrs(__unused void * param_not_used)
{
	/* restore interrupt flag following MTRR changes */
	ml_set_interrupts_enabled(current_cpu_datap()->cpu_iflag);
	DBG("CPU%d: %s\n", get_cpu_number(), __FUNCTION__);
}
1036 
1037 /*
1038  * A wrapper to mp_rendezvous() to call action_func() with interrupts disabled.
1039  * This is exported for use by kexts.
1040  */
void
mp_rendezvous_no_intrs(
	void (*action_func)(void *),
	void *arg)
{
	/* Rendezvous with interrupts disabled around action_func on each cpu. */
	mp_rendezvous(setup_disable_intrs,
	    action_func,
	    teardown_restore_intrs,
	    arg);
}
1051 
1052 
/* A single queued cross-call request; lives on a per-cpu or free queue. */
typedef struct {
	queue_chain_t   link;                   /* queue linkage */
	void            (*func)(void *, void *); /* routine to call */
	void            *arg0;                  /* routine's 1st arg */
	void            *arg1;                  /* routine's 2nd arg */
	cpumask_t       *maskp;                 /* completion response mask */
} mp_call_t;
1060 
1061 
/* A simple-locked queue of mp_call_t buffers. */
typedef struct {
	queue_head_t            queue;
	decl_simple_lock_data(, lock);
} mp_call_queue_t;
#define MP_CPUS_CALL_BUFS_PER_CPU       MAX_CPUS
static mp_call_queue_t  mp_cpus_call_freelist;  /* shared pool of call buffers */
static mp_call_queue_t  mp_cpus_call_head[MAX_CPUS];  /* per-cpu pending-call queues */
1069 
/*
 * Lock a call queue with interrupts disabled.
 * Returns the previous interrupt state, to be passed to
 * mp_call_head_unlock() for restoration.
 */
static inline boolean_t
mp_call_head_lock(mp_call_queue_t *cqp)
{
	boolean_t       intrs_enabled;

	intrs_enabled = ml_set_interrupts_enabled(FALSE);
	simple_lock(&cqp->lock, LCK_GRP_NULL);

	return intrs_enabled;
}
1080 
1081 /*
1082  * Deliver an NMIPI to a set of processors to cause them to panic .
1083  */
void
NMIPI_panic(cpumask_t cpu_mask, NMI_reason_t why)
{
	unsigned int cpu;
	cpumask_t cpu_bit;
	uint64_t deadline;

	NMIPI_enable(TRUE);
	NMI_panic_reason = why;

	/* Clear each target's ack flag, then send it an NMI. */
	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
		if ((cpu_mask & cpu_bit) == 0) {
			continue;
		}
		cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
		cpu_NMI_interrupt(cpu);
	}

	/* Wait (only so long) for NMI'ed cpus to respond */
	deadline = mach_absolute_time() + LockTimeOut;
	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
		if ((cpu_mask & cpu_bit) == 0) {
			continue;
		}
		while (!cpu_datap(cpu)->cpu_NMI_acknowledged &&
		    mach_absolute_time() < deadline) {
			cpu_pause();
		}
	}
}
1114 
#if MACH_ASSERT
/* Debug-only: TRUE iff the queue lock is held with interrupts disabled. */
static inline boolean_t
mp_call_head_is_locked(mp_call_queue_t *cqp)
{
	return !ml_get_interrupts_enabled() &&
	       hw_lock_held((hw_lock_t)&cqp->lock);
}
#endif
1123 
/* Unlock a call queue and restore the interrupt state from the matching lock. */
static inline void
mp_call_head_unlock(mp_call_queue_t *cqp, boolean_t intrs_enabled)
{
	simple_unlock(&cqp->lock);
	ml_set_interrupts_enabled(intrs_enabled);
}
1130 
/*
 * Pop a call buffer from the shared freelist.
 * Returns NULL if the freelist is empty (callers retry/spin).
 */
static inline mp_call_t *
mp_call_alloc(void)
{
	mp_call_t       *callp = NULL;
	boolean_t       intrs_enabled;
	mp_call_queue_t *cqp = &mp_cpus_call_freelist;

	intrs_enabled = mp_call_head_lock(cqp);
	if (!queue_empty(&cqp->queue)) {
		queue_remove_first(&cqp->queue, callp, typeof(callp), link);
	}
	mp_call_head_unlock(cqp, intrs_enabled);

	return callp;
}
1146 
/* Return a call buffer to the shared freelist. */
static inline void
mp_call_free(mp_call_t *callp)
{
	boolean_t       intrs_enabled;
	mp_call_queue_t *cqp = &mp_cpus_call_freelist;

	intrs_enabled = mp_call_head_lock(cqp);
	queue_enter_first(&cqp->queue, callp, typeof(callp), link);
	mp_call_head_unlock(cqp, intrs_enabled);
}
1157 
/*
 * Dequeue the next pending call, or NULL if the queue is empty.
 * Caller must hold the queue lock (asserted under MACH_ASSERT).
 */
static inline mp_call_t *
mp_call_dequeue_locked(mp_call_queue_t *cqp)
{
	mp_call_t       *callp = NULL;

	assert(mp_call_head_is_locked(cqp));
	if (!queue_empty(&cqp->queue)) {
		queue_remove_first(&cqp->queue, callp, typeof(callp), link);
	}
	return callp;
}
1169 
/* Append a call to a queue; caller must hold the queue lock. */
static inline void
mp_call_enqueue_locked(
	mp_call_queue_t *cqp,
	mp_call_t       *callp)
{
	queue_enter(&cqp->queue, callp, typeof(callp), link);
}
1177 
1178 /* Called on the boot processor to initialize global structures */
static void
mp_cpus_call_init(void)
{
	mp_call_queue_t *cqp = &mp_cpus_call_freelist;

	/* Freelist starts empty; buffers are added per-cpu in mp_cpus_call_cpu_init(). */
	DBG("mp_cpus_call_init()\n");
	simple_lock_init(&cqp->lock, 0);
	queue_init(&cqp->queue);
}
1188 
1189 /*
1190  * Called at processor registration to add call buffers to the free list
1191  * and to initialize the per-cpu call queue.
1192  */
void
mp_cpus_call_cpu_init(int cpu)
{
	int             i;
	mp_call_queue_t *cqp = &mp_cpus_call_head[cpu];
	mp_call_t       *callp;

	simple_lock_init(&cqp->lock, 0);
	queue_init(&cqp->queue);
	/* Contribute this cpu's quota of call buffers to the shared freelist. */
	for (i = 0; i < MP_CPUS_CALL_BUFS_PER_CPU; i++) {
		callp = zalloc_permanent_type(mp_call_t);
		mp_call_free(callp);
	}

	DBG("mp_cpus_call_init(%d) done\n", cpu);
}
1209 
1210 /*
1211  * This is called from cpu_signal_handler() to process an MP_CALL signal.
1212  * And also from i386_deactivate_cpu() when a cpu is being taken offline.
1213  */
static void
mp_cpus_call_action(void)
{
	mp_call_queue_t *cqp;
	boolean_t       intrs_enabled;
	mp_call_t       *callp;
	mp_call_t       call;

	assert(!ml_get_interrupts_enabled());
	cqp = &mp_cpus_call_head[cpu_number()];
	intrs_enabled = mp_call_head_lock(cqp);
	while ((callp = mp_call_dequeue_locked(cqp)) != NULL) {
		/* Copy call request to the stack to free buffer */
		call = *callp;
		mp_call_free(callp);
		if (call.func != NULL) {
			/*
			 * Drop the queue lock while running the function so
			 * it may itself enqueue/process calls; reacquire after.
			 */
			mp_call_head_unlock(cqp, intrs_enabled);
			KERNEL_DEBUG_CONSTANT(
				TRACE_MP_CPUS_CALL_ACTION,
				VM_KERNEL_UNSLIDE(call.func), VM_KERNEL_UNSLIDE_OR_PERM(call.arg0),
				VM_KERNEL_UNSLIDE_OR_PERM(call.arg1), VM_KERNEL_ADDRPERM(call.maskp), 0);
			call.func(call.arg0, call.arg1);
			(void) mp_call_head_lock(cqp);
		}
		/* Acknowledge completion to the caller, if it is waiting. */
		if (call.maskp != NULL) {
			i_bit_set(cpu_number(), call.maskp);
		}
	}
	mp_call_head_unlock(cqp, intrs_enabled);
}
1244 
1245 /*
1246  * mp_cpus_call() runs a given function on cpus specified in a given cpu mask.
1247  * Possible modes are:
1248  *  SYNC:   function is called serially on target cpus in logical cpu order
1249  *	    waiting for each call to be acknowledged before proceeding
1250  *  ASYNC:  function call is queued to the specified cpus
1251  *	    waiting for all calls to complete in parallel before returning
1252  *  NOSYNC: function calls are queued
1253  *	    but we return before confirmation of calls completing.
1254  * The action function may be NULL.
1255  * The cpu mask may include the local cpu. Offline cpus are ignored.
1256  * The return value is the number of cpus on which the call was made or queued.
1257  */
cpu_t
mp_cpus_call(
	cpumask_t       cpus,
	mp_sync_t       mode,
	void            (*action_func)(void *),
	void            *arg)
{
	/* Convenience wrapper: single-argument action, no response mask. */
	return mp_cpus_call1(
		cpus,
		mode,
		(void (*)(void *, void *))action_func,
		arg,
		NULL,
		NULL);
}
1273 
/*
 * Spin until all cpus in cpus_called have set their bit in *cpus_responded.
 * While spinning with interrupts disabled, service our own pending
 * cross-calls and IPIs to avoid deadlocking with a cpu that is waiting
 * on us.  Panics (after NMI'ing the laggards) on timeout.
 */
static void
mp_cpus_call_wait(boolean_t     intrs_enabled,
    cpumask_t     cpus_called,
    cpumask_t     *cpus_responded)
{
	mp_call_queue_t         *cqp;
	uint64_t                tsc_spin_start;

	assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
	cqp = &mp_cpus_call_head[cpu_number()];

	tsc_spin_start = rdtsc64();
	while (*cpus_responded != cpus_called) {
		if (!intrs_enabled) {
			/* Sniffing w/o locking */
			if (!queue_empty(&cqp->queue)) {
				mp_cpus_call_action();
			}
			cpu_signal_handler(NULL);
		}
		if (mp_spin_timeout(tsc_spin_start)) {
			cpumask_t       cpus_unresponsive;

			cpus_unresponsive = cpus_called & ~(*cpus_responded);
			NMIPI_panic(cpus_unresponsive, CROSSCALL_TIMEOUT);
			panic("mp_cpus_call_wait() timeout, cpus: 0x%llx",
			    cpus_unresponsive);
		}
	}
}
1304 
/*
 * Full-featured cross-call: run action_func(arg0, arg1) on the cpus in
 * 'cpus' according to 'mode' (SYNC/ASYNC/NOSYNC, see block comment above).
 * On return *cpus_calledp (if non-NULL) holds the mask of cpus actually
 * called; the return value is their count.
 */
cpu_t
mp_cpus_call1(
	cpumask_t       cpus,
	mp_sync_t       mode,
	void            (*action_func)(void *, void *),
	void            *arg0,
	void            *arg1,
	cpumask_t       *cpus_calledp)
{
	cpu_t           cpu = 0;
	boolean_t       intrs_enabled = FALSE;
	boolean_t       call_self = FALSE;
	cpumask_t       cpus_called = 0;
	cpumask_t       cpus_responded = 0;
	long            cpus_call_count = 0;
	uint64_t        tsc_spin_start;
	boolean_t       topo_lock;

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPUS_CALL | DBG_FUNC_START,
		cpus, mode, VM_KERNEL_UNSLIDE(action_func), VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1));

	/* Pre-SMP: at most a local call is possible. */
	if (!smp_initialized) {
		if ((cpus & CPUMASK_SELF) == 0) {
			goto out;
		}
		if (action_func != NULL) {
			intrs_enabled = ml_set_interrupts_enabled(FALSE);
			action_func(arg0, arg1);
			ml_set_interrupts_enabled(intrs_enabled);
		}
		call_self = TRUE;
		goto out;
	}

	/*
	 * Queue the call for each non-local requested cpu.
	 * This is performed under the topo lock to prevent changes to
	 * cpus online state and to prevent concurrent rendezvouses --
	 * although an exception is made if we're calling only the master
	 * processor since that always remains active. Note: this exception
	 * is expected for longterm timer nosync cross-calls to the master cpu.
	 */
	mp_disable_preemption();
	intrs_enabled = ml_get_interrupts_enabled();
	topo_lock = (cpus != cpu_to_cpumask(master_cpu));
	if (topo_lock) {
		ml_set_interrupts_enabled(FALSE);
		(void) mp_safe_spin_lock(&x86_topo_lock);
	}
	for (cpu = 0; cpu < (cpu_t) real_ncpus; cpu++) {
		if (((cpu_to_cpumask(cpu) & cpus) == 0) ||
		    !cpu_is_running(cpu)) {
			continue;
		}
		tsc_spin_start = rdtsc64();
		if (cpu == (cpu_t) cpu_number()) {
			/*
			 * We don't IPI ourself and if calling asynchronously,
			 * we defer our call until we have signalled all others.
			 */
			call_self = TRUE;
			if (mode == SYNC && action_func != NULL) {
				KERNEL_DEBUG_CONSTANT(
					TRACE_MP_CPUS_CALL_LOCAL,
					VM_KERNEL_UNSLIDE(action_func),
					VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1), 0, 0);
				action_func(arg0, arg1);
			}
		} else {
			/*
			 * Here to queue a call to cpu and IPI.
			 */
			mp_call_t       *callp = NULL;
			mp_call_queue_t *cqp = &mp_cpus_call_head[cpu];
			boolean_t       intrs_inner;

queue_call:
			if (callp == NULL) {
				callp = mp_call_alloc();
			}
			intrs_inner = mp_call_head_lock(cqp);
			if (callp == NULL) {
				/*
				 * No free call buffer: drop the lock, help
				 * drain pending work, and retry (with timeout).
				 */
				mp_call_head_unlock(cqp, intrs_inner);
				KERNEL_DEBUG_CONSTANT(
					TRACE_MP_CPUS_CALL_NOBUF,
					cpu, 0, 0, 0, 0);
				if (!intrs_inner) {
					/* Sniffing w/o locking */
					if (!queue_empty(&cqp->queue)) {
						mp_cpus_call_action();
					}
					handle_pending_TLB_flushes();
				}
				if (mp_spin_timeout(tsc_spin_start)) {
					panic("mp_cpus_call1() timeout start: 0x%llx, cur: 0x%llx",
					    tsc_spin_start, rdtsc64());
				}
				goto queue_call;
			}
			/* NOSYNC callers never wait, so no response mask is needed. */
			callp->maskp = (mode == NOSYNC) ? NULL : &cpus_responded;
			callp->func = action_func;
			callp->arg0 = arg0;
			callp->arg1 = arg1;
			mp_call_enqueue_locked(cqp, callp);
			cpus_call_count++;
			cpus_called |= cpu_to_cpumask(cpu);
			i386_signal_cpu(cpu, MP_CALL, ASYNC);
			mp_call_head_unlock(cqp, intrs_inner);
			if (mode == SYNC) {
				mp_cpus_call_wait(intrs_inner, cpus_called, &cpus_responded);
			}
		}
	}
	if (topo_lock) {
		simple_unlock(&x86_topo_lock);
		ml_set_interrupts_enabled(intrs_enabled);
	}

	/* Call locally if mode not SYNC */
	if (mode != SYNC && call_self) {
		KERNEL_DEBUG_CONSTANT(
			TRACE_MP_CPUS_CALL_LOCAL,
			VM_KERNEL_UNSLIDE(action_func), VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1), 0, 0);
		if (action_func != NULL) {
			ml_set_interrupts_enabled(FALSE);
			action_func(arg0, arg1);
			ml_set_interrupts_enabled(intrs_enabled);
		}
	}

	/* For ASYNC, now wait for all signaled cpus to complete their calls */
	if (mode == ASYNC) {
		mp_cpus_call_wait(intrs_enabled, cpus_called, &cpus_responded);
	}

	/* Safe to allow pre-emption now */
	mp_enable_preemption();

out:
	if (call_self) {
		cpus_called |= cpu_to_cpumask(cpu);
		cpus_call_count++;
	}

	if (cpus_calledp) {
		*cpus_calledp = cpus_called;
	}

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPUS_CALL | DBG_FUNC_END,
		cpus_call_count, cpus_called, 0, 0, 0);

	return (cpu_t) cpus_call_count;
}
1460 
1461 
/*
 * Per-cpu body of mp_broadcast(): run the broadcast function, then
 * decrement the outstanding-cpu count, waking the instigator when it
 * reaches zero.
 */
static void
mp_broadcast_action(__unused void *null)
{
	/* call action function */
	if (mp_bc_action_func != NULL) {
		mp_bc_action_func(mp_bc_func_arg);
	}

	/* if we're the last one through, wake up the instigator */
	if (atomic_decl_and_test(&mp_bc_count, 1)) {
		thread_wakeup(((event_t)(uintptr_t) &mp_bc_count));
	}
}
1475 
1476 /*
1477  * mp_broadcast() runs a given function on all active cpus.
1478  * The caller blocks until the functions has run on all cpus.
1479  * The caller will also block if there is another pending broadcast.
1480  */
void
mp_broadcast(
	void (*action_func)(void *),
	void *arg)
{
	/* Pre-SMP: run locally and return. */
	if (!smp_initialized) {
		if (action_func != NULL) {
			action_func(arg);
		}
		return;
	}

	/* obtain broadcast lock */
	lck_mtx_lock(&mp_bc_lock);

	/* set static function pointers */
	mp_bc_action_func = action_func;
	mp_bc_func_arg = arg;

	/* Prepare to sleep before signalling, so no wakeup can be missed. */
	assert_wait((event_t)(uintptr_t)&mp_bc_count, THREAD_UNINT);

	/*
	 * signal other processors, which will call mp_broadcast_action()
	 */
	mp_bc_count = real_ncpus;                       /* assume max possible active */
	mp_bc_ncpus = mp_cpus_call(CPUMASK_ALL, NOSYNC, *mp_broadcast_action, NULL);
	atomic_decl(&mp_bc_count, real_ncpus - mp_bc_ncpus); /* subtract inactive */

	/* block for other cpus to have run action_func */
	if (mp_bc_ncpus > 1) {
		thread_block(THREAD_CONTINUE_NULL);
	} else {
		/* Only this cpu ran: cancel the wait instead of blocking. */
		clear_wait(current_thread(), THREAD_AWAKENED);
	}

	/* release lock */
	lck_mtx_unlock(&mp_bc_lock);
}
1519 
/*
 * Send a KICK interrupt (no action function) to each running cpu in
 * 'cpus', e.g. to nudge them out of a low-power or idle state.
 */
void
mp_cpus_kick(cpumask_t cpus)
{
	cpu_t           cpu;
	boolean_t       intrs_enabled = FALSE;

	intrs_enabled = ml_set_interrupts_enabled(FALSE);
	/* Hold the topo lock so cpu online state is stable while signalling. */
	mp_safe_spin_lock(&x86_topo_lock);

	for (cpu = 0; cpu < (cpu_t) real_ncpus; cpu++) {
		if (((cpu_to_cpumask(cpu) & cpus) == 0)
		    || !cpu_is_running(cpu)) {
			continue;
		}

		lapic_send_ipi(cpu, LAPIC_VECTOR(KICK));
	}

	simple_unlock(&x86_topo_lock);
	ml_set_interrupts_enabled(intrs_enabled);
}
1541 
/*
 * Mark the current cpu as running (online).  Called with interrupts
 * disabled; takes the topo lock once SMP is up and flushes the TLB so
 * no stale translations survive the offline period.
 */
void
i386_activate_cpu(void)
{
	cpu_data_t      *cdp = current_cpu_datap();

	assert(!ml_get_interrupts_enabled());

	if (!smp_initialized) {
		cdp->cpu_running = TRUE;
		return;
	}

	mp_safe_spin_lock(&x86_topo_lock);
	cdp->cpu_running = TRUE;
	started_cpu();
	pmap_tlbi_range(0, ~0ULL, true, 0);
	simple_unlock(&x86_topo_lock);
}
1560 
/*
 * Take the current cpu offline: mark it not-running, migrate its timers
 * to the master cpu, shut down per-cpu monitoring, and drain any pending
 * IPIs/timer interrupts before returning.  Interrupts must be disabled.
 */
void
i386_deactivate_cpu(void)
{
	cpu_data_t      *cdp = current_cpu_datap();

	assert(!ml_get_interrupts_enabled());

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_START,
		0, 0, 0, 0, 0);

	mp_safe_spin_lock(&x86_topo_lock);
	cdp->cpu_running = FALSE;
	simple_unlock(&x86_topo_lock);

	/*
	 * Move all of this cpu's timers to the master/boot cpu,
	 * and poke it in case there's a sooner deadline for it to schedule.
	 */
	timer_queue_shutdown(&cdp->rtclock_timer.queue);
	mp_cpus_call(cpu_to_cpumask(master_cpu), ASYNC, timer_queue_expire_local, NULL);

#if MONOTONIC
	mt_cpu_down(cdp);
#endif /* MONOTONIC */
#if KPERF
	kptimer_stop_curcpu();
#endif /* KPERF */

	/*
	 * Open an interrupt window
	 * and ensure any pending IPI or timer is serviced
	 */
	mp_disable_preemption();
	ml_set_interrupts_enabled(TRUE);

	while (cdp->cpu_signals && x86_lcpu()->rtcDeadline != EndOfAllTime) {
		cpu_pause();
	}
	/*
	 * Ensure there's no remaining timer deadline set
	 * - AICPM may have left one active.
	 */
	setPop(0);

	ml_set_interrupts_enabled(FALSE);
	mp_enable_preemption();

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_END,
		0, 0, 0, 0, 0);
}
1613 
int     pmsafe_debug    = 1;    /* place cpus in a PM-safe mode on debugger entry */

#if     MACH_KDP
volatile boolean_t      mp_kdp_trap = FALSE;    /* TRUE while the debugger holds all cpus */
volatile boolean_t      mp_kdp_is_NMI = FALSE;
volatile unsigned long  mp_kdp_ncpus;           /* cpus currently halted in mp_kdp_wait() */
boolean_t               mp_kdp_state;           /* debugger cpu's saved interrupt state */
1621 
1622 
/*
 * Enter the kernel debugger: win the race to become the debugger cpu,
 * then signal (and if necessary NMI) all other running cpus into
 * mp_kdp_wait().  If proceed_on_failure, log and continue on timeouts
 * rather than panicking (used when already panicking).
 */
void
mp_kdp_enter(boolean_t proceed_on_failure)
{
	unsigned int    cpu;
	unsigned int    ncpus = 0;
	unsigned int    my_cpu;
	uint64_t        tsc_timeout;

	DBG("mp_kdp_enter()\n");

	/*
	 * Here to enter the debugger.
	 * In case of races, only one cpu is allowed to enter kdp after
	 * stopping others.
	 */
	mp_kdp_state = ml_set_interrupts_enabled(FALSE);
	my_cpu = cpu_number();

	if (my_cpu == (unsigned) debugger_cpu) {
		kprintf("\n\nRECURSIVE DEBUGGER ENTRY DETECTED\n\n");
		kdp_reset();
		return;
	}

	uint64_t start_time = cpu_datap(my_cpu)->debugger_entry_time = mach_absolute_time();
	int locked = 0;
	/* Loop until we hold the topo lock with no other debugger entry in flight. */
	while (!locked || mp_kdp_trap) {
		if (locked) {
			simple_unlock(&x86_topo_lock);
		}
		if (proceed_on_failure) {
			if (mach_absolute_time() - start_time > 500000000ll) {
				paniclog_append_noflush("mp_kdp_enter() can't get x86_topo_lock! Debugging anyway! #YOLO\n");
				break;
			}
			locked = simple_lock_try(&x86_topo_lock, LCK_GRP_NULL);
			if (!locked) {
				cpu_pause();
			}
		} else {
			mp_safe_spin_lock(&x86_topo_lock);
			locked = TRUE;
		}

		if (locked && mp_kdp_trap) {
			/* Another cpu won the race: park with it, then retry. */
			simple_unlock(&x86_topo_lock);
			DBG("mp_kdp_enter() race lost\n");
#if MACH_KDP
			mp_kdp_wait(TRUE, FALSE);
#endif
			locked = FALSE;
		}
	}

	if (pmsafe_debug && !kdp_snapshot) {
		pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
	}

	debugger_cpu = my_cpu;
	ncpus = 1;
	atomic_incl((volatile long *)&mp_kdp_ncpus, 1);
	mp_kdp_trap = TRUE;
	debugger_entry_time = cpu_datap(my_cpu)->debugger_entry_time;

	/*
	 * Deliver a nudge to other cpus, counting how many
	 */
	DBG("mp_kdp_enter() signaling other processors\n");
	if (force_immediate_debugger_NMI == FALSE) {
		for (cpu = 0; cpu < real_ncpus; cpu++) {
			if (cpu == my_cpu || !cpu_is_running(cpu)) {
				continue;
			}
			ncpus++;
			i386_signal_cpu(cpu, MP_KDP, ASYNC);
		}
		/*
		 * Wait for other processors to synchronize
		 */
		DBG("mp_kdp_enter() waiting for (%d) processors to suspend\n", ncpus);

		/*
		 * This timeout is rather arbitrary; we don't want to NMI
		 * processors that are executing at potentially
		 * "unsafe-to-interrupt" points such as the trampolines,
		 * but neither do we want to lose state by waiting too long.
		 */
		tsc_timeout = rdtsc64() + (LockTimeOutTSC);

		while (mp_kdp_ncpus != ncpus && rdtsc64() < tsc_timeout) {
			/*
			 * A TLB shootdown request may be pending--this would
			 * result in the requesting processor waiting in
			 * PMAP_UPDATE_TLBS() until this processor deals with it.
			 * Process it, so it can now enter mp_kdp_wait()
			 */
			handle_pending_TLB_flushes();
			cpu_pause();
		}
		/* If we've timed out, and some processor(s) are still unresponsive,
		 * interrupt them with an NMI via the local APIC, iff a panic is
		 * in progress.
		 */
		if (panic_active()) {
			NMIPI_enable(TRUE);
		}
		if (mp_kdp_ncpus != ncpus) {
			unsigned int wait_cycles = 0;
			if (proceed_on_failure) {
				paniclog_append_noflush("mp_kdp_enter() timed-out on cpu %d, NMI-ing\n", my_cpu);
			} else {
				DBG("mp_kdp_enter() timed-out on cpu %d, NMI-ing\n", my_cpu);
			}
			for (cpu = 0; cpu < real_ncpus; cpu++) {
				if (cpu == my_cpu || !cpu_is_running(cpu)) {
					continue;
				}
				if (cpu_signal_pending(cpu, MP_KDP)) {
					cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
					cpu_NMI_interrupt(cpu);
				}
			}
			/* Wait again for the same timeout */
			tsc_timeout = rdtsc64() + (LockTimeOutTSC);
			while (mp_kdp_ncpus != ncpus && rdtsc64() < tsc_timeout) {
				handle_pending_TLB_flushes();
				cpu_pause();
				++wait_cycles;
			}
			if (mp_kdp_ncpus != ncpus) {
				paniclog_append_noflush("mp_kdp_enter() NMI pending on cpus:");
				for (cpu = 0; cpu < real_ncpus; cpu++) {
					if (cpu_is_running(cpu) && !cpu_datap(cpu)->cpu_NMI_acknowledged) {
						paniclog_append_noflush(" %d", cpu);
					}
				}
				paniclog_append_noflush("\n");
				if (proceed_on_failure) {
					paniclog_append_noflush("mp_kdp_enter() timed-out during %s wait after NMI;"
					    "expected %u acks but received %lu after %u loops in %llu ticks\n",
					    (locked ? "locked" : "unlocked"), ncpus, mp_kdp_ncpus, wait_cycles, LockTimeOutTSC);
				} else {
					panic("mp_kdp_enter() timed-out during %s wait after NMI;"
					    "expected %u acks but received %lu after %u loops in %llu ticks",
					    (locked ? "locked" : "unlocked"), ncpus, mp_kdp_ncpus, wait_cycles, LockTimeOutTSC);
				}
			}
		}
	} else if (NMI_panic_reason != PTE_CORRUPTION) {  /* In the pte corruption case, the detecting CPU has already NMIed other CPUs */
		for (cpu = 0; cpu < real_ncpus; cpu++) {
			if (cpu == my_cpu || !cpu_is_running(cpu)) {
				continue;
			}
			cpu_NMI_interrupt(cpu);
		}
	}

	if (locked) {
		simple_unlock(&x86_topo_lock);
	}

	DBG("mp_kdp_enter() %d processors done %s\n",
	    (int)mp_kdp_ncpus, (mp_kdp_ncpus == ncpus) ? "OK" : "timed out");

	postcode(MP_KDP_ENTER);
}
1789 
1790 boolean_t
mp_kdp_all_cpus_halted()1791 mp_kdp_all_cpus_halted()
1792 {
1793 	unsigned int ncpus = 0, cpu = 0, my_cpu = 0;
1794 
1795 	my_cpu = cpu_number();
1796 	ncpus = 1; /* current CPU */
1797 	for (cpu = 0; cpu < real_ncpus; cpu++) {
1798 		if (cpu == my_cpu || !cpu_is_running(cpu)) {
1799 			continue;
1800 		}
1801 		ncpus++;
1802 	}
1803 
1804 	return mp_kdp_ncpus == ncpus;
1805 }
1806 
1807 static boolean_t
cpu_signal_pending(int cpu,mp_event_t event)1808 cpu_signal_pending(int cpu, mp_event_t event)
1809 {
1810 	volatile int    *signals = &cpu_datap(cpu)->cpu_signals;
1811 	boolean_t retval = FALSE;
1812 
1813 	if (i_bit(event, signals)) {
1814 		retval = TRUE;
1815 	}
1816 	return retval;
1817 }
1818 
/*
 * Debugger-context cross-cpu invocation: ask logical cpu 'lcpu' (which is
 * spinning in mp_kdp_wait() polling kdp_x86_xcpu_poll()) to run func(arg0,
 * arg1).  Spins until the target picks up the request or 'timeout'
 * absolute-time units elapse (timeout == 0 means wait forever).
 * Returns the function's result, or -1 on bad arguments (a -1 return can
 * also mean the call never completed, since .ret is preset to -1).
 */
long
kdp_x86_xcpu_invoke(const uint16_t lcpu, kdp_x86_xcpu_func_t func,
    void *arg0, void *arg1, uint64_t timeout)
{
	uint64_t now;

	if (lcpu > (real_ncpus - 1)) {
		return -1;
	}

	if (func == NULL) {
		return -1;
	}

	/* Publish the request; writing .cpu last hands it to the target. */
	kdp_xcpu_call_func.func = func;
	kdp_xcpu_call_func.ret  = -1;
	kdp_xcpu_call_func.arg0 = arg0;
	kdp_xcpu_call_func.arg1 = arg1;
	kdp_xcpu_call_func.cpu  = lcpu;
	DBG("Invoking function %p on CPU %d\n", func, (int32_t)lcpu);
	now = mach_absolute_time();
	while (kdp_xcpu_call_func.cpu != KDP_XCPU_NONE &&
	    (timeout == 0 || (mach_absolute_time() - now) < timeout)) {
		cpu_pause();
	}
	return kdp_xcpu_call_func.ret;
}
1846 
/*
 * Polled from mp_kdp_wait(): if a kdp_x86_xcpu_invoke() request targets
 * this cpu, run it and clear .cpu to signal completion to the requester.
 */
static void
kdp_x86_xcpu_poll(void)
{
	if ((uint16_t)cpu_number() == kdp_xcpu_call_func.cpu) {
		kdp_xcpu_call_func.ret =
		    kdp_xcpu_call_func.func(kdp_xcpu_call_func.arg0,
		    kdp_xcpu_call_func.arg1,
		    cpu_number());
		kdp_xcpu_call_func.cpu = KDP_XCPU_NONE;
	}
}
1858 
/*
 * Park this cpu while the debugger is active: bump mp_kdp_ncpus, then
 * spin (servicing TLB flushes if 'flush', and cross-cpu debugger calls)
 * until mp_kdp_trap clears.  An NMI-entered cpu (isNMI) spins forever.
 */
static void
mp_kdp_wait(boolean_t flush, boolean_t isNMI)
{
	DBG("mp_kdp_wait()\n");

	current_cpu_datap()->debugger_ipi_time = mach_absolute_time();
#if CONFIG_MCA
	/* If we've trapped due to a machine-check, save MCA registers */
	mca_check_save();
#endif

	atomic_incl((volatile long *)&mp_kdp_ncpus, 1);
	while (mp_kdp_trap || (isNMI == TRUE)) {
		/*
		 * A TLB shootdown request may be pending--this would result
		 * in the requesting processor waiting in PMAP_UPDATE_TLBS()
		 * until this processor handles it.
		 * Process it, so it can now enter mp_kdp_wait()
		 */
		if (flush) {
			handle_pending_TLB_flushes();
		}

		kdp_x86_xcpu_poll();
		cpu_pause();
	}

	atomic_decl((volatile long *)&mp_kdp_ncpus, 1);
	DBG("mp_kdp_wait() done\n");
}
1889 
/*
 * Leave the debugger: release the other cpus from mp_kdp_wait() by
 * clearing mp_kdp_trap, then wait for them all to resume before
 * restoring power-management mode and this cpu's interrupt state.
 */
void
mp_kdp_exit(void)
{
	DBG("mp_kdp_exit()\n");
	debugger_cpu = -1;
	atomic_decl((volatile long *)&mp_kdp_ncpus, 1);

	debugger_exit_time = mach_absolute_time();

	mp_kdp_trap = FALSE;
	mfence();	/* make the cleared trap flag globally visible */

	/* Wait other processors to stop spinning. XXX needs timeout */
	DBG("mp_kdp_exit() waiting for processors to resume\n");
	while (mp_kdp_ncpus > 0) {
		/*
		 * a TLB shootdown request may be pending... this would result in the requesting
		 * processor waiting in PMAP_UPDATE_TLBS() until this processor deals with it.
		 * Process it, so it can now enter mp_kdp_wait()
		 */
		handle_pending_TLB_flushes();

		cpu_pause();
	}

	if (pmsafe_debug && !kdp_snapshot) {
		pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
	}

	debugger_exit_time = mach_absolute_time();

	DBG("mp_kdp_exit() done\n");
	(void) ml_set_interrupts_enabled(mp_kdp_state);
	postcode(MP_KDP_EXIT);
}
1925 
1926 #endif  /* MACH_KDP */
1927 
1928 boolean_t
mp_recent_debugger_activity(void)1929 mp_recent_debugger_activity(void)
1930 {
1931 	uint64_t abstime = mach_absolute_time();
1932 	return ((abstime - debugger_entry_time) < LastDebuggerEntryAllowance) ||
1933 	       ((abstime - debugger_exit_time) < LastDebuggerEntryAllowance);
1934 }
1935 
1936 /*ARGSUSED*/
/* No per-processor AST-check initialization is needed on i386. */
void
init_ast_check(
	__unused processor_t    processor)
{
}
1942 
/*
 * Prod 'processor' to check its ASTs by sending it an MP_AST IPI.
 * No signal is needed when the target is the current cpu.
 */
void
cause_ast_check(
	processor_t     processor)
{
	int     cpu = processor->cpu_id;

	if (cpu != cpu_number()) {
		i386_signal_cpu(cpu, MP_AST, ASYNC);
		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), cpu, 1, 0, 0, 0);
	}
}
1954 
/*
 * Late (thread-context) initialization for a non-boot cpu.
 * param is FULL_SLAVE_INIT on first (cold) start, in which case the
 * clock subsystem is also initialized.
 */
void
slave_machine_init(void *param)
{
	/*
	 * Here in process context, but with interrupts disabled.
	 */
	DBG("slave_machine_init() CPU%d\n", get_cpu_number());

	if (param == FULL_SLAVE_INIT) {
		/*
		 * Cold start
		 */
		clock_init();
	}
	cpu_machine_init();     /* Interrupts enabled hereafter */
}
1971 
/* Out-of-line cpu_number(): undef the inline macro to export a real function. */
#undef cpu_number
int
cpu_number(void)
{
	return get_cpu_number();
}
1978 
/* Return the per-cpu data base address of the current cpu. */
vm_offset_t
current_percpu_base(void)
{
	return get_current_percpu_base();
}
1984 
/* Return the per-cpu data base address of an arbitrary cpu. */
vm_offset_t
other_percpu_base(int cpu)
{
	return cpu_datap(cpu)->cpu_pcpu_base;
}
1990 
1991 static void
cpu_prewarm_init()1992 cpu_prewarm_init()
1993 {
1994 	int i;
1995 
1996 	simple_lock_init(&cpu_warm_lock, 0);
1997 	queue_init(&cpu_warm_call_list);
1998 	for (i = 0; i < NUM_CPU_WARM_CALLS; i++) {
1999 		enqueue_head(&cpu_warm_call_list, (queue_entry_t)&cpu_warm_call_arr[i]);
2000 	}
2001 }
2002 
2003 static timer_call_t
grab_warm_timer_call()2004 grab_warm_timer_call()
2005 {
2006 	spl_t x;
2007 	timer_call_t call = NULL;
2008 
2009 	x = splsched();
2010 	simple_lock(&cpu_warm_lock, LCK_GRP_NULL);
2011 	if (!queue_empty(&cpu_warm_call_list)) {
2012 		call = (timer_call_t) dequeue_head(&cpu_warm_call_list);
2013 	}
2014 	simple_unlock(&cpu_warm_lock);
2015 	splx(x);
2016 
2017 	return call;
2018 }
2019 
2020 static void
free_warm_timer_call(timer_call_t call)2021 free_warm_timer_call(timer_call_t call)
2022 {
2023 	spl_t x;
2024 
2025 	x = splsched();
2026 	simple_lock(&cpu_warm_lock, LCK_GRP_NULL);
2027 	enqueue_head(&cpu_warm_call_list, (queue_entry_t)call);
2028 	simple_unlock(&cpu_warm_lock);
2029 	splx(x);
2030 }
2031 
2032 /*
2033  * Runs in timer call context (interrupts disabled).
2034  */
2035 static void
cpu_warm_timer_call_func(timer_call_param_t p0,__unused timer_call_param_t p1)2036 cpu_warm_timer_call_func(
2037 	timer_call_param_t p0,
2038 	__unused timer_call_param_t p1)
2039 {
2040 	free_warm_timer_call((timer_call_t)p0);
2041 	return;
2042 }
2043 
2044 /*
2045  * Runs with interrupts disabled on the CPU we wish to warm (i.e. CPU 0).
2046  */
2047 static void
_cpu_warm_setup(void * arg)2048 _cpu_warm_setup(
2049 	void *arg)
2050 {
2051 	cpu_warm_data_t cwdp = (cpu_warm_data_t)arg;
2052 
2053 	timer_call_enter(cwdp->cwd_call, cwdp->cwd_deadline, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL);
2054 	cwdp->cwd_result = 0;
2055 
2056 	return;
2057 }
2058 
2059 /*
2060  * Not safe to call with interrupts disabled.
2061  */
2062 kern_return_t
ml_interrupt_prewarm(uint64_t deadline)2063 ml_interrupt_prewarm(
2064 	uint64_t        deadline)
2065 {
2066 	struct cpu_warm_data cwd;
2067 	timer_call_t call;
2068 	cpu_t ct;
2069 
2070 	if (ml_get_interrupts_enabled() == FALSE) {
2071 		panic("%s: Interrupts disabled?", __FUNCTION__);
2072 	}
2073 
2074 	/*
2075 	 * If the platform doesn't need our help, say that we succeeded.
2076 	 */
2077 	if (!ml_get_interrupt_prewake_applicable()) {
2078 		return KERN_SUCCESS;
2079 	}
2080 
2081 	/*
2082 	 * Grab a timer call to use.
2083 	 */
2084 	call = grab_warm_timer_call();
2085 	if (call == NULL) {
2086 		return KERN_RESOURCE_SHORTAGE;
2087 	}
2088 
2089 	timer_call_setup(call, cpu_warm_timer_call_func, call);
2090 	cwd.cwd_call = call;
2091 	cwd.cwd_deadline = deadline;
2092 	cwd.cwd_result = 0;
2093 
2094 	/*
2095 	 * For now, non-local interrupts happen on the master processor.
2096 	 */
2097 	ct = mp_cpus_call(cpu_to_cpumask(master_cpu), SYNC, _cpu_warm_setup, &cwd);
2098 	if (ct == 0) {
2099 		free_warm_timer_call(call);
2100 		return KERN_FAILURE;
2101 	} else {
2102 		return cwd.cwd_result;
2103 	}
2104 }
2105 
2106 #if DEBUG || DEVELOPMENT
2107 void
kernel_spin(uint64_t spin_ns)2108 kernel_spin(uint64_t spin_ns)
2109 {
2110 	boolean_t       istate;
2111 	uint64_t        spin_abs;
2112 	uint64_t        deadline;
2113 	cpu_data_t      *cdp;
2114 
2115 	kprintf("kernel_spin(%llu) spinning uninterruptibly\n", spin_ns);
2116 	istate = ml_set_interrupts_enabled(FALSE);
2117 	cdp = current_cpu_datap();
2118 	nanoseconds_to_absolutetime(spin_ns, &spin_abs);
2119 
2120 	/* Fake interrupt handler entry for testing mp_interrupt_watchdog() */
2121 	cdp->cpu_int_event_time = mach_absolute_time();
2122 	cdp->cpu_int_state = (void *) USER_STATE(current_thread());
2123 
2124 	deadline = mach_absolute_time() + spin_ns;
2125 	while (mach_absolute_time() < deadline) {
2126 		cpu_pause();
2127 	}
2128 
2129 	cdp->cpu_int_event_time = 0;
2130 	cdp->cpu_int_state = NULL;
2131 
2132 	ml_set_interrupts_enabled(istate);
2133 	kprintf("kernel_spin() continuing\n");
2134 }
2135 
2136 /*
2137  * Called from the scheduler's maintenance thread,
2138  * scan running processors for long-running ISRs and:
2139  *  - panic if longer than LockTimeOut, or
2140  *  - log if more than a quantum.
2141  */
2142 void
mp_interrupt_watchdog(void)2143 mp_interrupt_watchdog(void)
2144 {
2145 	cpu_t                   cpu;
2146 	boolean_t               intrs_enabled = FALSE;
2147 	uint16_t                cpu_int_num;
2148 	uint64_t                cpu_int_event_time;
2149 	uint64_t                cpu_rip;
2150 	uint64_t                cpu_int_duration;
2151 	uint64_t                now;
2152 	x86_saved_state_t       *cpu_int_state;
2153 
2154 	if (__improbable(!mp_interrupt_watchdog_enabled)) {
2155 		return;
2156 	}
2157 
2158 	intrs_enabled = ml_set_interrupts_enabled(FALSE);
2159 	now = mach_absolute_time();
2160 	/*
2161 	 * While timeouts are not suspended,
2162 	 * check all other processors for long outstanding interrupt handling.
2163 	 */
2164 	for (cpu = 0;
2165 	    cpu < (cpu_t) real_ncpus && !machine_timeout_suspended();
2166 	    cpu++) {
2167 		if ((cpu == (cpu_t) cpu_number()) ||
2168 		    (!cpu_is_running(cpu))) {
2169 			continue;
2170 		}
2171 		cpu_int_event_time = cpu_datap(cpu)->cpu_int_event_time;
2172 		if (cpu_int_event_time == 0) {
2173 			continue;
2174 		}
2175 		if (__improbable(now < cpu_int_event_time)) {
2176 			continue;       /* skip due to inter-processor skew */
2177 		}
2178 		cpu_int_state = cpu_datap(cpu)->cpu_int_state;
2179 		if (__improbable(cpu_int_state == NULL)) {
2180 			/* The interrupt may have been dismissed */
2181 			continue;
2182 		}
2183 
2184 		/* Here with a cpu handling an interrupt */
2185 
2186 		cpu_int_duration = now - cpu_int_event_time;
2187 		if (__improbable(cpu_int_duration > LockTimeOut)) {
2188 			cpu_int_num = saved_state64(cpu_int_state)->isf.trapno;
2189 			cpu_rip = saved_state64(cpu_int_state)->isf.rip;
2190 			vector_timed_out = cpu_int_num;
2191 			NMIPI_panic(cpu_to_cpumask(cpu), INTERRUPT_WATCHDOG);
2192 			panic("Interrupt watchdog, "
2193 			    "cpu: %d interrupt: 0x%x time: %llu..%llu state: %p RIP: 0x%llx",
2194 			    cpu, cpu_int_num, cpu_int_event_time, now, cpu_int_state, cpu_rip);
2195 			/* NOT REACHED */
2196 		} else if (__improbable(cpu_int_duration > (uint64_t) std_quantum)) {
2197 			mp_interrupt_watchdog_events++;
2198 			cpu_int_num = saved_state64(cpu_int_state)->isf.trapno;
2199 			cpu_rip = saved_state64(cpu_int_state)->isf.rip;
2200 			ml_set_interrupts_enabled(intrs_enabled);
2201 			printf("Interrupt watchdog, "
2202 			    "cpu: %d interrupt: 0x%x time: %llu..%llu RIP: 0x%llx\n",
2203 			    cpu, cpu_int_num, cpu_int_event_time, now, cpu_rip);
2204 			return;
2205 		}
2206 	}
2207 
2208 	ml_set_interrupts_enabled(intrs_enabled);
2209 }
2210 #endif
2211