1 /*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31
32 #include <mach_kdp.h>
33 #include <kdp/kdp_internal.h>
34 #include <mach_ldebug.h>
35
36 #include <mach/mach_types.h>
37 #include <mach/kern_return.h>
38
39 #include <kern/kern_types.h>
40 #include <kern/startup.h>
41 #include <kern/timer_queue.h>
42 #include <kern/processor.h>
43 #include <kern/cpu_number.h>
44 #include <kern/cpu_data.h>
45 #include <kern/assert.h>
46 #include <kern/lock_group.h>
47 #include <kern/machine.h>
48 #include <kern/pms.h>
49 #include <kern/misc_protos.h>
50 #include <kern/timer_call.h>
51 #include <kern/zalloc.h>
52 #include <kern/queue.h>
53 #include <kern/monotonic.h>
54 #include <kern/kern_stackshot.h>
55 #include <prng/random.h>
56
57 #include <vm/vm_map.h>
58 #include <vm/vm_kern.h>
59
60 #include <i386/bit_routines.h>
61 #include <i386/proc_reg.h>
62 #include <i386/cpu_threads.h>
63 #include <i386/mp_desc.h>
64 #include <i386/misc_protos.h>
65 #include <i386/trap_internal.h>
66 #include <i386/postcode.h>
67 #include <i386/machine_routines.h>
68 #include <i386/mp.h>
69 #include <i386/mp_events.h>
70 #include <i386/lapic.h>
71 #include <i386/cpuid.h>
72 #include <i386/fpu.h>
73 #include <i386/machine_cpu.h>
74 #include <i386/pmCPU.h>
75 #if CONFIG_MCA
76 #include <i386/machine_check.h>
77 #endif
78 #include <i386/acpi.h>
79
80 #include <sys/kdebug.h>
81
82 #include <console/serial_protos.h>
83
84 #if KPERF
85 #include <kperf/kptimer.h>
86 #endif /* KPERF */
87
88 #if MP_DEBUG
89 #define PAUSE delay(1000000)
90 #define DBG(x...) kprintf(x)
91 #else
92 #define DBG(x...)
93 #define PAUSE
94 #endif /* MP_DEBUG */
95
96 /* Debugging/test trace events: */
97 #define TRACE_MP_TLB_FLUSH MACHDBG_CODE(DBG_MACH_MP, 0)
98 #define TRACE_MP_CPUS_CALL MACHDBG_CODE(DBG_MACH_MP, 1)
99 #define TRACE_MP_CPUS_CALL_LOCAL MACHDBG_CODE(DBG_MACH_MP, 2)
100 #define TRACE_MP_CPUS_CALL_ACTION MACHDBG_CODE(DBG_MACH_MP, 3)
101 #define TRACE_MP_CPUS_CALL_NOBUF MACHDBG_CODE(DBG_MACH_MP, 4)
102 #define TRACE_MP_CPU_FAST_START MACHDBG_CODE(DBG_MACH_MP, 5)
103 #define TRACE_MP_CPU_START MACHDBG_CODE(DBG_MACH_MP, 6)
104 #define TRACE_MP_CPU_DEACTIVATE MACHDBG_CODE(DBG_MACH_MP, 7)
105
106 #define ABS(v) (((v) > 0)?(v):-(v))
107
108 void slave_boot_init(void);
109 void i386_cpu_IPI(int cpu);
110
111 #if MACH_KDP
112 static void mp_kdp_wait(boolean_t flush, boolean_t isNMI);
113 #endif /* MACH_KDP */
114
115 #if MACH_KDP
116 static boolean_t cpu_signal_pending(int cpu, mp_event_t event);
117 #endif /* MACH_KDP */
118 static int NMIInterruptHandler(x86_saved_state_t *regs);
119
120 boolean_t smp_initialized = FALSE;
121 uint32_t TSC_sync_margin = 0xFFF;
122 volatile boolean_t force_immediate_debugger_NMI = FALSE;
123 volatile boolean_t pmap_tlb_flush_timeout = FALSE;
124 #if DEBUG || DEVELOPMENT
125 boolean_t mp_interrupt_watchdog_enabled = TRUE;
126 uint32_t mp_interrupt_watchdog_events = 0;
127 #endif
128
129 SIMPLE_LOCK_DECLARE(debugger_callback_lock, 0);
130 struct debugger_callback *debugger_callback = NULL;
131
132 static LCK_GRP_DECLARE(smp_lck_grp, "i386_smp");
133 static LCK_MTX_DECLARE(mp_cpu_boot_lock, &smp_lck_grp);
134
135 /* Variables needed for MP rendezvous. */
136 SIMPLE_LOCK_DECLARE(mp_rv_lock, 0);
137 static void (*mp_rv_setup_func)(void *arg);
138 static void (*mp_rv_action_func)(void *arg);
139 static void (*mp_rv_teardown_func)(void *arg);
140 static void *mp_rv_func_arg;
141 static volatile int mp_rv_ncpus;
142 /* Cache-aligned barriers: */
143 static volatile long mp_rv_entry __attribute__((aligned(64)));
144 static volatile long mp_rv_exit __attribute__((aligned(64)));
145 static volatile long mp_rv_complete __attribute__((aligned(64)));
146
147 volatile uint64_t debugger_entry_time;
148 volatile uint64_t debugger_exit_time;
149 #if MACH_KDP
150 #include <kdp/kdp.h>
151 extern int kdp_snapshot;
152 static struct _kdp_xcpu_call_func {
153 kdp_x86_xcpu_func_t func;
154 void *arg0, *arg1;
155 volatile long ret;
156 volatile uint16_t cpu;
157 } kdp_xcpu_call_func = {
158 .cpu = KDP_XCPU_NONE
159 };
160
161 #endif
162
163 /* Variables needed for MP broadcast. */
164 static void (*mp_bc_action_func)(void *arg);
165 static void *mp_bc_func_arg;
166 static int mp_bc_ncpus;
167 static volatile long mp_bc_count;
168 static LCK_MTX_DECLARE(mp_bc_lock, &smp_lck_grp);
169 static volatile int debugger_cpu = -1;
170 volatile long NMIPI_acks = 0;
171 volatile long NMI_count = 0;
172 static int vector_timed_out;
173
174 NMI_reason_t NMI_panic_reason = NONE;
175 extern void NMI_cpus(void);
176
177 static void mp_cpus_call_init(void);
178 static void mp_cpus_call_action(void);
179 static void mp_call_PM(void);
180
181 char mp_slave_stack[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); // Temp stack for slave init
182
183 /* PAL-related routines */
184 boolean_t i386_smp_init(int nmi_vector, i386_intr_func_t nmi_handler,
185 int ipi_vector, i386_intr_func_t ipi_handler);
186 void i386_start_cpu(int lapic_id, int cpu_num);
187 void i386_send_NMI(int cpu);
188 void NMIPI_enable(boolean_t);
189
190 #define NUM_CPU_WARM_CALLS 20
191 struct timer_call cpu_warm_call_arr[NUM_CPU_WARM_CALLS];
192 queue_head_t cpu_warm_call_list;
193 decl_simple_lock_data(static, cpu_warm_lock);
194
195 typedef struct cpu_warm_data {
196 timer_call_t cwd_call;
197 uint64_t cwd_deadline;
198 int cwd_result;
199 } *cpu_warm_data_t;
200
201 static void cpu_prewarm_init(void);
202 static void cpu_warm_timer_call_func(timer_call_param_t p0, timer_call_param_t p1);
203 static void _cpu_warm_setup(void *arg);
204 static timer_call_t grab_warm_timer_call(void);
205 static void free_warm_timer_call(timer_call_t call);
206
void
smp_init(void)
{
	console_init();

	/*
	 * Install the NMI and inter-processor interrupt handlers with the
	 * platform layer; if this fails, SMP is unavailable and we run
	 * uniprocessor.
	 */
	if (!i386_smp_init(LAPIC_NMI_INTERRUPT, NMIInterruptHandler,
	    LAPIC_VECTOR(INTERPROCESSOR), cpu_signal_handler)) {
		return;
	}

	cpu_thread_init();

	DBGLOG_CPU_INIT(master_cpu);

	/* Set up the cross-call queues and the boot cpu's call buffers */
	mp_cpus_call_init();
	mp_cpus_call_cpu_init(master_cpu);

#if DEBUG || DEVELOPMENT
	/* Allow the interrupt watchdog to be toggled by boot-arg */
	if (PE_parse_boot_argn("interrupt_watchdog",
	    &mp_interrupt_watchdog_enabled,
	    sizeof(mp_interrupt_watchdog_enabled))) {
		kprintf("Interrupt watchdog %sabled\n",
		    mp_interrupt_watchdog_enabled ? "en" : "dis");
	}
#endif

	/*
	 * TSC_sync_margin may be overridden by boot-arg; it is disabled
	 * entirely under a VMM (presumably because virtualized TSC values
	 * are not comparable across vCPUs).
	 */
	if (PE_parse_boot_argn("TSC_sync_margin",
	    &TSC_sync_margin, sizeof(TSC_sync_margin))) {
		kprintf("TSC sync Margin 0x%x\n", TSC_sync_margin);
	} else if (cpuid_vmm_present()) {
		kprintf("TSC sync margin disabled\n");
		TSC_sync_margin = 0;
	}
	smp_initialized = TRUE;

	cpu_prewarm_init();

	return;
}
246
/* Parameters handed from the starter cpu to start_cpu() via rendezvous */
typedef struct {
	int	target_cpu;	/* cpu slot number being started */
	int	target_lapic;	/* local APIC id of the target */
	int	starter_cpu;	/* cpu performing the start sequence */
} processor_start_info_t;
static processor_start_info_t	start_info __attribute__((aligned(64)));

/*
 * Cache-alignment is to avoid cross-cpu false-sharing interference.
 */
static volatile long		tsc_entry_barrier __attribute__((aligned(64)));
static volatile long		tsc_exit_barrier __attribute__((aligned(64)));
static volatile uint64_t	tsc_target __attribute__((aligned(64)));
260
261 /*
262 * Poll a CPU to see when it has marked itself as running.
263 */
264 static void
mp_wait_for_cpu_up(int slot_num,unsigned int iters,unsigned int usecdelay)265 mp_wait_for_cpu_up(int slot_num, unsigned int iters, unsigned int usecdelay)
266 {
267 while (iters-- > 0) {
268 if (cpu_datap(slot_num)->cpu_running) {
269 break;
270 }
271 delay(usecdelay);
272 }
273 }
274
275 /*
276 * Quickly bring a CPU back online which has been halted.
277 */
278 kern_return_t
intel_startCPU_fast(int slot_num)279 intel_startCPU_fast(int slot_num)
280 {
281 kern_return_t rc;
282
283 /*
284 * Try to perform a fast restart
285 */
286 rc = pmCPUExitHalt(slot_num);
287 if (rc != KERN_SUCCESS) {
288 /*
289 * The CPU was not eligible for a fast restart.
290 */
291 return rc;
292 }
293
294 KERNEL_DEBUG_CONSTANT(
295 TRACE_MP_CPU_FAST_START | DBG_FUNC_START,
296 slot_num, 0, 0, 0, 0);
297
298 /*
299 * Wait until the CPU is back online.
300 */
301 mp_disable_preemption();
302
303 /*
304 * We use short pauses (1us) for low latency. 30,000 iterations is
305 * longer than a full restart would require so it should be more
306 * than long enough.
307 */
308
309 mp_wait_for_cpu_up(slot_num, 30000, 1);
310 mp_enable_preemption();
311
312 KERNEL_DEBUG_CONSTANT(
313 TRACE_MP_CPU_FAST_START | DBG_FUNC_END,
314 slot_num, cpu_datap(slot_num)->cpu_running, 0, 0, 0);
315
316 /*
317 * Check to make sure that the CPU is really running. If not,
318 * go through the slow path.
319 */
320 if (cpu_datap(slot_num)->cpu_running) {
321 return KERN_SUCCESS;
322 } else {
323 return KERN_FAILURE;
324 }
325 }
326
/*
 * Executed on a freshly-started cpu (cpu_running already TRUE) to
 * complete the TSC synchronization handshake with the starter cpu.
 * The starter-side counterpart of this protocol is in start_cpu().
 */
static void
started_cpu(void)
{
	/* Here on the started cpu with cpu_running set TRUE */

	if (TSC_sync_margin &&
	    start_info.target_cpu == cpu_number()) {
		/*
		 * I've just started-up, synchronize again with the starter cpu
		 * and then snap my TSC.
		 */
		tsc_target = 0;
		atomic_decl(&tsc_entry_barrier, 1);
		while (tsc_entry_barrier != 0) {
			; /* spin for starter and target at barrier */
		}
		/* Both sides past the barrier: record this cpu's TSC for
		 * the starter to compare against its own. */
		tsc_target = rdtsc64();
		atomic_decl(&tsc_exit_barrier, 1);
	}
}
347
/*
 * Rendezvous action for processor startup: runs on every cpu, but only
 * the designated starter actually kicks the target (via its lapic),
 * waits for it to come up, and then performs the TSC synchronization
 * check against started_cpu() on the target.
 */
static void
start_cpu(void *arg)
{
	int			i = 1000;
	processor_start_info_t	*psip = (processor_start_info_t *) arg;

	/* Ignore this if the current processor is not the starter */
	if (cpu_number() != psip->starter_cpu) {
		return;
	}

	DBG("start_cpu(%p) about to start cpu %d, lapic %d\n",
	    arg, psip->target_cpu, psip->target_lapic);

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_START | DBG_FUNC_START,
		psip->target_cpu,
		psip->target_lapic, 0, 0, 0);

	i386_start_cpu(psip->target_lapic, psip->target_cpu);

#ifdef	POSTCODE_DELAY
	/* Wait much longer if postcodes are displayed for a delay period. */
	i *= 10000;
#endif
	DBG("start_cpu(%p) about to wait for cpu %d\n",
	    arg, psip->target_cpu);

	/* Poll every 100us, up to i*100 iterations, for the target */
	mp_wait_for_cpu_up(psip->target_cpu, i * 100, 100);

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_START | DBG_FUNC_END,
		psip->target_cpu,
		cpu_datap(psip->target_cpu)->cpu_running, 0, 0, 0);

	if (TSC_sync_margin &&
	    cpu_datap(psip->target_cpu)->cpu_running) {
		/*
		 * Compare the TSC from the started processor with ours.
		 * Report and log/panic if it diverges by more than
		 * TSC_sync_margin (TSC_SYNC_MARGIN) ticks. This margin
		 * can be overriden by boot-arg (with 0 meaning no checking).
		 */
		uint64_t	tsc_starter;
		int64_t		tsc_delta;
		atomic_decl(&tsc_entry_barrier, 1);
		while (tsc_entry_barrier != 0) {
			; /* spin for both processors at barrier */
		}
		tsc_starter = rdtsc64();
		atomic_decl(&tsc_exit_barrier, 1);
		while (tsc_exit_barrier != 0) {
			; /* spin for target to store its TSC */
		}
		tsc_delta = tsc_target - tsc_starter;
		kprintf("TSC sync for cpu %d: 0x%016llx delta 0x%llx (%lld)\n",
		    psip->target_cpu, tsc_target, tsc_delta, tsc_delta);
#if DEBUG || DEVELOPMENT
		/*
		 * Stash the delta for inspection later, since we can no
		 * longer print/log it with interrupts disabled.
		 */
		cpu_datap(psip->target_cpu)->tsc_sync_delta = tsc_delta;
#endif
		if (ABS(tsc_delta) > (int64_t) TSC_sync_margin) {
			/* Divergence beyond the margin: fatal on DEBUG
			 * kernels, logged otherwise. */
#if DEBUG
			panic(
#else
			kprintf(
#endif
				"Unsynchronized TSC for cpu %d: "
				"0x%016llx, delta 0x%llx\n",
				psip->target_cpu, tsc_target, tsc_delta);
		}
	}
}
424
/*
 * Slow-path startup of the cpu in the given slot: (re)initializes its
 * descriptor tables and then runs the start sequence under an all-cpu
 * rendezvous with interrupts disabled.  Serialized by mp_cpu_boot_lock
 * since all targets share the slave boot stack.
 */
kern_return_t
intel_startCPU(
	int	slot_num)
{
	int		lapic = cpu_to_lapic[slot_num];
	boolean_t	istate;

	assert(lapic != -1);

	DBGLOG_CPU_INIT(slot_num);

	DBG("intel_startCPU(%d) lapic_id=%d\n", slot_num, lapic);
	DBG("IdlePTD(%p): 0x%x\n", &IdlePTD, (int) (uintptr_t)IdlePTD);

	/*
	 * Initialize (or re-initialize) the descriptor tables for this cpu.
	 * Propagate processor mode to slave.
	 */
	cpu_desc_init(cpu_datap(slot_num));

	/* Serialize use of the slave boot stack, etc. */
	lck_mtx_lock(&mp_cpu_boot_lock);

	istate = ml_set_interrupts_enabled(FALSE);
	if (slot_num == get_cpu_number()) {
		/* Target is the calling cpu: it's already running. */
		ml_set_interrupts_enabled(istate);
		lck_mtx_unlock(&mp_cpu_boot_lock);
		return KERN_SUCCESS;
	}

	start_info.starter_cpu = cpu_number();
	start_info.target_cpu = slot_num;
	start_info.target_lapic = lapic;
	/* Two participants (starter and target) at each TSC barrier */
	tsc_entry_barrier = 2;
	tsc_exit_barrier = 2;

	/*
	 * Perform the processor startup sequence with all running
	 * processors rendezvous'ed. This is required during periods when
	 * the cache-disable bit is set for MTRR/PAT initialization.
	 */
	mp_rendezvous_no_intrs(start_cpu, (void *) &start_info);

	start_info.target_cpu = 0;

	ml_set_interrupts_enabled(istate);
	lck_mtx_unlock(&mp_cpu_boot_lock);

	if (!cpu_datap(slot_num)->cpu_running) {
		kprintf("Failed to start CPU %02d\n", slot_num);
		printf("Failed to start CPU %02d, rebooting...\n", slot_num);
		delay(1000000);
		/* NOTE(review): halt_cpu() presumably does not return, so
		 * the KERN_SUCCESS below is unreachable — confirm. */
		halt_cpu();
		return KERN_SUCCESS;
	} else {
		kprintf("Started cpu %d (lapic id %08x)\n", slot_num, lapic);
		return KERN_SUCCESS;
	}
}
484
485 #if MP_DEBUG
486 cpu_signal_event_log_t *cpu_signal[MAX_CPUS];
487 cpu_signal_event_log_t *cpu_handle[MAX_CPUS];
488
489 MP_EVENT_NAME_DECL();
490
491 #endif /* MP_DEBUG */
492
/*
 * Service the inter-processor signal bits posted in this cpu's
 * cpu_signals word: KDP debugger entry, TLB flush, cross-call,
 * power-management call, and (only from true interrupt context)
 * AST checks.  Loops until all posted signals are drained.
 *
 * Note: called with NULL state when polling for TLB flush and cross-calls.
 */
int
cpu_signal_handler(x86_saved_state_t *regs)
{
#if	!MACH_KDP
#pragma unused (regs)
#endif /* !MACH_KDP */
	int		my_cpu;
	volatile int	*my_word;

	SCHED_STATS_INC(ipi_count);

	my_cpu = cpu_number();
	my_word = &cpu_data_ptr[my_cpu]->cpu_signals;
	/* Store the initial set of signals for diagnostics. New
	 * signals could arrive while these are being processed
	 * so it's no more than a hint.
	 */

	cpu_data_ptr[my_cpu]->cpu_prior_signals = *my_word;

	do {
#if	MACH_KDP
		if (i_bit(MP_KDP, my_word)) {
			DBGLOG(cpu_handle, my_cpu, MP_KDP);
			i_bit_clear(MP_KDP, my_word);
			/* Ensure that the i386_kernel_state at the base of the
			 * current thread's stack (if any) is synchronized with the
			 * context at the moment of the interrupt, to facilitate
			 * access through the debugger.
			 */
			sync_iss_to_iks(regs);
			if (pmsafe_debug && !kdp_snapshot) {
				pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
			}
			/* Park here until the debugger session ends */
			mp_kdp_wait(TRUE, FALSE);
			if (pmsafe_debug && !kdp_snapshot) {
				pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
			}
		} else
#endif	/* MACH_KDP */
		if (i_bit(MP_TLB_FLUSH, my_word)) {
			DBGLOG(cpu_handle, my_cpu, MP_TLB_FLUSH);
			i_bit_clear(MP_TLB_FLUSH, my_word);
			pmap_update_interrupt();
		} else if (i_bit(MP_CALL, my_word)) {
			DBGLOG(cpu_handle, my_cpu, MP_CALL);
			i_bit_clear(MP_CALL, my_word);
			mp_cpus_call_action();
		} else if (i_bit(MP_CALL_PM, my_word)) {
			DBGLOG(cpu_handle, my_cpu, MP_CALL_PM);
			i_bit_clear(MP_CALL_PM, my_word);
			mp_call_PM();
		}
		if (regs == NULL) {
			/* Called to poll only for cross-calls and TLB flush */
			break;
		} else if (i_bit(MP_AST, my_word)) {
			DBGLOG(cpu_handle, my_cpu, MP_AST);
			i_bit_clear(MP_AST, my_word);
			ast_check(cpu_to_processor(my_cpu));
		}
	} while (*my_word); /* repeat until no signals remain posted */

	return 0;
}
561
/*
 * KDP cross-cpu callback invoked when PTE corruption has been detected:
 * formats a panic string identifying the corrupt PTE pointer and value,
 * then backtraces/panics from the current frame.
 */
long
NMI_pte_corruption_callback(__unused void *arg0, __unused void *arg1, uint16_t lcpu)
{
	static char pstr[256]; /* global since this callback is serialized */
	void *stackptr;
	/* Capture the current frame pointer for the backtrace */
	__asm__ volatile ("movq %%rbp, %0" : "=m" (stackptr));

	snprintf(&pstr[0], sizeof(pstr),
	    "Panic(CPU %d): PTE corruption detected on PTEP 0x%llx VAL 0x%llx\n",
	    lcpu, (unsigned long long)(uintptr_t)PTE_corrupted_ptr, *(uint64_t *)PTE_corrupted_ptr);
	panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, current_cpu_datap()->cpu_int_state);
	return 0;
}
575
576 extern void kprintf_break_lock(void);
/*
 * Handler for NMIs delivered via the local APIC.  NMIPIs are used to
 * collect panic backtraces from unresponsive processors (spinlock,
 * TLB-flush, cross-call and interrupt-watchdog timeouts) and to gather
 * the other cpus when entering the kernel debugger.
 */
int
NMIInterruptHandler(x86_saved_state_t *regs)
{
	void		*stackptr;
	char		pstr[256];
	uint64_t	now = mach_absolute_time();

	if (panic_active() && !panicDebugging) {
		/* Another cpu is panicking and debugging is disabled:
		 * park this cpu forever, in power-safe mode if needed. */
		if (pmsafe_debug) {
			pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
		}
		for (;;) {
			cpu_pause();
		}
	}

	atomic_incl(&NMIPI_acks, 1);
	atomic_incl(&NMI_count, 1);
	sync_iss_to_iks_unconditionally(regs);
	/* Capture the frame pointer for possible backtracing below */
	__asm__ volatile ("movq %%rbp, %0" : "=m" (stackptr));

	if (cpu_number() == debugger_cpu) {
		/* The cpu driving the debugger must not wait on itself */
		goto NMExit;
	}

	/* If a panic reason was posted by the NMI sender, build the
	 * appropriate panic string and backtrace this cpu. */
	if (NMI_panic_reason == SPINLOCK_TIMEOUT) {
		lck_spinlock_to_info_t lsti;

		lsti = os_atomic_load(&lck_spinlock_timeout_in_progress, acquire);
		snprintf(&pstr[0], sizeof(pstr),
		    "Panic(CPU %d, time %llu): NMIPI for spinlock acquisition timeout, spinlock: %p, "
		    "spinlock owner: %p, current_thread: %p, spinlock_owner_cpu: 0x%x\n",
		    cpu_number(), now, lsti->lock, (void *)lsti->owner_thread_cur,
		    current_thread(), lsti->owner_cpu);
		panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs);
	} else if (NMI_panic_reason == TLB_FLUSH_TIMEOUT) {
		snprintf(&pstr[0], sizeof(pstr),
		    "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: TLB flush timeout, TLB state:0x%x\n",
		    cpu_number(), now, current_cpu_datap()->cpu_tlb_invalid);
		panic_i386_backtrace(stackptr, 48, &pstr[0], TRUE, regs);
	} else if (NMI_panic_reason == CROSSCALL_TIMEOUT) {
		snprintf(&pstr[0], sizeof(pstr),
		    "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: cross-call timeout\n",
		    cpu_number(), now);
		panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs);
	} else if (NMI_panic_reason == INTERRUPT_WATCHDOG) {
		snprintf(&pstr[0], sizeof(pstr),
		    "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: interrupt watchdog for vector 0x%x\n",
		    cpu_number(), now, vector_timed_out);
		panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs);
	}

#if MACH_KDP
	if (pmsafe_debug && !kdp_snapshot) {
		pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
	}
	/* Acknowledge the NMI and drop any pending KDP signal so the
	 * debugger doesn't process it a second time. */
	current_cpu_datap()->cpu_NMI_acknowledged = TRUE;
	i_bit_clear(MP_KDP, &current_cpu_datap()->cpu_signals);
	if (panic_active() || NMI_panic_reason != NONE) {
		mp_kdp_wait(FALSE, TRUE);
	} else if (!mp_kdp_trap &&
	    !mp_kdp_is_NMI &&
	    virtualized && (debug_boot_arg & DB_NMI)) {
		/*
		 * Under a VMM with the debug boot-arg set, drop into kdp.
		 * Since an NMI is involved, there's a risk of contending with
		 * a panic. And side-effects of NMIs may result in entry into,
		 * and continuing from, the debugger being unreliable.
		 */
		if (__sync_bool_compare_and_swap(&mp_kdp_is_NMI, FALSE, TRUE)) {
			kprintf_break_lock();

			DebuggerWithContext(EXC_BREAKPOINT, saved_state64(regs),
			    "requested by NMI", DEBUGGER_OPTION_NONE,
			    (unsigned long)(char *)__builtin_return_address(0));

			mp_kdp_is_NMI = FALSE;
		} else {
			mp_kdp_wait(FALSE, FALSE);
		}
	} else {
		mp_kdp_wait(FALSE, FALSE);
	}
	if (pmsafe_debug && !kdp_snapshot) {
		pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
	}
#endif
NMExit:
	return 1;
}
667
668 /*
669 * cpu_interrupt is really just to be used by the scheduler to
670 * get a CPU's attention it may not always issue an IPI. If an
671 * IPI is always needed then use i386_cpu_IPI.
672 */
673 void
cpu_interrupt(int cpu)674 cpu_interrupt(int cpu)
675 {
676 boolean_t did_IPI = FALSE;
677
678 if (smp_initialized
679 && pmCPUExitIdle(cpu_datap(cpu))) {
680 i386_cpu_IPI(cpu);
681 did_IPI = TRUE;
682 }
683
684 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), cpu, did_IPI, 0, 0, 0);
685 }
686
687 /*
688 * Send a true NMI via the local APIC to the specified CPU.
689 */
690 void
cpu_NMI_interrupt(int cpu)691 cpu_NMI_interrupt(int cpu)
692 {
693 if (smp_initialized) {
694 i386_send_NMI(cpu);
695 }
696 }
697
/*
 * Send an NMI to each running cpu in turn and spin (with a TSC-based
 * timeout of ~10G cycles per cpu) until each acknowledges.  Runs with
 * interrupts disabled and NMIPIs enabled for the duration.
 */
void
NMI_cpus(void)
{
	unsigned int	cpu;
	boolean_t	intrs_enabled;
	uint64_t	tsc_timeout;

	intrs_enabled = ml_set_interrupts_enabled(FALSE);
	NMIPI_enable(TRUE);
	for (cpu = 0; cpu < real_ncpus; cpu++) {
		if (!cpu_is_running(cpu)) {
			continue;
		}
		cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
		cpu_NMI_interrupt(cpu);
		/* No timeout if machine timeouts are suspended */
		tsc_timeout = !machine_timeout_suspended() ?
		    rdtsc64() + (1000 * 1000 * 1000 * 10ULL) :
		    ~0ULL;
		while (!cpu_datap(cpu)->cpu_NMI_acknowledged) {
			/* Service TLB-flush requests while we wait, since
			 * interrupts are masked on this cpu. */
			handle_pending_TLB_flushes();
			cpu_pause();
			if (rdtsc64() > tsc_timeout) {
				panic("NMI_cpus() timeout cpu %d", cpu);
			}
		}
		cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
	}
	NMIPI_enable(FALSE);

	ml_set_interrupts_enabled(intrs_enabled);
}
729
730 static void(*volatile mp_PM_func)(void) = NULL;
731
732 static void
mp_call_PM(void)733 mp_call_PM(void)
734 {
735 assert(!ml_get_interrupts_enabled());
736
737 if (mp_PM_func != NULL) {
738 mp_PM_func();
739 }
740 }
741
742 void
cpu_PM_interrupt(int cpu)743 cpu_PM_interrupt(int cpu)
744 {
745 assert(!ml_get_interrupts_enabled());
746
747 if (mp_PM_func != NULL) {
748 if (cpu == cpu_number()) {
749 mp_PM_func();
750 } else {
751 i386_signal_cpu(cpu, MP_CALL_PM, ASYNC);
752 }
753 }
754 }
755
/*
 * Register the power-management callout invoked in response to
 * MP_CALL_PM signals (see mp_call_PM() and cpu_PM_interrupt()).
 */
void
PM_interrupt_register(void (*fn)(void))
{
	mp_PM_func = fn;
}
761
/*
 * Post an event bit in the target cpu's signal word and raise an IPI.
 * In SYNC mode, spin until the target consumes the event, re-arming a
 * ~1e9-cycle TSC timeout window each pass (logging on each timeout and
 * retrying indefinitely).  No-op if the target cpu is not running.
 */
void
i386_signal_cpu(int cpu, mp_event_t event, mp_sync_t mode)
{
	volatile int	*signals = &cpu_datap(cpu)->cpu_signals;
	uint64_t	tsc_timeout;


	if (!cpu_datap(cpu)->cpu_running) {
		return;
	}

	if (event == MP_TLB_FLUSH) {
		KERNEL_DEBUG(TRACE_MP_TLB_FLUSH | DBG_FUNC_START, cpu, 0, 0, 0, 0);
	}

	DBGLOG(cpu_signal, cpu, event);

	i_bit_set(event, signals);
	i386_cpu_IPI(cpu);
	if (mode == SYNC) {
again:
		/* No timeout if machine timeouts are suspended */
		tsc_timeout = !machine_timeout_suspended() ?
		    rdtsc64() + (1000 * 1000 * 1000) :
		    ~0ULL;
		while (i_bit(event, signals) && rdtsc64() < tsc_timeout) {
			cpu_pause();
		}
		if (i_bit(event, signals)) {
			DBG("i386_signal_cpu(%d, 0x%x, SYNC) timed out\n",
			    cpu, event);
			goto again;
		}
	}
	if (event == MP_TLB_FLUSH) {
		KERNEL_DEBUG(TRACE_MP_TLB_FLUSH | DBG_FUNC_END, cpu, 0, 0, 0, 0);
	}
}
799
800 /*
801 * Helper function called when busy-waiting: panic if too long
802 * a TSC-based time has elapsed since the start of the spin.
803 */
804 static boolean_t
mp_spin_timeout(uint64_t tsc_start)805 mp_spin_timeout(uint64_t tsc_start)
806 {
807 uint64_t tsc_timeout;
808
809 cpu_pause();
810 if (machine_timeout_suspended()) {
811 return FALSE;
812 }
813
814 /*
815 * The timeout is 4 * the spinlock timeout period
816 * unless we have serial console printing (kprintf) enabled
817 * in which case we allow an even greater margin.
818 */
819 tsc_timeout = disable_serial_output ? LockTimeOutTSC << 2
820 : LockTimeOutTSC << 4;
821 return rdtsc64() > tsc_start + tsc_timeout;
822 }
823
/*
 * Helper function to take a spinlock while ensuring that incoming IPIs
 * are still serviced if interrupts are masked while we spin.
 * Returns current interrupt state.
 */
boolean_t
mp_safe_spin_lock(usimple_lock_t lock)
{
	if (ml_get_interrupts_enabled()) {
		/* Interrupts enabled: IPIs are serviced normally, so a
		 * plain spin acquisition is safe. */
		simple_lock(lock, LCK_GRP_NULL);
		return TRUE;
	}

	lck_spinlock_to_info_t lsti;
	uint64_t tsc_spin_start = rdtsc64();

	while (!simple_lock_try(lock, LCK_GRP_NULL)) {
		/* Poll for cross-calls and TLB flushes while spinning */
		cpu_signal_handler(NULL);
		if (mp_spin_timeout(tsc_spin_start)) {
			uintptr_t lowner = (uintptr_t)lock->interlock.lock_data;

			/* Record the timeout, NMI the apparent owner so it
			 * backtraces, then panic this cpu. */
			lsti = lck_spinlock_timeout_hit(lock, lowner);
			NMIPI_panic(cpu_to_cpumask(lsti->owner_cpu), SPINLOCK_TIMEOUT);
			panic("mp_safe_spin_lock() timed out, lock: %p, "
			    "owner thread: 0x%lx, current_thread: %p, "
			    "owner on CPU 0x%x, time: %llu",
			    lock, lowner, current_thread(),
			    lsti->owner_cpu, mach_absolute_time());
		}
	}

	return FALSE;
}
857
858 /*
859 * All-CPU rendezvous:
860 * - CPUs are signalled,
861 * - all execute the setup function (if specified),
862 * - rendezvous (i.e. all cpus reach a barrier),
863 * - all execute the action function (if specified),
864 * - rendezvous again,
865 * - execute the teardown function (if specified), and then
866 * - resume.
867 *
868 * Note that the supplied external functions _must_ be reentrant and aware
869 * that they are running in parallel and in an unknown lock context.
870 */
871
/*
 * Per-cpu body of the rendezvous protocol: run setup, spin at the entry
 * barrier until all mp_rv_ncpus participants arrive, run the action,
 * spin at the exit barrier, run teardown, and finally bump the
 * completion count that mp_rendezvous() waits on.
 */
static void
mp_rendezvous_action(__unused void *null)
{
	boolean_t	intrs_enabled;
	uint64_t	tsc_spin_start;

	/*
	 * Note that mp_rv_lock was acquired by the thread that initiated the
	 * rendezvous and must have been acquired before we enter
	 * mp_rendezvous_action().
	 */
	current_cpu_datap()->cpu_rendezvous_in_progress = TRUE;

	/* setup function */
	if (mp_rv_setup_func != NULL) {
		mp_rv_setup_func(mp_rv_func_arg);
	}

	intrs_enabled = ml_get_interrupts_enabled();

	/* spin on entry rendezvous */
	atomic_incl(&mp_rv_entry, 1);
	tsc_spin_start = rdtsc64();

	while (mp_rv_entry < mp_rv_ncpus) {
		/* poll for pesky tlb flushes if interrupts disabled */
		if (!intrs_enabled) {
			handle_pending_TLB_flushes();
		}
		if (mp_spin_timeout(tsc_spin_start)) {
			panic("mp_rv_action() entry: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_entry, mp_rv_ncpus, tsc_spin_start, rdtsc64());
		}
	}

	/* action function */
	if (mp_rv_action_func != NULL) {
		mp_rv_action_func(mp_rv_func_arg);
	}

	/* spin on exit rendezvous */
	atomic_incl(&mp_rv_exit, 1);
	tsc_spin_start = rdtsc64();
	while (mp_rv_exit < mp_rv_ncpus) {
		if (!intrs_enabled) {
			handle_pending_TLB_flushes();
		}
		if (mp_spin_timeout(tsc_spin_start)) {
			panic("mp_rv_action() exit: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_exit, mp_rv_ncpus, tsc_spin_start, rdtsc64());
		}
	}

	/* teardown function */
	if (mp_rv_teardown_func != NULL) {
		mp_rv_teardown_func(mp_rv_func_arg);
	}

	current_cpu_datap()->cpu_rendezvous_in_progress = FALSE;

	/* Bump completion count */
	atomic_incl(&mp_rv_complete, 1);
}
933
/*
 * Initiate an all-cpu rendezvous (see the protocol description above):
 * every running cpu executes setup_func, action_func and teardown_func
 * in lock-step, separated by barriers.  Before SMP is initialized the
 * three functions are simply run locally.  Serialized by mp_rv_lock.
 */
void
mp_rendezvous(void (*setup_func)(void *),
    void (*action_func)(void *),
    void (*teardown_func)(void *),
    void *arg)
{
	uint64_t	tsc_spin_start;

	if (!smp_initialized) {
		/* Uniprocessor (or pre-SMP): run everything locally */
		if (setup_func != NULL) {
			setup_func(arg);
		}
		if (action_func != NULL) {
			action_func(arg);
		}
		if (teardown_func != NULL) {
			teardown_func(arg);
		}
		return;
	}

	/* obtain rendezvous lock */
	mp_rendezvous_lock();

	/* set static function pointers */
	mp_rv_setup_func = setup_func;
	mp_rv_action_func = action_func;
	mp_rv_teardown_func = teardown_func;
	mp_rv_func_arg = arg;

	mp_rv_entry = 0;
	mp_rv_exit = 0;
	mp_rv_complete = 0;

	/*
	 * signal other processors, which will call mp_rendezvous_action()
	 * with interrupts disabled
	 */
	mp_rv_ncpus = mp_cpus_call(CPUMASK_OTHERS, NOSYNC, &mp_rendezvous_action, NULL) + 1;

	/* call executor function on this cpu */
	mp_rendezvous_action(NULL);

	/*
	 * Spin for everyone to complete.
	 * This is necessary to ensure that all processors have proceeded
	 * from the exit barrier before we release the rendezvous structure.
	 */
	tsc_spin_start = rdtsc64();
	while (mp_rv_complete < mp_rv_ncpus) {
		if (mp_spin_timeout(tsc_spin_start)) {
			panic("mp_rendezvous() timeout: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_complete, mp_rv_ncpus, tsc_spin_start, rdtsc64());
		}
	}

	/* Tidy up */
	mp_rv_setup_func = NULL;
	mp_rv_action_func = NULL;
	mp_rv_teardown_func = NULL;
	mp_rv_func_arg = NULL;

	/* release lock */
	mp_rendezvous_unlock();
}
998
/* Acquire the rendezvous lock, servicing IPIs while spinning. */
void
mp_rendezvous_lock(void)
{
	(void) mp_safe_spin_lock(&mp_rv_lock);
}
1004
/* Release the rendezvous lock. */
void
mp_rendezvous_unlock(void)
{
	simple_unlock(&mp_rv_lock);
}
1010
/*
 * Forcibly re-initialize the rendezvous lock — used to break a possibly
 * held lock (e.g. when entering the debugger/panic path).
 */
void
mp_rendezvous_break_lock(void)
{
	simple_lock_init(&mp_rv_lock, 0);
}
1016
1017 static void
setup_disable_intrs(__unused void * param_not_used)1018 setup_disable_intrs(__unused void * param_not_used)
1019 {
1020 /* disable interrupts before the first barrier */
1021 boolean_t intr = ml_set_interrupts_enabled(FALSE);
1022
1023 current_cpu_datap()->cpu_iflag = intr;
1024 DBG("CPU%d: %s\n", get_cpu_number(), __FUNCTION__);
1025 }
1026
1027 static void
teardown_restore_intrs(__unused void * param_not_used)1028 teardown_restore_intrs(__unused void * param_not_used)
1029 {
1030 /* restore interrupt flag following MTRR changes */
1031 ml_set_interrupts_enabled(current_cpu_datap()->cpu_iflag);
1032 DBG("CPU%d: %s\n", get_cpu_number(), __FUNCTION__);
1033 }
1034
/*
 * A wrapper to mp_rendezvous() to call action_func() with interrupts disabled.
 * This is exported for use by kexts.
 */
void
mp_rendezvous_no_intrs(
	void (*action_func)(void *),
	void *arg)
{
	mp_rendezvous(setup_disable_intrs,
	    action_func,
	    teardown_restore_intrs,
	    arg);
}
1049
1050
/* A single cross-call request, queued on a target cpu's call queue. */
typedef struct {
	queue_chain_t	link;			/* queue linkage */
	void		(*func)(void *, void *);	/* routine to call */
	void		*arg0;			/* routine's 1st arg */
	void		*arg1;			/* routine's 2nd arg */
	cpumask_t	*maskp;			/* completion response mask */
} mp_call_t;


/* A simple-lock-protected queue of mp_call_t buffers */
typedef struct {
	queue_head_t		queue;
	decl_simple_lock_data(, lock);
} mp_call_queue_t;
#define MP_CPUS_CALL_BUFS_PER_CPU	MAX_CPUS
/* Free pool of call buffers, plus one incoming queue per cpu */
static mp_call_queue_t	mp_cpus_call_freelist;
static mp_call_queue_t	mp_cpus_call_head[MAX_CPUS];
1067
/*
 * Lock a call queue with interrupts disabled.
 * Returns the previous interrupt state, to be passed back to
 * mp_call_head_unlock().
 */
static inline boolean_t
mp_call_head_lock(mp_call_queue_t *cqp)
{
	boolean_t intrs_enabled;

	intrs_enabled = ml_set_interrupts_enabled(FALSE);
	simple_lock(&cqp->lock, LCK_GRP_NULL);

	return intrs_enabled;
}
1078
1079 /*
1080 * Deliver an NMIPI to a set of processors to cause them to panic .
1081 */
1082 void
NMIPI_panic(cpumask_t cpu_mask,NMI_reason_t why)1083 NMIPI_panic(cpumask_t cpu_mask, NMI_reason_t why)
1084 {
1085 unsigned int cpu;
1086 cpumask_t cpu_bit;
1087 uint64_t deadline;
1088
1089 NMIPI_enable(TRUE);
1090 NMI_panic_reason = why;
1091
1092 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
1093 if ((cpu_mask & cpu_bit) == 0) {
1094 continue;
1095 }
1096 cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
1097 cpu_NMI_interrupt(cpu);
1098 }
1099
1100 /* Wait (only so long) for NMi'ed cpus to respond */
1101 deadline = mach_absolute_time() + LockTimeOut;
1102 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
1103 if ((cpu_mask & cpu_bit) == 0) {
1104 continue;
1105 }
1106 while (!cpu_datap(cpu)->cpu_NMI_acknowledged &&
1107 mach_absolute_time() < deadline) {
1108 cpu_pause();
1109 }
1110 }
1111 }
1112
#if MACH_ASSERT
/*
 * Assertion helper: TRUE iff interrupts are disabled and the call queue
 * lock is held (by some cpu -- not necessarily this one).
 */
static inline boolean_t
mp_call_head_is_locked(mp_call_queue_t *cqp)
{
	return !ml_get_interrupts_enabled() &&
	       hw_lock_held((hw_lock_t)&cqp->lock);
}
#endif
1121
/*
 * Unlock a call queue and restore the interrupt state previously
 * returned by mp_call_head_lock().
 */
static inline void
mp_call_head_unlock(mp_call_queue_t *cqp, boolean_t intrs_enabled)
{
	simple_unlock(&cqp->lock);
	ml_set_interrupts_enabled(intrs_enabled);
}
1128
/*
 * Pop a call buffer from the global free list.
 * Returns NULL if the free list is empty.
 */
static inline mp_call_t *
mp_call_alloc(void)
{
	mp_call_t *callp = NULL;
	boolean_t intrs_enabled;
	mp_call_queue_t *cqp = &mp_cpus_call_freelist;

	intrs_enabled = mp_call_head_lock(cqp);
	if (!queue_empty(&cqp->queue)) {
		queue_remove_first(&cqp->queue, callp, typeof(callp), link);
	}
	mp_call_head_unlock(cqp, intrs_enabled);

	return callp;
}
1144
/* Return a call buffer to the global free list. */
static inline void
mp_call_free(mp_call_t *callp)
{
	boolean_t intrs_enabled;
	mp_call_queue_t *cqp = &mp_cpus_call_freelist;

	intrs_enabled = mp_call_head_lock(cqp);
	queue_enter_first(&cqp->queue, callp, typeof(callp), link);
	mp_call_head_unlock(cqp, intrs_enabled);
}
1155
/*
 * Dequeue the next pending call from a queue whose lock the caller
 * already holds (asserted).  Returns NULL if the queue is empty.
 */
static inline mp_call_t *
mp_call_dequeue_locked(mp_call_queue_t *cqp)
{
	mp_call_t *callp = NULL;

	assert(mp_call_head_is_locked(cqp));
	if (!queue_empty(&cqp->queue)) {
		queue_remove_first(&cqp->queue, callp, typeof(callp), link);
	}
	return callp;
}
1167
/* Append a call to a queue whose lock the caller already holds. */
static inline void
mp_call_enqueue_locked(
	mp_call_queue_t *cqp,
	mp_call_t *callp)
{
	queue_enter(&cqp->queue, callp, typeof(callp), link);
}
1175
/* Called on the boot processor to initialize global structures */
static void
mp_cpus_call_init(void)
{
	mp_call_queue_t *cqp = &mp_cpus_call_freelist;

	DBG("mp_cpus_call_init()\n");
	/* The freelist starts empty; buffers are added per-cpu at registration. */
	simple_lock_init(&cqp->lock, 0);
	queue_init(&cqp->queue);
}
1186
1187 /*
1188 * Called at processor registration to add call buffers to the free list
1189 * and to initialize the per-cpu call queue.
1190 */
1191 void
mp_cpus_call_cpu_init(int cpu)1192 mp_cpus_call_cpu_init(int cpu)
1193 {
1194 int i;
1195 mp_call_queue_t *cqp = &mp_cpus_call_head[cpu];
1196 mp_call_t *callp;
1197
1198 simple_lock_init(&cqp->lock, 0);
1199 queue_init(&cqp->queue);
1200 for (i = 0; i < MP_CPUS_CALL_BUFS_PER_CPU; i++) {
1201 callp = zalloc_permanent_type(mp_call_t);
1202 mp_call_free(callp);
1203 }
1204
1205 DBG("mp_cpus_call_init(%d) done\n", cpu);
1206 }
1207
1208 /*
1209 * This is called from cpu_signal_handler() to process an MP_CALL signal.
1210 * And also from i386_deactivate_cpu() when a cpu is being taken offline.
1211 */
1212 static void
mp_cpus_call_action(void)1213 mp_cpus_call_action(void)
1214 {
1215 mp_call_queue_t *cqp;
1216 boolean_t intrs_enabled;
1217 mp_call_t *callp;
1218 mp_call_t call;
1219
1220 assert(!ml_get_interrupts_enabled());
1221 cqp = &mp_cpus_call_head[cpu_number()];
1222 intrs_enabled = mp_call_head_lock(cqp);
1223 while ((callp = mp_call_dequeue_locked(cqp)) != NULL) {
1224 /* Copy call request to the stack to free buffer */
1225 call = *callp;
1226 mp_call_free(callp);
1227 if (call.func != NULL) {
1228 mp_call_head_unlock(cqp, intrs_enabled);
1229 KERNEL_DEBUG_CONSTANT(
1230 TRACE_MP_CPUS_CALL_ACTION,
1231 VM_KERNEL_UNSLIDE(call.func), VM_KERNEL_UNSLIDE_OR_PERM(call.arg0),
1232 VM_KERNEL_UNSLIDE_OR_PERM(call.arg1), VM_KERNEL_ADDRPERM(call.maskp), 0);
1233 call.func(call.arg0, call.arg1);
1234 (void) mp_call_head_lock(cqp);
1235 }
1236 if (call.maskp != NULL) {
1237 i_bit_set(cpu_number(), call.maskp);
1238 }
1239 }
1240 mp_call_head_unlock(cqp, intrs_enabled);
1241 }
1242
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-function-type"

/*
 * mp_cpus_call() runs a given function on cpus specified in a given cpu mask.
 * Possible modes are:
 * SYNC: function is called serially on target cpus in logical cpu order
 * waiting for each call to be acknowledged before proceeding
 * ASYNC: function call is queued to the specified cpus
 * waiting for all calls to complete in parallel before returning
 * NOSYNC: function calls are queued
 * but we return before confirmation of calls completing.
 * The action function may be NULL.
 * The cpu mask may include the local cpu. Offline cpus are ignored.
 * The return value is the number of cpus on which the call was made or queued.
 *
 * This is a thin wrapper over mp_cpus_call1(): the single-argument
 * action_func is cast to the two-argument form (hence the pragma above).
 */
cpu_t
mp_cpus_call(
	cpumask_t cpus,
	mp_sync_t mode,
	void (*action_func)(void *),
	void *arg)
{
	return mp_cpus_call1(
		cpus,
		mode,
		(void (*)(void *, void *))action_func,
		arg,
		NULL,
		NULL);
}

#pragma clang diagnostic pop
1276
/*
 * Spin until every cpu in cpus_called has acknowledged in *cpus_responded.
 * While spinning with interrupts disabled, service our own inbound
 * cross-calls and signals to avoid deadlock.  If the spin exceeds the
 * timeout, NMI the unresponsive cpus and panic.
 */
static void
mp_cpus_call_wait(boolean_t intrs_enabled,
    cpumask_t cpus_called,
    cpumask_t *cpus_responded)
{
	mp_call_queue_t *cqp;
	uint64_t tsc_spin_start;

	/* We must not be preemptible while waiting on per-cpu state. */
	assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
	cqp = &mp_cpus_call_head[cpu_number()];

	tsc_spin_start = rdtsc64();
	while (*cpus_responded != cpus_called) {
		if (!intrs_enabled) {
			/* Sniffing w/o locking */
			if (!queue_empty(&cqp->queue)) {
				mp_cpus_call_action();
			}
			cpu_signal_handler(NULL);
		}
		if (mp_spin_timeout(tsc_spin_start)) {
			cpumask_t cpus_unresponsive;

			cpus_unresponsive = cpus_called & ~(*cpus_responded);
			NMIPI_panic(cpus_unresponsive, CROSSCALL_TIMEOUT);
			panic("mp_cpus_call_wait() timeout, cpus: 0x%llx",
			    cpus_unresponsive);
		}
	}
}
1307
/*
 * Full-featured cross-call: run action_func(arg0, arg1) on the cpus in
 * 'cpus' with the SYNC/ASYNC/NOSYNC semantics described above
 * mp_cpus_call().  If cpus_calledp is non-NULL it receives the mask of
 * cpus actually called.  Returns the number of cpus called or queued.
 */
cpu_t
mp_cpus_call1(
	cpumask_t cpus,
	mp_sync_t mode,
	void (*action_func)(void *, void *),
	void *arg0,
	void *arg1,
	cpumask_t *cpus_calledp)
{
	cpu_t cpu = 0;
	boolean_t intrs_enabled = FALSE;
	boolean_t call_self = FALSE;
	cpumask_t cpus_called = 0;
	cpumask_t cpus_responded = 0;
	long cpus_call_count = 0;
	uint64_t tsc_spin_start;
	boolean_t topo_lock;

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPUS_CALL | DBG_FUNC_START,
		cpus, mode, VM_KERNEL_UNSLIDE(action_func), VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1));

	/* Before SMP is up, only a call to ourself is possible. */
	if (!smp_initialized) {
		if ((cpus & CPUMASK_SELF) == 0) {
			goto out;
		}
		if (action_func != NULL) {
			intrs_enabled = ml_set_interrupts_enabled(FALSE);
			action_func(arg0, arg1);
			ml_set_interrupts_enabled(intrs_enabled);
		}
		call_self = TRUE;
		goto out;
	}

	/*
	 * Queue the call for each non-local requested cpu.
	 * This is performed under the topo lock to prevent changes to
	 * cpus online state and to prevent concurrent rendezvouses --
	 * although an exception is made if we're calling only the master
	 * processor since that always remains active. Note: this exception
	 * is expected for longterm timer nosync cross-calls to the master cpu.
	 */
	mp_disable_preemption();
	intrs_enabled = ml_get_interrupts_enabled();
	topo_lock = (cpus != cpu_to_cpumask(master_cpu));
	if (topo_lock) {
		ml_set_interrupts_enabled(FALSE);
		(void) mp_safe_spin_lock(&x86_topo_lock);
	}
	for (cpu = 0; cpu < (cpu_t) real_ncpus; cpu++) {
		/* Skip cpus not requested or not online. */
		if (((cpu_to_cpumask(cpu) & cpus) == 0) ||
		    !cpu_is_running(cpu)) {
			continue;
		}
		tsc_spin_start = rdtsc64();
		if (cpu == (cpu_t) cpu_number()) {
			/*
			 * We don't IPI ourself and if calling asynchronously,
			 * we defer our call until we have signalled all others.
			 */
			call_self = TRUE;
			if (mode == SYNC && action_func != NULL) {
				KERNEL_DEBUG_CONSTANT(
					TRACE_MP_CPUS_CALL_LOCAL,
					VM_KERNEL_UNSLIDE(action_func),
					VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1), 0, 0);
				action_func(arg0, arg1);
			}
		} else {
			/*
			 * Here to queue a call to cpu and IPI.
			 */
			mp_call_t *callp = NULL;
			mp_call_queue_t *cqp = &mp_cpus_call_head[cpu];
			boolean_t intrs_inner;

queue_call:
			if (callp == NULL) {
				callp = mp_call_alloc();
			}
			intrs_inner = mp_call_head_lock(cqp);
			if (callp == NULL) {
				/*
				 * No free call buffer: drop the lock, service
				 * our own pending work, and retry until the
				 * spin timeout fires.
				 */
				mp_call_head_unlock(cqp, intrs_inner);
				KERNEL_DEBUG_CONSTANT(
					TRACE_MP_CPUS_CALL_NOBUF,
					cpu, 0, 0, 0, 0);
				if (!intrs_inner) {
					/* Sniffing w/o locking */
					if (!queue_empty(&cqp->queue)) {
						mp_cpus_call_action();
					}
					handle_pending_TLB_flushes();
				}
				if (mp_spin_timeout(tsc_spin_start)) {
					panic("mp_cpus_call1() timeout start: 0x%llx, cur: 0x%llx",
					    tsc_spin_start, rdtsc64());
				}
				goto queue_call;
			}
			/* NOSYNC callers don't wait, so no response mask. */
			callp->maskp = (mode == NOSYNC) ? NULL : &cpus_responded;
			callp->func = action_func;
			callp->arg0 = arg0;
			callp->arg1 = arg1;
			mp_call_enqueue_locked(cqp, callp);
			cpus_call_count++;
			cpus_called |= cpu_to_cpumask(cpu);
			i386_signal_cpu(cpu, MP_CALL, ASYNC);
			mp_call_head_unlock(cqp, intrs_inner);
			if (mode == SYNC) {
				mp_cpus_call_wait(intrs_inner, cpus_called, &cpus_responded);
			}
		}
	}
	if (topo_lock) {
		simple_unlock(&x86_topo_lock);
		ml_set_interrupts_enabled(intrs_enabled);
	}

	/* Call locally if mode not SYNC */
	if (mode != SYNC && call_self) {
		KERNEL_DEBUG_CONSTANT(
			TRACE_MP_CPUS_CALL_LOCAL,
			VM_KERNEL_UNSLIDE(action_func), VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1), 0, 0);
		if (action_func != NULL) {
			ml_set_interrupts_enabled(FALSE);
			action_func(arg0, arg1);
			ml_set_interrupts_enabled(intrs_enabled);
		}
	}

	/* For ASYNC, now wait for all signaled cpus to complete their calls */
	if (mode == ASYNC) {
		mp_cpus_call_wait(intrs_enabled, cpus_called, &cpus_responded);
	}

	/* Safe to allow pre-emption now */
	mp_enable_preemption();

out:
	if (call_self) {
		cpus_called |= cpu_to_cpumask(cpu);
		cpus_call_count++;
	}

	if (cpus_calledp) {
		*cpus_calledp = cpus_called;
	}

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPUS_CALL | DBG_FUNC_END,
		cpus_call_count, cpus_called, 0, 0, 0);

	return (cpu_t) cpus_call_count;
}
1463
1464
/*
 * Per-cpu handler for mp_broadcast(): invoke the staged broadcast
 * function, then wake the instigator once all cpus have finished.
 */
static void
mp_broadcast_action(__unused void *null)
{
	/* call action function */
	if (mp_bc_action_func != NULL) {
		mp_bc_action_func(mp_bc_func_arg);
	}

	/* if we're the last one through, wake up the instigator */
	if (atomic_decl_and_test(&mp_bc_count, 1)) {
		thread_wakeup(((event_t)(uintptr_t) &mp_bc_count));
	}
}
1478
1479 /*
1480 * mp_broadcast() runs a given function on all active cpus.
1481 * The caller blocks until the functions has run on all cpus.
1482 * The caller will also block if there is another pending broadcast.
1483 */
1484 void
mp_broadcast(void (* action_func)(void *),void * arg)1485 mp_broadcast(
1486 void (*action_func)(void *),
1487 void *arg)
1488 {
1489 if (!smp_initialized) {
1490 if (action_func != NULL) {
1491 action_func(arg);
1492 }
1493 return;
1494 }
1495
1496 /* obtain broadcast lock */
1497 lck_mtx_lock(&mp_bc_lock);
1498
1499 /* set static function pointers */
1500 mp_bc_action_func = action_func;
1501 mp_bc_func_arg = arg;
1502
1503 assert_wait((event_t)(uintptr_t)&mp_bc_count, THREAD_UNINT);
1504
1505 /*
1506 * signal other processors, which will call mp_broadcast_action()
1507 */
1508 mp_bc_count = real_ncpus; /* assume max possible active */
1509 mp_bc_ncpus = mp_cpus_call(CPUMASK_ALL, NOSYNC, *mp_broadcast_action, NULL);
1510 atomic_decl(&mp_bc_count, real_ncpus - mp_bc_ncpus); /* subtract inactive */
1511
1512 /* block for other cpus to have run action_func */
1513 if (mp_bc_ncpus > 1) {
1514 thread_block(THREAD_CONTINUE_NULL);
1515 } else {
1516 clear_wait(current_thread(), THREAD_AWAKENED);
1517 }
1518
1519 /* release lock */
1520 lck_mtx_unlock(&mp_bc_lock);
1521 }
1522
/*
 * Send a bare KICK IPI (no queued work) to each running cpu in 'cpus',
 * e.g. to nudge them out of a low-power or idle state.
 */
void
mp_cpus_kick(cpumask_t cpus)
{
	cpu_t cpu;
	boolean_t intrs_enabled = FALSE;

	intrs_enabled = ml_set_interrupts_enabled(FALSE);
	/* Hold the topo lock so cpu online state can't change underneath us. */
	mp_safe_spin_lock(&x86_topo_lock);

	for (cpu = 0; cpu < (cpu_t) real_ncpus; cpu++) {
		if (((cpu_to_cpumask(cpu) & cpus) == 0)
		    || !cpu_is_running(cpu)) {
			continue;
		}

		lapic_send_ipi(cpu, LAPIC_VECTOR(KICK));
	}

	simple_unlock(&x86_topo_lock);
	ml_set_interrupts_enabled(intrs_enabled);
}
1544
/*
 * Mark the current cpu as running (online).  Must be called with
 * interrupts disabled.  Under the topo lock, the cpu is flagged running,
 * startup is signalled, and the full TLB range is invalidated.
 */
void
i386_activate_cpu(void)
{
	cpu_data_t *cdp = current_cpu_datap();

	assert(!ml_get_interrupts_enabled());

	/* Pre-SMP there is no other cpu to race with; just set the flag. */
	if (!smp_initialized) {
		cdp->cpu_running = TRUE;
		return;
	}

	mp_safe_spin_lock(&x86_topo_lock);
	cdp->cpu_running = TRUE;
	started_cpu();
	pmap_tlbi_range(0, ~0ULL, true, 0);
	simple_unlock(&x86_topo_lock);
}
1563
/*
 * Take the current cpu offline: mark it not-running, migrate its timers
 * to the master cpu, shut down per-cpu counters/timers, and drain any
 * pending IPIs and timer deadlines.  Called with interrupts disabled.
 */
void
i386_deactivate_cpu(void)
{
	cpu_data_t *cdp = current_cpu_datap();

	assert(!ml_get_interrupts_enabled());

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_START,
		0, 0, 0, 0, 0);

	mp_safe_spin_lock(&x86_topo_lock);
	cdp->cpu_running = FALSE;
	simple_unlock(&x86_topo_lock);

	/*
	 * Move all of this cpu's timers to the master/boot cpu,
	 * and poke it in case there's a sooner deadline for it to schedule.
	 * We don't need to wait for it to ack the IPI.
	 */
	timer_queue_shutdown(master_cpu,
	    &cdp->rtclock_timer.queue,
	    &cpu_datap(master_cpu)->rtclock_timer.queue);

	mp_cpus_call(cpu_to_cpumask(master_cpu), NOSYNC, timer_queue_expire_local, NULL);

#if CONFIG_CPU_COUNTERS
	mt_cpu_down(cdp);
#endif /* CONFIG_CPU_COUNTERS */
#if KPERF
	kptimer_stop_curcpu();
#endif /* KPERF */

	/*
	 * Open an interrupt window
	 * and ensure any pending IPI or timer is serviced
	 */
	mp_disable_preemption();
	ml_set_interrupts_enabled(TRUE);

	while (cdp->cpu_signals && x86_lcpu()->rtcDeadline != EndOfAllTime) {
		cpu_pause();
	}
	/*
	 * Ensure there's no remaining timer deadline set
	 * - AICPM may have left one active.
	 */
	setPop(0);

	ml_set_interrupts_enabled(FALSE);
	mp_enable_preemption();

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_END,
		0, 0, 0, 0, 0);
}
1620
/* Non-zero: bracket debugger entry/exit with pmSafeMode() transitions. */
int pmsafe_debug = 1;

#if MACH_KDP
volatile boolean_t mp_kdp_trap = FALSE;         /* debugger entry in progress */
volatile boolean_t mp_kdp_is_NMI = FALSE;
volatile unsigned long mp_kdp_ncpus;            /* cpus checked into mp_kdp_wait() */
boolean_t mp_kdp_state;                         /* interrupt state saved at debugger entry */
bool mp_kdp_is_stackshot = false;               /* current entry is on behalf of a stackshot */
1629
1630 void
mp_kdp_enter(boolean_t proceed_on_failure,bool is_stackshot)1631 mp_kdp_enter(boolean_t proceed_on_failure, bool is_stackshot)
1632 {
1633 unsigned int cpu;
1634 unsigned int ncpus = 0;
1635 unsigned int my_cpu;
1636 uint64_t tsc_timeout;
1637
1638 DBG("mp_kdp_enter()\n");
1639
1640 /*
1641 * Here to enter the debugger.
1642 * In case of races, only one cpu is allowed to enter kdp after
1643 * stopping others.
1644 */
1645 mp_kdp_state = ml_set_interrupts_enabled(FALSE);
1646 my_cpu = cpu_number();
1647 mp_kdp_is_stackshot = is_stackshot;
1648
1649 if (my_cpu == (unsigned) debugger_cpu) {
1650 kprintf("\n\nRECURSIVE DEBUGGER ENTRY DETECTED\n\n");
1651 kdp_reset();
1652 return;
1653 }
1654
1655 uint64_t start_time = cpu_datap(my_cpu)->debugger_entry_time = mach_absolute_time();
1656 int locked = 0;
1657 while (!locked || mp_kdp_trap) {
1658 if (locked) {
1659 simple_unlock(&x86_topo_lock);
1660 }
1661 if (proceed_on_failure) {
1662 if (mach_absolute_time() - start_time > 500000000ll) {
1663 paniclog_append_noflush("mp_kdp_enter() can't get x86_topo_lock! Debugging anyway! #YOLO\n");
1664 break;
1665 }
1666 locked = simple_lock_try(&x86_topo_lock, LCK_GRP_NULL);
1667 if (!locked) {
1668 cpu_pause();
1669 }
1670 } else {
1671 mp_safe_spin_lock(&x86_topo_lock);
1672 locked = TRUE;
1673 }
1674
1675 if (locked && mp_kdp_trap) {
1676 simple_unlock(&x86_topo_lock);
1677 DBG("mp_kdp_enter() race lost\n");
1678 #if MACH_KDP
1679 mp_kdp_wait(TRUE, FALSE);
1680 #endif
1681 locked = FALSE;
1682 }
1683 }
1684
1685 if (pmsafe_debug && !kdp_snapshot) {
1686 pmSafeMode(¤t_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
1687 }
1688
1689 debugger_cpu = my_cpu;
1690 ncpus = 1;
1691 atomic_incl((volatile long *)&mp_kdp_ncpus, 1);
1692 mp_kdp_trap = TRUE;
1693 debugger_entry_time = cpu_datap(my_cpu)->debugger_entry_time;
1694
1695 /*
1696 * Deliver a nudge to other cpus, counting how many
1697 */
1698 DBG("mp_kdp_enter() signaling other processors\n");
1699 if (force_immediate_debugger_NMI == FALSE) {
1700 for (cpu = 0; cpu < real_ncpus; cpu++) {
1701 if (cpu == my_cpu || !cpu_is_running(cpu)) {
1702 continue;
1703 }
1704 ncpus++;
1705 i386_signal_cpu(cpu, MP_KDP, ASYNC);
1706 }
1707 /*
1708 * Wait other processors to synchronize
1709 */
1710 DBG("mp_kdp_enter() waiting for (%d) processors to suspend\n", ncpus);
1711
1712 /*
1713 * This timeout is rather arbitrary; we don't want to NMI
1714 * processors that are executing at potentially
1715 * "unsafe-to-interrupt" points such as the trampolines,
1716 * but neither do we want to lose state by waiting too long.
1717 */
1718 tsc_timeout = rdtsc64() + (LockTimeOutTSC);
1719
1720 while (mp_kdp_ncpus != ncpus && rdtsc64() < tsc_timeout) {
1721 /*
1722 * A TLB shootdown request may be pending--this would
1723 * result in the requesting processor waiting in
1724 * PMAP_UPDATE_TLBS() until this processor deals with it.
1725 * Process it, so it can now enter mp_kdp_wait()
1726 */
1727 handle_pending_TLB_flushes();
1728 cpu_pause();
1729 }
1730 /* If we've timed out, and some processor(s) are still unresponsive,
1731 * interrupt them with an NMI via the local APIC, iff a panic is
1732 * in progress.
1733 */
1734 if (panic_active()) {
1735 NMIPI_enable(TRUE);
1736 }
1737 if (mp_kdp_ncpus != ncpus) {
1738 unsigned int wait_cycles = 0;
1739 if (proceed_on_failure) {
1740 paniclog_append_noflush("mp_kdp_enter() timed-out on cpu %d, NMI-ing\n", my_cpu);
1741 } else {
1742 DBG("mp_kdp_enter() timed-out on cpu %d, NMI-ing\n", my_cpu);
1743 }
1744 for (cpu = 0; cpu < real_ncpus; cpu++) {
1745 if (cpu == my_cpu || !cpu_is_running(cpu)) {
1746 continue;
1747 }
1748 if (cpu_signal_pending(cpu, MP_KDP)) {
1749 cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
1750 cpu_NMI_interrupt(cpu);
1751 }
1752 }
1753 /* Wait again for the same timeout */
1754 tsc_timeout = rdtsc64() + (LockTimeOutTSC);
1755 while (mp_kdp_ncpus != ncpus && rdtsc64() < tsc_timeout) {
1756 handle_pending_TLB_flushes();
1757 cpu_pause();
1758 ++wait_cycles;
1759 }
1760 if (mp_kdp_ncpus != ncpus) {
1761 paniclog_append_noflush("mp_kdp_enter() NMI pending on cpus:");
1762 for (cpu = 0; cpu < real_ncpus; cpu++) {
1763 if (cpu_is_running(cpu) && !cpu_datap(cpu)->cpu_NMI_acknowledged) {
1764 paniclog_append_noflush(" %d", cpu);
1765 }
1766 }
1767 paniclog_append_noflush("\n");
1768 if (proceed_on_failure) {
1769 paniclog_append_noflush("mp_kdp_enter() timed-out during %s wait after NMI;"
1770 "expected %u acks but received %lu after %u loops in %llu ticks\n",
1771 (locked ? "locked" : "unlocked"), ncpus, mp_kdp_ncpus, wait_cycles, LockTimeOutTSC);
1772 } else {
1773 panic("mp_kdp_enter() timed-out during %s wait after NMI;"
1774 "expected %u acks but received %lu after %u loops in %llu ticks",
1775 (locked ? "locked" : "unlocked"), ncpus, mp_kdp_ncpus, wait_cycles, LockTimeOutTSC);
1776 }
1777 }
1778 }
1779 } else if (NMI_panic_reason != PTE_CORRUPTION) { /* In the pte corruption case, the detecting CPU has already NMIed other CPUs */
1780 for (cpu = 0; cpu < real_ncpus; cpu++) {
1781 if (cpu == my_cpu || !cpu_is_running(cpu)) {
1782 continue;
1783 }
1784 cpu_NMI_interrupt(cpu);
1785 }
1786 }
1787
1788 if (locked) {
1789 simple_unlock(&x86_topo_lock);
1790 }
1791
1792 DBG("mp_kdp_enter() %d processors done %s\n",
1793 (int)mp_kdp_ncpus, (mp_kdp_ncpus == ncpus) ? "OK" : "timed out");
1794
1795 postcode(MP_KDP_ENTER);
1796 }
1797
1798 boolean_t
mp_kdp_all_cpus_halted()1799 mp_kdp_all_cpus_halted()
1800 {
1801 unsigned int ncpus = 0, cpu = 0, my_cpu = 0;
1802
1803 my_cpu = cpu_number();
1804 ncpus = 1; /* current CPU */
1805 for (cpu = 0; cpu < real_ncpus; cpu++) {
1806 if (cpu == my_cpu || !cpu_is_running(cpu)) {
1807 continue;
1808 }
1809 ncpus++;
1810 }
1811
1812 return mp_kdp_ncpus == ncpus;
1813 }
1814
1815 static boolean_t
cpu_signal_pending(int cpu,mp_event_t event)1816 cpu_signal_pending(int cpu, mp_event_t event)
1817 {
1818 volatile int *signals = &cpu_datap(cpu)->cpu_signals;
1819 boolean_t retval = FALSE;
1820
1821 if (i_bit(event, signals)) {
1822 retval = TRUE;
1823 }
1824 return retval;
1825 }
1826
/*
 * Debugger cross-cpu call: stage func(arg0, arg1) for cpu 'lcpu' in the
 * global kdp_xcpu_call_func slot and busy-wait until that cpu runs it
 * via kdp_x86_xcpu_poll(), or until 'timeout' absolute-time units elapse
 * (timeout == 0 means wait forever).
 * Returns the function's result, or -1 on bad cpu/func (or if the call
 * did not complete before the timeout).
 */
long
kdp_x86_xcpu_invoke(const uint16_t lcpu, kdp_x86_xcpu_func_t func,
    void *arg0, void *arg1, uint64_t timeout)
{
	uint64_t now;

	if (lcpu > (real_ncpus - 1)) {
		return -1;
	}

	if (func == NULL) {
		return -1;
	}

	kdp_xcpu_call_func.func = func;
	kdp_xcpu_call_func.ret = -1;
	kdp_xcpu_call_func.arg0 = arg0;
	kdp_xcpu_call_func.arg1 = arg1;
	/* Publishing .cpu last arms the call; the target polls for it. */
	kdp_xcpu_call_func.cpu = lcpu;
	DBG("Invoking function %p on CPU %d\n", func, (int32_t)lcpu);
	now = mach_absolute_time();
	while (kdp_xcpu_call_func.cpu != KDP_XCPU_NONE &&
	    (timeout == 0 || (mach_absolute_time() - now) < timeout)) {
		cpu_pause();
	}
	return kdp_xcpu_call_func.ret;
}
1854
/*
 * Polled from mp_kdp_wait(): if a debugger cross-cpu call is staged for
 * this cpu, run it, record the result, and clear the slot to signal
 * completion to kdp_x86_xcpu_invoke().
 */
static void
kdp_x86_xcpu_poll(void)
{
	if ((uint16_t)cpu_number() == kdp_xcpu_call_func.cpu) {
		kdp_xcpu_call_func.ret =
		    kdp_xcpu_call_func.func(kdp_xcpu_call_func.arg0,
		    kdp_xcpu_call_func.arg1,
		    cpu_number());
		kdp_xcpu_call_func.cpu = KDP_XCPU_NONE;
	}
}
1866
/*
 * Park this cpu while the debugger is active: check in by bumping
 * mp_kdp_ncpus, then spin (optionally servicing TLB flushes and
 * debugger cross-cpu calls) until mp_kdp_trap clears, then check out.
 * With isNMI TRUE the loop never exits (NMI entry is terminal).
 */
static void
mp_kdp_wait(boolean_t flush, boolean_t isNMI)
{
	DBG("mp_kdp_wait()\n");

	current_cpu_datap()->debugger_ipi_time = mach_absolute_time();
#if CONFIG_MCA
	/* If we've trapped due to a machine-check, save MCA registers */
	mca_check_save();
#endif

	/* If this is a stackshot, setup the CPU state before signalling we've entered the debugger. */
	if (mp_kdp_is_stackshot) {
		stackshot_cpu_preflight();
	}

	atomic_incl((volatile long *)&mp_kdp_ncpus, 1);

	/* If this is a stackshot, join in on the fun. */
	if (mp_kdp_is_stackshot) {
		stackshot_aux_cpu_entry();
	}

	while (mp_kdp_trap || (isNMI == TRUE)) {
		/*
		 * A TLB shootdown request may be pending--this would result
		 * in the requesting processor waiting in PMAP_UPDATE_TLBS()
		 * until this processor handles it.
		 * Process it, so it can now enter mp_kdp_wait()
		 */
		if (flush) {
			handle_pending_TLB_flushes();
		}

		kdp_x86_xcpu_poll();
		cpu_pause();
	}

	atomic_decl((volatile long *)&mp_kdp_ncpus, 1);
	DBG("mp_kdp_wait() done\n");
}
1908
1909 void
mp_kdp_exit(void)1910 mp_kdp_exit(void)
1911 {
1912 DBG("mp_kdp_exit()\n");
1913 debugger_cpu = -1;
1914 atomic_decl((volatile long *)&mp_kdp_ncpus, 1);
1915
1916 debugger_exit_time = mach_absolute_time();
1917
1918 mp_kdp_is_stackshot = false;
1919 mp_kdp_trap = FALSE;
1920 mfence();
1921
1922 /* Wait other processors to stop spinning. XXX needs timeout */
1923 DBG("mp_kdp_exit() waiting for processors to resume\n");
1924 while (mp_kdp_ncpus > 0) {
1925 /*
1926 * a TLB shootdown request may be pending... this would result in the requesting
1927 * processor waiting in PMAP_UPDATE_TLBS() until this processor deals with it.
1928 * Process it, so it can now enter mp_kdp_wait()
1929 */
1930 handle_pending_TLB_flushes();
1931
1932 cpu_pause();
1933 }
1934
1935 if (pmsafe_debug && !kdp_snapshot) {
1936 pmSafeMode(¤t_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
1937 }
1938
1939 debugger_exit_time = mach_absolute_time();
1940
1941 DBG("mp_kdp_exit() done\n");
1942 (void) ml_set_interrupts_enabled(mp_kdp_state);
1943 postcode(MP_KDP_EXIT);
1944 }
1945
1946 #endif /* MACH_KDP */
1947
1948 boolean_t
mp_recent_debugger_activity(void)1949 mp_recent_debugger_activity(void)
1950 {
1951 uint64_t abstime = mach_absolute_time();
1952 return ((abstime - debugger_entry_time) < LastDebuggerEntryAllowance) ||
1953 ((abstime - debugger_exit_time) < LastDebuggerEntryAllowance);
1954 }
1955
/*ARGSUSED*/
/* No per-processor AST-check initialization is needed on this platform. */
void
init_ast_check(
	__unused processor_t processor)
{
}
1962
/*
 * Prompt the given processor to check for pending ASTs by sending it an
 * MP_AST IPI.  A self-directed request is a no-op.
 */
void
cause_ast_check(
	processor_t processor)
{
	assert(processor != PROCESSOR_NULL);

	int cpu = processor->cpu_id;

	if (cpu != cpu_number()) {
		i386_signal_cpu(cpu, MP_AST, ASYNC);
		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), cpu, 1, 0, 0, 0);
	}
}
1976
/*
 * Per-cpu machine re-initialization.  param == FULL_SLAVE_INIT indicates
 * a cold start requiring clock initialization.
 */
void
machine_cpu_reinit(void *param)
{
	/*
	 * Here in process context, but with interrupts disabled.
	 */
	DBG("machine_cpu_reinit() CPU%d\n", get_cpu_number());

	if (param == FULL_SLAVE_INIT) {
		/*
		 * Cold start
		 */
		clock_init();
	}
	cpu_machine_init();     /* Interrupts enabled hereafter */
}
1993
#undef cpu_number
/* Out-of-line cpu_number(), exposed after undef'ing the fast macro form. */
int
cpu_number(void)
{
	return get_cpu_number();
}
2000
/* Base address of the current cpu's per-cpu data area. */
vm_offset_t
current_percpu_base(void)
{
	return get_current_percpu_base();
}
2006
/* Base address of another cpu's per-cpu data area. */
vm_offset_t
other_percpu_base(int cpu)
{
	return cpu_datap(cpu)->cpu_pcpu_base;
}
2012
/*
 * Initialize the interrupt-prewarm machinery: set up the lock and stock
 * the free list with the static pool of warm timer-call buffers.
 */
static void
cpu_prewarm_init()
{
	int i;

	simple_lock_init(&cpu_warm_lock, 0);
	queue_init(&cpu_warm_call_list);
	for (i = 0; i < NUM_CPU_WARM_CALLS; i++) {
		enqueue_head(&cpu_warm_call_list, (queue_entry_t)&cpu_warm_call_arr[i]);
	}
}
2024
/*
 * Take a timer call from the warm-call free list.
 * Returns NULL if the pool is exhausted.
 */
static timer_call_t
grab_warm_timer_call()
{
	spl_t x;
	timer_call_t call = NULL;

	x = splsched();
	simple_lock(&cpu_warm_lock, LCK_GRP_NULL);
	if (!queue_empty(&cpu_warm_call_list)) {
		call = (timer_call_t) dequeue_head(&cpu_warm_call_list);
	}
	simple_unlock(&cpu_warm_lock);
	splx(x);

	return call;
}
2041
/* Return a timer call to the warm-call free list. */
static void
free_warm_timer_call(timer_call_t call)
{
	spl_t x;

	x = splsched();
	simple_lock(&cpu_warm_lock, LCK_GRP_NULL);
	enqueue_head(&cpu_warm_call_list, (queue_entry_t)call);
	simple_unlock(&cpu_warm_lock);
	splx(x);
}
2053
2054 /*
2055 * Runs in timer call context (interrupts disabled).
2056 */
2057 static void
cpu_warm_timer_call_func(timer_call_param_t p0,__unused timer_call_param_t p1)2058 cpu_warm_timer_call_func(
2059 timer_call_param_t p0,
2060 __unused timer_call_param_t p1)
2061 {
2062 free_warm_timer_call((timer_call_t)p0);
2063 return;
2064 }
2065
2066 /*
2067 * Runs with interrupts disabled on the CPU we wish to warm (i.e. CPU 0).
2068 */
2069 static void
_cpu_warm_setup(void * arg)2070 _cpu_warm_setup(
2071 void *arg)
2072 {
2073 cpu_warm_data_t cwdp = (cpu_warm_data_t)arg;
2074
2075 timer_call_enter(cwdp->cwd_call, cwdp->cwd_deadline, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL);
2076 cwdp->cwd_result = 0;
2077
2078 return;
2079 }
2080
2081 /*
2082 * Not safe to call with interrupts disabled.
2083 */
2084 kern_return_t
ml_interrupt_prewarm(uint64_t deadline)2085 ml_interrupt_prewarm(
2086 uint64_t deadline)
2087 {
2088 struct cpu_warm_data cwd;
2089 timer_call_t call;
2090 cpu_t ct;
2091
2092 if (ml_get_interrupts_enabled() == FALSE) {
2093 panic("%s: Interrupts disabled?", __FUNCTION__);
2094 }
2095
2096 /*
2097 * If the platform doesn't need our help, say that we succeeded.
2098 */
2099 if (!ml_get_interrupt_prewake_applicable()) {
2100 return KERN_SUCCESS;
2101 }
2102
2103 /*
2104 * Grab a timer call to use.
2105 */
2106 call = grab_warm_timer_call();
2107 if (call == NULL) {
2108 return KERN_RESOURCE_SHORTAGE;
2109 }
2110
2111 timer_call_setup(call, cpu_warm_timer_call_func, call);
2112 cwd.cwd_call = call;
2113 cwd.cwd_deadline = deadline;
2114 cwd.cwd_result = 0;
2115
2116 /*
2117 * For now, non-local interrupts happen on the master processor.
2118 */
2119 ct = mp_cpus_call(cpu_to_cpumask(master_cpu), SYNC, _cpu_warm_setup, &cwd);
2120 if (ct == 0) {
2121 free_warm_timer_call(call);
2122 return KERN_FAILURE;
2123 } else {
2124 return cwd.cwd_result;
2125 }
2126 }
2127
2128 #if DEBUG || DEVELOPMENT
2129 void
kernel_spin(uint64_t spin_ns)2130 kernel_spin(uint64_t spin_ns)
2131 {
2132 boolean_t istate;
2133 uint64_t spin_abs;
2134 uint64_t deadline;
2135 cpu_data_t *cdp;
2136
2137 kprintf("kernel_spin(%llu) spinning uninterruptibly\n", spin_ns);
2138 istate = ml_set_interrupts_enabled(FALSE);
2139 cdp = current_cpu_datap();
2140 nanoseconds_to_absolutetime(spin_ns, &spin_abs);
2141
2142 /* Fake interrupt handler entry for testing mp_interrupt_watchdog() */
2143 cdp->cpu_int_event_time = mach_absolute_time();
2144 cdp->cpu_int_state = (void *) USER_STATE(current_thread());
2145
2146 deadline = mach_absolute_time() + spin_ns;
2147 while (mach_absolute_time() < deadline) {
2148 cpu_pause();
2149 }
2150
2151 cdp->cpu_int_event_time = 0;
2152 cdp->cpu_int_state = NULL;
2153
2154 ml_set_interrupts_enabled(istate);
2155 kprintf("kernel_spin() continuing\n");
2156 }
2157
2158 /*
2159 * Called from the scheduler's maintenance thread,
2160 * scan running processors for long-running ISRs and:
2161 * - panic if longer than LockTimeOut, or
2162 * - log if more than a quantum.
2163 */
void
mp_interrupt_watchdog(void)
{
	cpu_t cpu;
	boolean_t intrs_enabled = FALSE;
	uint16_t cpu_int_num;
	uint64_t cpu_int_event_time;
	uint64_t cpu_rip;
	uint64_t cpu_int_duration;
	uint64_t now;
	x86_saved_state_t *cpu_int_state;

	/* The watchdog can be disabled wholesale via this flag */
	if (__improbable(!mp_interrupt_watchdog_enabled)) {
		return;
	}

	/*
	 * Disable interrupts for the scan and take a single timestamp
	 * against which every cpu's interrupt-entry time is compared.
	 */
	intrs_enabled = ml_set_interrupts_enabled(FALSE);
	now = mach_absolute_time();
	/*
	 * While timeouts are not suspended,
	 * check all other processors for long outstanding interrupt handling.
	 */
	for (cpu = 0;
	    cpu < (cpu_t) real_ncpus && !machine_timeout_suspended();
	    cpu++) {
		/* Skip ourselves and any cpu that isn't running */
		if ((cpu == (cpu_t) cpu_number()) ||
		    (!cpu_is_running(cpu))) {
			continue;
		}
		/* A zero event time means no interrupt is in progress there */
		cpu_int_event_time = cpu_datap(cpu)->cpu_int_event_time;
		if (cpu_int_event_time == 0) {
			continue;
		}
		if (__improbable(now < cpu_int_event_time)) {
			continue; /* skip due to inter-processor skew */
		}
		cpu_int_state = cpu_datap(cpu)->cpu_int_state;
		if (__improbable(cpu_int_state == NULL)) {
			/* The interrupt may have been dismissed */
			continue;
		}

		/* Here with a cpu handling an interrupt */

		cpu_int_duration = now - cpu_int_event_time;
		if (__improbable(cpu_int_duration > LockTimeOut)) {
			/*
			 * Outstanding longer than LockTimeOut: NMI the
			 * offending cpu (so its state is captured) and
			 * panic from this one with the saved trap state.
			 */
			cpu_int_num = saved_state64(cpu_int_state)->isf.trapno;
			cpu_rip = saved_state64(cpu_int_state)->isf.rip;
			vector_timed_out = cpu_int_num;
			NMIPI_panic(cpu_to_cpumask(cpu), INTERRUPT_WATCHDOG);
			panic("Interrupt watchdog, "
			    "cpu: %d interrupt: 0x%x time: %llu..%llu state: %p RIP: 0x%llx",
			    cpu, cpu_int_num, cpu_int_event_time, now, cpu_int_state, cpu_rip);
			/* NOT REACHED */
		} else if (__improbable(cpu_int_duration > (uint64_t) std_quantum)) {
			/*
			 * Outstanding longer than a scheduling quantum:
			 * count and log it. The caller's interrupt state is
			 * restored before logging, and we return after the
			 * first offender found -- only one is logged per scan.
			 */
			mp_interrupt_watchdog_events++;
			cpu_int_num = saved_state64(cpu_int_state)->isf.trapno;
			cpu_rip = saved_state64(cpu_int_state)->isf.rip;
			ml_set_interrupts_enabled(intrs_enabled);
			printf("Interrupt watchdog, "
			    "cpu: %d interrupt: 0x%x time: %llu..%llu RIP: 0x%llx\n",
			    cpu, cpu_int_num, cpu_int_event_time, now, cpu_rip);
			return;
		}
	}

	ml_set_interrupts_enabled(intrs_enabled);
}
2232 #endif
2233