1 /*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31
32 #include <mach_kdp.h>
33 #include <kdp/kdp_internal.h>
34 #include <mach_ldebug.h>
35
36 #include <mach/mach_types.h>
37 #include <mach/kern_return.h>
38
39 #include <kern/kern_types.h>
40 #include <kern/startup.h>
41 #include <kern/timer_queue.h>
42 #include <kern/processor.h>
43 #include <kern/cpu_number.h>
44 #include <kern/cpu_data.h>
45 #include <kern/assert.h>
46 #include <kern/lock_group.h>
47 #include <kern/machine.h>
48 #include <kern/pms.h>
49 #include <kern/misc_protos.h>
50 #include <kern/timer_call.h>
51 #include <kern/zalloc.h>
52 #include <kern/queue.h>
53 #include <prng/random.h>
54
55 #include <vm/vm_map.h>
56 #include <vm/vm_kern.h>
57
58 #include <i386/bit_routines.h>
59 #include <i386/proc_reg.h>
60 #include <i386/cpu_threads.h>
61 #include <i386/mp_desc.h>
62 #include <i386/misc_protos.h>
63 #include <i386/trap.h>
64 #include <i386/postcode.h>
65 #include <i386/machine_routines.h>
66 #include <i386/mp.h>
67 #include <i386/mp_events.h>
68 #include <i386/lapic.h>
69 #include <i386/cpuid.h>
70 #include <i386/fpu.h>
71 #include <i386/machine_cpu.h>
72 #include <i386/pmCPU.h>
73 #if CONFIG_MCA
74 #include <i386/machine_check.h>
75 #endif
76 #include <i386/acpi.h>
77
78 #include <sys/kdebug.h>
79
80 #include <console/serial_protos.h>
81
82 #if MONOTONIC
83 #include <kern/monotonic.h>
84 #endif /* MONOTONIC */
85
86 #if KPERF
87 #include <kperf/kptimer.h>
88 #endif /* KPERF */
89
90 #if MP_DEBUG
91 #define PAUSE delay(1000000)
92 #define DBG(x...) kprintf(x)
93 #else
94 #define DBG(x...)
95 #define PAUSE
96 #endif /* MP_DEBUG */
97
98 /* Debugging/test trace events: */
99 #define TRACE_MP_TLB_FLUSH MACHDBG_CODE(DBG_MACH_MP, 0)
100 #define TRACE_MP_CPUS_CALL MACHDBG_CODE(DBG_MACH_MP, 1)
101 #define TRACE_MP_CPUS_CALL_LOCAL MACHDBG_CODE(DBG_MACH_MP, 2)
102 #define TRACE_MP_CPUS_CALL_ACTION MACHDBG_CODE(DBG_MACH_MP, 3)
103 #define TRACE_MP_CPUS_CALL_NOBUF MACHDBG_CODE(DBG_MACH_MP, 4)
104 #define TRACE_MP_CPU_FAST_START MACHDBG_CODE(DBG_MACH_MP, 5)
105 #define TRACE_MP_CPU_START MACHDBG_CODE(DBG_MACH_MP, 6)
106 #define TRACE_MP_CPU_DEACTIVATE MACHDBG_CODE(DBG_MACH_MP, 7)
107
108 #define ABS(v) (((v) > 0)?(v):-(v))
109
110 void slave_boot_init(void);
111 void i386_cpu_IPI(int cpu);
112
113 #if MACH_KDP
114 static void mp_kdp_wait(boolean_t flush, boolean_t isNMI);
115 #endif /* MACH_KDP */
116
117 #if MACH_KDP
118 static boolean_t cpu_signal_pending(int cpu, mp_event_t event);
119 #endif /* MACH_KDP */
120 static int NMIInterruptHandler(x86_saved_state_t *regs);
121
122 boolean_t smp_initialized = FALSE;
123 uint32_t TSC_sync_margin = 0xFFF;
124 volatile boolean_t force_immediate_debugger_NMI = FALSE;
125 volatile boolean_t pmap_tlb_flush_timeout = FALSE;
126 #if DEBUG || DEVELOPMENT
127 boolean_t mp_interrupt_watchdog_enabled = TRUE;
128 uint32_t mp_interrupt_watchdog_events = 0;
129 #endif
130
131 SIMPLE_LOCK_DECLARE(debugger_callback_lock, 0);
132 struct debugger_callback *debugger_callback = NULL;
133
134 static LCK_GRP_DECLARE(smp_lck_grp, "i386_smp");
135 static LCK_MTX_EARLY_DECLARE(mp_cpu_boot_lock, &smp_lck_grp);
136
137 /* Variables needed for MP rendezvous. */
138 SIMPLE_LOCK_DECLARE(mp_rv_lock, 0);
139 static void (*mp_rv_setup_func)(void *arg);
140 static void (*mp_rv_action_func)(void *arg);
141 static void (*mp_rv_teardown_func)(void *arg);
142 static void *mp_rv_func_arg;
143 static volatile int mp_rv_ncpus;
144 /* Cache-aligned barriers: */
145 static volatile long mp_rv_entry __attribute__((aligned(64)));
146 static volatile long mp_rv_exit __attribute__((aligned(64)));
147 static volatile long mp_rv_complete __attribute__((aligned(64)));
148
149 volatile uint64_t debugger_entry_time;
150 volatile uint64_t debugger_exit_time;
151 #if MACH_KDP
152 #include <kdp/kdp.h>
153 extern int kdp_snapshot;
154 static struct _kdp_xcpu_call_func {
155 kdp_x86_xcpu_func_t func;
156 void *arg0, *arg1;
157 volatile long ret;
158 volatile uint16_t cpu;
159 } kdp_xcpu_call_func = {
160 .cpu = KDP_XCPU_NONE
161 };
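/*
 * Descriptive note: kdp_xcpu_call_func is the single mailbox used by
 * kdp_x86_xcpu_invoke() to run a function on another cpu while stopped in
 * the debugger; the target cpu picks the request up in kdp_x86_xcpu_poll()
 * (called from mp_kdp_wait()) and posts the result in .ret.
 */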
162
163 #endif
164
165 /* Variables needed for MP broadcast. */
166 static void (*mp_bc_action_func)(void *arg);
167 static void *mp_bc_func_arg;
168 static int mp_bc_ncpus;
169 static volatile long mp_bc_count;
170 static LCK_MTX_EARLY_DECLARE(mp_bc_lock, &smp_lck_grp);
171 static volatile int debugger_cpu = -1;
172 volatile long NMIPI_acks = 0;
173 volatile long NMI_count = 0;
174 static int vector_timed_out;
175
176 NMI_reason_t NMI_panic_reason = NONE;
177 extern void NMI_cpus(void);
178
179 static void mp_cpus_call_init(void);
180 static void mp_cpus_call_action(void);
181 static void mp_call_PM(void);
182
183 char mp_slave_stack[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); // Temp stack for slave init
184
185 /* PAL-related routines */
186 boolean_t i386_smp_init(int nmi_vector, i386_intr_func_t nmi_handler,
187 int ipi_vector, i386_intr_func_t ipi_handler);
188 void i386_start_cpu(int lapic_id, int cpu_num);
189 void i386_send_NMI(int cpu);
190 void NMIPI_enable(boolean_t);
191
192 #define NUM_CPU_WARM_CALLS 20
193 struct timer_call cpu_warm_call_arr[NUM_CPU_WARM_CALLS];
194 queue_head_t cpu_warm_call_list;
195 decl_simple_lock_data(static, cpu_warm_lock);
196
197 typedef struct cpu_warm_data {
198 timer_call_t cwd_call;
199 uint64_t cwd_deadline;
200 int cwd_result;
201 } *cpu_warm_data_t;
202
203 static void cpu_prewarm_init(void);
204 static void cpu_warm_timer_call_func(timer_call_param_t p0, timer_call_param_t p1);
205 static void _cpu_warm_setup(void *arg);
206 static timer_call_t grab_warm_timer_call(void);
207 static void free_warm_timer_call(timer_call_t call);
208
209 void
210 smp_init(void)
211 {
212 console_init();
213
214 if (!i386_smp_init(LAPIC_NMI_INTERRUPT, NMIInterruptHandler,
215 LAPIC_VECTOR(INTERPROCESSOR), cpu_signal_handler)) {
216 return;
217 }
218
219 cpu_thread_init();
220
221 DBGLOG_CPU_INIT(master_cpu);
222
223 mp_cpus_call_init();
224 mp_cpus_call_cpu_init(master_cpu);
225
226 #if DEBUG || DEVELOPMENT
227 if (PE_parse_boot_argn("interrupt_watchdog",
228 &mp_interrupt_watchdog_enabled,
229 sizeof(mp_interrupt_watchdog_enabled))) {
230 kprintf("Interrupt watchdog %sabled\n",
231 mp_interrupt_watchdog_enabled ? "en" : "dis");
232 }
233 #endif
234
235 if (PE_parse_boot_argn("TSC_sync_margin",
236 &TSC_sync_margin, sizeof(TSC_sync_margin))) {
237 kprintf("TSC sync Margin 0x%x\n", TSC_sync_margin);
238 } else if (cpuid_vmm_present()) {
239 kprintf("TSC sync margin disabled\n");
240 TSC_sync_margin = 0;
241 }
242 smp_initialized = TRUE;
243
244 cpu_prewarm_init();
245
246 return;
247 }
248
249 typedef struct {
250 int target_cpu;
251 int target_lapic;
252 int starter_cpu;
253 } processor_start_info_t;
254 static processor_start_info_t start_info __attribute__((aligned(64)));
255
256 /*
257 * Cache-alignment is to avoid cross-cpu false-sharing interference.
258 */
259 static volatile long tsc_entry_barrier __attribute__((aligned(64)));
260 static volatile long tsc_exit_barrier __attribute__((aligned(64)));
261 static volatile uint64_t tsc_target __attribute__((aligned(64)));
262
263 /*
264 * Poll a CPU to see when it has marked itself as running.
265 */
266 static void
267 mp_wait_for_cpu_up(int slot_num, unsigned int iters, unsigned int usecdelay)
268 {
269 while (iters-- > 0) {
270 if (cpu_datap(slot_num)->cpu_running) {
271 break;
272 }
273 delay(usecdelay);
274 }
275 }
276
277 /*
278 * Quickly bring a CPU back online which has been halted.
279 */
280 kern_return_t
281 intel_startCPU_fast(int slot_num)
282 {
283 kern_return_t rc;
284
285 /*
286 * Try to perform a fast restart
287 */
288 rc = pmCPUExitHalt(slot_num);
289 if (rc != KERN_SUCCESS) {
290 /*
291 * The CPU was not eligible for a fast restart.
292 */
293 return rc;
294 }
295
296 KERNEL_DEBUG_CONSTANT(
297 TRACE_MP_CPU_FAST_START | DBG_FUNC_START,
298 slot_num, 0, 0, 0, 0);
299
300 /*
301 * Wait until the CPU is back online.
302 */
303 mp_disable_preemption();
304
305 /*
306 * We use short pauses (1us) for low latency. 30,000 iterations is
307 * longer than a full restart would require, so it should be more
308 * than long enough.
309 */
310
311 mp_wait_for_cpu_up(slot_num, 30000, 1);
312 mp_enable_preemption();
313
314 KERNEL_DEBUG_CONSTANT(
315 TRACE_MP_CPU_FAST_START | DBG_FUNC_END,
316 slot_num, cpu_datap(slot_num)->cpu_running, 0, 0, 0);
317
318 /*
319 * Check to make sure that the CPU is really running. If not,
320 * go through the slow path.
321 */
322 if (cpu_datap(slot_num)->cpu_running) {
323 return KERN_SUCCESS;
324 } else {
325 return KERN_FAILURE;
326 }
327 }
328
329 static void
330 started_cpu(void)
331 {
332 /* Here on the started cpu with cpu_running set TRUE */
333
334 if (TSC_sync_margin &&
335 start_info.target_cpu == cpu_number()) {
336 /*
337 * I've just started-up, synchronize again with the starter cpu
338 * and then snap my TSC.
339 */
340 tsc_target = 0;
341 atomic_decl(&tsc_entry_barrier, 1);
342 while (tsc_entry_barrier != 0) {
343 ; /* spin for starter and target at barrier */
344 }
345 tsc_target = rdtsc64();
346 atomic_decl(&tsc_exit_barrier, 1);
347 }
348 }
349
350 static void
351 start_cpu(void *arg)
352 {
353 int i = 1000;
354 processor_start_info_t *psip = (processor_start_info_t *) arg;
355
356 /* Ignore this if the current processor is not the starter */
357 if (cpu_number() != psip->starter_cpu) {
358 return;
359 }
360
361 DBG("start_cpu(%p) about to start cpu %d, lapic %d\n",
362 arg, psip->target_cpu, psip->target_lapic);
363
364 KERNEL_DEBUG_CONSTANT(
365 TRACE_MP_CPU_START | DBG_FUNC_START,
366 psip->target_cpu,
367 psip->target_lapic, 0, 0, 0);
368
369 i386_start_cpu(psip->target_lapic, psip->target_cpu);
370
371 #ifdef POSTCODE_DELAY
372 /* Wait much longer if postcodes are displayed for a delay period. */
373 i *= 10000;
374 #endif
375 DBG("start_cpu(%p) about to wait for cpu %d\n",
376 arg, psip->target_cpu);
377
378 mp_wait_for_cpu_up(psip->target_cpu, i * 100, 100);
379
380 KERNEL_DEBUG_CONSTANT(
381 TRACE_MP_CPU_START | DBG_FUNC_END,
382 psip->target_cpu,
383 cpu_datap(psip->target_cpu)->cpu_running, 0, 0, 0);
384
385 if (TSC_sync_margin &&
386 cpu_datap(psip->target_cpu)->cpu_running) {
387 /*
388 * Compare the TSC from the started processor with ours.
389 * Report and log/panic if it diverges by more than
390 * TSC_sync_margin (TSC_SYNC_MARGIN) ticks. This margin
391 * can be overridden by boot-arg (with 0 meaning no checking).
392 */
393 uint64_t tsc_starter;
394 int64_t tsc_delta;
395 atomic_decl(&tsc_entry_barrier, 1);
396 while (tsc_entry_barrier != 0) {
397 ; /* spin for both processors at barrier */
398 }
399 tsc_starter = rdtsc64();
400 atomic_decl(&tsc_exit_barrier, 1);
401 while (tsc_exit_barrier != 0) {
402 ; /* spin for target to store its TSC */
403 }
404 tsc_delta = tsc_target - tsc_starter;
405 kprintf("TSC sync for cpu %d: 0x%016llx delta 0x%llx (%lld)\n",
406 psip->target_cpu, tsc_target, tsc_delta, tsc_delta);
407 #if DEBUG || DEVELOPMENT
408 /*
409 * Stash the delta for inspection later, since we can no
410 * longer print/log it with interrupts disabled.
411 */
412 cpu_datap(psip->target_cpu)->tsc_sync_delta = tsc_delta;
413 #endif
414 if (ABS(tsc_delta) > (int64_t) TSC_sync_margin) {
415 #if DEBUG
416 panic(
417 #else
418 kprintf(
419 #endif
420 "Unsynchronized TSC for cpu %d: "
421 "0x%016llx, delta 0x%llx\n",
422 psip->target_cpu, tsc_target, tsc_delta);
423 }
424 }
425 }
426
427 kern_return_t
428 intel_startCPU(
429 int slot_num)
430 {
431 int lapic = cpu_to_lapic[slot_num];
432 boolean_t istate;
433
434 assert(lapic != -1);
435
436 DBGLOG_CPU_INIT(slot_num);
437
438 DBG("intel_startCPU(%d) lapic_id=%d\n", slot_num, lapic);
439 DBG("IdlePTD(%p): 0x%x\n", &IdlePTD, (int) (uintptr_t)IdlePTD);
440
441 /*
442 * Initialize (or re-initialize) the descriptor tables for this cpu.
443 * Propagate processor mode to slave.
444 */
445 cpu_desc_init(cpu_datap(slot_num));
446
447 /* Serialize use of the slave boot stack, etc. */
448 lck_mtx_lock(&mp_cpu_boot_lock);
449
450 istate = ml_set_interrupts_enabled(FALSE);
451 if (slot_num == get_cpu_number()) {
452 ml_set_interrupts_enabled(istate);
453 lck_mtx_unlock(&mp_cpu_boot_lock);
454 return KERN_SUCCESS;
455 }
456
457 start_info.starter_cpu = cpu_number();
458 start_info.target_cpu = slot_num;
459 start_info.target_lapic = lapic;
460 tsc_entry_barrier = 2;
461 tsc_exit_barrier = 2;
462
463 /*
464 * Perform the processor startup sequence with all running
465 * processors rendezvous'ed. This is required during periods when
466 * the cache-disable bit is set for MTRR/PAT initialization.
467 */
468 mp_rendezvous_no_intrs(start_cpu, (void *) &start_info);
469
470 start_info.target_cpu = 0;
471
472 ml_set_interrupts_enabled(istate);
473 lck_mtx_unlock(&mp_cpu_boot_lock);
474
475 if (!cpu_datap(slot_num)->cpu_running) {
476 kprintf("Failed to start CPU %02d\n", slot_num);
477 printf("Failed to start CPU %02d, rebooting...\n", slot_num);
478 delay(1000000);
479 halt_cpu();
480 return KERN_SUCCESS;
481 } else {
482 kprintf("Started cpu %d (lapic id %08x)\n", slot_num, lapic);
483 return KERN_SUCCESS;
484 }
485 }
486
487 #if MP_DEBUG
488 cpu_signal_event_log_t *cpu_signal[MAX_CPUS];
489 cpu_signal_event_log_t *cpu_handle[MAX_CPUS];
490
491 MP_EVENT_NAME_DECL();
492
493 #endif /* MP_DEBUG */
494
495 /*
496 * Note: called with NULL state when polling for TLB flush and cross-calls.
497 */
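/*
 * For example (sketch), spin loops that run with interrupts masked call
 * cpu_signal_handler(NULL) so pending cross-calls and TLB flushes are still
 * serviced while they wait, as mp_safe_spin_lock() and mp_cpus_call_wait()
 * do below:
 *
 *	while (!simple_lock_try(lock, LCK_GRP_NULL)) {
 *		cpu_signal_handler(NULL);
 *	}
 */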
498 int
499 cpu_signal_handler(x86_saved_state_t *regs)
500 {
501 #if !MACH_KDP
502 #pragma unused (regs)
503 #endif /* !MACH_KDP */
504 int my_cpu;
505 volatile int *my_word;
506
507 SCHED_STATS_INC(ipi_count);
508
509 my_cpu = cpu_number();
510 my_word = &cpu_data_ptr[my_cpu]->cpu_signals;
511 /* Store the initial set of signals for diagnostics. New
512 * signals could arrive while these are being processed
513 * so it's no more than a hint.
514 */
515
516 cpu_data_ptr[my_cpu]->cpu_prior_signals = *my_word;
517
518 do {
519 #if MACH_KDP
520 if (i_bit(MP_KDP, my_word)) {
521 DBGLOG(cpu_handle, my_cpu, MP_KDP);
522 i_bit_clear(MP_KDP, my_word);
523 /* Ensure that the i386_kernel_state at the base of the
524 * current thread's stack (if any) is synchronized with the
525 * context at the moment of the interrupt, to facilitate
526 * access through the debugger.
527 */
528 sync_iss_to_iks(regs);
529 if (pmsafe_debug && !kdp_snapshot) {
530 pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
531 }
532 mp_kdp_wait(TRUE, FALSE);
533 if (pmsafe_debug && !kdp_snapshot) {
534 pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
535 }
536 } else
537 #endif /* MACH_KDP */
538 if (i_bit(MP_TLB_FLUSH, my_word)) {
539 DBGLOG(cpu_handle, my_cpu, MP_TLB_FLUSH);
540 i_bit_clear(MP_TLB_FLUSH, my_word);
541 pmap_update_interrupt();
542 } else if (i_bit(MP_CALL, my_word)) {
543 DBGLOG(cpu_handle, my_cpu, MP_CALL);
544 i_bit_clear(MP_CALL, my_word);
545 mp_cpus_call_action();
546 } else if (i_bit(MP_CALL_PM, my_word)) {
547 DBGLOG(cpu_handle, my_cpu, MP_CALL_PM);
548 i_bit_clear(MP_CALL_PM, my_word);
549 mp_call_PM();
550 }
551 if (regs == NULL) {
552 /* Called to poll only for cross-calls and TLB flush */
553 break;
554 } else if (i_bit(MP_AST, my_word)) {
555 DBGLOG(cpu_handle, my_cpu, MP_AST);
556 i_bit_clear(MP_AST, my_word);
557 ast_check(cpu_to_processor(my_cpu));
558 }
559 } while (*my_word);
560
561 return 0;
562 }
563
564 long
565 NMI_pte_corruption_callback(__unused void *arg0, __unused void *arg1, uint16_t lcpu)
566 {
567 static char pstr[256]; /* global since this callback is serialized */
568 void *stackptr;
569 __asm__ volatile ("movq %%rbp, %0" : "=m" (stackptr));
570
571 snprintf(&pstr[0], sizeof(pstr),
572 "Panic(CPU %d): PTE corruption detected on PTEP 0x%llx VAL 0x%llx\n",
573 lcpu, (unsigned long long)(uintptr_t)PTE_corrupted_ptr, *(uint64_t *)PTE_corrupted_ptr);
574 panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, current_cpu_datap()->cpu_int_state);
575 return 0;
576 }
577
578 extern void kprintf_break_lock(void);
579 int
580 NMIInterruptHandler(x86_saved_state_t *regs)
581 {
582 void *stackptr;
583 char pstr[256];
584 uint64_t now = mach_absolute_time();
585
586 if (panic_active() && !panicDebugging) {
587 if (pmsafe_debug) {
588 pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
589 }
590 for (;;) {
591 cpu_pause();
592 }
593 }
594
595 atomic_incl(&NMIPI_acks, 1);
596 atomic_incl(&NMI_count, 1);
597 sync_iss_to_iks_unconditionally(regs);
598 __asm__ volatile ("movq %%rbp, %0" : "=m" (stackptr));
599
600 if (cpu_number() == debugger_cpu) {
601 goto NMExit;
602 }
603
604 if (NMI_panic_reason == SPINLOCK_TIMEOUT) {
605 lck_spinlock_to_info_t lsti;
606
607 lsti = os_atomic_load(&lck_spinlock_timeout_in_progress, acquire);
608 snprintf(&pstr[0], sizeof(pstr),
609 "Panic(CPU %d, time %llu): NMIPI for spinlock acquisition timeout, spinlock: %p, "
610 "spinlock owner: %p, current_thread: %p, spinlock_owner_cpu: 0x%x\n",
611 cpu_number(), now, lsti->lock, (void *)lsti->owner_thread_cur,
612 current_thread(), lsti->owner_cpu);
613 panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs);
614 } else if (NMI_panic_reason == TLB_FLUSH_TIMEOUT) {
615 snprintf(&pstr[0], sizeof(pstr),
616 "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: TLB flush timeout, TLB state:0x%x\n",
617 cpu_number(), now, current_cpu_datap()->cpu_tlb_invalid);
618 panic_i386_backtrace(stackptr, 48, &pstr[0], TRUE, regs);
619 } else if (NMI_panic_reason == CROSSCALL_TIMEOUT) {
620 snprintf(&pstr[0], sizeof(pstr),
621 "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: cross-call timeout\n",
622 cpu_number(), now);
623 panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs);
624 } else if (NMI_panic_reason == INTERRUPT_WATCHDOG) {
625 snprintf(&pstr[0], sizeof(pstr),
626 "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: interrupt watchdog for vector 0x%x\n",
627 cpu_number(), now, vector_timed_out);
628 panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs);
629 }
630
631 #if MACH_KDP
632 if (pmsafe_debug && !kdp_snapshot) {
633 pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
634 }
635 current_cpu_datap()->cpu_NMI_acknowledged = TRUE;
636 i_bit_clear(MP_KDP, &current_cpu_datap()->cpu_signals);
637 if (panic_active() || NMI_panic_reason != NONE) {
638 mp_kdp_wait(FALSE, TRUE);
639 } else if (!mp_kdp_trap &&
640 !mp_kdp_is_NMI &&
641 virtualized && (debug_boot_arg & DB_NMI)) {
642 /*
643 * Under a VMM with the debug boot-arg set, drop into kdp.
644 * Since an NMI is involved, there's a risk of contending with
645 * a panic. And side-effects of NMIs may result in entry into,
646 * and continuing from, the debugger being unreliable.
647 */
648 if (__sync_bool_compare_and_swap(&mp_kdp_is_NMI, FALSE, TRUE)) {
649 kprintf_break_lock();
650
651 DebuggerWithContext(EXC_BREAKPOINT, saved_state64(regs),
652 "requested by NMI", DEBUGGER_OPTION_NONE,
653 (unsigned long)(char *)__builtin_return_address(0));
654
655 mp_kdp_is_NMI = FALSE;
656 } else {
657 mp_kdp_wait(FALSE, FALSE);
658 }
659 } else {
660 mp_kdp_wait(FALSE, FALSE);
661 }
662 if (pmsafe_debug && !kdp_snapshot) {
663 pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
664 }
665 #endif
666 NMExit:
667 return 1;
668 }
669
670 /*
671 * cpu_interrupt is really just to be used by the scheduler to
672 * get a CPU's attention; it may not always issue an IPI. If an
673 * IPI is always needed, use i386_cpu_IPI() instead.
674 */
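/*
 * Note (illustrative): cpu_interrupt() below sends the IPI only when
 * pmCPUExitIdle() indicates the target needs one to wake; a caller that
 * must always deliver an IPI would call i386_cpu_IPI(cpu) directly.
 */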
675 void
676 cpu_interrupt(int cpu)
677 {
678 boolean_t did_IPI = FALSE;
679
680 if (smp_initialized
681 && pmCPUExitIdle(cpu_datap(cpu))) {
682 i386_cpu_IPI(cpu);
683 did_IPI = TRUE;
684 }
685
686 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), cpu, did_IPI, 0, 0, 0);
687 }
688
689 /*
690 * Send a true NMI via the local APIC to the specified CPU.
691 */
692 void
693 cpu_NMI_interrupt(int cpu)
694 {
695 if (smp_initialized) {
696 i386_send_NMI(cpu);
697 }
698 }
699
700 void
701 NMI_cpus(void)
702 {
703 unsigned int cpu;
704 boolean_t intrs_enabled;
705 uint64_t tsc_timeout;
706
707 intrs_enabled = ml_set_interrupts_enabled(FALSE);
708 NMIPI_enable(TRUE);
709 for (cpu = 0; cpu < real_ncpus; cpu++) {
710 if (!cpu_is_running(cpu)) {
711 continue;
712 }
713 cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
714 cpu_NMI_interrupt(cpu);
715 tsc_timeout = !machine_timeout_suspended() ?
716 rdtsc64() + (1000 * 1000 * 1000 * 10ULL) :
717 ~0ULL;
718 while (!cpu_datap(cpu)->cpu_NMI_acknowledged) {
719 handle_pending_TLB_flushes();
720 cpu_pause();
721 if (rdtsc64() > tsc_timeout) {
722 panic("NMI_cpus() timeout cpu %d", cpu);
723 }
724 }
725 cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
726 }
727 NMIPI_enable(FALSE);
728
729 ml_set_interrupts_enabled(intrs_enabled);
730 }
731
732 static void(*volatile mp_PM_func)(void) = NULL;
733
734 static void
735 mp_call_PM(void)
736 {
737 assert(!ml_get_interrupts_enabled());
738
739 if (mp_PM_func != NULL) {
740 mp_PM_func();
741 }
742 }
743
744 void
745 cpu_PM_interrupt(int cpu)
746 {
747 assert(!ml_get_interrupts_enabled());
748
749 if (mp_PM_func != NULL) {
750 if (cpu == cpu_number()) {
751 mp_PM_func();
752 } else {
753 i386_signal_cpu(cpu, MP_CALL_PM, ASYNC);
754 }
755 }
756 }
757
758 void
759 PM_interrupt_register(void (*fn)(void))
760 {
761 mp_PM_func = fn;
762 }
763
764 void
765 i386_signal_cpu(int cpu, mp_event_t event, mp_sync_t mode)
766 {
767 volatile int *signals = &cpu_datap(cpu)->cpu_signals;
768 uint64_t tsc_timeout;
769
770
771 if (!cpu_datap(cpu)->cpu_running) {
772 return;
773 }
774
775 if (event == MP_TLB_FLUSH) {
776 KERNEL_DEBUG(TRACE_MP_TLB_FLUSH | DBG_FUNC_START, cpu, 0, 0, 0, 0);
777 }
778
779 DBGLOG(cpu_signal, cpu, event);
780
781 i_bit_set(event, signals);
782 i386_cpu_IPI(cpu);
783 if (mode == SYNC) {
784 again:
785 tsc_timeout = !machine_timeout_suspended() ?
786 rdtsc64() + (1000 * 1000 * 1000) :
787 ~0ULL;
788 while (i_bit(event, signals) && rdtsc64() < tsc_timeout) {
789 cpu_pause();
790 }
791 if (i_bit(event, signals)) {
792 DBG("i386_signal_cpu(%d, 0x%x, SYNC) timed out\n",
793 cpu, event);
794 goto again;
795 }
796 }
797 if (event == MP_TLB_FLUSH) {
798 KERNEL_DEBUG(TRACE_MP_TLB_FLUSH | DBG_FUNC_END, cpu, 0, 0, 0, 0);
799 }
800 }
801
802 /*
803 * Helper function called when busy-waiting: panic if too long
804 * a TSC-based time has elapsed since the start of the spin.
805 */
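/*
 * Typical spin pattern using this helper (a sketch of the usage in
 * mp_rendezvous_action() and mp_cpus_call_wait() below):
 *
 *	uint64_t tsc_spin_start = rdtsc64();
 *	while (!done) {
 *		if (mp_spin_timeout(tsc_spin_start)) {
 *			panic("spin timeout");
 *		}
 *	}
 */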
806 static boolean_t
807 mp_spin_timeout(uint64_t tsc_start)
808 {
809 uint64_t tsc_timeout;
810
811 cpu_pause();
812 if (machine_timeout_suspended()) {
813 return FALSE;
814 }
815
816 /*
817 * The timeout is 4 * the spinlock timeout period
818 * unless we have serial console printing (kprintf) enabled
819 * in which case we allow an even greater margin.
820 */
821 tsc_timeout = disable_serial_output ? LockTimeOutTSC << 2
822 : LockTimeOutTSC << 4;
823 return rdtsc64() > tsc_start + tsc_timeout;
824 }
825
826 /*
827 * Helper function to take a spinlock while ensuring that incoming IPIs
828 * are still serviced if interrupts are masked while we spin.
829 * Returns current interrupt state.
830 */
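/*
 * Example (sketch): callers in this file typically discard the returned
 * interrupt state and pair the acquire with simple_unlock(), as
 * mp_rendezvous_lock() and mp_rendezvous_unlock() do:
 *
 *	(void) mp_safe_spin_lock(&mp_rv_lock);
 *	... critical section ...
 *	simple_unlock(&mp_rv_lock);
 */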
831 boolean_t
832 mp_safe_spin_lock(usimple_lock_t lock)
833 {
834 if (ml_get_interrupts_enabled()) {
835 simple_lock(lock, LCK_GRP_NULL);
836 return TRUE;
837 }
838
839 lck_spinlock_to_info_t lsti;
840 uint64_t tsc_spin_start = rdtsc64();
841
842 while (!simple_lock_try(lock, LCK_GRP_NULL)) {
843 cpu_signal_handler(NULL);
844 if (mp_spin_timeout(tsc_spin_start)) {
845 uintptr_t lowner = (uintptr_t)lock->interlock.lock_data;
846
847 lsti = lck_spinlock_timeout_hit(lock, lowner);
848 NMIPI_panic(cpu_to_cpumask(lsti->owner_cpu), SPINLOCK_TIMEOUT);
849 panic("mp_safe_spin_lock() timed out, lock: %p, "
850 "owner thread: 0x%lx, current_thread: %p, "
851 "owner on CPU 0x%x, time: %llu",
852 lock, lowner, current_thread(),
853 lsti->owner_cpu, mach_absolute_time());
854 }
855 }
856
857 return FALSE;
858 }
859
860 /*
861 * All-CPU rendezvous:
862 * - CPUs are signalled,
863 * - all execute the setup function (if specified),
864 * - rendezvous (i.e. all cpus reach a barrier),
865 * - all execute the action function (if specified),
866 * - rendezvous again,
867 * - execute the teardown function (if specified), and then
868 * - resume.
869 *
870 * Note that the supplied external functions _must_ be reentrant and aware
871 * that they are running in parallel and in an unknown lock context.
872 */
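/*
 * Illustrative call (a sketch; any of the three functions may be NULL,
 * and flush_action/flush_args are hypothetical):
 *
 *	mp_rendezvous(NULL, flush_action, NULL, &flush_args);
 *
 * runs flush_action(&flush_args) on every cpu between the two barriers.
 */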
873
874 static void
875 mp_rendezvous_action(__unused void *null)
876 {
877 boolean_t intrs_enabled;
878 uint64_t tsc_spin_start;
879
880 /*
881 * Note that mp_rv_lock was acquired by the thread that initiated the
882 * rendezvous and must have been acquired before we enter
883 * mp_rendezvous_action().
884 */
885 current_cpu_datap()->cpu_rendezvous_in_progress = TRUE;
886
887 /* setup function */
888 if (mp_rv_setup_func != NULL) {
889 mp_rv_setup_func(mp_rv_func_arg);
890 }
891
892 intrs_enabled = ml_get_interrupts_enabled();
893
894 /* spin on entry rendezvous */
895 atomic_incl(&mp_rv_entry, 1);
896 tsc_spin_start = rdtsc64();
897
898 while (mp_rv_entry < mp_rv_ncpus) {
899 /* poll for pesky tlb flushes if interrupts disabled */
900 if (!intrs_enabled) {
901 handle_pending_TLB_flushes();
902 }
903 if (mp_spin_timeout(tsc_spin_start)) {
904 panic("mp_rv_action() entry: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_entry, mp_rv_ncpus, tsc_spin_start, rdtsc64());
905 }
906 }
907
908 /* action function */
909 if (mp_rv_action_func != NULL) {
910 mp_rv_action_func(mp_rv_func_arg);
911 }
912
913 /* spin on exit rendezvous */
914 atomic_incl(&mp_rv_exit, 1);
915 tsc_spin_start = rdtsc64();
916 while (mp_rv_exit < mp_rv_ncpus) {
917 if (!intrs_enabled) {
918 handle_pending_TLB_flushes();
919 }
920 if (mp_spin_timeout(tsc_spin_start)) {
921 panic("mp_rv_action() exit: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_exit, mp_rv_ncpus, tsc_spin_start, rdtsc64());
922 }
923 }
924
925 /* teardown function */
926 if (mp_rv_teardown_func != NULL) {
927 mp_rv_teardown_func(mp_rv_func_arg);
928 }
929
930 current_cpu_datap()->cpu_rendezvous_in_progress = FALSE;
931
932 /* Bump completion count */
933 atomic_incl(&mp_rv_complete, 1);
934 }
935
936 void
937 mp_rendezvous(void (*setup_func)(void *),
938 void (*action_func)(void *),
939 void (*teardown_func)(void *),
940 void *arg)
941 {
942 uint64_t tsc_spin_start;
943
944 if (!smp_initialized) {
945 if (setup_func != NULL) {
946 setup_func(arg);
947 }
948 if (action_func != NULL) {
949 action_func(arg);
950 }
951 if (teardown_func != NULL) {
952 teardown_func(arg);
953 }
954 return;
955 }
956
957 /* obtain rendezvous lock */
958 mp_rendezvous_lock();
959
960 /* set static function pointers */
961 mp_rv_setup_func = setup_func;
962 mp_rv_action_func = action_func;
963 mp_rv_teardown_func = teardown_func;
964 mp_rv_func_arg = arg;
965
966 mp_rv_entry = 0;
967 mp_rv_exit = 0;
968 mp_rv_complete = 0;
969
970 /*
971 * signal other processors, which will call mp_rendezvous_action()
972 * with interrupts disabled
973 */
974 mp_rv_ncpus = mp_cpus_call(CPUMASK_OTHERS, NOSYNC, &mp_rendezvous_action, NULL) + 1;
975
976 /* call executor function on this cpu */
977 mp_rendezvous_action(NULL);
978
979 /*
980 * Spin for everyone to complete.
981 * This is necessary to ensure that all processors have proceeded
982 * from the exit barrier before we release the rendezvous structure.
983 */
984 tsc_spin_start = rdtsc64();
985 while (mp_rv_complete < mp_rv_ncpus) {
986 if (mp_spin_timeout(tsc_spin_start)) {
987 panic("mp_rendezvous() timeout: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_complete, mp_rv_ncpus, tsc_spin_start, rdtsc64());
988 }
989 }
990
991 /* Tidy up */
992 mp_rv_setup_func = NULL;
993 mp_rv_action_func = NULL;
994 mp_rv_teardown_func = NULL;
995 mp_rv_func_arg = NULL;
996
997 /* release lock */
998 mp_rendezvous_unlock();
999 }
1000
1001 void
1002 mp_rendezvous_lock(void)
1003 {
1004 (void) mp_safe_spin_lock(&mp_rv_lock);
1005 }
1006
1007 void
1008 mp_rendezvous_unlock(void)
1009 {
1010 simple_unlock(&mp_rv_lock);
1011 }
1012
1013 void
1014 mp_rendezvous_break_lock(void)
1015 {
1016 simple_lock_init(&mp_rv_lock, 0);
1017 }
1018
1019 static void
1020 setup_disable_intrs(__unused void * param_not_used)
1021 {
1022 /* disable interrupts before the first barrier */
1023 boolean_t intr = ml_set_interrupts_enabled(FALSE);
1024
1025 current_cpu_datap()->cpu_iflag = intr;
1026 DBG("CPU%d: %s\n", get_cpu_number(), __FUNCTION__);
1027 }
1028
1029 static void
1030 teardown_restore_intrs(__unused void * param_not_used)
1031 {
1032 /* restore interrupt flag following MTRR changes */
1033 ml_set_interrupts_enabled(current_cpu_datap()->cpu_iflag);
1034 DBG("CPU%d: %s\n", get_cpu_number(), __FUNCTION__);
1035 }
1036
1037 /*
1038 * A wrapper to mp_rendezvous() to call action_func() with interrupts disabled.
1039 * This is exported for use by kexts.
1040 */
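/*
 * For example, intel_startCPU() above starts a slave with all running cpus
 * held in rendezvous and interrupts off:
 *
 *	mp_rendezvous_no_intrs(start_cpu, (void *) &start_info);
 */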
1041 void
1042 mp_rendezvous_no_intrs(
1043 void (*action_func)(void *),
1044 void *arg)
1045 {
1046 mp_rendezvous(setup_disable_intrs,
1047 action_func,
1048 teardown_restore_intrs,
1049 arg);
1050 }
1051
1052
1053 typedef struct {
1054 queue_chain_t link; /* queue linkage */
1055 void (*func)(void *, void *); /* routine to call */
1056 void *arg0; /* routine's 1st arg */
1057 void *arg1; /* routine's 2nd arg */
1058 cpumask_t *maskp; /* completion response mask */
1059 } mp_call_t;
1060
1061
1062 typedef struct {
1063 queue_head_t queue;
1064 decl_simple_lock_data(, lock);
1065 } mp_call_queue_t;
1066 #define MP_CPUS_CALL_BUFS_PER_CPU MAX_CPUS
1067 static mp_call_queue_t mp_cpus_call_freelist;
1068 static mp_call_queue_t mp_cpus_call_head[MAX_CPUS];
1069
1070 static inline boolean_t
1071 mp_call_head_lock(mp_call_queue_t *cqp)
1072 {
1073 boolean_t intrs_enabled;
1074
1075 intrs_enabled = ml_set_interrupts_enabled(FALSE);
1076 simple_lock(&cqp->lock, LCK_GRP_NULL);
1077
1078 return intrs_enabled;
1079 }
1080
1081 /*
1082 * Deliver an NMIPI to a set of processors to cause them to panic.
1083 */
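/*
 * For example, mp_safe_spin_lock() above NMIs the lock owner on a spinlock
 * acquisition timeout:
 *
 *	NMIPI_panic(cpu_to_cpumask(lsti->owner_cpu), SPINLOCK_TIMEOUT);
 */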
1084 void
1085 NMIPI_panic(cpumask_t cpu_mask, NMI_reason_t why)
1086 {
1087 unsigned int cpu;
1088 cpumask_t cpu_bit;
1089 uint64_t deadline;
1090
1091 NMIPI_enable(TRUE);
1092 NMI_panic_reason = why;
1093
1094 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
1095 if ((cpu_mask & cpu_bit) == 0) {
1096 continue;
1097 }
1098 cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
1099 cpu_NMI_interrupt(cpu);
1100 }
1101
1102 /* Wait (only so long) for NMI'ed cpus to respond */
1103 deadline = mach_absolute_time() + LockTimeOut;
1104 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
1105 if ((cpu_mask & cpu_bit) == 0) {
1106 continue;
1107 }
1108 while (!cpu_datap(cpu)->cpu_NMI_acknowledged &&
1109 mach_absolute_time() < deadline) {
1110 cpu_pause();
1111 }
1112 }
1113 }
1114
1115 #if MACH_ASSERT
1116 static inline boolean_t
1117 mp_call_head_is_locked(mp_call_queue_t *cqp)
1118 {
1119 return !ml_get_interrupts_enabled() &&
1120 hw_lock_held((hw_lock_t)&cqp->lock);
1121 }
1122 #endif
1123
1124 static inline void
1125 mp_call_head_unlock(mp_call_queue_t *cqp, boolean_t intrs_enabled)
1126 {
1127 simple_unlock(&cqp->lock);
1128 ml_set_interrupts_enabled(intrs_enabled);
1129 }
1130
1131 static inline mp_call_t *
1132 mp_call_alloc(void)
1133 {
1134 mp_call_t *callp = NULL;
1135 boolean_t intrs_enabled;
1136 mp_call_queue_t *cqp = &mp_cpus_call_freelist;
1137
1138 intrs_enabled = mp_call_head_lock(cqp);
1139 if (!queue_empty(&cqp->queue)) {
1140 queue_remove_first(&cqp->queue, callp, typeof(callp), link);
1141 }
1142 mp_call_head_unlock(cqp, intrs_enabled);
1143
1144 return callp;
1145 }
1146
1147 static inline void
1148 mp_call_free(mp_call_t *callp)
1149 {
1150 boolean_t intrs_enabled;
1151 mp_call_queue_t *cqp = &mp_cpus_call_freelist;
1152
1153 intrs_enabled = mp_call_head_lock(cqp);
1154 queue_enter_first(&cqp->queue, callp, typeof(callp), link);
1155 mp_call_head_unlock(cqp, intrs_enabled);
1156 }
1157
1158 static inline mp_call_t *
1159 mp_call_dequeue_locked(mp_call_queue_t *cqp)
1160 {
1161 mp_call_t *callp = NULL;
1162
1163 assert(mp_call_head_is_locked(cqp));
1164 if (!queue_empty(&cqp->queue)) {
1165 queue_remove_first(&cqp->queue, callp, typeof(callp), link);
1166 }
1167 return callp;
1168 }
1169
1170 static inline void
1171 mp_call_enqueue_locked(
1172 mp_call_queue_t *cqp,
1173 mp_call_t *callp)
1174 {
1175 queue_enter(&cqp->queue, callp, typeof(callp), link);
1176 }
1177
1178 /* Called on the boot processor to initialize global structures */
1179 static void
1180 mp_cpus_call_init(void)
1181 {
1182 mp_call_queue_t *cqp = &mp_cpus_call_freelist;
1183
1184 DBG("mp_cpus_call_init()\n");
1185 simple_lock_init(&cqp->lock, 0);
1186 queue_init(&cqp->queue);
1187 }
1188
1189 /*
1190 * Called at processor registration to add call buffers to the free list
1191 * and to initialize the per-cpu call queue.
1192 */
1193 void
1194 mp_cpus_call_cpu_init(int cpu)
1195 {
1196 int i;
1197 mp_call_queue_t *cqp = &mp_cpus_call_head[cpu];
1198 mp_call_t *callp;
1199
1200 simple_lock_init(&cqp->lock, 0);
1201 queue_init(&cqp->queue);
1202 for (i = 0; i < MP_CPUS_CALL_BUFS_PER_CPU; i++) {
1203 callp = zalloc_permanent_type(mp_call_t);
1204 mp_call_free(callp);
1205 }
1206
1207 DBG("mp_cpus_call_init(%d) done\n", cpu);
1208 }
1209
1210 /*
1211 * This is called from cpu_signal_handler() to process an MP_CALL signal.
1212 * And also from i386_deactivate_cpu() when a cpu is being taken offline.
1213 */
1214 static void
1215 mp_cpus_call_action(void)
1216 {
1217 mp_call_queue_t *cqp;
1218 boolean_t intrs_enabled;
1219 mp_call_t *callp;
1220 mp_call_t call;
1221
1222 assert(!ml_get_interrupts_enabled());
1223 cqp = &mp_cpus_call_head[cpu_number()];
1224 intrs_enabled = mp_call_head_lock(cqp);
1225 while ((callp = mp_call_dequeue_locked(cqp)) != NULL) {
1226 /* Copy call request to the stack to free buffer */
1227 call = *callp;
1228 mp_call_free(callp);
1229 if (call.func != NULL) {
1230 mp_call_head_unlock(cqp, intrs_enabled);
1231 KERNEL_DEBUG_CONSTANT(
1232 TRACE_MP_CPUS_CALL_ACTION,
1233 VM_KERNEL_UNSLIDE(call.func), VM_KERNEL_UNSLIDE_OR_PERM(call.arg0),
1234 VM_KERNEL_UNSLIDE_OR_PERM(call.arg1), VM_KERNEL_ADDRPERM(call.maskp), 0);
1235 call.func(call.arg0, call.arg1);
1236 (void) mp_call_head_lock(cqp);
1237 }
1238 if (call.maskp != NULL) {
1239 i_bit_set(cpu_number(), call.maskp);
1240 }
1241 }
1242 mp_call_head_unlock(cqp, intrs_enabled);
1243 }
1244
1245 /*
1246 * mp_cpus_call() runs a given function on cpus specified in a given cpu mask.
1247 * Possible modes are:
1248 * SYNC: function is called serially on target cpus in logical cpu order
1249 * waiting for each call to be acknowledged before proceeding
1250 * ASYNC: function call is queued to the specified cpus
1251 * waiting for all calls to complete in parallel before returning
1252 * NOSYNC: function calls are queued
1253 * but we return before confirmation of calls completing.
1254 * The action function may be NULL.
1255 * The cpu mask may include the local cpu. Offline cpus are ignored.
1256 * The return value is the number of cpus on which the call was made or queued.
1257 */
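/*
 * Illustrative uses from this file (sketch):
 *
 *	mp_cpus_call(CPUMASK_OTHERS, NOSYNC, &mp_rendezvous_action, NULL);
 *	mp_cpus_call(cpu_to_cpumask(master_cpu), ASYNC, timer_queue_expire_local, NULL);
 */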
1258 cpu_t
1259 mp_cpus_call(
1260 cpumask_t cpus,
1261 mp_sync_t mode,
1262 void (*action_func)(void *),
1263 void *arg)
1264 {
1265 return mp_cpus_call1(
1266 cpus,
1267 mode,
1268 (void (*)(void *, void *))action_func,
1269 arg,
1270 NULL,
1271 NULL);
1272 }
1273
1274 static void
1275 mp_cpus_call_wait(boolean_t intrs_enabled,
1276 cpumask_t cpus_called,
1277 cpumask_t *cpus_responded)
1278 {
1279 mp_call_queue_t *cqp;
1280 uint64_t tsc_spin_start;
1281
1282 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
1283 cqp = &mp_cpus_call_head[cpu_number()];
1284
1285 tsc_spin_start = rdtsc64();
1286 while (*cpus_responded != cpus_called) {
1287 if (!intrs_enabled) {
1288 /* Sniffing w/o locking */
1289 if (!queue_empty(&cqp->queue)) {
1290 mp_cpus_call_action();
1291 }
1292 cpu_signal_handler(NULL);
1293 }
1294 if (mp_spin_timeout(tsc_spin_start)) {
1295 cpumask_t cpus_unresponsive;
1296
1297 cpus_unresponsive = cpus_called & ~(*cpus_responded);
1298 NMIPI_panic(cpus_unresponsive, CROSSCALL_TIMEOUT);
1299 panic("mp_cpus_call_wait() timeout, cpus: 0x%llx",
1300 cpus_unresponsive);
1301 }
1302 }
1303 }
1304
1305 cpu_t
1306 mp_cpus_call1(
1307 cpumask_t cpus,
1308 mp_sync_t mode,
1309 void (*action_func)(void *, void *),
1310 void *arg0,
1311 void *arg1,
1312 cpumask_t *cpus_calledp)
1313 {
1314 cpu_t cpu = 0;
1315 boolean_t intrs_enabled = FALSE;
1316 boolean_t call_self = FALSE;
1317 cpumask_t cpus_called = 0;
1318 cpumask_t cpus_responded = 0;
1319 long cpus_call_count = 0;
1320 uint64_t tsc_spin_start;
1321 boolean_t topo_lock;
1322
1323 KERNEL_DEBUG_CONSTANT(
1324 TRACE_MP_CPUS_CALL | DBG_FUNC_START,
1325 cpus, mode, VM_KERNEL_UNSLIDE(action_func), VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1));
1326
1327 if (!smp_initialized) {
1328 if ((cpus & CPUMASK_SELF) == 0) {
1329 goto out;
1330 }
1331 if (action_func != NULL) {
1332 intrs_enabled = ml_set_interrupts_enabled(FALSE);
1333 action_func(arg0, arg1);
1334 ml_set_interrupts_enabled(intrs_enabled);
1335 }
1336 call_self = TRUE;
1337 goto out;
1338 }
1339
1340 /*
1341 * Queue the call for each non-local requested cpu.
1342 * This is performed under the topo lock to prevent changes to
1343 * cpus online state and to prevent concurrent rendezvouses --
1344 * although an exception is made if we're calling only the master
1345 * processor since that always remains active. Note: this exception
1346 * is expected for longterm timer nosync cross-calls to the master cpu.
1347 */
1348 mp_disable_preemption();
1349 intrs_enabled = ml_get_interrupts_enabled();
1350 topo_lock = (cpus != cpu_to_cpumask(master_cpu));
1351 if (topo_lock) {
1352 ml_set_interrupts_enabled(FALSE);
1353 (void) mp_safe_spin_lock(&x86_topo_lock);
1354 }
1355 for (cpu = 0; cpu < (cpu_t) real_ncpus; cpu++) {
1356 if (((cpu_to_cpumask(cpu) & cpus) == 0) ||
1357 !cpu_is_running(cpu)) {
1358 continue;
1359 }
1360 tsc_spin_start = rdtsc64();
1361 if (cpu == (cpu_t) cpu_number()) {
1362 /*
1363 * We don't IPI ourself and if calling asynchronously,
1364 * we defer our call until we have signalled all others.
1365 */
1366 call_self = TRUE;
1367 if (mode == SYNC && action_func != NULL) {
1368 KERNEL_DEBUG_CONSTANT(
1369 TRACE_MP_CPUS_CALL_LOCAL,
1370 VM_KERNEL_UNSLIDE(action_func),
1371 VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1), 0, 0);
1372 action_func(arg0, arg1);
1373 }
1374 } else {
1375 /*
1376 * Here to queue a call to cpu and IPI.
1377 */
1378 mp_call_t *callp = NULL;
1379 mp_call_queue_t *cqp = &mp_cpus_call_head[cpu];
1380 boolean_t intrs_inner;
1381
1382 queue_call:
1383 if (callp == NULL) {
1384 callp = mp_call_alloc();
1385 }
1386 intrs_inner = mp_call_head_lock(cqp);
1387 if (callp == NULL) {
1388 mp_call_head_unlock(cqp, intrs_inner);
1389 KERNEL_DEBUG_CONSTANT(
1390 TRACE_MP_CPUS_CALL_NOBUF,
1391 cpu, 0, 0, 0, 0);
1392 if (!intrs_inner) {
1393 /* Sniffing w/o locking */
1394 if (!queue_empty(&cqp->queue)) {
1395 mp_cpus_call_action();
1396 }
1397 handle_pending_TLB_flushes();
1398 }
1399 if (mp_spin_timeout(tsc_spin_start)) {
1400 panic("mp_cpus_call1() timeout start: 0x%llx, cur: 0x%llx",
1401 tsc_spin_start, rdtsc64());
1402 }
1403 goto queue_call;
1404 }
1405 callp->maskp = (mode == NOSYNC) ? NULL : &cpus_responded;
1406 callp->func = action_func;
1407 callp->arg0 = arg0;
1408 callp->arg1 = arg1;
1409 mp_call_enqueue_locked(cqp, callp);
1410 cpus_call_count++;
1411 cpus_called |= cpu_to_cpumask(cpu);
1412 i386_signal_cpu(cpu, MP_CALL, ASYNC);
1413 mp_call_head_unlock(cqp, intrs_inner);
1414 if (mode == SYNC) {
1415 mp_cpus_call_wait(intrs_inner, cpus_called, &cpus_responded);
1416 }
1417 }
1418 }
1419 if (topo_lock) {
1420 simple_unlock(&x86_topo_lock);
1421 ml_set_interrupts_enabled(intrs_enabled);
1422 }
1423
1424 /* Call locally if mode not SYNC */
1425 if (mode != SYNC && call_self) {
1426 KERNEL_DEBUG_CONSTANT(
1427 TRACE_MP_CPUS_CALL_LOCAL,
1428 VM_KERNEL_UNSLIDE(action_func), VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1), 0, 0);
1429 if (action_func != NULL) {
1430 ml_set_interrupts_enabled(FALSE);
1431 action_func(arg0, arg1);
1432 ml_set_interrupts_enabled(intrs_enabled);
1433 }
1434 }
1435
1436 /* For ASYNC, now wait for all signaled cpus to complete their calls */
1437 if (mode == ASYNC) {
1438 mp_cpus_call_wait(intrs_enabled, cpus_called, &cpus_responded);
1439 }
1440
1441 /* Safe to allow pre-emption now */
1442 mp_enable_preemption();
1443
1444 out:
1445 if (call_self) {
1446 cpus_called |= cpu_to_cpumask(cpu);
1447 cpus_call_count++;
1448 }
1449
1450 if (cpus_calledp) {
1451 *cpus_calledp = cpus_called;
1452 }
1453
1454 KERNEL_DEBUG_CONSTANT(
1455 TRACE_MP_CPUS_CALL | DBG_FUNC_END,
1456 cpus_call_count, cpus_called, 0, 0, 0);
1457
1458 return (cpu_t) cpus_call_count;
1459 }
1460
1461
1462 static void
1463 mp_broadcast_action(__unused void *null)
1464 {
1465 /* call action function */
1466 if (mp_bc_action_func != NULL) {
1467 mp_bc_action_func(mp_bc_func_arg);
1468 }
1469
1470 /* if we're the last one through, wake up the instigator */
1471 if (atomic_decl_and_test(&mp_bc_count, 1)) {
1472 thread_wakeup(((event_t)(uintptr_t) &mp_bc_count));
1473 }
1474 }
1475
1476 /*
1477 * mp_broadcast() runs a given function on all active cpus.
1478 * The caller blocks until the function has run on all cpus.
1479 * The caller will also block if there is another pending broadcast.
1480 */
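/*
 * Illustrative use (a sketch; update_action and its argument are
 * hypothetical):
 *
 *	mp_broadcast(update_action, &update_args);
 *
 * returns only after update_action() has run once on every active cpu.
 */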
1481 void
1482 mp_broadcast(
1483 void (*action_func)(void *),
1484 void *arg)
1485 {
1486 if (!smp_initialized) {
1487 if (action_func != NULL) {
1488 action_func(arg);
1489 }
1490 return;
1491 }
1492
1493 /* obtain broadcast lock */
1494 lck_mtx_lock(&mp_bc_lock);
1495
1496 /* set static function pointers */
1497 mp_bc_action_func = action_func;
1498 mp_bc_func_arg = arg;
1499
1500 assert_wait((event_t)(uintptr_t)&mp_bc_count, THREAD_UNINT);
1501
1502 /*
1503 * signal other processors, which will call mp_broadcast_action()
1504 */
1505 mp_bc_count = real_ncpus; /* assume max possible active */
1506 mp_bc_ncpus = mp_cpus_call(CPUMASK_ALL, NOSYNC, *mp_broadcast_action, NULL);
1507 atomic_decl(&mp_bc_count, real_ncpus - mp_bc_ncpus); /* subtract inactive */
1508
1509 /* block for other cpus to have run action_func */
1510 if (mp_bc_ncpus > 1) {
1511 thread_block(THREAD_CONTINUE_NULL);
1512 } else {
1513 clear_wait(current_thread(), THREAD_AWAKENED);
1514 }
1515
1516 /* release lock */
1517 lck_mtx_unlock(&mp_bc_lock);
1518 }
1519
1520 void
1521 mp_cpus_kick(cpumask_t cpus)
1522 {
1523 cpu_t cpu;
1524 boolean_t intrs_enabled = FALSE;
1525
1526 intrs_enabled = ml_set_interrupts_enabled(FALSE);
1527 mp_safe_spin_lock(&x86_topo_lock);
1528
1529 for (cpu = 0; cpu < (cpu_t) real_ncpus; cpu++) {
1530 if (((cpu_to_cpumask(cpu) & cpus) == 0)
1531 || !cpu_is_running(cpu)) {
1532 continue;
1533 }
1534
1535 lapic_send_ipi(cpu, LAPIC_VECTOR(KICK));
1536 }
1537
1538 simple_unlock(&x86_topo_lock);
1539 ml_set_interrupts_enabled(intrs_enabled);
1540 }
1541
1542 void
1543 i386_activate_cpu(void)
1544 {
1545 cpu_data_t *cdp = current_cpu_datap();
1546
1547 assert(!ml_get_interrupts_enabled());
1548
1549 if (!smp_initialized) {
1550 cdp->cpu_running = TRUE;
1551 return;
1552 }
1553
1554 mp_safe_spin_lock(&x86_topo_lock);
1555 cdp->cpu_running = TRUE;
1556 started_cpu();
1557 pmap_tlbi_range(0, ~0ULL, true, 0);
1558 simple_unlock(&x86_topo_lock);
1559 }
1560
1561 void
1562 i386_deactivate_cpu(void)
1563 {
1564 cpu_data_t *cdp = current_cpu_datap();
1565
1566 assert(!ml_get_interrupts_enabled());
1567
1568 KERNEL_DEBUG_CONSTANT(
1569 TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_START,
1570 0, 0, 0, 0, 0);
1571
1572 mp_safe_spin_lock(&x86_topo_lock);
1573 cdp->cpu_running = FALSE;
1574 simple_unlock(&x86_topo_lock);
1575
1576 /*
1577 * Move all of this cpu's timers to the master/boot cpu,
1578 * and poke it in case there's a sooner deadline for it to schedule.
1579 */
1580 timer_queue_shutdown(&cdp->rtclock_timer.queue);
1581 mp_cpus_call(cpu_to_cpumask(master_cpu), ASYNC, timer_queue_expire_local, NULL);
1582
1583 #if MONOTONIC
1584 mt_cpu_down(cdp);
1585 #endif /* MONOTONIC */
1586 #if KPERF
1587 kptimer_stop_curcpu();
1588 #endif /* KPERF */
1589
1590 /*
1591 * Open an interrupt window
1592 * and ensure any pending IPI or timer is serviced
1593 */
1594 mp_disable_preemption();
1595 ml_set_interrupts_enabled(TRUE);
1596
1597 while (cdp->cpu_signals && x86_lcpu()->rtcDeadline != EndOfAllTime) {
1598 cpu_pause();
1599 }
1600 /*
1601 * Ensure there's no remaining timer deadline set
1602 * - AICPM may have left one active.
1603 */
1604 setPop(0);
1605
1606 ml_set_interrupts_enabled(FALSE);
1607 mp_enable_preemption();
1608
1609 KERNEL_DEBUG_CONSTANT(
1610 TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_END,
1611 0, 0, 0, 0, 0);
1612 }
1613
1614 int pmsafe_debug = 1;
1615
1616 #if MACH_KDP
1617 volatile boolean_t mp_kdp_trap = FALSE;
1618 volatile boolean_t mp_kdp_is_NMI = FALSE;
1619 volatile unsigned long mp_kdp_ncpus;
1620 boolean_t mp_kdp_state;
1621
1622
1623 void
1624 mp_kdp_enter(boolean_t proceed_on_failure)
1625 {
1626 unsigned int cpu;
1627 unsigned int ncpus = 0;
1628 unsigned int my_cpu;
1629 uint64_t tsc_timeout;
1630
1631 DBG("mp_kdp_enter()\n");
1632
1633 /*
1634 * Here to enter the debugger.
1635 * In case of races, only one cpu is allowed to enter kdp after
1636 * stopping others.
1637 */
1638 mp_kdp_state = ml_set_interrupts_enabled(FALSE);
1639 my_cpu = cpu_number();
1640
1641 if (my_cpu == (unsigned) debugger_cpu) {
1642 kprintf("\n\nRECURSIVE DEBUGGER ENTRY DETECTED\n\n");
1643 kdp_reset();
1644 return;
1645 }
1646
1647 uint64_t start_time = cpu_datap(my_cpu)->debugger_entry_time = mach_absolute_time();
1648 int locked = 0;
1649 while (!locked || mp_kdp_trap) {
1650 if (locked) {
1651 simple_unlock(&x86_topo_lock);
1652 }
1653 if (proceed_on_failure) {
1654 if (mach_absolute_time() - start_time > 500000000ll) {
1655 paniclog_append_noflush("mp_kdp_enter() can't get x86_topo_lock! Debugging anyway! #YOLO\n");
1656 break;
1657 }
1658 locked = simple_lock_try(&x86_topo_lock, LCK_GRP_NULL);
1659 if (!locked) {
1660 cpu_pause();
1661 }
1662 } else {
1663 mp_safe_spin_lock(&x86_topo_lock);
1664 locked = TRUE;
1665 }
1666
1667 if (locked && mp_kdp_trap) {
1668 simple_unlock(&x86_topo_lock);
1669 DBG("mp_kdp_enter() race lost\n");
1670 #if MACH_KDP
1671 mp_kdp_wait(TRUE, FALSE);
1672 #endif
1673 locked = FALSE;
1674 }
1675 }
1676
1677 if (pmsafe_debug && !kdp_snapshot) {
1678 pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
1679 }
1680
1681 debugger_cpu = my_cpu;
1682 ncpus = 1;
1683 atomic_incl((volatile long *)&mp_kdp_ncpus, 1);
1684 mp_kdp_trap = TRUE;
1685 debugger_entry_time = cpu_datap(my_cpu)->debugger_entry_time;
1686
1687 /*
1688 * Deliver a nudge to other cpus, counting how many
1689 */
1690 DBG("mp_kdp_enter() signaling other processors\n");
1691 if (force_immediate_debugger_NMI == FALSE) {
1692 for (cpu = 0; cpu < real_ncpus; cpu++) {
1693 if (cpu == my_cpu || !cpu_is_running(cpu)) {
1694 continue;
1695 }
1696 ncpus++;
1697 i386_signal_cpu(cpu, MP_KDP, ASYNC);
1698 }
1699 /*
1700 * Wait for other processors to synchronize
1701 */
1702 DBG("mp_kdp_enter() waiting for (%d) processors to suspend\n", ncpus);
1703
1704 /*
1705 * This timeout is rather arbitrary; we don't want to NMI
1706 * processors that are executing at potentially
1707 * "unsafe-to-interrupt" points such as the trampolines,
1708 * but neither do we want to lose state by waiting too long.
1709 */
1710 tsc_timeout = rdtsc64() + (LockTimeOutTSC);
1711
1712 while (mp_kdp_ncpus != ncpus && rdtsc64() < tsc_timeout) {
1713 /*
1714 * A TLB shootdown request may be pending--this would
1715 * result in the requesting processor waiting in
1716 * PMAP_UPDATE_TLBS() until this processor deals with it.
1717 * Process it, so it can now enter mp_kdp_wait()
1718 */
1719 handle_pending_TLB_flushes();
1720 cpu_pause();
1721 }
1722 /* If we've timed out, and some processor(s) are still unresponsive,
1723 * interrupt them with an NMI via the local APIC, iff a panic is
1724 * in progress.
1725 */
1726 if (panic_active()) {
1727 NMIPI_enable(TRUE);
1728 }
1729 if (mp_kdp_ncpus != ncpus) {
1730 unsigned int wait_cycles = 0;
1731 if (proceed_on_failure) {
1732 paniclog_append_noflush("mp_kdp_enter() timed-out on cpu %d, NMI-ing\n", my_cpu);
1733 } else {
1734 DBG("mp_kdp_enter() timed-out on cpu %d, NMI-ing\n", my_cpu);
1735 }
1736 for (cpu = 0; cpu < real_ncpus; cpu++) {
1737 if (cpu == my_cpu || !cpu_is_running(cpu)) {
1738 continue;
1739 }
1740 if (cpu_signal_pending(cpu, MP_KDP)) {
1741 cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
1742 cpu_NMI_interrupt(cpu);
1743 }
1744 }
1745 /* Wait again for the same timeout */
1746 tsc_timeout = rdtsc64() + (LockTimeOutTSC);
1747 while (mp_kdp_ncpus != ncpus && rdtsc64() < tsc_timeout) {
1748 handle_pending_TLB_flushes();
1749 cpu_pause();
1750 ++wait_cycles;
1751 }
1752 if (mp_kdp_ncpus != ncpus) {
1753 paniclog_append_noflush("mp_kdp_enter() NMI pending on cpus:");
1754 for (cpu = 0; cpu < real_ncpus; cpu++) {
1755 if (cpu_is_running(cpu) && !cpu_datap(cpu)->cpu_NMI_acknowledged) {
1756 paniclog_append_noflush(" %d", cpu);
1757 }
1758 }
1759 paniclog_append_noflush("\n");
1760 if (proceed_on_failure) {
1761 paniclog_append_noflush("mp_kdp_enter() timed-out during %s wait after NMI;"
1762 "expected %u acks but received %lu after %u loops in %llu ticks\n",
1763 (locked ? "locked" : "unlocked"), ncpus, mp_kdp_ncpus, wait_cycles, LockTimeOutTSC);
1764 } else {
1765 panic("mp_kdp_enter() timed-out during %s wait after NMI;"
1766 "expected %u acks but received %lu after %u loops in %llu ticks",
1767 (locked ? "locked" : "unlocked"), ncpus, mp_kdp_ncpus, wait_cycles, LockTimeOutTSC);
1768 }
1769 }
1770 }
1771 } else if (NMI_panic_reason != PTE_CORRUPTION) { /* In the pte corruption case, the detecting CPU has already NMIed other CPUs */
1772 for (cpu = 0; cpu < real_ncpus; cpu++) {
1773 if (cpu == my_cpu || !cpu_is_running(cpu)) {
1774 continue;
1775 }
1776 cpu_NMI_interrupt(cpu);
1777 }
1778 }
1779
1780 if (locked) {
1781 simple_unlock(&x86_topo_lock);
1782 }
1783
1784 DBG("mp_kdp_enter() %d processors done %s\n",
1785 (int)mp_kdp_ncpus, (mp_kdp_ncpus == ncpus) ? "OK" : "timed out");
1786
1787 postcode(MP_KDP_ENTER);
1788 }
1789
1790 boolean_t
1791 mp_kdp_all_cpus_halted()
1792 {
1793 unsigned int ncpus = 0, cpu = 0, my_cpu = 0;
1794
1795 my_cpu = cpu_number();
1796 ncpus = 1; /* current CPU */
1797 for (cpu = 0; cpu < real_ncpus; cpu++) {
1798 if (cpu == my_cpu || !cpu_is_running(cpu)) {
1799 continue;
1800 }
1801 ncpus++;
1802 }
1803
1804 return mp_kdp_ncpus == ncpus;
1805 }
1806
1807 static boolean_t
1808 cpu_signal_pending(int cpu, mp_event_t event)
1809 {
1810 volatile int *signals = &cpu_datap(cpu)->cpu_signals;
1811 boolean_t retval = FALSE;
1812
1813 if (i_bit(event, signals)) {
1814 retval = TRUE;
1815 }
1816 return retval;
1817 }
1818
1819 long
1820 kdp_x86_xcpu_invoke(const uint16_t lcpu, kdp_x86_xcpu_func_t func,
1821 void *arg0, void *arg1, uint64_t timeout)
1822 {
1823 uint64_t now;
1824
1825 if (lcpu > (real_ncpus - 1)) {
1826 return -1;
1827 }
1828
1829 if (func == NULL) {
1830 return -1;
1831 }
1832
1833 kdp_xcpu_call_func.func = func;
1834 kdp_xcpu_call_func.ret = -1;
1835 kdp_xcpu_call_func.arg0 = arg0;
1836 kdp_xcpu_call_func.arg1 = arg1;
1837 kdp_xcpu_call_func.cpu = lcpu;
1838 DBG("Invoking function %p on CPU %d\n", func, (int32_t)lcpu);
1839 now = mach_absolute_time();
1840 while (kdp_xcpu_call_func.cpu != KDP_XCPU_NONE &&
1841 (timeout == 0 || (mach_absolute_time() - now) < timeout)) {
1842 cpu_pause();
1843 }
1844 return kdp_xcpu_call_func.ret;
1845 }
1846
1847 static void
1848 kdp_x86_xcpu_poll(void)
1849 {
1850 if ((uint16_t)cpu_number() == kdp_xcpu_call_func.cpu) {
1851 kdp_xcpu_call_func.ret =
1852 kdp_xcpu_call_func.func(kdp_xcpu_call_func.arg0,
1853 kdp_xcpu_call_func.arg1,
1854 cpu_number());
1855 kdp_xcpu_call_func.cpu = KDP_XCPU_NONE;
1856 }
1857 }
1858
1859 static void
1860 mp_kdp_wait(boolean_t flush, boolean_t isNMI)
1861 {
1862 DBG("mp_kdp_wait()\n");
1863
1864 current_cpu_datap()->debugger_ipi_time = mach_absolute_time();
1865 #if CONFIG_MCA
1866 /* If we've trapped due to a machine-check, save MCA registers */
1867 mca_check_save();
1868 #endif
1869
1870 atomic_incl((volatile long *)&mp_kdp_ncpus, 1);
1871 while (mp_kdp_trap || (isNMI == TRUE)) {
1872 /*
1873 * A TLB shootdown request may be pending--this would result
1874 * in the requesting processor waiting in PMAP_UPDATE_TLBS()
1875 * until this processor handles it.
1876 * Process it, so it can now enter mp_kdp_wait()
1877 */
1878 if (flush) {
1879 handle_pending_TLB_flushes();
1880 }
1881
1882 kdp_x86_xcpu_poll();
1883 cpu_pause();
1884 }
1885
1886 atomic_decl((volatile long *)&mp_kdp_ncpus, 1);
1887 DBG("mp_kdp_wait() done\n");
1888 }
1889
1890 void
1891 mp_kdp_exit(void)
1892 {
1893 DBG("mp_kdp_exit()\n");
1894 debugger_cpu = -1;
1895 atomic_decl((volatile long *)&mp_kdp_ncpus, 1);
1896
1897 debugger_exit_time = mach_absolute_time();
1898
1899 mp_kdp_trap = FALSE;
1900 mfence();
1901
1902 /* Wait for other processors to stop spinning. XXX needs timeout */
1903 DBG("mp_kdp_exit() waiting for processors to resume\n");
1904 while (mp_kdp_ncpus > 0) {
1905 /*
1906 * a TLB shootdown request may be pending... this would result in the requesting
1907 * processor waiting in PMAP_UPDATE_TLBS() until this processor deals with it.
1908 * Process it, so it can now enter mp_kdp_wait()
1909 */
1910 handle_pending_TLB_flushes();
1911
1912 cpu_pause();
1913 }
1914
1915 if (pmsafe_debug && !kdp_snapshot) {
1916 pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
1917 }
1918
1919 debugger_exit_time = mach_absolute_time();
1920
1921 DBG("mp_kdp_exit() done\n");
1922 (void) ml_set_interrupts_enabled(mp_kdp_state);
1923 postcode(MP_KDP_EXIT);
1924 }
1925
1926 #endif /* MACH_KDP */
1927
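/*
 * TRUE if the debugger was entered or exited within the last
 * LastDebuggerEntryAllowance of absolute time.
 */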
1928 boolean_t
1929 mp_recent_debugger_activity(void)
1930 {
1931 uint64_t abstime = mach_absolute_time();
1932 return ((abstime - debugger_entry_time) < LastDebuggerEntryAllowance) ||
1933 ((abstime - debugger_exit_time) < LastDebuggerEntryAllowance);
1934 }
1935
1936 /*ARGSUSED*/
1937 void
1938 init_ast_check(
1939 __unused processor_t processor)
1940 {
1941 }
1942
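/*
 * Interrupt another processor so that it checks for a pending AST;
 * a request targeting the calling CPU is ignored.
 */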
1943 void
1944 cause_ast_check(
1945 processor_t processor)
1946 {
1947 int cpu = processor->cpu_id;
1948
1949 if (cpu != cpu_number()) {
1950 i386_signal_cpu(cpu, MP_AST, ASYNC);
1951 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), cpu, 1, 0, 0, 0);
1952 }
1953 }
1954
1955 void
1956 slave_machine_init(void *param)
1957 {
1958 /*
1959 * Here in process context, but with interrupts disabled.
1960 */
1961 DBG("slave_machine_init() CPU%d\n", get_cpu_number());
1962
1963 if (param == FULL_SLAVE_INIT) {
1964 /*
1965 * Cold start
1966 */
1967 clock_init();
1968 }
1969 cpu_machine_init(); /* Interrupts enabled hereafter */
1970 }
1971
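/*
 * cpu_number() is presumably shadowed by a macro in the per-CPU headers;
 * undefine it so the out-of-line version below (and the per-CPU base
 * accessors that follow) can be provided for external callers.
 */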
1972 #undef cpu_number
1973 int
1974 cpu_number(void)
1975 {
1976 return get_cpu_number();
1977 }
1978
1979 vm_offset_t
1980 current_percpu_base(void)
1981 {
1982 return get_current_percpu_base();
1983 }
1984
1985 vm_offset_t
1986 other_percpu_base(int cpu)
1987 {
1988 return cpu_datap(cpu)->cpu_pcpu_base;
1989 }
1990
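/*
 * Set up the pool of timer calls used by ml_interrupt_prewarm():
 * initialize the lock and free list and seed it with
 * NUM_CPU_WARM_CALLS entries.
 */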
1991 static void
1992 cpu_prewarm_init()
1993 {
1994 int i;
1995
1996 simple_lock_init(&cpu_warm_lock, 0);
1997 queue_init(&cpu_warm_call_list);
1998 for (i = 0; i < NUM_CPU_WARM_CALLS; i++) {
1999 enqueue_head(&cpu_warm_call_list, (queue_entry_t)&cpu_warm_call_arr[i]);
2000 }
2001 }
2002
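/* Pop a timer call from the warm-call free list; NULL if the pool is empty. */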
2003 static timer_call_t
2004 grab_warm_timer_call()
2005 {
2006 spl_t x;
2007 timer_call_t call = NULL;
2008
2009 x = splsched();
2010 simple_lock(&cpu_warm_lock, LCK_GRP_NULL);
2011 if (!queue_empty(&cpu_warm_call_list)) {
2012 call = (timer_call_t) dequeue_head(&cpu_warm_call_list);
2013 }
2014 simple_unlock(&cpu_warm_lock);
2015 splx(x);
2016
2017 return call;
2018 }
2019
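/* Return a timer call to the warm-call free list. */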
2020 static void
2021 free_warm_timer_call(timer_call_t call)
2022 {
2023 spl_t x;
2024
2025 x = splsched();
2026 simple_lock(&cpu_warm_lock, LCK_GRP_NULL);
2027 enqueue_head(&cpu_warm_call_list, (queue_entry_t)call);
2028 simple_unlock(&cpu_warm_lock);
2029 splx(x);
2030 }
2031
2032 /*
2033 * Runs in timer call context (interrupts disabled).
2034 */
2035 static void
2036 cpu_warm_timer_call_func(
2037 timer_call_param_t p0,
2038 __unused timer_call_param_t p1)
2039 {
2040 free_warm_timer_call((timer_call_t)p0);
2041 return;
2042 }
2043
2044 /*
2045 * Runs with interrupts disabled on the CPU we wish to warm (i.e. CPU 0).
2046 */
2047 static void
2048 _cpu_warm_setup(
2049 void *arg)
2050 {
2051 cpu_warm_data_t cwdp = (cpu_warm_data_t)arg;
2052
2053 timer_call_enter(cwdp->cwd_call, cwdp->cwd_deadline, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL);
2054 cwdp->cwd_result = 0;
2055
2056 return;
2057 }
2058
2059 /*
2060 * Not safe to call with interrupts disabled.
2061 */
2062 kern_return_t
2063 ml_interrupt_prewarm(
2064 uint64_t deadline)
2065 {
2066 struct cpu_warm_data cwd;
2067 timer_call_t call;
2068 cpu_t ct;
2069
2070 if (ml_get_interrupts_enabled() == FALSE) {
2071 panic("%s: Interrupts disabled?", __FUNCTION__);
2072 }
2073
2074 /*
2075 * If the platform doesn't need our help, say that we succeeded.
2076 */
2077 if (!ml_get_interrupt_prewake_applicable()) {
2078 return KERN_SUCCESS;
2079 }
2080
2081 /*
2082 * Grab a timer call to use.
2083 */
2084 call = grab_warm_timer_call();
2085 if (call == NULL) {
2086 return KERN_RESOURCE_SHORTAGE;
2087 }
2088
2089 timer_call_setup(call, cpu_warm_timer_call_func, call);
2090 cwd.cwd_call = call;
2091 cwd.cwd_deadline = deadline;
2092 cwd.cwd_result = 0;
2093
2094 /*
2095 * For now, non-local interrupts happen on the master processor.
2096 */
2097 ct = mp_cpus_call(cpu_to_cpumask(master_cpu), SYNC, _cpu_warm_setup, &cwd);
2098 if (ct == 0) {
2099 free_warm_timer_call(call);
2100 return KERN_FAILURE;
2101 } else {
2102 return cwd.cwd_result;
2103 }
2104 }
2105
2106 #if DEBUG || DEVELOPMENT
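/*
 * Test hook: spin uninterruptibly for spin_ns nanoseconds while posing as
 * a CPU stuck in an interrupt handler, so that mp_interrupt_watchdog()
 * can be exercised.
 */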
2107 void
2108 kernel_spin(uint64_t spin_ns)
2109 {
2110 boolean_t istate;
2111 uint64_t spin_abs;
2112 uint64_t deadline;
2113 cpu_data_t *cdp;
2114
2115 kprintf("kernel_spin(%llu) spinning uninterruptibly\n", spin_ns);
2116 istate = ml_set_interrupts_enabled(FALSE);
2117 cdp = current_cpu_datap();
2118 nanoseconds_to_absolutetime(spin_ns, &spin_abs);
2119
2120 /* Fake interrupt handler entry for testing mp_interrupt_watchdog() */
2121 cdp->cpu_int_event_time = mach_absolute_time();
2122 cdp->cpu_int_state = (void *) USER_STATE(current_thread());
2123
2124 deadline = mach_absolute_time() + spin_abs;
2125 while (mach_absolute_time() < deadline) {
2126 cpu_pause();
2127 }
2128
2129 cdp->cpu_int_event_time = 0;
2130 cdp->cpu_int_state = NULL;
2131
2132 ml_set_interrupts_enabled(istate);
2133 kprintf("kernel_spin() continuing\n");
2134 }
2135
2136 /*
2137 * Called from the scheduler's maintenance thread,
2138 * scan running processors for long-running ISRs and:
2139 * - panic if longer than LockTimeOut, or
2140 * - log if more than a quantum.
2141 */
2142 void
2143 mp_interrupt_watchdog(void)
2144 {
2145 cpu_t cpu;
2146 boolean_t intrs_enabled = FALSE;
2147 uint16_t cpu_int_num;
2148 uint64_t cpu_int_event_time;
2149 uint64_t cpu_rip;
2150 uint64_t cpu_int_duration;
2151 uint64_t now;
2152 x86_saved_state_t *cpu_int_state;
2153
2154 if (__improbable(!mp_interrupt_watchdog_enabled)) {
2155 return;
2156 }
2157
2158 intrs_enabled = ml_set_interrupts_enabled(FALSE);
2159 now = mach_absolute_time();
2160 /*
2161 * While timeouts are not suspended,
2162 * check all other processors for long outstanding interrupt handling.
2163 */
2164 for (cpu = 0;
2165 cpu < (cpu_t) real_ncpus && !machine_timeout_suspended();
2166 cpu++) {
2167 if ((cpu == (cpu_t) cpu_number()) ||
2168 (!cpu_is_running(cpu))) {
2169 continue;
2170 }
2171 cpu_int_event_time = cpu_datap(cpu)->cpu_int_event_time;
2172 if (cpu_int_event_time == 0) {
2173 continue;
2174 }
2175 if (__improbable(now < cpu_int_event_time)) {
2176 continue; /* skip due to inter-processor skew */
2177 }
2178 cpu_int_state = cpu_datap(cpu)->cpu_int_state;
2179 if (__improbable(cpu_int_state == NULL)) {
2180 /* The interrupt may have been dismissed */
2181 continue;
2182 }
2183
2184 /* Here with a cpu handling an interrupt */
2185
2186 cpu_int_duration = now - cpu_int_event_time;
2187 if (__improbable(cpu_int_duration > LockTimeOut)) {
2188 cpu_int_num = saved_state64(cpu_int_state)->isf.trapno;
2189 cpu_rip = saved_state64(cpu_int_state)->isf.rip;
2190 vector_timed_out = cpu_int_num;
2191 NMIPI_panic(cpu_to_cpumask(cpu), INTERRUPT_WATCHDOG);
2192 panic("Interrupt watchdog, "
2193 "cpu: %d interrupt: 0x%x time: %llu..%llu state: %p RIP: 0x%llx",
2194 cpu, cpu_int_num, cpu_int_event_time, now, cpu_int_state, cpu_rip);
2195 /* NOT REACHED */
2196 } else if (__improbable(cpu_int_duration > (uint64_t) std_quantum)) {
2197 mp_interrupt_watchdog_events++;
2198 cpu_int_num = saved_state64(cpu_int_state)->isf.trapno;
2199 cpu_rip = saved_state64(cpu_int_state)->isf.rip;
2200 ml_set_interrupts_enabled(intrs_enabled);
2201 printf("Interrupt watchdog, "
2202 "cpu: %d interrupt: 0x%x time: %llu..%llu RIP: 0x%llx\n",
2203 cpu, cpu_int_num, cpu_int_event_time, now, cpu_rip);
2204 return;
2205 }
2206 }
2207
2208 ml_set_interrupts_enabled(intrs_enabled);
2209 }
2210 #endif
2211