1 /*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31
32 #include <mach_kdp.h>
33 #include <kdp/kdp_internal.h>
34 #include <mach_ldebug.h>
35
36 #include <mach/mach_types.h>
37 #include <mach/kern_return.h>
38
39 #include <kern/kern_types.h>
40 #include <kern/startup.h>
41 #include <kern/timer_queue.h>
42 #include <kern/processor.h>
43 #include <kern/cpu_number.h>
44 #include <kern/cpu_data.h>
45 #include <kern/assert.h>
46 #include <kern/lock_group.h>
47 #include <kern/machine.h>
48 #include <kern/pms.h>
49 #include <kern/misc_protos.h>
50 #include <kern/timer_call.h>
51 #include <kern/zalloc.h>
52 #include <kern/queue.h>
53 #include <prng/random.h>
54
55 #include <vm/vm_map.h>
56 #include <vm/vm_kern.h>
57
58 #include <i386/bit_routines.h>
59 #include <i386/proc_reg.h>
60 #include <i386/cpu_threads.h>
61 #include <i386/mp_desc.h>
62 #include <i386/misc_protos.h>
63 #include <i386/trap.h>
64 #include <i386/postcode.h>
65 #include <i386/machine_routines.h>
66 #include <i386/mp.h>
67 #include <i386/mp_events.h>
68 #include <i386/lapic.h>
69 #include <i386/cpuid.h>
70 #include <i386/fpu.h>
71 #include <i386/machine_cpu.h>
72 #include <i386/pmCPU.h>
73 #if CONFIG_MCA
74 #include <i386/machine_check.h>
75 #endif
76 #include <i386/acpi.h>
77
78 #include <sys/kdebug.h>
79
80 #include <console/serial_protos.h>
81
82 #if MONOTONIC
83 #include <kern/monotonic.h>
84 #endif /* MONOTONIC */
85
86 #if KPERF
87 #include <kperf/kptimer.h>
88 #endif /* KPERF */
89
90 #if MP_DEBUG
91 #define PAUSE delay(1000000)
92 #define DBG(x...) kprintf(x)
93 #else
94 #define DBG(x...)
95 #define PAUSE
96 #endif /* MP_DEBUG */
97
98 /* Debugging/test trace events: */
99 #define TRACE_MP_TLB_FLUSH MACHDBG_CODE(DBG_MACH_MP, 0)
100 #define TRACE_MP_CPUS_CALL MACHDBG_CODE(DBG_MACH_MP, 1)
101 #define TRACE_MP_CPUS_CALL_LOCAL MACHDBG_CODE(DBG_MACH_MP, 2)
102 #define TRACE_MP_CPUS_CALL_ACTION MACHDBG_CODE(DBG_MACH_MP, 3)
103 #define TRACE_MP_CPUS_CALL_NOBUF MACHDBG_CODE(DBG_MACH_MP, 4)
104 #define TRACE_MP_CPU_FAST_START MACHDBG_CODE(DBG_MACH_MP, 5)
105 #define TRACE_MP_CPU_START MACHDBG_CODE(DBG_MACH_MP, 6)
106 #define TRACE_MP_CPU_DEACTIVATE MACHDBG_CODE(DBG_MACH_MP, 7)
107
108 #define ABS(v) (((v) > 0)?(v):-(v))
109
110 void slave_boot_init(void);
111 void i386_cpu_IPI(int cpu);
112
113 #if MACH_KDP
114 static void mp_kdp_wait(boolean_t flush, boolean_t isNMI);
115 #endif /* MACH_KDP */
116
117 #if MACH_KDP
118 static boolean_t cpu_signal_pending(int cpu, mp_event_t event);
119 #endif /* MACH_KDP */
120 static int NMIInterruptHandler(x86_saved_state_t *regs);
121
122 boolean_t smp_initialized = FALSE;
123 uint32_t TSC_sync_margin = 0xFFF;
124 volatile boolean_t force_immediate_debugger_NMI = FALSE;
125 volatile boolean_t pmap_tlb_flush_timeout = FALSE;
126 #if DEBUG || DEVELOPMENT
127 boolean_t mp_interrupt_watchdog_enabled = TRUE;
128 uint32_t mp_interrupt_watchdog_events = 0;
129 #endif
130
131 SIMPLE_LOCK_DECLARE(debugger_callback_lock, 0);
132 struct debugger_callback *debugger_callback = NULL;
133
134 static LCK_GRP_DECLARE(smp_lck_grp, "i386_smp");
135 static LCK_MTX_DECLARE(mp_cpu_boot_lock, &smp_lck_grp);
136
137 /* Variables needed for MP rendezvous. */
138 SIMPLE_LOCK_DECLARE(mp_rv_lock, 0);
139 static void (*mp_rv_setup_func)(void *arg);
140 static void (*mp_rv_action_func)(void *arg);
141 static void (*mp_rv_teardown_func)(void *arg);
142 static void *mp_rv_func_arg;
143 static volatile int mp_rv_ncpus;
144 /* Cache-aligned barriers: */
145 static volatile long mp_rv_entry __attribute__((aligned(64)));
146 static volatile long mp_rv_exit __attribute__((aligned(64)));
147 static volatile long mp_rv_complete __attribute__((aligned(64)));
148
149 volatile uint64_t debugger_entry_time;
150 volatile uint64_t debugger_exit_time;
151 #if MACH_KDP
152 #include <kdp/kdp.h>
153 extern int kdp_snapshot;
154 static struct _kdp_xcpu_call_func {
155 kdp_x86_xcpu_func_t func;
156 void *arg0, *arg1;
157 volatile long ret;
158 volatile uint16_t cpu;
159 } kdp_xcpu_call_func = {
160 .cpu = KDP_XCPU_NONE
161 };
162
163 #endif
164
165 /* Variables needed for MP broadcast. */
166 static void (*mp_bc_action_func)(void *arg);
167 static void *mp_bc_func_arg;
168 static int mp_bc_ncpus;
169 static volatile long mp_bc_count;
170 static LCK_MTX_DECLARE(mp_bc_lock, &smp_lck_grp);
171 static volatile int debugger_cpu = -1;
172 volatile long NMIPI_acks = 0;
173 volatile long NMI_count = 0;
174 static int vector_timed_out;
175
176 NMI_reason_t NMI_panic_reason = NONE;
177 extern void NMI_cpus(void);
178
179 static void mp_cpus_call_init(void);
180 static void mp_cpus_call_action(void);
181 static void mp_call_PM(void);
182
183 char mp_slave_stack[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); // Temp stack for slave init
184
185 /* PAL-related routines */
186 boolean_t i386_smp_init(int nmi_vector, i386_intr_func_t nmi_handler,
187 int ipi_vector, i386_intr_func_t ipi_handler);
188 void i386_start_cpu(int lapic_id, int cpu_num);
189 void i386_send_NMI(int cpu);
190 void NMIPI_enable(boolean_t);
191
192 #define NUM_CPU_WARM_CALLS 20
193 struct timer_call cpu_warm_call_arr[NUM_CPU_WARM_CALLS];
194 queue_head_t cpu_warm_call_list;
195 decl_simple_lock_data(static, cpu_warm_lock);
196
197 typedef struct cpu_warm_data {
198 timer_call_t cwd_call;
199 uint64_t cwd_deadline;
200 int cwd_result;
201 } *cpu_warm_data_t;
202
203 static void cpu_prewarm_init(void);
204 static void cpu_warm_timer_call_func(timer_call_param_t p0, timer_call_param_t p1);
205 static void _cpu_warm_setup(void *arg);
206 static timer_call_t grab_warm_timer_call(void);
207 static void free_warm_timer_call(timer_call_t call);
208
/*
 * Boot-time SMP initialization, called on the master cpu.
 * Registers the NMI and inter-processor interrupt handlers, initializes
 * the cross-call machinery, and consumes tuning boot-args before
 * marking SMP as up.
 */
void
smp_init(void)
{
	console_init();

	if (!i386_smp_init(LAPIC_NMI_INTERRUPT, NMIInterruptHandler,
	    LAPIC_VECTOR(INTERPROCESSOR), cpu_signal_handler)) {
		/* Platform layer could not install SMP interrupts; stay UP. */
		return;
	}

	cpu_thread_init();

	DBGLOG_CPU_INIT(master_cpu);

	mp_cpus_call_init();
	mp_cpus_call_cpu_init(master_cpu);

#if DEBUG || DEVELOPMENT
	if (PE_parse_boot_argn("interrupt_watchdog",
	    &mp_interrupt_watchdog_enabled,
	    sizeof(mp_interrupt_watchdog_enabled))) {
		kprintf("Interrupt watchdog %sabled\n",
		    mp_interrupt_watchdog_enabled ? "en" : "dis");
	}
#endif

	if (PE_parse_boot_argn("TSC_sync_margin",
	    &TSC_sync_margin, sizeof(TSC_sync_margin))) {
		kprintf("TSC sync Margin 0x%x\n", TSC_sync_margin);
	} else if (cpuid_vmm_present()) {
		/* Virtualized TSCs can legitimately diverge; skip the check. */
		kprintf("TSC sync margin disabled\n");
		TSC_sync_margin = 0;
	}
	smp_initialized = TRUE;

	cpu_prewarm_init();

	return;
}
248
/* Parameters shared between the starter cpu and the cpu being started. */
typedef struct {
	int             target_cpu;     /* slot number of the cpu being started */
	int             target_lapic;   /* its local APIC id */
	int             starter_cpu;    /* cpu driving the startup sequence */
} processor_start_info_t;
static processor_start_info_t   start_info __attribute__((aligned(64)));

/*
 * Cache-alignment is to avoid cross-cpu false-sharing interference.
 */
static volatile long            tsc_entry_barrier __attribute__((aligned(64)));
static volatile long            tsc_exit_barrier __attribute__((aligned(64)));
static volatile uint64_t        tsc_target __attribute__((aligned(64)));
262
263 /*
264 * Poll a CPU to see when it has marked itself as running.
265 */
266 static void
mp_wait_for_cpu_up(int slot_num,unsigned int iters,unsigned int usecdelay)267 mp_wait_for_cpu_up(int slot_num, unsigned int iters, unsigned int usecdelay)
268 {
269 while (iters-- > 0) {
270 if (cpu_datap(slot_num)->cpu_running) {
271 break;
272 }
273 delay(usecdelay);
274 }
275 }
276
/*
 * Quickly bring a CPU back online which has been halted.
 * Returns KERN_SUCCESS if the cpu came back up, the pmCPUExitHalt()
 * error if it was not eligible for a fast restart, or KERN_FAILURE
 * (caller should take the slow path) if it never marked itself running.
 */
kern_return_t
intel_startCPU_fast(int slot_num)
{
	kern_return_t   rc;

	/*
	 * Try to perform a fast restart
	 */
	rc = pmCPUExitHalt(slot_num);
	if (rc != KERN_SUCCESS) {
		/*
		 * The CPU was not eligible for a fast restart.
		 */
		return rc;
	}

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_FAST_START | DBG_FUNC_START,
		slot_num, 0, 0, 0, 0);

	/*
	 * Wait until the CPU is back online.
	 */
	mp_disable_preemption();

	/*
	 * We use short pauses (1us) for low latency. 30,000 iterations is
	 * longer than a full restart would require so it should be more
	 * than long enough.
	 */

	mp_wait_for_cpu_up(slot_num, 30000, 1);
	mp_enable_preemption();

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_FAST_START | DBG_FUNC_END,
		slot_num, cpu_datap(slot_num)->cpu_running, 0, 0, 0);

	/*
	 * Check to make sure that the CPU is really running. If not,
	 * go through the slow path.
	 */
	if (cpu_datap(slot_num)->cpu_running) {
		return KERN_SUCCESS;
	} else {
		return KERN_FAILURE;
	}
}
328
/*
 * Run on a newly-started cpu once cpu_running is set.
 * When the TSC sync check is enabled, rendezvous with the starter cpu
 * at tsc_entry_barrier, snap this cpu's TSC into tsc_target, and then
 * release the starter via tsc_exit_barrier (see start_cpu()).
 */
static void
started_cpu(void)
{
	/* Here on the started cpu with cpu_running set TRUE */

	if (TSC_sync_margin &&
	    start_info.target_cpu == cpu_number()) {
		/*
		 * I've just started-up, synchronize again with the starter cpu
		 * and then snap my TSC.
		 */
		tsc_target = 0;
		atomic_decl(&tsc_entry_barrier, 1);
		while (tsc_entry_barrier != 0) {
			; /* spin for starter and target at barrier */
		}
		tsc_target = rdtsc64();
		atomic_decl(&tsc_exit_barrier, 1);
	}
}
349
/*
 * Rendezvous action run on all cpus during processor startup.
 * Only the designated starter cpu does any work: it issues the lapic
 * startup sequence for the target, waits for it to come up, and (when
 * the TSC sync check is enabled) compares TSC snapshots with the target
 * using the tsc_entry/exit barriers (counterpart in started_cpu()).
 */
static void
start_cpu(void *arg)
{
	int                     i = 1000;
	processor_start_info_t  *psip = (processor_start_info_t *) arg;

	/* Ignore this if the current processor is not the starter */
	if (cpu_number() != psip->starter_cpu) {
		return;
	}

	DBG("start_cpu(%p) about to start cpu %d, lapic %d\n",
	    arg, psip->target_cpu, psip->target_lapic);

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_START | DBG_FUNC_START,
		psip->target_cpu,
		psip->target_lapic, 0, 0, 0);

	i386_start_cpu(psip->target_lapic, psip->target_cpu);

#ifdef  POSTCODE_DELAY
	/* Wait much longer if postcodes are displayed for a delay period. */
	i *= 10000;
#endif
	DBG("start_cpu(%p) about to wait for cpu %d\n",
	    arg, psip->target_cpu);

	mp_wait_for_cpu_up(psip->target_cpu, i * 100, 100);

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_START | DBG_FUNC_END,
		psip->target_cpu,
		cpu_datap(psip->target_cpu)->cpu_running, 0, 0, 0);

	if (TSC_sync_margin &&
	    cpu_datap(psip->target_cpu)->cpu_running) {
		/*
		 * Compare the TSC from the started processor with ours.
		 * Report and log/panic if it diverges by more than
		 * TSC_sync_margin (TSC_SYNC_MARGIN) ticks. This margin
		 * can be overridden by boot-arg (with 0 meaning no checking).
		 */
		uint64_t        tsc_starter;
		int64_t         tsc_delta;
		atomic_decl(&tsc_entry_barrier, 1);
		while (tsc_entry_barrier != 0) {
			; /* spin for both processors at barrier */
		}
		tsc_starter = rdtsc64();
		atomic_decl(&tsc_exit_barrier, 1);
		while (tsc_exit_barrier != 0) {
			; /* spin for target to store its TSC */
		}
		tsc_delta = tsc_target - tsc_starter;
		kprintf("TSC sync for cpu %d: 0x%016llx delta 0x%llx (%lld)\n",
		    psip->target_cpu, tsc_target, tsc_delta, tsc_delta);
#if DEBUG || DEVELOPMENT
		/*
		 * Stash the delta for inspection later, since we can no
		 * longer print/log it with interrupts disabled.
		 */
		cpu_datap(psip->target_cpu)->tsc_sync_delta = tsc_delta;
#endif
		if (ABS(tsc_delta) > (int64_t) TSC_sync_margin) {
#if DEBUG
			panic(
#else
			kprintf(
#endif
				"Unsynchronized  TSC for cpu %d: "
				"0x%016llx, delta 0x%llx\n",
				psip->target_cpu, tsc_target, tsc_delta);
		}
	}
}
426
/*
 * Start the cpu in the given slot via the full (slow) startup path:
 * (re)initialize its descriptor tables, then rendezvous all running
 * processors while the startup sequence is issued.  Always returns
 * KERN_SUCCESS; on failure to start, this cpu prints, delays, and halts.
 */
kern_return_t
intel_startCPU(
	int     slot_num)
{
	int     lapic = cpu_to_lapic[slot_num];
	boolean_t istate;

	assert(lapic != -1);

	DBGLOG_CPU_INIT(slot_num);

	DBG("intel_startCPU(%d) lapic_id=%d\n", slot_num, lapic);
	DBG("IdlePTD(%p): 0x%x\n", &IdlePTD, (int) (uintptr_t)IdlePTD);

	/*
	 * Initialize (or re-initialize) the descriptor tables for this cpu.
	 * Propagate processor mode to slave.
	 */
	cpu_desc_init(cpu_datap(slot_num));

	/* Serialize use of the slave boot stack, etc. */
	lck_mtx_lock(&mp_cpu_boot_lock);

	istate = ml_set_interrupts_enabled(FALSE);
	if (slot_num == get_cpu_number()) {
		/* Asked to start the cpu we are already running on: done. */
		ml_set_interrupts_enabled(istate);
		lck_mtx_unlock(&mp_cpu_boot_lock);
		return KERN_SUCCESS;
	}

	start_info.starter_cpu  = cpu_number();
	start_info.target_cpu   = slot_num;
	start_info.target_lapic = lapic;
	tsc_entry_barrier = 2;
	tsc_exit_barrier = 2;

	/*
	 * Perform the processor startup sequence with all running
	 * processors rendezvous'ed. This is required during periods when
	 * the cache-disable bit is set for MTRR/PAT initialization.
	 */
	mp_rendezvous_no_intrs(start_cpu, (void *) &start_info);

	start_info.target_cpu = 0;

	ml_set_interrupts_enabled(istate);
	lck_mtx_unlock(&mp_cpu_boot_lock);

	if (!cpu_datap(slot_num)->cpu_running) {
		kprintf("Failed to start CPU %02d\n", slot_num);
		printf("Failed to start CPU %02d, rebooting...\n", slot_num);
		delay(1000000);
		halt_cpu();
		return KERN_SUCCESS;
	} else {
		kprintf("Started cpu %d (lapic id %08x)\n", slot_num, lapic);
		return KERN_SUCCESS;
	}
}
486
487 #if MP_DEBUG
488 cpu_signal_event_log_t *cpu_signal[MAX_CPUS];
489 cpu_signal_event_log_t *cpu_handle[MAX_CPUS];
490
491 MP_EVENT_NAME_DECL();
492
493 #endif /* MP_DEBUG */
494
495 /*
496 * Note: called with NULL state when polling for TLB flush and cross-calls.
497 */
498 int
cpu_signal_handler(x86_saved_state_t * regs)499 cpu_signal_handler(x86_saved_state_t *regs)
500 {
501 #if !MACH_KDP
502 #pragma unused (regs)
503 #endif /* !MACH_KDP */
504 int my_cpu;
505 volatile int *my_word;
506
507 SCHED_STATS_INC(ipi_count);
508
509 my_cpu = cpu_number();
510 my_word = &cpu_data_ptr[my_cpu]->cpu_signals;
511 /* Store the initial set of signals for diagnostics. New
512 * signals could arrive while these are being processed
513 * so it's no more than a hint.
514 */
515
516 cpu_data_ptr[my_cpu]->cpu_prior_signals = *my_word;
517
518 do {
519 #if MACH_KDP
520 if (i_bit(MP_KDP, my_word)) {
521 DBGLOG(cpu_handle, my_cpu, MP_KDP);
522 i_bit_clear(MP_KDP, my_word);
523 /* Ensure that the i386_kernel_state at the base of the
524 * current thread's stack (if any) is synchronized with the
525 * context at the moment of the interrupt, to facilitate
526 * access through the debugger.
527 */
528 sync_iss_to_iks(regs);
529 if (pmsafe_debug && !kdp_snapshot) {
530 pmSafeMode(¤t_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
531 }
532 mp_kdp_wait(TRUE, FALSE);
533 if (pmsafe_debug && !kdp_snapshot) {
534 pmSafeMode(¤t_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
535 }
536 } else
537 #endif /* MACH_KDP */
538 if (i_bit(MP_TLB_FLUSH, my_word)) {
539 DBGLOG(cpu_handle, my_cpu, MP_TLB_FLUSH);
540 i_bit_clear(MP_TLB_FLUSH, my_word);
541 pmap_update_interrupt();
542 } else if (i_bit(MP_CALL, my_word)) {
543 DBGLOG(cpu_handle, my_cpu, MP_CALL);
544 i_bit_clear(MP_CALL, my_word);
545 mp_cpus_call_action();
546 } else if (i_bit(MP_CALL_PM, my_word)) {
547 DBGLOG(cpu_handle, my_cpu, MP_CALL_PM);
548 i_bit_clear(MP_CALL_PM, my_word);
549 mp_call_PM();
550 }
551 if (regs == NULL) {
552 /* Called to poll only for cross-calls and TLB flush */
553 break;
554 } else if (i_bit(MP_AST, my_word)) {
555 DBGLOG(cpu_handle, my_cpu, MP_AST);
556 i_bit_clear(MP_AST, my_word);
557 ast_check(cpu_to_processor(my_cpu));
558 }
559 } while (*my_word);
560
561 return 0;
562 }
563
/*
 * KDP cross-cpu callback invoked when PTE corruption is detected:
 * panic the receiving cpu with a backtrace identifying the corrupt PTE.
 * The message buffer is static because this callback is serialized.
 * Always returns 0 (the panic does not return in practice).
 */
long
NMI_pte_corruption_callback(__unused void *arg0, __unused void *arg1, uint16_t lcpu)
{
	static char pstr[256]; /* global since this callback is serialized */
	void *stackptr;
	/* Capture this frame's base pointer for the backtrace. */
	__asm__ volatile ("movq %%rbp, %0" : "=m" (stackptr));

	snprintf(&pstr[0], sizeof(pstr),
	    "Panic(CPU %d): PTE corruption detected on PTEP 0x%llx VAL 0x%llx\n",
	    lcpu, (unsigned long long)(uintptr_t)PTE_corrupted_ptr, *(uint64_t *)PTE_corrupted_ptr);
	panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, current_cpu_datap()->cpu_int_state);
	return 0;
}
577
578 extern void kprintf_break_lock(void);
579 int
NMIInterruptHandler(x86_saved_state_t * regs)580 NMIInterruptHandler(x86_saved_state_t *regs)
581 {
582 void *stackptr;
583 char pstr[256];
584 uint64_t now = mach_absolute_time();
585
586 if (panic_active() && !panicDebugging) {
587 if (pmsafe_debug) {
588 pmSafeMode(¤t_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
589 }
590 for (;;) {
591 cpu_pause();
592 }
593 }
594
595 atomic_incl(&NMIPI_acks, 1);
596 atomic_incl(&NMI_count, 1);
597 sync_iss_to_iks_unconditionally(regs);
598 __asm__ volatile ("movq %%rbp, %0" : "=m" (stackptr));
599
600 if (cpu_number() == debugger_cpu) {
601 goto NMExit;
602 }
603
604 if (NMI_panic_reason == SPINLOCK_TIMEOUT) {
605 lck_spinlock_to_info_t lsti;
606
607 lsti = os_atomic_load(&lck_spinlock_timeout_in_progress, acquire);
608 snprintf(&pstr[0], sizeof(pstr),
609 "Panic(CPU %d, time %llu): NMIPI for spinlock acquisition timeout, spinlock: %p, "
610 "spinlock owner: %p, current_thread: %p, spinlock_owner_cpu: 0x%x\n",
611 cpu_number(), now, lsti->lock, (void *)lsti->owner_thread_cur,
612 current_thread(), lsti->owner_cpu);
613 panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs);
614 } else if (NMI_panic_reason == TLB_FLUSH_TIMEOUT) {
615 snprintf(&pstr[0], sizeof(pstr),
616 "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: TLB flush timeout, TLB state:0x%x\n",
617 cpu_number(), now, current_cpu_datap()->cpu_tlb_invalid);
618 panic_i386_backtrace(stackptr, 48, &pstr[0], TRUE, regs);
619 } else if (NMI_panic_reason == CROSSCALL_TIMEOUT) {
620 snprintf(&pstr[0], sizeof(pstr),
621 "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: cross-call timeout\n",
622 cpu_number(), now);
623 panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs);
624 } else if (NMI_panic_reason == INTERRUPT_WATCHDOG) {
625 snprintf(&pstr[0], sizeof(pstr),
626 "Panic(CPU %d, time %llu): NMIPI for unresponsive processor: interrupt watchdog for vector 0x%x\n",
627 cpu_number(), now, vector_timed_out);
628 panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs);
629 }
630
631 #if MACH_KDP
632 if (pmsafe_debug && !kdp_snapshot) {
633 pmSafeMode(¤t_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
634 }
635 current_cpu_datap()->cpu_NMI_acknowledged = TRUE;
636 i_bit_clear(MP_KDP, ¤t_cpu_datap()->cpu_signals);
637 if (panic_active() || NMI_panic_reason != NONE) {
638 mp_kdp_wait(FALSE, TRUE);
639 } else if (!mp_kdp_trap &&
640 !mp_kdp_is_NMI &&
641 virtualized && (debug_boot_arg & DB_NMI)) {
642 /*
643 * Under a VMM with the debug boot-arg set, drop into kdp.
644 * Since an NMI is involved, there's a risk of contending with
645 * a panic. And side-effects of NMIs may result in entry into,
646 * and continuing from, the debugger being unreliable.
647 */
648 if (__sync_bool_compare_and_swap(&mp_kdp_is_NMI, FALSE, TRUE)) {
649 kprintf_break_lock();
650
651 DebuggerWithContext(EXC_BREAKPOINT, saved_state64(regs),
652 "requested by NMI", DEBUGGER_OPTION_NONE,
653 (unsigned long)(char *)__builtin_return_address(0));
654
655 mp_kdp_is_NMI = FALSE;
656 } else {
657 mp_kdp_wait(FALSE, FALSE);
658 }
659 } else {
660 mp_kdp_wait(FALSE, FALSE);
661 }
662 if (pmsafe_debug && !kdp_snapshot) {
663 pmSafeMode(¤t_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
664 }
665 #endif
666 NMExit:
667 return 1;
668 }
669
670 /*
671 * cpu_interrupt is really just to be used by the scheduler to
672 * get a CPU's attention it may not always issue an IPI. If an
673 * IPI is always needed then use i386_cpu_IPI.
674 */
675 void
cpu_interrupt(int cpu)676 cpu_interrupt(int cpu)
677 {
678 boolean_t did_IPI = FALSE;
679
680 if (smp_initialized
681 && pmCPUExitIdle(cpu_datap(cpu))) {
682 i386_cpu_IPI(cpu);
683 did_IPI = TRUE;
684 }
685
686 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), cpu, did_IPI, 0, 0, 0);
687 }
688
689 /*
690 * Send a true NMI via the local APIC to the specified CPU.
691 */
692 void
cpu_NMI_interrupt(int cpu)693 cpu_NMI_interrupt(int cpu)
694 {
695 if (smp_initialized) {
696 i386_send_NMI(cpu);
697 }
698 }
699
/*
 * NMI every running cpu (including this one) and spin until each
 * acknowledges.  Panics if a cpu fails to acknowledge within the
 * TSC-based timeout (~10 billion TSC ticks; unbounded if machine
 * timeouts are suspended).
 */
void
NMI_cpus(void)
{
	unsigned int    cpu;
	boolean_t       intrs_enabled;
	uint64_t        tsc_timeout;

	intrs_enabled = ml_set_interrupts_enabled(FALSE);
	NMIPI_enable(TRUE);
	for (cpu = 0; cpu < real_ncpus; cpu++) {
		if (!cpu_is_running(cpu)) {
			continue;
		}
		cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
		cpu_NMI_interrupt(cpu);
		tsc_timeout = !machine_timeout_suspended() ?
		    rdtsc64() + (1000 * 1000 * 1000 * 10ULL) :
		    ~0ULL;
		while (!cpu_datap(cpu)->cpu_NMI_acknowledged) {
			/* Keep servicing TLB-flush requests while we wait. */
			handle_pending_TLB_flushes();
			cpu_pause();
			if (rdtsc64() > tsc_timeout) {
				panic("NMI_cpus() timeout cpu %d", cpu);
			}
		}
		cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
	}
	NMIPI_enable(FALSE);

	ml_set_interrupts_enabled(intrs_enabled);
}
731
/* Optional power-management callout, set via PM_interrupt_register(). */
static void(*volatile mp_PM_func)(void) = NULL;

/*
 * Service an MP_CALL_PM signal on this cpu: invoke the registered PM
 * callout, if any.  Must be called with interrupts disabled.
 */
static void
mp_call_PM(void)
{
	assert(!ml_get_interrupts_enabled());

	if (mp_PM_func != NULL) {
		mp_PM_func();
	}
}
743
744 void
cpu_PM_interrupt(int cpu)745 cpu_PM_interrupt(int cpu)
746 {
747 assert(!ml_get_interrupts_enabled());
748
749 if (mp_PM_func != NULL) {
750 if (cpu == cpu_number()) {
751 mp_PM_func();
752 } else {
753 i386_signal_cpu(cpu, MP_CALL_PM, ASYNC);
754 }
755 }
756 }
757
/*
 * Register (or clear, with NULL) the power-management callout invoked
 * by mp_call_PM() / cpu_PM_interrupt().
 */
void
PM_interrupt_register(void (*fn)(void))
{
	mp_PM_func = fn;
}
763
/*
 * Post the given event in the target cpu's signal word and IPI it.
 * In SYNC mode, spin until the target clears the event, logging and
 * re-arming the ~1G-TSC-tick timeout each time it expires.
 * No-op if the target cpu isn't running.
 */
void
i386_signal_cpu(int cpu, mp_event_t event, mp_sync_t mode)
{
	volatile int    *signals = &cpu_datap(cpu)->cpu_signals;
	uint64_t        tsc_timeout;


	if (!cpu_datap(cpu)->cpu_running) {
		return;
	}

	if (event == MP_TLB_FLUSH) {
		KERNEL_DEBUG(TRACE_MP_TLB_FLUSH | DBG_FUNC_START, cpu, 0, 0, 0, 0);
	}

	DBGLOG(cpu_signal, cpu, event);

	i_bit_set(event, signals);
	i386_cpu_IPI(cpu);
	if (mode == SYNC) {
again:
		tsc_timeout = !machine_timeout_suspended() ?
		    rdtsc64() + (1000 * 1000 * 1000) :
		    ~0ULL;
		while (i_bit(event, signals) && rdtsc64() < tsc_timeout) {
			cpu_pause();
		}
		if (i_bit(event, signals)) {
			/* Still pending: log and keep waiting indefinitely. */
			DBG("i386_signal_cpu(%d, 0x%x, SYNC) timed out\n",
			    cpu, event);
			goto again;
		}
	}
	if (event == MP_TLB_FLUSH) {
		KERNEL_DEBUG(TRACE_MP_TLB_FLUSH | DBG_FUNC_END, cpu, 0, 0, 0, 0);
	}
}
801
802 /*
803 * Helper function called when busy-waiting: panic if too long
804 * a TSC-based time has elapsed since the start of the spin.
805 */
806 static boolean_t
mp_spin_timeout(uint64_t tsc_start)807 mp_spin_timeout(uint64_t tsc_start)
808 {
809 uint64_t tsc_timeout;
810
811 cpu_pause();
812 if (machine_timeout_suspended()) {
813 return FALSE;
814 }
815
816 /*
817 * The timeout is 4 * the spinlock timeout period
818 * unless we have serial console printing (kprintf) enabled
819 * in which case we allow an even greater margin.
820 */
821 tsc_timeout = disable_serial_output ? LockTimeOutTSC << 2
822 : LockTimeOutTSC << 4;
823 return rdtsc64() > tsc_start + tsc_timeout;
824 }
825
/*
 * Helper function to take a spinlock while ensuring that incoming IPIs
 * are still serviced if interrupts are masked while we spin.
 * Returns current interrupt state.
 */
boolean_t
mp_safe_spin_lock(usimple_lock_t lock)
{
	if (ml_get_interrupts_enabled()) {
		/* Interrupts enabled: IPIs are serviced normally, just lock. */
		simple_lock(lock, LCK_GRP_NULL);
		return TRUE;
	}

	lck_spinlock_to_info_t lsti;
	uint64_t tsc_spin_start = rdtsc64();

	while (!simple_lock_try(lock, LCK_GRP_NULL)) {
		/* Poll for and service pending cross-calls/TLB flushes. */
		cpu_signal_handler(NULL);
		if (mp_spin_timeout(tsc_spin_start)) {
			uintptr_t lowner = (uintptr_t)lock->interlock.lock_data;

			/* NMI the owner so its backtrace appears in the panic. */
			lsti = lck_spinlock_timeout_hit(lock, lowner);
			NMIPI_panic(cpu_to_cpumask(lsti->owner_cpu), SPINLOCK_TIMEOUT);
			panic("mp_safe_spin_lock() timed out, lock: %p, "
			    "owner thread: 0x%lx, current_thread: %p, "
			    "owner on CPU 0x%x, time: %llu",
			    lock, lowner, current_thread(),
			    lsti->owner_cpu, mach_absolute_time());
		}
	}

	return FALSE;
}
859
860 /*
861 * All-CPU rendezvous:
862 * - CPUs are signalled,
863 * - all execute the setup function (if specified),
864 * - rendezvous (i.e. all cpus reach a barrier),
865 * - all execute the action function (if specified),
866 * - rendezvous again,
867 * - execute the teardown function (if specified), and then
868 * - resume.
869 *
870 * Note that the supplied external functions _must_ be reentrant and aware
871 * that they are running in parallel and in an unknown lock context.
872 */
873
/*
 * Per-cpu executor for mp_rendezvous(): run setup, spin at the entry
 * barrier until all participants arrive, run the action, spin at the
 * exit barrier, run teardown, then bump the completion count.
 * Panics via mp_spin_timeout() if a barrier stalls too long.
 */
static void
mp_rendezvous_action(__unused void *null)
{
	boolean_t       intrs_enabled;
	uint64_t        tsc_spin_start;

	/*
	 * Note that mp_rv_lock was acquired by the thread that initiated the
	 * rendezvous and must have been acquired before we enter
	 * mp_rendezvous_action().
	 */
	current_cpu_datap()->cpu_rendezvous_in_progress = TRUE;

	/* setup function */
	if (mp_rv_setup_func != NULL) {
		mp_rv_setup_func(mp_rv_func_arg);
	}

	/* Sample after setup, which may have changed the interrupt state. */
	intrs_enabled = ml_get_interrupts_enabled();

	/* spin on entry rendezvous */
	atomic_incl(&mp_rv_entry, 1);
	tsc_spin_start = rdtsc64();

	while (mp_rv_entry < mp_rv_ncpus) {
		/* poll for pesky tlb flushes if interrupts disabled */
		if (!intrs_enabled) {
			handle_pending_TLB_flushes();
		}
		if (mp_spin_timeout(tsc_spin_start)) {
			panic("mp_rv_action() entry: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_entry, mp_rv_ncpus, tsc_spin_start, rdtsc64());
		}
	}

	/* action function */
	if (mp_rv_action_func != NULL) {
		mp_rv_action_func(mp_rv_func_arg);
	}

	/* spin on exit rendezvous */
	atomic_incl(&mp_rv_exit, 1);
	tsc_spin_start = rdtsc64();
	while (mp_rv_exit < mp_rv_ncpus) {
		if (!intrs_enabled) {
			handle_pending_TLB_flushes();
		}
		if (mp_spin_timeout(tsc_spin_start)) {
			panic("mp_rv_action() exit: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_exit, mp_rv_ncpus, tsc_spin_start, rdtsc64());
		}
	}

	/* teardown function */
	if (mp_rv_teardown_func != NULL) {
		mp_rv_teardown_func(mp_rv_func_arg);
	}

	current_cpu_datap()->cpu_rendezvous_in_progress = FALSE;

	/* Bump completion count */
	atomic_incl(&mp_rv_complete, 1);
}
935
/*
 * Execute action_func on every running cpu simultaneously (see the block
 * comment above for the barrier protocol).  Before SMP is up, simply
 * runs the three callbacks locally.  The static mp_rv_* state is
 * protected by mp_rv_lock for the duration of the rendezvous.
 */
void
mp_rendezvous(void (*setup_func)(void *),
    void (*action_func)(void *),
    void (*teardown_func)(void *),
    void *arg)
{
	uint64_t        tsc_spin_start;

	if (!smp_initialized) {
		/* Uniprocessor so far: just run the callbacks in-line. */
		if (setup_func != NULL) {
			setup_func(arg);
		}
		if (action_func != NULL) {
			action_func(arg);
		}
		if (teardown_func != NULL) {
			teardown_func(arg);
		}
		return;
	}

	/* obtain rendezvous lock */
	mp_rendezvous_lock();

	/* set static function pointers */
	mp_rv_setup_func = setup_func;
	mp_rv_action_func = action_func;
	mp_rv_teardown_func = teardown_func;
	mp_rv_func_arg = arg;

	mp_rv_entry = 0;
	mp_rv_exit = 0;
	mp_rv_complete = 0;

	/*
	 * signal other processors, which will call mp_rendezvous_action()
	 * with interrupts disabled
	 */
	mp_rv_ncpus = mp_cpus_call(CPUMASK_OTHERS, NOSYNC, &mp_rendezvous_action, NULL) + 1;

	/* call executor function on this cpu */
	mp_rendezvous_action(NULL);

	/*
	 * Spin for everyone to complete.
	 * This is necessary to ensure that all processors have proceeded
	 * from the exit barrier before we release the rendezvous structure.
	 */
	tsc_spin_start = rdtsc64();
	while (mp_rv_complete < mp_rv_ncpus) {
		if (mp_spin_timeout(tsc_spin_start)) {
			panic("mp_rendezvous() timeout: %ld of %d responses, start: 0x%llx, cur: 0x%llx", mp_rv_complete, mp_rv_ncpus, tsc_spin_start, rdtsc64());
		}
	}

	/* Tidy up */
	mp_rv_setup_func = NULL;
	mp_rv_action_func = NULL;
	mp_rv_teardown_func = NULL;
	mp_rv_func_arg = NULL;

	/* release lock */
	mp_rendezvous_unlock();
}
1000
/* Acquire the rendezvous lock, servicing incoming IPIs while spinning. */
void
mp_rendezvous_lock(void)
{
	(void) mp_safe_spin_lock(&mp_rv_lock);
}

/* Release the rendezvous lock. */
void
mp_rendezvous_unlock(void)
{
	simple_unlock(&mp_rv_lock);
}

/* Re-initialize the rendezvous lock, abandoning any current holder. */
void
mp_rendezvous_break_lock(void)
{
	simple_lock_init(&mp_rv_lock, 0);
}
1018
1019 static void
setup_disable_intrs(__unused void * param_not_used)1020 setup_disable_intrs(__unused void * param_not_used)
1021 {
1022 /* disable interrupts before the first barrier */
1023 boolean_t intr = ml_set_interrupts_enabled(FALSE);
1024
1025 current_cpu_datap()->cpu_iflag = intr;
1026 DBG("CPU%d: %s\n", get_cpu_number(), __FUNCTION__);
1027 }
1028
1029 static void
teardown_restore_intrs(__unused void * param_not_used)1030 teardown_restore_intrs(__unused void * param_not_used)
1031 {
1032 /* restore interrupt flag following MTRR changes */
1033 ml_set_interrupts_enabled(current_cpu_datap()->cpu_iflag);
1034 DBG("CPU%d: %s\n", get_cpu_number(), __FUNCTION__);
1035 }
1036
1037 /*
1038 * A wrapper to mp_rendezvous() to call action_func() with interrupts disabled.
1039 * This is exported for use by kexts.
1040 */
1041 void
mp_rendezvous_no_intrs(void (* action_func)(void *),void * arg)1042 mp_rendezvous_no_intrs(
1043 void (*action_func)(void *),
1044 void *arg)
1045 {
1046 mp_rendezvous(setup_disable_intrs,
1047 action_func,
1048 teardown_restore_intrs,
1049 arg);
1050 }
1051
1052
/* A single queued cross-call request. */
typedef struct {
	queue_chain_t   link;                   /* queue linkage */
	void            (*func)(void *, void *); /* routine to call */
	void            *arg0;                  /* routine's 1st arg */
	void            *arg1;                  /* routine's 2nd arg */
	cpumask_t       *maskp;                 /* completion response mask */
} mp_call_t;


/* A lock-guarded queue of cross-call requests (per-cpu, plus a freelist). */
typedef struct {
	queue_head_t            queue;
	decl_simple_lock_data(, lock);
} mp_call_queue_t;
#define MP_CPUS_CALL_BUFS_PER_CPU       MAX_CPUS
static mp_call_queue_t  mp_cpus_call_freelist;
static mp_call_queue_t  mp_cpus_call_head[MAX_CPUS];
1069
1070 static inline boolean_t
mp_call_head_lock(mp_call_queue_t * cqp)1071 mp_call_head_lock(mp_call_queue_t *cqp)
1072 {
1073 boolean_t intrs_enabled;
1074
1075 intrs_enabled = ml_set_interrupts_enabled(FALSE);
1076 simple_lock(&cqp->lock, LCK_GRP_NULL);
1077
1078 return intrs_enabled;
1079 }
1080
1081 /*
1082 * Deliver an NMIPI to a set of processors to cause them to panic .
1083 */
1084 void
NMIPI_panic(cpumask_t cpu_mask,NMI_reason_t why)1085 NMIPI_panic(cpumask_t cpu_mask, NMI_reason_t why)
1086 {
1087 unsigned int cpu;
1088 cpumask_t cpu_bit;
1089 uint64_t deadline;
1090
1091 NMIPI_enable(TRUE);
1092 NMI_panic_reason = why;
1093
1094 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
1095 if ((cpu_mask & cpu_bit) == 0) {
1096 continue;
1097 }
1098 cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
1099 cpu_NMI_interrupt(cpu);
1100 }
1101
1102 /* Wait (only so long) for NMi'ed cpus to respond */
1103 deadline = mach_absolute_time() + LockTimeOut;
1104 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
1105 if ((cpu_mask & cpu_bit) == 0) {
1106 continue;
1107 }
1108 while (!cpu_datap(cpu)->cpu_NMI_acknowledged &&
1109 mach_absolute_time() < deadline) {
1110 cpu_pause();
1111 }
1112 }
1113 }
1114
#if MACH_ASSERT
/* Debug-only check: interrupts are off and cqp's lock is held. */
static inline boolean_t
mp_call_head_is_locked(mp_call_queue_t *cqp)
{
	return !ml_get_interrupts_enabled() &&
	       hw_lock_held((hw_lock_t)&cqp->lock);
}
#endif
1123
/*
 * Release a call queue's lock and then restore the interrupt state
 * previously returned by mp_call_head_lock() (in that order).
 */
static inline void
mp_call_head_unlock(mp_call_queue_t *cqp, boolean_t intrs_enabled)
{
	simple_unlock(&cqp->lock);
	ml_set_interrupts_enabled(intrs_enabled);
}
1130
1131 static inline mp_call_t *
mp_call_alloc(void)1132 mp_call_alloc(void)
1133 {
1134 mp_call_t *callp = NULL;
1135 boolean_t intrs_enabled;
1136 mp_call_queue_t *cqp = &mp_cpus_call_freelist;
1137
1138 intrs_enabled = mp_call_head_lock(cqp);
1139 if (!queue_empty(&cqp->queue)) {
1140 queue_remove_first(&cqp->queue, callp, typeof(callp), link);
1141 }
1142 mp_call_head_unlock(cqp, intrs_enabled);
1143
1144 return callp;
1145 }
1146
1147 static inline void
mp_call_free(mp_call_t * callp)1148 mp_call_free(mp_call_t *callp)
1149 {
1150 boolean_t intrs_enabled;
1151 mp_call_queue_t *cqp = &mp_cpus_call_freelist;
1152
1153 intrs_enabled = mp_call_head_lock(cqp);
1154 queue_enter_first(&cqp->queue, callp, typeof(callp), link);
1155 mp_call_head_unlock(cqp, intrs_enabled);
1156 }
1157
1158 static inline mp_call_t *
mp_call_dequeue_locked(mp_call_queue_t * cqp)1159 mp_call_dequeue_locked(mp_call_queue_t *cqp)
1160 {
1161 mp_call_t *callp = NULL;
1162
1163 assert(mp_call_head_is_locked(cqp));
1164 if (!queue_empty(&cqp->queue)) {
1165 queue_remove_first(&cqp->queue, callp, typeof(callp), link);
1166 }
1167 return callp;
1168 }
1169
/* Append a call request to cqp; caller must hold cqp's lock. */
static inline void
mp_call_enqueue_locked(
	mp_call_queue_t *cqp,
	mp_call_t *callp)
{
	queue_enter(&cqp->queue, callp, typeof(callp), link);
}
1177
/* Called on the boot processor to initialize global structures */
static void
mp_cpus_call_init(void)
{
	mp_call_queue_t *cqp = &mp_cpus_call_freelist;

	/* The freelist starts empty; buffers are added per-cpu by
	 * mp_cpus_call_cpu_init(). */
	DBG("mp_cpus_call_init()\n");
	simple_lock_init(&cqp->lock, 0);
	queue_init(&cqp->queue);
}
1188
1189 /*
1190 * Called at processor registration to add call buffers to the free list
1191 * and to initialize the per-cpu call queue.
1192 */
1193 void
mp_cpus_call_cpu_init(int cpu)1194 mp_cpus_call_cpu_init(int cpu)
1195 {
1196 int i;
1197 mp_call_queue_t *cqp = &mp_cpus_call_head[cpu];
1198 mp_call_t *callp;
1199
1200 simple_lock_init(&cqp->lock, 0);
1201 queue_init(&cqp->queue);
1202 for (i = 0; i < MP_CPUS_CALL_BUFS_PER_CPU; i++) {
1203 callp = zalloc_permanent_type(mp_call_t);
1204 mp_call_free(callp);
1205 }
1206
1207 DBG("mp_cpus_call_init(%d) done\n", cpu);
1208 }
1209
1210 /*
1211 * This is called from cpu_signal_handler() to process an MP_CALL signal.
1212 * And also from i386_deactivate_cpu() when a cpu is being taken offline.
1213 */
1214 static void
mp_cpus_call_action(void)1215 mp_cpus_call_action(void)
1216 {
1217 mp_call_queue_t *cqp;
1218 boolean_t intrs_enabled;
1219 mp_call_t *callp;
1220 mp_call_t call;
1221
1222 assert(!ml_get_interrupts_enabled());
1223 cqp = &mp_cpus_call_head[cpu_number()];
1224 intrs_enabled = mp_call_head_lock(cqp);
1225 while ((callp = mp_call_dequeue_locked(cqp)) != NULL) {
1226 /* Copy call request to the stack to free buffer */
1227 call = *callp;
1228 mp_call_free(callp);
1229 if (call.func != NULL) {
1230 mp_call_head_unlock(cqp, intrs_enabled);
1231 KERNEL_DEBUG_CONSTANT(
1232 TRACE_MP_CPUS_CALL_ACTION,
1233 VM_KERNEL_UNSLIDE(call.func), VM_KERNEL_UNSLIDE_OR_PERM(call.arg0),
1234 VM_KERNEL_UNSLIDE_OR_PERM(call.arg1), VM_KERNEL_ADDRPERM(call.maskp), 0);
1235 call.func(call.arg0, call.arg1);
1236 (void) mp_call_head_lock(cqp);
1237 }
1238 if (call.maskp != NULL) {
1239 i_bit_set(cpu_number(), call.maskp);
1240 }
1241 }
1242 mp_call_head_unlock(cqp, intrs_enabled);
1243 }
1244
1245 #pragma clang diagnostic push
1246 #pragma clang diagnostic ignored "-Wcast-function-type"
1247
1248 /*
1249 * mp_cpus_call() runs a given function on cpus specified in a given cpu mask.
1250 * Possible modes are:
1251 * SYNC: function is called serially on target cpus in logical cpu order
1252 * waiting for each call to be acknowledged before proceeding
1253 * ASYNC: function call is queued to the specified cpus
1254 * waiting for all calls to complete in parallel before returning
1255 * NOSYNC: function calls are queued
1256 * but we return before confirmation of calls completing.
1257 * The action function may be NULL.
1258 * The cpu mask may include the local cpu. Offline cpus are ignored.
1259 * The return value is the number of cpus on which the call was made or queued.
1260 */
cpu_t
mp_cpus_call(
	cpumask_t cpus,
	mp_sync_t mode,
	void (*action_func)(void *),
	void *arg)
{
	/* Single-argument convenience entry: adapt to the two-argument
	 * mp_cpus_call1(). The function-pointer cast is covered by the
	 * surrounding -Wcast-function-type pragma. */
	return mp_cpus_call1(
		cpus,
		mode,
		(void (*)(void *, void *))action_func,
		arg,
		NULL,
		NULL);
}
1276
1277 #pragma clang diagnostic pop
1278
/*
 * Spin until every cpu in cpus_called has acknowledged (set its bit in
 * *cpus_responded), servicing our own pending calls/signals meanwhile
 * if interrupts are masked. Panics (after NMI'ing the laggards) if the
 * spin timeout expires.
 */
static void
mp_cpus_call_wait(boolean_t intrs_enabled,
    cpumask_t cpus_called,
    cpumask_t *cpus_responded)
{
	mp_call_queue_t *cqp;
	uint64_t tsc_spin_start;

	/* Must be non-preemptible (or have interrupts off) while spinning. */
	assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
	cqp = &mp_cpus_call_head[cpu_number()];

	tsc_spin_start = rdtsc64();
	while (*cpus_responded != cpus_called) {
		if (!intrs_enabled) {
			/* Sniffing w/o locking */
			if (!queue_empty(&cqp->queue)) {
				mp_cpus_call_action();
			}
			cpu_signal_handler(NULL);
		}
		if (mp_spin_timeout(tsc_spin_start)) {
			cpumask_t cpus_unresponsive;

			cpus_unresponsive = cpus_called & ~(*cpus_responded);
			NMIPI_panic(cpus_unresponsive, CROSSCALL_TIMEOUT);
			panic("mp_cpus_call_wait() timeout, cpus: 0x%llx",
			    cpus_unresponsive);
		}
	}
}
1309
/*
 * Two-argument cross-call: run action_func(arg0, arg1) on the cpus in
 * 'cpus' per 'mode' (SYNC/ASYNC/NOSYNC — see the comment above
 * mp_cpus_call()). Optionally returns the mask of cpus actually called
 * via *cpus_calledp. Returns the number of cpus called or queued.
 */
cpu_t
mp_cpus_call1(
	cpumask_t cpus,
	mp_sync_t mode,
	void (*action_func)(void *, void *),
	void *arg0,
	void *arg1,
	cpumask_t *cpus_calledp)
{
	cpu_t cpu = 0;
	boolean_t intrs_enabled = FALSE;
	boolean_t call_self = FALSE;
	cpumask_t cpus_called = 0;
	cpumask_t cpus_responded = 0;
	long cpus_call_count = 0;
	uint64_t tsc_spin_start;
	boolean_t topo_lock;

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPUS_CALL | DBG_FUNC_START,
		cpus, mode, VM_KERNEL_UNSLIDE(action_func), VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1));

	/* Before SMP is up only a local call is possible. */
	if (!smp_initialized) {
		if ((cpus & CPUMASK_SELF) == 0) {
			goto out;
		}
		if (action_func != NULL) {
			intrs_enabled = ml_set_interrupts_enabled(FALSE);
			action_func(arg0, arg1);
			ml_set_interrupts_enabled(intrs_enabled);
		}
		call_self = TRUE;
		goto out;
	}

	/*
	 * Queue the call for each non-local requested cpu.
	 * This is performed under the topo lock to prevent changes to
	 * cpus online state and to prevent concurrent rendezvouses --
	 * although an exception is made if we're calling only the master
	 * processor since that always remains active. Note: this exception
	 * is expected for longterm timer nosync cross-calls to the master cpu.
	 */
	mp_disable_preemption();
	intrs_enabled = ml_get_interrupts_enabled();
	topo_lock = (cpus != cpu_to_cpumask(master_cpu));
	if (topo_lock) {
		ml_set_interrupts_enabled(FALSE);
		(void) mp_safe_spin_lock(&x86_topo_lock);
	}
	for (cpu = 0; cpu < (cpu_t) real_ncpus; cpu++) {
		if (((cpu_to_cpumask(cpu) & cpus) == 0) ||
		    !cpu_is_running(cpu)) {
			continue;
		}
		tsc_spin_start = rdtsc64();
		if (cpu == (cpu_t) cpu_number()) {
			/*
			 * We don't IPI ourself and if calling asynchronously,
			 * we defer our call until we have signalled all others.
			 */
			call_self = TRUE;
			if (mode == SYNC && action_func != NULL) {
				KERNEL_DEBUG_CONSTANT(
					TRACE_MP_CPUS_CALL_LOCAL,
					VM_KERNEL_UNSLIDE(action_func),
					VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1), 0, 0);
				action_func(arg0, arg1);
			}
		} else {
			/*
			 * Here to queue a call to cpu and IPI.
			 */
			mp_call_t *callp = NULL;
			mp_call_queue_t *cqp = &mp_cpus_call_head[cpu];
			boolean_t intrs_inner;

queue_call:
			if (callp == NULL) {
				callp = mp_call_alloc();
			}
			intrs_inner = mp_call_head_lock(cqp);
			if (callp == NULL) {
				/* No free buffer: drop the lock, help drain
				 * work that may be blocking other cpus, and
				 * retry until a buffer frees up or timeout. */
				mp_call_head_unlock(cqp, intrs_inner);
				KERNEL_DEBUG_CONSTANT(
					TRACE_MP_CPUS_CALL_NOBUF,
					cpu, 0, 0, 0, 0);
				if (!intrs_inner) {
					/* Sniffing w/o locking */
					if (!queue_empty(&cqp->queue)) {
						mp_cpus_call_action();
					}
					handle_pending_TLB_flushes();
				}
				if (mp_spin_timeout(tsc_spin_start)) {
					panic("mp_cpus_call1() timeout start: 0x%llx, cur: 0x%llx",
					    tsc_spin_start, rdtsc64());
				}
				goto queue_call;
			}
			/* NOSYNC requests carry no response mask: nobody waits. */
			callp->maskp = (mode == NOSYNC) ? NULL : &cpus_responded;
			callp->func = action_func;
			callp->arg0 = arg0;
			callp->arg1 = arg1;
			mp_call_enqueue_locked(cqp, callp);
			cpus_call_count++;
			cpus_called |= cpu_to_cpumask(cpu);
			i386_signal_cpu(cpu, MP_CALL, ASYNC);
			mp_call_head_unlock(cqp, intrs_inner);
			if (mode == SYNC) {
				mp_cpus_call_wait(intrs_inner, cpus_called, &cpus_responded);
			}
		}
	}
	if (topo_lock) {
		simple_unlock(&x86_topo_lock);
		ml_set_interrupts_enabled(intrs_enabled);
	}

	/* Call locally if mode not SYNC */
	if (mode != SYNC && call_self) {
		KERNEL_DEBUG_CONSTANT(
			TRACE_MP_CPUS_CALL_LOCAL,
			VM_KERNEL_UNSLIDE(action_func), VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1), 0, 0);
		if (action_func != NULL) {
			ml_set_interrupts_enabled(FALSE);
			action_func(arg0, arg1);
			ml_set_interrupts_enabled(intrs_enabled);
		}
	}

	/* For ASYNC, now wait for all signaled cpus to complete their calls */
	if (mode == ASYNC) {
		mp_cpus_call_wait(intrs_enabled, cpus_called, &cpus_responded);
	}

	/* Safe to allow pre-emption now */
	mp_enable_preemption();

out:
	if (call_self) {
		/*
		 * NOTE(review): when arriving here via the loop above, 'cpu'
		 * is the post-loop index (== real_ncpus), not this cpu's
		 * number, so this bit may not denote the calling cpu —
		 * confirm against upstream before relying on cpus_called
		 * containing the self bit.
		 */
		cpus_called |= cpu_to_cpumask(cpu);
		cpus_call_count++;
	}

	if (cpus_calledp) {
		*cpus_calledp = cpus_called;
	}

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPUS_CALL | DBG_FUNC_END,
		cpus_call_count, cpus_called, 0, 0, 0);

	return (cpu_t) cpus_call_count;
}
1465
1466
/*
 * Per-cpu action for mp_broadcast(): run the broadcast function and,
 * when the last cpu finishes, wake the thread blocked in mp_broadcast().
 */
static void
mp_broadcast_action(__unused void *null)
{
	/* call action function */
	if (mp_bc_action_func != NULL) {
		mp_bc_action_func(mp_bc_func_arg);
	}

	/* if we're the last one through, wake up the instigator */
	if (atomic_decl_and_test(&mp_bc_count, 1)) {
		thread_wakeup(((event_t)(uintptr_t) &mp_bc_count));
	}
}
1480
1481 /*
1482 * mp_broadcast() runs a given function on all active cpus.
1483 * The caller blocks until the functions has run on all cpus.
1484 * The caller will also block if there is another pending broadcast.
1485 */
1486 void
mp_broadcast(void (* action_func)(void *),void * arg)1487 mp_broadcast(
1488 void (*action_func)(void *),
1489 void *arg)
1490 {
1491 if (!smp_initialized) {
1492 if (action_func != NULL) {
1493 action_func(arg);
1494 }
1495 return;
1496 }
1497
1498 /* obtain broadcast lock */
1499 lck_mtx_lock(&mp_bc_lock);
1500
1501 /* set static function pointers */
1502 mp_bc_action_func = action_func;
1503 mp_bc_func_arg = arg;
1504
1505 assert_wait((event_t)(uintptr_t)&mp_bc_count, THREAD_UNINT);
1506
1507 /*
1508 * signal other processors, which will call mp_broadcast_action()
1509 */
1510 mp_bc_count = real_ncpus; /* assume max possible active */
1511 mp_bc_ncpus = mp_cpus_call(CPUMASK_ALL, NOSYNC, *mp_broadcast_action, NULL);
1512 atomic_decl(&mp_bc_count, real_ncpus - mp_bc_ncpus); /* subtract inactive */
1513
1514 /* block for other cpus to have run action_func */
1515 if (mp_bc_ncpus > 1) {
1516 thread_block(THREAD_CONTINUE_NULL);
1517 } else {
1518 clear_wait(current_thread(), THREAD_AWAKENED);
1519 }
1520
1521 /* release lock */
1522 lck_mtx_unlock(&mp_bc_lock);
1523 }
1524
1525 void
mp_cpus_kick(cpumask_t cpus)1526 mp_cpus_kick(cpumask_t cpus)
1527 {
1528 cpu_t cpu;
1529 boolean_t intrs_enabled = FALSE;
1530
1531 intrs_enabled = ml_set_interrupts_enabled(FALSE);
1532 mp_safe_spin_lock(&x86_topo_lock);
1533
1534 for (cpu = 0; cpu < (cpu_t) real_ncpus; cpu++) {
1535 if (((cpu_to_cpumask(cpu) & cpus) == 0)
1536 || !cpu_is_running(cpu)) {
1537 continue;
1538 }
1539
1540 lapic_send_ipi(cpu, LAPIC_VECTOR(KICK));
1541 }
1542
1543 simple_unlock(&x86_topo_lock);
1544 ml_set_interrupts_enabled(intrs_enabled);
1545 }
1546
/*
 * Mark the current cpu as running. Under the topo lock (post-SMP-init)
 * also complete startup and flush this cpu's TLB.
 */
void
i386_activate_cpu(void)
{
	cpu_data_t *cdp = current_cpu_datap();

	assert(!ml_get_interrupts_enabled());

	if (!smp_initialized) {
		cdp->cpu_running = TRUE;
		return;
	}

	mp_safe_spin_lock(&x86_topo_lock);
	cdp->cpu_running = TRUE;
	started_cpu();
	/* Flush the entire TLB range on this newly-online cpu. */
	pmap_tlbi_range(0, ~0ULL, true, 0);
	simple_unlock(&x86_topo_lock);
}
1565
/*
 * Take the current cpu offline: clear cpu_running, migrate its timers
 * to the master cpu, stop monotonic/kperf accounting, then drain any
 * pending IPIs/timer interrupts before returning.
 */
void
i386_deactivate_cpu(void)
{
	cpu_data_t *cdp = current_cpu_datap();

	assert(!ml_get_interrupts_enabled());

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_START,
		0, 0, 0, 0, 0);

	mp_safe_spin_lock(&x86_topo_lock);
	cdp->cpu_running = FALSE;
	simple_unlock(&x86_topo_lock);

	/*
	 * Move all of this cpu's timers to the master/boot cpu,
	 * and poke it in case there's a sooner deadline for it to schedule.
	 */
	timer_queue_shutdown(&cdp->rtclock_timer.queue);
	mp_cpus_call(cpu_to_cpumask(master_cpu), ASYNC, timer_queue_expire_local, NULL);

#if MONOTONIC
	mt_cpu_down(cdp);
#endif /* MONOTONIC */
#if KPERF
	kptimer_stop_curcpu();
#endif /* KPERF */

	/*
	 * Open an interrupt window
	 * and ensure any pending IPI or timer is serviced
	 */
	mp_disable_preemption();
	ml_set_interrupts_enabled(TRUE);

	while (cdp->cpu_signals && x86_lcpu()->rtcDeadline != EndOfAllTime) {
		cpu_pause();
	}
	/*
	 * Ensure there's no remaining timer deadline set
	 * - AICPM may have left one active.
	 */
	setPop(0);

	ml_set_interrupts_enabled(FALSE);
	mp_enable_preemption();

	KERNEL_DEBUG_CONSTANT(
		TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_END,
		0, 0, 0, 0, 0);
}
1618
int pmsafe_debug = 1;   /* place cpus in a PM-safe mode around debugger entry */

#if MACH_KDP
volatile boolean_t mp_kdp_trap = FALSE;     /* TRUE while the debugger holds cpus */
volatile boolean_t mp_kdp_is_NMI = FALSE;   /* entry was via NMI */
volatile unsigned long mp_kdp_ncpus;        /* count of cpus parked in mp_kdp_wait() */
boolean_t mp_kdp_state;                     /* interrupt state saved on debugger entry */
1626
1627
1628 void
mp_kdp_enter(boolean_t proceed_on_failure)1629 mp_kdp_enter(boolean_t proceed_on_failure)
1630 {
1631 unsigned int cpu;
1632 unsigned int ncpus = 0;
1633 unsigned int my_cpu;
1634 uint64_t tsc_timeout;
1635
1636 DBG("mp_kdp_enter()\n");
1637
1638 /*
1639 * Here to enter the debugger.
1640 * In case of races, only one cpu is allowed to enter kdp after
1641 * stopping others.
1642 */
1643 mp_kdp_state = ml_set_interrupts_enabled(FALSE);
1644 my_cpu = cpu_number();
1645
1646 if (my_cpu == (unsigned) debugger_cpu) {
1647 kprintf("\n\nRECURSIVE DEBUGGER ENTRY DETECTED\n\n");
1648 kdp_reset();
1649 return;
1650 }
1651
1652 uint64_t start_time = cpu_datap(my_cpu)->debugger_entry_time = mach_absolute_time();
1653 int locked = 0;
1654 while (!locked || mp_kdp_trap) {
1655 if (locked) {
1656 simple_unlock(&x86_topo_lock);
1657 }
1658 if (proceed_on_failure) {
1659 if (mach_absolute_time() - start_time > 500000000ll) {
1660 paniclog_append_noflush("mp_kdp_enter() can't get x86_topo_lock! Debugging anyway! #YOLO\n");
1661 break;
1662 }
1663 locked = simple_lock_try(&x86_topo_lock, LCK_GRP_NULL);
1664 if (!locked) {
1665 cpu_pause();
1666 }
1667 } else {
1668 mp_safe_spin_lock(&x86_topo_lock);
1669 locked = TRUE;
1670 }
1671
1672 if (locked && mp_kdp_trap) {
1673 simple_unlock(&x86_topo_lock);
1674 DBG("mp_kdp_enter() race lost\n");
1675 #if MACH_KDP
1676 mp_kdp_wait(TRUE, FALSE);
1677 #endif
1678 locked = FALSE;
1679 }
1680 }
1681
1682 if (pmsafe_debug && !kdp_snapshot) {
1683 pmSafeMode(¤t_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
1684 }
1685
1686 debugger_cpu = my_cpu;
1687 ncpus = 1;
1688 atomic_incl((volatile long *)&mp_kdp_ncpus, 1);
1689 mp_kdp_trap = TRUE;
1690 debugger_entry_time = cpu_datap(my_cpu)->debugger_entry_time;
1691
1692 /*
1693 * Deliver a nudge to other cpus, counting how many
1694 */
1695 DBG("mp_kdp_enter() signaling other processors\n");
1696 if (force_immediate_debugger_NMI == FALSE) {
1697 for (cpu = 0; cpu < real_ncpus; cpu++) {
1698 if (cpu == my_cpu || !cpu_is_running(cpu)) {
1699 continue;
1700 }
1701 ncpus++;
1702 i386_signal_cpu(cpu, MP_KDP, ASYNC);
1703 }
1704 /*
1705 * Wait other processors to synchronize
1706 */
1707 DBG("mp_kdp_enter() waiting for (%d) processors to suspend\n", ncpus);
1708
1709 /*
1710 * This timeout is rather arbitrary; we don't want to NMI
1711 * processors that are executing at potentially
1712 * "unsafe-to-interrupt" points such as the trampolines,
1713 * but neither do we want to lose state by waiting too long.
1714 */
1715 tsc_timeout = rdtsc64() + (LockTimeOutTSC);
1716
1717 while (mp_kdp_ncpus != ncpus && rdtsc64() < tsc_timeout) {
1718 /*
1719 * A TLB shootdown request may be pending--this would
1720 * result in the requesting processor waiting in
1721 * PMAP_UPDATE_TLBS() until this processor deals with it.
1722 * Process it, so it can now enter mp_kdp_wait()
1723 */
1724 handle_pending_TLB_flushes();
1725 cpu_pause();
1726 }
1727 /* If we've timed out, and some processor(s) are still unresponsive,
1728 * interrupt them with an NMI via the local APIC, iff a panic is
1729 * in progress.
1730 */
1731 if (panic_active()) {
1732 NMIPI_enable(TRUE);
1733 }
1734 if (mp_kdp_ncpus != ncpus) {
1735 unsigned int wait_cycles = 0;
1736 if (proceed_on_failure) {
1737 paniclog_append_noflush("mp_kdp_enter() timed-out on cpu %d, NMI-ing\n", my_cpu);
1738 } else {
1739 DBG("mp_kdp_enter() timed-out on cpu %d, NMI-ing\n", my_cpu);
1740 }
1741 for (cpu = 0; cpu < real_ncpus; cpu++) {
1742 if (cpu == my_cpu || !cpu_is_running(cpu)) {
1743 continue;
1744 }
1745 if (cpu_signal_pending(cpu, MP_KDP)) {
1746 cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
1747 cpu_NMI_interrupt(cpu);
1748 }
1749 }
1750 /* Wait again for the same timeout */
1751 tsc_timeout = rdtsc64() + (LockTimeOutTSC);
1752 while (mp_kdp_ncpus != ncpus && rdtsc64() < tsc_timeout) {
1753 handle_pending_TLB_flushes();
1754 cpu_pause();
1755 ++wait_cycles;
1756 }
1757 if (mp_kdp_ncpus != ncpus) {
1758 paniclog_append_noflush("mp_kdp_enter() NMI pending on cpus:");
1759 for (cpu = 0; cpu < real_ncpus; cpu++) {
1760 if (cpu_is_running(cpu) && !cpu_datap(cpu)->cpu_NMI_acknowledged) {
1761 paniclog_append_noflush(" %d", cpu);
1762 }
1763 }
1764 paniclog_append_noflush("\n");
1765 if (proceed_on_failure) {
1766 paniclog_append_noflush("mp_kdp_enter() timed-out during %s wait after NMI;"
1767 "expected %u acks but received %lu after %u loops in %llu ticks\n",
1768 (locked ? "locked" : "unlocked"), ncpus, mp_kdp_ncpus, wait_cycles, LockTimeOutTSC);
1769 } else {
1770 panic("mp_kdp_enter() timed-out during %s wait after NMI;"
1771 "expected %u acks but received %lu after %u loops in %llu ticks",
1772 (locked ? "locked" : "unlocked"), ncpus, mp_kdp_ncpus, wait_cycles, LockTimeOutTSC);
1773 }
1774 }
1775 }
1776 } else if (NMI_panic_reason != PTE_CORRUPTION) { /* In the pte corruption case, the detecting CPU has already NMIed other CPUs */
1777 for (cpu = 0; cpu < real_ncpus; cpu++) {
1778 if (cpu == my_cpu || !cpu_is_running(cpu)) {
1779 continue;
1780 }
1781 cpu_NMI_interrupt(cpu);
1782 }
1783 }
1784
1785 if (locked) {
1786 simple_unlock(&x86_topo_lock);
1787 }
1788
1789 DBG("mp_kdp_enter() %d processors done %s\n",
1790 (int)mp_kdp_ncpus, (mp_kdp_ncpus == ncpus) ? "OK" : "timed out");
1791
1792 postcode(MP_KDP_ENTER);
1793 }
1794
1795 boolean_t
mp_kdp_all_cpus_halted()1796 mp_kdp_all_cpus_halted()
1797 {
1798 unsigned int ncpus = 0, cpu = 0, my_cpu = 0;
1799
1800 my_cpu = cpu_number();
1801 ncpus = 1; /* current CPU */
1802 for (cpu = 0; cpu < real_ncpus; cpu++) {
1803 if (cpu == my_cpu || !cpu_is_running(cpu)) {
1804 continue;
1805 }
1806 ncpus++;
1807 }
1808
1809 return mp_kdp_ncpus == ncpus;
1810 }
1811
1812 static boolean_t
cpu_signal_pending(int cpu,mp_event_t event)1813 cpu_signal_pending(int cpu, mp_event_t event)
1814 {
1815 volatile int *signals = &cpu_datap(cpu)->cpu_signals;
1816 boolean_t retval = FALSE;
1817
1818 if (i_bit(event, signals)) {
1819 retval = TRUE;
1820 }
1821 return retval;
1822 }
1823
/*
 * Run func(arg0, arg1, lcpu) on logical cpu 'lcpu' from the debugger
 * context, spinning up to 'timeout' absolute-time units (0 == forever)
 * for the target cpu to service it in kdp_x86_xcpu_poll().
 * Returns the function's result, or -1 on bad cpu/func or timeout.
 */
long
kdp_x86_xcpu_invoke(const uint16_t lcpu, kdp_x86_xcpu_func_t func,
    void *arg0, void *arg1, uint64_t timeout)
{
	uint64_t now;

	if (lcpu > (real_ncpus - 1)) {
		return -1;
	}

	if (func == NULL) {
		return -1;
	}

	kdp_xcpu_call_func.func = func;
	kdp_xcpu_call_func.ret = -1;
	kdp_xcpu_call_func.arg0 = arg0;
	kdp_xcpu_call_func.arg1 = arg1;
	/* Setting .cpu last is what the target's poll loop matches on. */
	kdp_xcpu_call_func.cpu = lcpu;
	DBG("Invoking function %p on CPU %d\n", func, (int32_t)lcpu);
	now = mach_absolute_time();
	while (kdp_xcpu_call_func.cpu != KDP_XCPU_NONE &&
	    (timeout == 0 || (mach_absolute_time() - now) < timeout)) {
		cpu_pause();
	}
	return kdp_xcpu_call_func.ret;
}
1851
/*
 * Called from mp_kdp_wait(): if a kdp cross-cpu call is addressed to
 * this cpu, run it, publish the result, and clear the request.
 */
static void
kdp_x86_xcpu_poll(void)
{
	if ((uint16_t)cpu_number() == kdp_xcpu_call_func.cpu) {
		kdp_xcpu_call_func.ret =
		    kdp_xcpu_call_func.func(kdp_xcpu_call_func.arg0,
		    kdp_xcpu_call_func.arg1,
		    cpu_number());
		/* Clearing .cpu signals completion to kdp_x86_xcpu_invoke(). */
		kdp_xcpu_call_func.cpu = KDP_XCPU_NONE;
	}
}
1863
/*
 * Park this cpu while the debugger is active: bump mp_kdp_ncpus, spin
 * servicing TLB flushes (if 'flush') and kdp cross-cpu calls until
 * mp_kdp_trap clears, then decrement the count on the way out.
 * NOTE(review): when isNMI is TRUE the loop condition never clears —
 * the cpu spins here indefinitely; presumably by design for NMI entry.
 */
static void
mp_kdp_wait(boolean_t flush, boolean_t isNMI)
{
	DBG("mp_kdp_wait()\n");

	current_cpu_datap()->debugger_ipi_time = mach_absolute_time();
#if CONFIG_MCA
	/* If we've trapped due to a machine-check, save MCA registers */
	mca_check_save();
#endif

	atomic_incl((volatile long *)&mp_kdp_ncpus, 1);
	while (mp_kdp_trap || (isNMI == TRUE)) {
		/*
		 * A TLB shootdown request may be pending--this would result
		 * in the requesting processor waiting in PMAP_UPDATE_TLBS()
		 * until this processor handles it.
		 * Process it, so it can now enter mp_kdp_wait()
		 */
		if (flush) {
			handle_pending_TLB_flushes();
		}

		kdp_x86_xcpu_poll();
		cpu_pause();
	}

	atomic_decl((volatile long *)&mp_kdp_ncpus, 1);
	DBG("mp_kdp_wait() done\n");
}
1894
1895 void
mp_kdp_exit(void)1896 mp_kdp_exit(void)
1897 {
1898 DBG("mp_kdp_exit()\n");
1899 debugger_cpu = -1;
1900 atomic_decl((volatile long *)&mp_kdp_ncpus, 1);
1901
1902 debugger_exit_time = mach_absolute_time();
1903
1904 mp_kdp_trap = FALSE;
1905 mfence();
1906
1907 /* Wait other processors to stop spinning. XXX needs timeout */
1908 DBG("mp_kdp_exit() waiting for processors to resume\n");
1909 while (mp_kdp_ncpus > 0) {
1910 /*
1911 * a TLB shootdown request may be pending... this would result in the requesting
1912 * processor waiting in PMAP_UPDATE_TLBS() until this processor deals with it.
1913 * Process it, so it can now enter mp_kdp_wait()
1914 */
1915 handle_pending_TLB_flushes();
1916
1917 cpu_pause();
1918 }
1919
1920 if (pmsafe_debug && !kdp_snapshot) {
1921 pmSafeMode(¤t_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
1922 }
1923
1924 debugger_exit_time = mach_absolute_time();
1925
1926 DBG("mp_kdp_exit() done\n");
1927 (void) ml_set_interrupts_enabled(mp_kdp_state);
1928 postcode(MP_KDP_EXIT);
1929 }
1930
1931 #endif /* MACH_KDP */
1932
1933 boolean_t
mp_recent_debugger_activity(void)1934 mp_recent_debugger_activity(void)
1935 {
1936 uint64_t abstime = mach_absolute_time();
1937 return ((abstime - debugger_entry_time) < LastDebuggerEntryAllowance) ||
1938 ((abstime - debugger_exit_time) < LastDebuggerEntryAllowance);
1939 }
1940
/*ARGSUSED*/
void
init_ast_check(
	__unused processor_t processor)
{
	/* No per-processor AST-check initialization is needed on x86. */
}
1947
1948 void
cause_ast_check(processor_t processor)1949 cause_ast_check(
1950 processor_t processor)
1951 {
1952 int cpu = processor->cpu_id;
1953
1954 if (cpu != cpu_number()) {
1955 i386_signal_cpu(cpu, MP_AST, ASYNC);
1956 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), cpu, 1, 0, 0, 0);
1957 }
1958 }
1959
/*
 * Secondary-cpu bring-up: runs in process context with interrupts
 * disabled; cpu_machine_init() re-enables them.
 */
void
slave_machine_init(void *param)
{
	/*
	 * Here in process context, but with interrupts disabled.
	 */
	DBG("slave_machine_init() CPU%d\n", get_cpu_number());

	if (param == FULL_SLAVE_INIT) {
		/*
		 * Cold start
		 */
		clock_init();
	}
	cpu_machine_init(); /* Interrupts enabled hereafter */
}
1976
#undef cpu_number
/* Out-of-line cpu_number() for callers outside the macro's reach. */
int
cpu_number(void)
{
	return get_cpu_number();
}
1983
/* Base address of the current cpu's per-cpu data area. */
vm_offset_t
current_percpu_base(void)
{
	return get_current_percpu_base();
}
1989
/* Base address of another cpu's per-cpu data area. */
vm_offset_t
other_percpu_base(int cpu)
{
	return cpu_datap(cpu)->cpu_pcpu_base;
}
1995
1996 static void
cpu_prewarm_init()1997 cpu_prewarm_init()
1998 {
1999 int i;
2000
2001 simple_lock_init(&cpu_warm_lock, 0);
2002 queue_init(&cpu_warm_call_list);
2003 for (i = 0; i < NUM_CPU_WARM_CALLS; i++) {
2004 enqueue_head(&cpu_warm_call_list, (queue_entry_t)&cpu_warm_call_arr[i]);
2005 }
2006 }
2007
2008 static timer_call_t
grab_warm_timer_call()2009 grab_warm_timer_call()
2010 {
2011 spl_t x;
2012 timer_call_t call = NULL;
2013
2014 x = splsched();
2015 simple_lock(&cpu_warm_lock, LCK_GRP_NULL);
2016 if (!queue_empty(&cpu_warm_call_list)) {
2017 call = (timer_call_t) dequeue_head(&cpu_warm_call_list);
2018 }
2019 simple_unlock(&cpu_warm_lock);
2020 splx(x);
2021
2022 return call;
2023 }
2024
2025 static void
free_warm_timer_call(timer_call_t call)2026 free_warm_timer_call(timer_call_t call)
2027 {
2028 spl_t x;
2029
2030 x = splsched();
2031 simple_lock(&cpu_warm_lock, LCK_GRP_NULL);
2032 enqueue_head(&cpu_warm_call_list, (queue_entry_t)call);
2033 simple_unlock(&cpu_warm_lock);
2034 splx(x);
2035 }
2036
2037 /*
2038 * Runs in timer call context (interrupts disabled).
2039 */
2040 static void
cpu_warm_timer_call_func(timer_call_param_t p0,__unused timer_call_param_t p1)2041 cpu_warm_timer_call_func(
2042 timer_call_param_t p0,
2043 __unused timer_call_param_t p1)
2044 {
2045 free_warm_timer_call((timer_call_t)p0);
2046 return;
2047 }
2048
2049 /*
2050 * Runs with interrupts disabled on the CPU we wish to warm (i.e. CPU 0).
2051 */
2052 static void
_cpu_warm_setup(void * arg)2053 _cpu_warm_setup(
2054 void *arg)
2055 {
2056 cpu_warm_data_t cwdp = (cpu_warm_data_t)arg;
2057
2058 timer_call_enter(cwdp->cwd_call, cwdp->cwd_deadline, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL);
2059 cwdp->cwd_result = 0;
2060
2061 return;
2062 }
2063
2064 /*
2065 * Not safe to call with interrupts disabled.
2066 */
2067 kern_return_t
ml_interrupt_prewarm(uint64_t deadline)2068 ml_interrupt_prewarm(
2069 uint64_t deadline)
2070 {
2071 struct cpu_warm_data cwd;
2072 timer_call_t call;
2073 cpu_t ct;
2074
2075 if (ml_get_interrupts_enabled() == FALSE) {
2076 panic("%s: Interrupts disabled?", __FUNCTION__);
2077 }
2078
2079 /*
2080 * If the platform doesn't need our help, say that we succeeded.
2081 */
2082 if (!ml_get_interrupt_prewake_applicable()) {
2083 return KERN_SUCCESS;
2084 }
2085
2086 /*
2087 * Grab a timer call to use.
2088 */
2089 call = grab_warm_timer_call();
2090 if (call == NULL) {
2091 return KERN_RESOURCE_SHORTAGE;
2092 }
2093
2094 timer_call_setup(call, cpu_warm_timer_call_func, call);
2095 cwd.cwd_call = call;
2096 cwd.cwd_deadline = deadline;
2097 cwd.cwd_result = 0;
2098
2099 /*
2100 * For now, non-local interrupts happen on the master processor.
2101 */
2102 ct = mp_cpus_call(cpu_to_cpumask(master_cpu), SYNC, _cpu_warm_setup, &cwd);
2103 if (ct == 0) {
2104 free_warm_timer_call(call);
2105 return KERN_FAILURE;
2106 } else {
2107 return cwd.cwd_result;
2108 }
2109 }
2110
2111 #if DEBUG || DEVELOPMENT
2112 void
kernel_spin(uint64_t spin_ns)2113 kernel_spin(uint64_t spin_ns)
2114 {
2115 boolean_t istate;
2116 uint64_t spin_abs;
2117 uint64_t deadline;
2118 cpu_data_t *cdp;
2119
2120 kprintf("kernel_spin(%llu) spinning uninterruptibly\n", spin_ns);
2121 istate = ml_set_interrupts_enabled(FALSE);
2122 cdp = current_cpu_datap();
2123 nanoseconds_to_absolutetime(spin_ns, &spin_abs);
2124
2125 /* Fake interrupt handler entry for testing mp_interrupt_watchdog() */
2126 cdp->cpu_int_event_time = mach_absolute_time();
2127 cdp->cpu_int_state = (void *) USER_STATE(current_thread());
2128
2129 deadline = mach_absolute_time() + spin_ns;
2130 while (mach_absolute_time() < deadline) {
2131 cpu_pause();
2132 }
2133
2134 cdp->cpu_int_event_time = 0;
2135 cdp->cpu_int_state = NULL;
2136
2137 ml_set_interrupts_enabled(istate);
2138 kprintf("kernel_spin() continuing\n");
2139 }
2140
2141 /*
2142 * Called from the scheduler's maintenance thread,
2143 * scan running processors for long-running ISRs and:
2144 * - panic if longer than LockTimeOut, or
2145 * - log if more than a quantum.
2146 */
2147 void
mp_interrupt_watchdog(void)2148 mp_interrupt_watchdog(void)
2149 {
2150 cpu_t cpu;
2151 boolean_t intrs_enabled = FALSE;
2152 uint16_t cpu_int_num;
2153 uint64_t cpu_int_event_time;
2154 uint64_t cpu_rip;
2155 uint64_t cpu_int_duration;
2156 uint64_t now;
2157 x86_saved_state_t *cpu_int_state;
2158
2159 if (__improbable(!mp_interrupt_watchdog_enabled)) {
2160 return;
2161 }
2162
2163 intrs_enabled = ml_set_interrupts_enabled(FALSE);
2164 now = mach_absolute_time();
2165 /*
2166 * While timeouts are not suspended,
2167 * check all other processors for long outstanding interrupt handling.
2168 */
2169 for (cpu = 0;
2170 cpu < (cpu_t) real_ncpus && !machine_timeout_suspended();
2171 cpu++) {
2172 if ((cpu == (cpu_t) cpu_number()) ||
2173 (!cpu_is_running(cpu))) {
2174 continue;
2175 }
2176 cpu_int_event_time = cpu_datap(cpu)->cpu_int_event_time;
2177 if (cpu_int_event_time == 0) {
2178 continue;
2179 }
2180 if (__improbable(now < cpu_int_event_time)) {
2181 continue; /* skip due to inter-processor skew */
2182 }
2183 cpu_int_state = cpu_datap(cpu)->cpu_int_state;
2184 if (__improbable(cpu_int_state == NULL)) {
2185 /* The interrupt may have been dismissed */
2186 continue;
2187 }
2188
2189 /* Here with a cpu handling an interrupt */
2190
2191 cpu_int_duration = now - cpu_int_event_time;
2192 if (__improbable(cpu_int_duration > LockTimeOut)) {
2193 cpu_int_num = saved_state64(cpu_int_state)->isf.trapno;
2194 cpu_rip = saved_state64(cpu_int_state)->isf.rip;
2195 vector_timed_out = cpu_int_num;
2196 NMIPI_panic(cpu_to_cpumask(cpu), INTERRUPT_WATCHDOG);
2197 panic("Interrupt watchdog, "
2198 "cpu: %d interrupt: 0x%x time: %llu..%llu state: %p RIP: 0x%llx",
2199 cpu, cpu_int_num, cpu_int_event_time, now, cpu_int_state, cpu_rip);
2200 /* NOT REACHED */
2201 } else if (__improbable(cpu_int_duration > (uint64_t) std_quantum)) {
2202 mp_interrupt_watchdog_events++;
2203 cpu_int_num = saved_state64(cpu_int_state)->isf.trapno;
2204 cpu_rip = saved_state64(cpu_int_state)->isf.rip;
2205 ml_set_interrupts_enabled(intrs_enabled);
2206 printf("Interrupt watchdog, "
2207 "cpu: %d interrupt: 0x%x time: %llu..%llu RIP: 0x%llx\n",
2208 cpu, cpu_int_num, cpu_int_event_time, now, cpu_rip);
2209 return;
2210 }
2211 }
2212
2213 ml_set_interrupts_enabled(intrs_enabled);
2214 }
2215 #endif
2216