/*
 * Copyright (c) 2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#ifdef __x86_64__
#error This file is only needed on weakly-ordered systems!
#endif

#include <machine/atomic.h>
#include <machine/commpage.h>
#include <machine/machine_cpu.h>

#include <kern/sched_prim.h>
#include <kern/percpu.h>
#include <kern/ast.h>

#include <kern/cpu_quiesce.h>

/*
 * CPU quiescing generation counter implemented with a checkin mask
 *
 * A tri-state bitfield, with 2 bits for each processor:
 * 1) 'checkin' bit, saying this processor has 'checked in', i.e. executed the acqrel barrier
 * 2) 'expected' bit, saying this processor is expected to check in, i.e. not idle.
 *
 * When a processor causes the 'expected' bits to equal the 'checkin' bits, which
 * indicates that all processors have executed the barrier, it ticks the algorithm
 * and resets the state.
 *
 * Idle CPUs won't check in, because they don't run, so the algorithm won't tick.
 * However, they can't do anything in userspace while idle, so we don't need
 * them to execute barriers, so we have them 'leave' the counter so that
 * they don't delay the tick while idle.
 *
 * This bitfield currently limits MAX_CPUS to 32 on LP64.
 * In the future, we can use double-wide atomics and int128 if we need 64 CPUs.
 *
 * The mask only guarantees ordering to code running in userspace.
 * We defer joining the counter until we actually reach userspace, allowing
 * processors that come out of idle and only run kernel code to avoid the overhead
 * of participation.
 *
 * We additionally defer updating the counter for a minimum interval to
 * reduce the frequency of executing the exclusive atomic operations.
 *
 * The longest delay between two checkins assuming that at least one processor
 * joins is <checkin delay> + (<thread quantum> * 2)
 */
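
/*
 * Illustrative example (added for exposition, assuming a 2-CPU configuration):
 *
 *   cpu_checked_in_bit(0) == 0x1    cpu_expected_bit(0) == 0x2
 *   cpu_checked_in_bit(1) == 0x4    cpu_expected_bit(1) == 0x8
 *
 * With both CPUs participating, the 'expected' bits are 0xA.  Once both have
 * checked in, the state is 0xF and the commit condition below holds:
 *
 *   (0xF & CPU_CHECKIN_MASK) == 0x5 == ((0xF & CPU_EXPECTED_MASK) >> 1)
 *
 * so whichever CPU performed the last check-in ticks the counter and clears
 * the 'checkin' bits, leaving the state at 0xA for the next round.
 */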

typedef unsigned long checkin_mask_t;

static _Atomic checkin_mask_t cpu_quiescing_checkin_state;

static uint64_t cpu_checkin_last_commit;

struct cpu_quiesce {
        cpu_quiescent_state_t   state;
        uint64_t                last_checkin;
};

static struct cpu_quiesce PERCPU_DATA(cpu_quiesce);

#define CPU_CHECKIN_MIN_INTERVAL_US     4000            /* 4ms */
#define CPU_CHECKIN_MIN_INTERVAL_MAX_US USEC_PER_SEC    /* 1s */
static uint64_t cpu_checkin_min_interval;
static uint32_t cpu_checkin_min_interval_us;

#if __LP64__
#define CPU_CHECKIN_MASK_MAX_CPUS       32
#define CPU_CHECKIN_MASK                0x5555555555555555UL
#define CPU_EXPECTED_MASK               (~CPU_CHECKIN_MASK)
#else
/* Avoid double-wide CAS on 32-bit platforms by using a 32-bit state and mask */
#define CPU_CHECKIN_MASK_MAX_CPUS       16
#define CPU_CHECKIN_MASK                0x55555555UL
#define CPU_EXPECTED_MASK               (~CPU_CHECKIN_MASK)
#endif

static_assert(MAX_CPUS <= CPU_CHECKIN_MASK_MAX_CPUS);
static_assert(CPU_CHECKIN_MASK == CPU_EXPECTED_MASK >> 1);

static inline checkin_mask_t
cpu_checked_in_bit(int cpuid)
{
        return 1UL << (2 * cpuid);
}

static inline checkin_mask_t
cpu_expected_bit(int cpuid)
{
        return 1UL << (2 * cpuid + 1);
}

void
cpu_quiescent_counter_init(void)
{
        assert(CPU_CHECKIN_MASK & cpu_checked_in_bit(MAX_CPUS - 1));
        assert(CPU_EXPECTED_MASK & cpu_expected_bit(MAX_CPUS - 1));
        assert((CPU_CHECKIN_MASK & cpu_expected_bit(MAX_CPUS - 1)) == 0);
        assert((CPU_EXPECTED_MASK & cpu_checked_in_bit(MAX_CPUS - 1)) == 0);

        cpu_quiescent_counter_set_min_interval_us(CPU_CHECKIN_MIN_INTERVAL_US);
}

void
cpu_quiescent_counter_set_min_interval_us(uint32_t new_value_us)
{
        /* clamp to something vaguely sane */
        if (new_value_us > CPU_CHECKIN_MIN_INTERVAL_MAX_US) {
                new_value_us = CPU_CHECKIN_MIN_INTERVAL_MAX_US;
        }

        cpu_checkin_min_interval_us = new_value_us;

        uint64_t abstime = 0;
        clock_interval_to_absolutetime_interval(cpu_checkin_min_interval_us,
            NSEC_PER_USEC, &abstime);
        cpu_checkin_min_interval = abstime;
}

uint32_t
cpu_quiescent_counter_get_min_interval_us(void)
{
        return cpu_checkin_min_interval_us;
}

/*
 * Called when all running CPUs have checked in.
 *
 * The commpage increment is protected by the 'lock' of having caused the tick,
 * and it is published by the state reset release barrier.
 */
static void
cpu_quiescent_counter_commit(uint64_t ctime)
{
        __kdebug_only uint64_t          old_gen;
        __kdebug_only checkin_mask_t    old_state;

        old_gen = commpage_increment_cpu_quiescent_counter();

        cpu_checkin_last_commit = ctime;

        old_state = os_atomic_andnot(&cpu_quiescing_checkin_state, CPU_CHECKIN_MASK, release);

        KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUIESCENT_COUNTER), old_gen, old_state, ctime, 0);
}

/*
 * Have all the expected CPUs checked in?
 */
static bool
cpu_quiescent_counter_needs_commit(checkin_mask_t state)
{
        return (state & CPU_CHECKIN_MASK) == ((state & CPU_EXPECTED_MASK) >> 1);
}
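
/*
 * Per-CPU state machine driven by the entry points below (a condensed
 * overview of their individual header comments):
 *
 *   CPU_QUIESCE_COUNTER_NONE / _LEFT
 *       --cpu_quiescent_counter_join()-->   CPU_QUIESCE_COUNTER_PENDING_JOIN
 *       --cpu_quiescent_counter_ast()--->   CPU_QUIESCE_COUNTER_JOINED
 *       --cpu_quiescent_counter_leave()->   CPU_QUIESCE_COUNTER_LEFT
 *
 * join() is called when a processor stops being idle, ast() runs at the
 * userspace boundary and makes the processor 'expected', checkin() is called
 * periodically while joined, and leave() is called on the way to idle or
 * shutdown.  leave() can also be called directly from PENDING_JOIN if the
 * processor never reached userspace.
 */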

/*
 * Called when a processor wants to start participating in the counter, e.g.
 * 1) when context switching away from the idle thread
 * 2) when coming up for the first time
 * 3) when coming up after a shutdown
 *
 * Called with interrupts disabled.
 */
void
cpu_quiescent_counter_join(__unused uint64_t ctime)
{
        struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
        __assert_only int cpuid = cpu_number();

        assert(cpuid < MAX_CPUS);

        assert(st->state == CPU_QUIESCE_COUNTER_NONE ||
            st->state == CPU_QUIESCE_COUNTER_LEFT);

        assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
            (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);

        st->state = CPU_QUIESCE_COUNTER_PENDING_JOIN;

        /*
         * Mark the processor to call cpu_quiescent_counter_ast before it
         * ever returns to userspace.
         */
        ast_on(AST_UNQUIESCE);
}

/*
 * Called with interrupts disabled from the userspace boundary at the AST_UNQUIESCE callback.
 * It needs to acquire the counter to see data and the counter published by other CPUs.
 */
void
cpu_quiescent_counter_ast(void)
{
        struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
        int cpuid = cpu_number();

        assert(st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN);

        /* We had better not already be joined. */
        assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
            (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);

        /*
         * No release barrier needed because we have no prior state to publish.
         * Acquire barrier needed because we need this processor to see
         * the latest counter value.
         *
         * The state may be in 'needs checkin' both before and after
         * this atomic or.
         *
         * Additionally, if this is the first processor to come out of idle,
         * it may need to kickstart the algorithm, otherwise it would
         * stay in 'needs commit' perpetually with no processor assigned to
         * actually do the commit.  To do that, the first processor only adds
         * its expected bit.
         */

        st->state = CPU_QUIESCE_COUNTER_JOINED;
        st->last_checkin = mach_absolute_time();

        checkin_mask_t old_mask, new_mask;
        os_atomic_rmw_loop(&cpu_quiescing_checkin_state, old_mask, new_mask, acquire, {
                if (old_mask == 0) {
                        new_mask = old_mask | cpu_expected_bit(cpuid);
                } else {
                        new_mask = old_mask | cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid);
                }
        });
}

/*
 * Called when a processor no longer wants to participate in the counter,
 * i.e. when a processor is on its way to idle or shutdown.
 *
 * Called with interrupts disabled.
 *
 * The processor needs to remove itself from the expected mask, to allow the
 * algorithm to continue ticking without its participation.
 * However, it needs to ensure that anything it has done since the last time
 * it checked in has been published before the next tick is allowed to commit.
 */
void
cpu_quiescent_counter_leave(uint64_t ctime)
{
        struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
        int cpuid = cpu_number();

        assert(st->state == CPU_QUIESCE_COUNTER_JOINED ||
            st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN);

        /* We no longer need the cpu_quiescent_counter_ast callback to be armed */
        ast_off(AST_UNQUIESCE);

        if (st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN) {
                /* We never actually joined, so we don't have to do the work to leave. */
                st->state = CPU_QUIESCE_COUNTER_LEFT;
                return;
        }

        /* Leaving can't be deferred, even if we're within the min interval */
        st->last_checkin = ctime;

        checkin_mask_t mask = cpu_checked_in_bit(cpuid) | cpu_expected_bit(cpuid);

        checkin_mask_t orig_state = os_atomic_andnot_orig(&cpu_quiescing_checkin_state,
            mask, acq_rel);

        assert((orig_state & cpu_expected_bit(cpuid)));

        st->state = CPU_QUIESCE_COUNTER_LEFT;

        if (cpu_quiescent_counter_needs_commit(orig_state)) {
                /*
                 * The old state indicates someone else was already doing a commit
                 * but hadn't finished yet.  We successfully inserted the acq_rel
                 * before they finished the commit by resetting the bitfield,
                 * so we're done here.
                 */
                return;
        }

        checkin_mask_t new_state = orig_state & ~mask;

        if (cpu_quiescent_counter_needs_commit(new_state)) {
                cpu_quiescent_counter_commit(ctime);
        }
}
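
/*
 * Illustrative example of the two paths at the end of cpu_quiescent_counter_leave()
 * (added for exposition, assuming a 2-CPU configuration):
 *
 * - CPU1 is joined and has checked in, CPU0 is joined but has not, so the
 *   state is 0xE.  When CPU0 leaves, orig_state == 0xE does not satisfy the
 *   commit condition (checkin bits 0x4 != expected bits 0xA >> 1), but
 *   new_state == 0xC does (0x4 == 0x8 >> 1): the departing CPU0 was the last
 *   holdout and performs the commit itself.
 *
 * - If instead orig_state already satisfied the commit condition, some other
 *   CPU's check-in completed the round and that CPU is responsible for the
 *   commit; the leaving CPU's acq_rel removal is ordered before that commit's
 *   state reset, so it can simply return.
 */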

/*
 * Called when a processor wants to check in to the counter.
 * If it hasn't yet fully joined, it doesn't need to check in.
 *
 * Called with interrupts disabled.
 */
void
cpu_quiescent_counter_checkin(uint64_t ctime)
{
        struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
        int cpuid = cpu_number();

        assert(st->state != CPU_QUIESCE_COUNTER_NONE);

        /* If we're not joined yet, we don't need to check in */
        if (__probable(st->state != CPU_QUIESCE_COUNTER_JOINED)) {
                return;
        }

        /* If we've checked in recently, we don't need to check in yet. */
        if (__probable((ctime - st->last_checkin) <= cpu_checkin_min_interval)) {
                return;
        }

        st->last_checkin = ctime;

        checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);

        assert((state & cpu_expected_bit(cpuid)));

        if (__probable((state & cpu_checked_in_bit(cpuid)))) {
                /*
                 * Processor has already checked in for this round, no need to
                 * acquire the cacheline exclusive.
                 */
                return;
        }

        checkin_mask_t orig_state = os_atomic_or_orig(&cpu_quiescing_checkin_state,
            cpu_checked_in_bit(cpuid), acq_rel);

        checkin_mask_t new_state = orig_state | cpu_checked_in_bit(cpuid);

        if (cpu_quiescent_counter_needs_commit(new_state)) {
                assertf(!cpu_quiescent_counter_needs_commit(orig_state),
                    "old: 0x%lx, new: 0x%lx", orig_state, new_state);
                cpu_quiescent_counter_commit(ctime);
        }
}

#if MACH_ASSERT
/*
 * Called on all AST exits to userspace to assert this processor actually joined
 *
 * Called with interrupts disabled after the AST should have been handled
 */
void
cpu_quiescent_counter_assert_ast(void)
{
        struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
        int cpuid = cpu_number();

        assert(st->state == CPU_QUIESCE_COUNTER_JOINED);

        checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);

        assert((state & cpu_expected_bit(cpuid)));
}
#endif /* MACH_ASSERT */
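
/*
 * Illustrative sketch (not part of the interface above): a hypothetical
 * debugging helper that decodes the checkin state per CPU.  The name and the
 * kprintf output format are invented for exposition, so it is kept compiled out.
 */
#if 0
static void
cpu_quiescent_counter_dump_state(void)
{
        checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);

        for (int cpuid = 0; cpuid < MAX_CPUS; cpuid++) {
                /* Each CPU occupies two adjacent bits: 'checked in' and 'expected'. */
                bool expected   = (state & cpu_expected_bit(cpuid)) != 0;
                bool checked_in = (state & cpu_checked_in_bit(cpuid)) != 0;
                kprintf("cpu %d: expected=%d checked_in=%d\n",
                    cpuid, expected, checked_in);
        }
}
#endif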