/*
 * Copyright (c) 2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#ifdef __x86_64__
#error This file is only needed on weakly-ordered systems!
#endif

#include <machine/atomic.h>
#include <machine/commpage.h>
#include <machine/machine_cpu.h>

#include <kern/sched_prim.h>
#include <kern/percpu.h>
#include <kern/ast.h>

#include <kern/cpu_quiesce.h>

/*
 * CPU quiescing generation counter implemented with a checkin mask
 *
 * A tri-state bitfield, with 2 bits for each processor:
 * 1) 'checkin' bit, saying this processor has 'checked in', i.e. executed the acq_rel barrier
 * 2) 'expected' bit, saying this processor is expected to check in, i.e. not idle.
 *
 * When a processor causes the 'expected' bits to equal the 'checkin' bits, which
 * indicates that all processors have executed the barrier, it ticks the algorithm
 * and resets the state.
 *
 * Idle CPUs won't check in, because they don't run, so the algorithm won't tick.
 * However, they can't do anything in userspace while idle, so we don't need
 * them to execute barriers; instead, we have them 'leave' the counter so that
 * they don't delay the tick while idle.
 *
 * This bitfield currently limits MAX_CPUS to 32 on LP64.
 * In the future, we can use double-wide atomics and int128 if we need 64 CPUs.
 *
 * The mask only guarantees ordering to code running in userspace.
 * We defer joining the counter until we actually reach userspace, allowing
 * processors that come out of idle and only run kernel code to avoid the overhead
 * of participation.
 *
 * We additionally defer updating the counter for a minimum interval to
 * reduce the frequency of executing the exclusive atomic operations.
 *
 * The longest delay between two checkins, assuming that at least one processor
 * joins, is <checkin delay> + (<thread quantum> * 2).
 */
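
/*
 * Layout sketch (illustrative, for a hypothetical 3-CPU configuration):
 *
 *   cpu 0: checked-in bit = 1 << 0, expected bit = 1 << 1
 *   cpu 1: checked-in bit = 1 << 2, expected bit = 1 << 3
 *   cpu 2: checked-in bit = 1 << 4, expected bit = 1 << 5
 *
 * With cpus 0 and 2 participating and cpu 1 idle (left), the state
 * 0b100010 means neither participant has checked in yet.  Once both have
 * executed their barrier, the state is 0b110011: the checked-in bits equal
 * the expected bits shifted right by one, so the CPU that completed the
 * mask commits the tick and clears the checked-in bits back to 0b100010.
 */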

typedef unsigned long checkin_mask_t;

static _Atomic checkin_mask_t cpu_quiescing_checkin_state;

static uint64_t cpu_checkin_last_commit;

struct cpu_quiesce {
	cpu_quiescent_state_t   state;
	uint64_t                last_checkin;
};

static struct cpu_quiesce PERCPU_DATA(cpu_quiesce);

#define CPU_CHECKIN_MIN_INTERVAL_US     4000 /* 4ms */
#define CPU_CHECKIN_MIN_INTERVAL_MAX_US USEC_PER_SEC /* 1s */
static uint64_t cpu_checkin_min_interval;
static uint32_t cpu_checkin_min_interval_us;

#if __LP64__
#define CPU_CHECKIN_MASK_MAX_CPUS 32
#define CPU_CHECKIN_MASK          0x5555555555555555UL
#define CPU_EXPECTED_MASK         (~CPU_CHECKIN_MASK)
#else
/* Avoid double-wide CAS on 32-bit platforms by using a 32-bit state and mask */
#define CPU_CHECKIN_MASK_MAX_CPUS 16
#define CPU_CHECKIN_MASK          0x55555555UL
#define CPU_EXPECTED_MASK         (~CPU_CHECKIN_MASK)
#endif

static_assert(MAX_CPUS <= CPU_CHECKIN_MASK_MAX_CPUS);
static_assert(CPU_CHECKIN_MASK == CPU_EXPECTED_MASK >> 1);

static inline checkin_mask_t
cpu_checked_in_bit(int cpuid)
{
	return 1UL << (2 * cpuid);
}

static inline checkin_mask_t
cpu_expected_bit(int cpuid)
{
	return 1UL << (2 * cpuid + 1);
}

void
cpu_quiescent_counter_init(void)
{
	assert(CPU_CHECKIN_MASK & cpu_checked_in_bit(MAX_CPUS - 1));
	assert(CPU_EXPECTED_MASK & cpu_expected_bit(MAX_CPUS - 1));
	assert((CPU_CHECKIN_MASK & cpu_expected_bit(MAX_CPUS - 1)) == 0);
	assert((CPU_EXPECTED_MASK & cpu_checked_in_bit(MAX_CPUS - 1)) == 0);

	cpu_quiescent_counter_set_min_interval_us(CPU_CHECKIN_MIN_INTERVAL_US);
}

void
cpu_quiescent_counter_set_min_interval_us(uint32_t new_value_us)
{
	/* clamp to something vaguely sane */
	if (new_value_us > CPU_CHECKIN_MIN_INTERVAL_MAX_US) {
		new_value_us = CPU_CHECKIN_MIN_INTERVAL_MAX_US;
	}

	cpu_checkin_min_interval_us = new_value_us;

	uint64_t abstime = 0;
	clock_interval_to_absolutetime_interval(cpu_checkin_min_interval_us,
	    NSEC_PER_USEC, &abstime);
	cpu_checkin_min_interval = abstime;
}

uint32_t
cpu_quiescent_counter_get_min_interval_us(void)
{
	return cpu_checkin_min_interval_us;
}


/*
 * Called when all running CPUs have checked in.
 *
 * The commpage increment is protected by the 'lock' of having caused the tick,
 * and it is published by the state reset release barrier.
 */
static void
cpu_quiescent_counter_commit(uint64_t ctime)
{
	__kdebug_only uint64_t          old_gen;
	__kdebug_only checkin_mask_t    old_state;

	old_gen = commpage_increment_cpu_quiescent_counter();

	cpu_checkin_last_commit = ctime;

	old_state = os_atomic_andnot(&cpu_quiescing_checkin_state, CPU_CHECKIN_MASK, release);

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUIESCENT_COUNTER), old_gen, old_state, ctime, 0);
}

/*
 * Have all the expected CPUs checked in?
 */
static bool
cpu_quiescent_counter_needs_commit(checkin_mask_t state)
{
	return (state & CPU_CHECKIN_MASK) == ((state & CPU_EXPECTED_MASK) >> 1);
}
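
/*
 * Worked example (illustrative values, assuming only cpus 0 and 1 are
 * expected): for state 0b1110 the checked-in bits are 0b0100 while the
 * expected bits shifted right are 0b0101, so cpu 0 still owes a checkin.
 * Once cpu 0 checks in, state 0b1111 satisfies the predicate and the CPU
 * whose atomic op produced that state performs the commit.  This is also
 * why cpu_quiescent_counter_ast() below has the first CPU out of idle set
 * only its 'expected' bit: setting both bits in an otherwise-empty mask
 * would create a 'needs commit' state with no CPU on the hook to commit it.
 */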

/*
 * Called when a processor wants to start participating in the counter, e.g.
 * 1) when context switching away from the idle thread
 * 2) when coming up for the first time
 * 3) when coming up after a shutdown
 *
 * Called with interrupts disabled.
 */
void
cpu_quiescent_counter_join(__unused uint64_t ctime)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	__assert_only int cpuid = cpu_number();

	assert(cpuid < MAX_CPUS);
	assert(st->state == CPU_QUIESCE_COUNTER_NONE ||
	    st->state == CPU_QUIESCE_COUNTER_LEFT);

	assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
	    (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);

	st->state = CPU_QUIESCE_COUNTER_PENDING_JOIN;

	/*
	 * Mark the processor to call cpu_quiescent_counter_ast before it
	 * ever returns to userspace.
	 */
	ast_on(AST_UNQUIESCE);
}

/*
 * Called with interrupts disabled from the userspace boundary, via the AST_UNQUIESCE callback.
 * It needs an acquire barrier so that it observes the data and counter values published by other CPUs.
 */
void
cpu_quiescent_counter_ast(void)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	int cpuid = cpu_number();

	assert(st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN);

	/* We had better not already be joined. */
	assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
	    (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);

	/*
	 * No release barrier needed because we have no prior state to publish.
	 * Acquire barrier needed because we need this processor to see
	 * the latest counter value.
	 *
	 * The state may be in 'needs checkin' both before and after
	 * this atomic or.
	 *
	 * Additionally, if this is the first processor to come out of idle,
	 * it may need to kickstart the algorithm, otherwise it would
	 * stay in 'needs commit' perpetually with no processor assigned to
	 * actually do the commit.  To do that, the first processor only adds
	 * its expected bit.
	 */

	st->state = CPU_QUIESCE_COUNTER_JOINED;
	st->last_checkin = mach_absolute_time();

	checkin_mask_t old_mask, new_mask;
	os_atomic_rmw_loop(&cpu_quiescing_checkin_state, old_mask, new_mask, acquire, {
		if (old_mask == 0) {
		        new_mask = old_mask | cpu_expected_bit(cpuid);
		} else {
		        new_mask = old_mask | cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid);
		}
	});
}
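
/*
 * Illustrative interleaving (hypothetical mask values): if every CPU was
 * idle and the mask is empty, the first joiner sets only its expected bit
 * (0 -> 0b10 for cpu 0), so its own later checkin (0b10 -> 0b11) is what
 * ticks the counter.  A CPU joining a non-empty mask sets both of its bits
 * (e.g. 0b10 -> 0b1110 for cpu 1), counting as already checked in for the
 * round that is in progress.
 */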

/*
 * Called when a processor no longer wants to participate in the counter,
 * i.e. when a processor is on its way to idle or shutdown.
 *
 * Called with interrupts disabled.
 *
 * The processor needs to remove itself from the expected mask, to allow the
 * algorithm to continue ticking without its participation.
 * However, it needs to ensure that anything it has done since the last time
 * it checked in has been published before the next tick is allowed to commit.
 */
void
cpu_quiescent_counter_leave(uint64_t ctime)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	int cpuid = cpu_number();

	assert(st->state == CPU_QUIESCE_COUNTER_JOINED ||
	    st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN);

	/* We no longer need the cpu_quiescent_counter_ast callback to be armed */
	ast_off(AST_UNQUIESCE);

	if (st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN) {
		/* We never actually joined, so we don't have to do the work to leave. */
		st->state = CPU_QUIESCE_COUNTER_LEFT;
		return;
	}

	/* Leaving can't be deferred, even if we're within the min interval */
	st->last_checkin = ctime;

	checkin_mask_t mask = cpu_checked_in_bit(cpuid) | cpu_expected_bit(cpuid);

	checkin_mask_t orig_state = os_atomic_andnot_orig(&cpu_quiescing_checkin_state,
	    mask, acq_rel);

	assert((orig_state & cpu_expected_bit(cpuid)));

	st->state = CPU_QUIESCE_COUNTER_LEFT;

	if (cpu_quiescent_counter_needs_commit(orig_state)) {
		/*
		 * the old state indicates someone else was already doing a commit
		 * but hadn't finished yet.  We successfully inserted the acq_rel
		 * before they finished the commit by resetting the bitfield,
		 * so we're done here.
		 */
		return;
	}

	checkin_mask_t new_state = orig_state & ~mask;

	if (cpu_quiescent_counter_needs_commit(new_state)) {
		cpu_quiescent_counter_commit(ctime);
	}
}
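
/*
 * Illustrative race (hypothetical two-CPU mask): with both CPUs joined and
 * checked in (state 0b1111), cpu 1 is the one whose checkin satisfied
 * cpu_quiescent_counter_needs_commit() and is still inside
 * cpu_quiescent_counter_commit().  If cpu 0 leaves now, its andnot observes
 * orig_state 0b1111, which still "needs commit", so cpu 0 returns without
 * committing: cpu 1's in-flight commit covers the tick, and cpu 0's acq_rel
 * removal landed before cpu 1 reset the checkin bits, so cpu 0's prior work
 * is published ahead of that reset.
 */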

/*
 * Called when a processor wants to check in to the counter.
 * If it hasn't yet fully joined, it doesn't need to check in.
 *
 * Called with interrupts disabled.
 */
void
cpu_quiescent_counter_checkin(uint64_t ctime)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	int cpuid = cpu_number();

	assert(st->state != CPU_QUIESCE_COUNTER_NONE);

	/* If we're not joined yet, we don't need to check in */
	if (__probable(st->state != CPU_QUIESCE_COUNTER_JOINED)) {
		return;
	}

	/* If we've checked in recently, we don't need to check in yet. */
	if (__probable((ctime - st->last_checkin) <= cpu_checkin_min_interval)) {
		return;
	}

	st->last_checkin = ctime;

	checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);

	assert((state & cpu_expected_bit(cpuid)));

	if (__probable((state & cpu_checked_in_bit(cpuid)))) {
		/*
		 * Processor has already checked in for this round, no need to
		 * acquire the cacheline exclusive.
		 */
		return;
	}

	checkin_mask_t orig_state = os_atomic_or_orig(&cpu_quiescing_checkin_state,
	    cpu_checked_in_bit(cpuid), acq_rel);

	checkin_mask_t new_state = orig_state | cpu_checked_in_bit(cpuid);

	if (cpu_quiescent_counter_needs_commit(new_state)) {
		assertf(!cpu_quiescent_counter_needs_commit(orig_state),
		    "old: 0x%lx, new: 0x%lx", orig_state, new_state);
		cpu_quiescent_counter_commit(ctime);
	}
}
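
/*
 * Typical lifecycle, as a sketch (simplified; the real call sites live in
 * the scheduler and idle paths, not in this file):
 *
 *	cpu_quiescent_counter_join(ctime);     // leaving idle / coming up
 *	...                                    // AST_UNQUIESCE fires cpu_quiescent_counter_ast()
 *	cpu_quiescent_counter_checkin(ctime);  // called periodically while running threads
 *	cpu_quiescent_counter_leave(ctime);    // heading back to idle or shutdown
 */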

#if MACH_ASSERT
/*
 * Called on all AST exits to userspace to assert this processor actually joined
 *
 * Called with interrupts disabled after the AST should have been handled
 */
void
cpu_quiescent_counter_assert_ast(void)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	int cpuid = cpu_number();

	assert(st->state == CPU_QUIESCE_COUNTER_JOINED);

	checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);
	assert((state & cpu_expected_bit(cpuid)));
}
#endif /* MACH_ASSERT */