/*
 * Copyright (c) 2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/smr.h>
#include <kern/cpu_data.h>
#include <kern/zalloc.h>

/*
 * This SMR scheme is taken directly from FreeBSD's "Global Unbounded
 * Sequences".
 *
 * Major differences are:
 *
 * - only eager clocks are implemented (no lazy, no implicit)
 */

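/*
 * For orientation, a minimal sketch of how the API below is typically
 * used; `my_smr`, `head`, `struct node` and the unlink/free helpers are
 * hypothetical, only the smr_* calls are the ones defined in this file:
 *
 *	static struct smr my_smr;               // set up with smr_init()
 *
 *	// reader side: a preemption-disabled critical section
 *	smr_enter(&my_smr);
 *	node = os_atomic_load(&head, acquire);
 *	// ... dereference node safely ...
 *	smr_leave(&my_smr);
 *
 *	// writer side: unpublish, then wait for all readers to drain
 *	node = unlink_head(&head);              // hypothetical unpublish
 *	smr_wait(&my_smr, smr_advance(&my_smr));
 *	free_node(node);                        // hypothetical free
 */
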
typedef long                    smr_delta_t;

typedef struct smr_pcpu {
	smr_seq_t               c_rd_seq;
} *smr_pcpu_t;

#define SMR_SEQ_DELTA(a, b)     ((smr_delta_t)((a) - (b)))
#define SMR_SEQ_CMP(a, op, b)   (SMR_SEQ_DELTA(a, b) op 0)

#define SMR_SEQ_INC             2ul

#define SMR_EARLY_COUNT         32

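/*
 * Illustrative example with hypothetical values: casting the unsigned
 * difference to the signed smr_delta_t is what makes SMR_SEQ_CMP()
 * robust to wraparound, as long as the compared sequences stay within
 * about ULONG_MAX / 2 of each other. On a 32-bit system, with 'a'
 * having wrapped past 'b':
 *
 *	smr_seq_t a = 6, b = ULONG_MAX - 3;
 *	SMR_SEQ_DELTA(a, b);    // (smr_delta_t)(a - b) == 10
 *	SMR_SEQ_CMP(a, >, b);   // true: 'a' is the more recent sequence
 *
 * A plain unsigned comparison `a > b` would be wrong after the wrap.
 */
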
/*
 * On 32-bit systems, the sequence numbers might wrap.
 */
#if __LP64__
#define smr_critical_enter()
#define smr_critical_leave()
#else
#define smr_critical_enter()    disable_preemption()
#define smr_critical_leave()    enable_preemption()
#endif /* !__LP64__ */

__startup_data
static struct {
	struct smr_pcpu array[SMR_EARLY_COUNT];
	unsigned        used;
} smr_boot;

static inline smr_pcpu_t __zpercpu
smr_pcpu(smr_t smr)
{
	return (smr_pcpu_t __zpercpu)smr->smr_pcpu;
}

__startup_func
void
__smr_init(smr_t smr)
{
	if (startup_phase < STARTUP_SUB_TUNABLES) {
		smr_pcpu_t pcpu = &smr_boot.array[smr_boot.used++];
		assertf(smr_boot.used <= SMR_EARLY_COUNT,
		    "too many SMR_DEFINE_EARLY(), adjust SMR_EARLY_COUNT");
		smr->smr_pcpu = (unsigned long)__zpcpu_mangle_for_boot(pcpu);
	} else {
		smr_pcpu_t __zpercpu pcpu;

		pcpu = zalloc_percpu_permanent_type(struct smr_pcpu);
		os_atomic_store(&smr->smr_pcpu, (unsigned long)pcpu, release);
	}
}

static inline void
__smr_reset(smr_t smr, smr_seq_t seq)
{
	smr->smr_clock.s_rd_seq = seq;
	smr->smr_clock.s_wr_seq = seq;
	smr->smr_pcpu           = 0;
}

void
smr_init(smr_t smr)
{
	smr_pcpu_t __zpercpu pcpu;

	pcpu = zalloc_percpu(percpu_u64_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
	__smr_reset(smr, SMR_SEQ_INIT);
	os_atomic_store(&smr->smr_pcpu, (unsigned long)pcpu, release);
}

void
smr_destroy(smr_t smr)
{
	smr_synchronize(smr);
	zfree_percpu(percpu_u64_zone, smr_pcpu(smr));
	__smr_reset(smr, SMR_SEQ_INVALID);
}

static inline bool
smr_entered_nopreempt(smr_t smr)
{
	return zpercpu_get(smr_pcpu(smr))->c_rd_seq != SMR_SEQ_INVALID;
}

bool
smr_entered(smr_t smr)
{
	return get_preemption_level() != 0 && smr_entered_nopreempt(smr);
}

__attribute__((always_inline))
static void
__smr_enter(smr_t smr, smr_pcpu_t pcpu)
{
	smr_seq_t  s_wr_seq;
	smr_seq_t  old_seq;

	/*
	 * It is possible to have a long delay between loading the s_wr_seq
	 * and storing it to the percpu copy of it.
	 *
	 * It is unlikely, but possible, that by that time the s_rd_seq has
	 * advanced ahead of what we will store. This however is still safe,
	 * and is handled in __smr_scan().
	 *
	 * On Intel, to achieve the ordering we want, we could use a store
	 * followed by an mfence, or any RMW (XCHG, XADD, CMPXCHG, ...).
	 * XADD is just the fastest instruction of the alternatives,
	 * but it will only ever add to '0'.
	 */
	s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
#if __x86_64__
	old_seq = os_atomic_add_orig(&pcpu->c_rd_seq, s_wr_seq, seq_cst);
#else
	old_seq = os_atomic_load(&pcpu->c_rd_seq, relaxed);
	os_atomic_store(&pcpu->c_rd_seq, s_wr_seq, relaxed);
	os_atomic_thread_fence(seq_cst);
#endif
	assert(old_seq == SMR_SEQ_INVALID);
}

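/*
 * To make the race above concrete, consider this hypothetical
 * interleaving, starting from s_rd_seq == s_wr_seq == N:
 *
 *	CPU0 (__smr_enter)              other CPUs
 *	s_wr_seq = load(...)  // N
 *	                                smr_advance():  s_wr_seq = N + 2
 *	                                __smr_scan():   s_rd_seq = N + 2
 *	                                (CPU0's c_rd_seq is still
 *	                                 SMR_SEQ_INVALID, so it is skipped)
 *	c_rd_seq = N          // now c_rd_seq < s_rd_seq
 *
 * CPU0's critical section remains correctly serialized; __smr_scan()
 * simply clamps such a stale c_rd_seq up to the s_rd_seq it snapshotted.
 */
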
__attribute__((always_inline))
void
smr_enter(smr_t smr)
{
	disable_preemption();
	__smr_enter(smr, zpercpu_get(smr_pcpu(smr)));
}

__attribute__((always_inline))
void
smr_leave(smr_t smr)
{
	smr_pcpu_t pcpu = zpercpu_get(smr_pcpu(smr));

	os_atomic_store(&pcpu->c_rd_seq, SMR_SEQ_INVALID, release);
	enable_preemption();
}

#if __LP64__
static inline smr_seq_t
__smr_wr_advance(smr_t smr)
{
	return os_atomic_add(&smr->smr_clock.s_wr_seq, SMR_SEQ_INC, release);
}
#endif /* __LP64__ */

static inline smr_clock_t
__smr_wr_advance_combined(smr_t smr)
{
	smr_clock_t clk = { .s_wr_seq = SMR_SEQ_INC, };

	/*
	 * Do a combined increment to get consistent read/write positions.
	 */
	clk.s_combined = os_atomic_add(&smr->smr_clock.s_combined,
	    clk.s_combined, release);

	return clk;
}

static inline smr_seq_t
__smr_rd_advance(smr_t smr, smr_seq_t rd_seq)
{
	smr_seq_t s_rd_seq;

	s_rd_seq = os_atomic_load(&smr->smr_clock.s_rd_seq, relaxed);

	if (SMR_SEQ_CMP(rd_seq, >, s_rd_seq)) {
		if (os_atomic_cmpxchgv(&smr->smr_clock.s_rd_seq,
		    s_rd_seq, rd_seq, &s_rd_seq, relaxed)) {
			return rd_seq;
		}
	}

	return s_rd_seq;
}

__attribute__((noinline))
static bool
__smr_scan(smr_t smr, smr_seq_t goal, smr_clock_t clk, bool wait)
{
	smr_seq_t rd_seq;

	/*
	 * Validate that the goal is sane.
	 */
	if (SMR_SEQ_CMP(goal, >, clk.s_wr_seq)) {
		/*
		 * Invalid goal: the caller held on to it for too long,
		 * and integers wrapped.
		 */
		return true;
	}

	/*
	 * The read sequence can be no larger than the write sequence
	 * at the start of the poll.
	 *
	 * We know that on entry:
	 *
	 *     s_rd_seq < goal <= s_wr_seq
	 *
	 * The correctness of this algorithm relies on the fact that
	 * the SMR domain [s_rd_seq, s_wr_seq) can't possibly move
	 * by more than roughly (ULONG_MAX / 2) while __smr_scan()
	 * is running, otherwise the "rd_seq" we try to scan for
	 * might spuriously appear larger than s_rd_seq and we'd
	 * __smr_rd_advance() incorrectly.
	 *
	 * On LP64 this is guaranteed by the fact that this represents
	 * advancing 2^62 times. At one advance every nanosecond,
	 * it takes more than a century, which makes it possible
	 * to call smr_wait() or smr_poll() with preemption enabled.
	 *
	 * On 32-bit systems, this represents 2^30 advances, which,
	 * at one advance per nanosecond, would take about 1s.
	 * In order to prevent issues where a scanner would be preempted
	 * while CPUs go crazy advancing and wrapping the interval,
	 * preemption is disabled around manipulating either bound.
	 * No supported 32-bit system has a high core count,
	 * which makes this protection sufficient.
	 */
	rd_seq = clk.s_wr_seq;

	zpercpu_foreach(it, smr_pcpu(smr)) {
		smr_seq_t seq = os_atomic_load(&it->c_rd_seq, relaxed);

		while (seq != SMR_SEQ_INVALID) {
			/*
			 * Resolve the race documented in __smr_enter().
			 *
			 * The CPU has loaded a stale s_wr_seq, and s_rd_seq
			 * moved past this stale value.
			 *
			 * Its critical section is nonetheless properly
			 * serialized, but we can't know which "correct"
			 * s_wr_seq it could have observed. We have to assume
			 * `s_rd_seq` to prevent it from advancing.
			 */
			if (SMR_SEQ_CMP(seq, <, clk.s_rd_seq)) {
				seq = clk.s_rd_seq;
			}

			if (!wait || SMR_SEQ_CMP(goal, <=, seq)) {
				break;
			}

			disable_preemption();
			seq = hw_wait_while_equals_long(&it->c_rd_seq, seq);
			enable_preemption();
		}

		if (seq != SMR_SEQ_INVALID && SMR_SEQ_CMP(seq, <, rd_seq)) {
			rd_seq = seq;
		}
	}

	/*
	 * Advance the rd_seq as long as we observed a more recent value.
	 */
	rd_seq = __smr_rd_advance(smr, rd_seq);

	if (SMR_SEQ_CMP(goal, <=, rd_seq)) {
		/*
		 * Pairs with smr_leave() and smr_advance().
		 */
		os_atomic_thread_fence(acquire);
		return true;
	}

	return false;
}

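/*
 * For scale, the figures quoted in the comment above work out as follows
 * (approximate, assuming one advance per nanosecond):
 *
 *	LP64:   (ULONG_MAX / 2) / SMR_SEQ_INC = 2^62 advances
 *	        2^62 ns  ~= 4.6e9 s   ~= 146 years
 *	32-bit: (ULONG_MAX / 2) / SMR_SEQ_INC = 2^30 advances
 *	        2^30 ns  ~= 1.07 s
 */
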
static inline bool
__smr_poll(smr_t smr, smr_seq_t goal, bool wait)
{
	smr_clock_t clk;

	/*
	 * Load both the s_rd_seq and s_wr_seq in the right order so that we
	 * can't observe an s_rd_seq newer than the s_wr_seq we load below.
	 *
	 * The "seq_cst" barriers pair with smr_leave() and __smr_wr_advance*().
	 * This compiles to `ldar` on arm64, and to acquire barriers elsewhere.
	 */
	clk.s_rd_seq = os_atomic_load(&smr->smr_clock.s_rd_seq, seq_cst);

	/*
	 * We expect this to be typical: the goal has already been observed.
	 */
	if (__probable(SMR_SEQ_CMP(goal, <=, clk.s_rd_seq))) {
		return true;
	}

	clk.s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, seq_cst);

	return __smr_scan(smr, goal, clk, wait);
}

smr_seq_t
smr_advance(smr_t smr)
{
	smr_clock_t clk;

	assert(!smr_entered(smr));

#if __LP64__
	/*
	 * On LP64, we assume that there will be at least one successful
	 * __smr_poll() call every 2^61 calls to smr_advance() or so,
	 * so we do not need to check whether [s_rd_seq, s_wr_seq) is
	 * growing too wide.
	 */
	clk.s_wr_seq = __smr_wr_advance(smr);
#else
	smr_critical_enter();

	clk = __smr_wr_advance_combined(smr);

	/*
	 * The [s_rd_seq, s_wr_seq) interval MUST be smaller
	 * than ULONG_MAX / 2 with a comfortable margin (we pick half that).
	 *
	 * So in case we keep advancing and never poll/wait,
	 * the read sequence is forced to catch up.
	 */
	const smr_delta_t max_delta = ULONG_MAX / 4;

	if (__improbable(SMR_SEQ_DELTA(clk.s_wr_seq, clk.s_rd_seq) >= max_delta)) {
		__smr_scan(smr, clk.s_wr_seq - max_delta / 2, clk, true);
	}

	smr_critical_leave();
#endif /* !__LP64__ */

	return clk.s_wr_seq;
}

bool
smr_poll(smr_t smr, smr_seq_t goal)
{
	bool success;

	smr_critical_enter();
	assert(!smr_entered(smr));
	success = __smr_poll(smr, goal, false);
	smr_critical_leave();
	return success;
}

void
smr_wait(smr_t smr, smr_seq_t goal)
{
	smr_critical_enter();
	assert(!smr_entered(smr));
	(void)__smr_poll(smr, goal, true);
	smr_critical_leave();
}

void
smr_synchronize(smr_t smr)
{
	smr_clock_t clk;

	smr_critical_enter();
	assert(!smr_entered(smr));
	clk = __smr_wr_advance_combined(smr);
	__smr_scan(smr, clk.s_wr_seq, clk, true);
	smr_critical_leave();
}
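
/*
 * A minimal sketch (with hypothetical names) of how smr_advance() and
 * smr_poll() combine into non-blocking deferred reclamation, as an
 * alternative to the blocking smr_wait()/smr_synchronize():
 *
 *	void
 *	retire_item(struct item *it)
 *	{
 *		unlink_item(it);                // hypothetical unpublish
 *		it->it_goal = smr_advance(&my_smr);
 *		enqueue(&deferred_items, it);   // hypothetical queue
 *	}
 *
 *	void
 *	reclaim_items(void)
 *	{
 *		struct item *it;
 *
 *		// free only items whose goal all readers have passed
 *		while ((it = peek(&deferred_items)) != NULL &&
 *		    smr_poll(&my_smr, it->it_goal)) {
 *			dequeue(&deferred_items);
 *			free_item(it);          // hypothetical free
 *		}
 *	}
 */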
413