/*
 * Copyright (c) 2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/smr.h>
#include <kern/cpu_data.h>
#include <kern/zalloc.h>

/*
 * This SMR scheme is taken directly from FreeBSD's "Global Unbounded Sequences".
 *
 * The major differences are:
 *
 * - only eager clocks are implemented (no lazy, no implicit)
 */

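/*
 * Rough usage sketch (illustrative only; `my_smr`, `node`, `unlink` and
 * `node_free` are hypothetical names, not part of this file):
 *
 *	Readers bracket their accesses with a critical section:
 *
 *		smr_enter(&my_smr);
 *		... dereference SMR-protected pointers ...
 *		smr_leave(&my_smr);
 *
 *	Writers unlink an object, then defer freeing it until every reader
 *	that could still observe it has left its critical section:
 *
 *		unlink(node);
 *		seq = smr_advance(&my_smr);
 *		...
 *		if (smr_poll(&my_smr, seq)) {
 *			node_free(node);
 *		}
 *
 *	smr_wait() blocks until the goal is reached, and smr_synchronize()
 *	combines an advance with such a wait.
 */
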
typedef long smr_delta_t;

typedef struct smr_pcpu {
	smr_seq_t       c_rd_seq;
} *smr_pcpu_t;

#define SMR_SEQ_DELTA(a, b)     ((smr_delta_t)((a) - (b)))
#define SMR_SEQ_CMP(a, op, b)   (SMR_SEQ_DELTA(a, b) op 0)
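/*
 * The comparison is done on the signed delta so that it stays correct
 * when sequence numbers wrap. A worked example of the idea (assuming a
 * 32-bit unsigned long): with a = 2 and b = ULONG_MAX - 1, the unsigned
 * difference a - b is 4, so SMR_SEQ_DELTA(a, b) == 4 > 0 and
 * SMR_SEQ_CMP(a, >, b) correctly treats `a` as the more recent sequence
 * even though it is numerically smaller.
 */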

#define SMR_SEQ_INC             2ul

#define SMR_EARLY_COUNT         32

/*
 * On 32-bit systems, the sequence numbers might wrap quickly,
 * so preemption is disabled while manipulating the clock bounds.
 */
#if __LP64__
#define smr_critical_enter()
#define smr_critical_leave()
#else
#define smr_critical_enter()    disable_preemption()
#define smr_critical_leave()    enable_preemption()
#endif /* !__LP64__ */

__startup_data
static struct {
	struct smr_pcpu array[SMR_EARLY_COUNT];
	unsigned        used;
} smr_boot;

static inline smr_pcpu_t __zpercpu
smr_pcpu(smr_t smr)
{
	return (smr_pcpu_t __zpercpu)smr->smr_pcpu;
}

__startup_func
void
__smr_init(smr_t smr)
{
	if (startup_phase < STARTUP_SUB_TUNABLES) {
		smr_pcpu_t pcpu = &smr_boot.array[smr_boot.used++];
		assertf(smr_boot.used <= SMR_EARLY_COUNT,
		    "too many SMR_DEFINE_EARLY(), adjust SMR_EARLY_COUNT");
		smr->smr_pcpu = (unsigned long)__zpcpu_mangle_for_boot(pcpu);
	} else {
		smr_pcpu_t __zpercpu pcpu;

		pcpu = zalloc_percpu_permanent_type(struct smr_pcpu);
		os_atomic_store(&smr->smr_pcpu, (unsigned long)pcpu, release);
	}
}

static inline void
__smr_reset(smr_t smr, smr_seq_t seq)
{
	smr->smr_clock.s_rd_seq = seq;
	smr->smr_clock.s_wr_seq = seq;
	smr->smr_pcpu = 0;
}

void
smr_init(smr_t smr)
{
	smr_pcpu_t __zpercpu pcpu;

	pcpu = zalloc_percpu(percpu_u64_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
	__smr_reset(smr, SMR_SEQ_INIT);
	os_atomic_store(&smr->smr_pcpu, (unsigned long)pcpu, release);
}

void
smr_destroy(smr_t smr)
{
	smr_synchronize(smr);
	zfree_percpu(percpu_u64_zone, smr_pcpu(smr));
	__smr_reset(smr, SMR_SEQ_INVALID);
}

static inline bool
smr_entered_nopreempt(smr_t smr)
{
	return zpercpu_get(smr_pcpu(smr))->c_rd_seq != SMR_SEQ_INVALID;
}

bool
smr_entered(smr_t smr)
{
	return get_preemption_level() != 0 && smr_entered_nopreempt(smr);
}

__attribute__((always_inline))
static void
__smr_enter(smr_t smr, smr_pcpu_t pcpu)
{
	smr_seq_t s_wr_seq;
	smr_seq_t old_seq;

	/*
	 * It is possible to have a long delay between loading s_wr_seq
	 * and storing it to this CPU's copy of it.
	 *
	 * It is unlikely, but possible, that by that time s_rd_seq has
	 * advanced past the value we are about to store. This is still
	 * safe, and is handled in __smr_scan().
	 *
	 * On Intel, to achieve the ordering we want, we could use a store
	 * followed by an mfence, or any RMW (XCHG, XADD, CMPXCHG, ...).
	 * XADD is just the fastest instruction of the alternatives,
	 * but it will only ever add to '0'.
	 */
	s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
#if __x86_64__
	old_seq = os_atomic_add_orig(&pcpu->c_rd_seq, s_wr_seq, seq_cst);
#else
	old_seq = os_atomic_load(&pcpu->c_rd_seq, relaxed);
	os_atomic_store(&pcpu->c_rd_seq, s_wr_seq, relaxed);
	os_atomic_thread_fence(seq_cst);
#endif
	assert(old_seq == SMR_SEQ_INVALID);
}
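
/*
 * A concrete (hypothetical) interleaving of the race described above,
 * with SMR_SEQ_INC == 2:
 *
 *	CPU A (reader)                      CPU B (writer/scanner)
 *	loads s_wr_seq == 10
 *	    ... long delay ...              advances s_wr_seq to 12
 *	                                    scans, advances s_rd_seq to 12
 *	stores c_rd_seq = 10
 *
 * CPU A's critical section is still properly serialized, but its
 * c_rd_seq (10) is now behind the global s_rd_seq (12). __smr_scan()
 * resolves this by treating such a CPU as if it had entered at the
 * snapshotted s_rd_seq, so that the read sequence cannot advance further
 * until CPU A leaves its critical section.
 */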

__attribute__((always_inline))
void
smr_enter(smr_t smr)
{
	disable_preemption();
	__smr_enter(smr, zpercpu_get(smr_pcpu(smr)));
}

__attribute__((always_inline))
void
smr_leave(smr_t smr)
{
	smr_pcpu_t pcpu = zpercpu_get(smr_pcpu(smr));

	os_atomic_store(&pcpu->c_rd_seq, SMR_SEQ_INVALID, release);
	enable_preemption();
}

#if __LP64__
static inline smr_seq_t
__smr_wr_advance(smr_t smr)
{
	return os_atomic_add(&smr->smr_clock.s_wr_seq, SMR_SEQ_INC, release);
}
#endif /* __LP64__ */

static inline smr_clock_t
__smr_wr_advance_combined(smr_t smr)
{
	smr_clock_t clk = { .s_wr_seq = SMR_SEQ_INC, };

	/*
	 * Do a combined increment to get consistent read/write positions.
	 */
	clk.s_combined = os_atomic_add(&smr->smr_clock.s_combined,
	    clk.s_combined, release);

	return clk;
}

static inline smr_seq_t
__smr_rd_advance(smr_t smr, smr_seq_t rd_seq)
{
	smr_seq_t s_rd_seq;

	s_rd_seq = os_atomic_load(&smr->smr_clock.s_rd_seq, relaxed);

	if (SMR_SEQ_CMP(rd_seq, >, s_rd_seq)) {
		if (os_atomic_cmpxchgv(&smr->smr_clock.s_rd_seq,
		    s_rd_seq, rd_seq, &s_rd_seq, relaxed)) {
			return rd_seq;
		}
	}

	return s_rd_seq;
}

__attribute__((noinline))
static bool
__smr_scan(smr_t smr, smr_seq_t goal, smr_clock_t clk, bool wait)
{
	smr_seq_t rd_seq;

	/*
	 * Validate that the goal is sane.
	 */
	if (SMR_SEQ_CMP(goal, >, clk.s_wr_seq)) {
		/*
		 * Invalid goal: the caller held on to it for too long,
		 * and integers wrapped.
		 */
		return true;
	}

	/*
	 * The read sequence can be no larger than the write sequence
	 * at the start of the poll.
	 *
	 * We know that on entry:
	 *
	 *     s_rd_seq < goal <= s_wr_seq
	 *
	 * The correctness of this algorithm relies on the fact that
	 * the SMR domain [s_rd_seq, s_wr_seq) can't possibly move
	 * by more than roughly (ULONG_MAX / 2) while __smr_scan()
	 * is running, otherwise the "rd_seq" we try to scan for
	 * might spuriously appear larger than s_rd_seq and we'd
	 * call __smr_rd_advance() incorrectly.
	 *
	 * On LP64 this is guaranteed by the fact that this represents
	 * advancing 2^62 times. At one advance every nanosecond,
	 * that takes more than a century, which makes it possible
	 * to call smr_wait() or smr_poll() with preemption enabled.
	 *
	 * On 32-bit systems, this represents 2^30 advances, which,
	 * at one advance per nanosecond, would take about 1s.
	 * In order to prevent issues where a scanner would be preempted
	 * while CPUs go crazy advancing and wrapping the interval,
	 * preemption is disabled around manipulating either bound.
	 * No supported 32-bit system has a high core count, which
	 * makes this protection sufficient.
	 */
	rd_seq = clk.s_wr_seq;

	zpercpu_foreach(it, smr_pcpu(smr)) {
		smr_seq_t seq = os_atomic_load(&it->c_rd_seq, relaxed);

		while (seq != SMR_SEQ_INVALID) {
			/*
			 * Resolve the race documented in __smr_enter().
			 *
			 * The CPU has loaded a stale s_wr_seq, and s_rd_seq
			 * moved past this stale value.
			 *
			 * Its critical section is however properly serialized,
			 * but we can't know which "correct" s_wr_seq it
			 * could have observed. We have to assume `s_rd_seq`
			 * to prevent the read sequence from advancing.
			 */
			if (SMR_SEQ_CMP(seq, <, clk.s_rd_seq)) {
				seq = clk.s_rd_seq;
			}

			if (!wait || SMR_SEQ_CMP(goal, <=, seq)) {
				break;
			}

			disable_preemption();
			seq = hw_wait_while_equals_long(&it->c_rd_seq, seq);
			enable_preemption();
		}

		if (seq != SMR_SEQ_INVALID && SMR_SEQ_CMP(seq, <, rd_seq)) {
			rd_seq = seq;
		}
	}

	/*
	 * Advance the rd_seq as long as we observed a more recent value.
	 */
	rd_seq = __smr_rd_advance(smr, rd_seq);

	if (SMR_SEQ_CMP(goal, <=, rd_seq)) {
		/*
		 * Pairs with smr_leave() and smr_advance().
		 */
		os_atomic_thread_fence(acquire);
		return true;
	}

	return false;
}

static inline bool
__smr_poll(smr_t smr, smr_seq_t goal, bool wait)
{
	smr_clock_t clk;

	/*
	 * Load s_rd_seq and s_wr_seq in this order so that the snapshot
	 * can't have an s_rd_seq that is ahead of its s_wr_seq.
	 *
	 * The "seq_cst" barriers pair with smr_leave() and __smr_wr_advance*().
	 * This compiles to `ldar` on arm64, and to acquire barriers elsewhere.
	 */
	clk.s_rd_seq = os_atomic_load(&smr->smr_clock.s_rd_seq, seq_cst);

	/*
	 * We expect this to be typical: the goal has already been observed.
	 */
	if (__probable(SMR_SEQ_CMP(goal, <=, clk.s_rd_seq))) {
		return true;
	}

	clk.s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, seq_cst);

	return __smr_scan(smr, goal, clk, wait);
}

smr_seq_t
smr_advance(smr_t smr)
{
	smr_clock_t clk;

	assert(!smr_entered(smr));

#if __LP64__
	/*
	 * On LP64, we assume that there will be at least one successful
	 * __smr_poll() call every 2^61 calls to smr_advance() or so,
	 * so we do not need to check whether [s_rd_seq, s_wr_seq) is
	 * growing too wide.
	 */
	clk.s_wr_seq = __smr_wr_advance(smr);
#else
	smr_critical_enter();

	clk = __smr_wr_advance_combined(smr);

	/*
	 * The [s_rd_seq, s_wr_seq) interval MUST stay smaller
	 * than ULONG_MAX / 2 with a comfortable margin (we pick half that).
	 *
	 * So in case we keep advancing and never poll/wait,
	 * the read sequence is forced to catch up.
	 */
	const smr_delta_t max_delta = ULONG_MAX / 4;

	if (__improbable(SMR_SEQ_DELTA(clk.s_wr_seq, clk.s_rd_seq) >= max_delta)) {
		__smr_scan(smr, clk.s_wr_seq - max_delta / 2, clk, true);
	}

	smr_critical_leave();
#endif /* !__LP64__ */

	return clk.s_wr_seq;
}
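
/*
 * Working through the 32-bit numbers above (illustrative): with a 32-bit
 * unsigned long, max_delta is about 2^30. Since SMR_SEQ_INC is 2, the
 * forced catch-up triggers after roughly 2^29 advances without a
 * successful poll, and the waiting scan with a goal of
 * (s_wr_seq - max_delta / 2) pulls s_rd_seq back to within about 2^29
 * of s_wr_seq, keeping the interval comfortably below ULONG_MAX / 2.
 */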

bool
smr_poll(smr_t smr, smr_seq_t goal)
{
	bool success;

	smr_critical_enter();
	assert(!smr_entered(smr));
	success = __smr_poll(smr, goal, false);
	smr_critical_leave();
	return success;
}

void
smr_wait(smr_t smr, smr_seq_t goal)
{
	smr_critical_enter();
	assert(!smr_entered(smr));
	(void)__smr_poll(smr, goal, true);
	smr_critical_leave();
}

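/*
 * smr_synchronize() is essentially smr_wait(smr, smr_advance(smr))
 * collapsed into one step: the combined increment yields a consistent
 * snapshot of both clock bounds, and the scan then waits for every
 * reader that entered before the advance to leave its critical section.
 */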
void
smr_synchronize(smr_t smr)
{
	smr_clock_t clk;

	smr_critical_enter();
	assert(!smr_entered(smr));
	clk = __smr_wr_advance_combined(smr);
	__smr_scan(smr, clk.s_wr_seq, clk, true);
	smr_critical_leave();
}