/*
 * Copyright (c) 2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/cpu_data.h>
#include <kern/mpsc_queue.h>
#include <kern/percpu.h>
#include <kern/smr.h>
#include <kern/zalloc.h>
#include <sys/queue.h>

/*
 * This SMR scheme is directly derived from FreeBSD's "Global Unbounded
 * Sequences".
 *
 * The major difference is:
 *
 * - only eager clocks are implemented (no lazy, no implicit)
 */

typedef long smr_delta_t;

typedef struct smr_pcpu {
        smr_seq_t c_rd_seq;
        unsigned long c_rd_budget;
} *smr_pcpu_t;

#define SMR_SEQ_DELTA(a, b) ((smr_delta_t)((a) - (b)))
#define SMR_SEQ_CMP(a, op, b) (SMR_SEQ_DELTA(a, b) op 0)
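
/*
 * For illustration (assuming a 64-bit unsigned long): sequence comparisons
 * are made wrap-safe by looking at the sign of the signed delta. With
 * a = 2 and b = ULONG_MAX - 1, i.e. `a` is two SMR_SEQ_INC increments
 * past `b` modulo wrap-around:
 *
 *     SMR_SEQ_DELTA(a, b) == (smr_delta_t)(a - b) == 4
 *     SMR_SEQ_CMP(a, >, b) is true
 *
 * even though a < b as raw unsigned values.
 */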

#define SMR_SEQ_INC 2ul

#define SMR_EARLY_COUNT 32

__startup_data
static struct {
        struct smr_pcpu array[SMR_EARLY_COUNT];
        unsigned used;
} smr_boot;


#pragma mark - manipulating an SMR clock

/*
 * SMR clocks have 3 state machines interacting at any given time:
 *
 *
 * 1. reader critical sections
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 *    Each CPU can disable preemption and do this sequence:
 *
 *        CPU::c_rd_seq = GLOBAL::c_wr_seq;
 *
 *        < unfortunate place to receive a long IRQ >              [I]
 *
 *        os_atomic_thread_fence(seq_cst);                         [R1]
 *
 *        {
 *            // critical section
 *        }
 *
 *        os_atomic_store(&CPU::c_rd_seq, INVALID, release);       [R2]
 *
 *
 *
 * 2. writer sequence advances
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 *    Each writer can increment the global write sequence
 *    at any given time:
 *
 *        os_atomic_add(&GLOBAL::c_wr_seq, SMR_SEQ_INC, release);  [W]
 *
 *
 *
 * 3. synchronization sequence: poll/wait/scan
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 *    This state machine synchronizes with the other two in order to decide
 *    if a given "goal" is in the past. Only the cases where the call
 *    is successful are interesting for barrier purposes, and we will focus
 *    on cases that do not take an early return for failures.
 *
 *    a. __smr_poll:
 *
 *        rd_seq = os_atomic_load(&GLOBAL::c_rd_seq, acquire);     [S1]
 *        if (goal <= rd_seq) SUCCESS.
 *        wr_seq = os_atomic_load(&GLOBAL::c_wr_seq, relaxed);
 *
 *    b. __smr_scan
 *
 *        os_atomic_thread_fence(seq_cst)                          [S2]
 *
 *        observe the minimum CPU::c_rd_seq "min_rd_seq"
 *        value possible or wr_seq if no CPU was in a critical section.
 *        (possibly spinning until it satisfies "goal")
 *
 *    c. __smr_rd_advance
 *
 *        cur_rd_seq = load_exclusive(&GLOBAL::c_rd_seq);
 *        os_atomic_thread_fence(seq_cst);                         [S3]
 *        if (min_rd_seq > cur_rd_seq) {
 *            store_exclusive(&GLOBAL::c_rd_seq, min_rd_seq);
 *        }
 *
 *
 * One sentence summary
 * ~~~~~~~~~~~~~~~~~~~~
 *
 * A simplistic one-sentence summary of the algorithm is that __smr_scan()
 * works really hard to insert itself in the timeline of write sequences and
 * observe a reasonable bound for the first safe-to-reclaim sequence, and
 * issues [S3] to sequence everything around "c_rd_seq" (via [S3] -> [S1]):
 *
 *                      GLOBAL::c_rd_seq GLOBAL::c_wr_seq
 *                              v                v
 *        ──────────────────────┬────────────────┬─────────────────────
 *          ... safe to reclaim │    deferred    │ future ...
 *        ──────────────────────┴────────────────┴─────────────────────
 *
 *
 * Detailed explanation
 * ~~~~~~~~~~~~~~~~~~~~
 *
 * [W] -> [R1] establishes a "happens before" relationship between a given
 * writer and this critical section. The loaded GLOBAL::c_wr_seq might
 * however be stale with respect to the one [R1] really synchronizes with
 * (see the [I] explanation below).
 *
 *
 * [R1] -> [S2] establishes a "happens before" relationship between all the
 * active critical sections and the scanner.
 * It lets us compute the oldest possible sequence pinned by an active
 * critical section.
 *
 *
 * [R2] -> [S3] establishes a "happens before" relationship between all the
 * inactive critical sections and the scanner.
 *
 *
 * [S3] -> [S1] is the typical expected fastpath: when the caller can decide
 * that its goal is older than the last update an __smr_rd_advance() did.
 * Note that [S3] doubles as an "[S1]" when two __smr_scan() race each other
 * and one of them finishes last but observed a "worse" read sequence.
 *
 *
 * [W], [S3] -> [S1] is the last crucial property: all updates to the global
 * clock are totally ordered because they update the entire 128-bit state
 * every time with an RMW. This guarantees that __smr_poll() can't load
 * an `rd_seq` that is younger than the `wr_seq` it loads next.
 *
 *
 * [I] __smr_enter() can also unfortunately be delayed after observing
 * a given write sequence and right before issuing [R1] (the [I] window).
 *
 * However, for the read sequence to have moved past what __smr_enter()
 * observed, another __smr_scan() must not have observed the store to
 * CPU::c_rd_seq made by __smr_enter(), and hence thought the section
 * was inactive.
 *
 * This can only happen if the scan's [S2] was issued before the delayed
 * __smr_enter() [R1] (during the [I] window).
 *
 * As a consequence, the outcome of that scan can be accepted as the "real"
 * write sequence __smr_enter() should have observed.
 *
 *
 * Litmus tests
 * ~~~~~~~~~~~~
 *
 * This is the proof of [W] -> [R1] -> [S2] being established properly:
 * - P0 sets a global and calls smr_synchronize()
 * - P1 does smr_enter() and loads the global
 *
 *     AArch64 MP
 *     {
 *         global = 0;
 *         wr_seq = 123;
 *         p1_rd_seq = 0;
 *
 *         0:x0 = global; 0:x1 = wr_seq; 0:x2 = p1_rd_seq;
 *         1:x0 = global; 1:x1 = wr_seq; 1:x2 = p1_rd_seq;
 *     }
 *      P0                    | P1                    ;
 *      MOV X8, #2            | LDR X8, [X1]          ;
 *      STR X8, [X0]          | STR X8, [X2]          ;
 *      LDADDL X8, X9, [X1]   | DMB SY                ;
 *      DMB SY                | LDR X10, [X0]         ;
 *      LDR X10, [X2]         |                       ;
 *     exists (0:X10 = 0 /\ 1:X8 = 123 /\ 1:X10 = 0)
 *
 *
 * This is the proof that deferred advances are also correct:
 * - P0 sets a global and does a smr_deferred_advance()
 * - P1 does an smr_synchronize() and reads the global
 *
 *     AArch64 MP
 *     {
 *         global = 0;
 *         wr_seq = 123;
 *
 *         0:x0 = global; 0:x1 = wr_seq; 0:x2 = 2;
 *         1:x0 = global; 1:x1 = wr_seq; 1:x2 = 2;
 *     }
 *      P0                    | P1                    ;
 *      STR X2, [X0]          | LDADDL X2, X9, [X1]   ;
 *      DMB SY                | DMB SY                ;
 *      LDR X9, [X1]          | LDR X10, [X0]         ;
 *      ADD X9, X9, X2        |                       ;
 *     exists (0:X9 = 125 /\ 1:X9 = 123 /\ 1:X10 = 0)
 *
 */
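
/*
 * A minimal usage sketch tying the state machines above to the primitives
 * below (illustrative only; `head`, `node`, `unpublish()` and `node_free()`
 * are hypothetical):
 *
 *     // reader: pins the current write sequence on this CPU
 *     smr_enter(smr);
 *     node = os_atomic_load(&head, acquire);
 *     // ... read-only use of node ...
 *     smr_leave(smr);
 *
 *     // writer: unpublish the node, then make sure readers drained
 *     node = unpublish(&head);
 *     seq = smr_advance(smr);
 *     // ... later ...
 *     if (smr_poll(smr, seq)) {
 *             node_free(node);    // no reader can still observe `node`
 *     }
 *
 * smr_wait() or smr_synchronize() can be used instead of smr_poll() when
 * the caller wants to block until the goal has been reached.
 */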

static inline smr_pcpu_t __zpercpu
smr_pcpu(smr_t smr)
{
        return (smr_pcpu_t __zpercpu)smr->smr_pcpu;
}

__startup_func
void
__smr_init(smr_t smr)
{
        if (startup_phase < STARTUP_SUB_TUNABLES) {
                smr_pcpu_t pcpu = &smr_boot.array[smr_boot.used++];
                assertf(smr_boot.used <= SMR_EARLY_COUNT,
                    "too many SMR_DEFINE_EARLY(), adjust SMR_EARLY_COUNT");
                smr->smr_pcpu = (unsigned long)__zpcpu_mangle_for_boot(pcpu);
        } else {
                smr_pcpu_t __zpercpu pcpu;

                pcpu = zalloc_percpu_permanent_type(struct smr_pcpu);
                os_atomic_store(&smr->smr_pcpu, (unsigned long)pcpu, release);
        }
}

static inline void
__smr_reset(smr_t smr, smr_seq_t seq)
{
        smr->smr_clock.s_rd_seq = seq;
        smr->smr_clock.s_wr_seq = seq;
        smr->smr_pcpu = 0;
}

void
smr_init(smr_t smr)
{
        smr_pcpu_t __zpercpu pcpu;

        pcpu = zalloc_percpu(percpu_u64_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
        __smr_reset(smr, SMR_SEQ_INIT);
        os_atomic_store(&smr->smr_pcpu, (unsigned long)pcpu, release);
}

void
smr_set_deferred_budget(smr_t smr, unsigned long budget)
{
        /*
         * No need to update the per-cpu budget variables,
         * calls to smr_deferred_advance() will eventually fix them up.
         *
         * Note: we use `-1` because smr_deferred_advance_nopreempt()
         * checks for overflow rather than hitting 0.
         */
        smr->smr_budget = budget ? budget - 1 : 0;
}

void
smr_destroy(smr_t smr)
{
        smr_synchronize(smr);
        zfree_percpu(percpu_u64_zone, smr_pcpu(smr));
        __smr_reset(smr, SMR_SEQ_INVALID);
}

static inline bool
smr_entered_nopreempt(smr_t smr)
{
        return zpercpu_get(smr_pcpu(smr))->c_rd_seq != SMR_SEQ_INVALID;
}

__attribute__((always_inline))
bool
smr_entered(smr_t smr)
{
        return get_preemption_level() != 0 && smr_entered_nopreempt(smr);
}

__attribute__((always_inline))
static void
__smr_enter(smr_t smr, smr_pcpu_t pcpu)
{
        smr_seq_t s_wr_seq;
        smr_seq_t old_seq;

        /*
         * It is possible to have a long delay between loading the s_wr_seq
         * and storing it to the percpu copy of it.
         *
         * It is unlikely but possible that, by that time, the s_rd_seq has
         * advanced ahead of what we will store. This however is still safe
         * and is handled in __smr_scan().
         *
         * On Intel, to achieve the ordering we want, we could use a store
         * followed by an mfence, or any RMW (XCHG, XADD, CMPXCHG, ...).
         * XADD is just the fastest instruction of the alternatives,
         * but it will only ever add to '0'.
         */
        s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
#if __x86_64__
        /* [R1] */
        old_seq = os_atomic_add_orig(&pcpu->c_rd_seq, s_wr_seq, seq_cst);
#else
        old_seq = os_atomic_load(&pcpu->c_rd_seq, relaxed);
        os_atomic_store(&pcpu->c_rd_seq, s_wr_seq, relaxed);
        os_atomic_thread_fence(seq_cst); /* [R1] */
#endif
        assert(old_seq == SMR_SEQ_INVALID);
}

__attribute__((always_inline))
void
smr_enter(smr_t smr)
{
        disable_preemption();
        __smr_enter(smr, zpercpu_get(smr_pcpu(smr)));
}

__attribute__((always_inline))
void
smr_leave(smr_t smr)
{
        smr_pcpu_t pcpu = zpercpu_get(smr_pcpu(smr));

        /* [R2] */
        os_atomic_store(&pcpu->c_rd_seq, SMR_SEQ_INVALID, release);
        enable_preemption();
}

static inline smr_seq_t
__smr_wr_advance(smr_t smr)
{
        /* [W] */
        return os_atomic_add(&smr->smr_clock.s_wr_seq, SMR_SEQ_INC, release);
}

static inline smr_clock_t
__smr_wr_advance_combined(smr_t smr)
{
        smr_clock_t clk = { .s_wr_seq = SMR_SEQ_INC, };

        /*
         * Do a combined increment to get consistent read/write positions.
         */

        /* [W] */
        clk.s_combined = os_atomic_add(&smr->smr_clock.s_combined,
            clk.s_combined, release);

        return clk;
}

static inline bool
__smr_rd_advance(smr_t smr, smr_seq_t goal, smr_seq_t rd_seq)
{
        smr_clock_t oclk, nclk;

        os_atomic_rmw_loop(&smr->smr_clock.s_combined,
            oclk.s_combined, nclk.s_combined, relaxed, {
                nclk = oclk;

                os_atomic_thread_fence(seq_cst); /* [S3] */

                if (SMR_SEQ_CMP(rd_seq, <=, oclk.s_rd_seq)) {
                        os_atomic_rmw_loop_give_up(break);
                }
                nclk.s_rd_seq = rd_seq;
        });

        return SMR_SEQ_CMP(goal, <=, nclk.s_rd_seq);
}

__attribute__((noinline))
static bool
__smr_scan(smr_t smr, smr_seq_t goal, smr_clock_t clk, bool wait)
{
        smr_delta_t delta;
        smr_seq_t rd_seq;

        /*
         * Validate that the goal is sane.
         */
        delta = SMR_SEQ_DELTA(goal, clk.s_wr_seq);
        if (delta == SMR_SEQ_INC && smr->smr_budget) {
                /*
                 * This SMR clock uses deferred advance,
                 * and the goal is one increment in the future.
                 *
                 * If we can wait, then force the clock
                 * to advance, else we can't possibly succeed.
                 */
                if (!wait) {
                        return false;
                }
                clk = __smr_wr_advance_combined(smr);
        } else if (delta > 0) {
                /*
                 * Invalid goal: the caller held on to it for too long,
                 * and integers wrapped.
                 */
                return true;
        }

        os_atomic_thread_fence(seq_cst); /* [S2] */

        /*
         * The read sequence can be no larger than the write sequence
         * at the start of the poll.
         *
         * We know that on entry:
         *
         *     s_rd_seq < goal <= s_wr_seq
         *
         * The correctness of this algorithm relies on the fact that
         * the SMR domain [s_rd_seq, s_wr_seq) can't possibly move
         * by more than roughly (ULONG_MAX / 2) while __smr_scan()
         * is running, otherwise the "rd_seq" we try to scan for
         * might appear larger than s_rd_seq spuriously and we'd
         * __smr_rd_advance() incorrectly.
         *
         * This is guaranteed by the fact that such a move represents
         * advancing the clock 2^62 times. At one advance every nanosecond,
         * that takes more than a century, which makes it possible
         * to call smr_wait() or smr_poll() with preemption enabled.
         */
        rd_seq = clk.s_wr_seq;

        zpercpu_foreach(it, smr_pcpu(smr)) {
                smr_seq_t seq = os_atomic_load(&it->c_rd_seq, relaxed);

                while (seq != SMR_SEQ_INVALID) {
                        /*
                         * Resolve the race documented in __smr_enter().
                         *
                         * The CPU has loaded a stale s_wr_seq, and s_rd_seq
                         * moved past this stale value.
                         *
                         * Its critical section is however properly serialized,
                         * but we can't know what the "correct" s_wr_seq it
                         * could have observed was. We have to assume it
                         * observed `s_rd_seq` in order to prevent it from
                         * advancing.
                         */
                        if (SMR_SEQ_CMP(seq, <, clk.s_rd_seq)) {
                                seq = clk.s_rd_seq;
                        }

                        if (!wait || SMR_SEQ_CMP(goal, <=, seq)) {
                                break;
                        }

                        disable_preemption();
                        seq = hw_wait_while_equals_long(&it->c_rd_seq, seq);
                        enable_preemption();
                }

                if (seq != SMR_SEQ_INVALID && SMR_SEQ_CMP(seq, <, rd_seq)) {
                        rd_seq = seq;
                }
        }

        /*
         * Advance the rd_seq as long as we observed a more recent value.
         */
        return __smr_rd_advance(smr, goal, rd_seq);
}

static inline bool
__smr_poll(smr_t smr, smr_seq_t goal, bool wait)
{
        smr_clock_t clk;

        /*
         * Load both the s_rd_seq and s_wr_seq in the right order so that we
         * can't observe an s_rd_seq younger than the s_wr_seq we load next.
         */

        /* [S1] */
        clk.s_rd_seq = os_atomic_load(&smr->smr_clock.s_rd_seq, acquire);

        /*
         * We expect this to be typical: the goal has already been observed.
         */
        if (__probable(SMR_SEQ_CMP(goal, <=, clk.s_rd_seq))) {
                return true;
        }

        clk.s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);

        return __smr_scan(smr, goal, clk, wait);
}

smr_seq_t
smr_advance(smr_t smr)
{
        smr_clock_t clk;

        assert(!smr_entered(smr));

        /*
         * We assume that there will be at least one successful __smr_poll()
         * call every 2^61 calls to smr_advance() or so, so we do not need
         * to check whether [s_rd_seq, s_wr_seq) is growing too wide.
         */
        static_assert(sizeof(clk.s_wr_seq) == 8);
        return __smr_wr_advance(smr);
}

smr_seq_t
smr_deferred_advance_nopreempt(smr_t smr, unsigned long step)
{
        smr_pcpu_t pcpu = zpercpu_get(smr_pcpu(smr));

        assert(get_preemption_level() && !smr_entered_nopreempt(smr));

        if (os_sub_overflow(pcpu->c_rd_budget, step, &pcpu->c_rd_budget)) {
                pcpu->c_rd_budget = smr->smr_budget;
                return __smr_wr_advance(smr);
        }

        /*
         * Deferred updates are about touching the global c_wr_seq
         * less often, and we return a sequence number in the future.
         *
         * This full barrier establishes a "happens before" relationship
         * with the [W] barrier in the __smr_wr_advance() that will
         * actually generate it.
         */
        os_atomic_thread_fence(seq_cst);
        return SMR_SEQ_INC + os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
}

smr_seq_t
smr_deferred_advance(smr_t smr, unsigned long step)
{
        smr_seq_t seq;

        disable_preemption();
        seq = smr_deferred_advance_nopreempt(smr, step);
        enable_preemption();

        return seq;
}
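
/*
 * A sketch (illustrative only) of how a deferred advance is meant to be
 * consumed: the returned sequence is a goal that may still be one
 * SMR_SEQ_INC in the future. smr_poll() on such a goal simply returns
 * false until the clock really advances, while smr_wait() (via
 * __smr_scan()) forces the advance itself:
 *
 *     seq = smr_deferred_advance(smr, 1);   // `1` is charged to the budget
 *     // ... stash `seq` alongside the retired element ...
 *     if (smr_poll(smr, seq)) {
 *             // safe to reclaim now
 *     } else {
 *             smr_wait(smr, seq);           // or retry smr_poll() later
 *     }
 */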

bool
smr_poll(smr_t smr, smr_seq_t goal)
{
        assert(!smr_entered(smr) && goal != SMR_SEQ_INVALID);
        return __smr_poll(smr, goal, false);
}

void
smr_wait(smr_t smr, smr_seq_t goal)
{
        assert(!smr_entered(smr) && goal != SMR_SEQ_INVALID);
        (void)__smr_poll(smr, goal, true);
}

void
smr_synchronize(smr_t smr)
{
        smr_clock_t clk;

        assert(!smr_entered(smr));
        clk = __smr_wr_advance_combined(smr);
        __smr_scan(smr, clk.s_wr_seq, clk, true);
}


#pragma mark - system global SMR

typedef struct smr_record {
        void *smrr_val;
        void (*smrr_dtor)(void *);
} *smr_record_t;

typedef struct smr_bucket {
        union {
                struct mpsc_queue_chain smrb_mplink;
                STAILQ_ENTRY(smr_bucket) smrb_stqlink;
        };
        uint32_t smrb_count;
        uint32_t smrb_size;
        smr_seq_t smrb_seq;
        struct smr_record smrb_recs[];
} *smr_bucket_t;

STAILQ_HEAD(smr_bucket_list, smr_bucket);

static SMR_DEFINE_EARLY(smr_system);

/*! per-cpu state for smr pointers. */
static smr_bucket_t PERCPU_DATA(smr_bucket);

/*! the minimum number of items cached in per-cpu buckets */
static TUNABLE(uint32_t, smr_bucket_count_min, "smr_bucket_count_min", 8);

/*! the amount of memory pending retirement that causes a forceful flush */
#if XNU_TARGET_OS_OSX
#define SMR_RETIRE_THRESHOLD_DEFAULT    (256 << 10)
#else
#define SMR_RETIRE_THRESHOLD_DEFAULT    (64 << 10)
#endif
static TUNABLE(vm_size_t, smr_retire_threshold, "smr_retire_threshold",
    SMR_RETIRE_THRESHOLD_DEFAULT);

/*! the number of items cached in per-cpu buckets */
static SECURITY_READ_ONLY_LATE(uint32_t) smr_bucket_count;

/*! the queue of elements that couldn't be freed immediately */
static struct smr_bucket_list smr_buckets_pending =
    STAILQ_HEAD_INITIALIZER(smr_buckets_pending);

/*! the atomic queue handling deferred deallocations */
static struct mpsc_daemon_queue smr_deallocate_queue;

__attribute__((always_inline))
bool
smr_global_entered(void)
{
        return smr_entered(&smr_system);
}

__attribute__((always_inline))
void
smr_global_enter(void)
{
        smr_enter(&smr_system);
}

__attribute__((always_inline))
void
smr_global_leave(void)
{
        smr_leave(&smr_system);
}

static smr_bucket_t
smr_bucket_alloc(zalloc_flags_t flags)
{
        return kalloc_type(struct smr_bucket, struct smr_record,
                   smr_bucket_count, Z_ZERO | flags);
}

static void
smr_bucket_free(smr_bucket_t bucket)
{
        return kfree_type(struct smr_bucket, struct smr_record,
                   smr_bucket_count, bucket);
}

void
smr_global_retire(void *value, size_t size, void (*destructor)(void *))
{
        smr_bucket_t *slot;
        smr_bucket_t bucket, free_bucket = NULL;

        if (__improbable(startup_phase < STARTUP_SUB_EARLY_BOOT)) {
                /*
                 * The system is still single threaded and this module
                 * is still not fully initialized.
                 */
                destructor(value);
                return;
        }

again:
        disable_preemption();
        slot = PERCPU_GET(smr_bucket);
        bucket = *slot;
        if (bucket && bucket->smrb_seq) {
                mpsc_daemon_enqueue(&smr_deallocate_queue,
                    &bucket->smrb_mplink, MPSC_QUEUE_NONE);
                *slot = bucket = NULL;
        }
        if (bucket == NULL) {
                if (free_bucket) {
                        bucket = free_bucket;
                        free_bucket = NULL;
                } else if ((bucket = smr_bucket_alloc(Z_NOWAIT)) == NULL) {
                        enable_preemption();
                        free_bucket = smr_bucket_alloc(Z_WAITOK | Z_NOFAIL);
                        goto again;
                }
                *slot = bucket;
        }

        bucket->smrb_recs[bucket->smrb_count].smrr_val = value;
        bucket->smrb_recs[bucket->smrb_count].smrr_dtor = destructor;

        if (os_add_overflow(bucket->smrb_size, size, &bucket->smrb_size)) {
                bucket->smrb_size = UINT32_MAX;
        }

        if (++bucket->smrb_count == smr_bucket_count ||
            bucket->smrb_size >= smr_retire_threshold) {
                /*
                 * This will be retired the next time around,
                 * to give readers a chance to notice the new clock.
                 */
                bucket->smrb_seq = smr_advance(&smr_system);
        }
        enable_preemption();

        if (__improbable(free_bucket)) {
                smr_bucket_free(free_bucket);
        }
}
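
/*
 * Typical (illustrative) use of the global SMR domain; `head`, `node`,
 * `unlink_node()` and `node_free()` are hypothetical:
 *
 *     // reader
 *     smr_global_enter();
 *     node = os_atomic_load(&head, acquire);
 *     // ... read-only use of node ...
 *     smr_global_leave();
 *
 *     // updater: unlink first, then hand the node to deferred reclaim
 *     node = unlink_node(&head);
 *     smr_global_retire(node, sizeof(*node), node_free);
 *
 * The destructor runs later from the deallocation daemon, once smr_wait()
 * on the bucket's sequence guarantees that no reader can still hold a
 * reference obtained within smr_global_enter()/smr_global_leave().
 */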


static void
smr_deallocate_queue_invoke(mpsc_queue_chain_t e,
    __assert_only mpsc_daemon_queue_t dq)
{
        smr_bucket_t bucket;

        assert(dq == &smr_deallocate_queue);

        bucket = mpsc_queue_element(e, struct smr_bucket, smrb_mplink);
        smr_wait(&smr_system, bucket->smrb_seq);

        for (uint32_t i = 0; i < bucket->smrb_count; i++) {
                struct smr_record *smrr = &bucket->smrb_recs[i];

                smrr->smrr_dtor(smrr->smrr_val);
        }

        smr_bucket_free(bucket);
}

void
smr_register_mpsc_queue(void)
{
        thread_deallocate_daemon_register_queue(&smr_deallocate_queue,
            smr_deallocate_queue_invoke);
}

static void
smr_startup(void)
{
        smr_bucket_count = zpercpu_count();
        if (smr_bucket_count < smr_bucket_count_min) {
                smr_bucket_count = smr_bucket_count_min;
        }
}
STARTUP(PERCPU, STARTUP_RANK_LAST, smr_startup);