/*
 * Copyright (c) 2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/cpu_data.h>
#include <kern/mpsc_queue.h>
#include <kern/percpu.h>
#include <kern/smr.h>
#include <kern/zalloc.h>
#include <sys/queue.h>

/*
 * This SMR scheme is directly derived from FreeBSD's "Global Unbounded
 * Sequences".
 *
 * Major differences are:
 *
 * - only eager clocks are implemented (no lazy, no implicit)
 */

typedef long                    smr_delta_t;

typedef struct smr_pcpu {
	smr_seq_t               c_rd_seq;
	unsigned long           c_rd_budget;
} *smr_pcpu_t;

#define SMR_SEQ_DELTA(a, b)     ((smr_delta_t)((a) - (b)))
#define SMR_SEQ_CMP(a, op, b)   (SMR_SEQ_DELTA(a, b) op 0)

#define SMR_SEQ_INC             2ul

#define SMR_EARLY_COUNT         32

__startup_data
static struct {
	struct smr_pcpu array[SMR_EARLY_COUNT];
	unsigned        used;
} smr_boot;


#pragma mark - manipulating an SMR clock

/*
 * SMR clocks have 3 state machines interacting at any given time:
 *
 *
 * 1. reader critical sections
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * Each CPU can disable preemption and do this sequence:
 *
 *     CPU::c_rd_seq = GLOBAL::c_wr_seq;
 *
 *     < unfortunate place to receive a long IRQ >                      [I]
 *
 *     os_atomic_thread_fence(seq_cst);                                 [R1]
 *
 *     {
 *         // critical section
 *     }
 *
 *     os_atomic_store(&CPU::c_rd_seq, INVALID, release);               [R2]
 *
 *
 *
 * 2. writer sequence advances
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * Each writer can increment the global write sequence
 * at any given time:
 *
 *    os_atomic_add(&GLOBAL::c_wr_seq, SMR_SEQ_INC, release);           [W]
 *
 *
 *
 * 3. synchronization sequence: poll/wait/scan
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * This state machine synchronizes with the other two in order to decide
 * if a given "goal" is in the past. Only the cases when the call
 * is successful are interesting for barrier purposes, and we will focus
 * on cases that do not take an early return for failures.
 *
 * a. __smr_poll:
 *
 *     rd_seq = os_atomic_load(&GLOBAL::c_rd_seq, acquire);             [S1]
 *     if (goal < rd_seq) SUCCESS.
 *     wr_seq = os_atomic_load(&GLOBAL::c_wr_seq, relaxed);
 *
 * b. __smr_scan
 *
 *     os_atomic_thread_fence(seq_cst)                                  [S2]
 *
 *     observe the minimum CPU::c_rd_seq "min_rd_seq"
 *     value possible or wr_seq if no CPU was in a critical section.
 *     (possibly spinning until it satisfies "goal")
 *
 * c. __smr_rd_advance
 *
 *     cur_rd_seq = load_exclusive(&GLOBAL::c_rd_seq);
 *     os_atomic_thread_fence(seq_cst);                                 [S3]
 *     if (min_rd_seq > cur_rd_seq) {
 *         store_exclusive(&GLOBAL::c_rd_seq, min_rd_seq);
 *     }
 *
 *
 * One sentence summary
 * ~~~~~~~~~~~~~~~~~~~~
 *
 * A simplistic one-sentence summary of the algorithm is that __smr_scan()
 * works really hard to insert itself in the timeline of write sequences and
 * observe a reasonable bound for the first safe-to-reclaim sequence, and
 * issues [S3] to sequence everything around "c_rd_seq" (via [S3] -> [S1]):
 *
 *              GLOBAL::c_rd_seq                GLOBAL::c_wr_seq
 *                             v                v
 *       ──────────────────────┬────────────────┬─────────────────────
 *       ... safe to reclaim   │    deferred    │   future         ...
 *       ──────────────────────┴────────────────┴─────────────────────
 *
 *
 * Detailed explanation
 * ~~~~~~~~~~~~~~~~~~~~
 *
 * [W] -> [R1] establishes a "happens before" relationship between a given
 * writer and this critical section. The loaded GLOBAL::c_wr_seq might
 * however be stale with respect to the one [R1] really synchronizes with
 * (see [I] explanation below).
 *
 *
 * [R1] -> [S2] establishes a "happens before" relationship between all the
 * active critical sections and the scanner.
 * It lets us compute the oldest possible sequence pinned by an active
 * critical section.
 *
 *
 * [R2] -> [S3] establishes a "happens before" relationship between all the
 * inactive critical sections and the scanner.
 *
 *
 * [S3] -> [S1] is the typical expected fastpath: when the caller can decide
 * that its goal is older than the last update an __smr_rd_advance() did.
 * Note that [S3] doubles as an "[S1]" when two __smr_scan() race each other
 * and one of them finishes last but observed a "worse" read sequence.
 *
 *
 * [W], [S3] -> [S1] is the last crucial property: all updates to the global
 * clock are totally ordered because they update the entire 128-bit state
 * every time with an RMW. This guarantees that __smr_poll() can't load
 * an `rd_seq` that is younger than the `wr_seq` it loads next.
 *
 *
 * [I] __smr_enter() can also unfortunately be delayed after observing
 * a given write sequence and right before issuing [R1] (the [I] window).
 *
 * However, for the read sequence to have moved past what __smr_enter()
 * observed, it means another __smr_scan() didn't observe the store to
 * CPU::c_rd_seq made by __smr_enter() and thought the section was inactive.
 *
 * This can only happen if the scan's [S2] was issued before the delayed
 * __smr_enter() [R1] (during the [I] window).
 *
 * As a consequence, the outcome of that scan can be accepted as the "real"
 * write sequence __smr_enter() should have observed.
 *
 *
 * Litmus tests
 * ~~~~~~~~~~~~
 *
 * This is the proof of [W] -> [R1] -> [S2] being established properly:
 * - P0 sets a global and calls smr_synchronize()
 * - P1 does smr_enter() and loads the global
 *
 *     AArch64 MP
 *     {
 *         global = 0;
 *         wr_seq = 123;
 *         p1_rd_seq = 0;
 *
 *         0:x0 = global; 0:x1 = wr_seq; 0:x2 = p1_rd_seq;
 *         1:x0 = global; 1:x1 = wr_seq; 1:x2 = p1_rd_seq;
 *     }
 *      P0                     | P1                         ;
 *      MOV      X8, #2        | LDR        X8, [X1]        ;
 *      STR      X8, [X0]      | STR        X8, [X2]        ;
 *      LDADDL   X8, X9, [X1]  | DMB        SY              ;
 *      DMB      SY            | LDR        X10, [X0]       ;
 *      LDR      X10, [X2]     |                            ;
 *     exists (0:X10 = 0 /\ 1:X8 = 123 /\ 1:X10 = 0)
 *
 *
 * This is the proof that deferred advances are also correct:
 * - P0 sets a global and does a smr_deferred_advance()
 * - P1 does an smr_synchronize() and reads the global
 *
 *     AArch64 MP
 *     {
 *         global = 0;
 *         wr_seq = 123;
 *
 *         0:x0 = global; 0:x1 = wr_seq; 0:x2 = 2;
 *         1:x0 = global; 1:x1 = wr_seq; 1:x2 = 2;
 *     }
 *      P0                     | P1                         ;
 *      STR      X2, [X0]      | LDADDL     X2, X9, [X1]    ;
 *      DMB      SY            | DMB        SY              ;
 *      LDR      X9, [X1]      | LDR        X10, [X0]       ;
 *      ADD      X9, X9, X2    |                            ;
 *     exists (0:X9 = 125 /\ 1:X9 = 123 /\ 1:X10 = 0)
 *
 */

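/*
 * Usage sketch (illustrative only; `my_smr`, `head`, `node_t`, `unlink()`
 * and `node_free()` are made-up names):
 *
 *     // reader
 *     smr_enter(&my_smr);
 *     node_t *n = head;              // observe a published node
 *     // ... use *n, it cannot be freed while the section is entered
 *     smr_leave(&my_smr);
 *
 *     // writer / reclaimer
 *     node_t *old = unlink(&head);
 *     smr_seq_t goal = smr_advance(&my_smr);
 *     // free once every reader that could still see `old` has left,
 *     // either by blocking or by polling:
 *     smr_wait(&my_smr, goal);       // or: if (smr_poll(&my_smr, goal)) ...
 *     node_free(old);
 */
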
static inline smr_pcpu_t __zpercpu
smr_pcpu(smr_t smr)
{
	return (smr_pcpu_t __zpercpu)smr->smr_pcpu;
}

__startup_func
void
__smr_init(smr_t smr)
{
	if (startup_phase < STARTUP_SUB_TUNABLES) {
		smr_pcpu_t pcpu = &smr_boot.array[smr_boot.used++];
		assertf(smr_boot.used <= SMR_EARLY_COUNT,
		    "too many SMR_DEFINE_EARLY(), adjust SMR_EARLY_COUNT");
		smr->smr_pcpu = (unsigned long)__zpcpu_mangle_for_boot(pcpu);
	} else {
		smr_pcpu_t __zpercpu pcpu;

		pcpu = zalloc_percpu_permanent_type(struct smr_pcpu);
		os_atomic_store(&smr->smr_pcpu, (unsigned long)pcpu, release);
	}
}

static inline void
__smr_reset(smr_t smr, smr_seq_t seq)
{
	smr->smr_clock.s_rd_seq = seq;
	smr->smr_clock.s_wr_seq = seq;
	smr->smr_pcpu           = 0;
}

void
smr_init(smr_t smr)
{
	smr_pcpu_t __zpercpu pcpu;

	pcpu = zalloc_percpu(percpu_u64_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
	__smr_reset(smr, SMR_SEQ_INIT);
	os_atomic_store(&smr->smr_pcpu, (unsigned long)pcpu, release);
}

void
smr_set_deferred_budget(smr_t smr, unsigned long budget)
{
	/*
	 * No need to update the per-cpu budget variables,
	 * calls to smr_deferred_advance() will eventually fix them up.
	 *
	 * Note: we use `-1` because smr_deferred_advance_nopreempt()
	 *       checks for overflow rather than hitting 0.
	 */
	smr->smr_budget = budget ? budget - 1 : 0;
}

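/*
 * Illustration (not a contract): with a zero-initialized per-CPU budget
 * and smr_set_deferred_budget(smr, 8), a stream of
 * smr_deferred_advance(smr, 1) calls on one CPU underflows on the first
 * call (performing a real __smr_wr_advance() and refilling the counter
 * to 7), then defers the next 7 calls, i.e. roughly one global RMW per
 * 8 calls per CPU.
 */
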
void
smr_destroy(smr_t smr)
{
	smr_synchronize(smr);
	zfree_percpu(percpu_u64_zone, smr_pcpu(smr));
	__smr_reset(smr, SMR_SEQ_INVALID);
}

static inline bool
smr_entered_nopreempt(smr_t smr)
{
	return zpercpu_get(smr_pcpu(smr))->c_rd_seq != SMR_SEQ_INVALID;
}

__attribute__((always_inline))
bool
smr_entered(smr_t smr)
{
	return get_preemption_level() != 0 && smr_entered_nopreempt(smr);
}

__attribute__((always_inline))
static void
__smr_enter(smr_t smr, smr_pcpu_t pcpu)
{
	smr_seq_t  s_wr_seq;
	smr_seq_t  old_seq;

	/*
	 * It is possible to have a long delay between loading the s_wr_seq
	 * and storing it to the percpu copy of it.
	 *
	 * It is unlikely but possible that by that time the s_rd_seq
	 * advances ahead of what we will store. This however is still safe
	 * and handled in __smr_scan().
	 *
	 * On Intel, to achieve the ordering we want, we could use a store
	 * followed by an mfence, or any RMW (XCHG, XADD, CMPXCHG, ...).
	 * XADD is just the fastest instruction of the alternatives,
	 * but it will only ever add to '0'.
	 */
	s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
#if __x86_64__
	/* [R1] */
	old_seq = os_atomic_add_orig(&pcpu->c_rd_seq, s_wr_seq, seq_cst);
#else
	old_seq = os_atomic_load(&pcpu->c_rd_seq, relaxed);
	os_atomic_store(&pcpu->c_rd_seq, s_wr_seq, relaxed);
	os_atomic_thread_fence(seq_cst); /* [R1] */
#endif
	assert(old_seq == SMR_SEQ_INVALID);
}

__attribute__((always_inline))
void
smr_enter(smr_t smr)
{
	disable_preemption();
	__smr_enter(smr, zpercpu_get(smr_pcpu(smr)));
}

__attribute__((always_inline))
void
smr_leave(smr_t smr)
{
	smr_pcpu_t pcpu = zpercpu_get(smr_pcpu(smr));

	/* [R2] */
	os_atomic_store(&pcpu->c_rd_seq, SMR_SEQ_INVALID, release);
	enable_preemption();
}

static inline smr_seq_t
__smr_wr_advance(smr_t smr)
{
	/* [W] */
	return os_atomic_add(&smr->smr_clock.s_wr_seq, SMR_SEQ_INC, release);
}

static inline smr_clock_t
__smr_wr_advance_combined(smr_t smr)
{
	smr_clock_t clk = { .s_wr_seq = SMR_SEQ_INC, };

	/*
	 * Do a combined increment to get consistent read/write positions.
	 */

	/* [W] */
	clk.s_combined = os_atomic_add(&smr->smr_clock.s_combined,
	    clk.s_combined, release);

	return clk;
}

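/*
 * Note (a sketch of an assumption; the authoritative definition lives in
 * <kern/smr.h>): the combined increment above relies on smr_clock_t
 * overlaying both sequences with a single atomically updatable word,
 * conceptually:
 *
 *     typedef union {
 *         struct {
 *             smr_seq_t s_rd_seq;
 *             smr_seq_t s_wr_seq;
 *         };
 *         unsigned __int128 s_combined;
 *     } smr_clock_t;
 *
 * so that adding a value whose s_wr_seq half is SMR_SEQ_INC bumps the
 * write sequence and returns a snapshot of both halves in a single RMW.
 */
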
static inline bool
__smr_rd_advance(smr_t smr, smr_seq_t goal, smr_seq_t rd_seq)
{
	smr_clock_t oclk, nclk;

	os_atomic_rmw_loop(&smr->smr_clock.s_combined,
	    oclk.s_combined, nclk.s_combined, relaxed, {
		nclk = oclk;

		os_atomic_thread_fence(seq_cst); /* [S3] */

		if (SMR_SEQ_CMP(rd_seq, <=, oclk.s_rd_seq)) {
		        os_atomic_rmw_loop_give_up(break);
		}
		nclk.s_rd_seq = rd_seq;
	});

	return SMR_SEQ_CMP(goal, <=, nclk.s_rd_seq);
}

__attribute__((noinline))
static bool
__smr_scan(smr_t smr, smr_seq_t goal, smr_clock_t clk, bool wait)
{
	smr_delta_t delta;
	smr_seq_t rd_seq;

	/*
	 * Validate that the goal is sane.
	 */
	delta = SMR_SEQ_DELTA(goal, clk.s_wr_seq);
	if (delta == SMR_SEQ_INC && smr->smr_budget) {
		/*
		 * This SMR clock uses deferred advance,
		 * and the goal is one inc in the future.
		 *
		 * If we can wait, then force the clock
		 * to advance, else we can't possibly succeed.
		 */
		if (!wait) {
			return false;
		}
		clk = __smr_wr_advance_combined(smr);
	} else if (delta > 0) {
		/*
		 * Invalid goal: the caller held on to it for too long,
		 * and integers wrapped.
		 */
		return true;
	}

	os_atomic_thread_fence(seq_cst); /* [S2] */

	/*
	 * The read sequence can be no larger than the write sequence
	 * at the start of the poll.
	 *
	 * We know that on entry:
	 *
	 *     s_rd_seq < goal <= s_wr_seq
	 *
	 * The correctness of this algorithm relies on the fact that
	 * the SMR domain [s_rd_seq, s_wr_seq) can't possibly move
	 * by more than roughly (ULONG_MAX / 2) while __smr_scan()
	 * is running, otherwise the "rd_seq" we try to scan for
	 * might appear larger than s_rd_seq spuriously and we'd
	 * __smr_rd_advance() incorrectly.
	 *
	 * This is guaranteed by the fact that this represents
	 * advancing 2^62 times. At one advance every nanosecond,
	 * it takes more than a century, which makes it possible
	 * to call smr_wait() or smr_poll() with preemption enabled.
	 */
	rd_seq = clk.s_wr_seq;

	zpercpu_foreach(it, smr_pcpu(smr)) {
		smr_seq_t seq = os_atomic_load(&it->c_rd_seq, relaxed);

		while (seq != SMR_SEQ_INVALID) {
			/*
			 * Resolve the race documented in __smr_enter().
			 *
			 * The CPU has loaded a stale s_wr_seq, and s_rd_seq
			 * moved past this stale value.
			 *
			 * Its critical section is however properly serialized,
			 * but we can't know what the "correct" s_wr_seq it
			 * could have observed was. We have to assume `s_rd_seq`
			 * to prevent it from advancing.
			 */
			if (SMR_SEQ_CMP(seq, <, clk.s_rd_seq)) {
				seq = clk.s_rd_seq;
			}

			if (!wait || SMR_SEQ_CMP(goal, <=, seq)) {
				break;
			}

			disable_preemption();
			seq = hw_wait_while_equals_long(&it->c_rd_seq, seq);
			enable_preemption();
		}

		if (seq != SMR_SEQ_INVALID && SMR_SEQ_CMP(seq, <, rd_seq)) {
			rd_seq = seq;
		}
	}

	/*
	 * Advance the rd_seq as long as we observed a more recent value.
	 */
	return __smr_rd_advance(smr, goal, rd_seq);
}

static inline bool
__smr_poll(smr_t smr, smr_seq_t goal, bool wait)
{
	smr_clock_t clk;

	/*
	 * Load both the s_rd_seq and s_wr_seq in the right order so that we
	 * can't observe a s_rd_seq newer than the s_wr_seq we load next.
	 */

	/* [S1] */
	clk.s_rd_seq = os_atomic_load(&smr->smr_clock.s_rd_seq, acquire);

	/*
	 * We expect this to be typical: the goal has already been observed.
	 */
	if (__probable(SMR_SEQ_CMP(goal, <=, clk.s_rd_seq))) {
		return true;
	}

	clk.s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);

	return __smr_scan(smr, goal, clk, wait);
}

smr_seq_t
smr_advance(smr_t smr)
{
	smr_clock_t clk;

	assert(!smr_entered(smr));

	/*
	 * We assume that there will at least be a successful __smr_poll
	 * call every 2^61 calls to smr_advance() or so, so we do not need
	 * to check if [s_rd_seq, s_wr_seq) is growing too wide.
	 */
	static_assert(sizeof(clk.s_wr_seq) == 8);
	return __smr_wr_advance(smr);
}

smr_seq_t
smr_deferred_advance_nopreempt(smr_t smr, unsigned long step)
{
	smr_pcpu_t pcpu = zpercpu_get(smr_pcpu(smr));

	assert(get_preemption_level() && !smr_entered_nopreempt(smr));

	if (os_sub_overflow(pcpu->c_rd_budget, step, &pcpu->c_rd_budget)) {
		pcpu->c_rd_budget = smr->smr_budget;
		return __smr_wr_advance(smr);
	}

	/*
	 * Deferred updates are about touching the global c_wr_seq less
	 * often, so we return a sequence number in the future instead.
	 *
	 * This full barrier establishes a "happens before" relationship
	 * with the [W] barrier in the __smr_wr_advance() that will
	 * actually generate it.
	 */
	os_atomic_thread_fence(seq_cst);
	return SMR_SEQ_INC + os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
}

smr_seq_t
smr_deferred_advance(smr_t smr, unsigned long step)
{
	smr_seq_t seq;

	disable_preemption();
	seq = smr_deferred_advance_nopreempt(smr, step);
	enable_preemption();

	return seq;
}

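/*
 * Usage sketch (illustrative only; `my_smr`, `batch` and `reclaim()` are
 * made-up names): a caller batching retirements can tag each batch with
 * a deferred sequence and only pay for a real clock advance when the
 * per-CPU budget runs out:
 *
 *     batch->seq = smr_deferred_advance(&my_smr, 1);
 *     // ... later, from a reclaim context:
 *     smr_wait(&my_smr, batch->seq);
 *     reclaim(batch);
 *
 * __smr_scan() recognizes a goal that is one increment in the future on
 * a clock with a deferred budget and forces the advance when waiting.
 */
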
bool
smr_poll(smr_t smr, smr_seq_t goal)
{
	assert(!smr_entered(smr) && goal != SMR_SEQ_INVALID);
	return __smr_poll(smr, goal, false);
}

void
smr_wait(smr_t smr, smr_seq_t goal)
{
	assert(!smr_entered(smr) && goal != SMR_SEQ_INVALID);
	(void)__smr_poll(smr, goal, true);
}

void
smr_synchronize(smr_t smr)
{
	smr_clock_t clk;

	assert(!smr_entered(smr));
	clk = __smr_wr_advance_combined(smr);
	__smr_scan(smr, clk.s_wr_seq, clk, true);
}
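
/*
 * Usage sketch (illustrative only; `my_smr`, `table_remove()`, `key` and
 * `obj_free()` are made-up names): smr_synchronize() is the blocking
 * "advance, then wait for the new sequence" combination, typically used
 * on teardown paths:
 *
 *     old = table_remove(table, key);   // unpublish the object
 *     smr_synchronize(&my_smr);         // all preexisting readers left
 *     obj_free(old);                    // now provably unreachable
 */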


#pragma mark - system global SMR

typedef struct smr_record {
	void                   *smrr_val;
	void                  (*smrr_dtor)(void *);
} *smr_record_t;

typedef struct smr_bucket {
	union {
		struct mpsc_queue_chain  smrb_mplink;
		STAILQ_ENTRY(smr_bucket) smrb_stqlink;
	};
	uint32_t             smrb_count;
	uint32_t             smrb_size;
	smr_seq_t            smrb_seq;
	struct smr_record    smrb_recs[];
} *smr_bucket_t;

STAILQ_HEAD(smr_bucket_list, smr_bucket);

static SMR_DEFINE_EARLY(smr_system);

/*! per-cpu state for smr pointers. */
static smr_bucket_t PERCPU_DATA(smr_bucket);

/*! the minimum number of items cached in per-cpu buckets */
static TUNABLE(uint32_t, smr_bucket_count_min, "smr_bucket_count_min", 8);

/*! the amount of memory pending retirement that causes a forceful flush */
#if XNU_TARGET_OS_OSX
#define SMR_RETIRE_THRESHOLD_DEFAULT    (256 << 10)
#else
#define SMR_RETIRE_THRESHOLD_DEFAULT    (64 << 10)
#endif
static TUNABLE(vm_size_t, smr_retire_threshold, "smr_retire_threshold",
    SMR_RETIRE_THRESHOLD_DEFAULT);

/*! the number of items cached in per-cpu buckets */
static SECURITY_READ_ONLY_LATE(uint32_t) smr_bucket_count;

/*! the queue of elements that couldn't be freed immediately */
static struct smr_bucket_list smr_buckets_pending =
    STAILQ_HEAD_INITIALIZER(smr_buckets_pending);

/*! the atomic queue handling deferred deallocations */
static struct mpsc_daemon_queue smr_deallocate_queue;

__attribute__((always_inline))
bool
smr_global_entered(void)
{
	return smr_entered(&smr_system);
}

__attribute__((always_inline))
void
smr_global_enter(void)
{
	smr_enter(&smr_system);
}

__attribute__((always_inline))
void
smr_global_leave(void)
{
	smr_leave(&smr_system);
}

static smr_bucket_t
smr_bucket_alloc(zalloc_flags_t flags)
{
	return kalloc_type(struct smr_bucket, struct smr_record,
	           smr_bucket_count, Z_ZERO | flags);
}

static void
smr_bucket_free(smr_bucket_t bucket)
{
	return kfree_type(struct smr_bucket, struct smr_record,
	           smr_bucket_count, bucket);
}

void
smr_global_retire(void *value, size_t size, void (*destructor)(void *))
{
	smr_bucket_t *slot;
	smr_bucket_t bucket, free_bucket = NULL;

	if (__improbable(startup_phase < STARTUP_SUB_EARLY_BOOT)) {
		/*
		 * The system is still single threaded and this module
		 * is still not fully initialized.
		 */
		destructor(value);
		return;
	}

again:
	disable_preemption();
	slot = PERCPU_GET(smr_bucket);
	bucket = *slot;
	if (bucket && bucket->smrb_seq) {
		mpsc_daemon_enqueue(&smr_deallocate_queue,
		    &bucket->smrb_mplink, MPSC_QUEUE_NONE);
		*slot = bucket = NULL;
	}
	if (bucket == NULL) {
		if (free_bucket) {
			bucket = free_bucket;
			free_bucket = NULL;
		} else if ((bucket = smr_bucket_alloc(Z_NOWAIT)) == NULL) {
			enable_preemption();
			free_bucket = smr_bucket_alloc(Z_WAITOK | Z_NOFAIL);
			goto again;
		}
		*slot = bucket;
	}

	bucket->smrb_recs[bucket->smrb_count].smrr_val = value;
	bucket->smrb_recs[bucket->smrb_count].smrr_dtor = destructor;

	if (os_add_overflow(bucket->smrb_size, size, &bucket->smrb_size)) {
		bucket->smrb_size = UINT32_MAX;
	}

	if (++bucket->smrb_count == smr_bucket_count ||
	    bucket->smrb_size >= smr_retire_threshold) {
		/*
		 * This will be retired the next time around,
		 * to give readers a chance to notice the new clock.
		 */
		bucket->smrb_seq = smr_advance(&smr_system);
	}
	enable_preemption();

	if (__improbable(free_bucket)) {
		smr_bucket_free(free_bucket);
	}
}
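
/*
 * Usage sketch (illustrative only; `obj` and `obj_destroy()` are made-up
 * names): callers unpublish an object and hand it to the global SMR
 * domain together with a destructor; the deallocate daemon runs the
 * destructor once a grace period has elapsed:
 *
 *     // `obj` is already unreachable from any global structure
 *     smr_global_retire(obj, sizeof(*obj), obj_destroy);
 */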


static void
smr_deallocate_queue_invoke(mpsc_queue_chain_t e,
    __assert_only mpsc_daemon_queue_t dq)
{
	smr_bucket_t bucket;

	assert(dq == &smr_deallocate_queue);

	bucket = mpsc_queue_element(e, struct smr_bucket, smrb_mplink);
	smr_wait(&smr_system, bucket->smrb_seq);

	for (uint32_t i = 0; i < bucket->smrb_count; i++) {
		struct smr_record *smrr = &bucket->smrb_recs[i];

		smrr->smrr_dtor(smrr->smrr_val);
	}

	smr_bucket_free(bucket);
}

void
smr_register_mpsc_queue(void)
{
	thread_deallocate_daemon_register_queue(&smr_deallocate_queue,
	    smr_deallocate_queue_invoke);
}

static void
smr_startup(void)
{
	smr_bucket_count = zpercpu_count();
	if (smr_bucket_count < smr_bucket_count_min) {
		smr_bucket_count = smr_bucket_count_min;
	}
}
STARTUP(PERCPU, STARTUP_RANK_LAST, smr_startup);
779