xref: /xnu-8796.121.2/osfmk/kern/smr.c (revision c54f35ca767986246321eb901baf8f5ff7923f6a)
1 /*
2  * Copyright (c) 2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <kern/locks_internal.h>
30 #include <kern/cpu_data.h>
31 #include <kern/mpsc_queue.h>
32 #include <kern/percpu.h>
33 #include <kern/smr.h>
34 #include <kern/smr_hash.h>
35 #include <kern/zalloc.h>
36 #include <sys/queue.h>
37 #include <os/hash.h>
38 
39 
40 #pragma mark - SMR domains
41 
42 typedef struct smr_pcpu {
43 	smr_seq_t               c_rd_seq;
44 } *smr_pcpu_t;
45 
46 /*
47  * This SMR scheme is taken directly from FreeBSD's "Global Unbounded Sequences".
48  *
49  * Major differences are:
50  *
51  * - only eager clocks are implemented (no lazy, no implicit)
52  *
53  *
54  * SMR clocks have 3 state machines interacting at any given time:
55  *
56  * 1. reader critical sections
57  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~
58  *
59  * Each CPU can disable preemption and do this sequence:
60  *
61  *     CPU::c_rd_seq = GLOBAL::c_wr_seq;
62  *
63  *     < unfortunate place to receive a long IRQ >                      [I]
64  *
65  *     os_atomic_thread_fence(seq_cst);                                 [R1]
66  *
67  *     {
68  *         // critical section
69  *     }
70  *
71  *     os_atomic_store(&CPU::c_rd_seq, INVALID, release);               [R2]
72  *
73  *
74  *
75  * 2. writer sequence advances
76  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~
77  *
78  * Each writer can increment the global write sequence
79  * at any given time:
80  *
81  *    os_atomic_add(&GLOBAL::c_wr_seq, SMR_SEQ_INC, release);           [W]
82  *
83  *
84  *
85  * 3. synchronization sequence: poll/wait/scan
86  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
87  *
88  * This state machine synchronizes with the other two in order to decide
89  * if a given "goal" is in the past. Only the cases where the call
90  * is successful are interesting for barrier purposes, and we will focus
91  * on cases that do not take an early return for failures.
92  *
93  * a. __smr_poll:
94  *
95  *     rd_seq = os_atomic_load(&GLOBAL::c_rd_seq, acquire);             [S1]
96  *     if (goal < rd_seq) SUCCESS.
97  *     wr_seq = os_atomic_load(&GLOBAL::c_wr_seq, relaxed);
98  *
99  * b. __smr_scan
100  *
101  *     os_atomic_thread_fence(seq_cst)                                  [S2]
102  *
103  *     observe the minimum CPU::c_rd_seq "min_rd_seq"
104  *     value possible, or wr_seq if no CPU was in a critical section.
105  *     (possibly spinning until it satisfies "goal")
106  *
107  * c. __smr_rd_advance
108  *
109  *     cur_rd_seq = load_exclusive(&GLOBAL::c_rd_seq);
110  *     os_atomic_thread_fence(seq_cst);                                 [S3]
111  *     if (min_rd_seq > cur_rd_seq) {
112  *         store_exclusive(&GLOBAL::c_rd_seq, min_rd_seq);
113  *     }
114  *
115  *
116  * One sentence summary
117  * ~~~~~~~~~~~~~~~~~~~~
118  *
119  * A simplistic one-sentence summary of the algorithm is that __smr_scan()
120  * works really hard to insert itself in the timeline of write sequences and
121  * observe a reasonable bound for the first safe-to-reclaim sequence, and
122  * issues [S3] to sequence everything around "c_rd_seq" (via [S3] -> [S1]):
123  *
124  *              GLOBAL::c_rd_seq                GLOBAL::c_wr_seq
125  *                             v                v
126  *       ──────────────────────┬────────────────┬─────────────────────
127  *       ... safe to reclaim   │    deferred    │   future         ...
128  *       ──────────────────────┴────────────────┴─────────────────────
129  *
130  *
131  * Detailed explanation
132  * ~~~~~~~~~~~~~~~~~~~~
133  *
134  * [W] -> [R1] establishes a "happens before" relationship between a given
135  * writer and this critical section. The loaded GLOBAL::c_wr_seq might
136  * however be stale with respect to the one [R1] really synchronizes with
137  * (see [I] explanation below).
138  *
139  *
140  * [R1] -> [S2] establishes a "happens before" relationship between all the
141  * active critical sections and the scanner.
142  * It lets us compute the oldest possible sequence pinned by an active
143  * critical section.
144  *
145  *
146  * [R2] -> [S3] establishes a "happens before" relationship between all the
147  * inactive critical sections and the scanner.
148  *
149  *
150  * [S3] -> [S1] is the typical expected fastpath: when the caller can decide
151  * that its goal is older than the last update an __smr_rd_advance() did.
152  * Note that [S3] doubles as an "[S1]" when two __smr_scan() race each other
153  * and one of them finishes last but observed a "worse" read sequence.
154  *
155  *
156  * [W], [S3] -> [S1] is the last crucial property: all updates to the global
157  * clock are totally ordered because they update the entire 128-bit state
158  * every time with an RMW. This guarantees that __smr_poll() can't load
159  * an `rd_seq` that is younger than the `wr_seq` it loads next.
160  *
161  *
162  * [I] __smr_enter() can unfortunately be delayed for a long time after
163  * observing a given write sequence, right before issuing [R1] (the [I] window).
164  *
165  * However, for the read sequence to have moved past what __smr_enter() observed,
166  * it means another __smr_scan() didn't observe the store to CPU::c_rd_seq
167  * made by __smr_enter() and thought the section was inactive.
168  *
169  * This can only happen if the scan's [S2] was issued before the delayed
170  * __smr_enter() [R1] (during the [I] window).
171  *
172  * As a consequence the outcome of that scan can be accepted as the "real"
173  * write sequence __smr_enter() should have observed.
174  *
175  *
176  * Litmus tests
177  * ~~~~~~~~~~~~
178  *
179  * This is the proof of [W] -> [R1] -> [S2] being established properly:
180  * - P0 sets a global and calls smr_synchronize()
181  * - P1 does smr_enter() and loads the global
182  *
183  *     AArch64 MP
184  *     {
185  *         global = 0;
186  *         wr_seq = 123;
187  *         p1_rd_seq = 0;
188  *
189  *         0:x0 = global; 0:x1 = wr_seq; 0:x2 = p1_rd_seq;
190  *         1:x0 = global; 1:x1 = wr_seq; 1:x2 = p1_rd_seq;
191  *     }
192  *      P0                     | P1                         ;
193  *      MOV      X8, #2        | LDR        X8, [X1]        ;
194  *      STR      X8, [X0]      | STR        X8, [X2]        ;
195  *      LDADDL   X8, X9, [X1]  | DMB        SY              ;
196  *      DMB      SY            | LDR        X10, [X0]       ;
197  *      LDR      X10, [X2]     |                            ;
198  *     exists (0:X10 = 0 /\ 1:X8 = 123 /\ 1:X10 = 0)
199  *
200  *
201  * This is the proof that deferred advances are also correct:
202  * - P0 sets a global and does a smr_deferred_advance()
203  * - P1 does an smr_synchronize() and reads the global
204  *
205  *     AArch64 MP
206  *     {
207  *         global = 0;
208  *         wr_seq = 123;
209  *
210  *         0:x0 = global; 0:x1 = wr_seq; 0:x2 = 2;
211  *         1:x0 = global; 1:x1 = wr_seq; 1:x2 = 2;
212  *     }
213  *      P0                     | P1                         ;
214  *      STR      X2, [X0]      | LDADDL     X2, X9, [X1]    ;
215  *      DMB      SY            | DMB        SY              ;
216  *      LDR      X9, [X1]      | LDR        X10, [X0]       ;
217  *      ADD      X9, X9, X2    |                            ;
218  *     exists (0:X9 = 125 /\ 1:X9 = 123 /\ 1:X10 = 0)
219  *
220  */
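
/*
 * Illustrative usage (a hedged sketch, not part of the kernel sources:
 * `foo`, `foo_head` and `foo_free` are hypothetical, and `foo_head` is
 * assumed to be an SMR-qualified pointer):
 *
 * Reader side, under smr_enter()/smr_leave():
 *
 *     smr_enter(&smr_system);
 *     foo = smr_entered_load(&foo_head);
 *     if (foo) {
 *         // read-only use of *foo; no blocking, preemption is disabled
 *     }
 *     smr_leave(&smr_system);
 *
 * Writer side, after unpublishing `foo` so that new readers can't find it:
 *
 *     smr_seq_t goal = smr_advance(&smr_system);
 *     ...
 *     if (!smr_poll(&smr_system, goal)) {
 *         smr_wait(&smr_system, goal);
 *     }
 *     foo_free(foo);
 */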
221 
222 #pragma mark SMR domains: init & helpers
223 
224 __attribute__((always_inline, overloadable))
225 static inline smr_pcpu_t
226 __smr_pcpu(smr_t smr, int cpu)
227 {
228 	return zpercpu_get_cpu(smr->smr_pcpu, cpu);
229 }
230 
231 __attribute__((always_inline, overloadable))
232 static inline smr_pcpu_t
233 __smr_pcpu(smr_t smr)
234 {
235 	return zpercpu_get(smr->smr_pcpu);
236 }
237 
238 static inline void
239 __smr_pcpu_associate(smr_t smr, smr_pcpu_t pcpu)
240 {
241 	os_atomic_store(&smr->smr_pcpu, pcpu, release);
242 }
243 
244 __startup_func
245 void
246 __smr_domain_init(smr_t smr)
247 {
248 	smr_pcpu_t pcpu;
249 
250 	if (startup_phase < STARTUP_SUB_TUNABLES) {
251 		smr_seq_t *rd_seqp = &smr->smr_early;
252 
253 		pcpu = __container_of(rd_seqp, struct smr_pcpu, c_rd_seq);
254 		smr->smr_pcpu = __zpcpu_mangle_for_boot(pcpu);
255 	} else {
256 		pcpu = zalloc_percpu_permanent_type(struct smr_pcpu);
257 		__smr_pcpu_associate(smr, pcpu);
258 	}
259 }
260 
261 smr_t
262 smr_domain_create(smr_flags_t flags)
263 {
264 	smr_pcpu_t pcpu;
265 	smr_t smr;
266 
267 	smr  = kalloc_type(struct smr, Z_WAITOK | Z_ZERO | Z_NOFAIL);
268 	pcpu = zalloc_percpu(percpu_u64_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
269 
270 	smr->smr_clock.s_rd_seq = SMR_SEQ_INIT;
271 	smr->smr_clock.s_wr_seq = SMR_SEQ_INIT;
272 	smr->smr_flags = flags;
273 
274 	__smr_pcpu_associate(smr, pcpu);
275 
276 	return smr;
277 }
278 
279 void
280 smr_domain_free(smr_t smr)
281 {
282 	smr_synchronize(smr);
283 
284 	zfree_percpu(percpu_u64_zone, smr->smr_pcpu);
285 	kfree_type(struct smr, smr);
286 }
287 
288 #pragma mark SMR domains: enter / leave
289 
290 static inline bool
291 smr_entered_nopreempt(smr_t smr)
292 {
293 	return __smr_pcpu(smr)->c_rd_seq != SMR_SEQ_INVALID;
294 }
295 
296 __attribute__((always_inline))
297 bool
298 smr_entered(smr_t smr)
299 {
300 	return get_preemption_level() != 0 && smr_entered_nopreempt(smr);
301 }
302 
303 __attribute__((always_inline))
304 bool
305 smr_entered_cpu(smr_t smr, int cpu)
306 {
307 	return __smr_pcpu(smr, cpu)->c_rd_seq != SMR_SEQ_INVALID;
308 }
309 
310 __attribute__((always_inline))
311 static void
312 __smr_enter(smr_t smr, smr_pcpu_t pcpu)
313 {
314 	smr_seq_t  s_wr_seq;
315 	smr_seq_t  old_seq;
316 
317 	/*
318 	 * It is possible to have a long delay between loading the s_wr_seq
319 	 * and storing it to the percpu copy of it.
320 	 *
321 	 * It is unlikely but possible by that time the s_rd_seq advances
322 	 * ahead of what we will store. This however is still safe
323 	 * and handled in __smr_scan().
324 	 *
325 	 * On Intel, to achieve the ordering we want, we could use a store
326 	 * followed by an mfence, or any RMW (XCHG, XADD, CMPXCHG, ...).
327 	 * XADD is just the fastest instruction of the alternatives,
328 	 * but it will only ever add to '0'.
329 	 */
330 	s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
331 #if __x86_64__
332 	/* [R1] */
333 	old_seq = os_atomic_add_orig(&pcpu->c_rd_seq, s_wr_seq, seq_cst);
334 #else
335 	old_seq = pcpu->c_rd_seq;
336 	os_atomic_store(&pcpu->c_rd_seq, s_wr_seq, relaxed);
337 	os_atomic_thread_fence(seq_cst); /* [R1] */
338 #endif
339 	assert(old_seq == SMR_SEQ_INVALID);
340 }
341 
342 __attribute__((always_inline))
343 static void
344 __smr_leave(smr_pcpu_t pcpu)
345 {
346 	/* [R2] */
347 	os_atomic_store(&pcpu->c_rd_seq, SMR_SEQ_INVALID, release);
348 }
349 
350 __attribute__((always_inline))
351 void
352 smr_enter(smr_t smr)
353 {
354 	disable_preemption();
355 	__smr_enter(smr, __smr_pcpu(smr));
356 }
357 
358 __attribute__((always_inline))
359 void
360 smr_leave(smr_t smr)
361 {
362 	__smr_leave(__smr_pcpu(smr));
363 	enable_preemption();
364 }
365 
366 
367 #pragma mark SMR domains: advance, wait, poll, synchronize
368 
369 static inline smr_seq_t
370 __smr_wr_advance(smr_t smr)
371 {
372 	/* [W] */
373 	return os_atomic_add(&smr->smr_clock.s_wr_seq, SMR_SEQ_INC, release);
374 }
375 
376 static inline bool
377 __smr_rd_advance(smr_t smr, smr_seq_t goal, smr_seq_t rd_seq)
378 {
379 	smr_seq_t o_seq;
380 
381 	os_atomic_thread_fence(seq_cst); /* [S3] */
382 
383 	os_atomic_rmw_loop(&smr->smr_clock.s_rd_seq, o_seq, rd_seq, relaxed, {
384 		if (SMR_SEQ_CMP(rd_seq, <=, o_seq)) {
385 		        rd_seq = o_seq;
386 		        os_atomic_rmw_loop_give_up(break);
387 		}
388 	});
389 
390 	return SMR_SEQ_CMP(goal, <=, rd_seq);
391 }
392 
393 __attribute__((noinline))
394 static bool
395 __smr_scan(smr_t smr, smr_seq_t goal, smr_clock_t clk, bool wait)
396 {
397 	smr_delta_t delta;
398 	smr_seq_t rd_seq;
399 
400 	/*
401 	 * Validate that the goal is sane.
402 	 */
403 	delta = SMR_SEQ_DELTA(goal, clk.s_wr_seq);
404 	if (delta == SMR_SEQ_INC) {
405 		/*
406 		 * This SMR clock uses deferred advance,
407 		 * and the goal is one inc in the future.
408 		 *
409 		 * If we can wait, then commit the sequence number,
410 		 * else we can't possibly succeed.
411 		 *
412 		 * Doing a commit here rather than an advance
413 		 * gives the hardware a chance to abort the
414 		 * transaction early in case of high contention
415 		 * compared to an unconditional advance.
416 		 */
417 		if (!wait) {
418 			return false;
419 		}
420 		if (lock_cmpxchgv(&smr->smr_clock.s_wr_seq,
421 		    clk.s_wr_seq, goal, &clk.s_wr_seq, relaxed)) {
422 			clk.s_wr_seq = goal;
423 		}
424 	} else if (delta > 0) {
425 		/*
426 		 * Invalid goal: the caller held on it for too long,
427 		 * and integers wrapped.
428 		 */
429 		return true;
430 	}
431 
432 	os_atomic_thread_fence(seq_cst); /* [S2] */
433 
434 	/*
435 	 * The read sequence can be no larger than the write sequence
436 	 * at the start of the poll.
437 	 *
438 	 * We know that on entry:
439 	 *
440 	 *     s_rd_seq < goal <= s_wr_seq
441 	 *
442 	 * The correctness of this algorithm relies on the fact that
443 	 * the SMR domain [s_rd_seq, s_wr_seq) can't possibly move
444 	 * by more than roughly (ULONG_MAX / 2) while __smr_scan()
445 	 * is running, otherwise the "rd_seq" we try to scan for
446 	 * might appear larger than s_rd_seq spuriously and we'd
447 	 * __smr_rd_advance() incorrectly.
448 	 *
449 	 * This is guaranteed by the fact that this represents
450 	 * advancing 2^62 times. At one advance every nanosecond,
451 	 * it takes more than a century, which makes it possible
452 	 * to call smr_wait() or smr_poll() with preemption enabled.
453 	 */
454 	rd_seq = clk.s_wr_seq;
455 
456 	zpercpu_foreach(it, smr->smr_pcpu) {
457 		smr_seq_t seq = os_atomic_load(&it->c_rd_seq, relaxed);
458 
459 		while (seq != SMR_SEQ_INVALID) {
460 			/*
461 			 * Resolve the race documented in __smr_enter().
462 			 *
463 			 * The CPU has loaded a stale s_wr_seq, and s_rd_seq
464 			 * moved past this stale value.
465 			 *
466 			 * Its critical section is however properly serialized,
467 			 * but we can't know what the "correct" s_wr_seq it
468 			 * could have observed was. We conservatively assume it was
469 			 * `s_rd_seq`, which prevents the read sequence from advancing.
470 			 */
471 			if (SMR_SEQ_CMP(seq, <, clk.s_rd_seq)) {
472 				seq = clk.s_rd_seq;
473 			}
474 
475 			if (!wait || SMR_SEQ_CMP(goal, <=, seq)) {
476 				break;
477 			}
478 
479 			disable_preemption();
480 			seq = hw_wait_while_equals_long(&it->c_rd_seq, seq);
481 			enable_preemption();
482 		}
483 
484 		if (seq != SMR_SEQ_INVALID && SMR_SEQ_CMP(seq, <, rd_seq)) {
485 			rd_seq = seq;
486 		}
487 	}
488 
489 	/*
490 	 * Advance the rd_seq as long as we observed a more recent value.
491 	 */
492 	return __smr_rd_advance(smr, goal, rd_seq);
493 }
494 
495 static inline bool
496 __smr_poll(smr_t smr, smr_seq_t goal, bool wait)
497 {
498 	smr_clock_t clk;
499 
500 	/*
501 	 * Load both the s_rd_seq and s_wr_seq in the right order so that we
502 	 * can't observe a s_rd_seq older than s_wr_seq.
503 	 */
504 
505 	/* [S1] */
506 	clk.s_rd_seq = os_atomic_load(&smr->smr_clock.s_rd_seq, acquire);
507 
508 	/*
509 	 * We expect this to be typical: the goal has already been observed.
510 	 */
511 	if (__probable(SMR_SEQ_CMP(goal, <=, clk.s_rd_seq))) {
512 		return true;
513 	}
514 
515 	clk.s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
516 
517 	return __smr_scan(smr, goal, clk, wait);
518 }
519 
520 smr_seq_t
521 smr_advance(smr_t smr)
522 {
523 	smr_clock_t clk;
524 
525 	assert(!smr_entered(smr));
526 
527 	/*
528 	 * We assume that there will at least be a successful __smr_poll
529 	 * call every 2^61 calls to smr_advance() or so, so we do not need
530 	 * to check if [s_rd_seq, s_wr_seq) is growing too wide.
531 	 */
532 	static_assert(sizeof(clk.s_wr_seq) == 8);
533 	return __smr_wr_advance(smr);
534 }
535 
536 smr_seq_t
537 smr_deferred_advance(smr_t smr)
538 {
539 	os_atomic_thread_fence(seq_cst);
540 	return SMR_SEQ_INC + os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
541 }
542 
543 void
544 smr_deferred_advance_commit(smr_t smr, smr_seq_t seq)
545 {
546 	/*
547 	 * no barrier needed: smr_deferred_advance() had one already.
548 	 * no failure handling: it means someone updated the clock already!
549 	 * lock_cmpxchg: so that we pre-test for architectures needing it.
550 	 */
551 	assert(seq != SMR_SEQ_INVALID);
552 	lock_cmpxchg(&smr->smr_clock.s_wr_seq, seq - SMR_SEQ_INC, seq, relaxed);
553 }
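
/*
 * Deferred-advance pattern (hedged sketch): smr_deferred_advance() computes
 * the goal without touching the global clock; the commit (or a waiting
 * __smr_scan()) publishes it later, which avoids contending on the clock in
 * the hot path:
 *
 *     smr_seq_t goal = smr_deferred_advance(smr);
 *     ... record `goal` with the element(s) being retired ...
 *     smr_deferred_advance_commit(smr, goal);
 *     ...
 *     smr_wait(smr, goal);    // or smr_poll(smr, goal) from a reclaim path
 */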
554 
555 bool
556 smr_poll(smr_t smr, smr_seq_t goal)
557 {
558 	assert(!smr_entered(smr) && goal != SMR_SEQ_INVALID);
559 	return __smr_poll(smr, goal, false);
560 }
561 
562 void
563 smr_wait(smr_t smr, smr_seq_t goal)
564 {
565 	assert(!smr_entered(smr) && goal != SMR_SEQ_INVALID);
566 	(void)__smr_poll(smr, goal, true);
567 }
568 
569 void
570 smr_synchronize(smr_t smr)
571 {
572 	smr_clock_t clk;
573 
574 	assert(!smr_entered(smr));
575 
576 	/*
577 	 * Similar to __smr_poll() but also does a deferred advance which
578 	 * __smr_scan will commit.
579 	 */
580 
581 	clk.s_rd_seq = os_atomic_load(&smr->smr_clock.s_rd_seq, relaxed);
582 	os_atomic_thread_fence(seq_cst);
583 	clk.s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
584 
585 	(void)__smr_scan(smr, clk.s_wr_seq + SMR_SEQ_INC, clk, true);
586 }
587 
588 
589 #pragma mark system global SMR
590 
591 typedef struct smr_record {
592 	void                   *smrr_val;
593 	void                  (*smrr_dtor)(void *);
594 } *smr_record_t;
595 
596 typedef struct smr_bucket {
597 	union {
598 		struct mpsc_queue_chain  smrb_mplink;
599 		STAILQ_ENTRY(smr_bucket) smrb_stqlink;
600 	};
601 	uint32_t             smrb_count;
602 	uint32_t             smrb_size;
603 	smr_seq_t            smrb_seq;
604 	struct smr_record    smrb_recs[];
605 } *smr_bucket_t;
606 
607 STAILQ_HEAD(smr_bucket_list, smr_bucket);
608 
609 SMR_DEFINE(smr_system);
610 
611 /*! per-cpu state for smr pointers. */
612 static smr_bucket_t PERCPU_DATA(smr_bucket);
613 
614 /*! the minimum number of items cached in per-cpu buckets */
615 static TUNABLE(uint32_t, smr_bucket_count_min, "smr_bucket_count_min", 8);
616 
617 /*! the amount of memory pending retiring that causes a forceful flush */
618 #if XNU_TARGET_OS_OSX
619 #define SMR_RETIRE_THRESHOLD_DEFAULT    (256 << 10)
620 #else
621 #define SMR_RETIRE_THRESHOLD_DEFAULT    (64 << 10)
622 #endif
623 static TUNABLE(vm_size_t, smr_retire_threshold, "smr_retire_threshold",
624     SMR_RETIRE_THRESHOLD_DEFAULT);
625 
626 /*! the number of items cached in per-cpu buckets */
627 static SECURITY_READ_ONLY_LATE(uint32_t) smr_bucket_count;
628 
629 /*! the queue of elements that couldn't be freed immediately */
630 static struct smr_bucket_list smr_buckets_pending =
631     STAILQ_HEAD_INITIALIZER(smr_buckets_pending);
632 
633 /*! the atomic queue handling deferred deallocations */
634 static struct mpsc_daemon_queue smr_deallocate_queue;
635 
636 static smr_bucket_t
637 smr_bucket_alloc(zalloc_flags_t flags)
638 {
639 	return kalloc_type(struct smr_bucket, struct smr_record,
640 	           smr_bucket_count, Z_ZERO | flags);
641 }
642 
643 static void
644 smr_bucket_free(smr_bucket_t bucket)
645 {
646 	return kfree_type(struct smr_bucket, struct smr_record,
647 	           smr_bucket_count, bucket);
648 }
649 
650 void
651 smr_global_retire(void *value, size_t size, void (*destructor)(void *))
652 {
653 	smr_bucket_t *slot;
654 	smr_bucket_t bucket, free_bucket = NULL;
655 
656 	if (__improbable(startup_phase < STARTUP_SUB_EARLY_BOOT)) {
657 		/*
658 		 * The system is still single threaded and this module
659 		 * is still not fully initialized.
660 		 */
661 		destructor(value);
662 		return;
663 	}
664 
665 again:
666 	disable_preemption();
667 	slot = PERCPU_GET(smr_bucket);
668 	bucket = *slot;
669 	if (bucket && bucket->smrb_seq) {
670 		mpsc_daemon_enqueue(&smr_deallocate_queue,
671 		    &bucket->smrb_mplink, MPSC_QUEUE_NONE);
672 		*slot = bucket = NULL;
673 	}
674 	if (bucket == NULL) {
675 		if (free_bucket) {
676 			bucket = free_bucket;
677 			free_bucket = NULL;
678 		} else if ((bucket = smr_bucket_alloc(Z_NOWAIT)) == NULL) {
679 			enable_preemption();
680 			free_bucket = smr_bucket_alloc(Z_WAITOK | Z_NOFAIL);
681 			goto again;
682 		}
683 		*slot = bucket;
684 	}
685 
686 	bucket->smrb_recs[bucket->smrb_count].smrr_val = value;
687 	bucket->smrb_recs[bucket->smrb_count].smrr_dtor = destructor;
688 
689 	if (os_add_overflow(bucket->smrb_size, size, &bucket->smrb_size)) {
690 		bucket->smrb_size = UINT32_MAX;
691 	}
692 
693 	if (++bucket->smrb_count == smr_bucket_count ||
694 	    bucket->smrb_size >= smr_retire_threshold) {
695 		/*
696 		 * This will be retired the next time around,
697 		 * to give readers a chance to notice the new clock.
698 		 */
699 		bucket->smrb_seq = smr_advance(&smr_system);
700 	}
701 	enable_preemption();
702 
703 	if (__improbable(free_bucket)) {
704 		smr_bucket_free(free_bucket);
705 	}
706 }
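
/*
 * Typical smr_global_retire() usage (hedged sketch; `struct my_node` and
 * my_node_free() are hypothetical):
 *
 *     static void
 *     my_node_free(void *ptr)
 *     {
 *         kfree_type(struct my_node, (struct my_node *)ptr);
 *     }
 *
 *     // after unlinking `node` so that no new reader can find it:
 *     smr_global_retire(node, sizeof(*node), my_node_free);
 *
 * The destructor runs from the deallocation daemon once smr_wait() on the
 * bucket's sequence guarantees no smr_system reader can still observe it.
 */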
707 
708 
709 static void
710 smr_deallocate_queue_invoke(mpsc_queue_chain_t e,
711     __assert_only mpsc_daemon_queue_t dq)
712 {
713 	smr_bucket_t bucket;
714 
715 	assert(dq == &smr_deallocate_queue);
716 
717 	bucket = mpsc_queue_element(e, struct smr_bucket, smrb_mplink);
718 	smr_wait(&smr_system, bucket->smrb_seq);
719 
720 	for (uint32_t i = 0; i < bucket->smrb_count; i++) {
721 		struct smr_record *smrr = &bucket->smrb_recs[i];
722 
723 		smrr->smrr_dtor(smrr->smrr_val);
724 	}
725 
726 	smr_bucket_free(bucket);
727 }
728 
729 void
730 smr_register_mpsc_queue(void)
731 {
732 	thread_deallocate_daemon_register_queue(&smr_deallocate_queue,
733 	    smr_deallocate_queue_invoke);
734 }
735 
736 static void
737 smr_startup(void)
738 {
739 	smr_bucket_count = zpercpu_count();
740 	if (smr_bucket_count < smr_bucket_count_min) {
741 		smr_bucket_count = smr_bucket_count_min;
742 	}
743 }
744 STARTUP(PERCPU, STARTUP_RANK_LAST, smr_startup);
745 
746 
747 #pragma mark - SMR hash tables
748 
749 static struct smrq_slist_head *
750 smr_hash_alloc_array(size_t size)
751 {
752 	return kalloc_type(struct smrq_slist_head, size,
753 	           Z_WAITOK | Z_ZERO | Z_SPRAYQTN);
754 }
755 
756 static void
757 smr_hash_free_array(struct smrq_slist_head *array, size_t size)
758 {
759 	kfree_type(struct smrq_slist_head, size, array);
760 }
761 
762 static inline uintptr_t
763 smr_hash_array_encode(struct smrq_slist_head *array, uint16_t order)
764 {
765 	uintptr_t ptr;
766 
767 	ptr  = (uintptr_t)array;
768 	ptr &= ~SMRH_ARRAY_ORDER_MASK;
769 	ptr |= (uintptr_t)order << SMRH_ARRAY_ORDER_SHIFT;
770 
771 	return ptr;
772 }
773 
774 #pragma mark SMR simple hash tables
775 
776 void
777 smr_hash_init(struct smr_hash *smrh, size_t size)
778 {
779 	struct smrq_slist_head *array;
780 	uint16_t shift;
781 
782 	assert(size);
783 	shift = (uint16_t)flsll(size - 1);
784 	size  = 1UL << shift;
785 	if (startup_phase >= STARTUP_SUB_LOCKDOWN) {
786 		assert(size * sizeof(struct smrq_slist_head) <=
787 		    KALLOC_SAFE_ALLOC_SIZE);
788 	}
789 	array = smr_hash_alloc_array(size);
790 
791 	*smrh = (struct smr_hash){
792 		.smrh_array = smr_hash_array_encode(array, 64 - shift),
793 	};
794 }
795 
796 void
797 smr_hash_destroy(struct smr_hash *smrh)
798 {
799 	struct smr_hash_array array = smr_hash_array_decode(smrh);
800 
801 	smr_hash_free_array(array.smrh_array, smr_hash_size(array));
802 	*smrh = (struct smr_hash){ };
803 }
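
/*
 * Typical lifecycle (hedged sketch): the requested size is rounded up to
 * a power of two by smr_hash_init().
 *
 *     struct smr_hash h;
 *
 *     smr_hash_init(&h, 128);     // allocates 128 buckets
 *     ...
 *     smr_hash_destroy(&h);       // frees the array, not the elements
 */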
804 
805 void
806 __smr_hash_serialized_clear(
807 	struct smr_hash        *smrh,
808 	smrh_traits_t          smrht,
809 	void                 (^free)(void *obj))
810 {
811 	struct smr_hash_array array = smr_hash_array_decode(smrh);
812 
813 	for (size_t i = 0; i < smr_hash_size(array); i++) {
814 		struct smrq_slink *link;
815 		__smrq_slink_t *prev;
816 
817 		prev = &array.smrh_array[i].first;
818 		while ((link = smr_serialized_load(prev))) {
819 			prev = &link->next;
820 			free(__smrht_link_to_obj(smrht, link));
821 		}
822 
823 		smr_clear_store(&array.smrh_array[i].first);
824 	}
825 
826 	smrh->smrh_count = 0;
827 }
828 
829 kern_return_t
830 __smr_hash_shrink_and_unlock(
831 	struct smr_hash        *smrh,
832 	lck_mtx_t              *lock,
833 	smrh_traits_t           smrht)
834 {
835 	struct smr_hash_array decptr = smr_hash_array_decode(smrh);
836 	struct smrq_slist_head *newarray, *oldarray;
837 	uint16_t neworder = decptr.smrh_order + 1;
838 	size_t   oldsize  = smr_hash_size(decptr);
839 	size_t   newsize  = oldsize / 2;
840 
841 	assert(newsize);
842 
843 	if (os_atomic_load(&smrh->smrh_resizing, relaxed)) {
844 		lck_mtx_unlock(lock);
845 		return KERN_FAILURE;
846 	}
847 
848 	os_atomic_store(&smrh->smrh_resizing, true, relaxed);
849 	lck_mtx_unlock(lock);
850 
851 	newarray = smr_hash_alloc_array(newsize);
852 	if (newarray == NULL) {
853 		os_atomic_store(&smrh->smrh_resizing, false, relaxed);
854 		return KERN_RESOURCE_SHORTAGE;
855 	}
856 
857 	lck_mtx_lock(lock);
858 
859 	/*
860 	 * Step 1: collapse all the chains in pairs.
861 	 */
862 	oldarray = decptr.smrh_array;
863 
864 	for (size_t i = 0; i < newsize; i++) {
865 		newarray[i] = oldarray[i];
866 		smrq_serialized_append(&newarray[i], &oldarray[i + newsize]);
867 	}
868 
869 	/*
870 	 * Step 2: publish the new array.
871 	 */
872 	os_atomic_store(&smrh->smrh_array,
873 	    smr_hash_array_encode(newarray, neworder), release);
874 
875 	os_atomic_store(&smrh->smrh_resizing, false, relaxed);
876 
877 	lck_mtx_unlock(lock);
878 
879 	/*
880 	 * Step 3: free the old array once readers can't observe the old values.
881 	 */
882 	smr_synchronize(smrht->domain);
883 
884 	smr_hash_free_array(oldarray, oldsize);
885 	return KERN_SUCCESS;
886 }
887 
888 kern_return_t
889 __smr_hash_grow_and_unlock(
890 	struct smr_hash        *smrh,
891 	lck_mtx_t              *lock,
892 	smrh_traits_t           smrht)
893 {
894 	struct smr_hash_array decptr = smr_hash_array_decode(smrh);
895 	struct smrq_slist_head *newarray, *oldarray;
896 	__smrq_slink_t **prevarray;
897 	uint16_t neworder = decptr.smrh_order - 1;
898 	size_t   oldsize  = smr_hash_size(decptr);
899 	size_t   newsize  = 2 * oldsize;
900 	bool     needs_another_round = false;
901 
902 	if (smrh->smrh_resizing) {
903 		lck_mtx_unlock(lock);
904 		return KERN_FAILURE;
905 	}
906 
907 	smrh->smrh_resizing = true;
908 	lck_mtx_unlock(lock);
909 
910 	newarray = smr_hash_alloc_array(newsize);
911 	if (newarray == NULL) {
912 		os_atomic_store(&smrh->smrh_resizing, false, relaxed);
913 		return KERN_RESOURCE_SHORTAGE;
914 	}
915 
916 	prevarray = kalloc_type(__smrq_slink_t *, newsize,
917 	    Z_WAITOK | Z_ZERO | Z_SPRAYQTN);
918 	if (prevarray == NULL) {
919 		smr_hash_free_array(newarray, newsize);
920 		os_atomic_store(&smrh->smrh_resizing, false, relaxed);
921 		return KERN_RESOURCE_SHORTAGE;
922 	}
923 
924 
925 	lck_mtx_lock(lock);
926 
927 	/*
928 	 * Step 1: create a duplicated array with twice as many heads.
929 	 */
930 	oldarray = decptr.smrh_array;
931 
932 	memcpy(newarray, oldarray, oldsize * sizeof(newarray[0]));
933 	memcpy(newarray + oldsize, oldarray, oldsize * sizeof(newarray[0]));
934 
935 	/*
936 	 * Step 2: Publish the new array, and wait for readers to observe it
937 	 *         before we do any change.
938 	 */
939 	os_atomic_store(&smrh->smrh_array,
940 	    smr_hash_array_encode(newarray, neworder), release);
941 
942 	smr_synchronize(smrht->domain);
943 
944 
945 	/*
946 	 * Step 3: split the lists.
947 	 */
948 
949 	/*
950 	 * If the list we are trying to split looked like this,
951 	 * where L elements will go to the "left" bucket and "R"
952 	 * to the right one:
953 	 *
954 	 *     old_head --> L1 --> L2                -> L5
955 	 *                            \             /      \
956 	 *                             -> R3 --> R4         -> R6 --> NULL
957 	 *
958 	 * Then make sure the new heads point to their legitimate first element,
959 	 * leading to this state:
960 	 *
961 	 *     l_head   --> L1 --> L2                -> L5
962 	 *                            \             /      \
963 	 *     r_head   ----------------> R3 --> R4         -> R6 --> NULL
964 	 *
965 	 *
966 	 *     prevarray[left]  = &L2->next
967 	 *     prevarray[right] = &r_head
968 	 *     oldarray[old]    = L2
969 	 */
970 
971 	for (size_t i = 0; i < oldsize; i++) {
972 		struct smrq_slink *link, *next;
973 		uint32_t want_mask;
974 
975 		link = smr_serialized_load(&oldarray[i].first);
976 		if (link == NULL) {
977 			continue;
978 		}
979 
980 		want_mask = smrht->obj_hash(link, 0) & oldsize;
981 		while ((next = smr_serialized_load(&link->next)) &&
982 		    (smrht->obj_hash(next, 0) & oldsize) == want_mask) {
983 			link = next;
984 		}
985 
986 		if (want_mask == 0) {
987 			/* elements seen go to the "left" bucket */
988 			prevarray[i] = &link->next;
989 			prevarray[i + oldsize] = &newarray[i + oldsize].first;
990 			smr_serialized_store_relaxed(prevarray[i + oldsize], next);
991 		} else {
992 			/* elements seen go to the "right" bucket */
993 			prevarray[i] = &newarray[i].first;
994 			prevarray[i + oldsize] = &link->next;
995 			smr_serialized_store_relaxed(prevarray[i], next);
996 		}
997 
998 		smr_serialized_store_relaxed(&oldarray[i].first,
999 		    next ? link : NULL);
1000 
1001 		needs_another_round |= (next != NULL);
1002 	}
1003 
1004 	/*
1005 	 * At this point, when we split further, we must wait for
1006 	 * readers to observe the previous state before we split
1007 	 * further. Indeed, reusing the example above, the next
1008 	 * round of splitting would end up with this:
1009 	 *
1010 	 *     l_head   --> L1 --> L2 ----------------> L5
1011 	 *                                          /      \
1012 	 *     r_head   ----------------> R3 --> R4         -> R6 --> NULL
1013 	 *
1014 	 *
1015 	 *     prevarray[left]  = &L2->next
1016 	 *     prevarray[right] = &R4->next
1017 	 *     oldarray[old]    = R4
1018 	 *
1019 	 * But we must be sure that no readers can observe r_head
1020 	 * having been L1, otherwise a stale reader might skip over
1021 	 * R3/R4.
1022 	 *
1023 	 * Generally speaking we need to do that each time we do a round
1024 	 * of splitting that isn't terminating the list with NULL.
1025 	 */
1026 
1027 	while (needs_another_round) {
1028 		smr_synchronize(smrht->domain);
1029 
1030 		needs_another_round = false;
1031 
1032 		for (size_t i = 0; i < oldsize; i++) {
1033 			struct smrq_slink *link, *next;
1034 			uint32_t want_mask;
1035 
1036 			link = smr_serialized_load(&oldarray[i].first);
1037 			if (link == NULL) {
1038 				continue;
1039 			}
1040 
1041 			/*
1042 			 * If `prevarray[i]` (left) points to the linkage
1043 			 * we stopped at, then it means the next element
1044 			 * will be "to the right" and vice versa.
1045 			 *
1046 			 * We also already know "next" exists, so only probe
1047 			 * after it.
1048 			 */
1049 			if (prevarray[i] == &link->next) {
1050 				want_mask = (uint32_t)oldsize;
1051 			} else {
1052 				want_mask = 0;
1053 			}
1054 
1055 			link = smr_serialized_load(&link->next);
1056 
1057 			while ((next = smr_serialized_load(&link->next)) &&
1058 			    (smrht->obj_hash(next, 0) & oldsize) == want_mask) {
1059 				link = next;
1060 			}
1061 
1062 			if (want_mask == 0) {
1063 				/* elements seen go to the "left" bucket */
1064 				prevarray[i] = &link->next;
1065 				smr_serialized_store_relaxed(prevarray[i + oldsize], next);
1066 			} else {
1067 				/* elements seen go to the "right" bucket */
1068 				smr_serialized_store_relaxed(prevarray[i], next);
1069 				prevarray[i + oldsize] = &link->next;
1070 			}
1071 
1072 			smr_serialized_store_relaxed(&oldarray[i].first,
1073 			    next ? link : NULL);
1074 
1075 			needs_another_round |= (next != NULL);
1076 		}
1077 	}
1078 
1079 	smrh->smrh_resizing = false;
1080 	lck_mtx_unlock(lock);
1081 
1082 	/*
1083 	 * Step 4: cleanup, no need to wait for readers, this happened already
1084 	 *         at least once for splitting reasons.
1085 	 */
1086 	smr_hash_free_array(oldarray, oldsize);
1087 	kfree_type(__smrq_slink_t *, newsize, prevarray);
1088 	return KERN_SUCCESS;
1089 }
1090 
1091 #pragma mark SMR scalable hash tables
1092 
1093 #define SMRSH_MIGRATED  ((struct smrq_slink *)SMRSH_BUCKET_STOP_BIT)
1094 static LCK_GRP_DECLARE(smr_shash_grp, "smr_shash");
1095 
1096 static inline size_t
1097 __smr_shash_min_size(struct smr_shash *smrh)
1098 {
1099 	return 1ul << smrh->smrsh_min_shift;
1100 }
1101 
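/*
 * Buckets are indexed by the top (32 - shift) bits of a 32-bit hash,
 * so a shift of `s` maps to (~0u >> s) + 1 == 2^(32 - s) buckets
 * (e.g. shift 28 -> 16 buckets).
 */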
1102 static inline size_t
1103 __smr_shash_size_for_shift(uint8_t shift)
1104 {
1105 	return (~0u >> shift) + 1;
1106 }
1107 
1108 static inline size_t
1109 __smr_shash_cursize(smrsh_state_t state)
1110 {
1111 	return __smr_shash_size_for_shift(state.curshift);
1112 }
1113 
1114 static void
1115 __smr_shash_bucket_init(hw_lck_ptr_t *head)
1116 {
1117 	hw_lck_ptr_init(head, __smr_shash_bucket_stop(head), &smr_shash_grp);
1118 }
1119 
1120 static void
1121 __smr_shash_bucket_destroy(hw_lck_ptr_t *head)
1122 {
1123 	hw_lck_ptr_destroy(head, &smr_shash_grp);
1124 }
1125 
1126 __attribute__((noinline))
1127 void *
1128 __smr_shash_entered_find_slow(
1129 	const struct smr_shash *smrh,
1130 	smrh_key_t              key,
1131 	hw_lck_ptr_t           *head,
1132 	smrh_traits_t           traits)
1133 {
1134 	struct smrq_slink *link;
1135 	smrsh_state_t state;
1136 	uint32_t hash;
1137 
1138 	/* wait for this bucket's elements to be rehashed into their target buckets */
1139 	hw_lck_ptr_wait_for_value(head, SMRSH_MIGRATED, &smr_shash_grp);
1140 
1141 	state = os_atomic_load(&smrh->smrsh_state, dependency);
1142 	hash  = __smr_shash_hash(smrh, state.newidx, key, traits);
1143 	head  = __smr_shash_bucket(smrh, state, SMRSH_NEW, hash);
1144 
1145 	link  = hw_lck_ptr_value(head);
1146 	while (!__smr_shash_is_stop(link)) {
1147 		if (traits->obj_equ(link, key)) {
1148 			return __smrht_link_to_obj(traits, link);
1149 		}
1150 		link = smr_entered_load(&link->next);
1151 	}
1152 
1153 	assert(link == __smr_shash_bucket_stop(head));
1154 	return NULL;
1155 }
1156 
1157 static const uint8_t __smr_shash_grow_ratio[] = {
1158 	[SMRSH_COMPACT]           = 6,
1159 	[SMRSH_BALANCED]          = 4,
1160 	[SMRSH_BALANCED_NOSHRINK] = 4,
1161 	[SMRSH_FASTEST]           = 2,
1162 };
1163 
1164 static inline uint64_t
1165 __smr_shash_count(struct smr_shash *smrh)
1166 {
1167 	int64_t count = (int64_t)counter_load(&smrh->smrsh_count);
1168 
1169 	/*
1170 	 * Negative values make no sense and are likely due to some
1171 	 * stale values being read.
1172 	 */
1173 	return count < 0 ? 0ull : (uint64_t)count;
1174 }
1175 
1176 static inline bool
1177 __smr_shash_should_grow(
1178 	struct smr_shash       *smrh,
1179 	smrsh_state_t           state,
1180 	uint64_t                count)
1181 {
1182 	size_t size = __smr_shash_cursize(state);
1183 
1184 	/* grow if elem:bucket ratio is worse than grow_ratio:1 */
1185 	return count > __smr_shash_grow_ratio[smrh->smrsh_policy] * size;
1186 }
1187 
1188 static inline bool
1189 __smr_shash_should_reseed(
1190 	struct smr_shash       *smrh,
1191 	size_t                  observed_depth)
1192 {
1193 	return observed_depth > 10 * __smr_shash_grow_ratio[smrh->smrsh_policy];
1194 }
1195 
1196 static inline bool
1197 __smr_shash_should_shrink(
1198 	struct smr_shash       *smrh,
1199 	smrsh_state_t           state,
1200 	uint64_t                count)
1201 {
1202 	size_t size = __smr_shash_cursize(state);
1203 
1204 	switch (smrh->smrsh_policy) {
1205 	case SMRSH_COMPACT:
1206 		/* shrink if bucket:elem ratio is worse than 1:1 */
1207 		return size > count && size > __smr_shash_min_size(smrh);
1208 	case SMRSH_BALANCED:
1209 		/* shrink if bucket:elem ratio is worse than 2:1 */
1210 		return size > 2 * count && size > __smr_shash_min_size(smrh);
1211 	case SMRSH_BALANCED_NOSHRINK:
1212 	case SMRSH_FASTEST:
1213 		return false;
1214 	}
1215 }
1216 
1217 static inline void
1218 __smr_shash_schedule_rehash(
1219 	struct smr_shash       *smrh,
1220 	smrh_traits_t           traits,
1221 	smrsh_rehash_t          reason)
1222 {
1223 	smrsh_rehash_t rehash;
1224 
1225 	rehash = os_atomic_load(&smrh->smrsh_rehashing, relaxed);
1226 	if (rehash & reason) {
1227 		return;
1228 	}
1229 
1230 	rehash = os_atomic_or_orig(&smrh->smrsh_rehashing, reason, relaxed);
1231 	if (!rehash) {
1232 		thread_call_enter1(smrh->smrsh_callout,
1233 		    __DECONST(void *, traits));
1234 	}
1235 }
1236 
1237 void *
1238 __smr_shash_entered_get_or_insert(
1239 	struct smr_shash       *smrh,
1240 	smrh_key_t              key,
1241 	struct smrq_slink      *link,
1242 	smrh_traits_t           traits)
1243 {
1244 	struct smrq_slink *first;
1245 	struct smrq_slink *other;
1246 	uint32_t hash, depth;
1247 	smrsh_state_t state;
1248 	hw_lck_ptr_t *head;
1249 	void *obj;
1250 
1251 	state = os_atomic_load(&smrh->smrsh_state, dependency);
1252 	hash  = __smr_shash_hash(smrh, state.curidx, key, traits);
1253 	head  = __smr_shash_bucket(smrh, state, SMRSH_CUR, hash);
1254 	first = hw_lck_ptr_lock_nopreempt(head, &smr_shash_grp);
1255 
1256 	if (__improbable(first == SMRSH_MIGRATED)) {
1257 		hw_lck_ptr_unlock_nopreempt(head, first, &smr_shash_grp);
1258 
1259 		state = os_atomic_load(&smrh->smrsh_state, dependency);
1260 		hash  = __smr_shash_hash(smrh, state.newidx, key, traits);
1261 		head  = __smr_shash_bucket(smrh, state, SMRSH_NEW, hash);
1262 		first = hw_lck_ptr_lock_nopreempt(head, &smr_shash_grp);
1263 	}
1264 
1265 	depth = 0;
1266 	other = first;
1267 	while (!__smr_shash_is_stop(other)) {
1268 		depth++;
1269 		if (traits->obj_equ(other, key)) {
1270 			obj = __smrht_link_to_obj(traits, other);
1271 			if (traits->obj_try_get(obj)) {
1272 				hw_lck_ptr_unlock_nopreempt(head, first,
1273 				    &smr_shash_grp);
1274 				return obj;
1275 			}
1276 			break;
1277 		}
1278 		other = smr_serialized_load(&other->next);
1279 	}
1280 
1281 	counter_inc_preemption_disabled(&smrh->smrsh_count);
1282 	smr_serialized_store_relaxed(&link->next, first);
1283 	hw_lck_ptr_unlock_nopreempt(head, link, &smr_shash_grp);
1284 
1285 	if (__smr_shash_should_reseed(smrh, depth)) {
1286 		__smr_shash_schedule_rehash(smrh, traits, SMRSH_REHASH_RESEED);
1287 	} else if (depth * 2 >= __smr_shash_grow_ratio[smrh->smrsh_policy] &&
1288 	    __smr_shash_should_grow(smrh, state, __smr_shash_count(smrh))) {
1289 		__smr_shash_schedule_rehash(smrh, traits, SMRSH_REHASH_GROW);
1290 	}
1291 	return NULL;
1292 }
1293 
1294 __abortlike
1295 static void
1296 __smr_shash_missing_elt_panic(
1297 	struct smr_shash        *smrh,
1298 	struct smrq_slink       *link,
1299 	smrh_traits_t           traits)
1300 {
1301 	panic("Unable to find item %p (linkage %p) in %p (traits %p)",
1302 	    __smrht_link_to_obj(traits, link), link, smrh, traits);
1303 }
1304 
1305 smr_shash_mut_cursor_t
1306 __smr_shash_entered_mut_begin(
1307 	struct smr_shash       *smrh,
1308 	struct smrq_slink      *link,
1309 	smrh_traits_t           traits)
1310 {
1311 	struct smrq_slink *first, *next;
1312 	__smrq_slink_t *prev;
1313 	smrsh_state_t state;
1314 	hw_lck_ptr_t *head;
1315 	uint32_t hash;
1316 
1317 	state = os_atomic_load(&smrh->smrsh_state, dependency);
1318 	hash  = __smr_shash_hash(smrh, state.curidx, link, traits);
1319 	head  = __smr_shash_bucket(smrh, state, SMRSH_CUR, hash);
1320 	first = hw_lck_ptr_lock_nopreempt(head, &smr_shash_grp);
1321 
1322 	if (__improbable(first == SMRSH_MIGRATED)) {
1323 		hw_lck_ptr_unlock_nopreempt(head, first, &smr_shash_grp);
1324 
1325 		state = os_atomic_load(&smrh->smrsh_state, dependency);
1326 		hash  = __smr_shash_hash(smrh, state.newidx, link, traits);
1327 		head  = __smr_shash_bucket(smrh, state, SMRSH_NEW, hash);
1328 		first = hw_lck_ptr_lock_nopreempt(head, &smr_shash_grp);
1329 	}
1330 
1331 	next = first;
1332 	while (next != link) {
1333 		if (__smr_shash_is_stop(next)) {
1334 			__smr_shash_missing_elt_panic(smrh, link, traits);
1335 		}
1336 		prev  = &next->next;
1337 		next  = smr_serialized_load(prev);
1338 	}
1339 
1340 	return (smr_shash_mut_cursor_t){ .head = head, .prev = prev };
1341 }
1342 
1343 void
1344 __smr_shash_entered_mut_erase(
1345 	struct smr_shash       *smrh,
1346 	smr_shash_mut_cursor_t  cursor,
1347 	struct smrq_slink      *link,
1348 	smrh_traits_t           traits)
1349 {
1350 	struct smrq_slink *next, *first;
1351 	smrsh_state_t state;
1352 
1353 	first = hw_lck_ptr_value(cursor.head);
1354 
1355 	next  = smr_serialized_load(&link->next);
1356 	if (first == link) {
1357 		hw_lck_ptr_unlock_nopreempt(cursor.head, next, &smr_shash_grp);
1358 	} else {
1359 		smr_serialized_store_relaxed(cursor.prev, next);
1360 		hw_lck_ptr_unlock_nopreempt(cursor.head, first, &smr_shash_grp);
1361 	}
1362 	counter_dec_preemption_disabled(&smrh->smrsh_count);
1363 
1364 	state = atomic_load_explicit(&smrh->smrsh_state, memory_order_relaxed);
1365 	if (first == link && __smr_shash_is_stop(next) &&
1366 	    __smr_shash_should_shrink(smrh, state, __smr_shash_count(smrh))) {
1367 		__smr_shash_schedule_rehash(smrh, traits, SMRSH_REHASH_SHRINK);
1368 	}
1369 }
1370 
1371 void
1372 __smr_shash_entered_mut_replace(
1373 	smr_shash_mut_cursor_t  cursor,
1374 	struct smrq_slink      *old_link,
1375 	struct smrq_slink      *new_link)
1376 {
1377 	struct smrq_slink *first, *next;
1378 
1379 	first = hw_lck_ptr_value(cursor.head);
1380 
1381 	next  = smr_serialized_load(&old_link->next);
1382 	smr_serialized_store_relaxed(&new_link->next, next);
1383 	if (first == old_link) {
1384 		hw_lck_ptr_unlock_nopreempt(cursor.head, new_link, &smr_shash_grp);
1385 	} else {
1386 		smr_serialized_store_relaxed(cursor.prev, new_link);
1387 		hw_lck_ptr_unlock_nopreempt(cursor.head, first, &smr_shash_grp);
1388 	}
1389 }
1390 
1391 void
1392 __smr_shash_entered_mut_abort(smr_shash_mut_cursor_t cursor)
1393 {
1394 	hw_lck_ptr_unlock_nopreempt(cursor.head,
1395 	    hw_lck_ptr_value(cursor.head), &smr_shash_grp);
1396 }
1397 
1398 static kern_return_t
1399 __smr_shash_rehash_with_target(
1400 	struct smr_shash       *smrh,
1401 	smrsh_state_t           state,
1402 	uint8_t                 newshift,
1403 	smrh_traits_t           traits)
1404 {
1405 	const size_t FLAT_SIZE = 256;
1406 	struct smrq_slink *flat_queue[FLAT_SIZE];
1407 
1408 	size_t oldsize, newsize;
1409 	hw_lck_ptr_t *oldarray;
1410 	hw_lck_ptr_t *newarray;
1411 	uint32_t newseed;
1412 	uint8_t oldidx;
1413 
1414 	/*
1415 	 * This function resizes a scalable hash table.
1416 	 *
1417 	 * It doesn't require a lock because it is the callout
1418 	 * of a THREAD_CALL_ONCE thread call.
1419 	 */
1420 
1421 	oldidx         = state.curidx;
1422 	state.newidx   = 1 - state.curidx;
1423 	state.newshift = newshift;
1424 	assert(__smr_shash_load_array(smrh, state.newidx) == NULL);
1425 
1426 	oldsize = __smr_shash_cursize(state);
1427 	newsize = __smr_shash_size_for_shift(newshift);
1428 
1429 	oldarray = __smr_shash_load_array(smrh, state.curidx);
1430 	newarray = (hw_lck_ptr_t *)smr_hash_alloc_array(newsize);
1431 	newseed  = (uint32_t)early_random();
1432 
1433 	if (newarray == NULL) {
1434 		return KERN_RESOURCE_SHORTAGE;
1435 	}
1436 
1437 	/*
1438 	 * Step 1: initialize the new array and seed,
1439 	 *         and then publish the state referencing it.
1440 	 *
1441 	 *         We do not need to synchronize explicitly with SMR,
1442 	 *         because readers/writers will notice rehashing when
1443 	 *         the bucket they interact with has a SMRSH_MIGRATED
1444 	 *         value.
1445 	 */
1446 
1447 	for (size_t i = 0; i < newsize; i++) {
1448 		__smr_shash_bucket_init(&newarray[i]);
1449 	}
1450 	os_atomic_store(&smrh->smrsh_array[state.newidx], newarray, relaxed);
1451 	os_atomic_store(&smrh->smrsh_seed[state.newidx], newseed, relaxed);
1452 	os_atomic_store(&smrh->smrsh_state, state, release);
1453 
1454 	/*
1455 	 * Step 2: migrate buckets "atomically" under the old bucket lock.
1456 	 *
1457 	 *         This migration is atomic for writers because
1458 	 *         they take the old bucket lock first, and if
1459 	 *         they observe SMRSH_MIGRATED as the value,
1460 	 *         go look in the new bucket instead.
1461 	 *
1462 	 *         This migration is atomic for readers, because
1463 	 *         as we move elements to their new buckets,
1464 	 *         the hash chains will not circle back to their
1465 	 *         bucket head (the "stop" value won't match),
1466 	 *         or the bucket head will be SMRSH_MIGRATED.
1467 	 *
1468 	 *         This causes a slowpath which spins waiting
1469 	 *         for SMRSH_MIGRATED to appear and then looks
1470 	 *         in the new bucket.
1471 	 */
1472 	for (size_t i = 0; i < oldsize; i++) {
1473 		struct smrq_slink *first, *link, *next;
1474 		hw_lck_ptr_t *head;
1475 		uint32_t hash;
1476 		size_t n = 0;
1477 
1478 		link = first = hw_lck_ptr_lock(&oldarray[i], &smr_shash_grp);
1479 
1480 		while (!__smr_shash_is_stop(link)) {
1481 			flat_queue[n++ % FLAT_SIZE] = link;
1482 			link = smr_serialized_load(&link->next);
1483 		}
1484 
1485 		while (n-- > 0) {
1486 			for (size_t j = (n % FLAT_SIZE) + 1; j-- > 0;) {
1487 				link = flat_queue[j];
1488 				hash = traits->obj_hash(link, newseed);
1489 				head = &newarray[hash >> newshift];
1490 				next = hw_lck_ptr_lock_nopreempt(head,
1491 				    &smr_shash_grp);
1492 				smr_serialized_store_relaxed(&link->next, next);
1493 				hw_lck_ptr_unlock_nopreempt(head, link,
1494 				    &smr_shash_grp);
1495 			}
1496 			n &= ~(FLAT_SIZE - 1);
1497 
1498 			/*
1499 			 * If there were more than FLAT_SIZE elements in the
1500 			 * chain (which is super unlikely and in many ways
1501 			 * worrisome), then we need to repopulate
1502 			 * the flattened queue array for each run.
1503 			 *
1504 			 * This is O(n^2) but we have worse problems anyway
1505 			 * if we ever hit this path.
1506 			 */
1507 			if (__improbable(n > 0)) {
1508 				link = first;
1509 				for (size_t j = 0; j < n - FLAT_SIZE; j++) {
1510 					link = smr_serialized_load(&link->next);
1511 				}
1512 
1513 				flat_queue[0] = link;
1514 				for (size_t j = 1; j < FLAT_SIZE; j++) {
1515 					link = smr_serialized_load(&link->next);
1516 					flat_queue[j] = link;
1517 				}
1518 			}
1519 		}
1520 
1521 		hw_lck_ptr_unlock(&oldarray[i], SMRSH_MIGRATED, &smr_shash_grp);
1522 	}
1523 
1524 	/*
1525 	 * Step 3: deallocate the old array of buckets,
1526 	 *         making sure to hide it from readers.
1527 	 */
1528 
1529 	state.curshift = state.newshift;
1530 	state.curidx   = state.newidx;
1531 	os_atomic_store(&smrh->smrsh_state, state, release);
1532 
1533 	smr_synchronize(traits->domain);
1534 
1535 	os_atomic_store(&smrh->smrsh_array[oldidx], NULL, relaxed);
1536 	for (size_t i = 0; i < oldsize; i++) {
1537 		__smr_shash_bucket_destroy(&oldarray[i]);
1538 	}
1539 	smr_hash_free_array((struct smrq_slist_head *)oldarray, oldsize);
1540 
1541 	return KERN_SUCCESS;
1542 }
1543 
1544 static void
1545 __smr_shash_rehash(thread_call_param_t arg0, thread_call_param_t arg1)
1546 {
1547 	struct smr_shash *smrh   = arg0;
1548 	smrh_traits_t     traits = arg1;
1549 	smrsh_rehash_t    reason;
1550 	smrsh_state_t     state;
1551 	uint64_t          count;
1552 	kern_return_t     kr;
1553 
1554 	do {
1555 		reason = os_atomic_xchg(&smrh->smrsh_rehashing,
1556 		    SMRSH_REHASH_RUNNING, relaxed);
1557 
1558 		state  = os_atomic_load(&smrh->smrsh_state, relaxed);
1559 		count  = __smr_shash_count(smrh);
1560 
1561 		if (__smr_shash_should_grow(smrh, state, count)) {
1562 			kr = __smr_shash_rehash_with_target(smrh, state,
1563 			    state.curshift - 1, traits);
1564 		} else if (__smr_shash_should_shrink(smrh, state, count)) {
1565 			kr = __smr_shash_rehash_with_target(smrh, state,
1566 			    state.curshift + 1, traits);
1567 		} else if (reason & SMRSH_REHASH_RESEED) {
1568 			kr = __smr_shash_rehash_with_target(smrh, state,
1569 			    state.curshift, traits);
1570 		} else {
1571 			kr = KERN_SUCCESS;
1572 		}
1573 
1574 		if (kr == KERN_RESOURCE_SHORTAGE) {
1575 			uint64_t deadline;
1576 
1577 			os_atomic_or(&smrh->smrsh_rehashing, reason, relaxed);
1578 			nanoseconds_to_deadline(NSEC_PER_MSEC, &deadline);
1579 			thread_call_enter1_delayed(smrh->smrsh_callout,
1580 			    arg1, deadline);
1581 			break;
1582 		}
1583 	} while (!os_atomic_cmpxchg(&smrh->smrsh_rehashing,
1584 	    SMRSH_REHASH_RUNNING, SMRSH_REHASH_NONE, relaxed));
1585 }
1586 
1587 void
1588 smr_shash_init(struct smr_shash *smrh, smrsh_policy_t policy, size_t min_size)
1589 {
1590 	smrsh_state_t state;
1591 	hw_lck_ptr_t *array;
1592 	uint8_t shift;
1593 	size_t size;
1594 
1595 	switch (policy) {
1596 	case SMRSH_COMPACT:
1597 		if (min_size < 2) {
1598 			min_size = 2;
1599 		}
1600 		break;
1601 	default:
1602 		if (min_size < 16) {
1603 			min_size = 16;
1604 		}
1605 		break;
1606 	}
1607 
1608 	switch (policy) {
1609 	case SMRSH_COMPACT:
1610 		size = MIN(2, min_size);
1611 		break;
1612 	case SMRSH_BALANCED:
1613 	case SMRSH_BALANCED_NOSHRINK:
1614 		size = MIN(16, min_size);
1615 		break;
1616 	case SMRSH_FASTEST:
1617 		size = min_size;
1618 		break;
1619 	}
1620 
1621 	if (size > KALLOC_SAFE_ALLOC_SIZE / sizeof(*array)) {
1622 		size = KALLOC_SAFE_ALLOC_SIZE / sizeof(*array);
1623 	}
1624 	shift = (uint8_t)__builtin_clz((uint32_t)(size - 1));
1625 	size  = (~0u >> shift) + 1;
1626 	array = (hw_lck_ptr_t *)smr_hash_alloc_array(size);
1627 	for (size_t i = 0; i < size; i++) {
1628 		__smr_shash_bucket_init(&array[i]);
1629 	}
1630 
1631 	state = (smrsh_state_t){
1632 		.curshift = shift,
1633 		.newshift = shift,
1634 	};
1635 	*smrh = (struct smr_shash){
1636 		.smrsh_array[0]  = array,
1637 		.smrsh_seed[0]   = (uint32_t)early_random(),
1638 		.smrsh_state     = state,
1639 		.smrsh_policy    = policy,
1640 		.smrsh_min_shift = (uint8_t)flsll(min_size - 1),
1641 	};
1642 	counter_alloc(&smrh->smrsh_count);
1643 	smrh->smrsh_callout  = thread_call_allocate_with_options(__smr_shash_rehash,
1644 	    smrh, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
1645 }
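
/*
 * Illustrative setup (hedged sketch; the smrh_traits machinery and the
 * typed wrappers built on top of it live in <kern/smr_hash.h>):
 *
 *     struct smr_shash cache;
 *
 *     smr_shash_init(&cache, SMRSH_BALANCED, 64);
 *     ...
 *     // lookups / insertions / removals go through traits-based wrappers
 *     // that end up in the __smr_shash_entered_*() functions above.
 */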
1646 
1647 void
1648 __smr_shash_destroy(
1649 	struct smr_shash       *smrh,
1650 	smrh_traits_t           traits,
1651 	void                  (^free)(void *))
1652 {
1653 	smrsh_state_t state;
1654 	hw_lck_ptr_t *array;
1655 	size_t size;
1656 
1657 	thread_call_cancel_wait(smrh->smrsh_callout);
1658 
1659 	state = os_atomic_load(&smrh->smrsh_state, dependency);
1660 	assert(state.curidx == state.newidx);
1661 	assert(__smr_shash_load_array(smrh, 1 - state.curidx) == NULL);
1662 	size  = __smr_shash_cursize(state);
1663 	array = __smr_shash_load_array(smrh, state.curidx);
1664 
1665 	if (free) {
1666 		for (size_t i = 0; i < size; i++) {
1667 			struct smrq_slink *link, *next;
1668 
1669 			next = hw_lck_ptr_value(&array[i]);
1670 			while (!__smr_shash_is_stop(next)) {
1671 				link = next;
1672 				next = smr_serialized_load(&link->next);
1673 				free(__smrht_link_to_obj(traits, link));
1674 			}
1675 		}
1676 	}
1677 	for (size_t i = 0; i < size; i++) {
1678 		__smr_shash_bucket_destroy(&array[i]);
1679 	}
1680 
1681 	thread_call_free(smrh->smrsh_callout);
1682 	counter_free(&smrh->smrsh_count);
1683 	smr_hash_free_array((struct smrq_slist_head *)array, size);
1684 	bzero(smrh, sizeof(*smrh));
1685 }
1686 
1687 
1688 #pragma mark misc
1689 
1690 void
1691 __smr_linkage_invalid(__smrq_link_t *link)
1692 {
1693 	struct smrq_link *elem = __container_of(link, struct smrq_link, next);
1694 	struct smrq_link *next = smr_serialized_load(&elem->next);
1695 
1696 	panic("Invalid queue linkage: elt:%p next:%p next->prev:%p",
1697 	    elem, next, __container_of(next->prev, struct smrq_link, next));
1698 }
1699 
1700 void
1701 __smr_stail_invalid(__smrq_slink_t *link, __smrq_slink_t *last)
1702 {
1703 	struct smrq_slink *elem = __container_of(link, struct smrq_slink, next);
1704 	struct smrq_slink *next = smr_serialized_load(&elem->next);
1705 
1706 	if (next) {
1707 		panic("Invalid queue tail (element past end): elt:%p elt->next:%p",
1708 		    elem, next);
1709 	} else {
1710 		panic("Invalid queue tail (early end): elt:%p tail:%p",
1711 		    elem, __container_of(last, struct smrq_slink, next));
1712 	}
1713 }
1714 
1715 void
1716 __smr_tail_invalid(__smrq_link_t *link, __smrq_link_t *last)
1717 {
1718 	struct smrq_link *elem = __container_of(link, struct smrq_link, next);
1719 	struct smrq_link *next = smr_serialized_load(&elem->next);
1720 
1721 	if (next) {
1722 		panic("Invalid queue tail (element past end): elt:%p elt->next:%p",
1723 		    elem, next);
1724 	} else {
1725 		panic("Invalid queue tail (early end): elt:%p tail:%p",
1726 		    elem, __container_of(last, struct smrq_link, next));
1727 	}
1728 }
1729