xref: /xnu-10063.101.15/osfmk/kern/smr.c (revision 94d3b452840153a99b38a3a9659680b2a006908e)
1 /*
2  * Copyright (c) 2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <kern/locks_internal.h>
30 #include <kern/cpu_data.h>
31 #include <kern/machine.h>
32 #include <kern/mpsc_queue.h>
33 #include <kern/percpu.h>
34 #include <kern/sched.h>
35 #include <kern/smr.h>
36 #include <kern/smr_hash.h>
37 #include <kern/thread.h>
38 #include <kern/zalloc.h>
39 #include <machine/commpage.h>
40 #include <os/hash.h>
41 
42 
43 #pragma mark - SMR domains
44 
45 /*
46  * This SMR scheme is directly FreeBSD's "Global Unbounded Sequences".
47  *
48  * Major differences are:
49  *
50  * - only eager clocks are implemented (no lazy, no implicit)
51  *
52  *
53  * SMR clocks have 3 state machines interacting at any given time:
54  *
55  * 1. reader critical sections
56  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~
57  *
58  * Each CPU can disable preemption and do this sequence:
59  *
60  *     CPU::c_rd_seq = GLOBAL::c_wr_seq;
61  *
62  *     < unfortunate place to receive a long IRQ >                      [I]
63  *
64  *     os_atomic_thread_fence(seq_cst);                                 [R1]
65  *
66  *     {
67  *         // critical section
68  *     }
69  *
70  *     os_atomic_store(&CPU::c_rd_seq, INVALID, release);               [R2]
71  *
72  *
73  *
74  * 2. writer sequence advances
75  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~
76  *
77  * Each writer can increment the global write sequence
78  * at any given time:
79  *
80  *    os_atomic_add(&GLOBAL::c_wr_seq, SMR_SEQ_INC, release);           [W]
81  *
82  *
83  *
84  * 3. synchronization sequence: poll/wait/scan
85  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
86  *
87  * This state machine synchronizes with the other two in order to decide
88  * if a given "goal" is in the past. Only the cases when the call
89  * is successful is interresting for barrier purposes, and we will focus
90  * on cases that do not take an early return for failures.
91  *
92  * a. __smr_poll:
93  *
94  *     rd_seq = os_atomic_load(&GLOBAL::c_rd_seq, acquire);             [S1]
95  *     if (goal < rd_seq) SUCCESS.
96  *     wr_seq = os_atomic_load(&GLOBAL::c_rd_seq, relaxed);
97  *
98  * b. __smr_scan
99  *
100  *     os_atomic_thread_fence(seq_cst)                                  [S2]
101  *
102  *     observe the minimum CPU::c_rd_seq "min_rd_seq"
103  *     value possible or rw_seq if no CPU was in a critical section.
104  *     (possibly spinning until it satisfies "goal")
105  *
106  * c. __smr_rd_advance
107  *
108  *     cur_rd_seq = load_exclusive(&GLOBAL::c_rd_seq);
109  *     os_atomic_thread_fence(seq_cst);                                 [S3]
110  *     if (min_rd_seq > cur_rd_seq) {
111  *         store_exlusive(&GLOBAL::c_rd_seq, min_rd_seq);
112  *     }
113  *
114  *
115  * One sentence summary
116  * ~~~~~~~~~~~~~~~~~~~~
117  *
118  * A simplistic one-sentence summary of the algorithm is that __smr_scan()
119  * works really hard to insert itself in the timeline of write sequences and
120  * observe a reasonnable bound for first safe-to-reclaim sequence, and
121  * issues [S3] to sequence everything around "c_rd_seq" (via [S3] -> [S1]):
122  *
123  *              GLOBAL::c_rd_seq                GLOBAL::c_wr_seq
124  *                             v                v
125  *       ──────────────────────┬────────────────┬─────────────────────
126  *       ... safe to reclaim   │    deferred    │   future         ...
127  *       ──────────────────────┴────────────────┴─────────────────────
128  *
129  *
130  * Detailed explanation
131  * ~~~~~~~~~~~~~~~~~~~~
132  *
133  * [W] -> [R1] establishes a "happens before" relationship between a given
134  * writer and this critical section. The loaded GLOBAL::c_wr_seq might
135  * however be stale with respect to the one [R1] really synchronizes with
136  * (see [I] explanation below).
137  *
138  *
139  * [R1] -> [S2] establishes a "happens before" relationship between all the
140  * active critical sections and the scanner.
141  * It lets us compute the oldest possible sequence pinned by an active
142  * critical section.
143  *
144  *
145  * [R2] -> [S3] establishes a "happens before" relationship between all the
146  * inactive critical sections and the scanner.
147  *
148  *
149  * [S3] -> [S1] is the typical expected fastpath: when the caller can decide
150  * that its goal is older than the last update an __smr_rd_advance() did.
151  * Note that [S3] doubles as an "[S1]" when two __smr_scan() race each other
152  * and one of them finishes last but observed a "worse" read sequence.
153  *
154  *
155  * [W], [S3] -> [S1] is the last crucial property: all updates to the global
156  * clock are totally ordered because they update the entire 128bit state
157  * every time with an RMW. This guarantees that __smr_poll() can't load
158  * an `rd_seq` that is younger than the `wr_seq` it loads next.
159  *
160  *
161  * [I] __smr_enter() also can be unfortunately delayed after observing
162  * a given write sequence and right before [R1] at [I].
163  *
164  * However for a read sequence to have move past what __smr_enter() observed,
165  * it means another __smr_scan() didn't observe the store to CPU::c_rd_seq
166  * made by __smr_enter() and thought the section was inactive.
167  *
168  * This can only happen if the scan's [S2] was issued before the delayed
169  * __smr_enter() [R1] (during the [I] window).
170  *
171  * As a consequence the outcome of that scan can be accepted as the "real"
172  * write sequence __smr_enter() should have observed.
173  *
174  *
175  * Litmus tests
176  * ~~~~~~~~~~~~
177  *
178  * This is the proof of [W] -> [R1] -> [S2] being established properly:
179  * - P0 sets a global and calls smr_synchronize()
180  * - P1 does smr_enter() and loads the global
181  *
182  *     AArch64 MP
183  *     {
184  *         global = 0;
185  *         wr_seq = 123;
186  *         p1_rd_seq = 0;
187  *
188  *         0:x0 = global; 0:x1 = wr_seq; 0:x2 = p1_rd_seq;
189  *         1:x0 = global; 1:x1 = wr_seq; 1:x2 = p1_rd_seq;
190  *     }
191  *      P0                     | P1                         ;
192  *      MOV      X8, #2        | LDR        X8, [X1]        ;
193  *      STR      X8, [X0]      | STR        X8, [X2]        ;
194  *      LDADDL   X8, X9, [X1]  | DMB        SY              ;
195  *      DMB      SY            | LDR        X10, [X0]       ;
196  *      LDR      X10, [X2]     |                            ;
197  *     exists (0:X10 = 0 /\ 1:X8 = 123 /\ 1:X10 = 0)
198  *
199  *
200  * This is the proof that deferred advances are also correct:
201  * - P0 sets a global and does a smr_deferred_advance()
202  * - P1 does an smr_synchronize() and reads the global
203  *
204  *     AArch64 MP
205  *     {
206  *         global = 0;
207  *         wr_seq = 123;
208  *
209  *         0:x0 = global; 0:x1 = wr_seq; 0:x2 = 2;
210  *         1:x0 = global; 1:x1 = wr_seq; 1:x2 = 2;
211  *     }
212  *      P0                     | P1                         ;
213  *      STR      X2, [X0]      | LDADDL     X2, X9, [X1]    ;
214  *      DMB      SY            | DMB        SY              ;
215  *      LDR      X9, [X1]      | LDR        X10, [X0]       ;
216  *      ADD      X9, X9, X2    |                            ;
217  *     exists (0:X9 = 125 /\ 1:X9 = 123 /\ 1:X10 = 0)
218  *
219  */
220 
221 /*!
222  * @struct smr_worker
223  *
224  * @brief
225  * Structure tracking the per-cpu SMR workers state.
226  *
227  * @discussion
228  * This structure is system wide and global and is used to track
229  * the various active SMR domains at the granularity of a CPU.
230  *
231  * Each structure has an associated thread which is responsible
232  * for the forward progress the @c smr_call() and @c smr_barrier()
233  * interfaces.
234  *
235  * It also tracks all the active, non stalled, sleepable SMR sections.
236  */
237 struct smr_worker {
238 	/*
239 	 * The thread for this worker,
240 	 * and conveniency pointer to the processor it is bound to.
241 	 */
242 	struct thread          *thread;
243 	struct processor       *processor;
244 
245 	/*
246 	 * Thread binding/locking logic:
247 	 *
248 	 * If the worker thread is running on its canonical CPU,
249 	 * then locking to access the various SMR per-cpu data
250 	 * structures it is draining is just preemption disablement.
251 	 *
252 	 * However, if it is currently not bound to its canonical
253 	 * CPU because the CPU has been offlined or de-recommended,
254 	 * then a lock which serializes with the CPU going online
255 	 * again is being used.
256 	 */
257 	struct waitq            waitq;
258 	smr_cpu_reason_t        detach_reason;
259 
260 #if CONFIG_QUIESCE_COUNTER
261 	/*
262 	 * Currently active quiescent generation for this processor,
263 	 * and the last timestamp when a scan of all cores was performed.
264 	 */
265 	smr_seq_t               rd_quiesce_seq;
266 #endif
267 
268 	/*
269 	 * List of all the active sleepable sections that haven't
270 	 * been stalled.
271 	 */
272 	struct smrq_list_head   sect_queue;
273 	struct thread          *sect_waiter;
274 
275 	/*
276 	 * Queue of SMR domains with pending smr_call()
277 	 * callouts to drain.
278 	 *
279 	 * This uses an ageing strategy in order to amortize
280 	 * SMR clock updates:
281 	 *
282 	 * - the "old" queue have domains whose callbacks have
283 	 *   a committed and aged sequence,
284 	 * - the "age" queue have domains whose callbacks have
285 	 *   a commited but fresh sequence and need ageing,
286 	 * - the "cur" queue have domains whose callbacks have
287 	 *   a sequence in the future and need for it to be committed.
288 	 */
289 	struct smr_pcpu        *whead;
290 	struct smr_pcpu       **wold_tail;
291 	struct smr_pcpu       **wage_tail;
292 	struct smr_pcpu       **wcur_tail;
293 	uint64_t                drain_ctime;
294 
295 	/*
296 	 * Queue of smr_barrier() calls in flight,
297 	 * that will be picked up by the worker thread
298 	 * to enqueue as smr_call() entries in their
299 	 * respective per-CPU data structures.
300 	 */
301 	struct mpsc_queue_head  barrier_queue;
302 } __attribute__((aligned(64)));
303 
304 
305 typedef struct smr_pcpu {
306 	/*
307 	 * CPU private cacheline.
308 	 *
309 	 * Nothing else than the CPU this state is made for,
310 	 * ever writes to this cacheline.
311 	 *
312 	 * It holds the epoch activity witness (rd_seq), and
313 	 * the local smr_call() queue, which is structured this way:
314 	 *
315 	 *     head -> n1 -> n2 -> n3 -> n4 -> ... -> ni -> ... -> nN -> NULL
316 	 *                            ^            ^                  ^
317 	 *     qold_tail -------------'            |                  |
318 	 *     qage_tail --------------------------'                  |
319 	 *     qcur_tail ---------------------------------------------'
320 	 *
321 	 * - the "old" queue can be reclaimed once qold_seq is past,
322 	 *   qold_seq is always a commited sequence.
323 	 * - the "age" queue can be reclaimed once qage_seq is past,
324 	 *   qage_seq might not be commited yet.
325 	 * - the "cur" queue has an approximate size of qcur_size bytes,
326 	 *   and a length of qcur_cnt callbacks.
327 	 */
328 
329 	smr_seq_t               c_rd_seq; /* might have SMR_SEQ_SLEEPABLE set */
330 
331 	smr_node_t              qhead;
332 
333 	smr_seq_t               qold_seq;
334 	smr_node_t             *qold_tail;
335 
336 	smr_seq_t               qage_seq;
337 	smr_node_t             *qage_tail;
338 
339 	uint32_t                qcur_size;
340 	uint32_t                qcur_cnt;
341 	smr_node_t             *qcur_tail;
342 
343 	uint8_t                 __cacheline_sep[0];
344 
345 	/*
346 	 * Drain queue.
347 	 *
348 	 * This is used to drive smr_call() via the smr worker threads.
349 	 * If the SMR domain is not using smr_call() or smr_barrier(),
350 	 * this isn't used.
351 	 */
352 	struct smr             *drain_smr;
353 	struct smr_pcpu        *drain_next;
354 	uint16_t                __check_cpu;
355 	uint8_t                 __check_reason;
356 	uint8_t                 __check_list;
357 
358 	/*
359 	 * Stalled queue.
360 	 *
361 	 * Stalled sections are enqueued onto this queue by the scheduler
362 	 * when their thread blocks (see smr_mark_active_trackers_stalled()).
363 	 *
364 	 * If the SMR domain is not sleepable, then this isn't used.
365 	 *
366 	 * This list is protected by a lock.
367 	 *
368 	 * When there are stalled sections, stall_rd_seq contains
369 	 * the oldest active stalled sequence number.
370 	 *
371 	 * When threads want to expedite a stalled section, they set
372 	 * stall_waiter_goal to the sequence number they are waiting
373 	 * for and block via turnstile on the oldest stalled section.
374 	 */
375 	hw_lck_ticket_t         stall_lock;
376 	smr_seq_t               stall_rd_seq;
377 	smr_seq_t               stall_waiter_goal;
378 	struct smrq_tailq_head  stall_queue;
379 	struct turnstile       *stall_ts;
380 } __attribute__((aligned(128))) * smr_pcpu_t;
381 
382 static_assert(offsetof(struct smr_pcpu, __cacheline_sep) == 64);
383 static_assert(sizeof(struct smr_pcpu) == 128);
384 
385 #define CPU_CHECKIN_MIN_INTERVAL_US     5000         /* 5ms */
386 #define CPU_CHECKIN_MIN_INTERVAL_MAX_US USEC_PER_SEC /* 1s */
387 static uint64_t cpu_checkin_min_interval;
388 static uint32_t cpu_checkin_min_interval_us;
389 
390 /*! the amount of memory pending retiring that causes a foreceful flush */
391 #if XNU_TARGET_OS_OSX
392 static TUNABLE(vm_size_t, smr_call_size_cap, "smr_call_size_cap", 256 << 10);
393 static TUNABLE(vm_size_t, smr_call_cnt_cap, "smr_call_cnt_cap", 128);
394 #else
395 static TUNABLE(vm_size_t, smr_call_size_cap, "smr_call_size_cap", 64 << 10);
396 static TUNABLE(vm_size_t, smr_call_cnt_cap, "smr_call_cnt_cap", 32);
397 #endif
398 /* time __smr_wait_for_oncore busy spins before going the expensive route */
399 static TUNABLE(uint32_t, smr_wait_spin_us, "smr_wait_spin_us", 20);
400 
401 static LCK_GRP_DECLARE(smr_lock_grp, "smr");
402 static struct smr_worker PERCPU_DATA(smr_worker);
403 static struct smrq_tailq_head smr_domains = SMRQ_TAILQ_INITIALIZER(smr_domains);
404 
405 SMR_DEFINE_FLAGS(smr_system, "system", SMR_NONE);
406 SMR_DEFINE_FLAGS(smr_system_sleepable, "system (sleepable)", SMR_SLEEPABLE);
407 
408 
409 #pragma mark SMR domains: init & helpers
410 
411 #define SMR_PCPU_NOT_QUEUED     ((struct smr_pcpu *)-1)
412 
413 __attribute__((always_inline, overloadable))
414 static inline smr_pcpu_t
__smr_pcpu(smr_t smr,int cpu)415 __smr_pcpu(smr_t smr, int cpu)
416 {
417 	return &smr->smr_pcpu[cpu];
418 }
419 
420 __attribute__((always_inline, overloadable))
421 static inline smr_pcpu_t
__smr_pcpu(smr_t smr)422 __smr_pcpu(smr_t smr)
423 {
424 	return __smr_pcpu(smr, cpu_number());
425 }
426 
427 static inline bool
__smr_pcpu_queued(smr_pcpu_t pcpu)428 __smr_pcpu_queued(smr_pcpu_t pcpu)
429 {
430 	return pcpu->drain_next != SMR_PCPU_NOT_QUEUED;
431 }
432 
433 static inline void
__smr_pcpu_set_not_queued(smr_pcpu_t pcpu)434 __smr_pcpu_set_not_queued(smr_pcpu_t pcpu)
435 {
436 	pcpu->drain_next = SMR_PCPU_NOT_QUEUED;
437 }
438 
439 static inline void
__smr_pcpu_associate(smr_t smr,smr_pcpu_t pcpu)440 __smr_pcpu_associate(smr_t smr, smr_pcpu_t pcpu)
441 {
442 	zpercpu_foreach_cpu(cpu) {
443 		pcpu[cpu].qold_tail = &pcpu[cpu].qhead;
444 		pcpu[cpu].qage_tail = &pcpu[cpu].qhead;
445 		pcpu[cpu].qcur_tail = &pcpu[cpu].qhead;
446 
447 		pcpu[cpu].drain_smr = smr;
448 		__smr_pcpu_set_not_queued(&pcpu[cpu]);
449 		hw_lck_ticket_init(&pcpu[cpu].stall_lock, &smr_lock_grp);
450 		smrq_init(&pcpu[cpu].stall_queue);
451 	}
452 
453 	os_atomic_store(&smr->smr_pcpu, pcpu, release);
454 }
455 
456 static inline event64_t
__smrw_oncore_event(struct smr_worker * smrw)457 __smrw_oncore_event(struct smr_worker *smrw)
458 {
459 	return CAST_EVENT64_T(&smrw->sect_queue);
460 }
461 
462 static inline event64_t
__smrw_drain_event(struct smr_worker * smrw)463 __smrw_drain_event(struct smr_worker *smrw)
464 {
465 	return CAST_EVENT64_T(&smrw->whead);
466 }
467 
468 static inline processor_t
__smrw_drain_bind_target(struct smr_worker * smrw)469 __smrw_drain_bind_target(struct smr_worker *smrw)
470 {
471 	return smrw->detach_reason ? PROCESSOR_NULL : smrw->processor;
472 }
473 
474 static inline void
__smrw_lock(struct smr_worker * smrw)475 __smrw_lock(struct smr_worker *smrw)
476 {
477 	waitq_lock(&smrw->waitq);
478 }
479 
480 static inline void
__smrw_unlock(struct smr_worker * smrw)481 __smrw_unlock(struct smr_worker *smrw)
482 {
483 	waitq_unlock(&smrw->waitq);
484 }
485 
486 /*!
487  * @function __smrw_wakeup_and_unlock()
488  *
489  * @brief
490  * Wakes up (with binding) the SMR worker.
491  *
492  * @discussion
493  * Wakeup the worker thread and bind it to the proper processor
494  * as a side effect.
495  *
496  * This function must be called with interrupts disabled.
497  */
498 static bool
__smrw_wakeup_and_unlock(struct smr_worker * smrw)499 __smrw_wakeup_and_unlock(struct smr_worker *smrw)
500 {
501 	thread_t thread;
502 
503 	assert(!ml_get_interrupts_enabled());
504 
505 	thread = waitq_wakeup64_identify_locked(&smrw->waitq,
506 	    __smrw_drain_event(smrw), THREAD_AWAKENED, WAITQ_UNLOCK);
507 
508 	if (thread != THREAD_NULL) {
509 		assert(thread == smrw->thread);
510 
511 		waitq_resume_and_bind_identified_thread(&smrw->waitq,
512 		    thread, __smrw_drain_bind_target(smrw),
513 		    THREAD_AWAKENED, WAITQ_WAKEUP_DEFAULT);
514 	}
515 
516 	return thread != THREAD_NULL;
517 }
518 
519 static void
__smr_call_drain(smr_node_t head)520 __smr_call_drain(smr_node_t head)
521 {
522 	smr_node_t node;
523 
524 	while ((node = head) != NULL) {
525 		head = node->smrn_next;
526 		node->smrn_next = NULL;
527 		node->smrn_cb(node);
528 	}
529 }
530 
531 __startup_func
532 void
__smr_domain_init(smr_t smr)533 __smr_domain_init(smr_t smr)
534 {
535 	smr_pcpu_t pcpu;
536 	vm_size_t size;
537 
538 	if (startup_phase < STARTUP_SUB_TUNABLES) {
539 		smr_seq_t *rd_seqp = &smr->smr_early;
540 
541 		/*
542 		 * This is a big cheat, but before the EARLY_BOOT phase,
543 		 * all smr_* APIs that would access past the rd_seq
544 		 * will early return.
545 		 */
546 		pcpu = __container_of(rd_seqp, struct smr_pcpu, c_rd_seq);
547 		smr->smr_pcpu = pcpu - cpu_number();
548 		assert(&__smr_pcpu(smr)->c_rd_seq == &smr->smr_early);
549 	} else {
550 		size = zpercpu_count() * sizeof(struct smr_pcpu);
551 		pcpu = zalloc_permanent(size, ZALIGN(struct smr_pcpu));
552 
553 		__smr_pcpu_associate(smr, pcpu);
554 	}
555 }
556 
557 smr_t
smr_domain_create(smr_flags_t flags,const char * name)558 smr_domain_create(smr_flags_t flags, const char *name)
559 {
560 	smr_pcpu_t pcpu;
561 	smr_t smr;
562 
563 	smr  = kalloc_type(struct smr, Z_WAITOK | Z_ZERO | Z_NOFAIL);
564 	pcpu = kalloc_type(struct smr_pcpu, zpercpu_count(),
565 	    Z_WAITOK | Z_ZERO | Z_NOFAIL);
566 
567 	smr->smr_clock.s_rd_seq = SMR_SEQ_INIT;
568 	smr->smr_clock.s_wr_seq = SMR_SEQ_INIT;
569 	smr->smr_flags = flags;
570 	static_assert(sizeof(struct smr) ==
571 	    offsetof(struct smr, smr_name) + SMR_NAME_MAX);
572 	strlcpy(smr->smr_name, name, sizeof(smr->smr_name));
573 
574 	__smr_pcpu_associate(smr, pcpu);
575 
576 	return smr;
577 }
578 
579 void
smr_domain_free(smr_t smr)580 smr_domain_free(smr_t smr)
581 {
582 	smr_barrier(smr);
583 
584 	zpercpu_foreach_cpu(cpu) {
585 		smr_pcpu_t pcpu = __smr_pcpu(smr, cpu);
586 
587 		assert(pcpu->qhead == NULL);
588 		hw_lck_ticket_destroy(&pcpu->stall_lock, &smr_lock_grp);
589 	}
590 
591 	kfree_type(struct smr_pcpu, zpercpu_count(), smr->smr_pcpu);
592 	kfree_type(struct smr, smr);
593 }
594 
595 
596 #pragma mark SMR domains: enter / leave
597 
598 bool
smr_entered(smr_t smr)599 smr_entered(smr_t smr)
600 {
601 	thread_t self = current_thread();
602 	smr_tracker_t t;
603 
604 	if (lock_preemption_level_for_thread(self) &&
605 	    __smr_pcpu(smr)->c_rd_seq != SMR_SEQ_INVALID) {
606 		return true;
607 	}
608 
609 	if (smr->smr_flags & SMR_SLEEPABLE) {
610 		smrq_serialized_foreach(t, &self->smr_stack, smrt_stack) {
611 			if (t->smrt_domain == smr) {
612 				return true;
613 			}
614 		}
615 	}
616 
617 	return false;
618 }
619 
620 __attribute__((always_inline))
621 bool
smr_entered_cpu_noblock(smr_t smr,int cpu)622 smr_entered_cpu_noblock(smr_t smr, int cpu)
623 {
624 	assert((smr->smr_flags & SMR_SLEEPABLE) == 0);
625 	return __smr_pcpu(smr, cpu)->c_rd_seq != SMR_SEQ_INVALID;
626 }
627 
628 __attribute__((always_inline))
629 static smr_seq_t
__smr_enter(smr_t smr,smr_pcpu_t pcpu,smr_seq_t sleepable)630 __smr_enter(smr_t smr, smr_pcpu_t pcpu, smr_seq_t sleepable)
631 {
632 	smr_seq_t  s_wr_seq;
633 	smr_seq_t  old_seq;
634 
635 	assert(!ml_at_interrupt_context());
636 
637 	/*
638 	 * It is possible to have a long delay between loading the s_wr_seq
639 	 * and storing it to the percpu copy of it.
640 	 *
641 	 * It is unlikely but possible by that time the s_rd_seq advances
642 	 * ahead of what we will store. This however is still safe
643 	 * and handled in __smr_scan().
644 	 *
645 	 * On Intel, to achieve the ordering we want, we could use a store
646 	 * followed by an mfence, or any RMW (XCHG, XADD, CMPXCHG, ...).
647 	 * XADD is just the fastest instruction of the alternatives,
648 	 * but it will only ever add to '0'.
649 	 */
650 	s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
651 #if __x86_64__
652 	/* [R1] */
653 	old_seq = os_atomic_add_orig(&pcpu->c_rd_seq, s_wr_seq | sleepable, seq_cst);
654 #else
655 	old_seq = pcpu->c_rd_seq;
656 	os_atomic_store(&pcpu->c_rd_seq, s_wr_seq | sleepable, relaxed);
657 	os_atomic_thread_fence(seq_cst); /* [R1] */
658 #endif
659 	assert(old_seq == SMR_SEQ_INVALID);
660 
661 	return s_wr_seq;
662 }
663 
664 __attribute__((always_inline))
665 static void
__smr_leave(smr_pcpu_t pcpu)666 __smr_leave(smr_pcpu_t pcpu)
667 {
668 	assert(!ml_at_interrupt_context());
669 	/* [R2] */
670 	os_atomic_store(&pcpu->c_rd_seq, SMR_SEQ_INVALID, release);
671 }
672 
673 __attribute__((always_inline))
674 void
smr_enter(smr_t smr)675 smr_enter(smr_t smr)
676 {
677 	disable_preemption();
678 	__smr_enter(smr, __smr_pcpu(smr), 0);
679 }
680 
681 __attribute__((always_inline))
682 void
smr_leave(smr_t smr)683 smr_leave(smr_t smr)
684 {
685 	__smr_leave(__smr_pcpu(smr));
686 	enable_preemption();
687 }
688 
689 void
smr_enter_sleepable(smr_t smr,smr_tracker_t tracker)690 smr_enter_sleepable(smr_t smr, smr_tracker_t tracker)
691 {
692 	thread_t self = current_thread();
693 	struct smr_worker *smrw;
694 	smr_pcpu_t pcpu;
695 
696 	assert(smr->smr_flags & SMR_SLEEPABLE);
697 
698 	lock_disable_preemption_for_thread(self);
699 	lck_rw_lock_count_inc(self, smr);
700 
701 	pcpu = __smr_pcpu(smr);
702 	smrw = PERCPU_GET(smr_worker);
703 
704 	tracker->smrt_domain = smr;
705 	tracker->smrt_seq    = __smr_enter(smr, pcpu, SMR_SEQ_SLEEPABLE);
706 	smrq_serialized_insert_head_relaxed(&smrw->sect_queue, &tracker->smrt_link);
707 	smrq_serialized_insert_head_relaxed(&self->smr_stack, &tracker->smrt_stack);
708 	tracker->smrt_ctid   = 0;
709 	tracker->smrt_cpu    = -1;
710 
711 	lock_enable_preemption();
712 }
713 
714 __attribute__((always_inline))
715 static void
__smr_wake_oncore_sleepers(struct smr_worker * smrw)716 __smr_wake_oncore_sleepers(struct smr_worker *smrw)
717 {
718 	/*
719 	 * prevent reordering of making the list empty and checking for waiters.
720 	 */
721 	if (__improbable(os_atomic_load(&smrw->sect_waiter, compiler_acq_rel))) {
722 		if (smrq_empty(&smrw->sect_queue)) {
723 			os_atomic_store(&smrw->sect_waiter, NULL, relaxed);
724 			waitq_wakeup64_all(&smrw->waitq,
725 			    __smrw_oncore_event(smrw), THREAD_AWAKENED,
726 			    WAITQ_WAKEUP_DEFAULT);
727 		}
728 	}
729 }
730 
731 void
smr_ack_ipi(void)732 smr_ack_ipi(void)
733 {
734 	/*
735 	 * see __smr_wait_for_oncore(): if at the time of the IPI ack
736 	 * the list is empty and there is still a waiter, wake it up.
737 	 *
738 	 * If the queue is not empty, then when smr_leave_sleepable()
739 	 * runs it can't possibly fail to observe smrw->sect_waiter
740 	 * being non NULL and will do the wakeup then.
741 	 */
742 	__smr_wake_oncore_sleepers(PERCPU_GET(smr_worker));
743 }
744 
745 void
smr_mark_active_trackers_stalled(thread_t self)746 smr_mark_active_trackers_stalled(thread_t self)
747 {
748 	struct smr_worker *smrw = PERCPU_GET(smr_worker);
749 	int cpu = cpu_number();
750 	smr_tracker_t t;
751 
752 	/* called at splsched */
753 
754 	smrq_serialized_foreach_safe(t, &smrw->sect_queue, smrt_link) {
755 		smr_t smr = t->smrt_domain;
756 		smr_pcpu_t pcpu;
757 
758 		pcpu = __smr_pcpu(smr, cpu);
759 
760 		t->smrt_ctid = self->ctid;
761 		t->smrt_cpu  = cpu;
762 
763 		hw_lck_ticket_lock_nopreempt(&pcpu->stall_lock, &smr_lock_grp);
764 
765 		/*
766 		 * Transfer the section to the stalled queue,
767 		 * and _then_ leave the regular one.
768 		 *
769 		 * A store-release is sufficient to order these stores,
770 		 * and guarantee that __smr_scan() can't fail to observe
771 		 * both the @c rd_seq and @c stall_rd_seq during a transfer
772 		 * of a stalled section that was active when it started.
773 		 */
774 		if (smrq_empty(&pcpu->stall_queue)) {
775 			os_atomic_store(&pcpu->stall_rd_seq, t->smrt_seq, relaxed);
776 		}
777 		os_atomic_store(&pcpu->c_rd_seq, SMR_SEQ_INVALID, release);
778 
779 		smrq_serialized_insert_tail_relaxed(&pcpu->stall_queue, &t->smrt_link);
780 
781 		hw_lck_ticket_unlock_nopreempt(&pcpu->stall_lock);
782 	}
783 
784 	smrq_init(&smrw->sect_queue);
785 
786 	__smr_wake_oncore_sleepers(smrw);
787 }
788 
789 
790 __attribute__((noinline))
791 static void
__smr_leave_stalled(smr_t smr,smr_tracker_t tracker,thread_t self)792 __smr_leave_stalled(smr_t smr, smr_tracker_t tracker, thread_t self)
793 {
794 	smr_seq_t new_stall_seq = SMR_SEQ_INVALID;
795 	smr_tracker_t first = NULL;
796 	smr_pcpu_t pcpu;
797 	bool progress;
798 
799 	pcpu = __smr_pcpu(smr, tracker->smrt_cpu);
800 
801 	hw_lck_ticket_lock_nopreempt(&pcpu->stall_lock, &smr_lock_grp);
802 
803 	progress = smrq_serialized_first(&pcpu->stall_queue,
804 	    struct smr_tracker, smrt_link) == tracker;
805 
806 	smrq_serialized_remove(&self->smr_stack, &tracker->smrt_stack);
807 	smrq_serialized_remove(&pcpu->stall_queue, &tracker->smrt_link);
808 	bzero(tracker, sizeof(*tracker));
809 
810 	if (progress) {
811 		if (!smrq_empty(&pcpu->stall_queue)) {
812 			first = smrq_serialized_first(&pcpu->stall_queue,
813 			    struct smr_tracker, smrt_link);
814 			new_stall_seq = first->smrt_seq;
815 			__builtin_assume(new_stall_seq != SMR_SEQ_INVALID);
816 			assert(SMR_SEQ_CMP(pcpu->stall_rd_seq, <=, new_stall_seq));
817 		}
818 
819 		os_atomic_store(&pcpu->stall_rd_seq, new_stall_seq, release);
820 
821 		progress = pcpu->stall_waiter_goal != SMR_SEQ_INVALID;
822 	}
823 
824 	if (progress) {
825 		struct turnstile *ts;
826 
827 		ts = turnstile_prepare((uintptr_t)pcpu, &pcpu->stall_ts,
828 		    TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
829 
830 		if (new_stall_seq == SMR_SEQ_INVALID ||
831 		    SMR_SEQ_CMP(pcpu->stall_waiter_goal, <=, new_stall_seq)) {
832 			pcpu->stall_waiter_goal = SMR_SEQ_INVALID;
833 			waitq_wakeup64_all(&ts->ts_waitq, CAST_EVENT64_T(pcpu),
834 			    THREAD_AWAKENED, WAITQ_UPDATE_INHERITOR);
835 		} else {
836 			turnstile_update_inheritor(ts, ctid_get_thread(first->smrt_ctid),
837 			    TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD);
838 		}
839 
840 		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
841 
842 		turnstile_complete((uintptr_t)pcpu, &pcpu->stall_ts,
843 		    NULL, TURNSTILE_KERNEL_MUTEX);
844 	}
845 
846 	/* reenables preemption disabled in smr_leave_sleepable() */
847 	hw_lck_ticket_unlock(&pcpu->stall_lock);
848 
849 	turnstile_cleanup();
850 }
851 
852 void
smr_leave_sleepable(smr_t smr,smr_tracker_t tracker)853 smr_leave_sleepable(smr_t smr, smr_tracker_t tracker)
854 {
855 	struct smr_worker *smrw;
856 	thread_t self = current_thread();
857 
858 	assert(tracker->smrt_seq != SMR_SEQ_INVALID);
859 	assert(smr->smr_flags & SMR_SLEEPABLE);
860 
861 	lock_disable_preemption_for_thread(self);
862 
863 	lck_rw_lock_count_dec(self, smr);
864 
865 	if (__improbable(tracker->smrt_cpu != -1)) {
866 		return __smr_leave_stalled(smr, tracker, self);
867 	}
868 
869 	__smr_leave(__smr_pcpu(smr));
870 
871 	smrw = PERCPU_GET(smr_worker);
872 	smrq_serialized_remove(&self->smr_stack, &tracker->smrt_stack);
873 	smrq_serialized_remove(&smrw->sect_queue, &tracker->smrt_link);
874 	bzero(tracker, sizeof(*tracker));
875 
876 	__smr_wake_oncore_sleepers(PERCPU_GET(smr_worker));
877 
878 	lock_enable_preemption();
879 }
880 
881 
882 #pragma mark SMR domains: advance, wait, poll, synchronize
883 
884 static inline smr_seq_t
__smr_wr_advance(smr_t smr)885 __smr_wr_advance(smr_t smr)
886 {
887 	/* [W] */
888 	return os_atomic_add(&smr->smr_clock.s_wr_seq, SMR_SEQ_INC, release);
889 }
890 
891 static inline bool
__smr_rd_advance(smr_t smr,smr_seq_t goal,smr_seq_t rd_seq)892 __smr_rd_advance(smr_t smr, smr_seq_t goal, smr_seq_t rd_seq)
893 {
894 	smr_seq_t o_seq;
895 
896 	os_atomic_thread_fence(seq_cst); /* [S3] */
897 
898 	os_atomic_rmw_loop(&smr->smr_clock.s_rd_seq, o_seq, rd_seq, relaxed, {
899 		if (SMR_SEQ_CMP(rd_seq, <=, o_seq)) {
900 		        rd_seq = o_seq;
901 		        os_atomic_rmw_loop_give_up(break);
902 		}
903 	});
904 
905 	return SMR_SEQ_CMP(goal, <=, rd_seq);
906 }
907 
908 __attribute__((noinline))
909 static smr_seq_t
__smr_wait_for_stalled(smr_pcpu_t pcpu,smr_seq_t goal)910 __smr_wait_for_stalled(smr_pcpu_t pcpu, smr_seq_t goal)
911 {
912 	struct turnstile *ts;
913 	thread_t inheritor;
914 	wait_result_t wr;
915 	smr_seq_t stall_rd_seq;
916 
917 	hw_lck_ticket_lock(&pcpu->stall_lock, &smr_lock_grp);
918 
919 	stall_rd_seq = pcpu->stall_rd_seq;
920 	if (stall_rd_seq == SMR_SEQ_INVALID ||
921 	    SMR_SEQ_CMP(goal, <=, stall_rd_seq)) {
922 		hw_lck_ticket_unlock(&pcpu->stall_lock);
923 		return stall_rd_seq;
924 	}
925 
926 	if (pcpu->stall_waiter_goal == SMR_SEQ_INVALID ||
927 	    SMR_SEQ_CMP(goal, <, pcpu->stall_waiter_goal)) {
928 		pcpu->stall_waiter_goal = goal;
929 	}
930 
931 	inheritor = ctid_get_thread(smrq_serialized_first(&pcpu->stall_queue,
932 	    struct smr_tracker, smrt_link)->smrt_ctid);
933 
934 	ts = turnstile_prepare((uintptr_t)pcpu, &pcpu->stall_ts,
935 	    TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
936 
937 	turnstile_update_inheritor(ts, inheritor,
938 	    TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD);
939 	wr = waitq_assert_wait64(&ts->ts_waitq, CAST_EVENT64_T(pcpu),
940 	    THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
941 	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
942 
943 	if (wr == THREAD_WAITING) {
944 		hw_lck_ticket_unlock(&pcpu->stall_lock);
945 		thread_block(THREAD_CONTINUE_NULL);
946 		hw_lck_ticket_lock(&pcpu->stall_lock, &smr_lock_grp);
947 	}
948 
949 	turnstile_complete((uintptr_t)pcpu, &pcpu->stall_ts,
950 	    NULL, TURNSTILE_KERNEL_MUTEX);
951 
952 	stall_rd_seq = pcpu->stall_rd_seq;
953 	hw_lck_ticket_unlock(&pcpu->stall_lock);
954 
955 	turnstile_cleanup();
956 
957 	return stall_rd_seq;
958 }
959 
960 __attribute__((noinline))
961 static smr_seq_t
__smr_wait_for_oncore(smr_pcpu_t pcpu,smr_seq_t goal,uint32_t cpu)962 __smr_wait_for_oncore(smr_pcpu_t pcpu, smr_seq_t goal, uint32_t cpu)
963 {
964 	thread_t self = current_thread();
965 	struct smr_worker *smrw;
966 	uint64_t deadline = 0;
967 	vm_offset_t base;
968 	smr_seq_t rd_seq;
969 
970 	/*
971 	 * We are waiting for a currently active SMR section.
972 	 * Start spin-waiting for it for a bit.
973 	 */
974 	for (;;) {
975 		if (hw_spin_wait_until(&pcpu->c_rd_seq, rd_seq,
976 		    rd_seq == SMR_SEQ_INVALID || SMR_SEQ_CMP(goal, <=, rd_seq))) {
977 			return rd_seq;
978 		}
979 
980 		if (deadline == 0) {
981 			clock_interval_to_deadline(smr_wait_spin_us,
982 			    NSEC_PER_USEC, &deadline);
983 		} else if (mach_absolute_time() > deadline) {
984 			break;
985 		}
986 	}
987 
988 	/*
989 	 * This section is being active for a while,
990 	 * we need to move to a more passive way of waiting.
991 	 *
992 	 * We post ourselves on the remote processor tracking head,
993 	 * to denote we need a thread_wakeup() when the tracker head clears,
994 	 * then send an IPI which will have 2 possible outcomes:
995 	 *
996 	 * 1. when smr_ack_ipi() runs, the queue is already cleared,
997 	 *    and we will be woken up immediately.
998 	 *
999 	 * 2. when smr_ack_ipi() runs, the queue isn't cleared,
1000 	 *    then it does nothing, but there is a guarantee that
1001 	 *    when the queue clears, the remote core will observe
1002 	 *    that there is a waiter, and thread_wakeup() will be
1003 	 *    called then.
1004 	 *
1005 	 * In order to avoid to actually wait, we do spin some more,
1006 	 * hoping for the remote sequence to change.
1007 	 */
1008 	base = other_percpu_base(cpu);
1009 	smrw = PERCPU_GET_WITH_BASE(base, smr_worker);
1010 
1011 	waitq_assert_wait64(&smrw->waitq, __smrw_oncore_event(smrw),
1012 	    THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
1013 
1014 	if (lock_cmpxchg(&smrw->sect_waiter, NULL, self, relaxed)) {
1015 		/*
1016 		 * only really send the IPI if we're first,
1017 		 * to avoid IPI storms in case of a pile-up
1018 		 * of smr_synchronize() calls stalled on the same guy.
1019 		 */
1020 		cause_ast_check(PERCPU_GET_WITH_BASE(base, processor));
1021 	}
1022 
1023 	if (hw_spin_wait_until(&pcpu->c_rd_seq, rd_seq,
1024 	    rd_seq == SMR_SEQ_INVALID || SMR_SEQ_CMP(goal, <=, rd_seq))) {
1025 		clear_wait(self, THREAD_AWAKENED);
1026 		return rd_seq;
1027 	}
1028 
1029 	thread_block(THREAD_CONTINUE_NULL);
1030 
1031 	return os_atomic_load(&pcpu->c_rd_seq, relaxed);
1032 }
1033 
1034 __attribute__((noinline))
1035 static bool
__smr_scan(smr_t smr,smr_seq_t goal,smr_clock_t clk,bool wait)1036 __smr_scan(smr_t smr, smr_seq_t goal, smr_clock_t clk, bool wait)
1037 {
1038 	smr_delta_t delta;
1039 	smr_seq_t rd_seq;
1040 
1041 	if (__improbable(startup_phase < STARTUP_SUB_EARLY_BOOT)) {
1042 		return true;
1043 	}
1044 
1045 	/*
1046 	 * Validate that the goal is sane.
1047 	 */
1048 	delta = SMR_SEQ_DELTA(goal, clk.s_wr_seq);
1049 	if (delta == SMR_SEQ_INC) {
1050 		/*
1051 		 * This SMR clock uses deferred advance,
1052 		 * and the goal is one inc in the future.
1053 		 *
1054 		 * If we can wait, then commit the sequence number,
1055 		 * else we can't possibly succeed.
1056 		 *
1057 		 * Doing a commit here rather than an advance
1058 		 * gives the hardware a chance to abort the
1059 		 * transaction early in case of high contention
1060 		 * compared to an unconditional advance.
1061 		 */
1062 		if (!wait) {
1063 			return false;
1064 		}
1065 		if (lock_cmpxchgv(&smr->smr_clock.s_wr_seq,
1066 		    clk.s_wr_seq, goal, &clk.s_wr_seq, relaxed)) {
1067 			clk.s_wr_seq = goal;
1068 		}
1069 	} else if (delta > 0) {
1070 		/*
1071 		 * Invalid goal: the caller held on it for too long,
1072 		 * and integers wrapped.
1073 		 */
1074 		return true;
1075 	}
1076 
1077 	os_atomic_thread_fence(seq_cst); /* [S2] */
1078 
1079 	/*
1080 	 * The read sequence can be no larger than the write sequence
1081 	 * at the start of the poll.
1082 	 *
1083 	 * We know that on entry:
1084 	 *
1085 	 *     s_rd_seq < goal <= s_wr_seq
1086 	 *
1087 	 * The correctness of this algorithm relies on the fact that
1088 	 * the SMR domain [s_rd_seq, s_wr_seq) can't possibly move
1089 	 * by more than roughly (ULONG_MAX / 2) while __smr_scan()
1090 	 * is running, otherwise the "rd_seq" we try to scan for
1091 	 * might appear larger than s_rd_seq spuriously and we'd
1092 	 * __smr_rd_advance() incorrectly.
1093 	 *
1094 	 * This is guaranteed by the fact that this represents
1095 	 * advancing 2^62 times. At one advance every nanosecond,
1096 	 * it takes more than a century, which makes it possible
1097 	 * to call smr_wait() or smr_poll() with preemption enabled.
1098 	 */
1099 	rd_seq = clk.s_wr_seq;
1100 
1101 	zpercpu_foreach_cpu(cpu) {
1102 		smr_pcpu_t pcpu = __smr_pcpu(smr, cpu);
1103 		smr_seq_t seq   = os_atomic_load(&pcpu->c_rd_seq, relaxed);
1104 
1105 		while (seq != SMR_SEQ_INVALID) {
1106 			/*
1107 			 * Resolve the race documented in __smr_enter().
1108 			 *
1109 			 * The CPU has loaded a stale s_wr_seq, and s_rd_seq
1110 			 * moved past this stale value.
1111 			 *
1112 			 * Its critical section is however properly serialized,
1113 			 * but we can't know what the "correct" s_wr_seq it
1114 			 * could have observed was. We have to assume `s_rd_seq`
1115 			 * to prevent it from advancing.
1116 			 */
1117 			if (SMR_SEQ_CMP(seq, <, clk.s_rd_seq)) {
1118 				seq = clk.s_rd_seq;
1119 			}
1120 
1121 			if (!wait || SMR_SEQ_CMP(goal, <=, seq)) {
1122 				seq &= ~SMR_SEQ_SLEEPABLE;
1123 				break;
1124 			}
1125 
1126 			if (seq & SMR_SEQ_SLEEPABLE) {
1127 				seq = __smr_wait_for_oncore(pcpu, goal, cpu);
1128 			} else {
1129 				disable_preemption();
1130 				seq = hw_wait_while_equals_long(&pcpu->c_rd_seq, seq);
1131 				enable_preemption();
1132 			}
1133 		}
1134 
1135 		if (seq != SMR_SEQ_INVALID && SMR_SEQ_CMP(seq, <, rd_seq)) {
1136 			rd_seq = seq;
1137 		}
1138 	}
1139 
1140 	if (smr->smr_flags & SMR_SLEEPABLE) {
1141 		/*
1142 		 * Order observation of stalled sections,
1143 		 * see smr_mark_active_trackers_stalled().
1144 		 */
1145 		os_atomic_thread_fence(seq_cst);
1146 
1147 		zpercpu_foreach_cpu(cpu) {
1148 			smr_pcpu_t pcpu = __smr_pcpu(smr, cpu);
1149 			smr_seq_t  seq  = os_atomic_load(&pcpu->stall_rd_seq, relaxed);
1150 
1151 			while (seq != SMR_SEQ_INVALID) {
1152 				if (SMR_SEQ_CMP(seq, <, clk.s_rd_seq)) {
1153 					seq = clk.s_rd_seq;
1154 				}
1155 
1156 				if (!wait || SMR_SEQ_CMP(goal, <=, seq)) {
1157 					seq &= ~SMR_SEQ_SLEEPABLE;
1158 					break;
1159 				}
1160 
1161 				seq = __smr_wait_for_stalled(pcpu, goal);
1162 			}
1163 
1164 			if (seq != SMR_SEQ_INVALID && SMR_SEQ_CMP(seq, <, rd_seq)) {
1165 				rd_seq = seq;
1166 			}
1167 		}
1168 	}
1169 
1170 	/*
1171 	 * Advance the rd_seq as long as we observed a more recent value.
1172 	 */
1173 	return __smr_rd_advance(smr, goal, rd_seq);
1174 }
1175 
1176 static inline bool
__smr_poll(smr_t smr,smr_seq_t goal,bool wait)1177 __smr_poll(smr_t smr, smr_seq_t goal, bool wait)
1178 {
1179 	smr_clock_t clk;
1180 
1181 	/*
1182 	 * Load both the s_rd_seq and s_wr_seq in the right order so that we
1183 	 * can't observe a s_rd_seq older than s_wr_seq.
1184 	 */
1185 
1186 	/* [S1] */
1187 	clk.s_rd_seq = os_atomic_load(&smr->smr_clock.s_rd_seq, acquire);
1188 
1189 	/*
1190 	 * We expect this to be typical: the goal has already been observed.
1191 	 */
1192 	if (__probable(SMR_SEQ_CMP(goal, <=, clk.s_rd_seq))) {
1193 		return true;
1194 	}
1195 
1196 	clk.s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
1197 
1198 	return __smr_scan(smr, goal, clk, wait);
1199 }
1200 
1201 smr_seq_t
smr_advance(smr_t smr)1202 smr_advance(smr_t smr)
1203 {
1204 	smr_clock_t clk;
1205 
1206 	assert(!smr_entered(smr));
1207 
1208 	/*
1209 	 * We assume that there will at least be a successful __smr_poll
1210 	 * call every 2^60 calls to smr_advance() or so, so we do not need
1211 	 * to check if [s_rd_seq, s_wr_seq) is growing too wide.
1212 	 */
1213 	static_assert(sizeof(clk.s_wr_seq) == 8);
1214 	return __smr_wr_advance(smr);
1215 }
1216 
1217 smr_seq_t
smr_deferred_advance(smr_t smr)1218 smr_deferred_advance(smr_t smr)
1219 {
1220 	os_atomic_thread_fence(seq_cst);
1221 	return SMR_SEQ_INC + os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
1222 }
1223 
1224 void
smr_deferred_advance_commit(smr_t smr,smr_seq_t seq)1225 smr_deferred_advance_commit(smr_t smr, smr_seq_t seq)
1226 {
1227 	/*
1228 	 * no barrier needed: smr_deferred_advance() had one already.
1229 	 * no failure handling: it means someone updated the clock already!
1230 	 * lock_cmpxchg: so that we pre-test for architectures needing it.
1231 	 */
1232 	assert(seq != SMR_SEQ_INVALID);
1233 	lock_cmpxchg(&smr->smr_clock.s_wr_seq, seq - SMR_SEQ_INC, seq, relaxed);
1234 }
1235 
1236 bool
smr_poll(smr_t smr,smr_seq_t goal)1237 smr_poll(smr_t smr, smr_seq_t goal)
1238 {
1239 	assert(!smr_entered(smr) && goal != SMR_SEQ_INVALID);
1240 	return __smr_poll(smr, goal, false);
1241 }
1242 
1243 void
smr_wait(smr_t smr,smr_seq_t goal)1244 smr_wait(smr_t smr, smr_seq_t goal)
1245 {
1246 	assert(!smr_entered(smr) && goal != SMR_SEQ_INVALID);
1247 	if (smr->smr_flags & SMR_SLEEPABLE) {
1248 		assert(get_preemption_level() == 0);
1249 	}
1250 	(void)__smr_poll(smr, goal, true);
1251 }
1252 
1253 void
smr_synchronize(smr_t smr)1254 smr_synchronize(smr_t smr)
1255 {
1256 	smr_clock_t clk;
1257 
1258 	assert(!smr_entered(smr));
1259 	assert(!ml_at_interrupt_context());
1260 	if (smr->smr_flags & SMR_SLEEPABLE) {
1261 		assert(get_preemption_level() == 0);
1262 	}
1263 
1264 	/*
1265 	 * Similar to __smr_poll() but also does a deferred advance which
1266 	 * __smr_scan will commit.
1267 	 */
1268 
1269 	clk.s_rd_seq = os_atomic_load(&smr->smr_clock.s_rd_seq, relaxed);
1270 	os_atomic_thread_fence(seq_cst);
1271 	clk.s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
1272 
1273 	(void)__smr_scan(smr, clk.s_wr_seq + SMR_SEQ_INC, clk, true);
1274 }
1275 
1276 
1277 #pragma mark SMR domains: smr_call & smr_barrier
1278 
1279 /*!
1280  * @struct smr_barrier_ctx
1281  *
1282  * @brief
1283  * Data structure to track the completion of an smr_barrier() call.
1284  */
1285 struct smr_barrier_ctx {
1286 	struct smr             *smrb_domain;
1287 	struct thread          *smrb_waiter;
1288 	uint32_t                smrb_pending;
1289 	uint32_t                smrb_count;
1290 };
1291 
1292 /*!
1293  * @struct smr_barrier_job
1294  *
1295  * @brief
1296  * Data structure used to track completion of smr_barrier() calls.
1297  */
1298 struct smr_barrier_job {
1299 	struct smr_barrier_ctx *smrj_context;
1300 	union {
1301 		struct smr_node smrj_node;
1302 		struct mpsc_queue_chain smrj_link;
1303 	};
1304 };
1305 
1306 #define SMR_BARRIER_SIZE        24
1307 static_assert(sizeof(struct smr_barrier_job) == SMR_BARRIER_SIZE);
1308 #define SMR_BARRIER_USE_STACK   (SMR_BARRIER_SIZE * MAX_CPUS <= 512)
1309 
1310 static void
__smr_worker_check_invariants(struct smr_worker * smrw)1311 __smr_worker_check_invariants(struct smr_worker *smrw)
1312 {
1313 #if MACH_ASSERT
1314 	smr_pcpu_t pcpu = smrw->whead;
1315 	uint16_t num = (uint16_t)cpu_number();
1316 
1317 	assert(!ml_get_interrupts_enabled() || get_preemption_level());
1318 
1319 	for (; pcpu != *smrw->wold_tail; pcpu = pcpu->drain_next) {
1320 		assertf(pcpu->qold_seq != SMR_SEQ_INVALID &&
1321 		    __smr_pcpu_queued(pcpu),
1322 		    "pcpu %p doesn't belong on %p old queue", pcpu, smrw);
1323 		pcpu->__check_cpu = num;
1324 		pcpu->__check_reason = (uint8_t)smrw->detach_reason;
1325 		pcpu->__check_list = 1;
1326 	}
1327 
1328 	for (; pcpu != *smrw->wage_tail; pcpu = pcpu->drain_next) {
1329 		__assert_only smr_t smr = pcpu->drain_smr;
1330 
1331 		assertf(pcpu->qold_seq == SMR_SEQ_INVALID &&
1332 		    pcpu->qage_seq != SMR_SEQ_INVALID &&
1333 		    SMR_SEQ_CMP(pcpu->qage_seq, <=, smr->smr_clock.s_wr_seq) &&
1334 		    __smr_pcpu_queued(pcpu),
1335 		    "pcpu %p doesn't belong on %p aging queue", pcpu, smrw);
1336 		pcpu->__check_cpu = num;
1337 		pcpu->__check_reason = (uint8_t)smrw->detach_reason;
1338 		pcpu->__check_list = 2;
1339 	}
1340 
1341 	for (; pcpu != *smrw->wcur_tail; pcpu = pcpu->drain_next) {
1342 		assertf(pcpu->qold_seq == SMR_SEQ_INVALID &&
1343 		    pcpu->qage_seq != SMR_SEQ_INVALID &&
1344 		    __smr_pcpu_queued(pcpu),
1345 		    "pcpu %p doesn't belong on %p current queue", pcpu, smrw);
1346 		pcpu->__check_cpu = num;
1347 		pcpu->__check_reason = (uint8_t)smrw->detach_reason;
1348 		pcpu->__check_list = 3;
1349 	}
1350 
1351 	assert(pcpu == NULL);
1352 #else
1353 	(void)smrw;
1354 #endif
1355 }
1356 
1357 __attribute__((noinline))
1358 static void
__smr_cpu_lazy_up(struct smr_worker * smrw)1359 __smr_cpu_lazy_up(struct smr_worker *smrw)
1360 {
1361 	spl_t spl;
1362 
1363 	/*
1364 	 * calling smr_call/smr_barrier() from the context of a CPU
1365 	 * with a detached worker is illegal.
1366 	 *
1367 	 * However, bound threads might run on a derecommended (IGNORED)
1368 	 * cpu which we correct for here (and the CPU will go back to IGNORED
1369 	 * in smr_cpu_leave()).
1370 	 */
1371 	assert(smrw->detach_reason == SMR_CPU_REASON_IGNORED);
1372 
1373 	spl = splsched();
1374 	__smrw_lock(smrw);
1375 	smrw->detach_reason &= ~SMR_CPU_REASON_IGNORED;
1376 	__smrw_unlock(smrw);
1377 	splx(spl);
1378 }
1379 
1380 static void
__smr_cpu_lazy_up_if_needed(struct smr_worker * smrw)1381 __smr_cpu_lazy_up_if_needed(struct smr_worker *smrw)
1382 {
1383 	if (__improbable(smrw->detach_reason != SMR_CPU_REASON_NONE)) {
1384 		__smr_cpu_lazy_up(smrw);
1385 	}
1386 }
1387 
1388 static bool
__smr_call_should_advance(smr_pcpu_t pcpu)1389 __smr_call_should_advance(smr_pcpu_t pcpu)
1390 {
1391 	if (pcpu->qcur_cnt > smr_call_cnt_cap) {
1392 		return true;
1393 	}
1394 	if (pcpu->qcur_size > smr_call_size_cap) {
1395 		return true;
1396 	}
1397 	return false;
1398 }
1399 
1400 static void
__smr_call_advance_qcur(smr_t smr,smr_pcpu_t pcpu,bool needs_commit)1401 __smr_call_advance_qcur(smr_t smr, smr_pcpu_t pcpu, bool needs_commit)
1402 {
1403 	smr_seq_t new_seq;
1404 
1405 	if (needs_commit || pcpu->qage_seq) {
1406 		new_seq = smr_advance(smr);
1407 	} else {
1408 		new_seq = smr_deferred_advance(smr);
1409 	}
1410 	__builtin_assume(new_seq != SMR_SEQ_INVALID);
1411 
1412 	pcpu->qage_seq  = new_seq;
1413 	pcpu->qage_tail = pcpu->qcur_tail;
1414 
1415 	pcpu->qcur_size = 0;
1416 	pcpu->qcur_cnt  = 0;
1417 }
1418 
1419 static void
__smr_call_push(smr_pcpu_t pcpu,smr_node_t node,smr_cb_t cb)1420 __smr_call_push(smr_pcpu_t pcpu, smr_node_t node, smr_cb_t cb)
1421 {
1422 	assert(pcpu->c_rd_seq == SMR_SEQ_INVALID);
1423 
1424 	node->smrn_next  = NULL;
1425 	node->smrn_cb    = cb;
1426 
1427 	*pcpu->qcur_tail = node;
1428 	pcpu->qcur_tail  = &node->smrn_next;
1429 	pcpu->qcur_cnt  += 1;
1430 }
1431 
1432 static void
__smr_call_dispatch(struct smr_worker * smrw,smr_pcpu_t pcpu)1433 __smr_call_dispatch(struct smr_worker *smrw, smr_pcpu_t pcpu)
1434 {
1435 	__smr_worker_check_invariants(smrw);
1436 
1437 	if (!__smr_pcpu_queued(pcpu)) {
1438 		assert(pcpu->qold_seq == SMR_SEQ_INVALID);
1439 		assert(pcpu->qage_seq != SMR_SEQ_INVALID);
1440 
1441 		pcpu->drain_next   = NULL;
1442 		*smrw->wcur_tail   = pcpu;
1443 		smrw->wcur_tail    = &pcpu->drain_next;
1444 	}
1445 }
1446 
1447 void
smr_call(smr_t smr,smr_node_t node,vm_size_t size,smr_cb_t cb)1448 smr_call(smr_t smr, smr_node_t node, vm_size_t size, smr_cb_t cb)
1449 {
1450 	struct smr_worker *smrw;
1451 	smr_pcpu_t pcpu;
1452 
1453 	if (__improbable(startup_phase < STARTUP_SUB_EARLY_BOOT)) {
1454 		return cb(node);
1455 	}
1456 
1457 	lock_disable_preemption_for_thread(current_thread());
1458 	assert(!ml_at_interrupt_context());
1459 
1460 	smrw = PERCPU_GET(smr_worker);
1461 	__smr_cpu_lazy_up_if_needed(smrw);
1462 
1463 	pcpu = __smr_pcpu(smr);
1464 	assert(pcpu->c_rd_seq == SMR_SEQ_INVALID);
1465 
1466 	if (os_add_overflow(pcpu->qcur_size, size, &pcpu->qcur_size)) {
1467 		pcpu->qcur_size = UINT32_MAX;
1468 	}
1469 
1470 	__smr_call_push(pcpu, node, cb);
1471 	if (__smr_call_should_advance(pcpu)) {
1472 		if (pcpu->qage_seq == SMR_SEQ_INVALID) {
1473 			__smr_call_advance_qcur(smr, pcpu, false);
1474 		}
1475 		__smr_call_dispatch(smrw, pcpu);
1476 	}
1477 
1478 	return lock_enable_preemption();
1479 }
1480 
1481 static inline event_t
__smrb_event(struct smr_barrier_ctx * ctx)1482 __smrb_event(struct smr_barrier_ctx *ctx)
1483 {
1484 	return ctx;
1485 }
1486 
1487 static void
__smr_barrier_cb(struct smr_node * node)1488 __smr_barrier_cb(struct smr_node *node)
1489 {
1490 	struct smr_barrier_job *job;
1491 	struct smr_barrier_ctx *ctx;
1492 
1493 	job = __container_of(node, struct smr_barrier_job, smrj_node);
1494 	ctx = job->smrj_context;
1495 
1496 	if (os_atomic_dec(&ctx->smrb_pending, relaxed) == 0) {
1497 		/*
1498 		 * It is permitted to still reach into the context
1499 		 * because smr_barrier() always blocks, which means
1500 		 * that the context will be valid until this wakeup
1501 		 * happens.
1502 		 */
1503 		thread_wakeup_thread(__smrb_event(ctx), ctx->smrb_waiter);
1504 	}
1505 }
1506 
1507 static bool
__smr_barrier_drain(struct smr_worker * smrw,bool needs_commit)1508 __smr_barrier_drain(struct smr_worker *smrw, bool needs_commit)
1509 {
1510 	mpsc_queue_chain_t head, tail, it;
1511 
1512 	head = mpsc_queue_dequeue_batch(&smrw->barrier_queue, &tail,
1513 	    OS_ATOMIC_DEPENDENCY_NONE);
1514 
1515 	mpsc_queue_batch_foreach_safe(it, head, tail) {
1516 		struct smr_barrier_job *job;
1517 		struct smr_barrier_ctx *ctx;
1518 		smr_pcpu_t pcpu;
1519 		smr_t smr;
1520 
1521 		job  = __container_of(it, struct smr_barrier_job, smrj_link);
1522 		ctx  = job->smrj_context;
1523 		smr  = ctx->smrb_domain;
1524 		pcpu = __smr_pcpu(smr, smrw->processor->cpu_id);
1525 
1526 		pcpu->qcur_size = UINT32_MAX;
1527 		__smr_call_push(pcpu, &job->smrj_node, __smr_barrier_cb);
1528 		__smr_call_advance_qcur(smr, pcpu, needs_commit);
1529 		__smr_call_dispatch(smrw, pcpu);
1530 	}
1531 
1532 	return head != NULL;
1533 }
1534 
1535 
1536 void
smr_barrier(smr_t smr)1537 smr_barrier(smr_t smr)
1538 {
1539 #if SMR_BARRIER_USE_STACK
1540 	struct smr_barrier_job jobs[MAX_CPUS];
1541 #else
1542 	struct smr_barrier_job *jobs;
1543 #endif
1544 	struct smr_barrier_job *job;
1545 	struct smr_barrier_ctx  ctx = {
1546 		.smrb_domain  = smr,
1547 		.smrb_waiter  = current_thread(),
1548 		.smrb_pending = zpercpu_count(),
1549 		.smrb_count   = zpercpu_count(),
1550 	};
1551 	spl_t spl;
1552 
1553 	/*
1554 	 * First wait for all readers to observe whatever it is
1555 	 * that changed prior to this call.
1556 	 *
1557 	 * _then_ enqueue callbacks that push out anything ahead.
1558 	 */
1559 	smr_synchronize(smr);
1560 
1561 #if !SMR_BARRIER_USE_STACK
1562 	jobs = kalloc_type(struct smr_barrier_job, ctx.smrb_count,
1563 	    Z_WAITOK | Z_ZERO | Z_NOFAIL);
1564 #endif
1565 	job  = jobs;
1566 	spl  = splsched();
1567 
1568 	__smr_cpu_lazy_up_if_needed(PERCPU_GET(smr_worker));
1569 
1570 	percpu_foreach(smrw, smr_worker) {
1571 		job->smrj_context = &ctx;
1572 		if (mpsc_queue_append(&smrw->barrier_queue, &job->smrj_link)) {
1573 			__smrw_lock(smrw);
1574 			__smrw_wakeup_and_unlock(smrw);
1575 		}
1576 		job++;
1577 	}
1578 
1579 	/*
1580 	 * Because we disabled interrupts, our own CPU's callback
1581 	 * can't possibly have run, so just block.
1582 	 *
1583 	 * We must block in order to guarantee the lifetime of "ctx".
1584 	 * (See comment in __smr_barrier_cb).
1585 	 */
1586 	assert_wait(__smrb_event(&ctx), THREAD_UNINT);
1587 	assert(ctx.smrb_pending > 0);
1588 	splx(spl);
1589 	thread_block(THREAD_CONTINUE_NULL);
1590 
1591 #if !SMR_BARRIER_USE_STACK
1592 	kfree_type(struct smr_barrier_job, ctx.smrb_count, jobs);
1593 #endif
1594 }
1595 
1596 
1597 #pragma mark SMR domains: smr_worker
1598 
1599 static void
__smr_worker_drain_lock(struct smr_worker * smrw)1600 __smr_worker_drain_lock(struct smr_worker *smrw)
1601 {
1602 	for (;;) {
1603 		ml_set_interrupts_enabled(false);
1604 		__smrw_lock(smrw);
1605 
1606 		/*
1607 		 * Check we are on an appropriate processor
1608 		 *
1609 		 * Note that we might be running on the canonical
1610 		 * processor incorrectly: if the processor has been
1611 		 * de-recommended but isn't offline.
1612 		 */
1613 		if (__probable(current_processor() == smrw->processor)) {
1614 			if (__probable(!smrw->detach_reason)) {
1615 				break;
1616 			}
1617 		} else {
1618 			if (__probable(smrw->detach_reason)) {
1619 				break;
1620 			}
1621 		}
1622 
1623 		/* go bind in the right place and retry */
1624 		thread_bind(__smrw_drain_bind_target(smrw));
1625 		__smrw_unlock(smrw);
1626 		ml_set_interrupts_enabled(true);
1627 		thread_block(THREAD_CONTINUE_NULL);
1628 	}
1629 }
1630 
1631 static void
__smr_worker_drain_unlock(struct smr_worker * smrw)1632 __smr_worker_drain_unlock(struct smr_worker *smrw)
1633 {
1634 	__smrw_unlock(smrw);
1635 	ml_set_interrupts_enabled(true);
1636 }
1637 
1638 /*!
1639  * @function __smr_worker_tick
1640  *
1641  * @brief
1642  * Make the SMR worker queues make gentle progress
1643  *
1644  * @discussion
1645  * One round of progress will:
1646  * - move entries that have aged as being old,
1647  * - commit entries that have a deferred sequence and let them age.
1648  *
1649  * If this results into any callbacks to become "old",
1650  * then the worker is being woken up to start running callbacks.
1651  *
1652  * This function must run either on the processfor for this worker,
1653  * or under the worker drain lock being held.
1654  */
1655 static void
__smr_worker_tick(struct smr_worker * smrw,uint64_t ctime,bool wakeup)1656 __smr_worker_tick(struct smr_worker *smrw, uint64_t ctime, bool wakeup)
1657 {
1658 	smr_pcpu_t pcpu = *smrw->wold_tail;
1659 
1660 	__smr_worker_check_invariants(smrw);
1661 
1662 	for (; pcpu != *smrw->wage_tail; pcpu = pcpu->drain_next) {
1663 		assert(pcpu->qold_seq == SMR_SEQ_INVALID);
1664 		assert(pcpu->qage_seq != SMR_SEQ_INVALID);
1665 
1666 		pcpu->qold_seq  = pcpu->qage_seq;
1667 		pcpu->qold_tail = pcpu->qage_tail;
1668 
1669 		pcpu->qage_seq  = SMR_SEQ_INVALID;
1670 	}
1671 
1672 	for (; pcpu; pcpu = pcpu->drain_next) {
1673 		assert(pcpu->qold_seq == SMR_SEQ_INVALID);
1674 		assert(pcpu->qage_seq != SMR_SEQ_INVALID);
1675 
1676 		smr_deferred_advance_commit(pcpu->drain_smr, pcpu->qage_seq);
1677 	}
1678 
1679 	smrw->wold_tail = smrw->wage_tail;
1680 	smrw->wage_tail = smrw->wcur_tail;
1681 	smrw->drain_ctime = ctime;
1682 
1683 	__smr_worker_check_invariants(smrw);
1684 
1685 	if (wakeup && smrw->wold_tail != &smrw->whead) {
1686 		__smrw_lock(smrw);
1687 		__smrw_wakeup_and_unlock(smrw);
1688 	}
1689 }
1690 
1691 static void
__smr_worker_update_wold_tail(struct smr_worker * smrw,smr_pcpu_t * new_tail)1692 __smr_worker_update_wold_tail(struct smr_worker *smrw, smr_pcpu_t *new_tail)
1693 {
1694 	smr_pcpu_t *old_tail = smrw->wold_tail;
1695 
1696 	if (smrw->wcur_tail == old_tail) {
1697 		smrw->wage_tail = new_tail;
1698 		smrw->wcur_tail = new_tail;
1699 	} else if (smrw->wage_tail == old_tail) {
1700 		smrw->wage_tail = new_tail;
1701 	}
1702 
1703 	smrw->wold_tail = new_tail;
1704 }
1705 
1706 static void
__smr_worker_drain_one(struct smr_worker * smrw,smr_pcpu_t pcpu)1707 __smr_worker_drain_one(struct smr_worker *smrw, smr_pcpu_t pcpu)
1708 {
1709 	smr_t       smr  = pcpu->drain_smr;
1710 	smr_seq_t   seq  = pcpu->qold_seq;
1711 	smr_node_t  head;
1712 
1713 	/*
1714 	 * Step 1: pop the "old" items,
1715 	 *         (qold_tail/qold_seq left dangling)
1716 	 */
1717 
1718 	assert(seq != SMR_SEQ_INVALID);
1719 	head = pcpu->qhead;
1720 	pcpu->qhead = *pcpu->qold_tail;
1721 	*pcpu->qold_tail = NULL;
1722 
1723 	/*
1724 	 * Step 2: Reconstruct the queue
1725 	 *         based on the sequence numbers and count fields.
1726 	 *
1727 	 *         Do what __smr_worker_tick() would do on this queue:
1728 	 *         - commit the aging queue
1729 	 *         - advance the current queue if needed
1730 	 */
1731 
1732 	if (pcpu->qage_seq != SMR_SEQ_INVALID) {
1733 		assert(pcpu->qage_tail != pcpu->qold_tail);
1734 
1735 		smr_deferred_advance_commit(smr, pcpu->qage_seq);
1736 		pcpu->qold_seq  = pcpu->qage_seq;
1737 		pcpu->qold_tail = pcpu->qage_tail;
1738 	} else {
1739 		assert(pcpu->qage_tail == pcpu->qold_tail);
1740 
1741 		pcpu->qold_seq  = SMR_SEQ_INVALID;
1742 		pcpu->qold_tail = &pcpu->qhead;
1743 	}
1744 
1745 	if (__smr_call_should_advance(pcpu)) {
1746 		__smr_call_advance_qcur(smr, pcpu, false);
1747 	} else {
1748 		pcpu->qage_seq  = SMR_SEQ_INVALID;
1749 		pcpu->qage_tail = pcpu->qold_tail;
1750 		if (pcpu->qcur_cnt == 0) {
1751 			pcpu->qcur_tail = pcpu->qage_tail;
1752 		}
1753 	}
1754 
1755 	if (pcpu->qold_seq != SMR_SEQ_INVALID) {
1756 		/*
1757 		 * The node has gained an "old seq" back,
1758 		 * it goes to the ready queue.
1759 		 */
1760 		pcpu->drain_next = *smrw->wold_tail;
1761 		*smrw->wold_tail = pcpu;
1762 		__smr_worker_update_wold_tail(smrw,
1763 		    &pcpu->drain_next);
1764 	} else if (pcpu->qage_seq != SMR_SEQ_INVALID) {
1765 		/*
1766 		 * The node has gained an "age seq" back,
1767 		 * it needs to age and wait for a tick
1768 		 * for its sequence number to be commited.
1769 		 */
1770 		pcpu->drain_next = NULL;
1771 		*smrw->wcur_tail = pcpu;
1772 		smrw->wcur_tail  = &pcpu->drain_next;
1773 	} else {
1774 		/*
1775 		 * The node is empty or with "current"
1776 		 * callbacks only, it can be dequeued.
1777 		 */
1778 		assert(!__smr_call_should_advance(pcpu));
1779 		pcpu->__check_cpu = (uint16_t)cpu_number();
1780 		pcpu->__check_reason = (uint8_t)smrw->detach_reason;
1781 		pcpu->__check_list = 0;
1782 		__smr_pcpu_set_not_queued(pcpu);
1783 	}
1784 
1785 	/*
1786 	 * Step 3: drain callbacks.
1787 	 */
1788 	__smr_worker_check_invariants(smrw);
1789 	__smr_worker_drain_unlock(smrw);
1790 
1791 	__smr_poll(smr, seq, true);
1792 	__smr_call_drain(head);
1793 
1794 	__smr_worker_drain_lock(smrw);
1795 }
1796 
1797 static void
__smr_worker_continue(void * arg,wait_result_t wr __unused)1798 __smr_worker_continue(void *arg, wait_result_t wr __unused)
1799 {
1800 	smr_pcpu_t pcpu = NULL, next = NULL;
1801 	struct smr_worker *const smrw = arg;
1802 	uint64_t deadline;
1803 
1804 	__smr_worker_drain_lock(smrw);
1805 	__smr_worker_check_invariants(smrw);
1806 
1807 	if (smrw->wold_tail != &smrw->whead) {
1808 		next = smrw->whead;
1809 		smrw->whead = *smrw->wold_tail;
1810 		*smrw->wold_tail = NULL;
1811 		__smr_worker_update_wold_tail(smrw, &smrw->whead);
1812 	}
1813 
1814 	/*
1815 	 * The pipeline of per-cpu SMR data structures with pending
1816 	 * smr_call() callbacks has three stages: wcur -> wage -> wold.
1817 	 *
1818 	 * In order to guarantee forward progress, a tick happens
1819 	 * for each of them, either via __smr_worker_tick(),
1820 	 * or via __smr_worker_drain_one().
1821 	 *
1822 	 * The second tick will happen either because to core stayed
1823 	 * busy enough that a subsequent smr_cpu_tick() decided to
1824 	 * perform it, or because the CPU idled, and smr_cpu_leave()
1825 	 * will perform an unconditional __smr_worker_tick().
1826 	 */
1827 	__smr_barrier_drain(smrw, false);
1828 	__smr_worker_tick(smrw, mach_absolute_time(), false);
1829 
1830 	while ((pcpu = next)) {
1831 		next = next->drain_next;
1832 		__smr_worker_drain_one(smrw, pcpu);
1833 	}
1834 
1835 	if (__improbable(smrw->whead && smrw->detach_reason)) {
1836 		/*
1837 		 * If the thread isn't bound, we want to flush anything
1838 		 * that is pending without causing too much contention.
1839 		 *
1840 		 * Sleep for a bit in order to give the system time
1841 		 * to observe any advance commits we did.
1842 		 */
1843 		deadline = mach_absolute_time() + cpu_checkin_min_interval;
1844 	} else {
1845 		deadline = TIMEOUT_WAIT_FOREVER;
1846 	}
1847 	waitq_assert_wait64_locked(&smrw->waitq, __smrw_drain_event(smrw),
1848 	    THREAD_UNINT, TIMEOUT_URGENCY_SYS_NORMAL, deadline,
1849 	    TIMEOUT_NO_LEEWAY, smrw->thread);
1850 
1851 	/*
1852 	 * Make sure there's no barrier left, after we called assert_wait()
1853 	 * in order to pair with __smr_barrier_cb(). If we do find some,
1854 	 * we must be careful about invariants and forward progress.
1855 	 *
1856 	 * For affected domains, the dequeued barriers have been added
1857 	 * to their "qage" queue. If their "qage" queue was non empty,
1858 	 * then its "qage_seq" was already commited, and we must preserve
1859 	 * this invariant.
1860 	 *
1861 	 * Affected domains that were idle before will get enqueued on this
1862 	 * worker's "wcur" queue. In order to guarantee forward progress,
1863 	 * we must force a tick if both the "wage" and "wold" queues
1864 	 * of the worker are empty.
1865 	 */
1866 	if (__improbable(__smr_barrier_drain(smrw, true))) {
1867 		if (smrw->wage_tail == &smrw->whead) {
1868 			__smr_worker_tick(smrw, mach_absolute_time(), false);
1869 		}
1870 	}
1871 
1872 	__smr_worker_check_invariants(smrw);
1873 	__smr_worker_drain_unlock(smrw);
1874 
1875 	thread_block_parameter(__smr_worker_continue, smrw);
1876 }
1877 
1878 
1879 #pragma mark SMR domains: scheduler integration
1880 
1881 #if CONFIG_QUIESCE_COUNTER
1882 __startup_data
1883 static uint64_t _Atomic quiesce_gen_startup;
1884 static uint64_t _Atomic *quiesce_genp = &quiesce_gen_startup;
1885 static uint64_t _Atomic quiesce_ctime;
1886 
1887 void
cpu_quiescent_set_storage(uint64_t _Atomic * ptr)1888 cpu_quiescent_set_storage(uint64_t _Atomic *ptr)
1889 {
1890 	/*
1891 	 * Transfer to the real location for the commpage.
1892 	 *
1893 	 * this is ok to do like this because the system
1894 	 * is still single threaded.
1895 	 */
1896 	uint64_t gen = os_atomic_load(&quiesce_gen_startup, relaxed);
1897 
1898 	os_atomic_store(ptr, gen, relaxed);
1899 	quiesce_genp = ptr;
1900 }
1901 
1902 static smr_seq_t
cpu_quiescent_gen_to_seq(uint64_t gen)1903 cpu_quiescent_gen_to_seq(uint64_t gen)
1904 {
1905 	return gen * SMR_SEQ_INC + SMR_SEQ_INIT;
1906 }
1907 
1908 static void
cpu_quiescent_advance(uint64_t gen,uint64_t ctime __kdebug_only)1909 cpu_quiescent_advance(uint64_t gen, uint64_t ctime __kdebug_only)
1910 {
1911 	smr_seq_t seq = cpu_quiescent_gen_to_seq(gen);
1912 
1913 	os_atomic_thread_fence(seq_cst);
1914 
1915 	percpu_foreach(it, smr_worker) {
1916 		smr_seq_t rd_seq = os_atomic_load(&it->rd_quiesce_seq, relaxed);
1917 
1918 		if (rd_seq != SMR_SEQ_INVALID && SMR_SEQ_CMP(rd_seq, <, seq)) {
1919 			return;
1920 		}
1921 	}
1922 
1923 	os_atomic_thread_fence(seq_cst);
1924 
1925 	if (lock_cmpxchg(quiesce_genp, gen, gen + 1, relaxed)) {
1926 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUIESCENT_COUNTER),
1927 		    gen, 0, ctime, 0);
1928 	}
1929 }
1930 
1931 static void
cpu_quiescent_join(struct smr_worker * smrw)1932 cpu_quiescent_join(struct smr_worker *smrw)
1933 {
1934 	uint64_t gen = os_atomic_load(quiesce_genp, relaxed);
1935 
1936 	assert(smrw->rd_quiesce_seq == SMR_SEQ_INVALID);
1937 	os_atomic_store(&smrw->rd_quiesce_seq,
1938 	    cpu_quiescent_gen_to_seq(gen), relaxed);
1939 	os_atomic_thread_fence(seq_cst);
1940 }
1941 
1942 static void
cpu_quiescent_tick(struct smr_worker * smrw,uint64_t ctime,uint64_t interval)1943 cpu_quiescent_tick(struct smr_worker *smrw, uint64_t ctime, uint64_t interval)
1944 {
1945 	uint64_t  gen  = os_atomic_load(quiesce_genp, relaxed);
1946 	smr_seq_t seq  = cpu_quiescent_gen_to_seq(gen);
1947 
1948 	if (smrw->rd_quiesce_seq == SMR_SEQ_INVALID) {
1949 		/*
1950 		 * Likely called because of the scheduler tick,
1951 		 * smr_maintenance() will do the right thing.
1952 		 */
1953 		assert(current_processor()->state != PROCESSOR_RUNNING);
1954 	} else if (seq != smrw->rd_quiesce_seq) {
1955 		/*
1956 		 * Someone managed to update the sequence already,
1957 		 * learn it, update our ctime.
1958 		 */
1959 		os_atomic_store(&smrw->rd_quiesce_seq, seq, release);
1960 		os_atomic_store(&quiesce_ctime, ctime, relaxed);
1961 		os_atomic_thread_fence(seq_cst);
1962 	} else if ((ctime - os_atomic_load(&quiesce_ctime, relaxed)) > interval) {
1963 		/*
1964 		 * The system looks busy enough we want to update
1965 		 * the counter faster than every scheduler tick.
1966 		 */
1967 		os_atomic_store(&quiesce_ctime, ctime, relaxed);
1968 		cpu_quiescent_advance(gen, ctime);
1969 	}
1970 }
1971 
1972 static void
cpu_quiescent_leave(struct smr_worker * smrw)1973 cpu_quiescent_leave(struct smr_worker *smrw)
1974 {
1975 	assert(smrw->rd_quiesce_seq != SMR_SEQ_INVALID);
1976 	os_atomic_store(&smrw->rd_quiesce_seq, SMR_SEQ_INVALID, release);
1977 }
1978 #endif /* CONFIG_QUIESCE_COUNTER */
1979 
1980 uint32_t
smr_cpu_checkin_get_min_interval_us(void)1981 smr_cpu_checkin_get_min_interval_us(void)
1982 {
1983 	return cpu_checkin_min_interval_us;
1984 }
1985 
1986 void
smr_cpu_checkin_set_min_interval_us(uint32_t new_value_us)1987 smr_cpu_checkin_set_min_interval_us(uint32_t new_value_us)
1988 {
1989 	/* clamp to something vaguely sane */
1990 	if (new_value_us > CPU_CHECKIN_MIN_INTERVAL_MAX_US) {
1991 		new_value_us = CPU_CHECKIN_MIN_INTERVAL_MAX_US;
1992 	}
1993 
1994 	cpu_checkin_min_interval_us = new_value_us;
1995 
1996 	uint64_t abstime = 0;
1997 	clock_interval_to_absolutetime_interval(cpu_checkin_min_interval_us,
1998 	    NSEC_PER_USEC, &abstime);
1999 	cpu_checkin_min_interval = abstime;
2000 }
2001 
2002 __startup_func
2003 static void
smr_cpu_checkin_init_min_interval_us(void)2004 smr_cpu_checkin_init_min_interval_us(void)
2005 {
2006 	smr_cpu_checkin_set_min_interval_us(CPU_CHECKIN_MIN_INTERVAL_US);
2007 }
2008 STARTUP(TUNABLES, STARTUP_RANK_FIRST, smr_cpu_checkin_init_min_interval_us);
2009 
2010 static void
__smr_cpu_init_thread(struct smr_worker * smrw)2011 __smr_cpu_init_thread(struct smr_worker *smrw)
2012 {
2013 	char name[MAXTHREADNAMESIZE];
2014 	thread_t th = THREAD_NULL;
2015 
2016 	kernel_thread_create(__smr_worker_continue, smrw, MINPRI_KERNEL, &th);
2017 	smrw->thread = th;
2018 
2019 	snprintf(name, sizeof(name), "smr.reclaim:%d", smrw->processor->cpu_id);
2020 	thread_set_thread_name(th, name);
2021 	thread_start_in_assert_wait(th,
2022 	    &smrw->waitq, __smrw_drain_event(smrw), THREAD_UNINT);
2023 }
2024 
2025 void
smr_cpu_init(struct processor * processor)2026 smr_cpu_init(struct processor *processor)
2027 {
2028 	struct smr_worker *smrw;
2029 
2030 	smrw = PERCPU_GET_RELATIVE(smr_worker, processor, processor);
2031 	smrw->processor = processor;
2032 
2033 	waitq_init(&smrw->waitq, WQT_QUEUE, SYNC_POLICY_FIFO);
2034 	smrw->detach_reason = SMR_CPU_REASON_OFFLINE;
2035 
2036 	smrq_init(&smrw->sect_queue);
2037 	smrw->wold_tail = &smrw->whead;
2038 	smrw->wage_tail = &smrw->whead;
2039 	smrw->wcur_tail = &smrw->whead;
2040 	mpsc_queue_init(&smrw->barrier_queue);
2041 
2042 	if (processor != master_processor) {
2043 		__smr_cpu_init_thread(smrw);
2044 	}
2045 }
2046 STARTUP_ARG(LOCKS, STARTUP_RANK_LAST, smr_cpu_init, master_processor);
2047 STARTUP_ARG(THREAD_CALL, STARTUP_RANK_LAST,
2048     __smr_cpu_init_thread, PERCPU_GET_MASTER(smr_worker));
2049 
2050 /*!
2051  * @function smr_cpu_up()
2052  *
2053  * @brief
2054  * Scheduler callback to notify this processor is going up.
2055  *
2056  * @discussion
2057  * Called at splsched() under the sched_available_cores_lock.
2058  */
2059 void
smr_cpu_up(struct processor * processor,smr_cpu_reason_t reason)2060 smr_cpu_up(struct processor *processor, smr_cpu_reason_t reason)
2061 {
2062 	struct smr_worker *smrw;
2063 
2064 	smrw = PERCPU_GET_RELATIVE(smr_worker, processor, processor);
2065 
2066 	__smrw_lock(smrw);
2067 	if (reason != SMR_CPU_REASON_IGNORED) {
2068 		assert((smrw->detach_reason & reason) == reason);
2069 	}
2070 	smrw->detach_reason &= ~reason;
2071 	__smrw_unlock(smrw);
2072 }
2073 
2074 static void
__smr_cpu_down_and_unlock(struct processor * processor,struct smr_worker * smrw,smr_cpu_reason_t reason)2075 __smr_cpu_down_and_unlock(
2076 	struct processor       *processor,
2077 	struct smr_worker      *smrw,
2078 	smr_cpu_reason_t        reason)
2079 {
2080 	bool detach = !smrw->detach_reason;
2081 
2082 	/*
2083 	 * When reason is SMR_CPU_REASON_IGNORED,
2084 	 * this is called from smr_cpu_leave() on the way to idle.
2085 	 *
2086 	 * However this isn't sychronized with the recommendation
2087 	 * lock, hence it is possible that the CPU might actually
2088 	 * be recommended again while we're on the way to idle.
2089 	 *
2090 	 * By re-checking processor recommendation under
2091 	 * the __smrw_lock, we serialize with smr_cpu_up().
2092 	 */
2093 	if (reason != SMR_CPU_REASON_IGNORED) {
2094 		assert((smrw->detach_reason & reason) == 0);
2095 	} else if (processor->is_recommended) {
2096 		/*
2097 		 * The race we try to detect happened,
2098 		 * do nothing.
2099 		 */
2100 		reason = SMR_CPU_REASON_NONE;
2101 		detach = false;
2102 	}
2103 	smrw->detach_reason |= reason;
2104 	reason = smrw->detach_reason;
2105 
2106 	if (detach && smrw->whead) {
2107 		detach = !__smrw_wakeup_and_unlock(smrw);
2108 	} else {
2109 		__smrw_unlock(smrw);
2110 	}
2111 
2112 	if (detach) {
2113 		thread_unbind_after_queue_shutdown(smrw->thread, processor);
2114 	}
2115 }
2116 
2117 /*!
2118  * @function smr_cpu_down()
2119  *
2120  * @brief
2121  * Scheduler callback to notify this processor is going down.
2122  *
2123  * @discussion
2124  * Called at splsched() when the processor run queue is being shut down.
2125  */
2126 void
smr_cpu_down(struct processor * processor,smr_cpu_reason_t reason)2127 smr_cpu_down(struct processor *processor, smr_cpu_reason_t reason)
2128 {
2129 	struct smr_worker *smrw;
2130 
2131 	smrw = PERCPU_GET_RELATIVE(smr_worker, processor, processor);
2132 
2133 	__smrw_lock(smrw);
2134 	__smr_cpu_down_and_unlock(processor, smrw, reason);
2135 }
2136 
2137 
2138 /*!
2139  * @function smr_cpu_join()
2140  *
2141  * @brief
2142  * Scheduler callback to notify this processor is going out of idle.
2143  *
2144  * @discussion
2145  * Called at splsched().
2146  */
2147 void
smr_cpu_join(struct processor * processor,uint64_t ctime __unused)2148 smr_cpu_join(struct processor *processor, uint64_t ctime __unused)
2149 {
2150 #if CONFIG_QUIESCE_COUNTER
2151 	struct smr_worker *smrw;
2152 
2153 	smrw = PERCPU_GET_RELATIVE(smr_worker, processor, processor);
2154 	cpu_quiescent_join(smrw);
2155 #else
2156 	(void)processor;
2157 #endif /* CONFIG_QUIESCE_COUNTER */
2158 }
2159 
2160 /*!
2161  * @function smr_cpu_tick()
2162  *
2163  * @brief
2164  * Scheduler callback invoked during the scheduler maintenance routine.
2165  *
2166  * @discussion
2167  * Called at splsched().
2168  */
2169 void
smr_cpu_tick(uint64_t ctime,bool safe_point)2170 smr_cpu_tick(uint64_t ctime, bool safe_point)
2171 {
2172 	struct smr_worker *smrw = PERCPU_GET(smr_worker);
2173 	uint64_t interval = cpu_checkin_min_interval;
2174 
2175 #if CONFIG_QUIESCE_COUNTER
2176 	cpu_quiescent_tick(smrw, ctime, interval);
2177 #endif /* CONFIG_QUIESCE_COUNTER */
2178 
2179 	/*
2180 	 * if a bound thread was woken up on a derecommended core,
2181 	 * our detach_reason might be "IGNORED" and we want to leave
2182 	 * it alone in that case
2183 	 */
2184 	if (safe_point && !smrw->detach_reason && smrw->whead &&
2185 	    current_processor()->state == PROCESSOR_RUNNING &&
2186 	    (ctime - smrw->drain_ctime) > interval) {
2187 		__smr_worker_tick(smrw, ctime, true);
2188 	}
2189 }
2190 
2191 /*!
2192  * @function smr_cpu_leave()
2193  *
2194  * @brief
2195  * Scheduler callback to notify this processor is going idle.
2196  *
2197  * @discussion
2198  * Called at splsched().
2199  */
2200 void
smr_cpu_leave(struct processor * processor,uint64_t ctime)2201 smr_cpu_leave(struct processor *processor, uint64_t ctime)
2202 {
2203 	struct smr_worker *smrw;
2204 
2205 	smrw = PERCPU_GET_RELATIVE(smr_worker, processor, processor);
2206 
2207 	/*
2208 	 * if a bound thread was woken up on a derecommended core,
2209 	 * our detach_reason might be "IGNORED" and we want to leave
2210 	 * it alone in that case
2211 	 *
2212 	 * See comment in __smr_worker_continue for why this must be
2213 	 * done unconditionally otherwise.
2214 	 */
2215 	if (!smrw->detach_reason && smrw->whead) {
2216 		__smr_worker_tick(smrw, ctime, true);
2217 	}
2218 
2219 	if (__improbable(!processor->is_recommended)) {
2220 		__smrw_lock(smrw);
2221 		__smr_cpu_down_and_unlock(processor, smrw, SMR_CPU_REASON_IGNORED);
2222 	}
2223 
2224 #if CONFIG_QUIESCE_COUNTER
2225 	cpu_quiescent_leave(smrw);
2226 #endif /* CONFIG_QUIESCE_COUNTER */
2227 }
2228 
2229 /*!
2230  * @function smr_maintenance()
2231  *
2232  * @brief
2233  * Scheduler callback called at the scheduler tick.
2234  *
2235  * @discussion
2236  * Called at splsched().
2237  */
2238 void
smr_maintenance(uint64_t ctime)2239 smr_maintenance(uint64_t ctime)
2240 {
2241 #if CONFIG_QUIESCE_COUNTER
2242 	cpu_quiescent_advance(os_atomic_load(quiesce_genp, relaxed), ctime);
2243 #else
2244 	(void)ctime;
2245 #endif /* CONFIG_QUIESCE_COUNTER */
2246 }
2247 
2248 
2249 #pragma mark - SMR hash tables
2250 
2251 static struct smrq_slist_head *
smr_hash_alloc_array(size_t size)2252 smr_hash_alloc_array(size_t size)
2253 {
2254 	return kalloc_type(struct smrq_slist_head, size,
2255 	           Z_WAITOK | Z_ZERO | Z_SPRAYQTN);
2256 }
2257 
2258 static void
smr_hash_free_array(struct smrq_slist_head * array,size_t size)2259 smr_hash_free_array(struct smrq_slist_head *array, size_t size)
2260 {
2261 	kfree_type(struct smrq_slist_head, size, array);
2262 }
2263 
2264 static inline uintptr_t
smr_hash_array_encode(struct smrq_slist_head * array,uint16_t order)2265 smr_hash_array_encode(struct smrq_slist_head *array, uint16_t order)
2266 {
2267 	uintptr_t ptr;
2268 
2269 	ptr  = (uintptr_t)array;
2270 	ptr &= ~SMRH_ARRAY_ORDER_MASK;
2271 	ptr |= (uintptr_t)order << SMRH_ARRAY_ORDER_SHIFT;
2272 
2273 	return ptr;
2274 }
2275 
2276 #pragma mark SMR simple hash tables
2277 
2278 void
smr_hash_init(struct smr_hash * smrh,size_t size)2279 smr_hash_init(struct smr_hash *smrh, size_t size)
2280 {
2281 	struct smrq_slist_head *array;
2282 	uint16_t shift;
2283 
2284 	assert(size);
2285 	shift = (uint16_t)flsll(size - 1);
2286 	size  = 1UL << shift;
2287 	if (startup_phase >= STARTUP_SUB_LOCKDOWN) {
2288 		assert(size * sizeof(struct smrq_slist_head) <=
2289 		    KALLOC_SAFE_ALLOC_SIZE);
2290 	}
2291 	array = smr_hash_alloc_array(size);
2292 
2293 	*smrh = (struct smr_hash){
2294 		.smrh_array = smr_hash_array_encode(array, 64 - shift),
2295 	};
2296 }
2297 
2298 void
smr_hash_destroy(struct smr_hash * smrh)2299 smr_hash_destroy(struct smr_hash *smrh)
2300 {
2301 	struct smr_hash_array array = smr_hash_array_decode(smrh);
2302 
2303 	smr_hash_free_array(array.smrh_array, smr_hash_size(array));
2304 	*smrh = (struct smr_hash){ };
2305 }
2306 
2307 void
2308 __smr_hash_serialized_clear(
2309 	struct smr_hash        *smrh,
2310 	smrh_traits_t          smrht,
2311 	void                 (^free)(void *obj))
2312 {
2313 	struct smr_hash_array array = smr_hash_array_decode(smrh);
2314 
2315 	for (size_t i = 0; i < smr_hash_size(array); i++) {
2316 		struct smrq_slink *link;
2317 		__smrq_slink_t *prev;
2318 
2319 		prev = &array.smrh_array[i].first;
2320 		while ((link = smr_serialized_load(prev))) {
2321 			prev = &link->next;
2322 			free(__smrht_link_to_obj(smrht, link));
2323 		}
2324 
2325 		smr_clear_store(&array.smrh_array[i].first);
2326 	}
2327 
2328 	smrh->smrh_count = 0;
2329 }
2330 
2331 kern_return_t
__smr_hash_shrink_and_unlock(struct smr_hash * smrh,lck_mtx_t * lock,smrh_traits_t smrht)2332 __smr_hash_shrink_and_unlock(
2333 	struct smr_hash        *smrh,
2334 	lck_mtx_t              *lock,
2335 	smrh_traits_t           smrht)
2336 {
2337 	struct smr_hash_array decptr = smr_hash_array_decode(smrh);
2338 	struct smrq_slist_head *newarray, *oldarray;
2339 	uint16_t neworder = decptr.smrh_order + 1;
2340 	size_t   oldsize  = smr_hash_size(decptr);
2341 	size_t   newsize  = oldsize / 2;
2342 
2343 	assert(newsize);
2344 
2345 	if (os_atomic_load(&smrh->smrh_resizing, relaxed)) {
2346 		lck_mtx_unlock(lock);
2347 		return KERN_FAILURE;
2348 	}
2349 
2350 	os_atomic_store(&smrh->smrh_resizing, true, relaxed);
2351 	lck_mtx_unlock(lock);
2352 
2353 	newarray = smr_hash_alloc_array(newsize);
2354 	if (newarray == NULL) {
2355 		os_atomic_store(&smrh->smrh_resizing, false, relaxed);
2356 		return KERN_RESOURCE_SHORTAGE;
2357 	}
2358 
2359 	lck_mtx_lock(lock);
2360 
2361 	/*
2362 	 * Step 1: collapse all the chains in pairs.
2363 	 */
2364 	oldarray = decptr.smrh_array;
2365 
2366 	for (size_t i = 0; i < newsize; i++) {
2367 		newarray[i] = oldarray[i];
2368 		smrq_serialized_append(&newarray[i], &oldarray[i + newsize]);
2369 	}
2370 
2371 	/*
2372 	 * Step 2: publish the new array.
2373 	 */
2374 	os_atomic_store(&smrh->smrh_array,
2375 	    smr_hash_array_encode(newarray, neworder), release);
2376 
2377 	os_atomic_store(&smrh->smrh_resizing, false, relaxed);
2378 
2379 	lck_mtx_unlock(lock);
2380 
2381 	/*
2382 	 * Step 3: free the old array once readers can't observe the old values.
2383 	 */
2384 	smr_synchronize(smrht->domain);
2385 
2386 	smr_hash_free_array(oldarray, oldsize);
2387 	return KERN_SUCCESS;
2388 }
2389 
2390 kern_return_t
__smr_hash_grow_and_unlock(struct smr_hash * smrh,lck_mtx_t * lock,smrh_traits_t smrht)2391 __smr_hash_grow_and_unlock(
2392 	struct smr_hash        *smrh,
2393 	lck_mtx_t              *lock,
2394 	smrh_traits_t           smrht)
2395 {
2396 	struct smr_hash_array decptr = smr_hash_array_decode(smrh);
2397 	struct smrq_slist_head *newarray, *oldarray;
2398 	__smrq_slink_t **prevarray;
2399 	uint16_t neworder = decptr.smrh_order - 1;
2400 	size_t   oldsize  = smr_hash_size(decptr);
2401 	size_t   newsize  = 2 * oldsize;
2402 	bool     needs_another_round = false;
2403 
2404 	if (smrh->smrh_resizing) {
2405 		lck_mtx_unlock(lock);
2406 		return KERN_FAILURE;
2407 	}
2408 
2409 	smrh->smrh_resizing = true;
2410 	lck_mtx_unlock(lock);
2411 
2412 	newarray = smr_hash_alloc_array(newsize);
2413 	if (newarray == NULL) {
2414 		os_atomic_store(&smrh->smrh_resizing, false, relaxed);
2415 		return KERN_RESOURCE_SHORTAGE;
2416 	}
2417 
2418 	prevarray = kalloc_type(__smrq_slink_t *, newsize,
2419 	    Z_WAITOK | Z_ZERO | Z_SPRAYQTN);
2420 	if (prevarray == NULL) {
2421 		smr_hash_free_array(newarray, newsize);
2422 		os_atomic_store(&smrh->smrh_resizing, false, relaxed);
2423 		return KERN_RESOURCE_SHORTAGE;
2424 	}
2425 
2426 
2427 	lck_mtx_lock(lock);
2428 
2429 	/*
2430 	 * Step 1: create a duplicated array with twice as many heads.
2431 	 */
2432 	oldarray = decptr.smrh_array;
2433 
2434 	memcpy(newarray, oldarray, oldsize * sizeof(newarray[0]));
2435 	memcpy(newarray + oldsize, oldarray, oldsize * sizeof(newarray[0]));
2436 
2437 	/*
2438 	 * Step 2: Publish the new array, and wait for readers to observe it
2439 	 *         before we do any change.
2440 	 */
2441 	os_atomic_store(&smrh->smrh_array,
2442 	    smr_hash_array_encode(newarray, neworder), release);
2443 
2444 	smr_synchronize(smrht->domain);
2445 
2446 
2447 	/*
2448 	 * Step 3: split the lists.
2449 	 */
2450 
2451 	/*
2452 	 * If the list we are trying to split looked like this,
2453 	 * where L elements will go to the "left" bucket and "R"
2454 	 * to the right one:
2455 	 *
2456 	 *     old_head --> L1 --> L2                -> L5
2457 	 *                            \             /      \
2458 	 *                             -> R3 --> R4         -> R6 --> NULL
2459 	 *
2460 	 * Then make sure the new heads point to their legitimate first element,
2461 	 * leading to this state:
2462 	 *
2463 	 *     l_head   --> L1 --> L2                -> L5
2464 	 *                            \             /      \
2465 	 *     r_head   ----------------> R3 --> R4         -> R6 --> NULL
2466 	 *
2467 	 *
2468 	 *     prevarray[left]  = &L2->next
2469 	 *     prevarray[right] = &r_head
2470 	 *     oldarray[old]    = L2
2471 	 */
2472 
2473 	for (size_t i = 0; i < oldsize; i++) {
2474 		struct smrq_slink *link, *next;
2475 		uint32_t want_mask;
2476 
2477 		link = smr_serialized_load(&oldarray[i].first);
2478 		if (link == NULL) {
2479 			continue;
2480 		}
2481 
2482 		want_mask = smrht->obj_hash(link, 0) & oldsize;
2483 		while ((next = smr_serialized_load(&link->next)) &&
2484 		    (smrht->obj_hash(next, 0) & oldsize) == want_mask) {
2485 			link = next;
2486 		}
2487 
2488 		if (want_mask == 0) {
2489 			/* elements seen go to the "left" bucket */
2490 			prevarray[i] = &link->next;
2491 			prevarray[i + oldsize] = &newarray[i + oldsize].first;
2492 			smr_serialized_store_relaxed(prevarray[i + oldsize], next);
2493 		} else {
2494 			/* elements seen go to the "right" bucket */
2495 			prevarray[i] = &newarray[i].first;
2496 			prevarray[i + oldsize] = &link->next;
2497 			smr_serialized_store_relaxed(prevarray[i], next);
2498 		}
2499 
2500 		smr_serialized_store_relaxed(&oldarray[i].first,
2501 		    next ? link : NULL);
2502 
2503 		needs_another_round |= (next != NULL);
2504 	}
2505 
2506 	/*
2507 	 * At this point, when we split further, we must wait for
2508 	 * readers to observe the previous state before we split
2509 	 * further. Indeed, reusing the example above, the next
2510 	 * round of splitting would end up with this:
2511 	 *
2512 	 *     l_head   --> L1 --> L2 ----------------> L5
2513 	 *                                          /      \
2514 	 *     r_head   ----------------> R3 --> R4         -> R6 --> NULL
2515 	 *
2516 	 *
2517 	 *     prevarray[left]  = &L2->next
2518 	 *     prevarray[right] = &R4->next
2519 	 *     oldarray[old]    = R4
2520 	 *
2521 	 * But we must be sure that no readers can observe r_head
2522 	 * having been L1, otherwise a stale reader might skip over
2523 	 * R3/R4.
2524 	 *
2525 	 * Generally speaking we need to do that each time we do a round
2526 	 * of splitting that isn't terminating the list with NULL.
2527 	 */
2528 
2529 	while (needs_another_round) {
2530 		smr_synchronize(smrht->domain);
2531 
2532 		needs_another_round = false;
2533 
2534 		for (size_t i = 0; i < oldsize; i++) {
2535 			struct smrq_slink *link, *next;
2536 			uint32_t want_mask;
2537 
2538 			link = smr_serialized_load(&oldarray[i].first);
2539 			if (link == NULL) {
2540 				continue;
2541 			}
2542 
2543 			/*
2544 			 * If `prevarray[i]` (left) points to the linkage
2545 			 * we stopped at, then it means the next element
2546 			 * will be "to the right" and vice versa.
2547 			 *
2548 			 * We also already know "next" exists, so only probe
2549 			 * after it.
2550 			 */
2551 			if (prevarray[i] == &link->next) {
2552 				want_mask = (uint32_t)oldsize;
2553 			} else {
2554 				want_mask = 0;
2555 			}
2556 
2557 			link = smr_serialized_load(&link->next);
2558 
2559 			while ((next = smr_serialized_load(&link->next)) &&
2560 			    (smrht->obj_hash(next, 0) & oldsize) == want_mask) {
2561 				link = next;
2562 			}
2563 
2564 			if (want_mask == 0) {
2565 				/* elements seen go to the "left" bucket */
2566 				prevarray[i] = &link->next;
2567 				smr_serialized_store_relaxed(prevarray[i + oldsize], next);
2568 			} else {
2569 				/* elements seen go to the "right" bucket */
2570 				smr_serialized_store_relaxed(prevarray[i], next);
2571 				prevarray[i + oldsize] = &link->next;
2572 			}
2573 
2574 			smr_serialized_store_relaxed(&oldarray[i].first,
2575 			    next ? link : NULL);
2576 
2577 			needs_another_round |= (next != NULL);
2578 		}
2579 	}
2580 
2581 	smrh->smrh_resizing = false;
2582 	lck_mtx_unlock(lock);
2583 
2584 	/*
2585 	 * Step 4: cleanup, no need to wait for readers, this happened already
2586 	 *         at least once for splitting reasons.
2587 	 */
2588 	smr_hash_free_array(oldarray, oldsize);
2589 	kfree_type(__smrq_slink_t *, newsize, prevarray);
2590 	return KERN_SUCCESS;
2591 }
2592 
2593 #pragma mark SMR scalable hash tables
2594 
2595 #define SMRSH_MIGRATED  ((struct smrq_slink *)SMRSH_BUCKET_STOP_BIT)
2596 static LCK_GRP_DECLARE(smr_shash_grp, "smr_shash");
2597 
2598 static inline size_t
__smr_shash_min_size(struct smr_shash * smrh)2599 __smr_shash_min_size(struct smr_shash *smrh)
2600 {
2601 	return 1ul << smrh->smrsh_min_shift;
2602 }
2603 
2604 static inline size_t
__smr_shash_size_for_shift(uint8_t shift)2605 __smr_shash_size_for_shift(uint8_t shift)
2606 {
2607 	return (~0u >> shift) + 1;
2608 }
2609 
2610 static inline size_t
__smr_shash_cursize(smrsh_state_t state)2611 __smr_shash_cursize(smrsh_state_t state)
2612 {
2613 	return __smr_shash_size_for_shift(state.curshift);
2614 }
2615 
2616 static void
__smr_shash_bucket_init(hw_lck_ptr_t * head)2617 __smr_shash_bucket_init(hw_lck_ptr_t *head)
2618 {
2619 	hw_lck_ptr_init(head, __smr_shash_bucket_stop(head), &smr_shash_grp);
2620 }
2621 
2622 static void
__smr_shash_bucket_destroy(hw_lck_ptr_t * head)2623 __smr_shash_bucket_destroy(hw_lck_ptr_t *head)
2624 {
2625 	hw_lck_ptr_destroy(head, &smr_shash_grp);
2626 }
2627 
2628 __attribute__((noinline))
2629 void *
__smr_shash_entered_find_slow(const struct smr_shash * smrh,smrh_key_t key,hw_lck_ptr_t * head,smrh_traits_t traits)2630 __smr_shash_entered_find_slow(
2631 	const struct smr_shash *smrh,
2632 	smrh_key_t              key,
2633 	hw_lck_ptr_t           *head,
2634 	smrh_traits_t           traits)
2635 {
2636 	struct smrq_slink *link;
2637 	smrsh_state_t state;
2638 	uint32_t hash;
2639 
2640 	/* wait for the rehashing to be done into their target buckets */
2641 	hw_lck_ptr_wait_for_value(head, SMRSH_MIGRATED, &smr_shash_grp);
2642 
2643 	state = os_atomic_load(&smrh->smrsh_state, dependency);
2644 	hash  = __smr_shash_hash(smrh, state.newidx, key, traits);
2645 	head  = __smr_shash_bucket(smrh, state, SMRSH_NEW, hash);
2646 
2647 	link  = hw_lck_ptr_value(head);
2648 	while (!__smr_shash_is_stop(link)) {
2649 		if (traits->obj_equ(link, key)) {
2650 			return __smrht_link_to_obj(traits, link);
2651 		}
2652 		link = smr_entered_load(&link->next);
2653 	}
2654 
2655 	assert(link == __smr_shash_bucket_stop(head));
2656 	return NULL;
2657 }
2658 
2659 static const uint8_t __smr_shash_grow_ratio[] = {
2660 	[SMRSH_COMPACT]           = 6,
2661 	[SMRSH_BALANCED]          = 4,
2662 	[SMRSH_BALANCED_NOSHRINK] = 4,
2663 	[SMRSH_FASTEST]           = 2,
2664 };
2665 
2666 static inline uint64_t
__smr_shash_count(struct smr_shash * smrh)2667 __smr_shash_count(struct smr_shash *smrh)
2668 {
2669 	int64_t count = (int64_t)counter_load(&smrh->smrsh_count);
2670 
2671 	/*
2672 	 * negative values make no sense and is likely due to some
2673 	 * stale values being read.
2674 	 */
2675 	return count < 0 ? 0ull : (uint64_t)count;
2676 }
2677 
2678 static inline bool
__smr_shash_should_grow(struct smr_shash * smrh,smrsh_state_t state,uint64_t count)2679 __smr_shash_should_grow(
2680 	struct smr_shash       *smrh,
2681 	smrsh_state_t           state,
2682 	uint64_t                count)
2683 {
2684 	size_t size = __smr_shash_cursize(state);
2685 
2686 	/* grow if elem:bucket ratio is worse than grow_ratio:1 */
2687 	return count > __smr_shash_grow_ratio[smrh->smrsh_policy] * size;
2688 }
2689 
2690 static inline bool
__smr_shash_should_reseed(struct smr_shash * smrh,size_t observed_depth)2691 __smr_shash_should_reseed(
2692 	struct smr_shash       *smrh,
2693 	size_t                  observed_depth)
2694 {
2695 	return observed_depth > 10 * __smr_shash_grow_ratio[smrh->smrsh_policy];
2696 }
2697 
2698 static inline bool
__smr_shash_should_shrink(struct smr_shash * smrh,smrsh_state_t state,uint64_t count)2699 __smr_shash_should_shrink(
2700 	struct smr_shash       *smrh,
2701 	smrsh_state_t           state,
2702 	uint64_t                count)
2703 {
2704 	size_t size = __smr_shash_cursize(state);
2705 
2706 	switch (smrh->smrsh_policy) {
2707 	case SMRSH_COMPACT:
2708 		/* shrink if bucket:elem ratio is worse than 1:1 */
2709 		return size > count && size > __smr_shash_min_size(smrh);
2710 	case SMRSH_BALANCED:
2711 		/* shrink if bucket:elem ratio is worse than 2:1 */
2712 		return size > 2 * count && size > __smr_shash_min_size(smrh);
2713 	case SMRSH_BALANCED_NOSHRINK:
2714 	case SMRSH_FASTEST:
2715 		return false;
2716 	}
2717 }
2718 
2719 static inline void
__smr_shash_schedule_rehash(struct smr_shash * smrh,smrh_traits_t traits,smrsh_rehash_t reason)2720 __smr_shash_schedule_rehash(
2721 	struct smr_shash       *smrh,
2722 	smrh_traits_t           traits,
2723 	smrsh_rehash_t          reason)
2724 {
2725 	smrsh_rehash_t rehash;
2726 
2727 	rehash = os_atomic_load(&smrh->smrsh_rehashing, relaxed);
2728 	if (rehash & reason) {
2729 		return;
2730 	}
2731 
2732 	rehash = os_atomic_or_orig(&smrh->smrsh_rehashing, reason, relaxed);
2733 	if (!rehash) {
2734 		thread_call_enter1(smrh->smrsh_callout,
2735 		    __DECONST(void *, traits));
2736 	}
2737 }
2738 
2739 void *
__smr_shash_entered_get_or_insert(struct smr_shash * smrh,smrh_key_t key,struct smrq_slink * link,smrh_traits_t traits)2740 __smr_shash_entered_get_or_insert(
2741 	struct smr_shash       *smrh,
2742 	smrh_key_t              key,
2743 	struct smrq_slink      *link,
2744 	smrh_traits_t           traits)
2745 {
2746 	struct smrq_slink *first;
2747 	struct smrq_slink *other;
2748 	uint32_t hash, depth;
2749 	smrsh_state_t state;
2750 	hw_lck_ptr_t *head;
2751 	void *obj;
2752 
2753 	state = os_atomic_load(&smrh->smrsh_state, dependency);
2754 	hash  = __smr_shash_hash(smrh, state.curidx, key, traits);
2755 	head  = __smr_shash_bucket(smrh, state, SMRSH_CUR, hash);
2756 	first = hw_lck_ptr_lock(head, &smr_shash_grp);
2757 
2758 	if (__improbable(first == SMRSH_MIGRATED)) {
2759 		hw_lck_ptr_unlock_nopreempt(head, first, &smr_shash_grp);
2760 
2761 		state = os_atomic_load(&smrh->smrsh_state, dependency);
2762 		hash  = __smr_shash_hash(smrh, state.newidx, key, traits);
2763 		head  = __smr_shash_bucket(smrh, state, SMRSH_NEW, hash);
2764 		first = hw_lck_ptr_lock_nopreempt(head, &smr_shash_grp);
2765 	}
2766 
2767 	depth = 0;
2768 	other = first;
2769 	while (!__smr_shash_is_stop(other)) {
2770 		depth++;
2771 		if (traits->obj_equ(other, key)) {
2772 			obj = __smrht_link_to_obj(traits, other);
2773 			if (traits->obj_try_get(obj)) {
2774 				hw_lck_ptr_unlock(head, first,
2775 				    &smr_shash_grp);
2776 				return obj;
2777 			}
2778 			break;
2779 		}
2780 		other = smr_serialized_load(&other->next);
2781 	}
2782 
2783 	counter_inc_preemption_disabled(&smrh->smrsh_count);
2784 	smr_serialized_store_relaxed(&link->next, first);
2785 	hw_lck_ptr_unlock(head, link, &smr_shash_grp);
2786 
2787 	if (__smr_shash_should_reseed(smrh, depth)) {
2788 		__smr_shash_schedule_rehash(smrh, traits, SMRSH_REHASH_RESEED);
2789 	} else if (depth * 2 >= __smr_shash_grow_ratio[smrh->smrsh_policy] &&
2790 	    __smr_shash_should_grow(smrh, state, __smr_shash_count(smrh))) {
2791 		__smr_shash_schedule_rehash(smrh, traits, SMRSH_REHASH_GROW);
2792 	}
2793 	return NULL;
2794 }
2795 
2796 __abortlike
2797 static void
__smr_shash_missing_elt_panic(struct smr_shash * smrh,struct smrq_slink * link,smrh_traits_t traits)2798 __smr_shash_missing_elt_panic(
2799 	struct smr_shash        *smrh,
2800 	struct smrq_slink       *link,
2801 	smrh_traits_t           traits)
2802 {
2803 	panic("Unable to find item %p (linkage %p) in %p (traits %p)",
2804 	    __smrht_link_to_obj(traits, link), link, smrh, traits);
2805 }
2806 
2807 smr_shash_mut_cursor_t
__smr_shash_entered_mut_begin(struct smr_shash * smrh,struct smrq_slink * link,smrh_traits_t traits)2808 __smr_shash_entered_mut_begin(
2809 	struct smr_shash       *smrh,
2810 	struct smrq_slink      *link,
2811 	smrh_traits_t           traits)
2812 {
2813 	struct smrq_slink *first, *next;
2814 	__smrq_slink_t *prev;
2815 	smrsh_state_t state;
2816 	hw_lck_ptr_t *head;
2817 	uint32_t hash;
2818 
2819 	state = os_atomic_load(&smrh->smrsh_state, dependency);
2820 	hash  = __smr_shash_hash(smrh, state.curidx, link, traits);
2821 	head  = __smr_shash_bucket(smrh, state, SMRSH_CUR, hash);
2822 	first = hw_lck_ptr_lock(head, &smr_shash_grp);
2823 
2824 	if (__improbable(first == SMRSH_MIGRATED)) {
2825 		hw_lck_ptr_unlock_nopreempt(head, first, &smr_shash_grp);
2826 
2827 		state = os_atomic_load(&smrh->smrsh_state, dependency);
2828 		hash  = __smr_shash_hash(smrh, state.newidx, link, traits);
2829 		head  = __smr_shash_bucket(smrh, state, SMRSH_NEW, hash);
2830 		first = hw_lck_ptr_lock_nopreempt(head, &smr_shash_grp);
2831 	}
2832 
2833 	next = first;
2834 	while (next != link) {
2835 		if (__smr_shash_is_stop(next)) {
2836 			__smr_shash_missing_elt_panic(smrh, link, traits);
2837 		}
2838 		prev  = &next->next;
2839 		next  = smr_serialized_load(prev);
2840 	}
2841 
2842 	return (smr_shash_mut_cursor_t){ .head = head, .prev = prev };
2843 }
2844 
2845 void
__smr_shash_entered_mut_erase(struct smr_shash * smrh,smr_shash_mut_cursor_t cursor,struct smrq_slink * link,smrh_traits_t traits)2846 __smr_shash_entered_mut_erase(
2847 	struct smr_shash       *smrh,
2848 	smr_shash_mut_cursor_t  cursor,
2849 	struct smrq_slink      *link,
2850 	smrh_traits_t           traits)
2851 {
2852 	struct smrq_slink *next, *first;
2853 	smrsh_state_t state;
2854 
2855 	first = hw_lck_ptr_value(cursor.head);
2856 
2857 	next  = smr_serialized_load(&link->next);
2858 	if (first == link) {
2859 		counter_dec_preemption_disabled(&smrh->smrsh_count);
2860 		hw_lck_ptr_unlock(cursor.head, next, &smr_shash_grp);
2861 	} else {
2862 		smr_serialized_store_relaxed(cursor.prev, next);
2863 		counter_dec_preemption_disabled(&smrh->smrsh_count);
2864 		hw_lck_ptr_unlock(cursor.head, first, &smr_shash_grp);
2865 	}
2866 
2867 	state = atomic_load_explicit(&smrh->smrsh_state, memory_order_relaxed);
2868 	if (first == link && __smr_shash_is_stop(next) &&
2869 	    __smr_shash_should_shrink(smrh, state, __smr_shash_count(smrh))) {
2870 		__smr_shash_schedule_rehash(smrh, traits, SMRSH_REHASH_SHRINK);
2871 	}
2872 }
2873 
2874 void
__smr_shash_entered_mut_replace(smr_shash_mut_cursor_t cursor,struct smrq_slink * old_link,struct smrq_slink * new_link)2875 __smr_shash_entered_mut_replace(
2876 	smr_shash_mut_cursor_t  cursor,
2877 	struct smrq_slink      *old_link,
2878 	struct smrq_slink      *new_link)
2879 {
2880 	struct smrq_slink *first, *next;
2881 
2882 	first = hw_lck_ptr_value(cursor.head);
2883 
2884 	next  = smr_serialized_load(&old_link->next);
2885 	smr_serialized_store_relaxed(&new_link->next, next);
2886 	if (first == old_link) {
2887 		hw_lck_ptr_unlock(cursor.head, new_link, &smr_shash_grp);
2888 	} else {
2889 		smr_serialized_store_relaxed(cursor.prev, new_link);
2890 		hw_lck_ptr_unlock(cursor.head, first, &smr_shash_grp);
2891 	}
2892 }
2893 
2894 void
__smr_shash_entered_mut_abort(smr_shash_mut_cursor_t cursor)2895 __smr_shash_entered_mut_abort(smr_shash_mut_cursor_t cursor)
2896 {
2897 	hw_lck_ptr_unlock(cursor.head,
2898 	    hw_lck_ptr_value(cursor.head), &smr_shash_grp);
2899 }
2900 
2901 static kern_return_t
__smr_shash_rehash_with_target(struct smr_shash * smrh,smrsh_state_t state,uint8_t newshift,smrh_traits_t traits)2902 __smr_shash_rehash_with_target(
2903 	struct smr_shash       *smrh,
2904 	smrsh_state_t           state,
2905 	uint8_t                 newshift,
2906 	smrh_traits_t           traits)
2907 {
2908 	const size_t FLAT_SIZE = 256;
2909 	struct smrq_slink *flat_queue[FLAT_SIZE];
2910 
2911 	size_t oldsize, newsize;
2912 	hw_lck_ptr_t *oldarray;
2913 	hw_lck_ptr_t *newarray;
2914 	uint32_t newseed;
2915 	uint8_t oldidx;
2916 
2917 	/*
2918 	 * This function resizes a scalable hash table.
2919 	 *
2920 	 * It doesn't require a lock because it is the callout
2921 	 * of a THREAD_CALL_ONCE thread call.
2922 	 */
2923 
2924 	oldidx         = state.curidx;
2925 	state.newidx   = 1 - state.curidx;
2926 	state.newshift = newshift;
2927 	assert(__smr_shash_load_array(smrh, state.newidx) == NULL);
2928 
2929 	oldsize = __smr_shash_cursize(state);
2930 	newsize = __smr_shash_size_for_shift(newshift);
2931 
2932 	oldarray = __smr_shash_load_array(smrh, state.curidx);
2933 	newarray = (hw_lck_ptr_t *)smr_hash_alloc_array(newsize);
2934 	newseed  = (uint32_t)early_random();
2935 
2936 	if (newarray == NULL) {
2937 		return KERN_RESOURCE_SHORTAGE;
2938 	}
2939 
2940 	/*
2941 	 * Step 1: initialize the new array and seed,
2942 	 *         and then publish the state referencing it.
2943 	 *
2944 	 *         We do not need to synchronize explicitly with SMR,
2945 	 *         because readers/writers will notice rehashing when
2946 	 *         the bucket they interact with has a SMRSH_MIGRATED
2947 	 *         value.
2948 	 */
2949 
2950 	for (size_t i = 0; i < newsize; i++) {
2951 		__smr_shash_bucket_init(&newarray[i]);
2952 	}
2953 	os_atomic_store(&smrh->smrsh_array[state.newidx], newarray, relaxed);
2954 	os_atomic_store(&smrh->smrsh_seed[state.newidx], newseed, relaxed);
2955 	os_atomic_store(&smrh->smrsh_state, state, release);
2956 
2957 	/*
2958 	 * Step 2: migrate buckets "atomically" under the old bucket lock.
2959 	 *
2960 	 *         This migration is atomic for writers because
2961 	 *         they take the old bucket lock first, and if
2962 	 *         they observe SMRSH_MIGRATED as the value,
2963 	 *         go look in the new bucket instead.
2964 	 *
2965 	 *         This migration is atomic for readers, because
2966 	 *         as we move elements to their new buckets,
2967 	 *         the hash chains will not circle back to their
2968 	 *         bucket head (the "stop" value won't match),
2969 	 *         or the bucket head will be SMRSH_MIGRATED.
2970 	 *
2971 	 *         This causes a slowpath which spins waiting
2972 	 *         for SMRSH_MIGRATED to appear and then looks
2973 	 *         in the new bucket.
2974 	 */
2975 	for (size_t i = 0; i < oldsize; i++) {
2976 		struct smrq_slink *first, *link, *next;
2977 		hw_lck_ptr_t *head;
2978 		uint32_t hash;
2979 		size_t n = 0;
2980 
2981 		link = first = hw_lck_ptr_lock(&oldarray[i], &smr_shash_grp);
2982 
2983 		while (!__smr_shash_is_stop(link)) {
2984 			flat_queue[n++ % FLAT_SIZE] = link;
2985 			link = smr_serialized_load(&link->next);
2986 		}
2987 
2988 		while (n-- > 0) {
2989 			for (size_t j = (n % FLAT_SIZE) + 1; j-- > 0;) {
2990 				link = flat_queue[j];
2991 				hash = traits->obj_hash(link, newseed);
2992 				head = &newarray[hash >> newshift];
2993 				next = hw_lck_ptr_lock_nopreempt(head,
2994 				    &smr_shash_grp);
2995 				smr_serialized_store_relaxed(&link->next, next);
2996 				hw_lck_ptr_unlock_nopreempt(head, link,
2997 				    &smr_shash_grp);
2998 			}
2999 			n &= ~(FLAT_SIZE - 1);
3000 
3001 			/*
3002 			 * If there were more than FLAT_SIZE elements in the
3003 			 * chain (which is super unlikely and in many ways,
3004 			 * worrisome), then we need to repopoulate
3005 			 * the flattened queue array for each run.
3006 			 *
3007 			 * This is O(n^2) but we have worse problems anyway
3008 			 * if we ever hit this path.
3009 			 */
3010 			if (__improbable(n > 0)) {
3011 				link = first;
3012 				for (size_t j = 0; j < n - FLAT_SIZE; j++) {
3013 					link = smr_serialized_load(&link->next);
3014 				}
3015 
3016 				flat_queue[0] = link;
3017 				for (size_t j = 1; j < FLAT_SIZE; j++) {
3018 					link = smr_serialized_load(&link->next);
3019 					flat_queue[j] = link;
3020 				}
3021 			}
3022 		}
3023 
3024 		hw_lck_ptr_unlock(&oldarray[i], SMRSH_MIGRATED, &smr_shash_grp);
3025 	}
3026 
3027 	/*
3028 	 * Step 3: deallocate the old array of buckets,
3029 	 *         making sure to hide it from readers.
3030 	 */
3031 
3032 	state.curshift = state.newshift;
3033 	state.curidx   = state.newidx;
3034 	os_atomic_store(&smrh->smrsh_state, state, release);
3035 
3036 	smr_synchronize(traits->domain);
3037 
3038 	os_atomic_store(&smrh->smrsh_array[oldidx], NULL, relaxed);
3039 	for (size_t i = 0; i < oldsize; i++) {
3040 		__smr_shash_bucket_destroy(&oldarray[i]);
3041 	}
3042 	smr_hash_free_array((struct smrq_slist_head *)oldarray, oldsize);
3043 
3044 	return KERN_SUCCESS;
3045 }
3046 
3047 static void
__smr_shash_rehash(thread_call_param_t arg0,thread_call_param_t arg1)3048 __smr_shash_rehash(thread_call_param_t arg0, thread_call_param_t arg1)
3049 {
3050 	struct smr_shash *smrh   = arg0;
3051 	smrh_traits_t     traits = arg1;
3052 	smrsh_rehash_t    reason;
3053 	smrsh_state_t     state;
3054 	uint64_t          count;
3055 	kern_return_t     kr;
3056 
3057 	do {
3058 		reason = os_atomic_xchg(&smrh->smrsh_rehashing,
3059 		    SMRSH_REHASH_RUNNING, relaxed);
3060 
3061 		state  = os_atomic_load(&smrh->smrsh_state, relaxed);
3062 		count  = __smr_shash_count(smrh);
3063 
3064 		if (__smr_shash_should_grow(smrh, state, count)) {
3065 			kr = __smr_shash_rehash_with_target(smrh, state,
3066 			    state.curshift - 1, traits);
3067 		} else if (__smr_shash_should_shrink(smrh, state, count)) {
3068 			kr = __smr_shash_rehash_with_target(smrh, state,
3069 			    state.curshift + 1, traits);
3070 		} else if (reason & SMRSH_REHASH_RESEED) {
3071 			kr = __smr_shash_rehash_with_target(smrh, state,
3072 			    state.curshift, traits);
3073 		} else {
3074 			kr = KERN_SUCCESS;
3075 		}
3076 
3077 		if (kr == KERN_RESOURCE_SHORTAGE) {
3078 			uint64_t deadline;
3079 
3080 			os_atomic_or(&smrh->smrsh_rehashing, reason, relaxed);
3081 			nanoseconds_to_deadline(NSEC_PER_MSEC, &deadline);
3082 			thread_call_enter1_delayed(smrh->smrsh_callout,
3083 			    arg1, deadline);
3084 			break;
3085 		}
3086 	} while (!os_atomic_cmpxchg(&smrh->smrsh_rehashing,
3087 	    SMRSH_REHASH_RUNNING, SMRSH_REHASH_NONE, relaxed));
3088 }
3089 
3090 void
smr_shash_init(struct smr_shash * smrh,smrsh_policy_t policy,size_t min_size)3091 smr_shash_init(struct smr_shash *smrh, smrsh_policy_t policy, size_t min_size)
3092 {
3093 	smrsh_state_t state;
3094 	hw_lck_ptr_t *array;
3095 	uint8_t shift;
3096 	size_t size;
3097 
3098 	switch (policy) {
3099 	case SMRSH_COMPACT:
3100 		if (min_size < 2) {
3101 			min_size = 2;
3102 		}
3103 		break;
3104 	default:
3105 		if (min_size < 16) {
3106 			min_size = 16;
3107 		}
3108 		break;
3109 	}
3110 
3111 	switch (policy) {
3112 	case SMRSH_COMPACT:
3113 		size = MIN(2, min_size);
3114 		break;
3115 	case SMRSH_BALANCED:
3116 	case SMRSH_BALANCED_NOSHRINK:
3117 		size = MIN(16, min_size);
3118 		break;
3119 	case SMRSH_FASTEST:
3120 		size = min_size;
3121 		break;
3122 	}
3123 
3124 	if (size > KALLOC_SAFE_ALLOC_SIZE / sizeof(*array)) {
3125 		size = KALLOC_SAFE_ALLOC_SIZE / sizeof(*array);
3126 	}
3127 	shift = (uint8_t)__builtin_clz((uint32_t)(size - 1));
3128 	size  = (~0u >> shift) + 1;
3129 	array = (hw_lck_ptr_t *)smr_hash_alloc_array(size);
3130 	for (size_t i = 0; i < size; i++) {
3131 		__smr_shash_bucket_init(&array[i]);
3132 	}
3133 
3134 	state = (smrsh_state_t){
3135 		.curshift = shift,
3136 		.newshift = shift,
3137 	};
3138 	*smrh = (struct smr_shash){
3139 		.smrsh_array[0]  = array,
3140 		.smrsh_seed[0]   = (uint32_t)early_random(),
3141 		.smrsh_state     = state,
3142 		.smrsh_policy    = policy,
3143 		.smrsh_min_shift = (uint8_t)flsll(min_size - 1),
3144 	};
3145 	counter_alloc(&smrh->smrsh_count);
3146 	smrh->smrsh_callout  = thread_call_allocate_with_options(__smr_shash_rehash,
3147 	    smrh, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
3148 }
3149 
3150 void
3151 __smr_shash_destroy(
3152 	struct smr_shash       *smrh,
3153 	smrh_traits_t           traits,
3154 	void                  (^free)(void *))
3155 {
3156 	smrsh_state_t state;
3157 	hw_lck_ptr_t *array;
3158 	size_t size;
3159 
3160 	thread_call_cancel_wait(smrh->smrsh_callout);
3161 
3162 	state = os_atomic_load(&smrh->smrsh_state, dependency);
3163 	assert(state.curidx == state.newidx);
3164 	assert(__smr_shash_load_array(smrh, 1 - state.curidx) == NULL);
3165 	size  = __smr_shash_cursize(state);
3166 	array = __smr_shash_load_array(smrh, state.curidx);
3167 
3168 	if (free) {
3169 		for (size_t i = 0; i < size; i++) {
3170 			struct smrq_slink *link, *next;
3171 
3172 			next = hw_lck_ptr_value(&array[i]);
3173 			while (!__smr_shash_is_stop(next)) {
3174 				link = next;
3175 				next = smr_serialized_load(&link->next);
3176 				free(__smrht_link_to_obj(traits, link));
3177 			}
3178 		}
3179 	}
3180 	for (size_t i = 0; i < size; i++) {
3181 		__smr_shash_bucket_destroy(&array[i]);
3182 	}
3183 
3184 	thread_call_free(smrh->smrsh_callout);
3185 	counter_free(&smrh->smrsh_count);
3186 	smr_hash_free_array((struct smrq_slist_head *)array, size);
3187 	bzero(smrh, sizeof(*smrh));
3188 }
3189 
3190 
3191 #pragma mark misc
3192 
3193 void
__smr_linkage_invalid(__smrq_link_t * link)3194 __smr_linkage_invalid(__smrq_link_t *link)
3195 {
3196 	struct smrq_link *elem = __container_of(link, struct smrq_link, next);
3197 	struct smrq_link *next = smr_serialized_load(&elem->next);
3198 
3199 	panic("Invalid queue linkage: elt:%p next:%p next->prev:%p",
3200 	    elem, next, __container_of(next->prev, struct smrq_link, next));
3201 }
3202 
3203 void
__smr_stail_invalid(__smrq_slink_t * link,__smrq_slink_t * last)3204 __smr_stail_invalid(__smrq_slink_t *link, __smrq_slink_t *last)
3205 {
3206 	struct smrq_slink *elem = __container_of(link, struct smrq_slink, next);
3207 	struct smrq_slink *next = smr_serialized_load(&elem->next);
3208 
3209 	if (next) {
3210 		panic("Invalid queue tail (element past end): elt:%p elt->next:%p",
3211 		    elem, next);
3212 	} else {
3213 		panic("Invalid queue tail (early end): elt:%p tail:%p",
3214 		    elem, __container_of(last, struct smrq_slink, next));
3215 	}
3216 }
3217 
3218 void
__smr_tail_invalid(__smrq_link_t * link,__smrq_link_t * last)3219 __smr_tail_invalid(__smrq_link_t *link, __smrq_link_t *last)
3220 {
3221 	struct smrq_link *elem = __container_of(link, struct smrq_link, next);
3222 	struct smrq_link *next = smr_serialized_load(&elem->next);
3223 
3224 	if (next) {
3225 		panic("Invalid queue tail (element past end): elt:%p elt->next:%p",
3226 		    elem, next);
3227 	} else {
3228 		panic("Invalid queue tail (early end): elt:%p tail:%p",
3229 		    elem, __container_of(last, struct smrq_link, next));
3230 	}
3231 }
3232