1 /*
2  * Copyright (c) 2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <kern/locks_internal.h>
30 #include <kern/lock_ptr.h>
31 #include <kern/cpu_data.h>
32 #include <kern/machine.h>
33 #include <kern/mpsc_queue.h>
34 #include <kern/percpu.h>
35 #include <kern/sched.h>
36 #include <kern/smr.h>
37 #include <kern/smr_hash.h>
38 #include <kern/thread.h>
39 #include <kern/zalloc.h>
40 #include <machine/commpage.h>
41 #include <os/hash.h>
42 
43 
44 #pragma mark - SMR domains
45 
46 /*
47  * This SMR scheme is taken directly from FreeBSD's "Global Unbounded Sequences".
48  *
49  * Major differences are:
50  *
51  * - only eager clocks are implemented (no lazy, no implicit)
52  *
53  *
54  * SMR clocks have 3 state machines interacting at any given time:
55  *
56  * 1. reader critical sections
57  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~
58  *
59  * Each CPU can disable preemption and do this sequence:
60  *
61  *     CPU::c_rd_seq = GLOBAL::c_wr_seq;
62  *
63  *     < unfortunate place to receive a long IRQ >                      [I]
64  *
65  *     os_atomic_thread_fence(seq_cst);                                 [R1]
66  *
67  *     {
68  *         // critical section
69  *     }
70  *
71  *     os_atomic_store(&CPU::c_rd_seq, INVALID, release);               [R2]
72  *
73  *
74  *
75  * 2. writer sequence advances
76  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~
77  *
78  * Each writer can increment the global write sequence
79  * at any given time:
80  *
81  *    os_atomic_add(&GLOBAL::c_wr_seq, SMR_SEQ_INC, release);           [W]
82  *
83  *
84  *
85  * 3. synchronization sequence: poll/wait/scan
86  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
87  *
88  * This state machine synchronizes with the other two in order to decide
89  * if a given "goal" is in the past. Only the cases where the call
90  * succeeds are interesting for barrier purposes, and we will focus
91  * on cases that do not take an early return for failures.
92  *
93  * a. __smr_poll:
94  *
95  *     rd_seq = os_atomic_load(&GLOBAL::c_rd_seq, acquire);             [S1]
96  *     if (goal < rd_seq) SUCCESS.
97  *     wr_seq = os_atomic_load(&GLOBAL::c_wr_seq, relaxed);
98  *
99  * b. __smr_scan
100  *
101  *     os_atomic_thread_fence(seq_cst)                                  [S2]
102  *
103  *     observe the minimum CPU::c_rd_seq "min_rd_seq"
104  *     value possible, or wr_seq if no CPU was in a critical section.
105  *     (possibly spinning until it satisfies "goal")
106  *
107  * c. __smr_rd_advance
108  *
109  *     cur_rd_seq = load_exclusive(&GLOBAL::c_rd_seq);
110  *     os_atomic_thread_fence(seq_cst);                                 [S3]
111  *     if (min_rd_seq > cur_rd_seq) {
112  *         store_exclusive(&GLOBAL::c_rd_seq, min_rd_seq);
113  *     }
114  *
115  *
116  * One sentence summary
117  * ~~~~~~~~~~~~~~~~~~~~
118  *
119  * A simplistic one-sentence summary of the algorithm is that __smr_scan()
120  * works really hard to insert itself in the timeline of write sequences and
121  * observe a reasonable bound for the first safe-to-reclaim sequence, and
122  * issues [S3] to sequence everything around "c_rd_seq" (via [S3] -> [S1]):
123  *
124  *              GLOBAL::c_rd_seq                GLOBAL::c_wr_seq
125  *                             v                v
126  *       ──────────────────────┬────────────────┬─────────────────────
127  *       ... safe to reclaim   │    deferred    │   future         ...
128  *       ──────────────────────┴────────────────┴─────────────────────
129  *
130  *
131  * Detailed explanation
132  * ~~~~~~~~~~~~~~~~~~~~
133  *
134  * [W] -> [R1] establishes a "happens before" relationship between a given
135  * writer and this critical section. The loaded GLOBAL::c_wr_seq might
136  * however be stale with respect to the one [R1] really synchronizes with
137  * (see [I] explanation below).
138  *
139  *
140  * [R1] -> [S2] establishes a "happens before" relationship between all the
141  * active critical sections and the scanner.
142  * It lets us compute the oldest possible sequence pinned by an active
143  * critical section.
144  *
145  *
146  * [R2] -> [S3] establishes a "happens before" relationship between all the
147  * inactive critical sections and the scanner.
148  *
149  *
150  * [S3] -> [S1] is the typical expected fastpath: when the caller can decide
151  * that its goal is older than the last update an __smr_rd_advance() did.
152  * Note that [S3] doubles as an "[S1]" when two __smr_scan() race each other
153  * and one of them finishes last but observed a "worse" read sequence.
154  *
155  *
156  * [W], [S3] -> [S1] is the last crucial property: all updates to the global
157  * clock are totally ordered because they update the entire 128bit state
158  * every time with an RMW. This guarantees that __smr_poll() can't load
159  * an `rd_seq` that is younger than the `wr_seq` it loads next.
160  *
161  *
162  * [I] __smr_enter() can also, unfortunately, be delayed after observing
163  * a given write sequence and right before issuing [R1], at point [I].
164  *
165  * However, for a read sequence to have moved past what __smr_enter() observed,
166  * it means another __smr_scan() didn't observe the store to CPU::c_rd_seq
167  * made by __smr_enter() and thought the section was inactive.
168  *
169  * This can only happen if the scan's [S2] was issued before the delayed
170  * __smr_enter() [R1] (during the [I] window).
171  *
172  * As a consequence the outcome of that scan can be accepted as the "real"
173  * write sequence __smr_enter() should have observed.
174  *
175  *
176  * Litmus tests
177  * ~~~~~~~~~~~~
178  *
179  * This is the proof of [W] -> [R1] -> [S2] being established properly:
180  * - P0 sets a global and calls smr_synchronize()
181  * - P1 does smr_enter() and loads the global
182  *
183  *     AArch64 MP
184  *     {
185  *         global = 0;
186  *         wr_seq = 123;
187  *         p1_rd_seq = 0;
188  *
189  *         0:x0 = global; 0:x1 = wr_seq; 0:x2 = p1_rd_seq;
190  *         1:x0 = global; 1:x1 = wr_seq; 1:x2 = p1_rd_seq;
191  *     }
192  *      P0                     | P1                         ;
193  *      MOV      X8, #2        | LDR        X8, [X1]        ;
194  *      STR      X8, [X0]      | STR        X8, [X2]        ;
195  *      LDADDL   X8, X9, [X1]  | DMB        SY              ;
196  *      DMB      SY            | LDR        X10, [X0]       ;
197  *      LDR      X10, [X2]     |                            ;
198  *     exists (0:X10 = 0 /\ 1:X8 = 123 /\ 1:X10 = 0)
199  *
200  *
201  * This is the proof that deferred advances are also correct:
202  * - P0 sets a global and does a smr_deferred_advance()
203  * - P1 does an smr_synchronize() and reads the global
204  *
205  *     AArch64 MP
206  *     {
207  *         global = 0;
208  *         wr_seq = 123;
209  *
210  *         0:x0 = global; 0:x1 = wr_seq; 0:x2 = 2;
211  *         1:x0 = global; 1:x1 = wr_seq; 1:x2 = 2;
212  *     }
213  *      P0                     | P1                         ;
214  *      STR      X2, [X0]      | LDADDL     X2, X9, [X1]    ;
215  *      DMB      SY            | DMB        SY              ;
216  *      LDR      X9, [X1]      | LDR        X10, [X0]       ;
217  *      ADD      X9, X9, X2    |                            ;
218  *     exists (0:X9 = 125 /\ 1:X9 = 123 /\ 1:X10 = 0)
219  *
220  */
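
/*
 * Worked example
 * ~~~~~~~~~~~~~~
 *
 * An illustrative reading of the timeline above; the numbers are made up
 * and only assume, for the sake of the example, that sequences advance by
 * SMR_SEQ_INC == 2.
 *
 * With GLOBAL::c_rd_seq == 10 and GLOBAL::c_wr_seq == 14:
 *
 * - a goal of 8 is already "safe to reclaim": __smr_poll() succeeds on
 *   its [S1] fastpath without scanning,
 * - a goal of 12 is "deferred": __smr_scan() must look at every
 *   CPU::c_rd_seq, and may advance GLOBAL::c_rd_seq if no active reader
 *   still pins a sequence older than 12,
 * - a goal of 16 (== c_wr_seq + SMR_SEQ_INC) is a deferred advance:
 *   a waiting __smr_scan() first commits it to the global clock
 *   before scanning.
 */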
221 
222 /*!
223  * @struct smr_worker
224  *
225  * @brief
226  * Structure tracking the per-cpu SMR worker state.
227  *
228  * @discussion
229  * This structure is system-wide and global, and is used to track
230  * the various active SMR domains at the granularity of a CPU.
231  *
232  * Each structure has an associated thread which is responsible
233  * for the forward progress of the @c smr_call() and @c smr_barrier()
234  * interfaces.
235  *
236  * It also tracks all the active, non-stalled, sleepable SMR sections.
237  */
238 struct smr_worker {
239 	/*
240 	 * The thread for this worker,
241 	 * and a convenience pointer to the processor it is bound to.
242 	 */
243 	struct thread          *thread;
244 	struct processor       *processor;
245 
246 	/*
247 	 * Thread binding/locking logic:
248 	 *
249 	 * If the worker thread is running on its canonical CPU,
250 	 * then locking to access the various SMR per-cpu data
251 	 * structures it is draining is just preemption disablement.
252 	 *
253 	 * However, if it is currently not bound to its canonical
254 	 * CPU because the CPU has been offlined or de-recommended,
255 	 * then a lock which serializes with the CPU going online
256 	 * again is being used.
257 	 */
258 	struct waitq            waitq;
259 	smr_cpu_reason_t        detach_reason;
260 
261 #if CONFIG_QUIESCE_COUNTER
262 	/*
263 	 * Currently active quiescent generation for this processor,
264 	 * and the last timestamp when a scan of all cores was performed.
265 	 */
266 	smr_seq_t               rd_quiesce_seq;
267 #endif
268 
269 	/*
270 	 * List of all the active sleepable sections that haven't
271 	 * been stalled.
272 	 */
273 	struct smrq_list_head   sect_queue;
274 	struct thread          *sect_waiter;
275 
276 	/*
277 	 * Queue of SMR domains with pending smr_call()
278 	 * callouts to drain.
279 	 *
280 	 * This uses an ageing strategy in order to amortize
281 	 * SMR clock updates:
282 	 *
283 	 * - the "old" queue has domains whose callbacks have
284 	 *   a committed and aged sequence,
285 	 * - the "age" queue has domains whose callbacks have
286 	 *   a committed but fresh sequence and need ageing,
287 	 * - the "cur" queue has domains whose callbacks have
288 	 *   a sequence in the future which needs to be committed.
289 	 */
290 	struct smr_pcpu        *whead;
291 	struct smr_pcpu       **wold_tail;
292 	struct smr_pcpu       **wage_tail;
293 	struct smr_pcpu       **wcur_tail;
294 	uint64_t                drain_ctime;
295 
296 	/*
297 	 * Queue of smr_barrier() calls in flight,
298 	 * that will be picked up by the worker thread
299 	 * to enqueue as smr_call() entries in their
300 	 * respective per-CPU data structures.
301 	 */
302 	struct mpsc_queue_head  barrier_queue;
303 } __attribute__((aligned(64)));
304 
305 
306 typedef struct smr_pcpu {
307 	/*
308 	 * CPU private cacheline.
309 	 *
310 	 * Nothing other than the CPU this state is made for
311 	 * ever writes to this cacheline.
312 	 *
313 	 * It holds the epoch activity witness (rd_seq), and
314 	 * the local smr_call() queue, which is structured this way:
315 	 *
316 	 *     head -> n1 -> n2 -> n3 -> n4 -> ... -> ni -> ... -> nN -> NULL
317 	 *                            ^            ^                  ^
318 	 *     qold_tail -------------'            |                  |
319 	 *     qage_tail --------------------------'                  |
320 	 *     qcur_tail ---------------------------------------------'
321 	 *
322 	 * - the "old" queue can be reclaimed once qold_seq is past,
323 	 *   qold_seq is always a committed sequence.
324 	 * - the "age" queue can be reclaimed once qage_seq is past,
325 	 *   qage_seq might not be committed yet.
326 	 * - the "cur" queue has an approximate size of qcur_size bytes,
327 	 *   and a length of qcur_cnt callbacks.
328 	 */
329 
330 	smr_seq_t               c_rd_seq; /* might have SMR_SEQ_SLEEPABLE set */
331 
332 	smr_node_t              qhead;
333 
334 	smr_seq_t               qold_seq;
335 	smr_node_t             *qold_tail;
336 
337 	smr_seq_t               qage_seq;
338 	smr_node_t             *qage_tail;
339 
340 	uint32_t                qcur_size;
341 	uint32_t                qcur_cnt;
342 	smr_node_t             *qcur_tail;
343 
344 	uint8_t                 __cacheline_sep[0];
345 
346 	/*
347 	 * Drain queue.
348 	 *
349 	 * This is used to drive smr_call() via the smr worker threads.
350 	 * If the SMR domain is not using smr_call() or smr_barrier(),
351 	 * this isn't used.
352 	 */
353 	struct smr             *drain_smr;
354 	struct smr_pcpu        *drain_next;
355 	uint16_t                __check_cpu;
356 	uint8_t                 __check_reason;
357 	uint8_t                 __check_list;
358 
359 	/*
360 	 * Stalled queue.
361 	 *
362 	 * Stalled sections are enqueued onto this queue by the scheduler
363 	 * when their thread blocks (see smr_mark_active_trackers_stalled()).
364 	 *
365 	 * If the SMR domain is not sleepable, then this isn't used.
366 	 *
367 	 * This list is protected by a lock.
368 	 *
369 	 * When there are stalled sections, stall_rd_seq contains
370 	 * the oldest active stalled sequence number.
371 	 *
372 	 * When threads want to expedite a stalled section, they set
373 	 * stall_waiter_goal to the sequence number they are waiting
374 	 * for and block via turnstile on the oldest stalled section.
375 	 */
376 	hw_lck_ticket_t         stall_lock;
377 	smr_seq_t               stall_rd_seq;
378 	smr_seq_t               stall_waiter_goal;
379 	struct smrq_tailq_head  stall_queue;
380 	struct turnstile       *stall_ts;
381 } __attribute__((aligned(128))) * smr_pcpu_t;
382 
383 static_assert(offsetof(struct smr_pcpu, __cacheline_sep) == 64);
384 static_assert(sizeof(struct smr_pcpu) == 128);
385 
386 #define CPU_CHECKIN_MIN_INTERVAL_US     5000         /* 5ms */
387 #define CPU_CHECKIN_MIN_INTERVAL_MAX_US USEC_PER_SEC /* 1s */
388 static uint64_t cpu_checkin_min_interval;
389 static uint32_t cpu_checkin_min_interval_us;
390 
391 /*! the amount of memory pending retirement that causes a forceful flush */
392 #if XNU_TARGET_OS_OSX
393 static TUNABLE(vm_size_t, smr_call_size_cap, "smr_call_size_cap", 256 << 10);
394 static TUNABLE(vm_size_t, smr_call_cnt_cap, "smr_call_cnt_cap", 128);
395 #else
396 static TUNABLE(vm_size_t, smr_call_size_cap, "smr_call_size_cap", 64 << 10);
397 static TUNABLE(vm_size_t, smr_call_cnt_cap, "smr_call_cnt_cap", 32);
398 #endif
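
/*
 * Note: the TUNABLE values in this file can be overridden with boot-args of
 * the same name, e.g. "smr_call_size_cap=131072" (illustrative value only).
 */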
399 /* time __smr_wait_for_oncore busy-spins before taking the expensive route */
400 static TUNABLE(uint32_t, smr_wait_spin_us, "smr_wait_spin_us", 20);
401 
402 static LCK_GRP_DECLARE(smr_lock_grp, "smr");
403 static struct smr_worker PERCPU_DATA(smr_worker);
404 static struct smrq_tailq_head smr_domains = SMRQ_TAILQ_INITIALIZER(smr_domains);
405 
406 SMR_DEFINE_FLAGS(smr_system, "system", SMR_NONE);
407 SMR_DEFINE_FLAGS(smr_system_sleepable, "system (sleepable)", SMR_SLEEPABLE);
408 
409 
410 #pragma mark SMR domains: init & helpers
411 
412 #define SMR_PCPU_NOT_QUEUED     ((struct smr_pcpu *)-1)
413 
414 __attribute__((always_inline, overloadable))
415 static inline smr_pcpu_t
416 __smr_pcpu(smr_t smr, int cpu)
417 {
418 	return &smr->smr_pcpu[cpu];
419 }
420 
421 __attribute__((always_inline, overloadable))
422 static inline smr_pcpu_t
423 __smr_pcpu(smr_t smr)
424 {
425 	return __smr_pcpu(smr, cpu_number());
426 }
427 
428 static inline bool
429 __smr_pcpu_queued(smr_pcpu_t pcpu)
430 {
431 	return pcpu->drain_next != SMR_PCPU_NOT_QUEUED;
432 }
433 
434 static inline void
435 __smr_pcpu_set_not_queued(smr_pcpu_t pcpu)
436 {
437 	pcpu->drain_next = SMR_PCPU_NOT_QUEUED;
438 }
439 
440 static inline void
441 __smr_pcpu_associate(smr_t smr, smr_pcpu_t pcpu)
442 {
443 	zpercpu_foreach_cpu(cpu) {
444 		pcpu[cpu].qold_tail = &pcpu[cpu].qhead;
445 		pcpu[cpu].qage_tail = &pcpu[cpu].qhead;
446 		pcpu[cpu].qcur_tail = &pcpu[cpu].qhead;
447 
448 		pcpu[cpu].drain_smr = smr;
449 		__smr_pcpu_set_not_queued(&pcpu[cpu]);
450 		hw_lck_ticket_init(&pcpu[cpu].stall_lock, &smr_lock_grp);
451 		smrq_init(&pcpu[cpu].stall_queue);
452 	}
453 
454 	os_atomic_store(&smr->smr_pcpu, pcpu, release);
455 }
456 
457 static inline event64_t
458 __smrw_oncore_event(struct smr_worker *smrw)
459 {
460 	return CAST_EVENT64_T(&smrw->sect_queue);
461 }
462 
463 static inline event64_t
464 __smrw_drain_event(struct smr_worker *smrw)
465 {
466 	return CAST_EVENT64_T(&smrw->whead);
467 }
468 
469 static inline processor_t
470 __smrw_drain_bind_target(struct smr_worker *smrw)
471 {
472 	return smrw->detach_reason ? PROCESSOR_NULL : smrw->processor;
473 }
474 
475 static inline void
476 __smrw_lock(struct smr_worker *smrw)
477 {
478 	waitq_lock(&smrw->waitq);
479 }
480 
481 static inline void
482 __smrw_unlock(struct smr_worker *smrw)
483 {
484 	waitq_unlock(&smrw->waitq);
485 }
486 
487 /*!
488  * @function __smrw_wakeup_and_unlock()
489  *
490  * @brief
491  * Wakes up (with binding) the SMR worker.
492  *
493  * @discussion
494  * Wakeup the worker thread and bind it to the proper processor
495  * as a side effect.
496  *
497  * This function must be called with interrupts disabled.
498  */
499 static bool
500 __smrw_wakeup_and_unlock(struct smr_worker *smrw)
501 {
502 	thread_t thread;
503 
504 	assert(!ml_get_interrupts_enabled());
505 
506 	thread = waitq_wakeup64_identify_locked(&smrw->waitq,
507 	    __smrw_drain_event(smrw), WAITQ_UNLOCK);
508 
509 	if (thread != THREAD_NULL) {
510 		assert(thread == smrw->thread);
511 
512 		waitq_resume_and_bind_identified_thread(&smrw->waitq,
513 		    thread, __smrw_drain_bind_target(smrw),
514 		    THREAD_AWAKENED, WAITQ_WAKEUP_DEFAULT);
515 	}
516 
517 	return thread != THREAD_NULL;
518 }
519 
520 static void
521 __smr_call_drain(smr_node_t head)
522 {
523 	smr_node_t node;
524 
525 	while ((node = head) != NULL) {
526 		head = node->smrn_next;
527 		node->smrn_next = NULL;
528 		node->smrn_cb(node);
529 	}
530 }
531 
532 __startup_func
533 void
534 __smr_domain_init(smr_t smr)
535 {
536 	smr_pcpu_t pcpu;
537 	vm_size_t size;
538 
539 	if (startup_phase < STARTUP_SUB_TUNABLES) {
540 		smr_seq_t *rd_seqp = &smr->smr_early;
541 
542 		/*
543 		 * This is a big cheat, but before the EARLY_BOOT phase,
544 		 * all smr_* APIs that would access past the rd_seq
545 		 * will early return.
546 		 */
547 		pcpu = __container_of(rd_seqp, struct smr_pcpu, c_rd_seq);
548 		smr->smr_pcpu = pcpu - cpu_number();
549 		assert(&__smr_pcpu(smr)->c_rd_seq == &smr->smr_early);
550 	} else {
551 		size = zpercpu_count() * sizeof(struct smr_pcpu);
552 		pcpu = zalloc_permanent(size, ZALIGN(struct smr_pcpu));
553 
554 		__smr_pcpu_associate(smr, pcpu);
555 	}
556 }
557 
558 smr_t
559 smr_domain_create(smr_flags_t flags, const char *name)
560 {
561 	smr_pcpu_t pcpu;
562 	smr_t smr;
563 
564 	smr  = kalloc_type(struct smr, Z_WAITOK | Z_ZERO | Z_NOFAIL);
565 	pcpu = kalloc_type(struct smr_pcpu, zpercpu_count(),
566 	    Z_WAITOK | Z_ZERO | Z_NOFAIL);
567 
568 	smr->smr_clock.s_rd_seq = SMR_SEQ_INIT;
569 	smr->smr_clock.s_wr_seq = SMR_SEQ_INIT;
570 	smr->smr_flags = flags;
571 	static_assert(sizeof(struct smr) ==
572 	    offsetof(struct smr, smr_name) + SMR_NAME_MAX);
573 	strlcpy(smr->smr_name, name, sizeof(smr->smr_name));
574 
575 	__smr_pcpu_associate(smr, pcpu);
576 
577 	return smr;
578 }
579 
580 void
581 smr_domain_free(smr_t smr)
582 {
583 	smr_barrier(smr);
584 
585 	zpercpu_foreach_cpu(cpu) {
586 		smr_pcpu_t pcpu = __smr_pcpu(smr, cpu);
587 
588 		assert(pcpu->qhead == NULL);
589 		hw_lck_ticket_destroy(&pcpu->stall_lock, &smr_lock_grp);
590 	}
591 
592 	kfree_type(struct smr_pcpu, zpercpu_count(), smr->smr_pcpu);
593 	kfree_type(struct smr, smr);
594 }
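
/*
 * Example lifecycle of a dynamically created domain, as a sketch only
 * (the "my_table" name is hypothetical):
 *
 *     smr_t smr = smr_domain_create(SMR_NONE, "my_table");
 *
 *     // readers bracket accesses with smr_enter(smr) / smr_leave(smr),
 *     // writers retire objects with smr_call(smr, ...) or block in
 *     // smr_synchronize(smr) before freeing them.
 *
 *     smr_domain_free(smr);  // smr_barrier() above flushes pending calls
 */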
595 
596 
597 #pragma mark SMR domains: enter / leave
598 
599 bool
600 smr_entered(smr_t smr)
601 {
602 	thread_t self = current_thread();
603 	smr_tracker_t t;
604 
605 	if (lock_preemption_level_for_thread(self) &&
606 	    __smr_pcpu(smr)->c_rd_seq != SMR_SEQ_INVALID) {
607 		return true;
608 	}
609 
610 	if (smr->smr_flags & SMR_SLEEPABLE) {
611 		smrq_serialized_foreach(t, &self->smr_stack, smrt_stack) {
612 			if (t->smrt_domain == smr) {
613 				return true;
614 			}
615 		}
616 	}
617 
618 	return false;
619 }
620 
621 __attribute__((always_inline))
622 bool
623 smr_entered_cpu_noblock(smr_t smr, int cpu)
624 {
625 	assert((smr->smr_flags & SMR_SLEEPABLE) == 0);
626 	return __smr_pcpu(smr, cpu)->c_rd_seq != SMR_SEQ_INVALID;
627 }
628 
629 __attribute__((always_inline))
630 static smr_seq_t
631 __smr_enter(smr_t smr, smr_pcpu_t pcpu, smr_seq_t sleepable)
632 {
633 	smr_seq_t  s_wr_seq;
634 	smr_seq_t  old_seq;
635 
636 	assert(!ml_at_interrupt_context());
637 
638 	/*
639 	 * It is possible to have a long delay between loading the s_wr_seq
640 	 * and storing it to the percpu copy of it.
641 	 *
642 	 * It is unlikely but possible that, by that time, the s_rd_seq advances
643 	 * ahead of what we will store. This however is still safe
644 	 * and handled in __smr_scan().
645 	 *
646 	 * On Intel, to achieve the ordering we want, we could use a store
647 	 * followed by an mfence, or any RMW (XCHG, XADD, CMPXCHG, ...).
648 	 * XADD is just the fastest instruction of the alternatives,
649 	 * but it will only ever add to '0'.
650 	 */
651 	s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
652 #if __x86_64__
653 	/* [R1] */
654 	old_seq = os_atomic_add_orig(&pcpu->c_rd_seq, s_wr_seq | sleepable, seq_cst);
655 #else
656 	old_seq = pcpu->c_rd_seq;
657 	os_atomic_store(&pcpu->c_rd_seq, s_wr_seq | sleepable, relaxed);
658 	os_atomic_thread_fence(seq_cst); /* [R1] */
659 #endif
660 	assert(old_seq == SMR_SEQ_INVALID);
661 
662 	return s_wr_seq;
663 }
664 
665 __attribute__((always_inline))
666 static void
667 __smr_leave(smr_pcpu_t pcpu)
668 {
669 	assert(!ml_at_interrupt_context());
670 	/* [R2] */
671 	os_atomic_store(&pcpu->c_rd_seq, SMR_SEQ_INVALID, release);
672 }
673 
674 __attribute__((always_inline))
675 void
676 smr_enter(smr_t smr)
677 {
678 	disable_preemption();
679 	__smr_enter(smr, __smr_pcpu(smr), 0);
680 }
681 
682 __attribute__((always_inline))
683 void
684 smr_leave(smr_t smr)
685 {
686 	__smr_leave(__smr_pcpu(smr));
687 	enable_preemption();
688 }
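
/*
 * A minimal reader-side sketch, assuming a non-sleepable domain `smr`
 * and a published pointer `head` (both hypothetical, not part of this
 * file):
 *
 *     struct example *e;
 *
 *     smr_enter(smr);
 *     e = os_atomic_load(&head, acquire);
 *     if (e != NULL) {
 *         // read-only use of *e; the section must not block
 *     }
 *     smr_leave(smr);
 *
 * Writers unpublish `e` first and then call smr_synchronize(smr)
 * (or use smr_call()) before freeing it.
 */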
689 
690 void
691 smr_enter_sleepable(smr_t smr, smr_tracker_t tracker)
692 {
693 	thread_t self = current_thread();
694 	struct smr_worker *smrw;
695 	smr_pcpu_t pcpu;
696 
697 	assert(smr->smr_flags & SMR_SLEEPABLE);
698 
699 	lock_disable_preemption_for_thread(self);
700 	lck_rw_lock_count_inc(self, smr);
701 
702 	pcpu = __smr_pcpu(smr);
703 	smrw = PERCPU_GET(smr_worker);
704 
705 	tracker->smrt_domain = smr;
706 	tracker->smrt_seq    = __smr_enter(smr, pcpu, SMR_SEQ_SLEEPABLE);
707 	smrq_serialized_insert_head_relaxed(&smrw->sect_queue, &tracker->smrt_link);
708 	smrq_serialized_insert_head_relaxed(&self->smr_stack, &tracker->smrt_stack);
709 	tracker->smrt_ctid   = 0;
710 	tracker->smrt_cpu    = -1;
711 
712 	lock_enable_preemption();
713 }
714 
715 __attribute__((always_inline))
716 static void
717 __smr_wake_oncore_sleepers(struct smr_worker *smrw)
718 {
719 	/*
720 	 * Prevent reordering between making the list empty and checking for waiters.
721 	 */
722 	if (__improbable(os_atomic_load(&smrw->sect_waiter, compiler_acq_rel))) {
723 		if (smrq_empty(&smrw->sect_queue)) {
724 			os_atomic_store(&smrw->sect_waiter, NULL, relaxed);
725 			waitq_wakeup64_all(&smrw->waitq,
726 			    __smrw_oncore_event(smrw), THREAD_AWAKENED,
727 			    WAITQ_WAKEUP_DEFAULT);
728 		}
729 	}
730 }
731 
732 void
733 smr_ack_ipi(void)
734 {
735 	/*
736 	 * see __smr_wait_for_oncore(): if at the time of the IPI ack
737 	 * the list is empty and there is still a waiter, wake it up.
738 	 *
739 	 * If the queue is not empty, then when smr_leave_sleepable()
740 	 * runs it can't possibly fail to observe smrw->sect_waiter
741 	 * being non NULL and will do the wakeup then.
742 	 */
743 	__smr_wake_oncore_sleepers(PERCPU_GET(smr_worker));
744 }
745 
746 void
747 smr_mark_active_trackers_stalled(thread_t self)
748 {
749 	struct smr_worker *smrw = PERCPU_GET(smr_worker);
750 	int cpu = cpu_number();
751 	smr_tracker_t t;
752 
753 	/* called at splsched */
754 
755 	smrq_serialized_foreach_safe(t, &smrw->sect_queue, smrt_link) {
756 		smr_t smr = t->smrt_domain;
757 		smr_pcpu_t pcpu;
758 
759 		pcpu = __smr_pcpu(smr, cpu);
760 
761 		t->smrt_ctid = self->ctid;
762 		t->smrt_cpu  = cpu;
763 
764 		hw_lck_ticket_lock_nopreempt(&pcpu->stall_lock, &smr_lock_grp);
765 
766 		/*
767 		 * Transfer the section to the stalled queue,
768 		 * and _then_ leave the regular one.
769 		 *
770 		 * A store-release is sufficient to order these stores,
771 		 * and guarantee that __smr_scan() can't fail to observe
772 		 * both the @c rd_seq and @c stall_rd_seq during a transfer
773 		 * of a stalled section that was active when it started.
774 		 */
775 		if (smrq_empty(&pcpu->stall_queue)) {
776 			os_atomic_store(&pcpu->stall_rd_seq, t->smrt_seq, relaxed);
777 		}
778 		os_atomic_store(&pcpu->c_rd_seq, SMR_SEQ_INVALID, release);
779 
780 		smrq_serialized_insert_tail_relaxed(&pcpu->stall_queue, &t->smrt_link);
781 
782 		hw_lck_ticket_unlock_nopreempt(&pcpu->stall_lock);
783 	}
784 
785 	smrq_init(&smrw->sect_queue);
786 
787 	__smr_wake_oncore_sleepers(smrw);
788 }
789 
790 
791 __attribute__((noinline))
792 static void
793 __smr_leave_stalled(smr_t smr, smr_tracker_t tracker, thread_t self)
794 {
795 	smr_seq_t new_stall_seq = SMR_SEQ_INVALID;
796 	smr_tracker_t first = NULL;
797 	smr_pcpu_t pcpu;
798 	bool progress;
799 
800 	pcpu = __smr_pcpu(smr, tracker->smrt_cpu);
801 
802 	hw_lck_ticket_lock_nopreempt(&pcpu->stall_lock, &smr_lock_grp);
803 
804 	progress = smrq_serialized_first(&pcpu->stall_queue,
805 	    struct smr_tracker, smrt_link) == tracker;
806 
807 	smrq_serialized_remove(&self->smr_stack, &tracker->smrt_stack);
808 	smrq_serialized_remove(&pcpu->stall_queue, &tracker->smrt_link);
809 	bzero(tracker, sizeof(*tracker));
810 
811 	if (progress) {
812 		if (!smrq_empty(&pcpu->stall_queue)) {
813 			first = smrq_serialized_first(&pcpu->stall_queue,
814 			    struct smr_tracker, smrt_link);
815 			new_stall_seq = first->smrt_seq;
816 			__builtin_assume(new_stall_seq != SMR_SEQ_INVALID);
817 			assert(SMR_SEQ_CMP(pcpu->stall_rd_seq, <=, new_stall_seq));
818 		}
819 
820 		os_atomic_store(&pcpu->stall_rd_seq, new_stall_seq, release);
821 
822 		progress = pcpu->stall_waiter_goal != SMR_SEQ_INVALID;
823 	}
824 
825 	if (progress) {
826 		struct turnstile *ts;
827 
828 		ts = turnstile_prepare((uintptr_t)pcpu, &pcpu->stall_ts,
829 		    TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
830 
831 		if (new_stall_seq == SMR_SEQ_INVALID ||
832 		    SMR_SEQ_CMP(pcpu->stall_waiter_goal, <=, new_stall_seq)) {
833 			pcpu->stall_waiter_goal = SMR_SEQ_INVALID;
834 			waitq_wakeup64_all(&ts->ts_waitq, CAST_EVENT64_T(pcpu),
835 			    THREAD_AWAKENED, WAITQ_UPDATE_INHERITOR);
836 		} else {
837 			turnstile_update_inheritor(ts, ctid_get_thread(first->smrt_ctid),
838 			    TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD);
839 		}
840 
841 		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
842 
843 		turnstile_complete((uintptr_t)pcpu, &pcpu->stall_ts,
844 		    NULL, TURNSTILE_KERNEL_MUTEX);
845 	}
846 
847 	/* reenables preemption disabled in smr_leave_sleepable() */
848 	hw_lck_ticket_unlock(&pcpu->stall_lock);
849 
850 	turnstile_cleanup();
851 }
852 
853 void
854 smr_leave_sleepable(smr_t smr, smr_tracker_t tracker)
855 {
856 	struct smr_worker *smrw;
857 	thread_t self = current_thread();
858 
859 	assert(tracker->smrt_seq != SMR_SEQ_INVALID);
860 	assert(smr->smr_flags & SMR_SLEEPABLE);
861 
862 	lock_disable_preemption_for_thread(self);
863 
864 	lck_rw_lock_count_dec(self, smr);
865 
866 	if (__improbable(tracker->smrt_cpu != -1)) {
867 		return __smr_leave_stalled(smr, tracker, self);
868 	}
869 
870 	__smr_leave(__smr_pcpu(smr));
871 
872 	smrw = PERCPU_GET(smr_worker);
873 	smrq_serialized_remove(&self->smr_stack, &tracker->smrt_stack);
874 	smrq_serialized_remove(&smrw->sect_queue, &tracker->smrt_link);
875 	bzero(tracker, sizeof(*tracker));
876 
877 	__smr_wake_oncore_sleepers(PERCPU_GET(smr_worker));
878 
879 	lock_enable_preemption();
880 }
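
/*
 * A minimal sleepable-section sketch, assuming `smr` was created with
 * SMR_SLEEPABLE (the on-stack tracker and its name are hypothetical):
 *
 *     struct smr_tracker trk;
 *
 *     smr_enter_sleepable(smr, &trk);
 *     // the section may block; if the thread does block, the scheduler
 *     // moves `trk` to the per-cpu stalled queue,
 *     // see smr_mark_active_trackers_stalled().
 *     smr_leave_sleepable(smr, &trk);
 */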
881 
882 
883 #pragma mark SMR domains: advance, wait, poll, synchronize
884 
885 static inline smr_seq_t
886 __smr_wr_advance(smr_t smr)
887 {
888 	/* [W] */
889 	return os_atomic_add(&smr->smr_clock.s_wr_seq, SMR_SEQ_INC, release);
890 }
891 
892 static inline bool
893 __smr_rd_advance(smr_t smr, smr_seq_t goal, smr_seq_t rd_seq)
894 {
895 	smr_seq_t o_seq;
896 
897 	os_atomic_thread_fence(seq_cst); /* [S3] */
898 
899 	os_atomic_rmw_loop(&smr->smr_clock.s_rd_seq, o_seq, rd_seq, relaxed, {
900 		if (SMR_SEQ_CMP(rd_seq, <=, o_seq)) {
901 		        rd_seq = o_seq;
902 		        os_atomic_rmw_loop_give_up(break);
903 		}
904 	});
905 
906 	return SMR_SEQ_CMP(goal, <=, rd_seq);
907 }
908 
909 __attribute__((noinline))
910 static smr_seq_t
911 __smr_wait_for_stalled(smr_pcpu_t pcpu, smr_seq_t goal)
912 {
913 	struct turnstile *ts;
914 	thread_t inheritor;
915 	wait_result_t wr;
916 	smr_seq_t stall_rd_seq;
917 
918 	hw_lck_ticket_lock(&pcpu->stall_lock, &smr_lock_grp);
919 
920 	stall_rd_seq = pcpu->stall_rd_seq;
921 	if (stall_rd_seq == SMR_SEQ_INVALID ||
922 	    SMR_SEQ_CMP(goal, <=, stall_rd_seq)) {
923 		hw_lck_ticket_unlock(&pcpu->stall_lock);
924 		return stall_rd_seq;
925 	}
926 
927 	if (pcpu->stall_waiter_goal == SMR_SEQ_INVALID ||
928 	    SMR_SEQ_CMP(goal, <, pcpu->stall_waiter_goal)) {
929 		pcpu->stall_waiter_goal = goal;
930 	}
931 
932 	inheritor = ctid_get_thread(smrq_serialized_first(&pcpu->stall_queue,
933 	    struct smr_tracker, smrt_link)->smrt_ctid);
934 
935 	ts = turnstile_prepare((uintptr_t)pcpu, &pcpu->stall_ts,
936 	    TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
937 
938 	turnstile_update_inheritor(ts, inheritor,
939 	    TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD);
940 	wr = waitq_assert_wait64(&ts->ts_waitq, CAST_EVENT64_T(pcpu),
941 	    THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
942 	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
943 
944 	if (wr == THREAD_WAITING) {
945 		hw_lck_ticket_unlock(&pcpu->stall_lock);
946 		thread_block(THREAD_CONTINUE_NULL);
947 		hw_lck_ticket_lock(&pcpu->stall_lock, &smr_lock_grp);
948 	}
949 
950 	turnstile_complete((uintptr_t)pcpu, &pcpu->stall_ts,
951 	    NULL, TURNSTILE_KERNEL_MUTEX);
952 
953 	stall_rd_seq = pcpu->stall_rd_seq;
954 	hw_lck_ticket_unlock(&pcpu->stall_lock);
955 
956 	turnstile_cleanup();
957 
958 	return stall_rd_seq;
959 }
960 
961 __attribute__((noinline))
962 static smr_seq_t
963 __smr_wait_for_oncore(smr_pcpu_t pcpu, smr_seq_t goal, uint32_t cpu)
964 {
965 	thread_t self = current_thread();
966 	struct smr_worker *smrw;
967 	uint64_t deadline = 0;
968 	vm_offset_t base;
969 	smr_seq_t rd_seq;
970 
971 	/*
972 	 * We are waiting for a currently active SMR section.
973 	 * Start spin-waiting for it for a bit.
974 	 */
975 	for (;;) {
976 		if (hw_spin_wait_until(&pcpu->c_rd_seq, rd_seq,
977 		    rd_seq == SMR_SEQ_INVALID || SMR_SEQ_CMP(goal, <=, rd_seq))) {
978 			return rd_seq;
979 		}
980 
981 		if (deadline == 0) {
982 			clock_interval_to_deadline(smr_wait_spin_us,
983 			    NSEC_PER_USEC, &deadline);
984 		} else if (mach_absolute_time() > deadline) {
985 			break;
986 		}
987 	}
988 
989 	/*
990 	 * This section has been active for a while,
991 	 * so we need to move to a more passive way of waiting.
992 	 *
993 	 * We post ourselves on the remote processor tracking head,
994 	 * to denote we need a thread_wakeup() when the tracker head clears,
995 	 * then send an IPI which will have 2 possible outcomes:
996 	 *
997 	 * 1. when smr_ack_ipi() runs, the queue is already cleared,
998 	 *    and we will be woken up immediately.
999 	 *
1000 	 * 2. when smr_ack_ipi() runs, the queue isn't cleared,
1001 	 *    then it does nothing, but there is a guarantee that
1002 	 *    when the queue clears, the remote core will observe
1003 	 *    that there is a waiter, and thread_wakeup() will be
1004 	 *    called then.
1005 	 *
1006 	 * In order to avoid actually waiting, we spin some more,
1007 	 * hoping for the remote sequence to change.
1008 	 */
1009 	base = other_percpu_base(cpu);
1010 	smrw = PERCPU_GET_WITH_BASE(base, smr_worker);
1011 
1012 	waitq_assert_wait64(&smrw->waitq, __smrw_oncore_event(smrw),
1013 	    THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
1014 
1015 	if (lock_cmpxchg(&smrw->sect_waiter, NULL, self, relaxed)) {
1016 		/*
1017 		 * only really send the IPI if we're first,
1018 		 * to avoid IPI storms in case of a pile-up
1019 		 * of smr_synchronize() calls stalled on the same guy.
1020 		 */
1021 		cause_ast_check(PERCPU_GET_WITH_BASE(base, processor));
1022 	}
1023 
1024 	if (hw_spin_wait_until(&pcpu->c_rd_seq, rd_seq,
1025 	    rd_seq == SMR_SEQ_INVALID || SMR_SEQ_CMP(goal, <=, rd_seq))) {
1026 		clear_wait(self, THREAD_AWAKENED);
1027 		return rd_seq;
1028 	}
1029 
1030 	thread_block(THREAD_CONTINUE_NULL);
1031 
1032 	return os_atomic_load(&pcpu->c_rd_seq, relaxed);
1033 }
1034 
1035 __attribute__((noinline))
1036 static bool
1037 __smr_scan(smr_t smr, smr_seq_t goal, smr_clock_t clk, bool wait)
1038 {
1039 	smr_delta_t delta;
1040 	smr_seq_t rd_seq;
1041 
1042 	if (__improbable(startup_phase < STARTUP_SUB_EARLY_BOOT)) {
1043 		return true;
1044 	}
1045 
1046 	/*
1047 	 * Validate that the goal is sane.
1048 	 */
1049 	delta = SMR_SEQ_DELTA(goal, clk.s_wr_seq);
1050 	if (delta == SMR_SEQ_INC) {
1051 		/*
1052 		 * This SMR clock uses deferred advance,
1053 		 * and the goal is one inc in the future.
1054 		 *
1055 		 * If we can wait, then commit the sequence number,
1056 		 * else we can't possibly succeed.
1057 		 *
1058 		 * Doing a commit here rather than an advance
1059 		 * gives the hardware a chance to abort the
1060 		 * transaction early in case of high contention
1061 		 * compared to an unconditional advance.
1062 		 */
1063 		if (!wait) {
1064 			return false;
1065 		}
1066 		if (lock_cmpxchgv(&smr->smr_clock.s_wr_seq,
1067 		    clk.s_wr_seq, goal, &clk.s_wr_seq, relaxed)) {
1068 			clk.s_wr_seq = goal;
1069 		}
1070 	} else if (delta > 0) {
1071 		/*
1072 		 * Invalid goal: the caller held on it for too long,
1073 		 * and integers wrapped.
1074 		 */
1075 		return true;
1076 	}
1077 
1078 	os_atomic_thread_fence(seq_cst); /* [S2] */
1079 
1080 	/*
1081 	 * The read sequence can be no larger than the write sequence
1082 	 * at the start of the poll.
1083 	 *
1084 	 * We know that on entry:
1085 	 *
1086 	 *     s_rd_seq < goal <= s_wr_seq
1087 	 *
1088 	 * The correctness of this algorithm relies on the fact that
1089 	 * the SMR domain [s_rd_seq, s_wr_seq) can't possibly move
1090 	 * by more than roughly (ULONG_MAX / 2) while __smr_scan()
1091 	 * is running, otherwise the "rd_seq" we try to scan for
1092 	 * might appear larger than s_rd_seq spuriously and we'd
1093 	 * __smr_rd_advance() incorrectly.
1094 	 *
1095 	 * This is guaranteed by the fact that this represents
1096 	 * advancing 2^62 times. At one advance every nanosecond,
1097 	 * it takes more than a century, which makes it possible
1098 	 * to call smr_wait() or smr_poll() with preemption enabled.
1099 	 */
1100 	rd_seq = clk.s_wr_seq;
1101 
1102 	zpercpu_foreach_cpu(cpu) {
1103 		smr_pcpu_t pcpu = __smr_pcpu(smr, cpu);
1104 		smr_seq_t seq   = os_atomic_load(&pcpu->c_rd_seq, relaxed);
1105 
1106 		while (seq != SMR_SEQ_INVALID) {
1107 			/*
1108 			 * Resolve the race documented in __smr_enter().
1109 			 *
1110 			 * The CPU has loaded a stale s_wr_seq, and s_rd_seq
1111 			 * moved past this stale value.
1112 			 *
1113 			 * Its critical section is however properly serialized,
1114 			 * but we can't know what the "correct" s_wr_seq it
1115 			 * could have observed was. We have to assume `s_rd_seq`
1116 			 * to prevent it from advancing.
1117 			 */
1118 			if (SMR_SEQ_CMP(seq, <, clk.s_rd_seq)) {
1119 				seq = clk.s_rd_seq;
1120 			}
1121 
1122 			if (!wait || SMR_SEQ_CMP(goal, <=, seq)) {
1123 				seq &= ~SMR_SEQ_SLEEPABLE;
1124 				break;
1125 			}
1126 
1127 			if (seq & SMR_SEQ_SLEEPABLE) {
1128 				seq = __smr_wait_for_oncore(pcpu, goal, cpu);
1129 			} else {
1130 				disable_preemption();
1131 				seq = hw_wait_while_equals_long(&pcpu->c_rd_seq, seq);
1132 				enable_preemption();
1133 			}
1134 		}
1135 
1136 		if (seq != SMR_SEQ_INVALID && SMR_SEQ_CMP(seq, <, rd_seq)) {
1137 			rd_seq = seq;
1138 		}
1139 	}
1140 
1141 	if (smr->smr_flags & SMR_SLEEPABLE) {
1142 		/*
1143 		 * Order observation of stalled sections,
1144 		 * see smr_mark_active_trackers_stalled().
1145 		 */
1146 		os_atomic_thread_fence(seq_cst);
1147 
1148 		zpercpu_foreach_cpu(cpu) {
1149 			smr_pcpu_t pcpu = __smr_pcpu(smr, cpu);
1150 			smr_seq_t  seq  = os_atomic_load(&pcpu->stall_rd_seq, relaxed);
1151 
1152 			while (seq != SMR_SEQ_INVALID) {
1153 				if (SMR_SEQ_CMP(seq, <, clk.s_rd_seq)) {
1154 					seq = clk.s_rd_seq;
1155 				}
1156 
1157 				if (!wait || SMR_SEQ_CMP(goal, <=, seq)) {
1158 					seq &= ~SMR_SEQ_SLEEPABLE;
1159 					break;
1160 				}
1161 
1162 				seq = __smr_wait_for_stalled(pcpu, goal);
1163 			}
1164 
1165 			if (seq != SMR_SEQ_INVALID && SMR_SEQ_CMP(seq, <, rd_seq)) {
1166 				rd_seq = seq;
1167 			}
1168 		}
1169 	}
1170 
1171 	/*
1172 	 * Advance the rd_seq as long as we observed a more recent value.
1173 	 */
1174 	return __smr_rd_advance(smr, goal, rd_seq);
1175 }
1176 
1177 static inline bool
1178 __smr_poll(smr_t smr, smr_seq_t goal, bool wait)
1179 {
1180 	smr_clock_t clk;
1181 
1182 	/*
1183 	 * Load both the s_rd_seq and s_wr_seq in the right order so that we
1184 	 * can't observe an s_rd_seq newer than the s_wr_seq we load next.
1185 	 */
1186 
1187 	/* [S1] */
1188 	clk.s_rd_seq = os_atomic_load(&smr->smr_clock.s_rd_seq, acquire);
1189 
1190 	/*
1191 	 * We expect this to be typical: the goal has already been observed.
1192 	 */
1193 	if (__probable(SMR_SEQ_CMP(goal, <=, clk.s_rd_seq))) {
1194 		return true;
1195 	}
1196 
1197 	clk.s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
1198 
1199 	return __smr_scan(smr, goal, clk, wait);
1200 }
1201 
1202 smr_seq_t
1203 smr_advance(smr_t smr)
1204 {
1205 	smr_clock_t clk;
1206 
1207 	assert(!smr_entered(smr));
1208 
1209 	/*
1210 	 * We assume that there will at least be a successful __smr_poll
1211 	 * call every 2^60 calls to smr_advance() or so, so we do not need
1212 	 * to check if [s_rd_seq, s_wr_seq) is growing too wide.
1213 	 */
1214 	static_assert(sizeof(clk.s_wr_seq) == 8);
1215 	return __smr_wr_advance(smr);
1216 }
1217 
1218 smr_seq_t
1219 smr_deferred_advance(smr_t smr)
1220 {
1221 	os_atomic_thread_fence(seq_cst);
1222 	return SMR_SEQ_INC + os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
1223 }
1224 
1225 void
1226 smr_deferred_advance_commit(smr_t smr, smr_seq_t seq)
1227 {
1228 	/*
1229 	 * no barrier needed: smr_deferred_advance() had one already.
1230 	 * no failure handling: it means someone updated the clock already!
1231 	 * lock_cmpxchg: so that we pre-test for architectures needing it.
1232 	 */
1233 	assert(seq != SMR_SEQ_INVALID);
1234 	lock_cmpxchg(&smr->smr_clock.s_wr_seq, seq - SMR_SEQ_INC, seq, relaxed);
1235 }
1236 
1237 bool
1238 smr_poll(smr_t smr, smr_seq_t goal)
1239 {
1240 	assert(!smr_entered(smr) && goal != SMR_SEQ_INVALID);
1241 	return __smr_poll(smr, goal, false);
1242 }
1243 
1244 void
1245 smr_wait(smr_t smr, smr_seq_t goal)
1246 {
1247 	assert(!smr_entered(smr) && goal != SMR_SEQ_INVALID);
1248 	if (smr->smr_flags & SMR_SLEEPABLE) {
1249 		assert(get_preemption_level() == 0);
1250 	}
1251 	(void)__smr_poll(smr, goal, true);
1252 }
1253 
1254 void
1255 smr_synchronize(smr_t smr)
1256 {
1257 	smr_clock_t clk;
1258 
1259 	assert(!smr_entered(smr));
1260 	assert(!ml_at_interrupt_context());
1261 	if (smr->smr_flags & SMR_SLEEPABLE) {
1262 		assert(get_preemption_level() == 0);
1263 	}
1264 
1265 	/*
1266 	 * Similar to __smr_poll() but also does a deferred advance which
1267 	 * __smr_scan will commit.
1268 	 */
1269 
1270 	clk.s_rd_seq = os_atomic_load(&smr->smr_clock.s_rd_seq, relaxed);
1271 	os_atomic_thread_fence(seq_cst);
1272 	clk.s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
1273 
1274 	(void)__smr_scan(smr, clk.s_wr_seq + SMR_SEQ_INC, clk, true);
1275 }
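
/*
 * Illustrative retirement patterns built on the primitives above
 * (sketches only; `smr`, `obj` and its `obj_seq` field are hypothetical):
 *
 *     // 1. blocking: wait for all pre-existing readers, then reclaim
 *     smr_synchronize(smr);
 *     kfree_type(struct example, obj);
 *
 *     // 2. non-blocking: tag the object with a goal, poll it later
 *     obj->obj_seq = smr_advance(smr);
 *     ...
 *     if (smr_poll(smr, obj->obj_seq)) {
 *         kfree_type(struct example, obj);
 *     }
 *
 *     // 3. deferred advance: pick a goal without dirtying the global
 *     //    clock, commit it before waiting on it
 *     obj->obj_seq = smr_deferred_advance(smr);
 *     ...
 *     smr_deferred_advance_commit(smr, obj->obj_seq);
 *     smr_wait(smr, obj->obj_seq);
 */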
1276 
1277 
1278 #pragma mark SMR domains: smr_call & smr_barrier
1279 
1280 /*!
1281  * @struct smr_barrier_ctx
1282  *
1283  * @brief
1284  * Data structure to track the completion of an smr_barrier() call.
1285  */
1286 struct smr_barrier_ctx {
1287 	struct smr             *smrb_domain;
1288 	struct thread          *smrb_waiter;
1289 	uint32_t                smrb_pending;
1290 	uint32_t                smrb_count;
1291 };
1292 
1293 /*!
1294  * @struct smr_barrier_job
1295  *
1296  * @brief
1297  * Data structure used to track completion of smr_barrier() calls.
1298  */
1299 struct smr_barrier_job {
1300 	struct smr_barrier_ctx *smrj_context;
1301 	union {
1302 		struct smr_node smrj_node;
1303 		struct mpsc_queue_chain smrj_link;
1304 	};
1305 };
1306 
1307 #define SMR_BARRIER_SIZE        24
1308 static_assert(sizeof(struct smr_barrier_job) == SMR_BARRIER_SIZE);
1309 #define SMR_BARRIER_USE_STACK   (SMR_BARRIER_SIZE * MAX_CPUS <= 512)
1310 
1311 static void
1312 __smr_worker_check_invariants(struct smr_worker *smrw)
1313 {
1314 #if MACH_ASSERT
1315 	smr_pcpu_t pcpu = smrw->whead;
1316 	uint16_t num = (uint16_t)cpu_number();
1317 
1318 	assert(!ml_get_interrupts_enabled() || get_preemption_level());
1319 
1320 	for (; pcpu != *smrw->wold_tail; pcpu = pcpu->drain_next) {
1321 		assertf(pcpu->qold_seq != SMR_SEQ_INVALID &&
1322 		    __smr_pcpu_queued(pcpu),
1323 		    "pcpu %p doesn't belong on %p old queue", pcpu, smrw);
1324 		pcpu->__check_cpu = num;
1325 		pcpu->__check_reason = (uint8_t)smrw->detach_reason;
1326 		pcpu->__check_list = 1;
1327 	}
1328 
1329 	for (; pcpu != *smrw->wage_tail; pcpu = pcpu->drain_next) {
1330 		__assert_only smr_t smr = pcpu->drain_smr;
1331 
1332 		assertf(pcpu->qold_seq == SMR_SEQ_INVALID &&
1333 		    pcpu->qage_seq != SMR_SEQ_INVALID &&
1334 		    SMR_SEQ_CMP(pcpu->qage_seq, <=, smr->smr_clock.s_wr_seq) &&
1335 		    __smr_pcpu_queued(pcpu),
1336 		    "pcpu %p doesn't belong on %p aging queue", pcpu, smrw);
1337 		pcpu->__check_cpu = num;
1338 		pcpu->__check_reason = (uint8_t)smrw->detach_reason;
1339 		pcpu->__check_list = 2;
1340 	}
1341 
1342 	for (; pcpu != *smrw->wcur_tail; pcpu = pcpu->drain_next) {
1343 		assertf(pcpu->qold_seq == SMR_SEQ_INVALID &&
1344 		    pcpu->qage_seq != SMR_SEQ_INVALID &&
1345 		    __smr_pcpu_queued(pcpu),
1346 		    "pcpu %p doesn't belong on %p current queue", pcpu, smrw);
1347 		pcpu->__check_cpu = num;
1348 		pcpu->__check_reason = (uint8_t)smrw->detach_reason;
1349 		pcpu->__check_list = 3;
1350 	}
1351 
1352 	assert(pcpu == NULL);
1353 #else
1354 	(void)smrw;
1355 #endif
1356 }
1357 
1358 __attribute__((noinline))
1359 static void
1360 __smr_cpu_lazy_up(struct smr_worker *smrw)
1361 {
1362 	spl_t spl;
1363 
1364 	/*
1365 	 * Calling smr_call()/smr_barrier() from the context of a CPU
1366 	 * with a detached worker is illegal.
1367 	 *
1368 	 * However, bound threads might run on a derecommended (IGNORED)
1369 	 * cpu which we correct for here (and the CPU will go back to IGNORED
1370 	 * in smr_cpu_leave()).
1371 	 */
1372 	assert(smrw->detach_reason == SMR_CPU_REASON_IGNORED);
1373 
1374 	spl = splsched();
1375 	__smrw_lock(smrw);
1376 	smrw->detach_reason &= ~SMR_CPU_REASON_IGNORED;
1377 	__smrw_unlock(smrw);
1378 	splx(spl);
1379 }
1380 
1381 static void
1382 __smr_cpu_lazy_up_if_needed(struct smr_worker *smrw)
1383 {
1384 	if (__improbable(smrw->detach_reason != SMR_CPU_REASON_NONE)) {
1385 		__smr_cpu_lazy_up(smrw);
1386 	}
1387 }
1388 
1389 static bool
1390 __smr_call_should_advance(smr_pcpu_t pcpu)
1391 {
1392 	if (pcpu->qcur_cnt > smr_call_cnt_cap) {
1393 		return true;
1394 	}
1395 	if (pcpu->qcur_size > smr_call_size_cap) {
1396 		return true;
1397 	}
1398 	return false;
1399 }
1400 
1401 static void
1402 __smr_call_advance_qcur(smr_t smr, smr_pcpu_t pcpu, bool needs_commit)
1403 {
1404 	smr_seq_t new_seq;
1405 
1406 	if (needs_commit || pcpu->qage_seq) {
1407 		new_seq = smr_advance(smr);
1408 	} else {
1409 		new_seq = smr_deferred_advance(smr);
1410 	}
1411 	__builtin_assume(new_seq != SMR_SEQ_INVALID);
1412 
1413 	pcpu->qage_seq  = new_seq;
1414 	pcpu->qage_tail = pcpu->qcur_tail;
1415 
1416 	pcpu->qcur_size = 0;
1417 	pcpu->qcur_cnt  = 0;
1418 }
1419 
1420 static void
1421 __smr_call_push(smr_pcpu_t pcpu, smr_node_t node, smr_cb_t cb)
1422 {
1423 	assert(pcpu->c_rd_seq == SMR_SEQ_INVALID);
1424 
1425 	node->smrn_next  = NULL;
1426 	node->smrn_cb    = cb;
1427 
1428 	*pcpu->qcur_tail = node;
1429 	pcpu->qcur_tail  = &node->smrn_next;
1430 	pcpu->qcur_cnt  += 1;
1431 }
1432 
1433 static void
1434 __smr_call_dispatch(struct smr_worker *smrw, smr_pcpu_t pcpu)
1435 {
1436 	__smr_worker_check_invariants(smrw);
1437 
1438 	if (!__smr_pcpu_queued(pcpu)) {
1439 		assert(pcpu->qold_seq == SMR_SEQ_INVALID);
1440 		assert(pcpu->qage_seq != SMR_SEQ_INVALID);
1441 
1442 		pcpu->drain_next   = NULL;
1443 		*smrw->wcur_tail   = pcpu;
1444 		smrw->wcur_tail    = &pcpu->drain_next;
1445 	}
1446 }
1447 
1448 void
1449 smr_call(smr_t smr, smr_node_t node, vm_size_t size, smr_cb_t cb)
1450 {
1451 	struct smr_worker *smrw;
1452 	smr_pcpu_t pcpu;
1453 
1454 	if (__improbable(startup_phase < STARTUP_SUB_EARLY_BOOT)) {
1455 		return cb(node);
1456 	}
1457 
1458 	lock_disable_preemption_for_thread(current_thread());
1459 	assert(!ml_at_interrupt_context());
1460 
1461 	smrw = PERCPU_GET(smr_worker);
1462 	__smr_cpu_lazy_up_if_needed(smrw);
1463 
1464 	pcpu = __smr_pcpu(smr);
1465 	assert(pcpu->c_rd_seq == SMR_SEQ_INVALID);
1466 
1467 	if (os_add_overflow(pcpu->qcur_size, size, &pcpu->qcur_size)) {
1468 		pcpu->qcur_size = UINT32_MAX;
1469 	}
1470 
1471 	__smr_call_push(pcpu, node, cb);
1472 	if (__smr_call_should_advance(pcpu)) {
1473 		if (pcpu->qage_seq == SMR_SEQ_INVALID) {
1474 			__smr_call_advance_qcur(smr, pcpu, false);
1475 		}
1476 		__smr_call_dispatch(smrw, pcpu);
1477 	}
1478 
1479 	return lock_enable_preemption();
1480 }
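
/*
 * A minimal smr_call() sketch: the smr_node is embedded in the object
 * being retired and the callback recovers it with __container_of()
 * (the `struct example` type and `example_free` callback are
 * hypothetical):
 *
 *     struct example {
 *         // ... payload ...
 *         struct smr_node ex_node;
 *     };
 *
 *     static void
 *     example_free(struct smr_node *node)
 *     {
 *         struct example *e;
 *
 *         e = __container_of(node, struct example, ex_node);
 *         kfree_type(struct example, e);
 *     }
 *
 *     // after unpublishing `e` from the data structure:
 *     smr_call(smr, &e->ex_node, sizeof(*e), example_free);
 */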
1481 
1482 static inline event_t
1483 __smrb_event(struct smr_barrier_ctx *ctx)
1484 {
1485 	return ctx;
1486 }
1487 
1488 static void
1489 __smr_barrier_cb(struct smr_node *node)
1490 {
1491 	struct smr_barrier_job *job;
1492 	struct smr_barrier_ctx *ctx;
1493 
1494 	job = __container_of(node, struct smr_barrier_job, smrj_node);
1495 	ctx = job->smrj_context;
1496 
1497 	if (os_atomic_dec(&ctx->smrb_pending, relaxed) == 0) {
1498 		/*
1499 		 * It is permitted to still reach into the context
1500 		 * because smr_barrier() always blocks, which means
1501 		 * that the context will be valid until this wakeup
1502 		 * happens.
1503 		 */
1504 		thread_wakeup_thread(__smrb_event(ctx), ctx->smrb_waiter);
1505 	}
1506 }
1507 
1508 static bool
1509 __smr_barrier_drain(struct smr_worker *smrw, bool needs_commit)
1510 {
1511 	mpsc_queue_chain_t head, tail, it;
1512 
1513 	head = mpsc_queue_dequeue_batch(&smrw->barrier_queue, &tail,
1514 	    OS_ATOMIC_DEPENDENCY_NONE);
1515 
1516 	mpsc_queue_batch_foreach_safe(it, head, tail) {
1517 		struct smr_barrier_job *job;
1518 		struct smr_barrier_ctx *ctx;
1519 		smr_pcpu_t pcpu;
1520 		smr_t smr;
1521 
1522 		job  = __container_of(it, struct smr_barrier_job, smrj_link);
1523 		ctx  = job->smrj_context;
1524 		smr  = ctx->smrb_domain;
1525 		pcpu = __smr_pcpu(smr, smrw->processor->cpu_id);
1526 
1527 		pcpu->qcur_size = UINT32_MAX;
1528 		__smr_call_push(pcpu, &job->smrj_node, __smr_barrier_cb);
1529 		__smr_call_advance_qcur(smr, pcpu, needs_commit);
1530 		__smr_call_dispatch(smrw, pcpu);
1531 	}
1532 
1533 	return head != NULL;
1534 }
1535 
1536 
1537 void
1538 smr_barrier(smr_t smr)
1539 {
1540 #if SMR_BARRIER_USE_STACK
1541 	struct smr_barrier_job jobs[MAX_CPUS];
1542 #else
1543 	struct smr_barrier_job *jobs;
1544 #endif
1545 	struct smr_barrier_job *job;
1546 	struct smr_barrier_ctx  ctx = {
1547 		.smrb_domain  = smr,
1548 		.smrb_waiter  = current_thread(),
1549 		.smrb_pending = zpercpu_count(),
1550 		.smrb_count   = zpercpu_count(),
1551 	};
1552 	spl_t spl;
1553 
1554 	/*
1555 	 * First wait for all readers to observe whatever it is
1556 	 * that changed prior to this call.
1557 	 *
1558 	 * _then_ enqueue callbacks that push out anything ahead.
1559 	 */
1560 	smr_synchronize(smr);
1561 
1562 #if !SMR_BARRIER_USE_STACK
1563 	jobs = kalloc_type(struct smr_barrier_job, ctx.smrb_count,
1564 	    Z_WAITOK | Z_ZERO | Z_NOFAIL);
1565 #endif
1566 	job  = jobs;
1567 	spl  = splsched();
1568 
1569 	__smr_cpu_lazy_up_if_needed(PERCPU_GET(smr_worker));
1570 
1571 	percpu_foreach(smrw, smr_worker) {
1572 		job->smrj_context = &ctx;
1573 		if (mpsc_queue_append(&smrw->barrier_queue, &job->smrj_link)) {
1574 			__smrw_lock(smrw);
1575 			__smrw_wakeup_and_unlock(smrw);
1576 		}
1577 		job++;
1578 	}
1579 
1580 	/*
1581 	 * Because we disabled interrupts, our own CPU's callback
1582 	 * can't possibly have run, so just block.
1583 	 *
1584 	 * We must block in order to guarantee the lifetime of "ctx".
1585 	 * (See comment in __smr_barrier_cb).
1586 	 */
1587 	assert_wait(__smrb_event(&ctx), THREAD_UNINT);
1588 	assert(ctx.smrb_pending > 0);
1589 	splx(spl);
1590 	thread_block(THREAD_CONTINUE_NULL);
1591 
1592 #if !SMR_BARRIER_USE_STACK
1593 	kfree_type(struct smr_barrier_job, ctx.smrb_count, jobs);
1594 #endif
1595 }
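
/*
 * Usage note (sketch): smr_barrier(smr) returns once the callback of
 * every smr_call() enqueued on any CPU before this point has run, e.g.:
 *
 *     smr_call(smr, &e->ex_node, sizeof(*e), example_free);
 *     ...
 *     smr_barrier(smr);   // example_free() has run for that node
 *
 * smr_domain_free() relies on this to ensure the per-CPU queues are
 * empty before the domain's memory is released.
 */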
1596 
1597 
1598 #pragma mark SMR domains: smr_worker
1599 
1600 static void
1601 __smr_worker_drain_lock(struct smr_worker *smrw)
1602 {
1603 	for (;;) {
1604 		ml_set_interrupts_enabled(false);
1605 		__smrw_lock(smrw);
1606 
1607 		/*
1608 		 * Check we are on an appropriate processor
1609 		 *
1610 		 * Note that we might be running on the canonical
1611 		 * processor incorrectly: if the processor has been
1612 		 * de-recommended but isn't offline.
1613 		 */
1614 		if (__probable(current_processor() == smrw->processor)) {
1615 			if (__probable(!smrw->detach_reason)) {
1616 				break;
1617 			}
1618 		} else {
1619 			if (__probable(smrw->detach_reason)) {
1620 				break;
1621 			}
1622 		}
1623 
1624 		/* go bind in the right place and retry */
1625 		thread_bind(__smrw_drain_bind_target(smrw));
1626 		__smrw_unlock(smrw);
1627 		ml_set_interrupts_enabled(true);
1628 		thread_block(THREAD_CONTINUE_NULL);
1629 	}
1630 }
1631 
1632 static void
1633 __smr_worker_drain_unlock(struct smr_worker *smrw)
1634 {
1635 	__smrw_unlock(smrw);
1636 	ml_set_interrupts_enabled(true);
1637 }
1638 
1639 /*!
1640  * @function __smr_worker_tick
1641  *
1642  * @brief
1643  * Make gentle progress on the SMR worker queues.
1644  *
1645  * @discussion
1646  * One round of progress will:
1647  * - move entries that have aged into the "old" queue,
1648  * - commit entries that have a deferred sequence and let them age.
1649  *
1650  * If this results in any callbacks becoming "old",
1651  * then the worker is woken up to start running callbacks.
1652  *
1653  * This function must run either on the processor for this worker,
1654  * or with the worker drain lock held.
1655  */
1656 static void
1657 __smr_worker_tick(struct smr_worker *smrw, uint64_t ctime, bool wakeup)
1658 {
1659 	smr_pcpu_t pcpu = *smrw->wold_tail;
1660 
1661 	__smr_worker_check_invariants(smrw);
1662 
1663 	for (; pcpu != *smrw->wage_tail; pcpu = pcpu->drain_next) {
1664 		assert(pcpu->qold_seq == SMR_SEQ_INVALID);
1665 		assert(pcpu->qage_seq != SMR_SEQ_INVALID);
1666 
1667 		pcpu->qold_seq  = pcpu->qage_seq;
1668 		pcpu->qold_tail = pcpu->qage_tail;
1669 
1670 		pcpu->qage_seq  = SMR_SEQ_INVALID;
1671 	}
1672 
1673 	for (; pcpu; pcpu = pcpu->drain_next) {
1674 		assert(pcpu->qold_seq == SMR_SEQ_INVALID);
1675 		assert(pcpu->qage_seq != SMR_SEQ_INVALID);
1676 
1677 		smr_deferred_advance_commit(pcpu->drain_smr, pcpu->qage_seq);
1678 	}
1679 
1680 	smrw->wold_tail = smrw->wage_tail;
1681 	smrw->wage_tail = smrw->wcur_tail;
1682 	smrw->drain_ctime = ctime;
1683 
1684 	__smr_worker_check_invariants(smrw);
1685 
1686 	if (wakeup && smrw->wold_tail != &smrw->whead) {
1687 		__smrw_lock(smrw);
1688 		__smrw_wakeup_and_unlock(smrw);
1689 	}
1690 }
1691 
1692 static void
1693 __smr_worker_update_wold_tail(struct smr_worker *smrw, smr_pcpu_t *new_tail)
1694 {
1695 	smr_pcpu_t *old_tail = smrw->wold_tail;
1696 
1697 	if (smrw->wcur_tail == old_tail) {
1698 		smrw->wage_tail = new_tail;
1699 		smrw->wcur_tail = new_tail;
1700 	} else if (smrw->wage_tail == old_tail) {
1701 		smrw->wage_tail = new_tail;
1702 	}
1703 
1704 	smrw->wold_tail = new_tail;
1705 }
1706 
1707 static void
1708 __smr_worker_drain_one(struct smr_worker *smrw, smr_pcpu_t pcpu)
1709 {
1710 	smr_t       smr  = pcpu->drain_smr;
1711 	smr_seq_t   seq  = pcpu->qold_seq;
1712 	smr_node_t  head;
1713 
1714 	/*
1715 	 * Step 1: pop the "old" items,
1716 	 *         (qold_tail/qold_seq left dangling)
1717 	 */
1718 
1719 	assert(seq != SMR_SEQ_INVALID);
1720 	head = pcpu->qhead;
1721 	pcpu->qhead = *pcpu->qold_tail;
1722 	*pcpu->qold_tail = NULL;
1723 
1724 	/*
1725 	 * Step 2: Reconstruct the queue
1726 	 *         based on the sequence numbers and count fields.
1727 	 *
1728 	 *         Do what __smr_worker_tick() would do on this queue:
1729 	 *         - commit the aging queue
1730 	 *         - advance the current queue if needed
1731 	 */
1732 
1733 	if (pcpu->qage_seq != SMR_SEQ_INVALID) {
1734 		assert(pcpu->qage_tail != pcpu->qold_tail);
1735 
1736 		smr_deferred_advance_commit(smr, pcpu->qage_seq);
1737 		pcpu->qold_seq  = pcpu->qage_seq;
1738 		pcpu->qold_tail = pcpu->qage_tail;
1739 	} else {
1740 		assert(pcpu->qage_tail == pcpu->qold_tail);
1741 
1742 		pcpu->qold_seq  = SMR_SEQ_INVALID;
1743 		pcpu->qold_tail = &pcpu->qhead;
1744 	}
1745 
1746 	if (__smr_call_should_advance(pcpu)) {
1747 		__smr_call_advance_qcur(smr, pcpu, false);
1748 	} else {
1749 		pcpu->qage_seq  = SMR_SEQ_INVALID;
1750 		pcpu->qage_tail = pcpu->qold_tail;
1751 		if (pcpu->qcur_cnt == 0) {
1752 			pcpu->qcur_tail = pcpu->qage_tail;
1753 		}
1754 	}
1755 
1756 	if (pcpu->qold_seq != SMR_SEQ_INVALID) {
1757 		/*
1758 		 * The node has gained an "old seq" back,
1759 		 * it goes to the ready queue.
1760 		 */
1761 		pcpu->drain_next = *smrw->wold_tail;
1762 		*smrw->wold_tail = pcpu;
1763 		__smr_worker_update_wold_tail(smrw,
1764 		    &pcpu->drain_next);
1765 	} else if (pcpu->qage_seq != SMR_SEQ_INVALID) {
1766 		/*
1767 		 * The node has gained an "age seq" back,
1768 		 * it needs to age and wait for a tick
1769 		 * for its sequence number to be committed.
1770 		 */
1771 		pcpu->drain_next = NULL;
1772 		*smrw->wcur_tail = pcpu;
1773 		smrw->wcur_tail  = &pcpu->drain_next;
1774 	} else {
1775 		/*
1776 		 * The node is empty or with "current"
1777 		 * callbacks only, it can be dequeued.
1778 		 */
1779 		assert(!__smr_call_should_advance(pcpu));
1780 		pcpu->__check_cpu = (uint16_t)cpu_number();
1781 		pcpu->__check_reason = (uint8_t)smrw->detach_reason;
1782 		pcpu->__check_list = 0;
1783 		__smr_pcpu_set_not_queued(pcpu);
1784 	}
1785 
1786 	/*
1787 	 * Step 3: drain callbacks.
1788 	 */
1789 	__smr_worker_check_invariants(smrw);
1790 	__smr_worker_drain_unlock(smrw);
1791 
1792 	__smr_poll(smr, seq, true);
1793 	__smr_call_drain(head);
1794 
1795 	__smr_worker_drain_lock(smrw);
1796 }
1797 
1798 static void
1799 __smr_worker_continue(void *arg, wait_result_t wr __unused)
1800 {
1801 	smr_pcpu_t pcpu = NULL, next = NULL;
1802 	struct smr_worker *const smrw = arg;
1803 	uint64_t deadline;
1804 
1805 	__smr_worker_drain_lock(smrw);
1806 	__smr_worker_check_invariants(smrw);
1807 
1808 	if (smrw->wold_tail != &smrw->whead) {
1809 		next = smrw->whead;
1810 		smrw->whead = *smrw->wold_tail;
1811 		*smrw->wold_tail = NULL;
1812 		__smr_worker_update_wold_tail(smrw, &smrw->whead);
1813 	}
1814 
1815 	/*
1816 	 * The pipeline of per-cpu SMR data structures with pending
1817 	 * smr_call() callbacks has three stages: wcur -> wage -> wold.
1818 	 *
1819 	 * In order to guarantee forward progress, a tick happens
1820 	 * for each of them, either via __smr_worker_tick(),
1821 	 * or via __smr_worker_drain_one().
1822 	 *
1823 	 * The second tick will happen either because the core stayed
1824 	 * busy enough that a subsequent smr_cpu_tick() decided to
1825 	 * perform it, or because the CPU idled, and smr_cpu_leave()
1826 	 * will perform an unconditional __smr_worker_tick().
1827 	 */
1828 	__smr_barrier_drain(smrw, false);
1829 	__smr_worker_tick(smrw, mach_absolute_time(), false);
1830 
1831 	while ((pcpu = next)) {
1832 		next = next->drain_next;
1833 		__smr_worker_drain_one(smrw, pcpu);
1834 	}
1835 
1836 	if (__improbable(smrw->whead && smrw->detach_reason)) {
1837 		/*
1838 		 * If the thread isn't bound, we want to flush anything
1839 		 * that is pending without causing too much contention.
1840 		 *
1841 		 * Sleep for a bit in order to give the system time
1842 		 * to observe any advance commits we did.
1843 		 */
1844 		deadline = mach_absolute_time() + cpu_checkin_min_interval;
1845 	} else {
1846 		deadline = TIMEOUT_WAIT_FOREVER;
1847 	}
1848 	waitq_assert_wait64_locked(&smrw->waitq, __smrw_drain_event(smrw),
1849 	    THREAD_UNINT, TIMEOUT_URGENCY_SYS_NORMAL, deadline,
1850 	    TIMEOUT_NO_LEEWAY, smrw->thread);
1851 
1852 	/*
1853 	 * Make sure there's no barrier left, after we called assert_wait()
1854 	 * in order to pair with __smr_barrier_cb(). If we do find some,
1855 	 * we must be careful about invariants and forward progress.
1856 	 *
1857 	 * For affected domains, the dequeued barriers have been added
1858 	 * to their "qage" queue. If their "qage" queue was non-empty,
1859 	 * then its "qage_seq" was already committed, and we must preserve
1860 	 * this invariant.
1861 	 *
1862 	 * Affected domains that were idle before will get enqueued on this
1863 	 * worker's "wcur" queue. In order to guarantee forward progress,
1864 	 * we must force a tick if both the "wage" and "wold" queues
1865 	 * of the worker are empty.
1866 	 */
1867 	if (__improbable(__smr_barrier_drain(smrw, true))) {
1868 		if (smrw->wage_tail == &smrw->whead) {
1869 			__smr_worker_tick(smrw, mach_absolute_time(), false);
1870 		}
1871 	}
1872 
1873 	__smr_worker_check_invariants(smrw);
1874 	__smr_worker_drain_unlock(smrw);
1875 
1876 	thread_block_parameter(__smr_worker_continue, smrw);
1877 }
1878 
1879 
1880 #pragma mark SMR domains: scheduler integration
1881 
1882 #if CONFIG_QUIESCE_COUNTER
1883 __startup_data
1884 static uint64_t _Atomic quiesce_gen_startup;
1885 static uint64_t _Atomic *quiesce_genp = &quiesce_gen_startup;
1886 static uint64_t _Atomic quiesce_ctime;
1887 
1888 void
1889 cpu_quiescent_set_storage(uint64_t _Atomic *ptr)
1890 {
1891 	/*
1892 	 * Transfer to the real location for the commpage.
1893 	 *
1894 	 * This is safe to do non-atomically because the system
1895 	 * is still single-threaded.
1896 	 */
1897 	uint64_t gen = os_atomic_load(&quiesce_gen_startup, relaxed);
1898 
1899 	os_atomic_store(ptr, gen, relaxed);
1900 	quiesce_genp = ptr;
1901 }
1902 
1903 static smr_seq_t
1904 cpu_quiescent_gen_to_seq(uint64_t gen)
1905 {
1906 	return gen * SMR_SEQ_INC + SMR_SEQ_INIT;
1907 }
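/*
 * Editor's note: the quiescent generation and the SMR sequence space are
 * related by the affine map above, i.e.
 *
 *     seq(gen) = gen * SMR_SEQ_INC + SMR_SEQ_INIT
 *
 * so generation 0 corresponds to SMR_SEQ_INIT and every global
 * quiescent-counter bump advances the matching sequence by exactly one
 * SMR_SEQ_INC step.
 */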
1908 
1909 static void
1910 cpu_quiescent_advance(uint64_t gen, uint64_t ctime __kdebug_only)
1911 {
1912 	smr_seq_t seq = cpu_quiescent_gen_to_seq(gen);
1913 
1914 	os_atomic_thread_fence(seq_cst);
1915 
1916 	percpu_foreach(it, smr_worker) {
1917 		smr_seq_t rd_seq = os_atomic_load(&it->rd_quiesce_seq, relaxed);
1918 
1919 		if (rd_seq != SMR_SEQ_INVALID && SMR_SEQ_CMP(rd_seq, <, seq)) {
1920 			return;
1921 		}
1922 	}
1923 
1924 	os_atomic_thread_fence(seq_cst);
1925 
1926 	if (lock_cmpxchg(quiesce_genp, gen, gen + 1, relaxed)) {
1927 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUIESCENT_COUNTER),
1928 		    gen, 0, ctime, 0);
1929 	}
1930 }
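/*
 * Editor's illustration (hypothetical per-cpu values): with
 * seq = cpu_quiescent_gen_to_seq(gen) and three workers reporting
 *
 *     cpu0: rd_quiesce_seq == seq
 *     cpu1: rd_quiesce_seq == SMR_SEQ_INVALID        (idle, not tracked)
 *     cpu2: rd_quiesce_seq == seq - SMR_SEQ_INC      (still behind)
 *
 * the scan above bails out on cpu2 and the generation stays put.  Once
 * cpu2 goes idle or re-joins at the current sequence, the cmpxchg can
 * move *quiesce_genp from gen to gen + 1 and the published value advances.
 */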
1931 
1932 static void
1933 cpu_quiescent_join(struct smr_worker *smrw)
1934 {
1935 	uint64_t gen = os_atomic_load(quiesce_genp, relaxed);
1936 
1937 	assert(smrw->rd_quiesce_seq == SMR_SEQ_INVALID);
1938 	os_atomic_store(&smrw->rd_quiesce_seq,
1939 	    cpu_quiescent_gen_to_seq(gen), relaxed);
1940 	os_atomic_thread_fence(seq_cst);
1941 }
1942 
1943 static void
1944 cpu_quiescent_tick(struct smr_worker *smrw, uint64_t ctime, uint64_t interval)
1945 {
1946 	uint64_t  gen  = os_atomic_load(quiesce_genp, relaxed);
1947 	smr_seq_t seq  = cpu_quiescent_gen_to_seq(gen);
1948 
1949 	if (smrw->rd_quiesce_seq == SMR_SEQ_INVALID) {
1950 		/*
1951 		 * Likely called because of the scheduler tick,
1952 		 * smr_maintenance() will do the right thing.
1953 		 */
1954 		assert(current_processor()->state != PROCESSOR_RUNNING);
1955 	} else if (seq != smrw->rd_quiesce_seq) {
1956 		/*
1957 		 * Someone managed to update the sequence already,
1958 		 * learn it, update our ctime.
1959 		 */
1960 		os_atomic_store(&smrw->rd_quiesce_seq, seq, release);
1961 		os_atomic_store(&quiesce_ctime, ctime, relaxed);
1962 		os_atomic_thread_fence(seq_cst);
1963 	} else if ((ctime - os_atomic_load(&quiesce_ctime, relaxed)) > interval) {
1964 		/*
1965 		 * The system looks busy enough we want to update
1966 		 * the counter faster than every scheduler tick.
1967 		 */
1968 		os_atomic_store(&quiesce_ctime, ctime, relaxed);
1969 		cpu_quiescent_advance(gen, ctime);
1970 	}
1971 }
1972 
1973 static void
1974 cpu_quiescent_leave(struct smr_worker *smrw)
1975 {
1976 	assert(smrw->rd_quiesce_seq != SMR_SEQ_INVALID);
1977 	os_atomic_store(&smrw->rd_quiesce_seq, SMR_SEQ_INVALID, release);
1978 }
1979 #endif /* CONFIG_QUIESCE_COUNTER */
1980 
1981 uint32_t
1982 smr_cpu_checkin_get_min_interval_us(void)
1983 {
1984 	return cpu_checkin_min_interval_us;
1985 }
1986 
1987 void
1988 smr_cpu_checkin_set_min_interval_us(uint32_t new_value_us)
1989 {
1990 	/* clamp to something vaguely sane */
1991 	if (new_value_us > CPU_CHECKIN_MIN_INTERVAL_MAX_US) {
1992 		new_value_us = CPU_CHECKIN_MIN_INTERVAL_MAX_US;
1993 	}
1994 
1995 	cpu_checkin_min_interval_us = new_value_us;
1996 
1997 	uint64_t abstime = 0;
1998 	clock_interval_to_absolutetime_interval(cpu_checkin_min_interval_us,
1999 	    NSEC_PER_USEC, &abstime);
2000 	cpu_checkin_min_interval = abstime;
2001 }
2002 
2003 __startup_func
2004 static void
2005 smr_cpu_checkin_init_min_interval_us(void)
2006 {
2007 	smr_cpu_checkin_set_min_interval_us(CPU_CHECKIN_MIN_INTERVAL_US);
2008 }
2009 STARTUP(TUNABLES, STARTUP_RANK_FIRST, smr_cpu_checkin_init_min_interval_us);
2010 
2011 static void
2012 __smr_cpu_init_thread(struct smr_worker *smrw)
2013 {
2014 	char name[MAXTHREADNAMESIZE];
2015 	thread_t th = THREAD_NULL;
2016 
2017 	kernel_thread_create(__smr_worker_continue, smrw, MINPRI_KERNEL, &th);
2018 	smrw->thread = th;
2019 
2020 	snprintf(name, sizeof(name), "smr.reclaim:%d", smrw->processor->cpu_id);
2021 	thread_set_thread_name(th, name);
2022 	thread_start_in_assert_wait(th,
2023 	    &smrw->waitq, __smrw_drain_event(smrw), THREAD_UNINT);
2024 }
2025 
2026 void
2027 smr_cpu_init(struct processor *processor)
2028 {
2029 	struct smr_worker *smrw;
2030 
2031 	smrw = PERCPU_GET_RELATIVE(smr_worker, processor, processor);
2032 	smrw->processor = processor;
2033 
2034 	waitq_init(&smrw->waitq, WQT_QUEUE, SYNC_POLICY_FIFO);
2035 	smrw->detach_reason = SMR_CPU_REASON_OFFLINE;
2036 
2037 	smrq_init(&smrw->sect_queue);
2038 	smrw->wold_tail = &smrw->whead;
2039 	smrw->wage_tail = &smrw->whead;
2040 	smrw->wcur_tail = &smrw->whead;
2041 	mpsc_queue_init(&smrw->barrier_queue);
2042 
2043 	if (processor != master_processor) {
2044 		__smr_cpu_init_thread(smrw);
2045 	}
2046 }
2047 STARTUP_ARG(LOCKS, STARTUP_RANK_LAST, smr_cpu_init, master_processor);
2048 STARTUP_ARG(THREAD_CALL, STARTUP_RANK_LAST,
2049     __smr_cpu_init_thread, PERCPU_GET_MASTER(smr_worker));
2050 
2051 /*!
2052  * @function smr_cpu_up()
2053  *
2054  * @brief
2055  * Scheduler callback to notify this processor is going up.
2056  *
2057  * @discussion
2058  * Called at splsched() under the sched_available_cores_lock.
2059  */
2060 void
2061 smr_cpu_up(struct processor *processor, smr_cpu_reason_t reason)
2062 {
2063 	struct smr_worker *smrw;
2064 
2065 	smrw = PERCPU_GET_RELATIVE(smr_worker, processor, processor);
2066 
2067 	__smrw_lock(smrw);
2068 	if (reason != SMR_CPU_REASON_IGNORED) {
2069 		assert((smrw->detach_reason & reason) == reason);
2070 	}
2071 	smrw->detach_reason &= ~reason;
2072 	__smrw_unlock(smrw);
2073 }
2074 
2075 static void
2076 __smr_cpu_down_and_unlock(
2077 	struct processor       *processor,
2078 	struct smr_worker      *smrw,
2079 	smr_cpu_reason_t        reason)
2080 {
2081 	bool detach = !smrw->detach_reason;
2082 
2083 	/*
2084 	 * When reason is SMR_CPU_REASON_IGNORED,
2085 	 * this is called from smr_cpu_leave() on the way to idle.
2086 	 *
2087 	 * However this isn't synchronized with the recommendation
2088 	 * lock, hence it is possible that the CPU might actually
2089 	 * be recommended again while we're on the way to idle.
2090 	 *
2091 	 * By re-checking processor recommendation under
2092 	 * the __smrw_lock, we serialize with smr_cpu_up().
2093 	 */
2094 	if (reason != SMR_CPU_REASON_IGNORED) {
2095 		assert((smrw->detach_reason & reason) == 0);
2096 	} else if (processor->is_recommended) {
2097 		/*
2098 		 * The race we try to detect happened,
2099 		 * do nothing.
2100 		 */
2101 		reason = SMR_CPU_REASON_NONE;
2102 		detach = false;
2103 	}
2104 	smrw->detach_reason |= reason;
2105 	reason = smrw->detach_reason;
2106 
2107 	if (detach && smrw->whead) {
2108 		detach = !__smrw_wakeup_and_unlock(smrw);
2109 	} else {
2110 		__smrw_unlock(smrw);
2111 	}
2112 
2113 	if (detach) {
2114 		thread_unbind_after_queue_shutdown(smrw->thread, processor);
2115 	}
2116 }
2117 
2118 /*!
2119  * @function smr_cpu_down()
2120  *
2121  * @brief
2122  * Scheduler callback to notify this processor is going down.
2123  *
2124  * @discussion
2125  * Called at splsched() when the processor run queue is being shut down.
2126  */
2127 void
2128 smr_cpu_down(struct processor *processor, smr_cpu_reason_t reason)
2129 {
2130 	struct smr_worker *smrw;
2131 
2132 	smrw = PERCPU_GET_RELATIVE(smr_worker, processor, processor);
2133 
2134 	__smrw_lock(smrw);
2135 	__smr_cpu_down_and_unlock(processor, smrw, reason);
2136 }
2137 
2138 
2139 /*!
2140  * @function smr_cpu_join()
2141  *
2142  * @brief
2143  * Scheduler callback to notify this processor is going out of idle.
2144  *
2145  * @discussion
2146  * Called at splsched().
2147  */
2148 void
2149 smr_cpu_join(struct processor *processor, uint64_t ctime __unused)
2150 {
2151 #if CONFIG_QUIESCE_COUNTER
2152 	struct smr_worker *smrw;
2153 
2154 	smrw = PERCPU_GET_RELATIVE(smr_worker, processor, processor);
2155 	cpu_quiescent_join(smrw);
2156 #else
2157 	(void)processor;
2158 #endif /* CONFIG_QUIESCE_COUNTER */
2159 }
2160 
2161 /*!
2162  * @function smr_cpu_tick()
2163  *
2164  * @brief
2165  * Scheduler callback invoked during the scheduler maintenance routine.
2166  *
2167  * @discussion
2168  * Called at splsched().
2169  */
2170 void
2171 smr_cpu_tick(uint64_t ctime, bool safe_point)
2172 {
2173 	struct smr_worker *smrw = PERCPU_GET(smr_worker);
2174 	uint64_t interval = cpu_checkin_min_interval;
2175 
2176 #if CONFIG_QUIESCE_COUNTER
2177 	cpu_quiescent_tick(smrw, ctime, interval);
2178 #endif /* CONFIG_QUIESCE_COUNTER */
2179 
2180 	/*
2181 	 * if a bound thread was woken up on a derecommended core,
2182 	 * our detach_reason might be "IGNORED" and we want to leave
2183 	 * it alone in that case
2184 	 */
2185 	if (safe_point && !smrw->detach_reason && smrw->whead &&
2186 	    current_processor()->state == PROCESSOR_RUNNING &&
2187 	    (ctime - smrw->drain_ctime) > interval) {
2188 		__smr_worker_tick(smrw, ctime, true);
2189 	}
2190 }
2191 
2192 /*!
2193  * @function smr_cpu_leave()
2194  *
2195  * @brief
2196  * Scheduler callback to notify this processor is going idle.
2197  *
2198  * @discussion
2199  * Called at splsched().
2200  */
2201 void
2202 smr_cpu_leave(struct processor *processor, uint64_t ctime)
2203 {
2204 	struct smr_worker *smrw;
2205 
2206 	smrw = PERCPU_GET_RELATIVE(smr_worker, processor, processor);
2207 
2208 	/*
2209 	 * if a bound thread was woken up on a derecommended core,
2210 	 * our detach_reason might be "IGNORED" and we want to leave
2211 	 * it alone in that case
2212 	 *
2213 	 * See comment in __smr_worker_continue for why this must be
2214 	 * done unconditionally otherwise.
2215 	 */
2216 	if (!smrw->detach_reason && smrw->whead) {
2217 		__smr_worker_tick(smrw, ctime, true);
2218 	}
2219 
2220 	if (__improbable(!processor->is_recommended)) {
2221 		__smrw_lock(smrw);
2222 		__smr_cpu_down_and_unlock(processor, smrw, SMR_CPU_REASON_IGNORED);
2223 	}
2224 
2225 #if CONFIG_QUIESCE_COUNTER
2226 	cpu_quiescent_leave(smrw);
2227 #endif /* CONFIG_QUIESCE_COUNTER */
2228 }
2229 
2230 /*!
2231  * @function smr_maintenance()
2232  *
2233  * @brief
2234  * Scheduler callback called at the scheduler tick.
2235  *
2236  * @discussion
2237  * Called at splsched().
2238  */
2239 void
2240 smr_maintenance(uint64_t ctime)
2241 {
2242 #if CONFIG_QUIESCE_COUNTER
2243 	cpu_quiescent_advance(os_atomic_load(quiesce_genp, relaxed), ctime);
2244 #else
2245 	(void)ctime;
2246 #endif /* CONFIG_QUIESCE_COUNTER */
2247 }
2248 
2249 
2250 #pragma mark - SMR hash tables
2251 
2252 static struct smrq_slist_head *
2253 smr_hash_alloc_array(size_t size)
2254 {
2255 	return kalloc_type(struct smrq_slist_head, size,
2256 	           Z_WAITOK | Z_ZERO | Z_SPRAYQTN);
2257 }
2258 
2259 static void
2260 smr_hash_free_array(struct smrq_slist_head *array, size_t size)
2261 {
2262 	kfree_type(struct smrq_slist_head, size, array);
2263 }
2264 
2265 static inline uintptr_t
2266 smr_hash_array_encode(struct smrq_slist_head *array, uint16_t order)
2267 {
2268 	uintptr_t ptr;
2269 
2270 	ptr  = (uintptr_t)array;
2271 	ptr &= ~SMRH_ARRAY_ORDER_MASK;
2272 	ptr |= (uintptr_t)order << SMRH_ARRAY_ORDER_SHIFT;
2273 
2274 	return ptr;
2275 }
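/*
 * Editor's sketch of the matching decode, shown only to make the encoding
 * above easier to follow; the real smr_hash_array_decode() lives in
 * <kern/smr_hash.h>, and this sketch assumes (as the encoder relies on)
 * that the masked high bits of a kernel pointer are all ones:
 *
 *     order = (uint16_t)(ptr >> SMRH_ARRAY_ORDER_SHIFT);
 *     array = (struct smrq_slist_head *)(ptr | SMRH_ARRAY_ORDER_MASK);
 *
 * which is why the shared __smrh_empty table is encoded with order 63
 * (two buckets) and smr_hash_init() below uses "64 - shift" for real
 * arrays.
 */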
2276 
2277 #pragma mark SMR simple hash tables
2278 
2279 __security_const_late
2280 static struct smrq_slist_head __smrh_empty[2] = {
2281 	SMRQ_SLIST_INITIALIZER(__smrh_empty[0]),
2282 	SMRQ_SLIST_INITIALIZER(__smrh_empty[1]),
2283 };
2284 
2285 void
2286 smr_hash_init_empty(struct smr_hash *smrh)
2287 {
2288 	*smrh = (struct smr_hash){
2289 		.smrh_array = smr_hash_array_encode(__smrh_empty, 63),
2290 	};
2291 }
2292 
2293 bool
2294 smr_hash_is_empty_initialized(struct smr_hash *smrh)
2295 {
2296 	return smrh->smrh_array == smr_hash_array_encode(__smrh_empty, 63);
2297 }
2298 
2299 void
2300 smr_hash_init(struct smr_hash *smrh, size_t size)
2301 {
2302 	uintptr_t array;
2303 	uint16_t shift;
2304 
2305 	assert(size);
2306 	shift = (uint16_t)flsll(size - 1);
2307 	size  = 1UL << shift;
2308 	if (startup_phase >= STARTUP_SUB_LOCKDOWN) {
2309 		assert(size * sizeof(struct smrq_slist_head) <=
2310 		    KALLOC_SAFE_ALLOC_SIZE);
2311 	}
2312 	array = smr_hash_array_encode(smr_hash_alloc_array(size), 64 - shift);
2313 
2314 	if (smr_hash_is_empty_initialized(smrh)) {
2315 		os_atomic_store(&smrh->smrh_array, array, release);
2316 	} else {
2317 		*smrh = (struct smr_hash){
2318 			.smrh_array = array,
2319 		};
2320 	}
2321 }
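/*
 * Editor's usage sketch (hypothetical caller): a table can start out on
 * the shared empty array and defer the real allocation, because
 * smr_hash_init() publishes over an empty-initialized table with a
 * release store:
 *
 *     static struct smr_hash nethash;
 *
 *     // at object creation time
 *     smr_hash_init_empty(&nethash);
 *
 *     // on the first insert, under the writer's serialization
 *     if (smr_hash_is_empty_initialized(&nethash)) {
 *             smr_hash_init(&nethash, 128);    // rounded to a power of 2
 *     }
 *
 * smr_hash_destroy() below frees the bucket array only when it is not
 * the shared __smrh_empty one.
 */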
2322 
2323 void
2324 smr_hash_destroy(struct smr_hash *smrh)
2325 {
2326 	if (!smr_hash_is_empty_initialized(smrh)) {
2327 		struct smr_hash_array array = smr_hash_array_decode(smrh);
2328 
2329 		smr_hash_free_array(array.smrh_array, smr_hash_size(array));
2330 	}
2331 	*smrh = (struct smr_hash){ };
2332 }
2333 
2334 void
2335 __smr_hash_serialized_clear(
2336 	struct smr_hash        *smrh,
2337 	smrh_traits_t          smrht,
2338 	void                 (^free)(void *obj))
2339 {
2340 	struct smr_hash_array array = smr_hash_array_decode(smrh);
2341 
2342 	for (size_t i = 0; i < smr_hash_size(array); i++) {
2343 		struct smrq_slink *link;
2344 		__smrq_slink_t *prev;
2345 
2346 		prev = &array.smrh_array[i].first;
2347 		while ((link = smr_serialized_load(prev))) {
2348 			prev = &link->next;
2349 			free(__smrht_link_to_obj(smrht, link));
2350 		}
2351 
2352 		smr_clear_store(&array.smrh_array[i].first);
2353 	}
2354 
2355 	smrh->smrh_count = 0;
2356 }
2357 
2358 kern_return_t
2359 __smr_hash_shrink_and_unlock(
2360 	struct smr_hash        *smrh,
2361 	lck_mtx_t              *lock,
2362 	smrh_traits_t           smrht)
2363 {
2364 	struct smr_hash_array decptr = smr_hash_array_decode(smrh);
2365 	struct smrq_slist_head *newarray, *oldarray;
2366 	uint16_t neworder = decptr.smrh_order + 1;
2367 	size_t   oldsize  = smr_hash_size(decptr);
2368 	size_t   newsize  = oldsize / 2;
2369 
2370 	assert(newsize);
2371 
2372 	if (os_atomic_load(&smrh->smrh_resizing, relaxed)) {
2373 		lck_mtx_unlock(lock);
2374 		return KERN_FAILURE;
2375 	}
2376 
2377 	os_atomic_store(&smrh->smrh_resizing, true, relaxed);
2378 	lck_mtx_unlock(lock);
2379 
2380 	newarray = smr_hash_alloc_array(newsize);
2381 	if (newarray == NULL) {
2382 		os_atomic_store(&smrh->smrh_resizing, false, relaxed);
2383 		return KERN_RESOURCE_SHORTAGE;
2384 	}
2385 
2386 	lck_mtx_lock(lock);
2387 
2388 	/*
2389 	 * Step 1: collapse all the chains in pairs.
2390 	 */
2391 	oldarray = decptr.smrh_array;
2392 
2393 	for (size_t i = 0; i < newsize; i++) {
2394 		newarray[i] = oldarray[i];
2395 		smrq_serialized_append(&newarray[i], &oldarray[i + newsize]);
2396 	}
2397 
2398 	/*
2399 	 * Step 2: publish the new array.
2400 	 */
2401 	os_atomic_store(&smrh->smrh_array,
2402 	    smr_hash_array_encode(newarray, neworder), release);
2403 
2404 	os_atomic_store(&smrh->smrh_resizing, false, relaxed);
2405 
2406 	lck_mtx_unlock(lock);
2407 
2408 	/*
2409 	 * Step 3: free the old array once readers can't observe the old values.
2410 	 */
2411 	smr_synchronize(smrht->domain);
2412 
2413 	smr_hash_free_array(oldarray, oldsize);
2414 	return KERN_SUCCESS;
2415 }
2416 
2417 kern_return_t
2418 __smr_hash_grow_and_unlock(
2419 	struct smr_hash        *smrh,
2420 	lck_mtx_t              *lock,
2421 	smrh_traits_t           smrht)
2422 {
2423 	struct smr_hash_array decptr = smr_hash_array_decode(smrh);
2424 	struct smrq_slist_head *newarray, *oldarray;
2425 	__smrq_slink_t **prevarray;
2426 	uint16_t neworder = decptr.smrh_order - 1;
2427 	size_t   oldsize  = smr_hash_size(decptr);
2428 	size_t   newsize  = 2 * oldsize;
2429 	bool     needs_another_round = false;
2430 
2431 	if (smrh->smrh_resizing) {
2432 		lck_mtx_unlock(lock);
2433 		return KERN_FAILURE;
2434 	}
2435 
2436 	smrh->smrh_resizing = true;
2437 	lck_mtx_unlock(lock);
2438 
2439 	newarray = smr_hash_alloc_array(newsize);
2440 	if (newarray == NULL) {
2441 		os_atomic_store(&smrh->smrh_resizing, false, relaxed);
2442 		return KERN_RESOURCE_SHORTAGE;
2443 	}
2444 
2445 	prevarray = kalloc_type(__smrq_slink_t *, newsize,
2446 	    Z_WAITOK | Z_ZERO | Z_SPRAYQTN);
2447 	if (prevarray == NULL) {
2448 		smr_hash_free_array(newarray, newsize);
2449 		os_atomic_store(&smrh->smrh_resizing, false, relaxed);
2450 		return KERN_RESOURCE_SHORTAGE;
2451 	}
2452 
2453 
2454 	lck_mtx_lock(lock);
2455 
2456 	/*
2457 	 * Step 1: create a duplicated array with twice as many heads.
2458 	 */
2459 	oldarray = decptr.smrh_array;
2460 
2461 	memcpy(newarray, oldarray, oldsize * sizeof(newarray[0]));
2462 	memcpy(newarray + oldsize, oldarray, oldsize * sizeof(newarray[0]));
2463 
2464 	/*
2465 	 * Step 2: Publish the new array, and wait for readers to observe it
2466 	 *         before we do any change.
2467 	 */
2468 	os_atomic_store(&smrh->smrh_array,
2469 	    smr_hash_array_encode(newarray, neworder), release);
2470 
2471 	smr_synchronize(smrht->domain);
2472 
2473 
2474 	/*
2475 	 * Step 3: split the lists.
2476 	 */
2477 
2478 	/*
2479 	 * If the list we are trying to split looked like this,
2480 	 * where L elements will go to the "left" bucket and "R"
2481 	 * to the right one:
2482 	 *
2483 	 *     old_head --> L1 --> L2                -> L5
2484 	 *                            \             /      \
2485 	 *                             -> R3 --> R4         -> R6 --> NULL
2486 	 *
2487 	 * Then make sure the new heads point to their legitimate first element,
2488 	 * leading to this state:
2489 	 *
2490 	 *     l_head   --> L1 --> L2                -> L5
2491 	 *                            \             /      \
2492 	 *     r_head   ----------------> R3 --> R4         -> R6 --> NULL
2493 	 *
2494 	 *
2495 	 *     prevarray[left]  = &L2->next
2496 	 *     prevarray[right] = &r_head
2497 	 *     oldarray[old]    = L2
2498 	 */
2499 
2500 	for (size_t i = 0; i < oldsize; i++) {
2501 		struct smrq_slink *link, *next;
2502 		uint32_t want_mask;
2503 
2504 		link = smr_serialized_load(&oldarray[i].first);
2505 		if (link == NULL) {
2506 			continue;
2507 		}
2508 
2509 		want_mask = smrht->obj_hash(link, 0) & oldsize;
2510 		while ((next = smr_serialized_load(&link->next)) &&
2511 		    (smrht->obj_hash(next, 0) & oldsize) == want_mask) {
2512 			link = next;
2513 		}
2514 
2515 		if (want_mask == 0) {
2516 			/* elements seen go to the "left" bucket */
2517 			prevarray[i] = &link->next;
2518 			prevarray[i + oldsize] = &newarray[i + oldsize].first;
2519 			smr_serialized_store_relaxed(prevarray[i + oldsize], next);
2520 		} else {
2521 			/* elements seen go to the "right" bucket */
2522 			prevarray[i] = &newarray[i].first;
2523 			prevarray[i + oldsize] = &link->next;
2524 			smr_serialized_store_relaxed(prevarray[i], next);
2525 		}
2526 
2527 		smr_serialized_store_relaxed(&oldarray[i].first,
2528 		    next ? link : NULL);
2529 
2530 		needs_another_round |= (next != NULL);
2531 	}
2532 
2533 	/*
2534 	 * At this point, when we split further, we must wait for
2535 	 * readers to observe the previous state before we split
2536 	 * further. Indeed, reusing the example above, the next
2537 	 * round of splitting would end up with this:
2538 	 *
2539 	 *     l_head   --> L1 --> L2 ----------------> L5
2540 	 *                                          /      \
2541 	 *     r_head   ----------------> R3 --> R4         -> R6 --> NULL
2542 	 *
2543 	 *
2544 	 *     prevarray[left]  = &L2->next
2545 	 *     prevarray[right] = &R4->next
2546 	 *     oldarray[old]    = R4
2547 	 *
2548 	 * But we must be sure that no readers can observe r_head
2549 	 * having been L1, otherwise a stale reader might skip over
2550 	 * R3/R4.
2551 	 *
2552 	 * Generally speaking we need to do that each time we do a round
2553 	 * of splitting that isn't terminating the list with NULL.
2554 	 */
2555 
2556 	while (needs_another_round) {
2557 		smr_synchronize(smrht->domain);
2558 
2559 		needs_another_round = false;
2560 
2561 		for (size_t i = 0; i < oldsize; i++) {
2562 			struct smrq_slink *link, *next;
2563 			uint32_t want_mask;
2564 
2565 			link = smr_serialized_load(&oldarray[i].first);
2566 			if (link == NULL) {
2567 				continue;
2568 			}
2569 
2570 			/*
2571 			 * If `prevarray[i]` (left) points to the linkage
2572 			 * we stopped at, then it means the next element
2573 			 * will be "to the right" and vice versa.
2574 			 *
2575 			 * We also already know "next" exists, so only probe
2576 			 * after it.
2577 			 */
2578 			if (prevarray[i] == &link->next) {
2579 				want_mask = (uint32_t)oldsize;
2580 			} else {
2581 				want_mask = 0;
2582 			}
2583 
2584 			link = smr_serialized_load(&link->next);
2585 
2586 			while ((next = smr_serialized_load(&link->next)) &&
2587 			    (smrht->obj_hash(next, 0) & oldsize) == want_mask) {
2588 				link = next;
2589 			}
2590 
2591 			if (want_mask == 0) {
2592 				/* elements seen go to the "left" bucket */
2593 				prevarray[i] = &link->next;
2594 				smr_serialized_store_relaxed(prevarray[i + oldsize], next);
2595 			} else {
2596 				/* elements seen go to the "right" bucket */
2597 				smr_serialized_store_relaxed(prevarray[i], next);
2598 				prevarray[i + oldsize] = &link->next;
2599 			}
2600 
2601 			smr_serialized_store_relaxed(&oldarray[i].first,
2602 			    next ? link : NULL);
2603 
2604 			needs_another_round |= (next != NULL);
2605 		}
2606 	}
2607 
2608 	smrh->smrh_resizing = false;
2609 	lck_mtx_unlock(lock);
2610 
2611 	/*
2612 	 * Step 4: cleanup, no need to wait for readers, this happened already
2613 	 *         at least once for splitting reasons.
2614 	 */
2615 	smr_hash_free_array(oldarray, oldsize);
2616 	kfree_type(__smrq_slink_t *, newsize, prevarray);
2617 	return KERN_SUCCESS;
2618 }
2619 
2620 #pragma mark SMR scalable hash tables
2621 
2622 #define SMRSH_MIGRATED  ((struct smrq_slink *)SMRSH_BUCKET_STOP_BIT)
2623 static LCK_GRP_DECLARE(smr_shash_grp, "smr_shash");
2624 
2625 static inline size_t
2626 __smr_shash_min_size(struct smr_shash *smrh)
2627 {
2628 	return 1ul << smrh->smrsh_min_shift;
2629 }
2630 
2631 static inline size_t
2632 __smr_shash_size_for_shift(uint8_t shift)
2633 {
2634 	return (~0u >> shift) + 1;
2635 }
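/*
 * Editor's note: shifts count down from the top of a 32-bit hash, so for
 * example shift == 28 gives (~0u >> 28) + 1 == 16 buckets, and a hash is
 * turned into a bucket index with (hash >> shift), as done in
 * __smr_shash_rehash_with_target() below.
 */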
2636 
2637 static inline size_t
2638 __smr_shash_cursize(smrsh_state_t state)
2639 {
2640 	return __smr_shash_size_for_shift(state.curshift);
2641 }
2642 
2643 static void
2644 __smr_shash_bucket_init(hw_lck_ptr_t *head)
2645 {
2646 	hw_lck_ptr_init(head, __smr_shash_bucket_stop(head), &smr_shash_grp);
2647 }
2648 
2649 static void
2650 __smr_shash_bucket_destroy(hw_lck_ptr_t *head)
2651 {
2652 	hw_lck_ptr_destroy(head, &smr_shash_grp);
2653 }
2654 
2655 __attribute__((noinline))
2656 void *
2657 __smr_shash_entered_find_slow(
2658 	const struct smr_shash *smrh,
2659 	smrh_key_t              key,
2660 	hw_lck_ptr_t           *head,
2661 	smrh_traits_t           traits)
2662 {
2663 	struct smrq_slink *link;
2664 	smrsh_state_t state;
2665 	uint32_t hash;
2666 
2667 	/* wait for the rehashing to be done into their target buckets */
2668 	hw_lck_ptr_wait_for_value(head, SMRSH_MIGRATED, &smr_shash_grp);
2669 
2670 	state = os_atomic_load(&smrh->smrsh_state, dependency);
2671 	hash  = __smr_shash_hash(smrh, state.newidx, key, traits);
2672 	head  = __smr_shash_bucket(smrh, state, SMRSH_NEW, hash);
2673 
2674 	link  = hw_lck_ptr_value(head);
2675 	while (!__smr_shash_is_stop(link)) {
2676 		if (traits->obj_equ(link, key)) {
2677 			return __smrht_link_to_obj(traits, link);
2678 		}
2679 		link = smr_entered_load(&link->next);
2680 	}
2681 
2682 	assert(link == __smr_shash_bucket_stop(head));
2683 	return NULL;
2684 }
2685 
2686 static const uint8_t __smr_shash_grow_ratio[] = {
2687 	[SMRSH_COMPACT]           = 6,
2688 	[SMRSH_BALANCED]          = 4,
2689 	[SMRSH_BALANCED_NOSHRINK] = 4,
2690 	[SMRSH_FASTEST]           = 2,
2691 };
2692 
2693 static inline uint64_t
2694 __smr_shash_count(struct smr_shash *smrh)
2695 {
2696 	int64_t count = (int64_t)counter_load(&smrh->smrsh_count);
2697 
2698 	/*
2699 	 * Negative values make no sense and are likely due to some
2700 	 * stale values being read.
2701 	 */
2702 	return count < 0 ? 0ull : (uint64_t)count;
2703 }
2704 
2705 static inline bool
2706 __smr_shash_should_grow(
2707 	struct smr_shash       *smrh,
2708 	smrsh_state_t           state,
2709 	uint64_t                count)
2710 {
2711 	size_t size = __smr_shash_cursize(state);
2712 
2713 	/* grow if elem:bucket ratio is worse than grow_ratio:1 */
2714 	return count > __smr_shash_grow_ratio[smrh->smrsh_policy] * size;
2715 }
2716 
2717 static inline bool
2718 __smr_shash_should_reseed(
2719 	struct smr_shash       *smrh,
2720 	size_t                  observed_depth)
2721 {
2722 	return observed_depth > 10 * __smr_shash_grow_ratio[smrh->smrsh_policy];
2723 }
2724 
2725 static inline bool
2726 __smr_shash_should_shrink(
2727 	struct smr_shash       *smrh,
2728 	smrsh_state_t           state,
2729 	uint64_t                count)
2730 {
2731 	size_t size = __smr_shash_cursize(state);
2732 
2733 	switch (smrh->smrsh_policy) {
2734 	case SMRSH_COMPACT:
2735 		/* shrink if bucket:elem ratio is worse than 1:1 */
2736 		return size > count && size > __smr_shash_min_size(smrh);
2737 	case SMRSH_BALANCED:
2738 		/* shrink if bucket:elem ratio is worse than 2:1 */
2739 		return size > 2 * count && size > __smr_shash_min_size(smrh);
2740 	case SMRSH_BALANCED_NOSHRINK:
2741 	case SMRSH_FASTEST:
2742 		return false;
2743 	}
2744 }
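/*
 * Editor's worked example (hypothetical sizes): for SMRSH_BALANCED the
 * grow ratio is 4:1 and the shrink trigger is a 2:1 bucket:element ratio,
 * so with 1024 buckets the table asks to grow once it holds more than
 * 4096 elements, and asks to shrink once it drops below 512 elements,
 * provided 1024 is still above the table's minimum size.
 */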
2745 
2746 static inline void
2747 __smr_shash_schedule_rehash(
2748 	struct smr_shash       *smrh,
2749 	smrh_traits_t           traits,
2750 	smrsh_rehash_t          reason)
2751 {
2752 	smrsh_rehash_t rehash;
2753 
2754 	rehash = os_atomic_load(&smrh->smrsh_rehashing, relaxed);
2755 	if (rehash & reason) {
2756 		return;
2757 	}
2758 
2759 	rehash = os_atomic_or_orig(&smrh->smrsh_rehashing, reason, relaxed);
2760 	if (!rehash) {
2761 		thread_call_enter1(smrh->smrsh_callout,
2762 		    __DECONST(void *, traits));
2763 	}
2764 }
2765 
2766 void *
2767 __smr_shash_entered_get_or_insert(
2768 	struct smr_shash       *smrh,
2769 	smrh_key_t              key,
2770 	struct smrq_slink      *link,
2771 	smrh_traits_t           traits)
2772 {
2773 	struct smrq_slink *first;
2774 	struct smrq_slink *other;
2775 	uint32_t hash, depth;
2776 	smrsh_state_t state;
2777 	hw_lck_ptr_t *head;
2778 	void *obj;
2779 
2780 	state = os_atomic_load(&smrh->smrsh_state, dependency);
2781 	hash  = __smr_shash_hash(smrh, state.curidx, key, traits);
2782 	head  = __smr_shash_bucket(smrh, state, SMRSH_CUR, hash);
2783 	first = hw_lck_ptr_lock(head, &smr_shash_grp);
2784 
2785 	if (__improbable(first == SMRSH_MIGRATED)) {
2786 		hw_lck_ptr_unlock_nopreempt(head, first, &smr_shash_grp);
2787 
2788 		state = os_atomic_load(&smrh->smrsh_state, dependency);
2789 		hash  = __smr_shash_hash(smrh, state.newidx, key, traits);
2790 		head  = __smr_shash_bucket(smrh, state, SMRSH_NEW, hash);
2791 		first = hw_lck_ptr_lock_nopreempt(head, &smr_shash_grp);
2792 	}
2793 
2794 	depth = 0;
2795 	other = first;
2796 	while (!__smr_shash_is_stop(other)) {
2797 		depth++;
2798 		if (traits->obj_equ(other, key)) {
2799 			obj = __smrht_link_to_obj(traits, other);
2800 			if (traits->obj_try_get(obj)) {
2801 				hw_lck_ptr_unlock(head, first,
2802 				    &smr_shash_grp);
2803 				return obj;
2804 			}
2805 			break;
2806 		}
2807 		other = smr_serialized_load(&other->next);
2808 	}
2809 
2810 	counter_inc_preemption_disabled(&smrh->smrsh_count);
2811 	smr_serialized_store_relaxed(&link->next, first);
2812 	hw_lck_ptr_unlock(head, link, &smr_shash_grp);
2813 
2814 	if (__smr_shash_should_reseed(smrh, depth)) {
2815 		__smr_shash_schedule_rehash(smrh, traits, SMRSH_REHASH_RESEED);
2816 	} else if (depth * 2 >= __smr_shash_grow_ratio[smrh->smrsh_policy] &&
2817 	    __smr_shash_should_grow(smrh, state, __smr_shash_count(smrh))) {
2818 		__smr_shash_schedule_rehash(smrh, traits, SMRSH_REHASH_GROW);
2819 	}
2820 	return NULL;
2821 }
2822 
2823 __abortlike
2824 static void
2825 __smr_shash_missing_elt_panic(
2826 	struct smr_shash        *smrh,
2827 	struct smrq_slink       *link,
2828 	smrh_traits_t           traits)
2829 {
2830 	panic("Unable to find item %p (linkage %p) in %p (traits %p)",
2831 	    __smrht_link_to_obj(traits, link), link, smrh, traits);
2832 }
2833 
2834 smr_shash_mut_cursor_t
2835 __smr_shash_entered_mut_begin(
2836 	struct smr_shash       *smrh,
2837 	struct smrq_slink      *link,
2838 	smrh_traits_t           traits)
2839 {
2840 	struct smrq_slink *first, *next;
2841 	__smrq_slink_t *prev;
2842 	smrsh_state_t state;
2843 	hw_lck_ptr_t *head;
2844 	uint32_t hash;
2845 
2846 	state = os_atomic_load(&smrh->smrsh_state, dependency);
2847 	hash  = __smr_shash_hash(smrh, state.curidx, link, traits);
2848 	head  = __smr_shash_bucket(smrh, state, SMRSH_CUR, hash);
2849 	first = hw_lck_ptr_lock(head, &smr_shash_grp);
2850 
2851 	if (__improbable(first == SMRSH_MIGRATED)) {
2852 		hw_lck_ptr_unlock_nopreempt(head, first, &smr_shash_grp);
2853 
2854 		state = os_atomic_load(&smrh->smrsh_state, dependency);
2855 		hash  = __smr_shash_hash(smrh, state.newidx, link, traits);
2856 		head  = __smr_shash_bucket(smrh, state, SMRSH_NEW, hash);
2857 		first = hw_lck_ptr_lock_nopreempt(head, &smr_shash_grp);
2858 	}
2859 
2860 	next = first;
2861 	while (next != link) {
2862 		if (__smr_shash_is_stop(next)) {
2863 			__smr_shash_missing_elt_panic(smrh, link, traits);
2864 		}
2865 		prev  = &next->next;
2866 		next  = smr_serialized_load(prev);
2867 	}
2868 
2869 	return (smr_shash_mut_cursor_t){ .head = head, .prev = prev };
2870 }
2871 
2872 void
2873 __smr_shash_entered_mut_erase(
2874 	struct smr_shash       *smrh,
2875 	smr_shash_mut_cursor_t  cursor,
2876 	struct smrq_slink      *link,
2877 	smrh_traits_t           traits)
2878 {
2879 	struct smrq_slink *next, *first;
2880 	smrsh_state_t state;
2881 
2882 	first = hw_lck_ptr_value(cursor.head);
2883 
2884 	next  = smr_serialized_load(&link->next);
2885 	if (first == link) {
2886 		counter_dec_preemption_disabled(&smrh->smrsh_count);
2887 		hw_lck_ptr_unlock(cursor.head, next, &smr_shash_grp);
2888 	} else {
2889 		smr_serialized_store_relaxed(cursor.prev, next);
2890 		counter_dec_preemption_disabled(&smrh->smrsh_count);
2891 		hw_lck_ptr_unlock(cursor.head, first, &smr_shash_grp);
2892 	}
2893 
2894 	state = atomic_load_explicit(&smrh->smrsh_state, memory_order_relaxed);
2895 	if (first == link && __smr_shash_is_stop(next) &&
2896 	    __smr_shash_should_shrink(smrh, state, __smr_shash_count(smrh))) {
2897 		__smr_shash_schedule_rehash(smrh, traits, SMRSH_REHASH_SHRINK);
2898 	}
2899 }
2900 
2901 void
2902 __smr_shash_entered_mut_replace(
2903 	smr_shash_mut_cursor_t  cursor,
2904 	struct smrq_slink      *old_link,
2905 	struct smrq_slink      *new_link)
2906 {
2907 	struct smrq_slink *first, *next;
2908 
2909 	first = hw_lck_ptr_value(cursor.head);
2910 
2911 	next  = smr_serialized_load(&old_link->next);
2912 	smr_serialized_store_relaxed(&new_link->next, next);
2913 	if (first == old_link) {
2914 		hw_lck_ptr_unlock(cursor.head, new_link, &smr_shash_grp);
2915 	} else {
2916 		smr_serialized_store_relaxed(cursor.prev, new_link);
2917 		hw_lck_ptr_unlock(cursor.head, first, &smr_shash_grp);
2918 	}
2919 }
2920 
2921 void
2922 __smr_shash_entered_mut_abort(smr_shash_mut_cursor_t cursor)
2923 {
2924 	hw_lck_ptr_unlock(cursor.head,
2925 	    hw_lck_ptr_value(cursor.head), &smr_shash_grp);
2926 }
2927 
2928 static kern_return_t
2929 __smr_shash_rehash_with_target(
2930 	struct smr_shash       *smrh,
2931 	smrsh_state_t           state,
2932 	uint8_t                 newshift,
2933 	smrh_traits_t           traits)
2934 {
2935 	const size_t FLAT_SIZE = 256;
2936 	struct smrq_slink *flat_queue[FLAT_SIZE];
2937 
2938 	size_t oldsize, newsize;
2939 	hw_lck_ptr_t *oldarray;
2940 	hw_lck_ptr_t *newarray;
2941 	uint32_t newseed;
2942 	uint8_t oldidx;
2943 
2944 	/*
2945 	 * This function resizes a scalable hash table.
2946 	 *
2947 	 * It doesn't require a lock because it is the callout
2948 	 * of a THREAD_CALL_ONCE thread call.
2949 	 */
2950 
2951 	oldidx         = state.curidx;
2952 	state.newidx   = 1 - state.curidx;
2953 	state.newshift = newshift;
2954 	assert(__smr_shash_load_array(smrh, state.newidx) == NULL);
2955 
2956 	oldsize = __smr_shash_cursize(state);
2957 	newsize = __smr_shash_size_for_shift(newshift);
2958 
2959 	oldarray = __smr_shash_load_array(smrh, state.curidx);
2960 	newarray = (hw_lck_ptr_t *)smr_hash_alloc_array(newsize);
2961 	newseed  = (uint32_t)early_random();
2962 
2963 	if (newarray == NULL) {
2964 		return KERN_RESOURCE_SHORTAGE;
2965 	}
2966 
2967 	/*
2968 	 * Step 1: initialize the new array and seed,
2969 	 *         and then publish the state referencing it.
2970 	 *
2971 	 *         We do not need to synchronize explicitly with SMR,
2972 	 *         because readers/writers will notice rehashing when
2973 	 *         the bucket they interact with has a SMRSH_MIGRATED
2974 	 *         value.
2975 	 */
2976 
2977 	for (size_t i = 0; i < newsize; i++) {
2978 		__smr_shash_bucket_init(&newarray[i]);
2979 	}
2980 	os_atomic_store(&smrh->smrsh_array[state.newidx], newarray, relaxed);
2981 	os_atomic_store(&smrh->smrsh_seed[state.newidx], newseed, relaxed);
2982 	os_atomic_store(&smrh->smrsh_state, state, release);
2983 
2984 	/*
2985 	 * Step 2: migrate buckets "atomically" under the old bucket lock.
2986 	 *
2987 	 *         This migration is atomic for writers because
2988 	 *         they take the old bucket lock first, and if
2989 	 *         they observe SMRSH_MIGRATED as the value,
2990 	 *         go look in the new bucket instead.
2991 	 *
2992 	 *         This migration is atomic for readers, because
2993 	 *         as we move elements to their new buckets,
2994 	 *         the hash chains will not circle back to their
2995 	 *         bucket head (the "stop" value won't match),
2996 	 *         or the bucket head will be SMRSH_MIGRATED.
2997 	 *
2998 	 *         This causes a slowpath which spins waiting
2999 	 *         for SMRSH_MIGRATED to appear and then looks
3000 	 *         in the new bucket.
3001 	 */
3002 	for (size_t i = 0; i < oldsize; i++) {
3003 		struct smrq_slink *first, *link, *next;
3004 		hw_lck_ptr_t *head;
3005 		uint32_t hash;
3006 		size_t n = 0;
3007 
3008 		link = first = hw_lck_ptr_lock(&oldarray[i], &smr_shash_grp);
3009 
3010 		while (!__smr_shash_is_stop(link)) {
3011 			flat_queue[n++ % FLAT_SIZE] = link;
3012 			link = smr_serialized_load(&link->next);
3013 		}
3014 
3015 		while (n-- > 0) {
3016 			for (size_t j = (n % FLAT_SIZE) + 1; j-- > 0;) {
3017 				link = flat_queue[j];
3018 				hash = traits->obj_hash(link, newseed);
3019 				head = &newarray[hash >> newshift];
3020 				next = hw_lck_ptr_lock_nopreempt(head,
3021 				    &smr_shash_grp);
3022 				smr_serialized_store_relaxed(&link->next, next);
3023 				hw_lck_ptr_unlock_nopreempt(head, link,
3024 				    &smr_shash_grp);
3025 			}
3026 			n &= ~(FLAT_SIZE - 1);
3027 
3028 			/*
3029 			 * If there were more than FLAT_SIZE elements in the
3030 			 * chain (which is super unlikely and in many ways,
3031 			 * worrisome), then we need to repopulate
3032 			 * the flattened queue array for each run.
3033 			 *
3034 			 * This is O(n^2) but we have worse problems anyway
3035 			 * if we ever hit this path.
3036 			 */
3037 			if (__improbable(n > 0)) {
3038 				link = first;
3039 				for (size_t j = 0; j < n - FLAT_SIZE; j++) {
3040 					link = smr_serialized_load(&link->next);
3041 				}
3042 
3043 				flat_queue[0] = link;
3044 				for (size_t j = 1; j < FLAT_SIZE; j++) {
3045 					link = smr_serialized_load(&link->next);
3046 					flat_queue[j] = link;
3047 				}
3048 			}
3049 		}
3050 
3051 		hw_lck_ptr_unlock(&oldarray[i], SMRSH_MIGRATED, &smr_shash_grp);
3052 	}
3053 
3054 	/*
3055 	 * Step 3: deallocate the old array of buckets,
3056 	 *         making sure to hide it from readers.
3057 	 */
3058 
3059 	state.curshift = state.newshift;
3060 	state.curidx   = state.newidx;
3061 	os_atomic_store(&smrh->smrsh_state, state, release);
3062 
3063 	smr_synchronize(traits->domain);
3064 
3065 	os_atomic_store(&smrh->smrsh_array[oldidx], NULL, relaxed);
3066 	for (size_t i = 0; i < oldsize; i++) {
3067 		__smr_shash_bucket_destroy(&oldarray[i]);
3068 	}
3069 	smr_hash_free_array((struct smrq_slist_head *)oldarray, oldsize);
3070 
3071 	return KERN_SUCCESS;
3072 }
3073 
3074 static void
3075 __smr_shash_rehash(thread_call_param_t arg0, thread_call_param_t arg1)
3076 {
3077 	struct smr_shash *smrh   = arg0;
3078 	smrh_traits_t     traits = arg1;
3079 	smrsh_rehash_t    reason;
3080 	smrsh_state_t     state;
3081 	uint64_t          count;
3082 	kern_return_t     kr;
3083 
3084 	do {
3085 		reason = os_atomic_xchg(&smrh->smrsh_rehashing,
3086 		    SMRSH_REHASH_RUNNING, relaxed);
3087 
3088 		state  = os_atomic_load(&smrh->smrsh_state, relaxed);
3089 		count  = __smr_shash_count(smrh);
3090 
3091 		if (__smr_shash_should_grow(smrh, state, count)) {
3092 			kr = __smr_shash_rehash_with_target(smrh, state,
3093 			    state.curshift - 1, traits);
3094 		} else if (__smr_shash_should_shrink(smrh, state, count)) {
3095 			kr = __smr_shash_rehash_with_target(smrh, state,
3096 			    state.curshift + 1, traits);
3097 		} else if (reason & SMRSH_REHASH_RESEED) {
3098 			kr = __smr_shash_rehash_with_target(smrh, state,
3099 			    state.curshift, traits);
3100 		} else {
3101 			kr = KERN_SUCCESS;
3102 		}
3103 
3104 		if (kr == KERN_RESOURCE_SHORTAGE) {
3105 			uint64_t deadline;
3106 
3107 			os_atomic_or(&smrh->smrsh_rehashing, reason, relaxed);
3108 			nanoseconds_to_deadline(NSEC_PER_MSEC, &deadline);
3109 			thread_call_enter1_delayed(smrh->smrsh_callout,
3110 			    arg1, deadline);
3111 			break;
3112 		}
3113 	} while (!os_atomic_cmpxchg(&smrh->smrsh_rehashing,
3114 	    SMRSH_REHASH_RUNNING, SMRSH_REHASH_NONE, relaxed));
3115 }
3116 
3117 void
3118 smr_shash_init(struct smr_shash *smrh, smrsh_policy_t policy, size_t min_size)
3119 {
3120 	smrsh_state_t state;
3121 	hw_lck_ptr_t *array;
3122 	uint8_t shift;
3123 	size_t size;
3124 
3125 	switch (policy) {
3126 	case SMRSH_COMPACT:
3127 		if (min_size < 2) {
3128 			min_size = 2;
3129 		}
3130 		break;
3131 	default:
3132 		if (min_size < 16) {
3133 			min_size = 16;
3134 		}
3135 		break;
3136 	}
3137 
3138 	switch (policy) {
3139 	case SMRSH_COMPACT:
3140 		size = MIN(2, min_size);
3141 		break;
3142 	case SMRSH_BALANCED:
3143 	case SMRSH_BALANCED_NOSHRINK:
3144 		size = MIN(16, min_size);
3145 		break;
3146 	case SMRSH_FASTEST:
3147 		size = min_size;
3148 		break;
3149 	}
3150 
3151 	if (size > KALLOC_SAFE_ALLOC_SIZE / sizeof(*array)) {
3152 		size = KALLOC_SAFE_ALLOC_SIZE / sizeof(*array);
3153 	}
3154 	shift = (uint8_t)__builtin_clz((uint32_t)(size - 1));
3155 	size  = (~0u >> shift) + 1;
3156 	array = (hw_lck_ptr_t *)smr_hash_alloc_array(size);
3157 	for (size_t i = 0; i < size; i++) {
3158 		__smr_shash_bucket_init(&array[i]);
3159 	}
3160 
3161 	state = (smrsh_state_t){
3162 		.curshift = shift,
3163 		.newshift = shift,
3164 	};
3165 	*smrh = (struct smr_shash){
3166 		.smrsh_array[0]  = array,
3167 		.smrsh_seed[0]   = (uint32_t)early_random(),
3168 		.smrsh_state     = state,
3169 		.smrsh_policy    = policy,
3170 		.smrsh_min_shift = (uint8_t)flsll(min_size - 1),
3171 	};
3172 	counter_alloc(&smrh->smrsh_count);
3173 	smrh->smrsh_callout  = thread_call_allocate_with_options(__smr_shash_rehash,
3174 	    smrh, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
3175 }
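/*
 * Editor's usage sketch (hypothetical caller): scalable tables are
 * typically singletons initialized once, with resizing left entirely to
 * the thread call set up above:
 *
 *     static struct smr_shash task_table;
 *
 *     smr_shash_init(&task_table, SMRSH_BALANCED, 256);
 *
 * Note that with SMRSH_BALANCED the initial array is 16 buckets even with
 * a larger hint (see the MIN() above); min_size only sets smrsh_min_shift,
 * the floor below which __smr_shash_should_shrink() will not go.
 * Tear-down goes through __smr_shash_destroy() below, which also releases
 * the counter and the thread call allocated here.
 */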
3176 
3177 void
3178 __smr_shash_destroy(
3179 	struct smr_shash       *smrh,
3180 	smrh_traits_t           traits,
3181 	void                  (^free)(void *))
3182 {
3183 	smrsh_state_t state;
3184 	hw_lck_ptr_t *array;
3185 	size_t size;
3186 
3187 	thread_call_cancel_wait(smrh->smrsh_callout);
3188 
3189 	state = os_atomic_load(&smrh->smrsh_state, dependency);
3190 	assert(state.curidx == state.newidx);
3191 	assert(__smr_shash_load_array(smrh, 1 - state.curidx) == NULL);
3192 	size  = __smr_shash_cursize(state);
3193 	array = __smr_shash_load_array(smrh, state.curidx);
3194 
3195 	if (free) {
3196 		for (size_t i = 0; i < size; i++) {
3197 			struct smrq_slink *link, *next;
3198 
3199 			next = hw_lck_ptr_value(&array[i]);
3200 			while (!__smr_shash_is_stop(next)) {
3201 				link = next;
3202 				next = smr_serialized_load(&link->next);
3203 				free(__smrht_link_to_obj(traits, link));
3204 			}
3205 		}
3206 	}
3207 	for (size_t i = 0; i < size; i++) {
3208 		__smr_shash_bucket_destroy(&array[i]);
3209 	}
3210 
3211 	thread_call_free(smrh->smrsh_callout);
3212 	counter_free(&smrh->smrsh_count);
3213 	smr_hash_free_array((struct smrq_slist_head *)array, size);
3214 	bzero(smrh, sizeof(*smrh));
3215 }
3216