1 /*
2 * Copyright (c) 2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <kern/locks_internal.h>
30 #include <kern/cpu_data.h>
31 #include <kern/machine.h>
32 #include <kern/mpsc_queue.h>
33 #include <kern/percpu.h>
34 #include <kern/sched.h>
35 #include <kern/smr.h>
36 #include <kern/smr_hash.h>
37 #include <kern/thread.h>
38 #include <kern/zalloc.h>
39 #include <machine/commpage.h>
40 #include <os/hash.h>
41
42
43 #pragma mark - SMR domains
44
45 /*
46 * This SMR scheme is directly derived from FreeBSD's "Global Unbounded Sequences".
47 *
48 * Major differences are:
49 *
50 * - only eager clocks are implemented (no lazy, no implicit)
51 *
52 *
53 * SMR clocks have 3 state machines interacting at any given time:
54 *
55 * 1. reader critical sections
56 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~
57 *
58 * Each CPU can disable preemption and do this sequence:
59 *
60 * CPU::c_rd_seq = GLOBAL::c_wr_seq;
61 *
62 * < unfortunate place to receive a long IRQ > [I]
63 *
64 * os_atomic_thread_fence(seq_cst); [R1]
65 *
66 * {
67 * // critical section
68 * }
69 *
70 * os_atomic_store(&CPU::c_rd_seq, INVALID, release); [R2]
71 *
72 *
73 *
74 * 2. writer sequence advances
75 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~
76 *
77 * Each writer can increment the global write sequence
78 * at any given time:
79 *
80 * os_atomic_add(&GLOBAL::c_wr_seq, SMR_SEQ_INC, release); [W]
81 *
82 *
83 *
84 * 3. synchronization sequence: poll/wait/scan
85 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
86 *
87 * This state machine synchronizes with the other two in order to decide
88 * if a given "goal" is in the past. Only the cases where the call
89 * is successful are interesting for barrier purposes, so we will focus
90 * on the cases that do not take an early return for failures.
91 *
92 * a. __smr_poll:
93 *
94 * rd_seq = os_atomic_load(&GLOBAL::c_rd_seq, acquire); [S1]
95 * if (goal < rd_seq) SUCCESS.
96 * wr_seq = os_atomic_load(&GLOBAL::c_wr_seq, relaxed);
97 *
98 * b. __smr_scan
99 *
100 * os_atomic_thread_fence(seq_cst) [S2]
101 *
102 * observe the minimum possible CPU::c_rd_seq "min_rd_seq"
103 * value, or wr_seq if no CPU was in a critical section.
104 * (possibly spinning until it satisfies "goal")
105 *
106 * c. __smr_rd_advance
107 *
108 * cur_rd_seq = load_exclusive(&GLOBAL::c_rd_seq);
109 * os_atomic_thread_fence(seq_cst); [S3]
110 * if (min_rd_seq > cur_rd_seq) {
111 * store_exclusive(&GLOBAL::c_rd_seq, min_rd_seq);
112 * }
113 *
114 *
115 * One sentence summary
116 * ~~~~~~~~~~~~~~~~~~~~
117 *
118 * A simplistic one-sentence summary of the algorithm is that __smr_scan()
119 * works really hard to insert itself in the timeline of write sequences and
120 * observe a reasonable bound for the first safe-to-reclaim sequence, and
121 * issues [S3] to sequence everything around "c_rd_seq" (via [S3] -> [S1]):
122 *
123 * GLOBAL::c_rd_seq GLOBAL::c_wr_seq
124 * v v
125 * ──────────────────────┬────────────────┬─────────────────────
126 * ... safe to reclaim │ deferred │ future ...
127 * ──────────────────────┴────────────────┴─────────────────────
128 *
129 *
130 * Detailed explanation
131 * ~~~~~~~~~~~~~~~~~~~~
132 *
133 * [W] -> [R1] establishes a "happens before" relationship between a given
134 * writer and this critical section. The loaded GLOBAL::c_wr_seq might
135 * however be stale with respect to the one [R1] really synchronizes with
136 * (see [I] explanation below).
137 *
138 *
139 * [R1] -> [S2] establishes a "happens before" relationship between all the
140 * active critical sections and the scanner.
141 * It lets us compute the oldest possible sequence pinned by an active
142 * critical section.
143 *
144 *
145 * [R2] -> [S3] establishes a "happens before" relationship between all the
146 * inactive critical sections and the scanner.
147 *
148 *
149 * [S3] -> [S1] is the typical expected fastpath: when the caller can decide
150 * that its goal is older than the last update an __smr_rd_advance() did.
151 * Note that [S3] doubles as an "[S1]" when two __smr_scan() race each other
152 * and one of them finishes last but observed a "worse" read sequence.
153 *
154 *
155 * [W], [S3] -> [S1] is the last crucial property: all updates to the global
156 * clock are totally ordered because they update the entire 128-bit state
157 * every time with an RMW. This guarantees that __smr_poll() can't load
158 * an `rd_seq` that is younger than the `wr_seq` it loads next.
159 *
160 *
161 * [I] __smr_enter() can unfortunately also be delayed after observing
162 * a given write sequence and right before issuing [R1] (the [I] window).
163 *
164 * However, for a read sequence to have moved past what __smr_enter() observed,
165 * it means another __smr_scan() didn't observe the store to CPU::c_rd_seq
166 * made by __smr_enter() and thought the section was inactive.
167 *
168 * This can only happen if the scan's [S2] was issued before the delayed
169 * __smr_enter() [R1] (during the [I] window).
170 *
171 * As a consequence the outcome of that scan can be accepted as the "real"
172 * write sequence __smr_enter() should have observed.
173 *
174 *
175 * Litmus tests
176 * ~~~~~~~~~~~~
177 *
178 * This is the proof of [W] -> [R1] -> [S2] being established properly:
179 * - P0 sets a global and calls smr_synchronize()
180 * - P1 does smr_enter() and loads the global
181 *
182 * AArch64 MP
183 * {
184 * global = 0;
185 * wr_seq = 123;
186 * p1_rd_seq = 0;
187 *
188 * 0:x0 = global; 0:x1 = wr_seq; 0:x2 = p1_rd_seq;
189 * 1:x0 = global; 1:x1 = wr_seq; 1:x2 = p1_rd_seq;
190 * }
191 * P0 | P1 ;
192 * MOV X8, #2 | LDR X8, [X1] ;
193 * STR X8, [X0] | STR X8, [X2] ;
194 * LDADDL X8, X9, [X1] | DMB SY ;
195 * DMB SY | LDR X10, [X0] ;
196 * LDR X10, [X2] | ;
197 * exists (0:X10 = 0 /\ 1:X8 = 123 /\ 1:X10 = 0)
198 *
199 *
200 * This is the proof that deferred advances are also correct:
201 * - P0 sets a global and does a smr_deferred_advance()
202 * - P1 does an smr_synchronize() and reads the global
203 *
204 * AArch64 MP
205 * {
206 * global = 0;
207 * wr_seq = 123;
208 *
209 * 0:x0 = global; 0:x1 = wr_seq; 0:x2 = 2;
210 * 1:x0 = global; 1:x1 = wr_seq; 1:x2 = 2;
211 * }
212 * P0 | P1 ;
213 * STR X2, [X0] | LDADDL X2, X9, [X1] ;
214 * DMB SY | DMB SY ;
215 * LDR X9, [X1] | LDR X10, [X0] ;
216 * ADD X9, X9, X2 | ;
217 * exists (0:X9 = 125 /\ 1:X9 = 123 /\ 1:X10 = 0)
218 *
219 */
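/*
 * Illustrative usage
 * ~~~~~~~~~~~~~~~~~~
 *
 * A minimal sketch (not taken from a real caller; `gobj`, `make_obj()` and
 * `obj_free()` are hypothetical) of how the three state machines above are
 * driven through the public API, using the default "smr_system" domain:
 *
 *	// reader: state machine 1 (__smr_enter / __smr_leave)
 *	smr_enter(&smr_system);
 *	obj = os_atomic_load(&gobj, acquire);
 *	// ... read-only use of obj ...
 *	smr_leave(&smr_system);
 *
 *	// updater: state machines 2 and 3
 *	old = os_atomic_xchg(&gobj, make_obj(), release);
 *	smr_synchronize(&smr_system);   // advance [W] + poll/wait/scan
 *	obj_free(old);                  // no reader can observe `old` anymore
 */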
220
221 /*!
222 * @struct smr_worker
223 *
224 * @brief
225 * Structure tracking the per-cpu SMR workers state.
226 *
227 * @discussion
228 * This structure is system wide and global and is used to track
229 * the various active SMR domains at the granularity of a CPU.
230 *
231 * Each structure has an associated thread which is responsible
232 * for the forward progress the @c smr_call() and @c smr_barrier()
233 * interfaces.
234 *
235 * It also tracks all the active, non-stalled, sleepable SMR sections.
236 */
237 struct smr_worker {
238 /*
239 * The thread for this worker,
240 * and a convenience pointer to the processor it is bound to.
241 */
242 struct thread *thread;
243 struct processor *processor;
244
245 /*
246 * Thread binding/locking logic:
247 *
248 * If the worker thread is running on its canonical CPU,
249 * then locking to access the various SMR per-cpu data
250 * structures it is draining is just preemption disablement.
251 *
252 * However, if it is currently not bound to its canonical
253 * CPU because the CPU has been offlined or de-recommended,
254 * then a lock which serializes with the CPU going online
255 * again is being used.
256 */
257 struct waitq waitq;
258 smr_cpu_reason_t detach_reason;
259
260 #if CONFIG_QUIESCE_COUNTER
261 /*
262 * Currently active quiescent generation for this processor,
263 * and the last timestamp when a scan of all cores was performed.
264 */
265 smr_seq_t rd_quiesce_seq;
266 #endif
267
268 /*
269 * List of all the active sleepable sections that haven't
270 * been stalled.
271 */
272 struct smrq_list_head sect_queue;
273 struct thread *sect_waiter;
274
275 /*
276 * Queue of SMR domains with pending smr_call()
277 * callouts to drain.
278 *
279 * This uses an ageing strategy in order to amortize
280 * SMR clock updates:
281 *
282 * - the "old" queue have domains whose callbacks have
283 * a committed and aged sequence,
284 * - the "age" queue have domains whose callbacks have
285 * a commited but fresh sequence and need ageing,
286 * - the "cur" queue have domains whose callbacks have
287 * a sequence in the future and need for it to be committed.
288 */
289 struct smr_pcpu *whead;
290 struct smr_pcpu **wold_tail;
291 struct smr_pcpu **wage_tail;
292 struct smr_pcpu **wcur_tail;
293 uint64_t drain_ctime;
294
295 /*
296 * Queue of smr_barrier() calls in flight,
297 * that will be picked up by the worker thread
298 * to enqueue as smr_call() entries in their
299 * respective per-CPU data structures.
300 */
301 struct mpsc_queue_head barrier_queue;
302 } __attribute__((aligned(64)));
303
304
305 typedef struct smr_pcpu {
306 /*
307 * CPU private cacheline.
308 *
309 * Nothing other than the CPU this state is made for
310 * ever writes to this cacheline.
311 *
312 * It holds the epoch activity witness (rd_seq), and
313 * the local smr_call() queue, which is structured this way:
314 *
315 * head -> n1 -> n2 -> n3 -> n4 -> ... -> ni -> ... -> nN -> NULL
316 * ^ ^ ^
317 * qold_tail -------------' | |
318 * qage_tail --------------------------' |
319 * qcur_tail ---------------------------------------------'
320 *
321 * - the "old" queue can be reclaimed once qold_seq is past,
322 * qold_seq is always a committed sequence.
323 * - the "age" queue can be reclaimed once qage_seq is past,
324 * qage_seq might not be committed yet.
325 * - the "cur" queue has an approximate size of qcur_size bytes,
326 * and a length of qcur_cnt callbacks.
327 */
328
329 smr_seq_t c_rd_seq; /* might have SMR_SEQ_SLEEPABLE set */
330
331 smr_node_t qhead;
332
333 smr_seq_t qold_seq;
334 smr_node_t *qold_tail;
335
336 smr_seq_t qage_seq;
337 smr_node_t *qage_tail;
338
339 uint32_t qcur_size;
340 uint32_t qcur_cnt;
341 smr_node_t *qcur_tail;
342
343 uint8_t __cacheline_sep[0];
344
345 /*
346 * Drain queue.
347 *
348 * This is used to drive smr_call() via the smr worker threads.
349 * If the SMR domain is not using smr_call() or smr_barrier(),
350 * this isn't used.
351 */
352 struct smr *drain_smr;
353 struct smr_pcpu *drain_next;
354 uint16_t __check_cpu;
355 uint8_t __check_reason;
356 uint8_t __check_list;
357
358 /*
359 * Stalled queue.
360 *
361 * Stalled sections are enqueued onto this queue by the scheduler
362 * when their thread blocks (see smr_mark_active_trackers_stalled()).
363 *
364 * If the SMR domain is not sleepable, then this isn't used.
365 *
366 * This list is protected by a lock.
367 *
368 * When there are stalled sections, stall_rd_seq contains
369 * the oldest active stalled sequence number.
370 *
371 * When threads want to expedite a stalled section, they set
372 * stall_waiter_goal to the sequence number they are waiting
373 * for and block via turnstile on the oldest stalled section.
374 */
375 hw_lck_ticket_t stall_lock;
376 smr_seq_t stall_rd_seq;
377 smr_seq_t stall_waiter_goal;
378 struct smrq_tailq_head stall_queue;
379 struct turnstile *stall_ts;
380 } __attribute__((aligned(128))) * smr_pcpu_t;
381
382 static_assert(offsetof(struct smr_pcpu, __cacheline_sep) == 64);
383 static_assert(sizeof(struct smr_pcpu) == 128);
384
385 #define CPU_CHECKIN_MIN_INTERVAL_US 5000 /* 5ms */
386 #define CPU_CHECKIN_MIN_INTERVAL_MAX_US USEC_PER_SEC /* 1s */
387 static uint64_t cpu_checkin_min_interval;
388 static uint32_t cpu_checkin_min_interval_us;
389
390 /*! the amount of memory pending retirement that causes a forceful flush */
391 #if XNU_TARGET_OS_OSX
392 static TUNABLE(vm_size_t, smr_call_size_cap, "smr_call_size_cap", 256 << 10);
393 static TUNABLE(vm_size_t, smr_call_cnt_cap, "smr_call_cnt_cap", 128);
394 #else
395 static TUNABLE(vm_size_t, smr_call_size_cap, "smr_call_size_cap", 64 << 10);
396 static TUNABLE(vm_size_t, smr_call_cnt_cap, "smr_call_cnt_cap", 32);
397 #endif
398 /* how long __smr_wait_for_oncore() busy-spins before going the expensive route */
399 static TUNABLE(uint32_t, smr_wait_spin_us, "smr_wait_spin_us", 20);
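/*
 * The caps and spin timeout above are TUNABLEs, so they can be overridden at
 * boot, e.g. with boot-args such as "smr_call_size_cap=131072
 * smr_wait_spin_us=50" (illustrative values).
 */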
400
401 static LCK_GRP_DECLARE(smr_lock_grp, "smr");
402 static struct smr_worker PERCPU_DATA(smr_worker);
403 static struct smrq_tailq_head smr_domains = SMRQ_TAILQ_INITIALIZER(smr_domains);
404
405 SMR_DEFINE_FLAGS(smr_system, "system", SMR_NONE);
406 SMR_DEFINE_FLAGS(smr_system_sleepable, "system (sleepable)", SMR_SLEEPABLE);
407
408
409 #pragma mark SMR domains: init & helpers
410
411 #define SMR_PCPU_NOT_QUEUED ((struct smr_pcpu *)-1)
412
413 __attribute__((always_inline, overloadable))
414 static inline smr_pcpu_t
415 __smr_pcpu(smr_t smr, int cpu)
416 {
417 return &smr->smr_pcpu[cpu];
418 }
419
420 __attribute__((always_inline, overloadable))
421 static inline smr_pcpu_t
422 __smr_pcpu(smr_t smr)
423 {
424 return __smr_pcpu(smr, cpu_number());
425 }
426
427 static inline bool
428 __smr_pcpu_queued(smr_pcpu_t pcpu)
429 {
430 return pcpu->drain_next != SMR_PCPU_NOT_QUEUED;
431 }
432
433 static inline void
434 __smr_pcpu_set_not_queued(smr_pcpu_t pcpu)
435 {
436 pcpu->drain_next = SMR_PCPU_NOT_QUEUED;
437 }
438
439 static inline void
440 __smr_pcpu_associate(smr_t smr, smr_pcpu_t pcpu)
441 {
442 zpercpu_foreach_cpu(cpu) {
443 pcpu[cpu].qold_tail = &pcpu[cpu].qhead;
444 pcpu[cpu].qage_tail = &pcpu[cpu].qhead;
445 pcpu[cpu].qcur_tail = &pcpu[cpu].qhead;
446
447 pcpu[cpu].drain_smr = smr;
448 __smr_pcpu_set_not_queued(&pcpu[cpu]);
449 hw_lck_ticket_init(&pcpu[cpu].stall_lock, &smr_lock_grp);
450 smrq_init(&pcpu[cpu].stall_queue);
451 }
452
453 os_atomic_store(&smr->smr_pcpu, pcpu, release);
454 }
455
456 static inline event64_t
457 __smrw_oncore_event(struct smr_worker *smrw)
458 {
459 return CAST_EVENT64_T(&smrw->sect_queue);
460 }
461
462 static inline event64_t
463 __smrw_drain_event(struct smr_worker *smrw)
464 {
465 return CAST_EVENT64_T(&smrw->whead);
466 }
467
468 static inline processor_t
469 __smrw_drain_bind_target(struct smr_worker *smrw)
470 {
471 return smrw->detach_reason ? PROCESSOR_NULL : smrw->processor;
472 }
473
474 static inline void
475 __smrw_lock(struct smr_worker *smrw)
476 {
477 waitq_lock(&smrw->waitq);
478 }
479
480 static inline void
481 __smrw_unlock(struct smr_worker *smrw)
482 {
483 waitq_unlock(&smrw->waitq);
484 }
485
486 /*!
487 * @function __smrw_wakeup_and_unlock()
488 *
489 * @brief
490 * Wakes up (with binding) the SMR worker.
491 *
492 * @discussion
493 * Wakeup the worker thread and bind it to the proper processor
494 * as a side effect.
495 *
496 * This function must be called with interrupts disabled.
497 */
498 static bool
499 __smrw_wakeup_and_unlock(struct smr_worker *smrw)
500 {
501 thread_t thread;
502
503 assert(!ml_get_interrupts_enabled());
504
505 thread = waitq_wakeup64_identify_locked(&smrw->waitq,
506 __smrw_drain_event(smrw), THREAD_AWAKENED, WAITQ_UNLOCK);
507
508 if (thread != THREAD_NULL) {
509 assert(thread == smrw->thread);
510
511 waitq_resume_and_bind_identified_thread(&smrw->waitq,
512 thread, __smrw_drain_bind_target(smrw),
513 THREAD_AWAKENED, WAITQ_WAKEUP_DEFAULT);
514 }
515
516 return thread != THREAD_NULL;
517 }
518
519 static void
520 __smr_call_drain(smr_node_t head)
521 {
522 smr_node_t node;
523
524 while ((node = head) != NULL) {
525 head = node->smrn_next;
526 node->smrn_next = NULL;
527 node->smrn_cb(node);
528 }
529 }
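/*
 * Illustrative callback shape (a sketch; `struct foo` and `foo_free_cb` are
 * hypothetical): __smr_call_drain() clears smrn_next before invoking the
 * callback, which typically recovers its enclosing object and frees it.
 *
 *	struct foo {
 *		...
 *		struct smr_node foo_node;
 *	};
 *
 *	static void
 *	foo_free_cb(struct smr_node *node)
 *	{
 *		struct foo *f = __container_of(node, struct foo, foo_node);
 *
 *		kfree_type(struct foo, f);
 *	}
 */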
530
531 __startup_func
532 void
533 __smr_domain_init(smr_t smr)
534 {
535 smr_pcpu_t pcpu;
536 vm_size_t size;
537
538 if (startup_phase < STARTUP_SUB_TUNABLES) {
539 smr_seq_t *rd_seqp = &smr->smr_early;
540
541 /*
542 * This is a big cheat, but before the EARLY_BOOT phase,
543 * all smr_* APIs that would access anything past the rd_seq
544 * will return early.
545 */
546 pcpu = __container_of(rd_seqp, struct smr_pcpu, c_rd_seq);
547 smr->smr_pcpu = pcpu - cpu_number();
548 assert(&__smr_pcpu(smr)->c_rd_seq == &smr->smr_early);
549 } else {
550 size = zpercpu_count() * sizeof(struct smr_pcpu);
551 pcpu = zalloc_permanent(size, ZALIGN(struct smr_pcpu));
552
553 __smr_pcpu_associate(smr, pcpu);
554 }
555 }
556
557 smr_t
558 smr_domain_create(smr_flags_t flags, const char *name)
559 {
560 smr_pcpu_t pcpu;
561 smr_t smr;
562
563 smr = kalloc_type(struct smr, Z_WAITOK | Z_ZERO | Z_NOFAIL);
564 pcpu = kalloc_type(struct smr_pcpu, zpercpu_count(),
565 Z_WAITOK | Z_ZERO | Z_NOFAIL);
566
567 smr->smr_clock.s_rd_seq = SMR_SEQ_INIT;
568 smr->smr_clock.s_wr_seq = SMR_SEQ_INIT;
569 smr->smr_flags = flags;
570 static_assert(sizeof(struct smr) ==
571 offsetof(struct smr, smr_name) + SMR_NAME_MAX);
572 strlcpy(smr->smr_name, name, sizeof(smr->smr_name));
573
574 __smr_pcpu_associate(smr, pcpu);
575
576 return smr;
577 }
578
579 void
580 smr_domain_free(smr_t smr)
581 {
582 smr_barrier(smr);
583
584 zpercpu_foreach_cpu(cpu) {
585 smr_pcpu_t pcpu = __smr_pcpu(smr, cpu);
586
587 assert(pcpu->qhead == NULL);
588 hw_lck_ticket_destroy(&pcpu->stall_lock, &smr_lock_grp);
589 }
590
591 kfree_type(struct smr_pcpu, zpercpu_count(), smr->smr_pcpu);
592 kfree_type(struct smr, smr);
593 }
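/*
 * Illustrative domain lifecycle (a sketch; `my_smr` is hypothetical):
 *
 *	smr_t my_smr = smr_domain_create(SMR_NONE, "my subsystem");
 *	// ... use smr_enter()/smr_leave(), smr_call(), ... on my_smr ...
 *	smr_domain_free(my_smr);   // issues a final smr_barrier()
 */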
594
595
596 #pragma mark SMR domains: enter / leave
597
598 bool
599 smr_entered(smr_t smr)
600 {
601 thread_t self = current_thread();
602 smr_tracker_t t;
603
604 if (lock_preemption_level_for_thread(self) &&
605 __smr_pcpu(smr)->c_rd_seq != SMR_SEQ_INVALID) {
606 return true;
607 }
608
609 if (smr->smr_flags & SMR_SLEEPABLE) {
610 smrq_serialized_foreach(t, &self->smr_stack, smrt_stack) {
611 if (t->smrt_domain == smr) {
612 return true;
613 }
614 }
615 }
616
617 return false;
618 }
619
620 __attribute__((always_inline))
621 bool
622 smr_entered_cpu_noblock(smr_t smr, int cpu)
623 {
624 assert((smr->smr_flags & SMR_SLEEPABLE) == 0);
625 return __smr_pcpu(smr, cpu)->c_rd_seq != SMR_SEQ_INVALID;
626 }
627
628 __attribute__((always_inline))
629 static smr_seq_t
630 __smr_enter(smr_t smr, smr_pcpu_t pcpu, smr_seq_t sleepable)
631 {
632 smr_seq_t s_wr_seq;
633 smr_seq_t old_seq;
634
635 /*
636 * It is possible to have a long delay between loading the s_wr_seq
637 * and storing it to the percpu copy of it.
638 *
639 * It is unlikely but possible that by that time the s_rd_seq has advanced
640 * ahead of what we will store. This however is still safe
641 * and handled in __smr_scan().
642 *
643 * On Intel, to achieve the ordering we want, we could use a store
644 * followed by an mfence, or any RMW (XCHG, XADD, CMPXCHG, ...).
645 * XADD is just the fastest instruction of the alternatives,
646 * but it will only ever add to '0'.
647 */
648 s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
649 #if __x86_64__
650 /* [R1] */
651 old_seq = os_atomic_add_orig(&pcpu->c_rd_seq, s_wr_seq | sleepable, seq_cst);
652 #else
653 old_seq = pcpu->c_rd_seq;
654 os_atomic_store(&pcpu->c_rd_seq, s_wr_seq | sleepable, relaxed);
655 os_atomic_thread_fence(seq_cst); /* [R1] */
656 #endif
657 assert(old_seq == SMR_SEQ_INVALID);
658
659 return s_wr_seq;
660 }
661
662 __attribute__((always_inline))
663 static void
664 __smr_leave(smr_pcpu_t pcpu)
665 {
666 /* [R2] */
667 os_atomic_store(&pcpu->c_rd_seq, SMR_SEQ_INVALID, release);
668 }
669
670 __attribute__((always_inline))
671 void
672 smr_enter(smr_t smr)
673 {
674 disable_preemption();
675 __smr_enter(smr, __smr_pcpu(smr), 0);
676 }
677
678 __attribute__((always_inline))
679 void
680 smr_leave(smr_t smr)
681 {
682 __smr_leave(__smr_pcpu(smr));
683 enable_preemption();
684 }
685
686 void
687 smr_enter_sleepable(smr_t smr, smr_tracker_t tracker)
688 {
689 thread_t self = current_thread();
690 struct smr_worker *smrw;
691 smr_pcpu_t pcpu;
692
693 assert(smr->smr_flags & SMR_SLEEPABLE);
694
695 lock_disable_preemption_for_thread(self);
696 lck_rw_lock_count_inc(self, smr);
697
698 pcpu = __smr_pcpu(smr);
699 smrw = PERCPU_GET(smr_worker);
700
701 tracker->smrt_domain = smr;
702 tracker->smrt_seq = __smr_enter(smr, pcpu, SMR_SEQ_SLEEPABLE);
703 smrq_serialized_insert_head_relaxed(&smrw->sect_queue, &tracker->smrt_link);
704 smrq_serialized_insert_head_relaxed(&self->smr_stack, &tracker->smrt_stack);
705 tracker->smrt_ctid = 0;
706 tracker->smrt_cpu = -1;
707
708 lock_enable_preemption();
709 }
710
711 __attribute__((always_inline))
712 static void
713 __smr_wake_oncore_sleepers(struct smr_worker *smrw)
714 {
715 /*
716 * prevent reordering of making the list empty and checking for waiters.
717 */
718 if (__improbable(os_atomic_load(&smrw->sect_waiter, compiler_acq_rel))) {
719 if (smrq_empty(&smrw->sect_queue)) {
720 os_atomic_store(&smrw->sect_waiter, NULL, relaxed);
721 waitq_wakeup64_all(&smrw->waitq,
722 __smrw_oncore_event(smrw), THREAD_AWAKENED,
723 WAITQ_WAKEUP_DEFAULT);
724 }
725 }
726 }
727
728 void
729 smr_ack_ipi(void)
730 {
731 /*
732 * see __smr_wait_for_oncore(): if at the time of the IPI ack
733 * the list is empty and there is still a waiter, wake it up.
734 *
735 * If the queue is not empty, then when smr_leave_sleepable()
736 * runs it can't possibly fail to observe smrw->sect_waiter
737 * being non NULL and will do the wakeup then.
738 */
739 __smr_wake_oncore_sleepers(PERCPU_GET(smr_worker));
740 }
741
742 void
743 smr_mark_active_trackers_stalled(thread_t self)
744 {
745 struct smr_worker *smrw = PERCPU_GET(smr_worker);
746 int cpu = cpu_number();
747 smr_tracker_t t;
748
749 /* called at splsched */
750
751 smrq_serialized_foreach_safe(t, &smrw->sect_queue, smrt_link) {
752 smr_t smr = t->smrt_domain;
753 smr_pcpu_t pcpu;
754
755 pcpu = __smr_pcpu(smr, cpu);
756
757 t->smrt_ctid = self->ctid;
758 t->smrt_cpu = cpu;
759
760 hw_lck_ticket_lock_nopreempt(&pcpu->stall_lock, &smr_lock_grp);
761
762 /*
763 * Transfer the section to the stalled queue,
764 * and _then_ leave the regular one.
765 *
766 * A store-release is sufficient to order these stores,
767 * and guarantee that __smr_scan() can't fail to observe
768 * both the @c rd_seq and @c stall_rd_seq during a transfer
769 * of a stalled section that was active when it started.
770 */
771 if (smrq_empty(&pcpu->stall_queue)) {
772 os_atomic_store(&pcpu->stall_rd_seq, t->smrt_seq, relaxed);
773 }
774 os_atomic_store(&pcpu->c_rd_seq, SMR_SEQ_INVALID, release);
775
776 smrq_serialized_insert_tail_relaxed(&pcpu->stall_queue, &t->smrt_link);
777
778 hw_lck_ticket_unlock_nopreempt(&pcpu->stall_lock);
779 }
780
781 smrq_init(&smrw->sect_queue);
782
783 __smr_wake_oncore_sleepers(smrw);
784 }
785
786
787 __attribute__((noinline))
788 static void
789 __smr_leave_stalled(smr_t smr, smr_tracker_t tracker, thread_t self)
790 {
791 smr_seq_t new_stall_seq = SMR_SEQ_INVALID;
792 smr_tracker_t first = NULL;
793 smr_pcpu_t pcpu;
794 bool progress;
795
796 pcpu = __smr_pcpu(smr, tracker->smrt_cpu);
797
798 hw_lck_ticket_lock_nopreempt(&pcpu->stall_lock, &smr_lock_grp);
799
800 progress = smrq_serialized_first(&pcpu->stall_queue,
801 struct smr_tracker, smrt_link) == tracker;
802
803 smrq_serialized_remove(&self->smr_stack, &tracker->smrt_stack);
804 smrq_serialized_remove(&pcpu->stall_queue, &tracker->smrt_link);
805 bzero(tracker, sizeof(*tracker));
806
807 if (progress) {
808 if (!smrq_empty(&pcpu->stall_queue)) {
809 first = smrq_serialized_first(&pcpu->stall_queue,
810 struct smr_tracker, smrt_link);
811 new_stall_seq = first->smrt_seq;
812 __builtin_assume(new_stall_seq != SMR_SEQ_INVALID);
813 assert(SMR_SEQ_CMP(pcpu->stall_rd_seq, <=, new_stall_seq));
814 }
815
816 os_atomic_store(&pcpu->stall_rd_seq, new_stall_seq, release);
817
818 progress = pcpu->stall_waiter_goal != SMR_SEQ_INVALID;
819 }
820
821 if (progress) {
822 struct turnstile *ts;
823
824 ts = turnstile_prepare((uintptr_t)pcpu, &pcpu->stall_ts,
825 TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
826
827 if (new_stall_seq == SMR_SEQ_INVALID ||
828 SMR_SEQ_CMP(pcpu->stall_waiter_goal, <=, new_stall_seq)) {
829 pcpu->stall_waiter_goal = SMR_SEQ_INVALID;
830 waitq_wakeup64_all(&ts->ts_waitq, CAST_EVENT64_T(pcpu),
831 THREAD_AWAKENED, WAITQ_UPDATE_INHERITOR);
832 } else {
833 turnstile_update_inheritor(ts, ctid_get_thread(first->smrt_ctid),
834 TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD);
835 }
836
837 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
838
839 turnstile_complete((uintptr_t)pcpu, &pcpu->stall_ts,
840 NULL, TURNSTILE_KERNEL_MUTEX);
841 }
842
843 /* reenables preemption disabled in smr_leave_sleepable() */
844 hw_lck_ticket_unlock(&pcpu->stall_lock);
845
846 turnstile_cleanup();
847 }
848
849 void
850 smr_leave_sleepable(smr_t smr, smr_tracker_t tracker)
851 {
852 struct smr_worker *smrw;
853 thread_t self = current_thread();
854
855 assert(tracker->smrt_seq != SMR_SEQ_INVALID);
856 assert(smr->smr_flags & SMR_SLEEPABLE);
857
858 lock_disable_preemption_for_thread(self);
859
860 lck_rw_lock_count_dec(self, smr);
861
862 if (__improbable(tracker->smrt_cpu != -1)) {
863 return __smr_leave_stalled(smr, tracker, self);
864 }
865
866 __smr_leave(__smr_pcpu(smr));
867
868 smrw = PERCPU_GET(smr_worker);
869 smrq_serialized_remove(&self->smr_stack, &tracker->smrt_stack);
870 smrq_serialized_remove(&smrw->sect_queue, &tracker->smrt_link);
871 bzero(tracker, sizeof(*tracker));
872
873 __smr_wake_oncore_sleepers(PERCPU_GET(smr_worker));
874
875 lock_enable_preemption();
876 }
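/*
 * Illustrative sleepable section (a sketch; `smr` must have SMR_SLEEPABLE):
 *
 *	struct smr_tracker trk;
 *
 *	smr_enter_sleepable(smr, &trk);
 *	// this section may block; if the thread does block, the scheduler
 *	// calls smr_mark_active_trackers_stalled() and the section moves
 *	// to the per-cpu stalled queue until smr_leave_sleepable().
 *	smr_leave_sleepable(smr, &trk);
 */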
877
878
879 #pragma mark SMR domains: advance, wait, poll, synchronize
880
881 static inline smr_seq_t
882 __smr_wr_advance(smr_t smr)
883 {
884 /* [W] */
885 return os_atomic_add(&smr->smr_clock.s_wr_seq, SMR_SEQ_INC, release);
886 }
887
888 static inline bool
889 __smr_rd_advance(smr_t smr, smr_seq_t goal, smr_seq_t rd_seq)
890 {
891 smr_seq_t o_seq;
892
893 os_atomic_thread_fence(seq_cst); /* [S3] */
894
895 os_atomic_rmw_loop(&smr->smr_clock.s_rd_seq, o_seq, rd_seq, relaxed, {
896 if (SMR_SEQ_CMP(rd_seq, <=, o_seq)) {
897 rd_seq = o_seq;
898 os_atomic_rmw_loop_give_up(break);
899 }
900 });
901
902 return SMR_SEQ_CMP(goal, <=, rd_seq);
903 }
904
905 __attribute__((noinline))
906 static smr_seq_t
907 __smr_wait_for_stalled(smr_pcpu_t pcpu, smr_seq_t goal)
908 {
909 struct turnstile *ts;
910 thread_t inheritor;
911 wait_result_t wr;
912 smr_seq_t stall_rd_seq;
913
914 hw_lck_ticket_lock(&pcpu->stall_lock, &smr_lock_grp);
915
916 stall_rd_seq = pcpu->stall_rd_seq;
917 if (stall_rd_seq == SMR_SEQ_INVALID ||
918 SMR_SEQ_CMP(goal, <=, stall_rd_seq)) {
919 hw_lck_ticket_unlock(&pcpu->stall_lock);
920 return stall_rd_seq;
921 }
922
923 if (pcpu->stall_waiter_goal == SMR_SEQ_INVALID ||
924 SMR_SEQ_CMP(goal, <, pcpu->stall_waiter_goal)) {
925 pcpu->stall_waiter_goal = goal;
926 }
927
928 inheritor = ctid_get_thread(smrq_serialized_first(&pcpu->stall_queue,
929 struct smr_tracker, smrt_link)->smrt_ctid);
930
931 ts = turnstile_prepare((uintptr_t)pcpu, &pcpu->stall_ts,
932 TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
933
934 turnstile_update_inheritor(ts, inheritor,
935 TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD);
936 wr = waitq_assert_wait64(&ts->ts_waitq, CAST_EVENT64_T(pcpu),
937 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
938 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
939
940 if (wr == THREAD_WAITING) {
941 hw_lck_ticket_unlock(&pcpu->stall_lock);
942 thread_block(THREAD_CONTINUE_NULL);
943 hw_lck_ticket_lock(&pcpu->stall_lock, &smr_lock_grp);
944 }
945
946 turnstile_complete((uintptr_t)pcpu, &pcpu->stall_ts,
947 NULL, TURNSTILE_KERNEL_MUTEX);
948
949 stall_rd_seq = pcpu->stall_rd_seq;
950 hw_lck_ticket_unlock(&pcpu->stall_lock);
951
952 turnstile_cleanup();
953
954 return stall_rd_seq;
955 }
956
957 __attribute__((noinline))
958 static smr_seq_t
959 __smr_wait_for_oncore(smr_pcpu_t pcpu, smr_seq_t goal, uint32_t cpu)
960 {
961 thread_t self = current_thread();
962 struct smr_worker *smrw;
963 uint64_t deadline = 0;
964 vm_offset_t base;
965 smr_seq_t rd_seq;
966
967 /*
968 * We are waiting for a currently active SMR section.
969 * Start spin-waiting for it for a bit.
970 */
971 for (;;) {
972 if (hw_spin_wait_until(&pcpu->c_rd_seq, rd_seq,
973 rd_seq == SMR_SEQ_INVALID || SMR_SEQ_CMP(goal, <=, rd_seq))) {
974 return rd_seq;
975 }
976
977 if (deadline == 0) {
978 clock_interval_to_deadline(smr_wait_spin_us,
979 NSEC_PER_USEC, &deadline);
980 } else if (mach_absolute_time() > deadline) {
981 break;
982 }
983 }
984
985 /*
986 * This section has been active for a while,
987 * we need to move to a more passive way of waiting.
988 *
989 * We post ourselves on the remote processor tracking head,
990 * to denote we need a thread_wakeup() when the tracker head clears,
991 * then send an IPI which will have 2 possible outcomes:
992 *
993 * 1. when smr_ack_ipi() runs, the queue is already cleared,
994 * and we will be woken up immediately.
995 *
996 * 2. when smr_ack_ipi() runs, the queue isn't cleared,
997 * then it does nothing, but there is a guarantee that
998 * when the queue clears, the remote core will observe
999 * that there is a waiter, and thread_wakeup() will be
1000 * called then.
1001 *
1002 * In order to avoid actually waiting, we spin some more,
1003 * hoping for the remote sequence to change.
1004 */
1005 base = other_percpu_base(cpu);
1006 smrw = PERCPU_GET_WITH_BASE(base, smr_worker);
1007
1008 waitq_assert_wait64(&smrw->waitq, __smrw_oncore_event(smrw),
1009 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
1010
1011 if (lock_cmpxchg(&smrw->sect_waiter, NULL, self, relaxed)) {
1012 /*
1013 * only really send the IPI if we're first,
1014 * to avoid IPI storms in case of a pile-up
1015 * of smr_synchronize() calls stalled on the same guy.
1016 */
1017 cause_ast_check(PERCPU_GET_WITH_BASE(base, processor));
1018 }
1019
1020 if (hw_spin_wait_until(&pcpu->c_rd_seq, rd_seq,
1021 rd_seq == SMR_SEQ_INVALID || SMR_SEQ_CMP(goal, <=, rd_seq))) {
1022 clear_wait(self, THREAD_AWAKENED);
1023 return rd_seq;
1024 }
1025
1026 thread_block(THREAD_CONTINUE_NULL);
1027
1028 return os_atomic_load(&pcpu->c_rd_seq, relaxed);
1029 }
1030
1031 __attribute__((noinline))
1032 static bool
1033 __smr_scan(smr_t smr, smr_seq_t goal, smr_clock_t clk, bool wait)
1034 {
1035 smr_delta_t delta;
1036 smr_seq_t rd_seq;
1037
1038 if (__improbable(startup_phase < STARTUP_SUB_EARLY_BOOT)) {
1039 return true;
1040 }
1041
1042 /*
1043 * Validate that the goal is sane.
1044 */
1045 delta = SMR_SEQ_DELTA(goal, clk.s_wr_seq);
1046 if (delta == SMR_SEQ_INC) {
1047 /*
1048 * This SMR clock uses deferred advance,
1049 * and the goal is one inc in the future.
1050 *
1051 * If we can wait, then commit the sequence number,
1052 * else we can't possibly succeed.
1053 *
1054 * Doing a commit here rather than an advance
1055 * gives the hardware a chance to abort the
1056 * transaction early in case of high contention
1057 * compared to an unconditional advance.
1058 */
1059 if (!wait) {
1060 return false;
1061 }
1062 if (lock_cmpxchgv(&smr->smr_clock.s_wr_seq,
1063 clk.s_wr_seq, goal, &clk.s_wr_seq, relaxed)) {
1064 clk.s_wr_seq = goal;
1065 }
1066 } else if (delta > 0) {
1067 /*
1068 * Invalid goal: the caller held on to it for too long,
1069 * and integers wrapped.
1070 */
1071 return true;
1072 }
1073
1074 os_atomic_thread_fence(seq_cst); /* [S2] */
1075
1076 /*
1077 * The read sequence can be no larger than the write sequence
1078 * at the start of the poll.
1079 *
1080 * We know that on entry:
1081 *
1082 * s_rd_seq < goal <= s_wr_seq
1083 *
1084 * The correctness of this algorithm relies on the fact that
1085 * the SMR domain [s_rd_seq, s_wr_seq) can't possibly move
1086 * by more than roughly (ULONG_MAX / 2) while __smr_scan()
1087 * is running, otherwise the "rd_seq" we try to scan for
1088 * might appear larger than s_rd_seq spuriously and we'd
1089 * __smr_rd_advance() incorrectly.
1090 *
1091 * This is guaranteed by the fact that this represents
1092 * advancing 2^62 times. At one advance every nanosecond,
1093 * it takes more than a century, which makes it possible
1094 * to call smr_wait() or smr_poll() with preemption enabled.
1095 */
1096 rd_seq = clk.s_wr_seq;
1097
1098 zpercpu_foreach_cpu(cpu) {
1099 smr_pcpu_t pcpu = __smr_pcpu(smr, cpu);
1100 smr_seq_t seq = os_atomic_load(&pcpu->c_rd_seq, relaxed);
1101
1102 while (seq != SMR_SEQ_INVALID) {
1103 /*
1104 * Resolve the race documented in __smr_enter().
1105 *
1106 * The CPU has loaded a stale s_wr_seq, and s_rd_seq
1107 * moved past this stale value.
1108 *
1109 * Its critical section is however properly serialized,
1110 * but we can't know what "correct" s_wr_seq it
1111 * could have observed. We have to assume `s_rd_seq`
1112 * to prevent it from advancing.
1113 */
1114 if (SMR_SEQ_CMP(seq, <, clk.s_rd_seq)) {
1115 seq = clk.s_rd_seq;
1116 }
1117
1118 if (!wait || SMR_SEQ_CMP(goal, <=, seq)) {
1119 seq &= ~SMR_SEQ_SLEEPABLE;
1120 break;
1121 }
1122
1123 if (seq & SMR_SEQ_SLEEPABLE) {
1124 seq = __smr_wait_for_oncore(pcpu, goal, cpu);
1125 } else {
1126 disable_preemption();
1127 seq = hw_wait_while_equals_long(&pcpu->c_rd_seq, seq);
1128 enable_preemption();
1129 }
1130 }
1131
1132 if (seq != SMR_SEQ_INVALID && SMR_SEQ_CMP(seq, <, rd_seq)) {
1133 rd_seq = seq;
1134 }
1135 }
1136
1137 if (smr->smr_flags & SMR_SLEEPABLE) {
1138 /*
1139 * Order observation of stalled sections,
1140 * see smr_mark_active_trackers_stalled().
1141 */
1142 os_atomic_thread_fence(seq_cst);
1143
1144 zpercpu_foreach_cpu(cpu) {
1145 smr_pcpu_t pcpu = __smr_pcpu(smr, cpu);
1146 smr_seq_t seq = os_atomic_load(&pcpu->stall_rd_seq, relaxed);
1147
1148 while (seq != SMR_SEQ_INVALID) {
1149 if (SMR_SEQ_CMP(seq, <, clk.s_rd_seq)) {
1150 seq = clk.s_rd_seq;
1151 }
1152
1153 if (!wait || SMR_SEQ_CMP(goal, <=, seq)) {
1154 seq &= ~SMR_SEQ_SLEEPABLE;
1155 break;
1156 }
1157
1158 seq = __smr_wait_for_stalled(pcpu, goal);
1159 }
1160
1161 if (seq != SMR_SEQ_INVALID && SMR_SEQ_CMP(seq, <, rd_seq)) {
1162 rd_seq = seq;
1163 }
1164 }
1165 }
1166
1167 /*
1168 * Advance the rd_seq as long as we observed a more recent value.
1169 */
1170 return __smr_rd_advance(smr, goal, rd_seq);
1171 }
1172
1173 static inline bool
1174 __smr_poll(smr_t smr, smr_seq_t goal, bool wait)
1175 {
1176 smr_clock_t clk;
1177
1178 /*
1179 * Load both the s_rd_seq and s_wr_seq in the right order so that we
1180 * can't observe a s_rd_seq older than s_wr_seq.
1181 */
1182
1183 /* [S1] */
1184 clk.s_rd_seq = os_atomic_load(&smr->smr_clock.s_rd_seq, acquire);
1185
1186 /*
1187 * We expect this to be typical: the goal has already been observed.
1188 */
1189 if (__probable(SMR_SEQ_CMP(goal, <=, clk.s_rd_seq))) {
1190 return true;
1191 }
1192
1193 clk.s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
1194
1195 return __smr_scan(smr, goal, clk, wait);
1196 }
1197
1198 smr_seq_t
1199 smr_advance(smr_t smr)
1200 {
1201 smr_clock_t clk;
1202
1203 assert(!smr_entered(smr));
1204
1205 /*
1206 * We assume that there will at least be a successful __smr_poll
1207 * call every 2^60 calls to smr_advance() or so, so we do not need
1208 * to check if [s_rd_seq, s_wr_seq) is growing too wide.
1209 */
1210 static_assert(sizeof(clk.s_wr_seq) == 8);
1211 return __smr_wr_advance(smr);
1212 }
1213
1214 smr_seq_t
1215 smr_deferred_advance(smr_t smr)
1216 {
1217 os_atomic_thread_fence(seq_cst);
1218 return SMR_SEQ_INC + os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
1219 }
1220
1221 void
1222 smr_deferred_advance_commit(smr_t smr, smr_seq_t seq)
1223 {
1224 /*
1225 * no barrier needed: smr_deferred_advance() had one already.
1226 * no failure handling: it means someone updated the clock already!
1227 * lock_cmpxchg: so that we pre-test for architectures needing it.
1228 */
1229 assert(seq != SMR_SEQ_INVALID);
1230 lock_cmpxchg(&smr->smr_clock.s_wr_seq, seq - SMR_SEQ_INC, seq, relaxed);
1231 }
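/*
 * Illustrative deferred-advance pattern (a sketch, not from a real caller):
 *
 *	seq = smr_deferred_advance(smr);        // fence only, clock unchanged
 *	// ... tag retired object(s) with `seq` ...
 *	smr_deferred_advance_commit(smr, seq);  // or let __smr_scan() commit it
 *	...
 *	if (smr_poll(smr, seq)) {
 *		// `seq` is in the past: tagged objects are safe to reclaim
 *	}
 */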
1232
1233 bool
1234 smr_poll(smr_t smr, smr_seq_t goal)
1235 {
1236 assert(!smr_entered(smr) && goal != SMR_SEQ_INVALID);
1237 return __smr_poll(smr, goal, false);
1238 }
1239
1240 void
1241 smr_wait(smr_t smr, smr_seq_t goal)
1242 {
1243 assert(!smr_entered(smr) && goal != SMR_SEQ_INVALID);
1244 if (smr->smr_flags & SMR_SLEEPABLE) {
1245 assert(get_preemption_level() == 0);
1246 }
1247 (void)__smr_poll(smr, goal, true);
1248 }
1249
1250 void
1251 smr_synchronize(smr_t smr)
1252 {
1253 smr_clock_t clk;
1254
1255 assert(!smr_entered(smr));
1256 if (smr->smr_flags & SMR_SLEEPABLE) {
1257 assert(get_preemption_level() == 0);
1258 }
1259
1260 /*
1261 * Similar to __smr_poll() but also does a deferred advance which
1262 * __smr_scan will commit.
1263 */
1264
1265 clk.s_rd_seq = os_atomic_load(&smr->smr_clock.s_rd_seq, relaxed);
1266 os_atomic_thread_fence(seq_cst);
1267 clk.s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
1268
1269 (void)__smr_scan(smr, clk.s_wr_seq + SMR_SEQ_INC, clk, true);
1270 }
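/*
 * Illustrative split advance/wait usage (a sketch): a caller retiring
 * several objects under a single grace period can do
 *
 *	smr_seq_t seq = smr_advance(smr);
 *	// tag obj1, obj2, ... with `seq`
 *	smr_wait(smr, seq);     // or smr_poll(smr, seq) from a later GC pass
 *	// reclaim every object tagged with a sequence <= seq
 */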
1271
1272
1273 #pragma mark SMR domains: smr_call & smr_barrier
1274
1275 /*!
1276 * @struct smr_barrier_ctx
1277 *
1278 * @brief
1279 * Data structure to track the completion of an smr_barrier() call.
1280 */
1281 struct smr_barrier_ctx {
1282 struct smr *smrb_domain;
1283 struct thread *smrb_waiter;
1284 uint32_t smrb_pending;
1285 uint32_t smrb_count;
1286 };
1287
1288 /*!
1289 * @struct smr_barrier_job
1290 *
1291 * @brief
1292 * Data structure used to track completion of smr_barrier() calls.
1293 */
1294 struct smr_barrier_job {
1295 struct smr_barrier_ctx *smrj_context;
1296 union {
1297 struct smr_node smrj_node;
1298 struct mpsc_queue_chain smrj_link;
1299 };
1300 };
1301
1302 #define SMR_BARRIER_SIZE 24
1303 static_assert(sizeof(struct smr_barrier_job) == SMR_BARRIER_SIZE);
1304 #define SMR_BARRIER_USE_STACK (SMR_BARRIER_SIZE * MAX_CPUS <= 512)
1305
1306 static void
1307 __smr_worker_check_invariants(struct smr_worker *smrw)
1308 {
1309 #if MACH_ASSERT
1310 smr_pcpu_t pcpu = smrw->whead;
1311 uint16_t num = (uint16_t)cpu_number();
1312
1313 assert(!ml_get_interrupts_enabled() || get_preemption_level());
1314
1315 for (; pcpu != *smrw->wold_tail; pcpu = pcpu->drain_next) {
1316 assertf(pcpu->qold_seq != SMR_SEQ_INVALID &&
1317 __smr_pcpu_queued(pcpu),
1318 "pcpu %p doesn't belong on %p old queue", pcpu, smrw);
1319 pcpu->__check_cpu = num;
1320 pcpu->__check_reason = (uint8_t)smrw->detach_reason;
1321 pcpu->__check_list = 1;
1322 }
1323
1324 for (; pcpu != *smrw->wage_tail; pcpu = pcpu->drain_next) {
1325 __assert_only smr_t smr = pcpu->drain_smr;
1326
1327 assertf(pcpu->qold_seq == SMR_SEQ_INVALID &&
1328 pcpu->qage_seq != SMR_SEQ_INVALID &&
1329 SMR_SEQ_CMP(pcpu->qage_seq, <=, smr->smr_clock.s_wr_seq) &&
1330 __smr_pcpu_queued(pcpu),
1331 "pcpu %p doesn't belong on %p aging queue", pcpu, smrw);
1332 pcpu->__check_cpu = num;
1333 pcpu->__check_reason = (uint8_t)smrw->detach_reason;
1334 pcpu->__check_list = 2;
1335 }
1336
1337 for (; pcpu != *smrw->wcur_tail; pcpu = pcpu->drain_next) {
1338 assertf(pcpu->qold_seq == SMR_SEQ_INVALID &&
1339 pcpu->qage_seq != SMR_SEQ_INVALID &&
1340 __smr_pcpu_queued(pcpu),
1341 "pcpu %p doesn't belong on %p current queue", pcpu, smrw);
1342 pcpu->__check_cpu = num;
1343 pcpu->__check_reason = (uint8_t)smrw->detach_reason;
1344 pcpu->__check_list = 3;
1345 }
1346
1347 assert(pcpu == NULL);
1348 #else
1349 (void)smrw;
1350 #endif
1351 }
1352
1353 __attribute__((noinline))
1354 static void
1355 __smr_cpu_lazy_up(struct smr_worker *smrw)
1356 {
1357 spl_t spl;
1358
1359 /*
1360 * calling smr_call/smr_barrier() from the context of a CPU
1361 * with a detached worker is illegal.
1362 *
1363 * However, bound threads might run on a derecommended (IGNORED)
1364 * cpu which we correct for here (and the CPU will go back to IGNORED
1365 * in smr_cpu_leave()).
1366 */
1367 assert(smrw->detach_reason == SMR_CPU_REASON_IGNORED);
1368
1369 spl = splsched();
1370 __smrw_lock(smrw);
1371 smrw->detach_reason &= ~SMR_CPU_REASON_IGNORED;
1372 __smrw_unlock(smrw);
1373 splx(spl);
1374 }
1375
1376 static void
1377 __smr_cpu_lazy_up_if_needed(struct smr_worker *smrw)
1378 {
1379 if (__improbable(smrw->detach_reason != SMR_CPU_REASON_NONE)) {
1380 __smr_cpu_lazy_up(smrw);
1381 }
1382 }
1383
1384 static bool
1385 __smr_call_should_advance(smr_pcpu_t pcpu)
1386 {
1387 if (pcpu->qcur_cnt > smr_call_cnt_cap) {
1388 return true;
1389 }
1390 if (pcpu->qcur_size > smr_call_size_cap) {
1391 return true;
1392 }
1393 return false;
1394 }
1395
1396 static void
1397 __smr_call_advance_qcur(smr_t smr, smr_pcpu_t pcpu, bool needs_commit)
1398 {
1399 smr_seq_t new_seq;
1400
1401 if (needs_commit || pcpu->qage_seq) {
1402 new_seq = smr_advance(smr);
1403 } else {
1404 new_seq = smr_deferred_advance(smr);
1405 }
1406 __builtin_assume(new_seq != SMR_SEQ_INVALID);
1407
1408 pcpu->qage_seq = new_seq;
1409 pcpu->qage_tail = pcpu->qcur_tail;
1410
1411 pcpu->qcur_size = 0;
1412 pcpu->qcur_cnt = 0;
1413 }
1414
1415 static void
1416 __smr_call_push(smr_pcpu_t pcpu, smr_node_t node, smr_cb_t cb)
1417 {
1418 assert(pcpu->c_rd_seq == SMR_SEQ_INVALID);
1419
1420 node->smrn_next = NULL;
1421 node->smrn_cb = cb;
1422
1423 *pcpu->qcur_tail = node;
1424 pcpu->qcur_tail = &node->smrn_next;
1425 pcpu->qcur_cnt += 1;
1426 }
1427
1428 static void
1429 __smr_call_dispatch(struct smr_worker *smrw, smr_pcpu_t pcpu)
1430 {
1431 __smr_worker_check_invariants(smrw);
1432
1433 if (!__smr_pcpu_queued(pcpu)) {
1434 assert(pcpu->qold_seq == SMR_SEQ_INVALID);
1435 assert(pcpu->qage_seq != SMR_SEQ_INVALID);
1436
1437 pcpu->drain_next = NULL;
1438 *smrw->wcur_tail = pcpu;
1439 smrw->wcur_tail = &pcpu->drain_next;
1440 }
1441 }
1442
1443 void
1444 smr_call(smr_t smr, smr_node_t node, vm_size_t size, smr_cb_t cb)
1445 {
1446 struct smr_worker *smrw;
1447 smr_pcpu_t pcpu;
1448
1449 if (__improbable(startup_phase < STARTUP_SUB_EARLY_BOOT)) {
1450 return cb(node);
1451 }
1452
1453 lock_disable_preemption_for_thread(current_thread());
1454
1455 smrw = PERCPU_GET(smr_worker);
1456 __smr_cpu_lazy_up_if_needed(smrw);
1457
1458 pcpu = __smr_pcpu(smr);
1459 assert(pcpu->c_rd_seq == SMR_SEQ_INVALID);
1460
1461 if (os_add_overflow(pcpu->qcur_size, size, &pcpu->qcur_size)) {
1462 pcpu->qcur_size = UINT32_MAX;
1463 }
1464
1465 __smr_call_push(pcpu, node, cb);
1466 if (__smr_call_should_advance(pcpu)) {
1467 if (pcpu->qage_seq == SMR_SEQ_INVALID) {
1468 __smr_call_advance_qcur(smr, pcpu, false);
1469 }
1470 __smr_call_dispatch(smrw, pcpu);
1471 }
1472
1473 return lock_enable_preemption();
1474 }
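/*
 * Illustrative smr_call() usage (a sketch; `foo` and `foo_free_cb` are the
 * hypothetical types from the callback sketch near __smr_call_drain()):
 *
 *	// after unlinking `f` so that no new reader can find it:
 *	smr_call(smr, &f->foo_node, sizeof(*f), foo_free_cb);
 *	// foo_free_cb(&f->foo_node) runs once every section active at this
 *	// point has left; `size` only feeds the qcur_size flush heuristic.
 */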
1475
1476 static inline event_t
1477 __smrb_event(struct smr_barrier_ctx *ctx)
1478 {
1479 return ctx;
1480 }
1481
1482 static void
1483 __smr_barrier_cb(struct smr_node *node)
1484 {
1485 struct smr_barrier_job *job;
1486 struct smr_barrier_ctx *ctx;
1487
1488 job = __container_of(node, struct smr_barrier_job, smrj_node);
1489 ctx = job->smrj_context;
1490
1491 if (os_atomic_dec(&ctx->smrb_pending, relaxed) == 0) {
1492 /*
1493 * It is permitted to still reach into the context
1494 * because smr_barrier() always blocks, which means
1495 * that the context will be valid until this wakeup
1496 * happens.
1497 */
1498 thread_wakeup_thread(__smrb_event(ctx), ctx->smrb_waiter);
1499 }
1500 }
1501
1502 static bool
1503 __smr_barrier_drain(struct smr_worker *smrw, bool needs_commit)
1504 {
1505 mpsc_queue_chain_t head, tail, it;
1506
1507 head = mpsc_queue_dequeue_batch(&smrw->barrier_queue, &tail,
1508 OS_ATOMIC_DEPENDENCY_NONE);
1509
1510 mpsc_queue_batch_foreach_safe(it, head, tail) {
1511 struct smr_barrier_job *job;
1512 struct smr_barrier_ctx *ctx;
1513 smr_pcpu_t pcpu;
1514 smr_t smr;
1515
1516 job = __container_of(it, struct smr_barrier_job, smrj_link);
1517 ctx = job->smrj_context;
1518 smr = ctx->smrb_domain;
1519 pcpu = __smr_pcpu(smr, smrw->processor->cpu_id);
1520
1521 pcpu->qcur_size = UINT32_MAX;
1522 __smr_call_push(pcpu, &job->smrj_node, __smr_barrier_cb);
1523 __smr_call_advance_qcur(smr, pcpu, needs_commit);
1524 __smr_call_dispatch(smrw, pcpu);
1525 }
1526
1527 return head != NULL;
1528 }
1529
1530
1531 void
1532 smr_barrier(smr_t smr)
1533 {
1534 #if SMR_BARRIER_USE_STACK
1535 struct smr_barrier_job jobs[MAX_CPUS];
1536 #else
1537 struct smr_barrier_job *jobs;
1538 #endif
1539 struct smr_barrier_job *job;
1540 struct smr_barrier_ctx ctx = {
1541 .smrb_domain = smr,
1542 .smrb_waiter = current_thread(),
1543 .smrb_pending = zpercpu_count(),
1544 .smrb_count = zpercpu_count(),
1545 };
1546 spl_t spl;
1547
1548 /*
1549 * First wait for all readers to observe whatever it is
1550 * that changed prior to this call.
1551 *
1552 * _then_ enqueue callbacks that push out anything ahead.
1553 */
1554 smr_synchronize(smr);
1555
1556 #if !SMR_BARRIER_USE_STACK
1557 jobs = kalloc_type(struct smr_barrier_job, ctx.smrb_count,
1558 Z_WAITOK | Z_ZERO | Z_NOFAIL);
1559 #endif
1560 job = jobs;
1561 spl = splsched();
1562
1563 __smr_cpu_lazy_up_if_needed(PERCPU_GET(smr_worker));
1564
1565 percpu_foreach(smrw, smr_worker) {
1566 job->smrj_context = &ctx;
1567 if (mpsc_queue_append(&smrw->barrier_queue, &job->smrj_link)) {
1568 __smrw_lock(smrw);
1569 __smrw_wakeup_and_unlock(smrw);
1570 }
1571 job++;
1572 }
1573
1574 /*
1575 * Because we disabled interrupts, our own CPU's callback
1576 * can't possibly have run, so just block.
1577 *
1578 * We must block in order to guarantee the lifetime of "ctx".
1579 * (See comment in __smr_barrier_cb).
1580 */
1581 assert_wait(__smrb_event(&ctx), THREAD_UNINT);
1582 assert(ctx.smrb_pending > 0);
1583 splx(spl);
1584 thread_block(THREAD_CONTINUE_NULL);
1585
1586 #if !SMR_BARRIER_USE_STACK
1587 kfree_type(struct smr_barrier_job, ctx.smrb_count, jobs);
1588 #endif
1589 }
1590
1591
1592 #pragma mark SMR domains: smr_worker
1593
1594 static void
1595 __smr_worker_drain_lock(struct smr_worker *smrw)
1596 {
1597 for (;;) {
1598 ml_set_interrupts_enabled(false);
1599 __smrw_lock(smrw);
1600
1601 /*
1602 * Check we are on an appropriate processor
1603 *
1604 * Note that we might be running on the canonical
1605 * processor incorrectly: if the processor has been
1606 * de-recommended but isn't offline.
1607 */
1608 if (__probable(current_processor() == smrw->processor)) {
1609 if (__probable(!smrw->detach_reason)) {
1610 break;
1611 }
1612 } else {
1613 if (__probable(smrw->detach_reason)) {
1614 break;
1615 }
1616 }
1617
1618 /* go bind in the right place and retry */
1619 thread_bind(__smrw_drain_bind_target(smrw));
1620 __smrw_unlock(smrw);
1621 ml_set_interrupts_enabled(true);
1622 thread_block(THREAD_CONTINUE_NULL);
1623 }
1624 }
1625
1626 static void
1627 __smr_worker_drain_unlock(struct smr_worker *smrw)
1628 {
1629 __smrw_unlock(smrw);
1630 ml_set_interrupts_enabled(true);
1631 }
1632
1633 /*!
1634 * @function __smr_worker_tick
1635 *
1636 * @brief
1637 * Make the SMR worker queues make gentle progress
1638 *
1639 * @discussion
1640 * One round of progress will:
1641 * - move entries that have aged as being old,
1642 * - commit entries that have a deferred sequence and let them age.
1643 *
1644 * If this results in any callbacks becoming "old",
1645 * then the worker is woken up to start running callbacks.
1646 *
1647 * This function must run either on the processor for this worker,
1648 * or with the worker drain lock held.
1649 */
1650 static void
1651 __smr_worker_tick(struct smr_worker *smrw, uint64_t ctime, bool wakeup)
1652 {
1653 smr_pcpu_t pcpu = *smrw->wold_tail;
1654
1655 __smr_worker_check_invariants(smrw);
1656
1657 for (; pcpu != *smrw->wage_tail; pcpu = pcpu->drain_next) {
1658 assert(pcpu->qold_seq == SMR_SEQ_INVALID);
1659 assert(pcpu->qage_seq != SMR_SEQ_INVALID);
1660
1661 pcpu->qold_seq = pcpu->qage_seq;
1662 pcpu->qold_tail = pcpu->qage_tail;
1663
1664 pcpu->qage_seq = SMR_SEQ_INVALID;
1665 }
1666
1667 for (; pcpu; pcpu = pcpu->drain_next) {
1668 assert(pcpu->qold_seq == SMR_SEQ_INVALID);
1669 assert(pcpu->qage_seq != SMR_SEQ_INVALID);
1670
1671 smr_deferred_advance_commit(pcpu->drain_smr, pcpu->qage_seq);
1672 }
1673
1674 smrw->wold_tail = smrw->wage_tail;
1675 smrw->wage_tail = smrw->wcur_tail;
1676 smrw->drain_ctime = ctime;
1677
1678 __smr_worker_check_invariants(smrw);
1679
1680 if (wakeup && smrw->wold_tail != &smrw->whead) {
1681 __smrw_lock(smrw);
1682 __smrw_wakeup_and_unlock(smrw);
1683 }
1684 }
1685
1686 static void
1687 __smr_worker_update_wold_tail(struct smr_worker *smrw, smr_pcpu_t *new_tail)
1688 {
1689 smr_pcpu_t *old_tail = smrw->wold_tail;
1690
1691 if (smrw->wcur_tail == old_tail) {
1692 smrw->wage_tail = new_tail;
1693 smrw->wcur_tail = new_tail;
1694 } else if (smrw->wage_tail == old_tail) {
1695 smrw->wage_tail = new_tail;
1696 }
1697
1698 smrw->wold_tail = new_tail;
1699 }
1700
1701 static void
1702 __smr_worker_drain_one(struct smr_worker *smrw, smr_pcpu_t pcpu)
1703 {
1704 smr_t smr = pcpu->drain_smr;
1705 smr_seq_t seq = pcpu->qold_seq;
1706 smr_node_t head;
1707
1708 /*
1709 * Step 1: pop the "old" items,
1710 * (qold_tail/qold_seq left dangling)
1711 */
1712
1713 assert(seq != SMR_SEQ_INVALID);
1714 head = pcpu->qhead;
1715 pcpu->qhead = *pcpu->qold_tail;
1716 *pcpu->qold_tail = NULL;
1717
1718 /*
1719 * Step 2: Reconstruct the queue
1720 * based on the sequence numbers and count fields.
1721 *
1722 * Do what __smr_worker_tick() would do on this queue:
1723 * - commit the aging queue
1724 * - advance the current queue if needed
1725 */
1726
1727 if (pcpu->qage_seq != SMR_SEQ_INVALID) {
1728 assert(pcpu->qage_tail != pcpu->qold_tail);
1729
1730 smr_deferred_advance_commit(smr, pcpu->qage_seq);
1731 pcpu->qold_seq = pcpu->qage_seq;
1732 pcpu->qold_tail = pcpu->qage_tail;
1733 } else {
1734 assert(pcpu->qage_tail == pcpu->qold_tail);
1735
1736 pcpu->qold_seq = SMR_SEQ_INVALID;
1737 pcpu->qold_tail = &pcpu->qhead;
1738 }
1739
1740 if (__smr_call_should_advance(pcpu)) {
1741 __smr_call_advance_qcur(smr, pcpu, false);
1742 } else {
1743 pcpu->qage_seq = SMR_SEQ_INVALID;
1744 pcpu->qage_tail = pcpu->qold_tail;
1745 if (pcpu->qcur_cnt == 0) {
1746 pcpu->qcur_tail = pcpu->qage_tail;
1747 }
1748 }
1749
1750 if (pcpu->qold_seq != SMR_SEQ_INVALID) {
1751 /*
1752 * The node has gained an "old seq" back,
1753 * it goes to the ready queue.
1754 */
1755 pcpu->drain_next = *smrw->wold_tail;
1756 *smrw->wold_tail = pcpu;
1757 __smr_worker_update_wold_tail(smrw,
1758 &pcpu->drain_next);
1759 } else if (pcpu->qage_seq != SMR_SEQ_INVALID) {
1760 /*
1761 * The node has gained an "age seq" back,
1762 * it needs to age and wait for a tick
1763 * for its sequence number to be committed.
1764 */
1765 pcpu->drain_next = NULL;
1766 *smrw->wcur_tail = pcpu;
1767 smrw->wcur_tail = &pcpu->drain_next;
1768 } else {
1769 /*
1770 * The node is empty or with "current"
1771 * callbacks only, it can be dequeued.
1772 */
1773 assert(!__smr_call_should_advance(pcpu));
1774 pcpu->__check_cpu = (uint16_t)cpu_number();
1775 pcpu->__check_reason = (uint8_t)smrw->detach_reason;
1776 pcpu->__check_list = 0;
1777 __smr_pcpu_set_not_queued(pcpu);
1778 }
1779
1780 /*
1781 * Step 3: drain callbacks.
1782 */
1783 __smr_worker_check_invariants(smrw);
1784 __smr_worker_drain_unlock(smrw);
1785
1786 __smr_poll(smr, seq, true);
1787 __smr_call_drain(head);
1788
1789 __smr_worker_drain_lock(smrw);
1790 }
1791
1792 static void
1793 __smr_worker_continue(void *arg, wait_result_t wr __unused)
1794 {
1795 smr_pcpu_t pcpu = NULL, next = NULL;
1796 struct smr_worker *const smrw = arg;
1797 uint64_t deadline;
1798
1799 __smr_worker_drain_lock(smrw);
1800 __smr_worker_check_invariants(smrw);
1801
1802 if (smrw->wold_tail != &smrw->whead) {
1803 next = smrw->whead;
1804 smrw->whead = *smrw->wold_tail;
1805 *smrw->wold_tail = NULL;
1806 __smr_worker_update_wold_tail(smrw, &smrw->whead);
1807 }
1808
1809 /*
1810 * The pipeline of per-cpu SMR data structures with pending
1811 * smr_call() callbacks has three stages: wcur -> wage -> wold.
1812 *
1813 * In order to guarantee forward progress, a tick happens
1814 * for each of them, either via __smr_worker_tick(),
1815 * or via __smr_worker_drain_one().
1816 *
1817 * The second tick will happen either because the core stayed
1818 * busy enough that a subsequent smr_cpu_tick() decided to
1819 * perform it, or because the CPU idled, and smr_cpu_leave()
1820 * will perform an unconditional __smr_worker_tick().
1821 */
1822 __smr_barrier_drain(smrw, false);
1823 __smr_worker_tick(smrw, mach_absolute_time(), false);
1824
1825 while ((pcpu = next)) {
1826 next = next->drain_next;
1827 __smr_worker_drain_one(smrw, pcpu);
1828 }
1829
1830 if (__improbable(smrw->whead && smrw->detach_reason)) {
1831 /*
1832 * If the thread isn't bound, we want to flush anything
1833 * that is pending without causing too much contention.
1834 *
1835 * Sleep for a bit in order to give the system time
1836 * to observe any advance commits we did.
1837 */
1838 deadline = mach_absolute_time() + cpu_checkin_min_interval;
1839 } else {
1840 deadline = TIMEOUT_WAIT_FOREVER;
1841 }
1842 waitq_assert_wait64_locked(&smrw->waitq, __smrw_drain_event(smrw),
1843 THREAD_UNINT, TIMEOUT_URGENCY_SYS_NORMAL, deadline,
1844 TIMEOUT_NO_LEEWAY, smrw->thread);
1845
1846 /*
1847 * Make sure there's no barrier left, after we called assert_wait()
1848 * in order to pair with __smr_barrier_cb(). If we do find some,
1849 * we must be careful about invariants and forward progress.
1850 *
1851 * For affected domains, the dequeued barriers have been added
1852 * to their "qage" queue. If their "qage" queue was non empty,
1853 * then its "qage_seq" was already commited, and we must preserve
1854 * this invariant.
1855 *
1856 * Affected domains that were idle before will get enqueued on this
1857 * worker's "wcur" queue. In order to guarantee forward progress,
1858 * we must force a tick if both the "wage" and "wold" queues
1859 * of the worker are empty.
1860 */
1861 if (__improbable(__smr_barrier_drain(smrw, true))) {
1862 if (smrw->wage_tail == &smrw->whead) {
1863 __smr_worker_tick(smrw, mach_absolute_time(), false);
1864 }
1865 }
1866
1867 __smr_worker_check_invariants(smrw);
1868 __smr_worker_drain_unlock(smrw);
1869
1870 thread_block_parameter(__smr_worker_continue, smrw);
1871 }
1872
1873
1874 #pragma mark SMR domains: scheduler integration
1875
1876 #if CONFIG_QUIESCE_COUNTER
1877 __startup_data
1878 static uint64_t _Atomic quiesce_gen_startup;
1879 static uint64_t _Atomic *quiesce_genp = &quiesce_gen_startup;
1880 static uint64_t _Atomic quiesce_ctime;
1881
1882 void
1883 cpu_quiescent_set_storage(uint64_t _Atomic *ptr)
1884 {
1885 /*
1886 * Transfer to the real location for the commpage.
1887 *
1888 * This is safe to do this way because the system
1889 * is still single-threaded.
1890 */
1891 uint64_t gen = os_atomic_load(&quiesce_gen_startup, relaxed);
1892
1893 os_atomic_store(ptr, gen, relaxed);
1894 quiesce_genp = ptr;
1895 }
1896
1897 static smr_seq_t
1898 cpu_quiescent_gen_to_seq(uint64_t gen)
1899 {
1900 return gen * SMR_SEQ_INC + SMR_SEQ_INIT;
1901 }
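/*
 * For example (directly from the formula above): generation 0 maps to
 * SMR_SEQ_INIT, generation 1 to SMR_SEQ_INIT + SMR_SEQ_INC, and so on,
 * so consecutive generations correspond to consecutive valid SMR
 * sequence numbers.
 */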
1902
1903 static void
1904 cpu_quiescent_advance(uint64_t gen, uint64_t ctime __kdebug_only)
1905 {
1906 smr_seq_t seq = cpu_quiescent_gen_to_seq(gen);
1907
1908 os_atomic_thread_fence(seq_cst);
1909
1910 percpu_foreach(it, smr_worker) {
1911 smr_seq_t rd_seq = os_atomic_load(&it->rd_quiesce_seq, relaxed);
1912
1913 if (rd_seq != SMR_SEQ_INVALID && SMR_SEQ_CMP(rd_seq, <, seq)) {
1914 return;
1915 }
1916 }
1917
1918 os_atomic_thread_fence(seq_cst);
1919
1920 if (lock_cmpxchg(quiesce_genp, gen, gen + 1, relaxed)) {
1921 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUIESCENT_COUNTER),
1922 gen, 0, ctime, 0);
1923 }
1924 }
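/*
 * Illustrative scenario (hypothetical numbers): with gen == 41, the loop
 * above only lets the generation advance to 42 once every smr_worker has
 * either rd_quiesce_seq == SMR_SEQ_INVALID (its CPU is idle) or a value
 * at or past cpu_quiescent_gen_to_seq(41); a single lagging CPU makes
 * this function return without advancing.
 */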
1925
1926 static void
1927 cpu_quiescent_join(struct smr_worker *smrw)
1928 {
1929 uint64_t gen = os_atomic_load(quiesce_genp, relaxed);
1930
1931 assert(smrw->rd_quiesce_seq == SMR_SEQ_INVALID);
1932 os_atomic_store(&smrw->rd_quiesce_seq,
1933 cpu_quiescent_gen_to_seq(gen), relaxed);
1934 os_atomic_thread_fence(seq_cst);
1935 }
1936
1937 static void
1938 cpu_quiescent_tick(struct smr_worker *smrw, uint64_t ctime, uint64_t interval)
1939 {
1940 uint64_t gen = os_atomic_load(quiesce_genp, relaxed);
1941 smr_seq_t seq = cpu_quiescent_gen_to_seq(gen);
1942
1943 if (smrw->rd_quiesce_seq == SMR_SEQ_INVALID) {
1944 /*
1945 * Likely called because of the scheduler tick,
1946 * smr_maintenance() will do the right thing.
1947 */
1948 assert(current_processor()->state != PROCESSOR_RUNNING);
1949 } else if (seq != smrw->rd_quiesce_seq) {
1950 /*
1951 * Someone managed to update the sequence already,
1952 * learn it, update our ctime.
1953 */
1954 os_atomic_store(&smrw->rd_quiesce_seq, seq, release);
1955 os_atomic_store(&quiesce_ctime, ctime, relaxed);
1956 os_atomic_thread_fence(seq_cst);
1957 } else if ((ctime - os_atomic_load(&quiesce_ctime, relaxed)) > interval) {
1958 /*
1959 * The system looks busy enough that we want to update
1960 * the counter faster than every scheduler tick.
1961 */
1962 os_atomic_store(&quiesce_ctime, ctime, relaxed);
1963 cpu_quiescent_advance(gen, ctime);
1964 }
1965 }
1966
1967 static void
1968 cpu_quiescent_leave(struct smr_worker *smrw)
1969 {
1970 assert(smrw->rd_quiesce_seq != SMR_SEQ_INVALID);
1971 os_atomic_store(&smrw->rd_quiesce_seq, SMR_SEQ_INVALID, release);
1972 }
1973 #endif /* CONFIG_QUIESCE_COUNTER */
1974
1975 uint32_t
1976 smr_cpu_checkin_get_min_interval_us(void)
1977 {
1978 return cpu_checkin_min_interval_us;
1979 }
1980
1981 void
1982 smr_cpu_checkin_set_min_interval_us(uint32_t new_value_us)
1983 {
1984 /* clamp to something vaguely sane */
1985 if (new_value_us > CPU_CHECKIN_MIN_INTERVAL_MAX_US) {
1986 new_value_us = CPU_CHECKIN_MIN_INTERVAL_MAX_US;
1987 }
1988
1989 cpu_checkin_min_interval_us = new_value_us;
1990
1991 uint64_t abstime = 0;
1992 clock_interval_to_absolutetime_interval(cpu_checkin_min_interval_us,
1993 NSEC_PER_USEC, &abstime);
1994 cpu_checkin_min_interval = abstime;
1995 }
1996
1997 __startup_func
1998 static void
1999 smr_cpu_checkin_init_min_interval_us(void)
2000 {
2001 smr_cpu_checkin_set_min_interval_us(CPU_CHECKIN_MIN_INTERVAL_US);
2002 }
2003 STARTUP(TUNABLES, STARTUP_RANK_FIRST, smr_cpu_checkin_init_min_interval_us);
2004
2005 static void
2006 __smr_cpu_init_thread(struct smr_worker *smrw)
2007 {
2008 char name[MAXTHREADNAMESIZE];
2009 thread_t th = THREAD_NULL;
2010
2011 kernel_thread_create(__smr_worker_continue, smrw, MINPRI_KERNEL, &th);
2012 smrw->thread = th;
2013
2014 snprintf(name, sizeof(name), "smr.reclaim:%d", smrw->processor->cpu_id);
2015 thread_set_thread_name(th, name);
2016 thread_start_in_assert_wait(th,
2017 &smrw->waitq, __smrw_drain_event(smrw), THREAD_UNINT);
2018 }
2019
2020 void
2021 smr_cpu_init(struct processor *processor)
2022 {
2023 struct smr_worker *smrw;
2024
2025 smrw = PERCPU_GET_RELATIVE(smr_worker, processor, processor);
2026 smrw->processor = processor;
2027
2028 waitq_init(&smrw->waitq, WQT_QUEUE, SYNC_POLICY_FIFO);
2029 smrw->detach_reason = SMR_CPU_REASON_OFFLINE;
2030
2031 smrq_init(&smrw->sect_queue);
2032 smrw->wold_tail = &smrw->whead;
2033 smrw->wage_tail = &smrw->whead;
2034 smrw->wcur_tail = &smrw->whead;
2035 mpsc_queue_init(&smrw->barrier_queue);
2036
2037 if (processor != master_processor) {
2038 __smr_cpu_init_thread(smrw);
2039 }
2040 }
2041 STARTUP_ARG(LOCKS, STARTUP_RANK_LAST, smr_cpu_init, master_processor);
2042 STARTUP_ARG(THREAD_CALL, STARTUP_RANK_LAST,
2043 __smr_cpu_init_thread, PERCPU_GET_MASTER(smr_worker));
2044
2045 /*!
2046 * @function smr_cpu_up()
2047 *
2048 * @brief
2049 * Scheduler callback to notify that this processor is going up.
2050 *
2051 * @discussion
2052 * Called at splsched() under the sched_available_cores_lock.
2053 */
2054 void
2055 smr_cpu_up(struct processor *processor, smr_cpu_reason_t reason)
2056 {
2057 struct smr_worker *smrw;
2058
2059 smrw = PERCPU_GET_RELATIVE(smr_worker, processor, processor);
2060
2061 __smrw_lock(smrw);
2062 if (reason != SMR_CPU_REASON_IGNORED) {
2063 assert((smrw->detach_reason & reason) == reason);
2064 }
2065 smrw->detach_reason &= ~reason;
2066 __smrw_unlock(smrw);
2067 }
2068
2069 static void
2070 __smr_cpu_down_and_unlock(
2071 struct processor *processor,
2072 struct smr_worker *smrw,
2073 smr_cpu_reason_t reason)
2074 {
2075 bool detach = !smrw->detach_reason;
2076
2077 /*
2078 * When reason is SMR_CPU_REASON_IGNORED,
2079 * this is called from smr_cpu_leave() on the way to idle.
2080 *
2081 * However this isn't synchronized with the recommendation
2082 * lock, hence it is possible that the CPU might actually
2083 * be recommended again while we're on the way to idle.
2084 *
2085 * By re-checking processor recommendation under
2086 * the __smrw_lock, we serialize with smr_cpu_up().
2087 */
2088 if (reason != SMR_CPU_REASON_IGNORED) {
2089 assert((smrw->detach_reason & reason) == 0);
2090 } else if (processor->is_recommended) {
2091 /*
2092 * The race we try to detect happened,
2093 * do nothing.
2094 */
2095 reason = SMR_CPU_REASON_NONE;
2096 detach = false;
2097 }
2098 smrw->detach_reason |= reason;
2099 reason = smrw->detach_reason;
2100
2101 if (detach && smrw->whead) {
2102 detach = !__smrw_wakeup_and_unlock(smrw);
2103 } else {
2104 __smrw_unlock(smrw);
2105 }
2106
2107 if (detach) {
2108 thread_unbind_after_queue_shutdown(smrw->thread, processor);
2109 }
2110 }
2111
2112 /*!
2113 * @function smr_cpu_down()
2114 *
2115 * @brief
2116 * Scheduler callback to notify that this processor is going down.
2117 *
2118 * @discussion
2119 * Called at splsched() when the processor run queue is being shut down.
2120 */
2121 void
2122 smr_cpu_down(struct processor *processor, smr_cpu_reason_t reason)
2123 {
2124 struct smr_worker *smrw;
2125
2126 smrw = PERCPU_GET_RELATIVE(smr_worker, processor, processor);
2127
2128 __smrw_lock(smrw);
2129 __smr_cpu_down_and_unlock(processor, smrw, reason);
2130 }
2131
2132
2133 /*!
2134 * @function smr_cpu_join()
2135 *
2136 * @brief
2137 * Scheduler callback to notify that this processor is going out of idle.
2138 *
2139 * @discussion
2140 * Called at splsched().
2141 */
2142 void
2143 smr_cpu_join(struct processor *processor, uint64_t ctime __unused)
2144 {
2145 #if CONFIG_QUIESCE_COUNTER
2146 struct smr_worker *smrw;
2147
2148 smrw = PERCPU_GET_RELATIVE(smr_worker, processor, processor);
2149 cpu_quiescent_join(smrw);
2150 #else
2151 (void)processor;
2152 #endif /* CONFIG_QUIESCE_COUNTER */
2153 }
2154
2155 /*!
2156 * @function smr_cpu_tick()
2157 *
2158 * @brief
2159 * Scheduler callback invoked during the scheduler maintenance routine.
2160 *
2161 * @discussion
2162 * Called at splsched().
2163 */
2164 void
2165 smr_cpu_tick(uint64_t ctime, bool safe_point)
2166 {
2167 struct smr_worker *smrw = PERCPU_GET(smr_worker);
2168 uint64_t interval = cpu_checkin_min_interval;
2169
2170 #if CONFIG_QUIESCE_COUNTER
2171 cpu_quiescent_tick(smrw, ctime, interval);
2172 #endif /* CONFIG_QUIESCE_COUNTER */
2173
2174 /*
2175 * if a bound thread was woken up on a derecommended core,
2176 * our detach_reason might be "IGNORED" and we want to leave
2177 * it alone in that case
2178 */
2179 if (safe_point && !smrw->detach_reason && smrw->whead &&
2180 current_processor()->state == PROCESSOR_RUNNING &&
2181 (ctime - smrw->drain_ctime) > interval) {
2182 __smr_worker_tick(smrw, ctime, true);
2183 }
2184 }
2185
2186 /*!
2187 * @function smr_cpu_leave()
2188 *
2189 * @brief
2190 * Scheduler callback to notify that this processor is going idle.
2191 *
2192 * @discussion
2193 * Called at splsched().
2194 */
2195 void
2196 smr_cpu_leave(struct processor *processor, uint64_t ctime)
2197 {
2198 struct smr_worker *smrw;
2199
2200 smrw = PERCPU_GET_RELATIVE(smr_worker, processor, processor);
2201
2202 /*
2203 * if a bound thread was woken up on a derecommended core,
2204 * our detach_reason might be "IGNORED" and we want to leave
2205 * it alone in that case
2206 *
2207 * See comment in __smr_worker_continue for why this must be
2208 * done unconditionally otherwise.
2209 */
2210 if (!smrw->detach_reason && smrw->whead) {
2211 __smr_worker_tick(smrw, ctime, true);
2212 }
2213
2214 if (__improbable(!processor->is_recommended)) {
2215 __smrw_lock(smrw);
2216 __smr_cpu_down_and_unlock(processor, smrw, SMR_CPU_REASON_IGNORED);
2217 }
2218
2219 #if CONFIG_QUIESCE_COUNTER
2220 cpu_quiescent_leave(smrw);
2221 #endif /* CONFIG_QUIESCE_COUNTER */
2222 }
2223
2224 /*!
2225 * @function smr_maintenance()
2226 *
2227 * @brief
2228 * Scheduler callback called at the scheduler tick.
2229 *
2230 * @discussion
2231 * Called at splsched().
2232 */
2233 void
2234 smr_maintenance(uint64_t ctime)
2235 {
2236 #if CONFIG_QUIESCE_COUNTER
2237 cpu_quiescent_advance(os_atomic_load(quiesce_genp, relaxed), ctime);
2238 #else
2239 (void)ctime;
2240 #endif /* CONFIG_QUIESCE_COUNTER */
2241 }
2242
2243
2244 #pragma mark - SMR hash tables
2245
2246 static struct smrq_slist_head *
2247 smr_hash_alloc_array(size_t size)
2248 {
2249 return kalloc_type(struct smrq_slist_head, size,
2250 Z_WAITOK | Z_ZERO | Z_SPRAYQTN);
2251 }
2252
2253 static void
2254 smr_hash_free_array(struct smrq_slist_head *array, size_t size)
2255 {
2256 kfree_type(struct smrq_slist_head, size, array);
2257 }
2258
2259 static inline uintptr_t
2260 smr_hash_array_encode(struct smrq_slist_head *array, uint16_t order)
2261 {
2262 uintptr_t ptr;
2263
2264 ptr = (uintptr_t)array;
2265 ptr &= ~SMRH_ARRAY_ORDER_MASK;
2266 ptr |= (uintptr_t)order << SMRH_ARRAY_ORDER_SHIFT;
2267
2268 return ptr;
2269 }
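/*
 * Illustrative sketch (assuming the SMRH_ARRAY_ORDER_* constants from the
 * smr_hash header): the bucket array pointer and its "order" are packed
 * into a single word, e.g. for a 128-bucket table:
 *
 *     shift   = 7, order = 64 - shift = 57
 *     encoded = ((uintptr_t)array & ~SMRH_ARRAY_ORDER_MASK)
 *             | ((uintptr_t)57 << SMRH_ARRAY_ORDER_SHIFT)
 *
 * smr_hash_array_decode() (used below) undoes this packing to recover
 * both the array pointer and the order.
 */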
2270
2271 #pragma mark SMR simple hash tables
2272
2273 void
2274 smr_hash_init(struct smr_hash *smrh, size_t size)
2275 {
2276 struct smrq_slist_head *array;
2277 uint16_t shift;
2278
2279 assert(size);
2280 shift = (uint16_t)flsll(size - 1);
2281 size = 1UL << shift;
2282 if (startup_phase >= STARTUP_SUB_LOCKDOWN) {
2283 assert(size * sizeof(struct smrq_slist_head) <=
2284 KALLOC_SAFE_ALLOC_SIZE);
2285 }
2286 array = smr_hash_alloc_array(size);
2287
2288 *smrh = (struct smr_hash){
2289 .smrh_array = smr_hash_array_encode(array, 64 - shift),
2290 };
2291 }
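/*
 * Usage sketch (hypothetical caller, for illustration only): sizes are
 * rounded up to a power of two, so for instance
 *
 *     struct smr_hash h;
 *
 *     smr_hash_init(&h, 100);      // allocates 128 buckets
 *     ...
 *     smr_hash_destroy(&h);        // frees the bucket array
 *
 * Element-level operations take an smrh_traits_t and go through the
 * traits-based helpers (see __smr_hash_serialized_clear() below for an
 * example), not through these two functions directly.
 */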
2292
2293 void
2294 smr_hash_destroy(struct smr_hash *smrh)
2295 {
2296 struct smr_hash_array array = smr_hash_array_decode(smrh);
2297
2298 smr_hash_free_array(array.smrh_array, smr_hash_size(array));
2299 *smrh = (struct smr_hash){ };
2300 }
2301
2302 void
2303 __smr_hash_serialized_clear(
2304 struct smr_hash *smrh,
2305 smrh_traits_t smrht,
2306 void (^free)(void *obj))
2307 {
2308 struct smr_hash_array array = smr_hash_array_decode(smrh);
2309
2310 for (size_t i = 0; i < smr_hash_size(array); i++) {
2311 struct smrq_slink *link;
2312 __smrq_slink_t *prev;
2313
2314 prev = &array.smrh_array[i].first;
2315 while ((link = smr_serialized_load(prev))) {
2316 prev = &link->next;
2317 free(__smrht_link_to_obj(smrht, link));
2318 }
2319
2320 smr_clear_store(&array.smrh_array[i].first);
2321 }
2322
2323 smrh->smrh_count = 0;
2324 }
2325
2326 kern_return_t
2327 __smr_hash_shrink_and_unlock(
2328 struct smr_hash *smrh,
2329 lck_mtx_t *lock,
2330 smrh_traits_t smrht)
2331 {
2332 struct smr_hash_array decptr = smr_hash_array_decode(smrh);
2333 struct smrq_slist_head *newarray, *oldarray;
2334 uint16_t neworder = decptr.smrh_order + 1;
2335 size_t oldsize = smr_hash_size(decptr);
2336 size_t newsize = oldsize / 2;
2337
2338 assert(newsize);
2339
2340 if (os_atomic_load(&smrh->smrh_resizing, relaxed)) {
2341 lck_mtx_unlock(lock);
2342 return KERN_FAILURE;
2343 }
2344
2345 os_atomic_store(&smrh->smrh_resizing, true, relaxed);
2346 lck_mtx_unlock(lock);
2347
2348 newarray = smr_hash_alloc_array(newsize);
2349 if (newarray == NULL) {
2350 os_atomic_store(&smrh->smrh_resizing, false, relaxed);
2351 return KERN_RESOURCE_SHORTAGE;
2352 }
2353
2354 lck_mtx_lock(lock);
2355
2356 /*
2357 * Step 1: collapse all the chains in pairs.
2358 */
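/*
 * For example (illustrative numbers): shrinking from oldsize == 8 to
 * newsize == 4, the loop below builds:
 *
 *     newarray[0] = oldarray[0] ++ oldarray[4]
 *     newarray[1] = oldarray[1] ++ oldarray[5]
 *     newarray[2] = oldarray[2] ++ oldarray[6]
 *     newarray[3] = oldarray[3] ++ oldarray[7]
 *
 * where "++" appends the second chain after the first one.
 */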
2359 oldarray = decptr.smrh_array;
2360
2361 for (size_t i = 0; i < newsize; i++) {
2362 newarray[i] = oldarray[i];
2363 smrq_serialized_append(&newarray[i], &oldarray[i + newsize]);
2364 }
2365
2366 /*
2367 * Step 2: publish the new array.
2368 */
2369 os_atomic_store(&smrh->smrh_array,
2370 smr_hash_array_encode(newarray, neworder), release);
2371
2372 os_atomic_store(&smrh->smrh_resizing, false, relaxed);
2373
2374 lck_mtx_unlock(lock);
2375
2376 /*
2377 * Step 3: free the old array once readers can't observe the old values.
2378 */
2379 smr_synchronize(smrht->domain);
2380
2381 smr_hash_free_array(oldarray, oldsize);
2382 return KERN_SUCCESS;
2383 }
2384
2385 kern_return_t
2386 __smr_hash_grow_and_unlock(
2387 struct smr_hash *smrh,
2388 lck_mtx_t *lock,
2389 smrh_traits_t smrht)
2390 {
2391 struct smr_hash_array decptr = smr_hash_array_decode(smrh);
2392 struct smrq_slist_head *newarray, *oldarray;
2393 __smrq_slink_t **prevarray;
2394 uint16_t neworder = decptr.smrh_order - 1;
2395 size_t oldsize = smr_hash_size(decptr);
2396 size_t newsize = 2 * oldsize;
2397 bool needs_another_round = false;
2398
2399 if (smrh->smrh_resizing) {
2400 lck_mtx_unlock(lock);
2401 return KERN_FAILURE;
2402 }
2403
2404 smrh->smrh_resizing = true;
2405 lck_mtx_unlock(lock);
2406
2407 newarray = smr_hash_alloc_array(newsize);
2408 if (newarray == NULL) {
2409 os_atomic_store(&smrh->smrh_resizing, false, relaxed);
2410 return KERN_RESOURCE_SHORTAGE;
2411 }
2412
2413 prevarray = kalloc_type(__smrq_slink_t *, newsize,
2414 Z_WAITOK | Z_ZERO | Z_SPRAYQTN);
2415 if (prevarray == NULL) {
2416 smr_hash_free_array(newarray, newsize);
2417 os_atomic_store(&smrh->smrh_resizing, false, relaxed);
2418 return KERN_RESOURCE_SHORTAGE;
2419 }
2420
2421
2422 lck_mtx_lock(lock);
2423
2424 /*
2425 * Step 1: create a duplicated array with twice as many heads.
2426 */
2427 oldarray = decptr.smrh_array;
2428
2429 memcpy(newarray, oldarray, oldsize * sizeof(newarray[0]));
2430 memcpy(newarray + oldsize, oldarray, oldsize * sizeof(newarray[0]));
2431
2432 /*
2433 * Step 2: Publish the new array, and wait for readers to observe it
2434 * before we do any change.
2435 */
2436 os_atomic_store(&smrh->smrh_array,
2437 smr_hash_array_encode(newarray, neworder), release);
2438
2439 smr_synchronize(smrht->domain);
2440
2441
2442 /*
2443 * Step 3: split the lists.
2444 */
2445
2446 /*
2447 * If the list we are trying to split looked like this,
2448 * where L elements will go to the "left" bucket and "R"
2449 * to the right one:
2450 *
2451 * old_head --> L1 --> L2 -> L5
2452 * \ / \
2453 * -> R3 --> R4 -> R6 --> NULL
2454 *
2455 * Then make sure the new heads point to their legitimate first element,
2456 * leading to this state:
2457 *
2458 * l_head --> L1 --> L2 -> L5
2459 * \ / \
2460 * r_head ----------------> R3 --> R4 -> R6 --> NULL
2461 *
2462 *
2463 * prevarray[left] = &L2->next
2464 * prevarray[right] = &r_head
2465 * oldarray[old] = L2
2466 */
2467
2468 for (size_t i = 0; i < oldsize; i++) {
2469 struct smrq_slink *link, *next;
2470 uint32_t want_mask;
2471
2472 link = smr_serialized_load(&oldarray[i].first);
2473 if (link == NULL) {
2474 continue;
2475 }
2476
2477 want_mask = smrht->obj_hash(link, 0) & oldsize;
2478 while ((next = smr_serialized_load(&link->next)) &&
2479 (smrht->obj_hash(next, 0) & oldsize) == want_mask) {
2480 link = next;
2481 }
2482
2483 if (want_mask == 0) {
2484 /* elements seen go to the "left" bucket */
2485 prevarray[i] = &link->next;
2486 prevarray[i + oldsize] = &newarray[i + oldsize].first;
2487 smr_serialized_store_relaxed(prevarray[i + oldsize], next);
2488 } else {
2489 /* elements seen go to the "right" bucket */
2490 prevarray[i] = &newarray[i].first;
2491 prevarray[i + oldsize] = &link->next;
2492 smr_serialized_store_relaxed(prevarray[i], next);
2493 }
2494
2495 smr_serialized_store_relaxed(&oldarray[i].first,
2496 next ? link : NULL);
2497
2498 needs_another_round |= (next != NULL);
2499 }
2500
2501 /*
2502 * At this point, before we can split further, we must wait
2503 * for readers to observe the previous state. Indeed, reusing
2504 * the example above, the next
2505 * round of splitting would end up with this:
2506 *
2507 * l_head --> L1 --> L2 ----------------> L5
2508 * / \
2509 * r_head ----------------> R3 --> R4 -> R6 --> NULL
2510 *
2511 *
2512 * prevarray[left] = &L2->next
2513 * prevarray[right] = &R4->next
2514 * oldarray[old] = R4
2515 *
2516 * But we must be sure that no readers can observe r_head
2517 * having been L1, otherwise a stale reader might skip over
2518 * R3/R4.
2519 *
2520 * Generally speaking, we need to do this after each round
2521 * of splitting that doesn't terminate the list with NULL.
2522 */
2523
2524 while (needs_another_round) {
2525 smr_synchronize(smrht->domain);
2526
2527 needs_another_round = false;
2528
2529 for (size_t i = 0; i < oldsize; i++) {
2530 struct smrq_slink *link, *next;
2531 uint32_t want_mask;
2532
2533 link = smr_serialized_load(&oldarray[i].first);
2534 if (link == NULL) {
2535 continue;
2536 }
2537
2538 /*
2539 * If `prevarray[i]` (left) points to the linkage
2540 * we stopped at, then it means the next element
2541 * will be "to the right" and vice versa.
2542 *
2543 * We also already know "next" exists, so only probe
2544 * after it.
2545 */
2546 if (prevarray[i] == &link->next) {
2547 want_mask = (uint32_t)oldsize;
2548 } else {
2549 want_mask = 0;
2550 }
2551
2552 link = smr_serialized_load(&link->next);
2553
2554 while ((next = smr_serialized_load(&link->next)) &&
2555 (smrht->obj_hash(next, 0) & oldsize) == want_mask) {
2556 link = next;
2557 }
2558
2559 if (want_mask == 0) {
2560 /* elements seen go to the "left" bucket */
2561 prevarray[i] = &link->next;
2562 smr_serialized_store_relaxed(prevarray[i + oldsize], next);
2563 } else {
2564 /* elements seen go to the "right" bucket */
2565 smr_serialized_store_relaxed(prevarray[i], next);
2566 prevarray[i + oldsize] = &link->next;
2567 }
2568
2569 smr_serialized_store_relaxed(&oldarray[i].first,
2570 next ? link : NULL);
2571
2572 needs_another_round |= (next != NULL);
2573 }
2574 }
2575
2576 smrh->smrh_resizing = false;
2577 lck_mtx_unlock(lock);
2578
2579 /*
2580 * Step 4: cleanup, no need to wait for readers, this happened already
2581 * at least once for splitting reasons.
2582 */
2583 smr_hash_free_array(oldarray, oldsize);
2584 kfree_type(__smrq_slink_t *, newsize, prevarray);
2585 return KERN_SUCCESS;
2586 }
2587
2588 #pragma mark SMR scalable hash tables
2589
2590 #define SMRSH_MIGRATED ((struct smrq_slink *)SMRSH_BUCKET_STOP_BIT)
2591 static LCK_GRP_DECLARE(smr_shash_grp, "smr_shash");
2592
2593 static inline size_t
2594 __smr_shash_min_size(struct smr_shash *smrh)
2595 {
2596 return 1ul << smrh->smrsh_min_shift;
2597 }
2598
2599 static inline size_t
2600 __smr_shash_size_for_shift(uint8_t shift)
2601 {
2602 return (~0u >> shift) + 1;
2603 }
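/*
 * For example: shift == 28 gives (~0u >> 28) + 1 == 16 buckets, and
 * shift == 27 gives 32.  Bucket indices are then computed as
 * (hash >> shift), as done in __smr_shash_rehash_with_target() below.
 */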
2604
2605 static inline size_t
2606 __smr_shash_cursize(smrsh_state_t state)
2607 {
2608 return __smr_shash_size_for_shift(state.curshift);
2609 }
2610
2611 static void
2612 __smr_shash_bucket_init(hw_lck_ptr_t *head)
2613 {
2614 hw_lck_ptr_init(head, __smr_shash_bucket_stop(head), &smr_shash_grp);
2615 }
2616
2617 static void
2618 __smr_shash_bucket_destroy(hw_lck_ptr_t *head)
2619 {
2620 hw_lck_ptr_destroy(head, &smr_shash_grp);
2621 }
2622
2623 __attribute__((noinline))
2624 void *
2625 __smr_shash_entered_find_slow(
2626 const struct smr_shash *smrh,
2627 smrh_key_t key,
2628 hw_lck_ptr_t *head,
2629 smrh_traits_t traits)
2630 {
2631 struct smrq_slink *link;
2632 smrsh_state_t state;
2633 uint32_t hash;
2634
2635 /* wait for the rehashing to be done into their target buckets */
2636 hw_lck_ptr_wait_for_value(head, SMRSH_MIGRATED, &smr_shash_grp);
2637
2638 state = os_atomic_load(&smrh->smrsh_state, dependency);
2639 hash = __smr_shash_hash(smrh, state.newidx, key, traits);
2640 head = __smr_shash_bucket(smrh, state, SMRSH_NEW, hash);
2641
2642 link = hw_lck_ptr_value(head);
2643 while (!__smr_shash_is_stop(link)) {
2644 if (traits->obj_equ(link, key)) {
2645 return __smrht_link_to_obj(traits, link);
2646 }
2647 link = smr_entered_load(&link->next);
2648 }
2649
2650 assert(link == __smr_shash_bucket_stop(head));
2651 return NULL;
2652 }
2653
2654 static const uint8_t __smr_shash_grow_ratio[] = {
2655 [SMRSH_COMPACT] = 6,
2656 [SMRSH_BALANCED] = 4,
2657 [SMRSH_BALANCED_NOSHRINK] = 4,
2658 [SMRSH_FASTEST] = 2,
2659 };
2660
2661 static inline uint64_t
2662 __smr_shash_count(struct smr_shash *smrh)
2663 {
2664 int64_t count = (int64_t)counter_load(&smrh->smrsh_count);
2665
2666 /*
2667 * Negative values make no sense and are likely due to some
2668 * stale values being read.
2669 */
2670 return count < 0 ? 0ull : (uint64_t)count;
2671 }
2672
2673 static inline bool
2674 __smr_shash_should_grow(
2675 struct smr_shash *smrh,
2676 smrsh_state_t state,
2677 uint64_t count)
2678 {
2679 size_t size = __smr_shash_cursize(state);
2680
2681 /* grow if elem:bucket ratio is worse than grow_ratio:1 */
2682 return count > __smr_shash_grow_ratio[smrh->smrsh_policy] * size;
2683 }
2684
2685 static inline bool
2686 __smr_shash_should_reseed(
2687 struct smr_shash *smrh,
2688 size_t observed_depth)
2689 {
2690 return observed_depth > 10 * __smr_shash_grow_ratio[smrh->smrsh_policy];
2691 }
2692
2693 static inline bool
2694 __smr_shash_should_shrink(
2695 struct smr_shash *smrh,
2696 smrsh_state_t state,
2697 uint64_t count)
2698 {
2699 size_t size = __smr_shash_cursize(state);
2700
2701 switch (smrh->smrsh_policy) {
2702 case SMRSH_COMPACT:
2703 /* shrink if bucket:elem ratio is worse than 1:1 */
2704 return size > count && size > __smr_shash_min_size(smrh);
2705 case SMRSH_BALANCED:
2706 /* shrink if bucket:elem ratio is worse than 2:1 */
2707 return size > 2 * count && size > __smr_shash_min_size(smrh);
2708 case SMRSH_BALANCED_NOSHRINK:
2709 case SMRSH_FASTEST:
2710 return false;
2711 }
2712 }
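/*
 * Worked example (illustrative numbers): with SMRSH_BALANCED and a
 * 64-bucket table, __smr_shash_should_grow() fires once the table holds
 * more than 4 * 64 == 256 elements, and __smr_shash_should_shrink()
 * fires once it drops below 32 elements (64 > 2 * count), provided 64 is
 * still above the table's minimum size.
 */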
2713
2714 static inline void
2715 __smr_shash_schedule_rehash(
2716 struct smr_shash *smrh,
2717 smrh_traits_t traits,
2718 smrsh_rehash_t reason)
2719 {
2720 smrsh_rehash_t rehash;
2721
2722 rehash = os_atomic_load(&smrh->smrsh_rehashing, relaxed);
2723 if (rehash & reason) {
2724 return;
2725 }
2726
2727 rehash = os_atomic_or_orig(&smrh->smrsh_rehashing, reason, relaxed);
2728 if (!rehash) {
2729 thread_call_enter1(smrh->smrsh_callout,
2730 __DECONST(void *, traits));
2731 }
2732 }
2733
2734 void *
2735 __smr_shash_entered_get_or_insert(
2736 struct smr_shash *smrh,
2737 smrh_key_t key,
2738 struct smrq_slink *link,
2739 smrh_traits_t traits)
2740 {
2741 struct smrq_slink *first;
2742 struct smrq_slink *other;
2743 uint32_t hash, depth;
2744 smrsh_state_t state;
2745 hw_lck_ptr_t *head;
2746 void *obj;
2747
2748 state = os_atomic_load(&smrh->smrsh_state, dependency);
2749 hash = __smr_shash_hash(smrh, state.curidx, key, traits);
2750 head = __smr_shash_bucket(smrh, state, SMRSH_CUR, hash);
2751 first = hw_lck_ptr_lock(head, &smr_shash_grp);
2752
2753 if (__improbable(first == SMRSH_MIGRATED)) {
2754 hw_lck_ptr_unlock_nopreempt(head, first, &smr_shash_grp);
2755
2756 state = os_atomic_load(&smrh->smrsh_state, dependency);
2757 hash = __smr_shash_hash(smrh, state.newidx, key, traits);
2758 head = __smr_shash_bucket(smrh, state, SMRSH_NEW, hash);
2759 first = hw_lck_ptr_lock_nopreempt(head, &smr_shash_grp);
2760 }
2761
2762 depth = 0;
2763 other = first;
2764 while (!__smr_shash_is_stop(other)) {
2765 depth++;
2766 if (traits->obj_equ(other, key)) {
2767 obj = __smrht_link_to_obj(traits, other);
2768 if (traits->obj_try_get(obj)) {
2769 hw_lck_ptr_unlock(head, first,
2770 &smr_shash_grp);
2771 return obj;
2772 }
2773 break;
2774 }
2775 other = smr_serialized_load(&other->next);
2776 }
2777
2778 counter_inc_preemption_disabled(&smrh->smrsh_count);
2779 smr_serialized_store_relaxed(&link->next, first);
2780 hw_lck_ptr_unlock(head, link, &smr_shash_grp);
2781
2782 if (__smr_shash_should_reseed(smrh, depth)) {
2783 __smr_shash_schedule_rehash(smrh, traits, SMRSH_REHASH_RESEED);
2784 } else if (depth * 2 >= __smr_shash_grow_ratio[smrh->smrsh_policy] &&
2785 __smr_shash_should_grow(smrh, state, __smr_shash_count(smrh))) {
2786 __smr_shash_schedule_rehash(smrh, traits, SMRSH_REHASH_GROW);
2787 }
2788 return NULL;
2789 }
2790
2791 __abortlike
2792 static void
2793 __smr_shash_missing_elt_panic(
2794 struct smr_shash *smrh,
2795 struct smrq_slink *link,
2796 smrh_traits_t traits)
2797 {
2798 panic("Unable to find item %p (linkage %p) in %p (traits %p)",
2799 __smrht_link_to_obj(traits, link), link, smrh, traits);
2800 }
2801
2802 smr_shash_mut_cursor_t
2803 __smr_shash_entered_mut_begin(
2804 struct smr_shash *smrh,
2805 struct smrq_slink *link,
2806 smrh_traits_t traits)
2807 {
2808 struct smrq_slink *first, *next;
2809 __smrq_slink_t *prev;
2810 smrsh_state_t state;
2811 hw_lck_ptr_t *head;
2812 uint32_t hash;
2813
2814 state = os_atomic_load(&smrh->smrsh_state, dependency);
2815 hash = __smr_shash_hash(smrh, state.curidx, link, traits);
2816 head = __smr_shash_bucket(smrh, state, SMRSH_CUR, hash);
2817 first = hw_lck_ptr_lock(head, &smr_shash_grp);
2818
2819 if (__improbable(first == SMRSH_MIGRATED)) {
2820 hw_lck_ptr_unlock_nopreempt(head, first, &smr_shash_grp);
2821
2822 state = os_atomic_load(&smrh->smrsh_state, dependency);
2823 hash = __smr_shash_hash(smrh, state.newidx, link, traits);
2824 head = __smr_shash_bucket(smrh, state, SMRSH_NEW, hash);
2825 first = hw_lck_ptr_lock_nopreempt(head, &smr_shash_grp);
2826 }
2827
2828 next = first;
2829 while (next != link) {
2830 if (__smr_shash_is_stop(next)) {
2831 __smr_shash_missing_elt_panic(smrh, link, traits);
2832 }
2833 prev = &next->next;
2834 next = smr_serialized_load(prev);
2835 }
2836
2837 return (smr_shash_mut_cursor_t){ .head = head, .prev = prev };
2838 }
2839
2840 void
2841 __smr_shash_entered_mut_erase(
2842 struct smr_shash *smrh,
2843 smr_shash_mut_cursor_t cursor,
2844 struct smrq_slink *link,
2845 smrh_traits_t traits)
2846 {
2847 struct smrq_slink *next, *first;
2848 smrsh_state_t state;
2849
2850 first = hw_lck_ptr_value(cursor.head);
2851
2852 next = smr_serialized_load(&link->next);
2853 if (first == link) {
2854 counter_dec_preemption_disabled(&smrh->smrsh_count);
2855 hw_lck_ptr_unlock(cursor.head, next, &smr_shash_grp);
2856 } else {
2857 smr_serialized_store_relaxed(cursor.prev, next);
2858 counter_dec_preemption_disabled(&smrh->smrsh_count);
2859 hw_lck_ptr_unlock(cursor.head, first, &smr_shash_grp);
2860 }
2861
2862 state = atomic_load_explicit(&smrh->smrsh_state, memory_order_relaxed);
2863 if (first == link && __smr_shash_is_stop(next) &&
2864 __smr_shash_should_shrink(smrh, state, __smr_shash_count(smrh))) {
2865 __smr_shash_schedule_rehash(smrh, traits, SMRSH_REHASH_SHRINK);
2866 }
2867 }
2868
2869 void
2870 __smr_shash_entered_mut_replace(
2871 smr_shash_mut_cursor_t cursor,
2872 struct smrq_slink *old_link,
2873 struct smrq_slink *new_link)
2874 {
2875 struct smrq_slink *first, *next;
2876
2877 first = hw_lck_ptr_value(cursor.head);
2878
2879 next = smr_serialized_load(&old_link->next);
2880 smr_serialized_store_relaxed(&new_link->next, next);
2881 if (first == old_link) {
2882 hw_lck_ptr_unlock(cursor.head, new_link, &smr_shash_grp);
2883 } else {
2884 smr_serialized_store_relaxed(cursor.prev, new_link);
2885 hw_lck_ptr_unlock(cursor.head, first, &smr_shash_grp);
2886 }
2887 }
2888
2889 void
2890 __smr_shash_entered_mut_abort(smr_shash_mut_cursor_t cursor)
2891 {
2892 hw_lck_ptr_unlock(cursor.head,
2893 hw_lck_ptr_value(cursor.head), &smr_shash_grp);
2894 }
2895
2896 static kern_return_t
2897 __smr_shash_rehash_with_target(
2898 struct smr_shash *smrh,
2899 smrsh_state_t state,
2900 uint8_t newshift,
2901 smrh_traits_t traits)
2902 {
2903 const size_t FLAT_SIZE = 256;
2904 struct smrq_slink *flat_queue[FLAT_SIZE];
2905
2906 size_t oldsize, newsize;
2907 hw_lck_ptr_t *oldarray;
2908 hw_lck_ptr_t *newarray;
2909 uint32_t newseed;
2910 uint8_t oldidx;
2911
2912 /*
2913 * This function resizes a scalable hash table.
2914 *
2915 * It doesn't require a lock because it is the callout
2916 * of a THREAD_CALL_ONCE thread call.
2917 */
2918
2919 oldidx = state.curidx;
2920 state.newidx = 1 - state.curidx;
2921 state.newshift = newshift;
2922 assert(__smr_shash_load_array(smrh, state.newidx) == NULL);
2923
2924 oldsize = __smr_shash_cursize(state);
2925 newsize = __smr_shash_size_for_shift(newshift);
2926
2927 oldarray = __smr_shash_load_array(smrh, state.curidx);
2928 newarray = (hw_lck_ptr_t *)smr_hash_alloc_array(newsize);
2929 newseed = (uint32_t)early_random();
2930
2931 if (newarray == NULL) {
2932 return KERN_RESOURCE_SHORTAGE;
2933 }
2934
2935 /*
2936 * Step 1: initialize the new array and seed,
2937 * and then publish the state referencing it.
2938 *
2939 * We do not need to synchronize explicitly with SMR,
2940 * because readers/writers will notice rehashing when
2941 * the bucket they interact with has a SMRSH_MIGRATED
2942 * value.
2943 */
2944
2945 for (size_t i = 0; i < newsize; i++) {
2946 __smr_shash_bucket_init(&newarray[i]);
2947 }
2948 os_atomic_store(&smrh->smrsh_array[state.newidx], newarray, relaxed);
2949 os_atomic_store(&smrh->smrsh_seed[state.newidx], newseed, relaxed);
2950 os_atomic_store(&smrh->smrsh_state, state, release);
2951
2952 /*
2953 * Step 2: migrate buckets "atomically" under the old bucket lock.
2954 *
2955 * This migration is atomic for writers because
2956 * they take the old bucket lock first, and if
2957 * they observe SMRSH_MIGRATED as the value,
2958 * they go look in the new bucket instead.
2959 *
2960 * This migration is atomic for readers, because
2961 * as we move elements to their new buckets,
2962 * the hash chains will not circle back to their
2963 * bucket head (the "stop" value won't match),
2964 * or the bucket head will be SMRSH_MIGRATED.
2965 *
2966 * This causes a slowpath which spins waiting
2967 * for SMRSH_MIGRATED to appear and then looks
2968 * in the new bucket.
2969 */
2970 for (size_t i = 0; i < oldsize; i++) {
2971 struct smrq_slink *first, *link, *next;
2972 hw_lck_ptr_t *head;
2973 uint32_t hash;
2974 size_t n = 0;
2975
2976 link = first = hw_lck_ptr_lock(&oldarray[i], &smr_shash_grp);
2977
2978 while (!__smr_shash_is_stop(link)) {
2979 flat_queue[n++ % FLAT_SIZE] = link;
2980 link = smr_serialized_load(&link->next);
2981 }
2982
2983 while (n-- > 0) {
2984 for (size_t j = (n % FLAT_SIZE) + 1; j-- > 0;) {
2985 link = flat_queue[j];
2986 hash = traits->obj_hash(link, newseed);
2987 head = &newarray[hash >> newshift];
2988 next = hw_lck_ptr_lock_nopreempt(head,
2989 &smr_shash_grp);
2990 smr_serialized_store_relaxed(&link->next, next);
2991 hw_lck_ptr_unlock_nopreempt(head, link,
2992 &smr_shash_grp);
2993 }
2994 n &= ~(FLAT_SIZE - 1);
2995
2996 /*
2997 * If there were more than FLAT_SIZE elements in the
2998 * chain (which is super unlikely and, in many ways,
2999 * worrisome), then we need to repopulate
3000 * the flattened queue array for each run.
3001 *
3002 * This is O(n^2) but we have worse problems anyway
3003 * if we ever hit this path.
3004 */
3005 if (__improbable(n > 0)) {
3006 link = first;
3007 for (size_t j = 0; j < n - FLAT_SIZE; j++) {
3008 link = smr_serialized_load(&link->next);
3009 }
3010
3011 flat_queue[0] = link;
3012 for (size_t j = 1; j < FLAT_SIZE; j++) {
3013 link = smr_serialized_load(&link->next);
3014 flat_queue[j] = link;
3015 }
3016 }
3017 }
3018
3019 hw_lck_ptr_unlock(&oldarray[i], SMRSH_MIGRATED, &smr_shash_grp);
3020 }
3021
3022 /*
3023 * Step 3: deallocate the old array of buckets,
3024 * making sure to hide it from readers.
3025 */
3026
3027 state.curshift = state.newshift;
3028 state.curidx = state.newidx;
3029 os_atomic_store(&smrh->smrsh_state, state, release);
3030
3031 smr_synchronize(traits->domain);
3032
3033 os_atomic_store(&smrh->smrsh_array[oldidx], NULL, relaxed);
3034 for (size_t i = 0; i < oldsize; i++) {
3035 __smr_shash_bucket_destroy(&oldarray[i]);
3036 }
3037 smr_hash_free_array((struct smrq_slist_head *)oldarray, oldsize);
3038
3039 return KERN_SUCCESS;
3040 }
3041
3042 static void
3043 __smr_shash_rehash(thread_call_param_t arg0, thread_call_param_t arg1)
3044 {
3045 struct smr_shash *smrh = arg0;
3046 smrh_traits_t traits = arg1;
3047 smrsh_rehash_t reason;
3048 smrsh_state_t state;
3049 uint64_t count;
3050 kern_return_t kr;
3051
3052 do {
3053 reason = os_atomic_xchg(&smrh->smrsh_rehashing,
3054 SMRSH_REHASH_RUNNING, relaxed);
3055
3056 state = os_atomic_load(&smrh->smrsh_state, relaxed);
3057 count = __smr_shash_count(smrh);
3058
3059 if (__smr_shash_should_grow(smrh, state, count)) {
3060 kr = __smr_shash_rehash_with_target(smrh, state,
3061 state.curshift - 1, traits);
3062 } else if (__smr_shash_should_shrink(smrh, state, count)) {
3063 kr = __smr_shash_rehash_with_target(smrh, state,
3064 state.curshift + 1, traits);
3065 } else if (reason & SMRSH_REHASH_RESEED) {
3066 kr = __smr_shash_rehash_with_target(smrh, state,
3067 state.curshift, traits);
3068 } else {
3069 kr = KERN_SUCCESS;
3070 }
3071
3072 if (kr == KERN_RESOURCE_SHORTAGE) {
3073 uint64_t deadline;
3074
3075 os_atomic_or(&smrh->smrsh_rehashing, reason, relaxed);
3076 nanoseconds_to_deadline(NSEC_PER_MSEC, &deadline);
3077 thread_call_enter1_delayed(smrh->smrsh_callout,
3078 arg1, deadline);
3079 break;
3080 }
3081 } while (!os_atomic_cmpxchg(&smrh->smrsh_rehashing,
3082 SMRSH_REHASH_RUNNING, SMRSH_REHASH_NONE, relaxed));
3083 }
3084
3085 void
3086 smr_shash_init(struct smr_shash *smrh, smrsh_policy_t policy, size_t min_size)
3087 {
3088 smrsh_state_t state;
3089 hw_lck_ptr_t *array;
3090 uint8_t shift;
3091 size_t size;
3092
3093 switch (policy) {
3094 case SMRSH_COMPACT:
3095 if (min_size < 2) {
3096 min_size = 2;
3097 }
3098 break;
3099 default:
3100 if (min_size < 16) {
3101 min_size = 16;
3102 }
3103 break;
3104 }
3105
3106 switch (policy) {
3107 case SMRSH_COMPACT:
3108 size = MIN(2, min_size);
3109 break;
3110 case SMRSH_BALANCED:
3111 case SMRSH_BALANCED_NOSHRINK:
3112 size = MIN(16, min_size);
3113 break;
3114 case SMRSH_FASTEST:
3115 size = min_size;
3116 break;
3117 }
3118
3119 if (size > KALLOC_SAFE_ALLOC_SIZE / sizeof(*array)) {
3120 size = KALLOC_SAFE_ALLOC_SIZE / sizeof(*array);
3121 }
3122 shift = (uint8_t)__builtin_clz((uint32_t)(size - 1));
3123 size = (~0u >> shift) + 1;
3124 array = (hw_lck_ptr_t *)smr_hash_alloc_array(size);
3125 for (size_t i = 0; i < size; i++) {
3126 __smr_shash_bucket_init(&array[i]);
3127 }
3128
3129 state = (smrsh_state_t){
3130 .curshift = shift,
3131 .newshift = shift,
3132 };
3133 *smrh = (struct smr_shash){
3134 .smrsh_array[0] = array,
3135 .smrsh_seed[0] = (uint32_t)early_random(),
3136 .smrsh_state = state,
3137 .smrsh_policy = policy,
3138 .smrsh_min_shift = (uint8_t)flsll(min_size - 1),
3139 };
3140 counter_alloc(&smrh->smrsh_count);
3141 smrh->smrsh_callout = thread_call_allocate_with_options(__smr_shash_rehash,
3142 smrh, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
3143 }
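/*
 * Usage sketch (hypothetical caller, for illustration only):
 *
 *     static struct smr_shash my_table;
 *
 *     smr_shash_init(&my_table, SMRSH_BALANCED, 64);
 *
 * starts with a small table (16 buckets for SMRSH_BALANCED) that the
 * rehash callout grows on demand but never shrinks below 64 buckets.
 * Lookups and insertions go through the traits-based entry points above
 * (__smr_shash_entered_find_slow(), __smr_shash_entered_get_or_insert(),
 * ...), normally via the wrapper layer in the smr_hash header.
 */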
3144
3145 void
3146 __smr_shash_destroy(
3147 struct smr_shash *smrh,
3148 smrh_traits_t traits,
3149 void (^free)(void *))
3150 {
3151 smrsh_state_t state;
3152 hw_lck_ptr_t *array;
3153 size_t size;
3154
3155 thread_call_cancel_wait(smrh->smrsh_callout);
3156
3157 state = os_atomic_load(&smrh->smrsh_state, dependency);
3158 assert(state.curidx == state.newidx);
3159 assert(__smr_shash_load_array(smrh, 1 - state.curidx) == NULL);
3160 size = __smr_shash_cursize(state);
3161 array = __smr_shash_load_array(smrh, state.curidx);
3162
3163 if (free) {
3164 for (size_t i = 0; i < size; i++) {
3165 struct smrq_slink *link, *next;
3166
3167 next = hw_lck_ptr_value(&array[i]);
3168 while (!__smr_shash_is_stop(next)) {
3169 link = next;
3170 next = smr_serialized_load(&link->next);
3171 free(__smrht_link_to_obj(traits, link));
3172 }
3173 }
3174 }
3175 for (size_t i = 0; i < size; i++) {
3176 __smr_shash_bucket_destroy(&array[i]);
3177 }
3178
3179 thread_call_free(smrh->smrsh_callout);
3180 counter_free(&smrh->smrsh_count);
3181 smr_hash_free_array((struct smrq_slist_head *)array, size);
3182 bzero(smrh, sizeof(*smrh));
3183 }
3184
3185
3186 #pragma mark misc
3187
3188 void
3189 __smr_linkage_invalid(__smrq_link_t *link)
3190 {
3191 struct smrq_link *elem = __container_of(link, struct smrq_link, next);
3192 struct smrq_link *next = smr_serialized_load(&elem->next);
3193
3194 panic("Invalid queue linkage: elt:%p next:%p next->prev:%p",
3195 elem, next, __container_of(next->prev, struct smrq_link, next));
3196 }
3197
3198 void
3199 __smr_stail_invalid(__smrq_slink_t *link, __smrq_slink_t *last)
3200 {
3201 struct smrq_slink *elem = __container_of(link, struct smrq_slink, next);
3202 struct smrq_slink *next = smr_serialized_load(&elem->next);
3203
3204 if (next) {
3205 panic("Invalid queue tail (element past end): elt:%p elt->next:%p",
3206 elem, next);
3207 } else {
3208 panic("Invalid queue tail (early end): elt:%p tail:%p",
3209 elem, __container_of(last, struct smrq_slink, next));
3210 }
3211 }
3212
3213 void
3214 __smr_tail_invalid(__smrq_link_t *link, __smrq_link_t *last)
3215 {
3216 struct smrq_link *elem = __container_of(link, struct smrq_link, next);
3217 struct smrq_link *next = smr_serialized_load(&elem->next);
3218
3219 if (next) {
3220 panic("Invalid queue tail (element past end): elt:%p elt->next:%p",
3221 elem, next);
3222 } else {
3223 panic("Invalid queue tail (early end): elt:%p tail:%p",
3224 elem, __container_of(last, struct smrq_link, next));
3225 }
3226 }
3227