1 /*
2 * Copyright (c) 2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #if !SCHED_TEST_HARNESS
30
31 #include <kern/debug.h>
32 #include <kern/kern_types.h>
33 #include <kern/machine.h>
34 #include <kern/misc_protos.h>
35 #include <kern/queue.h>
36 #include <kern/sched_clutch.h>
37 #include <kern/sched.h>
38 #include <kern/task.h>
39 #include <kern/thread.h>
40
41 #include <mach/mach_types.h>
42 #include <mach/machine.h>
43
44 #include <machine/atomic.h>
45 #include <machine/machine_cpu.h>
46 #include <machine/machine_routines.h>
47 #include <machine/sched_param.h>
48
49 #include <sys/kdebug.h>
50
51 #endif /* !SCHED_TEST_HARNESS */
52
53 #include <kern/processor.h>
54 #include <kern/sched_prim.h>
55 #include <kern/sched_rt.h>
56
57 #if CONFIG_SCHED_EDGE
58 #include <kern/sched_amp_common.h>
59 #endif /* CONFIG_SCHED_EDGE */
60
61 #if CONFIG_SCHED_CLUTCH
62
63 #if CONFIG_SCHED_SMT
64 #error "The clutch scheduler does not support CONFIG_SCHED_SMT."
65 #endif /* CONFIG_SCHED_SMT */
66
67 #define SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION 1
68 typedef union {
69 struct __attribute__((packed)) {
70 unsigned int version : 4;
71 unsigned int traverse_mode : 3;
72 unsigned int cluster_id : 6;
73 unsigned int selection_was_edf : 1;
74 unsigned int selection_was_cluster_bound : 1;
75 unsigned int selection_opened_starvation_avoidance_window : 1;
76 unsigned int selection_opened_warp_window : 1;
77 unsigned int starvation_avoidance_window_close : 12;
78 unsigned int warp_window_close : 12;
79 unsigned int reserved : 23; /* For future usage */
80 } trace_data;
81 uint64_t scdts_trace_data_packed;
82 } sched_clutch_dbg_thread_select_packed_t;
83
84 static_assert(TH_BUCKET_SCHED_MAX == 6, "Ensure layout of sched_clutch_dbg_thread_select_packed can fit root bucket bitmasks");
85 static_assert(sizeof(sched_clutch_dbg_thread_select_packed_t) <= sizeof(uint64_t), "Ensure sched_clutch_dbg_thread_select_packed_t can fit in one tracepoint argument");
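/*
 * Illustrative sketch (not compiled): how the packed debug word above can be
 * assembled and handed to a tracepoint as a single 64-bit argument. The field
 * values here are made up for illustration; the real selection logic later in
 * this file fills them in as it traverses the hierarchy.
 */
#if 0
static uint64_t
sched_clutch_dbg_thread_select_pack_example(void)
{
	sched_clutch_dbg_thread_select_packed_t dbg = {
		.trace_data = {
			.version = SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION,
			.traverse_mode = 2,     /* e.g. a preemption-check traversal */
			.cluster_id = 0,        /* hypothetical cluster */
			.selection_was_edf = 1, /* selection came from the EDF path */
		},
	};
	/* The whole union fits in one tracepoint argument */
	return dbg.scdts_trace_data_packed;
}
#endif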
86
87 /* Forward declarations of static routines */
88
89 /* Root level hierarchy management */
90 static void sched_clutch_root_init(sched_clutch_root_t, processor_set_t);
91 static void sched_clutch_root_bucket_init(sched_clutch_root_bucket_t, sched_bucket_t, bool);
92 static void sched_clutch_root_pri_update(sched_clutch_root_t);
93 static void sched_clutch_root_urgency_inc(sched_clutch_root_t, thread_t);
94 static void sched_clutch_root_urgency_dec(sched_clutch_root_t, thread_t);
95
96 __enum_decl(sched_clutch_highest_root_bucket_type_t, uint32_t, {
97 SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_NONE = 0,
98 SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY = 1,
99 SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL = 2,
100 });
101 __enum_decl(sched_clutch_traverse_mode_t, uint32_t, {
102 SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY = 0,
103 SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT = 1,
104 SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT = 2,
105 });
106 static_assert(SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT < (1 << 3), "Ensure traverse mode can be encoded within 3 bits of sched_clutch_dbg_thread_select_packed_t");
107 static sched_clutch_root_bucket_t sched_clutch_root_highest_root_bucket(sched_clutch_root_t, uint64_t, sched_clutch_highest_root_bucket_type_t, sched_clutch_root_bucket_t, thread_t, bool *, sched_clutch_traverse_mode_t, sched_clutch_dbg_thread_select_packed_t *);
108
109 /* Root bucket level hierarchy management */
110 static uint64_t sched_clutch_root_bucket_deadline_calculate(sched_clutch_root_bucket_t, uint64_t);
111 static void sched_clutch_root_bucket_deadline_update(sched_clutch_root_bucket_t, sched_clutch_root_t, uint64_t, bool);
112 static int sched_clutch_root_highest_runnable_qos(sched_clutch_root_t, sched_clutch_highest_root_bucket_type_t);
113
114 /* Options for clutch bucket ordering in the runq */
115 __options_decl(sched_clutch_bucket_options_t, uint32_t, {
116 SCHED_CLUTCH_BUCKET_OPTIONS_NONE = 0x0,
117 /* Round robin clutch bucket on thread removal */
118 SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR = 0x1,
119 /* Insert clutch bucket at head (for thread preemption) */
120 SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ = 0x2,
121 /* Insert clutch bucket at tail (default) */
122 SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ = 0x4,
123 });
124
125 /* Clutch bucket level hierarchy management */
126 static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
127 static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
128 static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
129 static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
130 static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
131 static uint8_t sched_clutch_bucket_pri_calculate(sched_clutch_bucket_t, uint64_t);
132
133 /* Clutch bucket group level properties management */
134 static void sched_clutch_bucket_group_cpu_usage_update(sched_clutch_bucket_group_t, uint64_t);
135 static void sched_clutch_bucket_group_cpu_adjust(sched_clutch_bucket_group_t, uint8_t);
136 static void sched_clutch_bucket_group_pri_shift_update(sched_clutch_bucket_group_t);
137 static uint8_t sched_clutch_bucket_group_pending_ageout(sched_clutch_bucket_group_t, uint64_t);
138 static uint32_t sched_clutch_bucket_group_run_count_inc(sched_clutch_bucket_group_t);
139 static uint32_t sched_clutch_bucket_group_run_count_dec(sched_clutch_bucket_group_t);
140 static uint8_t sched_clutch_bucket_group_interactivity_score_calculate(sched_clutch_bucket_group_t, uint64_t);
141
142 /* Clutch timeshare properties updates */
143 static uint32_t sched_clutch_run_bucket_incr(sched_clutch_t, sched_bucket_t);
144 static uint32_t sched_clutch_run_bucket_decr(sched_clutch_t, sched_bucket_t);
145
146 /* Clutch membership management */
147 static boolean_t sched_clutch_thread_insert(sched_clutch_root_t, thread_t, integer_t);
148 static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t, sched_clutch_bucket_options_t);
149 static thread_t sched_clutch_hierarchy_thread_highest(sched_clutch_root_t, processor_t, thread_t, sched_clutch_traverse_mode_t);
150
151 /* Clutch properties updates */
152 static uint32_t sched_clutch_root_urgency(sched_clutch_root_t);
153 static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t);
154 static int sched_clutch_root_priority(sched_clutch_root_t);
155 static sched_clutch_bucket_t sched_clutch_root_bucket_highest_clutch_bucket(sched_clutch_root_t, sched_clutch_root_bucket_t, processor_t _Nullable processor, thread_t _Nullable prev_thread, bool *_Nullable chose_prev_thread);
156
157 /* Clutch thread properties */
158 static boolean_t sched_thread_sched_pri_promoted(thread_t);
159 static inline sched_clutch_bucket_t sched_clutch_bucket_for_thread(sched_clutch_root_t, thread_t);
160 static inline sched_clutch_bucket_group_t sched_clutch_bucket_group_for_thread(thread_t);
161
162 /* General utilities */
163 static inline bool sched_clutch_pri_greater_than_tiebreak(int, int, bool);
164
165 #if CONFIG_SCHED_EDGE
166
167 /* System based routines */
168 static uint32_t sched_edge_thread_bound_cluster_id(thread_t);
169 static bool sched_edge_pset_peek_steal_possible(processor_set_t, processor_set_t, bitmap_t);
170
171 #endif /* CONFIG_SCHED_EDGE */
172
173 /* Helper debugging routines */
174 static inline void sched_clutch_hierarchy_locked_assert(sched_clutch_root_t);
175
176 /*
177 * Special markers for buckets that have invalid WCELs/quantums etc.
178 */
179 #define SCHED_CLUTCH_INVALID_TIME_32 ((uint32_t)~0)
180 #define SCHED_CLUTCH_INVALID_TIME_64 ((uint64_t)~0)
181
182 /*
183 * Root level bucket WCELs
184 *
185 * The root level bucket selection algorithm is an Earliest Deadline
186 * First (EDF) algorithm where the deadline for a bucket is defined
187 * by its worst-case execution latency (WCEL) and the timestamp at
188 * which the bucket became runnable.
189 *
190 */
191 static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = {
192 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
193 0, /* FG */
194 37500, /* IN (37.5ms) */
195 75000, /* DF (75ms) */
196 150000, /* UT (150ms) */
197 250000 /* BG (250ms) */
198 };
199 static uint64_t sched_clutch_root_bucket_wcel[TH_BUCKET_SCHED_MAX] = {0};
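/*
 * Illustrative sketch (not compiled): with the WCELs above, a timeshare root
 * bucket's EDF deadline is its make-runnable (or last-selection) timestamp
 * plus its WCEL, so a lower-QoS bucket that has waited long enough can win
 * over a freshly runnable higher-QoS bucket. See
 * sched_clutch_root_bucket_deadline_calculate() below for the real version.
 */
#if 0
static bool
sched_clutch_edf_in_beats_fg_sketch(uint64_t in_runnable_ts, uint64_t fg_runnable_ts)
{
	uint64_t in_deadline = in_runnable_ts + sched_clutch_root_bucket_wcel[TH_BUCKET_SHARE_IN];
	uint64_t fg_deadline = fg_runnable_ts + sched_clutch_root_bucket_wcel[TH_BUCKET_SHARE_FG];
	/* An IN bucket runnable for more than 37.5ms gets an earlier deadline than a freshly runnable FG bucket */
	return in_deadline < fg_deadline;
}
#endif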
200
201 /*
202 * Root level bucket warp
203 *
204 * Each root level bucket has a warp value associated with it as well.
205 * The warp value allows the root bucket to effectively warp ahead of
206 * lower priority buckets for a limited time even if it has a later
207 * deadline. The warping behavior provides extra (but limited)
208 * opportunity for high priority buckets to remain responsive.
209 */
210
211 /* Special warp deadline value to indicate that the bucket has not used any warp yet */
212 #define SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED (SCHED_CLUTCH_INVALID_TIME_64)
213
214 /* Warp window durations for various tiers */
215 static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = {
216 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
217 8000, /* FG (8ms) */
218 4000, /* IN (4ms) */
219 2000, /* DF (2ms) */
220 1000, /* UT (1ms) */
221 0 /* BG (0ms) */
222 };
223 static uint64_t sched_clutch_root_bucket_warp[TH_BUCKET_SCHED_MAX] = {0};
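/*
 * Illustrative sketch (not compiled): when a higher-QoS root bucket with warp
 * remaining is chosen ahead of the EDF pick, a warp window is opened by
 * stamping a warp deadline; any unused warp is banked when the bucket empties
 * before the window closes (see sched_clutch_root_bucket_empty() below).
 */
#if 0
static void
sched_clutch_warp_window_open_sketch(sched_clutch_root_bucket_t root_bucket, uint64_t timestamp)
{
	if (root_bucket->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
		/* First use of warp: the window lasts for whatever warp the bucket has left */
		root_bucket->scrb_warped_deadline = timestamp + root_bucket->scrb_warp_remaining;
	}
}
#endif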
224
225 /*
226 * Thread level quantum
227 *
228 * The algorithm defines quantums for threads at various buckets. This
229 * (combined with the root level bucket quantums) restricts how much
230 * the lower priority levels can preempt the higher priority threads.
231 */
232
233 #if XNU_TARGET_OS_OSX
234 static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
235 10000, /* FIXPRI (10ms) */
236 10000, /* FG (10ms) */
237 10000, /* IN (10ms) */
238 10000, /* DF (10ms) */
239 4000, /* UT (4ms) */
240 2000 /* BG (2ms) */
241 };
242 #else /* XNU_TARGET_OS_OSX */
243 static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
244 10000, /* FIXPRI (10ms) */
245 10000, /* FG (10ms) */
246 8000, /* IN (8ms) */
247 6000, /* DF (6ms) */
248 4000, /* UT (4ms) */
249 2000 /* BG (2ms) */
250 };
251 #endif /* XNU_TARGET_OS_OSX */
252
253 static uint64_t sched_clutch_thread_quantum[TH_BUCKET_SCHED_MAX] = {0};
254
255 /*
256 * sched_clutch_us_to_abstime()
257 *
258 * Initializer for converting all durations in usec to abstime
259 */
260 static void
261 sched_clutch_us_to_abstime(uint32_t *us_vals, uint64_t *abstime_vals)
262 {
263 for (int i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
264 if (us_vals[i] == SCHED_CLUTCH_INVALID_TIME_32) {
265 abstime_vals[i] = SCHED_CLUTCH_INVALID_TIME_64;
266 } else {
267 clock_interval_to_absolutetime_interval(us_vals[i],
268 NSEC_PER_USEC, &abstime_vals[i]);
269 }
270 }
271 }
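/*
 * Illustrative sketch (not compiled): the scheduler init path (outside this
 * excerpt) is expected to convert each of the tables above from microseconds
 * to abstime exactly once at boot, roughly as follows.
 */
#if 0
static void
sched_clutch_timing_init_sketch(void)
{
	sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
	sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
	sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
}
#endif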
272
273 /* Clutch/Edge Scheduler Debugging support */
274 #define SCHED_CLUTCH_DBG_THR_COUNT_PACK(a, b, c) ((uint64_t)c | ((uint64_t)b << 16) | ((uint64_t)a << 32))
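/*
 * Illustrative sketch (not compiled): the macro above packs three 16-bit
 * thread counts into one 64-bit tracepoint argument (a in bits 32 and up,
 * b in bits 16-31, c in bits 0-15). The argument names here are hypothetical.
 */
#if 0
static uint64_t
sched_clutch_dbg_thr_count_pack_example(uint16_t count_a, uint16_t count_b, uint16_t count_c)
{
	return SCHED_CLUTCH_DBG_THR_COUNT_PACK(count_a, count_b, count_c);
}
#endif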
275
276 #if DEVELOPMENT || DEBUG
277
278 kern_return_t
279 sched_clutch_thread_group_cpu_time_for_thread(thread_t thread, int sched_bucket, uint64_t *cpu_stats)
280 {
281 if (sched_bucket < 0 || sched_bucket >= TH_BUCKET_MAX) {
282 return KERN_INVALID_ARGUMENT;
283 }
284 sched_clutch_bucket_group_t clutch_bucket_group = &sched_clutch_for_thread(thread)->sc_clutch_groups[sched_bucket];
285 sched_clutch_bucket_cpu_data_t scb_cpu_data;
286 scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, relaxed);
287 cpu_stats[0] = scb_cpu_data.cpu_data.scbcd_cpu_used;
288 cpu_stats[1] = scb_cpu_data.cpu_data.scbcd_cpu_blocked;
289 return KERN_SUCCESS;
290 }
291
292 /*
293 * sched_clutch_hierarchy_locked_assert()
294 *
295 * Debugging helper routine. Asserts that the hierarchy is locked. The locking
296 * for the hierarchy depends on where the hierarchy is hooked. The current
297 * implementation hooks the hierarchy at the pset, so the hierarchy is locked
298 * using the pset lock.
299 */
300 static inline void
301 sched_clutch_hierarchy_locked_assert(
302 sched_clutch_root_t root_clutch)
303 {
304 pset_assert_locked(root_clutch->scr_pset);
305 }
306
307 #else /* DEVELOPMENT || DEBUG */
308
309 static inline void
310 sched_clutch_hierarchy_locked_assert(
311 __unused sched_clutch_root_t root_clutch)
312 {
313 }
314
315 #endif /* DEVELOPMENT || DEBUG */
316
317 /*
318 * sched_clutch_thr_count_inc()
319 *
320 * Increment thread count at a hierarchy level with overflow checks.
321 */
322 static void
323 sched_clutch_thr_count_inc(
324 uint16_t *thr_count)
325 {
326 if (__improbable(os_inc_overflow(thr_count))) {
327 panic("sched_clutch thread count overflowed!");
328 }
329 }
330
331 /*
332 * sched_clutch_thr_count_dec()
333 *
334 * Decrement thread count at a hierarchy level with underflow checks.
335 */
336 static void
337 sched_clutch_thr_count_dec(
338 uint16_t *thr_count)
339 {
340 if (__improbable(os_dec_overflow(thr_count))) {
341 panic("sched_clutch thread count underflowed!");
342 }
343 }
344
345 static sched_bucket_t
346 sched_convert_pri_to_bucket(uint8_t priority)
347 {
348 sched_bucket_t bucket = TH_BUCKET_RUN;
349
350 if (priority > BASEPRI_USER_INITIATED) {
351 bucket = TH_BUCKET_SHARE_FG;
352 } else if (priority > BASEPRI_DEFAULT) {
353 bucket = TH_BUCKET_SHARE_IN;
354 } else if (priority > BASEPRI_UTILITY) {
355 bucket = TH_BUCKET_SHARE_DF;
356 } else if (priority > MAXPRI_THROTTLE) {
357 bucket = TH_BUCKET_SHARE_UT;
358 } else {
359 bucket = TH_BUCKET_SHARE_BG;
360 }
361 return bucket;
362 }
363
364 /*
365 * sched_clutch_thread_bucket_map()
366 *
367 * Map a thread to a scheduling bucket for the clutch/edge scheduler
368 * based on its scheduling mode and the priority attribute passed in.
369 */
370 static sched_bucket_t
371 sched_clutch_thread_bucket_map(thread_t thread, int pri)
372 {
373 switch (thread->sched_mode) {
374 case TH_MODE_FIXED:
375 if (pri >= BASEPRI_FOREGROUND) {
376 return TH_BUCKET_FIXPRI;
377 } else {
378 return sched_convert_pri_to_bucket(pri);
379 }
380
381 case TH_MODE_REALTIME:
382 return TH_BUCKET_FIXPRI;
383
384 case TH_MODE_TIMESHARE:
385 return sched_convert_pri_to_bucket(pri);
386
387 default:
388 panic("unexpected mode: %d", thread->sched_mode);
389 break;
390 }
391 }
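/*
 * Illustrative sketch (not compiled): examples of the mapping above, assuming
 * the first thread is TH_MODE_TIMESHARE and the second is TH_MODE_FIXED.
 */
#if 0
static void
sched_clutch_thread_bucket_map_examples(thread_t timeshare_thread, thread_t fixed_thread)
{
	/* A timeshare thread at BASEPRI_DEFAULT lands in the default timeshare bucket */
	assert(sched_clutch_thread_bucket_map(timeshare_thread, BASEPRI_DEFAULT) == TH_BUCKET_SHARE_DF);
	/* A fixed-priority thread at or above BASEPRI_FOREGROUND bypasses timesharing entirely */
	assert(sched_clutch_thread_bucket_map(fixed_thread, BASEPRI_FOREGROUND) == TH_BUCKET_FIXPRI);
}
#endif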
392
393 /*
394 * The clutch scheduler attempts to ageout the CPU usage of clutch bucket groups
395 * based on the amount of time they have been pending and the load at that
396 * scheduling bucket level. Since the clutch bucket groups are global (i.e. they span
397 * multiple clusters), it is important to keep the load as a global counter as well.
398 */
399 static uint32_t _Atomic sched_clutch_global_bucket_load[TH_BUCKET_SCHED_MAX];
400
401 /*
402 * sched_clutch_root_init()
403 *
404 * Routine to initialize the scheduler hierarchy root.
405 */
406 static void
407 sched_clutch_root_init(
408 sched_clutch_root_t root_clutch,
409 processor_set_t pset)
410 {
411 root_clutch->scr_thr_count = 0;
412 root_clutch->scr_priority = NOPRI;
413 root_clutch->scr_urgency = 0;
414 root_clutch->scr_pset = pset;
415 #if CONFIG_SCHED_EDGE
416 root_clutch->scr_cluster_id = pset->pset_cluster_id;
417 for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
418 root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type] = 0;
419 }
420 /* Initialize the silos for tracking steal eligibility */
421 bitmap_zero((bitmap_t *)root_clutch->scr_populated_steal_silos, MAX_PSETS);
422 for (pset_id_t p = 0; p < MAX_PSETS; p++) {
423 bitmap_zero((bitmap_t *)root_clutch->scr_steal_silos[p].sess_populated_steal_queues, TH_BUCKET_SCHED_MAX);
424 for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
425 priority_queue_init(&root_clutch->scr_steal_silos[p].sess_steal_queues[bucket]);
426 }
427 }
428 #else /* CONFIG_SCHED_EDGE */
429 root_clutch->scr_cluster_id = 0;
430 #endif /* CONFIG_SCHED_EDGE */
431
432 /* Initialize the queue which maintains all runnable clutch_buckets for timesharing purposes */
433 queue_init(&root_clutch->scr_clutch_buckets);
434
435 bzero(&root_clutch->scr_cumulative_run_count, sizeof(root_clutch->scr_cumulative_run_count));
436 bitmap_zero(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
437 bitmap_zero(root_clutch->scr_bound_warp_available, TH_BUCKET_SCHED_MAX);
438 priority_queue_init(&root_clutch->scr_bound_root_buckets);
439
440 /* Initialize the bitmap and priority queue of runnable root buckets */
441 priority_queue_init(&root_clutch->scr_unbound_root_buckets);
442 bitmap_zero(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
443 bitmap_zero(root_clutch->scr_unbound_warp_available, TH_BUCKET_SCHED_MAX);
444
445 /* Initialize all the root buckets */
446 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
447 sched_clutch_root_bucket_init(&root_clutch->scr_unbound_buckets[i], i, false);
448 sched_clutch_root_bucket_init(&root_clutch->scr_bound_buckets[i], i, true);
449 }
450 }
451
452 /*
453 * Clutch Bucket Runqueues
454 *
455 * The clutch buckets are maintained in a runq at the root bucket level. The
456 * runq organization allows clutch buckets to be ordered based on various
457 * factors such as:
458 *
459 * - Clutch buckets are round robin'ed at the same priority level when a
460 * thread is selected from a clutch bucket. This prevents a clutch bucket
461 * from starving out other clutch buckets at the same priority.
462 *
463 * - Clutch buckets are inserted at the head when they become runnable due to
464 * thread preemption. This allows threads that were preempted to maintain
465 * their order in the queue.
466 */
467
468 /*
469 * sched_clutch_bucket_runq_init()
470 *
471 * Initialize a clutch bucket runq.
472 */
473 static void
474 sched_clutch_bucket_runq_init(
475 sched_clutch_bucket_runq_t clutch_buckets_rq)
476 {
477 clutch_buckets_rq->scbrq_highq = NOPRI;
478 for (uint8_t i = 0; i < BITMAP_LEN(NRQS); i++) {
479 clutch_buckets_rq->scbrq_bitmap[i] = 0;
480 }
481 clutch_buckets_rq->scbrq_count = 0;
482 for (int i = 0; i < NRQS; i++) {
483 circle_queue_init(&clutch_buckets_rq->scbrq_queues[i]);
484 }
485 }
486
487 /*
488 * sched_clutch_bucket_runq_empty()
489 *
490 * Returns whether a clutch bucket runq is empty.
491 */
492 static boolean_t
493 sched_clutch_bucket_runq_empty(
494 sched_clutch_bucket_runq_t clutch_buckets_rq)
495 {
496 return clutch_buckets_rq->scbrq_count == 0;
497 }
498
499 /*
500 * sched_clutch_bucket_runq_peek()
501 *
502 * Returns the highest priority clutch bucket in the runq.
503 */
504 static sched_clutch_bucket_t
505 sched_clutch_bucket_runq_peek(
506 sched_clutch_bucket_runq_t clutch_buckets_rq)
507 {
508 if (clutch_buckets_rq->scbrq_count > 0) {
509 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_buckets_rq->scbrq_highq];
510 return cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink);
511 } else {
512 return NULL;
513 }
514 }
515
516 /*
517 * sched_clutch_bucket_runq_enqueue()
518 *
519 * Enqueue a clutch bucket into the runq based on the options passed in.
520 */
521 static void
522 sched_clutch_bucket_runq_enqueue(
523 sched_clutch_bucket_runq_t clutch_buckets_rq,
524 sched_clutch_bucket_t clutch_bucket,
525 sched_clutch_bucket_options_t options)
526 {
527 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
528 if (circle_queue_empty(queue)) {
529 circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
530 bitmap_set(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
531 if (clutch_bucket->scb_priority > clutch_buckets_rq->scbrq_highq) {
532 clutch_buckets_rq->scbrq_highq = clutch_bucket->scb_priority;
533 }
534 } else {
535 if (options & SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ) {
536 circle_enqueue_head(queue, &clutch_bucket->scb_runqlink);
537 } else {
538 /*
539 * Default behavior (handles SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ &
540 * SCHED_CLUTCH_BUCKET_OPTIONS_NONE)
541 */
542 circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
543 }
544 }
545 clutch_buckets_rq->scbrq_count++;
546 }
547
548 /*
549 * sched_clutch_bucket_runq_remove()
550 *
551 * Remove a clutch bucket from the runq.
552 */
553 static void
554 sched_clutch_bucket_runq_remove(
555 sched_clutch_bucket_runq_t clutch_buckets_rq,
556 sched_clutch_bucket_t clutch_bucket)
557 {
558 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
559 circle_dequeue(queue, &clutch_bucket->scb_runqlink);
560 assert(clutch_buckets_rq->scbrq_count > 0);
561 clutch_buckets_rq->scbrq_count--;
562 if (circle_queue_empty(queue)) {
563 bitmap_clear(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
564 clutch_buckets_rq->scbrq_highq = bitmap_first(clutch_buckets_rq->scbrq_bitmap, NRQS);
565 }
566 }
567
568 static void
569 sched_clutch_bucket_runq_rotate(
570 sched_clutch_bucket_runq_t clutch_buckets_rq,
571 sched_clutch_bucket_t clutch_bucket)
572 {
573 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
574 assert(clutch_bucket == cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink));
575 circle_queue_rotate_head_forward(queue);
576 }
577
578 /*
579 * sched_clutch_root_bucket_init()
580 *
581 * Routine to initialize root buckets.
582 */
583 static void
584 sched_clutch_root_bucket_init(
585 sched_clutch_root_bucket_t root_bucket,
586 sched_bucket_t bucket,
587 bool bound_root_bucket)
588 {
589 root_bucket->scrb_bucket = bucket;
590 if (bound_root_bucket) {
591 /* For bound root buckets, initialize the bound thread runq. */
592 root_bucket->scrb_bound = true;
593 run_queue_init(&root_bucket->scrb_bound_thread_runq);
594 } else {
595 /*
596 * The unbounded root buckets contain a runq of runnable clutch buckets
597 * which then hold the runnable threads.
598 */
599 root_bucket->scrb_bound = false;
600 sched_clutch_bucket_runq_init(&root_bucket->scrb_clutch_buckets);
601 }
602 priority_queue_entry_init(&root_bucket->scrb_pqlink);
603 root_bucket->scrb_pqlink.deadline = 0;
604 root_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
605 root_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[root_bucket->scrb_bucket];
606 root_bucket->scrb_starvation_avoidance = false;
607 root_bucket->scrb_starvation_ts = 0;
608 }
609
610 /*
611 * Special case scheduling for Above UI bucket.
612 *
613 * AboveUI threads are typically system critical threads that need low latency,
614 * which is why they are handled specially.
615 *
616 * Since the priority ranges for AboveUI and FG Timeshare buckets overlap, it is
617 * important to maintain some native priority order between those buckets. For unbounded
618 * root buckets, the policy is to compare the highest clutch buckets of both root buckets; if the
619 * Above UI bucket is higher, schedule it immediately. Otherwise fall through to the
620 * deadline based scheduling, which should pick up the timeshare buckets. For the bound
621 * case, the policy simply compares the priority of the highest runnable threads in
622 * the above UI and timeshare buckets.
623 *
624 * The implementation allows extremely low latency CPU access for Above UI threads
625 * while supporting the use case of high priority timeshare threads contending with
626 * lower priority fixed priority threads.
627 */
628
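/*
 * Illustrative sketch (not compiled): the comparisons below are expressed via
 * sched_clutch_pri_greater_than_tiebreak() (defined later in this file). A
 * plausible reading of its contract, sketched here only to make the tie-break
 * policy concrete: the first priority wins outright when strictly greater and
 * wins an exact tie only when the caller says it should.
 */
#if 0
static bool
sched_clutch_pri_greater_than_tiebreak_sketch(int pri, int other_pri, bool pri_wins_ties)
{
	return (pri > other_pri) || ((pri == other_pri) && pri_wins_ties);
}
#endif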
629
630 /*
631 * sched_clutch_root_unbound_select_aboveui()
632 *
633 * Routine to determine if the above UI unbounded bucket should be selected for execution.
634 *
635 * Writes the highest unbound (timeshare FG vs. above UI) bucket, its priority, and whether
636 * it is an above UI bucket into the pointer parameters.
637 */
638 static void
639 sched_clutch_root_unbound_select_aboveui(
640 sched_clutch_root_t root_clutch,
641 sched_clutch_root_bucket_t *highest_bucket,
642 int *highest_pri,
643 bool *highest_is_aboveui,
644 sched_clutch_root_bucket_t _Nullable prev_bucket,
645 thread_t _Nullable prev_thread)
646 {
647 /* First determine the highest Clutch bucket */
648 sched_clutch_root_bucket_t higher_root_bucket = NULL;
649 sched_clutch_bucket_t higher_clutch_bucket = NULL;
650 int higher_bucket_sched_pri = -1;
651 bool higher_is_aboveui = false;
652 /* Consider unbound Above UI */
653 if (bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_FIXPRI)) {
654 higher_root_bucket = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
655 higher_clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, higher_root_bucket, NULL, NULL, NULL);
656 higher_bucket_sched_pri = priority_queue_max_sched_pri(&higher_clutch_bucket->scb_clutchpri_prioq);
657 higher_is_aboveui = true;
658 }
659 /* Consider unbound Timeshare FG */
660 if (bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SHARE_FG)) {
661 sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_unbound_buckets[TH_BUCKET_SHARE_FG];
662 sched_clutch_bucket_t clutch_bucket_sharefg = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, root_bucket_sharefg, NULL, NULL, NULL);
663 /* Strict greater-than because unbound timeshare FG root bucket loses all priority ties at this level */
664 if (higher_root_bucket == NULL || clutch_bucket_sharefg->scb_priority > higher_clutch_bucket->scb_priority) {
665 higher_root_bucket = root_bucket_sharefg;
666 higher_clutch_bucket = clutch_bucket_sharefg;
667 higher_bucket_sched_pri = priority_queue_max_sched_pri(&higher_clutch_bucket->scb_clutchpri_prioq);
668 higher_is_aboveui = false;
669 }
670 }
671 /* Consider the previous thread */
672 if (prev_thread != NULL) {
673 assert(prev_bucket->scrb_bound == false);
674 sched_clutch_bucket_group_t prev_clutch_bucket_group = sched_clutch_bucket_group_for_thread(prev_thread);
675 int prev_clutch_bucket_pri = prev_thread->sched_pri + (int)(os_atomic_load(&prev_clutch_bucket_group->scbg_interactivity_data.scct_count, relaxed));
676 sched_clutch_bucket_t prev_clutch_bucket = sched_clutch_bucket_for_thread(root_clutch, prev_thread);
677 bool prev_bucket_should_win_ties = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI && higher_is_aboveui == false;
678 if (higher_clutch_bucket == NULL ||
679 sched_clutch_pri_greater_than_tiebreak(prev_clutch_bucket_pri, higher_clutch_bucket->scb_priority, prev_bucket_should_win_ties)) {
680 higher_root_bucket = prev_bucket;
681 higher_clutch_bucket = prev_clutch_bucket;
682 higher_bucket_sched_pri = prev_thread->sched_pri;
683 higher_is_aboveui = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI;
684 }
685 }
686 /* Compare highest priority in the highest unbound Clutch bucket to highest priority seen from the bound buckets */
687 if (higher_root_bucket != NULL) {
688 bool unbound_should_win_ties = higher_is_aboveui == true && *highest_is_aboveui == false;
689 if (sched_clutch_pri_greater_than_tiebreak(higher_bucket_sched_pri, *highest_pri, unbound_should_win_ties)) {
690 *highest_pri = higher_bucket_sched_pri;
691 *highest_bucket = higher_root_bucket;
692 *highest_is_aboveui = higher_is_aboveui;
693 }
694 }
695 }
696
697 /*
698 * sched_clutch_root_bound_select_aboveui()
699 *
700 * Routine to determine if the above UI bounded bucket should be selected for execution.
701 *
702 * Writes the highest bound (timeshare FG vs. above UI) bucket, its priority, and whether
703 * it is an above UI bucket into the pointer parameters.
704 */
705 static void
706 sched_clutch_root_bound_select_aboveui(
707 sched_clutch_root_t root_clutch,
708 sched_clutch_root_bucket_t *highest_bucket,
709 int *highest_pri,
710 bool *highest_is_aboveui,
711 sched_clutch_root_bucket_t _Nullable prev_bucket,
712 thread_t _Nullable prev_thread)
713 {
714 /* Consider bound Above UI */
715 sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
716 if (bitmap_test(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_FIXPRI) &&
717 sched_clutch_pri_greater_than_tiebreak(root_bucket_aboveui->scrb_bound_thread_runq.highq, *highest_pri, *highest_is_aboveui == false)) {
718 *highest_pri = root_bucket_aboveui->scrb_bound_thread_runq.highq;
719 *highest_bucket = root_bucket_aboveui;
720 *highest_is_aboveui = true;
721 }
722 /* Consider bound Timeshare FG */
723 sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_bound_buckets[TH_BUCKET_SHARE_FG];
724 if (bitmap_test(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SHARE_FG) &&
725 sched_clutch_pri_greater_than_tiebreak(root_bucket_sharefg->scrb_bound_thread_runq.highq, *highest_pri, false)) {
726 *highest_pri = root_bucket_sharefg->scrb_bound_thread_runq.highq;
727 *highest_bucket = root_bucket_sharefg;
728 *highest_is_aboveui = false;
729 }
730 /* Consider the previous thread */
731 if (prev_thread != NULL) {
732 assert(prev_bucket->scrb_bound == true);
733 bool prev_bucket_should_win_ties = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI && *highest_is_aboveui == false;
734 if (sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, *highest_pri, prev_bucket_should_win_ties)) {
735 *highest_pri = prev_thread->sched_pri;
736 *highest_bucket = prev_bucket;
737 *highest_is_aboveui = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI;
738 }
739 }
740 }
741
742 /*
743 * sched_clutch_root_highest_runnable_qos()
744 *
745 * Returns the index of the highest-QoS root bucket which is currently runnable.
746 */
747 static int
748 sched_clutch_root_highest_runnable_qos(
749 sched_clutch_root_t root_clutch,
750 sched_clutch_highest_root_bucket_type_t type)
751 {
752 int highest_unbound_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
753 if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
754 return highest_unbound_bucket;
755 }
756 assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
757 int highest_bound_bucket = bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
758 if (highest_bound_bucket == -1) {
759 return highest_unbound_bucket;
760 }
761 if (highest_unbound_bucket == -1) {
762 return highest_bound_bucket;
763 }
764 /* Both bound and unbound buckets are runnable, return the higher QoS */
765 return MIN(highest_bound_bucket, highest_unbound_bucket);
766 }
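/*
 * Illustrative sketch (not compiled): bucket indices grow from the highest QoS
 * (TH_BUCKET_FIXPRI) to the lowest (TH_BUCKET_SHARE_BG), which is why the
 * MIN() above selects the higher-QoS of the two runnable buckets.
 */
#if 0
static void
sched_clutch_highest_runnable_qos_example(void)
{
	/* If the bound side's highest runnable bucket is UT and the unbound side's is IN,
	 * the hierarchy's highest runnable QoS is IN */
	assert(MIN(TH_BUCKET_SHARE_UT, TH_BUCKET_SHARE_IN) == TH_BUCKET_SHARE_IN);
}
#endif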
767
768 /*
769 * sched_clutch_root_highest_aboveui_root_bucket()
770 *
771 * Routine to determine if an above UI root bucket should be selected for execution.
772 *
773 * Returns the root bucket if we should run an above UI bucket or NULL otherwise.
774 */
775 static sched_clutch_root_bucket_t
776 sched_clutch_root_highest_aboveui_root_bucket(
777 sched_clutch_root_t root_clutch,
778 sched_clutch_highest_root_bucket_type_t type,
779 sched_clutch_root_bucket_t _Nullable prev_bucket,
780 thread_t _Nullable prev_thread,
781 bool *chose_prev_thread)
782 {
783 assert((prev_thread == NULL && prev_bucket == NULL) || (prev_thread != NULL && prev_bucket != NULL));
784 assert((type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL) || (prev_bucket == NULL));
785
786 sched_clutch_root_bucket_t highest_bucket = NULL;
787 int highest_pri = -1;
788 bool highest_is_aboveui = false;
789
790 /* Forward previous thread to the correct comparison logic, based on boundness */
791 sched_clutch_root_bucket_t bound_prev_bucket = NULL, unbound_prev_bucket = NULL;
792 thread_t bound_prev_thread = NULL, unbound_prev_thread = NULL;
793 if (prev_thread != NULL) {
794 if (prev_bucket->scrb_bound) {
795 bound_prev_bucket = prev_bucket;
796 bound_prev_thread = prev_thread;
797 } else {
798 unbound_prev_bucket = prev_bucket;
799 unbound_prev_thread = prev_thread;
800 }
801 }
802
803 /* Consider bound Above UI vs. Timeshare FG first, so those buckets will win ties against the corresponding unbound buckets */
804 if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL) {
805 sched_clutch_root_bound_select_aboveui(root_clutch, &highest_bucket, &highest_pri, &highest_is_aboveui, bound_prev_bucket, bound_prev_thread);
806 }
807
808 /* Consider unbound Above UI vs. Timeshare FG */
809 sched_clutch_root_unbound_select_aboveui(root_clutch, &highest_bucket, &highest_pri, &highest_is_aboveui, unbound_prev_bucket, unbound_prev_thread);
810 if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
811 return highest_is_aboveui ? highest_bucket : NULL;
812 }
813 assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
814
815 /* Determine whether we already know to continue running the previous thread */
816 if (prev_thread != NULL &&
817 bitmap_test(highest_bucket->scrb_bound ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap, highest_bucket->scrb_bucket) == false) {
818 /* Highest bucket we saw is empty, so the previous thread must have been the highest */
819 assert(highest_bucket == prev_bucket);
820 *chose_prev_thread = true;
821 }
822
823 return highest_is_aboveui ? highest_bucket : NULL;
824 }
825
826 /*
827 * sched_clutch_root_highest_root_bucket()
828 *
829 * Main routine to find the highest runnable root level bucket.
830 * This routine is called from performance sensitive contexts; so it is
831 * crucial to keep this O(1). The options parameter determines if
832 * the selection logic should look at unbounded threads only (for
833 * cross-cluster stealing operations) or both bounded and unbounded
834 * threads (for selecting next thread for execution on current cluster).
835 */
836 static sched_clutch_root_bucket_t
837 sched_clutch_root_highest_root_bucket(
838 sched_clutch_root_t root_clutch,
839 uint64_t timestamp,
840 sched_clutch_highest_root_bucket_type_t type,
841 sched_clutch_root_bucket_t _Nullable prev_bucket,
842 thread_t _Nullable prev_thread,
843 bool *chose_prev_thread,
844 sched_clutch_traverse_mode_t mode,
845 sched_clutch_dbg_thread_select_packed_t *debug_info)
846 {
847 assert((prev_thread == NULL && prev_bucket == NULL) || (prev_thread != NULL && prev_bucket != NULL));
848 assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL || (prev_thread == NULL));
849 assert(prev_thread == NULL || (mode != SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY));
850 sched_clutch_hierarchy_locked_assert(root_clutch);
851
852 int highest_runnable_bucket = sched_clutch_root_highest_runnable_qos(root_clutch, type);
853 if (highest_runnable_bucket == -1) {
854 /*
855 * The Clutch hierarchy has no runnable threads. We can continue running
856 * whatever was running previously.
857 */
858 assert(sched_clutch_root_count(root_clutch) == 0 || type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY);
859 *chose_prev_thread = true;
860 if (prev_thread != NULL) {
861 debug_info->trace_data.selection_was_edf = true;
862 }
863 return prev_bucket;
864 }
865
866 /* Consider Above UI threads, in comparison to Timeshare FG threads */
867 sched_clutch_root_bucket_t highest_aboveui_bucket = sched_clutch_root_highest_aboveui_root_bucket(root_clutch, type, prev_bucket, prev_thread, chose_prev_thread);
868 if (highest_aboveui_bucket != NULL) {
869 debug_info->trace_data.selection_was_edf = true;
870 return highest_aboveui_bucket;
871 }
872
873 /*
874 * Above UI bucket is not runnable or has a low priority runnable thread; use the
875 * earliest deadline model to schedule threads. The idea is that as the timeshare
876 * buckets use CPU, they will drop their interactivity score/sched priority and
877 * allow the low priority AboveUI buckets to be scheduled.
878 */
879
880 /* Find the earliest deadline bucket */
881 sched_clutch_root_bucket_t edf_bucket;
882 bool edf_bucket_enqueued_normally;
883
884 evaluate_root_buckets:
885 edf_bucket = NULL;
886 edf_bucket_enqueued_normally = true;
887
888 if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
889 edf_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
890 } else {
891 assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
892 sched_clutch_root_bucket_t unbound_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
893 sched_clutch_root_bucket_t bound_bucket = priority_queue_min(&root_clutch->scr_bound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
894 if (bound_bucket && unbound_bucket) {
895 /* If bound and unbound root buckets are runnable, select the one with the earlier deadline */
896 edf_bucket = (bound_bucket->scrb_pqlink.deadline <= unbound_bucket->scrb_pqlink.deadline) ? bound_bucket : unbound_bucket;
897 } else {
898 edf_bucket = (bound_bucket) ? bound_bucket : unbound_bucket;
899 }
900 }
901 if (edf_bucket == NULL) {
902 /* The timeshare portion of the runqueue is empty */
903 assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
904 assert(prev_thread != NULL);
905 *chose_prev_thread = true;
906 if (prev_thread != NULL) {
907 debug_info->trace_data.selection_was_edf = true;
908 }
909 return prev_bucket;
910 }
911 if (prev_bucket != NULL && prev_bucket->scrb_pqlink.deadline < edf_bucket->scrb_pqlink.deadline) {
912 /* The previous thread's root bucket has the earliest deadline and is not currently enqueued */
913 edf_bucket = prev_bucket;
914 edf_bucket_enqueued_normally = false;
915 }
916
917 if (edf_bucket->scrb_starvation_avoidance) {
918 /* Check if the EDF bucket is in an expired starvation avoidance window */
919 uint64_t starvation_window = sched_clutch_thread_quantum[edf_bucket->scrb_bucket];
920 if (timestamp >= (edf_bucket->scrb_starvation_ts + starvation_window)) {
921 /* Starvation avoidance window is over; update deadline and re-evaluate EDF */
922 edf_bucket->scrb_starvation_avoidance = false;
923 edf_bucket->scrb_starvation_ts = 0;
924 sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp, edf_bucket_enqueued_normally);
925 bit_set(debug_info->trace_data.starvation_avoidance_window_close, edf_bucket->scrb_bound * TH_BUCKET_SCHED_MAX + edf_bucket->scrb_bucket);
926 goto evaluate_root_buckets;
927 }
928 }
929
930 /*
931 * Check if any of the buckets have warp available. The implementation only allows root buckets to warp ahead of
932 * buckets of the same type (i.e. bound/unbound). The reason for doing that is because warping is a concept that
933 * makes sense between root buckets of the same type since its effectively a scheduling advantage over a lower
934 * QoS root bucket.
935 */
936 bitmap_t *warp_available_bitmap = (edf_bucket->scrb_bound) ? (root_clutch->scr_bound_warp_available) : (root_clutch->scr_unbound_warp_available);
937 int warp_bucket_index = bitmap_lsb_first(warp_available_bitmap, TH_BUCKET_SCHED_MAX);
938
939 /* Allow the prev_bucket to use its warp as well */
940 bool prev_bucket_warping = (prev_bucket != NULL) && (prev_bucket->scrb_bound == edf_bucket->scrb_bound) &&
941 prev_bucket->scrb_bucket < edf_bucket->scrb_bucket && (prev_bucket->scrb_warp_remaining > 0) &&
942 (warp_bucket_index == -1 || prev_bucket->scrb_bucket < warp_bucket_index);
943
944 bool non_edf_bucket_can_warp = (warp_bucket_index != -1 && warp_bucket_index < edf_bucket->scrb_bucket) || prev_bucket_warping;
945
946 if (non_edf_bucket_can_warp == false) {
947 /* No higher buckets have warp left; best choice is the EDF based bucket */
948 debug_info->trace_data.selection_was_edf = true;
949
950 bool should_update_edf_starvation_state = edf_bucket == prev_bucket || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT;
951 if (edf_bucket->scrb_starvation_avoidance == false && should_update_edf_starvation_state) {
952 /* Looks like the EDF bucket is not in starvation avoidance mode; check if it should be */
953 if (highest_runnable_bucket < edf_bucket->scrb_bucket || (prev_bucket != NULL && prev_bucket->scrb_bucket < edf_bucket->scrb_bucket)) {
954 /*
955 * Since a higher bucket is runnable, it indicates that the EDF bucket should be in starvation avoidance.
956 *
957 * The starvation avoidance window is allocated as a single quantum for the starved bucket, enforced
958 * simultaneously across all CPUs in the cluster. The idea is to grant the starved bucket roughly one
959 * quantum per core, each time the bucket reaches the earliest deadline position. Note that this
960 * cadence is driven by the difference between the starved bucket's and highest-runnable bucket's WCELs.
961 */
962 edf_bucket->scrb_starvation_avoidance = true;
963 edf_bucket->scrb_starvation_ts = timestamp;
964 debug_info->trace_data.selection_opened_starvation_avoidance_window = true;
965 } else {
966 /* EDF bucket is being selected in the natural order; update deadline and reset warp */
967 sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp, edf_bucket_enqueued_normally);
968 edf_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[edf_bucket->scrb_bucket];
969 edf_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
970 if (edf_bucket_enqueued_normally) {
971 if (edf_bucket->scrb_bound) {
972 bitmap_set(root_clutch->scr_bound_warp_available, edf_bucket->scrb_bucket);
973 } else {
974 bitmap_set(root_clutch->scr_unbound_warp_available, edf_bucket->scrb_bucket);
975 }
976 }
977 }
978 }
979 *chose_prev_thread = !edf_bucket_enqueued_normally;
980 return edf_bucket;
981 }
982
983 /*
984 * Looks like there is a root bucket which is higher in the natural priority
985 * order than edf_bucket and might have some warp remaining.
986 */
987 assert(prev_bucket_warping || warp_bucket_index >= 0);
988 sched_clutch_root_bucket_t warp_bucket = NULL;
989 if (prev_bucket_warping) {
990 assert(warp_bucket_index == -1 || prev_bucket->scrb_bucket < warp_bucket_index);
991 warp_bucket = prev_bucket;
992 } else {
993 warp_bucket = (edf_bucket->scrb_bound) ? &root_clutch->scr_bound_buckets[warp_bucket_index] : &root_clutch->scr_unbound_buckets[warp_bucket_index];
994 }
995
996 bool warp_is_being_utilized = warp_bucket == prev_bucket || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT;
997
998 if (warp_bucket->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
999 if (warp_is_being_utilized) {
1000 /* Root bucket has not used any of its warp; set a deadline to expire its warp and return it */
1001 warp_bucket->scrb_warped_deadline = timestamp + warp_bucket->scrb_warp_remaining;
1002 sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp, !prev_bucket_warping);
1003 debug_info->trace_data.selection_opened_warp_window = true;
1004 }
1005 *chose_prev_thread = prev_bucket_warping;
1006 debug_info->trace_data.selection_was_edf = false;
1007 assert(warp_bucket != edf_bucket);
1008 return warp_bucket;
1009 }
1010 if (warp_bucket->scrb_warped_deadline > timestamp) {
1011 /* Root bucket already has a warp window open with some warp remaining */
1012 if (warp_is_being_utilized) {
1013 sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp, !prev_bucket_warping);
1014 }
1015 *chose_prev_thread = prev_bucket_warping;
1016 debug_info->trace_data.selection_was_edf = false;
1017 return warp_bucket;
1018 }
1019
1020 /*
1021 * For this bucket, the warp window was opened sometime in the past but has now
1022 * expired. Mark the bucket as not available for warp anymore and re-run the
1023 * warp bucket selection logic.
1024 */
1025 warp_bucket->scrb_warp_remaining = 0;
1026 if (!prev_bucket_warping) {
1027 if (warp_bucket->scrb_bound) {
1028 bitmap_clear(root_clutch->scr_bound_warp_available, warp_bucket->scrb_bucket);
1029 } else {
1030 bitmap_clear(root_clutch->scr_unbound_warp_available, warp_bucket->scrb_bucket);
1031 }
1032 }
1033 bit_set(debug_info->trace_data.warp_window_close, warp_bucket->scrb_bound * TH_BUCKET_SCHED_MAX + warp_bucket->scrb_bucket);
1034 goto evaluate_root_buckets;
1035 }
1036
1037 static inline bool
1038 sched_clutch_bucket_is_above_timeshare(sched_bucket_t bucket)
1039 {
1040 return bucket == TH_BUCKET_FIXPRI;
1041 }
1042
1043 /*
1044 * sched_clutch_root_bucket_deadline_calculate()
1045 *
1046 * Calculate the deadline for the bucket based on its WCEL
1047 */
1048 static uint64_t
1049 sched_clutch_root_bucket_deadline_calculate(
1050 sched_clutch_root_bucket_t root_bucket,
1051 uint64_t timestamp)
1052 {
1053 /* For fixpri AboveUI bucket always return it as the earliest deadline */
1054 if (sched_clutch_bucket_is_above_timeshare(root_bucket->scrb_bucket)) {
1055 return 0;
1056 }
1057
1058 /* For all timeshare buckets set the deadline as current time + worst-case-execution-latency */
1059 return timestamp + sched_clutch_root_bucket_wcel[root_bucket->scrb_bucket];
1060 }
1061
1062 /*
1063 * sched_clutch_root_bucket_deadline_update()
1064 *
1065 * Routine to update the deadline of the root bucket when it is selected.
1066 * Updating the deadline also moves the root_bucket in the EDF priority
1067 * queue.
1068 */
1069 static void
1070 sched_clutch_root_bucket_deadline_update(
1071 sched_clutch_root_bucket_t root_bucket,
1072 sched_clutch_root_t root_clutch,
1073 uint64_t timestamp,
1074 bool bucket_is_enqueued)
1075 {
1076 if (sched_clutch_bucket_is_above_timeshare(root_bucket->scrb_bucket)) {
1077 /* The algorithm never uses the deadlines for scheduling TH_BUCKET_FIXPRI bucket */
1078 return;
1079 }
1080
1081 uint64_t old_deadline = root_bucket->scrb_pqlink.deadline;
1082 uint64_t new_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
1083 if (__improbable(old_deadline > new_deadline)) {
1084 panic("old_deadline (%llu) > new_deadline (%llu); root_bucket (%d); timestamp (%llu)", old_deadline, new_deadline, root_bucket->scrb_bucket, timestamp);
1085 }
1086 if (old_deadline != new_deadline) {
1087 root_bucket->scrb_pqlink.deadline = new_deadline;
1088 if (bucket_is_enqueued) {
1089 struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
1090 priority_queue_entry_increased(prioq, &root_bucket->scrb_pqlink);
1091 }
1092 }
1093 }
1094
1095 /*
1096 * sched_clutch_root_bucket_runnable()
1097 *
1098 * Routine to insert a newly runnable root bucket into the hierarchy.
1099 * Also updates the deadline and warp parameters as necessary.
1100 */
1101 static void
1102 sched_clutch_root_bucket_runnable(
1103 sched_clutch_root_bucket_t root_bucket,
1104 sched_clutch_root_t root_clutch,
1105 uint64_t timestamp)
1106 {
1107 /* Mark the root bucket as runnable */
1108 bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
1109 bitmap_set(runnable_bitmap, root_bucket->scrb_bucket);
1110
1111 if (sched_clutch_bucket_is_above_timeshare(root_bucket->scrb_bucket)) {
1112 /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
1113 return;
1114 }
1115
1116 if (root_bucket->scrb_starvation_avoidance == false) {
1117 /*
1118 * Only update the deadline if the bucket was not in starvation avoidance mode. If the bucket was in
1119 * starvation avoidance and its window has expired, the highest root bucket selection logic will notice
1120 * that and fix it up.
1121 */
1122 root_bucket->scrb_pqlink.deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
1123 }
1124 struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
1125 priority_queue_insert(prioq, &root_bucket->scrb_pqlink);
1126 if (root_bucket->scrb_warp_remaining) {
1127 /* Since the bucket has some warp remaining and it is now runnable, mark it as available for warp */
1128 bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
1129 bitmap_set(warp_bitmap, root_bucket->scrb_bucket);
1130 }
1131 }
1132
1133 /*
1134 * sched_clutch_root_bucket_empty()
1135 *
1136 * Routine to remove an empty root bucket from the hierarchy.
1137 * Also updates the deadline and warp parameters as necessary.
1138 */
1139 static void
1140 sched_clutch_root_bucket_empty(
1141 sched_clutch_root_bucket_t root_bucket,
1142 sched_clutch_root_t root_clutch,
1143 uint64_t timestamp)
1144 {
1145 bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
1146 bitmap_clear(runnable_bitmap, root_bucket->scrb_bucket);
1147
1148 if (sched_clutch_bucket_is_above_timeshare(root_bucket->scrb_bucket)) {
1149 /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
1150 return;
1151 }
1152
1153 struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
1154 priority_queue_remove(prioq, &root_bucket->scrb_pqlink);
1155
1156 bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
1157 bitmap_clear(warp_bitmap, root_bucket->scrb_bucket);
1158
1159 if (root_bucket->scrb_warped_deadline != SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
1160 if (root_bucket->scrb_warped_deadline > timestamp) {
1161 /*
1162 * For root buckets that were using the warp, check if the warp
1163 * deadline is in the future. If yes, remove the wall time the
1164 * warp was active and update the warp remaining. This allows
1165 * the root bucket to use the remaining warp the next time it
1166 * becomes runnable.
1167 */
1168 root_bucket->scrb_warp_remaining = root_bucket->scrb_warped_deadline - timestamp;
1169 } else {
1170 /*
1171 * If the root bucket's warped deadline is in the past, it has used up
1172 * all the warp it was assigned. Empty out its warp remaining.
1173 */
1174 root_bucket->scrb_warp_remaining = 0;
1175 }
1176 }
1177 }
1178
1179 static int
1180 sched_clutch_global_bucket_load_get(
1181 sched_bucket_t bucket)
1182 {
1183 return (int)os_atomic_load(&sched_clutch_global_bucket_load[bucket], relaxed);
1184 }
1185
1186 /*
1187 * sched_clutch_root_pri_update()
1188 *
1189 * The root level priority is used for thread selection and preemption
1190 * logic.
1191 *
1192 * The logic uses the same decision as thread selection for deciding between the
1193 * above UI and timeshare buckets. If one of the timesharing buckets has to be
1194 * used for priority calculation, the logic is slightly different from thread
1195 * selection, because thread selection considers deadlines, warps etc. to
1196 * decide the most optimal bucket at a given timestamp. Since the priority
1197 * value is used for preemption decisions only, it needs to be based on the
1198 * highest runnable thread available in the timeshare domain. This logic can
1199 * be made more sophisticated if there are cases of unnecessary preemption
1200 * being seen in workloads.
1201 */
1202 static void
1203 sched_clutch_root_pri_update(
1204 sched_clutch_root_t root_clutch)
1205 {
1206 sched_clutch_hierarchy_locked_assert(root_clutch);
1207 int16_t root_bound_pri = NOPRI;
1208 int16_t root_unbound_pri = NOPRI;
1209
1210 /* Consider bound root buckets */
1211 if (bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
1212 goto root_pri_update_unbound;
1213 }
1214 sched_clutch_root_bucket_t highest_bound_root_bucket = NULL;
1215 __unused int highest_bound_root_bucket_pri = -1;
1216 bool highest_bound_root_bucket_is_fixpri = false;
1217 sched_clutch_root_bound_select_aboveui(root_clutch, &highest_bound_root_bucket, &highest_bound_root_bucket_pri, &highest_bound_root_bucket_is_fixpri, NULL, NULL);
1218 if (highest_bound_root_bucket_is_fixpri == false) {
1219 int root_bucket_index = bitmap_lsb_next(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
1220 assert(root_bucket_index != -1);
1221 highest_bound_root_bucket = &root_clutch->scr_bound_buckets[root_bucket_index];
1222 }
1223 root_bound_pri = highest_bound_root_bucket->scrb_bound_thread_runq.highq;
1224
1225 root_pri_update_unbound:
1226 /* Consider unbound root buckets */
1227 if (bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
1228 goto root_pri_update_complete;
1229 }
1230 sched_clutch_root_bucket_t highest_unbound_root_bucket = NULL;
1231 __unused int highest_unbound_root_bucket_pri = -1;
1232 bool highest_unbound_root_bucket_is_fixpri = false;
1233 sched_clutch_root_unbound_select_aboveui(root_clutch, &highest_unbound_root_bucket, &highest_unbound_root_bucket_pri, &highest_unbound_root_bucket_is_fixpri, NULL, NULL);
1234 if (highest_unbound_root_bucket_is_fixpri == false) {
1235 int root_bucket_index = bitmap_lsb_next(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
1236 assert(root_bucket_index != -1);
1237 highest_unbound_root_bucket = &root_clutch->scr_unbound_buckets[root_bucket_index];
1238 }
1239
1240 /* For the selected root bucket, find the highest priority clutch bucket */
1241 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, highest_unbound_root_bucket, NULL, NULL, NULL);
1242 root_unbound_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
1243
1244 root_pri_update_complete:
1245 root_clutch->scr_priority = MAX(root_bound_pri, root_unbound_pri);
1246 }
1247
1248 /*
1249 * sched_clutch_root_urgency_inc()
1250 *
1251 * Routine to increment the urgency at the root level based on the thread
1252 * priority that is being inserted into the hierarchy. The root urgency
1253 * counter is updated based on the urgency of threads in any of the
1254 * clutch buckets which are part of the hierarchy.
1255 *
1256 * Always called with the pset lock held.
1257 */
1258 static void
1259 sched_clutch_root_urgency_inc(
1260 sched_clutch_root_t root_clutch,
1261 thread_t thread)
1262 {
1263 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
1264 root_clutch->scr_urgency++;
1265 }
1266 }
1267
1268 /*
1269 * sched_clutch_root_urgency_dec()
1270 *
1271 * Routine to decrement the urgency at the root level based on the thread
1272 * priority that is being removed from the hierarchy. The root urgency
1273 * counter is updated based on the urgency of threads in any of the
1274 * clutch buckets which are part of the hierarchy.
1275 *
1276 * Always called with the pset lock held.
1277 */
1278 static void
1279 sched_clutch_root_urgency_dec(
1280 sched_clutch_root_t root_clutch,
1281 thread_t thread)
1282 {
1283 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
1284 root_clutch->scr_urgency--;
1285 }
1286 }
1287
1288 /*
1289 * Clutch bucket level scheduling
1290 *
1291 * The second level of scheduling is the clutch bucket level scheduling
1292 * which tries to schedule thread groups within root_buckets. Each
1293 * clutch represents a thread group and a clutch_bucket_group represents
1294 * threads at a particular sched_bucket within that thread group. The
1295 * clutch_bucket_group contains a clutch_bucket per cluster on the system
1296 * where it holds the runnable threads destined for execution on that
1297 * cluster.
1298 *
1299 * The goal of this level of scheduling is to allow interactive thread
1300 * groups low latency access to the CPU. It also provides slight
1301 * scheduling preference for App and unrestricted thread groups.
1302 *
1303 * The clutch bucket scheduling algorithm measures an interactivity
1304 * score for all clutch bucket groups. The interactivity score is based
1305 * on the ratio of the CPU used and the voluntary blocking of threads
1306 * within the clutch bucket group. The algorithm is very close to the ULE
1307 * scheduler on FreeBSD in terms of calculations. The interactivity
1308 * score provides an interactivity boost in the range of
1309 * [0:SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI * 2] which allows interactive
1310 * thread groups to win over CPU spinners.
1311 *
1312 * The interactivity score of the clutch bucket group is combined with the
1313 * highest base/promoted priority of threads in the clutch bucket to form
1314 * the overall priority of the clutch bucket.
1315 */
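
/*
 * Illustrative arithmetic only (made-up values, using the default
 * interactive pri of 8 defined below):
 *
 *   - Mostly-blocked group:  cpu_blocked = 300, cpu_used = 100
 *       score = 8 + (8 * (300 - 100)) / 300 = 8 + 5 = 13
 *   - Mostly-spinning group: cpu_blocked = 100, cpu_used = 400
 *       score = (8 * 100) / 400 = 2
 *
 * which is how the score spans [0 : interactive_pri * 2] and lets the
 * blocked-heavy thread group win over the CPU spinner at equal base priority.
 */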
1316
1317 /* Priority boost range for interactivity */
1318 #define SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT (8)
1319 static uint8_t sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
1320
1321 /* Window to scale the CPU usage and blocked values (currently 500ms); it is the threshold for used+blocked */
1322 static uint64_t sched_clutch_bucket_group_adjust_threshold = 0;
1323 #define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS (500000)
1324
1325 /* The ratio to scale the cpu/blocked time per window */
1326 #define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO (10)
1327
1328 /* Initial value for voluntary blocking time for the clutch_bucket */
1329 #define SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID (uint64_t)(~0)
1330
1331 /* Value indicating the clutch bucket is not pending execution */
1332 #define SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID ((uint64_t)(~0))
1333
1334 /*
1335 * Thread group CPU starvation avoidance
1336 *
1337 * In heavily CPU contended scenarios, it is possible that some thread groups
1338 * which have a low interactivity score do not get CPU time at all. In order to
1339 * resolve that, the scheduler tries to ageout the CPU usage of the clutch
1340 * bucket group when it has been pending execution for a certain time as defined
1341 * by the sched_clutch_bucket_group_pending_delta_us values below.
1342 *
1343 * The values chosen here are very close to the WCEL values for each sched bucket.
1344 * These values are added into the pending interval used to determine how
1345 * frequently we will ageout the CPU usage, ensuring a reasonable limit on the
1346 * frequency.
1347 */
1348 static uint32_t sched_clutch_bucket_group_pending_delta_us[TH_BUCKET_SCHED_MAX] = {
1349 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
1350 10000, /* FG */
1351 37500, /* IN */
1352 75000, /* DF */
1353 150000, /* UT */
1354 250000, /* BG */
1355 };
1356 static uint64_t sched_clutch_bucket_group_pending_delta[TH_BUCKET_SCHED_MAX] = {0};
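
/*
 * Sketch of how the pending deltas feed the ageout (the actual computation
 * lives in the interactivity score path and is not shown here; treat the
 * expression below as an assumption for illustration):
 *
 *   pending_intervals = (timestamp - scbg_pending_data.scct_timestamp) /
 *       sched_clutch_bucket_group_pending_delta[bucket];
 *
 * e.g. a DF clutch bucket group pending for ~150ms would be credited with
 * two pending intervals, each aging out enough CPU usage to improve its
 * interactivity score by roughly one point.
 */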
1357
1358 /*
1359 * sched_clutch_bucket_init()
1360 *
1361 * Initializer for clutch buckets.
1362 */
1363 static void
1364 sched_clutch_bucket_init(
1365 sched_clutch_bucket_t clutch_bucket,
1366 sched_clutch_bucket_group_t clutch_bucket_group,
1367 sched_bucket_t bucket)
1368 {
1369 clutch_bucket->scb_bucket = bucket;
1370 /* scb_priority will be recalculated when a thread is inserted in the clutch bucket */
1371 clutch_bucket->scb_priority = 0;
1372 #if CONFIG_SCHED_EDGE
1373 clutch_bucket->scb_preferred_pset_when_enqueued = PSET_ID_INVALID;
1374 priority_queue_entry_init(&clutch_bucket->scb_stealqlink);
1375 #endif /* CONFIG_SCHED_EDGE */
1376 clutch_bucket->scb_group = clutch_bucket_group;
1377 clutch_bucket->scb_root = NULL;
1378 priority_queue_init(&clutch_bucket->scb_clutchpri_prioq);
1379 priority_queue_init(&clutch_bucket->scb_thread_runq);
1380 queue_init(&clutch_bucket->scb_thread_timeshare_queue);
1381 }
1382
1383 /*
1384 * sched_clutch_bucket_group_init()
1385 *
1386 * Initializer for clutch bucket groups.
1387 */
1388 static void
1389 sched_clutch_bucket_group_init(
1390 sched_clutch_bucket_group_t clutch_bucket_group,
1391 sched_clutch_t clutch,
1392 sched_bucket_t bucket)
1393 {
1394 bzero(clutch_bucket_group, sizeof(struct sched_clutch_bucket_group));
1395 clutch_bucket_group->scbg_bucket = bucket;
1396 clutch_bucket_group->scbg_clutch = clutch;
1397
1398 int max_clusters = ml_get_cluster_count();
1399 clutch_bucket_group->scbg_clutch_buckets = kalloc_type(struct sched_clutch_bucket, max_clusters, Z_WAITOK | Z_ZERO);
1400 for (int i = 0; i < max_clusters; i++) {
1401 sched_clutch_bucket_init(&clutch_bucket_group->scbg_clutch_buckets[i], clutch_bucket_group, bucket);
1402 }
1403
1404 os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, 0, relaxed);
1405 os_atomic_store(&clutch_bucket_group->scbg_pri_shift, INT8_MAX, relaxed);
1406 os_atomic_store(&clutch_bucket_group->scbg_preferred_cluster, sched_boot_pset->pset_cluster_id, relaxed);
1407 /*
1408 * All thread groups should be initialized to be interactive; this allows the newly launched
1409 * thread groups to fairly compete with already running thread groups.
1410 */
1411 clutch_bucket_group->scbg_interactivity_data.scct_count = (sched_clutch_bucket_group_interactive_pri * 2);
1412 clutch_bucket_group->scbg_interactivity_data.scct_timestamp = 0;
1413 os_atomic_store(&clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked, (clutch_cpu_data_t)sched_clutch_bucket_group_adjust_threshold, relaxed);
1414 clutch_bucket_group->scbg_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
1415 clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
1416 }
1417
1418 static void
1419 sched_clutch_bucket_group_destroy(
1420 sched_clutch_bucket_group_t clutch_bucket_group)
1421 {
1422 kfree_type(struct sched_clutch_bucket, ml_get_cluster_count(),
1423 clutch_bucket_group->scbg_clutch_buckets);
1424 }
1425
1426 /*
1427 * sched_clutch_init_with_thread_group()
1428 *
1429 * Initialize the sched_clutch when the thread group is being created
1430 */
1431 void
1432 sched_clutch_init_with_thread_group(
1433 sched_clutch_t clutch,
1434 struct thread_group *tg)
1435 {
1436 os_atomic_store(&clutch->sc_thr_count, 0, relaxed);
1437
1438 /* Initialize all the clutch buckets */
1439 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
1440 sched_clutch_bucket_group_init(&(clutch->sc_clutch_groups[i]), clutch, i);
1441 }
1442
1443 /* Grouping specific fields */
1444 clutch->sc_tg = tg;
1445 }
1446
1447 /*
1448 * sched_clutch_destroy()
1449 *
1450 * Destructor for clutch; called from thread group release code.
1451 */
1452 void
1453 sched_clutch_destroy(
1454 sched_clutch_t clutch)
1455 {
1456 assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0);
1457 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
1458 sched_clutch_bucket_group_destroy(&(clutch->sc_clutch_groups[i]));
1459 }
1460 }
1461
1462 #if CONFIG_SCHED_EDGE
1463
1464 /*
1465 * Edge Scheduler Preferred Cluster Mechanism
1466 *
1467 * In order to have better control over various QoS buckets within a thread group, the Edge
1468 * scheduler allows CLPC to specify a preferred cluster for each QoS level in a TG. These
1469 * preferences are stored at the sched_clutch_bucket_group level since that represents all
1470 * threads at a particular QoS level within a sched_clutch. For any lookup of preferred
1471 * cluster, the logic always goes back to the preference stored at the clutch_bucket_group.
1472 */
1473
1474 static uint32_t
1475 sched_edge_clutch_bucket_group_preferred_cluster(sched_clutch_bucket_group_t clutch_bucket_group)
1476 {
1477 return os_atomic_load(&clutch_bucket_group->scbg_preferred_cluster, relaxed);
1478 }
1479
1480 static uint32_t
1481 sched_clutch_bucket_preferred_cluster(sched_clutch_bucket_t clutch_bucket)
1482 {
1483 return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket->scb_group);
1484 }
1485
1486 uint32_t
1487 sched_edge_thread_preferred_cluster(thread_t thread)
1488 {
1489 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1490 /* For threads bound to a specific cluster, return the bound cluster id */
1491 return sched_edge_thread_bound_cluster_id(thread);
1492 }
1493
1494 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1495 sched_bucket_t sched_bucket = thread->th_sched_bucket;
1496 if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
1497 sched_bucket = sched_clutch_thread_bucket_map(thread, thread->base_pri);
1498 }
1499 sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[sched_bucket];
1500 return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket_group);
1501 }
1502
1503 /*
1504 * Edge Scheduler Steal Silo Support
1505 *
1506 * Steal mechanisms in the Edge scheduler, including foreign rebalance
1507 * and regular work-stealing, are implemented using steal "silos"
1508 * on every pset tracking the clutch buckets with steal-able threads,
1509 * where each steal silo on a pset corresponds to a possible preferred
1510 * pset recommendation. Silos are comprised of per-bucket steal
1511 * queues. This amount of subdivision allows for fine-grained steal
1512 * policies which stay precisely in-sync with the complex Edge matrix.
1513 */
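
/*
 * Consumer-side sketch (hypothetical control flow for illustration only;
 * the actual steal path lives elsewhere in the Edge scheduler):
 *
 *   for each pset id set in root_clutch->scr_populated_steal_silos that the
 *   stealing CPU may service:
 *       silo = sched_edge_steal_silo_from_pset_id(id, root_clutch);
 *       for each bucket set in silo->sess_populated_steal_queues:
 *           pick the max-priority clutch bucket from
 *           silo->sess_steal_queues[bucket] and steal a thread from it;
 */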
1514
1515 /*
1516 * sched_edge_steal_silo_from_pset_id()
1517 *
1518 * Routine to return the steal silo corresponding to a particular
1519 * preferred pset on a root clutch.
1520 */
1521 static sched_edge_steal_silo_t
1522 sched_edge_steal_silo_from_pset_id(pset_id_t preferred_pset_id, sched_clutch_root_t root_clutch)
1523 {
1524 return &root_clutch->scr_steal_silos[preferred_pset_id];
1525 }
1526
1527 /*
1528 * sched_edge_steal_silo_clutch_bucket_unclassify()
1529 *
1530 * Routine to reset a clutch bucket's steal silo tracking on the
1531 * pset where it is enqueued, necessary when dequeueing a clutch
1532 * bucket or changing its priority.
1533 * Always called with the pset lock held.
1534 */
1535 static void
1536 sched_edge_steal_silo_clutch_bucket_unclassify(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
1537 {
1538 assert3u(clutch_bucket->scb_preferred_pset_when_enqueued, !=, PSET_ID_INVALID);
1539 sched_edge_steal_silo_t steal_silo =
1540 sched_edge_steal_silo_from_pset_id(clutch_bucket->scb_preferred_pset_when_enqueued, root_clutch);
1541 struct priority_queue_sched_max *steal_queue = &steal_silo->sess_steal_queues[clutch_bucket->scb_bucket];
1542 priority_queue_remove(steal_queue, &clutch_bucket->scb_stealqlink);
1543 if (priority_queue_empty(steal_queue)) {
1544 /* Last bucket from this steal queue */
1545 atomic_bit_clear(steal_silo->sess_populated_steal_queues, clutch_bucket->scb_bucket, memory_order_relaxed);
1546 }
1547 if (os_atomic_load(steal_silo->sess_populated_steal_queues, relaxed) == 0) {
1548 /* Last populated steal queue from this silo */
1549 atomic_bit_clear(root_clutch->scr_populated_steal_silos,
1550 clutch_bucket->scb_preferred_pset_when_enqueued, memory_order_relaxed);
1551 }
1552 clutch_bucket->scb_preferred_pset_when_enqueued = PSET_ID_INVALID;
1553 }
1554
1555 /*
1556 * sched_edge_steal_silo_clutch_bucket_classify()
1557 *
1558 * Routine to establish a clutch bucket's steal silo tracking on
1559 * the pset where it is (being) enqueued. Can be used to update the
1560 * tracking of a previously enqueued clutch bucket.
1561 * Always called with the pset lock held.
1562 */
1563 static void
1564 sched_edge_steal_silo_clutch_bucket_classify(sched_clutch_bucket_t clutch_bucket,
1565 sched_clutch_root_t root_clutch, uint32_t preferred_pset_id)
1566 {
1567 if (clutch_bucket->scb_preferred_pset_when_enqueued != PSET_ID_INVALID) {
1568 if (clutch_bucket->scb_preferred_pset_when_enqueued == preferred_pset_id) {
1569 /* Already classified correctly */
1570 return;
1571 } else {
1572 /* Remove from previous queue */
1573 sched_edge_steal_silo_clutch_bucket_unclassify(clutch_bucket, root_clutch);
1574 }
1575 }
1576 assert3u(clutch_bucket->scb_preferred_pset_when_enqueued, ==, PSET_ID_INVALID);
1577 /*
1578 * Insert clutch bucket into the steal silo matching its preferred pset
1579 * and into the queue in the silo matching its scheduling bucket.
1580 */
1581 clutch_bucket->scb_preferred_pset_when_enqueued = preferred_pset_id;
1582 sched_edge_steal_silo_t steal_silo =
1583 sched_edge_steal_silo_from_pset_id(clutch_bucket->scb_preferred_pset_when_enqueued, root_clutch);
1584 struct priority_queue_sched_max *steal_queue = &steal_silo->sess_steal_queues[clutch_bucket->scb_bucket];
1585 priority_queue_entry_set_sched_pri(steal_queue, &clutch_bucket->scb_stealqlink, clutch_bucket->scb_priority, 0);
1586 priority_queue_insert(steal_queue, &clutch_bucket->scb_stealqlink);
1587 atomic_bit_set(steal_silo->sess_populated_steal_queues, clutch_bucket->scb_bucket, memory_order_relaxed);
1588 atomic_bit_set(root_clutch->scr_populated_steal_silos, clutch_bucket->scb_preferred_pset_when_enqueued, memory_order_relaxed);
1589 }
1590
1591 /*
1592 * Edge Scheduler Cumulative Load Average
1593 *
1594 * The Edge scheduler maintains a per-QoS/scheduling bucket load average for
1595 * making thread migration decisions. The per-bucket load is maintained as a
1596 * cumulative count since higher scheduling buckets impact load on lower buckets
1597 * for thread migration decisions.
1598 */
1599
1600 static void
1601 sched_edge_cluster_cumulative_count_incr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1602 {
1603 switch (bucket) {
1604 case TH_BUCKET_FIXPRI: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1605 case TH_BUCKET_SHARE_FG: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1606 case TH_BUCKET_SHARE_IN: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1607 case TH_BUCKET_SHARE_DF: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1608 case TH_BUCKET_SHARE_UT: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1609 case TH_BUCKET_SHARE_BG: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1610 default:
1611 panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_incr()");
1612 }
1613 }
1614
1615 static void
1616 sched_edge_cluster_cumulative_count_decr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1617 {
1618 switch (bucket) {
1619 case TH_BUCKET_FIXPRI: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1620 case TH_BUCKET_SHARE_FG: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1621 case TH_BUCKET_SHARE_IN: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1622 case TH_BUCKET_SHARE_DF: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1623 case TH_BUCKET_SHARE_UT: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1624 case TH_BUCKET_SHARE_BG: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1625 default:
1626 panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_decr()");
1627 }
1628 }
1629
1630 uint16_t
1631 sched_edge_cluster_cumulative_count(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1632 {
1633 return os_atomic_load(&root_clutch->scr_cumulative_run_count[bucket], relaxed);
1634 }
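
/*
 * Example of the cumulative semantics above: with one runnable FIXPRI thread
 * and two runnable UT threads on a cluster, scr_cumulative_run_count reads
 * FIXPRI = 1, FG = 1, IN = 1, DF = 1, UT = 3, BG = 3, i.e. the count at a
 * bucket includes every runnable thread at that bucket or any higher bucket.
 */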
1635
1636 #endif /* CONFIG_SCHED_EDGE */
1637
1638 /*
1639 * sched_clutch_bucket_hierarchy_insert()
1640 *
1641 * Routine to insert a newly runnable clutch_bucket into the root hierarchy.
1642 */
1643 static void
1644 sched_clutch_bucket_hierarchy_insert(
1645 sched_clutch_root_t root_clutch,
1646 sched_clutch_bucket_t clutch_bucket,
1647 sched_bucket_t bucket,
1648 uint64_t timestamp,
1649 sched_clutch_bucket_options_t options)
1650 {
1651 sched_clutch_hierarchy_locked_assert(root_clutch);
1652 if (sched_clutch_bucket_is_above_timeshare(bucket) == false) {
1653 /* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */
1654 enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink);
1655 }
1656 #if CONFIG_SCHED_EDGE
1657 /* Classify the clutch bucket into the steal silo matching its preferred cluster */
1658 uint32_t preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
1659 sched_edge_steal_silo_clutch_bucket_classify(clutch_bucket, root_clutch, preferred_cluster);
1660 #endif /* CONFIG_SCHED_EDGE */
1661 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1662
1663 /* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */
1664 if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1665 sched_clutch_root_bucket_runnable(root_bucket, root_clutch, timestamp);
1666 }
1667
1668 /* Insert the clutch bucket into the root bucket run queue with order based on options */
1669 sched_clutch_bucket_runq_enqueue(&root_bucket->scrb_clutch_buckets, clutch_bucket, options);
1670 clutch_bucket->scb_root = root_clutch;
1671 os_atomic_inc(&sched_clutch_global_bucket_load[bucket], relaxed);
1672 }
1673
1674 /*
1675 * sched_clutch_bucket_hierarchy_remove()
1676 *
1677 * Routine to remove an empty clutch bucket from the root hierarchy.
1678 */
1679 static void
1680 sched_clutch_bucket_hierarchy_remove(
1681 sched_clutch_root_t root_clutch,
1682 sched_clutch_bucket_t clutch_bucket,
1683 sched_bucket_t bucket,
1684 uint64_t timestamp,
1685 __unused sched_clutch_bucket_options_t options)
1686 {
1687 sched_clutch_hierarchy_locked_assert(root_clutch);
1688 if (sched_clutch_bucket_is_above_timeshare(bucket) == false) {
1689 /* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */
1690 remqueue(&clutch_bucket->scb_listlink);
1691 }
1692 #if CONFIG_SCHED_EDGE
1693 sched_edge_steal_silo_clutch_bucket_unclassify(clutch_bucket, root_clutch);
1694 #endif /* CONFIG_SCHED_EDGE */
1695
1696 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1697
1698 /* Remove the clutch bucket from the root bucket priority queue */
1699 sched_clutch_bucket_runq_remove(&root_bucket->scrb_clutch_buckets, clutch_bucket);
1700 clutch_bucket->scb_root = NULL;
1701
1702 /* If the root bucket priority queue is now empty, remove it from the root priority queue */
1703 if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1704 sched_clutch_root_bucket_empty(root_bucket, root_clutch, timestamp);
1705 }
1706 os_atomic_dec(&sched_clutch_global_bucket_load[bucket], relaxed);
1707 }
1708
1709 /*
1710 * sched_clutch_bucket_base_pri()
1711 *
1712 * Calculates the "base" priority of the clutch bucket, which is equal to the max of the
1713 * highest base_pri and the highest sched_pri in the clutch bucket.
1714 */
1715 static uint8_t
1716 sched_clutch_bucket_base_pri(
1717 sched_clutch_bucket_t clutch_bucket)
1718 {
1719 assert(priority_queue_empty(&clutch_bucket->scb_thread_runq) == false);
1720 /*
1721 * Since the clutch bucket can contain threads that are members of the group due
1722 * to the sched_pri being promoted or due to their base pri, the base priority of
1723 * the entire clutch bucket should be based on the highest thread (promoted or base)
1724 * in the clutch bucket.
1725 */
1726 uint8_t max_pri = 0;
1727 if (!priority_queue_empty(&clutch_bucket->scb_clutchpri_prioq)) {
1728 max_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
1729 }
1730 return max_pri;
1731 }
1732
1733 /*
1734 * sched_clutch_interactivity_from_cpu_data()
1735 *
1736 * Routine to calculate the interactivity score of a clutch bucket group from its CPU usage
1737 */
1738 static uint8_t
1739 sched_clutch_interactivity_from_cpu_data(sched_clutch_bucket_group_t clutch_bucket_group)
1740 {
1741 sched_clutch_bucket_cpu_data_t scb_cpu_data;
1742 scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, relaxed);
1743 clutch_cpu_data_t cpu_used = scb_cpu_data.cpu_data.scbcd_cpu_used;
1744 clutch_cpu_data_t cpu_blocked = scb_cpu_data.cpu_data.scbcd_cpu_blocked;
1745 uint8_t interactive_score = 0;
1746
1747 if ((cpu_blocked == 0) && (cpu_used == 0)) {
1748 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
1749 }
1750 /*
1751 * For all timeshare buckets, calculate the interactivity score of the bucket
1752 * and add it to the base priority
1753 */
1754 if (cpu_blocked > cpu_used) {
1755 /* Interactive clutch_bucket case */
1756 interactive_score = sched_clutch_bucket_group_interactive_pri +
1757 ((sched_clutch_bucket_group_interactive_pri * (cpu_blocked - cpu_used)) / cpu_blocked);
1758 } else {
1759 /* Non-interactive clutch_bucket case */
1760 interactive_score = ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) / cpu_used);
1761 }
1762 return interactive_score;
1763 }
1764
1765 /*
1766 * sched_clutch_bucket_pri_calculate()
1767 *
1768 * The priority calculation algorithm for the clutch_bucket is a slight
1769 * modification on the ULE interactivity score. It uses the base priority
1770 * of the clutch bucket and applies an interactivity score boost to the
1771 * highly responsive clutch buckets.
1772 */
1773 static uint8_t
1774 sched_clutch_bucket_pri_calculate(
1775 sched_clutch_bucket_t clutch_bucket,
1776 uint64_t timestamp)
1777 {
1778 /* For empty clutch buckets, return priority 0 */
1779 if (clutch_bucket->scb_thr_count == 0) {
1780 return 0;
1781 }
1782
1783 uint8_t base_pri = sched_clutch_bucket_base_pri(clutch_bucket);
1784 uint8_t interactive_score = sched_clutch_bucket_group_interactivity_score_calculate(clutch_bucket->scb_group, timestamp);
1785
1786 assert(((uint64_t)base_pri + interactive_score) <= UINT8_MAX);
1787 uint8_t pri = base_pri + interactive_score;
1788 if (pri != clutch_bucket->scb_priority) {
1789 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_PRI) | DBG_FUNC_NONE,
1790 thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, pri, interactive_score, 0);
1791 }
1792 return pri;
1793 }
1794
1795 /*
1796 * sched_clutch_root_bucket_highest_clutch_bucket()
1797 *
1798 * Routine to find the highest priority clutch bucket
1799 * within the root bucket.
1800 */
1801 static sched_clutch_bucket_t
1802 sched_clutch_root_bucket_highest_clutch_bucket(
1803 sched_clutch_root_t root_clutch,
1804 sched_clutch_root_bucket_t root_bucket,
1805 processor_t _Nullable processor,
1806 thread_t _Nullable prev_thread,
1807 bool *_Nullable chose_prev_thread)
1808 {
1809 if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1810 if (prev_thread != NULL) {
1811 *chose_prev_thread = true;
1812 return sched_clutch_bucket_for_thread(root_clutch, prev_thread);
1813 }
1814 return NULL;
1815 }
1816 sched_clutch_bucket_t clutch_bucket = sched_clutch_bucket_runq_peek(&root_bucket->scrb_clutch_buckets);
1817 /* Consider the Clutch bucket of the previous thread */
1818 if (prev_thread != NULL) {
1819 assert(chose_prev_thread != NULL);
1820 sched_clutch_bucket_group_t prev_clutch_bucket_group = sched_clutch_bucket_group_for_thread(prev_thread);
1821 int prev_clutch_bucket_pri = prev_thread->sched_pri + (int)(os_atomic_load(&prev_clutch_bucket_group->scbg_interactivity_data.scct_count, relaxed));
1822 sched_clutch_bucket_t prev_clutch_bucket = sched_clutch_bucket_for_thread(root_clutch, prev_thread);
1823 if (prev_clutch_bucket != clutch_bucket &&
1824 sched_clutch_pri_greater_than_tiebreak(prev_clutch_bucket_pri, clutch_bucket->scb_priority, processor->first_timeslice)) {
1825 *chose_prev_thread = true;
1826 return prev_clutch_bucket;
1827 }
1828 }
1829 return clutch_bucket;
1830 }
1831
1832 /*
1833 * sched_clutch_bucket_runnable()
1834 *
1835 * Perform all operations needed when a new clutch bucket becomes runnable.
1836 * It involves inserting the clutch_bucket into the hierarchy and updating the
1837 * root priority appropriately.
1838 */
1839 static boolean_t
1840 sched_clutch_bucket_runnable(
1841 sched_clutch_bucket_t clutch_bucket,
1842 sched_clutch_root_t root_clutch,
1843 uint64_t timestamp,
1844 sched_clutch_bucket_options_t options)
1845 {
1846 sched_clutch_hierarchy_locked_assert(root_clutch);
1847 /* Since the clutch bucket became newly runnable, update its pending timestamp */
1848 clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1849 sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1850
1851 /* Update the timesharing properties of this clutch_bucket_group; also done every sched_tick */
1852 sched_clutch_bucket_group_pri_shift_update(clutch_bucket->scb_group);
1853
1854 int16_t root_old_pri = root_clutch->scr_priority;
1855 sched_clutch_root_pri_update(root_clutch);
1856 return root_clutch->scr_priority > root_old_pri;
1857 }
1858
1859 /*
1860 * sched_clutch_bucket_update()
1861 *
1862 * Update the clutch_bucket's position in the hierarchy. This routine is
1863 * called when a new thread is inserted or removed from a runnable clutch
1864 * bucket. The options specify some properties about the clutch bucket
1865 * insertion order into the clutch bucket runq.
1866 */
1867 static boolean_t
1868 sched_clutch_bucket_update(
1869 sched_clutch_bucket_t clutch_bucket,
1870 sched_clutch_root_t root_clutch,
1871 uint64_t timestamp,
1872 sched_clutch_bucket_options_t options)
1873 {
1874 sched_clutch_hierarchy_locked_assert(root_clutch);
1875 uint64_t new_pri = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1876 sched_clutch_bucket_runq_t bucket_runq = &root_clutch->scr_unbound_buckets[clutch_bucket->scb_bucket].scrb_clutch_buckets;
1877 if (new_pri == clutch_bucket->scb_priority) {
1878 /*
1879 * If SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR is specified, move the clutch bucket
1880 * to the end of the runq. Typically used when a thread is selected for execution
1881 * from a clutch bucket.
1882 */
1883 if (options & SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR) {
1884 sched_clutch_bucket_runq_rotate(bucket_runq, clutch_bucket);
1885 }
1886 return false;
1887 }
1888 sched_clutch_bucket_runq_remove(bucket_runq, clutch_bucket);
1889 #if CONFIG_SCHED_EDGE
1890 /* Need to update the clutch bucket's priority ranking in its steal silo queue */
1891 pset_id_t pset_preference = clutch_bucket->scb_preferred_pset_when_enqueued;
1892 sched_edge_steal_silo_clutch_bucket_unclassify(clutch_bucket, root_clutch);
1893 #endif /* CONFIG_SCHED_EDGE */
1894 clutch_bucket->scb_priority = new_pri;
1895 #if CONFIG_SCHED_EDGE
1896 sched_edge_steal_silo_clutch_bucket_classify(clutch_bucket, root_clutch, pset_preference);
1897 #endif /* CONFIG_SCHED_EDGE */
1898 sched_clutch_bucket_runq_enqueue(bucket_runq, clutch_bucket, options);
1899
1900 int16_t root_old_pri = root_clutch->scr_priority;
1901 sched_clutch_root_pri_update(root_clutch);
1902 return root_clutch->scr_priority > root_old_pri;
1903 }
1904
1905 /*
1906 * sched_clutch_bucket_empty()
1907 *
1908 * Perform all the operations needed when a clutch_bucket is no longer runnable.
1909 * It involves removing the clutch bucket from the hierarchy and updating the root
1910 * priority appropriately.
1911 */
1912 static void
1913 sched_clutch_bucket_empty(
1914 sched_clutch_bucket_t clutch_bucket,
1915 sched_clutch_root_t root_clutch,
1916 uint64_t timestamp,
1917 sched_clutch_bucket_options_t options)
1918 {
1919 sched_clutch_hierarchy_locked_assert(root_clutch);
1920 assert3u(clutch_bucket->scb_thr_count, ==, 0);
1921 sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1922
1923 /* Update the timesharing properties of this clutch_bucket_group; also done every sched_tick */
1924 sched_clutch_bucket_group_pri_shift_update(clutch_bucket->scb_group);
1925
1926 clutch_bucket->scb_priority = 0;
1927 sched_clutch_root_pri_update(root_clutch);
1928 }
1929
1930 /*
1931 * sched_clutch_cpu_usage_update()
1932 *
1933 * Routine to update CPU usage of the thread in the hierarchy.
1934 */
1935 void
1936 sched_clutch_cpu_usage_update(
1937 thread_t thread,
1938 uint64_t delta)
1939 {
1940 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread) || SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1941 return;
1942 }
1943
1944 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1945 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
1946 sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, delta);
1947 }
1948
1949 /*
1950 * sched_clutch_bucket_group_cpu_usage_update()
1951 *
1952 * Routine to update the CPU usage of the clutch_bucket.
1953 */
1954 static void
1955 sched_clutch_bucket_group_cpu_usage_update(
1956 sched_clutch_bucket_group_t clutch_bucket_group,
1957 uint64_t delta)
1958 {
1959 if (sched_clutch_bucket_is_above_timeshare(clutch_bucket_group->scbg_bucket)) {
1960 /* The Above UI bucket always has the maximum interactivity score, so nothing to do here */
1961 return;
1962 }
1963 delta = MIN(delta, sched_clutch_bucket_group_adjust_threshold);
1964 os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_used), (clutch_cpu_data_t)delta, relaxed);
1965 }
1966
1967 /*
1968 * sched_clutch_bucket_group_cpu_pending_adjust()
1969 *
1970 * Routine to calculate the adjusted CPU usage value based on the pending intervals. The calculation is done
1971 * such that one "pending interval" provides one point improvement in interactivity score.
1972 */
1973 static inline uint64_t
1974 sched_clutch_bucket_group_cpu_pending_adjust(
1975 uint64_t cpu_used,
1976 uint64_t cpu_blocked,
1977 uint8_t pending_intervals)
1978 {
1979 uint64_t cpu_used_adjusted = 0;
1980 if (cpu_blocked < cpu_used) {
1981 cpu_used_adjusted = (sched_clutch_bucket_group_interactive_pri * cpu_blocked * cpu_used);
1982 cpu_used_adjusted = cpu_used_adjusted / ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) + (cpu_used * pending_intervals));
1983 } else {
1984 uint64_t adjust_factor = (cpu_blocked * pending_intervals) / sched_clutch_bucket_group_interactive_pri;
1985 cpu_used_adjusted = (adjust_factor > cpu_used) ? 0 : (cpu_used - adjust_factor);
1986 }
1987 return cpu_used_adjusted;
1988 }
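
/*
 * Worked example of the adjustment above (illustrative values, default
 * interactive pri of 8):
 *
 *   cpu_used = 400, cpu_blocked = 100, pending_intervals = 1
 *       cpu_used_adjusted = (8 * 100 * 400) / ((8 * 100) + (400 * 1)) = 266
 *
 * which lifts the non-interactive score from (8 * 100) / 400 = 2 to
 * (8 * 100) / 266 = 3, i.e. one pending interval buys roughly one point of
 * interactivity score.
 */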
1989
1990 /*
1991 * sched_clutch_bucket_group_cpu_adjust()
1992 *
1993 * Routine to scale the cpu usage and blocked time once the sum gets bigger
1994 * than sched_clutch_bucket_group_adjust_threshold. Allows the values to remain
1995 * manageable and maintain the same ratio while allowing clutch buckets to
1996 * adjust behavior and reflect in the interactivity score in a reasonable
1997 * amount of time. Also adjusts the CPU usage based on pending_intervals
1998 * which allows ageout of CPU to avoid starvation in highly contended scenarios.
1999 */
2000 static void
2001 sched_clutch_bucket_group_cpu_adjust(
2002 sched_clutch_bucket_group_t clutch_bucket_group,
2003 uint8_t pending_intervals)
2004 {
2005 sched_clutch_bucket_cpu_data_t old_cpu_data = {};
2006 sched_clutch_bucket_cpu_data_t new_cpu_data = {};
2007 os_atomic_rmw_loop(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, old_cpu_data.scbcd_cpu_data_packed, new_cpu_data.scbcd_cpu_data_packed, relaxed, {
2008 clutch_cpu_data_t cpu_used = old_cpu_data.cpu_data.scbcd_cpu_used;
2009 clutch_cpu_data_t cpu_blocked = old_cpu_data.cpu_data.scbcd_cpu_blocked;
2010
2011 if ((pending_intervals == 0) && (cpu_used + cpu_blocked) < sched_clutch_bucket_group_adjust_threshold) {
2012 /* No changes to the CPU used and blocked values */
2013 os_atomic_rmw_loop_give_up();
2014 }
2015 if ((cpu_used + cpu_blocked) >= sched_clutch_bucket_group_adjust_threshold) {
2016 /* Only keep the recent CPU history to better indicate how this TG has been behaving */
2017 cpu_used = cpu_used / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
2018 cpu_blocked = cpu_blocked / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
2019 }
2020 /* Use the shift passed in to ageout the CPU usage */
2021 cpu_used = (clutch_cpu_data_t)sched_clutch_bucket_group_cpu_pending_adjust(cpu_used, cpu_blocked, pending_intervals);
2022 new_cpu_data.cpu_data.scbcd_cpu_used = cpu_used;
2023 new_cpu_data.cpu_data.scbcd_cpu_blocked = cpu_blocked;
2024 });
2025 }
2026
2027 /*
2028 * Thread level scheduling algorithm
2029 *
2030 * The thread level scheduling algorithm uses the mach timeshare
2031 * decay based algorithm to achieve sharing between threads within the
2032 * same clutch bucket. The load/priority shifts etc. are all maintained
2033 * at the clutch bucket level and used for decay calculation of the
2034 * threads. The load sampling is still driven off the scheduler tick
2035 * for runnable clutch buckets (it does not use the new higher frequency
2036 * EWMA based load calculation). The idea is that the contention and load
2037 * within clutch_buckets should be limited enough to not see heavy decay
2038 * and timeshare effectively.
2039 */
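
/*
 * Simplified sketch of how the clutch_bucket_group pri_shift feeds the mach
 * timeshare decay (the canonical computation lives in the timeshare priority
 * code; the expression below is an approximation for illustration):
 *
 *   sched_pri ~= base_pri - (sched_usage >> pri_shift)
 *
 * A heavily loaded clutch bucket group gets a small pri_shift and decays its
 * threads quickly; a lightly loaded one keeps pri_shift at INT8_MAX and
 * effectively does not decay at all.
 */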
2040
2041 /*
2042 * sched_clutch_thread_run_bucket_incr() / sched_clutch_run_bucket_incr()
2043 *
2044 * Increment the run count for the clutch bucket associated with the
2045 * thread.
2046 */
2047 uint32_t
2048 sched_clutch_thread_run_bucket_incr(
2049 thread_t thread,
2050 sched_bucket_t bucket)
2051 {
2052 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2053 return 0;
2054 }
2055 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2056 return sched_clutch_run_bucket_incr(clutch, bucket);
2057 }
2058
2059 static uint32_t
2060 sched_clutch_run_bucket_incr(
2061 sched_clutch_t clutch,
2062 sched_bucket_t bucket)
2063 {
2064 assert(bucket != TH_BUCKET_RUN);
2065 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
2066 return sched_clutch_bucket_group_run_count_inc(clutch_bucket_group);
2067 }
2068
2069 /*
2070 * sched_clutch_thread_run_bucket_decr() / sched_clutch_run_bucket_decr()
2071 *
2072 * Decrement the run count for the clutch bucket associated with the
2073 * thread.
2074 */
2075 uint32_t
2076 sched_clutch_thread_run_bucket_decr(
2077 thread_t thread,
2078 sched_bucket_t bucket)
2079 {
2080 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2081 return 0;
2082 }
2083 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2084 return sched_clutch_run_bucket_decr(clutch, bucket);
2085 }
2086
2087 static uint32_t
2088 sched_clutch_run_bucket_decr(
2089 sched_clutch_t clutch,
2090 sched_bucket_t bucket)
2091 {
2092 assert(bucket != TH_BUCKET_RUN);
2093 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
2094 return sched_clutch_bucket_group_run_count_dec(clutch_bucket_group);
2095 }
2096
2097 /*
2098 * sched_clutch_bucket_group_pri_shift_update()
2099 *
2100 * Routine to update the priority shift for a clutch bucket group,
2101 * necessary for timesharing correctly with priority decay within a
2102 * thread group + QoS.
2103 */
2104 static void
2105 sched_clutch_bucket_group_pri_shift_update(
2106 sched_clutch_bucket_group_t clutch_bucket_group)
2107 {
2108 if (sched_clutch_bucket_is_above_timeshare(clutch_bucket_group->scbg_bucket)) {
2109 /* No timesharing needed for fixed priority Above UI threads */
2110 return;
2111 }
2112
2113 /*
2114 * Update the timeshare parameters for the clutch bucket group
2115 * if they haven't been updated in this tick.
2116 */
2117 uint32_t sched_ts = os_atomic_load(&clutch_bucket_group->scbg_timeshare_tick, relaxed);
2118 uint32_t current_sched_ts = os_atomic_load(&sched_tick, relaxed);
2119 if (sched_ts < current_sched_ts) {
2120 os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, current_sched_ts, relaxed);
2121 /* NCPU wide workloads should not experience decay */
2122 uint64_t bucket_group_run_count = os_atomic_load_wide(&clutch_bucket_group->scbg_blocked_data.scct_count, relaxed) - 1;
2123 uint32_t bucket_group_load = (uint32_t)(bucket_group_run_count / processor_avail_count);
2124 bucket_group_load = MIN(bucket_group_load, NRQS - 1);
2125 uint32_t pri_shift = sched_fixed_shift - sched_load_shifts[bucket_group_load];
2126 /* Ensure that the pri_shift value is reasonable */
2127 pri_shift = (pri_shift > SCHED_PRI_SHIFT_MAX) ? INT8_MAX : pri_shift;
2128 os_atomic_store(&clutch_bucket_group->scbg_pri_shift, pri_shift, relaxed);
2129 }
2130 }
2131
2132 /*
2133 * sched_clutch_bucket_group_timeshare_update()
2134 *
2135 * Routine to update the priority shift and priority for the clutch_bucket_group
2136 * every sched_tick. For multi-cluster platforms, each QoS level will have multiple
2137 * clutch buckets with runnable threads in them. So it is important to maintain
2138 * the timesharing information at the clutch_bucket_group level instead of
2139 * individual clutch buckets (because the algorithm is trying to timeshare all
2140 * threads at the same QoS irrespective of which hierarchy they are enqueued in).
2141 *
2142 * The routine is called from the sched tick handling code to make sure this value
2143 * is updated at least once every sched tick. For clutch bucket groups which have
2144 * not been runnable for very long, the clutch_bucket_group maintains a "last
2145 * updated schedtick" parameter. As threads become runnable in the clutch bucket group,
2146 * if this value is outdated, we update the priority shift.
2147 *
2148 * Possible optimization:
2149 * - The current algorithm samples the load at most once every sched tick (125ms).
2150 * This is prone to spikes in runnable counts; if that turns out to be
2151 * a problem, a simple solution would be to do the EWMA trick to sample
2152 * load at every load_tick (30ms) and use the averaged value for the pri
2153 * shift calculation.
2154 */
2155 static void
2156 sched_clutch_bucket_group_timeshare_update(
2157 sched_clutch_bucket_group_t clutch_bucket_group,
2158 sched_clutch_bucket_t clutch_bucket,
2159 uint64_t ctime)
2160 {
2161 if (sched_clutch_bucket_is_above_timeshare(clutch_bucket_group->scbg_bucket)) {
2162 /* No timesharing needed for fixed priority Above UI threads */
2163 return;
2164 }
2165 sched_clutch_bucket_group_pri_shift_update(clutch_bucket_group);
2166 /*
2167 * Update the clutch bucket priority; this allows clutch buckets that have been pending
2168 * for a long time to get an updated interactivity score.
2169 */
2170 sched_clutch_bucket_update(clutch_bucket, clutch_bucket->scb_root, ctime, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
2171 }
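
/*
 * Hypothetical sketch of the EWMA optimization mentioned in the comment
 * above (not part of the current implementation; the 3:1 weighting is an
 * assumption):
 *
 *   load_avg = ((load_avg * 3) + load_sample) / 4;
 *
 * sampled every load tick (30ms) and fed into the pri_shift calculation in
 * place of the raw runnable count sampled once per sched tick.
 */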
2172
2173 /*
2174 * Calculate the CPU used by this thread and attribute it to the
2175 * thread's current scheduling bucket and clutch bucket group, or
2176 * a previous clutch bucket group if specified.
2177 * Also update the general scheduler CPU usage, matching
2178 * what we do for lightweight_update_priority().
2179 */
2180 static inline void
2181 sched_clutch_thread_tick_delta(thread_t thread, sched_clutch_bucket_group_t _Nullable clutch_bucket_group)
2182 {
2183 uint32_t cpu_delta;
2184 sched_tick_delta(thread, cpu_delta);
2185 if (thread->pri_shift < INT8_MAX) {
2186 thread->sched_usage += cpu_delta;
2187 }
2188 thread->cpu_delta += cpu_delta;
2189 if (clutch_bucket_group != NULL) {
2190 sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, cpu_delta);
2191 } else {
2192 sched_clutch_cpu_usage_update(thread, cpu_delta);
2193 }
2194 }
2195
2196 /*
2197 * sched_clutch_thread_clutch_update()
2198 *
2199 * Routine called when the thread changes its thread group. The current
2200 * implementation relies on the fact that the thread group is changed only from
2201 * the context of the thread itself or when the thread is runnable but not in a
2202 * runqueue. Due to this fact, the thread group change causes only counter
2203 * updates in the old & new clutch buckets and no hierarchy changes. The routine
2204 * also attributes the CPU used so far to the old clutch.
2205 */
2206 void
2207 sched_clutch_thread_clutch_update(
2208 thread_t thread,
2209 sched_clutch_t old_clutch,
2210 sched_clutch_t new_clutch)
2211 {
2212 if (old_clutch) {
2213 assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
2214
2215 sched_clutch_run_bucket_decr(old_clutch, thread->th_sched_bucket);
2216
2217 /* Attribute CPU usage with the old clutch */
2218 sched_clutch_bucket_group_t old_clutch_bucket_group = NULL;
2219 if (!SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
2220 old_clutch_bucket_group = &(old_clutch->sc_clutch_groups[thread->th_sched_bucket]);
2221 }
2222 sched_clutch_thread_tick_delta(thread, old_clutch_bucket_group);
2223 }
2224
2225 if (new_clutch) {
2226 sched_clutch_run_bucket_incr(new_clutch, thread->th_sched_bucket);
2227 }
2228 }
2229
2230 /* Thread Insertion/Removal/Selection routines */
2231
2232 #if CONFIG_SCHED_EDGE
2233
2234 /*
2235 * Edge Scheduler Bound Thread Support
2236 *
2237 * The edge scheduler allows threads to be bound to specific clusters. The scheduler
2238 * maintains a separate runq on the clutch root to hold these bound threads. These
2239 * bound threads count towards the root priority and thread count, but are ignored
2240 * for thread migration/steal decisions. Bound threads that are enqueued in the
2241 * separate runq have the th_bound_cluster_enqueued flag set to allow easy
2242 * removal.
2243 *
2244 * Bound Threads Timesharing
2245 * The bound threads share the timesharing properties of the clutch bucket group they are
2246 * part of. They contribute to the load and use priority shifts/decay values from the
2247 * clutch bucket group.
2248 */
2249
2250 static boolean_t
2251 sched_edge_bound_thread_insert(
2252 sched_clutch_root_t root_clutch,
2253 thread_t thread,
2254 integer_t options)
2255 {
2256 /* Update the clutch runnable count and priority */
2257 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2258 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
2259 if (root_bucket->scrb_bound_thread_runq.count == 0) {
2260 sched_clutch_root_bucket_runnable(root_bucket, root_clutch, mach_absolute_time());
2261 }
2262
2263 assert((thread->th_bound_cluster_enqueued) == false);
2264 run_queue_enqueue(&root_bucket->scrb_bound_thread_runq, thread, options);
2265 thread->th_bound_cluster_enqueued = true;
2266
2267 /*
2268 * Trigger an update to the thread's clutch bucket group's priority shift parameters,
2269 * needed for global timeshare within a clutch bucket group.
2270 */
2271 sched_clutch_bucket_group_pri_shift_update(sched_clutch_bucket_group_for_thread(thread));
2272
2273 /* Increment the urgency counter for the root if necessary */
2274 sched_clutch_root_urgency_inc(root_clutch, thread);
2275
2276 int16_t root_old_pri = root_clutch->scr_priority;
2277 sched_clutch_root_pri_update(root_clutch);
2278 return root_clutch->scr_priority > root_old_pri;
2279 }
2280
2281 static void
2282 sched_edge_bound_thread_remove(
2283 sched_clutch_root_t root_clutch,
2284 thread_t thread)
2285 {
2286 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
2287 assert((thread->th_bound_cluster_enqueued) == true);
2288 run_queue_remove(&root_bucket->scrb_bound_thread_runq, thread);
2289 thread->th_bound_cluster_enqueued = false;
2290
2291 /* Decrement the urgency counter for the root if necessary */
2292 sched_clutch_root_urgency_dec(root_clutch, thread);
2293
2294 /* Update the clutch runnable count and priority */
2295 sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
2296 if (root_bucket->scrb_bound_thread_runq.count == 0) {
2297 sched_clutch_root_bucket_empty(root_bucket, root_clutch, mach_absolute_time());
2298 }
2299 sched_clutch_root_pri_update(root_clutch);
2300
2301 /*
2302 * Trigger an update to the thread's clutch bucket group's priority shift parameters,
2303 * needed for global timeshare within a clutch bucket group.
2304 */
2305 sched_clutch_bucket_group_pri_shift_update(sched_clutch_bucket_group_for_thread(thread));
2306 }
2307
2308 /*
2309 * Edge Scheduler cluster shared resource threads load balancing
2310 *
2311 * The Edge scheduler attempts to load balance cluster shared resource intensive threads
2312 * across clusters in order to reduce contention on the shared resources. It achieves
2313 * that by maintaining the runnable and running shared resource load on each cluster
2314 * and balancing the load across multiple clusters.
2315 *
2316 * The current implementation for cluster shared resource load balancing looks at
2317 * the per-cluster load at thread runnable time to enqueue the thread in the appropriate
2318 * cluster. The thread is enqueued in the cluster bound runqueue to ensure idle CPUs
2319 * do not steal/rebalance shared resource threads. Some more details for the implementation:
2320 *
2321 * - When threads are tagged as shared resource, they go through the cluster selection logic
2322 * which looks at cluster shared resource loads and picks a cluster accordingly. The thread is
2323 * enqueued in the cluster bound runqueue.
2324 *
2325 * - When the threads start running and call avoid_processor, the load balancing logic will be
2326 * invoked and cause the thread to be sent to a more preferred cluster if one exists and has
2327 * no shared resource load.
2328 *
2329 * - If a CPU in a preferred cluster is going idle and that cluster has no more shared load,
2330 * it will look at running shared resource threads on foreign clusters and actively rebalance them.
2331 *
2332 * - Runnable shared resource threads are not stolen by the preferred cluster CPUs as they
2333 * go idle intentionally.
2334 *
2335 * - One caveat of this design is that if a preferred CPU has already run and finished its shared
2336 * resource thread execution, it will not go out and steal the runnable thread in the non-preferred cluster.
2337 * The rebalancing will happen when the thread actually runs on a non-preferred cluster and one of the
2338 * events listed above happen.
2339 *
2340 * - Also, the current implementation does not consider other properties such as thread priority and
2341 *   per-QoS thread load in the thread placement decision.
2342 *
2343 * Edge Scheduler cluster shared resource thread scheduling policy
2344 *
2345 * The threads for shared resources can be scheduled using one of the two policies:
2346 *
2347 * EDGE_SHARED_RSRC_SCHED_POLICY_RR
2348 * This policy distributes the threads so that they spread across all available clusters
2349 * irrespective of type. The idea is that this scheduling policy will put a shared resource
2350 * thread on each cluster on the platform before it starts doubling up on clusters.
2351 *
2352 * EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST
2353 * This policy distributes threads so that the threads first fill up all the capacity on
2354 * the preferred cluster and its homogeneous peers before spilling to different core type.
2355 * The current implementation defines capacity based on the number of CPUs in the cluster;
2356 * so a cluster's shared resource is considered full if there are "n" runnable + running
2357 * shared resource threads on the cluster with n cpus. This policy is different from the
2358 * default scheduling policy of the edge scheduler since this always tries to fill up the
2359 * native clusters to capacity even when non-native clusters might be idle.
2360 */
2361 __options_decl(edge_shared_rsrc_sched_policy_t, uint32_t, {
2362 EDGE_SHARED_RSRC_SCHED_POLICY_RR = 0,
2363 EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST = 1,
2364 });
2365
2366 static const edge_shared_rsrc_sched_policy_t edge_shared_rsrc_policy[CLUSTER_SHARED_RSRC_TYPE_COUNT] = {
2367 [CLUSTER_SHARED_RSRC_TYPE_RR] = EDGE_SHARED_RSRC_SCHED_POLICY_RR,
2368 [CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST,
2369 };
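
/*
 * Example of the NATIVE_FIRST capacity notion described above: on a 4-CPU
 * preferred cluster, a newly runnable shared resource thread stays on that
 * cluster while its runnable + running shared resource load is below 4, and
 * spills to a different core type only once that count reaches 4. The RR
 * policy would instead spread such threads one-per-cluster from the start.
 */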
2370
2371 static void
2372 sched_edge_shared_rsrc_runnable_load_incr(sched_clutch_root_t root_clutch, thread_t thread)
2373 {
2374 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
2375 root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_RR]++;
2376 thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_RR] = true;
2377 }
2378 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
2379 root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST]++;
2380 thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = true;
2381 }
2382 }
2383
2384 static void
2385 sched_edge_shared_rsrc_runnable_load_decr(sched_clutch_root_t root_clutch, thread_t thread)
2386 {
2387 for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
2388 if (thread->th_shared_rsrc_enqueued[shared_rsrc_type]) {
2389 thread->th_shared_rsrc_enqueued[shared_rsrc_type] = false;
2390 root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type]--;
2391 }
2392 }
2393 }
2394
2395 uint16_t
2396 sched_edge_shared_rsrc_runnable_load(sched_clutch_root_t root_clutch, cluster_shared_rsrc_type_t shared_rsrc_type)
2397 {
2398 return root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type];
2399 }
2400
2401 static uint64_t
2402 sched_edge_pset_cluster_shared_rsrc_load(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
2403 {
2404 /* Prevent migrations to derecommended clusters */
2405 if (!pset_is_recommended(pset)) {
2406 return UINT64_MAX;
2407 }
2408 return os_atomic_load(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], relaxed);
2409 }
2410
2411 /*
2412 * sched_edge_shared_rsrc_idle()
2413 *
2414 * Routine used to determine if the constrained resource for the pset is idle. This is
2415 * used by a CPU going idle to decide if it should rebalance a running shared resource
2416 * thread from a non-preferred cluster.
2417 */
2418 static boolean_t
2419 sched_edge_shared_rsrc_idle(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
2420 {
2421 return sched_edge_pset_cluster_shared_rsrc_load(pset, shared_rsrc_type) == 0;
2422 }
2423
2424 /*
2425 * sched_edge_thread_shared_rsrc_type
2426 *
2427 * This routine decides if a given thread needs special handling for being a
2428 * heavy shared resource user. It is valid for the same thread to be using
2429 * several shared resources at the same time and have multiple policy flags set.
2430 * This routine determines which of those properties will be used for load
2431 * balancing and migration decisions.
2432 */
2433 static cluster_shared_rsrc_type_t
2434 sched_edge_thread_shared_rsrc_type(thread_t thread)
2435 {
2436 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
2437 return CLUSTER_SHARED_RSRC_TYPE_RR;
2438 }
2439 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
2440 return CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST;
2441 }
2442 return CLUSTER_SHARED_RSRC_TYPE_NONE;
2443 }
2444
2445 #endif /* CONFIG_SCHED_EDGE */
2446
2447 /*
2448 * sched_clutch_thread_bound_lookup()
2449 *
2450 * Routine to look up the highest priority runnable thread in a bound root bucket.
2451 */
2452 static thread_t
2453 sched_clutch_thread_bound_lookup(
2454 __unused sched_clutch_root_t root_clutch,
2455 sched_clutch_root_bucket_t root_bucket,
2456 processor_t processor,
2457 thread_t _Nullable prev_thread)
2458 {
2459 assert(root_bucket->scrb_bound == true);
2460 thread_t bound_thread = run_queue_peek(&root_bucket->scrb_bound_thread_runq);
2461 if ((prev_thread != NULL) &&
2462 (bound_thread == NULL || sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, bound_thread->sched_pri, processor->first_timeslice))) {
2463 return prev_thread;
2464 }
2465 assert(bound_thread != THREAD_NULL);
2466 return bound_thread;
2467 }
2468
2469 /*
2470 * Clutch Bucket Group Thread Counts and Pending time calculation
2471 *
2472 * The pending time on the clutch_bucket_group allows the scheduler to track if it
2473 * needs to age out the CPU usage because the clutch_bucket_group has been pending for
2474 * a very long time. The pending time is set to the timestamp as soon as a thread becomes
2475 * runnable. When a thread is picked up for execution from this clutch_bucket_group, the
2476 * pending time is advanced to the time of thread selection.
2477 *
2478 * Since threads for a clutch bucket group can be added or removed from multiple CPUs
2479 * simultaneously, it is important that the updates to thread counts and pending timestamps
2480 * happen atomically. The implementation relies on the following aspects to make that work
2481 * as expected:
2482 * - The clutch scheduler is deployed only on single-cluster platforms, where the pset lock
2483 * is held while threads are added/removed and pending timestamps are updated
2484 * - The thread count and pending timestamp can be updated atomically using double wide
2485 * 128 bit atomics
2486 *
2487 * Clutch bucket group interactivity timestamp and score updates also rely on the properties
2488 * above to atomically update the interactivity score for a clutch bucket group.
2489 */
2490
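/*
 * Illustrative sketch (not compiled): the count + timestamp pairing described
 * above, demonstrated with standard C11 atomics on a 64-bit packed value so the
 * example stays self-contained. The kernel instead packs the pair into the
 * 128-bit sched_clutch_counter_time_t and updates it with os_atomic_rmw_loop();
 * the type and helper below are hypothetical and exist only for this sketch.
 */
#if 0
#include <stdatomic.h>
#include <stdint.h>

typedef union {
	struct {
		uint32_t count;     /* number of runnable threads in the group */
		uint32_t timestamp; /* time the group first became pending */
	} f;
	uint64_t packed;
} pending_pair_t;

static void
pending_pair_inc(_Atomic uint64_t *pair, uint32_t now)
{
	pending_pair_t oldv, newv;
	oldv.packed = atomic_load_explicit(pair, memory_order_relaxed);
	do {
		newv = oldv;
		newv.f.count = oldv.f.count + 1;
		if (oldv.f.count == 0) {
			/* First thread to become runnable establishes the pending timestamp */
			newv.f.timestamp = now;
		}
	} while (!atomic_compare_exchange_weak_explicit(pair, &oldv.packed, newv.packed,
	    memory_order_relaxed, memory_order_relaxed));
}
#endif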
2491 #if CONFIG_SCHED_EDGE
2492
2493 static void
2494 sched_clutch_bucket_group_thr_count_inc(
2495 sched_clutch_bucket_group_t clutch_bucket_group,
2496 uint64_t timestamp)
2497 {
2498 sched_clutch_counter_time_t old_pending_data;
2499 sched_clutch_counter_time_t new_pending_data;
2500 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2501 new_pending_data.scct_count = old_pending_data.scct_count + 1;
2502 new_pending_data.scct_timestamp = old_pending_data.scct_timestamp;
2503 if (old_pending_data.scct_count == 0) {
2504 new_pending_data.scct_timestamp = timestamp;
2505 }
2506 });
2507 }
2508
2509 static void
2510 sched_clutch_bucket_group_thr_count_dec(
2511 sched_clutch_bucket_group_t clutch_bucket_group,
2512 uint64_t timestamp)
2513 {
2514 sched_clutch_counter_time_t old_pending_data;
2515 sched_clutch_counter_time_t new_pending_data;
2516 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2517 new_pending_data.scct_count = old_pending_data.scct_count - 1;
2518 if (new_pending_data.scct_count == 0) {
2519 new_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2520 } else {
2521 new_pending_data.scct_timestamp = timestamp;
2522 }
2523 });
2524 }
2525
2526 static uint8_t
2527 sched_clutch_bucket_group_pending_ageout(
2528 sched_clutch_bucket_group_t clutch_bucket_group,
2529 uint64_t timestamp)
2530 {
2531 int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2532 sched_clutch_counter_time_t old_pending_data;
2533 sched_clutch_counter_time_t new_pending_data;
2534 uint8_t cpu_usage_shift = 0;
2535
2536 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2537 cpu_usage_shift = 0;
2538 uint64_t old_pending_ts = old_pending_data.scct_timestamp;
2539 bool old_update = (old_pending_ts >= timestamp);
2540 bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2541 bool no_bucket_load = (bucket_load == 0);
2542 if (old_update || no_pending_time || no_bucket_load) {
2543 os_atomic_rmw_loop_give_up();
2544 }
2545
2546 /* Calculate the time the clutch bucket group has been pending */
2547 uint64_t pending_delta = timestamp - old_pending_ts;
2548 /*
2549 * Other buckets should get a chance to run first before artificially boosting
2550 * this clutch bucket group's interactivity score, at least when the entire root
2551 * bucket is getting a large enough share of CPU.
2552 */
2553 uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] + (bucket_load * sched_clutch_thread_quantum[clutch_bucket_group->scbg_bucket]);
2554 if (pending_delta < interactivity_delta) {
2555 os_atomic_rmw_loop_give_up();
2556 }
2557 cpu_usage_shift = (pending_delta / interactivity_delta);
2558 new_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2559 new_pending_data.scct_count = old_pending_data.scct_count;
2560 });
2561 return cpu_usage_shift;
2562 }
2563
2564 static boolean_t
2565 sched_edge_thread_should_be_inserted_as_bound(
2566 sched_clutch_root_t root_clutch,
2567 thread_t thread)
2568 {
2569 /*
2570 * Check if the thread is bound and is being enqueued in its desired bound cluster.
2571 * If the thread is cluster-bound but to a different cluster, we should enqueue as unbound.
2572 */
2573 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) && (sched_edge_thread_bound_cluster_id(thread) == root_clutch->scr_cluster_id)) {
2574 return TRUE;
2575 }
2576 /*
2577 * Use bound runqueue for shared resource threads. See "cluster shared resource
2578 * threads load balancing" section for details.
2579 */
2580 if (sched_edge_thread_shared_rsrc_type(thread) != CLUSTER_SHARED_RSRC_TYPE_NONE) {
2581 return TRUE;
2582 }
2583 return FALSE;
2584 }
2585
2586 #else /* CONFIG_SCHED_EDGE */
2587
2588 /*
2589 * For the clutch scheduler, atomicity is ensured by making sure all operations
2590 * are happening under the pset lock of the only cluster present on the platform.
2591 */
2592 static void
2593 sched_clutch_bucket_group_thr_count_inc(
2594 sched_clutch_bucket_group_t clutch_bucket_group,
2595 uint64_t timestamp)
2596 {
2597 sched_clutch_hierarchy_locked_assert(&sched_boot_pset->pset_clutch_root);
2598 if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2599 clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2600 }
2601 clutch_bucket_group->scbg_pending_data.scct_count++;
2602 }
2603
2604 static void
2605 sched_clutch_bucket_group_thr_count_dec(
2606 sched_clutch_bucket_group_t clutch_bucket_group,
2607 uint64_t timestamp)
2608 {
2609 sched_clutch_hierarchy_locked_assert(&sched_boot_pset->pset_clutch_root);
2610 clutch_bucket_group->scbg_pending_data.scct_count--;
2611 if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2612 clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2613 } else {
2614 clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2615 }
2616 }
2617
2618 static uint8_t
2619 sched_clutch_bucket_group_pending_ageout(
2620 sched_clutch_bucket_group_t clutch_bucket_group,
2621 uint64_t timestamp)
2622 {
2623 sched_clutch_hierarchy_locked_assert(&sched_boot_pset->pset_clutch_root);
2624 int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2625 uint64_t old_pending_ts = clutch_bucket_group->scbg_pending_data.scct_timestamp;
2626 bool old_update = (old_pending_ts >= timestamp);
2627 bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2628 bool no_bucket_load = (bucket_load == 0);
2629 if (old_update || no_pending_time || no_bucket_load) {
2630 return 0;
2631 }
2632 uint64_t pending_delta = timestamp - old_pending_ts;
2633 /*
2634 * Other buckets should get a chance to run first before artificially boosting
2635 * this clutch bucket group's interactivity score, at least when the entire root
2636 * bucket is getting a large enough share of CPU.
2637 */
2638 uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] + (bucket_load * sched_clutch_thread_quantum[clutch_bucket_group->scbg_bucket]);
2639 if (pending_delta < interactivity_delta) {
2640 return 0;
2641 }
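	/*
	 * Illustrative arithmetic only (hypothetical numbers, same math as the
	 * CONFIG_SCHED_EDGE variant above): with a 10ms pending delta tunable for
	 * this bucket, a 10ms thread quantum and a bucket load of 3, the
	 * interactivity_delta is 10ms + (3 * 10ms) = 40ms. A clutch bucket group
	 * pending for 100ms therefore ages out 100 / 40 = 2 intervals of CPU usage
	 * and its pending timestamp advances by 2 * 40ms = 80ms.
	 */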
2642 uint8_t cpu_usage_shift = (pending_delta / interactivity_delta);
2643 clutch_bucket_group->scbg_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2644 return cpu_usage_shift;
2645 }
2646
2647 #endif /* CONFIG_SCHED_EDGE */
2648
2649 static uint8_t
2650 sched_clutch_bucket_group_interactivity_score_calculate(
2651 sched_clutch_bucket_group_t clutch_bucket_group,
2652 uint64_t timestamp)
2653 {
2654 if (sched_clutch_bucket_is_above_timeshare(clutch_bucket_group->scbg_bucket)) {
2655 /*
2656 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
2657 * priorities, make sure all AboveUI buckets are marked interactive.
2658 */
2659 assert(clutch_bucket_group->scbg_interactivity_data.scct_count == (2 * sched_clutch_bucket_group_interactive_pri));
2660 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2661 }
2662 /* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */
2663 uint8_t pending_intervals = sched_clutch_bucket_group_pending_ageout(clutch_bucket_group, timestamp);
2664 /* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */
2665 sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group, pending_intervals);
2666 uint8_t interactivity_score = sched_clutch_interactivity_from_cpu_data(clutch_bucket_group);
2667 /* Write back any interactivity score update */
2668 #if CONFIG_SCHED_EDGE
2669 sched_clutch_counter_time_t old_interactivity_data;
2670 sched_clutch_counter_time_t new_interactivity_data;
2671 os_atomic_rmw_loop(&clutch_bucket_group->scbg_interactivity_data.scct_packed, old_interactivity_data.scct_packed, new_interactivity_data.scct_packed, relaxed, {
2672 new_interactivity_data.scct_count = old_interactivity_data.scct_count;
2673 if (old_interactivity_data.scct_timestamp >= timestamp) {
2674 os_atomic_rmw_loop_give_up();
2675 }
2676 new_interactivity_data.scct_timestamp = timestamp;
2677 if (old_interactivity_data.scct_timestamp != 0) {
2678 new_interactivity_data.scct_count = interactivity_score;
2679 }
2680 });
2681 return (uint8_t)new_interactivity_data.scct_count;
2682 #else /* !CONFIG_SCHED_EDGE */
2683 sched_clutch_hierarchy_locked_assert(&sched_boot_pset->pset_clutch_root);
2684 if (timestamp > clutch_bucket_group->scbg_interactivity_data.scct_timestamp) {
2685 clutch_bucket_group->scbg_interactivity_data.scct_count = interactivity_score;
2686 clutch_bucket_group->scbg_interactivity_data.scct_timestamp = timestamp;
2687 }
2688 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2689 #endif /* !CONFIG_SCHED_EDGE */
2690 }
2691
2692 /*
2693 * Clutch Bucket Group Run Count and Blocked Time Accounting
2694 *
2695 * The clutch bucket group maintains the number of runnable/running threads in the group.
2696 * Since the blocked time of the clutch bucket group is based on this count, it is
2697 * important to make sure the blocking timestamp and the run count are updated atomically.
2698 *
2699 * Since the run count increments happen without any pset locks held, the scheduler updates
2700 * the count & timestamp using double wide 128 bit atomics.
2701 */
2702
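/*
 * Illustrative sketch (not compiled): blocked time accrual as described above.
 * When the run count drops to zero the current time is stamped; when the group
 * becomes runnable again, the elapsed blocked time (clamped to a threshold) is
 * credited so the interactivity calculation only reflects recent behavior. The
 * names and the clamp value below are hypothetical, for this example only.
 */
#if 0
#include <stdint.h>

#define BLOCKED_TS_INVALID   UINT64_MAX
#define BLOCKED_TIME_CLAMP   1000000ULL /* hypothetical clamp on credited blocked time */

static uint64_t
blocked_time_on_wakeup(uint64_t blocked_ts, uint64_t now)
{
	/* No credit if the group never fully blocked or time did not advance */
	if (blocked_ts == BLOCKED_TS_INVALID || now <= blocked_ts) {
		return 0;
	}
	uint64_t blocked = now - blocked_ts;
	return (blocked > BLOCKED_TIME_CLAMP) ? BLOCKED_TIME_CLAMP : blocked;
}
#endif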
2703 static uint32_t
2704 sched_clutch_bucket_group_run_count_inc(
2705 sched_clutch_bucket_group_t clutch_bucket_group)
2706 {
2707 sched_clutch_counter_time_t old_blocked_data;
2708 sched_clutch_counter_time_t new_blocked_data;
2709
2710 bool update_blocked_time = false;
2711 os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2712 new_blocked_data.scct_count = old_blocked_data.scct_count + 1;
2713 new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2714 update_blocked_time = false;
2715 if (old_blocked_data.scct_count == 0) {
2716 new_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
2717 update_blocked_time = true;
2718 }
2719 });
2720 if (update_blocked_time && (old_blocked_data.scct_timestamp != SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID)) {
2721 uint64_t ctime = mach_absolute_time();
2722 if (ctime > old_blocked_data.scct_timestamp) {
2723 uint64_t blocked_time = ctime - old_blocked_data.scct_timestamp;
2724 blocked_time = MIN(blocked_time, sched_clutch_bucket_group_adjust_threshold);
2725 os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed);
2726 }
2727 }
2728 return (uint32_t)new_blocked_data.scct_count;
2729 }
2730
2731 static uint32_t
2732 sched_clutch_bucket_group_run_count_dec(
2733 sched_clutch_bucket_group_t clutch_bucket_group)
2734 {
2735 sched_clutch_counter_time_t old_blocked_data;
2736 sched_clutch_counter_time_t new_blocked_data;
2737
2738 uint64_t ctime = mach_absolute_time();
2739 os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2740 new_blocked_data.scct_count = old_blocked_data.scct_count - 1;
2741 new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2742 if (new_blocked_data.scct_count == 0) {
2743 new_blocked_data.scct_timestamp = ctime;
2744 }
2745 });
2746 return (uint32_t)new_blocked_data.scct_count;
2747 }
2748
2749 static inline sched_clutch_bucket_t
2750 sched_clutch_bucket_for_thread(
2751 sched_clutch_root_t root_clutch,
2752 thread_t thread)
2753 {
2754 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2755 assert(thread->thread_group == clutch->sc_tg);
2756
2757 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2758 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2759 assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
2760
2761 return clutch_bucket;
2762 }
2763
2764 static inline sched_clutch_bucket_group_t
2765 sched_clutch_bucket_group_for_thread(thread_t prev_thread)
2766 {
2767 sched_clutch_t clutch = sched_clutch_for_thread_group(prev_thread->thread_group);
2768 return &clutch->sc_clutch_groups[prev_thread->th_sched_bucket];
2769 }
2770
2771 /*
2772 * sched_clutch_thread_insert()
2773 *
2774 * Routine to insert a thread into the sched clutch hierarchy.
2775 * Update the counts at all levels of the hierarchy and insert the nodes
2776 * as they become runnable. Always called with the pset lock held.
2777 */
2778 static boolean_t
2779 sched_clutch_thread_insert(
2780 sched_clutch_root_t root_clutch,
2781 thread_t thread,
2782 integer_t options)
2783 {
2784 boolean_t result = FALSE;
2785
2786 sched_clutch_hierarchy_locked_assert(root_clutch);
2787 #if CONFIG_SCHED_EDGE
2788 sched_edge_cluster_cumulative_count_incr(root_clutch, thread->th_sched_bucket);
2789 sched_edge_shared_rsrc_runnable_load_incr(root_clutch, thread);
2790
2791 if (sched_edge_thread_should_be_inserted_as_bound(root_clutch, thread)) {
2792 /*
2793 * Includes threads bound to this specific cluster as well as all
2794 * shared resource threads.
2795 */
2796 return sched_edge_bound_thread_insert(root_clutch, thread, options);
2797 }
2798 #endif /* CONFIG_SCHED_EDGE */
2799
2800 uint64_t current_timestamp = mach_absolute_time();
2801 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2802 assert(thread->thread_group == clutch->sc_tg);
2803 sched_clutch_bucket_t clutch_bucket = sched_clutch_bucket_for_thread(root_clutch, thread);
2804 assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
2805
2806 /*
2807 * Thread linkage in clutch_bucket
2808 *
2809 * A thread has a few linkages within the clutch bucket:
2810 * - A stable priority queue linkage which is the main runqueue (based on sched_pri) for the clutch bucket
2811 * - A regular priority queue linkage which is based on thread's base/promoted pri (used for clutch bucket priority calculation)
2812 * - A queue linkage used for timesharing operations of threads at the scheduler tick
2813 */
2814
2815 /* Insert thread into the clutch_bucket stable priority runqueue using sched_pri */
2816 thread->th_clutch_runq_link.stamp = current_timestamp;
2817 priority_queue_entry_set_sched_pri(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link, thread->sched_pri,
2818 (options & SCHED_TAILQ) ? PRIORITY_QUEUE_ENTRY_NONE : PRIORITY_QUEUE_ENTRY_PREEMPTED);
2819 priority_queue_insert(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2820
2821 /* Insert thread into clutch_bucket priority queue based on the promoted or base priority */
2822 priority_queue_entry_set_sched_pri(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link,
2823 sched_thread_sched_pri_promoted(thread) ? thread->sched_pri : thread->base_pri, false);
2824 priority_queue_insert(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2825
2826 /* Insert thread into timesharing queue of the clutch bucket */
2827 enqueue_tail(&clutch_bucket->scb_thread_timeshare_queue, &thread->th_clutch_timeshare_link);
2828
2829 /* Increment the urgency counter for the root if necessary */
2830 sched_clutch_root_urgency_inc(root_clutch, thread);
2831
2832 os_atomic_inc(&clutch->sc_thr_count, relaxed);
2833 sched_clutch_bucket_group_thr_count_inc(clutch_bucket->scb_group, current_timestamp);
2834
2835 /* Enqueue the clutch into the hierarchy (if needed) and update properties; pick the insertion order based on thread options */
2836 sched_clutch_bucket_options_t scb_options = (options & SCHED_HEADQ) ? SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ : SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ;
2837 if (clutch_bucket->scb_thr_count == 0) {
2838 sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2839 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2840 result = sched_clutch_bucket_runnable(clutch_bucket, root_clutch, current_timestamp, scb_options);
2841 } else {
2842 sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2843 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2844 result = sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, scb_options);
2845 }
2846
2847 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2848 root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2849 SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2850 return result;
2851 }
2852
2853 /*
2854 * sched_clutch_thread_remove()
2855 *
2856 * Routine to remove a thread from the sched clutch hierarchy.
2857 * Update the counts at all levels of the hierarchy and remove the nodes
2858 * as they become empty. Always called with the pset lock held.
2859 */
2860 static void
2861 sched_clutch_thread_remove(
2862 sched_clutch_root_t root_clutch,
2863 thread_t thread,
2864 uint64_t current_timestamp,
2865 sched_clutch_bucket_options_t options)
2866 {
2867 sched_clutch_hierarchy_locked_assert(root_clutch);
2868 #if CONFIG_SCHED_EDGE
2869 sched_edge_cluster_cumulative_count_decr(root_clutch, thread->th_sched_bucket);
2870 sched_edge_shared_rsrc_runnable_load_decr(root_clutch, thread);
2871
2872 if (thread->th_bound_cluster_enqueued) {
2873 sched_edge_bound_thread_remove(root_clutch, thread);
2874 return;
2875 }
2876 #endif /* CONFIG_SCHED_EDGE */
2877 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2878 assert(thread->thread_group == clutch->sc_tg);
2879 thread_assert_runq_nonnull(thread);
2880
2881 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2882 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2883 assert(clutch_bucket->scb_root == root_clutch);
2884
2885 /* Decrement the urgency counter for the root if necessary */
2886 sched_clutch_root_urgency_dec(root_clutch, thread);
2887 /* Remove thread from the clutch_bucket */
2888 priority_queue_remove(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2889 remqueue(&thread->th_clutch_timeshare_link);
2890
2891 priority_queue_remove(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2892
2893 /*
2894 * Warning: After this point, the thread's scheduling fields may be
2895 * modified by other cores that acquire the thread lock.
2896 */
2897 thread_clear_runq(thread);
2898
2899 /* Update counts at various levels of the hierarchy */
2900 os_atomic_dec(&clutch->sc_thr_count, relaxed);
2901 sched_clutch_bucket_group_thr_count_dec(clutch_bucket->scb_group, current_timestamp);
2902 sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
2903 sched_clutch_thr_count_dec(&clutch_bucket->scb_thr_count);
2904
2905 /* Remove the clutch from hierarchy (if needed) and update properties */
2906 if (clutch_bucket->scb_thr_count == 0) {
2907 sched_clutch_bucket_empty(clutch_bucket, root_clutch, current_timestamp, options);
2908 } else {
2909 sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, options);
2910 }
2911
2912 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2913 root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2914 SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2915 }
2916
2917 /*
2918 * sched_clutch_thread_unbound_lookup()
2919 *
2920 * Routine to find the highest priority unbound thread in the root clutch.
2921 * Helps find threads easily for steal/migrate scenarios in the
2922 * Edge scheduler.
2923 */
2924 static thread_t
2925 sched_clutch_thread_unbound_lookup(
2926 sched_clutch_root_t root_clutch,
2927 sched_clutch_root_bucket_t root_bucket,
2928 processor_t _Nullable processor,
2929 thread_t _Nullable prev_thread)
2930 {
2931 assert(processor != NULL || prev_thread == NULL);
2932 assert(root_bucket->scrb_bound == false);
2933 sched_clutch_hierarchy_locked_assert(root_clutch);
2934
2935 /* Find the highest priority clutch bucket in this root bucket */
2936 bool chose_prev_thread = false;
2937 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, root_bucket, processor, prev_thread, &chose_prev_thread);
2938 assert(clutch_bucket != NULL);
2939
2940 if (chose_prev_thread) {
2941 /* We have determined that prev_thread is the highest thread, based on the Clutch bucket level policy */
2942 assert(processor != NULL && prev_thread != NULL);
2943 return prev_thread;
2944 }
2945
2946 /* Find the highest priority runnable thread in this clutch bucket */
2947 thread_t thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
2948 assert(thread != NULL);
2949
2950 /* Consider the previous thread */
2951 if (prev_thread != NULL &&
2952 sched_clutch_bucket_for_thread(root_clutch, prev_thread) == clutch_bucket &&
2953 sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, thread->sched_pri, processor->first_timeslice)) {
2954 thread = prev_thread;
2955 }
2956
2957 return thread;
2958 }
2959
2960 static sched_clutch_root_bucket_t
2961 sched_clutch_root_bucket_for_thread(
2962 sched_clutch_root_t root_clutch,
2963 thread_t prev_thread)
2964 {
2965 #if CONFIG_SCHED_EDGE
2966 if (sched_edge_thread_should_be_inserted_as_bound(root_clutch, prev_thread)) {
2967 return &root_clutch->scr_bound_buckets[prev_thread->th_sched_bucket];
2968 }
2969 #endif /* CONFIG_SCHED_EDGE */
2970 return &root_clutch->scr_unbound_buckets[prev_thread->th_sched_bucket];
2971 }
2972
2973 /*
2974 * sched_clutch_hierarchy_thread_highest()
2975 *
2976 * Routine to traverse the Clutch hierarchy and return the highest thread which
2977 * should be selected to run next, optionally comparing against the previously
2978 * running thread. Removes the highest thread with sched_clutch_thread_remove()
2979 * depending on the traverse mode and whether it is the previously running thread.
2980 * Always called with the pset lock held.
2981 */
2982 static thread_t
2983 sched_clutch_hierarchy_thread_highest(
2984 sched_clutch_root_t root_clutch,
2985 processor_t processor,
2986 thread_t _Nullable prev_thread,
2987 sched_clutch_traverse_mode_t mode)
2988 {
2989 assert(mode != SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || prev_thread == NULL);
2990 sched_clutch_hierarchy_locked_assert(root_clutch);
2991
2992 thread_t highest_thread = NULL;
2993 uint64_t current_timestamp = mach_absolute_time();
2994 bool chose_prev_thread = false;
2995 sched_clutch_dbg_thread_select_packed_t debug_info = {0};
2996 sched_clutch_root_bucket_t prev_root_bucket = prev_thread != NULL ? sched_clutch_root_bucket_for_thread(root_clutch, prev_thread) : NULL;
2997 sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(root_clutch, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL, prev_root_bucket, prev_thread, &chose_prev_thread, mode, &debug_info);
2998 if (chose_prev_thread) {
2999 /* We disambiguated that we want to keep running the previous thread */
3000 highest_thread = processor->active_thread;
3001 goto done_selecting_thread;
3002 }
3003 if (root_bucket == NULL) {
3004 /* The Clutch hierarchy has no runnable threads, including the previous thread */
3005 assert(sched_clutch_root_count(root_clutch) == 0);
3006 assert(prev_thread == NULL);
3007 return NULL;
3008 }
3009 if (root_bucket != prev_root_bucket) {
3010 /* We have ruled out continuing to run the previous thread, based on the root bucket level policy */
3011 prev_thread = NULL;
3012 assert((mode == SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT) || (prev_root_bucket == NULL) ||
3013 (prev_root_bucket->scrb_bucket >= root_bucket->scrb_bucket) || (root_bucket->scrb_starvation_avoidance) ||
3014 (prev_root_bucket->scrb_bound != root_bucket->scrb_bound) ||
3015 (root_bucket->scrb_warp_remaining > 0 && root_bucket->scrb_warped_deadline > current_timestamp && prev_root_bucket->scrb_warp_remaining == 0));
3016 }
3017
3018 if (root_bucket->scrb_bound) {
3019 highest_thread = sched_clutch_thread_bound_lookup(root_clutch, root_bucket, processor, prev_thread);
3020 } else {
3021 highest_thread = sched_clutch_thread_unbound_lookup(root_clutch, root_bucket, processor, prev_thread);
3022 }
3023
3024 if (mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY ||
3025 (mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT && highest_thread != processor->active_thread)) {
3026 assert(mode != SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || highest_thread != processor->active_thread);
3027 sched_clutch_thread_remove(root_clutch, highest_thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
3028 }
3029
3030 done_selecting_thread:
3031 debug_info.trace_data.version = SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION;
3032 debug_info.trace_data.traverse_mode = mode;
3033 debug_info.trace_data.cluster_id = root_clutch->scr_cluster_id;
3034 debug_info.trace_data.selection_was_cluster_bound = root_bucket->scrb_bound;
3035 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
3036 thread_tid(highest_thread), thread_group_get_id(highest_thread->thread_group), root_bucket->scrb_bucket, debug_info.scdts_trace_data_packed, 0);
3037 return highest_thread;
3038 }
3039
3040 /* High level global accessor routines */
3041
3042 /*
3043 * sched_clutch_root_urgency()
3044 *
3045 * Routine to get the urgency of the highest runnable
3046 * thread in the hierarchy.
3047 */
3048 static uint32_t
3049 sched_clutch_root_urgency(
3050 sched_clutch_root_t root_clutch)
3051 {
3052 return root_clutch->scr_urgency;
3053 }
3054
3055 /*
3056 * sched_clutch_root_count_sum()
3057 *
3058 * The count_sum mechanism is used for scheduler runq
3059 * statistics calculation. It's only useful for debugging
3060 * purposes; since it takes a mach_absolute_time() on
3061 * other scheduler implementations, it's better to avoid
3062 * populating this until absolutely necessary.
3063 */
3064 static uint32_t
3065 sched_clutch_root_count_sum(
3066 __unused sched_clutch_root_t root_clutch)
3067 {
3068 return 0;
3069 }
3070
3071 /*
3072 * sched_clutch_root_priority()
3073 *
3074 * Routine to get the priority of the highest runnable
3075 * thread in the hierarchy.
3076 */
3077 static int
3078 sched_clutch_root_priority(
3079 sched_clutch_root_t root_clutch)
3080 {
3081 return root_clutch->scr_priority;
3082 }
3083
3084 /*
3085 * sched_clutch_root_count()
3086 *
3087 * Returns total number of runnable threads in the hierarchy.
3088 */
3089 uint32_t
3090 sched_clutch_root_count(
3091 sched_clutch_root_t root_clutch)
3092 {
3093 return root_clutch->scr_thr_count;
3094 }
3095
3096 /*
3097 * sched_clutch_thread_pri_shift()
3098 *
3099 * Routine to get the priority shift value for a thread.
3100 * Since the timesharing is done at the clutch_bucket level,
3101 * this routine gets the clutch_bucket and retrieves the
3102 * values from there.
3103 */
3104 uint32_t
3105 sched_clutch_thread_pri_shift(
3106 thread_t thread,
3107 sched_bucket_t bucket)
3108 {
3109 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3110 return INT8_MAX;
3111 }
3112 assert(bucket != TH_BUCKET_RUN);
3113 sched_clutch_t clutch = sched_clutch_for_thread(thread);
3114 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
3115 return os_atomic_load(&clutch_bucket_group->scbg_pri_shift, relaxed);
3116 }
3117
3118 #pragma mark -- Clutch Scheduler Algorithm
3119
3120 static void
3121 sched_clutch_init(void);
3122
3123 static thread_t
3124 sched_clutch_steal_thread(processor_set_t pset);
3125
3126 #if !SCHED_TEST_HARNESS
3127
3128 static void
3129 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context);
3130
3131 #endif /* !SCHED_TEST_HARNESS */
3132
3133 static boolean_t
3134 sched_clutch_processor_enqueue(processor_t processor, thread_t thread,
3135 sched_options_t options);
3136
3137 static boolean_t
3138 sched_clutch_processor_queue_remove(processor_t processor, thread_t thread);
3139
3140 static ast_t
3141 sched_clutch_processor_csw_check(processor_t processor);
3142
3143 static boolean_t
3144 sched_clutch_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
3145
3146 static int
3147 sched_clutch_runq_count(processor_t processor);
3148
3149 static boolean_t
3150 sched_clutch_processor_queue_empty(processor_t processor);
3151
3152 #if !SCHED_TEST_HARNESS
3153
3154 static uint64_t
3155 sched_clutch_runq_stats_count_sum(processor_t processor);
3156
3157 #endif /* !SCHED_TEST_HARNESS */
3158
3159 static int
3160 sched_clutch_processor_bound_count(processor_t processor);
3161
3162 static void
3163 sched_clutch_pset_init(processor_set_t pset);
3164
3165 static void
3166 sched_clutch_processor_init(processor_t processor);
3167
3168 static thread_t
3169 sched_clutch_processor_highest_thread(processor_t processor, sched_clutch_traverse_mode_t mode);
3170
3171 static thread_t
3172 sched_clutch_choose_thread(processor_t processor, int priority, thread_t prev_thread, ast_t reason);
3173
3174 #if !SCHED_TEST_HARNESS
3175
3176 static void
3177 sched_clutch_processor_queue_shutdown(processor_t processor, struct pulled_thread_queue * threadq);
3178
3179 #endif /* !SCHED_TEST_HARNESS */
3180
3181 static sched_mode_t
3182 sched_clutch_initial_thread_sched_mode(task_t parent_task);
3183
3184 static uint32_t
3185 sched_clutch_initial_quantum_size(thread_t thread);
3186
3187 static uint32_t
3188 sched_clutch_run_incr(thread_t thread);
3189
3190 static uint32_t
3191 sched_clutch_run_decr(thread_t thread);
3192
3193 static void
3194 sched_clutch_update_thread_bucket(thread_t thread);
3195
3196 #if !SCHED_TEST_HARNESS
3197
3198 static void
3199 sched_clutch_thread_group_recommendation_change(struct thread_group *tg, cluster_type_t new_recommendation);
3200
3201 #endif /* !SCHED_TEST_HARNESS */
3202
3203 const struct sched_dispatch_table sched_clutch_dispatch = {
3204 .sched_name = "clutch",
3205 .init = sched_clutch_init,
3206 .timebase_init = sched_timeshare_timebase_init,
3207 .processor_init = sched_clutch_processor_init,
3208 .pset_init = sched_clutch_pset_init,
3209 .choose_thread = sched_clutch_choose_thread,
3210 .steal_thread_enabled = sched_steal_thread_enabled,
3211 .steal_thread = sched_clutch_steal_thread,
3212 .processor_enqueue = sched_clutch_processor_enqueue,
3213 .processor_queue_remove = sched_clutch_processor_queue_remove,
3214 .processor_queue_empty = sched_clutch_processor_queue_empty,
3215 .priority_is_urgent = priority_is_urgent,
3216 .processor_csw_check = sched_clutch_processor_csw_check,
3217 .processor_queue_has_priority = sched_clutch_processor_queue_has_priority,
3218 .initial_quantum_size = sched_clutch_initial_quantum_size,
3219 .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode,
3220 .processor_runq_count = sched_clutch_runq_count,
3221 .processor_bound_count = sched_clutch_processor_bound_count,
3222 .multiple_psets_enabled = TRUE,
3223 .avoid_processor_enabled = FALSE,
3224 .thread_avoid_processor = NULL,
3225 .update_thread_bucket = sched_clutch_update_thread_bucket,
3226 .cpu_init_completed = NULL,
3227 .thread_eligible_for_pset = NULL,
3228 .update_pset_load_average = sched_update_pset_load_average,
3229 .update_pset_avg_execution_time = sched_update_pset_avg_execution_time,
3230
3231 .rt_choose_processor = sched_rt_choose_processor,
3232 .rt_steal_thread = NULL,
3233 .rt_init_pset = sched_rt_init_pset,
3234 .rt_init_completed = sched_rt_init_completed,
3235 .rt_runq_count_sum = sched_rt_runq_count_sum,
3236
3237 #if !SCHED_TEST_HARNESS
3238 .maintenance_continuation = sched_timeshare_maintenance_continue,
3239 .compute_timeshare_priority = sched_compute_timeshare_priority,
3240 .choose_node = sched_choose_node,
3241 .choose_processor = choose_processor,
3242 .processor_queue_shutdown = sched_clutch_processor_queue_shutdown,
3243 .can_update_priority = can_update_priority,
3244 .update_priority = update_priority,
3245 .lightweight_update_priority = lightweight_update_priority,
3246 .quantum_expire = sched_default_quantum_expire,
3247 .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum,
3248 .thread_update_scan = sched_clutch_thread_update_scan,
3249 .processor_balance = sched_SMT_balance,
3250 .qos_max_parallelism = sched_qos_max_parallelism,
3251 .check_spill = sched_check_spill,
3252 .ipi_policy = sched_ipi_policy,
3253 .thread_should_yield = sched_thread_should_yield,
3254 .run_count_incr = sched_clutch_run_incr,
3255 .run_count_decr = sched_clutch_run_decr,
3256 .pset_made_schedulable = sched_pset_made_schedulable,
3257 .thread_group_recommendation_change = sched_clutch_thread_group_recommendation_change,
3258
3259 .rt_queue_shutdown = sched_rt_queue_shutdown,
3260 .rt_runq_scan = sched_rt_runq_scan,
3261 #endif /* !SCHED_TEST_HARNESS */
3262 };
3263
3264 __attribute__((always_inline))
3265 static inline run_queue_t
3266 sched_clutch_bound_runq(processor_t processor)
3267 {
3268 return &processor->runq;
3269 }
3270
3271 __attribute__((always_inline))
3272 static inline sched_clutch_root_t
3273 sched_clutch_processor_root_clutch(processor_t processor)
3274 {
3275 return &processor->processor_set->pset_clutch_root;
3276 }
3277
3278 __attribute__((always_inline))
3279 static inline run_queue_t
3280 sched_clutch_thread_bound_runq(processor_t processor, __assert_only thread_t thread)
3281 {
3282 assert(thread->bound_processor == processor);
3283 return sched_clutch_bound_runq(processor);
3284 }
3285
3286 static uint32_t
3287 sched_clutch_initial_quantum_size(thread_t thread)
3288 {
3289 if (thread == THREAD_NULL) {
3290 return std_quantum;
3291 }
3292 assert(sched_clutch_thread_quantum[thread->th_sched_bucket] <= UINT32_MAX);
3293 return (uint32_t)sched_clutch_thread_quantum[thread->th_sched_bucket];
3294 }
3295
3296 static sched_mode_t
3297 sched_clutch_initial_thread_sched_mode(task_t parent_task)
3298 {
3299 if (parent_task == kernel_task) {
3300 return TH_MODE_FIXED;
3301 } else {
3302 return TH_MODE_TIMESHARE;
3303 }
3304 }
3305
3306 static void
3307 sched_clutch_processor_init(processor_t processor)
3308 {
3309 run_queue_init(&processor->runq);
3310 }
3311
3312 static void
3313 sched_clutch_pset_init(processor_set_t pset)
3314 {
3315 sched_clutch_root_init(&pset->pset_clutch_root, pset);
3316 }
3317
3318 static void
3319 sched_clutch_tunables_init(void)
3320 {
3321 sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
3322 sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
3323 sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
3324 clock_interval_to_absolutetime_interval(SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS,
3325 NSEC_PER_USEC, &sched_clutch_bucket_group_adjust_threshold);
3326 assert(sched_clutch_bucket_group_adjust_threshold <= CLUTCH_CPU_DATA_MAX);
3327 sched_clutch_us_to_abstime(sched_clutch_bucket_group_pending_delta_us, sched_clutch_bucket_group_pending_delta);
3328 }
3329
3330 static void
3331 sched_clutch_init(void)
3332 {
3333 if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
3334 sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
3335 }
3336 sched_timeshare_init();
3337 sched_clutch_tunables_init();
3338 }
3339
3340 static inline bool
3341 sched_clutch_pri_greater_than_tiebreak(int pri_one, int pri_two, bool one_wins_ties)
3342 {
3343 if (one_wins_ties) {
3344 return pri_one >= pri_two;
3345 } else {
3346 return pri_one > pri_two;
3347 }
3348 }
3349
3350 /*
3351 * sched_clutch_processor_highest_thread()
3352 *
3353 * Routine to determine the highest thread on the entire cluster runqueue which
3354 * should be selected to run next, optionally comparing against the previously
3355 * running thread. Removes the highest thread from the runqueue, depending on the
3356 * traverse mode and whether the highest thread is the previously running thread.
3357 *
3358 * Always called with the pset lock held. Assumes that processor->active_thread
3359 * may be locked and modified by another processor.
3360 */
3361 static thread_t
3362 sched_clutch_processor_highest_thread(
3363 processor_t processor,
3364 sched_clutch_traverse_mode_t mode)
3365 {
3366 sched_clutch_root_t root_clutch = sched_clutch_processor_root_clutch(processor);
3367 int clutch_pri = sched_clutch_root_priority(root_clutch);
3368 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3369 int bound_pri = bound_runq->highq;
3370
3371 bool has_prev_thread = mode == SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT;
3372 thread_t prev_thread = has_prev_thread ? processor->active_thread : NULL;
3373
3374 if (bound_runq->count == 0 && root_clutch->scr_thr_count == 0) {
3375 /* The runqueue is totally empty */
3376 assert(bound_pri < MINPRI && clutch_pri < MINPRI);
3377 return prev_thread;
3378 }
3379
3380 if (has_prev_thread) {
3381 if (prev_thread->sched_pri >= BASEPRI_RTQUEUES) {
3382 /* The previous thread is real-time and thus guaranteed higher than the non-RT runqueue */
3383 return prev_thread;
3384 }
3385 /* Allow the previous thread to influence the priority comparison of Clutch hierarchy vs. processor-bound runqueue */
3386 if (prev_thread->bound_processor != NULL) {
3387 bound_pri = MAX(bound_pri, prev_thread->sched_pri);
3388 } else {
3389 clutch_pri = MAX(clutch_pri, prev_thread->sched_pri);
3390 }
3391 }
3392
3393 bool prev_thread_is_not_processor_bound = has_prev_thread && (prev_thread->bound_processor == NULL);
3394 bool prev_thread_is_processor_bound = has_prev_thread && (prev_thread->bound_processor != NULL);
3395 thread_t next_thread = prev_thread;
3396 if (clutch_pri > bound_pri) {
3397 if (root_clutch->scr_thr_count == 0) {
3398 goto found_thread;
3399 }
3400 next_thread = sched_clutch_hierarchy_thread_highest(root_clutch, processor, prev_thread_is_not_processor_bound ? prev_thread : NULL, mode);
3401 } else {
3402 if (bound_runq->count == 0 ||
3403 (prev_thread_is_processor_bound && sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, bound_runq->highq, processor->first_timeslice))) {
3404 goto found_thread;
3405 }
3406 next_thread = (mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY) ?
3407 run_queue_dequeue(bound_runq, SCHED_HEADQ) : run_queue_peek(bound_runq);
3408 assert(mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || next_thread != prev_thread);
3409 }
3410 found_thread:
3411 assert(next_thread != NULL);
3412 return next_thread;
3413 }
3414
3415 static thread_t
3416 sched_clutch_choose_thread(
3417 processor_t processor,
3418 __unused int priority,
3419 thread_t _Nullable prev_thread,
3420 __unused ast_t reason)
3421 {
3422 assert(prev_thread == NULL || prev_thread == processor->active_thread);
3423 return sched_clutch_processor_highest_thread(processor, prev_thread != NULL ? SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT : SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY);
3424 }
3425
3426 static boolean_t
3427 sched_clutch_processor_enqueue(
3428 processor_t processor,
3429 thread_t thread,
3430 sched_options_t options)
3431 {
3432 boolean_t result;
3433
3434 thread_set_runq_locked(thread, processor);
3435 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3436 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3437 result = sched_clutch_thread_insert(pset_clutch_root, thread, options);
3438 } else {
3439 run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
3440 result = run_queue_enqueue(rq, thread, options);
3441 }
3442 return result;
3443 }
3444
3445 static boolean_t
3446 sched_clutch_processor_queue_empty(processor_t processor)
3447 {
3448 return sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0 &&
3449 sched_clutch_bound_runq(processor)->count == 0;
3450 }
3451
3452 static ast_t
3453 sched_clutch_processor_csw_check(processor_t processor)
3454 {
3455 assert(processor->active_thread != NULL);
3456 thread_t runqueue_thread = sched_clutch_processor_highest_thread(processor, SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT);
3457 if (runqueue_thread != processor->active_thread) {
3458 /* Found a better thread to run */
3459 if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0 ||
3460 sched_clutch_bound_runq(processor)->urgency > 0) {
3461 return AST_PREEMPT | AST_URGENT;
3462 }
3463 return AST_PREEMPT;
3464 }
3465 return AST_NONE;
3466 }
3467
3468 static boolean_t
3469 sched_clutch_processor_queue_has_priority(
3470 __unused processor_t processor,
3471 __unused int priority,
3472 __unused boolean_t gte)
3473 {
3474 /*
3475 * Never short-circuit the Clutch runqueue by returning FALSE here. Instead,
3476 * thread_select() should always go through sched_clutch_choose_thread().
3477 */
3478 return TRUE;
3479 }
3480
3481 static int
3482 sched_clutch_runq_count(processor_t processor)
3483 {
3484 return (int)sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) + sched_clutch_bound_runq(processor)->count;
3485 }
3486
3487 #if !SCHED_TEST_HARNESS
3488
3489 static uint64_t
3490 sched_clutch_runq_stats_count_sum(processor_t processor)
3491 {
3492 uint64_t bound_sum = sched_clutch_bound_runq(processor)->runq_stats.count_sum;
3493
3494 if (processor->cpu_id == processor->processor_set->cpu_set_low) {
3495 return bound_sum + sched_clutch_root_count_sum(sched_clutch_processor_root_clutch(processor));
3496 } else {
3497 return bound_sum;
3498 }
3499 }
3500
3501 #endif /* !SCHED_TEST_HARNESS */
3502
3503 static int
3504 sched_clutch_processor_bound_count(processor_t processor)
3505 {
3506 return sched_clutch_bound_runq(processor)->count;
3507 }
3508
3509 #if !SCHED_TEST_HARNESS
3510
3511 static void
3512 sched_clutch_processor_queue_shutdown(processor_t processor, struct pulled_thread_queue * threadq)
3513 {
3514 processor_set_t pset = processor->processor_set;
3515 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3516
3517 /* We only need to migrate threads if this is the last active processor in the pset */
3518 if (pset->online_processor_count == 0) {
3519 while (sched_clutch_root_count(pset_clutch_root) > 0) {
3520 thread_t thread = sched_clutch_hierarchy_thread_highest(
3521 pset_clutch_root, processor, NULL, SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY);
3522 pulled_thread_queue_enqueue(threadq, thread);
3523 }
3524 }
3525
3526 pset_unlock(pset);
3527 }
3528
3529 #endif /* !SCHED_TEST_HARNESS */
3530
3531 static boolean_t
3532 sched_clutch_processor_queue_remove(
3533 processor_t processor,
3534 thread_t thread)
3535 {
3536 processor_set_t pset = processor->processor_set;
3537
3538 pset_lock(pset);
3539
3540 if (processor == thread_get_runq_locked(thread)) {
3541 /*
3542 * Thread is on a run queue and we have a lock on
3543 * that run queue.
3544 */
3545 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3546 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3547 sched_clutch_thread_remove(pset_clutch_root, thread, mach_absolute_time(), SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
3548 } else {
3549 run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
3550 run_queue_remove(rq, thread);
3551 }
3552 } else {
3553 /*
3554 * The thread left the run queue before we could
3555 * lock the run queue.
3556 */
3557 thread_assert_runq_null(thread);
3558 processor = PROCESSOR_NULL;
3559 }
3560
3561 pset_unlock(pset);
3562
3563 return processor != PROCESSOR_NULL;
3564 }
3565
3566 static thread_t
3567 sched_clutch_steal_thread(__unused processor_set_t pset)
3568 {
3569 /* Thread stealing is not enabled for single cluster clutch scheduler platforms */
3570 return THREAD_NULL;
3571 }
3572
3573 #if !SCHED_TEST_HARNESS
3574
3575 static void
3576 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context)
3577 {
3578 boolean_t restart_needed = FALSE;
3579 processor_t processor = processor_list;
3580 processor_set_t pset;
3581 thread_t thread;
3582 spl_t s;
3583
3584 /*
3585 * We update the threads associated with each processor (bound and idle threads)
3586 * and then update the threads in each pset runqueue.
3587 */
3588
3589 do {
3590 do {
3591 pset = processor->processor_set;
3592
3593 s = splsched();
3594 pset_lock(pset);
3595
3596 restart_needed = runq_scan(sched_clutch_bound_runq(processor), scan_context);
3597
3598 pset_unlock(pset);
3599 splx(s);
3600
3601 if (restart_needed) {
3602 break;
3603 }
3604
3605 thread = processor->idle_thread;
3606 if (thread != THREAD_NULL && thread->sched_stamp != os_atomic_load(&sched_tick, relaxed)) {
3607 if (thread_update_add_thread(thread) == FALSE) {
3608 restart_needed = TRUE;
3609 break;
3610 }
3611 }
3612 } while ((processor = processor->processor_list) != NULL);
3613
3614 /* Ok, we now have a collection of candidates -- fix them. */
3615 thread_update_process_threads();
3616 } while (restart_needed);
3617
3618 pset_node_t node = &pset_node0;
3619 pset = node->psets;
3620
3621 do {
3622 do {
3623 restart_needed = FALSE;
3624 while (pset != NULL) {
3625 s = splsched();
3626 pset_lock(pset);
3627
3628 if (sched_clutch_root_count(&pset->pset_clutch_root) > 0) {
3629 for (sched_bucket_t bucket = TH_BUCKET_SHARE_FG; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
3630 restart_needed = runq_scan(&pset->pset_clutch_root.scr_bound_buckets[bucket].scrb_bound_thread_runq, scan_context);
3631 if (restart_needed) {
3632 break;
3633 }
3634 }
3635 queue_t clutch_bucket_list = &pset->pset_clutch_root.scr_clutch_buckets;
3636 sched_clutch_bucket_t clutch_bucket;
3637 qe_foreach_element(clutch_bucket, clutch_bucket_list, scb_listlink) {
3638 sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, scan_context->sched_tick_last_abstime);
3639 restart_needed = sched_clutch_timeshare_scan(&clutch_bucket->scb_thread_timeshare_queue, clutch_bucket->scb_thr_count, scan_context);
3640 if (restart_needed) {
3641 break;
3642 }
3643 }
3644 }
3645
3646 pset_unlock(pset);
3647 splx(s);
3648
3649 if (restart_needed) {
3650 break;
3651 }
3652 pset = pset->pset_list;
3653 }
3654
3655 if (restart_needed) {
3656 break;
3657 }
3658 } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
3659
3660 /* Ok, we now have a collection of candidates -- fix them. */
3661 thread_update_process_threads();
3662 } while (restart_needed);
3663 }
3664
3665 /*
3666 * For threads that have changed sched_pri without changing the
3667 * base_pri for any reason other than decay, use the sched_pri
3668 * as the bucketizing priority instead of base_pri. All such
3669 * changes are typically due to kernel locking primitive boosts
3670 * or demotions.
3671 */
3672 static boolean_t
3673 sched_thread_sched_pri_promoted(thread_t thread)
3674 {
3675 return (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) ||
3676 (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) ||
3677 (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) ||
3678 (thread->kern_promotion_schedpri != 0);
3679 }
3680
3681 #endif /* !SCHED_TEST_HARNESS */
3682
3683 /*
3684 * For the clutch scheduler, the run counts are maintained in the clutch
3685 * buckets (i.e., the thread group scheduling structure).
3686 */
3687 static uint32_t
3688 sched_clutch_run_incr(thread_t thread)
3689 {
3690 assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
3691 uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3692 sched_clutch_thread_run_bucket_incr(thread, thread->th_sched_bucket);
3693 return new_count;
3694 }
3695
3696 static uint32_t
3697 sched_clutch_run_decr(thread_t thread)
3698 {
3699 assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);
3700 uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3701 sched_clutch_thread_run_bucket_decr(thread, thread->th_sched_bucket);
3702 return new_count;
3703 }
3704
3705 /*
3706 * Routine to update the scheduling bucket for the thread.
3707 *
3708 * In the clutch scheduler implementation, the thread's bucket
3709 * is based on sched_pri if it was promoted due to a kernel
3710 * primitive; otherwise it's based on the thread's base_pri. This
3711 * enhancement allows promoted threads to reach a higher priority
3712 * bucket and potentially get selected sooner for scheduling.
3713 *
3714 * Also, the clutch scheduler does not honor fixed priority below
3715 * FG priority. It simply puts those threads in the corresponding
3716 * timeshare bucket. The reason for doing that is that it is
3717 * extremely hard to define the scheduling properties of such threads
3718 * and they typically lead to performance issues.
3719 *
3720 * Called with the thread lock held and the thread held off the runqueue.
3721 */
3722
3723 void
3724 sched_clutch_update_thread_bucket(thread_t thread)
3725 {
3726 sched_bucket_t old_bucket = thread->th_sched_bucket;
3727 thread_assert_runq_null(thread);
3728 int pri = (sched_thread_sched_pri_promoted(thread)) ? thread->sched_pri : thread->base_pri;
3729 sched_bucket_t new_bucket = sched_clutch_thread_bucket_map(thread, pri);
3730
3731 if (old_bucket == new_bucket) {
3732 return;
3733 }
3734
3735 /* Bypass accounting CPU usage for a newly created thread */
3736 if (old_bucket != TH_BUCKET_RUN) {
3737 /* Attribute CPU usage with the old scheduling bucket */
3738 sched_clutch_thread_tick_delta(thread, NULL);
3739 }
3740
3741 /* Transition to the new sched_bucket */
3742 thread->th_sched_bucket = new_bucket;
3743 thread->pri_shift = sched_clutch_thread_pri_shift(thread, new_bucket);
3744
3745 /*
3746 * Since this is called after the thread has been removed from the runq,
3747 * only the run counts need to be updated. The re-insert into the runq
3748 * would put the thread into the correct new bucket's runq.
3749 */
3750 if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
3751 sched_clutch_thread_run_bucket_decr(thread, old_bucket);
3752 sched_clutch_thread_run_bucket_incr(thread, new_bucket);
3753 }
3754 }
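/*
 * Illustrative scenario for the promotion handling above (hypothetical
 * priorities, not taken from this file): a utility-band thread whose
 * sched_pri has been boosted by a kernel mutex promotion is bucketized
 * using that boosted priority, so sched_clutch_thread_bucket_map() places
 * it in a higher scheduling bucket than its unboosted base_pri would
 * select, letting it be considered earlier during root bucket selection.
 */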
3755
3756 #if !SCHED_TEST_HARNESS
3757
3758 static void
3759 sched_clutch_thread_group_recommendation_change(__unused struct thread_group *tg, __unused cluster_type_t new_recommendation)
3760 {
3761 /* Clutch ignores the recommendation because Clutch does not migrate
3762 * threads between cluster types independently from the Edge scheduler.
3763 */
3764 }
3765
3766 #endif /* !SCHED_TEST_HARNESS */
3767
3768 #if CONFIG_SCHED_EDGE
3769
3770 /* Implementation of the AMP version of the clutch scheduler */
3771
3772 static void
3773 sched_edge_init(void);
3774
3775 static void
3776 sched_edge_pset_init(processor_set_t pset);
3777
3778 static thread_t
3779 sched_edge_processor_idle(processor_set_t pset);
3780
3781 static boolean_t
3782 sched_edge_processor_queue_empty(processor_t processor);
3783
3784 static void
3785 sched_edge_processor_queue_shutdown(processor_t processor, struct pulled_thread_queue * threadq);
3786
3787 static processor_t
3788 sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread, sched_options_t *options_inout);
3789
3790 static void
3791 sched_edge_quantum_expire(thread_t thread);
3792
3793 static bool
3794 sched_edge_thread_avoid_processor(processor_t processor, thread_t thread, ast_t reason);
3795
3796 static bool
3797 sched_edge_balance(processor_t cprocessor, processor_set_t cpset);
3798
3799 static void
3800 sched_edge_check_spill(processor_set_t pset, thread_t thread);
3801
3802 static bool
3803 sched_edge_thread_should_yield(processor_t processor, thread_t thread);
3804
3805 static void
3806 sched_edge_pset_made_schedulable(processor_set_t pset);
3807
3808 static void
3809 sched_edge_cpu_init_completed(void);
3810
3811 static bool
3812 sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset);
3813
3814 static bool
3815 sched_edge_steal_thread_enabled(processor_set_t pset);
3816
3817 static sched_ipi_type_t
3818 sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event);
3819
3820 static uint32_t
3821 sched_edge_qos_max_parallelism(int qos, uint64_t options);
3822
3823 static void
3824 sched_edge_update_pset_load_average(processor_set_t pset, uint64_t curtime);
3825
3826 static void
3827 sched_edge_update_pset_avg_execution_time(processor_set_t pset, uint64_t execution_time, uint64_t curtime, sched_bucket_t sched_bucket);
3828
3829 static uint32_t
3830 sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket);
3831
3832 static uint32_t
3833 sched_edge_run_count_incr(thread_t thread);
3834
3835 static bool
3836 sched_edge_stir_the_pot_core_type_is_desired(processor_set_t pset);
3837
3838 const struct sched_dispatch_table sched_edge_dispatch = {
3839 .sched_name = "edge",
3840 .init = sched_edge_init,
3841 .timebase_init = sched_timeshare_timebase_init,
3842 .processor_init = sched_clutch_processor_init,
3843 .pset_init = sched_edge_pset_init,
3844 .choose_thread = sched_clutch_choose_thread,
3845 .steal_thread_enabled = sched_edge_steal_thread_enabled,
3846 .steal_thread = sched_edge_processor_idle,
3847 .choose_processor = sched_edge_choose_processor,
3848 .processor_enqueue = sched_clutch_processor_enqueue,
3849 .processor_queue_remove = sched_clutch_processor_queue_remove,
3850 .processor_queue_empty = sched_edge_processor_queue_empty,
3851 .priority_is_urgent = priority_is_urgent,
3852 .processor_csw_check = sched_clutch_processor_csw_check,
3853 .processor_queue_has_priority = sched_clutch_processor_queue_has_priority,
3854 .initial_quantum_size = sched_clutch_initial_quantum_size,
3855 .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode,
3856 .processor_runq_count = sched_clutch_runq_count,
3857 .processor_bound_count = sched_clutch_processor_bound_count,
3858 .multiple_psets_enabled = TRUE,
3859 .avoid_processor_enabled = TRUE,
3860 .thread_avoid_processor = sched_edge_thread_avoid_processor,
3861 .processor_balance = sched_edge_balance,
3862 .qos_max_parallelism = sched_edge_qos_max_parallelism,
3863 .check_spill = sched_edge_check_spill,
3864 .ipi_policy = sched_edge_ipi_policy,
3865 .thread_should_yield = sched_edge_thread_should_yield,
3866 .update_thread_bucket = sched_clutch_update_thread_bucket,
3867 .cpu_init_completed = sched_edge_cpu_init_completed,
3868 .thread_eligible_for_pset = sched_edge_thread_eligible_for_pset,
3869 .update_pset_load_average = sched_edge_update_pset_load_average,
3870 .update_pset_avg_execution_time = sched_edge_update_pset_avg_execution_time,
3871
3872 .rt_choose_processor = sched_rt_choose_processor,
3873 .rt_steal_thread = sched_rt_steal_thread,
3874 .rt_init_pset = sched_rt_init_pset,
3875 .rt_init_completed = sched_rt_init_completed,
3876 .rt_runq_count_sum = sched_rt_runq_count_sum,
3877
3878 #if !SCHED_TEST_HARNESS
3879 .maintenance_continuation = sched_timeshare_maintenance_continue,
3880 .compute_timeshare_priority = sched_compute_timeshare_priority,
3881 .choose_node = sched_choose_node,
3882 .processor_queue_shutdown = sched_edge_processor_queue_shutdown,
3883 .can_update_priority = can_update_priority,
3884 .update_priority = update_priority,
3885 .lightweight_update_priority = lightweight_update_priority,
3886 .quantum_expire = sched_edge_quantum_expire,
3887 .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum,
3888 .thread_update_scan = sched_clutch_thread_update_scan,
3889 .run_count_incr = sched_edge_run_count_incr,
3890 .run_count_decr = sched_clutch_run_decr,
3891 .pset_made_schedulable = sched_edge_pset_made_schedulable,
3892 .thread_group_recommendation_change = NULL,
3893
3894 .rt_queue_shutdown = sched_rt_queue_shutdown,
3895 .rt_runq_scan = sched_rt_runq_scan,
3896 #endif /* !SCHED_TEST_HARNESS */
3897 };
3898
3899 static _Atomic bitmap_t sched_edge_available_pset_bitmask[BITMAP_LEN(MAX_PSETS)];
3900
3901 /*
3902 * sched_edge_thread_bound_cluster_id()
3903 *
3904 * Routine to determine which cluster a particular thread is bound to. Uses
3905 * the sched_flags on the thread to map back to a specific cluster id.
3906 *
3907 * <Edge Multi-cluster Support Needed>
3908 */
3909 static uint32_t
3910 sched_edge_thread_bound_cluster_id(thread_t thread)
3911 {
3912 assert(SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread));
3913 return thread->th_bound_cluster_id;
3914 }
3915
3916 /* Forward declaration for some thread migration routines */
3917 static boolean_t sched_edge_foreign_running_thread_available(processor_set_t pset);
3918 static processor_set_t sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks, processor_t *processor_hint_out, sched_options_t *options_inout);
3919
3920 static_assert(sizeof(sched_clutch_edge) == sizeof(uint64_t), "sched_clutch_edge fits in 64 bits");
3921
3922 #define PERMISSIVE_MIGRATION_BUCKET (TH_BUCKET_FIXPRI)
3923
3924 /*
3925 * sched_edge_config_set()
3926 *
3927 * Support to update an edge configuration. Typically used by CLPC to affect thread migration
3928 * policies in the scheduler.
3929 */
3930 static void
3931 sched_edge_config_set(uint32_t src_cluster, uint32_t dst_cluster, sched_bucket_t bucket, sched_clutch_edge edge_config)
3932 {
3933 os_atomic_store(&pset_for_id(src_cluster)->sched_edges[dst_cluster][bucket], edge_config, relaxed);
3934 }
3935
3936 /*
3937 * sched_edge_config_get()
3938 *
3939 * Support to get an edge configuration. Typically used by CLPC to query edge configs to decide
3940 * if it needs to update edges.
3941 */
3942 static sched_clutch_edge
3943 sched_edge_config_get(uint32_t src_cluster, uint32_t dst_cluster, sched_bucket_t bucket)
3944 {
3945 return os_atomic_load(&pset_array[src_cluster]->sched_edges[dst_cluster][bucket], relaxed);
3946 }
3947
3948 /*
3949 * sched_edge_config_pset_push()
3950 *
3951 * After using sched_edge_config_set() to update edge tunables outgoing from a particular source
3952 * pset, this function should be called in order to propagate the updates to derived metadata for
3953 * the pset, such as search orders for outgoing spill and steal.
3954 */
3955 static void
3956 sched_edge_config_pset_push(uint32_t src_pset_id)
3957 {
3958 processor_set_t src_pset = pset_array[src_pset_id];
3959 uint8_t search_order_len = sched_num_psets - 1;
3960 sched_pset_search_order_sort_data_t search_order_datas[MAX_PSETS - 1];
3961 for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
3962 uint8_t dst_pset_id = 0;
3963 for (int i = 0; i < search_order_len; i++, dst_pset_id++) {
3964 if (dst_pset_id == src_pset->pset_id) {
3965 dst_pset_id++;
3966 }
3967 search_order_datas[i].spsosd_src_pset = src_pset;
3968 search_order_datas[i].spsosd_dst_pset_id = dst_pset_id;
3969 sched_clutch_edge edge = sched_edge_config_get(src_pset->pset_id, dst_pset_id, bucket);
3970 search_order_datas[i].spsosd_migration_weight = edge.sce_migration_allowed ?
3971 edge.sce_migration_weight : UINT32_MAX;
3972 }
3973 sched_pset_search_order_compute(&src_pset->spill_search_order[bucket],
3974 search_order_datas, search_order_len, sched_edge_search_order_weight_then_locality_cmp);
3975 }
3976 }
3977
3978 static int
3979 sched_edge_search_order_weight_then_locality(const void *a, const void *b)
3980 {
3981 const sched_pset_search_order_sort_data_t *data_a = (const sched_pset_search_order_sort_data_t *)a;
3982 const sched_pset_search_order_sort_data_t *data_b = (const sched_pset_search_order_sort_data_t *)b;
3983 assert3p(data_a->spsosd_src_pset, ==, data_b->spsosd_src_pset);
3984 assert3u(data_a->spsosd_dst_pset_id, !=, data_b->spsosd_dst_pset_id);
3985 /*
3986 * Sort based on lowest edge migration weight, followed by die-local psets
3987 * first, followed by lowest pset id.
3988 */
3989 if (data_a->spsosd_migration_weight != data_b->spsosd_migration_weight) {
3990 return (data_a->spsosd_migration_weight < data_b->spsosd_migration_weight) ? -1 : 1;
3991 }
3992
3993 bool is_local_a = bitmap_test(data_a->spsosd_src_pset->local_psets, data_a->spsosd_dst_pset_id);
3994 bool is_local_b = bitmap_test(data_b->spsosd_src_pset->local_psets, data_b->spsosd_dst_pset_id);
3995 if (is_local_a != is_local_b) {
3996 return is_local_a ? -1 : 1;
3997 }
3998
3999 if (data_a->spsosd_dst_pset_id != data_b->spsosd_dst_pset_id) {
4000 return (data_a->spsosd_dst_pset_id < data_b->spsosd_dst_pset_id) ? -1 : 1;
4001 }
4002 return 0;
4003 }
4004
4005 cmpfunc_t sched_edge_search_order_weight_then_locality_cmp = &sched_edge_search_order_weight_then_locality;
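/*
 * Example ordering under the comparator above (weights are assumed for
 * illustration): from a source pset with candidate destinations
 * {pset 1: weight 4, remote}, {pset 2: weight 4, die-local} and
 * {pset 3: migration disallowed, i.e. weight UINT32_MAX}, the computed
 * search order is pset 2, pset 1, pset 3: equal weights are broken in
 * favor of the die-local pset, and disallowed edges sort to the end.
 */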
4006
4007 #if DEVELOPMENT || DEBUG || SCHED_TEST_HARNESS
4008
4009 /*
4010 * sched_edge_config_verify_non_decreasing_qos_strictness()
4011 *
4012 * Routine to validate the assumption that higher QoSes
4013 * will be configured with the less restrictive migration
4014 * allowance for each edge in the matrix. This allows
4015 * early-exiting searches when migration is disallowed for
4016 * a higher QoS edge.
4017 * Returns true if no violations were discovered.
4018 */
4019 static inline bool
4020 sched_edge_config_verify_non_decreasing_qos_strictness(
4021 pset_id_t src_id, pset_id_t dst_id, sched_bucket_t bucket)
4022 {
4023 if (bucket == PERMISSIVE_MIGRATION_BUCKET) {
4024 return true;
4025 }
4026 sched_clutch_edge edge = sched_edge_config_get(src_id, dst_id, bucket);
4027 sched_clutch_edge higher_bucket_edge = sched_edge_config_get(src_id, dst_id, bucket - 1);
4028 if ((edge.sce_migration_allowed && !higher_bucket_edge.sce_migration_allowed) ||
4029 (edge.sce_steal_allowed && !higher_bucket_edge.sce_steal_allowed)) {
4030 kprintf("warn: Edge matrix config violates non-decreasing strictness "
4031 "across buckets %u and %u for edge %u->%u\n",
4032 bucket - 1, bucket, src_id, dst_id);
4033 return false;
4034 }
4035 return true;
4036 }
4037
4038 static bool
4039 sched_edge_config_verify_transitive_traverse(pset_id_t dst_id, pset_id_t curr_id,
4040 sched_bucket_t qos, bitmap_t *visited_map)
4041 {
4042 if (bitmap_test(visited_map, curr_id)) {
4043 /* Been there, done that */
4044 return true;
4045 }
4046 bitmap_set(visited_map, curr_id);
4047 bool pass = true;
4048 for (pset_id_t next_id = 0; next_id < sched_num_psets; next_id++) {
4049 if (next_id == curr_id) {
4050 continue;
4051 }
4052 sched_clutch_edge path_edge = sched_edge_config_get(next_id, curr_id, qos);
4053 if (path_edge.sce_migration_allowed) {
4054 /*
4055 * We have found a migration path from next_id to dst_id.
4056 * Verify that the direct edge agrees.
4057 */
4058 if (next_id != dst_id) {
4059 sched_clutch_edge direct_edge = sched_edge_config_get(next_id, dst_id, qos);
4060 if (!direct_edge.sce_migration_allowed || !direct_edge.sce_steal_allowed) {
4061 pass = false;
4062 kprintf("warn: Edge matrix config violates transitive property across "
4063 "psets %u->%u for scheduling bucket %u\n", next_id, dst_id, qos);
4064 }
4065 }
4066 /* DFS onward */
4067 pass = sched_edge_config_verify_transitive_traverse(dst_id, next_id, qos, visited_map) && pass;
4068 }
4069 }
4070 return pass;
4071 }
4072
4073 /*
4074 * sched_edge_config_verify_transitive()
4075 *
4076 * Routine to validate transitivity of the Edge matrix which
4077 * helps ensure that the configured migration policy minimizes
4078 * scheduling latency by allowing threads to directly spill to
4079 * idle cores where they are allowed to run, rather than
4080 * arrive on those cores only via steal operations.
4081 * Returns true if no violations were discovered.
4082 */
4083 static bool
4084 sched_edge_config_verify_transitive(pset_id_t dst_id)
4085 {
4086 bool pass = true;
4087 for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
4088 /*
4089 * Depth-first-search paths to get to the destination pset,
4090 * and verify that each path also has a matching direct edge
4091 * from start to finish.
4092 */
4093 bitmap_t visited_map[BITMAP_LEN(MAX_PSETS)] = {0};
4094 pass = sched_edge_config_verify_transitive_traverse(dst_id, dst_id, bucket, visited_map) && pass;
4095 }
4096 return pass;
4097 }
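/*
 * Concrete reading of the transitivity check above (cluster ids are
 * illustrative): if, for some bucket, migration is allowed along A -> B
 * and along B -> C, then the direct A -> C edge must allow both migration
 * and steal; otherwise a thread could only reach C's idle cores through a
 * chain of intermediate spills/steals, which is the latency problem this
 * verification is meant to flag.
 */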
4098
4099 /*
4100 * sched_edge_config_verify()
4101 *
4102 * Performs checks to validate assumed properties of the Edge matrix,
4103 * such as transitivity.
4104 * Returns true if no violations were discovered.
4105 */
4106 static bool
4107 sched_edge_config_verify(void)
4108 {
4109 bool pass = true;
4110 sched_edge_matrix_iterate(src_id, dst_id, bucket, { \
4111 pass = sched_edge_config_verify_non_decreasing_qos_strictness(src_id, dst_id, bucket) && pass;
4112 });
4113 for (pset_id_t dst_id = 0; dst_id < sched_num_psets; dst_id++) {
4114 pass = sched_edge_config_verify_transitive(dst_id) && pass;
4115 }
4116 return pass;
4117 }
4118
4119 #endif /* DEVELOPMENT || DEBUG || SCHED_TEST_HARNESS */
4120
4121 /*
4122 * sched_edge_config_final_push()
4123 *
4124 * After using sched_edge_config_set() to update edge tunables outgoing from every pset,
4125 * this function is called in order to propagate the updates to derived global metadata,
4126 * such as short-cut bitmasks.
4127 */
4128 static void
4129 sched_edge_config_final_push(void)
4130 {
4131 for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
4132 for (pset_id_t dst_id = 0; dst_id < sched_num_psets; dst_id++) {
4133 bitmap_t updated_steal_map[BITMAP_LEN(MAX_PSETS)] = {0};
4134 for (pset_id_t src_id = 0; src_id < sched_num_psets; src_id++) {
4135 sched_clutch_edge edge = sched_edge_config_get(src_id, dst_id, bucket);
4136 if ((dst_id == src_id) || edge.sce_migration_allowed) {
4137 bitmap_set(updated_steal_map, src_id);
4138 }
4139 }
4140 sched_clutch_root_t dst_root = &pset_array[dst_id]->pset_clutch_root;
4141 os_atomic_store(dst_root->scr_incoming_migration_allowed[bucket], updated_steal_map[0], relaxed);
4142 }
4143 }
4144 }
4145
4146 /*
4147 * sched_edge_matrix_set()
4148 *
4149 * Routine to update various edges in the edge migration graph. The edge_changed array
4150 * indicates which edges need to be updated. Both the edge_matrix and edge_changed arrays
4151 * are matrices with dimension num_psets * num_psets * TH_BUCKET_SCHED_MAX, flattened into a
4152 * single-dimensional array.
4153 */
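/*
 * Flattened index example (values chosen for illustration): with
 * num_psets == 3, the edge from src cluster 1 to dst cluster 2 for a
 * given bucket lives at index ((1 * 3) + 2) * TH_BUCKET_SCHED_MAX + bucket,
 * matching the src -> dst -> bucket nesting of the loops in
 * sched_edge_matrix_set() below.
 */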
4154 void
4155 sched_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changed, __unused uint64_t flags,
4156 __assert_only uint64_t num_psets)
4157 {
4158 assert3u(num_psets, ==, sched_num_psets);
4159 uint32_t edge_index = 0;
4160 for (uint32_t src_cluster = 0; src_cluster < sched_num_psets; src_cluster++) {
4161 for (uint32_t dst_cluster = 0; dst_cluster < sched_num_psets; dst_cluster++) {
4162 for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
4163 if (edge_changed[edge_index]) {
4164 sched_edge_config_set(src_cluster, dst_cluster, bucket, edge_matrix[edge_index]);
4165 }
4166 edge_index++;
4167 }
4168 }
4169 sched_edge_config_pset_push(src_cluster);
4170 }
4171 sched_edge_config_final_push();
4172 }
4173
4174 /*
4175 * sched_edge_matrix_get()
4176 *
4177 * Routine to retrieve various edges in the edge migration graph. The edge_requested array
4178 * indicates which edges need to be retrieved. Both the edge_matrix and edge_requested arrays
4179 * are matrices with dimension num_psets * num_psets * TH_BUCKET_SCHED_MAX, flattened into a
4180 * single-dimensional array.
4181 */
4182 void
4183 sched_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_requested, __unused uint64_t flags,
4184 __assert_only uint64_t num_psets)
4185 {
4186 assert3u(num_psets, ==, sched_num_psets);
4187 uint32_t edge_index = 0;
4188 for (uint32_t src_pset = 0; src_pset < sched_num_psets; src_pset++) {
4189 for (uint32_t dst_pset = 0; dst_pset < sched_num_psets; dst_pset++) {
4190 for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
4191 if (edge_requested[edge_index]) {
4192 edge_matrix[edge_index] = sched_edge_config_get(src_pset, dst_pset, bucket);
4193 }
4194 edge_index++;
4195 }
4196 }
4197 }
4198 }
4199
4200
4201 /*
4202 * sched_edge_init()
4203 *
4204 * Routine to initialize the data structures for the Edge scheduler.
4205 */
4206 static void
4207 sched_edge_init(void)
4208 {
4209 if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
4210 sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
4211 }
4212 sched_timeshare_init();
4213 sched_clutch_tunables_init();
4214 assert3s(sched_num_psets, >, 0);
4215 assert3s(sched_num_psets, <=, (int)MAX_PSETS);
4216 }
4217
4218 static void
4219 sched_edge_pset_init(processor_set_t pset)
4220 {
4221 uint32_t pset_cluster_id = pset->pset_cluster_id;
4222 pset->pset_type = pset_cluster_type_to_cluster_type(pset->pset_cluster_type);
4223 /* Each pset must declare an AMP type */
4224 assert(pset->pset_type != CLUSTER_TYPE_SMP);
4225
4226 /* Set the edge weight and properties for the pset itself */
4227 bitmap_clear(pset->foreign_psets, pset_cluster_id);
4228 bitmap_clear(pset->native_psets, pset_cluster_id);
4229 bitmap_clear(pset->local_psets, pset_cluster_id);
4230 bitmap_clear(pset->remote_psets, pset_cluster_id);
4231 bzero(&pset->sched_edges, sizeof(pset->sched_edges));
4232 bzero(&pset->max_parallel_cores, sizeof(pset->max_parallel_cores));
4233 bzero(&pset->max_parallel_clusters, sizeof(pset->max_parallel_clusters));
4234 for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
4235 sched_pset_search_order_init(pset, &pset->spill_search_order[bucket]);
4236 }
4237 sched_clutch_root_init(&pset->pset_clutch_root, pset);
4238 atomic_bitmap_set(sched_edge_available_pset_bitmask, pset_cluster_id, memory_order_relaxed);
4239 }
4240
4241 static boolean_t
4242 sched_edge_processor_queue_empty(processor_t processor)
4243 {
4244 return (sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0) &&
4245 (sched_clutch_bound_runq(processor)->count == 0);
4246 }
4247
4248 static void
4249 sched_edge_check_spill(__unused processor_set_t pset, __unused thread_t thread)
4250 {
4251 assert(thread->bound_processor == PROCESSOR_NULL);
4252 }
4253
4254 __options_decl(sched_edge_thread_yield_reason_t, uint32_t, {
4255 SCHED_EDGE_YIELD_RUNQ_NONEMPTY = 0x0,
4256 /* SCHED_EDGE_YIELD_FOREIGN_RUNNABLE = 0x1, unused */
4257 SCHED_EDGE_YIELD_FOREIGN_RUNNING = 0x2,
4258 SCHED_EDGE_YIELD_STEAL_POSSIBLE = 0x3,
4259 SCHED_EDGE_YIELD_DISALLOW = 0x4,
4260 });
4261
4262 /*
4263 * sched_edge_thread_should_yield()
4264 *
4265 * Routine for a fast-path decision on whether to proceed with
4266 * depressing the priority of (and potentially preempting) a
4267 * yielding thread.
4268 * Called with preemption disabled but WITHOUT the pset lock held.
4269 */
4270 static bool
4271 sched_edge_thread_should_yield(processor_t processor, __unused thread_t thread)
4272 {
4273 /* Self runqueue case exactly matches sched_thread_should_yield() */
4274 if (!sched_edge_processor_queue_empty(processor) || (rt_runq_count(processor->processor_set) > 0)) {
4275 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
4276 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_RUNQ_NONEMPTY);
4277 return true;
4278 }
4279
4280 /* Scan for running rebalance opportunity */
4281 if (sched_edge_foreign_running_thread_available(processor->processor_set)) {
4282 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
4283 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNING);
4284 return true;
4285 }
4286
4287 /* Scan for steal opportunity */
4288 sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
4289 uint64_t try_all_mask = ~0ULL;
4290 while (sched_iterate_psets_ordered(processor->processor_set,
4291 &processor->processor_set->spill_search_order[TH_BUCKET_FIXPRI], try_all_mask, &istate)) {
4292 processor_set_t target_pset = pset_array[istate.spis_pset_id];
4293 if (sched_edge_pset_peek_steal_possible(target_pset, processor->processor_set, try_all_mask)) {
4294 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
4295 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_STEAL_POSSIBLE);
4296 return true;
4297 }
4298 }
4299
4300 /*
4301 * Note, the current yield policy in thread_select() does NOT attempt
4302 * to steal or rebalance before falling back to continue running the
4303 * yielding thread.
4304 */
4305 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
4306 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_DISALLOW);
4307 return false;
4308 }
4309
4310 #if !SCHED_TEST_HARNESS
4311
4312 static void
4313 sched_edge_processor_queue_shutdown(processor_t processor, struct pulled_thread_queue * threadq)
4314 {
4315 processor_set_t pset = processor->processor_set;
4316 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
4317
4318 /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
4319 if (pset->online_processor_count == 0 || !pset_is_recommended(pset)) {
4320 atomic_bitmap_clear(sched_edge_available_pset_bitmask, pset->pset_id, memory_order_relaxed);
4321
4322 while (sched_clutch_root_count(pset_clutch_root) > 0) {
4323 thread_t thread = sched_clutch_hierarchy_thread_highest(pset_clutch_root,
4324 processor, NULL, SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY);
4325 pulled_thread_queue_enqueue(threadq, thread);
4326 }
4327 }
4328
4329 pset_unlock(pset);
4330 }
4331
4332 #endif /* !SCHED_TEST_HARNESS */
4333
4334 /*
4335 * The Edge scheduler uses average scheduling latency as the metric for making
4336 * thread migration decisions. One component of avg scheduling latency is the load
4337 * average on the cluster.
4338 *
4339 * Load Average Fixed Point Arithmetic
4340 *
4341 * The load average is maintained as a 24.8 fixed point arithmetic value for precision.
4342 * When multiplied by the average execution time, it needs to be rounded up (based on
4343 * the most significant bit of the fractional part) for better accuracy. After rounding
4344 * up, the whole number part of the value is used as the actual load value for
4345 * migrate/steal decisions.
4346 */
4347 #define SCHED_PSET_LOAD_EWMA_FRACTION_BITS 8
4348 #define SCHED_PSET_LOAD_EWMA_ROUND_BIT (1 << (SCHED_PSET_LOAD_EWMA_FRACTION_BITS - 1))
4349 #define SCHED_PSET_LOAD_EWMA_FRACTION_MASK ((1 << SCHED_PSET_LOAD_EWMA_FRACTION_BITS) - 1)
4350 #define SCHED_PSET_LOAD_EWMA_TC_NSECS 10000000u
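/*
 * Worked example of the 24.8 rounding described above (numbers are
 * illustrative): a stored load average of 0x2C0 encodes 2.75; adding
 * SCHED_PSET_LOAD_EWMA_ROUND_BIT (0x80) gives 0x340, and shifting right by
 * SCHED_PSET_LOAD_EWMA_FRACTION_BITS yields 3, which is the whole-number
 * load multiplied by the average execution time in
 * sched_edge_get_pset_load_average() below.
 */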
4351
4352 inline static int
4353 sched_edge_get_pset_load_average(processor_set_t pset, sched_bucket_t sched_bucket)
4354 {
4355 uint64_t load_average = os_atomic_load(&pset->pset_load_average[sched_bucket], relaxed);
4356 uint64_t avg_execution_time = os_atomic_load(&pset->pset_execution_time[sched_bucket].pset_avg_thread_execution_time, relaxed);
4357 /*
4358 * Since a load average of 0 indicates an idle cluster, don't allow an average
4359 * execution time less than 1us to cause a cluster to appear idle.
4360 */
4361 avg_execution_time = MAX(avg_execution_time, 1ULL);
4362 return (int)(((load_average + SCHED_PSET_LOAD_EWMA_ROUND_BIT) >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS) * avg_execution_time);
4363 }
4364
4365 /*
4366 * sched_edge_pset_running_higher_bucket()
4367 *
4368 * Routine to calculate cumulative running counts for each scheduling
4369 * bucket. This effectively lets the load calculation determine whether a
4370 * cluster is running any threads at a QoS lower than the thread being
4371 * migrated etc.
4372 */
4373 static void
4374 sched_edge_pset_running_higher_bucket(processor_set_t pset, uint32_t *running_higher)
4375 {
4376 bitmap_t *active_map = &pset->cpu_state_map[PROCESSOR_RUNNING];
4377 bzero(running_higher, sizeof(uint32_t) * TH_BUCKET_SCHED_MAX);
4378
4379 /* Count the running threads per bucket */
4380 for (int cpu = bitmap_first(active_map, MAX_CPUS); cpu >= 0; cpu = bitmap_next(active_map, cpu)) {
4381 sched_bucket_t cpu_bucket = os_atomic_load(&pset->cpu_running_buckets[cpu], relaxed);
4382 /* Don't count idle threads */
4383 if (cpu_bucket < TH_BUCKET_SCHED_MAX) {
4384 running_higher[cpu_bucket]++;
4385 }
4386 }
4387
4388 /* Calculate the cumulative running counts as a prefix sum */
4389 for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX - 1; bucket++) {
4390 running_higher[bucket + 1] += running_higher[bucket];
4391 }
4392 }
4393
4394 /*
4395 * sched_edge_update_pset_load_average()
4396 *
4397 * Updates the load average for each sched bucket for a cluster.
4398 * This routine must be called with the pset lock held.
4399 */
4400 static void
4401 sched_edge_update_pset_load_average(processor_set_t pset, uint64_t curtime)
4402 {
4403 int avail_cpu_count = pset_available_cpu_count(pset);
4404 if (avail_cpu_count == 0) {
4405 /* Looks like the pset is not runnable any more; nothing to do here */
4406 return;
4407 }
4408
4409 /*
4410 * Edge Scheduler Optimization
4411 *
4412 * See if more callers of this routine can pass in timestamps to avoid the
4413 * mach_absolute_time() call here.
4414 */
4415
4416 if (!curtime) {
4417 curtime = mach_absolute_time();
4418 }
4419 uint64_t last_update = os_atomic_load(&pset->pset_load_last_update, relaxed);
4420 int64_t delta_ticks = curtime - last_update;
4421 if (delta_ticks < 0) {
4422 return;
4423 }
4424
4425 uint64_t delta_nsecs = 0;
4426 absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);
4427
4428 if (__improbable(delta_nsecs > UINT32_MAX)) {
4429 delta_nsecs = UINT32_MAX;
4430 }
4431
4432 /* Update the shared resource load on the pset */
4433 for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
4434 uint64_t shared_rsrc_runnable_load = sched_edge_shared_rsrc_runnable_load(&pset->pset_clutch_root, shared_rsrc_type);
4435 uint64_t shared_rsrc_running_load = bit_count(pset->cpu_running_cluster_shared_rsrc_thread[shared_rsrc_type]);
4436 uint64_t new_shared_load = shared_rsrc_runnable_load + shared_rsrc_running_load;
4437 uint64_t old_shared_load = os_atomic_xchg(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], new_shared_load, relaxed);
4438 if (old_shared_load != new_shared_load) {
4439 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_SHARED_LOAD) | DBG_FUNC_NONE, pset->pset_cluster_id, shared_rsrc_type, new_shared_load, shared_rsrc_running_load);
4440 }
4441 }
4442
4443 uint32_t running_higher[TH_BUCKET_SCHED_MAX];
4444 sched_edge_pset_running_higher_bucket(pset, running_higher);
4445
4446 for (sched_bucket_t sched_bucket = TH_BUCKET_FIXPRI; sched_bucket < TH_BUCKET_SCHED_MAX; sched_bucket++) {
4447 uint64_t old_load_average = os_atomic_load(&pset->pset_load_average[sched_bucket], relaxed);
4448 uint64_t old_load_average_factor = old_load_average * SCHED_PSET_LOAD_EWMA_TC_NSECS;
4449 uint32_t current_runq_depth = sched_edge_cluster_cumulative_count(&pset->pset_clutch_root, sched_bucket) + rt_runq_count(pset) + running_higher[sched_bucket];
4450 os_atomic_store(&pset->pset_runnable_depth[sched_bucket], current_runq_depth, relaxed);
4451
4452 uint32_t current_load = current_runq_depth / avail_cpu_count;
4453 /*
4454 * For the new load average multiply current_load by delta_nsecs (which results in a 32.0 value).
4455 * Since we want to maintain the load average as a 24.8 fixed arithmetic value for precision, the
4456 * new load average needs to be shifted before it can be added to the old load average.
4457 */
4458 uint64_t new_load_average_factor = (current_load * delta_nsecs) << SCHED_PSET_LOAD_EWMA_FRACTION_BITS;
4459
4460 /*
4461 * For extremely parallel workloads, it is important that the load average on a cluster moves from zero to non-zero
4462 * instantly to allow threads to be migrated to other (potentially idle) clusters quickly. Hence use the EWMA
4463 * when the system is already loaded; otherwise for an idle system use the latest load average immediately.
4464 */
4465 int old_load_shifted = (int)((old_load_average + SCHED_PSET_LOAD_EWMA_ROUND_BIT) >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
4466 boolean_t load_uptick = (old_load_shifted == 0) && (current_load != 0);
4467 boolean_t load_downtick = (old_load_shifted != 0) && (current_load == 0);
4468 uint64_t load_average;
4469 if (load_uptick || load_downtick) {
4470 load_average = (current_load << SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
4471 } else {
4472 /* Indicates a loaded system; use EWMA for load average calculation */
4473 load_average = (old_load_average_factor + new_load_average_factor) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
4474 }
4475 os_atomic_store(&pset->pset_load_average[sched_bucket], load_average, relaxed);
4476 if (load_average != old_load_average) {
4477 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_LOAD_AVG) | DBG_FUNC_NONE, pset->pset_cluster_id, (load_average >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS), load_average & SCHED_PSET_LOAD_EWMA_FRACTION_MASK, sched_bucket);
4478 os_atomic_store(&pset->pset_load_last_update, curtime, relaxed);
4479 }
4480 }
4481 os_atomic_store(&pset->pset_load_last_update, curtime, relaxed);
4482 }
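/*
 * Illustrative numbers for the uptick/EWMA split above (assumed, not from
 * the source): if a bucket's load average was 0 and three runnable threads
 * appear on a 2-CPU pset, current_load is 3 / 2 = 1, load_uptick is true
 * and the stored value jumps straight to 1 << 8 = 256 (1.0 in 24.8 fixed
 * point) instead of being smoothed, so peer clusters see the load
 * immediately. A later update with current_load still 1 and delta_nsecs
 * equal to the 10ms time constant takes the EWMA path and keeps the value
 * at 256.
 */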
4483
4484 static void
4485 sched_edge_update_pset_avg_execution_time(processor_set_t pset, uint64_t execution_time, uint64_t curtime, sched_bucket_t sched_bucket)
4486 {
4487 pset_execution_time_t old_execution_time_packed, new_execution_time_packed;
4488 uint64_t avg_thread_execution_time = 0;
4489
4490 os_atomic_rmw_loop(&pset->pset_execution_time[sched_bucket].pset_execution_time_packed,
4491 old_execution_time_packed.pset_execution_time_packed,
4492 new_execution_time_packed.pset_execution_time_packed, relaxed, {
4493 uint64_t last_update = old_execution_time_packed.pset_execution_time_last_update;
4494 int64_t delta_ticks = curtime - last_update;
4495 if (delta_ticks <= 0) {
4496 /*
4497 * It's possible that another CPU came in and updated the pset_execution_time
4498 * before this CPU could do it. Since the average execution time is meant to
4499 * be an approximate measure per cluster, ignore the older update.
4500 */
4501 os_atomic_rmw_loop_give_up(return );
4502 }
4503 uint64_t delta_nsecs = 0;
4504 absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);
4505
4506 uint64_t nanotime = 0;
4507 absolutetime_to_nanoseconds(execution_time, &nanotime);
4508 uint64_t execution_time_us = nanotime / NSEC_PER_USEC;
4509
4510 /*
4511 * Since the average execution time is stored in microseconds, avoid rounding errors in
4512 * the EWMA calculation by only using a non-zero previous value.
4513 */
4514 uint64_t old_avg_thread_execution_time = MAX(old_execution_time_packed.pset_avg_thread_execution_time, 1ULL);
4515
4516 uint64_t old_execution_time = (old_avg_thread_execution_time * SCHED_PSET_LOAD_EWMA_TC_NSECS);
4517 uint64_t new_execution_time = (execution_time_us * delta_nsecs);
4518
4519 avg_thread_execution_time = (old_execution_time + new_execution_time) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
4520 new_execution_time_packed.pset_avg_thread_execution_time = avg_thread_execution_time;
4521 new_execution_time_packed.pset_execution_time_last_update = curtime;
4522 });
4523 if (new_execution_time_packed.pset_avg_thread_execution_time != old_execution_time_packed.pset_avg_thread_execution_time) {
4524 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_AVG_EXEC_TIME) | DBG_FUNC_NONE, pset->pset_cluster_id, avg_thread_execution_time, sched_bucket);
4525 }
4526 }
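/*
 * Illustrative EWMA check for the update above (assumed numbers): with a
 * previous average execution time of 100us, a newly observed execution
 * time of 300us and delta_nsecs equal to SCHED_PSET_LOAD_EWMA_TC_NSECS,
 * the loop computes (100 * 10M + 300 * 10M) / (10M + 10M) = 200us, i.e.
 * the average moves halfway toward the new sample when the time since the
 * last update matches the time constant.
 */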
4527
4528 /*
4529 * sched_edge_cluster_load_metric()
4530 *
4531 * The load metric for a cluster is a measure of the average scheduling latency
4532 * experienced by threads on that cluster. It is a product of the average number
4533 * of threads in the runqueue and the average execution time for threads. The metric
4534 * has special values in the following cases:
4535 * - UINT32_MAX: If the cluster is not available for scheduling, its load is set to
4536 * the maximum value to disallow any threads to migrate to this cluster.
4537 * - 0: If there are idle CPUs in the cluster or an empty runqueue; this allows threads
4538 * to be spread across the platform quickly for ncpu wide workloads.
4539 */
4540 static uint32_t
4541 sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket)
4542 {
4543 if (pset_is_recommended(pset) == false) {
4544 return UINT32_MAX;
4545 }
4546 return (uint32_t)sched_edge_get_pset_load_average(pset, sched_bucket);
4547 }
4548
4549 /*
4550 *
4551 * Edge Scheduler Steal/Rebalance logic
4552 *
4553 * = Generic scheduler logic =
4554 *
4555 * The SCHED(steal_thread) scheduler callout is invoked when the processor does not
4556 * find any thread for execution in its runqueue. The aim of the steal operation
4557 * is to find other threads running/runnable in other clusters which should be
4558 * executed here.
4559 *
4560 * If the steal callout does not return a thread, the thread_select() logic calls
4561 * SCHED(processor_balance) callout which is supposed to IPI other CPUs to rebalance
4562 * threads and idle out the current CPU.
4563 *
4564 * = SCHED(steal_thread) for Edge Scheduler =
4565 *
4566 * The edge scheduler hooks into sched_edge_processor_idle() for steal_thread. This
4567 * routine tries to do the following operations in order:
4568 * (1) Find foreign runnable threads in non-native cluster
4569 * runqueues (sched_edge_foreign_runnable_thread_remove())
4570 * (2) Check if foreign threads are running on the non-native
4571 * clusters (sched_edge_foreign_running_thread_available())
4572 * - If yes, return THREAD_NULL for the steal callout and
4573 * perform rebalancing as part of SCHED(processor_balance) i.e. sched_edge_balance()
4574 * (3) Steal a thread from another cluster based on edge
4575 * weights (sched_edge_steal_thread())
4576 *
4577 * = SCHED(processor_balance) for Edge Scheduler =
4578 *
4579 * If steal_thread did not return a thread for the processor, use
4580 * sched_edge_balance() to rebalance foreign running threads and idle out this CPU.
4581 *
4582 * = Clutch Bucket Preferred Cluster Overrides =
4583 *
4584 * Since these operations (just like thread migrations on enqueue)
4585 * move threads across clusters, they need support for handling clutch
4586 * bucket group level preferred pset recommendations.
4587 * For (1), a clutch bucket will be enqueued in the corresponding steal
4588 * silo and queue based on its preferred pset and scheduling bucket
4589 * respectively.
4590 * For (2), the running thread will set the bit on the processor based
4591 * on its preferred cluster type.
4592 * For (3), the edge configuration would prevent threads from being stolen
4593 * in the wrong direction.
4594 *
4595 * = SCHED(thread_should_yield) =
4596 * The thread_should_yield() logic should remain close to matching what
4597 * thread_select() would do for a yielding thread. Note, cases where
4598 * thread_should_yield() answers "yes" but thread_select() does not
4599 * context-switch out the yielding thread still result in a transient
4600 * priority drop for the yielding thread (not to mention timing effects
4601 * from choosing to consult thread_select()), which could racily affect
4602 * migration decisions happening from other cores.
4603 */
4604
4605 static bool
4606 sched_edge_steal_thread_enabled(__unused processor_set_t pset)
4607 {
4608 return true;
4609 }
4610
4611 /*
4612 * sched_edge_pset_peek_steal_possible()
4613 *
4614 * Routine to fast-path evaluate whether the steal_from_pset may
4615 * contain threads eligible to be stolen to the idle_pset.
4616 * Can be called WITHOUT either pset locked.
4617 */
4618 static inline bool
4619 sched_edge_pset_peek_steal_possible(
4620 processor_set_t steal_from_pset,
4621 processor_set_t idle_pset,
4622 bitmap_t silos_filter)
4623 {
4624 bitmap_t populated_silos =
4625 os_atomic_load(steal_from_pset->pset_clutch_root.scr_populated_steal_silos, relaxed);
4626 bitmap_t permissive_migration_allowed_map =
4627 os_atomic_load(idle_pset->pset_clutch_root.scr_incoming_migration_allowed[PERMISSIVE_MIGRATION_BUCKET], relaxed);
4628 bitmap_t eligible_silos = silos_filter & populated_silos & permissive_migration_allowed_map;
4629 if (eligible_silos == 0) {
4630 /* No eligible silos that contain threads */
4631 return false;
4632 }
4633 for (int silo_id = lsb_first(eligible_silos); silo_id >= 0; silo_id = lsb_next(eligible_silos, silo_id)) {
4634 sched_edge_steal_silo_t steal_silo =
4635 sched_edge_steal_silo_from_pset_id((pset_id_t)silo_id, &steal_from_pset->pset_clutch_root);
4636 bitmap_t populated_queues = os_atomic_load(steal_silo->sess_populated_steal_queues, relaxed);
4637 int highest_populated_bucket = lsb_first(populated_queues);
4638 if (highest_populated_bucket != -1) {
4639 sched_clutch_edge silo_edge =
4640 sched_edge_config_get(silo_id, idle_pset->pset_id, highest_populated_bucket);
4641 if (silo_edge.sce_steal_allowed || (silo_id == idle_pset->pset_id)) {
4642 /* Found eligible candidate */
4643 return true;
4644 }
4645 }
4646 }
4647 /* Silos only contain threads of QoSes not allowed to be stolen across the edge */
4648 return false;
4649 }
4650
4651
4652 /*
4653 * Configurable behaviors when looking for threads to steal
4654 * out of a particular pset.
4655 */
4656 __options_decl(sched_edge_steal_options_t, uint8_t, {
4657 SCHED_EDGE_STEAL_OPTIONS_NONE = 0x0,
4658 /* Only steal when there are more threads at the QoS than CPUs in the pset */
4659 SCHED_EDGE_STEAL_OPTIONS_ONLY_EXCESS_LOAD = 0x1,
4660 });
4661
4662 /*
4663 * sched_edge_pset_steal_thread()
4664 *
4665 * Routine to return the highest QoS thread enqueued in
4666 * steal_from_pset which is eligible to be stolen to
4667 * idle_pset, based on the policy configured in steal_options
4668 * combined with the Edge matrix.
4669 * Always called with the steal_from_pset locked.
4670 */
4671 static thread_t
4672 sched_edge_pset_steal_thread(
4673 processor_set_t steal_from_pset,
4674 processor_set_t idle_pset,
4675 bitmap_t silos_filter,
4676 sched_edge_steal_options_t steal_options)
4677 {
4678 bitmap_t populated_silos =
4679 os_atomic_load(steal_from_pset->pset_clutch_root.scr_populated_steal_silos, relaxed);
4680 bitmap_t silos_to_search = populated_silos & silos_filter;
4681 thread_t highest_pri_thread = THREAD_NULL;
4682 sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
4683 while (sched_iterate_psets_ordered(idle_pset, &idle_pset->spill_search_order[TH_BUCKET_FIXPRI],
4684 silos_to_search, &istate)) {
4685 int silo_id = istate.spis_pset_id;
4686 sched_edge_steal_silo_t steal_silo =
4687 sched_edge_steal_silo_from_pset_id(silo_id, &steal_from_pset->pset_clutch_root);
4688 bitmap_t populated_queues = os_atomic_load(steal_silo->sess_populated_steal_queues, relaxed);
4689 for (int bucket = lsb_first(populated_queues); bucket >= 0; bucket = lsb_next(populated_queues, bucket)) {
4690 sched_clutch_edge silo_edge = sched_edge_config_get(silo_id, idle_pset->pset_id, bucket);
4691 if ((silo_edge.sce_steal_allowed == false) && (silo_id != idle_pset->pset_id)) {
4692 /*
4693 * Stealing not allowed to the idle_pset for threads of this QoS and
4694 * recommended to this silo.
4695 * Assume that a higher QoS disallowing steal implies the same for
4696 * all lower QoSes.
4697 */
4698 break;
4699 }
4700 if (steal_options & SCHED_EDGE_STEAL_OPTIONS_ONLY_EXCESS_LOAD) {
4701 if (silo_edge.sce_migration_weight != 0) {
4702 uint32_t candidate_runq_depth = os_atomic_load(&steal_from_pset->pset_runnable_depth[bucket], relaxed);
4703 if (candidate_runq_depth <= pset_available_cpu_count(steal_from_pset)) {
4704 /* No excess threads at or above this bucket */
4705 continue;
4706 }
4707 }
4708 }
4709 /* Thread candidate found */
4710 struct priority_queue_sched_max *steal_queue = &steal_silo->sess_steal_queues[bucket];
4711 sched_clutch_bucket_t clutch_bucket = priority_queue_max(steal_queue, struct sched_clutch_bucket, scb_stealqlink);
4712 thread_t thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
4713 /* Bias ties in favor of psets earlier in the search order */
4714 if ((highest_pri_thread == THREAD_NULL) || (thread->sched_pri > highest_pri_thread->sched_pri)) {
4715 highest_pri_thread = thread;
4716 }
4717 /* Since this thread is from the highest eligible QoS we found in this silo, move on to search other silos */
4718 break;
4719 }
4720 }
4721 return highest_pri_thread;
4722 }
4723
4724 static thread_t
4725 sched_edge_foreign_runnable_thread_remove(processor_set_t idle_pset, uint64_t ctime)
4726 {
4727 thread_t thread = THREAD_NULL;
4728
4729 /*
4730 * Search all the psets that are foreign for the idle_pset,
4731 * iterating in reverse spill order to prioritize rescuing
4732 * threads from their least desired, most "distant" spill
4733 * location.
4734 */
4735 sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
4736 istate.spis_options = SCHED_PSET_ITERATE_STATE_OPTIONS_REVERSE;
4737 while (sched_iterate_psets_ordered(idle_pset, &idle_pset->spill_search_order[PERMISSIVE_MIGRATION_BUCKET],
4738 idle_pset->foreign_psets[0], &istate)) {
4739 processor_set_t target_pset = pset_array[istate.spis_pset_id];
4740 /*
4741 * For each pset, see if there are any runnable foreign threads.
4742 * This check is currently being done without the pset lock to make it cheap for
4743 * the common case.
4744 */
4745 pset_node_t dst_node = pset_node_for_pset_cluster_type(idle_pset->pset_cluster_type);
4746 if (!sched_edge_pset_peek_steal_possible(target_pset, idle_pset, dst_node->pset_map)) {
4747 continue;
4748 }
4749 /*
4750 * Looks like there are runnable foreign threads in the hierarchy; lock the pset
4751 * and get the highest priority thread.
4752 */
4753 pset_lock(target_pset);
4754 thread = sched_edge_pset_steal_thread(target_pset, idle_pset, dst_node->pset_map,
4755 SCHED_EDGE_STEAL_OPTIONS_NONE);
4756 if (thread != THREAD_NULL) {
4757 sched_clutch_thread_remove(&target_pset->pset_clutch_root, thread, ctime, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
4758 SCHED(update_pset_load_average)(target_pset, ctime);
4759 }
4760 pset_unlock(target_pset);
4761
4762 /*
4763 * Edge Scheduler Optimization
4764 *
4765 * The current implementation immediately returns as soon as it finds a foreign
4766 * runnable thread. This could be enhanced to look at highest priority threads
4767 * from all foreign clusters and pick the highest amongst them. That would need
4768 * some form of global state across psets to make that kind of a check cheap.
4769 */
4770 if (thread != THREAD_NULL) {
4771 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNABLE) | DBG_FUNC_NONE, thread_tid(thread), idle_pset->pset_id, target_pset->pset_id, 0);
4772 break;
4773 }
4774 /* Looks like the thread escaped after the check but before the pset lock was taken; continue the search */
4775 }
4776
4777 return thread;
4778 }
4779
4780 /*
4781 * sched_edge_cpu_running_foreign_shared_rsrc_available()
4782 *
4783 * Routine to determine if the thread running on a CPU is a shared resource thread
4784 * and can be rebalanced to the cluster with an idle CPU. It is used to determine if
4785 * a CPU going idle on a pset should rebalance a running shared resource heavy thread
4786 * from another non-ideal cluster, based on the idle pset's shared resource load.
4787 */
4788 static boolean_t
4789 sched_edge_cpu_running_foreign_shared_rsrc_available(processor_set_t target_pset, int foreign_cpu, processor_set_t idle_pset)
4790 {
4791 boolean_t idle_pset_shared_rsrc_rr_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_RR);
4792 if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_RR], foreign_cpu) && !idle_pset_shared_rsrc_rr_idle) {
4793 return false;
4794 }
4795
4796 boolean_t idle_pset_shared_rsrc_biu_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST);
4797 if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST], foreign_cpu) && !idle_pset_shared_rsrc_biu_idle) {
4798 return false;
4799 }
4800 return true;
4801 }
4802
4803 static boolean_t
4804 sched_edge_foreign_running_thread_available(processor_set_t pset)
4805 {
4806 bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
4807 sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
4808 while (sched_iterate_psets_ordered(pset, &pset->spill_search_order[PERMISSIVE_MIGRATION_BUCKET], foreign_pset_bitmap[0], &istate)) {
4809 /* Skip the pset if it's not schedulable */
4810 processor_set_t target_pset = pset_array[istate.spis_pset_id];
4811 if (pset_is_recommended(target_pset) == false) {
4812 continue;
4813 }
4814
4815 uint64_t running_foreign_bitmap = target_pset->cpu_state_map[PROCESSOR_RUNNING] & target_pset->cpu_running_foreign;
4816 for (int cpu_foreign = bit_first(running_foreign_bitmap); cpu_foreign >= 0; cpu_foreign = bit_next(running_foreign_bitmap, cpu_foreign)) {
4817 if (sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpu_foreign, pset)) {
4818 return true;
4819 }
4820 }
4821 }
4822 return false;
4823 }
4824
4825 static thread_t
4826 sched_edge_steal_thread(processor_set_t idle_pset, uint64_t candidate_pset_bitmap)
4827 {
4828 thread_t stolen_thread = THREAD_NULL;
4829
4830 /*
4831 * Edge Scheduler Optimization
4832 *
4833 * The logic today bails as soon as it finds a cluster where the cluster load is
4834 * greater than the edge weight. Maybe it should have a more advanced version
4835 * which looks for the maximum delta etc.
4836 */
4837 sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
4838 while (sched_iterate_psets_ordered(idle_pset, &idle_pset->spill_search_order[PERMISSIVE_MIGRATION_BUCKET], candidate_pset_bitmap, &istate)) {
4839 processor_set_t steal_from_pset = pset_array[istate.spis_pset_id];
4840 bitmap_t migration_allowed_map =
4841 os_atomic_load(idle_pset->pset_clutch_root.scr_incoming_migration_allowed[PERMISSIVE_MIGRATION_BUCKET], relaxed);
4842 if (!sched_edge_pset_peek_steal_possible(steal_from_pset, idle_pset, migration_allowed_map)) {
4843 continue;
4844 }
4845 pset_lock(steal_from_pset);
4846
4847 sched_edge_steal_options_t steal_options = SCHED_EDGE_STEAL_OPTIONS_ONLY_EXCESS_LOAD;
4848 stolen_thread = sched_edge_pset_steal_thread(steal_from_pset, idle_pset, migration_allowed_map, steal_options);
4849
4850 if (stolen_thread != THREAD_NULL) {
4851 uint64_t current_timestamp = mach_absolute_time();
4852 sched_clutch_thread_remove(&steal_from_pset->pset_clutch_root, stolen_thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
4853 SCHED(update_pset_load_average)(steal_from_pset, current_timestamp);
4854 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STEAL) | DBG_FUNC_NONE, thread_tid(stolen_thread), idle_pset->pset_id, steal_from_pset->pset_id, 0);
4855 }
4856
4857 pset_unlock(steal_from_pset);
4858 if (stolen_thread != THREAD_NULL) {
4859 break;
4860 }
4861 }
4862 return stolen_thread;
4863 }
4864
4865 /*
4866 * sched_edge_processor_idle()
4867 *
4868 * The routine is the implementation for steal_thread() for the Edge scheduler.
4869 */
4870 static thread_t
4871 sched_edge_processor_idle(processor_set_t pset)
4872 {
4873 thread_t thread = THREAD_NULL;
4874
4875 uint64_t ctime = mach_absolute_time();
4876
4877 processor_t processor = current_processor();
4878 bit_clear(pset->pending_spill_cpu_mask, processor->cpu_id);
4879
4880 /* Each of the operations acquire the lock for the pset they target */
4881 pset_unlock(pset);
4882
4883 /* Find highest priority runnable thread on all non-native clusters */
4884 thread = sched_edge_foreign_runnable_thread_remove(pset, ctime);
4885 if (thread != THREAD_NULL) {
4886 return thread;
4887 }
4888
4889 /* Find highest priority runnable thread on all native clusters */
4890 thread = sched_edge_steal_thread(pset, pset->native_psets[0]);
4891 if (thread != THREAD_NULL) {
4892 return thread;
4893 }
4894
4895 /* Find foreign running threads to rebalance; the actual rebalance is done in sched_edge_balance() */
4896 boolean_t rebalance_needed = sched_edge_foreign_running_thread_available(pset);
4897 if (rebalance_needed) {
4898 return THREAD_NULL;
4899 }
4900
4901 /* No foreign-enqueued threads found; find a thread to steal from all clusters based on weights/loads etc. */
4902 thread = sched_edge_steal_thread(pset, pset->native_psets[0] | pset->foreign_psets[0]);
4903 return thread;
4904 }
4905
4906 /* Return true if this shared resource thread has a better cluster to run on */
4907 static bool
4908 sched_edge_shared_rsrc_migrate_possible(thread_t thread, processor_set_t preferred_pset, processor_set_t current_pset)
4909 {
4910 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4911 uint64_t current_pset_load = sched_edge_pset_cluster_shared_rsrc_load(current_pset, shared_rsrc_type);
4912 /*
4913 * Adjust the current pset load to discount the current thread only if the current pset is of the preferred pset type. This allows the
4914 * scheduler to rebalance threads from a non-preferred cluster to an idle cluster of the preferred type.
4915 *
4916 * Edge Scheduler Optimization
4917 * For multi-cluster machines, it might be useful to enhance this mechanism to migrate between clusters of the preferred type.
4918 */
4919 uint64_t current_pset_adjusted_load = (current_pset->pset_type != preferred_pset->pset_type) ? current_pset_load : (current_pset_load - 1);
4920
4921 uint64_t eligible_pset_bitmask = 0;
4922 if (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST) {
4923 /*
4924 * For the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy, the load balancing occurs
4925 * only among clusters native with the preferred cluster.
4926 */
4927 eligible_pset_bitmask = preferred_pset->native_psets[0];
4928 bit_set(eligible_pset_bitmask, preferred_pset->pset_cluster_id);
4929 } else {
4930 /* For EDGE_SHARED_RSRC_SCHED_POLICY_RR, the load balancing happens among all clusters */
4931 eligible_pset_bitmask = os_atomic_load(&sched_edge_available_pset_bitmask[0], relaxed);
4932 }
4933
4934 /* For each eligible cluster check if there is an under-utilized cluster; return true if there is */
4935 for (int cluster_id = bit_first(eligible_pset_bitmask); cluster_id >= 0; cluster_id = bit_next(eligible_pset_bitmask, cluster_id)) {
4936 if (cluster_id == current_pset->pset_cluster_id) {
4937 continue;
4938 }
4939 uint64_t cluster_load = sched_edge_pset_cluster_shared_rsrc_load(pset_array[cluster_id], shared_rsrc_type);
4940 if (current_pset_adjusted_load > cluster_load) {
4941 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHARED_RSRC_MIGRATE) | DBG_FUNC_NONE, current_pset_load, current_pset->pset_cluster_id, cluster_load, cluster_id);
4942 return true;
4943 }
4944 }
4945 return false;
4946 }
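/*
 * Illustrative example of the check above (assumed loads, not from the
 * original source): a shared resource thread running on a non-preferred
 * E-cluster with shared resource load 3 keeps its full adjusted load of 3,
 * since the discount only applies on clusters of the preferred type. If an
 * eligible cluster has shared resource load 1, then 3 > 1 and the routine
 * returns true, recommending migration; if every eligible cluster is at
 * load 3 or higher, it returns false.
 */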
4947
4948 /*
4949 * Stir-the-pot Registry:
4950 *
4951 * Global state tracking which cores currently have threads that
4952 * are ready to be stirred onto cores of the opposite type.
4953 *
4954 * The registry state updates are implemented with atomic transaction
4955 * operations rather than a global lock, in order to avoid the cost
4956 * of serializing some of the most frequent registry state update
4957 * callsites that depend on consistent speed--namely the
4958 * preemption check and context-switch paths. The most expensive
4959 * state update, in sched_edge_stir_the_pot_try_trigger_swap(), only
4960 * happens at quantum expiration, which should allow cheaper
4961 * operations at other callsites to win the race.
4962 */
4963 typedef unsigned __int128 sched_edge_stp_registry_t;
4964 _Atomic sched_edge_stp_registry_t sched_edge_stir_the_pot_global_registry = 0LL;
4965 #define SESTP_BITS_PER_CORE (2)
4966 #define SESTP_BIT_POS(cpu_id) ((sched_edge_stp_registry_t)(cpu_id * SESTP_BITS_PER_CORE))
4967 #define SESTP_MASK(cpu_id) ((sched_edge_stp_registry_t)mask(SESTP_BITS_PER_CORE) << SESTP_BIT_POS(cpu_id))
4968 static_assert((SESTP_BITS_PER_CORE * MAX_CPUS) <= (sizeof(sched_edge_stp_registry_t) * 8),
4969 "Global registry must fit per-core bits for each core");
4970
4971 #define SESTP_EXTRACT_STATE(registry, cpu_id) ((registry >> SESTP_BIT_POS(cpu_id)) & mask(SESTP_BITS_PER_CORE))
4972 #define SESTP_SET_STATE(registry, cpu_id, state) ((registry & ~SESTP_MASK(cpu_id)) | ((sched_edge_stp_registry_t)state << SESTP_BIT_POS(cpu_id)))
4973 __enum_decl(sched_edge_stp_state_t, uint8_t, {
4974 SCHED_EDGE_STP_NOT_WANT = 0,
4975 SCHED_EDGE_STP_REQUESTED = 1,
4976 SCHED_EDGE_STP_PENDING = 2,
4977 SCHED_EDGE_STP_MAX = SCHED_EDGE_STP_PENDING
4978 });
4979 static_assert(SCHED_EDGE_STP_MAX <= mask(SESTP_BITS_PER_CORE),
4980 "Per-core stir-the-pot request state must fit in per-core bits");
4981
4982 #if OS_ATOMIC_USE_LLSC
4983 #error "Expecting CAS implementation of os_atomic_rmw_loop()"
4984 #endif /* OS_ATOMIC_USE_LLSC */
4985
4986 static cpumap_t sched_edge_p_core_map = 0ULL;
4987 static cpumap_t sched_edge_non_p_core_map = 0ULL;
4988
4989 /*
4990 * In order to reduce the chance of picking the same CPUs over
4991 * and over unfairly for stir-the-pot swaps, use an offset value
4992 * for the lsb selection, which rotates by one index each time
4993 * the choice is evaluated.
4994 */
4995 static _Atomic uint64_t sched_edge_stp_selection_p_core_offset = 0;
4996 static _Atomic uint64_t sched_edge_stp_selection_non_p_core_offset = 0;
4997
4998 /*
4999 * sched_edge_stir_the_pot_try_trigger_swap()
5000 *
5001 * Search for an eligible swap candidate on the opposite core
5002 * type, and if one is found, initiate a swap for stir-the-pot.
5003 * From a P-core, initiating means sending an inbox message and IPI
5004 * to the swapping lower-performance core. When initiating a swap from
5005 * a lower-performance core, only an inbox message needs to be sent
5006 * to itself, naming the P-core to swap with.
5007 * If no eligible candidate is found, mark the current processor
5008 * as requesting stir-the-pot swap--that is unless a swap has already
5009 * been initiated for this core, in which case we should sit tight.
5010 * Thread lock must be held.
5011 */
5012 static inline int
5013 sched_edge_stir_the_pot_try_trigger_swap(thread_t thread)
5014 {
5015 processor_t self_processor = current_processor();
5016 int self_cpu = self_processor->cpu_id;
5017 /*
5018 * Prepare the core mask of candidate cores (of the opposite type),
5019 * and compute an offset where the candidate search should begin,
5020 * to avoid unfairly swapping with the same cores repeatedly.
5021 */
5022 cpumap_t swap_candidates_map;
5023 uint64_t offset;
5024 if (sched_edge_stir_the_pot_core_type_is_desired(self_processor->processor_set)) {
5025 swap_candidates_map = sched_edge_non_p_core_map;
5026 offset = os_atomic_inc_orig(&sched_edge_stp_selection_non_p_core_offset, relaxed);
5027 } else {
5028 swap_candidates_map = sched_edge_p_core_map;
5029 offset = os_atomic_inc_orig(&sched_edge_stp_selection_p_core_offset, relaxed);
5030 }
5031 int num_candidates = bit_count(swap_candidates_map);
5032 if (num_candidates == 0) {
5033 /* Too early in boot, no cores of opposite type */
5034 return -1;
5035 }
5036 int cpu_of_type_offset_ind = offset % num_candidates;
5037 int search_start_ind = lsb_first(swap_candidates_map);
5038 for (int i = 0; i < cpu_of_type_offset_ind; i++) {
5039 search_start_ind = lsb_next(swap_candidates_map, search_start_ind);
5040 assert3s(search_start_ind, !=, -1);
5041 }
5042 assert3s(search_start_ind, !=, -1);
5043 swap_candidates_map = bit_ror64(swap_candidates_map, search_start_ind);
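/*
 * Worked example (assumed values): with candidate CPUs 2-5
 * (swap_candidates_map = 0x3C) and offset = 7, num_candidates is 4, so
 * cpu_of_type_offset_ind = 7 % 4 = 3 and search_start_ind advances to CPU 5.
 * After the rotation, the scan below visits candidates in the order
 * 5, 2, 3, 4, and (rotid + search_start_ind) % 64 recovers the real CPU ids.
 */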
5044 /*
5045 * Search the registry for candidate cores of the opposite type which
5046 * have requested swap.
5047 */
5048 int swap_cpu;
5049 sched_edge_stp_registry_t old_registry, new_registry, intermediate_registry;
5050 sched_edge_stp_state_t self_state;
5051 /* BEGIN IGNORE CODESTYLE */
5052 os_atomic_rmw_loop(&sched_edge_stir_the_pot_global_registry,
5053 old_registry, new_registry, relaxed, {
5054 swap_cpu = -1;
5055 self_state = SESTP_EXTRACT_STATE(old_registry, self_cpu);
5056 if (self_state == SCHED_EDGE_STP_PENDING) {
5057 /*
5058 * Another core already initiated a swap with us, so we should
5059 * wait for that one to finish rather than initiate or request
5060 * a new one.
5061 */
5062 os_atomic_rmw_loop_give_up(break);
5063 }
5064 /* Scan candidates */
5065 for (int rotid = lsb_first(swap_candidates_map); rotid != -1; rotid = lsb_next(swap_candidates_map, rotid)) {
5066 int candidate_cpu = (rotid + search_start_ind) % 64; // un-rotate the bit
5067 sched_edge_stp_state_t candidate_state = SESTP_EXTRACT_STATE(old_registry, candidate_cpu);
5068 if (candidate_state == SCHED_EDGE_STP_REQUESTED) {
5069 sched_bucket_t candidate_qos = os_atomic_load(
5070 &processor_array[candidate_cpu]->processor_set->cpu_running_buckets[candidate_cpu], relaxed);
5071 if (candidate_qos == thread->th_sched_bucket) {
5072 /* Found a requesting candidate of matching QoS */
5073 swap_cpu = candidate_cpu;
5074 break;
5075 }
5076 }
5077 }
5078 if (swap_cpu == -1) {
5079 /* No candidates requesting swap, so mark this core as requesting */
5080 intermediate_registry = SESTP_SET_STATE(old_registry, self_cpu, SCHED_EDGE_STP_REQUESTED);
5081 } else {
5082 /*
5083 * Mark candidate core as selected/pending for swap, and mark
5084 * current CPU as not needing a swap anymore, since we will now
5085 * start one.
5086 */
5087 intermediate_registry = SESTP_SET_STATE(old_registry, self_cpu, SCHED_EDGE_STP_PENDING);
5088 intermediate_registry = SESTP_SET_STATE(intermediate_registry, swap_cpu, SCHED_EDGE_STP_PENDING);
5089 }
5090 new_registry = intermediate_registry;
5091 });
5092 /* END IGNORE CODESTYLE */
5093 /* Leave debug tracepoints for tracking any updates to registry state */
5094 if (self_state != SCHED_EDGE_STP_PENDING) {
5095 if (swap_cpu == -1) {
5096 if (self_state != SCHED_EDGE_STP_REQUESTED) {
5097 /* Now requesting */
5098 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) |
5099 DBG_FUNC_START, 0, self_cpu, cpu_of_type_offset_ind, 0);
5100 }
5101 } else {
5102 if (self_state == SCHED_EDGE_STP_REQUESTED) {
5103 /* Now pending */
5104 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) |
5105 DBG_FUNC_END, 1, self_cpu, cpu_of_type_offset_ind, 0);
5106 }
5107 int swap_state = SESTP_EXTRACT_STATE(old_registry, swap_cpu);
5108 if (swap_state == SCHED_EDGE_STP_REQUESTED) {
5109 /* Swap core now pending */
5110 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) |
5111 DBG_FUNC_END, 1, swap_cpu, cpu_of_type_offset_ind, 0);
5112 }
5113 }
5114 }
5115 if (swap_cpu != -1) {
5116 /* Initiate a stir-the-pot swap */
5117 assert3s(swap_cpu, <, ml_get_topology_info()->num_cpus);
5118 assert3s(swap_cpu, !=, self_processor->cpu_id);
5119 processor_t swap_processor = processor_array[swap_cpu];
5120 if (swap_processor == PROCESSOR_NULL) {
5121 /* Unlikely early boot initialization race */
5122 return -1;
5123 }
5124 assert3u(sched_edge_stir_the_pot_core_type_is_desired(swap_processor->processor_set), !=,
5125 sched_edge_stir_the_pot_core_type_is_desired(self_processor->processor_set));
5126 if (sched_edge_stir_the_pot_core_type_is_desired(self_processor->processor_set)) {
5127 /*
5128 * Send a message and IPI notification to the lower-performance
5129 * core we found which wants to swap, so it will know to send its
5130 * thread back here.
5131 */
5132 os_atomic_store(&swap_processor->stir_the_pot_inbox_cpu, self_cpu, relaxed);
5133 processor_set_t swap_pset = swap_processor->processor_set;
5134 pset_lock(swap_pset);
5135 sched_ipi_type_t ipi_type = sched_ipi_action(swap_processor, NULL,
5136 SCHED_IPI_EVENT_REBALANCE);
5137 pset_unlock(swap_pset);
5138 sched_ipi_perform(swap_processor, ipi_type);
5139 } else {
5140 /*
5141 * Send message to self to send this thread to the swap P-core. P-core
5142 * will clear its own pending state upon committing to the incoming swap
5143 * thread after that happens.
5144 */
5145 os_atomic_store(&self_processor->stir_the_pot_inbox_cpu, swap_cpu, relaxed);
5146 }
5147 }
5148 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) | DBG_FUNC_NONE,
5149 (swap_cpu != -1) ? 1 : 0, swap_cpu, old_registry, cpu_of_type_offset_ind);
5150 return swap_cpu;
5151 }
5152
5153 /*
5154 * sched_edge_stir_the_pot_clear_registry_entry()
5155 *
5156 * Mark the current CPU as NOT containing a thread which is eligible
5157 * to be swapped for stir-the-pot.
5158 * Preemption must be disabled.
5159 */
5160 void
5161 sched_edge_stir_the_pot_clear_registry_entry(void)
5162 {
5163 int self_cpu = current_processor()->cpu_id;
5164 sched_edge_stp_state_t self_state;
5165 sched_edge_stp_registry_t old_registry, new_registry;
5166 os_atomic_rmw_loop(&sched_edge_stir_the_pot_global_registry,
5167 old_registry, new_registry, relaxed, {
5168 self_state = SESTP_EXTRACT_STATE(old_registry, self_cpu);
5169 if (self_state == SCHED_EDGE_STP_NOT_WANT) {
5170 /* State already cleared, nothing to be done */
5171 os_atomic_rmw_loop_give_up(break);
5172 }
5173 new_registry = SESTP_SET_STATE(old_registry, self_cpu, SCHED_EDGE_STP_NOT_WANT);
5174 });
5175 if (self_state == SCHED_EDGE_STP_REQUESTED) {
5176 /* Request was cleared */
5177 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) | DBG_FUNC_END,
5178 2, self_cpu, 0, 0);
5179 }
5180 }
5181
5182 /*
5183 * sched_edge_stir_the_pot_set_registry_entry()
5184 *
5185 * Mark the current CPU as containing a thread which is eligible
5186 * to be swapped to a core of the opposite type for stir-the-pot.
5187 * Preemption must be disabled.
5188 */
5189 static inline void
5190 sched_edge_stir_the_pot_set_registry_entry(void)
5191 {
5192 int self_cpu = current_processor()->cpu_id;
5193 sched_edge_stp_state_t self_state;
5194 sched_edge_stp_registry_t old_registry, new_registry;
5195 bool newly_requested = os_atomic_rmw_loop(&sched_edge_stir_the_pot_global_registry,
5196 old_registry, new_registry, relaxed, {
5197 self_state = SESTP_EXTRACT_STATE(old_registry, self_cpu);
5198 if (self_state == SCHED_EDGE_STP_REQUESTED) {
5199 /* Core already registered, nothing to be done */
5200 os_atomic_rmw_loop_give_up(break);
5201 }
5202 new_registry = SESTP_SET_STATE(old_registry, self_cpu, SCHED_EDGE_STP_REQUESTED);
5203 });
5204 if (newly_requested) {
5205 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) | DBG_FUNC_START,
5206 3, self_cpu, self_state, 0);
5207 }
5208 }
5209
5210 /* Stir-the-pot is designed for sharing time on the P-cores */
5211 static inline bool
5212 sched_edge_stir_the_pot_core_type_is_desired(processor_set_t pset)
5213 {
5214 return pset->pset_type == CLUSTER_TYPE_P;
5215 }
5216
5217 /*
5218 * sched_edge_stir_the_pot_thread_eligible()
5219 *
5220 * Determine whether a thread is eligible to engage in a
5221 * stir-the-pot swap. It must be P-recommended, unbound, and not
5222 * round-robin shared resource. Additionally, it must have already
5223 * expired quantum on its current core type.
5224 */
5225 static inline bool
5226 sched_edge_stir_the_pot_thread_eligible(thread_t thread)
5227 {
5228 processor_set_t preferred_pset;
5229 if ((thread == THREAD_NULL) ||
5230 ((preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)]) == PROCESSOR_SET_NULL)) {
5231 /* Still initializing at boot */
5232 return false;
5233 }
5234 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
5235 bool right_kind_of_thread =
5236 sched_edge_stir_the_pot_core_type_is_desired(preferred_pset) &&
5237 (thread->sched_mode != TH_MODE_REALTIME) &&
5238 ((thread->state & TH_IDLE) == 0) &&
5239 SCHED_CLUTCH_THREAD_ELIGIBLE(thread) &&
5240 (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) == false) &&
5241 (shared_rsrc_type == CLUSTER_SHARED_RSRC_TYPE_NONE ||
5242 shared_rsrc_type == CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST);
5243 bool ready_for_swap = sched_edge_stir_the_pot_core_type_is_desired(current_processor()->processor_set) ?
5244 thread->th_expired_quantum_on_higher_core :
5245 thread->th_expired_quantum_on_lower_core;
5246 return right_kind_of_thread && ready_for_swap;
5247 }
5248
5249 /*
5250 * sched_edge_stir_the_pot_check_inbox_for_thread()
5251 *
5252 * Check whether this thread on a non-P-core has been chosen by a P-core to
5253 * swap places for stir-the-pot, optionally consuming the inbox message.
5254 * Preemption must be disabled.
5255 */
5256 static inline int
5257 sched_edge_stir_the_pot_check_inbox_for_thread(thread_t thread, bool consume_message)
5258 {
5259 processor_t self_processor = current_processor();
5260 int dst_cpu = -1;
5261 if (sched_edge_stir_the_pot_thread_eligible(thread)) {
5262 /* Thread can accept the inbox message */
5263 dst_cpu = os_atomic_load(&self_processor->stir_the_pot_inbox_cpu, relaxed);
5264 } else {
5265 /* Ensure registry state is cleared for ineligible thread, if it hasn't been already */
5266 sched_edge_stir_the_pot_clear_registry_entry();
5267 /*
5268 * Note, we don't clear a possible inbox message, in case an eligible
5269 * thread comes back on-core quickly to receive it.
5270 */
5271 }
5272 if (consume_message) {
5273 /*
5274 * Unconditionally clear inbox, since either we are triggering a
5275 * swap now or ultimately discarding the message because conditions
5276 * have changed (thread not eligible).
5277 */
5278 os_atomic_store(&self_processor->stir_the_pot_inbox_cpu, -1, relaxed);
5279 /*
5280 * We may have delayed requesting a stir-the-pot swap for the current thread
5281 * due to a pending inbox message for the previous thread. Now that such
5282 * a message has been received, finish updating the registry state.
5283 */
5284 if (sched_edge_stir_the_pot_thread_eligible(self_processor->active_thread)) {
5285 sched_edge_stir_the_pot_set_registry_entry();
5286 }
5287 }
5288 return dst_cpu;
5289 }
5290
5291 /*
5292 * sched_edge_stir_the_pot_update_registry_state()
5293 *
5294 * Update stir-the-pot state for the current processor based on its
5295 * (possibly new) current thread. This sets or clears the registry state
5296 * which indicates whether the processor is running a thread that wants
5297 * and is eligible to be swapped with a thread on the opposite core type.
5298 * Preemption must be disabled.
5299 */
5300 void
5301 sched_edge_stir_the_pot_update_registry_state(thread_t thread)
5302 {
5303 processor_t self_processor = current_processor();
5304 /*
5305 * Clear corresponding th_expired_quantum_on_ field now that thread
5306 * is getting a chance to run on the opposite type.
5307 */
5308 if (sched_edge_stir_the_pot_core_type_is_desired(self_processor->processor_set)) {
5309 thread->th_expired_quantum_on_lower_core = false;
5310 } else {
5311 thread->th_expired_quantum_on_higher_core = false;
5312 }
5313 if (sched_edge_stir_the_pot_thread_eligible(thread)) {
5314 int inbox_message = os_atomic_load(&self_processor->stir_the_pot_inbox_cpu, relaxed);
5315 if (inbox_message == -1) {
5316 /* Set the registry bit */
5317 sched_edge_stir_the_pot_set_registry_entry();
5318 } else {
5319 assert(sched_edge_stir_the_pot_core_type_is_desired(self_processor->processor_set) == false);
5320 /*
5321 * There's an inbox message which still needs to be used at the next
5322 * migration decision, so avoid starting a new request or clearing the
5323 * interim pending status until then.
5324 */
5325 }
5326 } else {
5327 /* Thread is ineligible for swap, so clear the registry bit */
5328 sched_edge_stir_the_pot_clear_registry_entry();
5329 }
5330 }
5331
5332 /*
5333 * sched_edge_quantum_expire()
5334 *
5335 * Update stir-the-pot eligibility and drive stir-the-pot swaps.
5336 * Thread lock must be held.
5337 */
5338 static void
5339 sched_edge_quantum_expire(thread_t thread)
5340 {
5341 if (sched_edge_stir_the_pot_core_type_is_desired(current_processor()->processor_set)) {
5342 thread->th_expired_quantum_on_higher_core = true;
5343 } else {
5344 thread->th_expired_quantum_on_lower_core = true;
5345 }
5346 if (sched_edge_stir_the_pot_thread_eligible(thread)) {
5347 sched_edge_stir_the_pot_try_trigger_swap(thread);
5348 }
5349 }
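/*
 * Stir-the-pot lifecycle sketch (summary of the routines above): a thread
 * that expires its quantum on a non-P core sets
 * th_expired_quantum_on_lower_core and, if eligible, registers its CPU as
 * SCHED_EDGE_STP_REQUESTED. When an eligible P-core thread later expires its
 * quantum, sched_edge_stir_the_pot_try_trigger_swap() finds the requesting
 * CPU, marks both CPUs SCHED_EDGE_STP_PENDING, and sends the lower-performance
 * core an inbox message plus a rebalance IPI. That core consumes the inbox
 * message at its next migration decision and sends its thread to the named
 * P-core, completing the swap.
 */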
5350
5351 /*
5352 * sched_edge_run_count_incr()
5353 *
5354 * Update runnable thread counts in the same way as
5355 * sched_clutch_run_incr(), and reset per-thread, quantum-
5356 * expired tracking used by stir-the-pot, as the thread
5357 * is unblocking.
5358 */
5359 static uint32_t
5360 sched_edge_run_count_incr(thread_t thread)
5361 {
5362 uint32_t new_count = sched_clutch_run_incr(thread);
5363 /* Thread is unblocking and so resets its quantum tracking */
5364 thread->th_expired_quantum_on_lower_core = false;
5365 thread->th_expired_quantum_on_higher_core = false;
5366 return new_count;
5367 }
5368
5369 /* Return true if this thread should not continue running on this processor */
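/*
 * Summary of the checks below (descriptive only): bound threads are never
 * avoided; on quantum expiry a thread may be moved off a CPU marked in the
 * perfcontrol migration bitmask when a non-avoided idle core exists;
 * cluster-bound threads are pushed toward their bound cluster; threads whose
 * edge no longer allows this cluster are preempted only if another
 * recommended, allowed pset exists; shared resource threads follow their
 * shared resource policy; realtime threads are otherwise left in place;
 * finally stir-the-pot, compaction onto an idle preferred cluster, and
 * running rebalance onto an idle core of the preferred type are evaluated.
 */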
5370 static bool
5371 sched_edge_thread_avoid_processor(processor_t processor, thread_t thread, ast_t reason)
5372 {
5373 if (thread->bound_processor == processor) {
5374 /* Thread is bound here */
5375 return false;
5376 }
5377
5378 /*
5379 * On quantum expiry, check the migration bitmask to see if this thread should be migrated off this core.
5380 * A migration is only recommended if there's also an idle core available that needn't be avoided.
5381 */
5382 if (reason & AST_QUANTUM) {
5383 if (bit_test(processor->processor_set->perfcontrol_cpu_migration_bitmask, processor->cpu_id)) {
5384 uint64_t non_avoided_idle_primary_map = processor->processor_set->cpu_state_map[PROCESSOR_IDLE] & processor->processor_set->recommended_bitmask & ~processor->processor_set->perfcontrol_cpu_migration_bitmask;
5385 if (non_avoided_idle_primary_map != 0) {
5386 return true;
5387 }
5388 }
5389 }
5390
5391 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
5392
5393 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) &&
5394 preferred_pset->pset_id != processor->processor_set->pset_id &&
5395 pset_type_is_recommended(preferred_pset)) {
5396 /* We should send this thread to the bound cluster */
5397 return true;
5398 }
5399
5400 sched_clutch_edge edge = (thread->sched_pri >= BASEPRI_RTQUEUES)
5401 ? sched_rt_config_get(preferred_pset->pset_cluster_id, processor->processor_set->pset_cluster_id)
5402 : sched_edge_config_get(preferred_pset->pset_cluster_id, processor->processor_set->pset_cluster_id, thread->th_sched_bucket);
5403 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) == false &&
5404 preferred_pset->pset_id != processor->processor_set->pset_id &&
5405 edge.sce_migration_allowed == false &&
5406 edge.sce_steal_allowed == false) {
5407 /*
5408 * Thread isn't allowed to be here, according to the edge migration graph.
5409 * Perhaps the thread's priority or boundness or its thread group's preferred
5410 * pset or the edge migration graph changed.
5411 *
5412 * We should only preempt after confirming the thread actually has a
5413 * recommended, allowed alternative pset to run on.
5414 */
5415 for (uint32_t pset_id = 0; pset_id < sched_num_psets; pset_id++) {
5416 if (pset_id == processor->processor_set->pset_id) {
5417 continue;
5418 }
5419 edge = (thread->sched_pri >= BASEPRI_RTQUEUES)
5420 ? sched_rt_config_get(preferred_pset->pset_id, pset_id)
5421 : sched_edge_config_get(preferred_pset->pset_id, pset_id, thread->th_sched_bucket);
5422 if (pset_is_recommended(pset_array[pset_id]) && ((pset_id == preferred_pset->pset_id) || edge.sce_migration_allowed)) {
5423 /* Thread can be run elsewhere. */
5424 return true;
5425 }
5426 }
5427 }
5428
5429 /* Evaluate shared resource policies */
5430 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
5431 return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
5432 }
5433 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
5434 if (processor->processor_set->pset_type != preferred_pset->pset_type &&
5435 pset_type_is_recommended(preferred_pset)) {
5436 return true;
5437 }
5438 return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
5439 }
5440
5441 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5442 return false;
5443 }
5444 /* ~~ No realtime or shared resource threads beyond this point ~~ */
5445
5446 /*
5447 * Stir-the-Pot:
5448 * A non-P-core should preempt if a P-core has been found to which the current,
5449 * quantum-expired thread can be swapped for stir-the-pot. This lets threads in a
5450 * multi-threaded workload share time on the P-cores so they make roughly equal
5451 * forward progress.
5452 */
5453 if (sched_edge_stir_the_pot_check_inbox_for_thread(thread, false) != -1) {
5454 return true;
5455 }
5456
5457 /*
5458 * Compaction:
5459 * If the preferred pset for the thread is now idle, try and migrate the thread to that cluster.
5460 */
5461 if ((processor->processor_set != preferred_pset) &&
5462 (sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket) == 0)) {
5463 return true;
5464 }
5465
5466 /*
5467 * Running Rebalance:
5468 * We are willing to preempt the thread in order to migrate it onto an idle core
5469 * of the preferred type.
5470 */
5471 if ((processor->processor_set->pset_type != preferred_pset->pset_type) &&
5472 pset_type_is_recommended(preferred_pset)) {
5473 /* Scan for idle pset */
5474 for (uint32_t pset_id = 0; pset_id < sched_num_psets; pset_id++) {
5475 processor_set_t candidate_pset = pset_array[pset_id];
5476 edge = sched_edge_config_get(preferred_pset->pset_id, pset_id, thread->th_sched_bucket);
5477 if ((candidate_pset->pset_type == preferred_pset->pset_type) &&
5478 edge.sce_migration_allowed &&
5479 (sched_edge_cluster_load_metric(candidate_pset, thread->th_sched_bucket) == 0)) {
5480 return true;
5481 }
5482 }
5483 }
5484
5485 return false;
5486 }
5487
5488 static bool
5489 sched_edge_balance(__unused processor_t cprocessor, processor_set_t cpset)
5490 {
5491 assert(cprocessor == current_processor());
5492 pset_unlock(cpset);
5493
5494 uint64_t ast_processor_map = 0;
5495 sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
5496
5497 bitmap_t *foreign_pset_bitmap = cpset->foreign_psets;
5498 for (int cluster = bitmap_first(foreign_pset_bitmap, sched_num_psets); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
5499 /* Skip the pset if it's not schedulable */
5500 processor_set_t target_pset = pset_array[cluster];
5501 if (pset_is_recommended(target_pset) == false) {
5502 continue;
5503 }
5504
5505 pset_lock(target_pset);
5506 uint64_t cpu_running_foreign_map = (target_pset->cpu_running_foreign & target_pset->cpu_state_map[PROCESSOR_RUNNING]);
5507 for (int cpuid = lsb_first(cpu_running_foreign_map); cpuid >= 0; cpuid = lsb_next(cpu_running_foreign_map, cpuid)) {
5508 if (!sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpuid, cpset)) {
5509 continue;
5510 }
5511 processor_t target_cpu = processor_array[cpuid];
5512 ipi_type[cpuid] = sched_ipi_action(target_cpu, NULL, SCHED_IPI_EVENT_REBALANCE);
5513 if (ipi_type[cpuid] != SCHED_IPI_NONE) {
5514 bit_set(ast_processor_map, cpuid);
5515 }
5516 }
5517 pset_unlock(target_pset);
5518 }
5519
5520 for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
5521 processor_t ast_processor = processor_array[cpuid];
5522 sched_ipi_perform(ast_processor, ipi_type[cpuid]);
5523 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNING) | DBG_FUNC_NONE, 0, cprocessor->cpu_id, cpuid, 0);
5524 }
5525
5526 /* Core should light-weight idle using WFE if it just sent out rebalance IPIs */
5527 return ast_processor_map != 0;
5528 }
5529
5530 /*
5531 * sched_edge_migration_check()
5532 *
5533 * Routine to evaluate an edge between two clusters to decide if migration is possible
5534 * across that edge. Also updates the selected_pset and max_edge_delta out parameters
5535 * accordingly. The return value indicates if the invoking routine should short circuit
5536 * the search, since an ideal candidate has been found. The routine looks at the regular
5537 * edges and cluster loads or the shared resource loads based on the type of thread.
5538 */
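/*
 * Worked example (assumed numbers): with a preferred cluster load of 10 at
 * the thread's QoS, a candidate cluster with load 0 is selected immediately.
 * A candidate with load 4 gives edge_delta = 6; for a regular thread that
 * candidate is only kept if 6 >= the edge's sce_migration_weight and
 * 6 >= the best delta found so far, with ties broken in favor of a cluster
 * homogeneous with the preferred one.
 */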
5539 static bool
5540 sched_edge_migration_check(uint32_t cluster_id, processor_set_t preferred_pset,
5541 uint32_t preferred_cluster_load, thread_t thread, processor_set_t *selected_pset, uint32_t *max_edge_delta)
5542 {
5543 uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
5544 cluster_type_t preferred_cluster_type = pset_type_for_id(preferred_cluster_id);
5545 processor_set_t dst_pset = pset_array[cluster_id];
5546 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
5547 bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
5548
5549 if (cluster_id == preferred_cluster_id) {
5550 return false;
5551 }
5552
5553 if (dst_pset == NULL) {
5554 return false;
5555 }
5556
5557 sched_clutch_edge edge = sched_edge_config_get(preferred_cluster_id, cluster_id, thread->th_sched_bucket);
5558 if (edge.sce_migration_allowed == false) {
5559 return false;
5560 }
5561 uint32_t dst_load = shared_rsrc_thread ? (uint32_t)sched_edge_pset_cluster_shared_rsrc_load(dst_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(dst_pset, thread->th_sched_bucket);
5562 if (dst_load == 0) {
5564 /* The candidate cluster is idle; select it immediately for execution */
5565 *selected_pset = dst_pset;
5566 *max_edge_delta = preferred_cluster_load;
5567 return true;
5568 }
5569
5570 uint32_t edge_delta = 0;
5571 if (dst_load > preferred_cluster_load) {
5572 return false;
5573 }
5574 edge_delta = preferred_cluster_load - dst_load;
5575 if (!shared_rsrc_thread && (edge_delta < edge.sce_migration_weight)) {
5576 /*
5577 * For non shared resource threads, use the edge migration weight to decide if
5578 * this cluster is over-committed at the QoS level of this thread.
5579 */
5580 return false;
5581 }
5582
5583 if (edge_delta < *max_edge_delta) {
5584 return false;
5585 }
5586 if (edge_delta == *max_edge_delta) {
5587 /* If the edge delta is the same as the max delta, make sure a homogeneous cluster is picked */
5588 boolean_t selected_homogeneous = ((*selected_pset)->pset_type == preferred_cluster_type);
5589 boolean_t candidate_homogeneous = (dst_pset->pset_type == preferred_cluster_type);
5590 if (selected_homogeneous || !candidate_homogeneous) {
5591 return false;
5592 }
5593 }
5594 /* dst_pset seems to be the best candidate for migration; however other candidates should still be evaluated */
5595 *max_edge_delta = edge_delta;
5596 *selected_pset = dst_pset;
5597 return false;
5598 }
5599
5600 /*
5601 * sched_edge_migrate_edges_evaluate()
5602 *
5603 * Routine to find the candidate for thread migration based on edge weights.
5604 *
5605 * Returns the most ideal cluster for execution of this thread based on outgoing edges of the preferred pset. Can
5606 * return preferred_pset if it's the most ideal destination for this thread.
5607 */
5608 static processor_set_t
5609 sched_edge_migrate_edges_evaluate(processor_set_t preferred_pset, uint32_t preferred_cluster_load, thread_t thread)
5610 {
5611 processor_set_t selected_pset = preferred_pset;
5612 uint32_t max_edge_delta = 0;
5613 bool search_complete = false;
5614 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
5615 bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
5616
5617 bitmap_t *foreign_pset_bitmap = preferred_pset->foreign_psets;
5618 bitmap_t *native_pset_bitmap = preferred_pset->native_psets;
5619 /* Always start the search with the native clusters */
5620 sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
5621 while (sched_iterate_psets_ordered(preferred_pset, &preferred_pset->spill_search_order[thread->th_sched_bucket], native_pset_bitmap[0], &istate)) {
5622 search_complete = sched_edge_migration_check(istate.spis_pset_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
5623 if (search_complete) {
5624 break;
5625 }
5626 }
5627
5628 if (search_complete) {
5629 return selected_pset;
5630 }
5631
5632 if (shared_rsrc_thread && (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST)) {
5633 /*
5634 * If the shared resource scheduling policy is EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST, the scheduler tries
5635 * to fill up the preferred cluster and its homogeneous peers first.
5636 */
5637
5638 if (max_edge_delta > 0) {
5639 /*
5640 * A positive delta means there is a peer cluster of the same type as the preferred cluster (since the code
5641 * above only looks at the native_psets) that is running fewer threads of this shared resource type than
5642 * the preferred cluster. This indicates that there is capacity on a native cluster where this thread
5643 * should be placed.
5644 */
5645 return selected_pset;
5646 }
5647 /*
5648 * Indicates that all peer native clusters are at the same shared resource usage; check if the preferred cluster has
5649 * any more capacity left.
5650 */
5651 if (sched_edge_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) < pset_available_cpu_count(preferred_pset)) {
5652 return preferred_pset;
5653 }
5654 /*
5655 * Looks like the preferred cluster and all its native peers are full with shared resource threads; need to start looking
5656 * at non-native clusters for capacity.
5657 */
5658 }
5659
5660 /* Now look at the non-native clusters */
5661 istate = SCHED_PSET_ITERATE_STATE_INIT;
5662 while (sched_iterate_psets_ordered(preferred_pset, &preferred_pset->spill_search_order[thread->th_sched_bucket], foreign_pset_bitmap[0], &istate)) {
5663 search_complete = sched_edge_migration_check(istate.spis_pset_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
5664 if (search_complete) {
5665 break;
5666 }
5667 }
5668 return selected_pset;
5669 }
5670
5671 /*
5672 * sched_edge_candidate_alternative()
5673 *
5674 * Routine to find an alternative cluster from candidate_cluster_bitmap since the
5675 * selected_pset is not available for execution. The logic tries to prefer homogeneous
5676 * clusters over heterogeneous clusters since this is typically used in thread
5677 * placement decisions.
5678 */
5679 _Static_assert(MAX_PSETS <= 64, "Unable to fit maximum number of psets in uint64_t bitmask");
5680 static processor_set_t
5681 sched_edge_candidate_alternative(processor_set_t selected_pset, uint64_t candidate_cluster_bitmap)
5682 {
5683 /*
5684 * It looks like the most ideal pset is not available for scheduling currently.
5685 * Try to find a homogeneous cluster that is still available.
5686 */
5687 uint64_t available_native_clusters = selected_pset->native_psets[0] & candidate_cluster_bitmap;
5688 int available_cluster_id = lsb_first(available_native_clusters);
5689 if (available_cluster_id == -1) {
5690 /* Looks like none of the homogeneous clusters are available; pick the first available cluster */
5691 available_cluster_id = bit_first(candidate_cluster_bitmap);
5692 }
5693 assert(available_cluster_id != -1);
5694 return pset_array[available_cluster_id];
5695 }
5696
5697 /*
5698 * sched_edge_switch_pset_lock()
5699 *
5700 * Helper routine for sched_edge_migrate_candidate() which switches pset locks (if needed) based on
5701 * switch_pset_locks.
5702 * Returns the newly locked pset after the switch.
5703 */
5704 static processor_set_t
5705 sched_edge_switch_pset_lock(processor_set_t selected_pset, processor_set_t locked_pset, bool switch_pset_locks)
5706 {
5707 if (!switch_pset_locks) {
5708 return locked_pset;
5709 }
5710 if (selected_pset != locked_pset) {
5711 pset_unlock(locked_pset);
5712 pset_lock(selected_pset);
5713 return selected_pset;
5714 } else {
5715 return locked_pset;
5716 }
5717 }
5718
5719 /*
5720 * sched_edge_migrate_candidate()
5721 *
5722 * Routine to find an appropriate cluster for scheduling a thread. The routine looks at the properties of
5723 * the thread and the preferred cluster to determine the best available pset for scheduling.
5724 *
5725 * The switch_pset_locks parameter defines whether the routine should switch pset locks to provide an
5726 * accurate scheduling decision. This mode is typically used when choosing a pset for scheduling a thread since the
5727 * decision has to be synchronized with another CPU changing the recommendation of clusters available
5728 * on the system. If this parameter is set to false, this routine returns the best effort indication of
5729 * the cluster the thread should be scheduled on. It is typically used in fast path contexts (such as
5730 * SCHED(thread_avoid_processor)) to determine if there is a possibility of scheduling this thread on a
5731 * more appropriate cluster.
5732 *
5733 * Routine returns the most ideal cluster for scheduling. If switch_pset_locks is set, it ensures that the
5734 * resultant pset lock is held.
5735 */
5736 static processor_set_t
5737 sched_edge_migrate_candidate(processor_set_t _Nullable preferred_pset, thread_t thread,
5738 processor_set_t locked_pset, bool switch_pset_locks, processor_t *processor_hint_out,
5739 sched_options_t *options_inout)
5740 {
5741 processor_set_t selected_pset = preferred_pset;
5742 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
5743 bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
5744 bool stirring_the_pot = false;
5745
5746 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
5747 /*
5748 * For cluster-bound threads, choose the cluster to which the thread is bound, unless that
5749 * cluster is unavailable. If it's not available, fall through to the regular cluster selection
5750 * logic which handles derecommended clusters appropriately.
5751 */
5752 selected_pset = pset_array[sched_edge_thread_bound_cluster_id(thread)];
5753 if (selected_pset != NULL) {
5754 locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
5755 if (pset_is_recommended(selected_pset)) {
5756 return selected_pset;
5757 }
5758 }
5759 }
5760
5761 uint64_t candidate_cluster_bitmap = mask(sched_num_psets);
5762 #if DEVELOPMENT || DEBUG
5763 extern int enable_task_set_cluster_type;
5764 task_t task = get_threadtask(thread);
5765 if (enable_task_set_cluster_type && (task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) {
5766 processor_set_t pset_hint = task->pset_hint;
5767 if (pset_hint && (selected_pset == NULL || selected_pset->pset_cluster_type != pset_hint->pset_cluster_type)) {
5768 selected_pset = pset_hint;
5769 goto migrate_candidate_available_check;
5770 }
5771 }
5772 #endif
5773
5774 if (preferred_pset == NULL) {
5775 /* The preferred_pset has not finished initializing at boot */
5776 goto migrate_candidate_available_check;
5777 }
5778
5779 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5780 /* For realtime threads, try and schedule them on the preferred pset always */
5781 goto migrate_candidate_available_check;
5782 }
5783
5784 uint32_t preferred_cluster_load = shared_rsrc_thread ? (uint32_t)sched_edge_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket);
5785 if (preferred_cluster_load == 0) {
5786 goto migrate_candidate_available_check;
5787 }
5788
5789 /*
5790 * If this thread has expired quantum on a non-preferred core and is waiting on
5791 * "stir-the-pot" to get a turn running on a P-core, check our processor inbox for
5792 * stir-the-pot to see if an eligible P-core has already been found for swap.
5793 * If so, try to migrate to the corresponding pset and also carry over the
5794 * processor hint to preempt that specific P-core.
5795 *
5796 * The AMP rebalancing mechanism is available for regular threads or shared resource
5797 * threads with the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy.
5798 */
5799 int stir_the_pot_swap_cpu = sched_edge_stir_the_pot_check_inbox_for_thread(thread, true);
5800 if (stir_the_pot_swap_cpu != -1) {
5801 *processor_hint_out = processor_array[stir_the_pot_swap_cpu];
5802 selected_pset = processor_array[stir_the_pot_swap_cpu]->processor_set;
5803 stirring_the_pot = true;
5804 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) | DBG_FUNC_NONE,
5805 2, stir_the_pot_swap_cpu, 0, 0);
5806 goto migrate_candidate_available_check;
5807 }
5808
5809 /* Look at edge weights to decide the most ideal migration candidate for this thread */
5810 selected_pset = sched_edge_migrate_edges_evaluate(preferred_pset, preferred_cluster_load, thread);
5811
5812 migrate_candidate_available_check:
5813 if (selected_pset == NULL) {
5814 /* The selected_pset has not finished initializing at boot */
5815 pset_unlock(locked_pset);
5816 return NULL;
5817 }
5818
5819 locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
5820 if (pset_is_recommended(selected_pset) == true) {
5821 /* Committing to the pset */
5822 if (stirring_the_pot) {
5823 *options_inout |= SCHED_STIR_POT;
5824 }
5825 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_OVERLOAD) | DBG_FUNC_NONE, thread_tid(thread), preferred_pset->pset_cluster_id, selected_pset->pset_cluster_id, preferred_cluster_load);
5826 return selected_pset;
5827 }
5828 stirring_the_pot = false;
5829 /* Looks like selected_pset is not available for scheduling; remove it from candidate_cluster_bitmap */
5830 bitmap_clear(&candidate_cluster_bitmap, selected_pset->pset_cluster_id);
5831 if (__improbable(bitmap_first(&candidate_cluster_bitmap, sched_num_psets) == -1)) {
5832 pset_unlock(locked_pset);
5833 return NULL;
5834 }
5835 /* Try and find an alternative for the selected pset */
5836 selected_pset = sched_edge_candidate_alternative(selected_pset, candidate_cluster_bitmap);
5837 goto migrate_candidate_available_check;
5838 }
5839
5840 static processor_t
5841 sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread, sched_options_t *options_inout)
5842 {
5843 /* Bound threads don't call this function */
5844 assert(thread->bound_processor == PROCESSOR_NULL);
5845 processor_t chosen_processor = PROCESSOR_NULL;
5846
5847 /*
5848 * sched_edge_preferred_pset() returns the preferred pset for a given thread.
5849 * It should take the passed in "pset" as a hint which represents the recency metric for
5850 * pset selection logic.
5851 */
5852 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
5853 processor_set_t chosen_pset = preferred_pset;
5854 /*
5855 * If the preferred pset is overloaded, find a pset which is the best candidate to migrate
5856 * threads to. sched_edge_migrate_candidate() returns the preferred pset
5857 * if it has capacity; otherwise finds the best candidate pset to migrate this thread to.
5858 *
5859 * Edge Scheduler Optimization
5860 * It might be useful to build a recency metric for the thread for multiple clusters and
5861 * factor that into the migration decisions.
5862 */
5863 chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, pset, true, &processor, options_inout);
5864 if (chosen_pset) {
5865 chosen_processor = choose_processor(chosen_pset, processor, thread, options_inout);
5866 }
5867 return chosen_processor;
5868 }
5869
5870 /*
5871 * sched_edge_clutch_bucket_threads_drain()
5872 *
5873 * Drains all the runnable threads which are not restricted to the root_clutch (due to clutch
5874 * bucket overrides etc.) into a local thread queue.
5875 */
5876 static void
5877 sched_edge_clutch_bucket_threads_drain(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch, struct pulled_thread_queue * threadq)
5878 {
5879 thread_t thread = THREAD_NULL;
5880 uint64_t current_timestamp = mach_approximate_time();
5881 qe_foreach_element_safe(thread, &clutch_bucket->scb_thread_timeshare_queue, th_clutch_timeshare_link) {
5882 sched_clutch_thread_remove(root_clutch, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
5883 pulled_thread_queue_enqueue(threadq, thread);
5884 }
5885 }
5886
5887 /*
5888 * sched_edge_update_preferred_cluster()
5889 *
5890 * Routine to update the preferred cluster for QoS buckets within a thread group.
5891 * The buckets to be updated are specified as a bitmap (clutch_bucket_modify_bitmap).
5892 */
5893 static void
5894 sched_edge_update_preferred_cluster(
5895 sched_clutch_t sched_clutch,
5896 bitmap_t *clutch_bucket_modify_bitmap,
5897 uint32_t *tg_bucket_preferred_cluster)
5898 {
5899 for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
5900 os_atomic_store(&sched_clutch->sc_clutch_groups[bucket].scbg_preferred_cluster, tg_bucket_preferred_cluster[bucket], relaxed);
5901 }
5902 }
5903
5904 #if !SCHED_TEST_HARNESS
5905
5906 /*
5907 * sched_edge_migrate_thread_group_runnable_threads()
5908 *
5909 * Routine to implement the migration of threads on a cluster when the thread group
5910 * recommendation is updated. The migration works using a 2-phase
5911 * algorithm.
5912 *
5913 * Phase 1: With the pset lock held, check the recommendation of the clutch buckets.
5914 * For each clutch bucket, if it needs to be migrated immediately, drain the threads
5915 * into a local thread queue. Otherwise mark the clutch bucket as native/foreign as
5916 * appropriate.
5917 *
5918 * Phase 2: After unlocking the pset, drain all the threads from the local thread
5919 * queue and mark them runnable which should land them in the right hierarchy.
5920 *
5921 * The routine assumes that the preferences for the clutch buckets/clutch bucket
5922 * groups have already been updated by the caller.
5923 *
5924 * - Called with the pset locked and interrupts disabled.
5925 * - Returns with the pset unlocked.
5926 */
5927 static void
5928 sched_edge_migrate_thread_group_runnable_threads(
5929 sched_clutch_t sched_clutch,
5930 sched_clutch_root_t root_clutch,
5931 bitmap_t *clutch_bucket_modify_bitmap,
5932 __unused uint32_t *tg_bucket_preferred_cluster,
5933 bool migrate_immediately, struct pulled_thread_queue * threadq)
5934 {
5935 sched_clutch_hierarchy_locked_assert(root_clutch);
5936
5937 for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
5938 /* Get the clutch bucket for this cluster and sched bucket */
5939 sched_clutch_bucket_group_t clutch_bucket_group = &(sched_clutch->sc_clutch_groups[bucket]);
5940 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
5941 if (clutch_bucket->scb_root == NULL) {
5942 /* Clutch bucket not runnable or already in the right hierarchy; nothing to do here */
5943 assert3u(clutch_bucket->scb_thr_count, ==, 0);
5944 continue;
5945 }
5946 assert3p(clutch_bucket->scb_root, ==, root_clutch);
5947 uint32_t clutch_bucket_preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
5948
5949 sched_edge_steal_silo_clutch_bucket_classify(clutch_bucket, root_clutch, clutch_bucket_preferred_cluster);
5950
5951 if (migrate_immediately && (root_clutch->scr_cluster_id != clutch_bucket_preferred_cluster)) {
5952 /*
5953 * For transitions where threads need to be migrated immediately, drain the threads into a
5954 * local queue unless we are looking at the clutch buckets for the newly recommended
5955 * cluster.
5956 */
5957 sched_edge_clutch_bucket_threads_drain(clutch_bucket, clutch_bucket->scb_root, threadq);
5958 }
5959 }
5960
5961 pset_unlock(root_clutch->scr_pset);
5962 }
5963
5964 /*
5965 * sched_edge_migrate_thread_group_running_threads()
5966 *
5967 * Routine to find all running threads of a thread group on a specific cluster
5968 * and IPI them if they need to be moved immediately.
5969 */
5970 static void
5971 sched_edge_migrate_thread_group_running_threads(
5972 sched_clutch_t sched_clutch,
5973 sched_clutch_root_t root_clutch,
5974 __unused bitmap_t *clutch_bucket_modify_bitmap,
5975 uint32_t *tg_bucket_preferred_cluster,
5976 bool migrate_immediately)
5977 {
5978 if (migrate_immediately == false) {
5979 /* If CLPC has recommended not to move threads immediately, nothing to do here */
5980 return;
5981 }
5982
5983 /*
5984 * Edge Scheduler Optimization
5985 *
5986 * When the system has a large number of clusters and cores, it might be useful to
5987 * narrow down the iteration by using a thread running bitmap per clutch.
5988 */
5989 uint64_t ast_processor_map = 0;
5990 sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
5991
5992 uint64_t running_map = root_clutch->scr_pset->cpu_state_map[PROCESSOR_RUNNING];
5993 /*
5994 * Iterate all CPUs and look for the ones running threads from this thread group and are
5995 * not restricted to the specific cluster (due to overrides etc.)
5996 */
5997 for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
5998 processor_t src_processor = processor_array[cpuid];
5999 boolean_t expected_tg = (src_processor->current_thread_group == sched_clutch->sc_tg);
6000 sched_bucket_t processor_sched_bucket = src_processor->processor_set->cpu_running_buckets[cpuid];
6001 if (processor_sched_bucket == TH_BUCKET_SCHED_MAX) {
6002 continue;
6003 }
6004 boolean_t non_preferred_cluster = tg_bucket_preferred_cluster[processor_sched_bucket] != root_clutch->scr_cluster_id;
6005
6006 if (expected_tg && non_preferred_cluster) {
6007 ipi_type[cpuid] = sched_ipi_action(src_processor, NULL, SCHED_IPI_EVENT_REBALANCE);
6008 if (ipi_type[cpuid] != SCHED_IPI_NONE) {
6009 bit_set(ast_processor_map, cpuid);
6010 } else if (src_processor == current_processor()) {
6011 bit_set(root_clutch->scr_pset->pending_AST_PREEMPT_cpu_mask, cpuid);
6012 ast_t new_preempt = update_pending_nonurgent_preemption(src_processor, AST_PREEMPT);
6013 ast_on(new_preempt);
6014 }
6015 }
6016 }
6017
6018 /* Perform all the IPIs */
6019 if (bit_first(ast_processor_map) != -1) {
6020 for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
6021 processor_t ast_processor = processor_array[cpuid];
6022 sched_ipi_perform(ast_processor, ipi_type[cpuid]);
6023 }
6024 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, thread_group_get_id(sched_clutch->sc_tg), ast_processor_map, 0, 0);
6025 }
6026 }
6027
6028 /*
6029 * sched_edge_tg_preferred_cluster_change()
6030 *
6031 * Routine to handle changes to a thread group's recommendation. In the Edge Scheduler, the preferred cluster
6032 * is specified on a per-QoS basis within a thread group. The routine updates the preferences and performs
6033 * thread migrations based on the policy specified by CLPC.
6034 * tg_bucket_preferred_cluster is an array of size TH_BUCKET_SCHED_MAX which specifies the new preferred cluster
6035 * for each QoS within the thread group.
6036 */
6037 void
6038 sched_edge_tg_preferred_cluster_change(struct thread_group *tg, uint32_t *tg_bucket_preferred_cluster, sched_perfcontrol_preferred_cluster_options_t options)
6039 {
6040 sched_clutch_t clutch = sched_clutch_for_thread_group(tg);
6041 /*
6042 * In order to optimize the processing, create a bitmap which represents all QoS buckets
6043 * for which the preferred cluster has changed.
6044 */
6045 bitmap_t clutch_bucket_modify_bitmap[BITMAP_LEN(TH_BUCKET_SCHED_MAX)] = {0};
6046 for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
6047 uint32_t old_preferred_cluster = sched_edge_clutch_bucket_group_preferred_cluster(&clutch->sc_clutch_groups[bucket]);
6048 uint32_t new_preferred_cluster = tg_bucket_preferred_cluster[bucket];
6049 if (old_preferred_cluster != new_preferred_cluster) {
6050 bitmap_set(clutch_bucket_modify_bitmap, bucket);
6051 }
6052 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREFERRED_PSET) | DBG_FUNC_NONE,
6053 thread_group_get_id(tg), bucket, new_preferred_cluster, options);
6054 }
6055 if (bitmap_lsb_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
6056 /* No changes in any clutch buckets; nothing to do here */
6057 return;
6058 }
6059
6060 /*
6061 * The first operation is to update the preferred cluster for all QoS buckets within the
6062 * thread group so that any future threads becoming runnable would see the new preferred
6063 * cluster value.
6064 */
6065 sched_edge_update_preferred_cluster(clutch, clutch_bucket_modify_bitmap, tg_bucket_preferred_cluster);
6066
6067 for (uint32_t cluster_id = 0; cluster_id < sched_num_psets; cluster_id++) {
6068 processor_set_t pset = pset_array[cluster_id];
6069 struct pulled_thread_queue *threadq = pulled_thread_queue_prepare();
6070
6071 spl_t s = splsched();
6072 pset_lock(pset);
6073 /*
6074 * Currently iterates all clusters looking for running threads for a TG to be migrated. Can be optimized
6075 * by keeping a per-clutch bitmap of clusters running threads for a particular TG.
6076 *
6077 * Edge Scheduler Optimization
6078 */
6079 /* Migrate all running threads of the TG on this cluster based on options specified by CLPC */
6080 sched_edge_migrate_thread_group_running_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
6081 tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNING));
6082 /* Migrate all runnable threads of the TG in this cluster's hierarchy based on options specified by CLPC */
6083 sched_edge_migrate_thread_group_runnable_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
6084 tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNABLE), threadq);
6085 /* sched_edge_migrate_thread_group_runnable_threads() returns with pset unlocked */
6086 splx(s);
6087
6088 pulled_thread_queue_flush(threadq);
6089 }
6090 }
6091
6092 /*
6093 * sched_edge_pset_made_schedulable()
6094 *
6095 * Pset may already be marked schedulable. Called at least once when new
6096 * processor(s) are made available.
6097 *
6098 * Invoked with the pset lock held and interrupts disabled.
6099 */
6100 static void
6101 sched_edge_pset_made_schedulable(
6102 processor_set_t pset)
6103 {
6104 /* Mark the pset as schedulable. The bit may already be set if the pset was already schedulable. */
6105 atomic_bit_set(sched_edge_available_pset_bitmask, pset->pset_id, memory_order_relaxed);
6106 }
6107 #endif /* !SCHED_TEST_HARNESS */
6108
6109
6110 /*
6111 * sched_edge_cpu_init_completed()
6112 *
6113 * Callback routine from the platform layer once all CPUs/clusters have been initialized. This
6114 * provides an opportunity for the edge scheduler to initialize all the edge parameters.
6115 */
6116 static void
6117 sched_edge_cpu_init_completed(void)
6118 {
6119 /* Now that all cores have registered, compute bitmaps for different core types */
6120 for (int pset_id = 0; pset_id < sched_num_psets; pset_id++) {
6121 processor_set_t pset = pset_array[pset_id];
6122 if (sched_edge_stir_the_pot_core_type_is_desired(pset)) {
6123 os_atomic_or(&sched_edge_p_core_map, pset->cpu_bitmask, relaxed);
6124 } else {
6125 os_atomic_or(&sched_edge_non_p_core_map, pset->cpu_bitmask, relaxed);
6126 }
6127 }
6128 /* Build policy table for setting edge weight tunables based on cluster types */
6129 sched_clutch_edge edge_config_defaults[MAX_CPU_TYPES][MAX_CPU_TYPES];
6130 sched_clutch_edge free_spill = (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 1, .sce_steal_allowed = 1};
6131 sched_clutch_edge no_spill = (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0};
6132 sched_clutch_edge weighted_spill = (sched_clutch_edge){.sce_migration_weight = 64, .sce_migration_allowed = 1, .sce_steal_allowed = 1};
6133 /* P -> P */
6134 edge_config_defaults[CLUSTER_TYPE_P][CLUSTER_TYPE_P] = free_spill;
6135 /* E -> E */
6136 edge_config_defaults[CLUSTER_TYPE_E][CLUSTER_TYPE_E] = free_spill;
6137 /* P -> E */
6138 edge_config_defaults[CLUSTER_TYPE_P][CLUSTER_TYPE_E] = weighted_spill;
6139 /* E -> P */
6140 edge_config_defaults[CLUSTER_TYPE_E][CLUSTER_TYPE_P] = no_spill;
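/*
 * Illustrative reading of the defaults above: P->P and E->E edges allow
 * migration and steal with weight 0, so any load imbalance can spill between
 * same-type clusters. P->E uses weight 64, so a P-preferred thread only
 * spills to an E-cluster once the P-cluster load metric at that QoS exceeds
 * the E-cluster's by at least 64 (see sched_edge_migration_check()). E->P
 * disallows both migration and steal by default.
 */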
6141
6142 spl_t s = splsched();
6143 for (int src_cluster_id = 0; src_cluster_id < sched_num_psets; src_cluster_id++) {
6144 processor_set_t src_pset = pset_array[src_cluster_id];
6145 pset_lock(src_pset);
6146
6147 /* Each pset recommendation is at least allowed to access its own cluster */
6148 for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
6149 src_pset->max_parallel_cores[bucket] = src_pset->cpu_set_count;
6150 src_pset->max_parallel_clusters[bucket] = 1;
6151 }
6152
6153 /* For each cluster, set all its outgoing edge parameters */
6154 for (int dst_cluster_id = 0; dst_cluster_id < sched_num_psets; dst_cluster_id++) {
6155 processor_set_t dst_pset = pset_array[dst_cluster_id];
6156 if (dst_cluster_id == src_cluster_id) {
6157 continue;
6158 }
6159
6160 bool clusters_homogenous = (src_pset->pset_type == dst_pset->pset_type);
6161 if (clusters_homogenous) {
6162 bitmap_clear(src_pset->foreign_psets, dst_cluster_id);
6163 bitmap_set(src_pset->native_psets, dst_cluster_id);
6164 /* Default realtime policy: spill allowed among homogeneous psets. */
6165 sched_rt_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge) {
6166 .sce_migration_allowed = true,
6167 .sce_steal_allowed = true,
6168 .sce_migration_weight = 0,
6169 });
6170 } else {
6171 bitmap_set(src_pset->foreign_psets, dst_cluster_id);
6172 bitmap_clear(src_pset->native_psets, dst_cluster_id);
6173 /* Default realtime policy: disallow spill among heterogeneous psets. */
6174 sched_rt_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge) {
6175 .sce_migration_allowed = false,
6176 .sce_steal_allowed = false,
6177 .sce_migration_weight = 0,
6178 });
6179 }
6180
6181 bool clusters_local = (ml_get_die_id(src_cluster_id) == ml_get_die_id(dst_cluster_id));
6182 if (clusters_local) {
6183 bitmap_set(src_pset->local_psets, dst_cluster_id);
6184 bitmap_clear(src_pset->remote_psets, dst_cluster_id);
6185 } else {
6186 bitmap_set(src_pset->remote_psets, dst_cluster_id);
6187 bitmap_clear(src_pset->local_psets, dst_cluster_id);
6188 }
6189
6190 for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
6191 /* Set tunables for an edge based on the cluster types at either ends of it */
6192 sched_clutch_edge edge_config = edge_config_defaults[src_pset->pset_type][dst_pset->pset_type];
6193 sched_edge_config_set(src_cluster_id, dst_cluster_id, bucket, edge_config);
6194 if (edge_config.sce_migration_allowed) {
6195 src_pset->max_parallel_cores[bucket] += dst_pset->cpu_set_count;
6196 src_pset->max_parallel_clusters[bucket] += 1;
6197 }
6198 }
6199 }
6200 sched_edge_config_pset_push(src_cluster_id);
6201
6202 pset_unlock(src_pset);
6203 }
6204 sched_edge_config_final_push();
6205 #if DEVELOPMENT || DEBUG
6206 assert(sched_edge_config_verify());
6207 #endif /* DEVELOPMENT || DEBUG */
6208 splx(s);
6209 }
6210
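/*
 * sched_edge_thread_eligible_for_pset()
 *
 * Returns true if the thread may run on the given pset: either the pset is the
 * thread's preferred cluster, or the edge from the preferred cluster to this
 * pset allows migration. Realtime threads consult the realtime edge matrix;
 * all other threads consult the per-bucket clutch edge matrix.
 */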
6211 static bool
6212 sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset)
6213 {
6214 uint32_t preferred_cluster_id = sched_edge_thread_preferred_cluster(thread);
6215 if (preferred_cluster_id == pset->pset_cluster_id) {
6216 return true;
6217 } else {
6218 sched_clutch_edge edge;
6219 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
6220 edge = sched_rt_config_get(preferred_cluster_id, pset->pset_id);
6221 } else {
6222 edge = sched_edge_config_get(preferred_cluster_id, pset->pset_cluster_id, thread->th_sched_bucket);
6223 }
6224 return edge.sce_migration_allowed;
6225 }
6226 }
6227
6228 extern int sched_amp_spill_deferred_ipi;
6229 extern int sched_amp_pcores_preempt_immediate_ipi;
6230
6231 int sched_edge_migrate_ipi_immediate = 1;
6232
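/*
 * sched_edge_ipi_policy()
 *
 * Edge scheduler override of the IPI policy. Spill events may use deferred
 * IPIs when sched_amp_spill_deferred_ipi is set. For non-RT preemption,
 * immediate IPIs are used when targeting a P-cluster (if
 * sched_amp_pcores_preempt_immediate_ipi is set), or, when
 * sched_edge_migrate_ipi_immediate is enabled, when the destination is
 * homogeneous with the thread's preferred cluster or the preferred pset is
 * expected to stay busy past the deferred IPI timeout. All other cases fall
 * back to the global sched_ipi_policy().
 */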
6233 sched_ipi_type_t
6234 sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
6235 {
6236 processor_set_t pset = dst->processor_set;
6237 assert(dst != current_processor());
6238
6239 boolean_t deferred_ipi_supported = false;
6240 #if defined(CONFIG_SCHED_DEFERRED_AST)
6241 deferred_ipi_supported = true;
6242 #endif /* CONFIG_SCHED_DEFERRED_AST */
6243
6244 switch (event) {
6245 case SCHED_IPI_EVENT_SPILL:
6246 /* For spill events, use deferred IPIs if sched_amp_spill_deferred_ipi is set */
6247 if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
6248 return sched_ipi_deferred_policy(pset, dst, thread, event);
6249 }
6250 break;
6251 case SCHED_IPI_EVENT_PREEMPT:
6252 /* For preemption, the default policy is to use deferred IPIs
6253  * for non-RT P-core preemption. Override that behavior if
6254  * sched_amp_pcores_preempt_immediate_ipi is set.
6255  */
6256 if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
6257 if (sched_amp_pcores_preempt_immediate_ipi && (pset_type_for_id(pset->pset_cluster_id) == CLUSTER_TYPE_P)) {
6258 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
6259 }
6260 if (sched_edge_migrate_ipi_immediate) {
6261 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
6262 /*
6263  * Use immediate IPIs when the destination CPU is homogeneous with the thread's preferred cluster.
6264  */
6265 if (preferred_pset->pset_type == pset->pset_type) {
6266 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
6267 }
6268 /*
6269  * For workloads going wide, it can be useful to send an immediate IPI to wake the
6270  * idle CPU if the scheduler estimates that the preferred pset will remain busy past
6271  * the deferred IPI timeout. The Edge Scheduler uses the average thread execution time
6272  * on the preferred pset (in usec, hence the NSEC_PER_USEC scaling below) as its estimate of busyness.
6273  */
6274 if ((preferred_pset->pset_execution_time[thread->th_sched_bucket].pset_avg_thread_execution_time * NSEC_PER_USEC) >= ml_cpu_signal_deferred_get_timer()) {
6275 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
6276 }
6277 }
6278 }
6279 break;
6280 default:
6281 break;
6282 }
6283 /* Default back to the global policy for all other scenarios */
6284 return sched_ipi_policy(dst, thread, dst_idle, event);
6285 }
6286
6287
6288 /*
6289  * sched_edge_qos_max_parallelism(): returns the maximum recommended parallelism (CPUs, or clusters when QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE is set) for the given QoS class on an AMP system.
6290  */
6291 uint32_t
6292 sched_edge_qos_max_parallelism(int qos, uint64_t options)
6293 {
6294 cluster_type_t low_core_type = CLUSTER_TYPE_E;
6295 cluster_type_t high_core_type = CLUSTER_TYPE_P;
6296
6297 if (options & QOS_PARALLELISM_REALTIME) {
6298 /* For realtime threads on AMP, limit the width to just
6299  * the P-cores, since we do not spill/rebalance RT
6300  * threads to other core types.
6301 */
6302 uint32_t high_cpu_count = ml_get_cpu_number_type(high_core_type, false, false);
6303 uint32_t high_cluster_count = ml_get_cluster_number_type(high_core_type);
6304 return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? high_cluster_count : high_cpu_count;
6305 }
6306
6307 /*
6308 * The Edge scheduler supports per-QoS recommendations for thread groups.
6309 * This enables lower QoS buckets (such as UT) to be scheduled on all
6310 * CPUs on the system.
6311 *
6312 * The only restriction is for BG/Maintenance QoS classes for which the
6313 * performance controller would never recommend execution on the P-cores.
6314 * If that policy changes in the future, this value should be changed.
6315 */
6316 switch (qos) {
6317 case THREAD_QOS_BACKGROUND:
6318 case THREAD_QOS_MAINTENANCE:;
6319 uint32_t low_cpu_count = ml_get_cpu_number_type(low_core_type, false, false);
6320 uint32_t low_cluster_count = ml_get_cluster_number_type(low_core_type);
6321 return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? low_cluster_count : low_cpu_count;
6322 default:;
6323 uint32_t total_cpus = ml_get_cpu_count();
6324 uint32_t total_clusters = ml_get_cluster_count();
6325 return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? total_clusters : total_cpus;
6326 }
6327 }
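/*
 * Illustrative sketch (hypothetical topology, documentation only, not compiled):
 * on a system with one 4-CPU E-cluster and one 2-CPU P-cluster, this routine
 * would produce the values below. THREAD_QOS_USER_INITIATED stands in for any
 * QoS class other than BG/Maintenance.
 */
#if 0
assert(sched_edge_qos_max_parallelism(THREAD_QOS_USER_INITIATED, 0) == 6);
assert(sched_edge_qos_max_parallelism(THREAD_QOS_BACKGROUND, 0) == 4);
assert(sched_edge_qos_max_parallelism(THREAD_QOS_USER_INITIATED, QOS_PARALLELISM_REALTIME) == 2);
assert(sched_edge_qos_max_parallelism(THREAD_QOS_USER_INITIATED, QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) == 2);
#endif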
6328
6329
6330 #endif /* CONFIG_SCHED_EDGE */
6331
6332 #endif /* CONFIG_SCHED_CLUTCH */
6333