1 /*
2 * Copyright (c) 2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #if !SCHED_TEST_HARNESS
30
31 #include <kern/debug.h>
32 #include <kern/kern_types.h>
33 #include <kern/machine.h>
34 #include <kern/misc_protos.h>
35 #include <kern/queue.h>
36 #include <kern/sched_clutch.h>
37 #include <kern/sched.h>
38 #include <kern/task.h>
39 #include <kern/thread.h>
40
41 #include <mach/mach_types.h>
42 #include <mach/machine.h>
43
44 #include <machine/atomic.h>
45 #include <machine/machine_cpu.h>
46 #include <machine/machine_routines.h>
47 #include <machine/sched_param.h>
48
49 #include <sys/kdebug.h>
50
51 #endif /* !SCHED_TEST_HARNESS */
52
53 #include <kern/processor.h>
54 #include <kern/sched_prim.h>
55 #include <kern/sched_rt.h>
56
57 #if CONFIG_SCHED_EDGE
58 #include <kern/sched_amp_common.h>
59 #endif /* CONFIG_SCHED_EDGE */
60
61 #if CONFIG_SCHED_CLUTCH
62
63 #if CONFIG_SCHED_SMT
64 #error "The clutch scheduler does not support CONFIG_SCHED_SMT."
65 #endif /* CONFIG_SCHED_SMT */
66
67 #define SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION 1
68 typedef union {
69 struct __attribute__((packed)) {
70 unsigned int version : 4;
71 unsigned int traverse_mode : 3;
72 unsigned int cluster_id : 6;
73 unsigned int selection_was_edf : 1;
74 unsigned int selection_was_cluster_bound : 1;
75 unsigned int selection_opened_starvation_avoidance_window : 1;
76 unsigned int selection_opened_warp_window : 1;
77 unsigned int starvation_avoidance_window_close : 12;
78 unsigned int warp_window_close : 12;
79 unsigned int reserved : 23; /* For future usage */
80 } trace_data;
81 uint64_t scdts_trace_data_packed;
82 } sched_clutch_dbg_thread_select_packed_t;
83
84 static_assert(TH_BUCKET_SCHED_MAX == 6, "Ensure layout of sched_clutch_dbg_thread_select_packed can fit root bucket bitmasks");
85 static_assert(sizeof(sched_clutch_dbg_thread_select_packed_t) <= sizeof(uint64_t), "Ensure sched_clutch_dbg_thread_select_packed_t can fit in one tracepoint argument");
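
/*
 * Illustrative sketch (hypothetical values, not part of the scheduler): the
 * packed union above lets an entire thread-selection decision travel as a
 * single 64-bit tracepoint argument. Assuming a selection on cluster 2 that
 * was decided by EDF:
 *
 *   sched_clutch_dbg_thread_select_packed_t dbg = {0};
 *   dbg.trace_data.version = SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION;
 *   dbg.trace_data.cluster_id = 2;
 *   dbg.trace_data.selection_was_edf = 1;
 *   uint64_t trace_arg = dbg.scdts_trace_data_packed;   // emitted as one tracepoint argument
 */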
86
87 /* Forward declarations of static routines */
88
89 /* Root level hierarchy management */
90 static void sched_clutch_root_init(sched_clutch_root_t, processor_set_t);
91 static void sched_clutch_root_bucket_init(sched_clutch_root_bucket_t, sched_bucket_t, bool);
92 static void sched_clutch_root_pri_update(sched_clutch_root_t);
93 static void sched_clutch_root_urgency_inc(sched_clutch_root_t, thread_t);
94 static void sched_clutch_root_urgency_dec(sched_clutch_root_t, thread_t);
95
96 __enum_decl(sched_clutch_highest_root_bucket_type_t, uint32_t, {
97 SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_NONE = 0,
98 SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY = 1,
99 SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL = 2,
100 });
101 __enum_decl(sched_clutch_traverse_mode_t, uint32_t, {
102 SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY = 0,
103 SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT = 1,
104 SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT = 2,
105 });
106 static_assert(SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT < (1 << 3), "Ensure traverse mode can be encoded within 3 bits of sched_clutch_dbg_thread_select_packed_t");
107 static sched_clutch_root_bucket_t sched_clutch_root_highest_root_bucket(sched_clutch_root_t, uint64_t, sched_clutch_highest_root_bucket_type_t, sched_clutch_root_bucket_t, thread_t, bool *, sched_clutch_traverse_mode_t, sched_clutch_dbg_thread_select_packed_t *);
108
109 #if CONFIG_SCHED_EDGE
110 /* Support for foreign threads on AMP platforms */
111 static boolean_t sched_clutch_root_foreign_empty(sched_clutch_root_t);
112 static thread_t sched_clutch_root_highest_foreign_thread_remove(sched_clutch_root_t);
113 #endif /* CONFIG_SCHED_EDGE */
114
115 /* Root bucket level hierarchy management */
116 static uint64_t sched_clutch_root_bucket_deadline_calculate(sched_clutch_root_bucket_t, uint64_t);
117 static void sched_clutch_root_bucket_deadline_update(sched_clutch_root_bucket_t, sched_clutch_root_t, uint64_t, bool);
118 static int sched_clutch_root_highest_runnable_qos(sched_clutch_root_t, sched_clutch_highest_root_bucket_type_t);
119
120 /* Options for clutch bucket ordering in the runq */
121 __options_decl(sched_clutch_bucket_options_t, uint32_t, {
122 SCHED_CLUTCH_BUCKET_OPTIONS_NONE = 0x0,
123 /* Round robin clutch bucket on thread removal */
124 SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR = 0x1,
125 /* Insert clutch bucket at head (for thread preemption) */
126 SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ = 0x2,
127 /* Insert clutch bucket at tail (default) */
128 SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ = 0x4,
129 });
130
131 /* Clutch bucket level hierarchy management */
132 static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
133 static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
134 static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
135 static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
136 static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
137 static uint8_t sched_clutch_bucket_pri_calculate(sched_clutch_bucket_t, uint64_t);
138
139 /* Clutch bucket group level properties management */
140 static void sched_clutch_bucket_group_cpu_usage_update(sched_clutch_bucket_group_t, uint64_t);
141 static void sched_clutch_bucket_group_cpu_adjust(sched_clutch_bucket_group_t, uint8_t);
142 static void sched_clutch_bucket_group_pri_shift_update(sched_clutch_bucket_group_t);
143 static uint8_t sched_clutch_bucket_group_pending_ageout(sched_clutch_bucket_group_t, uint64_t);
144 static uint32_t sched_clutch_bucket_group_run_count_inc(sched_clutch_bucket_group_t);
145 static uint32_t sched_clutch_bucket_group_run_count_dec(sched_clutch_bucket_group_t);
146 static uint8_t sched_clutch_bucket_group_interactivity_score_calculate(sched_clutch_bucket_group_t, uint64_t);
147
148 /* Clutch timeshare properties updates */
149 static uint32_t sched_clutch_run_bucket_incr(sched_clutch_t, sched_bucket_t);
150 static uint32_t sched_clutch_run_bucket_decr(sched_clutch_t, sched_bucket_t);
151
152 /* Clutch membership management */
153 static boolean_t sched_clutch_thread_insert(sched_clutch_root_t, thread_t, integer_t);
154 static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t, sched_clutch_bucket_options_t);
155 static thread_t sched_clutch_hierarchy_thread_highest(sched_clutch_root_t, processor_t, thread_t, sched_clutch_traverse_mode_t);
156
157 /* Clutch properties updates */
158 static uint32_t sched_clutch_root_urgency(sched_clutch_root_t);
159 static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t);
160 static int sched_clutch_root_priority(sched_clutch_root_t);
161 static sched_clutch_bucket_t sched_clutch_root_bucket_highest_clutch_bucket(sched_clutch_root_t, sched_clutch_root_bucket_t, processor_t _Nullable processor, thread_t _Nullable prev_thread, bool *_Nullable chose_prev_thread);
162
163 /* Clutch thread properties */
164 static boolean_t sched_thread_sched_pri_promoted(thread_t);
165 static inline sched_clutch_bucket_t sched_clutch_bucket_for_thread(sched_clutch_root_t, thread_t);
166 static inline sched_clutch_bucket_group_t sched_clutch_bucket_group_for_thread(thread_t);
167
168 /* General utilities */
169 static inline bool sched_clutch_pri_greater_than_tiebreak(int, int, bool);
170
171 #if CONFIG_SCHED_EDGE
172 /* System based routines */
173 static uint32_t sched_edge_thread_bound_cluster_id(thread_t);
174
175 /* Global indicating the maximum number of clusters on the current platform */
176 static int sched_edge_max_clusters = 0;
177 #endif /* CONFIG_SCHED_EDGE */
178
179 /* Helper debugging routines */
180 static inline void sched_clutch_hierarchy_locked_assert(sched_clutch_root_t);
181
182 extern processor_set_t pset_array[MAX_PSETS];
183
184 /*
185 * Special markers for buckets that have invalid WCELs/quantums etc.
186 */
187 #define SCHED_CLUTCH_INVALID_TIME_32 ((uint32_t)~0)
188 #define SCHED_CLUTCH_INVALID_TIME_64 ((uint64_t)~0)
189
/*
 * Root level bucket WCELs
 *
 * The root level bucket selection algorithm is an Earliest Deadline
 * First (EDF) algorithm where the deadline for a bucket is defined
 * by its worst-case-execution-latency (WCEL) and the timestamp at
 * which the bucket became runnable.
 */
199 static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = {
200 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
201 0, /* FG */
202 37500, /* IN (37.5ms) */
203 75000, /* DF (75ms) */
204 150000, /* UT (150ms) */
205 250000 /* BG (250ms) */
206 };
207 static uint64_t sched_clutch_root_bucket_wcel[TH_BUCKET_SCHED_MAX] = {0};
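
/*
 * Worked example (illustrative, not part of the scheduler): once the table
 * above has been converted to abstime, the EDF deadline of a timeshare root
 * bucket is simply its make-runnable timestamp plus its WCEL. For a bucket
 * becoming runnable at time t:
 *
 *   deadline(FG) = t + 0                  // FG always gets the earliest timeshare deadline
 *   deadline(UT) = t + abstime(150ms)
 *
 * so a UT bucket that has been runnable for 150ms competes evenly with an FG
 * bucket that just became runnable. The real computation is in
 * sched_clutch_root_bucket_deadline_calculate() below.
 */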
208
209 /*
210 * Root level bucket warp
211 *
212 * Each root level bucket has a warp value associated with it as well.
213 * The warp value allows the root bucket to effectively warp ahead of
214 * lower priority buckets for a limited time even if it has a later
215 * deadline. The warping behavior provides extra (but limited)
216 * opportunity for high priority buckets to remain responsive.
217 */
218
219 /* Special warp deadline value to indicate that the bucket has not used any warp yet */
220 #define SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED (SCHED_CLUTCH_INVALID_TIME_64)
221
222 /* Warp window durations for various tiers */
223 static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = {
224 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
225 8000, /* FG (8ms)*/
226 4000, /* IN (4ms) */
227 2000, /* DF (2ms) */
228 1000, /* UT (1ms) */
229 0 /* BG (0ms) */
230 };
231 static uint64_t sched_clutch_root_bucket_warp[TH_BUCKET_SCHED_MAX] = {0};
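
/*
 * Worked example (illustrative): how warp interacts with the EDF selection.
 * Suppose the unbound IN bucket (4ms of warp) is runnable alongside an unbound
 * UT bucket that currently holds the earlier deadline. Because IN is a higher
 * QoS bucket with warp remaining, it can be chosen ahead of the EDF pick,
 * which opens a warp window:
 *
 *   scrb_warped_deadline = timestamp + scrb_warp_remaining;   // 4ms window for IN
 *
 * When that window expires, the IN bucket's warp is exhausted and selection
 * falls back to the EDF choice in sched_clutch_root_highest_root_bucket().
 */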
232
233 /*
234 * Thread level quantum
235 *
236 * The algorithm defines quantums for threads at various buckets. This
237 * (combined with the root level bucket quantums) restricts how much
238 * the lower priority levels can preempt the higher priority threads.
239 */
240
241 #if XNU_TARGET_OS_OSX
242 static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
243 10000, /* FIXPRI (10ms) */
244 10000, /* FG (10ms) */
245 10000, /* IN (10ms) */
246 10000, /* DF (10ms) */
247 4000, /* UT (4ms) */
248 2000 /* BG (2ms) */
249 };
250 #else /* XNU_TARGET_OS_OSX */
251 static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
252 10000, /* FIXPRI (10ms) */
253 10000, /* FG (10ms) */
254 8000, /* IN (8ms) */
255 6000, /* DF (6ms) */
256 4000, /* UT (4ms) */
257 2000 /* BG (2ms) */
258 };
259 #endif /* XNU_TARGET_OS_OSX */
260
261 static uint64_t sched_clutch_thread_quantum[TH_BUCKET_SCHED_MAX] = {0};
262
263 /*
264 * sched_clutch_us_to_abstime()
265 *
266 * Initializer for converting all durations in usec to abstime
267 */
268 static void
sched_clutch_us_to_abstime(uint32_t *us_vals, uint64_t *abstime_vals)
270 {
271 for (int i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
272 if (us_vals[i] == SCHED_CLUTCH_INVALID_TIME_32) {
273 abstime_vals[i] = SCHED_CLUTCH_INVALID_TIME_64;
274 } else {
275 clock_interval_to_absolutetime_interval(us_vals[i],
276 NSEC_PER_USEC, &abstime_vals[i]);
277 }
278 }
279 }
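
/*
 * Usage sketch (assumed init sequence, mirroring how the tables above are
 * consumed): each microsecond table is converted once during scheduler init,
 * with SCHED_CLUTCH_INVALID_TIME_32 entries (e.g. the FIXPRI WCEL) preserved
 * as SCHED_CLUTCH_INVALID_TIME_64 rather than converted:
 *
 *   sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
 *   sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
 *   sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
 */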
280
281 /* Clutch/Edge Scheduler Debugging support */
282 #define SCHED_CLUTCH_DBG_THR_COUNT_PACK(a, b, c) ((uint64_t)c | ((uint64_t)b << 16) | ((uint64_t)a << 32))
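
/*
 * Worked example (hypothetical counts, assuming 16-bit values):
 * SCHED_CLUTCH_DBG_THR_COUNT_PACK() packs three thread counts into one 64-bit
 * tracepoint argument, with 'a' in bits 32-47, 'b' in bits 16-31 and 'c' in
 * bits 0-15:
 *
 *   SCHED_CLUTCH_DBG_THR_COUNT_PACK(1, 2, 3) == 0x0000000100020003ULL
 */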
283
284 #if DEVELOPMENT || DEBUG
285
286 kern_return_t
sched_clutch_thread_group_cpu_time_for_thread(thread_t thread, int sched_bucket, uint64_t *cpu_stats)
288 {
289 if (sched_bucket < 0 || sched_bucket >= TH_BUCKET_MAX) {
290 return KERN_INVALID_ARGUMENT;
291 }
292 sched_clutch_bucket_group_t clutch_bucket_group = &sched_clutch_for_thread(thread)->sc_clutch_groups[sched_bucket];
293 sched_clutch_bucket_cpu_data_t scb_cpu_data;
294 scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, relaxed);
295 cpu_stats[0] = scb_cpu_data.cpu_data.scbcd_cpu_used;
296 cpu_stats[1] = scb_cpu_data.cpu_data.scbcd_cpu_blocked;
297 return KERN_SUCCESS;
298 }
299
300 /*
301 * sched_clutch_hierarchy_locked_assert()
302 *
303 * Debugging helper routine. Asserts that the hierarchy is locked. The locking
304 * for the hierarchy depends on where the hierarchy is hooked. The current
305 * implementation hooks the hierarchy at the pset, so the hierarchy is locked
306 * using the pset lock.
307 */
308 static inline void
sched_clutch_hierarchy_locked_assert(
310 sched_clutch_root_t root_clutch)
311 {
312 pset_assert_locked(root_clutch->scr_pset);
313 }
314
315 #else /* DEVELOPMENT || DEBUG */
316
317 static inline void
sched_clutch_hierarchy_locked_assert(
319 __unused sched_clutch_root_t root_clutch)
320 {
321 }
322
323 #endif /* DEVELOPMENT || DEBUG */
324
325 /*
326 * sched_clutch_thr_count_inc()
327 *
328 * Increment thread count at a hierarchy level with overflow checks.
329 */
330 static void
sched_clutch_thr_count_inc(
332 uint16_t *thr_count)
333 {
334 if (__improbable(os_inc_overflow(thr_count))) {
335 panic("sched_clutch thread count overflowed!");
336 }
337 }
338
339 /*
340 * sched_clutch_thr_count_dec()
341 *
342 * Decrement thread count at a hierarchy level with underflow checks.
343 */
344 static void
sched_clutch_thr_count_dec(
346 uint16_t *thr_count)
347 {
348 if (__improbable(os_dec_overflow(thr_count))) {
349 panic("sched_clutch thread count underflowed!");
350 }
351 }
352
353 static sched_bucket_t
sched_convert_pri_to_bucket(uint8_t priority)
355 {
356 sched_bucket_t bucket = TH_BUCKET_RUN;
357
358 if (priority > BASEPRI_USER_INITIATED) {
359 bucket = TH_BUCKET_SHARE_FG;
360 } else if (priority > BASEPRI_DEFAULT) {
361 bucket = TH_BUCKET_SHARE_IN;
362 } else if (priority > BASEPRI_UTILITY) {
363 bucket = TH_BUCKET_SHARE_DF;
364 } else if (priority > MAXPRI_THROTTLE) {
365 bucket = TH_BUCKET_SHARE_UT;
366 } else {
367 bucket = TH_BUCKET_SHARE_BG;
368 }
369 return bucket;
370 }
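
/*
 * Worked example (derived from the checks above): a timeshare thread's base
 * priority selects its bucket band by strict greater-than comparisons, e.g.:
 *
 *   pri >  BASEPRI_USER_INITIATED  -> TH_BUCKET_SHARE_FG
 *   pri >  BASEPRI_DEFAULT         -> TH_BUCKET_SHARE_IN
 *   pri >  BASEPRI_UTILITY         -> TH_BUCKET_SHARE_DF
 *   pri >  MAXPRI_THROTTLE         -> TH_BUCKET_SHARE_UT
 *   otherwise                      -> TH_BUCKET_SHARE_BG
 *
 * so a thread at exactly BASEPRI_DEFAULT lands in TH_BUCKET_SHARE_DF rather
 * than TH_BUCKET_SHARE_IN.
 */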
371
372 /*
373 * sched_clutch_thread_bucket_map()
374 *
375 * Map a thread to a scheduling bucket for the clutch/edge scheduler
376 * based on its scheduling mode and the priority attribute passed in.
377 */
378 static sched_bucket_t
sched_clutch_thread_bucket_map(thread_t thread, int pri)
380 {
381 switch (thread->sched_mode) {
382 case TH_MODE_FIXED:
383 if (pri >= BASEPRI_FOREGROUND) {
384 return TH_BUCKET_FIXPRI;
385 } else {
386 return sched_convert_pri_to_bucket(pri);
387 }
388
389 case TH_MODE_REALTIME:
390 return TH_BUCKET_FIXPRI;
391
392 case TH_MODE_TIMESHARE:
393 return sched_convert_pri_to_bucket(pri);
394
395 default:
396 panic("unexpected mode: %d", thread->sched_mode);
397 break;
398 }
399 }
400
/*
 * The clutch scheduler attempts to age out the CPU usage of clutch bucket groups
 * based on the amount of time they have been pending and the load at that
 * scheduling bucket level. Since the clutch bucket groups are global (i.e. they span
 * multiple clusters), it is important to keep the load as a global counter as well.
 */
407 static uint32_t _Atomic sched_clutch_global_bucket_load[TH_BUCKET_SCHED_MAX];
408
409 /*
410 * sched_clutch_root_init()
411 *
412 * Routine to initialize the scheduler hierarchy root.
413 */
414 static void
sched_clutch_root_init(
416 sched_clutch_root_t root_clutch,
417 processor_set_t pset)
418 {
419 root_clutch->scr_thr_count = 0;
420 root_clutch->scr_priority = NOPRI;
421 root_clutch->scr_urgency = 0;
422 root_clutch->scr_pset = pset;
423 #if CONFIG_SCHED_EDGE
424 root_clutch->scr_cluster_id = pset->pset_cluster_id;
425 for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
426 root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type] = 0;
427 }
428 #else /* CONFIG_SCHED_EDGE */
429 root_clutch->scr_cluster_id = 0;
430 #endif /* CONFIG_SCHED_EDGE */
431
432 /* Initialize the queue which maintains all runnable clutch_buckets for timesharing purposes */
433 queue_init(&root_clutch->scr_clutch_buckets);
434
435 /* Initialize the priority queue which maintains all runnable foreign clutch buckets */
436 priority_queue_init(&root_clutch->scr_foreign_buckets);
437 bzero(&root_clutch->scr_cumulative_run_count, sizeof(root_clutch->scr_cumulative_run_count));
438 bitmap_zero(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
439 bitmap_zero(root_clutch->scr_bound_warp_available, TH_BUCKET_SCHED_MAX);
440 priority_queue_init(&root_clutch->scr_bound_root_buckets);
441
442 /* Initialize the bitmap and priority queue of runnable root buckets */
443 priority_queue_init(&root_clutch->scr_unbound_root_buckets);
444 bitmap_zero(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
445 bitmap_zero(root_clutch->scr_unbound_warp_available, TH_BUCKET_SCHED_MAX);
446
447 /* Initialize all the root buckets */
448 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
449 sched_clutch_root_bucket_init(&root_clutch->scr_unbound_buckets[i], i, false);
450 sched_clutch_root_bucket_init(&root_clutch->scr_bound_buckets[i], i, true);
451 }
452 }
453
454 /*
455 * Clutch Bucket Runqueues
456 *
457 * The clutch buckets are maintained in a runq at the root bucket level. The
458 * runq organization allows clutch buckets to be ordered based on various
459 * factors such as:
460 *
461 * - Clutch buckets are round robin'ed at the same priority level when a
462 * thread is selected from a clutch bucket. This prevents a clutch bucket
463 * from starving out other clutch buckets at the same priority.
464 *
 * - Clutch buckets are inserted at the head when they become runnable due to
 *   thread preemption. This allows threads that were preempted to maintain
 *   their order in the queue.
468 */
469
470 /*
471 * sched_clutch_bucket_runq_init()
472 *
473 * Initialize a clutch bucket runq.
474 */
475 static void
sched_clutch_bucket_runq_init(
477 sched_clutch_bucket_runq_t clutch_buckets_rq)
478 {
479 clutch_buckets_rq->scbrq_highq = NOPRI;
480 for (uint8_t i = 0; i < BITMAP_LEN(NRQS); i++) {
481 clutch_buckets_rq->scbrq_bitmap[i] = 0;
482 }
483 clutch_buckets_rq->scbrq_count = 0;
484 for (int i = 0; i < NRQS; i++) {
485 circle_queue_init(&clutch_buckets_rq->scbrq_queues[i]);
486 }
487 }
488
489 /*
490 * sched_clutch_bucket_runq_empty()
491 *
 * Returns whether a clutch bucket runq is empty.
493 */
494 static boolean_t
sched_clutch_bucket_runq_empty(
496 sched_clutch_bucket_runq_t clutch_buckets_rq)
497 {
498 return clutch_buckets_rq->scbrq_count == 0;
499 }
500
501 /*
502 * sched_clutch_bucket_runq_peek()
503 *
504 * Returns the highest priority clutch bucket in the runq.
505 */
506 static sched_clutch_bucket_t
sched_clutch_bucket_runq_peek(
508 sched_clutch_bucket_runq_t clutch_buckets_rq)
509 {
510 if (clutch_buckets_rq->scbrq_count > 0) {
511 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_buckets_rq->scbrq_highq];
512 return cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink);
513 } else {
514 return NULL;
515 }
516 }
517
518 /*
519 * sched_clutch_bucket_runq_enqueue()
520 *
521 * Enqueue a clutch bucket into the runq based on the options passed in.
522 */
523 static void
sched_clutch_bucket_runq_enqueue(
525 sched_clutch_bucket_runq_t clutch_buckets_rq,
526 sched_clutch_bucket_t clutch_bucket,
527 sched_clutch_bucket_options_t options)
528 {
529 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
530 if (circle_queue_empty(queue)) {
531 circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
532 bitmap_set(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
533 if (clutch_bucket->scb_priority > clutch_buckets_rq->scbrq_highq) {
534 clutch_buckets_rq->scbrq_highq = clutch_bucket->scb_priority;
535 }
536 } else {
537 if (options & SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ) {
538 circle_enqueue_head(queue, &clutch_bucket->scb_runqlink);
539 } else {
540 /*
541 * Default behavior (handles SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ &
542 * SCHED_CLUTCH_BUCKET_OPTIONS_NONE)
543 */
544 circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
545 }
546 }
547 clutch_buckets_rq->scbrq_count++;
548 }
549
550 /*
551 * sched_clutch_bucket_runq_remove()
552 *
553 * Remove a clutch bucket from the runq.
554 */
555 static void
sched_clutch_bucket_runq_remove(
557 sched_clutch_bucket_runq_t clutch_buckets_rq,
558 sched_clutch_bucket_t clutch_bucket)
559 {
560 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
561 circle_dequeue(queue, &clutch_bucket->scb_runqlink);
562 assert(clutch_buckets_rq->scbrq_count > 0);
563 clutch_buckets_rq->scbrq_count--;
564 if (circle_queue_empty(queue)) {
565 bitmap_clear(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
566 clutch_buckets_rq->scbrq_highq = bitmap_first(clutch_buckets_rq->scbrq_bitmap, NRQS);
567 }
568 }
569
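/*
 * sched_clutch_bucket_runq_rotate()
 *
 * Rotate the clutch bucket, which must currently be at the head of its
 * priority queue, to the tail so that clutch buckets at the same priority
 * are round-robined.
 */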
570 static void
sched_clutch_bucket_runq_rotate(
572 sched_clutch_bucket_runq_t clutch_buckets_rq,
573 sched_clutch_bucket_t clutch_bucket)
574 {
575 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
576 assert(clutch_bucket == cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink));
577 circle_queue_rotate_head_forward(queue);
578 }
579
580 /*
581 * sched_clutch_root_bucket_init()
582 *
583 * Routine to initialize root buckets.
584 */
585 static void
sched_clutch_root_bucket_init(
587 sched_clutch_root_bucket_t root_bucket,
588 sched_bucket_t bucket,
589 bool bound_root_bucket)
590 {
591 root_bucket->scrb_bucket = bucket;
592 if (bound_root_bucket) {
593 /* For bound root buckets, initialize the bound thread runq. */
594 root_bucket->scrb_bound = true;
595 run_queue_init(&root_bucket->scrb_bound_thread_runq);
596 } else {
597 /*
598 * The unbounded root buckets contain a runq of runnable clutch buckets
599 * which then hold the runnable threads.
600 */
601 root_bucket->scrb_bound = false;
602 sched_clutch_bucket_runq_init(&root_bucket->scrb_clutch_buckets);
603 }
604 priority_queue_entry_init(&root_bucket->scrb_pqlink);
605 root_bucket->scrb_pqlink.deadline = 0;
606 root_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
607 root_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[root_bucket->scrb_bucket];
608 root_bucket->scrb_starvation_avoidance = false;
609 root_bucket->scrb_starvation_ts = 0;
610 }
611
612 /*
613 * Special case scheduling for Above UI bucket.
614 *
 * AboveUI threads are typically system-critical threads that need low latency,
 * which is why they are handled specially.
 *
 * Since the priority ranges for the AboveUI and FG Timeshare buckets overlap, it is
 * important to maintain some native priority order between those buckets. For unbounded
 * root buckets, the policy is to compare the highest clutch buckets of the two root buckets; if the
 * Above UI bucket is higher, schedule it immediately. Otherwise fall through to the
 * deadline-based scheduling, which should pick up the timeshare buckets. For the bound
 * case, the policy simply compares the priority of the highest runnable threads in
 * the above UI and timeshare buckets.
625 *
626 * The implementation allows extremely low latency CPU access for Above UI threads
627 * while supporting the use case of high priority timeshare threads contending with
628 * lower priority fixed priority threads.
629 */
630
631
632 /*
633 * sched_clutch_root_unbound_select_aboveui()
634 *
635 * Routine to determine if the above UI unbounded bucket should be selected for execution.
636 *
637 * Writes the highest unbound (timeshare FG vs. above UI) bucket, its priority, and whether
638 * it is an above UI bucket into the pointer parameters.
639 */
640 static void
sched_clutch_root_unbound_select_aboveui(
642 sched_clutch_root_t root_clutch,
643 sched_clutch_root_bucket_t *highest_bucket,
644 int *highest_pri,
645 bool *highest_is_aboveui,
646 sched_clutch_root_bucket_t _Nullable prev_bucket,
647 thread_t _Nullable prev_thread)
648 {
649 /* First determine the highest Clutch bucket */
650 sched_clutch_root_bucket_t higher_root_bucket = NULL;
651 sched_clutch_bucket_t higher_clutch_bucket = NULL;
652 int higher_bucket_sched_pri = -1;
653 bool higher_is_aboveui = false;
654 /* Consider unbound Above UI */
655 if (bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_FIXPRI)) {
656 higher_root_bucket = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
657 higher_clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, higher_root_bucket, NULL, NULL, NULL);
658 higher_bucket_sched_pri = priority_queue_max_sched_pri(&higher_clutch_bucket->scb_clutchpri_prioq);
659 higher_is_aboveui = true;
660 }
661 /* Consider unbound Timeshare FG */
662 if (bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SHARE_FG)) {
663 sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_unbound_buckets[TH_BUCKET_SHARE_FG];
664 sched_clutch_bucket_t clutch_bucket_sharefg = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, root_bucket_sharefg, NULL, NULL, NULL);
665 /* Strict greater-than because unbound timeshare FG root bucket loses all priority ties at this level */
666 if (higher_root_bucket == NULL || clutch_bucket_sharefg->scb_priority > higher_clutch_bucket->scb_priority) {
667 higher_root_bucket = root_bucket_sharefg;
668 higher_clutch_bucket = clutch_bucket_sharefg;
669 higher_bucket_sched_pri = priority_queue_max_sched_pri(&higher_clutch_bucket->scb_clutchpri_prioq);
670 higher_is_aboveui = false;
671 }
672 }
673 /* Consider the previous thread */
674 if (prev_thread != NULL) {
675 assert(prev_bucket->scrb_bound == false);
676 sched_clutch_bucket_group_t prev_clutch_bucket_group = sched_clutch_bucket_group_for_thread(prev_thread);
677 int prev_clutch_bucket_pri = prev_thread->sched_pri + (int)(os_atomic_load(&prev_clutch_bucket_group->scbg_interactivity_data.scct_count, relaxed));
678 sched_clutch_bucket_t prev_clutch_bucket = sched_clutch_bucket_for_thread(root_clutch, prev_thread);
679 bool prev_bucket_should_win_ties = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI && higher_is_aboveui == false;
680 if (higher_clutch_bucket == NULL ||
681 sched_clutch_pri_greater_than_tiebreak(prev_clutch_bucket_pri, higher_clutch_bucket->scb_priority, prev_bucket_should_win_ties)) {
682 higher_root_bucket = prev_bucket;
683 higher_clutch_bucket = prev_clutch_bucket;
684 higher_bucket_sched_pri = prev_thread->sched_pri;
685 higher_is_aboveui = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI;
686 }
687 }
688 /* Compare highest priority in the highest unbound Clutch bucket to highest priority seen from the bound buckets */
689 if (higher_root_bucket != NULL) {
690 bool unbound_should_win_ties = higher_is_aboveui == true && *highest_is_aboveui == false;
691 if (sched_clutch_pri_greater_than_tiebreak(higher_bucket_sched_pri, *highest_pri, unbound_should_win_ties)) {
692 *highest_pri = higher_bucket_sched_pri;
693 *highest_bucket = higher_root_bucket;
694 *highest_is_aboveui = higher_is_aboveui;
695 }
696 }
697 }
698
699 /*
700 * sched_clutch_root_bound_select_aboveui()
701 *
702 * Routine to determine if the above UI bounded bucket should be selected for execution.
703 *
704 * Writes the highest bound (timeshare FG vs. above UI) bucket, its priority, and whether
705 * it is an above UI bucket into the pointer parameters.
706 */
707 static void
sched_clutch_root_bound_select_aboveui(
709 sched_clutch_root_t root_clutch,
710 sched_clutch_root_bucket_t *highest_bucket,
711 int *highest_pri,
712 bool *highest_is_aboveui,
713 sched_clutch_root_bucket_t _Nullable prev_bucket,
714 thread_t _Nullable prev_thread)
715 {
716 /* Consider bound Above UI */
717 sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
718 if (bitmap_test(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_FIXPRI) &&
719 sched_clutch_pri_greater_than_tiebreak(root_bucket_aboveui->scrb_bound_thread_runq.highq, *highest_pri, *highest_is_aboveui == false)) {
720 *highest_pri = root_bucket_aboveui->scrb_bound_thread_runq.highq;
721 *highest_bucket = root_bucket_aboveui;
722 *highest_is_aboveui = true;
723 }
724 /* Consider bound Timeshare FG */
725 sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_bound_buckets[TH_BUCKET_SHARE_FG];
726 if (bitmap_test(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SHARE_FG) &&
727 sched_clutch_pri_greater_than_tiebreak(root_bucket_sharefg->scrb_bound_thread_runq.highq, *highest_pri, false)) {
728 *highest_pri = root_bucket_sharefg->scrb_bound_thread_runq.highq;
729 *highest_bucket = root_bucket_sharefg;
730 *highest_is_aboveui = false;
731 }
732 /* Consider the previous thread */
733 if (prev_thread != NULL) {
734 assert(prev_bucket->scrb_bound == true);
735 bool prev_bucket_should_win_ties = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI && *highest_is_aboveui == false;
736 if (sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, *highest_pri, prev_bucket_should_win_ties)) {
737 *highest_pri = prev_thread->sched_pri;
738 *highest_bucket = prev_bucket;
739 *highest_is_aboveui = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI;
740 }
741 }
742 }
743
744 /*
745 * sched_clutch_root_highest_runnable_qos()
746 *
747 * Returns the index of the highest-QoS root bucket which is currently runnable.
748 */
749 static int
sched_clutch_root_highest_runnable_qos(
751 sched_clutch_root_t root_clutch,
752 sched_clutch_highest_root_bucket_type_t type)
753 {
754 int highest_unbound_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
755 if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
756 return highest_unbound_bucket;
757 }
758 assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
759 int highest_bound_bucket = bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
760 if (highest_bound_bucket == -1) {
761 return highest_unbound_bucket;
762 }
763 if (highest_unbound_bucket == -1) {
764 return highest_bound_bucket;
765 }
766 /* Both bound and unbound buckets are runnable, return the higher QoS */
767 return MIN(highest_bound_bucket, highest_unbound_bucket);
768 }
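
/*
 * Worked example (illustrative): root bucket indices run from highest to
 * lowest QoS (TH_BUCKET_FIXPRI first, TH_BUCKET_SHARE_BG last), so the
 * "highest runnable QoS" is the smaller index. If the bound bitmap has only
 * TH_BUCKET_SHARE_IN set and the unbound bitmap has only TH_BUCKET_SHARE_FG
 * set, the routine above returns MIN(TH_BUCKET_SHARE_IN, TH_BUCKET_SHARE_FG),
 * i.e. TH_BUCKET_SHARE_FG.
 */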
769
770 /*
771 * sched_clutch_root_highest_aboveui_root_bucket()
772 *
773 * Routine to determine if an above UI root bucket should be selected for execution.
774 *
775 * Returns the root bucket if we should run an above UI bucket or NULL otherwise.
776 */
777 static sched_clutch_root_bucket_t
sched_clutch_root_highest_aboveui_root_bucket(
779 sched_clutch_root_t root_clutch,
780 sched_clutch_highest_root_bucket_type_t type,
781 sched_clutch_root_bucket_t _Nullable prev_bucket,
782 thread_t _Nullable prev_thread,
783 bool *chose_prev_thread)
784 {
785 assert((prev_thread == NULL && prev_bucket == NULL) || (prev_thread != NULL && prev_bucket != NULL));
786 assert((type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL) || (prev_bucket == NULL));
787
788 sched_clutch_root_bucket_t highest_bucket = NULL;
789 int highest_pri = -1;
790 bool highest_is_aboveui = false;
791
792 /* Forward previous thread to the correct comparison logic, based on boundness */
793 sched_clutch_root_bucket_t bound_prev_bucket = NULL, unbound_prev_bucket = NULL;
794 thread_t bound_prev_thread = NULL, unbound_prev_thread = NULL;
795 if (prev_thread != NULL) {
796 if (prev_bucket->scrb_bound) {
797 bound_prev_bucket = prev_bucket;
798 bound_prev_thread = prev_thread;
799 } else {
800 unbound_prev_bucket = prev_bucket;
801 unbound_prev_thread = prev_thread;
802 }
803 }
804
805 /* Consider bound Above UI vs. Timeshare FG first, so those buckets will win ties against the corresponding unbound buckets */
806 if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL) {
807 sched_clutch_root_bound_select_aboveui(root_clutch, &highest_bucket, &highest_pri, &highest_is_aboveui, bound_prev_bucket, bound_prev_thread);
808 }
809
810 /* Consider unbound Above UI vs. Timeshare FG */
811 sched_clutch_root_unbound_select_aboveui(root_clutch, &highest_bucket, &highest_pri, &highest_is_aboveui, unbound_prev_bucket, unbound_prev_thread);
812 if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
813 return highest_is_aboveui ? highest_bucket : NULL;
814 }
815 assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
816
817 /* Determine whether we already know to continue running the previous thread */
818 if (prev_thread != NULL &&
819 bitmap_test(highest_bucket->scrb_bound ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap, highest_bucket->scrb_bucket) == false) {
820 /* Highest bucket we saw is empty, so the previous thread must have been the highest */
821 assert(highest_bucket == prev_bucket);
822 *chose_prev_thread = true;
823 }
824
825 return highest_is_aboveui ? highest_bucket : NULL;
826 }
827
828 /*
829 * sched_clutch_root_highest_root_bucket()
830 *
831 * Main routine to find the highest runnable root level bucket.
 * This routine is called from performance-sensitive contexts, so it is
 * crucial to keep this O(1). The type parameter determines whether
 * the selection logic should look at unbounded threads only (for
 * cross-cluster stealing operations) or both bounded and unbounded
 * threads (for selecting the next thread for execution on the current cluster).
837 */
838 static sched_clutch_root_bucket_t
sched_clutch_root_highest_root_bucket(
840 sched_clutch_root_t root_clutch,
841 uint64_t timestamp,
842 sched_clutch_highest_root_bucket_type_t type,
843 sched_clutch_root_bucket_t _Nullable prev_bucket,
844 thread_t _Nullable prev_thread,
845 bool *chose_prev_thread,
846 sched_clutch_traverse_mode_t mode,
847 sched_clutch_dbg_thread_select_packed_t *debug_info)
848 {
849 assert((prev_thread == NULL && prev_bucket == NULL) || (prev_thread != NULL && prev_bucket != NULL));
850 assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL || (prev_thread == NULL));
851 assert(prev_thread == NULL || (mode != SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY));
852 sched_clutch_hierarchy_locked_assert(root_clutch);
853
854 int highest_runnable_bucket = sched_clutch_root_highest_runnable_qos(root_clutch, type);
855 if (highest_runnable_bucket == -1) {
856 /*
857 * The Clutch hierarchy has no runnable threads. We can continue running
858 * whatever was running previously.
859 */
860 assert(sched_clutch_root_count(root_clutch) == 0 || type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY);
861 *chose_prev_thread = true;
862 if (prev_thread != NULL) {
863 debug_info->trace_data.selection_was_edf = true;
864 }
865 return prev_bucket;
866 }
867
868 /* Consider Above UI threads, in comparison to Timeshare FG threads */
869 sched_clutch_root_bucket_t highest_aboveui_bucket = sched_clutch_root_highest_aboveui_root_bucket(root_clutch, type, prev_bucket, prev_thread, chose_prev_thread);
870 if (highest_aboveui_bucket != NULL) {
871 debug_info->trace_data.selection_was_edf = true;
872 return highest_aboveui_bucket;
873 }
874
875 /*
876 * Above UI bucket is not runnable or has a low priority runnable thread; use the
877 * earliest deadline model to schedule threads. The idea is that as the timeshare
878 * buckets use CPU, they will drop their interactivity score/sched priority and
879 * allow the low priority AboveUI buckets to be scheduled.
880 */
881
882 /* Find the earliest deadline bucket */
883 sched_clutch_root_bucket_t edf_bucket;
884 bool edf_bucket_enqueued_normally;
885
886 evaluate_root_buckets:
887 edf_bucket = NULL;
888 edf_bucket_enqueued_normally = true;
889
890 if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
891 edf_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
892 } else {
893 assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
894 sched_clutch_root_bucket_t unbound_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
895 sched_clutch_root_bucket_t bound_bucket = priority_queue_min(&root_clutch->scr_bound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
896 if (bound_bucket && unbound_bucket) {
897 /* If bound and unbound root buckets are runnable, select the one with the earlier deadline */
898 edf_bucket = (bound_bucket->scrb_pqlink.deadline <= unbound_bucket->scrb_pqlink.deadline) ? bound_bucket : unbound_bucket;
899 } else {
900 edf_bucket = (bound_bucket) ? bound_bucket : unbound_bucket;
901 }
902 }
903 if (edf_bucket == NULL) {
904 /* The timeshare portion of the runqueue is empty */
905 assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
906 assert(prev_thread != NULL);
907 *chose_prev_thread = true;
908 if (prev_thread != NULL) {
909 debug_info->trace_data.selection_was_edf = true;
910 }
911 return prev_bucket;
912 }
913 if (prev_bucket != NULL && prev_bucket->scrb_pqlink.deadline < edf_bucket->scrb_pqlink.deadline) {
914 /* The previous thread's root bucket has the earliest deadline and is not currently enqueued */
915 edf_bucket = prev_bucket;
916 edf_bucket_enqueued_normally = false;
917 }
918
919 if (edf_bucket->scrb_starvation_avoidance) {
920 /* Check if the EDF bucket is in an expired starvation avoidance window */
921 uint64_t starvation_window = sched_clutch_thread_quantum[edf_bucket->scrb_bucket];
922 if (timestamp >= (edf_bucket->scrb_starvation_ts + starvation_window)) {
923 /* Starvation avoidance window is over; update deadline and re-evaluate EDF */
924 edf_bucket->scrb_starvation_avoidance = false;
925 edf_bucket->scrb_starvation_ts = 0;
926 sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp, edf_bucket_enqueued_normally);
927 bit_set(debug_info->trace_data.starvation_avoidance_window_close, edf_bucket->scrb_bound * TH_BUCKET_SCHED_MAX + edf_bucket->scrb_bucket);
928 goto evaluate_root_buckets;
929 }
930 }
931
932 /*
 * Check if any of the buckets have warp available. The implementation only allows root buckets to warp ahead of
 * buckets of the same type (i.e. bound/unbound). The reason is that warping only
 * makes sense between root buckets of the same type, since it's effectively a scheduling advantage over a lower
 * QoS root bucket.
937 */
938 bitmap_t *warp_available_bitmap = (edf_bucket->scrb_bound) ? (root_clutch->scr_bound_warp_available) : (root_clutch->scr_unbound_warp_available);
939 int warp_bucket_index = bitmap_lsb_first(warp_available_bitmap, TH_BUCKET_SCHED_MAX);
940
941 /* Allow the prev_bucket to use its warp as well */
942 bool prev_bucket_warping = (prev_bucket != NULL) && (prev_bucket->scrb_bound == edf_bucket->scrb_bound) &&
943 prev_bucket->scrb_bucket < edf_bucket->scrb_bucket && (prev_bucket->scrb_warp_remaining > 0) &&
944 (warp_bucket_index == -1 || prev_bucket->scrb_bucket < warp_bucket_index);
945
946 bool non_edf_bucket_can_warp = (warp_bucket_index != -1 && warp_bucket_index < edf_bucket->scrb_bucket) || prev_bucket_warping;
947
948 if (non_edf_bucket_can_warp == false) {
949 /* No higher buckets have warp left; best choice is the EDF based bucket */
950 debug_info->trace_data.selection_was_edf = true;
951
952 bool should_update_edf_starvation_state = edf_bucket == prev_bucket || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT;
953 if (edf_bucket->scrb_starvation_avoidance == false && should_update_edf_starvation_state) {
954 /* Looks like the EDF bucket is not in starvation avoidance mode; check if it should be */
955 if (highest_runnable_bucket < edf_bucket->scrb_bucket || (prev_bucket != NULL && prev_bucket->scrb_bucket < edf_bucket->scrb_bucket)) {
956 /*
957 * Since a higher bucket is runnable, it indicates that the EDF bucket should be in starvation avoidance.
958 *
959 * The starvation avoidance window is allocated as a single quantum for the starved bucket, enforced
960 * simultaneously across all CPUs in the cluster. The idea is to grant the starved bucket roughly one
961 * quantum per core, each time the bucket reaches the earliest deadline position. Note that this
962 * cadence is driven by the difference between the starved bucket's and highest-runnable bucket's WCELs.
963 */
964 edf_bucket->scrb_starvation_avoidance = true;
965 edf_bucket->scrb_starvation_ts = timestamp;
966 debug_info->trace_data.selection_opened_starvation_avoidance_window = true;
967 } else {
968 /* EDF bucket is being selected in the natural order; update deadline and reset warp */
969 sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp, edf_bucket_enqueued_normally);
970 edf_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[edf_bucket->scrb_bucket];
971 edf_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
972 if (edf_bucket_enqueued_normally) {
973 if (edf_bucket->scrb_bound) {
974 bitmap_set(root_clutch->scr_bound_warp_available, edf_bucket->scrb_bucket);
975 } else {
976 bitmap_set(root_clutch->scr_unbound_warp_available, edf_bucket->scrb_bucket);
977 }
978 }
979 }
980 }
981 *chose_prev_thread = !edf_bucket_enqueued_normally;
982 return edf_bucket;
983 }
984
985 /*
986 * Looks like there is a root bucket which is higher in the natural priority
987 * order than edf_bucket and might have some warp remaining.
988 */
989 assert(prev_bucket_warping || warp_bucket_index >= 0);
990 sched_clutch_root_bucket_t warp_bucket = NULL;
991 if (prev_bucket_warping) {
992 assert(warp_bucket_index == -1 || prev_bucket->scrb_bucket < warp_bucket_index);
993 warp_bucket = prev_bucket;
994 } else {
995 warp_bucket = (edf_bucket->scrb_bound) ? &root_clutch->scr_bound_buckets[warp_bucket_index] : &root_clutch->scr_unbound_buckets[warp_bucket_index];
996 }
997
998 bool warp_is_being_utilized = warp_bucket == prev_bucket || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT;
999
1000 if (warp_bucket->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
1001 if (warp_is_being_utilized) {
1002 /* Root bucket has not used any of its warp; set a deadline to expire its warp and return it */
1003 warp_bucket->scrb_warped_deadline = timestamp + warp_bucket->scrb_warp_remaining;
1004 sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp, !prev_bucket_warping);
1005 debug_info->trace_data.selection_opened_warp_window = true;
1006 }
1007 *chose_prev_thread = prev_bucket_warping;
1008 debug_info->trace_data.selection_was_edf = false;
1009 assert(warp_bucket != edf_bucket);
1010 return warp_bucket;
1011 }
1012 if (warp_bucket->scrb_warped_deadline > timestamp) {
1013 /* Root bucket already has a warp window open with some warp remaining */
1014 if (warp_is_being_utilized) {
1015 sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp, !prev_bucket_warping);
1016 }
1017 *chose_prev_thread = prev_bucket_warping;
1018 debug_info->trace_data.selection_was_edf = false;
1019 return warp_bucket;
1020 }
1021
1022 /*
1023 * For this bucket, warp window was opened sometime in the past but has now
1024 * expired. Mark the bucket as not available for warp anymore and re-run the
1025 * warp bucket selection logic.
1026 */
1027 warp_bucket->scrb_warp_remaining = 0;
1028 if (!prev_bucket_warping) {
1029 if (warp_bucket->scrb_bound) {
1030 bitmap_clear(root_clutch->scr_bound_warp_available, warp_bucket->scrb_bucket);
1031 } else {
1032 bitmap_clear(root_clutch->scr_unbound_warp_available, warp_bucket->scrb_bucket);
1033 }
1034 }
1035 bit_set(debug_info->trace_data.warp_window_close, warp_bucket->scrb_bound * TH_BUCKET_SCHED_MAX + warp_bucket->scrb_bucket);
1036 goto evaluate_root_buckets;
1037 }
1038
1039 static inline bool
sched_clutch_bucket_is_above_timeshare(sched_bucket_t bucket)
1041 {
1042 return bucket == TH_BUCKET_FIXPRI;
1043 }
1044
1045 /*
1046 * sched_clutch_root_bucket_deadline_calculate()
1047 *
1048 * Calculate the deadline for the bucket based on its WCEL
1049 */
1050 static uint64_t
sched_clutch_root_bucket_deadline_calculate(
1052 sched_clutch_root_bucket_t root_bucket,
1053 uint64_t timestamp)
1054 {
1055 /* For fixpri AboveUI bucket always return it as the earliest deadline */
1056 if (sched_clutch_bucket_is_above_timeshare(root_bucket->scrb_bucket)) {
1057 return 0;
1058 }
1059
1060 /* For all timeshare buckets set the deadline as current time + worst-case-execution-latency */
1061 return timestamp + sched_clutch_root_bucket_wcel[root_bucket->scrb_bucket];
1062 }
1063
1064 /*
1065 * sched_clutch_root_bucket_deadline_update()
1066 *
1067 * Routine to update the deadline of the root bucket when it is selected.
1068 * Updating the deadline also moves the root_bucket in the EDF priority
1069 * queue.
1070 */
1071 static void
sched_clutch_root_bucket_deadline_update(
1073 sched_clutch_root_bucket_t root_bucket,
1074 sched_clutch_root_t root_clutch,
1075 uint64_t timestamp,
1076 bool bucket_is_enqueued)
1077 {
1078 if (sched_clutch_bucket_is_above_timeshare(root_bucket->scrb_bucket)) {
1079 /* The algorithm never uses the deadlines for scheduling TH_BUCKET_FIXPRI bucket */
1080 return;
1081 }
1082
1083 uint64_t old_deadline = root_bucket->scrb_pqlink.deadline;
1084 uint64_t new_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
1085 if (__improbable(old_deadline > new_deadline)) {
1086 panic("old_deadline (%llu) > new_deadline (%llu); root_bucket (%d); timestamp (%llu)", old_deadline, new_deadline, root_bucket->scrb_bucket, timestamp);
1087 }
1088 if (old_deadline != new_deadline) {
1089 root_bucket->scrb_pqlink.deadline = new_deadline;
1090 if (bucket_is_enqueued) {
1091 struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
1092 priority_queue_entry_increased(prioq, &root_bucket->scrb_pqlink);
1093 }
1094 }
1095 }
1096
1097 /*
1098 * sched_clutch_root_bucket_runnable()
1099 *
1100 * Routine to insert a newly runnable root bucket into the hierarchy.
1101 * Also updates the deadline and warp parameters as necessary.
1102 */
1103 static void
sched_clutch_root_bucket_runnable(
1105 sched_clutch_root_bucket_t root_bucket,
1106 sched_clutch_root_t root_clutch,
1107 uint64_t timestamp)
1108 {
1109 /* Mark the root bucket as runnable */
1110 bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
1111 bitmap_set(runnable_bitmap, root_bucket->scrb_bucket);
1112
1113 if (sched_clutch_bucket_is_above_timeshare(root_bucket->scrb_bucket)) {
1114 /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
1115 return;
1116 }
1117
1118 if (root_bucket->scrb_starvation_avoidance == false) {
1119 /*
1120 * Only update the deadline if the bucket was not in starvation avoidance mode. If the bucket was in
1121 * starvation avoidance and its window has expired, the highest root bucket selection logic will notice
1122 * that and fix it up.
1123 */
1124 root_bucket->scrb_pqlink.deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
1125 }
1126 struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
1127 priority_queue_insert(prioq, &root_bucket->scrb_pqlink);
1128 if (root_bucket->scrb_warp_remaining) {
1129 /* Since the bucket has some warp remaining and its now runnable, mark it as available for warp */
1130 bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
1131 bitmap_set(warp_bitmap, root_bucket->scrb_bucket);
1132 }
1133 }
1134
1135 /*
1136 * sched_clutch_root_bucket_empty()
1137 *
1138 * Routine to remove an empty root bucket from the hierarchy.
1139 * Also updates the deadline and warp parameters as necessary.
1140 */
1141 static void
sched_clutch_root_bucket_empty(
1143 sched_clutch_root_bucket_t root_bucket,
1144 sched_clutch_root_t root_clutch,
1145 uint64_t timestamp)
1146 {
1147 bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
1148 bitmap_clear(runnable_bitmap, root_bucket->scrb_bucket);
1149
1150 if (sched_clutch_bucket_is_above_timeshare(root_bucket->scrb_bucket)) {
1151 /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
1152 return;
1153 }
1154
1155 struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
1156 priority_queue_remove(prioq, &root_bucket->scrb_pqlink);
1157
1158 bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
1159 bitmap_clear(warp_bitmap, root_bucket->scrb_bucket);
1160
1161 if (root_bucket->scrb_warped_deadline != SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
1162 if (root_bucket->scrb_warped_deadline > timestamp) {
1163 /*
1164 * For root buckets that were using the warp, check if the warp
1165 * deadline is in the future. If yes, remove the wall time the
1166 * warp was active and update the warp remaining. This allows
1167 * the root bucket to use the remaining warp the next time it
1168 * becomes runnable.
1169 */
1170 root_bucket->scrb_warp_remaining = root_bucket->scrb_warped_deadline - timestamp;
1171 } else {
1172 /*
1173 * If the root bucket's warped deadline is in the past, it has used up
1174 * all the warp it was assigned. Empty out its warp remaining.
1175 */
1176 root_bucket->scrb_warp_remaining = 0;
1177 }
1178 }
1179 }
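
/*
 * Worked example (illustrative): warp preservation when a root bucket goes
 * empty. If a bucket opened a 4ms warp window at time t (scrb_warped_deadline
 * == t + 4ms) and becomes empty at t + 1ms, the routine above records
 * scrb_warp_remaining = 3ms, so the bucket can use the leftover warp the next
 * time it becomes runnable. Had it gone empty after t + 4ms, the remaining
 * warp would be zeroed instead.
 */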
1180
1181 static int
sched_clutch_global_bucket_load_get(
1183 sched_bucket_t bucket)
1184 {
1185 return (int)os_atomic_load(&sched_clutch_global_bucket_load[bucket], relaxed);
1186 }
1187
1188 /*
1189 * sched_clutch_root_pri_update()
1190 *
1191 * The root level priority is used for thread selection and preemption
1192 * logic.
1193 *
1194 * The logic uses the same decision as thread selection for deciding between the
 * above UI and timeshare buckets. If one of the timesharing buckets has to be
1196 * used for priority calculation, the logic is slightly different from thread
1197 * selection, because thread selection considers deadlines, warps etc. to
1198 * decide the most optimal bucket at a given timestamp. Since the priority
1199 * value is used for preemption decisions only, it needs to be based on the
1200 * highest runnable thread available in the timeshare domain. This logic can
1201 * be made more sophisticated if there are cases of unnecessary preemption
1202 * being seen in workloads.
1203 */
1204 static void
sched_clutch_root_pri_update(
1206 sched_clutch_root_t root_clutch)
1207 {
1208 sched_clutch_hierarchy_locked_assert(root_clutch);
1209 int16_t root_bound_pri = NOPRI;
1210 int16_t root_unbound_pri = NOPRI;
1211
1212 /* Consider bound root buckets */
1213 if (bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
1214 goto root_pri_update_unbound;
1215 }
1216 sched_clutch_root_bucket_t highest_bound_root_bucket = NULL;
1217 __unused int highest_bound_root_bucket_pri = -1;
1218 bool highest_bound_root_bucket_is_fixpri = false;
1219 sched_clutch_root_bound_select_aboveui(root_clutch, &highest_bound_root_bucket, &highest_bound_root_bucket_pri, &highest_bound_root_bucket_is_fixpri, NULL, NULL);
1220 if (highest_bound_root_bucket_is_fixpri == false) {
1221 int root_bucket_index = bitmap_lsb_next(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
1222 assert(root_bucket_index != -1);
1223 highest_bound_root_bucket = &root_clutch->scr_bound_buckets[root_bucket_index];
1224 }
1225 root_bound_pri = highest_bound_root_bucket->scrb_bound_thread_runq.highq;
1226
1227 root_pri_update_unbound:
1228 /* Consider unbound root buckets */
1229 if (bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
1230 goto root_pri_update_complete;
1231 }
1232 sched_clutch_root_bucket_t highest_unbound_root_bucket = NULL;
1233 __unused int highest_unbound_root_bucket_pri = -1;
1234 bool highest_unbound_root_bucket_is_fixpri = false;
1235 sched_clutch_root_unbound_select_aboveui(root_clutch, &highest_unbound_root_bucket, &highest_unbound_root_bucket_pri, &highest_unbound_root_bucket_is_fixpri, NULL, NULL);
1236 if (highest_unbound_root_bucket_is_fixpri == false) {
1237 int root_bucket_index = bitmap_lsb_next(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
1238 assert(root_bucket_index != -1);
1239 highest_unbound_root_bucket = &root_clutch->scr_unbound_buckets[root_bucket_index];
1240 }
1241
1242 /* For the selected root bucket, find the highest priority clutch bucket */
1243 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, highest_unbound_root_bucket, NULL, NULL, NULL);
1244 root_unbound_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
1245
1246 root_pri_update_complete:
1247 root_clutch->scr_priority = MAX(root_bound_pri, root_unbound_pri);
1248 }
1249
1250 /*
1251 * sched_clutch_root_urgency_inc()
1252 *
1253 * Routine to increment the urgency at the root level based on the thread
1254 * priority that is being inserted into the hierarchy. The root urgency
1255 * counter is updated based on the urgency of threads in any of the
1256 * clutch buckets which are part of the hierarchy.
1257 *
1258 * Always called with the pset lock held.
1259 */
1260 static void
1261 sched_clutch_root_urgency_inc(
1262 sched_clutch_root_t root_clutch,
1263 thread_t thread)
1264 {
1265 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
1266 root_clutch->scr_urgency++;
1267 }
1268 }
1269
1270 /*
1271 * sched_clutch_root_urgency_dec()
1272 *
1273 * Routine to decrement the urgency at the root level based on the thread
1274 * priority that is being removed from the hierarchy. The root urgency
1275 * counter is updated based on the urgency of threads in any of the
1276 * clutch buckets which are part of the hierarchy.
1277 *
1278 * Always called with the pset lock held.
1279 */
1280 static void
1281 sched_clutch_root_urgency_dec(
1282 sched_clutch_root_t root_clutch,
1283 thread_t thread)
1284 {
1285 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
1286 root_clutch->scr_urgency--;
1287 }
1288 }
1289
1290 /*
1291 * Clutch bucket level scheduling
1292 *
1293 * The second level of scheduling is the clutch bucket level scheduling
1294 * which tries to schedule thread groups within root_buckets. Each
1295 * clutch represents a thread group and a clutch_bucket_group represents
1296 * threads at a particular sched_bucket within that thread group. The
1297 * clutch_bucket_group contains a clutch_bucket per cluster on the system
1298 * where it holds the runnable threads destined for execution on that
1299 * cluster.
1300 *
1301 * The goal of this level of scheduling is to allow interactive thread
1302 * groups low latency access to the CPU. It also provides slight
1303 * scheduling preference for App and unrestricted thread groups.
1304 *
1305 * The clutch bucket scheduling algorithm measures an interactivity
1306 * score for all clutch bucket groups. The interactivity score is based
1307 * on the ratio of the CPU used and the voluntary blocking of threads
1308 * within the clutch bucket group. The algorithm is very close to the ULE
1309 * scheduler on FreeBSD in terms of calculations. The interactivity
1310 * score provides an interactivity boost in the range of
1311 * [0:SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI * 2] which allows interactive
1312 * thread groups to win over CPU spinners.
1313 *
1314 * The interactivity score of the clutch bucket group is combined with the
1315 * highest base/promoted priority of threads in the clutch bucket to form
1316 * the overall priority of the clutch bucket.
1317 */
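/*
 * Illustrative example (assuming the default interactive pri of 8, see
 * sched_clutch_bucket_group_interactive_pri below and
 * sched_clutch_interactivity_from_cpu_data()): a clutch bucket group that has
 * blocked for 300 units and used 100 units of CPU within the current window
 * scores 8 + (8 * (300 - 100) / 300) = 13; a purely CPU-bound group with no
 * blocking scores 0; a group that blocks but uses no CPU gets the maximum
 * boost of 16.
 */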
1318
1319 /* Priority boost range for interactivity */
1320 #define SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT (8)
1321 static uint8_t sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
1322
1323 /* Window over which to scale the cpu usage and blocked values (currently 500ms); it is the threshold on used + blocked */
1324 static uint64_t sched_clutch_bucket_group_adjust_threshold = 0;
1325 #define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS (500000)
1326
1327 /* The ratio to scale the cpu/blocked time per window */
1328 #define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO (10)
1329
1330 /* Initial value for voluntary blocking time for the clutch_bucket */
1331 #define SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID (uint64_t)(~0)
1332
1333 /* Value indicating the clutch bucket is not pending execution */
1334 #define SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID ((uint64_t)(~0))
1335
1336 /*
1337 * Thread group CPU starvation avoidance
1338 *
1339 * In heavily CPU contended scenarios, it is possible that some thread groups
1340 * which have a low interactivity score do not get CPU time at all. In order to
1341 * resolve that, the scheduler tries to ageout the CPU usage of the clutch
1342 * bucket group when it has been pending execution for a certain time as defined
1343 * by the sched_clutch_bucket_group_pending_delta_us values below.
1344 *
1345 * The values chosen here are very close to the WCEL values for each sched bucket.
1346 * These values are added into the pending interval used to determine how
1347 * frequently we will ageout the CPU usage, ensuring a reasonable limit on the
1348 * frequency.
1349 */
1350 static uint32_t sched_clutch_bucket_group_pending_delta_us[TH_BUCKET_SCHED_MAX] = {
1351 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
1352 10000, /* FG */
1353 37500, /* IN */
1354 75000, /* DF */
1355 150000, /* UT */
1356 250000, /* BG */
1357 };
1358 static uint64_t sched_clutch_bucket_group_pending_delta[TH_BUCKET_SCHED_MAX] = {0};
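/*
 * Illustrative example: a clutch bucket group at the DF bucket that remains
 * pending (runnable but never selected) is aged out roughly once every
 * 75ms + (bucket_load * thread quantum), with each ageout worth about one
 * point of interactivity score (see sched_clutch_bucket_group_pending_ageout()
 * and sched_clutch_bucket_group_cpu_pending_adjust() below).
 */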
1359
1360 /*
1361 * sched_clutch_bucket_init()
1362 *
1363 * Initializer for clutch buckets.
1364 */
1365 static void
1366 sched_clutch_bucket_init(
1367 sched_clutch_bucket_t clutch_bucket,
1368 sched_clutch_bucket_group_t clutch_bucket_group,
1369 sched_bucket_t bucket)
1370 {
1371 clutch_bucket->scb_bucket = bucket;
1372 /* scb_priority will be recalculated when a thread is inserted in the clutch bucket */
1373 clutch_bucket->scb_priority = 0;
1374 #if CONFIG_SCHED_EDGE
1375 clutch_bucket->scb_foreign = false;
1376 priority_queue_entry_init(&clutch_bucket->scb_foreignlink);
1377 #endif /* CONFIG_SCHED_EDGE */
1378 clutch_bucket->scb_group = clutch_bucket_group;
1379 clutch_bucket->scb_root = NULL;
1380 priority_queue_init(&clutch_bucket->scb_clutchpri_prioq);
1381 priority_queue_init(&clutch_bucket->scb_thread_runq);
1382 queue_init(&clutch_bucket->scb_thread_timeshare_queue);
1383 }
1384
1385 /*
1386 * sched_clutch_bucket_group_init()
1387 *
1388 * Initializer for clutch bucket groups.
1389 */
1390 static void
1391 sched_clutch_bucket_group_init(
1392 sched_clutch_bucket_group_t clutch_bucket_group,
1393 sched_clutch_t clutch,
1394 sched_bucket_t bucket)
1395 {
1396 bzero(clutch_bucket_group, sizeof(struct sched_clutch_bucket_group));
1397 clutch_bucket_group->scbg_bucket = bucket;
1398 clutch_bucket_group->scbg_clutch = clutch;
1399
1400 int max_clusters = ml_get_cluster_count();
1401 clutch_bucket_group->scbg_clutch_buckets = kalloc_type(struct sched_clutch_bucket, max_clusters, Z_WAITOK | Z_ZERO);
1402 for (int i = 0; i < max_clusters; i++) {
1403 sched_clutch_bucket_init(&clutch_bucket_group->scbg_clutch_buckets[i], clutch_bucket_group, bucket);
1404 }
1405
1406 os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, 0, relaxed);
1407 os_atomic_store(&clutch_bucket_group->scbg_pri_shift, INT8_MAX, relaxed);
1408 os_atomic_store(&clutch_bucket_group->scbg_preferred_cluster, pset0.pset_cluster_id, relaxed);
1409 /*
1410 * All thread groups should be initialized to be interactive; this allows the newly launched
1411 * thread groups to fairly compete with already running thread groups.
1412 */
1413 clutch_bucket_group->scbg_interactivity_data.scct_count = (sched_clutch_bucket_group_interactive_pri * 2);
1414 clutch_bucket_group->scbg_interactivity_data.scct_timestamp = 0;
1415 os_atomic_store(&clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked, (clutch_cpu_data_t)sched_clutch_bucket_group_adjust_threshold, relaxed);
1416 clutch_bucket_group->scbg_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
1417 clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
1418 }
1419
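/*
 * sched_clutch_bucket_group_destroy()
 *
 * Frees the per-cluster clutch bucket array allocated in sched_clutch_bucket_group_init().
 */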
1420 static void
1421 sched_clutch_bucket_group_destroy(
1422 sched_clutch_bucket_group_t clutch_bucket_group)
1423 {
1424 kfree_type(struct sched_clutch_bucket, ml_get_cluster_count(),
1425 clutch_bucket_group->scbg_clutch_buckets);
1426 }
1427
1428 /*
1429 * sched_clutch_init_with_thread_group()
1430 *
1431 * Initialize the sched_clutch when the thread group is being created
1432 */
1433 void
1434 sched_clutch_init_with_thread_group(
1435 sched_clutch_t clutch,
1436 struct thread_group *tg)
1437 {
1438 os_atomic_store(&clutch->sc_thr_count, 0, relaxed);
1439
1440 /* Initialize all the clutch buckets */
1441 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
1442 sched_clutch_bucket_group_init(&(clutch->sc_clutch_groups[i]), clutch, i);
1443 }
1444
1445 /* Grouping specific fields */
1446 clutch->sc_tg = tg;
1447 }
1448
1449 /*
1450 * sched_clutch_destroy()
1451 *
1452 * Destructor for clutch; called from thread group release code.
1453 */
1454 void
1455 sched_clutch_destroy(
1456 sched_clutch_t clutch)
1457 {
1458 assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0);
1459 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
1460 sched_clutch_bucket_group_destroy(&(clutch->sc_clutch_groups[i]));
1461 }
1462 }
1463
1464 #if CONFIG_SCHED_EDGE
1465
1466 /*
1467 * Edge Scheduler Preferred Cluster Mechanism
1468 *
1469 * In order to have better control over various QoS buckets within a thread group, the Edge
1470 * scheduler allows CLPC to specify a preferred cluster for each QoS level in a TG. These
1471 * preferences are stored at the sched_clutch_bucket_group level since that represents all
1472 * threads at a particular QoS level within a sched_clutch. For any lookup of preferred
1473 * cluster, the logic always goes back to the preference stored at the clutch_bucket_group.
1474 */
1475
1476 static uint32_t
1477 sched_edge_clutch_bucket_group_preferred_cluster(sched_clutch_bucket_group_t clutch_bucket_group)
1478 {
1479 return os_atomic_load(&clutch_bucket_group->scbg_preferred_cluster, relaxed);
1480 }
1481
1482 static uint32_t
1483 sched_clutch_bucket_preferred_cluster(sched_clutch_bucket_t clutch_bucket)
1484 {
1485 return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket->scb_group);
1486 }
1487
1488 uint32_t
1489 sched_edge_thread_preferred_cluster(thread_t thread)
1490 {
1491 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1492 /* For threads bound to a specific cluster, return the bound cluster id */
1493 return sched_edge_thread_bound_cluster_id(thread);
1494 }
1495
1496 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1497 sched_bucket_t sched_bucket = thread->th_sched_bucket;
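/* For depressed threads, look up the preference using the bucket implied by the base priority rather than the depressed bucket */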
1498 if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
1499 sched_bucket = sched_clutch_thread_bucket_map(thread, thread->base_pri);
1500 }
1501 sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[sched_bucket];
1502 return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket_group);
1503 }
1504
1505 /*
1506 * Edge Scheduler Foreign Bucket Support
1507 *
1508 * In the Edge Scheduler, each cluster maintains a priority queue of clutch buckets containing
1509 * threads that are not native to the cluster. A clutch bucket is considered native if its
1510 * preferred cluster has the same type as the cluster its enqueued in. The foreign clutch
1511 * bucket priority queue is used for rebalance operations to get threads back to their native
1512 * cluster quickly.
1513 *
1514 * It is possible to make this policy even more aggressive by considering all clusters that
1515 * are not the preferred cluster as the foreign cluster, but that would mean a lot of thread
1516 * migrations which might have performance implications.
1517 */
1518
1519 static void
1520 sched_clutch_bucket_mark_native(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
1521 {
1522 if (clutch_bucket->scb_foreign) {
1523 clutch_bucket->scb_foreign = false;
1524 priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1525 }
1526 }
1527
1528 static void
1529 sched_clutch_bucket_mark_foreign(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
1530 {
1531 if (!clutch_bucket->scb_foreign) {
1532 clutch_bucket->scb_foreign = true;
1533 priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
1534 priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1535 }
1536 }
1537
1538 /*
1539 * Edge Scheduler Cumulative Load Average
1540 *
1541 * The Edge scheduler maintains a per-QoS/scheduling bucket load average for
1542 * making thread migration decisions. The per-bucket load is maintained as a
1543 * cumulative count since higher scheduling buckets impact load on lower buckets
1544 * for thread migration decisions.
1545 *
1546 */
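/*
 * For example, a thread becoming runnable at TH_BUCKET_SHARE_IN increments the
 * cumulative run counts for IN, DF, UT and BG (note the fallthrough below), so
 * scr_cumulative_run_count[bucket] reflects all runnable threads at that bucket
 * or any higher bucket.
 */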
1547
1548 static void
1549 sched_edge_cluster_cumulative_count_incr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1550 {
1551 switch (bucket) {
1552 case TH_BUCKET_FIXPRI: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1553 case TH_BUCKET_SHARE_FG: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1554 case TH_BUCKET_SHARE_IN: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1555 case TH_BUCKET_SHARE_DF: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1556 case TH_BUCKET_SHARE_UT: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1557 case TH_BUCKET_SHARE_BG: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1558 default:
1559 panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_incr()");
1560 }
1561 }
1562
1563 static void
1564 sched_edge_cluster_cumulative_count_decr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1565 {
1566 switch (bucket) {
1567 case TH_BUCKET_FIXPRI: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1568 case TH_BUCKET_SHARE_FG: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1569 case TH_BUCKET_SHARE_IN: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1570 case TH_BUCKET_SHARE_DF: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1571 case TH_BUCKET_SHARE_UT: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1572 case TH_BUCKET_SHARE_BG: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1573 default:
1574 panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_decr()");
1575 }
1576 }
1577
1578 uint16_t
1579 sched_edge_cluster_cumulative_count(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1580 {
1581 return os_atomic_load(&root_clutch->scr_cumulative_run_count[bucket], relaxed);
1582 }
1583
1584 #endif /* CONFIG_SCHED_EDGE */
1585
1586 /*
1587 * sched_clutch_bucket_hierarchy_insert()
1588 *
1589 * Routine to insert a newly runnable clutch_bucket into the root hierarchy.
1590 */
1591 static void
1592 sched_clutch_bucket_hierarchy_insert(
1593 sched_clutch_root_t root_clutch,
1594 sched_clutch_bucket_t clutch_bucket,
1595 sched_bucket_t bucket,
1596 uint64_t timestamp,
1597 sched_clutch_bucket_options_t options)
1598 {
1599 sched_clutch_hierarchy_locked_assert(root_clutch);
1600 if (sched_clutch_bucket_is_above_timeshare(bucket) == false) {
1601 /* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */
1602 enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink);
1603 }
1604 #if CONFIG_SCHED_EDGE
1605 /* Check if the bucket is a foreign clutch bucket and add it to the foreign buckets list */
1606 uint32_t preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
1607 if (pset_type_for_id(preferred_cluster) != pset_type_for_id(root_clutch->scr_cluster_id)) {
1608 sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
1609 }
1610 #endif /* CONFIG_SCHED_EDGE */
1611 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1612
1613 /* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */
1614 if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1615 sched_clutch_root_bucket_runnable(root_bucket, root_clutch, timestamp);
1616 }
1617
1618 /* Insert the clutch bucket into the root bucket run queue with order based on options */
1619 sched_clutch_bucket_runq_enqueue(&root_bucket->scrb_clutch_buckets, clutch_bucket, options);
1620 os_atomic_store(&clutch_bucket->scb_root, root_clutch, relaxed);
1621 os_atomic_inc(&sched_clutch_global_bucket_load[bucket], relaxed);
1622 }
1623
1624 /*
1625 * sched_clutch_bucket_hierarchy_remove()
1626 *
1627 * Routine to remove an empty clutch bucket from the root hierarchy.
1628 */
1629 static void
1630 sched_clutch_bucket_hierarchy_remove(
1631 sched_clutch_root_t root_clutch,
1632 sched_clutch_bucket_t clutch_bucket,
1633 sched_bucket_t bucket,
1634 uint64_t timestamp,
1635 __unused sched_clutch_bucket_options_t options)
1636 {
1637 sched_clutch_hierarchy_locked_assert(root_clutch);
1638 if (sched_clutch_bucket_is_above_timeshare(bucket) == false) {
1639 /* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */
1640 remqueue(&clutch_bucket->scb_listlink);
1641 }
1642 #if CONFIG_SCHED_EDGE
1643 sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
1644 #endif /* CONFIG_SCHED_EDGE */
1645
1646 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1647
1648 /* Remove the clutch bucket from the root bucket priority queue */
1649 sched_clutch_bucket_runq_remove(&root_bucket->scrb_clutch_buckets, clutch_bucket);
1650 os_atomic_store(&clutch_bucket->scb_root, NULL, relaxed);
1651
1652 /* If the root bucket priority queue is now empty, remove it from the root priority queue */
1653 if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1654 sched_clutch_root_bucket_empty(root_bucket, root_clutch, timestamp);
1655 }
1656 os_atomic_dec(&sched_clutch_global_bucket_load[bucket], relaxed);
1657 }
1658
1659 /*
1660 * sched_clutch_bucket_base_pri()
1661 *
1662 * Calculates the "base" priority of the clutch bucket, which is equal to the max of the
1663 * highest base_pri and the highest sched_pri in the clutch bucket.
1664 */
1665 static uint8_t
1666 sched_clutch_bucket_base_pri(
1667 sched_clutch_bucket_t clutch_bucket)
1668 {
1669 assert(priority_queue_empty(&clutch_bucket->scb_thread_runq) == false);
1670 /*
1671 * Since the clutch bucket can contain threads that are members of the group due
1672 * to the sched_pri being promoted or due to their base pri, the base priority of
1673 * the entire clutch bucket should be based on the highest thread (promoted or base)
1674 * in the clutch bucket.
1675 */
1676 uint8_t max_pri = 0;
1677 if (!priority_queue_empty(&clutch_bucket->scb_clutchpri_prioq)) {
1678 max_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
1679 }
1680 return max_pri;
1681 }
1682
1683 /*
1684 * sched_clutch_interactivity_from_cpu_data()
1685 *
1686 * Routine to calculate the interactivity score of a clutch bucket group from its CPU usage
1687 */
1688 static uint8_t
1689 sched_clutch_interactivity_from_cpu_data(sched_clutch_bucket_group_t clutch_bucket_group)
1690 {
1691 sched_clutch_bucket_cpu_data_t scb_cpu_data;
1692 scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, relaxed);
1693 clutch_cpu_data_t cpu_used = scb_cpu_data.cpu_data.scbcd_cpu_used;
1694 clutch_cpu_data_t cpu_blocked = scb_cpu_data.cpu_data.scbcd_cpu_blocked;
1695 uint8_t interactive_score = 0;
1696
1697 if ((cpu_blocked == 0) && (cpu_used == 0)) {
1698 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
1699 }
1700 /*
1701 * For all timeshare buckets, calculate the interactivity score of the bucket
1702 * and add it to the base priority
1703 */
1704 if (cpu_blocked > cpu_used) {
1705 /* Interactive clutch_bucket case */
1706 interactive_score = sched_clutch_bucket_group_interactive_pri +
1707 ((sched_clutch_bucket_group_interactive_pri * (cpu_blocked - cpu_used)) / cpu_blocked);
1708 } else {
1709 /* Non-interactive clutch_bucket case */
1710 interactive_score = ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) / cpu_used);
1711 }
1712 return interactive_score;
1713 }
1714
1715 /*
1716 * sched_clutch_bucket_pri_calculate()
1717 *
1718 * The priority calculation algorithm for the clutch_bucket is a slight
1719 * modification on the ULE interactivity score. It uses the base priority
1720 * of the clutch bucket and applies an interactivity score boost to the
1721 * highly responsive clutch buckets.
1722 */
1723 static uint8_t
1724 sched_clutch_bucket_pri_calculate(
1725 sched_clutch_bucket_t clutch_bucket,
1726 uint64_t timestamp)
1727 {
1728 /* For empty clutch buckets, return priority 0 */
1729 if (clutch_bucket->scb_thr_count == 0) {
1730 return 0;
1731 }
1732
1733 uint8_t base_pri = sched_clutch_bucket_base_pri(clutch_bucket);
1734 uint8_t interactive_score = sched_clutch_bucket_group_interactivity_score_calculate(clutch_bucket->scb_group, timestamp);
1735
1736 assert(((uint64_t)base_pri + interactive_score) <= UINT8_MAX);
1737 uint8_t pri = base_pri + interactive_score;
1738 if (pri != clutch_bucket->scb_priority) {
1739 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_PRI) | DBG_FUNC_NONE,
1740 thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, pri, interactive_score, 0);
1741 }
1742 return pri;
1743 }
1744
1745 /*
1746 * sched_clutch_root_bucket_highest_clutch_bucket()
1747 *
1748 * Routine to find the highest priority clutch bucket within the root bucket,
1749 * also considering the previous thread's clutch bucket when one is passed in.
1750 */
1751 static sched_clutch_bucket_t
1752 sched_clutch_root_bucket_highest_clutch_bucket(
1753 sched_clutch_root_t root_clutch,
1754 sched_clutch_root_bucket_t root_bucket,
1755 processor_t _Nullable processor,
1756 thread_t _Nullable prev_thread,
1757 bool *_Nullable chose_prev_thread)
1758 {
1759 if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1760 if (prev_thread != NULL) {
1761 *chose_prev_thread = true;
1762 return sched_clutch_bucket_for_thread(root_clutch, prev_thread);
1763 }
1764 return NULL;
1765 }
1766 sched_clutch_bucket_t clutch_bucket = sched_clutch_bucket_runq_peek(&root_bucket->scrb_clutch_buckets);
1767 /* Consider the Clutch bucket of the previous thread */
1768 if (prev_thread != NULL) {
1769 assert(chose_prev_thread != NULL);
1770 sched_clutch_bucket_group_t prev_clutch_bucket_group = sched_clutch_bucket_group_for_thread(prev_thread);
1771 int prev_clutch_bucket_pri = prev_thread->sched_pri + (int)(os_atomic_load(&prev_clutch_bucket_group->scbg_interactivity_data.scct_count, relaxed));
1772 sched_clutch_bucket_t prev_clutch_bucket = sched_clutch_bucket_for_thread(root_clutch, prev_thread);
1773 if (prev_clutch_bucket != clutch_bucket &&
1774 sched_clutch_pri_greater_than_tiebreak(prev_clutch_bucket_pri, clutch_bucket->scb_priority, processor->first_timeslice)) {
1775 *chose_prev_thread = true;
1776 return prev_clutch_bucket;
1777 }
1778 }
1779 return clutch_bucket;
1780 }
1781
1782 /*
1783 * sched_clutch_bucket_runnable()
1784 *
1785 * Perform all operations needed when a new clutch bucket becomes runnable.
1786 * It involves inserting the clutch_bucket into the hierarchy and updating the
1787 * root priority appropriately.
1788 */
1789 static boolean_t
1790 sched_clutch_bucket_runnable(
1791 sched_clutch_bucket_t clutch_bucket,
1792 sched_clutch_root_t root_clutch,
1793 uint64_t timestamp,
1794 sched_clutch_bucket_options_t options)
1795 {
1796 sched_clutch_hierarchy_locked_assert(root_clutch);
1797 /* Since the clutch bucket became newly runnable, update its pending timestamp */
1798 clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1799 sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1800
1801 /* Update the timesharing properties of this clutch_bucket_group; also done every sched_tick */
1802 sched_clutch_bucket_group_pri_shift_update(clutch_bucket->scb_group);
1803
1804 int16_t root_old_pri = root_clutch->scr_priority;
1805 sched_clutch_root_pri_update(root_clutch);
1806 return root_clutch->scr_priority > root_old_pri;
1807 }
1808
1809 /*
1810 * sched_clutch_bucket_update()
1811 *
1812 * Update the clutch_bucket's position in the hierarchy. This routine is
1813 * called when a new thread is inserted or removed from a runnable clutch
1814 * bucket. The options specify some properties about the clutch bucket
1815 * insertion order into the clutch bucket runq.
1816 */
1817 static boolean_t
1818 sched_clutch_bucket_update(
1819 sched_clutch_bucket_t clutch_bucket,
1820 sched_clutch_root_t root_clutch,
1821 uint64_t timestamp,
1822 sched_clutch_bucket_options_t options)
1823 {
1824 sched_clutch_hierarchy_locked_assert(root_clutch);
1825 uint64_t new_pri = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1826 sched_clutch_bucket_runq_t bucket_runq = &root_clutch->scr_unbound_buckets[clutch_bucket->scb_bucket].scrb_clutch_buckets;
1827 if (new_pri == clutch_bucket->scb_priority) {
1828 /*
1829 * If SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR is specified, move the clutch bucket
1830 * to the end of the runq. Typically used when a thread is selected for execution
1831 * from a clutch bucket.
1832 */
1833 if (options & SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR) {
1834 sched_clutch_bucket_runq_rotate(bucket_runq, clutch_bucket);
1835 }
1836 return false;
1837 }
1838 sched_clutch_bucket_runq_remove(bucket_runq, clutch_bucket);
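/* A priority change also invalidates the clutch bucket's position in the foreign buckets priority queue; remove it here and re-insert it at the new priority below */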
1839 #if CONFIG_SCHED_EDGE
1840 if (clutch_bucket->scb_foreign) {
1841 priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1842 }
1843 #endif /* CONFIG_SCHED_EDGE */
1844 clutch_bucket->scb_priority = new_pri;
1845 #if CONFIG_SCHED_EDGE
1846 if (clutch_bucket->scb_foreign) {
1847 priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
1848 priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1849 }
1850 #endif /* CONFIG_SCHED_EDGE */
1851 sched_clutch_bucket_runq_enqueue(bucket_runq, clutch_bucket, options);
1852
1853 int16_t root_old_pri = root_clutch->scr_priority;
1854 sched_clutch_root_pri_update(root_clutch);
1855 return root_clutch->scr_priority > root_old_pri;
1856 }
1857
1858 /*
1859 * sched_clutch_bucket_empty()
1860 *
1861 * Perform all the operations needed when a clutch_bucket is no longer runnable.
1862 * It involves removing the clutch bucket from the hierarchy and updating the root
1863 * priority appropriately.
1864 */
1865 static void
1866 sched_clutch_bucket_empty(
1867 sched_clutch_bucket_t clutch_bucket,
1868 sched_clutch_root_t root_clutch,
1869 uint64_t timestamp,
1870 sched_clutch_bucket_options_t options)
1871 {
1872 sched_clutch_hierarchy_locked_assert(root_clutch);
1873 assert3u(clutch_bucket->scb_thr_count, ==, 0);
1874 sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1875
1876 /* Update the timesharing properties of this clutch_bucket_group; also done every sched_tick */
1877 sched_clutch_bucket_group_pri_shift_update(clutch_bucket->scb_group);
1878
1879 clutch_bucket->scb_priority = 0;
1880 sched_clutch_root_pri_update(root_clutch);
1881 }
1882
1883 /*
1884 * sched_clutch_cpu_usage_update()
1885 *
1886 * Routine to update CPU usage of the thread in the hierarchy.
1887 */
1888 void
1889 sched_clutch_cpu_usage_update(
1890 thread_t thread,
1891 uint64_t delta)
1892 {
1893 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread) || SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1894 return;
1895 }
1896
1897 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1898 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
1899 sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, delta);
1900 }
1901
1902 /*
1903 * sched_clutch_bucket_group_cpu_usage_update()
1904 *
1905 * Routine to update the CPU usage of the clutch_bucket.
1906 */
1907 static void
1908 sched_clutch_bucket_group_cpu_usage_update(
1909 sched_clutch_bucket_group_t clutch_bucket_group,
1910 uint64_t delta)
1911 {
1912 if (sched_clutch_bucket_is_above_timeshare(clutch_bucket_group->scbg_bucket)) {
1913 /* Since the Above UI bucket always has the maximum interactivity score, nothing to do here */
1914 return;
1915 }
1916 delta = MIN(delta, sched_clutch_bucket_group_adjust_threshold);
1917 os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_used), (clutch_cpu_data_t)delta, relaxed);
1918 }
1919
1920 /*
1921 * sched_clutch_bucket_group_cpu_pending_adjust()
1922 *
1923 * Routine to calculate the adjusted CPU usage value based on the pending intervals. The calculation is done
1924 * such that one "pending interval" provides one point improvement in interactivity score.
1925 */
1926 static inline uint64_t
1927 sched_clutch_bucket_group_cpu_pending_adjust(
1928 uint64_t cpu_used,
1929 uint64_t cpu_blocked,
1930 uint8_t pending_intervals)
1931 {
1932 uint64_t cpu_used_adjusted = 0;
1933 if (cpu_blocked < cpu_used) {
1934 cpu_used_adjusted = (sched_clutch_bucket_group_interactive_pri * cpu_blocked * cpu_used);
1935 cpu_used_adjusted = cpu_used_adjusted / ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) + (cpu_used * pending_intervals));
1936 } else {
1937 uint64_t adjust_factor = (cpu_blocked * pending_intervals) / sched_clutch_bucket_group_interactive_pri;
1938 cpu_used_adjusted = (adjust_factor > cpu_used) ? 0 : (cpu_used - adjust_factor);
1939 }
1940 return cpu_used_adjusted;
1941 }
1942
1943 /*
1944 * sched_clutch_bucket_group_cpu_adjust()
1945 *
1946 * Routine to scale the cpu usage and blocked time once the sum gets bigger
1947 * than sched_clutch_bucket_group_adjust_threshold. Allows the values to remain
1948 * manageable and maintain the same ratio while allowing changes in a clutch
1949 * bucket's behavior to be reflected in its interactivity score in a reasonable
1950 * amount of time. Also adjusts the CPU usage based on pending_intervals,
1951 * which allows ageout of CPU to avoid starvation in highly contended scenarios.
1952 */
1953 static void
1954 sched_clutch_bucket_group_cpu_adjust(
1955 sched_clutch_bucket_group_t clutch_bucket_group,
1956 uint8_t pending_intervals)
1957 {
1958 sched_clutch_bucket_cpu_data_t old_cpu_data = {};
1959 sched_clutch_bucket_cpu_data_t new_cpu_data = {};
1960 os_atomic_rmw_loop(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, old_cpu_data.scbcd_cpu_data_packed, new_cpu_data.scbcd_cpu_data_packed, relaxed, {
1961 clutch_cpu_data_t cpu_used = old_cpu_data.cpu_data.scbcd_cpu_used;
1962 clutch_cpu_data_t cpu_blocked = old_cpu_data.cpu_data.scbcd_cpu_blocked;
1963
1964 if ((pending_intervals == 0) && (cpu_used + cpu_blocked) < sched_clutch_bucket_group_adjust_threshold) {
1965 /* No changes to the CPU used and blocked values */
1966 os_atomic_rmw_loop_give_up();
1967 }
1968 if ((cpu_used + cpu_blocked) >= sched_clutch_bucket_group_adjust_threshold) {
1969 /* Only keep the recent CPU history to better indicate how this TG has been behaving */
1970 cpu_used = cpu_used / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
1971 cpu_blocked = cpu_blocked / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
1972 }
1973 /* Use the shift passed in to ageout the CPU usage */
1974 cpu_used = (clutch_cpu_data_t)sched_clutch_bucket_group_cpu_pending_adjust(cpu_used, cpu_blocked, pending_intervals);
1975 new_cpu_data.cpu_data.scbcd_cpu_used = cpu_used;
1976 new_cpu_data.cpu_data.scbcd_cpu_blocked = cpu_blocked;
1977 });
1978 }
1979
1980 /*
1981 * Thread level scheduling algorithm
1982 *
1983 * The thread level scheduling algorithm uses the mach timeshare
1984 * decay based algorithm to achieve sharing between threads within the
1985 * same clutch bucket. The load/priority shifts etc. are all maintained
1986 * at the clutch bucket level and used for decay calculation of the
1987 * threads. The load sampling is still driven off the scheduler tick
1988 * for runnable clutch buckets (it does not use the new higher frequency
1989 * EWMA based load calculation). The idea is that the contention and load
1990 * within clutch_buckets should be limited enough to not see heavy decay
1991 * and timeshare effectively.
1992 */
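/*
 * In effect, each runnable thread's timeshare priority decays roughly as
 * sched_pri = base_pri - (sched_usage >> pri_shift), where the pri_shift is
 * derived from the load of the thread's clutch bucket group (see
 * sched_clutch_bucket_group_pri_shift_update() below) rather than from a
 * global run queue load.
 */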
1993
1994 /*
1995 * sched_clutch_thread_run_bucket_incr() / sched_clutch_run_bucket_incr()
1996 *
1997 * Increment the run count for the clutch bucket associated with the
1998 * thread.
1999 */
2000 uint32_t
2001 sched_clutch_thread_run_bucket_incr(
2002 thread_t thread,
2003 sched_bucket_t bucket)
2004 {
2005 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2006 return 0;
2007 }
2008 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2009 return sched_clutch_run_bucket_incr(clutch, bucket);
2010 }
2011
2012 static uint32_t
2013 sched_clutch_run_bucket_incr(
2014 sched_clutch_t clutch,
2015 sched_bucket_t bucket)
2016 {
2017 assert(bucket != TH_BUCKET_RUN);
2018 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
2019 return sched_clutch_bucket_group_run_count_inc(clutch_bucket_group);
2020 }
2021
2022 /*
2023 * sched_clutch_thread_run_bucket_decr() / sched_clutch_run_bucket_decr()
2024 *
2025 * Decrement the run count for the clutch bucket associated with the
2026 * thread.
2027 */
2028 uint32_t
2029 sched_clutch_thread_run_bucket_decr(
2030 thread_t thread,
2031 sched_bucket_t bucket)
2032 {
2033 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2034 return 0;
2035 }
2036 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2037 return sched_clutch_run_bucket_decr(clutch, bucket);
2038 }
2039
2040 static uint32_t
2041 sched_clutch_run_bucket_decr(
2042 sched_clutch_t clutch,
2043 sched_bucket_t bucket)
2044 {
2045 assert(bucket != TH_BUCKET_RUN);
2046 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
2047 return sched_clutch_bucket_group_run_count_dec(clutch_bucket_group);
2048 }
2049
2050 /*
2051 * sched_clutch_bucket_group_pri_shift_update()
2052 *
2053 * Routine to update the priority shift for a clutch bucket group,
2054 * necessary for timesharing correctly with priority decay within a
2055 * thread group + QoS.
2056 */
2057 static void
2058 sched_clutch_bucket_group_pri_shift_update(
2059 sched_clutch_bucket_group_t clutch_bucket_group)
2060 {
2061 if (sched_clutch_bucket_is_above_timeshare(clutch_bucket_group->scbg_bucket)) {
2062 /* No timesharing needed for fixed priority Above UI threads */
2063 return;
2064 }
2065
2066 /*
2067 * Update the timeshare parameters for the clutch bucket group
2068 * if they haven't been updated in this tick.
2069 */
2070 uint32_t sched_ts = os_atomic_load(&clutch_bucket_group->scbg_timeshare_tick, relaxed);
2071 uint32_t current_sched_ts = os_atomic_load(&sched_tick, relaxed);
2072 if (sched_ts < current_sched_ts) {
2073 os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, current_sched_ts, relaxed);
2074 /* NCPU wide workloads should not experience decay */
2075 uint64_t bucket_group_run_count = os_atomic_load_wide(&clutch_bucket_group->scbg_blocked_data.scct_count, relaxed) - 1;
2076 uint32_t bucket_group_load = (uint32_t)(bucket_group_run_count / processor_avail_count);
2077 bucket_group_load = MIN(bucket_group_load, NRQS - 1);
2078 uint32_t pri_shift = sched_fixed_shift - sched_load_shifts[bucket_group_load];
2079 /* Ensure that the pri_shift value is reasonable */
2080 pri_shift = (pri_shift > SCHED_PRI_SHIFT_MAX) ? INT8_MAX : pri_shift;
2081 os_atomic_store(&clutch_bucket_group->scbg_pri_shift, pri_shift, relaxed);
2082 }
2083 }
2084
2085 /*
2086 * sched_clutch_bucket_group_timeshare_update()
2087 *
2088 * Routine to update the priority shift and priority for the clutch_bucket_group
2089 * every sched_tick. For multi-cluster platforms, each QoS level will have multiple
2090 * clutch buckets with runnable threads in them. So it is important to maintain
2091 * the timesharing information at the clutch_bucket_group level instead of
2092 * individual clutch buckets (because the algorithm is trying to timeshare all
2093 * threads at the same QoS irrespective of which hierarchy they are enqueued in).
2094 *
2095 * The routine is called from the sched tick handling code to make sure this value
2096 * is updated at least once every sched tick. For clutch bucket groups which have
2097 * not been runnable for very long, the clutch_bucket_group maintains a "last
2098 * updated schedtick" parameter. As threads become runnable in the clutch bucket group,
2099 * if this value is outdated, we update the priority shift.
2100 *
2101 * Possible optimization:
2102 * - The current algorithm samples the load at most once every sched tick (125ms).
2103 * This is prone to spikes in runnable counts; if that turns out to be
2104 * a problem, a simple solution would be to do the EWMA trick to sample
2105 * load at every load_tick (30ms) and use the averaged value for the pri
2106 * shift calculation.
2107 */
2108 static void
2109 sched_clutch_bucket_group_timeshare_update(
2110 sched_clutch_bucket_group_t clutch_bucket_group,
2111 sched_clutch_bucket_t clutch_bucket,
2112 uint64_t ctime)
2113 {
2114 if (sched_clutch_bucket_is_above_timeshare(clutch_bucket_group->scbg_bucket)) {
2115 /* No timesharing needed for fixed priority Above UI threads */
2116 return;
2117 }
2118 sched_clutch_bucket_group_pri_shift_update(clutch_bucket_group);
2119 /*
2120 * Update the clutch bucket priority; this allows clutch buckets that have been pending
2121 * for a long time to get an updated interactivity score.
2122 */
2123 sched_clutch_bucket_update(clutch_bucket, clutch_bucket->scb_root, ctime, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
2124 }
2125
2126 /*
2127 * Calculate the CPU used by this thread and attribute it to the
2128 * thread's current scheduling bucket and clutch bucket group, or
2129 * a previous clutch bucket group if specified.
2130 * Also update the general scheduler CPU usage, matching
2131 * what we do for lightweight_update_priority().
2132 */
2133 static inline void
2134 sched_clutch_thread_tick_delta(thread_t thread, sched_clutch_bucket_group_t _Nullable clutch_bucket_group)
2135 {
2136 uint32_t cpu_delta;
2137 sched_tick_delta(thread, cpu_delta);
2138 if (thread->pri_shift < INT8_MAX) {
2139 thread->sched_usage += cpu_delta;
2140 }
2141 thread->cpu_delta += cpu_delta;
2142 if (clutch_bucket_group != NULL) {
2143 sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, cpu_delta);
2144 } else {
2145 sched_clutch_cpu_usage_update(thread, cpu_delta);
2146 }
2147 }
2148
2149 /*
2150 * sched_clutch_thread_clutch_update()
2151 *
2152 * Routine called when the thread changes its thread group. The current
2153 * implementation relies on the fact that the thread group is changed only from
2154 * the context of the thread itself or when the thread is runnable but not in a
2155 * runqueue. Due to this fact, the thread group change causes only counter
2156 * updates in the old & new clutch buckets and no hierarchy changes. The routine
2157 * also attributes the CPU used so far to the old clutch.
2158 */
2159 void
2160 sched_clutch_thread_clutch_update(
2161 thread_t thread,
2162 sched_clutch_t old_clutch,
2163 sched_clutch_t new_clutch)
2164 {
2165 if (old_clutch) {
2166 assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
2167
2168 sched_clutch_run_bucket_decr(old_clutch, thread->th_sched_bucket);
2169
2170 /* Attribute CPU usage with the old clutch */
2171 sched_clutch_bucket_group_t old_clutch_bucket_group = NULL;
2172 if (!SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
2173 old_clutch_bucket_group = &(old_clutch->sc_clutch_groups[thread->th_sched_bucket]);
2174 }
2175 sched_clutch_thread_tick_delta(thread, old_clutch_bucket_group);
2176 }
2177
2178 if (new_clutch) {
2179 sched_clutch_run_bucket_incr(new_clutch, thread->th_sched_bucket);
2180 }
2181 }
2182
2183 /* Thread Insertion/Removal/Selection routines */
2184
2185 #if CONFIG_SCHED_EDGE
2186
2187 /*
2188 * Edge Scheduler Bound Thread Support
2189 *
2190 * The edge scheduler allows threads to be bound to specific clusters. The scheduler
2191 * maintains a separate runq on the clutch root to hold these bound threads. These
2192 * bound threads count towards the root priority and thread count, but are ignored
2193 * for thread migration/steal decisions. Bound threads that are enqueued in the
2194 * separate runq have the th_bound_cluster_enqueued flag set to allow easy
2195 * removal.
2196 *
2197 * Bound Threads Timesharing
2198 * The bound threads share the timesharing properties of the clutch bucket group they are
2199 * part of. They contribute to the load and use priority shifts/decay values from the
2200 * clutch bucket group.
2201 */
2202
2203 static boolean_t
2204 sched_edge_bound_thread_insert(
2205 sched_clutch_root_t root_clutch,
2206 thread_t thread,
2207 integer_t options)
2208 {
2209 /* Update the clutch runnable count and priority */
2210 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2211 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
2212 if (root_bucket->scrb_bound_thread_runq.count == 0) {
2213 sched_clutch_root_bucket_runnable(root_bucket, root_clutch, mach_absolute_time());
2214 }
2215
2216 assert((thread->th_bound_cluster_enqueued) == false);
2217 run_queue_enqueue(&root_bucket->scrb_bound_thread_runq, thread, options);
2218 thread->th_bound_cluster_enqueued = true;
2219
2220 /*
2221 * Trigger an update to the thread's clutch bucket group's priority shift parameters,
2222 * needed for global timeshare within a clutch bucket group.
2223 */
2224 sched_clutch_bucket_group_pri_shift_update(sched_clutch_bucket_group_for_thread(thread));
2225
2226 /* Increment the urgency counter for the root if necessary */
2227 sched_clutch_root_urgency_inc(root_clutch, thread);
2228
2229 int16_t root_old_pri = root_clutch->scr_priority;
2230 sched_clutch_root_pri_update(root_clutch);
2231 return root_clutch->scr_priority > root_old_pri;
2232 }
2233
2234 static void
2235 sched_edge_bound_thread_remove(
2236 sched_clutch_root_t root_clutch,
2237 thread_t thread)
2238 {
2239 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
2240 assert((thread->th_bound_cluster_enqueued) == true);
2241 run_queue_remove(&root_bucket->scrb_bound_thread_runq, thread);
2242 thread->th_bound_cluster_enqueued = false;
2243
2244 /* Decrement the urgency counter for the root if necessary */
2245 sched_clutch_root_urgency_dec(root_clutch, thread);
2246
2247 /* Update the clutch runnable count and priority */
2248 sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
2249 if (root_bucket->scrb_bound_thread_runq.count == 0) {
2250 sched_clutch_root_bucket_empty(root_bucket, root_clutch, mach_absolute_time());
2251 }
2252 sched_clutch_root_pri_update(root_clutch);
2253
2254 /*
2255 * Trigger an update to the thread's clutch bucket group's priority shift parameters,
2256 * needed for global timeshare within a clutch bucket group.
2257 */
2258 sched_clutch_bucket_group_pri_shift_update(sched_clutch_bucket_group_for_thread(thread));
2259 }
2260
2261 /*
2262 * Edge Scheduler cluster shared resource threads load balancing
2263 *
2264 * The Edge scheduler attempts to load balance cluster shared resource intensive threads
2265 * across clusters in order to reduce contention on the shared resources. It achieves
2266 * that by maintaining the runnable and running shared resource load on each cluster
2267 * and balancing the load across multiple clusters.
2268 *
2269 * The current implementation for cluster shared resource load balancing looks at
2270 * the per-cluster load at thread runnable time to enqueue the thread in the appropriate
2271 * cluster. The thread is enqueued in the cluster bound runqueue to ensure idle CPUs
2272 * do not steal/rebalance shared resource threads. Some more details for the implementation:
2273 *
2274 * - When threads are tagged as shared resource, they go through the cluster selection logic
2275 * which looks at cluster shared resource loads and picks a cluster accordingly. The thread is
2276 * enqueued in the cluster bound runqueue.
2277 *
2278 * - When the threads start running and call avoid_processor, the load balancing logic will be
2279 * invoked and cause the thread to be sent to a more preferred cluster if one exists and has
2280 * no shared resource load.
2281 *
2282 * - If a CPU in a preferred cluster is going idle and that cluster has no more shared load,
2283 * it will look at running shared resource threads on foreign clusters and actively rebalance them.
2284 *
2285 * - Runnable shared resource threads are not stolen by the preferred cluster CPUs as they
2286 * go idle intentionally.
2287 *
2288 * - One caveat of this design is that if a preferred CPU has already run and finished its shared
2289 * resource thread execution, it will not go out and steal the runnable thread in the non-preferred cluster.
2290 * The rebalancing will happen when the thread actually runs on a non-preferred cluster and one of the
2291 * events listed above happen.
2292 *
2293 * - Also, it currently does not consider other properties such as thread priorities and
2294 * QoS-level thread load in the thread placement decision.
2295 *
2296 * Edge Scheduler cluster shared resource thread scheduling policy
2297 *
2298 * The threads for shared resources can be scheduled using one of the two policies:
2299 *
2300 * EDGE_SHARED_RSRC_SCHED_POLICY_RR
2301 * This policy distributes the threads so that they spread across all available clusters
2302 * irrespective of type. The idea is that this scheduling policy will put a shared resource
2303 * thread on each cluster on the platform before it starts doubling up on clusters.
2304 *
2305 * EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST
2306 * This policy distributes threads so that the threads first fill up all the capacity on
2307 * the preferred cluster and its homogeneous peers before spilling to different core type.
2308 * The current implementation defines capacity based on the number of CPUs in the cluster;
2309 * so a cluster's shared resource is considered full if there are "n" runnable + running
2310 * shared resource threads on the cluster with n cpus. This policy is different from the
2311 * default scheduling policy of the edge scheduler since this always tries to fill up the
2312 * native clusters to capacity even when non-native clusters might be idle.
2313 */
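/*
 * Illustrative example: under EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST, a
 * preferred cluster with 4 CPUs keeps accepting shared resource threads until
 * it has 4 runnable + running, and only then spills to its homogeneous peers
 * and eventually to clusters of a different core type. Under
 * EDGE_SHARED_RSRC_SCHED_POLICY_RR, the same threads would be spread one per
 * cluster (irrespective of core type) before any cluster gets a second one.
 */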
2314 __options_decl(edge_shared_rsrc_sched_policy_t, uint32_t, {
2315 EDGE_SHARED_RSRC_SCHED_POLICY_RR = 0,
2316 EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST = 1,
2317 });
2318
2319 static const edge_shared_rsrc_sched_policy_t edge_shared_rsrc_policy[CLUSTER_SHARED_RSRC_TYPE_COUNT] = {
2320 [CLUSTER_SHARED_RSRC_TYPE_RR] = EDGE_SHARED_RSRC_SCHED_POLICY_RR,
2321 [CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST,
2322 };
2323
2324 static void
2325 sched_edge_shared_rsrc_runnable_load_incr(sched_clutch_root_t root_clutch, thread_t thread)
2326 {
2327 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
2328 root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_RR]++;
2329 thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_RR] = true;
2330 }
2331 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
2332 root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST]++;
2333 thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = true;
2334 }
2335 }
2336
2337 static void
2338 sched_edge_shared_rsrc_runnable_load_decr(sched_clutch_root_t root_clutch, thread_t thread)
2339 {
2340 for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
2341 if (thread->th_shared_rsrc_enqueued[shared_rsrc_type]) {
2342 thread->th_shared_rsrc_enqueued[shared_rsrc_type] = false;
2343 root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type]--;
2344 }
2345 }
2346 }
2347
2348 uint16_t
2349 sched_edge_shared_rsrc_runnable_load(sched_clutch_root_t root_clutch, cluster_shared_rsrc_type_t shared_rsrc_type)
2350 {
2351 return root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type];
2352 }
2353
2354 /*
2355 * sched_edge_shared_rsrc_idle()
2356 *
2357 * Routine used to determine if the constrained resource for the pset is idle. This is
2358 * used by a CPU going idle to decide if it should rebalance a running shared resource
2359 * thread from a non-preferred cluster.
2360 */
2361 static boolean_t
2362 sched_edge_shared_rsrc_idle(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
2363 {
2364 return sched_pset_cluster_shared_rsrc_load(pset, shared_rsrc_type) == 0;
2365 }
2366
2367 /*
2368 * sched_edge_thread_shared_rsrc_type
2369 *
2370 * This routine decides if a given thread needs special handling for being a
2371 * heavy shared resource user. It is valid for the same thread to be using
2372 * several shared resources at the same time and have multiple policy flags set.
2373 * This routine determines which of those properties will be used for load
2374 * balancing and migration decisions.
2375 */
2376 static cluster_shared_rsrc_type_t
2377 sched_edge_thread_shared_rsrc_type(thread_t thread)
2378 {
2379 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
2380 return CLUSTER_SHARED_RSRC_TYPE_RR;
2381 }
2382 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
2383 return CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST;
2384 }
2385 return CLUSTER_SHARED_RSRC_TYPE_NONE;
2386 }
2387
2388 #endif /* CONFIG_SCHED_EDGE */
2389
2390 /*
2391 * sched_clutch_thread_bound_lookup()
2392 *
2393 * Routine to lookup the highest priority runnable thread in a bounded root bucket.
2394 */
2395 static thread_t
2396 sched_clutch_thread_bound_lookup(
2397 __unused sched_clutch_root_t root_clutch,
2398 sched_clutch_root_bucket_t root_bucket,
2399 processor_t processor,
2400 thread_t _Nullable prev_thread)
2401 {
2402 assert(root_bucket->scrb_bound == true);
2403 thread_t bound_thread = run_queue_peek(&root_bucket->scrb_bound_thread_runq);
2404 if ((prev_thread != NULL) &&
2405 (bound_thread == NULL || sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, bound_thread->sched_pri, processor->first_timeslice))) {
2406 return prev_thread;
2407 }
2408 assert(bound_thread != THREAD_NULL);
2409 return bound_thread;
2410 }
2411
2412 /*
2413 * Clutch Bucket Group Thread Counts and Pending time calculation
2414 *
2415 * The pending time on the clutch_bucket_group allows the scheduler to track if it
2416 * needs to ageout the CPU usage because the clutch_bucket_group has been pending for
2417 * a very long time. The pending time is set to the timestamp as soon as a thread becomes
2418 * runnable. When a thread is picked up for execution from this clutch_bucket_group, the
2419 * pending time is advanced to the time of thread selection.
2420 *
2421 * Since threads for a clutch bucket group can be added or removed from multiple CPUs
2422 * simultaneously, it is important that the updates to thread counts and pending timestamps
2423 * happen atomically. The implementation relies on the following aspects to make that work
2424 * as expected:
2425 * - The clutch scheduler would be deployed on single cluster platforms where the pset lock
2426 * is held when threads are added/removed and pending timestamps are updated
2427 * - The thread count and pending timestamp can be updated atomically using double wide
2428 * 128 bit atomics
2429 *
2430 * Clutch bucket group interactivity timestamp and score updates also rely on the properties
2431 * above to atomically update the interactivity score for a clutch bucket group.
2432 */
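/*
 * Illustrative sketch (not part of the kernel build): the count and timestamp
 * are packed into a single value so that both fields can be updated with one
 * atomic compare-and-exchange; when the count goes from zero to one, the
 * pending window timestamp is (re)started in the same atomic update. The
 * userspace analogue below is a hypothetical example assuming a compiler with
 * __int128 and C11 atomics; the names counter_time_t and counter_time_runnable
 * exist only for illustration.
 *
 *	#include <stdatomic.h>
 *	#include <stdint.h>
 *
 *	typedef union {
 *		struct {
 *			uint64_t count;
 *			uint64_t timestamp;
 *		};
 *		unsigned __int128 packed;
 *	} counter_time_t;
 *
 *	static void
 *	counter_time_runnable(_Atomic unsigned __int128 *state, uint64_t now)
 *	{
 *		counter_time_t oldv, newv;
 *		oldv.packed = atomic_load_explicit(state, memory_order_relaxed);
 *		do {
 *			newv = oldv;
 *			newv.count = oldv.count + 1;
 *			if (oldv.count == 0) {
 *				newv.timestamp = now;
 *			}
 *		} while (!atomic_compare_exchange_weak_explicit(state, &oldv.packed,
 *		    newv.packed, memory_order_relaxed, memory_order_relaxed));
 *	}
 */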
2433
2434 #if CONFIG_SCHED_EDGE
2435
2436 static void
2437 sched_clutch_bucket_group_thr_count_inc(
2438 sched_clutch_bucket_group_t clutch_bucket_group,
2439 uint64_t timestamp)
2440 {
2441 sched_clutch_counter_time_t old_pending_data;
2442 sched_clutch_counter_time_t new_pending_data;
2443 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2444 new_pending_data.scct_count = old_pending_data.scct_count + 1;
2445 new_pending_data.scct_timestamp = old_pending_data.scct_timestamp;
2446 if (old_pending_data.scct_count == 0) {
2447 new_pending_data.scct_timestamp = timestamp;
2448 }
2449 });
2450 }
2451
2452 static void
2453 sched_clutch_bucket_group_thr_count_dec(
2454 sched_clutch_bucket_group_t clutch_bucket_group,
2455 uint64_t timestamp)
2456 {
2457 sched_clutch_counter_time_t old_pending_data;
2458 sched_clutch_counter_time_t new_pending_data;
2459 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2460 new_pending_data.scct_count = old_pending_data.scct_count - 1;
2461 if (new_pending_data.scct_count == 0) {
2462 new_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2463 } else {
2464 new_pending_data.scct_timestamp = timestamp;
2465 }
2466 });
2467 }
2468
2469 static uint8_t
2470 sched_clutch_bucket_group_pending_ageout(
2471 sched_clutch_bucket_group_t clutch_bucket_group,
2472 uint64_t timestamp)
2473 {
2474 int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2475 sched_clutch_counter_time_t old_pending_data;
2476 sched_clutch_counter_time_t new_pending_data;
2477 uint8_t cpu_usage_shift = 0;
2478
2479 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2480 cpu_usage_shift = 0;
2481 uint64_t old_pending_ts = old_pending_data.scct_timestamp;
2482 bool old_update = (old_pending_ts >= timestamp);
2483 bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2484 bool no_bucket_load = (bucket_load == 0);
2485 if (old_update || no_pending_time || no_bucket_load) {
2486 os_atomic_rmw_loop_give_up();
2487 }
2488
2489 /* Calculate the time the clutch bucket group has been pending */
2490 uint64_t pending_delta = timestamp - old_pending_ts;
2491 /*
2492 * Other buckets should get a chance to run first before artificially boosting
2493 * this clutch bucket group's interactivity score, at least when the entire root
2494 * bucket is getting a large enough share of CPU.
2495 */
2496 uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] + (bucket_load * sched_clutch_thread_quantum[clutch_bucket_group->scbg_bucket]);
2497 if (pending_delta < interactivity_delta) {
2498 os_atomic_rmw_loop_give_up();
2499 }
2500 cpu_usage_shift = (pending_delta / interactivity_delta);
2501 new_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2502 new_pending_data.scct_count = old_pending_data.scct_count;
2503 });
2504 return cpu_usage_shift;
2505 }
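/*
 * Worked example for the ageout math above (all values hypothetical): with a
 * per-bucket pending delta of 10ms, a bucket_load of 1 and a 10ms bucket
 * quantum, interactivity_delta is 20ms. A clutch bucket group that has been
 * pending for 50ms yields cpu_usage_shift = 50 / 20 = 2, and the stored pending
 * timestamp advances by 2 * 20ms = 40ms, so the remaining 10ms carries over
 * towards the next ageout interval. The returned shift is then used to age the
 * group's CPU usage via sched_clutch_bucket_group_cpu_adjust().
 */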
2506
2507 static boolean_t
2508 sched_edge_thread_should_be_inserted_as_bound(
2509 sched_clutch_root_t root_clutch,
2510 thread_t thread)
2511 {
2512 /*
2513 * Check if the thread is bound and is being enqueued in its desired bound cluster.
2514 * If the thread is cluster-bound but to a different cluster, we should enqueue as unbound.
2515 */
2516 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) && (sched_edge_thread_bound_cluster_id(thread) == root_clutch->scr_cluster_id)) {
2517 return TRUE;
2518 }
2519 /*
2520 * Use bound runqueue for shared resource threads. See "cluster shared resource
2521 * threads load balancing" section for details.
2522 */
2523 if (sched_edge_thread_shared_rsrc_type(thread) != CLUSTER_SHARED_RSRC_TYPE_NONE) {
2524 return TRUE;
2525 }
2526 return FALSE;
2527 }
2528
2529 #else /* CONFIG_SCHED_EDGE */
2530
2531 /*
2532 * For the clutch scheduler, atomicity is ensured by performing all operations
2533 * under the pset lock of the only cluster present on the platform.
2534 */
2535 static void
2536 sched_clutch_bucket_group_thr_count_inc(
2537 sched_clutch_bucket_group_t clutch_bucket_group,
2538 uint64_t timestamp)
2539 {
2540 sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2541 if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2542 clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2543 }
2544 clutch_bucket_group->scbg_pending_data.scct_count++;
2545 }
2546
2547 static void
2548 sched_clutch_bucket_group_thr_count_dec(
2549 sched_clutch_bucket_group_t clutch_bucket_group,
2550 uint64_t timestamp)
2551 {
2552 sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2553 clutch_bucket_group->scbg_pending_data.scct_count--;
2554 if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2555 clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2556 } else {
2557 clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2558 }
2559 }
2560
2561 static uint8_t
2562 sched_clutch_bucket_group_pending_ageout(
2563 sched_clutch_bucket_group_t clutch_bucket_group,
2564 uint64_t timestamp)
2565 {
2566 sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2567 int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2568 uint64_t old_pending_ts = clutch_bucket_group->scbg_pending_data.scct_timestamp;
2569 bool old_update = (old_pending_ts >= timestamp);
2570 bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2571 bool no_bucket_load = (bucket_load == 0);
2572 if (old_update || no_pending_time || no_bucket_load) {
2573 return 0;
2574 }
2575 uint64_t pending_delta = timestamp - old_pending_ts;
2576 /*
2577 * Other buckets should get a chance to run first before artificially boosting
2578 * this clutch bucket group's interactivity score, at least when the entire root
2579 * bucket is getting a large enough share of CPU.
2580 */
2581 uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] + (bucket_load * sched_clutch_thread_quantum[clutch_bucket_group->scbg_bucket]);
2582 if (pending_delta < interactivity_delta) {
2583 return 0;
2584 }
2585 uint8_t cpu_usage_shift = (pending_delta / interactivity_delta);
2586 clutch_bucket_group->scbg_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2587 return cpu_usage_shift;
2588 }
2589
2590 #endif /* CONFIG_SCHED_EDGE */
2591
2592 static uint8_t
2593 sched_clutch_bucket_group_interactivity_score_calculate(
2594 sched_clutch_bucket_group_t clutch_bucket_group,
2595 uint64_t timestamp)
2596 {
2597 if (sched_clutch_bucket_is_above_timeshare(clutch_bucket_group->scbg_bucket)) {
2598 /*
2599 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
2600 * priorities, make sure all AboveUI buckets are marked interactive.
2601 */
2602 assert(clutch_bucket_group->scbg_interactivity_data.scct_count == (2 * sched_clutch_bucket_group_interactive_pri));
2603 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2604 }
2605 /* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */
2606 uint8_t pending_intervals = sched_clutch_bucket_group_pending_ageout(clutch_bucket_group, timestamp);
2607 /* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */
2608 sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group, pending_intervals);
2609 uint8_t interactivity_score = sched_clutch_interactivity_from_cpu_data(clutch_bucket_group);
2610 /* Write back any interactivity score update */
2611 #if CONFIG_SCHED_EDGE
2612 sched_clutch_counter_time_t old_interactivity_data;
2613 sched_clutch_counter_time_t new_interactivity_data;
2614 os_atomic_rmw_loop(&clutch_bucket_group->scbg_interactivity_data.scct_packed, old_interactivity_data.scct_packed, new_interactivity_data.scct_packed, relaxed, {
2615 new_interactivity_data.scct_count = old_interactivity_data.scct_count;
2616 if (old_interactivity_data.scct_timestamp >= timestamp) {
2617 os_atomic_rmw_loop_give_up();
2618 }
2619 new_interactivity_data.scct_timestamp = timestamp;
2620 if (old_interactivity_data.scct_timestamp != 0) {
2621 new_interactivity_data.scct_count = interactivity_score;
2622 }
2623 });
2624 return (uint8_t)new_interactivity_data.scct_count;
2625 #else /* !CONFIG_SCHED_EDGE */
2626 sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2627 if (timestamp > clutch_bucket_group->scbg_interactivity_data.scct_timestamp) {
2628 clutch_bucket_group->scbg_interactivity_data.scct_count = interactivity_score;
2629 clutch_bucket_group->scbg_interactivity_data.scct_timestamp = timestamp;
2630 }
2631 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2632 #endif /* !CONFIG_SCHED_EDGE */
2633 }
2634
2635 /*
2636 * Clutch Bucket Group Run Count and Blocked Time Accounting
2637 *
2638 * The clutch bucket group maintains the number of runnable/running threads in the group.
2639 * Since the blocked time of the clutch bucket group is based on this count, it is
2640 * important to make sure the blocking timestamp and the run count are updated atomically.
2641 *
2642 * Since the run count increments happen without any pset locks held, the scheduler updates
2643 * the count & timestamp using double wide 128 bit atomics.
2644 */
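/*
 * Illustrative timeline (hypothetical values): the last runnable thread of a
 * clutch bucket group blocks at t = 100ms, so run_count_dec drops scct_count to
 * 0 and records 100ms as the blocked timestamp. When a thread in the group
 * becomes runnable again at t = 160ms, run_count_inc sees the valid timestamp,
 * invalidates it as part of the atomic count update, and then credits
 * MIN(60ms, sched_clutch_bucket_group_adjust_threshold) of blocked time to the
 * group's CPU usage data.
 */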
2645
2646 static uint32_t
2647 sched_clutch_bucket_group_run_count_inc(
2648 sched_clutch_bucket_group_t clutch_bucket_group)
2649 {
2650 sched_clutch_counter_time_t old_blocked_data;
2651 sched_clutch_counter_time_t new_blocked_data;
2652
2653 bool update_blocked_time = false;
2654 os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2655 new_blocked_data.scct_count = old_blocked_data.scct_count + 1;
2656 new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2657 update_blocked_time = false;
2658 if (old_blocked_data.scct_count == 0) {
2659 new_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
2660 update_blocked_time = true;
2661 }
2662 });
2663 if (update_blocked_time && (old_blocked_data.scct_timestamp != SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID)) {
2664 uint64_t ctime = mach_absolute_time();
2665 if (ctime > old_blocked_data.scct_timestamp) {
2666 uint64_t blocked_time = ctime - old_blocked_data.scct_timestamp;
2667 blocked_time = MIN(blocked_time, sched_clutch_bucket_group_adjust_threshold);
2668 os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed);
2669 }
2670 }
2671 return (uint32_t)new_blocked_data.scct_count;
2672 }
2673
2674 static uint32_t
2675 sched_clutch_bucket_group_run_count_dec(
2676 sched_clutch_bucket_group_t clutch_bucket_group)
2677 {
2678 sched_clutch_counter_time_t old_blocked_data;
2679 sched_clutch_counter_time_t new_blocked_data;
2680
2681 uint64_t ctime = mach_absolute_time();
2682 os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2683 new_blocked_data.scct_count = old_blocked_data.scct_count - 1;
2684 new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2685 if (new_blocked_data.scct_count == 0) {
2686 new_blocked_data.scct_timestamp = ctime;
2687 }
2688 });
2689 return (uint32_t)new_blocked_data.scct_count;
2690 }
2691
2692 static inline sched_clutch_bucket_t
2693 sched_clutch_bucket_for_thread(
2694 sched_clutch_root_t root_clutch,
2695 thread_t thread)
2696 {
2697 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2698 assert(thread->thread_group == clutch->sc_tg);
2699
2700 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2701 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2702 assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
2703
2704 return clutch_bucket;
2705 }
2706
2707 static inline sched_clutch_bucket_group_t
2708 sched_clutch_bucket_group_for_thread(thread_t prev_thread)
2709 {
2710 sched_clutch_t clutch = sched_clutch_for_thread_group(prev_thread->thread_group);
2711 return &clutch->sc_clutch_groups[prev_thread->th_sched_bucket];
2712 }
2713
2714 /*
2715 * sched_clutch_thread_insert()
2716 *
2717 * Routine to insert a thread into the sched clutch hierarchy.
2718 * Update the counts at all levels of the hierarchy and insert the nodes
2719 * as they become runnable. Always called with the pset lock held.
2720 */
2721 static boolean_t
2722 sched_clutch_thread_insert(
2723 sched_clutch_root_t root_clutch,
2724 thread_t thread,
2725 integer_t options)
2726 {
2727 boolean_t result = FALSE;
2728
2729 sched_clutch_hierarchy_locked_assert(root_clutch);
2730 #if CONFIG_SCHED_EDGE
2731 sched_edge_cluster_cumulative_count_incr(root_clutch, thread->th_sched_bucket);
2732 sched_edge_shared_rsrc_runnable_load_incr(root_clutch, thread);
2733
2734 if (sched_edge_thread_should_be_inserted_as_bound(root_clutch, thread)) {
2735 /*
2736 * Includes threads bound to this specific cluster as well as all
2737 * shared resource threads.
2738 */
2739 return sched_edge_bound_thread_insert(root_clutch, thread, options);
2740 }
2741 #endif /* CONFIG_SCHED_EDGE */
2742
2743 uint64_t current_timestamp = mach_absolute_time();
2744 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2745 assert(thread->thread_group == clutch->sc_tg);
2746 sched_clutch_bucket_t clutch_bucket = sched_clutch_bucket_for_thread(root_clutch, thread);
2747 assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
2748
2749 /*
2750 * Thread linkage in clutch_bucket
2751 *
2752 * A thread has a few linkages within the clutch bucket:
2753 * - A stable priority queue linkage which is the main runqueue (based on sched_pri) for the clutch bucket
2754 * - A regular priority queue linkage which is based on thread's base/promoted pri (used for clutch bucket priority calculation)
2755 * - A queue linkage used for timesharing operations of threads at the scheduler tick
2756 */
2757
2758 /* Insert thread into the clutch_bucket stable priority runqueue using sched_pri */
2759 thread->th_clutch_runq_link.stamp = current_timestamp;
2760 priority_queue_entry_set_sched_pri(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link, thread->sched_pri,
2761 (options & SCHED_TAILQ) ? PRIORITY_QUEUE_ENTRY_NONE : PRIORITY_QUEUE_ENTRY_PREEMPTED);
2762 priority_queue_insert(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2763
2764 /* Insert thread into clutch_bucket priority queue based on the promoted or base priority */
2765 priority_queue_entry_set_sched_pri(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link,
2766 sched_thread_sched_pri_promoted(thread) ? thread->sched_pri : thread->base_pri, false);
2767 priority_queue_insert(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2768
2769 /* Insert thread into timesharing queue of the clutch bucket */
2770 enqueue_tail(&clutch_bucket->scb_thread_timeshare_queue, &thread->th_clutch_timeshare_link);
2771
2772 /* Increment the urgency counter for the root if necessary */
2773 sched_clutch_root_urgency_inc(root_clutch, thread);
2774
2775 os_atomic_inc(&clutch->sc_thr_count, relaxed);
2776 sched_clutch_bucket_group_thr_count_inc(clutch_bucket->scb_group, current_timestamp);
2777
2778 /* Enqueue the clutch into the hierarchy (if needed) and update properties; pick the insertion order based on thread options */
2779 sched_clutch_bucket_options_t scb_options = (options & SCHED_HEADQ) ? SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ : SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ;
2780 if (clutch_bucket->scb_thr_count == 0) {
2781 sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2782 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2783 result = sched_clutch_bucket_runnable(clutch_bucket, root_clutch, current_timestamp, scb_options);
2784 } else {
2785 sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2786 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2787 result = sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, scb_options);
2788 }
2789
2790 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2791 root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2792 SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2793 return result;
2794 }
2795
2796 /*
2797 * sched_clutch_thread_remove()
2798 *
2799 * Routine to remove a thread from the sched clutch hierarchy.
2800 * Update the counts at all levels of the hierarchy and remove the nodes
2801 * as they become empty. Always called with the pset lock held.
2802 */
2803 static void
2804 sched_clutch_thread_remove(
2805 sched_clutch_root_t root_clutch,
2806 thread_t thread,
2807 uint64_t current_timestamp,
2808 sched_clutch_bucket_options_t options)
2809 {
2810 sched_clutch_hierarchy_locked_assert(root_clutch);
2811 #if CONFIG_SCHED_EDGE
2812 sched_edge_cluster_cumulative_count_decr(root_clutch, thread->th_sched_bucket);
2813 sched_edge_shared_rsrc_runnable_load_decr(root_clutch, thread);
2814
2815 if (thread->th_bound_cluster_enqueued) {
2816 sched_edge_bound_thread_remove(root_clutch, thread);
2817 return;
2818 }
2819 #endif /* CONFIG_SCHED_EDGE */
2820 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2821 assert(thread->thread_group == clutch->sc_tg);
2822 thread_assert_runq_nonnull(thread);
2823
2824 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2825 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2826 assert(clutch_bucket->scb_root == root_clutch);
2827
2828 /* Decrement the urgency counter for the root if necessary */
2829 sched_clutch_root_urgency_dec(root_clutch, thread);
2830 /* Remove thread from the clutch_bucket */
2831 priority_queue_remove(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2832 remqueue(&thread->th_clutch_timeshare_link);
2833
2834 priority_queue_remove(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2835
2836 /*
2837 * Warning: After this point, the thread's scheduling fields may be
2838 * modified by other cores that acquire the thread lock.
2839 */
2840 thread_clear_runq(thread);
2841
2842 /* Update counts at various levels of the hierarchy */
2843 os_atomic_dec(&clutch->sc_thr_count, relaxed);
2844 sched_clutch_bucket_group_thr_count_dec(clutch_bucket->scb_group, current_timestamp);
2845 sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
2846 sched_clutch_thr_count_dec(&clutch_bucket->scb_thr_count);
2847
2848 /* Remove the clutch from hierarchy (if needed) and update properties */
2849 if (clutch_bucket->scb_thr_count == 0) {
2850 sched_clutch_bucket_empty(clutch_bucket, root_clutch, current_timestamp, options);
2851 } else {
2852 sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, options);
2853 }
2854
2855 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2856 root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2857 SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2858 }
2859
2860 /*
2861 * sched_clutch_thread_unbound_lookup()
2862 *
2863 * Routine to find the highest priority unbound thread in the root clutch.
2864 * Helps find threads easily for steal/migrate scenarios in the
2865 * Edge scheduler.
2866 */
2867 static thread_t
2868 sched_clutch_thread_unbound_lookup(
2869 sched_clutch_root_t root_clutch,
2870 sched_clutch_root_bucket_t root_bucket,
2871 processor_t _Nullable processor,
2872 thread_t _Nullable prev_thread)
2873 {
2874 assert(processor != NULL || prev_thread == NULL);
2875 assert(root_bucket->scrb_bound == false);
2876 sched_clutch_hierarchy_locked_assert(root_clutch);
2877
2878 /* Find the highest priority clutch bucket in this root bucket */
2879 bool chose_prev_thread = false;
2880 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, root_bucket, processor, prev_thread, &chose_prev_thread);
2881 assert(clutch_bucket != NULL);
2882
2883 if (chose_prev_thread) {
2884 /* We have determined that prev_thread is the highest thread, based on the Clutch bucket level policy */
2885 assert(processor != NULL && prev_thread != NULL);
2886 return prev_thread;
2887 }
2888
2889 /* Find the highest priority runnable thread in this clutch bucket */
2890 thread_t thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
2891 assert(thread != NULL);
2892
2893 /* Consider the previous thread */
2894 if (prev_thread != NULL &&
2895 sched_clutch_bucket_for_thread(root_clutch, prev_thread) == clutch_bucket &&
2896 sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, thread->sched_pri, processor->first_timeslice)) {
2897 thread = prev_thread;
2898 }
2899
2900 return thread;
2901 }
2902
2903 static sched_clutch_root_bucket_t
2904 sched_clutch_root_bucket_for_thread(
2905 sched_clutch_root_t root_clutch,
2906 thread_t prev_thread)
2907 {
2908 #if CONFIG_SCHED_EDGE
2909 if (sched_edge_thread_should_be_inserted_as_bound(root_clutch, prev_thread)) {
2910 return &root_clutch->scr_bound_buckets[prev_thread->th_sched_bucket];
2911 }
2912 #endif /* CONFIG_SCHED_EDGE */
2913 return &root_clutch->scr_unbound_buckets[prev_thread->th_sched_bucket];
2914 }
2915
2916 /*
2917 * sched_clutch_hierarchy_thread_highest()
2918 *
2919 * Routine to traverse the Clutch hierarchy and return the highest thread which
2920 * should be selected to run next, optionally comparing against the previously
2921 * running thread. Removes the highest thread with sched_clutch_thread_remove()
2922 * depending on the traverse mode and whether it is the previously running thread.
2923 * Always called with the pset lock held.
2924 */
2925 static thread_t
2926 sched_clutch_hierarchy_thread_highest(
2927 sched_clutch_root_t root_clutch,
2928 processor_t processor,
2929 thread_t _Nullable prev_thread,
2930 sched_clutch_traverse_mode_t mode)
2931 {
2932 assert(mode != SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || prev_thread == NULL);
2933 sched_clutch_hierarchy_locked_assert(root_clutch);
2934
2935 thread_t highest_thread = NULL;
2936 uint64_t current_timestamp = mach_absolute_time();
2937 bool chose_prev_thread = false;
2938 sched_clutch_dbg_thread_select_packed_t debug_info = {0};
2939 sched_clutch_root_bucket_t prev_root_bucket = prev_thread != NULL ? sched_clutch_root_bucket_for_thread(root_clutch, prev_thread) : NULL;
2940 sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(root_clutch, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL, prev_root_bucket, prev_thread, &chose_prev_thread, mode, &debug_info);
2941 if (chose_prev_thread) {
2942 /* We disambiguated that we want to keep running the previous thread */
2943 highest_thread = processor->active_thread;
2944 goto done_selecting_thread;
2945 }
2946 if (root_bucket == NULL) {
2947 /* The Clutch hierarchy has no runnable threads, including the previous thread */
2948 assert(sched_clutch_root_count(root_clutch) == 0);
2949 assert(prev_thread == NULL);
2950 return NULL;
2951 }
2952 if (root_bucket != prev_root_bucket) {
2953 /* We have ruled out continuing to run the previous thread, based on the root bucket level policy */
2954 prev_thread = NULL;
2955 assert((mode == SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT) || (prev_root_bucket == NULL) ||
2956 (prev_root_bucket->scrb_bucket >= root_bucket->scrb_bucket) || (root_bucket->scrb_starvation_avoidance) ||
2957 (prev_root_bucket->scrb_bound != root_bucket->scrb_bound) ||
2958 (root_bucket->scrb_warp_remaining > 0 && root_bucket->scrb_warped_deadline > current_timestamp && prev_root_bucket->scrb_warp_remaining == 0));
2959 }
2960
2961 if (root_bucket->scrb_bound) {
2962 highest_thread = sched_clutch_thread_bound_lookup(root_clutch, root_bucket, processor, prev_thread);
2963 } else {
2964 highest_thread = sched_clutch_thread_unbound_lookup(root_clutch, root_bucket, processor, prev_thread);
2965 }
2966
2967 if (mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY ||
2968 (mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT && highest_thread != processor->active_thread)) {
2969 assert(mode != SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || highest_thread != processor->active_thread);
2970 sched_clutch_thread_remove(root_clutch, highest_thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
2971 }
2972
2973 done_selecting_thread:
2974 debug_info.trace_data.version = SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION;
2975 debug_info.trace_data.traverse_mode = mode;
2976 debug_info.trace_data.cluster_id = root_clutch->scr_cluster_id;
2977 debug_info.trace_data.selection_was_cluster_bound = root_bucket->scrb_bound;
2978 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
2979 thread_tid(highest_thread), thread_group_get_id(highest_thread->thread_group), root_bucket->scrb_bucket, debug_info.scdts_trace_data_packed, 0);
2980 return highest_thread;
2981 }
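/*
 * Informal summary of the traverse modes as used above:
 * - SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT only peeks at the hierarchy and never
 *   removes the selected thread (used by the preemption check).
 * - SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY always removes the selection;
 *   the previously running thread is never a candidate in this mode.
 * - SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT removes the selection only
 *   when it is not the currently running thread.
 */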
2982
2983 /* High level global accessor routines */
2984
2985 /*
2986 * sched_clutch_root_urgency()
2987 *
2988 * Routine to get the urgency of the highest runnable
2989 * thread in the hierarchy.
2990 */
2991 static uint32_t
2992 sched_clutch_root_urgency(
2993 sched_clutch_root_t root_clutch)
2994 {
2995 return root_clutch->scr_urgency;
2996 }
2997
2998 /*
2999 * sched_clutch_root_count_sum()
3000 *
3001 * The count_sum mechanism is used for scheduler runq
3002 * statistics calculation. It is only useful for debugging
3003 * purposes; since it takes a mach_absolute_time() call on
3004 * other scheduler implementations, it is better to avoid
3005 * populating this until absolutely necessary.
3006 */
3007 static uint32_t
3008 sched_clutch_root_count_sum(
3009 __unused sched_clutch_root_t root_clutch)
3010 {
3011 return 0;
3012 }
3013
3014 /*
3015 * sched_clutch_root_priority()
3016 *
3017 * Routine to get the priority of the highest runnable
3018 * thread in the hierarchy.
3019 */
3020 static int
3021 sched_clutch_root_priority(
3022 sched_clutch_root_t root_clutch)
3023 {
3024 return root_clutch->scr_priority;
3025 }
3026
3027 /*
3028 * sched_clutch_root_count()
3029 *
3030 * Returns total number of runnable threads in the hierarchy.
3031 */
3032 uint32_t
3033 sched_clutch_root_count(
3034 sched_clutch_root_t root_clutch)
3035 {
3036 return root_clutch->scr_thr_count;
3037 }
3038
3039 #if CONFIG_SCHED_EDGE
3040
3041 /*
3042 * sched_clutch_root_foreign_empty()
3043 *
3044 * Routine to check if the foreign clutch bucket priority list is empty for a cluster.
3045 */
3046 static boolean_t
3047 sched_clutch_root_foreign_empty(
3048 sched_clutch_root_t root_clutch)
3049 {
3050 return priority_queue_empty(&root_clutch->scr_foreign_buckets);
3051 }
3052
3053 /*
3054 * sched_clutch_root_highest_foreign_thread_remove()
3055 *
3056 * Routine to return the thread in the highest priority clutch bucket in a cluster.
3057 * Must be called with the pset for the cluster locked.
3058 */
3059 static thread_t
3060 sched_clutch_root_highest_foreign_thread_remove(
3061 sched_clutch_root_t root_clutch)
3062 {
3063 thread_t thread = THREAD_NULL;
3064 if (priority_queue_empty(&root_clutch->scr_foreign_buckets)) {
3065 return thread;
3066 }
3067 sched_clutch_bucket_t clutch_bucket = priority_queue_max(&root_clutch->scr_foreign_buckets, struct sched_clutch_bucket, scb_foreignlink);
3068 thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
3069 sched_clutch_thread_remove(root_clutch, thread, mach_absolute_time(), 0);
3070 return thread;
3071 }
3072
3073 #endif /* CONFIG_SCHED_EDGE */
3074
3075 /*
3076 * sched_clutch_thread_pri_shift()
3077 *
3078 * Routine to get the priority shift value for a thread.
3079 * Since the timesharing is done at the clutch_bucket level,
3080 * this routine gets the clutch_bucket and retrieves the
3081 * values from there.
3082 */
3083 uint32_t
3084 sched_clutch_thread_pri_shift(
3085 thread_t thread,
3086 sched_bucket_t bucket)
3087 {
3088 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3089 return INT8_MAX;
3090 }
3091 assert(bucket != TH_BUCKET_RUN);
3092 sched_clutch_t clutch = sched_clutch_for_thread(thread);
3093 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
3094 return os_atomic_load(&clutch_bucket_group->scbg_pri_shift, relaxed);
3095 }
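/*
 * Usage note (informal): the value returned here is cached in thread->pri_shift
 * and consumed by the timeshare priority computation, which roughly lowers
 * sched_pri in proportion to (sched_usage >> pri_shift); a smaller shift means
 * faster decay for CPU-heavy clutch buckets. Returning INT8_MAX for ineligible
 * (e.g. processor-bound) threads effectively disables clutch-driven usage decay
 * for those threads.
 */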
3096
3097 #pragma mark -- Clutch Scheduler Algorithm
3098
3099 static void
3100 sched_clutch_init(void);
3101
3102 static thread_t
3103 sched_clutch_steal_thread(processor_set_t pset);
3104
3105 #if !SCHED_TEST_HARNESS
3106
3107 static void
3108 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context);
3109
3110 #endif /* !SCHED_TEST_HARNESS */
3111
3112 static boolean_t
3113 sched_clutch_processor_enqueue(processor_t processor, thread_t thread,
3114 sched_options_t options);
3115
3116 static boolean_t
3117 sched_clutch_processor_queue_remove(processor_t processor, thread_t thread);
3118
3119 static ast_t
3120 sched_clutch_processor_csw_check(processor_t processor);
3121
3122 static boolean_t
3123 sched_clutch_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
3124
3125 static int
3126 sched_clutch_runq_count(processor_t processor);
3127
3128 static boolean_t
3129 sched_clutch_processor_queue_empty(processor_t processor);
3130
3131 #if !SCHED_TEST_HARNESS
3132
3133 static uint64_t
3134 sched_clutch_runq_stats_count_sum(processor_t processor);
3135
3136 #endif /* !SCHED_TEST_HARNESS */
3137
3138 static int
3139 sched_clutch_processor_bound_count(processor_t processor);
3140
3141 static void
3142 sched_clutch_pset_init(processor_set_t pset);
3143
3144 static void
3145 sched_clutch_processor_init(processor_t processor);
3146
3147 static thread_t
3148 sched_clutch_processor_highest_thread(processor_t processor, sched_clutch_traverse_mode_t mode);
3149
3150 static thread_t
3151 sched_clutch_choose_thread(processor_t processor, int priority, thread_t prev_thread, ast_t reason);
3152
3153 #if !SCHED_TEST_HARNESS
3154
3155 static void
3156 sched_clutch_processor_queue_shutdown(processor_t processor);
3157
3158 #endif /* !SCHED_TEST_HARNESS */
3159
3160 static sched_mode_t
3161 sched_clutch_initial_thread_sched_mode(task_t parent_task);
3162
3163 static uint32_t
3164 sched_clutch_initial_quantum_size(thread_t thread);
3165
3166 static uint32_t
3167 sched_clutch_run_incr(thread_t thread);
3168
3169 static uint32_t
3170 sched_clutch_run_decr(thread_t thread);
3171
3172 static void
3173 sched_clutch_update_thread_bucket(thread_t thread);
3174
3175 #if !SCHED_TEST_HARNESS
3176
3177 static void
3178 sched_clutch_thread_group_recommendation_change(struct thread_group *tg, cluster_type_t new_recommendation);
3179
3180 #endif /* !SCHED_TEST_HARNESS */
3181
3182 const struct sched_dispatch_table sched_clutch_dispatch = {
3183 .sched_name = "clutch",
3184 .init = sched_clutch_init,
3185 .timebase_init = sched_timeshare_timebase_init,
3186 .processor_init = sched_clutch_processor_init,
3187 .pset_init = sched_clutch_pset_init,
3188 .choose_thread = sched_clutch_choose_thread,
3189 .steal_thread_enabled = sched_steal_thread_enabled,
3190 .steal_thread = sched_clutch_steal_thread,
3191 .processor_enqueue = sched_clutch_processor_enqueue,
3192 .processor_queue_remove = sched_clutch_processor_queue_remove,
3193 .processor_queue_empty = sched_clutch_processor_queue_empty,
3194 .priority_is_urgent = priority_is_urgent,
3195 .processor_csw_check = sched_clutch_processor_csw_check,
3196 .processor_queue_has_priority = sched_clutch_processor_queue_has_priority,
3197 .initial_quantum_size = sched_clutch_initial_quantum_size,
3198 .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode,
3199 .processor_runq_count = sched_clutch_runq_count,
3200 .processor_bound_count = sched_clutch_processor_bound_count,
3201 .multiple_psets_enabled = TRUE,
3202 .avoid_processor_enabled = FALSE,
3203 .thread_avoid_processor = NULL,
3204 .update_thread_bucket = sched_clutch_update_thread_bucket,
3205 .cpu_init_completed = NULL,
3206 .thread_eligible_for_pset = NULL,
3207
3208 .rt_choose_processor = sched_rt_choose_processor,
3209 .rt_steal_thread = NULL,
3210 .rt_init_pset = sched_rt_init_pset,
3211 .rt_init_completed = sched_rt_init_completed,
3212 .rt_runq_count_sum = sched_rt_runq_count_sum,
3213
3214 #if !SCHED_TEST_HARNESS
3215 .maintenance_continuation = sched_timeshare_maintenance_continue,
3216 .compute_timeshare_priority = sched_compute_timeshare_priority,
3217 .choose_node = sched_choose_node,
3218 .choose_processor = choose_processor,
3219 .processor_queue_shutdown = sched_clutch_processor_queue_shutdown,
3220 .can_update_priority = can_update_priority,
3221 .update_priority = update_priority,
3222 .lightweight_update_priority = lightweight_update_priority,
3223 .quantum_expire = sched_default_quantum_expire,
3224 .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum,
3225 .thread_update_scan = sched_clutch_thread_update_scan,
3226 .processor_balance = sched_SMT_balance,
3227 .qos_max_parallelism = sched_qos_max_parallelism,
3228 .check_spill = sched_check_spill,
3229 .ipi_policy = sched_ipi_policy,
3230 .thread_should_yield = sched_thread_should_yield,
3231 .run_count_incr = sched_clutch_run_incr,
3232 .run_count_decr = sched_clutch_run_decr,
3233 .pset_made_schedulable = sched_pset_made_schedulable,
3234 .thread_group_recommendation_change = sched_clutch_thread_group_recommendation_change,
3235
3236 .rt_queue_shutdown = sched_rt_queue_shutdown,
3237 .rt_runq_scan = sched_rt_runq_scan,
3238 #endif /* !SCHED_TEST_HARNESS */
3239 };
3240
3241 __attribute__((always_inline))
3242 static inline run_queue_t
3243 sched_clutch_bound_runq(processor_t processor)
3244 {
3245 return &processor->runq;
3246 }
3247
3248 __attribute__((always_inline))
3249 static inline sched_clutch_root_t
3250 sched_clutch_processor_root_clutch(processor_t processor)
3251 {
3252 return &processor->processor_set->pset_clutch_root;
3253 }
3254
3255 __attribute__((always_inline))
3256 static inline run_queue_t
3257 sched_clutch_thread_bound_runq(processor_t processor, __assert_only thread_t thread)
3258 {
3259 assert(thread->bound_processor == processor);
3260 return sched_clutch_bound_runq(processor);
3261 }
3262
3263 static uint32_t
3264 sched_clutch_initial_quantum_size(thread_t thread)
3265 {
3266 if (thread == THREAD_NULL) {
3267 return std_quantum;
3268 }
3269 assert(sched_clutch_thread_quantum[thread->th_sched_bucket] <= UINT32_MAX);
3270 return (uint32_t)sched_clutch_thread_quantum[thread->th_sched_bucket];
3271 }
3272
3273 static sched_mode_t
3274 sched_clutch_initial_thread_sched_mode(task_t parent_task)
3275 {
3276 if (parent_task == kernel_task) {
3277 return TH_MODE_FIXED;
3278 } else {
3279 return TH_MODE_TIMESHARE;
3280 }
3281 }
3282
3283 static void
3284 sched_clutch_processor_init(processor_t processor)
3285 {
3286 run_queue_init(&processor->runq);
3287 }
3288
3289 static void
3290 sched_clutch_pset_init(processor_set_t pset)
3291 {
3292 sched_clutch_root_init(&pset->pset_clutch_root, pset);
3293 }
3294
3295 static void
3296 sched_clutch_tunables_init(void)
3297 {
3298 sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
3299 sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
3300 sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
3301 clock_interval_to_absolutetime_interval(SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS,
3302 NSEC_PER_USEC, &sched_clutch_bucket_group_adjust_threshold);
3303 assert(sched_clutch_bucket_group_adjust_threshold <= CLUTCH_CPU_DATA_MAX);
3304 sched_clutch_us_to_abstime(sched_clutch_bucket_group_pending_delta_us, sched_clutch_bucket_group_pending_delta);
3305 }
3306
3307 static void
3308 sched_clutch_init(void)
3309 {
3310 if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
3311 sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
3312 }
3313 sched_timeshare_init();
3314 sched_clutch_tunables_init();
3315 }
3316
3317 static inline bool
3318 sched_clutch_pri_greater_than_tiebreak(int pri_one, int pri_two, bool one_wins_ties)
3319 {
3320 if (one_wins_ties) {
3321 return pri_one >= pri_two;
3322 } else {
3323 return pri_one > pri_two;
3324 }
3325 }
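/*
 * Example (illustrative): with pri_one == pri_two == 47, the helper returns
 * true only when one_wins_ties is set, i.e. when the caller passes
 * processor->first_timeslice so that a previously running thread which still
 * owns its quantum wins priority ties against the runqueue candidate.
 */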
3326
3327 /*
3328 * sched_clutch_processor_highest_thread()
3329 *
3330 * Routine to determine the highest thread on the entire cluster runqueue which
3331 * should be selected to run next, optionally comparing against the previously
3332 * running thread. Removes the highest thread from the runqueue, depending on the
3333 * traverse mode and whether the highest thread is the previously running thread.
3334 *
3335 * Always called with the pset lock held. Assumes that processor->active_thread
3336 * may be locked and modified by another processor.
3337 */
3338 static thread_t
3339 sched_clutch_processor_highest_thread(
3340 processor_t processor,
3341 sched_clutch_traverse_mode_t mode)
3342 {
3343 sched_clutch_root_t root_clutch = sched_clutch_processor_root_clutch(processor);
3344 int clutch_pri = sched_clutch_root_priority(root_clutch);
3345 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3346 int bound_pri = bound_runq->highq;
3347
3348 bool has_prev_thread = mode == SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT;
3349 thread_t prev_thread = has_prev_thread ? processor->active_thread : NULL;
3350
3351 if (bound_runq->count == 0 && root_clutch->scr_thr_count == 0) {
3352 /* The runqueue is totally empty */
3353 assert(bound_pri < MINPRI && clutch_pri < MINPRI);
3354 return prev_thread;
3355 }
3356
3357 if (has_prev_thread) {
3358 if (prev_thread->sched_pri >= BASEPRI_RTQUEUES) {
3359 /* The previous thread is real-time and thus guaranteed higher than the non-RT runqueue */
3360 return prev_thread;
3361 }
3362 /* Allow the previous thread to influence the priority comparison of Clutch hierarchy vs. processor-bound runqueue */
3363 if (prev_thread->bound_processor != NULL) {
3364 bound_pri = MAX(bound_pri, prev_thread->sched_pri);
3365 } else {
3366 clutch_pri = MAX(clutch_pri, prev_thread->sched_pri);
3367 }
3368 }
3369
3370 bool prev_thread_is_not_processor_bound = has_prev_thread && (prev_thread->bound_processor == NULL);
3371 bool prev_thread_is_processor_bound = has_prev_thread && (prev_thread->bound_processor != NULL);
3372 thread_t next_thread = prev_thread;
3373 if (clutch_pri > bound_pri) {
3374 if (root_clutch->scr_thr_count == 0) {
3375 goto found_thread;
3376 }
3377 next_thread = sched_clutch_hierarchy_thread_highest(root_clutch, processor, prev_thread_is_not_processor_bound ? prev_thread : NULL, mode);
3378 } else {
3379 if (bound_runq->count == 0 ||
3380 (prev_thread_is_processor_bound && sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, bound_runq->highq, processor->first_timeslice))) {
3381 goto found_thread;
3382 }
3383 next_thread = (mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY) ?
3384 run_queue_dequeue(bound_runq, SCHED_HEADQ) : run_queue_peek(bound_runq);
3385 assert(mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || next_thread != prev_thread);
3386 }
3387 found_thread:
3388 assert(next_thread != NULL);
3389 return next_thread;
3390 }
3391
3392 static thread_t
3393 sched_clutch_choose_thread(
3394 processor_t processor,
3395 __unused int priority,
3396 thread_t _Nullable prev_thread,
3397 __unused ast_t reason)
3398 {
3399 assert(prev_thread == NULL || prev_thread == processor->active_thread);
3400 return sched_clutch_processor_highest_thread(processor, prev_thread != NULL ? SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT : SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY);
3401 }
3402
3403 static boolean_t
3404 sched_clutch_processor_enqueue(
3405 processor_t processor,
3406 thread_t thread,
3407 sched_options_t options)
3408 {
3409 boolean_t result;
3410
3411 thread_set_runq_locked(thread, processor);
3412 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3413 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3414 result = sched_clutch_thread_insert(pset_clutch_root, thread, options);
3415 } else {
3416 run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
3417 result = run_queue_enqueue(rq, thread, options);
3418 }
3419 return result;
3420 }
3421
3422 static boolean_t
3423 sched_clutch_processor_queue_empty(processor_t processor)
3424 {
3425 return sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0 &&
3426 sched_clutch_bound_runq(processor)->count == 0;
3427 }
3428
3429 static ast_t
3430 sched_clutch_processor_csw_check(processor_t processor)
3431 {
3432 assert(processor->active_thread != NULL);
3433 thread_t runqueue_thread = sched_clutch_processor_highest_thread(processor, SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT);
3434 if (runqueue_thread != processor->active_thread) {
3435 /* Found a better thread to run */
3436 if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0 ||
3437 sched_clutch_bound_runq(processor)->urgency > 0) {
3438 return AST_PREEMPT | AST_URGENT;
3439 }
3440 return AST_PREEMPT;
3441 }
3442 return AST_NONE;
3443 }
3444
3445 static boolean_t
3446 sched_clutch_processor_queue_has_priority(
3447 __unused processor_t processor,
3448 __unused int priority,
3449 __unused boolean_t gte)
3450 {
3451 /*
3452 * Never short-circuit the Clutch runqueue by returning FALSE here. Instead,
3453 * thread_select() should always go through sched_clutch_choose_thread().
3454 */
3455 return TRUE;
3456 }
3457
3458 static int
3459 sched_clutch_runq_count(processor_t processor)
3460 {
3461 return (int)sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) + sched_clutch_bound_runq(processor)->count;
3462 }
3463
3464 #if !SCHED_TEST_HARNESS
3465
3466 static uint64_t
3467 sched_clutch_runq_stats_count_sum(processor_t processor)
3468 {
3469 uint64_t bound_sum = sched_clutch_bound_runq(processor)->runq_stats.count_sum;
3470
3471 if (processor->cpu_id == processor->processor_set->cpu_set_low) {
3472 return bound_sum + sched_clutch_root_count_sum(sched_clutch_processor_root_clutch(processor));
3473 } else {
3474 return bound_sum;
3475 }
3476 }
3477
3478 #endif /* !SCHED_TEST_HARNESS */
3479
3480 static int
3481 sched_clutch_processor_bound_count(processor_t processor)
3482 {
3483 return sched_clutch_bound_runq(processor)->count;
3484 }
3485
3486 #if !SCHED_TEST_HARNESS
3487
3488 static void
3489 sched_clutch_processor_queue_shutdown(processor_t processor)
3490 {
3491 processor_set_t pset = processor->processor_set;
3492 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3493 thread_t thread;
3494 queue_head_t tqueue;
3495
3496 /* We only need to migrate threads if this is the last active processor in the pset */
3497 if (pset->online_processor_count > 0) {
3498 pset_unlock(pset);
3499 return;
3500 }
3501
3502 queue_init(&tqueue);
3503 while (sched_clutch_root_count(pset_clutch_root) > 0) {
3504 thread = sched_clutch_hierarchy_thread_highest(pset_clutch_root, processor, NULL, SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY);
3505 enqueue_tail(&tqueue, &thread->runq_links);
3506 }
3507
3508 pset_unlock(pset);
3509
3510 qe_foreach_element_safe(thread, &tqueue, runq_links) {
3511 remqueue(&thread->runq_links);
3512 thread_lock(thread);
3513 thread_setrun(thread, SCHED_TAILQ);
3514 thread_unlock(thread);
3515 }
3516 }
3517
3518 #endif /* !SCHED_TEST_HARNESS */
3519
3520 static boolean_t
3521 sched_clutch_processor_queue_remove(
3522 processor_t processor,
3523 thread_t thread)
3524 {
3525 processor_set_t pset = processor->processor_set;
3526
3527 pset_lock(pset);
3528
3529 if (processor == thread_get_runq_locked(thread)) {
3530 /*
3531 * Thread is on a run queue and we have a lock on
3532 * that run queue.
3533 */
3534 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3535 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3536 sched_clutch_thread_remove(pset_clutch_root, thread, mach_absolute_time(), SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
3537 } else {
3538 run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
3539 run_queue_remove(rq, thread);
3540 }
3541 } else {
3542 /*
3543 * The thread left the run queue before we could
3544 * lock the run queue.
3545 */
3546 thread_assert_runq_null(thread);
3547 processor = PROCESSOR_NULL;
3548 }
3549
3550 pset_unlock(pset);
3551
3552 return processor != PROCESSOR_NULL;
3553 }
3554
3555 static thread_t
3556 sched_clutch_steal_thread(__unused processor_set_t pset)
3557 {
3558 /* Thread stealing is not enabled for single cluster clutch scheduler platforms */
3559 return THREAD_NULL;
3560 }
3561
3562 #if !SCHED_TEST_HARNESS
3563
3564 static void
3565 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context)
3566 {
3567 boolean_t restart_needed = FALSE;
3568 processor_t processor = processor_list;
3569 processor_set_t pset;
3570 thread_t thread;
3571 spl_t s;
3572
3573 /*
3574 * We update the threads associated with each processor (bound and idle threads)
3575 * and then update the threads in each pset runqueue.
3576 */
3577
3578 do {
3579 do {
3580 pset = processor->processor_set;
3581
3582 s = splsched();
3583 pset_lock(pset);
3584
3585 restart_needed = runq_scan(sched_clutch_bound_runq(processor), scan_context);
3586
3587 pset_unlock(pset);
3588 splx(s);
3589
3590 if (restart_needed) {
3591 break;
3592 }
3593
3594 thread = processor->idle_thread;
3595 if (thread != THREAD_NULL && thread->sched_stamp != os_atomic_load(&sched_tick, relaxed)) {
3596 if (thread_update_add_thread(thread) == FALSE) {
3597 restart_needed = TRUE;
3598 break;
3599 }
3600 }
3601 } while ((processor = processor->processor_list) != NULL);
3602
3603 /* Ok, we now have a collection of candidates -- fix them. */
3604 thread_update_process_threads();
3605 } while (restart_needed);
3606
3607 pset_node_t node = &pset_node0;
3608 pset = node->psets;
3609
3610 do {
3611 do {
3612 restart_needed = FALSE;
3613 while (pset != NULL) {
3614 s = splsched();
3615 pset_lock(pset);
3616
3617 if (sched_clutch_root_count(&pset->pset_clutch_root) > 0) {
3618 for (sched_bucket_t bucket = TH_BUCKET_SHARE_FG; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
3619 restart_needed = runq_scan(&pset->pset_clutch_root.scr_bound_buckets[bucket].scrb_bound_thread_runq, scan_context);
3620 if (restart_needed) {
3621 break;
3622 }
3623 }
3624 queue_t clutch_bucket_list = &pset->pset_clutch_root.scr_clutch_buckets;
3625 sched_clutch_bucket_t clutch_bucket;
3626 qe_foreach_element(clutch_bucket, clutch_bucket_list, scb_listlink) {
3627 sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, scan_context->sched_tick_last_abstime);
3628 restart_needed = sched_clutch_timeshare_scan(&clutch_bucket->scb_thread_timeshare_queue, clutch_bucket->scb_thr_count, scan_context);
3629 if (restart_needed) {
3630 break;
3631 }
3632 }
3633 }
3634
3635 pset_unlock(pset);
3636 splx(s);
3637
3638 if (restart_needed) {
3639 break;
3640 }
3641 pset = pset->pset_list;
3642 }
3643
3644 if (restart_needed) {
3645 break;
3646 }
3647 } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
3648
3649 /* Ok, we now have a collection of candidates -- fix them. */
3650 thread_update_process_threads();
3651 } while (restart_needed);
3652 }
3653
3654 /*
3655 * For threads that have changed sched_pri without changing the
3656 * base_pri for any reason other than decay, use the sched_pri
3657 * as the bucketizing priority instead of base_pri. All such
3658 * changes are typically due to kernel locking primitive boosts
3659 * or demotions.
3660 */
3661 static boolean_t
3662 sched_thread_sched_pri_promoted(thread_t thread)
3663 {
3664 return (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) ||
3665 (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) ||
3666 (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) ||
3667 (thread->kern_promotion_schedpri != 0);
3668 }
3669
3670 #endif /* !SCHED_TEST_HARNESS */
3671
3672 /*
3673 * For the clutch scheduler, the run counts are maintained in the clutch
3674 * buckets (i.e. the thread group scheduling structure).
3675 */
3676 static uint32_t
3677 sched_clutch_run_incr(thread_t thread)
3678 {
3679 assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
3680 uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3681 sched_clutch_thread_run_bucket_incr(thread, thread->th_sched_bucket);
3682 return new_count;
3683 }
3684
3685 static uint32_t
3686 sched_clutch_run_decr(thread_t thread)
3687 {
3688 assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);
3689 uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3690 sched_clutch_thread_run_bucket_decr(thread, thread->th_sched_bucket);
3691 return new_count;
3692 }
3693
3694 /*
3695 * Routine to update the scheduling bucket for the thread.
3696 *
3697 * In the clutch scheduler implementation, the thread's bucket
3698 * is based on sched_pri if it was promoted due to a kernel
3699 * primitive; otherwise it is based on the thread's base_pri. This
3700 * enhancement allows promoted threads to reach a higher priority
3701 * bucket and potentially get selected sooner for scheduling.
3702 *
3703 * Also, the clutch scheduler does not honor fixed priority below
3704 * FG priority. It simply puts those threads in the corresponding
3705 * timeshare bucket. The reason for doing that is that it is
3706 * extremely hard to define the scheduling properties of such threads,
3707 * and they typically lead to performance issues.
3708 *
3709 * Called with the thread lock held and the thread held off the runqueue.
3710 */
3711
3712 void
3713 sched_clutch_update_thread_bucket(thread_t thread)
3714 {
3715 sched_bucket_t old_bucket = thread->th_sched_bucket;
3716 thread_assert_runq_null(thread);
3717 int pri = (sched_thread_sched_pri_promoted(thread)) ? thread->sched_pri : thread->base_pri;
3718 sched_bucket_t new_bucket = sched_clutch_thread_bucket_map(thread, pri);
3719
3720 if (old_bucket == new_bucket) {
3721 return;
3722 }
3723
3724 /* Bypass accounting CPU usage for a newly created thread */
3725 if (old_bucket != TH_BUCKET_RUN) {
3726 /* Attribute CPU usage with the old scheduling bucket */
3727 sched_clutch_thread_tick_delta(thread, NULL);
3728 }
3729
3730 /* Transition to the new sched_bucket */
3731 thread->th_sched_bucket = new_bucket;
3732 thread->pri_shift = sched_clutch_thread_pri_shift(thread, new_bucket);
3733
3734 /*
3735 * Since this is called after the thread has been removed from the runq,
3736 * only the run counts need to be updated. The re-insert into the runq
3737 * would put the thread into the correct new bucket's runq.
3738 */
3739 if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
3740 sched_clutch_thread_run_bucket_decr(thread, old_bucket);
3741 sched_clutch_thread_run_bucket_incr(thread, new_bucket);
3742 }
3743 }
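/*
 * Illustrative example (hypothetical priorities): a timeshare thread with
 * base_pri 31 that has been promoted to sched_pri 80 by a kernel locking
 * primitive is bucketized using the promoted priority, so it can land in a
 * higher scheduling bucket until the promotion is dropped. Conversely, a
 * fixed-priority thread below foreground priority is simply placed in the
 * corresponding timeshare bucket, per the comment above.
 */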
3744
3745 #if !SCHED_TEST_HARNESS
3746
3747 static void
3748 sched_clutch_thread_group_recommendation_change(__unused struct thread_group *tg, __unused cluster_type_t new_recommendation)
3749 {
3750 /* Clutch ignores the recommendation because Clutch, unlike the Edge scheduler,
3751 * does not migrate threads between cluster types.
3752 */
3753 }
3754
3755 #endif /* !SCHED_TEST_HARNESS */
3756
3757 #if CONFIG_SCHED_EDGE
3758
3759 /* Implementation of the AMP version of the clutch scheduler */
3760
3761 static void
3762 sched_edge_init(void);
3763
3764 static void
3765 sched_edge_pset_init(processor_set_t pset);
3766
3767 static thread_t
3768 sched_edge_processor_idle(processor_set_t pset);
3769
3770 static boolean_t
3771 sched_edge_processor_queue_empty(processor_t processor);
3772
3773 static void
3774 sched_edge_processor_queue_shutdown(processor_t processor);
3775
3776 static processor_t
3777 sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread, sched_options_t *options_inout);
3778
3779 static void
3780 sched_edge_quantum_expire(thread_t thread);
3781
3782 static bool
3783 sched_edge_thread_avoid_processor(processor_t processor, thread_t thread, ast_t reason);
3784
3785 static bool
3786 sched_edge_balance(processor_t cprocessor, processor_set_t cpset);
3787
3788 static void
3789 sched_edge_check_spill(processor_set_t pset, thread_t thread);
3790
3791 static bool
3792 sched_edge_thread_should_yield(processor_t processor, thread_t thread);
3793
3794 static void
3795 sched_edge_pset_made_schedulable(processor_t processor, processor_set_t dst_pset, boolean_t drop_lock);
3796
3797 static void
3798 sched_edge_cpu_init_completed(void);
3799
3800 static bool
3801 sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset);
3802
3803 static bool
3804 sched_edge_steal_thread_enabled(processor_set_t pset);
3805
3806 static sched_ipi_type_t
3807 sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event);
3808
3809 static uint32_t
3810 sched_edge_qos_max_parallelism(int qos, uint64_t options);
3811
3812 static uint32_t
3813 sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket);
3814
3815 static uint32_t
3816 sched_edge_run_count_incr(thread_t thread);
3817
3818 static bool
3819 sched_edge_stir_the_pot_core_type_is_desired(processor_set_t pset);
3820
3821 const struct sched_dispatch_table sched_edge_dispatch = {
3822 .sched_name = "edge",
3823 .init = sched_edge_init,
3824 .timebase_init = sched_timeshare_timebase_init,
3825 .processor_init = sched_clutch_processor_init,
3826 .pset_init = sched_edge_pset_init,
3827 .choose_thread = sched_clutch_choose_thread,
3828 .steal_thread_enabled = sched_edge_steal_thread_enabled,
3829 .steal_thread = sched_edge_processor_idle,
3830 .choose_processor = sched_edge_choose_processor,
3831 .processor_enqueue = sched_clutch_processor_enqueue,
3832 .processor_queue_remove = sched_clutch_processor_queue_remove,
3833 .processor_queue_empty = sched_edge_processor_queue_empty,
3834 .priority_is_urgent = priority_is_urgent,
3835 .processor_csw_check = sched_clutch_processor_csw_check,
3836 .processor_queue_has_priority = sched_clutch_processor_queue_has_priority,
3837 .initial_quantum_size = sched_clutch_initial_quantum_size,
3838 .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode,
3839 .processor_runq_count = sched_clutch_runq_count,
3840 .processor_bound_count = sched_clutch_processor_bound_count,
3841 .multiple_psets_enabled = TRUE,
3842 .avoid_processor_enabled = TRUE,
3843 .thread_avoid_processor = sched_edge_thread_avoid_processor,
3844 .processor_balance = sched_edge_balance,
3845 .qos_max_parallelism = sched_edge_qos_max_parallelism,
3846 .check_spill = sched_edge_check_spill,
3847 .ipi_policy = sched_edge_ipi_policy,
3848 .thread_should_yield = sched_edge_thread_should_yield,
3849 .update_thread_bucket = sched_clutch_update_thread_bucket,
3850 .cpu_init_completed = sched_edge_cpu_init_completed,
3851 .thread_eligible_for_pset = sched_edge_thread_eligible_for_pset,
3852
3853 .rt_choose_processor = sched_rt_choose_processor,
3854 .rt_steal_thread = sched_rt_steal_thread,
3855 .rt_init_pset = sched_rt_init_pset,
3856 .rt_init_completed = sched_rt_init_completed,
3857 .rt_runq_count_sum = sched_rt_runq_count_sum,
3858
3859 #if !SCHED_TEST_HARNESS
3860 .maintenance_continuation = sched_timeshare_maintenance_continue,
3861 .compute_timeshare_priority = sched_compute_timeshare_priority,
3862 .choose_node = sched_choose_node,
3863 .processor_queue_shutdown = sched_edge_processor_queue_shutdown,
3864 .can_update_priority = can_update_priority,
3865 .update_priority = update_priority,
3866 .lightweight_update_priority = lightweight_update_priority,
3867 .quantum_expire = sched_edge_quantum_expire,
3868 .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum,
3869 .thread_update_scan = sched_clutch_thread_update_scan,
3870 .run_count_incr = sched_edge_run_count_incr,
3871 .run_count_decr = sched_clutch_run_decr,
3872 .pset_made_schedulable = sched_edge_pset_made_schedulable,
3873 .thread_group_recommendation_change = NULL,
3874
3875 .rt_queue_shutdown = sched_rt_queue_shutdown,
3876 .rt_runq_scan = sched_rt_runq_scan,
3877 #endif /* !SCHED_TEST_HARNESS */
3878 };
3879
3880 static bitmap_t sched_edge_available_pset_bitmask[BITMAP_LEN(MAX_PSETS)];
3881
3882 /*
3883 * sched_edge_thread_bound_cluster_id()
3884 *
3885 * Routine to determine which cluster a particular thread is bound to. Uses
3886 * the sched_flags on the thread to map back to a specific cluster id.
3887 *
3888 * <Edge Multi-cluster Support Needed>
3889 */
3890 static uint32_t
3891 sched_edge_thread_bound_cluster_id(thread_t thread)
3892 {
3893 assert(SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread));
3894 return thread->th_bound_cluster_id;
3895 }
3896
3897 /* Forward declaration for some thread migration routines */
3898 static boolean_t sched_edge_foreign_runnable_thread_available(processor_set_t pset);
3899 static boolean_t sched_edge_foreign_running_thread_available(processor_set_t pset);
3900 static processor_set_t sched_edge_steal_candidate(processor_set_t pset);
3901 static processor_set_t sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks, processor_t *processor_hint_out, sched_options_t *options_inout);
3902
3903 static_assert(sizeof(sched_clutch_edge) == sizeof(uint64_t), "sched_clutch_edge fits in 64 bits");
3904
3905 /*
3906 * sched_edge_config_set()
3907 *
3908 * Support to update an edge configuration. Typically used by CLPC to affect thread migration
3909 * policies in the scheduler.
3910 */
3911 static void
3912 sched_edge_config_set(uint32_t src_cluster, uint32_t dst_cluster, sched_bucket_t bucket, sched_clutch_edge edge_config)
3913 {
3914 os_atomic_store(&pset_array[src_cluster]->sched_edges[dst_cluster][bucket], edge_config, relaxed);
3915 }
3916
3917 /*
3918 * sched_edge_config_get()
3919 *
3920 * Support to get an edge configuration. Typically used by CLPC to query edge configs to decide
3921 * if it needs to update edges.
3922 */
3923 static sched_clutch_edge
3924 sched_edge_config_get(uint32_t src_cluster, uint32_t dst_cluster, sched_bucket_t bucket)
3925 {
3926 return os_atomic_load(&pset_array[src_cluster]->sched_edges[dst_cluster][bucket], relaxed);
3927 }
3928
3929 /*
3930 * sched_edge_config_pset_push()
3931 *
3932 * After using sched_edge_config_set() to update edge tunables outgoing from a particular source
3933 * pset, this function should be called in order to propagate the updates to derived metadata for
3934 * the pset, such as search orders for outgoing spill and steal.
3935 */
3936 static void
3937 sched_edge_config_pset_push(uint32_t src_pset_id)
3938 {
3939 processor_set_t src_pset = pset_array[src_pset_id];
3940 uint8_t search_order_len = sched_edge_max_clusters - 1;
3941 sched_pset_search_order_sort_data_t search_order_datas[MAX_PSETS - 1];
3942 for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
3943 uint8_t dst_pset_id = 0;
3944 for (int i = 0; i < search_order_len; i++, dst_pset_id++) {
3945 if (dst_pset_id == src_pset->pset_id) {
3946 dst_pset_id++;
3947 }
3948 search_order_datas[i].spsosd_src_pset = src_pset;
3949 search_order_datas[i].spsosd_dst_pset_id = dst_pset_id;
3950 sched_clutch_edge edge = sched_edge_config_get(src_pset->pset_id, dst_pset_id, bucket);
3951 search_order_datas[i].spsosd_migration_weight = edge.sce_migration_allowed ?
3952 edge.sce_migration_weight : UINT32_MAX;
3953 }
3954 sched_pset_search_order_compute(&src_pset->spill_search_order[bucket],
3955 search_order_datas, search_order_len, sched_edge_search_order_weight_then_locality_cmp);
3956 }
3957 }
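/*
 * Illustrative sketch (assumption, not compiled as part of this file):
 * updating a single edge for the default bucket and then refreshing the
 * source pset's derived spill/steal search order, as a CLPC-driven update
 * via sched_edge_matrix_set() ultimately does. The cluster ids and migration
 * weight below are hypothetical.
 *
 *     sched_clutch_edge edge = sched_edge_config_get(0, 1, TH_BUCKET_SHARE_DF);
 *     edge.sce_migration_allowed = 1;
 *     edge.sce_steal_allowed = 1;
 *     edge.sce_migration_weight = 16;
 *     sched_edge_config_set(0, 1, TH_BUCKET_SHARE_DF, edge);
 *     sched_edge_config_pset_push(0);
 */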
3958
3959 static int
3960 sched_edge_search_order_weight_then_locality(const void *a, const void *b)
3961 {
3962 const sched_pset_search_order_sort_data_t *data_a = (const sched_pset_search_order_sort_data_t *)a;
3963 const sched_pset_search_order_sort_data_t *data_b = (const sched_pset_search_order_sort_data_t *)b;
3964 assert3p(data_a->spsosd_src_pset, ==, data_b->spsosd_src_pset);
3965 assert3u(data_a->spsosd_dst_pset_id, !=, data_b->spsosd_dst_pset_id);
3966 /*
3967 * Sort based on lowest edge migration weight, followed by die-local psets
3968 * first, followed by lowest pset id.
3969 */
3970 if (data_a->spsosd_migration_weight != data_b->spsosd_migration_weight) {
3971 return (data_a->spsosd_migration_weight < data_b->spsosd_migration_weight) ? -1 : 1;
3972 }
3973
3974 bool is_local_a = bitmap_test(data_a->spsosd_src_pset->local_psets, data_a->spsosd_dst_pset_id);
3975 bool is_local_b = bitmap_test(data_b->spsosd_src_pset->local_psets, data_b->spsosd_dst_pset_id);
3976 if (is_local_a != is_local_b) {
3977 return is_local_a ? -1 : 1;
3978 }
3979
3980 if (data_a->spsosd_dst_pset_id != data_b->spsosd_dst_pset_id) {
3981 return (data_a->spsosd_dst_pset_id < data_b->spsosd_dst_pset_id) ? -1 : 1;
3982 }
3983 return 0;
3984 }
3985
3986 cmpfunc_t sched_edge_search_order_weight_then_locality_cmp = &sched_edge_search_order_weight_then_locality;
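/*
 * Worked example (hypothetical values): for a source pset with three
 * destinations whose migration weights are {pset 1: 4, pset 2: 4,
 * pset 3: migration disallowed => UINT32_MAX}, where pset 2 is the only
 * die-local destination, the computed search order is 2, 1, 3 -- equal
 * weights are broken by die locality first and then by the lower pset id.
 */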
3987
3988 /*
3989 * sched_edge_matrix_set()
3990 *
3991 * Routine to update various edges in the edge migration graph. The edge_changed array
3992 * indicates which edges need to be updated. Both the edge_matrix and edge_changed arrays
3993 * are matrices with dimension num_psets * num_psets * TH_BUCKET_SCHED_MAX, flattened into a
3994 * single-dimensional array.
3995 */
3996 void
3997 sched_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changed, __unused uint64_t flags,
3998 __assert_only uint64_t num_psets)
3999 {
4000 assert3u(num_psets, ==, sched_edge_max_clusters);
4001 uint32_t edge_index = 0;
4002 for (uint32_t src_cluster = 0; src_cluster < sched_edge_max_clusters; src_cluster++) {
4003 for (uint32_t dst_cluster = 0; dst_cluster < sched_edge_max_clusters; dst_cluster++) {
4004 for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
4005 if (edge_changed[edge_index]) {
4006 sched_edge_config_set(src_cluster, dst_cluster, bucket, edge_matrix[edge_index]);
4007 }
4008 edge_index++;
4009 }
4010 }
4011 sched_edge_config_pset_push(src_cluster);
4012 }
4013 }
4014
4015 /*
4016 * sched_edge_matrix_get()
4017 *
4018 * Routine to retrieve various edges in the edge migration graph. The edge_requested array
4019 * indicates which edges need to be retrieved. Both the edge_matrix and edge_requested arrays
4020 * are matrices with dimension num_psets * num_psets * TH_BUCKET_SCHED_MAX, flattened into a
4021 * single-dimensional array.
4022 */
4023 void
4024 sched_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_requested, __unused uint64_t flags,
4025 __assert_only uint64_t num_psets)
4026 {
4027 assert3u(num_psets, ==, sched_edge_max_clusters);
4028 uint32_t edge_index = 0;
4029 for (uint32_t src_pset = 0; src_pset < sched_edge_max_clusters; src_pset++) {
4030 for (uint32_t dst_pset = 0; dst_pset < sched_edge_max_clusters; dst_pset++) {
4031 for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
4032 if (edge_requested[edge_index]) {
4033 edge_matrix[edge_index] = sched_edge_config_get(src_pset, dst_pset, bucket);
4034 }
4035 edge_index++;
4036 }
4037 }
4038 }
4039 }
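/*
 * Illustrative sketch (assumption, not compiled as part of this file): both
 * matrix routines walk the bucket index fastest, then the destination
 * cluster, then the source cluster, so a caller preparing a single-edge
 * update could compute the flattened index as:
 *
 *     uint32_t idx = ((src_cluster * num_psets) + dst_cluster) * TH_BUCKET_SCHED_MAX + bucket;
 *     edge_changed[idx] = true;
 *     edge_matrix[idx] = new_edge;
 *     sched_edge_matrix_set(edge_matrix, edge_changed, 0, num_psets);
 */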
4040
4041
4042 /*
4043 * sched_edge_init()
4044 *
4045 * Routine to initialize the data structures for the Edge scheduler.
4046 */
4047 static void
4048 sched_edge_init(void)
4049 {
4050 if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
4051 sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
4052 }
4053 sched_timeshare_init();
4054 sched_clutch_tunables_init();
4055 sched_edge_max_clusters = ml_get_cluster_count();
4056 }
4057
4058 static void
4059 sched_edge_pset_init(processor_set_t pset)
4060 {
4061 uint32_t pset_cluster_id = pset->pset_cluster_id;
4062 pset->pset_type = pset_cluster_type_to_cluster_type(pset->pset_cluster_type);
4063 /* Each pset must declare an AMP type */
4064 assert(pset->pset_type != CLUSTER_TYPE_SMP);
4065
4066 /* Set the edge weight and properties for the pset itself */
4067 bitmap_clear(pset->foreign_psets, pset_cluster_id);
4068 bitmap_clear(pset->native_psets, pset_cluster_id);
4069 bitmap_clear(pset->local_psets, pset_cluster_id);
4070 bitmap_clear(pset->remote_psets, pset_cluster_id);
4071 bzero(&pset->sched_edges, sizeof(pset->sched_edges));
4072 bzero(&pset->max_parallel_cores, sizeof(pset->max_parallel_cores));
4073 bzero(&pset->max_parallel_clusters, sizeof(pset->max_parallel_cores));
4074 for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
4075 sched_pset_search_order_init(pset, &pset->spill_search_order[bucket]);
4076 }
4077 sched_clutch_root_init(&pset->pset_clutch_root, pset);
4078 bitmap_set(sched_edge_available_pset_bitmask, pset_cluster_id);
4079 }
4080
4081 static boolean_t
4082 sched_edge_processor_queue_empty(processor_t processor)
4083 {
4084 return (sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0) &&
4085 (sched_clutch_bound_runq(processor)->count == 0);
4086 }
4087
4088 static void
4089 sched_edge_check_spill(__unused processor_set_t pset, __unused thread_t thread)
4090 {
4091 assert(thread->bound_processor == PROCESSOR_NULL);
4092 }
4093
4094 __options_decl(sched_edge_thread_yield_reason_t, uint32_t, {
4095 SCHED_EDGE_YIELD_RUNQ_NONEMPTY = 0x0,
4096 SCHED_EDGE_YIELD_FOREIGN_RUNNABLE = 0x1,
4097 SCHED_EDGE_YIELD_FOREIGN_RUNNING = 0x2,
4098 SCHED_EDGE_YIELD_STEAL_POSSIBLE = 0x3,
4099 SCHED_EDGE_YIELD_DISALLOW = 0x4,
4100 });
4101
4102 static bool
4103 sched_edge_thread_should_yield(processor_t processor, __unused thread_t thread)
4104 {
4105 if (!sched_edge_processor_queue_empty(processor) || (rt_runq_count(processor->processor_set) > 0)) {
4106 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
4107 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_RUNQ_NONEMPTY);
4108 return true;
4109 }
4110
4111 /*
4112 * The yield logic should follow the same logic that steal_thread() does.
4113 * thread_should_yield() is effectively a quick check of whether, if the
4114 * current thread gave up the CPU, any other thread would execute on this
4115 * CPU. So it needs to provide the same answer as the steal_thread()/
4116 * processor-idle logic.
4117 */
4118 if (sched_edge_foreign_runnable_thread_available(processor->processor_set)) {
4119 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
4120 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNABLE);
4121 return true;
4122 }
4123 if (sched_edge_foreign_running_thread_available(processor->processor_set)) {
4124 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
4125 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNING);
4126 return true;
4127 }
4128
4129 processor_set_t steal_candidate = sched_edge_steal_candidate(processor->processor_set);
4130 if (steal_candidate != NULL) {
4131 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
4132 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_STEAL_POSSIBLE);
4133 return true;
4134 }
4135
4136 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE, thread_tid(thread), processor->processor_set->pset_cluster_id,
4137 0, SCHED_EDGE_YIELD_DISALLOW);
4138 return false;
4139 }
4140
4141 #if !SCHED_TEST_HARNESS
4142
4143 static void
4144 sched_edge_processor_queue_shutdown(processor_t processor)
4145 {
4146 processor_set_t pset = processor->processor_set;
4147 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
4148 thread_t thread;
4149 queue_head_t tqueue;
4150
4151 /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
4152 if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
4153 pset_unlock(pset);
4154 return;
4155 }
4156
4157 bitmap_clear(sched_edge_available_pset_bitmask, pset->pset_cluster_id);
4158
4159 queue_init(&tqueue);
4160 while (sched_clutch_root_count(pset_clutch_root) > 0) {
4161 thread = sched_clutch_hierarchy_thread_highest(pset_clutch_root, processor, NULL, SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY);
4162 enqueue_tail(&tqueue, &thread->runq_links);
4163 }
4164 pset_unlock(pset);
4165
4166 qe_foreach_element_safe(thread, &tqueue, runq_links) {
4167 remqueue(&thread->runq_links);
4168 thread_lock(thread);
4169 thread_setrun(thread, SCHED_TAILQ);
4170 thread_unlock(thread);
4171 }
4172 }
4173
4174 #endif /* !SCHED_TEST_HARNESS */
4175
4176 /*
4177 * sched_edge_cluster_load_metric()
4178 *
4179 * The load metric for a cluster is a measure of the average scheduling latency
4180 * experienced by threads on that cluster. It is a product of the average number
4181 * of threads in the runqueue and the average execution time for threads. The metric
4182 * has special values in the following cases:
4183 * - UINT32_MAX: If the cluster is not available for scheduling, its load is set to
4184 * the maximum value to prevent any threads from migrating to this cluster.
4185 * - 0: If there are idle CPUs in the cluster or an empty runqueue; this allows threads
4186 * to be spread across the platform quickly for ncpu-wide workloads.
4187 */
4188 static uint32_t
4189 sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket)
4190 {
4191 if (pset_is_recommended(pset) == false) {
4192 return UINT32_MAX;
4193 }
4194 return (uint32_t)sched_get_pset_load_average(pset, sched_bucket);
4195 }
4196
4197 /*
4198 *
4199 * Edge Scheduler Steal/Rebalance logic
4200 *
4201 * = Generic scheduler logic =
4202 *
4203 * The SCHED(steal_thread) scheduler callout is invoked when the processor does not
4204 * find any thread for execution in its runqueue. The aim of the steal operation
4205 * is to find other threads running/runnable in other clusters which should be
4206 * executed here.
4207 *
4208 * If the steal callout does not return a thread, the thread_select() logic calls
4209 * SCHED(processor_balance) callout which is supposed to IPI other CPUs to rebalance
4210 * threads and idle out the current CPU.
4211 *
4212 * = SCHED(steal_thread) for Edge Scheduler =
4213 *
4214 * The edge scheduler hooks into sched_edge_processor_idle() for steal_thread. This
4215 * routine tries to do the following operations in order:
4216 * (1) Find foreign runnable threads in non-native cluster
4217 * runqueues (sched_edge_foreign_runnable_thread_remove())
4218 * (2) Check if foreign threads are running on the non-native
4219 * clusters (sched_edge_foreign_running_thread_available())
4220 * - If yes, return THREAD_NULL for the steal callout and
4221 * perform rebalancing as part of SCHED(processor_balance) i.e. sched_edge_balance()
4222 * (3) Steal a thread from another cluster based on edge
4223 * weights (sched_edge_steal_thread())
4224 *
4225 * = SCHED(processor_balance) for Edge Scheduler =
4226 *
4227 * If steal_thread did not return a thread for the processor, use
4228 * sched_edge_balance() to rebalance foreign running threads and idle out this CPU.
4229 *
4230 * = Clutch Bucket Preferred Cluster Overrides =
4231 *
4232 * Since these operations (just like thread migrations on enqueue)
4233 * move threads across clusters, they need support for handling clutch
4234 * bucket group level preferred cluster recommendations.
4235 * For (1), a clutch bucket will be in the foreign runnable queue based
4236 * on the clutch bucket group preferred cluster.
4237 * For (2), the running thread will set the bit on the processor based
4238 * on its preferred cluster type.
4239 * For (3), the edge configuration would prevent threads from being stolen
4240 * in the wrong direction.
4241 *
4242 * = SCHED(thread_should_yield) =
4243 * The thread_should_yield() logic needs to have the same logic as sched_edge_processor_idle()
4244 * since that is expecting the same answer as if thread_select() was called on a core
4245 * with an empty runqueue.
4246 */
4247
4248 static bool
4249 sched_edge_steal_thread_enabled(__unused processor_set_t pset)
4250 {
4251 /*
4252 * For edge scheduler, the gating for steal is being done by sched_edge_steal_candidate()
4253 */
4254 return true;
4255 }
4256
4257 static processor_set_t
4258 sched_edge_steal_candidate(processor_set_t pset)
4259 {
4260 uint32_t dst_cluster_id = pset->pset_cluster_id;
4261 for (int cluster_id = 0; cluster_id < sched_edge_max_clusters; cluster_id++) {
4262 processor_set_t candidate_pset = pset_array[cluster_id];
4263 if (cluster_id == dst_cluster_id) {
4264 continue;
4265 }
4266 if (candidate_pset == NULL) {
4267 continue;
4268 }
4269 int highest_bucket = bitmap_lsb_first(candidate_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
4270 if (highest_bucket != -1) {
4271 /* Assumes that higher root buckets have less restrictive sce_steal_allowed edges */
4272 sched_clutch_edge edge = sched_edge_config_get(cluster_id, dst_cluster_id, highest_bucket);
4273 if (edge.sce_steal_allowed) {
4274 return candidate_pset;
4275 }
4276 }
4277 }
4278 return NULL;
4279 }
4280
4281 static boolean_t
4282 sched_edge_foreign_runnable_thread_available(processor_set_t pset)
4283 {
4284 /* Find all the clusters that are foreign for this cluster */
4285 bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
4286 for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
4287 /*
4288 * For each cluster, see if there are any runnable foreign threads.
4289 * This check is currently being done without the pset lock to make it cheap for
4290 * the common case.
4291 */
4292 processor_set_t target_pset = pset_array[cluster];
4293 if (pset_is_recommended(target_pset) == false) {
4294 continue;
4295 }
4296
4297 if (!sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
4298 return true;
4299 }
4300 }
4301 return false;
4302 }
4303
4304 static thread_t
4305 sched_edge_foreign_runnable_thread_remove(processor_set_t pset, uint64_t ctime)
4306 {
4307 thread_t thread = THREAD_NULL;
4308
4309 /* Find all the clusters that are foreign for this cluster */
4310 bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
4311 for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
4312 /*
4313 * For each cluster, see if there are any runnable foreign threads.
4314 * This check is currently being done without the pset lock to make it cheap for
4315 * the common case.
4316 */
4317 processor_set_t target_pset = pset_array[cluster];
4318 if (pset_is_recommended(target_pset) == false) {
4319 continue;
4320 }
4321
4322 if (sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
4323 continue;
4324 }
4325 /*
4326 * Looks like there are runnable foreign threads in the hierarchy; lock the pset
4327 * and get the highest priority thread.
4328 */
4329 pset_lock(target_pset);
4330 if (pset_is_recommended(target_pset)) {
4331 thread = sched_clutch_root_highest_foreign_thread_remove(&target_pset->pset_clutch_root);
4332 sched_update_pset_load_average(target_pset, ctime);
4333 }
4334 pset_unlock(target_pset);
4335
4336 /*
4337 * Edge Scheduler Optimization
4338 *
4339 * The current implementation immediately returns as soon as it finds a foreign
4340 * runnable thread. This could be enhanced to look at highest priority threads
4341 * from all foreign clusters and pick the highest amongst them. That would need
4342 * some form of global state across psets to make that kind of a check cheap.
4343 */
4344 if (thread != THREAD_NULL) {
4345 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNABLE) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, target_pset->pset_cluster_id, 0);
4346 break;
4347 }
4348 /* Looks like the thread escaped after the check but before the pset lock was taken; continue the search */
4349 }
4350
4351 return thread;
4352 }
4353
4354 /*
4355 * sched_edge_cpu_running_foreign_shared_rsrc_available()
4356 *
4357 * Routine to determine if the thread running on a CPU is a shared resource thread
4358 * and can be rebalanced to the cluster with an idle CPU. It is used to determine if
4359 * a CPU going idle on a pset should rebalance a running shared resource heavy thread
4360 * from another non-ideal cluster based on the former's shared resource load.
4361 */
4362 static boolean_t
4363 sched_edge_cpu_running_foreign_shared_rsrc_available(processor_set_t target_pset, int foreign_cpu, processor_set_t idle_pset)
4364 {
4365 boolean_t idle_pset_shared_rsrc_rr_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_RR);
4366 if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_RR], foreign_cpu) && !idle_pset_shared_rsrc_rr_idle) {
4367 return false;
4368 }
4369
4370 boolean_t idle_pset_shared_rsrc_biu_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST);
4371 if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST], foreign_cpu) && !idle_pset_shared_rsrc_biu_idle) {
4372 return false;
4373 }
4374 return true;
4375 }
4376
4377 static boolean_t
4378 sched_edge_foreign_running_thread_available(processor_set_t pset)
4379 {
4380 bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
4381 sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
4382 while (sched_iterate_psets_ordered(pset, &pset->spill_search_order[0], foreign_pset_bitmap[0], &istate)) {
4383 /* Skip the pset if it's not schedulable */
4384 processor_set_t target_pset = pset_array[istate.spis_pset_id];
4385 if (pset_is_recommended(target_pset) == false) {
4386 continue;
4387 }
4388
4389 uint64_t running_foreign_bitmap = target_pset->cpu_state_map[PROCESSOR_RUNNING] & target_pset->cpu_running_foreign;
4390 for (int cpu_foreign = bit_first(running_foreign_bitmap); cpu_foreign >= 0; cpu_foreign = bit_next(running_foreign_bitmap, cpu_foreign)) {
4391 if (!sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpu_foreign, pset)) {
4392 continue;
4393 }
4394 return true;
4395 }
4396 }
4397 return false;
4398 }
4399
4400 /*
4401 * sched_edge_steal_possible()
4402 *
4403 * Determines whether we can and should steal a thread from
4404 * the candidate_pset to run it on the idle_pset. When returning
4405 * true, the function also writes the scheduling bucket that we
4406 * should steal from into the bucket_for_steal out parameter.
4407 *
4408 * Always called with the pset lock for candidate_pset held.
4409 */
4410 static bool
4411 sched_edge_steal_possible(processor_set_t idle_pset, processor_set_t candidate_pset, sched_bucket_t *bucket_for_steal)
4412 {
4413 sched_clutch_root_t candidate_clutch_root = &candidate_pset->pset_clutch_root;
4414
4415 int highest_runnable_bucket = sched_clutch_root_highest_runnable_qos(candidate_clutch_root, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY);
4416 if (highest_runnable_bucket == -1) {
4417 /* Candidate cluster runq is empty of unbound threads */
4418 return false;
4419 }
4420
4421 for (int unbound_qos = highest_runnable_bucket; unbound_qos >= 0; unbound_qos = bitmap_lsb_next(candidate_clutch_root->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX, unbound_qos)) {
4422 /* Confirm we are allowed to steal across the edge at this QoS */
4423 sched_clutch_edge edge = sched_edge_config_get(candidate_pset->pset_cluster_id, idle_pset->pset_cluster_id, unbound_qos);
4424 if (edge.sce_steal_allowed == false) {
4425 continue;
4426 }
4427 if (edge.sce_migration_weight == 0) {
4428 /* Allow free stealing across a zero edge weight, even with idle cores in the candidate pset */
4429 *bucket_for_steal = (sched_bucket_t)unbound_qos;
4430 return true;
4431 }
4432 uint32_t candidate_runq_depth = os_atomic_load(&candidate_pset->pset_runnable_depth[unbound_qos], relaxed);
4433 if (candidate_runq_depth > pset_available_cpu_count(candidate_pset)) {
4434 /* Candidate cluster has excess load at this QoS (and at least one unbound thread we can steal!) */
4435 *bucket_for_steal = (sched_bucket_t)unbound_qos;
4436 return true;
4437 }
4438 }
4439 /* None of the unbound root buckets are available for steal */
4440 return false;
4441 }
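/*
 * Worked example (hypothetical values): if the candidate pset has 4 available
 * CPUs and a runnable depth of 6 at its highest unbound QoS, and the edge
 * toward the idle pset allows stealing with a nonzero migration weight, the
 * steal proceeds at that QoS; with a depth of 3 it would not, unless the edge
 * weight were zero, in which case free stealing is allowed regardless of load.
 */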
4442
4443 static thread_t
4444 sched_edge_steal_thread(processor_set_t pset, uint64_t candidate_pset_bitmap)
4445 {
4446 thread_t stolen_thread = THREAD_NULL;
4447
4448 /*
4449 * Edge Scheduler Optimization
4450 *
4451 * The logic today bails as soon as it finds a cluster where the cluster load is
4452 * greater than the edge weight. Maybe it should have a more advanced version
4453 * which looks for the maximum delta etc.
4454 */
4455 sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
4456 while (sched_iterate_psets_ordered(pset, &pset->spill_search_order[0], candidate_pset_bitmap, &istate)) {
4457 processor_set_t steal_from_pset = pset_array[istate.spis_pset_id];
4458 if (steal_from_pset == NULL) {
4459 continue;
4460 }
4461 bool steal_allowed = false;
4462 for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
4463 sched_clutch_edge edge = sched_edge_config_get(istate.spis_pset_id, pset->pset_cluster_id, bucket);
4464 if (edge.sce_steal_allowed) {
4465 steal_allowed = true;
4466 break;
4467 }
4468 }
4469 if (steal_allowed == false) {
4470 continue;
4471 }
4472 pset_lock(steal_from_pset);
4473 sched_bucket_t bucket_for_steal;
4474 if (sched_edge_steal_possible(pset, steal_from_pset, &bucket_for_steal)) {
4475 uint64_t current_timestamp = mach_absolute_time();
4476 sched_clutch_root_t clutch_root_for_steal = &steal_from_pset->pset_clutch_root;
4477 stolen_thread = sched_clutch_thread_unbound_lookup(clutch_root_for_steal, &clutch_root_for_steal->scr_unbound_buckets[bucket_for_steal], NULL, NULL);
4478 sched_clutch_thread_remove(clutch_root_for_steal, stolen_thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
4479
4480 sched_clutch_dbg_thread_select_packed_t debug_info = {0};
4481 debug_info.trace_data.version = SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION;
4482 debug_info.trace_data.traverse_mode = SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY;
4483 debug_info.trace_data.cluster_id = steal_from_pset->pset_cluster_id;
4484 debug_info.trace_data.selection_was_cluster_bound = false;
4485 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
4486 thread_tid(stolen_thread), thread_group_get_id(stolen_thread->thread_group), bucket_for_steal, debug_info.scdts_trace_data_packed, 0);
4487 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STEAL) | DBG_FUNC_NONE, thread_tid(stolen_thread), pset->pset_cluster_id, steal_from_pset->pset_cluster_id, 0);
4488
4489 sched_update_pset_load_average(steal_from_pset, current_timestamp);
4490 }
4491 pset_unlock(steal_from_pset);
4492 if (stolen_thread != THREAD_NULL) {
4493 break;
4494 }
4495 }
4496 return stolen_thread;
4497 }
4498
4499 /*
4500 * sched_edge_processor_idle()
4501 *
4502 * The routine is the implementation for steal_thread() for the Edge scheduler.
4503 */
4504 static thread_t
4505 sched_edge_processor_idle(processor_set_t pset)
4506 {
4507 thread_t thread = THREAD_NULL;
4508
4509 uint64_t ctime = mach_absolute_time();
4510
4511 processor_t processor = current_processor();
4512 bit_clear(pset->pending_spill_cpu_mask, processor->cpu_id);
4513
4514 /* Each of the operations acquire the lock for the pset they target */
4515 pset_unlock(pset);
4516
4517 /* Find highest priority runnable thread on all non-native clusters */
4518 thread = sched_edge_foreign_runnable_thread_remove(pset, ctime);
4519 if (thread != THREAD_NULL) {
4520 return thread;
4521 }
4522
4523 /* Find highest priority runnable thread on all native clusters */
4524 thread = sched_edge_steal_thread(pset, pset->native_psets[0]);
4525 if (thread != THREAD_NULL) {
4526 return thread;
4527 }
4528
4529 /* Find foreign running threads to rebalance; the actual rebalance is done in sched_edge_balance() */
4530 boolean_t rebalance_needed = sched_edge_foreign_running_thread_available(pset);
4531 if (rebalance_needed) {
4532 return THREAD_NULL;
4533 }
4534
4535 /* No foreign threads found; find a thread to steal from all clusters based on weights/loads etc. */
4536 thread = sched_edge_steal_thread(pset, pset->native_psets[0] | pset->foreign_psets[0]);
4537 return thread;
4538 }
4539
4540 /* Return true if this shared resource thread has a better cluster to run on */
4541 static bool
4542 sched_edge_shared_rsrc_migrate_possible(thread_t thread, processor_set_t preferred_pset, processor_set_t current_pset)
4543 {
4544 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4545 uint64_t current_pset_load = sched_pset_cluster_shared_rsrc_load(current_pset, shared_rsrc_type);
4546 /*
4547 * Adjust the current pset load to discount the current thread only if the current pset is of the preferred pset type. This allows the
4548 * scheduler to rebalance threads from a non-preferred cluster to an idle cluster of the preferred type.
4549 *
4550 * Edge Scheduler Optimization
4551 * For multi-cluster machines, it might be useful to enhance this mechanism to migrate between clusters of the preferred type.
4552 */
4553 uint64_t current_pset_adjusted_load = (current_pset->pset_type != preferred_pset->pset_type) ? current_pset_load : (current_pset_load - 1);
4554
4555 uint64_t eligible_pset_bitmask = 0;
4556 if (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST) {
4557 /*
4558 * For the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy, the load balancing occurs
4559 * only among clusters native with the preferred cluster.
4560 */
4561 eligible_pset_bitmask = preferred_pset->native_psets[0];
4562 bit_set(eligible_pset_bitmask, preferred_pset->pset_cluster_id);
4563 } else {
4564 /* For EDGE_SHARED_RSRC_SCHED_POLICY_RR, the load balancing happens among all clusters */
4565 eligible_pset_bitmask = sched_edge_available_pset_bitmask[0];
4566 }
4567
4568 /* For each eligible cluster check if there is an under-utilized cluster; return true if there is */
4569 for (int cluster_id = bit_first(eligible_pset_bitmask); cluster_id >= 0; cluster_id = bit_next(eligible_pset_bitmask, cluster_id)) {
4570 if (cluster_id == current_pset->pset_cluster_id) {
4571 continue;
4572 }
4573 uint64_t cluster_load = sched_pset_cluster_shared_rsrc_load(pset_array[cluster_id], shared_rsrc_type);
4574 if (current_pset_adjusted_load > cluster_load) {
4575 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHARED_RSRC_MIGRATE) | DBG_FUNC_NONE, current_pset_load, current_pset->pset_cluster_id, cluster_load, cluster_id);
4576 return true;
4577 }
4578 }
4579 return false;
4580 }
4581
4582 /*
4583 * Stir-the-pot Registry:
4584 *
4585 * Global state tracking which cores currently have threads that
4586 * are ready to be stirred onto cores of the opposite type.
4587 *
4588 * The registry state updates are implemented with atomic transaction
4589 * operations rather than a global lock, in order to avoid the cost
4590 * of serializing some of the most frequent registry state update
4591 * callsites that depend on consistent speed--namely the
4592 * preemption check and context-switch paths. The most expensive
4593 * state update, in sched_edge_stir_the_pot_try_trigger_swap(), only
4594 * happens at quantum expiration, which should allow cheaper
4595 * operations at other callsites to win the race.
4596 */
4597 typedef unsigned __int128 sched_edge_stp_registry_t;
4598 _Atomic sched_edge_stp_registry_t sched_edge_stir_the_pot_global_registry = 0LL;
4599 #define SESTP_BITS_PER_CORE (2)
4600 #define SESTP_BIT_POS(cpu_id) ((sched_edge_stp_registry_t)(cpu_id * SESTP_BITS_PER_CORE))
4601 #define SESTP_MASK(cpu_id) ((sched_edge_stp_registry_t)mask(SESTP_BITS_PER_CORE) << SESTP_BIT_POS(cpu_id))
4602 static_assert((SESTP_BITS_PER_CORE * MAX_CPUS) <= (sizeof(sched_edge_stp_registry_t) * 8),
4603 "Global registry must fit per-core bits for each core");
4604
4605 #define SESTP_EXTRACT_STATE(registry, cpu_id) ((registry >> SESTP_BIT_POS(cpu_id)) & mask(SESTP_BITS_PER_CORE))
4606 #define SESTP_SET_STATE(registry, cpu_id, state) ((registry & ~SESTP_MASK(cpu_id)) | ((sched_edge_stp_registry_t)state << SESTP_BIT_POS(cpu_id)))
4607 __enum_decl(sched_edge_stp_state_t, uint8_t, {
4608 SCHED_EDGE_STP_NOT_WANT = 0,
4609 SCHED_EDGE_STP_REQUESTED = 1,
4610 SCHED_EDGE_STP_PENDING = 2,
4611 SCHED_EDGE_STP_MAX = SCHED_EDGE_STP_PENDING
4612 });
4613 static_assert(SCHED_EDGE_STP_MAX <= mask(SESTP_BITS_PER_CORE),
4614 "Per-core stir-the-pot request state must fit in per-core bits");
4615
4616 #if OS_ATOMIC_USE_LLSC
4617 #error "Expecting CAS implementation of os_atomic_rmw_loop()"
4618 #endif /* OS_ATOMIC_USE_LLSC */
4619
4620 static cpumap_t sched_edge_p_core_map = 0ULL;
4621 static cpumap_t sched_edge_non_p_core_map = 0ULL;
4622
4623 /*
4624 * In order to reduce the chance of picking the same CPUs over
4625 * and over unfairly for stir-the-pot swaps, use an offset value
4626 * for the lsb selection, which rotates by one index each time
4627 * the choice is evaluated.
4628 */
4629 static _Atomic uint64_t sched_edge_stp_selection_p_core_offset = 0;
4630 static _Atomic uint64_t sched_edge_stp_selection_non_p_core_offset = 0;
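/*
 * Worked example (hypothetical values): if the opposite-type candidate map
 * covers cpus {1, 2, 4} and the rotating offset selects index 1, the search
 * start is cpu 2; bit_ror64() rotates the map so cpu 2 lands at bit 0, and
 * the scan in sched_edge_stir_the_pot_try_trigger_swap() un-rotates each hit
 * with (rotid + search_start_ind) % 64, visiting cpus in the order 2, 4, 1.
 */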
4631
4632 /*
4633 * sched_edge_stir_the_pot_try_trigger_swap()
4634 *
4635 * Search for an eligible swap candidate on the opposite core
4636 * type, and if one is found, initiate a swap for stir-the-pot.
4637 * From a P-core, initiating means sending an inbox message and IPI
4638 * to the swapping lower performance core. For initiating swap from
4639 * a lower performance core, only an inbox message needs to be sent
4640 * to itself, naming the P-core for swap.
4641 * If no eligible candidate is found, mark the current processor
4642 * as requesting stir-the-pot swap--that is unless a swap has already
4643 * been initiated for this core, in which case we should sit tight.
4644 * Thread lock must be held.
4645 */
4646 static inline int
4647 sched_edge_stir_the_pot_try_trigger_swap(thread_t thread)
4648 {
4649 processor_t self_processor = current_processor();
4650 int self_cpu = self_processor->cpu_id;
4651 /*
4652 * Prepare the core mask of candidate cores (of the opposite type),
4653 * and compute an offset where the candidate search should begin,
4654 * to avoid unfairly swapping with the same cores repeatedly.
4655 */
4656 cpumap_t swap_candidates_map;
4657 uint64_t offset;
4658 if (sched_edge_stir_the_pot_core_type_is_desired(self_processor->processor_set)) {
4659 swap_candidates_map = sched_edge_non_p_core_map;
4660 offset = os_atomic_inc_orig(&sched_edge_stp_selection_non_p_core_offset, relaxed);
4661 } else {
4662 swap_candidates_map = sched_edge_p_core_map;
4663 offset = os_atomic_inc_orig(&sched_edge_stp_selection_p_core_offset, relaxed);
4664 }
4665 int num_candidates = bit_count(swap_candidates_map);
4666 if (num_candidates == 0) {
4667 /* Too early in boot, no cores of opposite type */
4668 return -1;
4669 }
4670 int cpu_of_type_offset_ind = offset % num_candidates;
4671 int search_start_ind = lsb_first(swap_candidates_map);
4672 for (int i = 0; i < cpu_of_type_offset_ind; i++) {
4673 search_start_ind = lsb_next(swap_candidates_map, search_start_ind);
4674 assert3s(search_start_ind, !=, -1);
4675 }
4676 assert3s(search_start_ind, !=, -1);
4677 swap_candidates_map = bit_ror64(swap_candidates_map, search_start_ind);
4678 /*
4679 * Search the registry for candidate cores of the opposite type which
4680 * have requested swap.
4681 */
4682 int swap_cpu;
4683 sched_edge_stp_registry_t old_registry, new_registry, intermediate_registry;
4684 sched_edge_stp_state_t self_state;
4685 /* BEGIN IGNORE CODESTYLE */
4686 os_atomic_rmw_loop(&sched_edge_stir_the_pot_global_registry,
4687 old_registry, new_registry, relaxed, {
4688 swap_cpu = -1;
4689 self_state = SESTP_EXTRACT_STATE(old_registry, self_cpu);
4690 if (self_state == SCHED_EDGE_STP_PENDING) {
4691 /*
4692 * Another core already initiated a swap with us, so we should
4693 * wait for that one to finish rather than initiate or request
4694 * a new one.
4695 */
4696 os_atomic_rmw_loop_give_up(break);
4697 }
4698 /* Scan candidates */
4699 for (int rotid = lsb_first(swap_candidates_map); rotid != -1; rotid = lsb_next(swap_candidates_map, rotid)) {
4700 int candidate_cpu = (rotid + search_start_ind) % 64; // un-rotate the bit
4701 sched_edge_stp_state_t candidate_state = SESTP_EXTRACT_STATE(old_registry, candidate_cpu);
4702 if (candidate_state == SCHED_EDGE_STP_REQUESTED) {
4703 sched_bucket_t candidate_qos = os_atomic_load(
4704 &processor_array[candidate_cpu]->processor_set->cpu_running_buckets[candidate_cpu], relaxed);
4705 if (candidate_qos == thread->th_sched_bucket) {
4706 /* Found a requesting candidate of matching QoS */
4707 swap_cpu = candidate_cpu;
4708 break;
4709 }
4710 }
4711 }
4712 if (swap_cpu == -1) {
4713 /* No candidates requesting swap, so mark this core as requesting */
4714 intermediate_registry = SESTP_SET_STATE(old_registry, self_cpu, SCHED_EDGE_STP_REQUESTED);
4715 } else {
4716 /*
4717 * Mark candidate core as selected/pending for swap, and mark
4718 * current CPU as not needing a swap anymore, since we will now
4719 * start one.
4720 */
4721 intermediate_registry = SESTP_SET_STATE(old_registry, self_cpu, SCHED_EDGE_STP_PENDING);
4722 intermediate_registry = SESTP_SET_STATE(intermediate_registry, swap_cpu, SCHED_EDGE_STP_PENDING);
4723 }
4724 new_registry = intermediate_registry;
4725 });
4726 /* END IGNORE CODESTYLE */
4727 /* Leave debug tracepoints for tracking any updates to registry state */
4728 if (self_state != SCHED_EDGE_STP_PENDING) {
4729 if (swap_cpu == -1) {
4730 if (self_state != SCHED_EDGE_STP_REQUESTED) {
4731 /* Now requesting */
4732 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) |
4733 DBG_FUNC_START, 0, self_cpu, cpu_of_type_offset_ind, 0);
4734 }
4735 } else {
4736 if (self_state == SCHED_EDGE_STP_REQUESTED) {
4737 /* Now pending */
4738 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) |
4739 DBG_FUNC_END, 1, self_cpu, cpu_of_type_offset_ind, 0);
4740 }
4741 int swap_state = SESTP_EXTRACT_STATE(old_registry, swap_cpu);
4742 if (swap_state == SCHED_EDGE_STP_REQUESTED) {
4743 /* Swap core now pending */
4744 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) |
4745 DBG_FUNC_END, 1, swap_cpu, cpu_of_type_offset_ind, 0);
4746 }
4747 }
4748 }
4749 if (swap_cpu != -1) {
4750 /* Initiate a stir-the-pot swap */
4751 assert3s(swap_cpu, <, ml_get_topology_info()->num_cpus);
4752 assert3s(swap_cpu, !=, self_processor->cpu_id);
4753 processor_t swap_processor = processor_array[swap_cpu];
4754 if (swap_processor == PROCESSOR_NULL) {
4755 /* Unlikely early boot initialization race */
4756 return -1;
4757 }
4758 assert3u(sched_edge_stir_the_pot_core_type_is_desired(swap_processor->processor_set), !=,
4759 sched_edge_stir_the_pot_core_type_is_desired(self_processor->processor_set));
4760 if (sched_edge_stir_the_pot_core_type_is_desired(self_processor->processor_set)) {
4761 /*
4762 * Send a message and IPI notification to the lower-performance
4763 * core we found which wants to swap, so it will know to send its
4764 * thread back here.
4765 */
4766 os_atomic_store(&swap_processor->stir_the_pot_inbox_cpu, self_cpu, relaxed);
4767 processor_set_t swap_pset = swap_processor->processor_set;
4768 pset_lock(swap_pset);
4769 sched_ipi_type_t ipi_type = sched_ipi_action(swap_processor, NULL,
4770 SCHED_IPI_EVENT_REBALANCE);
4771 pset_unlock(swap_pset);
4772 sched_ipi_perform(swap_processor, ipi_type);
4773 } else {
4774 /*
4775 * Send message to self to send this thread to the swap P-core. P-core
4776 * will clear its own pending state upon committing to the incoming swap
4777 * thread after that happens.
4778 */
4779 os_atomic_store(&self_processor->stir_the_pot_inbox_cpu, swap_cpu, relaxed);
4780 }
4781 }
4782 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) | DBG_FUNC_NONE,
4783 (swap_cpu != -1) ? 1 : 0, swap_cpu, old_registry, cpu_of_type_offset_ind);
4784 return swap_cpu;
4785 }
4786
4787 /*
4788 * sched_edge_stir_the_pot_clear_registry_entry()
4789 *
4790 * Mark the current CPU as NOT containing a thread which is eligible
4791 * to be swapped for stir-the-pot.
4792 * Preemption must be disabled.
4793 */
4794 void
4795 sched_edge_stir_the_pot_clear_registry_entry(void)
4796 {
4797 int self_cpu = current_processor()->cpu_id;
4798 sched_edge_stp_state_t self_state;
4799 sched_edge_stp_registry_t old_registry, new_registry;
4800 os_atomic_rmw_loop(&sched_edge_stir_the_pot_global_registry,
4801 old_registry, new_registry, relaxed, {
4802 self_state = SESTP_EXTRACT_STATE(old_registry, self_cpu);
4803 if (self_state == SCHED_EDGE_STP_NOT_WANT) {
4804 /* State already cleared, nothing to be done */
4805 os_atomic_rmw_loop_give_up(break);
4806 }
4807 new_registry = SESTP_SET_STATE(old_registry, self_cpu, SCHED_EDGE_STP_NOT_WANT);
4808 });
4809 if (self_state == SCHED_EDGE_STP_REQUESTED) {
4810 /* Request was cleared */
4811 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) | DBG_FUNC_END,
4812 2, self_cpu, 0, 0);
4813 }
4814 }
4815
4816 /*
4817 * sched_edge_stir_the_pot_set_registry_entry()
4818 *
4819 * Mark the current CPU as containing a thread which is eligible
4820 * to be swapped to a core of the opposite type for stir-the-pot.
4821 * Preemption must be disabled.
4822 */
4823 static inline void
4824 sched_edge_stir_the_pot_set_registry_entry(void)
4825 {
4826 int self_cpu = current_processor()->cpu_id;
4827 sched_edge_stp_state_t self_state;
4828 sched_edge_stp_registry_t old_registry, new_registry;
4829 bool newly_requested = os_atomic_rmw_loop(&sched_edge_stir_the_pot_global_registry,
4830 old_registry, new_registry, relaxed, {
4831 self_state = SESTP_EXTRACT_STATE(old_registry, self_cpu);
4832 if (self_state == SCHED_EDGE_STP_REQUESTED) {
4833 /* Core already registered, nothing to be done */
4834 os_atomic_rmw_loop_give_up(break);
4835 }
4836 new_registry = SESTP_SET_STATE(old_registry, self_cpu, SCHED_EDGE_STP_REQUESTED);
4837 });
4838 if (newly_requested) {
4839 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) | DBG_FUNC_START,
4840 3, self_cpu, self_state, 0);
4841 }
4842 }
4843
4844 /* Stir-the-pot is designed for sharing time on the P-cores */
4845 static inline bool
4846 sched_edge_stir_the_pot_core_type_is_desired(processor_set_t pset)
4847 {
4848 return pset->pset_type == CLUSTER_TYPE_P;
4849 }
4850
4851 /*
4852 * sched_edge_stir_the_pot_thread_eligible()
4853 *
4854 * Determine whether a thread is eligible to engage in a
4855 * stir-the-pot swap. It must be P-recommended, unbound, and not
4856 * round-robin shared resource. Additionally, it must have already
4857 * expired quantum on its current core type.
4858 */
4859 static inline bool
4860 sched_edge_stir_the_pot_thread_eligible(thread_t thread)
4861 {
4862 processor_set_t preferred_pset;
4863 if ((thread == THREAD_NULL) ||
4864 ((preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)]) == PROCESSOR_SET_NULL)) {
4865 /* Still initializing at boot */
4866 return false;
4867 }
4868 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4869 bool right_kind_of_thread =
4870 sched_edge_stir_the_pot_core_type_is_desired(preferred_pset) &&
4871 (thread->sched_mode != TH_MODE_REALTIME) &&
4872 ((thread->state & TH_IDLE) == 0) &&
4873 SCHED_CLUTCH_THREAD_ELIGIBLE(thread) &&
4874 (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) == false) &&
4875 (shared_rsrc_type == CLUSTER_SHARED_RSRC_TYPE_NONE ||
4876 shared_rsrc_type == CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST);
4877 bool ready_for_swap = sched_edge_stir_the_pot_core_type_is_desired(current_processor()->processor_set) ?
4878 thread->th_expired_quantum_on_higher_core :
4879 thread->th_expired_quantum_on_lower_core;
4880 return right_kind_of_thread && ready_for_swap;
4881 }
4882
4883 /*
4884 * sched_edge_stir_the_pot_check_inbox_for_thread()
4885 *
4886 * Check whether this thread on a non-P-core has been chosen by a P-core to
4887 * swap places for stir-the-pot, optionally consuming the inbox message.
4888 * Preemption must be disabled.
4889 */
4890 static inline int
4891 sched_edge_stir_the_pot_check_inbox_for_thread(thread_t thread, bool consume_message)
4892 {
4893 processor_t self_processor = current_processor();
4894 int dst_cpu = -1;
4895 if (sched_edge_stir_the_pot_thread_eligible(thread)) {
4896 /* Thread can accept the inbox message */
4897 dst_cpu = os_atomic_load(&self_processor->stir_the_pot_inbox_cpu, relaxed);
4898 } else {
4899 /* Ensure registry state is cleared for ineligible thread, if it hasn't been already */
4900 sched_edge_stir_the_pot_clear_registry_entry();
4901 /*
4902 * Note, we don't clear a possible inbox message, in case an eligible
4903 * thread comes back on-core quickly to receive it.
4904 */
4905 }
4906 if (consume_message) {
4907 /*
4908 * Unconditionally clear inbox, since either we are triggering a
4909 * swap now or ultimately discarding the message because conditions
4910 * have changed (thread not eligible).
4911 */
4912 os_atomic_store(&self_processor->stir_the_pot_inbox_cpu, -1, relaxed);
4913 /*
4914 * We may have delayed requesting a stir-the-pot swap for the current thread
4915 * due to a pending inbox message for the previous thread. Now that such
4916 * a message has been received, finish updating the registry state.
4917 */
4918 if (sched_edge_stir_the_pot_thread_eligible(self_processor->active_thread)) {
4919 sched_edge_stir_the_pot_set_registry_entry();
4920 }
4921 }
4922 return dst_cpu;
4923 }
4924
4925 /*
4926 * sched_edge_stir_the_pot_update_registry_state()
4927 *
4928 * Update stir-the-pot state for the current processor based on its
4929 * (possibly new) current thread. This sets or clears the registry state
4930 * which indicates whether the processor is running a thread that wants
4931 * and is eligible to be swapped with a thread on the opposite core type.
4932 * Preemption must be disabled.
4933 */
4934 void
4935 sched_edge_stir_the_pot_update_registry_state(thread_t thread)
4936 {
4937 processor_t self_processor = current_processor();
4938 /*
4939 * Clear the corresponding th_expired_quantum_on_* field now that the thread
4940 * is getting a chance to run on the opposite core type.
4941 */
4942 if (sched_edge_stir_the_pot_core_type_is_desired(self_processor->processor_set)) {
4943 thread->th_expired_quantum_on_lower_core = false;
4944 } else {
4945 thread->th_expired_quantum_on_higher_core = false;
4946 }
4947 if (sched_edge_stir_the_pot_thread_eligible(thread)) {
4948 int inbox_message = os_atomic_load(&self_processor->stir_the_pot_inbox_cpu, relaxed);
4949 if (inbox_message == -1) {
4950 /* Set the registry bit */
4951 sched_edge_stir_the_pot_set_registry_entry();
4952 } else {
4953 assert(sched_edge_stir_the_pot_core_type_is_desired(self_processor->processor_set) == false);
4954 /*
4955 * There's an inbox message which still needs to be used at the next
4956 * migration decision, so avoid starting a new request or clearing the
4957 * interim pending status until then.
4958 */
4959 }
4960 } else {
4961 /* Thread is ineligible for swap, so clear the registry bit */
4962 sched_edge_stir_the_pot_clear_registry_entry();
4963 }
4964 }
4965
4966 /*
4967 * sched_edge_quantum_expire()
4968 *
4969 * Update stir-the-pot eligibility and drive stir-the-pot swaps.
4970 * Thread lock must be held.
4971 */
4972 static void
4973 sched_edge_quantum_expire(thread_t thread)
4974 {
4975 if (sched_edge_stir_the_pot_core_type_is_desired(current_processor()->processor_set)) {
4976 thread->th_expired_quantum_on_higher_core = true;
4977 } else {
4978 thread->th_expired_quantum_on_lower_core = true;
4979 }
4980 if (sched_edge_stir_the_pot_thread_eligible(thread)) {
4981 sched_edge_stir_the_pot_try_trigger_swap(thread);
4982 }
4983 }
4984
4985 /*
4986 * sched_edge_run_count_incr()
4987 *
4988 * Update runnable thread counts in the same way as
4989 * sched_clutch_run_incr(), and reset per-thread, quantum-
4990 * expired tracking used by stir-the-pot, as the thread
4991 * is unblocking.
4992 */
4993 static uint32_t
4994 sched_edge_run_count_incr(thread_t thread)
4995 {
4996 uint32_t new_count = sched_clutch_run_incr(thread);
4997 /* Thread is unblocking and so resets its quantum tracking */
4998 thread->th_expired_quantum_on_lower_core = false;
4999 thread->th_expired_quantum_on_higher_core = false;
5000 return new_count;
5001 }
5002
5003 /* Return true if this thread should not continue running on this processor */
5004 static bool
5005 sched_edge_thread_avoid_processor(processor_t processor, thread_t thread, ast_t reason)
5006 {
5007 if (thread->bound_processor == processor) {
5008 /* Thread is bound here */
5009 return false;
5010 }
5011
5012 /*
5013 * On quantum expiry, check the migration bitmask to see whether this thread should be migrated off this core.
5014 * A migration is only recommended if there's also an idle core available that needn't be avoided.
5015 */
5016 if (reason & AST_QUANTUM) {
5017 if (bit_test(processor->processor_set->perfcontrol_cpu_migration_bitmask, processor->cpu_id)) {
5018 uint64_t non_avoided_idle_primary_map = processor->processor_set->cpu_state_map[PROCESSOR_IDLE] & processor->processor_set->recommended_bitmask & ~processor->processor_set->perfcontrol_cpu_migration_bitmask;
5019 if (non_avoided_idle_primary_map != 0) {
5020 return true;
5021 }
5022 }
5023 }
5024
5025 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
5026
5027 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) &&
5028 preferred_pset->pset_id != processor->processor_set->pset_id &&
5029 pset_type_is_recommended(preferred_pset)) {
5030 /* We should send this thread to the bound cluster */
5031 return true;
5032 }
5033
5034 sched_clutch_edge edge = (thread->sched_pri >= BASEPRI_RTQUEUES)
5035 ? sched_rt_config_get(preferred_pset->pset_cluster_id, processor->processor_set->pset_cluster_id)
5036 : sched_edge_config_get(preferred_pset->pset_cluster_id, processor->processor_set->pset_cluster_id, thread->th_sched_bucket);
5037 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) == false &&
5038 preferred_pset->pset_id != processor->processor_set->pset_id &&
5039 edge.sce_migration_allowed == false &&
5040 edge.sce_steal_allowed == false) {
5041 /*
5042 * Thread isn't allowed to be here, according to the edge migration graph.
5043 * Perhaps the thread's priority or boundness or its thread group's preferred
5044 * pset or the edge migration graph changed.
5045 *
5046 * We should only preempt after confirming the thread actually has a
5047 * recommended, allowed alternative pset to run on.
5048 */
5049 for (uint32_t pset_id = 0; pset_id < sched_edge_max_clusters; pset_id++) {
5050 if (pset_id == processor->processor_set->pset_id) {
5051 continue;
5052 }
5053 edge = (thread->sched_pri >= BASEPRI_RTQUEUES)
5054 ? sched_rt_config_get(preferred_pset->pset_id, pset_id)
5055 : sched_edge_config_get(preferred_pset->pset_id, pset_id, thread->th_sched_bucket);
5056 if (pset_is_recommended(pset_array[pset_id]) && ((pset_id == preferred_pset->pset_id) || edge.sce_migration_allowed)) {
5057 /* Thread can be run elsewhere. */
5058 return true;
5059 }
5060 }
5061 }
5062
5063 /* Evaluate shared resource policies */
5064 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
5065 return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
5066 }
5067 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
5068 if (processor->processor_set->pset_type != preferred_pset->pset_type &&
5069 pset_type_is_recommended(preferred_pset)) {
5070 return true;
5071 }
5072 return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
5073 }
5074
5075 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5076 return false;
5077 }
5078 /* ~~ No realtime or shared resource threads beyond this point ~~ */
5079
5080 /*
5081 * Stir-the-Pot:
5082 * A non-P-core should preempt if a P-core has been found to which the current,
5083 * quantum-expired thread can be swapped for stir-the-pot. This lets threads in a
5084 * multi-threaded workload share time on the P-cores so they make roughly equal
5085 * forward progress.
5086 */
5087 if (sched_edge_stir_the_pot_check_inbox_for_thread(thread, false) != -1) {
5088 return true;
5089 }
5090
5091 /*
5092 * Compaction:
5093 * If the preferred pset for the thread is now idle, try and migrate the thread to that cluster.
5094 */
5095 if ((processor->processor_set != preferred_pset) &&
5096 (sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket) == 0)) {
5097 return true;
5098 }
5099
5100 /*
5101 * Running Rebalance:
5102 * We are willing to preempt the thread in order to migrate it onto an idle core
5103 * of the preferred type.
5104 */
5105 if ((processor->processor_set->pset_type != preferred_pset->pset_type) &&
5106 pset_type_is_recommended(preferred_pset)) {
5107 /* Scan for idle pset */
5108 for (uint32_t pset_id = 0; pset_id < sched_edge_max_clusters; pset_id++) {
5109 processor_set_t candidate_pset = pset_array[pset_id];
5110 edge = sched_edge_config_get(preferred_pset->pset_id, pset_id, thread->th_sched_bucket);
5111 if ((candidate_pset->pset_type == preferred_pset->pset_type) &&
5112 edge.sce_migration_allowed &&
5113 (sched_edge_cluster_load_metric(candidate_pset, thread->th_sched_bucket) == 0)) {
5114 return true;
5115 }
5116 }
5117 }
5118
5119 return false;
5120 }
5121
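/*
 * sched_edge_balance()
 *
 * Scan the clusters foreign to cpset for CPUs currently running threads that
 * are foreign to those clusters, and send rebalance IPIs so the threads can
 * migrate back to a more appropriate cluster. Returns true if any IPIs were
 * sent, in which case the calling core should idle lightly (WFE) while
 * waiting for the rebalances to land.
 */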
5122 static bool
5123 sched_edge_balance(__unused processor_t cprocessor, processor_set_t cpset)
5124 {
5125 assert(cprocessor == current_processor());
5126 pset_unlock(cpset);
5127
5128 uint64_t ast_processor_map = 0;
5129 sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
5130
5131 bitmap_t *foreign_pset_bitmap = cpset->foreign_psets;
5132 for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
5133 /* Skip the pset if it's not schedulable */
5134 processor_set_t target_pset = pset_array[cluster];
5135 if (pset_is_recommended(target_pset) == false) {
5136 continue;
5137 }
5138
5139 pset_lock(target_pset);
5140 uint64_t cpu_running_foreign_map = (target_pset->cpu_running_foreign & target_pset->cpu_state_map[PROCESSOR_RUNNING]);
5141 for (int cpuid = lsb_first(cpu_running_foreign_map); cpuid >= 0; cpuid = lsb_next(cpu_running_foreign_map, cpuid)) {
5142 if (!sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpuid, cpset)) {
5143 continue;
5144 }
5145 processor_t target_cpu = processor_array[cpuid];
5146 ipi_type[target_cpu->cpu_id] = sched_ipi_action(target_cpu, NULL, SCHED_IPI_EVENT_REBALANCE);
5147 if (ipi_type[cpuid] != SCHED_IPI_NONE) {
5148 bit_set(ast_processor_map, cpuid);
5149 }
5150 }
5151 pset_unlock(target_pset);
5152 }
5153
5154 for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
5155 processor_t ast_processor = processor_array[cpuid];
5156 sched_ipi_perform(ast_processor, ipi_type[cpuid]);
5157 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNING) | DBG_FUNC_NONE, 0, cprocessor->cpu_id, cpuid, 0);
5158 }
5159
5160 /* Core should light-weight idle using WFE if it just sent out rebalance IPIs */
5161 return ast_processor_map != 0;
5162 }
5163
5164 /*
5165 * sched_edge_migration_check()
5166 *
5167 * Routine to evaluate an edge between two clusters to decide if migration is possible
5168 * across that edge. Also updates the selected_pset and max_edge_delta out parameters
5169 * accordingly. The return value indicates if the invoking routine should short circuit
5170 * the search, since an ideal candidate has been found. The routine looks at the regular
5171 * edges and cluster loads or the shared resource loads based on the type of thread.
5172 */
5173 static bool
5174 sched_edge_migration_check(uint32_t cluster_id, processor_set_t preferred_pset,
5175 uint32_t preferred_cluster_load, thread_t thread, processor_set_t *selected_pset, uint32_t *max_edge_delta)
5176 {
5177 uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
5178 cluster_type_t preferred_cluster_type = pset_type_for_id(preferred_cluster_id);
5179 processor_set_t dst_pset = pset_array[cluster_id];
5180 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
5181 bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
5182
5183 if (cluster_id == preferred_cluster_id) {
5184 return false;
5185 }
5186
5187 if (dst_pset == NULL) {
5188 return false;
5189 }
5190
5191 sched_clutch_edge edge = sched_edge_config_get(preferred_cluster_id, cluster_id, thread->th_sched_bucket);
5192 if (edge.sce_migration_allowed == false) {
5193 return false;
5194 }
5195 uint32_t dst_load = shared_rsrc_thread ? (uint32_t)sched_pset_cluster_shared_rsrc_load(dst_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(dst_pset, thread->th_sched_bucket);
5196 if (dst_load == 0) {
5198 /* The candidate cluster is idle; select it immediately for execution */
5199 *selected_pset = dst_pset;
5200 *max_edge_delta = preferred_cluster_load;
5201 return true;
5202 }
5203
5204 uint32_t edge_delta = 0;
5205 if (dst_load > preferred_cluster_load) {
5206 return false;
5207 }
5208 edge_delta = preferred_cluster_load - dst_load;
5209 if (!shared_rsrc_thread && (edge_delta < edge.sce_migration_weight)) {
5210 /*
5211 * For non-shared-resource threads, use the edge migration weight to decide if
5212 * this cluster is over-committed at the QoS level of this thread.
5213 */
5214 return false;
5215 }
5216
5217 if (edge_delta < *max_edge_delta) {
5218 return false;
5219 }
5220 if (edge_delta == *max_edge_delta) {
5221 /* If the edge delta is the same as the max delta, make sure a homogeneous cluster is picked */
5222 boolean_t selected_homogeneous = ((*selected_pset)->pset_type == preferred_cluster_type);
5223 boolean_t candidate_homogeneous = (dst_pset->pset_type == preferred_cluster_type);
5224 if (selected_homogeneous || !candidate_homogeneous) {
5225 return false;
5226 }
5227 }
5228 /* dst_pset seems to be the best candidate for migration; however other candidates should still be evaluated */
5229 *max_edge_delta = edge_delta;
5230 *selected_pset = dst_pset;
5231 return false;
5232 }
5233
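/*
 * Worked example for sched_edge_migration_check() above, using hypothetical
 * loads for a non-shared-resource thread and the default P->E migration
 * weight of 64 configured in sched_edge_cpu_init_completed():
 *
 *   preferred_cluster_load = 96, dst_load = 16 -> edge_delta = 80 >= 64, so
 *       dst_pset passes the over-commit check and is compared against the
 *       best delta seen so far.
 *   preferred_cluster_load = 96, dst_load = 48 -> edge_delta = 48 < 64, so
 *       dst_pset is considered too close to the preferred cluster's load to
 *       be worth a migration.
 *   dst_load = 0 short-circuits the search immediately, since an idle
 *       cluster is always the best destination.
 */
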
5234 /*
5235 * sched_edge_migrate_edges_evaluate()
5236 *
5237 * Routine to find the candidate for thread migration based on edge weights.
5238 *
5239 * Returns the most ideal cluster for execution of this thread based on outgoing edges of the preferred pset. Can
5240 * return preferred_pset if it's the most ideal destination for this thread.
5241 */
5242 static processor_set_t
5243 sched_edge_migrate_edges_evaluate(processor_set_t preferred_pset, uint32_t preferred_cluster_load, thread_t thread)
5244 {
5245 processor_set_t selected_pset = preferred_pset;
5246 uint32_t max_edge_delta = 0;
5247 bool search_complete = false;
5248 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
5249 bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
5250
5251 bitmap_t *foreign_pset_bitmap = preferred_pset->foreign_psets;
5252 bitmap_t *native_pset_bitmap = preferred_pset->native_psets;
5253 /* Always start the search with the native clusters */
5254 sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
5255 while (sched_iterate_psets_ordered(preferred_pset, &preferred_pset->spill_search_order[thread->th_sched_bucket], native_pset_bitmap[0], &istate)) {
5256 search_complete = sched_edge_migration_check(istate.spis_pset_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
5257 if (search_complete) {
5258 break;
5259 }
5260 }
5261
5262 if (search_complete) {
5263 return selected_pset;
5264 }
5265
5266 if (shared_rsrc_thread && (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST)) {
5267 /*
5268 * If the shared resource scheduling policy is EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST, the scheduler tries
5269 * to fill up the preferred cluster and its homogeneous peers first.
5270 */
5271
5272 if (max_edge_delta > 0) {
5273 /*
5274 * A non-zero delta means there is a peer cluster of the same type as the preferred cluster (since the code
5275 * above only looks at the native_psets) which is running fewer threads of this shared resource type than
5276 * the preferred cluster. That indicates there is capacity on a native cluster where this thread should
5277 * be placed.
5278 */
5279 return selected_pset;
5280 }
5281 /*
5282 * Indicates that all peer native clusters are at the same shared resource usage; check if the preferred cluster has
5283 * any more capacity left.
5284 */
5285 if (sched_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) < pset_available_cpu_count(preferred_pset)) {
5286 return preferred_pset;
5287 }
5288 /*
5289 * Looks like the preferred cluster and all its native peers are full with shared resource threads; need to start looking
5290 * at non-native clusters for capacity.
5291 */
5292 }
5293
5294 /* Now look at the non-native clusters */
5295 istate = SCHED_PSET_ITERATE_STATE_INIT;
5296 while (sched_iterate_psets_ordered(preferred_pset, &preferred_pset->spill_search_order[thread->th_sched_bucket], foreign_pset_bitmap[0], &istate)) {
5297 search_complete = sched_edge_migration_check(istate.spis_pset_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
5298 if (search_complete) {
5299 break;
5300 }
5301 }
5302 return selected_pset;
5303 }
5304
5305 /*
5306 * sched_edge_candidate_alternative()
5307 *
5308 * Routine to find an alternative cluster from candidate_cluster_bitmap since the
5309 * selected_pset is not available for execution. The logic tries to prefer homogeneous
5310 * clusters over heterogeneous clusters since this is typically used in thread
5311 * placement decisions.
5312 */
5313 _Static_assert(MAX_PSETS <= 64, "Unable to fit maximum number of psets in uint64_t bitmask");
5314 static processor_set_t
5315 sched_edge_candidate_alternative(processor_set_t selected_pset, uint64_t candidate_cluster_bitmap)
5316 {
5317 /*
5318 * It looks like the most ideal pset is not available for scheduling currently.
5319 * Try to find a homogeneous cluster that is still available.
5320 */
5321 uint64_t available_native_clusters = selected_pset->native_psets[0] & candidate_cluster_bitmap;
5322 int available_cluster_id = lsb_first(available_native_clusters);
5323 if (available_cluster_id == -1) {
5324 /* Looks like none of the homogeneous clusters are available; pick the first available cluster */
5325 available_cluster_id = bit_first(candidate_cluster_bitmap);
5326 }
5327 assert(available_cluster_id != -1);
5328 return pset_array[available_cluster_id];
5329 }
5330
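/*
 * Example for sched_edge_candidate_alternative() above, with hypothetical
 * bitmaps: if selected_pset->native_psets[0] is 0x6 (clusters 1 and 2) and
 * candidate_cluster_bitmap is 0xC (clusters 2 and 3), the intersection is
 * 0x4, so cluster 2, a homogeneous peer, is chosen. If the intersection were
 * empty, the first available cluster in candidate_cluster_bitmap would be
 * used instead, regardless of its type.
 */
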
5331 /*
5332 * sched_edge_switch_pset_lock()
5333 *
5334 * Helper routine for sched_edge_migrate_candidate() which switches pset locks (if needed) based on
5335 * switch_pset_locks.
5336 * Returns the newly locked pset after the switch.
5337 */
5338 static processor_set_t
5339 sched_edge_switch_pset_lock(processor_set_t selected_pset, processor_set_t locked_pset, bool switch_pset_locks)
5340 {
5341 if (!switch_pset_locks) {
5342 return locked_pset;
5343 }
5344 if (selected_pset != locked_pset) {
5345 pset_unlock(locked_pset);
5346 pset_lock(selected_pset);
5347 return selected_pset;
5348 } else {
5349 return locked_pset;
5350 }
5351 }
5352
5353 /*
5354 * sched_edge_migrate_candidate()
5355 *
5356 * Routine to find an appropriate cluster for scheduling a thread. The routine looks at the properties of
5357 * the thread and the preferred cluster to determine the best available pset for scheduling.
5358 *
5359 * The switch_pset_locks parameter defines whether the routine should switch pset locks to provide an
5360 * accurate scheduling decision. This mode is typically used when choosing a pset for scheduling a thread since the
5361 * decision has to be synchronized with another CPU changing the recommendation of clusters available
5362 * on the system. If this parameter is set to false, this routine returns the best effort indication of
5363 * the cluster the thread should be scheduled on. It is typically used in fast path contexts (such as
5364 * SCHED(thread_avoid_processor)) to determine if there is a possibility of scheduling this thread on a
5365 * more appropriate cluster.
5366 *
5367 * Routine returns the most ideal cluster for scheduling. If switch_pset_locks is set, it ensures that the
5368 * resultant pset lock is held.
5369 */
5370 static processor_set_t
5371 sched_edge_migrate_candidate(processor_set_t _Nullable preferred_pset, thread_t thread,
5372 processor_set_t locked_pset, bool switch_pset_locks, processor_t *processor_hint_out,
5373 sched_options_t *options_inout)
5374 {
5375 processor_set_t selected_pset = preferred_pset;
5376 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
5377 bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
5378 bool stirring_the_pot = false;
5379
5380 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
5381 /*
5382 * For cluster-bound threads, choose the cluster to which the thread is bound, unless that
5383 * cluster is unavailable. If it's not available, fall through to the regular cluster selection
5384 * logic which handles derecommended clusters appropriately.
5385 */
5386 selected_pset = pset_array[sched_edge_thread_bound_cluster_id(thread)];
5387 if (selected_pset != NULL) {
5388 locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
5389 if (pset_is_recommended(selected_pset)) {
5390 return selected_pset;
5391 }
5392 }
5393 }
5394
5395 uint64_t candidate_cluster_bitmap = mask(sched_edge_max_clusters);
5396 #if DEVELOPMENT || DEBUG
5397 extern int enable_task_set_cluster_type;
5398 task_t task = get_threadtask(thread);
5399 if (enable_task_set_cluster_type && (task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) {
5400 processor_set_t pset_hint = task->pset_hint;
5401 if (pset_hint && (selected_pset == NULL || selected_pset->pset_cluster_type != pset_hint->pset_cluster_type)) {
5402 selected_pset = pset_hint;
5403 goto migrate_candidate_available_check;
5404 }
5405 }
5406 #endif
5407
5408 if (preferred_pset == NULL) {
5409 /* The preferred_pset has not finished initializing at boot */
5410 goto migrate_candidate_available_check;
5411 }
5412
5413 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5414 /* For realtime threads, try and schedule them on the preferred pset always */
5415 goto migrate_candidate_available_check;
5416 }
5417
5418 uint32_t preferred_cluster_load = shared_rsrc_thread ? (uint32_t)sched_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket);
5419 if (preferred_cluster_load == 0) {
5420 goto migrate_candidate_available_check;
5421 }
5422
5423 /*
5424 * If this thread has expired quantum on a non-preferred core and is waiting on
5425 * "stir-the-pot" to get a turn running on a P-core, check our processor inbox for
5426 * stir-the-pot to see if an eligible P-core has already been found for swap.
5427 * If so, try to migrate to the corresponding pset and also carry over the
5428 * processor hint to preempt that specific P-core.
5429 *
5430 * The AMP rebalancing mechanism is available for regular threads or shared resource
5431 * threads with the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy.
5432 */
5433 int stir_the_pot_swap_cpu = sched_edge_stir_the_pot_check_inbox_for_thread(thread, true);
5434 if (stir_the_pot_swap_cpu != -1) {
5435 *processor_hint_out = processor_array[stir_the_pot_swap_cpu];
5436 selected_pset = processor_array[stir_the_pot_swap_cpu]->processor_set;
5437 stirring_the_pot = true;
5438 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) | DBG_FUNC_NONE,
5439 2, stir_the_pot_swap_cpu, 0, 0);
5440 goto migrate_candidate_available_check;
5441 }
5442
5443 /* Look at edge weights to decide the most ideal migration candidate for this thread */
5444 selected_pset = sched_edge_migrate_edges_evaluate(preferred_pset, preferred_cluster_load, thread);
5445
5446 migrate_candidate_available_check:
5447 if (selected_pset == NULL) {
5448 /* The selected_pset has not finished initializing at boot */
5449 pset_unlock(locked_pset);
5450 return NULL;
5451 }
5452
5453 locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
5454 if (pset_is_recommended(selected_pset) == true) {
5455 /* Committing to the pset */
5456 if (stirring_the_pot) {
5457 *options_inout |= SCHED_STIR_POT;
5458 }
5459 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_OVERLOAD) | DBG_FUNC_NONE, thread_tid(thread), preferred_pset->pset_cluster_id, selected_pset->pset_cluster_id, preferred_cluster_load);
5460 return selected_pset;
5461 }
5462 stirring_the_pot = false;
5463 /* Looks like selected_pset is not available for scheduling; remove it from candidate_cluster_bitmap */
5464 bitmap_clear(&candidate_cluster_bitmap, selected_pset->pset_cluster_id);
5465 if (__improbable(bitmap_first(&candidate_cluster_bitmap, sched_edge_max_clusters) == -1)) {
5466 pset_unlock(locked_pset);
5467 return NULL;
5468 }
5469 /* Try and find an alternative for the selected pset */
5470 selected_pset = sched_edge_candidate_alternative(selected_pset, candidate_cluster_bitmap);
5471 goto migrate_candidate_available_check;
5472 }
5473
5474 static processor_t
5475 sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread, sched_options_t *options_inout)
5476 {
5477 /* Bound threads don't call this function */
5478 assert(thread->bound_processor == PROCESSOR_NULL);
5479 processor_t chosen_processor = PROCESSOR_NULL;
5480
5481 /*
5482 * sched_edge_preferred_pset() returns the preferred pset for a given thread.
5483 * It should take the passed in "pset" as a hint which represents the recency metric for
5484 * pset selection logic.
5485 */
5486 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
5487 processor_set_t chosen_pset = preferred_pset;
5488 /*
5489 * If the preferred pset is overloaded, find a pset which is the best candidate to migrate
5490 * threads to. sched_edge_migrate_candidate() returns the preferred pset
5491 * if it has capacity; otherwise finds the best candidate pset to migrate this thread to.
5492 *
5493 * Edge Scheduler Optimization
5494 * It might be useful to build a recency metric for the thread for multiple clusters and
5495 * factor that into the migration decisions.
5496 */
5497 chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, pset, true, &processor, options_inout);
5498 if (chosen_pset) {
5499 chosen_processor = choose_processor(chosen_pset, processor, thread, options_inout);
5500 }
5501 return chosen_processor;
5502 }
5503
5504 /*
5505 * sched_edge_clutch_bucket_threads_drain()
5506 *
5507 * Drains all the runnable threads which are not restricted to the root_clutch (due to clutch
5508 * bucket overrides etc.) into a local thread queue.
5509 */
5510 static void
5511 sched_edge_clutch_bucket_threads_drain(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch, queue_t clutch_threads)
5512 {
5513 thread_t thread = THREAD_NULL;
5514 uint64_t current_timestamp = mach_approximate_time();
5515 qe_foreach_element_safe(thread, &clutch_bucket->scb_thread_timeshare_queue, th_clutch_timeshare_link) {
5516 sched_clutch_thread_remove(root_clutch, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
5517 enqueue_tail(clutch_threads, &thread->runq_links);
5518 }
5519 }
5520
5521 #if !SCHED_TEST_HARNESS
5522
5523 /*
5524 * sched_edge_run_drained_threads()
5525 *
5526 * Makes all drained threads in a local queue runnable.
5527 */
5528 static void
5529 sched_edge_run_drained_threads(queue_t clutch_threads)
5530 {
5531 thread_t thread;
5532 /* Now setrun all the threads in the local queue */
5533 qe_foreach_element_safe(thread, clutch_threads, runq_links) {
5534 remqueue(&thread->runq_links);
5535 thread_lock(thread);
5536 thread_setrun(thread, SCHED_TAILQ);
5537 thread_unlock(thread);
5538 }
5539 }
5540
5541 #endif /* !SCHED_TEST_HARNESS */
5542
5543 /*
5544 * sched_edge_update_preferred_cluster()
5545 *
5546 * Routine to update the preferred cluster for QoS buckets within a thread group.
5547 * The buckets to be updated are specified as a bitmap (clutch_bucket_modify_bitmap).
5548 */
5549 static void
5550 sched_edge_update_preferred_cluster(
5551 sched_clutch_t sched_clutch,
5552 bitmap_t *clutch_bucket_modify_bitmap,
5553 uint32_t *tg_bucket_preferred_cluster)
5554 {
5555 for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
5556 os_atomic_store(&sched_clutch->sc_clutch_groups[bucket].scbg_preferred_cluster, tg_bucket_preferred_cluster[bucket], relaxed);
5557 }
5558 }
5559
5560 #if !SCHED_TEST_HARNESS
5561
5562 /*
5563 * sched_edge_migrate_thread_group_runnable_threads()
5564 *
5565 * Routine to implement the migration of threads on a cluster when the thread group
5566 * recommendation is updated. The migration works using a 2-phase
5567 * algorithm.
5568 *
5569 * Phase 1: With the pset lock held, check the recommendation of the clutch buckets.
5570 * For each clutch bucket, if it needs to be migrated immediately, drain the threads
5571 * into a local thread queue. Otherwise mark the clutch bucket as native/foreign as
5572 * appropriate.
5573 *
5574 * Phase 2: After unlocking the pset, drain all the threads from the local thread
5575 * queue and mark them runnable which should land them in the right hierarchy.
5576 *
5577 * The routine assumes that the preferences for the clutch buckets/clutch bucket
5578 * groups have already been updated by the caller.
5579 *
5580 * - Called with the pset locked and interrupts disabled.
5581 * - Returns with the pset unlocked.
5582 */
5583 static void
5584 sched_edge_migrate_thread_group_runnable_threads(
5585 sched_clutch_t sched_clutch,
5586 sched_clutch_root_t root_clutch,
5587 bitmap_t *clutch_bucket_modify_bitmap,
5588 __unused uint32_t *tg_bucket_preferred_cluster,
5589 bool migrate_immediately)
5590 {
5591 /* Queue to hold threads that have been drained from clutch buckets to be migrated */
5592 queue_head_t clutch_threads;
5593 queue_init(&clutch_threads);
5594
5595 for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
5596 /* Get the clutch bucket for this cluster and sched bucket */
5597 sched_clutch_bucket_group_t clutch_bucket_group = &(sched_clutch->sc_clutch_groups[bucket]);
5598 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
5599 sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
5600 if (scb_root == NULL) {
5601 /* Clutch bucket not runnable or already in the right hierarchy; nothing to do here */
5602 assert(clutch_bucket->scb_thr_count == 0);
5603 continue;
5604 }
5605 assert(scb_root == root_clutch);
5606 uint32_t clutch_bucket_preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
5607
5608 if (migrate_immediately) {
5609 /*
5610 * For transitions where threads need to be migrated immediately, drain the threads into a
5611 * local queue unless we are looking at the clutch buckets for the newly recommended
5612 * cluster.
5613 */
5614 if (root_clutch->scr_cluster_id != clutch_bucket_preferred_cluster) {
5615 sched_edge_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads);
5616 } else {
5617 sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
5618 }
5619 } else {
5620 /* Check if this cluster is the same type as the newly recommended cluster */
5621 boolean_t homogeneous_cluster = (pset_type_for_id(root_clutch->scr_cluster_id) == pset_type_for_id(clutch_bucket_preferred_cluster));
5622 /*
5623 * If threads do not have to be migrated immediately, just change the native/foreign
5624 * flag on the clutch bucket.
5625 */
5626 if (homogeneous_cluster) {
5627 sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
5628 } else {
5629 sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
5630 }
5631 }
5632 }
5633
5634 pset_unlock(root_clutch->scr_pset);
5635 sched_edge_run_drained_threads(&clutch_threads);
5636 }
5637
5638 /*
5639 * sched_edge_migrate_thread_group_running_threads()
5640 *
5641 * Routine to find all running threads of a thread group on a specific cluster
5642 * and IPI them if they need to be moved immediately.
5643 */
5644 static void
5645 sched_edge_migrate_thread_group_running_threads(
5646 sched_clutch_t sched_clutch,
5647 sched_clutch_root_t root_clutch,
5648 __unused bitmap_t *clutch_bucket_modify_bitmap,
5649 uint32_t *tg_bucket_preferred_cluster,
5650 bool migrate_immediately)
5651 {
5652 if (migrate_immediately == false) {
5653 /* If CLPC has recommended not to move threads immediately, nothing to do here */
5654 return;
5655 }
5656
5657 /*
5658 * Edge Scheduler Optimization
5659 *
5660 * When the system has a large number of clusters and cores, it might be useful to
5661 * narrow down the iteration by using a thread running bitmap per clutch.
5662 */
5663 uint64_t ast_processor_map = 0;
5664 sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
5665
5666 uint64_t running_map = root_clutch->scr_pset->cpu_state_map[PROCESSOR_RUNNING];
5667 /*
5668 * Iterate all CPUs and look for the ones that are running threads from this thread group and are
5669 * not restricted to the specific cluster (due to overrides etc.)
5670 */
5671 for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
5672 processor_t src_processor = processor_array[cpuid];
5673 boolean_t expected_tg = (src_processor->current_thread_group == sched_clutch->sc_tg);
5674 sched_bucket_t processor_sched_bucket = src_processor->processor_set->cpu_running_buckets[cpuid];
5675 if (processor_sched_bucket == TH_BUCKET_SCHED_MAX) {
5676 continue;
5677 }
5678 boolean_t non_preferred_cluster = tg_bucket_preferred_cluster[processor_sched_bucket] != root_clutch->scr_cluster_id;
5679
5680 if (expected_tg && non_preferred_cluster) {
5681 ipi_type[cpuid] = sched_ipi_action(src_processor, NULL, SCHED_IPI_EVENT_REBALANCE);
5682 if (ipi_type[cpuid] != SCHED_IPI_NONE) {
5683 bit_set(ast_processor_map, cpuid);
5684 } else if (src_processor == current_processor()) {
5685 bit_set(root_clutch->scr_pset->pending_AST_PREEMPT_cpu_mask, cpuid);
5686 ast_t new_preempt = update_pending_nonurgent_preemption(src_processor, AST_PREEMPT);
5687 ast_on(new_preempt);
5688 }
5689 }
5690 }
5691
5692 /* Perform all the IPIs */
5693 if (bit_first(ast_processor_map) != -1) {
5694 for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
5695 processor_t ast_processor = processor_array[cpuid];
5696 sched_ipi_perform(ast_processor, ipi_type[cpuid]);
5697 }
5698 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, thread_group_get_id(sched_clutch->sc_tg), ast_processor_map, 0, 0);
5699 }
5700 }
5701
5702 /*
5703 * sched_edge_tg_preferred_cluster_change()
5704 *
5705 * Routine to handle changes to a thread group's recommendation. In the Edge Scheduler, the preferred cluster
5706 * is specified on a per-QoS basis within a thread group. The routine updates the preferences and performs
5707 * thread migrations based on the policy specified by CLPC.
5708 * tg_bucket_preferred_cluster is an array of size TH_BUCKET_SCHED_MAX which specifies the new preferred cluster
5709 * for each QoS within the thread group.
5710 */
5711 void
5712 sched_edge_tg_preferred_cluster_change(struct thread_group *tg, uint32_t *tg_bucket_preferred_cluster, sched_perfcontrol_preferred_cluster_options_t options)
5713 {
5714 sched_clutch_t clutch = sched_clutch_for_thread_group(tg);
5715 /*
5716 * In order to optimize the processing, create a bitmap which represents all QoS buckets
5717 * for which the preferred cluster has changed.
5718 */
5719 bitmap_t clutch_bucket_modify_bitmap[BITMAP_LEN(TH_BUCKET_SCHED_MAX)] = {0};
5720 for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
5721 uint32_t old_preferred_cluster = sched_edge_clutch_bucket_group_preferred_cluster(&clutch->sc_clutch_groups[bucket]);
5722 uint32_t new_preferred_cluster = tg_bucket_preferred_cluster[bucket];
5723 if (old_preferred_cluster != new_preferred_cluster) {
5724 bitmap_set(clutch_bucket_modify_bitmap, bucket);
5725 }
5726 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREFERRED_PSET) | DBG_FUNC_NONE,
5727 thread_group_get_id(tg), bucket, new_preferred_cluster, options);
5728 }
5729 if (bitmap_lsb_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
5730 /* No changes in any clutch buckets; nothing to do here */
5731 return;
5732 }
5733
5734 /*
5735 * The first operation is to update the preferred cluster for all QoS buckets within the
5736 * thread group so that any future threads becoming runnable would see the new preferred
5737 * cluster value.
5738 */
5739 sched_edge_update_preferred_cluster(clutch, clutch_bucket_modify_bitmap, tg_bucket_preferred_cluster);
5740
5741 for (uint32_t cluster_id = 0; cluster_id < sched_edge_max_clusters; cluster_id++) {
5742 processor_set_t pset = pset_array[cluster_id];
5743 spl_t s = splsched();
5744 pset_lock(pset);
5745 /*
5746 * Currently iterates all clusters looking for running threads for a TG to be migrated. Can be optimized
5747 * by keeping a per-clutch bitmap of clusters running threads for a particular TG.
5748 *
5749 * Edge Scheduler Optimization
5750 */
5751 /* Migrate all running threads of the TG on this cluster based on options specified by CLPC */
5752 sched_edge_migrate_thread_group_running_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
5753 tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNING));
5754 /* Migrate all runnable threads of the TG in this cluster's hierarchy based on options specified by CLPC */
5755 sched_edge_migrate_thread_group_runnable_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
5756 tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNABLE));
5757 /* sched_edge_migrate_thread_group_runnable_threads() returns with pset unlocked */
5758 splx(s);
5759 }
5760 }
5761
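/*
 * Minimal illustrative sketch of a caller (hypothetical values; in practice
 * this routine is driven by the CLPC/performance-controller recommendation
 * plumbing):
 *
 *   uint32_t prefs[TH_BUCKET_SCHED_MAX];
 *   for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
 *           prefs[bucket] = 0;                 // e.g. prefer cluster 0 by default
 *   }
 *   prefs[TH_BUCKET_SHARE_BG] = 1;             // e.g. keep BG work on cluster 1
 *   sched_edge_tg_preferred_cluster_change(tg, prefs,
 *       SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNING);
 *
 * Buckets whose preference is unchanged are skipped via the modify bitmap
 * computed above.
 */
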
5762 /*
5763 * sched_edge_pset_made_schedulable()
5764 *
5765 * Routine to migrate all the clutch buckets which are not in their recommended
5766 * pset hierarchy now that a new pset has become runnable. It's possible that this
5767 * routine is called when the pset is already marked schedulable.
5768 *
5769 * Invoked with the pset lock held and interrupts disabled.
5770 */
5771 static void
5772 sched_edge_pset_made_schedulable(__unused processor_t processor, processor_set_t dst_pset, boolean_t drop_lock)
5773 {
5774 if (bitmap_test(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id)) {
5775 /* Nothing to do here since pset is already marked schedulable */
5776 if (drop_lock) {
5777 pset_unlock(dst_pset);
5778 }
5779 return;
5780 }
5781
5782 bitmap_set(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id);
5783
5784 thread_t thread = sched_edge_processor_idle(dst_pset);
5785 if (thread != THREAD_NULL) {
5786 thread_lock(thread);
5787 thread_setrun(thread, SCHED_TAILQ);
5788 thread_unlock(thread);
5789 }
5790
5791 if (!drop_lock) {
5792 pset_lock(dst_pset);
5793 }
5794 }
5795
5796 #endif /* !SCHED_TEST_HARNESS */
5797
5798
5799 /*
5800 * sched_edge_cpu_init_completed()
5801 *
5802 * Callback routine from the platform layer once all CPUs/clusters have been initialized. This
5803 * provides an opportunity for the edge scheduler to initialize all the edge parameters.
5804 */
5805 static void
5806 sched_edge_cpu_init_completed(void)
5807 {
5808 /* Now that all cores have registered, compute bitmaps for different core types */
5809 for (int pset_id = 0; pset_id < sched_edge_max_clusters; pset_id++) {
5810 processor_set_t pset = pset_array[pset_id];
5811 if (sched_edge_stir_the_pot_core_type_is_desired(pset)) {
5812 os_atomic_or(&sched_edge_p_core_map, pset->cpu_bitmask, relaxed);
5813 } else {
5814 os_atomic_or(&sched_edge_non_p_core_map, pset->cpu_bitmask, relaxed);
5815 }
5816 }
5817 /* Build policy table for setting edge weight tunables based on cluster types */
5818 sched_clutch_edge edge_config_defaults[MAX_CPU_TYPES][MAX_CPU_TYPES];
5819 sched_clutch_edge free_spill = (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 1, .sce_steal_allowed = 1};
5820 sched_clutch_edge no_spill = (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0};
5821 sched_clutch_edge weighted_spill = (sched_clutch_edge){.sce_migration_weight = 64, .sce_migration_allowed = 1, .sce_steal_allowed = 1};
5822 /* P -> P */
5823 edge_config_defaults[CLUSTER_TYPE_P][CLUSTER_TYPE_P] = free_spill;
5824 /* E -> E */
5825 edge_config_defaults[CLUSTER_TYPE_E][CLUSTER_TYPE_E] = free_spill;
5826 /* P -> E */
5827 edge_config_defaults[CLUSTER_TYPE_P][CLUSTER_TYPE_E] = weighted_spill;
5828 /* E -> P */
5829 edge_config_defaults[CLUSTER_TYPE_E][CLUSTER_TYPE_P] = no_spill;
5830
5831 spl_t s = splsched();
5832 for (int src_cluster_id = 0; src_cluster_id < sched_edge_max_clusters; src_cluster_id++) {
5833 processor_set_t src_pset = pset_array[src_cluster_id];
5834 pset_lock(src_pset);
5835
5836 /* Each pset recommendation is at least allowed to access its own cluster */
5837 for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
5838 src_pset->max_parallel_cores[bucket] = src_pset->cpu_set_count;
5839 src_pset->max_parallel_clusters[bucket] = 1;
5840 }
5841
5842 /* For each cluster, set all its outgoing edge parameters */
5843 for (int dst_cluster_id = 0; dst_cluster_id < sched_edge_max_clusters; dst_cluster_id++) {
5844 processor_set_t dst_pset = pset_array[dst_cluster_id];
5845 if (dst_cluster_id == src_cluster_id) {
5846 continue;
5847 }
5848
5849 bool clusters_homogenous = (src_pset->pset_type == dst_pset->pset_type);
5850 if (clusters_homogenous) {
5851 bitmap_clear(src_pset->foreign_psets, dst_cluster_id);
5852 bitmap_set(src_pset->native_psets, dst_cluster_id);
5853 /* Default realtime policy: spill allowed among homogeneous psets. */
5854 sched_rt_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge) {
5855 .sce_migration_allowed = true,
5856 .sce_steal_allowed = true,
5857 .sce_migration_weight = 0,
5858 });
5859 } else {
5860 bitmap_set(src_pset->foreign_psets, dst_cluster_id);
5861 bitmap_clear(src_pset->native_psets, dst_cluster_id);
5862 /* Default realtime policy: disallow spill among heterogeneous psets. */
5863 sched_rt_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge) {
5864 .sce_migration_allowed = false,
5865 .sce_steal_allowed = false,
5866 .sce_migration_weight = 0,
5867 });
5868 }
5869
5870 bool clusters_local = (ml_get_die_id(src_cluster_id) == ml_get_die_id(dst_cluster_id));
5871 if (clusters_local) {
5872 bitmap_set(src_pset->local_psets, dst_cluster_id);
5873 bitmap_clear(src_pset->remote_psets, dst_cluster_id);
5874 } else {
5875 bitmap_set(src_pset->remote_psets, dst_cluster_id);
5876 bitmap_clear(src_pset->local_psets, dst_cluster_id);
5877 }
5878
5879 for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
5880 /* Set tunables for an edge based on the cluster types at either ends of it */
5881 sched_clutch_edge edge_config = edge_config_defaults[src_pset->pset_type][dst_pset->pset_type];
5882 sched_edge_config_set(src_cluster_id, dst_cluster_id, bucket, edge_config);
5883 if (edge_config.sce_migration_allowed) {
5884 src_pset->max_parallel_cores[bucket] += dst_pset->cpu_set_count;
5885 src_pset->max_parallel_clusters[bucket] += 1;
5886 }
5887 }
5888 }
5889 sched_edge_config_pset_push(src_cluster_id);
5890
5891 pset_unlock(src_pset);
5892 }
5893 splx(s);
5894 }
5895
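/*
 * For a hypothetical two-cluster system (cluster 0 of type P, cluster 1 of
 * type E), the defaults above yield, for every sched bucket:
 *
 *   0 -> 1 (P -> E): migration and steal allowed, migration weight 64
 *   1 -> 0 (E -> P): migration and steal disallowed
 *
 * so max_parallel_cores for cluster 0 covers both clusters while cluster 1 is
 * limited to its own cores. The realtime config only allows spill between
 * clusters of the same type, so neither edge permits RT migration in this
 * example.
 */
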
5896 static bool
5897 sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset)
5898 {
5899 uint32_t preferred_cluster_id = sched_edge_thread_preferred_cluster(thread);
5900 if (preferred_cluster_id == pset->pset_cluster_id) {
5901 return true;
5902 } else {
5903 sched_clutch_edge edge;
5904 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5905 edge = sched_rt_config_get(preferred_cluster_id, pset->pset_id);
5906 } else {
5907 edge = sched_edge_config_get(preferred_cluster_id, pset->pset_cluster_id, thread->th_sched_bucket);
5908 }
5909 return edge.sce_migration_allowed;
5910 }
5911 }
5912
5913 extern int sched_amp_spill_deferred_ipi;
5914 extern int sched_amp_pcores_preempt_immediate_ipi;
5915
5916 int sched_edge_migrate_ipi_immediate = 1;
5917
5918 sched_ipi_type_t
5919 sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
5920 {
5921 processor_set_t pset = dst->processor_set;
5922 assert(dst != current_processor());
5923
5924 boolean_t deferred_ipi_supported = false;
5925 #if defined(CONFIG_SCHED_DEFERRED_AST)
5926 deferred_ipi_supported = true;
5927 #endif /* CONFIG_SCHED_DEFERRED_AST */
5928
5929 switch (event) {
5930 case SCHED_IPI_EVENT_SPILL:
5931 /* For spill events, use deferred IPIs if sched_amp_spill_deferred_ipi is set */
5932 if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
5933 return sched_ipi_deferred_policy(pset, dst, thread, event);
5934 }
5935 break;
5936 case SCHED_IPI_EVENT_PREEMPT:
5937 /* For preemption, the default policy is to use deferred IPIs
5938 * for Non-RT P-core preemption. Override that behavior if
5939 * sched_amp_pcores_preempt_immediate_ipi is set
5940 */
5941 if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
5942 if (sched_amp_pcores_preempt_immediate_ipi && (pset_type_for_id(pset->pset_cluster_id) == CLUSTER_TYPE_P)) {
5943 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
5944 }
5945 if (sched_edge_migrate_ipi_immediate) {
5946 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
5947 /*
5948 * For IPI'ing CPUs that are homogeneous with the preferred cluster, use immediate IPIs
5949 */
5950 if (preferred_pset->pset_type == pset->pset_type) {
5951 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
5952 }
5953 /*
5954 * For workloads that are going wide, it might be useful to use an immediate IPI to
5955 * wake up the idle CPU if the scheduler estimates that the preferred pset will
5956 * be busy for the deferred IPI timeout. The Edge Scheduler uses the avg execution
5957 * latency on the preferred pset as an estimate of busyness.
5958 */
5959 if ((preferred_pset->pset_execution_time[thread->th_sched_bucket].pset_avg_thread_execution_time * NSEC_PER_USEC) >= ml_cpu_signal_deferred_get_timer()) {
5960 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
5961 }
5962 }
5963 }
5964 break;
5965 default:
5966 break;
5967 }
5968 /* Default back to the global policy for all other scenarios */
5969 return sched_ipi_policy(dst, thread, dst_idle, event);
5970 }
5971
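/*
 * Example of the busyness heuristic in sched_edge_ipi_policy() above, with
 * hypothetical numbers: if the preferred pset's average thread execution time
 * for this bucket is 500us and the deferred-IPI timer reported by
 * ml_cpu_signal_deferred_get_timer() is 64us (i.e. 64000ns), then
 * 500us * NSEC_PER_USEC = 500000ns >= 64000ns, so an immediate IPI is used to
 * wake the idle CPU rather than waiting out the deferred timer.
 */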
5972
5973 /*
5974 * sched_edge_qos_max_parallelism()
5975 */
5976 uint32_t
5977 sched_edge_qos_max_parallelism(int qos, uint64_t options)
5978 {
5979 cluster_type_t low_core_type = CLUSTER_TYPE_E;
5980 cluster_type_t high_core_type = CLUSTER_TYPE_P;
5981
5982 if (options & QOS_PARALLELISM_REALTIME) {
5983 /* For realtime threads on AMP, we want to limit the
5984 * width to just the P-cores since we do not
5985 * spill/rebalance for RT threads.
5986 */
5987 uint32_t high_cpu_count = ml_get_cpu_number_type(high_core_type, false, false);
5988 uint32_t high_cluster_count = ml_get_cluster_number_type(high_core_type);
5989 return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? high_cluster_count : high_cpu_count;
5990 }
5991
5992 /*
5993 * The Edge scheduler supports per-QoS recommendations for thread groups.
5994 * This enables lower QoS buckets (such as UT) to be scheduled on all
5995 * CPUs on the system.
5996 *
5997 * The only restriction is for BG/Maintenance QoS classes for which the
5998 * performance controller would never recommend execution on the P-cores.
5999 * If that policy changes in the future, this value should be changed.
6000 */
6001 switch (qos) {
6002 case THREAD_QOS_BACKGROUND:
6003 case THREAD_QOS_MAINTENANCE:;
6004 uint32_t low_cpu_count = ml_get_cpu_number_type(low_core_type, false, false);
6005 uint32_t low_cluster_count = ml_get_cluster_number_type(low_core_type);
6006 return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? low_cluster_count : low_cpu_count;
6007 default:;
6008 uint32_t total_cpus = ml_get_cpu_count();
6009 uint32_t total_clusters = ml_get_cluster_count();
6010 return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? total_clusters : total_cpus;
6011 }
6012 }
6013
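/*
 * Example for sched_edge_qos_max_parallelism() on a hypothetical system with
 * one 4-core P-cluster and one 4-core E-cluster:
 *
 *   QOS_PARALLELISM_REALTIME                  -> 4 (P-cores only)
 *   THREAD_QOS_BACKGROUND / MAINTENANCE       -> 4 (E-cores only)
 *   any other QoS (default case)              -> 8 (all CPUs)
 *
 * With QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE set, the corresponding cluster
 * counts (1, 1 and 2 respectively) are returned instead.
 */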
6014
6015 #endif /* CONFIG_SCHED_EDGE */
6016
6017 #endif /* CONFIG_SCHED_CLUTCH */
6018