/*
 * Copyright (c) 2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#if !SCHED_TEST_HARNESS

#include <mach/mach_types.h>
#include <mach/machine.h>
#include <machine/machine_routines.h>
#include <machine/sched_param.h>
#include <machine/machine_cpu.h>
#include <kern/kern_types.h>
#include <kern/debug.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/sched.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/sched_clutch.h>
#include <machine/atomic.h>
#include <sys/kdebug.h>

#endif /* !SCHED_TEST_HARNESS */

#include <kern/sched_prim.h>

#if CONFIG_SCHED_EDGE
#include <kern/sched_amp_common.h>
#endif /* CONFIG_SCHED_EDGE */

#if CONFIG_SCHED_CLUTCH

#define SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION 1
typedef union {
	struct __attribute__((packed)) {
		unsigned int version : 4;
		unsigned int traverse_mode : 3;
		unsigned int cluster_id : 6;
		unsigned int selection_was_edf : 1;
		unsigned int selection_was_cluster_bound : 1;
		unsigned int selection_opened_starvation_avoidance_window : 1;
		unsigned int selection_opened_warp_window : 1;
		unsigned int starvation_avoidance_window_close : 12;
		unsigned int warp_window_close : 12;
		unsigned int reserved : 23; /* For future usage */
	} trace_data;
	uint64_t scdts_trace_data_packed;
} sched_clutch_dbg_thread_select_packed_t;

static_assert(TH_BUCKET_SCHED_MAX == 6, "Ensure layout of sched_clutch_dbg_thread_select_packed can fit root bucket bitmasks");
static_assert(sizeof(sched_clutch_dbg_thread_select_packed_t) <= sizeof(uint64_t), "Ensure sched_clutch_dbg_thread_select_packed_t can fit in one tracepoint argument");
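
/*
 * Illustrative sketch (not part of the implementation): the union above lets
 * the thread-selection debug state be emitted as a single 64-bit tracepoint
 * argument. A hypothetical caller could fill the bitfields and then read the
 * packed word, e.g.:
 *
 *	sched_clutch_dbg_thread_select_packed_t dbg = {0};
 *	dbg.trace_data.version = SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION;
 *	dbg.trace_data.traverse_mode = SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT;
 *	dbg.trace_data.selection_was_edf = 1;
 *	uint64_t packed_arg = dbg.scdts_trace_data_packed;
 */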

/* Forward declarations of static routines */

/* Root level hierarchy management */
static void sched_clutch_root_init(sched_clutch_root_t, processor_set_t);
static void sched_clutch_root_bucket_init(sched_clutch_root_bucket_t, sched_bucket_t, bool);
static void sched_clutch_root_pri_update(sched_clutch_root_t);
static void sched_clutch_root_urgency_inc(sched_clutch_root_t, thread_t);
static void sched_clutch_root_urgency_dec(sched_clutch_root_t, thread_t);

__enum_decl(sched_clutch_highest_root_bucket_type_t, uint32_t, {
	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_NONE = 0,
	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY = 1,
	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL = 2,
});
__enum_decl(sched_clutch_traverse_mode_t, uint32_t, {
	SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY = 0,
	SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT = 1,
	SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT = 2,
});
static_assert(SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT < (1 << 3), "Ensure traverse mode can be encoded within 3 bits of sched_clutch_dbg_thread_select_packed_t");
static sched_clutch_root_bucket_t sched_clutch_root_highest_root_bucket(sched_clutch_root_t, uint64_t, sched_clutch_highest_root_bucket_type_t, sched_clutch_root_bucket_t, thread_t, bool *, sched_clutch_traverse_mode_t, sched_clutch_dbg_thread_select_packed_t *);

#if CONFIG_SCHED_EDGE
/* Support for foreign threads on AMP platforms */
static boolean_t sched_clutch_root_foreign_empty(sched_clutch_root_t);
static thread_t sched_clutch_root_highest_foreign_thread_remove(sched_clutch_root_t);
#endif /* CONFIG_SCHED_EDGE */

/* Root bucket level hierarchy management */
static uint64_t sched_clutch_root_bucket_deadline_calculate(sched_clutch_root_bucket_t, uint64_t);
static void sched_clutch_root_bucket_deadline_update(sched_clutch_root_bucket_t, sched_clutch_root_t, uint64_t, bool);
static int sched_clutch_root_highest_runnable_qos(sched_clutch_root_t, sched_clutch_highest_root_bucket_type_t);

/* Options for clutch bucket ordering in the runq */
__options_decl(sched_clutch_bucket_options_t, uint32_t, {
	SCHED_CLUTCH_BUCKET_OPTIONS_NONE = 0x0,
	/* Round robin clutch bucket on thread removal */
	SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR = 0x1,
	/* Insert clutch bucket at head (for thread preemption) */
	SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ = 0x2,
	/* Insert clutch bucket at tail (default) */
	SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ = 0x4,
});
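
/*
 * Usage note (illustrative, hedged): callers that re-enqueue a clutch bucket
 * because its thread was preempted are expected to pass
 * SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ so the bucket keeps its place at the head
 * of its priority level, while ordinary enqueues use the tail (default)
 * behavior; see sched_clutch_bucket_runq_enqueue() below for how these options
 * are interpreted.
 */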

/* Clutch bucket level hierarchy management */
static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
static uint8_t sched_clutch_bucket_pri_calculate(sched_clutch_bucket_t, uint64_t);

/* Clutch bucket group level properties management */
static void sched_clutch_bucket_group_cpu_usage_update(sched_clutch_bucket_group_t, uint64_t);
static void sched_clutch_bucket_group_cpu_adjust(sched_clutch_bucket_group_t, uint8_t);
static void sched_clutch_bucket_group_timeshare_update(sched_clutch_bucket_group_t, sched_clutch_bucket_t, uint64_t);
static uint8_t sched_clutch_bucket_group_pending_ageout(sched_clutch_bucket_group_t, uint64_t);
static uint32_t sched_clutch_bucket_group_run_count_inc(sched_clutch_bucket_group_t);
static uint32_t sched_clutch_bucket_group_run_count_dec(sched_clutch_bucket_group_t);
static uint8_t sched_clutch_bucket_group_interactivity_score_calculate(sched_clutch_bucket_group_t, uint64_t);

/* Clutch timeshare properties updates */
static uint32_t sched_clutch_run_bucket_incr(sched_clutch_t, sched_bucket_t);
static uint32_t sched_clutch_run_bucket_decr(sched_clutch_t, sched_bucket_t);

/* Clutch membership management */
static boolean_t sched_clutch_thread_insert(sched_clutch_root_t, thread_t, integer_t);
static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t, sched_clutch_bucket_options_t);
static thread_t sched_clutch_hierarchy_thread_highest(sched_clutch_root_t, processor_t, thread_t, sched_clutch_traverse_mode_t);

/* Clutch properties updates */
static uint32_t sched_clutch_root_urgency(sched_clutch_root_t);
static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t);
static int sched_clutch_root_priority(sched_clutch_root_t);
static sched_clutch_bucket_t sched_clutch_root_bucket_highest_clutch_bucket(sched_clutch_root_t, sched_clutch_root_bucket_t, processor_t _Nullable processor, thread_t _Nullable prev_thread, bool *_Nullable chose_prev_thread);

/* Clutch thread properties */
static boolean_t sched_thread_sched_pri_promoted(thread_t);
static inline sched_clutch_bucket_t sched_clutch_bucket_for_thread(sched_clutch_root_t, thread_t);
static inline sched_clutch_bucket_group_t sched_clutch_bucket_group_for_thread(thread_t);

/* General utilities */
static inline bool sched_clutch_pri_greater_than_tiebreak(int, int, bool);

#if CONFIG_SCHED_EDGE
/* System based routines */
static bool sched_edge_pset_available(processor_set_t);
static uint32_t sched_edge_thread_bound_cluster_id(thread_t);
static int sched_edge_iterate_clusters_ordered(processor_set_t, uint64_t, int);

/* Global indicating the maximum number of clusters on the current platform */
static int sched_edge_max_clusters = 0;
#endif /* CONFIG_SCHED_EDGE */

/* Helper debugging routines */
static inline void sched_clutch_hierarchy_locked_assert(sched_clutch_root_t);

extern processor_set_t pset_array[MAX_PSETS];

/*
 * Special markers for buckets that have invalid WCELs/quantums, etc.
 */
#define SCHED_CLUTCH_INVALID_TIME_32 ((uint32_t)~0)
#define SCHED_CLUTCH_INVALID_TIME_64 ((uint64_t)~0)

/*
 * Root level bucket WCELs
 *
 * The root level bucket selection algorithm is an Earliest Deadline
 * First (EDF) algorithm where the deadline for each bucket is defined
 * by its worst-case-execution-latency (WCEL) and the make-runnable
 * timestamp for the bucket.
 */
static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = {
	SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
	0,      /* FG */
	37500,  /* IN (37.5ms) */
	75000,  /* DF (75ms) */
	150000, /* UT (150ms) */
	250000  /* BG (250ms) */
};
static uint64_t sched_clutch_root_bucket_wcel[TH_BUCKET_SCHED_MAX] = {0};
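
/*
 * Worked example (illustrative only): if a UT root bucket becomes runnable at
 * time T, its EDF deadline is roughly T + 150ms (its WCEL), while an IN root
 * bucket made runnable at the same time gets T + 37.5ms. Under EDF the IN
 * bucket is therefore selected first, but the UT bucket is guaranteed to be
 * picked no later than ~150ms after becoming runnable (modulo warps and the
 * AboveUI special case described below).
 */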

/*
 * Root level bucket warp
 *
 * Each root level bucket has a warp value associated with it as well.
 * The warp value allows the root bucket to effectively warp ahead of
 * lower priority buckets for a limited time even if it has a later
 * deadline. The warping behavior provides extra (but limited)
 * opportunity for high priority buckets to remain responsive.
 */

/* Special warp deadline value to indicate that the bucket has not used any warp yet */
#define SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED (SCHED_CLUTCH_INVALID_TIME_64)

/* Warp window durations for various tiers */
static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = {
	SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
	8000, /* FG (8ms) */
	4000, /* IN (4ms) */
	2000, /* DF (2ms) */
	1000, /* UT (1ms) */
	0     /* BG (0ms) */
};
static uint64_t sched_clutch_root_bucket_warp[TH_BUCKET_SCHED_MAX] = {0};
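
/*
 * Worked example (illustrative only): an FG root bucket carries up to 8ms of
 * warp. If a BG bucket currently holds the earliest deadline, a runnable FG
 * bucket may still be selected ahead of it, opening a warp window that closes
 * once the 8ms budget is consumed or expires; after that, selection falls
 * back to plain EDF until the FG bucket is next chosen in deadline order,
 * which replenishes its warp.
 */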

/*
 * Thread level quantum
 *
 * The algorithm defines quantums for threads at various buckets. This
 * (combined with the root level bucket quantums) restricts how much
 * the lower priority levels can preempt the higher priority threads.
 */

#if XNU_TARGET_OS_OSX
static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
	10000, /* FIXPRI (10ms) */
	10000, /* FG (10ms) */
	10000, /* IN (10ms) */
	10000, /* DF (10ms) */
	4000,  /* UT (4ms) */
	2000   /* BG (2ms) */
};
#else /* XNU_TARGET_OS_OSX */
static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
	10000, /* FIXPRI (10ms) */
	10000, /* FG (10ms) */
	8000,  /* IN (8ms) */
	6000,  /* DF (6ms) */
	4000,  /* UT (4ms) */
	2000   /* BG (2ms) */
};
#endif /* XNU_TARGET_OS_OSX */

static uint64_t sched_clutch_thread_quantum[TH_BUCKET_SCHED_MAX] = {0};

/*
 * sched_clutch_us_to_abstime()
 *
 * Initializer for converting all durations in usec to abstime
 */
static void
sched_clutch_us_to_abstime(uint32_t *us_vals, uint64_t *abstime_vals)
{
	for (int i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
		if (us_vals[i] == SCHED_CLUTCH_INVALID_TIME_32) {
			abstime_vals[i] = SCHED_CLUTCH_INVALID_TIME_64;
		} else {
			clock_interval_to_absolutetime_interval(us_vals[i],
			    NSEC_PER_USEC, &abstime_vals[i]);
		}
	}
}
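
/*
 * Sketch (hedged, assuming this is how the scheduler init path consumes the
 * tables above): each *_us table is converted once into mach absolute time
 * before the hierarchy is used, e.g.:
 *
 *	sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
 *	sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
 *	sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
 */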

/* Clutch/Edge Scheduler Debugging support */
#define SCHED_CLUTCH_DBG_THR_COUNT_PACK(a, b, c) ((uint64_t)c | ((uint64_t)b << 16) | ((uint64_t)a << 32))

#if DEVELOPMENT || DEBUG

/*
 * sched_clutch_hierarchy_locked_assert()
 *
 * Debugging helper routine. Asserts that the hierarchy is locked. The locking
 * for the hierarchy depends on where the hierarchy is hooked. The current
 * implementation hooks the hierarchy at the pset, so the hierarchy is locked
 * using the pset lock.
 */
static inline void
sched_clutch_hierarchy_locked_assert(
	sched_clutch_root_t root_clutch)
{
	pset_assert_locked(root_clutch->scr_pset);
}

#else /* DEVELOPMENT || DEBUG */

static inline void
sched_clutch_hierarchy_locked_assert(
	__unused sched_clutch_root_t root_clutch)
{
}

#endif /* DEVELOPMENT || DEBUG */

/*
 * sched_clutch_thr_count_inc()
 *
 * Increment thread count at a hierarchy level with overflow checks.
 */
static void
sched_clutch_thr_count_inc(
	uint16_t *thr_count)
{
	if (__improbable(os_inc_overflow(thr_count))) {
		panic("sched_clutch thread count overflowed!");
	}
}

/*
 * sched_clutch_thr_count_dec()
 *
 * Decrement thread count at a hierarchy level with underflow checks.
 */
static void
sched_clutch_thr_count_dec(
	uint16_t *thr_count)
{
	if (__improbable(os_dec_overflow(thr_count))) {
		panic("sched_clutch thread count underflowed!");
	}
}

static sched_bucket_t
sched_convert_pri_to_bucket(uint8_t priority)
{
	sched_bucket_t bucket = TH_BUCKET_RUN;

	if (priority > BASEPRI_USER_INITIATED) {
		bucket = TH_BUCKET_SHARE_FG;
	} else if (priority > BASEPRI_DEFAULT) {
		bucket = TH_BUCKET_SHARE_IN;
	} else if (priority > BASEPRI_UTILITY) {
		bucket = TH_BUCKET_SHARE_DF;
	} else if (priority > MAXPRI_THROTTLE) {
		bucket = TH_BUCKET_SHARE_UT;
	} else {
		bucket = TH_BUCKET_SHARE_BG;
	}
	return bucket;
}
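
/*
 * Worked example (illustrative; assumes the conventional XNU priority bands,
 * e.g. BASEPRI_DEFAULT == 31): a timeshare thread at priority 40 maps to
 * TH_BUCKET_SHARE_FG, one at exactly BASEPRI_USER_INITIATED maps to
 * TH_BUCKET_SHARE_IN (the comparisons are strictly greater-than), and one at
 * MAXPRI_THROTTLE or below maps to TH_BUCKET_SHARE_BG.
 */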

/*
 * sched_clutch_thread_bucket_map()
 *
 * Map a thread to a scheduling bucket for the clutch/edge scheduler
 * based on its scheduling mode and the priority attribute passed in.
 */
static sched_bucket_t
sched_clutch_thread_bucket_map(thread_t thread, int pri)
{
	switch (thread->sched_mode) {
	case TH_MODE_FIXED:
		if (pri >= BASEPRI_FOREGROUND) {
			return TH_BUCKET_FIXPRI;
		} else {
			return sched_convert_pri_to_bucket(pri);
		}

	case TH_MODE_REALTIME:
		return TH_BUCKET_FIXPRI;

	case TH_MODE_TIMESHARE:
		return sched_convert_pri_to_bucket(pri);

	default:
		panic("unexpected mode: %d", thread->sched_mode);
		break;
	}
}

/*
 * The clutch scheduler attempts to age out the CPU usage of clutch bucket groups
 * based on the amount of time they have been pending and the load at that
 * scheduling bucket level. Since the clutch bucket groups are global (i.e. they
 * span multiple clusters), it is important to keep the load as a global counter
 * as well.
 */
static uint32_t _Atomic sched_clutch_global_bucket_load[TH_BUCKET_SCHED_MAX];

/*
 * sched_clutch_root_init()
 *
 * Routine to initialize the scheduler hierarchy root.
 */
static void
sched_clutch_root_init(
	sched_clutch_root_t root_clutch,
	processor_set_t pset)
{
	root_clutch->scr_thr_count = 0;
	root_clutch->scr_priority = NOPRI;
	root_clutch->scr_urgency = 0;
	root_clutch->scr_pset = pset;
#if CONFIG_SCHED_EDGE
	root_clutch->scr_cluster_id = pset->pset_cluster_id;
	for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
		root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type] = 0;
	}
#else /* CONFIG_SCHED_EDGE */
	root_clutch->scr_cluster_id = 0;
#endif /* CONFIG_SCHED_EDGE */

	/* Initialize the queue which maintains all runnable clutch_buckets for timesharing purposes */
	queue_init(&root_clutch->scr_clutch_buckets);

	/* Initialize the priority queue which maintains all runnable foreign clutch buckets */
	priority_queue_init(&root_clutch->scr_foreign_buckets);
	bzero(&root_clutch->scr_cumulative_run_count, sizeof(root_clutch->scr_cumulative_run_count));
	bitmap_zero(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
	bitmap_zero(root_clutch->scr_bound_warp_available, TH_BUCKET_SCHED_MAX);
	priority_queue_init(&root_clutch->scr_bound_root_buckets);

	/* Initialize the bitmap and priority queue of runnable root buckets */
	priority_queue_init(&root_clutch->scr_unbound_root_buckets);
	bitmap_zero(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
	bitmap_zero(root_clutch->scr_unbound_warp_available, TH_BUCKET_SCHED_MAX);

	/* Initialize all the root buckets */
	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
		sched_clutch_root_bucket_init(&root_clutch->scr_unbound_buckets[i], i, false);
		sched_clutch_root_bucket_init(&root_clutch->scr_bound_buckets[i], i, true);
	}
}

/*
 * Clutch Bucket Runqueues
 *
 * The clutch buckets are maintained in a runq at the root bucket level. The
 * runq organization allows clutch buckets to be ordered based on various
 * factors such as:
 *
 * - Clutch buckets are round robin'ed at the same priority level when a
 *   thread is selected from a clutch bucket. This prevents a clutch bucket
 *   from starving out other clutch buckets at the same priority.
 *
 * - Clutch buckets are inserted at the head when they become runnable due to
 *   thread preemption. This allows threads that were preempted to maintain
 *   their order in the queue.
 */
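
/*
 * Structure note (illustrative): sched_clutch_bucket_runq is effectively a
 * fixed array of circular queues (one per priority level) plus a bitmap of
 * occupied levels and a cached highest level, so peeking the highest-priority
 * clutch bucket and the enqueue/dequeue operations are all constant-time; the
 * helpers below implement exactly that.
 */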

/*
 * sched_clutch_bucket_runq_init()
 *
 * Initialize a clutch bucket runq.
 */
static void
sched_clutch_bucket_runq_init(
	sched_clutch_bucket_runq_t clutch_buckets_rq)
{
	clutch_buckets_rq->scbrq_highq = NOPRI;
	for (uint8_t i = 0; i < BITMAP_LEN(NRQS); i++) {
		clutch_buckets_rq->scbrq_bitmap[i] = 0;
	}
	clutch_buckets_rq->scbrq_count = 0;
	for (int i = 0; i < NRQS; i++) {
		circle_queue_init(&clutch_buckets_rq->scbrq_queues[i]);
	}
}

/*
 * sched_clutch_bucket_runq_empty()
 *
 * Returns whether a clutch bucket runq is empty.
 */
static boolean_t
sched_clutch_bucket_runq_empty(
	sched_clutch_bucket_runq_t clutch_buckets_rq)
{
	return clutch_buckets_rq->scbrq_count == 0;
}

/*
 * sched_clutch_bucket_runq_peek()
 *
 * Returns the highest priority clutch bucket in the runq.
 */
static sched_clutch_bucket_t
sched_clutch_bucket_runq_peek(
	sched_clutch_bucket_runq_t clutch_buckets_rq)
{
	if (clutch_buckets_rq->scbrq_count > 0) {
		circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_buckets_rq->scbrq_highq];
		return cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink);
	} else {
		return NULL;
	}
}

/*
 * sched_clutch_bucket_runq_enqueue()
 *
 * Enqueue a clutch bucket into the runq based on the options passed in.
 */
static void
sched_clutch_bucket_runq_enqueue(
	sched_clutch_bucket_runq_t clutch_buckets_rq,
	sched_clutch_bucket_t clutch_bucket,
	sched_clutch_bucket_options_t options)
{
	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
	if (circle_queue_empty(queue)) {
		circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
		bitmap_set(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
		if (clutch_bucket->scb_priority > clutch_buckets_rq->scbrq_highq) {
			clutch_buckets_rq->scbrq_highq = clutch_bucket->scb_priority;
		}
	} else {
		if (options & SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ) {
			circle_enqueue_head(queue, &clutch_bucket->scb_runqlink);
		} else {
			/*
			 * Default behavior (handles SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ &
			 * SCHED_CLUTCH_BUCKET_OPTIONS_NONE)
			 */
			circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
		}
	}
	clutch_buckets_rq->scbrq_count++;
}

/*
 * sched_clutch_bucket_runq_remove()
 *
 * Remove a clutch bucket from the runq.
 */
static void
sched_clutch_bucket_runq_remove(
	sched_clutch_bucket_runq_t clutch_buckets_rq,
	sched_clutch_bucket_t clutch_bucket)
{
	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
	circle_dequeue(queue, &clutch_bucket->scb_runqlink);
	assert(clutch_buckets_rq->scbrq_count > 0);
	clutch_buckets_rq->scbrq_count--;
	if (circle_queue_empty(queue)) {
		bitmap_clear(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
		clutch_buckets_rq->scbrq_highq = bitmap_first(clutch_buckets_rq->scbrq_bitmap, NRQS);
	}
}

static void
sched_clutch_bucket_runq_rotate(
	sched_clutch_bucket_runq_t clutch_buckets_rq,
	sched_clutch_bucket_t clutch_bucket)
{
	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
	assert(clutch_bucket == cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink));
	circle_queue_rotate_head_forward(queue);
}

/*
 * sched_clutch_root_bucket_init()
 *
 * Routine to initialize root buckets.
 */
static void
sched_clutch_root_bucket_init(
	sched_clutch_root_bucket_t root_bucket,
	sched_bucket_t bucket,
	bool bound_root_bucket)
{
	root_bucket->scrb_bucket = bucket;
	if (bound_root_bucket) {
		/* For bound root buckets, initialize the bound thread runq. */
		root_bucket->scrb_bound = true;
		run_queue_init(&root_bucket->scrb_bound_thread_runq);
	} else {
		/*
		 * The unbounded root buckets contain a runq of runnable clutch buckets
		 * which then hold the runnable threads.
		 */
		root_bucket->scrb_bound = false;
		sched_clutch_bucket_runq_init(&root_bucket->scrb_clutch_buckets);
	}
	priority_queue_entry_init(&root_bucket->scrb_pqlink);
	root_bucket->scrb_pqlink.deadline = 0;
	root_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
	root_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[root_bucket->scrb_bucket];
	root_bucket->scrb_starvation_avoidance = false;
	root_bucket->scrb_starvation_ts = 0;
}

/*
 * Special case scheduling for Above UI bucket.
 *
 * AboveUI threads are typically system critical threads that need low latency,
 * which is why they are handled specially.
 *
 * Since the priority range for AboveUI and FG Timeshare buckets overlaps, it is
 * important to maintain some native priority order between those buckets. For unbounded
 * root buckets, the policy is to compare the highest clutch buckets of both buckets; if the
 * Above UI bucket is higher, schedule it immediately. Otherwise fall through to the
 * deadline based scheduling which should pick up the timeshare buckets. For the bound
 * case, the policy simply compares the priority of the highest runnable threads in
 * the above UI and timeshare buckets.
 *
 * The implementation allows extremely low latency CPU access for Above UI threads
 * while supporting the use case of high priority timeshare threads contending with
 * lower priority fixed priority threads.
 */
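
/*
 * Example (illustrative only): if the highest runnable AboveUI clutch bucket
 * and the highest runnable FG timeshare clutch bucket end up at the same
 * effective priority, the AboveUI side wins the tie (see the
 * sched_clutch_pri_greater_than_tiebreak() usage below); the timeshare FG
 * bucket only takes precedence here when it is strictly higher, and otherwise
 * contends through the EDF path like the rest of the timeshare buckets.
 */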

/*
 * sched_clutch_root_unbound_select_aboveui()
 *
 * Routine to determine if the above UI unbounded bucket should be selected for execution.
 *
 * Writes the highest unbound (timeshare FG vs. above UI) bucket, its priority, and whether
 * it is an above UI bucket into the pointer parameters.
 */
static void
sched_clutch_root_unbound_select_aboveui(
	sched_clutch_root_t root_clutch,
	sched_clutch_root_bucket_t *highest_bucket,
	int *highest_pri,
	bool *highest_is_aboveui,
	sched_clutch_root_bucket_t _Nullable prev_bucket,
	thread_t _Nullable prev_thread)
{
	/* First determine the highest Clutch bucket */
	sched_clutch_root_bucket_t higher_root_bucket = NULL;
	sched_clutch_bucket_t higher_clutch_bucket = NULL;
	int higher_bucket_sched_pri = -1;
	bool higher_is_aboveui = false;
	/* Consider unbound Above UI */
	if (bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_FIXPRI)) {
		higher_root_bucket = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
		higher_clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, higher_root_bucket, NULL, NULL, NULL);
		higher_bucket_sched_pri = priority_queue_max_sched_pri(&higher_clutch_bucket->scb_clutchpri_prioq);
		higher_is_aboveui = true;
	}
	/* Consider unbound Timeshare FG */
	if (bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SHARE_FG)) {
		sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_unbound_buckets[TH_BUCKET_SHARE_FG];
		sched_clutch_bucket_t clutch_bucket_sharefg = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, root_bucket_sharefg, NULL, NULL, NULL);
		/* Strict greater-than because unbound timeshare FG root bucket loses all priority ties at this level */
		if (higher_root_bucket == NULL || clutch_bucket_sharefg->scb_priority > higher_clutch_bucket->scb_priority) {
			higher_root_bucket = root_bucket_sharefg;
			higher_clutch_bucket = clutch_bucket_sharefg;
			higher_bucket_sched_pri = priority_queue_max_sched_pri(&higher_clutch_bucket->scb_clutchpri_prioq);
			higher_is_aboveui = false;
		}
	}
	/* Consider the previous thread */
	if (prev_thread != NULL) {
		assert(prev_bucket->scrb_bound == false);
		sched_clutch_bucket_group_t prev_clutch_bucket_group = sched_clutch_bucket_group_for_thread(prev_thread);
		int prev_clutch_bucket_pri = prev_thread->sched_pri + (int)(os_atomic_load(&prev_clutch_bucket_group->scbg_interactivity_data.scct_count, relaxed));
		sched_clutch_bucket_t prev_clutch_bucket = sched_clutch_bucket_for_thread(root_clutch, prev_thread);
		bool prev_bucket_should_win_ties = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI && higher_is_aboveui == false;
		if (higher_clutch_bucket == NULL ||
		    sched_clutch_pri_greater_than_tiebreak(prev_clutch_bucket_pri, higher_clutch_bucket->scb_priority, prev_bucket_should_win_ties)) {
			higher_root_bucket = prev_bucket;
			higher_clutch_bucket = prev_clutch_bucket;
			higher_bucket_sched_pri = prev_thread->sched_pri;
			higher_is_aboveui = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI;
		}
	}
	/* Compare highest priority in the highest unbound Clutch bucket to highest priority seen from the bound buckets */
	if (higher_root_bucket != NULL) {
		bool unbound_should_win_ties = higher_is_aboveui == true && *highest_is_aboveui == false;
		if (sched_clutch_pri_greater_than_tiebreak(higher_bucket_sched_pri, *highest_pri, unbound_should_win_ties)) {
			*highest_pri = higher_bucket_sched_pri;
			*highest_bucket = higher_root_bucket;
			*highest_is_aboveui = higher_is_aboveui;
		}
	}
}

/*
 * sched_clutch_root_bound_select_aboveui()
 *
 * Routine to determine if the above UI bounded bucket should be selected for execution.
 *
 * Writes the highest bound (timeshare FG vs. above UI) bucket, its priority, and whether
 * it is an above UI bucket into the pointer parameters.
 */
static void
sched_clutch_root_bound_select_aboveui(
	sched_clutch_root_t root_clutch,
	sched_clutch_root_bucket_t *highest_bucket,
	int *highest_pri,
	bool *highest_is_aboveui,
	sched_clutch_root_bucket_t _Nullable prev_bucket,
	thread_t _Nullable prev_thread)
{
	/* Consider bound Above UI */
	sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
	if (bitmap_test(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_FIXPRI) &&
	    sched_clutch_pri_greater_than_tiebreak(root_bucket_aboveui->scrb_bound_thread_runq.highq, *highest_pri, *highest_is_aboveui == false)) {
		*highest_pri = root_bucket_aboveui->scrb_bound_thread_runq.highq;
		*highest_bucket = root_bucket_aboveui;
		*highest_is_aboveui = true;
	}
	/* Consider bound Timeshare FG */
	sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_bound_buckets[TH_BUCKET_SHARE_FG];
	if (bitmap_test(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SHARE_FG) &&
	    sched_clutch_pri_greater_than_tiebreak(root_bucket_sharefg->scrb_bound_thread_runq.highq, *highest_pri, false)) {
		*highest_pri = root_bucket_sharefg->scrb_bound_thread_runq.highq;
		*highest_bucket = root_bucket_sharefg;
		*highest_is_aboveui = false;
	}
	/* Consider the previous thread */
	if (prev_thread != NULL) {
		assert(prev_bucket->scrb_bound == true);
		bool prev_bucket_should_win_ties = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI && *highest_is_aboveui == false;
		if (sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, *highest_pri, prev_bucket_should_win_ties)) {
			*highest_pri = prev_thread->sched_pri;
			*highest_bucket = prev_bucket;
			*highest_is_aboveui = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI;
		}
	}
}

/*
 * sched_clutch_root_highest_runnable_qos()
 *
 * Returns the index of the highest-QoS root bucket which is currently runnable.
 */
static int
sched_clutch_root_highest_runnable_qos(
	sched_clutch_root_t root_clutch,
	sched_clutch_highest_root_bucket_type_t type)
{
	int highest_unbound_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
		return highest_unbound_bucket;
	}
	assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
	int highest_bound_bucket = bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
	if (highest_bound_bucket == -1) {
		return highest_unbound_bucket;
	}
	if (highest_unbound_bucket == -1) {
		return highest_bound_bucket;
	}
	/* Both bound and unbound buckets are runnable; return the higher QoS (i.e. the lower bucket index) */
	return MIN(highest_bound_bucket, highest_unbound_bucket);
}

/*
 * sched_clutch_root_highest_aboveui_root_bucket()
 *
 * Routine to determine if an above UI root bucket should be selected for execution.
 *
 * Returns the root bucket if we should run an above UI bucket or NULL otherwise.
 */
static sched_clutch_root_bucket_t
sched_clutch_root_highest_aboveui_root_bucket(
	sched_clutch_root_t root_clutch,
	sched_clutch_highest_root_bucket_type_t type,
	sched_clutch_root_bucket_t _Nullable prev_bucket,
	thread_t _Nullable prev_thread,
	bool *chose_prev_thread)
{
	assert((prev_thread == NULL && prev_bucket == NULL) || (prev_thread != NULL && prev_bucket != NULL));
	assert((type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL) || (prev_bucket == NULL));

	sched_clutch_root_bucket_t highest_bucket = NULL;
	int highest_pri = -1;
	bool highest_is_aboveui = false;

	/* Forward previous thread to the correct comparison logic, based on boundness */
	sched_clutch_root_bucket_t bound_prev_bucket = NULL, unbound_prev_bucket = NULL;
	thread_t bound_prev_thread = NULL, unbound_prev_thread = NULL;
	if (prev_thread != NULL) {
		if (prev_bucket->scrb_bound) {
			bound_prev_bucket = prev_bucket;
			bound_prev_thread = prev_thread;
		} else {
			unbound_prev_bucket = prev_bucket;
			unbound_prev_thread = prev_thread;
		}
	}

	/* Consider bound Above UI vs. Timeshare FG first, so those buckets will win ties against the corresponding unbound buckets */
	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL) {
		sched_clutch_root_bound_select_aboveui(root_clutch, &highest_bucket, &highest_pri, &highest_is_aboveui, bound_prev_bucket, bound_prev_thread);
	}

	/* Consider unbound Above UI vs. Timeshare FG */
	sched_clutch_root_unbound_select_aboveui(root_clutch, &highest_bucket, &highest_pri, &highest_is_aboveui, unbound_prev_bucket, unbound_prev_thread);
	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
		return highest_is_aboveui ? highest_bucket : NULL;
	}
	assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);

	/* Determine whether we already know to continue running the previous thread */
	if (prev_thread != NULL &&
	    bitmap_test(highest_bucket->scrb_bound ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap, highest_bucket->scrb_bucket) == false) {
		/* Highest bucket we saw is empty, so the previous thread must have been the highest */
		assert(highest_bucket == prev_bucket);
		*chose_prev_thread = true;
	}

	return highest_is_aboveui ? highest_bucket : NULL;
}

/*
 * sched_clutch_root_highest_root_bucket()
 *
 * Main routine to find the highest runnable root level bucket.
 * This routine is called from performance sensitive contexts, so it is
 * crucial to keep this O(1). The type parameter determines whether
 * the selection logic should look at unbounded threads only (for
 * cross-cluster stealing operations) or both bounded and unbounded
 * threads (for selecting the next thread for execution on the current cluster).
 */
static sched_clutch_root_bucket_t
sched_clutch_root_highest_root_bucket(
	sched_clutch_root_t root_clutch,
	uint64_t timestamp,
	sched_clutch_highest_root_bucket_type_t type,
	sched_clutch_root_bucket_t _Nullable prev_bucket,
	thread_t _Nullable prev_thread,
	bool *chose_prev_thread,
	sched_clutch_traverse_mode_t mode,
	sched_clutch_dbg_thread_select_packed_t *debug_info)
{
	assert((prev_thread == NULL && prev_bucket == NULL) || (prev_thread != NULL && prev_bucket != NULL));
	assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL || (prev_thread == NULL));
	assert(prev_thread == NULL || (mode != SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY));
	sched_clutch_hierarchy_locked_assert(root_clutch);

	int highest_runnable_bucket = sched_clutch_root_highest_runnable_qos(root_clutch, type);
	if (highest_runnable_bucket == -1) {
		/*
		 * The Clutch hierarchy has no runnable threads. We can continue running
		 * whatever was running previously.
		 */
		assert(sched_clutch_root_count(root_clutch) == 0 || type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY);
		*chose_prev_thread = true;
		if (prev_thread != NULL) {
			debug_info->trace_data.selection_was_edf = true;
		}
		return prev_bucket;
	}

	/* Consider Above UI threads, in comparison to Timeshare FG threads */
	sched_clutch_root_bucket_t highest_aboveui_bucket = sched_clutch_root_highest_aboveui_root_bucket(root_clutch, type, prev_bucket, prev_thread, chose_prev_thread);
	if (highest_aboveui_bucket != NULL) {
		debug_info->trace_data.selection_was_edf = true;
		return highest_aboveui_bucket;
	}

	/*
	 * Above UI bucket is not runnable or has a low priority runnable thread; use the
	 * earliest deadline model to schedule threads. The idea is that as the timeshare
	 * buckets use CPU, they will drop their interactivity score/sched priority and
	 * allow the low priority AboveUI buckets to be scheduled.
	 */

	/* Find the earliest deadline bucket */
	sched_clutch_root_bucket_t edf_bucket;
	bool edf_bucket_enqueued_normally;

evaluate_root_buckets:
	edf_bucket = NULL;
	edf_bucket_enqueued_normally = true;

	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
		edf_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
	} else {
		assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
		sched_clutch_root_bucket_t unbound_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
		sched_clutch_root_bucket_t bound_bucket = priority_queue_min(&root_clutch->scr_bound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
		if (bound_bucket && unbound_bucket) {
			/* If bound and unbound root buckets are runnable, select the one with the earlier deadline */
			edf_bucket = (bound_bucket->scrb_pqlink.deadline <= unbound_bucket->scrb_pqlink.deadline) ? bound_bucket : unbound_bucket;
		} else {
			edf_bucket = (bound_bucket) ? bound_bucket : unbound_bucket;
		}
	}
	if (edf_bucket == NULL) {
		/* The timeshare portion of the runqueue is empty */
		assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
		assert(prev_thread != NULL);
		*chose_prev_thread = true;
		if (prev_thread != NULL) {
			debug_info->trace_data.selection_was_edf = true;
		}
		return prev_bucket;
	}
	if (prev_bucket != NULL && prev_bucket->scrb_pqlink.deadline < edf_bucket->scrb_pqlink.deadline) {
		/* The previous thread's root bucket has the earliest deadline and is not currently enqueued */
		edf_bucket = prev_bucket;
		edf_bucket_enqueued_normally = false;
	}

	if (edf_bucket->scrb_starvation_avoidance) {
		/* Check if the EDF bucket is in an expired starvation avoidance window */
		uint64_t starvation_window = sched_clutch_thread_quantum[edf_bucket->scrb_bucket];
		if (timestamp >= (edf_bucket->scrb_starvation_ts + starvation_window)) {
			/* Starvation avoidance window is over; update deadline and re-evaluate EDF */
			edf_bucket->scrb_starvation_avoidance = false;
			edf_bucket->scrb_starvation_ts = 0;
			sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp, edf_bucket_enqueued_normally);
			bit_set(debug_info->trace_data.starvation_avoidance_window_close, edf_bucket->scrb_bound * TH_BUCKET_SCHED_MAX + edf_bucket->scrb_bucket);
			goto evaluate_root_buckets;
		}
	}

	/*
	 * Check if any of the buckets have warp available. The implementation only allows root buckets to warp ahead of
	 * buckets of the same type (i.e. bound/unbound). The reason is that warping only makes sense between root
	 * buckets of the same type, since it is effectively a scheduling advantage over a lower QoS root bucket.
	 */
	bitmap_t *warp_available_bitmap = (edf_bucket->scrb_bound) ? (root_clutch->scr_bound_warp_available) : (root_clutch->scr_unbound_warp_available);
	int warp_bucket_index = bitmap_lsb_first(warp_available_bitmap, TH_BUCKET_SCHED_MAX);

	/* Allow the prev_bucket to use its warp as well */
	bool prev_bucket_warping = (prev_bucket != NULL) && (prev_bucket->scrb_bound == edf_bucket->scrb_bound) &&
	    prev_bucket->scrb_bucket < edf_bucket->scrb_bucket && (prev_bucket->scrb_warp_remaining > 0) &&
	    (warp_bucket_index == -1 || prev_bucket->scrb_bucket < warp_bucket_index);

	bool non_edf_bucket_can_warp = (warp_bucket_index != -1 && warp_bucket_index < edf_bucket->scrb_bucket) || prev_bucket_warping;

	if (non_edf_bucket_can_warp == false) {
		/* No higher buckets have warp left; best choice is the EDF based bucket */
		debug_info->trace_data.selection_was_edf = true;

		bool should_update_edf_starvation_state = edf_bucket == prev_bucket || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT;
		if (edf_bucket->scrb_starvation_avoidance == false && should_update_edf_starvation_state) {
			/* Looks like the EDF bucket is not in starvation avoidance mode; check if it should be */
			if (highest_runnable_bucket < edf_bucket->scrb_bucket || (prev_bucket != NULL && prev_bucket->scrb_bucket < edf_bucket->scrb_bucket)) {
				/*
				 * Since a higher bucket is runnable, it indicates that the EDF bucket should be in starvation avoidance.
				 *
				 * The starvation avoidance window is allocated as a single quantum for the starved bucket, enforced
				 * simultaneously across all CPUs in the cluster. The idea is to grant the starved bucket roughly one
				 * quantum per core, each time the bucket reaches the earliest deadline position. Note that this
				 * cadence is driven by the difference between the starved bucket's and highest-runnable bucket's WCELs.
				 */
				edf_bucket->scrb_starvation_avoidance = true;
				edf_bucket->scrb_starvation_ts = timestamp;
				debug_info->trace_data.selection_opened_starvation_avoidance_window = true;
			} else {
				/* EDF bucket is being selected in the natural order; update deadline and reset warp */
				sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp, edf_bucket_enqueued_normally);
				edf_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[edf_bucket->scrb_bucket];
				edf_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
				if (edf_bucket_enqueued_normally) {
					if (edf_bucket->scrb_bound) {
						bitmap_set(root_clutch->scr_bound_warp_available, edf_bucket->scrb_bucket);
					} else {
						bitmap_set(root_clutch->scr_unbound_warp_available, edf_bucket->scrb_bucket);
					}
				}
			}
		}
		*chose_prev_thread = !edf_bucket_enqueued_normally;
		return edf_bucket;
	}

	/*
	 * Looks like there is a root bucket which is higher in the natural priority
	 * order than edf_bucket and might have some warp remaining.
	 */
	assert(prev_bucket_warping || warp_bucket_index >= 0);
	sched_clutch_root_bucket_t warp_bucket = NULL;
	if (prev_bucket_warping) {
		assert(warp_bucket_index == -1 || prev_bucket->scrb_bucket < warp_bucket_index);
		warp_bucket = prev_bucket;
	} else {
		warp_bucket = (edf_bucket->scrb_bound) ? &root_clutch->scr_bound_buckets[warp_bucket_index] : &root_clutch->scr_unbound_buckets[warp_bucket_index];
	}

	bool warp_is_being_utilized = warp_bucket == prev_bucket || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT;

	if (warp_bucket->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
		if (warp_is_being_utilized) {
			/* Root bucket has not used any of its warp; set a deadline to expire its warp and return it */
			warp_bucket->scrb_warped_deadline = timestamp + warp_bucket->scrb_warp_remaining;
			sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp, !prev_bucket_warping);
			debug_info->trace_data.selection_opened_warp_window = true;
		}
		*chose_prev_thread = prev_bucket_warping;
		debug_info->trace_data.selection_was_edf = false;
		assert(warp_bucket != edf_bucket);
		return warp_bucket;
	}
	if (warp_bucket->scrb_warped_deadline > timestamp) {
		/* Root bucket already has a warp window open with some warp remaining */
		if (warp_is_being_utilized) {
			sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp, !prev_bucket_warping);
		}
		*chose_prev_thread = prev_bucket_warping;
		debug_info->trace_data.selection_was_edf = false;
		return warp_bucket;
	}

	/*
	 * For this bucket, the warp window was opened sometime in the past but has now
	 * expired. Mark the bucket as not available for warp anymore and re-run the
	 * warp bucket selection logic.
	 */
	warp_bucket->scrb_warp_remaining = 0;
	if (!prev_bucket_warping) {
		if (warp_bucket->scrb_bound) {
			bitmap_clear(root_clutch->scr_bound_warp_available, warp_bucket->scrb_bucket);
		} else {
			bitmap_clear(root_clutch->scr_unbound_warp_available, warp_bucket->scrb_bucket);
		}
	}
	bit_set(debug_info->trace_data.warp_window_close, warp_bucket->scrb_bound * TH_BUCKET_SCHED_MAX + warp_bucket->scrb_bucket);
	goto evaluate_root_buckets;
}

/*
 * sched_clutch_root_bucket_deadline_calculate()
 *
 * Calculate the deadline for the bucket based on its WCEL
 */
static uint64_t
sched_clutch_root_bucket_deadline_calculate(
	sched_clutch_root_bucket_t root_bucket,
	uint64_t timestamp)
{
	/* For the fixpri AboveUI bucket, always return 0 so it is treated as having the earliest deadline */
	if (root_bucket->scrb_bucket < TH_BUCKET_SHARE_FG) {
		return 0;
	}

	/* For all timeshare buckets set the deadline as current time + worst-case-execution-latency */
	return timestamp + sched_clutch_root_bucket_wcel[root_bucket->scrb_bucket];
}

/*
 * sched_clutch_root_bucket_deadline_update()
 *
 * Routine to update the deadline of the root bucket when it is selected.
 * Updating the deadline also moves the root_bucket in the EDF priority
 * queue.
 */
static void
sched_clutch_root_bucket_deadline_update(
	sched_clutch_root_bucket_t root_bucket,
	sched_clutch_root_t root_clutch,
	uint64_t timestamp,
	bool bucket_is_enqueued)
{
	if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
		/* The algorithm never uses the deadlines for scheduling TH_BUCKET_FIXPRI bucket */
		return;
	}

	uint64_t old_deadline = root_bucket->scrb_pqlink.deadline;
	uint64_t new_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
	if (__improbable(old_deadline > new_deadline)) {
		panic("old_deadline (%llu) > new_deadline (%llu); root_bucket (%d); timestamp (%llu)", old_deadline, new_deadline, root_bucket->scrb_bucket, timestamp);
	}
	if (old_deadline != new_deadline) {
		root_bucket->scrb_pqlink.deadline = new_deadline;
		if (bucket_is_enqueued) {
			struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
			priority_queue_entry_increased(prioq, &root_bucket->scrb_pqlink);
		}
	}
}

/*
 * sched_clutch_root_bucket_runnable()
 *
 * Routine to insert a newly runnable root bucket into the hierarchy.
 * Also updates the deadline and warp parameters as necessary.
 */
static void
sched_clutch_root_bucket_runnable(
	sched_clutch_root_bucket_t root_bucket,
	sched_clutch_root_t root_clutch,
	uint64_t timestamp)
{
	/* Mark the root bucket as runnable */
	bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
	bitmap_set(runnable_bitmap, root_bucket->scrb_bucket);

	if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
		/* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
		return;
	}

	if (root_bucket->scrb_starvation_avoidance == false) {
		/*
		 * Only update the deadline if the bucket was not in starvation avoidance mode. If the bucket was in
		 * starvation avoidance and its window has expired, the highest root bucket selection logic will notice
		 * that and fix it up.
		 */
		root_bucket->scrb_pqlink.deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
	}
	struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
	priority_queue_insert(prioq, &root_bucket->scrb_pqlink);
	if (root_bucket->scrb_warp_remaining) {
		/* Since the bucket has some warp remaining and is now runnable, mark it as available for warp */
		bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
		bitmap_set(warp_bitmap, root_bucket->scrb_bucket);
	}
}

/*
 * sched_clutch_root_bucket_empty()
 *
 * Routine to remove an empty root bucket from the hierarchy.
 * Also updates the deadline and warp parameters as necessary.
 */
static void
sched_clutch_root_bucket_empty(
	sched_clutch_root_bucket_t root_bucket,
	sched_clutch_root_t root_clutch,
	uint64_t timestamp)
{
	bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
	bitmap_clear(runnable_bitmap, root_bucket->scrb_bucket);

	if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
		/* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
		return;
	}

	struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
	priority_queue_remove(prioq, &root_bucket->scrb_pqlink);

	bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
	bitmap_clear(warp_bitmap, root_bucket->scrb_bucket);

	if (root_bucket->scrb_warped_deadline != SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
		if (root_bucket->scrb_warped_deadline > timestamp) {
			/*
			 * For root buckets that were using the warp, check if the warp
			 * deadline is in the future. If yes, remove the wall time the
			 * warp was active and update the warp remaining. This allows
			 * the root bucket to use the remaining warp the next time it
			 * becomes runnable.
			 */
			root_bucket->scrb_warp_remaining = root_bucket->scrb_warped_deadline - timestamp;
		} else {
			/*
			 * If the root bucket's warped deadline is in the past, it has used up
			 * all the warp it was assigned. Empty out its warp remaining.
			 */
			root_bucket->scrb_warp_remaining = 0;
		}
	}
}
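
/*
 * Worked example (illustrative only): suppose an FG root bucket opened an 8ms
 * warp window at time T (scrb_warped_deadline == T + 8ms) and then went empty
 * at T + 3ms. The code above leaves scrb_warp_remaining == 5ms, so the next
 * time the bucket becomes runnable it can warp for the unused 5ms; had it
 * gone empty after T + 8ms, its remaining warp would instead be zeroed.
 */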
1155
1156 static int
sched_clutch_global_bucket_load_get(sched_bucket_t bucket)1157 sched_clutch_global_bucket_load_get(
1158 sched_bucket_t bucket)
1159 {
1160 return (int)os_atomic_load(&sched_clutch_global_bucket_load[bucket], relaxed);
1161 }
1162
1163 /*
1164 * sched_clutch_root_pri_update()
1165 *
1166 * The root level priority is used for thread selection and preemption
1167 * logic.
1168 *
1169 * The logic uses the same decision as thread selection for deciding between the
1170 * above UI and timeshare buckets. If one of the timesharing buckets have to be
1171 * used for priority calculation, the logic is slightly different from thread
1172 * selection, because thread selection considers deadlines, warps etc. to
1173 * decide the most optimal bucket at a given timestamp. Since the priority
1174 * value is used for preemption decisions only, it needs to be based on the
1175 * highest runnable thread available in the timeshare domain. This logic can
1176 * be made more sophisticated if there are cases of unnecessary preemption
1177 * being seen in workloads.
1178 */
1179 static void
sched_clutch_root_pri_update(sched_clutch_root_t root_clutch)1180 sched_clutch_root_pri_update(
1181 sched_clutch_root_t root_clutch)
1182 {
1183 sched_clutch_hierarchy_locked_assert(root_clutch);
1184 int16_t root_bound_pri = NOPRI;
1185 int16_t root_unbound_pri = NOPRI;
1186
1187 /* Consider bound root buckets */
1188 if (bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
1189 goto root_pri_update_unbound;
1190 }
1191 sched_clutch_root_bucket_t highest_bound_root_bucket = NULL;
1192 __unused int highest_bound_root_bucket_pri = -1;
1193 bool highest_bound_root_bucket_is_fixpri = false;
1194 sched_clutch_root_bound_select_aboveui(root_clutch, &highest_bound_root_bucket, &highest_bound_root_bucket_pri, &highest_bound_root_bucket_is_fixpri, NULL, NULL);
1195 if (highest_bound_root_bucket_is_fixpri == false) {
1196 int root_bucket_index = bitmap_lsb_next(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
1197 assert(root_bucket_index != -1);
1198 highest_bound_root_bucket = &root_clutch->scr_bound_buckets[root_bucket_index];
1199 }
1200 root_bound_pri = highest_bound_root_bucket->scrb_bound_thread_runq.highq;
1201
1202 root_pri_update_unbound:
1203 /* Consider unbound root buckets */
1204 if (bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
1205 goto root_pri_update_complete;
1206 }
1207 sched_clutch_root_bucket_t highest_unbound_root_bucket = NULL;
1208 __unused int highest_unbound_root_bucket_pri = -1;
1209 bool highest_unbound_root_bucket_is_fixpri = false;
1210 sched_clutch_root_unbound_select_aboveui(root_clutch, &highest_unbound_root_bucket, &highest_unbound_root_bucket_pri, &highest_unbound_root_bucket_is_fixpri, NULL, NULL);
1211 if (highest_unbound_root_bucket_is_fixpri == false) {
1212 int root_bucket_index = bitmap_lsb_next(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
1213 assert(root_bucket_index != -1);
1214 highest_unbound_root_bucket = &root_clutch->scr_unbound_buckets[root_bucket_index];
1215 }
1216
1217 /* For the selected root bucket, find the highest priority clutch bucket */
1218 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, highest_unbound_root_bucket, NULL, NULL, NULL);
1219 root_unbound_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
1220
1221 root_pri_update_complete:
1222 root_clutch->scr_priority = MAX(root_bound_pri, root_unbound_pri);
1223 }
1224
1225 /*
1226 * sched_clutch_root_urgency_inc()
1227 *
1228 * Routine to increment the urgency at the root level based on the thread
1229 * priority that is being inserted into the hierarchy. The root urgency
1230 * counter is updated based on the urgency of threads in any of the
1231 * clutch buckets which are part of the hierarchy.
1232 *
1233 * Always called with the pset lock held.
1234 */
1235 static void
1236 sched_clutch_root_urgency_inc(
1237 sched_clutch_root_t root_clutch,
1238 thread_t thread)
1239 {
1240 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
1241 root_clutch->scr_urgency++;
1242 }
1243 }
1244
1245 /*
1246 * sched_clutch_root_urgency_dec()
1247 *
1248 * Routine to decrement the urgency at the root level based on the thread
1249 * priority that is being removed from the hierarchy. The root urgency
1250 * counter is updated based on the urgency of threads in any of the
1251 * clutch buckets which are part of the hierarchy.
1252 *
1253 * Always called with the pset lock held.
1254 */
1255 static void
1256 sched_clutch_root_urgency_dec(
1257 sched_clutch_root_t root_clutch,
1258 thread_t thread)
1259 {
1260 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
1261 root_clutch->scr_urgency--;
1262 }
1263 }
1264
1265 /*
1266 * Clutch bucket level scheduling
1267 *
1268 * The second level of scheduling is the clutch bucket level scheduling
1269 * which tries to schedule thread groups within root_buckets. Each
1270 * clutch represents a thread group and a clutch_bucket_group represents
1271 * threads at a particular sched_bucket within that thread group. The
1272 * clutch_bucket_group contains a clutch_bucket per cluster on the system
1273 * where it holds the runnable threads destined for execution on that
1274 * cluster.
1275 *
1276 * The goal of this level of scheduling is to allow interactive thread
1277 * groups low latency access to the CPU. It also provides slight
1278 * scheduling preference for App and unrestricted thread groups.
1279 *
1280 * The clutch bucket scheduling algorithm measures an interactivity
1281 * score for all clutch bucket groups. The interactivity score is based
1282 * on the ratio of the CPU used and the voluntary blocking of threads
1283 * within the clutch bucket group. The algorithm is very close to the ULE
1284 * scheduler on FreeBSD in terms of calculations. The interactivity
1285 * score provides an interactivity boost in the range of
1286 * [0:SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI * 2] which allows interactive
1287 * thread groups to win over CPU spinners.
1288 *
1289 * The interactivity score of the clutch bucket group is combined with the
1290 * highest base/promoted priority of threads in the clutch bucket to form
1291 * the overall priority of the clutch bucket.
1292 */
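/*
 * Illustrative example of the interactivity score calculation. The numbers
 * below are hypothetical; the formulas are the ones used by
 * sched_clutch_interactivity_from_cpu_data() later in this file, with the
 * default sched_clutch_bucket_group_interactive_pri of 8:
 *
 *   cpu_blocked = 300ms, cpu_used = 100ms (blocked > used, interactive case):
 *       score = 8 + (8 * (300 - 100)) / 300 = 8 + 5 = 13     (maximum is 16)
 *
 *   cpu_blocked = 50ms, cpu_used = 400ms (CPU spinner, non-interactive case):
 *       score = (8 * 50) / 400 = 1
 *
 * A mostly-blocked thread group therefore gets a boost close to twice the
 * interactive pri, while a spinner gets a boost close to 0.
 */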
1293
1294 /* Priority boost range for interactivity */
1295 #define SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT (8)
1296 static uint8_t sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
1297
1298 /* Window to scale the CPU usage and blocked values (currently 500ms); it is the threshold on used + blocked time */
1299 static uint64_t sched_clutch_bucket_group_adjust_threshold = 0;
1300 #define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS (500000)
1301
1302 /* The ratio to scale the cpu/blocked time per window */
1303 #define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO (10)
1304
1305 /* Initial value for voluntary blocking time for the clutch_bucket */
1306 #define SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID (uint64_t)(~0)
1307
1308 /* Value indicating the clutch bucket is not pending execution */
1309 #define SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID ((uint64_t)(~0))
1310
1311 /*
1312 * Thread group CPU starvation avoidance
1313 *
1314 * In heavily CPU contended scenarios, it is possible that some thread groups
1315 * which have a low interactivity score do not get CPU time at all. In order to
1316 * resolve that, the scheduler tries to ageout the CPU usage of the clutch
1317 * bucket group when it has been pending execution for a certain time as defined
1318 * by the sched_clutch_bucket_group_pending_delta_us values below.
1319 *
1320 * The values chosen here are very close to the WCEL values for each sched bucket.
1321 * These values are added to the pending interval used to determine how
1322 * frequently the CPU usage will be aged out, ensuring a reasonable limit on the
1323 * frequency.
1324 */
1325 static uint32_t sched_clutch_bucket_group_pending_delta_us[TH_BUCKET_SCHED_MAX] = {
1326 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
1327 10000, /* FG */
1328 37500, /* IN */
1329 75000, /* DF */
1330 150000, /* UT */
1331 250000, /* BG */
1332 };
1333 static uint64_t sched_clutch_bucket_group_pending_delta[TH_BUCKET_SCHED_MAX] = {0};
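/*
 * Worked example of how these deltas pace the ageout (illustrative only; the
 * actual computation is in sched_clutch_bucket_group_pending_ageout() below).
 * For the FG bucket the delta is 10ms. Assuming, purely for illustration, a
 * 10ms FG thread quantum and a global FG bucket load of 4, the effective
 * interactivity_delta becomes 10ms + (4 * 10ms) = 50ms. A clutch bucket group
 * that has been pending for 120ms would then receive a cpu_usage_shift of
 * 120ms / 50ms = 2, i.e. roughly two points of interactivity boost via the
 * CPU usage ageout.
 */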
1334
1335 /*
1336 * sched_clutch_bucket_init()
1337 *
1338 * Initializer for clutch buckets.
1339 */
1340 static void
1341 sched_clutch_bucket_init(
1342 sched_clutch_bucket_t clutch_bucket,
1343 sched_clutch_bucket_group_t clutch_bucket_group,
1344 sched_bucket_t bucket)
1345 {
1346 clutch_bucket->scb_bucket = bucket;
1347 /* scb_priority will be recalculated when a thread is inserted in the clutch bucket */
1348 clutch_bucket->scb_priority = 0;
1349 #if CONFIG_SCHED_EDGE
1350 clutch_bucket->scb_foreign = false;
1351 priority_queue_entry_init(&clutch_bucket->scb_foreignlink);
1352 #endif /* CONFIG_SCHED_EDGE */
1353 clutch_bucket->scb_group = clutch_bucket_group;
1354 clutch_bucket->scb_root = NULL;
1355 priority_queue_init(&clutch_bucket->scb_clutchpri_prioq);
1356 priority_queue_init(&clutch_bucket->scb_thread_runq);
1357 queue_init(&clutch_bucket->scb_thread_timeshare_queue);
1358 }
1359
1360 /*
1361 * sched_clutch_bucket_group_init()
1362 *
1363 * Initializer for clutch bucket groups.
1364 */
1365 static void
1366 sched_clutch_bucket_group_init(
1367 sched_clutch_bucket_group_t clutch_bucket_group,
1368 sched_clutch_t clutch,
1369 sched_bucket_t bucket)
1370 {
1371 bzero(clutch_bucket_group, sizeof(struct sched_clutch_bucket_group));
1372 clutch_bucket_group->scbg_bucket = bucket;
1373 clutch_bucket_group->scbg_clutch = clutch;
1374
1375 int max_clusters = ml_get_cluster_count();
1376 clutch_bucket_group->scbg_clutch_buckets = kalloc_type(struct sched_clutch_bucket, max_clusters, Z_WAITOK | Z_ZERO);
1377 for (int i = 0; i < max_clusters; i++) {
1378 sched_clutch_bucket_init(&clutch_bucket_group->scbg_clutch_buckets[i], clutch_bucket_group, bucket);
1379 }
1380
1381 os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, 0, relaxed);
1382 os_atomic_store(&clutch_bucket_group->scbg_pri_shift, INT8_MAX, relaxed);
1383 os_atomic_store(&clutch_bucket_group->scbg_preferred_cluster, pset0.pset_cluster_id, relaxed);
1384 /*
1385 * All thread groups should be initialized to be interactive; this allows the newly launched
1386 * thread groups to fairly compete with already running thread groups.
1387 */
1388 clutch_bucket_group->scbg_interactivity_data.scct_count = (sched_clutch_bucket_group_interactive_pri * 2);
1389 clutch_bucket_group->scbg_interactivity_data.scct_timestamp = 0;
1390 os_atomic_store(&clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked, (clutch_cpu_data_t)sched_clutch_bucket_group_adjust_threshold, relaxed);
1391 clutch_bucket_group->scbg_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
1392 clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
1393 clutch_bucket_group->scbg_amp_rebalance_last_chosen = UINT32_MAX;
1394 }
1395
1396 static void
1397 sched_clutch_bucket_group_destroy(
1398 sched_clutch_bucket_group_t clutch_bucket_group)
1399 {
1400 kfree_type(struct sched_clutch_bucket, ml_get_cluster_count(),
1401 clutch_bucket_group->scbg_clutch_buckets);
1402 }
1403
1404 /*
1405 * sched_clutch_init_with_thread_group()
1406 *
1407 * Initialize the sched_clutch when the thread group is being created
1408 */
1409 void
1410 sched_clutch_init_with_thread_group(
1411 sched_clutch_t clutch,
1412 struct thread_group *tg)
1413 {
1414 os_atomic_store(&clutch->sc_thr_count, 0, relaxed);
1415
1416 /* Initialize all the clutch buckets */
1417 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
1418 sched_clutch_bucket_group_init(&(clutch->sc_clutch_groups[i]), clutch, i);
1419 }
1420
1421 /* Grouping specific fields */
1422 clutch->sc_tg = tg;
1423 }
1424
1425 /*
1426 * sched_clutch_destroy()
1427 *
1428 * Destructor for clutch; called from thread group release code.
1429 */
1430 void
1431 sched_clutch_destroy(
1432 sched_clutch_t clutch)
1433 {
1434 assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0);
1435 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
1436 sched_clutch_bucket_group_destroy(&(clutch->sc_clutch_groups[i]));
1437 }
1438 }
1439
1440 #if CONFIG_SCHED_EDGE
1441
1442 /*
1443 * Edge Scheduler Preferred Cluster Mechanism
1444 *
1445 * In order to have better control over various QoS buckets within a thread group, the Edge
1446 * scheduler allows CLPC to specify a preferred cluster for each QoS level in a TG. These
1447 * preferences are stored at the sched_clutch_bucket_group level since that represents all
1448 * threads at a particular QoS level within a sched_clutch. For any lookup of preferred
1449 * cluster, the logic always goes back to the preference stored at the clutch_bucket_group.
1450 */
1451
1452 static uint32_t
1453 sched_edge_clutch_bucket_group_preferred_cluster(sched_clutch_bucket_group_t clutch_bucket_group)
1454 {
1455 return os_atomic_load(&clutch_bucket_group->scbg_preferred_cluster, relaxed);
1456 }
1457
1458 static uint32_t
1459 sched_clutch_bucket_preferred_cluster(sched_clutch_bucket_t clutch_bucket)
1460 {
1461 return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket->scb_group);
1462 }
1463
1464 uint32_t
1465 sched_edge_thread_preferred_cluster(thread_t thread)
1466 {
1467 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1468 /* For threads bound to a specific cluster, return the bound cluster id */
1469 return sched_edge_thread_bound_cluster_id(thread);
1470 }
1471
1472 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1473 sched_bucket_t sched_bucket = thread->th_sched_bucket;
1474 if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
1475 sched_bucket = sched_clutch_thread_bucket_map(thread, thread->base_pri);
1476 }
1477 sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[sched_bucket];
1478 return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket_group);
1479 }
1480
1481 /*
1482 * Edge Scheduler Foreign Bucket Support
1483 *
1484 * In the Edge Scheduler, each cluster maintains a priority queue of clutch buckets containing
1485 * threads that are not native to the cluster. A clutch bucket is considered native if its
1486 * preferred cluster has the same type as the cluster it is enqueued in. The foreign clutch
1487 * bucket priority queue is used for rebalance operations to get threads back to their native
1488 * cluster quickly.
1489 *
1490 * It is possible to make this policy even more aggressive by considering all clusters that
1491 * are not the preferred cluster as the foreign cluster, but that would mean a lot of thread
1492 * migrations which might have performance implications.
1493 */
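/*
 * For example, a clutch bucket whose preferred cluster is a P-cluster but which
 * is enqueued in an E-cluster's root hierarchy is foreign on that cluster; if it
 * were enqueued on a different P-cluster (same core type), it would still be
 * native. (Illustrative note; the actual check is the pset type comparison in
 * sched_clutch_bucket_hierarchy_insert().)
 */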
1494
1495 static void
1496 sched_clutch_bucket_mark_native(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
1497 {
1498 if (clutch_bucket->scb_foreign) {
1499 clutch_bucket->scb_foreign = false;
1500 priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1501 }
1502 }
1503
1504 static void
1505 sched_clutch_bucket_mark_foreign(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
1506 {
1507 if (!clutch_bucket->scb_foreign) {
1508 clutch_bucket->scb_foreign = true;
1509 priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
1510 priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1511 }
1512 }
1513
1514 /*
1515 * Edge Scheduler Cumulative Load Average
1516 *
1517 * The Edge scheduler maintains a per-QoS/scheduling bucket load average for
1518 * making thread migration decisions. The per-bucket load is maintained as a
1519 * cumulative count since higher scheduling buckets impact load on lower buckets
1520 * for thread migration decisions.
1521 *
1522 */
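/*
 * For example, a runnable thread in TH_BUCKET_SHARE_IN increments the cumulative
 * run counts for IN, DF, UT and BG (via the fallthrough below), so the count read
 * for a given bucket reflects all runnable/running load at that QoS level and above.
 */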
1523
1524 static void
1525 sched_edge_cluster_cumulative_count_incr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1526 {
1527 switch (bucket) {
1528 case TH_BUCKET_FIXPRI: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1529 case TH_BUCKET_SHARE_FG: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1530 case TH_BUCKET_SHARE_IN: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1531 case TH_BUCKET_SHARE_DF: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1532 case TH_BUCKET_SHARE_UT: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1533 case TH_BUCKET_SHARE_BG: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1534 default:
1535 panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_incr()");
1536 }
1537 }
1538
1539 static void
1540 sched_edge_cluster_cumulative_count_decr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1541 {
1542 switch (bucket) {
1543 case TH_BUCKET_FIXPRI: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1544 case TH_BUCKET_SHARE_FG: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1545 case TH_BUCKET_SHARE_IN: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1546 case TH_BUCKET_SHARE_DF: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1547 case TH_BUCKET_SHARE_UT: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1548 case TH_BUCKET_SHARE_BG: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1549 default:
1550 panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_decr()");
1551 }
1552 }
1553
1554 uint16_t
1555 sched_edge_cluster_cumulative_count(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1556 {
1557 return os_atomic_load(&root_clutch->scr_cumulative_run_count[bucket], relaxed);
1558 }
1559
1560 #endif /* CONFIG_SCHED_EDGE */
1561
1562 /*
1563 * sched_clutch_bucket_hierarchy_insert()
1564 *
1565 * Routine to insert a newly runnable clutch_bucket into the root hierarchy.
1566 */
1567 static void
1568 sched_clutch_bucket_hierarchy_insert(
1569 sched_clutch_root_t root_clutch,
1570 sched_clutch_bucket_t clutch_bucket,
1571 sched_bucket_t bucket,
1572 uint64_t timestamp,
1573 sched_clutch_bucket_options_t options)
1574 {
1575 sched_clutch_hierarchy_locked_assert(root_clutch);
1576 if (bucket > TH_BUCKET_FIXPRI) {
1577 /* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */
1578 enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink);
1579 }
1580 #if CONFIG_SCHED_EDGE
1581 /* Check if the bucket is a foreign clutch bucket and add it to the foreign buckets list */
1582 uint32_t preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
1583 if (pset_type_for_id(preferred_cluster) != pset_type_for_id(root_clutch->scr_cluster_id)) {
1584 sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
1585 }
1586 #endif /* CONFIG_SCHED_EDGE */
1587 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1588
1589 /* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */
1590 if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1591 sched_clutch_root_bucket_runnable(root_bucket, root_clutch, timestamp);
1592 }
1593
1594 /* Insert the clutch bucket into the root bucket run queue with order based on options */
1595 sched_clutch_bucket_runq_enqueue(&root_bucket->scrb_clutch_buckets, clutch_bucket, options);
1596 os_atomic_store(&clutch_bucket->scb_root, root_clutch, relaxed);
1597 os_atomic_inc(&sched_clutch_global_bucket_load[bucket], relaxed);
1598 }
1599
1600 /*
1601 * sched_clutch_bucket_hierarchy_remove()
1602 *
1603 * Routine to remove an empty clutch bucket from the root hierarchy.
1604 */
1605 static void
1606 sched_clutch_bucket_hierarchy_remove(
1607 sched_clutch_root_t root_clutch,
1608 sched_clutch_bucket_t clutch_bucket,
1609 sched_bucket_t bucket,
1610 uint64_t timestamp,
1611 __unused sched_clutch_bucket_options_t options)
1612 {
1613 sched_clutch_hierarchy_locked_assert(root_clutch);
1614 if (bucket > TH_BUCKET_FIXPRI) {
1615 /* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */
1616 remqueue(&clutch_bucket->scb_listlink);
1617 }
1618 #if CONFIG_SCHED_EDGE
1619 sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
1620 #endif /* CONFIG_SCHED_EDGE */
1621
1622 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1623
1624 /* Remove the clutch bucket from the root bucket priority queue */
1625 sched_clutch_bucket_runq_remove(&root_bucket->scrb_clutch_buckets, clutch_bucket);
1626 os_atomic_store(&clutch_bucket->scb_root, NULL, relaxed);
1627
1628 /* If the root bucket priority queue is now empty, remove it from the root priority queue */
1629 if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1630 sched_clutch_root_bucket_empty(root_bucket, root_clutch, timestamp);
1631 }
1632 os_atomic_dec(&sched_clutch_global_bucket_load[bucket], relaxed);
1633 }
1634
1635 /*
1636 * sched_clutch_bucket_base_pri()
1637 *
1638 * Calculates the "base" priority of the clutch bucket, which is equal to the max of the
1639 * highest base_pri and the highest sched_pri in the clutch bucket.
1640 */
1641 static uint8_t
1642 sched_clutch_bucket_base_pri(
1643 sched_clutch_bucket_t clutch_bucket)
1644 {
1645 assert(priority_queue_empty(&clutch_bucket->scb_thread_runq) == false);
1646 /*
1647 * Since the clutch bucket can contain threads that are members of the group either
1648 * because of a promoted sched_pri or because of their base pri, the base priority of
1649 * the entire clutch bucket is based on the highest priority thread (promoted or base)
1650 * in the clutch bucket.
1651 */
1652 uint8_t max_pri = 0;
1653 if (!priority_queue_empty(&clutch_bucket->scb_clutchpri_prioq)) {
1654 max_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
1655 }
1656 return max_pri;
1657 }
1658
1659 /*
1660 * sched_clutch_interactivity_from_cpu_data()
1661 *
1662 * Routine to calculate the interactivity score of a clutch bucket group from its CPU usage
1663 */
1664 static uint8_t
1665 sched_clutch_interactivity_from_cpu_data(sched_clutch_bucket_group_t clutch_bucket_group)
1666 {
1667 sched_clutch_bucket_cpu_data_t scb_cpu_data;
1668 scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, relaxed);
1669 clutch_cpu_data_t cpu_used = scb_cpu_data.cpu_data.scbcd_cpu_used;
1670 clutch_cpu_data_t cpu_blocked = scb_cpu_data.cpu_data.scbcd_cpu_blocked;
1671 uint8_t interactive_score = 0;
1672
1673 if ((cpu_blocked == 0) && (cpu_used == 0)) {
1674 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
1675 }
1676 /*
1677 * For all timeshare buckets, calculate the interactivity score of the bucket
1678 * and add it to the base priority
1679 */
1680 if (cpu_blocked > cpu_used) {
1681 /* Interactive clutch_bucket case */
1682 interactive_score = sched_clutch_bucket_group_interactive_pri +
1683 ((sched_clutch_bucket_group_interactive_pri * (cpu_blocked - cpu_used)) / cpu_blocked);
1684 } else {
1685 /* Non-interactive clutch_bucket case */
1686 interactive_score = ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) / cpu_used);
1687 }
1688 return interactive_score;
1689 }
1690
1691 /*
1692 * sched_clutch_bucket_pri_calculate()
1693 *
1694 * The priority calculation algorithm for the clutch_bucket is a slight
1695 * modification on the ULE interactivity score. It uses the base priority
1696 * of the clutch bucket and applies an interactivity score boost to the
1697 * highly responsive clutch buckets.
1698 */
1699 static uint8_t
1700 sched_clutch_bucket_pri_calculate(
1701 sched_clutch_bucket_t clutch_bucket,
1702 uint64_t timestamp)
1703 {
1704 /* For empty clutch buckets, return priority 0 */
1705 if (clutch_bucket->scb_thr_count == 0) {
1706 return 0;
1707 }
1708
1709 uint8_t base_pri = sched_clutch_bucket_base_pri(clutch_bucket);
1710 uint8_t interactive_score = sched_clutch_bucket_group_interactivity_score_calculate(clutch_bucket->scb_group, timestamp);
1711
1712 assert(((uint64_t)base_pri + interactive_score) <= UINT8_MAX);
1713 uint8_t pri = base_pri + interactive_score;
1714 if (pri != clutch_bucket->scb_priority) {
1715 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_PRI) | DBG_FUNC_NONE,
1716 thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, pri, interactive_score, 0);
1717 }
1718 return pri;
1719 }
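/*
 * Illustrative example (hypothetical numbers): if the highest base/promoted
 * priority among the threads in the clutch bucket is 31 and the clutch bucket
 * group's interactivity score is 13, the clutch bucket priority becomes
 * 31 + 13 = 44. This is the value used to order clutch buckets within the
 * root bucket runqueue.
 */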
1720
1721 /*
1722 * sched_clutch_root_bucket_highest_clutch_bucket()
1723 *
1724 * Routine to find the highest priority clutch bucket
1725 * within the root bucket.
1726 */
1727 static sched_clutch_bucket_t
1728 sched_clutch_root_bucket_highest_clutch_bucket(
1729 sched_clutch_root_t root_clutch,
1730 sched_clutch_root_bucket_t root_bucket,
1731 processor_t _Nullable processor,
1732 thread_t _Nullable prev_thread,
1733 bool *_Nullable chose_prev_thread)
1734 {
1735 if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1736 if (prev_thread != NULL) {
1737 *chose_prev_thread = true;
1738 return sched_clutch_bucket_for_thread(root_clutch, prev_thread);
1739 }
1740 return NULL;
1741 }
1742 sched_clutch_bucket_t clutch_bucket = sched_clutch_bucket_runq_peek(&root_bucket->scrb_clutch_buckets);
1743 /* Consider the Clutch bucket of the previous thread */
1744 if (prev_thread != NULL) {
1745 assert(chose_prev_thread != NULL);
1746 sched_clutch_bucket_group_t prev_clutch_bucket_group = sched_clutch_bucket_group_for_thread(prev_thread);
1747 int prev_clutch_bucket_pri = prev_thread->sched_pri + (int)(os_atomic_load(&prev_clutch_bucket_group->scbg_interactivity_data.scct_count, relaxed));
1748 sched_clutch_bucket_t prev_clutch_bucket = sched_clutch_bucket_for_thread(root_clutch, prev_thread);
1749 if (prev_clutch_bucket != clutch_bucket &&
1750 sched_clutch_pri_greater_than_tiebreak(prev_clutch_bucket_pri, clutch_bucket->scb_priority, processor->first_timeslice)) {
1751 *chose_prev_thread = true;
1752 return prev_clutch_bucket;
1753 }
1754 }
1755 return clutch_bucket;
1756 }
1757
1758 /*
1759 * sched_clutch_bucket_runnable()
1760 *
1761 * Perform all operations needed when a new clutch bucket becomes runnable.
1762 * It involves inserting the clutch_bucket into the hierarchy and updating the
1763 * root priority appropriately.
1764 */
1765 static boolean_t
1766 sched_clutch_bucket_runnable(
1767 sched_clutch_bucket_t clutch_bucket,
1768 sched_clutch_root_t root_clutch,
1769 uint64_t timestamp,
1770 sched_clutch_bucket_options_t options)
1771 {
1772 sched_clutch_hierarchy_locked_assert(root_clutch);
1773 /* Since the clutch bucket became newly runnable, update its pending timestamp */
1774 clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1775 sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1776
1777 /* Update the timesharing properties of this clutch_bucket; also done every sched_tick */
1778 sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, timestamp);
1779 int16_t root_old_pri = root_clutch->scr_priority;
1780 sched_clutch_root_pri_update(root_clutch);
1781 return root_clutch->scr_priority > root_old_pri;
1782 }
1783
1784 /*
1785 * sched_clutch_bucket_update()
1786 *
1787 * Update the clutch_bucket's position in the hierarchy. This routine is
1788 * called when a new thread is inserted or removed from a runnable clutch
1789 * bucket. The options specify some properties about the clutch bucket
1790 * insertion order into the clutch bucket runq.
1791 */
1792 static boolean_t
1793 sched_clutch_bucket_update(
1794 sched_clutch_bucket_t clutch_bucket,
1795 sched_clutch_root_t root_clutch,
1796 uint64_t timestamp,
1797 sched_clutch_bucket_options_t options)
1798 {
1799 sched_clutch_hierarchy_locked_assert(root_clutch);
1800 uint64_t new_pri = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1801 sched_clutch_bucket_runq_t bucket_runq = &root_clutch->scr_unbound_buckets[clutch_bucket->scb_bucket].scrb_clutch_buckets;
1802 if (new_pri == clutch_bucket->scb_priority) {
1803 /*
1804 * If SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR is specified, move the clutch bucket
1805 * to the end of the runq. Typically used when a thread is selected for execution
1806 * from a clutch bucket.
1807 */
1808 if (options & SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR) {
1809 sched_clutch_bucket_runq_rotate(bucket_runq, clutch_bucket);
1810 }
1811 return false;
1812 }
1813 sched_clutch_bucket_runq_remove(bucket_runq, clutch_bucket);
1814 #if CONFIG_SCHED_EDGE
1815 if (clutch_bucket->scb_foreign) {
1816 priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1817 }
1818 #endif /* CONFIG_SCHED_EDGE */
1819 clutch_bucket->scb_priority = new_pri;
1820 #if CONFIG_SCHED_EDGE
1821 if (clutch_bucket->scb_foreign) {
1822 priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
1823 priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1824 }
1825 #endif /* CONFIG_SCHED_EDGE */
1826 sched_clutch_bucket_runq_enqueue(bucket_runq, clutch_bucket, options);
1827
1828 int16_t root_old_pri = root_clutch->scr_priority;
1829 sched_clutch_root_pri_update(root_clutch);
1830 return root_clutch->scr_priority > root_old_pri;
1831 }
1832
1833 /*
1834 * sched_clutch_bucket_empty()
1835 *
1836 * Perform all the operations needed when a clutch_bucket is no longer runnable.
1837 * It involves removing the clutch bucket from the hierarchy and updating the root
1838 * priority appropriately.
1839 */
1840 static void
1841 sched_clutch_bucket_empty(
1842 sched_clutch_bucket_t clutch_bucket,
1843 sched_clutch_root_t root_clutch,
1844 uint64_t timestamp,
1845 sched_clutch_bucket_options_t options)
1846 {
1847 sched_clutch_hierarchy_locked_assert(root_clutch);
1848 sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1849 clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1850 sched_clutch_root_pri_update(root_clutch);
1851 }
1852
1853 /*
1854 * sched_clutch_cpu_usage_update()
1855 *
1856 * Routine to update CPU usage of the thread in the hierarchy.
1857 */
1858 void
1859 sched_clutch_cpu_usage_update(
1860 thread_t thread,
1861 uint64_t delta)
1862 {
1863 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread) || SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1864 return;
1865 }
1866
1867 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1868 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
1869 sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, delta);
1870 }
1871
1872 /*
1873 * sched_clutch_bucket_group_cpu_usage_update()
1874 *
1875 * Routine to update the CPU usage of the clutch_bucket.
1876 */
1877 static void
1878 sched_clutch_bucket_group_cpu_usage_update(
1879 sched_clutch_bucket_group_t clutch_bucket_group,
1880 uint64_t delta)
1881 {
1882 if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
1883 /* The Above UI bucket always has the maximum interactivity score, so there is nothing to do here */
1884 return;
1885 }
1886 delta = MIN(delta, sched_clutch_bucket_group_adjust_threshold);
1887 os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_used), (clutch_cpu_data_t)delta, relaxed);
1888 }
1889
1890 /*
1891 * sched_clutch_bucket_group_cpu_pending_adjust()
1892 *
1893 * Routine to calculate the adjusted CPU usage value based on the pending intervals. The calculation is done
1894 * such that one "pending interval" provides one point improvement in interactivity score.
1895 */
1896 static inline uint64_t
1897 sched_clutch_bucket_group_cpu_pending_adjust(
1898 uint64_t cpu_used,
1899 uint64_t cpu_blocked,
1900 uint8_t pending_intervals)
1901 {
1902 uint64_t cpu_used_adjusted = 0;
1903 if (cpu_blocked < cpu_used) {
1904 cpu_used_adjusted = (sched_clutch_bucket_group_interactive_pri * cpu_blocked * cpu_used);
1905 cpu_used_adjusted = cpu_used_adjusted / ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) + (cpu_used * pending_intervals));
1906 } else {
1907 uint64_t adjust_factor = (cpu_blocked * pending_intervals) / sched_clutch_bucket_group_interactive_pri;
1908 cpu_used_adjusted = (adjust_factor > cpu_used) ? 0 : (cpu_used - adjust_factor);
1909 }
1910 return cpu_used_adjusted;
1911 }
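/*
 * Illustrative check of the "one pending interval == one point" property
 * (hypothetical numbers, interactive_pri = 8): with cpu_used = 200 and
 * cpu_blocked = 400 the interactivity score is 8 + (8 * (400 - 200)) / 400 = 12.
 * One pending interval takes the cpu_blocked >= cpu_used branch above:
 * adjust_factor = (400 * 1) / 8 = 50, so cpu_used becomes 150 and the score
 * becomes 8 + (8 * 250) / 400 = 13, i.e. exactly one point higher.
 */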
1912
1913 /*
1914 * sched_clutch_bucket_group_cpu_adjust()
1915 *
1916 * Routine to scale the cpu usage and blocked time once the sum gets bigger
1917 * than sched_clutch_bucket_group_adjust_threshold. Allows the values to remain
1918 * manageable and maintain the same ratio while allowing clutch buckets to
1919 * adjust their behavior and have it reflected in the interactivity score in a
1920 * reasonable amount of time. Also adjusts the CPU usage based on pending_intervals,
1921 * which allows aging out CPU usage to avoid starvation in highly contended scenarios.
1922 */
1923 static void
1924 sched_clutch_bucket_group_cpu_adjust(
1925 sched_clutch_bucket_group_t clutch_bucket_group,
1926 uint8_t pending_intervals)
1927 {
1928 sched_clutch_bucket_cpu_data_t old_cpu_data = {};
1929 sched_clutch_bucket_cpu_data_t new_cpu_data = {};
1930 os_atomic_rmw_loop(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, old_cpu_data.scbcd_cpu_data_packed, new_cpu_data.scbcd_cpu_data_packed, relaxed, {
1931 clutch_cpu_data_t cpu_used = old_cpu_data.cpu_data.scbcd_cpu_used;
1932 clutch_cpu_data_t cpu_blocked = old_cpu_data.cpu_data.scbcd_cpu_blocked;
1933
1934 if ((pending_intervals == 0) && (cpu_used + cpu_blocked) < sched_clutch_bucket_group_adjust_threshold) {
1935 /* No changes to the CPU used and blocked values */
1936 os_atomic_rmw_loop_give_up();
1937 }
1938 if ((cpu_used + cpu_blocked) >= sched_clutch_bucket_group_adjust_threshold) {
1939 /* Only keep the recent CPU history to better indicate how this TG has been behaving */
1940 cpu_used = cpu_used / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
1941 cpu_blocked = cpu_blocked / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
1942 }
1943 /* Use the shift passed in to ageout the CPU usage */
1944 cpu_used = (clutch_cpu_data_t)sched_clutch_bucket_group_cpu_pending_adjust(cpu_used, cpu_blocked, pending_intervals);
1945 new_cpu_data.cpu_data.scbcd_cpu_used = cpu_used;
1946 new_cpu_data.cpu_data.scbcd_cpu_blocked = cpu_blocked;
1947 });
1948 }
1949
1950 /*
1951 * Thread level scheduling algorithm
1952 *
1953 * The thread level scheduling algorithm uses the mach timeshare
1954 * decay based algorithm to achieve sharing between threads within the
1955 * same clutch bucket. The load/priority shifts etc. are all maintained
1956 * at the clutch bucket level and used for decay calculation of the
1957 * threads. The load sampling is still driven off the scheduler tick
1958 * for runnable clutch buckets (it does not use the new higher frequency
1959 * EWMA based load calculation). The idea is that the contention and load
1960 * within clutch_buckets should be limited enough to not see heavy decay
1961 * and timeshare effectively.
1962 */
1963
1964 /*
1965 * sched_clutch_thread_run_bucket_incr() / sched_clutch_run_bucket_incr()
1966 *
1967 * Increment the run count for the clutch bucket associated with the
1968 * thread.
1969 */
1970 uint32_t
1971 sched_clutch_thread_run_bucket_incr(
1972 thread_t thread,
1973 sched_bucket_t bucket)
1974 {
1975 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1976 return 0;
1977 }
1978 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1979 return sched_clutch_run_bucket_incr(clutch, bucket);
1980 }
1981
1982 static uint32_t
1983 sched_clutch_run_bucket_incr(
1984 sched_clutch_t clutch,
1985 sched_bucket_t bucket)
1986 {
1987 assert(bucket != TH_BUCKET_RUN);
1988 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
1989 return sched_clutch_bucket_group_run_count_inc(clutch_bucket_group);
1990 }
1991
1992 /*
1993 * sched_clutch_thread_run_bucket_decr() / sched_clutch_run_bucket_decr()
1994 *
1995 * Decrement the run count for the clutch bucket associated with the
1996 * thread.
1997 */
1998 uint32_t
1999 sched_clutch_thread_run_bucket_decr(
2000 thread_t thread,
2001 sched_bucket_t bucket)
2002 {
2003 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2004 return 0;
2005 }
2006 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2007 return sched_clutch_run_bucket_decr(clutch, bucket);
2008 }
2009
2010 static uint32_t
2011 sched_clutch_run_bucket_decr(
2012 sched_clutch_t clutch,
2013 sched_bucket_t bucket)
2014 {
2015 assert(bucket != TH_BUCKET_RUN);
2016 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
2017 return sched_clutch_bucket_group_run_count_dec(clutch_bucket_group);
2018 }
2019
2020 /*
2021 * sched_clutch_bucket_group_timeshare_update()
2022 *
2023 * Routine to update the load and priority shift for the clutch_bucket_group
2024 * every sched_tick. For multi-cluster platforms, each QoS level will have multiple
2025 * clutch buckets with runnable threads in them. So it is important to maintain
2026 * the timesharing information at the clutch_bucket_group level instead of
2027 * individual clutch buckets (because the algorithm is trying to timeshare all
2028 * threads at the same QoS irrespective of which hierarchy they are enqueued in).
2029 *
2030 * The routine is called from the sched tick handling code to make sure this value
2031 * is updated at least once every sched tick. For clutch bucket groups which have
2032 * not been runnable for very long, the clutch_bucket_group maintains a "last
2033 * updated schedtick" parameter. As threads become runnable in the clutch bucket group,
2034 * if this value is outdated, the load and shifts are updated.
2035 *
2036 * Possible optimization:
2037 * - The current algorithm samples the load every sched tick (125ms).
2038 * This is prone to spikes in runnable counts; if that turns out to be
2039 * a problem, a simple solution would be to do the EWMA trick to sample
2040 * load at every load_tick (30ms) and use the averaged value for the pri
2041 * shift calculation.
2042 */
2043 static void
2044 sched_clutch_bucket_group_timeshare_update(
2045 sched_clutch_bucket_group_t clutch_bucket_group,
2046 sched_clutch_bucket_t clutch_bucket,
2047 uint64_t ctime)
2048 {
2049 if (clutch_bucket_group->scbg_bucket < TH_BUCKET_SHARE_FG) {
2050 /* No timesharing needed for fixed priority Above UI threads */
2051 return;
2052 }
2053
2054 /*
2055 * Update the timeshare parameters for the clutch bucket group
2056 * if they haven't been updated in this tick.
2057 */
2058 uint32_t sched_ts = os_atomic_load(&clutch_bucket_group->scbg_timeshare_tick, relaxed);
2059 uint32_t current_sched_ts = sched_tick;
2060 if (sched_ts < current_sched_ts) {
2061 os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, current_sched_ts, relaxed);
2062 /* NCPU wide workloads should not experience decay */
2063 uint64_t bucket_group_run_count = os_atomic_load_wide(&clutch_bucket_group->scbg_blocked_data.scct_count, relaxed) - 1;
2064 uint32_t bucket_group_load = (uint32_t)(bucket_group_run_count / processor_avail_count);
2065 bucket_group_load = MIN(bucket_group_load, NRQS - 1);
2066 uint32_t pri_shift = sched_fixed_shift - sched_load_shifts[bucket_group_load];
2067 /* Ensure that the pri_shift value is reasonable */
2068 pri_shift = (pri_shift > SCHED_PRI_SHIFT_MAX) ? INT8_MAX : pri_shift;
2069 os_atomic_store(&clutch_bucket_group->scbg_pri_shift, pri_shift, relaxed);
2070 }
2071
2072 /*
2073 * Update the clutch bucket priority; this allows clutch buckets that have been pending
2074 * for a long time to get an updated interactivity score.
2075 */
2076 sched_clutch_bucket_update(clutch_bucket, clutch_bucket->scb_root, ctime, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
2077 }
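/*
 * Example of the load/shift math above (hypothetical numbers): with a bucket
 * group run count of 17 and 8 available CPUs, bucket_group_load = (17 - 1) / 8 = 2
 * and pri_shift = sched_fixed_shift - sched_load_shifts[2]. A higher load yields
 * a smaller pri_shift, which lets accumulated sched_usage depress thread priority
 * faster, i.e. CPU-heavy threads decay more aggressively when the QoS level is loaded.
 */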
2078
2079 /*
2080 * sched_clutch_thread_clutch_update()
2081 *
2082 * Routine called when the thread changes its thread group. The current
2083 * implementation relies on the fact that the thread group is changed only from
2084 * the context of the thread itself or when the thread is runnable but not in a
2085 * runqueue. Due to this fact, the thread group change causes only counter
2086 * updates in the old & new clutch buckets and no hierarchy changes. The routine
2087 * also attributes the CPU used so far to the old clutch.
2088 */
2089 void
2090 sched_clutch_thread_clutch_update(
2091 thread_t thread,
2092 sched_clutch_t old_clutch,
2093 sched_clutch_t new_clutch)
2094 {
2095 uint32_t cpu_delta;
2096
2097 if (old_clutch) {
2098 assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
2099
2100 sched_clutch_run_bucket_decr(old_clutch, thread->th_sched_bucket);
2101 /*
2102 * Calculate the CPU used by this thread in the old bucket and
2103 * add it to the old clutch bucket. This uses the same CPU usage
2104 * logic as update_priority etc.
2105 */
2106 sched_tick_delta(thread, cpu_delta);
2107 if (thread->pri_shift < INT8_MAX) {
2108 thread->sched_usage += cpu_delta;
2109 }
2110 thread->cpu_delta += cpu_delta;
2111 if (!SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
2112 sched_clutch_bucket_group_t clutch_bucket_group = &(old_clutch->sc_clutch_groups[thread->th_sched_bucket]);
2113 sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, cpu_delta);
2114 }
2115 }
2116
2117 if (new_clutch) {
2118 sched_clutch_run_bucket_incr(new_clutch, thread->th_sched_bucket);
2119 }
2120 }
2121
2122 /* Thread Insertion/Removal/Selection routines */
2123
2124 #if CONFIG_SCHED_EDGE
2125
2126 /*
2127 * Edge Scheduler Bound Thread Support
2128 *
2129 * The edge scheduler allows threads to be bound to specific clusters. The scheduler
2130 * maintains a separate runq on the clutch root to hold these bound threads. These
2131 * bound threads count towards the root priority and thread count, but are ignored
2132 * for thread migration/steal decisions. Bound threads that are enqueued in the
2133 * separate runq have the th_bound_cluster_enqueued flag set to allow easy
2134 * removal.
2135 *
2136 * Bound Threads Timesharing
2137 * The bound threads share the timesharing properties of the clutch bucket group they are
2138 * part of. They contribute to the load and use priority shifts/decay values from the
2139 * clutch bucket group.
2140 */
2141
2142 static boolean_t
2143 sched_edge_bound_thread_insert(
2144 sched_clutch_root_t root_clutch,
2145 thread_t thread,
2146 integer_t options)
2147 {
2148 /* Update the clutch runnable count and priority */
2149 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2150 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
2151 if (root_bucket->scrb_bound_thread_runq.count == 0) {
2152 sched_clutch_root_bucket_runnable(root_bucket, root_clutch, mach_absolute_time());
2153 }
2154
2155 assert((thread->th_bound_cluster_enqueued) == false);
2156 run_queue_enqueue(&root_bucket->scrb_bound_thread_runq, thread, options);
2157 thread->th_bound_cluster_enqueued = true;
2158
2159 /* Increment the urgency counter for the root if necessary */
2160 sched_clutch_root_urgency_inc(root_clutch, thread);
2161
2162 int16_t root_old_pri = root_clutch->scr_priority;
2163 sched_clutch_root_pri_update(root_clutch);
2164 return root_clutch->scr_priority > root_old_pri;
2165 }
2166
2167 static void
2168 sched_edge_bound_thread_remove(
2169 sched_clutch_root_t root_clutch,
2170 thread_t thread)
2171 {
2172 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
2173 assert((thread->th_bound_cluster_enqueued) == true);
2174 run_queue_remove(&root_bucket->scrb_bound_thread_runq, thread);
2175 thread->th_bound_cluster_enqueued = false;
2176
2177 /* Decrement the urgency counter for the root if necessary */
2178 sched_clutch_root_urgency_dec(root_clutch, thread);
2179
2180 /* Update the clutch runnable count and priority */
2181 sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
2182 if (root_bucket->scrb_bound_thread_runq.count == 0) {
2183 sched_clutch_root_bucket_empty(root_bucket, root_clutch, mach_absolute_time());
2184 }
2185 sched_clutch_root_pri_update(root_clutch);
2186 }
2187
2188 /*
2189 * Edge Scheduler cluster shared resource threads load balancing
2190 *
2191 * The Edge scheduler attempts to load balance cluster shared resource intensive threads
2192 * across clusters in order to reduce contention on the shared resources. It achieves
2193 * that by maintaining the runnable and running shared resource load on each cluster
2194 * and balancing the load across multiple clusters.
2195 *
2196 * The current implementation for cluster shared resource load balancing looks at
2197 * the per-cluster load at thread runnable time to enqueue the thread in the appropriate
2198 * cluster. The thread is enqueued in the cluster bound runqueue to ensure idle CPUs
2199 * do not steal/rebalance shared resource threads. Some more details for the implementation:
2200 *
2201 * - When threads are tagged as shared resource, they go through the cluster selection logic
2202 * which looks at cluster shared resource loads and picks a cluster accordingly. The thread is
2203 * enqueued in the cluster bound runqueue.
2204 *
2205 * - When the threads start running and call avoid_processor, the load balancing logic will be
2206 * invoked and cause the thread to be sent to a more preferred cluster if one exists and has
2207 * no shared resource load.
2208 *
2209 * - If a CPU in a preferred cluster is going idle and that cluster has no more shared load,
2210 * it will look at running shared resource threads on foreign clusters and actively rebalance them.
2211 *
2212 * - Runnable shared resource threads are not stolen by the preferred cluster CPUs as they
2213 * go idle intentionally.
2214 *
2215 * - One caveat of this design is that if a preferred CPU has already run and finished its shared
2216 * resource thread execution, it will not go out and steal the runnable thread in the non-preferred cluster.
2217 * The rebalancing will happen when the thread actually runs on a non-preferred cluster and one of the
2218 * events listed above happen.
2219 *
2220 * - Also it currently does not consider other properties such as thread priorities and
2221 * qos level thread load in the thread placement decision.
2222 *
2223 * Edge Scheduler cluster shared resource thread scheduling policy
2224 *
2225 * The threads for shared resources can be scheduled using one of the two policies:
2226 *
2227 * EDGE_SHARED_RSRC_SCHED_POLICY_RR
2228 * This policy distributes the threads so that they spread across all available clusters
2229 * irrespective of type. The idea is that this scheduling policy will put a shared resource
2230 * thread on each cluster on the platform before it starts doubling up on clusters.
2231 *
2232 * EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST
2233 * This policy distributes threads so that the threads first fill up all the capacity on
2234 * the preferred cluster and its homogeneous peers before spilling to different core type.
2235 * The current implementation defines capacity based on the number of CPUs in the cluster;
2236 * so a cluster's shared resource is considered full if there are "n" runnable + running
2237 * shared resource threads on the cluster with n cpus. This policy is different from the
2238 * default scheduling policy of the edge scheduler since this always tries to fill up the
2239 * native clusters to capacity even when non-native clusters might be idle.
2240 */
2241 __options_decl(edge_shared_rsrc_sched_policy_t, uint32_t, {
2242 EDGE_SHARED_RSRC_SCHED_POLICY_RR = 0,
2243 EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST = 1,
2244 });
2245
2246 const edge_shared_rsrc_sched_policy_t edge_shared_rsrc_policy[CLUSTER_SHARED_RSRC_TYPE_COUNT] = {
2247 [CLUSTER_SHARED_RSRC_TYPE_RR] = EDGE_SHARED_RSRC_SCHED_POLICY_RR,
2248 [CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST,
2249 };
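/*
 * Illustrative comparison of the two policies (hypothetical topology: a 4-CPU
 * preferred P-cluster and a 2-CPU E-cluster, 5 runnable shared resource threads):
 * EDGE_SHARED_RSRC_SCHED_POLICY_RR spreads the threads so that each cluster gets
 * one before any cluster gets a second, ending up roughly 3/2 across the clusters.
 * EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST fills the 4 CPUs of the preferred
 * cluster first, and only the fifth thread spills to the E-cluster.
 */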
2250
2251 static void
2252 sched_edge_shared_rsrc_runnable_load_incr(sched_clutch_root_t root_clutch, thread_t thread)
2253 {
2254 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
2255 root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_RR]++;
2256 thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_RR] = true;
2257 }
2258 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
2259 root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST]++;
2260 thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = true;
2261 }
2262 }
2263
2264 static void
2265 sched_edge_shared_rsrc_runnable_load_decr(sched_clutch_root_t root_clutch, thread_t thread)
2266 {
2267 for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
2268 if (thread->th_shared_rsrc_enqueued[shared_rsrc_type]) {
2269 thread->th_shared_rsrc_enqueued[shared_rsrc_type] = false;
2270 root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type]--;
2271 }
2272 }
2273 }
2274
2275 uint16_t
2276 sched_edge_shared_rsrc_runnable_load(sched_clutch_root_t root_clutch, cluster_shared_rsrc_type_t shared_rsrc_type)
2277 {
2278 return root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type];
2279 }
2280
2281 /*
2282 * sched_edge_shared_rsrc_idle()
2283 *
2284 * Routine used to determine if the constrained resource for the pset is idle. This is
2285 * used by a CPU going idle to decide if it should rebalance a running shared resource
2286 * thread from a non-preferred cluster.
2287 */
2288 static boolean_t
2289 sched_edge_shared_rsrc_idle(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
2290 {
2291 return sched_pset_cluster_shared_rsrc_load(pset, shared_rsrc_type) == 0;
2292 }
2293
2294 /*
2295 * sched_edge_thread_shared_rsrc_type
2296 *
2297 * This routine decides if a given thread needs special handling for being a
2298 * heavy shared resource user. It is valid for the same thread to be using
2299 * several shared resources at the same time and have multiple policy flags set.
2300 * This routine determines which of those properties will be used for load
2301 * balancing and migration decisions.
2302 */
2303 static cluster_shared_rsrc_type_t
2304 sched_edge_thread_shared_rsrc_type(thread_t thread)
2305 {
2306 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
2307 return CLUSTER_SHARED_RSRC_TYPE_RR;
2308 }
2309 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
2310 return CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST;
2311 }
2312 return CLUSTER_SHARED_RSRC_TYPE_NONE;
2313 }
2314
2315 #endif /* CONFIG_SCHED_EDGE */
2316
2317 /*
2318 * sched_clutch_thread_bound_lookup()
2319 *
2320 * Routine to lookup the highest priority runnable thread in a bounded root bucket.
2321 */
2322 static thread_t
2323 sched_clutch_thread_bound_lookup(
2324 __unused sched_clutch_root_t root_clutch,
2325 sched_clutch_root_bucket_t root_bucket,
2326 processor_t processor,
2327 thread_t _Nullable prev_thread)
2328 {
2329 assert(root_bucket->scrb_bound == true);
2330 thread_t bound_thread = run_queue_peek(&root_bucket->scrb_bound_thread_runq);
2331 if ((prev_thread != NULL) &&
2332 (bound_thread == NULL || sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, bound_thread->sched_pri, processor->first_timeslice))) {
2333 return prev_thread;
2334 }
2335 assert(bound_thread != THREAD_NULL);
2336 return bound_thread;
2337 }
2338
2339 /*
2340 * Clutch Bucket Group Thread Counts and Pending time calculation
2341 *
2342 * The pending time on the clutch_bucket_group allows the scheduler to track if it
2343 * needs to ageout the CPU usage because the clutch_bucket_group has been pending for
2344 * a very long time. The pending time is set to the timestamp as soon as a thread becomes
2345 * runnable. When a thread is picked up for execution from this clutch_bucket_group, the
2346 * pending time is advanced to the time of thread selection.
2347 *
2348 * Since threads for a clutch bucket group can be added or removed from multiple CPUs
2349 * simultaneously, it is important that the updates to thread counts and pending timestamps
2350 * happen atomically. The implementation relies on the following aspects to make that work
2351 * as expected:
2352 * - The clutch scheduler would be deployed on single cluster platforms where the pset lock
2353 * is held when threads are added/removed and pending timestamps are updated
2354 * - The thread count and pending timestamp can be updated atomically using double wide
2355 * 128 bit atomics
2356 *
2357 * Clutch bucket group interactivity timestamp and score updates also rely on the properties
2358 * above to atomically update the interactivity score for a clutch bucket group.
2359 */
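/*
 * For example, when the first thread in a clutch bucket group becomes runnable,
 * scct_count goes 0 -> 1 and scct_timestamp is set to that time; subsequent
 * removals advance the timestamp, and when scct_count drops back to 0 the
 * timestamp is reset to SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID. Both fields
 * share one scct_packed word, so on the Edge configuration the update is a
 * single double-wide atomic (see the rmw loops below).
 */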
2360
2361 #if CONFIG_SCHED_EDGE
2362
2363 static void
2364 sched_clutch_bucket_group_thr_count_inc(
2365 sched_clutch_bucket_group_t clutch_bucket_group,
2366 uint64_t timestamp)
2367 {
2368 sched_clutch_counter_time_t old_pending_data;
2369 sched_clutch_counter_time_t new_pending_data;
2370 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2371 new_pending_data.scct_count = old_pending_data.scct_count + 1;
2372 new_pending_data.scct_timestamp = old_pending_data.scct_timestamp;
2373 if (old_pending_data.scct_count == 0) {
2374 new_pending_data.scct_timestamp = timestamp;
2375 }
2376 });
2377 }
2378
2379 static void
2380 sched_clutch_bucket_group_thr_count_dec(
2381 sched_clutch_bucket_group_t clutch_bucket_group,
2382 uint64_t timestamp)
2383 {
2384 sched_clutch_counter_time_t old_pending_data;
2385 sched_clutch_counter_time_t new_pending_data;
2386 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2387 new_pending_data.scct_count = old_pending_data.scct_count - 1;
2388 if (new_pending_data.scct_count == 0) {
2389 new_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2390 } else {
2391 new_pending_data.scct_timestamp = timestamp;
2392 }
2393 });
2394 }
2395
2396 static uint8_t
2397 sched_clutch_bucket_group_pending_ageout(
2398 sched_clutch_bucket_group_t clutch_bucket_group,
2399 uint64_t timestamp)
2400 {
2401 int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2402 sched_clutch_counter_time_t old_pending_data;
2403 sched_clutch_counter_time_t new_pending_data;
2404 uint8_t cpu_usage_shift = 0;
2405
2406 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2407 cpu_usage_shift = 0;
2408 uint64_t old_pending_ts = old_pending_data.scct_timestamp;
2409 bool old_update = (old_pending_ts >= timestamp);
2410 bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2411 bool no_bucket_load = (bucket_load == 0);
2412 if (old_update || no_pending_time || no_bucket_load) {
2413 os_atomic_rmw_loop_give_up();
2414 }
2415
2416 /* Calculate the time the clutch bucket group has been pending */
2417 uint64_t pending_delta = timestamp - old_pending_ts;
2418 /*
2419 * Other buckets should get a chance to run first before artificially boosting
2420 * this clutch bucket group's interactivity score, at least when the entire root
2421 * bucket is getting a large enough share of CPU.
2422 */
2423 uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] + (bucket_load * sched_clutch_thread_quantum[clutch_bucket_group->scbg_bucket]);
2424 if (pending_delta < interactivity_delta) {
2425 os_atomic_rmw_loop_give_up();
2426 }
2427 cpu_usage_shift = (pending_delta / interactivity_delta);
2428 new_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2429 new_pending_data.scct_count = old_pending_data.scct_count;
2430 });
2431 return cpu_usage_shift;
2432 }
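/*
 * Worked example of the ageout arithmetic above (illustrative numbers only):
 * suppose this clutch bucket group has been pending for 70ms of abstime, the
 * per-bucket pending delta tunable is 10ms, bucket_load is 2 and the thread
 * quantum is 10ms. Then interactivity_delta = 10ms + (2 * 10ms) = 30ms,
 * cpu_usage_shift = 70 / 30 = 2 (integer division), and the pending timestamp
 * advances by 2 * 30ms = 60ms, so the remaining 10ms keeps accumulating toward
 * the next ageout interval. The return value (2 here) is the number of ageout
 * intervals the caller uses to adjust the group's CPU stats.
 */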
2433
2434 static boolean_t
2435 sched_edge_thread_should_be_inserted_as_bound(
2436 sched_clutch_root_t root_clutch,
2437 thread_t thread)
2438 {
2439 /*
2440 * Check if the thread is bound and is being enqueued in its desired bound cluster.
2441 * One scenario where a bound thread might not be getting enqueued in the bound cluster
2442 * hierarchy would be if the thread is "soft" bound and the bound cluster is
2443 * de-recommended. In that case, the thread should be treated as an unbound
2444 * thread.
2445 */
2446 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) && (sched_edge_thread_bound_cluster_id(thread) == root_clutch->scr_cluster_id)) {
2447 return TRUE;
2448 }
2449 /*
2450 * Use bound runqueue for shared resource threads. See "cluster shared resource
2451 * threads load balancing" section for details.
2452 */
2453 if (sched_edge_thread_shared_rsrc_type(thread) != CLUSTER_SHARED_RSRC_TYPE_NONE) {
2454 return TRUE;
2455 }
2456 return FALSE;
2457 }
2458
2459 #else /* CONFIG_SCHED_EDGE */
2460
2461 /*
2462 * For the clutch scheduler, atomicity is ensured by making sure all operations
2463 * happen under the pset lock of the only cluster present on the platform.
2464 */
2465 static void
2466 sched_clutch_bucket_group_thr_count_inc(
2467 sched_clutch_bucket_group_t clutch_bucket_group,
2468 uint64_t timestamp)
2469 {
2470 sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2471 if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2472 clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2473 }
2474 clutch_bucket_group->scbg_pending_data.scct_count++;
2475 }
2476
2477 static void
2478 sched_clutch_bucket_group_thr_count_dec(
2479 sched_clutch_bucket_group_t clutch_bucket_group,
2480 uint64_t timestamp)
2481 {
2482 sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2483 clutch_bucket_group->scbg_pending_data.scct_count--;
2484 if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2485 clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2486 } else {
2487 clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2488 }
2489 }
2490
2491 static uint8_t
2492 sched_clutch_bucket_group_pending_ageout(
2493 sched_clutch_bucket_group_t clutch_bucket_group,
2494 uint64_t timestamp)
2495 {
2496 sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2497 int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2498 uint64_t old_pending_ts = clutch_bucket_group->scbg_pending_data.scct_timestamp;
2499 bool old_update = (old_pending_ts >= timestamp);
2500 bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2501 bool no_bucket_load = (bucket_load == 0);
2502 if (old_update || no_pending_time || no_bucket_load) {
2503 return 0;
2504 }
2505 uint64_t pending_delta = timestamp - old_pending_ts;
2506 /*
2507 * Other buckets should get a chance to run first before artificially boosting
2508 * this clutch bucket group's interactivity score, at least when the entire root
2509 * bucket is getting a large enough share of CPU.
2510 */
2511 uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] + (bucket_load * sched_clutch_thread_quantum[clutch_bucket_group->scbg_bucket]);
2512 if (pending_delta < interactivity_delta) {
2513 return 0;
2514 }
2515 uint8_t cpu_usage_shift = (pending_delta / interactivity_delta);
2516 clutch_bucket_group->scbg_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2517 return cpu_usage_shift;
2518 }
2519
2520 #endif /* CONFIG_SCHED_EDGE */
2521
2522 static uint8_t
2523 sched_clutch_bucket_group_interactivity_score_calculate(
2524 sched_clutch_bucket_group_t clutch_bucket_group,
2525 uint64_t timestamp)
2526 {
2527 if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
2528 /*
2529 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
2530 * priorities, make sure all AboveUI buckets are marked interactive.
2531 */
2532 assert(clutch_bucket_group->scbg_interactivity_data.scct_count == (2 * sched_clutch_bucket_group_interactive_pri));
2533 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2534 }
2535 /* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */
2536 uint8_t pending_intervals = sched_clutch_bucket_group_pending_ageout(clutch_bucket_group, timestamp);
2537 /* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */
2538 sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group, pending_intervals);
2539 uint8_t interactivity_score = sched_clutch_interactivity_from_cpu_data(clutch_bucket_group);
2540 /* Write back any interactivity score update */
2541 #if CONFIG_SCHED_EDGE
2542 sched_clutch_counter_time_t old_interactivity_data;
2543 sched_clutch_counter_time_t new_interactivity_data;
2544 os_atomic_rmw_loop(&clutch_bucket_group->scbg_interactivity_data.scct_packed, old_interactivity_data.scct_packed, new_interactivity_data.scct_packed, relaxed, {
2545 new_interactivity_data.scct_count = old_interactivity_data.scct_count;
2546 if (old_interactivity_data.scct_timestamp >= timestamp) {
2547 os_atomic_rmw_loop_give_up();
2548 }
2549 new_interactivity_data.scct_timestamp = timestamp;
2550 if (old_interactivity_data.scct_timestamp != 0) {
2551 new_interactivity_data.scct_count = interactivity_score;
2552 }
2553 });
2554 return (uint8_t)new_interactivity_data.scct_count;
2555 #else /* !CONFIG_SCHED_EDGE */
2556 sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2557 if (timestamp > clutch_bucket_group->scbg_interactivity_data.scct_timestamp) {
2558 clutch_bucket_group->scbg_interactivity_data.scct_count = interactivity_score;
2559 clutch_bucket_group->scbg_interactivity_data.scct_timestamp = timestamp;
2560 }
2561 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2562 #endif /* !CONFIG_SCHED_EDGE */
2563 }
2564
2565 /*
2566 * Clutch Bucket Group Run Count and Blocked Time Accounting
2567 *
2568 * The clutch bucket group maintains the number of runnable/running threads in the group.
2569 * Since the blocked time of the clutch bucket group is based on this count, it is
2570 * important to make sure the blocking timestamp and the run count are updated atomically.
2571 *
2572 * Since the run count increments happen without any pset locks held, the scheduler updates
2573 * the count & timestamp using double wide 128 bit atomics.
2574 */
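/*
 * Example of the accounting below (illustrative): if the last running/runnable
 * thread of a group blocks at time T0 (the run count drops to 0 and T0 is
 * recorded), and a thread in the group becomes runnable again at time T1, the
 * increment path credits MIN(T1 - T0, sched_clutch_bucket_group_adjust_threshold)
 * to the group's blocked-time CPU data, which feeds into the interactivity
 * score calculation.
 */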
2575
2576 static uint32_t
2577 sched_clutch_bucket_group_run_count_inc(
2578 sched_clutch_bucket_group_t clutch_bucket_group)
2579 {
2580 sched_clutch_counter_time_t old_blocked_data;
2581 sched_clutch_counter_time_t new_blocked_data;
2582
2583 bool update_blocked_time = false;
2584 os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2585 new_blocked_data.scct_count = old_blocked_data.scct_count + 1;
2586 new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2587 update_blocked_time = false;
2588 if (old_blocked_data.scct_count == 0) {
2589 new_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
2590 update_blocked_time = true;
2591 }
2592 });
2593 if (update_blocked_time && (old_blocked_data.scct_timestamp != SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID)) {
2594 uint64_t ctime = mach_absolute_time();
2595 if (ctime > old_blocked_data.scct_timestamp) {
2596 uint64_t blocked_time = ctime - old_blocked_data.scct_timestamp;
2597 blocked_time = MIN(blocked_time, sched_clutch_bucket_group_adjust_threshold);
2598 os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed);
2599 }
2600 }
2601 return (uint32_t)new_blocked_data.scct_count;
2602 }
2603
2604 static uint32_t
2605 sched_clutch_bucket_group_run_count_dec(
2606 sched_clutch_bucket_group_t clutch_bucket_group)
2607 {
2608 sched_clutch_counter_time_t old_blocked_data;
2609 sched_clutch_counter_time_t new_blocked_data;
2610
2611 uint64_t ctime = mach_absolute_time();
2612 os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2613 new_blocked_data.scct_count = old_blocked_data.scct_count - 1;
2614 new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2615 if (new_blocked_data.scct_count == 0) {
2616 new_blocked_data.scct_timestamp = ctime;
2617 }
2618 });
2619 return (uint32_t)new_blocked_data.scct_count;
2620 }
2621
2622 static inline sched_clutch_bucket_t
2623 sched_clutch_bucket_for_thread(
2624 sched_clutch_root_t root_clutch,
2625 thread_t thread)
2626 {
2627 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2628 assert(thread->thread_group == clutch->sc_tg);
2629
2630 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2631 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2632 assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
2633
2634 return clutch_bucket;
2635 }
2636
2637 static inline sched_clutch_bucket_group_t
2638 sched_clutch_bucket_group_for_thread(thread_t prev_thread)
2639 {
2640 sched_clutch_t clutch = sched_clutch_for_thread_group(prev_thread->thread_group);
2641 return &clutch->sc_clutch_groups[prev_thread->th_sched_bucket];
2642 }
2643
2644 /*
2645 * sched_clutch_thread_insert()
2646 *
2647 * Routine to insert a thread into the sched clutch hierarchy.
2648 * Update the counts at all levels of the hierarchy and insert the nodes
2649 * as they become runnable. Always called with the pset lock held.
2650 */
2651 static boolean_t
2652 sched_clutch_thread_insert(
2653 sched_clutch_root_t root_clutch,
2654 thread_t thread,
2655 integer_t options)
2656 {
2657 boolean_t result = FALSE;
2658
2659 sched_clutch_hierarchy_locked_assert(root_clutch);
2660 #if CONFIG_SCHED_EDGE
2661 sched_edge_cluster_cumulative_count_incr(root_clutch, thread->th_sched_bucket);
2662 sched_edge_shared_rsrc_runnable_load_incr(root_clutch, thread);
2663
2664 if (sched_edge_thread_should_be_inserted_as_bound(root_clutch, thread)) {
2665 /*
2666 * Includes threads bound to this specific cluster as well as all
2667 * shared resource threads.
2668 */
2669 return sched_edge_bound_thread_insert(root_clutch, thread, options);
2670 }
2671 #endif /* CONFIG_SCHED_EDGE */
2672
2673 uint64_t current_timestamp = mach_absolute_time();
2674 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2675 assert(thread->thread_group == clutch->sc_tg);
2676 sched_clutch_bucket_t clutch_bucket = sched_clutch_bucket_for_thread(root_clutch, thread);
2677 assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
2678
2679 /*
2680 * Thread linkage in clutch_bucket
2681 *
2682 * A thread has a few linkages within the clutch bucket:
2683 * - A stable priority queue linkage which is the main runqueue (based on sched_pri) for the clutch bucket
2684 * - A regular priority queue linkage which is based on thread's base/promoted pri (used for clutch bucket priority calculation)
2685 * - A queue linkage used for timesharing operations of threads at the scheduler tick
2686 */
2687
2688 /* Insert thread into the clutch_bucket stable priority runqueue using sched_pri */
2689 thread->th_clutch_runq_link.stamp = current_timestamp;
2690 priority_queue_entry_set_sched_pri(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link, thread->sched_pri,
2691 (options & SCHED_TAILQ) ? PRIORITY_QUEUE_ENTRY_NONE : PRIORITY_QUEUE_ENTRY_PREEMPTED);
2692 priority_queue_insert(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2693
2694 /* Insert thread into clutch_bucket priority queue based on the promoted or base priority */
2695 priority_queue_entry_set_sched_pri(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link,
2696 sched_thread_sched_pri_promoted(thread) ? thread->sched_pri : thread->base_pri, false);
2697 priority_queue_insert(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2698
2699 /* Insert thread into timesharing queue of the clutch bucket */
2700 enqueue_tail(&clutch_bucket->scb_thread_timeshare_queue, &thread->th_clutch_timeshare_link);
2701
2702 /* Increment the urgency counter for the root if necessary */
2703 sched_clutch_root_urgency_inc(root_clutch, thread);
2704
2705 os_atomic_inc(&clutch->sc_thr_count, relaxed);
2706 sched_clutch_bucket_group_thr_count_inc(clutch_bucket->scb_group, current_timestamp);
2707
2708 /* Enqueue the clutch into the hierarchy (if needed) and update properties; pick the insertion order based on thread options */
2709 sched_clutch_bucket_options_t scb_options = (options & SCHED_HEADQ) ? SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ : SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ;
2710 if (clutch_bucket->scb_thr_count == 0) {
2711 sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2712 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2713 result = sched_clutch_bucket_runnable(clutch_bucket, root_clutch, current_timestamp, scb_options);
2714 } else {
2715 sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2716 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2717 result = sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, scb_options);
2718 }
2719
2720 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2721 root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2722 SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2723 return result;
2724 }
2725
2726 /*
2727 * sched_clutch_thread_remove()
2728 *
2729 * Routine to remove a thread from the sched clutch hierarchy.
2730 * Update the counts at all levels of the hierarchy and remove the nodes
2731 * as they become empty. Always called with the pset lock held.
2732 */
2733 static void
2734 sched_clutch_thread_remove(
2735 sched_clutch_root_t root_clutch,
2736 thread_t thread,
2737 uint64_t current_timestamp,
2738 sched_clutch_bucket_options_t options)
2739 {
2740 sched_clutch_hierarchy_locked_assert(root_clutch);
2741 #if CONFIG_SCHED_EDGE
2742 sched_edge_cluster_cumulative_count_decr(root_clutch, thread->th_sched_bucket);
2743 sched_edge_shared_rsrc_runnable_load_decr(root_clutch, thread);
2744
2745 if (thread->th_bound_cluster_enqueued) {
2746 sched_edge_bound_thread_remove(root_clutch, thread);
2747 return;
2748 }
2749 #endif /* CONFIG_SCHED_EDGE */
2750 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2751 assert(thread->thread_group == clutch->sc_tg);
2752 thread_assert_runq_nonnull(thread);
2753
2754 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2755 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2756 assert(clutch_bucket->scb_root == root_clutch);
2757
2758 /* Decrement the urgency counter for the root if necessary */
2759 sched_clutch_root_urgency_dec(root_clutch, thread);
2760 /* Remove thread from the clutch_bucket */
2761 priority_queue_remove(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2762 remqueue(&thread->th_clutch_timeshare_link);
2763
2764 priority_queue_remove(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2765
2766 /*
2767 * Warning: After this point, the thread's scheduling fields may be
2768 * modified by other cores that acquire the thread lock.
2769 */
2770 thread_clear_runq(thread);
2771
2772 /* Update counts at various levels of the hierarchy */
2773 os_atomic_dec(&clutch->sc_thr_count, relaxed);
2774 sched_clutch_bucket_group_thr_count_dec(clutch_bucket->scb_group, current_timestamp);
2775 sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
2776 sched_clutch_thr_count_dec(&clutch_bucket->scb_thr_count);
2777
2778 /* Remove the clutch from hierarchy (if needed) and update properties */
2779 if (clutch_bucket->scb_thr_count == 0) {
2780 sched_clutch_bucket_empty(clutch_bucket, root_clutch, current_timestamp, options);
2781 } else {
2782 sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, options);
2783 }
2784
2785 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2786 root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2787 SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2788 }
2789
2790 /*
2791 * sched_clutch_thread_unbound_lookup()
2792 *
2793 * Routine to find the highest unbound thread in the root clutch.
2794 * Helps find threads easily for steal/migrate scenarios in the
2795 * Edge scheduler.
2796 */
2797 static thread_t
2798 sched_clutch_thread_unbound_lookup(
2799 sched_clutch_root_t root_clutch,
2800 sched_clutch_root_bucket_t root_bucket,
2801 processor_t _Nullable processor,
2802 thread_t _Nullable prev_thread)
2803 {
2804 assert(processor != NULL || prev_thread == NULL);
2805 assert(root_bucket->scrb_bound == false);
2806 sched_clutch_hierarchy_locked_assert(root_clutch);
2807
2808 /* Find the highest priority clutch bucket in this root bucket */
2809 bool chose_prev_thread = false;
2810 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, root_bucket, processor, prev_thread, &chose_prev_thread);
2811 assert(clutch_bucket != NULL);
2812
2813 if (chose_prev_thread) {
2814 /* We have determined that prev_thread is the highest thread, based on the Clutch bucket level policy */
2815 assert(processor != NULL && prev_thread != NULL);
2816 return prev_thread;
2817 }
2818
2819 /* Find the highest priority runnable thread in this clutch bucket */
2820 thread_t thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
2821 assert(thread != NULL);
2822
2823 /* Consider the previous thread */
2824 if (prev_thread != NULL &&
2825 sched_clutch_bucket_for_thread(root_clutch, prev_thread) == clutch_bucket &&
2826 sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, thread->sched_pri, processor->first_timeslice)) {
2827 thread = prev_thread;
2828 }
2829
2830 return thread;
2831 }
2832
2833 static sched_clutch_root_bucket_t
2834 sched_clutch_root_bucket_for_thread(
2835 sched_clutch_root_t root_clutch,
2836 thread_t prev_thread)
2837 {
2838 #if CONFIG_SCHED_EDGE
2839 if (sched_edge_thread_should_be_inserted_as_bound(root_clutch, prev_thread)) {
2840 return &root_clutch->scr_bound_buckets[prev_thread->th_sched_bucket];
2841 }
2842 #endif /* CONFIG_SCHED_EDGE */
2843 return &root_clutch->scr_unbound_buckets[prev_thread->th_sched_bucket];
2844 }
2845
2846 /*
2847 * sched_clutch_hierarchy_thread_highest()
2848 *
2849 * Routine to traverse the Clutch hierarchy and return the highest thread which
2850 * should be selected to run next, optionally comparing against the previously
2851 * running thread. Removes the highest thread with sched_clutch_thread_remove()
2852 * depending on the traverse mode and whether it is the previously running thread.
2853 * Always called with the pset lock held.
2854 */
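/*
 * Traverse modes, as exercised by the callers in this file (summary for
 * readability):
 * - SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT: only evaluates whether a better
 *   thread than the currently running one exists; nothing is removed.
 * - SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT: compares against the
 *   running thread and removes the selection from the hierarchy unless the
 *   running thread itself is chosen.
 * - SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY: no previous thread is
 *   considered; the selected thread is always removed from the hierarchy.
 */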
2855 static thread_t
2856 sched_clutch_hierarchy_thread_highest(
2857 sched_clutch_root_t root_clutch,
2858 processor_t processor,
2859 thread_t _Nullable prev_thread,
2860 sched_clutch_traverse_mode_t mode)
2861 {
2862 assert(mode != SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || prev_thread == NULL);
2863 sched_clutch_hierarchy_locked_assert(root_clutch);
2864
2865 thread_t highest_thread = NULL;
2866 uint64_t current_timestamp = mach_absolute_time();
2867 bool chose_prev_thread = false;
2868 sched_clutch_dbg_thread_select_packed_t debug_info = {0};
2869 sched_clutch_root_bucket_t prev_root_bucket = prev_thread != NULL ? sched_clutch_root_bucket_for_thread(root_clutch, prev_thread) : NULL;
2870 sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(root_clutch, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL, prev_root_bucket, prev_thread, &chose_prev_thread, mode, &debug_info);
2871 if (chose_prev_thread) {
2872 /* We disambiguated that we want to keep running the previous thread */
2873 highest_thread = processor->active_thread;
2874 goto done_selecting_thread;
2875 }
2876 if (root_bucket == NULL) {
2877 /* The Clutch hierarchy has no runnable threads, including the previous thread */
2878 assert(sched_clutch_root_count(root_clutch) == 0);
2879 assert(prev_thread == NULL);
2880 return NULL;
2881 }
2882 if (root_bucket != prev_root_bucket) {
2883 /* We have ruled out continuing to run the previous thread, based on the root bucket level policy */
2884 prev_thread = NULL;
2885 assert((mode == SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT) || (prev_root_bucket == NULL) ||
2886 (prev_root_bucket->scrb_bucket >= root_bucket->scrb_bucket) || (root_bucket->scrb_starvation_avoidance) ||
2887 (prev_root_bucket->scrb_bound != root_bucket->scrb_bound) ||
2888 (root_bucket->scrb_warp_remaining > 0 && root_bucket->scrb_warped_deadline > current_timestamp && prev_root_bucket->scrb_warp_remaining == 0));
2889 }
2890
2891 if (root_bucket->scrb_bound) {
2892 highest_thread = sched_clutch_thread_bound_lookup(root_clutch, root_bucket, processor, prev_thread);
2893 } else {
2894 highest_thread = sched_clutch_thread_unbound_lookup(root_clutch, root_bucket, processor, prev_thread);
2895 }
2896
2897 if (mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY ||
2898 (mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT && highest_thread != processor->active_thread)) {
2899 assert(mode != SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || highest_thread != processor->active_thread);
2900 sched_clutch_thread_remove(root_clutch, highest_thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
2901 }
2902
2903 done_selecting_thread:
2904 debug_info.trace_data.version = SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION;
2905 debug_info.trace_data.traverse_mode = mode;
2906 debug_info.trace_data.cluster_id = root_clutch->scr_cluster_id;
2907 debug_info.trace_data.selection_was_cluster_bound = root_bucket->scrb_bound;
2908 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
2909 thread_tid(highest_thread), thread_group_get_id(highest_thread->thread_group), root_bucket->scrb_bucket, debug_info.scdts_trace_data_packed, 0);
2910 return highest_thread;
2911 }
2912
2913 /* High level global accessor routines */
2914
2915 /*
2916 * sched_clutch_root_urgency()
2917 *
2918 * Routine to get the urgency of the highest runnable
2919 * thread in the hierarchy.
2920 */
2921 static uint32_t
2922 sched_clutch_root_urgency(
2923 sched_clutch_root_t root_clutch)
2924 {
2925 return root_clutch->scr_urgency;
2926 }
2927
2928 /*
2929 * sched_clutch_root_count_sum()
2930 *
2931 * The count_sum mechanism is used for scheduler runq
2932 * statistics calculation. It's only useful for debugging
2933 * purposes; since it requires a mach_absolute_time() call in
2934 * other scheduler implementations, it's better to avoid
2935 * populating this until absolutely necessary.
2936 */
2937 static uint32_t
2938 sched_clutch_root_count_sum(
2939 __unused sched_clutch_root_t root_clutch)
2940 {
2941 return 0;
2942 }
2943
2944 /*
2945 * sched_clutch_root_priority()
2946 *
2947 * Routine to get the priority of the highest runnable
2948 * thread in the hierarchy.
2949 */
2950 static int
2951 sched_clutch_root_priority(
2952 sched_clutch_root_t root_clutch)
2953 {
2954 return root_clutch->scr_priority;
2955 }
2956
2957 /*
2958 * sched_clutch_root_count()
2959 *
2960 * Returns total number of runnable threads in the hierarchy.
2961 */
2962 uint32_t
2963 sched_clutch_root_count(
2964 sched_clutch_root_t root_clutch)
2965 {
2966 return root_clutch->scr_thr_count;
2967 }
2968
2969 #if CONFIG_SCHED_EDGE
2970
2971 /*
2972 * sched_clutch_root_foreign_empty()
2973 *
2974 * Routine to check if the foreign clutch bucket priority list is empty for a cluster.
2975 */
2976 static boolean_t
2977 sched_clutch_root_foreign_empty(
2978 sched_clutch_root_t root_clutch)
2979 {
2980 return priority_queue_empty(&root_clutch->scr_foreign_buckets);
2981 }
2982
2983 /*
2984 * sched_clutch_root_highest_foreign_thread_remove()
2985 *
2986 * Routine to return the thread in the highest priority clutch bucket in a cluster.
2987 * Must be called with the pset for the cluster locked.
2988 */
2989 static thread_t
2990 sched_clutch_root_highest_foreign_thread_remove(
2991 sched_clutch_root_t root_clutch)
2992 {
2993 thread_t thread = THREAD_NULL;
2994 if (priority_queue_empty(&root_clutch->scr_foreign_buckets)) {
2995 return thread;
2996 }
2997 sched_clutch_bucket_t clutch_bucket = priority_queue_max(&root_clutch->scr_foreign_buckets, struct sched_clutch_bucket, scb_foreignlink);
2998 thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
2999 sched_clutch_thread_remove(root_clutch, thread, mach_absolute_time(), 0);
3000 return thread;
3001 }
3002
3003 #endif /* CONFIG_SCHED_EDGE */
3004
3005 /*
3006 * sched_clutch_thread_pri_shift()
3007 *
3008 * Routine to get the priority shift value for a thread.
3009 * Since the timesharing is done at the clutch_bucket level,
3010 * this routine gets the clutch_bucket and retrieves the
3011 * values from there.
3012 */
3013 uint32_t
3014 sched_clutch_thread_pri_shift(
3015 thread_t thread,
3016 sched_bucket_t bucket)
3017 {
3018 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3019 return INT8_MAX;
3020 }
3021 assert(bucket != TH_BUCKET_RUN);
3022 sched_clutch_t clutch = sched_clutch_for_thread(thread);
3023 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
3024 return os_atomic_load(&clutch_bucket_group->scbg_pri_shift, relaxed);
3025 }
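/*
 * Note (for context; the consumer lives in the timeshare priority code, not in
 * this file): timeshare decay is roughly sched_pri = base_pri -
 * (sched_usage >> pri_shift), so a larger pri_shift returned here means the
 * bucket's threads decay more slowly, and returning INT8_MAX effectively
 * disables decay for threads that are not clutch-eligible.
 */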
3026
3027 #pragma mark -- Clutch Scheduler Algorithm
3028
3029 static void
3030 sched_clutch_init(void);
3031
3032 #if !SCHED_TEST_HARNESS
3033
3034 static thread_t
3035 sched_clutch_steal_thread(processor_set_t pset);
3036
3037 static void
3038 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context);
3039
3040 #endif /* !SCHED_TEST_HARNESS */
3041
3042 static boolean_t
3043 sched_clutch_processor_enqueue(processor_t processor, thread_t thread,
3044 sched_options_t options);
3045
3046 static boolean_t
3047 sched_clutch_processor_queue_remove(processor_t processor, thread_t thread);
3048
3049 static ast_t
3050 sched_clutch_processor_csw_check(processor_t processor);
3051
3052 static boolean_t
3053 sched_clutch_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
3054
3055 static int
3056 sched_clutch_runq_count(processor_t processor);
3057
3058 static boolean_t
3059 sched_clutch_processor_queue_empty(processor_t processor);
3060
3061 #if !SCHED_TEST_HARNESS
3062
3063 static uint64_t
3064 sched_clutch_runq_stats_count_sum(processor_t processor);
3065
3066 #endif /* !SCHED_TEST_HARNESS */
3067
3068 static int
3069 sched_clutch_processor_bound_count(processor_t processor);
3070
3071 static void
3072 sched_clutch_pset_init(processor_set_t pset);
3073
3074 static void
3075 sched_clutch_processor_init(processor_t processor);
3076
3077 static thread_t
3078 sched_clutch_processor_highest_thread(processor_t processor, sched_clutch_traverse_mode_t mode);
3079
3080 static thread_t
3081 sched_clutch_choose_thread(processor_t processor, int priority, thread_t prev_thread, ast_t reason);
3082
3083 #if !SCHED_TEST_HARNESS
3084
3085 static void
3086 sched_clutch_processor_queue_shutdown(processor_t processor);
3087
3088 #endif /* !SCHED_TEST_HARNESS */
3089
3090 static sched_mode_t
3091 sched_clutch_initial_thread_sched_mode(task_t parent_task);
3092
3093 static uint32_t
3094 sched_clutch_initial_quantum_size(thread_t thread);
3095
3096 static uint32_t
3097 sched_clutch_run_incr(thread_t thread);
3098
3099 static uint32_t
3100 sched_clutch_run_decr(thread_t thread);
3101
3102 static void
3103 sched_clutch_update_thread_bucket(thread_t thread);
3104
3105 #if !SCHED_TEST_HARNESS
3106
3107 static void
3108 sched_clutch_thread_group_recommendation_change(struct thread_group *tg, cluster_type_t new_recommendation);
3109
3110 #endif /* !SCHED_TEST_HARNESS */
3111
3112 const struct sched_dispatch_table sched_clutch_dispatch = {
3113 .sched_name = "clutch",
3114 .init = sched_clutch_init,
3115 .timebase_init = sched_timeshare_timebase_init,
3116 .processor_init = sched_clutch_processor_init,
3117 .pset_init = sched_clutch_pset_init,
3118 .choose_thread = sched_clutch_choose_thread,
3119 .processor_enqueue = sched_clutch_processor_enqueue,
3120 .processor_queue_remove = sched_clutch_processor_queue_remove,
3121 .processor_queue_empty = sched_clutch_processor_queue_empty,
3122 .priority_is_urgent = priority_is_urgent,
3123 .processor_csw_check = sched_clutch_processor_csw_check,
3124 .processor_queue_has_priority = sched_clutch_processor_queue_has_priority,
3125 .initial_quantum_size = sched_clutch_initial_quantum_size,
3126 .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode,
3127 .processor_runq_count = sched_clutch_runq_count,
3128 .processor_bound_count = sched_clutch_processor_bound_count,
3129 .multiple_psets_enabled = TRUE,
3130 .avoid_processor_enabled = FALSE,
3131 .thread_avoid_processor = NULL,
3132 .update_thread_bucket = sched_clutch_update_thread_bucket,
3133 .cpu_init_completed = NULL,
3134 .thread_eligible_for_pset = NULL,
3135 #if !SCHED_TEST_HARNESS
3136 .maintenance_continuation = sched_timeshare_maintenance_continue,
3137 .steal_thread_enabled = sched_steal_thread_enabled,
3138 .steal_thread = sched_clutch_steal_thread,
3139 .compute_timeshare_priority = sched_compute_timeshare_priority,
3140 .choose_node = sched_choose_node,
3141 .choose_processor = choose_processor,
3142 .processor_queue_shutdown = sched_clutch_processor_queue_shutdown,
3143 .can_update_priority = can_update_priority,
3144 .update_priority = update_priority,
3145 .lightweight_update_priority = lightweight_update_priority,
3146 .quantum_expire = sched_default_quantum_expire,
3147 .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum,
3148 .thread_update_scan = sched_clutch_thread_update_scan,
3149 .processor_balance = sched_SMT_balance,
3150 .rt_runq = sched_rtlocal_runq,
3151 .rt_init = sched_rtlocal_init,
3152 .rt_queue_shutdown = sched_rtlocal_queue_shutdown,
3153 .rt_runq_scan = sched_rtlocal_runq_scan,
3154 .rt_runq_count_sum = sched_rtlocal_runq_count_sum,
3155 .rt_steal_thread = sched_rtlocal_steal_thread,
3156 .qos_max_parallelism = sched_qos_max_parallelism,
3157 .check_spill = sched_check_spill,
3158 .ipi_policy = sched_ipi_policy,
3159 .thread_should_yield = sched_thread_should_yield,
3160 .run_count_incr = sched_clutch_run_incr,
3161 .run_count_decr = sched_clutch_run_decr,
3162 .pset_made_schedulable = sched_pset_made_schedulable,
3163 .thread_group_recommendation_change = sched_clutch_thread_group_recommendation_change,
3164 #endif /* !SCHED_TEST_HARNESS */
3165 };
3166
3167 __attribute__((always_inline))
3168 static inline run_queue_t
3169 sched_clutch_bound_runq(processor_t processor)
3170 {
3171 return &processor->runq;
3172 }
3173
3174 __attribute__((always_inline))
3175 static inline sched_clutch_root_t
3176 sched_clutch_processor_root_clutch(processor_t processor)
3177 {
3178 return &processor->processor_set->pset_clutch_root;
3179 }
3180
3181 __attribute__((always_inline))
3182 static inline run_queue_t
3183 sched_clutch_thread_bound_runq(processor_t processor, __assert_only thread_t thread)
3184 {
3185 assert(thread->bound_processor == processor);
3186 return sched_clutch_bound_runq(processor);
3187 }
3188
3189 static uint32_t
3190 sched_clutch_initial_quantum_size(thread_t thread)
3191 {
3192 if (thread == THREAD_NULL) {
3193 return std_quantum;
3194 }
3195 assert(sched_clutch_thread_quantum[thread->th_sched_bucket] <= UINT32_MAX);
3196 return (uint32_t)sched_clutch_thread_quantum[thread->th_sched_bucket];
3197 }
3198
3199 static sched_mode_t
3200 sched_clutch_initial_thread_sched_mode(task_t parent_task)
3201 {
3202 if (parent_task == kernel_task) {
3203 return TH_MODE_FIXED;
3204 } else {
3205 return TH_MODE_TIMESHARE;
3206 }
3207 }
3208
3209 static void
3210 sched_clutch_processor_init(processor_t processor)
3211 {
3212 run_queue_init(&processor->runq);
3213 }
3214
3215 static void
3216 sched_clutch_pset_init(processor_set_t pset)
3217 {
3218 sched_clutch_root_init(&pset->pset_clutch_root, pset);
3219 }
3220
3221 static void
3222 sched_clutch_tunables_init(void)
3223 {
3224 sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
3225 sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
3226 sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
3227 clock_interval_to_absolutetime_interval(SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS,
3228 NSEC_PER_USEC, &sched_clutch_bucket_group_adjust_threshold);
3229 assert(sched_clutch_bucket_group_adjust_threshold <= CLUTCH_CPU_DATA_MAX);
3230 sched_clutch_us_to_abstime(sched_clutch_bucket_group_pending_delta_us, sched_clutch_bucket_group_pending_delta);
3231 }
3232
3233 static void
3234 sched_clutch_init(void)
3235 {
3236 if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
3237 sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
3238 }
3239 sched_timeshare_init();
3240 sched_clutch_tunables_init();
3241 }
3242
3243 static inline bool
3244 sched_clutch_pri_greater_than_tiebreak(int pri_one, int pri_two, bool one_wins_ties)
3245 {
3246 if (one_wins_ties) {
3247 return pri_one >= pri_two;
3248 } else {
3249 return pri_one > pri_two;
3250 }
3251 }
3252
3253 /*
3254 * sched_clutch_processor_highest_thread()
3255 *
3256 * Routine to determine the highest thread on the entire cluster runqueue which
3257 * should be selected to run next, optionally comparing against the previously
3258 * running thread. Removes the highest thread from the runqueue, depending on the
3259 * traverse mode and whether the highest thread is the previously running thread.
3260 *
3261 * Always called with the pset lock held. Assumes that processor->active_thread
3262 * may be locked and modified by another processor.
3263 */
3264 static thread_t
3265 sched_clutch_processor_highest_thread(
3266 processor_t processor,
3267 sched_clutch_traverse_mode_t mode)
3268 {
3269 sched_clutch_root_t root_clutch = sched_clutch_processor_root_clutch(processor);
3270 int clutch_pri = sched_clutch_root_priority(root_clutch);
3271 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3272 int bound_pri = bound_runq->highq;
3273
3274 bool has_prev_thread = mode == SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT;
3275 thread_t prev_thread = has_prev_thread ? processor->active_thread : NULL;
3276
3277 if (bound_runq->count == 0 && root_clutch->scr_thr_count == 0) {
3278 /* The runqueue is totally empty */
3279 assert(bound_pri < MINPRI && clutch_pri < MINPRI);
3280 return prev_thread;
3281 }
3282
3283 if (has_prev_thread) {
3284 if (prev_thread->sched_pri >= BASEPRI_RTQUEUES) {
3285 /* The previous thread is real-time and thus guaranteed higher than the non-RT runqueue */
3286 return prev_thread;
3287 }
3288 /* Allow the previous thread to influence the priority comparison of Clutch hierarchy vs. processor-bound runqueue */
3289 if (prev_thread->bound_processor != NULL) {
3290 bound_pri = MAX(bound_pri, prev_thread->sched_pri);
3291 } else {
3292 clutch_pri = MAX(clutch_pri, prev_thread->sched_pri);
3293 }
3294 }
3295
3296 bool prev_thread_is_not_processor_bound = has_prev_thread && (prev_thread->bound_processor == NULL);
3297 bool prev_thread_is_processor_bound = has_prev_thread && (prev_thread->bound_processor != NULL);
3298 thread_t next_thread = prev_thread;
3299 if (clutch_pri > bound_pri) {
3300 if (root_clutch->scr_thr_count == 0) {
3301 goto found_thread;
3302 }
3303 next_thread = sched_clutch_hierarchy_thread_highest(root_clutch, processor, prev_thread_is_not_processor_bound ? prev_thread : NULL, mode);
3304 } else {
3305 if (bound_runq->count == 0 ||
3306 (prev_thread_is_processor_bound && sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, bound_runq->highq, processor->first_timeslice))) {
3307 goto found_thread;
3308 }
3309 next_thread = (mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY) ?
3310 run_queue_dequeue(bound_runq, SCHED_HEADQ) : run_queue_peek(bound_runq);
3311 assert(mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || next_thread != prev_thread);
3312 }
3313 found_thread:
3314 assert(next_thread != NULL);
3315 return next_thread;
3316 }
3317
3318 static thread_t
3319 sched_clutch_choose_thread(
3320 processor_t processor,
3321 __unused int priority,
3322 thread_t _Nullable prev_thread,
3323 __unused ast_t reason)
3324 {
3325 assert(prev_thread == NULL || prev_thread == processor->active_thread);
3326 return sched_clutch_processor_highest_thread(processor, prev_thread != NULL ? SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT : SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY);
3327 }
3328
3329 static boolean_t
3330 sched_clutch_processor_enqueue(
3331 processor_t processor,
3332 thread_t thread,
3333 sched_options_t options)
3334 {
3335 boolean_t result;
3336
3337 thread_set_runq_locked(thread, processor);
3338 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3339 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3340 result = sched_clutch_thread_insert(pset_clutch_root, thread, options);
3341 } else {
3342 run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
3343 result = run_queue_enqueue(rq, thread, options);
3344 }
3345 return result;
3346 }
3347
3348 static boolean_t
3349 sched_clutch_processor_queue_empty(processor_t processor)
3350 {
3351 return sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0 &&
3352 sched_clutch_bound_runq(processor)->count == 0;
3353 }
3354
3355 static ast_t
3356 sched_clutch_processor_csw_check(processor_t processor)
3357 {
3358 assert(processor->active_thread != NULL);
3359 thread_t runqueue_thread = sched_clutch_processor_highest_thread(processor, SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT);
3360 if (runqueue_thread != processor->active_thread) {
3361 /* Found a better thread to run */
3362 if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0 ||
3363 sched_clutch_bound_runq(processor)->urgency > 0) {
3364 return AST_PREEMPT | AST_URGENT;
3365 }
3366 return AST_PREEMPT;
3367 }
3368 return AST_NONE;
3369 }
3370
3371 static boolean_t
3372 sched_clutch_processor_queue_has_priority(
3373 __unused processor_t processor,
3374 __unused int priority,
3375 __unused boolean_t gte)
3376 {
3377 /*
3378 * Never short-circuit the Clutch runqueue by returning FALSE here. Instead,
3379 * thread_select() should always go through sched_clutch_choose_thread().
3380 */
3381 return TRUE;
3382 }
3383
3384 static int
3385 sched_clutch_runq_count(processor_t processor)
3386 {
3387 return (int)sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) + sched_clutch_bound_runq(processor)->count;
3388 }
3389
3390 #if !SCHED_TEST_HARNESS
3391
3392 static uint64_t
3393 sched_clutch_runq_stats_count_sum(processor_t processor)
3394 {
3395 uint64_t bound_sum = sched_clutch_bound_runq(processor)->runq_stats.count_sum;
3396
3397 if (processor->cpu_id == processor->processor_set->cpu_set_low) {
3398 return bound_sum + sched_clutch_root_count_sum(sched_clutch_processor_root_clutch(processor));
3399 } else {
3400 return bound_sum;
3401 }
3402 }
3403
3404 #endif /* !SCHED_TEST_HARNESS */
3405
3406 static int
3407 sched_clutch_processor_bound_count(processor_t processor)
3408 {
3409 return sched_clutch_bound_runq(processor)->count;
3410 }
3411
3412 #if !SCHED_TEST_HARNESS
3413
3414 static void
3415 sched_clutch_processor_queue_shutdown(processor_t processor)
3416 {
3417 processor_set_t pset = processor->processor_set;
3418 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3419 thread_t thread;
3420 queue_head_t tqueue;
3421
3422 /* We only need to migrate threads if this is the last active processor in the pset */
3423 if (pset->online_processor_count > 0) {
3424 pset_unlock(pset);
3425 return;
3426 }
3427
3428 queue_init(&tqueue);
3429 while (sched_clutch_root_count(pset_clutch_root) > 0) {
3430 thread = sched_clutch_hierarchy_thread_highest(pset_clutch_root, processor, NULL, SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY);
3431 enqueue_tail(&tqueue, &thread->runq_links);
3432 }
3433
3434 pset_unlock(pset);
3435
3436 qe_foreach_element_safe(thread, &tqueue, runq_links) {
3437 remqueue(&thread->runq_links);
3438 thread_lock(thread);
3439 thread_setrun(thread, SCHED_TAILQ);
3440 thread_unlock(thread);
3441 }
3442 }
3443
3444 #endif /* !SCHED_TEST_HARNESS */
3445
3446 static boolean_t
3447 sched_clutch_processor_queue_remove(
3448 processor_t processor,
3449 thread_t thread)
3450 {
3451 processor_set_t pset = processor->processor_set;
3452
3453 pset_lock(pset);
3454
3455 if (processor == thread_get_runq_locked(thread)) {
3456 /*
3457 * Thread is on a run queue and we have a lock on
3458 * that run queue.
3459 */
3460 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3461 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3462 sched_clutch_thread_remove(pset_clutch_root, thread, mach_absolute_time(), SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
3463 } else {
3464 run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
3465 run_queue_remove(rq, thread);
3466 }
3467 } else {
3468 /*
3469 * The thread left the run queue before we could
3470 * lock the run queue.
3471 */
3472 thread_assert_runq_null(thread);
3473 processor = PROCESSOR_NULL;
3474 }
3475
3476 pset_unlock(pset);
3477
3478 return processor != PROCESSOR_NULL;
3479 }
3480
3481 #if !SCHED_TEST_HARNESS
3482
3483 static thread_t
3484 sched_clutch_steal_thread(__unused processor_set_t pset)
3485 {
3486 /* Thread stealing is not enabled for single cluster clutch scheduler platforms */
3487 return THREAD_NULL;
3488 }
3489
3490 static void
3491 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context)
3492 {
3493 boolean_t restart_needed = FALSE;
3494 processor_t processor = processor_list;
3495 processor_set_t pset;
3496 thread_t thread;
3497 spl_t s;
3498
3499 /*
3500 * We update the threads associated with each processor (bound and idle threads)
3501 * and then update the threads in each pset runqueue.
3502 */
3503
3504 do {
3505 do {
3506 pset = processor->processor_set;
3507
3508 s = splsched();
3509 pset_lock(pset);
3510
3511 restart_needed = runq_scan(sched_clutch_bound_runq(processor), scan_context);
3512
3513 pset_unlock(pset);
3514 splx(s);
3515
3516 if (restart_needed) {
3517 break;
3518 }
3519
3520 thread = processor->idle_thread;
3521 if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
3522 if (thread_update_add_thread(thread) == FALSE) {
3523 restart_needed = TRUE;
3524 break;
3525 }
3526 }
3527 } while ((processor = processor->processor_list) != NULL);
3528
3529 /* Ok, we now have a collection of candidates -- fix them. */
3530 thread_update_process_threads();
3531 } while (restart_needed);
3532
3533 pset_node_t node = &pset_node0;
3534 pset = node->psets;
3535
3536 do {
3537 do {
3538 restart_needed = FALSE;
3539 while (pset != NULL) {
3540 s = splsched();
3541 pset_lock(pset);
3542
3543 if (sched_clutch_root_count(&pset->pset_clutch_root) > 0) {
3544 for (sched_bucket_t bucket = TH_BUCKET_SHARE_FG; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
3545 restart_needed = runq_scan(&pset->pset_clutch_root.scr_bound_buckets[bucket].scrb_bound_thread_runq, scan_context);
3546 if (restart_needed) {
3547 break;
3548 }
3549 }
3550 queue_t clutch_bucket_list = &pset->pset_clutch_root.scr_clutch_buckets;
3551 sched_clutch_bucket_t clutch_bucket;
3552 qe_foreach_element(clutch_bucket, clutch_bucket_list, scb_listlink) {
3553 sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, scan_context->sched_tick_last_abstime);
3554 restart_needed = sched_clutch_timeshare_scan(&clutch_bucket->scb_thread_timeshare_queue, clutch_bucket->scb_thr_count, scan_context);
3555 if (restart_needed) {
3556 break;
3557 }
3558 }
3559 }
3560
3561 pset_unlock(pset);
3562 splx(s);
3563
3564 if (restart_needed) {
3565 break;
3566 }
3567 pset = pset->pset_list;
3568 }
3569
3570 if (restart_needed) {
3571 break;
3572 }
3573 } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
3574
3575 /* Ok, we now have a collection of candidates -- fix them. */
3576 thread_update_process_threads();
3577 } while (restart_needed);
3578 }
3579
3580 /*
3581 * For the clutch scheduler, the run counts are maintained in the clutch
3582 * buckets (i.e. the thread group scheduling structure).
3583 */
3584 static uint32_t
3585 sched_clutch_run_incr(thread_t thread)
3586 {
3587 assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
3588 uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3589 sched_clutch_thread_run_bucket_incr(thread, thread->th_sched_bucket);
3590 return new_count;
3591 }
3592
3593 static uint32_t
3594 sched_clutch_run_decr(thread_t thread)
3595 {
3596 assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);
3597 uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3598 sched_clutch_thread_run_bucket_decr(thread, thread->th_sched_bucket);
3599 return new_count;
3600 }
3601
3602 /*
3603 * For threads that have changed sched_pri without changing the
3604 * base_pri for any reason other than decay, use the sched_pri
3605 * as the bucketizing priority instead of base_pri. All such
3606 * changes are typically due to kernel locking primitive boosts
3607 * or demotions.
3608 */
3609 static boolean_t
3610 sched_thread_sched_pri_promoted(thread_t thread)
3611 {
3612 return (thread->sched_flags & TH_SFLAG_PROMOTED) ||
3613 (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) ||
3614 (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) ||
3615 (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) ||
3616 (thread->kern_promotion_schedpri != 0);
3617 }
3618
3619 #endif /* !SCHED_TEST_HARNESS */
3620
3621 /*
3622 * Routine to update the scheduling bucket for the thread.
3623 *
3624 * In the clutch scheduler implementation, the thread's bucket
3625 * is based on sched_pri if it was promoted due to a kernel
3626 * primitive; otherwise it's based on the thread's base_pri. This
3627 * enhancement allows promoted threads to reach a higher priority
3628 * bucket and potentially get selected sooner for scheduling.
3629 *
3630 * Also, the clutch scheduler does not honor fixed priority below
3631 * FG priority. It simply puts those threads in the corresponding
3632 * timeshare bucket. The reason for doing so is that it is
3633 * extremely hard to define the scheduling properties of such threads
3634 * and they typically lead to performance issues.
3635 *
3636 * Called with the thread lock held and the thread held off the runqueue.
3637 */
3638
3639 void
3640 sched_clutch_update_thread_bucket(thread_t thread)
3641 {
3642 sched_bucket_t old_bucket = thread->th_sched_bucket;
3643 thread_assert_runq_null(thread);
3644 int pri = (sched_thread_sched_pri_promoted(thread)) ? thread->sched_pri : thread->base_pri;
3645 sched_bucket_t new_bucket = sched_clutch_thread_bucket_map(thread, pri);
3646
3647 if (old_bucket == new_bucket) {
3648 return;
3649 }
3650
3651 thread->th_sched_bucket = new_bucket;
3652 thread->pri_shift = sched_clutch_thread_pri_shift(thread, new_bucket);
3653 /*
3654 * Since this is called after the thread has been removed from the runq,
3655 * only the run counts need to be updated. The re-insert into the runq
3656 * would put the thread into the correct new bucket's runq.
3657 */
3658 if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
3659 sched_clutch_thread_run_bucket_decr(thread, old_bucket);
3660 sched_clutch_thread_run_bucket_incr(thread, new_bucket);
3661 }
3662 }
3663
3664 #if !SCHED_TEST_HARNESS
3665
3666 static void
3667 sched_clutch_thread_group_recommendation_change(__unused struct thread_group *tg, __unused cluster_type_t new_recommendation)
3668 {
3669 /* Clutch ignores the recommendation because Clutch does not migrate
3670 * threads between cluster types independently from the Edge scheduler.
3671 */
3672 }
3673
3674 #endif /* !SCHED_TEST_HARNESS */
3675
3676 #if CONFIG_SCHED_EDGE
3677
3678 /* Implementation of the AMP version of the clutch scheduler */
3679
3680 static void
3681 sched_edge_init(void);
3682
3683 static void
3684 sched_edge_pset_init(processor_set_t pset);
3685
3686 static thread_t
3687 sched_edge_processor_idle(processor_set_t pset);
3688
3689 static boolean_t
3690 sched_edge_processor_queue_empty(processor_t processor);
3691
3692 static void
3693 sched_edge_processor_queue_shutdown(processor_t processor);
3694
3695 static processor_t
3696 sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread);
3697
3698 static bool
3699 sched_edge_thread_avoid_processor(processor_t processor, thread_t thread, ast_t reason);
3700
3701 static bool
3702 sched_edge_balance(processor_t cprocessor, processor_set_t cpset);
3703
3704 static void
3705 sched_edge_check_spill(processor_set_t pset, thread_t thread);
3706
3707 static bool
3708 sched_edge_thread_should_yield(processor_t processor, thread_t thread);
3709
3710 static void
3711 sched_edge_pset_made_schedulable(processor_t processor, processor_set_t dst_pset, boolean_t drop_lock);
3712
3713 static void
3714 sched_edge_cpu_init_completed(void);
3715
3716 static bool
3717 sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset);
3718
3719 static bool
3720 sched_edge_steal_thread_enabled(processor_set_t pset);
3721
3722 static sched_ipi_type_t
3723 sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event);
3724
3725 static uint32_t
3726 sched_edge_qos_max_parallelism(int qos, uint64_t options);
3727
3728 static uint32_t
3729 sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket);
3730
3731 const struct sched_dispatch_table sched_edge_dispatch = {
3732 .sched_name = "edge",
3733 .init = sched_edge_init,
3734 .timebase_init = sched_timeshare_timebase_init,
3735 .processor_init = sched_clutch_processor_init,
3736 .pset_init = sched_edge_pset_init,
3737 .maintenance_continuation = sched_timeshare_maintenance_continue,
3738 .choose_thread = sched_clutch_choose_thread,
3739 .steal_thread_enabled = sched_edge_steal_thread_enabled,
3740 .steal_thread = sched_edge_processor_idle,
3741 .compute_timeshare_priority = sched_compute_timeshare_priority,
3742 .choose_node = sched_choose_node,
3743 .choose_processor = sched_edge_choose_processor,
3744 .processor_enqueue = sched_clutch_processor_enqueue,
3745 .processor_queue_shutdown = sched_edge_processor_queue_shutdown,
3746 .processor_queue_remove = sched_clutch_processor_queue_remove,
3747 .processor_queue_empty = sched_edge_processor_queue_empty,
3748 .priority_is_urgent = priority_is_urgent,
3749 .processor_csw_check = sched_clutch_processor_csw_check,
3750 .processor_queue_has_priority = sched_clutch_processor_queue_has_priority,
3751 .initial_quantum_size = sched_clutch_initial_quantum_size,
3752 .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode,
3753 .can_update_priority = can_update_priority,
3754 .update_priority = update_priority,
3755 .lightweight_update_priority = lightweight_update_priority,
3756 .quantum_expire = sched_default_quantum_expire,
3757 .processor_runq_count = sched_clutch_runq_count,
3758 .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum,
3759 .processor_bound_count = sched_clutch_processor_bound_count,
3760 .thread_update_scan = sched_clutch_thread_update_scan,
3761 .multiple_psets_enabled = TRUE,
3762 .avoid_processor_enabled = TRUE,
3763 .thread_avoid_processor = sched_edge_thread_avoid_processor,
3764 .processor_balance = sched_edge_balance,
3765
3766 .rt_runq = sched_rtlocal_runq,
3767 .rt_init = sched_rtlocal_init,
3768 .rt_queue_shutdown = sched_rtlocal_queue_shutdown,
3769 .rt_runq_scan = sched_rtlocal_runq_scan,
3770 .rt_runq_count_sum = sched_rtlocal_runq_count_sum,
3771 .rt_steal_thread = sched_rtlocal_steal_thread,
3772
3773 .qos_max_parallelism = sched_edge_qos_max_parallelism,
3774 .check_spill = sched_edge_check_spill,
3775 .ipi_policy = sched_edge_ipi_policy,
3776 .thread_should_yield = sched_edge_thread_should_yield,
3777 .run_count_incr = sched_clutch_run_incr,
3778 .run_count_decr = sched_clutch_run_decr,
3779 .update_thread_bucket = sched_clutch_update_thread_bucket,
3780 .pset_made_schedulable = sched_edge_pset_made_schedulable,
3781 .thread_group_recommendation_change = NULL,
3782 .cpu_init_completed = sched_edge_cpu_init_completed,
3783 .thread_eligible_for_pset = sched_edge_thread_eligible_for_pset,
3784 };
3785
3786 static bitmap_t sched_edge_available_pset_bitmask[BITMAP_LEN(MAX_PSETS)];
3787
3788 /*
3789 * sched_edge_pset_available()
3790 *
3791 * Routine to determine if a pset is available for scheduling.
3792 */
3793 static bool
sched_edge_pset_available(processor_set_t pset)
3795 {
3796 if (pset == NULL) {
3797 return false;
3798 }
3799 return pset_available_cpu_count(pset) > 0;
3800 }
3801
3802 /*
3803 * sched_edge_thread_bound_cluster_id()
3804 *
3805 * Routine to determine which cluster a particular thread is bound to. Uses
3806 * the sched_flags on the thread to map back to a specific cluster id.
3807 *
3808 * <Edge Multi-cluster Support Needed>
3809 */
3810 static uint32_t
sched_edge_thread_bound_cluster_id(thread_t thread)
3812 {
3813 assert(SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread));
3814 return thread->th_bound_cluster_id;
3815 }
3816
3817 /* Forward declaration for some thread migration routines */
3818 static boolean_t sched_edge_foreign_runnable_thread_available(processor_set_t pset);
3819 static boolean_t sched_edge_foreign_running_thread_available(processor_set_t pset);
3820 static processor_set_t sched_edge_steal_candidate(processor_set_t pset);
3821 static processor_set_t sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks);
3822
3823 /*
3824 * sched_edge_config_set()
3825 *
3826 * Support to update an edge configuration. Typically used by CLPC to affect thread migration
3827 * policies in the scheduler.
3828 */
3829 static void
sched_edge_config_set(uint32_t src_cluster, uint32_t dst_cluster, sched_clutch_edge edge_config)
3831 {
3832 sched_clutch_edge *edge = &pset_array[src_cluster]->sched_edges[dst_cluster];
3833 edge->sce_edge_packed = edge_config.sce_edge_packed;
3834 }
3835
3836 /*
3837 * sched_edge_config_get()
3838 *
3839 * Support to get an edge configuration. Typically used by CLPC to query edge configs to decide
3840 * if it needs to update edges.
3841 */
3842 static sched_clutch_edge
sched_edge_config_get(uint32_t src_cluster, uint32_t dst_cluster)
3844 {
3845 return pset_array[src_cluster]->sched_edges[dst_cluster];
3846 }
3847
3848 /*
3849 * sched_edge_matrix_set()
3850 *
3851 * Routine to update various edges in the cluster edge matrix. The edge_changes_bitmap
3852 * indicates which edges need to be updated. Both the edge_matrix & edge_changes_bitmap
3853 * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
3854 */
3855 void
sched_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changes_bitmap, __unused uint64_t flags, uint64_t matrix_order)
3857 {
3858 uint32_t edge_index = 0;
3859 for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
3860 for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
3861 if (edge_changes_bitmap[edge_index]) {
3862 sched_edge_config_set(src_cluster, dst_cluster, edge_matrix[edge_index]);
3863 }
3864 edge_index++;
3865 }
3866 }
3867 }
3868
3869 /*
3870 * sched_edge_matrix_get()
3871 *
3872 * Routine to retrieve various edges in the cluster edge matrix. The edge_request_bitmap
3873 * indicates which edges need to be retrieved. Both the edge_matrix & edge_request_bitmap
3874 * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
3875 */
3876 void
sched_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_request_bitmap, __unused uint64_t flags, uint64_t matrix_order)
3878 {
3879 uint32_t edge_index = 0;
3880 for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
3881 for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
3882 if (edge_request_bitmap[edge_index]) {
3883 edge_matrix[edge_index] = sched_edge_config_get(src_cluster, dst_cluster);
3884 }
3885 edge_index++;
3886 }
3887 }
3888 }
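
/*
 * Illustrative sketch (not part of the kernel build): the edge matrix and its
 * change/request bitmaps are row-major flattened arrays of matrix_order *
 * matrix_order entries, so the edge from src_cluster to dst_cluster lives at
 * index (src_cluster * matrix_order) + dst_cluster. The standalone snippet
 * below models only that indexing; the helper name edge_index() is hypothetical.
 *
 *     #include <stdint.h>
 *     #include <assert.h>
 *
 *     static inline uint32_t
 *     edge_index(uint32_t src_cluster, uint32_t dst_cluster, uint32_t matrix_order)
 *     {
 *         return (src_cluster * matrix_order) + dst_cluster;
 *     }
 *
 *     int
 *     main(void)
 *     {
 *         // On a 3-cluster system, the edge 2 -> 1 is flattened index 7.
 *         assert(edge_index(2, 1, 3) == 7);
 *         return 0;
 *     }
 */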
3889
3890 /*
3891 * sched_edge_init()
3892 *
3893 * Routine to initialize the data structures for the Edge scheduler.
3894 */
3895 static void
sched_edge_init(void)
3897 {
3898 if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
3899 sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
3900 }
3901 sched_timeshare_init();
3902 sched_clutch_tunables_init();
3903 sched_edge_max_clusters = ml_get_cluster_count();
3904 }
3905
3906 static void
sched_edge_pset_init(processor_set_t pset)
3908 {
3909 uint32_t pset_cluster_id = pset->pset_cluster_id;
3910 pset->pset_type = (pset->pset_cluster_type == PSET_AMP_P) ? CLUSTER_TYPE_P : CLUSTER_TYPE_E;
3911
3912 /* Set the edge weight and properties for the pset itself */
3913 bitmap_clear(pset->foreign_psets, pset_cluster_id);
3914 bitmap_clear(pset->native_psets, pset_cluster_id);
3915 bitmap_clear(pset->local_psets, pset_cluster_id);
3916 bitmap_clear(pset->remote_psets, pset_cluster_id);
3917 pset->sched_edges[pset_cluster_id].sce_edge_packed = (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0}.sce_edge_packed;
3918 sched_clutch_root_init(&pset->pset_clutch_root, pset);
3919 bitmap_set(sched_edge_available_pset_bitmask, pset_cluster_id);
3920 }
3921
3922 static boolean_t
sched_edge_processor_queue_empty(processor_t processor)
3924 {
3925 return (sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0) &&
3926 (sched_clutch_bound_runq(processor)->count == 0);
3927 }
3928
3929 static void
sched_edge_check_spill(__unused processor_set_t pset, __unused thread_t thread)
3931 {
3932 assert(thread->bound_processor == PROCESSOR_NULL);
3933 }
3934
3935 __options_decl(sched_edge_thread_yield_reason_t, uint32_t, {
3936 SCHED_EDGE_YIELD_RUNQ_NONEMPTY = 0x0,
3937 SCHED_EDGE_YIELD_FOREIGN_RUNNABLE = 0x1,
3938 SCHED_EDGE_YIELD_FOREIGN_RUNNING = 0x2,
3939 SCHED_EDGE_YIELD_STEAL_POSSIBLE = 0x3,
3940 SCHED_EDGE_YIELD_DISALLOW = 0x4,
3941 });
3942
3943 static bool
sched_edge_thread_should_yield(processor_t processor, __unused thread_t thread)
3945 {
3946 if (!sched_edge_processor_queue_empty(processor) || (rt_runq_count(processor->processor_set) > 0)) {
3947 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3948 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_RUNQ_NONEMPTY);
3949 return true;
3950 }
3951
	/*
	 * The yield logic should follow the same logic as steal_thread(). The
	 * thread_should_yield() check is effectively asking: if the current thread
	 * gave up the CPU, is there any other thread that would execute on this
	 * CPU? So it needs to provide the same answer as the steal_thread()/
	 * processor-idle logic.
	 */
3959 if (sched_edge_foreign_runnable_thread_available(processor->processor_set)) {
3960 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3961 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNABLE);
3962 return true;
3963 }
3964 if (sched_edge_foreign_running_thread_available(processor->processor_set)) {
3965 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3966 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNING);
3967 return true;
3968 }
3969
3970 processor_set_t steal_candidate = sched_edge_steal_candidate(processor->processor_set);
3971 if (steal_candidate != NULL) {
3972 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3973 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_STEAL_POSSIBLE);
3974 return true;
3975 }
3976
3977 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE, thread_tid(thread), processor->processor_set->pset_cluster_id,
3978 0, SCHED_EDGE_YIELD_DISALLOW);
3979 return false;
3980 }
3981
3982 static void
sched_edge_processor_queue_shutdown(processor_t processor)
3984 {
3985 processor_set_t pset = processor->processor_set;
3986 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3987 thread_t thread;
3988 queue_head_t tqueue;
3989
3990 /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
3991 if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
3992 pset_unlock(pset);
3993 return;
3994 }
3995
3996 bitmap_clear(sched_edge_available_pset_bitmask, pset->pset_cluster_id);
3997
3998 queue_init(&tqueue);
3999 while (sched_clutch_root_count(pset_clutch_root) > 0) {
4000 thread = sched_clutch_hierarchy_thread_highest(pset_clutch_root, processor, NULL, SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY);
4001 enqueue_tail(&tqueue, &thread->runq_links);
4002 }
4003 pset_unlock(pset);
4004
4005 qe_foreach_element_safe(thread, &tqueue, runq_links) {
4006 remqueue(&thread->runq_links);
4007 thread_lock(thread);
4008 thread_setrun(thread, SCHED_TAILQ);
4009 thread_unlock(thread);
4010 }
4011 }
4012
4013 /*
4014 * sched_edge_cluster_load_metric()
4015 *
4016 * The load metric for a cluster is a measure of the average scheduling latency
4017 * experienced by threads on that cluster. It is a product of the average number
4018 * of threads in the runqueue and the average execution time for threads. The metric
4019 * has special values in the following cases:
4020 * - UINT32_MAX: If the cluster is not available for scheduling, its load is set to
4021 * the maximum value to disallow any threads to migrate to this cluster.
4022 * - 0: If there are idle CPUs in the cluster or an empty runqueue; this allows threads
4023 * to be spread across the platform quickly for ncpu wide workloads.
4024 */
4025 static uint32_t
sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket)
4027 {
4028 if (sched_edge_pset_available(pset) == false) {
4029 return UINT32_MAX;
4030 }
4031 return (uint32_t)sched_get_pset_load_average(pset, sched_bucket);
4032 }
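
/*
 * Illustrative sketch (not part of the kernel build): how a caller might compare
 * the load metric across clusters given the special values above (UINT32_MAX for
 * an unavailable cluster, 0 for a cluster with idle capacity). The array contents
 * and the pick_least_loaded() helper are hypothetical.
 *
 *     #include <stdint.h>
 *     #include <stdio.h>
 *
 *     static uint32_t
 *     pick_least_loaded(const uint32_t *load, uint32_t ncl)
 *     {
 *         uint32_t best = 0;
 *         for (uint32_t i = 1; i < ncl; i++) {
 *             if (load[i] < load[best]) {
 *                 best = i;
 *             }
 *         }
 *         return best;
 *     }
 *
 *     int
 *     main(void)
 *     {
 *         // Cluster 0 is offline, cluster 1 has idle CPUs, cluster 2 is busy.
 *         uint32_t load[] = { UINT32_MAX, 0, 37 };
 *         printf("least loaded cluster: %u\n", pick_least_loaded(load, 3)); // prints 1
 *         return 0;
 *     }
 */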
4033
4034 /*
4035 *
4036 * Edge Scheduler Steal/Rebalance logic
4037 *
4038 * = Generic scheduler logic =
4039 *
4040 * The SCHED(steal_thread) scheduler callout is invoked when the processor does not
4041 * find any thread for execution in its runqueue. The aim of the steal operation
4042 * is to find other threads running/runnable in other clusters which should be
4043 * executed here.
4044 *
4045 * If the steal callout does not return a thread, the thread_select() logic calls
4046 * SCHED(processor_balance) callout which is supposed to IPI other CPUs to rebalance
4047 * threads and idle out the current CPU.
4048 *
4049 * = SCHED(steal_thread) for Edge Scheduler =
4050 *
4051 * The edge scheduler hooks into sched_edge_processor_idle() for steal_thread. This
4052 * routine tries to do the following operations in order:
 * (1) Find foreign runnable threads in non-native cluster
4054 * runqueues (sched_edge_foreign_runnable_thread_remove())
4055 * (2) Check if foreign threads are running on the non-native
4056 * clusters (sched_edge_foreign_running_thread_available())
4057 * - If yes, return THREAD_NULL for the steal callout and
4058 * perform rebalancing as part of SCHED(processor_balance) i.e. sched_edge_balance()
4059 * (3) Steal a thread from another cluster based on edge
4060 * weights (sched_edge_steal_thread())
4061 *
4062 * = SCHED(processor_balance) for Edge Scheduler =
4063 *
4064 * If steal_thread did not return a thread for the processor, use
4065 * sched_edge_balance() to rebalance foreign running threads and idle out this CPU.
4066 *
4067 * = Clutch Bucket Preferred Cluster Overrides =
4068 *
4069 * Since these operations (just like thread migrations on enqueue)
4070 * move threads across clusters, they need support for handling clutch
4071 * bucket group level preferred cluster recommendations.
4072 * For (1), a clutch bucket will be in the foreign runnable queue based
4073 * on the clutch bucket group preferred cluster.
4074 * For (2), the running thread will set the bit on the processor based
4075 * on its preferred cluster type.
4076 * For (3), the edge configuration would prevent threads from being stolen
4077 * in the wrong direction.
4078 *
4079 * = SCHED(thread_should_yield) =
4080 * The thread_should_yield() logic needs to have the same logic as sched_edge_processor_idle()
4081 * since that is expecting the same answer as if thread_select() was called on a core
4082 * with an empty runqueue.
4083 */
4084
4085 static bool
sched_edge_steal_thread_enabled(__unused processor_set_t pset)
4087 {
4088 /*
4089 * For edge scheduler, the gating for steal is being done by sched_edge_steal_candidate()
4090 */
4091 return true;
4092 }
4093
4094 static processor_set_t
sched_edge_steal_candidate(processor_set_t pset)
4096 {
4097 uint32_t dst_cluster_id = pset->pset_cluster_id;
4098 for (int cluster_id = 0; cluster_id < sched_edge_max_clusters; cluster_id++) {
4099 processor_set_t candidate_pset = pset_array[cluster_id];
4100 if (cluster_id == dst_cluster_id) {
4101 continue;
4102 }
4103 if (candidate_pset == NULL) {
4104 continue;
4105 }
4106 sched_clutch_edge *incoming_edge = &pset_array[cluster_id]->sched_edges[dst_cluster_id];
4107 if (incoming_edge->sce_steal_allowed && (bitmap_lsb_first(candidate_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) != -1)) {
4108 return candidate_pset;
4109 }
4110 }
4111 return NULL;
4112 }
4113
4114 static boolean_t
sched_edge_foreign_runnable_thread_available(processor_set_t pset)
4116 {
4117 /* Find all the clusters that are foreign for this cluster */
4118 bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
4119 for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
4120 /*
4121 * For each cluster, see if there are any runnable foreign threads.
4122 * This check is currently being done without the pset lock to make it cheap for
4123 * the common case.
4124 */
4125 processor_set_t target_pset = pset_array[cluster];
4126 if (sched_edge_pset_available(target_pset) == false) {
4127 continue;
4128 }
4129
4130 if (!sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
4131 return true;
4132 }
4133 }
4134 return false;
4135 }
4136
4137 static thread_t
sched_edge_foreign_runnable_thread_remove(processor_set_t pset, uint64_t ctime)
4139 {
4140 thread_t thread = THREAD_NULL;
4141
4142 /* Find all the clusters that are foreign for this cluster */
4143 bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
4144 for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
4145 /*
4146 * For each cluster, see if there are any runnable foreign threads.
4147 * This check is currently being done without the pset lock to make it cheap for
4148 * the common case.
4149 */
4150 processor_set_t target_pset = pset_array[cluster];
4151 if (sched_edge_pset_available(target_pset) == false) {
4152 continue;
4153 }
4154
4155 if (sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
4156 continue;
4157 }
4158 /*
4159 * Looks like there are runnable foreign threads in the hierarchy; lock the pset
4160 * and get the highest priority thread.
4161 */
4162 pset_lock(target_pset);
4163 if (sched_edge_pset_available(target_pset)) {
4164 thread = sched_clutch_root_highest_foreign_thread_remove(&target_pset->pset_clutch_root);
4165 sched_update_pset_load_average(target_pset, ctime);
4166 }
4167 pset_unlock(target_pset);
4168
4169 /*
4170 * Edge Scheduler Optimization
4171 *
4172 * The current implementation immediately returns as soon as it finds a foreign
4173 * runnable thread. This could be enhanced to look at highest priority threads
4174 * from all foreign clusters and pick the highest amongst them. That would need
4175 * some form of global state across psets to make that kind of a check cheap.
4176 */
4177 if (thread != THREAD_NULL) {
4178 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNABLE) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, target_pset->pset_cluster_id, 0);
4179 break;
4180 }
4181 /* Looks like the thread escaped after the check but before the pset lock was taken; continue the search */
4182 }
4183
4184 return thread;
4185 }
4186
4187 /*
4188 * sched_edge_cpu_running_foreign_shared_rsrc_available()
4189 *
4190 * Routine to determine if the thread running on a CPU is a shared resource thread
4191 * and can be rebalanced to the cluster with an idle CPU. It is used to determine if
4192 * a CPU going idle on a pset should rebalance a running shared resource heavy thread
 * from another non-ideal cluster, based on the idle pset's shared resource load.
4194 */
4195 static boolean_t
sched_edge_cpu_running_foreign_shared_rsrc_available(processor_set_t target_pset, int foreign_cpu, processor_set_t idle_pset)
4197 {
4198 boolean_t idle_pset_shared_rsrc_rr_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_RR);
4199 if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_RR], foreign_cpu) && !idle_pset_shared_rsrc_rr_idle) {
4200 return false;
4201 }
4202
4203 boolean_t idle_pset_shared_rsrc_biu_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST);
4204 if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST], foreign_cpu) && !idle_pset_shared_rsrc_biu_idle) {
4205 return false;
4206 }
4207 return true;
4208 }
4209
4210 static boolean_t
sched_edge_foreign_running_thread_available(processor_set_t pset)
4212 {
4213 bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
4214 int cluster = -1;
4215 while ((cluster = sched_edge_iterate_clusters_ordered(pset, foreign_pset_bitmap[0], cluster)) != -1) {
		/* Skip the pset if it's not schedulable */
4217 processor_set_t target_pset = pset_array[cluster];
4218 if (sched_edge_pset_available(target_pset) == false) {
4219 continue;
4220 }
4221
4222 uint64_t running_foreign_bitmap = target_pset->cpu_state_map[PROCESSOR_RUNNING] & target_pset->cpu_running_foreign;
4223 for (int cpu_foreign = bit_first(running_foreign_bitmap); cpu_foreign >= 0; cpu_foreign = bit_next(running_foreign_bitmap, cpu_foreign)) {
4224 if (!sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpu_foreign, pset)) {
4225 continue;
4226 }
4227 return true;
4228 }
4229 }
4230 return false;
4231 }
4232
4233 /*
4234 * sched_edge_steal_possible()
4235 *
4236 * Determines whether we can and should steal a thread from
4237 * the candidate_pset to run it on the idle_pset. When returning
4238 * true, the function also writes the scheduling bucket that we
4239 * should steal from into the bucket_for_steal out parameter.
4240 *
4241 * Always called with the pset lock for candidate_pset held.
4242 */
4243 static bool
sched_edge_steal_possible(processor_set_t idle_pset, processor_set_t candidate_pset, sched_bucket_t *bucket_for_steal)
4245 {
4246 sched_clutch_root_t candidate_clutch_root = &candidate_pset->pset_clutch_root;
4247
4248 int highest_runnable_bucket = sched_clutch_root_highest_runnable_qos(candidate_clutch_root, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY);
4249 if (highest_runnable_bucket == -1) {
4250 /* Candidate cluster runq is empty of unbound threads */
4251 return false;
4252 }
4253
4254 if (idle_pset->pset_type == candidate_pset->pset_type) {
4255 /* Always allow stealing from homogeneous clusters */
4256 *bucket_for_steal = (sched_bucket_t)highest_runnable_bucket;
4257 return true;
4258 }
4259
4260 for (int unbound_qos = highest_runnable_bucket; unbound_qos >= 0; unbound_qos = bitmap_lsb_next(candidate_clutch_root->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX, unbound_qos)) {
4261 uint32_t candidate_runq_depth = os_atomic_load(&candidate_pset->pset_runnable_depth[unbound_qos], relaxed);
4262 if (candidate_runq_depth > pset_available_cpu_count(candidate_pset)) {
4263 /* Candidate cluster has excess load at this QoS (and at least one unbound thread we can steal!) */
4264 *bucket_for_steal = (sched_bucket_t)unbound_qos;
4265 return true;
4266 }
4267 }
4268 /* None of the unbound root buckets are overloaded */
4269 return false;
4270 }
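
/*
 * Illustrative sketch (not part of the kernel build): for heterogeneous clusters, the
 * check above treats a candidate as stealable at a given QoS only when its runnable
 * depth exceeds its available CPU count. The standalone model below uses hypothetical
 * numbers to show that decision.
 *
 *     #include <stdbool.h>
 *     #include <stdint.h>
 *     #include <stdio.h>
 *
 *     static bool
 *     qos_level_overcommitted(uint32_t runnable_depth, uint32_t available_cpus)
 *     {
 *         return runnable_depth > available_cpus;
 *     }
 *
 *     int
 *     main(void)
 *     {
 *         // 6 runnable threads at this QoS on a 4-CPU candidate cluster: worth stealing.
 *         printf("%d\n", qos_level_overcommitted(6, 4));  // 1
 *         // 3 runnable threads on 4 CPUs: leave them where they are.
 *         printf("%d\n", qos_level_overcommitted(3, 4));  // 0
 *         return 0;
 *     }
 */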
4271
4272 static thread_t
sched_edge_steal_thread(processor_set_t pset, uint64_t candidate_pset_bitmap)
4274 {
4275 thread_t stolen_thread = THREAD_NULL;
4276
4277 /*
4278 * Edge Scheduler Optimization
4279 *
4280 * The logic today bails as soon as it finds a cluster where the cluster load is
4281 * greater than the edge weight. Maybe it should have a more advanced version
4282 * which looks for the maximum delta etc.
4283 */
4284 int cluster_id = -1;
4285 while ((cluster_id = sched_edge_iterate_clusters_ordered(pset, candidate_pset_bitmap, cluster_id)) != -1) {
4286 processor_set_t steal_from_pset = pset_array[cluster_id];
4287 if (steal_from_pset == NULL) {
4288 continue;
4289 }
4290 sched_clutch_edge *incoming_edge = &pset_array[cluster_id]->sched_edges[pset->pset_cluster_id];
4291 if (incoming_edge->sce_steal_allowed == false) {
4292 continue;
4293 }
4294 pset_lock(steal_from_pset);
4295 sched_bucket_t bucket_for_steal;
4296 if (sched_edge_steal_possible(pset, steal_from_pset, &bucket_for_steal)) {
4297 uint64_t current_timestamp = mach_absolute_time();
4298 sched_clutch_root_t clutch_root_for_steal = &steal_from_pset->pset_clutch_root;
4299 stolen_thread = sched_clutch_thread_unbound_lookup(clutch_root_for_steal, &clutch_root_for_steal->scr_unbound_buckets[bucket_for_steal], NULL, NULL);
4300 sched_clutch_thread_remove(clutch_root_for_steal, stolen_thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
4301
4302 sched_clutch_dbg_thread_select_packed_t debug_info = {0};
4303 debug_info.trace_data.version = SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION;
4304 debug_info.trace_data.traverse_mode = SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY;
4305 debug_info.trace_data.cluster_id = steal_from_pset->pset_cluster_id;
4306 debug_info.trace_data.selection_was_cluster_bound = false;
4307 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
4308 thread_tid(stolen_thread), thread_group_get_id(stolen_thread->thread_group), bucket_for_steal, debug_info.scdts_trace_data_packed, 0);
4309 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STEAL) | DBG_FUNC_NONE, thread_tid(stolen_thread), pset->pset_cluster_id, steal_from_pset->pset_cluster_id, 0);
4310
4311 sched_update_pset_load_average(steal_from_pset, current_timestamp);
4312 }
4313 pset_unlock(steal_from_pset);
4314 if (stolen_thread != THREAD_NULL) {
4315 break;
4316 }
4317 }
4318 return stolen_thread;
4319 }
4320
4321 /*
4322 * sched_edge_processor_idle()
4323 *
4324 * The routine is the implementation for steal_thread() for the Edge scheduler.
4325 */
4326 static thread_t
sched_edge_processor_idle(processor_set_t pset)
4328 {
4329 thread_t thread = THREAD_NULL;
4330
4331 uint64_t ctime = mach_absolute_time();
4332
4333 processor_t processor = current_processor();
4334 bit_clear(pset->pending_spill_cpu_mask, processor->cpu_id);
4335
4336 /* Each of the operations acquire the lock for the pset they target */
4337 pset_unlock(pset);
4338
4339 /* Find highest priority runnable thread on all non-native clusters */
4340 thread = sched_edge_foreign_runnable_thread_remove(pset, ctime);
4341 if (thread != THREAD_NULL) {
4342 return thread;
4343 }
4344
4345 /* Find highest priority runnable thread on all native clusters */
4346 thread = sched_edge_steal_thread(pset, pset->native_psets[0]);
4347 if (thread != THREAD_NULL) {
4348 return thread;
4349 }
4350
4351 /* Find foreign running threads to rebalance; the actual rebalance is done in sched_edge_balance() */
4352 boolean_t rebalance_needed = sched_edge_foreign_running_thread_available(pset);
4353 if (rebalance_needed) {
4354 return THREAD_NULL;
4355 }
4356
4357 /* No foreign threads found; find a thread to steal from all clusters based on weights/loads etc. */
4358 thread = sched_edge_steal_thread(pset, pset->native_psets[0] | pset->foreign_psets[0]);
4359 return thread;
4360 }
4361
4362 /* Return true if this shared resource thread has a better cluster to run on */
4363 static bool
sched_edge_shared_rsrc_migrate_possible(thread_t thread, processor_set_t preferred_pset, processor_set_t current_pset)
4365 {
4366 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4367 uint64_t current_pset_load = sched_pset_cluster_shared_rsrc_load(current_pset, shared_rsrc_type);
4368 /*
4369 * Adjust the current pset load to discount the current thread only if the current pset is a preferred pset type. This allows the
4370 * scheduler to rebalance threads from non-preferred cluster to an idle cluster of the preferred type.
4371 *
4372 * Edge Scheduler Optimization
4373 * For multi-cluster machines, it might be useful to enhance this mechanism to migrate between clusters of the preferred type.
4374 */
4375 uint64_t current_pset_adjusted_load = (current_pset->pset_type != preferred_pset->pset_type) ? current_pset_load : (current_pset_load - 1);
4376
4377 uint64_t eligible_pset_bitmask = 0;
4378 if (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST) {
4379 /*
4380 * For the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy, the load balancing occurs
4381 * only among clusters native with the preferred cluster.
4382 */
4383 eligible_pset_bitmask = preferred_pset->native_psets[0];
4384 bit_set(eligible_pset_bitmask, preferred_pset->pset_cluster_id);
4385 } else {
4386 /* For EDGE_SHARED_RSRC_SCHED_POLICY_RR, the load balancing happens among all clusters */
4387 eligible_pset_bitmask = sched_edge_available_pset_bitmask[0];
4388 }
4389
4390 /* For each eligible cluster check if there is an under-utilized cluster; return true if there is */
4391 for (int cluster_id = bit_first(eligible_pset_bitmask); cluster_id >= 0; cluster_id = bit_next(eligible_pset_bitmask, cluster_id)) {
4392 if (cluster_id == current_pset->pset_cluster_id) {
4393 continue;
4394 }
4395 uint64_t cluster_load = sched_pset_cluster_shared_rsrc_load(pset_array[cluster_id], shared_rsrc_type);
4396 if (current_pset_adjusted_load > cluster_load) {
4397 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHARED_RSRC_MIGRATE) | DBG_FUNC_NONE, current_pset_load, current_pset->pset_cluster_id, cluster_load, cluster_id);
4398 return true;
4399 }
4400 }
4401 return false;
4402 }
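
/*
 * Illustrative sketch (not part of the kernel build): the comparison above discounts
 * the current thread from its own cluster's shared-resource load (when that cluster
 * is of the preferred type) before looking for a less loaded eligible cluster. A
 * hypothetical numeric example:
 *
 *     Preferred-type cluster P is running 3 shared-resource threads (including this
 *     one), so its adjusted load is 3 - 1 = 2. Eligible peer cluster Q has a load of
 *     1. Since 2 > 1, the routine reports that a better cluster exists and migration
 *     is recommended; had Q's load also been 2, the thread would stay put.
 */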
4403
4404 /* Return true if this thread should not continue running on this processor */
4405 static bool
sched_edge_thread_avoid_processor(processor_t processor, thread_t thread, ast_t reason)
4407 {
4408 if (thread->bound_processor == processor) {
4409 /* Thread is bound here */
4410 return false;
4411 }
4412
4413 /*
4414 * On quantum expiry, check the migration bitmask if this thread should be migrated off this core.
4415 * A migration is only recommended if there's also an idle core available that needn't be avoided.
4416 */
4417 if (reason & AST_QUANTUM) {
4418 if (bit_test(processor->processor_set->perfcontrol_cpu_migration_bitmask, processor->cpu_id)) {
4419 uint64_t non_avoided_idle_primary_map = processor->processor_set->cpu_state_map[PROCESSOR_IDLE] & processor->processor_set->recommended_bitmask & ~processor->processor_set->perfcontrol_cpu_migration_bitmask;
4420 if (non_avoided_idle_primary_map != 0) {
4421 return true;
4422 }
4423 }
4424 }
4425
4426 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
4427
4428 /* Evaluate shared resource policies */
4429 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
4430 return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
4431 }
4432 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
4433 if (processor->processor_set->pset_type != preferred_pset->pset_type &&
4434 pset_type_is_recommended(preferred_pset)) {
4435 return true;
4436 }
4437 return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
4438 }
4439
4440 /*
4441 * For long running parallel workloads, it is important to rebalance threads across
4442 * E/P clusters so that they make equal forward progress. This is achieved through
4443 * threads expiring their quantum on the non-preferred cluster type and explicitly
4444 * rebalancing to the preferred cluster runqueue.
4445 */
4446 if ((processor->processor_set->pset_type != preferred_pset->pset_type) &&
4447 pset_type_is_recommended(preferred_pset)) {
4448 return true;
4449 }
4450 /* If the preferred pset for the thread is now idle, try and migrate thread to that cluster */
4451 if ((processor->processor_set != preferred_pset) &&
4452 (sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket) == 0)) {
4453 return true;
4454 }
4455
4456 return false;
4457 }
4458
4459 static bool
sched_edge_balance(__unused processor_t cprocessor, processor_set_t cpset)
4461 {
4462 assert(cprocessor == current_processor());
4463 pset_unlock(cpset);
4464
4465 uint64_t ast_processor_map = 0;
4466 sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
4467
4468 bitmap_t *foreign_pset_bitmap = cpset->foreign_psets;
4469 for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
		/* Skip the pset if it's not schedulable */
4471 processor_set_t target_pset = pset_array[cluster];
4472 if (sched_edge_pset_available(target_pset) == false) {
4473 continue;
4474 }
4475
4476 pset_lock(target_pset);
4477 uint64_t cpu_running_foreign_map = (target_pset->cpu_running_foreign & target_pset->cpu_state_map[PROCESSOR_RUNNING]);
4478 for (int cpuid = lsb_first(cpu_running_foreign_map); cpuid >= 0; cpuid = lsb_next(cpu_running_foreign_map, cpuid)) {
4479 if (!sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpuid, cpset)) {
4480 continue;
4481 }
4482 processor_t target_cpu = processor_array[cpuid];
4483 ipi_type[target_cpu->cpu_id] = sched_ipi_action(target_cpu, NULL, SCHED_IPI_EVENT_REBALANCE);
4484 if (ipi_type[cpuid] != SCHED_IPI_NONE) {
4485 bit_set(ast_processor_map, cpuid);
4486 }
4487 }
4488 pset_unlock(target_pset);
4489 }
4490
4491 for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
4492 processor_t ast_processor = processor_array[cpuid];
4493 sched_ipi_perform(ast_processor, ipi_type[cpuid]);
4494 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNING) | DBG_FUNC_NONE, 0, cprocessor->cpu_id, cpuid, 0);
4495 }
4496
4497 /* Core should light-weight idle using WFE if it just sent out rebalance IPIs */
4498 return ast_processor_map != 0;
4499 }
4500
4501 /*
4502 * sched_edge_migration_check()
4503 *
4504 * Routine to evaluate an edge between two clusters to decide if migration is possible
4505 * across that edge. Also updates the selected_pset and max_edge_delta out parameters
4506 * accordingly. The return value indicates if the invoking routine should short circuit
4507 * the search, since an ideal candidate has been found. The routine looks at the regular
4508 * edges and cluster loads or the shared resource loads based on the type of thread.
4509 */
4510 static bool
sched_edge_migration_check(uint32_t cluster_id, processor_set_t preferred_pset,
	uint32_t preferred_cluster_load, thread_t thread, processor_set_t *selected_pset, uint32_t *max_edge_delta)
4513 {
4514 uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
4515 cluster_type_t preferred_cluster_type = pset_type_for_id(preferred_cluster_id);
4516 processor_set_t dst_pset = pset_array[cluster_id];
4517 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4518 bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
4519
4520 if (cluster_id == preferred_cluster_id) {
4521 return false;
4522 }
4523
4524 if (dst_pset == NULL) {
4525 return false;
4526 }
4527
4528 sched_clutch_edge *edge = preferred_pset->sched_edges;
4529 if (edge[cluster_id].sce_migration_allowed == false) {
4530 return false;
4531 }
4532 uint32_t dst_load = shared_rsrc_thread ? (uint32_t)sched_pset_cluster_shared_rsrc_load(dst_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(dst_pset, thread->th_sched_bucket);
4533 if (dst_load == 0) {
4534 /* The candidate cluster is idle; select it immediately for execution */
4535 *selected_pset = dst_pset;
4536 *max_edge_delta = preferred_cluster_load;
4537 return true;
4538 }
4539
4540 uint32_t edge_delta = 0;
4541 if (dst_load > preferred_cluster_load) {
4542 return false;
4543 }
4544 edge_delta = preferred_cluster_load - dst_load;
4545 if (!shared_rsrc_thread && (edge_delta < edge[cluster_id].sce_migration_weight)) {
4546 /*
4547 * For non shared resource threads, use the edge migration weight to decide if
4548 * this cluster is over-committed at the QoS level of this thread.
4549 */
4550 return false;
4551 }
4552
4553 if (edge_delta < *max_edge_delta) {
4554 return false;
4555 }
4556 if (edge_delta == *max_edge_delta) {
4557 /* If the edge delta is the same as the max delta, make sure a homogeneous cluster is picked */
4558 boolean_t selected_homogeneous = (pset_type_for_id((*selected_pset)->pset_cluster_id) == preferred_cluster_type);
4559 boolean_t candidate_homogeneous = (pset_type_for_id(dst_pset->pset_cluster_id) == preferred_cluster_type);
4560 if (selected_homogeneous || !candidate_homogeneous) {
4561 return false;
4562 }
4563 }
4564 /* dst_pset seems to be the best candidate for migration; however other candidates should still be evaluated */
4565 *max_edge_delta = edge_delta;
4566 *selected_pset = dst_pset;
4567 return false;
4568 }
4569
4570 /*
4571 * sched_edge_iterate_clusters_ordered()
4572 *
4573 * Routine to iterate clusters in die local order. For multi-die machines,
4574 * the routine ensures that the candidate clusters on the same die as the
4575 * passed in pset are returned before the remote die clusters. This should
4576 * be used in all places where cluster selection in die order matters.
4577 */
4578
4579 static int
sched_edge_iterate_clusters_ordered(processor_set_t starting_pset, uint64_t candidate_map, int previous_cluster)
4581 {
4582 int cluster_id = -1;
4583
4584 uint64_t local_candidate_map = starting_pset->local_psets[0] & candidate_map;
4585 uint64_t remote_candidate_map = starting_pset->remote_psets[0] & candidate_map;
4586
4587 if (previous_cluster == -1) {
4588 /* previous_cluster == -1 indicates the initial condition */
4589 cluster_id = bit_first(local_candidate_map);
4590 if (cluster_id != -1) {
4591 return cluster_id;
4592 }
4593 return bit_first(remote_candidate_map);
4594 } else {
4595 /*
4596 * After the initial condition, the routine attempts to return a
4597 * cluster in the previous_cluster's locality. If none is available,
4598 * it looks at remote clusters.
4599 */
4600 if (bit_test(local_candidate_map, previous_cluster)) {
4601 cluster_id = bit_next(local_candidate_map, previous_cluster);
4602 if (cluster_id != -1) {
4603 return cluster_id;
4604 } else {
4605 return bit_first(remote_candidate_map);
4606 }
4607 }
4608 return bit_next(remote_candidate_map, previous_cluster);
4609 }
4610 }
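
/*
 * Illustrative sketch (not part of the kernel build): a standalone model of the
 * die-local iteration order above, visiting every candidate in a local mask before
 * any candidate in a remote mask. The bit helpers below are simple ascending-order
 * stand-ins for the kernel's bitmap iterators, not the same functions.
 *
 *     #include <stdint.h>
 *     #include <stdio.h>
 *
 *     static int
 *     first_bit(uint64_t map)
 *     {
 *         return map ? __builtin_ctzll(map) : -1;
 *     }
 *
 *     static int
 *     next_bit(uint64_t map, int prev)
 *     {
 *         if (prev >= 63) {
 *             return -1;
 *         }
 *         uint64_t higher = map & ~((1ULL << (prev + 1)) - 1);
 *         return first_bit(higher);
 *     }
 *
 *     int
 *     main(void)
 *     {
 *         uint64_t local = 0x06;   // clusters 1 and 2 share the die
 *         uint64_t remote = 0x18;  // clusters 3 and 4 are on a remote die
 *         for (int c = first_bit(local); c >= 0; c = next_bit(local, c)) {
 *             printf("local cluster %d\n", c);
 *         }
 *         for (int c = first_bit(remote); c >= 0; c = next_bit(remote, c)) {
 *             printf("remote cluster %d\n", c);
 *         }
 *         return 0;
 *     }
 */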
4611
4612 /*
4613 * sched_edge_migrate_edges_evaluate()
4614 *
4615 * Routine to find the candidate for thread migration based on edge weights.
4616 *
4617 * Returns the most ideal cluster for execution of this thread based on outgoing edges of the preferred pset. Can
4618 * return preferred_pset if its the most ideal destination for this thread.
4619 */
4620 static processor_set_t
sched_edge_migrate_edges_evaluate(processor_set_t preferred_pset, uint32_t preferred_cluster_load, thread_t thread)
4622 {
4623 processor_set_t selected_pset = preferred_pset;
4624 uint32_t max_edge_delta = 0;
4625 bool search_complete = false;
4626 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4627 bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
4628
4629 bitmap_t *foreign_pset_bitmap = preferred_pset->foreign_psets;
4630 bitmap_t *native_pset_bitmap = preferred_pset->native_psets;
4631 /* Always start the search with the native clusters */
4632 int cluster_id = -1;
4633 while ((cluster_id = sched_edge_iterate_clusters_ordered(preferred_pset, native_pset_bitmap[0], cluster_id)) != -1) {
4634 search_complete = sched_edge_migration_check(cluster_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
4635 if (search_complete) {
4636 break;
4637 }
4638 }
4639
4640 if (search_complete) {
4641 return selected_pset;
4642 }
4643
4644 if (shared_rsrc_thread && (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST)) {
4645 /*
4646 * If the shared resource scheduling policy is EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST, the scheduler tries
4647 * to fill up the preferred cluster and its homogeneous peers first.
4648 */
4649
4650 if (max_edge_delta > 0) {
4651 /*
			 * There is a peer cluster of the same type as the preferred cluster (since the
			 * code above only looks at the native_psets) which has fewer threads of the
			 * shared resource type than the preferred cluster. This indicates that there is
			 * capacity on a native cluster where this thread should be placed.
4656 */
4657 return selected_pset;
4658 }
4659 /*
4660 * Indicates that all peer native clusters are at the same shared resource usage; check if the preferred cluster has
4661 * any more capacity left.
4662 */
4663 if (sched_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) < pset_available_cpu_count(preferred_pset)) {
4664 return preferred_pset;
4665 }
4666 /*
4667 * Looks like the preferred cluster and all its native peers are full with shared resource threads; need to start looking
4668 * at non-native clusters for capacity.
4669 */
4670 }
4671
4672 /* Now look at the non-native clusters */
4673 cluster_id = -1;
4674 while ((cluster_id = sched_edge_iterate_clusters_ordered(preferred_pset, foreign_pset_bitmap[0], cluster_id)) != -1) {
4675 search_complete = sched_edge_migration_check(cluster_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
4676 if (search_complete) {
4677 break;
4678 }
4679 }
4680 return selected_pset;
4681 }
4682
4683 /*
4684 * sched_edge_candidate_alternative()
4685 *
4686 * Routine to find an alternative cluster from candidate_cluster_bitmap since the
4687 * selected_pset is not available for execution. The logic tries to prefer homogeneous
4688 * clusters over heterogeneous clusters since this is typically used in thread
4689 * placement decisions.
4690 */
4691 _Static_assert(MAX_PSETS <= 64, "Unable to fit maximum number of psets in uint64_t bitmask");
4692 static processor_set_t
sched_edge_candidate_alternative(processor_set_t selected_pset, uint64_t candidate_cluster_bitmap)
4694 {
4695 /*
4696 * It looks like the most ideal pset is not available for scheduling currently.
4697 * Try to find a homogeneous cluster that is still available.
4698 */
4699 uint64_t available_native_clusters = selected_pset->native_psets[0] & candidate_cluster_bitmap;
4700 int available_cluster_id = lsb_first(available_native_clusters);
4701 if (available_cluster_id == -1) {
4702 /* Looks like none of the homogeneous clusters are available; pick the first available cluster */
4703 available_cluster_id = bit_first(candidate_cluster_bitmap);
4704 }
4705 assert(available_cluster_id != -1);
4706 return pset_array[available_cluster_id];
4707 }
4708
4709 /*
4710 * sched_edge_switch_pset_lock()
4711 *
4712 * Helper routine for sched_edge_migrate_candidate() which switches pset locks (if needed) based on
4713 * switch_pset_locks.
4714 * Returns the newly locked pset after the switch.
4715 */
4716 static processor_set_t
sched_edge_switch_pset_lock(processor_set_t selected_pset, processor_set_t locked_pset, bool switch_pset_locks)
4718 {
4719 if (!switch_pset_locks) {
4720 return locked_pset;
4721 }
4722 if (selected_pset != locked_pset) {
4723 pset_unlock(locked_pset);
4724 pset_lock(selected_pset);
4725 return selected_pset;
4726 } else {
4727 return locked_pset;
4728 }
4729 }
4730
4731 /*
4732 * sched_edge_amp_rebalance_pset()
4733 *
4734 * Routine to decide where a thread which is eligible for AMP rebalance (i.e.
4735 * has executed on non-preferred cluster type for a while) should be enqueued.
4736 * The algorithm maintains a history of AMP rebalance decisions on the clutch
4737 * bucket group of the workload and round-robins between clusters to ensure
4738 * that all threads get a chance on the performance cores and make equal
4739 * progress.
4740 */
4741 static processor_set_t
sched_edge_amp_rebalance_pset(processor_set_t preferred_pset, thread_t thread)
4743 {
4744 sched_clutch_t clutch = sched_clutch_for_thread(thread);
4745 sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[thread->th_sched_bucket];
4746
4747 uint32_t last_chosen_cluster, new_chosen_cluster;
4748
4749 /* Only AMP rebalance within clusters native to the preferred cluster */
4750 uint64_t eligible_pset_bitmask = preferred_pset->native_psets[0];
4751 /* Preferred cluster is also eligible for rebalancing */
4752 bit_set(eligible_pset_bitmask, preferred_pset->pset_cluster_id);
4753 /* Atomically update the AMP rebalance cluster for the clutch bucket group */
4754 os_atomic_rmw_loop(&clutch_bucket_group->scbg_amp_rebalance_last_chosen, last_chosen_cluster, new_chosen_cluster, relaxed, {
4755 if (last_chosen_cluster == UINT32_MAX) {
4756 new_chosen_cluster = preferred_pset->pset_cluster_id;
4757 } else {
4758 new_chosen_cluster = lsb_next(eligible_pset_bitmask, last_chosen_cluster);
4759 if (new_chosen_cluster == -1) {
4760 /* Rotate to the start of the eligible bitmask */
4761 new_chosen_cluster = lsb_first(eligible_pset_bitmask);
4762 }
4763 }
4764 });
4765 return pset_array[new_chosen_cluster];
4766 }
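
/*
 * Illustrative sketch (not part of the kernel build): the round-robin choice above
 * rotates through the eligible cluster bitmask, wrapping back to the lowest set bit
 * once the end is reached. The standalone model below shows three successive picks
 * over a hypothetical mask; it deliberately ignores the atomicity the kernel needs.
 *
 *     #include <stdint.h>
 *     #include <stdio.h>
 *
 *     static int
 *     lsb_first_bit(uint64_t map)
 *     {
 *         return map ? __builtin_ctzll(map) : -1;
 *     }
 *
 *     static int
 *     rotate_next(uint64_t eligible, int last_chosen)
 *     {
 *         uint64_t higher = (last_chosen >= 63) ? 0 : (eligible & ~((1ULL << (last_chosen + 1)) - 1));
 *         int next = lsb_first_bit(higher);
 *         return (next >= 0) ? next : lsb_first_bit(eligible);
 *     }
 *
 *     int
 *     main(void)
 *     {
 *         uint64_t eligible = 0x0D;   // clusters 0, 2 and 3 are eligible
 *         int chosen = 0;
 *         for (int i = 0; i < 3; i++) {
 *             chosen = rotate_next(eligible, chosen);
 *             printf("rebalance to cluster %d\n", chosen);   // 2, 3, then 0 again
 *         }
 *         return 0;
 *     }
 */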
4767
4768 /*
4769 * sched_edge_migrate_candidate()
4770 *
4771 * Routine to find an appropriate cluster for scheduling a thread. The routine looks at the properties of
4772 * the thread and the preferred cluster to determine the best available pset for scheduling.
4773 *
4774 * The switch_pset_locks parameter defines whether the routine should switch pset locks to provide an
4775 * accurate scheduling decision. This mode is typically used when choosing a pset for scheduling a thread since the
4776 * decision has to be synchronized with another CPU changing the recommendation of clusters available
4777 * on the system. If this parameter is set to false, this routine returns the best effort indication of
4778 * the cluster the thread should be scheduled on. It is typically used in fast path contexts (such as
 * SCHED(thread_avoid_processor)) to determine if there is a possibility of scheduling this thread on a
4780 * more appropriate cluster.
4781 *
4782 * Routine returns the most ideal cluster for scheduling. If switch_pset_locks is set, it ensures that the
4783 * resultant pset lock is held.
4784 */
4785 static processor_set_t
sched_edge_migrate_candidate(processor_set_t _Nullable preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks)
4787 {
4788 processor_set_t selected_pset = preferred_pset;
4789 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4790 bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
4791
4792 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
4793 /* For bound threads always recommend the cluster it's bound to */
4794 selected_pset = pset_array[sched_edge_thread_bound_cluster_id(thread)];
4795 if (selected_pset != NULL) {
4796 locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
4797 if (sched_edge_pset_available(selected_pset) || (SCHED_CLUTCH_THREAD_CLUSTER_BOUND_SOFT(thread) == false)) {
4798 /*
4799 * If the bound cluster is not available, check if the thread is soft bound. For soft bound threads,
4800 * fall through to the regular cluster selection logic which handles unavailable clusters
4801 * appropriately. If the thread is hard bound, then return the bound cluster always.
4802 */
4803 return selected_pset;
4804 }
4805 }
4806 }
4807
4808 uint64_t candidate_cluster_bitmap = mask(sched_edge_max_clusters);
4809 #if DEVELOPMENT || DEBUG
4810 extern int enable_task_set_cluster_type;
4811 task_t task = get_threadtask(thread);
4812 if (enable_task_set_cluster_type && (task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) {
4813 processor_set_t pset_hint = task->pset_hint;
4814 if (pset_hint && (selected_pset == NULL || selected_pset->pset_cluster_type != pset_hint->pset_cluster_type)) {
4815 selected_pset = pset_hint;
4816 goto migrate_candidate_available_check;
4817 }
4818 }
4819 #endif
4820
4821 if (preferred_pset == NULL) {
4822 /* The preferred_pset has not finished initializing at boot */
4823 goto migrate_candidate_available_check;
4824 }
4825
4826 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
4827 /* For realtime threads, try and schedule them on the preferred pset always */
4828 goto migrate_candidate_available_check;
4829 }
4830
4831 uint32_t preferred_cluster_load = shared_rsrc_thread ? (uint32_t)sched_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket);
4832 if (preferred_cluster_load == 0) {
4833 goto migrate_candidate_available_check;
4834 }
4835
4836 /*
4837 * If a thread is being rebalanced for achieving equal progress of parallel workloads,
4838 * it needs to end up on the preferred runqueue. This mechanism should only be used for
4839 * threads which have been previously migrated to the non-preferred cluster type.
4840 *
4841 * The AMP rebalancing mechanism is available for regular threads or shared resource
4842 * threads with the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy.
4843 */
4844 bool amp_rebalance_eligible = (!shared_rsrc_thread) || (shared_rsrc_thread && (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST));
4845 if (amp_rebalance_eligible) {
4846 boolean_t amp_rebalance = (thread->reason & (AST_REBALANCE | AST_QUANTUM)) == (AST_REBALANCE | AST_QUANTUM);
4847 if (amp_rebalance) {
4848 boolean_t non_preferred_pset = (thread->last_processor->processor_set->pset_type != preferred_pset->pset_type);
4849 if (non_preferred_pset) {
4850 selected_pset = sched_edge_amp_rebalance_pset(preferred_pset, thread);
4851 goto migrate_candidate_available_check;
4852 }
4853 }
4854 }
4855
4856 /* Look at edge weights to decide the most ideal migration candidate for this thread */
4857 selected_pset = sched_edge_migrate_edges_evaluate(preferred_pset, preferred_cluster_load, thread);
4858
4859 migrate_candidate_available_check:
4860 if (selected_pset == NULL) {
4861 /* The selected_pset has not finished initializing at boot */
4862 pset_unlock(locked_pset);
4863 return NULL;
4864 }
4865
4866 locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
4867 if (sched_edge_pset_available(selected_pset) == true) {
4868 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_OVERLOAD) | DBG_FUNC_NONE, thread_tid(thread), preferred_pset->pset_cluster_id, selected_pset->pset_cluster_id, preferred_cluster_load);
4869 return selected_pset;
4870 }
4871 /* Looks like selected_pset is not available for scheduling; remove it from candidate_cluster_bitmap */
4872 bitmap_clear(&candidate_cluster_bitmap, selected_pset->pset_cluster_id);
4873 if (__improbable(bitmap_first(&candidate_cluster_bitmap, sched_edge_max_clusters) == -1)) {
4874 pset_unlock(locked_pset);
4875 return NULL;
4876 }
4877 /* Try and find an alternative for the selected pset */
4878 selected_pset = sched_edge_candidate_alternative(selected_pset, candidate_cluster_bitmap);
4879 goto migrate_candidate_available_check;
4880 }
4881
4882 static processor_t
sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread)
4884 {
4885 /* Bound threads don't call this function */
4886 assert(thread->bound_processor == PROCESSOR_NULL);
4887 processor_t chosen_processor = PROCESSOR_NULL;
4888
4889 /*
4890 * sched_edge_preferred_pset() returns the preferred pset for a given thread.
4891 * It should take the passed in "pset" as a hint which represents the recency metric for
4892 * pset selection logic.
4893 */
4894 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
4895 processor_set_t chosen_pset = preferred_pset;
4896 /*
4897 * If the preferred pset is overloaded, find a pset which is the best candidate to migrate
4898 * threads to. sched_edge_migrate_candidate() returns the preferred pset
4899 * if it has capacity; otherwise finds the best candidate pset to migrate this thread to.
4900 *
4901 * Edge Scheduler Optimization
4902 * It might be useful to build a recency metric for the thread for multiple clusters and
4903 * factor that into the migration decisions.
4904 */
4905 chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, pset, true);
4906 if (chosen_pset) {
4907 chosen_processor = choose_processor(chosen_pset, processor, thread);
4908 }
4909 /* For RT threads, choose_processor() can return a different cluster than the one passed into it */
4910 assert(chosen_processor ? chosen_processor->processor_set->pset_type == chosen_pset->pset_type : true);
4911 return chosen_processor;
4912 }
4913
4914 /*
4915 * sched_edge_clutch_bucket_threads_drain()
4916 *
4917 * Drains all the runnable threads which are not restricted to the root_clutch (due to clutch
4918 * bucket overrides etc.) into a local thread queue.
4919 */
4920 static void
sched_edge_clutch_bucket_threads_drain(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch, queue_t clutch_threads)
4922 {
4923 thread_t thread = THREAD_NULL;
4924 uint64_t current_timestamp = mach_approximate_time();
4925 qe_foreach_element_safe(thread, &clutch_bucket->scb_thread_timeshare_queue, th_clutch_timeshare_link) {
4926 sched_clutch_thread_remove(root_clutch, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
4927 enqueue_tail(clutch_threads, &thread->runq_links);
4928 }
4929 }
4930
4931 /*
4932 * sched_edge_run_drained_threads()
4933 *
4934 * Makes all drained threads in a local queue runnable.
4935 */
4936 static void
sched_edge_run_drained_threads(queue_t clutch_threads)
4938 {
4939 thread_t thread;
4940 /* Now setrun all the threads in the local queue */
4941 qe_foreach_element_safe(thread, clutch_threads, runq_links) {
4942 remqueue(&thread->runq_links);
4943 thread_lock(thread);
4944 thread_setrun(thread, SCHED_TAILQ);
4945 thread_unlock(thread);
4946 }
4947 }
4948
4949 /*
4950 * sched_edge_update_preferred_cluster()
4951 *
4952 * Routine to update the preferred cluster for QoS buckets within a thread group.
 * The buckets to be updated are specified as a bitmap (clutch_bucket_modify_bitmap).
4954 */
4955 static void
sched_edge_update_preferred_cluster(
	sched_clutch_t sched_clutch,
	bitmap_t *clutch_bucket_modify_bitmap,
	uint32_t *tg_bucket_preferred_cluster)
4960 {
4961 for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
4962 os_atomic_store(&sched_clutch->sc_clutch_groups[bucket].scbg_preferred_cluster, tg_bucket_preferred_cluster[bucket], relaxed);
4963 }
4964 }
4965
4966 /*
4967 * sched_edge_migrate_thread_group_runnable_threads()
4968 *
4969 * Routine to implement the migration of threads on a cluster when the thread group
4970 * recommendation is updated. The migration works using a 2-phase
4971 * algorithm.
4972 *
4973 * Phase 1: With the pset lock held, check the recommendation of the clutch buckets.
4974 * For each clutch bucket, if it needs to be migrated immediately, drain the threads
4975 * into a local thread queue. Otherwise mark the clutch bucket as native/foreign as
4976 * appropriate.
4977 *
4978 * Phase 2: After unlocking the pset, drain all the threads from the local thread
4979 * queue and mark them runnable which should land them in the right hierarchy.
4980 *
4981 * The routine assumes that the preferences for the clutch buckets/clutch bucket
4982 * groups have already been updated by the caller.
4983 *
4984 * - Called with the pset locked and interrupts disabled.
4985 * - Returns with the pset unlocked.
4986 */
4987 static void
4988 sched_edge_migrate_thread_group_runnable_threads(
4989 sched_clutch_t sched_clutch,
4990 sched_clutch_root_t root_clutch,
4991 bitmap_t *clutch_bucket_modify_bitmap,
4992 __unused uint32_t *tg_bucket_preferred_cluster,
4993 bool migrate_immediately)
4994 {
4995 /* Queue to hold threads that have been drained from clutch buckets to be migrated */
4996 queue_head_t clutch_threads;
4997 queue_init(&clutch_threads);
4998
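/* Phase 1: with the pset lock held, drain or re-mark each clutch bucket flagged in clutch_bucket_modify_bitmap */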
4999 for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
5000 /* Get the clutch bucket for this cluster and sched bucket */
5001 sched_clutch_bucket_group_t clutch_bucket_group = &(sched_clutch->sc_clutch_groups[bucket]);
5002 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
5003 sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
5004 if (scb_root == NULL) {
5005 /* Clutch bucket not runnable or already in the right hierarchy; nothing to do here */
5006 assert(clutch_bucket->scb_thr_count == 0);
5007 continue;
5008 }
5009 assert(scb_root == root_clutch);
5010 uint32_t clutch_bucket_preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
5011
5012 if (migrate_immediately) {
5013 /*
5014 * For transitions where threads need to be migrated immediately, drain the threads into a
5015 * local queue unless we are looking at the clutch buckets for the newly recommended
5016 * cluster.
5017 */
5018 if (root_clutch->scr_cluster_id != clutch_bucket_preferred_cluster) {
5019 sched_edge_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads);
5020 } else {
5021 sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
5022 }
5023 } else {
5024 /* Check if this cluster is the same type as the newly recommended cluster */
5025 boolean_t homogeneous_cluster = (pset_type_for_id(root_clutch->scr_cluster_id) == pset_type_for_id(clutch_bucket_preferred_cluster));
5026 /*
5027 * If threads do not have to be migrated immediately, just change the native/foreign
5028 * flag on the clutch bucket.
5029 */
5030 if (homogeneous_cluster) {
5031 sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
5032 } else {
5033 sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
5034 }
5035 }
5036 }
5037
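/* Phase 2: drop the pset lock, then make the drained threads runnable so they enqueue in their newly preferred hierarchy */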
5038 pset_unlock(root_clutch->scr_pset);
5039 sched_edge_run_drained_threads(&clutch_threads);
5040 }
5041
5042 /*
5043 * sched_edge_migrate_thread_group_running_threads()
5044 *
5045 * Routine to find all running threads of a thread group on a specific cluster
5046 * and IPI them if they need to be moved immediately.
5047 */
5048 static void
5049 sched_edge_migrate_thread_group_running_threads(
5050 sched_clutch_t sched_clutch,
5051 sched_clutch_root_t root_clutch,
5052 __unused bitmap_t *clutch_bucket_modify_bitmap,
5053 uint32_t *tg_bucket_preferred_cluster,
5054 bool migrate_immediately)
5055 {
5056 if (migrate_immediately == false) {
5057 /* If CLPC has recommended not to move threads immediately, nothing to do here */
5058 return;
5059 }
5060
5061 /*
5062 * Edge Scheduler Optimization
5063 *
5064 * When the system has a large number of clusters and cores, it might be useful to
5065 * narrow down the iteration by using a thread running bitmap per clutch.
5066 */
5067 uint64_t ast_processor_map = 0;
5068 sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
5069
5070 uint64_t running_map = root_clutch->scr_pset->cpu_state_map[PROCESSOR_RUNNING];
5071 /*
5072 * Iterate over all CPUs and look for the ones running threads from this thread group that
5073 * are not restricted to this specific cluster (due to overrides etc.).
5074 */
5075 for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
5076 processor_t src_processor = processor_array[cpuid];
5077 boolean_t expected_tg = (src_processor->current_thread_group == sched_clutch->sc_tg);
5078 sched_bucket_t processor_sched_bucket = src_processor->processor_set->cpu_running_buckets[cpuid];
5079 if (processor_sched_bucket == TH_BUCKET_SCHED_MAX) {
5080 continue;
5081 }
5082 boolean_t non_preferred_cluster = tg_bucket_preferred_cluster[processor_sched_bucket] != root_clutch->scr_cluster_id;
5083
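/* Only rebalance CPUs that are running a thread of this TG whose QoS bucket no longer prefers this cluster */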
5084 if (expected_tg && non_preferred_cluster) {
5085 ipi_type[cpuid] = sched_ipi_action(src_processor, NULL, SCHED_IPI_EVENT_REBALANCE);
5086 if (ipi_type[cpuid] != SCHED_IPI_NONE) {
5087 bit_set(ast_processor_map, cpuid);
5088 } else if (src_processor == current_processor()) {
5089 bit_set(root_clutch->scr_pset->pending_AST_PREEMPT_cpu_mask, cpuid);
5090 ast_t new_preempt = update_pending_nonurgent_preemption(src_processor, AST_PREEMPT);
5091 ast_on(new_preempt);
5092 }
5093 }
5094 }
5095
5096 /* Perform all the IPIs */
5097 if (bit_first(ast_processor_map) != -1) {
5098 for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
5099 processor_t ast_processor = processor_array[cpuid];
5100 sched_ipi_perform(ast_processor, ipi_type[cpuid]);
5101 }
5102 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, thread_group_get_id(sched_clutch->sc_tg), ast_processor_map, 0, 0);
5103 }
5104 }
5105
5106 /*
5107 * sched_edge_tg_preferred_cluster_change()
5108 *
5109 * Routine to handle changes to a thread group's recommendation. In the Edge Scheduler, the preferred cluster
5110 * is specified on a per-QoS basis within a thread group. The routine updates the preferences and performs
5111 * thread migrations based on the policy specified by CLPC.
5112 * tg_bucket_preferred_cluster is an array of size TH_BUCKET_SCHED_MAX which specifies the new preferred cluster
5113 * for each QoS within the thread group.
5114 */
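/*
 * Illustrative sketch of a caller (hypothetical values): assuming TH_BUCKET_SCHED_MAX == 6, a CLPC
 * recommendation that prefers cluster 1 for every QoS bucket of the thread group and migrates
 * currently running threads immediately might look like:
 *
 *   uint32_t pref[TH_BUCKET_SCHED_MAX] = {1, 1, 1, 1, 1, 1};
 *   sched_edge_tg_preferred_cluster_change(tg, pref, SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNING);
 */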
5115 void
5116 sched_edge_tg_preferred_cluster_change(struct thread_group *tg, uint32_t *tg_bucket_preferred_cluster, sched_perfcontrol_preferred_cluster_options_t options)
5117 {
5118 sched_clutch_t clutch = sched_clutch_for_thread_group(tg);
5119 /*
5120 * In order to optimize the processing, create a bitmap which represents all QoS buckets
5121 * for which the preferred cluster has changed.
5122 */
5123 bitmap_t clutch_bucket_modify_bitmap[BITMAP_LEN(TH_BUCKET_SCHED_MAX)] = {0};
5124 for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
5125 uint32_t old_preferred_cluster = sched_edge_clutch_bucket_group_preferred_cluster(&clutch->sc_clutch_groups[bucket]);
5126 uint32_t new_preferred_cluster = tg_bucket_preferred_cluster[bucket];
5127 if (old_preferred_cluster != new_preferred_cluster) {
5128 bitmap_set(clutch_bucket_modify_bitmap, bucket);
5129 }
5130 }
5131 if (bitmap_lsb_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
5132 /* No changes in any clutch buckets; nothing to do here */
5133 return;
5134 }
5135
5136 for (uint32_t cluster_id = 0; cluster_id < sched_edge_max_clusters; cluster_id++) {
5137 processor_set_t pset = pset_array[cluster_id];
5138 spl_t s = splsched();
5139 pset_lock(pset);
5140 /*
5141 * The first operation is to update the preferred cluster for all QoS buckets within the
5142 * thread group so that any future threads becoming runnable would see the new preferred
5143 * cluster value.
5144 */
5145 sched_edge_update_preferred_cluster(clutch, clutch_bucket_modify_bitmap, tg_bucket_preferred_cluster);
5146 /*
5147 * Edge Scheduler Optimization
5148 *
5149 * Currently this iterates all clusters looking for running threads of the TG to be migrated. It can
5150 * be optimized by keeping a per-clutch bitmap of clusters running threads for a particular TG.
5151 */
5152 /* Migrate all running threads of the TG on this cluster based on options specified by CLPC */
5153 sched_edge_migrate_thread_group_running_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
5154 tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNING));
5155 /* Migrate all runnable threads of the TG in this cluster's hierarchy based on options specified by CLPC */
5156 sched_edge_migrate_thread_group_runnable_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
5157 tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNABLE));
5158 /* sched_edge_migrate_thread_group_runnable_threads() returns with pset unlocked */
5159 splx(s);
5160 }
5161 }
5162
5163 /*
5164 * sched_edge_pset_made_schedulable()
5165 *
5166 * Routine to migrate all the clutch buckets which are not in their recommended
5167 * pset hierarchy now that a new pset has become schedulable. It's possible that this
5168 * routine is called when the pset is already marked schedulable.
5169 *
5170 * Invoked with the pset lock held and interrupts disabled.
5171 */
5172 static void
5173 sched_edge_pset_made_schedulable(__unused processor_t processor, processor_set_t dst_pset, boolean_t drop_lock)
5174 {
5175 if (bitmap_test(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id)) {
5176 /* Nothing to do here since pset is already marked schedulable */
5177 if (drop_lock) {
5178 pset_unlock(dst_pset);
5179 }
5180 return;
5181 }
5182
5183 bitmap_set(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id);
5184
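/* The pset just became schedulable; try to find a runnable thread for it (e.g. by stealing from another cluster) and set it running here */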
5185 thread_t thread = sched_edge_processor_idle(dst_pset);
5186 if (thread != THREAD_NULL) {
5187 thread_lock(thread);
5188 thread_setrun(thread, SCHED_TAILQ);
5189 thread_unlock(thread);
5190 }
5191
5192 if (!drop_lock) {
5193 pset_lock(dst_pset);
5194 }
5195 }
5196
5197 /*
5198 * sched_edge_cpu_init_completed()
5199 *
5200 * Callback routine from the platform layer once all CPUs/clusters have been initialized. This
5201 * provides an opportunity for the edge scheduler to initialize all the edge parameters.
5202 */
5203 static void
5204 sched_edge_cpu_init_completed(void)
5205 {
5206 spl_t s = splsched();
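/*
 * Default edge matrix: clusters of the same type allow migration and steal with weight 0,
 * P->E edges allow migration and steal with weight 64, and E->P edges are disabled.
 * Die locality is tracked separately via the local/remote pset bitmaps.
 */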
5207 for (int src_cluster_id = 0; src_cluster_id < sched_edge_max_clusters; src_cluster_id++) {
5208 processor_set_t src_pset = pset_array[src_cluster_id];
5209 pset_lock(src_pset);
5210
5211 /* For each cluster, set all its outgoing edge parameters */
5212 for (int dst_cluster_id = 0; dst_cluster_id < sched_edge_max_clusters; dst_cluster_id++) {
5213 if (dst_cluster_id == src_cluster_id) {
5214 continue;
5215 }
5216 processor_set_t dst_pset = pset_array[dst_cluster_id];
5217 if (src_pset->pset_type == dst_pset->pset_type) {
5218 /* P->P/E->E edge config */
5219 bitmap_clear(src_pset->foreign_psets, dst_cluster_id);
5220 bitmap_set(src_pset->native_psets, dst_cluster_id);
5221 sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 1, .sce_steal_allowed = 1});
5222 } else if ((src_pset->pset_type == CLUSTER_TYPE_P) && (dst_pset->pset_type == CLUSTER_TYPE_E)) {
5223 /* P->E edge config */
5224 bitmap_set(src_pset->foreign_psets, dst_cluster_id);
5225 bitmap_clear(src_pset->native_psets, dst_cluster_id);
5226 sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 64, .sce_migration_allowed = 1, .sce_steal_allowed = 1});
5227 } else {
5228 /* E->P edge config */
5229 bitmap_set(src_pset->foreign_psets, dst_cluster_id);
5230 bitmap_clear(src_pset->native_psets, dst_cluster_id);
5231 sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0});
5232 }
5233 bool clusters_local = (ml_get_die_id(src_cluster_id) == ml_get_die_id(dst_cluster_id));
5234 if (clusters_local) {
5235 bitmap_set(src_pset->local_psets, dst_cluster_id);
5236 bitmap_clear(src_pset->remote_psets, dst_cluster_id);
5237 } else {
5238 bitmap_set(src_pset->remote_psets, dst_cluster_id);
5239 bitmap_clear(src_pset->local_psets, dst_cluster_id);
5240 }
5241 }
5242
5243 pset_unlock(src_pset);
5244 }
5245 splx(s);
5246 }
5247
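/*
 * sched_edge_thread_eligible_for_pset()
 *
 * Returns true if the thread can run on the given pset, i.e. the pset is the thread's
 * preferred cluster or migration is allowed on the edge from the preferred cluster to
 * this pset.
 */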
5248 static bool
5249 sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset)
5250 {
5251 uint32_t preferred_cluster_id = sched_edge_thread_preferred_cluster(thread);
5252 if (preferred_cluster_id == pset->pset_cluster_id) {
5253 return true;
5254 } else {
5255 processor_set_t preferred_pset = pset_array[preferred_cluster_id];
5256 return preferred_pset->sched_edges[pset->pset_cluster_id].sce_migration_allowed;
5257 }
5258 }
5259
5260 extern int sched_amp_spill_deferred_ipi;
5261 extern int sched_amp_pcores_preempt_immediate_ipi;
5262
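/* Tunable: when non-zero, sched_edge_ipi_policy() may upgrade preemption IPIs for thread migration to immediate rather than deferred */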
5263 int sched_edge_migrate_ipi_immediate = 1;
5264
5265 sched_ipi_type_t
5266 sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
5267 {
5268 processor_set_t pset = dst->processor_set;
5269 assert(dst != current_processor());
5270
5271 boolean_t deferred_ipi_supported = false;
5272 #if defined(CONFIG_SCHED_DEFERRED_AST)
5273 deferred_ipi_supported = true;
5274 #endif /* CONFIG_SCHED_DEFERRED_AST */
5275
5276 switch (event) {
5277 case SCHED_IPI_EVENT_SPILL:
5278 /* For Spill event, use deferred IPIs if sched_amp_spill_deferred_ipi set */
5279 if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
5280 return sched_ipi_deferred_policy(pset, dst, thread, event);
5281 }
5282 break;
5283 case SCHED_IPI_EVENT_PREEMPT:
5284 /* For preemption, the default policy is to use deferred IPIs
5285 * for non-RT P-core preemption. Override that behavior if
5286 * sched_amp_pcores_preempt_immediate_ipi is set.
5287 */
5288 if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
5289 if (sched_amp_pcores_preempt_immediate_ipi && (pset_type_for_id(pset->pset_cluster_id) == CLUSTER_TYPE_P)) {
5290 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
5291 }
5292 if (sched_edge_migrate_ipi_immediate) {
5293 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
5294 /*
5295 * For IPI'ing CPUs that are homogeneous with the preferred cluster, use immediate IPIs
5296 */
5297 if (preferred_pset->pset_type == pset->pset_type) {
5298 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
5299 }
5300 /*
5301 * For workloads that are going wide, it might be useful to use an immediate IPI to
5302 * wake up the idle CPU if the scheduler estimates that the preferred pset will
5303 * be busy for the deferred IPI timeout. The Edge Scheduler uses the avg execution
5304 * latency on the preferred pset as an estimate of busyness.
5305 */
5306 if ((preferred_pset->pset_execution_time[thread->th_sched_bucket].pset_avg_thread_execution_time * NSEC_PER_USEC) >= ml_cpu_signal_deferred_get_timer()) {
5307 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
5308 }
5309 }
5310 }
5311 break;
5312 default:
5313 break;
5314 }
5315 /* Default back to the global policy for all other scenarios */
5316 return sched_ipi_policy(dst, thread, dst_idle, event);
5317 }
5318
5319 /*
5320 * sched_edge_qos_max_parallelism()
5321 */
5322 uint32_t
5323 sched_edge_qos_max_parallelism(int qos, uint64_t options)
5324 {
5325 uint32_t ecpu_count = ml_get_cpu_number_type(CLUSTER_TYPE_E, false, false);
5326 uint32_t pcpu_count = ml_get_cpu_number_type(CLUSTER_TYPE_P, false, false);
5327 uint32_t ecluster_count = ml_get_cluster_number_type(CLUSTER_TYPE_E);
5328 uint32_t pcluster_count = ml_get_cluster_number_type(CLUSTER_TYPE_P);
5329
5330
5331 if (options & QOS_PARALLELISM_REALTIME) {
5332 /* For realtime threads on AMP, limit the width to
5333 * just the P-cores since we do not spill/rebalance
5334 * for RT threads.
5335 */
5336 return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? pcluster_count : pcpu_count;
5337 }
5338
5339 /*
5340 * The Edge scheduler supports per-QoS recommendations for thread groups.
5341 * This enables lower QoS buckets (such as UT) to be scheduled on all
5342 * CPUs on the system.
5343 *
5344 * The only restriction is for BG/Maintenance QoS classes for which the
5345 * performance controller would never recommend execution on the P-cores.
5346 * If that policy changes in the future, this value should be changed.
5347 */
5348 switch (qos) {
5349 case THREAD_QOS_BACKGROUND:
5350 case THREAD_QOS_MAINTENANCE:
5351 return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? ecluster_count : ecpu_count;
5352 default:
5353 return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? (ecluster_count + pcluster_count) : (ecpu_count + pcpu_count);
5354 }
5355 }
5356
5357
5358
5359 #endif /* CONFIG_SCHED_EDGE */
5360
5361 #endif /* CONFIG_SCHED_CLUTCH */
5362