1 /*
2 * Copyright (c) 2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #if !SCHED_TEST_HARNESS
30
31 #include <mach/mach_types.h>
32 #include <mach/machine.h>
33 #include <machine/machine_routines.h>
34 #include <machine/sched_param.h>
35 #include <machine/machine_cpu.h>
36 #include <kern/kern_types.h>
37 #include <kern/debug.h>
38 #include <kern/machine.h>
39 #include <kern/misc_protos.h>
40 #include <kern/queue.h>
41 #include <kern/sched.h>
42 #include <kern/task.h>
43 #include <kern/thread.h>
44 #include <kern/sched_clutch.h>
45 #include <machine/atomic.h>
46 #include <kern/sched_clutch.h>
47 #include <sys/kdebug.h>
48
49 #endif /* !SCHED_TEST_HARNESS */
50
51 #include <kern/processor.h>
52 #include <kern/sched_prim.h>
53
54 #if CONFIG_SCHED_EDGE
55 #include <kern/sched_amp_common.h>
56 #endif /* CONFIG_SCHED_EDGE */
57
58 #if CONFIG_SCHED_CLUTCH
59
60 #define SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION 1
61 typedef union {
62 struct __attribute__((packed)) {
63 unsigned int version : 4;
64 unsigned int traverse_mode : 3;
65 unsigned int cluster_id : 6;
66 unsigned int selection_was_edf : 1;
67 unsigned int selection_was_cluster_bound : 1;
68 unsigned int selection_opened_starvation_avoidance_window : 1;
69 unsigned int selection_opened_warp_window : 1;
70 unsigned int starvation_avoidance_window_close : 12;
71 unsigned int warp_window_close : 12;
72 unsigned int reserved : 23; /* For future usage */
73 } trace_data;
74 uint64_t scdts_trace_data_packed;
75 } sched_clutch_dbg_thread_select_packed_t;
76
77 static_assert(TH_BUCKET_SCHED_MAX == 6, "Ensure layout of sched_clutch_dbg_thread_select_packed can fit root bucket bitmasks");
78 static_assert(sizeof(sched_clutch_dbg_thread_select_packed_t) <= sizeof(uint64_t), "Ensure sched_clutch_dbg_thread_select_packed_t can fit in one tracepoint argument");
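
/*
 * Layout note: the bit-fields above sum to exactly 64 bits
 * (4 + 3 + 6 + 1 + 1 + 1 + 1 + 12 + 12 + 23), so the whole selection record can be
 * emitted as a single uint64_t tracepoint argument via scdts_trace_data_packed.
 * The two 12-bit fields are per-root-bucket bitmasks (bound and unbound variants
 * of the TH_BUCKET_SCHED_MAX buckets), which is what the first static_assert guards.
 */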
79
80 /* Forward declarations of static routines */
81
82 /* Root level hierarchy management */
83 static void sched_clutch_root_init(sched_clutch_root_t, processor_set_t);
84 static void sched_clutch_root_bucket_init(sched_clutch_root_bucket_t, sched_bucket_t, bool);
85 static void sched_clutch_root_pri_update(sched_clutch_root_t);
86 static void sched_clutch_root_urgency_inc(sched_clutch_root_t, thread_t);
87 static void sched_clutch_root_urgency_dec(sched_clutch_root_t, thread_t);
88
89 __enum_decl(sched_clutch_highest_root_bucket_type_t, uint32_t, {
90 SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_NONE = 0,
91 SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY = 1,
92 SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL = 2,
93 });
94 __enum_decl(sched_clutch_traverse_mode_t, uint32_t, {
95 SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY = 0,
96 SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT = 1,
97 SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT = 2,
98 });
99 static_assert(SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT < (1 << 3), "Ensure traverse mode can be encoded within 3 bits of sched_clutch_dbg_thread_select_packed_t");
100 static sched_clutch_root_bucket_t sched_clutch_root_highest_root_bucket(sched_clutch_root_t, uint64_t, sched_clutch_highest_root_bucket_type_t, sched_clutch_root_bucket_t, thread_t, bool *, sched_clutch_traverse_mode_t, sched_clutch_dbg_thread_select_packed_t *);
101
102 #if CONFIG_SCHED_EDGE
103 /* Support for foreign threads on AMP platforms */
104 static boolean_t sched_clutch_root_foreign_empty(sched_clutch_root_t);
105 static thread_t sched_clutch_root_highest_foreign_thread_remove(sched_clutch_root_t);
106 #endif /* CONFIG_SCHED_EDGE */
107
108 /* Root bucket level hierarchy management */
109 static uint64_t sched_clutch_root_bucket_deadline_calculate(sched_clutch_root_bucket_t, uint64_t);
110 static void sched_clutch_root_bucket_deadline_update(sched_clutch_root_bucket_t, sched_clutch_root_t, uint64_t, bool);
111 static int sched_clutch_root_highest_runnable_qos(sched_clutch_root_t, sched_clutch_highest_root_bucket_type_t);
112
113 /* Options for clutch bucket ordering in the runq */
114 __options_decl(sched_clutch_bucket_options_t, uint32_t, {
115 SCHED_CLUTCH_BUCKET_OPTIONS_NONE = 0x0,
116 /* Round robin clutch bucket on thread removal */
117 SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR = 0x1,
118 /* Insert clutch bucket at head (for thread preemption) */
119 SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ = 0x2,
120 /* Insert clutch bucket at tail (default) */
121 SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ = 0x4,
122 });
123
124 /* Clutch bucket level hierarchy management */
125 static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
126 static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
127 static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
128 static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
129 static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
130 static uint8_t sched_clutch_bucket_pri_calculate(sched_clutch_bucket_t, uint64_t);
131
132 /* Clutch bucket group level properties management */
133 static void sched_clutch_bucket_group_cpu_usage_update(sched_clutch_bucket_group_t, uint64_t);
134 static void sched_clutch_bucket_group_cpu_adjust(sched_clutch_bucket_group_t, uint8_t);
135 static void sched_clutch_bucket_group_timeshare_update(sched_clutch_bucket_group_t, sched_clutch_bucket_t, uint64_t);
136 static uint8_t sched_clutch_bucket_group_pending_ageout(sched_clutch_bucket_group_t, uint64_t);
137 static uint32_t sched_clutch_bucket_group_run_count_inc(sched_clutch_bucket_group_t);
138 static uint32_t sched_clutch_bucket_group_run_count_dec(sched_clutch_bucket_group_t);
139 static uint8_t sched_clutch_bucket_group_interactivity_score_calculate(sched_clutch_bucket_group_t, uint64_t);
140
141 /* Clutch timeshare properties updates */
142 static uint32_t sched_clutch_run_bucket_incr(sched_clutch_t, sched_bucket_t);
143 static uint32_t sched_clutch_run_bucket_decr(sched_clutch_t, sched_bucket_t);
144
145 /* Clutch membership management */
146 static boolean_t sched_clutch_thread_insert(sched_clutch_root_t, thread_t, integer_t);
147 static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t, sched_clutch_bucket_options_t);
148 static thread_t sched_clutch_hierarchy_thread_highest(sched_clutch_root_t, processor_t, thread_t, sched_clutch_traverse_mode_t);
149
150 /* Clutch properties updates */
151 static uint32_t sched_clutch_root_urgency(sched_clutch_root_t);
152 static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t);
153 static int sched_clutch_root_priority(sched_clutch_root_t);
154 static sched_clutch_bucket_t sched_clutch_root_bucket_highest_clutch_bucket(sched_clutch_root_t, sched_clutch_root_bucket_t, processor_t _Nullable processor, thread_t _Nullable prev_thread, bool *_Nullable chose_prev_thread);
155
156 /* Clutch thread properties */
157 static boolean_t sched_thread_sched_pri_promoted(thread_t);
158 static inline sched_clutch_bucket_t sched_clutch_bucket_for_thread(sched_clutch_root_t, thread_t);
159 static inline sched_clutch_bucket_group_t sched_clutch_bucket_group_for_thread(thread_t);
160
161 /* General utilities */
162 static inline bool sched_clutch_pri_greater_than_tiebreak(int, int, bool);
163
164 #if CONFIG_SCHED_EDGE
165 /* System based routines */
166 static uint32_t sched_edge_thread_bound_cluster_id(thread_t);
167 static int sched_edge_iterate_clusters_ordered(processor_set_t, uint64_t, int);
168
169 /* Global indicating the maximum number of clusters on the current platform */
170 static int sched_edge_max_clusters = 0;
171 #endif /* CONFIG_SCHED_EDGE */
172
173 /* Helper debugging routines */
174 static inline void sched_clutch_hierarchy_locked_assert(sched_clutch_root_t);
175
176 extern processor_set_t pset_array[MAX_PSETS];
177
178 /*
179 * Special markers for buckets that have invalid WCELs/quantums etc.
180 */
181 #define SCHED_CLUTCH_INVALID_TIME_32 ((uint32_t)~0)
182 #define SCHED_CLUTCH_INVALID_TIME_64 ((uint64_t)~0)
183
184 /*
185 * Root level bucket WCELs
186 *
187 * The root level bucket selection algorithm is an Earliest Deadline
 * First (EDF) algorithm where the deadline for a bucket is defined
 * by its worst-case-execution-latency (WCEL) and the timestamp at which
 * the bucket was made runnable.
 */
193 static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = {
194 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
195 0, /* FG */
196 37500, /* IN (37.5ms) */
197 75000, /* DF (75ms) */
198 150000, /* UT (150ms) */
199 250000 /* BG (250ms) */
200 };
201 static uint64_t sched_clutch_root_bucket_wcel[TH_BUCKET_SCHED_MAX] = {0};
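
/*
 * Worked example of the WCEL-based deadlines computed in
 * sched_clutch_root_bucket_deadline_calculate(): a root bucket's deadline is its
 * selection/make-runnable timestamp plus its WCEL. If a UT bucket becomes
 * runnable at time T (deadline T + 150ms) and an IN bucket becomes runnable at
 * T + 100ms (deadline T + 137.5ms), EDF picks the IN bucket first even though
 * UT has waited longer, while still bounding how long UT can be delayed.
 */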
202
203 /*
204 * Root level bucket warp
205 *
206 * Each root level bucket has a warp value associated with it as well.
207 * The warp value allows the root bucket to effectively warp ahead of
208 * lower priority buckets for a limited time even if it has a later
209 * deadline. The warping behavior provides extra (but limited)
210 * opportunity for high priority buckets to remain responsive.
211 */
212
213 /* Special warp deadline value to indicate that the bucket has not used any warp yet */
214 #define SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED (SCHED_CLUTCH_INVALID_TIME_64)
215
216 /* Warp window durations for various tiers */
217 static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = {
218 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
219 8000, /* FG (8ms)*/
220 4000, /* IN (4ms) */
221 2000, /* DF (2ms) */
222 1000, /* UT (1ms) */
223 0 /* BG (0ms) */
224 };
225 static uint64_t sched_clutch_root_bucket_warp[TH_BUCKET_SCHED_MAX] = {0};
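
/*
 * Worked example of warp: warp lets a higher (naturally more important) root
 * bucket be picked over the EDF winner for a bounded time. For instance, if a
 * DF bucket holds the earliest deadline but the IN bucket is runnable with warp
 * remaining, IN can be selected ahead of DF for up to its 4ms warp window; once
 * the window expires the scheduler falls back to pure EDF ordering.
 */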
226
227 /*
228 * Thread level quantum
229 *
230 * The algorithm defines quantums for threads at various buckets. This
231 * (combined with the root level bucket quantums) restricts how much
232 * the lower priority levels can preempt the higher priority threads.
233 */
234
235 #if XNU_TARGET_OS_OSX
236 static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
237 10000, /* FIXPRI (10ms) */
238 10000, /* FG (10ms) */
239 10000, /* IN (10ms) */
240 10000, /* DF (10ms) */
241 4000, /* UT (4ms) */
242 2000 /* BG (2ms) */
243 };
244 #else /* XNU_TARGET_OS_OSX */
245 static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
246 10000, /* FIXPRI (10ms) */
247 10000, /* FG (10ms) */
248 8000, /* IN (8ms) */
249 6000, /* DF (6ms) */
250 4000, /* UT (4ms) */
251 2000 /* BG (2ms) */
252 };
253 #endif /* XNU_TARGET_OS_OSX */
254
255 static uint64_t sched_clutch_thread_quantum[TH_BUCKET_SCHED_MAX] = {0};
256
257 /*
258 * sched_clutch_us_to_abstime()
259 *
260 * Initializer for converting all durations in usec to abstime
261 */
262 static void
sched_clutch_us_to_abstime(uint32_t *us_vals, uint64_t *abstime_vals)
264 {
265 for (int i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
266 if (us_vals[i] == SCHED_CLUTCH_INVALID_TIME_32) {
267 abstime_vals[i] = SCHED_CLUTCH_INVALID_TIME_64;
268 } else {
269 clock_interval_to_absolutetime_interval(us_vals[i],
270 NSEC_PER_USEC, &abstime_vals[i]);
271 }
272 }
273 }
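
/*
 * Illustrative usage (the conversions are performed once during scheduler
 * initialization; shown here only as a sketch):
 *
 *	sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
 *	sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
 *	sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
 */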
274
275 /* Clutch/Edge Scheduler Debugging support */
276 #define SCHED_CLUTCH_DBG_THR_COUNT_PACK(a, b, c) ((uint64_t)c | ((uint64_t)b << 16) | ((uint64_t)a << 32))
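/*
 * The macro above packs three (at most 16-bit) thread counts into a single
 * tracepoint argument: c occupies bits [0:15], b bits [16:31] and a bits
 * [32:47]; the top 16 bits remain unused.
 */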
277
278 #if DEVELOPMENT || DEBUG
279
280 /*
281 * sched_clutch_hierarchy_locked_assert()
282 *
283 * Debugging helper routine. Asserts that the hierarchy is locked. The locking
284 * for the hierarchy depends on where the hierarchy is hooked. The current
285 * implementation hooks the hierarchy at the pset, so the hierarchy is locked
286 * using the pset lock.
287 */
288 static inline void
sched_clutch_hierarchy_locked_assert(
	sched_clutch_root_t root_clutch)
291 {
292 pset_assert_locked(root_clutch->scr_pset);
293 }
294
295 #else /* DEVELOPMENT || DEBUG */
296
297 static inline void
sched_clutch_hierarchy_locked_assert(
	__unused sched_clutch_root_t root_clutch)
300 {
301 }
302
303 #endif /* DEVELOPMENT || DEBUG */
304
305 /*
306 * sched_clutch_thr_count_inc()
307 *
308 * Increment thread count at a hierarchy level with overflow checks.
309 */
310 static void
sched_clutch_thr_count_inc(
	uint16_t *thr_count)
313 {
314 if (__improbable(os_inc_overflow(thr_count))) {
315 panic("sched_clutch thread count overflowed!");
316 }
317 }
318
319 /*
320 * sched_clutch_thr_count_dec()
321 *
322 * Decrement thread count at a hierarchy level with underflow checks.
323 */
324 static void
sched_clutch_thr_count_dec(
	uint16_t *thr_count)
327 {
328 if (__improbable(os_dec_overflow(thr_count))) {
329 panic("sched_clutch thread count underflowed!");
330 }
331 }
332
333 static sched_bucket_t
sched_convert_pri_to_bucket(uint8_t priority)
335 {
336 sched_bucket_t bucket = TH_BUCKET_RUN;
337
338 if (priority > BASEPRI_USER_INITIATED) {
339 bucket = TH_BUCKET_SHARE_FG;
340 } else if (priority > BASEPRI_DEFAULT) {
341 bucket = TH_BUCKET_SHARE_IN;
342 } else if (priority > BASEPRI_UTILITY) {
343 bucket = TH_BUCKET_SHARE_DF;
344 } else if (priority > MAXPRI_THROTTLE) {
345 bucket = TH_BUCKET_SHARE_UT;
346 } else {
347 bucket = TH_BUCKET_SHARE_BG;
348 }
349 return bucket;
350 }
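
/*
 * Worked example, assuming the usual priority constants
 * (BASEPRI_USER_INITIATED 37, BASEPRI_DEFAULT 31, BASEPRI_UTILITY 20,
 * MAXPRI_THROTTLE 4): priority 40 maps to TH_BUCKET_SHARE_FG, 32 to
 * TH_BUCKET_SHARE_IN, 25 to TH_BUCKET_SHARE_DF, 10 to TH_BUCKET_SHARE_UT and
 * 4 to TH_BUCKET_SHARE_BG.
 */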
351
352 /*
353 * sched_clutch_thread_bucket_map()
354 *
355 * Map a thread to a scheduling bucket for the clutch/edge scheduler
356 * based on its scheduling mode and the priority attribute passed in.
357 */
358 static sched_bucket_t
sched_clutch_thread_bucket_map(thread_t thread, int pri)
360 {
361 switch (thread->sched_mode) {
362 case TH_MODE_FIXED:
363 if (pri >= BASEPRI_FOREGROUND) {
364 return TH_BUCKET_FIXPRI;
365 } else {
366 return sched_convert_pri_to_bucket(pri);
367 }
368
369 case TH_MODE_REALTIME:
370 return TH_BUCKET_FIXPRI;
371
372 case TH_MODE_TIMESHARE:
373 return sched_convert_pri_to_bucket(pri);
374
375 default:
376 panic("unexpected mode: %d", thread->sched_mode);
377 break;
378 }
379 }
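
/*
 * For example, a TH_MODE_FIXED thread at or above BASEPRI_FOREGROUND maps to
 * TH_BUCKET_FIXPRI, while a fixed-priority thread below that falls through to
 * sched_convert_pri_to_bucket() just like a timeshare thread;
 * TH_MODE_REALTIME threads always map to TH_BUCKET_FIXPRI.
 */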
380
381 /*
 * The clutch scheduler attempts to age out the CPU usage of clutch bucket groups
 * based on the amount of time they have been pending and the load at that
 * scheduling bucket level. Since the clutch bucket groups are global (i.e. they span
 * multiple clusters), it is important to keep the load as a global counter as well.
386 */
387 static uint32_t _Atomic sched_clutch_global_bucket_load[TH_BUCKET_SCHED_MAX];
388
389 /*
390 * sched_clutch_root_init()
391 *
392 * Routine to initialize the scheduler hierarchy root.
393 */
394 static void
sched_clutch_root_init(
	sched_clutch_root_t root_clutch,
	processor_set_t pset)
398 {
399 root_clutch->scr_thr_count = 0;
400 root_clutch->scr_priority = NOPRI;
401 root_clutch->scr_urgency = 0;
402 root_clutch->scr_pset = pset;
403 #if CONFIG_SCHED_EDGE
404 root_clutch->scr_cluster_id = pset->pset_cluster_id;
405 for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
406 root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type] = 0;
407 }
408 #else /* CONFIG_SCHED_EDGE */
409 root_clutch->scr_cluster_id = 0;
410 #endif /* CONFIG_SCHED_EDGE */
411
412 /* Initialize the queue which maintains all runnable clutch_buckets for timesharing purposes */
413 queue_init(&root_clutch->scr_clutch_buckets);
414
415 /* Initialize the priority queue which maintains all runnable foreign clutch buckets */
416 priority_queue_init(&root_clutch->scr_foreign_buckets);
417 bzero(&root_clutch->scr_cumulative_run_count, sizeof(root_clutch->scr_cumulative_run_count));
418 bitmap_zero(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
419 bitmap_zero(root_clutch->scr_bound_warp_available, TH_BUCKET_SCHED_MAX);
420 priority_queue_init(&root_clutch->scr_bound_root_buckets);
421
422 /* Initialize the bitmap and priority queue of runnable root buckets */
423 priority_queue_init(&root_clutch->scr_unbound_root_buckets);
424 bitmap_zero(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
425 bitmap_zero(root_clutch->scr_unbound_warp_available, TH_BUCKET_SCHED_MAX);
426
427 /* Initialize all the root buckets */
428 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
429 sched_clutch_root_bucket_init(&root_clutch->scr_unbound_buckets[i], i, false);
430 sched_clutch_root_bucket_init(&root_clutch->scr_bound_buckets[i], i, true);
431 }
432 }
433
434 /*
435 * Clutch Bucket Runqueues
436 *
437 * The clutch buckets are maintained in a runq at the root bucket level. The
438 * runq organization allows clutch buckets to be ordered based on various
439 * factors such as:
440 *
441 * - Clutch buckets are round robin'ed at the same priority level when a
442 * thread is selected from a clutch bucket. This prevents a clutch bucket
443 * from starving out other clutch buckets at the same priority.
444 *
 * - Clutch buckets are inserted at the head when they become runnable due to
446 * thread preemption. This allows threads that were preempted to maintain
447 * their order in the queue.
448 */
449
450 /*
451 * sched_clutch_bucket_runq_init()
452 *
453 * Initialize a clutch bucket runq.
454 */
455 static void
sched_clutch_bucket_runq_init(
	sched_clutch_bucket_runq_t clutch_buckets_rq)
458 {
459 clutch_buckets_rq->scbrq_highq = NOPRI;
460 for (uint8_t i = 0; i < BITMAP_LEN(NRQS); i++) {
461 clutch_buckets_rq->scbrq_bitmap[i] = 0;
462 }
463 clutch_buckets_rq->scbrq_count = 0;
464 for (int i = 0; i < NRQS; i++) {
465 circle_queue_init(&clutch_buckets_rq->scbrq_queues[i]);
466 }
467 }
468
469 /*
470 * sched_clutch_bucket_runq_empty()
471 *
 * Returns whether the clutch bucket runq is empty.
473 */
474 static boolean_t
sched_clutch_bucket_runq_empty(
	sched_clutch_bucket_runq_t clutch_buckets_rq)
477 {
478 return clutch_buckets_rq->scbrq_count == 0;
479 }
480
481 /*
482 * sched_clutch_bucket_runq_peek()
483 *
484 * Returns the highest priority clutch bucket in the runq.
485 */
486 static sched_clutch_bucket_t
sched_clutch_bucket_runq_peek(
	sched_clutch_bucket_runq_t clutch_buckets_rq)
489 {
490 if (clutch_buckets_rq->scbrq_count > 0) {
491 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_buckets_rq->scbrq_highq];
492 return cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink);
493 } else {
494 return NULL;
495 }
496 }
497
498 /*
499 * sched_clutch_bucket_runq_enqueue()
500 *
501 * Enqueue a clutch bucket into the runq based on the options passed in.
502 */
503 static void
sched_clutch_bucket_runq_enqueue(
	sched_clutch_bucket_runq_t clutch_buckets_rq,
	sched_clutch_bucket_t clutch_bucket,
	sched_clutch_bucket_options_t options)
508 {
509 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
510 if (circle_queue_empty(queue)) {
511 circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
512 bitmap_set(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
513 if (clutch_bucket->scb_priority > clutch_buckets_rq->scbrq_highq) {
514 clutch_buckets_rq->scbrq_highq = clutch_bucket->scb_priority;
515 }
516 } else {
517 if (options & SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ) {
518 circle_enqueue_head(queue, &clutch_bucket->scb_runqlink);
519 } else {
520 /*
521 * Default behavior (handles SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ &
522 * SCHED_CLUTCH_BUCKET_OPTIONS_NONE)
523 */
524 circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
525 }
526 }
527 clutch_buckets_rq->scbrq_count++;
528 }
529
530 /*
531 * sched_clutch_bucket_runq_remove()
532 *
533 * Remove a clutch bucket from the runq.
534 */
535 static void
sched_clutch_bucket_runq_remove(
	sched_clutch_bucket_runq_t clutch_buckets_rq,
	sched_clutch_bucket_t clutch_bucket)
539 {
540 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
541 circle_dequeue(queue, &clutch_bucket->scb_runqlink);
542 assert(clutch_buckets_rq->scbrq_count > 0);
543 clutch_buckets_rq->scbrq_count--;
544 if (circle_queue_empty(queue)) {
545 bitmap_clear(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
546 clutch_buckets_rq->scbrq_highq = bitmap_first(clutch_buckets_rq->scbrq_bitmap, NRQS);
547 }
548 }
549
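/*
 * sched_clutch_bucket_runq_rotate()
 *
 * Rotate the queue at the clutch bucket's priority level so that the bucket at
 * the head (which must be the one passed in) moves to the tail. Used to round
 * robin clutch buckets at the same priority when a thread is selected from one.
 */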
550 static void
sched_clutch_bucket_runq_rotate(
	sched_clutch_bucket_runq_t clutch_buckets_rq,
	sched_clutch_bucket_t clutch_bucket)
554 {
555 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
556 assert(clutch_bucket == cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink));
557 circle_queue_rotate_head_forward(queue);
558 }
559
560 /*
561 * sched_clutch_root_bucket_init()
562 *
563 * Routine to initialize root buckets.
564 */
565 static void
sched_clutch_root_bucket_init(
	sched_clutch_root_bucket_t root_bucket,
	sched_bucket_t bucket,
	bool bound_root_bucket)
570 {
571 root_bucket->scrb_bucket = bucket;
572 if (bound_root_bucket) {
573 /* For bound root buckets, initialize the bound thread runq. */
574 root_bucket->scrb_bound = true;
575 run_queue_init(&root_bucket->scrb_bound_thread_runq);
576 } else {
577 /*
578 * The unbounded root buckets contain a runq of runnable clutch buckets
579 * which then hold the runnable threads.
580 */
581 root_bucket->scrb_bound = false;
582 sched_clutch_bucket_runq_init(&root_bucket->scrb_clutch_buckets);
583 }
584 priority_queue_entry_init(&root_bucket->scrb_pqlink);
585 root_bucket->scrb_pqlink.deadline = 0;
586 root_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
587 root_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[root_bucket->scrb_bucket];
588 root_bucket->scrb_starvation_avoidance = false;
589 root_bucket->scrb_starvation_ts = 0;
590 }
591
592 /*
593 * Special case scheduling for Above UI bucket.
594 *
595 * AboveUI threads are typically system critical threads that need low latency
596 * which is why they are handled specially.
597 *
 * Since the priority ranges for AboveUI and FG Timeshare buckets overlap, it is
599 * important to maintain some native priority order between those buckets. For unbounded
600 * root buckets, the policy is to compare the highest clutch buckets of both buckets; if the
601 * Above UI bucket is higher, schedule it immediately. Otherwise fall through to the
 * deadline based scheduling, which should pick up the timeshare buckets. For the bound
603 * case, the policy simply compares the priority of the highest runnable threads in
604 * the above UI and timeshare buckets.
605 *
606 * The implementation allows extremely low latency CPU access for Above UI threads
607 * while supporting the use case of high priority timeshare threads contending with
608 * lower priority fixed priority threads.
609 */
610
611
612 /*
613 * sched_clutch_root_unbound_select_aboveui()
614 *
615 * Routine to determine if the above UI unbounded bucket should be selected for execution.
616 *
617 * Writes the highest unbound (timeshare FG vs. above UI) bucket, its priority, and whether
618 * it is an above UI bucket into the pointer parameters.
619 */
620 static void
sched_clutch_root_unbound_select_aboveui(
	sched_clutch_root_t root_clutch,
	sched_clutch_root_bucket_t *highest_bucket,
	int *highest_pri,
	bool *highest_is_aboveui,
	sched_clutch_root_bucket_t _Nullable prev_bucket,
	thread_t _Nullable prev_thread)
628 {
629 /* First determine the highest Clutch bucket */
630 sched_clutch_root_bucket_t higher_root_bucket = NULL;
631 sched_clutch_bucket_t higher_clutch_bucket = NULL;
632 int higher_bucket_sched_pri = -1;
633 bool higher_is_aboveui = false;
634 /* Consider unbound Above UI */
635 if (bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_FIXPRI)) {
636 higher_root_bucket = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
637 higher_clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, higher_root_bucket, NULL, NULL, NULL);
638 higher_bucket_sched_pri = priority_queue_max_sched_pri(&higher_clutch_bucket->scb_clutchpri_prioq);
639 higher_is_aboveui = true;
640 }
641 /* Consider unbound Timeshare FG */
642 if (bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SHARE_FG)) {
643 sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_unbound_buckets[TH_BUCKET_SHARE_FG];
644 sched_clutch_bucket_t clutch_bucket_sharefg = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, root_bucket_sharefg, NULL, NULL, NULL);
645 /* Strict greater-than because unbound timeshare FG root bucket loses all priority ties at this level */
646 if (higher_root_bucket == NULL || clutch_bucket_sharefg->scb_priority > higher_clutch_bucket->scb_priority) {
647 higher_root_bucket = root_bucket_sharefg;
648 higher_clutch_bucket = clutch_bucket_sharefg;
649 higher_bucket_sched_pri = priority_queue_max_sched_pri(&higher_clutch_bucket->scb_clutchpri_prioq);
650 higher_is_aboveui = false;
651 }
652 }
653 /* Consider the previous thread */
654 if (prev_thread != NULL) {
655 assert(prev_bucket->scrb_bound == false);
656 sched_clutch_bucket_group_t prev_clutch_bucket_group = sched_clutch_bucket_group_for_thread(prev_thread);
657 int prev_clutch_bucket_pri = prev_thread->sched_pri + (int)(os_atomic_load(&prev_clutch_bucket_group->scbg_interactivity_data.scct_count, relaxed));
658 sched_clutch_bucket_t prev_clutch_bucket = sched_clutch_bucket_for_thread(root_clutch, prev_thread);
659 bool prev_bucket_should_win_ties = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI && higher_is_aboveui == false;
660 if (higher_clutch_bucket == NULL ||
661 sched_clutch_pri_greater_than_tiebreak(prev_clutch_bucket_pri, higher_clutch_bucket->scb_priority, prev_bucket_should_win_ties)) {
662 higher_root_bucket = prev_bucket;
663 higher_clutch_bucket = prev_clutch_bucket;
664 higher_bucket_sched_pri = prev_thread->sched_pri;
665 higher_is_aboveui = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI;
666 }
667 }
668 /* Compare highest priority in the highest unbound Clutch bucket to highest priority seen from the bound buckets */
669 if (higher_root_bucket != NULL) {
670 bool unbound_should_win_ties = higher_is_aboveui == true && *highest_is_aboveui == false;
671 if (sched_clutch_pri_greater_than_tiebreak(higher_bucket_sched_pri, *highest_pri, unbound_should_win_ties)) {
672 *highest_pri = higher_bucket_sched_pri;
673 *highest_bucket = higher_root_bucket;
674 *highest_is_aboveui = higher_is_aboveui;
675 }
676 }
677 }
678
679 /*
680 * sched_clutch_root_bound_select_aboveui()
681 *
682 * Routine to determine if the above UI bounded bucket should be selected for execution.
683 *
684 * Writes the highest bound (timeshare FG vs. above UI) bucket, its priority, and whether
685 * it is an above UI bucket into the pointer parameters.
686 */
687 static void
sched_clutch_root_bound_select_aboveui(
	sched_clutch_root_t root_clutch,
	sched_clutch_root_bucket_t *highest_bucket,
	int *highest_pri,
	bool *highest_is_aboveui,
	sched_clutch_root_bucket_t _Nullable prev_bucket,
	thread_t _Nullable prev_thread)
695 {
696 /* Consider bound Above UI */
697 sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
698 if (bitmap_test(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_FIXPRI) &&
699 sched_clutch_pri_greater_than_tiebreak(root_bucket_aboveui->scrb_bound_thread_runq.highq, *highest_pri, *highest_is_aboveui == false)) {
700 *highest_pri = root_bucket_aboveui->scrb_bound_thread_runq.highq;
701 *highest_bucket = root_bucket_aboveui;
702 *highest_is_aboveui = true;
703 }
704 /* Consider bound Timeshare FG */
705 sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_bound_buckets[TH_BUCKET_SHARE_FG];
706 if (bitmap_test(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SHARE_FG) &&
707 sched_clutch_pri_greater_than_tiebreak(root_bucket_sharefg->scrb_bound_thread_runq.highq, *highest_pri, false)) {
708 *highest_pri = root_bucket_sharefg->scrb_bound_thread_runq.highq;
709 *highest_bucket = root_bucket_sharefg;
710 *highest_is_aboveui = false;
711 }
712 /* Consider the previous thread */
713 if (prev_thread != NULL) {
714 assert(prev_bucket->scrb_bound == true);
715 bool prev_bucket_should_win_ties = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI && *highest_is_aboveui == false;
716 if (sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, *highest_pri, prev_bucket_should_win_ties)) {
717 *highest_pri = prev_thread->sched_pri;
718 *highest_bucket = prev_bucket;
719 *highest_is_aboveui = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI;
720 }
721 }
722 }
723
724 /*
725 * sched_clutch_root_highest_runnable_qos()
726 *
727 * Returns the index of the highest-QoS root bucket which is currently runnable.
728 */
729 static int
sched_clutch_root_highest_runnable_qos(
	sched_clutch_root_t root_clutch,
	sched_clutch_highest_root_bucket_type_t type)
733 {
734 int highest_unbound_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
735 if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
736 return highest_unbound_bucket;
737 }
738 assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
739 int highest_bound_bucket = bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
740 if (highest_bound_bucket == -1) {
741 return highest_unbound_bucket;
742 }
743 if (highest_unbound_bucket == -1) {
744 return highest_bound_bucket;
745 }
	/* Both bound and unbound buckets are runnable; a lower bucket index means a higher QoS, so return the MIN */
747 return MIN(highest_bound_bucket, highest_unbound_bucket);
748 }
749
750 /*
751 * sched_clutch_root_highest_aboveui_root_bucket()
752 *
753 * Routine to determine if an above UI root bucket should be selected for execution.
754 *
755 * Returns the root bucket if we should run an above UI bucket or NULL otherwise.
756 */
757 static sched_clutch_root_bucket_t
sched_clutch_root_highest_aboveui_root_bucket(
	sched_clutch_root_t root_clutch,
	sched_clutch_highest_root_bucket_type_t type,
	sched_clutch_root_bucket_t _Nullable prev_bucket,
	thread_t _Nullable prev_thread,
	bool *chose_prev_thread)
764 {
765 assert((prev_thread == NULL && prev_bucket == NULL) || (prev_thread != NULL && prev_bucket != NULL));
766 assert((type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL) || (prev_bucket == NULL));
767
768 sched_clutch_root_bucket_t highest_bucket = NULL;
769 int highest_pri = -1;
770 bool highest_is_aboveui = false;
771
772 /* Forward previous thread to the correct comparison logic, based on boundness */
773 sched_clutch_root_bucket_t bound_prev_bucket = NULL, unbound_prev_bucket = NULL;
774 thread_t bound_prev_thread = NULL, unbound_prev_thread = NULL;
775 if (prev_thread != NULL) {
776 if (prev_bucket->scrb_bound) {
777 bound_prev_bucket = prev_bucket;
778 bound_prev_thread = prev_thread;
779 } else {
780 unbound_prev_bucket = prev_bucket;
781 unbound_prev_thread = prev_thread;
782 }
783 }
784
785 /* Consider bound Above UI vs. Timeshare FG first, so those buckets will win ties against the corresponding unbound buckets */
786 if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL) {
787 sched_clutch_root_bound_select_aboveui(root_clutch, &highest_bucket, &highest_pri, &highest_is_aboveui, bound_prev_bucket, bound_prev_thread);
788 }
789
790 /* Consider unbound Above UI vs. Timeshare FG */
791 sched_clutch_root_unbound_select_aboveui(root_clutch, &highest_bucket, &highest_pri, &highest_is_aboveui, unbound_prev_bucket, unbound_prev_thread);
792 if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
793 return highest_is_aboveui ? highest_bucket : NULL;
794 }
795 assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
796
797 /* Determine whether we already know to continue running the previous thread */
798 if (prev_thread != NULL &&
799 bitmap_test(highest_bucket->scrb_bound ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap, highest_bucket->scrb_bucket) == false) {
800 /* Highest bucket we saw is empty, so the previous thread must have been the highest */
801 assert(highest_bucket == prev_bucket);
802 *chose_prev_thread = true;
803 }
804
805 return highest_is_aboveui ? highest_bucket : NULL;
806 }
807
808 /*
809 * sched_clutch_root_highest_root_bucket()
810 *
811 * Main routine to find the highest runnable root level bucket.
812 * This routine is called from performance sensitive contexts; so it is
813 * crucial to keep this O(1). The options parameter determines if
814 * the selection logic should look at unbounded threads only (for
815 * cross-cluster stealing operations) or both bounded and unbounded
816 * threads (for selecting next thread for execution on current cluster).
817 */
818 static sched_clutch_root_bucket_t
sched_clutch_root_highest_root_bucket(
	sched_clutch_root_t root_clutch,
	uint64_t timestamp,
	sched_clutch_highest_root_bucket_type_t type,
	sched_clutch_root_bucket_t _Nullable prev_bucket,
	thread_t _Nullable prev_thread,
	bool *chose_prev_thread,
	sched_clutch_traverse_mode_t mode,
	sched_clutch_dbg_thread_select_packed_t *debug_info)
828 {
829 assert((prev_thread == NULL && prev_bucket == NULL) || (prev_thread != NULL && prev_bucket != NULL));
830 assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL || (prev_thread == NULL));
831 assert(prev_thread == NULL || (mode != SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY));
832 sched_clutch_hierarchy_locked_assert(root_clutch);
833
834 int highest_runnable_bucket = sched_clutch_root_highest_runnable_qos(root_clutch, type);
835 if (highest_runnable_bucket == -1) {
836 /*
837 * The Clutch hierarchy has no runnable threads. We can continue running
838 * whatever was running previously.
839 */
840 assert(sched_clutch_root_count(root_clutch) == 0 || type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY);
841 *chose_prev_thread = true;
842 if (prev_thread != NULL) {
843 debug_info->trace_data.selection_was_edf = true;
844 }
845 return prev_bucket;
846 }
847
848 /* Consider Above UI threads, in comparison to Timeshare FG threads */
849 sched_clutch_root_bucket_t highest_aboveui_bucket = sched_clutch_root_highest_aboveui_root_bucket(root_clutch, type, prev_bucket, prev_thread, chose_prev_thread);
850 if (highest_aboveui_bucket != NULL) {
851 debug_info->trace_data.selection_was_edf = true;
852 return highest_aboveui_bucket;
853 }
854
855 /*
856 * Above UI bucket is not runnable or has a low priority runnable thread; use the
857 * earliest deadline model to schedule threads. The idea is that as the timeshare
858 * buckets use CPU, they will drop their interactivity score/sched priority and
859 * allow the low priority AboveUI buckets to be scheduled.
860 */
861
862 /* Find the earliest deadline bucket */
863 sched_clutch_root_bucket_t edf_bucket;
864 bool edf_bucket_enqueued_normally;
865
866 evaluate_root_buckets:
867 edf_bucket = NULL;
868 edf_bucket_enqueued_normally = true;
869
870 if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
871 edf_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
872 } else {
873 assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
874 sched_clutch_root_bucket_t unbound_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
875 sched_clutch_root_bucket_t bound_bucket = priority_queue_min(&root_clutch->scr_bound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
876 if (bound_bucket && unbound_bucket) {
877 /* If bound and unbound root buckets are runnable, select the one with the earlier deadline */
878 edf_bucket = (bound_bucket->scrb_pqlink.deadline <= unbound_bucket->scrb_pqlink.deadline) ? bound_bucket : unbound_bucket;
879 } else {
880 edf_bucket = (bound_bucket) ? bound_bucket : unbound_bucket;
881 }
882 }
883 if (edf_bucket == NULL) {
884 /* The timeshare portion of the runqueue is empty */
885 assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
886 assert(prev_thread != NULL);
887 *chose_prev_thread = true;
888 if (prev_thread != NULL) {
889 debug_info->trace_data.selection_was_edf = true;
890 }
891 return prev_bucket;
892 }
893 if (prev_bucket != NULL && prev_bucket->scrb_pqlink.deadline < edf_bucket->scrb_pqlink.deadline) {
894 /* The previous thread's root bucket has the earliest deadline and is not currently enqueued */
895 edf_bucket = prev_bucket;
896 edf_bucket_enqueued_normally = false;
897 }
898
899 if (edf_bucket->scrb_starvation_avoidance) {
900 /* Check if the EDF bucket is in an expired starvation avoidance window */
901 uint64_t starvation_window = sched_clutch_thread_quantum[edf_bucket->scrb_bucket];
902 if (timestamp >= (edf_bucket->scrb_starvation_ts + starvation_window)) {
903 /* Starvation avoidance window is over; update deadline and re-evaluate EDF */
904 edf_bucket->scrb_starvation_avoidance = false;
905 edf_bucket->scrb_starvation_ts = 0;
906 sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp, edf_bucket_enqueued_normally);
907 bit_set(debug_info->trace_data.starvation_avoidance_window_close, edf_bucket->scrb_bound * TH_BUCKET_SCHED_MAX + edf_bucket->scrb_bucket);
908 goto evaluate_root_buckets;
909 }
910 }
911
912 /*
	 * Check if any of the buckets have warp available. The implementation only allows root buckets to warp ahead of
	 * buckets of the same type (i.e. bound/unbound), because warping only makes sense between root buckets of the
	 * same type: it is effectively a scheduling advantage over a lower QoS root bucket.
917 */
918 bitmap_t *warp_available_bitmap = (edf_bucket->scrb_bound) ? (root_clutch->scr_bound_warp_available) : (root_clutch->scr_unbound_warp_available);
919 int warp_bucket_index = bitmap_lsb_first(warp_available_bitmap, TH_BUCKET_SCHED_MAX);
920
921 /* Allow the prev_bucket to use its warp as well */
922 bool prev_bucket_warping = (prev_bucket != NULL) && (prev_bucket->scrb_bound == edf_bucket->scrb_bound) &&
923 prev_bucket->scrb_bucket < edf_bucket->scrb_bucket && (prev_bucket->scrb_warp_remaining > 0) &&
924 (warp_bucket_index == -1 || prev_bucket->scrb_bucket < warp_bucket_index);
925
926 bool non_edf_bucket_can_warp = (warp_bucket_index != -1 && warp_bucket_index < edf_bucket->scrb_bucket) || prev_bucket_warping;
927
928 if (non_edf_bucket_can_warp == false) {
929 /* No higher buckets have warp left; best choice is the EDF based bucket */
930 debug_info->trace_data.selection_was_edf = true;
931
932 bool should_update_edf_starvation_state = edf_bucket == prev_bucket || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT;
933 if (edf_bucket->scrb_starvation_avoidance == false && should_update_edf_starvation_state) {
934 /* Looks like the EDF bucket is not in starvation avoidance mode; check if it should be */
935 if (highest_runnable_bucket < edf_bucket->scrb_bucket || (prev_bucket != NULL && prev_bucket->scrb_bucket < edf_bucket->scrb_bucket)) {
936 /*
937 * Since a higher bucket is runnable, it indicates that the EDF bucket should be in starvation avoidance.
938 *
939 * The starvation avoidance window is allocated as a single quantum for the starved bucket, enforced
940 * simultaneously across all CPUs in the cluster. The idea is to grant the starved bucket roughly one
941 * quantum per core, each time the bucket reaches the earliest deadline position. Note that this
942 * cadence is driven by the difference between the starved bucket's and highest-runnable bucket's WCELs.
943 */
944 edf_bucket->scrb_starvation_avoidance = true;
945 edf_bucket->scrb_starvation_ts = timestamp;
946 debug_info->trace_data.selection_opened_starvation_avoidance_window = true;
947 } else {
948 /* EDF bucket is being selected in the natural order; update deadline and reset warp */
949 sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp, edf_bucket_enqueued_normally);
950 edf_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[edf_bucket->scrb_bucket];
951 edf_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
952 if (edf_bucket_enqueued_normally) {
953 if (edf_bucket->scrb_bound) {
954 bitmap_set(root_clutch->scr_bound_warp_available, edf_bucket->scrb_bucket);
955 } else {
956 bitmap_set(root_clutch->scr_unbound_warp_available, edf_bucket->scrb_bucket);
957 }
958 }
959 }
960 }
961 *chose_prev_thread = !edf_bucket_enqueued_normally;
962 return edf_bucket;
963 }
964
965 /*
966 * Looks like there is a root bucket which is higher in the natural priority
967 * order than edf_bucket and might have some warp remaining.
968 */
969 assert(prev_bucket_warping || warp_bucket_index >= 0);
970 sched_clutch_root_bucket_t warp_bucket = NULL;
971 if (prev_bucket_warping) {
972 assert(warp_bucket_index == -1 || prev_bucket->scrb_bucket < warp_bucket_index);
973 warp_bucket = prev_bucket;
974 } else {
975 warp_bucket = (edf_bucket->scrb_bound) ? &root_clutch->scr_bound_buckets[warp_bucket_index] : &root_clutch->scr_unbound_buckets[warp_bucket_index];
976 }
977
978 bool warp_is_being_utilized = warp_bucket == prev_bucket || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT;
979
980 if (warp_bucket->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
981 if (warp_is_being_utilized) {
982 /* Root bucket has not used any of its warp; set a deadline to expire its warp and return it */
983 warp_bucket->scrb_warped_deadline = timestamp + warp_bucket->scrb_warp_remaining;
984 sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp, !prev_bucket_warping);
985 debug_info->trace_data.selection_opened_warp_window = true;
986 }
987 *chose_prev_thread = prev_bucket_warping;
988 debug_info->trace_data.selection_was_edf = false;
989 assert(warp_bucket != edf_bucket);
990 return warp_bucket;
991 }
992 if (warp_bucket->scrb_warped_deadline > timestamp) {
993 /* Root bucket already has a warp window open with some warp remaining */
994 if (warp_is_being_utilized) {
995 sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp, !prev_bucket_warping);
996 }
997 *chose_prev_thread = prev_bucket_warping;
998 debug_info->trace_data.selection_was_edf = false;
999 return warp_bucket;
1000 }
1001
1002 /*
1003 * For this bucket, warp window was opened sometime in the past but has now
1004 * expired. Mark the bucket as not available for warp anymore and re-run the
1005 * warp bucket selection logic.
1006 */
1007 warp_bucket->scrb_warp_remaining = 0;
1008 if (!prev_bucket_warping) {
1009 if (warp_bucket->scrb_bound) {
1010 bitmap_clear(root_clutch->scr_bound_warp_available, warp_bucket->scrb_bucket);
1011 } else {
1012 bitmap_clear(root_clutch->scr_unbound_warp_available, warp_bucket->scrb_bucket);
1013 }
1014 }
1015 bit_set(debug_info->trace_data.warp_window_close, warp_bucket->scrb_bound * TH_BUCKET_SCHED_MAX + warp_bucket->scrb_bucket);
1016 goto evaluate_root_buckets;
1017 }
1018
1019 /*
1020 * sched_clutch_root_bucket_deadline_calculate()
1021 *
1022 * Calculate the deadline for the bucket based on its WCEL
1023 */
1024 static uint64_t
sched_clutch_root_bucket_deadline_calculate(
	sched_clutch_root_bucket_t root_bucket,
	uint64_t timestamp)
1028 {
	/* For the fixpri AboveUI bucket, always return 0 so it is treated as the earliest deadline */
1030 if (root_bucket->scrb_bucket < TH_BUCKET_SHARE_FG) {
1031 return 0;
1032 }
1033
1034 /* For all timeshare buckets set the deadline as current time + worst-case-execution-latency */
1035 return timestamp + sched_clutch_root_bucket_wcel[root_bucket->scrb_bucket];
1036 }
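
/*
 * For example, with the default WCELs a TH_BUCKET_SHARE_DF root bucket whose
 * deadline is recalculated at timestamp T gets a deadline of T + 75ms, while
 * the TH_BUCKET_FIXPRI bucket always gets 0 (it is never scheduled via the
 * deadline priority queues; see sched_clutch_root_bucket_runnable()).
 */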
1037
1038 /*
1039 * sched_clutch_root_bucket_deadline_update()
1040 *
1041 * Routine to update the deadline of the root bucket when it is selected.
1042 * Updating the deadline also moves the root_bucket in the EDF priority
1043 * queue.
1044 */
1045 static void
sched_clutch_root_bucket_deadline_update(
	sched_clutch_root_bucket_t root_bucket,
	sched_clutch_root_t root_clutch,
	uint64_t timestamp,
	bool bucket_is_enqueued)
1051 {
1052 if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
1053 /* The algorithm never uses the deadlines for scheduling TH_BUCKET_FIXPRI bucket */
1054 return;
1055 }
1056
1057 uint64_t old_deadline = root_bucket->scrb_pqlink.deadline;
1058 uint64_t new_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
1059 if (__improbable(old_deadline > new_deadline)) {
1060 panic("old_deadline (%llu) > new_deadline (%llu); root_bucket (%d); timestamp (%llu)", old_deadline, new_deadline, root_bucket->scrb_bucket, timestamp);
1061 }
1062 if (old_deadline != new_deadline) {
1063 root_bucket->scrb_pqlink.deadline = new_deadline;
1064 if (bucket_is_enqueued) {
1065 struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
1066 priority_queue_entry_increased(prioq, &root_bucket->scrb_pqlink);
1067 }
1068 }
1069 }
1070
1071 /*
1072 * sched_clutch_root_bucket_runnable()
1073 *
1074 * Routine to insert a newly runnable root bucket into the hierarchy.
1075 * Also updates the deadline and warp parameters as necessary.
1076 */
1077 static void
sched_clutch_root_bucket_runnable(
	sched_clutch_root_bucket_t root_bucket,
	sched_clutch_root_t root_clutch,
	uint64_t timestamp)
1082 {
1083 /* Mark the root bucket as runnable */
1084 bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
1085 bitmap_set(runnable_bitmap, root_bucket->scrb_bucket);
1086
1087 if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
1088 /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
1089 return;
1090 }
1091
1092 if (root_bucket->scrb_starvation_avoidance == false) {
1093 /*
1094 * Only update the deadline if the bucket was not in starvation avoidance mode. If the bucket was in
1095 * starvation avoidance and its window has expired, the highest root bucket selection logic will notice
1096 * that and fix it up.
1097 */
1098 root_bucket->scrb_pqlink.deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
1099 }
1100 struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
1101 priority_queue_insert(prioq, &root_bucket->scrb_pqlink);
1102 if (root_bucket->scrb_warp_remaining) {
		/* Since the bucket has some warp remaining and it is now runnable, mark it as available for warp */
1104 bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
1105 bitmap_set(warp_bitmap, root_bucket->scrb_bucket);
1106 }
1107 }
1108
1109 /*
1110 * sched_clutch_root_bucket_empty()
1111 *
1112 * Routine to remove an empty root bucket from the hierarchy.
1113 * Also updates the deadline and warp parameters as necessary.
1114 */
1115 static void
sched_clutch_root_bucket_empty(
	sched_clutch_root_bucket_t root_bucket,
	sched_clutch_root_t root_clutch,
	uint64_t timestamp)
1120 {
1121 bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
1122 bitmap_clear(runnable_bitmap, root_bucket->scrb_bucket);
1123
1124 if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
1125 /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
1126 return;
1127 }
1128
1129 struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
1130 priority_queue_remove(prioq, &root_bucket->scrb_pqlink);
1131
1132 bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
1133 bitmap_clear(warp_bitmap, root_bucket->scrb_bucket);
1134
1135 if (root_bucket->scrb_warped_deadline != SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
1136 if (root_bucket->scrb_warped_deadline > timestamp) {
1137 /*
1138 * For root buckets that were using the warp, check if the warp
1139 * deadline is in the future. If yes, remove the wall time the
1140 * warp was active and update the warp remaining. This allows
1141 * the root bucket to use the remaining warp the next time it
1142 * becomes runnable.
1143 */
1144 root_bucket->scrb_warp_remaining = root_bucket->scrb_warped_deadline - timestamp;
1145 } else {
1146 /*
1147 * If the root bucket's warped deadline is in the past, it has used up
1148 * all the warp it was assigned. Empty out its warp remaining.
1149 */
1150 root_bucket->scrb_warp_remaining = 0;
1151 }
1152 }
1153 }
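
/*
 * Example of the warp carry-over above: a root bucket that opened an 8ms warp
 * window and goes empty 3ms into it keeps the remaining 5ms in
 * scrb_warp_remaining for the next time it becomes runnable, whereas a bucket
 * that goes empty after its warped deadline has passed retains no warp.
 */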
1154
1155 static int
sched_clutch_global_bucket_load_get(
	sched_bucket_t bucket)
1158 {
1159 return (int)os_atomic_load(&sched_clutch_global_bucket_load[bucket], relaxed);
1160 }
1161
1162 /*
1163 * sched_clutch_root_pri_update()
1164 *
1165 * The root level priority is used for thread selection and preemption
1166 * logic.
1167 *
1168 * The logic uses the same decision as thread selection for deciding between the
1169 * above UI and timeshare buckets. If one of the timesharing buckets have to be
1170 * used for priority calculation, the logic is slightly different from thread
1171 * selection, because thread selection considers deadlines, warps etc. to
1172 * decide the most optimal bucket at a given timestamp. Since the priority
1173 * value is used for preemption decisions only, it needs to be based on the
1174 * highest runnable thread available in the timeshare domain. This logic can
1175 * be made more sophisticated if there are cases of unnecessary preemption
1176 * being seen in workloads.
1177 */
1178 static void
sched_clutch_root_pri_update(
	sched_clutch_root_t root_clutch)
1181 {
1182 sched_clutch_hierarchy_locked_assert(root_clutch);
1183 int16_t root_bound_pri = NOPRI;
1184 int16_t root_unbound_pri = NOPRI;
1185
1186 /* Consider bound root buckets */
1187 if (bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
1188 goto root_pri_update_unbound;
1189 }
1190 sched_clutch_root_bucket_t highest_bound_root_bucket = NULL;
1191 __unused int highest_bound_root_bucket_pri = -1;
1192 bool highest_bound_root_bucket_is_fixpri = false;
1193 sched_clutch_root_bound_select_aboveui(root_clutch, &highest_bound_root_bucket, &highest_bound_root_bucket_pri, &highest_bound_root_bucket_is_fixpri, NULL, NULL);
1194 if (highest_bound_root_bucket_is_fixpri == false) {
1195 int root_bucket_index = bitmap_lsb_next(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
1196 assert(root_bucket_index != -1);
1197 highest_bound_root_bucket = &root_clutch->scr_bound_buckets[root_bucket_index];
1198 }
1199 root_bound_pri = highest_bound_root_bucket->scrb_bound_thread_runq.highq;
1200
1201 root_pri_update_unbound:
1202 /* Consider unbound root buckets */
1203 if (bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
1204 goto root_pri_update_complete;
1205 }
1206 sched_clutch_root_bucket_t highest_unbound_root_bucket = NULL;
1207 __unused int highest_unbound_root_bucket_pri = -1;
1208 bool highest_unbound_root_bucket_is_fixpri = false;
1209 sched_clutch_root_unbound_select_aboveui(root_clutch, &highest_unbound_root_bucket, &highest_unbound_root_bucket_pri, &highest_unbound_root_bucket_is_fixpri, NULL, NULL);
1210 if (highest_unbound_root_bucket_is_fixpri == false) {
1211 int root_bucket_index = bitmap_lsb_next(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
1212 assert(root_bucket_index != -1);
1213 highest_unbound_root_bucket = &root_clutch->scr_unbound_buckets[root_bucket_index];
1214 }
1215
1216 /* For the selected root bucket, find the highest priority clutch bucket */
1217 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, highest_unbound_root_bucket, NULL, NULL, NULL);
1218 root_unbound_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
1219
1220 root_pri_update_complete:
1221 root_clutch->scr_priority = MAX(root_bound_pri, root_unbound_pri);
1222 }
1223
1224 /*
1225 * sched_clutch_root_urgency_inc()
1226 *
1227 * Routine to increment the urgency at the root level based on the thread
1228 * priority that is being inserted into the hierarchy. The root urgency
1229 * counter is updated based on the urgency of threads in any of the
1230 * clutch buckets which are part of the hierarchy.
1231 *
1232 * Always called with the pset lock held.
1233 */
1234 static void
1235 sched_clutch_root_urgency_inc(
1236 sched_clutch_root_t root_clutch,
1237 thread_t thread)
1238 {
1239 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
1240 root_clutch->scr_urgency++;
1241 }
1242 }
1243
1244 /*
1245 * sched_clutch_root_urgency_dec()
1246 *
1247 * Routine to decrement the urgency at the root level based on the thread
1248 * priority that is being removed from the hierarchy. The root urgency
1249 * counter is updated based on the urgency of threads in any of the
1250 * clutch buckets which are part of the hierarchy.
1251 *
1252 * Always called with the pset lock held.
1253 */
1254 static void
1255 sched_clutch_root_urgency_dec(
1256 sched_clutch_root_t root_clutch,
1257 thread_t thread)
1258 {
1259 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
1260 root_clutch->scr_urgency--;
1261 }
1262 }
1263
1264 /*
1265 * Clutch bucket level scheduling
1266 *
1267 * The second level of scheduling is the clutch bucket level scheduling
1268 * which tries to schedule thread groups within root_buckets. Each
1269 * clutch represents a thread group and a clutch_bucket_group represents
1270 * threads at a particular sched_bucket within that thread group. The
1271 * clutch_bucket_group contains a clutch_bucket per cluster on the system
1272 * where it holds the runnable threads destined for execution on that
1273 * cluster.
1274 *
1275 * The goal of this level of scheduling is to allow interactive thread
1276 * groups low latency access to the CPU. It also provides slight
1277 * scheduling preference for App and unrestricted thread groups.
1278 *
1279 * The clutch bucket scheduling algorithm measures an interactivity
1280 * score for all clutch bucket groups. The interactivity score is based
1281 * on the ratio of the CPU used and the voluntary blocking of threads
1282 * within the clutch bucket group. The algorithm is very close to the ULE
1283 * scheduler on FreeBSD in terms of calculations. The interactivity
1284 * score provides an interactivity boost in the range of
1285 * [0:SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI * 2] which allows interactive
1286 * thread groups to win over CPU spinners.
1287 *
1288 * The interactivity score of the clutch bucket group is combined with the
1289 * highest base/promoted priority of threads in the clutch bucket to form
1290 * the overall priority of the clutch bucket.
1291 */
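
/*
 * Illustrative example (numbers are made up; they assume the default
 * sched_clutch_bucket_group_interactive_pri of 8 and follow the formula in
 * sched_clutch_interactivity_from_cpu_data() below):
 *
 *   Mostly-blocked group:  cpu_blocked = 300, cpu_used = 100
 *     score = 8 + (8 * (300 - 100)) / 300 = 8 + 5 = 13
 *   Mostly-spinning group: cpu_blocked = 100, cpu_used = 400
 *     score = (8 * 100) / 400 = 2
 *
 * A fully blocked group reaches the maximum boost of 16 (2 * interactive pri),
 * while a pure CPU spinner scores close to 0, which lets interactive thread
 * groups win over spinners of equal base priority.
 */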
1292
1293 /* Priority boost range for interactivity */
1294 #define SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT (8)
1295 static uint8_t sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
1296
1297 /* Window to scale the CPU usage and blocked values (currently 500ms). It's the threshold of used+blocked. */
1298 static uint64_t sched_clutch_bucket_group_adjust_threshold = 0;
1299 #define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS (500000)
1300
1301 /* The ratio to scale the cpu/blocked time per window */
1302 #define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO (10)
1303
1304 /* Initial value for voluntary blocking time for the clutch_bucket */
1305 #define SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID (uint64_t)(~0)
1306
1307 /* Value indicating the clutch bucket is not pending execution */
1308 #define SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID ((uint64_t)(~0))
1309
1310 /*
1311 * Thread group CPU starvation avoidance
1312 *
1313 * In heavily CPU contended scenarios, it is possible that some thread groups
1314 * which have a low interactivity score do not get CPU time at all. In order to
1315 * resolve that, the scheduler tries to ageout the CPU usage of the clutch
1316 * bucket group when it has been pending execution for a certain time as defined
1317 * by the sched_clutch_bucket_group_pending_delta_us values below.
1318 *
1319 * The values chosen here are very close to the WCEL values for each sched bucket.
1320  * These values are added into the pending interval used to determine how
1321 * frequently we will ageout the CPU usage, ensuring a reasonable limit on the
1322 * frequency.
1323 */
1324 static uint32_t sched_clutch_bucket_group_pending_delta_us[TH_BUCKET_SCHED_MAX] = {
1325 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
1326 10000, /* FG */
1327 37500, /* IN */
1328 75000, /* DF */
1329 150000, /* UT */
1330 250000, /* BG */
1331 };
1332 static uint64_t sched_clutch_bucket_group_pending_delta[TH_BUCKET_SCHED_MAX] = {0};
1333
1334 /*
1335 * sched_clutch_bucket_init()
1336 *
1337 * Initializer for clutch buckets.
1338 */
1339 static void
1340 sched_clutch_bucket_init(
1341 sched_clutch_bucket_t clutch_bucket,
1342 sched_clutch_bucket_group_t clutch_bucket_group,
1343 sched_bucket_t bucket)
1344 {
1345 clutch_bucket->scb_bucket = bucket;
1346 /* scb_priority will be recalculated when a thread is inserted in the clutch bucket */
1347 clutch_bucket->scb_priority = 0;
1348 #if CONFIG_SCHED_EDGE
1349 clutch_bucket->scb_foreign = false;
1350 priority_queue_entry_init(&clutch_bucket->scb_foreignlink);
1351 #endif /* CONFIG_SCHED_EDGE */
1352 clutch_bucket->scb_group = clutch_bucket_group;
1353 clutch_bucket->scb_root = NULL;
1354 priority_queue_init(&clutch_bucket->scb_clutchpri_prioq);
1355 priority_queue_init(&clutch_bucket->scb_thread_runq);
1356 queue_init(&clutch_bucket->scb_thread_timeshare_queue);
1357 }
1358
1359 /*
1360 * sched_clutch_bucket_group_init()
1361 *
1362 * Initializer for clutch bucket groups.
1363 */
1364 static void
1365 sched_clutch_bucket_group_init(
1366 sched_clutch_bucket_group_t clutch_bucket_group,
1367 sched_clutch_t clutch,
1368 sched_bucket_t bucket)
1369 {
1370 bzero(clutch_bucket_group, sizeof(struct sched_clutch_bucket_group));
1371 clutch_bucket_group->scbg_bucket = bucket;
1372 clutch_bucket_group->scbg_clutch = clutch;
1373
1374 int max_clusters = ml_get_cluster_count();
1375 clutch_bucket_group->scbg_clutch_buckets = kalloc_type(struct sched_clutch_bucket, max_clusters, Z_WAITOK | Z_ZERO);
1376 for (int i = 0; i < max_clusters; i++) {
1377 sched_clutch_bucket_init(&clutch_bucket_group->scbg_clutch_buckets[i], clutch_bucket_group, bucket);
1378 }
1379
1380 os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, 0, relaxed);
1381 os_atomic_store(&clutch_bucket_group->scbg_pri_shift, INT8_MAX, relaxed);
1382 os_atomic_store(&clutch_bucket_group->scbg_preferred_cluster, pset0.pset_cluster_id, relaxed);
1383 /*
1384 * All thread groups should be initialized to be interactive; this allows the newly launched
1385 * thread groups to fairly compete with already running thread groups.
1386 */
1387 clutch_bucket_group->scbg_interactivity_data.scct_count = (sched_clutch_bucket_group_interactive_pri * 2);
1388 clutch_bucket_group->scbg_interactivity_data.scct_timestamp = 0;
1389 os_atomic_store(&clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked, (clutch_cpu_data_t)sched_clutch_bucket_group_adjust_threshold, relaxed);
1390 clutch_bucket_group->scbg_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
1391 clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
1392 clutch_bucket_group->scbg_amp_rebalance_last_chosen = UINT32_MAX;
1393 }
1394
1395 static void
1396 sched_clutch_bucket_group_destroy(
1397 sched_clutch_bucket_group_t clutch_bucket_group)
1398 {
1399 kfree_type(struct sched_clutch_bucket, ml_get_cluster_count(),
1400 clutch_bucket_group->scbg_clutch_buckets);
1401 }
1402
1403 /*
1404 * sched_clutch_init_with_thread_group()
1405 *
1406 * Initialize the sched_clutch when the thread group is being created
1407 */
1408 void
1409 sched_clutch_init_with_thread_group(
1410 sched_clutch_t clutch,
1411 struct thread_group *tg)
1412 {
1413 os_atomic_store(&clutch->sc_thr_count, 0, relaxed);
1414
1415 /* Initialize all the clutch buckets */
1416 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
1417 sched_clutch_bucket_group_init(&(clutch->sc_clutch_groups[i]), clutch, i);
1418 }
1419
1420 /* Grouping specific fields */
1421 clutch->sc_tg = tg;
1422 }
1423
1424 /*
1425 * sched_clutch_destroy()
1426 *
1427 * Destructor for clutch; called from thread group release code.
1428 */
1429 void
1430 sched_clutch_destroy(
1431 sched_clutch_t clutch)
1432 {
1433 assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0);
1434 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
1435 sched_clutch_bucket_group_destroy(&(clutch->sc_clutch_groups[i]));
1436 }
1437 }
1438
1439 #if CONFIG_SCHED_EDGE
1440
1441 /*
1442 * Edge Scheduler Preferred Cluster Mechanism
1443 *
1444 * In order to have better control over various QoS buckets within a thread group, the Edge
1445 * scheduler allows CLPC to specify a preferred cluster for each QoS level in a TG. These
1446 * preferences are stored at the sched_clutch_bucket_group level since that represents all
1447 * threads at a particular QoS level within a sched_clutch. For any lookup of preferred
1448 * cluster, the logic always goes back to the preference stored at the clutch_bucket_group.
1449 */
1450
1451 static uint32_t
1452 sched_edge_clutch_bucket_group_preferred_cluster(sched_clutch_bucket_group_t clutch_bucket_group)
1453 {
1454 return os_atomic_load(&clutch_bucket_group->scbg_preferred_cluster, relaxed);
1455 }
1456
1457 static uint32_t
1458 sched_clutch_bucket_preferred_cluster(sched_clutch_bucket_t clutch_bucket)
1459 {
1460 return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket->scb_group);
1461 }
1462
1463 uint32_t
1464 sched_edge_thread_preferred_cluster(thread_t thread)
1465 {
1466 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1467 /* For threads bound to a specific cluster, return the bound cluster id */
1468 return sched_edge_thread_bound_cluster_id(thread);
1469 }
1470
1471 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1472 sched_bucket_t sched_bucket = thread->th_sched_bucket;
1473 if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
1474 sched_bucket = sched_clutch_thread_bucket_map(thread, thread->base_pri);
1475 }
1476 sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[sched_bucket];
1477 return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket_group);
1478 }
1479
1480 /*
1481 * Edge Scheduler Foreign Bucket Support
1482 *
1483 * In the Edge Scheduler, each cluster maintains a priority queue of clutch buckets containing
1484 * threads that are not native to the cluster. A clutch bucket is considered native if its
1485 * preferred cluster has the same type as the cluster its enqueued in. The foreign clutch
1486 * bucket priority queue is used for rebalance operations to get threads back to their native
1487 * cluster quickly.
1488 *
1489 * It is possible to make this policy even more aggressive by considering all clusters that
1490 * are not the preferred cluster as the foreign cluster, but that would mean a lot of thread
1491 * migrations which might have performance implications.
1492 */
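
/*
 * For example (illustrative), on an asymmetric platform a clutch bucket whose
 * preferred cluster is a P-core cluster but which is being enqueued on an
 * E-core cluster hierarchy is considered foreign, since the two clusters have
 * different pset types; sched_clutch_bucket_hierarchy_insert() below performs
 * exactly this pset-type comparison before calling
 * sched_clutch_bucket_mark_foreign().
 */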
1493
1494 static void
1495 sched_clutch_bucket_mark_native(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
1496 {
1497 if (clutch_bucket->scb_foreign) {
1498 clutch_bucket->scb_foreign = false;
1499 priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1500 }
1501 }
1502
1503 static void
1504 sched_clutch_bucket_mark_foreign(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
1505 {
1506 if (!clutch_bucket->scb_foreign) {
1507 clutch_bucket->scb_foreign = true;
1508 priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
1509 priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1510 }
1511 }
1512
1513 /*
1514 * Edge Scheduler Cumulative Load Average
1515 *
1516 * The Edge scheduler maintains a per-QoS/scheduling bucket load average for
1517 * making thread migration decisions. The per-bucket load is maintained as a
1518 * cumulative count since higher scheduling buckets impact load on lower buckets
1519 * for thread migration decisions.
1520 *
1521 */
1522
1523 static void
1524 sched_edge_cluster_cumulative_count_incr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1525 {
1526 switch (bucket) {
1527 case TH_BUCKET_FIXPRI: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1528 case TH_BUCKET_SHARE_FG: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1529 case TH_BUCKET_SHARE_IN: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1530 case TH_BUCKET_SHARE_DF: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1531 case TH_BUCKET_SHARE_UT: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1532 case TH_BUCKET_SHARE_BG: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1533 default:
1534 panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_incr()");
1535 }
1536 }
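
/*
 * Effect of the fall-through above (illustrative): a runnable UT thread bumps
 * the UT and BG counters, while a FIXPRI thread bumps every counter. As a
 * result, scr_cumulative_run_count[bucket] reflects the threads at that
 * scheduling bucket plus all higher (more important) buckets, which is the
 * cumulative load the thread migration logic cares about.
 */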
1537
1538 static void
1539 sched_edge_cluster_cumulative_count_decr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1540 {
1541 switch (bucket) {
1542 case TH_BUCKET_FIXPRI: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1543 case TH_BUCKET_SHARE_FG: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1544 case TH_BUCKET_SHARE_IN: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1545 case TH_BUCKET_SHARE_DF: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1546 case TH_BUCKET_SHARE_UT: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1547 case TH_BUCKET_SHARE_BG: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1548 default:
1549 panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_decr()");
1550 }
1551 }
1552
1553 uint16_t
1554 sched_edge_cluster_cumulative_count(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1555 {
1556 return os_atomic_load(&root_clutch->scr_cumulative_run_count[bucket], relaxed);
1557 }
1558
1559 #endif /* CONFIG_SCHED_EDGE */
1560
1561 /*
1562 * sched_clutch_bucket_hierarchy_insert()
1563 *
1564 * Routine to insert a newly runnable clutch_bucket into the root hierarchy.
1565 */
1566 static void
1567 sched_clutch_bucket_hierarchy_insert(
1568 sched_clutch_root_t root_clutch,
1569 sched_clutch_bucket_t clutch_bucket,
1570 sched_bucket_t bucket,
1571 uint64_t timestamp,
1572 sched_clutch_bucket_options_t options)
1573 {
1574 sched_clutch_hierarchy_locked_assert(root_clutch);
1575 if (bucket > TH_BUCKET_FIXPRI) {
1576 /* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */
1577 enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink);
1578 }
1579 #if CONFIG_SCHED_EDGE
1580 /* Check if the bucket is a foreign clutch bucket and add it to the foreign buckets list */
1581 uint32_t preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
1582 if (pset_type_for_id(preferred_cluster) != pset_type_for_id(root_clutch->scr_cluster_id)) {
1583 sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
1584 }
1585 #endif /* CONFIG_SCHED_EDGE */
1586 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1587
1588 /* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */
1589 if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1590 sched_clutch_root_bucket_runnable(root_bucket, root_clutch, timestamp);
1591 }
1592
1593 /* Insert the clutch bucket into the root bucket run queue with order based on options */
1594 sched_clutch_bucket_runq_enqueue(&root_bucket->scrb_clutch_buckets, clutch_bucket, options);
1595 os_atomic_store(&clutch_bucket->scb_root, root_clutch, relaxed);
1596 os_atomic_inc(&sched_clutch_global_bucket_load[bucket], relaxed);
1597 }
1598
1599 /*
1600 * sched_clutch_bucket_hierarchy_remove()
1601 *
1602  * Routine to remove an empty clutch bucket from the root hierarchy.
1603 */
1604 static void
1605 sched_clutch_bucket_hierarchy_remove(
1606 sched_clutch_root_t root_clutch,
1607 sched_clutch_bucket_t clutch_bucket,
1608 sched_bucket_t bucket,
1609 uint64_t timestamp,
1610 __unused sched_clutch_bucket_options_t options)
1611 {
1612 sched_clutch_hierarchy_locked_assert(root_clutch);
1613 if (bucket > TH_BUCKET_FIXPRI) {
1614 /* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */
1615 remqueue(&clutch_bucket->scb_listlink);
1616 }
1617 #if CONFIG_SCHED_EDGE
1618 sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
1619 #endif /* CONFIG_SCHED_EDGE */
1620
1621 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1622
1623 /* Remove the clutch bucket from the root bucket priority queue */
1624 sched_clutch_bucket_runq_remove(&root_bucket->scrb_clutch_buckets, clutch_bucket);
1625 os_atomic_store(&clutch_bucket->scb_root, NULL, relaxed);
1626
1627 /* If the root bucket priority queue is now empty, remove it from the root priority queue */
1628 if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1629 sched_clutch_root_bucket_empty(root_bucket, root_clutch, timestamp);
1630 }
1631 os_atomic_dec(&sched_clutch_global_bucket_load[bucket], relaxed);
1632 }
1633
1634 /*
1635 * sched_clutch_bucket_base_pri()
1636 *
1637 * Calculates the "base" priority of the clutch bucket, which is equal to the max of the
1638 * highest base_pri and the highest sched_pri in the clutch bucket.
1639 */
1640 static uint8_t
1641 sched_clutch_bucket_base_pri(
1642 sched_clutch_bucket_t clutch_bucket)
1643 {
1644 assert(priority_queue_empty(&clutch_bucket->scb_thread_runq) == false);
1645 /*
1646 * Since the clutch bucket can contain threads that are members of the group due
1647 * to the sched_pri being promoted or due to their base pri, the base priority of
1648 * the entire clutch bucket should be based on the highest thread (promoted or base)
1649 * in the clutch bucket.
1650 */
1651 uint8_t max_pri = 0;
1652 if (!priority_queue_empty(&clutch_bucket->scb_clutchpri_prioq)) {
1653 max_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
1654 }
1655 return max_pri;
1656 }
1657
1658 /*
1659 * sched_clutch_interactivity_from_cpu_data()
1660 *
1661 * Routine to calculate the interactivity score of a clutch bucket group from its CPU usage
1662 */
1663 static uint8_t
1664 sched_clutch_interactivity_from_cpu_data(sched_clutch_bucket_group_t clutch_bucket_group)
1665 {
1666 sched_clutch_bucket_cpu_data_t scb_cpu_data;
1667 scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, relaxed);
1668 clutch_cpu_data_t cpu_used = scb_cpu_data.cpu_data.scbcd_cpu_used;
1669 clutch_cpu_data_t cpu_blocked = scb_cpu_data.cpu_data.scbcd_cpu_blocked;
1670 uint8_t interactive_score = 0;
1671
1672 if ((cpu_blocked == 0) && (cpu_used == 0)) {
1673 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
1674 }
1675 /*
1676 * For all timeshare buckets, calculate the interactivity score of the bucket
1677 * and add it to the base priority
1678 */
1679 if (cpu_blocked > cpu_used) {
1680 /* Interactive clutch_bucket case */
1681 interactive_score = sched_clutch_bucket_group_interactive_pri +
1682 ((sched_clutch_bucket_group_interactive_pri * (cpu_blocked - cpu_used)) / cpu_blocked);
1683 } else {
1684 /* Non-interactive clutch_bucket case */
1685 interactive_score = ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) / cpu_used);
1686 }
1687 return interactive_score;
1688 }
1689
1690 /*
1691 * sched_clutch_bucket_pri_calculate()
1692 *
1693 * The priority calculation algorithm for the clutch_bucket is a slight
1694 * modification on the ULE interactivity score. It uses the base priority
1695 * of the clutch bucket and applies an interactivity score boost to the
1696 * highly responsive clutch buckets.
1697 */
1698 static uint8_t
1699 sched_clutch_bucket_pri_calculate(
1700 sched_clutch_bucket_t clutch_bucket,
1701 uint64_t timestamp)
1702 {
1703 /* For empty clutch buckets, return priority 0 */
1704 if (clutch_bucket->scb_thr_count == 0) {
1705 return 0;
1706 }
1707
1708 uint8_t base_pri = sched_clutch_bucket_base_pri(clutch_bucket);
1709 uint8_t interactive_score = sched_clutch_bucket_group_interactivity_score_calculate(clutch_bucket->scb_group, timestamp);
1710
1711 assert(((uint64_t)base_pri + interactive_score) <= UINT8_MAX);
1712 uint8_t pri = base_pri + interactive_score;
1713 if (pri != clutch_bucket->scb_priority) {
1714 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_PRI) | DBG_FUNC_NONE,
1715 thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, pri, interactive_score, 0);
1716 }
1717 return pri;
1718 }
1719
1720 /*
1721 * sched_clutch_root_bucket_highest_clutch_bucket()
1722 *
1723 * Routine to find the highest priority clutch bucket
1724 * within the root bucket.
1725 */
1726 static sched_clutch_bucket_t
1727 sched_clutch_root_bucket_highest_clutch_bucket(
1728 sched_clutch_root_t root_clutch,
1729 sched_clutch_root_bucket_t root_bucket,
1730 processor_t _Nullable processor,
1731 thread_t _Nullable prev_thread,
1732 bool *_Nullable chose_prev_thread)
1733 {
1734 if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1735 if (prev_thread != NULL) {
1736 *chose_prev_thread = true;
1737 return sched_clutch_bucket_for_thread(root_clutch, prev_thread);
1738 }
1739 return NULL;
1740 }
1741 sched_clutch_bucket_t clutch_bucket = sched_clutch_bucket_runq_peek(&root_bucket->scrb_clutch_buckets);
1742 /* Consider the Clutch bucket of the previous thread */
1743 if (prev_thread != NULL) {
1744 assert(chose_prev_thread != NULL);
1745 sched_clutch_bucket_group_t prev_clutch_bucket_group = sched_clutch_bucket_group_for_thread(prev_thread);
1746 int prev_clutch_bucket_pri = prev_thread->sched_pri + (int)(os_atomic_load(&prev_clutch_bucket_group->scbg_interactivity_data.scct_count, relaxed));
1747 sched_clutch_bucket_t prev_clutch_bucket = sched_clutch_bucket_for_thread(root_clutch, prev_thread);
1748 if (prev_clutch_bucket != clutch_bucket &&
1749 sched_clutch_pri_greater_than_tiebreak(prev_clutch_bucket_pri, clutch_bucket->scb_priority, processor->first_timeslice)) {
1750 *chose_prev_thread = true;
1751 return prev_clutch_bucket;
1752 }
1753 }
1754 return clutch_bucket;
1755 }
1756
1757 /*
1758 * sched_clutch_bucket_runnable()
1759 *
1760 * Perform all operations needed when a new clutch bucket becomes runnable.
1761 * It involves inserting the clutch_bucket into the hierarchy and updating the
1762 * root priority appropriately.
1763 */
1764 static boolean_t
1765 sched_clutch_bucket_runnable(
1766 sched_clutch_bucket_t clutch_bucket,
1767 sched_clutch_root_t root_clutch,
1768 uint64_t timestamp,
1769 sched_clutch_bucket_options_t options)
1770 {
1771 sched_clutch_hierarchy_locked_assert(root_clutch);
1772 /* Since the clutch bucket became newly runnable, update its pending timestamp */
1773 clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1774 sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1775
1776 /* Update the timesharing properties of this clutch_bucket; also done every sched_tick */
1777 sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, timestamp);
1778 int16_t root_old_pri = root_clutch->scr_priority;
1779 sched_clutch_root_pri_update(root_clutch);
1780 return root_clutch->scr_priority > root_old_pri;
1781 }
1782
1783 /*
1784 * sched_clutch_bucket_update()
1785 *
1786 * Update the clutch_bucket's position in the hierarchy. This routine is
1787 * called when a new thread is inserted or removed from a runnable clutch
1788 * bucket. The options specify some properties about the clutch bucket
1789 * insertion order into the clutch bucket runq.
1790 */
1791 static boolean_t
1792 sched_clutch_bucket_update(
1793 sched_clutch_bucket_t clutch_bucket,
1794 sched_clutch_root_t root_clutch,
1795 uint64_t timestamp,
1796 sched_clutch_bucket_options_t options)
1797 {
1798 sched_clutch_hierarchy_locked_assert(root_clutch);
1799 uint64_t new_pri = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1800 sched_clutch_bucket_runq_t bucket_runq = &root_clutch->scr_unbound_buckets[clutch_bucket->scb_bucket].scrb_clutch_buckets;
1801 if (new_pri == clutch_bucket->scb_priority) {
1802 /*
1803 * If SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR is specified, move the clutch bucket
1804 * to the end of the runq. Typically used when a thread is selected for execution
1805 * from a clutch bucket.
1806 */
1807 if (options & SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR) {
1808 sched_clutch_bucket_runq_rotate(bucket_runq, clutch_bucket);
1809 }
1810 return false;
1811 }
1812 sched_clutch_bucket_runq_remove(bucket_runq, clutch_bucket);
1813 #if CONFIG_SCHED_EDGE
1814 if (clutch_bucket->scb_foreign) {
1815 priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1816 }
1817 #endif /* CONFIG_SCHED_EDGE */
1818 clutch_bucket->scb_priority = new_pri;
1819 #if CONFIG_SCHED_EDGE
1820 if (clutch_bucket->scb_foreign) {
1821 priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
1822 priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1823 }
1824 #endif /* CONFIG_SCHED_EDGE */
1825 sched_clutch_bucket_runq_enqueue(bucket_runq, clutch_bucket, options);
1826
1827 int16_t root_old_pri = root_clutch->scr_priority;
1828 sched_clutch_root_pri_update(root_clutch);
1829 return root_clutch->scr_priority > root_old_pri;
1830 }
1831
1832 /*
1833 * sched_clutch_bucket_empty()
1834 *
1835 * Perform all the operations needed when a clutch_bucket is no longer runnable.
1836  * It involves removing the clutch bucket from the hierarchy and updating the root
1837 * priority appropriately.
1838 */
1839 static void
1840 sched_clutch_bucket_empty(
1841 sched_clutch_bucket_t clutch_bucket,
1842 sched_clutch_root_t root_clutch,
1843 uint64_t timestamp,
1844 sched_clutch_bucket_options_t options)
1845 {
1846 sched_clutch_hierarchy_locked_assert(root_clutch);
1847 sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1848 clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1849 sched_clutch_root_pri_update(root_clutch);
1850 }
1851
1852 /*
1853 * sched_clutch_cpu_usage_update()
1854 *
1855 * Routine to update CPU usage of the thread in the hierarchy.
1856 */
1857 void
1858 sched_clutch_cpu_usage_update(
1859 thread_t thread,
1860 uint64_t delta)
1861 {
1862 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread) || SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1863 return;
1864 }
1865
1866 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1867 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
1868 sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, delta);
1869 }
1870
1871 /*
1872 * sched_clutch_bucket_group_cpu_usage_update()
1873 *
1874 * Routine to update the CPU usage of the clutch_bucket.
1875 */
1876 static void
1877 sched_clutch_bucket_group_cpu_usage_update(
1878 sched_clutch_bucket_group_t clutch_bucket_group,
1879 uint64_t delta)
1880 {
1881 if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
1882 		/* Since the Above UI bucket always has the maximum interactivity score, nothing to do here */
1883 return;
1884 }
1885 delta = MIN(delta, sched_clutch_bucket_group_adjust_threshold);
1886 os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_used), (clutch_cpu_data_t)delta, relaxed);
1887 }
1888
1889 /*
1890 * sched_clutch_bucket_group_cpu_pending_adjust()
1891 *
1892 * Routine to calculate the adjusted CPU usage value based on the pending intervals. The calculation is done
1893 * such that one "pending interval" provides one point improvement in interactivity score.
1894 */
1895 static inline uint64_t
1896 sched_clutch_bucket_group_cpu_pending_adjust(
1897 uint64_t cpu_used,
1898 uint64_t cpu_blocked,
1899 uint8_t pending_intervals)
1900 {
1901 uint64_t cpu_used_adjusted = 0;
1902 if (cpu_blocked < cpu_used) {
1903 cpu_used_adjusted = (sched_clutch_bucket_group_interactive_pri * cpu_blocked * cpu_used);
1904 cpu_used_adjusted = cpu_used_adjusted / ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) + (cpu_used * pending_intervals));
1905 } else {
1906 uint64_t adjust_factor = (cpu_blocked * pending_intervals) / sched_clutch_bucket_group_interactive_pri;
1907 cpu_used_adjusted = (adjust_factor > cpu_used) ? 0 : (cpu_used - adjust_factor);
1908 }
1909 return cpu_used_adjusted;
1910 }
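
/*
 * Worked example (illustrative values, assuming the default interactive pri
 * of 8): a CPU-heavy group with cpu_used = 400 and cpu_blocked = 100 has an
 * interactivity score of (8 * 100) / 400 = 2. With pending_intervals = 1, the
 * cpu_blocked < cpu_used branch above computes
 *
 *   cpu_used_adjusted = (8 * 100 * 400) / ((8 * 100) + (400 * 1)) = 266
 *
 * which lifts the score to (8 * 100) / 266 = 3, i.e. one pending interval
 * buys roughly one point of interactivity score, as intended.
 */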
1911
1912 /*
1913 * sched_clutch_bucket_group_cpu_adjust()
1914 *
1915 * Routine to scale the cpu usage and blocked time once the sum gets bigger
1916 * than sched_clutch_bucket_group_adjust_threshold. Allows the values to remain
1917 * manageable and maintain the same ratio while allowing clutch buckets to
1918 * adjust behavior and reflect in the interactivity score in a reasonable
1919 * amount of time. Also adjusts the CPU usage based on pending_intervals
1920 * which allows ageout of CPU to avoid starvation in highly contended scenarios.
1921 */
1922 static void
1923 sched_clutch_bucket_group_cpu_adjust(
1924 sched_clutch_bucket_group_t clutch_bucket_group,
1925 uint8_t pending_intervals)
1926 {
1927 sched_clutch_bucket_cpu_data_t old_cpu_data = {};
1928 sched_clutch_bucket_cpu_data_t new_cpu_data = {};
1929 os_atomic_rmw_loop(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, old_cpu_data.scbcd_cpu_data_packed, new_cpu_data.scbcd_cpu_data_packed, relaxed, {
1930 clutch_cpu_data_t cpu_used = old_cpu_data.cpu_data.scbcd_cpu_used;
1931 clutch_cpu_data_t cpu_blocked = old_cpu_data.cpu_data.scbcd_cpu_blocked;
1932
1933 if ((pending_intervals == 0) && (cpu_used + cpu_blocked) < sched_clutch_bucket_group_adjust_threshold) {
1934 /* No changes to the CPU used and blocked values */
1935 os_atomic_rmw_loop_give_up();
1936 }
1937 if ((cpu_used + cpu_blocked) >= sched_clutch_bucket_group_adjust_threshold) {
1938 /* Only keep the recent CPU history to better indicate how this TG has been behaving */
1939 cpu_used = cpu_used / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
1940 cpu_blocked = cpu_blocked / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
1941 }
1942 /* Use the shift passed in to ageout the CPU usage */
1943 cpu_used = (clutch_cpu_data_t)sched_clutch_bucket_group_cpu_pending_adjust(cpu_used, cpu_blocked, pending_intervals);
1944 new_cpu_data.cpu_data.scbcd_cpu_used = cpu_used;
1945 new_cpu_data.cpu_data.scbcd_cpu_blocked = cpu_blocked;
1946 });
1947 }
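
/*
 * Illustrative scaling example (abstract units; the real values are in mach
 * absolute time): with cpu_used = 400, cpu_blocked = 200 and an adjust
 * threshold of 500, the sum (600) crosses the threshold, so both values are
 * divided by SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO to become 40 and 20. The
 * 2:1 ratio, and therefore the interactivity score, is preserved while recent
 * behavior can quickly dominate the retained history.
 */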
1948
1949 /*
1950 * Thread level scheduling algorithm
1951 *
1952 * The thread level scheduling algorithm uses the mach timeshare
1953 * decay based algorithm to achieve sharing between threads within the
1954 * same clutch bucket. The load/priority shifts etc. are all maintained
1955 * at the clutch bucket level and used for decay calculation of the
1956 * threads. The load sampling is still driven off the scheduler tick
1957 * for runnable clutch buckets (it does not use the new higher frequency
1958 * EWMA based load calculation). The idea is that the contention and load
1959 * within clutch_buckets should be limited enough to not see heavy decay
1960 * and timeshare effectively.
1961 */
1962
1963 /*
1964 * sched_clutch_thread_run_bucket_incr() / sched_clutch_run_bucket_incr()
1965 *
1966 * Increment the run count for the clutch bucket associated with the
1967 * thread.
1968 */
1969 uint32_t
1970 sched_clutch_thread_run_bucket_incr(
1971 thread_t thread,
1972 sched_bucket_t bucket)
1973 {
1974 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1975 return 0;
1976 }
1977 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1978 return sched_clutch_run_bucket_incr(clutch, bucket);
1979 }
1980
1981 static uint32_t
1982 sched_clutch_run_bucket_incr(
1983 sched_clutch_t clutch,
1984 sched_bucket_t bucket)
1985 {
1986 assert(bucket != TH_BUCKET_RUN);
1987 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
1988 return sched_clutch_bucket_group_run_count_inc(clutch_bucket_group);
1989 }
1990
1991 /*
1992 * sched_clutch_thread_run_bucket_decr() / sched_clutch_run_bucket_decr()
1993 *
1994 * Decrement the run count for the clutch bucket associated with the
1995 * thread.
1996 */
1997 uint32_t
1998 sched_clutch_thread_run_bucket_decr(
1999 thread_t thread,
2000 sched_bucket_t bucket)
2001 {
2002 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2003 return 0;
2004 }
2005 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2006 return sched_clutch_run_bucket_decr(clutch, bucket);
2007 }
2008
2009 static uint32_t
2010 sched_clutch_run_bucket_decr(
2011 sched_clutch_t clutch,
2012 sched_bucket_t bucket)
2013 {
2014 assert(bucket != TH_BUCKET_RUN);
2015 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
2016 return sched_clutch_bucket_group_run_count_dec(clutch_bucket_group);
2017 }
2018
2019 /*
2020 * sched_clutch_bucket_group_timeshare_update()
2021 *
2022 * Routine to update the load and priority shift for the clutch_bucket_group
2023 * every sched_tick. For multi-cluster platforms, each QoS level will have multiple
2024 * clutch buckets with runnable threads in them. So it is important to maintain
2025 * the timesharing information at the clutch_bucket_group level instead of
2026 * individual clutch buckets (because the algorithm is trying to timeshare all
2027 * threads at the same QoS irrespective of which hierarchy they are enqueued in).
2028 *
2029 * The routine is called from the sched tick handling code to make sure this value
2030 * is updated at least once every sched tick. For clutch bucket groups which have
2031 * not been runnable for very long, the clutch_bucket_group maintains a "last
2032 * updated schedtick" parameter. As threads become runnable in the clutch bucket group,
2033 * if this value is outdated, the load and shifts are updated.
2034 *
2035 * Possible optimization:
2036 * - The current algorithm samples the load every sched tick (125ms).
2037 * This is prone to spikes in runnable counts; if that turns out to be
2038 * a problem, a simple solution would be to do the EWMA trick to sample
2039 * load at every load_tick (30ms) and use the averaged value for the pri
2040 * shift calculation.
2041 */
2042 static void
2043 sched_clutch_bucket_group_timeshare_update(
2044 sched_clutch_bucket_group_t clutch_bucket_group,
2045 sched_clutch_bucket_t clutch_bucket,
2046 uint64_t ctime)
2047 {
2048 if (clutch_bucket_group->scbg_bucket < TH_BUCKET_SHARE_FG) {
2049 /* No timesharing needed for fixed priority Above UI threads */
2050 return;
2051 }
2052
2053 /*
2054 * Update the timeshare parameters for the clutch bucket group
2055 	 * if they haven't been updated in this tick.
2056 */
2057 uint32_t sched_ts = os_atomic_load(&clutch_bucket_group->scbg_timeshare_tick, relaxed);
2058 uint32_t current_sched_ts = sched_tick;
2059 if (sched_ts < current_sched_ts) {
2060 os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, current_sched_ts, relaxed);
2061 /* NCPU wide workloads should not experience decay */
2062 uint64_t bucket_group_run_count = os_atomic_load_wide(&clutch_bucket_group->scbg_blocked_data.scct_count, relaxed) - 1;
2063 uint32_t bucket_group_load = (uint32_t)(bucket_group_run_count / processor_avail_count);
2064 bucket_group_load = MIN(bucket_group_load, NRQS - 1);
2065 uint32_t pri_shift = sched_fixed_shift - sched_load_shifts[bucket_group_load];
2066 /* Ensure that the pri_shift value is reasonable */
2067 pri_shift = (pri_shift > SCHED_PRI_SHIFT_MAX) ? INT8_MAX : pri_shift;
2068 os_atomic_store(&clutch_bucket_group->scbg_pri_shift, pri_shift, relaxed);
2069 }
2070
2071 /*
2072 * Update the clutch bucket priority; this allows clutch buckets that have been pending
2073 * for a long time to get an updated interactivity score.
2074 */
2075 sched_clutch_bucket_update(clutch_bucket, clutch_bucket->scb_root, ctime, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
2076 }
2077
2078 /*
2079 * sched_clutch_thread_clutch_update()
2080 *
2081 * Routine called when the thread changes its thread group. The current
2082 * implementation relies on the fact that the thread group is changed only from
2083 * the context of the thread itself or when the thread is runnable but not in a
2084 * runqueue. Due to this fact, the thread group change causes only counter
2085 * updates in the old & new clutch buckets and no hierarchy changes. The routine
2086 * also attributes the CPU used so far to the old clutch.
2087 */
2088 void
2089 sched_clutch_thread_clutch_update(
2090 thread_t thread,
2091 sched_clutch_t old_clutch,
2092 sched_clutch_t new_clutch)
2093 {
2094 uint32_t cpu_delta;
2095
2096 if (old_clutch) {
2097 assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
2098
2099 sched_clutch_run_bucket_decr(old_clutch, thread->th_sched_bucket);
2100 /*
2101 * Calculate the CPU used by this thread in the old bucket and
2102 * add it to the old clutch bucket. This uses the same CPU usage
2103 * logic as update_priority etc.
2104 */
2105 sched_tick_delta(thread, cpu_delta);
2106 if (thread->pri_shift < INT8_MAX) {
2107 thread->sched_usage += cpu_delta;
2108 }
2109 thread->cpu_delta += cpu_delta;
2110 if (!SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
2111 sched_clutch_bucket_group_t clutch_bucket_group = &(old_clutch->sc_clutch_groups[thread->th_sched_bucket]);
2112 sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, cpu_delta);
2113 }
2114 }
2115
2116 if (new_clutch) {
2117 sched_clutch_run_bucket_incr(new_clutch, thread->th_sched_bucket);
2118 }
2119 }
2120
2121 /* Thread Insertion/Removal/Selection routines */
2122
2123 #if CONFIG_SCHED_EDGE
2124
2125 /*
2126 * Edge Scheduler Bound Thread Support
2127 *
2128 * The edge scheduler allows threads to be bound to specific clusters. The scheduler
2129 * maintains a separate runq on the clutch root to hold these bound threads. These
2130 * bound threads count towards the root priority and thread count, but are ignored
2131 * for thread migration/steal decisions. Bound threads that are enqueued in the
2132 * separate runq have the th_bound_cluster_enqueued flag set to allow easy
2133 * removal.
2134 *
2135 * Bound Threads Timesharing
2136 * The bound threads share the timesharing properties of the clutch bucket group they are
2137 * part of. They contribute to the load and use priority shifts/decay values from the
2138 * clutch bucket group.
2139 */
2140
2141 static boolean_t
2142 sched_edge_bound_thread_insert(
2143 sched_clutch_root_t root_clutch,
2144 thread_t thread,
2145 integer_t options)
2146 {
2147 /* Update the clutch runnable count and priority */
2148 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2149 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
2150 if (root_bucket->scrb_bound_thread_runq.count == 0) {
2151 sched_clutch_root_bucket_runnable(root_bucket, root_clutch, mach_absolute_time());
2152 }
2153
2154 assert((thread->th_bound_cluster_enqueued) == false);
2155 run_queue_enqueue(&root_bucket->scrb_bound_thread_runq, thread, options);
2156 thread->th_bound_cluster_enqueued = true;
2157
2158 /* Increment the urgency counter for the root if necessary */
2159 sched_clutch_root_urgency_inc(root_clutch, thread);
2160
2161 int16_t root_old_pri = root_clutch->scr_priority;
2162 sched_clutch_root_pri_update(root_clutch);
2163 return root_clutch->scr_priority > root_old_pri;
2164 }
2165
2166 static void
2167 sched_edge_bound_thread_remove(
2168 sched_clutch_root_t root_clutch,
2169 thread_t thread)
2170 {
2171 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
2172 assert((thread->th_bound_cluster_enqueued) == true);
2173 run_queue_remove(&root_bucket->scrb_bound_thread_runq, thread);
2174 thread->th_bound_cluster_enqueued = false;
2175
2176 /* Decrement the urgency counter for the root if necessary */
2177 sched_clutch_root_urgency_dec(root_clutch, thread);
2178
2179 /* Update the clutch runnable count and priority */
2180 sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
2181 if (root_bucket->scrb_bound_thread_runq.count == 0) {
2182 sched_clutch_root_bucket_empty(root_bucket, root_clutch, mach_absolute_time());
2183 }
2184 sched_clutch_root_pri_update(root_clutch);
2185 }
2186
2187 /*
2188 * Edge Scheduler cluster shared resource threads load balancing
2189 *
2190 * The Edge scheduler attempts to load balance cluster shared resource intensive threads
2191 * across clusters in order to reduce contention on the shared resources. It achieves
2192 * that by maintaining the runnable and running shared resource load on each cluster
2193 * and balancing the load across multiple clusters.
2194 *
2195 * The current implementation for cluster shared resource load balancing looks at
2196 * the per-cluster load at thread runnable time to enqueue the thread in the appropriate
2197 * cluster. The thread is enqueued in the cluster bound runqueue to ensure idle CPUs
2198 * do not steal/rebalance shared resource threads. Some more details for the implementation:
2199 *
2200 * - When threads are tagged as shared resource, they go through the cluster selection logic
2201 * which looks at cluster shared resource loads and picks a cluster accordingly. The thread is
2202 * enqueued in the cluster bound runqueue.
2203 *
2204 * - When the threads start running and call avoid_processor, the load balancing logic will be
2205 * invoked and cause the thread to be sent to a more preferred cluster if one exists and has
2206 * no shared resource load.
2207 *
2208 * - If a CPU in a preferred cluster is going idle and that cluster has no more shared load,
2209 * it will look at running shared resource threads on foreign clusters and actively rebalance them.
2210 *
2211 * - Runnable shared resource threads are not stolen by the preferred cluster CPUs as they
2212 * go idle intentionally.
2213 *
2214 * - One caveat of this design is that if a preferred CPU has already run and finished its shared
2215 * resource thread execution, it will not go out and steal the runnable thread in the non-preferred cluster.
2216 * The rebalancing will happen when the thread actually runs on a non-preferred cluster and one of the
2217 * events listed above happen.
2218 *
2219 * - Also it currently does not consider other properties such as thread priorities and
2220  *   QoS level thread load in the thread placement decision.
2221 *
2222 * Edge Scheduler cluster shared resource thread scheduling policy
2223 *
2224 * The threads for shared resources can be scheduled using one of the two policies:
2225 *
2226 * EDGE_SHARED_RSRC_SCHED_POLICY_RR
2227 * This policy distributes the threads so that they spread across all available clusters
2228 * irrespective of type. The idea is that this scheduling policy will put a shared resource
2229 * thread on each cluster on the platform before it starts doubling up on clusters.
2230 *
2231 * EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST
2232 * This policy distributes threads so that the threads first fill up all the capacity on
2233 * the preferred cluster and its homogeneous peers before spilling to different core type.
2234 * The current implementation defines capacity based on the number of CPUs in the cluster;
2235 * so a cluster's shared resource is considered full if there are "n" runnable + running
2236 * shared resource threads on the cluster with n cpus. This policy is different from the
2237 * default scheduling policy of the edge scheduler since this always tries to fill up the
2238 * native clusters to capacity even when non-native clusters might be idle.
2239 */
2240 __options_decl(edge_shared_rsrc_sched_policy_t, uint32_t, {
2241 EDGE_SHARED_RSRC_SCHED_POLICY_RR = 0,
2242 EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST = 1,
2243 });
2244
2245 static const edge_shared_rsrc_sched_policy_t edge_shared_rsrc_policy[CLUSTER_SHARED_RSRC_TYPE_COUNT] = {
2246 [CLUSTER_SHARED_RSRC_TYPE_RR] = EDGE_SHARED_RSRC_SCHED_POLICY_RR,
2247 [CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST,
2248 };
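
/*
 * Illustrative example (hypothetical topology): consider two 4-CPU P-clusters
 * and one 4-CPU E-cluster, with a P-cluster as the preferred cluster.
 *
 * - EDGE_SHARED_RSRC_SCHED_POLICY_RR: the first three runnable shared-resource
 *   threads land on three different clusters before any cluster receives a
 *   second one.
 *
 * - EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST: the first eight such threads
 *   fill the two P-clusters (capacity == number of CPUs, i.e. 4 per cluster);
 *   only the ninth spills to the E-cluster, even if the E-cluster is idle.
 */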
2249
2250 static void
2251 sched_edge_shared_rsrc_runnable_load_incr(sched_clutch_root_t root_clutch, thread_t thread)
2252 {
2253 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
2254 root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_RR]++;
2255 thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_RR] = true;
2256 }
2257 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
2258 root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST]++;
2259 thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = true;
2260 }
2261 }
2262
2263 static void
2264 sched_edge_shared_rsrc_runnable_load_decr(sched_clutch_root_t root_clutch, thread_t thread)
2265 {
2266 for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
2267 if (thread->th_shared_rsrc_enqueued[shared_rsrc_type]) {
2268 thread->th_shared_rsrc_enqueued[shared_rsrc_type] = false;
2269 root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type]--;
2270 }
2271 }
2272 }
2273
2274 uint16_t
2275 sched_edge_shared_rsrc_runnable_load(sched_clutch_root_t root_clutch, cluster_shared_rsrc_type_t shared_rsrc_type)
2276 {
2277 return root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type];
2278 }
2279
2280 /*
2281 * sched_edge_shared_rsrc_idle()
2282 *
2283 * Routine used to determine if the constrained resource for the pset is idle. This is
2284 * used by a CPU going idle to decide if it should rebalance a running shared resource
2285 * thread from a non-preferred cluster.
2286 */
2287 static boolean_t
2288 sched_edge_shared_rsrc_idle(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
2289 {
2290 return sched_pset_cluster_shared_rsrc_load(pset, shared_rsrc_type) == 0;
2291 }
2292
2293 /*
2294 * sched_edge_thread_shared_rsrc_type
2295 *
2296 * This routine decides if a given thread needs special handling for being a
2297 * heavy shared resource user. It is valid for the same thread to be using
2298 * several shared resources at the same time and have multiple policy flags set.
2299 * This routine determines which of those properties will be used for load
2300 * balancing and migration decisions.
2301 */
2302 static cluster_shared_rsrc_type_t
2303 sched_edge_thread_shared_rsrc_type(thread_t thread)
2304 {
2305 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
2306 return CLUSTER_SHARED_RSRC_TYPE_RR;
2307 }
2308 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
2309 return CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST;
2310 }
2311 return CLUSTER_SHARED_RSRC_TYPE_NONE;
2312 }
2313
2314 #endif /* CONFIG_SCHED_EDGE */
2315
2316 /*
2317 * sched_clutch_thread_bound_lookup()
2318 *
2319 * Routine to lookup the highest priority runnable thread in a bounded root bucket.
2320 */
2321 static thread_t
2322 sched_clutch_thread_bound_lookup(
2323 __unused sched_clutch_root_t root_clutch,
2324 sched_clutch_root_bucket_t root_bucket,
2325 processor_t processor,
2326 thread_t _Nullable prev_thread)
2327 {
2328 assert(root_bucket->scrb_bound == true);
2329 thread_t bound_thread = run_queue_peek(&root_bucket->scrb_bound_thread_runq);
2330 if ((prev_thread != NULL) &&
2331 (bound_thread == NULL || sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, bound_thread->sched_pri, processor->first_timeslice))) {
2332 return prev_thread;
2333 }
2334 assert(bound_thread != THREAD_NULL);
2335 return bound_thread;
2336 }
2337
2338 /*
2339 * Clutch Bucket Group Thread Counts and Pending time calculation
2340 *
2341 * The pending time on the clutch_bucket_group allows the scheduler to track if it
2342 * needs to ageout the CPU usage because the clutch_bucket_group has been pending for
2343 * a very long time. The pending time is set to the timestamp as soon as a thread becomes
2344 * runnable. When a thread is picked up for execution from this clutch_bucket_group, the
2345 * pending time is advanced to the time of thread selection.
2346 *
2347 * Since threads for a clutch bucket group can be added or removed from multiple CPUs
2348  * simultaneously, it is important that the updates to thread counts and pending timestamps
2349 * happen atomically. The implementation relies on the following aspects to make that work
2350 * as expected:
2351 * - The clutch scheduler would be deployed on single cluster platforms where the pset lock
2352 * is held when threads are added/removed and pending timestamps are updated
2353 * - The thread count and pending timestamp can be updated atomically using double wide
2354 * 128 bit atomics
2355 *
2356 * Clutch bucket group interactivity timestamp and score updates also rely on the properties
2357 * above to atomically update the interactivity score for a clutch bucket group.
2358 */
2359
2360 #if CONFIG_SCHED_EDGE
2361
2362 static void
2363 sched_clutch_bucket_group_thr_count_inc(
2364 sched_clutch_bucket_group_t clutch_bucket_group,
2365 uint64_t timestamp)
2366 {
2367 sched_clutch_counter_time_t old_pending_data;
2368 sched_clutch_counter_time_t new_pending_data;
2369 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2370 new_pending_data.scct_count = old_pending_data.scct_count + 1;
2371 new_pending_data.scct_timestamp = old_pending_data.scct_timestamp;
2372 if (old_pending_data.scct_count == 0) {
2373 new_pending_data.scct_timestamp = timestamp;
2374 }
2375 });
2376 }
2377
2378 static void
2379 sched_clutch_bucket_group_thr_count_dec(
2380 sched_clutch_bucket_group_t clutch_bucket_group,
2381 uint64_t timestamp)
2382 {
2383 sched_clutch_counter_time_t old_pending_data;
2384 sched_clutch_counter_time_t new_pending_data;
2385 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2386 new_pending_data.scct_count = old_pending_data.scct_count - 1;
2387 if (new_pending_data.scct_count == 0) {
2388 new_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2389 } else {
2390 new_pending_data.scct_timestamp = timestamp;
2391 }
2392 });
2393 }
2394
2395 static uint8_t
2396 sched_clutch_bucket_group_pending_ageout(
2397 sched_clutch_bucket_group_t clutch_bucket_group,
2398 uint64_t timestamp)
2399 {
2400 int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2401 sched_clutch_counter_time_t old_pending_data;
2402 sched_clutch_counter_time_t new_pending_data;
2403 uint8_t cpu_usage_shift = 0;
2404
2405 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2406 cpu_usage_shift = 0;
2407 uint64_t old_pending_ts = old_pending_data.scct_timestamp;
2408 bool old_update = (old_pending_ts >= timestamp);
2409 bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2410 bool no_bucket_load = (bucket_load == 0);
2411 if (old_update || no_pending_time || no_bucket_load) {
2412 os_atomic_rmw_loop_give_up();
2413 }
2414
2415 /* Calculate the time the clutch bucket group has been pending */
2416 uint64_t pending_delta = timestamp - old_pending_ts;
2417 /*
2418 * Other buckets should get a chance to run first before artificially boosting
2419 * this clutch bucket group's interactivity score, at least when the entire root
2420 * bucket is getting a large enough share of CPU.
2421 */
2422 uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] + (bucket_load * sched_clutch_thread_quantum[clutch_bucket_group->scbg_bucket]);
2423 if (pending_delta < interactivity_delta) {
2424 os_atomic_rmw_loop_give_up();
2425 }
2426 cpu_usage_shift = (pending_delta / interactivity_delta);
2427 new_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2428 new_pending_data.scct_count = old_pending_data.scct_count;
2429 });
2430 return cpu_usage_shift;
2431 }
2432
2433 static boolean_t
2434 sched_edge_thread_should_be_inserted_as_bound(
2435 sched_clutch_root_t root_clutch,
2436 thread_t thread)
2437 {
2438 /*
2439 * Check if the thread is bound and is being enqueued in its desired bound cluster.
2440 * If the thread is cluster-bound but to a different cluster, we should enqueue as unbound.
2441 */
2442 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) && (sched_edge_thread_bound_cluster_id(thread) == root_clutch->scr_cluster_id)) {
2443 return TRUE;
2444 }
2445 /*
2446 * Use bound runqueue for shared resource threads. See "cluster shared resource
2447 * threads load balancing" section for details.
2448 */
2449 if (sched_edge_thread_shared_rsrc_type(thread) != CLUSTER_SHARED_RSRC_TYPE_NONE) {
2450 return TRUE;
2451 }
2452 return FALSE;
2453 }
2454
2455 #else /* CONFIG_SCHED_EDGE */
2456
2457 /*
2458 * For the clutch scheduler, atomicity is ensured by making sure all operations
2459 * are performed under the pset lock of the only cluster present on the platform.
2460 */
2461 static void
2462 sched_clutch_bucket_group_thr_count_inc(
2463 sched_clutch_bucket_group_t clutch_bucket_group,
2464 uint64_t timestamp)
2465 {
2466 sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2467 if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2468 clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2469 }
2470 clutch_bucket_group->scbg_pending_data.scct_count++;
2471 }
2472
2473 static void
2474 sched_clutch_bucket_group_thr_count_dec(
2475 sched_clutch_bucket_group_t clutch_bucket_group,
2476 uint64_t timestamp)
2477 {
2478 sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2479 clutch_bucket_group->scbg_pending_data.scct_count--;
2480 if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2481 clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2482 } else {
2483 clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2484 }
2485 }
2486
2487 static uint8_t
2488 sched_clutch_bucket_group_pending_ageout(
2489 sched_clutch_bucket_group_t clutch_bucket_group,
2490 uint64_t timestamp)
2491 {
2492 sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2493 int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2494 uint64_t old_pending_ts = clutch_bucket_group->scbg_pending_data.scct_timestamp;
2495 bool old_update = (old_pending_ts >= timestamp);
2496 bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2497 bool no_bucket_load = (bucket_load == 0);
2498 if (old_update || no_pending_time || no_bucket_load) {
2499 return 0;
2500 }
2501 uint64_t pending_delta = timestamp - old_pending_ts;
2502 /*
2503 * Other buckets should get a chance to run first before artificially boosting
2504 * this clutch bucket group's interactivity score, at least when the entire root
2505 * bucket is getting a large enough share of CPU.
2506 */
2507 uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] + (bucket_load * sched_clutch_thread_quantum[clutch_bucket_group->scbg_bucket]);
2508 if (pending_delta < interactivity_delta) {
2509 return 0;
2510 }
2511 uint8_t cpu_usage_shift = (pending_delta / interactivity_delta);
2512 clutch_bucket_group->scbg_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2513 return cpu_usage_shift;
2514 }
2515
2516 #endif /* CONFIG_SCHED_EDGE */
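
/*
 * Illustrative example of the pending ageout math above (all numbers are made up):
 * with a per-bucket pending delta tunable of 7.5ms, a global bucket load of 5
 * threads and a 1.5ms thread quantum, interactivity_delta = 7.5ms + (5 * 1.5ms)
 * = 15ms of abstime. If the clutch bucket group has been pending for 30ms,
 * cpu_usage_shift = 30 / 15 = 2, the pending timestamp is advanced by 2 * 15ms,
 * and the returned shift is used by sched_clutch_bucket_group_cpu_adjust() to age
 * out the group's recorded CPU usage so its interactivity score can recover.
 */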
2517
2518 static uint8_t
2519 sched_clutch_bucket_group_interactivity_score_calculate(
2520 sched_clutch_bucket_group_t clutch_bucket_group,
2521 uint64_t timestamp)
2522 {
2523 if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
2524 /*
2525 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
2526 * priorities, make sure all AboveUI buckets are marked interactive.
2527 */
2528 assert(clutch_bucket_group->scbg_interactivity_data.scct_count == (2 * sched_clutch_bucket_group_interactive_pri));
2529 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2530 }
2531 /* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */
2532 uint8_t pending_intervals = sched_clutch_bucket_group_pending_ageout(clutch_bucket_group, timestamp);
2533 /* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */
2534 sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group, pending_intervals);
2535 uint8_t interactivity_score = sched_clutch_interactivity_from_cpu_data(clutch_bucket_group);
2536 /* Write back any interactivity score update */
2537 #if CONFIG_SCHED_EDGE
2538 sched_clutch_counter_time_t old_interactivity_data;
2539 sched_clutch_counter_time_t new_interactivity_data;
2540 os_atomic_rmw_loop(&clutch_bucket_group->scbg_interactivity_data.scct_packed, old_interactivity_data.scct_packed, new_interactivity_data.scct_packed, relaxed, {
2541 new_interactivity_data.scct_count = old_interactivity_data.scct_count;
2542 if (old_interactivity_data.scct_timestamp >= timestamp) {
2543 os_atomic_rmw_loop_give_up();
2544 }
2545 new_interactivity_data.scct_timestamp = timestamp;
2546 if (old_interactivity_data.scct_timestamp != 0) {
2547 new_interactivity_data.scct_count = interactivity_score;
2548 }
2549 });
2550 return (uint8_t)new_interactivity_data.scct_count;
2551 #else /* !CONFIG_SCHED_EDGE */
2552 sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2553 if (timestamp > clutch_bucket_group->scbg_interactivity_data.scct_timestamp) {
2554 clutch_bucket_group->scbg_interactivity_data.scct_count = interactivity_score;
2555 clutch_bucket_group->scbg_interactivity_data.scct_timestamp = timestamp;
2556 }
2557 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2558 #endif /* !CONFIG_SCHED_EDGE */
2559 }
2560
2561 /*
2562 * Clutch Bucket Group Run Count and Blocked Time Accounting
2563 *
2564 * The clutch bucket group maintains the number of runnable/running threads in the group.
2565 * Since the blocked time of the clutch bucket group is based on this count, it is
2566 * important to make sure the blocking timestamp and the run count are updated atomically.
2567 *
2568 * Since the run count increments happen without any pset locks held, the scheduler updates
2569 * the count & timestamp using double wide 128 bit atomics.
2570 */
2571
2572 static uint32_t
2573 sched_clutch_bucket_group_run_count_inc(
2574 sched_clutch_bucket_group_t clutch_bucket_group)
2575 {
2576 sched_clutch_counter_time_t old_blocked_data;
2577 sched_clutch_counter_time_t new_blocked_data;
2578
2579 bool update_blocked_time = false;
2580 os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2581 new_blocked_data.scct_count = old_blocked_data.scct_count + 1;
2582 new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2583 update_blocked_time = false;
2584 if (old_blocked_data.scct_count == 0) {
2585 new_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
2586 update_blocked_time = true;
2587 }
2588 });
2589 if (update_blocked_time && (old_blocked_data.scct_timestamp != SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID)) {
2590 uint64_t ctime = mach_absolute_time();
2591 if (ctime > old_blocked_data.scct_timestamp) {
2592 uint64_t blocked_time = ctime - old_blocked_data.scct_timestamp;
2593 blocked_time = MIN(blocked_time, sched_clutch_bucket_group_adjust_threshold);
2594 os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed);
2595 }
2596 }
2597 return (uint32_t)new_blocked_data.scct_count;
2598 }
2599
2600 static uint32_t
2601 sched_clutch_bucket_group_run_count_dec(
2602 sched_clutch_bucket_group_t clutch_bucket_group)
2603 {
2604 sched_clutch_counter_time_t old_blocked_data;
2605 sched_clutch_counter_time_t new_blocked_data;
2606
2607 uint64_t ctime = mach_absolute_time();
2608 os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2609 new_blocked_data.scct_count = old_blocked_data.scct_count - 1;
2610 new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2611 if (new_blocked_data.scct_count == 0) {
2612 new_blocked_data.scct_timestamp = ctime;
2613 }
2614 });
2615 return (uint32_t)new_blocked_data.scct_count;
2616 }
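
/*
 * Illustrative sequence for the blocked-time accounting above:
 * 1. The last runnable/running thread of the group blocks: run_count_dec() drops
 *    scct_count to 0 and records the current abstime in scct_timestamp.
 * 2. A thread of the group later becomes runnable: run_count_inc() sees the old
 *    count of 0, invalidates the timestamp, and charges (now - scct_timestamp),
 *    clamped to sched_clutch_bucket_group_adjust_threshold, to scbcd_cpu_blocked.
 * The accumulated blocked time is part of the CPU data consumed by
 * sched_clutch_interactivity_from_cpu_data() when scoring the group.
 */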
2617
2618 static inline sched_clutch_bucket_t
2619 sched_clutch_bucket_for_thread(
2620 sched_clutch_root_t root_clutch,
2621 thread_t thread)
2622 {
2623 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2624 assert(thread->thread_group == clutch->sc_tg);
2625
2626 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2627 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2628 assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
2629
2630 return clutch_bucket;
2631 }
2632
2633 static inline sched_clutch_bucket_group_t
2634 sched_clutch_bucket_group_for_thread(thread_t prev_thread)
2635 {
2636 sched_clutch_t clutch = sched_clutch_for_thread_group(prev_thread->thread_group);
2637 return &clutch->sc_clutch_groups[prev_thread->th_sched_bucket];
2638 }
2639
2640 /*
2641 * sched_clutch_thread_insert()
2642 *
2643 * Routine to insert a thread into the sched clutch hierarchy.
2644 * Update the counts at all levels of the hierarchy and insert the nodes
2645 * as they become runnable. Always called with the pset lock held.
2646 */
2647 static boolean_t
2648 sched_clutch_thread_insert(
2649 sched_clutch_root_t root_clutch,
2650 thread_t thread,
2651 integer_t options)
2652 {
2653 boolean_t result = FALSE;
2654
2655 sched_clutch_hierarchy_locked_assert(root_clutch);
2656 #if CONFIG_SCHED_EDGE
2657 sched_edge_cluster_cumulative_count_incr(root_clutch, thread->th_sched_bucket);
2658 sched_edge_shared_rsrc_runnable_load_incr(root_clutch, thread);
2659
2660 if (sched_edge_thread_should_be_inserted_as_bound(root_clutch, thread)) {
2661 /*
2662 * Includes threads bound to this specific cluster as well as all
2663 * shared resource threads.
2664 */
2665 return sched_edge_bound_thread_insert(root_clutch, thread, options);
2666 }
2667 #endif /* CONFIG_SCHED_EDGE */
2668
2669 uint64_t current_timestamp = mach_absolute_time();
2670 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2671 assert(thread->thread_group == clutch->sc_tg);
2672 sched_clutch_bucket_t clutch_bucket = sched_clutch_bucket_for_thread(root_clutch, thread);
2673 assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
2674
2675 /*
2676 * Thread linkage in clutch_bucket
2677 *
2678 * A thread has a few linkages within the clutch bucket:
2679 * - A stable priority queue linkage which is the main runqueue (based on sched_pri) for the clutch bucket
2680 * - A regular priority queue linkage which is based on thread's base/promoted pri (used for clutch bucket priority calculation)
2681 * - A queue linkage used for timesharing operations of threads at the scheduler tick
2682 */
2683
2684 /* Insert thread into the clutch_bucket stable priority runqueue using sched_pri */
2685 thread->th_clutch_runq_link.stamp = current_timestamp;
2686 priority_queue_entry_set_sched_pri(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link, thread->sched_pri,
2687 (options & SCHED_TAILQ) ? PRIORITY_QUEUE_ENTRY_NONE : PRIORITY_QUEUE_ENTRY_PREEMPTED);
2688 priority_queue_insert(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2689
2690 /* Insert thread into clutch_bucket priority queue based on the promoted or base priority */
2691 priority_queue_entry_set_sched_pri(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link,
2692 sched_thread_sched_pri_promoted(thread) ? thread->sched_pri : thread->base_pri, false);
2693 priority_queue_insert(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2694
2695 /* Insert thread into timesharing queue of the clutch bucket */
2696 enqueue_tail(&clutch_bucket->scb_thread_timeshare_queue, &thread->th_clutch_timeshare_link);
2697
2698 /* Increment the urgency counter for the root if necessary */
2699 sched_clutch_root_urgency_inc(root_clutch, thread);
2700
2701 os_atomic_inc(&clutch->sc_thr_count, relaxed);
2702 sched_clutch_bucket_group_thr_count_inc(clutch_bucket->scb_group, current_timestamp);
2703
2704 /* Enqueue the clutch into the hierarchy (if needed) and update properties; pick the insertion order based on thread options */
2705 sched_clutch_bucket_options_t scb_options = (options & SCHED_HEADQ) ? SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ : SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ;
2706 if (clutch_bucket->scb_thr_count == 0) {
2707 sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2708 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2709 result = sched_clutch_bucket_runnable(clutch_bucket, root_clutch, current_timestamp, scb_options);
2710 } else {
2711 sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2712 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2713 result = sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, scb_options);
2714 }
2715
2716 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2717 root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2718 SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2719 return result;
2720 }
2721
2722 /*
2723 * sched_clutch_thread_remove()
2724 *
2725 * Routine to remove a thread from the sched clutch hierarchy.
2726 * Update the counts at all levels of the hierarchy and remove the nodes
2727 * as they become empty. Always called with the pset lock held.
2728 */
2729 static void
2730 sched_clutch_thread_remove(
2731 sched_clutch_root_t root_clutch,
2732 thread_t thread,
2733 uint64_t current_timestamp,
2734 sched_clutch_bucket_options_t options)
2735 {
2736 sched_clutch_hierarchy_locked_assert(root_clutch);
2737 #if CONFIG_SCHED_EDGE
2738 sched_edge_cluster_cumulative_count_decr(root_clutch, thread->th_sched_bucket);
2739 sched_edge_shared_rsrc_runnable_load_decr(root_clutch, thread);
2740
2741 if (thread->th_bound_cluster_enqueued) {
2742 sched_edge_bound_thread_remove(root_clutch, thread);
2743 return;
2744 }
2745 #endif /* CONFIG_SCHED_EDGE */
2746 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2747 assert(thread->thread_group == clutch->sc_tg);
2748 thread_assert_runq_nonnull(thread);
2749
2750 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2751 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2752 assert(clutch_bucket->scb_root == root_clutch);
2753
2754 /* Decrement the urgency counter for the root if necessary */
2755 sched_clutch_root_urgency_dec(root_clutch, thread);
2756 /* Remove thread from the clutch_bucket */
2757 priority_queue_remove(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2758 remqueue(&thread->th_clutch_timeshare_link);
2759
2760 priority_queue_remove(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2761
2762 /*
2763 * Warning: After this point, the thread's scheduling fields may be
2764 * modified by other cores that acquire the thread lock.
2765 */
2766 thread_clear_runq(thread);
2767
2768 /* Update counts at various levels of the hierarchy */
2769 os_atomic_dec(&clutch->sc_thr_count, relaxed);
2770 sched_clutch_bucket_group_thr_count_dec(clutch_bucket->scb_group, current_timestamp);
2771 sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
2772 sched_clutch_thr_count_dec(&clutch_bucket->scb_thr_count);
2773
2774 /* Remove the clutch from hierarchy (if needed) and update properties */
2775 if (clutch_bucket->scb_thr_count == 0) {
2776 sched_clutch_bucket_empty(clutch_bucket, root_clutch, current_timestamp, options);
2777 } else {
2778 sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, options);
2779 }
2780
2781 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2782 root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2783 SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2784 }
2785
2786 /*
2787 * sched_clutch_thread_unbound_lookup()
2788 *
2789 * Routine to find the highest unbound thread in the root clutch.
2790 * Helps find threads easily for steal/migrate scenarios in the
2791 * Edge scheduler.
2792 */
2793 static thread_t
2794 sched_clutch_thread_unbound_lookup(
2795 sched_clutch_root_t root_clutch,
2796 sched_clutch_root_bucket_t root_bucket,
2797 processor_t _Nullable processor,
2798 thread_t _Nullable prev_thread)
2799 {
2800 assert(processor != NULL || prev_thread == NULL);
2801 assert(root_bucket->scrb_bound == false);
2802 sched_clutch_hierarchy_locked_assert(root_clutch);
2803
2804 /* Find the highest priority clutch bucket in this root bucket */
2805 bool chose_prev_thread = false;
2806 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, root_bucket, processor, prev_thread, &chose_prev_thread);
2807 assert(clutch_bucket != NULL);
2808
2809 if (chose_prev_thread) {
2810 /* We have determined that prev_thread is the highest thread, based on the Clutch bucket level policy */
2811 assert(processor != NULL && prev_thread != NULL);
2812 return prev_thread;
2813 }
2814
2815 /* Find the highest priority runnable thread in this clutch bucket */
2816 thread_t thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
2817 assert(thread != NULL);
2818
2819 /* Consider the previous thread */
2820 if (prev_thread != NULL &&
2821 sched_clutch_bucket_for_thread(root_clutch, prev_thread) == clutch_bucket &&
2822 sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, thread->sched_pri, processor->first_timeslice)) {
2823 thread = prev_thread;
2824 }
2825
2826 return thread;
2827 }
2828
2829 static sched_clutch_root_bucket_t
2830 sched_clutch_root_bucket_for_thread(
2831 sched_clutch_root_t root_clutch,
2832 thread_t prev_thread)
2833 {
2834 #if CONFIG_SCHED_EDGE
2835 if (sched_edge_thread_should_be_inserted_as_bound(root_clutch, prev_thread)) {
2836 return &root_clutch->scr_bound_buckets[prev_thread->th_sched_bucket];
2837 }
2838 #endif /* CONFIG_SCHED_EDGE */
2839 return &root_clutch->scr_unbound_buckets[prev_thread->th_sched_bucket];
2840 }
2841
2842 /*
2843 * sched_clutch_hierarchy_thread_highest()
2844 *
2845 * Routine to traverse the Clutch hierarchy and return the highest thread which
2846 * should be selected to run next, optionally comparing against the previously
2847 * running thread. Removes the highest thread with sched_clutch_thread_remove()
2848 * depending on the traverse mode and whether it is the previously running thread.
2849 * Always called with the pset lock held.
2850 */
2851 static thread_t
2852 sched_clutch_hierarchy_thread_highest(
2853 sched_clutch_root_t root_clutch,
2854 processor_t processor,
2855 thread_t _Nullable prev_thread,
2856 sched_clutch_traverse_mode_t mode)
2857 {
2858 assert(mode != SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || prev_thread == NULL);
2859 sched_clutch_hierarchy_locked_assert(root_clutch);
2860
2861 thread_t highest_thread = NULL;
2862 uint64_t current_timestamp = mach_absolute_time();
2863 bool chose_prev_thread = false;
2864 sched_clutch_dbg_thread_select_packed_t debug_info = {0};
2865 sched_clutch_root_bucket_t prev_root_bucket = prev_thread != NULL ? sched_clutch_root_bucket_for_thread(root_clutch, prev_thread) : NULL;
2866 sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(root_clutch, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL, prev_root_bucket, prev_thread, &chose_prev_thread, mode, &debug_info);
2867 if (chose_prev_thread) {
2868 /* We disambiguated that we want to keep running the previous thread */
2869 highest_thread = processor->active_thread;
2870 goto done_selecting_thread;
2871 }
2872 if (root_bucket == NULL) {
2873 /* The Clutch hierarchy has no runnable threads, including the previous thread */
2874 assert(sched_clutch_root_count(root_clutch) == 0);
2875 assert(prev_thread == NULL);
2876 return NULL;
2877 }
2878 if (root_bucket != prev_root_bucket) {
2879 /* We have ruled out continuing to run the previous thread, based on the root bucket level policy */
2880 prev_thread = NULL;
2881 assert((mode == SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT) || (prev_root_bucket == NULL) ||
2882 (prev_root_bucket->scrb_bucket >= root_bucket->scrb_bucket) || (root_bucket->scrb_starvation_avoidance) ||
2883 (prev_root_bucket->scrb_bound != root_bucket->scrb_bound) ||
2884 (root_bucket->scrb_warp_remaining > 0 && root_bucket->scrb_warped_deadline > current_timestamp && prev_root_bucket->scrb_warp_remaining == 0));
2885 }
2886
2887 if (root_bucket->scrb_bound) {
2888 highest_thread = sched_clutch_thread_bound_lookup(root_clutch, root_bucket, processor, prev_thread);
2889 } else {
2890 highest_thread = sched_clutch_thread_unbound_lookup(root_clutch, root_bucket, processor, prev_thread);
2891 }
2892
2893 if (mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY ||
2894 (mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT && highest_thread != processor->active_thread)) {
2895 assert(mode != SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || highest_thread != processor->active_thread);
2896 sched_clutch_thread_remove(root_clutch, highest_thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
2897 }
2898
2899 done_selecting_thread:
2900 debug_info.trace_data.version = SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION;
2901 debug_info.trace_data.traverse_mode = mode;
2902 debug_info.trace_data.cluster_id = root_clutch->scr_cluster_id;
2903 debug_info.trace_data.selection_was_cluster_bound = root_bucket->scrb_bound;
2904 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
2905 thread_tid(highest_thread), thread_group_get_id(highest_thread->thread_group), root_bucket->scrb_bucket, debug_info.scdts_trace_data_packed, 0);
2906 return highest_thread;
2907 }
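
/*
 * Summary of the traverse modes as they are used in this file:
 * - SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT: compare against the currently running
 *   thread but never remove anything from the hierarchy (preemption checks).
 * - SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT: consider the current thread as
 *   a candidate and remove the selection unless it is the current thread.
 * - SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY: ignore the current thread and
 *   always remove the selected thread from the hierarchy.
 */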
2908
2909 /* High level global accessor routines */
2910
2911 /*
2912 * sched_clutch_root_urgency()
2913 *
2914 * Routine to get the urgency of the highest runnable
2915 * thread in the hierarchy.
2916 */
2917 static uint32_t
2918 sched_clutch_root_urgency(
2919 sched_clutch_root_t root_clutch)
2920 {
2921 return root_clutch->scr_urgency;
2922 }
2923
2924 /*
2925 * sched_clutch_root_count_sum()
2926 *
2927 * The count_sum mechanism is used for scheduler runq
2928 * statistics calculation. It is only useful for debugging
2929 * purposes; since it requires a mach_absolute_time() call on
2930 * other scheduler implementations, it is better to avoid
2931 * populating this until absolutely necessary.
2932 */
2933 static uint32_t
2934 sched_clutch_root_count_sum(
2935 __unused sched_clutch_root_t root_clutch)
2936 {
2937 return 0;
2938 }
2939
2940 /*
2941 * sched_clutch_root_priority()
2942 *
2943 * Routine to get the priority of the highest runnable
2944 * thread in the hierarchy.
2945 */
2946 static int
2947 sched_clutch_root_priority(
2948 sched_clutch_root_t root_clutch)
2949 {
2950 return root_clutch->scr_priority;
2951 }
2952
2953 /*
2954 * sched_clutch_root_count()
2955 *
2956 * Returns total number of runnable threads in the hierarchy.
2957 */
2958 uint32_t
2959 sched_clutch_root_count(
2960 sched_clutch_root_t root_clutch)
2961 {
2962 return root_clutch->scr_thr_count;
2963 }
2964
2965 #if CONFIG_SCHED_EDGE
2966
2967 /*
2968 * sched_clutch_root_foreign_empty()
2969 *
2970 * Routine to check if the foreign clutch bucket priority list is empty for a cluster.
2971 */
2972 static boolean_t
2973 sched_clutch_root_foreign_empty(
2974 sched_clutch_root_t root_clutch)
2975 {
2976 return priority_queue_empty(&root_clutch->scr_foreign_buckets);
2977 }
2978
2979 /*
2980 * sched_clutch_root_highest_foreign_thread_remove()
2981 *
2982 * Routine to return the thread in the highest priority clutch bucket in a cluster.
2983 * Must be called with the pset for the cluster locked.
2984 */
2985 static thread_t
2986 sched_clutch_root_highest_foreign_thread_remove(
2987 sched_clutch_root_t root_clutch)
2988 {
2989 thread_t thread = THREAD_NULL;
2990 if (priority_queue_empty(&root_clutch->scr_foreign_buckets)) {
2991 return thread;
2992 }
2993 sched_clutch_bucket_t clutch_bucket = priority_queue_max(&root_clutch->scr_foreign_buckets, struct sched_clutch_bucket, scb_foreignlink);
2994 thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
2995 sched_clutch_thread_remove(root_clutch, thread, mach_absolute_time(), 0);
2996 return thread;
2997 }
2998
2999 #endif /* CONFIG_SCHED_EDGE */
3000
3001 /*
3002 * sched_clutch_thread_pri_shift()
3003 *
3004 * Routine to get the priority shift value for a thread.
3005 * Since the timesharing is done at the clutch_bucket level,
3006 * this routine gets the clutch_bucket and retrieves the
3007 * values from there.
3008 */
3009 uint32_t
3010 sched_clutch_thread_pri_shift(
3011 thread_t thread,
3012 sched_bucket_t bucket)
3013 {
3014 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3015 return INT8_MAX;
3016 }
3017 assert(bucket != TH_BUCKET_RUN);
3018 sched_clutch_t clutch = sched_clutch_for_thread(thread);
3019 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
3020 return os_atomic_load(&clutch_bucket_group->scbg_pri_shift, relaxed);
3021 }
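
/*
 * The returned shift feeds the standard timeshare decay computation (roughly
 * priority = base_pri - (cpu usage >> pri_shift)), so a larger shift means the
 * load on this clutch bucket group depresses its threads' priorities more
 * slowly. Ineligible threads get INT8_MAX, i.e. effectively no load-based
 * priority depression.
 */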
3022
3023 #pragma mark -- Clutch Scheduler Algorithm
3024
3025 static void
3026 sched_clutch_init(void);
3027
3028 static thread_t
3029 sched_clutch_steal_thread(processor_set_t pset);
3030
3031 #if !SCHED_TEST_HARNESS
3032
3033 static void
3034 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context);
3035
3036 #endif /* !SCHED_TEST_HARNESS */
3037
3038 static boolean_t
3039 sched_clutch_processor_enqueue(processor_t processor, thread_t thread,
3040 sched_options_t options);
3041
3042 static boolean_t
3043 sched_clutch_processor_queue_remove(processor_t processor, thread_t thread);
3044
3045 static ast_t
3046 sched_clutch_processor_csw_check(processor_t processor);
3047
3048 static boolean_t
3049 sched_clutch_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
3050
3051 static int
3052 sched_clutch_runq_count(processor_t processor);
3053
3054 static boolean_t
3055 sched_clutch_processor_queue_empty(processor_t processor);
3056
3057 #if !SCHED_TEST_HARNESS
3058
3059 static uint64_t
3060 sched_clutch_runq_stats_count_sum(processor_t processor);
3061
3062 #endif /* !SCHED_TEST_HARNESS */
3063
3064 static int
3065 sched_clutch_processor_bound_count(processor_t processor);
3066
3067 static void
3068 sched_clutch_pset_init(processor_set_t pset);
3069
3070 static void
3071 sched_clutch_processor_init(processor_t processor);
3072
3073 static thread_t
3074 sched_clutch_processor_highest_thread(processor_t processor, sched_clutch_traverse_mode_t mode);
3075
3076 static thread_t
3077 sched_clutch_choose_thread(processor_t processor, int priority, thread_t prev_thread, ast_t reason);
3078
3079 #if !SCHED_TEST_HARNESS
3080
3081 static void
3082 sched_clutch_processor_queue_shutdown(processor_t processor);
3083
3084 #endif /* !SCHED_TEST_HARNESS */
3085
3086 static sched_mode_t
3087 sched_clutch_initial_thread_sched_mode(task_t parent_task);
3088
3089 static uint32_t
3090 sched_clutch_initial_quantum_size(thread_t thread);
3091
3092 static uint32_t
3093 sched_clutch_run_incr(thread_t thread);
3094
3095 static uint32_t
3096 sched_clutch_run_decr(thread_t thread);
3097
3098 static void
3099 sched_clutch_update_thread_bucket(thread_t thread);
3100
3101 #if !SCHED_TEST_HARNESS
3102
3103 static void
3104 sched_clutch_thread_group_recommendation_change(struct thread_group *tg, cluster_type_t new_recommendation);
3105
3106 #endif /* !SCHED_TEST_HARNESS */
3107
3108 const struct sched_dispatch_table sched_clutch_dispatch = {
3109 .sched_name = "clutch",
3110 .init = sched_clutch_init,
3111 .timebase_init = sched_timeshare_timebase_init,
3112 .processor_init = sched_clutch_processor_init,
3113 .pset_init = sched_clutch_pset_init,
3114 .choose_thread = sched_clutch_choose_thread,
3115 .steal_thread = sched_clutch_steal_thread,
3116 .processor_enqueue = sched_clutch_processor_enqueue,
3117 .processor_queue_remove = sched_clutch_processor_queue_remove,
3118 .processor_queue_empty = sched_clutch_processor_queue_empty,
3119 .priority_is_urgent = priority_is_urgent,
3120 .processor_csw_check = sched_clutch_processor_csw_check,
3121 .processor_queue_has_priority = sched_clutch_processor_queue_has_priority,
3122 .initial_quantum_size = sched_clutch_initial_quantum_size,
3123 .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode,
3124 .processor_runq_count = sched_clutch_runq_count,
3125 .processor_bound_count = sched_clutch_processor_bound_count,
3126 .multiple_psets_enabled = TRUE,
3127 .avoid_processor_enabled = FALSE,
3128 .thread_avoid_processor = NULL,
3129 .update_thread_bucket = sched_clutch_update_thread_bucket,
3130 .cpu_init_completed = NULL,
3131 .thread_eligible_for_pset = NULL,
3132 #if !SCHED_TEST_HARNESS
3133 .maintenance_continuation = sched_timeshare_maintenance_continue,
3134 .steal_thread_enabled = sched_steal_thread_enabled,
3135 .compute_timeshare_priority = sched_compute_timeshare_priority,
3136 .choose_node = sched_choose_node,
3137 #if CONFIG_SCHED_SMT
3138 .choose_processor = choose_processor_smt,
3139 #else /* CONFIG_SCHED_SMT */
3140 .choose_processor = choose_processor,
3141 #endif
3142 .processor_queue_shutdown = sched_clutch_processor_queue_shutdown,
3143 .can_update_priority = can_update_priority,
3144 .update_priority = update_priority,
3145 .lightweight_update_priority = lightweight_update_priority,
3146 .quantum_expire = sched_default_quantum_expire,
3147 .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum,
3148 .thread_update_scan = sched_clutch_thread_update_scan,
3149 .processor_balance = sched_SMT_balance,
3150 .rt_runq = sched_rtlocal_runq,
3151 .rt_init = sched_rtlocal_init,
3152 .rt_queue_shutdown = sched_rtlocal_queue_shutdown,
3153 .rt_runq_scan = sched_rtlocal_runq_scan,
3154 .rt_runq_count_sum = sched_rtlocal_runq_count_sum,
3155 .rt_steal_thread = sched_rtlocal_steal_thread,
3156 .qos_max_parallelism = sched_qos_max_parallelism,
3157 .check_spill = sched_check_spill,
3158 .ipi_policy = sched_ipi_policy,
3159 .thread_should_yield = sched_thread_should_yield,
3160 .run_count_incr = sched_clutch_run_incr,
3161 .run_count_decr = sched_clutch_run_decr,
3162 .pset_made_schedulable = sched_pset_made_schedulable,
3163 .thread_group_recommendation_change = sched_clutch_thread_group_recommendation_change,
3164 #endif /* !SCHED_TEST_HARNESS */
3165 };
3166
3167 __attribute__((always_inline))
3168 static inline run_queue_t
3169 sched_clutch_bound_runq(processor_t processor)
3170 {
3171 return &processor->runq;
3172 }
3173
3174 __attribute__((always_inline))
3175 static inline sched_clutch_root_t
3176 sched_clutch_processor_root_clutch(processor_t processor)
3177 {
3178 return &processor->processor_set->pset_clutch_root;
3179 }
3180
3181 __attribute__((always_inline))
3182 static inline run_queue_t
3183 sched_clutch_thread_bound_runq(processor_t processor, __assert_only thread_t thread)
3184 {
3185 assert(thread->bound_processor == processor);
3186 return sched_clutch_bound_runq(processor);
3187 }
3188
3189 static uint32_t
3190 sched_clutch_initial_quantum_size(thread_t thread)
3191 {
3192 if (thread == THREAD_NULL) {
3193 return std_quantum;
3194 }
3195 assert(sched_clutch_thread_quantum[thread->th_sched_bucket] <= UINT32_MAX);
3196 return (uint32_t)sched_clutch_thread_quantum[thread->th_sched_bucket];
3197 }
3198
3199 static sched_mode_t
3200 sched_clutch_initial_thread_sched_mode(task_t parent_task)
3201 {
3202 if (parent_task == kernel_task) {
3203 return TH_MODE_FIXED;
3204 } else {
3205 return TH_MODE_TIMESHARE;
3206 }
3207 }
3208
3209 static void
3210 sched_clutch_processor_init(processor_t processor)
3211 {
3212 run_queue_init(&processor->runq);
3213 }
3214
3215 static void
3216 sched_clutch_pset_init(processor_set_t pset)
3217 {
3218 sched_clutch_root_init(&pset->pset_clutch_root, pset);
3219 }
3220
3221 static void
3222 sched_clutch_tunables_init(void)
3223 {
3224 sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
3225 sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
3226 sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
3227 clock_interval_to_absolutetime_interval(SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS,
3228 NSEC_PER_USEC, &sched_clutch_bucket_group_adjust_threshold);
3229 assert(sched_clutch_bucket_group_adjust_threshold <= CLUTCH_CPU_DATA_MAX);
3230 sched_clutch_us_to_abstime(sched_clutch_bucket_group_pending_delta_us, sched_clutch_bucket_group_pending_delta);
3231 }
3232
3233 static void
3234 sched_clutch_init(void)
3235 {
3236 if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
3237 sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
3238 }
3239 sched_timeshare_init();
3240 sched_clutch_tunables_init();
3241 }
3242
3243 static inline bool
3244 sched_clutch_pri_greater_than_tiebreak(int pri_one, int pri_two, bool one_wins_ties)
3245 {
3246 if (one_wins_ties) {
3247 return pri_one >= pri_two;
3248 } else {
3249 return pri_one > pri_two;
3250 }
3251 }
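
/*
 * The callers pass processor->first_timeslice as one_wins_ties, so a previously
 * running thread wins priority ties while it is still within its first timeslice
 * and loses them once its quantum has been used, giving round-robin behavior
 * among threads of equal priority.
 */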
3252
3253 /*
3254 * sched_clutch_processor_highest_thread()
3255 *
3256 * Routine to determine the highest thread on the entire cluster runqueue which
3257 * should be selected to run next, optionally comparing against the previously
3258 * running thread. Removes the highest thread from the runqueue, depending on the
3259 * traverse mode and whether the highest thread is the previously running thread.
3260 *
3261 * Always called with the pset lock held. Assumes that processor->active_thread
3262 * may be locked and modified by another processor.
3263 */
3264 static thread_t
3265 sched_clutch_processor_highest_thread(
3266 processor_t processor,
3267 sched_clutch_traverse_mode_t mode)
3268 {
3269 sched_clutch_root_t root_clutch = sched_clutch_processor_root_clutch(processor);
3270 int clutch_pri = sched_clutch_root_priority(root_clutch);
3271 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3272 int bound_pri = bound_runq->highq;
3273
3274 bool has_prev_thread = mode == SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT;
3275 thread_t prev_thread = has_prev_thread ? processor->active_thread : NULL;
3276
3277 if (bound_runq->count == 0 && root_clutch->scr_thr_count == 0) {
3278 /* The runqueue is totally empty */
3279 assert(bound_pri < MINPRI && clutch_pri < MINPRI);
3280 return prev_thread;
3281 }
3282
3283 if (has_prev_thread) {
3284 if (prev_thread->sched_pri >= BASEPRI_RTQUEUES) {
3285 /* The previous thread is real-time and thus guaranteed higher than the non-RT runqueue */
3286 return prev_thread;
3287 }
3288 /* Allow the previous thread to influence the priority comparison of Clutch hierarchy vs. processor-bound runqueue */
3289 if (prev_thread->bound_processor != NULL) {
3290 bound_pri = MAX(bound_pri, prev_thread->sched_pri);
3291 } else {
3292 clutch_pri = MAX(clutch_pri, prev_thread->sched_pri);
3293 }
3294 }
3295
3296 bool prev_thread_is_not_processor_bound = has_prev_thread && (prev_thread->bound_processor == NULL);
3297 bool prev_thread_is_processor_bound = has_prev_thread && (prev_thread->bound_processor != NULL);
3298 thread_t next_thread = prev_thread;
3299 if (clutch_pri > bound_pri) {
3300 if (root_clutch->scr_thr_count == 0) {
3301 goto found_thread;
3302 }
3303 next_thread = sched_clutch_hierarchy_thread_highest(root_clutch, processor, prev_thread_is_not_processor_bound ? prev_thread : NULL, mode);
3304 } else {
3305 if (bound_runq->count == 0 ||
3306 (prev_thread_is_processor_bound && sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, bound_runq->highq, processor->first_timeslice))) {
3307 goto found_thread;
3308 }
3309 next_thread = (mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY) ?
3310 run_queue_dequeue(bound_runq, SCHED_HEADQ) : run_queue_peek(bound_runq);
3311 assert(mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || next_thread != prev_thread);
3312 }
3313 found_thread:
3314 assert(next_thread != NULL);
3315 return next_thread;
3316 }
3317
3318 static thread_t
3319 sched_clutch_choose_thread(
3320 processor_t processor,
3321 __unused int priority,
3322 thread_t _Nullable prev_thread,
3323 __unused ast_t reason)
3324 {
3325 assert(prev_thread == NULL || prev_thread == processor->active_thread);
3326 return sched_clutch_processor_highest_thread(processor, prev_thread != NULL ? SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT : SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY);
3327 }
3328
3329 static boolean_t
3330 sched_clutch_processor_enqueue(
3331 processor_t processor,
3332 thread_t thread,
3333 sched_options_t options)
3334 {
3335 boolean_t result;
3336
3337 thread_set_runq_locked(thread, processor);
3338 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3339 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3340 result = sched_clutch_thread_insert(pset_clutch_root, thread, options);
3341 } else {
3342 run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
3343 result = run_queue_enqueue(rq, thread, options);
3344 }
3345 return result;
3346 }
3347
3348 static boolean_t
3349 sched_clutch_processor_queue_empty(processor_t processor)
3350 {
3351 return sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0 &&
3352 sched_clutch_bound_runq(processor)->count == 0;
3353 }
3354
3355 static ast_t
3356 sched_clutch_processor_csw_check(processor_t processor)
3357 {
3358 assert(processor->active_thread != NULL);
3359 thread_t runqueue_thread = sched_clutch_processor_highest_thread(processor, SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT);
3360 if (runqueue_thread != processor->active_thread) {
3361 /* Found a better thread to run */
3362 if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0 ||
3363 sched_clutch_bound_runq(processor)->urgency > 0) {
3364 return AST_PREEMPT | AST_URGENT;
3365 }
3366 return AST_PREEMPT;
3367 }
3368 return AST_NONE;
3369 }
3370
3371 static boolean_t
3372 sched_clutch_processor_queue_has_priority(
3373 __unused processor_t processor,
3374 __unused int priority,
3375 __unused boolean_t gte)
3376 {
3377 /*
3378 * Never short-circuit the Clutch runqueue by returning FALSE here. Instead,
3379 * thread_select() should always go through sched_clutch_choose_thread().
3380 */
3381 return TRUE;
3382 }
3383
3384 static int
3385 sched_clutch_runq_count(processor_t processor)
3386 {
3387 return (int)sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) + sched_clutch_bound_runq(processor)->count;
3388 }
3389
3390 #if !SCHED_TEST_HARNESS
3391
3392 static uint64_t
3393 sched_clutch_runq_stats_count_sum(processor_t processor)
3394 {
3395 uint64_t bound_sum = sched_clutch_bound_runq(processor)->runq_stats.count_sum;
3396
3397 if (processor->cpu_id == processor->processor_set->cpu_set_low) {
3398 return bound_sum + sched_clutch_root_count_sum(sched_clutch_processor_root_clutch(processor));
3399 } else {
3400 return bound_sum;
3401 }
3402 }
3403
3404 #endif /* !SCHED_TEST_HARNESS */
3405
3406 static int
3407 sched_clutch_processor_bound_count(processor_t processor)
3408 {
3409 return sched_clutch_bound_runq(processor)->count;
3410 }
3411
3412 #if !SCHED_TEST_HARNESS
3413
3414 static void
3415 sched_clutch_processor_queue_shutdown(processor_t processor)
3416 {
3417 processor_set_t pset = processor->processor_set;
3418 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3419 thread_t thread;
3420 queue_head_t tqueue;
3421
3422 /* We only need to migrate threads if this is the last active processor in the pset */
3423 if (pset->online_processor_count > 0) {
3424 pset_unlock(pset);
3425 return;
3426 }
3427
3428 queue_init(&tqueue);
3429 while (sched_clutch_root_count(pset_clutch_root) > 0) {
3430 thread = sched_clutch_hierarchy_thread_highest(pset_clutch_root, processor, NULL, SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY);
3431 enqueue_tail(&tqueue, &thread->runq_links);
3432 }
3433
3434 pset_unlock(pset);
3435
3436 qe_foreach_element_safe(thread, &tqueue, runq_links) {
3437 remqueue(&thread->runq_links);
3438 thread_lock(thread);
3439 thread_setrun(thread, SCHED_TAILQ);
3440 thread_unlock(thread);
3441 }
3442 }
3443
3444 #endif /* !SCHED_TEST_HARNESS */
3445
3446 static boolean_t
3447 sched_clutch_processor_queue_remove(
3448 processor_t processor,
3449 thread_t thread)
3450 {
3451 processor_set_t pset = processor->processor_set;
3452
3453 pset_lock(pset);
3454
3455 if (processor == thread_get_runq_locked(thread)) {
3456 /*
3457 * Thread is on a run queue and we have a lock on
3458 * that run queue.
3459 */
3460 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3461 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3462 sched_clutch_thread_remove(pset_clutch_root, thread, mach_absolute_time(), SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
3463 } else {
3464 run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
3465 run_queue_remove(rq, thread);
3466 }
3467 } else {
3468 /*
3469 * The thread left the run queue before we could
3470 * lock the run queue.
3471 */
3472 thread_assert_runq_null(thread);
3473 processor = PROCESSOR_NULL;
3474 }
3475
3476 pset_unlock(pset);
3477
3478 return processor != PROCESSOR_NULL;
3479 }
3480
3481 static thread_t
3482 sched_clutch_steal_thread(__unused processor_set_t pset)
3483 {
3484 /* Thread stealing is not enabled for single cluster clutch scheduler platforms */
3485 return THREAD_NULL;
3486 }
3487
3488 #if !SCHED_TEST_HARNESS
3489
3490 static void
3491 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context)
3492 {
3493 boolean_t restart_needed = FALSE;
3494 processor_t processor = processor_list;
3495 processor_set_t pset;
3496 thread_t thread;
3497 spl_t s;
3498
3499 /*
3500 * We update the threads associated with each processor (bound and idle threads)
3501 * and then update the threads in each pset runqueue.
3502 */
3503
3504 do {
3505 do {
3506 pset = processor->processor_set;
3507
3508 s = splsched();
3509 pset_lock(pset);
3510
3511 restart_needed = runq_scan(sched_clutch_bound_runq(processor), scan_context);
3512
3513 pset_unlock(pset);
3514 splx(s);
3515
3516 if (restart_needed) {
3517 break;
3518 }
3519
3520 thread = processor->idle_thread;
3521 if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
3522 if (thread_update_add_thread(thread) == FALSE) {
3523 restart_needed = TRUE;
3524 break;
3525 }
3526 }
3527 } while ((processor = processor->processor_list) != NULL);
3528
3529 /* Ok, we now have a collection of candidates -- fix them. */
3530 thread_update_process_threads();
3531 } while (restart_needed);
3532
3533 pset_node_t node = &pset_node0;
3534 pset = node->psets;
3535
3536 do {
3537 do {
3538 restart_needed = FALSE;
3539 while (pset != NULL) {
3540 s = splsched();
3541 pset_lock(pset);
3542
3543 if (sched_clutch_root_count(&pset->pset_clutch_root) > 0) {
3544 for (sched_bucket_t bucket = TH_BUCKET_SHARE_FG; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
3545 restart_needed = runq_scan(&pset->pset_clutch_root.scr_bound_buckets[bucket].scrb_bound_thread_runq, scan_context);
3546 if (restart_needed) {
3547 break;
3548 }
3549 }
3550 queue_t clutch_bucket_list = &pset->pset_clutch_root.scr_clutch_buckets;
3551 sched_clutch_bucket_t clutch_bucket;
3552 qe_foreach_element(clutch_bucket, clutch_bucket_list, scb_listlink) {
3553 sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, scan_context->sched_tick_last_abstime);
3554 restart_needed = sched_clutch_timeshare_scan(&clutch_bucket->scb_thread_timeshare_queue, clutch_bucket->scb_thr_count, scan_context);
3555 if (restart_needed) {
3556 break;
3557 }
3558 }
3559 }
3560
3561 pset_unlock(pset);
3562 splx(s);
3563
3564 if (restart_needed) {
3565 break;
3566 }
3567 pset = pset->pset_list;
3568 }
3569
3570 if (restart_needed) {
3571 break;
3572 }
3573 } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
3574
3575 /* Ok, we now have a collection of candidates -- fix them. */
3576 thread_update_process_threads();
3577 } while (restart_needed);
3578 }
3579
3580 /*
3581 * For the clutch scheduler, the run counts are maintained in the clutch
3582 * buckets (i.e. the thread group scheduling structure).
3583 */
3584 static uint32_t
3585 sched_clutch_run_incr(thread_t thread)
3586 {
3587 assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
3588 uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3589 sched_clutch_thread_run_bucket_incr(thread, thread->th_sched_bucket);
3590 return new_count;
3591 }
3592
3593 static uint32_t
3594 sched_clutch_run_decr(thread_t thread)
3595 {
3596 assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);
3597 uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3598 sched_clutch_thread_run_bucket_decr(thread, thread->th_sched_bucket);
3599 return new_count;
3600 }
3601
3602 /*
3603 * For threads that have changed sched_pri without changing the
3604 * base_pri for any reason other than decay, use the sched_pri
3605 * as the bucketizing priority instead of base_pri. All such
3606 * changes are typically due to kernel locking primitive boosts
3607 * or demotions.
3608 */
3609 static boolean_t
3610 sched_thread_sched_pri_promoted(thread_t thread)
3611 {
3612 return (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) ||
3613 (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) ||
3614 (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) ||
3615 (thread->kern_promotion_schedpri != 0);
3616 }
3617
3618 #endif /* !SCHED_TEST_HARNESS */
3619
3620 /*
3621 * Routine to update the scheduling bucket for the thread.
3622 *
3623 * In the clutch scheduler implementation, the thread's bucket
3624 * is based on sched_pri if it was promoted due to a kernel
3625 * primitive; otherwise its based on the thread base_pri. This
3626 * enhancement allows promoted threads to reach a higher priority
3627 * bucket and potentially get selected sooner for scheduling.
3628 *
3629 * Also, the clutch scheduler does not honor fixed priority below
3630 * FG priority. It simply puts those threads in the corresponding
3631 * timeshare bucket. The reason for doing that is that it is
3632 * extremely hard to define the scheduling properties of such threads
3633 * and they typically lead to performance issues.
3634 *
3635 * Called with the thread lock held and the thread held off the runqueue.
3636 */
3637
3638 void
3639 sched_clutch_update_thread_bucket(thread_t thread)
3640 {
3641 sched_bucket_t old_bucket = thread->th_sched_bucket;
3642 thread_assert_runq_null(thread);
3643 int pri = (sched_thread_sched_pri_promoted(thread)) ? thread->sched_pri : thread->base_pri;
3644 sched_bucket_t new_bucket = sched_clutch_thread_bucket_map(thread, pri);
3645
3646 if (old_bucket == new_bucket) {
3647 return;
3648 }
3649
3650 thread->th_sched_bucket = new_bucket;
3651 thread->pri_shift = sched_clutch_thread_pri_shift(thread, new_bucket);
3652 /*
3653 * Since this is called after the thread has been removed from the runq,
3654 * only the run counts need to be updated. The re-insert into the runq
3655 * would put the thread into the correct new bucket's runq.
3656 */
3657 if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
3658 sched_clutch_thread_run_bucket_decr(thread, old_bucket);
3659 sched_clutch_thread_run_bucket_incr(thread, new_bucket);
3660 }
3661 }
3662
3663 #if !SCHED_TEST_HARNESS
3664
3665 static void
3666 sched_clutch_thread_group_recommendation_change(__unused struct thread_group *tg, __unused cluster_type_t new_recommendation)
3667 {
3668 /* Clutch ignores the recommendation because Clutch does not migrate
3669 * threads between cluster types independently from the Edge scheduler.
3670 */
3671 }
3672
3673 #endif /* !SCHED_TEST_HARNESS */
3674
3675 #if CONFIG_SCHED_EDGE
3676
3677 /* Implementation of the AMP version of the clutch scheduler */
3678
3679 static void
3680 sched_edge_init(void);
3681
3682 static void
3683 sched_edge_pset_init(processor_set_t pset);
3684
3685 static thread_t
3686 sched_edge_processor_idle(processor_set_t pset);
3687
3688 static boolean_t
3689 sched_edge_processor_queue_empty(processor_t processor);
3690
3691 static void
3692 sched_edge_processor_queue_shutdown(processor_t processor);
3693
3694 static processor_t
3695 sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread);
3696
3697 static bool
3698 sched_edge_thread_avoid_processor(processor_t processor, thread_t thread, ast_t reason);
3699
3700 static bool
3701 sched_edge_balance(processor_t cprocessor, processor_set_t cpset);
3702
3703 static void
3704 sched_edge_check_spill(processor_set_t pset, thread_t thread);
3705
3706 static bool
3707 sched_edge_thread_should_yield(processor_t processor, thread_t thread);
3708
3709 static void
3710 sched_edge_pset_made_schedulable(processor_t processor, processor_set_t dst_pset, boolean_t drop_lock);
3711
3712 static void
3713 sched_edge_cpu_init_completed(void);
3714
3715 static bool
3716 sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset);
3717
3718 static bool
3719 sched_edge_steal_thread_enabled(processor_set_t pset);
3720
3721 static sched_ipi_type_t
3722 sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event);
3723
3724 static uint32_t
3725 sched_edge_qos_max_parallelism(int qos, uint64_t options);
3726
3727 static uint32_t
3728 sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket);
3729
3730 const struct sched_dispatch_table sched_edge_dispatch = {
3731 .sched_name = "edge",
3732 .init = sched_edge_init,
3733 .timebase_init = sched_timeshare_timebase_init,
3734 .processor_init = sched_clutch_processor_init,
3735 .pset_init = sched_edge_pset_init,
3736 .choose_thread = sched_clutch_choose_thread,
3737 .steal_thread_enabled = sched_edge_steal_thread_enabled,
3738 .steal_thread = sched_edge_processor_idle,
3739 .choose_processor = sched_edge_choose_processor,
3740 .processor_enqueue = sched_clutch_processor_enqueue,
3741 .processor_queue_remove = sched_clutch_processor_queue_remove,
3742 .processor_queue_empty = sched_edge_processor_queue_empty,
3743 .priority_is_urgent = priority_is_urgent,
3744 .processor_csw_check = sched_clutch_processor_csw_check,
3745 .processor_queue_has_priority = sched_clutch_processor_queue_has_priority,
3746 .initial_quantum_size = sched_clutch_initial_quantum_size,
3747 .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode,
3748 .processor_runq_count = sched_clutch_runq_count,
3749 .processor_bound_count = sched_clutch_processor_bound_count,
3750 .multiple_psets_enabled = TRUE,
3751 .avoid_processor_enabled = TRUE,
3752 .thread_avoid_processor = sched_edge_thread_avoid_processor,
3753 .processor_balance = sched_edge_balance,
3754 .qos_max_parallelism = sched_edge_qos_max_parallelism,
3755 .check_spill = sched_edge_check_spill,
3756 .ipi_policy = sched_edge_ipi_policy,
3757 .thread_should_yield = sched_edge_thread_should_yield,
3758 .update_thread_bucket = sched_clutch_update_thread_bucket,
3759 .cpu_init_completed = sched_edge_cpu_init_completed,
3760 .thread_eligible_for_pset = sched_edge_thread_eligible_for_pset,
3761 #if !SCHED_TEST_HARNESS
3762 .maintenance_continuation = sched_timeshare_maintenance_continue,
3763 .compute_timeshare_priority = sched_compute_timeshare_priority,
3764 .choose_node = sched_choose_node,
3765 .processor_queue_shutdown = sched_edge_processor_queue_shutdown,
3766 .can_update_priority = can_update_priority,
3767 .update_priority = update_priority,
3768 .lightweight_update_priority = lightweight_update_priority,
3769 .quantum_expire = sched_default_quantum_expire,
3770 .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum,
3771 .thread_update_scan = sched_clutch_thread_update_scan,
3772 .rt_runq = sched_rtlocal_runq,
3773 .rt_init = sched_rtlocal_init,
3774 .rt_queue_shutdown = sched_rtlocal_queue_shutdown,
3775 .rt_runq_scan = sched_rtlocal_runq_scan,
3776 .rt_runq_count_sum = sched_rtlocal_runq_count_sum,
3777 .rt_steal_thread = sched_rtlocal_steal_thread,
3778 .run_count_incr = sched_clutch_run_incr,
3779 .run_count_decr = sched_clutch_run_decr,
3780 .pset_made_schedulable = sched_edge_pset_made_schedulable,
3781 .thread_group_recommendation_change = NULL,
3782 #endif /* !SCHED_TEST_HARNESS */
3783 };
3784
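/*
 * Bitmap of clusters currently available for scheduling. A pset sets its bit
 * in sched_edge_pset_init() and clears it when its last available processor is
 * shut down, so placement and steal logic can skip offlined clusters.
 */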
static bitmap_t sched_edge_available_pset_bitmask[BITMAP_LEN(MAX_PSETS)];

/*
 * sched_edge_thread_bound_cluster_id()
 *
 * Routine to determine which cluster a particular thread is bound to. Uses
 * the sched_flags on the thread to map back to a specific cluster id.
 *
 * <Edge Multi-cluster Support Needed>
 */
static uint32_t
sched_edge_thread_bound_cluster_id(thread_t thread)
{
	assert(SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread));
	return thread->th_bound_cluster_id;
}

/* Forward declaration for some thread migration routines */
static boolean_t sched_edge_foreign_runnable_thread_available(processor_set_t pset);
static boolean_t sched_edge_foreign_running_thread_available(processor_set_t pset);
static processor_set_t sched_edge_steal_candidate(processor_set_t pset);
static processor_set_t sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks);

/*
 * sched_edge_config_set()
 *
 * Support to update an edge configuration. Typically used by CLPC to affect thread migration
 * policies in the scheduler.
 */
static void
sched_edge_config_set(uint32_t src_cluster, uint32_t dst_cluster, sched_clutch_edge edge_config)
{
	sched_clutch_edge *edge = &pset_array[src_cluster]->sched_edges[dst_cluster];
	edge->sce_edge_packed = edge_config.sce_edge_packed;
}

/*
 * sched_edge_config_get()
 *
 * Support to get an edge configuration. Typically used by CLPC to query edge configs to decide
 * if it needs to update edges.
 */
static sched_clutch_edge
sched_edge_config_get(uint32_t src_cluster, uint32_t dst_cluster)
{
	return pset_array[src_cluster]->sched_edges[dst_cluster];
}

/*
 * sched_edge_matrix_set()
 *
 * Routine to update various edges in the cluster edge matrix. The edge_changes_bitmap
 * indicates which edges need to be updated. Both the edge_matrix & edge_changes_bitmap
 * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
 */
void
sched_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changes_bitmap, __unused uint64_t flags, uint64_t matrix_order)
{
	uint32_t edge_index = 0;
	for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
		for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
			if (edge_changes_bitmap[edge_index]) {
				sched_edge_config_set(src_cluster, dst_cluster, edge_matrix[edge_index]);
			}
			edge_index++;
		}
	}
}

/*
 * sched_edge_matrix_get()
 *
 * Routine to retrieve various edges in the cluster edge matrix. The edge_request_bitmap
 * indicates which edges need to be retrieved. Both the edge_matrix & edge_request_bitmap
 * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
 */
void
sched_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_request_bitmap, __unused uint64_t flags, uint64_t matrix_order)
{
	uint32_t edge_index = 0;
	for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
		for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
			if (edge_request_bitmap[edge_index]) {
				edge_matrix[edge_index] = sched_edge_config_get(src_cluster, dst_cluster);
			}
			edge_index++;
		}
	}
}

/*
 * sched_edge_init()
 *
 * Routine to initialize the data structures for the Edge scheduler.
 */
static void
sched_edge_init(void)
{
	if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
		sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
	}
	sched_timeshare_init();
	sched_clutch_tunables_init();
	sched_edge_max_clusters = ml_get_cluster_count();
}

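/*
 * sched_edge_pset_init()
 *
 * Routine to initialize the Edge scheduler state for a pset: derives the pset
 * type from the AMP cluster type, clears the locality bitmaps and the self-edge
 * for the pset's own cluster id, initializes the clutch root hierarchy and
 * marks the pset as available for scheduling.
 */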
static void
sched_edge_pset_init(processor_set_t pset)
{
	uint32_t pset_cluster_id = pset->pset_cluster_id;
	pset->pset_type = (pset->pset_cluster_type == PSET_AMP_P) ? CLUSTER_TYPE_P : CLUSTER_TYPE_E;

	/* Set the edge weight and properties for the pset itself */
	bitmap_clear(pset->foreign_psets, pset_cluster_id);
	bitmap_clear(pset->native_psets, pset_cluster_id);
	bitmap_clear(pset->local_psets, pset_cluster_id);
	bitmap_clear(pset->remote_psets, pset_cluster_id);
	pset->sched_edges[pset_cluster_id].sce_edge_packed = (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0}.sce_edge_packed;
	sched_clutch_root_init(&pset->pset_clutch_root, pset);
	bitmap_set(sched_edge_available_pset_bitmask, pset_cluster_id);
}

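/*
 * sched_edge_processor_queue_empty()
 *
 * Returns true if the processor has no runnable threads in either the
 * cluster's clutch hierarchy or its processor-bound runqueue.
 */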
static boolean_t
sched_edge_processor_queue_empty(processor_t processor)
{
	return (sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0) &&
	       (sched_clutch_bound_runq(processor)->count == 0);
}

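/*
 * sched_edge_check_spill()
 *
 * SCHED(check_spill) callout. The Edge scheduler does no spill accounting in
 * this callout; it only asserts that the thread is not bound to a processor.
 */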
static void
sched_edge_check_spill(__unused processor_set_t pset, __unused thread_t thread)
{
	assert(thread->bound_processor == PROCESSOR_NULL);
}

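/*
 * Reason codes emitted with the MACH_SCHED_EDGE_SHOULD_YIELD tracepoint to
 * indicate why sched_edge_thread_should_yield() allowed or denied the yield.
 */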
__options_decl(sched_edge_thread_yield_reason_t, uint32_t, {
	SCHED_EDGE_YIELD_RUNQ_NONEMPTY = 0x0,
	SCHED_EDGE_YIELD_FOREIGN_RUNNABLE = 0x1,
	SCHED_EDGE_YIELD_FOREIGN_RUNNING = 0x2,
	SCHED_EDGE_YIELD_STEAL_POSSIBLE = 0x3,
	SCHED_EDGE_YIELD_DISALLOW = 0x4,
});

static bool
sched_edge_thread_should_yield(processor_t processor, __unused thread_t thread)
{
	if (!sched_edge_processor_queue_empty(processor) || (rt_runq_count(processor->processor_set) > 0)) {
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_RUNQ_NONEMPTY);
		return true;
	}

	/*
	 * The yield logic should follow the same logic as steal_thread().
	 * thread_should_yield() is effectively a quick check of whether, if the
	 * current thread gave up the CPU, any other thread would execute on this
	 * CPU. So it needs to provide the same answer as the steal_thread()/
	 * processor idle logic.
	 */
	if (sched_edge_foreign_runnable_thread_available(processor->processor_set)) {
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNABLE);
		return true;
	}
	if (sched_edge_foreign_running_thread_available(processor->processor_set)) {
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNING);
		return true;
	}

	processor_set_t steal_candidate = sched_edge_steal_candidate(processor->processor_set);
	if (steal_candidate != NULL) {
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_STEAL_POSSIBLE);
		return true;
	}

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE, thread_tid(thread), processor->processor_set->pset_cluster_id,
	    0, SCHED_EDGE_YIELD_DISALLOW);
	return false;
}

#if !SCHED_TEST_HARNESS

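/*
 * sched_edge_processor_queue_shutdown()
 *
 * Routine invoked when a processor is shut down. If this is the last available
 * processor in the pset (or the pset is derecommended), the cluster is marked
 * unavailable and all threads in its clutch hierarchy are drained and
 * re-enqueued via thread_setrun() so they can be placed on other clusters.
 */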
static void
sched_edge_processor_queue_shutdown(processor_t processor)
{
	processor_set_t pset = processor->processor_set;
	sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
	thread_t thread;
	queue_head_t tqueue;

	/* We only need to migrate threads if this is the last active or last recommended processor in the pset */
	if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
		pset_unlock(pset);
		return;
	}

	bitmap_clear(sched_edge_available_pset_bitmask, pset->pset_cluster_id);

	queue_init(&tqueue);
	while (sched_clutch_root_count(pset_clutch_root) > 0) {
		thread = sched_clutch_hierarchy_thread_highest(pset_clutch_root, processor, NULL, SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY);
		enqueue_tail(&tqueue, &thread->runq_links);
	}
	pset_unlock(pset);

	qe_foreach_element_safe(thread, &tqueue, runq_links) {
		remqueue(&thread->runq_links);
		thread_lock(thread);
		thread_setrun(thread, SCHED_TAILQ);
		thread_unlock(thread);
	}
}

#endif /* !SCHED_TEST_HARNESS */

/*
 * sched_edge_cluster_load_metric()
 *
 * The load metric for a cluster is a measure of the average scheduling latency
 * experienced by threads on that cluster. It is a product of the average number
 * of threads in the runqueue and the average execution time for threads. The metric
 * has special values in the following cases:
 * - UINT32_MAX: If the cluster is not available for scheduling, its load is set to
 *   the maximum value to prevent any threads from migrating to this cluster.
 * - 0: If there are idle CPUs in the cluster or an empty runqueue; this allows threads
 *   to be spread across the platform quickly for ncpu wide workloads.
 */
static uint32_t
sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket)
{
	if (pset_is_recommended(pset) == false) {
		return UINT32_MAX;
	}
	return (uint32_t)sched_get_pset_load_average(pset, sched_bucket);
}

/*
 *
 * Edge Scheduler Steal/Rebalance logic
 *
 * = Generic scheduler logic =
 *
 * The SCHED(steal_thread) scheduler callout is invoked when the processor does not
 * find any thread for execution in its runqueue. The aim of the steal operation
 * is to find other threads running/runnable in other clusters which should be
 * executed here.
 *
 * If the steal callout does not return a thread, the thread_select() logic calls
 * the SCHED(processor_balance) callout which is supposed to IPI other CPUs to rebalance
 * threads and idle out the current CPU.
 *
 * = SCHED(steal_thread) for Edge Scheduler =
 *
 * The edge scheduler hooks into sched_edge_processor_idle() for steal_thread. This
 * routine tries to do the following operations in order:
 * (1) Find foreign runnable threads in non-native cluster
 *     runqueues (sched_edge_foreign_runnable_thread_remove())
 * (2) Check if foreign threads are running on the non-native
 *     clusters (sched_edge_foreign_running_thread_available())
 *     - If yes, return THREAD_NULL for the steal callout and
 *       perform rebalancing as part of SCHED(processor_balance) i.e. sched_edge_balance()
 * (3) Steal a thread from another cluster based on edge
 *     weights (sched_edge_steal_thread())
 *
 * = SCHED(processor_balance) for Edge Scheduler =
 *
 * If steal_thread did not return a thread for the processor, use
 * sched_edge_balance() to rebalance foreign running threads and idle out this CPU.
 *
 * = Clutch Bucket Preferred Cluster Overrides =
 *
 * Since these operations (just like thread migrations on enqueue)
 * move threads across clusters, they need support for handling clutch
 * bucket group level preferred cluster recommendations.
 * For (1), a clutch bucket will be in the foreign runnable queue based
 * on the clutch bucket group preferred cluster.
 * For (2), the running thread will set the bit on the processor based
 * on its preferred cluster type.
 * For (3), the edge configuration would prevent threads from being stolen
 * in the wrong direction.
 *
 * = SCHED(thread_should_yield) =
 * The thread_should_yield() logic needs to have the same logic as sched_edge_processor_idle()
 * since it is expected to return the same answer as if thread_select() were called on a core
 * with an empty runqueue.
 */

static bool
sched_edge_steal_thread_enabled(__unused processor_set_t pset)
{
	/*
	 * For the edge scheduler, the gating for steal is done by sched_edge_steal_candidate().
	 */
	return true;
}

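/*
 * sched_edge_steal_candidate()
 *
 * Returns a pset (other than the one passed in) whose incoming edge to this
 * cluster allows stealing and which currently has unbound runnable threads,
 * or NULL if no such candidate exists.
 */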
static processor_set_t
sched_edge_steal_candidate(processor_set_t pset)
{
	uint32_t dst_cluster_id = pset->pset_cluster_id;
	for (int cluster_id = 0; cluster_id < sched_edge_max_clusters; cluster_id++) {
		processor_set_t candidate_pset = pset_array[cluster_id];
		if (cluster_id == dst_cluster_id) {
			continue;
		}
		if (candidate_pset == NULL) {
			continue;
		}
		sched_clutch_edge *incoming_edge = &pset_array[cluster_id]->sched_edges[dst_cluster_id];
		if (incoming_edge->sce_steal_allowed && (bitmap_lsb_first(candidate_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) != -1)) {
			return candidate_pset;
		}
	}
	return NULL;
}

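/*
 * sched_edge_foreign_runnable_thread_available()
 *
 * Returns true if any cluster foreign to this pset has runnable threads queued
 * as foreign in its clutch hierarchy. The check is performed without taking
 * the remote pset locks to keep the common case cheap.
 */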
static boolean_t
sched_edge_foreign_runnable_thread_available(processor_set_t pset)
{
	/* Find all the clusters that are foreign for this cluster */
	bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
	for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
		/*
		 * For each cluster, see if there are any runnable foreign threads.
		 * This check is currently being done without the pset lock to make it cheap for
		 * the common case.
		 */
		processor_set_t target_pset = pset_array[cluster];
		if (pset_is_recommended(target_pset) == false) {
			continue;
		}

		if (!sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
			return true;
		}
	}
	return false;
}

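/*
 * sched_edge_foreign_runnable_thread_remove()
 *
 * Scans the clusters foreign to this pset and removes the highest priority
 * foreign runnable thread from the first cluster that has one so it can be run
 * locally. Returns THREAD_NULL if no such thread was found.
 */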
static thread_t
sched_edge_foreign_runnable_thread_remove(processor_set_t pset, uint64_t ctime)
{
	thread_t thread = THREAD_NULL;

	/* Find all the clusters that are foreign for this cluster */
	bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
	for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
		/*
		 * For each cluster, see if there are any runnable foreign threads.
		 * This check is currently being done without the pset lock to make it cheap for
		 * the common case.
		 */
		processor_set_t target_pset = pset_array[cluster];
		if (pset_is_recommended(target_pset) == false) {
			continue;
		}

		if (sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
			continue;
		}
		/*
		 * Looks like there are runnable foreign threads in the hierarchy; lock the pset
		 * and get the highest priority thread.
		 */
		pset_lock(target_pset);
		if (pset_is_recommended(target_pset)) {
			thread = sched_clutch_root_highest_foreign_thread_remove(&target_pset->pset_clutch_root);
			sched_update_pset_load_average(target_pset, ctime);
		}
		pset_unlock(target_pset);

		/*
		 * Edge Scheduler Optimization
		 *
		 * The current implementation immediately returns as soon as it finds a foreign
		 * runnable thread. This could be enhanced to look at highest priority threads
		 * from all foreign clusters and pick the highest amongst them. That would need
		 * some form of global state across psets to make that kind of a check cheap.
		 */
		if (thread != THREAD_NULL) {
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNABLE) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, target_pset->pset_cluster_id, 0);
			break;
		}
		/* Looks like the thread escaped after the check but before the pset lock was taken; continue the search */
	}

	return thread;
}

/*
 * sched_edge_cpu_running_foreign_shared_rsrc_available()
 *
 * Routine to determine if the thread running on a CPU is a shared resource thread
 * and can be rebalanced to the cluster with an idle CPU. It is used to determine if
 * a CPU going idle on a pset should rebalance a running shared resource heavy thread
 * from another non-ideal cluster based on the former's shared resource load.
 */
static boolean_t
sched_edge_cpu_running_foreign_shared_rsrc_available(processor_set_t target_pset, int foreign_cpu, processor_set_t idle_pset)
{
	boolean_t idle_pset_shared_rsrc_rr_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_RR);
	if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_RR], foreign_cpu) && !idle_pset_shared_rsrc_rr_idle) {
		return false;
	}

	boolean_t idle_pset_shared_rsrc_biu_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST);
	if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST], foreign_cpu) && !idle_pset_shared_rsrc_biu_idle) {
		return false;
	}
	return true;
}

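/*
 * sched_edge_foreign_running_thread_available()
 *
 * Returns true if any foreign cluster is currently running a thread that is
 * foreign to it (and that shared resource constraints allow rebalancing here),
 * indicating that an idle CPU on this pset should trigger a rebalance.
 */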
static boolean_t
sched_edge_foreign_running_thread_available(processor_set_t pset)
{
	bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
	int cluster = -1;
	while ((cluster = sched_edge_iterate_clusters_ordered(pset, foreign_pset_bitmap[0], cluster)) != -1) {
		/* Skip the pset if it's not schedulable */
		processor_set_t target_pset = pset_array[cluster];
		if (pset_is_recommended(target_pset) == false) {
			continue;
		}

		uint64_t running_foreign_bitmap = target_pset->cpu_state_map[PROCESSOR_RUNNING] & target_pset->cpu_running_foreign;
		for (int cpu_foreign = bit_first(running_foreign_bitmap); cpu_foreign >= 0; cpu_foreign = bit_next(running_foreign_bitmap, cpu_foreign)) {
			if (!sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpu_foreign, pset)) {
				continue;
			}
			return true;
		}
	}
	return false;
}

/*
 * sched_edge_steal_possible()
 *
 * Determines whether we can and should steal a thread from
 * the candidate_pset to run it on the idle_pset. When returning
 * true, the function also writes the scheduling bucket that we
 * should steal from into the bucket_for_steal out parameter.
 *
 * Always called with the pset lock for candidate_pset held.
 */
static bool
sched_edge_steal_possible(processor_set_t idle_pset, processor_set_t candidate_pset, sched_bucket_t *bucket_for_steal)
{
	sched_clutch_root_t candidate_clutch_root = &candidate_pset->pset_clutch_root;

	int highest_runnable_bucket = sched_clutch_root_highest_runnable_qos(candidate_clutch_root, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY);
	if (highest_runnable_bucket == -1) {
		/* Candidate cluster runq is empty of unbound threads */
		return false;
	}

	if (idle_pset->pset_type == candidate_pset->pset_type) {
		/* Always allow stealing from homogeneous clusters */
		*bucket_for_steal = (sched_bucket_t)highest_runnable_bucket;
		return true;
	}

	for (int unbound_qos = highest_runnable_bucket; unbound_qos >= 0; unbound_qos = bitmap_lsb_next(candidate_clutch_root->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX, unbound_qos)) {
		uint32_t candidate_runq_depth = os_atomic_load(&candidate_pset->pset_runnable_depth[unbound_qos], relaxed);
		if (candidate_runq_depth > pset_available_cpu_count(candidate_pset)) {
			/* Candidate cluster has excess load at this QoS (and at least one unbound thread we can steal!) */
			*bucket_for_steal = (sched_bucket_t)unbound_qos;
			return true;
		}
	}
	/* None of the unbound root buckets are overloaded */
	return false;
}

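/*
 * sched_edge_steal_thread()
 *
 * Iterates the candidate clusters in die-local order and steals the highest
 * priority unbound thread from the first cluster whose incoming edge allows
 * stealing and which sched_edge_steal_possible() reports as having stealable
 * work.
 */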
static thread_t
sched_edge_steal_thread(processor_set_t pset, uint64_t candidate_pset_bitmap)
{
	thread_t stolen_thread = THREAD_NULL;

	/*
	 * Edge Scheduler Optimization
	 *
	 * The logic today bails as soon as it finds a cluster where the cluster load is
	 * greater than the edge weight. Maybe it should have a more advanced version
	 * which looks for the maximum delta etc.
	 */
	int cluster_id = -1;
	while ((cluster_id = sched_edge_iterate_clusters_ordered(pset, candidate_pset_bitmap, cluster_id)) != -1) {
		processor_set_t steal_from_pset = pset_array[cluster_id];
		if (steal_from_pset == NULL) {
			continue;
		}
		sched_clutch_edge *incoming_edge = &pset_array[cluster_id]->sched_edges[pset->pset_cluster_id];
		if (incoming_edge->sce_steal_allowed == false) {
			continue;
		}
		pset_lock(steal_from_pset);
		sched_bucket_t bucket_for_steal;
		if (sched_edge_steal_possible(pset, steal_from_pset, &bucket_for_steal)) {
			uint64_t current_timestamp = mach_absolute_time();
			sched_clutch_root_t clutch_root_for_steal = &steal_from_pset->pset_clutch_root;
			stolen_thread = sched_clutch_thread_unbound_lookup(clutch_root_for_steal, &clutch_root_for_steal->scr_unbound_buckets[bucket_for_steal], NULL, NULL);
			sched_clutch_thread_remove(clutch_root_for_steal, stolen_thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);

			sched_clutch_dbg_thread_select_packed_t debug_info = {0};
			debug_info.trace_data.version = SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION;
			debug_info.trace_data.traverse_mode = SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY;
			debug_info.trace_data.cluster_id = steal_from_pset->pset_cluster_id;
			debug_info.trace_data.selection_was_cluster_bound = false;
			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
			    thread_tid(stolen_thread), thread_group_get_id(stolen_thread->thread_group), bucket_for_steal, debug_info.scdts_trace_data_packed, 0);
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STEAL) | DBG_FUNC_NONE, thread_tid(stolen_thread), pset->pset_cluster_id, steal_from_pset->pset_cluster_id, 0);

			sched_update_pset_load_average(steal_from_pset, current_timestamp);
		}
		pset_unlock(steal_from_pset);
		if (stolen_thread != THREAD_NULL) {
			break;
		}
	}
	return stolen_thread;
}

/*
 * sched_edge_processor_idle()
 *
 * The routine is the implementation of steal_thread() for the Edge scheduler.
 */
static thread_t
sched_edge_processor_idle(processor_set_t pset)
{
	thread_t thread = THREAD_NULL;

	uint64_t ctime = mach_absolute_time();

	processor_t processor = current_processor();
	bit_clear(pset->pending_spill_cpu_mask, processor->cpu_id);

	/* Each of the operations below acquires the lock for the pset it targets */
	pset_unlock(pset);

	/* Find highest priority runnable thread on all non-native clusters */
	thread = sched_edge_foreign_runnable_thread_remove(pset, ctime);
	if (thread != THREAD_NULL) {
		return thread;
	}

	/* Find highest priority runnable thread on all native clusters */
	thread = sched_edge_steal_thread(pset, pset->native_psets[0]);
	if (thread != THREAD_NULL) {
		return thread;
	}

	/* Find foreign running threads to rebalance; the actual rebalance is done in sched_edge_balance() */
	boolean_t rebalance_needed = sched_edge_foreign_running_thread_available(pset);
	if (rebalance_needed) {
		return THREAD_NULL;
	}

	/* No foreign threads found; find a thread to steal from all clusters based on weights/loads etc. */
	thread = sched_edge_steal_thread(pset, pset->native_psets[0] | pset->foreign_psets[0]);
	return thread;
}

/* Return true if this shared resource thread has a better cluster to run on */
static bool
sched_edge_shared_rsrc_migrate_possible(thread_t thread, processor_set_t preferred_pset, processor_set_t current_pset)
{
	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
	uint64_t current_pset_load = sched_pset_cluster_shared_rsrc_load(current_pset, shared_rsrc_type);
	/*
	 * Adjust the current pset load to discount the current thread only if the current pset is of the preferred pset type. This allows the
	 * scheduler to rebalance threads from a non-preferred cluster to an idle cluster of the preferred type.
	 *
	 * Edge Scheduler Optimization
	 * For multi-cluster machines, it might be useful to enhance this mechanism to migrate between clusters of the preferred type.
	 */
	uint64_t current_pset_adjusted_load = (current_pset->pset_type != preferred_pset->pset_type) ? current_pset_load : (current_pset_load - 1);

	uint64_t eligible_pset_bitmask = 0;
	if (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST) {
		/*
		 * For the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy, the load balancing occurs
		 * only among clusters native with the preferred cluster.
		 */
		eligible_pset_bitmask = preferred_pset->native_psets[0];
		bit_set(eligible_pset_bitmask, preferred_pset->pset_cluster_id);
	} else {
		/* For EDGE_SHARED_RSRC_SCHED_POLICY_RR, the load balancing happens among all clusters */
		eligible_pset_bitmask = sched_edge_available_pset_bitmask[0];
	}

	/* For each eligible cluster check if there is an under-utilized cluster; return true if there is */
	for (int cluster_id = bit_first(eligible_pset_bitmask); cluster_id >= 0; cluster_id = bit_next(eligible_pset_bitmask, cluster_id)) {
		if (cluster_id == current_pset->pset_cluster_id) {
			continue;
		}
		uint64_t cluster_load = sched_pset_cluster_shared_rsrc_load(pset_array[cluster_id], shared_rsrc_type);
		if (current_pset_adjusted_load > cluster_load) {
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHARED_RSRC_MIGRATE) | DBG_FUNC_NONE, current_pset_load, current_pset->pset_cluster_id, cluster_load, cluster_id);
			return true;
		}
	}
	return false;
}

/* Return true if this thread should not continue running on this processor */
static bool
sched_edge_thread_avoid_processor(processor_t processor, thread_t thread, ast_t reason)
{
	if (thread->bound_processor == processor) {
		/* Thread is bound here */
		return false;
	}

	/*
	 * On quantum expiry, check the migration bitmask to see if this thread should be migrated off this core.
	 * A migration is only recommended if there's also an idle core available that needn't be avoided.
	 */
	if (reason & AST_QUANTUM) {
		if (bit_test(processor->processor_set->perfcontrol_cpu_migration_bitmask, processor->cpu_id)) {
			uint64_t non_avoided_idle_primary_map = processor->processor_set->cpu_state_map[PROCESSOR_IDLE] & processor->processor_set->recommended_bitmask & ~processor->processor_set->perfcontrol_cpu_migration_bitmask;
			if (non_avoided_idle_primary_map != 0) {
				return true;
			}
		}
	}

	processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];

	if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) &&
	    preferred_pset->pset_id != processor->processor_set->pset_id &&
	    pset_type_is_recommended(preferred_pset)) {
		/* We should send this thread to the bound cluster */
		return true;
	}

	/* Evaluate shared resource policies */
	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
		return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
	}
	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
		if (processor->processor_set->pset_type != preferred_pset->pset_type &&
		    pset_type_is_recommended(preferred_pset)) {
			return true;
		}
		return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
	}

	/*
	 * For long running parallel workloads, it is important to rebalance threads across
	 * E/P clusters so that they make equal forward progress. This is achieved through
	 * threads expiring their quantum on the non-preferred cluster type and explicitly
	 * rebalancing to the preferred cluster runqueue.
	 */
	if ((processor->processor_set->pset_type != preferred_pset->pset_type) &&
	    pset_type_is_recommended(preferred_pset)) {
		return true;
	}
	/* If the preferred pset for the thread is now idle, try and migrate the thread to that cluster */
	if ((processor->processor_set != preferred_pset) &&
	    (sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket) == 0)) {
		return true;
	}

	return false;
}

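/*
 * sched_edge_balance()
 *
 * SCHED(processor_balance) callout for the Edge scheduler. Scans the foreign
 * clusters for CPUs running threads that are foreign to them and sends
 * rebalance IPIs to those CPUs. Returns true if any IPIs were sent so the
 * caller can lightweight-idle the current core.
 */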
static bool
sched_edge_balance(__unused processor_t cprocessor, processor_set_t cpset)
{
	assert(cprocessor == current_processor());
	pset_unlock(cpset);

	uint64_t ast_processor_map = 0;
	sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};

	bitmap_t *foreign_pset_bitmap = cpset->foreign_psets;
	for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
		/* Skip the pset if it's not schedulable */
		processor_set_t target_pset = pset_array[cluster];
		if (pset_is_recommended(target_pset) == false) {
			continue;
		}

		pset_lock(target_pset);
		uint64_t cpu_running_foreign_map = (target_pset->cpu_running_foreign & target_pset->cpu_state_map[PROCESSOR_RUNNING]);
		for (int cpuid = lsb_first(cpu_running_foreign_map); cpuid >= 0; cpuid = lsb_next(cpu_running_foreign_map, cpuid)) {
			if (!sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpuid, cpset)) {
				continue;
			}
			processor_t target_cpu = processor_array[cpuid];
			ipi_type[target_cpu->cpu_id] = sched_ipi_action(target_cpu, NULL, SCHED_IPI_EVENT_REBALANCE);
			if (ipi_type[cpuid] != SCHED_IPI_NONE) {
				bit_set(ast_processor_map, cpuid);
			}
		}
		pset_unlock(target_pset);
	}

	for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
		processor_t ast_processor = processor_array[cpuid];
		sched_ipi_perform(ast_processor, ipi_type[cpuid]);
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNING) | DBG_FUNC_NONE, 0, cprocessor->cpu_id, cpuid, 0);
	}

	/* Core should light-weight idle using WFE if it just sent out rebalance IPIs */
	return ast_processor_map != 0;
}

/*
 * sched_edge_migration_check()
 *
 * Routine to evaluate an edge between two clusters to decide if migration is possible
 * across that edge. Also updates the selected_pset and max_edge_delta out parameters
 * accordingly. The return value indicates if the invoking routine should short circuit
 * the search, since an ideal candidate has been found. The routine looks at the regular
 * edges and cluster loads or the shared resource loads based on the type of thread.
 */
static bool
sched_edge_migration_check(uint32_t cluster_id, processor_set_t preferred_pset,
    uint32_t preferred_cluster_load, thread_t thread, processor_set_t *selected_pset, uint32_t *max_edge_delta)
{
	uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
	cluster_type_t preferred_cluster_type = pset_type_for_id(preferred_cluster_id);
	processor_set_t dst_pset = pset_array[cluster_id];
	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
	bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);

	if (cluster_id == preferred_cluster_id) {
		return false;
	}

	if (dst_pset == NULL) {
		return false;
	}

	sched_clutch_edge *edge = preferred_pset->sched_edges;
	if (edge[cluster_id].sce_migration_allowed == false) {
		return false;
	}
	uint32_t dst_load = shared_rsrc_thread ? (uint32_t)sched_pset_cluster_shared_rsrc_load(dst_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(dst_pset, thread->th_sched_bucket);
	if (dst_load == 0) {
		/* The candidate cluster is idle; select it immediately for execution */
		*selected_pset = dst_pset;
		*max_edge_delta = preferred_cluster_load;
		return true;
	}

	uint32_t edge_delta = 0;
	if (dst_load > preferred_cluster_load) {
		return false;
	}
	edge_delta = preferred_cluster_load - dst_load;
	if (!shared_rsrc_thread && (edge_delta < edge[cluster_id].sce_migration_weight)) {
		/*
		 * For non shared resource threads, use the edge migration weight to decide if
		 * this cluster is over-committed at the QoS level of this thread.
		 */
		return false;
	}

	if (edge_delta < *max_edge_delta) {
		return false;
	}
	if (edge_delta == *max_edge_delta) {
		/* If the edge delta is the same as the max delta, make sure a homogeneous cluster is picked */
		boolean_t selected_homogeneous = (pset_type_for_id((*selected_pset)->pset_cluster_id) == preferred_cluster_type);
		boolean_t candidate_homogeneous = (pset_type_for_id(dst_pset->pset_cluster_id) == preferred_cluster_type);
		if (selected_homogeneous || !candidate_homogeneous) {
			return false;
		}
	}
	/* dst_pset seems to be the best candidate for migration; however other candidates should still be evaluated */
	*max_edge_delta = edge_delta;
	*selected_pset = dst_pset;
	return false;
}

/*
 * sched_edge_iterate_clusters_ordered()
 *
 * Routine to iterate clusters in die local order. For multi-die machines,
 * the routine ensures that the candidate clusters on the same die as the
 * passed in pset are returned before the remote die clusters. This should
 * be used in all places where cluster selection in die order matters.
 */
static int
sched_edge_iterate_clusters_ordered(processor_set_t starting_pset, uint64_t candidate_map, int previous_cluster)
{
	int cluster_id = -1;

	uint64_t local_candidate_map = starting_pset->local_psets[0] & candidate_map;
	uint64_t remote_candidate_map = starting_pset->remote_psets[0] & candidate_map;

	if (previous_cluster == -1) {
		/* previous_cluster == -1 indicates the initial condition */
		cluster_id = bit_first(local_candidate_map);
		if (cluster_id != -1) {
			return cluster_id;
		}
		return bit_first(remote_candidate_map);
	} else {
		/*
		 * After the initial condition, the routine attempts to return a
		 * cluster in the previous_cluster's locality. If none is available,
		 * it looks at remote clusters.
		 */
		if (bit_test(local_candidate_map, previous_cluster)) {
			cluster_id = bit_next(local_candidate_map, previous_cluster);
			if (cluster_id != -1) {
				return cluster_id;
			} else {
				return bit_first(remote_candidate_map);
			}
		}
		return bit_next(remote_candidate_map, previous_cluster);
	}
}

/*
 * sched_edge_migrate_edges_evaluate()
 *
 * Routine to find the candidate for thread migration based on edge weights.
 *
 * Returns the most ideal cluster for execution of this thread based on outgoing edges of the preferred pset. Can
 * return preferred_pset if it is the most ideal destination for this thread.
 */
static processor_set_t
sched_edge_migrate_edges_evaluate(processor_set_t preferred_pset, uint32_t preferred_cluster_load, thread_t thread)
{
	processor_set_t selected_pset = preferred_pset;
	uint32_t max_edge_delta = 0;
	bool search_complete = false;
	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
	bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);

	bitmap_t *foreign_pset_bitmap = preferred_pset->foreign_psets;
	bitmap_t *native_pset_bitmap = preferred_pset->native_psets;
	/* Always start the search with the native clusters */
	int cluster_id = -1;
	while ((cluster_id = sched_edge_iterate_clusters_ordered(preferred_pset, native_pset_bitmap[0], cluster_id)) != -1) {
		search_complete = sched_edge_migration_check(cluster_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
		if (search_complete) {
			break;
		}
	}

	if (search_complete) {
		return selected_pset;
	}

	if (shared_rsrc_thread && (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST)) {
		/*
		 * If the shared resource scheduling policy is EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST, the scheduler tries
		 * to fill up the preferred cluster and its homogeneous peers first.
		 */

		if (max_edge_delta > 0) {
			/*
			 * This indicates that there is a peer cluster of the same type as the preferred cluster (since the code
			 * above only looks at the native_psets) which has fewer threads of the shared resource type than the
			 * preferred cluster. That means there is capacity on a native cluster where this thread should be placed.
			 */
			return selected_pset;
		}
		/*
		 * Indicates that all peer native clusters are at the same shared resource usage; check if the preferred cluster has
		 * any more capacity left.
		 */
		if (sched_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) < pset_available_cpu_count(preferred_pset)) {
			return preferred_pset;
		}
		/*
		 * Looks like the preferred cluster and all its native peers are full with shared resource threads; need to start looking
		 * at non-native clusters for capacity.
		 */
	}

	/* Now look at the non-native clusters */
	cluster_id = -1;
	while ((cluster_id = sched_edge_iterate_clusters_ordered(preferred_pset, foreign_pset_bitmap[0], cluster_id)) != -1) {
		search_complete = sched_edge_migration_check(cluster_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
		if (search_complete) {
			break;
		}
	}
	return selected_pset;
}

/*
 * sched_edge_candidate_alternative()
 *
 * Routine to find an alternative cluster from candidate_cluster_bitmap since the
 * selected_pset is not available for execution. The logic tries to prefer homogeneous
 * clusters over heterogeneous clusters since this is typically used in thread
 * placement decisions.
 */
_Static_assert(MAX_PSETS <= 64, "Unable to fit maximum number of psets in uint64_t bitmask");
static processor_set_t
sched_edge_candidate_alternative(processor_set_t selected_pset, uint64_t candidate_cluster_bitmap)
{
	/*
	 * It looks like the most ideal pset is not available for scheduling currently.
	 * Try to find a homogeneous cluster that is still available.
	 */
	uint64_t available_native_clusters = selected_pset->native_psets[0] & candidate_cluster_bitmap;
	int available_cluster_id = lsb_first(available_native_clusters);
	if (available_cluster_id == -1) {
		/* Looks like none of the homogeneous clusters are available; pick the first available cluster */
		available_cluster_id = bit_first(candidate_cluster_bitmap);
	}
	assert(available_cluster_id != -1);
	return pset_array[available_cluster_id];
}

/*
 * sched_edge_switch_pset_lock()
 *
 * Helper routine for sched_edge_migrate_candidate() which switches pset locks (if needed) based on
 * switch_pset_locks.
 * Returns the newly locked pset after the switch.
 */
static processor_set_t
sched_edge_switch_pset_lock(processor_set_t selected_pset, processor_set_t locked_pset, bool switch_pset_locks)
{
	if (!switch_pset_locks) {
		return locked_pset;
	}
	if (selected_pset != locked_pset) {
		pset_unlock(locked_pset);
		pset_lock(selected_pset);
		return selected_pset;
	} else {
		return locked_pset;
	}
}

/*
 * sched_edge_amp_rebalance_pset()
 *
 * Routine to decide where a thread which is eligible for AMP rebalance (i.e.
 * has executed on the non-preferred cluster type for a while) should be enqueued.
 * The algorithm maintains a history of AMP rebalance decisions on the clutch
 * bucket group of the workload and round-robins between clusters to ensure
 * that all threads get a chance on the performance cores and make equal
 * progress.
 */
static processor_set_t
sched_edge_amp_rebalance_pset(processor_set_t preferred_pset, thread_t thread)
{
	sched_clutch_t clutch = sched_clutch_for_thread(thread);
	sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[thread->th_sched_bucket];

	uint32_t last_chosen_cluster, new_chosen_cluster;

	/* Only AMP rebalance within clusters native to the preferred cluster */
	uint64_t eligible_pset_bitmask = preferred_pset->native_psets[0];
	/* Preferred cluster is also eligible for rebalancing */
	bit_set(eligible_pset_bitmask, preferred_pset->pset_cluster_id);
	/* Atomically update the AMP rebalance cluster for the clutch bucket group */
	os_atomic_rmw_loop(&clutch_bucket_group->scbg_amp_rebalance_last_chosen, last_chosen_cluster, new_chosen_cluster, relaxed, {
		if (last_chosen_cluster == UINT32_MAX) {
			new_chosen_cluster = preferred_pset->pset_cluster_id;
		} else {
			new_chosen_cluster = lsb_next(eligible_pset_bitmask, last_chosen_cluster);
			if (new_chosen_cluster == -1) {
				/* Rotate to the start of the eligible bitmask */
				new_chosen_cluster = lsb_first(eligible_pset_bitmask);
			}
		}
	});
	return pset_array[new_chosen_cluster];
}

/*
 * sched_edge_migrate_candidate()
 *
 * Routine to find an appropriate cluster for scheduling a thread. The routine looks at the properties of
 * the thread and the preferred cluster to determine the best available pset for scheduling.
 *
 * The switch_pset_locks parameter defines whether the routine should switch pset locks to provide an
 * accurate scheduling decision. This mode is typically used when choosing a pset for scheduling a thread since the
 * decision has to be synchronized with another CPU changing the recommendation of clusters available
 * on the system. If this parameter is set to false, this routine returns a best effort indication of
 * the cluster the thread should be scheduled on. It is typically used in fast path contexts (such as
 * SCHED(thread_avoid_processor)) to determine if there is a possibility of scheduling this thread on a
 * more appropriate cluster.
 *
 * Routine returns the most ideal cluster for scheduling. If switch_pset_locks is set, it ensures that the
 * resultant pset lock is held.
 */
static processor_set_t
sched_edge_migrate_candidate(processor_set_t _Nullable preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks)
{
	processor_set_t selected_pset = preferred_pset;
	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
	bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);

	if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
		/*
		 * For cluster-bound threads, choose the cluster to which the thread is bound, unless that
		 * cluster is unavailable. If it's not available, fall through to the regular cluster selection
		 * logic which handles derecommended clusters appropriately.
		 */
		selected_pset = pset_array[sched_edge_thread_bound_cluster_id(thread)];
		if (selected_pset != NULL) {
			locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
			if (pset_is_recommended(selected_pset)) {
				return selected_pset;
			}
		}
	}

	uint64_t candidate_cluster_bitmap = mask(sched_edge_max_clusters);
#if DEVELOPMENT || DEBUG
	extern int enable_task_set_cluster_type;
	task_t task = get_threadtask(thread);
	if (enable_task_set_cluster_type && (task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) {
		processor_set_t pset_hint = task->pset_hint;
		if (pset_hint && (selected_pset == NULL || selected_pset->pset_cluster_type != pset_hint->pset_cluster_type)) {
			selected_pset = pset_hint;
			goto migrate_candidate_available_check;
		}
	}
#endif

	if (preferred_pset == NULL) {
		/* The preferred_pset has not finished initializing at boot */
		goto migrate_candidate_available_check;
	}

	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		/* For realtime threads, try and schedule them on the preferred pset always */
		goto migrate_candidate_available_check;
	}

	uint32_t preferred_cluster_load = shared_rsrc_thread ? (uint32_t)sched_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket);
	if (preferred_cluster_load == 0) {
		goto migrate_candidate_available_check;
	}

	/*
	 * If a thread is being rebalanced for achieving equal progress of parallel workloads,
	 * it needs to end up on the preferred runqueue. This mechanism should only be used for
	 * threads which have been previously migrated to the non-preferred cluster type.
	 *
	 * The AMP rebalancing mechanism is available for regular threads or shared resource
	 * threads with the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy.
	 */
	bool amp_rebalance_eligible = (!shared_rsrc_thread) || (shared_rsrc_thread && (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST));
	if (amp_rebalance_eligible) {
		boolean_t amp_rebalance = (thread->reason & (AST_REBALANCE | AST_QUANTUM)) == (AST_REBALANCE | AST_QUANTUM);
		if (amp_rebalance) {
			boolean_t non_preferred_pset = (thread->last_processor->processor_set->pset_type != preferred_pset->pset_type);
			if (non_preferred_pset) {
				selected_pset = sched_edge_amp_rebalance_pset(preferred_pset, thread);
				goto migrate_candidate_available_check;
			}
		}
	}

	/* Look at edge weights to decide the most ideal migration candidate for this thread */
	selected_pset = sched_edge_migrate_edges_evaluate(preferred_pset, preferred_cluster_load, thread);

migrate_candidate_available_check:
	if (selected_pset == NULL) {
		/* The selected_pset has not finished initializing at boot */
		pset_unlock(locked_pset);
		return NULL;
	}

	locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
	if (pset_is_recommended(selected_pset) == true) {
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_OVERLOAD) | DBG_FUNC_NONE, thread_tid(thread), preferred_pset->pset_cluster_id, selected_pset->pset_cluster_id, preferred_cluster_load);
		return selected_pset;
	}
	/* Looks like selected_pset is not available for scheduling; remove it from candidate_cluster_bitmap */
	bitmap_clear(&candidate_cluster_bitmap, selected_pset->pset_cluster_id);
	if (__improbable(bitmap_first(&candidate_cluster_bitmap, sched_edge_max_clusters) == -1)) {
		pset_unlock(locked_pset);
		return NULL;
	}
	/* Try and find an alternative for the selected pset */
	selected_pset = sched_edge_candidate_alternative(selected_pset, candidate_cluster_bitmap);
	goto migrate_candidate_available_check;
}

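/*
 * sched_edge_choose_processor()
 *
 * SCHED(choose_processor) callout for the Edge scheduler. Picks the best pset
 * for the thread via sched_edge_migrate_candidate(), starting from the
 * thread's preferred cluster, and then defers to choose_processor() within
 * that pset.
 */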
4877 static processor_t
sched_edge_choose_processor(processor_set_t pset,processor_t processor,thread_t thread)4878 sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread)
4879 {
4880 /* Bound threads don't call this function */
4881 assert(thread->bound_processor == PROCESSOR_NULL);
4882 processor_t chosen_processor = PROCESSOR_NULL;
4883
4884 /*
4885 * sched_edge_preferred_pset() returns the preferred pset for a given thread.
4886 * It should take the passed in "pset" as a hint which represents the recency metric for
4887 * pset selection logic.
4888 */
4889 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
4890 processor_set_t chosen_pset = preferred_pset;
4891 /*
4892 * If the preferred pset is overloaded, find a pset which is the best candidate to migrate
4893 * threads to. sched_edge_migrate_candidate() returns the preferred pset
4894 * if it has capacity; otherwise finds the best candidate pset to migrate this thread to.
4895 *
4896 * Edge Scheduler Optimization
4897 * It might be useful to build a recency metric for the thread for multiple clusters and
4898 * factor that into the migration decisions.
4899 */
4900 chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, pset, true);
4901 if (chosen_pset) {
4902 chosen_processor = choose_processor(chosen_pset, processor, thread);
4903 }
4904 /* For RT threads, choose_processor() can return a different cluster than the one passed into it */
4905 assert(chosen_processor ? chosen_processor->processor_set->pset_type == chosen_pset->pset_type : true);
4906 return chosen_processor;
4907 }
4908
4909 /*
4910 * sched_edge_clutch_bucket_threads_drain()
4911 *
4912 * Drains all the runnable threads which are not restricted to the root_clutch (due to clutch
4913 * bucket overrides etc.) into a local thread queue.
4914 */
4915 static void
sched_edge_clutch_bucket_threads_drain(sched_clutch_bucket_t clutch_bucket,sched_clutch_root_t root_clutch,queue_t clutch_threads)4916 sched_edge_clutch_bucket_threads_drain(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch, queue_t clutch_threads)
4917 {
4918 thread_t thread = THREAD_NULL;
4919 uint64_t current_timestamp = mach_approximate_time();
4920 qe_foreach_element_safe(thread, &clutch_bucket->scb_thread_timeshare_queue, th_clutch_timeshare_link) {
4921 sched_clutch_thread_remove(root_clutch, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
4922 enqueue_tail(clutch_threads, &thread->runq_links);
4923 }
4924 }
4925
4926 #if !SCHED_TEST_HARNESS
4927
4928 /*
4929 * sched_edge_run_drained_threads()
4930 *
4931 * Makes all drained threads in a local queue runnable.
4932 */
4933 static void
sched_edge_run_drained_threads(queue_t clutch_threads)4934 sched_edge_run_drained_threads(queue_t clutch_threads)
4935 {
4936 thread_t thread;
4937 /* Now setrun all the threads in the local queue */
4938 qe_foreach_element_safe(thread, clutch_threads, runq_links) {
4939 remqueue(&thread->runq_links);
4940 thread_lock(thread);
4941 thread_setrun(thread, SCHED_TAILQ);
4942 thread_unlock(thread);
4943 }
4944 }
4945
4946 #endif /* !SCHED_TEST_HARNESS */
4947
4948 /*
4949 * sched_edge_update_preferred_cluster()
4950 *
4951 * Routine to update the preferred cluster for QoS buckets within a thread group.
4952 * The buckets to be updated are specifed as a bitmap (clutch_bucket_modify_bitmap).
4953 */
4954 static void
sched_edge_update_preferred_cluster(sched_clutch_t sched_clutch,bitmap_t * clutch_bucket_modify_bitmap,uint32_t * tg_bucket_preferred_cluster)4955 sched_edge_update_preferred_cluster(
4956 sched_clutch_t sched_clutch,
4957 bitmap_t *clutch_bucket_modify_bitmap,
4958 uint32_t *tg_bucket_preferred_cluster)
4959 {
4960 for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
4961 os_atomic_store(&sched_clutch->sc_clutch_groups[bucket].scbg_preferred_cluster, tg_bucket_preferred_cluster[bucket], relaxed);
4962 }
4963 }
4964
4965 #if !SCHED_TEST_HARNESS
4966
4967 /*
4968 * sched_edge_migrate_thread_group_runnable_threads()
4969 *
4970 * Routine to implement the migration of threads on a cluster when the thread group
4971 * recommendation is updated. The migration works using a 2-phase
4972 * algorithm.
4973 *
4974 * Phase 1: With the pset lock held, check the recommendation of the clutch buckets.
4975 * For each clutch bucket, if it needs to be migrated immediately, drain the threads
4976 * into a local thread queue. Otherwise mark the clutch bucket as native/foreign as
4977 * appropriate.
4978 *
4979 * Phase 2: After unlocking the pset, drain all the threads from the local thread
4980 * queue and mark them runnable which should land them in the right hierarchy.
4981 *
4982 * The routine assumes that the preferences for the clutch buckets/clutch bucket
4983 * groups have already been updated by the caller.
4984 *
4985 * - Called with the pset locked and interrupts disabled.
4986 * - Returns with the pset unlocked.
4987 */
4988 static void
4989 sched_edge_migrate_thread_group_runnable_threads(
4990 sched_clutch_t sched_clutch,
4991 sched_clutch_root_t root_clutch,
4992 bitmap_t *clutch_bucket_modify_bitmap,
4993 __unused uint32_t *tg_bucket_preferred_cluster,
4994 bool migrate_immediately)
4995 {
4996 /* Queue to hold threads that have been drained from clutch buckets to be migrated */
4997 queue_head_t clutch_threads;
4998 queue_init(&clutch_threads);
4999
5000 for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
5001 /* Get the clutch bucket for this cluster and sched bucket */
5002 sched_clutch_bucket_group_t clutch_bucket_group = &(sched_clutch->sc_clutch_groups[bucket]);
5003 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
5004 sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
5005 if (scb_root == NULL) {
5006 /* Clutch bucket not runnable or already in the right hierarchy; nothing to do here */
5007 assert(clutch_bucket->scb_thr_count == 0);
5008 continue;
5009 }
5010 assert(scb_root == root_clutch);
5011 uint32_t clutch_bucket_preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
5012
5013 if (migrate_immediately) {
5014 /*
5015 * For transitions where threads need to be migrated immediately, drain the threads into a
5016 * local queue unless we are looking at the clutch buckets for the newly recommended
5017 * cluster.
5018 */
5019 if (root_clutch->scr_cluster_id != clutch_bucket_preferred_cluster) {
5020 sched_edge_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads);
5021 } else {
5022 sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
5023 }
5024 } else {
5025 /* Check if this cluster is the same type as the newly recommended cluster */
5026 boolean_t homogeneous_cluster = (pset_type_for_id(root_clutch->scr_cluster_id) == pset_type_for_id(clutch_bucket_preferred_cluster));
5027 /*
5028 * If threads do not have to be migrated immediately, just change the native/foreign
5029 * flag on the clutch bucket.
5030 */
5031 if (homogeneous_cluster) {
5032 sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
5033 } else {
5034 sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
5035 }
5036 }
5037 }
5038
5039 pset_unlock(root_clutch->scr_pset);
5040 sched_edge_run_drained_threads(&clutch_threads);
5041 }
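
#if 0
/*
 * Illustrative sketch (not compiled; helper name hypothetical): a condensed
 * view of the 2-phase pattern implemented above. Phase 1 drains the affected
 * clutch bucket into a local queue while the pset lock is held; phase 2 drops
 * the lock and re-runs the drained threads so that thread_setrun() can place
 * them in the newly preferred hierarchy.
 */
static void
sched_edge_migration_sketch(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
{
	queue_head_t threads;
	queue_init(&threads);

	/* Phase 1: caller holds the pset lock for root_clutch */
	sched_edge_clutch_bucket_threads_drain(clutch_bucket, root_clutch, &threads);

	/* Phase 2: drop the lock, then make each drained thread runnable again */
	pset_unlock(root_clutch->scr_pset);
	sched_edge_run_drained_threads(&threads);
}
#endif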
5042
5043 /*
5044 * sched_edge_migrate_thread_group_running_threads()
5045 *
5046 * Routine to find all running threads of a thread group on a specific cluster
5047 * and IPI them if they need to be moved immediately.
5048 */
5049 static void
5050 sched_edge_migrate_thread_group_running_threads(
5051 sched_clutch_t sched_clutch,
5052 sched_clutch_root_t root_clutch,
5053 __unused bitmap_t *clutch_bucket_modify_bitmap,
5054 uint32_t *tg_bucket_preferred_cluster,
5055 bool migrate_immediately)
5056 {
5057 if (migrate_immediately == false) {
5058 /* If CLPC has recommended not to move threads immediately, nothing to do here */
5059 return;
5060 }
5061
5062 /*
5063 * Edge Scheduler Optimization
5064 *
5065 * When the system has a large number of clusters and cores, it might be useful to
5066 * narrow down the iteration by using a thread running bitmap per clutch.
5067 */
5068 uint64_t ast_processor_map = 0;
5069 sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
5070
5071 uint64_t running_map = root_clutch->scr_pset->cpu_state_map[PROCESSOR_RUNNING];
5072 /*
5073 	 * Iterate all CPUs and look for the ones running threads from this thread group that are
5074 	 * not restricted to this specific cluster (due to overrides etc.).
5075 */
5076 for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
5077 processor_t src_processor = processor_array[cpuid];
5078 boolean_t expected_tg = (src_processor->current_thread_group == sched_clutch->sc_tg);
5079 sched_bucket_t processor_sched_bucket = src_processor->processor_set->cpu_running_buckets[cpuid];
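		/* Skip CPUs whose running thread is not tracked under a valid QoS scheduling bucket */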
5080 if (processor_sched_bucket == TH_BUCKET_SCHED_MAX) {
5081 continue;
5082 }
5083 boolean_t non_preferred_cluster = tg_bucket_preferred_cluster[processor_sched_bucket] != root_clutch->scr_cluster_id;
5084
5085 if (expected_tg && non_preferred_cluster) {
5086 ipi_type[cpuid] = sched_ipi_action(src_processor, NULL, SCHED_IPI_EVENT_REBALANCE);
5087 if (ipi_type[cpuid] != SCHED_IPI_NONE) {
5088 bit_set(ast_processor_map, cpuid);
5089 } else if (src_processor == current_processor()) {
5090 bit_set(root_clutch->scr_pset->pending_AST_PREEMPT_cpu_mask, cpuid);
5091 ast_t new_preempt = update_pending_nonurgent_preemption(src_processor, AST_PREEMPT);
5092 ast_on(new_preempt);
5093 }
5094 }
5095 }
5096
5097 /* Perform all the IPIs */
5098 if (bit_first(ast_processor_map) != -1) {
5099 for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
5100 processor_t ast_processor = processor_array[cpuid];
5101 sched_ipi_perform(ast_processor, ipi_type[cpuid]);
5102 }
5103 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, thread_group_get_id(sched_clutch->sc_tg), ast_processor_map, 0, 0);
5104 }
5105 }
5106
5107 /*
5108 * sched_edge_tg_preferred_cluster_change()
5109 *
5110 * Routine to handle changes to a thread group's recommendation. In the Edge Scheduler, the preferred cluster
5111 * is specified on a per-QoS basis within a thread group. The routine updates the preferences and performs
5112 * thread migrations based on the policy specified by CLPC.
5113 * tg_bucket_preferred_cluster is an array of size TH_BUCKET_SCHED_MAX which specifies the new preferred cluster
5114 * for each QoS within the thread group.
5115 */
5116 void
5117 sched_edge_tg_preferred_cluster_change(struct thread_group *tg, uint32_t *tg_bucket_preferred_cluster, sched_perfcontrol_preferred_cluster_options_t options)
5118 {
5119 sched_clutch_t clutch = sched_clutch_for_thread_group(tg);
5120 /*
5121 * In order to optimize the processing, create a bitmap which represents all QoS buckets
5122 * for which the preferred cluster has changed.
5123 */
5124 bitmap_t clutch_bucket_modify_bitmap[BITMAP_LEN(TH_BUCKET_SCHED_MAX)] = {0};
5125 for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
5126 uint32_t old_preferred_cluster = sched_edge_clutch_bucket_group_preferred_cluster(&clutch->sc_clutch_groups[bucket]);
5127 uint32_t new_preferred_cluster = tg_bucket_preferred_cluster[bucket];
5128 if (old_preferred_cluster != new_preferred_cluster) {
5129 bitmap_set(clutch_bucket_modify_bitmap, bucket);
5130 }
5131 }
5132 if (bitmap_lsb_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
5133 /* No changes in any clutch buckets; nothing to do here */
5134 return;
5135 }
5136
5137 /*
5138 * The first operation is to update the preferred cluster for all QoS buckets within the
5139 * thread group so that any future threads becoming runnable would see the new preferred
5140 * cluster value.
5141 */
5142 sched_edge_update_preferred_cluster(clutch, clutch_bucket_modify_bitmap, tg_bucket_preferred_cluster);
5143
5144 for (uint32_t cluster_id = 0; cluster_id < sched_edge_max_clusters; cluster_id++) {
5145 processor_set_t pset = pset_array[cluster_id];
5146 spl_t s = splsched();
5147 pset_lock(pset);
5148 /*
5149 * Currently iterates all clusters looking for running threads for a TG to be migrated. Can be optimized
5150 * by keeping a per-clutch bitmap of clusters running threads for a particular TG.
5151 *
5152 * Edge Scheduler Optimization
5153 */
5154 /* Migrate all running threads of the TG on this cluster based on options specified by CLPC */
5155 sched_edge_migrate_thread_group_running_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
5156 tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNING));
5157 /* Migrate all runnable threads of the TG in this cluster's hierarchy based on options specified by CLPC */
5158 sched_edge_migrate_thread_group_runnable_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
5159 tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNABLE));
5160 /* sched_edge_migrate_thread_group_runnable_threads() returns with pset unlocked */
5161 splx(s);
5162 }
5163 }
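
#if 0
/*
 * Illustrative sketch (not compiled; cluster IDs and helper name hypothetical):
 * how a caller might request a new per-QoS preferred-cluster layout for a
 * thread group, migrating both running and runnable threads immediately.
 */
static void
sched_edge_example_set_preferred_clusters(struct thread_group *tg)
{
	uint32_t preferred[TH_BUCKET_SCHED_MAX];
	for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
		preferred[bucket] = 0; /* hypothetical: most QoS buckets prefer cluster 0 */
	}
	preferred[TH_BUCKET_SHARE_FG] = 1; /* hypothetical: FG timeshare work prefers cluster 1 */
	sched_edge_tg_preferred_cluster_change(tg, preferred,
	    SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNING | SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNABLE);
}
#endif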
5164
5165 /*
5166 * sched_edge_pset_made_schedulable()
5167 *
5168 * Routine to migrate all the clutch buckets which are not in their recommended
5169  * pset hierarchy now that a new pset has become runnable. It's possible that this
5170 * routine is called when the pset is already marked schedulable.
5171 *
5172 * Invoked with the pset lock held and interrupts disabled.
5173 */
5174 static void
5175 sched_edge_pset_made_schedulable(__unused processor_t processor, processor_set_t dst_pset, boolean_t drop_lock)
5176 {
5177 if (bitmap_test(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id)) {
5178 /* Nothing to do here since pset is already marked schedulable */
5179 if (drop_lock) {
5180 pset_unlock(dst_pset);
5181 }
5182 return;
5183 }
5184
5185 bitmap_set(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id);
5186
5187 thread_t thread = sched_edge_processor_idle(dst_pset);
5188 if (thread != THREAD_NULL) {
5189 thread_lock(thread);
5190 thread_setrun(thread, SCHED_TAILQ);
5191 thread_unlock(thread);
5192 }
5193
5194 if (!drop_lock) {
5195 pset_lock(dst_pset);
5196 }
5197 }
5198
5199 #endif /* !SCHED_TEST_HARNESS */
5200
5201 /*
5202 * sched_edge_cpu_init_completed()
5203 *
5204 * Callback routine from the platform layer once all CPUs/clusters have been initialized. This
5205 * provides an opportunity for the edge scheduler to initialize all the edge parameters.
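 *
 * Illustrative summary of the default edge matrix set up below, assuming a
 * system with both E- and P-type clusters (values taken from the code):
 *
 *   src -> dst              sce_migration_weight  sce_migration_allowed  sce_steal_allowed
 *   same type (E->E / P->P)          0                     1                    1
 *   P -> E                          64                     1                    1
 *   E -> P                           0                     0                    0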
5206 */
5207 static void
5208 sched_edge_cpu_init_completed(void)
5209 {
5210 spl_t s = splsched();
5211 for (int src_cluster_id = 0; src_cluster_id < sched_edge_max_clusters; src_cluster_id++) {
5212 processor_set_t src_pset = pset_array[src_cluster_id];
5213 pset_lock(src_pset);
5214
5215 /* For each cluster, set all its outgoing edge parameters */
5216 for (int dst_cluster_id = 0; dst_cluster_id < sched_edge_max_clusters; dst_cluster_id++) {
5217 if (dst_cluster_id == src_cluster_id) {
5218 continue;
5219 }
5220 processor_set_t dst_pset = pset_array[dst_cluster_id];
5221 if (src_pset->pset_type == dst_pset->pset_type) {
5222 /* P->P/E->E edge config */
5223 bitmap_clear(src_pset->foreign_psets, dst_cluster_id);
5224 bitmap_set(src_pset->native_psets, dst_cluster_id);
5225 sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 1, .sce_steal_allowed = 1});
5226 } else if ((src_pset->pset_type == CLUSTER_TYPE_P) && (dst_pset->pset_type == CLUSTER_TYPE_E)) {
5227 /* P->E edge config */
5228 bitmap_set(src_pset->foreign_psets, dst_cluster_id);
5229 bitmap_clear(src_pset->native_psets, dst_cluster_id);
5230 sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 64, .sce_migration_allowed = 1, .sce_steal_allowed = 1});
5231 } else {
5232 /* E->P edge config */
5233 bitmap_set(src_pset->foreign_psets, dst_cluster_id);
5234 bitmap_clear(src_pset->native_psets, dst_cluster_id);
5235 sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0});
5236 }
5237 bool clusters_local = (ml_get_die_id(src_cluster_id) == ml_get_die_id(dst_cluster_id));
5238 if (clusters_local) {
5239 bitmap_set(src_pset->local_psets, dst_cluster_id);
5240 bitmap_clear(src_pset->remote_psets, dst_cluster_id);
5241 } else {
5242 bitmap_set(src_pset->remote_psets, dst_cluster_id);
5243 bitmap_clear(src_pset->local_psets, dst_cluster_id);
5244 }
5245 }
5246
5247 pset_unlock(src_pset);
5248 }
5249 splx(s);
5250 }
5251
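/*
 * sched_edge_thread_eligible_for_pset()
 *
 * Returns true if the pset is the thread's preferred cluster, or if the edge
 * from the preferred cluster to this pset allows migration.
 */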
5252 static bool
5253 sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset)
5254 {
5255 uint32_t preferred_cluster_id = sched_edge_thread_preferred_cluster(thread);
5256 if (preferred_cluster_id == pset->pset_cluster_id) {
5257 return true;
5258 } else {
5259 processor_set_t preferred_pset = pset_array[preferred_cluster_id];
5260 return preferred_pset->sched_edges[pset->pset_cluster_id].sce_migration_allowed;
5261 }
5262 }
5263
5264 extern int sched_amp_spill_deferred_ipi;
5265 extern int sched_amp_pcores_preempt_immediate_ipi;
5266
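/*
 * Tunable: when set, sched_edge_ipi_policy() uses immediate (rather than
 * deferred) IPIs for non-RT preemption if the destination CPU is homogeneous
 * with the thread's preferred cluster, or if the preferred cluster is expected
 * to stay busy past the deferred IPI timeout.
 */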
5267 int sched_edge_migrate_ipi_immediate = 1;
5268
5269 sched_ipi_type_t
5270 sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
5271 {
5272 processor_set_t pset = dst->processor_set;
5273 assert(dst != current_processor());
5274
5275 boolean_t deferred_ipi_supported = false;
5276 #if defined(CONFIG_SCHED_DEFERRED_AST)
5277 deferred_ipi_supported = true;
5278 #endif /* CONFIG_SCHED_DEFERRED_AST */
5279
5280 switch (event) {
5281 case SCHED_IPI_EVENT_SPILL:
5282 /* For Spill event, use deferred IPIs if sched_amp_spill_deferred_ipi set */
5283 if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
5284 return sched_ipi_deferred_policy(pset, dst, thread, event);
5285 }
5286 break;
5287 case SCHED_IPI_EVENT_PREEMPT:
5288 /* For preemption, the default policy is to use deferred IPIs
5289 * for Non-RT P-core preemption. Override that behavior if
5290 * sched_amp_pcores_preempt_immediate_ipi is set
5291 */
5292 if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
5293 if (sched_amp_pcores_preempt_immediate_ipi && (pset_type_for_id(pset->pset_cluster_id) == CLUSTER_TYPE_P)) {
5294 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
5295 }
5296 if (sched_edge_migrate_ipi_immediate) {
5297 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
5298 /*
5299 * For IPI'ing CPUs that are homogeneous with the preferred cluster, use immediate IPIs
5300 */
5301 if (preferred_pset->pset_type == pset->pset_type) {
5302 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
5303 }
5304 /*
5305 			 * For workloads that are going wide, it can be useful to send an immediate IPI to
5306 			 * wake up the idle CPU if the scheduler estimates that the preferred pset will
5307 			 * remain busy past the deferred IPI timeout. The Edge Scheduler uses the average
5308 			 * execution latency on the preferred pset as an estimate of busyness.
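			 * That average is maintained in microseconds, hence the NSEC_PER_USEC scaling
			 * below before comparing it against the deferred IPI timer (in nanoseconds).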
5309 */
5310 if ((preferred_pset->pset_execution_time[thread->th_sched_bucket].pset_avg_thread_execution_time * NSEC_PER_USEC) >= ml_cpu_signal_deferred_get_timer()) {
5311 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
5312 }
5313 }
5314 }
5315 break;
5316 default:
5317 break;
5318 }
5319 /* Default back to the global policy for all other scenarios */
5320 return sched_ipi_policy(dst, thread, dst_idle, event);
5321 }
5322
5323 /*
5324 * sched_edge_qos_max_parallelism()
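 *
 * Worked example (illustrative): on a hypothetical system with 4 E-cores in a
 * single E-cluster and 2 P-cores in a single P-cluster, this returns 2 for
 * realtime, 4 for BG/Maintenance QoS and 6 for every other QoS class; with
 * QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE the corresponding cluster counts
 * (1, 1 and 2) are returned instead.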
5325 */
5326 uint32_t
5327 sched_edge_qos_max_parallelism(int qos, uint64_t options)
5328 {
5329 uint32_t ecpu_count = ml_get_cpu_number_type(CLUSTER_TYPE_E, false, false);
5330 uint32_t pcpu_count = ml_get_cpu_number_type(CLUSTER_TYPE_P, false, false);
5331 uint32_t ecluster_count = ml_get_cluster_number_type(CLUSTER_TYPE_E);
5332 uint32_t pcluster_count = ml_get_cluster_number_type(CLUSTER_TYPE_P);
5333
5334
5335 if (options & QOS_PARALLELISM_REALTIME) {
5336 		/* For realtime threads on AMP, limit the parallelism
5337 		 * width to just the P-cores since we do not
5338 		 * spill/rebalance RT threads.
5339 */
5340 return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? pcluster_count : pcpu_count;
5341 }
5342
5343 /*
5344 * The Edge scheduler supports per-QoS recommendations for thread groups.
5345 * This enables lower QoS buckets (such as UT) to be scheduled on all
5346 * CPUs on the system.
5347 *
5348 * The only restriction is for BG/Maintenance QoS classes for which the
5349 * performance controller would never recommend execution on the P-cores.
5350 * If that policy changes in the future, this value should be changed.
5351 */
5352 switch (qos) {
5353 case THREAD_QOS_BACKGROUND:
5354 case THREAD_QOS_MAINTENANCE:
5355 return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? ecluster_count : ecpu_count;
5356 default:
5357 return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? (ecluster_count + pcluster_count) : (ecpu_count + pcpu_count);
5358 }
5359 }
5360
5361
5362 #endif /* CONFIG_SCHED_EDGE */
5363
5364 #endif /* CONFIG_SCHED_CLUTCH */
5365