/*
 * Copyright (c) 2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <mach/mach_types.h>
#include <mach/machine.h>
#include <machine/machine_routines.h>
#include <machine/sched_param.h>
#include <machine/machine_cpu.h>
#include <kern/kern_types.h>
#include <kern/debug.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/sched_clutch.h>
#include <machine/atomic.h>
#include <sys/kdebug.h>

#if CONFIG_SCHED_EDGE
#include <kern/sched_amp_common.h>
#endif /* CONFIG_SCHED_EDGE */

#if CONFIG_SCHED_CLUTCH

/* Forward declarations of static routines */

/* Root level hierarchy management */
static void sched_clutch_root_init(sched_clutch_root_t, processor_set_t);
static void sched_clutch_root_bucket_init(sched_clutch_root_bucket_t, sched_bucket_t, bool);
static void sched_clutch_root_pri_update(sched_clutch_root_t);
static void sched_clutch_root_urgency_inc(sched_clutch_root_t, thread_t);
static void sched_clutch_root_urgency_dec(sched_clutch_root_t, thread_t);

__enum_decl(sched_clutch_highest_root_bucket_type_t, uint32_t, {
	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_NONE = 0,
	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY = 1,
	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL = 2,
});

static sched_clutch_root_bucket_t sched_clutch_root_highest_root_bucket(sched_clutch_root_t, uint64_t, sched_clutch_highest_root_bucket_type_t);

#if CONFIG_SCHED_EDGE
/* Support for foreign threads on AMP platforms */
static boolean_t sched_clutch_root_foreign_empty(sched_clutch_root_t);
static thread_t sched_clutch_root_highest_foreign_thread_remove(sched_clutch_root_t);
#endif /* CONFIG_SCHED_EDGE */

/* Root bucket level hierarchy management */
static uint64_t sched_clutch_root_bucket_deadline_calculate(sched_clutch_root_bucket_t, uint64_t);
static void sched_clutch_root_bucket_deadline_update(sched_clutch_root_bucket_t, sched_clutch_root_t, uint64_t);

/* Options for clutch bucket ordering in the runq */
__options_decl(sched_clutch_bucket_options_t, uint32_t, {
	SCHED_CLUTCH_BUCKET_OPTIONS_NONE = 0x0,
	/* Round robin clutch bucket on thread removal */
	SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR = 0x1,
	/* Insert clutch bucket at head (for thread preemption) */
	SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ = 0x2,
	/* Insert clutch bucket at tail (default) */
	SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ = 0x4,
});

/* Clutch bucket level hierarchy management */
static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
static uint8_t sched_clutch_bucket_pri_calculate(sched_clutch_bucket_t, uint64_t);

/* Clutch bucket group level properties management */
static void sched_clutch_bucket_group_cpu_usage_update(sched_clutch_bucket_group_t, uint64_t);
static void sched_clutch_bucket_group_cpu_adjust(sched_clutch_bucket_group_t, uint8_t);
static void sched_clutch_bucket_group_timeshare_update(sched_clutch_bucket_group_t, sched_clutch_bucket_t, uint64_t);
static uint8_t sched_clutch_bucket_group_pending_ageout(sched_clutch_bucket_group_t, uint64_t);
static uint32_t sched_clutch_bucket_group_run_count_inc(sched_clutch_bucket_group_t);
static uint32_t sched_clutch_bucket_group_run_count_dec(sched_clutch_bucket_group_t);
static uint8_t sched_clutch_bucket_group_interactivity_score_calculate(sched_clutch_bucket_group_t, uint64_t);

/* Clutch timeshare properties updates */
static uint32_t sched_clutch_run_bucket_incr(sched_clutch_t, sched_bucket_t);
static uint32_t sched_clutch_run_bucket_decr(sched_clutch_t, sched_bucket_t);

/* Clutch membership management */
static boolean_t sched_clutch_thread_insert(sched_clutch_root_t, thread_t, integer_t);
static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t, sched_clutch_bucket_options_t);
static thread_t sched_clutch_thread_highest_remove(sched_clutch_root_t);

/* Clutch properties updates */
static uint32_t sched_clutch_root_urgency(sched_clutch_root_t);
static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t);
static int sched_clutch_root_priority(sched_clutch_root_t);
static sched_clutch_bucket_t sched_clutch_root_bucket_highest_clutch_bucket(sched_clutch_root_bucket_t);
static boolean_t sched_thread_sched_pri_promoted(thread_t);

#if CONFIG_SCHED_EDGE
/* System based routines */
static bool sched_edge_pset_available(processor_set_t);
static uint32_t sched_edge_thread_bound_cluster_id(thread_t);
static int sched_edge_iterate_clusters_ordered(processor_set_t, uint64_t, int);

/* Global indicating the maximum number of clusters on the current platform */
static int sched_edge_max_clusters = 0;
#endif /* CONFIG_SCHED_EDGE */

/* Helper debugging routines */
static inline void sched_clutch_hierarchy_locked_assert(sched_clutch_root_t);

extern processor_set_t pset_array[MAX_PSETS];
/*
 * Special markers for buckets that have invalid WCELs/quantums etc.
 */
#define SCHED_CLUTCH_INVALID_TIME_32 ((uint32_t)~0)
#define SCHED_CLUTCH_INVALID_TIME_64 ((uint64_t)~0)

/*
 * Root level bucket WCELs
 *
 * The root level bucket selection algorithm is an Earliest Deadline
 * First (EDF) algorithm where the deadline for each bucket is defined
 * by its worst-case-execution-latency (WCEL) and the timestamp at which
 * the bucket became runnable.
 */
static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = {
	SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
	0, /* FG */
	37500, /* IN (37.5ms) */
	75000, /* DF (75ms) */
	150000, /* UT (150ms) */
	250000 /* BG (250ms) */
};
static uint64_t sched_clutch_root_bucket_wcel[TH_BUCKET_SCHED_MAX] = {0};
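
/*
 * Worked example of the EDF deadlines (illustrative numbers, using the default
 * WCEL table above): if the IN and UT root buckets both become runnable at
 * time t, their deadlines are t + 37.5ms and t + 150ms respectively, so IN is
 * selected first. An FG bucket that becomes runnable later, at t + 20ms, gets
 * deadline t + 20ms (WCEL of 0) and therefore jumps ahead of both.
 */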

/*
 * Root level bucket warp
 *
 * Each root level bucket has a warp value associated with it as well.
 * The warp value allows the root bucket to effectively warp ahead of
 * lower priority buckets for a limited time even if it has a later
 * deadline. The warping behavior provides extra (but limited)
 * opportunity for high priority buckets to remain responsive.
 */

/* Special warp deadline value to indicate that the bucket has not used any warp yet */
#define SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED (SCHED_CLUTCH_INVALID_TIME_64)

/* Warp window durations for various tiers */
static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = {
	SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
	8000, /* FG (8ms) */
	4000, /* IN (4ms) */
	2000, /* DF (2ms) */
	1000, /* UT (1ms) */
	0 /* BG (0ms) */
};
static uint64_t sched_clutch_root_bucket_warp[TH_BUCKET_SCHED_MAX] = {0};
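
/*
 * Worked example of warp (illustrative numbers): suppose the UT root bucket
 * holds the earliest deadline but the FG bucket is runnable with its full 8ms
 * warp available. FG is allowed to warp ahead and run, and an 8ms warp window
 * is opened. Once that window expires, FG falls back to competing purely on
 * its EDF deadline until it is selected in the natural deadline order again,
 * which replenishes its warp.
 */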

/*
 * Thread level quantum
 *
 * The algorithm defines quantums for threads at various buckets. This
 * (combined with the root level bucket quantums) restricts how much
 * CPU time the lower priority levels can take away from the higher
 * priority threads.
 */

#if XNU_TARGET_OS_OSX
static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
	10000, /* FIXPRI (10ms) */
	10000, /* FG (10ms) */
	10000, /* IN (10ms) */
	10000, /* DF (10ms) */
	4000, /* UT (4ms) */
	2000 /* BG (2ms) */
};
#else /* XNU_TARGET_OS_OSX */
static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
	10000, /* FIXPRI (10ms) */
	10000, /* FG (10ms) */
	8000, /* IN (8ms) */
	6000, /* DF (6ms) */
	4000, /* UT (4ms) */
	2000 /* BG (2ms) */
};
#endif /* XNU_TARGET_OS_OSX */

static uint64_t sched_clutch_thread_quantum[TH_BUCKET_SCHED_MAX] = {0};
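
/*
 * Example of the quantum bound (illustrative): on a non-macOS target a BG
 * thread selected for execution runs for at most its 2ms quantum before the
 * scheduler re-evaluates, so even when the BG root bucket wins the EDF/warp
 * selection it cannot hold a CPU for long stretches while higher buckets are
 * runnable.
 */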

/*
 * sched_clutch_us_to_abstime()
 *
 * Initializer for converting all durations in usec to abstime
 */
static void
sched_clutch_us_to_abstime(uint32_t *us_vals, uint64_t *abstime_vals)
{
	for (int i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
		if (us_vals[i] == SCHED_CLUTCH_INVALID_TIME_32) {
			abstime_vals[i] = SCHED_CLUTCH_INVALID_TIME_64;
		} else {
			clock_interval_to_absolutetime_interval(us_vals[i],
			    NSEC_PER_USEC, &abstime_vals[i]);
		}
	}
}

/* Clutch/Edge Scheduler Debugging support */
#define SCHED_CLUTCH_DBG_THR_COUNT_PACK(a, b, c) ((uint64_t)c | ((uint64_t)b << 16) | ((uint64_t)a << 32))
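/*
 * For example (illustrative values), SCHED_CLUTCH_DBG_THR_COUNT_PACK(1, 2, 3)
 * packs the three 16-bit counts into a single tracepoint argument as
 * 0x0000000100020003 (a in bits 32-47, b in bits 16-31, c in bits 0-15).
 */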

#if DEVELOPMENT || DEBUG

/*
 * sched_clutch_hierarchy_locked_assert()
 *
 * Debugging helper routine. Asserts that the hierarchy is locked. The locking
 * for the hierarchy depends on where the hierarchy is hooked. The current
 * implementation hooks the hierarchy at the pset, so the hierarchy is locked
 * using the pset lock.
 */
static inline void
sched_clutch_hierarchy_locked_assert(
	sched_clutch_root_t root_clutch)
{
	pset_assert_locked(root_clutch->scr_pset);
}

#else /* DEVELOPMENT || DEBUG */

static inline void
sched_clutch_hierarchy_locked_assert(
	__unused sched_clutch_root_t root_clutch)
{
}

#endif /* DEVELOPMENT || DEBUG */

/*
 * sched_clutch_thr_count_inc()
 *
 * Increment thread count at a hierarchy level with overflow checks.
 */
static void
sched_clutch_thr_count_inc(
	uint16_t *thr_count)
{
	if (__improbable(os_inc_overflow(thr_count))) {
		panic("sched_clutch thread count overflowed!");
	}
}

/*
 * sched_clutch_thr_count_dec()
 *
 * Decrement thread count at a hierarchy level with underflow checks.
 */
static void
sched_clutch_thr_count_dec(
	uint16_t *thr_count)
{
	if (__improbable(os_dec_overflow(thr_count))) {
		panic("sched_clutch thread count underflowed!");
	}
}

static sched_bucket_t
sched_convert_pri_to_bucket(uint8_t priority)
{
	sched_bucket_t bucket = TH_BUCKET_RUN;

	if (priority > BASEPRI_USER_INITIATED) {
		bucket = TH_BUCKET_SHARE_FG;
	} else if (priority > BASEPRI_DEFAULT) {
		bucket = TH_BUCKET_SHARE_IN;
	} else if (priority > BASEPRI_UTILITY) {
		bucket = TH_BUCKET_SHARE_DF;
	} else if (priority > MAXPRI_THROTTLE) {
		bucket = TH_BUCKET_SHARE_UT;
	} else {
		bucket = TH_BUCKET_SHARE_BG;
	}
	return bucket;
}

/*
 * sched_clutch_thread_bucket_map()
 *
 * Map a thread to a scheduling bucket for the clutch/edge scheduler
 * based on its scheduling mode and the priority attribute passed in.
 */
static sched_bucket_t
sched_clutch_thread_bucket_map(thread_t thread, int pri)
{
	switch (thread->sched_mode) {
	case TH_MODE_FIXED:
		if (pri >= BASEPRI_FOREGROUND) {
			return TH_BUCKET_FIXPRI;
		} else {
			return sched_convert_pri_to_bucket(pri);
		}

	case TH_MODE_REALTIME:
		return TH_BUCKET_FIXPRI;

	case TH_MODE_TIMESHARE:
		return sched_convert_pri_to_bucket(pri);

	default:
		panic("unexpected mode: %d", thread->sched_mode);
		break;
	}
}

/*
 * The clutch scheduler attempts to ageout the CPU usage of clutch bucket groups
 * based on the amount of time they have been pending and the load at that
 * scheduling bucket level. Since the clutch bucket groups are global (i.e. they
 * span multiple clusters), it is important to also keep the load as a global counter.
 */
static uint32_t _Atomic sched_clutch_global_bucket_load[TH_BUCKET_SCHED_MAX];

/*
 * sched_clutch_root_init()
 *
 * Routine to initialize the scheduler hierarchy root.
 */
static void
sched_clutch_root_init(
	sched_clutch_root_t root_clutch,
	processor_set_t pset)
{
	root_clutch->scr_thr_count = 0;
	root_clutch->scr_priority = NOPRI;
	root_clutch->scr_urgency = 0;
	root_clutch->scr_pset = pset;
#if CONFIG_SCHED_EDGE
	root_clutch->scr_cluster_id = pset->pset_cluster_id;
#else /* CONFIG_SCHED_EDGE */
	root_clutch->scr_cluster_id = 0;
#endif /* CONFIG_SCHED_EDGE */

	for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
		root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type] = 0;
	}
	/* Initialize the queue which maintains all runnable clutch_buckets for timesharing purposes */
	queue_init(&root_clutch->scr_clutch_buckets);

	/* Initialize the priority queue which maintains all runnable foreign clutch buckets */
	priority_queue_init(&root_clutch->scr_foreign_buckets);
	bzero(&root_clutch->scr_cumulative_run_count, sizeof(root_clutch->scr_cumulative_run_count));
	bitmap_zero(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
	bitmap_zero(root_clutch->scr_bound_warp_available, TH_BUCKET_SCHED_MAX);
	priority_queue_init(&root_clutch->scr_bound_root_buckets);

	/* Initialize the bitmap and priority queue of runnable root buckets */
	priority_queue_init(&root_clutch->scr_unbound_root_buckets);
	bitmap_zero(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
	bitmap_zero(root_clutch->scr_unbound_warp_available, TH_BUCKET_SCHED_MAX);

	/* Initialize all the root buckets */
	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
		sched_clutch_root_bucket_init(&root_clutch->scr_unbound_buckets[i], i, false);
		sched_clutch_root_bucket_init(&root_clutch->scr_bound_buckets[i], i, true);
	}
}

/*
 * Clutch Bucket Runqueues
 *
 * The clutch buckets are maintained in a runq at the root bucket level. The
 * runq organization allows clutch buckets to be ordered based on various
 * factors such as:
 *
 * - Clutch buckets are round robin'ed at the same priority level when a
 *   thread is selected from a clutch bucket. This prevents a clutch bucket
 *   from starving out other clutch buckets at the same priority.
 *
 * - Clutch buckets are inserted at the head when they become runnable due to
 *   thread preemption. This allows threads that were preempted to maintain
 *   their order in the queue.
 */
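
/*
 * For example (illustrative scenario): if clutch buckets A and B are both
 * queued at priority 20 and a thread is selected from A, A is rotated behind B
 * (SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR) so B's threads run next. If a thread
 * from A is later preempted, A is re-inserted at the head of the priority-20
 * queue (SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ) so it does not lose its turn.
 */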

/*
 * sched_clutch_bucket_runq_init()
 *
 * Initialize a clutch bucket runq.
 */
static void
sched_clutch_bucket_runq_init(
	sched_clutch_bucket_runq_t clutch_buckets_rq)
{
	clutch_buckets_rq->scbrq_highq = NOPRI;
	for (uint8_t i = 0; i < BITMAP_LEN(NRQS); i++) {
		clutch_buckets_rq->scbrq_bitmap[i] = 0;
	}
	clutch_buckets_rq->scbrq_count = 0;
	for (int i = 0; i < NRQS; i++) {
		circle_queue_init(&clutch_buckets_rq->scbrq_queues[i]);
	}
}

/*
 * sched_clutch_bucket_runq_empty()
 *
 * Returns if a clutch bucket runq is empty.
 */
static boolean_t
sched_clutch_bucket_runq_empty(
	sched_clutch_bucket_runq_t clutch_buckets_rq)
{
	return clutch_buckets_rq->scbrq_count == 0;
}

/*
 * sched_clutch_bucket_runq_peek()
 *
 * Returns the highest priority clutch bucket in the runq.
 */
static sched_clutch_bucket_t
sched_clutch_bucket_runq_peek(
	sched_clutch_bucket_runq_t clutch_buckets_rq)
{
	if (clutch_buckets_rq->scbrq_count > 0) {
		circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_buckets_rq->scbrq_highq];
		return cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink);
	} else {
		return NULL;
	}
}

/*
 * sched_clutch_bucket_runq_enqueue()
 *
 * Enqueue a clutch bucket into the runq based on the options passed in.
 */
static void
sched_clutch_bucket_runq_enqueue(
	sched_clutch_bucket_runq_t clutch_buckets_rq,
	sched_clutch_bucket_t clutch_bucket,
	sched_clutch_bucket_options_t options)
{
	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
	if (circle_queue_empty(queue)) {
		circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
		bitmap_set(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
		if (clutch_bucket->scb_priority > clutch_buckets_rq->scbrq_highq) {
			clutch_buckets_rq->scbrq_highq = clutch_bucket->scb_priority;
		}
	} else {
		if (options & SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ) {
			circle_enqueue_head(queue, &clutch_bucket->scb_runqlink);
		} else {
			/*
			 * Default behavior (handles SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ &
			 * SCHED_CLUTCH_BUCKET_OPTIONS_NONE)
			 */
			circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
		}
	}
	clutch_buckets_rq->scbrq_count++;
}

/*
 * sched_clutch_bucket_runq_remove()
 *
 * Remove a clutch bucket from the runq.
 */
static void
sched_clutch_bucket_runq_remove(
	sched_clutch_bucket_runq_t clutch_buckets_rq,
	sched_clutch_bucket_t clutch_bucket)
{
	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
	circle_dequeue(queue, &clutch_bucket->scb_runqlink);
	assert(clutch_buckets_rq->scbrq_count > 0);
	clutch_buckets_rq->scbrq_count--;
	if (circle_queue_empty(queue)) {
		bitmap_clear(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
		clutch_buckets_rq->scbrq_highq = bitmap_first(clutch_buckets_rq->scbrq_bitmap, NRQS);
	}
}

static void
sched_clutch_bucket_runq_rotate(
	sched_clutch_bucket_runq_t clutch_buckets_rq,
	sched_clutch_bucket_t clutch_bucket)
{
	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
	assert(clutch_bucket == cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink));
	circle_queue_rotate_head_forward(queue);
}

/*
 * sched_clutch_root_bucket_init()
 *
 * Routine to initialize root buckets.
 */
static void
sched_clutch_root_bucket_init(
	sched_clutch_root_bucket_t root_bucket,
	sched_bucket_t bucket,
	bool bound_root_bucket)
{
	root_bucket->scrb_bucket = bucket;
	if (bound_root_bucket) {
		/* For bound root buckets, initialize the bound thread runq. */
		root_bucket->scrb_bound = true;
		run_queue_init(&root_bucket->scrb_bound_thread_runq);
	} else {
		/*
		 * The unbounded root buckets contain a runq of runnable clutch buckets
		 * which then hold the runnable threads.
		 */
		root_bucket->scrb_bound = false;
		sched_clutch_bucket_runq_init(&root_bucket->scrb_clutch_buckets);
	}
	priority_queue_entry_init(&root_bucket->scrb_pqlink);
	root_bucket->scrb_pqlink.deadline = SCHED_CLUTCH_INVALID_TIME_64;
	root_bucket->scrb_warped_deadline = 0;
	root_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[root_bucket->scrb_bucket];
	root_bucket->scrb_starvation_avoidance = false;
	root_bucket->scrb_starvation_ts = 0;
}

/*
 * Special case scheduling for Above UI bucket.
 *
 * AboveUI threads are typically system critical threads that need low latency,
 * which is why they are handled specially.
 *
 * Since the priority ranges for the AboveUI and FG Timeshare buckets overlap, it is
 * important to maintain some native priority order between those buckets. For unbounded
 * root buckets, the policy is to compare the highest clutch buckets of both buckets; if the
 * Above UI bucket is higher, schedule it immediately. Otherwise fall through to the
 * deadline based scheduling which should pick up the timeshare buckets. For the bound
 * case, the policy simply compares the priority of the highest runnable threads in
 * the above UI and timeshare buckets.
 *
 * The implementation allows extremely low latency CPU access for Above UI threads
 * while supporting the use case of high priority timeshare threads contending with
 * lower priority fixed priority threads.
 */
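
/*
 * For example (illustrative priorities): if the highest runnable AboveUI clutch
 * bucket sits at priority 47 and the highest FG timeshare clutch bucket has
 * decayed to priority 44, the AboveUI bucket is selected directly. If the FG
 * bucket were instead at priority 50, selection falls through to the EDF logic,
 * and the AboveUI bucket only runs once the FG bucket's interactivity
 * score/priority drops below it.
 */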


/*
 * sched_clutch_root_unbound_select_aboveui()
 *
 * Routine to determine if the above UI unbounded bucket should be selected for execution.
 */
static bool
sched_clutch_root_unbound_select_aboveui(
	sched_clutch_root_t root_clutch)
{
	if (bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_FIXPRI)) {
		sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
		sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_unbound_buckets[TH_BUCKET_SHARE_FG];
		if (!bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SHARE_FG)) {
			/* If the timeshare FG bucket is not runnable, pick the aboveUI bucket for scheduling */
			return true;
		}
		sched_clutch_bucket_t clutch_bucket_aboveui = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_aboveui);
		sched_clutch_bucket_t clutch_bucket_sharefg = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_sharefg);
		if (clutch_bucket_aboveui->scb_priority >= clutch_bucket_sharefg->scb_priority) {
			return true;
		}
	}
	return false;
}

/*
 * sched_clutch_root_bound_select_aboveui()
 *
 * Routine to determine if the above UI bounded bucket should be selected for execution.
 */
static bool
sched_clutch_root_bound_select_aboveui(
	sched_clutch_root_t root_clutch)
{
	sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
	sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_bound_buckets[TH_BUCKET_SHARE_FG];
	if (root_bucket_aboveui->scrb_bound_thread_runq.count == 0) {
		return false;
	}
	return root_bucket_aboveui->scrb_bound_thread_runq.highq >= root_bucket_sharefg->scrb_bound_thread_runq.highq;
}

/*
 * sched_clutch_root_highest_root_bucket()
 *
 * Main routine to find the highest runnable root level bucket.
 * This routine is called from performance sensitive contexts, so it is
 * crucial to keep this O(1). The options parameter determines if
 * the selection logic should look at unbounded threads only (for
 * cross-cluster stealing operations) or both bounded and unbounded
 * threads (for selecting the next thread for execution on the current cluster).
 */
static sched_clutch_root_bucket_t
sched_clutch_root_highest_root_bucket(
	sched_clutch_root_t root_clutch,
	uint64_t timestamp,
	sched_clutch_highest_root_bucket_type_t type)
{
	sched_clutch_hierarchy_locked_assert(root_clutch);
	int highest_runnable_bucket = -1;
	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
		highest_runnable_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
	} else {
		int highest_unbound_runnable_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
		int highest_bound_runnable_bucket = bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
		highest_runnable_bucket = (highest_bound_runnable_bucket != -1) ? ((highest_unbound_runnable_bucket != -1) ? MIN(highest_bound_runnable_bucket, highest_unbound_runnable_bucket) : highest_bound_runnable_bucket) : highest_unbound_runnable_bucket;
	}

	if (highest_runnable_bucket == -1) {
		return NULL;
	}

	/* Above UI root bucket selection (see comment above for more details on this special case handling) */
	bool unbound_aboveui = sched_clutch_root_unbound_select_aboveui(root_clutch);
	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
		if (unbound_aboveui) {
			return &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
		}
		/* Fall through to selecting a timeshare root bucket */
	} else {
		bool bound_aboveui = sched_clutch_root_bound_select_aboveui(root_clutch);
		sched_clutch_root_bucket_t unbound_aboveui_root_bucket = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
		sched_clutch_root_bucket_t bound_aboveui_root_bucket = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];

		if (unbound_aboveui && bound_aboveui) {
			/*
			 * In this scenario both the bounded and unbounded above UI buckets are runnable; choose based on the
			 * highest runnable priority in both buckets.
			 */
			int bound_aboveui_pri = root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI].scrb_bound_thread_runq.highq;
			sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(unbound_aboveui_root_bucket);
			int unbound_aboveui_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
			return (bound_aboveui_pri >= unbound_aboveui_pri) ? bound_aboveui_root_bucket : unbound_aboveui_root_bucket;
		}
		if (unbound_aboveui) {
			return unbound_aboveui_root_bucket;
		}
		if (bound_aboveui) {
			return bound_aboveui_root_bucket;
		}
		/* Fall through to selecting a timeshare root bucket */
	}

	/*
	 * Above UI bucket is not runnable or has a low priority runnable thread; use the
	 * earliest deadline model to schedule threads. The idea is that as the timeshare
	 * buckets use CPU, they will drop their interactivity score/sched priority and
	 * allow the low priority AboveUI buckets to be scheduled.
	 */

	/* Find the earliest deadline bucket */
	sched_clutch_root_bucket_t edf_bucket = NULL;
	sched_clutch_root_bucket_t warp_bucket = NULL;
	int warp_bucket_index = -1;

evaluate_root_buckets:
	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
		edf_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
	} else {
		sched_clutch_root_bucket_t unbound_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
		sched_clutch_root_bucket_t bound_bucket = priority_queue_min(&root_clutch->scr_bound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
		if (bound_bucket && unbound_bucket) {
			/* If bound and unbound root buckets are runnable, select the one with the earlier deadline */
			edf_bucket = (bound_bucket->scrb_pqlink.deadline <= unbound_bucket->scrb_pqlink.deadline) ? bound_bucket : unbound_bucket;
		} else {
			edf_bucket = (bound_bucket) ? bound_bucket : unbound_bucket;
		}
	}
	/*
	 * Check if any of the buckets have warp available. The implementation only allows root buckets to warp ahead of
	 * buckets of the same type (i.e. bound/unbound). The reason for doing that is because warping is a concept that
	 * makes sense between root buckets of the same type since it's effectively a scheduling advantage over a lower
	 * QoS root bucket.
	 */
	bitmap_t *warp_available_bitmap = (edf_bucket->scrb_bound) ? (root_clutch->scr_bound_warp_available) : (root_clutch->scr_unbound_warp_available);
	warp_bucket_index = bitmap_lsb_first(warp_available_bitmap, TH_BUCKET_SCHED_MAX);

	if ((warp_bucket_index == -1) || (warp_bucket_index >= edf_bucket->scrb_bucket)) {
		/* No higher buckets have warp left; best choice is the EDF based bucket */
		if (edf_bucket->scrb_starvation_avoidance) {
			/*
			 * Indicates that the earliest deadline bucket is in starvation avoidance mode. Check to see if the
			 * starvation avoidance window is still open and return this bucket if it is.
			 *
			 * The starvation avoidance window is calculated based on the quantum of threads at that bucket and
			 * the number of CPUs in the cluster. The idea is to basically provide one quantum worth of starvation
			 * avoidance across all CPUs.
			 */
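			/*
			 * For example (illustrative numbers): with a 10ms FG thread quantum and
			 * 4 available CPUs in the cluster, the starvation avoidance window is
			 * 10ms / 4 = 2.5ms from scrb_starvation_ts.
			 */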
			uint64_t starvation_window = sched_clutch_thread_quantum[edf_bucket->scrb_bucket] / pset_available_cpu_count(root_clutch->scr_pset);
			if (timestamp < (edf_bucket->scrb_starvation_ts + starvation_window)) {
				return edf_bucket;
			} else {
				/* Starvation avoidance window is over; update deadline and re-evaluate EDF */
				edf_bucket->scrb_starvation_avoidance = false;
				edf_bucket->scrb_starvation_ts = 0;
				sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
			}
			goto evaluate_root_buckets;
		}

		/* Looks like the EDF bucket is not in starvation avoidance mode; check if it should be */
		if (highest_runnable_bucket < edf_bucket->scrb_bucket) {
			/* Since a higher bucket is runnable, it indicates that the EDF bucket should be in starvation avoidance */
			edf_bucket->scrb_starvation_avoidance = true;
			edf_bucket->scrb_starvation_ts = timestamp;
		} else {
			/* EDF bucket is being selected in the natural order; update deadline and reset warp */
			sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
			edf_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[edf_bucket->scrb_bucket];
			edf_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
			if (edf_bucket->scrb_bound) {
				bitmap_set(root_clutch->scr_bound_warp_available, edf_bucket->scrb_bucket);
			} else {
				bitmap_set(root_clutch->scr_unbound_warp_available, edf_bucket->scrb_bucket);
			}
		}
		return edf_bucket;
	}

	/*
	 * Looks like there is a root bucket which is higher in the natural priority
	 * order than edf_bucket and might have some warp remaining.
	 */
	warp_bucket = (edf_bucket->scrb_bound) ? &root_clutch->scr_bound_buckets[warp_bucket_index] : &root_clutch->scr_unbound_buckets[warp_bucket_index];
	if (warp_bucket->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
		/* Root bucket has not used any of its warp; set a deadline to expire its warp and return it */
		warp_bucket->scrb_warped_deadline = timestamp + warp_bucket->scrb_warp_remaining;
		sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
		return warp_bucket;
	}
	if (warp_bucket->scrb_warped_deadline > timestamp) {
		/* Root bucket already has a warp window open with some warp remaining */
		sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
		return warp_bucket;
	}

	/*
	 * For this bucket, warp window was opened sometime in the past but has now
	 * expired. Mark the bucket as not available for warp anymore and re-run the
	 * warp bucket selection logic.
	 */
	warp_bucket->scrb_warp_remaining = 0;
	if (warp_bucket->scrb_bound) {
		bitmap_clear(root_clutch->scr_bound_warp_available, warp_bucket->scrb_bucket);
	} else {
		bitmap_clear(root_clutch->scr_unbound_warp_available, warp_bucket->scrb_bucket);
	}
	goto evaluate_root_buckets;
}

/*
 * sched_clutch_root_bucket_deadline_calculate()
 *
 * Calculate the deadline for the bucket based on its WCEL
 */
static uint64_t
sched_clutch_root_bucket_deadline_calculate(
	sched_clutch_root_bucket_t root_bucket,
	uint64_t timestamp)
{
	/* For fixpri AboveUI bucket always return it as the earliest deadline */
	if (root_bucket->scrb_bucket < TH_BUCKET_SHARE_FG) {
		return 0;
	}

	/* For all timeshare buckets set the deadline as current time + worst-case-execution-latency */
	return timestamp + sched_clutch_root_bucket_wcel[root_bucket->scrb_bucket];
}

/*
 * sched_clutch_root_bucket_deadline_update()
 *
 * Routine to update the deadline of the root bucket when it is selected.
 * Updating the deadline also moves the root_bucket in the EDF priority
 * queue.
 */
static void
sched_clutch_root_bucket_deadline_update(
	sched_clutch_root_bucket_t root_bucket,
	sched_clutch_root_t root_clutch,
	uint64_t timestamp)
{
	if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
		/* The algorithm never uses the deadlines for scheduling TH_BUCKET_FIXPRI bucket */
		return;
	}

	uint64_t old_deadline = root_bucket->scrb_pqlink.deadline;
	uint64_t new_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
	if (__improbable(old_deadline > new_deadline)) {
		panic("old_deadline (%llu) > new_deadline (%llu); root_bucket (%d); timestamp (%llu)", old_deadline, new_deadline, root_bucket->scrb_bucket, timestamp);
	}
	if (old_deadline != new_deadline) {
		root_bucket->scrb_pqlink.deadline = new_deadline;
		struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
		priority_queue_entry_increased(prioq, &root_bucket->scrb_pqlink);
	}
}

/*
 * sched_clutch_root_bucket_runnable()
 *
 * Routine to insert a newly runnable root bucket into the hierarchy.
 * Also updates the deadline and warp parameters as necessary.
 */
static void
sched_clutch_root_bucket_runnable(
	sched_clutch_root_bucket_t root_bucket,
	sched_clutch_root_t root_clutch,
	uint64_t timestamp)
{
	/* Mark the root bucket as runnable */
	bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
	bitmap_set(runnable_bitmap, root_bucket->scrb_bucket);

	if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
		/* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
		return;
	}

	if (root_bucket->scrb_starvation_avoidance == false) {
		/*
		 * Only update the deadline if the bucket was not in starvation avoidance mode. If the bucket was in
		 * starvation avoidance and its window has expired, the highest root bucket selection logic will notice
		 * that and fix it up.
		 */
		root_bucket->scrb_pqlink.deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
	}
	struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
	priority_queue_insert(prioq, &root_bucket->scrb_pqlink);
	if (root_bucket->scrb_warp_remaining) {
		/* Since the bucket has some warp remaining and it's now runnable, mark it as available for warp */
		bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
		bitmap_set(warp_bitmap, root_bucket->scrb_bucket);
	}
}

/*
 * sched_clutch_root_bucket_empty()
 *
 * Routine to remove an empty root bucket from the hierarchy.
 * Also updates the deadline and warp parameters as necessary.
 */
static void
sched_clutch_root_bucket_empty(
	sched_clutch_root_bucket_t root_bucket,
	sched_clutch_root_t root_clutch,
	uint64_t timestamp)
{
	bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
	bitmap_clear(runnable_bitmap, root_bucket->scrb_bucket);

	if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
		/* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
		return;
	}

	struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
	priority_queue_remove(prioq, &root_bucket->scrb_pqlink);

	bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
	bitmap_clear(warp_bitmap, root_bucket->scrb_bucket);

	if (root_bucket->scrb_warped_deadline != SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
		if (root_bucket->scrb_warped_deadline > timestamp) {
			/*
			 * For root buckets that were using the warp, check if the warp
			 * deadline is in the future. If yes, remove the wall time the
			 * warp was active and update the warp remaining. This allows
			 * the root bucket to use the remaining warp the next time it
			 * becomes runnable.
			 */
			root_bucket->scrb_warp_remaining = root_bucket->scrb_warped_deadline - timestamp;
		} else {
			/*
			 * If the root bucket's warped deadline is in the past, it has used up
			 * all the warp it was assigned. Empty out its warp remaining.
			 */
			root_bucket->scrb_warp_remaining = 0;
		}
	}
}

static int
sched_clutch_global_bucket_load_get(
	sched_bucket_t bucket)
{
	return (int)os_atomic_load(&sched_clutch_global_bucket_load[bucket], relaxed);
}

/*
 * sched_clutch_root_pri_update()
 *
 * The root level priority is used for thread selection and preemption
 * logic.
 *
 * The logic uses the same decision as thread selection for deciding between the
 * above UI and timeshare buckets. If one of the timesharing buckets has to be
 * used for priority calculation, the logic is slightly different from thread
 * selection, because thread selection considers deadlines, warps etc. to
 * decide the most optimal bucket at a given timestamp. Since the priority
 * value is used for preemption decisions only, it needs to be based on the
 * highest runnable thread available in the timeshare domain. This logic can
 * be made more sophisticated if there are cases of unnecessary preemption
 * being seen in workloads.
 */
static void
sched_clutch_root_pri_update(
	sched_clutch_root_t root_clutch)
{
	sched_clutch_hierarchy_locked_assert(root_clutch);
	int16_t root_bound_pri = NOPRI;
	int16_t root_unbound_pri = NOPRI;

	if (bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
		goto root_pri_update_unbound;
	}
	sched_clutch_root_bucket_t root_bucket_bound = NULL;
	if (sched_clutch_root_bound_select_aboveui(root_clutch)) {
		root_bucket_bound = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
	} else {
		int root_bucket_index = bitmap_lsb_next(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
		assert(root_bucket_index != -1);
		root_bucket_bound = &root_clutch->scr_bound_buckets[root_bucket_index];
	}
	root_bound_pri = root_bucket_bound->scrb_bound_thread_runq.highq;

root_pri_update_unbound:
	if (bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
		goto root_pri_update_complete;
	}
	sched_clutch_root_bucket_t root_bucket_unbound = NULL;
	if (sched_clutch_root_unbound_select_aboveui(root_clutch)) {
		root_bucket_unbound = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
	} else {
		int root_bucket_index = bitmap_lsb_next(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
		assert(root_bucket_index != -1);
		root_bucket_unbound = &root_clutch->scr_unbound_buckets[root_bucket_index];
	}
	/* For the selected root bucket, find the highest priority clutch bucket */
	sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_unbound);
	root_unbound_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);

root_pri_update_complete:
	root_clutch->scr_priority = MAX(root_bound_pri, root_unbound_pri);
}

/*
 * sched_clutch_root_urgency_inc()
 *
 * Routine to increment the urgency at the root level based on the thread
 * priority that is being inserted into the hierarchy. The root urgency
 * counter is updated based on the urgency of threads in any of the
 * clutch buckets which are part of the hierarchy.
 *
 * Always called with the pset lock held.
 */
static void
sched_clutch_root_urgency_inc(
	sched_clutch_root_t root_clutch,
	thread_t thread)
{
	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
		root_clutch->scr_urgency++;
	}
}

/*
 * sched_clutch_root_urgency_dec()
 *
 * Routine to decrement the urgency at the root level based on the thread
 * priority that is being removed from the hierarchy. The root urgency
 * counter is updated based on the urgency of threads in any of the
 * clutch buckets which are part of the hierarchy.
 *
 * Always called with the pset lock held.
 */
static void
sched_clutch_root_urgency_dec(
	sched_clutch_root_t root_clutch,
	thread_t thread)
{
	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
		root_clutch->scr_urgency--;
	}
}

/*
 * Clutch bucket level scheduling
 *
 * The second level of scheduling is the clutch bucket level scheduling
 * which tries to schedule thread groups within root_buckets. Each
 * clutch represents a thread group and a clutch_bucket_group represents
 * threads at a particular sched_bucket within that thread group. The
 * clutch_bucket_group contains a clutch_bucket per cluster on the system,
 * each of which holds the runnable threads destined for execution on that
 * cluster.
 *
 * The goal of this level of scheduling is to allow interactive thread
 * groups low latency access to the CPU. It also provides slight
 * scheduling preference for App and unrestricted thread groups.
 *
 * The clutch bucket scheduling algorithm measures an interactivity
 * score for all clutch bucket groups. The interactivity score is based
 * on the ratio of the CPU used and the voluntary blocking of threads
 * within the clutch bucket group. The algorithm is very close to the ULE
 * scheduler on FreeBSD in terms of calculations. The interactivity
 * score provides an interactivity boost in the range of
 * [0:SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI * 2] which allows interactive
 * thread groups to win over CPU spinners.
 *
 * The interactivity score of the clutch bucket group is combined with the
 * highest base/promoted priority of threads in the clutch bucket to form
 * the overall priority of the clutch bucket.
 */
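
/*
 * Worked example (illustrative values): with the default interactive pri of 8,
 * the interactivity score falls in [0:16]. A mostly-blocked thread group might
 * earn a score near 16, while a CPU-spinning group decays toward 0. If the
 * highest runnable thread in a clutch bucket has base priority 31, the bucket's
 * overall priority would be roughly 31 + score, i.e. 47 for the interactive
 * group versus 31 for the spinner, so the interactive group is picked first.
 */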

/* Priority boost range for interactivity */
#define SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT (8)
uint8_t sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;

/* Window to scale the cpu usage and blocked values (currently 500ms). It is the threshold of used+blocked */
uint64_t sched_clutch_bucket_group_adjust_threshold = 0;
#define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS (500000)

/* The ratio to scale the cpu/blocked time per window */
#define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO (10)

/* Initial value for voluntary blocking time for the clutch_bucket */
#define SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID (uint64_t)(~0)

/* Value indicating the clutch bucket is not pending execution */
#define SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID ((uint64_t)(~0))

/*
 * Thread group CPU starvation avoidance
 *
 * In heavily CPU contended scenarios, it is possible that some thread groups
 * which have a low interactivity score do not get CPU time at all. In order to
 * resolve that, the scheduler tries to ageout the CPU usage of the clutch
 * bucket group when it has been pending execution for a certain time as defined
 * by the sched_clutch_bucket_group_pending_delta_us values below.
 *
 * The values chosen here are very close to the WCEL values for each sched bucket.
 * These values are multiplied by the load average of the relevant root bucket to
 * provide an estimate of the actual clutch bucket load.
 */
static uint32_t sched_clutch_bucket_group_pending_delta_us[TH_BUCKET_SCHED_MAX] = {
	SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
	10000, /* FG */
	37500, /* IN */
	75000, /* DF */
	150000, /* UT */
	250000, /* BG */
};
static uint64_t sched_clutch_bucket_group_pending_delta[TH_BUCKET_SCHED_MAX] = {0};
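
/*
 * For example (illustrative numbers): an IN clutch bucket group that has been
 * pending while the IN root bucket has a load average of 4 would have its CPU
 * usage aged out after roughly 37.5ms * 4 = 150ms of waiting, letting it win
 * CPU time despite a low interactivity score.
 */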

/*
 * sched_clutch_bucket_init()
 *
 * Initializer for clutch buckets.
 */
static void
sched_clutch_bucket_init(
	sched_clutch_bucket_t clutch_bucket,
	sched_clutch_bucket_group_t clutch_bucket_group,
	sched_bucket_t bucket)
{
	clutch_bucket->scb_bucket = bucket;
	/* scb_priority will be recalculated when a thread is inserted in the clutch bucket */
	clutch_bucket->scb_priority = 0;
#if CONFIG_SCHED_EDGE
	clutch_bucket->scb_foreign = false;
	priority_queue_entry_init(&clutch_bucket->scb_foreignlink);
#endif /* CONFIG_SCHED_EDGE */
	clutch_bucket->scb_group = clutch_bucket_group;
	clutch_bucket->scb_root = NULL;
	priority_queue_init(&clutch_bucket->scb_clutchpri_prioq);
	priority_queue_init(&clutch_bucket->scb_thread_runq);
	queue_init(&clutch_bucket->scb_thread_timeshare_queue);
}

/*
 * sched_clutch_bucket_group_init()
 *
 * Initializer for clutch bucket groups.
 */
static void
sched_clutch_bucket_group_init(
	sched_clutch_bucket_group_t clutch_bucket_group,
	sched_clutch_t clutch,
	sched_bucket_t bucket)
{
	bzero(clutch_bucket_group, sizeof(struct sched_clutch_bucket_group));
	clutch_bucket_group->scbg_bucket = bucket;
	clutch_bucket_group->scbg_clutch = clutch;

	int max_clusters = ml_get_cluster_count();
	clutch_bucket_group->scbg_clutch_buckets = kalloc_type(struct sched_clutch_bucket, max_clusters, Z_WAITOK | Z_ZERO);
	for (int i = 0; i < max_clusters; i++) {
		sched_clutch_bucket_init(&clutch_bucket_group->scbg_clutch_buckets[i], clutch_bucket_group, bucket);
	}

	os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, 0, relaxed);
	os_atomic_store(&clutch_bucket_group->scbg_pri_shift, INT8_MAX, relaxed);
	os_atomic_store(&clutch_bucket_group->scbg_preferred_cluster, pset0.pset_cluster_id, relaxed);
	/*
	 * All thread groups should be initialized to be interactive; this allows the newly launched
	 * thread groups to fairly compete with already running thread groups.
	 */
	clutch_bucket_group->scbg_interactivity_data.scct_count = (sched_clutch_bucket_group_interactive_pri * 2);
	clutch_bucket_group->scbg_interactivity_data.scct_timestamp = 0;
	os_atomic_store(&clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked, (clutch_cpu_data_t)sched_clutch_bucket_group_adjust_threshold, relaxed);
#if !__LP64__
	lck_spin_init(&clutch_bucket_group->scbg_stats_lock, &pset_lck_grp, NULL);
#endif /* !__LP64__ */
	clutch_bucket_group->scbg_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
	clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
	clutch_bucket_group->scbg_amp_rebalance_last_chosen = UINT32_MAX;
}

static void
sched_clutch_bucket_group_destroy(
	sched_clutch_bucket_group_t clutch_bucket_group)
{
	kfree_type(struct sched_clutch_bucket, ml_get_cluster_count(),
	    clutch_bucket_group->scbg_clutch_buckets);
}

/*
 * sched_clutch_init_with_thread_group()
 *
 * Initialize the sched_clutch when the thread group is being created
 */
void
sched_clutch_init_with_thread_group(
	sched_clutch_t clutch,
	struct thread_group *tg)
{
	os_atomic_store(&clutch->sc_thr_count, 0, relaxed);

	/* Initialize all the clutch buckets */
	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
		sched_clutch_bucket_group_init(&(clutch->sc_clutch_groups[i]), clutch, i);
	}

	/* Grouping specific fields */
	clutch->sc_tg = tg;
}

/*
 * sched_clutch_destroy()
 *
 * Destructor for clutch; called from thread group release code.
 */
void
sched_clutch_destroy(
	sched_clutch_t clutch)
{
	assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0);
	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
		sched_clutch_bucket_group_destroy(&(clutch->sc_clutch_groups[i]));
	}
}

#if CONFIG_SCHED_EDGE

/*
 * Edge Scheduler Preferred Cluster Mechanism
 *
 * In order to have better control over various QoS buckets within a thread group, the Edge
 * scheduler allows CLPC to specify a preferred cluster for each QoS level in a TG. These
 * preferences are stored at the sched_clutch_bucket_group level since that represents all
 * threads at a particular QoS level within a sched_clutch. For any lookup of preferred
 * cluster, the logic always goes back to the preference stored at the clutch_bucket_group.
 */

static uint32_t
sched_edge_clutch_bucket_group_preferred_cluster(sched_clutch_bucket_group_t clutch_bucket_group)
{
	return os_atomic_load(&clutch_bucket_group->scbg_preferred_cluster, relaxed);
}

static uint32_t
sched_clutch_bucket_preferred_cluster(sched_clutch_bucket_t clutch_bucket)
{
	return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket->scb_group);
}

uint32_t
sched_edge_thread_preferred_cluster(thread_t thread)
{
	if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
		/* For threads bound to a specific cluster, return the bound cluster id */
		return sched_edge_thread_bound_cluster_id(thread);
	}

	sched_clutch_t clutch = sched_clutch_for_thread(thread);
	sched_bucket_t sched_bucket = thread->th_sched_bucket;
	if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
		sched_bucket = sched_clutch_thread_bucket_map(thread, thread->base_pri);
	}
	sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[sched_bucket];
	return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket_group);
}

/*
 * Edge Scheduler Foreign Bucket Support
 *
 * In the Edge Scheduler, each cluster maintains a priority queue of clutch buckets containing
 * threads that are not native to the cluster. A clutch bucket is considered native if its
 * preferred cluster has the same type as the cluster it's enqueued in. The foreign clutch
 * bucket priority queue is used for rebalance operations to get threads back to their native
 * cluster quickly.
 *
 * It is possible to make this policy even more aggressive by considering all clusters that
 * are not the preferred cluster as foreign, but that would mean a lot of thread
 * migrations which might have performance implications.
 */

static void
sched_clutch_bucket_mark_native(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
{
	if (clutch_bucket->scb_foreign) {
		clutch_bucket->scb_foreign = false;
		priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
	}
}

static void
sched_clutch_bucket_mark_foreign(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
{
	if (!clutch_bucket->scb_foreign) {
		clutch_bucket->scb_foreign = true;
		priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
		priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
	}
}

/*
 * Edge Scheduler Cumulative Load Average
 *
 * The Edge scheduler maintains a per-QoS/scheduling bucket load average for
 * making thread migration decisions. The per-bucket load is maintained as a
 * cumulative count since higher scheduling buckets impact load on lower buckets
 * for thread migration decisions.
 */
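
/*
 * For example (illustrative counts): with one runnable FIXPRI thread and one
 * runnable UT thread on a cluster, the cumulative run counts would read
 * FIXPRI=1, FG=1, IN=1, DF=1, UT=2, BG=2, i.e. the count at each bucket
 * includes every runnable thread at that bucket or any higher bucket.
 */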
1282
1283 static void
sched_edge_cluster_cumulative_count_incr(sched_clutch_root_t root_clutch,sched_bucket_t bucket)1284 sched_edge_cluster_cumulative_count_incr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1285 {
1286 switch (bucket) {
1287 case TH_BUCKET_FIXPRI: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1288 case TH_BUCKET_SHARE_FG: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1289 case TH_BUCKET_SHARE_IN: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1290 case TH_BUCKET_SHARE_DF: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1291 case TH_BUCKET_SHARE_UT: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1292 case TH_BUCKET_SHARE_BG: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1293 default:
1294 panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_incr()");
1295 }
1296 }
1297
1298 static void
1299 sched_edge_cluster_cumulative_count_decr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1300 {
1301 switch (bucket) {
1302 case TH_BUCKET_FIXPRI: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1303 case TH_BUCKET_SHARE_FG: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1304 case TH_BUCKET_SHARE_IN: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1305 case TH_BUCKET_SHARE_DF: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1306 case TH_BUCKET_SHARE_UT: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1307 case TH_BUCKET_SHARE_BG: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1308 default:
1309 panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_decr()");
1310 }
1311 }
1312
1313 uint16_t
1314 sched_edge_cluster_cumulative_count(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1315 {
1316 return os_atomic_load(&root_clutch->scr_cumulative_run_count[bucket], relaxed);
1317 }
1318
1319 #endif /* CONFIG_SCHED_EDGE */
1320
1321 /*
1322 * sched_clutch_bucket_hierarchy_insert()
1323 *
1324 * Routine to insert a newly runnable clutch_bucket into the root hierarchy.
1325 */
1326 static void
1327 sched_clutch_bucket_hierarchy_insert(
1328 sched_clutch_root_t root_clutch,
1329 sched_clutch_bucket_t clutch_bucket,
1330 sched_bucket_t bucket,
1331 uint64_t timestamp,
1332 sched_clutch_bucket_options_t options)
1333 {
1334 sched_clutch_hierarchy_locked_assert(root_clutch);
1335 if (bucket > TH_BUCKET_FIXPRI) {
1336 /* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */
1337 enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink);
1338 }
1339 #if CONFIG_SCHED_EDGE
1340 /* Check if the bucket is a foreign clutch bucket and add it to the foreign buckets list */
1341 uint32_t preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
1342 if (pset_type_for_id(preferred_cluster) != pset_type_for_id(root_clutch->scr_cluster_id)) {
1343 sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
1344 }
1345 #endif /* CONFIG_SCHED_EDGE */
1346 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1347
1348 /* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */
1349 if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1350 sched_clutch_root_bucket_runnable(root_bucket, root_clutch, timestamp);
1351 }
1352
1353 /* Insert the clutch bucket into the root bucket run queue with order based on options */
1354 sched_clutch_bucket_runq_enqueue(&root_bucket->scrb_clutch_buckets, clutch_bucket, options);
1355 os_atomic_store(&clutch_bucket->scb_root, root_clutch, relaxed);
1356 os_atomic_inc(&sched_clutch_global_bucket_load[bucket], relaxed);
1357 }
1358
1359 /*
1360 * sched_clutch_bucket_hierarchy_remove()
1361 *
1362 * Routine to remove an empty clutch bucket from the root hierarchy.
1363 */
1364 static void
1365 sched_clutch_bucket_hierarchy_remove(
1366 sched_clutch_root_t root_clutch,
1367 sched_clutch_bucket_t clutch_bucket,
1368 sched_bucket_t bucket,
1369 uint64_t timestamp,
1370 __unused sched_clutch_bucket_options_t options)
1371 {
1372 sched_clutch_hierarchy_locked_assert(root_clutch);
1373 if (bucket > TH_BUCKET_FIXPRI) {
1374 /* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */
1375 remqueue(&clutch_bucket->scb_listlink);
1376 }
1377 #if CONFIG_SCHED_EDGE
1378 sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
1379 #endif /* CONFIG_SCHED_EDGE */
1380
1381 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1382
1383 /* Remove the clutch bucket from the root bucket priority queue */
1384 sched_clutch_bucket_runq_remove(&root_bucket->scrb_clutch_buckets, clutch_bucket);
1385 os_atomic_store(&clutch_bucket->scb_root, NULL, relaxed);
1386
1387 /* If the root bucket priority queue is now empty, remove it from the root priority queue */
1388 if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1389 sched_clutch_root_bucket_empty(root_bucket, root_clutch, timestamp);
1390 }
1391 os_atomic_dec(&sched_clutch_global_bucket_load[bucket], relaxed);
1392 }
1393
1394 /*
1395 * sched_clutch_bucket_base_pri()
1396 *
1397 * Calculates the "base" priority of the clutch bucket, which is equal to the max of the
1398 * highest base_pri and the highest sched_pri in the clutch bucket.
1399 */
1400 static uint8_t
1401 sched_clutch_bucket_base_pri(
1402 sched_clutch_bucket_t clutch_bucket)
1403 {
1404 assert(priority_queue_empty(&clutch_bucket->scb_thread_runq) == false);
1405 /*
1406 * Since the clutch bucket can contain threads that are members of the group due
1407 * to the sched_pri being promoted or due to their base pri, the base priority of
1408 * the entire clutch bucket should be based on the highest thread (promoted or base)
1409 * in the clutch bucket.
1410 */
1411 uint8_t max_pri = 0;
1412 if (!priority_queue_empty(&clutch_bucket->scb_clutchpri_prioq)) {
1413 max_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
1414 }
1415 return max_pri;
1416 }
1417
1418 /*
1419 * sched_clutch_interactivity_from_cpu_data()
1420 *
1421 * Routine to calculate the interactivity score of a clutch bucket group from its CPU usage
1422 */
1423 static uint8_t
1424 sched_clutch_interactivity_from_cpu_data(sched_clutch_bucket_group_t clutch_bucket_group)
1425 {
1426 sched_clutch_bucket_cpu_data_t scb_cpu_data;
1427 scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, relaxed);
1428 clutch_cpu_data_t cpu_used = scb_cpu_data.cpu_data.scbcd_cpu_used;
1429 clutch_cpu_data_t cpu_blocked = scb_cpu_data.cpu_data.scbcd_cpu_blocked;
1430 uint8_t interactive_score = 0;
1431
1432 if ((cpu_blocked == 0) && (cpu_used == 0)) {
1433 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
1434 }
1435 /*
1436 * For all timeshare buckets, calculate the interactivity score of the bucket
1437 * and add it to the base priority
1438 */
1439 if (cpu_blocked > cpu_used) {
1440 /* Interactive clutch_bucket case */
1441 interactive_score = sched_clutch_bucket_group_interactive_pri +
1442 ((sched_clutch_bucket_group_interactive_pri * (cpu_blocked - cpu_used)) / cpu_blocked);
1443 } else {
1444 /* Non-interactive clutch_bucket case */
1445 interactive_score = ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) / cpu_used);
1446 }
1447 return interactive_score;
1448 }
1449
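/*
 * Worked example with hypothetical values (assuming
 * sched_clutch_bucket_group_interactive_pri == 8; the shipping default may
 * differ): a bucket group with cpu_blocked = 60 and cpu_used = 20 falls into
 * the interactive case above, so the score is 8 + (8 * (60 - 20)) / 60 = 13.
 * With the ratio reversed (cpu_blocked = 20, cpu_used = 60) the
 * non-interactive case applies and the score is (8 * 20) / 60 = 2.
 */
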
1450 /*
1451 * sched_clutch_bucket_pri_calculate()
1452 *
1453 * The priority calculation algorithm for the clutch_bucket is a slight
1454 * modification on the ULE interactivity score. It uses the base priority
1455 * of the clutch bucket and applies an interactivity score boost to the
1456 * highly responsive clutch buckets.
1457 */
1458 static uint8_t
1459 sched_clutch_bucket_pri_calculate(
1460 sched_clutch_bucket_t clutch_bucket,
1461 uint64_t timestamp)
1462 {
1463 /* For empty clutch buckets, return priority 0 */
1464 if (clutch_bucket->scb_thr_count == 0) {
1465 return 0;
1466 }
1467
1468 uint8_t base_pri = sched_clutch_bucket_base_pri(clutch_bucket);
1469 uint8_t interactive_score = sched_clutch_bucket_group_interactivity_score_calculate(clutch_bucket->scb_group, timestamp);
1470
1471 assert(((uint64_t)base_pri + interactive_score) <= UINT8_MAX);
1472 uint8_t pri = base_pri + interactive_score;
1473 if (pri != clutch_bucket->scb_priority) {
1474 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_PRI) | DBG_FUNC_NONE,
1475 thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, pri, interactive_score, 0);
1476 }
1477 return pri;
1478 }
1479
1480 /*
1481 * sched_clutch_root_bucket_highest_clutch_bucket()
1482 *
1483 * Routine to find the highest priority clutch bucket
1484 * within the root bucket.
1485 */
1486 static sched_clutch_bucket_t
1487 sched_clutch_root_bucket_highest_clutch_bucket(
1488 sched_clutch_root_bucket_t root_bucket)
1489 {
1490 if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1491 return NULL;
1492 }
1493 return sched_clutch_bucket_runq_peek(&root_bucket->scrb_clutch_buckets);
1494 }
1495
1496 /*
1497 * sched_clutch_bucket_runnable()
1498 *
1499 * Perform all operations needed when a new clutch bucket becomes runnable.
1500 * It involves inserting the clutch_bucket into the hierarchy and updating the
1501 * root priority appropriately.
1502 */
1503 static boolean_t
1504 sched_clutch_bucket_runnable(
1505 sched_clutch_bucket_t clutch_bucket,
1506 sched_clutch_root_t root_clutch,
1507 uint64_t timestamp,
1508 sched_clutch_bucket_options_t options)
1509 {
1510 sched_clutch_hierarchy_locked_assert(root_clutch);
1511 /* Since the clutch bucket became newly runnable, update its pending timestamp */
1512 clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1513 sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1514
1515 /* Update the timesharing properties of this clutch_bucket; also done every sched_tick */
1516 sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, timestamp);
1517 int16_t root_old_pri = root_clutch->scr_priority;
1518 sched_clutch_root_pri_update(root_clutch);
1519 return root_clutch->scr_priority > root_old_pri;
1520 }
1521
1522 /*
1523 * sched_clutch_bucket_update()
1524 *
1525 * Update the clutch_bucket's position in the hierarchy. This routine is
1526 * called when a new thread is inserted or removed from a runnable clutch
1527 * bucket. The options specify some properties about the clutch bucket
1528 * insertion order into the clutch bucket runq.
1529 */
1530 static boolean_t
1531 sched_clutch_bucket_update(
1532 sched_clutch_bucket_t clutch_bucket,
1533 sched_clutch_root_t root_clutch,
1534 uint64_t timestamp,
1535 sched_clutch_bucket_options_t options)
1536 {
1537 sched_clutch_hierarchy_locked_assert(root_clutch);
1538 uint64_t new_pri = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1539 sched_clutch_bucket_runq_t bucket_runq = &root_clutch->scr_unbound_buckets[clutch_bucket->scb_bucket].scrb_clutch_buckets;
1540 if (new_pri == clutch_bucket->scb_priority) {
1541 /*
1542 * If SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR is specified, move the clutch bucket
1543 * to the end of the runq. Typically used when a thread is selected for execution
1544 * from a clutch bucket.
1545 */
1546 if (options & SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR) {
1547 sched_clutch_bucket_runq_rotate(bucket_runq, clutch_bucket);
1548 }
1549 return false;
1550 }
1551 sched_clutch_bucket_runq_remove(bucket_runq, clutch_bucket);
1552 #if CONFIG_SCHED_EDGE
1553 if (clutch_bucket->scb_foreign) {
1554 priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1555 }
1556 #endif /* CONFIG_SCHED_EDGE */
1557 clutch_bucket->scb_priority = new_pri;
1558 #if CONFIG_SCHED_EDGE
1559 if (clutch_bucket->scb_foreign) {
1560 priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
1561 priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1562 }
1563 #endif /* CONFIG_SCHED_EDGE */
1564 sched_clutch_bucket_runq_enqueue(bucket_runq, clutch_bucket, options);
1565
1566 int16_t root_old_pri = root_clutch->scr_priority;
1567 sched_clutch_root_pri_update(root_clutch);
1568 return root_clutch->scr_priority > root_old_pri;
1569 }
1570
1571 /*
1572 * sched_clutch_bucket_empty()
1573 *
1574 * Perform all the operations needed when a clutch_bucket is no longer runnable.
1575 * It involves removing the clutch bucket from the hierarchy and updating the root
1576 * priority appropriately.
1577 */
1578 static void
1579 sched_clutch_bucket_empty(
1580 sched_clutch_bucket_t clutch_bucket,
1581 sched_clutch_root_t root_clutch,
1582 uint64_t timestamp,
1583 sched_clutch_bucket_options_t options)
1584 {
1585 sched_clutch_hierarchy_locked_assert(root_clutch);
1586 sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1587 clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1588 sched_clutch_root_pri_update(root_clutch);
1589 }
1590
1591 /*
1592 * sched_clutch_cpu_usage_update()
1593 *
1594 * Routine to update CPU usage of the thread in the hierarchy.
1595 */
1596 void
1597 sched_clutch_cpu_usage_update(
1598 thread_t thread,
1599 uint64_t delta)
1600 {
1601 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread) || SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1602 return;
1603 }
1604
1605 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1606 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
1607 sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, delta);
1608 }
1609
1610 /*
1611 * sched_clutch_bucket_group_cpu_usage_update()
1612 *
1613 * Routine to update the CPU usage of the clutch bucket group.
1614 */
1615 static void
1616 sched_clutch_bucket_group_cpu_usage_update(
1617 sched_clutch_bucket_group_t clutch_bucket_group,
1618 uint64_t delta)
1619 {
1620 if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
1621 /* The Above UI bucket always has the maximum interactivity score, so there is nothing to do here */
1622 return;
1623 }
1624 delta = MIN(delta, sched_clutch_bucket_group_adjust_threshold);
1625 os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_used), (clutch_cpu_data_t)delta, relaxed);
1626 }
1627
1628 /*
1629 * sched_clutch_bucket_group_cpu_pending_adjust()
1630 *
1631 * Routine to calculate the adjusted CPU usage value based on the pending intervals. The calculation is done
1632 * such that one "pending interval" provides one point improvement in interactivity score.
1633 */
1634 static inline uint64_t
1635 sched_clutch_bucket_group_cpu_pending_adjust(
1636 uint64_t cpu_used,
1637 uint64_t cpu_blocked,
1638 uint8_t pending_intervals)
1639 {
1640 uint64_t cpu_used_adjusted = 0;
1641 if (cpu_blocked < cpu_used) {
1642 cpu_used_adjusted = (sched_clutch_bucket_group_interactive_pri * cpu_blocked * cpu_used);
1643 cpu_used_adjusted = cpu_used_adjusted / ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) + (cpu_used * pending_intervals));
1644 } else {
1645 uint64_t adjust_factor = (cpu_blocked * pending_intervals) / sched_clutch_bucket_group_interactive_pri;
1646 cpu_used_adjusted = (adjust_factor > cpu_used) ? 0 : (cpu_used - adjust_factor);
1647 }
1648 return cpu_used_adjusted;
1649 }
1650
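/*
 * Worked example with hypothetical values (again assuming an interactive pri
 * of 8): for cpu_used = 100 and cpu_blocked = 200 (cpu_blocked >= cpu_used,
 * so the second branch above) with pending_intervals = 2, adjust_factor =
 * (200 * 2) / 8 = 50 and the adjusted usage is 100 - 50 = 50; the lower usage
 * translates into a higher interactivity score on the next calculation.
 */
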
1651 /*
1652 * sched_clutch_bucket_group_cpu_adjust()
1653 *
1654 * Routine to scale the cpu usage and blocked time once the sum gets bigger
1655 * than sched_clutch_bucket_group_adjust_threshold. Allows the values to remain
1656 * manageable and maintain the same ratio while allowing clutch buckets to
1657 * adjust behavior and reflect in the interactivity score in a reasonable
1658 * amount of time. Also adjusts the CPU usage based on pending_intervals
1659 * which allows ageout of CPU to avoid starvation in highly contended scenarios.
1660 */
1661 static void
1662 sched_clutch_bucket_group_cpu_adjust(
1663 sched_clutch_bucket_group_t clutch_bucket_group,
1664 uint8_t pending_intervals)
1665 {
1666 sched_clutch_bucket_cpu_data_t old_cpu_data = {};
1667 sched_clutch_bucket_cpu_data_t new_cpu_data = {};
1668 os_atomic_rmw_loop(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, old_cpu_data.scbcd_cpu_data_packed, new_cpu_data.scbcd_cpu_data_packed, relaxed, {
1669 clutch_cpu_data_t cpu_used = old_cpu_data.cpu_data.scbcd_cpu_used;
1670 clutch_cpu_data_t cpu_blocked = old_cpu_data.cpu_data.scbcd_cpu_blocked;
1671
1672 if ((pending_intervals == 0) && (cpu_used + cpu_blocked) < sched_clutch_bucket_group_adjust_threshold) {
1673 /* No changes to the CPU used and blocked values */
1674 os_atomic_rmw_loop_give_up();
1675 }
1676 if ((cpu_used + cpu_blocked) >= sched_clutch_bucket_group_adjust_threshold) {
1677 /* Only keep the recent CPU history to better indicate how this TG has been behaving */
1678 cpu_used = cpu_used / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
1679 cpu_blocked = cpu_blocked / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
1680 }
1681 /* Use the shift passed in to ageout the CPU usage */
1682 cpu_used = (clutch_cpu_data_t)sched_clutch_bucket_group_cpu_pending_adjust(cpu_used, cpu_blocked, pending_intervals);
1683 new_cpu_data.cpu_data.scbcd_cpu_used = cpu_used;
1684 new_cpu_data.cpu_data.scbcd_cpu_blocked = cpu_blocked;
1685 });
1686 }
1687
1688 /*
1689 * Thread level scheduling algorithm
1690 *
1691 * The thread level scheduling algorithm uses the mach timeshare
1692 * decay based algorithm to achieve sharing between threads within the
1693 * same clutch bucket. The load/priority shifts etc. are all maintained
1694 * at the clutch bucket level and used for decay calculation of the
1695 * threads. The load sampling is still driven off the scheduler tick
1696 * for runnable clutch buckets (it does not use the new higher frequency
1697 * EWMA based load calculation). The idea is that the contention and load
1698 * within clutch_buckets should be limited enough to not see heavy decay
1699 * and timeshare effectively.
1700 */
1701
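/*
 * A rough sketch of the relationship (hedged; the exact computation lives in
 * the timeshare priority code, not in this file): each thread decays roughly
 * as sched_pri = base_pri - (sched_usage >> pri_shift), where pri_shift comes
 * from the thread's clutch bucket group (scbg_pri_shift below) rather than
 * from a global runqueue load, so threads only decay against the load of
 * their own QoS bucket within the thread group.
 */
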
1702 /*
1703 * sched_clutch_thread_run_bucket_incr() / sched_clutch_run_bucket_incr()
1704 *
1705 * Increment the run count for the clutch bucket associated with the
1706 * thread.
1707 */
1708 uint32_t
1709 sched_clutch_thread_run_bucket_incr(
1710 thread_t thread,
1711 sched_bucket_t bucket)
1712 {
1713 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1714 return 0;
1715 }
1716 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1717 return sched_clutch_run_bucket_incr(clutch, bucket);
1718 }
1719
1720 static uint32_t
1721 sched_clutch_run_bucket_incr(
1722 sched_clutch_t clutch,
1723 sched_bucket_t bucket)
1724 {
1725 assert(bucket != TH_BUCKET_RUN);
1726 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
1727 return sched_clutch_bucket_group_run_count_inc(clutch_bucket_group);
1728 }
1729
1730 /*
1731 * sched_clutch_thread_run_bucket_decr() / sched_clutch_run_bucket_decr()
1732 *
1733 * Decrement the run count for the clutch bucket associated with the
1734 * thread.
1735 */
1736 uint32_t
1737 sched_clutch_thread_run_bucket_decr(
1738 thread_t thread,
1739 sched_bucket_t bucket)
1740 {
1741 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1742 return 0;
1743 }
1744 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1745 return sched_clutch_run_bucket_decr(clutch, bucket);
1746 }
1747
1748 static uint32_t
1749 sched_clutch_run_bucket_decr(
1750 sched_clutch_t clutch,
1751 sched_bucket_t bucket)
1752 {
1753 assert(bucket != TH_BUCKET_RUN);
1754 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
1755 return sched_clutch_bucket_group_run_count_dec(clutch_bucket_group);
1756 }
1757
1758 /*
1759 * sched_clutch_bucket_group_timeshare_update()
1760 *
1761 * Routine to update the load and priority shift for the clutch_bucket_group
1762 * every sched_tick. For multi-cluster platforms, each QoS level will have multiple
1763 * clutch buckets with runnable threads in them. So it is important to maintain
1764 * the timesharing information at the clutch_bucket_group level instead of
1765 * individual clutch buckets (because the algorithm is trying to timeshare all
1766 * threads at the same QoS irrespective of which hierarchy they are enqueued in).
1767 *
1768 * The routine is called from the sched tick handling code to make sure this value
1769 * is updated at least once every sched tick. For clutch bucket groups which have
1770 * not been runnable for very long, the clutch_bucket_group maintains a "last
1771 * updated schedtick" parameter. As threads become runnable in the clutch bucket group,
1772 * if this value is outdated, the load and shifts are updated.
1773 *
1774 * Possible optimization:
1775 * - The current algorithm samples the load every sched tick (125ms).
1776 * This is prone to spikes in runnable counts; if that turns out to be
1777 * a problem, a simple solution would be to do the EWMA trick to sample
1778 * load at every load_tick (30ms) and use the averaged value for the pri
1779 * shift calculation.
1780 */
1781 static void
1782 sched_clutch_bucket_group_timeshare_update(
1783 sched_clutch_bucket_group_t clutch_bucket_group,
1784 sched_clutch_bucket_t clutch_bucket,
1785 uint64_t ctime)
1786 {
1787 if (clutch_bucket_group->scbg_bucket < TH_BUCKET_SHARE_FG) {
1788 /* No timesharing needed for fixed priority Above UI threads */
1789 return;
1790 }
1791
1792 /*
1793 * Update the timeshare parameters for the clutch bucket group
1794 * if they haven't been updated in this tick.
1795 */
1796 uint32_t sched_ts = os_atomic_load(&clutch_bucket_group->scbg_timeshare_tick, relaxed);
1797 uint32_t current_sched_ts = sched_tick;
1798 if (sched_ts < current_sched_ts) {
1799 os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, current_sched_ts, relaxed);
1800 /* NCPU wide workloads should not experience decay */
1801 uint64_t bucket_group_run_count = os_atomic_load_wide(&clutch_bucket_group->scbg_blocked_data.scct_count, relaxed) - 1;
1802 uint32_t bucket_group_load = (uint32_t)(bucket_group_run_count / processor_avail_count);
1803 bucket_group_load = MIN(bucket_group_load, NRQS - 1);
1804 uint32_t pri_shift = sched_fixed_shift - sched_load_shifts[bucket_group_load];
1805 /* Ensure that the pri_shift value is reasonable */
1806 pri_shift = (pri_shift > SCHED_PRI_SHIFT_MAX) ? INT8_MAX : pri_shift;
1807 os_atomic_store(&clutch_bucket_group->scbg_pri_shift, pri_shift, relaxed);
1808 }
1809
1810 /*
1811 * Update the clutch bucket priority; this allows clutch buckets that have been pending
1812 * for a long time to get an updated interactivity score.
1813 */
1814 sched_clutch_bucket_update(clutch_bucket, clutch_bucket->scb_root, ctime, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
1815 }
1816
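/*
 * Worked example with hypothetical numbers: with 24 runnable/running threads
 * in a clutch bucket group on an 8-CPU system, bucket_group_run_count is 23
 * (the "NCPU wide" adjustment above subtracts one) and bucket_group_load is
 * 23 / 8 = 2, so pri_shift becomes sched_fixed_shift - sched_load_shifts[2].
 * The larger the load, the smaller the shift and the faster threads in the
 * group decay.
 */
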
1817 /*
1818 * sched_clutch_thread_clutch_update()
1819 *
1820 * Routine called when the thread changes its thread group. The current
1821 * implementation relies on the fact that the thread group is changed only from
1822 * the context of the thread itself or when the thread is runnable but not in a
1823 * runqueue. Due to this fact, the thread group change causes only counter
1824 * updates in the old & new clutch buckets and no hierarchy changes. The routine
1825 * also attributes the CPU used so far to the old clutch.
1826 */
1827 void
1828 sched_clutch_thread_clutch_update(
1829 thread_t thread,
1830 sched_clutch_t old_clutch,
1831 sched_clutch_t new_clutch)
1832 {
1833 uint32_t cpu_delta;
1834
1835 if (old_clutch) {
1836 assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
1837
1838 sched_clutch_run_bucket_decr(old_clutch, thread->th_sched_bucket);
1839 /*
1840 * Calculate the CPU used by this thread in the old bucket and
1841 * add it to the old clutch bucket. This uses the same CPU usage
1842 * logic as update_priority etc.
1843 */
1844 sched_tick_delta(thread, cpu_delta);
1845 if (thread->pri_shift < INT8_MAX) {
1846 thread->sched_usage += cpu_delta;
1847 }
1848 thread->cpu_delta += cpu_delta;
1849 if (!SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1850 sched_clutch_bucket_group_t clutch_bucket_group = &(old_clutch->sc_clutch_groups[thread->th_sched_bucket]);
1851 sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, cpu_delta);
1852 }
1853 }
1854
1855 if (new_clutch) {
1856 sched_clutch_run_bucket_incr(new_clutch, thread->th_sched_bucket);
1857 }
1858 }
1859
1860 /* Thread Insertion/Removal/Selection routines */
1861
1862 #if CONFIG_SCHED_EDGE
1863
1864 /*
1865 * Edge Scheduler Bound Thread Support
1866 *
1867 * The edge scheduler allows threads to be bound to specific clusters. The scheduler
1868 * maintains a separate runq on the clutch root to hold these bound threads. These
1869 * bound threads count towards the root priority and thread count, but are ignored
1870 * for thread migration/steal decisions. Bound threads that are enqueued in the
1871 * separate runq have the th_bound_cluster_enqueued flag set to allow easy
1872 * removal.
1873 *
1874 * Bound Threads Timesharing
1875 * The bound threads share the timesharing properties of the clutch bucket group they are
1876 * part of. They contribute to the load and use priority shifts/decay values from the
1877 * clutch bucket group.
1878 */
1879
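/*
 * For example (hypothetical scenario): a UT thread hard-bound to cluster 2 is
 * enqueued on cluster 2's scr_bound_buckets[TH_BUCKET_SHARE_UT] runq below.
 * It raises that root's thread count and priority like any other thread, but
 * an idle CPU on another cluster will not steal it, and its pri_shift/decay
 * values still come from the clutch bucket group it belongs to.
 */
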
1880 static boolean_t
1881 sched_edge_bound_thread_insert(
1882 sched_clutch_root_t root_clutch,
1883 thread_t thread,
1884 integer_t options)
1885 {
1886 /* Update the clutch runnable count and priority */
1887 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
1888 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
1889 if (root_bucket->scrb_bound_thread_runq.count == 0) {
1890 sched_clutch_root_bucket_runnable(root_bucket, root_clutch, mach_absolute_time());
1891 }
1892
1893 assert((thread->th_bound_cluster_enqueued) == false);
1894 run_queue_enqueue(&root_bucket->scrb_bound_thread_runq, thread, options);
1895 thread->th_bound_cluster_enqueued = true;
1896
1897 int16_t root_old_pri = root_clutch->scr_priority;
1898 sched_clutch_root_pri_update(root_clutch);
1899 return root_clutch->scr_priority > root_old_pri;
1900 }
1901
1902 static void
1903 sched_edge_bound_thread_remove(
1904 sched_clutch_root_t root_clutch,
1905 thread_t thread)
1906 {
1907 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
1908 assert((thread->th_bound_cluster_enqueued) == true);
1909 run_queue_remove(&root_bucket->scrb_bound_thread_runq, thread);
1910 thread->th_bound_cluster_enqueued = false;
1911
1912 /* Update the clutch runnable count and priority */
1913 sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
1914 if (root_bucket->scrb_bound_thread_runq.count == 0) {
1915 sched_clutch_root_bucket_empty(root_bucket, root_clutch, mach_absolute_time());
1916 }
1917 sched_clutch_root_pri_update(root_clutch);
1918 }
1919
1920 /*
1921 * Edge Scheduler cluster shared resource threads load balancing
1922 *
1923 * The Edge scheduler attempts to load balance cluster shared resource intensive threads
1924 * across clusters in order to reduce contention on the shared resources. It achieves
1925 * that by maintaining the runnable and running shared resource load on each cluster
1926 * and balancing the load across multiple clusters.
1927 *
1928 * The current implementation for cluster shared resource load balancing looks at
1929 * the per-cluster load at thread runnable time to enqueue the thread in the appropriate
1930 * cluster. The thread is enqueued in the cluster bound runqueue to ensure idle CPUs
1931 * do not steal/rebalance shared resource threads. Some more details for the implementation:
1932 *
1933 * - When threads are tagged as shared resource, they go through the cluster selection logic
1934 * which looks at cluster shared resource loads and picks a cluster accordingly. The thread is
1935 * enqueued in the cluster bound runqueue.
1936 *
1937 * - When the threads start running and call avoid_processor, the load balancing logic will be
1938 * invoked and cause the thread to be sent to a more preferred cluster if one exists and has
1939 * no shared resource load.
1940 *
1941 * - If a CPU in a preferred cluster is going idle and that cluster has no more shared load,
1942 * it will look at running shared resource threads on foreign clusters and actively rebalance them.
1943 *
1944 * - Runnable shared resource threads are not stolen by the preferred cluster CPUs as they
1945 * go idle intentionally.
1946 *
1947 * - One caveat of this design is that if a preferred CPU has already run and finished its shared
1948 * resource thread execution, it will not go out and steal the runnable thread in the non-preferred cluster.
1949 * The rebalancing will happen when the thread actually runs on a non-preferred cluster and one of the
1950 * events listed above happen.
1951 *
1952 * - The implementation also does not currently consider other properties, such as thread
1953 * priority and per-QoS thread load, in the placement decision.
1954 *
1955 * Edge Scheduler cluster shared resource thread scheduling policy
1956 *
1957 * The threads for shared resources can be scheduled using one of the two policies:
1958 *
1959 * EDGE_SHARED_RSRC_SCHED_POLICY_RR
1960 * This policy distributes the threads so that they spread across all available clusters
1961 * irrespective of type. The idea is that this scheduling policy will put a shared resource
1962 * thread on each cluster on the platform before it starts doubling up on clusters.
1963 *
1964 * EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST
1965 * This policy distributes threads so that the threads first fill up all the capacity on
1966 * the preferred cluster and its homogeneous peers before spilling to different core type.
1967 * The current implementation defines capacity based on the number of CPUs in the cluster;
1968 * so a cluster's shared resource is considered full if there are "n" runnable + running
1969 * shared resource threads on the cluster with n cpus. This policy is different from the
1970 * default scheduling policy of the edge scheduler since this always tries to fill up the
1971 * native clusters to capacity even when non-native clusters might be idle.
1972 */
1973 __options_decl(edge_shared_rsrc_sched_policy_t, uint32_t, {
1974 EDGE_SHARED_RSRC_SCHED_POLICY_RR = 0,
1975 EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST = 1,
1976 });
1977
1978 const edge_shared_rsrc_sched_policy_t edge_shared_rsrc_policy[CLUSTER_SHARED_RSRC_TYPE_COUNT] = {
1979 [CLUSTER_SHARED_RSRC_TYPE_RR] = EDGE_SHARED_RSRC_SCHED_POLICY_RR,
1980 [CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST,
1981 };
1982
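/*
 * Illustrative example of the two policies (hypothetical 2-cluster system with
 * one 4-CPU P-cluster and one 4-CPU E-cluster, P preferred): under
 * EDGE_SHARED_RSRC_SCHED_POLICY_RR, the first two shared-resource threads land
 * one per cluster before any cluster takes a second one. Under
 * EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST, the first four fill the
 * P-cluster (capacity == number of CPUs) and only the fifth spills to the
 * E-cluster.
 */
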
1983 static void
1984 sched_edge_shared_rsrc_runnable_load_incr(sched_clutch_root_t root_clutch, thread_t thread)
1985 {
1986 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
1987 root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_RR]++;
1988 thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_RR] = true;
1989 }
1990 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
1991 root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST]++;
1992 thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = true;
1993 }
1994 }
1995
1996 static void
1997 sched_edge_shared_rsrc_runnable_load_decr(sched_clutch_root_t root_clutch, thread_t thread)
1998 {
1999 for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
2000 if (thread->th_shared_rsrc_enqueued[shared_rsrc_type]) {
2001 thread->th_shared_rsrc_enqueued[shared_rsrc_type] = false;
2002 root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type]--;
2003 }
2004 }
2005 }
2006
2007 uint16_t
2008 sched_edge_shared_rsrc_runnable_load(sched_clutch_root_t root_clutch, cluster_shared_rsrc_type_t shared_rsrc_type)
2009 {
2010 return root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type];
2011 }
2012
2013 /*
2014 * sched_edge_shared_rsrc_idle()
2015 *
2016 * Routine used to determine if the constrained resource for the pset is idle. This is
2017 * used by a CPU going idle to decide if it should rebalance a running shared resource
2018 * thread from a non-preferred cluster.
2019 */
2020 static boolean_t
2021 sched_edge_shared_rsrc_idle(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
2022 {
2023 return sched_pset_cluster_shared_rsrc_load(pset, shared_rsrc_type) == 0;
2024 }
2025
2026 /*
2027 * sched_edge_thread_shared_rsrc_type
2028 *
2029 * This routine decides if a given thread needs special handling for being a
2030 * heavy shared resource user. It is valid for the same thread to be using
2031 * several shared resources at the same time and have multiple policy flags set.
2032 * This routine determines which of those properties will be used for load
2033 * balancing and migration decisions.
2034 */
2035 static cluster_shared_rsrc_type_t
2036 sched_edge_thread_shared_rsrc_type(thread_t thread)
2037 {
2038 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
2039 return CLUSTER_SHARED_RSRC_TYPE_RR;
2040 }
2041 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
2042 return CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST;
2043 }
2044 return CLUSTER_SHARED_RSRC_TYPE_NONE;
2045 }
2046
2047 #endif /* CONFIG_SCHED_EDGE */
2048
2049 /*
2050 * sched_clutch_thread_bound_lookup()
2051 *
2052 * Routine to lookup the highest priority runnable thread in a bounded root bucket.
2053 */
2054 static thread_t
2055 sched_clutch_thread_bound_lookup(
2056 __unused sched_clutch_root_t root_clutch,
2057 sched_clutch_root_bucket_t root_bucket)
2058 {
2059 return run_queue_peek(&root_bucket->scrb_bound_thread_runq);
2060 }
2061
2062 /*
2063 * Clutch Bucket Group Thread Counts and Pending time calculation
2064 *
2065 * The pending time on the clutch_bucket_group allows the scheduler to track if it
2066 * needs to ageout the CPU usage because the clutch_bucket_group has been pending for
2067 * a very long time. The pending time is set to the timestamp as soon as a thread becomes
2068 * runnable. When a thread is picked up for execution from this clutch_bucket_group, the
2069 * pending time is advanced to the time of thread selection.
2070 *
2071 * Since threads for a clutch bucket group can be added or removed from multiple CPUs
2072 * simultaneously, it is important that the updates to thread counts and pending timestamps
2073 * happen atomically. The implementation relies on the following aspects to make that work
2074 * as expected:
2075 * - The clutch scheduler would be deployed on single cluster platforms where the pset lock
2076 * is held when threads are added/removed and pending timestamps are updated
2077 * - The edge scheduler would have support for double wide 128 bit atomics which allow the
2078 * thread count and pending timestamp to be updated atomically.
2079 *
2080 * Clutch bucket group interactivity timestamp and score updates also rely on the properties
2081 * above to atomically update the interactivity score for a clutch bucket group.
2082 */
2083
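/*
 * Illustrative timeline (hypothetical values): a clutch bucket group with one
 * runnable thread records its pending timestamp when the count goes 0 -> 1.
 * If sched_clutch_bucket_group_pending_delta for its bucket is 10ms and the
 * global bucket load is 4, one "pending interval" is 40ms; after the group has
 * waited 120ms without being selected, the ageout below returns 3 and the
 * CPU-usage adjustment earns the group roughly a 3 point interactivity boost.
 */
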
2084 #if CONFIG_SCHED_EDGE
2085
2086 static void
2087 sched_clutch_bucket_group_thr_count_inc(
2088 sched_clutch_bucket_group_t clutch_bucket_group,
2089 uint64_t timestamp)
2090 {
2091 sched_clutch_counter_time_t old_pending_data;
2092 sched_clutch_counter_time_t new_pending_data;
2093 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2094 new_pending_data.scct_count = old_pending_data.scct_count + 1;
2095 new_pending_data.scct_timestamp = old_pending_data.scct_timestamp;
2096 if (old_pending_data.scct_count == 0) {
2097 new_pending_data.scct_timestamp = timestamp;
2098 }
2099 });
2100 }
2101
2102 static void
2103 sched_clutch_bucket_group_thr_count_dec(
2104 sched_clutch_bucket_group_t clutch_bucket_group,
2105 uint64_t timestamp)
2106 {
2107 sched_clutch_counter_time_t old_pending_data;
2108 sched_clutch_counter_time_t new_pending_data;
2109 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2110 new_pending_data.scct_count = old_pending_data.scct_count - 1;
2111 if (new_pending_data.scct_count == 0) {
2112 new_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2113 } else {
2114 new_pending_data.scct_timestamp = timestamp;
2115 }
2116 });
2117 }
2118
2119 static uint8_t
2120 sched_clutch_bucket_group_pending_ageout(
2121 sched_clutch_bucket_group_t clutch_bucket_group,
2122 uint64_t timestamp)
2123 {
2124 int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2125 sched_clutch_counter_time_t old_pending_data;
2126 sched_clutch_counter_time_t new_pending_data;
2127 uint8_t cpu_usage_shift = 0;
2128
2129 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2130 cpu_usage_shift = 0;
2131 uint64_t old_pending_ts = old_pending_data.scct_timestamp;
2132 bool old_update = (old_pending_ts >= timestamp);
2133 bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2134 bool no_bucket_load = (bucket_load == 0);
2135 if (old_update || no_pending_time || no_bucket_load) {
2136 os_atomic_rmw_loop_give_up();
2137 }
2138
2139 /* Calculate the time the clutch bucket group has been pending */
2140 uint64_t pending_delta = timestamp - old_pending_ts;
2141 uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] * bucket_load;
2142 if (pending_delta < interactivity_delta) {
2143 os_atomic_rmw_loop_give_up();
2144 }
2145 cpu_usage_shift = (pending_delta / interactivity_delta);
2146 new_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2147 new_pending_data.scct_count = old_pending_data.scct_count;
2148 });
2149 return cpu_usage_shift;
2150 }
2151
2152 static uint8_t
2153 sched_clutch_bucket_group_interactivity_score_calculate(
2154 sched_clutch_bucket_group_t clutch_bucket_group,
2155 uint64_t timestamp)
2156 {
2157 if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
2158 /*
2159 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
2160 * priorities, make sure all AboveUI buckets are marked interactive.
2161 */
2162 assert(clutch_bucket_group->scbg_interactivity_data.scct_count == (2 * sched_clutch_bucket_group_interactive_pri));
2163 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2164 }
2165 /* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */
2166 uint8_t pending_intervals = sched_clutch_bucket_group_pending_ageout(clutch_bucket_group, timestamp);
2167 /* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */
2168 sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group, pending_intervals);
2169 uint8_t interactivity_score = sched_clutch_interactivity_from_cpu_data(clutch_bucket_group);
2170 sched_clutch_counter_time_t old_interactivity_data;
2171 sched_clutch_counter_time_t new_interactivity_data;
2172
2173 bool score_updated = os_atomic_rmw_loop(&clutch_bucket_group->scbg_interactivity_data.scct_packed, old_interactivity_data.scct_packed, new_interactivity_data.scct_packed, relaxed, {
2174 if (old_interactivity_data.scct_timestamp >= timestamp) {
2175 os_atomic_rmw_loop_give_up();
2176 }
2177 new_interactivity_data.scct_timestamp = timestamp;
2178 if (old_interactivity_data.scct_timestamp != 0) {
2179 new_interactivity_data.scct_count = interactivity_score;
2180 }
2181 });
2182 if (score_updated) {
2183 return (uint8_t)new_interactivity_data.scct_count;
2184 } else {
2185 return (uint8_t)old_interactivity_data.scct_count;
2186 }
2187 }
2188
2189 #else /* CONFIG_SCHED_EDGE */
2190
2191 /*
2192 * For the clutch scheduler, atomicity is ensured by making sure all operations
2193 * are happening under the pset lock of the only cluster present on the platform.
2194 */
2195 static void
2196 sched_clutch_bucket_group_thr_count_inc(
2197 sched_clutch_bucket_group_t clutch_bucket_group,
2198 uint64_t timestamp)
2199 {
2200 sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2201 if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2202 clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2203 }
2204 clutch_bucket_group->scbg_pending_data.scct_count++;
2205 }
2206
2207 static void
2208 sched_clutch_bucket_group_thr_count_dec(
2209 sched_clutch_bucket_group_t clutch_bucket_group,
2210 uint64_t timestamp)
2211 {
2212 sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2213 clutch_bucket_group->scbg_pending_data.scct_count--;
2214 if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2215 clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2216 } else {
2217 clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2218 }
2219 }
2220
2221 static uint8_t
2222 sched_clutch_bucket_group_pending_ageout(
2223 sched_clutch_bucket_group_t clutch_bucket_group,
2224 uint64_t timestamp)
2225 {
2226 sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2227 int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2228 uint64_t old_pending_ts = clutch_bucket_group->scbg_pending_data.scct_timestamp;
2229 bool old_update = (old_pending_ts >= timestamp);
2230 bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2231 bool no_bucket_load = (bucket_load == 0);
2232 if (old_update || no_pending_time || no_bucket_load) {
2233 return 0;
2234 }
2235 uint64_t pending_delta = timestamp - old_pending_ts;
2236 uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] * bucket_load;
2237 if (pending_delta < interactivity_delta) {
2238 return 0;
2239 }
2240 uint8_t cpu_usage_shift = (pending_delta / interactivity_delta);
2241 clutch_bucket_group->scbg_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2242 return cpu_usage_shift;
2243 }
2244
2245 static uint8_t
2246 sched_clutch_bucket_group_interactivity_score_calculate(
2247 sched_clutch_bucket_group_t clutch_bucket_group,
2248 uint64_t timestamp)
2249 {
2250 sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2251 if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
2252 /*
2253 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
2254 * priorities, make sure all AboveUI buckets are marked interactive.
2255 */
2256 assert(clutch_bucket_group->scbg_interactivity_data.scct_count == (2 * sched_clutch_bucket_group_interactive_pri));
2257 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2258 }
2259 /* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */
2260 uint8_t pending_intervals = sched_clutch_bucket_group_pending_ageout(clutch_bucket_group, timestamp);
2261 /* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */
2262 sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group, pending_intervals);
2263 uint8_t interactivity_score = sched_clutch_interactivity_from_cpu_data(clutch_bucket_group);
2264 if (timestamp > clutch_bucket_group->scbg_interactivity_data.scct_timestamp) {
2265 clutch_bucket_group->scbg_interactivity_data.scct_count = interactivity_score;
2266 clutch_bucket_group->scbg_interactivity_data.scct_timestamp = timestamp;
2267 return interactivity_score;
2268 } else {
2269 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2270 }
2271 }
2272
2273 #endif /* CONFIG_SCHED_EDGE */
2274
2275 /*
2276 * Clutch Bucket Group Run Count and Blocked Time Accounting
2277 *
2278 * The clutch bucket group maintains the number of runnable/running threads in the group.
2279 * Since the blocked time of the clutch bucket group is based on this count, it is
2280 * important to make sure the blocking timestamp and the run count are updated atomically.
2281 *
2282 * Since the run count increments happen without any pset locks held, the scheduler makes
2283 * these updates atomic in the following way:
2284 * - On 64-bit platforms, it uses double wide atomics to update the count & timestamp
2285 * - On 32-bit platforms, it uses a lock to synchronize the count & timestamp update
2286 */
2287
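/*
 * Illustrative sequence (hypothetical timestamps): when the last running
 * thread of a group blocks, the run count goes 1 -> 0 and the blocked
 * timestamp is set to "now". When a thread in the group becomes runnable
 * again 5ms later, the 0 -> 1 transition adds those 5ms (capped at
 * sched_clutch_bucket_group_adjust_threshold) to scbcd_cpu_blocked, which
 * feeds the interactivity score as blocked time.
 */
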
2288 #if !__LP64__
2289
2290 static uint32_t
2291 sched_clutch_bucket_group_run_count_inc(
2292 sched_clutch_bucket_group_t clutch_bucket_group)
2293 {
2294 uint64_t blocked_time = 0;
2295 uint64_t updated_run_count = 0;
2296
2297 lck_spin_lock(&clutch_bucket_group->scbg_stats_lock);
2298 if ((clutch_bucket_group->scbg_blocked_data.scct_timestamp != SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID) &&
2299 (clutch_bucket_group->scbg_blocked_data.scct_count == 0)) {
2300 /* Run count is transitioning from 0 to 1; calculate blocked time and add it to CPU data */
2301 blocked_time = mach_absolute_time() - clutch_bucket_group->scbg_blocked_data.scct_timestamp;
2302 clutch_bucket_group->scbg_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
2303 }
2304 clutch_bucket_group->scbg_blocked_data.scct_count = clutch_bucket_group->scbg_blocked_data.scct_count + 1;
2305 updated_run_count = clutch_bucket_group->scbg_blocked_data.scct_count;
2306 lck_spin_unlock(&clutch_bucket_group->scbg_stats_lock);
2307
2308 blocked_time = MIN(blocked_time, sched_clutch_bucket_group_adjust_threshold);
2309 os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed);
2310 return (uint32_t)updated_run_count;
2311 }
2312
2313 static uint32_t
2314 sched_clutch_bucket_group_run_count_dec(
2315 sched_clutch_bucket_group_t clutch_bucket_group)
2316 {
2317 uint64_t updated_run_count = 0;
2318
2319 lck_spin_lock(&clutch_bucket_group->scbg_stats_lock);
2320 clutch_bucket_group->scbg_blocked_data.scct_count = clutch_bucket_group->scbg_blocked_data.scct_count - 1;
2321 if (clutch_bucket_group->scbg_blocked_data.scct_count == 0) {
2322 /* Run count is transitioning from 1 to 0; start the blocked timer */
2323 clutch_bucket_group->scbg_blocked_data.scct_timestamp = mach_absolute_time();
2324 }
2325 updated_run_count = clutch_bucket_group->scbg_blocked_data.scct_count;
2326 lck_spin_unlock(&clutch_bucket_group->scbg_stats_lock);
2327 return (uint32_t)updated_run_count;
2328 }
2329
2330 #else /* !__LP64__ */
2331
2332 static uint32_t
2333 sched_clutch_bucket_group_run_count_inc(
2334 sched_clutch_bucket_group_t clutch_bucket_group)
2335 {
2336 sched_clutch_counter_time_t old_blocked_data;
2337 sched_clutch_counter_time_t new_blocked_data;
2338
2339 bool update_blocked_time = false;
2340 os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2341 new_blocked_data.scct_count = old_blocked_data.scct_count + 1;
2342 new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2343 update_blocked_time = false;
2344 if (old_blocked_data.scct_count == 0) {
2345 new_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
2346 update_blocked_time = true;
2347 }
2348 });
2349 if (update_blocked_time && (old_blocked_data.scct_timestamp != SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID)) {
2350 uint64_t ctime = mach_absolute_time();
2351 if (ctime > old_blocked_data.scct_timestamp) {
2352 uint64_t blocked_time = ctime - old_blocked_data.scct_timestamp;
2353 blocked_time = MIN(blocked_time, sched_clutch_bucket_group_adjust_threshold);
2354 os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed);
2355 }
2356 }
2357 return (uint32_t)new_blocked_data.scct_count;
2358 }
2359
2360 static uint32_t
2361 sched_clutch_bucket_group_run_count_dec(
2362 sched_clutch_bucket_group_t clutch_bucket_group)
2363 {
2364 sched_clutch_counter_time_t old_blocked_data;
2365 sched_clutch_counter_time_t new_blocked_data;
2366
2367 uint64_t ctime = mach_absolute_time();
2368 os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2369 new_blocked_data.scct_count = old_blocked_data.scct_count - 1;
2370 new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2371 if (new_blocked_data.scct_count == 0) {
2372 new_blocked_data.scct_timestamp = ctime;
2373 }
2374 });
2375 return (uint32_t)new_blocked_data.scct_count;
2376 }
2377
2378 #endif /* !__LP64__ */
2379
2380
2381 /*
2382 * sched_clutch_thread_insert()
2383 *
2384 * Routine to insert a thread into the sched clutch hierarchy.
2385 * Update the counts at all levels of the hierarchy and insert the nodes
2386 * as they become runnable. Always called with the pset lock held.
2387 */
2388 static boolean_t
2389 sched_clutch_thread_insert(
2390 sched_clutch_root_t root_clutch,
2391 thread_t thread,
2392 integer_t options)
2393 {
2394 boolean_t result = FALSE;
2395
2396 sched_clutch_hierarchy_locked_assert(root_clutch);
2397 #if CONFIG_SCHED_EDGE
2398 sched_edge_cluster_cumulative_count_incr(root_clutch, thread->th_sched_bucket);
2399 sched_edge_shared_rsrc_runnable_load_incr(root_clutch, thread);
2400 /*
2401 * Check if the thread is bound and is being enqueued in its desired bound cluster.
2402 * One scenario where a bound thread might not be getting enqueued in the bound cluster
2403 * hierarchy would be if the thread is "soft" bound and the bound cluster is
2404 * de-recommended. In that case, the thread should be treated as an unbound
2405 * thread.
2406 */
2407 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) && (sched_edge_thread_bound_cluster_id(thread) == root_clutch->scr_cluster_id)) {
2408 return sched_edge_bound_thread_insert(root_clutch, thread, options);
2409 }
2410 /*
2411 * Use bound runqueue for shared resource threads. See "cluster shared resource
2412 * threads load balancing" section for details.
2413 */
2414 if (sched_edge_thread_shared_rsrc_type(thread) != CLUSTER_SHARED_RSRC_TYPE_NONE) {
2415 return sched_edge_bound_thread_insert(root_clutch, thread, options);
2416 }
2417
2418 #endif /* CONFIG_SCHED_EDGE */
2419 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2420 assert(thread->thread_group == clutch->sc_tg);
2421
2422 uint64_t current_timestamp = mach_absolute_time();
2423 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2424 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2425 assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
2426
2427 /*
2428 * Thread linkage in clutch_bucket
2429 *
2430 * A thread has a few linkages within the clutch bucket:
2431 * - A stable priority queue linkage which is the main runqueue (based on sched_pri) for the clutch bucket
2432 * - A regular priority queue linkage which is based on thread's base/promoted pri (used for clutch bucket priority calculation)
2433 * - A queue linkage used for timesharing operations of threads at the scheduler tick
2434 */
2435
2436 /* Insert thread into the clutch_bucket stable priority runqueue using sched_pri */
2437 thread->th_clutch_runq_link.stamp = current_timestamp;
2438 priority_queue_entry_set_sched_pri(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link, thread->sched_pri,
2439 (options & SCHED_TAILQ) ? PRIORITY_QUEUE_ENTRY_NONE : PRIORITY_QUEUE_ENTRY_PREEMPTED);
2440 priority_queue_insert(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2441
2442 /* Insert thread into clutch_bucket priority queue based on the promoted or base priority */
2443 priority_queue_entry_set_sched_pri(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link,
2444 sched_thread_sched_pri_promoted(thread) ? thread->sched_pri : thread->base_pri, false);
2445 priority_queue_insert(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2446
2447 /* Insert thread into timesharing queue of the clutch bucket */
2448 enqueue_tail(&clutch_bucket->scb_thread_timeshare_queue, &thread->th_clutch_timeshare_link);
2449
2450 /* Increment the urgency counter for the root if necessary */
2451 sched_clutch_root_urgency_inc(root_clutch, thread);
2452
2453 os_atomic_inc(&clutch->sc_thr_count, relaxed);
2454 sched_clutch_bucket_group_thr_count_inc(clutch_bucket->scb_group, current_timestamp);
2455
2456 /* Enqueue the clutch into the hierarchy (if needed) and update properties; pick the insertion order based on thread options */
2457 sched_clutch_bucket_options_t scb_options = (options & SCHED_HEADQ) ? SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ : SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ;
2458 if (clutch_bucket->scb_thr_count == 0) {
2459 sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2460 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2461 result = sched_clutch_bucket_runnable(clutch_bucket, root_clutch, current_timestamp, scb_options);
2462 } else {
2463 sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2464 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2465 result = sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, scb_options);
2466 }
2467
2468 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2469 root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2470 SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2471 return result;
2472 }
2473
2474 /*
2475 * sched_clutch_thread_remove()
2476 *
2477 * Routine to remove a thread from the sched clutch hierarchy.
2478 * Update the counts at all levels of the hierarchy and remove the nodes
2479 * as they become empty. Always called with the pset lock held.
2480 */
2481 static void
2482 sched_clutch_thread_remove(
2483 sched_clutch_root_t root_clutch,
2484 thread_t thread,
2485 uint64_t current_timestamp,
2486 sched_clutch_bucket_options_t options)
2487 {
2488 sched_clutch_hierarchy_locked_assert(root_clutch);
2489 #if CONFIG_SCHED_EDGE
2490 sched_edge_cluster_cumulative_count_decr(root_clutch, thread->th_sched_bucket);
2491 sched_edge_shared_rsrc_runnable_load_decr(root_clutch, thread);
2492
2493 if (thread->th_bound_cluster_enqueued) {
2494 sched_edge_bound_thread_remove(root_clutch, thread);
2495 return;
2496 }
2497 #endif /* CONFIG_SCHED_EDGE */
2498 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2499 assert(thread->thread_group == clutch->sc_tg);
2500 assert(thread->runq != PROCESSOR_NULL);
2501
2502 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2503 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2504 assert(clutch_bucket->scb_root == root_clutch);
2505
2506 /* Decrement the urgency counter for the root if necessary */
2507 sched_clutch_root_urgency_dec(root_clutch, thread);
2508 /* Remove thread from the clutch_bucket */
2509 priority_queue_remove(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2510 remqueue(&thread->th_clutch_timeshare_link);
2511 thread->runq = PROCESSOR_NULL;
2512
2513 priority_queue_remove(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2514
2515 /* Update counts at various levels of the hierarchy */
2516 os_atomic_dec(&clutch->sc_thr_count, relaxed);
2517 sched_clutch_bucket_group_thr_count_dec(clutch_bucket->scb_group, current_timestamp);
2518 sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
2519 sched_clutch_thr_count_dec(&clutch_bucket->scb_thr_count);
2520
2521 /* Remove the clutch from hierarchy (if needed) and update properties */
2522 if (clutch_bucket->scb_thr_count == 0) {
2523 sched_clutch_bucket_empty(clutch_bucket, root_clutch, current_timestamp, options);
2524 } else {
2525 sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, options);
2526 }
2527
2528 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2529 root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2530 SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2531 }
2532
2533 /*
2534 * sched_clutch_thread_unbound_lookup()
2535 *
2536 * Routine to find the highest unbound thread in the root clutch.
2537 * Helps find threads easily for steal/migrate scenarios in the
2538 * Edge scheduler.
2539 */
2540 static thread_t
2541 sched_clutch_thread_unbound_lookup(
2542 sched_clutch_root_t root_clutch,
2543 sched_clutch_root_bucket_t root_bucket)
2544 {
2545 sched_clutch_hierarchy_locked_assert(root_clutch);
2546
2547 /* Find the highest priority clutch bucket in this root bucket */
2548 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket);
2549 assert(clutch_bucket != NULL);
2550
2551 /* Find the highest priority runnable thread in this clutch bucket */
2552 thread_t thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
2553 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
2554 thread_tid(thread), thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, 0, 0);
2555 return thread;
2556 }
2557
2558 /*
2559 * sched_clutch_thread_highest_remove()
2560 *
2561 * Routine to find and remove the highest priority thread
2562 * from the sched clutch hierarchy. The algorithm looks at the
2563 * hierarchy for the most eligible runnable thread and calls
2564 * sched_clutch_thread_remove(). Always called with the
2565 * pset lock held.
2566 */
2567 static thread_t
2568 sched_clutch_thread_highest_remove(
2569 sched_clutch_root_t root_clutch)
2570 {
2571 sched_clutch_hierarchy_locked_assert(root_clutch);
2572 uint64_t current_timestamp = mach_absolute_time();
2573
2574 sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(root_clutch, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
2575 if (root_bucket == NULL) {
2576 return THREAD_NULL;
2577 }
2578
2579 thread_t highest_thread = THREAD_NULL;
2580 if (root_bucket->scrb_bound) {
2581 highest_thread = sched_clutch_thread_bound_lookup(root_clutch, root_bucket);
2582 } else {
2583 highest_thread = sched_clutch_thread_unbound_lookup(root_clutch, root_bucket);
2584 }
2585 sched_clutch_thread_remove(root_clutch, highest_thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
2586 return highest_thread;
2587 }
2588
2589 /* High level global accessor routines */
2590
2591 /*
2592 * sched_clutch_root_urgency()
2593 *
2594 * Routine to get the urgency of the highest runnable
2595 * thread in the hierarchy.
2596 */
2597 static uint32_t
2598 sched_clutch_root_urgency(
2599 sched_clutch_root_t root_clutch)
2600 {
2601 return root_clutch->scr_urgency;
2602 }
2603
2604 /*
2605 * sched_clutch_root_count_sum()
2606 *
2607 * The count_sum mechanism is used for scheduler runq
2608  * statistics calculation. It's only useful for debugging
2609 * purposes; since it takes a mach_absolute_time() on
2610  * other scheduler implementations, it's better to avoid
2611 * populating this until absolutely necessary.
2612 */
2613 static uint32_t
2614 sched_clutch_root_count_sum(
2615 __unused sched_clutch_root_t root_clutch)
2616 {
2617 return 0;
2618 }
2619
2620 /*
2621 * sched_clutch_root_priority()
2622 *
2623 * Routine to get the priority of the highest runnable
2624 * thread in the hierarchy.
2625 */
2626 static int
2627 sched_clutch_root_priority(
2628 sched_clutch_root_t root_clutch)
2629 {
2630 return root_clutch->scr_priority;
2631 }
2632
2633 /*
2634 * sched_clutch_root_count()
2635 *
2636 * Returns total number of runnable threads in the hierarchy.
2637 */
2638 uint32_t
2639 sched_clutch_root_count(
2640 sched_clutch_root_t root_clutch)
2641 {
2642 return root_clutch->scr_thr_count;
2643 }
2644
2645 #if CONFIG_SCHED_EDGE
2646
2647 /*
2648 * sched_clutch_root_foreign_empty()
2649 *
2650 * Routine to check if the foreign clutch bucket priority list is empty for a cluster.
2651 */
2652 static boolean_t
2653 sched_clutch_root_foreign_empty(
2654 sched_clutch_root_t root_clutch)
2655 {
2656 return priority_queue_empty(&root_clutch->scr_foreign_buckets);
2657 }
2658
2659 /*
2660 * sched_clutch_root_highest_foreign_thread_remove()
2661 *
2662 * Routine to return the thread in the highest priority clutch bucket in a cluster.
2663 * Must be called with the pset for the cluster locked.
2664 */
2665 static thread_t
2666 sched_clutch_root_highest_foreign_thread_remove(
2667 sched_clutch_root_t root_clutch)
2668 {
2669 thread_t thread = THREAD_NULL;
2670 if (priority_queue_empty(&root_clutch->scr_foreign_buckets)) {
2671 return thread;
2672 }
2673 sched_clutch_bucket_t clutch_bucket = priority_queue_max(&root_clutch->scr_foreign_buckets, struct sched_clutch_bucket, scb_foreignlink);
2674 thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
2675 sched_clutch_thread_remove(root_clutch, thread, mach_absolute_time(), 0);
2676 return thread;
2677 }
2678
2679 #endif /* CONFIG_SCHED_EDGE */
2680
2681 /*
2682 * sched_clutch_thread_pri_shift()
2683 *
2684 * Routine to get the priority shift value for a thread.
2685 * Since the timesharing is done at the clutch_bucket level,
2686 * this routine gets the clutch_bucket and retrieves the
2687 * values from there.
2688 */
2689 uint32_t
2690 sched_clutch_thread_pri_shift(
2691 thread_t thread,
2692 sched_bucket_t bucket)
2693 {
2694 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2695 return INT8_MAX;
2696 }
2697 assert(bucket != TH_BUCKET_RUN);
2698 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2699 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
2700 return os_atomic_load(&clutch_bucket_group->scbg_pri_shift, relaxed);
2701 }
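/*
 * Illustrative sketch (not part of the original source): roughly how the
 * per-bucket-group pri_shift returned above is typically consumed by
 * timeshare decay. The helper name and the exact decay expression are
 * assumptions for illustration only; the real computation lives in the
 * timeshare priority code.
 */
#if 0
static int
example_decayed_pri(thread_t thread)
{
	uint32_t shift = sched_clutch_thread_pri_shift(thread, thread->th_sched_bucket);
	if (shift == INT8_MAX) {
		/* Ineligible threads are effectively exempt from decay */
		return thread->base_pri;
	}
	/* A larger pri_shift means accumulated CPU usage decays the priority less aggressively */
	return thread->base_pri - (int)(thread->sched_usage >> shift);
}
#endif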
2702
2703 #pragma mark -- Clutch Scheduler Algorithm
2704
2705 static void
2706 sched_clutch_init(void);
2707
2708 static thread_t
2709 sched_clutch_steal_thread(processor_set_t pset);
2710
2711 static void
2712 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context);
2713
2714 static boolean_t
2715 sched_clutch_processor_enqueue(processor_t processor, thread_t thread,
2716 sched_options_t options);
2717
2718 static boolean_t
2719 sched_clutch_processor_queue_remove(processor_t processor, thread_t thread);
2720
2721 static ast_t
2722 sched_clutch_processor_csw_check(processor_t processor);
2723
2724 static boolean_t
2725 sched_clutch_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
2726
2727 static int
2728 sched_clutch_runq_count(processor_t processor);
2729
2730 static boolean_t
2731 sched_clutch_processor_queue_empty(processor_t processor);
2732
2733 static uint64_t
2734 sched_clutch_runq_stats_count_sum(processor_t processor);
2735
2736 static int
2737 sched_clutch_processor_bound_count(processor_t processor);
2738
2739 static void
2740 sched_clutch_pset_init(processor_set_t pset);
2741
2742 static void
2743 sched_clutch_processor_init(processor_t processor);
2744
2745 static thread_t
2746 sched_clutch_choose_thread(processor_t processor, int priority, ast_t reason);
2747
2748 static void
2749 sched_clutch_processor_queue_shutdown(processor_t processor);
2750
2751 static sched_mode_t
2752 sched_clutch_initial_thread_sched_mode(task_t parent_task);
2753
2754 static uint32_t
2755 sched_clutch_initial_quantum_size(thread_t thread);
2756
2757 static bool
2758 sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread);
2759
2760 static uint32_t
2761 sched_clutch_run_incr(thread_t thread);
2762
2763 static uint32_t
2764 sched_clutch_run_decr(thread_t thread);
2765
2766 static void
2767 sched_clutch_update_thread_bucket(thread_t thread);
2768
2769 const struct sched_dispatch_table sched_clutch_dispatch = {
2770 .sched_name = "clutch",
2771 .init = sched_clutch_init,
2772 .timebase_init = sched_timeshare_timebase_init,
2773 .processor_init = sched_clutch_processor_init,
2774 .pset_init = sched_clutch_pset_init,
2775 .maintenance_continuation = sched_timeshare_maintenance_continue,
2776 .choose_thread = sched_clutch_choose_thread,
2777 .steal_thread_enabled = sched_steal_thread_enabled,
2778 .steal_thread = sched_clutch_steal_thread,
2779 .compute_timeshare_priority = sched_compute_timeshare_priority,
2780 .choose_node = sched_choose_node,
2781 .choose_processor = choose_processor,
2782 .processor_enqueue = sched_clutch_processor_enqueue,
2783 .processor_queue_shutdown = sched_clutch_processor_queue_shutdown,
2784 .processor_queue_remove = sched_clutch_processor_queue_remove,
2785 .processor_queue_empty = sched_clutch_processor_queue_empty,
2786 .priority_is_urgent = priority_is_urgent,
2787 .processor_csw_check = sched_clutch_processor_csw_check,
2788 .processor_queue_has_priority = sched_clutch_processor_queue_has_priority,
2789 .initial_quantum_size = sched_clutch_initial_quantum_size,
2790 .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode,
2791 .can_update_priority = can_update_priority,
2792 .update_priority = update_priority,
2793 .lightweight_update_priority = lightweight_update_priority,
2794 .quantum_expire = sched_default_quantum_expire,
2795 .processor_runq_count = sched_clutch_runq_count,
2796 .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum,
2797 .processor_bound_count = sched_clutch_processor_bound_count,
2798 .thread_update_scan = sched_clutch_thread_update_scan,
2799 .multiple_psets_enabled = TRUE,
2800 .sched_groups_enabled = FALSE,
2801 .avoid_processor_enabled = TRUE,
2802 .thread_avoid_processor = sched_clutch_thread_avoid_processor,
2803 .processor_balance = sched_SMT_balance,
2804
2805 .rt_runq = sched_rtlocal_runq,
2806 .rt_init = sched_rtlocal_init,
2807 .rt_queue_shutdown = sched_rtlocal_queue_shutdown,
2808 .rt_runq_scan = sched_rtlocal_runq_scan,
2809 .rt_runq_count_sum = sched_rtlocal_runq_count_sum,
2810 .rt_steal_thread = sched_rtlocal_steal_thread,
2811
2812 .qos_max_parallelism = sched_qos_max_parallelism,
2813 .check_spill = sched_check_spill,
2814 .ipi_policy = sched_ipi_policy,
2815 .thread_should_yield = sched_thread_should_yield,
2816 .run_count_incr = sched_clutch_run_incr,
2817 .run_count_decr = sched_clutch_run_decr,
2818 .update_thread_bucket = sched_clutch_update_thread_bucket,
2819 .pset_made_schedulable = sched_pset_made_schedulable,
2820 .cpu_init_completed = NULL,
2821 .thread_eligible_for_pset = NULL,
2822 };
2823
2824 __attribute__((always_inline))
2825 static inline run_queue_t
2826 sched_clutch_bound_runq(processor_t processor)
2827 {
2828 return &processor->runq;
2829 }
2830
2831 __attribute__((always_inline))
2832 static inline sched_clutch_root_t
2833 sched_clutch_processor_root_clutch(processor_t processor)
2834 {
2835 return &processor->processor_set->pset_clutch_root;
2836 }
2837
2838 __attribute__((always_inline))
2839 static inline run_queue_t
2840 sched_clutch_thread_bound_runq(processor_t processor, __assert_only thread_t thread)
2841 {
2842 assert(thread->bound_processor == processor);
2843 return sched_clutch_bound_runq(processor);
2844 }
2845
2846 static uint32_t
2847 sched_clutch_initial_quantum_size(thread_t thread)
2848 {
2849 if (thread == THREAD_NULL) {
2850 return std_quantum;
2851 }
2852 assert(sched_clutch_thread_quantum[thread->th_sched_bucket] <= UINT32_MAX);
2853 return (uint32_t)sched_clutch_thread_quantum[thread->th_sched_bucket];
2854 }
2855
2856 static sched_mode_t
2857 sched_clutch_initial_thread_sched_mode(task_t parent_task)
2858 {
2859 if (parent_task == kernel_task) {
2860 return TH_MODE_FIXED;
2861 } else {
2862 return TH_MODE_TIMESHARE;
2863 }
2864 }
2865
2866 static void
2867 sched_clutch_processor_init(processor_t processor)
2868 {
2869 run_queue_init(&processor->runq);
2870 }
2871
2872 static void
2873 sched_clutch_pset_init(processor_set_t pset)
2874 {
2875 sched_clutch_root_init(&pset->pset_clutch_root, pset);
2876 }
2877
2878 static void
2879 sched_clutch_tunables_init(void)
2880 {
2881 sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
2882 sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
2883 sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
2884 clock_interval_to_absolutetime_interval(SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS,
2885 NSEC_PER_USEC, &sched_clutch_bucket_group_adjust_threshold);
2886 assert(sched_clutch_bucket_group_adjust_threshold <= CLUTCH_CPU_DATA_MAX);
2887 sched_clutch_us_to_abstime(sched_clutch_bucket_group_pending_delta_us, sched_clutch_bucket_group_pending_delta);
2888 }
2889
2890 static void
2891 sched_clutch_init(void)
2892 {
2893 if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
2894 sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
2895 }
2896 sched_timeshare_init();
2897 sched_clutch_tunables_init();
2898 }
2899
2900 static thread_t
2901 sched_clutch_choose_thread(
2902 processor_t processor,
2903 int priority,
2904 __unused ast_t reason)
2905 {
2906 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
2907 uint32_t clutch_count = sched_clutch_root_count(sched_clutch_processor_root_clutch(processor));
2908 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2909 boolean_t choose_from_boundq = false;
2910
2911 if (bound_runq->highq < priority &&
2912 clutch_pri < priority) {
2913 return THREAD_NULL;
2914 }
2915
2916 if (bound_runq->count && clutch_count) {
2917 if (bound_runq->highq >= clutch_pri) {
2918 choose_from_boundq = true;
2919 }
2920 } else if (bound_runq->count) {
2921 choose_from_boundq = true;
2922 } else if (clutch_count) {
2923 choose_from_boundq = false;
2924 } else {
2925 return THREAD_NULL;
2926 }
2927
2928 thread_t thread = THREAD_NULL;
2929 if (choose_from_boundq == false) {
2930 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
2931 thread = sched_clutch_thread_highest_remove(pset_clutch_root);
2932 } else {
2933 thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
2934 }
2935 return thread;
2936 }
2937
2938 static boolean_t
2939 sched_clutch_processor_enqueue(
2940 processor_t processor,
2941 thread_t thread,
2942 sched_options_t options)
2943 {
2944 boolean_t result;
2945
2946 thread->runq = processor;
2947 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2948 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
2949 result = sched_clutch_thread_insert(pset_clutch_root, thread, options);
2950 } else {
2951 run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
2952 result = run_queue_enqueue(rq, thread, options);
2953 }
2954 return result;
2955 }
2956
2957 static boolean_t
2958 sched_clutch_processor_queue_empty(processor_t processor)
2959 {
2960 return sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0 &&
2961 sched_clutch_bound_runq(processor)->count == 0;
2962 }
2963
2964 static ast_t
2965 sched_clutch_processor_csw_check(processor_t processor)
2966 {
2967 boolean_t has_higher;
2968 int pri;
2969
2970 if (sched_clutch_thread_avoid_processor(processor, current_thread())) {
2971 return AST_PREEMPT | AST_URGENT;
2972 }
2973
2974 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2975 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
2976
2977 assert(processor->active_thread != NULL);
2978
2979 pri = MAX(clutch_pri, bound_runq->highq);
2980
2981 if (processor->first_timeslice) {
2982 has_higher = (pri > processor->current_pri);
2983 } else {
2984 has_higher = (pri >= processor->current_pri);
2985 }
2986
2987 if (has_higher) {
2988 if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
2989 return AST_PREEMPT | AST_URGENT;
2990 }
2991
2992 if (bound_runq->urgency > 0) {
2993 return AST_PREEMPT | AST_URGENT;
2994 }
2995
2996 return AST_PREEMPT;
2997 }
2998
2999 return AST_NONE;
3000 }
3001
3002 static boolean_t
3003 sched_clutch_processor_queue_has_priority(processor_t processor,
3004 int priority,
3005 boolean_t gte)
3006 {
3007 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3008
3009 int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);
3010
3011 if (gte) {
3012 return qpri >= priority;
3013 } else {
3014 return qpri > priority;
3015 }
3016 }
3017
3018 static int
3019 sched_clutch_runq_count(processor_t processor)
3020 {
3021 return (int)sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) + sched_clutch_bound_runq(processor)->count;
3022 }
3023
3024 static uint64_t
3025 sched_clutch_runq_stats_count_sum(processor_t processor)
3026 {
3027 uint64_t bound_sum = sched_clutch_bound_runq(processor)->runq_stats.count_sum;
3028
3029 if (processor->cpu_id == processor->processor_set->cpu_set_low) {
3030 return bound_sum + sched_clutch_root_count_sum(sched_clutch_processor_root_clutch(processor));
3031 } else {
3032 return bound_sum;
3033 }
3034 }
3035 static int
3036 sched_clutch_processor_bound_count(processor_t processor)
3037 {
3038 return sched_clutch_bound_runq(processor)->count;
3039 }
3040
3041 static void
3042 sched_clutch_processor_queue_shutdown(processor_t processor)
3043 {
3044 processor_set_t pset = processor->processor_set;
3045 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3046 thread_t thread;
3047 queue_head_t tqueue;
3048
3049 /* We only need to migrate threads if this is the last active processor in the pset */
3050 if (pset->online_processor_count > 0) {
3051 pset_unlock(pset);
3052 return;
3053 }
3054
3055 queue_init(&tqueue);
3056 while (sched_clutch_root_count(pset_clutch_root) > 0) {
3057 thread = sched_clutch_thread_highest_remove(pset_clutch_root);
3058 enqueue_tail(&tqueue, &thread->runq_links);
3059 }
3060
3061 pset_unlock(pset);
3062
3063 qe_foreach_element_safe(thread, &tqueue, runq_links) {
3064 remqueue(&thread->runq_links);
3065 thread_lock(thread);
3066 thread_setrun(thread, SCHED_TAILQ);
3067 thread_unlock(thread);
3068 }
3069 }
3070
3071 static boolean_t
3072 sched_clutch_processor_queue_remove(
3073 processor_t processor,
3074 thread_t thread)
3075 {
3076 run_queue_t rq;
3077 processor_set_t pset = processor->processor_set;
3078
3079 pset_lock(pset);
3080
3081 if (processor == thread->runq) {
3082 /*
3083 * Thread is on a run queue and we have a lock on
3084 * that run queue.
3085 */
3086 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3087 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3088 sched_clutch_thread_remove(pset_clutch_root, thread, mach_absolute_time(), SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
3089 } else {
3090 rq = sched_clutch_thread_bound_runq(processor, thread);
3091 run_queue_remove(rq, thread);
3092 }
3093 } else {
3094 /*
3095 * The thread left the run queue before we could
3096 * lock the run queue.
3097 */
3098 assert(thread->runq == PROCESSOR_NULL);
3099 processor = PROCESSOR_NULL;
3100 }
3101
3102 pset_unlock(pset);
3103
3104 return processor != PROCESSOR_NULL;
3105 }
3106
3107 static thread_t
3108 sched_clutch_steal_thread(__unused processor_set_t pset)
3109 {
3110 /* Thread stealing is not enabled for single cluster clutch scheduler platforms */
3111 return THREAD_NULL;
3112 }
3113
3114 static void
3115 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context)
3116 {
3117 boolean_t restart_needed = FALSE;
3118 processor_t processor = processor_list;
3119 processor_set_t pset;
3120 thread_t thread;
3121 spl_t s;
3122
3123 /*
3124 * We update the threads associated with each processor (bound and idle threads)
3125 * and then update the threads in each pset runqueue.
3126 */
3127
3128 do {
3129 do {
3130 pset = processor->processor_set;
3131
3132 s = splsched();
3133 pset_lock(pset);
3134
3135 restart_needed = runq_scan(sched_clutch_bound_runq(processor), scan_context);
3136
3137 pset_unlock(pset);
3138 splx(s);
3139
3140 if (restart_needed) {
3141 break;
3142 }
3143
3144 thread = processor->idle_thread;
3145 if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
3146 if (thread_update_add_thread(thread) == FALSE) {
3147 restart_needed = TRUE;
3148 break;
3149 }
3150 }
3151 } while ((processor = processor->processor_list) != NULL);
3152
3153 /* Ok, we now have a collection of candidates -- fix them. */
3154 thread_update_process_threads();
3155 } while (restart_needed);
3156
3157 pset_node_t node = &pset_node0;
3158 pset = node->psets;
3159
3160 do {
3161 do {
3162 restart_needed = FALSE;
3163 while (pset != NULL) {
3164 s = splsched();
3165 pset_lock(pset);
3166
3167 if (sched_clutch_root_count(&pset->pset_clutch_root) > 0) {
3168 for (sched_bucket_t bucket = TH_BUCKET_SHARE_FG; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
3169 restart_needed = runq_scan(&pset->pset_clutch_root.scr_bound_buckets[bucket].scrb_bound_thread_runq, scan_context);
3170 if (restart_needed) {
3171 break;
3172 }
3173 }
3174 queue_t clutch_bucket_list = &pset->pset_clutch_root.scr_clutch_buckets;
3175 sched_clutch_bucket_t clutch_bucket;
3176 qe_foreach_element(clutch_bucket, clutch_bucket_list, scb_listlink) {
3177 sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, scan_context->sched_tick_last_abstime);
3178 restart_needed = sched_clutch_timeshare_scan(&clutch_bucket->scb_thread_timeshare_queue, clutch_bucket->scb_thr_count, scan_context);
3179 }
3180 }
3181
3182 pset_unlock(pset);
3183 splx(s);
3184
3185 if (restart_needed) {
3186 break;
3187 }
3188 pset = pset->pset_list;
3189 }
3190
3191 if (restart_needed) {
3192 break;
3193 }
3194 } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
3195
3196 /* Ok, we now have a collection of candidates -- fix them. */
3197 thread_update_process_threads();
3198 } while (restart_needed);
3199 }
3200
3201 extern int sched_allow_rt_smt;
3202
3203 /* Return true if this thread should not continue running on this processor */
3204 static bool
3205 sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread)
3206 {
3207 if (processor->processor_primary != processor) {
3208 /*
3209 * This is a secondary SMT processor. If the primary is running
3210 * a realtime thread, only allow realtime threads on the secondary.
3211 */
3212 if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
3213 return true;
3214 }
3215 }
3216
3217 return false;
3218 }
3219
3220 /*
3221 * For the clutch scheduler, the run counts are maintained in the clutch
3222  * buckets (i.e. the thread group scheduling structure).
3223 */
3224 static uint32_t
3225 sched_clutch_run_incr(thread_t thread)
3226 {
3227 assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
3228 uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3229 sched_clutch_thread_run_bucket_incr(thread, thread->th_sched_bucket);
3230 return new_count;
3231 }
3232
3233 static uint32_t
3234 sched_clutch_run_decr(thread_t thread)
3235 {
3236 assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);
3237 uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3238 sched_clutch_thread_run_bucket_decr(thread, thread->th_sched_bucket);
3239 return new_count;
3240 }
3241
3242 /*
3243 * For threads that have changed sched_pri without changing the
3244 * base_pri for any reason other than decay, use the sched_pri
3245 * as the bucketizing priority instead of base_pri. All such
3246 * changes are typically due to kernel locking primitives boosts
3247 * or demotions.
3248 */
3249 static boolean_t
3250 sched_thread_sched_pri_promoted(thread_t thread)
3251 {
3252 return (thread->sched_flags & TH_SFLAG_PROMOTED) ||
3253 (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) ||
3254 (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) ||
3255 (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) ||
3256 (thread->kern_promotion_schedpri != 0);
3257 }
3258
3259 /*
3260 * Routine to update the scheduling bucket for the thread.
3261 *
3262 * In the clutch scheduler implementation, the thread's bucket
3263 * is based on sched_pri if it was promoted due to a kernel
3264  * primitive; otherwise it's based on the thread's base_pri. This
3265 * enhancement allows promoted threads to reach a higher priority
3266 * bucket and potentially get selected sooner for scheduling.
3267 *
3268 * Also, the clutch scheduler does not honor fixed priority below
3269 * FG priority. It simply puts those threads in the corresponding
3270  * timeshare bucket. The reason for doing that is that it is
3271 * extremely hard to define the scheduling properties of such threads
3272 * and they typically lead to performance issues.
3273 */
3274
3275 void
3276 sched_clutch_update_thread_bucket(thread_t thread)
3277 {
3278 sched_bucket_t old_bucket = thread->th_sched_bucket;
3279 assert(thread->runq == PROCESSOR_NULL);
3280 int pri = (sched_thread_sched_pri_promoted(thread)) ? thread->sched_pri : thread->base_pri;
3281 sched_bucket_t new_bucket = sched_clutch_thread_bucket_map(thread, pri);
3282
3283 if (old_bucket == new_bucket) {
3284 return;
3285 }
3286
3287 thread->th_sched_bucket = new_bucket;
3288 thread->pri_shift = sched_clutch_thread_pri_shift(thread, new_bucket);
3289 /*
3290 * Since this is called after the thread has been removed from the runq,
3291 * only the run counts need to be updated. The re-insert into the runq
3292 * would put the thread into the correct new bucket's runq.
3293 */
3294 if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
3295 sched_clutch_thread_run_bucket_decr(thread, old_bucket);
3296 sched_clutch_thread_run_bucket_incr(thread, new_bucket);
3297 }
3298 }
3299
3300 #if CONFIG_SCHED_EDGE
3301
3302 /* Implementation of the AMP version of the clutch scheduler */
3303
3304 static void
3305 sched_edge_init(void);
3306
3307 static void
3308 sched_edge_pset_init(processor_set_t pset);
3309
3310 static thread_t
3311 sched_edge_processor_idle(processor_set_t pset);
3312
3313 static ast_t
3314 sched_edge_processor_csw_check(processor_t processor);
3315
3316 static boolean_t
3317 sched_edge_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
3318
3319 static boolean_t
3320 sched_edge_processor_queue_empty(processor_t processor);
3321
3322 static thread_t
3323 sched_edge_choose_thread(processor_t processor, int priority, ast_t reason);
3324
3325 static void
3326 sched_edge_processor_queue_shutdown(processor_t processor);
3327
3328 static processor_t
3329 sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread);
3330
3331 static bool
3332 sched_edge_thread_avoid_processor(processor_t processor, thread_t thread);
3333
3334 static void
3335 sched_edge_balance(processor_t cprocessor, processor_set_t cpset);
3336
3337 static void
3338 sched_edge_check_spill(processor_set_t pset, thread_t thread);
3339
3340 static bool
3341 sched_edge_thread_should_yield(processor_t processor, thread_t thread);
3342
3343 static void
3344 sched_edge_pset_made_schedulable(processor_t processor, processor_set_t dst_pset, boolean_t drop_lock);
3345
3346 static void
3347 sched_edge_cpu_init_completed(void);
3348
3349 static bool
3350 sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset);
3351
3352 static bool
3353 sched_edge_steal_thread_enabled(processor_set_t pset);
3354
3355 static sched_ipi_type_t
3356 sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event);
3357
3358 static uint32_t
3359 sched_edge_qos_max_parallelism(int qos, uint64_t options);
3360
3361 static uint32_t
3362 sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket);
3363
3364 const struct sched_dispatch_table sched_edge_dispatch = {
3365 .sched_name = "edge",
3366 .init = sched_edge_init,
3367 .timebase_init = sched_timeshare_timebase_init,
3368 .processor_init = sched_clutch_processor_init,
3369 .pset_init = sched_edge_pset_init,
3370 .maintenance_continuation = sched_timeshare_maintenance_continue,
3371 .choose_thread = sched_edge_choose_thread,
3372 .steal_thread_enabled = sched_edge_steal_thread_enabled,
3373 .steal_thread = sched_edge_processor_idle,
3374 .compute_timeshare_priority = sched_compute_timeshare_priority,
3375 .choose_node = sched_choose_node,
3376 .choose_processor = sched_edge_choose_processor,
3377 .processor_enqueue = sched_clutch_processor_enqueue,
3378 .processor_queue_shutdown = sched_edge_processor_queue_shutdown,
3379 .processor_queue_remove = sched_clutch_processor_queue_remove,
3380 .processor_queue_empty = sched_edge_processor_queue_empty,
3381 .priority_is_urgent = priority_is_urgent,
3382 .processor_csw_check = sched_edge_processor_csw_check,
3383 .processor_queue_has_priority = sched_edge_processor_queue_has_priority,
3384 .initial_quantum_size = sched_clutch_initial_quantum_size,
3385 .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode,
3386 .can_update_priority = can_update_priority,
3387 .update_priority = update_priority,
3388 .lightweight_update_priority = lightweight_update_priority,
3389 .quantum_expire = sched_default_quantum_expire,
3390 .processor_runq_count = sched_clutch_runq_count,
3391 .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum,
3392 .processor_bound_count = sched_clutch_processor_bound_count,
3393 .thread_update_scan = sched_clutch_thread_update_scan,
3394 .multiple_psets_enabled = TRUE,
3395 .sched_groups_enabled = FALSE,
3396 .avoid_processor_enabled = TRUE,
3397 .thread_avoid_processor = sched_edge_thread_avoid_processor,
3398 .processor_balance = sched_edge_balance,
3399
3400 .rt_runq = sched_rtlocal_runq,
3401 .rt_init = sched_rtlocal_init,
3402 .rt_queue_shutdown = sched_rtlocal_queue_shutdown,
3403 .rt_runq_scan = sched_rtlocal_runq_scan,
3404 .rt_runq_count_sum = sched_rtlocal_runq_count_sum,
3405 .rt_steal_thread = sched_rtlocal_steal_thread,
3406
3407 .qos_max_parallelism = sched_edge_qos_max_parallelism,
3408 .check_spill = sched_edge_check_spill,
3409 .ipi_policy = sched_edge_ipi_policy,
3410 .thread_should_yield = sched_edge_thread_should_yield,
3411 .run_count_incr = sched_clutch_run_incr,
3412 .run_count_decr = sched_clutch_run_decr,
3413 .update_thread_bucket = sched_clutch_update_thread_bucket,
3414 .pset_made_schedulable = sched_edge_pset_made_schedulable,
3415 .thread_group_recommendation_change = NULL,
3416 .cpu_init_completed = sched_edge_cpu_init_completed,
3417 .thread_eligible_for_pset = sched_edge_thread_eligible_for_pset,
3418 };
3419
3420 static bitmap_t sched_edge_available_pset_bitmask[BITMAP_LEN(MAX_PSETS)];
3421
3422 /*
3423 * sched_edge_pset_available()
3424 *
3425 * Routine to determine if a pset is available for scheduling.
3426 */
3427 static bool
3428 sched_edge_pset_available(processor_set_t pset)
3429 {
3430 if (pset == NULL) {
3431 return false;
3432 }
3433 return pset_available_cpu_count(pset) > 0;
3434 }
3435
3436 /*
3437 * sched_edge_thread_bound_cluster_id()
3438 *
3439 * Routine to determine which cluster a particular thread is bound to. Uses
3440 * the sched_flags on the thread to map back to a specific cluster id.
3441 *
3442 * <Edge Multi-cluster Support Needed>
3443 */
3444 static uint32_t
3445 sched_edge_thread_bound_cluster_id(thread_t thread)
3446 {
3447 assert(SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread));
3448 return thread->th_bound_cluster_id;
3449 }
3450
3451 /* Forward declaration for some thread migration routines */
3452 static boolean_t sched_edge_foreign_runnable_thread_available(processor_set_t pset);
3453 static boolean_t sched_edge_foreign_running_thread_available(processor_set_t pset);
3454 static processor_set_t sched_edge_steal_candidate(processor_set_t pset);
3455 static processor_set_t sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks);
3456
3457 /*
3458 * sched_edge_config_set()
3459 *
3460 * Support to update an edge configuration. Typically used by CLPC to affect thread migration
3461 * policies in the scheduler.
3462 */
3463 static void
3464 sched_edge_config_set(uint32_t src_cluster, uint32_t dst_cluster, sched_clutch_edge edge_config)
3465 {
3466 sched_clutch_edge *edge = &pset_array[src_cluster]->sched_edges[dst_cluster];
3467 edge->sce_edge_packed = edge_config.sce_edge_packed;
3468 }
3469
3470 /*
3471 * sched_edge_config_get()
3472 *
3473 * Support to get an edge configuration. Typically used by CLPC to query edge configs to decide
3474 * if it needs to update edges.
3475 */
3476 static sched_clutch_edge
3477 sched_edge_config_get(uint32_t src_cluster, uint32_t dst_cluster)
3478 {
3479 return pset_array[src_cluster]->sched_edges[dst_cluster];
3480 }
3481
3482 /*
3483 * sched_edge_matrix_set()
3484 *
3485 * Routine to update various edges in the cluster edge matrix. The edge_changes_bitmap
3486 * indicates which edges need to be updated. Both the edge_matrix & edge_changes_bitmap
3487 * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
3488 */
3489 void
3490 sched_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changes_bitmap, __unused uint64_t flags, uint64_t matrix_order)
3491 {
3492 uint32_t edge_index = 0;
3493 for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
3494 for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
3495 if (edge_changes_bitmap[edge_index]) {
3496 sched_edge_config_set(src_cluster, dst_cluster, edge_matrix[edge_index]);
3497 }
3498 edge_index++;
3499 }
3500 }
3501 }
3502
3503 /*
3504 * sched_edge_matrix_get()
3505 *
3506 * Routine to retrieve various edges in the cluster edge matrix. The edge_request_bitmap
3507 * indicates which edges need to be retrieved. Both the edge_matrix & edge_request_bitmap
3508 * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
3509 */
3510 void
3511 sched_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_request_bitmap, __unused uint64_t flags, uint64_t matrix_order)
3512 {
3513 uint32_t edge_index = 0;
3514 for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
3515 for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
3516 if (edge_request_bitmap[edge_index]) {
3517 edge_matrix[edge_index] = sched_edge_config_get(src_cluster, dst_cluster);
3518 }
3519 edge_index++;
3520 }
3521 }
3522 }
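/*
 * Illustrative sketch (not part of the original source): how a caller such
 * as CLPC might update a single directed edge using the flattened matrix
 * layout above, where the entry for (src, dst) lives at index
 * src * matrix_order + dst. The helper and variable names are assumptions
 * for illustration only.
 */
#if 0
static void
example_update_one_edge(uint32_t src, uint32_t dst, sched_clutch_edge new_edge, uint64_t order)
{
	sched_clutch_edge edge_matrix[MAX_PSETS * MAX_PSETS] = {0};
	bool edge_changes_bitmap[MAX_PSETS * MAX_PSETS] = {false};

	/* Mark only the (src -> dst) entry as changed and provide its new value */
	edge_matrix[src * order + dst] = new_edge;
	edge_changes_bitmap[src * order + dst] = true;
	sched_edge_matrix_set(edge_matrix, edge_changes_bitmap, 0, order);
}
#endif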
3523
3524 /*
3525 * sched_edge_init()
3526 *
3527 * Routine to initialize the data structures for the Edge scheduler.
3528 */
3529 static void
3530 sched_edge_init(void)
3531 {
3532 if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
3533 sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
3534 }
3535 sched_timeshare_init();
3536 sched_clutch_tunables_init();
3537 sched_edge_max_clusters = ml_get_cluster_count();
3538 }
3539
3540 static void
3541 sched_edge_pset_init(processor_set_t pset)
3542 {
3543 uint32_t pset_cluster_id = pset->pset_cluster_id;
3544 pset->pset_type = (pset->pset_cluster_type == PSET_AMP_P) ? CLUSTER_TYPE_P : CLUSTER_TYPE_E;
3545
3546 /* Set the edge weight and properties for the pset itself */
3547 bitmap_clear(pset->foreign_psets, pset_cluster_id);
3548 bitmap_clear(pset->native_psets, pset_cluster_id);
3549 bitmap_clear(pset->local_psets, pset_cluster_id);
3550 bitmap_clear(pset->remote_psets, pset_cluster_id);
3551 pset->sched_edges[pset_cluster_id].sce_edge_packed = (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0}.sce_edge_packed;
3552 sched_clutch_root_init(&pset->pset_clutch_root, pset);
3553 bitmap_set(sched_edge_available_pset_bitmask, pset_cluster_id);
3554 }
3555
3556 static thread_t
3557 sched_edge_choose_thread(
3558 processor_t processor,
3559 int priority,
3560 __unused ast_t reason)
3561 {
3562 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
3563 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3564 boolean_t choose_from_boundq = false;
3565
3566 if ((bound_runq->highq < priority) &&
3567 (clutch_pri < priority)) {
3568 return THREAD_NULL;
3569 }
3570
3571 if (bound_runq->highq >= clutch_pri) {
3572 choose_from_boundq = true;
3573 }
3574
3575 thread_t thread = THREAD_NULL;
3576 if (choose_from_boundq == false) {
3577 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3578 thread = sched_clutch_thread_highest_remove(pset_clutch_root);
3579 } else {
3580 thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
3581 }
3582 return thread;
3583 }
3584
3585 static boolean_t
3586 sched_edge_processor_queue_empty(processor_t processor)
3587 {
3588 return (sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0) &&
3589 (sched_clutch_bound_runq(processor)->count == 0);
3590 }
3591
3592 static void
3593 sched_edge_check_spill(__unused processor_set_t pset, __unused thread_t thread)
3594 {
3595 assert(thread->bound_processor == PROCESSOR_NULL);
3596 }
3597
3598 __options_decl(sched_edge_thread_yield_reason_t, uint32_t, {
3599 SCHED_EDGE_YIELD_RUNQ_NONEMPTY = 0x0,
3600 SCHED_EDGE_YIELD_FOREIGN_RUNNABLE = 0x1,
3601 SCHED_EDGE_YIELD_FOREIGN_RUNNING = 0x2,
3602 SCHED_EDGE_YIELD_STEAL_POSSIBLE = 0x3,
3603 SCHED_EDGE_YIELD_DISALLOW = 0x4,
3604 });
3605
3606 static bool
3607 sched_edge_thread_should_yield(processor_t processor, __unused thread_t thread)
3608 {
3609 if (!sched_edge_processor_queue_empty(processor) || (rt_runq_count(processor->processor_set) > 0)) {
3610 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3611 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_RUNQ_NONEMPTY);
3612 return true;
3613 }
3614
3615 /*
3616  * The yield logic should follow the same logic that steal_thread() does.
3617  * thread_should_yield() is effectively a quick check: if the current thread
3618  * gave up the CPU, is there any other thread that would execute on this
3619  * CPU? So it needs to provide the same answer as the steal_thread()/
3620  * processor idle logic.
3621 */
3622 if (sched_edge_foreign_runnable_thread_available(processor->processor_set)) {
3623 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3624 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNABLE);
3625 return true;
3626 }
3627 if (sched_edge_foreign_running_thread_available(processor->processor_set)) {
3628 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3629 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNING);
3630 return true;
3631 }
3632
3633 processor_set_t steal_candidate = sched_edge_steal_candidate(processor->processor_set);
3634 if (steal_candidate != NULL) {
3635 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3636 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_STEAL_POSSIBLE);
3637 return true;
3638 }
3639
3640 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE, thread_tid(thread), processor->processor_set->pset_cluster_id,
3641 0, SCHED_EDGE_YIELD_DISALLOW);
3642 return false;
3643 }
3644
3645 static ast_t
3646 sched_edge_processor_csw_check(processor_t processor)
3647 {
3648 boolean_t has_higher;
3649 int pri;
3650
3651 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
3652 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3653
3654 assert(processor->active_thread != NULL);
3655
3656 pri = MAX(clutch_pri, bound_runq->highq);
3657
3658 if (processor->first_timeslice) {
3659 has_higher = (pri > processor->current_pri);
3660 } else {
3661 has_higher = (pri >= processor->current_pri);
3662 }
3663
3664 if (has_higher) {
3665 if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
3666 return AST_PREEMPT | AST_URGENT;
3667 }
3668
3669 if (bound_runq->urgency > 0) {
3670 return AST_PREEMPT | AST_URGENT;
3671 }
3672
3673 return AST_PREEMPT;
3674 }
3675
3676 return AST_NONE;
3677 }
3678
3679 static boolean_t
3680 sched_edge_processor_queue_has_priority(processor_t processor,
3681 int priority,
3682 boolean_t gte)
3683 {
3684 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3685
3686 int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);
3687 if (gte) {
3688 return qpri >= priority;
3689 } else {
3690 return qpri > priority;
3691 }
3692 }
3693
3694 static void
3695 sched_edge_processor_queue_shutdown(processor_t processor)
3696 {
3697 processor_set_t pset = processor->processor_set;
3698 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3699 thread_t thread;
3700 queue_head_t tqueue;
3701
3702 /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
3703 if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
3704 pset_unlock(pset);
3705 return;
3706 }
3707
3708 bitmap_clear(sched_edge_available_pset_bitmask, pset->pset_cluster_id);
3709
3710 queue_init(&tqueue);
3711 while (sched_clutch_root_count(pset_clutch_root) > 0) {
3712 thread = sched_clutch_thread_highest_remove(pset_clutch_root);
3713 enqueue_tail(&tqueue, &thread->runq_links);
3714 }
3715 pset_unlock(pset);
3716
3717 qe_foreach_element_safe(thread, &tqueue, runq_links) {
3718 remqueue(&thread->runq_links);
3719 thread_lock(thread);
3720 thread_setrun(thread, SCHED_TAILQ);
3721 thread_unlock(thread);
3722 }
3723 }
3724
3725 /*
3726 * sched_edge_cluster_load_metric()
3727 *
3728 * The load metric for a cluster is a measure of the average scheduling latency
3729 * experienced by threads on that cluster. It is a product of the average number
3730 * of threads in the runqueue and the average execution time for threads. The metric
3731 * has special values in the following cases:
3732 * - UINT32_MAX: If the cluster is not available for scheduling, its load is set to
3733  *   the maximum value to prevent any threads from migrating to this cluster.
3734 * - 0: If there are idle CPUs in the cluster or an empty runqueue; this allows threads
3735 * to be spread across the platform quickly for ncpu wide workloads.
3736 */
3737 static uint32_t
3738 sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket)
3739 {
3740 if (sched_edge_pset_available(pset) == false) {
3741 return UINT32_MAX;
3742 }
3743 return (uint32_t)sched_get_pset_load_average(pset, sched_bucket);
3744 }
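/*
 * Illustrative sketch (not part of the original source): comparing the
 * per-cluster load metric above to find the least loaded candidate for a
 * given scheduling bucket. The helper name and the "pick the minimum"
 * policy are assumptions for illustration; the real migration logic also
 * weighs edge configuration and preferred cluster recommendations.
 */
#if 0
static processor_set_t
example_least_loaded_pset(sched_bucket_t bucket)
{
	processor_set_t best_pset = NULL;
	uint32_t best_load = UINT32_MAX;
	for (int cluster_id = 0; cluster_id < sched_edge_max_clusters; cluster_id++) {
		/* Unavailable clusters report UINT32_MAX and are never preferred */
		uint32_t load = sched_edge_cluster_load_metric(pset_array[cluster_id], bucket);
		if (load < best_load) {
			best_load = load;
			best_pset = pset_array[cluster_id];
		}
	}
	return best_pset;
}
#endif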
3745
3746 /*
3747 *
3748 * Edge Scheduler Steal/Rebalance logic
3749 *
3750 * = Generic scheduler logic =
3751 *
3752 * The SCHED(steal_thread) scheduler callout is invoked when the processor does not
3753 * find any thread for execution in its runqueue. The aim of the steal operation
3754 * is to find other threads running/runnable in other clusters which should be
3755 * executed here.
3756 *
3757 * If the steal callout does not return a thread, the thread_select() logic calls
3758 * SCHED(processor_balance) callout which is supposed to IPI other CPUs to rebalance
3759 * threads and idle out the current CPU.
3760 *
3761 * = SCHED(steal_thread) for Edge Scheduler =
3762 *
3763 * The edge scheduler hooks into sched_edge_processor_idle() for steal_thread. This
3764 * routine tries to do the following operations in order:
3765  * (1) Find foreign runnable threads in non-native cluster
3766 * runqueues (sched_edge_foreign_runnable_thread_remove())
3767 * (2) Check if foreign threads are running on the non-native
3768 * clusters (sched_edge_foreign_running_thread_available())
3769 * - If yes, return THREAD_NULL for the steal callout and
3770 * perform rebalancing as part of SCHED(processor_balance) i.e. sched_edge_balance()
3771 * (3) Steal a thread from another cluster based on edge
3772 * weights (sched_edge_steal_thread())
3773 *
3774 * = SCHED(processor_balance) for Edge Scheduler =
3775 *
3776 * If steal_thread did not return a thread for the processor, use
3777 * sched_edge_balance() to rebalance foreign running threads and idle out this CPU.
3778 *
3779 * = Clutch Bucket Preferred Cluster Overrides =
3780 *
3781 * Since these operations (just like thread migrations on enqueue)
3782 * move threads across clusters, they need support for handling clutch
3783 * bucket group level preferred cluster recommendations.
3784 * For (1), a clutch bucket will be in the foreign runnable queue based
3785 * on the clutch bucket group preferred cluster.
3786 * For (2), the running thread will set the bit on the processor based
3787 * on its preferred cluster type.
3788 * For (3), the edge configuration would prevent threads from being stolen
3789 * in the wrong direction.
3790 *
3791 * = SCHED(thread_should_yield) =
3792 * The thread_should_yield() logic needs to have the same logic as sched_edge_processor_idle()
3793 * since that is expecting the same answer as if thread_select() was called on a core
3794 * with an empty runqueue.
3795 */
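/*
 * Illustrative sketch (not part of the original source): the ordering
 * described above for the idle/steal path, expressed as simplified
 * pseudocode. The real implementation is sched_edge_processor_idle(); this
 * sketch mirrors the comment only, omits locking, tracing and load checks,
 * and assumes sched_edge_steal_thread() takes just the pset.
 */
#if 0
static thread_t
example_edge_idle_path(processor_set_t pset)
{
	/* (1) Pull a runnable foreign thread queued on a non-native cluster */
	thread_t thread = sched_edge_foreign_runnable_thread_remove(pset, mach_absolute_time());
	if (thread != THREAD_NULL) {
		return thread;
	}
	/*
	 * (2) If foreign threads are running on non-native clusters, return
	 * THREAD_NULL so that sched_edge_balance() can rebalance them via IPI
	 */
	if (sched_edge_foreign_running_thread_available(pset)) {
		return THREAD_NULL;
	}
	/* (3) Otherwise attempt to steal from a cluster whose edge allows it */
	return sched_edge_steal_thread(pset);
}
#endif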
3796
3797 static bool
3798 sched_edge_steal_thread_enabled(__unused processor_set_t pset)
3799 {
3800 /*
3801 	 * For the edge scheduler, the gating for steal is done by sched_edge_steal_candidate()
3802 */
3803 return true;
3804 }
3805
3806 static processor_set_t
3807 sched_edge_steal_candidate(processor_set_t pset)
3808 {
3809 uint32_t dst_cluster_id = pset->pset_cluster_id;
3810 for (int cluster_id = 0; cluster_id < sched_edge_max_clusters; cluster_id++) {
3811 processor_set_t candidate_pset = pset_array[cluster_id];
3812 if (cluster_id == dst_cluster_id) {
3813 continue;
3814 }
3815 if (candidate_pset == NULL) {
3816 continue;
3817 }
3818 sched_clutch_edge *incoming_edge = &pset_array[cluster_id]->sched_edges[dst_cluster_id];
3819 if (incoming_edge->sce_steal_allowed && (bitmap_lsb_first(candidate_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) != -1)) {
3820 return candidate_pset;
3821 }
3822 }
3823 return NULL;
3824 }
3825
3826 static boolean_t
3827 sched_edge_foreign_runnable_thread_available(processor_set_t pset)
3828 {
3829 /* Find all the clusters that are foreign for this cluster */
3830 bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
3831 for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
3832 /*
3833 * For each cluster, see if there are any runnable foreign threads.
3834 * This check is currently being done without the pset lock to make it cheap for
3835 * the common case.
3836 */
3837 processor_set_t target_pset = pset_array[cluster];
3838 if (sched_edge_pset_available(target_pset) == false) {
3839 continue;
3840 }
3841
3842 if (!sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
3843 return true;
3844 }
3845 }
3846 return false;
3847 }
3848
3849 static thread_t
3850 sched_edge_foreign_runnable_thread_remove(processor_set_t pset, uint64_t ctime)
3851 {
3852 thread_t thread = THREAD_NULL;
3853
3854 /* Find all the clusters that are foreign for this cluster */
3855 bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
3856 for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
3857 /*
3858 * For each cluster, see if there are any runnable foreign threads.
3859 * This check is currently being done without the pset lock to make it cheap for
3860 * the common case.
3861 */
3862 processor_set_t target_pset = pset_array[cluster];
3863 if (sched_edge_pset_available(target_pset) == false) {
3864 continue;
3865 }
3866
3867 if (sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
3868 continue;
3869 }
3870 /*
3871 * Looks like there are runnable foreign threads in the hierarchy; lock the pset
3872 * and get the highest priority thread.
3873 */
3874 pset_lock(target_pset);
3875 if (sched_edge_pset_available(target_pset)) {
3876 thread = sched_clutch_root_highest_foreign_thread_remove(&target_pset->pset_clutch_root);
3877 sched_update_pset_load_average(target_pset, ctime);
3878 }
3879 pset_unlock(target_pset);
3880
3881 /*
3882 * Edge Scheduler Optimization
3883 *
3884 * The current implementation immediately returns as soon as it finds a foreign
3885 * runnable thread. This could be enhanced to look at highest priority threads
3886 * from all foreign clusters and pick the highest amongst them. That would need
3887 * some form of global state across psets to make that kind of a check cheap.
3888 */
3889 if (thread != THREAD_NULL) {
3890 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNABLE) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, target_pset->pset_cluster_id, 0);
3891 break;
3892 }
3893 /* Looks like the thread escaped after the check but before the pset lock was taken; continue the search */
3894 }
3895
3896 return thread;
3897 }
3898
3899 /*
3900 * sched_edge_cpu_running_foreign_shared_rsrc_available()
3901 *
3902 * Routine to determine if the thread running on a CPU is a shared resource thread
3903 * and can be rebalanced to the cluster with an idle CPU. It is used to determine if
3904 * a CPU going idle on a pset should rebalance a running shared resource heavy thread
3905 * from another non-ideal cluster, based on the idle pset's shared resource load.
3906 */
3907 static boolean_t
3908 sched_edge_cpu_running_foreign_shared_rsrc_available(processor_set_t target_pset, int foreign_cpu, processor_set_t idle_pset)
3909 {
3910 boolean_t idle_pset_shared_rsrc_rr_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_RR);
3911 if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_RR], foreign_cpu) && !idle_pset_shared_rsrc_rr_idle) {
3912 return false;
3913 }
3914
3915 boolean_t idle_pset_shared_rsrc_biu_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST);
3916 if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST], foreign_cpu) && !idle_pset_shared_rsrc_biu_idle) {
3917 return false;
3918 }
3919 return true;
3920 }
3921
3922 static boolean_t
3923 sched_edge_foreign_running_thread_available(processor_set_t pset)
3924 {
3925 bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
3926 int cluster = -1;
3927 while ((cluster = sched_edge_iterate_clusters_ordered(pset, foreign_pset_bitmap[0], cluster)) != -1) {
3928 /* Skip the pset if it's not schedulable */
3929 processor_set_t target_pset = pset_array[cluster];
3930 if (sched_edge_pset_available(target_pset) == false) {
3931 continue;
3932 }
3933
3934 uint64_t running_foreign_bitmap = target_pset->cpu_state_map[PROCESSOR_RUNNING] & target_pset->cpu_running_foreign;
3935 for (int cpu_foreign = bit_first(running_foreign_bitmap); cpu_foreign >= 0; cpu_foreign = bit_next(running_foreign_bitmap, cpu_foreign)) {
3936 if (!sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpu_foreign, pset)) {
3937 continue;
3938 }
3939 return true;
3940 }
3941 }
3942 return false;
3943 }
3944
3945 static bool
3946 sched_edge_steal_possible(processor_set_t idle_pset, processor_set_t candidate_pset)
3947 {
3948 int highest_runnable_bucket = bitmap_lsb_first(candidate_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
3949 if (highest_runnable_bucket == -1) {
3950 /* Candidate cluster runq is empty */
3951 return false;
3952 }
3953
3954 if (idle_pset->pset_cluster_type == candidate_pset->pset_cluster_type) {
3955 /* Always allow stealing from homogeneous clusters */
3956 return true;
3957 } else {
3958 /* Use the load metrics for highest runnable bucket since that would be stolen next */
3959 int32_t candidate_load = sched_edge_cluster_load_metric(candidate_pset, (sched_bucket_t)highest_runnable_bucket);
3960 return candidate_load > 0;
3961 }
3962 }
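/*
 * Illustrative note for the heterogeneous case above: an idle E-cluster CPU
 * examining a P cluster only steals when sched_edge_cluster_load_metric() for
 * the candidate's highest runnable unbound bucket is non-zero, i.e. the
 * candidate reports enough runnable load at that QoS to be worth pulling a
 * thread across cluster types; a homogeneous candidate with any unbound
 * runnable thread is always eligible.
 */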
3963
3964 static thread_t
3965 sched_edge_steal_thread(processor_set_t pset, uint64_t candidate_pset_bitmap)
3966 {
3967 thread_t thread = THREAD_NULL;
3968
3969 /*
3970 * Edge Scheduler Optimization
3971 *
3972 * The logic today bails as soon as it finds a cluster where the cluster load is
3973 * greater than the edge weight. Maybe it should have a more advanced version
3974 * which looks for the maximum delta etc.
3975 */
3976 int cluster_id = -1;
3977 while ((cluster_id = sched_edge_iterate_clusters_ordered(pset, candidate_pset_bitmap, cluster_id)) != -1) {
3978 processor_set_t steal_from_pset = pset_array[cluster_id];
3979 if (steal_from_pset == NULL) {
3980 continue;
3981 }
3982 sched_clutch_edge *incoming_edge = &pset_array[cluster_id]->sched_edges[pset->pset_cluster_id];
3983 if (incoming_edge->sce_steal_allowed == false) {
3984 continue;
3985 }
3986 pset_lock(steal_from_pset);
3987 if (sched_edge_steal_possible(pset, steal_from_pset)) {
3988 uint64_t current_timestamp = mach_absolute_time();
3989 sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(&steal_from_pset->pset_clutch_root, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY);
3990 thread = sched_clutch_thread_unbound_lookup(&steal_from_pset->pset_clutch_root, root_bucket);
3991 sched_clutch_thread_remove(&steal_from_pset->pset_clutch_root, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
3992 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STEAL) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, steal_from_pset->pset_cluster_id, 0);
3993 sched_update_pset_load_average(steal_from_pset, current_timestamp);
3994 }
3995 pset_unlock(steal_from_pset);
3996 if (thread != THREAD_NULL) {
3997 break;
3998 }
3999 }
4000 return thread;
4001 }
4002
4003 /*
4004 * sched_edge_processor_idle()
4005 *
4006 * The routine is the implementation for steal_thread() for the Edge scheduler.
4007 */
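/*
 * Search order implemented below (summary of the inline comments): (1) pull
 * the highest priority runnable foreign thread from any non-native cluster,
 * (2) steal from native clusters, (3) if a foreign thread is running on some
 * cluster, return THREAD_NULL so that sched_edge_balance() can rebalance it,
 * and (4) finally steal from any cluster, native or foreign, based on edge
 * weights and loads.
 */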
4008 static thread_t
4009 sched_edge_processor_idle(processor_set_t pset)
4010 {
4011 thread_t thread = THREAD_NULL;
4012
4013 uint64_t ctime = mach_absolute_time();
4014
4015 processor_t processor = current_processor();
4016 bit_clear(pset->pending_spill_cpu_mask, processor->cpu_id);
4017
4018 /* Each of the operations acquire the lock for the pset they target */
4019 pset_unlock(pset);
4020
4021 /* Find highest priority runnable thread on all non-native clusters */
4022 thread = sched_edge_foreign_runnable_thread_remove(pset, ctime);
4023 if (thread != THREAD_NULL) {
4024 return thread;
4025 }
4026
4027 /* Find highest priority runnable thread on all native clusters */
4028 thread = sched_edge_steal_thread(pset, pset->native_psets[0]);
4029 if (thread != THREAD_NULL) {
4030 return thread;
4031 }
4032
4033 /* Find foreign running threads to rebalance; the actual rebalance is done in sched_edge_balance() */
4034 boolean_t rebalance_needed = sched_edge_foreign_running_thread_available(pset);
4035 if (rebalance_needed) {
4036 return THREAD_NULL;
4037 }
4038
4039 /* No foreign threads found; find a thread to steal from all clusters based on weights/loads etc. */
4040 thread = sched_edge_steal_thread(pset, pset->native_psets[0] | pset->foreign_psets[0]);
4041 return thread;
4042 }
4043
4044 /* Return true if this shared resource thread has a better cluster to run on */
4045 static bool
4046 sched_edge_shared_rsrc_migrate_possible(thread_t thread, processor_set_t preferred_pset, processor_set_t current_pset)
4047 {
4048 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4049 uint64_t current_pset_load = sched_pset_cluster_shared_rsrc_load(current_pset, shared_rsrc_type);
4050 /*
4051 * Adjust the current pset load to discount the current thread only if the current pset is of the preferred pset type. This allows the
4052 * scheduler to rebalance threads from a non-preferred cluster to an idle cluster of the preferred type.
4053 *
4054 * Edge Scheduler Optimization
4055 * For multi-cluster machines, it might be useful to enhance this mechanism to migrate between clusters of the preferred type.
4056 */
4057 uint64_t current_pset_adjusted_load = (current_pset->pset_type != preferred_pset->pset_type) ? current_pset_load : (current_pset_load - 1);
4058
4059 uint64_t eligible_pset_bitmask = 0;
4060 if (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST) {
4061 /*
4062 * For the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy, the load balancing occurs
4063 * only among clusters native to the preferred cluster.
4064 */
4065 eligible_pset_bitmask = preferred_pset->native_psets[0];
4066 bit_set(eligible_pset_bitmask, preferred_pset->pset_cluster_id);
4067 } else {
4068 /* For EDGE_SHARED_RSRC_SCHED_POLICY_RR, the load balancing happens among all clusters */
4069 eligible_pset_bitmask = sched_edge_available_pset_bitmask[0];
4070 }
4071
4072 /* For each eligible cluster check if there is an under-utilized cluster; return true if there is */
4073 for (int cluster_id = bit_first(eligible_pset_bitmask); cluster_id >= 0; cluster_id = bit_next(eligible_pset_bitmask, cluster_id)) {
4074 if (cluster_id == current_pset->pset_cluster_id) {
4075 continue;
4076 }
4077 uint64_t cluster_load = sched_pset_cluster_shared_rsrc_load(pset_array[cluster_id], shared_rsrc_type);
4078 if (current_pset_adjusted_load > cluster_load) {
4079 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHARED_RSRC_MIGRATE) | DBG_FUNC_NONE, current_pset_load, current_pset->pset_cluster_id, cluster_load, cluster_id);
4080 return true;
4081 }
4082 }
4083 return false;
4084 }
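/*
 * Illustrative example (hypothetical load values): if the current pset is of
 * the preferred type and is running 3 shared resource threads (adjusted load 2
 * after discounting this thread) while an eligible peer cluster is running
 * only 1, the routine returns true and the thread may migrate. If the current
 * pset is of a non-preferred type with load 1 (no discount) and an eligible
 * cluster of the preferred type is idle (load 0), the routine also returns
 * true, rebalancing the thread toward the preferred cluster type.
 */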
4085
4086 /* Return true if this thread should not continue running on this processor */
4087 static bool
4088 sched_edge_thread_avoid_processor(processor_t processor, thread_t thread)
4089 {
4090 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
4091 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
4092 return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
4093 }
4094 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
4095 if (processor->processor_set->pset_type != preferred_pset->pset_type) {
4096 return true;
4097 }
4098 return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
4099 }
4100
4101 /*
4102 * For long running parallel workloads, it is important to rebalance threads across
4103 * E/P clusters so that they make equal forward progress. This is achieved through
4104 * threads expiring their quantum on the non-preferred cluster type and explicitly
4105 * rebalancing to the preferred cluster runqueue.
4106 */
4107 if (processor->processor_set->pset_type != preferred_pset->pset_type) {
4108 return true;
4109 }
4110 /* If the preferred pset for the thread is now idle, try and migrate thread to that cluster */
4111 if ((processor->processor_set != preferred_pset) &&
4112 (sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket) == 0)) {
4113 return true;
4114 }
4115 return false;
4116 }
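/*
 * Example of the non-shared-resource checks above: a thread whose preferred
 * cluster is a P cluster but which is currently running on an E-cluster CPU
 * returns true (it should be rebalanced), while a thread already running on a
 * cluster of the preferred type but not the preferred cluster itself returns
 * true only once the preferred cluster's load metric for the thread's bucket
 * drops to zero.
 */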
4117
4118 static void
4119 sched_edge_balance(__unused processor_t cprocessor, processor_set_t cpset)
4120 {
4121 assert(cprocessor == current_processor());
4122 pset_unlock(cpset);
4123
4124 uint64_t ast_processor_map = 0;
4125 sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
4126
4127 bitmap_t *foreign_pset_bitmap = cpset->foreign_psets;
4128 for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
4129 /* Skip the pset if it's not schedulable */
4130 processor_set_t target_pset = pset_array[cluster];
4131 if (sched_edge_pset_available(target_pset) == false) {
4132 continue;
4133 }
4134
4135 pset_lock(target_pset);
4136 uint64_t cpu_running_foreign_map = (target_pset->cpu_running_foreign & target_pset->cpu_state_map[PROCESSOR_RUNNING]);
4137 for (int cpuid = lsb_first(cpu_running_foreign_map); cpuid >= 0; cpuid = lsb_next(cpu_running_foreign_map, cpuid)) {
4138 if (!sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpuid, cpset)) {
4139 continue;
4140 }
4141 processor_t target_cpu = processor_array[cpuid];
4142 ipi_type[target_cpu->cpu_id] = sched_ipi_action(target_cpu, NULL, SCHED_IPI_EVENT_REBALANCE);
4143 if (ipi_type[cpuid] != SCHED_IPI_NONE) {
4144 bit_set(ast_processor_map, cpuid);
4145 }
4146 }
4147 pset_unlock(target_pset);
4148 }
4149
4150 for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
4151 processor_t ast_processor = processor_array[cpuid];
4152 sched_ipi_perform(ast_processor, ipi_type[cpuid]);
4153 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNING) | DBG_FUNC_NONE, 0, cprocessor->cpu_id, cpuid, 0);
4154 }
4155 }
4156
4157 /*
4158 * sched_edge_migration_check()
4159 *
4160 * Routine to evaluate an edge between two clusters to decide if migration is possible
4161 * across that edge. Also updates the selected_pset and max_edge_delta out parameters
4162 * accordingly. The return value indicates if the invoking routine should short circuit
4163 * the search, since an ideal candidate has been found. The routine looks at the regular
4164 * edges and cluster loads or the shared resource loads based on the type of thread.
4165 */
4166 static bool
4167 sched_edge_migration_check(uint32_t cluster_id, processor_set_t preferred_pset,
4168 uint32_t preferred_cluster_load, thread_t thread, processor_set_t *selected_pset, uint32_t *max_edge_delta)
4169 {
4170 uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
4171 cluster_type_t preferred_cluster_type = pset_type_for_id(preferred_cluster_id);
4172 processor_set_t dst_pset = pset_array[cluster_id];
4173 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4174 bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
4175
4176 if (cluster_id == preferred_cluster_id) {
4177 return false;
4178 }
4179
4180 if (dst_pset == NULL) {
4181 return false;
4182 }
4183
4184 sched_clutch_edge *edge = preferred_pset->sched_edges;
4185 if (edge[cluster_id].sce_migration_allowed == false) {
4186 return false;
4187 }
4188 uint32_t dst_load = shared_rsrc_thread ? (uint32_t)sched_pset_cluster_shared_rsrc_load(dst_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(dst_pset, thread->th_sched_bucket);
4189 if (dst_load == 0) {
4190 /* The candidate cluster is idle; select it immediately for execution */
4191 *selected_pset = dst_pset;
4192 *max_edge_delta = preferred_cluster_load;
4193 return true;
4194 }
4195
4196 uint32_t edge_delta = 0;
4197 if (dst_load > preferred_cluster_load) {
4198 return false;
4199 }
4200 edge_delta = preferred_cluster_load - dst_load;
4201 if (!shared_rsrc_thread && (edge_delta < edge[cluster_id].sce_migration_weight)) {
4202 /*
4203 * For non shared resource threads, use the edge migration weight to decide if
4204 * this cluster is over-committed at the QoS level of this thread.
4205 */
4206 return false;
4207 }
4208
4209 if (edge_delta < *max_edge_delta) {
4210 return false;
4211 }
4212 if (edge_delta == *max_edge_delta) {
4213 /* If the edge delta is the same as the max delta, make sure a homogeneous cluster is picked */
4214 boolean_t selected_homogeneous = (pset_type_for_id((*selected_pset)->pset_cluster_id) == preferred_cluster_type);
4215 boolean_t candidate_homogeneous = (pset_type_for_id(dst_pset->pset_cluster_id) == preferred_cluster_type);
4216 if (selected_homogeneous || !candidate_homogeneous) {
4217 return false;
4218 }
4219 }
4220 /* dst_pset seems to be the best candidate for migration; however other candidates should still be evaluated */
4221 *max_edge_delta = edge_delta;
4222 *selected_pset = dst_pset;
4223 return false;
4224 }
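/*
 * Worked example (hypothetical loads, default P->E migration weight of 64 from
 * sched_edge_cpu_init_completed()): for a non-shared-resource thread whose
 * preferred P cluster has a load metric of 80 in the thread's bucket, an
 * E-cluster candidate with load 10 yields an edge delta of 70 >= 64 and becomes
 * the current best candidate; a candidate with load 30 (delta 50 < 64) is
 * rejected as not sufficiently less loaded; and an idle candidate (load 0) is
 * selected immediately and terminates the search.
 */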
4225
4226 /*
4227 * sched_edge_iterate_clusters_ordered()
4228 *
4229 * Routine to iterate clusters in die local order. For multi-die machines,
4230 * the routine ensures that the candidate clusters on the same die as the
4231 * passed in pset are returned before the remote die clusters. This should
4232 * be used in all places where cluster selection in die order matters.
4233 */
4234
4235 static int
4236 sched_edge_iterate_clusters_ordered(processor_set_t starting_pset, uint64_t candidate_map, int previous_cluster)
4237 {
4238 int cluster_id = -1;
4239
4240 uint64_t local_candidate_map = starting_pset->local_psets[0] & candidate_map;
4241 uint64_t remote_candidate_map = starting_pset->remote_psets[0] & candidate_map;
4242
4243 if (previous_cluster == -1) {
4244 /* previous_cluster == -1 indicates the initial condition */
4245 cluster_id = bit_first(local_candidate_map);
4246 if (cluster_id != -1) {
4247 return cluster_id;
4248 }
4249 return bit_first(remote_candidate_map);
4250 } else {
4251 /*
4252 * After the initial condition, the routine attempts to return a
4253 * cluster in the previous_cluster's locality. If none is available,
4254 * it looks at remote clusters.
4255 */
4256 if (bit_test(local_candidate_map, previous_cluster)) {
4257 cluster_id = bit_next(local_candidate_map, previous_cluster);
4258 if (cluster_id != -1) {
4259 return cluster_id;
4260 } else {
4261 return bit_first(remote_candidate_map);
4262 }
4263 }
4264 return bit_next(remote_candidate_map, previous_cluster);
4265 }
4266 }
4267
4268 /*
4269 * sched_edge_migrate_edges_evaluate()
4270 *
4271 * Routine to find the candidate for thread migration based on edge weights.
4272 *
4273 * Returns the most ideal cluster for execution of this thread based on outgoing edges of the preferred pset. Can
4274 * return preferred_pset if it's the most ideal destination for this thread.
4275 */
4276 static processor_set_t
4277 sched_edge_migrate_edges_evaluate(processor_set_t preferred_pset, uint32_t preferred_cluster_load, thread_t thread)
4278 {
4279 processor_set_t selected_pset = preferred_pset;
4280 uint32_t max_edge_delta = 0;
4281 bool search_complete = false;
4282 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4283 bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
4284
4285 bitmap_t *foreign_pset_bitmap = preferred_pset->foreign_psets;
4286 bitmap_t *native_pset_bitmap = preferred_pset->native_psets;
4287 /* Always start the search with the native clusters */
4288 int cluster_id = -1;
4289 while ((cluster_id = sched_edge_iterate_clusters_ordered(preferred_pset, native_pset_bitmap[0], cluster_id)) != -1) {
4290 search_complete = sched_edge_migration_check(cluster_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
4291 if (search_complete) {
4292 break;
4293 }
4294 }
4295
4296 if (search_complete) {
4297 return selected_pset;
4298 }
4299
4300 if (shared_rsrc_thread && (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST)) {
4301 /*
4302 * If the shared resource scheduling policy is EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST, the scheduler tries
4303 * to fill up the preferred cluster and its homogeneous peers first.
4304 */
4305
4306 if (max_edge_delta > 0) {
4307 /*
4308 * This means that there is a peer cluster of the same type as the preferred cluster (since the code
4309 * above only looks at the native_psets) which has fewer threads of the shared resource type than
4310 * the preferred cluster. This indicates that there is capacity on a native cluster where this thread
4311 * should be placed.
4312 */
4313 return selected_pset;
4314 }
4315 /*
4316 * Indicates that all peer native clusters are at the same shared resource usage; check if the preferred cluster has
4317 * any more capacity left.
4318 */
4319 if (sched_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) < pset_available_cpu_count(preferred_pset)) {
4320 return preferred_pset;
4321 }
4322 /*
4323 * Looks like the preferred cluster and all its native peers are full with shared resource threads; need to start looking
4324 * at non-native clusters for capacity.
4325 */
4326 }
4327
4328 /* Now look at the non-native clusters */
4329 cluster_id = -1;
4330 while ((cluster_id = sched_edge_iterate_clusters_ordered(preferred_pset, foreign_pset_bitmap[0], cluster_id)) != -1) {
4331 search_complete = sched_edge_migration_check(cluster_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
4332 if (search_complete) {
4333 break;
4334 }
4335 }
4336 return selected_pset;
4337 }
4338
4339 /*
4340 * sched_edge_candidate_alternative()
4341 *
4342 * Routine to find an alternative cluster from candidate_cluster_bitmap since the
4343 * selected_pset is not available for execution. The logic tries to prefer homogeneous
4344 * clusters over heterogeneous clusters since this is typically used in thread
4345 * placement decisions.
4346 */
4347 _Static_assert(MAX_PSETS <= 64, "Unable to fit maximum number of psets in uint64_t bitmask");
4348 static processor_set_t
4349 sched_edge_candidate_alternative(processor_set_t selected_pset, uint64_t candidate_cluster_bitmap)
4350 {
4351 /*
4352 * It looks like the most ideal pset is not available for scheduling currently.
4353 * Try to find a homogeneous cluster that is still available.
4354 */
4355 uint64_t available_native_clusters = selected_pset->native_psets[0] & candidate_cluster_bitmap;
4356 int available_cluster_id = lsb_first(available_native_clusters);
4357 if (available_cluster_id == -1) {
4358 /* Looks like none of the homogeneous clusters are available; pick the first available cluster */
4359 available_cluster_id = bit_first(candidate_cluster_bitmap);
4360 }
4361 assert(available_cluster_id != -1);
4362 return pset_array[available_cluster_id];
4363 }
4364
4365 /*
4366 * sched_edge_switch_pset_lock()
4367 *
4368 * Helper routine for sched_edge_migrate_candidate() which switches pset locks (if needed) based on
4369 * switch_pset_locks.
4370 * Returns the newly locked pset after the switch.
4371 */
4372 static processor_set_t
4373 sched_edge_switch_pset_lock(processor_set_t selected_pset, processor_set_t locked_pset, bool switch_pset_locks)
4374 {
4375 if (!switch_pset_locks) {
4376 return locked_pset;
4377 }
4378 if (selected_pset != locked_pset) {
4379 pset_unlock(locked_pset);
4380 pset_lock(selected_pset);
4381 return selected_pset;
4382 } else {
4383 return locked_pset;
4384 }
4385 }
4386
4387 /*
4388 * sched_edge_amp_rebalance_pset()
4389 *
4390 * Routine to decide where a thread which is eligible for AMP rebalance (i.e.
4391 * has executed on non-preferred cluster type for a while) should be enqueued.
4392 * The algorithm maintains a history of AMP rebalance decisions on the clutch
4393 * bucket group of the workload and round-robins between clusters to ensure
4394 * that all threads get a chance on the performance cores and make equal
4395 * progress.
4396 */
4397 static processor_set_t
4398 sched_edge_amp_rebalance_pset(processor_set_t preferred_pset, thread_t thread)
4399 {
4400 sched_clutch_t clutch = sched_clutch_for_thread(thread);
4401 sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[thread->th_sched_bucket];
4402
4403 uint32_t last_chosen_cluster, new_chosen_cluster;
4404
4405 /* Only AMP rebalance within clusters native to the preferred cluster */
4406 uint64_t eligible_pset_bitmask = preferred_pset->native_psets[0];
4407 /* Preferred cluster is also eligible for rebalancing */
4408 bit_set(eligible_pset_bitmask, preferred_pset->pset_cluster_id);
4409 /* Atomically update the AMP rebalance cluster for the clutch bucket group */
4410 os_atomic_rmw_loop(&clutch_bucket_group->scbg_amp_rebalance_last_chosen, last_chosen_cluster, new_chosen_cluster, relaxed, {
4411 if (last_chosen_cluster == UINT32_MAX) {
4412 new_chosen_cluster = preferred_pset->pset_cluster_id;
4413 } else {
4414 new_chosen_cluster = lsb_next(eligible_pset_bitmask, last_chosen_cluster);
4415 if (new_chosen_cluster == -1) {
4416 /* Rotate to the start of the eligible bitmask */
4417 new_chosen_cluster = lsb_first(eligible_pset_bitmask);
4418 }
4419 }
4420 });
4421 return pset_array[new_chosen_cluster];
4422 }
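/*
 * Illustrative example (hypothetical cluster IDs, assuming the clutch bucket
 * group starts with the UINT32_MAX sentinel): with a preferred P cluster 1 and
 * a native peer P cluster 2, successive AMP rebalances of the same clutch
 * bucket group pick clusters 1, 2, 1, 2, ... so that rebalanced threads
 * round-robin across the eligible performance clusters.
 */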
4423
4424 /*
4425 * sched_edge_migrate_candidate()
4426 *
4427 * Routine to find an appropriate cluster for scheduling a thread. The routine looks at the properties of
4428 * the thread and the preferred cluster to determine the best available pset for scheduling.
4429 *
4430 * The switch_pset_locks parameter defines whether the routine should switch pset locks to provide an
4431 * accurate scheduling decision. This mode is typically used when choosing a pset for scheduling a thread since the
4432 * decision has to be synchronized with another CPU changing the recommendation of clusters available
4433 * on the system. If this parameter is set to false, this routine returns the best effort indication of
4434 * the cluster the thread should be scheduled on. It is typically used in fast path contexts (such as
4435 * SCHED(thread_avoid_processor)) to determine if there is a possibility of scheduling this thread on a
4436 * more appropriate cluster.
4437 *
4438 * Routine returns the most ideal cluster for scheduling. If switch_pset_locks is set, it ensures that the
4439 * resultant pset lock is held.
4440 */
4441 static processor_set_t
4442 sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks)
4443 {
4444 __kdebug_only uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
4445 processor_set_t selected_pset = preferred_pset;
4446 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4447 bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
4448
4449 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
4450 /* For bound threads, always recommend the cluster it's bound to */
4451 selected_pset = pset_array[sched_edge_thread_bound_cluster_id(thread)];
4452 locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
4453 if (sched_edge_pset_available(selected_pset) || (SCHED_CLUTCH_THREAD_CLUSTER_BOUND_SOFT(thread) == false)) {
4454 /*
4455 * If the bound cluster is not available, check if the thread is soft bound. For soft bound threads,
4456 * fall through to the regular cluster selection logic which handles unavailable clusters
4457 * appropriately. If the thread is hard bound, then return the bound cluster always.
4458 */
4459 return selected_pset;
4460 }
4461 }
4462
4463 uint64_t candidate_cluster_bitmap = mask(sched_edge_max_clusters);
4464 #if DEVELOPMENT || DEBUG
4465 extern int enable_task_set_cluster_type;
4466 task_t task = get_threadtask(thread);
4467 if (enable_task_set_cluster_type && (task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) {
4468 processor_set_t pset_hint = task->pset_hint;
4469 if (pset_hint && selected_pset->pset_cluster_type != pset_hint->pset_cluster_type) {
4470 selected_pset = pset_hint;
4471 goto migrate_candidate_available_check;
4472 }
4473 }
4474 #endif
4475
4476 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
4477 /* For realtime threads, try and schedule them on the preferred pset always */
4478 goto migrate_candidate_available_check;
4479 }
4480
4481 uint32_t preferred_cluster_load = shared_rsrc_thread ? (uint32_t)sched_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket);
4482 if (preferred_cluster_load == 0) {
4483 goto migrate_candidate_available_check;
4484 }
4485
4486 /*
4487 * If a thread is being rebalanced for achieving equal progress of parallel workloads,
4488 * it needs to end up on the preferred runqueue. This mechanism should only be used for
4489 * threads which have been previously migrated to the non-preferred cluster type.
4490 *
4491 * The AMP rebalancing mechanism is available for regular threads or shared resource
4492 * threads with the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy.
4493 */
4494 bool amp_rebalance_eligible = (!shared_rsrc_thread) || (shared_rsrc_thread && (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST));
4495 if (amp_rebalance_eligible) {
4496 boolean_t amp_rebalance = (thread->reason & (AST_REBALANCE | AST_QUANTUM)) == (AST_REBALANCE | AST_QUANTUM);
4497 if (amp_rebalance) {
4498 boolean_t non_preferred_pset = (thread->last_processor->processor_set->pset_type != preferred_pset->pset_type);
4499 if (non_preferred_pset) {
4500 selected_pset = sched_edge_amp_rebalance_pset(preferred_pset, thread);
4501 goto migrate_candidate_available_check;
4502 }
4503 }
4504 }
4505
4506 /* Look at edge weights to decide the most ideal migration candidate for this thread */
4507 selected_pset = sched_edge_migrate_edges_evaluate(preferred_pset, preferred_cluster_load, thread);
4508
4509 migrate_candidate_available_check:
4510 locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
4511 if (sched_edge_pset_available(selected_pset) == true) {
4512 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_OVERLOAD) | DBG_FUNC_NONE, thread_tid(thread), preferred_cluster_id, selected_pset->pset_cluster_id, preferred_cluster_load);
4513 return selected_pset;
4514 }
4515 /* Looks like selected_pset is not available for scheduling; remove it from candidate_cluster_bitmap */
4516 bitmap_clear(&candidate_cluster_bitmap, selected_pset->pset_cluster_id);
4517 if (__improbable(bitmap_first(&candidate_cluster_bitmap, sched_edge_max_clusters) == -1)) {
4518 pset_unlock(locked_pset);
4519 return NULL;
4520 }
4521 /* Try and find an alternative for the selected pset */
4522 selected_pset = sched_edge_candidate_alternative(selected_pset, candidate_cluster_bitmap);
4523 goto migrate_candidate_available_check;
4524 }
4525
4526 static processor_t
4527 sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread)
4528 {
4529 /* Bound threads don't call this function */
4530 assert(thread->bound_processor == PROCESSOR_NULL);
4531 processor_t chosen_processor = PROCESSOR_NULL;
4532
4533 /*
4534 * sched_edge_preferred_pset() returns the preferred pset for a given thread.
4535 * It should take the passed in "pset" as a hint which represents the recency metric for
4536 * pset selection logic.
4537 */
4538 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
4539 processor_set_t chosen_pset = preferred_pset;
4540 /*
4541 * If the preferred pset is overloaded, find a pset which is the best candidate to migrate
4542 * threads to. sched_edge_migrate_candidate() returns the preferred pset
4543 * if it has capacity; otherwise finds the best candidate pset to migrate this thread to.
4544 *
4545 * Edge Scheduler Optimization
4546 * It might be useful to build a recency metric for the thread for multiple clusters and
4547 * factor that into the migration decisions.
4548 */
4549 chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, pset, true);
4550 if (chosen_pset) {
4551 chosen_processor = choose_processor(chosen_pset, processor, thread);
4552 }
4553 /* For RT threads, choose_processor() can return a different cluster than the one passed into it */
4554 assert(chosen_processor ? chosen_processor->processor_set->pset_type == chosen_pset->pset_type : true);
4555 return chosen_processor;
4556 }
4557
4558 /*
4559 * sched_edge_clutch_bucket_threads_drain()
4560 *
4561 * Drains all the runnable threads which are not restricted to the root_clutch (due to clutch
4562 * bucket overrides etc.) into a local thread queue.
4563 */
4564 static void
4565 sched_edge_clutch_bucket_threads_drain(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch, queue_t clutch_threads)
4566 {
4567 thread_t thread = THREAD_NULL;
4568 uint64_t current_timestamp = mach_approximate_time();
4569 qe_foreach_element_safe(thread, &clutch_bucket->scb_thread_timeshare_queue, th_clutch_timeshare_link) {
4570 sched_clutch_thread_remove(root_clutch, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
4571 enqueue_tail(clutch_threads, &thread->runq_links);
4572 }
4573 }
4574
4575 /*
4576 * sched_edge_run_drained_threads()
4577 *
4578 * Makes all drained threads in a local queue runnable.
4579 */
4580 static void
4581 sched_edge_run_drained_threads(queue_t clutch_threads)
4582 {
4583 thread_t thread;
4584 /* Now setrun all the threads in the local queue */
4585 qe_foreach_element_safe(thread, clutch_threads, runq_links) {
4586 remqueue(&thread->runq_links);
4587 thread_lock(thread);
4588 thread_setrun(thread, SCHED_TAILQ);
4589 thread_unlock(thread);
4590 }
4591 }
4592
4593 /*
4594 * sched_edge_update_preferred_cluster()
4595 *
4596 * Routine to update the preferred cluster for QoS buckets within a thread group.
4597 * The buckets to be updated are specified as a bitmap (clutch_bucket_modify_bitmap).
4598 */
4599 static void
4600 sched_edge_update_preferred_cluster(
4601 sched_clutch_t sched_clutch,
4602 bitmap_t *clutch_bucket_modify_bitmap,
4603 uint32_t *tg_bucket_preferred_cluster)
4604 {
4605 for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
4606 os_atomic_store(&sched_clutch->sc_clutch_groups[bucket].scbg_preferred_cluster, tg_bucket_preferred_cluster[bucket], relaxed);
4607 }
4608 }
4609
4610 /*
4611 * sched_edge_migrate_thread_group_runnable_threads()
4612 *
4613 * Routine to implement the migration of threads on a cluster when the thread group
4614 * recommendation is updated. The migration works using a 2-phase
4615 * algorithm.
4616 *
4617 * Phase 1: With the pset lock held, check the recommendation of the clutch buckets.
4618 * For each clutch bucket, if it needs to be migrated immediately, drain the threads
4619 * into a local thread queue. Otherwise mark the clutch bucket as native/foreign as
4620 * appropriate.
4621 *
4622 * Phase 2: After unlocking the pset, drain all the threads from the local thread
4623 * queue and mark them runnable which should land them in the right hierarchy.
4624 *
4625 * The routine assumes that the preferences for the clutch buckets/clutch bucket
4626 * groups have already been updated by the caller.
4627 *
4628 * - Called with the pset locked and interrupts disabled.
4629 * - Returns with the pset unlocked.
4630 */
4631 static void
4632 sched_edge_migrate_thread_group_runnable_threads(
4633 sched_clutch_t sched_clutch,
4634 sched_clutch_root_t root_clutch,
4635 bitmap_t *clutch_bucket_modify_bitmap,
4636 __unused uint32_t *tg_bucket_preferred_cluster,
4637 bool migrate_immediately)
4638 {
4639 /* Queue to hold threads that have been drained from clutch buckets to be migrated */
4640 queue_head_t clutch_threads;
4641 queue_init(&clutch_threads);
4642
4643 for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
4644 /* Get the clutch bucket for this cluster and sched bucket */
4645 sched_clutch_bucket_group_t clutch_bucket_group = &(sched_clutch->sc_clutch_groups[bucket]);
4646 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
4647 sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
4648 if (scb_root == NULL) {
4649 /* Clutch bucket not runnable or already in the right hierarchy; nothing to do here */
4650 assert(clutch_bucket->scb_thr_count == 0);
4651 continue;
4652 }
4653 assert(scb_root == root_clutch);
4654 uint32_t clutch_bucket_preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
4655
4656 if (migrate_immediately) {
4657 /*
4658 * For transitions where threads need to be migrated immediately, drain the threads into a
4659 * local queue unless we are looking at the clutch buckets for the newly recommended
4660 * cluster.
4661 */
4662 if (root_clutch->scr_cluster_id != clutch_bucket_preferred_cluster) {
4663 sched_edge_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads);
4664 } else {
4665 sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
4666 }
4667 } else {
4668 /* Check if this cluster is the same type as the newly recommended cluster */
4669 boolean_t homogeneous_cluster = (pset_type_for_id(root_clutch->scr_cluster_id) == pset_type_for_id(clutch_bucket_preferred_cluster));
4670 /*
4671 * If threads do not have to be migrated immediately, just change the native/foreign
4672 * flag on the clutch bucket.
4673 */
4674 if (homogeneous_cluster) {
4675 sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
4676 } else {
4677 sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
4678 }
4679 }
4680 }
4681
4682 pset_unlock(root_clutch->scr_pset);
4683 sched_edge_run_drained_threads(&clutch_threads);
4684 }
4685
4686 /*
4687 * sched_edge_migrate_thread_group_running_threads()
4688 *
4689 * Routine to find all running threads of a thread group on a specific cluster
4690 * and IPI them if they need to be moved immediately.
4691 */
4692 static void
4693 sched_edge_migrate_thread_group_running_threads(
4694 sched_clutch_t sched_clutch,
4695 sched_clutch_root_t root_clutch,
4696 __unused bitmap_t *clutch_bucket_modify_bitmap,
4697 uint32_t *tg_bucket_preferred_cluster,
4698 bool migrate_immediately)
4699 {
4700 if (migrate_immediately == false) {
4701 /* If CLPC has recommended not to move threads immediately, nothing to do here */
4702 return;
4703 }
4704
4705 /*
4706 * Edge Scheduler Optimization
4707 *
4708 * When the system has a large number of clusters and cores, it might be useful to
4709 * narrow down the iteration by using a thread running bitmap per clutch.
4710 */
4711 uint64_t ast_processor_map = 0;
4712 sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
4713
4714 uint64_t running_map = root_clutch->scr_pset->cpu_state_map[PROCESSOR_RUNNING];
4715 /*
4716 * Iterate all CPUs and look for the ones running threads from this thread group and are
4717 * not restricted to the specific cluster (due to overrides etc.)
4718 */
4719 for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
4720 processor_t src_processor = processor_array[cpuid];
4721 boolean_t expected_tg = (src_processor->current_thread_group == sched_clutch->sc_tg);
4722 sched_bucket_t processor_sched_bucket = src_processor->processor_set->cpu_running_buckets[cpuid];
4723 if (processor_sched_bucket == TH_BUCKET_SCHED_MAX) {
4724 continue;
4725 }
4726 boolean_t non_preferred_cluster = tg_bucket_preferred_cluster[processor_sched_bucket] != root_clutch->scr_cluster_id;
4727
4728 if (expected_tg && non_preferred_cluster) {
4729 ipi_type[cpuid] = sched_ipi_action(src_processor, NULL, SCHED_IPI_EVENT_REBALANCE);
4730 if (ipi_type[cpuid] != SCHED_IPI_NONE) {
4731 bit_set(ast_processor_map, cpuid);
4732 } else if (src_processor == current_processor()) {
4733 ast_on(AST_PREEMPT);
4734 bit_set(root_clutch->scr_pset->pending_AST_PREEMPT_cpu_mask, cpuid);
4735 }
4736 }
4737 }
4738
4739 /* Perform all the IPIs */
4740 if (bit_first(ast_processor_map) != -1) {
4741 for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
4742 processor_t ast_processor = processor_array[cpuid];
4743 sched_ipi_perform(ast_processor, ipi_type[cpuid]);
4744 }
4745 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, thread_group_get_id(sched_clutch->sc_tg), ast_processor_map, 0, 0);
4746 }
4747 }
4748
4749 /*
4750 * sched_edge_tg_preferred_cluster_change()
4751 *
4752 * Routine to handle changes to a thread group's recommendation. In the Edge Scheduler, the preferred cluster
4753 * is specified on a per-QoS basis within a thread group. The routine updates the preferences and performs
4754 * thread migrations based on the policy specified by CLPC.
4755 * tg_bucket_preferred_cluster is an array of size TH_BUCKET_SCHED_MAX which specifies the new preferred cluster
4756 * for each QoS within the thread group.
4757 */
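/*
 * Illustrative example (hypothetical cluster IDs): CLPC could pass
 * tg_bucket_preferred_cluster = { [TH_BUCKET_FIXPRI] = 1, [TH_BUCKET_SHARE_FG] = 1,
 * [TH_BUCKET_SHARE_UT] = 0, ... } to move the fixed priority and foreground work
 * of the thread group to cluster 1 while leaving UT and below on cluster 0; only
 * the buckets whose preferred cluster actually changed end up in
 * clutch_bucket_modify_bitmap and are processed further.
 */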
4758 void
4759 sched_edge_tg_preferred_cluster_change(struct thread_group *tg, uint32_t *tg_bucket_preferred_cluster, sched_perfcontrol_preferred_cluster_options_t options)
4760 {
4761 sched_clutch_t clutch = sched_clutch_for_thread_group(tg);
4762 /*
4763 * In order to optimize the processing, create a bitmap which represents all QoS buckets
4764 * for which the preferred cluster has changed.
4765 */
4766 bitmap_t clutch_bucket_modify_bitmap[BITMAP_LEN(TH_BUCKET_SCHED_MAX)] = {0};
4767 for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
4768 uint32_t old_preferred_cluster = sched_edge_clutch_bucket_group_preferred_cluster(&clutch->sc_clutch_groups[bucket]);
4769 uint32_t new_preferred_cluster = tg_bucket_preferred_cluster[bucket];
4770 if (old_preferred_cluster != new_preferred_cluster) {
4771 bitmap_set(clutch_bucket_modify_bitmap, bucket);
4772 }
4773 }
4774 if (bitmap_lsb_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
4775 /* No changes in any clutch buckets; nothing to do here */
4776 return;
4777 }
4778
4779 for (uint32_t cluster_id = 0; cluster_id < sched_edge_max_clusters; cluster_id++) {
4780 processor_set_t pset = pset_array[cluster_id];
4781 spl_t s = splsched();
4782 pset_lock(pset);
4783 /*
4784 * The first operation is to update the preferred cluster for all QoS buckets within the
4785 * thread group so that any future threads becoming runnable would see the new preferred
4786 * cluster value.
4787 */
4788 sched_edge_update_preferred_cluster(clutch, clutch_bucket_modify_bitmap, tg_bucket_preferred_cluster);
4789 /*
4790 * Currently iterates all clusters looking for running threads for a TG to be migrated. Can be optimized
4791 * by keeping a per-clutch bitmap of clusters running threads for a particular TG.
4792 *
4793 * Edge Scheduler Optimization
4794 */
4795 /* Migrate all running threads of the TG on this cluster based on options specified by CLPC */
4796 sched_edge_migrate_thread_group_running_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
4797 tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNING));
4798 /* Migrate all runnable threads of the TG in this cluster's hierarchy based on options specified by CLPC */
4799 sched_edge_migrate_thread_group_runnable_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
4800 tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNABLE));
4801 /* sched_edge_migrate_thread_group_runnable_threads() returns with pset unlocked */
4802 splx(s);
4803 }
4804 }
4805
4806 /*
4807 * sched_edge_pset_made_schedulable()
4808 *
4809 * Routine to migrate all the clutch buckets which are not in their recommended
4810 * pset hierarchy now that a new pset has become runnable. It's possible that this
4811 * routine is called when the pset is already marked schedulable.
4812 *
4813 * Invoked with the pset lock held and interrupts disabled.
4814 */
4815 static void
4816 sched_edge_pset_made_schedulable(__unused processor_t processor, processor_set_t dst_pset, boolean_t drop_lock)
4817 {
4818 if (bitmap_test(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id)) {
4819 /* Nothing to do here since pset is already marked schedulable */
4820 if (drop_lock) {
4821 pset_unlock(dst_pset);
4822 }
4823 return;
4824 }
4825
4826 bitmap_set(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id);
4827
4828 thread_t thread = sched_edge_processor_idle(dst_pset);
4829 if (thread != THREAD_NULL) {
4830 thread_lock(thread);
4831 thread_setrun(thread, SCHED_TAILQ);
4832 thread_unlock(thread);
4833 }
4834
4835 if (!drop_lock) {
4836 pset_lock(dst_pset);
4837 }
4838 }
4839
4840 /*
4841 * sched_edge_cpu_init_completed()
4842 *
4843 * Callback routine from the platform layer once all CPUs/clusters have been initialized. This
4844 * provides an opportunity for the edge scheduler to initialize all the edge parameters.
4845 */
4846 static void
4847 sched_edge_cpu_init_completed(void)
4848 {
4849 spl_t s = splsched();
4850 for (int src_cluster_id = 0; src_cluster_id < sched_edge_max_clusters; src_cluster_id++) {
4851 processor_set_t src_pset = pset_array[src_cluster_id];
4852 pset_lock(src_pset);
4853
4854 /* For each cluster, set all its outgoing edge parameters */
4855 for (int dst_cluster_id = 0; dst_cluster_id < sched_edge_max_clusters; dst_cluster_id++) {
4856 if (dst_cluster_id == src_cluster_id) {
4857 continue;
4858 }
4859 processor_set_t dst_pset = pset_array[dst_cluster_id];
4860 if (src_pset->pset_type == dst_pset->pset_type) {
4861 /* P->P/E->E edge config */
4862 bitmap_clear(src_pset->foreign_psets, dst_cluster_id);
4863 bitmap_set(src_pset->native_psets, dst_cluster_id);
4864 sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 1, .sce_steal_allowed = 1});
4865 } else if ((src_pset->pset_type == CLUSTER_TYPE_P) && (dst_pset->pset_type == CLUSTER_TYPE_E)) {
4866 /* P->E edge config */
4867 bitmap_set(src_pset->foreign_psets, dst_cluster_id);
4868 bitmap_clear(src_pset->native_psets, dst_cluster_id);
4869 sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 64, .sce_migration_allowed = 1, .sce_steal_allowed = 1});
4870 } else {
4871 /* E->P edge config */
4872 bitmap_set(src_pset->foreign_psets, dst_cluster_id);
4873 bitmap_clear(src_pset->native_psets, dst_cluster_id);
4874 sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0});
4875 }
4876 bool clusters_local = (ml_get_die_id(src_cluster_id) == ml_get_die_id(dst_cluster_id));
4877 if (clusters_local) {
4878 bitmap_set(src_pset->local_psets, dst_cluster_id);
4879 bitmap_clear(src_pset->remote_psets, dst_cluster_id);
4880 } else {
4881 bitmap_set(src_pset->remote_psets, dst_cluster_id);
4882 bitmap_clear(src_pset->local_psets, dst_cluster_id);
4883 }
4884 }
4885
4886 pset_unlock(src_pset);
4887 }
4888 splx(s);
4889 }
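/*
 * Summary of the default edge matrix established above (each edge may be
 * reconfigured later via sched_edge_config_set()):
 *
 *   edge (src -> dst)   sce_migration_allowed  sce_migration_weight  sce_steal_allowed
 *   P -> P / E -> E              1                      0                    1
 *   P -> E                       1                     64                    1
 *   E -> P                       0                      0                    0
 */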
4890
4891 static bool
4892 sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset)
4893 {
4894 uint32_t preferred_cluster_id = sched_edge_thread_preferred_cluster(thread);
4895 if (preferred_cluster_id == pset->pset_cluster_id) {
4896 return true;
4897 } else {
4898 processor_set_t preferred_pset = pset_array[preferred_cluster_id];
4899 return preferred_pset->sched_edges[pset->pset_cluster_id].sce_migration_allowed;
4900 }
4901 }
4902
4903 extern int sched_amp_spill_deferred_ipi;
4904 extern int sched_amp_pcores_preempt_immediate_ipi;
4905
4906 int sched_edge_migrate_ipi_immediate = 1;
4907
4908 sched_ipi_type_t
4909 sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
4910 {
4911 processor_set_t pset = dst->processor_set;
4912 assert(dst != current_processor());
4913
4914 boolean_t deferred_ipi_supported = false;
4915 #if defined(CONFIG_SCHED_DEFERRED_AST)
4916 deferred_ipi_supported = true;
4917 #endif /* CONFIG_SCHED_DEFERRED_AST */
4918
4919 switch (event) {
4920 case SCHED_IPI_EVENT_SPILL:
4921 /* For Spill events, use deferred IPIs if sched_amp_spill_deferred_ipi is set */
4922 if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
4923 return sched_ipi_deferred_policy(pset, dst, thread, event);
4924 }
4925 break;
4926 case SCHED_IPI_EVENT_PREEMPT:
4927 /* For preemption, the default policy is to use deferred IPIs
4928 * for Non-RT P-core preemption. Override that behavior if
4929 * sched_amp_pcores_preempt_immediate_ipi is set
4930 */
4931 if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
4932 if (sched_amp_pcores_preempt_immediate_ipi && (pset_type_for_id(pset->pset_cluster_id) == CLUSTER_TYPE_P)) {
4933 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4934 }
4935 if (sched_edge_migrate_ipi_immediate) {
4936 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
4937 /*
4938 * For IPI'ing CPUs that are homogeneous with the preferred cluster, use immediate IPIs
4939 */
4940 if (preferred_pset->pset_type == pset->pset_type) {
4941 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4942 }
4943 /*
4944 * For workloads that are going wide, it might be useful to use an immediate IPI to
4945 * wake up the idle CPU if the scheduler estimates that the preferred pset will
4946 * be busy for the deferred IPI timeout. The Edge Scheduler uses the avg execution
4947 * latency on the preferred pset as an estimate of busyness.
4948 */
4949 if ((preferred_pset->pset_execution_time[thread->th_sched_bucket].pset_avg_thread_execution_time * NSEC_PER_USEC) >= ml_cpu_signal_deferred_get_timer()) {
4950 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4951 }
4952 }
4953 }
4954 break;
4955 default:
4956 break;
4957 }
4958 /* Default back to the global policy for all other scenarios */
4959 return sched_ipi_policy(dst, thread, dst_idle, event);
4960 }
4961
4962 /*
4963 * sched_edge_qos_max_parallelism()
4964 */
4965 uint32_t
4966 sched_edge_qos_max_parallelism(int qos, uint64_t options)
4967 {
4968 uint32_t ecpu_count = ml_get_cpu_number_type(CLUSTER_TYPE_E, false, false);
4969 uint32_t pcpu_count = ml_get_cpu_number_type(CLUSTER_TYPE_P, false, false);
4970 uint32_t ecluster_count = ml_get_cluster_number_type(CLUSTER_TYPE_E);
4971 uint32_t pcluster_count = ml_get_cluster_number_type(CLUSTER_TYPE_P);
4972
4973 if (options & QOS_PARALLELISM_REALTIME) {
4974 /* For realtime threads on AMP, limit the width
4975 * to just the P-cores since we do not
4976 * spill/rebalance RT threads.
4977 */
4978 return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? pcluster_count : pcpu_count;
4979 }
4980
4981 /*
4982 * The Edge scheduler supports per-QoS recommendations for thread groups.
4983 * This enables lower QoS buckets (such as UT) to be scheduled on all
4984 * CPUs on the system.
4985 *
4986 * The only restriction is for BG/Maintenance QoS classes for which the
4987 * performance controller would never recommend execution on the P-cores.
4988 * If that policy changes in the future, this value should be changed.
4989 */
4990 switch (qos) {
4991 case THREAD_QOS_BACKGROUND:
4992 case THREAD_QOS_MAINTENANCE:
4993 return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? ecluster_count : ecpu_count;
4994 default:
4995 return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? (ecluster_count + pcluster_count) : (ecpu_count + pcpu_count);
4996 }
4997 }
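/*
 * Worked example (hypothetical topology with one 4-CPU E cluster and one 4-CPU
 * P cluster): a realtime request returns 4 (P CPUs only), or 1 with
 * QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE; a BG/Maintenance request returns 4
 * (E CPUs), or 1 cluster; any other QoS returns 8 (all CPUs), or 2 clusters.
 */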
4998
4999
5000
5001 #endif /* CONFIG_SCHED_EDGE */
5002
5003 #endif /* CONFIG_SCHED_CLUTCH */
5004