1 /*
2 * Copyright (c) 2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <mach/mach_types.h>
30 #include <mach/machine.h>
31 #include <machine/machine_routines.h>
32 #include <machine/sched_param.h>
33 #include <machine/machine_cpu.h>
34 #include <kern/kern_types.h>
35 #include <kern/debug.h>
36 #include <kern/machine.h>
37 #include <kern/misc_protos.h>
38 #include <kern/processor.h>
39 #include <kern/queue.h>
40 #include <kern/sched.h>
41 #include <kern/sched_prim.h>
42 #include <kern/task.h>
43 #include <kern/thread.h>
44 #include <kern/sched_clutch.h>
45 #include <machine/atomic.h>
47 #include <sys/kdebug.h>
48
49 #if CONFIG_SCHED_EDGE
50 #include <kern/sched_amp_common.h>
51 #endif /* CONFIG_SCHED_EDGE */
52
53 #if CONFIG_SCHED_CLUTCH
54
55 /* Forward declarations of static routines */
56
57 /* Root level hierarchy management */
58 static void sched_clutch_root_init(sched_clutch_root_t, processor_set_t);
59 static void sched_clutch_root_bucket_init(sched_clutch_root_bucket_t, sched_bucket_t, bool);
60 static void sched_clutch_root_pri_update(sched_clutch_root_t);
61 static void sched_clutch_root_urgency_inc(sched_clutch_root_t, thread_t);
62 static void sched_clutch_root_urgency_dec(sched_clutch_root_t, thread_t);
63
64 __enum_decl(sched_clutch_highest_root_bucket_type_t, uint32_t, {
65 SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_NONE = 0,
66 SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY = 1,
67 SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL = 2,
68 });
69
70 static sched_clutch_root_bucket_t sched_clutch_root_highest_root_bucket(sched_clutch_root_t, uint64_t, sched_clutch_highest_root_bucket_type_t);
71
72 #if CONFIG_SCHED_EDGE
73 /* Support for foreign threads on AMP platforms */
74 static boolean_t sched_clutch_root_foreign_empty(sched_clutch_root_t);
75 static thread_t sched_clutch_root_highest_foreign_thread_remove(sched_clutch_root_t);
76 #endif /* CONFIG_SCHED_EDGE */
77
78 /* Root bucket level hierarchy management */
79 static uint64_t sched_clutch_root_bucket_deadline_calculate(sched_clutch_root_bucket_t, uint64_t);
80 static void sched_clutch_root_bucket_deadline_update(sched_clutch_root_bucket_t, sched_clutch_root_t, uint64_t);
81
82 /* Options for clutch bucket ordering in the runq */
83 __options_decl(sched_clutch_bucket_options_t, uint32_t, {
84 SCHED_CLUTCH_BUCKET_OPTIONS_NONE = 0x0,
85 /* Round robin clutch bucket on thread removal */
86 SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR = 0x1,
87 /* Insert clutch bucket at head (for thread preemption) */
88 SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ = 0x2,
89 /* Insert clutch bucket at tail (default) */
90 SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ = 0x4,
91 });
92
93 /* Clutch bucket level hierarchy management */
94 static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
95 static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
96 static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
97 static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
98 static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
99 static uint8_t sched_clutch_bucket_pri_calculate(sched_clutch_bucket_t, uint64_t);
100
101 /* Clutch bucket group level properties management */
102 static void sched_clutch_bucket_group_cpu_usage_update(sched_clutch_bucket_group_t, uint64_t);
103 static void sched_clutch_bucket_group_cpu_adjust(sched_clutch_bucket_group_t, uint8_t);
104 static void sched_clutch_bucket_group_timeshare_update(sched_clutch_bucket_group_t, sched_clutch_bucket_t, uint64_t);
105 static uint8_t sched_clutch_bucket_group_pending_ageout(sched_clutch_bucket_group_t, uint64_t);
106 static uint32_t sched_clutch_bucket_group_run_count_inc(sched_clutch_bucket_group_t);
107 static uint32_t sched_clutch_bucket_group_run_count_dec(sched_clutch_bucket_group_t);
108 static uint8_t sched_clutch_bucket_group_interactivity_score_calculate(sched_clutch_bucket_group_t, uint64_t);
109
110 /* Clutch timeshare properties updates */
111 static uint32_t sched_clutch_run_bucket_incr(sched_clutch_t, sched_bucket_t);
112 static uint32_t sched_clutch_run_bucket_decr(sched_clutch_t, sched_bucket_t);
113
114 /* Clutch membership management */
115 static boolean_t sched_clutch_thread_insert(sched_clutch_root_t, thread_t, integer_t);
116 static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t, sched_clutch_bucket_options_t);
117 static thread_t sched_clutch_thread_highest_remove(sched_clutch_root_t);
118
119 /* Clutch properties updates */
120 static uint32_t sched_clutch_root_urgency(sched_clutch_root_t);
121 static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t);
122 static int sched_clutch_root_priority(sched_clutch_root_t);
123 static sched_clutch_bucket_t sched_clutch_root_bucket_highest_clutch_bucket(sched_clutch_root_bucket_t);
124 static boolean_t sched_thread_sched_pri_promoted(thread_t);
125
126 #if CONFIG_SCHED_EDGE
127 /* System based routines */
128 static bool sched_edge_pset_available(processor_set_t);
129 static uint32_t sched_edge_thread_bound_cluster_id(thread_t);
130 static int sched_edge_iterate_clusters_ordered(processor_set_t, uint64_t, int);
131
132 /* Global indicating the maximum number of clusters on the current platform */
133 static int sched_edge_max_clusters = 0;
134 #endif /* CONFIG_SCHED_EDGE */
135
136 /* Helper debugging routines */
137 static inline void sched_clutch_hierarchy_locked_assert(sched_clutch_root_t);
138
139 extern processor_set_t pset_array[MAX_PSETS];
140
141 /*
142 * Special markers for buckets that have invalid WCELs/quantums etc.
143 */
144 #define SCHED_CLUTCH_INVALID_TIME_32 ((uint32_t)~0)
145 #define SCHED_CLUTCH_INVALID_TIME_64 ((uint64_t)~0)
146
147 /*
148 * Root level bucket WCELs
149 *
* The root level bucket selection algorithm is an Earliest Deadline
* First (EDF) algorithm where the deadline for each bucket is defined
* by its worst-case-execution-latency (WCEL) and the timestamp at which
* the bucket became runnable.
154 *
155 */
156 static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = {
157 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
158 0, /* FG */
159 37500, /* IN (37.5ms) */
160 75000, /* DF (75ms) */
161 150000, /* UT (150ms) */
162 250000 /* BG (250ms) */
163 };
164 static uint64_t sched_clutch_root_bucket_wcel[TH_BUCKET_SCHED_MAX] = {0};
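/*
 * For example (a minimal illustration of the EDF rule above), a DF root bucket
 * that becomes runnable at time T is assigned a deadline of roughly T + 75ms,
 * while a runnable FG bucket gets a deadline of T itself; FG is therefore
 * preferred unless DF has already been waiting long enough for its deadline to
 * become the earliest.
 */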
165
166 /*
167 * Root level bucket warp
168 *
169 * Each root level bucket has a warp value associated with it as well.
170 * The warp value allows the root bucket to effectively warp ahead of
171 * lower priority buckets for a limited time even if it has a later
172 * deadline. The warping behavior provides extra (but limited)
173 * opportunity for high priority buckets to remain responsive.
174 */
175
176 /* Special warp deadline value to indicate that the bucket has not used any warp yet */
177 #define SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED (SCHED_CLUTCH_INVALID_TIME_64)
178
179 /* Warp window durations for various tiers */
180 static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = {
181 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
182 8000, /* FG (8ms)*/
183 4000, /* IN (4ms) */
184 2000, /* DF (2ms) */
185 1000, /* UT (1ms) */
186 0 /* BG (0ms) */
187 };
188 static uint64_t sched_clutch_root_bucket_warp[TH_BUCKET_SCHED_MAX] = {0};
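/*
 * For example, the FG root bucket can warp ahead of a lower bucket holding the
 * earliest deadline for up to 8ms; once that warp window expires, the bucket
 * has to win via its deadline again. The warp budget is replenished when the
 * bucket is later selected in natural EDF order.
 */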
189
190 /*
191 * Thread level quantum
192 *
* The algorithm defines quantums for threads at various buckets. This
* (combined with the root level bucket quantums) restricts how long
* the lower priority levels can delay the higher priority threads.
196 */
197
198 #if XNU_TARGET_OS_OSX
199 static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
200 10000, /* FIXPRI (10ms) */
201 10000, /* FG (10ms) */
202 10000, /* IN (10ms) */
203 10000, /* DF (10ms) */
204 4000, /* UT (4ms) */
205 2000 /* BG (2ms) */
206 };
207 #else /* XNU_TARGET_OS_OSX */
208 static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
209 10000, /* FIXPRI (10ms) */
210 10000, /* FG (10ms) */
211 8000, /* IN (8ms) */
212 6000, /* DF (6ms) */
213 4000, /* UT (4ms) */
214 2000 /* BG (2ms) */
215 };
216 #endif /* XNU_TARGET_OS_OSX */
217
218 static uint64_t sched_clutch_thread_quantum[TH_BUCKET_SCHED_MAX] = {0};
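/*
 * As a rough example, with the values above a BG thread runs for at most ~2ms
 * before its quantum expires and the scheduler re-evaluates what to run next,
 * while FG threads receive 10ms quanta on all platforms.
 */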
219
220 /*
221 * sched_clutch_us_to_abstime()
222 *
223 * Initializer for converting all durations in usec to abstime
224 */
225 static void
sched_clutch_us_to_abstime(uint32_t *us_vals, uint64_t *abstime_vals)
227 {
228 for (int i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
229 if (us_vals[i] == SCHED_CLUTCH_INVALID_TIME_32) {
230 abstime_vals[i] = SCHED_CLUTCH_INVALID_TIME_64;
231 } else {
232 clock_interval_to_absolutetime_interval(us_vals[i],
233 NSEC_PER_USEC, &abstime_vals[i]);
234 }
235 }
236 }
237
238 /* Clutch/Edge Scheduler Debugging support */
239 #define SCHED_CLUTCH_DBG_THR_COUNT_PACK(a, b, c) ((uint64_t)c | ((uint64_t)b << 16) | ((uint64_t)a << 32))
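/* Packs three counts (assumed to fit in 16 bits each) into one 64-bit value for debugging: c in bits [0:15], b in bits [16:31], a in bits [32:47] */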
240
241 #if DEVELOPMENT || DEBUG
242
243 /*
244 * sched_clutch_hierarchy_locked_assert()
245 *
246 * Debugging helper routine. Asserts that the hierarchy is locked. The locking
247 * for the hierarchy depends on where the hierarchy is hooked. The current
248 * implementation hooks the hierarchy at the pset, so the hierarchy is locked
249 * using the pset lock.
250 */
251 static inline void
sched_clutch_hierarchy_locked_assert(
253 sched_clutch_root_t root_clutch)
254 {
255 pset_assert_locked(root_clutch->scr_pset);
256 }
257
258 #else /* DEVELOPMENT || DEBUG */
259
260 static inline void
sched_clutch_hierarchy_locked_assert(
262 __unused sched_clutch_root_t root_clutch)
263 {
264 }
265
266 #endif /* DEVELOPMENT || DEBUG */
267
268 /*
269 * sched_clutch_thr_count_inc()
270 *
271 * Increment thread count at a hierarchy level with overflow checks.
272 */
273 static void
sched_clutch_thr_count_inc(
275 uint16_t *thr_count)
276 {
277 if (__improbable(os_inc_overflow(thr_count))) {
278 panic("sched_clutch thread count overflowed!");
279 }
280 }
281
282 /*
283 * sched_clutch_thr_count_dec()
284 *
285 * Decrement thread count at a hierarchy level with underflow checks.
286 */
287 static void
sched_clutch_thr_count_dec(
289 uint16_t *thr_count)
290 {
291 if (__improbable(os_dec_overflow(thr_count))) {
292 panic("sched_clutch thread count underflowed!");
293 }
294 }
295
296 /*
* The clutch scheduler attempts to age out the CPU usage of clutch bucket groups
* based on the amount of time they have been pending and the load at that
* scheduling bucket level. Since the clutch bucket groups are global (i.e. span
* multiple clusters), it is important to keep the load as a global counter as well.
301 */
302 static uint32_t _Atomic sched_clutch_global_bucket_load[TH_BUCKET_SCHED_MAX];
303
304 /*
305 * sched_clutch_root_init()
306 *
307 * Routine to initialize the scheduler hierarchy root.
308 */
309 static void
sched_clutch_root_init(
311 sched_clutch_root_t root_clutch,
312 processor_set_t pset)
313 {
314 root_clutch->scr_thr_count = 0;
315 root_clutch->scr_priority = NOPRI;
316 root_clutch->scr_urgency = 0;
317 root_clutch->scr_pset = pset;
318 #if CONFIG_SCHED_EDGE
319 root_clutch->scr_cluster_id = pset->pset_cluster_id;
320 #else /* CONFIG_SCHED_EDGE */
321 root_clutch->scr_cluster_id = 0;
322 #endif /* CONFIG_SCHED_EDGE */
323
324 for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
325 root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type] = 0;
326 }
327 /* Initialize the queue which maintains all runnable clutch_buckets for timesharing purposes */
328 queue_init(&root_clutch->scr_clutch_buckets);
329
330 /* Initialize the priority queue which maintains all runnable foreign clutch buckets */
331 priority_queue_init(&root_clutch->scr_foreign_buckets);
332 bzero(&root_clutch->scr_cumulative_run_count, sizeof(root_clutch->scr_cumulative_run_count));
333 bitmap_zero(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
334 bitmap_zero(root_clutch->scr_bound_warp_available, TH_BUCKET_SCHED_MAX);
335 priority_queue_init(&root_clutch->scr_bound_root_buckets);
336
337 /* Initialize the bitmap and priority queue of runnable root buckets */
338 priority_queue_init(&root_clutch->scr_unbound_root_buckets);
339 bitmap_zero(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
340 bitmap_zero(root_clutch->scr_unbound_warp_available, TH_BUCKET_SCHED_MAX);
341
342 /* Initialize all the root buckets */
343 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
344 sched_clutch_root_bucket_init(&root_clutch->scr_unbound_buckets[i], i, false);
345 sched_clutch_root_bucket_init(&root_clutch->scr_bound_buckets[i], i, true);
346 }
347 }
348
349 /*
350 * Clutch Bucket Runqueues
351 *
352 * The clutch buckets are maintained in a runq at the root bucket level. The
353 * runq organization allows clutch buckets to be ordered based on various
354 * factors such as:
355 *
* - Clutch buckets are round-robined at the same priority level when a
*   thread is selected from a clutch bucket. This prevents a clutch bucket
*   from starving out other clutch buckets at the same priority.
*
* - Clutch buckets are inserted at the head when they become runnable due to
*   thread preemption. This allows threads that were preempted to maintain
*   their order in the queue.
363 *
364 */
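/*
 * The runq itself is an array of per-priority circular queues, along with a
 * bitmap of occupied priorities and a cached highest runnable priority; this
 * keeps the peek, enqueue and remove operations below O(1).
 */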
365
366 /*
367 * sched_clutch_bucket_runq_init()
368 *
369 * Initialize a clutch bucket runq.
370 */
371 static void
sched_clutch_bucket_runq_init(
373 sched_clutch_bucket_runq_t clutch_buckets_rq)
374 {
375 clutch_buckets_rq->scbrq_highq = NOPRI;
376 for (uint8_t i = 0; i < BITMAP_LEN(NRQS); i++) {
377 clutch_buckets_rq->scbrq_bitmap[i] = 0;
378 }
379 clutch_buckets_rq->scbrq_count = 0;
380 for (int i = 0; i < NRQS; i++) {
381 circle_queue_init(&clutch_buckets_rq->scbrq_queues[i]);
382 }
383 }
384
385 /*
386 * sched_clutch_bucket_runq_empty()
387 *
* Returns whether the clutch bucket runq is empty.
389 */
390 static boolean_t
sched_clutch_bucket_runq_empty(
392 sched_clutch_bucket_runq_t clutch_buckets_rq)
393 {
394 return clutch_buckets_rq->scbrq_count == 0;
395 }
396
397 /*
398 * sched_clutch_bucket_runq_peek()
399 *
400 * Returns the highest priority clutch bucket in the runq.
401 */
402 static sched_clutch_bucket_t
sched_clutch_bucket_runq_peek(
404 sched_clutch_bucket_runq_t clutch_buckets_rq)
405 {
406 if (clutch_buckets_rq->scbrq_count > 0) {
407 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_buckets_rq->scbrq_highq];
408 return cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink);
409 } else {
410 return NULL;
411 }
412 }
413
414 /*
415 * sched_clutch_bucket_runq_enqueue()
416 *
417 * Enqueue a clutch bucket into the runq based on the options passed in.
418 */
419 static void
sched_clutch_bucket_runq_enqueue(
421 sched_clutch_bucket_runq_t clutch_buckets_rq,
422 sched_clutch_bucket_t clutch_bucket,
423 sched_clutch_bucket_options_t options)
424 {
425 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
426 if (circle_queue_empty(queue)) {
427 circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
428 bitmap_set(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
429 if (clutch_bucket->scb_priority > clutch_buckets_rq->scbrq_highq) {
430 clutch_buckets_rq->scbrq_highq = clutch_bucket->scb_priority;
431 }
432 } else {
433 if (options & SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ) {
434 circle_enqueue_head(queue, &clutch_bucket->scb_runqlink);
435 } else {
436 /*
437 * Default behavior (handles SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ &
438 * SCHED_CLUTCH_BUCKET_OPTIONS_NONE)
439 */
440 circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
441 }
442 }
443 clutch_buckets_rq->scbrq_count++;
444 }
445
446 /*
447 * sched_clutch_bucket_runq_remove()
448 *
449 * Remove a clutch bucket from the runq.
450 */
451 static void
sched_clutch_bucket_runq_remove(
453 sched_clutch_bucket_runq_t clutch_buckets_rq,
454 sched_clutch_bucket_t clutch_bucket)
455 {
456 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
457 circle_dequeue(queue, &clutch_bucket->scb_runqlink);
458 assert(clutch_buckets_rq->scbrq_count > 0);
459 clutch_buckets_rq->scbrq_count--;
460 if (circle_queue_empty(queue)) {
461 bitmap_clear(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
462 clutch_buckets_rq->scbrq_highq = bitmap_first(clutch_buckets_rq->scbrq_bitmap, NRQS);
463 }
464 }
465
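/*
 * sched_clutch_bucket_runq_rotate()
 *
 * Rotate the circular queue at the clutch bucket's priority so that the
 * clutch bucket currently at the head moves to the tail; used to round robin
 * clutch buckets at the same priority level
 * (see SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR).
 */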
466 static void
sched_clutch_bucket_runq_rotate(
468 sched_clutch_bucket_runq_t clutch_buckets_rq,
469 sched_clutch_bucket_t clutch_bucket)
470 {
471 circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
472 assert(clutch_bucket == cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink));
473 circle_queue_rotate_head_forward(queue);
474 }
475
476 /*
477 * sched_clutch_root_bucket_init()
478 *
479 * Routine to initialize root buckets.
480 */
481 static void
sched_clutch_root_bucket_init(
483 sched_clutch_root_bucket_t root_bucket,
484 sched_bucket_t bucket,
485 bool bound_root_bucket)
486 {
487 root_bucket->scrb_bucket = bucket;
488 if (bound_root_bucket) {
489 /* For bound root buckets, initialize the bound thread runq. */
490 root_bucket->scrb_bound = true;
491 run_queue_init(&root_bucket->scrb_bound_thread_runq);
492 } else {
493 /*
494 * The unbounded root buckets contain a runq of runnable clutch buckets
495 * which then hold the runnable threads.
496 */
497 root_bucket->scrb_bound = false;
498 sched_clutch_bucket_runq_init(&root_bucket->scrb_clutch_buckets);
499 }
500 priority_queue_entry_init(&root_bucket->scrb_pqlink);
501 root_bucket->scrb_pqlink.deadline = SCHED_CLUTCH_INVALID_TIME_64;
502 root_bucket->scrb_warped_deadline = 0;
503 root_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[root_bucket->scrb_bucket];
504 root_bucket->scrb_starvation_avoidance = false;
505 root_bucket->scrb_starvation_ts = 0;
506 }
507
508 /*
509 * Special case scheduling for Above UI bucket.
510 *
* AboveUI threads are typically system critical threads that need low latency,
* which is why they are handled specially.
*
* Since the priority ranges for the AboveUI and FG Timeshare buckets overlap, it is
* important to maintain some native priority order between those buckets. For unbounded
* root buckets, the policy is to compare the highest clutch buckets of both buckets; if the
* AboveUI bucket is higher, schedule it immediately. Otherwise fall through to the
* deadline based scheduling, which should pick up the timeshare buckets. For the bound
* case, the policy simply compares the priority of the highest runnable threads in
* the AboveUI and timeshare buckets.
521 *
522 * The implementation allows extremely low latency CPU access for Above UI threads
523 * while supporting the use case of high priority timeshare threads contending with
524 * lower priority fixed priority threads.
525 */
526
527
528 /*
529 * sched_clutch_root_unbound_select_aboveui()
530 *
531 * Routine to determine if the above UI unbounded bucket should be selected for execution.
532 */
533 static bool
sched_clutch_root_unbound_select_aboveui(
535 sched_clutch_root_t root_clutch)
536 {
537 if (bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_FIXPRI)) {
538 sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
539 sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_unbound_buckets[TH_BUCKET_SHARE_FG];
540 if (!bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SHARE_FG)) {
541 /* If the timeshare FG bucket is not runnable, pick the aboveUI bucket for scheduling */
542 return true;
543 }
544 sched_clutch_bucket_t clutch_bucket_aboveui = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_aboveui);
545 sched_clutch_bucket_t clutch_bucket_sharefg = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_sharefg);
546 if (clutch_bucket_aboveui->scb_priority >= clutch_bucket_sharefg->scb_priority) {
547 return true;
548 }
549 }
550 return false;
551 }
552
553 /*
554 * sched_clutch_root_bound_select_aboveui()
555 *
556 * Routine to determine if the above UI bounded bucket should be selected for execution.
557 */
558 static bool
sched_clutch_root_bound_select_aboveui(
560 sched_clutch_root_t root_clutch)
561 {
562 sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
563 sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_bound_buckets[TH_BUCKET_SHARE_FG];
564 if (root_bucket_aboveui->scrb_bound_thread_runq.count == 0) {
565 return false;
566 }
567 return root_bucket_aboveui->scrb_bound_thread_runq.highq >= root_bucket_sharefg->scrb_bound_thread_runq.highq;
568 }
569
570 /*
571 * sched_clutch_root_highest_root_bucket()
572 *
573 * Main routine to find the highest runnable root level bucket.
574 * This routine is called from performance sensitive contexts; so it is
575 * crucial to keep this O(1). The options parameter determines if
576 * the selection logic should look at unbounded threads only (for
577 * cross-cluster stealing operations) or both bounded and unbounded
578 * threads (for selecting next thread for execution on current cluster).
579 */
580 static sched_clutch_root_bucket_t
sched_clutch_root_highest_root_bucket(
582 sched_clutch_root_t root_clutch,
583 uint64_t timestamp,
584 sched_clutch_highest_root_bucket_type_t type)
585 {
586 sched_clutch_hierarchy_locked_assert(root_clutch);
587 int highest_runnable_bucket = -1;
588 if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
589 highest_runnable_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
590 } else {
591 int highest_unbound_runnable_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
592 int highest_bound_runnable_bucket = bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
593 highest_runnable_bucket = (highest_bound_runnable_bucket != -1) ? ((highest_unbound_runnable_bucket != -1) ? MIN(highest_bound_runnable_bucket, highest_unbound_runnable_bucket) : highest_bound_runnable_bucket) : highest_unbound_runnable_bucket;
594 }
595
596 if (highest_runnable_bucket == -1) {
597 return NULL;
598 }
599
600 /* Above UI root bucket selection (see comment above for more details on this special case handling) */
601 bool unbound_aboveui = sched_clutch_root_unbound_select_aboveui(root_clutch);
602 if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
603 if (unbound_aboveui) {
604 return &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
605 }
606 /* Fall through to selecting a timeshare root bucket */
607 } else {
608 bool bound_aboveui = sched_clutch_root_bound_select_aboveui(root_clutch);
609 sched_clutch_root_bucket_t unbound_aboveui_root_bucket = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
610 sched_clutch_root_bucket_t bound_aboveui_root_bucket = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
611
612 if (unbound_aboveui && bound_aboveui) {
613 /*
* In this scenario both the bound and unbound AboveUI buckets are runnable; choose based on the
* highest runnable priority in each bucket.
*/
617 int bound_aboveui_pri = root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI].scrb_bound_thread_runq.highq;
618 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(unbound_aboveui_root_bucket);
619 int unbound_aboveui_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
620 return (bound_aboveui_pri >= unbound_aboveui_pri) ? bound_aboveui_root_bucket : unbound_aboveui_root_bucket;
621 }
622 if (unbound_aboveui) {
623 return unbound_aboveui_root_bucket;
624 }
625 if (bound_aboveui) {
626 return bound_aboveui_root_bucket;
627 }
628 /* Fall through to selecting a timeshare root bucket */
629 }
630
631 /*
632 * Above UI bucket is not runnable or has a low priority runnable thread; use the
633 * earliest deadline model to schedule threads. The idea is that as the timeshare
634 * buckets use CPU, they will drop their interactivity score/sched priority and
635 * allow the low priority AboveUI buckets to be scheduled.
636 */
637
638 /* Find the earliest deadline bucket */
639 sched_clutch_root_bucket_t edf_bucket = NULL;
640 sched_clutch_root_bucket_t warp_bucket = NULL;
641 int warp_bucket_index = -1;
642
643 evaluate_root_buckets:
644 if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
645 edf_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
646 } else {
647 sched_clutch_root_bucket_t unbound_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
648 sched_clutch_root_bucket_t bound_bucket = priority_queue_min(&root_clutch->scr_bound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
649 if (bound_bucket && unbound_bucket) {
650 /* If bound and unbound root buckets are runnable, select the one with the earlier deadline */
651 edf_bucket = (bound_bucket->scrb_pqlink.deadline <= unbound_bucket->scrb_pqlink.deadline) ? bound_bucket : unbound_bucket;
652 } else {
653 edf_bucket = (bound_bucket) ? bound_bucket : unbound_bucket;
654 }
655 }
656 /*
657 * Check if any of the buckets have warp available. The implementation only allows root buckets to warp ahead of
* buckets of the same type (i.e. bound/unbound). The reason is that warping only makes sense between
* root buckets of the same type, since it is effectively a scheduling advantage over a lower
* QoS root bucket.
661 */
662 bitmap_t *warp_available_bitmap = (edf_bucket->scrb_bound) ? (root_clutch->scr_bound_warp_available) : (root_clutch->scr_unbound_warp_available);
663 warp_bucket_index = bitmap_lsb_first(warp_available_bitmap, TH_BUCKET_SCHED_MAX);
664
665 if ((warp_bucket_index == -1) || (warp_bucket_index >= edf_bucket->scrb_bucket)) {
666 /* No higher buckets have warp left; best choice is the EDF based bucket */
667 if (edf_bucket->scrb_starvation_avoidance) {
668 /*
669 * Indicates that the earliest deadline bucket is in starvation avoidance mode. Check to see if the
670 * starvation avoidance window is still open and return this bucket if it is.
671 *
672 * The starvation avoidance window is calculated based on the quantum of threads at that bucket and
* the number of CPUs in the cluster. The idea is to provide roughly one quantum's worth of starvation
* avoidance across all CPUs.
675 */
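/*
 * As a rough example, a 10ms quantum on a cluster with 4 available CPUs
 * yields a starvation avoidance window of about 2.5ms.
 */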
676 uint64_t starvation_window = sched_clutch_thread_quantum[edf_bucket->scrb_bucket] / pset_available_cpu_count(root_clutch->scr_pset);
677 if (timestamp < (edf_bucket->scrb_starvation_ts + starvation_window)) {
678 return edf_bucket;
679 } else {
680 /* Starvation avoidance window is over; update deadline and re-evaluate EDF */
681 edf_bucket->scrb_starvation_avoidance = false;
682 edf_bucket->scrb_starvation_ts = 0;
683 sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
684 }
685 goto evaluate_root_buckets;
686 }
687
688 /* Looks like the EDF bucket is not in starvation avoidance mode; check if it should be */
689 if (highest_runnable_bucket < edf_bucket->scrb_bucket) {
690 /* Since a higher bucket is runnable, it indicates that the EDF bucket should be in starvation avoidance */
691 edf_bucket->scrb_starvation_avoidance = true;
692 edf_bucket->scrb_starvation_ts = timestamp;
693 } else {
694 /* EDF bucket is being selected in the natural order; update deadline and reset warp */
695 sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
696 edf_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[edf_bucket->scrb_bucket];
697 edf_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
698 if (edf_bucket->scrb_bound) {
699 bitmap_set(root_clutch->scr_bound_warp_available, edf_bucket->scrb_bucket);
700 } else {
701 bitmap_set(root_clutch->scr_unbound_warp_available, edf_bucket->scrb_bucket);
702 }
703 }
704 return edf_bucket;
705 }
706
707 /*
708 * Looks like there is a root bucket which is higher in the natural priority
709 * order than edf_bucket and might have some warp remaining.
710 */
711 warp_bucket = (edf_bucket->scrb_bound) ? &root_clutch->scr_bound_buckets[warp_bucket_index] : &root_clutch->scr_unbound_buckets[warp_bucket_index];
712 if (warp_bucket->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
713 /* Root bucket has not used any of its warp; set a deadline to expire its warp and return it */
714 warp_bucket->scrb_warped_deadline = timestamp + warp_bucket->scrb_warp_remaining;
715 sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
716 return warp_bucket;
717 }
718 if (warp_bucket->scrb_warped_deadline > timestamp) {
719 /* Root bucket already has a warp window open with some warp remaining */
720 sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
721 return warp_bucket;
722 }
723
/*
 * For this bucket, the warp window was opened sometime in the past but has now
 * expired. Mark the bucket as no longer available for warp and re-run the
 * warp bucket selection logic.
 */
728 warp_bucket->scrb_warp_remaining = 0;
729 if (warp_bucket->scrb_bound) {
730 bitmap_clear(root_clutch->scr_bound_warp_available, warp_bucket->scrb_bucket);
731 } else {
732 bitmap_clear(root_clutch->scr_unbound_warp_available, warp_bucket->scrb_bucket);
733 }
734 goto evaluate_root_buckets;
735 }
736
737 /*
738 * sched_clutch_root_bucket_deadline_calculate()
739 *
740 * Calculate the deadline for the bucket based on its WCEL
741 */
742 static uint64_t
sched_clutch_root_bucket_deadline_calculate(
744 sched_clutch_root_bucket_t root_bucket,
745 uint64_t timestamp)
746 {
/* For the fixpri AboveUI bucket, always return 0 so it is treated as having the earliest deadline */
748 if (root_bucket->scrb_bucket < TH_BUCKET_SHARE_FG) {
749 return 0;
750 }
751
752 /* For all timeshare buckets set the deadline as current time + worst-case-execution-latency */
753 return timestamp + sched_clutch_root_bucket_wcel[root_bucket->scrb_bucket];
754 }
755
756 /*
757 * sched_clutch_root_bucket_deadline_update()
758 *
759 * Routine to update the deadline of the root bucket when it is selected.
760 * Updating the deadline also moves the root_bucket in the EDF priority
761 * queue.
762 */
763 static void
sched_clutch_root_bucket_deadline_update(
765 sched_clutch_root_bucket_t root_bucket,
766 sched_clutch_root_t root_clutch,
767 uint64_t timestamp)
768 {
769 if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
770 /* The algorithm never uses the deadlines for scheduling TH_BUCKET_FIXPRI bucket */
771 return;
772 }
773
774 uint64_t old_deadline = root_bucket->scrb_pqlink.deadline;
775 uint64_t new_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
776 if (__improbable(old_deadline > new_deadline)) {
777 panic("old_deadline (%llu) > new_deadline (%llu); root_bucket (%d); timestamp (%llu)", old_deadline, new_deadline, root_bucket->scrb_bucket, timestamp);
778 }
779 if (old_deadline != new_deadline) {
780 root_bucket->scrb_pqlink.deadline = new_deadline;
781 struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
782 priority_queue_entry_increased(prioq, &root_bucket->scrb_pqlink);
783 }
784 }
785
786 /*
787 * sched_clutch_root_bucket_runnable()
788 *
789 * Routine to insert a newly runnable root bucket into the hierarchy.
790 * Also updates the deadline and warp parameters as necessary.
791 */
792 static void
sched_clutch_root_bucket_runnable(
794 sched_clutch_root_bucket_t root_bucket,
795 sched_clutch_root_t root_clutch,
796 uint64_t timestamp)
797 {
798 /* Mark the root bucket as runnable */
799 bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
800 bitmap_set(runnable_bitmap, root_bucket->scrb_bucket);
801
802 if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
803 /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
804 return;
805 }
806
807 if (root_bucket->scrb_starvation_avoidance == false) {
808 /*
809 * Only update the deadline if the bucket was not in starvation avoidance mode. If the bucket was in
810 * starvation avoidance and its window has expired, the highest root bucket selection logic will notice
811 * that and fix it up.
812 */
813 root_bucket->scrb_pqlink.deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
814 }
815 struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
816 priority_queue_insert(prioq, &root_bucket->scrb_pqlink);
817 if (root_bucket->scrb_warp_remaining) {
818 /* Since the bucket has some warp remaining and its now runnable, mark it as available for warp */
819 bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
820 bitmap_set(warp_bitmap, root_bucket->scrb_bucket);
821 }
822 }
823
824 /*
825 * sched_clutch_root_bucket_empty()
826 *
827 * Routine to remove an empty root bucket from the hierarchy.
828 * Also updates the deadline and warp parameters as necessary.
829 */
830 static void
sched_clutch_root_bucket_empty(
832 sched_clutch_root_bucket_t root_bucket,
833 sched_clutch_root_t root_clutch,
834 uint64_t timestamp)
835 {
836 bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
837 bitmap_clear(runnable_bitmap, root_bucket->scrb_bucket);
838
839 if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
840 /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
841 return;
842 }
843
844 struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
845 priority_queue_remove(prioq, &root_bucket->scrb_pqlink);
846
847 bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
848 bitmap_clear(warp_bitmap, root_bucket->scrb_bucket);
849
850 if (root_bucket->scrb_warped_deadline > timestamp) {
851 /*
852 * For root buckets that were using the warp, check if the warp
853 * deadline is in the future. If yes, remove the wall time the
854 * warp was active and update the warp remaining. This allows
855 * the root bucket to use the remaining warp the next time it
856 * becomes runnable.
857 */
858 root_bucket->scrb_warp_remaining = root_bucket->scrb_warped_deadline - timestamp;
859 } else if (root_bucket->scrb_warped_deadline != SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
860 /*
861 * If the root bucket's warped deadline is in the past, it has used up
862 * all the warp it was assigned. Empty out its warp remaining.
863 */
864 root_bucket->scrb_warp_remaining = 0;
865 }
866 }
867
868 static int
sched_clutch_global_bucket_load_get(
870 sched_bucket_t bucket)
871 {
872 return (int)os_atomic_load(&sched_clutch_global_bucket_load[bucket], relaxed);
873 }
874
875 /*
876 * sched_clutch_root_pri_update()
877 *
878 * The root level priority is used for thread selection and preemption
879 * logic.
880 *
881 * The logic uses the same decision as thread selection for deciding between the
* above UI and timeshare buckets. If one of the timesharing buckets has to be
* used for priority calculation, the logic is slightly different from thread
* selection, because thread selection considers deadlines, warps etc. to
* decide the optimal bucket at a given timestamp. Since the priority
886 * value is used for preemption decisions only, it needs to be based on the
887 * highest runnable thread available in the timeshare domain. This logic can
888 * be made more sophisticated if there are cases of unnecessary preemption
889 * being seen in workloads.
890 */
891 static void
sched_clutch_root_pri_update(
893 sched_clutch_root_t root_clutch)
894 {
895 sched_clutch_hierarchy_locked_assert(root_clutch);
896 int16_t root_bound_pri = NOPRI;
897 int16_t root_unbound_pri = NOPRI;
898
899 if (bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
900 goto root_pri_update_unbound;
901 }
902 sched_clutch_root_bucket_t root_bucket_bound = NULL;
903 if (sched_clutch_root_bound_select_aboveui(root_clutch)) {
904 root_bucket_bound = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
905 } else {
906 int root_bucket_index = bitmap_lsb_next(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
907 assert(root_bucket_index != -1);
908 root_bucket_bound = &root_clutch->scr_bound_buckets[root_bucket_index];
909 }
910 root_bound_pri = root_bucket_bound->scrb_bound_thread_runq.highq;
911
912 root_pri_update_unbound:
913 if (bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
914 goto root_pri_update_complete;
915 }
916 sched_clutch_root_bucket_t root_bucket_unbound = NULL;
917 if (sched_clutch_root_unbound_select_aboveui(root_clutch)) {
918 root_bucket_unbound = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
919 } else {
920 int root_bucket_index = bitmap_lsb_next(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
921 assert(root_bucket_index != -1);
922 root_bucket_unbound = &root_clutch->scr_unbound_buckets[root_bucket_index];
923 }
924 /* For the selected root bucket, find the highest priority clutch bucket */
925 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_unbound);
926 root_unbound_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
927
928 root_pri_update_complete:
929 root_clutch->scr_priority = MAX(root_bound_pri, root_unbound_pri);
930 }
931
932 /*
933 * sched_clutch_root_urgency_inc()
934 *
935 * Routine to increment the urgency at the root level based on the thread
936 * priority that is being inserted into the hierarchy. The root urgency
937 * counter is updated based on the urgency of threads in any of the
938 * clutch buckets which are part of the hierarchy.
939 *
940 * Always called with the pset lock held.
941 */
942 static void
sched_clutch_root_urgency_inc(
944 sched_clutch_root_t root_clutch,
945 thread_t thread)
946 {
947 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
948 root_clutch->scr_urgency++;
949 }
950 }
951
952 /*
953 * sched_clutch_root_urgency_dec()
954 *
955 * Routine to decrement the urgency at the root level based on the thread
956 * priority that is being removed from the hierarchy. The root urgency
957 * counter is updated based on the urgency of threads in any of the
958 * clutch buckets which are part of the hierarchy.
959 *
960 * Always called with the pset lock held.
961 */
962 static void
sched_clutch_root_urgency_dec(
964 sched_clutch_root_t root_clutch,
965 thread_t thread)
966 {
967 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
968 root_clutch->scr_urgency--;
969 }
970 }
971
972 /*
973 * Clutch bucket level scheduling
974 *
975 * The second level of scheduling is the clutch bucket level scheduling
976 * which tries to schedule thread groups within root_buckets. Each
977 * clutch represents a thread group and a clutch_bucket_group represents
978 * threads at a particular sched_bucket within that thread group. The
979 * clutch_bucket_group contains a clutch_bucket per cluster on the system
980 * where it holds the runnable threads destined for execution on that
981 * cluster.
982 *
983 * The goal of this level of scheduling is to allow interactive thread
984 * groups low latency access to the CPU. It also provides slight
985 * scheduling preference for App and unrestricted thread groups.
986 *
987 * The clutch bucket scheduling algorithm measures an interactivity
988 * score for all clutch bucket groups. The interactivity score is based
989 * on the ratio of the CPU used and the voluntary blocking of threads
990 * within the clutch bucket group. The algorithm is very close to the ULE
991 * scheduler on FreeBSD in terms of calculations. The interactivity
992 * score provides an interactivity boost in the range of
993 * [0:SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI * 2] which allows interactive
994 * thread groups to win over CPU spinners.
995 *
996 * The interactivity score of the clutch bucket group is combined with the
997 * highest base/promoted priority of threads in the clutch bucket to form
998 * the overall priority of the clutch bucket.
999 */
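/*
 * With the default interactivity priority of 8 (defined below), the
 * interactivity boost therefore ranges from 0 to 16 priority points.
 */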
1000
1001 /* Priority boost range for interactivity */
1002 #define SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT (8)
1003 uint8_t sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
1004
/* Window used to scale the cpu usage and blocked values (currently 500ms). It is the threshold of used+blocked */
1006 uint64_t sched_clutch_bucket_group_adjust_threshold = 0;
1007 #define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS (500000)
1008
1009 /* The ratio to scale the cpu/blocked time per window */
1010 #define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO (10)
1011
1012 /*
1013 * In order to allow App thread groups some preference over daemon thread
1014 * groups, the App clutch_buckets get a priority boost. The boost value should
1015 * be chosen such that badly behaved apps are still penalized over well
1016 * behaved interactive daemons.
1017 */
1018 static uint8_t sched_clutch_bucket_group_pri_boost[SCHED_CLUTCH_TG_PRI_MAX] = {
1019 [SCHED_CLUTCH_TG_PRI_LOW] = 0,
1020 [SCHED_CLUTCH_TG_PRI_MED] = 2,
1021 [SCHED_CLUTCH_TG_PRI_HIGH] = 4,
1022 };
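/* With the values above, MED and HIGH priority thread groups get a +2 and +4 boost respectively over LOW priority ones */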
1023
1024 /* Initial value for voluntary blocking time for the clutch_bucket */
1025 #define SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID (uint64_t)(~0)
1026
1027 /* Value indicating the clutch bucket is not pending execution */
1028 #define SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID ((uint64_t)(~0))
1029
1030 /*
1031 * Thread group CPU starvation avoidance
1032 *
1033 * In heavily CPU contended scenarios, it is possible that some thread groups
1034 * which have a low interactivity score do not get CPU time at all. In order to
* resolve that, the scheduler tries to age out the CPU usage of the clutch
1036 * bucket group when it has been pending execution for a certain time as defined
1037 * by the sched_clutch_bucket_group_pending_delta_us values below.
1038 *
1039 * The values chosen here are very close to the WCEL values for each sched bucket.
1040 * These values are multiplied by the load average of the relevant root bucket to
1041 * provide an estimate of the actual clutch bucket load.
1042 */
1043 static uint32_t sched_clutch_bucket_group_pending_delta_us[TH_BUCKET_SCHED_MAX] = {
1044 SCHED_CLUTCH_INVALID_TIME_32, /* FIXPRI */
1045 10000, /* FG */
1046 37500, /* IN */
1047 75000, /* DF */
1048 150000, /* UT */
1049 250000, /* BG */
1050 };
1051 static uint64_t sched_clutch_bucket_group_pending_delta[TH_BUCKET_SCHED_MAX] = {0};
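/*
 * For example, an IN clutch bucket group becomes eligible for CPU usage ageout
 * once it has been pending for more than roughly 37.5ms scaled by the load
 * average of its root bucket.
 */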
1052
1053 /*
1054 * sched_clutch_bucket_init()
1055 *
1056 * Initializer for clutch buckets.
1057 */
1058 static void
sched_clutch_bucket_init(
1060 sched_clutch_bucket_t clutch_bucket,
1061 sched_clutch_bucket_group_t clutch_bucket_group,
1062 sched_bucket_t bucket)
1063 {
1064 clutch_bucket->scb_bucket = bucket;
1065 /* scb_priority will be recalculated when a thread is inserted in the clutch bucket */
1066 clutch_bucket->scb_priority = 0;
1067 #if CONFIG_SCHED_EDGE
1068 clutch_bucket->scb_foreign = false;
1069 priority_queue_entry_init(&clutch_bucket->scb_foreignlink);
1070 #endif /* CONFIG_SCHED_EDGE */
1071 clutch_bucket->scb_group = clutch_bucket_group;
1072 clutch_bucket->scb_root = NULL;
1073 priority_queue_init(&clutch_bucket->scb_clutchpri_prioq);
1074 priority_queue_init(&clutch_bucket->scb_thread_runq);
1075 queue_init(&clutch_bucket->scb_thread_timeshare_queue);
1076 }
1077
1078 /*
1079 * sched_clutch_bucket_group_init()
1080 *
1081 * Initializer for clutch bucket groups.
1082 */
1083 static void
sched_clutch_bucket_group_init(
1085 sched_clutch_bucket_group_t clutch_bucket_group,
1086 sched_clutch_t clutch,
1087 sched_bucket_t bucket)
1088 {
1089 bzero(clutch_bucket_group, sizeof(struct sched_clutch_bucket_group));
1090 clutch_bucket_group->scbg_bucket = bucket;
1091 clutch_bucket_group->scbg_clutch = clutch;
1092
1093 int max_clusters = ml_get_cluster_count();
1094 clutch_bucket_group->scbg_clutch_buckets = kalloc_type(struct sched_clutch_bucket, max_clusters, Z_WAITOK | Z_ZERO);
1095 for (int i = 0; i < max_clusters; i++) {
1096 sched_clutch_bucket_init(&clutch_bucket_group->scbg_clutch_buckets[i], clutch_bucket_group, bucket);
1097 }
1098
1099 os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, 0, relaxed);
1100 os_atomic_store(&clutch_bucket_group->scbg_pri_shift, INT8_MAX, relaxed);
1101 os_atomic_store(&clutch_bucket_group->scbg_preferred_cluster, pset0.pset_cluster_id, relaxed);
1102 /*
1103 * All thread groups should be initialized to be interactive; this allows the newly launched
1104 * thread groups to fairly compete with already running thread groups.
1105 */
1106 clutch_bucket_group->scbg_interactivity_data.scct_count = (sched_clutch_bucket_group_interactive_pri * 2);
1107 clutch_bucket_group->scbg_interactivity_data.scct_timestamp = 0;
1108 os_atomic_store(&clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked, (clutch_cpu_data_t)sched_clutch_bucket_group_adjust_threshold, relaxed);
1109 #if !__LP64__
1110 lck_spin_init(&clutch_bucket_group->scbg_stats_lock, &pset_lck_grp, NULL);
1111 #endif /* !__LP64__ */
1112 clutch_bucket_group->scbg_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
1113 clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
1114 clutch_bucket_group->scbg_amp_rebalance_last_chosen = UINT32_MAX;
1115 }
1116
1117 static void
sched_clutch_bucket_group_destroy(
1119 sched_clutch_bucket_group_t clutch_bucket_group)
1120 {
1121 kfree_type(struct sched_clutch_bucket, ml_get_cluster_count(),
1122 clutch_bucket_group->scbg_clutch_buckets);
1123 }
1124
1125 /*
1126 * sched_clutch_init_with_thread_group()
1127 *
1128 * Initialize the sched_clutch when the thread group is being created
1129 */
1130 void
sched_clutch_init_with_thread_group(
1132 sched_clutch_t clutch,
1133 struct thread_group *tg)
1134 {
1135 os_atomic_store(&clutch->sc_thr_count, 0, relaxed);
1136
1137 /* Initialize all the clutch buckets */
1138 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
1139 sched_clutch_bucket_group_init(&(clutch->sc_clutch_groups[i]), clutch, i);
1140 }
1141
1142 /* Grouping specific fields */
1143 clutch->sc_tg = tg;
1144 os_atomic_store(&clutch->sc_tg_priority, 0, relaxed);
1145 }
1146
1147 /*
1148 * sched_clutch_destroy()
1149 *
1150 * Destructor for clutch; called from thread group release code.
1151 */
1152 void
sched_clutch_destroy(
1154 sched_clutch_t clutch)
1155 {
1156 assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0);
1157 for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
1158 sched_clutch_bucket_group_destroy(&(clutch->sc_clutch_groups[i]));
1159 }
1160 }
1161
1162 #if CONFIG_SCHED_EDGE
1163
1164 /*
1165 * Edge Scheduler Preferred Cluster Mechanism
1166 *
1167 * In order to have better control over various QoS buckets within a thread group, the Edge
1168 * scheduler allows CLPC to specify a preferred cluster for each QoS level in a TG. These
1169 * preferences are stored at the sched_clutch_bucket_group level since that represents all
1170 * threads at a particular QoS level within a sched_clutch. For any lookup of preferred
1171 * cluster, the logic always goes back to the preference stored at the clutch_bucket_group.
1172 */
1173
1174 static uint32_t
sched_edge_clutch_bucket_group_preferred_cluster(sched_clutch_bucket_group_t clutch_bucket_group)
1176 {
1177 return os_atomic_load(&clutch_bucket_group->scbg_preferred_cluster, relaxed);
1178 }
1179
1180 static uint32_t
sched_clutch_bucket_preferred_cluster(sched_clutch_bucket_t clutch_bucket)
1182 {
1183 return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket->scb_group);
1184 }
1185
1186 uint32_t
sched_edge_thread_preferred_cluster(thread_t thread)
1188 {
1189 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1190 /* For threads bound to a specific cluster, return the bound cluster id */
1191 return sched_edge_thread_bound_cluster_id(thread);
1192 }
1193
1194 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1195 sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[thread->th_sched_bucket];
1196 return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket_group);
1197 }
1198
1199 /*
1200 * Edge Scheduler Foreign Bucket Support
1201 *
1202 * In the Edge Scheduler, each cluster maintains a priority queue of clutch buckets containing
1203 * threads that are not native to the cluster. A clutch bucket is considered native if its
* preferred cluster has the same type as the cluster it is enqueued in. The foreign clutch
1205 * bucket priority queue is used for rebalance operations to get threads back to their native
1206 * cluster quickly.
1207 *
1208 * It is possible to make this policy even more aggressive by considering all clusters that
1209 * are not the preferred cluster as the foreign cluster, but that would mean a lot of thread
1210 * migrations which might have performance implications.
1211 */
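/*
 * For example, on an AMP system a clutch bucket whose preferred cluster is a
 * performance cluster is marked foreign while enqueued in the hierarchy of an
 * efficiency cluster (and vice versa), making it a candidate for rebalancing
 * back to a cluster of its native type.
 */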
1212
1213 static void
sched_clutch_bucket_mark_native(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
1215 {
1216 if (clutch_bucket->scb_foreign) {
1217 clutch_bucket->scb_foreign = false;
1218 priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1219 }
1220 }
1221
1222 static void
sched_clutch_bucket_mark_foreign(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
1224 {
1225 if (!clutch_bucket->scb_foreign) {
1226 clutch_bucket->scb_foreign = true;
1227 priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
1228 priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1229 }
1230 }
1231
1232 /*
1233 * Edge Scheduler Cumulative Load Average
1234 *
1235 * The Edge scheduler maintains a per-QoS/scheduling bucket load average for
1236 * making thread migration decisions. The per-bucket load is maintained as a
1237 * cumulative count since higher scheduling buckets impact load on lower buckets
1238 * for thread migration decisions.
1239 *
1240 */
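/*
 * Since the count is cumulative, a runnable thread at a given bucket also
 * contributes to the count of every lower bucket; e.g. a runnable IN thread
 * increments the IN, DF, UT and BG counts (via the fallthrough below).
 */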
1241
1242 static void
sched_edge_cluster_cumulative_count_incr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1244 {
1245 switch (bucket) {
1246 case TH_BUCKET_FIXPRI: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1247 case TH_BUCKET_SHARE_FG: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1248 case TH_BUCKET_SHARE_IN: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1249 case TH_BUCKET_SHARE_DF: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1250 case TH_BUCKET_SHARE_UT: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1251 case TH_BUCKET_SHARE_BG: os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1252 default:
1253 panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_incr()");
1254 }
1255 }
1256
1257 static void
1258 sched_edge_cluster_cumulative_count_decr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1259 {
1260 switch (bucket) {
1261 case TH_BUCKET_FIXPRI: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1262 case TH_BUCKET_SHARE_FG: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1263 case TH_BUCKET_SHARE_IN: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1264 case TH_BUCKET_SHARE_DF: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1265 case TH_BUCKET_SHARE_UT: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1266 case TH_BUCKET_SHARE_BG: os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1267 default:
1268 panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_decr()");
1269 }
1270 }
1271
1272 uint16_t
1273 sched_edge_cluster_cumulative_count(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1274 {
1275 return os_atomic_load(&root_clutch->scr_cumulative_run_count[bucket], relaxed);
1276 }
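/*
 * Illustrative example of the cumulative counts above: making a single
 * TH_BUCKET_SHARE_FG thread runnable bumps the FG, IN, DF, UT and BG counters
 * (via the fallthrough), so scr_cumulative_run_count[bucket] reports the number
 * of runnable threads at that bucket or any more important bucket. The helper
 * below is hypothetical and wrapped in #if 0; the assertions assume the
 * hierarchy was previously empty.
 */
#if 0
static void
sched_edge_cumulative_count_example(sched_clutch_root_t root_clutch)
{
	sched_edge_cluster_cumulative_count_incr(root_clutch, TH_BUCKET_SHARE_FG);
	assert(sched_edge_cluster_cumulative_count(root_clutch, TH_BUCKET_FIXPRI) == 0);
	assert(sched_edge_cluster_cumulative_count(root_clutch, TH_BUCKET_SHARE_FG) == 1);
	assert(sched_edge_cluster_cumulative_count(root_clutch, TH_BUCKET_SHARE_BG) == 1);
	sched_edge_cluster_cumulative_count_decr(root_clutch, TH_BUCKET_SHARE_FG);
}
#endif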
1277
1278 #endif /* CONFIG_SCHED_EDGE */
1279
1280 /*
1281 * sched_clutch_bucket_hierarchy_insert()
1282 *
1283 * Routine to insert a newly runnable clutch_bucket into the root hierarchy.
1284 */
1285 static void
1286 sched_clutch_bucket_hierarchy_insert(
1287 sched_clutch_root_t root_clutch,
1288 sched_clutch_bucket_t clutch_bucket,
1289 sched_bucket_t bucket,
1290 uint64_t timestamp,
1291 sched_clutch_bucket_options_t options)
1292 {
1293 sched_clutch_hierarchy_locked_assert(root_clutch);
1294 if (bucket > TH_BUCKET_FIXPRI) {
1295 /* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */
1296 enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink);
1297 }
1298 #if CONFIG_SCHED_EDGE
1299 /* Check if the bucket is a foreign clutch bucket and add it to the foreign buckets list */
1300 uint32_t preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
1301 if (pset_type_for_id(preferred_cluster) != pset_type_for_id(root_clutch->scr_cluster_id)) {
1302 sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
1303 }
1304 #endif /* CONFIG_SCHED_EDGE */
1305 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1306
1307 /* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */
1308 if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1309 sched_clutch_root_bucket_runnable(root_bucket, root_clutch, timestamp);
1310 }
1311
1312 /* Insert the clutch bucket into the root bucket run queue with order based on options */
1313 sched_clutch_bucket_runq_enqueue(&root_bucket->scrb_clutch_buckets, clutch_bucket, options);
1314 os_atomic_store(&clutch_bucket->scb_root, root_clutch, relaxed);
1315 os_atomic_inc(&sched_clutch_global_bucket_load[bucket], relaxed);
1316 }
1317
1318 /*
1319 * sched_clutch_bucket_hierarchy_remove()
1320 *
1321 * Routine to remove an empty clutch bucket from the root hierarchy.
1322 */
1323 static void
1324 sched_clutch_bucket_hierarchy_remove(
1325 sched_clutch_root_t root_clutch,
1326 sched_clutch_bucket_t clutch_bucket,
1327 sched_bucket_t bucket,
1328 uint64_t timestamp,
1329 __unused sched_clutch_bucket_options_t options)
1330 {
1331 sched_clutch_hierarchy_locked_assert(root_clutch);
1332 if (bucket > TH_BUCKET_FIXPRI) {
1333 /* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */
1334 remqueue(&clutch_bucket->scb_listlink);
1335 }
1336 #if CONFIG_SCHED_EDGE
1337 sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
1338 #endif /* CONFIG_SCHED_EDGE */
1339
1340 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1341
1342 /* Remove the clutch bucket from the root bucket priority queue */
1343 sched_clutch_bucket_runq_remove(&root_bucket->scrb_clutch_buckets, clutch_bucket);
1344 os_atomic_store(&clutch_bucket->scb_root, NULL, relaxed);
1345
1346 /* If the root bucket priority queue is now empty, remove it from the root priority queue */
1347 if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1348 sched_clutch_root_bucket_empty(root_bucket, root_clutch, timestamp);
1349 }
1350 os_atomic_dec(&sched_clutch_global_bucket_load[bucket], relaxed);
1351 }
1352
1353 /*
1354 * sched_clutch_bucket_base_pri()
1355 *
1356 * Calculates the "base" priority of the clutch bucket. The base
1357 * priority of the clutch bucket is the highest priority among its threads
1358 * (the max of the highest base_pri and the highest sched_pri) plus any
1359 * grouping specific (App/Daemon...) boost applicable to the
1360 * clutch_bucket.
1361 */
1362 static uint8_t
1363 sched_clutch_bucket_base_pri(
1364 sched_clutch_bucket_t clutch_bucket)
1365 {
1366 uint8_t clutch_boost = 0;
1367 assert(priority_queue_empty(&clutch_bucket->scb_thread_runq) == false);
1368
1369 sched_clutch_t clutch = clutch_bucket->scb_group->scbg_clutch;
1370
1371 /*
1372 * Since the clutch bucket can contain threads that are members of the group due
1373 * to the sched_pri being promoted or due to their base pri, the base priority of
1374 * the entire clutch bucket should be based on the highest thread (promoted or base)
1375 * in the clutch bucket.
1376 */
1377 uint8_t max_pri = 0;
1378 if (!priority_queue_empty(&clutch_bucket->scb_clutchpri_prioq)) {
1379 max_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
1380 }
1381
1382 sched_clutch_tg_priority_t tg_pri = os_atomic_load(&clutch->sc_tg_priority, relaxed);
1383 clutch_boost = sched_clutch_bucket_group_pri_boost[tg_pri];
1384 return max_pri + clutch_boost;
1385 }
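/*
 * Worked example of the base priority computed above: if the highest thread in
 * the clutch bucket has a (base or promoted) priority of 31 and the thread
 * group priority maps to a boost of 8 in sched_clutch_bucket_group_pri_boost[]
 * (the boost value here is purely illustrative), the clutch bucket's base
 * priority is 31 + 8 = 39.
 */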
1386
1387 /*
1388 * sched_clutch_interactivity_from_cpu_data()
1389 *
1390 * Routine to calculate the interactivity score of a clutch bucket group from its CPU usage
1391 */
1392 static uint8_t
1393 sched_clutch_interactivity_from_cpu_data(sched_clutch_bucket_group_t clutch_bucket_group)
1394 {
1395 sched_clutch_bucket_cpu_data_t scb_cpu_data;
1396 scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, relaxed);
1397 clutch_cpu_data_t cpu_used = scb_cpu_data.cpu_data.scbcd_cpu_used;
1398 clutch_cpu_data_t cpu_blocked = scb_cpu_data.cpu_data.scbcd_cpu_blocked;
1399 uint8_t interactive_score = 0;
1400
1401 if ((cpu_blocked == 0) && (cpu_used == 0)) {
1402 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
1403 }
1404 /*
1405 * For all timeshare buckets, calculate the interactivity score of the bucket
1406 * and add it to the base priority
1407 */
1408 if (cpu_blocked > cpu_used) {
1409 /* Interactive clutch_bucket case */
1410 interactive_score = sched_clutch_bucket_group_interactive_pri +
1411 ((sched_clutch_bucket_group_interactive_pri * (cpu_blocked - cpu_used)) / cpu_blocked);
1412 } else {
1413 /* Non-interactive clutch_bucket case */
1414 interactive_score = ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) / cpu_used);
1415 }
1416 return interactive_score;
1417 }
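/*
 * Worked example of the score above, assuming (purely for illustration) that
 * sched_clutch_bucket_group_interactive_pri is 8: a group that blocked for 30
 * units and used 10 units of CPU scores 8 + (8 * 20) / 30 = 13, while a group
 * that blocked for 10 units and used 40 units scores (8 * 10) / 40 = 2. With
 * integer truncation, interactive groups therefore score in (8, 16] and
 * non-interactive groups in [0, 8].
 */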
1418
1419 /*
1420 * sched_clutch_bucket_pri_calculate()
1421 *
1422 * The priority calculation algorithm for the clutch_bucket is a slight
1423 * modification on the ULE interactivity score. It uses the base priority
1424 * of the clutch bucket and applies an interactivity score boost to the
1425 * highly responsive clutch buckets.
1426 */
1427 static uint8_t
1428 sched_clutch_bucket_pri_calculate(
1429 sched_clutch_bucket_t clutch_bucket,
1430 uint64_t timestamp)
1431 {
1432 /* For empty clutch buckets, return priority 0 */
1433 if (clutch_bucket->scb_thr_count == 0) {
1434 return 0;
1435 }
1436
1437 uint8_t base_pri = sched_clutch_bucket_base_pri(clutch_bucket);
1438 uint8_t interactive_score = sched_clutch_bucket_group_interactivity_score_calculate(clutch_bucket->scb_group, timestamp);
1439
1440 assert(((uint64_t)base_pri + interactive_score) <= UINT8_MAX);
1441 uint8_t pri = base_pri + interactive_score;
1442 if (pri != clutch_bucket->scb_priority) {
1443 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_PRI) | DBG_FUNC_NONE,
1444 thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, pri, interactive_score, 0);
1445 }
1446 return pri;
1447 }
1448
1449 /*
1450 * sched_clutch_root_bucket_highest_clutch_bucket()
1451 *
1452 * Routine to find the highest priority clutch bucket
1453 * within the root bucket.
1454 */
1455 static sched_clutch_bucket_t
1456 sched_clutch_root_bucket_highest_clutch_bucket(
1457 sched_clutch_root_bucket_t root_bucket)
1458 {
1459 if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1460 return NULL;
1461 }
1462 return sched_clutch_bucket_runq_peek(&root_bucket->scrb_clutch_buckets);
1463 }
1464
1465 /*
1466 * sched_clutch_bucket_runnable()
1467 *
1468 * Perform all operations needed when a new clutch bucket becomes runnable.
1469 * It involves inserting the clutch_bucket into the hierarchy and updating the
1470 * root priority appropriately.
1471 */
1472 static boolean_t
1473 sched_clutch_bucket_runnable(
1474 sched_clutch_bucket_t clutch_bucket,
1475 sched_clutch_root_t root_clutch,
1476 uint64_t timestamp,
1477 sched_clutch_bucket_options_t options)
1478 {
1479 sched_clutch_hierarchy_locked_assert(root_clutch);
1480 /* Since the clutch bucket became newly runnable, update its pending timestamp */
1481 clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1482 sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1483
1484 /* Update the timesharing properties of this clutch_bucket; also done every sched_tick */
1485 sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, timestamp);
1486 int16_t root_old_pri = root_clutch->scr_priority;
1487 sched_clutch_root_pri_update(root_clutch);
1488 return root_clutch->scr_priority > root_old_pri;
1489 }
1490
1491 /*
1492 * sched_clutch_bucket_update()
1493 *
1494 * Update the clutch_bucket's position in the hierarchy. This routine is
1495 * called when a new thread is inserted or removed from a runnable clutch
1496 * bucket. The options specify some properties about the clutch bucket
1497 * insertion order into the clutch bucket runq.
1498 */
1499 static boolean_t
1500 sched_clutch_bucket_update(
1501 sched_clutch_bucket_t clutch_bucket,
1502 sched_clutch_root_t root_clutch,
1503 uint64_t timestamp,
1504 sched_clutch_bucket_options_t options)
1505 {
1506 sched_clutch_hierarchy_locked_assert(root_clutch);
1507 uint64_t new_pri = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1508 sched_clutch_bucket_runq_t bucket_runq = &root_clutch->scr_unbound_buckets[clutch_bucket->scb_bucket].scrb_clutch_buckets;
1509 if (new_pri == clutch_bucket->scb_priority) {
1510 /*
1511 * If SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR is specified, move the clutch bucket
1512 * to the end of the runq. Typically used when a thread is selected for execution
1513 * from a clutch bucket.
1514 */
1515 if (options & SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR) {
1516 sched_clutch_bucket_runq_rotate(bucket_runq, clutch_bucket);
1517 }
1518 return false;
1519 }
1520 sched_clutch_bucket_runq_remove(bucket_runq, clutch_bucket);
1521 #if CONFIG_SCHED_EDGE
1522 if (clutch_bucket->scb_foreign) {
1523 priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1524 }
1525 #endif /* CONFIG_SCHED_EDGE */
1526 clutch_bucket->scb_priority = new_pri;
1527 #if CONFIG_SCHED_EDGE
1528 if (clutch_bucket->scb_foreign) {
1529 priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
1530 priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1531 }
1532 #endif /* CONFIG_SCHED_EDGE */
1533 sched_clutch_bucket_runq_enqueue(bucket_runq, clutch_bucket, options);
1534
1535 int16_t root_old_pri = root_clutch->scr_priority;
1536 sched_clutch_root_pri_update(root_clutch);
1537 return root_clutch->scr_priority > root_old_pri;
1538 }
1539
1540 /*
1541 * sched_clutch_bucket_empty()
1542 *
1543 * Perform all the operations needed when a clutch_bucket is no longer runnable.
1544 * It involves removing the clutch bucket from the hierarchy and updating the root
1545 * priority appropriately.
1546 */
1547 static void
1548 sched_clutch_bucket_empty(
1549 sched_clutch_bucket_t clutch_bucket,
1550 sched_clutch_root_t root_clutch,
1551 uint64_t timestamp,
1552 sched_clutch_bucket_options_t options)
1553 {
1554 sched_clutch_hierarchy_locked_assert(root_clutch);
1555 sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1556 clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1557 sched_clutch_root_pri_update(root_clutch);
1558 }
1559
1560 /*
1561 * sched_clutch_cpu_usage_update()
1562 *
1563 * Routine to update CPU usage of the thread in the hierarchy.
1564 */
1565 void
1566 sched_clutch_cpu_usage_update(
1567 thread_t thread,
1568 uint64_t delta)
1569 {
1570 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread) || SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1571 return;
1572 }
1573
1574 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1575 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
1576 sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, delta);
1577 }
1578
1579 /*
1580 * sched_clutch_bucket_group_cpu_usage_update()
1581 *
1582 * Routine to update the CPU usage of the clutch_bucket.
1583 */
1584 static void
1585 sched_clutch_bucket_group_cpu_usage_update(
1586 sched_clutch_bucket_group_t clutch_bucket_group,
1587 uint64_t delta)
1588 {
1589 if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
1590 /* Since the Above UI bucket always has the maximum interactivity score, there is nothing to do here */
1591 return;
1592 }
1593 delta = MIN(delta, sched_clutch_bucket_group_adjust_threshold);
1594 os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_used), (clutch_cpu_data_t)delta, relaxed);
1595 }
1596
1597 /*
1598 * sched_clutch_bucket_group_cpu_pending_adjust()
1599 *
1600 * Routine to calculate the adjusted CPU usage value based on the pending intervals. The calculation is done
1601 * such that one "pending interval" provides one point improvement in interactivity score.
1602 */
1603 static inline uint64_t
1604 sched_clutch_bucket_group_cpu_pending_adjust(
1605 uint64_t cpu_used,
1606 uint64_t cpu_blocked,
1607 uint8_t pending_intervals)
1608 {
1609 uint64_t cpu_used_adjusted = 0;
1610 if (cpu_blocked < cpu_used) {
1611 cpu_used_adjusted = (sched_clutch_bucket_group_interactive_pri * cpu_blocked * cpu_used);
1612 cpu_used_adjusted = cpu_used_adjusted / ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) + (cpu_used * pending_intervals));
1613 } else {
1614 uint64_t adjust_factor = (cpu_blocked * pending_intervals) / sched_clutch_bucket_group_interactive_pri;
1615 cpu_used_adjusted = (adjust_factor > cpu_used) ? 0 : (cpu_used - adjust_factor);
1616 }
1617 return cpu_used_adjusted;
1618 }
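/*
 * Why one pending interval buys one point of interactivity score: in the
 * cpu_blocked < cpu_used case above, the adjusted score becomes
 *   IP * cpu_blocked / cpu_used_adjusted
 *     = (IP * cpu_blocked + cpu_used * pending_intervals) / cpu_used
 *     = (IP * cpu_blocked / cpu_used) + pending_intervals
 * (with IP = sched_clutch_bucket_group_interactive_pri), i.e. exactly
 * pending_intervals points above the unadjusted score, modulo integer
 * truncation. In the other case, reducing cpu_used by
 * (cpu_blocked * pending_intervals / IP) raises the interactive-case score,
 * IP + IP * (cpu_blocked - cpu_used) / cpu_blocked, by the same
 * pending_intervals points.
 */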
1619
1620 /*
1621 * sched_clutch_bucket_group_cpu_adjust()
1622 *
1623 * Routine to scale the cpu usage and blocked time once the sum gets bigger
1624 * than sched_clutch_bucket_group_adjust_threshold. Allows the values to remain
1625 * manageable and maintain the same ratio, while still allowing changes in clutch
1626 * bucket behavior to be reflected in the interactivity score within a reasonable
1627 * amount of time. It also adjusts the CPU usage based on pending_intervals,
1628 * which ages out CPU usage to avoid starvation in highly contended scenarios.
1629 */
1630 static void
1631 sched_clutch_bucket_group_cpu_adjust(
1632 sched_clutch_bucket_group_t clutch_bucket_group,
1633 uint8_t pending_intervals)
1634 {
1635 sched_clutch_bucket_cpu_data_t old_cpu_data = {};
1636 sched_clutch_bucket_cpu_data_t new_cpu_data = {};
1637 os_atomic_rmw_loop(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, old_cpu_data.scbcd_cpu_data_packed, new_cpu_data.scbcd_cpu_data_packed, relaxed, {
1638 clutch_cpu_data_t cpu_used = old_cpu_data.cpu_data.scbcd_cpu_used;
1639 clutch_cpu_data_t cpu_blocked = old_cpu_data.cpu_data.scbcd_cpu_blocked;
1640
1641 if ((pending_intervals == 0) && (cpu_used + cpu_blocked) < sched_clutch_bucket_group_adjust_threshold) {
1642 /* No changes to the CPU used and blocked values */
1643 os_atomic_rmw_loop_give_up();
1644 }
1645 if ((cpu_used + cpu_blocked) >= sched_clutch_bucket_group_adjust_threshold) {
1646 /* Only keep the recent CPU history to better indicate how this TG has been behaving */
1647 cpu_used = cpu_used / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
1648 cpu_blocked = cpu_blocked / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
1649 }
1650 /* Use the shift passed in to ageout the CPU usage */
1651 cpu_used = (clutch_cpu_data_t)sched_clutch_bucket_group_cpu_pending_adjust(cpu_used, cpu_blocked, pending_intervals);
1652 new_cpu_data.cpu_data.scbcd_cpu_used = cpu_used;
1653 new_cpu_data.cpu_data.scbcd_cpu_blocked = cpu_blocked;
1654 });
1655 }
1656
1657 /*
1658 * Thread level scheduling algorithm
1659 *
1660 * The thread level scheduling algorithm uses the mach timeshare
1661 * decay based algorithm to achieve sharing between threads within the
1662 * same clutch bucket. The load/priority shifts etc. are all maintained
1663 * at the clutch bucket level and used for decay calculation of the
1664 * threads. The load sampling is still driven off the scheduler tick
1665 * for runnable clutch buckets (it does not use the new higher frequency
1666 * EWMA based load calculation). The idea is that the contention and load
1667 * within clutch_buckets should be limited enough to not see heavy decay
1668 * and timeshare effectively.
1669 */
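/*
 * Sketch of the resulting thread-level behavior: within a runnable clutch
 * bucket, a thread's effective priority decays roughly as
 *   sched_pri ~= base_pri - (sched_usage >> pri_shift)
 * where pri_shift comes from the clutch bucket group (see
 * sched_clutch_bucket_group_timeshare_update() below), so a more loaded group
 * gets a smaller shift and its CPU-heavy threads decay faster. This is an
 * approximation of the Mach timeshare computation, shown for illustration only.
 */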
1670
1671 /*
1672 * sched_clutch_thread_run_bucket_incr() / sched_clutch_run_bucket_incr()
1673 *
1674 * Increment the run count for the clutch bucket associated with the
1675 * thread.
1676 */
1677 uint32_t
1678 sched_clutch_thread_run_bucket_incr(
1679 thread_t thread,
1680 sched_bucket_t bucket)
1681 {
1682 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1683 return 0;
1684 }
1685 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1686 return sched_clutch_run_bucket_incr(clutch, bucket);
1687 }
1688
1689 static uint32_t
1690 sched_clutch_run_bucket_incr(
1691 sched_clutch_t clutch,
1692 sched_bucket_t bucket)
1693 {
1694 assert(bucket != TH_BUCKET_RUN);
1695 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
1696 return sched_clutch_bucket_group_run_count_inc(clutch_bucket_group);
1697 }
1698
1699 /*
1700 * sched_clutch_thread_run_bucket_decr() / sched_clutch_run_bucket_decr()
1701 *
1702 * Decrement the run count for the clutch bucket associated with the
1703 * thread.
1704 */
1705 uint32_t
1706 sched_clutch_thread_run_bucket_decr(
1707 thread_t thread,
1708 sched_bucket_t bucket)
1709 {
1710 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1711 return 0;
1712 }
1713 sched_clutch_t clutch = sched_clutch_for_thread(thread);
1714 return sched_clutch_run_bucket_decr(clutch, bucket);
1715 }
1716
1717 static uint32_t
1718 sched_clutch_run_bucket_decr(
1719 sched_clutch_t clutch,
1720 sched_bucket_t bucket)
1721 {
1722 assert(bucket != TH_BUCKET_RUN);
1723 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
1724 return sched_clutch_bucket_group_run_count_dec(clutch_bucket_group);
1725 }
1726
1727 /*
1728 * sched_clutch_bucket_group_timeshare_update()
1729 *
1730 * Routine to update the load and priority shift for the clutch_bucket_group
1731 * every sched_tick. For multi-cluster platforms, each QoS level will have multiple
1732 * clutch buckets with runnable threads in them. So it is important to maintain
1733 * the timesharing information at the clutch_bucket_group level instead of
1734 * individual clutch buckets (because the algorithm is trying to timeshare all
1735 * threads at the same QoS irrespective of which hierarchy they are enqueued in).
1736 *
1737 * The routine is called from the sched tick handling code to make sure this value
1738 * is updated at least once every sched tick. For clutch bucket groups which have
1739 * not been runnable for very long, the clutch_bucket_group maintains a "last
1740 * updated schedtick" parameter. As threads become runnable in the clutch bucket group,
1741 * if this value is outdated, the load and shifts are updated.
1742 *
1743 * Possible optimization:
1744 * - The current algorithm samples the load every sched tick (125ms).
1745 * This is prone to spikes in runnable counts; if that turns out to be
1746 * a problem, a simple solution would be to do the EWMA trick to sample
1747 * load at every load_tick (30ms) and use the averaged value for the pri
1748 * shift calculation.
1749 */
1750 static void
1751 sched_clutch_bucket_group_timeshare_update(
1752 sched_clutch_bucket_group_t clutch_bucket_group,
1753 sched_clutch_bucket_t clutch_bucket,
1754 uint64_t ctime)
1755 {
1756 if (clutch_bucket_group->scbg_bucket < TH_BUCKET_SHARE_FG) {
1757 /* No timesharing needed for fixed priority Above UI threads */
1758 return;
1759 }
1760
1761 /*
1762 * Update the timeshare parameters for the clutch bucket group
1763 * if they haven't been updated in this tick.
1764 */
1765 uint32_t sched_ts = os_atomic_load(&clutch_bucket_group->scbg_timeshare_tick, relaxed);
1766 uint32_t current_sched_ts = sched_tick;
1767 if (sched_ts < current_sched_ts) {
1768 os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, current_sched_ts, relaxed);
1769 /* NCPU wide workloads should not experience decay */
1770 uint64_t bucket_group_run_count = os_atomic_load_wide(&clutch_bucket_group->scbg_blocked_data.scct_count, relaxed) - 1;
1771 uint32_t bucket_group_load = (uint32_t)(bucket_group_run_count / processor_avail_count);
1772 bucket_group_load = MIN(bucket_group_load, NRQS - 1);
1773 uint32_t pri_shift = sched_fixed_shift - sched_load_shifts[bucket_group_load];
1774 /* Ensure that the pri_shift value is reasonable */
1775 pri_shift = (pri_shift > SCHED_PRI_SHIFT_MAX) ? INT8_MAX : pri_shift;
1776 os_atomic_store(&clutch_bucket_group->scbg_pri_shift, pri_shift, relaxed);
1777 }
1778
1779 /*
1780 * Update the clutch bucket priority; this allows clutch buckets that have been pending
1781 * for a long time to get an updated interactivity score.
1782 */
1783 sched_clutch_bucket_update(clutch_bucket, clutch_bucket->scb_root, ctime, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
1784 }
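/*
 * Illustrative reading of the pri_shift computed above: assuming a cluster
 * with 8 available processors and 25 runnable threads in the clutch bucket
 * group, the load is (25 - 1) / 8 = 3 and the shift is
 * sched_fixed_shift - sched_load_shifts[3]. A higher load yields a smaller
 * shift, so accumulated sched_usage depresses thread priority more
 * aggressively; when the shift exceeds SCHED_PRI_SHIFT_MAX it is pinned to
 * INT8_MAX, which the thread level code treats as "do not charge CPU usage"
 * (see sched_clutch_thread_clutch_update() below).
 */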
1785
1786 /*
1787 * sched_clutch_thread_clutch_update()
1788 *
1789 * Routine called when the thread changes its thread group. The current
1790 * implementation relies on the fact that the thread group is changed only from
1791 * the context of the thread itself or when the thread is runnable but not in a
1792 * runqueue. Due to this fact, the thread group change causes only counter
1793 * updates in the old & new clutch buckets and no hierarchy changes. The routine
1794 * also attributes the CPU used so far to the old clutch.
1795 */
1796 void
1797 sched_clutch_thread_clutch_update(
1798 thread_t thread,
1799 sched_clutch_t old_clutch,
1800 sched_clutch_t new_clutch)
1801 {
1802 uint32_t cpu_delta;
1803
1804 if (old_clutch) {
1805 assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
1806
1807 sched_clutch_run_bucket_decr(old_clutch, thread->th_sched_bucket);
1808 /*
1809 * Calculate the CPU used by this thread in the old bucket and
1810 * add it to the old clutch bucket. This uses the same CPU usage
1811 * logic as update_priority etc.
1812 */
1813 sched_tick_delta(thread, cpu_delta);
1814 if (thread->pri_shift < INT8_MAX) {
1815 thread->sched_usage += cpu_delta;
1816 }
1817 thread->cpu_delta += cpu_delta;
1818 if (!SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1819 sched_clutch_bucket_group_t clutch_bucket_group = &(old_clutch->sc_clutch_groups[thread->th_sched_bucket]);
1820 sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, cpu_delta);
1821 }
1822 }
1823
1824 if (new_clutch) {
1825 sched_clutch_run_bucket_incr(new_clutch, thread->th_sched_bucket);
1826 }
1827 }
1828
1829 /* Thread Insertion/Removal/Selection routines */
1830
1831 #if CONFIG_SCHED_EDGE
1832
1833 /*
1834 * Edge Scheduler Bound Thread Support
1835 *
1836 * The edge scheduler allows threads to be bound to specific clusters. The scheduler
1837 * maintains a separate runq on the clutch root to hold these bound threads. These
1838 * bound threads count towards the root priority and thread count, but are ignored
1839 * for thread migration/steal decisions. Bound threads that are enqueued in the
1840 * separate runq have the th_bound_cluster_enqueued flag set to allow easy
1841 * removal.
1842 *
1843 * Bound Threads Timesharing
1844 * The bound threads share the timesharing properties of the clutch bucket group they are
1845 * part of. They contribute to the load and use priority shifts/decay values from the
1846 * clutch bucket group.
1847 */
1848
1849 static boolean_t
1850 sched_edge_bound_thread_insert(
1851 sched_clutch_root_t root_clutch,
1852 thread_t thread,
1853 integer_t options)
1854 {
1855 /* Update the clutch runnable count and priority */
1856 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
1857 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
1858 if (root_bucket->scrb_bound_thread_runq.count == 0) {
1859 sched_clutch_root_bucket_runnable(root_bucket, root_clutch, mach_absolute_time());
1860 }
1861
1862 assert((thread->th_bound_cluster_enqueued) == false);
1863 run_queue_enqueue(&root_bucket->scrb_bound_thread_runq, thread, options);
1864 thread->th_bound_cluster_enqueued = true;
1865
1866 int16_t root_old_pri = root_clutch->scr_priority;
1867 sched_clutch_root_pri_update(root_clutch);
1868 return root_clutch->scr_priority > root_old_pri;
1869 }
1870
1871 static void
1872 sched_edge_bound_thread_remove(
1873 sched_clutch_root_t root_clutch,
1874 thread_t thread)
1875 {
1876 sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
1877 assert((thread->th_bound_cluster_enqueued) == true);
1878 run_queue_remove(&root_bucket->scrb_bound_thread_runq, thread);
1879 thread->th_bound_cluster_enqueued = false;
1880
1881 /* Update the clutch runnable count and priority */
1882 sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
1883 if (root_bucket->scrb_bound_thread_runq.count == 0) {
1884 sched_clutch_root_bucket_empty(root_bucket, root_clutch, mach_absolute_time());
1885 }
1886 sched_clutch_root_pri_update(root_clutch);
1887 }
1888
1889 /*
1890 * Edge Scheduler cluster shared resource threads load balancing
1891 *
1892 * The Edge scheduler attempts to load balance cluster shared resource intensive threads
1893 * across clusters in order to reduce contention on the shared resources. It achieves
1894 * that by maintaining the runnable and running shared resource load on each cluster
1895 * and balancing the load across multiple clusters.
1896 *
1897 * The current implementation for cluster shared resource load balancing looks at
1898 * the per-cluster load at thread runnable time to enqueue the thread in the appropriate
1899 * cluster. The thread is enqueued in the cluster bound runqueue to ensure idle CPUs
1900 * do not steal/rebalance shared resource threads. Some more details for the implementation:
1901 *
1902 * - When threads are tagged as shared resource, they go through the cluster selection logic
1903 * which looks at cluster shared resource loads and picks a cluster accordingly. The thread is
1904 * enqueued in the cluster bound runqueue.
1905 *
1906 * - When the threads start running and call avoid_processor, the load balancing logic will be
1907 * invoked and cause the thread to be sent to a more preferred cluster if one exists and has
1908 * no shared resource load.
1909 *
1910 * - If a CPU in a preferred cluster is going idle and that cluster has no more shared load,
1911 * it will look at running shared resource threads on foreign clusters and actively rebalance them.
1912 *
1913 * - Runnable shared resource threads are not stolen by the preferred cluster CPUs as they
1914 * go idle intentionally.
1915 *
1916 * - One caveat of this design is that if a preferred CPU has already run and finished its shared
1917 * resource thread execution, it will not go out and steal the runnable thread in the non-preferred cluster.
1918 * The rebalancing will happen when the thread actually runs on a non-preferred cluster and one of the
1919 * events listed above happen.
1920 *
1921 * - Also it currently does not consider other properties such as thread priorities and
1922 * qos level thread load in the thread placement decision.
1923 *
1924 * Edge Scheduler cluster shared resource thread scheduling policy
1925 *
1926 * The threads for shared resources can be scheduled using one of the two policies:
1927 *
1928 * EDGE_SHARED_RSRC_SCHED_POLICY_RR
1929 * This policy distributes the threads so that they spread across all available clusters
1930 * irrespective of type. The idea is that this scheduling policy will put a shared resource
1931 * thread on each cluster on the platform before it starts doubling up on clusters.
1932 *
1933 * EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST
1934 * This policy distributes threads so that the threads first fill up all the capacity on
1935 * the preferred cluster and its homogeneous peers before spilling to different core type.
1936 * The current implementation defines capacity based on the number of CPUs in the cluster;
1937 * so a cluster with n CPUs is considered full once it has n runnable + running shared
1938 * resource threads. This policy is different from the
1939 * default scheduling policy of the edge scheduler since this always tries to fill up the
1940 * native clusters to capacity even when non-native clusters might be idle.
1941 */
1942 __options_decl(edge_shared_rsrc_sched_policy_t, uint32_t, {
1943 EDGE_SHARED_RSRC_SCHED_POLICY_RR = 0,
1944 EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST = 1,
1945 });
1946
1947 const edge_shared_rsrc_sched_policy_t edge_shared_rsrc_policy[CLUSTER_SHARED_RSRC_TYPE_COUNT] = {
1948 [CLUSTER_SHARED_RSRC_TYPE_RR] = EDGE_SHARED_RSRC_SCHED_POLICY_RR,
1949 [CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST,
1950 };
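/*
 * Minimal sketch of the NATIVE_FIRST capacity notion described above: a
 * cluster is treated as full once its shared resource load reaches the number
 * of CPUs in the cluster. The helper is hypothetical, only accounts for the
 * runnable portion of the load, and takes the CPU count as a parameter since
 * the real policy also considers running threads; it is wrapped in #if 0 for
 * illustration only.
 */
#if 0
static bool
sched_edge_shared_rsrc_cluster_full(sched_clutch_root_t root_clutch,
    cluster_shared_rsrc_type_t shared_rsrc_type, uint16_t cluster_cpu_count)
{
	return sched_edge_shared_rsrc_runnable_load(root_clutch, shared_rsrc_type) >= cluster_cpu_count;
}
#endif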
1951
1952 static void
1953 sched_edge_shared_rsrc_runnable_load_incr(sched_clutch_root_t root_clutch, thread_t thread)
1954 {
1955 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
1956 root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_RR]++;
1957 thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_RR] = true;
1958 }
1959 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
1960 root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST]++;
1961 thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = true;
1962 }
1963 }
1964
1965 static void
1966 sched_edge_shared_rsrc_runnable_load_decr(sched_clutch_root_t root_clutch, thread_t thread)
1967 {
1968 for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
1969 if (thread->th_shared_rsrc_enqueued[shared_rsrc_type]) {
1970 thread->th_shared_rsrc_enqueued[shared_rsrc_type] = false;
1971 root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type]--;
1972 }
1973 }
1974 }
1975
1976 uint16_t
1977 sched_edge_shared_rsrc_runnable_load(sched_clutch_root_t root_clutch, cluster_shared_rsrc_type_t shared_rsrc_type)
1978 {
1979 return root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type];
1980 }
1981
1982 /*
1983 * sched_edge_shared_rsrc_idle()
1984 *
1985 * Routine used to determine if the constrained resource for the pset is idle. This is
1986 * used by a CPU going idle to decide if it should rebalance a running shared resource
1987 * thread from a non-preferred cluster.
1988 */
1989 static boolean_t
1990 sched_edge_shared_rsrc_idle(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
1991 {
1992 return sched_pset_cluster_shared_rsrc_load(pset, shared_rsrc_type) == 0;
1993 }
1994
1995 /*
1996 * sched_edge_thread_shared_rsrc_type
1997 *
1998 * This routine decides if a given thread needs special handling for being a
1999 * heavy shared resource user. It is valid for the same thread to be using
2000 * several shared resources at the same time and have multiple policy flags set.
2001 * This routine determines which of those properties will be used for load
2002 * balancing and migration decisions.
2003 */
2004 static cluster_shared_rsrc_type_t
2005 sched_edge_thread_shared_rsrc_type(thread_t thread)
2006 {
2007 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
2008 return CLUSTER_SHARED_RSRC_TYPE_RR;
2009 }
2010 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
2011 return CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST;
2012 }
2013 return CLUSTER_SHARED_RSRC_TYPE_NONE;
2014 }
2015
2016 #endif /* CONFIG_SCHED_EDGE */
2017
2018 /*
2019 * sched_clutch_thread_bound_lookup()
2020 *
2021 * Routine to lookup the highest priority runnable thread in a bounded root bucket.
2022 */
2023 static thread_t
2024 sched_clutch_thread_bound_lookup(
2025 __unused sched_clutch_root_t root_clutch,
2026 sched_clutch_root_bucket_t root_bucket)
2027 {
2028 return run_queue_peek(&root_bucket->scrb_bound_thread_runq);
2029 }
2030
2031 /*
2032 * Clutch Bucket Group Thread Counts and Pending time calculation
2033 *
2034 * The pending time on the clutch_bucket_group allows the scheduler to track if it
2035 * needs to ageout the CPU usage because the clutch_bucket_group has been pending for
2036 * a very long time. The pending time is set to the timestamp as soon as a thread becomes
2037 * runnable. When a thread is picked up for execution from this clutch_bucket_group, the
2038 * pending time is advanced to the time of thread selection.
2039 *
2040 * Since threads for a clutch bucket group can be added or removed from multiple CPUs
2041 * simultaneously, it is important that the updates to thread counts and pending timestamps
2042 * happen atomically. The implementation relies on the following aspects to make that work
2043 * as expected:
2044 * - The clutch scheduler would be deployed on single cluster platforms where the pset lock
2045 * is held when threads are added/removed and pending timestamps are updated
2046 * - The edge scheduler would have support for double wide 128 bit atomics which allow the
2047 * thread count and pending timestamp to be updated atomically.
2048 *
2049 * Clutch bucket group interactivity timestamp and score updates also rely on the properties
2050 * above to atomically update the interactivity score for a clutch bucket group.
2051 */
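/*
 * Sketch of the double-wide update pattern the edge scheduler relies on: the
 * count and timestamp live in a single 128-bit quantity so one atomic
 * read-modify-write keeps them consistent. The union below is an illustrative
 * stand-in (wrapped in #if 0), not the definition of
 * sched_clutch_counter_time_t used by this file.
 */
#if 0
typedef union {
	struct {
		uint64_t count;
		uint64_t timestamp;
	};
	unsigned __int128 packed;
} counter_time_example_t;

static void
counter_time_example_inc(counter_time_example_t *ct, uint64_t now)
{
	counter_time_example_t old_val, new_val;
	os_atomic_rmw_loop(&ct->packed, old_val.packed, new_val.packed, relaxed, {
		new_val.count = old_val.count + 1;
		/* Stamp the transition from empty to non-empty */
		new_val.timestamp = (old_val.count == 0) ? now : old_val.timestamp;
	});
}
#endif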
2052
2053 #if CONFIG_SCHED_EDGE
2054
2055 static void
2056 sched_clutch_bucket_group_thr_count_inc(
2057 sched_clutch_bucket_group_t clutch_bucket_group,
2058 uint64_t timestamp)
2059 {
2060 sched_clutch_counter_time_t old_pending_data;
2061 sched_clutch_counter_time_t new_pending_data;
2062 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2063 new_pending_data.scct_count = old_pending_data.scct_count + 1;
2064 new_pending_data.scct_timestamp = old_pending_data.scct_timestamp;
2065 if (old_pending_data.scct_count == 0) {
2066 new_pending_data.scct_timestamp = timestamp;
2067 }
2068 });
2069 }
2070
2071 static void
2072 sched_clutch_bucket_group_thr_count_dec(
2073 sched_clutch_bucket_group_t clutch_bucket_group,
2074 uint64_t timestamp)
2075 {
2076 sched_clutch_counter_time_t old_pending_data;
2077 sched_clutch_counter_time_t new_pending_data;
2078 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2079 new_pending_data.scct_count = old_pending_data.scct_count - 1;
2080 if (new_pending_data.scct_count == 0) {
2081 new_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2082 } else {
2083 new_pending_data.scct_timestamp = timestamp;
2084 }
2085 });
2086 }
2087
2088 static uint8_t
2089 sched_clutch_bucket_group_pending_ageout(
2090 sched_clutch_bucket_group_t clutch_bucket_group,
2091 uint64_t timestamp)
2092 {
2093 int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2094 sched_clutch_counter_time_t old_pending_data;
2095 sched_clutch_counter_time_t new_pending_data;
2096 uint8_t cpu_usage_shift = 0;
2097
2098 os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2099 cpu_usage_shift = 0;
2100 uint64_t old_pending_ts = old_pending_data.scct_timestamp;
2101 bool old_update = (old_pending_ts >= timestamp);
2102 bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2103 bool no_bucket_load = (bucket_load == 0);
2104 if (old_update || no_pending_time || no_bucket_load) {
2105 os_atomic_rmw_loop_give_up();
2106 }
2107
2108 /* Calculate the time the clutch bucket group has been pending */
2109 uint64_t pending_delta = timestamp - old_pending_ts;
2110 uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] * bucket_load;
2111 if (pending_delta < interactivity_delta) {
2112 os_atomic_rmw_loop_give_up();
2113 }
2114 cpu_usage_shift = (pending_delta / interactivity_delta);
2115 new_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2116 new_pending_data.scct_count = old_pending_data.scct_count;
2117 });
2118 return cpu_usage_shift;
2119 }
2120
2121 static uint8_t
2122 sched_clutch_bucket_group_interactivity_score_calculate(
2123 sched_clutch_bucket_group_t clutch_bucket_group,
2124 uint64_t timestamp)
2125 {
2126 if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
2127 /*
2128 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
2129 * priorities, make sure all AboveUI buckets are marked interactive.
2130 */
2131 assert(clutch_bucket_group->scbg_interactivity_data.scct_count == (2 * sched_clutch_bucket_group_interactive_pri));
2132 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2133 }
2134 /* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */
2135 uint8_t pending_intervals = sched_clutch_bucket_group_pending_ageout(clutch_bucket_group, timestamp);
2136 /* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */
2137 sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group, pending_intervals);
2138 uint8_t interactivity_score = sched_clutch_interactivity_from_cpu_data(clutch_bucket_group);
2139 sched_clutch_counter_time_t old_interactivity_data;
2140 sched_clutch_counter_time_t new_interactivity_data;
2141
2142 bool score_updated = os_atomic_rmw_loop(&clutch_bucket_group->scbg_interactivity_data.scct_packed, old_interactivity_data.scct_packed, new_interactivity_data.scct_packed, relaxed, {
2143 if (old_interactivity_data.scct_timestamp >= timestamp) {
2144 os_atomic_rmw_loop_give_up();
2145 }
2146 new_interactivity_data.scct_timestamp = timestamp;
2147 if (old_interactivity_data.scct_timestamp != 0) {
2148 new_interactivity_data.scct_count = interactivity_score;
2149 }
2150 });
2151 if (score_updated) {
2152 return (uint8_t)new_interactivity_data.scct_count;
2153 } else {
2154 return (uint8_t)old_interactivity_data.scct_count;
2155 }
2156 }
2157
2158 #else /* CONFIG_SCHED_EDGE */
2159
2160 /*
2161 * For the clutch scheduler, atomicity is ensured by making sure all operations
2162 * are happening under the pset lock of the only cluster present on the platform.
2163 */
2164 static void
2165 sched_clutch_bucket_group_thr_count_inc(
2166 sched_clutch_bucket_group_t clutch_bucket_group,
2167 uint64_t timestamp)
2168 {
2169 sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2170 if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2171 clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2172 }
2173 clutch_bucket_group->scbg_pending_data.scct_count++;
2174 }
2175
2176 static void
2177 sched_clutch_bucket_group_thr_count_dec(
2178 sched_clutch_bucket_group_t clutch_bucket_group,
2179 uint64_t timestamp)
2180 {
2181 sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2182 clutch_bucket_group->scbg_pending_data.scct_count--;
2183 if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2184 clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2185 } else {
2186 clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2187 }
2188 }
2189
2190 static uint8_t
2191 sched_clutch_bucket_group_pending_ageout(
2192 sched_clutch_bucket_group_t clutch_bucket_group,
2193 uint64_t timestamp)
2194 {
2195 sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2196 int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2197 uint64_t old_pending_ts = clutch_bucket_group->scbg_pending_data.scct_timestamp;
2198 bool old_update = (old_pending_ts >= timestamp);
2199 bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2200 bool no_bucket_load = (bucket_load == 0);
2201 if (old_update || no_pending_time || no_bucket_load) {
2202 return 0;
2203 }
2204 uint64_t pending_delta = timestamp - old_pending_ts;
2205 uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] * bucket_load;
2206 if (pending_delta < interactivity_delta) {
2207 return 0;
2208 }
2209 uint8_t cpu_usage_shift = (pending_delta / interactivity_delta);
2210 clutch_bucket_group->scbg_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2211 return cpu_usage_shift;
2212 }
2213
2214 static uint8_t
2215 sched_clutch_bucket_group_interactivity_score_calculate(
2216 sched_clutch_bucket_group_t clutch_bucket_group,
2217 uint64_t timestamp)
2218 {
2219 sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2220 if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
2221 /*
2222 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
2223 * priorities, make sure all AboveUI buckets are marked interactive.
2224 */
2225 assert(clutch_bucket_group->scbg_interactivity_data.scct_count == (2 * sched_clutch_bucket_group_interactive_pri));
2226 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2227 }
2228 /* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */
2229 uint8_t pending_intervals = sched_clutch_bucket_group_pending_ageout(clutch_bucket_group, timestamp);
2230 /* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */
2231 sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group, pending_intervals);
2232 uint8_t interactivity_score = sched_clutch_interactivity_from_cpu_data(clutch_bucket_group);
2233 if (timestamp > clutch_bucket_group->scbg_interactivity_data.scct_timestamp) {
2234 clutch_bucket_group->scbg_interactivity_data.scct_count = interactivity_score;
2235 clutch_bucket_group->scbg_interactivity_data.scct_timestamp = timestamp;
2236 return interactivity_score;
2237 } else {
2238 return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2239 }
2240 }
2241
2242 #endif /* CONFIG_SCHED_EDGE */
2243
2244 /*
2245 * Clutch Bucket Group Run Count and Blocked Time Accounting
2246 *
2247 * The clutch bucket group maintains the number of runnable/running threads in the group.
2248 * Since the blocked time of the clutch bucket group is based on this count, it is
2249 * important to make sure the blocking timestamp and the run count are updated atomically.
2250 *
2251 * Since the run count increments happen without any pset locks held, the scheduler makes
2252 * these updates atomic in the following way:
2253 * - On 64-bit platforms, it uses double wide atomics to update the count & timestamp
2254 * - On 32-bit platforms, it uses a lock to synchronize the count & timestamp update
2255 */
2256
2257 #if !__LP64__
2258
2259 static uint32_t
2260 sched_clutch_bucket_group_run_count_inc(
2261 sched_clutch_bucket_group_t clutch_bucket_group)
2262 {
2263 uint64_t blocked_time = 0;
2264 uint64_t updated_run_count = 0;
2265
2266 lck_spin_lock(&clutch_bucket_group->scbg_stats_lock);
2267 if ((clutch_bucket_group->scbg_blocked_data.scct_timestamp != SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID) &&
2268 (clutch_bucket_group->scbg_blocked_data.scct_count == 0)) {
2269 /* Run count is transitioning from 0 to 1; calculate blocked time and add it to CPU data */
2270 blocked_time = mach_absolute_time() - clutch_bucket_group->scbg_blocked_data.scct_timestamp;
2271 clutch_bucket_group->scbg_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
2272 }
2273 clutch_bucket_group->scbg_blocked_data.scct_count = clutch_bucket_group->scbg_blocked_data.scct_count + 1;
2274 updated_run_count = clutch_bucket_group->scbg_blocked_data.scct_count;
2275 lck_spin_unlock(&clutch_bucket_group->scbg_stats_lock);
2276
2277 blocked_time = MIN(blocked_time, sched_clutch_bucket_group_adjust_threshold);
2278 os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed);
2279 return (uint32_t)updated_run_count;
2280 }
2281
2282 static uint32_t
2283 sched_clutch_bucket_group_run_count_dec(
2284 sched_clutch_bucket_group_t clutch_bucket_group)
2285 {
2286 uint64_t updated_run_count = 0;
2287
2288 lck_spin_lock(&clutch_bucket_group->scbg_stats_lock);
2289 clutch_bucket_group->scbg_blocked_data.scct_count = clutch_bucket_group->scbg_blocked_data.scct_count - 1;
2290 if (clutch_bucket_group->scbg_blocked_data.scct_count == 0) {
2291 /* Run count is transitioning from 1 to 0; start the blocked timer */
2292 clutch_bucket_group->scbg_blocked_data.scct_timestamp = mach_absolute_time();
2293 }
2294 updated_run_count = clutch_bucket_group->scbg_blocked_data.scct_count;
2295 lck_spin_unlock(&clutch_bucket_group->scbg_stats_lock);
2296 return (uint32_t)updated_run_count;
2297 }
2298
2299 #else /* !__LP64__ */
2300
2301 static uint32_t
2302 sched_clutch_bucket_group_run_count_inc(
2303 sched_clutch_bucket_group_t clutch_bucket_group)
2304 {
2305 sched_clutch_counter_time_t old_blocked_data;
2306 sched_clutch_counter_time_t new_blocked_data;
2307
2308 bool update_blocked_time = false;
2309 os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2310 new_blocked_data.scct_count = old_blocked_data.scct_count + 1;
2311 new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2312 update_blocked_time = false;
2313 if (old_blocked_data.scct_count == 0) {
2314 new_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
2315 update_blocked_time = true;
2316 }
2317 });
2318 if (update_blocked_time && (old_blocked_data.scct_timestamp != SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID)) {
2319 uint64_t ctime = mach_absolute_time();
2320 if (ctime > old_blocked_data.scct_timestamp) {
2321 uint64_t blocked_time = ctime - old_blocked_data.scct_timestamp;
2322 blocked_time = MIN(blocked_time, sched_clutch_bucket_group_adjust_threshold);
2323 os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed);
2324 }
2325 }
2326 return (uint32_t)new_blocked_data.scct_count;
2327 }
2328
2329 static uint32_t
2330 sched_clutch_bucket_group_run_count_dec(
2331 sched_clutch_bucket_group_t clutch_bucket_group)
2332 {
2333 sched_clutch_counter_time_t old_blocked_data;
2334 sched_clutch_counter_time_t new_blocked_data;
2335
2336 uint64_t ctime = mach_absolute_time();
2337 os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2338 new_blocked_data.scct_count = old_blocked_data.scct_count - 1;
2339 new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2340 if (new_blocked_data.scct_count == 0) {
2341 new_blocked_data.scct_timestamp = ctime;
2342 }
2343 });
2344 return (uint32_t)new_blocked_data.scct_count;
2345 }
2346
2347 #endif /* !__LP64__ */
2348
2349
2350 /*
2351 * sched_clutch_thread_insert()
2352 *
2353 * Routine to insert a thread into the sched clutch hierarchy.
2354 * Update the counts at all levels of the hierarchy and insert the nodes
2355 * as they become runnable. Always called with the pset lock held.
2356 */
2357 static boolean_t
2358 sched_clutch_thread_insert(
2359 sched_clutch_root_t root_clutch,
2360 thread_t thread,
2361 integer_t options)
2362 {
2363 boolean_t result = FALSE;
2364
2365 sched_clutch_hierarchy_locked_assert(root_clutch);
2366 #if CONFIG_SCHED_EDGE
2367 sched_edge_cluster_cumulative_count_incr(root_clutch, thread->th_sched_bucket);
2368 sched_edge_shared_rsrc_runnable_load_incr(root_clutch, thread);
2369 /*
2370 * Check if the thread is bound and is being enqueued in its desired bound cluster.
2371 * One scenario where a bound thread might not be getting enqueued in the bound cluster
2372 * hierarchy would be if the thread is "soft" bound and the bound cluster is
2373 * de-recommended. In that case, the thread should be treated as an unbound
2374 * thread.
2375 */
2376 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) && (sched_edge_thread_bound_cluster_id(thread) == root_clutch->scr_cluster_id)) {
2377 return sched_edge_bound_thread_insert(root_clutch, thread, options);
2378 }
2379 /*
2380 * Use bound runqueue for shared resource threads. See "cluster shared resource
2381 * threads load balancing" section for details.
2382 */
2383 if (sched_edge_thread_shared_rsrc_type(thread) != CLUSTER_SHARED_RSRC_TYPE_NONE) {
2384 return sched_edge_bound_thread_insert(root_clutch, thread, options);
2385 }
2386
2387 #endif /* CONFIG_SCHED_EDGE */
2388 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2389 assert(thread->thread_group == clutch->sc_tg);
2390
2391 uint64_t current_timestamp = mach_absolute_time();
2392 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2393 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2394 assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
2395
2396 /*
2397 * Thread linkage in clutch_bucket
2398 *
2399 * A thread has a few linkages within the clutch bucket:
2400 * - A stable priority queue linkage which is the main runqueue (based on sched_pri) for the clutch bucket
2401 * - A regular priority queue linkage which is based on thread's base/promoted pri (used for clutch bucket priority calculation)
2402 * - A queue linkage used for timesharing operations of threads at the scheduler tick
2403 */
2404
2405 /* Insert thread into the clutch_bucket stable priority runqueue using sched_pri */
2406 thread->th_clutch_runq_link.stamp = current_timestamp;
2407 priority_queue_entry_set_sched_pri(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link, thread->sched_pri,
2408 (options & SCHED_TAILQ) ? PRIORITY_QUEUE_ENTRY_NONE : PRIORITY_QUEUE_ENTRY_PREEMPTED);
2409 priority_queue_insert(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2410
2411 /* Insert thread into clutch_bucket priority queue based on the promoted or base priority */
2412 priority_queue_entry_set_sched_pri(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link,
2413 sched_thread_sched_pri_promoted(thread) ? thread->sched_pri : thread->base_pri, false);
2414 priority_queue_insert(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2415
2416 /* Insert thread into timesharing queue of the clutch bucket */
2417 enqueue_tail(&clutch_bucket->scb_thread_timeshare_queue, &thread->th_clutch_timeshare_link);
2418
2419 /* Increment the urgency counter for the root if necessary */
2420 sched_clutch_root_urgency_inc(root_clutch, thread);
2421
2422 os_atomic_inc(&clutch->sc_thr_count, relaxed);
2423 sched_clutch_bucket_group_thr_count_inc(clutch_bucket->scb_group, current_timestamp);
2424
2425 /* Enqueue the clutch into the hierarchy (if needed) and update properties; pick the insertion order based on thread options */
2426 sched_clutch_bucket_options_t scb_options = (options & SCHED_HEADQ) ? SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ : SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ;
2427 if (clutch_bucket->scb_thr_count == 0) {
2428 sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2429 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2430 result = sched_clutch_bucket_runnable(clutch_bucket, root_clutch, current_timestamp, scb_options);
2431 } else {
2432 sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2433 sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2434 result = sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, scb_options);
2435 }
2436
2437 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2438 root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2439 SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2440 return result;
2441 }
2442
2443 /*
2444 * sched_clutch_thread_remove()
2445 *
2446 * Routine to remove a thread from the sched clutch hierarchy.
2447 * Update the counts at all levels of the hierarchy and remove the nodes
2448 * as they become empty. Always called with the pset lock held.
2449 */
2450 static void
2451 sched_clutch_thread_remove(
2452 sched_clutch_root_t root_clutch,
2453 thread_t thread,
2454 uint64_t current_timestamp,
2455 sched_clutch_bucket_options_t options)
2456 {
2457 sched_clutch_hierarchy_locked_assert(root_clutch);
2458 #if CONFIG_SCHED_EDGE
2459 sched_edge_cluster_cumulative_count_decr(root_clutch, thread->th_sched_bucket);
2460 sched_edge_shared_rsrc_runnable_load_decr(root_clutch, thread);
2461
2462 if (thread->th_bound_cluster_enqueued) {
2463 sched_edge_bound_thread_remove(root_clutch, thread);
2464 return;
2465 }
2466 #endif /* CONFIG_SCHED_EDGE */
2467 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2468 assert(thread->thread_group == clutch->sc_tg);
2469 assert(thread->runq != PROCESSOR_NULL);
2470
2471 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2472 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2473 assert(clutch_bucket->scb_root == root_clutch);
2474
2475 /* Decrement the urgency counter for the root if necessary */
2476 sched_clutch_root_urgency_dec(root_clutch, thread);
2477 /* Remove thread from the clutch_bucket */
2478 priority_queue_remove(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2479 remqueue(&thread->th_clutch_timeshare_link);
2480 thread->runq = PROCESSOR_NULL;
2481
2482 priority_queue_remove(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2483
2484 /* Update counts at various levels of the hierarchy */
2485 os_atomic_dec(&clutch->sc_thr_count, relaxed);
2486 sched_clutch_bucket_group_thr_count_dec(clutch_bucket->scb_group, current_timestamp);
2487 sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
2488 sched_clutch_thr_count_dec(&clutch_bucket->scb_thr_count);
2489
2490 /* Remove the clutch from hierarchy (if needed) and update properties */
2491 if (clutch_bucket->scb_thr_count == 0) {
2492 sched_clutch_bucket_empty(clutch_bucket, root_clutch, current_timestamp, options);
2493 } else {
2494 sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, options);
2495 }
2496
2497 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2498 root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2499 SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2500 }
2501
2502 /*
2503 * sched_clutch_thread_unbound_lookup()
2504 *
2505 * Routine to find the highest unbound thread in the root clutch.
2506 * Helps find threads easily for steal/migrate scenarios in the
2507 * Edge scheduler.
2508 */
2509 static thread_t
2510 sched_clutch_thread_unbound_lookup(
2511 sched_clutch_root_t root_clutch,
2512 sched_clutch_root_bucket_t root_bucket)
2513 {
2514 sched_clutch_hierarchy_locked_assert(root_clutch);
2515
2516 /* Find the highest priority clutch bucket in this root bucket */
2517 sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket);
2518 assert(clutch_bucket != NULL);
2519
2520 /* Find the highest priority runnable thread in this clutch bucket */
2521 thread_t thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
2522 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
2523 thread_tid(thread), thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, 0, 0);
2524 return thread;
2525 }
2526
2527 /*
2528 * sched_clutch_thread_highest_remove()
2529 *
2530 * Routine to find and remove the highest priority thread
2531 * from the sched clutch hierarchy. The algorithm looks at the
2532 * hierarchy for the most eligible runnable thread and calls
2533 * sched_clutch_thread_remove(). Always called with the
2534 * pset lock held.
2535 */
2536 static thread_t
2537 sched_clutch_thread_highest_remove(
2538 sched_clutch_root_t root_clutch)
2539 {
2540 sched_clutch_hierarchy_locked_assert(root_clutch);
2541 uint64_t current_timestamp = mach_absolute_time();
2542
2543 sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(root_clutch, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
2544 if (root_bucket == NULL) {
2545 return THREAD_NULL;
2546 }
2547
2548 thread_t highest_thread = THREAD_NULL;
2549 if (root_bucket->scrb_bound) {
2550 highest_thread = sched_clutch_thread_bound_lookup(root_clutch, root_bucket);
2551 } else {
2552 highest_thread = sched_clutch_thread_unbound_lookup(root_clutch, root_bucket);
2553 }
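	/*
	 * SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR appears to request round-robin
	 * rotation among clutch buckets of equal priority when the bucket is
	 * re-queued after this removal, so thread groups at the same priority
	 * take turns at the root level.
	 */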
2554 sched_clutch_thread_remove(root_clutch, highest_thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
2555 return highest_thread;
2556 }
2557
2558 /* High level global accessor routines */
2559
2560 /*
2561 * sched_clutch_root_urgency()
2562 *
2563 * Routine to get the urgency of the highest runnable
2564 * thread in the hierarchy.
2565 */
2566 static uint32_t
2567 sched_clutch_root_urgency(
2568 sched_clutch_root_t root_clutch)
2569 {
2570 return root_clutch->scr_urgency;
2571 }
2572
2573 /*
2574 * sched_clutch_root_count_sum()
2575 *
2576 * The count_sum mechanism is used for scheduler runq
2577 * statistics calculation. It is only useful for debugging
2578 * purposes; since it requires a mach_absolute_time() call on
2579 * other scheduler implementations, it is better to avoid
2580 * populating it until absolutely necessary.
2581 */
2582 static uint32_t
2583 sched_clutch_root_count_sum(
2584 __unused sched_clutch_root_t root_clutch)
2585 {
2586 return 0;
2587 }
2588
2589 /*
2590 * sched_clutch_root_priority()
2591 *
2592 * Routine to get the priority of the highest runnable
2593 * thread in the hierarchy.
2594 */
2595 static int
2596 sched_clutch_root_priority(
2597 sched_clutch_root_t root_clutch)
2598 {
2599 return root_clutch->scr_priority;
2600 }
2601
2602 /*
2603 * sched_clutch_root_count()
2604 *
2605 * Returns total number of runnable threads in the hierarchy.
2606 */
2607 uint32_t
2608 sched_clutch_root_count(
2609 sched_clutch_root_t root_clutch)
2610 {
2611 return root_clutch->scr_thr_count;
2612 }
2613
2614 #if CONFIG_SCHED_EDGE
2615
2616 /*
2617 * sched_clutch_root_foreign_empty()
2618 *
2619 * Routine to check if the foreign clutch bucket priority list is empty for a cluster.
2620 */
2621 static boolean_t
2622 sched_clutch_root_foreign_empty(
2623 sched_clutch_root_t root_clutch)
2624 {
2625 return priority_queue_empty(&root_clutch->scr_foreign_buckets);
2626 }
2627
2628 /*
2629 * sched_clutch_root_highest_foreign_thread_remove()
2630 *
2631 * Routine to return the thread in the highest priority clutch bucket in a cluster.
2632 * Must be called with the pset for the cluster locked.
2633 */
2634 static thread_t
2635 sched_clutch_root_highest_foreign_thread_remove(
2636 sched_clutch_root_t root_clutch)
2637 {
2638 thread_t thread = THREAD_NULL;
2639 if (priority_queue_empty(&root_clutch->scr_foreign_buckets)) {
2640 return thread;
2641 }
2642 sched_clutch_bucket_t clutch_bucket = priority_queue_max(&root_clutch->scr_foreign_buckets, struct sched_clutch_bucket, scb_foreignlink);
2643 thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
2644 sched_clutch_thread_remove(root_clutch, thread, mach_absolute_time(), 0);
2645 return thread;
2646 }
2647
2648 #endif /* CONFIG_SCHED_EDGE */
2649
2650 /*
2651 * sched_clutch_thread_pri_shift()
2652 *
2653 * Routine to get the priority shift value for a thread.
2654 * Since the timesharing is done at the clutch_bucket level,
2655 * this routine gets the clutch_bucket and retrieves the
2656 * values from there.
2657 */
2658 uint32_t
2659 sched_clutch_thread_pri_shift(
2660 thread_t thread,
2661 sched_bucket_t bucket)
2662 {
2663 if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2664 return INT8_MAX;
2665 }
2666 assert(bucket != TH_BUCKET_RUN);
2667 sched_clutch_t clutch = sched_clutch_for_thread(thread);
2668 sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
2669 return os_atomic_load(&clutch_bucket_group->scbg_pri_shift, relaxed);
2670 }
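
/*
 * The pri_shift returned here feeds the standard timeshare decay computation,
 * which (roughly, and assuming the usual xnu formulation) lowers a thread's
 * scheduled priority as it accumulates CPU usage:
 *
 *	sched_pri = base_pri - (sched_usage >> pri_shift);
 *
 * A smaller per-bucket-group pri_shift therefore means more aggressive decay
 * for threads in busier clutch bucket groups, while INT8_MAX for ineligible
 * threads effectively disables usage-based decay.
 */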
2671
2672 #pragma mark -- Clutch Scheduler Algorithm
2673
2674 static void
2675 sched_clutch_init(void);
2676
2677 static thread_t
2678 sched_clutch_steal_thread(processor_set_t pset);
2679
2680 static void
2681 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context);
2682
2683 static boolean_t
2684 sched_clutch_processor_enqueue(processor_t processor, thread_t thread,
2685 sched_options_t options);
2686
2687 static boolean_t
2688 sched_clutch_processor_queue_remove(processor_t processor, thread_t thread);
2689
2690 static ast_t
2691 sched_clutch_processor_csw_check(processor_t processor);
2692
2693 static boolean_t
2694 sched_clutch_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
2695
2696 static int
2697 sched_clutch_runq_count(processor_t processor);
2698
2699 static boolean_t
2700 sched_clutch_processor_queue_empty(processor_t processor);
2701
2702 static uint64_t
2703 sched_clutch_runq_stats_count_sum(processor_t processor);
2704
2705 static int
2706 sched_clutch_processor_bound_count(processor_t processor);
2707
2708 static void
2709 sched_clutch_pset_init(processor_set_t pset);
2710
2711 static void
2712 sched_clutch_processor_init(processor_t processor);
2713
2714 static thread_t
2715 sched_clutch_choose_thread(processor_t processor, int priority, ast_t reason);
2716
2717 static void
2718 sched_clutch_processor_queue_shutdown(processor_t processor);
2719
2720 static sched_mode_t
2721 sched_clutch_initial_thread_sched_mode(task_t parent_task);
2722
2723 static uint32_t
2724 sched_clutch_initial_quantum_size(thread_t thread);
2725
2726 static bool
2727 sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread);
2728
2729 static uint32_t
2730 sched_clutch_run_incr(thread_t thread);
2731
2732 static uint32_t
2733 sched_clutch_run_decr(thread_t thread);
2734
2735 static void
2736 sched_clutch_update_thread_bucket(thread_t thread);
2737
2738 const struct sched_dispatch_table sched_clutch_dispatch = {
2739 .sched_name = "clutch",
2740 .init = sched_clutch_init,
2741 .timebase_init = sched_timeshare_timebase_init,
2742 .processor_init = sched_clutch_processor_init,
2743 .pset_init = sched_clutch_pset_init,
2744 .maintenance_continuation = sched_timeshare_maintenance_continue,
2745 .choose_thread = sched_clutch_choose_thread,
2746 .steal_thread_enabled = sched_steal_thread_enabled,
2747 .steal_thread = sched_clutch_steal_thread,
2748 .compute_timeshare_priority = sched_compute_timeshare_priority,
2749 .choose_node = sched_choose_node,
2750 .choose_processor = choose_processor,
2751 .processor_enqueue = sched_clutch_processor_enqueue,
2752 .processor_queue_shutdown = sched_clutch_processor_queue_shutdown,
2753 .processor_queue_remove = sched_clutch_processor_queue_remove,
2754 .processor_queue_empty = sched_clutch_processor_queue_empty,
2755 .priority_is_urgent = priority_is_urgent,
2756 .processor_csw_check = sched_clutch_processor_csw_check,
2757 .processor_queue_has_priority = sched_clutch_processor_queue_has_priority,
2758 .initial_quantum_size = sched_clutch_initial_quantum_size,
2759 .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode,
2760 .can_update_priority = can_update_priority,
2761 .update_priority = update_priority,
2762 .lightweight_update_priority = lightweight_update_priority,
2763 .quantum_expire = sched_default_quantum_expire,
2764 .processor_runq_count = sched_clutch_runq_count,
2765 .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum,
2766 .processor_bound_count = sched_clutch_processor_bound_count,
2767 .thread_update_scan = sched_clutch_thread_update_scan,
2768 .multiple_psets_enabled = TRUE,
2769 .sched_groups_enabled = FALSE,
2770 .avoid_processor_enabled = TRUE,
2771 .thread_avoid_processor = sched_clutch_thread_avoid_processor,
2772 .processor_balance = sched_SMT_balance,
2773
2774 .rt_runq = sched_rtlocal_runq,
2775 .rt_init = sched_rtlocal_init,
2776 .rt_queue_shutdown = sched_rtlocal_queue_shutdown,
2777 .rt_runq_scan = sched_rtlocal_runq_scan,
2778 .rt_runq_count_sum = sched_rtlocal_runq_count_sum,
2779 .rt_steal_thread = sched_rtlocal_steal_thread,
2780
2781 .qos_max_parallelism = sched_qos_max_parallelism,
2782 .check_spill = sched_check_spill,
2783 .ipi_policy = sched_ipi_policy,
2784 .thread_should_yield = sched_thread_should_yield,
2785 .run_count_incr = sched_clutch_run_incr,
2786 .run_count_decr = sched_clutch_run_decr,
2787 .update_thread_bucket = sched_clutch_update_thread_bucket,
2788 .pset_made_schedulable = sched_pset_made_schedulable,
2789 .cpu_init_completed = NULL,
2790 .thread_eligible_for_pset = NULL,
2791 };
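
/*
 * The dispatch table above is the policy vtable the scheduler core calls
 * through (e.g. via the SCHED() indirection macro), so every entry must be
 * populated with either a clutch-specific routine or a shared default. A
 * minimal sketch of how the core consumes it, assuming the usual SCHED()
 * definition:
 *
 *	thread_t thread = SCHED(choose_thread)(processor, MINPRI, AST_NONE);
 *	if (thread == THREAD_NULL) {
 *		thread = SCHED(steal_thread)(processor->processor_set);
 *	}
 */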
2792
2793 __attribute__((always_inline))
2794 static inline run_queue_t
2795 sched_clutch_bound_runq(processor_t processor)
2796 {
2797 return &processor->runq;
2798 }
2799
2800 __attribute__((always_inline))
2801 static inline sched_clutch_root_t
2802 sched_clutch_processor_root_clutch(processor_t processor)
2803 {
2804 return &processor->processor_set->pset_clutch_root;
2805 }
2806
2807 __attribute__((always_inline))
2808 static inline run_queue_t
2809 sched_clutch_thread_bound_runq(processor_t processor, __assert_only thread_t thread)
2810 {
2811 assert(thread->bound_processor == processor);
2812 return sched_clutch_bound_runq(processor);
2813 }
2814
2815 static uint32_t
2816 sched_clutch_initial_quantum_size(thread_t thread)
2817 {
2818 if (thread == THREAD_NULL) {
2819 return std_quantum;
2820 }
2821 assert(sched_clutch_thread_quantum[thread->th_sched_bucket] <= UINT32_MAX);
2822 return (uint32_t)sched_clutch_thread_quantum[thread->th_sched_bucket];
2823 }
2824
2825 static sched_mode_t
2826 sched_clutch_initial_thread_sched_mode(task_t parent_task)
2827 {
2828 if (parent_task == kernel_task) {
2829 return TH_MODE_FIXED;
2830 } else {
2831 return TH_MODE_TIMESHARE;
2832 }
2833 }
2834
2835 static void
2836 sched_clutch_processor_init(processor_t processor)
2837 {
2838 run_queue_init(&processor->runq);
2839 }
2840
2841 static void
2842 sched_clutch_pset_init(processor_set_t pset)
2843 {
2844 sched_clutch_root_init(&pset->pset_clutch_root, pset);
2845 }
2846
2847 static void
2848 sched_clutch_tunables_init(void)
2849 {
2850 sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
2851 sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
2852 sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
2853 clock_interval_to_absolutetime_interval(SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS,
2854 NSEC_PER_USEC, &sched_clutch_bucket_group_adjust_threshold);
2855 assert(sched_clutch_bucket_group_adjust_threshold <= CLUTCH_CPU_DATA_MAX);
2856 sched_clutch_us_to_abstime(sched_clutch_bucket_group_pending_delta_us, sched_clutch_bucket_group_pending_delta);
2857 }
2858
2859 static void
2860 sched_clutch_init(void)
2861 {
2862 if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
2863 sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
2864 }
2865 sched_timeshare_init();
2866 sched_clutch_tunables_init();
2867 }
2868
2869 static thread_t
2870 sched_clutch_choose_thread(
2871 processor_t processor,
2872 int priority,
2873 __unused ast_t reason)
2874 {
2875 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
2876 uint32_t clutch_count = sched_clutch_root_count(sched_clutch_processor_root_clutch(processor));
2877 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2878 boolean_t choose_from_boundq = false;
2879
2880 if (bound_runq->highq < priority &&
2881 clutch_pri < priority) {
2882 return THREAD_NULL;
2883 }
2884
2885 if (bound_runq->count && clutch_count) {
2886 if (bound_runq->highq >= clutch_pri) {
2887 choose_from_boundq = true;
2888 }
2889 } else if (bound_runq->count) {
2890 choose_from_boundq = true;
2891 } else if (clutch_count) {
2892 choose_from_boundq = false;
2893 } else {
2894 return THREAD_NULL;
2895 }
2896
2897 thread_t thread = THREAD_NULL;
2898 if (choose_from_boundq == false) {
2899 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
2900 thread = sched_clutch_thread_highest_remove(pset_clutch_root);
2901 } else {
2902 thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
2903 }
2904 return thread;
2905 }
2906
2907 static boolean_t
2908 sched_clutch_processor_enqueue(
2909 processor_t processor,
2910 thread_t thread,
2911 sched_options_t options)
2912 {
2913 boolean_t result;
2914
2915 thread->runq = processor;
2916 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2917 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
2918 result = sched_clutch_thread_insert(pset_clutch_root, thread, options);
2919 } else {
2920 run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
2921 result = run_queue_enqueue(rq, thread, options);
2922 }
2923 return result;
2924 }
2925
2926 static boolean_t
2927 sched_clutch_processor_queue_empty(processor_t processor)
2928 {
2929 return sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0 &&
2930 sched_clutch_bound_runq(processor)->count == 0;
2931 }
2932
2933 static ast_t
2934 sched_clutch_processor_csw_check(processor_t processor)
2935 {
2936 boolean_t has_higher;
2937 int pri;
2938
2939 if (sched_clutch_thread_avoid_processor(processor, current_thread())) {
2940 return AST_PREEMPT | AST_URGENT;
2941 }
2942
2943 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2944 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
2945
2946 assert(processor->active_thread != NULL);
2947
2948 pri = MAX(clutch_pri, bound_runq->highq);
2949
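	/*
	 * During the first timeslice only a strictly higher priority causes a
	 * preemption; once the quantum has expired, an equal priority is enough,
	 * which gives round-robin behavior among same-priority threads.
	 */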
2950 if (processor->first_timeslice) {
2951 has_higher = (pri > processor->current_pri);
2952 } else {
2953 has_higher = (pri >= processor->current_pri);
2954 }
2955
2956 if (has_higher) {
2957 if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
2958 return AST_PREEMPT | AST_URGENT;
2959 }
2960
2961 if (bound_runq->urgency > 0) {
2962 return AST_PREEMPT | AST_URGENT;
2963 }
2964
2965 return AST_PREEMPT;
2966 }
2967
2968 return AST_NONE;
2969 }
2970
2971 static boolean_t
2972 sched_clutch_processor_queue_has_priority(processor_t processor,
2973 int priority,
2974 boolean_t gte)
2975 {
2976 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2977
2978 int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);
2979
2980 if (gte) {
2981 return qpri >= priority;
2982 } else {
2983 return qpri > priority;
2984 }
2985 }
2986
2987 static int
2988 sched_clutch_runq_count(processor_t processor)
2989 {
2990 return (int)sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) + sched_clutch_bound_runq(processor)->count;
2991 }
2992
2993 static uint64_t
2994 sched_clutch_runq_stats_count_sum(processor_t processor)
2995 {
2996 uint64_t bound_sum = sched_clutch_bound_runq(processor)->runq_stats.count_sum;
2997
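	/*
	 * The clutch root hierarchy is shared by all CPUs in the pset, so its
	 * count_sum is folded in only for one designated CPU (cpu_set_low),
	 * presumably to avoid double-counting it in pset-wide statistics.
	 */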
2998 if (processor->cpu_id == processor->processor_set->cpu_set_low) {
2999 return bound_sum + sched_clutch_root_count_sum(sched_clutch_processor_root_clutch(processor));
3000 } else {
3001 return bound_sum;
3002 }
3003 }
3004 static int
3005 sched_clutch_processor_bound_count(processor_t processor)
3006 {
3007 return sched_clutch_bound_runq(processor)->count;
3008 }
3009
3010 static void
3011 sched_clutch_processor_queue_shutdown(processor_t processor)
3012 {
3013 processor_set_t pset = processor->processor_set;
3014 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3015 thread_t thread;
3016 queue_head_t tqueue;
3017
3018 /* We only need to migrate threads if this is the last active processor in the pset */
3019 if (pset->online_processor_count > 0) {
3020 pset_unlock(pset);
3021 return;
3022 }
3023
3024 queue_init(&tqueue);
3025 while (sched_clutch_root_count(pset_clutch_root) > 0) {
3026 thread = sched_clutch_thread_highest_remove(pset_clutch_root);
3027 enqueue_tail(&tqueue, &thread->runq_links);
3028 }
3029
3030 pset_unlock(pset);
3031
3032 qe_foreach_element_safe(thread, &tqueue, runq_links) {
3033 remqueue(&thread->runq_links);
3034 thread_lock(thread);
3035 thread_setrun(thread, SCHED_TAILQ);
3036 thread_unlock(thread);
3037 }
3038 }
3039
3040 static boolean_t
3041 sched_clutch_processor_queue_remove(
3042 processor_t processor,
3043 thread_t thread)
3044 {
3045 run_queue_t rq;
3046 processor_set_t pset = processor->processor_set;
3047
3048 pset_lock(pset);
3049
3050 if (processor == thread->runq) {
3051 /*
3052 * Thread is on a run queue and we have a lock on
3053 * that run queue.
3054 */
3055 if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3056 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3057 sched_clutch_thread_remove(pset_clutch_root, thread, mach_absolute_time(), SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
3058 } else {
3059 rq = sched_clutch_thread_bound_runq(processor, thread);
3060 run_queue_remove(rq, thread);
3061 }
3062 } else {
3063 /*
3064 * The thread left the run queue before we could
3065 * lock the run queue.
3066 */
3067 assert(thread->runq == PROCESSOR_NULL);
3068 processor = PROCESSOR_NULL;
3069 }
3070
3071 pset_unlock(pset);
3072
3073 return processor != PROCESSOR_NULL;
3074 }
3075
3076 static thread_t
3077 sched_clutch_steal_thread(__unused processor_set_t pset)
3078 {
3079 /* Thread stealing is not enabled for single cluster clutch scheduler platforms */
3080 return THREAD_NULL;
3081 }
3082
3083 static void
3084 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context)
3085 {
3086 boolean_t restart_needed = FALSE;
3087 processor_t processor = processor_list;
3088 processor_set_t pset;
3089 thread_t thread;
3090 spl_t s;
3091
3092 /*
3093 * We update the threads associated with each processor (bound and idle threads)
3094 * and then update the threads in each pset runqueue.
3095 */
3096
3097 do {
3098 do {
3099 pset = processor->processor_set;
3100
3101 s = splsched();
3102 pset_lock(pset);
3103
3104 restart_needed = runq_scan(sched_clutch_bound_runq(processor), scan_context);
3105
3106 pset_unlock(pset);
3107 splx(s);
3108
3109 if (restart_needed) {
3110 break;
3111 }
3112
3113 thread = processor->idle_thread;
3114 if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
3115 if (thread_update_add_thread(thread) == FALSE) {
3116 restart_needed = TRUE;
3117 break;
3118 }
3119 }
3120 } while ((processor = processor->processor_list) != NULL);
3121
3122 /* Ok, we now have a collection of candidates -- fix them. */
3123 thread_update_process_threads();
3124 } while (restart_needed);
3125
3126 pset_node_t node = &pset_node0;
3127 pset = node->psets;
3128
3129 do {
3130 do {
3131 restart_needed = FALSE;
3132 while (pset != NULL) {
3133 s = splsched();
3134 pset_lock(pset);
3135
3136 if (sched_clutch_root_count(&pset->pset_clutch_root) > 0) {
3137 for (sched_bucket_t bucket = TH_BUCKET_SHARE_FG; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
3138 restart_needed = runq_scan(&pset->pset_clutch_root.scr_bound_buckets[bucket].scrb_bound_thread_runq, scan_context);
3139 if (restart_needed) {
3140 break;
3141 }
3142 }
3143 queue_t clutch_bucket_list = &pset->pset_clutch_root.scr_clutch_buckets;
3144 sched_clutch_bucket_t clutch_bucket;
3145 qe_foreach_element(clutch_bucket, clutch_bucket_list, scb_listlink) {
3146 sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, scan_context->sched_tick_last_abstime);
3147 restart_needed = sched_clutch_timeshare_scan(&clutch_bucket->scb_thread_timeshare_queue, clutch_bucket->scb_thr_count, scan_context);
3148 }
3149 }
3150
3151 pset_unlock(pset);
3152 splx(s);
3153
3154 if (restart_needed) {
3155 break;
3156 }
3157 pset = pset->pset_list;
3158 }
3159
3160 if (restart_needed) {
3161 break;
3162 }
3163 } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
3164
3165 /* Ok, we now have a collection of candidates -- fix them. */
3166 thread_update_process_threads();
3167 } while (restart_needed);
3168 }
3169
3170 extern int sched_allow_rt_smt;
3171
3172 /* Return true if this thread should not continue running on this processor */
3173 static bool
3174 sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread)
3175 {
3176 if (processor->processor_primary != processor) {
3177 /*
3178 * This is a secondary SMT processor. If the primary is running
3179 * a realtime thread, only allow realtime threads on the secondary.
3180 */
3181 if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
3182 return true;
3183 }
3184 }
3185
3186 return false;
3187 }
3188
3189 /*
3190 * For the clutch scheduler, the run counts are maintained in the clutch
3191 * buckets (i.e thread group scheduling structure).
3192 */
3193 static uint32_t
3194 sched_clutch_run_incr(thread_t thread)
3195 {
3196 assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
3197 uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3198 sched_clutch_thread_run_bucket_incr(thread, thread->th_sched_bucket);
3199 return new_count;
3200 }
3201
3202 static uint32_t
3203 sched_clutch_run_decr(thread_t thread)
3204 {
3205 assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);
3206 uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3207 sched_clutch_thread_run_bucket_decr(thread, thread->th_sched_bucket);
3208 return new_count;
3209 }
3210
3211 static sched_bucket_t
3212 sched_convert_pri_to_bucket(uint8_t priority)
3213 {
3214 sched_bucket_t bucket = TH_BUCKET_RUN;
3215
3216 if (priority > BASEPRI_USER_INITIATED) {
3217 bucket = TH_BUCKET_SHARE_FG;
3218 } else if (priority > BASEPRI_DEFAULT) {
3219 bucket = TH_BUCKET_SHARE_IN;
3220 } else if (priority > BASEPRI_UTILITY) {
3221 bucket = TH_BUCKET_SHARE_DF;
3222 } else if (priority > MAXPRI_THROTTLE) {
3223 bucket = TH_BUCKET_SHARE_UT;
3224 } else {
3225 bucket = TH_BUCKET_SHARE_BG;
3226 }
3227 return bucket;
3228 }
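
/*
 * A worked example of the mapping above, assuming the standard xnu base
 * priority values (BASEPRI_USER_INITIATED = 37, BASEPRI_DEFAULT = 31,
 * BASEPRI_UTILITY = 20, MAXPRI_THROTTLE = 4):
 *
 *	sched_convert_pri_to_bucket(47);	// > 37 -> TH_BUCKET_SHARE_FG
 *	sched_convert_pri_to_bucket(37);	// > 31 -> TH_BUCKET_SHARE_IN
 *	sched_convert_pri_to_bucket(31);	// > 20 -> TH_BUCKET_SHARE_DF
 *	sched_convert_pri_to_bucket(20);	// > 4  -> TH_BUCKET_SHARE_UT
 *	sched_convert_pri_to_bucket(4);		//      -> TH_BUCKET_SHARE_BG
 */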
3229
3230 /*
3231 * For threads that have changed sched_pri without changing the
3232 * base_pri for any reason other than decay, use the sched_pri
3233 * as the bucketizing priority instead of base_pri. All such
3234 * changes are typically due to kernel locking primitives boosts
3235 * or demotions.
3236 */
3237 static boolean_t
3238 sched_thread_sched_pri_promoted(thread_t thread)
3239 {
3240 return (thread->sched_flags & TH_SFLAG_PROMOTED) ||
3241 (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) ||
3242 (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) ||
3243 (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) ||
3244 (thread->kern_promotion_schedpri != 0);
3245 }
3246
3247 /*
3248 * Routine to update the scheduling bucket for the thread.
3249 *
3250 * In the clutch scheduler implementation, the thread's bucket
3251 * is based on sched_pri if it was promoted due to a kernel
3252 * primitive; otherwise it is based on the thread's base_pri. This
3253 * enhancement allows promoted threads to reach a higher priority
3254 * bucket and potentially get selected sooner for scheduling.
3255 *
3256 * Also, the clutch scheduler does not honor fixed priority below
3257 * FG priority. It simply puts those threads in the corresponding
3258 * timeshare bucket. The reason for doing that is that it is
3259 * extremely hard to define the scheduling properties of such threads
3260 * and they typically lead to performance issues.
3261 */
3262
3263 void
3264 sched_clutch_update_thread_bucket(thread_t thread)
3265 {
3266 sched_bucket_t old_bucket = thread->th_sched_bucket;
3267 sched_bucket_t new_bucket = TH_BUCKET_RUN;
3268 assert(thread->runq == PROCESSOR_NULL);
3269 int pri = (sched_thread_sched_pri_promoted(thread)) ? thread->sched_pri : thread->base_pri;
3270
3271 switch (thread->sched_mode) {
3272 case TH_MODE_FIXED:
3273 if (pri >= BASEPRI_FOREGROUND) {
3274 new_bucket = TH_BUCKET_FIXPRI;
3275 } else {
3276 new_bucket = sched_convert_pri_to_bucket(pri);
3277 }
3278 break;
3279
3280 case TH_MODE_REALTIME:
3281 new_bucket = TH_BUCKET_FIXPRI;
3282 break;
3283
3284 case TH_MODE_TIMESHARE:
3285 new_bucket = sched_convert_pri_to_bucket(pri);
3286 break;
3287
3288 default:
3289 panic("unexpected mode: %d", thread->sched_mode);
3290 break;
3291 }
3292
3293 if (old_bucket == new_bucket) {
3294 return;
3295 }
3296
3297 thread->th_sched_bucket = new_bucket;
3298 thread->pri_shift = sched_clutch_thread_pri_shift(thread, new_bucket);
3299 /*
3300 * Since this is called after the thread has been removed from the runq,
3301 * only the run counts need to be updated. The re-insert into the runq
3302 * would put the thread into the correct new bucket's runq.
3303 */
3304 if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
3305 sched_clutch_thread_run_bucket_decr(thread, old_bucket);
3306 sched_clutch_thread_run_bucket_incr(thread, new_bucket);
3307 }
3308 }
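
/*
 * For illustration, under the policy above a TH_MODE_FIXED thread with a
 * priority below BASEPRI_FOREGROUND (e.g. a fixed-priority thread at 40,
 * assuming BASEPRI_FOREGROUND = 47) is timeshared in TH_BUCKET_SHARE_FG,
 * whereas a fixed thread at or above 47 lands in TH_BUCKET_FIXPRI.
 */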
3309
3310 #if CONFIG_SCHED_EDGE
3311
3312 /* Implementation of the AMP version of the clutch scheduler */
3313
3314 static void
3315 sched_edge_init(void);
3316
3317 static void
3318 sched_edge_pset_init(processor_set_t pset);
3319
3320 static thread_t
3321 sched_edge_processor_idle(processor_set_t pset);
3322
3323 static ast_t
3324 sched_edge_processor_csw_check(processor_t processor);
3325
3326 static boolean_t
3327 sched_edge_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
3328
3329 static boolean_t
3330 sched_edge_processor_queue_empty(processor_t processor);
3331
3332 static thread_t
3333 sched_edge_choose_thread(processor_t processor, int priority, ast_t reason);
3334
3335 static void
3336 sched_edge_processor_queue_shutdown(processor_t processor);
3337
3338 static processor_t
3339 sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread);
3340
3341 static bool
3342 sched_edge_thread_avoid_processor(processor_t processor, thread_t thread);
3343
3344 static void
3345 sched_edge_balance(processor_t cprocessor, processor_set_t cpset);
3346
3347 static void
3348 sched_edge_check_spill(processor_set_t pset, thread_t thread);
3349
3350 static bool
3351 sched_edge_thread_should_yield(processor_t processor, thread_t thread);
3352
3353 static void
3354 sched_edge_pset_made_schedulable(processor_t processor, processor_set_t dst_pset, boolean_t drop_lock);
3355
3356 static void
3357 sched_edge_cpu_init_completed(void);
3358
3359 static bool
3360 sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset);
3361
3362 static bool
3363 sched_edge_steal_thread_enabled(processor_set_t pset);
3364
3365 static sched_ipi_type_t
3366 sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event);
3367
3368 static uint32_t
3369 sched_edge_qos_max_parallelism(int qos, uint64_t options);
3370
3371 static uint32_t
3372 sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket);
3373
3374 const struct sched_dispatch_table sched_edge_dispatch = {
3375 .sched_name = "edge",
3376 .init = sched_edge_init,
3377 .timebase_init = sched_timeshare_timebase_init,
3378 .processor_init = sched_clutch_processor_init,
3379 .pset_init = sched_edge_pset_init,
3380 .maintenance_continuation = sched_timeshare_maintenance_continue,
3381 .choose_thread = sched_edge_choose_thread,
3382 .steal_thread_enabled = sched_edge_steal_thread_enabled,
3383 .steal_thread = sched_edge_processor_idle,
3384 .compute_timeshare_priority = sched_compute_timeshare_priority,
3385 .choose_node = sched_choose_node,
3386 .choose_processor = sched_edge_choose_processor,
3387 .processor_enqueue = sched_clutch_processor_enqueue,
3388 .processor_queue_shutdown = sched_edge_processor_queue_shutdown,
3389 .processor_queue_remove = sched_clutch_processor_queue_remove,
3390 .processor_queue_empty = sched_edge_processor_queue_empty,
3391 .priority_is_urgent = priority_is_urgent,
3392 .processor_csw_check = sched_edge_processor_csw_check,
3393 .processor_queue_has_priority = sched_edge_processor_queue_has_priority,
3394 .initial_quantum_size = sched_clutch_initial_quantum_size,
3395 .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode,
3396 .can_update_priority = can_update_priority,
3397 .update_priority = update_priority,
3398 .lightweight_update_priority = lightweight_update_priority,
3399 .quantum_expire = sched_default_quantum_expire,
3400 .processor_runq_count = sched_clutch_runq_count,
3401 .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum,
3402 .processor_bound_count = sched_clutch_processor_bound_count,
3403 .thread_update_scan = sched_clutch_thread_update_scan,
3404 .multiple_psets_enabled = TRUE,
3405 .sched_groups_enabled = FALSE,
3406 .avoid_processor_enabled = TRUE,
3407 .thread_avoid_processor = sched_edge_thread_avoid_processor,
3408 .processor_balance = sched_edge_balance,
3409
3410 .rt_runq = sched_rtlocal_runq,
3411 .rt_init = sched_rtlocal_init,
3412 .rt_queue_shutdown = sched_rtlocal_queue_shutdown,
3413 .rt_runq_scan = sched_rtlocal_runq_scan,
3414 .rt_runq_count_sum = sched_rtlocal_runq_count_sum,
3415 .rt_steal_thread = sched_rtlocal_steal_thread,
3416
3417 .qos_max_parallelism = sched_edge_qos_max_parallelism,
3418 .check_spill = sched_edge_check_spill,
3419 .ipi_policy = sched_edge_ipi_policy,
3420 .thread_should_yield = sched_edge_thread_should_yield,
3421 .run_count_incr = sched_clutch_run_incr,
3422 .run_count_decr = sched_clutch_run_decr,
3423 .update_thread_bucket = sched_clutch_update_thread_bucket,
3424 .pset_made_schedulable = sched_edge_pset_made_schedulable,
3425 .thread_group_recommendation_change = NULL,
3426 .cpu_init_completed = sched_edge_cpu_init_completed,
3427 .thread_eligible_for_pset = sched_edge_thread_eligible_for_pset,
3428 };
3429
3430 static bitmap_t sched_edge_available_pset_bitmask[BITMAP_LEN(MAX_PSETS)];
3431
3432 /*
3433 * sched_edge_pset_available()
3434 *
3435 * Routine to determine if a pset is available for scheduling.
3436 */
3437 static bool
3438 sched_edge_pset_available(processor_set_t pset)
3439 {
3440 if (pset == NULL) {
3441 return false;
3442 }
3443 return pset_available_cpu_count(pset) > 0;
3444 }
3445
3446 /*
3447 * sched_edge_thread_bound_cluster_id()
3448 *
3449 * Routine to determine which cluster a particular thread is bound to. Returns
3450 * the bound cluster id stored on the thread (th_bound_cluster_id).
3451 *
3452 * <Edge Multi-cluster Support Needed>
3453 */
3454 static uint32_t
3455 sched_edge_thread_bound_cluster_id(thread_t thread)
3456 {
3457 assert(SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread));
3458 return thread->th_bound_cluster_id;
3459 }
3460
3461 /* Forward declaration for some thread migration routines */
3462 static boolean_t sched_edge_foreign_runnable_thread_available(processor_set_t pset);
3463 static boolean_t sched_edge_foreign_running_thread_available(processor_set_t pset);
3464 static processor_set_t sched_edge_steal_candidate(processor_set_t pset);
3465 static processor_set_t sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks);
3466
3467 /*
3468 * sched_edge_config_set()
3469 *
3470 * Support to update an edge configuration. Typically used by CLPC to affect thread migration
3471 * policies in the scheduler.
3472 */
3473 static void
3474 sched_edge_config_set(uint32_t src_cluster, uint32_t dst_cluster, sched_clutch_edge edge_config)
3475 {
3476 sched_clutch_edge *edge = &pset_array[src_cluster]->sched_edges[dst_cluster];
3477 edge->sce_edge_packed = edge_config.sce_edge_packed;
3478 }
3479
3480 /*
3481 * sched_edge_config_get()
3482 *
3483 * Support to get an edge configuration. Typically used by CLPC to query edge configs to decide
3484 * if it needs to update edges.
3485 */
3486 static sched_clutch_edge
3487 sched_edge_config_get(uint32_t src_cluster, uint32_t dst_cluster)
3488 {
3489 return pset_array[src_cluster]->sched_edges[dst_cluster];
3490 }
3491
3492 /*
3493 * sched_edge_matrix_set()
3494 *
3495 * Routine to update various edges in the cluster edge matrix. The edge_changes_bitmap
3496 * indicates which edges need to be updated. Both the edge_matrix & edge_changes_bitmap
3497 * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
3498 */
3499 void
3500 sched_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changes_bitmap, __unused uint64_t flags, uint64_t matrix_order)
3501 {
3502 uint32_t edge_index = 0;
3503 for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
3504 for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
3505 if (edge_changes_bitmap[edge_index]) {
3506 sched_edge_config_set(src_cluster, dst_cluster, edge_matrix[edge_index]);
3507 }
3508 edge_index++;
3509 }
3510 }
3511 }
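
/*
 * The edge matrix is flattened with the destination cluster varying fastest,
 * i.e. edge_index == (src_cluster * matrix_order) + dst_cluster. A minimal
 * sketch of a caller updating a single src -> dst edge (names hypothetical):
 *
 *	sched_clutch_edge edge_matrix[MAX_PSETS * MAX_PSETS] = {};
 *	bool edge_changes_bitmap[MAX_PSETS * MAX_PSETS] = {};
 *	uint64_t order = sched_edge_max_clusters;
 *	edge_matrix[src * order + dst] = new_edge_config;
 *	edge_changes_bitmap[src * order + dst] = true;
 *	sched_edge_matrix_set(edge_matrix, edge_changes_bitmap, 0, order);
 */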
3512
3513 /*
3514 * sched_edge_matrix_get()
3515 *
3516 * Routine to retrieve various edges in the cluster edge matrix. The edge_request_bitmap
3517 * indicates which edges need to be retrieved. Both the edge_matrix & edge_request_bitmap
3518 * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
3519 */
3520 void
3521 sched_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_request_bitmap, __unused uint64_t flags, uint64_t matrix_order)
3522 {
3523 uint32_t edge_index = 0;
3524 for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
3525 for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
3526 if (edge_request_bitmap[edge_index]) {
3527 edge_matrix[edge_index] = sched_edge_config_get(src_cluster, dst_cluster);
3528 }
3529 edge_index++;
3530 }
3531 }
3532 }
3533
3534 /*
3535 * sched_edge_init()
3536 *
3537 * Routine to initialize the data structures for the Edge scheduler.
3538 */
3539 static void
3540 sched_edge_init(void)
3541 {
3542 if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
3543 sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
3544 }
3545 sched_timeshare_init();
3546 sched_clutch_tunables_init();
3547 sched_edge_max_clusters = ml_get_cluster_count();
3548 }
3549
3550 static void
3551 sched_edge_pset_init(processor_set_t pset)
3552 {
3553 uint32_t pset_cluster_id = pset->pset_cluster_id;
3554 pset->pset_type = (pset->pset_cluster_type == PSET_AMP_P) ? CLUSTER_TYPE_P : CLUSTER_TYPE_E;
3555
3556 /* Set the edge weight and properties for the pset itself */
3557 bitmap_clear(pset->foreign_psets, pset_cluster_id);
3558 bitmap_clear(pset->native_psets, pset_cluster_id);
3559 bitmap_clear(pset->local_psets, pset_cluster_id);
3560 bitmap_clear(pset->remote_psets, pset_cluster_id);
3561 pset->sched_edges[pset_cluster_id].sce_edge_packed = (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0}.sce_edge_packed;
3562 sched_clutch_root_init(&pset->pset_clutch_root, pset);
3563 bitmap_set(sched_edge_available_pset_bitmask, pset_cluster_id);
3564 }
3565
3566 static thread_t
3567 sched_edge_choose_thread(
3568 processor_t processor,
3569 int priority,
3570 __unused ast_t reason)
3571 {
3572 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
3573 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3574 boolean_t choose_from_boundq = false;
3575
3576 if ((bound_runq->highq < priority) &&
3577 (clutch_pri < priority)) {
3578 return THREAD_NULL;
3579 }
3580
3581 if (bound_runq->highq >= clutch_pri) {
3582 choose_from_boundq = true;
3583 }
3584
3585 thread_t thread = THREAD_NULL;
3586 if (choose_from_boundq == false) {
3587 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3588 thread = sched_clutch_thread_highest_remove(pset_clutch_root);
3589 } else {
3590 thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
3591 }
3592 return thread;
3593 }
3594
3595 static boolean_t
3596 sched_edge_processor_queue_empty(processor_t processor)
3597 {
3598 return (sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0) &&
3599 (sched_clutch_bound_runq(processor)->count == 0);
3600 }
3601
3602 static void
3603 sched_edge_check_spill(__unused processor_set_t pset, __unused thread_t thread)
3604 {
3605 assert(thread->bound_processor == PROCESSOR_NULL);
3606 }
3607
3608 __options_decl(sched_edge_thread_yield_reason_t, uint32_t, {
3609 SCHED_EDGE_YIELD_RUNQ_NONEMPTY = 0x0,
3610 SCHED_EDGE_YIELD_FOREIGN_RUNNABLE = 0x1,
3611 SCHED_EDGE_YIELD_FOREIGN_RUNNING = 0x2,
3612 SCHED_EDGE_YIELD_STEAL_POSSIBLE = 0x3,
3613 SCHED_EDGE_YIELD_DISALLOW = 0x4,
3614 });
3615
3616 static bool
3617 sched_edge_thread_should_yield(processor_t processor, __unused thread_t thread)
3618 {
3619 if (!sched_edge_processor_queue_empty(processor) || (rt_runq_count(processor->processor_set) > 0)) {
3620 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3621 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_RUNQ_NONEMPTY);
3622 return true;
3623 }
3624
3625 /*
3626 * The yield logic should follow the same logic that steal_thread() does, since
3627 * thread_should_yield() is effectively a quick check of whether, if the
3628 * current thread gave up the CPU, any other thread would execute
3629 * on this CPU. So it needs to provide the same answer as the steal_thread()/
3630 * processor idle logic.
3631 */
3632 if (sched_edge_foreign_runnable_thread_available(processor->processor_set)) {
3633 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3634 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNABLE);
3635 return true;
3636 }
3637 if (sched_edge_foreign_running_thread_available(processor->processor_set)) {
3638 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3639 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNING);
3640 return true;
3641 }
3642
3643 processor_set_t steal_candidate = sched_edge_steal_candidate(processor->processor_set);
3644 if (steal_candidate != NULL) {
3645 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3646 thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_STEAL_POSSIBLE);
3647 return true;
3648 }
3649
3650 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE, thread_tid(thread), processor->processor_set->pset_cluster_id,
3651 0, SCHED_EDGE_YIELD_DISALLOW);
3652 return false;
3653 }
3654
3655 static ast_t
3656 sched_edge_processor_csw_check(processor_t processor)
3657 {
3658 boolean_t has_higher;
3659 int pri;
3660
3661 int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
3662 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3663
3664 assert(processor->active_thread != NULL);
3665
3666 pri = MAX(clutch_pri, bound_runq->highq);
3667
3668 if (processor->first_timeslice) {
3669 has_higher = (pri > processor->current_pri);
3670 } else {
3671 has_higher = (pri >= processor->current_pri);
3672 }
3673
3674 if (has_higher) {
3675 if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
3676 return AST_PREEMPT | AST_URGENT;
3677 }
3678
3679 if (bound_runq->urgency > 0) {
3680 return AST_PREEMPT | AST_URGENT;
3681 }
3682
3683 return AST_PREEMPT;
3684 }
3685
3686 return AST_NONE;
3687 }
3688
3689 static boolean_t
3690 sched_edge_processor_queue_has_priority(processor_t processor,
3691 int priority,
3692 boolean_t gte)
3693 {
3694 run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3695
3696 int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);
3697 if (gte) {
3698 return qpri >= priority;
3699 } else {
3700 return qpri > priority;
3701 }
3702 }
3703
3704 static void
3705 sched_edge_processor_queue_shutdown(processor_t processor)
3706 {
3707 processor_set_t pset = processor->processor_set;
3708 sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3709 thread_t thread;
3710 queue_head_t tqueue;
3711
3712 /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
3713 if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
3714 pset_unlock(pset);
3715 return;
3716 }
3717
3718 bitmap_clear(sched_edge_available_pset_bitmask, pset->pset_cluster_id);
3719
3720 queue_init(&tqueue);
3721 while (sched_clutch_root_count(pset_clutch_root) > 0) {
3722 thread = sched_clutch_thread_highest_remove(pset_clutch_root);
3723 enqueue_tail(&tqueue, &thread->runq_links);
3724 }
3725 pset_unlock(pset);
3726
3727 qe_foreach_element_safe(thread, &tqueue, runq_links) {
3728 remqueue(&thread->runq_links);
3729 thread_lock(thread);
3730 thread_setrun(thread, SCHED_TAILQ);
3731 thread_unlock(thread);
3732 }
3733 }
3734
3735 /*
3736 * sched_edge_cluster_load_metric()
3737 *
3738 * The load metric for a cluster is a measure of the average scheduling latency
3739 * experienced by threads on that cluster. It is a product of the average number
3740 * of threads in the runqueue and the average execution time for threads. The metric
3741 * has special values in the following cases:
3742 * - UINT32_MAX: If the cluster is not available for scheduling, its load is set to
3743 * the maximum value to prevent any threads from migrating to this cluster.
3744 * - 0: If there are idle CPUs in the cluster or an empty runqueue; this allows threads
3745 * to be spread across the platform quickly for ncpu wide workloads.
3746 */
3747 static uint32_t
3748 sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket)
3749 {
3750 if (sched_edge_pset_available(pset) == false) {
3751 return UINT32_MAX;
3752 }
3753 return (uint32_t)sched_get_pset_load_average(pset, sched_bucket);
3754 }
3755
3756 /*
3757 *
3758 * Edge Scheduler Steal/Rebalance logic
3759 *
3760 * = Generic scheduler logic =
3761 *
3762 * The SCHED(steal_thread) scheduler callout is invoked when the processor does not
3763 * find any thread for execution in its runqueue. The aim of the steal operation
3764 * is to find other threads running/runnable in other clusters which should be
3765 * executed here.
3766 *
3767 * If the steal callout does not return a thread, the thread_select() logic calls
3768 * SCHED(processor_balance) callout which is supposed to IPI other CPUs to rebalance
3769 * threads and idle out the current CPU.
3770 *
3771 * = SCHED(steal_thread) for Edge Scheduler =
3772 *
3773 * The edge scheduler hooks into sched_edge_processor_idle() for steal_thread. This
3774 * routine tries to do the following operations in order:
3775 * (1) Find foreign runnable threads in non-native cluster
3776 * runqueues (sched_edge_foreign_runnable_thread_remove())
3777 * (2) Check if foreign threads are running on the non-native
3778 * clusters (sched_edge_foreign_running_thread_available())
3779 * - If yes, return THREAD_NULL for the steal callout and
3780 * perform rebalancing as part of SCHED(processor_balance) i.e. sched_edge_balance()
3781 * (3) Steal a thread from another cluster based on edge
3782 * weights (sched_edge_steal_thread())
3783 *
3784 * = SCHED(processor_balance) for Edge Scheduler =
3785 *
3786 * If steal_thread did not return a thread for the processor, use
3787 * sched_edge_balance() to rebalance foreign running threads and idle out this CPU.
3788 *
3789 * = Clutch Bucket Preferred Cluster Overrides =
3790 *
3791 * Since these operations (just like thread migrations on enqueue)
3792 * move threads across clusters, they need support for handling clutch
3793 * bucket group level preferred cluster recommendations.
3794 * For (1), a clutch bucket will be in the foreign runnable queue based
3795 * on the clutch bucket group preferred cluster.
3796 * For (2), the running thread will set the bit on the processor based
3797 * on its preferred cluster type.
3798 * For (3), the edge configuration would prevent threads from being stolen
3799 * in the wrong direction.
3800 *
3801 * = SCHED(thread_should_yield) =
3802 * The thread_should_yield() logic needs to have the same logic as sched_edge_processor_idle()
3803 * since that is expecting the same answer as if thread_select() was called on a core
3804 * with an empty runqueue.
3805 */
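
/*
 * Summarizing the steal/rebalance order described above as a sketch of
 * sched_edge_processor_idle() (pseudocode, not the literal implementation):
 *
 *	thread = sched_edge_foreign_runnable_thread_remove(pset, ctime);	// (1)
 *	if (thread == THREAD_NULL && sched_edge_foreign_running_thread_available(pset))
 *		return THREAD_NULL;	// (2) rebalance later via sched_edge_balance()
 *	if (thread == THREAD_NULL)
 *		thread = steal from sched_edge_steal_candidate(pset);		// (3)
 */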
3806
3807 static bool
3808 sched_edge_steal_thread_enabled(__unused processor_set_t pset)
3809 {
3810 /*
3811 * For the edge scheduler, the gating for steal is done by sched_edge_steal_candidate()
3812 */
3813 return true;
3814 }
3815
3816 static processor_set_t
3817 sched_edge_steal_candidate(processor_set_t pset)
3818 {
3819 uint32_t dst_cluster_id = pset->pset_cluster_id;
3820 for (int cluster_id = 0; cluster_id < sched_edge_max_clusters; cluster_id++) {
3821 processor_set_t candidate_pset = pset_array[cluster_id];
3822 if (cluster_id == dst_cluster_id) {
3823 continue;
3824 }
3825 if (candidate_pset == NULL) {
3826 continue;
3827 }
3828 sched_clutch_edge *incoming_edge = &pset_array[cluster_id]->sched_edges[dst_cluster_id];
3829 if (incoming_edge->sce_steal_allowed && (bitmap_lsb_first(candidate_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) != -1)) {
3830 return candidate_pset;
3831 }
3832 }
3833 return NULL;
3834 }
3835
3836 static boolean_t
3837 sched_edge_foreign_runnable_thread_available(processor_set_t pset)
3838 {
3839 /* Find all the clusters that are foreign for this cluster */
3840 bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
3841 for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
3842 /*
3843 * For each cluster, see if there are any runnable foreign threads.
3844 * This check is currently being done without the pset lock to make it cheap for
3845 * the common case.
3846 */
3847 processor_set_t target_pset = pset_array[cluster];
3848 if (sched_edge_pset_available(target_pset) == false) {
3849 continue;
3850 }
3851
3852 if (!sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
3853 return true;
3854 }
3855 }
3856 return false;
3857 }
3858
3859 static thread_t
3860 sched_edge_foreign_runnable_thread_remove(processor_set_t pset, uint64_t ctime)
3861 {
3862 thread_t thread = THREAD_NULL;
3863
3864 /* Find all the clusters that are foreign for this cluster */
3865 bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
3866 for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
3867 /*
3868 * For each cluster, see if there are any runnable foreign threads.
3869 * This check is currently being done without the pset lock to make it cheap for
3870 * the common case.
3871 */
3872 processor_set_t target_pset = pset_array[cluster];
3873 if (sched_edge_pset_available(target_pset) == false) {
3874 continue;
3875 }
3876
3877 if (sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
3878 continue;
3879 }
3880 /*
3881 * Looks like there are runnable foreign threads in the hierarchy; lock the pset
3882 * and get the highest priority thread.
3883 */
3884 pset_lock(target_pset);
3885 if (sched_edge_pset_available(target_pset)) {
3886 thread = sched_clutch_root_highest_foreign_thread_remove(&target_pset->pset_clutch_root);
3887 sched_update_pset_load_average(target_pset, ctime);
3888 }
3889 pset_unlock(target_pset);
3890
3891 /*
3892 * Edge Scheduler Optimization
3893 *
3894 * The current implementation immediately returns as soon as it finds a foreign
3895 * runnable thread. This could be enhanced to look at highest priority threads
3896 * from all foreign clusters and pick the highest amongst them. That would need
3897 * some form of global state across psets to make that kind of a check cheap.
3898 */
3899 if (thread != THREAD_NULL) {
3900 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNABLE) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, target_pset->pset_cluster_id, 0);
3901 break;
3902 }
3903 /* Looks like the thread escaped after the check but before the pset lock was taken; continue the search */
3904 }
3905
3906 return thread;
3907 }
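
/*
 * The loop above relies on a check / lock / re-check pattern. A condensed
 * sketch (names match the routine above; illustrative only):
 *
 *	if (sched_clutch_root_foreign_empty(root))      // cheap, unlocked peek
 *		continue;
 *	pset_lock(target_pset);                         // now synchronize
 *	if (sched_edge_pset_available(target_pset))     // re-validate under the lock
 *		thread = sched_clutch_root_highest_foreign_thread_remove(root);
 *	pset_unlock(target_pset);
 *
 * The unlocked peek can race with the thread being dequeued elsewhere, which
 * is why a NULL result simply continues the search instead of failing.
 */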
3908
3909 /*
3910 * sched_edge_cpu_running_foreign_shared_rsrc_available()
3911 *
3912 * Routine to determine if the thread running on a CPU is a shared resource thread
3913 * and can be rebalanced to the cluster with an idle CPU. It is used to determine if
3914 * a CPU going idle on a pset should rebalance a running shared resource heavy thread
3915 * from another non-ideal cluster, based on the idle pset's shared resource load.
3916 */
3917 static boolean_t
3918 sched_edge_cpu_running_foreign_shared_rsrc_available(processor_set_t target_pset, int foreign_cpu, processor_set_t idle_pset)
3919 {
3920 boolean_t idle_pset_shared_rsrc_rr_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_RR);
3921 if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_RR], foreign_cpu) && !idle_pset_shared_rsrc_rr_idle) {
3922 return false;
3923 }
3924
3925 boolean_t idle_pset_shared_rsrc_biu_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST);
3926 if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST], foreign_cpu) && !idle_pset_shared_rsrc_biu_idle) {
3927 return false;
3928 }
3929 return true;
3930 }
3931
3932 static boolean_t
3933 sched_edge_foreign_running_thread_available(processor_set_t pset)
3934 {
3935 bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
3936 int cluster = -1;
3937 while ((cluster = sched_edge_iterate_clusters_ordered(pset, foreign_pset_bitmap[0], cluster)) != -1) {
3938 /* Skip the pset if it is not schedulable */
3939 processor_set_t target_pset = pset_array[cluster];
3940 if (sched_edge_pset_available(target_pset) == false) {
3941 continue;
3942 }
3943
3944 uint64_t running_foreign_bitmap = target_pset->cpu_state_map[PROCESSOR_RUNNING] & target_pset->cpu_running_foreign;
3945 for (int cpu_foreign = bit_first(running_foreign_bitmap); cpu_foreign >= 0; cpu_foreign = bit_next(running_foreign_bitmap, cpu_foreign)) {
3946 if (!sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpu_foreign, pset)) {
3947 continue;
3948 }
3949 return true;
3950 }
3951 }
3952 return false;
3953 }
3954
3955 static bool
3956 sched_edge_steal_possible(processor_set_t idle_pset, processor_set_t candidate_pset)
3957 {
3958 int highest_runnable_bucket = bitmap_lsb_first(candidate_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
3959 if (highest_runnable_bucket == -1) {
3960 /* Candidate cluster runq is empty */
3961 return false;
3962 }
3963
3964 if (idle_pset->pset_cluster_type == candidate_pset->pset_cluster_type) {
3965 /* Always allow stealing from homogeneous clusters */
3966 return true;
3967 } else {
3968 /* Use the load metrics for highest runnable bucket since that would be stolen next */
3969 int32_t candidate_load = sched_edge_cluster_load_metric(candidate_pset, (sched_bucket_t)highest_runnable_bucket);
3970 return candidate_load > 0;
3971 }
3972 }
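
/*
 * Illustrative example (assumed values): an idle P-cluster evaluating an
 * E-cluster candidate. If sched_edge_cluster_load_metric() for the
 * candidate's highest runnable unbound bucket is 0 (the E-cluster can
 * absorb its own work), the steal is skipped; a non-zero load metric means
 * the candidate is oversubscribed at that QoS level and the steal proceeds.
 * Two clusters of the same type skip the load check entirely and steal
 * whenever anything unbound is runnable.
 */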
3973
3974 static thread_t
3975 sched_edge_steal_thread(processor_set_t pset, uint64_t candidate_pset_bitmap)
3976 {
3977 thread_t thread = THREAD_NULL;
3978
3979 /*
3980 * Edge Scheduler Optimization
3981 *
3982 * The logic today bails as soon as it finds a cluster where the cluster load is
3983 * greater than the edge weight. Maybe it should have a more advanced version
3984 * which looks for the maximum delta etc.
3985 */
3986 int cluster_id = -1;
3987 while ((cluster_id = sched_edge_iterate_clusters_ordered(pset, candidate_pset_bitmap, cluster_id)) != -1) {
3988 processor_set_t steal_from_pset = pset_array[cluster_id];
3989 if (steal_from_pset == NULL) {
3990 continue;
3991 }
3992 sched_clutch_edge *incoming_edge = &pset_array[cluster_id]->sched_edges[pset->pset_cluster_id];
3993 if (incoming_edge->sce_steal_allowed == false) {
3994 continue;
3995 }
3996 pset_lock(steal_from_pset);
3997 if (sched_edge_steal_possible(pset, steal_from_pset)) {
3998 uint64_t current_timestamp = mach_absolute_time();
3999 sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(&steal_from_pset->pset_clutch_root, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY);
4000 thread = sched_clutch_thread_unbound_lookup(&steal_from_pset->pset_clutch_root, root_bucket);
4001 sched_clutch_thread_remove(&steal_from_pset->pset_clutch_root, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
4002 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STEAL) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, steal_from_pset->pset_cluster_id, 0);
4003 sched_update_pset_load_average(steal_from_pset, current_timestamp);
4004 }
4005 pset_unlock(steal_from_pset);
4006 if (thread != THREAD_NULL) {
4007 break;
4008 }
4009 }
4010 return thread;
4011 }
4012
4013 /*
4014 * sched_edge_processor_idle()
4015 *
4016 * The routine is the implementation for steal_thread() for the Edge scheduler.
4017 */
4018 static thread_t
4019 sched_edge_processor_idle(processor_set_t pset)
4020 {
4021 thread_t thread = THREAD_NULL;
4022
4023 uint64_t ctime = mach_absolute_time();
4024
4025 processor_t processor = current_processor();
4026 bit_clear(pset->pending_spill_cpu_mask, processor->cpu_id);
4027
4028 /* Each of the operations acquire the lock for the pset they target */
4029 pset_unlock(pset);
4030
4031 /* Find highest priority runnable thread on all non-native clusters */
4032 thread = sched_edge_foreign_runnable_thread_remove(pset, ctime);
4033 if (thread != THREAD_NULL) {
4034 return thread;
4035 }
4036
4037 /* Find highest priority runnable thread on all native clusters */
4038 thread = sched_edge_steal_thread(pset, pset->native_psets[0]);
4039 if (thread != THREAD_NULL) {
4040 return thread;
4041 }
4042
4043 /* Find foreign running threads to rebalance; the actual rebalance is done in sched_edge_balance() */
4044 boolean_t rebalance_needed = sched_edge_foreign_running_thread_available(pset);
4045 if (rebalance_needed) {
4046 return THREAD_NULL;
4047 }
4048
4049 /* No foreign threads found; find a thread to steal from all clusters based on weights/loads etc. */
4050 thread = sched_edge_steal_thread(pset, pset->native_psets[0] | pset->foreign_psets[0]);
4051 return thread;
4052 }
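
/*
 * The ordering above is intentional: mis-placed (foreign) runnable threads
 * are pulled back first, then native clusters are stolen from, then foreign
 * *running* threads are handled by returning THREAD_NULL so that
 * sched_edge_balance() can IPI the remote CPU, and only then is a steal
 * attempted across all clusters, subject to the per-edge steal policy.
 * SCHED(thread_should_yield) is expected to mirror this ordering, as noted
 * in the block comment at the top of this section.
 */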
4053
4054 /* Return true if this shared resource thread has a better cluster to run on */
4055 static bool
4056 sched_edge_shared_rsrc_migrate_possible(thread_t thread, processor_set_t preferred_pset, processor_set_t current_pset)
4057 {
4058 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4059 uint64_t current_pset_load = sched_pset_cluster_shared_rsrc_load(current_pset, shared_rsrc_type);
4060 /*
4061 * Adjust the current pset load to discount the current thread only if the current pset is of the preferred pset type. This allows the
4062 * scheduler to rebalance threads from non-preferred cluster to an idle cluster of the preferred type.
4063 *
4064 * Edge Scheduler Optimization
4065 * For multi-cluster machines, it might be useful to enhance this mechanism to migrate between clusters of the preferred type.
4066 */
4067 uint64_t current_pset_adjusted_load = (current_pset->pset_type != preferred_pset->pset_type) ? current_pset_load : (current_pset_load - 1);
4068
4069 uint64_t eligible_pset_bitmask = 0;
4070 if (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST) {
4071 /*
4072 * For the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy, the load balancing occurs
4073 * only among clusters native with the preferred cluster.
4074 */
4075 eligible_pset_bitmask = preferred_pset->native_psets[0];
4076 bit_set(eligible_pset_bitmask, preferred_pset->pset_cluster_id);
4077 } else {
4078 /* For EDGE_SHARED_RSRC_SCHED_POLICY_RR, the load balancing happens among all clusters */
4079 eligible_pset_bitmask = sched_edge_available_pset_bitmask[0];
4080 }
4081
4082 /* For each eligible cluster check if there is an under-utilized cluster; return true if there is */
4083 for (int cluster_id = bit_first(eligible_pset_bitmask); cluster_id >= 0; cluster_id = bit_next(eligible_pset_bitmask, cluster_id)) {
4084 if (cluster_id == current_pset->pset_cluster_id) {
4085 continue;
4086 }
4087 uint64_t cluster_load = sched_pset_cluster_shared_rsrc_load(pset_array[cluster_id], shared_rsrc_type);
4088 if (current_pset_adjusted_load > cluster_load) {
4089 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHARED_RSRC_MIGRATE) | DBG_FUNC_NONE, current_pset_load, current_pset->pset_cluster_id, cluster_load, cluster_id);
4090 return true;
4091 }
4092 }
4093 return false;
4094 }
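
/*
 * Worked example with assumed loads (illustrative only): a NATIVE_FIRST
 * shared resource thread runs on P-cluster 0 with a shared resource load
 * of 3, its preferred pset is also a P-cluster, and native peer P-cluster 1
 * has a load of 2. The adjusted load for cluster 0 is 3 - 1 = 2 (the thread
 * discounts itself), which is not greater than 2, so no migration is
 * suggested. If cluster 1 dropped to a load of 1, the adjusted load of 2
 * would exceed it and the routine would return true.
 */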
4095
4096 /* Return true if this thread should not continue running on this processor */
4097 static bool
4098 sched_edge_thread_avoid_processor(processor_t processor, thread_t thread)
4099 {
4100 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
4101 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
4102 return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
4103 }
4104 if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
4105 if (processor->processor_set->pset_type != preferred_pset->pset_type) {
4106 return true;
4107 }
4108 return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
4109 }
4110
4111 /*
4112 * For long running parallel workloads, it is important to rebalance threads across
4113 * E/P clusters so that they make equal forward progress. This is achieved through
4114 * threads expiring their quantum on the non-preferred cluster type and explicitly
4115 * rebalancing to the preferred cluster runqueue.
4116 */
4117 if (processor->processor_set->pset_type != preferred_pset->pset_type) {
4118 return true;
4119 }
4120 /* If the preferred pset for the thread is now idle, try and migrate thread to that cluster */
4121 if ((processor->processor_set != preferred_pset) &&
4122 (sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket) == 0)) {
4123 return true;
4124 }
4125 return false;
4126 }
4127
4128 static void
4129 sched_edge_balance(__unused processor_t cprocessor, processor_set_t cpset)
4130 {
4131 assert(cprocessor == current_processor());
4132 pset_unlock(cpset);
4133
4134 uint64_t ast_processor_map = 0;
4135 sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
4136
4137 bitmap_t *foreign_pset_bitmap = cpset->foreign_psets;
4138 for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
4139 /* Skip the pset if it is not schedulable */
4140 processor_set_t target_pset = pset_array[cluster];
4141 if (sched_edge_pset_available(target_pset) == false) {
4142 continue;
4143 }
4144
4145 pset_lock(target_pset);
4146 uint64_t cpu_running_foreign_map = (target_pset->cpu_running_foreign & target_pset->cpu_state_map[PROCESSOR_RUNNING]);
4147 for (int cpuid = lsb_first(cpu_running_foreign_map); cpuid >= 0; cpuid = lsb_next(cpu_running_foreign_map, cpuid)) {
4148 if (!sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpuid, cpset)) {
4149 continue;
4150 }
4151 processor_t target_cpu = processor_array[cpuid];
4152 ipi_type[target_cpu->cpu_id] = sched_ipi_action(target_cpu, NULL, SCHED_IPI_EVENT_REBALANCE);
4153 if (ipi_type[cpuid] != SCHED_IPI_NONE) {
4154 bit_set(ast_processor_map, cpuid);
4155 }
4156 }
4157 pset_unlock(target_pset);
4158 }
4159
4160 for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
4161 processor_t ast_processor = processor_array[cpuid];
4162 sched_ipi_perform(ast_processor, ipi_type[cpuid]);
4163 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNING) | DBG_FUNC_NONE, 0, cprocessor->cpu_id, cpuid, 0);
4164 }
4165 }
4166
4167 /*
4168 * sched_edge_migration_check()
4169 *
4170 * Routine to evaluate an edge between two clusters to decide if migration is possible
4171 * across that edge. Also updates the selected_pset and max_edge_delta out parameters
4172 * accordingly. The return value indicates if the invoking routine should short circuit
4173 * the search, since an ideal candidate has been found. The routine looks at the regular
4174 * edges and cluster loads or the shared resource loads based on the type of thread.
4175 */
4176 static bool
4177 sched_edge_migration_check(uint32_t cluster_id, processor_set_t preferred_pset,
4178 uint32_t preferred_cluster_load, thread_t thread, processor_set_t *selected_pset, uint32_t *max_edge_delta)
4179 {
4180 uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
4181 cluster_type_t preferred_cluster_type = pset_type_for_id(preferred_cluster_id);
4182 processor_set_t dst_pset = pset_array[cluster_id];
4183 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4184 bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
4185
4186 if (cluster_id == preferred_cluster_id) {
4187 return false;
4188 }
4189
4190 if (dst_pset == NULL) {
4191 return false;
4192 }
4193
4194 sched_clutch_edge *edge = preferred_pset->sched_edges;
4195 if (edge[cluster_id].sce_migration_allowed == false) {
4196 return false;
4197 }
4198 uint32_t dst_load = shared_rsrc_thread ? (uint32_t)sched_pset_cluster_shared_rsrc_load(dst_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(dst_pset, thread->th_sched_bucket);
4199 if (dst_load == 0) {
4200 /* The candidate cluster is idle; select it immediately for execution */
4201 *selected_pset = dst_pset;
4202 *max_edge_delta = preferred_cluster_load;
4203 return true;
4204 }
4205
4206 uint32_t edge_delta = 0;
4207 if (dst_load > preferred_cluster_load) {
4208 return false;
4209 }
4210 edge_delta = preferred_cluster_load - dst_load;
4211 if (!shared_rsrc_thread && (edge_delta < edge[cluster_id].sce_migration_weight)) {
4212 /*
4213 * For non shared resource threads, use the edge migration weight to decide if
4214 * this cluster is over-committed at the QoS level of this thread.
4215 */
4216 return false;
4217 }
4218
4219 if (edge_delta < *max_edge_delta) {
4220 return false;
4221 }
4222 if (edge_delta == *max_edge_delta) {
4223 /* If the edge delta is the same as the max delta, make sure a homogeneous cluster is picked */
4224 boolean_t selected_homogeneous = (pset_type_for_id((*selected_pset)->pset_cluster_id) == preferred_cluster_type);
4225 boolean_t candidate_homogeneous = (pset_type_for_id(dst_pset->pset_cluster_id) == preferred_cluster_type);
4226 if (selected_homogeneous || !candidate_homogeneous) {
4227 return false;
4228 }
4229 }
4230 /* dst_pset seems to be the best candidate for migration; however other candidates should still be evaluated */
4231 *max_edge_delta = edge_delta;
4232 *selected_pset = dst_pset;
4233 return false;
4234 }
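
/*
 * Worked example with assumed numbers (illustrative only): preferred
 * P-cluster load of 10 for this thread's bucket, candidate E-cluster load
 * of 4, and the default P->E migration weight of 64 from
 * sched_edge_cpu_init_completed(). The edge delta is 10 - 4 = 6, which is
 * below the weight, so a regular thread stays on the preferred cluster; it
 * would spill only once the delta reached 64. A candidate with a load of 0
 * short-circuits the search immediately, and shared resource threads skip
 * the weight check and compare raw loads only.
 */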
4235
4236
4237 static int
4238 sched_edge_iterate_clusters_ordered(processor_set_t starting_pset, uint64_t candidate_map, int previous_cluster)
4239 {
4240 int cluster_id = -1;
4241
4242 uint64_t local_candidate_map = starting_pset->local_psets[0] & candidate_map;
4243 uint64_t remote_candidate_map = starting_pset->remote_psets[0] & candidate_map;
4244
4245 if (previous_cluster == -1) {
4246 /* previous_cluster == -1 indicates the initial condition */
4247 cluster_id = bit_first(local_candidate_map);
4248 if (cluster_id != -1) {
4249 return cluster_id;
4250 }
4251 return bit_first(remote_candidate_map);
4252 } else {
4253 /*
4254 * After the initial condition, the routine attempts to return a
4255 * cluster in the previous_cluster's locality. If none is available,
4256 * it looks at remote clusters.
4257 */
4258 if (bit_test(local_candidate_map, previous_cluster)) {
4259 cluster_id = bit_next(local_candidate_map, previous_cluster);
4260 if (cluster_id != -1) {
4261 return cluster_id;
4262 } else {
4263 return bit_first(remote_candidate_map);
4264 }
4265 }
4266 return bit_next(remote_candidate_map, previous_cluster);
4267 }
4268 }
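
/*
 * Illustrative iteration order (assumed topology): with local_psets = {0, 1}
 * and remote_psets = {2, 3} for the starting pset, and a candidate_map
 * covering all four clusters, repeated calls return both local clusters (in
 * bitmap order) before either remote cluster, and finally -1. Locality-near
 * clusters are therefore always exhausted before remote ones are considered.
 */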
4269
4270 /*
4271 * sched_edge_migrate_edges_evaluate()
4272 *
4273 * Routine to find the candidate for thread migration based on edge weights.
4274 *
4275 * Returns the most ideal cluster for execution of this thread based on outgoing edges of the preferred pset. Can
4276 * return preferred_pset if it is the most ideal destination for this thread.
4277 */
4278 static processor_set_t
4279 sched_edge_migrate_edges_evaluate(processor_set_t preferred_pset, uint32_t preferred_cluster_load, thread_t thread)
4280 {
4281 processor_set_t selected_pset = preferred_pset;
4282 uint32_t max_edge_delta = 0;
4283 bool search_complete = false;
4284 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4285 bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
4286
4287 bitmap_t *foreign_pset_bitmap = preferred_pset->foreign_psets;
4288 bitmap_t *native_pset_bitmap = preferred_pset->native_psets;
4289 /* Always start the search with the native clusters */
4290 int cluster_id = -1;
4291 while ((cluster_id = sched_edge_iterate_clusters_ordered(preferred_pset, native_pset_bitmap[0], cluster_id)) != -1) {
4292 search_complete = sched_edge_migration_check(cluster_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
4293 if (search_complete) {
4294 break;
4295 }
4296 }
4297
4298 if (search_complete) {
4299 return selected_pset;
4300 }
4301
4302 if (shared_rsrc_thread && (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST)) {
4303 /*
4304 * If the shared resource scheduling policy is EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST, the scheduler tries
4305 * to fill up the preferred cluster and its homogeneous peers first.
4306 */
4307
4308 if (max_edge_delta > 0) {
4309 /*
4310 * A non-zero delta means there is a peer cluster of the same type as the preferred cluster (since the code
4311 * above only looks at the native_psets) that is running fewer threads of this shared resource type than
4312 * the preferred cluster. This indicates that there is capacity on a native cluster where this thread
4313 * should be placed.
4314 */
4315 return selected_pset;
4316 }
4317 /*
4318 * Indicates that all peer native clusters are at the same shared resource usage; check if the preferred cluster has
4319 * any more capacity left.
4320 */
4321 if (sched_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) < pset_available_cpu_count(preferred_pset)) {
4322 return preferred_pset;
4323 }
4324 /*
4325 * Looks like the preferred cluster and all its native peers are full with shared resource threads; need to start looking
4326 * at non-native clusters for capacity.
4327 */
4328 }
4329
4330 /* Now look at the non-native clusters */
4331 cluster_id = -1;
4332 while ((cluster_id = sched_edge_iterate_clusters_ordered(preferred_pset, foreign_pset_bitmap[0], cluster_id)) != -1) {
4333 search_complete = sched_edge_migration_check(cluster_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
4334 if (search_complete) {
4335 break;
4336 }
4337 }
4338 return selected_pset;
4339 }
4340
4341 /*
4342 * sched_edge_candidate_alternative()
4343 *
4344 * Routine to find an alternative cluster from candidate_cluster_bitmap since the
4345 * selected_pset is not available for execution. The logic tries to prefer homogeneous
4346 * clusters over heterogeneous clusters since this is typically used in thread
4347 * placement decisions.
4348 */
4349 _Static_assert(MAX_PSETS <= 64, "Unable to fit maximum number of psets in uint64_t bitmask");
4350 static processor_set_t
4351 sched_edge_candidate_alternative(processor_set_t selected_pset, uint64_t candidate_cluster_bitmap)
4352 {
4353 /*
4354 * It looks like the most ideal pset is not available for scheduling currently.
4355 * Try to find a homogeneous cluster that is still available.
4356 */
4357 uint64_t available_native_clusters = selected_pset->native_psets[0] & candidate_cluster_bitmap;
4358 int available_cluster_id = lsb_first(available_native_clusters);
4359 if (available_cluster_id == -1) {
4360 /* Looks like none of the homogeneous clusters are available; pick the first available cluster */
4361 available_cluster_id = bit_first(candidate_cluster_bitmap);
4362 }
4363 assert(available_cluster_id != -1);
4364 return pset_array[available_cluster_id];
4365 }
4366
4367 /*
4368 * sched_edge_switch_pset_lock()
4369 *
4370 * Helper routine for sched_edge_migrate_candidate() which switches pset locks (if needed) based on
4371 * switch_pset_locks.
4372 * Returns the newly locked pset after the switch.
4373 */
4374 static processor_set_t
4375 sched_edge_switch_pset_lock(processor_set_t selected_pset, processor_set_t locked_pset, bool switch_pset_locks)
4376 {
4377 if (!switch_pset_locks) {
4378 return locked_pset;
4379 }
4380 if (selected_pset != locked_pset) {
4381 pset_unlock(locked_pset);
4382 pset_lock(selected_pset);
4383 return selected_pset;
4384 } else {
4385 return locked_pset;
4386 }
4387 }
4388
4389 /*
4390 * sched_edge_amp_rebalance_pset()
4391 *
4392 * Routine to decide where a thread which is eligible for AMP rebalance (i.e.
4393 * has executed on non-preferred cluster type for a while) should be enqueued.
4394 * The algorithm maintains a history of AMP rebalance decisions on the clutch
4395 * bucket group of the workload and round-robins between clusters to ensure
4396 * that all threads get a chance on the performance cores and make equal
4397 * progress.
4398 */
4399 static processor_set_t
4400 sched_edge_amp_rebalance_pset(processor_set_t preferred_pset, thread_t thread)
4401 {
4402 sched_clutch_t clutch = sched_clutch_for_thread(thread);
4403 sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[thread->th_sched_bucket];
4404
4405 uint32_t last_chosen_cluster, new_chosen_cluster;
4406
4407 /* Only AMP rebalance within clusters native to the preferred cluster */
4408 uint64_t eligible_pset_bitmask = preferred_pset->native_psets[0];
4409 /* Preferred cluster is also eligible for rebalancing */
4410 bit_set(eligible_pset_bitmask, preferred_pset->pset_cluster_id);
4411 /* Atomically update the AMP rebalance cluster for the clutch bucket group */
4412 os_atomic_rmw_loop(&clutch_bucket_group->scbg_amp_rebalance_last_chosen, last_chosen_cluster, new_chosen_cluster, relaxed, {
4413 if (last_chosen_cluster == UINT32_MAX) {
4414 new_chosen_cluster = preferred_pset->pset_cluster_id;
4415 } else {
4416 new_chosen_cluster = lsb_next(eligible_pset_bitmask, last_chosen_cluster);
4417 if (new_chosen_cluster == -1) {
4418 /* Rotate to the start of the eligible bitmask */
4419 new_chosen_cluster = lsb_first(eligible_pset_bitmask);
4420 }
4421 }
4422 });
4423 return pset_array[new_chosen_cluster];
4424 }
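
/*
 * Illustrative example (assumed values): with eligible_pset_bitmask = {1, 2}
 * and scbg_amp_rebalance_last_chosen initialized to UINT32_MAX, successive
 * rebalances of threads in this clutch bucket group pick cluster 1 (the
 * preferred cluster), then 2, then wrap back to 1. Sibling threads of the
 * workload therefore take turns on each native cluster instead of piling
 * onto a single one.
 */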
4425
4426 /*
4427 * sched_edge_migrate_candidate()
4428 *
4429 * Routine to find an appropriate cluster for scheduling a thread. The routine looks at the properties of
4430 * the thread and the preferred cluster to determine the best available pset for scheduling.
4431 *
4432 * The switch_pset_locks parameter defines whether the routine should switch pset locks to provide an
4433 * accurate scheduling decision. This mode is typically used when choosing a pset for scheduling a thread since the
4434 * decision has to be synchronized with another CPU changing the recommendation of clusters available
4435 * on the system. If this parameter is set to false, this routine returns the best effort indication of
4436 * the cluster the thread should be scheduled on. It is typically used in fast path contexts (such as
4437 * SCHED(thread_avoid_processor)) to determine if there is a possibility of scheduling this thread on a
4438 * more appropriate cluster.
4439 *
4440 * Routine returns the most ideal cluster for scheduling. If switch_pset_locks is set, it ensures that the
4441 * resultant pset lock is held.
4442 */
4443 static processor_set_t
4444 sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks)
4445 {
4446 __kdebug_only uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
4447 processor_set_t selected_pset = preferred_pset;
4448 cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4449 bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
4450
4451 if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
4452 /* For bound threads always recommend the cluster it is bound to */
4453 selected_pset = pset_array[sched_edge_thread_bound_cluster_id(thread)];
4454 locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
4455 if (sched_edge_pset_available(selected_pset) || (SCHED_CLUTCH_THREAD_CLUSTER_BOUND_SOFT(thread) == false)) {
4456 /*
4457 * If the bound cluster is not available, check if the thread is soft bound. For soft bound threads,
4458 * fall through to the regular cluster selection logic which handles unavailable clusters
4459 * appropriately. If the thread is hard bound, then return the bound cluster always.
4460 */
4461 return selected_pset;
4462 }
4463 }
4464
4465 uint64_t candidate_cluster_bitmap = mask(sched_edge_max_clusters);
4466 #if DEVELOPMENT || DEBUG
4467 extern int enable_task_set_cluster_type;
4468 task_t task = get_threadtask(thread);
4469 if (enable_task_set_cluster_type && (task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) {
4470 processor_set_t pset_hint = task->pset_hint;
4471 if (pset_hint && selected_pset->pset_cluster_type != pset_hint->pset_cluster_type) {
4472 selected_pset = pset_hint;
4473 goto migrate_candidate_available_check;
4474 }
4475 }
4476 #endif
4477
4478 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
4479 /* For realtime threads, try and schedule them on the preferred pset always */
4480 goto migrate_candidate_available_check;
4481 }
4482
4483 uint32_t preferred_cluster_load = shared_rsrc_thread ? (uint32_t)sched_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket);
4484 if (preferred_cluster_load == 0) {
4485 goto migrate_candidate_available_check;
4486 }
4487
4488 /*
4489 * If a thread is being rebalanced for achieving equal progress of parallel workloads,
4490 * it needs to end up on the preferred runqueue. This mechanism should only be used for
4491 * threads which have been previously migrated to the non-preferred cluster type.
4492 *
4493 * The AMP rebalancing mechanism is available for regular threads or shared resource
4494 * threads with the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy.
4495 */
4496 bool amp_rebalance_eligible = (!shared_rsrc_thread) || (shared_rsrc_thread && (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST));
4497 if (amp_rebalance_eligible) {
4498 boolean_t amp_rebalance = (thread->reason & (AST_REBALANCE | AST_QUANTUM)) == (AST_REBALANCE | AST_QUANTUM);
4499 if (amp_rebalance) {
4500 boolean_t non_preferred_pset = (thread->last_processor->processor_set->pset_type != preferred_pset->pset_type);
4501 if (non_preferred_pset) {
4502 selected_pset = sched_edge_amp_rebalance_pset(preferred_pset, thread);
4503 goto migrate_candidate_available_check;
4504 }
4505 }
4506 }
4507
4508 /* Look at edge weights to decide the most ideal migration candidate for this thread */
4509 selected_pset = sched_edge_migrate_edges_evaluate(preferred_pset, preferred_cluster_load, thread);
4510
4511 migrate_candidate_available_check:
4512 locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
4513 if (sched_edge_pset_available(selected_pset) == true) {
4514 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_OVERLOAD) | DBG_FUNC_NONE, thread_tid(thread), preferred_cluster_id, selected_pset->pset_cluster_id, preferred_cluster_load);
4515 return selected_pset;
4516 }
4517 /* Looks like selected_pset is not available for scheduling; remove it from candidate_cluster_bitmap */
4518 bitmap_clear(&candidate_cluster_bitmap, selected_pset->pset_cluster_id);
4519 if (__improbable(bitmap_first(&candidate_cluster_bitmap, sched_edge_max_clusters) == -1)) {
4520 pset_unlock(locked_pset);
4521 return NULL;
4522 }
4523 /* Try and find an alternative for the selected pset */
4524 selected_pset = sched_edge_candidate_alternative(selected_pset, candidate_cluster_bitmap);
4525 goto migrate_candidate_available_check;
4526 }
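
/*
 * Availability retry, summarized: if the pset chosen by any of the paths
 * above turns out to be unschedulable once its lock is taken, it is cleared
 * from candidate_cluster_bitmap, sched_edge_candidate_alternative() picks a
 * (preferably homogeneous) fallback, and the availability check repeats
 * until either an available pset is found or every cluster has been ruled
 * out, in which case NULL is returned with no pset lock held.
 */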
4527
4528 static processor_t
4529 sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread)
4530 {
4531 /* Bound threads don't call this function */
4532 assert(thread->bound_processor == PROCESSOR_NULL);
4533 processor_t chosen_processor = PROCESSOR_NULL;
4534
4535 /*
4536 * sched_edge_preferred_pset() returns the preferred pset for a given thread.
4537 * It should take the passed in "pset" as a hint which represents the recency metric for
4538 * pset selection logic.
4539 */
4540 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
4541 processor_set_t chosen_pset = preferred_pset;
4542 /*
4543 * If the preferred pset is overloaded, find a pset which is the best candidate to migrate
4544 * threads to. sched_edge_migrate_candidate() returns the preferred pset
4545 * if it has capacity; otherwise finds the best candidate pset to migrate this thread to.
4546 *
4547 * Edge Scheduler Optimization
4548 * It might be useful to build a recency metric for the thread for multiple clusters and
4549 * factor that into the migration decisions.
4550 */
4551 chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, pset, true);
4552 if (chosen_pset) {
4553 chosen_processor = choose_processor(chosen_pset, processor, thread);
4554 }
4555 /* For RT threads, choose_processor() can return a different cluster than the one passed into it */
4556 assert(chosen_processor ? chosen_processor->processor_set->pset_type == chosen_pset->pset_type : true);
4557 return chosen_processor;
4558 }
4559
4560 /*
4561 * sched_edge_clutch_bucket_threads_drain()
4562 *
4563 * Drains all the runnable threads which are not restricted to the root_clutch (due to clutch
4564 * bucket overrides etc.) into a local thread queue.
4565 */
4566 static void
4567 sched_edge_clutch_bucket_threads_drain(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch, queue_t clutch_threads)
4568 {
4569 thread_t thread = THREAD_NULL;
4570 uint64_t current_timestamp = mach_approximate_time();
4571 qe_foreach_element_safe(thread, &clutch_bucket->scb_thread_timeshare_queue, th_clutch_timeshare_link) {
4572 sched_clutch_thread_remove(root_clutch, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
4573 enqueue_tail(clutch_threads, &thread->runq_links);
4574 }
4575 }
4576
4577 /*
4578 * sched_edge_run_drained_threads()
4579 *
4580 * Makes all drained threads in a local queue runnable.
4581 */
4582 static void
4583 sched_edge_run_drained_threads(queue_t clutch_threads)
4584 {
4585 thread_t thread;
4586 /* Now setrun all the threads in the local queue */
4587 qe_foreach_element_safe(thread, clutch_threads, runq_links) {
4588 remqueue(&thread->runq_links);
4589 thread_lock(thread);
4590 thread_setrun(thread, SCHED_TAILQ);
4591 thread_unlock(thread);
4592 }
4593 }
4594
4595 /*
4596 * sched_edge_update_preferred_cluster()
4597 *
4598 * Routine to update the preferred cluster for QoS buckets within a thread group.
4599 * The buckets to be updated are specified as a bitmap (clutch_bucket_modify_bitmap).
4600 */
4601 static void
4602 sched_edge_update_preferred_cluster(
4603 sched_clutch_t sched_clutch,
4604 bitmap_t *clutch_bucket_modify_bitmap,
4605 uint32_t *tg_bucket_preferred_cluster)
4606 {
4607 for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
4608 os_atomic_store(&sched_clutch->sc_clutch_groups[bucket].scbg_preferred_cluster, tg_bucket_preferred_cluster[bucket], relaxed);
4609 }
4610 }
4611
4612 /*
4613 * sched_edge_migrate_thread_group_runnable_threads()
4614 *
4615 * Routine to implement the migration of threads on a cluster when the thread group
4616 * recommendation is updated. The migration works using a 2-phase
4617 * algorithm.
4618 *
4619 * Phase 1: With the pset lock held, check the recommendation of the clutch buckets.
4620 * For each clutch bucket, if it needs to be migrated immediately, drain the threads
4621 * into a local thread queue. Otherwise mark the clutch bucket as native/foreign as
4622 * appropriate.
4623 *
4624 * Phase 2: After unlocking the pset, drain all the threads from the local thread
4625 * queue and mark them runnable which should land them in the right hierarchy.
4626 *
4627 * The routine assumes that the preferences for the clutch buckets/clutch bucket
4628 * groups have already been updated by the caller.
4629 *
4630 * - Called with the pset locked and interrupts disabled.
4631 * - Returns with the pset unlocked.
4632 */
4633 static void
4634 sched_edge_migrate_thread_group_runnable_threads(
4635 sched_clutch_t sched_clutch,
4636 sched_clutch_root_t root_clutch,
4637 bitmap_t *clutch_bucket_modify_bitmap,
4638 __unused uint32_t *tg_bucket_preferred_cluster,
4639 bool migrate_immediately)
4640 {
4641 /* Queue to hold threads that have been drained from clutch buckets to be migrated */
4642 queue_head_t clutch_threads;
4643 queue_init(&clutch_threads);
4644
4645 for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
4646 /* Get the clutch bucket for this cluster and sched bucket */
4647 sched_clutch_bucket_group_t clutch_bucket_group = &(sched_clutch->sc_clutch_groups[bucket]);
4648 sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
4649 sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
4650 if (scb_root == NULL) {
4651 /* Clutch bucket not runnable or already in the right hierarchy; nothing to do here */
4652 assert(clutch_bucket->scb_thr_count == 0);
4653 continue;
4654 }
4655 assert(scb_root == root_clutch);
4656 uint32_t clutch_bucket_preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
4657
4658 if (migrate_immediately) {
4659 /*
4660 * For transitions where threads need to be migrated immediately, drain the threads into a
4661 * local queue unless we are looking at the clutch buckets for the newly recommended
4662 * cluster.
4663 */
4664 if (root_clutch->scr_cluster_id != clutch_bucket_preferred_cluster) {
4665 sched_edge_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads);
4666 } else {
4667 sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
4668 }
4669 } else {
4670 /* Check if this cluster is the same type as the newly recommended cluster */
4671 boolean_t homogeneous_cluster = (pset_type_for_id(root_clutch->scr_cluster_id) == pset_type_for_id(clutch_bucket_preferred_cluster));
4672 /*
4673 * If threads do not have to be migrated immediately, just change the native/foreign
4674 * flag on the clutch bucket.
4675 */
4676 if (homogeneous_cluster) {
4677 sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
4678 } else {
4679 sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
4680 }
4681 }
4682 }
4683
4684 pset_unlock(root_clutch->scr_pset);
4685 sched_edge_run_drained_threads(&clutch_threads);
4686 }
4687
4688 /*
4689 * sched_edge_migrate_thread_group_running_threads()
4690 *
4691 * Routine to find all running threads of a thread group on a specific cluster
4692 * and IPI them if they need to be moved immediately.
4693 */
4694 static void
4695 sched_edge_migrate_thread_group_running_threads(
4696 sched_clutch_t sched_clutch,
4697 sched_clutch_root_t root_clutch,
4698 __unused bitmap_t *clutch_bucket_modify_bitmap,
4699 uint32_t *tg_bucket_preferred_cluster,
4700 bool migrate_immediately)
4701 {
4702 if (migrate_immediately == false) {
4703 /* If CLPC has recommended not to move threads immediately, nothing to do here */
4704 return;
4705 }
4706
4707 /*
4708 * Edge Scheduler Optimization
4709 *
4710 * When the system has a large number of clusters and cores, it might be useful to
4711 * narrow down the iteration by using a thread running bitmap per clutch.
4712 */
4713 uint64_t ast_processor_map = 0;
4714 sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
4715
4716 uint64_t running_map = root_clutch->scr_pset->cpu_state_map[PROCESSOR_RUNNING];
4717 /*
4718 * Iterate all CPUs and look for the ones running threads from this thread group and are
4719 * not restricted to the specific cluster (due to overrides etc.)
4720 */
4721 for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
4722 processor_t src_processor = processor_array[cpuid];
4723 boolean_t expected_tg = (src_processor->current_thread_group == sched_clutch->sc_tg);
4724 sched_bucket_t processor_sched_bucket = src_processor->processor_set->cpu_running_buckets[cpuid];
4725 if (processor_sched_bucket == TH_BUCKET_SCHED_MAX) {
4726 continue;
4727 }
4728 boolean_t non_preferred_cluster = tg_bucket_preferred_cluster[processor_sched_bucket] != root_clutch->scr_cluster_id;
4729
4730 if (expected_tg && non_preferred_cluster) {
4731 ipi_type[cpuid] = sched_ipi_action(src_processor, NULL, SCHED_IPI_EVENT_REBALANCE);
4732 if (ipi_type[cpuid] != SCHED_IPI_NONE) {
4733 bit_set(ast_processor_map, cpuid);
4734 } else if (src_processor == current_processor()) {
4735 ast_on(AST_PREEMPT);
4736 bit_set(root_clutch->scr_pset->pending_AST_PREEMPT_cpu_mask, cpuid);
4737 }
4738 }
4739 }
4740
4741 /* Perform all the IPIs */
4742 if (bit_first(ast_processor_map) != -1) {
4743 for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
4744 processor_t ast_processor = processor_array[cpuid];
4745 sched_ipi_perform(ast_processor, ipi_type[cpuid]);
4746 }
4747 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, thread_group_get_id(sched_clutch->sc_tg), ast_processor_map, 0, 0);
4748 }
4749 }
4750
4751 /*
4752 * sched_edge_tg_preferred_cluster_change()
4753 *
4754 * Routine to handle changes to a thread group's recommendation. In the Edge Scheduler, the preferred cluster
4755 * is specified on a per-QoS basis within a thread group. The routine updates the preferences and performs
4756 * thread migrations based on the policy specified by CLPC.
4757 * tg_bucket_preferred_cluster is an array of size TH_BUCKET_SCHED_MAX which specifies the new preferred cluster
4758 * for each QoS within the thread group.
4759 */
4760 void
4761 sched_edge_tg_preferred_cluster_change(struct thread_group *tg, uint32_t *tg_bucket_preferred_cluster, sched_perfcontrol_preferred_cluster_options_t options)
4762 {
4763 sched_clutch_t clutch = sched_clutch_for_thread_group(tg);
4764 /*
4765 * In order to optimize the processing, create a bitmap which represents all QoS buckets
4766 * for which the preferred cluster has changed.
4767 */
4768 bitmap_t clutch_bucket_modify_bitmap[BITMAP_LEN(TH_BUCKET_SCHED_MAX)] = {0};
4769 for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
4770 uint32_t old_preferred_cluster = sched_edge_clutch_bucket_group_preferred_cluster(&clutch->sc_clutch_groups[bucket]);
4771 uint32_t new_preferred_cluster = tg_bucket_preferred_cluster[bucket];
4772 if (old_preferred_cluster != new_preferred_cluster) {
4773 bitmap_set(clutch_bucket_modify_bitmap, bucket);
4774 }
4775 }
4776 if (bitmap_lsb_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
4777 /* No changes in any clutch buckets; nothing to do here */
4778 return;
4779 }
4780
4781 for (uint32_t cluster_id = 0; cluster_id < sched_edge_max_clusters; cluster_id++) {
4782 processor_set_t pset = pset_array[cluster_id];
4783 spl_t s = splsched();
4784 pset_lock(pset);
4785 /*
4786 * The first operation is to update the preferred cluster for all QoS buckets within the
4787 * thread group so that any future threads becoming runnable would see the new preferred
4788 * cluster value.
4789 */
4790 sched_edge_update_preferred_cluster(clutch, clutch_bucket_modify_bitmap, tg_bucket_preferred_cluster);
4791 /*
4792 * Currently iterates all clusters looking for running threads for a TG to be migrated. Can be optimized
4793 * by keeping a per-clutch bitmap of clusters running threads for a particular TG.
4794 *
4795 * Edge Scheduler Optimization
4796 */
4797 /* Migrate all running threads of the TG on this cluster based on options specified by CLPC */
4798 sched_edge_migrate_thread_group_running_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
4799 tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNING));
4800 /* Migrate all runnable threads of the TG in this cluster's hierarchy based on options specified by CLPC */
4801 sched_edge_migrate_thread_group_runnable_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
4802 tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNABLE));
4803 /* sched_edge_migrate_thread_group_runnable_threads() returns with pset unlocked */
4804 splx(s);
4805 }
4806 }
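
#if 0 /* Illustrative sketch only; not compiled and not part of this file */
/*
 * A hypothetical perfcontrol-side caller asking for every QoS bucket of a
 * thread group to prefer cluster 2 (an assumed cluster id), migrating both
 * running and runnable threads immediately. The loop mirrors the bucket
 * iteration used by sched_edge_tg_preferred_cluster_change() above.
 */
static void
example_prefer_cluster_two(struct thread_group *tg)
{
	uint32_t preferred[TH_BUCKET_SCHED_MAX];
	for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
		preferred[bucket] = 2;
	}
	sched_edge_tg_preferred_cluster_change(tg, preferred,
	    SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNING |
	    SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNABLE);
}
#endif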
4807
4808 /*
4809 * sched_edge_pset_made_schedulable()
4810 *
4811 * Routine to migrate all the clutch buckets which are not in their recommended
4812 * pset hierarchy now that a new pset has become runnable. It is possible that this
4813 * routine is called when the pset is already marked schedulable.
4814 *
4815 * Invoked with the pset lock held and interrupts disabled.
4816 */
4817 static void
4818 sched_edge_pset_made_schedulable(__unused processor_t processor, processor_set_t dst_pset, boolean_t drop_lock)
4819 {
4820 if (bitmap_test(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id)) {
4821 /* Nothing to do here since pset is already marked schedulable */
4822 if (drop_lock) {
4823 pset_unlock(dst_pset);
4824 }
4825 return;
4826 }
4827
4828 bitmap_set(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id);
4829
4830 thread_t thread = sched_edge_processor_idle(dst_pset);
4831 if (thread != THREAD_NULL) {
4832 thread_lock(thread);
4833 thread_setrun(thread, SCHED_TAILQ);
4834 thread_unlock(thread);
4835 }
4836
4837 if (!drop_lock) {
4838 pset_lock(dst_pset);
4839 }
4840 }
4841
4842 /*
4843 * sched_edge_cpu_init_completed()
4844 *
4845 * Callback routine from the platform layer once all CPUs/clusters have been initialized. This
4846 * provides an opportunity for the edge scheduler to initialize all the edge parameters.
4847 */
4848 static void
4849 sched_edge_cpu_init_completed(void)
4850 {
4851 spl_t s = splsched();
4852 for (int src_cluster_id = 0; src_cluster_id < sched_edge_max_clusters; src_cluster_id++) {
4853 processor_set_t src_pset = pset_array[src_cluster_id];
4854 pset_lock(src_pset);
4855
4856 /* For each cluster, set all its outgoing edge parameters */
4857 for (int dst_cluster_id = 0; dst_cluster_id < sched_edge_max_clusters; dst_cluster_id++) {
4858 if (dst_cluster_id == src_cluster_id) {
4859 continue;
4860 }
4861 processor_set_t dst_pset = pset_array[dst_cluster_id];
4862 if (src_pset->pset_type == dst_pset->pset_type) {
4863 /* P->P/E->E edge config */
4864 bitmap_clear(src_pset->foreign_psets, dst_cluster_id);
4865 bitmap_set(src_pset->native_psets, dst_cluster_id);
4866 sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 1, .sce_steal_allowed = 1});
4867 } else if ((src_pset->pset_type == CLUSTER_TYPE_P) && (dst_pset->pset_type == CLUSTER_TYPE_E)) {
4868 /* P->E edge config */
4869 bitmap_set(src_pset->foreign_psets, dst_cluster_id);
4870 bitmap_clear(src_pset->native_psets, dst_cluster_id);
4871 sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 64, .sce_migration_allowed = 1, .sce_steal_allowed = 1});
4872 } else {
4873 /* E->P edge config */
4874 bitmap_set(src_pset->foreign_psets, dst_cluster_id);
4875 bitmap_clear(src_pset->native_psets, dst_cluster_id);
4876 sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0});
4877 }
4878 bool clusters_local = true;
4879 if (clusters_local) {
4880 bitmap_set(src_pset->local_psets, dst_cluster_id);
4881 bitmap_clear(src_pset->remote_psets, dst_cluster_id);
4882 } else {
4883 bitmap_set(src_pset->remote_psets, dst_cluster_id);
4884 bitmap_clear(src_pset->local_psets, dst_cluster_id);
4885 }
4886 }
4887
4888 pset_unlock(src_pset);
4889 }
4890 splx(s);
4891 }
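
/*
 * Resulting default edge matrix for an assumed two-cluster (one E, one P)
 * system, summarizing the three cases above:
 *
 *	src -> dst	migration_weight	migration_allowed	steal_allowed
 *	P   -> E	64			1			1
 *	E   -> P	0			0			0
 *	same type	0			1			1
 *
 * i.e. threads preferring the P-cluster may spill to the E-cluster once the
 * load delta reaches the migration weight (and an idle E-cluster may steal
 * from the P-cluster), while threads preferring the E-cluster are never
 * migrated to the P-cluster through the edge mechanism.
 */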
4892
4893 static bool
4894 sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset)
4895 {
4896 uint32_t preferred_cluster_id = sched_edge_thread_preferred_cluster(thread);
4897 if (preferred_cluster_id == pset->pset_cluster_id) {
4898 return true;
4899 } else {
4900 processor_set_t preferred_pset = pset_array[preferred_cluster_id];
4901 return preferred_pset->sched_edges[pset->pset_cluster_id].sce_migration_allowed;
4902 }
4903 }
4904
4905 extern int sched_amp_spill_deferred_ipi;
4906 extern int sched_amp_pcores_preempt_immediate_ipi;
4907
4908 int sched_edge_migrate_ipi_immediate = 1;
4909
4910 sched_ipi_type_t
4911 sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
4912 {
4913 processor_set_t pset = dst->processor_set;
4914 assert(dst != current_processor());
4915
4916 boolean_t deferred_ipi_supported = false;
4917 #if defined(CONFIG_SCHED_DEFERRED_AST)
4918 deferred_ipi_supported = true;
4919 #endif /* CONFIG_SCHED_DEFERRED_AST */
4920
4921 switch (event) {
4922 case SCHED_IPI_EVENT_SPILL:
4923 /* For Spill event, use deferred IPIs if sched_amp_spill_deferred_ipi set */
4924 if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
4925 return sched_ipi_deferred_policy(pset, dst, thread, event);
4926 }
4927 break;
4928 case SCHED_IPI_EVENT_PREEMPT:
4929 /* For preemption, the default policy is to use deferred IPIs
4930 * for Non-RT P-core preemption. Override that behavior if
4931 * sched_amp_pcores_preempt_immediate_ipi is set
4932 */
4933 if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
4934 if (sched_amp_pcores_preempt_immediate_ipi && (pset_type_for_id(pset->pset_cluster_id) == CLUSTER_TYPE_P)) {
4935 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4936 }
4937 if (sched_edge_migrate_ipi_immediate) {
4938 processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
4939 /*
4940 * For IPI'ing CPUs that are homogeneous with the preferred cluster, use immediate IPIs
4941 */
4942 if (preferred_pset->pset_type == pset->pset_type) {
4943 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4944 }
4945 /*
4946 * For workloads that are going wide, it might be useful to use an immediate IPI to
4947 * wake up the idle CPU if the scheduler estimates that the preferred pset will
4948 * be busy for the deferred IPI timeout. The Edge Scheduler uses the avg execution
4949 * latency on the preferred pset as an estimate of busyness.
4950 */
4951 if ((preferred_pset->pset_execution_time[thread->th_sched_bucket].pset_avg_thread_execution_time * NSEC_PER_USEC) >= ml_cpu_signal_deferred_get_timer()) {
4952 return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4953 }
4954 }
4955 }
4956 break;
4957 default:
4958 break;
4959 }
4960 /* Default back to the global policy for all other scenarios */
4961 return sched_ipi_policy(dst, thread, dst_idle, event);
4962 }
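
/*
 * Worked example with assumed numbers (illustrative only): a non-RT thread
 * preempts a CPU on a cluster that is not homogeneous with its preferred
 * pset, the preferred pset's average execution time for the thread's bucket
 * is 400us, and ml_cpu_signal_deferred_get_timer() reports 64us. Since
 * 400us >= 64us, the preferred cluster is expected to stay busy past the
 * deferred-IPI window, so an immediate (or idle) IPI is sent rather than a
 * deferred one.
 */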
4963
4964 /*
4965 * sched_edge_qos_max_parallelism()
4966 */
4967 uint32_t
4968 sched_edge_qos_max_parallelism(int qos, uint64_t options)
4969 {
4970 uint32_t ecpu_count = ml_get_cpu_number_type(CLUSTER_TYPE_E, false, false);
4971 uint32_t pcpu_count = ml_get_cpu_number_type(CLUSTER_TYPE_P, false, false);
4972 uint32_t ecluster_count = ml_get_cluster_number_type(CLUSTER_TYPE_E);
4973 uint32_t pcluster_count = ml_get_cluster_number_type(CLUSTER_TYPE_P);
4974
4975 if (options & QOS_PARALLELISM_REALTIME) {
4976 /* For realtime threads on AMP, limit the parallelism
4977 * width to just the P-cores, since we do not
4978 * spill/rebalance RT threads onto the E-cores.
4979 */
4980 return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? pcluster_count : pcpu_count;
4981 }
4982
4983 /*
4984 * The Edge scheduler supports per-QoS recommendations for thread groups.
4985 * This enables lower QoS buckets (such as UT) to be scheduled on all
4986 * CPUs on the system.
4987 *
4988 * The only restriction is for BG/Maintenance QoS classes for which the
4989 * performance controller would never recommend execution on the P-cores.
4990 * If that policy changes in the future, this value should be changed.
4991 */
4992 switch (qos) {
4993 case THREAD_QOS_BACKGROUND:
4994 case THREAD_QOS_MAINTENANCE:
4995 return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? ecluster_count : ecpu_count;
4996 default:
4997 return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? (ecluster_count + pcluster_count) : (ecpu_count + pcpu_count);
4998 }
4999 }
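
/*
 * Worked example (assumed 4 E-cores in one cluster and 4 P-cores in one
 * cluster): a default/user-interactive QoS request returns 8 (all cores),
 * BG or maintenance returns 4 (E-cores only), and a realtime request
 * returns 4 (P-cores only). With QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE
 * set, the same requests return cluster counts instead: 2, 1 and 1
 * respectively.
 */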
5000
5001
5002
5003 #endif /* CONFIG_SCHED_EDGE */
5004
5005 #endif /* CONFIG_SCHED_CLUTCH */
5006