xref: /xnu-11215.81.4/osfmk/kern/sched_clutch.c (revision d4514f0bc1d3f944c22d92e68b646ac3fb40d452)
1 /*
2  * Copyright (c) 2018 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #if !SCHED_TEST_HARNESS
30 
31 #include <mach/mach_types.h>
32 #include <mach/machine.h>
33 #include <machine/machine_routines.h>
34 #include <machine/sched_param.h>
35 #include <machine/machine_cpu.h>
36 #include <kern/kern_types.h>
37 #include <kern/debug.h>
38 #include <kern/machine.h>
39 #include <kern/misc_protos.h>
40 #include <kern/processor.h>
41 #include <kern/queue.h>
42 #include <kern/sched.h>
43 #include <kern/task.h>
44 #include <kern/thread.h>
45 #include <kern/sched_clutch.h>
46 #include <machine/atomic.h>
47 #include <kern/sched_clutch.h>
48 #include <sys/kdebug.h>
49 
50 #endif /* !SCHED_TEST_HARNESS */
51 
52 #include <kern/sched_prim.h>
53 
54 #if CONFIG_SCHED_EDGE
55 #include <kern/sched_amp_common.h>
56 #endif /* CONFIG_SCHED_EDGE */
57 
58 #if CONFIG_SCHED_CLUTCH
59 
60 #define SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION 1
61 typedef union {
62 	struct __attribute__((packed)) {
63 		unsigned int version                            : 4;
64 		unsigned int traverse_mode                      : 3;
65 		unsigned int cluster_id                         : 6;
66 		unsigned int selection_was_edf                  : 1;
67 		unsigned int selection_was_cluster_bound        : 1;
68 		unsigned int selection_opened_starvation_avoidance_window  : 1;
69 		unsigned int selection_opened_warp_window       : 1;
70 		unsigned int starvation_avoidance_window_close  : 12;
71 		unsigned int warp_window_close                  : 12;
72 		unsigned int reserved                           : 23;  /* For future usage */
73 	} trace_data;
74 	uint64_t scdts_trace_data_packed;
75 } sched_clutch_dbg_thread_select_packed_t;
76 
77 static_assert(TH_BUCKET_SCHED_MAX == 6, "Ensure layout of sched_clutch_dbg_thread_select_packed can fit root bucket bitmasks");
78 static_assert(sizeof(sched_clutch_dbg_thread_select_packed_t) <= sizeof(uint64_t), "Ensure sched_clutch_dbg_thread_select_packed_t can fit in one tracepoint argument");
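
/*
 * Illustrative sketch (not part of the original source): how the packed
 * debug word defined above might be filled in during thread selection and
 * emitted as a single tracepoint argument. The variable names below are
 * placeholders; only fields of the union above are used.
 *
 *	sched_clutch_dbg_thread_select_packed_t dbg = {0};
 *	dbg.trace_data.version = SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION;
 *	dbg.trace_data.cluster_id = 1;           // cluster performing the selection
 *	dbg.trace_data.selection_was_edf = 1;    // selection came from the EDF path
 *	uint64_t tracepoint_arg = dbg.scdts_trace_data_packed;
 */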
79 
80 /* Forward declarations of static routines */
81 
82 /* Root level hierarchy management */
83 static void sched_clutch_root_init(sched_clutch_root_t, processor_set_t);
84 static void sched_clutch_root_bucket_init(sched_clutch_root_bucket_t, sched_bucket_t, bool);
85 static void sched_clutch_root_pri_update(sched_clutch_root_t);
86 static void sched_clutch_root_urgency_inc(sched_clutch_root_t, thread_t);
87 static void sched_clutch_root_urgency_dec(sched_clutch_root_t, thread_t);
88 
89 __enum_decl(sched_clutch_highest_root_bucket_type_t, uint32_t, {
90 	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_NONE           = 0,
91 	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY   = 1,
92 	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL            = 2,
93 });
94 __enum_decl(sched_clutch_traverse_mode_t, uint32_t, {
95 	SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY   = 0,
96 	SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT = 1,
97 	SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT           = 2,
98 });
99 static_assert(SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT < (1 << 3), "Ensure traverse mode can be encoded within 3 bits of sched_clutch_dbg_thread_select_packed_t");
100 static sched_clutch_root_bucket_t sched_clutch_root_highest_root_bucket(sched_clutch_root_t, uint64_t, sched_clutch_highest_root_bucket_type_t, sched_clutch_root_bucket_t, thread_t, bool *, sched_clutch_traverse_mode_t, sched_clutch_dbg_thread_select_packed_t *);
101 
102 #if CONFIG_SCHED_EDGE
103 /* Support for foreign threads on AMP platforms */
104 static boolean_t sched_clutch_root_foreign_empty(sched_clutch_root_t);
105 static thread_t sched_clutch_root_highest_foreign_thread_remove(sched_clutch_root_t);
106 #endif /* CONFIG_SCHED_EDGE */
107 
108 /* Root bucket level hierarchy management */
109 static uint64_t sched_clutch_root_bucket_deadline_calculate(sched_clutch_root_bucket_t, uint64_t);
110 static void sched_clutch_root_bucket_deadline_update(sched_clutch_root_bucket_t, sched_clutch_root_t, uint64_t, bool);
111 static int sched_clutch_root_highest_runnable_qos(sched_clutch_root_t, sched_clutch_highest_root_bucket_type_t);
112 
113 /* Options for clutch bucket ordering in the runq */
114 __options_decl(sched_clutch_bucket_options_t, uint32_t, {
115 	SCHED_CLUTCH_BUCKET_OPTIONS_NONE        = 0x0,
116 	/* Round robin clutch bucket on thread removal */
117 	SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR  = 0x1,
118 	/* Insert clutch bucket at head (for thread preemption) */
119 	SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ       = 0x2,
120 	/* Insert clutch bucket at tail (default) */
121 	SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ       = 0x4,
122 });
123 
124 /* Clutch bucket level hierarchy management */
125 static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
126 static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
127 static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
128 static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
129 static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
130 static uint8_t sched_clutch_bucket_pri_calculate(sched_clutch_bucket_t, uint64_t);
131 
132 /* Clutch bucket group level properties management */
133 static void sched_clutch_bucket_group_cpu_usage_update(sched_clutch_bucket_group_t, uint64_t);
134 static void sched_clutch_bucket_group_cpu_adjust(sched_clutch_bucket_group_t, uint8_t);
135 static void sched_clutch_bucket_group_timeshare_update(sched_clutch_bucket_group_t, sched_clutch_bucket_t, uint64_t);
136 static uint8_t sched_clutch_bucket_group_pending_ageout(sched_clutch_bucket_group_t, uint64_t);
137 static uint32_t sched_clutch_bucket_group_run_count_inc(sched_clutch_bucket_group_t);
138 static uint32_t sched_clutch_bucket_group_run_count_dec(sched_clutch_bucket_group_t);
139 static uint8_t sched_clutch_bucket_group_interactivity_score_calculate(sched_clutch_bucket_group_t, uint64_t);
140 
141 /* Clutch timeshare properties updates */
142 static uint32_t sched_clutch_run_bucket_incr(sched_clutch_t, sched_bucket_t);
143 static uint32_t sched_clutch_run_bucket_decr(sched_clutch_t, sched_bucket_t);
144 
145 /* Clutch membership management */
146 static boolean_t sched_clutch_thread_insert(sched_clutch_root_t, thread_t, integer_t);
147 static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t, sched_clutch_bucket_options_t);
148 static thread_t sched_clutch_hierarchy_thread_highest(sched_clutch_root_t, processor_t, thread_t, sched_clutch_traverse_mode_t);
149 
150 /* Clutch properties updates */
151 static uint32_t sched_clutch_root_urgency(sched_clutch_root_t);
152 static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t);
153 static int sched_clutch_root_priority(sched_clutch_root_t);
154 static sched_clutch_bucket_t sched_clutch_root_bucket_highest_clutch_bucket(sched_clutch_root_t, sched_clutch_root_bucket_t, processor_t _Nullable processor, thread_t _Nullable prev_thread, bool *_Nullable chose_prev_thread);
155 
156 /* Clutch thread properties */
157 static boolean_t sched_thread_sched_pri_promoted(thread_t);
158 static inline sched_clutch_bucket_t sched_clutch_bucket_for_thread(sched_clutch_root_t, thread_t);
159 static inline sched_clutch_bucket_group_t sched_clutch_bucket_group_for_thread(thread_t);
160 
161 /* General utilities */
162 static inline bool sched_clutch_pri_greater_than_tiebreak(int, int, bool);
163 
164 #if CONFIG_SCHED_EDGE
165 /* System based routines */
166 static bool sched_edge_pset_available(processor_set_t);
167 static uint32_t sched_edge_thread_bound_cluster_id(thread_t);
168 static int sched_edge_iterate_clusters_ordered(processor_set_t, uint64_t, int);
169 
170 /* Global indicating the maximum number of clusters on the current platform */
171 static int sched_edge_max_clusters = 0;
172 #endif /* CONFIG_SCHED_EDGE */
173 
174 /* Helper debugging routines */
175 static inline void sched_clutch_hierarchy_locked_assert(sched_clutch_root_t);
176 
177 extern processor_set_t pset_array[MAX_PSETS];
178 
179 /*
180  * Special markers for buckets that have invalid WCELs/quantums etc.
181  */
182 #define SCHED_CLUTCH_INVALID_TIME_32 ((uint32_t)~0)
183 #define SCHED_CLUTCH_INVALID_TIME_64 ((uint64_t)~0)
184 
185 /*
186  * Root level bucket WCELs
187  *
188  * The root level bucket selection algorithm is an Earliest Deadline
189  * First (EDF) algorithm, where the deadline for a bucket is defined
190  * by its worst-case execution latency (WCEL) and the timestamp at
191  * which the bucket was made runnable.
192  *
193  */
194 static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = {
195 	SCHED_CLUTCH_INVALID_TIME_32,                   /* FIXPRI */
196 	0,                                              /* FG */
197 	37500,                                          /* IN (37.5ms) */
198 	75000,                                          /* DF (75ms) */
199 	150000,                                         /* UT (150ms) */
200 	250000                                          /* BG (250ms) */
201 };
202 static uint64_t sched_clutch_root_bucket_wcel[TH_BUCKET_SCHED_MAX] = {0};
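
/*
 * Worked example (illustrative, not from the original source): with the
 * WCEL table above, an unbound DF root bucket made runnable at time T gets
 * deadline T + 75ms, while a UT root bucket that became runnable at
 * T - 100ms already carries deadline (T - 100ms) + 150ms = T + 50ms. Under
 * EDF the UT bucket is picked first despite its lower QoS, which is how
 * long-waiting low QoS buckets eventually get CPU time.
 */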
203 
204 /*
205  * Root level bucket warp
206  *
207  * Each root level bucket has a warp value associated with it as well.
208  * The warp value allows the root bucket to effectively warp ahead of
209  * lower priority buckets for a limited time even if it has a later
210  * deadline. The warping behavior provides extra (but limited)
211  * opportunity for high priority buckets to remain responsive.
212  */
213 
214 /* Special warp deadline value to indicate that the bucket has not used any warp yet */
215 #define SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED    (SCHED_CLUTCH_INVALID_TIME_64)
216 
217 /* Warp window durations for various tiers */
218 static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = {
219 	SCHED_CLUTCH_INVALID_TIME_32,                   /* FIXPRI */
220 	8000,                                           /* FG (8ms) */
221 	4000,                                           /* IN (4ms) */
222 	2000,                                           /* DF (2ms) */
223 	1000,                                           /* UT (1ms) */
224 	0                                               /* BG (0ms) */
225 };
226 static uint64_t sched_clutch_root_bucket_warp[TH_BUCKET_SCHED_MAX] = {0};
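
/*
 * Worked example (illustrative, not from the original source): using the
 * warp table above, an IN root bucket has a 4ms warp budget. If it becomes
 * runnable while a lower QoS bucket holds the earliest deadline, it may warp
 * ahead and open a warp window that closes after 4ms of wall time. Once the
 * window expires (or the budget is used up), selection falls back to plain
 * EDF ordering; the budget is replenished the next time the bucket is
 * selected in natural EDF order.
 */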
227 
228 /*
229  * Thread level quantum
230  *
231  * The algorithm defines quantums for threads at various buckets. This
232  * (combined with the root level bucket quantums) restricts how much
233  * the lower priority levels can delay the higher priority threads.
234  */
235 
236 #if XNU_TARGET_OS_OSX
237 static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
238 	10000,                                          /* FIXPRI (10ms) */
239 	10000,                                          /* FG (10ms) */
240 	10000,                                          /* IN (10ms) */
241 	10000,                                          /* DF (10ms) */
242 	4000,                                           /* UT (4ms) */
243 	2000                                            /* BG (2ms) */
244 };
245 #else /* XNU_TARGET_OS_OSX */
246 static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
247 	10000,                                          /* FIXPRI (10ms) */
248 	10000,                                          /* FG (10ms) */
249 	8000,                                           /* IN (8ms) */
250 	6000,                                           /* DF (6ms) */
251 	4000,                                           /* UT (4ms) */
252 	2000                                            /* BG (2ms) */
253 };
254 #endif /* XNU_TARGET_OS_OSX */
255 
256 static uint64_t sched_clutch_thread_quantum[TH_BUCKET_SCHED_MAX] = {0};
257 
258 /*
259  * sched_clutch_us_to_abstime()
260  *
261  * Initializer for converting all durations in usec to abstime
262  */
263 static void
264 sched_clutch_us_to_abstime(uint32_t *us_vals, uint64_t *abstime_vals)
265 {
266 	for (int i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
267 		if (us_vals[i] == SCHED_CLUTCH_INVALID_TIME_32) {
268 			abstime_vals[i] = SCHED_CLUTCH_INVALID_TIME_64;
269 		} else {
270 			clock_interval_to_absolutetime_interval(us_vals[i],
271 			    NSEC_PER_USEC, &abstime_vals[i]);
272 		}
273 	}
274 }
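
/*
 * Illustrative usage sketch (assumption: the scheduler init path performs
 * conversions along these lines; the actual call sites are outside this
 * excerpt):
 *
 *	sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
 *	sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
 *	sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
 */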
275 
276 /* Clutch/Edge Scheduler Debugging support */
277 #define SCHED_CLUTCH_DBG_THR_COUNT_PACK(a, b, c)        ((uint64_t)c | ((uint64_t)b << 16) | ((uint64_t)a << 32))
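
/*
 * Example (illustrative): SCHED_CLUTCH_DBG_THR_COUNT_PACK(a, b, c) packs
 * three 16-bit thread counts into one 64-bit tracepoint argument, with c in
 * bits 0-15, b in bits 16-31 and a in bits 32-47; e.g. packing (1, 2, 3)
 * yields 0x0000000100020003.
 */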
278 
279 #if DEVELOPMENT || DEBUG
280 
281 /*
282  * sched_clutch_hierarchy_locked_assert()
283  *
284  * Debugging helper routine. Asserts that the hierarchy is locked. The locking
285  * for the hierarchy depends on where the hierarchy is hooked. The current
286  * implementation hooks the hierarchy at the pset, so the hierarchy is locked
287  * using the pset lock.
288  */
289 static inline void
290 sched_clutch_hierarchy_locked_assert(
291 	sched_clutch_root_t root_clutch)
292 {
293 	pset_assert_locked(root_clutch->scr_pset);
294 }
295 
296 #else /* DEVELOPMENT || DEBUG */
297 
298 static inline void
299 sched_clutch_hierarchy_locked_assert(
300 	__unused sched_clutch_root_t root_clutch)
301 {
302 }
303 
304 #endif /* DEVELOPMENT || DEBUG */
305 
306 /*
307  * sched_clutch_thr_count_inc()
308  *
309  * Increment thread count at a hierarchy level with overflow checks.
310  */
311 static void
312 sched_clutch_thr_count_inc(
313 	uint16_t *thr_count)
314 {
315 	if (__improbable(os_inc_overflow(thr_count))) {
316 		panic("sched_clutch thread count overflowed!");
317 	}
318 }
319 
320 /*
321  * sched_clutch_thr_count_dec()
322  *
323  * Decrement thread count at a hierarchy level with underflow checks.
324  */
325 static void
326 sched_clutch_thr_count_dec(
327 	uint16_t *thr_count)
328 {
329 	if (__improbable(os_dec_overflow(thr_count))) {
330 		panic("sched_clutch thread count underflowed!");
331 	}
332 }
333 
334 static sched_bucket_t
335 sched_convert_pri_to_bucket(uint8_t priority)
336 {
337 	sched_bucket_t bucket = TH_BUCKET_RUN;
338 
339 	if (priority > BASEPRI_USER_INITIATED) {
340 		bucket = TH_BUCKET_SHARE_FG;
341 	} else if (priority > BASEPRI_DEFAULT) {
342 		bucket = TH_BUCKET_SHARE_IN;
343 	} else if (priority > BASEPRI_UTILITY) {
344 		bucket = TH_BUCKET_SHARE_DF;
345 	} else if (priority > MAXPRI_THROTTLE) {
346 		bucket = TH_BUCKET_SHARE_UT;
347 	} else {
348 		bucket = TH_BUCKET_SHARE_BG;
349 	}
350 	return bucket;
351 }
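
/*
 * Illustrative mapping (assuming the usual XNU base priorities, e.g.
 * BASEPRI_USER_INITIATED == 37, BASEPRI_DEFAULT == 31, BASEPRI_UTILITY == 20,
 * MAXPRI_THROTTLE == 4):
 *
 *	sched_convert_pri_to_bucket(40);   // > 37            -> TH_BUCKET_SHARE_FG
 *	sched_convert_pri_to_bucket(37);   // > 31            -> TH_BUCKET_SHARE_IN
 *	sched_convert_pri_to_bucket(31);   // > 20            -> TH_BUCKET_SHARE_DF
 *	sched_convert_pri_to_bucket(20);   // > 4             -> TH_BUCKET_SHARE_UT
 *	sched_convert_pri_to_bucket(4);    // everything else -> TH_BUCKET_SHARE_BG
 */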
352 
353 /*
354  * sched_clutch_thread_bucket_map()
355  *
356  * Map a thread to a scheduling bucket for the clutch/edge scheduler
357  * based on its scheduling mode and the priority attribute passed in.
358  */
359 static sched_bucket_t
360 sched_clutch_thread_bucket_map(thread_t thread, int pri)
361 {
362 	switch (thread->sched_mode) {
363 	case TH_MODE_FIXED:
364 		if (pri >= BASEPRI_FOREGROUND) {
365 			return TH_BUCKET_FIXPRI;
366 		} else {
367 			return sched_convert_pri_to_bucket(pri);
368 		}
369 
370 	case TH_MODE_REALTIME:
371 		return TH_BUCKET_FIXPRI;
372 
373 	case TH_MODE_TIMESHARE:
374 		return sched_convert_pri_to_bucket(pri);
375 
376 	default:
377 		panic("unexpected mode: %d", thread->sched_mode);
378 		break;
379 	}
380 }
381 
382 /*
383  * The clutch scheduler attempts to ageout the CPU usage of clutch bucket groups
384  * based on the amount of time they have been pending and the load at that
385  * scheduling bucket level. Since the clutch bucket groups are global (i.e. span
386  * multiple clusters, its important to keep the load also as a global counter.
387  */
388 static uint32_t _Atomic sched_clutch_global_bucket_load[TH_BUCKET_SCHED_MAX];
389 
390 /*
391  * sched_clutch_root_init()
392  *
393  * Routine to initialize the scheduler hierarchy root.
394  */
395 static void
396 sched_clutch_root_init(
397 	sched_clutch_root_t root_clutch,
398 	processor_set_t pset)
399 {
400 	root_clutch->scr_thr_count = 0;
401 	root_clutch->scr_priority = NOPRI;
402 	root_clutch->scr_urgency = 0;
403 	root_clutch->scr_pset = pset;
404 #if CONFIG_SCHED_EDGE
405 	root_clutch->scr_cluster_id = pset->pset_cluster_id;
406 	for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
407 		root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type] = 0;
408 	}
409 #else /* CONFIG_SCHED_EDGE */
410 	root_clutch->scr_cluster_id = 0;
411 #endif /* CONFIG_SCHED_EDGE */
412 
413 	/* Initialize the queue which maintains all runnable clutch_buckets for timesharing purposes */
414 	queue_init(&root_clutch->scr_clutch_buckets);
415 
416 	/* Initialize the priority queue which maintains all runnable foreign clutch buckets */
417 	priority_queue_init(&root_clutch->scr_foreign_buckets);
418 	bzero(&root_clutch->scr_cumulative_run_count, sizeof(root_clutch->scr_cumulative_run_count));
419 	bitmap_zero(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
420 	bitmap_zero(root_clutch->scr_bound_warp_available, TH_BUCKET_SCHED_MAX);
421 	priority_queue_init(&root_clutch->scr_bound_root_buckets);
422 
423 	/* Initialize the bitmap and priority queue of runnable root buckets */
424 	priority_queue_init(&root_clutch->scr_unbound_root_buckets);
425 	bitmap_zero(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
426 	bitmap_zero(root_clutch->scr_unbound_warp_available, TH_BUCKET_SCHED_MAX);
427 
428 	/* Initialize all the root buckets */
429 	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
430 		sched_clutch_root_bucket_init(&root_clutch->scr_unbound_buckets[i], i, false);
431 		sched_clutch_root_bucket_init(&root_clutch->scr_bound_buckets[i], i, true);
432 	}
433 }
434 
435 /*
436  * Clutch Bucket Runqueues
437  *
438  * The clutch buckets are maintained in a runq at the root bucket level. The
439  * runq organization allows clutch buckets to be ordered based on various
440  * factors such as:
441  *
442  * - Clutch buckets are round robin'ed at the same priority level when a
443  *   thread is selected from a clutch bucket. This prevents a clutch bucket
444  *   from starving out other clutch buckets at the same priority.
445  *
446  * - Clutch buckets are inserted at the head when they become runnable due to
447  *   thread preemption. This allows threads that were preempted to maintain
448  *   their order in the queue.
449  */
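
/*
 * Illustrative sketch (not from the original source): how the options
 * defined earlier map onto the enqueue primitive below. A clutch bucket
 * that becomes runnable because its thread was preempted goes to the head
 * of its priority queue; the default is the tail. Here runq and
 * clutch_bucket are placeholder names for a sched_clutch_bucket_runq_t and
 * a sched_clutch_bucket_t.
 *
 *	sched_clutch_bucket_runq_enqueue(runq, clutch_bucket, SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ);
 *	sched_clutch_bucket_runq_enqueue(runq, clutch_bucket, SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ);
 */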
450 
451 /*
452  * sched_clutch_bucket_runq_init()
453  *
454  * Initialize a clutch bucket runq.
455  */
456 static void
457 sched_clutch_bucket_runq_init(
458 	sched_clutch_bucket_runq_t clutch_buckets_rq)
459 {
460 	clutch_buckets_rq->scbrq_highq = NOPRI;
461 	for (uint8_t i = 0; i < BITMAP_LEN(NRQS); i++) {
462 		clutch_buckets_rq->scbrq_bitmap[i] = 0;
463 	}
464 	clutch_buckets_rq->scbrq_count = 0;
465 	for (int i = 0; i < NRQS; i++) {
466 		circle_queue_init(&clutch_buckets_rq->scbrq_queues[i]);
467 	}
468 }
469 
470 /*
471  * sched_clutch_bucket_runq_empty()
472  *
473  * Returns whether a clutch bucket runq is empty.
474  */
475 static boolean_t
476 sched_clutch_bucket_runq_empty(
477 	sched_clutch_bucket_runq_t clutch_buckets_rq)
478 {
479 	return clutch_buckets_rq->scbrq_count == 0;
480 }
481 
482 /*
483  * sched_clutch_bucket_runq_peek()
484  *
485  * Returns the highest priority clutch bucket in the runq.
486  */
487 static sched_clutch_bucket_t
488 sched_clutch_bucket_runq_peek(
489 	sched_clutch_bucket_runq_t clutch_buckets_rq)
490 {
491 	if (clutch_buckets_rq->scbrq_count > 0) {
492 		circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_buckets_rq->scbrq_highq];
493 		return cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink);
494 	} else {
495 		return NULL;
496 	}
497 }
498 
499 /*
500  * sched_clutch_bucket_runq_enqueue()
501  *
502  * Enqueue a clutch bucket into the runq based on the options passed in.
503  */
504 static void
505 sched_clutch_bucket_runq_enqueue(
506 	sched_clutch_bucket_runq_t clutch_buckets_rq,
507 	sched_clutch_bucket_t clutch_bucket,
508 	sched_clutch_bucket_options_t options)
509 {
510 	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
511 	if (circle_queue_empty(queue)) {
512 		circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
513 		bitmap_set(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
514 		if (clutch_bucket->scb_priority > clutch_buckets_rq->scbrq_highq) {
515 			clutch_buckets_rq->scbrq_highq = clutch_bucket->scb_priority;
516 		}
517 	} else {
518 		if (options & SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ) {
519 			circle_enqueue_head(queue, &clutch_bucket->scb_runqlink);
520 		} else {
521 			/*
522 			 * Default behavior (handles SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ &
523 			 * SCHED_CLUTCH_BUCKET_OPTIONS_NONE)
524 			 */
525 			circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
526 		}
527 	}
528 	clutch_buckets_rq->scbrq_count++;
529 }
530 
531 /*
532  * sched_clutch_bucket_runq_remove()
533  *
534  * Remove a clutch bucket from the runq.
535  */
536 static void
537 sched_clutch_bucket_runq_remove(
538 	sched_clutch_bucket_runq_t clutch_buckets_rq,
539 	sched_clutch_bucket_t clutch_bucket)
540 {
541 	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
542 	circle_dequeue(queue, &clutch_bucket->scb_runqlink);
543 	assert(clutch_buckets_rq->scbrq_count > 0);
544 	clutch_buckets_rq->scbrq_count--;
545 	if (circle_queue_empty(queue)) {
546 		bitmap_clear(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
547 		clutch_buckets_rq->scbrq_highq = bitmap_first(clutch_buckets_rq->scbrq_bitmap, NRQS);
548 	}
549 }
550 
551 static void
552 sched_clutch_bucket_runq_rotate(
553 	sched_clutch_bucket_runq_t clutch_buckets_rq,
554 	sched_clutch_bucket_t clutch_bucket)
555 {
556 	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
557 	assert(clutch_bucket == cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink));
558 	circle_queue_rotate_head_forward(queue);
559 }
560 
561 /*
562  * sched_clutch_root_bucket_init()
563  *
564  * Routine to initialize root buckets.
565  */
566 static void
567 sched_clutch_root_bucket_init(
568 	sched_clutch_root_bucket_t root_bucket,
569 	sched_bucket_t bucket,
570 	bool bound_root_bucket)
571 {
572 	root_bucket->scrb_bucket = bucket;
573 	if (bound_root_bucket) {
574 		/* For bound root buckets, initialize the bound thread runq. */
575 		root_bucket->scrb_bound = true;
576 		run_queue_init(&root_bucket->scrb_bound_thread_runq);
577 	} else {
578 		/*
579 		 * The unbounded root buckets contain a runq of runnable clutch buckets
580 		 * which then hold the runnable threads.
581 		 */
582 		root_bucket->scrb_bound = false;
583 		sched_clutch_bucket_runq_init(&root_bucket->scrb_clutch_buckets);
584 	}
585 	priority_queue_entry_init(&root_bucket->scrb_pqlink);
586 	root_bucket->scrb_pqlink.deadline = 0;
587 	root_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
588 	root_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[root_bucket->scrb_bucket];
589 	root_bucket->scrb_starvation_avoidance = false;
590 	root_bucket->scrb_starvation_ts = 0;
591 }
592 
593 /*
594  * Special case scheduling for Above UI bucket.
595  *
596  * AboveUI threads are typically system-critical threads that need low latency,
597  * which is why they are handled specially.
598  *
599  * Since the priority ranges for the AboveUI and FG Timeshare buckets overlap, it is
600  * important to maintain some native priority order between those buckets. For unbounded
601  * root buckets, the policy is to compare the highest clutch buckets of the two root buckets; if the
602  * Above UI bucket is higher, schedule it immediately. Otherwise fall through to the
603  * deadline based scheduling, which should pick up the timeshare buckets. For the bound
604  * case, the policy simply compares the priority of the highest runnable threads in
605  * the above UI and timeshare buckets.
606  *
607  * The implementation allows extremely low latency CPU access for Above UI threads
608  * while supporting the use case of high priority timeshare threads contending with
609  * lower priority fixed priority threads.
610  */
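
/*
 * Worked scenario (illustrative, not from the original source): if the
 * highest unbound AboveUI clutch bucket sits at priority 50 and the highest
 * unbound Timeshare FG clutch bucket has decayed to priority 45, the
 * AboveUI root bucket is selected immediately. If the FG clutch bucket is
 * at priority 52 instead, AboveUI loses the comparison and selection falls
 * through to the EDF/warp logic, which will eventually pick the AboveUI
 * bucket once the FG bucket's interactivity score drops.
 */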
611 
612 
613 /*
614  * sched_clutch_root_unbound_select_aboveui()
615  *
616  * Routine to determine if the above UI unbounded bucket should be selected for execution.
617  *
618  * Writes the highest unbound (timeshare FG vs. above UI) bucket, its priority, and whether
619  * it is an above UI bucket into the pointer parameters.
620  */
621 static void
622 sched_clutch_root_unbound_select_aboveui(
623 	sched_clutch_root_t root_clutch,
624 	sched_clutch_root_bucket_t *highest_bucket,
625 	int *highest_pri,
626 	bool *highest_is_aboveui,
627 	sched_clutch_root_bucket_t _Nullable prev_bucket,
628 	thread_t _Nullable prev_thread)
629 {
630 	/* First determine the highest Clutch bucket */
631 	sched_clutch_root_bucket_t higher_root_bucket = NULL;
632 	sched_clutch_bucket_t higher_clutch_bucket = NULL;
633 	int higher_bucket_sched_pri = -1;
634 	bool higher_is_aboveui = false;
635 	/* Consider unbound Above UI */
636 	if (bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_FIXPRI)) {
637 		higher_root_bucket = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
638 		higher_clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, higher_root_bucket, NULL, NULL, NULL);
639 		higher_bucket_sched_pri = priority_queue_max_sched_pri(&higher_clutch_bucket->scb_clutchpri_prioq);
640 		higher_is_aboveui = true;
641 	}
642 	/* Consider unbound Timeshare FG */
643 	if (bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SHARE_FG)) {
644 		sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_unbound_buckets[TH_BUCKET_SHARE_FG];
645 		sched_clutch_bucket_t clutch_bucket_sharefg = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, root_bucket_sharefg, NULL, NULL, NULL);
646 		/* Strict greater-than because unbound timeshare FG root bucket loses all priority ties at this level */
647 		if (higher_root_bucket == NULL || clutch_bucket_sharefg->scb_priority > higher_clutch_bucket->scb_priority) {
648 			higher_root_bucket = root_bucket_sharefg;
649 			higher_clutch_bucket = clutch_bucket_sharefg;
650 			higher_bucket_sched_pri = priority_queue_max_sched_pri(&higher_clutch_bucket->scb_clutchpri_prioq);
651 			higher_is_aboveui = false;
652 		}
653 	}
654 	/* Consider the previous thread */
655 	if (prev_thread != NULL) {
656 		assert(prev_bucket->scrb_bound == false);
657 		sched_clutch_bucket_group_t prev_clutch_bucket_group = sched_clutch_bucket_group_for_thread(prev_thread);
658 		int prev_clutch_bucket_pri = prev_thread->sched_pri + (int)(os_atomic_load(&prev_clutch_bucket_group->scbg_interactivity_data.scct_count, relaxed));
659 		sched_clutch_bucket_t prev_clutch_bucket = sched_clutch_bucket_for_thread(root_clutch, prev_thread);
660 		bool prev_bucket_should_win_ties = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI && higher_is_aboveui == false;
661 		if (higher_clutch_bucket == NULL ||
662 		    sched_clutch_pri_greater_than_tiebreak(prev_clutch_bucket_pri, higher_clutch_bucket->scb_priority, prev_bucket_should_win_ties)) {
663 			higher_root_bucket = prev_bucket;
664 			higher_clutch_bucket = prev_clutch_bucket;
665 			higher_bucket_sched_pri = prev_thread->sched_pri;
666 			higher_is_aboveui = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI;
667 		}
668 	}
669 	/* Compare highest priority in the highest unbound Clutch bucket to highest priority seen from the bound buckets */
670 	if (higher_root_bucket != NULL) {
671 		bool unbound_should_win_ties = higher_is_aboveui == true && *highest_is_aboveui == false;
672 		if (sched_clutch_pri_greater_than_tiebreak(higher_bucket_sched_pri, *highest_pri, unbound_should_win_ties)) {
673 			*highest_pri = higher_bucket_sched_pri;
674 			*highest_bucket = higher_root_bucket;
675 			*highest_is_aboveui = higher_is_aboveui;
676 		}
677 	}
678 }
679 
680 /*
681  * sched_clutch_root_bound_select_aboveui()
682  *
683  * Routine to determine if the above UI bounded bucket should be selected for execution.
684  *
685  * Writes the highest bound (timeshare FG vs. above UI) bucket, its priority, and whether
686  * it is an above UI bucket into the pointer parameters.
687  */
688 static void
689 sched_clutch_root_bound_select_aboveui(
690 	sched_clutch_root_t root_clutch,
691 	sched_clutch_root_bucket_t *highest_bucket,
692 	int *highest_pri,
693 	bool *highest_is_aboveui,
694 	sched_clutch_root_bucket_t _Nullable prev_bucket,
695 	thread_t _Nullable prev_thread)
696 {
697 	/* Consider bound Above UI */
698 	sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
699 	if (bitmap_test(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_FIXPRI) &&
700 	    sched_clutch_pri_greater_than_tiebreak(root_bucket_aboveui->scrb_bound_thread_runq.highq, *highest_pri, *highest_is_aboveui == false)) {
701 		*highest_pri = root_bucket_aboveui->scrb_bound_thread_runq.highq;
702 		*highest_bucket = root_bucket_aboveui;
703 		*highest_is_aboveui = true;
704 	}
705 	/* Consider bound Timeshare FG */
706 	sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_bound_buckets[TH_BUCKET_SHARE_FG];
707 	if (bitmap_test(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SHARE_FG) &&
708 	    sched_clutch_pri_greater_than_tiebreak(root_bucket_sharefg->scrb_bound_thread_runq.highq, *highest_pri, false)) {
709 		*highest_pri = root_bucket_sharefg->scrb_bound_thread_runq.highq;
710 		*highest_bucket = root_bucket_sharefg;
711 		*highest_is_aboveui = false;
712 	}
713 	/* Consider the previous thread */
714 	if (prev_thread != NULL) {
715 		assert(prev_bucket->scrb_bound == true);
716 		bool prev_bucket_should_win_ties = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI && *highest_is_aboveui == false;
717 		if (sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, *highest_pri, prev_bucket_should_win_ties)) {
718 			*highest_pri = prev_thread->sched_pri;
719 			*highest_bucket = prev_bucket;
720 			*highest_is_aboveui = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI;
721 		}
722 	}
723 }
724 
725 /*
726  * sched_clutch_root_highest_runnable_qos()
727  *
728  * Returns the index of the highest-QoS root bucket which is currently runnable.
729  */
730 static int
731 sched_clutch_root_highest_runnable_qos(
732 	sched_clutch_root_t root_clutch,
733 	sched_clutch_highest_root_bucket_type_t type)
734 {
735 	int highest_unbound_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
736 	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
737 		return highest_unbound_bucket;
738 	}
739 	assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
740 	int highest_bound_bucket = bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
741 	if (highest_bound_bucket == -1) {
742 		return highest_unbound_bucket;
743 	}
744 	if (highest_unbound_bucket == -1) {
745 		return highest_bound_bucket;
746 	}
747 	/* Both bound and unbound buckets are runnable, return the higher QoS */
748 	return MIN(highest_bound_bucket, highest_unbound_bucket);
749 }
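
/*
 * Note (illustrative): bucket indices grow as QoS drops (TH_BUCKET_FIXPRI is
 * the lowest index), so the MIN() above returns the higher-QoS of the two
 * runnable buckets. For example, with a bound DF bucket and an unbound IN
 * bucket runnable, the IN index (the smaller one) is returned.
 */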
750 
751 /*
752  * sched_clutch_root_highest_aboveui_root_bucket()
753  *
754  * Routine to determine if an above UI root bucket should be selected for execution.
755  *
756  * Returns the root bucket if we should run an above UI bucket or NULL otherwise.
757  */
758 static sched_clutch_root_bucket_t
759 sched_clutch_root_highest_aboveui_root_bucket(
760 	sched_clutch_root_t root_clutch,
761 	sched_clutch_highest_root_bucket_type_t type,
762 	sched_clutch_root_bucket_t _Nullable prev_bucket,
763 	thread_t _Nullable prev_thread,
764 	bool *chose_prev_thread)
765 {
766 	assert((prev_thread == NULL && prev_bucket == NULL) || (prev_thread != NULL && prev_bucket != NULL));
767 	assert((type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL) || (prev_bucket == NULL));
768 
769 	sched_clutch_root_bucket_t highest_bucket = NULL;
770 	int highest_pri = -1;
771 	bool highest_is_aboveui = false;
772 
773 	/* Forward previous thread to the correct comparison logic, based on boundness */
774 	sched_clutch_root_bucket_t bound_prev_bucket = NULL, unbound_prev_bucket = NULL;
775 	thread_t bound_prev_thread = NULL, unbound_prev_thread = NULL;
776 	if (prev_thread != NULL) {
777 		if (prev_bucket->scrb_bound) {
778 			bound_prev_bucket = prev_bucket;
779 			bound_prev_thread = prev_thread;
780 		} else {
781 			unbound_prev_bucket = prev_bucket;
782 			unbound_prev_thread = prev_thread;
783 		}
784 	}
785 
786 	/* Consider bound Above UI vs. Timeshare FG first, so those buckets will win ties against the corresponding unbound buckets */
787 	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL) {
788 		sched_clutch_root_bound_select_aboveui(root_clutch, &highest_bucket, &highest_pri, &highest_is_aboveui, bound_prev_bucket, bound_prev_thread);
789 	}
790 
791 	/* Consider unbound Above UI vs. Timeshare FG */
792 	sched_clutch_root_unbound_select_aboveui(root_clutch, &highest_bucket, &highest_pri, &highest_is_aboveui, unbound_prev_bucket, unbound_prev_thread);
793 	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
794 		return highest_is_aboveui ? highest_bucket : NULL;
795 	}
796 	assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
797 
798 	/* Determine whether we already know to continue running the previous thread */
799 	if (prev_thread != NULL &&
800 	    bitmap_test(highest_bucket->scrb_bound ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap, highest_bucket->scrb_bucket) == false) {
801 		/* Highest bucket we saw is empty, so the previous thread must have been the highest */
802 		assert(highest_bucket == prev_bucket);
803 		*chose_prev_thread = true;
804 	}
805 
806 	return highest_is_aboveui ? highest_bucket : NULL;
807 }
808 
809 /*
810  * sched_clutch_root_highest_root_bucket()
811  *
812  * Main routine to find the highest runnable root level bucket.
813  * This routine is called from performance sensitive contexts, so it is
814  * crucial to keep this O(1). The type parameter determines whether
815  * the selection logic should look at unbounded threads only (for
816  * cross-cluster stealing operations) or both bounded and unbounded
817  * threads (for selecting next thread for execution on current cluster).
818  */
819 static sched_clutch_root_bucket_t
820 sched_clutch_root_highest_root_bucket(
821 	sched_clutch_root_t root_clutch,
822 	uint64_t timestamp,
823 	sched_clutch_highest_root_bucket_type_t type,
824 	sched_clutch_root_bucket_t _Nullable prev_bucket,
825 	thread_t _Nullable prev_thread,
826 	bool *chose_prev_thread,
827 	sched_clutch_traverse_mode_t mode,
828 	sched_clutch_dbg_thread_select_packed_t *debug_info)
829 {
830 	assert((prev_thread == NULL && prev_bucket == NULL) || (prev_thread != NULL && prev_bucket != NULL));
831 	assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL || (prev_thread == NULL));
832 	assert(prev_thread == NULL || (mode != SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY));
833 	sched_clutch_hierarchy_locked_assert(root_clutch);
834 
835 	int highest_runnable_bucket = sched_clutch_root_highest_runnable_qos(root_clutch, type);
836 	if (highest_runnable_bucket == -1) {
837 		/*
838 		 * The Clutch hierarchy has no runnable threads. We can continue running
839 		 * whatever was running previously.
840 		 */
841 		assert(sched_clutch_root_count(root_clutch) == 0 || type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY);
842 		*chose_prev_thread = true;
843 		if (prev_thread != NULL) {
844 			debug_info->trace_data.selection_was_edf = true;
845 		}
846 		return prev_bucket;
847 	}
848 
849 	/* Consider Above UI threads, in comparison to Timeshare FG threads */
850 	sched_clutch_root_bucket_t highest_aboveui_bucket = sched_clutch_root_highest_aboveui_root_bucket(root_clutch, type, prev_bucket, prev_thread, chose_prev_thread);
851 	if (highest_aboveui_bucket != NULL) {
852 		debug_info->trace_data.selection_was_edf = true;
853 		return highest_aboveui_bucket;
854 	}
855 
856 	/*
857 	 * Above UI bucket is not runnable or has a low priority runnable thread; use the
858 	 * earliest deadline model to schedule threads. The idea is that as the timeshare
859 	 * buckets use CPU, they will drop their interactivity score/sched priority and
860 	 * allow the low priority AboveUI buckets to be scheduled.
861 	 */
862 
863 	/* Find the earliest deadline bucket */
864 	sched_clutch_root_bucket_t edf_bucket;
865 	bool edf_bucket_enqueued_normally;
866 
867 evaluate_root_buckets:
868 	edf_bucket = NULL;
869 	edf_bucket_enqueued_normally = true;
870 
871 	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
872 		edf_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
873 	} else {
874 		assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
875 		sched_clutch_root_bucket_t unbound_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
876 		sched_clutch_root_bucket_t bound_bucket = priority_queue_min(&root_clutch->scr_bound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
877 		if (bound_bucket && unbound_bucket) {
878 			/* If bound and unbound root buckets are runnable, select the one with the earlier deadline */
879 			edf_bucket = (bound_bucket->scrb_pqlink.deadline <= unbound_bucket->scrb_pqlink.deadline) ? bound_bucket : unbound_bucket;
880 		} else {
881 			edf_bucket = (bound_bucket) ? bound_bucket : unbound_bucket;
882 		}
883 	}
884 	if (edf_bucket == NULL) {
885 		/* The timeshare portion of the runqueue is empty */
886 		assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
887 		assert(prev_thread != NULL);
888 		*chose_prev_thread = true;
889 		if (prev_thread != NULL) {
890 			debug_info->trace_data.selection_was_edf = true;
891 		}
892 		return prev_bucket;
893 	}
894 	if (prev_bucket != NULL && prev_bucket->scrb_pqlink.deadline < edf_bucket->scrb_pqlink.deadline) {
895 		/* The previous thread's root bucket has the earliest deadline and is not currently enqueued */
896 		edf_bucket = prev_bucket;
897 		edf_bucket_enqueued_normally = false;
898 	}
899 
900 	if (edf_bucket->scrb_starvation_avoidance) {
901 		/* Check if the EDF bucket is in an expired starvation avoidance window */
902 		uint64_t starvation_window = sched_clutch_thread_quantum[edf_bucket->scrb_bucket];
903 		if (timestamp >= (edf_bucket->scrb_starvation_ts + starvation_window)) {
904 			/* Starvation avoidance window is over; update deadline and re-evaluate EDF */
905 			edf_bucket->scrb_starvation_avoidance = false;
906 			edf_bucket->scrb_starvation_ts = 0;
907 			sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp, edf_bucket_enqueued_normally);
908 			bit_set(debug_info->trace_data.starvation_avoidance_window_close, edf_bucket->scrb_bound * TH_BUCKET_SCHED_MAX + edf_bucket->scrb_bucket);
909 			goto evaluate_root_buckets;
910 		}
911 	}
912 
913 	/*
914 	 * Check if any of the buckets have warp available. The implementation only allows root buckets to warp ahead of
915 	 * buckets of the same type (i.e. bound/unbound). The reason is that warping is a concept that
916 	 * makes sense between root buckets of the same type, since it is effectively a scheduling advantage over a lower
917 	 * QoS root bucket.
918 	 */
919 	bitmap_t *warp_available_bitmap = (edf_bucket->scrb_bound) ? (root_clutch->scr_bound_warp_available) : (root_clutch->scr_unbound_warp_available);
920 	int warp_bucket_index = bitmap_lsb_first(warp_available_bitmap, TH_BUCKET_SCHED_MAX);
921 
922 	/* Allow the prev_bucket to use its warp as well */
923 	bool prev_bucket_warping = (prev_bucket != NULL) && (prev_bucket->scrb_bound == edf_bucket->scrb_bound) &&
924 	    prev_bucket->scrb_bucket < edf_bucket->scrb_bucket && (prev_bucket->scrb_warp_remaining > 0) &&
925 	    (warp_bucket_index == -1 || prev_bucket->scrb_bucket < warp_bucket_index);
926 
927 	bool non_edf_bucket_can_warp = (warp_bucket_index != -1 && warp_bucket_index < edf_bucket->scrb_bucket) || prev_bucket_warping;
928 
929 	if (non_edf_bucket_can_warp == false) {
930 		/* No higher buckets have warp left; best choice is the EDF based bucket */
931 		debug_info->trace_data.selection_was_edf = true;
932 
933 		bool should_update_edf_starvation_state = edf_bucket == prev_bucket || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT;
934 		if (edf_bucket->scrb_starvation_avoidance == false && should_update_edf_starvation_state) {
935 			/* Looks like the EDF bucket is not in starvation avoidance mode; check if it should be */
936 			if (highest_runnable_bucket < edf_bucket->scrb_bucket || (prev_bucket != NULL && prev_bucket->scrb_bucket < edf_bucket->scrb_bucket)) {
937 				/*
938 				 * Since a higher bucket is runnable, it indicates that the EDF bucket should be in starvation avoidance.
939 				 *
940 				 * The starvation avoidance window is allocated as a single quantum for the starved bucket, enforced
941 				 * simultaneously across all CPUs in the cluster. The idea is to grant the starved bucket roughly one
942 				 * quantum per core, each time the bucket reaches the earliest deadline position. Note that this
943 				 * cadence is driven by the difference between the starved bucket's and highest-runnable bucket's WCELs.
944 				 */
945 				edf_bucket->scrb_starvation_avoidance = true;
946 				edf_bucket->scrb_starvation_ts = timestamp;
947 				debug_info->trace_data.selection_opened_starvation_avoidance_window = true;
948 			} else {
949 				/* EDF bucket is being selected in the natural order; update deadline and reset warp */
950 				sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp, edf_bucket_enqueued_normally);
951 				edf_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[edf_bucket->scrb_bucket];
952 				edf_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
953 				if (edf_bucket_enqueued_normally) {
954 					if (edf_bucket->scrb_bound) {
955 						bitmap_set(root_clutch->scr_bound_warp_available, edf_bucket->scrb_bucket);
956 					} else {
957 						bitmap_set(root_clutch->scr_unbound_warp_available, edf_bucket->scrb_bucket);
958 					}
959 				}
960 			}
961 		}
962 		*chose_prev_thread = !edf_bucket_enqueued_normally;
963 		return edf_bucket;
964 	}
965 
966 	/*
967 	 * Looks like there is a root bucket which is higher in the natural priority
968 	 * order than edf_bucket and might have some warp remaining.
969 	 */
970 	assert(prev_bucket_warping || warp_bucket_index >= 0);
971 	sched_clutch_root_bucket_t warp_bucket = NULL;
972 	if (prev_bucket_warping) {
973 		assert(warp_bucket_index == -1 || prev_bucket->scrb_bucket < warp_bucket_index);
974 		warp_bucket = prev_bucket;
975 	} else {
976 		warp_bucket = (edf_bucket->scrb_bound) ? &root_clutch->scr_bound_buckets[warp_bucket_index] : &root_clutch->scr_unbound_buckets[warp_bucket_index];
977 	}
978 
979 	bool warp_is_being_utilized = warp_bucket == prev_bucket || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT;
980 
981 	if (warp_bucket->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
982 		if (warp_is_being_utilized) {
983 			/* Root bucket has not used any of its warp; set a deadline to expire its warp and return it */
984 			warp_bucket->scrb_warped_deadline = timestamp + warp_bucket->scrb_warp_remaining;
985 			sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp, !prev_bucket_warping);
986 			debug_info->trace_data.selection_opened_warp_window = true;
987 		}
988 		*chose_prev_thread = prev_bucket_warping;
989 		debug_info->trace_data.selection_was_edf = false;
990 		assert(warp_bucket != edf_bucket);
991 		return warp_bucket;
992 	}
993 	if (warp_bucket->scrb_warped_deadline > timestamp) {
994 		/* Root bucket already has a warp window open with some warp remaining */
995 		if (warp_is_being_utilized) {
996 			sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp, !prev_bucket_warping);
997 		}
998 		*chose_prev_thread = prev_bucket_warping;
999 		debug_info->trace_data.selection_was_edf = false;
1000 		return warp_bucket;
1001 	}
1002 
1003 	/*
1004 	 * For this bucket, the warp window was opened sometime in the past but has now
1005 	 * expired. Mark the bucket as not available for warp anymore and re-run the
1006 	 * warp bucket selection logic.
1007 	 */
1008 	warp_bucket->scrb_warp_remaining = 0;
1009 	if (!prev_bucket_warping) {
1010 		if (warp_bucket->scrb_bound) {
1011 			bitmap_clear(root_clutch->scr_bound_warp_available, warp_bucket->scrb_bucket);
1012 		} else {
1013 			bitmap_clear(root_clutch->scr_unbound_warp_available, warp_bucket->scrb_bucket);
1014 		}
1015 	}
1016 	bit_set(debug_info->trace_data.warp_window_close, warp_bucket->scrb_bound * TH_BUCKET_SCHED_MAX + warp_bucket->scrb_bucket);
1017 	goto evaluate_root_buckets;
1018 }
1019 
1020 /*
1021  * sched_clutch_root_bucket_deadline_calculate()
1022  *
1023  * Calculate the deadline for the bucket based on its WCEL
1024  */
1025 static uint64_t
1026 sched_clutch_root_bucket_deadline_calculate(
1027 	sched_clutch_root_bucket_t root_bucket,
1028 	uint64_t timestamp)
1029 {
1030 	/* For fixpri AboveUI bucket always return it as the earliest deadline */
1031 	if (root_bucket->scrb_bucket < TH_BUCKET_SHARE_FG) {
1032 		return 0;
1033 	}
1034 
1035 	/* For all timeshare buckets set the deadline as current time + worst-case-execution-latency */
1036 	return timestamp + sched_clutch_root_bucket_wcel[root_bucket->scrb_bucket];
1037 }
1038 
1039 /*
1040  * sched_clutch_root_bucket_deadline_update()
1041  *
1042  * Routine to update the deadline of the root bucket when it is selected.
1043  * Updating the deadline also moves the root_bucket in the EDF priority
1044  * queue.
1045  */
1046 static void
1047 sched_clutch_root_bucket_deadline_update(
1048 	sched_clutch_root_bucket_t root_bucket,
1049 	sched_clutch_root_t root_clutch,
1050 	uint64_t timestamp,
1051 	bool bucket_is_enqueued)
1052 {
1053 	if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
1054 		/* The algorithm never uses the deadlines for scheduling TH_BUCKET_FIXPRI bucket */
1055 		return;
1056 	}
1057 
1058 	uint64_t old_deadline = root_bucket->scrb_pqlink.deadline;
1059 	uint64_t new_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
1060 	if (__improbable(old_deadline > new_deadline)) {
1061 		panic("old_deadline (%llu) > new_deadline (%llu); root_bucket (%d); timestamp (%llu)", old_deadline, new_deadline, root_bucket->scrb_bucket, timestamp);
1062 	}
1063 	if (old_deadline != new_deadline) {
1064 		root_bucket->scrb_pqlink.deadline = new_deadline;
1065 		if (bucket_is_enqueued) {
1066 			struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
1067 			priority_queue_entry_increased(prioq, &root_bucket->scrb_pqlink);
1068 		}
1069 	}
1070 }
1071 
1072 /*
1073  * sched_clutch_root_bucket_runnable()
1074  *
1075  * Routine to insert a newly runnable root bucket into the hierarchy.
1076  * Also updates the deadline and warp parameters as necessary.
1077  */
1078 static void
1079 sched_clutch_root_bucket_runnable(
1080 	sched_clutch_root_bucket_t root_bucket,
1081 	sched_clutch_root_t root_clutch,
1082 	uint64_t timestamp)
1083 {
1084 	/* Mark the root bucket as runnable */
1085 	bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
1086 	bitmap_set(runnable_bitmap, root_bucket->scrb_bucket);
1087 
1088 	if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
1089 		/* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
1090 		return;
1091 	}
1092 
1093 	if (root_bucket->scrb_starvation_avoidance == false) {
1094 		/*
1095 		 * Only update the deadline if the bucket was not in starvation avoidance mode. If the bucket was in
1096 		 * starvation avoidance and its window has expired, the highest root bucket selection logic will notice
1097 		 * that and fix it up.
1098 		 */
1099 		root_bucket->scrb_pqlink.deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
1100 	}
1101 	struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
1102 	priority_queue_insert(prioq, &root_bucket->scrb_pqlink);
1103 	if (root_bucket->scrb_warp_remaining) {
1104 		/* Since the bucket has some warp remaining and its now runnable, mark it as available for warp */
1105 		bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
1106 		bitmap_set(warp_bitmap, root_bucket->scrb_bucket);
1107 	}
1108 }
1109 
1110 /*
1111  * sched_clutch_root_bucket_empty()
1112  *
1113  * Routine to remove an empty root bucket from the hierarchy.
1114  * Also updates the deadline and warp parameters as necessary.
1115  */
1116 static void
1117 sched_clutch_root_bucket_empty(
1118 	sched_clutch_root_bucket_t root_bucket,
1119 	sched_clutch_root_t root_clutch,
1120 	uint64_t timestamp)
1121 {
1122 	bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
1123 	bitmap_clear(runnable_bitmap, root_bucket->scrb_bucket);
1124 
1125 	if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
1126 		/* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
1127 		return;
1128 	}
1129 
1130 	struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
1131 	priority_queue_remove(prioq, &root_bucket->scrb_pqlink);
1132 
1133 	bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
1134 	bitmap_clear(warp_bitmap, root_bucket->scrb_bucket);
1135 
1136 	if (root_bucket->scrb_warped_deadline != SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
1137 		if (root_bucket->scrb_warped_deadline > timestamp) {
1138 			/*
1139 			 * For root buckets that were using the warp, check if the warp
1140 			 * deadline is in the future. If so, subtract the wall time for which the
1141 			 * warp was active and update the remaining warp. This allows
1142 			 * the root bucket to use the remaining warp the next time it
1143 			 * becomes runnable.
1144 			 */
1145 			root_bucket->scrb_warp_remaining = root_bucket->scrb_warped_deadline - timestamp;
1146 		} else {
1147 			/*
1148 			 * If the root bucket's warped deadline is in the past, it has used up
1149 			 * all the warp it was assigned. Empty out its warp remaining.
1150 			 */
1151 			root_bucket->scrb_warp_remaining = 0;
1152 		}
1153 	}
1154 }
1155 
1156 static int
1157 sched_clutch_global_bucket_load_get(
1158 	sched_bucket_t bucket)
1159 {
1160 	return (int)os_atomic_load(&sched_clutch_global_bucket_load[bucket], relaxed);
1161 }
1162 
1163 /*
1164  * sched_clutch_root_pri_update()
1165  *
1166  * The root level priority is used for thread selection and preemption
1167  * logic.
1168  *
1169  * The logic uses the same decision as thread selection for deciding between the
1170  * above UI and timeshare buckets. If one of the timesharing buckets has to be
1171  * used for priority calculation, the logic is slightly different from thread
1172  * selection, because thread selection considers deadlines, warps etc. to
1173  * decide the most optimal bucket at a given timestamp. Since the priority
1174  * value is used for preemption decisions only, it needs to be based on the
1175  * highest runnable thread available in the timeshare domain. This logic can
1176  * be made more sophisticated if there are cases of unnecessary preemption
1177  * being seen in workloads.
1178  */
1179 static void
1180 sched_clutch_root_pri_update(
1181 	sched_clutch_root_t root_clutch)
1182 {
1183 	sched_clutch_hierarchy_locked_assert(root_clutch);
1184 	int16_t root_bound_pri = NOPRI;
1185 	int16_t root_unbound_pri = NOPRI;
1186 
1187 	/* Consider bound root buckets */
1188 	if (bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
1189 		goto root_pri_update_unbound;
1190 	}
1191 	sched_clutch_root_bucket_t highest_bound_root_bucket = NULL;
1192 	__unused int highest_bound_root_bucket_pri = -1;
1193 	bool highest_bound_root_bucket_is_fixpri = false;
1194 	sched_clutch_root_bound_select_aboveui(root_clutch, &highest_bound_root_bucket, &highest_bound_root_bucket_pri, &highest_bound_root_bucket_is_fixpri, NULL, NULL);
1195 	if (highest_bound_root_bucket_is_fixpri == false) {
1196 		int root_bucket_index = bitmap_lsb_next(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
1197 		assert(root_bucket_index != -1);
1198 		highest_bound_root_bucket = &root_clutch->scr_bound_buckets[root_bucket_index];
1199 	}
1200 	root_bound_pri = highest_bound_root_bucket->scrb_bound_thread_runq.highq;
1201 
1202 root_pri_update_unbound:
1203 	/* Consider unbound root buckets */
1204 	if (bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
1205 		goto root_pri_update_complete;
1206 	}
1207 	sched_clutch_root_bucket_t highest_unbound_root_bucket = NULL;
1208 	__unused int highest_unbound_root_bucket_pri = -1;
1209 	bool highest_unbound_root_bucket_is_fixpri = false;
1210 	sched_clutch_root_unbound_select_aboveui(root_clutch, &highest_unbound_root_bucket, &highest_unbound_root_bucket_pri, &highest_unbound_root_bucket_is_fixpri, NULL, NULL);
1211 	if (highest_unbound_root_bucket_is_fixpri == false) {
1212 		int root_bucket_index = bitmap_lsb_next(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
1213 		assert(root_bucket_index != -1);
1214 		highest_unbound_root_bucket = &root_clutch->scr_unbound_buckets[root_bucket_index];
1215 	}
1216 
1217 	/* For the selected root bucket, find the highest priority clutch bucket */
1218 	sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, highest_unbound_root_bucket, NULL, NULL, NULL);
1219 	root_unbound_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
1220 
1221 root_pri_update_complete:
1222 	root_clutch->scr_priority = MAX(root_bound_pri, root_unbound_pri);
1223 }
1224 
1225 /*
1226  * sched_clutch_root_urgency_inc()
1227  *
1228  * Routine to increment the urgency at the root level based on the thread
1229  * priority that is being inserted into the hierarchy. The root urgency
1230  * counter is updated based on the urgency of threads in any of the
1231  * clutch buckets which are part of the hierarchy.
1232  *
1233  * Always called with the pset lock held.
1234  */
1235 static void
1236 sched_clutch_root_urgency_inc(
1237 	sched_clutch_root_t root_clutch,
1238 	thread_t thread)
1239 {
1240 	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
1241 		root_clutch->scr_urgency++;
1242 	}
1243 }
1244 
1245 /*
1246  * sched_clutch_root_urgency_dec()
1247  *
1248  * Routine to decrement the urgency at the root level based on the thread
1249  * priority that is being removed from the hierarchy. The root urgency
1250  * counter is updated based on the urgency of threads in any of the
1251  * clutch buckets which are part of the hierarchy.
1252  *
1253  * Always called with the pset lock held.
1254  */
1255 static void
1256 sched_clutch_root_urgency_dec(
1257 	sched_clutch_root_t root_clutch,
1258 	thread_t thread)
1259 {
1260 	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
1261 		root_clutch->scr_urgency--;
1262 	}
1263 }
1264 
1265 /*
1266  * Clutch bucket level scheduling
1267  *
1268  * The second level of scheduling is the clutch bucket level scheduling
1269  * which tries to schedule thread groups within root_buckets. Each
1270  * clutch represents a thread group and a clutch_bucket_group represents
1271  * threads at a particular sched_bucket within that thread group. The
1272  * clutch_bucket_group contains a clutch_bucket per cluster on the system
1273  * where it holds the runnable threads destined for execution on that
1274  * cluster.
1275  *
1276  * The goal of this level of scheduling is to allow interactive thread
1277  * groups low latency access to the CPU. It also provides slight
1278  * scheduling preference for App and unrestricted thread groups.
1279  *
1280  * The clutch bucket scheduling algorithm measures an interactivity
1281  * score for all clutch bucket groups. The interactivity score is based
1282  * on the ratio of the CPU used and the voluntary blocking of threads
1283  * within the clutch bucket group. The algorithm is very close to the ULE
1284  * scheduler on FreeBSD in terms of calculations. The interactivity
1285  * score provides an interactivity boost in the range of
1286  * [0:SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI * 2] which allows interactive
1287  * thread groups to win over CPU spinners.
1288  *
1289  * The interactivity score of the clutch bucket group is combined with the
1290  * highest base/promoted priority of threads in the clutch bucket to form
1291  * the overall priority of the clutch bucket.
1292  */
1293 
1294 /* Priority boost range for interactivity */
1295 #define SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT     (8)
1296 static uint8_t sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
1297 
1298 /* Window to scale the CPU usage and blocked values (currently 500ms); it is the threshold for used+blocked */
1299 static uint64_t sched_clutch_bucket_group_adjust_threshold = 0;
1300 #define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS      (500000)
1301 
1302 /* The ratio to scale the cpu/blocked time per window */
1303 #define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO                (10)
1304 
1305 /* Initial value for voluntary blocking time for the clutch_bucket */
1306 #define SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID          (uint64_t)(~0)
1307 
1308 /* Value indicating the clutch bucket is not pending execution */
1309 #define SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID             ((uint64_t)(~0))
1310 
1311 /*
1312  * Thread group CPU starvation avoidance
1313  *
1314  * In heavily CPU contended scenarios, it is possible that some thread groups
1315  * which have a low interactivity score do not get CPU time at all. In order to
1316  * resolve that, the scheduler tries to ageout the CPU usage of the clutch
1317  * bucket group when it has been pending execution for a certain time as defined
1318  * by the sched_clutch_bucket_group_pending_delta_us values below.
1319  *
1320  * The values chosen here are very close to the WCEL values for each sched bucket.
1321  * These values are added into the pending interval used to determine how
1322  * frequently we will ageout the CPU usage, ensuring a reasonable limit on the
1323  * frequency.
1324  */
1325 static uint32_t sched_clutch_bucket_group_pending_delta_us[TH_BUCKET_SCHED_MAX] = {
1326 	SCHED_CLUTCH_INVALID_TIME_32,           /* FIXPRI */
1327 	10000,                                  /* FG */
1328 	37500,                                  /* IN */
1329 	75000,                                  /* DF */
1330 	150000,                                 /* UT */
1331 	250000,                                 /* BG */
1332 };
1333 static uint64_t sched_clutch_bucket_group_pending_delta[TH_BUCKET_SCHED_MAX] = {0};
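/*
 * Illustrative sketch (added note, not part of the original source): assuming
 * the ageout logic divides the time a clutch bucket group has been pending by
 * the per-bucket delta above, the number of pending intervals would be
 * computed roughly as:
 *
 *     pending_intervals = (timestamp - scbg_pending_data.scct_timestamp) /
 *                          sched_clutch_bucket_group_pending_delta[bucket];
 *
 * Under that reading, a TH_BUCKET_SHARE_FG group pending for ~30ms would see
 * 3 intervals, and (per sched_clutch_bucket_group_cpu_pending_adjust() below)
 * each interval is worth one point of interactivity score.
 */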
1334 
1335 /*
1336  * sched_clutch_bucket_init()
1337  *
1338  * Initializer for clutch buckets.
1339  */
1340 static void
1341 sched_clutch_bucket_init(
1342 	sched_clutch_bucket_t clutch_bucket,
1343 	sched_clutch_bucket_group_t clutch_bucket_group,
1344 	sched_bucket_t bucket)
1345 {
1346 	clutch_bucket->scb_bucket = bucket;
1347 	/* scb_priority will be recalculated when a thread is inserted in the clutch bucket */
1348 	clutch_bucket->scb_priority = 0;
1349 #if CONFIG_SCHED_EDGE
1350 	clutch_bucket->scb_foreign = false;
1351 	priority_queue_entry_init(&clutch_bucket->scb_foreignlink);
1352 #endif /* CONFIG_SCHED_EDGE */
1353 	clutch_bucket->scb_group = clutch_bucket_group;
1354 	clutch_bucket->scb_root = NULL;
1355 	priority_queue_init(&clutch_bucket->scb_clutchpri_prioq);
1356 	priority_queue_init(&clutch_bucket->scb_thread_runq);
1357 	queue_init(&clutch_bucket->scb_thread_timeshare_queue);
1358 }
1359 
1360 /*
1361  * sched_clutch_bucket_group_init()
1362  *
1363  * Initializer for clutch bucket groups.
1364  */
1365 static void
1366 sched_clutch_bucket_group_init(
1367 	sched_clutch_bucket_group_t clutch_bucket_group,
1368 	sched_clutch_t clutch,
1369 	sched_bucket_t bucket)
1370 {
1371 	bzero(clutch_bucket_group, sizeof(struct sched_clutch_bucket_group));
1372 	clutch_bucket_group->scbg_bucket = bucket;
1373 	clutch_bucket_group->scbg_clutch = clutch;
1374 
1375 	int max_clusters = ml_get_cluster_count();
1376 	clutch_bucket_group->scbg_clutch_buckets = kalloc_type(struct sched_clutch_bucket, max_clusters, Z_WAITOK | Z_ZERO);
1377 	for (int i = 0; i < max_clusters; i++) {
1378 		sched_clutch_bucket_init(&clutch_bucket_group->scbg_clutch_buckets[i], clutch_bucket_group, bucket);
1379 	}
1380 
1381 	os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, 0, relaxed);
1382 	os_atomic_store(&clutch_bucket_group->scbg_pri_shift, INT8_MAX, relaxed);
1383 	os_atomic_store(&clutch_bucket_group->scbg_preferred_cluster, pset0.pset_cluster_id, relaxed);
1384 	/*
1385 	 * All thread groups should be initialized to be interactive; this allows the newly launched
1386 	 * thread groups to fairly compete with already running thread groups.
1387 	 */
1388 	clutch_bucket_group->scbg_interactivity_data.scct_count = (sched_clutch_bucket_group_interactive_pri * 2);
1389 	clutch_bucket_group->scbg_interactivity_data.scct_timestamp = 0;
1390 	os_atomic_store(&clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked, (clutch_cpu_data_t)sched_clutch_bucket_group_adjust_threshold, relaxed);
1391 	clutch_bucket_group->scbg_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
1392 	clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
1393 	clutch_bucket_group->scbg_amp_rebalance_last_chosen = UINT32_MAX;
1394 }
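/*
 * Added note (illustrative, not from the original source): with the default
 * sched_clutch_bucket_group_interactive_pri of 8, scbg_interactivity_data
 * starts at 16, i.e. the top of the [0 : interactive_pri * 2] boost range, so
 * a newly launched thread group competes on equal terms with long-running
 * groups that have already earned a high interactivity score.
 */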
1395 
1396 static void
1397 sched_clutch_bucket_group_destroy(
1398 	sched_clutch_bucket_group_t clutch_bucket_group)
1399 {
1400 	kfree_type(struct sched_clutch_bucket, ml_get_cluster_count(),
1401 	    clutch_bucket_group->scbg_clutch_buckets);
1402 }
1403 
1404 /*
1405  * sched_clutch_init_with_thread_group()
1406  *
1407  * Initialize the sched_clutch when the thread group is being created
1408  */
1409 void
1410 sched_clutch_init_with_thread_group(
1411 	sched_clutch_t clutch,
1412 	struct thread_group *tg)
1413 {
1414 	os_atomic_store(&clutch->sc_thr_count, 0, relaxed);
1415 
1416 	/* Initialize all the clutch buckets */
1417 	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
1418 		sched_clutch_bucket_group_init(&(clutch->sc_clutch_groups[i]), clutch, i);
1419 	}
1420 
1421 	/* Grouping specific fields */
1422 	clutch->sc_tg = tg;
1423 }
1424 
1425 /*
1426  * sched_clutch_destroy()
1427  *
1428  * Destructor for clutch; called from thread group release code.
1429  */
1430 void
1431 sched_clutch_destroy(
1432 	sched_clutch_t clutch)
1433 {
1434 	assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0);
1435 	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
1436 		sched_clutch_bucket_group_destroy(&(clutch->sc_clutch_groups[i]));
1437 	}
1438 }
1439 
1440 #if CONFIG_SCHED_EDGE
1441 
1442 /*
1443  * Edge Scheduler Preferred Cluster Mechanism
1444  *
1445  * In order to have better control over various QoS buckets within a thread group, the Edge
1446  * scheduler allows CLPC to specify a preferred cluster for each QoS level in a TG. These
1447  * preferences are stored at the sched_clutch_bucket_group level since that represents all
1448  * threads at a particular QoS level within a sched_clutch. For any lookup of preferred
1449  * cluster, the logic always goes back to the preference stored at the clutch_bucket_group.
1450  */
1451 
1452 static uint32_t
1453 sched_edge_clutch_bucket_group_preferred_cluster(sched_clutch_bucket_group_t clutch_bucket_group)
1454 {
1455 	return os_atomic_load(&clutch_bucket_group->scbg_preferred_cluster, relaxed);
1456 }
1457 
1458 static uint32_t
1459 sched_clutch_bucket_preferred_cluster(sched_clutch_bucket_t clutch_bucket)
1460 {
1461 	return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket->scb_group);
1462 }
1463 
1464 uint32_t
1465 sched_edge_thread_preferred_cluster(thread_t thread)
1466 {
1467 	if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1468 		/* For threads bound to a specific cluster, return the bound cluster id */
1469 		return sched_edge_thread_bound_cluster_id(thread);
1470 	}
1471 
1472 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
1473 	sched_bucket_t sched_bucket = thread->th_sched_bucket;
1474 	if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
1475 		sched_bucket = sched_clutch_thread_bucket_map(thread, thread->base_pri);
1476 	}
1477 	sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[sched_bucket];
1478 	return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket_group);
1479 }
1480 
1481 /*
1482  * Edge Scheduler Foreign Bucket Support
1483  *
1484  * In the Edge Scheduler, each cluster maintains a priority queue of clutch buckets containing
1485  * threads that are not native to the cluster. A clutch bucket is considered native if its
1486  * preferred cluster has the same type as the cluster its enqueued in. The foreign clutch
1487  * bucket priority queue is used for rebalance operations to get threads back to their native
1488  * cluster quickly.
1489  *
1490  * It is possible to make this policy even more aggressive by considering all clusters that
1491  * are not the preferred cluster as the foreign cluster, but that would mean a lot of thread
1492  * migrations which might have performance implications.
1493  */
1494 
1495 static void
1496 sched_clutch_bucket_mark_native(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
1497 {
1498 	if (clutch_bucket->scb_foreign) {
1499 		clutch_bucket->scb_foreign = false;
1500 		priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1501 	}
1502 }
1503 
1504 static void
1505 sched_clutch_bucket_mark_foreign(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
1506 {
1507 	if (!clutch_bucket->scb_foreign) {
1508 		clutch_bucket->scb_foreign = true;
1509 		priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
1510 		priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1511 	}
1512 }
1513 
1514 /*
1515  * Edge Scheduler Cumulative Load Average
1516  *
1517  * The Edge scheduler maintains a per-QoS/scheduling bucket load average for
1518  * making thread migration decisions. The per-bucket load is maintained as a
1519  * cumulative count since higher scheduling buckets impact load on lower buckets
1520  * for thread migration decisions.
1521  *
1522  */
1523 
1524 static void
1525 sched_edge_cluster_cumulative_count_incr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1526 {
1527 	switch (bucket) {
1528 	case TH_BUCKET_FIXPRI:    os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1529 	case TH_BUCKET_SHARE_FG:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1530 	case TH_BUCKET_SHARE_IN:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1531 	case TH_BUCKET_SHARE_DF:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1532 	case TH_BUCKET_SHARE_UT:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1533 	case TH_BUCKET_SHARE_BG:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1534 	default:
1535 		panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_incr()");
1536 	}
1537 }
1538 
1539 static void
1540 sched_edge_cluster_cumulative_count_decr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1541 {
1542 	switch (bucket) {
1543 	case TH_BUCKET_FIXPRI:    os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1544 	case TH_BUCKET_SHARE_FG:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1545 	case TH_BUCKET_SHARE_IN:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1546 	case TH_BUCKET_SHARE_DF:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1547 	case TH_BUCKET_SHARE_UT:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1548 	case TH_BUCKET_SHARE_BG:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1549 	default:
1550 		panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_decr()");
1551 	}
1552 }
1553 
1554 uint16_t
1555 sched_edge_cluster_cumulative_count(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1556 {
1557 	return os_atomic_load(&root_clutch->scr_cumulative_run_count[bucket], relaxed);
1558 }
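/*
 * Worked example (added for illustration): because the increment/decrement
 * routines above fall through to every lower bucket, a cluster with one
 * runnable TH_BUCKET_SHARE_FG thread and two TH_BUCKET_SHARE_UT threads
 * reports cumulative run counts of FG=1, IN=1, DF=1, UT=3, BG=3. A reader of
 * the UT count therefore sees the load imposed by all higher buckets as well.
 */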
1559 
1560 #endif /* CONFIG_SCHED_EDGE */
1561 
1562 /*
1563  * sched_clutch_bucket_hierarchy_insert()
1564  *
1565  * Routine to insert a newly runnable clutch_bucket into the root hierarchy.
1566  */
1567 static void
1568 sched_clutch_bucket_hierarchy_insert(
1569 	sched_clutch_root_t root_clutch,
1570 	sched_clutch_bucket_t clutch_bucket,
1571 	sched_bucket_t bucket,
1572 	uint64_t timestamp,
1573 	sched_clutch_bucket_options_t options)
1574 {
1575 	sched_clutch_hierarchy_locked_assert(root_clutch);
1576 	if (bucket > TH_BUCKET_FIXPRI) {
1577 		/* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */
1578 		enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink);
1579 	}
1580 #if CONFIG_SCHED_EDGE
1581 	/* Check if the bucket is a foreign clutch bucket and add it to the foreign buckets list */
1582 	uint32_t preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
1583 	if (pset_type_for_id(preferred_cluster) != pset_type_for_id(root_clutch->scr_cluster_id)) {
1584 		sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
1585 	}
1586 #endif /* CONFIG_SCHED_EDGE */
1587 	sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1588 
1589 	/* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */
1590 	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1591 		sched_clutch_root_bucket_runnable(root_bucket, root_clutch, timestamp);
1592 	}
1593 
1594 	/* Insert the clutch bucket into the root bucket run queue with order based on options */
1595 	sched_clutch_bucket_runq_enqueue(&root_bucket->scrb_clutch_buckets, clutch_bucket, options);
1596 	os_atomic_store(&clutch_bucket->scb_root, root_clutch, relaxed);
1597 	os_atomic_inc(&sched_clutch_global_bucket_load[bucket], relaxed);
1598 }
1599 
1600 /*
1601  * sched_clutch_bucket_hierarchy_remove()
1602  *
1603  * Routine to remove an empty clutch bucket from the root hierarchy.
1604  */
1605 static void
1606 sched_clutch_bucket_hierarchy_remove(
1607 	sched_clutch_root_t root_clutch,
1608 	sched_clutch_bucket_t clutch_bucket,
1609 	sched_bucket_t bucket,
1610 	uint64_t timestamp,
1611 	__unused sched_clutch_bucket_options_t options)
1612 {
1613 	sched_clutch_hierarchy_locked_assert(root_clutch);
1614 	if (bucket > TH_BUCKET_FIXPRI) {
1615 		/* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */
1616 		remqueue(&clutch_bucket->scb_listlink);
1617 	}
1618 #if CONFIG_SCHED_EDGE
1619 	sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
1620 #endif /* CONFIG_SCHED_EDGE */
1621 
1622 	sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1623 
1624 	/* Remove the clutch bucket from the root bucket priority queue */
1625 	sched_clutch_bucket_runq_remove(&root_bucket->scrb_clutch_buckets, clutch_bucket);
1626 	os_atomic_store(&clutch_bucket->scb_root, NULL, relaxed);
1627 
1628 	/* If the root bucket priority queue is now empty, remove it from the root priority queue */
1629 	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1630 		sched_clutch_root_bucket_empty(root_bucket, root_clutch, timestamp);
1631 	}
1632 	os_atomic_dec(&sched_clutch_global_bucket_load[bucket], relaxed);
1633 }
1634 
1635 /*
1636  * sched_clutch_bucket_base_pri()
1637  *
1638  * Calculates the "base" priority of the clutch bucket, which is equal to the max of the
1639  * highest base_pri and the highest sched_pri in the clutch bucket.
1640  */
1641 static uint8_t
1642 sched_clutch_bucket_base_pri(
1643 	sched_clutch_bucket_t clutch_bucket)
1644 {
1645 	assert(priority_queue_empty(&clutch_bucket->scb_thread_runq) == false);
1646 	/*
1647 	 * Since the clutch bucket can contain threads that are members of the group due
1648 	 * to the sched_pri being promoted or due to their base pri, the base priority of
1649 	 * the entire clutch bucket should be based on the highest thread (promoted or base)
1650 	 * in the clutch bucket.
1651 	 */
1652 	uint8_t max_pri = 0;
1653 	if (!priority_queue_empty(&clutch_bucket->scb_clutchpri_prioq)) {
1654 		max_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
1655 	}
1656 	return max_pri;
1657 }
1658 
1659 /*
1660  * sched_clutch_interactivity_from_cpu_data()
1661  *
1662  * Routine to calculate the interactivity score of a clutch bucket group from its CPU usage
1663  */
1664 static uint8_t
1665 sched_clutch_interactivity_from_cpu_data(sched_clutch_bucket_group_t clutch_bucket_group)
1666 {
1667 	sched_clutch_bucket_cpu_data_t scb_cpu_data;
1668 	scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, relaxed);
1669 	clutch_cpu_data_t cpu_used = scb_cpu_data.cpu_data.scbcd_cpu_used;
1670 	clutch_cpu_data_t cpu_blocked = scb_cpu_data.cpu_data.scbcd_cpu_blocked;
1671 	uint8_t interactive_score = 0;
1672 
1673 	if ((cpu_blocked == 0) && (cpu_used == 0)) {
1674 		return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
1675 	}
1676 	/*
1677 	 * For all timeshare buckets, calculate the interactivity score of the bucket
1678 	 * and add it to the base priority
1679 	 */
1680 	if (cpu_blocked > cpu_used) {
1681 		/* Interactive clutch_bucket case */
1682 		interactive_score = sched_clutch_bucket_group_interactive_pri +
1683 		    ((sched_clutch_bucket_group_interactive_pri * (cpu_blocked - cpu_used)) / cpu_blocked);
1684 	} else {
1685 		/* Non-interactive clutch_bucket case */
1686 		interactive_score = ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) / cpu_used);
1687 	}
1688 	return interactive_score;
1689 }
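/*
 * Worked example (added for illustration, assuming the default interactive
 * pri of 8): a group that blocked for 300 units and used 100 units of CPU
 * takes the interactive branch and scores 8 + (8 * (300 - 100)) / 300 = 13,
 * close to the maximum boost of 16. A group that used 400 units and blocked
 * only 50 takes the non-interactive branch and scores (8 * 50) / 400 = 1, so
 * CPU spinners decay towards zero boost.
 */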
1690 
1691 /*
1692  * sched_clutch_bucket_pri_calculate()
1693  *
1694  * The priority calculation algorithm for the clutch_bucket is a slight
1695  * modification on the ULE interactivity score. It uses the base priority
1696  * of the clutch bucket and applies an interactivity score boost to the
1697  * highly responsive clutch buckets.
1698  */
1699 static uint8_t
1700 sched_clutch_bucket_pri_calculate(
1701 	sched_clutch_bucket_t clutch_bucket,
1702 	uint64_t timestamp)
1703 {
1704 	/* For empty clutch buckets, return priority 0 */
1705 	if (clutch_bucket->scb_thr_count == 0) {
1706 		return 0;
1707 	}
1708 
1709 	uint8_t base_pri = sched_clutch_bucket_base_pri(clutch_bucket);
1710 	uint8_t interactive_score = sched_clutch_bucket_group_interactivity_score_calculate(clutch_bucket->scb_group, timestamp);
1711 
1712 	assert(((uint64_t)base_pri + interactive_score) <= UINT8_MAX);
1713 	uint8_t pri = base_pri + interactive_score;
1714 	if (pri != clutch_bucket->scb_priority) {
1715 		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_PRI) | DBG_FUNC_NONE,
1716 		    thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, pri, interactive_score, 0);
1717 	}
1718 	return pri;
1719 }
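/*
 * Added note (illustrative): if the highest base/promoted priority in the
 * clutch bucket is 37 and the group's interactivity score evaluates to 13,
 * the clutch bucket priority becomes 50. That value determines the clutch
 * bucket's position among its peers in the root bucket's runq (see
 * sched_clutch_bucket_update() below).
 */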
1720 
1721 /*
1722  * sched_clutch_root_bucket_highest_clutch_bucket()
1723  *
1724  * Routine to find the highest priority clutch bucket
1725  * within the root bucket.
1726  */
1727 static sched_clutch_bucket_t
1728 sched_clutch_root_bucket_highest_clutch_bucket(
1729 	sched_clutch_root_t root_clutch,
1730 	sched_clutch_root_bucket_t root_bucket,
1731 	processor_t _Nullable processor,
1732 	thread_t _Nullable prev_thread,
1733 	bool *_Nullable chose_prev_thread)
1734 {
1735 	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1736 		if (prev_thread != NULL) {
1737 			*chose_prev_thread = true;
1738 			return sched_clutch_bucket_for_thread(root_clutch, prev_thread);
1739 		}
1740 		return NULL;
1741 	}
1742 	sched_clutch_bucket_t clutch_bucket = sched_clutch_bucket_runq_peek(&root_bucket->scrb_clutch_buckets);
1743 	/* Consider the Clutch bucket of the previous thread */
1744 	if (prev_thread != NULL) {
1745 		assert(chose_prev_thread != NULL);
1746 		sched_clutch_bucket_group_t prev_clutch_bucket_group = sched_clutch_bucket_group_for_thread(prev_thread);
1747 		int prev_clutch_bucket_pri = prev_thread->sched_pri + (int)(os_atomic_load(&prev_clutch_bucket_group->scbg_interactivity_data.scct_count, relaxed));
1748 		sched_clutch_bucket_t prev_clutch_bucket = sched_clutch_bucket_for_thread(root_clutch, prev_thread);
1749 		if (prev_clutch_bucket != clutch_bucket &&
1750 		    sched_clutch_pri_greater_than_tiebreak(prev_clutch_bucket_pri, clutch_bucket->scb_priority, processor->first_timeslice)) {
1751 			*chose_prev_thread = true;
1752 			return prev_clutch_bucket;
1753 		}
1754 	}
1755 	return clutch_bucket;
1756 }
1757 
1758 /*
1759  * sched_clutch_bucket_runnable()
1760  *
1761  * Perform all operations needed when a new clutch bucket becomes runnable.
1762  * It involves inserting the clutch_bucket into the hierarchy and updating the
1763  * root priority appropriately.
1764  */
1765 static boolean_t
1766 sched_clutch_bucket_runnable(
1767 	sched_clutch_bucket_t clutch_bucket,
1768 	sched_clutch_root_t root_clutch,
1769 	uint64_t timestamp,
1770 	sched_clutch_bucket_options_t options)
1771 {
1772 	sched_clutch_hierarchy_locked_assert(root_clutch);
1773 	/* Since the clutch bucket became newly runnable, update its pending timestamp */
1774 	clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1775 	sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1776 
1777 	/* Update the timesharing properties of this clutch_bucket; also done every sched_tick */
1778 	sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, timestamp);
1779 	int16_t root_old_pri = root_clutch->scr_priority;
1780 	sched_clutch_root_pri_update(root_clutch);
1781 	return root_clutch->scr_priority > root_old_pri;
1782 }
1783 
1784 /*
1785  * sched_clutch_bucket_update()
1786  *
1787  * Update the clutch_bucket's position in the hierarchy. This routine is
1788  * called when a new thread is inserted or removed from a runnable clutch
1789  * bucket. The options specify some properties about the clutch bucket
1790  * insertion order into the clutch bucket runq.
1791  */
1792 static boolean_t
1793 sched_clutch_bucket_update(
1794 	sched_clutch_bucket_t clutch_bucket,
1795 	sched_clutch_root_t root_clutch,
1796 	uint64_t timestamp,
1797 	sched_clutch_bucket_options_t options)
1798 {
1799 	sched_clutch_hierarchy_locked_assert(root_clutch);
1800 	uint64_t new_pri = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1801 	sched_clutch_bucket_runq_t bucket_runq = &root_clutch->scr_unbound_buckets[clutch_bucket->scb_bucket].scrb_clutch_buckets;
1802 	if (new_pri == clutch_bucket->scb_priority) {
1803 		/*
1804 		 * If SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR is specified, move the clutch bucket
1805 		 * to the end of the runq. Typically used when a thread is selected for execution
1806 		 * from a clutch bucket.
1807 		 */
1808 		if (options & SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR) {
1809 			sched_clutch_bucket_runq_rotate(bucket_runq, clutch_bucket);
1810 		}
1811 		return false;
1812 	}
1813 	sched_clutch_bucket_runq_remove(bucket_runq, clutch_bucket);
1814 #if CONFIG_SCHED_EDGE
1815 	if (clutch_bucket->scb_foreign) {
1816 		priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1817 	}
1818 #endif /* CONFIG_SCHED_EDGE */
1819 	clutch_bucket->scb_priority = new_pri;
1820 #if CONFIG_SCHED_EDGE
1821 	if (clutch_bucket->scb_foreign) {
1822 		priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
1823 		priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1824 	}
1825 #endif /* CONFIG_SCHED_EDGE */
1826 	sched_clutch_bucket_runq_enqueue(bucket_runq, clutch_bucket, options);
1827 
1828 	int16_t root_old_pri = root_clutch->scr_priority;
1829 	sched_clutch_root_pri_update(root_clutch);
1830 	return root_clutch->scr_priority > root_old_pri;
1831 }
1832 
1833 /*
1834  * sched_clutch_bucket_empty()
1835  *
1836  * Perform all the operations needed when a clutch_bucket is no longer runnable.
1837  * It involves removing the clutch bucket from the hierarchy and updating the root
1838  * priority appropriately.
1839  */
1840 static void
1841 sched_clutch_bucket_empty(
1842 	sched_clutch_bucket_t clutch_bucket,
1843 	sched_clutch_root_t root_clutch,
1844 	uint64_t timestamp,
1845 	sched_clutch_bucket_options_t options)
1846 {
1847 	sched_clutch_hierarchy_locked_assert(root_clutch);
1848 	sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1849 	clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1850 	sched_clutch_root_pri_update(root_clutch);
1851 }
1852 
1853 /*
1854  * sched_clutch_cpu_usage_update()
1855  *
1856  * Routine to update CPU usage of the thread in the hierarchy.
1857  */
1858 void
1859 sched_clutch_cpu_usage_update(
1860 	thread_t thread,
1861 	uint64_t delta)
1862 {
1863 	if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread) || SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1864 		return;
1865 	}
1866 
1867 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
1868 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
1869 	sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, delta);
1870 }
1871 
1872 /*
1873  * sched_clutch_bucket_group_cpu_usage_update()
1874  *
1875  * Routine to update the CPU usage of the clutch_bucket.
1876  */
1877 static void
1878 sched_clutch_bucket_group_cpu_usage_update(
1879 	sched_clutch_bucket_group_t clutch_bucket_group,
1880 	uint64_t delta)
1881 {
1882 	if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
1883 		/* Since the Above UI bucket always has the maximum interactivity score, nothing to do here */
1884 		return;
1885 	}
1886 	delta = MIN(delta, sched_clutch_bucket_group_adjust_threshold);
1887 	os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_used), (clutch_cpu_data_t)delta, relaxed);
1888 }
1889 
1890 /*
1891  * sched_clutch_bucket_group_cpu_pending_adjust()
1892  *
1893  * Routine to calculate the adjusted CPU usage value based on the pending intervals. The calculation is done
1894  * such that one "pending interval" provides one point improvement in interactivity score.
1895  */
1896 static inline uint64_t
1897 sched_clutch_bucket_group_cpu_pending_adjust(
1898 	uint64_t cpu_used,
1899 	uint64_t cpu_blocked,
1900 	uint8_t pending_intervals)
1901 {
1902 	uint64_t cpu_used_adjusted = 0;
1903 	if (cpu_blocked < cpu_used) {
1904 		cpu_used_adjusted = (sched_clutch_bucket_group_interactive_pri * cpu_blocked * cpu_used);
1905 		cpu_used_adjusted = cpu_used_adjusted / ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) + (cpu_used * pending_intervals));
1906 	} else {
1907 		uint64_t adjust_factor = (cpu_blocked * pending_intervals) / sched_clutch_bucket_group_interactive_pri;
1908 		cpu_used_adjusted = (adjust_factor > cpu_used) ? 0 : (cpu_used - adjust_factor);
1909 	}
1910 	return cpu_used_adjusted;
1911 }
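/*
 * Worked example (added for illustration, default interactive pri of 8):
 * with cpu_used = 400, cpu_blocked = 100 and pending_intervals = 2, the
 * first branch gives cpu_used_adjusted = (8 * 100 * 400) /
 * ((8 * 100) + (400 * 2)) = 200. The non-interactive score
 * (8 * cpu_blocked / cpu_used) then improves from 2 to 4, i.e. one point of
 * interactivity per pending interval, matching the intent described above.
 */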
1912 
1913 /*
1914  * sched_clutch_bucket_group_cpu_adjust()
1915  *
1916  * Routine to scale the cpu usage and blocked time once the sum gets bigger
1917  * than sched_clutch_bucket_group_adjust_threshold. Allows the values to remain
1918  * manageable and maintain the same ratio while allowing clutch buckets to
1919  * adjust behavior and reflect in the interactivity score in a reasonable
1920  * amount of time. Also adjusts the CPU usage based on pending_intervals
1921  * which allows ageout of CPU to avoid starvation in highly contended scenarios.
1922  */
1923 static void
1924 sched_clutch_bucket_group_cpu_adjust(
1925 	sched_clutch_bucket_group_t clutch_bucket_group,
1926 	uint8_t pending_intervals)
1927 {
1928 	sched_clutch_bucket_cpu_data_t old_cpu_data = {};
1929 	sched_clutch_bucket_cpu_data_t new_cpu_data = {};
1930 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, old_cpu_data.scbcd_cpu_data_packed, new_cpu_data.scbcd_cpu_data_packed, relaxed, {
1931 		clutch_cpu_data_t cpu_used = old_cpu_data.cpu_data.scbcd_cpu_used;
1932 		clutch_cpu_data_t cpu_blocked = old_cpu_data.cpu_data.scbcd_cpu_blocked;
1933 
1934 		if ((pending_intervals == 0) && (cpu_used + cpu_blocked) < sched_clutch_bucket_group_adjust_threshold) {
1935 		        /* No changes to the CPU used and blocked values */
1936 		        os_atomic_rmw_loop_give_up();
1937 		}
1938 		if ((cpu_used + cpu_blocked) >= sched_clutch_bucket_group_adjust_threshold) {
1939 		        /* Only keep the recent CPU history to better indicate how this TG has been behaving */
1940 		        cpu_used = cpu_used / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
1941 		        cpu_blocked = cpu_blocked / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
1942 		}
1943 		/* Use the shift passed in to ageout the CPU usage */
1944 		cpu_used = (clutch_cpu_data_t)sched_clutch_bucket_group_cpu_pending_adjust(cpu_used, cpu_blocked, pending_intervals);
1945 		new_cpu_data.cpu_data.scbcd_cpu_used = cpu_used;
1946 		new_cpu_data.cpu_data.scbcd_cpu_blocked = cpu_blocked;
1947 	});
1948 }
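/*
 * Added note (illustrative): the rescaling above preserves the used:blocked
 * ratio while bounding the magnitudes. For example, once used + blocked
 * crosses the adjust threshold with roughly 300ms used and 250ms blocked,
 * both are divided by SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO down to 30 and
 * 25, leaving the interactivity score unchanged while letting newer behavior
 * dominate the history within a reasonable amount of time.
 */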
1949 
1950 /*
1951  * Thread level scheduling algorithm
1952  *
1953  * The thread level scheduling algorithm uses the mach timeshare
1954  * decay based algorithm to achieve sharing between threads within the
1955  * same clutch bucket. The load/priority shifts etc. are all maintained
1956  * at the clutch bucket level and used for decay calculation of the
1957  * threads. The load sampling is still driven off the scheduler tick
1958  * for runnable clutch buckets (it does not use the new higher frequency
1959  * EWMA based load calculation). The idea is that the contention and load
1960  * within clutch_buckets should be limited enough to not see heavy decay
1961  * and timeshare effectively.
1962  */
1963 
1964 /*
1965  * sched_clutch_thread_run_bucket_incr() / sched_clutch_run_bucket_incr()
1966  *
1967  * Increment the run count for the clutch bucket associated with the
1968  * thread.
1969  */
1970 uint32_t
1971 sched_clutch_thread_run_bucket_incr(
1972 	thread_t thread,
1973 	sched_bucket_t bucket)
1974 {
1975 	if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1976 		return 0;
1977 	}
1978 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
1979 	return sched_clutch_run_bucket_incr(clutch, bucket);
1980 }
1981 
1982 static uint32_t
1983 sched_clutch_run_bucket_incr(
1984 	sched_clutch_t clutch,
1985 	sched_bucket_t bucket)
1986 {
1987 	assert(bucket != TH_BUCKET_RUN);
1988 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
1989 	return sched_clutch_bucket_group_run_count_inc(clutch_bucket_group);
1990 }
1991 
1992 /*
1993  * sched_clutch_thread_run_bucket_decr() / sched_clutch_run_bucket_decr()
1994  *
1995  * Decrement the run count for the clutch bucket associated with the
1996  * thread.
1997  */
1998 uint32_t
1999 sched_clutch_thread_run_bucket_decr(
2000 	thread_t thread,
2001 	sched_bucket_t bucket)
2002 {
2003 	if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2004 		return 0;
2005 	}
2006 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
2007 	return sched_clutch_run_bucket_decr(clutch, bucket);
2008 }
2009 
2010 static uint32_t
2011 sched_clutch_run_bucket_decr(
2012 	sched_clutch_t clutch,
2013 	sched_bucket_t bucket)
2014 {
2015 	assert(bucket != TH_BUCKET_RUN);
2016 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
2017 	return sched_clutch_bucket_group_run_count_dec(clutch_bucket_group);
2018 }
2019 
2020 /*
2021  * sched_clutch_bucket_group_timeshare_update()
2022  *
2023  * Routine to update the load and priority shift for the clutch_bucket_group
2024  * every sched_tick. For multi-cluster platforms, each QoS level will have multiple
2025  * clutch buckets with runnable threads in them. So it is important to maintain
2026  * the timesharing information at the clutch_bucket_group level instead of
2027  * individual clutch buckets (because the algorithm is trying to timeshare all
2028  * threads at the same QoS irrespective of which hierarchy they are enqueued in).
2029  *
2030  * The routine is called from the sched tick handling code to make sure this value
2031  * is updated at least once every sched tick. For clutch bucket groups which have
2032  * not been runnable for very long, the clutch_bucket_group maintains a "last
2033  * updated schedtick" parameter. As threads become runnable in the clutch bucket group,
2034  * if this value is outdated, the load and shifts are updated.
2035  *
2036  * Possible optimization:
2037  * - The current algorithm samples the load every sched tick (125ms).
2038  *   This is prone to spikes in runnable counts; if that turns out to be
2039  *   a problem, a simple solution would be to do the EWMA trick to sample
2040  *   load at every load_tick (30ms) and use the averaged value for the pri
2041  *   shift calculation.
2042  */
2043 static void
2044 sched_clutch_bucket_group_timeshare_update(
2045 	sched_clutch_bucket_group_t clutch_bucket_group,
2046 	sched_clutch_bucket_t clutch_bucket,
2047 	uint64_t ctime)
2048 {
2049 	if (clutch_bucket_group->scbg_bucket < TH_BUCKET_SHARE_FG) {
2050 		/* No timesharing needed for fixed priority Above UI threads */
2051 		return;
2052 	}
2053 
2054 	/*
2055 	 * Update the timeshare parameters for the clutch bucket group
2056 	 * if they haven't been updated in this tick.
2057 	 */
2058 	uint32_t sched_ts = os_atomic_load(&clutch_bucket_group->scbg_timeshare_tick, relaxed);
2059 	uint32_t current_sched_ts = sched_tick;
2060 	if (sched_ts < current_sched_ts) {
2061 		os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, current_sched_ts, relaxed);
2062 		/* NCPU wide workloads should not experience decay */
2063 		uint64_t bucket_group_run_count = os_atomic_load_wide(&clutch_bucket_group->scbg_blocked_data.scct_count, relaxed) - 1;
2064 		uint32_t bucket_group_load = (uint32_t)(bucket_group_run_count / processor_avail_count);
2065 		bucket_group_load = MIN(bucket_group_load, NRQS - 1);
2066 		uint32_t pri_shift = sched_fixed_shift - sched_load_shifts[bucket_group_load];
2067 		/* Ensure that the pri_shift value is reasonable */
2068 		pri_shift = (pri_shift > SCHED_PRI_SHIFT_MAX) ? INT8_MAX : pri_shift;
2069 		os_atomic_store(&clutch_bucket_group->scbg_pri_shift, pri_shift, relaxed);
2070 	}
2071 
2072 	/*
2073 	 * Update the clutch bucket priority; this allows clutch buckets that have been pending
2074 	 * for a long time to get an updated interactivity score.
2075 	 */
2076 	sched_clutch_bucket_update(clutch_bucket, clutch_bucket->scb_root, ctime, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
2077 }
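/*
 * Worked example (added for illustration with assumed numbers): a clutch
 * bucket group whose blocked-data run count is 17 on a system with 8
 * available processors yields bucket_group_load = (17 - 1) / 8 = 2 and a
 * priority shift of sched_fixed_shift - sched_load_shifts[2]. A run count no
 * greater than processor_avail_count computes a load of 0, which, per the
 * comment above, is meant to avoid decay for NCPU-wide workloads.
 */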
2078 
2079 /*
2080  * sched_clutch_thread_clutch_update()
2081  *
2082  * Routine called when the thread changes its thread group. The current
2083  * implementation relies on the fact that the thread group is changed only from
2084  * the context of the thread itself or when the thread is runnable but not in a
2085  * runqueue. Due to this fact, the thread group change causes only counter
2086  * updates in the old & new clutch buckets and no hierarchy changes. The routine
2087  * also attributes the CPU used so far to the old clutch.
2088  */
2089 void
2090 sched_clutch_thread_clutch_update(
2091 	thread_t thread,
2092 	sched_clutch_t old_clutch,
2093 	sched_clutch_t new_clutch)
2094 {
2095 	uint32_t cpu_delta;
2096 
2097 	if (old_clutch) {
2098 		assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
2099 
2100 		sched_clutch_run_bucket_decr(old_clutch, thread->th_sched_bucket);
2101 		/*
2102 		 * Calculate the CPU used by this thread in the old bucket and
2103 		 * add it to the old clutch bucket. This uses the same CPU usage
2104 		 * logic as update_priority etc.
2105 		 */
2106 		sched_tick_delta(thread, cpu_delta);
2107 		if (thread->pri_shift < INT8_MAX) {
2108 			thread->sched_usage += cpu_delta;
2109 		}
2110 		thread->cpu_delta += cpu_delta;
2111 		if (!SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
2112 			sched_clutch_bucket_group_t clutch_bucket_group = &(old_clutch->sc_clutch_groups[thread->th_sched_bucket]);
2113 			sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, cpu_delta);
2114 		}
2115 	}
2116 
2117 	if (new_clutch) {
2118 		sched_clutch_run_bucket_incr(new_clutch, thread->th_sched_bucket);
2119 	}
2120 }
2121 
2122 /* Thread Insertion/Removal/Selection routines */
2123 
2124 #if CONFIG_SCHED_EDGE
2125 
2126 /*
2127  * Edge Scheduler Bound Thread Support
2128  *
2129  * The edge scheduler allows threads to be bound to specific clusters. The scheduler
2130  * maintains a separate runq on the clutch root to hold these bound threads. These
2131  * bound threads count towards the root priority and thread count, but are ignored
2132  * for thread migration/steal decisions. Bound threads that are enqueued in the
2133  * separate runq have the th_bound_cluster_enqueued flag set to allow easy
2134  * removal.
2135  *
2136  * Bound Threads Timesharing
2137  * The bound threads share the timesharing properties of the clutch bucket group they are
2138  * part of. They contribute to the load and use priority shifts/decay values from the
2139  * clutch bucket group.
2140  */
2141 
2142 static boolean_t
2143 sched_edge_bound_thread_insert(
2144 	sched_clutch_root_t root_clutch,
2145 	thread_t thread,
2146 	integer_t options)
2147 {
2148 	/* Update the clutch runnable count and priority */
2149 	sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2150 	sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
2151 	if (root_bucket->scrb_bound_thread_runq.count == 0) {
2152 		sched_clutch_root_bucket_runnable(root_bucket, root_clutch, mach_absolute_time());
2153 	}
2154 
2155 	assert((thread->th_bound_cluster_enqueued) == false);
2156 	run_queue_enqueue(&root_bucket->scrb_bound_thread_runq, thread, options);
2157 	thread->th_bound_cluster_enqueued = true;
2158 
2159 	/* Increment the urgency counter for the root if necessary */
2160 	sched_clutch_root_urgency_inc(root_clutch, thread);
2161 
2162 	int16_t root_old_pri = root_clutch->scr_priority;
2163 	sched_clutch_root_pri_update(root_clutch);
2164 	return root_clutch->scr_priority > root_old_pri;
2165 }
2166 
2167 static void
2168 sched_edge_bound_thread_remove(
2169 	sched_clutch_root_t root_clutch,
2170 	thread_t thread)
2171 {
2172 	sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
2173 	assert((thread->th_bound_cluster_enqueued) == true);
2174 	run_queue_remove(&root_bucket->scrb_bound_thread_runq, thread);
2175 	thread->th_bound_cluster_enqueued = false;
2176 
2177 	/* Decrement the urgency counter for the root if necessary */
2178 	sched_clutch_root_urgency_dec(root_clutch, thread);
2179 
2180 	/* Update the clutch runnable count and priority */
2181 	sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
2182 	if (root_bucket->scrb_bound_thread_runq.count == 0) {
2183 		sched_clutch_root_bucket_empty(root_bucket, root_clutch, mach_absolute_time());
2184 	}
2185 	sched_clutch_root_pri_update(root_clutch);
2186 }
2187 
2188 /*
2189  * Edge Scheduler cluster shared resource threads load balancing
2190  *
2191  * The Edge scheduler attempts to load balance cluster shared resource intensive threads
2192  * across clusters in order to reduce contention on the shared resources. It achieves
2193  * that by maintaining the runnable and running shared resource load on each cluster
2194  * and balancing the load across multiple clusters.
2195  *
2196  * The current implementation for cluster shared resource load balancing looks at
2197  * the per-cluster load at thread runnable time to enqueue the thread in the appropriate
2198  * cluster. The thread is enqueued in the cluster bound runqueue to ensure idle CPUs
2199  * do not steal/rebalance shared resource threads. Some more details for the implementation:
2200  *
2201  * - When threads are tagged as shared resource, they go through the cluster selection logic
2202  *   which looks at cluster shared resource loads and picks a cluster accordingly. The thread is
2203  *   enqueued in the cluster bound runqueue.
2204  *
2205  * - When the threads start running and call avoid_processor, the load balancing logic will be
2206  *   invoked and cause the thread to be sent to a more preferred cluster if one exists and has
2207  *   no shared resource load.
2208  *
2209  * - If a CPU in a preferred cluster is going idle and that cluster has no more shared load,
2210  *   it will look at running shared resource threads on foreign clusters and actively rebalance them.
2211  *
2212  * - Runnable shared resource threads are not stolen by the preferred cluster CPUs as they
2213  *   go idle intentionally.
2214  *
2215  * - One caveat of this design is that if a preferred CPU has already run and finished its shared
2216  *   resource thread execution, it will not go out and steal the runnable thread in the non-preferred cluster.
2217  *   The rebalancing will happen when the thread actually runs on a non-preferred cluster and one of the
2218  *   events listed above happen.
2219  *
2220  * - Also it currently does not consider other properties such as thread priorities and
2221  *   qos level thread load in the thread placement decision.
2222  *
2223  * Edge Scheduler cluster shared resource thread scheduling policy
2224  *
2225  * The threads for shared resources can be scheduled using one of the two policies:
2226  *
2227  * EDGE_SHARED_RSRC_SCHED_POLICY_RR
2228  * This policy distributes the threads so that they spread across all available clusters
2229  * irrespective of type. The idea is that this scheduling policy will put a shared resource
2230  * thread on each cluster on the platform before it starts doubling up on clusters.
2231  *
2232  * EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST
2233  * This policy distributes threads so that the threads first fill up all the capacity on
2234  * the preferred cluster and its homogeneous peers before spilling to different core type.
2235  * The current implementation defines capacity based on the number of CPUs in the cluster;
2236  * so a cluster's shared resource is considered full if there are "n" runnable + running
2237  * shared resource threads on the cluster with n cpus. This policy is different from the
2238  * default scheduling policy of the edge scheduler since this always tries to fill up the
2239  * native clusters to capacity even when non-native clusters might be idle.
2240  */
2241 __options_decl(edge_shared_rsrc_sched_policy_t, uint32_t, {
2242 	EDGE_SHARED_RSRC_SCHED_POLICY_RR                = 0,
2243 	EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST      = 1,
2244 });
2245 
2246 const edge_shared_rsrc_sched_policy_t edge_shared_rsrc_policy[CLUSTER_SHARED_RSRC_TYPE_COUNT] = {
2247 	[CLUSTER_SHARED_RSRC_TYPE_RR] = EDGE_SHARED_RSRC_SCHED_POLICY_RR,
2248 	[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST,
2249 };
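/*
 * Added example (illustrative): under EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST,
 * a preferred cluster with 4 CPUs is considered full once 4 shared resource
 * threads are runnable + running there; a 5th such thread spills to a
 * homogeneous peer before any cluster of a different core type is used. Under
 * EDGE_SHARED_RSRC_SCHED_POLICY_RR the same threads are instead spread one per
 * cluster, irrespective of cluster type, before any cluster doubles up.
 */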
2250 
2251 static void
2252 sched_edge_shared_rsrc_runnable_load_incr(sched_clutch_root_t root_clutch, thread_t thread)
2253 {
2254 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
2255 		root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_RR]++;
2256 		thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_RR] = true;
2257 	}
2258 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
2259 		root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST]++;
2260 		thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = true;
2261 	}
2262 }
2263 
2264 static void
2265 sched_edge_shared_rsrc_runnable_load_decr(sched_clutch_root_t root_clutch, thread_t thread)
2266 {
2267 	for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
2268 		if (thread->th_shared_rsrc_enqueued[shared_rsrc_type]) {
2269 			thread->th_shared_rsrc_enqueued[shared_rsrc_type] = false;
2270 			root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type]--;
2271 		}
2272 	}
2273 }
2274 
2275 uint16_t
2276 sched_edge_shared_rsrc_runnable_load(sched_clutch_root_t root_clutch, cluster_shared_rsrc_type_t shared_rsrc_type)
2277 {
2278 	return root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type];
2279 }
2280 
2281 /*
2282  * sched_edge_shared_rsrc_idle()
2283  *
2284  * Routine used to determine if the constrained resource for the pset is idle. This is
2285  * used by a CPU going idle to decide if it should rebalance a running shared resource
2286  * thread from a non-preferred cluster.
2287  */
2288 static boolean_t
2289 sched_edge_shared_rsrc_idle(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
2290 {
2291 	return sched_pset_cluster_shared_rsrc_load(pset, shared_rsrc_type) == 0;
2292 }
2293 
2294 /*
2295  * sched_edge_thread_shared_rsrc_type
2296  *
2297  * This routine decides if a given thread needs special handling for being a
2298  * heavy shared resource user. It is valid for the same thread to be using
2299  * several shared resources at the same time and have multiple policy flags set.
2300  * This routine determines which of those properties will be used for load
2301  * balancing and migration decisions.
2302  */
2303 static cluster_shared_rsrc_type_t
2304 sched_edge_thread_shared_rsrc_type(thread_t thread)
2305 {
2306 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
2307 		return CLUSTER_SHARED_RSRC_TYPE_RR;
2308 	}
2309 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
2310 		return CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST;
2311 	}
2312 	return CLUSTER_SHARED_RSRC_TYPE_NONE;
2313 }
2314 
2315 #endif /* CONFIG_SCHED_EDGE */
2316 
2317 /*
2318  * sched_clutch_thread_bound_lookup()
2319  *
2320  * Routine to lookup the highest priority runnable thread in a bounded root bucket.
2321  */
2322 static thread_t
2323 sched_clutch_thread_bound_lookup(
2324 	__unused sched_clutch_root_t root_clutch,
2325 	sched_clutch_root_bucket_t root_bucket,
2326 	processor_t processor,
2327 	thread_t _Nullable prev_thread)
2328 {
2329 	assert(root_bucket->scrb_bound == true);
2330 	thread_t bound_thread = run_queue_peek(&root_bucket->scrb_bound_thread_runq);
2331 	if ((prev_thread != NULL) &&
2332 	    (bound_thread == NULL || sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, bound_thread->sched_pri, processor->first_timeslice))) {
2333 		return prev_thread;
2334 	}
2335 	assert(bound_thread != THREAD_NULL);
2336 	return bound_thread;
2337 }
2338 
2339 /*
2340  * Clutch Bucket Group Thread Counts and Pending time calculation
2341  *
2342  * The pending time on the clutch_bucket_group allows the scheduler to track if it
2343  * needs to ageout the CPU usage because the clutch_bucket_group has been pending for
2344  * a very long time. The pending time is set to the timestamp as soon as a thread becomes
2345  * runnable. When a thread is picked up for execution from this clutch_bucket_group, the
2346  * pending time is advanced to the time of thread selection.
2347  *
2348  * Since threads for a clutch bucket group can be added or removed from multiple CPUs
2349  * simulataneously, it is important that the updates to thread counts and pending timestamps
2350  * happen atomically. The implementation relies on the following aspects to make that work
2351  * as expected:
2352  * - The clutch scheduler is only deployed on single cluster platforms, where the pset lock
2353  *   is held whenever threads are added/removed and pending timestamps are updated
2354  * - The thread count and pending timestamp can be updated atomically using double wide
2355  *   128 bit atomics
2356  *
2357  * Clutch bucket group interactivity timestamp and score updates also rely on the properties
2358  * above to atomically update the interactivity score for a clutch bucket group.
2359  */
2360 
2361 #if CONFIG_SCHED_EDGE
2362 
2363 static void
2364 sched_clutch_bucket_group_thr_count_inc(
2365 	sched_clutch_bucket_group_t clutch_bucket_group,
2366 	uint64_t timestamp)
2367 {
2368 	sched_clutch_counter_time_t old_pending_data;
2369 	sched_clutch_counter_time_t new_pending_data;
2370 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2371 		new_pending_data.scct_count = old_pending_data.scct_count + 1;
2372 		new_pending_data.scct_timestamp = old_pending_data.scct_timestamp;
2373 		if (old_pending_data.scct_count == 0) {
2374 		        new_pending_data.scct_timestamp = timestamp;
2375 		}
2376 	});
2377 }
2378 
2379 static void
2380 sched_clutch_bucket_group_thr_count_dec(
2381 	sched_clutch_bucket_group_t clutch_bucket_group,
2382 	uint64_t timestamp)
2383 {
2384 	sched_clutch_counter_time_t old_pending_data;
2385 	sched_clutch_counter_time_t new_pending_data;
2386 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2387 		new_pending_data.scct_count = old_pending_data.scct_count - 1;
2388 		if (new_pending_data.scct_count == 0) {
2389 		        new_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2390 		} else {
2391 		        new_pending_data.scct_timestamp = timestamp;
2392 		}
2393 	});
2394 }
2395 
2396 static uint8_t
2397 sched_clutch_bucket_group_pending_ageout(
2398 	sched_clutch_bucket_group_t clutch_bucket_group,
2399 	uint64_t timestamp)
2400 {
2401 	int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2402 	sched_clutch_counter_time_t old_pending_data;
2403 	sched_clutch_counter_time_t new_pending_data;
2404 	uint8_t cpu_usage_shift = 0;
2405 
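	/*
	 * Atomically advance the pending timestamp by however many whole ageout
	 * intervals have elapsed; the caller uses the returned interval count to
	 * shift (age out) the clutch bucket group's CPU usage statistics.
	 */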
2406 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2407 		cpu_usage_shift = 0;
2408 		uint64_t old_pending_ts = old_pending_data.scct_timestamp;
2409 		bool old_update = (old_pending_ts >= timestamp);
2410 		bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2411 		bool no_bucket_load = (bucket_load == 0);
2412 		if (old_update || no_pending_time || no_bucket_load) {
2413 		        os_atomic_rmw_loop_give_up();
2414 		}
2415 
2416 		/* Calculate the time the clutch bucket group has been pending */
2417 		uint64_t pending_delta = timestamp - old_pending_ts;
2418 		/*
2419 		 * Other buckets should get a chance to run first before artificially boosting
2420 		 * this clutch bucket group's interactivity score, at least when the entire root
2421 		 * bucket is getting a large enough share of CPU.
2422 		 */
2423 		uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] + (bucket_load * sched_clutch_thread_quantum[clutch_bucket_group->scbg_bucket]);
2424 		if (pending_delta < interactivity_delta) {
2425 		        os_atomic_rmw_loop_give_up();
2426 		}
2427 		cpu_usage_shift = (pending_delta / interactivity_delta);
2428 		new_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2429 		new_pending_data.scct_count = old_pending_data.scct_count;
2430 	});
2431 	return cpu_usage_shift;
2432 }
2433 
2434 static boolean_t
2435 sched_edge_thread_should_be_inserted_as_bound(
2436 	sched_clutch_root_t root_clutch,
2437 	thread_t thread)
2438 {
2439 	/*
2440 	 * Check if the thread is bound and is being enqueued in its desired bound cluster.
2441 	 * One scenario where a bound thread might not be getting enqueued in the bound cluster
2442 	 * hierarchy would be if the thread is "soft" bound and the bound cluster is
2443 	 * de-recommended. In that case, the thread should be treated as an unbound
2444 	 * thread.
2445 	 */
2446 	if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) && (sched_edge_thread_bound_cluster_id(thread) == root_clutch->scr_cluster_id)) {
2447 		return TRUE;
2448 	}
2449 	/*
2450 	 * Use bound runqueue for shared resource threads. See "cluster shared resource
2451 	 * threads load balancing" section for details.
2452 	 */
2453 	if (sched_edge_thread_shared_rsrc_type(thread) != CLUSTER_SHARED_RSRC_TYPE_NONE) {
2454 		return TRUE;
2455 	}
2456 	return FALSE;
2457 }
2458 
2459 #else /* CONFIG_SCHED_EDGE */
2460 
2461 /*
2462  * For the clutch scheduler, atomicity is ensured by making sure all operations
2463  * happen under the pset lock of the only cluster present on the platform.
2464  */
2465 static void
2466 sched_clutch_bucket_group_thr_count_inc(
2467 	sched_clutch_bucket_group_t clutch_bucket_group,
2468 	uint64_t timestamp)
2469 {
2470 	sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2471 	if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2472 		clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2473 	}
2474 	clutch_bucket_group->scbg_pending_data.scct_count++;
2475 }
2476 
2477 static void
2478 sched_clutch_bucket_group_thr_count_dec(
2479 	sched_clutch_bucket_group_t clutch_bucket_group,
2480 	uint64_t timestamp)
2481 {
2482 	sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2483 	clutch_bucket_group->scbg_pending_data.scct_count--;
2484 	if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2485 		clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2486 	} else {
2487 		clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2488 	}
2489 }
2490 
2491 static uint8_t
2492 sched_clutch_bucket_group_pending_ageout(
2493 	sched_clutch_bucket_group_t clutch_bucket_group,
2494 	uint64_t timestamp)
2495 {
2496 	sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2497 	int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2498 	uint64_t old_pending_ts = clutch_bucket_group->scbg_pending_data.scct_timestamp;
2499 	bool old_update = (old_pending_ts >= timestamp);
2500 	bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2501 	bool no_bucket_load = (bucket_load == 0);
2502 	if (old_update || no_pending_time || no_bucket_load) {
2503 		return 0;
2504 	}
2505 	uint64_t pending_delta = timestamp - old_pending_ts;
2506 	/*
2507 	 * Other buckets should get a chance to run first before artificially boosting
2508 	 * this clutch bucket group's interactivity score, at least when the entire root
2509 	 * bucket is getting a large enough share of CPU.
2510 	 */
2511 	uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] + (bucket_load * sched_clutch_thread_quantum[clutch_bucket_group->scbg_bucket]);
2512 	if (pending_delta < interactivity_delta) {
2513 		return 0;
2514 	}
2515 	uint8_t cpu_usage_shift = (pending_delta / interactivity_delta);
2516 	clutch_bucket_group->scbg_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2517 	return cpu_usage_shift;
2518 }
2519 
2520 #endif /* CONFIG_SCHED_EDGE */
2521 
2522 static uint8_t
2523 sched_clutch_bucket_group_interactivity_score_calculate(
2524 	sched_clutch_bucket_group_t clutch_bucket_group,
2525 	uint64_t timestamp)
2526 {
2527 	if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
2528 		/*
2529 		 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
2530 		 * priorities, make sure all AboveUI buckets are marked interactive.
2531 		 */
2532 		assert(clutch_bucket_group->scbg_interactivity_data.scct_count == (2 * sched_clutch_bucket_group_interactive_pri));
2533 		return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2534 	}
2535 	/* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */
2536 	uint8_t pending_intervals = sched_clutch_bucket_group_pending_ageout(clutch_bucket_group, timestamp);
2537 	/* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */
2538 	sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group, pending_intervals);
2539 	uint8_t interactivity_score = sched_clutch_interactivity_from_cpu_data(clutch_bucket_group);
2540 	/* Write back any interactivity score update */
2541 #if CONFIG_SCHED_EDGE
2542 	sched_clutch_counter_time_t old_interactivity_data;
2543 	sched_clutch_counter_time_t new_interactivity_data;
2544 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_interactivity_data.scct_packed, old_interactivity_data.scct_packed, new_interactivity_data.scct_packed, relaxed, {
2545 		new_interactivity_data.scct_count = old_interactivity_data.scct_count;
2546 		if (old_interactivity_data.scct_timestamp >= timestamp) {
2547 		        os_atomic_rmw_loop_give_up();
2548 		}
2549 		new_interactivity_data.scct_timestamp = timestamp;
2550 		if (old_interactivity_data.scct_timestamp != 0) {
2551 		        new_interactivity_data.scct_count = interactivity_score;
2552 		}
2553 	});
2554 	return (uint8_t)new_interactivity_data.scct_count;
2555 #else /* !CONFIG_SCHED_EDGE */
2556 	sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2557 	if (timestamp > clutch_bucket_group->scbg_interactivity_data.scct_timestamp) {
2558 		clutch_bucket_group->scbg_interactivity_data.scct_count = interactivity_score;
2559 		clutch_bucket_group->scbg_interactivity_data.scct_timestamp = timestamp;
2560 	}
2561 	return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2562 #endif /* !CONFIG_SCHED_EDGE */
2563 }
2564 
2565 /*
2566  * Clutch Bucket Group Run Count and Blocked Time Accounting
2567  *
2568  * The clutch bucket group maintains the number of runnable/running threads in the group.
2569  * Since the blocked time of the clutch bucket group is based on this count, it is
2570  * important to make sure the blocking timestamp and the run count are updated atomically.
2571  *
2572  * Since the run count increments happen without any pset locks held, the scheduler updates
2573  * the count & timestamp using double wide 128 bit atomics.
2574  */
2575 
2576 static uint32_t
2577 sched_clutch_bucket_group_run_count_inc(
2578 	sched_clutch_bucket_group_t clutch_bucket_group)
2579 {
2580 	sched_clutch_counter_time_t old_blocked_data;
2581 	sched_clutch_counter_time_t new_blocked_data;
2582 
2583 	bool update_blocked_time = false;
2584 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2585 		new_blocked_data.scct_count = old_blocked_data.scct_count + 1;
2586 		new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2587 		update_blocked_time = false;
2588 		if (old_blocked_data.scct_count == 0) {
2589 		        new_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
2590 		        update_blocked_time = true;
2591 		}
2592 	});
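	/*
	 * If this was the first thread to become runnable in the group (the run
	 * count transitioned from 0), the group was blocked until now; account for
	 * the blocked duration, capped at the adjustment threshold.
	 */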
2593 	if (update_blocked_time && (old_blocked_data.scct_timestamp != SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID)) {
2594 		uint64_t ctime = mach_absolute_time();
2595 		if (ctime > old_blocked_data.scct_timestamp) {
2596 			uint64_t blocked_time = ctime - old_blocked_data.scct_timestamp;
2597 			blocked_time = MIN(blocked_time, sched_clutch_bucket_group_adjust_threshold);
2598 			os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed);
2599 		}
2600 	}
2601 	return (uint32_t)new_blocked_data.scct_count;
2602 }
2603 
2604 static uint32_t
2605 sched_clutch_bucket_group_run_count_dec(
2606 	sched_clutch_bucket_group_t clutch_bucket_group)
2607 {
2608 	sched_clutch_counter_time_t old_blocked_data;
2609 	sched_clutch_counter_time_t new_blocked_data;
2610 
2611 	uint64_t ctime = mach_absolute_time();
2612 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2613 		new_blocked_data.scct_count = old_blocked_data.scct_count - 1;
2614 		new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2615 		if (new_blocked_data.scct_count == 0) {
2616 		        new_blocked_data.scct_timestamp = ctime;
2617 		}
2618 	});
2619 	return (uint32_t)new_blocked_data.scct_count;
2620 }
2621 
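/*
 * sched_clutch_bucket_for_thread()
 *
 * Returns the clutch bucket for this hierarchy's cluster that corresponds to
 * the thread's thread group and scheduling bucket.
 */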
2622 static inline sched_clutch_bucket_t
2623 sched_clutch_bucket_for_thread(
2624 	sched_clutch_root_t root_clutch,
2625 	thread_t thread)
2626 {
2627 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
2628 	assert(thread->thread_group == clutch->sc_tg);
2629 
2630 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2631 	sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2632 	assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
2633 
2634 	return clutch_bucket;
2635 }
2636 
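/*
 * sched_clutch_bucket_group_for_thread()
 *
 * Returns the clutch bucket group corresponding to the thread's thread group
 * and scheduling bucket.
 */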
2637 static inline sched_clutch_bucket_group_t
2638 sched_clutch_bucket_group_for_thread(thread_t prev_thread)
2639 {
2640 	sched_clutch_t clutch = sched_clutch_for_thread_group(prev_thread->thread_group);
2641 	return &clutch->sc_clutch_groups[prev_thread->th_sched_bucket];
2642 }
2643 
2644 /*
2645  * sched_clutch_thread_insert()
2646  *
2647  * Routine to insert a thread into the sched clutch hierarchy.
2648  * Update the counts at all levels of the hierarchy and insert the nodes
2649  * as they become runnable. Always called with the pset lock held.
2650  */
2651 static boolean_t
2652 sched_clutch_thread_insert(
2653 	sched_clutch_root_t root_clutch,
2654 	thread_t thread,
2655 	integer_t options)
2656 {
2657 	boolean_t result = FALSE;
2658 
2659 	sched_clutch_hierarchy_locked_assert(root_clutch);
2660 #if CONFIG_SCHED_EDGE
2661 	sched_edge_cluster_cumulative_count_incr(root_clutch, thread->th_sched_bucket);
2662 	sched_edge_shared_rsrc_runnable_load_incr(root_clutch, thread);
2663 
2664 	if (sched_edge_thread_should_be_inserted_as_bound(root_clutch, thread)) {
2665 		/*
2666 		 * Includes threads bound to this specific cluster as well as all
2667 		 * shared resource threads.
2668 		 */
2669 		return sched_edge_bound_thread_insert(root_clutch, thread, options);
2670 	}
2671 #endif /* CONFIG_SCHED_EDGE */
2672 
2673 	uint64_t current_timestamp = mach_absolute_time();
2674 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
2675 	assert(thread->thread_group == clutch->sc_tg);
2676 	sched_clutch_bucket_t clutch_bucket = sched_clutch_bucket_for_thread(root_clutch, thread);
2677 	assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
2678 
2679 	/*
2680 	 * Thread linkage in clutch_bucket
2681 	 *
2682 	 * A thread has a few linkages within the clutch bucket:
2683 	 * - A stable priority queue linkage which is the main runqueue (based on sched_pri) for the clutch bucket
2684 	 * - A regular priority queue linkage which is based on thread's base/promoted pri (used for clutch bucket priority calculation)
2685 	 * - A queue linkage used for timesharing operations of threads at the scheduler tick
2686 	 */
2687 
2688 	/* Insert thread into the clutch_bucket stable priority runqueue using sched_pri */
2689 	thread->th_clutch_runq_link.stamp = current_timestamp;
2690 	priority_queue_entry_set_sched_pri(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link, thread->sched_pri,
2691 	    (options & SCHED_TAILQ) ? PRIORITY_QUEUE_ENTRY_NONE : PRIORITY_QUEUE_ENTRY_PREEMPTED);
2692 	priority_queue_insert(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2693 
2694 	/* Insert thread into clutch_bucket priority queue based on the promoted or base priority */
2695 	priority_queue_entry_set_sched_pri(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link,
2696 	    sched_thread_sched_pri_promoted(thread) ? thread->sched_pri : thread->base_pri, false);
2697 	priority_queue_insert(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2698 
2699 	/* Insert thread into timesharing queue of the clutch bucket */
2700 	enqueue_tail(&clutch_bucket->scb_thread_timeshare_queue, &thread->th_clutch_timeshare_link);
2701 
2702 	/* Increment the urgency counter for the root if necessary */
2703 	sched_clutch_root_urgency_inc(root_clutch, thread);
2704 
2705 	os_atomic_inc(&clutch->sc_thr_count, relaxed);
2706 	sched_clutch_bucket_group_thr_count_inc(clutch_bucket->scb_group, current_timestamp);
2707 
2708 	/* Enqueue the clutch into the hierarchy (if needed) and update properties; pick the insertion order based on thread options */
2709 	sched_clutch_bucket_options_t scb_options = (options & SCHED_HEADQ) ? SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ : SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ;
2710 	if (clutch_bucket->scb_thr_count == 0) {
2711 		sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2712 		sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2713 		result = sched_clutch_bucket_runnable(clutch_bucket, root_clutch, current_timestamp, scb_options);
2714 	} else {
2715 		sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2716 		sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2717 		result = sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, scb_options);
2718 	}
2719 
2720 	KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2721 	    root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2722 	    SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2723 	return result;
2724 }
2725 
2726 /*
2727  * sched_clutch_thread_remove()
2728  *
2729  * Routine to remove a thread from the sched clutch hierarchy.
2730  * Update the counts at all levels of the hierarchy and remove the nodes
2731  * as they become empty. Always called with the pset lock held.
2732  */
2733 static void
2734 sched_clutch_thread_remove(
2735 	sched_clutch_root_t root_clutch,
2736 	thread_t thread,
2737 	uint64_t current_timestamp,
2738 	sched_clutch_bucket_options_t options)
2739 {
2740 	sched_clutch_hierarchy_locked_assert(root_clutch);
2741 #if CONFIG_SCHED_EDGE
2742 	sched_edge_cluster_cumulative_count_decr(root_clutch, thread->th_sched_bucket);
2743 	sched_edge_shared_rsrc_runnable_load_decr(root_clutch, thread);
2744 
2745 	if (thread->th_bound_cluster_enqueued) {
2746 		sched_edge_bound_thread_remove(root_clutch, thread);
2747 		return;
2748 	}
2749 #endif /* CONFIG_SCHED_EDGE */
2750 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
2751 	assert(thread->thread_group == clutch->sc_tg);
2752 	thread_assert_runq_nonnull(thread);
2753 
2754 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2755 	sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2756 	assert(clutch_bucket->scb_root == root_clutch);
2757 
2758 	/* Decrement the urgency counter for the root if necessary */
2759 	sched_clutch_root_urgency_dec(root_clutch, thread);
2760 	/* Remove thread from the clutch_bucket */
2761 	priority_queue_remove(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2762 	remqueue(&thread->th_clutch_timeshare_link);
2763 
2764 	priority_queue_remove(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2765 
2766 	/*
2767 	 * Warning: After this point, the thread's scheduling fields may be
2768 	 * modified by other cores that acquire the thread lock.
2769 	 */
2770 	thread_clear_runq(thread);
2771 
2772 	/* Update counts at various levels of the hierarchy */
2773 	os_atomic_dec(&clutch->sc_thr_count, relaxed);
2774 	sched_clutch_bucket_group_thr_count_dec(clutch_bucket->scb_group, current_timestamp);
2775 	sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
2776 	sched_clutch_thr_count_dec(&clutch_bucket->scb_thr_count);
2777 
2778 	/* Remove the clutch from hierarchy (if needed) and update properties */
2779 	if (clutch_bucket->scb_thr_count == 0) {
2780 		sched_clutch_bucket_empty(clutch_bucket, root_clutch, current_timestamp, options);
2781 	} else {
2782 		sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, options);
2783 	}
2784 
2785 	KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2786 	    root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2787 	    SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2788 }
2789 
2790 /*
2791  * sched_clutch_thread_unbound_lookup()
2792  *
2793  * Routine to find the highest priority unbound thread in the root clutch.
2794  * Helps find threads easily for steal/migrate scenarios in the
2795  * Edge scheduler.
2796  */
2797 static thread_t
2798 sched_clutch_thread_unbound_lookup(
2799 	sched_clutch_root_t root_clutch,
2800 	sched_clutch_root_bucket_t root_bucket,
2801 	processor_t _Nullable processor,
2802 	thread_t _Nullable prev_thread)
2803 {
2804 	assert(processor != NULL || prev_thread == NULL);
2805 	assert(root_bucket->scrb_bound == false);
2806 	sched_clutch_hierarchy_locked_assert(root_clutch);
2807 
2808 	/* Find the highest priority clutch bucket in this root bucket */
2809 	bool chose_prev_thread = false;
2810 	sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, root_bucket, processor, prev_thread, &chose_prev_thread);
2811 	assert(clutch_bucket != NULL);
2812 
2813 	if (chose_prev_thread) {
2814 		/* We have determined that prev_thread is the highest thread, based on the Clutch bucket level policy */
2815 		assert(processor != NULL && prev_thread != NULL);
2816 		return prev_thread;
2817 	}
2818 
2819 	/* Find the highest priority runnable thread in this clutch bucket */
2820 	thread_t thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
2821 	assert(thread != NULL);
2822 
2823 	/* Consider the previous thread */
2824 	if (prev_thread != NULL &&
2825 	    sched_clutch_bucket_for_thread(root_clutch, prev_thread) == clutch_bucket &&
2826 	    sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, thread->sched_pri, processor->first_timeslice)) {
2827 		thread = prev_thread;
2828 	}
2829 
2830 	return thread;
2831 }
2832 
2833 static sched_clutch_root_bucket_t
2834 sched_clutch_root_bucket_for_thread(
2835 	sched_clutch_root_t root_clutch,
2836 	thread_t prev_thread)
2837 {
2838 #if CONFIG_SCHED_EDGE
2839 	if (sched_edge_thread_should_be_inserted_as_bound(root_clutch, prev_thread)) {
2840 		return &root_clutch->scr_bound_buckets[prev_thread->th_sched_bucket];
2841 	}
2842 #endif /* CONFIG_SCHED_EDGE */
2843 	return &root_clutch->scr_unbound_buckets[prev_thread->th_sched_bucket];
2844 }
2845 
2846 /*
2847  * sched_clutch_hierarchy_thread_highest()
2848  *
2849  * Routine to traverse the Clutch hierarchy and return the highest thread which
2850  * should be selected to run next, optionally comparing against the previously
2851  * running thread. Removes the highest thread with sched_clutch_thread_remove()
2852  * depending on the traverse mode and whether it is the previously running thread.
2853  * Always called with the pset lock held.
2854  */
2855 static thread_t
2856 sched_clutch_hierarchy_thread_highest(
2857 	sched_clutch_root_t root_clutch,
2858 	processor_t processor,
2859 	thread_t _Nullable prev_thread,
2860 	sched_clutch_traverse_mode_t mode)
2861 {
2862 	assert(mode != SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || prev_thread == NULL);
2863 	sched_clutch_hierarchy_locked_assert(root_clutch);
2864 
2865 	thread_t highest_thread = NULL;
2866 	uint64_t current_timestamp = mach_absolute_time();
2867 	bool chose_prev_thread = false;
2868 	sched_clutch_dbg_thread_select_packed_t debug_info = {0};
2869 	sched_clutch_root_bucket_t prev_root_bucket = prev_thread != NULL ? sched_clutch_root_bucket_for_thread(root_clutch, prev_thread) : NULL;
2870 	sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(root_clutch, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL, prev_root_bucket, prev_thread, &chose_prev_thread, mode, &debug_info);
2871 	if (chose_prev_thread) {
2872 		/* We disambiguated that we want to keep running the previous thread */
2873 		highest_thread = processor->active_thread;
2874 		goto done_selecting_thread;
2875 	}
2876 	if (root_bucket == NULL) {
2877 		/* The Clutch hierarchy has no runnable threads, including the previous thread */
2878 		assert(sched_clutch_root_count(root_clutch) == 0);
2879 		assert(prev_thread == NULL);
2880 		return NULL;
2881 	}
2882 	if (root_bucket != prev_root_bucket) {
2883 		/* We have ruled out continuing to run the previous thread, based on the root bucket level policy */
2884 		prev_thread = NULL;
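		/*
		 * Sanity check: outside of preemption checks, the previous thread's
		 * root bucket should only lose out to a root bucket that is at least
		 * as important, is in its starvation avoidance window, differs in
		 * boundness, or currently has an open warp window.
		 */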
2885 		assert((mode == SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT) || (prev_root_bucket == NULL) ||
2886 		    (prev_root_bucket->scrb_bucket >= root_bucket->scrb_bucket) || (root_bucket->scrb_starvation_avoidance) ||
2887 		    (prev_root_bucket->scrb_bound != root_bucket->scrb_bound) ||
2888 		    (root_bucket->scrb_warp_remaining > 0 && root_bucket->scrb_warped_deadline > current_timestamp && prev_root_bucket->scrb_warp_remaining == 0));
2889 	}
2890 
2891 	if (root_bucket->scrb_bound) {
2892 		highest_thread = sched_clutch_thread_bound_lookup(root_clutch, root_bucket, processor, prev_thread);
2893 	} else {
2894 		highest_thread = sched_clutch_thread_unbound_lookup(root_clutch, root_bucket, processor, prev_thread);
2895 	}
2896 
2897 	if (mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY ||
2898 	    (mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT && highest_thread != processor->active_thread)) {
2899 		assert(mode != SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || highest_thread != processor->active_thread);
2900 		sched_clutch_thread_remove(root_clutch, highest_thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
2901 	}
2902 
2903 done_selecting_thread:
2904 	debug_info.trace_data.version = SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION;
2905 	debug_info.trace_data.traverse_mode = mode;
2906 	debug_info.trace_data.cluster_id = root_clutch->scr_cluster_id;
2907 	debug_info.trace_data.selection_was_cluster_bound = root_bucket->scrb_bound;
2908 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
2909 	    thread_tid(highest_thread), thread_group_get_id(highest_thread->thread_group), root_bucket->scrb_bucket, debug_info.scdts_trace_data_packed, 0);
2910 	return highest_thread;
2911 }
2912 
2913 /* High level global accessor routines */
2914 
2915 /*
2916  * sched_clutch_root_urgency()
2917  *
2918  * Routine to get the urgency of the highest runnable
2919  * thread in the hierarchy.
2920  */
2921 static uint32_t
2922 sched_clutch_root_urgency(
2923 	sched_clutch_root_t root_clutch)
2924 {
2925 	return root_clutch->scr_urgency;
2926 }
2927 
2928 /*
2929  * sched_clutch_root_count_sum()
2930  *
2931  * The count_sum mechanism is used for scheduler runq
2932  * statistics calculation. It's only useful for debugging
2933  * purposes; since it requires a mach_absolute_time() call on
2934  * other scheduler implementations, it's better to avoid
2935  * populating this until absolutely necessary.
2936  */
2937 static uint32_t
2938 sched_clutch_root_count_sum(
2939 	__unused sched_clutch_root_t root_clutch)
2940 {
2941 	return 0;
2942 }
2943 
2944 /*
2945  * sched_clutch_root_priority()
2946  *
2947  * Routine to get the priority of the highest runnable
2948  * thread in the hierarchy.
2949  */
2950 static int
2951 sched_clutch_root_priority(
2952 	sched_clutch_root_t root_clutch)
2953 {
2954 	return root_clutch->scr_priority;
2955 }
2956 
2957 /*
2958  * sched_clutch_root_count()
2959  *
2960  * Returns total number of runnable threads in the hierarchy.
2961  */
2962 uint32_t
2963 sched_clutch_root_count(
2964 	sched_clutch_root_t root_clutch)
2965 {
2966 	return root_clutch->scr_thr_count;
2967 }
2968 
2969 #if CONFIG_SCHED_EDGE
2970 
2971 /*
2972  * sched_clutch_root_foreign_empty()
2973  *
2974  * Routine to check if the foreign clutch bucket priority list is empty for a cluster.
2975  */
2976 static boolean_t
2977 sched_clutch_root_foreign_empty(
2978 	sched_clutch_root_t root_clutch)
2979 {
2980 	return priority_queue_empty(&root_clutch->scr_foreign_buckets);
2981 }
2982 
2983 /*
2984  * sched_clutch_root_highest_foreign_thread_remove()
2985  *
2986  * Routine to return the thread in the highest priority clutch bucket in a cluster.
2987  * Must be called with the pset for the cluster locked.
2988  */
2989 static thread_t
2990 sched_clutch_root_highest_foreign_thread_remove(
2991 	sched_clutch_root_t root_clutch)
2992 {
2993 	thread_t thread = THREAD_NULL;
2994 	if (priority_queue_empty(&root_clutch->scr_foreign_buckets)) {
2995 		return thread;
2996 	}
2997 	sched_clutch_bucket_t clutch_bucket = priority_queue_max(&root_clutch->scr_foreign_buckets, struct sched_clutch_bucket, scb_foreignlink);
2998 	thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
2999 	sched_clutch_thread_remove(root_clutch, thread, mach_absolute_time(), 0);
3000 	return thread;
3001 }
3002 
3003 #endif /* CONFIG_SCHED_EDGE */
3004 
3005 /*
3006  * sched_clutch_thread_pri_shift()
3007  *
3008  * Routine to get the priority shift value for a thread.
3009  * Since the timesharing is done at the clutch_bucket level,
3010  * this routine gets the clutch_bucket and retrieves the
3011  * values from there.
3012  */
3013 uint32_t
3014 sched_clutch_thread_pri_shift(
3015 	thread_t thread,
3016 	sched_bucket_t bucket)
3017 {
3018 	if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3019 		return INT8_MAX;
3020 	}
3021 	assert(bucket != TH_BUCKET_RUN);
3022 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
3023 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
3024 	return os_atomic_load(&clutch_bucket_group->scbg_pri_shift, relaxed);
3025 }
3026 
3027 #pragma mark -- Clutch Scheduler Algorithm
3028 
3029 static void
3030 sched_clutch_init(void);
3031 
3032 #if !SCHED_TEST_HARNESS
3033 
3034 static thread_t
3035 sched_clutch_steal_thread(processor_set_t pset);
3036 
3037 static void
3038 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context);
3039 
3040 #endif /* !SCHED_TEST_HARNESS */
3041 
3042 static boolean_t
3043 sched_clutch_processor_enqueue(processor_t processor, thread_t thread,
3044     sched_options_t options);
3045 
3046 static boolean_t
3047 sched_clutch_processor_queue_remove(processor_t processor, thread_t thread);
3048 
3049 static ast_t
3050 sched_clutch_processor_csw_check(processor_t processor);
3051 
3052 static boolean_t
3053 sched_clutch_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
3054 
3055 static int
3056 sched_clutch_runq_count(processor_t processor);
3057 
3058 static boolean_t
3059 sched_clutch_processor_queue_empty(processor_t processor);
3060 
3061 #if !SCHED_TEST_HARNESS
3062 
3063 static uint64_t
3064 sched_clutch_runq_stats_count_sum(processor_t processor);
3065 
3066 #endif /* !SCHED_TEST_HARNESS */
3067 
3068 static int
3069 sched_clutch_processor_bound_count(processor_t processor);
3070 
3071 static void
3072 sched_clutch_pset_init(processor_set_t pset);
3073 
3074 static void
3075 sched_clutch_processor_init(processor_t processor);
3076 
3077 static thread_t
3078 sched_clutch_processor_highest_thread(processor_t processor, sched_clutch_traverse_mode_t mode);
3079 
3080 static thread_t
3081 sched_clutch_choose_thread(processor_t processor, int priority, thread_t prev_thread, ast_t reason);
3082 
3083 #if !SCHED_TEST_HARNESS
3084 
3085 static void
3086 sched_clutch_processor_queue_shutdown(processor_t processor);
3087 
3088 #endif /* !SCHED_TEST_HARNESS */
3089 
3090 static sched_mode_t
3091 sched_clutch_initial_thread_sched_mode(task_t parent_task);
3092 
3093 static uint32_t
3094 sched_clutch_initial_quantum_size(thread_t thread);
3095 
3096 static uint32_t
3097 sched_clutch_run_incr(thread_t thread);
3098 
3099 static uint32_t
3100 sched_clutch_run_decr(thread_t thread);
3101 
3102 static void
3103 sched_clutch_update_thread_bucket(thread_t thread);
3104 
3105 #if !SCHED_TEST_HARNESS
3106 
3107 static void
3108 sched_clutch_thread_group_recommendation_change(struct thread_group *tg, cluster_type_t new_recommendation);
3109 
3110 #endif /* !SCHED_TEST_HARNESS */
3111 
3112 const struct sched_dispatch_table sched_clutch_dispatch = {
3113 	.sched_name                                     = "clutch",
3114 	.init                                           = sched_clutch_init,
3115 	.timebase_init                                  = sched_timeshare_timebase_init,
3116 	.processor_init                                 = sched_clutch_processor_init,
3117 	.pset_init                                      = sched_clutch_pset_init,
3118 	.choose_thread                                  = sched_clutch_choose_thread,
3119 	.processor_enqueue                              = sched_clutch_processor_enqueue,
3120 	.processor_queue_remove                         = sched_clutch_processor_queue_remove,
3121 	.processor_queue_empty                          = sched_clutch_processor_queue_empty,
3122 	.priority_is_urgent                             = priority_is_urgent,
3123 	.processor_csw_check                            = sched_clutch_processor_csw_check,
3124 	.processor_queue_has_priority                   = sched_clutch_processor_queue_has_priority,
3125 	.initial_quantum_size                           = sched_clutch_initial_quantum_size,
3126 	.initial_thread_sched_mode                      = sched_clutch_initial_thread_sched_mode,
3127 	.processor_runq_count                           = sched_clutch_runq_count,
3128 	.processor_bound_count                          = sched_clutch_processor_bound_count,
3129 	.multiple_psets_enabled                         = TRUE,
3130 	.avoid_processor_enabled                        = FALSE,
3131 	.thread_avoid_processor                         = NULL,
3132 	.update_thread_bucket                           = sched_clutch_update_thread_bucket,
3133 	.cpu_init_completed                             = NULL,
3134 	.thread_eligible_for_pset                       = NULL,
3135 #if !SCHED_TEST_HARNESS
3136 	.maintenance_continuation                       = sched_timeshare_maintenance_continue,
3137 	.steal_thread_enabled                           = sched_steal_thread_enabled,
3138 	.steal_thread                                   = sched_clutch_steal_thread,
3139 	.compute_timeshare_priority                     = sched_compute_timeshare_priority,
3140 	.choose_node                                    = sched_choose_node,
3141 	.choose_processor                               = choose_processor,
3142 	.processor_queue_shutdown                       = sched_clutch_processor_queue_shutdown,
3143 	.can_update_priority                            = can_update_priority,
3144 	.update_priority                                = update_priority,
3145 	.lightweight_update_priority                    = lightweight_update_priority,
3146 	.quantum_expire                                 = sched_default_quantum_expire,
3147 	.processor_runq_stats_count_sum                 = sched_clutch_runq_stats_count_sum,
3148 	.thread_update_scan                             = sched_clutch_thread_update_scan,
3149 	.processor_balance                              = sched_SMT_balance,
3150 	.rt_runq                                        = sched_rtlocal_runq,
3151 	.rt_init                                        = sched_rtlocal_init,
3152 	.rt_queue_shutdown                              = sched_rtlocal_queue_shutdown,
3153 	.rt_runq_scan                                   = sched_rtlocal_runq_scan,
3154 	.rt_runq_count_sum                              = sched_rtlocal_runq_count_sum,
3155 	.rt_steal_thread                                = sched_rtlocal_steal_thread,
3156 	.qos_max_parallelism                            = sched_qos_max_parallelism,
3157 	.check_spill                                    = sched_check_spill,
3158 	.ipi_policy                                     = sched_ipi_policy,
3159 	.thread_should_yield                            = sched_thread_should_yield,
3160 	.run_count_incr                                 = sched_clutch_run_incr,
3161 	.run_count_decr                                 = sched_clutch_run_decr,
3162 	.pset_made_schedulable                          = sched_pset_made_schedulable,
3163 	.thread_group_recommendation_change             = sched_clutch_thread_group_recommendation_change,
3164 #endif /* !SCHED_TEST_HARNESS */
3165 };
3166 
3167 __attribute__((always_inline))
3168 static inline run_queue_t
3169 sched_clutch_bound_runq(processor_t processor)
3170 {
3171 	return &processor->runq;
3172 }
3173 
3174 __attribute__((always_inline))
3175 static inline sched_clutch_root_t
3176 sched_clutch_processor_root_clutch(processor_t processor)
3177 {
3178 	return &processor->processor_set->pset_clutch_root;
3179 }
3180 
3181 __attribute__((always_inline))
3182 static inline run_queue_t
3183 sched_clutch_thread_bound_runq(processor_t processor, __assert_only thread_t thread)
3184 {
3185 	assert(thread->bound_processor == processor);
3186 	return sched_clutch_bound_runq(processor);
3187 }
3188 
3189 static uint32_t
3190 sched_clutch_initial_quantum_size(thread_t thread)
3191 {
3192 	if (thread == THREAD_NULL) {
3193 		return std_quantum;
3194 	}
3195 	assert(sched_clutch_thread_quantum[thread->th_sched_bucket] <= UINT32_MAX);
3196 	return (uint32_t)sched_clutch_thread_quantum[thread->th_sched_bucket];
3197 }
3198 
3199 static sched_mode_t
3200 sched_clutch_initial_thread_sched_mode(task_t parent_task)
3201 {
3202 	if (parent_task == kernel_task) {
3203 		return TH_MODE_FIXED;
3204 	} else {
3205 		return TH_MODE_TIMESHARE;
3206 	}
3207 }
3208 
3209 static void
3210 sched_clutch_processor_init(processor_t processor)
3211 {
3212 	run_queue_init(&processor->runq);
3213 }
3214 
3215 static void
3216 sched_clutch_pset_init(processor_set_t pset)
3217 {
3218 	sched_clutch_root_init(&pset->pset_clutch_root, pset);
3219 }
3220 
3221 static void
3222 sched_clutch_tunables_init(void)
3223 {
3224 	sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
3225 	sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
3226 	sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
3227 	clock_interval_to_absolutetime_interval(SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS,
3228 	    NSEC_PER_USEC, &sched_clutch_bucket_group_adjust_threshold);
3229 	assert(sched_clutch_bucket_group_adjust_threshold <= CLUTCH_CPU_DATA_MAX);
3230 	sched_clutch_us_to_abstime(sched_clutch_bucket_group_pending_delta_us, sched_clutch_bucket_group_pending_delta);
3231 }
3232 
3233 static void
3234 sched_clutch_init(void)
3235 {
3236 	if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
3237 		sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
3238 	}
3239 	sched_timeshare_init();
3240 	sched_clutch_tunables_init();
3241 }
3242 
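/*
 * sched_clutch_pri_greater_than_tiebreak()
 *
 * Priority comparison helper; when one_wins_ties is true, pri_one also wins a
 * tie against pri_two. Callers pass processor->first_timeslice so that the
 * previously running thread is favored on a tie only while it is still within
 * its first timeslice.
 */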
3243 static inline bool
3244 sched_clutch_pri_greater_than_tiebreak(int pri_one, int pri_two, bool one_wins_ties)
3245 {
3246 	if (one_wins_ties) {
3247 		return pri_one >= pri_two;
3248 	} else {
3249 		return pri_one > pri_two;
3250 	}
3251 }
3252 
3253 /*
3254  * sched_clutch_processor_highest_thread()
3255  *
3256  * Routine to determine the highest thread on the entire cluster runqueue which
3257  * should be selected to run next, optionally comparing against the previously
3258  * running thread. Removes the highest thread from the runqueue, depending on the
3259  * traverse mode and whether the highest thread is the previously running thread.
3260  *
3261  * Always called with the pset lock held. Assumes that processor->active_thread
3262  * may be locked and modified by another processor.
3263  */
3264 static thread_t
3265 sched_clutch_processor_highest_thread(
3266 	processor_t      processor,
3267 	sched_clutch_traverse_mode_t mode)
3268 {
3269 	sched_clutch_root_t root_clutch = sched_clutch_processor_root_clutch(processor);
3270 	int clutch_pri = sched_clutch_root_priority(root_clutch);
3271 	run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3272 	int bound_pri = bound_runq->highq;
3273 
3274 	bool has_prev_thread = mode == SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT;
3275 	thread_t prev_thread = has_prev_thread ? processor->active_thread : NULL;
3276 
3277 	if (bound_runq->count == 0 && root_clutch->scr_thr_count == 0) {
3278 		/* The runqueue is totally empty */
3279 		assert(bound_pri < MINPRI && clutch_pri < MINPRI);
3280 		return prev_thread;
3281 	}
3282 
3283 	if (has_prev_thread) {
3284 		if (prev_thread->sched_pri >= BASEPRI_RTQUEUES) {
3285 			/* The previous thread is real-time and thus guaranteed higher than the non-RT runqueue */
3286 			return prev_thread;
3287 		}
3288 		/* Allow the previous thread to influence the priority comparison of Clutch hierarchy vs. processor-bound runqueue */
3289 		if (prev_thread->bound_processor != NULL) {
3290 			bound_pri = MAX(bound_pri, prev_thread->sched_pri);
3291 		} else {
3292 			clutch_pri = MAX(clutch_pri, prev_thread->sched_pri);
3293 		}
3294 	}
3295 
3296 	bool prev_thread_is_not_processor_bound = has_prev_thread && (prev_thread->bound_processor == NULL);
3297 	bool prev_thread_is_processor_bound = has_prev_thread && (prev_thread->bound_processor != NULL);
3298 	thread_t next_thread = prev_thread;
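	/*
	 * The previous thread (if any) has already been folded into the clutch vs.
	 * bound priority comparison above; pick the winning side, falling back to
	 * the previous thread when the corresponding runqueue is actually empty.
	 */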
3299 	if (clutch_pri > bound_pri) {
3300 		if (root_clutch->scr_thr_count == 0) {
3301 			goto found_thread;
3302 		}
3303 		next_thread = sched_clutch_hierarchy_thread_highest(root_clutch, processor, prev_thread_is_not_processor_bound ? prev_thread : NULL, mode);
3304 	} else {
3305 		if (bound_runq->count == 0 ||
3306 		    (prev_thread_is_processor_bound && sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, bound_runq->highq, processor->first_timeslice))) {
3307 			goto found_thread;
3308 		}
3309 		next_thread = (mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY) ?
3310 		    run_queue_dequeue(bound_runq, SCHED_HEADQ) : run_queue_peek(bound_runq);
3311 		assert(mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || next_thread != prev_thread);
3312 	}
3313 found_thread:
3314 	assert(next_thread != NULL);
3315 	return next_thread;
3316 }
3317 
3318 static thread_t
3319 sched_clutch_choose_thread(
3320 	processor_t      processor,
3321 	__unused int              priority,
3322 	thread_t _Nullable        prev_thread,
3323 	__unused ast_t            reason)
3324 {
3325 	assert(prev_thread == NULL || prev_thread == processor->active_thread);
3326 	return sched_clutch_processor_highest_thread(processor, prev_thread != NULL ? SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT : SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY);
3327 }
3328 
3329 static boolean_t
3330 sched_clutch_processor_enqueue(
3331 	processor_t       processor,
3332 	thread_t          thread,
3333 	sched_options_t   options)
3334 {
3335 	boolean_t       result;
3336 
3337 	thread_set_runq_locked(thread, processor);
3338 	if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3339 		sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3340 		result = sched_clutch_thread_insert(pset_clutch_root, thread, options);
3341 	} else {
3342 		run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
3343 		result = run_queue_enqueue(rq, thread, options);
3344 	}
3345 	return result;
3346 }
3347 
3348 static boolean_t
3349 sched_clutch_processor_queue_empty(processor_t processor)
3350 {
3351 	return sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0 &&
3352 	       sched_clutch_bound_runq(processor)->count == 0;
3353 }
3354 
3355 static ast_t
3356 sched_clutch_processor_csw_check(processor_t processor)
3357 {
3358 	assert(processor->active_thread != NULL);
3359 	thread_t runqueue_thread = sched_clutch_processor_highest_thread(processor, SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT);
3360 	if (runqueue_thread != processor->active_thread) {
3361 		/* Found a better thread to run */
3362 		if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0 ||
3363 		    sched_clutch_bound_runq(processor)->urgency > 0) {
3364 			return AST_PREEMPT | AST_URGENT;
3365 		}
3366 		return AST_PREEMPT;
3367 	}
3368 	return AST_NONE;
3369 }
3370 
3371 static boolean_t
3372 sched_clutch_processor_queue_has_priority(
3373 	__unused processor_t    processor,
3374 	__unused int            priority,
3375 	__unused boolean_t      gte)
3376 {
3377 	/*
3378 	 * Never short-circuit the Clutch runqueue by returning FALSE here. Instead,
3379 	 * thread_select() should always go through sched_clutch_choose_thread().
3380 	 */
3381 	return TRUE;
3382 }
3383 
3384 static int
3385 sched_clutch_runq_count(processor_t processor)
3386 {
3387 	return (int)sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) + sched_clutch_bound_runq(processor)->count;
3388 }
3389 
3390 #if !SCHED_TEST_HARNESS
3391 
3392 static uint64_t
3393 sched_clutch_runq_stats_count_sum(processor_t processor)
3394 {
3395 	uint64_t bound_sum = sched_clutch_bound_runq(processor)->runq_stats.count_sum;
3396 
3397 	if (processor->cpu_id == processor->processor_set->cpu_set_low) {
3398 		return bound_sum + sched_clutch_root_count_sum(sched_clutch_processor_root_clutch(processor));
3399 	} else {
3400 		return bound_sum;
3401 	}
3402 }
3403 
3404 #endif /* !SCHED_TEST_HARNESS */
3405 
3406 static int
3407 sched_clutch_processor_bound_count(processor_t processor)
3408 {
3409 	return sched_clutch_bound_runq(processor)->count;
3410 }
3411 
3412 #if !SCHED_TEST_HARNESS
3413 
3414 static void
3415 sched_clutch_processor_queue_shutdown(processor_t processor)
3416 {
3417 	processor_set_t pset = processor->processor_set;
3418 	sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3419 	thread_t        thread;
3420 	queue_head_t    tqueue;
3421 
3422 	/* We only need to migrate threads if this is the last active processor in the pset */
3423 	if (pset->online_processor_count > 0) {
3424 		pset_unlock(pset);
3425 		return;
3426 	}
3427 
3428 	queue_init(&tqueue);
3429 	while (sched_clutch_root_count(pset_clutch_root) > 0) {
3430 		thread = sched_clutch_hierarchy_thread_highest(pset_clutch_root, processor, NULL, SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY);
3431 		enqueue_tail(&tqueue, &thread->runq_links);
3432 	}
3433 
3434 	pset_unlock(pset);
3435 
3436 	qe_foreach_element_safe(thread, &tqueue, runq_links) {
3437 		remqueue(&thread->runq_links);
3438 		thread_lock(thread);
3439 		thread_setrun(thread, SCHED_TAILQ);
3440 		thread_unlock(thread);
3441 	}
3442 }
3443 
3444 #endif /* !SCHED_TEST_HARNESS */
3445 
3446 static boolean_t
3447 sched_clutch_processor_queue_remove(
3448 	processor_t processor,
3449 	thread_t    thread)
3450 {
3451 	processor_set_t         pset = processor->processor_set;
3452 
3453 	pset_lock(pset);
3454 
3455 	if (processor == thread_get_runq_locked(thread)) {
3456 		/*
3457 		 * Thread is on a run queue and we have a lock on
3458 		 * that run queue.
3459 		 */
3460 		if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3461 			sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3462 			sched_clutch_thread_remove(pset_clutch_root, thread, mach_absolute_time(), SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
3463 		} else {
3464 			run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
3465 			run_queue_remove(rq, thread);
3466 		}
3467 	} else {
3468 		/*
3469 		 * The thread left the run queue before we could
3470 		 * lock the run queue.
3471 		 */
3472 		thread_assert_runq_null(thread);
3473 		processor = PROCESSOR_NULL;
3474 	}
3475 
3476 	pset_unlock(pset);
3477 
3478 	return processor != PROCESSOR_NULL;
3479 }
3480 
3481 #if !SCHED_TEST_HARNESS
3482 
3483 static thread_t
3484 sched_clutch_steal_thread(__unused processor_set_t pset)
3485 {
3486 	/* Thread stealing is not enabled for single cluster clutch scheduler platforms */
3487 	return THREAD_NULL;
3488 }
3489 
3490 static void
3491 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context)
3492 {
3493 	boolean_t               restart_needed = FALSE;
3494 	processor_t             processor = processor_list;
3495 	processor_set_t         pset;
3496 	thread_t                thread;
3497 	spl_t                   s;
3498 
3499 	/*
3500 	 *  We update the threads associated with each processor (bound and idle threads)
3501 	 *  and then update the threads in each pset runqueue.
3502 	 */
3503 
3504 	do {
3505 		do {
3506 			pset = processor->processor_set;
3507 
3508 			s = splsched();
3509 			pset_lock(pset);
3510 
3511 			restart_needed = runq_scan(sched_clutch_bound_runq(processor), scan_context);
3512 
3513 			pset_unlock(pset);
3514 			splx(s);
3515 
3516 			if (restart_needed) {
3517 				break;
3518 			}
3519 
3520 			thread = processor->idle_thread;
3521 			if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
3522 				if (thread_update_add_thread(thread) == FALSE) {
3523 					restart_needed = TRUE;
3524 					break;
3525 				}
3526 			}
3527 		} while ((processor = processor->processor_list) != NULL);
3528 
3529 		/* Ok, we now have a collection of candidates -- fix them. */
3530 		thread_update_process_threads();
3531 	} while (restart_needed);
3532 
3533 	pset_node_t node = &pset_node0;
3534 	pset = node->psets;
3535 
3536 	do {
3537 		do {
3538 			restart_needed = FALSE;
3539 			while (pset != NULL) {
3540 				s = splsched();
3541 				pset_lock(pset);
3542 
3543 				if (sched_clutch_root_count(&pset->pset_clutch_root) > 0) {
3544 					for (sched_bucket_t bucket = TH_BUCKET_SHARE_FG; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
3545 						restart_needed = runq_scan(&pset->pset_clutch_root.scr_bound_buckets[bucket].scrb_bound_thread_runq, scan_context);
3546 						if (restart_needed) {
3547 							break;
3548 						}
3549 					}
3550 					queue_t clutch_bucket_list = &pset->pset_clutch_root.scr_clutch_buckets;
3551 					sched_clutch_bucket_t clutch_bucket;
3552 					qe_foreach_element(clutch_bucket, clutch_bucket_list, scb_listlink) {
3553 						sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, scan_context->sched_tick_last_abstime);
3554 						restart_needed = sched_clutch_timeshare_scan(&clutch_bucket->scb_thread_timeshare_queue, clutch_bucket->scb_thr_count, scan_context);
3555 						if (restart_needed) {
3556 							break;
3557 						}
3558 					}
3559 				}
3560 
3561 				pset_unlock(pset);
3562 				splx(s);
3563 
3564 				if (restart_needed) {
3565 					break;
3566 				}
3567 				pset = pset->pset_list;
3568 			}
3569 
3570 			if (restart_needed) {
3571 				break;
3572 			}
3573 		} while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
3574 
3575 		/* Ok, we now have a collection of candidates -- fix them. */
3576 		thread_update_process_threads();
3577 	} while (restart_needed);
3578 }
3579 
3580 /*
3581  * For the clutch scheduler, the run counts are maintained in the clutch
3582  * buckets (i.e. the thread group scheduling structure).
3583  */
3584 static uint32_t
3585 sched_clutch_run_incr(thread_t thread)
3586 {
3587 	assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
3588 	uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3589 	sched_clutch_thread_run_bucket_incr(thread, thread->th_sched_bucket);
3590 	return new_count;
3591 }
3592 
3593 static uint32_t
3594 sched_clutch_run_decr(thread_t thread)
3595 {
3596 	assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);
3597 	uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3598 	sched_clutch_thread_run_bucket_decr(thread, thread->th_sched_bucket);
3599 	return new_count;
3600 }
3601 
3602 /*
3603  * For threads that have changed sched_pri without changing the
3604  * base_pri for any reason other than decay, use the sched_pri
3605  * as the bucketizing priority instead of base_pri. All such
3606  * changes are typically due to kernel locking primitive boosts
3607  * or demotions.
3608  */
3609 static boolean_t
3610 sched_thread_sched_pri_promoted(thread_t thread)
3611 {
3612 	return (thread->sched_flags & TH_SFLAG_PROMOTED) ||
3613 	       (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) ||
3614 	       (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) ||
3615 	       (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) ||
3616 	       (thread->kern_promotion_schedpri != 0);
3617 }
3618 
3619 #endif /* !SCHED_TEST_HARNESS */
3620 
3621 /*
3622  * Routine to update the scheduling bucket for the thread.
3623  *
3624  * In the clutch scheduler implementation, the thread's bucket
3625  * is based on sched_pri if it was promoted due to a kernel
3626  * primitive; otherwise it's based on the thread's base_pri. This
3627  * enhancement allows promoted threads to reach a higher priority
3628  * bucket and potentially get selected sooner for scheduling.
3629  *
3630  * Also, the clutch scheduler does not honor fixed priority below
3631  * FG priority. It simply puts those threads in the corresponding
3632  * timeshare bucket. The reason for doing that is that it is
3633  * extremely hard to define the scheduling properties of such threads,
3634  * and they typically lead to performance issues.
3635  *
3636  * Called with the thread lock held and the thread held off the runqueue.
3637  */
3638 
3639 void
3640 sched_clutch_update_thread_bucket(thread_t thread)
3641 {
3642 	sched_bucket_t old_bucket = thread->th_sched_bucket;
3643 	thread_assert_runq_null(thread);
3644 	int pri = (sched_thread_sched_pri_promoted(thread)) ? thread->sched_pri : thread->base_pri;
3645 	sched_bucket_t new_bucket = sched_clutch_thread_bucket_map(thread, pri);
3646 
3647 	if (old_bucket == new_bucket) {
3648 		return;
3649 	}
3650 
3651 	thread->th_sched_bucket = new_bucket;
3652 	thread->pri_shift = sched_clutch_thread_pri_shift(thread, new_bucket);
3653 	/*
3654 	 * Since this is called after the thread has been removed from the runq,
3655 	 * only the run counts need to be updated. The re-insert into the runq
3656 	 * would put the thread into the correct new bucket's runq.
3657 	 */
3658 	if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
3659 		sched_clutch_thread_run_bucket_decr(thread, old_bucket);
3660 		sched_clutch_thread_run_bucket_incr(thread, new_bucket);
3661 	}
3662 }
3663 
3664 #if !SCHED_TEST_HARNESS
3665 
3666 static void
3667 sched_clutch_thread_group_recommendation_change(__unused struct thread_group *tg, __unused cluster_type_t new_recommendation)
3668 {
3669 	/* Clutch ignores the recommendation because, unlike the Edge scheduler,
3670 	 * it does not migrate threads between cluster types.
3671 	 */
3672 }
3673 
3674 #endif /* !SCHED_TEST_HARNESS */
3675 
3676 #if CONFIG_SCHED_EDGE
3677 
3678 /* Implementation of the AMP version of the clutch scheduler */
3679 
3680 static void
3681 sched_edge_init(void);
3682 
3683 static void
3684 sched_edge_pset_init(processor_set_t pset);
3685 
3686 static thread_t
3687 sched_edge_processor_idle(processor_set_t pset);
3688 
3689 static boolean_t
3690 sched_edge_processor_queue_empty(processor_t processor);
3691 
3692 static void
3693 sched_edge_processor_queue_shutdown(processor_t processor);
3694 
3695 static processor_t
3696 sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread);
3697 
3698 static bool
3699 sched_edge_thread_avoid_processor(processor_t processor, thread_t thread, ast_t reason);
3700 
3701 static bool
3702 sched_edge_balance(processor_t cprocessor, processor_set_t cpset);
3703 
3704 static void
3705 sched_edge_check_spill(processor_set_t pset, thread_t thread);
3706 
3707 static bool
3708 sched_edge_thread_should_yield(processor_t processor, thread_t thread);
3709 
3710 static void
3711 sched_edge_pset_made_schedulable(processor_t processor, processor_set_t dst_pset, boolean_t drop_lock);
3712 
3713 static void
3714 sched_edge_cpu_init_completed(void);
3715 
3716 static bool
3717 sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset);
3718 
3719 static bool
3720 sched_edge_steal_thread_enabled(processor_set_t pset);
3721 
3722 static sched_ipi_type_t
3723 sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event);
3724 
3725 static uint32_t
3726 sched_edge_qos_max_parallelism(int qos, uint64_t options);
3727 
3728 static uint32_t
3729 sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket);
3730 
3731 const struct sched_dispatch_table sched_edge_dispatch = {
3732 	.sched_name                                     = "edge",
3733 	.init                                           = sched_edge_init,
3734 	.timebase_init                                  = sched_timeshare_timebase_init,
3735 	.processor_init                                 = sched_clutch_processor_init,
3736 	.pset_init                                      = sched_edge_pset_init,
3737 	.maintenance_continuation                       = sched_timeshare_maintenance_continue,
3738 	.choose_thread                                  = sched_clutch_choose_thread,
3739 	.steal_thread_enabled                           = sched_edge_steal_thread_enabled,
3740 	.steal_thread                                   = sched_edge_processor_idle,
3741 	.compute_timeshare_priority                     = sched_compute_timeshare_priority,
3742 	.choose_node                                    = sched_choose_node,
3743 	.choose_processor                               = sched_edge_choose_processor,
3744 	.processor_enqueue                              = sched_clutch_processor_enqueue,
3745 	.processor_queue_shutdown                       = sched_edge_processor_queue_shutdown,
3746 	.processor_queue_remove                         = sched_clutch_processor_queue_remove,
3747 	.processor_queue_empty                          = sched_edge_processor_queue_empty,
3748 	.priority_is_urgent                             = priority_is_urgent,
3749 	.processor_csw_check                            = sched_clutch_processor_csw_check,
3750 	.processor_queue_has_priority                   = sched_clutch_processor_queue_has_priority,
3751 	.initial_quantum_size                           = sched_clutch_initial_quantum_size,
3752 	.initial_thread_sched_mode                      = sched_clutch_initial_thread_sched_mode,
3753 	.can_update_priority                            = can_update_priority,
3754 	.update_priority                                = update_priority,
3755 	.lightweight_update_priority                    = lightweight_update_priority,
3756 	.quantum_expire                                 = sched_default_quantum_expire,
3757 	.processor_runq_count                           = sched_clutch_runq_count,
3758 	.processor_runq_stats_count_sum                 = sched_clutch_runq_stats_count_sum,
3759 	.processor_bound_count                          = sched_clutch_processor_bound_count,
3760 	.thread_update_scan                             = sched_clutch_thread_update_scan,
3761 	.multiple_psets_enabled                         = TRUE,
3762 	.avoid_processor_enabled                        = TRUE,
3763 	.thread_avoid_processor                         = sched_edge_thread_avoid_processor,
3764 	.processor_balance                              = sched_edge_balance,
3765 
3766 	.rt_runq                                        = sched_rtlocal_runq,
3767 	.rt_init                                        = sched_rtlocal_init,
3768 	.rt_queue_shutdown                              = sched_rtlocal_queue_shutdown,
3769 	.rt_runq_scan                                   = sched_rtlocal_runq_scan,
3770 	.rt_runq_count_sum                              = sched_rtlocal_runq_count_sum,
3771 	.rt_steal_thread                                = sched_rtlocal_steal_thread,
3772 
3773 	.qos_max_parallelism                            = sched_edge_qos_max_parallelism,
3774 	.check_spill                                    = sched_edge_check_spill,
3775 	.ipi_policy                                     = sched_edge_ipi_policy,
3776 	.thread_should_yield                            = sched_edge_thread_should_yield,
3777 	.run_count_incr                                 = sched_clutch_run_incr,
3778 	.run_count_decr                                 = sched_clutch_run_decr,
3779 	.update_thread_bucket                           = sched_clutch_update_thread_bucket,
3780 	.pset_made_schedulable                          = sched_edge_pset_made_schedulable,
3781 	.thread_group_recommendation_change             = NULL,
3782 	.cpu_init_completed                             = sched_edge_cpu_init_completed,
3783 	.thread_eligible_for_pset                       = sched_edge_thread_eligible_for_pset,
3784 };
3785 
3786 static bitmap_t sched_edge_available_pset_bitmask[BITMAP_LEN(MAX_PSETS)];
3787 
3788 /*
3789  * sched_edge_pset_available()
3790  *
3791  * Routine to determine if a pset is available for scheduling.
3792  */
3793 static bool
3794 sched_edge_pset_available(processor_set_t pset)
3795 {
3796 	if (pset == NULL) {
3797 		return false;
3798 	}
3799 	return pset_available_cpu_count(pset) > 0;
3800 }
3801 
3802 /*
3803  * sched_edge_thread_bound_cluster_id()
3804  *
3805  * Routine to determine which cluster a particular thread is bound to. Uses
3806  * the sched_flags on the thread to map back to a specific cluster id.
3807  *
3808  * <Edge Multi-cluster Support Needed>
3809  */
3810 static uint32_t
3811 sched_edge_thread_bound_cluster_id(thread_t thread)
3812 {
3813 	assert(SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread));
3814 	return thread->th_bound_cluster_id;
3815 }
3816 
3817 /* Forward declaration for some thread migration routines */
3818 static boolean_t sched_edge_foreign_runnable_thread_available(processor_set_t pset);
3819 static boolean_t sched_edge_foreign_running_thread_available(processor_set_t pset);
3820 static processor_set_t sched_edge_steal_candidate(processor_set_t pset);
3821 static processor_set_t sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks);
3822 
3823 /*
3824  * sched_edge_config_set()
3825  *
3826  * Support to update an edge configuration. Typically used by CLPC to affect thread migration
3827  * policies in the scheduler.
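 *
 * Note that edges are directional: pset_array[src_cluster]->sched_edges[dst_cluster]
 * governs migration and stealing of threads from src_cluster towards dst_cluster, so
 * allowing one direction does not imply the reverse direction is allowed.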
3828  */
3829 static void
3830 sched_edge_config_set(uint32_t src_cluster, uint32_t dst_cluster, sched_clutch_edge edge_config)
3831 {
3832 	sched_clutch_edge *edge = &pset_array[src_cluster]->sched_edges[dst_cluster];
3833 	edge->sce_edge_packed = edge_config.sce_edge_packed;
3834 }
3835 
3836 /*
3837  * sched_edge_config_get()
3838  *
3839  * Support to get an edge configuration. Typically used by CLPC to query edge configs to decide
3840  * if it needs to update edges.
3841  */
3842 static sched_clutch_edge
3843 sched_edge_config_get(uint32_t src_cluster, uint32_t dst_cluster)
3844 {
3845 	return pset_array[src_cluster]->sched_edges[dst_cluster];
3846 }
3847 
3848 /*
3849  * sched_edge_matrix_set()
3850  *
3851  * Routine to update various edges in the cluster edge matrix. The edge_changes_bitmap
3852  * indicates which edges need to be updated. Both the edge_matrix & edge_changes_bitmap
3853  * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
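 * The flattening is row-major: the edge from src_cluster i to dst_cluster j lives at
 * index (i * matrix_order) + j; for example, with matrix_order = 3, the edge from
 * cluster 1 to cluster 2 is at index 5.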
3854  */
3855 void
3856 sched_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changes_bitmap, __unused uint64_t flags, uint64_t matrix_order)
3857 {
3858 	uint32_t edge_index = 0;
3859 	for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
3860 		for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
3861 			if (edge_changes_bitmap[edge_index]) {
3862 				sched_edge_config_set(src_cluster, dst_cluster, edge_matrix[edge_index]);
3863 			}
3864 			edge_index++;
3865 		}
3866 	}
3867 }
3868 
3869 /*
3870  * sched_edge_matrix_get()
3871  *
3872  * Routine to retrieve various edges in the cluster edge matrix. The edge_request_bitmap
3873  * indicates which edges need to be retrieved. Both the edge_matrix & edge_request_bitmap
3874  * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
3875  */
3876 void
3877 sched_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_request_bitmap, __unused uint64_t flags, uint64_t matrix_order)
3878 {
3879 	uint32_t edge_index = 0;
3880 	for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
3881 		for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
3882 			if (edge_request_bitmap[edge_index]) {
3883 				edge_matrix[edge_index] = sched_edge_config_get(src_cluster, dst_cluster);
3884 			}
3885 			edge_index++;
3886 		}
3887 	}
3888 }
3889 
3890 /*
3891  * sched_edge_init()
3892  *
3893  * Routine to initialize the data structures for the Edge scheduler.
3894  */
3895 static void
3896 sched_edge_init(void)
3897 {
3898 	if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
3899 		sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
3900 	}
3901 	sched_timeshare_init();
3902 	sched_clutch_tunables_init();
3903 	sched_edge_max_clusters = ml_get_cluster_count();
3904 }
3905 
3906 static void
3907 sched_edge_pset_init(processor_set_t pset)
3908 {
3909 	uint32_t pset_cluster_id = pset->pset_cluster_id;
3910 	pset->pset_type = (pset->pset_cluster_type == PSET_AMP_P) ? CLUSTER_TYPE_P : CLUSTER_TYPE_E;
3911 
3912 	/* Set the edge weight and properties for the pset itself */
3913 	bitmap_clear(pset->foreign_psets, pset_cluster_id);
3914 	bitmap_clear(pset->native_psets, pset_cluster_id);
3915 	bitmap_clear(pset->local_psets, pset_cluster_id);
3916 	bitmap_clear(pset->remote_psets, pset_cluster_id);
3917 	pset->sched_edges[pset_cluster_id].sce_edge_packed = (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0}.sce_edge_packed;
3918 	sched_clutch_root_init(&pset->pset_clutch_root, pset);
3919 	bitmap_set(sched_edge_available_pset_bitmask, pset_cluster_id);
3920 }
3921 
3922 static boolean_t
3923 sched_edge_processor_queue_empty(processor_t processor)
3924 {
3925 	return (sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0) &&
3926 	       (sched_clutch_bound_runq(processor)->count == 0);
3927 }
3928 
3929 static void
3930 sched_edge_check_spill(__unused processor_set_t pset, __unused thread_t thread)
3931 {
3932 	assert(thread->bound_processor == PROCESSOR_NULL);
3933 }
3934 
3935 __options_decl(sched_edge_thread_yield_reason_t, uint32_t, {
3936 	SCHED_EDGE_YIELD_RUNQ_NONEMPTY       = 0x0,
3937 	SCHED_EDGE_YIELD_FOREIGN_RUNNABLE    = 0x1,
3938 	SCHED_EDGE_YIELD_FOREIGN_RUNNING     = 0x2,
3939 	SCHED_EDGE_YIELD_STEAL_POSSIBLE      = 0x3,
3940 	SCHED_EDGE_YIELD_DISALLOW            = 0x4,
3941 });
3942 
3943 static bool
3944 sched_edge_thread_should_yield(processor_t processor, __unused thread_t thread)
3945 {
3946 	if (!sched_edge_processor_queue_empty(processor) || (rt_runq_count(processor->processor_set) > 0)) {
3947 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3948 		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_RUNQ_NONEMPTY);
3949 		return true;
3950 	}
3951 
3952 	/*
3953 	 * The yield logic should follow the same logic that steal_thread() does. The
3954 	 * thread_should_yield() check is effectively asking whether, if the current
3955 	 * thread gave up the CPU, there is any other thread that would execute
3956 	 * on this CPU. So it needs to provide the same answer as the steal_thread()/
3957 	 * processor idle logic.
3958 	 */
3959 	if (sched_edge_foreign_runnable_thread_available(processor->processor_set)) {
3960 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3961 		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNABLE);
3962 		return true;
3963 	}
3964 	if (sched_edge_foreign_running_thread_available(processor->processor_set)) {
3965 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3966 		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNING);
3967 		return true;
3968 	}
3969 
3970 	processor_set_t steal_candidate = sched_edge_steal_candidate(processor->processor_set);
3971 	if (steal_candidate != NULL) {
3972 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3973 		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_STEAL_POSSIBLE);
3974 		return true;
3975 	}
3976 
3977 	KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE, thread_tid(thread), processor->processor_set->pset_cluster_id,
3978 	    0, SCHED_EDGE_YIELD_DISALLOW);
3979 	return false;
3980 }
3981 
3982 static void
3983 sched_edge_processor_queue_shutdown(processor_t processor)
3984 {
3985 	processor_set_t pset = processor->processor_set;
3986 	sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3987 	thread_t        thread;
3988 	queue_head_t    tqueue;
3989 
3990 	/* We only need to migrate threads if this is the last active or last recommended processor in the pset */
3991 	if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
3992 		pset_unlock(pset);
3993 		return;
3994 	}
3995 
3996 	bitmap_clear(sched_edge_available_pset_bitmask, pset->pset_cluster_id);
3997 
3998 	queue_init(&tqueue);
3999 	while (sched_clutch_root_count(pset_clutch_root) > 0) {
4000 		thread = sched_clutch_hierarchy_thread_highest(pset_clutch_root, processor, NULL, SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY);
4001 		enqueue_tail(&tqueue, &thread->runq_links);
4002 	}
4003 	pset_unlock(pset);
4004 
4005 	qe_foreach_element_safe(thread, &tqueue, runq_links) {
4006 		remqueue(&thread->runq_links);
4007 		thread_lock(thread);
4008 		thread_setrun(thread, SCHED_TAILQ);
4009 		thread_unlock(thread);
4010 	}
4011 }
4012 
4013 /*
4014  * sched_edge_cluster_load_metric()
4015  *
4016  * The load metric for a cluster is a measure of the average scheduling latency
4017  * experienced by threads on that cluster. It is a product of the average number
4018  * of threads in the runqueue and the average execution time for threads. The metric
4019  * has special values in the following cases:
4020  * - UINT32_MAX: If the cluster is not available for scheduling, its load is set to
4021  *   the maximum value to disallow any threads to migrate to this cluster.
4022  * - 0: If there are idle CPUs in the cluster or an empty runqueue; this allows threads
4023  *   to be spread across the platform quickly for ncpu wide workloads.
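 * For example, a derecommended or fully offlined cluster has no available CPUs and
 * therefore reports UINT32_MAX, which effectively prevents sched_edge_migration_check()
 * from selecting it as a migration target.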
4024  */
4025 static uint32_t
4026 sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket)
4027 {
4028 	if (sched_edge_pset_available(pset) == false) {
4029 		return UINT32_MAX;
4030 	}
4031 	return (uint32_t)sched_get_pset_load_average(pset, sched_bucket);
4032 }
4033 
4034 /*
4035  *
4036  * Edge Scheduler Steal/Rebalance logic
4037  *
4038  * = Generic scheduler logic =
4039  *
4040  * The SCHED(steal_thread) scheduler callout is invoked when the processor does not
4041  * find any thread for execution in its runqueue. The aim of the steal operation
4042  * is to find other threads running/runnable in other clusters which should be
4043  * executed here.
4044  *
4045  * If the steal callout does not return a thread, the thread_select() logic calls
4046  * SCHED(processor_balance) callout which is supposed to IPI other CPUs to rebalance
4047  * threads and idle out the current CPU.
4048  *
4049  * = SCHED(steal_thread) for Edge Scheduler =
4050  *
4051  * The edge scheduler hooks into sched_edge_processor_idle() for steal_thread. This
4052  * routine tries to do the following operations in order:
4053  * (1) Find foreign runnable threads in non-native cluster
4054  *     runqueues (sched_edge_foreign_runnable_thread_remove())
4055  * (2) Check if foreign threads are running on the non-native
4056  *     clusters (sched_edge_foreign_running_thread_available())
4057  *         - If yes, return THREAD_NULL for the steal callout and
4058  *         perform rebalancing as part of SCHED(processor_balance) i.e. sched_edge_balance()
4059  * (3) Steal a thread from another cluster based on edge
4060  *     weights (sched_edge_steal_thread())
4061  *
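 *
 * Note that sched_edge_processor_idle() additionally attempts a steal restricted to
 * native clusters between steps (1) and (2), and only widens the steal to all
 * clusters in step (3).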
4062  * = SCHED(processor_balance) for Edge Scheduler =
4063  *
4064  * If steal_thread did not return a thread for the processor, use
4065  * sched_edge_balance() to rebalance foreign running threads and idle out this CPU.
4066  *
4067  * = Clutch Bucket Preferred Cluster Overrides =
4068  *
4069  * Since these operations (just like thread migrations on enqueue)
4070  * move threads across clusters, they need support for handling clutch
4071  * bucket group level preferred cluster recommendations.
4072  * For (1), a clutch bucket will be in the foreign runnable queue based
4073  * on the clutch bucket group preferred cluster.
4074  * For (2), the running thread will set the bit on the processor based
4075  * on its preferred cluster type.
4076  * For (3), the edge configuration would prevent threads from being stolen
4077  * in the wrong direction.
4078  *
4079  * = SCHED(thread_should_yield) =
4080  * The thread_should_yield() logic needs to have the same logic as sched_edge_processor_idle()
4081  * since that is expecting the same answer as if thread_select() was called on a core
4082  * with an empty runqueue.
4083  */
4084 
4085 static bool
4086 sched_edge_steal_thread_enabled(__unused processor_set_t pset)
4087 {
4088 	/*
4089 	 * For the Edge scheduler, the gating for steal is done by sched_edge_steal_candidate()
4090 	 */
4091 	return true;
4092 }
4093 
4094 static processor_set_t
4095 sched_edge_steal_candidate(processor_set_t pset)
4096 {
4097 	uint32_t dst_cluster_id = pset->pset_cluster_id;
4098 	for (int cluster_id = 0; cluster_id < sched_edge_max_clusters; cluster_id++) {
4099 		processor_set_t candidate_pset = pset_array[cluster_id];
4100 		if (cluster_id == dst_cluster_id) {
4101 			continue;
4102 		}
4103 		if (candidate_pset == NULL) {
4104 			continue;
4105 		}
4106 		sched_clutch_edge *incoming_edge = &pset_array[cluster_id]->sched_edges[dst_cluster_id];
4107 		if (incoming_edge->sce_steal_allowed && (bitmap_lsb_first(candidate_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) != -1)) {
4108 			return candidate_pset;
4109 		}
4110 	}
4111 	return NULL;
4112 }
4113 
4114 static boolean_t
4115 sched_edge_foreign_runnable_thread_available(processor_set_t pset)
4116 {
4117 	/* Find all the clusters that are foreign for this cluster */
4118 	bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
4119 	for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
4120 		/*
4121 		 * For each cluster, see if there are any runnable foreign threads.
4122 		 * This check is currently being done without the pset lock to make it cheap for
4123 		 * the common case.
4124 		 */
4125 		processor_set_t target_pset = pset_array[cluster];
4126 		if (sched_edge_pset_available(target_pset) == false) {
4127 			continue;
4128 		}
4129 
4130 		if (!sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
4131 			return true;
4132 		}
4133 	}
4134 	return false;
4135 }
4136 
4137 static thread_t
4138 sched_edge_foreign_runnable_thread_remove(processor_set_t pset, uint64_t ctime)
4139 {
4140 	thread_t thread = THREAD_NULL;
4141 
4142 	/* Find all the clusters that are foreign for this cluster */
4143 	bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
4144 	for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
4145 		/*
4146 		 * For each cluster, see if there are any runnable foreign threads.
4147 		 * This check is currently being done without the pset lock to make it cheap for
4148 		 * the common case.
4149 		 */
4150 		processor_set_t target_pset = pset_array[cluster];
4151 		if (sched_edge_pset_available(target_pset) == false) {
4152 			continue;
4153 		}
4154 
4155 		if (sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
4156 			continue;
4157 		}
4158 		/*
4159 		 * Looks like there are runnable foreign threads in the hierarchy; lock the pset
4160 		 * and get the highest priority thread.
4161 		 */
4162 		pset_lock(target_pset);
4163 		if (sched_edge_pset_available(target_pset)) {
4164 			thread = sched_clutch_root_highest_foreign_thread_remove(&target_pset->pset_clutch_root);
4165 			sched_update_pset_load_average(target_pset, ctime);
4166 		}
4167 		pset_unlock(target_pset);
4168 
4169 		/*
4170 		 * Edge Scheduler Optimization
4171 		 *
4172 		 * The current implementation immediately returns as soon as it finds a foreign
4173 		 * runnable thread. This could be enhanced to look at highest priority threads
4174 		 * from all foreign clusters and pick the highest amongst them. That would need
4175 		 * some form of global state across psets to make that kind of a check cheap.
4176 		 */
4177 		if (thread != THREAD_NULL) {
4178 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNABLE) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, target_pset->pset_cluster_id, 0);
4179 			break;
4180 		}
4181 		/* Looks like the thread escaped after the check but before the pset lock was taken; continue the search */
4182 	}
4183 
4184 	return thread;
4185 }
4186 
4187 /*
4188  * sched_edge_cpu_running_foreign_shared_rsrc_available()
4189  *
4190  * Routine to determine if the thread running on a CPU is a shared resource thread
4191  * and can be rebalanced to the cluster with an idle CPU. It is used to determine if
4192  * a CPU going idle on a pset should rebalance a running shared resource heavy thread
4193  * from another non-ideal cluster based on the former's shared resource load.
4194  */
4195 static boolean_t
4196 sched_edge_cpu_running_foreign_shared_rsrc_available(processor_set_t target_pset, int foreign_cpu, processor_set_t idle_pset)
4197 {
4198 	boolean_t idle_pset_shared_rsrc_rr_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_RR);
4199 	if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_RR], foreign_cpu) && !idle_pset_shared_rsrc_rr_idle) {
4200 		return false;
4201 	}
4202 
4203 	boolean_t idle_pset_shared_rsrc_biu_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST);
4204 	if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST], foreign_cpu) && !idle_pset_shared_rsrc_biu_idle) {
4205 		return false;
4206 	}
4207 	return true;
4208 }
4209 
4210 static boolean_t
4211 sched_edge_foreign_running_thread_available(processor_set_t pset)
4212 {
4213 	bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
4214 	int cluster = -1;
4215 	while ((cluster = sched_edge_iterate_clusters_ordered(pset, foreign_pset_bitmap[0], cluster)) != -1) {
4216 		/* Skip the pset if it's not schedulable */
4217 		processor_set_t target_pset = pset_array[cluster];
4218 		if (sched_edge_pset_available(target_pset) == false) {
4219 			continue;
4220 		}
4221 
4222 		uint64_t running_foreign_bitmap = target_pset->cpu_state_map[PROCESSOR_RUNNING] & target_pset->cpu_running_foreign;
4223 		for (int cpu_foreign = bit_first(running_foreign_bitmap); cpu_foreign >= 0; cpu_foreign = bit_next(running_foreign_bitmap, cpu_foreign)) {
4224 			if (!sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpu_foreign, pset)) {
4225 				continue;
4226 			}
4227 			return true;
4228 		}
4229 	}
4230 	return false;
4231 }
4232 
4233 /*
4234  * sched_edge_steal_possible()
4235  *
4236  * Determines whether we can and should steal a thread from
4237  * the candidate_pset to run it on the idle_pset. When returning
4238  * true, the function also writes the scheduling bucket that we
4239  * should steal from into the bucket_for_steal out parameter.
4240  *
4241  * Always called with the pset lock for candidate_pset held.
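 *
 * For example, an idle E-cluster considering a P-cluster candidate only steals at a
 * QoS level where the candidate has more runnable threads than available CPUs,
 * whereas a candidate of the same cluster type may be stolen from whenever it has
 * any unbound runnable thread.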
4242  */
4243 static bool
4244 sched_edge_steal_possible(processor_set_t idle_pset, processor_set_t candidate_pset, sched_bucket_t *bucket_for_steal)
4245 {
4246 	sched_clutch_root_t candidate_clutch_root = &candidate_pset->pset_clutch_root;
4247 
4248 	int highest_runnable_bucket = sched_clutch_root_highest_runnable_qos(candidate_clutch_root, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY);
4249 	if (highest_runnable_bucket == -1) {
4250 		/* Candidate cluster runq is empty of unbound threads */
4251 		return false;
4252 	}
4253 
4254 	if (idle_pset->pset_type == candidate_pset->pset_type) {
4255 		/* Always allow stealing from homogeneous clusters */
4256 		*bucket_for_steal = (sched_bucket_t)highest_runnable_bucket;
4257 		return true;
4258 	}
4259 
4260 	for (int unbound_qos = highest_runnable_bucket; unbound_qos >= 0; unbound_qos = bitmap_lsb_next(candidate_clutch_root->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX, unbound_qos)) {
4261 		uint32_t candidate_runq_depth = os_atomic_load(&candidate_pset->pset_runnable_depth[unbound_qos], relaxed);
4262 		if (candidate_runq_depth > pset_available_cpu_count(candidate_pset)) {
4263 			/* Candidate cluster has excess load at this QoS (and at least one unbound thread we can steal!) */
4264 			*bucket_for_steal = (sched_bucket_t)unbound_qos;
4265 			return true;
4266 		}
4267 	}
4268 	/* None of the unbound root buckets are overloaded */
4269 	return false;
4270 }
4271 
4272 static thread_t
4273 sched_edge_steal_thread(processor_set_t pset, uint64_t candidate_pset_bitmap)
4274 {
4275 	thread_t stolen_thread = THREAD_NULL;
4276 
4277 	/*
4278 	 * Edge Scheduler Optimization
4279 	 *
4280 	 * The logic today bails as soon as it finds a cluster where the cluster load is
4281 	 * greater than the edge weight. Maybe it should have a more advanced version
4282 	 * which looks for the maximum delta etc.
4283 	 */
4284 	int cluster_id = -1;
4285 	while ((cluster_id = sched_edge_iterate_clusters_ordered(pset, candidate_pset_bitmap, cluster_id)) != -1) {
4286 		processor_set_t steal_from_pset = pset_array[cluster_id];
4287 		if (steal_from_pset == NULL) {
4288 			continue;
4289 		}
4290 		sched_clutch_edge *incoming_edge = &pset_array[cluster_id]->sched_edges[pset->pset_cluster_id];
4291 		if (incoming_edge->sce_steal_allowed == false) {
4292 			continue;
4293 		}
4294 		pset_lock(steal_from_pset);
4295 		sched_bucket_t bucket_for_steal;
4296 		if (sched_edge_steal_possible(pset, steal_from_pset, &bucket_for_steal)) {
4297 			uint64_t current_timestamp = mach_absolute_time();
4298 			sched_clutch_root_t clutch_root_for_steal = &steal_from_pset->pset_clutch_root;
4299 			stolen_thread = sched_clutch_thread_unbound_lookup(clutch_root_for_steal, &clutch_root_for_steal->scr_unbound_buckets[bucket_for_steal], NULL, NULL);
4300 			sched_clutch_thread_remove(clutch_root_for_steal, stolen_thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
4301 
4302 			sched_clutch_dbg_thread_select_packed_t debug_info = {0};
4303 			debug_info.trace_data.version = SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION;
4304 			debug_info.trace_data.traverse_mode = SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY;
4305 			debug_info.trace_data.cluster_id = steal_from_pset->pset_cluster_id;
4306 			debug_info.trace_data.selection_was_cluster_bound = false;
4307 			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
4308 			    thread_tid(stolen_thread), thread_group_get_id(stolen_thread->thread_group), bucket_for_steal, debug_info.scdts_trace_data_packed, 0);
4309 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STEAL) | DBG_FUNC_NONE, thread_tid(stolen_thread), pset->pset_cluster_id, steal_from_pset->pset_cluster_id, 0);
4310 
4311 			sched_update_pset_load_average(steal_from_pset, current_timestamp);
4312 		}
4313 		pset_unlock(steal_from_pset);
4314 		if (stolen_thread != THREAD_NULL) {
4315 			break;
4316 		}
4317 	}
4318 	return stolen_thread;
4319 }
4320 
4321 /*
4322  * sched_edge_processor_idle()
4323  *
4324  * The routine is the implementation for steal_thread() for the Edge scheduler.
4325  */
4326 static thread_t
4327 sched_edge_processor_idle(processor_set_t pset)
4328 {
4329 	thread_t thread = THREAD_NULL;
4330 
4331 	uint64_t ctime = mach_absolute_time();
4332 
4333 	processor_t processor = current_processor();
4334 	bit_clear(pset->pending_spill_cpu_mask, processor->cpu_id);
4335 
4336 	/* Each of the operations acquire the lock for the pset they target */
4337 	pset_unlock(pset);
4338 
4339 	/* Find highest priority runnable thread on all non-native clusters */
4340 	thread = sched_edge_foreign_runnable_thread_remove(pset, ctime);
4341 	if (thread != THREAD_NULL) {
4342 		return thread;
4343 	}
4344 
4345 	/* Find highest priority runnable thread on all native clusters */
4346 	thread = sched_edge_steal_thread(pset, pset->native_psets[0]);
4347 	if (thread != THREAD_NULL) {
4348 		return thread;
4349 	}
4350 
4351 	/* Find foreign running threads to rebalance; the actual rebalance is done in sched_edge_balance() */
4352 	boolean_t rebalance_needed = sched_edge_foreign_running_thread_available(pset);
4353 	if (rebalance_needed) {
4354 		return THREAD_NULL;
4355 	}
4356 
4357 	/* No foreign threads found; find a thread to steal from all clusters based on weights/loads etc. */
4358 	thread = sched_edge_steal_thread(pset, pset->native_psets[0] | pset->foreign_psets[0]);
4359 	return thread;
4360 }
4361 
4362 /* Return true if this shared resource thread has a better cluster to run on */
4363 static bool
4364 sched_edge_shared_rsrc_migrate_possible(thread_t thread, processor_set_t preferred_pset, processor_set_t current_pset)
4365 {
4366 	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4367 	uint64_t current_pset_load = sched_pset_cluster_shared_rsrc_load(current_pset, shared_rsrc_type);
4368 	/*
4369 	 * Adjust the current pset load to discount the current thread only if the current pset is a preferred pset type. This allows the
4370 	 * scheduler to rebalance threads from non-preferred cluster to an idle cluster of the preferred type.
4371 	 *
4372 	 * Edge Scheduler Optimization
4373 	 * For multi-cluster machines, it might be useful to enhance this mechanism to migrate between clusters of the preferred type.
4374 	 */
4375 	uint64_t current_pset_adjusted_load = (current_pset->pset_type != preferred_pset->pset_type) ? current_pset_load : (current_pset_load - 1);
4376 
4377 	uint64_t eligible_pset_bitmask = 0;
4378 	if (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST) {
4379 		/*
4380 		 * For the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy, the load balancing occurs
4381 		 * only among clusters native with the preferred cluster.
4382 		 */
4383 		eligible_pset_bitmask = preferred_pset->native_psets[0];
4384 		bit_set(eligible_pset_bitmask, preferred_pset->pset_cluster_id);
4385 	} else {
4386 		/* For EDGE_SHARED_RSRC_SCHED_POLICY_RR, the load balancing happens among all clusters */
4387 		eligible_pset_bitmask = sched_edge_available_pset_bitmask[0];
4388 	}
4389 
4390 	/* For each eligible cluster check if there is an under-utilized cluster; return true if there is */
4391 	for (int cluster_id = bit_first(eligible_pset_bitmask); cluster_id >= 0; cluster_id = bit_next(eligible_pset_bitmask, cluster_id)) {
4392 		if (cluster_id == current_pset->pset_cluster_id) {
4393 			continue;
4394 		}
4395 		uint64_t cluster_load = sched_pset_cluster_shared_rsrc_load(pset_array[cluster_id], shared_rsrc_type);
4396 		if (current_pset_adjusted_load > cluster_load) {
4397 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHARED_RSRC_MIGRATE) | DBG_FUNC_NONE, current_pset_load, current_pset->pset_cluster_id, cluster_load, cluster_id);
4398 			return true;
4399 		}
4400 	}
4401 	return false;
4402 }
4403 
4404 /* Return true if this thread should not continue running on this processor */
4405 static bool
4406 sched_edge_thread_avoid_processor(processor_t processor, thread_t thread, ast_t reason)
4407 {
4408 	if (thread->bound_processor == processor) {
4409 		/* Thread is bound here */
4410 		return false;
4411 	}
4412 
4413 	/*
4414 	 * On quantum expiry, check the migration bitmask if this thread should be migrated off this core.
4415 	 * A migration is only recommended if there's also an idle core available that needn't be avoided.
4416 	 */
4417 	if (reason & AST_QUANTUM) {
4418 		if (bit_test(processor->processor_set->perfcontrol_cpu_migration_bitmask, processor->cpu_id)) {
4419 			uint64_t non_avoided_idle_primary_map = processor->processor_set->cpu_state_map[PROCESSOR_IDLE] & processor->processor_set->recommended_bitmask & ~processor->processor_set->perfcontrol_cpu_migration_bitmask;
4420 			if (non_avoided_idle_primary_map != 0) {
4421 				return true;
4422 			}
4423 		}
4424 	}
4425 
4426 	processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
4427 
4428 	/* Evaluate shared resource policies */
4429 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
4430 		return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
4431 	}
4432 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
4433 		if (processor->processor_set->pset_type != preferred_pset->pset_type &&
4434 		    pset_type_is_recommended(preferred_pset)) {
4435 			return true;
4436 		}
4437 		return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
4438 	}
4439 
4440 	/*
4441 	 * For long running parallel workloads, it is important to rebalance threads across
4442 	 * E/P clusters so that they make equal forward progress. This is achieved through
4443 	 * threads expiring their quantum on the non-preferred cluster type and explicitly
4444 	 * rebalancing to the preferred cluster runqueue.
4445 	 */
4446 	if ((processor->processor_set->pset_type != preferred_pset->pset_type) &&
4447 	    pset_type_is_recommended(preferred_pset)) {
4448 		return true;
4449 	}
4450 	/* If the preferred pset for the thread is now idle, try and migrate thread to that cluster */
4451 	if ((processor->processor_set != preferred_pset) &&
4452 	    (sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket) == 0)) {
4453 		return true;
4454 	}
4455 
4456 	return false;
4457 }
4458 
4459 static bool
4460 sched_edge_balance(__unused processor_t cprocessor, processor_set_t cpset)
4461 {
4462 	assert(cprocessor == current_processor());
4463 	pset_unlock(cpset);
4464 
4465 	uint64_t ast_processor_map = 0;
4466 	sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
4467 
4468 	bitmap_t *foreign_pset_bitmap = cpset->foreign_psets;
4469 	for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
4470 		/* Skip the pset if it's not schedulable */
4471 		processor_set_t target_pset = pset_array[cluster];
4472 		if (sched_edge_pset_available(target_pset) == false) {
4473 			continue;
4474 		}
4475 
4476 		pset_lock(target_pset);
4477 		uint64_t cpu_running_foreign_map = (target_pset->cpu_running_foreign & target_pset->cpu_state_map[PROCESSOR_RUNNING]);
4478 		for (int cpuid = lsb_first(cpu_running_foreign_map); cpuid >= 0; cpuid = lsb_next(cpu_running_foreign_map, cpuid)) {
4479 			if (!sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpuid, cpset)) {
4480 				continue;
4481 			}
4482 			processor_t target_cpu = processor_array[cpuid];
4483 			ipi_type[target_cpu->cpu_id] = sched_ipi_action(target_cpu, NULL, SCHED_IPI_EVENT_REBALANCE);
4484 			if (ipi_type[cpuid] != SCHED_IPI_NONE) {
4485 				bit_set(ast_processor_map, cpuid);
4486 			}
4487 		}
4488 		pset_unlock(target_pset);
4489 	}
4490 
4491 	for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
4492 		processor_t ast_processor = processor_array[cpuid];
4493 		sched_ipi_perform(ast_processor, ipi_type[cpuid]);
4494 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNING) | DBG_FUNC_NONE, 0, cprocessor->cpu_id, cpuid, 0);
4495 	}
4496 
4497 	/* Core should light-weight idle using WFE if it just sent out rebalance IPIs */
4498 	return ast_processor_map != 0;
4499 }
4500 
4501 /*
4502  * sched_edge_migration_check()
4503  *
4504  * Routine to evaluate an edge between two clusters to decide if migration is possible
4505  * across that edge. Also updates the selected_pset and max_edge_delta out parameters
4506  * accordingly. The return value indicates if the invoking routine should short circuit
4507  * the search, since an ideal candidate has been found. The routine looks at the regular
4508  * edges and cluster loads or the shared resource loads based on the type of thread.
4509  */
4510 static bool
4511 sched_edge_migration_check(uint32_t cluster_id, processor_set_t preferred_pset,
4512     uint32_t preferred_cluster_load, thread_t thread, processor_set_t *selected_pset, uint32_t *max_edge_delta)
4513 {
4514 	uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
4515 	cluster_type_t preferred_cluster_type = pset_type_for_id(preferred_cluster_id);
4516 	processor_set_t dst_pset = pset_array[cluster_id];
4517 	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4518 	bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
4519 
4520 	if (cluster_id == preferred_cluster_id) {
4521 		return false;
4522 	}
4523 
4524 	if (dst_pset == NULL) {
4525 		return false;
4526 	}
4527 
4528 	sched_clutch_edge *edge = preferred_pset->sched_edges;
4529 	if (edge[cluster_id].sce_migration_allowed == false) {
4530 		return false;
4531 	}
4532 	uint32_t dst_load = shared_rsrc_thread ? (uint32_t)sched_pset_cluster_shared_rsrc_load(dst_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(dst_pset, thread->th_sched_bucket);
4533 	if (dst_load == 0) {
4534 		/* The candidate cluster is idle; select it immediately for execution */
4535 		*selected_pset = dst_pset;
4536 		*max_edge_delta = preferred_cluster_load;
4537 		return true;
4538 	}
4539 
4540 	uint32_t edge_delta = 0;
4541 	if (dst_load > preferred_cluster_load) {
4542 		return false;
4543 	}
4544 	edge_delta = preferred_cluster_load - dst_load;
4545 	if (!shared_rsrc_thread && (edge_delta < edge[cluster_id].sce_migration_weight)) {
4546 		/*
4547 		 * For non shared resource threads, use the edge migration weight to decide if
4548 		 * this cluster is over-committed at the QoS level of this thread.
4549 		 */
4550 		return false;
4551 	}
4552 
4553 	if (edge_delta < *max_edge_delta) {
4554 		return false;
4555 	}
4556 	if (edge_delta == *max_edge_delta) {
4557 		/* If the edge delta is the same as the max delta, make sure a homogeneous cluster is picked */
4558 		boolean_t selected_homogeneous = (pset_type_for_id((*selected_pset)->pset_cluster_id) == preferred_cluster_type);
4559 		boolean_t candidate_homogeneous = (pset_type_for_id(dst_pset->pset_cluster_id) == preferred_cluster_type);
4560 		if (selected_homogeneous || !candidate_homogeneous) {
4561 			return false;
4562 		}
4563 	}
4564 	/* dst_pset seems to be the best candidate for migration; however other candidates should still be evaluated */
4565 	*max_edge_delta = edge_delta;
4566 	*selected_pset = dst_pset;
4567 	return false;
4568 }
4569 
4570 /*
4571  * sched_edge_iterate_clusters_ordered()
4572  *
4573  * Routine to iterate clusters in die local order. For multi-die machines,
4574  * the routine ensures that the candidate clusters on the same die as the
4575  * passed in pset are returned before the remote die clusters. This should
4576  * be used in all places where cluster selection in die order matters.
4577  */
4578 
4579 static int
4580 sched_edge_iterate_clusters_ordered(processor_set_t starting_pset, uint64_t candidate_map, int previous_cluster)
4581 {
4582 	int cluster_id = -1;
4583 
4584 	uint64_t local_candidate_map = starting_pset->local_psets[0] & candidate_map;
4585 	uint64_t remote_candidate_map = starting_pset->remote_psets[0] & candidate_map;
4586 
4587 	if (previous_cluster == -1) {
4588 		/* previous_cluster == -1 indicates the initial condition */
4589 		cluster_id = bit_first(local_candidate_map);
4590 		if (cluster_id != -1) {
4591 			return cluster_id;
4592 		}
4593 		return bit_first(remote_candidate_map);
4594 	} else {
4595 		/*
4596 		 * After the initial condition, the routine attempts to return a
4597 		 * cluster in the previous_cluster's locality. If none is available,
4598 		 * it looks at remote clusters.
4599 		 */
4600 		if (bit_test(local_candidate_map, previous_cluster)) {
4601 			cluster_id = bit_next(local_candidate_map, previous_cluster);
4602 			if (cluster_id != -1) {
4603 				return cluster_id;
4604 			} else {
4605 				return bit_first(remote_candidate_map);
4606 			}
4607 		}
4608 		return bit_next(remote_candidate_map, previous_cluster);
4609 	}
4610 }
4611 
4612 /*
4613  * sched_edge_migrate_edges_evaluate()
4614  *
4615  * Routine to find the candidate for thread migration based on edge weights.
4616  *
4617  * Returns the most ideal cluster for execution of this thread based on outgoing edges of the preferred pset. Can
4618  * return preferred_pset if its the most ideal destination for this thread.
4619  */
4620 static processor_set_t
4621 sched_edge_migrate_edges_evaluate(processor_set_t preferred_pset, uint32_t preferred_cluster_load, thread_t thread)
4622 {
4623 	processor_set_t selected_pset = preferred_pset;
4624 	uint32_t max_edge_delta = 0;
4625 	bool search_complete = false;
4626 	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4627 	bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
4628 
4629 	bitmap_t *foreign_pset_bitmap = preferred_pset->foreign_psets;
4630 	bitmap_t *native_pset_bitmap = preferred_pset->native_psets;
4631 	/* Always start the search with the native clusters */
4632 	int cluster_id = -1;
4633 	while ((cluster_id = sched_edge_iterate_clusters_ordered(preferred_pset, native_pset_bitmap[0], cluster_id)) != -1) {
4634 		search_complete = sched_edge_migration_check(cluster_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
4635 		if (search_complete) {
4636 			break;
4637 		}
4638 	}
4639 
4640 	if (search_complete) {
4641 		return selected_pset;
4642 	}
4643 
4644 	if (shared_rsrc_thread && (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST)) {
4645 		/*
4646 		 * If the shared resource scheduling policy is EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST, the scheduler tries
4647 		 * to fill up the preferred cluster and its homogeneous peers first.
4648 		 */
4649 
4650 		if (max_edge_delta > 0) {
4651 			/*
4652 			 * This represents that there is a peer cluster of the same type as the preferred cluster (since the code
4653 			 * A non-zero delta means there is a peer cluster of the same type as the preferred cluster (since the
4654 			 * code above only looks at the native_psets) which has fewer threads of the shared resource type than
4655 			 * the preferred cluster. This indicates that there is capacity on a native cluster where this thread
4656 			 */
4657 			return selected_pset;
4658 		}
4659 		/*
4660 		 * Indicates that all peer native clusters are at the same shared resource usage; check if the preferred cluster has
4661 		 * any more capacity left.
4662 		 */
4663 		if (sched_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) < pset_available_cpu_count(preferred_pset)) {
4664 			return preferred_pset;
4665 		}
4666 		/*
4667 		 * Looks like the preferred cluster and all its native peers are full with shared resource threads; need to start looking
4668 		 * at non-native clusters for capacity.
4669 		 */
4670 	}
4671 
4672 	/* Now look at the non-native clusters */
4673 	cluster_id = -1;
4674 	while ((cluster_id = sched_edge_iterate_clusters_ordered(preferred_pset, foreign_pset_bitmap[0], cluster_id)) != -1) {
4675 		search_complete = sched_edge_migration_check(cluster_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
4676 		if (search_complete) {
4677 			break;
4678 		}
4679 	}
4680 	return selected_pset;
4681 }
4682 
4683 /*
4684  * sched_edge_candidate_alternative()
4685  *
4686  * Routine to find an alternative cluster from candidate_cluster_bitmap since the
4687  * selected_pset is not available for execution. The logic tries to prefer homogeneous
4688  * clusters over heterogeneous clusters since this is typically used in thread
4689  * placement decisions.
4690  */
4691 _Static_assert(MAX_PSETS <= 64, "Unable to fit maximum number of psets in uint64_t bitmask");
4692 static processor_set_t
4693 sched_edge_candidate_alternative(processor_set_t selected_pset, uint64_t candidate_cluster_bitmap)
4694 {
4695 	/*
4696 	 * It looks like the most ideal pset is not available for scheduling currently.
4697 	 * Try to find a homogeneous cluster that is still available.
4698 	 */
4699 	uint64_t available_native_clusters = selected_pset->native_psets[0] & candidate_cluster_bitmap;
4700 	int available_cluster_id = lsb_first(available_native_clusters);
4701 	if (available_cluster_id == -1) {
4702 		/* Looks like none of the homogeneous clusters are available; pick the first available cluster */
4703 		available_cluster_id = bit_first(candidate_cluster_bitmap);
4704 	}
4705 	assert(available_cluster_id != -1);
4706 	return pset_array[available_cluster_id];
4707 }
4708 
4709 /*
4710  * sched_edge_switch_pset_lock()
4711  *
4712  * Helper routine for sched_edge_migrate_candidate() which switches pset locks (if needed) based on
4713  * switch_pset_locks.
4714  * Returns the newly locked pset after the switch.
4715  */
4716 static processor_set_t
4717 sched_edge_switch_pset_lock(processor_set_t selected_pset, processor_set_t locked_pset, bool switch_pset_locks)
4718 {
4719 	if (!switch_pset_locks) {
4720 		return locked_pset;
4721 	}
4722 	if (selected_pset != locked_pset) {
4723 		pset_unlock(locked_pset);
4724 		pset_lock(selected_pset);
4725 		return selected_pset;
4726 	} else {
4727 		return locked_pset;
4728 	}
4729 }
4730 
4731 /*
4732  * sched_edge_amp_rebalance_pset()
4733  *
4734  * Routine to decide where a thread which is eligible for AMP rebalance (i.e.
4735  * has executed on non-preferred cluster type for a while) should be enqueued.
4736  * The algorithm maintains a history of AMP rebalance decisions on the clutch
4737  * bucket group of the workload and round-robins between clusters to ensure
4738  * that all threads get a chance on the performance cores and make equal
4739  * progress.
4740  */
4741 static processor_set_t
4742 sched_edge_amp_rebalance_pset(processor_set_t preferred_pset, thread_t thread)
4743 {
4744 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
4745 	sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[thread->th_sched_bucket];
4746 
4747 	uint32_t last_chosen_cluster, new_chosen_cluster;
4748 
4749 	/* Only AMP rebalance within clusters native to the preferred cluster */
4750 	uint64_t eligible_pset_bitmask = preferred_pset->native_psets[0];
4751 	/* Preferred cluster is also eligible for rebalancing */
4752 	bit_set(eligible_pset_bitmask, preferred_pset->pset_cluster_id);
4753 	/* Atomically update the AMP rebalance cluster for the clutch bucket group */
4754 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_amp_rebalance_last_chosen, last_chosen_cluster, new_chosen_cluster, relaxed, {
4755 		if (last_chosen_cluster == UINT32_MAX) {
4756 		        new_chosen_cluster = preferred_pset->pset_cluster_id;
4757 		} else {
4758 		        new_chosen_cluster = lsb_next(eligible_pset_bitmask, last_chosen_cluster);
4759 		        if (new_chosen_cluster == -1) {
4760 		                /* Rotate to the start of the eligible bitmask */
4761 		                new_chosen_cluster = lsb_first(eligible_pset_bitmask);
4762 			}
4763 		}
4764 	});
4765 	return pset_array[new_chosen_cluster];
4766 }
4767 
4768 /*
4769  * sched_edge_migrate_candidate()
4770  *
4771  * Routine to find an appropriate cluster for scheduling a thread. The routine looks at the properties of
4772  * the thread and the preferred cluster to determine the best available pset for scheduling.
4773  *
4774  * The switch_pset_locks parameter defines whether the routine should switch pset locks to provide an
4775  * accurate scheduling decision. This mode is typically used when choosing a pset for scheduling a thread since the
4776  * decision has to be synchronized with another CPU changing the recommendation of clusters available
4777  * on the system. If this parameter is set to false, this routine returns the best effort indication of
4778  * the cluster the thread should be scheduled on. It is typically used in fast path contexts (such as
4779  * SCHED(thread_avoid_processor)) to determine if there is a possibility of scheduling this thread on a
4780  * more appropriate cluster.
4781  *
4782  * Routine returns the best cluster for scheduling. If switch_pset_locks is set, it ensures that the
4783  * resultant pset lock is held.
4784  */
4785 static processor_set_t
4786 sched_edge_migrate_candidate(processor_set_t _Nullable preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks)
4787 {
4788 	processor_set_t selected_pset = preferred_pset;
4789 	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4790 	bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
4791 
4792 	if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
4793 		/* For bound threads always recommend the cluster it's bound to */
4794 		selected_pset = pset_array[sched_edge_thread_bound_cluster_id(thread)];
4795 		if (selected_pset != NULL) {
4796 			locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
4797 			if (sched_edge_pset_available(selected_pset) || (SCHED_CLUTCH_THREAD_CLUSTER_BOUND_SOFT(thread) == false)) {
4798 				/*
4799 				 * If the bound cluster is not available, check if the thread is soft bound. For soft bound threads,
4800 				 * fall through to the regular cluster selection logic which handles unavailable clusters
4801 				 * appropriately. If the thread is hard bound, then return the bound cluster always.
4802 				 */
4803 				return selected_pset;
4804 			}
4805 		}
4806 	}
4807 
4808 	uint64_t candidate_cluster_bitmap = mask(sched_edge_max_clusters);
4809 #if DEVELOPMENT || DEBUG
4810 	extern int enable_task_set_cluster_type;
4811 	task_t task = get_threadtask(thread);
4812 	if (enable_task_set_cluster_type && (task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) {
4813 		processor_set_t pset_hint = task->pset_hint;
4814 		if (pset_hint && (selected_pset == NULL || selected_pset->pset_cluster_type != pset_hint->pset_cluster_type)) {
4815 			selected_pset = pset_hint;
4816 			goto migrate_candidate_available_check;
4817 		}
4818 	}
4819 #endif
4820 
4821 	if (preferred_pset == NULL) {
4822 		/* The preferred_pset has not finished initializing at boot */
4823 		goto migrate_candidate_available_check;
4824 	}
4825 
4826 	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
4827 		/* For realtime threads, try and schedule them on the preferred pset always */
4828 		goto migrate_candidate_available_check;
4829 	}
4830 
4831 	uint32_t preferred_cluster_load = shared_rsrc_thread ? (uint32_t)sched_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket);
4832 	if (preferred_cluster_load == 0) {
4833 		goto migrate_candidate_available_check;
4834 	}
4835 
4836 	/*
4837 	 * If a thread is being rebalanced for achieving equal progress of parallel workloads,
4838 	 * it needs to end up on the preferred runqueue. This mechanism should only be used for
4839 	 * threads which have been previously migrated to the non-preferred cluster type.
4840 	 *
4841 	 * The AMP rebalancing mechanism is available for regular threads or shared resource
4842 	 * threads with the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy.
4843 	 */
4844 	bool amp_rebalance_eligible = (!shared_rsrc_thread) || (shared_rsrc_thread && (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST));
4845 	if (amp_rebalance_eligible) {
4846 		boolean_t amp_rebalance = (thread->reason & (AST_REBALANCE | AST_QUANTUM)) == (AST_REBALANCE | AST_QUANTUM);
4847 		if (amp_rebalance) {
4848 			boolean_t non_preferred_pset = (thread->last_processor->processor_set->pset_type != preferred_pset->pset_type);
4849 			if (non_preferred_pset) {
4850 				selected_pset = sched_edge_amp_rebalance_pset(preferred_pset, thread);
4851 				goto migrate_candidate_available_check;
4852 			}
4853 		}
4854 	}
4855 
4856 	/* Look at edge weights to decide the best migration candidate for this thread */
4857 	selected_pset = sched_edge_migrate_edges_evaluate(preferred_pset, preferred_cluster_load, thread);
4858 
4859 migrate_candidate_available_check:
4860 	if (selected_pset == NULL) {
4861 		/* The selected_pset has not finished initializing at boot */
4862 		pset_unlock(locked_pset);
4863 		return NULL;
4864 	}
4865 
4866 	locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
4867 	if (sched_edge_pset_available(selected_pset) == true) {
4868 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_OVERLOAD) | DBG_FUNC_NONE, thread_tid(thread), preferred_pset->pset_cluster_id, selected_pset->pset_cluster_id, preferred_cluster_load);
4869 		return selected_pset;
4870 	}
4871 	/* Looks like selected_pset is not available for scheduling; remove it from candidate_cluster_bitmap */
4872 	bitmap_clear(&candidate_cluster_bitmap, selected_pset->pset_cluster_id);
4873 	if (__improbable(bitmap_first(&candidate_cluster_bitmap, sched_edge_max_clusters) == -1)) {
4874 		pset_unlock(locked_pset);
4875 		return NULL;
4876 	}
4877 	/* Try and find an alternative for the selected pset */
4878 	selected_pset = sched_edge_candidate_alternative(selected_pset, candidate_cluster_bitmap);
4879 	goto migrate_candidate_available_check;
4880 }
4881 
4882 static processor_t
4883 sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread)
4884 {
4885 	/* Bound threads don't call this function */
4886 	assert(thread->bound_processor == PROCESSOR_NULL);
4887 	processor_t chosen_processor = PROCESSOR_NULL;
4888 
4889 	/*
4890 	 * sched_edge_preferred_pset() returns the preferred pset for a given thread.
4891 	 * It should take the passed-in "pset" as a hint, which represents the recency metric for
4892 	 * pset selection logic.
4893 	 */
4894 	processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
4895 	processor_set_t chosen_pset = preferred_pset;
4896 	/*
4897 	 * If the preferred pset is overloaded, find a pset which is the best candidate to migrate
4898 	 * threads to. sched_edge_migrate_candidate() returns the preferred pset
4899 	 * if it has capacity; otherwise finds the best candidate pset to migrate this thread to.
4900 	 *
4901 	 * Edge Scheduler Optimization
4902 	 * It might be useful to build a recency metric for the thread for multiple clusters and
4903 	 * factor that into the migration decisions.
4904 	 */
4905 	chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, pset, true);
4906 	if (chosen_pset) {
4907 		chosen_processor = choose_processor(chosen_pset, processor, thread);
4908 	}
4909 	/* For RT threads, choose_processor() can return a different cluster than the one passed into it */
4910 	assert(chosen_processor ? chosen_processor->processor_set->pset_type == chosen_pset->pset_type : true);
4911 	return chosen_processor;
4912 }
4913 
4914 /*
4915  * sched_edge_clutch_bucket_threads_drain()
4916  *
4917  * Drains all the runnable threads which are not restricted to the root_clutch (due to clutch
4918  * bucket overrides etc.) into a local thread queue.
4919  */
4920 static void
4921 sched_edge_clutch_bucket_threads_drain(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch, queue_t clutch_threads)
4922 {
4923 	thread_t thread = THREAD_NULL;
4924 	uint64_t current_timestamp = mach_approximate_time();
4925 	qe_foreach_element_safe(thread, &clutch_bucket->scb_thread_timeshare_queue, th_clutch_timeshare_link) {
4926 		sched_clutch_thread_remove(root_clutch, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
4927 		enqueue_tail(clutch_threads, &thread->runq_links);
4928 	}
4929 }
4930 
4931 /*
4932  * sched_edge_run_drained_threads()
4933  *
4934  * Makes all drained threads in a local queue runnable.
4935  */
4936 static void
4937 sched_edge_run_drained_threads(queue_t clutch_threads)
4938 {
4939 	thread_t thread;
4940 	/* Now setrun all the threads in the local queue */
4941 	qe_foreach_element_safe(thread, clutch_threads, runq_links) {
4942 		remqueue(&thread->runq_links);
4943 		thread_lock(thread);
4944 		thread_setrun(thread, SCHED_TAILQ);
4945 		thread_unlock(thread);
4946 	}
4947 }
4948 
4949 /*
4950  * sched_edge_update_preferred_cluster()
4951  *
4952  * Routine to update the preferred cluster for QoS buckets within a thread group.
4953  * The buckets to be updated are specified as a bitmap (clutch_bucket_modify_bitmap).
4954  */
4955 static void
4956 sched_edge_update_preferred_cluster(
4957 	sched_clutch_t sched_clutch,
4958 	bitmap_t *clutch_bucket_modify_bitmap,
4959 	uint32_t *tg_bucket_preferred_cluster)
4960 {
4961 	for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
4962 		os_atomic_store(&sched_clutch->sc_clutch_groups[bucket].scbg_preferred_cluster, tg_bucket_preferred_cluster[bucket], relaxed);
4963 	}
4964 }
4965 
4966 /*
4967  * sched_edge_migrate_thread_group_runnable_threads()
4968  *
4969  * Routine to implement the migration of threads on a cluster when the thread group
4970  * recommendation is updated. The migration works using a 2-phase
4971  * algorithm.
4972  *
4973  * Phase 1: With the pset lock held, check the recommendation of the clutch buckets.
4974  * For each clutch bucket, if it needs to be migrated immediately, drain the threads
4975  * into a local thread queue. Otherwise mark the clutch bucket as native/foreign as
4976  * appropriate.
4977  *
4978  * Phase 2: After unlocking the pset, drain all the threads from the local thread
4979  * queue and mark them runnable which should land them in the right hierarchy.
4980  *
4981  * The routine assumes that the preferences for the clutch buckets/clutch bucket
4982  * groups have already been updated by the caller.
4983  *
4984  * - Called with the pset locked and interrupts disabled.
4985  * - Returns with the pset unlocked.
4986  */
4987 static void
4988 sched_edge_migrate_thread_group_runnable_threads(
4989 	sched_clutch_t sched_clutch,
4990 	sched_clutch_root_t root_clutch,
4991 	bitmap_t *clutch_bucket_modify_bitmap,
4992 	__unused uint32_t *tg_bucket_preferred_cluster,
4993 	bool migrate_immediately)
4994 {
4995 	/* Queue to hold threads that have been drained from clutch buckets to be migrated */
4996 	queue_head_t clutch_threads;
4997 	queue_init(&clutch_threads);
4998 
4999 	for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
5000 		/* Get the clutch bucket for this cluster and sched bucket */
5001 		sched_clutch_bucket_group_t clutch_bucket_group = &(sched_clutch->sc_clutch_groups[bucket]);
5002 		sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
5003 		sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
5004 		if (scb_root == NULL) {
5005 			/* Clutch bucket not runnable or already in the right hierarchy; nothing to do here */
5006 			assert(clutch_bucket->scb_thr_count == 0);
5007 			continue;
5008 		}
5009 		assert(scb_root == root_clutch);
5010 		uint32_t clutch_bucket_preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
5011 
5012 		if (migrate_immediately) {
5013 			/*
5014 			 * For transitions where threads need to be migrated immediately, drain the threads into a
5015 			 * local queue unless we are looking at the clutch buckets for the newly recommended
5016 			 * cluster.
5017 			 */
5018 			if (root_clutch->scr_cluster_id != clutch_bucket_preferred_cluster) {
5019 				sched_edge_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads);
5020 			} else {
5021 				sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
5022 			}
5023 		} else {
5024 			/* Check if this cluster is the same type as the newly recommended cluster */
5025 			boolean_t homogeneous_cluster = (pset_type_for_id(root_clutch->scr_cluster_id) == pset_type_for_id(clutch_bucket_preferred_cluster));
5026 			/*
5027 			 * If threads do not have to be migrated immediately, just change the native/foreign
5028 			 * flag on the clutch bucket.
5029 			 */
5030 			if (homogeneous_cluster) {
5031 				sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
5032 			} else {
5033 				sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
5034 			}
5035 		}
5036 	}
5037 
5038 	pset_unlock(root_clutch->scr_pset);
5039 	sched_edge_run_drained_threads(&clutch_threads);
5040 }
5041 
5042 /*
5043  * sched_edge_migrate_thread_group_running_threads()
5044  *
5045  * Routine to find all running threads of a thread group on a specific cluster
5046  * and IPI them if they need to be moved immediately.
5047  */
5048 static void
5049 sched_edge_migrate_thread_group_running_threads(
5050 	sched_clutch_t sched_clutch,
5051 	sched_clutch_root_t root_clutch,
5052 	__unused bitmap_t *clutch_bucket_modify_bitmap,
5053 	uint32_t *tg_bucket_preferred_cluster,
5054 	bool migrate_immediately)
5055 {
5056 	if (migrate_immediately == false) {
5057 		/* If CLPC has recommended not to move threads immediately, nothing to do here */
5058 		return;
5059 	}
5060 
5061 	/*
5062 	 * Edge Scheduler Optimization
5063 	 *
5064 	 * When the system has a large number of clusters and cores, it might be useful to
5065 	 * narrow down the iteration by using a thread running bitmap per clutch.
5066 	 */
5067 	uint64_t ast_processor_map = 0;
5068 	sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
5069 
5070 	uint64_t running_map = root_clutch->scr_pset->cpu_state_map[PROCESSOR_RUNNING];
5071 	/*
5072 	 * Iterate all CPUs and look for the ones running threads from this thread group and are
5073 	 * not restricted to the specific cluster (due to overrides etc.)
5074 	 */
5075 	for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
5076 		processor_t src_processor = processor_array[cpuid];
5077 		boolean_t expected_tg = (src_processor->current_thread_group == sched_clutch->sc_tg);
5078 		sched_bucket_t processor_sched_bucket = src_processor->processor_set->cpu_running_buckets[cpuid];
5079 		if (processor_sched_bucket == TH_BUCKET_SCHED_MAX) {
5080 			continue;
5081 		}
5082 		boolean_t non_preferred_cluster = tg_bucket_preferred_cluster[processor_sched_bucket] != root_clutch->scr_cluster_id;
5083 
5084 		if (expected_tg && non_preferred_cluster) {
5085 			ipi_type[cpuid] = sched_ipi_action(src_processor, NULL, SCHED_IPI_EVENT_REBALANCE);
5086 			if (ipi_type[cpuid] != SCHED_IPI_NONE) {
5087 				bit_set(ast_processor_map, cpuid);
5088 			} else if (src_processor == current_processor()) {
5089 				bit_set(root_clutch->scr_pset->pending_AST_PREEMPT_cpu_mask, cpuid);
5090 				ast_t new_preempt = update_pending_nonurgent_preemption(src_processor, AST_PREEMPT);
5091 				ast_on(new_preempt);
5092 			}
5093 		}
5094 	}
5095 
5096 	/* Perform all the IPIs */
5097 	if (bit_first(ast_processor_map) != -1) {
5098 		for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
5099 			processor_t ast_processor = processor_array[cpuid];
5100 			sched_ipi_perform(ast_processor, ipi_type[cpuid]);
5101 		}
5102 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, thread_group_get_id(sched_clutch->sc_tg), ast_processor_map, 0, 0);
5103 	}
5104 }
5105 
5106 /*
5107  * sched_edge_tg_preferred_cluster_change()
5108  *
5109  * Routine to handle changes to a thread group's recommendation. In the Edge Scheduler, the preferred cluster
5110  * is specified on a per-QoS basis within a thread group. The routine updates the preferences and performs
5111  * thread migrations based on the policy specified by CLPC.
5112  * tg_bucket_preferred_cluster is an array of size TH_BUCKET_SCHED_MAX which specifies the new preferred cluster
5113  * for each QoS within the thread group.
5114  */
5115 void
5116 sched_edge_tg_preferred_cluster_change(struct thread_group *tg, uint32_t *tg_bucket_preferred_cluster, sched_perfcontrol_preferred_cluster_options_t options)
5117 {
5118 	sched_clutch_t clutch = sched_clutch_for_thread_group(tg);
5119 	/*
5120 	 * In order to optimize the processing, create a bitmap which represents all QoS buckets
5121 	 * for which the preferred cluster has changed.
5122 	 */
5123 	bitmap_t clutch_bucket_modify_bitmap[BITMAP_LEN(TH_BUCKET_SCHED_MAX)] = {0};
5124 	for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
5125 		uint32_t old_preferred_cluster = sched_edge_clutch_bucket_group_preferred_cluster(&clutch->sc_clutch_groups[bucket]);
5126 		uint32_t new_preferred_cluster = tg_bucket_preferred_cluster[bucket];
5127 		if (old_preferred_cluster != new_preferred_cluster) {
5128 			bitmap_set(clutch_bucket_modify_bitmap, bucket);
5129 		}
5130 	}
5131 	if (bitmap_lsb_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
5132 		/* No changes in any clutch buckets; nothing to do here */
5133 		return;
5134 	}
5135 
5136 	for (uint32_t cluster_id = 0; cluster_id < sched_edge_max_clusters; cluster_id++) {
5137 		processor_set_t pset = pset_array[cluster_id];
5138 		spl_t s = splsched();
5139 		pset_lock(pset);
5140 		/*
5141 		 * The first operation is to update the preferred cluster for all QoS buckets within the
5142 		 * thread group so that any future threads becoming runnable would see the new preferred
5143 		 * cluster value.
5144 		 */
5145 		sched_edge_update_preferred_cluster(clutch, clutch_bucket_modify_bitmap, tg_bucket_preferred_cluster);
5146 		/*
5147 		 * Currently iterates all clusters looking for running threads for a TG to be migrated. Can be optimized
5148 		 * by keeping a per-clutch bitmap of clusters running threads for a particular TG.
5149 		 *
5150 		 * Edge Scheduler Optimization
5151 		 */
5152 		/* Migrate all running threads of the TG on this cluster based on options specified by CLPC */
5153 		sched_edge_migrate_thread_group_running_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
5154 		    tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNING));
5155 		/* Migrate all runnable threads of the TG in this cluster's hierarchy based on options specified by CLPC */
5156 		sched_edge_migrate_thread_group_runnable_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
5157 		    tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNABLE));
5158 		/* sched_edge_migrate_thread_group_runnable_threads() returns with pset unlocked */
5159 		splx(s);
5160 	}
5161 }
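
/*
 * Illustrative sketch (standalone, simplified): the change-detection step in
 * sched_edge_tg_preferred_cluster_change() amounts to diffing the old and new
 * per-bucket preferred clusters into a bitmap of buckets that actually moved.
 * A plain-C equivalent over small arrays indexed by sched bucket:
 */
static inline uint64_t
sketch_changed_bucket_mask(const uint32_t *old_pref, const uint32_t *new_pref, unsigned int nbuckets)
{
	uint64_t changed = 0;
	for (unsigned int bucket = 0; bucket < nbuckets; bucket++) {
		if (old_pref[bucket] != new_pref[bucket]) {
			changed |= 1ULL << bucket;
		}
	}
	return changed;
}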
5162 
5163 /*
5164  * sched_edge_pset_made_schedulable()
5165  *
5166  * Routine to migrate all the clutch buckets which are not in their recommended
5167  * pset hierarchy now that a new pset has become runnable. It's possible that this
5168  * routine is called when the pset is already marked schedulable.
5169  *
5170  * Invoked with the pset lock held and interrupts disabled.
5171  */
5172 static void
5173 sched_edge_pset_made_schedulable(__unused processor_t processor, processor_set_t dst_pset, boolean_t drop_lock)
5174 {
5175 	if (bitmap_test(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id)) {
5176 		/* Nothing to do here since pset is already marked schedulable */
5177 		if (drop_lock) {
5178 			pset_unlock(dst_pset);
5179 		}
5180 		return;
5181 	}
5182 
5183 	bitmap_set(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id);
5184 
5185 	thread_t thread = sched_edge_processor_idle(dst_pset);
5186 	if (thread != THREAD_NULL) {
5187 		thread_lock(thread);
5188 		thread_setrun(thread, SCHED_TAILQ);
5189 		thread_unlock(thread);
5190 	}
5191 
5192 	if (!drop_lock) {
5193 		pset_lock(dst_pset);
5194 	}
5195 }
5196 
5197 /*
5198  * sched_edge_cpu_init_completed()
5199  *
5200  * Callback routine from the platform layer once all CPUs/clusters have been initialized. This
5201  * provides an opportunity for the edge scheduler to initialize all the edge parameters.
5202  */
5203 static void
5204 sched_edge_cpu_init_completed(void)
5205 {
5206 	spl_t s = splsched();
5207 	for (int src_cluster_id = 0; src_cluster_id < sched_edge_max_clusters; src_cluster_id++) {
5208 		processor_set_t src_pset = pset_array[src_cluster_id];
5209 		pset_lock(src_pset);
5210 
5211 		/* For each cluster, set all its outgoing edge parameters */
5212 		for (int dst_cluster_id = 0; dst_cluster_id < sched_edge_max_clusters; dst_cluster_id++) {
5213 			if (dst_cluster_id == src_cluster_id) {
5214 				continue;
5215 			}
5216 			processor_set_t dst_pset = pset_array[dst_cluster_id];
5217 			if (src_pset->pset_type == dst_pset->pset_type) {
5218 				/* P->P/E->E edge config */
5219 				bitmap_clear(src_pset->foreign_psets, dst_cluster_id);
5220 				bitmap_set(src_pset->native_psets, dst_cluster_id);
5221 				sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 1, .sce_steal_allowed = 1});
5222 			} else if ((src_pset->pset_type == CLUSTER_TYPE_P) && (dst_pset->pset_type == CLUSTER_TYPE_E)) {
5223 				/* P->E edge config */
5224 				bitmap_set(src_pset->foreign_psets, dst_cluster_id);
5225 				bitmap_clear(src_pset->native_psets, dst_cluster_id);
5226 				sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 64, .sce_migration_allowed = 1, .sce_steal_allowed = 1});
5227 			} else {
5228 				/* E->P edge config */
5229 				bitmap_set(src_pset->foreign_psets, dst_cluster_id);
5230 				bitmap_clear(src_pset->native_psets, dst_cluster_id);
5231 				sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0});
5232 			}
5233 			bool clusters_local = (ml_get_die_id(src_cluster_id) == ml_get_die_id(dst_cluster_id));
5234 			if (clusters_local) {
5235 				bitmap_set(src_pset->local_psets, dst_cluster_id);
5236 				bitmap_clear(src_pset->remote_psets, dst_cluster_id);
5237 			} else {
5238 				bitmap_set(src_pset->remote_psets, dst_cluster_id);
5239 				bitmap_clear(src_pset->local_psets, dst_cluster_id);
5240 			}
5241 		}
5242 
5243 		pset_unlock(src_pset);
5244 	}
5245 	splx(s);
5246 }
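
/*
 * Illustrative sketch (standalone, simplified types; not the kernel
 * structures): the boot-time defaults programmed above are keyed purely by
 * cluster type. Same-type edges allow migration and steal with weight 0,
 * P->E edges allow both with weight 64, and E->P edges start out disabled.
 */
struct sketch_edge_config {
	uint32_t weight;
	bool     migration_allowed;
	bool     steal_allowed;
};

static inline struct sketch_edge_config
sketch_default_edge_config(bool src_is_pcore, bool dst_is_pcore)
{
	if (src_is_pcore == dst_is_pcore) {
		/* P->P / E->E */
		return (struct sketch_edge_config){ .weight = 0, .migration_allowed = true, .steal_allowed = true };
	}
	if (src_is_pcore) {
		/* P->E */
		return (struct sketch_edge_config){ .weight = 64, .migration_allowed = true, .steal_allowed = true };
	}
	/* E->P: disabled by default */
	return (struct sketch_edge_config){ .weight = 0, .migration_allowed = false, .steal_allowed = false };
}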
5247 
5248 static bool
5249 sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset)
5250 {
5251 	uint32_t preferred_cluster_id = sched_edge_thread_preferred_cluster(thread);
5252 	if (preferred_cluster_id == pset->pset_cluster_id) {
5253 		return true;
5254 	} else {
5255 		processor_set_t preferred_pset = pset_array[preferred_cluster_id];
5256 		return preferred_pset->sched_edges[pset->pset_cluster_id].sce_migration_allowed;
5257 	}
5258 }
5259 
5260 extern int sched_amp_spill_deferred_ipi;
5261 extern int sched_amp_pcores_preempt_immediate_ipi;
5262 
5263 int sched_edge_migrate_ipi_immediate = 1;
5264 
5265 sched_ipi_type_t
5266 sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
5267 {
5268 	processor_set_t pset = dst->processor_set;
5269 	assert(dst != current_processor());
5270 
5271 	boolean_t deferred_ipi_supported = false;
5272 #if defined(CONFIG_SCHED_DEFERRED_AST)
5273 	deferred_ipi_supported = true;
5274 #endif /* CONFIG_SCHED_DEFERRED_AST */
5275 
5276 	switch (event) {
5277 	case SCHED_IPI_EVENT_SPILL:
5278 		/* For Spill event, use deferred IPIs if sched_amp_spill_deferred_ipi set */
5279 		if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
5280 			return sched_ipi_deferred_policy(pset, dst, thread, event);
5281 		}
5282 		break;
5283 	case SCHED_IPI_EVENT_PREEMPT:
5284 		/* For preemption, the default policy is to use deferred IPIs
5285 		 * for Non-RT P-core preemption. Override that behavior if
5286 		 * sched_amp_pcores_preempt_immediate_ipi is set
5287 		 */
5288 		if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
5289 			if (sched_amp_pcores_preempt_immediate_ipi && (pset_type_for_id(pset->pset_cluster_id) == CLUSTER_TYPE_P)) {
5290 				return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
5291 			}
5292 			if (sched_edge_migrate_ipi_immediate) {
5293 				processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
5294 				/*
5295 				 * For IPI'ing CPUs that are homogeneous with the preferred cluster, use immediate IPIs
5296 				 */
5297 				if (preferred_pset->pset_type == pset->pset_type) {
5298 					return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
5299 				}
5300 				/*
5301 				 * For workloads that are going wide, it might be useful to use an immediate IPI to
5302 				 * wake up the idle CPU if the scheduler estimates that the preferred pset will
5303 				 * be busy for the deferred IPI timeout. The Edge Scheduler uses the avg execution
5304 				 * latency on the preferred pset as an estimate of busyness.
5305 				 */
5306 				if ((preferred_pset->pset_execution_time[thread->th_sched_bucket].pset_avg_thread_execution_time * NSEC_PER_USEC) >= ml_cpu_signal_deferred_get_timer()) {
5307 					return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
5308 				}
5309 			}
5310 		}
5311 		break;
5312 	default:
5313 		break;
5314 	}
5315 	/* Default back to the global policy for all other scenarios */
5316 	return sched_ipi_policy(dst, thread, dst_idle, event);
5317 }
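
/*
 * Illustrative sketch (standalone, simplified; parameter names are
 * illustrative, not kernel APIs): for non-RT preemption the policy above
 * prefers an immediate IPI when the target CPU is homogeneous with the
 * thread's preferred cluster, or when the preferred cluster's average thread
 * execution time suggests it will stay busy past the deferred-IPI timeout.
 */
static inline bool
sketch_prefer_immediate_ipi(bool dst_same_type_as_preferred,
    uint64_t preferred_avg_execution_time_us, uint64_t deferred_ipi_timeout_ns)
{
	if (dst_same_type_as_preferred) {
		return true;
	}
	/* Compare the busyness estimate (usecs) against the deferred timeout (nsecs) */
	return (preferred_avg_execution_time_us * 1000) >= deferred_ipi_timeout_ns;
}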
5318 
5319 /*
5320  * sched_edge_qos_max_parallelism()
5321  */
5322 uint32_t
5323 sched_edge_qos_max_parallelism(int qos, uint64_t options)
5324 {
5325 	uint32_t ecpu_count = ml_get_cpu_number_type(CLUSTER_TYPE_E, false, false);
5326 	uint32_t pcpu_count = ml_get_cpu_number_type(CLUSTER_TYPE_P, false, false);
5327 	uint32_t ecluster_count = ml_get_cluster_number_type(CLUSTER_TYPE_E);
5328 	uint32_t pcluster_count = ml_get_cluster_number_type(CLUSTER_TYPE_P);
5329 
5330 
5331 	if (options & QOS_PARALLELISM_REALTIME) {
5332 		/* For realtime threads on AMP, limit the width
5333 		 * to just the P-cores since we do not
5334 		 * spill/rebalance RT threads.
5335 		 */
5336 		return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? pcluster_count : pcpu_count;
5337 	}
5338 
5339 	/*
5340 	 * The Edge scheduler supports per-QoS recommendations for thread groups.
5341 	 * This enables lower QoS buckets (such as UT) to be scheduled on all
5342 	 * CPUs on the system.
5343 	 *
5344 	 * The only restriction is for BG/Maintenance QoS classes for which the
5345 	 * performance controller would never recommend execution on the P-cores.
5346 	 * If that policy changes in the future, this value should be changed.
5347 	 */
5348 	switch (qos) {
5349 	case THREAD_QOS_BACKGROUND:
5350 	case THREAD_QOS_MAINTENANCE:
5351 		return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? ecluster_count : ecpu_count;
5352 	default:
5353 		return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? (ecluster_count + pcluster_count) : (ecpu_count + pcpu_count);
5354 	}
5355 }
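
/*
 * Illustrative sketch (standalone, simplified): for non-realtime requests,
 * the recommendation above collapses to "E-only width for BG/MT, full-machine
 * width for everything else", with cluster counts substituted for CPU counts
 * when the caller asks about a cluster-level shared resource. For example, on
 * a hypothetical 4E+4P system a UT workload is told a width of 8 and a BG
 * workload a width of 4.
 */
static inline uint32_t
sketch_qos_width(bool background_qos, bool cluster_shared_rsrc,
    uint32_t ecpus, uint32_t pcpus, uint32_t eclusters, uint32_t pclusters)
{
	if (cluster_shared_rsrc) {
		return background_qos ? eclusters : (eclusters + pclusters);
	}
	return background_qos ? ecpus : (ecpus + pcpus);
}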
5356 
5357 
5358 
5359 #endif /* CONFIG_SCHED_EDGE */
5360 
5361 #endif /* CONFIG_SCHED_CLUTCH */
5362