xref: /xnu-8796.141.3/osfmk/kern/sched_clutch.c (revision 1b191cb58250d0705d8a51287127505aa4bc0789)
1 /*
2  * Copyright (c) 2018 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <mach/mach_types.h>
30 #include <mach/machine.h>
31 #include <machine/machine_routines.h>
32 #include <machine/sched_param.h>
33 #include <machine/machine_cpu.h>
34 #include <kern/kern_types.h>
35 #include <kern/debug.h>
36 #include <kern/machine.h>
37 #include <kern/misc_protos.h>
38 #include <kern/processor.h>
39 #include <kern/queue.h>
40 #include <kern/sched.h>
41 #include <kern/sched_prim.h>
42 #include <kern/task.h>
43 #include <kern/thread.h>
44 #include <kern/sched_clutch.h>
45 #include <machine/atomic.h>
46 #include <kern/sched_clutch.h>
47 #include <sys/kdebug.h>
48 
49 #if CONFIG_SCHED_EDGE
50 #include <kern/sched_amp_common.h>
51 #endif /* CONFIG_SCHED_EDGE */
52 
53 #if CONFIG_SCHED_CLUTCH
54 
55 /* Forward declarations of static routines */
56 
57 /* Root level hierarchy management */
58 static void sched_clutch_root_init(sched_clutch_root_t, processor_set_t);
59 static void sched_clutch_root_bucket_init(sched_clutch_root_bucket_t, sched_bucket_t, bool);
60 static void sched_clutch_root_pri_update(sched_clutch_root_t);
61 static void sched_clutch_root_urgency_inc(sched_clutch_root_t, thread_t);
62 static void sched_clutch_root_urgency_dec(sched_clutch_root_t, thread_t);
63 
64 __enum_decl(sched_clutch_highest_root_bucket_type_t, uint32_t, {
65 	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_NONE           = 0,
66 	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY   = 1,
67 	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL            = 2,
68 });
69 
70 static sched_clutch_root_bucket_t sched_clutch_root_highest_root_bucket(sched_clutch_root_t, uint64_t, sched_clutch_highest_root_bucket_type_t);
71 
72 #if CONFIG_SCHED_EDGE
73 /* Support for foreign threads on AMP platforms */
74 static boolean_t sched_clutch_root_foreign_empty(sched_clutch_root_t);
75 static thread_t sched_clutch_root_highest_foreign_thread_remove(sched_clutch_root_t);
76 #endif /* CONFIG_SCHED_EDGE */
77 
78 /* Root bucket level hierarchy management */
79 static uint64_t sched_clutch_root_bucket_deadline_calculate(sched_clutch_root_bucket_t, uint64_t);
80 static void sched_clutch_root_bucket_deadline_update(sched_clutch_root_bucket_t, sched_clutch_root_t, uint64_t);
81 
82 /* Options for clutch bucket ordering in the runq */
83 __options_decl(sched_clutch_bucket_options_t, uint32_t, {
84 	SCHED_CLUTCH_BUCKET_OPTIONS_NONE        = 0x0,
85 	/* Round robin clutch bucket on thread removal */
86 	SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR  = 0x1,
87 	/* Insert clutch bucket at head (for thread preemption) */
88 	SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ       = 0x2,
89 	/* Insert clutch bucket at tail (default) */
90 	SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ       = 0x4,
91 });
92 
93 /* Clutch bucket level hierarchy management */
94 static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
95 static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
96 static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
97 static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
98 static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
99 static uint8_t sched_clutch_bucket_pri_calculate(sched_clutch_bucket_t, uint64_t);
100 
101 /* Clutch bucket group level properties management */
102 static void sched_clutch_bucket_group_cpu_usage_update(sched_clutch_bucket_group_t, uint64_t);
103 static void sched_clutch_bucket_group_cpu_adjust(sched_clutch_bucket_group_t, uint8_t);
104 static void sched_clutch_bucket_group_timeshare_update(sched_clutch_bucket_group_t, sched_clutch_bucket_t, uint64_t);
105 static uint8_t sched_clutch_bucket_group_pending_ageout(sched_clutch_bucket_group_t, uint64_t);
106 static uint32_t sched_clutch_bucket_group_run_count_inc(sched_clutch_bucket_group_t);
107 static uint32_t sched_clutch_bucket_group_run_count_dec(sched_clutch_bucket_group_t);
108 static uint8_t sched_clutch_bucket_group_interactivity_score_calculate(sched_clutch_bucket_group_t, uint64_t);
109 
110 /* Clutch timeshare properties updates */
111 static uint32_t sched_clutch_run_bucket_incr(sched_clutch_t, sched_bucket_t);
112 static uint32_t sched_clutch_run_bucket_decr(sched_clutch_t, sched_bucket_t);
113 
114 /* Clutch membership management */
115 static boolean_t sched_clutch_thread_insert(sched_clutch_root_t, thread_t, integer_t);
116 static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t, sched_clutch_bucket_options_t);
117 static thread_t sched_clutch_thread_highest_remove(sched_clutch_root_t);
118 
119 /* Clutch properties updates */
120 static uint32_t sched_clutch_root_urgency(sched_clutch_root_t);
121 static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t);
122 static int sched_clutch_root_priority(sched_clutch_root_t);
123 static sched_clutch_bucket_t sched_clutch_root_bucket_highest_clutch_bucket(sched_clutch_root_bucket_t);
124 static boolean_t sched_thread_sched_pri_promoted(thread_t);
125 
126 #if CONFIG_SCHED_EDGE
127 /* System based routines */
128 static bool sched_edge_pset_available(processor_set_t);
129 static uint32_t sched_edge_thread_bound_cluster_id(thread_t);
130 static int sched_edge_iterate_clusters_ordered(processor_set_t, uint64_t, int);
131 
132 /* Global indicating the maximum number of clusters on the current platform */
133 static int sched_edge_max_clusters = 0;
134 #endif /* CONFIG_SCHED_EDGE */
135 
136 /* Helper debugging routines */
137 static inline void sched_clutch_hierarchy_locked_assert(sched_clutch_root_t);
138 
139 extern processor_set_t pset_array[MAX_PSETS];
140 
141 /*
142  * Special markers for buckets that have invalid WCELs/quantums etc.
143  */
144 #define SCHED_CLUTCH_INVALID_TIME_32 ((uint32_t)~0)
145 #define SCHED_CLUTCH_INVALID_TIME_64 ((uint64_t)~0)
146 
147 /*
148  * Root level bucket WCELs
149  *
150  * The root level bucket selection algorithm is an Earliest Deadline
151  * First (EDF) algorithm where the deadline for a bucket is defined
152  * by the worst-case-execution-latency and the make runnable timestamp
153  * for the bucket.
154  *
155  */
156 static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = {
157 	SCHED_CLUTCH_INVALID_TIME_32,                   /* FIXPRI */
158 	0,                                              /* FG */
159 	37500,                                          /* IN (37.5ms) */
160 	75000,                                          /* DF (75ms) */
161 	150000,                                         /* UT (150ms) */
162 	250000                                          /* BG (250ms) */
163 };
164 static uint64_t sched_clutch_root_bucket_wcel[TH_BUCKET_SCHED_MAX] = {0};
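/*
 * Worked example of the EDF ordering (derived from the WCEL table above): if a
 * UT root bucket became runnable 100ms ago and an IN root bucket becomes
 * runnable now at time T, their deadlines are (T - 100ms) + 150ms = T + 50ms
 * and T + 37.5ms respectively, so the IN bucket is selected first. The FG
 * bucket has a WCEL of 0, i.e. its deadline is simply the timestamp at which
 * it was made runnable (or last updated).
 */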
165 
166 /*
167  * Root level bucket warp
168  *
169  * Each root level bucket has a warp value associated with it as well.
170  * The warp value allows the root bucket to effectively warp ahead of
171  * lower priority buckets for a limited time even if it has a later
172  * deadline. The warping behavior provides extra (but limited)
173  * opportunity for high priority buckets to remain responsive.
174  */
175 
176 /* Special warp deadline value to indicate that the bucket has not used any warp yet */
177 #define SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED    (SCHED_CLUTCH_INVALID_TIME_64)
178 
179 /* Warp window durations for various tiers */
180 static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = {
181 	SCHED_CLUTCH_INVALID_TIME_32,                   /* FIXPRI */
182 	8000,                                           /* FG (8ms)*/
183 	4000,                                           /* IN (4ms) */
184 	2000,                                           /* DF (2ms) */
185 	1000,                                           /* UT (1ms) */
186 	0                                               /* BG (0ms) */
187 };
188 static uint64_t sched_clutch_root_bucket_warp[TH_BUCKET_SCHED_MAX] = {0};
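/*
 * For example, with the values above a runnable FG root bucket may warp ahead
 * of a DF or UT root bucket that holds an earlier deadline for up to 8ms of
 * wall time. Once that warp window is exhausted, selection falls back to the
 * pure EDF order until the FG bucket is selected naturally again, at which
 * point its warp is replenished (see sched_clutch_root_highest_root_bucket()).
 */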
189 
190 /*
191  * Thread level quantum
192  *
193  * The algorithm defines quantums for threads at various buckets. This
194  * (combined with the root level bucket quantums) restricts how much
195  * the lower priority levels can preempt the higher priority threads.
196  */
197 
198 #if XNU_TARGET_OS_OSX
199 static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
200 	10000,                                          /* FIXPRI (10ms) */
201 	10000,                                          /* FG (10ms) */
202 	10000,                                          /* IN (10ms) */
203 	10000,                                          /* DF (10ms) */
204 	4000,                                           /* UT (4ms) */
205 	2000                                            /* BG (2ms) */
206 };
207 #else /* XNU_TARGET_OS_OSX */
208 static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
209 	10000,                                          /* FIXPRI (10ms) */
210 	10000,                                          /* FG (10ms) */
211 	8000,                                           /* IN (8ms) */
212 	6000,                                           /* DF (6ms) */
213 	4000,                                           /* UT (4ms) */
214 	2000                                            /* BG (2ms) */
215 };
216 #endif /* XNU_TARGET_OS_OSX */
217 
218 static uint64_t sched_clutch_thread_quantum[TH_BUCKET_SCHED_MAX] = {0};
219 
220 /*
221  * sched_clutch_us_to_abstime()
222  *
223  * Initializer for converting all durations in usec to abstime
224  */
225 static void
226 sched_clutch_us_to_abstime(uint32_t *us_vals, uint64_t *abstime_vals)
227 {
228 	for (int i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
229 		if (us_vals[i] == SCHED_CLUTCH_INVALID_TIME_32) {
230 			abstime_vals[i] = SCHED_CLUTCH_INVALID_TIME_64;
231 		} else {
232 			clock_interval_to_absolutetime_interval(us_vals[i],
233 			    NSEC_PER_USEC, &abstime_vals[i]);
234 		}
235 	}
236 }
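/*
 * The scheduler init path (not part of this excerpt) is expected to convert
 * each of the tables above exactly once at boot, e.g.:
 *
 *	sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
 *	sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
 *	sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
 */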
237 
238 /* Clutch/Edge Scheduler Debugging support */
239 #define SCHED_CLUTCH_DBG_THR_COUNT_PACK(a, b, c)        ((uint64_t)c | ((uint64_t)b << 16) | ((uint64_t)a << 32))
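/*
 * The pack macro above encodes three 16-bit counts into a single 64-bit
 * tracepoint argument: c occupies bits [15:0], b bits [31:16] and a bits [47:32].
 */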
240 
241 #if DEVELOPMENT || DEBUG
242 
243 /*
244  * sched_clutch_hierarchy_locked_assert()
245  *
246  * Debugging helper routine. Asserts that the hierarchy is locked. The locking
247  * for the hierarchy depends on where the hierarchy is hooked. The current
248  * implementation hooks the hierarchy at the pset, so the hierarchy is locked
249  * using the pset lock.
250  */
251 static inline void
252 sched_clutch_hierarchy_locked_assert(
253 	sched_clutch_root_t root_clutch)
254 {
255 	pset_assert_locked(root_clutch->scr_pset);
256 }
257 
258 #else /* DEVELOPMENT || DEBUG */
259 
260 static inline void
261 sched_clutch_hierarchy_locked_assert(
262 	__unused sched_clutch_root_t root_clutch)
263 {
264 }
265 
266 #endif /* DEVELOPMENT || DEBUG */
267 
268 /*
269  * sched_clutch_thr_count_inc()
270  *
271  * Increment thread count at a hierarchy level with overflow checks.
272  */
273 static void
274 sched_clutch_thr_count_inc(
275 	uint16_t *thr_count)
276 {
277 	if (__improbable(os_inc_overflow(thr_count))) {
278 		panic("sched_clutch thread count overflowed!");
279 	}
280 }
281 
282 /*
283  * sched_clutch_thr_count_dec()
284  *
285  * Decrement thread count at a hierarchy level with underflow checks.
286  */
287 static void
288 sched_clutch_thr_count_dec(
289 	uint16_t *thr_count)
290 {
291 	if (__improbable(os_dec_overflow(thr_count))) {
292 		panic("sched_clutch thread count underflowed!");
293 	}
294 }
295 
296 static sched_bucket_t
297 sched_convert_pri_to_bucket(uint8_t priority)
298 {
299 	sched_bucket_t bucket = TH_BUCKET_RUN;
300 
301 	if (priority > BASEPRI_USER_INITIATED) {
302 		bucket = TH_BUCKET_SHARE_FG;
303 	} else if (priority > BASEPRI_DEFAULT) {
304 		bucket = TH_BUCKET_SHARE_IN;
305 	} else if (priority > BASEPRI_UTILITY) {
306 		bucket = TH_BUCKET_SHARE_DF;
307 	} else if (priority > MAXPRI_THROTTLE) {
308 		bucket = TH_BUCKET_SHARE_UT;
309 	} else {
310 		bucket = TH_BUCKET_SHARE_BG;
311 	}
312 	return bucket;
313 }
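/*
 * With the usual xnu base priority values (BASEPRI_USER_INITIATED = 37,
 * BASEPRI_DEFAULT = 31, BASEPRI_UTILITY = 20, MAXPRI_THROTTLE = 4; quoted here
 * for illustration only), the mapping above works out to:
 *	pri >= 38 -> TH_BUCKET_SHARE_FG,   32..37 -> TH_BUCKET_SHARE_IN,
 *	21..31    -> TH_BUCKET_SHARE_DF,    5..20 -> TH_BUCKET_SHARE_UT,
 *	<= 4      -> TH_BUCKET_SHARE_BG.
 */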
314 
315 /*
316  * sched_clutch_thread_bucket_map()
317  *
318  * Map a thread to a scheduling bucket for the clutch/edge scheduler
319  * based on its scheduling mode and the priority attribute passed in.
320  */
321 static sched_bucket_t
322 sched_clutch_thread_bucket_map(thread_t thread, int pri)
323 {
324 	switch (thread->sched_mode) {
325 	case TH_MODE_FIXED:
326 		if (pri >= BASEPRI_FOREGROUND) {
327 			return TH_BUCKET_FIXPRI;
328 		} else {
329 			return sched_convert_pri_to_bucket(pri);
330 		}
331 
332 	case TH_MODE_REALTIME:
333 		return TH_BUCKET_FIXPRI;
334 
335 	case TH_MODE_TIMESHARE:
336 		return sched_convert_pri_to_bucket(pri);
337 
338 	default:
339 		panic("unexpected mode: %d", thread->sched_mode);
340 		break;
341 	}
342 }
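/*
 * For example, a TH_MODE_REALTIME thread always maps to TH_BUCKET_FIXPRI, and a
 * TH_MODE_FIXED thread does so only at or above BASEPRI_FOREGROUND; fixed
 * priority threads below that, and all timeshare threads, land in one of the
 * timeshare buckets via sched_convert_pri_to_bucket() above.
 */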
343 
344 /*
345  * The clutch scheduler attempts to ageout the CPU usage of clutch bucket groups
346  * based on the amount of time they have been pending and the load at that
347  * scheduling bucket level. Since the clutch bucket groups are global (i.e. span
348  * multiple clusters), it is important to keep the load as a global counter as well.
349  */
350 static uint32_t _Atomic sched_clutch_global_bucket_load[TH_BUCKET_SCHED_MAX];
351 
352 /*
353  * sched_clutch_root_init()
354  *
355  * Routine to initialize the scheduler hierarchy root.
356  */
357 static void
358 sched_clutch_root_init(
359 	sched_clutch_root_t root_clutch,
360 	processor_set_t pset)
361 {
362 	root_clutch->scr_thr_count = 0;
363 	root_clutch->scr_priority = NOPRI;
364 	root_clutch->scr_urgency = 0;
365 	root_clutch->scr_pset = pset;
366 #if CONFIG_SCHED_EDGE
367 	root_clutch->scr_cluster_id = pset->pset_cluster_id;
368 #else /* CONFIG_SCHED_EDGE */
369 	root_clutch->scr_cluster_id = 0;
370 #endif /* CONFIG_SCHED_EDGE */
371 
372 	for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
373 		root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type] = 0;
374 	}
375 	/* Initialize the queue which maintains all runnable clutch_buckets for timesharing purposes */
376 	queue_init(&root_clutch->scr_clutch_buckets);
377 
378 	/* Initialize the priority queue which maintains all runnable foreign clutch buckets */
379 	priority_queue_init(&root_clutch->scr_foreign_buckets);
380 	bzero(&root_clutch->scr_cumulative_run_count, sizeof(root_clutch->scr_cumulative_run_count));
381 	bitmap_zero(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
382 	bitmap_zero(root_clutch->scr_bound_warp_available, TH_BUCKET_SCHED_MAX);
383 	priority_queue_init(&root_clutch->scr_bound_root_buckets);
384 
385 	/* Initialize the bitmap and priority queue of runnable root buckets */
386 	priority_queue_init(&root_clutch->scr_unbound_root_buckets);
387 	bitmap_zero(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
388 	bitmap_zero(root_clutch->scr_unbound_warp_available, TH_BUCKET_SCHED_MAX);
389 
390 	/* Initialize all the root buckets */
391 	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
392 		sched_clutch_root_bucket_init(&root_clutch->scr_unbound_buckets[i], i, false);
393 		sched_clutch_root_bucket_init(&root_clutch->scr_bound_buckets[i], i, true);
394 	}
395 }
396 
397 /*
398  * Clutch Bucket Runqueues
399  *
400  * The clutch buckets are maintained in a runq at the root bucket level. The
401  * runq organization allows clutch buckets to be ordered based on various
402  * factors such as:
403  *
404  * - Clutch buckets are round robin'ed at the same priority level when a
405  *   thread is selected from a clutch bucket. This prevents a clutch bucket
406  *   from starving out other clutch buckets at the same priority.
407  *
408  * - Clutch buckets are inserted at the head when they become runnable due to
409  *   thread preemption. This allows threads that were preempted to maintain
410  *   their order in the queue.
411  *
412  */
413 
414 /*
415  * sched_clutch_bucket_runq_init()
416  *
417  * Initialize a clutch bucket runq.
418  */
419 static void
420 sched_clutch_bucket_runq_init(
421 	sched_clutch_bucket_runq_t clutch_buckets_rq)
422 {
423 	clutch_buckets_rq->scbrq_highq = NOPRI;
424 	for (uint8_t i = 0; i < BITMAP_LEN(NRQS); i++) {
425 		clutch_buckets_rq->scbrq_bitmap[i] = 0;
426 	}
427 	clutch_buckets_rq->scbrq_count = 0;
428 	for (int i = 0; i < NRQS; i++) {
429 		circle_queue_init(&clutch_buckets_rq->scbrq_queues[i]);
430 	}
431 }
432 
433 /*
434  * sched_clutch_bucket_runq_empty()
435  *
436  * Returns whether a clutch bucket runq is empty.
437  */
438 static boolean_t
439 sched_clutch_bucket_runq_empty(
440 	sched_clutch_bucket_runq_t clutch_buckets_rq)
441 {
442 	return clutch_buckets_rq->scbrq_count == 0;
443 }
444 
445 /*
446  * sched_clutch_bucket_runq_peek()
447  *
448  * Returns the highest priority clutch bucket in the runq.
449  */
450 static sched_clutch_bucket_t
451 sched_clutch_bucket_runq_peek(
452 	sched_clutch_bucket_runq_t clutch_buckets_rq)
453 {
454 	if (clutch_buckets_rq->scbrq_count > 0) {
455 		circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_buckets_rq->scbrq_highq];
456 		return cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink);
457 	} else {
458 		return NULL;
459 	}
460 }
461 
462 /*
463  * sched_clutch_bucket_runq_enqueue()
464  *
465  * Enqueue a clutch bucket into the runq based on the options passed in.
466  */
467 static void
468 sched_clutch_bucket_runq_enqueue(
469 	sched_clutch_bucket_runq_t clutch_buckets_rq,
470 	sched_clutch_bucket_t clutch_bucket,
471 	sched_clutch_bucket_options_t options)
472 {
473 	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
474 	if (circle_queue_empty(queue)) {
475 		circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
476 		bitmap_set(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
477 		if (clutch_bucket->scb_priority > clutch_buckets_rq->scbrq_highq) {
478 			clutch_buckets_rq->scbrq_highq = clutch_bucket->scb_priority;
479 		}
480 	} else {
481 		if (options & SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ) {
482 			circle_enqueue_head(queue, &clutch_bucket->scb_runqlink);
483 		} else {
484 			/*
485 			 * Default behavior (handles SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ &
486 			 * SCHED_CLUTCH_BUCKET_OPTIONS_NONE)
487 			 */
488 			circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
489 		}
490 	}
491 	clutch_buckets_rq->scbrq_count++;
492 }
493 
494 /*
495  * sched_clutch_bucket_runq_remove()
496  *
497  * Remove a clutch bucket from the runq.
498  */
499 static void
500 sched_clutch_bucket_runq_remove(
501 	sched_clutch_bucket_runq_t clutch_buckets_rq,
502 	sched_clutch_bucket_t clutch_bucket)
503 {
504 	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
505 	circle_dequeue(queue, &clutch_bucket->scb_runqlink);
506 	assert(clutch_buckets_rq->scbrq_count > 0);
507 	clutch_buckets_rq->scbrq_count--;
508 	if (circle_queue_empty(queue)) {
509 		bitmap_clear(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
510 		clutch_buckets_rq->scbrq_highq = bitmap_first(clutch_buckets_rq->scbrq_bitmap, NRQS);
511 	}
512 }
513 
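/*
 * sched_clutch_bucket_runq_rotate()
 *
 * Rotate the clutch bucket at the head of its priority level to the tail of
 * that level. This provides the round robin behavior described by
 * SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR, so clutch buckets of equal priority
 * take turns supplying threads instead of one bucket starving the others.
 */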
514 static void
515 sched_clutch_bucket_runq_rotate(
516 	sched_clutch_bucket_runq_t clutch_buckets_rq,
517 	sched_clutch_bucket_t clutch_bucket)
518 {
519 	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
520 	assert(clutch_bucket == cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink));
521 	circle_queue_rotate_head_forward(queue);
522 }
523 
524 /*
525  * sched_clutch_root_bucket_init()
526  *
527  * Routine to initialize root buckets.
528  */
529 static void
530 sched_clutch_root_bucket_init(
531 	sched_clutch_root_bucket_t root_bucket,
532 	sched_bucket_t bucket,
533 	bool bound_root_bucket)
534 {
535 	root_bucket->scrb_bucket = bucket;
536 	if (bound_root_bucket) {
537 		/* For bound root buckets, initialize the bound thread runq. */
538 		root_bucket->scrb_bound = true;
539 		run_queue_init(&root_bucket->scrb_bound_thread_runq);
540 	} else {
541 		/*
542 		 * The unbounded root buckets contain a runq of runnable clutch buckets
543 		 * which then hold the runnable threads.
544 		 */
545 		root_bucket->scrb_bound = false;
546 		sched_clutch_bucket_runq_init(&root_bucket->scrb_clutch_buckets);
547 	}
548 	priority_queue_entry_init(&root_bucket->scrb_pqlink);
549 	root_bucket->scrb_pqlink.deadline = SCHED_CLUTCH_INVALID_TIME_64;
550 	root_bucket->scrb_warped_deadline = 0;
551 	root_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[root_bucket->scrb_bucket];
552 	root_bucket->scrb_starvation_avoidance = false;
553 	root_bucket->scrb_starvation_ts = 0;
554 }
555 
556 /*
557  * Special case scheduling for Above UI bucket.
558  *
559  * AboveUI threads are typically system critical threads that need low latency
560  * which is why they are handled specially.
561  *
562  * Since the priority ranges of the AboveUI and FG Timeshare buckets overlap, it is
563  * important to maintain some native priority order between those buckets. For unbounded
564  * root buckets, the policy is to compare the highest clutch buckets of the two root buckets; if the
565  * Above UI bucket is higher, schedule it immediately. Otherwise fall through to the
566  * deadline based scheduling, which should pick up the timeshare buckets. For the bound
567  * case, the policy simply compares the priority of the highest runnable threads in
568  * the above UI and timeshare buckets.
569  *
570  * The implementation allows extremely low latency CPU access for Above UI threads
571  * while supporting the use case of high priority timeshare threads contending with
572  * lower priority fixed priority threads.
573  */
574 
575 
576 /*
577  * sched_clutch_root_unbound_select_aboveui()
578  *
579  * Routine to determine if the above UI unbounded bucket should be selected for execution.
580  */
581 static bool
582 sched_clutch_root_unbound_select_aboveui(
583 	sched_clutch_root_t root_clutch)
584 {
585 	if (bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_FIXPRI)) {
586 		sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
587 		sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_unbound_buckets[TH_BUCKET_SHARE_FG];
588 		if (!bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SHARE_FG)) {
589 			/* If the timeshare FG bucket is not runnable, pick the aboveUI bucket for scheduling */
590 			return true;
591 		}
592 		sched_clutch_bucket_t clutch_bucket_aboveui = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_aboveui);
593 		sched_clutch_bucket_t clutch_bucket_sharefg = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_sharefg);
594 		if (clutch_bucket_aboveui->scb_priority >= clutch_bucket_sharefg->scb_priority) {
595 			return true;
596 		}
597 	}
598 	return false;
599 }
600 
601 /*
602  * sched_clutch_root_bound_select_aboveui()
603  *
604  * Routine to determine if the above UI bounded bucket should be selected for execution.
605  */
606 static bool
607 sched_clutch_root_bound_select_aboveui(
608 	sched_clutch_root_t root_clutch)
609 {
610 	sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
611 	sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_bound_buckets[TH_BUCKET_SHARE_FG];
612 	if (root_bucket_aboveui->scrb_bound_thread_runq.count == 0) {
613 		return false;
614 	}
615 	return root_bucket_aboveui->scrb_bound_thread_runq.highq >= root_bucket_sharefg->scrb_bound_thread_runq.highq;
616 }
617 
618 /*
619  * sched_clutch_root_highest_root_bucket()
620  *
621  * Main routine to find the highest runnable root level bucket.
622  * This routine is called from performance sensitive contexts, so it is
623  * crucial to keep this O(1). The type parameter determines whether
624  * the selection logic should look at unbounded threads only (for
625  * cross-cluster stealing operations) or both bounded and unbounded
626  * threads (for selecting next thread for execution on current cluster).
627  */
628 static sched_clutch_root_bucket_t
629 sched_clutch_root_highest_root_bucket(
630 	sched_clutch_root_t root_clutch,
631 	uint64_t timestamp,
632 	sched_clutch_highest_root_bucket_type_t type)
633 {
634 	sched_clutch_hierarchy_locked_assert(root_clutch);
635 	int highest_runnable_bucket = -1;
636 	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
637 		highest_runnable_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
638 	} else {
639 		int highest_unbound_runnable_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
640 		int highest_bound_runnable_bucket = bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
641 		highest_runnable_bucket = (highest_bound_runnable_bucket != -1) ? ((highest_unbound_runnable_bucket != -1) ? MIN(highest_bound_runnable_bucket, highest_unbound_runnable_bucket) : highest_bound_runnable_bucket) : highest_unbound_runnable_bucket;
642 	}
643 
644 	if (highest_runnable_bucket == -1) {
645 		return NULL;
646 	}
647 
648 	/* Above UI root bucket selection (see comment above for more details on this special case handling) */
649 	bool unbound_aboveui = sched_clutch_root_unbound_select_aboveui(root_clutch);
650 	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
651 		if (unbound_aboveui) {
652 			return &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
653 		}
654 		/* Fall through to selecting a timeshare root bucket */
655 	} else {
656 		bool bound_aboveui = sched_clutch_root_bound_select_aboveui(root_clutch);
657 		sched_clutch_root_bucket_t unbound_aboveui_root_bucket = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
658 		sched_clutch_root_bucket_t bound_aboveui_root_bucket = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
659 
660 		if (unbound_aboveui && bound_aboveui) {
661 			/*
662 			 * In this scenario both the bounded and unbounded above UI buckets are runnable; choose based on the
663 			 * highest runnable priority in the two buckets.
664 			 */
665 			int bound_aboveui_pri = root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI].scrb_bound_thread_runq.highq;
666 			sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(unbound_aboveui_root_bucket);
667 			int unbound_aboveui_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
668 			return (bound_aboveui_pri >= unbound_aboveui_pri) ? bound_aboveui_root_bucket : unbound_aboveui_root_bucket;
669 		}
670 		if (unbound_aboveui) {
671 			return unbound_aboveui_root_bucket;
672 		}
673 		if (bound_aboveui) {
674 			return bound_aboveui_root_bucket;
675 		}
676 		/* Fall through to selecting a timeshare root bucket */
677 	}
678 
679 	/*
680 	 * Above UI bucket is not runnable or has a low priority runnable thread; use the
681 	 * earliest deadline model to schedule threads. The idea is that as the timeshare
682 	 * buckets use CPU, they will drop their interactivity score/sched priority and
683 	 * allow the low priority AboveUI buckets to be scheduled.
684 	 */
685 
686 	/* Find the earliest deadline bucket */
687 	sched_clutch_root_bucket_t edf_bucket = NULL;
688 	sched_clutch_root_bucket_t warp_bucket = NULL;
689 	int warp_bucket_index = -1;
690 
691 evaluate_root_buckets:
692 	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
693 		edf_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
694 	} else {
695 		sched_clutch_root_bucket_t unbound_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
696 		sched_clutch_root_bucket_t bound_bucket = priority_queue_min(&root_clutch->scr_bound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
697 		if (bound_bucket && unbound_bucket) {
698 			/* If bound and unbound root buckets are runnable, select the one with the earlier deadline */
699 			edf_bucket = (bound_bucket->scrb_pqlink.deadline <= unbound_bucket->scrb_pqlink.deadline) ? bound_bucket : unbound_bucket;
700 		} else {
701 			edf_bucket = (bound_bucket) ? bound_bucket : unbound_bucket;
702 		}
703 	}
704 	/*
705 	 * Check if any of the buckets have warp available. The implementation only allows root buckets to warp ahead of
706 	 * buckets of the same type (i.e. bound/unbound). The reason is that warping is a concept that
707 	 * only makes sense between root buckets of the same type, since it is effectively a scheduling advantage over a lower
708 	 * QoS root bucket.
709 	 */
710 	bitmap_t *warp_available_bitmap = (edf_bucket->scrb_bound) ? (root_clutch->scr_bound_warp_available) : (root_clutch->scr_unbound_warp_available);
711 	warp_bucket_index = bitmap_lsb_first(warp_available_bitmap, TH_BUCKET_SCHED_MAX);
712 
713 	if ((warp_bucket_index == -1) || (warp_bucket_index >= edf_bucket->scrb_bucket)) {
714 		/* No higher buckets have warp left; best choice is the EDF based bucket */
715 		if (edf_bucket->scrb_starvation_avoidance) {
716 			/*
717 			 * Indicates that the earliest deadline bucket is in starvation avoidance mode. Check to see if the
718 			 * starvation avoidance window is still open and return this bucket if it is.
719 			 *
720 			 * The starvation avoidance window is calculated based on the quantum of threads at that bucket and
721 			 * the number of CPUs in the cluster. The idea is to provide roughly one quantum's worth of starvation
722 			 * avoidance across all CPUs.
723 			 */
724 			uint64_t starvation_window = sched_clutch_thread_quantum[edf_bucket->scrb_bucket] / pset_available_cpu_count(root_clutch->scr_pset);
725 			if (timestamp < (edf_bucket->scrb_starvation_ts + starvation_window)) {
726 				return edf_bucket;
727 			} else {
728 				/* Starvation avoidance window is over; update deadline and re-evaluate EDF */
729 				edf_bucket->scrb_starvation_avoidance = false;
730 				edf_bucket->scrb_starvation_ts = 0;
731 				sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
732 			}
733 			goto evaluate_root_buckets;
734 		}
735 
736 		/* Looks like the EDF bucket is not in starvation avoidance mode; check if it should be */
737 		if (highest_runnable_bucket < edf_bucket->scrb_bucket) {
738 			/* Since a higher bucket is runnable, it indicates that the EDF bucket should be in starvation avoidance */
739 			edf_bucket->scrb_starvation_avoidance = true;
740 			edf_bucket->scrb_starvation_ts = timestamp;
741 		} else {
742 			/* EDF bucket is being selected in the natural order; update deadline and reset warp */
743 			sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
744 			edf_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[edf_bucket->scrb_bucket];
745 			edf_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
746 			if (edf_bucket->scrb_bound) {
747 				bitmap_set(root_clutch->scr_bound_warp_available, edf_bucket->scrb_bucket);
748 			} else {
749 				bitmap_set(root_clutch->scr_unbound_warp_available, edf_bucket->scrb_bucket);
750 			}
751 		}
752 		return edf_bucket;
753 	}
754 
755 	/*
756 	 * Looks like there is a root bucket which is higher in the natural priority
757 	 * order than edf_bucket and might have some warp remaining.
758 	 */
759 	warp_bucket = (edf_bucket->scrb_bound) ? &root_clutch->scr_bound_buckets[warp_bucket_index] : &root_clutch->scr_unbound_buckets[warp_bucket_index];
760 	if (warp_bucket->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
761 		/* Root bucket has not used any of its warp; set a deadline to expire its warp and return it */
762 		warp_bucket->scrb_warped_deadline = timestamp + warp_bucket->scrb_warp_remaining;
763 		sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
764 		return warp_bucket;
765 	}
766 	if (warp_bucket->scrb_warped_deadline > timestamp) {
767 		/* Root bucket already has a warp window open with some warp remaining */
768 		sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
769 		return warp_bucket;
770 	}
771 
772 	/* For this bucket, warp window was opened sometime in the past but has now
773 	 * expired. Mark the bucket as not available for warp anymore and re-run the
774 	 * warp bucket selection logic.
775 	 */
776 	warp_bucket->scrb_warp_remaining = 0;
777 	if (warp_bucket->scrb_bound) {
778 		bitmap_clear(root_clutch->scr_bound_warp_available, warp_bucket->scrb_bucket);
779 	} else {
780 		bitmap_clear(root_clutch->scr_unbound_warp_available, warp_bucket->scrb_bucket);
781 	}
782 	goto evaluate_root_buckets;
783 }
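/*
 * Example of the starvation avoidance window computed above: with the default
 * thread quantum table, a DF root bucket on a 4-CPU cluster gets a window of
 * 10ms / 4 = 2.5ms (6ms / 4 = 1.5ms on non-macOS targets) during which it
 * continues to be selected even though a higher root bucket is runnable; once
 * the window closes, its deadline is refreshed and the EDF choice is re-evaluated.
 */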
784 
785 /*
786  * sched_clutch_root_bucket_deadline_calculate()
787  *
788  * Calculate the deadline for the bucket based on its WCEL
789  */
790 static uint64_t
791 sched_clutch_root_bucket_deadline_calculate(
792 	sched_clutch_root_bucket_t root_bucket,
793 	uint64_t timestamp)
794 {
795 	/* For fixpri AboveUI bucket always return it as the earliest deadline */
796 	if (root_bucket->scrb_bucket < TH_BUCKET_SHARE_FG) {
797 		return 0;
798 	}
799 
800 	/* For all timeshare buckets set the deadline as current time + worst-case-execution-latency */
801 	return timestamp + sched_clutch_root_bucket_wcel[root_bucket->scrb_bucket];
802 }
803 
804 /*
805  * sched_clutch_root_bucket_deadline_update()
806  *
807  * Routine to update the deadline of the root bucket when it is selected.
808  * Updating the deadline also moves the root_bucket in the EDF priority
809  * queue.
810  */
811 static void
812 sched_clutch_root_bucket_deadline_update(
813 	sched_clutch_root_bucket_t root_bucket,
814 	sched_clutch_root_t root_clutch,
815 	uint64_t timestamp)
816 {
817 	if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
818 		/* The algorithm never uses the deadlines for scheduling TH_BUCKET_FIXPRI bucket */
819 		return;
820 	}
821 
822 	uint64_t old_deadline = root_bucket->scrb_pqlink.deadline;
823 	uint64_t new_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
824 	if (__improbable(old_deadline > new_deadline)) {
825 		panic("old_deadline (%llu) > new_deadline (%llu); root_bucket (%d); timestamp (%llu)", old_deadline, new_deadline, root_bucket->scrb_bucket, timestamp);
826 	}
827 	if (old_deadline != new_deadline) {
828 		root_bucket->scrb_pqlink.deadline = new_deadline;
829 		struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
830 		priority_queue_entry_increased(prioq, &root_bucket->scrb_pqlink);
831 	}
832 }
833 
834 /*
835  * sched_clutch_root_bucket_runnable()
836  *
837  * Routine to insert a newly runnable root bucket into the hierarchy.
838  * Also updates the deadline and warp parameters as necessary.
839  */
840 static void
841 sched_clutch_root_bucket_runnable(
842 	sched_clutch_root_bucket_t root_bucket,
843 	sched_clutch_root_t root_clutch,
844 	uint64_t timestamp)
845 {
846 	/* Mark the root bucket as runnable */
847 	bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
848 	bitmap_set(runnable_bitmap, root_bucket->scrb_bucket);
849 
850 	if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
851 		/* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
852 		return;
853 	}
854 
855 	if (root_bucket->scrb_starvation_avoidance == false) {
856 		/*
857 		 * Only update the deadline if the bucket was not in starvation avoidance mode. If the bucket was in
858 		 * starvation avoidance and its window has expired, the highest root bucket selection logic will notice
859 		 * that and fix it up.
860 		 */
861 		root_bucket->scrb_pqlink.deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
862 	}
863 	struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
864 	priority_queue_insert(prioq, &root_bucket->scrb_pqlink);
865 	if (root_bucket->scrb_warp_remaining) {
866 		/* Since the bucket has some warp remaining and its now runnable, mark it as available for warp */
867 		bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
868 		bitmap_set(warp_bitmap, root_bucket->scrb_bucket);
869 	}
870 }
871 
872 /*
873  * sched_clutch_root_bucket_empty()
874  *
875  * Routine to remove an empty root bucket from the hierarchy.
876  * Also updates the deadline and warp parameters as necessary.
877  */
878 static void
879 sched_clutch_root_bucket_empty(
880 	sched_clutch_root_bucket_t root_bucket,
881 	sched_clutch_root_t root_clutch,
882 	uint64_t timestamp)
883 {
884 	bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
885 	bitmap_clear(runnable_bitmap, root_bucket->scrb_bucket);
886 
887 	if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
888 		/* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
889 		return;
890 	}
891 
892 	struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
893 	priority_queue_remove(prioq, &root_bucket->scrb_pqlink);
894 
895 	bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
896 	bitmap_clear(warp_bitmap, root_bucket->scrb_bucket);
897 
898 	if (root_bucket->scrb_warped_deadline != SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
899 		if (root_bucket->scrb_warped_deadline > timestamp) {
900 			/*
901 			 * For root buckets that were using the warp, check if the warp
902 			 * deadline is in the future. If yes, remove the wall time the
903 			 * warp was active and update the warp remaining. This allows
904 			 * the root bucket to use the remaining warp the next time it
905 			 * becomes runnable.
906 			 */
907 			root_bucket->scrb_warp_remaining = root_bucket->scrb_warped_deadline - timestamp;
908 		} else {
909 			/*
910 			 * If the root bucket's warped deadline is in the past, it has used up
911 			 * all the warp it was assigned. Empty out its warp remaining.
912 			 */
913 			root_bucket->scrb_warp_remaining = 0;
914 		}
915 	}
916 }
917 
918 static int
919 sched_clutch_global_bucket_load_get(
920 	sched_bucket_t bucket)
921 {
922 	return (int)os_atomic_load(&sched_clutch_global_bucket_load[bucket], relaxed);
923 }
924 
925 /*
926  * sched_clutch_root_pri_update()
927  *
928  * The root level priority is used for thread selection and preemption
929  * logic.
930  *
931  * The logic uses the same decision as thread selection for deciding between the
932  * above UI and timeshare buckets. If one of the timesharing buckets has to be
933  * used for priority calculation, the logic is slightly different from thread
934  * selection, because thread selection considers deadlines, warps etc. to
935  * decide the most optimal bucket at a given timestamp. Since the priority
936  * value is used for preemption decisions only, it needs to be based on the
937  * highest runnable thread available in the timeshare domain. This logic can
938  * be made more sophisticated if there are cases of unnecessary preemption
939  * being seen in workloads.
940  */
941 static void
942 sched_clutch_root_pri_update(
943 	sched_clutch_root_t root_clutch)
944 {
945 	sched_clutch_hierarchy_locked_assert(root_clutch);
946 	int16_t root_bound_pri = NOPRI;
947 	int16_t root_unbound_pri = NOPRI;
948 
949 	if (bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
950 		goto root_pri_update_unbound;
951 	}
952 	sched_clutch_root_bucket_t root_bucket_bound = NULL;
953 	if (sched_clutch_root_bound_select_aboveui(root_clutch)) {
954 		root_bucket_bound = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
955 	} else {
956 		int root_bucket_index = bitmap_lsb_next(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
957 		assert(root_bucket_index != -1);
958 		root_bucket_bound = &root_clutch->scr_bound_buckets[root_bucket_index];
959 	}
960 	root_bound_pri = root_bucket_bound->scrb_bound_thread_runq.highq;
961 
962 root_pri_update_unbound:
963 	if (bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
964 		goto root_pri_update_complete;
965 	}
966 	sched_clutch_root_bucket_t root_bucket_unbound = NULL;
967 	if (sched_clutch_root_unbound_select_aboveui(root_clutch)) {
968 		root_bucket_unbound = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
969 	} else {
970 		int root_bucket_index = bitmap_lsb_next(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
971 		assert(root_bucket_index != -1);
972 		root_bucket_unbound = &root_clutch->scr_unbound_buckets[root_bucket_index];
973 	}
974 	/* For the selected root bucket, find the highest priority clutch bucket */
975 	sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_unbound);
976 	root_unbound_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
977 
978 root_pri_update_complete:
979 	root_clutch->scr_priority = MAX(root_bound_pri, root_unbound_pri);
980 }
981 
982 /*
983  * sched_clutch_root_urgency_inc()
984  *
985  * Routine to increment the urgency at the root level based on the thread
986  * priority that is being inserted into the hierarchy. The root urgency
987  * counter is updated based on the urgency of threads in any of the
988  * clutch buckets which are part of the hierarchy.
989  *
990  * Always called with the pset lock held.
991  */
992 static void
993 sched_clutch_root_urgency_inc(
994 	sched_clutch_root_t root_clutch,
995 	thread_t thread)
996 {
997 	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
998 		root_clutch->scr_urgency++;
999 	}
1000 }
1001 
1002 /*
1003  * sched_clutch_root_urgency_dec()
1004  *
1005  * Routine to decrement the urgency at the root level based on the thread
1006  * priority that is being removed from the hierarchy. The root urgency
1007  * counter is updated based on the urgency of threads in any of the
1008  * clutch buckets which are part of the hierarchy.
1009  *
1010  * Always called with the pset lock held.
1011  */
1012 static void
1013 sched_clutch_root_urgency_dec(
1014 	sched_clutch_root_t root_clutch,
1015 	thread_t thread)
1016 {
1017 	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
1018 		root_clutch->scr_urgency--;
1019 	}
1020 }
1021 
1022 /*
1023  * Clutch bucket level scheduling
1024  *
1025  * The second level of scheduling is the clutch bucket level scheduling
1026  * which tries to schedule thread groups within root_buckets. Each
1027  * clutch represents a thread group and a clutch_bucket_group represents
1028  * threads at a particular sched_bucket within that thread group. The
1029  * clutch_bucket_group contains a clutch_bucket per cluster on the system
1030  * where it holds the runnable threads destined for execution on that
1031  * cluster.
1032  *
1033  * The goal of this level of scheduling is to allow interactive thread
1034  * groups low latency access to the CPU. It also provides slight
1035  * scheduling preference for App and unrestricted thread groups.
1036  *
1037  * The clutch bucket scheduling algorithm measures an interactivity
1038  * score for all clutch bucket groups. The interactivity score is based
1039  * on the ratio of the CPU used and the voluntary blocking of threads
1040  * within the clutch bucket group. The algorithm is very close to the ULE
1041  * scheduler on FreeBSD in terms of calculations. The interactivity
1042  * score provides an interactivity boost in the range of
1043  * [0:SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI * 2] which allows interactive
1044  * thread groups to win over CPU spinners.
1045  *
1046  * The interactivity score of the clutch bucket group is combined with the
1047  * highest base/promoted priority of threads in the clutch bucket to form
1048  * the overall priority of the clutch bucket.
1049  */
1050 
1051 /* Priority boost range for interactivity */
1052 #define SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT     (8)
1053 uint8_t sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
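/*
 * With the default interactive priority of 8 above, the interactivity boost
 * falls in the range [0:16]: a clutch bucket group that mostly blocks
 * voluntarily earns a score near 16, while a group of pure CPU spinners decays
 * towards 0, lowering its clutch bucket priority relative to interactive
 * groups at the same base priority.
 */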
1054 
1055 /* Window to scale the cpu usage and blocked values (currently 500ms). It is the threshold of used+blocked. */
1056 uint64_t sched_clutch_bucket_group_adjust_threshold = 0;
1057 #define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS      (500000)
1058 
1059 /* The ratio to scale the cpu/blocked time per window */
1060 #define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO                (10)
1061 
1062 /* Initial value for voluntary blocking time for the clutch_bucket */
1063 #define SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID          (uint64_t)(~0)
1064 
1065 /* Value indicating the clutch bucket is not pending execution */
1066 #define SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID             ((uint64_t)(~0))
1067 
1068 /*
1069  * Thread group CPU starvation avoidance
1070  *
1071  * In heavily CPU contended scenarios, it is possible that some thread groups
1072  * which have a low interactivity score do not get CPU time at all. In order to
1073  * resolve that, the scheduler tries to ageout the CPU usage of the clutch
1074  * bucket group when it has been pending execution for a certain time as defined
1075  * by the sched_clutch_bucket_group_pending_delta_us values below.
1076  *
1077  * The values chosen here are very close to the WCEL values for each sched bucket.
1078  * These values are multiplied by the load average of the relevant root bucket to
1079  * provide an estimate of the actual clutch bucket load.
1080  */
1081 static uint32_t sched_clutch_bucket_group_pending_delta_us[TH_BUCKET_SCHED_MAX] = {
1082 	SCHED_CLUTCH_INVALID_TIME_32,           /* FIXPRI */
1083 	10000,                                  /* FG */
1084 	37500,                                  /* IN */
1085 	75000,                                  /* DF */
1086 	150000,                                 /* UT */
1087 	250000,                                 /* BG */
1088 };
1089 static uint64_t sched_clutch_bucket_group_pending_delta[TH_BUCKET_SCHED_MAX] = {0};
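/*
 * For example, per the load scaling described above, a DF clutch bucket group
 * that has been pending execution on a root bucket with a load average of 4
 * would have its CPU usage aged out after roughly 75ms * 4 = 300ms of waiting.
 */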
1090 
1091 /*
1092  * sched_clutch_bucket_init()
1093  *
1094  * Initializer for clutch buckets.
1095  */
1096 static void
1097 sched_clutch_bucket_init(
1098 	sched_clutch_bucket_t clutch_bucket,
1099 	sched_clutch_bucket_group_t clutch_bucket_group,
1100 	sched_bucket_t bucket)
1101 {
1102 	clutch_bucket->scb_bucket = bucket;
1103 	/* scb_priority will be recalculated when a thread is inserted in the clutch bucket */
1104 	clutch_bucket->scb_priority = 0;
1105 #if CONFIG_SCHED_EDGE
1106 	clutch_bucket->scb_foreign = false;
1107 	priority_queue_entry_init(&clutch_bucket->scb_foreignlink);
1108 #endif /* CONFIG_SCHED_EDGE */
1109 	clutch_bucket->scb_group = clutch_bucket_group;
1110 	clutch_bucket->scb_root = NULL;
1111 	priority_queue_init(&clutch_bucket->scb_clutchpri_prioq);
1112 	priority_queue_init(&clutch_bucket->scb_thread_runq);
1113 	queue_init(&clutch_bucket->scb_thread_timeshare_queue);
1114 }
1115 
1116 /*
1117  * sched_clutch_bucket_group_init()
1118  *
1119  * Initializer for clutch bucket groups.
1120  */
1121 static void
1122 sched_clutch_bucket_group_init(
1123 	sched_clutch_bucket_group_t clutch_bucket_group,
1124 	sched_clutch_t clutch,
1125 	sched_bucket_t bucket)
1126 {
1127 	bzero(clutch_bucket_group, sizeof(struct sched_clutch_bucket_group));
1128 	clutch_bucket_group->scbg_bucket = bucket;
1129 	clutch_bucket_group->scbg_clutch = clutch;
1130 
1131 	int max_clusters = ml_get_cluster_count();
1132 	clutch_bucket_group->scbg_clutch_buckets = kalloc_type(struct sched_clutch_bucket, max_clusters, Z_WAITOK | Z_ZERO);
1133 	for (int i = 0; i < max_clusters; i++) {
1134 		sched_clutch_bucket_init(&clutch_bucket_group->scbg_clutch_buckets[i], clutch_bucket_group, bucket);
1135 	}
1136 
1137 	os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, 0, relaxed);
1138 	os_atomic_store(&clutch_bucket_group->scbg_pri_shift, INT8_MAX, relaxed);
1139 	os_atomic_store(&clutch_bucket_group->scbg_preferred_cluster, pset0.pset_cluster_id, relaxed);
1140 	/*
1141 	 * All thread groups should be initialized to be interactive; this allows the newly launched
1142 	 * thread groups to fairly compete with already running thread groups.
1143 	 */
1144 	clutch_bucket_group->scbg_interactivity_data.scct_count = (sched_clutch_bucket_group_interactive_pri * 2);
1145 	clutch_bucket_group->scbg_interactivity_data.scct_timestamp = 0;
1146 	os_atomic_store(&clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked, (clutch_cpu_data_t)sched_clutch_bucket_group_adjust_threshold, relaxed);
1147 #if !__LP64__
1148 	lck_spin_init(&clutch_bucket_group->scbg_stats_lock, &pset_lck_grp, NULL);
1149 #endif /* !__LP64__ */
1150 	clutch_bucket_group->scbg_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
1151 	clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
1152 	clutch_bucket_group->scbg_amp_rebalance_last_chosen = UINT32_MAX;
1153 }
1154 
1155 static void
1156 sched_clutch_bucket_group_destroy(
1157 	sched_clutch_bucket_group_t clutch_bucket_group)
1158 {
1159 	kfree_type(struct sched_clutch_bucket, ml_get_cluster_count(),
1160 	    clutch_bucket_group->scbg_clutch_buckets);
1161 }
1162 
1163 /*
1164  * sched_clutch_init_with_thread_group()
1165  *
1166  * Initialize the sched_clutch when the thread group is being created
1167  */
1168 void
1169 sched_clutch_init_with_thread_group(
1170 	sched_clutch_t clutch,
1171 	struct thread_group *tg)
1172 {
1173 	os_atomic_store(&clutch->sc_thr_count, 0, relaxed);
1174 
1175 	/* Initialize all the clutch buckets */
1176 	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
1177 		sched_clutch_bucket_group_init(&(clutch->sc_clutch_groups[i]), clutch, i);
1178 	}
1179 
1180 	/* Grouping specific fields */
1181 	clutch->sc_tg = tg;
1182 }
1183 
1184 /*
1185  * sched_clutch_destroy()
1186  *
1187  * Destructor for clutch; called from thread group release code.
1188  */
1189 void
1190 sched_clutch_destroy(
1191 	sched_clutch_t clutch)
1192 {
1193 	assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0);
1194 	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
1195 		sched_clutch_bucket_group_destroy(&(clutch->sc_clutch_groups[i]));
1196 	}
1197 }
1198 
1199 #if CONFIG_SCHED_EDGE
1200 
1201 /*
1202  * Edge Scheduler Preferred Cluster Mechanism
1203  *
1204  * In order to have better control over various QoS buckets within a thread group, the Edge
1205  * scheduler allows CLPC to specify a preferred cluster for each QoS level in a TG. These
1206  * preferences are stored at the sched_clutch_bucket_group level since that represents all
1207  * threads at a particular QoS level within a sched_clutch. For any lookup of preferred
1208  * cluster, the logic always goes back to the preference stored at the clutch_bucket_group.
1209  */
1210 
1211 static uint32_t
1212 sched_edge_clutch_bucket_group_preferred_cluster(sched_clutch_bucket_group_t clutch_bucket_group)
1213 {
1214 	return os_atomic_load(&clutch_bucket_group->scbg_preferred_cluster, relaxed);
1215 }
1216 
1217 static uint32_t
1218 sched_clutch_bucket_preferred_cluster(sched_clutch_bucket_t clutch_bucket)
1219 {
1220 	return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket->scb_group);
1221 }
1222 
1223 uint32_t
1224 sched_edge_thread_preferred_cluster(thread_t thread)
1225 {
1226 	if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1227 		/* For threads bound to a specific cluster, return the bound cluster id */
1228 		return sched_edge_thread_bound_cluster_id(thread);
1229 	}
1230 
1231 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
1232 	sched_bucket_t sched_bucket = thread->th_sched_bucket;
1233 	if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
1234 		sched_bucket = sched_clutch_thread_bucket_map(thread, thread->base_pri);
1235 	}
1236 	sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[sched_bucket];
1237 	return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket_group);
1238 }
1239 
1240 /*
1241  * Edge Scheduler Foreign Bucket Support
1242  *
1243  * In the Edge Scheduler, each cluster maintains a priority queue of clutch buckets containing
1244  * threads that are not native to the cluster. A clutch bucket is considered native if its
1245  * preferred cluster has the same type as the cluster its enqueued in. The foreign clutch
1246  * preferred cluster has the same type as the cluster it is enqueued in. The foreign clutch
1247  * cluster quickly.
1248  *
1249  * It is possible to make this policy even more aggressive by considering all clusters that
1250  * are not the preferred cluster as the foreign cluster, but that would mean a lot of thread
1251  * migrations which might have performance implications.
1252  */
1253 
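/*
 * Illustrative example (hypothetical topology): on an AMP system with one
 * P-cluster and one E-cluster, a clutch bucket whose preferred cluster is the
 * P-cluster but which is enqueued in the E-cluster root hierarchy has a
 * mismatched cluster type, so it is marked foreign and linked into that root's
 * scr_foreign_buckets priority queue until it can be rebalanced.
 */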
1254 static void
1255 sched_clutch_bucket_mark_native(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
1256 {
1257 	if (clutch_bucket->scb_foreign) {
1258 		clutch_bucket->scb_foreign = false;
1259 		priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1260 	}
1261 }
1262 
1263 static void
1264 sched_clutch_bucket_mark_foreign(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
1265 {
1266 	if (!clutch_bucket->scb_foreign) {
1267 		clutch_bucket->scb_foreign = true;
1268 		priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
1269 		priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1270 	}
1271 }
1272 
1273 /*
1274  * Edge Scheduler Cumulative Load Average
1275  *
1276  * The Edge scheduler maintains a per-QoS/scheduling bucket load average for
1277  * making thread migration decisions. The per-bucket load is maintained as a
1278  * cumulative count since higher scheduling buckets impact load on lower buckets
1279  * for thread migration decisions.
1280  *
1281  */
1282 
1283 static void
1284 sched_edge_cluster_cumulative_count_incr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1285 {
1286 	switch (bucket) {
1287 	case TH_BUCKET_FIXPRI:    os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1288 	case TH_BUCKET_SHARE_FG:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1289 	case TH_BUCKET_SHARE_IN:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1290 	case TH_BUCKET_SHARE_DF:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1291 	case TH_BUCKET_SHARE_UT:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1292 	case TH_BUCKET_SHARE_BG:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1293 	default:
1294 		panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_incr()");
1295 	}
1296 }
1297 
1298 static void
1299 sched_edge_cluster_cumulative_count_decr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1300 {
1301 	switch (bucket) {
1302 	case TH_BUCKET_FIXPRI:    os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1303 	case TH_BUCKET_SHARE_FG:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1304 	case TH_BUCKET_SHARE_IN:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1305 	case TH_BUCKET_SHARE_DF:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1306 	case TH_BUCKET_SHARE_UT:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1307 	case TH_BUCKET_SHARE_BG:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1308 	default:
1309 		panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_decr()");
1310 	}
1311 }
1312 
1313 uint16_t
1314 sched_edge_cluster_cumulative_count(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1315 {
1316 	return os_atomic_load(&root_clutch->scr_cumulative_run_count[bucket], relaxed);
1317 }
1318 
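/*
 * Worked example (hypothetical thread counts): since each increment falls
 * through to all lower buckets, scr_cumulative_run_count[bucket] counts the
 * runnable threads at that bucket and every higher bucket. With 1 runnable
 * FIXPRI thread and 2 runnable SHARE_UT threads, the counts would be:
 *   FIXPRI = 1, SHARE_FG = 1, SHARE_IN = 1, SHARE_DF = 1, SHARE_UT = 3, SHARE_BG = 3
 */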
1319 #endif /* CONFIG_SCHED_EDGE */
1320 
1321 /*
1322  * sched_clutch_bucket_hierarchy_insert()
1323  *
1324  * Routine to insert a newly runnable clutch_bucket into the root hierarchy.
1325  */
1326 static void
1327 sched_clutch_bucket_hierarchy_insert(
1328 	sched_clutch_root_t root_clutch,
1329 	sched_clutch_bucket_t clutch_bucket,
1330 	sched_bucket_t bucket,
1331 	uint64_t timestamp,
1332 	sched_clutch_bucket_options_t options)
1333 {
1334 	sched_clutch_hierarchy_locked_assert(root_clutch);
1335 	if (bucket > TH_BUCKET_FIXPRI) {
1336 		/* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */
1337 		enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink);
1338 	}
1339 #if CONFIG_SCHED_EDGE
1340 	/* Check if the bucket is a foreign clutch bucket and add it to the foreign buckets list */
1341 	uint32_t preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
1342 	if (pset_type_for_id(preferred_cluster) != pset_type_for_id(root_clutch->scr_cluster_id)) {
1343 		sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
1344 	}
1345 #endif /* CONFIG_SCHED_EDGE */
1346 	sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1347 
1348 	/* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */
1349 	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1350 		sched_clutch_root_bucket_runnable(root_bucket, root_clutch, timestamp);
1351 	}
1352 
1353 	/* Insert the clutch bucket into the root bucket run queue with order based on options */
1354 	sched_clutch_bucket_runq_enqueue(&root_bucket->scrb_clutch_buckets, clutch_bucket, options);
1355 	os_atomic_store(&clutch_bucket->scb_root, root_clutch, relaxed);
1356 	os_atomic_inc(&sched_clutch_global_bucket_load[bucket], relaxed);
1357 }
1358 
1359 /*
1360  * sched_clutch_bucket_hierarchy_remove()
1361  *
1362  * Rotuine to remove a empty clutch bucket from the root hierarchy.
1363  * Routine to remove an empty clutch bucket from the root hierarchy.
1364 static void
1365 sched_clutch_bucket_hierarchy_remove(
1366 	sched_clutch_root_t root_clutch,
1367 	sched_clutch_bucket_t clutch_bucket,
1368 	sched_bucket_t bucket,
1369 	uint64_t timestamp,
1370 	__unused sched_clutch_bucket_options_t options)
1371 {
1372 	sched_clutch_hierarchy_locked_assert(root_clutch);
1373 	if (bucket > TH_BUCKET_FIXPRI) {
1374 		/* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */
1375 		remqueue(&clutch_bucket->scb_listlink);
1376 	}
1377 #if CONFIG_SCHED_EDGE
1378 	sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
1379 #endif /* CONFIG_SCHED_EDGE */
1380 
1381 	sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1382 
1383 	/* Remove the clutch bucket from the root bucket priority queue */
1384 	sched_clutch_bucket_runq_remove(&root_bucket->scrb_clutch_buckets, clutch_bucket);
1385 	os_atomic_store(&clutch_bucket->scb_root, NULL, relaxed);
1386 
1387 	/* If the root bucket priority queue is now empty, remove it from the root priority queue */
1388 	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1389 		sched_clutch_root_bucket_empty(root_bucket, root_clutch, timestamp);
1390 	}
1391 	os_atomic_dec(&sched_clutch_global_bucket_load[bucket], relaxed);
1392 }
1393 
1394 /*
1395  * sched_clutch_bucket_base_pri()
1396  *
1397  * Calculates the "base" priority of the clutch bucket, which is equal to the max of the
1398  * highest base_pri and the highest sched_pri in the clutch bucket.
1399  */
1400 static uint8_t
1401 sched_clutch_bucket_base_pri(
1402 	sched_clutch_bucket_t clutch_bucket)
1403 {
1404 	assert(priority_queue_empty(&clutch_bucket->scb_thread_runq) == false);
1405 	/*
1406 	 * Since the clutch bucket can contain threads that are members of the group due
1407 	 * to the sched_pri being promoted or due to their base pri, the base priority of
1408 	 * the entire clutch bucket should be based on the highest thread (promoted or base)
1409 	 * in the clutch bucket.
1410 	 */
1411 	uint8_t max_pri = 0;
1412 	if (!priority_queue_empty(&clutch_bucket->scb_clutchpri_prioq)) {
1413 		max_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
1414 	}
1415 	return max_pri;
1416 }
1417 
1418 /*
1419  * sched_clutch_interactivity_from_cpu_data()
1420  *
1421  * Routine to calculate the interactivity score of a clutch bucket group from its CPU usage
1422  */
1423 static uint8_t
1424 sched_clutch_interactivity_from_cpu_data(sched_clutch_bucket_group_t clutch_bucket_group)
1425 {
1426 	sched_clutch_bucket_cpu_data_t scb_cpu_data;
1427 	scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, relaxed);
1428 	clutch_cpu_data_t cpu_used = scb_cpu_data.cpu_data.scbcd_cpu_used;
1429 	clutch_cpu_data_t cpu_blocked = scb_cpu_data.cpu_data.scbcd_cpu_blocked;
1430 	uint8_t interactive_score = 0;
1431 
1432 	if ((cpu_blocked == 0) && (cpu_used == 0)) {
1433 		return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
1434 	}
1435 	/*
1436 	 * For all timeshare buckets, calculate the interactivity score of the bucket
1437 	 * and add it to the base priority
1438 	 */
1439 	if (cpu_blocked > cpu_used) {
1440 		/* Interactive clutch_bucket case */
1441 		interactive_score = sched_clutch_bucket_group_interactive_pri +
1442 		    ((sched_clutch_bucket_group_interactive_pri * (cpu_blocked - cpu_used)) / cpu_blocked);
1443 	} else {
1444 		/* Non-interactive clutch_bucket case */
1445 		interactive_score = ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) / cpu_used);
1446 	}
1447 	return interactive_score;
1448 }
1449 
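/*
 * Worked example (hypothetical values, assuming
 * sched_clutch_bucket_group_interactive_pri == 8): with cpu_blocked = 60 and
 * cpu_used = 20 the bucket is interactive, so
 *   interactive_score = 8 + (8 * (60 - 20)) / 60 = 8 + 5 = 13.
 * With cpu_blocked = 20 and cpu_used = 60 it is non-interactive, so
 *   interactive_score = (8 * 20) / 60 = 2.
 */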
1450 /*
1451  * sched_clutch_bucket_pri_calculate()
1452  *
1453  * The priority calculation algorithm for the clutch_bucket is a slight
1454  * modification on the ULE interactivity score. It uses the base priority
1455  * of the clutch bucket and applies an interactivity score boost to the
1456  * highly responsive clutch buckets.
1457  */
1458 static uint8_t
1459 sched_clutch_bucket_pri_calculate(
1460 	sched_clutch_bucket_t clutch_bucket,
1461 	uint64_t timestamp)
1462 {
1463 	/* For empty clutch buckets, return priority 0 */
1464 	if (clutch_bucket->scb_thr_count == 0) {
1465 		return 0;
1466 	}
1467 
1468 	uint8_t base_pri = sched_clutch_bucket_base_pri(clutch_bucket);
1469 	uint8_t interactive_score = sched_clutch_bucket_group_interactivity_score_calculate(clutch_bucket->scb_group, timestamp);
1470 
1471 	assert(((uint64_t)base_pri + interactive_score) <= UINT8_MAX);
1472 	uint8_t pri = base_pri + interactive_score;
1473 	if (pri != clutch_bucket->scb_priority) {
1474 		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_PRI) | DBG_FUNC_NONE,
1475 		    thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, pri, interactive_score, 0);
1476 	}
1477 	return pri;
1478 }
1479 
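/*
 * Worked example (hypothetical values): if the highest thread in the clutch
 * bucket contributes a base priority of 31 and the clutch bucket group's
 * interactivity score is currently 13, the clutch bucket priority becomes
 * 31 + 13 = 44.
 */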
1480 /*
1481  * sched_clutch_root_bucket_highest_clutch_bucket()
1482  *
1483  * Routine to find the highest priority clutch bucket
1484  * within the root bucket.
1485  */
1486 static sched_clutch_bucket_t
1487 sched_clutch_root_bucket_highest_clutch_bucket(
1488 	sched_clutch_root_bucket_t root_bucket)
1489 {
1490 	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1491 		return NULL;
1492 	}
1493 	return sched_clutch_bucket_runq_peek(&root_bucket->scrb_clutch_buckets);
1494 }
1495 
1496 /*
1497  * sched_clutch_bucket_runnable()
1498  *
1499  * Perform all operations needed when a new clutch bucket becomes runnable.
1500  * It involves inserting the clutch_bucket into the hierarchy and updating the
1501  * root priority appropriately.
1502  */
1503 static boolean_t
1504 sched_clutch_bucket_runnable(
1505 	sched_clutch_bucket_t clutch_bucket,
1506 	sched_clutch_root_t root_clutch,
1507 	uint64_t timestamp,
1508 	sched_clutch_bucket_options_t options)
1509 {
1510 	sched_clutch_hierarchy_locked_assert(root_clutch);
1511 	/* Since the clutch bucket became newly runnable, update its pending timestamp */
1512 	clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1513 	sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1514 
1515 	/* Update the timesharing properties of this clutch_bucket; also done every sched_tick */
1516 	sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, timestamp);
1517 	int16_t root_old_pri = root_clutch->scr_priority;
1518 	sched_clutch_root_pri_update(root_clutch);
1519 	return root_clutch->scr_priority > root_old_pri;
1520 }
1521 
1522 /*
1523  * sched_clutch_bucket_update()
1524  *
1525  * Update the clutch_bucket's position in the hierarchy. This routine is
1526  * called when a new thread is inserted or removed from a runnable clutch
1527  * bucket. The options specify some properties about the clutch bucket
1528  * insertion order into the clutch bucket runq.
1529  */
1530 static boolean_t
1531 sched_clutch_bucket_update(
1532 	sched_clutch_bucket_t clutch_bucket,
1533 	sched_clutch_root_t root_clutch,
1534 	uint64_t timestamp,
1535 	sched_clutch_bucket_options_t options)
1536 {
1537 	sched_clutch_hierarchy_locked_assert(root_clutch);
1538 	uint64_t new_pri = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1539 	sched_clutch_bucket_runq_t bucket_runq = &root_clutch->scr_unbound_buckets[clutch_bucket->scb_bucket].scrb_clutch_buckets;
1540 	if (new_pri == clutch_bucket->scb_priority) {
1541 		/*
1542 		 * If SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR is specified, move the clutch bucket
1543 		 * to the end of the runq. Typically used when a thread is selected for execution
1544 		 * from a clutch bucket.
1545 		 */
1546 		if (options & SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR) {
1547 			sched_clutch_bucket_runq_rotate(bucket_runq, clutch_bucket);
1548 		}
1549 		return false;
1550 	}
1551 	sched_clutch_bucket_runq_remove(bucket_runq, clutch_bucket);
1552 #if CONFIG_SCHED_EDGE
1553 	if (clutch_bucket->scb_foreign) {
1554 		priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1555 	}
1556 #endif /* CONFIG_SCHED_EDGE */
1557 	clutch_bucket->scb_priority = new_pri;
1558 #if CONFIG_SCHED_EDGE
1559 	if (clutch_bucket->scb_foreign) {
1560 		priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
1561 		priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1562 	}
1563 #endif /* CONFIG_SCHED_EDGE */
1564 	sched_clutch_bucket_runq_enqueue(bucket_runq, clutch_bucket, options);
1565 
1566 	int16_t root_old_pri = root_clutch->scr_priority;
1567 	sched_clutch_root_pri_update(root_clutch);
1568 	return root_clutch->scr_priority > root_old_pri;
1569 }
1570 
1571 /*
1572  * sched_clutch_bucket_empty()
1573  *
1574  * Perform all the operations needed when a clutch_bucket is no longer runnable.
1575  * It involves removing the clutch bucket from the hierarchy and updating the root
1576  * priority appropriately.
1577  */
1578 static void
1579 sched_clutch_bucket_empty(
1580 	sched_clutch_bucket_t clutch_bucket,
1581 	sched_clutch_root_t root_clutch,
1582 	uint64_t timestamp,
1583 	sched_clutch_bucket_options_t options)
1584 {
1585 	sched_clutch_hierarchy_locked_assert(root_clutch);
1586 	sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1587 	clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1588 	sched_clutch_root_pri_update(root_clutch);
1589 }
1590 
1591 /*
1592  * sched_clutch_cpu_usage_update()
1593  *
1594  * Routine to update CPU usage of the thread in the hierarchy.
1595  */
1596 void
1597 sched_clutch_cpu_usage_update(
1598 	thread_t thread,
1599 	uint64_t delta)
1600 {
1601 	if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread) || SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1602 		return;
1603 	}
1604 
1605 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
1606 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
1607 	sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, delta);
1608 }
1609 
1610 /*
1611  * sched_clutch_bucket_group_cpu_usage_update()
1612  *
1613  * Routine to update the CPU usage of the clutch_bucket.
1614  */
1615 static void
1616 sched_clutch_bucket_group_cpu_usage_update(
1617 	sched_clutch_bucket_group_t clutch_bucket_group,
1618 	uint64_t delta)
1619 {
1620 	if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
1621 		/* Since Above UI bucket has maximum interactivity score always, nothing to do here */
1622 		return;
1623 	}
1624 	delta = MIN(delta, sched_clutch_bucket_group_adjust_threshold);
1625 	os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_used), (clutch_cpu_data_t)delta, relaxed);
1626 }
1627 
1628 /*
1629  * sched_clutch_bucket_group_cpu_pending_adjust()
1630  *
1631  * Routine to calculate the adjusted CPU usage value based on the pending intervals. The calculation is done
1632  * such that one "pending interval" provides one point improvement in interactivity score.
1633  */
1634 static inline uint64_t
1635 sched_clutch_bucket_group_cpu_pending_adjust(
1636 	uint64_t cpu_used,
1637 	uint64_t cpu_blocked,
1638 	uint8_t pending_intervals)
1639 {
1640 	uint64_t cpu_used_adjusted = 0;
1641 	if (cpu_blocked < cpu_used) {
1642 		cpu_used_adjusted = (sched_clutch_bucket_group_interactive_pri * cpu_blocked * cpu_used);
1643 		cpu_used_adjusted = cpu_used_adjusted / ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) + (cpu_used * pending_intervals));
1644 	} else {
1645 		uint64_t adjust_factor = (cpu_blocked * pending_intervals) / sched_clutch_bucket_group_interactive_pri;
1646 		cpu_used_adjusted = (adjust_factor > cpu_used) ? 0 : (cpu_used - adjust_factor);
1647 	}
1648 	return cpu_used_adjusted;
1649 }
1650 
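/*
 * Worked example (hypothetical values, assuming
 * sched_clutch_bucket_group_interactive_pri == 8): with cpu_used = 80,
 * cpu_blocked = 40 and pending_intervals = 2:
 *   cpu_used_adjusted = (8 * 40 * 80) / ((8 * 40) + (80 * 2)) = 25600 / 480 = 53
 * The non-interactive score then rises from (8 * 40) / 80 = 4 to
 * (8 * 40) / 53 = 6, i.e. roughly one point per pending interval.
 */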
1651 /*
1652  * sched_clutch_bucket_group_cpu_adjust()
1653  *
1654  * Routine to scale the cpu usage and blocked time once the sum gets bigger
1655  * than sched_clutch_bucket_group_adjust_threshold. Allows the values to remain
1656  * manageable and maintain the same ratio while allowing changes in clutch bucket
1657  * behavior to be reflected in the interactivity score in a reasonable
1658  * amount of time. Also adjusts the CPU usage based on pending_intervals
1659  * which allows ageout of CPU to avoid starvation in highly contended scenarios.
1660  */
1661 static void
1662 sched_clutch_bucket_group_cpu_adjust(
1663 	sched_clutch_bucket_group_t clutch_bucket_group,
1664 	uint8_t pending_intervals)
1665 {
1666 	sched_clutch_bucket_cpu_data_t old_cpu_data = {};
1667 	sched_clutch_bucket_cpu_data_t new_cpu_data = {};
1668 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, old_cpu_data.scbcd_cpu_data_packed, new_cpu_data.scbcd_cpu_data_packed, relaxed, {
1669 		clutch_cpu_data_t cpu_used = old_cpu_data.cpu_data.scbcd_cpu_used;
1670 		clutch_cpu_data_t cpu_blocked = old_cpu_data.cpu_data.scbcd_cpu_blocked;
1671 
1672 		if ((pending_intervals == 0) && (cpu_used + cpu_blocked) < sched_clutch_bucket_group_adjust_threshold) {
1673 		        /* No changes to the CPU used and blocked values */
1674 		        os_atomic_rmw_loop_give_up();
1675 		}
1676 		if ((cpu_used + cpu_blocked) >= sched_clutch_bucket_group_adjust_threshold) {
1677 		        /* Only keep the recent CPU history to better indicate how this TG has been behaving */
1678 		        cpu_used = cpu_used / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
1679 		        cpu_blocked = cpu_blocked / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
1680 		}
1681 		/* Use the shift passed in to ageout the CPU usage */
1682 		cpu_used = (clutch_cpu_data_t)sched_clutch_bucket_group_cpu_pending_adjust(cpu_used, cpu_blocked, pending_intervals);
1683 		new_cpu_data.cpu_data.scbcd_cpu_used = cpu_used;
1684 		new_cpu_data.cpu_data.scbcd_cpu_blocked = cpu_blocked;
1685 	});
1686 }
1687 
1688 /*
1689  * Thread level scheduling algorithm
1690  *
1691  * The thread level scheduling algorithm uses the mach timeshare
1692  * decay based algorithm to achieve sharing between threads within the
1693  * same clutch bucket. The load/priority shifts etc. are all maintained
1694  * at the clutch bucket level and used for decay calculation of the
1695  * threads. The load sampling is still driven off the scheduler tick
1696  * for runnable clutch buckets (it does not use the new higher frequency
1697  * EWMA based load calculation). The idea is that the contention and load
1698  * within clutch_buckets should be limited enough that threads do not see heavy
1699  * decay and can timeshare effectively.
1700  */
1701 
1702 /*
1703  * sched_clutch_thread_run_bucket_incr() / sched_clutch_run_bucket_incr()
1704  *
1705  * Increment the run count for the clutch bucket associated with the
1706  * thread.
1707  */
1708 uint32_t
1709 sched_clutch_thread_run_bucket_incr(
1710 	thread_t thread,
1711 	sched_bucket_t bucket)
1712 {
1713 	if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1714 		return 0;
1715 	}
1716 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
1717 	return sched_clutch_run_bucket_incr(clutch, bucket);
1718 }
1719 
1720 static uint32_t
1721 sched_clutch_run_bucket_incr(
1722 	sched_clutch_t clutch,
1723 	sched_bucket_t bucket)
1724 {
1725 	assert(bucket != TH_BUCKET_RUN);
1726 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
1727 	return sched_clutch_bucket_group_run_count_inc(clutch_bucket_group);
1728 }
1729 
1730 /*
1731  * sched_clutch_thread_run_bucket_decr() / sched_clutch_run_bucket_decr()
1732  *
1733  * Decrement the run count for the clutch bucket associated with the
1734  * thread.
1735  */
1736 uint32_t
1737 sched_clutch_thread_run_bucket_decr(
1738 	thread_t thread,
1739 	sched_bucket_t bucket)
1740 {
1741 	if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1742 		return 0;
1743 	}
1744 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
1745 	return sched_clutch_run_bucket_decr(clutch, bucket);
1746 }
1747 
1748 static uint32_t
1749 sched_clutch_run_bucket_decr(
1750 	sched_clutch_t clutch,
1751 	sched_bucket_t bucket)
1752 {
1753 	assert(bucket != TH_BUCKET_RUN);
1754 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
1755 	return sched_clutch_bucket_group_run_count_dec(clutch_bucket_group);
1756 }
1757 
1758 /*
1759  * sched_clutch_bucket_group_timeshare_update()
1760  *
1761  * Routine to update the load and priority shift for the clutch_bucket_group
1762  * every sched_tick. For multi-cluster platforms, each QoS level will have multiple
1763  * clutch buckets with runnable threads in them. So it is important to maintain
1764  * the timesharing information at the clutch_bucket_group level instead of
1765  * individual clutch buckets (because the algorithm is trying to timeshare all
1766  * threads at the same QoS irrespective of which hierarchy they are enqueued in).
1767  *
1768  * The routine is called from the sched tick handling code to make sure this value
1769  * is updated at least once every sched tick. For clutch bucket groups which have
1770  * not been runnable for very long, the clutch_bucket_group maintains a "last
1771  * updated schedtick" parameter. As threads become runnable in the clutch bucket group,
1772  * if this value is outdated, the load and shifts are updated.
1773  *
1774  * Possible optimization:
1775  * - The current algorithm samples the load every sched tick (125ms).
1776  *   This is prone to spikes in runnable counts; if that turns out to be
1777  *   a problem, a simple solution would be to do the EWMA trick to sample
1778  *   load at every load_tick (30ms) and use the averaged value for the pri
1779  *   shift calculation.
1780  */
1781 static void
1782 sched_clutch_bucket_group_timeshare_update(
1783 	sched_clutch_bucket_group_t clutch_bucket_group,
1784 	sched_clutch_bucket_t clutch_bucket,
1785 	uint64_t ctime)
1786 {
1787 	if (clutch_bucket_group->scbg_bucket < TH_BUCKET_SHARE_FG) {
1788 		/* No timesharing needed for fixed priority Above UI threads */
1789 		return;
1790 	}
1791 
1792 	/*
1793 	 * Update the timeshare parameters for the clutch bucket group
1794 	 * if they haven't been updated in this tick.
1795 	 */
1796 	uint32_t sched_ts = os_atomic_load(&clutch_bucket_group->scbg_timeshare_tick, relaxed);
1797 	uint32_t current_sched_ts = sched_tick;
1798 	if (sched_ts < current_sched_ts) {
1799 		os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, current_sched_ts, relaxed);
1800 		/* NCPU wide workloads should not experience decay */
1801 		uint64_t bucket_group_run_count = os_atomic_load_wide(&clutch_bucket_group->scbg_blocked_data.scct_count, relaxed) - 1;
1802 		uint32_t bucket_group_load = (uint32_t)(bucket_group_run_count / processor_avail_count);
1803 		bucket_group_load = MIN(bucket_group_load, NRQS - 1);
1804 		uint32_t pri_shift = sched_fixed_shift - sched_load_shifts[bucket_group_load];
1805 		/* Ensure that the pri_shift value is reasonable */
1806 		pri_shift = (pri_shift > SCHED_PRI_SHIFT_MAX) ? INT8_MAX : pri_shift;
1807 		os_atomic_store(&clutch_bucket_group->scbg_pri_shift, pri_shift, relaxed);
1808 	}
1809 
1810 	/*
1811 	 * Update the clutch bucket priority; this allows clutch buckets that have been pending
1812 	 * for a long time to get an updated interactivity score.
1813 	 */
1814 	sched_clutch_bucket_update(clutch_bucket, clutch_bucket->scb_root, ctime, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
1815 }
1816 
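/*
 * Minimal sketch of the EWMA-based load sampling mentioned as a possible
 * optimization above; the helper name and alpha shift are hypothetical and
 * only illustrate the idea.
 */
#if 0 /* illustrative sketch only */
static uint32_t
sched_clutch_bucket_group_load_ewma(uint32_t load_ewma, uint32_t load_sample)
{
	/* new = old + (sample - old) / 2^shift; a shift of 2 weighs each sample at 25% */
	const int32_t ewma_alpha_shift = 2;
	int32_t delta = (int32_t)load_sample - (int32_t)load_ewma;
	return (uint32_t)((int32_t)load_ewma + (delta / (1 << ewma_alpha_shift)));
}
#endif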
1817 /*
1818  * sched_clutch_thread_clutch_update()
1819  *
1820  * Routine called when the thread changes its thread group. The current
1821  * implementation relies on the fact that the thread group is changed only from
1822  * the context of the thread itself or when the thread is runnable but not in a
1823  * runqueue. Due to this fact, the thread group change causes only counter
1824  * updates in the old & new clutch buckets and no hierarchy changes. The routine
1825  * also attributes the CPU used so far to the old clutch.
1826  */
1827 void
1828 sched_clutch_thread_clutch_update(
1829 	thread_t thread,
1830 	sched_clutch_t old_clutch,
1831 	sched_clutch_t new_clutch)
1832 {
1833 	uint32_t cpu_delta;
1834 
1835 	if (old_clutch) {
1836 		assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
1837 
1838 		sched_clutch_run_bucket_decr(old_clutch, thread->th_sched_bucket);
1839 		/*
1840 		 * Calculate the CPU used by this thread in the old bucket and
1841 		 * add it to the old clutch bucket. This uses the same CPU usage
1842 		 * logic as update_priority etc.
1843 		 */
1844 		sched_tick_delta(thread, cpu_delta);
1845 		if (thread->pri_shift < INT8_MAX) {
1846 			thread->sched_usage += cpu_delta;
1847 		}
1848 		thread->cpu_delta += cpu_delta;
1849 		if (!SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1850 			sched_clutch_bucket_group_t clutch_bucket_group = &(old_clutch->sc_clutch_groups[thread->th_sched_bucket]);
1851 			sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, cpu_delta);
1852 		}
1853 	}
1854 
1855 	if (new_clutch) {
1856 		sched_clutch_run_bucket_incr(new_clutch, thread->th_sched_bucket);
1857 	}
1858 }
1859 
1860 /* Thread Insertion/Removal/Selection routines */
1861 
1862 #if CONFIG_SCHED_EDGE
1863 
1864 /*
1865  * Edge Scheduler Bound Thread Support
1866  *
1867  * The edge scheduler allows threads to be bound to specific clusters. The scheduler
1868  * maintains a separate runq on the clutch root to hold these bound threads. These
1869  * bound threads count towards the root priority and thread count, but are ignored
1870  * for thread migration/steal decisions. Bound threads that are enqueued in the
1871  * separate runq have the th_bound_cluster_enqueued flag set to allow easy
1872  * removal.
1873  *
1874  * Bound Threads Timesharing
1875  * The bound threads share the timesharing properties of the clutch bucket group they are
1876  * part of. They contribute to the load and use priority shifts/decay values from the
1877  * clutch bucket group.
1878  */
1879 
1880 static boolean_t
1881 sched_edge_bound_thread_insert(
1882 	sched_clutch_root_t root_clutch,
1883 	thread_t thread,
1884 	integer_t options)
1885 {
1886 	/* Update the clutch runnable count and priority */
1887 	sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
1888 	sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
1889 	if (root_bucket->scrb_bound_thread_runq.count == 0) {
1890 		sched_clutch_root_bucket_runnable(root_bucket, root_clutch, mach_absolute_time());
1891 	}
1892 
1893 	assert((thread->th_bound_cluster_enqueued) == false);
1894 	run_queue_enqueue(&root_bucket->scrb_bound_thread_runq, thread, options);
1895 	thread->th_bound_cluster_enqueued = true;
1896 
1897 	int16_t root_old_pri = root_clutch->scr_priority;
1898 	sched_clutch_root_pri_update(root_clutch);
1899 	return root_clutch->scr_priority > root_old_pri;
1900 }
1901 
1902 static void
1903 sched_edge_bound_thread_remove(
1904 	sched_clutch_root_t root_clutch,
1905 	thread_t thread)
1906 {
1907 	sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
1908 	assert((thread->th_bound_cluster_enqueued) == true);
1909 	run_queue_remove(&root_bucket->scrb_bound_thread_runq, thread);
1910 	thread->th_bound_cluster_enqueued = false;
1911 
1912 	/* Update the clutch runnable count and priority */
1913 	sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
1914 	if (root_bucket->scrb_bound_thread_runq.count == 0) {
1915 		sched_clutch_root_bucket_empty(root_bucket, root_clutch, mach_absolute_time());
1916 	}
1917 	sched_clutch_root_pri_update(root_clutch);
1918 }
1919 
1920 /*
1921  * Edge Scheduler cluster shared resource threads load balancing
1922  *
1923  * The Edge scheduler attempts to load balance cluster shared resource intensive threads
1924  * across clusters in order to reduce contention on the shared resources. It achieves
1925  * that by maintaining the runnable and running shared resource load on each cluster
1926  * and balancing the load across multiple clusters.
1927  *
1928  * The current implementation for cluster shared resource load balancing looks at
1929  * the per-cluster load at thread runnable time to enqueue the thread in the appropriate
1930  * cluster. The thread is enqueued in the cluster bound runqueue to ensure idle CPUs
1931  * do not steal/rebalance shared resource threads. Some more details for the implementation:
1932  *
1933  * - When threads are tagged as shared resource, they go through the cluster selection logic
1934  *   which looks at cluster shared resource loads and picks a cluster accordingly. The thread is
1935  *   enqueued in the cluster bound runqueue.
1936  *
1937  * - When the threads start running and call avoid_processor, the load balancing logic will be
1938  *   invoked and cause the thread to be sent to a more preferred cluster if one exists and has
1939  *   no shared resource load.
1940  *
1941  * - If a CPU in a preferred cluster is going idle and that cluster has no more shared load,
1942  *   it will look at running shared resource threads on foreign clusters and actively rebalance them.
1943  *
1944  * - Runnable shared resource threads are intentionally not stolen by the preferred
1945  *   cluster CPUs as they go idle.
1946  *
1947  * - One caveat of this design is that if a preferred CPU has already run and finished its shared
1948  *   resource thread execution, it will not go out and steal the runnable thread in the non-preferred cluster.
1949  *   The rebalancing will happen when the thread actually runs on a non-preferred cluster and one of the
1950  *   events listed above happen.
1951  *
1952  * - Also, it currently does not consider other properties such as thread priorities and
1953  *   QoS-level thread load in the thread placement decision.
1954  *
1955  * Edge Scheduler cluster shared resource thread scheduling policy
1956  *
1957  * The threads for shared resources can be scheduled using one of the two policies:
1958  *
1959  * EDGE_SHARED_RSRC_SCHED_POLICY_RR
1960  * This policy distributes the threads so that they spread across all available clusters
1961  * irrespective of type. The idea is that this scheduling policy will put a shared resource
1962  * thread on each cluster on the platform before it starts doubling up on clusters.
1963  *
1964  * EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST
1965  * This policy distributes threads so that the threads first fill up all the capacity on
1966  * the preferred cluster and its homogeneous peers before spilling to different core type.
1967  * The current implementation defines capacity based on the number of CPUs in the cluster;
1968  * so a cluster's shared resource is considered full if there are "n" runnable + running
1969  * shared resource threads on the cluster with n cpus. This policy is different from the
1970  * default scheduling policy of the edge scheduler since this always tries to fill up the
1971  * native clusters to capacity even when non-native clusters might be idle.
1972  */
1973 __options_decl(edge_shared_rsrc_sched_policy_t, uint32_t, {
1974 	EDGE_SHARED_RSRC_SCHED_POLICY_RR                = 0,
1975 	EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST      = 1,
1976 });
1977 
1978 const edge_shared_rsrc_sched_policy_t edge_shared_rsrc_policy[CLUSTER_SHARED_RSRC_TYPE_COUNT] = {
1979 	[CLUSTER_SHARED_RSRC_TYPE_RR] = EDGE_SHARED_RSRC_SCHED_POLICY_RR,
1980 	[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST,
1981 };
1982 
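/*
 * Illustrative example (hypothetical topology): on a system with a 4-CPU
 * preferred cluster and a 4-CPU cluster of the other type, eight runnable
 * shared resource threads would be spread across both clusters under
 * EDGE_SHARED_RSRC_SCHED_POLICY_RR, whereas
 * EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST would fill the four slots on the
 * preferred cluster (and its homogeneous peers) before spilling to the other
 * cluster type.
 */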
1983 static void
1984 sched_edge_shared_rsrc_runnable_load_incr(sched_clutch_root_t root_clutch, thread_t thread)
1985 {
1986 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
1987 		root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_RR]++;
1988 		thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_RR] = true;
1989 	}
1990 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
1991 		root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST]++;
1992 		thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = true;
1993 	}
1994 }
1995 
1996 static void
1997 sched_edge_shared_rsrc_runnable_load_decr(sched_clutch_root_t root_clutch, thread_t thread)
1998 {
1999 	for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
2000 		if (thread->th_shared_rsrc_enqueued[shared_rsrc_type]) {
2001 			thread->th_shared_rsrc_enqueued[shared_rsrc_type] = false;
2002 			root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type]--;
2003 		}
2004 	}
2005 }
2006 
2007 uint16_t
2008 sched_edge_shared_rsrc_runnable_load(sched_clutch_root_t root_clutch, cluster_shared_rsrc_type_t shared_rsrc_type)
2009 {
2010 	return root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type];
2011 }
2012 
2013 /*
2014  * sched_edge_shared_rsrc_idle()
2015  *
2016  * Routine used to determine if the constrained resource for the pset is idle. This is
2017  * used by a CPU going idle to decide if it should rebalance a running shared resource
2018  * thread from a non-preferred cluster.
2019  */
2020 static boolean_t
2021 sched_edge_shared_rsrc_idle(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
2022 {
2023 	return sched_pset_cluster_shared_rsrc_load(pset, shared_rsrc_type) == 0;
2024 }
2025 
2026 /*
2027  * sched_edge_thread_shared_rsrc_type
2028  *
2029  * This routine decides if a given thread needs special handling for being a
2030  * heavy shared resource user. It is valid for the same thread to be using
2031  * several shared resources at the same time and have multiple policy flags set.
2032  * This routine determines which of those properties will be used for load
2033  * balancing and migration decisions.
2034  */
2035 static cluster_shared_rsrc_type_t
2036 sched_edge_thread_shared_rsrc_type(thread_t thread)
2037 {
2038 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
2039 		return CLUSTER_SHARED_RSRC_TYPE_RR;
2040 	}
2041 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
2042 		return CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST;
2043 	}
2044 	return CLUSTER_SHARED_RSRC_TYPE_NONE;
2045 }
2046 
2047 #endif /* CONFIG_SCHED_EDGE */
2048 
2049 /*
2050  * sched_clutch_thread_bound_lookup()
2051  *
2052  * Routine to lookup the highest priority runnable thread in a bounded root bucket.
2053  */
2054 static thread_t
2055 sched_clutch_thread_bound_lookup(
2056 	__unused sched_clutch_root_t root_clutch,
2057 	sched_clutch_root_bucket_t root_bucket)
2058 {
2059 	return run_queue_peek(&root_bucket->scrb_bound_thread_runq);
2060 }
2061 
2062 /*
2063  * Clutch Bucket Group Thread Counts and Pending time calculation
2064  *
2065  * The pending time on the clutch_bucket_group allows the scheduler to track if it
2066  * needs to ageout the CPU usage because the clutch_bucket_group has been pending for
2067  * a very long time. The pending time is set to the timestamp as soon as a thread becomes
2068  * runnable. When a thread is picked up for execution from this clutch_bucket_group, the
2069  * pending time is advanced to the time of thread selection.
2070  *
2071  * Since threads for a clutch bucket group can be added or removed from multiple CPUs
2072  * simultaneously, it is important that the updates to thread counts and pending timestamps
2073  * happen atomically. The implementation relies on the following aspects to make that work
2074  * as expected:
2075  * - The clutch scheduler would be deployed on single cluster platforms where the pset lock
2076  *   is held when threads are added/removed and pending timestamps are updated
2077  * - The edge scheduler would have support for double wide 128 bit atomics which allow the
2078  *   thread count and pending timestamp to be updated atomically.
2079  *
2080  * Clutch bucket group interactivity timestamp and score updates also rely on the properties
2081  * above to atomically update the interactivity score for a clutch bucket group.
2082  */
2083 
2084 #if CONFIG_SCHED_EDGE
2085 
2086 static void
2087 sched_clutch_bucket_group_thr_count_inc(
2088 	sched_clutch_bucket_group_t clutch_bucket_group,
2089 	uint64_t timestamp)
2090 {
2091 	sched_clutch_counter_time_t old_pending_data;
2092 	sched_clutch_counter_time_t new_pending_data;
2093 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2094 		new_pending_data.scct_count = old_pending_data.scct_count + 1;
2095 		new_pending_data.scct_timestamp = old_pending_data.scct_timestamp;
2096 		if (old_pending_data.scct_count == 0) {
2097 		        new_pending_data.scct_timestamp = timestamp;
2098 		}
2099 	});
2100 }
2101 
2102 static void
2103 sched_clutch_bucket_group_thr_count_dec(
2104 	sched_clutch_bucket_group_t clutch_bucket_group,
2105 	uint64_t timestamp)
2106 {
2107 	sched_clutch_counter_time_t old_pending_data;
2108 	sched_clutch_counter_time_t new_pending_data;
2109 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2110 		new_pending_data.scct_count = old_pending_data.scct_count - 1;
2111 		if (new_pending_data.scct_count == 0) {
2112 		        new_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2113 		} else {
2114 		        new_pending_data.scct_timestamp = timestamp;
2115 		}
2116 	});
2117 }
2118 
2119 static uint8_t
2120 sched_clutch_bucket_group_pending_ageout(
2121 	sched_clutch_bucket_group_t clutch_bucket_group,
2122 	uint64_t timestamp)
2123 {
2124 	int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2125 	sched_clutch_counter_time_t old_pending_data;
2126 	sched_clutch_counter_time_t new_pending_data;
2127 	uint8_t cpu_usage_shift = 0;
2128 
2129 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2130 		cpu_usage_shift = 0;
2131 		uint64_t old_pending_ts = old_pending_data.scct_timestamp;
2132 		bool old_update = (old_pending_ts >= timestamp);
2133 		bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2134 		bool no_bucket_load = (bucket_load == 0);
2135 		if (old_update || no_pending_time || no_bucket_load) {
2136 		        os_atomic_rmw_loop_give_up();
2137 		}
2138 
2139 		/* Calculate the time the clutch bucket group has been pending */
2140 		uint64_t pending_delta = timestamp - old_pending_ts;
2141 		uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] * bucket_load;
2142 		if (pending_delta < interactivity_delta) {
2143 		        os_atomic_rmw_loop_give_up();
2144 		}
2145 		cpu_usage_shift = (pending_delta / interactivity_delta);
2146 		new_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2147 		new_pending_data.scct_count = old_pending_data.scct_count;
2148 	});
2149 	return cpu_usage_shift;
2150 }
2151 
2152 static uint8_t
2153 sched_clutch_bucket_group_interactivity_score_calculate(
2154 	sched_clutch_bucket_group_t clutch_bucket_group,
2155 	uint64_t timestamp)
2156 {
2157 	if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
2158 		/*
2159 		 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
2160 		 * priorities, make sure all AboveUI buckets are marked interactive.
2161 		 */
2162 		assert(clutch_bucket_group->scbg_interactivity_data.scct_count == (2 * sched_clutch_bucket_group_interactive_pri));
2163 		return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2164 	}
2165 	/* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */
2166 	uint8_t pending_intervals = sched_clutch_bucket_group_pending_ageout(clutch_bucket_group, timestamp);
2167 	/* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */
2168 	sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group, pending_intervals);
2169 	uint8_t interactivity_score = sched_clutch_interactivity_from_cpu_data(clutch_bucket_group);
2170 	sched_clutch_counter_time_t old_interactivity_data;
2171 	sched_clutch_counter_time_t new_interactivity_data;
2172 
2173 	bool score_updated = os_atomic_rmw_loop(&clutch_bucket_group->scbg_interactivity_data.scct_packed, old_interactivity_data.scct_packed, new_interactivity_data.scct_packed, relaxed, {
2174 		if (old_interactivity_data.scct_timestamp >= timestamp) {
2175 		        os_atomic_rmw_loop_give_up();
2176 		}
2177 		new_interactivity_data.scct_timestamp = timestamp;
2178 		if (old_interactivity_data.scct_timestamp != 0) {
2179 		        new_interactivity_data.scct_count = interactivity_score;
2180 		}
2181 	});
2182 	if (score_updated) {
2183 		return (uint8_t)new_interactivity_data.scct_count;
2184 	} else {
2185 		return (uint8_t)old_interactivity_data.scct_count;
2186 	}
2187 }
2188 
2189 #else /* CONFIG_SCHED_EDGE */
2190 
2191 /*
2192  * For the clutch scheduler, atomicity is ensured by making sure all operations
2193  * are happening under the pset lock of the only cluster present on the platform.
2194  */
2195 static void
2196 sched_clutch_bucket_group_thr_count_inc(
2197 	sched_clutch_bucket_group_t clutch_bucket_group,
2198 	uint64_t timestamp)
2199 {
2200 	sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2201 	if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2202 		clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2203 	}
2204 	clutch_bucket_group->scbg_pending_data.scct_count++;
2205 }
2206 
2207 static void
2208 sched_clutch_bucket_group_thr_count_dec(
2209 	sched_clutch_bucket_group_t clutch_bucket_group,
2210 	uint64_t timestamp)
2211 {
2212 	sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2213 	clutch_bucket_group->scbg_pending_data.scct_count--;
2214 	if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2215 		clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2216 	} else {
2217 		clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2218 	}
2219 }
2220 
2221 static uint8_t
2222 sched_clutch_bucket_group_pending_ageout(
2223 	sched_clutch_bucket_group_t clutch_bucket_group,
2224 	uint64_t timestamp)
2225 {
2226 	sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2227 	int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2228 	uint64_t old_pending_ts = clutch_bucket_group->scbg_pending_data.scct_timestamp;
2229 	bool old_update = (old_pending_ts >= timestamp);
2230 	bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2231 	bool no_bucket_load = (bucket_load == 0);
2232 	if (old_update || no_pending_time || no_bucket_load) {
2233 		return 0;
2234 	}
2235 	uint64_t pending_delta = timestamp - old_pending_ts;
2236 	uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] * bucket_load;
2237 	if (pending_delta < interactivity_delta) {
2238 		return 0;
2239 	}
2240 	uint8_t cpu_usage_shift = (pending_delta / interactivity_delta);
2241 	clutch_bucket_group->scbg_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2242 	return cpu_usage_shift;
2243 }
2244 
2245 static uint8_t
2246 sched_clutch_bucket_group_interactivity_score_calculate(
2247 	sched_clutch_bucket_group_t clutch_bucket_group,
2248 	uint64_t timestamp)
2249 {
2250 	sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2251 	if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
2252 		/*
2253 		 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
2254 		 * priorities, make sure all AboveUI buckets are marked interactive.
2255 		 */
2256 		assert(clutch_bucket_group->scbg_interactivity_data.scct_count == (2 * sched_clutch_bucket_group_interactive_pri));
2257 		return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2258 	}
2259 	/* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */
2260 	uint8_t pending_intervals = sched_clutch_bucket_group_pending_ageout(clutch_bucket_group, timestamp);
2261 	/* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */
2262 	sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group, pending_intervals);
2263 	uint8_t interactivity_score = sched_clutch_interactivity_from_cpu_data(clutch_bucket_group);
2264 	if (timestamp > clutch_bucket_group->scbg_interactivity_data.scct_timestamp) {
2265 		clutch_bucket_group->scbg_interactivity_data.scct_count = interactivity_score;
2266 		clutch_bucket_group->scbg_interactivity_data.scct_timestamp = timestamp;
2267 		return interactivity_score;
2268 	} else {
2269 		return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2270 	}
2271 }
2272 
2273 #endif /* CONFIG_SCHED_EDGE */
2274 
2275 /*
2276  * Clutch Bucket Group Run Count and Blocked Time Accounting
2277  *
2278  * The clutch bucket group maintains the number of runnable/running threads in the group.
2279  * Since the blocked time of the clutch bucket group is based on this count, it is
2280  * important to make sure the blocking timestamp and the run count are updated atomically.
2281  *
2282  * Since the run count increments happen without any pset locks held, the scheduler makes
2283  * these updates atomic in the following way:
2284  * - On 64-bit platforms, it uses double wide atomics to update the count & timestamp
2285  * - On 32-bit platforms, it uses a lock to synchronize the count & timestamp update
2286  */
2287 
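/*
 * Example timeline (hypothetical): if the group's run count drops to 0 at time
 * T0 and a thread in the group becomes runnable again at T1, the interval
 * T1 - T0 (capped at sched_clutch_bucket_group_adjust_threshold) is credited to
 * scbcd_cpu_blocked, which boosts the group's interactivity score.
 */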
2288 #if !__LP64__
2289 
2290 static uint32_t
2291 sched_clutch_bucket_group_run_count_inc(
2292 	sched_clutch_bucket_group_t clutch_bucket_group)
2293 {
2294 	uint64_t blocked_time = 0;
2295 	uint64_t updated_run_count = 0;
2296 
2297 	lck_spin_lock(&clutch_bucket_group->scbg_stats_lock);
2298 	if ((clutch_bucket_group->scbg_blocked_data.scct_timestamp != SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID) &&
2299 	    (clutch_bucket_group->scbg_blocked_data.scct_count == 0)) {
2300 		/* Run count is transitioning from 0 to 1; calculate blocked time and add it to CPU data */
2301 		blocked_time = mach_absolute_time() - clutch_bucket_group->scbg_blocked_data.scct_timestamp;
2302 		clutch_bucket_group->scbg_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
2303 	}
2304 	clutch_bucket_group->scbg_blocked_data.scct_count = clutch_bucket_group->scbg_blocked_data.scct_count + 1;
2305 	updated_run_count = clutch_bucket_group->scbg_blocked_data.scct_count;
2306 	lck_spin_unlock(&clutch_bucket_group->scbg_stats_lock);
2307 
2308 	blocked_time = MIN(blocked_time, sched_clutch_bucket_group_adjust_threshold);
2309 	os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed);
2310 	return (uint32_t)updated_run_count;
2311 }
2312 
2313 static uint32_t
2314 sched_clutch_bucket_group_run_count_dec(
2315 	sched_clutch_bucket_group_t clutch_bucket_group)
2316 {
2317 	uint64_t updated_run_count = 0;
2318 
2319 	lck_spin_lock(&clutch_bucket_group->scbg_stats_lock);
2320 	clutch_bucket_group->scbg_blocked_data.scct_count = clutch_bucket_group->scbg_blocked_data.scct_count - 1;
2321 	if (clutch_bucket_group->scbg_blocked_data.scct_count == 0) {
2322 		/* Run count is transitioning from 1 to 0; start the blocked timer */
2323 		clutch_bucket_group->scbg_blocked_data.scct_timestamp = mach_absolute_time();
2324 	}
2325 	updated_run_count = clutch_bucket_group->scbg_blocked_data.scct_count;
2326 	lck_spin_unlock(&clutch_bucket_group->scbg_stats_lock);
2327 	return (uint32_t)updated_run_count;
2328 }
2329 
2330 #else /* !__LP64__ */
2331 
2332 static uint32_t
2333 sched_clutch_bucket_group_run_count_inc(
2334 	sched_clutch_bucket_group_t clutch_bucket_group)
2335 {
2336 	sched_clutch_counter_time_t old_blocked_data;
2337 	sched_clutch_counter_time_t new_blocked_data;
2338 
2339 	bool update_blocked_time = false;
2340 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2341 		new_blocked_data.scct_count = old_blocked_data.scct_count + 1;
2342 		new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2343 		update_blocked_time = false;
2344 		if (old_blocked_data.scct_count == 0) {
2345 		        new_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
2346 		        update_blocked_time = true;
2347 		}
2348 	});
2349 	if (update_blocked_time && (old_blocked_data.scct_timestamp != SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID)) {
2350 		uint64_t ctime = mach_absolute_time();
2351 		if (ctime > old_blocked_data.scct_timestamp) {
2352 			uint64_t blocked_time = ctime - old_blocked_data.scct_timestamp;
2353 			blocked_time = MIN(blocked_time, sched_clutch_bucket_group_adjust_threshold);
2354 			os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed);
2355 		}
2356 	}
2357 	return (uint32_t)new_blocked_data.scct_count;
2358 }
2359 
2360 static uint32_t
2361 sched_clutch_bucket_group_run_count_dec(
2362 	sched_clutch_bucket_group_t clutch_bucket_group)
2363 {
2364 	sched_clutch_counter_time_t old_blocked_data;
2365 	sched_clutch_counter_time_t new_blocked_data;
2366 
2367 	uint64_t ctime = mach_absolute_time();
2368 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2369 		new_blocked_data.scct_count = old_blocked_data.scct_count - 1;
2370 		new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2371 		if (new_blocked_data.scct_count == 0) {
2372 		        new_blocked_data.scct_timestamp = ctime;
2373 		}
2374 	});
2375 	return (uint32_t)new_blocked_data.scct_count;
2376 }
2377 
2378 #endif /* !__LP64__ */
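
/*
 * Blocked-time accounting summary: both variants above implement the same
 * state machine over the packed (scct_count, scct_timestamp) pair in
 * scbg_blocked_data:
 *
 *   run_count_dec(): count 1 -> 0   scct_timestamp = mach_absolute_time()
 *   run_count_inc(): count 0 -> 1   blocked = now - scct_timestamp;
 *                                   blocked = MIN(blocked, sched_clutch_bucket_group_adjust_threshold);
 *                                   scbcd_cpu_blocked += blocked;
 *                                   scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
 *
 * The only difference between the variants is the synchronization primitive:
 * the stats spinlock where the packed pair cannot be updated with a single
 * atomic, and an os_atomic_rmw_loop() otherwise.
 */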
2379 
2380 
2381 /*
2382  * sched_clutch_thread_insert()
2383  *
2384  * Routine to insert a thread into the sched clutch hierarchy.
2385  * Update the counts at all levels of the hierarchy and insert the nodes
2386  * as they become runnable. Always called with the pset lock held.
2387  */
2388 static boolean_t
2389 sched_clutch_thread_insert(
2390 	sched_clutch_root_t root_clutch,
2391 	thread_t thread,
2392 	integer_t options)
2393 {
2394 	boolean_t result = FALSE;
2395 
2396 	sched_clutch_hierarchy_locked_assert(root_clutch);
2397 #if CONFIG_SCHED_EDGE
2398 	sched_edge_cluster_cumulative_count_incr(root_clutch, thread->th_sched_bucket);
2399 	sched_edge_shared_rsrc_runnable_load_incr(root_clutch, thread);
2400 	/*
2401 	 * Check if the thread is bound and is being enqueued in its desired bound cluster.
2402 	 * One scenario where a bound thread might not be getting enqueued in the bound cluster
2403 	 * hierarchy would be if the thread is "soft" bound and the bound cluster is
2404 	 * de-recommended. In that case, the thread should be treated as an unbound
2405 	 * thread.
2406 	 */
2407 	if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) && (sched_edge_thread_bound_cluster_id(thread) == root_clutch->scr_cluster_id)) {
2408 		return sched_edge_bound_thread_insert(root_clutch, thread, options);
2409 	}
2410 	/*
2411 	 * Use bound runqueue for shared resource threads. See "cluster shared resource
2412 	 * threads load balancing" section for details.
2413 	 */
2414 	if (sched_edge_thread_shared_rsrc_type(thread) != CLUSTER_SHARED_RSRC_TYPE_NONE) {
2415 		return sched_edge_bound_thread_insert(root_clutch, thread, options);
2416 	}
2417 
2418 #endif /* CONFIG_SCHED_EDGE */
2419 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
2420 	assert(thread->thread_group == clutch->sc_tg);
2421 
2422 	uint64_t current_timestamp = mach_absolute_time();
2423 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2424 	sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2425 	assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
2426 
2427 	/*
2428 	 * Thread linkage in clutch_bucket
2429 	 *
2430 	 * A thread has a few linkages within the clutch bucket:
2431 	 * - A stable priority queue linkage which is the main runqueue (based on sched_pri) for the clutch bucket
2432 	 * - A regular priority queue linkage which is based on thread's base/promoted pri (used for clutch bucket priority calculation)
2433 	 * - A queue linkage used for timesharing operations of threads at the scheduler tick
2434 	 */
2435 
2436 	/* Insert thread into the clutch_bucket stable priority runqueue using sched_pri */
2437 	thread->th_clutch_runq_link.stamp = current_timestamp;
2438 	priority_queue_entry_set_sched_pri(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link, thread->sched_pri,
2439 	    (options & SCHED_TAILQ) ? PRIORITY_QUEUE_ENTRY_NONE : PRIORITY_QUEUE_ENTRY_PREEMPTED);
2440 	priority_queue_insert(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2441 
2442 	/* Insert thread into clutch_bucket priority queue based on the promoted or base priority */
2443 	priority_queue_entry_set_sched_pri(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link,
2444 	    sched_thread_sched_pri_promoted(thread) ? thread->sched_pri : thread->base_pri, false);
2445 	priority_queue_insert(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2446 
2447 	/* Insert thread into timesharing queue of the clutch bucket */
2448 	enqueue_tail(&clutch_bucket->scb_thread_timeshare_queue, &thread->th_clutch_timeshare_link);
2449 
2450 	/* Increment the urgency counter for the root if necessary */
2451 	sched_clutch_root_urgency_inc(root_clutch, thread);
2452 
2453 	os_atomic_inc(&clutch->sc_thr_count, relaxed);
2454 	sched_clutch_bucket_group_thr_count_inc(clutch_bucket->scb_group, current_timestamp);
2455 
2456 	/* Enqueue the clutch into the hierarchy (if needed) and update properties; pick the insertion order based on thread options */
2457 	sched_clutch_bucket_options_t scb_options = (options & SCHED_HEADQ) ? SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ : SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ;
2458 	if (clutch_bucket->scb_thr_count == 0) {
2459 		sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2460 		sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2461 		result = sched_clutch_bucket_runnable(clutch_bucket, root_clutch, current_timestamp, scb_options);
2462 	} else {
2463 		sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2464 		sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2465 		result = sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, scb_options);
2466 	}
2467 
2468 	KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2469 	    root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2470 	    SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2471 	return result;
2472 }
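
/*
 * Caller contract (as assumed from sched_clutch_processor_enqueue() later in
 * this file): the pset lock is held, thread->runq has already been set to the
 * target processor, and thread->th_sched_bucket has been kept current via
 * SCHED(update_thread_bucket) so the correct clutch bucket group is picked
 * above.
 */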
2473 
2474 /*
2475  * sched_clutch_thread_remove()
2476  *
2477  * Routine to remove a thread from the sched clutch hierarchy.
2478  * Update the counts at all levels of the hierarchy and remove the nodes
2479  * as they become empty. Always called with the pset lock held.
2480  */
2481 static void
2482 sched_clutch_thread_remove(
2483 	sched_clutch_root_t root_clutch,
2484 	thread_t thread,
2485 	uint64_t current_timestamp,
2486 	sched_clutch_bucket_options_t options)
2487 {
2488 	sched_clutch_hierarchy_locked_assert(root_clutch);
2489 #if CONFIG_SCHED_EDGE
2490 	sched_edge_cluster_cumulative_count_decr(root_clutch, thread->th_sched_bucket);
2491 	sched_edge_shared_rsrc_runnable_load_decr(root_clutch, thread);
2492 
2493 	if (thread->th_bound_cluster_enqueued) {
2494 		sched_edge_bound_thread_remove(root_clutch, thread);
2495 		return;
2496 	}
2497 #endif /* CONFIG_SCHED_EDGE */
2498 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
2499 	assert(thread->thread_group == clutch->sc_tg);
2500 	assert(thread->runq != PROCESSOR_NULL);
2501 
2502 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2503 	sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2504 	assert(clutch_bucket->scb_root == root_clutch);
2505 
2506 	/* Decrement the urgency counter for the root if necessary */
2507 	sched_clutch_root_urgency_dec(root_clutch, thread);
2508 	/* Remove thread from the clutch_bucket */
2509 	priority_queue_remove(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2510 	remqueue(&thread->th_clutch_timeshare_link);
2511 	thread->runq = PROCESSOR_NULL;
2512 
2513 	priority_queue_remove(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2514 
2515 	/* Update counts at various levels of the hierarchy */
2516 	os_atomic_dec(&clutch->sc_thr_count, relaxed);
2517 	sched_clutch_bucket_group_thr_count_dec(clutch_bucket->scb_group, current_timestamp);
2518 	sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
2519 	sched_clutch_thr_count_dec(&clutch_bucket->scb_thr_count);
2520 
2521 	/* Remove the clutch from hierarchy (if needed) and update properties */
2522 	if (clutch_bucket->scb_thr_count == 0) {
2523 		sched_clutch_bucket_empty(clutch_bucket, root_clutch, current_timestamp, options);
2524 	} else {
2525 		sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, options);
2526 	}
2527 
2528 	KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2529 	    root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2530 	    SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2531 }
2532 
2533 /*
2534  * sched_clutch_thread_unbound_lookup()
2535  *
2536  * Routine to find the highest unbound thread in the root clutch.
2537  * Helps find threads easily for steal/migrate scenarios in the
2538  * Edge scheduler.
2539  */
2540 static thread_t
2541 sched_clutch_thread_unbound_lookup(
2542 	sched_clutch_root_t root_clutch,
2543 	sched_clutch_root_bucket_t root_bucket)
2544 {
2545 	sched_clutch_hierarchy_locked_assert(root_clutch);
2546 
2547 	/* Find the highest priority clutch bucket in this root bucket */
2548 	sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket);
2549 	assert(clutch_bucket != NULL);
2550 
2551 	/* Find the highest priority runnable thread in this clutch bucket */
2552 	thread_t thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
2553 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
2554 	    thread_tid(thread), thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, 0, 0);
2555 	return thread;
2556 }
2557 
2558 /*
2559  * sched_clutch_thread_highest_remove()
2560  *
2561  * Routine to find and remove the highest priority thread
2562  * from the sched clutch hierarchy. The algorithm looks at the
2563  * hierarchy for the most eligible runnable thread and calls
2564  * sched_clutch_thread_remove(). Always called with the
2565  * pset lock held.
2566  */
2567 static thread_t
2568 sched_clutch_thread_highest_remove(
2569 	sched_clutch_root_t root_clutch)
2570 {
2571 	sched_clutch_hierarchy_locked_assert(root_clutch);
2572 	uint64_t current_timestamp = mach_absolute_time();
2573 
2574 	sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(root_clutch, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
2575 	if (root_bucket == NULL) {
2576 		return THREAD_NULL;
2577 	}
2578 
2579 	thread_t highest_thread = THREAD_NULL;
2580 	if (root_bucket->scrb_bound) {
2581 		highest_thread = sched_clutch_thread_bound_lookup(root_clutch, root_bucket);
2582 	} else {
2583 		highest_thread = sched_clutch_thread_unbound_lookup(root_clutch, root_bucket);
2584 	}
2585 	sched_clutch_thread_remove(root_clutch, highest_thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
2586 	return highest_thread;
2587 }
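
/*
 * Example callers in this file: sched_clutch_choose_thread() dequeues a single
 * thread this way, and the processor-shutdown paths drain the hierarchy by
 * calling sched_clutch_thread_highest_remove() in a loop (with the pset lock
 * held) until sched_clutch_root_count() reaches zero.
 */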
2588 
2589 /* High level global accessor routines */
2590 
2591 /*
2592  * sched_clutch_root_urgency()
2593  *
2594  * Routine to get the urgency of the highest runnable
2595  * thread in the hierarchy.
2596  */
2597 static uint32_t
2598 sched_clutch_root_urgency(
2599 	sched_clutch_root_t root_clutch)
2600 {
2601 	return root_clutch->scr_urgency;
2602 }
2603 
2604 /*
2605  * sched_clutch_root_count_sum()
2606  *
2607  * The count_sum mechanism is used for scheduler runq
2608  * statistics calculation. It's only useful for debugging
2609  * purposes; since it takes a mach_absolute_time() on
2610  * other scheduler implementations, it's better to avoid
2611  * populating this until absolutely necessary.
2612  */
2613 static uint32_t
2614 sched_clutch_root_count_sum(
2615 	__unused sched_clutch_root_t root_clutch)
2616 {
2617 	return 0;
2618 }
2619 
2620 /*
2621  * sched_clutch_root_priority()
2622  *
2623  * Routine to get the priority of the highest runnable
2624  * thread in the hierarchy.
2625  */
2626 static int
2627 sched_clutch_root_priority(
2628 	sched_clutch_root_t root_clutch)
2629 {
2630 	return root_clutch->scr_priority;
2631 }
2632 
2633 /*
2634  * sched_clutch_root_count()
2635  *
2636  * Returns total number of runnable threads in the hierarchy.
2637  */
2638 uint32_t
2639 sched_clutch_root_count(
2640 	sched_clutch_root_t root_clutch)
2641 {
2642 	return root_clutch->scr_thr_count;
2643 }
2644 
2645 #if CONFIG_SCHED_EDGE
2646 
2647 /*
2648  * sched_clutch_root_foreign_empty()
2649  *
2650  * Routine to check if the foreign clutch bucket priority list is empty for a cluster.
2651  */
2652 static boolean_t
2653 sched_clutch_root_foreign_empty(
2654 	sched_clutch_root_t root_clutch)
2655 {
2656 	return priority_queue_empty(&root_clutch->scr_foreign_buckets);
2657 }
2658 
2659 /*
2660  * sched_clutch_root_highest_foreign_thread_remove()
2661  *
2662  * Routine to return the thread in the highest priority clutch bucket in a cluster.
2663  * Must be called with the pset for the cluster locked.
2664  */
2665 static thread_t
2666 sched_clutch_root_highest_foreign_thread_remove(
2667 	sched_clutch_root_t root_clutch)
2668 {
2669 	thread_t thread = THREAD_NULL;
2670 	if (priority_queue_empty(&root_clutch->scr_foreign_buckets)) {
2671 		return thread;
2672 	}
2673 	sched_clutch_bucket_t clutch_bucket = priority_queue_max(&root_clutch->scr_foreign_buckets, struct sched_clutch_bucket, scb_foreignlink);
2674 	thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
2675 	sched_clutch_thread_remove(root_clutch, thread, mach_absolute_time(), 0);
2676 	return thread;
2677 }
2678 
2679 #endif /* CONFIG_SCHED_EDGE */
2680 
2681 /*
2682  * sched_clutch_thread_pri_shift()
2683  *
2684  * Routine to get the priority shift value for a thread.
2685  * Since the timesharing is done at the clutch_bucket level,
2686  * this routine gets the clutch_bucket and retrieves the
2687  * values from there.
2688  */
2689 uint32_t
2690 sched_clutch_thread_pri_shift(
2691 	thread_t thread,
2692 	sched_bucket_t bucket)
2693 {
2694 	if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2695 		return INT8_MAX;
2696 	}
2697 	assert(bucket != TH_BUCKET_RUN);
2698 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
2699 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
2700 	return os_atomic_load(&clutch_bucket_group->scbg_pri_shift, relaxed);
2701 }
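
/*
 * Usage note: sched_clutch_update_thread_bucket() below caches this value in
 * thread->pri_shift whenever the thread changes buckets, so the timeshare
 * priority decay can use the per-clutch-bucket-group load without walking the
 * hierarchy. The INT8_MAX value returned for ineligible threads effectively
 * shifts their CPU usage contribution down to zero.
 */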
2702 
2703 #pragma mark -- Clutch Scheduler Algorithm
2704 
2705 static void
2706 sched_clutch_init(void);
2707 
2708 static thread_t
2709 sched_clutch_steal_thread(processor_set_t pset);
2710 
2711 static void
2712 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context);
2713 
2714 static boolean_t
2715 sched_clutch_processor_enqueue(processor_t processor, thread_t thread,
2716     sched_options_t options);
2717 
2718 static boolean_t
2719 sched_clutch_processor_queue_remove(processor_t processor, thread_t thread);
2720 
2721 static ast_t
2722 sched_clutch_processor_csw_check(processor_t processor);
2723 
2724 static boolean_t
2725 sched_clutch_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
2726 
2727 static int
2728 sched_clutch_runq_count(processor_t processor);
2729 
2730 static boolean_t
2731 sched_clutch_processor_queue_empty(processor_t processor);
2732 
2733 static uint64_t
2734 sched_clutch_runq_stats_count_sum(processor_t processor);
2735 
2736 static int
2737 sched_clutch_processor_bound_count(processor_t processor);
2738 
2739 static void
2740 sched_clutch_pset_init(processor_set_t pset);
2741 
2742 static void
2743 sched_clutch_processor_init(processor_t processor);
2744 
2745 static thread_t
2746 sched_clutch_choose_thread(processor_t processor, int priority, ast_t reason);
2747 
2748 static void
2749 sched_clutch_processor_queue_shutdown(processor_t processor);
2750 
2751 static sched_mode_t
2752 sched_clutch_initial_thread_sched_mode(task_t parent_task);
2753 
2754 static uint32_t
2755 sched_clutch_initial_quantum_size(thread_t thread);
2756 
2757 static bool
2758 sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread);
2759 
2760 static uint32_t
2761 sched_clutch_run_incr(thread_t thread);
2762 
2763 static uint32_t
2764 sched_clutch_run_decr(thread_t thread);
2765 
2766 static void
2767 sched_clutch_update_thread_bucket(thread_t thread);
2768 
2769 const struct sched_dispatch_table sched_clutch_dispatch = {
2770 	.sched_name                                     = "clutch",
2771 	.init                                           = sched_clutch_init,
2772 	.timebase_init                                  = sched_timeshare_timebase_init,
2773 	.processor_init                                 = sched_clutch_processor_init,
2774 	.pset_init                                      = sched_clutch_pset_init,
2775 	.maintenance_continuation                       = sched_timeshare_maintenance_continue,
2776 	.choose_thread                                  = sched_clutch_choose_thread,
2777 	.steal_thread_enabled                           = sched_steal_thread_enabled,
2778 	.steal_thread                                   = sched_clutch_steal_thread,
2779 	.compute_timeshare_priority                     = sched_compute_timeshare_priority,
2780 	.choose_node                                    = sched_choose_node,
2781 	.choose_processor                               = choose_processor,
2782 	.processor_enqueue                              = sched_clutch_processor_enqueue,
2783 	.processor_queue_shutdown                       = sched_clutch_processor_queue_shutdown,
2784 	.processor_queue_remove                         = sched_clutch_processor_queue_remove,
2785 	.processor_queue_empty                          = sched_clutch_processor_queue_empty,
2786 	.priority_is_urgent                             = priority_is_urgent,
2787 	.processor_csw_check                            = sched_clutch_processor_csw_check,
2788 	.processor_queue_has_priority                   = sched_clutch_processor_queue_has_priority,
2789 	.initial_quantum_size                           = sched_clutch_initial_quantum_size,
2790 	.initial_thread_sched_mode                      = sched_clutch_initial_thread_sched_mode,
2791 	.can_update_priority                            = can_update_priority,
2792 	.update_priority                                = update_priority,
2793 	.lightweight_update_priority                    = lightweight_update_priority,
2794 	.quantum_expire                                 = sched_default_quantum_expire,
2795 	.processor_runq_count                           = sched_clutch_runq_count,
2796 	.processor_runq_stats_count_sum                 = sched_clutch_runq_stats_count_sum,
2797 	.processor_bound_count                          = sched_clutch_processor_bound_count,
2798 	.thread_update_scan                             = sched_clutch_thread_update_scan,
2799 	.multiple_psets_enabled                         = TRUE,
2800 	.sched_groups_enabled                           = FALSE,
2801 	.avoid_processor_enabled                        = TRUE,
2802 	.thread_avoid_processor                         = sched_clutch_thread_avoid_processor,
2803 	.processor_balance                              = sched_SMT_balance,
2804 
2805 	.rt_runq                                        = sched_rtlocal_runq,
2806 	.rt_init                                        = sched_rtlocal_init,
2807 	.rt_queue_shutdown                              = sched_rtlocal_queue_shutdown,
2808 	.rt_runq_scan                                   = sched_rtlocal_runq_scan,
2809 	.rt_runq_count_sum                              = sched_rtlocal_runq_count_sum,
2810 	.rt_steal_thread                                = sched_rtlocal_steal_thread,
2811 
2812 	.qos_max_parallelism                            = sched_qos_max_parallelism,
2813 	.check_spill                                    = sched_check_spill,
2814 	.ipi_policy                                     = sched_ipi_policy,
2815 	.thread_should_yield                            = sched_thread_should_yield,
2816 	.run_count_incr                                 = sched_clutch_run_incr,
2817 	.run_count_decr                                 = sched_clutch_run_decr,
2818 	.update_thread_bucket                           = sched_clutch_update_thread_bucket,
2819 	.pset_made_schedulable                          = sched_pset_made_schedulable,
2820 	.cpu_init_completed                             = NULL,
2821 	.thread_eligible_for_pset                       = NULL,
2822 };
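
/*
 * As the dispatch table above shows, the clutch scheduler reuses the timeshare
 * core (sched_timeshare_maintenance_continue, sched_compute_timeshare_priority,
 * update_priority, etc.) and the default RT/IPI/spill policies; only the
 * runqueue layer (enqueue, remove, choose, steal and the associated counts) is
 * replaced by the clutch hierarchy routines.
 */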
2823 
2824 __attribute__((always_inline))
2825 static inline run_queue_t
2826 sched_clutch_bound_runq(processor_t processor)
2827 {
2828 	return &processor->runq;
2829 }
2830 
2831 __attribute__((always_inline))
2832 static inline sched_clutch_root_t
2833 sched_clutch_processor_root_clutch(processor_t processor)
2834 {
2835 	return &processor->processor_set->pset_clutch_root;
2836 }
2837 
2838 __attribute__((always_inline))
2839 static inline run_queue_t
2840 sched_clutch_thread_bound_runq(processor_t processor, __assert_only thread_t thread)
2841 {
2842 	assert(thread->bound_processor == processor);
2843 	return sched_clutch_bound_runq(processor);
2844 }
2845 
2846 static uint32_t
2847 sched_clutch_initial_quantum_size(thread_t thread)
2848 {
2849 	if (thread == THREAD_NULL) {
2850 		return std_quantum;
2851 	}
2852 	assert(sched_clutch_thread_quantum[thread->th_sched_bucket] <= UINT32_MAX);
2853 	return (uint32_t)sched_clutch_thread_quantum[thread->th_sched_bucket];
2854 }
2855 
2856 static sched_mode_t
2857 sched_clutch_initial_thread_sched_mode(task_t parent_task)
2858 {
2859 	if (parent_task == kernel_task) {
2860 		return TH_MODE_FIXED;
2861 	} else {
2862 		return TH_MODE_TIMESHARE;
2863 	}
2864 }
2865 
2866 static void
2867 sched_clutch_processor_init(processor_t processor)
2868 {
2869 	run_queue_init(&processor->runq);
2870 }
2871 
2872 static void
2873 sched_clutch_pset_init(processor_set_t pset)
2874 {
2875 	sched_clutch_root_init(&pset->pset_clutch_root, pset);
2876 }
2877 
2878 static void
2879 sched_clutch_tunables_init(void)
2880 {
2881 	sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
2882 	sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
2883 	sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
2884 	clock_interval_to_absolutetime_interval(SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS,
2885 	    NSEC_PER_USEC, &sched_clutch_bucket_group_adjust_threshold);
2886 	assert(sched_clutch_bucket_group_adjust_threshold <= CLUTCH_CPU_DATA_MAX);
2887 	sched_clutch_us_to_abstime(sched_clutch_bucket_group_pending_delta_us, sched_clutch_bucket_group_pending_delta);
2888 }
2889 
2890 static void
2891 sched_clutch_init(void)
2892 {
2893 	if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
2894 		sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
2895 	}
2896 	sched_timeshare_init();
2897 	sched_clutch_tunables_init();
2898 }
2899 
2900 static thread_t
2901 sched_clutch_choose_thread(
2902 	processor_t      processor,
2903 	int              priority,
2904 	__unused ast_t            reason)
2905 {
2906 	int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
2907 	uint32_t clutch_count = sched_clutch_root_count(sched_clutch_processor_root_clutch(processor));
2908 	run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2909 	boolean_t choose_from_boundq = false;
2910 
2911 	if (bound_runq->highq < priority &&
2912 	    clutch_pri < priority) {
2913 		return THREAD_NULL;
2914 	}
2915 
2916 	if (bound_runq->count && clutch_count) {
2917 		if (bound_runq->highq >= clutch_pri) {
2918 			choose_from_boundq = true;
2919 		}
2920 	} else if (bound_runq->count) {
2921 		choose_from_boundq = true;
2922 	} else if (clutch_count) {
2923 		choose_from_boundq = false;
2924 	} else {
2925 		return THREAD_NULL;
2926 	}
2927 
2928 	thread_t thread = THREAD_NULL;
2929 	if (choose_from_boundq == false) {
2930 		sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
2931 		thread = sched_clutch_thread_highest_remove(pset_clutch_root);
2932 	} else {
2933 		thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
2934 	}
2935 	return thread;
2936 }
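
/*
 * Note: when both the bound runqueue and the clutch hierarchy are non-empty,
 * ties go to the bound runqueue (bound_runq->highq >= clutch_pri); the Edge
 * variant sched_edge_choose_thread() below applies the same tie-break.
 */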
2937 
2938 static boolean_t
2939 sched_clutch_processor_enqueue(
2940 	processor_t       processor,
2941 	thread_t          thread,
2942 	sched_options_t   options)
2943 {
2944 	boolean_t       result;
2945 
2946 	thread->runq = processor;
2947 	if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2948 		sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
2949 		result = sched_clutch_thread_insert(pset_clutch_root, thread, options);
2950 	} else {
2951 		run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
2952 		result = run_queue_enqueue(rq, thread, options);
2953 	}
2954 	return result;
2955 }
2956 
2957 static boolean_t
2958 sched_clutch_processor_queue_empty(processor_t processor)
2959 {
2960 	return sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0 &&
2961 	       sched_clutch_bound_runq(processor)->count == 0;
2962 }
2963 
2964 static ast_t
2965 sched_clutch_processor_csw_check(processor_t processor)
2966 {
2967 	boolean_t       has_higher;
2968 	int             pri;
2969 
2970 	if (sched_clutch_thread_avoid_processor(processor, current_thread())) {
2971 		return AST_PREEMPT | AST_URGENT;
2972 	}
2973 
2974 	run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2975 	int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
2976 
2977 	assert(processor->active_thread != NULL);
2978 
2979 	pri = MAX(clutch_pri, bound_runq->highq);
2980 
2981 	if (processor->first_timeslice) {
2982 		has_higher = (pri > processor->current_pri);
2983 	} else {
2984 		has_higher = (pri >= processor->current_pri);
2985 	}
2986 
2987 	if (has_higher) {
2988 		if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
2989 			return AST_PREEMPT | AST_URGENT;
2990 		}
2991 
2992 		if (bound_runq->urgency > 0) {
2993 			return AST_PREEMPT | AST_URGENT;
2994 		}
2995 
2996 		return AST_PREEMPT;
2997 	}
2998 
2999 	return AST_NONE;
3000 }
3001 
3002 static boolean_t
3003 sched_clutch_processor_queue_has_priority(processor_t    processor,
3004     int            priority,
3005     boolean_t      gte)
3006 {
3007 	run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3008 
3009 	int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);
3010 
3011 	if (gte) {
3012 		return qpri >= priority;
3013 	} else {
3014 		return qpri > priority;
3015 	}
3016 }
3017 
3018 static int
3019 sched_clutch_runq_count(processor_t processor)
3020 {
3021 	return (int)sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) + sched_clutch_bound_runq(processor)->count;
3022 }
3023 
3024 static uint64_t
3025 sched_clutch_runq_stats_count_sum(processor_t processor)
3026 {
3027 	uint64_t bound_sum = sched_clutch_bound_runq(processor)->runq_stats.count_sum;
3028 
3029 	if (processor->cpu_id == processor->processor_set->cpu_set_low) {
3030 		return bound_sum + sched_clutch_root_count_sum(sched_clutch_processor_root_clutch(processor));
3031 	} else {
3032 		return bound_sum;
3033 	}
3034 }
3035 static int
3036 sched_clutch_processor_bound_count(processor_t processor)
3037 {
3038 	return sched_clutch_bound_runq(processor)->count;
3039 }
3040 
3041 static void
3042 sched_clutch_processor_queue_shutdown(processor_t processor)
3043 {
3044 	processor_set_t pset = processor->processor_set;
3045 	sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3046 	thread_t        thread;
3047 	queue_head_t    tqueue;
3048 
3049 	/* We only need to migrate threads if this is the last active processor in the pset */
3050 	if (pset->online_processor_count > 0) {
3051 		pset_unlock(pset);
3052 		return;
3053 	}
3054 
3055 	queue_init(&tqueue);
3056 	while (sched_clutch_root_count(pset_clutch_root) > 0) {
3057 		thread = sched_clutch_thread_highest_remove(pset_clutch_root);
3058 		enqueue_tail(&tqueue, &thread->runq_links);
3059 	}
3060 
3061 	pset_unlock(pset);
3062 
3063 	qe_foreach_element_safe(thread, &tqueue, runq_links) {
3064 		remqueue(&thread->runq_links);
3065 		thread_lock(thread);
3066 		thread_setrun(thread, SCHED_TAILQ);
3067 		thread_unlock(thread);
3068 	}
3069 }
3070 
3071 static boolean_t
3072 sched_clutch_processor_queue_remove(
3073 	processor_t processor,
3074 	thread_t    thread)
3075 {
3076 	run_queue_t             rq;
3077 	processor_set_t         pset = processor->processor_set;
3078 
3079 	pset_lock(pset);
3080 
3081 	if (processor == thread->runq) {
3082 		/*
3083 		 * Thread is on a run queue and we have a lock on
3084 		 * that run queue.
3085 		 */
3086 		if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3087 			sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3088 			sched_clutch_thread_remove(pset_clutch_root, thread, mach_absolute_time(), SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
3089 		} else {
3090 			rq = sched_clutch_thread_bound_runq(processor, thread);
3091 			run_queue_remove(rq, thread);
3092 		}
3093 	} else {
3094 		/*
3095 		 * The thread left the run queue before we could
3096 		 * lock the run queue.
3097 		 */
3098 		assert(thread->runq == PROCESSOR_NULL);
3099 		processor = PROCESSOR_NULL;
3100 	}
3101 
3102 	pset_unlock(pset);
3103 
3104 	return processor != PROCESSOR_NULL;
3105 }
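
/*
 * The return value indicates whether this routine actually removed the thread;
 * if the thread raced off the runqueue before the pset lock was acquired,
 * processor is reset to PROCESSOR_NULL and FALSE is returned.
 */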
3106 
3107 static thread_t
3108 sched_clutch_steal_thread(__unused processor_set_t pset)
3109 {
3110 	/* Thread stealing is not enabled for single cluster clutch scheduler platforms */
3111 	return THREAD_NULL;
3112 }
3113 
3114 static void
3115 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context)
3116 {
3117 	boolean_t               restart_needed = FALSE;
3118 	processor_t             processor = processor_list;
3119 	processor_set_t         pset;
3120 	thread_t                thread;
3121 	spl_t                   s;
3122 
3123 	/*
3124 	 *  We update the threads associated with each processor (bound and idle threads)
3125 	 *  and then update the threads in each pset runqueue.
3126 	 */
3127 
3128 	do {
3129 		do {
3130 			pset = processor->processor_set;
3131 
3132 			s = splsched();
3133 			pset_lock(pset);
3134 
3135 			restart_needed = runq_scan(sched_clutch_bound_runq(processor), scan_context);
3136 
3137 			pset_unlock(pset);
3138 			splx(s);
3139 
3140 			if (restart_needed) {
3141 				break;
3142 			}
3143 
3144 			thread = processor->idle_thread;
3145 			if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
3146 				if (thread_update_add_thread(thread) == FALSE) {
3147 					restart_needed = TRUE;
3148 					break;
3149 				}
3150 			}
3151 		} while ((processor = processor->processor_list) != NULL);
3152 
3153 		/* Ok, we now have a collection of candidates -- fix them. */
3154 		thread_update_process_threads();
3155 	} while (restart_needed);
3156 
3157 	pset_node_t node = &pset_node0;
3158 	pset = node->psets;
3159 
3160 	do {
3161 		do {
3162 			restart_needed = FALSE;
3163 			while (pset != NULL) {
3164 				s = splsched();
3165 				pset_lock(pset);
3166 
3167 				if (sched_clutch_root_count(&pset->pset_clutch_root) > 0) {
3168 					for (sched_bucket_t bucket = TH_BUCKET_SHARE_FG; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
3169 						restart_needed = runq_scan(&pset->pset_clutch_root.scr_bound_buckets[bucket].scrb_bound_thread_runq, scan_context);
3170 						if (restart_needed) {
3171 							break;
3172 						}
3173 					}
3174 					queue_t clutch_bucket_list = &pset->pset_clutch_root.scr_clutch_buckets;
3175 					sched_clutch_bucket_t clutch_bucket;
3176 					qe_foreach_element(clutch_bucket, clutch_bucket_list, scb_listlink) {
3177 						sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, scan_context->sched_tick_last_abstime);
3178 						restart_needed = sched_clutch_timeshare_scan(&clutch_bucket->scb_thread_timeshare_queue, clutch_bucket->scb_thr_count, scan_context);
3179 					}
3180 				}
3181 
3182 				pset_unlock(pset);
3183 				splx(s);
3184 
3185 				if (restart_needed) {
3186 					break;
3187 				}
3188 				pset = pset->pset_list;
3189 			}
3190 
3191 			if (restart_needed) {
3192 				break;
3193 			}
3194 		} while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
3195 
3196 		/* Ok, we now have a collection of candidates -- fix them. */
3197 		thread_update_process_threads();
3198 	} while (restart_needed);
3199 }
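
/*
 * The scan above runs in two phases: first the per-processor bound runqueues
 * and idle threads, then the per-pset clutch hierarchy (bound root bucket
 * runqueues and the timeshare queue of each clutch bucket). Each phase restarts
 * from the top whenever runq_scan()/sched_clutch_timeshare_scan() asks for a
 * restart, typically because the pending thread-update batch is full.
 */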
3200 
3201 extern int sched_allow_rt_smt;
3202 
3203 /* Return true if this thread should not continue running on this processor */
3204 static bool
3205 sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread)
3206 {
3207 	if (processor->processor_primary != processor) {
3208 		/*
3209 		 * This is a secondary SMT processor.  If the primary is running
3210 		 * a realtime thread, only allow realtime threads on the secondary.
3211 		 */
3212 		if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
3213 			return true;
3214 		}
3215 	}
3216 
3217 	return false;
3218 }
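
/*
 * Example: a timeshare thread running on a secondary SMT processor whose
 * primary is executing a realtime thread returns true here;
 * sched_clutch_processor_csw_check() above turns that into
 * AST_PREEMPT | AST_URGENT so the thread is switched off the secondary.
 */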
3219 
3220 /*
3221  * For the clutch scheduler, the run counts are maintained in the clutch
3222  * buckets (i.e. the thread group scheduling structure).
3223  */
3224 static uint32_t
3225 sched_clutch_run_incr(thread_t thread)
3226 {
3227 	assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
3228 	uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3229 	sched_clutch_thread_run_bucket_incr(thread, thread->th_sched_bucket);
3230 	return new_count;
3231 }
3232 
3233 static uint32_t
3234 sched_clutch_run_decr(thread_t thread)
3235 {
3236 	assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);
3237 	uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3238 	sched_clutch_thread_run_bucket_decr(thread, thread->th_sched_bucket);
3239 	return new_count;
3240 }
3241 
3242 /*
3243  * For threads that have changed sched_pri without changing the
3244  * base_pri for any reason other than decay, use the sched_pri
3245  * as the bucketizing priority instead of base_pri. All such
3246  * changes are typically due to kernel locking primitives boosts
3247  * or demotions.
3248  */
3249 static boolean_t
3250 sched_thread_sched_pri_promoted(thread_t thread)
3251 {
3252 	return (thread->sched_flags & TH_SFLAG_PROMOTED) ||
3253 	       (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) ||
3254 	       (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) ||
3255 	       (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) ||
3256 	       (thread->kern_promotion_schedpri != 0);
3257 }
3258 
3259 /*
3260  * Routine to update the scheduling bucket for the thread.
3261  *
3262  * In the clutch scheduler implementation, the thread's bucket
3263  * is based on sched_pri if it was promoted due to a kernel
3264  * primitive; otherwise its based on the thread base_pri. This
3265  * enhancement allows promoted threads to reach a higher priority
3266  * bucket and potentially get selected sooner for scheduling.
3267  *
3268  * Also, the clutch scheduler does not honor fixed priority below
3269  * FG priority. It simply puts those threads in the corresponding
3270  * timeshare bucket. The reason for doing so is that it is
3271  * extremely hard to define the scheduling properties of such threads
3272  * and they typically lead to performance issues.
3273  */
3274 
3275 void
3276 sched_clutch_update_thread_bucket(thread_t thread)
3277 {
3278 	sched_bucket_t old_bucket = thread->th_sched_bucket;
3279 	assert(thread->runq == PROCESSOR_NULL);
3280 	int pri = (sched_thread_sched_pri_promoted(thread)) ? thread->sched_pri : thread->base_pri;
3281 	sched_bucket_t new_bucket = sched_clutch_thread_bucket_map(thread, pri);
3282 
3283 	if (old_bucket == new_bucket) {
3284 		return;
3285 	}
3286 
3287 	thread->th_sched_bucket = new_bucket;
3288 	thread->pri_shift = sched_clutch_thread_pri_shift(thread, new_bucket);
3289 	/*
3290 	 * Since this is called after the thread has been removed from the runq,
3291 	 * only the run counts need to be updated. The re-insert into the runq
3292 	 * would put the thread into the correct new bucket's runq.
3293 	 */
3294 	if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
3295 		sched_clutch_thread_run_bucket_decr(thread, old_bucket);
3296 		sched_clutch_thread_run_bucket_incr(thread, new_bucket);
3297 	}
3298 }
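
/*
 * Example: a thread promoted by a kernel locking primitive (TH_SFLAG_PROMOTED
 * et al.) is bucketized here using its boosted sched_pri rather than its
 * base_pri; once the promotion drops, a subsequent call maps it back to its
 * base_pri bucket.
 */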
3299 
3300 #if CONFIG_SCHED_EDGE
3301 
3302 /* Implementation of the AMP version of the clutch scheduler */
3303 
3304 static void
3305 sched_edge_init(void);
3306 
3307 static void
3308 sched_edge_pset_init(processor_set_t pset);
3309 
3310 static thread_t
3311 sched_edge_processor_idle(processor_set_t pset);
3312 
3313 static ast_t
3314 sched_edge_processor_csw_check(processor_t processor);
3315 
3316 static boolean_t
3317 sched_edge_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
3318 
3319 static boolean_t
3320 sched_edge_processor_queue_empty(processor_t processor);
3321 
3322 static thread_t
3323 sched_edge_choose_thread(processor_t processor, int priority, ast_t reason);
3324 
3325 static void
3326 sched_edge_processor_queue_shutdown(processor_t processor);
3327 
3328 static processor_t
3329 sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread);
3330 
3331 static bool
3332 sched_edge_thread_avoid_processor(processor_t processor, thread_t thread);
3333 
3334 static void
3335 sched_edge_balance(processor_t cprocessor, processor_set_t cpset);
3336 
3337 static void
3338 sched_edge_check_spill(processor_set_t pset, thread_t thread);
3339 
3340 static bool
3341 sched_edge_thread_should_yield(processor_t processor, thread_t thread);
3342 
3343 static void
3344 sched_edge_pset_made_schedulable(processor_t processor, processor_set_t dst_pset, boolean_t drop_lock);
3345 
3346 static void
3347 sched_edge_cpu_init_completed(void);
3348 
3349 static bool
3350 sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset);
3351 
3352 static bool
3353 sched_edge_steal_thread_enabled(processor_set_t pset);
3354 
3355 static sched_ipi_type_t
3356 sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event);
3357 
3358 static uint32_t
3359 sched_edge_qos_max_parallelism(int qos, uint64_t options);
3360 
3361 static uint32_t
3362 sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket);
3363 
3364 const struct sched_dispatch_table sched_edge_dispatch = {
3365 	.sched_name                                     = "edge",
3366 	.init                                           = sched_edge_init,
3367 	.timebase_init                                  = sched_timeshare_timebase_init,
3368 	.processor_init                                 = sched_clutch_processor_init,
3369 	.pset_init                                      = sched_edge_pset_init,
3370 	.maintenance_continuation                       = sched_timeshare_maintenance_continue,
3371 	.choose_thread                                  = sched_edge_choose_thread,
3372 	.steal_thread_enabled                           = sched_edge_steal_thread_enabled,
3373 	.steal_thread                                   = sched_edge_processor_idle,
3374 	.compute_timeshare_priority                     = sched_compute_timeshare_priority,
3375 	.choose_node                                    = sched_choose_node,
3376 	.choose_processor                               = sched_edge_choose_processor,
3377 	.processor_enqueue                              = sched_clutch_processor_enqueue,
3378 	.processor_queue_shutdown                       = sched_edge_processor_queue_shutdown,
3379 	.processor_queue_remove                         = sched_clutch_processor_queue_remove,
3380 	.processor_queue_empty                          = sched_edge_processor_queue_empty,
3381 	.priority_is_urgent                             = priority_is_urgent,
3382 	.processor_csw_check                            = sched_edge_processor_csw_check,
3383 	.processor_queue_has_priority                   = sched_edge_processor_queue_has_priority,
3384 	.initial_quantum_size                           = sched_clutch_initial_quantum_size,
3385 	.initial_thread_sched_mode                      = sched_clutch_initial_thread_sched_mode,
3386 	.can_update_priority                            = can_update_priority,
3387 	.update_priority                                = update_priority,
3388 	.lightweight_update_priority                    = lightweight_update_priority,
3389 	.quantum_expire                                 = sched_default_quantum_expire,
3390 	.processor_runq_count                           = sched_clutch_runq_count,
3391 	.processor_runq_stats_count_sum                 = sched_clutch_runq_stats_count_sum,
3392 	.processor_bound_count                          = sched_clutch_processor_bound_count,
3393 	.thread_update_scan                             = sched_clutch_thread_update_scan,
3394 	.multiple_psets_enabled                         = TRUE,
3395 	.sched_groups_enabled                           = FALSE,
3396 	.avoid_processor_enabled                        = TRUE,
3397 	.thread_avoid_processor                         = sched_edge_thread_avoid_processor,
3398 	.processor_balance                              = sched_edge_balance,
3399 
3400 	.rt_runq                                        = sched_rtlocal_runq,
3401 	.rt_init                                        = sched_rtlocal_init,
3402 	.rt_queue_shutdown                              = sched_rtlocal_queue_shutdown,
3403 	.rt_runq_scan                                   = sched_rtlocal_runq_scan,
3404 	.rt_runq_count_sum                              = sched_rtlocal_runq_count_sum,
3405 	.rt_steal_thread                                = sched_rtlocal_steal_thread,
3406 
3407 	.qos_max_parallelism                            = sched_edge_qos_max_parallelism,
3408 	.check_spill                                    = sched_edge_check_spill,
3409 	.ipi_policy                                     = sched_edge_ipi_policy,
3410 	.thread_should_yield                            = sched_edge_thread_should_yield,
3411 	.run_count_incr                                 = sched_clutch_run_incr,
3412 	.run_count_decr                                 = sched_clutch_run_decr,
3413 	.update_thread_bucket                           = sched_clutch_update_thread_bucket,
3414 	.pset_made_schedulable                          = sched_edge_pset_made_schedulable,
3415 	.thread_group_recommendation_change             = NULL,
3416 	.cpu_init_completed                             = sched_edge_cpu_init_completed,
3417 	.thread_eligible_for_pset                       = sched_edge_thread_eligible_for_pset,
3418 };
3419 
3420 static bitmap_t sched_edge_available_pset_bitmask[BITMAP_LEN(MAX_PSETS)];
3421 
3422 /*
3423  * sched_edge_pset_available()
3424  *
3425  * Routine to determine if a pset is available for scheduling.
3426  */
3427 static bool
3428 sched_edge_pset_available(processor_set_t pset)
3429 {
3430 	if (pset == NULL) {
3431 		return false;
3432 	}
3433 	return pset_available_cpu_count(pset) > 0;
3434 }
3435 
3436 /*
3437  * sched_edge_thread_bound_cluster_id()
3438  *
3439  * Routine to determine which cluster a particular thread is bound to. Uses
3440  * the sched_flags on the thread to map back to a specific cluster id.
3441  *
3442  * <Edge Multi-cluster Support Needed>
3443  */
3444 static uint32_t
3445 sched_edge_thread_bound_cluster_id(thread_t thread)
3446 {
3447 	assert(SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread));
3448 	return thread->th_bound_cluster_id;
3449 }
3450 
3451 /* Forward declaration for some thread migration routines */
3452 static boolean_t sched_edge_foreign_runnable_thread_available(processor_set_t pset);
3453 static boolean_t sched_edge_foreign_running_thread_available(processor_set_t pset);
3454 static processor_set_t sched_edge_steal_candidate(processor_set_t pset);
3455 static processor_set_t sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks);
3456 
3457 /*
3458  * sched_edge_config_set()
3459  *
3460  * Support to update an edge configuration. Typically used by CLPC to affect thread migration
3461  * policies in the scheduler.
3462  */
3463 static void
3464 sched_edge_config_set(uint32_t src_cluster, uint32_t dst_cluster, sched_clutch_edge edge_config)
3465 {
3466 	sched_clutch_edge *edge = &pset_array[src_cluster]->sched_edges[dst_cluster];
3467 	edge->sce_edge_packed = edge_config.sce_edge_packed;
3468 }
3469 
3470 /*
3471  * sched_edge_config_get()
3472  *
3473  * Support to get an edge configuration. Typically used by CLPC to query edge configs to decide
3474  * if it needs to update edges.
3475  */
3476 static sched_clutch_edge
3477 sched_edge_config_get(uint32_t src_cluster, uint32_t dst_cluster)
3478 {
3479 	return pset_array[src_cluster]->sched_edges[dst_cluster];
3480 }
3481 
3482 /*
3483  * sched_edge_matrix_set()
3484  *
3485  * Routine to update various edges in the cluster edge matrix. The edge_changes_bitmap
3486  * indicates which edges need to be updated. Both the edge_matrix & edge_changes_bitmap
3487  * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
3488  */
3489 void
3490 sched_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changes_bitmap, __unused uint64_t flags, uint64_t matrix_order)
3491 {
3492 	uint32_t edge_index = 0;
3493 	for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
3494 		for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
3495 			if (edge_changes_bitmap[edge_index]) {
3496 				sched_edge_config_set(src_cluster, dst_cluster, edge_matrix[edge_index]);
3497 			}
3498 			edge_index++;
3499 		}
3500 	}
3501 }
3502 
3503 /*
3504  * sched_edge_matrix_get()
3505  *
3506  * Routine to retrieve various edges in the cluster edge matrix. The edge_request_bitmap
3507  * indicates which edges need to be retrieved. Both the edge_matrix & edge_request_bitmap
3508  * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
3509  */
3510 void
3511 sched_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_request_bitmap, __unused uint64_t flags, uint64_t matrix_order)
3512 {
3513 	uint32_t edge_index = 0;
3514 	for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
3515 		for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
3516 			if (edge_request_bitmap[edge_index]) {
3517 				edge_matrix[edge_index] = sched_edge_config_get(src_cluster, dst_cluster);
3518 			}
3519 			edge_index++;
3520 		}
3521 	}
3522 }
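
/*
 * Example (hypothetical caller, e.g. CLPC): to disable migration and stealing
 * along the edge from cluster 0 to cluster 1, only that entry of the flattened
 * matrix needs to be marked in the change bitmap; edge (src, dst) lives at
 * index (src * matrix_order + dst).
 *
 *	sched_clutch_edge edges[MAX_PSETS * MAX_PSETS] = {0};
 *	bool changes[MAX_PSETS * MAX_PSETS] = {false};
 *	uint32_t idx = (0 * MAX_PSETS) + 1;
 *	edges[idx].sce_migration_allowed = 0;
 *	edges[idx].sce_steal_allowed = 0;
 *	edges[idx].sce_migration_weight = 0;
 *	changes[idx] = true;
 *	sched_edge_matrix_set(edges, changes, 0, MAX_PSETS);
 */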
3523 
3524 /*
3525  * sched_edge_init()
3526  *
3527  * Routine to initialize the data structures for the Edge scheduler.
3528  */
3529 static void
3530 sched_edge_init(void)
3531 {
3532 	if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
3533 		sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
3534 	}
3535 	sched_timeshare_init();
3536 	sched_clutch_tunables_init();
3537 	sched_edge_max_clusters = ml_get_cluster_count();
3538 }
3539 
3540 static void
3541 sched_edge_pset_init(processor_set_t pset)
3542 {
3543 	uint32_t pset_cluster_id = pset->pset_cluster_id;
3544 	pset->pset_type = (pset->pset_cluster_type == PSET_AMP_P) ? CLUSTER_TYPE_P : CLUSTER_TYPE_E;
3545 
3546 	/* Set the edge weight and properties for the pset itself */
3547 	bitmap_clear(pset->foreign_psets, pset_cluster_id);
3548 	bitmap_clear(pset->native_psets, pset_cluster_id);
3549 	bitmap_clear(pset->local_psets, pset_cluster_id);
3550 	bitmap_clear(pset->remote_psets, pset_cluster_id);
3551 	pset->sched_edges[pset_cluster_id].sce_edge_packed = (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0}.sce_edge_packed;
3552 	sched_clutch_root_init(&pset->pset_clutch_root, pset);
3553 	bitmap_set(sched_edge_available_pset_bitmask, pset_cluster_id);
3554 }
3555 
3556 static thread_t
3557 sched_edge_choose_thread(
3558 	processor_t      processor,
3559 	int              priority,
3560 	__unused ast_t            reason)
3561 {
3562 	int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
3563 	run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3564 	boolean_t choose_from_boundq = false;
3565 
3566 	if ((bound_runq->highq < priority) &&
3567 	    (clutch_pri < priority)) {
3568 		return THREAD_NULL;
3569 	}
3570 
3571 	if (bound_runq->highq >= clutch_pri) {
3572 		choose_from_boundq = true;
3573 	}
3574 
3575 	thread_t thread = THREAD_NULL;
3576 	if (choose_from_boundq == false) {
3577 		sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3578 		thread = sched_clutch_thread_highest_remove(pset_clutch_root);
3579 	} else {
3580 		thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
3581 	}
3582 	return thread;
3583 }
3584 
3585 static boolean_t
3586 sched_edge_processor_queue_empty(processor_t processor)
3587 {
3588 	return (sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0) &&
3589 	       (sched_clutch_bound_runq(processor)->count == 0);
3590 }
3591 
3592 static void
3593 sched_edge_check_spill(__unused processor_set_t pset, __unused thread_t thread)
3594 {
3595 	assert(thread->bound_processor == PROCESSOR_NULL);
3596 }
3597 
3598 __options_decl(sched_edge_thread_yield_reason_t, uint32_t, {
3599 	SCHED_EDGE_YIELD_RUNQ_NONEMPTY       = 0x0,
3600 	SCHED_EDGE_YIELD_FOREIGN_RUNNABLE    = 0x1,
3601 	SCHED_EDGE_YIELD_FOREIGN_RUNNING     = 0x2,
3602 	SCHED_EDGE_YIELD_STEAL_POSSIBLE      = 0x3,
3603 	SCHED_EDGE_YIELD_DISALLOW            = 0x4,
3604 });
3605 
3606 static bool
3607 sched_edge_thread_should_yield(processor_t processor, __unused thread_t thread)
3608 {
3609 	if (!sched_edge_processor_queue_empty(processor) || (rt_runq_count(processor->processor_set) > 0)) {
3610 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3611 		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_RUNQ_NONEMPTY);
3612 		return true;
3613 	}
3614 
3615 	/*
3616 	 * The yield logic should follow the same logic that steal_thread() does. The
3617 	 * thread_should_yield() is effectively trying to quickly check that if the
3618 	 * current thread gave up CPU, is there any other thread that would execute
3619 	 * on this CPU. So it needs to provide the same answer as the steal_thread()/
3620 	 * processor Idle logic.
3621 	 */
3622 	if (sched_edge_foreign_runnable_thread_available(processor->processor_set)) {
3623 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3624 		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNABLE);
3625 		return true;
3626 	}
3627 	if (sched_edge_foreign_running_thread_available(processor->processor_set)) {
3628 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3629 		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNING);
3630 		return true;
3631 	}
3632 
3633 	processor_set_t steal_candidate = sched_edge_steal_candidate(processor->processor_set);
3634 	if (steal_candidate != NULL) {
3635 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3636 		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_STEAL_POSSIBLE);
3637 		return true;
3638 	}
3639 
3640 	KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE, thread_tid(thread), processor->processor_set->pset_cluster_id,
3641 	    0, SCHED_EDGE_YIELD_DISALLOW);
3642 	return false;
3643 }
3644 
3645 static ast_t
3646 sched_edge_processor_csw_check(processor_t processor)
3647 {
3648 	boolean_t       has_higher;
3649 	int             pri;
3650 
3651 	int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
3652 	run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3653 
3654 	assert(processor->active_thread != NULL);
3655 
3656 	pri = MAX(clutch_pri, bound_runq->highq);
3657 
3658 	if (processor->first_timeslice) {
3659 		has_higher = (pri > processor->current_pri);
3660 	} else {
3661 		has_higher = (pri >= processor->current_pri);
3662 	}
3663 
3664 	if (has_higher) {
3665 		if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
3666 			return AST_PREEMPT | AST_URGENT;
3667 		}
3668 
3669 		if (bound_runq->urgency > 0) {
3670 			return AST_PREEMPT | AST_URGENT;
3671 		}
3672 
3673 		return AST_PREEMPT;
3674 	}
3675 
3676 	return AST_NONE;
3677 }
3678 
3679 static boolean_t
3680 sched_edge_processor_queue_has_priority(processor_t    processor,
3681     int            priority,
3682     boolean_t      gte)
3683 {
3684 	run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3685 
3686 	int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);
3687 	if (gte) {
3688 		return qpri >= priority;
3689 	} else {
3690 		return qpri > priority;
3691 	}
3692 }
3693 
3694 static void
3695 sched_edge_processor_queue_shutdown(processor_t processor)
3696 {
3697 	processor_set_t pset = processor->processor_set;
3698 	sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3699 	thread_t        thread;
3700 	queue_head_t    tqueue;
3701 
3702 	/* We only need to migrate threads if this is the last active or last recommended processor in the pset */
3703 	if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
3704 		pset_unlock(pset);
3705 		return;
3706 	}
3707 
3708 	bitmap_clear(sched_edge_available_pset_bitmask, pset->pset_cluster_id);
3709 
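	/*
	 * Drain the entire clutch hierarchy into a local queue; once the pset lock
	 * is dropped, thread_setrun() below re-routes each drained thread, and since
	 * this pset has been cleared from sched_edge_available_pset_bitmask it will
	 * not be picked again.
	 */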
3710 	queue_init(&tqueue);
3711 	while (sched_clutch_root_count(pset_clutch_root) > 0) {
3712 		thread = sched_clutch_thread_highest_remove(pset_clutch_root);
3713 		enqueue_tail(&tqueue, &thread->runq_links);
3714 	}
3715 	pset_unlock(pset);
3716 
3717 	qe_foreach_element_safe(thread, &tqueue, runq_links) {
3718 		remqueue(&thread->runq_links);
3719 		thread_lock(thread);
3720 		thread_setrun(thread, SCHED_TAILQ);
3721 		thread_unlock(thread);
3722 	}
3723 }
3724 
3725 /*
3726  * sched_edge_cluster_load_metric()
3727  *
3728  * The load metric for a cluster is a measure of the average scheduling latency
3729  * experienced by threads on that cluster. It is a product of the average number
3730  * of threads in the runqueue and the average execution time for threads. The metric
3731  * has special values in the following cases:
3732  * - UINT32_MAX: If the cluster is not available for scheduling, its load is set to
3733  *   the maximum value to prevent any threads from migrating to this cluster.
3734  * - 0: If there are idle CPUs in the cluster or the runqueue is empty; this allows threads
3735  *   to be spread across the platform quickly for ncpu-wide workloads.
3736  */
3737 static uint32_t
3738 sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket)
3739 {
3740 	if (sched_edge_pset_available(pset) == false) {
3741 		return UINT32_MAX;
3742 	}
3743 	return (uint32_t)sched_get_pset_load_average(pset, sched_bucket);
3744 }
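
/*
 * Illustrative sketch (not part of the scheduler): a hypothetical caller
 * comparing two clusters p0 and p1 for the default QoS bucket could do:
 *
 *	uint32_t load0 = sched_edge_cluster_load_metric(p0, TH_BUCKET_SHARE_DF);
 *	uint32_t load1 = sched_edge_cluster_load_metric(p1, TH_BUCKET_SHARE_DF);
 *	processor_set_t target = (load0 <= load1) ? p0 : p1;
 *
 * A return value of 0 marks an idle/empty cluster (immediately attractive for
 * placement) while UINT32_MAX marks an unavailable cluster (never selected).
 */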
3745 
3746 /*
3747  *
3748  * Edge Scheduler Steal/Rebalance logic
3749  *
3750  * = Generic scheduler logic =
3751  *
3752  * The SCHED(steal_thread) scheduler callout is invoked when the processor does not
3753  * find any thread for execution in its runqueue. The aim of the steal operation
3754  * is to find other threads running/runnable in other clusters which should be
3755  * executed here.
3756  *
3757  * If the steal callout does not return a thread, the thread_select() logic calls
3758  * SCHED(processor_balance) callout which is supposed to IPI other CPUs to rebalance
3759  * threads and idle out the current CPU.
3760  *
3761  * = SCHED(steal_thread) for Edge Scheduler =
3762  *
3763  * The edge scheduler hooks into sched_edge_processor_idle() for steal_thread. This
3764  * routine tries to do the following operations in order:
3765  * (1) Find foreign runnable threads in non-native cluster
3766  *     runqueues (sched_edge_foreign_runnable_thread_remove())
3767  * (2) Check if foreign threads are running on the non-native
3768  *     clusters (sched_edge_foreign_running_thread_available())
3769  *         - If yes, return THREAD_NULL for the steal callout and
3770  *         perform rebalancing as part of SCHED(processor_balance) i.e. sched_edge_balance()
3771  * (3) Steal a thread from another cluster based on edge
3772  *     weights (sched_edge_steal_thread())
3773  *
3774  * = SCHED(processor_balance) for Edge Scheduler =
3775  *
3776  * If steal_thread did not return a thread for the processor, use
3777  * sched_edge_balance() to rebalance foreign running threads and idle out this CPU.
3778  *
3779  * = Clutch Bucket Preferred Cluster Overrides =
3780  *
3781  * Since these operations (just like thread migrations on enqueue)
3782  * move threads across clusters, they need support for handling clutch
3783  * bucket group level preferred cluster recommendations.
3784  * For (1), a clutch bucket will be in the foreign runnable queue based
3785  * on the clutch bucket group preferred cluster.
3786  * For (2), the running thread will set the bit on the processor based
3787  * on its preferred cluster type.
3788  * For (3), the edge configuration would prevent threads from being stolen
3789  * in the wrong direction.
3790  *
3791  * = SCHED(thread_should_yield) =
3792  * The thread_should_yield() logic needs to have the same logic as sched_edge_processor_idle()
3793  * since that is expecting the same answer as if thread_select() was called on a core
3794  * with an empty runqueue.
3795  */
3796 
3797 static bool
3798 sched_edge_steal_thread_enabled(__unused processor_set_t pset)
3799 {
3800 	/*
3801 	 * For edge scheduler, the gating for steal is being done by sched_edge_steal_candidate()
3802 	 */
3803 	return true;
3804 }
3805 
3806 static processor_set_t
3807 sched_edge_steal_candidate(processor_set_t pset)
3808 {
3809 	uint32_t dst_cluster_id = pset->pset_cluster_id;
3810 	for (int cluster_id = 0; cluster_id < sched_edge_max_clusters; cluster_id++) {
3811 		processor_set_t candidate_pset = pset_array[cluster_id];
3812 		if (cluster_id == dst_cluster_id) {
3813 			continue;
3814 		}
3815 		if (candidate_pset == NULL) {
3816 			continue;
3817 		}
3818 		sched_clutch_edge *incoming_edge = &pset_array[cluster_id]->sched_edges[dst_cluster_id];
3819 		if (incoming_edge->sce_steal_allowed && (bitmap_lsb_first(candidate_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) != -1)) {
3820 			return candidate_pset;
3821 		}
3822 	}
3823 	return NULL;
3824 }
3825 
3826 static boolean_t
3827 sched_edge_foreign_runnable_thread_available(processor_set_t pset)
3828 {
3829 	/* Find all the clusters that are foreign for this cluster */
3830 	bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
3831 	for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
3832 		/*
3833 		 * For each cluster, see if there are any runnable foreign threads.
3834 		 * This check is currently being done without the pset lock to make it cheap for
3835 		 * the common case.
3836 		 */
3837 		processor_set_t target_pset = pset_array[cluster];
3838 		if (sched_edge_pset_available(target_pset) == false) {
3839 			continue;
3840 		}
3841 
3842 		if (!sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
3843 			return true;
3844 		}
3845 	}
3846 	return false;
3847 }
3848 
3849 static thread_t
3850 sched_edge_foreign_runnable_thread_remove(processor_set_t pset, uint64_t ctime)
3851 {
3852 	thread_t thread = THREAD_NULL;
3853 
3854 	/* Find all the clusters that are foreign for this cluster */
3855 	bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
3856 	for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
3857 		/*
3858 		 * For each cluster, see if there are any runnable foreign threads.
3859 		 * This check is currently being done without the pset lock to make it cheap for
3860 		 * the common case.
3861 		 */
3862 		processor_set_t target_pset = pset_array[cluster];
3863 		if (sched_edge_pset_available(target_pset) == false) {
3864 			continue;
3865 		}
3866 
3867 		if (sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
3868 			continue;
3869 		}
3870 		/*
3871 		 * Looks like there are runnable foreign threads in the hierarchy; lock the pset
3872 		 * and get the highest priority thread.
3873 		 */
3874 		pset_lock(target_pset);
3875 		if (sched_edge_pset_available(target_pset)) {
3876 			thread = sched_clutch_root_highest_foreign_thread_remove(&target_pset->pset_clutch_root);
3877 			sched_update_pset_load_average(target_pset, ctime);
3878 		}
3879 		pset_unlock(target_pset);
3880 
3881 		/*
3882 		 * Edge Scheduler Optimization
3883 		 *
3884 		 * The current implementation immediately returns as soon as it finds a foreign
3885 		 * runnable thread. This could be enhanced to look at highest priority threads
3886 		 * from all foreign clusters and pick the highest amongst them. That would need
3887 		 * some form of global state across psets to make that kind of a check cheap.
3888 		 */
3889 		if (thread != THREAD_NULL) {
3890 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNABLE) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, target_pset->pset_cluster_id, 0);
3891 			break;
3892 		}
3893 		/* Looks like the thread escaped after the check but before the pset lock was taken; continue the search */
3894 	}
3895 
3896 	return thread;
3897 }
3898 
3899 /*
3900  * sched_edge_cpu_running_foreign_shared_rsrc_available()
3901  *
3902  * Routine to determine if the thread running on a CPU is a shared resource thread
3903  * and can be rebalanced to the cluster with an idle CPU. It is used to determine if
3904  * a CPU going idle on a pset should rebalance a running shared resource heavy thread
3905  * from another non-ideal cluster, based on the idle pset's shared resource load.
3906  */
3907 static boolean_t
3908 sched_edge_cpu_running_foreign_shared_rsrc_available(processor_set_t target_pset, int foreign_cpu, processor_set_t idle_pset)
3909 {
3910 	boolean_t idle_pset_shared_rsrc_rr_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_RR);
3911 	if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_RR], foreign_cpu) && !idle_pset_shared_rsrc_rr_idle) {
3912 		return false;
3913 	}
3914 
3915 	boolean_t idle_pset_shared_rsrc_biu_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST);
3916 	if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST], foreign_cpu) && !idle_pset_shared_rsrc_biu_idle) {
3917 		return false;
3918 	}
3919 	return true;
3920 }
3921 
3922 static boolean_t
3923 sched_edge_foreign_running_thread_available(processor_set_t pset)
3924 {
3925 	bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
3926 	int cluster = -1;
3927 	while ((cluster = sched_edge_iterate_clusters_ordered(pset, foreign_pset_bitmap[0], cluster)) != -1) {
3928 		/* Skip the pset if it's not schedulable */
3929 		processor_set_t target_pset = pset_array[cluster];
3930 		if (sched_edge_pset_available(target_pset) == false) {
3931 			continue;
3932 		}
3933 
3934 		uint64_t running_foreign_bitmap = target_pset->cpu_state_map[PROCESSOR_RUNNING] & target_pset->cpu_running_foreign;
3935 		for (int cpu_foreign = bit_first(running_foreign_bitmap); cpu_foreign >= 0; cpu_foreign = bit_next(running_foreign_bitmap, cpu_foreign)) {
3936 			if (!sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpu_foreign, pset)) {
3937 				continue;
3938 			}
3939 			return true;
3940 		}
3941 	}
3942 	return false;
3943 }
3944 
3945 static bool
3946 sched_edge_steal_possible(processor_set_t idle_pset, processor_set_t candidate_pset)
3947 {
3948 	int highest_runnable_bucket = bitmap_lsb_first(candidate_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
3949 	if (highest_runnable_bucket == -1) {
3950 		/* Candidate cluster runq is empty */
3951 		return false;
3952 	}
3953 
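	/*
	 * For a heterogeneous candidate, only steal when its load metric for the
	 * highest runnable bucket is non-zero, i.e. the candidate has no idle CPUs
	 * and a non-empty runqueue at that QoS, so pulling a thread actually helps.
	 */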
3954 	if (idle_pset->pset_cluster_type == candidate_pset->pset_cluster_type) {
3955 		/* Always allow stealing from homogeneous clusters */
3956 		return true;
3957 	} else {
3958 		/* Use the load metrics for highest runnable bucket since that would be stolen next */
3959 		int32_t candidate_load = sched_edge_cluster_load_metric(candidate_pset, (sched_bucket_t)highest_runnable_bucket);
3960 		return candidate_load > 0;
3961 	}
3962 }
3963 
3964 static thread_t
3965 sched_edge_steal_thread(processor_set_t pset, uint64_t candidate_pset_bitmap)
3966 {
3967 	thread_t thread = THREAD_NULL;
3968 
3969 	/*
3970 	 * Edge Scheduler Optimization
3971 	 *
3972 	 * The logic today bails as soon as it finds a cluster where the cluster load is
3973 	 * greater than the edge weight. Maybe it should have a more advanced version
3974 	 * which looks for the maximum delta etc.
3975 	 */
3976 	int cluster_id = -1;
3977 	while ((cluster_id = sched_edge_iterate_clusters_ordered(pset, candidate_pset_bitmap, cluster_id)) != -1) {
3978 		processor_set_t steal_from_pset = pset_array[cluster_id];
3979 		if (steal_from_pset == NULL) {
3980 			continue;
3981 		}
3982 		sched_clutch_edge *incoming_edge = &pset_array[cluster_id]->sched_edges[pset->pset_cluster_id];
3983 		if (incoming_edge->sce_steal_allowed == false) {
3984 			continue;
3985 		}
3986 		pset_lock(steal_from_pset);
3987 		if (sched_edge_steal_possible(pset, steal_from_pset)) {
3988 			uint64_t current_timestamp = mach_absolute_time();
3989 			sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(&steal_from_pset->pset_clutch_root, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY);
3990 			thread = sched_clutch_thread_unbound_lookup(&steal_from_pset->pset_clutch_root, root_bucket);
3991 			sched_clutch_thread_remove(&steal_from_pset->pset_clutch_root, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
3992 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STEAL) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, steal_from_pset->pset_cluster_id, 0);
3993 			sched_update_pset_load_average(steal_from_pset, current_timestamp);
3994 		}
3995 		pset_unlock(steal_from_pset);
3996 		if (thread != THREAD_NULL) {
3997 			break;
3998 		}
3999 	}
4000 	return thread;
4001 }
4002 
4003 /*
4004  * sched_edge_processor_idle()
4005  *
4006  * The routine is the implementation for steal_thread() for the Edge scheduler.
4007  */
4008 static thread_t
4009 sched_edge_processor_idle(processor_set_t pset)
4010 {
4011 	thread_t thread = THREAD_NULL;
4012 
4013 	uint64_t ctime = mach_absolute_time();
4014 
4015 	processor_t processor = current_processor();
4016 	bit_clear(pset->pending_spill_cpu_mask, processor->cpu_id);
4017 
4018 	/* Each of the operations acquire the lock for the pset they target */
4019 	pset_unlock(pset);
4020 
4021 	/* Find highest priority runnable thread on all non-native clusters */
4022 	thread = sched_edge_foreign_runnable_thread_remove(pset, ctime);
4023 	if (thread != THREAD_NULL) {
4024 		return thread;
4025 	}
4026 
4027 	/* Find highest priority runnable thread on all native clusters */
4028 	thread = sched_edge_steal_thread(pset, pset->native_psets[0]);
4029 	if (thread != THREAD_NULL) {
4030 		return thread;
4031 	}
4032 
4033 	/* Find foreign running threads to rebalance; the actual rebalance is done in sched_edge_balance() */
4034 	boolean_t rebalance_needed = sched_edge_foreign_running_thread_available(pset);
4035 	if (rebalance_needed) {
4036 		return THREAD_NULL;
4037 	}
4038 
4039 	/* No foreign threads found; find a thread to steal from all clusters based on weights/loads etc. */
4040 	thread = sched_edge_steal_thread(pset, pset->native_psets[0] | pset->foreign_psets[0]);
4041 	return thread;
4042 }
4043 
4044 /* Return true if this shared resource thread has a better cluster to run on */
4045 static bool
4046 sched_edge_shared_rsrc_migrate_possible(thread_t thread, processor_set_t preferred_pset, processor_set_t current_pset)
4047 {
4048 	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4049 	uint64_t current_pset_load = sched_pset_cluster_shared_rsrc_load(current_pset, shared_rsrc_type);
4050 	/*
4051 	 * Adjust the current pset load to discount the current thread only if the current pset is of the preferred pset type. This allows the
4052 	 * scheduler to rebalance threads from a non-preferred cluster to an idle cluster of the preferred type.
4053 	 *
4054 	 * Edge Scheduler Optimization
4055 	 * For multi-cluster machines, it might be useful to enhance this mechanism to migrate between clusters of the preferred type.
4056 	 */
4057 	uint64_t current_pset_adjusted_load = (current_pset->pset_type != preferred_pset->pset_type) ? current_pset_load : (current_pset_load - 1);
4058 
4059 	uint64_t eligible_pset_bitmask = 0;
4060 	if (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST) {
4061 		/*
4062 		 * For the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy, the load balancing occurs
4063 		 * only among clusters that are native to the preferred cluster.
4064 		 */
4065 		eligible_pset_bitmask = preferred_pset->native_psets[0];
4066 		bit_set(eligible_pset_bitmask, preferred_pset->pset_cluster_id);
4067 	} else {
4068 		/* For EDGE_SHARED_RSRC_SCHED_POLICY_RR, the load balancing happens among all clusters */
4069 		eligible_pset_bitmask = sched_edge_available_pset_bitmask[0];
4070 	}
4071 
4072 	/* For each eligible cluster check if there is an under-utilized cluster; return true if there is */
4073 	for (int cluster_id = bit_first(eligible_pset_bitmask); cluster_id >= 0; cluster_id = bit_next(eligible_pset_bitmask, cluster_id)) {
4074 		if (cluster_id == current_pset->pset_cluster_id) {
4075 			continue;
4076 		}
4077 		uint64_t cluster_load = sched_pset_cluster_shared_rsrc_load(pset_array[cluster_id], shared_rsrc_type);
4078 		if (current_pset_adjusted_load > cluster_load) {
4079 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHARED_RSRC_MIGRATE) | DBG_FUNC_NONE, current_pset_load, current_pset->pset_cluster_id, cluster_load, cluster_id);
4080 			return true;
4081 		}
4082 	}
4083 	return false;
4084 }
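
/*
 * Worked example (hypothetical loads): if the current pset is of the preferred
 * type and carries 3 threads of this shared resource type, its adjusted load is
 * 2 after discounting this thread. An eligible peer cluster with a shared
 * resource load of 1 then satisfies (2 > 1) and the routine returns true,
 * letting the thread drift toward the less loaded cluster.
 */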
4085 
4086 /* Return true if this thread should not continue running on this processor */
4087 static bool
4088 sched_edge_thread_avoid_processor(processor_t processor, thread_t thread)
4089 {
4090 	processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
4091 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
4092 		return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
4093 	}
4094 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
4095 		if (processor->processor_set->pset_type != preferred_pset->pset_type) {
4096 			return true;
4097 		}
4098 		return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
4099 	}
4100 
4101 	/*
4102 	 * For long running parallel workloads, it is important to rebalance threads across
4103 	 * E/P clusters so that they make equal forward progress. This is achieved through
4104 	 * threads expiring their quantum on the non-preferred cluster type and explicitly
4105 	 * rebalancing to the preferred cluster runqueue.
4106 	 */
4107 	if (processor->processor_set->pset_type != preferred_pset->pset_type) {
4108 		return true;
4109 	}
4110 	/* If the preferred pset for the thread is now idle, try and migrate thread to that cluster */
4111 	if ((processor->processor_set != preferred_pset) &&
4112 	    (sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket) == 0)) {
4113 		return true;
4114 	}
4115 	return false;
4116 }
4117 
4118 static void
4119 sched_edge_balance(__unused processor_t cprocessor, processor_set_t cpset)
4120 {
4121 	assert(cprocessor == current_processor());
4122 	pset_unlock(cpset);
4123 
4124 	uint64_t ast_processor_map = 0;
4125 	sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
4126 
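	/*
	 * Record the required rebalance IPI for each foreign-running CPU while
	 * holding its pset lock, and only issue the IPIs after all pset locks
	 * have been dropped.
	 */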
4127 	bitmap_t *foreign_pset_bitmap = cpset->foreign_psets;
4128 	for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
4129 		/* Skip the pset if it's not schedulable */
4130 		processor_set_t target_pset = pset_array[cluster];
4131 		if (sched_edge_pset_available(target_pset) == false) {
4132 			continue;
4133 		}
4134 
4135 		pset_lock(target_pset);
4136 		uint64_t cpu_running_foreign_map = (target_pset->cpu_running_foreign & target_pset->cpu_state_map[PROCESSOR_RUNNING]);
4137 		for (int cpuid = lsb_first(cpu_running_foreign_map); cpuid >= 0; cpuid = lsb_next(cpu_running_foreign_map, cpuid)) {
4138 			if (!sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpuid, cpset)) {
4139 				continue;
4140 			}
4141 			processor_t target_cpu = processor_array[cpuid];
4142 			ipi_type[target_cpu->cpu_id] = sched_ipi_action(target_cpu, NULL, SCHED_IPI_EVENT_REBALANCE);
4143 			if (ipi_type[cpuid] != SCHED_IPI_NONE) {
4144 				bit_set(ast_processor_map, cpuid);
4145 			}
4146 		}
4147 		pset_unlock(target_pset);
4148 	}
4149 
4150 	for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
4151 		processor_t ast_processor = processor_array[cpuid];
4152 		sched_ipi_perform(ast_processor, ipi_type[cpuid]);
4153 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNING) | DBG_FUNC_NONE, 0, cprocessor->cpu_id, cpuid, 0);
4154 	}
4155 }
4156 
4157 /*
4158  * sched_edge_migration_check()
4159  *
4160  * Routine to evaluate an edge between two clusters to decide if migration is possible
4161  * across that edge. Also updates the selected_pset and max_edge_delta out parameters
4162  * accordingly. The return value indicates if the invoking routine should short circuit
4163  * the search, since an ideal candidate has been found. The routine looks at the regular
4164  * edges and cluster loads or the shared resource loads based on the type of thread.
4165  */
4166 static bool
4167 sched_edge_migration_check(uint32_t cluster_id, processor_set_t preferred_pset,
4168     uint32_t preferred_cluster_load, thread_t thread, processor_set_t *selected_pset, uint32_t *max_edge_delta)
4169 {
4170 	uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
4171 	cluster_type_t preferred_cluster_type = pset_type_for_id(preferred_cluster_id);
4172 	processor_set_t dst_pset = pset_array[cluster_id];
4173 	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4174 	bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
4175 
4176 	if (cluster_id == preferred_cluster_id) {
4177 		return false;
4178 	}
4179 
4180 	if (dst_pset == NULL) {
4181 		return false;
4182 	}
4183 
4184 	sched_clutch_edge *edge = preferred_pset->sched_edges;
4185 	if (edge[cluster_id].sce_migration_allowed == false) {
4186 		return false;
4187 	}
4188 	uint32_t dst_load = shared_rsrc_thread ? (uint32_t)sched_pset_cluster_shared_rsrc_load(dst_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(dst_pset, thread->th_sched_bucket);
4189 	if (dst_load == 0) {
4190 		/* The candidate cluster is idle; select it immediately for execution */
4191 		*selected_pset = dst_pset;
4192 		*max_edge_delta = preferred_cluster_load;
4193 		return true;
4194 	}
4195 
4196 	uint32_t edge_delta = 0;
4197 	if (dst_load > preferred_cluster_load) {
4198 		return false;
4199 	}
4200 	edge_delta = preferred_cluster_load - dst_load;
4201 	if (!shared_rsrc_thread && (edge_delta < edge[cluster_id].sce_migration_weight)) {
4202 		/*
4203 		 * For non shared resource threads, use the edge migration weight to decide if
4204 		 * this cluster is over-committed at the QoS level of this thread.
4205 		 */
4206 		return false;
4207 	}
4208 
4209 	if (edge_delta < *max_edge_delta) {
4210 		return false;
4211 	}
4212 	if (edge_delta == *max_edge_delta) {
4213 		/* If the edge delta is the same as the max delta, make sure a homogeneous cluster is picked */
4214 		boolean_t selected_homogeneous = (pset_type_for_id((*selected_pset)->pset_cluster_id) == preferred_cluster_type);
4215 		boolean_t candidate_homogeneous = (pset_type_for_id(dst_pset->pset_cluster_id) == preferred_cluster_type);
4216 		if (selected_homogeneous || !candidate_homogeneous) {
4217 			return false;
4218 		}
4219 	}
4220 	/* dst_pset seems to be the best candidate for migration; however other candidates should still be evaluated */
4221 	*max_edge_delta = edge_delta;
4222 	*selected_pset = dst_pset;
4223 	return false;
4224 }
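
/*
 * Worked example (hypothetical numbers): with a preferred cluster load of 100,
 * a candidate load of 30 and an edge migration weight of 64, the delta is 70,
 * which clears the weight, so the candidate becomes the current best. With a
 * candidate load of 60 the delta is only 40 and a non shared resource thread
 * stays put. A candidate load of 0 short-circuits the search entirely.
 */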
4225 
4226 /*
4227  * sched_edge_iterate_clusters_ordered()
4228  *
4229  * Routine to iterate clusters in die-local order. For multi-die machines,
4230  * the routine ensures that the candidate clusters on the same die as the
4231  * passed in pset are returned before the remote die clusters. This should
4232  * be used in all places where cluster selection in die order matters.
4233  */
4234 
4235 static int
4236 sched_edge_iterate_clusters_ordered(processor_set_t starting_pset, uint64_t candidate_map, int previous_cluster)
4237 {
4238 	int cluster_id = -1;
4239 
4240 	uint64_t local_candidate_map = starting_pset->local_psets[0] & candidate_map;
4241 	uint64_t remote_candidate_map = starting_pset->remote_psets[0] & candidate_map;
4242 
4243 	if (previous_cluster == -1) {
4244 		/* previous_cluster == -1 indicates the initial condition */
4245 		cluster_id = bit_first(local_candidate_map);
4246 		if (cluster_id != -1) {
4247 			return cluster_id;
4248 		}
4249 		return bit_first(remote_candidate_map);
4250 	} else {
4251 		/*
4252 		 * After the initial condition, the routine attempts to return a
4253 		 * cluster in the previous_cluster's locality. If none is available,
4254 		 * it looks at remote clusters.
4255 		 */
4256 		if (bit_test(local_candidate_map, previous_cluster)) {
4257 			cluster_id = bit_next(local_candidate_map, previous_cluster);
4258 			if (cluster_id != -1) {
4259 				return cluster_id;
4260 			} else {
4261 				return bit_first(remote_candidate_map);
4262 			}
4263 		}
4264 		return bit_next(remote_candidate_map, previous_cluster);
4265 	}
4266 }
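
/*
 * Worked example (hypothetical topology): on a two die system where the
 * starting pset's local clusters are {0, 1} and its remote clusters are {2, 3},
 * iterating with a candidate_map covering {1, 2, 3} always returns cluster 1
 * before clusters 2 and 3; die-local candidates are exhausted before any remote
 * cluster is considered.
 */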
4267 
4268 /*
4269  * sched_edge_migrate_edges_evaluate()
4270  *
4271  * Routine to find the candidate for thread migration based on edge weights.
4272  *
4273  * Returns the most ideal cluster for execution of this thread based on outgoing edges of the preferred pset. Can
4274  * return preferred_pset if its the most ideal destination for this thread.
4275  */
4276 static processor_set_t
4277 sched_edge_migrate_edges_evaluate(processor_set_t preferred_pset, uint32_t preferred_cluster_load, thread_t thread)
4278 {
4279 	processor_set_t selected_pset = preferred_pset;
4280 	uint32_t max_edge_delta = 0;
4281 	bool search_complete = false;
4282 	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4283 	bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
4284 
4285 	bitmap_t *foreign_pset_bitmap = preferred_pset->foreign_psets;
4286 	bitmap_t *native_pset_bitmap = preferred_pset->native_psets;
4287 	/* Always start the search with the native clusters */
4288 	int cluster_id = -1;
4289 	while ((cluster_id = sched_edge_iterate_clusters_ordered(preferred_pset, native_pset_bitmap[0], cluster_id)) != -1) {
4290 		search_complete = sched_edge_migration_check(cluster_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
4291 		if (search_complete) {
4292 			break;
4293 		}
4294 	}
4295 
4296 	if (search_complete) {
4297 		return selected_pset;
4298 	}
4299 
4300 	if (shared_rsrc_thread && (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST)) {
4301 		/*
4302 		 * If the shared resource scheduling policy is EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST, the scheduler tries
4303 		 * to fill up the preferred cluster and its homogeneous peers first.
4304 		 */
4305 
4306 		if (max_edge_delta > 0) {
4307 			/*
4308 			 * This means that there is a peer cluster of the same type as the preferred cluster (since the code
4309 			 * above only looks at the native_psets) which has fewer threads of the shared resource type than the
4310 			 * preferred cluster. This indicates that there is capacity on a native cluster where this thread
4311 			 * should be placed.
4312 			 */
4313 			return selected_pset;
4314 		}
4315 		/*
4316 		 * Indicates that all peer native clusters are at the same shared resource usage; check if the preferred cluster has
4317 		 * any more capacity left.
4318 		 */
4319 		if (sched_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) < pset_available_cpu_count(preferred_pset)) {
4320 			return preferred_pset;
4321 		}
4322 		/*
4323 		 * Looks like the preferred cluster and all its native peers are full with shared resource threads; need to start looking
4324 		 * at non-native clusters for capacity.
4325 		 */
4326 	}
4327 
4328 	/* Now look at the non-native clusters */
4329 	cluster_id = -1;
4330 	while ((cluster_id = sched_edge_iterate_clusters_ordered(preferred_pset, foreign_pset_bitmap[0], cluster_id)) != -1) {
4331 		search_complete = sched_edge_migration_check(cluster_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
4332 		if (search_complete) {
4333 			break;
4334 		}
4335 	}
4336 	return selected_pset;
4337 }
4338 
4339 /*
4340  * sched_edge_candidate_alternative()
4341  *
4342  * Routine to find an alternative cluster from candidate_cluster_bitmap since the
4343  * selected_pset is not available for execution. The logic tries to prefer homogeneous
4344  * clusters over heterogeneous clusters since this is typically used in thread
4345  * placement decisions.
4346  */
4347 _Static_assert(MAX_PSETS <= 64, "Unable to fit maximum number of psets in uint64_t bitmask");
4348 static processor_set_t
4349 sched_edge_candidate_alternative(processor_set_t selected_pset, uint64_t candidate_cluster_bitmap)
4350 {
4351 	/*
4352 	 * It looks like the most ideal pset is not available for scheduling currently.
4353 	 * Try to find a homogeneous cluster that is still available.
4354 	 */
4355 	uint64_t available_native_clusters = selected_pset->native_psets[0] & candidate_cluster_bitmap;
4356 	int available_cluster_id = lsb_first(available_native_clusters);
4357 	if (available_cluster_id == -1) {
4358 		/* Looks like none of the homogeneous clusters are available; pick the first available cluster */
4359 		available_cluster_id = bit_first(candidate_cluster_bitmap);
4360 	}
4361 	assert(available_cluster_id != -1);
4362 	return pset_array[available_cluster_id];
4363 }
4364 
4365 /*
4366  * sched_edge_switch_pset_lock()
4367  *
4368  * Helper routine for sched_edge_migrate_candidate() which switches pset locks (if needed) based on
4369  * switch_pset_locks.
4370  * Returns the newly locked pset after the switch.
4371  */
4372 static processor_set_t
4373 sched_edge_switch_pset_lock(processor_set_t selected_pset, processor_set_t locked_pset, bool switch_pset_locks)
4374 {
4375 	if (!switch_pset_locks) {
4376 		return locked_pset;
4377 	}
4378 	if (selected_pset != locked_pset) {
4379 		pset_unlock(locked_pset);
4380 		pset_lock(selected_pset);
4381 		return selected_pset;
4382 	} else {
4383 		return locked_pset;
4384 	}
4385 }
4386 
4387 /*
4388  * sched_edge_amp_rebalance_pset()
4389  *
4390  * Routine to decide where a thread which is eligible for AMP rebalance (i.e.
4391  * has executed on non-preferred cluster type for a while) should be enqueued.
4392  * The algorithm maintains a history of AMP rebalance decisions on the clutch
4393  * bucket group of the workload and round-robins between clusters to ensure
4394  * that all threads get a chance on the performance cores and make equal
4395  * progress.
4396  */
4397 static processor_set_t
4398 sched_edge_amp_rebalance_pset(processor_set_t preferred_pset, thread_t thread)
4399 {
4400 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
4401 	sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[thread->th_sched_bucket];
4402 
4403 	uint32_t last_chosen_cluster, new_chosen_cluster;
4404 
4405 	/* Only AMP rebalance within clusters native to the preferred cluster */
4406 	uint64_t eligible_pset_bitmask = preferred_pset->native_psets[0];
4407 	/* Preferred cluster is also eligible for rebalancing */
4408 	bit_set(eligible_pset_bitmask, preferred_pset->pset_cluster_id);
4409 	/* Atomically update the AMP rebalance cluster for the clutch bucket group */
4410 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_amp_rebalance_last_chosen, last_chosen_cluster, new_chosen_cluster, relaxed, {
4411 		if (last_chosen_cluster == UINT32_MAX) {
4412 		        new_chosen_cluster = preferred_pset->pset_cluster_id;
4413 		} else {
4414 		        new_chosen_cluster = lsb_next(eligible_pset_bitmask, last_chosen_cluster);
4415 		        if (new_chosen_cluster == -1) {
4416 		                /* Rotate to the start of the eligible bitmask */
4417 		                new_chosen_cluster = lsb_first(eligible_pset_bitmask);
4418 			}
4419 		}
4420 	});
4421 	return pset_array[new_chosen_cluster];
4422 }
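
/*
 * Illustrative example (hypothetical cluster ids): if the preferred cluster is
 * 2 and its eligible native peers are {0, 2}, the first rebalance for a clutch
 * bucket group (last chosen == UINT32_MAX) picks cluster 2; subsequent
 * rebalances rotate through the eligible bitmask and wrap back to its first set
 * bit, so successive quantum expirations alternate between the two clusters.
 */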
4423 
4424 /*
4425  * sched_edge_migrate_candidate()
4426  *
4427  * Routine to find an appropriate cluster for scheduling a thread. The routine looks at the properties of
4428  * the thread and the preferred cluster to determine the best available pset for scheduling.
4429  *
4430  * The switch_pset_locks parameter defines whether the routine should switch pset locks to provide an
4431  * accurate scheduling decision. This mode is typically used when choosing a pset for scheduling a thread since the
4432  * decision has to be synchronized with another CPU changing the recommendation of clusters available
4433  * on the system. If this parameter is set to false, this routine returns the best effort indication of
4434  * the cluster the thread should be scheduled on. It is typically used in fast path contexts (such as
4435  * SCHED(thread_avoid_processor)) to determine if there is a possibility of scheduling this thread on a
4436  * more appropriate cluster.
4437  *
4438  * Routine returns the most ideal cluster for scheduling. If switch_pset_locks is set, it ensures that the
4439  * resultant pset lock is held.
4440  */
4441 static processor_set_t
4442 sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks)
4443 {
4444 	__kdebug_only uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
4445 	processor_set_t selected_pset = preferred_pset;
4446 	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4447 	bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
4448 
4449 	if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
4450 		/* For bound threads always recommend the cluster its bound to */
4451 		selected_pset = pset_array[sched_edge_thread_bound_cluster_id(thread)];
4452 		locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
4453 		if (sched_edge_pset_available(selected_pset) || (SCHED_CLUTCH_THREAD_CLUSTER_BOUND_SOFT(thread) == false)) {
4454 			/*
4455 			 * If the bound cluster is not available, check if the thread is soft bound. For soft bound threads,
4456 			 * fall through to the regular cluster selection logic which handles unavailable clusters
4457 			 * appropriately. If the thread is hard bound, then return the bound cluster always.
4458 			 */
4459 			return selected_pset;
4460 		}
4461 	}
4462 
4463 	uint64_t candidate_cluster_bitmap = mask(sched_edge_max_clusters);
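	/*
	 * Start with every cluster as a potential candidate; as the availability
	 * check below finds clusters that cannot be used, their bits are cleared
	 * and an alternative is picked from whatever remains.
	 */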
4464 #if DEVELOPMENT || DEBUG
4465 	extern int enable_task_set_cluster_type;
4466 	task_t task = get_threadtask(thread);
4467 	if (enable_task_set_cluster_type && (task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) {
4468 		processor_set_t pset_hint = task->pset_hint;
4469 		if (pset_hint && selected_pset->pset_cluster_type != pset_hint->pset_cluster_type) {
4470 			selected_pset = pset_hint;
4471 			goto migrate_candidate_available_check;
4472 		}
4473 	}
4474 #endif
4475 
4476 	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
4477 		/* For realtime threads, try and schedule them on the preferred pset always */
4478 		goto migrate_candidate_available_check;
4479 	}
4480 
4481 	uint32_t preferred_cluster_load = shared_rsrc_thread ? (uint32_t)sched_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket);
4482 	if (preferred_cluster_load == 0) {
4483 		goto migrate_candidate_available_check;
4484 	}
4485 
4486 	/*
4487 	 * If a thread is being rebalanced for achieving equal progress of parallel workloads,
4488 	 * it needs to end up on the preferred runqueue. This mechanism should only be used for
4489 	 * threads which have been previously migrated to the non-preferred cluster type.
4490 	 *
4491 	 * The AMP rebalancing mechanism is available for regular threads or shared resource
4492 	 * threads with the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy.
4493 	 */
4494 	bool amp_rebalance_eligible = (!shared_rsrc_thread) || (shared_rsrc_thread && (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST));
4495 	if (amp_rebalance_eligible) {
4496 		boolean_t amp_rebalance = (thread->reason & (AST_REBALANCE | AST_QUANTUM)) == (AST_REBALANCE | AST_QUANTUM);
4497 		if (amp_rebalance) {
4498 			boolean_t non_preferred_pset = (thread->last_processor->processor_set->pset_type != preferred_pset->pset_type);
4499 			if (non_preferred_pset) {
4500 				selected_pset = sched_edge_amp_rebalance_pset(preferred_pset, thread);
4501 				goto migrate_candidate_available_check;
4502 			}
4503 		}
4504 	}
4505 
4506 	/* Look at edge weights to decide the most ideal migration candidate for this thread */
4507 	selected_pset = sched_edge_migrate_edges_evaluate(preferred_pset, preferred_cluster_load, thread);
4508 
4509 migrate_candidate_available_check:
4510 	locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
4511 	if (sched_edge_pset_available(selected_pset) == true) {
4512 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_OVERLOAD) | DBG_FUNC_NONE, thread_tid(thread), preferred_cluster_id, selected_pset->pset_cluster_id, preferred_cluster_load);
4513 		return selected_pset;
4514 	}
4515 	/* Looks like selected_pset is not available for scheduling; remove it from candidate_cluster_bitmap */
4516 	bitmap_clear(&candidate_cluster_bitmap, selected_pset->pset_cluster_id);
4517 	if (__improbable(bitmap_first(&candidate_cluster_bitmap, sched_edge_max_clusters) == -1)) {
4518 		pset_unlock(locked_pset);
4519 		return NULL;
4520 	}
4521 	/* Try and find an alternative for the selected pset */
4522 	selected_pset = sched_edge_candidate_alternative(selected_pset, candidate_cluster_bitmap);
4523 	goto migrate_candidate_available_check;
4524 }
4525 
4526 static processor_t
4527 sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread)
4528 {
4529 	/* Bound threads don't call this function */
4530 	assert(thread->bound_processor == PROCESSOR_NULL);
4531 	processor_t chosen_processor = PROCESSOR_NULL;
4532 
4533 	/*
4534 	 * sched_edge_preferred_pset() returns the preferred pset for a given thread.
4535 	 * It should take the passed in "pset" as a hint which represents the recency metric for
4536 	 * pset selection logic.
4537 	 */
4538 	processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
4539 	processor_set_t chosen_pset = preferred_pset;
4540 	/*
4541 	 * If the preferred pset is overloaded, find a pset which is the best candidate to migrate
4542 	 * threads to. sched_edge_migrate_candidate() returns the preferred pset
4543 	 * if it has capacity; otherwise finds the best candidate pset to migrate this thread to.
4544 	 *
4545 	 * Edge Scheduler Optimization
4546 	 * It might be useful to build a recency metric for the thread for multiple clusters and
4547 	 * factor that into the migration decisions.
4548 	 */
4549 	chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, pset, true);
4550 	if (chosen_pset) {
4551 		chosen_processor = choose_processor(chosen_pset, processor, thread);
4552 	}
4553 	/* For RT threads, choose_processor() can return a different cluster than the one passed into it */
4554 	assert(chosen_processor ? chosen_processor->processor_set->pset_type == chosen_pset->pset_type : true);
4555 	return chosen_processor;
4556 }
4557 
4558 /*
4559  * sched_edge_clutch_bucket_threads_drain()
4560  *
4561  * Drains all the runnable threads which are not restricted to the root_clutch (due to clutch
4562  * bucket overrides etc.) into a local thread queue.
4563  */
4564 static void
4565 sched_edge_clutch_bucket_threads_drain(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch, queue_t clutch_threads)
4566 {
4567 	thread_t thread = THREAD_NULL;
4568 	uint64_t current_timestamp = mach_approximate_time();
4569 	qe_foreach_element_safe(thread, &clutch_bucket->scb_thread_timeshare_queue, th_clutch_timeshare_link) {
4570 		sched_clutch_thread_remove(root_clutch, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
4571 		enqueue_tail(clutch_threads, &thread->runq_links);
4572 	}
4573 }
4574 
4575 /*
4576  * sched_edge_run_drained_threads()
4577  *
4578  * Makes all drained threads in a local queue runnable.
4579  */
4580 static void
4581 sched_edge_run_drained_threads(queue_t clutch_threads)
4582 {
4583 	thread_t thread;
4584 	/* Now setrun all the threads in the local queue */
4585 	qe_foreach_element_safe(thread, clutch_threads, runq_links) {
4586 		remqueue(&thread->runq_links);
4587 		thread_lock(thread);
4588 		thread_setrun(thread, SCHED_TAILQ);
4589 		thread_unlock(thread);
4590 	}
4591 }
4592 
4593 /*
4594  * sched_edge_update_preferred_cluster()
4595  *
4596  * Routine to update the preferred cluster for QoS buckets within a thread group.
4597  * The buckets to be updated are specified as a bitmap (clutch_bucket_modify_bitmap).
4598  */
4599 static void
4600 sched_edge_update_preferred_cluster(
4601 	sched_clutch_t sched_clutch,
4602 	bitmap_t *clutch_bucket_modify_bitmap,
4603 	uint32_t *tg_bucket_preferred_cluster)
4604 {
4605 	for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
4606 		os_atomic_store(&sched_clutch->sc_clutch_groups[bucket].scbg_preferred_cluster, tg_bucket_preferred_cluster[bucket], relaxed);
4607 	}
4608 }
4609 
4610 /*
4611  * sched_edge_migrate_thread_group_runnable_threads()
4612  *
4613  * Routine to implement the migration of threads on a cluster when the thread group
4614  * recommendation is updated. The migration works using a 2-phase
4615  * algorithm.
4616  *
4617  * Phase 1: With the pset lock held, check the recommendation of the clutch buckets.
4618  * For each clutch bucket, if it needs to be migrated immediately, drain the threads
4619  * into a local thread queue. Otherwise mark the clutch bucket as native/foreign as
4620  * appropriate.
4621  *
4622  * Phase 2: After unlocking the pset, drain all the threads from the local thread
4623  * queue and mark them runnable which should land them in the right hierarchy.
4624  *
4625  * The routine assumes that the preferences for the clutch buckets/clutch bucket
4626  * groups have already been updated by the caller.
4627  *
4628  * - Called with the pset locked and interrupts disabled.
4629  * - Returns with the pset unlocked.
4630  */
4631 static void
4632 sched_edge_migrate_thread_group_runnable_threads(
4633 	sched_clutch_t sched_clutch,
4634 	sched_clutch_root_t root_clutch,
4635 	bitmap_t *clutch_bucket_modify_bitmap,
4636 	__unused uint32_t *tg_bucket_preferred_cluster,
4637 	bool migrate_immediately)
4638 {
4639 	/* Queue to hold threads that have been drained from clutch buckets to be migrated */
4640 	queue_head_t clutch_threads;
4641 	queue_init(&clutch_threads);
4642 
4643 	for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
4644 		/* Get the clutch bucket for this cluster and sched bucket */
4645 		sched_clutch_bucket_group_t clutch_bucket_group = &(sched_clutch->sc_clutch_groups[bucket]);
4646 		sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
4647 		sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
4648 		if (scb_root == NULL) {
4649 			/* Clutch bucket not runnable or already in the right hierarchy; nothing to do here */
4650 			assert(clutch_bucket->scb_thr_count == 0);
4651 			continue;
4652 		}
4653 		assert(scb_root == root_clutch);
4654 		uint32_t clutch_bucket_preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
4655 
4656 		if (migrate_immediately) {
4657 			/*
4658 			 * For transitions where threads need to be migrated immediately, drain the threads into a
4659 			 * local queue unless we are looking at the clutch buckets for the newly recommended
4660 			 * cluster.
4661 			 */
4662 			if (root_clutch->scr_cluster_id != clutch_bucket_preferred_cluster) {
4663 				sched_edge_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads);
4664 			} else {
4665 				sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
4666 			}
4667 		} else {
4668 			/* Check if this cluster is the same type as the newly recommended cluster */
4669 			boolean_t homogeneous_cluster = (pset_type_for_id(root_clutch->scr_cluster_id) == pset_type_for_id(clutch_bucket_preferred_cluster));
4670 			/*
4671 			 * If threads do not have to be migrated immediately, just change the native/foreign
4672 			 * flag on the clutch bucket.
4673 			 */
4674 			if (homogeneous_cluster) {
4675 				sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
4676 			} else {
4677 				sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
4678 			}
4679 		}
4680 	}
4681 
4682 	pset_unlock(root_clutch->scr_pset);
4683 	sched_edge_run_drained_threads(&clutch_threads);
4684 }
4685 
4686 /*
4687  * sched_edge_migrate_thread_group_running_threads()
4688  *
4689  * Routine to find all running threads of a thread group on a specific cluster
4690  * and IPI them if they need to be moved immediately.
4691  */
4692 static void
4693 sched_edge_migrate_thread_group_running_threads(
4694 	sched_clutch_t sched_clutch,
4695 	sched_clutch_root_t root_clutch,
4696 	__unused bitmap_t *clutch_bucket_modify_bitmap,
4697 	uint32_t *tg_bucket_preferred_cluster,
4698 	bool migrate_immediately)
4699 {
4700 	if (migrate_immediately == false) {
4701 		/* If CLPC has recommended not to move threads immediately, nothing to do here */
4702 		return;
4703 	}
4704 
4705 	/*
4706 	 * Edge Scheduler Optimization
4707 	 *
4708 	 * When the system has a large number of clusters and cores, it might be useful to
4709 	 * narrow down the iteration by using a thread running bitmap per clutch.
4710 	 */
4711 	uint64_t ast_processor_map = 0;
4712 	sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
4713 
4714 	uint64_t running_map = root_clutch->scr_pset->cpu_state_map[PROCESSOR_RUNNING];
4715 	/*
4716 	 * Iterate all CPUs and look for the ones running threads from this thread group and are
4717 	 * not restricted to the specific cluster (due to overrides etc.)
4718 	 */
4719 	for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
4720 		processor_t src_processor = processor_array[cpuid];
4721 		boolean_t expected_tg = (src_processor->current_thread_group == sched_clutch->sc_tg);
4722 		sched_bucket_t processor_sched_bucket = src_processor->processor_set->cpu_running_buckets[cpuid];
4723 		if (processor_sched_bucket == TH_BUCKET_SCHED_MAX) {
4724 			continue;
4725 		}
4726 		boolean_t non_preferred_cluster = tg_bucket_preferred_cluster[processor_sched_bucket] != root_clutch->scr_cluster_id;
4727 
4728 		if (expected_tg && non_preferred_cluster) {
4729 			ipi_type[cpuid] = sched_ipi_action(src_processor, NULL, SCHED_IPI_EVENT_REBALANCE);
4730 			if (ipi_type[cpuid] != SCHED_IPI_NONE) {
4731 				bit_set(ast_processor_map, cpuid);
4732 			} else if (src_processor == current_processor()) {
4733 				ast_on(AST_PREEMPT);
4734 				bit_set(root_clutch->scr_pset->pending_AST_PREEMPT_cpu_mask, cpuid);
4735 			}
4736 		}
4737 	}
4738 
4739 	/* Perform all the IPIs */
4740 	if (bit_first(ast_processor_map) != -1) {
4741 		for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
4742 			processor_t ast_processor = processor_array[cpuid];
4743 			sched_ipi_perform(ast_processor, ipi_type[cpuid]);
4744 		}
4745 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, thread_group_get_id(sched_clutch->sc_tg), ast_processor_map, 0, 0);
4746 	}
4747 }
4748 
4749 /*
4750  * sched_edge_tg_preferred_cluster_change()
4751  *
4752  * Routine to handle changes to a thread group's recommendation. In the Edge Scheduler, the preferred cluster
4753  * is specified on a per-QoS basis within a thread group. The routine updates the preferences and performs
4754  * thread migrations based on the policy specified by CLPC.
4755  * tg_bucket_preferred_cluster is an array of size TH_BUCKET_SCHED_MAX which specifies the new preferred cluster
4756  * for each QoS within the thread group.
4757  */
4758 void
4759 sched_edge_tg_preferred_cluster_change(struct thread_group *tg, uint32_t *tg_bucket_preferred_cluster, sched_perfcontrol_preferred_cluster_options_t options)
4760 {
4761 	sched_clutch_t clutch = sched_clutch_for_thread_group(tg);
4762 	/*
4763 	 * In order to optimize the processing, create a bitmap which represents all QoS buckets
4764 	 * for which the preferred cluster has changed.
4765 	 */
4766 	bitmap_t clutch_bucket_modify_bitmap[BITMAP_LEN(TH_BUCKET_SCHED_MAX)] = {0};
4767 	for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
4768 		uint32_t old_preferred_cluster = sched_edge_clutch_bucket_group_preferred_cluster(&clutch->sc_clutch_groups[bucket]);
4769 		uint32_t new_preferred_cluster = tg_bucket_preferred_cluster[bucket];
4770 		if (old_preferred_cluster != new_preferred_cluster) {
4771 			bitmap_set(clutch_bucket_modify_bitmap, bucket);
4772 		}
4773 	}
4774 	if (bitmap_lsb_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
4775 		/* No changes in any clutch buckets; nothing to do here */
4776 		return;
4777 	}
4778 
4779 	for (uint32_t cluster_id = 0; cluster_id < sched_edge_max_clusters; cluster_id++) {
4780 		processor_set_t pset = pset_array[cluster_id];
4781 		spl_t s = splsched();
4782 		pset_lock(pset);
4783 		/*
4784 		 * The first operation is to update the preferred cluster for all QoS buckets within the
4785 		 * thread group so that any future threads becoming runnable would see the new preferred
4786 		 * cluster value.
4787 		 */
4788 		sched_edge_update_preferred_cluster(clutch, clutch_bucket_modify_bitmap, tg_bucket_preferred_cluster);
4789 		/*
4790 		 * Currently iterates all clusters looking for running threads for a TG to be migrated. Can be optimized
4791 		 * by keeping a per-clutch bitmap of clusters running threads for a particular TG.
4792 		 *
4793 		 * Edge Scheduler Optimization
4794 		 */
4795 		/* Migrate all running threads of the TG on this cluster based on options specified by CLPC */
4796 		sched_edge_migrate_thread_group_running_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
4797 		    tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNING));
4798 		/* Migrate all runnable threads of the TG in this cluster's hierarchy based on options specified by CLPC */
4799 		sched_edge_migrate_thread_group_runnable_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
4800 		    tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNABLE));
4801 		/* sched_edge_migrate_thread_group_runnable_threads() returns with pset unlocked */
4802 		splx(s);
4803 	}
4804 }
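
/*
 * Example flow (hypothetical recommendation): if CLPC moves only the FG bucket
 * of a thread group from cluster 0 to cluster 1 with both the MIGRATE_RUNNING
 * and MIGRATE_RUNNABLE options set, a single bit ends up in the modify bitmap
 * and each cluster's hierarchy is visited once: running FG threads of the group
 * are sent rebalance IPIs, while runnable FG threads queued on non-preferred
 * clusters are drained and re-enqueued so their placement is re-evaluated
 * against the new preferred cluster.
 */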
4805 
4806 /*
4807  * sched_edge_pset_made_schedulable()
4808  *
4809  * Routine to migrate all the clutch buckets which are not in their recommended
4810  * pset hierarchy now that a new pset has become runnable. It's possible that this
4811  * routine is called when the pset is already marked schedulable.
4812  *
4813  * Invoked with the pset lock held and interrupts disabled.
4814  */
4815 static void
4816 sched_edge_pset_made_schedulable(__unused processor_t processor, processor_set_t dst_pset, boolean_t drop_lock)
4817 {
4818 	if (bitmap_test(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id)) {
4819 		/* Nothing to do here since pset is already marked schedulable */
4820 		if (drop_lock) {
4821 			pset_unlock(dst_pset);
4822 		}
4823 		return;
4824 	}
4825 
4826 	bitmap_set(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id);
4827 
4828 	thread_t thread = sched_edge_processor_idle(dst_pset);
4829 	if (thread != THREAD_NULL) {
4830 		thread_lock(thread);
4831 		thread_setrun(thread, SCHED_TAILQ);
4832 		thread_unlock(thread);
4833 	}
4834 
4835 	if (!drop_lock) {
4836 		pset_lock(dst_pset);
4837 	}
4838 }
4839 
4840 /*
4841  * sched_edge_cpu_init_completed()
4842  *
4843  * Callback routine from the platform layer once all CPUs/clusters have been initialized. This
4844  * provides an opportunity for the edge scheduler to initialize all the edge parameters.
4845  */
4846 static void
4847 sched_edge_cpu_init_completed(void)
4848 {
4849 	spl_t s = splsched();
4850 	for (int src_cluster_id = 0; src_cluster_id < sched_edge_max_clusters; src_cluster_id++) {
4851 		processor_set_t src_pset = pset_array[src_cluster_id];
4852 		pset_lock(src_pset);
4853 
4854 		/* For each cluster, set all its outgoing edge parameters */
4855 		for (int dst_cluster_id = 0; dst_cluster_id < sched_edge_max_clusters; dst_cluster_id++) {
4856 			if (dst_cluster_id == src_cluster_id) {
4857 				continue;
4858 			}
4859 			processor_set_t dst_pset = pset_array[dst_cluster_id];
4860 			if (src_pset->pset_type == dst_pset->pset_type) {
4861 				/* P->P/E->E edge config */
4862 				bitmap_clear(src_pset->foreign_psets, dst_cluster_id);
4863 				bitmap_set(src_pset->native_psets, dst_cluster_id);
4864 				sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 1, .sce_steal_allowed = 1});
4865 			} else if ((src_pset->pset_type == CLUSTER_TYPE_P) && (dst_pset->pset_type == CLUSTER_TYPE_E)) {
4866 				/* P->E edge config */
4867 				bitmap_set(src_pset->foreign_psets, dst_cluster_id);
4868 				bitmap_clear(src_pset->native_psets, dst_cluster_id);
4869 				sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 64, .sce_migration_allowed = 1, .sce_steal_allowed = 1});
4870 			} else {
4871 				/* E->P edge config */
4872 				bitmap_set(src_pset->foreign_psets, dst_cluster_id);
4873 				bitmap_clear(src_pset->native_psets, dst_cluster_id);
4874 				sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0});
4875 			}
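			/* Classify the destination cluster as die-local or remote relative to the source cluster */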
4876 			bool clusters_local = (ml_get_die_id(src_cluster_id) == ml_get_die_id(dst_cluster_id));
4877 			if (clusters_local) {
4878 				bitmap_set(src_pset->local_psets, dst_cluster_id);
4879 				bitmap_clear(src_pset->remote_psets, dst_cluster_id);
4880 			} else {
4881 				bitmap_set(src_pset->remote_psets, dst_cluster_id);
4882 				bitmap_clear(src_pset->local_psets, dst_cluster_id);
4883 			}
4884 		}
4885 
4886 		pset_unlock(src_pset);
4887 	}
4888 	splx(s);
4889 }
4890 
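/*
 * sched_edge_thread_eligible_for_pset()
 *
 * A thread is eligible for a pset if that pset is the thread's preferred cluster,
 * or if the edge from the preferred cluster to this pset allows migration.
 */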
4891 static bool
4892 sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset)
4893 {
4894 	uint32_t preferred_cluster_id = sched_edge_thread_preferred_cluster(thread);
4895 	if (preferred_cluster_id == pset->pset_cluster_id) {
4896 		return true;
4897 	} else {
4898 		processor_set_t preferred_pset = pset_array[preferred_cluster_id];
4899 		return preferred_pset->sched_edges[pset->pset_cluster_id].sce_migration_allowed;
4900 	}
4901 }
4902 
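/* IPI policy tunables shared with the AMP scheduler */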
4903 extern int sched_amp_spill_deferred_ipi;
4904 extern int sched_amp_pcores_preempt_immediate_ipi;
4905 
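/*
 * When set, sched_edge_ipi_policy() may upgrade preemption IPIs for non-RT threads
 * to immediate IPIs, based on the target's similarity to the thread's preferred
 * cluster and the preferred cluster's estimated busyness (see below).
 */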
4906 int sched_edge_migrate_ipi_immediate = 1;
4907 
4908 sched_ipi_type_t
4909 sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
4910 {
4911 	processor_set_t pset = dst->processor_set;
4912 	assert(dst != current_processor());
4913 
4914 	boolean_t deferred_ipi_supported = false;
4915 #if defined(CONFIG_SCHED_DEFERRED_AST)
4916 	deferred_ipi_supported = true;
4917 #endif /* CONFIG_SCHED_DEFERRED_AST */
4918 
4919 	switch (event) {
4920 	case SCHED_IPI_EVENT_SPILL:
4921 		/* For spill events, use deferred IPIs whenever the platform supports them */
4922 		if (deferred_ipi_supported) {
4923 			return sched_ipi_deferred_policy(pset, dst, thread, event);
4924 		}
4925 		break;
4926 	case SCHED_IPI_EVENT_PREEMPT:
4927 		/* For preemption, the default policy is to use deferred IPIs
4928 		 * for non-RT P-core preemption. Override that behavior if
4929 		 * sched_amp_pcores_preempt_immediate_ipi is set.
4930 		 */
4931 		if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
4932 			if (sched_amp_pcores_preempt_immediate_ipi && (pset_type_for_id(pset->pset_cluster_id) == CLUSTER_TYPE_P)) {
4933 				return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4934 			}
4935 			if (sched_edge_migrate_ipi_immediate) {
4936 				processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
4937 				/*
4938 				 * For IPI'ing CPUs that are homogeneous with the preferred cluster, use immediate IPIs
4939 				 */
4940 				if (preferred_pset->pset_type == pset->pset_type) {
4941 					return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4942 				}
4943 				/*
4944 				 * For workloads that are going wide, it might be useful to use an immediate IPI to
4945 				 * wake up the idle CPU if the scheduler estimates that the preferred pset will
4946 				 * be busy for the deferred IPI timeout. The Edge Scheduler uses the avg execution
4947 				 * latency on the preferred pset as an estimate of busyness.
4948 				 */
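				/* Convert the average execution time (usec) to nsec for comparison with the deferred IPI timeout */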
4949 				if ((preferred_pset->pset_execution_time[thread->th_sched_bucket].pset_avg_thread_execution_time * NSEC_PER_USEC) >= ml_cpu_signal_deferred_get_timer()) {
4950 					return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4951 				}
4952 			}
4953 		}
4954 		break;
4955 	default:
4956 		break;
4957 	}
4958 	/* Default back to the global policy for all other scenarios */
4959 	return sched_ipi_policy(dst, thread, dst_idle, event);
4960 }
4961 
4962 /*
4963  * sched_edge_qos_max_parallelism()
4964  */
4965 uint32_t
4966 sched_edge_qos_max_parallelism(int qos, uint64_t options)
4967 {
4968 	uint32_t ecpu_count = ml_get_cpu_number_type(CLUSTER_TYPE_E, false, false);
4969 	uint32_t pcpu_count = ml_get_cpu_number_type(CLUSTER_TYPE_P, false, false);
4970 	uint32_t ecluster_count = ml_get_cluster_number_type(CLUSTER_TYPE_E);
4971 	uint32_t pcluster_count = ml_get_cluster_number_type(CLUSTER_TYPE_P);
4972 
4973 	if (options & QOS_PARALLELISM_REALTIME) {
4974 		/* For realtime threads on AMP, limit the width to just
4975 		 * the P-cores since the scheduler does not spill/rebalance
4976 		 * RT threads.
4977 		 */
4978 		return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? pcluster_count : pcpu_count;
4979 	}
4980 
4981 	/*
4982 	 * The Edge scheduler supports per-QoS recommendations for thread groups.
4983 	 * This enables lower QoS buckets (such as UT) to be scheduled on all
4984 	 * CPUs on the system.
4985 	 *
4986 	 * The only restriction is for BG/Maintenance QoS classes for which the
4987 	 * performance controller would never recommend execution on the P-cores.
4988 	 * If that policy changes in the future, this value should be changed.
4989 	 */
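	/*
	 * For example, on a hypothetical system with one 4-CPU E-cluster and one
	 * 4-CPU P-cluster, BG/Maintenance QoS gets a width of 4 (E-cores only) while
	 * all other QoS classes get 8; with QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE
	 * the widths are 1 and 2 respectively.
	 */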
4990 	switch (qos) {
4991 	case THREAD_QOS_BACKGROUND:
4992 	case THREAD_QOS_MAINTENANCE:
4993 		return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? ecluster_count : ecpu_count;
4994 	default:
4995 		return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? (ecluster_count + pcluster_count) : (ecpu_count + pcpu_count);
4996 	}
4997 }
4998 
4999 
5000 
5001 #endif /* CONFIG_SCHED_EDGE */
5002 
5003 #endif /* CONFIG_SCHED_CLUTCH */
5004