xref: /xnu-10063.101.15/osfmk/kern/sched_clutch.c (revision 94d3b452840153a99b38a3a9659680b2a006908e)
1 /*
2  * Copyright (c) 2018 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <mach/mach_types.h>
30 #include <mach/machine.h>
31 #include <machine/machine_routines.h>
32 #include <machine/sched_param.h>
33 #include <machine/machine_cpu.h>
34 #include <kern/kern_types.h>
35 #include <kern/debug.h>
36 #include <kern/machine.h>
37 #include <kern/misc_protos.h>
38 #include <kern/processor.h>
39 #include <kern/queue.h>
40 #include <kern/sched.h>
41 #include <kern/sched_prim.h>
42 #include <kern/task.h>
43 #include <kern/thread.h>
44 #include <kern/sched_clutch.h>
45 #include <machine/atomic.h>
46 #include <kern/sched_clutch.h>
47 #include <sys/kdebug.h>
48 
49 #if CONFIG_SCHED_EDGE
50 #include <kern/sched_amp_common.h>
51 #endif /* CONFIG_SCHED_EDGE */
52 
53 #if CONFIG_SCHED_CLUTCH
54 
55 /* Forward declarations of static routines */
56 
57 /* Root level hierarchy management */
58 static void sched_clutch_root_init(sched_clutch_root_t, processor_set_t);
59 static void sched_clutch_root_bucket_init(sched_clutch_root_bucket_t, sched_bucket_t, bool);
60 static void sched_clutch_root_pri_update(sched_clutch_root_t);
61 static void sched_clutch_root_urgency_inc(sched_clutch_root_t, thread_t);
62 static void sched_clutch_root_urgency_dec(sched_clutch_root_t, thread_t);
63 
64 __enum_decl(sched_clutch_highest_root_bucket_type_t, uint32_t, {
65 	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_NONE           = 0,
66 	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY   = 1,
67 	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL            = 2,
68 });
69 
70 static sched_clutch_root_bucket_t sched_clutch_root_highest_root_bucket(sched_clutch_root_t, uint64_t, sched_clutch_highest_root_bucket_type_t);
71 
72 #if CONFIG_SCHED_EDGE
73 /* Support for foreign threads on AMP platforms */
74 static boolean_t sched_clutch_root_foreign_empty(sched_clutch_root_t);
75 static thread_t sched_clutch_root_highest_foreign_thread_remove(sched_clutch_root_t);
76 #endif /* CONFIG_SCHED_EDGE */
77 
78 /* Root bucket level hierarchy management */
79 static uint64_t sched_clutch_root_bucket_deadline_calculate(sched_clutch_root_bucket_t, uint64_t);
80 static void sched_clutch_root_bucket_deadline_update(sched_clutch_root_bucket_t, sched_clutch_root_t, uint64_t);
81 
82 /* Options for clutch bucket ordering in the runq */
83 __options_decl(sched_clutch_bucket_options_t, uint32_t, {
84 	SCHED_CLUTCH_BUCKET_OPTIONS_NONE        = 0x0,
85 	/* Round robin clutch bucket on thread removal */
86 	SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR  = 0x1,
87 	/* Insert clutch bucket at head (for thread preemption) */
88 	SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ       = 0x2,
89 	/* Insert clutch bucket at tail (default) */
90 	SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ       = 0x4,
91 });
92 
93 /* Clutch bucket level hierarchy management */
94 static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
95 static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
96 static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
97 static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
98 static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
99 static uint8_t sched_clutch_bucket_pri_calculate(sched_clutch_bucket_t, uint64_t);
100 
101 /* Clutch bucket group level properties management */
102 static void sched_clutch_bucket_group_cpu_usage_update(sched_clutch_bucket_group_t, uint64_t);
103 static void sched_clutch_bucket_group_cpu_adjust(sched_clutch_bucket_group_t, uint8_t);
104 static void sched_clutch_bucket_group_timeshare_update(sched_clutch_bucket_group_t, sched_clutch_bucket_t, uint64_t);
105 static uint8_t sched_clutch_bucket_group_pending_ageout(sched_clutch_bucket_group_t, uint64_t);
106 static uint32_t sched_clutch_bucket_group_run_count_inc(sched_clutch_bucket_group_t);
107 static uint32_t sched_clutch_bucket_group_run_count_dec(sched_clutch_bucket_group_t);
108 static uint8_t sched_clutch_bucket_group_interactivity_score_calculate(sched_clutch_bucket_group_t, uint64_t);
109 
110 /* Clutch timeshare properties updates */
111 static uint32_t sched_clutch_run_bucket_incr(sched_clutch_t, sched_bucket_t);
112 static uint32_t sched_clutch_run_bucket_decr(sched_clutch_t, sched_bucket_t);
113 
114 /* Clutch membership management */
115 static boolean_t sched_clutch_thread_insert(sched_clutch_root_t, thread_t, integer_t);
116 static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t, sched_clutch_bucket_options_t);
117 static thread_t sched_clutch_thread_highest_remove(sched_clutch_root_t);
118 
119 /* Clutch properties updates */
120 static uint32_t sched_clutch_root_urgency(sched_clutch_root_t);
121 static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t);
122 static int sched_clutch_root_priority(sched_clutch_root_t);
123 static sched_clutch_bucket_t sched_clutch_root_bucket_highest_clutch_bucket(sched_clutch_root_bucket_t);
124 static boolean_t sched_thread_sched_pri_promoted(thread_t);
125 
126 #if CONFIG_SCHED_EDGE
127 /* System based routines */
128 static bool sched_edge_pset_available(processor_set_t);
129 static uint32_t sched_edge_thread_bound_cluster_id(thread_t);
130 static int sched_edge_iterate_clusters_ordered(processor_set_t, uint64_t, int);
131 
132 /* Global indicating the maximum number of clusters on the current platform */
133 static int sched_edge_max_clusters = 0;
134 #endif /* CONFIG_SCHED_EDGE */
135 
136 /* Helper debugging routines */
137 static inline void sched_clutch_hierarchy_locked_assert(sched_clutch_root_t);
138 
139 extern processor_set_t pset_array[MAX_PSETS];
140 
141 /*
142  * Special markers for buckets that have invalid WCELs/quantums etc.
143  */
144 #define SCHED_CLUTCH_INVALID_TIME_32 ((uint32_t)~0)
145 #define SCHED_CLUTCH_INVALID_TIME_64 ((uint64_t)~0)
146 
147 /*
148  * Root level bucket WCELs
149  *
150  * The root level bucket selection algorithm is an Earliest Deadline
151  * First (EDF) algorithm where the deadline for buckets are defined
152  * by the worst-case-execution-latency and the make runnable timestamp
153  * for the bucket.
154  *
155  */
156 static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = {
157 	SCHED_CLUTCH_INVALID_TIME_32,                   /* FIXPRI */
158 	0,                                              /* FG */
159 	37500,                                          /* IN (37.5ms) */
160 	75000,                                          /* DF (75ms) */
161 	150000,                                         /* UT (150ms) */
162 	250000                                          /* BG (250ms) */
163 };
164 static uint64_t sched_clutch_root_bucket_wcel[TH_BUCKET_SCHED_MAX] = {0};
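
/*
 * A minimal sketch of how a WCEL turns into an EDF deadline for a root bucket
 * (this mirrors sched_clutch_root_bucket_deadline_calculate() below; the helper
 * name here is illustrative only):
 *
 *	uint64_t
 *	example_root_bucket_deadline(sched_bucket_t bucket, uint64_t runnable_ts)
 *	{
 *		if (bucket < TH_BUCKET_SHARE_FG) {
 *			return 0;	/* FIXPRI is not scheduled via deadlines */
 *		}
 *		return runnable_ts + sched_clutch_root_bucket_wcel[bucket];
 *	}
 *
 * So an IN bucket made runnable at time T gets deadline T + 37.5ms and is
 * picked before a UT bucket made runnable at the same time (deadline T + 150ms),
 * unless warp (below) applies.
 */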
165 
166 /*
167  * Root level bucket warp
168  *
169  * Each root level bucket has a warp value associated with it as well.
170  * The warp value allows the root bucket to effectively warp ahead of
171  * lower priority buckets for a limited time even if it has a later
172  * deadline. The warping behavior provides extra (but limited)
173  * opportunity for high priority buckets to remain responsive.
174  */
175 
176 /* Special warp deadline value to indicate that the bucket has not used any warp yet */
177 #define SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED    (SCHED_CLUTCH_INVALID_TIME_64)
178 
179 /* Warp window durations for various tiers */
180 static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = {
181 	SCHED_CLUTCH_INVALID_TIME_32,                   /* FIXPRI */
182 	8000,                                           /* FG (8ms)*/
183 	4000,                                           /* IN (4ms) */
184 	2000,                                           /* DF (2ms) */
185 	1000,                                           /* UT (1ms) */
186 	0                                               /* BG (0ms) */
187 };
188 static uint64_t sched_clutch_root_bucket_warp[TH_BUCKET_SCHED_MAX] = {0};
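
/*
 * Rough sketch of the warp bookkeeping done by the root bucket selection and
 * empty paths below (timestamps illustrative): when a higher bucket with warp
 * remaining is chosen ahead of the EDF pick, a warp window is opened and the
 * wall time the window stays open is charged against the remaining warp once
 * the bucket goes empty:
 *
 *	scrb_warped_deadline = now + scrb_warp_remaining;	/* window opens */
 *	...
 *	if (scrb_warped_deadline > empty_ts) {
 *		scrb_warp_remaining = scrb_warped_deadline - empty_ts;
 *	} else {
 *		scrb_warp_remaining = 0;			/* warp exhausted */
 *	}
 */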
189 
190 /*
191  * Thread level quantum
192  *
193  * The algorithm defines quantums for threads at various buckets. This
194  * (combined with the root level bucket quantums) restricts how much
195  * the lower priority levels can preempt the higher priority threads.
196  */
197 
198 #if XNU_TARGET_OS_OSX
199 static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
200 	10000,                                          /* FIXPRI (10ms) */
201 	10000,                                          /* FG (10ms) */
202 	10000,                                          /* IN (10ms) */
203 	10000,                                          /* DF (10ms) */
204 	4000,                                           /* UT (4ms) */
205 	2000                                            /* BG (2ms) */
206 };
207 #else /* XNU_TARGET_OS_OSX */
208 static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
209 	10000,                                          /* FIXPRI (10ms) */
210 	10000,                                          /* FG (10ms) */
211 	8000,                                           /* IN (8ms) */
212 	6000,                                           /* DF (6ms) */
213 	4000,                                           /* UT (4ms) */
214 	2000                                            /* BG (2ms) */
215 };
216 #endif /* XNU_TARGET_OS_OSX */
217 
218 static uint64_t sched_clutch_thread_quantum[TH_BUCKET_SCHED_MAX] = {0};
219 
220 /*
221  * sched_clutch_us_to_abstime()
222  *
223  * Initializer for converting all durations in usec to abstime
224  */
225 static void
226 sched_clutch_us_to_abstime(uint32_t *us_vals, uint64_t *abstime_vals)
227 {
228 	for (int i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
229 		if (us_vals[i] == SCHED_CLUTCH_INVALID_TIME_32) {
230 			abstime_vals[i] = SCHED_CLUTCH_INVALID_TIME_64;
231 		} else {
232 			clock_interval_to_absolutetime_interval(us_vals[i],
233 			    NSEC_PER_USEC, &abstime_vals[i]);
234 		}
235 	}
236 }
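
/*
 * The conversion above is typically run once during scheduler init for each of
 * the tables defined in this file, along the lines of (call site illustrative):
 *
 *	sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
 *	sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
 *	sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
 */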
237 
238 /* Clutch/Edge Scheduler Debugging support */
239 #define SCHED_CLUTCH_DBG_THR_COUNT_PACK(a, b, c)        ((uint64_t)c | ((uint64_t)b << 16) | ((uint64_t)a << 32))
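
/*
 * The pack macro above lays three 16-bit thread counts out in a single
 * tracepoint argument: 'a' in bits [32..47], 'b' in bits [16..31] and 'c' in
 * bits [0..15]. For example (values illustrative):
 *
 *	SCHED_CLUTCH_DBG_THR_COUNT_PACK(2, 5, 9) == (2ULL << 32) | (5ULL << 16) | 9ULL
 *	                                         == 0x0000000200050009
 */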
240 
241 #if DEVELOPMENT || DEBUG
242 
243 /*
244  * sched_clutch_hierarchy_locked_assert()
245  *
246  * Debugging helper routine. Asserts that the hierarchy is locked. The locking
247  * for the hierarchy depends on where the hierarchy is hooked. The current
248  * implementation hooks the hierarchy at the pset, so the hierarchy is locked
249  * using the pset lock.
250  */
251 static inline void
252 sched_clutch_hierarchy_locked_assert(
253 	sched_clutch_root_t root_clutch)
254 {
255 	pset_assert_locked(root_clutch->scr_pset);
256 }
257 
258 #else /* DEVELOPMENT || DEBUG */
259 
260 static inline void
261 sched_clutch_hierarchy_locked_assert(
262 	__unused sched_clutch_root_t root_clutch)
263 {
264 }
265 
266 #endif /* DEVELOPMENT || DEBUG */
267 
268 /*
269  * sched_clutch_thr_count_inc()
270  *
271  * Increment thread count at a hierarchy level with overflow checks.
272  */
273 static void
274 sched_clutch_thr_count_inc(
275 	uint16_t *thr_count)
276 {
277 	if (__improbable(os_inc_overflow(thr_count))) {
278 		panic("sched_clutch thread count overflowed!");
279 	}
280 }
281 
282 /*
283  * sched_clutch_thr_count_dec()
284  *
285  * Decrement thread count at a hierarchy level with underflow checks.
286  */
287 static void
288 sched_clutch_thr_count_dec(
289 	uint16_t *thr_count)
290 {
291 	if (__improbable(os_dec_overflow(thr_count))) {
292 		panic("sched_clutch thread count underflowed!");
293 	}
294 }
295 
296 static sched_bucket_t
297 sched_convert_pri_to_bucket(uint8_t priority)
298 {
299 	sched_bucket_t bucket = TH_BUCKET_RUN;
300 
301 	if (priority > BASEPRI_USER_INITIATED) {
302 		bucket = TH_BUCKET_SHARE_FG;
303 	} else if (priority > BASEPRI_DEFAULT) {
304 		bucket = TH_BUCKET_SHARE_IN;
305 	} else if (priority > BASEPRI_UTILITY) {
306 		bucket = TH_BUCKET_SHARE_DF;
307 	} else if (priority > MAXPRI_THROTTLE) {
308 		bucket = TH_BUCKET_SHARE_UT;
309 	} else {
310 		bucket = TH_BUCKET_SHARE_BG;
311 	}
312 	return bucket;
313 }
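
/*
 * Worked example of the mapping above, assuming the usual XNU base priorities
 * (BASEPRI_USER_INITIATED 37, BASEPRI_DEFAULT 31, BASEPRI_UTILITY 20,
 * MAXPRI_THROTTLE 4); the numeric values are quoted for illustration only:
 *
 *	priority > 37     -> TH_BUCKET_SHARE_FG
 *	priority 32..37   -> TH_BUCKET_SHARE_IN
 *	priority 21..31   -> TH_BUCKET_SHARE_DF
 *	priority  5..20   -> TH_BUCKET_SHARE_UT
 *	priority <= 4     -> TH_BUCKET_SHARE_BG
 */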
314 
315 /*
316  * sched_clutch_thread_bucket_map()
317  *
318  * Map a thread to a scheduling bucket for the clutch/edge scheduler
319  * based on its scheduling mode and the priority attribute passed in.
320  */
321 static sched_bucket_t
322 sched_clutch_thread_bucket_map(thread_t thread, int pri)
323 {
324 	switch (thread->sched_mode) {
325 	case TH_MODE_FIXED:
326 		if (pri >= BASEPRI_FOREGROUND) {
327 			return TH_BUCKET_FIXPRI;
328 		} else {
329 			return sched_convert_pri_to_bucket(pri);
330 		}
331 
332 	case TH_MODE_REALTIME:
333 		return TH_BUCKET_FIXPRI;
334 
335 	case TH_MODE_TIMESHARE:
336 		return sched_convert_pri_to_bucket(pri);
337 
338 	default:
339 		panic("unexpected mode: %d", thread->sched_mode);
340 		break;
341 	}
342 }
343 
344 /*
345  * The clutch scheduler attempts to age out the CPU usage of clutch bucket groups
346  * based on the amount of time they have been pending and the load at that
347  * scheduling bucket level. Since the clutch bucket groups are global (i.e. they span
348  * multiple clusters), it is important to also keep the load as a global counter.
349  */
350 static uint32_t _Atomic sched_clutch_global_bucket_load[TH_BUCKET_SCHED_MAX];
351 
352 /*
353  * sched_clutch_root_init()
354  *
355  * Routine to initialize the scheduler hierarchy root.
356  */
357 static void
358 sched_clutch_root_init(
359 	sched_clutch_root_t root_clutch,
360 	processor_set_t pset)
361 {
362 	root_clutch->scr_thr_count = 0;
363 	root_clutch->scr_priority = NOPRI;
364 	root_clutch->scr_urgency = 0;
365 	root_clutch->scr_pset = pset;
366 #if CONFIG_SCHED_EDGE
367 	root_clutch->scr_cluster_id = pset->pset_cluster_id;
368 #else /* CONFIG_SCHED_EDGE */
369 	root_clutch->scr_cluster_id = 0;
370 #endif /* CONFIG_SCHED_EDGE */
371 
372 	for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
373 		root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type] = 0;
374 	}
375 	/* Initialize the queue which maintains all runnable clutch_buckets for timesharing purposes */
376 	queue_init(&root_clutch->scr_clutch_buckets);
377 
378 	/* Initialize the priority queue which maintains all runnable foreign clutch buckets */
379 	priority_queue_init(&root_clutch->scr_foreign_buckets);
380 	bzero(&root_clutch->scr_cumulative_run_count, sizeof(root_clutch->scr_cumulative_run_count));
381 	bitmap_zero(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
382 	bitmap_zero(root_clutch->scr_bound_warp_available, TH_BUCKET_SCHED_MAX);
383 	priority_queue_init(&root_clutch->scr_bound_root_buckets);
384 
385 	/* Initialize the bitmap and priority queue of runnable root buckets */
386 	priority_queue_init(&root_clutch->scr_unbound_root_buckets);
387 	bitmap_zero(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
388 	bitmap_zero(root_clutch->scr_unbound_warp_available, TH_BUCKET_SCHED_MAX);
389 
390 	/* Initialize all the root buckets */
391 	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
392 		sched_clutch_root_bucket_init(&root_clutch->scr_unbound_buckets[i], i, false);
393 		sched_clutch_root_bucket_init(&root_clutch->scr_bound_buckets[i], i, true);
394 	}
395 }
396 
397 /*
398  * Clutch Bucket Runqueues
399  *
400  * The clutch buckets are maintained in a runq at the root bucket level. The
401  * runq organization allows clutch buckets to be ordered based on various
402  * factors such as:
403  *
404  * - Clutch buckets are round robin'ed at the same priority level when a
405  *   thread is selected from a clutch bucket. This prevents a clutch bucket
406  *   from starving out other clutch buckets at the same priority.
407  *
408  * - Clutch buckets are inserted at the head when they become runnable due to
409  *   thread preemption. This allows threads that were preempted to maintain
410  *   their order in the queue.
411  */
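
/*
 * A condensed sketch of how the two policies above map onto the runq
 * operations defined below (call sites illustrative):
 *
 *	/* Same-priority round robin once a thread has been picked from the bucket */
 *	sched_clutch_bucket_runq_rotate(clutch_buckets_rq, clutch_bucket);
 *
 *	/* Head insertion when the bucket became runnable because of a preemption */
 *	sched_clutch_bucket_runq_enqueue(clutch_buckets_rq, clutch_bucket,
 *	    SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ);
 */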
412 
413 /*
414  * sched_clutch_bucket_runq_init()
415  *
416  * Initialize a clutch bucket runq.
417  */
418 static void
419 sched_clutch_bucket_runq_init(
420 	sched_clutch_bucket_runq_t clutch_buckets_rq)
421 {
422 	clutch_buckets_rq->scbrq_highq = NOPRI;
423 	for (uint8_t i = 0; i < BITMAP_LEN(NRQS); i++) {
424 		clutch_buckets_rq->scbrq_bitmap[i] = 0;
425 	}
426 	clutch_buckets_rq->scbrq_count = 0;
427 	for (int i = 0; i < NRQS; i++) {
428 		circle_queue_init(&clutch_buckets_rq->scbrq_queues[i]);
429 	}
430 }
431 
432 /*
433  * sched_clutch_bucket_runq_empty()
434  *
435  * Returns whether a clutch bucket runq is empty.
436  */
437 static boolean_t
438 sched_clutch_bucket_runq_empty(
439 	sched_clutch_bucket_runq_t clutch_buckets_rq)
440 {
441 	return clutch_buckets_rq->scbrq_count == 0;
442 }
443 
444 /*
445  * sched_clutch_bucket_runq_peek()
446  *
447  * Returns the highest priority clutch bucket in the runq.
448  */
449 static sched_clutch_bucket_t
450 sched_clutch_bucket_runq_peek(
451 	sched_clutch_bucket_runq_t clutch_buckets_rq)
452 {
453 	if (clutch_buckets_rq->scbrq_count > 0) {
454 		circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_buckets_rq->scbrq_highq];
455 		return cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink);
456 	} else {
457 		return NULL;
458 	}
459 }
460 
461 /*
462  * sched_clutch_bucket_runq_enqueue()
463  *
464  * Enqueue a clutch bucket into the runq based on the options passed in.
465  */
466 static void
467 sched_clutch_bucket_runq_enqueue(
468 	sched_clutch_bucket_runq_t clutch_buckets_rq,
469 	sched_clutch_bucket_t clutch_bucket,
470 	sched_clutch_bucket_options_t options)
471 {
472 	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
473 	if (circle_queue_empty(queue)) {
474 		circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
475 		bitmap_set(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
476 		if (clutch_bucket->scb_priority > clutch_buckets_rq->scbrq_highq) {
477 			clutch_buckets_rq->scbrq_highq = clutch_bucket->scb_priority;
478 		}
479 	} else {
480 		if (options & SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ) {
481 			circle_enqueue_head(queue, &clutch_bucket->scb_runqlink);
482 		} else {
483 			/*
484 			 * Default behavior (handles SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ &
485 			 * SCHED_CLUTCH_BUCKET_OPTIONS_NONE)
486 			 */
487 			circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
488 		}
489 	}
490 	clutch_buckets_rq->scbrq_count++;
491 }
492 
493 /*
494  * sched_clutch_bucket_runq_remove()
495  *
496  * Remove a clutch bucket from the runq.
497  */
498 static void
499 sched_clutch_bucket_runq_remove(
500 	sched_clutch_bucket_runq_t clutch_buckets_rq,
501 	sched_clutch_bucket_t clutch_bucket)
502 {
503 	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
504 	circle_dequeue(queue, &clutch_bucket->scb_runqlink);
505 	assert(clutch_buckets_rq->scbrq_count > 0);
506 	clutch_buckets_rq->scbrq_count--;
507 	if (circle_queue_empty(queue)) {
508 		bitmap_clear(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
509 		clutch_buckets_rq->scbrq_highq = bitmap_first(clutch_buckets_rq->scbrq_bitmap, NRQS);
510 	}
511 }
512 
513 static void
514 sched_clutch_bucket_runq_rotate(
515 	sched_clutch_bucket_runq_t clutch_buckets_rq,
516 	sched_clutch_bucket_t clutch_bucket)
517 {
518 	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
519 	assert(clutch_bucket == cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink));
520 	circle_queue_rotate_head_forward(queue);
521 }
522 
523 /*
524  * sched_clutch_root_bucket_init()
525  *
526  * Routine to initialize root buckets.
527  */
528 static void
529 sched_clutch_root_bucket_init(
530 	sched_clutch_root_bucket_t root_bucket,
531 	sched_bucket_t bucket,
532 	bool bound_root_bucket)
533 {
534 	root_bucket->scrb_bucket = bucket;
535 	if (bound_root_bucket) {
536 		/* For bound root buckets, initialize the bound thread runq. */
537 		root_bucket->scrb_bound = true;
538 		run_queue_init(&root_bucket->scrb_bound_thread_runq);
539 	} else {
540 		/*
541 		 * The unbounded root buckets contain a runq of runnable clutch buckets
542 		 * which then hold the runnable threads.
543 		 */
544 		root_bucket->scrb_bound = false;
545 		sched_clutch_bucket_runq_init(&root_bucket->scrb_clutch_buckets);
546 	}
547 	priority_queue_entry_init(&root_bucket->scrb_pqlink);
548 	root_bucket->scrb_pqlink.deadline = SCHED_CLUTCH_INVALID_TIME_64;
549 	root_bucket->scrb_warped_deadline = 0;
550 	root_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[root_bucket->scrb_bucket];
551 	root_bucket->scrb_starvation_avoidance = false;
552 	root_bucket->scrb_starvation_ts = 0;
553 }
554 
555 /*
556  * Special case scheduling for Above UI bucket.
557  *
558  * AboveUI threads are typically system critical threads that need low latency
559  * which is why they are handled specially.
560  *
561  * Since the priority ranges for the AboveUI and FG Timeshare buckets overlap, it is
562  * important to maintain some native priority order between those buckets. For unbounded
563  * root buckets, the policy is to compare the highest clutch buckets of the two; if the
564  * Above UI bucket is higher, schedule it immediately. Otherwise fall through to the
565  * deadline based scheduling which should pick up the timeshare buckets. For the bound
566  * case, the policy simply compares the priority of the highest runnable threads in
567  * the above UI and timeshare buckets.
568  *
569  * The implementation allows extremely low latency CPU access for Above UI threads
570  * while supporting the use case of high priority timeshare threads contending with
571  * lower priority fixed priority threads.
572  */
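
/*
 * Condensed decision table for the unbound helper below (illustrative):
 *
 *	AboveUI runnable?  FG runnable?  highest AboveUI pri >= highest FG pri?  -> select AboveUI?
 *	no                 -             -                                          no
 *	yes                no            -                                          yes
 *	yes                yes           yes                                        yes
 *	yes                yes           no                                         no (fall through to EDF)
 */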
573 
574 
575 /*
576  * sched_clutch_root_unbound_select_aboveui()
577  *
578  * Routine to determine if the above UI unbounded bucket should be selected for execution.
579  */
580 static bool
581 sched_clutch_root_unbound_select_aboveui(
582 	sched_clutch_root_t root_clutch)
583 {
584 	if (bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_FIXPRI)) {
585 		sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
586 		sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_unbound_buckets[TH_BUCKET_SHARE_FG];
587 		if (!bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SHARE_FG)) {
588 			/* If the timeshare FG bucket is not runnable, pick the aboveUI bucket for scheduling */
589 			return true;
590 		}
591 		sched_clutch_bucket_t clutch_bucket_aboveui = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_aboveui);
592 		sched_clutch_bucket_t clutch_bucket_sharefg = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_sharefg);
593 		if (clutch_bucket_aboveui->scb_priority >= clutch_bucket_sharefg->scb_priority) {
594 			return true;
595 		}
596 	}
597 	return false;
598 }
599 
600 /*
601  * sched_clutch_root_bound_select_aboveui()
602  *
603  * Routine to determine if the above UI bounded bucket should be selected for execution.
604  */
605 static bool
606 sched_clutch_root_bound_select_aboveui(
607 	sched_clutch_root_t root_clutch)
608 {
609 	sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
610 	sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_bound_buckets[TH_BUCKET_SHARE_FG];
611 	if (root_bucket_aboveui->scrb_bound_thread_runq.count == 0) {
612 		return false;
613 	}
614 	return root_bucket_aboveui->scrb_bound_thread_runq.highq >= root_bucket_sharefg->scrb_bound_thread_runq.highq;
615 }
616 
617 /*
618  * sched_clutch_root_highest_root_bucket()
619  *
620  * Main routine to find the highest runnable root level bucket.
621  * This routine is called from performance sensitive contexts; so it is
622  * crucial to keep this O(1). The options parameter determines if
623  * the selection logic should look at unbounded threads only (for
624  * cross-cluster stealing operations) or both bounded and unbounded
625  * threads (for selecting next thread for execution on current cluster).
626  */
627 static sched_clutch_root_bucket_t
628 sched_clutch_root_highest_root_bucket(
629 	sched_clutch_root_t root_clutch,
630 	uint64_t timestamp,
631 	sched_clutch_highest_root_bucket_type_t type)
632 {
633 	sched_clutch_hierarchy_locked_assert(root_clutch);
634 	int highest_runnable_bucket = -1;
635 	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
636 		highest_runnable_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
637 	} else {
638 		int highest_unbound_runnable_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
639 		int highest_bound_runnable_bucket = bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
640 		highest_runnable_bucket = (highest_bound_runnable_bucket != -1) ? ((highest_unbound_runnable_bucket != -1) ? MIN(highest_bound_runnable_bucket, highest_unbound_runnable_bucket) : highest_bound_runnable_bucket) : highest_unbound_runnable_bucket;
641 	}
642 
643 	if (highest_runnable_bucket == -1) {
644 		return NULL;
645 	}
646 
647 	/* Above UI root bucket selection (see comment above for more details on this special case handling) */
648 	bool unbound_aboveui = sched_clutch_root_unbound_select_aboveui(root_clutch);
649 	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
650 		if (unbound_aboveui) {
651 			return &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
652 		}
653 		/* Fall through to selecting a timeshare root bucket */
654 	} else {
655 		bool bound_aboveui = sched_clutch_root_bound_select_aboveui(root_clutch);
656 		sched_clutch_root_bucket_t unbound_aboveui_root_bucket = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
657 		sched_clutch_root_bucket_t bound_aboveui_root_bucket = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
658 
659 		if (unbound_aboveui && bound_aboveui) {
660 			/*
661 			 * In this scenario both the bounded and unbounded above UI buckets are runnable; choose based on the
662 			 * highest runnable priority in both the buckets.
663 			 */
664 			int bound_aboveui_pri = root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI].scrb_bound_thread_runq.highq;
665 			sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(unbound_aboveui_root_bucket);
666 			int unbound_aboveui_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
667 			return (bound_aboveui_pri >= unbound_aboveui_pri) ? bound_aboveui_root_bucket : unbound_aboveui_root_bucket;
668 		}
669 		if (unbound_aboveui) {
670 			return unbound_aboveui_root_bucket;
671 		}
672 		if (bound_aboveui) {
673 			return bound_aboveui_root_bucket;
674 		}
675 		/* Fall through to selecting a timeshare root bucket */
676 	}
677 
678 	/*
679 	 * Above UI bucket is not runnable or has a low priority runnable thread; use the
680 	 * earliest deadline model to schedule threads. The idea is that as the timeshare
681 	 * buckets use CPU, they will drop their interactivity score/sched priority and
682 	 * allow the low priority AboveUI buckets to be scheduled.
683 	 */
684 
685 	/* Find the earliest deadline bucket */
686 	sched_clutch_root_bucket_t edf_bucket = NULL;
687 	sched_clutch_root_bucket_t warp_bucket = NULL;
688 	int warp_bucket_index = -1;
689 
690 evaluate_root_buckets:
691 	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
692 		edf_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
693 	} else {
694 		sched_clutch_root_bucket_t unbound_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
695 		sched_clutch_root_bucket_t bound_bucket = priority_queue_min(&root_clutch->scr_bound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
696 		if (bound_bucket && unbound_bucket) {
697 			/* If bound and unbound root buckets are runnable, select the one with the earlier deadline */
698 			edf_bucket = (bound_bucket->scrb_pqlink.deadline <= unbound_bucket->scrb_pqlink.deadline) ? bound_bucket : unbound_bucket;
699 		} else {
700 			edf_bucket = (bound_bucket) ? bound_bucket : unbound_bucket;
701 		}
702 	}
703 	/*
704 	 * Check if any of the buckets have warp available. The implementation only allows root buckets to warp ahead of
705 	 * buckets of the same type (i.e. bound/unbound). The reason for doing that is because warping is a concept that
706 	 * makes sense between root buckets of the same type since it is effectively a scheduling advantage over a lower
707 	 * QoS root bucket.
708 	 */
709 	bitmap_t *warp_available_bitmap = (edf_bucket->scrb_bound) ? (root_clutch->scr_bound_warp_available) : (root_clutch->scr_unbound_warp_available);
710 	warp_bucket_index = bitmap_lsb_first(warp_available_bitmap, TH_BUCKET_SCHED_MAX);
711 
712 	if ((warp_bucket_index == -1) || (warp_bucket_index >= edf_bucket->scrb_bucket)) {
713 		/* No higher buckets have warp left; best choice is the EDF based bucket */
714 		if (edf_bucket->scrb_starvation_avoidance) {
715 			/*
716 			 * Indicates that the earliest deadline bucket is in starvation avoidance mode. Check to see if the
717 			 * starvation avoidance window is still open and return this bucket if it is.
718 			 *
719 			 * The starvation avoidance window is calculated based on the quantum of threads at that bucket and
720 			 * the number of CPUs in the cluster. The idea is to basically provide one quantum worth of starvation
721 			 * avoidance across all CPUs.
722 			 */
723 			uint64_t starvation_window = sched_clutch_thread_quantum[edf_bucket->scrb_bucket] / pset_available_cpu_count(root_clutch->scr_pset);
724 			if (timestamp < (edf_bucket->scrb_starvation_ts + starvation_window)) {
725 				return edf_bucket;
726 			} else {
727 				/* Starvation avoidance window is over; update deadline and re-evaluate EDF */
728 				edf_bucket->scrb_starvation_avoidance = false;
729 				edf_bucket->scrb_starvation_ts = 0;
730 				sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
731 			}
732 			goto evaluate_root_buckets;
733 		}
734 
735 		/* Looks like the EDF bucket is not in starvation avoidance mode; check if it should be */
736 		if (highest_runnable_bucket < edf_bucket->scrb_bucket) {
737 			/* Since a higher bucket is runnable, it indicates that the EDF bucket should be in starvation avoidance */
738 			edf_bucket->scrb_starvation_avoidance = true;
739 			edf_bucket->scrb_starvation_ts = timestamp;
740 		} else {
741 			/* EDF bucket is being selected in the natural order; update deadline and reset warp */
742 			sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp);
743 			edf_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[edf_bucket->scrb_bucket];
744 			edf_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
745 			if (edf_bucket->scrb_bound) {
746 				bitmap_set(root_clutch->scr_bound_warp_available, edf_bucket->scrb_bucket);
747 			} else {
748 				bitmap_set(root_clutch->scr_unbound_warp_available, edf_bucket->scrb_bucket);
749 			}
750 		}
751 		return edf_bucket;
752 	}
753 
754 	/*
755 	 * Looks like there is a root bucket which is higher in the natural priority
756 	 * order than edf_bucket and might have some warp remaining.
757 	 */
758 	warp_bucket = (edf_bucket->scrb_bound) ? &root_clutch->scr_bound_buckets[warp_bucket_index] : &root_clutch->scr_unbound_buckets[warp_bucket_index];
759 	if (warp_bucket->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
760 		/* Root bucket has not used any of its warp; set a deadline to expire its warp and return it */
761 		warp_bucket->scrb_warped_deadline = timestamp + warp_bucket->scrb_warp_remaining;
762 		sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
763 		return warp_bucket;
764 	}
765 	if (warp_bucket->scrb_warped_deadline > timestamp) {
766 		/* Root bucket already has a warp window open with some warp remaining */
767 		sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp);
768 		return warp_bucket;
769 	}
770 
771 	/* For this bucket, the warp window was opened sometime in the past but has now
772 	 * expired. Mark the bucket as not available for warp anymore and re-run the
773 	 * warp bucket selection logic.
774 	 */
775 	warp_bucket->scrb_warp_remaining = 0;
776 	if (warp_bucket->scrb_bound) {
777 		bitmap_clear(root_clutch->scr_bound_warp_available, warp_bucket->scrb_bucket);
778 	} else {
779 		bitmap_clear(root_clutch->scr_unbound_warp_available, warp_bucket->scrb_bucket);
780 	}
781 	goto evaluate_root_buckets;
782 }
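
/*
 * Worked example of the starvation avoidance window used above (numbers
 * illustrative): if a BG root bucket (2ms thread quantum) reaches its EDF
 * deadline on a cluster with 4 available CPUs while a higher bucket is also
 * runnable, it keeps getting selected for
 *
 *	starvation_window = sched_clutch_thread_quantum[TH_BUCKET_SHARE_BG] / 4 = 500us
 *
 * before its deadline is refreshed and the EDF/warp evaluation is re-run, which
 * gives the starved bucket roughly one thread quantum of CPU across the cluster.
 */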
783 
784 /*
785  * sched_clutch_root_bucket_deadline_calculate()
786  *
787  * Calculate the deadline for the bucket based on its WCEL
788  */
789 static uint64_t
790 sched_clutch_root_bucket_deadline_calculate(
791 	sched_clutch_root_bucket_t root_bucket,
792 	uint64_t timestamp)
793 {
794 	/* For fixpri AboveUI bucket always return it as the earliest deadline */
795 	if (root_bucket->scrb_bucket < TH_BUCKET_SHARE_FG) {
796 		return 0;
797 	}
798 
799 	/* For all timeshare buckets set the deadline as current time + worst-case-execution-latency */
800 	return timestamp + sched_clutch_root_bucket_wcel[root_bucket->scrb_bucket];
801 }
802 
803 /*
804  * sched_clutch_root_bucket_deadline_update()
805  *
806  * Routine to update the deadline of the root bucket when it is selected.
807  * Updating the deadline also moves the root_bucket in the EDF priority
808  * queue.
809  */
810 static void
811 sched_clutch_root_bucket_deadline_update(
812 	sched_clutch_root_bucket_t root_bucket,
813 	sched_clutch_root_t root_clutch,
814 	uint64_t timestamp)
815 {
816 	if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
817 		/* The algorithm never uses the deadlines for scheduling TH_BUCKET_FIXPRI bucket */
818 		return;
819 	}
820 
821 	uint64_t old_deadline = root_bucket->scrb_pqlink.deadline;
822 	uint64_t new_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
823 	if (__improbable(old_deadline > new_deadline)) {
824 		panic("old_deadline (%llu) > new_deadline (%llu); root_bucket (%d); timestamp (%llu)", old_deadline, new_deadline, root_bucket->scrb_bucket, timestamp);
825 	}
826 	if (old_deadline != new_deadline) {
827 		root_bucket->scrb_pqlink.deadline = new_deadline;
828 		struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
829 		priority_queue_entry_increased(prioq, &root_bucket->scrb_pqlink);
830 	}
831 }
832 
833 /*
834  * sched_clutch_root_bucket_runnable()
835  *
836  * Routine to insert a newly runnable root bucket into the hierarchy.
837  * Also updates the deadline and warp parameters as necessary.
838  */
839 static void
840 sched_clutch_root_bucket_runnable(
841 	sched_clutch_root_bucket_t root_bucket,
842 	sched_clutch_root_t root_clutch,
843 	uint64_t timestamp)
844 {
845 	/* Mark the root bucket as runnable */
846 	bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
847 	bitmap_set(runnable_bitmap, root_bucket->scrb_bucket);
848 
849 	if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
850 		/* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
851 		return;
852 	}
853 
854 	if (root_bucket->scrb_starvation_avoidance == false) {
855 		/*
856 		 * Only update the deadline if the bucket was not in starvation avoidance mode. If the bucket was in
857 		 * starvation avoidance and its window has expired, the highest root bucket selection logic will notice
858 		 * that and fix it up.
859 		 */
860 		root_bucket->scrb_pqlink.deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
861 	}
862 	struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
863 	priority_queue_insert(prioq, &root_bucket->scrb_pqlink);
864 	if (root_bucket->scrb_warp_remaining) {
865 		/* Since the bucket has some warp remaining and is now runnable, mark it as available for warp */
866 		bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
867 		bitmap_set(warp_bitmap, root_bucket->scrb_bucket);
868 	}
869 }
870 
871 /*
872  * sched_clutch_root_bucket_empty()
873  *
874  * Routine to remove an empty root bucket from the hierarchy.
875  * Also updates the deadline and warp parameters as necessary.
876  */
877 static void
878 sched_clutch_root_bucket_empty(
879 	sched_clutch_root_bucket_t root_bucket,
880 	sched_clutch_root_t root_clutch,
881 	uint64_t timestamp)
882 {
883 	bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
884 	bitmap_clear(runnable_bitmap, root_bucket->scrb_bucket);
885 
886 	if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) {
887 		/* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
888 		return;
889 	}
890 
891 	struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
892 	priority_queue_remove(prioq, &root_bucket->scrb_pqlink);
893 
894 	bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
895 	bitmap_clear(warp_bitmap, root_bucket->scrb_bucket);
896 
897 	if (root_bucket->scrb_warped_deadline != SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
898 		if (root_bucket->scrb_warped_deadline > timestamp) {
899 			/*
900 			 * For root buckets that were using the warp, check if the warp
901 			 * deadline is in the future. If yes, remove the wall time the
902 			 * warp was active and update the warp remaining. This allows
903 			 * the root bucket to use the remaining warp the next time it
904 			 * becomes runnable.
905 			 */
906 			root_bucket->scrb_warp_remaining = root_bucket->scrb_warped_deadline - timestamp;
907 		} else {
908 			/*
909 			 * If the root bucket's warped deadline is in the past, it has used up
910 			 * all the warp it was assigned. Empty out its warp remaining.
911 			 */
912 			root_bucket->scrb_warp_remaining = 0;
913 		}
914 	}
915 }
916 
917 static int
918 sched_clutch_global_bucket_load_get(
919 	sched_bucket_t bucket)
920 {
921 	return (int)os_atomic_load(&sched_clutch_global_bucket_load[bucket], relaxed);
922 }
923 
924 /*
925  * sched_clutch_root_pri_update()
926  *
927  * The root level priority is used for thread selection and preemption
928  * logic.
929  *
930  * The logic uses the same decision as thread selection for deciding between the
931  * above UI and timeshare buckets. If one of the timesharing buckets has to be
932  * used for priority calculation, the logic is slightly different from thread
933  * selection, because thread selection considers deadlines, warps etc. to
934  * decide the most optimal bucket at a given timestamp. Since the priority
935  * value is used for preemption decisions only, it needs to be based on the
936  * highest runnable thread available in the timeshare domain. This logic can
937  * be made more sophisticated if there are cases of unnecessary preemption
938  * being seen in workloads.
939  */
940 static void
941 sched_clutch_root_pri_update(
942 	sched_clutch_root_t root_clutch)
943 {
944 	sched_clutch_hierarchy_locked_assert(root_clutch);
945 	int16_t root_bound_pri = NOPRI;
946 	int16_t root_unbound_pri = NOPRI;
947 
948 	if (bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
949 		goto root_pri_update_unbound;
950 	}
951 	sched_clutch_root_bucket_t root_bucket_bound = NULL;
952 	if (sched_clutch_root_bound_select_aboveui(root_clutch)) {
953 		root_bucket_bound = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
954 	} else {
955 		int root_bucket_index = bitmap_lsb_next(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
956 		assert(root_bucket_index != -1);
957 		root_bucket_bound = &root_clutch->scr_bound_buckets[root_bucket_index];
958 	}
959 	root_bound_pri = root_bucket_bound->scrb_bound_thread_runq.highq;
960 
961 root_pri_update_unbound:
962 	if (bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
963 		goto root_pri_update_complete;
964 	}
965 	sched_clutch_root_bucket_t root_bucket_unbound = NULL;
966 	if (sched_clutch_root_unbound_select_aboveui(root_clutch)) {
967 		root_bucket_unbound = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
968 	} else {
969 		int root_bucket_index = bitmap_lsb_next(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
970 		assert(root_bucket_index != -1);
971 		root_bucket_unbound = &root_clutch->scr_unbound_buckets[root_bucket_index];
972 	}
973 	/* For the selected root bucket, find the highest priority clutch bucket */
974 	sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket_unbound);
975 	root_unbound_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
976 
977 root_pri_update_complete:
978 	root_clutch->scr_priority = MAX(root_bound_pri, root_unbound_pri);
979 }
980 
981 /*
982  * sched_clutch_root_urgency_inc()
983  *
984  * Routine to increment the urgency at the root level based on the thread
985  * priority that is being inserted into the hierarchy. The root urgency
986  * counter is updated based on the urgency of threads in any of the
987  * clutch buckets which are part of the hierarchy.
988  *
989  * Always called with the pset lock held.
990  */
991 static void
992 sched_clutch_root_urgency_inc(
993 	sched_clutch_root_t root_clutch,
994 	thread_t thread)
995 {
996 	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
997 		root_clutch->scr_urgency++;
998 	}
999 }
1000 
1001 /*
1002  * sched_clutch_root_urgency_dec()
1003  *
1004  * Routine to decrement the urgency at the root level based on the thread
1005  * priority that is being removed from the hierarchy. The root urgency
1006  * counter is updated based on the urgency of threads in any of the
1007  * clutch buckets which are part of the hierarchy.
1008  *
1009  * Always called with the pset lock held.
1010  */
1011 static void
1012 sched_clutch_root_urgency_dec(
1013 	sched_clutch_root_t root_clutch,
1014 	thread_t thread)
1015 {
1016 	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
1017 		root_clutch->scr_urgency--;
1018 	}
1019 }
1020 
1021 /*
1022  * Clutch bucket level scheduling
1023  *
1024  * The second level of scheduling is the clutch bucket level scheduling
1025  * which tries to schedule thread groups within root_buckets. Each
1026  * clutch represents a thread group and a clutch_bucket_group represents
1027  * threads at a particular sched_bucket within that thread group. The
1028  * clutch_bucket_group contains a clutch_bucket per cluster on the system
1029  * where it holds the runnable threads destined for execution on that
1030  * cluster.
1031  *
1032  * The goal of this level of scheduling is to allow interactive thread
1033  * groups low latency access to the CPU. It also provides slight
1034  * scheduling preference for App and unrestricted thread groups.
1035  *
1036  * The clutch bucket scheduling algorithm measures an interactivity
1037  * score for all clutch bucket groups. The interactivity score is based
1038  * on the ratio of the CPU used and the voluntary blocking of threads
1039  * within the clutch bucket group. The algorithm is very close to the ULE
1040  * scheduler on FreeBSD in terms of calculations. The interactivity
1041  * score provides an interactivity boost in the range of
1042  * [0:SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI * 2] which allows interactive
1043  * thread groups to win over CPU spinners.
1044  *
1045  * The interactivity score of the clutch bucket group is combined with the
1046  * highest base/promoted priority of threads in the clutch bucket to form
1047  * the overall priority of the clutch bucket.
1048  */
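
/*
 * A rough sketch of how the boost feeds into the clutch bucket priority (the
 * exact calculation and clamping live in sched_clutch_bucket_pri_calculate();
 * names here are illustrative):
 *
 *	boost = interactivity_score;		/* 0 .. 2 * interactive_pri */
 *	scb_priority = highest_runnable_thread_pri + boost;
 *
 * so at the same QoS, a thread group that voluntarily blocks often (high score)
 * is preferred over a CPU-bound thread group (low score) when picking the next
 * clutch bucket.
 */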
1049 
1050 /* Priority boost range for interactivity */
1051 #define SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT     (8)
1052 uint8_t sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
1053 
1054 /* Window used to scale the cpu usage and blocked values (currently 500ms). It is the threshold of used+blocked */
1055 uint64_t sched_clutch_bucket_group_adjust_threshold = 0;
1056 #define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS      (500000)
1057 
1058 /* The ratio to scale the cpu/blocked time per window */
1059 #define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO                (10)
1060 
1061 /* Initial value for voluntary blocking time for the clutch_bucket */
1062 #define SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID          (uint64_t)(~0)
1063 
1064 /* Value indicating the clutch bucket is not pending execution */
1065 #define SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID             ((uint64_t)(~0))
1066 
1067 /*
1068  * Thread group CPU starvation avoidance
1069  *
1070  * In heavily CPU contended scenarios, it is possible that some thread groups
1071  * which have a low interactivity score do not get CPU time at all. In order to
1072  * resolve that, the scheduler tries to age out the CPU usage of the clutch
1073  * bucket group when it has been pending execution for a certain time as defined
1074  * by the sched_clutch_bucket_group_pending_delta_us values below.
1075  *
1076  * The values chosen here are very close to the WCEL values for each sched bucket.
1077  * These values are multiplied by the load average of the relevant root bucket to
1078  * provide an estimate of the actual clutch bucket load.
1079  */
1080 static uint32_t sched_clutch_bucket_group_pending_delta_us[TH_BUCKET_SCHED_MAX] = {
1081 	SCHED_CLUTCH_INVALID_TIME_32,           /* FIXPRI */
1082 	10000,                                  /* FG */
1083 	37500,                                  /* IN */
1084 	75000,                                  /* DF */
1085 	150000,                                 /* UT */
1086 	250000,                                 /* BG */
1087 };
1088 static uint64_t sched_clutch_bucket_group_pending_delta[TH_BUCKET_SCHED_MAX] = {0};
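
/*
 * Sketch of how the pending deltas above are applied (see
 * sched_clutch_bucket_group_pending_ageout(); names illustrative):
 *
 *	pending_delta = sched_clutch_bucket_group_pending_delta[bucket] *
 *	    sched_clutch_global_bucket_load_get(bucket);
 *	if (timestamp > (pending_ts + pending_delta)) {
 *		/* age out recorded CPU usage so the interactivity score
 *		 * (and hence the clutch bucket priority) can recover */
 *	}
 */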
1089 
1090 /*
1091  * sched_clutch_bucket_init()
1092  *
1093  * Initializer for clutch buckets.
1094  */
1095 static void
1096 sched_clutch_bucket_init(
1097 	sched_clutch_bucket_t clutch_bucket,
1098 	sched_clutch_bucket_group_t clutch_bucket_group,
1099 	sched_bucket_t bucket)
1100 {
1101 	clutch_bucket->scb_bucket = bucket;
1102 	/* scb_priority will be recalculated when a thread is inserted in the clutch bucket */
1103 	clutch_bucket->scb_priority = 0;
1104 #if CONFIG_SCHED_EDGE
1105 	clutch_bucket->scb_foreign = false;
1106 	priority_queue_entry_init(&clutch_bucket->scb_foreignlink);
1107 #endif /* CONFIG_SCHED_EDGE */
1108 	clutch_bucket->scb_group = clutch_bucket_group;
1109 	clutch_bucket->scb_root = NULL;
1110 	priority_queue_init(&clutch_bucket->scb_clutchpri_prioq);
1111 	priority_queue_init(&clutch_bucket->scb_thread_runq);
1112 	queue_init(&clutch_bucket->scb_thread_timeshare_queue);
1113 }
1114 
1115 /*
1116  * sched_clutch_bucket_group_init()
1117  *
1118  * Initializer for clutch bucket groups.
1119  */
1120 static void
1121 sched_clutch_bucket_group_init(
1122 	sched_clutch_bucket_group_t clutch_bucket_group,
1123 	sched_clutch_t clutch,
1124 	sched_bucket_t bucket)
1125 {
1126 	bzero(clutch_bucket_group, sizeof(struct sched_clutch_bucket_group));
1127 	clutch_bucket_group->scbg_bucket = bucket;
1128 	clutch_bucket_group->scbg_clutch = clutch;
1129 
1130 	int max_clusters = ml_get_cluster_count();
1131 	clutch_bucket_group->scbg_clutch_buckets = kalloc_type(struct sched_clutch_bucket, max_clusters, Z_WAITOK | Z_ZERO);
1132 	for (int i = 0; i < max_clusters; i++) {
1133 		sched_clutch_bucket_init(&clutch_bucket_group->scbg_clutch_buckets[i], clutch_bucket_group, bucket);
1134 	}
1135 
1136 	os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, 0, relaxed);
1137 	os_atomic_store(&clutch_bucket_group->scbg_pri_shift, INT8_MAX, relaxed);
1138 	os_atomic_store(&clutch_bucket_group->scbg_preferred_cluster, pset0.pset_cluster_id, relaxed);
1139 	/*
1140 	 * All thread groups should be initialized to be interactive; this allows the newly launched
1141 	 * thread groups to fairly compete with already running thread groups.
1142 	 */
1143 	clutch_bucket_group->scbg_interactivity_data.scct_count = (sched_clutch_bucket_group_interactive_pri * 2);
1144 	clutch_bucket_group->scbg_interactivity_data.scct_timestamp = 0;
1145 	os_atomic_store(&clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked, (clutch_cpu_data_t)sched_clutch_bucket_group_adjust_threshold, relaxed);
1146 	clutch_bucket_group->scbg_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
1147 	clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
1148 	clutch_bucket_group->scbg_amp_rebalance_last_chosen = UINT32_MAX;
1149 }
1150 
1151 static void
1152 sched_clutch_bucket_group_destroy(
1153 	sched_clutch_bucket_group_t clutch_bucket_group)
1154 {
1155 	kfree_type(struct sched_clutch_bucket, ml_get_cluster_count(),
1156 	    clutch_bucket_group->scbg_clutch_buckets);
1157 }
1158 
1159 /*
1160  * sched_clutch_init_with_thread_group()
1161  *
1162  * Initialize the sched_clutch when the thread group is being created
1163  */
1164 void
1165 sched_clutch_init_with_thread_group(
1166 	sched_clutch_t clutch,
1167 	struct thread_group *tg)
1168 {
1169 	os_atomic_store(&clutch->sc_thr_count, 0, relaxed);
1170 
1171 	/* Initialize all the clutch buckets */
1172 	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
1173 		sched_clutch_bucket_group_init(&(clutch->sc_clutch_groups[i]), clutch, i);
1174 	}
1175 
1176 	/* Grouping specific fields */
1177 	clutch->sc_tg = tg;
1178 }
1179 
1180 /*
1181  * sched_clutch_destroy()
1182  *
1183  * Destructor for clutch; called from thread group release code.
1184  */
1185 void
1186 sched_clutch_destroy(
1187 	sched_clutch_t clutch)
1188 {
1189 	assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0);
1190 	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
1191 		sched_clutch_bucket_group_destroy(&(clutch->sc_clutch_groups[i]));
1192 	}
1193 }
1194 
1195 #if CONFIG_SCHED_EDGE
1196 
1197 /*
1198  * Edge Scheduler Preferred Cluster Mechanism
1199  *
1200  * In order to have better control over various QoS buckets within a thread group, the Edge
1201  * scheduler allows CLPC to specify a preferred cluster for each QoS level in a TG. These
1202  * preferences are stored at the sched_clutch_bucket_group level since that represents all
1203  * threads at a particular QoS level within a sched_clutch. For any lookup of preferred
1204  * cluster, the logic always goes back to the preference stored at the clutch_bucket_group.
1205  */
1206 
1207 static uint32_t
1208 sched_edge_clutch_bucket_group_preferred_cluster(sched_clutch_bucket_group_t clutch_bucket_group)
1209 {
1210 	return os_atomic_load(&clutch_bucket_group->scbg_preferred_cluster, relaxed);
1211 }
1212 
1213 static uint32_t
1214 sched_clutch_bucket_preferred_cluster(sched_clutch_bucket_t clutch_bucket)
1215 {
1216 	return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket->scb_group);
1217 }
1218 
1219 uint32_t
1220 sched_edge_thread_preferred_cluster(thread_t thread)
1221 {
1222 	if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1223 		/* For threads bound to a specific cluster, return the bound cluster id */
1224 		return sched_edge_thread_bound_cluster_id(thread);
1225 	}
1226 
1227 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
1228 	sched_bucket_t sched_bucket = thread->th_sched_bucket;
1229 	if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
1230 		sched_bucket = sched_clutch_thread_bucket_map(thread, thread->base_pri);
1231 	}
1232 	sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[sched_bucket];
1233 	return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket_group);
1234 }
1235 
1236 /*
1237  * Edge Scheduler Foreign Bucket Support
1238  *
1239  * In the Edge Scheduler, each cluster maintains a priority queue of clutch buckets containing
1240  * threads that are not native to the cluster. A clutch bucket is considered native if its
1241  * preferred cluster has the same type as the cluster its enqueued in. The foreign clutch
1242  * bucket priority queue is used for rebalance operations to get threads back to their native
1243  * cluster quickly.
1244  *
1245  * It is possible to make this policy even more aggressive by considering all clusters that
1246  * are not the preferred cluster as the foreign cluster, but that would mean a lot of thread
1247  * migrations which might have performance implications.
1248  */
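/*
 * Example (hypothetical clusters): a clutch bucket whose preferred cluster is a
 * performance cluster but which is enqueued on an efficiency cluster's root
 * hierarchy has mismatched pset types, so it is marked foreign and linked into
 * scr_foreign_buckets; enqueued on any performance cluster it would be native.
 * The two routines below perform that marking.
 */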
1249 
1250 static void
1251 sched_clutch_bucket_mark_native(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
1252 {
1253 	if (clutch_bucket->scb_foreign) {
1254 		clutch_bucket->scb_foreign = false;
1255 		priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1256 	}
1257 }
1258 
1259 static void
1260 sched_clutch_bucket_mark_foreign(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
1261 {
1262 	if (!clutch_bucket->scb_foreign) {
1263 		clutch_bucket->scb_foreign = true;
1264 		priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
1265 		priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1266 	}
1267 }
1268 
1269 /*
1270  * Edge Scheduler Cumulative Load Average
1271  *
1272  * The Edge scheduler maintains a per-QoS/scheduling bucket load average for
1273  * making thread migration decisions. The per-bucket load is maintained as a
1274  * cumulative count since higher scheduling buckets impact load on lower buckets
1275  * for thread migration decisions.
1276  *
1277  */
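/*
 * Worked example of the cumulative counting below (hypothetical thread counts):
 * with one runnable FIXPRI thread and two runnable SHARE_DF threads on a cluster,
 * the fallthrough increments yield cumulative run counts of FIXPRI = 1,
 * SHARE_FG = 1, SHARE_IN = 1, SHARE_DF = 3, SHARE_UT = 3 and SHARE_BG = 3, i.e.
 * the counter for a bucket reflects all runnable threads at that bucket or at any
 * higher-importance bucket.
 */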
1278 
1279 static void
1280 sched_edge_cluster_cumulative_count_incr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1281 {
1282 	switch (bucket) {
1283 	case TH_BUCKET_FIXPRI:    os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1284 	case TH_BUCKET_SHARE_FG:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1285 	case TH_BUCKET_SHARE_IN:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1286 	case TH_BUCKET_SHARE_DF:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1287 	case TH_BUCKET_SHARE_UT:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1288 	case TH_BUCKET_SHARE_BG:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1289 	default:
1290 		panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_incr()");
1291 	}
1292 }
1293 
1294 static void
1295 sched_edge_cluster_cumulative_count_decr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1296 {
1297 	switch (bucket) {
1298 	case TH_BUCKET_FIXPRI:    os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1299 	case TH_BUCKET_SHARE_FG:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1300 	case TH_BUCKET_SHARE_IN:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1301 	case TH_BUCKET_SHARE_DF:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1302 	case TH_BUCKET_SHARE_UT:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1303 	case TH_BUCKET_SHARE_BG:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1304 	default:
1305 		panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_decr()");
1306 	}
1307 }
1308 
1309 uint16_t
1310 sched_edge_cluster_cumulative_count(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1311 {
1312 	return os_atomic_load(&root_clutch->scr_cumulative_run_count[bucket], relaxed);
1313 }
1314 
1315 #endif /* CONFIG_SCHED_EDGE */
1316 
1317 /*
1318  * sched_clutch_bucket_hierarchy_insert()
1319  *
1320  * Routine to insert a newly runnable clutch_bucket into the root hierarchy.
1321  */
1322 static void
1323 sched_clutch_bucket_hierarchy_insert(
1324 	sched_clutch_root_t root_clutch,
1325 	sched_clutch_bucket_t clutch_bucket,
1326 	sched_bucket_t bucket,
1327 	uint64_t timestamp,
1328 	sched_clutch_bucket_options_t options)
1329 {
1330 	sched_clutch_hierarchy_locked_assert(root_clutch);
1331 	if (bucket > TH_BUCKET_FIXPRI) {
1332 		/* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */
1333 		enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink);
1334 	}
1335 #if CONFIG_SCHED_EDGE
1336 	/* Check if the bucket is a foreign clutch bucket and add it to the foreign buckets list */
1337 	uint32_t preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
1338 	if (pset_type_for_id(preferred_cluster) != pset_type_for_id(root_clutch->scr_cluster_id)) {
1339 		sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
1340 	}
1341 #endif /* CONFIG_SCHED_EDGE */
1342 	sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1343 
1344 	/* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */
1345 	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1346 		sched_clutch_root_bucket_runnable(root_bucket, root_clutch, timestamp);
1347 	}
1348 
1349 	/* Insert the clutch bucket into the root bucket run queue with order based on options */
1350 	sched_clutch_bucket_runq_enqueue(&root_bucket->scrb_clutch_buckets, clutch_bucket, options);
1351 	os_atomic_store(&clutch_bucket->scb_root, root_clutch, relaxed);
1352 	os_atomic_inc(&sched_clutch_global_bucket_load[bucket], relaxed);
1353 }
1354 
1355 /*
1356  * sched_clutch_bucket_hierarchy_remove()
1357  *
1358  * Routine to remove an empty clutch bucket from the root hierarchy.
1359  */
1360 static void
1361 sched_clutch_bucket_hierarchy_remove(
1362 	sched_clutch_root_t root_clutch,
1363 	sched_clutch_bucket_t clutch_bucket,
1364 	sched_bucket_t bucket,
1365 	uint64_t timestamp,
1366 	__unused sched_clutch_bucket_options_t options)
1367 {
1368 	sched_clutch_hierarchy_locked_assert(root_clutch);
1369 	if (bucket > TH_BUCKET_FIXPRI) {
1370 		/* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */
1371 		remqueue(&clutch_bucket->scb_listlink);
1372 	}
1373 #if CONFIG_SCHED_EDGE
1374 	sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
1375 #endif /* CONFIG_SCHED_EDGE */
1376 
1377 	sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1378 
1379 	/* Remove the clutch bucket from the root bucket priority queue */
1380 	sched_clutch_bucket_runq_remove(&root_bucket->scrb_clutch_buckets, clutch_bucket);
1381 	os_atomic_store(&clutch_bucket->scb_root, NULL, relaxed);
1382 
1383 	/* If the root bucket priority queue is now empty, remove it from the root priority queue */
1384 	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1385 		sched_clutch_root_bucket_empty(root_bucket, root_clutch, timestamp);
1386 	}
1387 	os_atomic_dec(&sched_clutch_global_bucket_load[bucket], relaxed);
1388 }
1389 
1390 /*
1391  * sched_clutch_bucket_base_pri()
1392  *
1393  * Calculates the "base" priority of the clutch bucket, which is equal to the max of the
1394  * highest base_pri and the highest sched_pri in the clutch bucket.
1395  */
1396 static uint8_t
1397 sched_clutch_bucket_base_pri(
1398 	sched_clutch_bucket_t clutch_bucket)
1399 {
1400 	assert(priority_queue_empty(&clutch_bucket->scb_thread_runq) == false);
1401 	/*
1402 	 * Since the clutch bucket can contain threads that are members of the group due
1403 	 * to the sched_pri being promoted or due to their base pri, the base priority of
1404 	 * the entire clutch bucket should be based on the highest thread (promoted or base)
1405 	 * in the clutch bucket.
1406 	 */
1407 	uint8_t max_pri = 0;
1408 	if (!priority_queue_empty(&clutch_bucket->scb_clutchpri_prioq)) {
1409 		max_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
1410 	}
1411 	return max_pri;
1412 }
1413 
1414 /*
1415  * sched_clutch_interactivity_from_cpu_data()
1416  *
1417  * Routine to calculate the interactivity score of a clutch bucket group from its CPU usage
1418  */
1419 static uint8_t
1420 sched_clutch_interactivity_from_cpu_data(sched_clutch_bucket_group_t clutch_bucket_group)
1421 {
1422 	sched_clutch_bucket_cpu_data_t scb_cpu_data;
1423 	scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, relaxed);
1424 	clutch_cpu_data_t cpu_used = scb_cpu_data.cpu_data.scbcd_cpu_used;
1425 	clutch_cpu_data_t cpu_blocked = scb_cpu_data.cpu_data.scbcd_cpu_blocked;
1426 	uint8_t interactive_score = 0;
1427 
1428 	if ((cpu_blocked == 0) && (cpu_used == 0)) {
1429 		return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
1430 	}
1431 	/*
1432 	 * For all timeshare buckets, calculate the interactivity score of the bucket
1433 	 * and add it to the base priority
1434 	 */
1435 	if (cpu_blocked > cpu_used) {
1436 		/* Interactive clutch_bucket case */
1437 		interactive_score = sched_clutch_bucket_group_interactive_pri +
1438 		    ((sched_clutch_bucket_group_interactive_pri * (cpu_blocked - cpu_used)) / cpu_blocked);
1439 	} else {
1440 		/* Non-interactive clutch_bucket case */
1441 		interactive_score = ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) / cpu_used);
1442 	}
1443 	return interactive_score;
1444 }
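/*
 * Worked example of the score computed above (hypothetical numbers; the value of
 * sched_clutch_bucket_group_interactive_pri is a tunable, written as P here): a
 * group that was blocked for 75 units and used 25 units of CPU scores
 * P + (P * (75 - 25)) / 75, roughly 1.66 * P, while a group that used 60 units and
 * was blocked for only 20 scores (P * 20) / 60, i.e. P / 3. The score is therefore
 * bounded by [0, 2 * P], with 2 * P representing a fully interactive group.
 */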
1445 
1446 /*
1447  * sched_clutch_bucket_pri_calculate()
1448  *
1449  * The priority calculation algorithm for the clutch_bucket is a slight
1450  * modification on the ULE interactivity score. It uses the base priority
1451  * of the clutch bucket and applies an interactivity score boost to the
1452  * highly responsive clutch buckets.
1453  */
1454 static uint8_t
1455 sched_clutch_bucket_pri_calculate(
1456 	sched_clutch_bucket_t clutch_bucket,
1457 	uint64_t timestamp)
1458 {
1459 	/* For empty clutch buckets, return priority 0 */
1460 	if (clutch_bucket->scb_thr_count == 0) {
1461 		return 0;
1462 	}
1463 
1464 	uint8_t base_pri = sched_clutch_bucket_base_pri(clutch_bucket);
1465 	uint8_t interactive_score = sched_clutch_bucket_group_interactivity_score_calculate(clutch_bucket->scb_group, timestamp);
1466 
1467 	assert(((uint64_t)base_pri + interactive_score) <= UINT8_MAX);
1468 	uint8_t pri = base_pri + interactive_score;
1469 	if (pri != clutch_bucket->scb_priority) {
1470 		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_PRI) | DBG_FUNC_NONE,
1471 		    thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, pri, interactive_score, 0);
1472 	}
1473 	return pri;
1474 }
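/*
 * Example (hypothetical values): if the highest base/promoted priority among the
 * threads in the clutch bucket is 31 and the group's interactivity score works out
 * to 5, the clutch bucket priority becomes 36. The assert above guarantees that
 * the sum always fits in a uint8_t.
 */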
1475 
1476 /*
1477  * sched_clutch_root_bucket_highest_clutch_bucket()
1478  *
1479  * Routine to find the highest priority clutch bucket
1480  * within the root bucket.
1481  */
1482 static sched_clutch_bucket_t
1483 sched_clutch_root_bucket_highest_clutch_bucket(
1484 	sched_clutch_root_bucket_t root_bucket)
1485 {
1486 	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1487 		return NULL;
1488 	}
1489 	return sched_clutch_bucket_runq_peek(&root_bucket->scrb_clutch_buckets);
1490 }
1491 
1492 /*
1493  * sched_clutch_bucket_runnable()
1494  *
1495  * Perform all operations needed when a new clutch bucket becomes runnable.
1496  * It involves inserting the clutch_bucket into the hierarchy and updating the
1497  * root priority appropriately.
1498  */
1499 static boolean_t
1500 sched_clutch_bucket_runnable(
1501 	sched_clutch_bucket_t clutch_bucket,
1502 	sched_clutch_root_t root_clutch,
1503 	uint64_t timestamp,
1504 	sched_clutch_bucket_options_t options)
1505 {
1506 	sched_clutch_hierarchy_locked_assert(root_clutch);
1507 	/* Since the clutch bucket became newly runnable, update its pending timestamp */
1508 	clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1509 	sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1510 
1511 	/* Update the timesharing properties of this clutch_bucket; also done every sched_tick */
1512 	sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, timestamp);
1513 	int16_t root_old_pri = root_clutch->scr_priority;
1514 	sched_clutch_root_pri_update(root_clutch);
1515 	return root_clutch->scr_priority > root_old_pri;
1516 }
1517 
1518 /*
1519  * sched_clutch_bucket_update()
1520  *
1521  * Update the clutch_bucket's position in the hierarchy. This routine is
1522  * called when a new thread is inserted or removed from a runnable clutch
1523  * bucket. The options specify some properties about the clutch bucket
1524  * insertion order into the clutch bucket runq.
1525  */
1526 static boolean_t
1527 sched_clutch_bucket_update(
1528 	sched_clutch_bucket_t clutch_bucket,
1529 	sched_clutch_root_t root_clutch,
1530 	uint64_t timestamp,
1531 	sched_clutch_bucket_options_t options)
1532 {
1533 	sched_clutch_hierarchy_locked_assert(root_clutch);
1534 	uint64_t new_pri = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1535 	sched_clutch_bucket_runq_t bucket_runq = &root_clutch->scr_unbound_buckets[clutch_bucket->scb_bucket].scrb_clutch_buckets;
1536 	if (new_pri == clutch_bucket->scb_priority) {
1537 		/*
1538 		 * If SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR is specified, move the clutch bucket
1539 		 * to the end of the runq. Typically used when a thread is selected for execution
1540 		 * from a clutch bucket.
1541 		 */
1542 		if (options & SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR) {
1543 			sched_clutch_bucket_runq_rotate(bucket_runq, clutch_bucket);
1544 		}
1545 		return false;
1546 	}
1547 	sched_clutch_bucket_runq_remove(bucket_runq, clutch_bucket);
1548 #if CONFIG_SCHED_EDGE
1549 	if (clutch_bucket->scb_foreign) {
1550 		priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1551 	}
1552 #endif /* CONFIG_SCHED_EDGE */
1553 	clutch_bucket->scb_priority = new_pri;
1554 #if CONFIG_SCHED_EDGE
1555 	if (clutch_bucket->scb_foreign) {
1556 		priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
1557 		priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1558 	}
1559 #endif /* CONFIG_SCHED_EDGE */
1560 	sched_clutch_bucket_runq_enqueue(bucket_runq, clutch_bucket, options);
1561 
1562 	int16_t root_old_pri = root_clutch->scr_priority;
1563 	sched_clutch_root_pri_update(root_clutch);
1564 	return root_clutch->scr_priority > root_old_pri;
1565 }
1566 
1567 /*
1568  * sched_clutch_bucket_empty()
1569  *
1570  * Perform all the operations needed when a clutch_bucket is no longer runnable.
1571  * It involves removing the clutch bucket from the hierarchy and updaing the root
1572  * priority appropriately.
1573  */
1574 static void
1575 sched_clutch_bucket_empty(
1576 	sched_clutch_bucket_t clutch_bucket,
1577 	sched_clutch_root_t root_clutch,
1578 	uint64_t timestamp,
1579 	sched_clutch_bucket_options_t options)
1580 {
1581 	sched_clutch_hierarchy_locked_assert(root_clutch);
1582 	sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1583 	clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1584 	sched_clutch_root_pri_update(root_clutch);
1585 }
1586 
1587 /*
1588  * sched_clutch_cpu_usage_update()
1589  *
1590  * Routine to update CPU usage of the thread in the hierarchy.
1591  */
1592 void
1593 sched_clutch_cpu_usage_update(
1594 	thread_t thread,
1595 	uint64_t delta)
1596 {
1597 	if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread) || SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1598 		return;
1599 	}
1600 
1601 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
1602 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
1603 	sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, delta);
1604 }
1605 
1606 /*
1607  * sched_clutch_bucket_group_cpu_usage_update()
1608  *
1609  * Routine to update the CPU usage of the clutch bucket group.
1610  */
1611 static void
1612 sched_clutch_bucket_group_cpu_usage_update(
1613 	sched_clutch_bucket_group_t clutch_bucket_group,
1614 	uint64_t delta)
1615 {
1616 	if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
1617 		/* The Above UI bucket always has the maximum interactivity score, so there is nothing to do here */
1618 		return;
1619 	}
1620 	delta = MIN(delta, sched_clutch_bucket_group_adjust_threshold);
1621 	os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_used), (clutch_cpu_data_t)delta, relaxed);
1622 }
1623 
1624 /*
1625  * sched_clutch_bucket_group_cpu_pending_adjust()
1626  *
1627  * Routine to calculate the adjusted CPU usage value based on the pending intervals. The calculation is done
1628  * such that one "pending interval" provides one point improvement in interactivity score.
1629  */
1630 static inline uint64_t
1631 sched_clutch_bucket_group_cpu_pending_adjust(
1632 	uint64_t cpu_used,
1633 	uint64_t cpu_blocked,
1634 	uint8_t pending_intervals)
1635 {
1636 	uint64_t cpu_used_adjusted = 0;
1637 	if (cpu_blocked < cpu_used) {
1638 		cpu_used_adjusted = (sched_clutch_bucket_group_interactive_pri * cpu_blocked * cpu_used);
1639 		cpu_used_adjusted = cpu_used_adjusted / ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) + (cpu_used * pending_intervals));
1640 	} else {
1641 		uint64_t adjust_factor = (cpu_blocked * pending_intervals) / sched_clutch_bucket_group_interactive_pri;
1642 		cpu_used_adjusted = (adjust_factor > cpu_used) ? 0 : (cpu_used - adjust_factor);
1643 	}
1644 	return cpu_used_adjusted;
1645 }
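/*
 * Why the adjustment above amounts to one interactivity point per pending interval
 * (derivation, ignoring integer truncation): write P for
 * sched_clutch_bucket_group_interactive_pri, u for cpu_used, b for cpu_blocked and
 * n for pending_intervals.
 *   - Non-interactive case (b < u): the score is (P * b) / u. With
 *     u' = (P * b * u) / ((P * b) + (u * n)), the new score is
 *     (P * b) / u' = (P * b) / u + n.
 *   - Interactive case (b >= u): the score is P + (P * (b - u)) / b. Lowering u by
 *     (b * n) / P raises that score by exactly n (until u is clamped at 0).
 * Each pending interval therefore buys the clutch bucket group one point of
 * interactivity score.
 */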
1646 
1647 /*
1648  * sched_clutch_bucket_group_cpu_adjust()
1649  *
1650  * Routine to scale the cpu usage and blocked time once the sum gets bigger
1651  * than sched_clutch_bucket_group_adjust_threshold. Allows the values to remain
1652  * manageable and maintain the same ratio while allowing clutch buckets to
1653  * adjust behavior and reflect in the interactivity score in a reasonable
1654  * amount of time. Also adjusts the CPU usage based on pending_intervals
1655  * which allows ageout of CPU to avoid starvation in highly contended scenarios.
1656  */
1657 static void
1658 sched_clutch_bucket_group_cpu_adjust(
1659 	sched_clutch_bucket_group_t clutch_bucket_group,
1660 	uint8_t pending_intervals)
1661 {
1662 	sched_clutch_bucket_cpu_data_t old_cpu_data = {};
1663 	sched_clutch_bucket_cpu_data_t new_cpu_data = {};
1664 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, old_cpu_data.scbcd_cpu_data_packed, new_cpu_data.scbcd_cpu_data_packed, relaxed, {
1665 		clutch_cpu_data_t cpu_used = old_cpu_data.cpu_data.scbcd_cpu_used;
1666 		clutch_cpu_data_t cpu_blocked = old_cpu_data.cpu_data.scbcd_cpu_blocked;
1667 
1668 		if ((pending_intervals == 0) && (cpu_used + cpu_blocked) < sched_clutch_bucket_group_adjust_threshold) {
1669 		        /* No changes to the CPU used and blocked values */
1670 		        os_atomic_rmw_loop_give_up();
1671 		}
1672 		if ((cpu_used + cpu_blocked) >= sched_clutch_bucket_group_adjust_threshold) {
1673 		        /* Only keep the recent CPU history to better indicate how this TG has been behaving */
1674 		        cpu_used = cpu_used / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
1675 		        cpu_blocked = cpu_blocked / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
1676 		}
1677 		/* Use the shift passed in to ageout the CPU usage */
1678 		cpu_used = (clutch_cpu_data_t)sched_clutch_bucket_group_cpu_pending_adjust(cpu_used, cpu_blocked, pending_intervals);
1679 		new_cpu_data.cpu_data.scbcd_cpu_used = cpu_used;
1680 		new_cpu_data.cpu_data.scbcd_cpu_blocked = cpu_blocked;
1681 	});
1682 }
1683 
1684 /*
1685  * Thread level scheduling algorithm
1686  *
1687  * The thread level scheduling algorithm uses the mach timeshare
1688  * decay based algorithm to achieve sharing between threads within the
1689  * same clutch bucket. The load/priority shifts etc. are all maintained
1690  * at the clutch bucket level and used for decay calculation of the
1691  * threads. The load sampling is still driven off the scheduler tick
1692  * for runnable clutch buckets (it does not use the new higher frequency
1693  * EWMA based load calculation). The idea is that the contention and load
1694  * within clutch_buckets should be limited enough to not see heavy decay
1695  * and timeshare effectively.
1696  */
1697 
1698 /*
1699  * sched_clutch_thread_run_bucket_incr() / sched_clutch_run_bucket_incr()
1700  *
1701  * Increment the run count for the clutch bucket associated with the
1702  * thread.
1703  */
1704 uint32_t
1705 sched_clutch_thread_run_bucket_incr(
1706 	thread_t thread,
1707 	sched_bucket_t bucket)
1708 {
1709 	if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1710 		return 0;
1711 	}
1712 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
1713 	return sched_clutch_run_bucket_incr(clutch, bucket);
1714 }
1715 
1716 static uint32_t
1717 sched_clutch_run_bucket_incr(
1718 	sched_clutch_t clutch,
1719 	sched_bucket_t bucket)
1720 {
1721 	assert(bucket != TH_BUCKET_RUN);
1722 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
1723 	return sched_clutch_bucket_group_run_count_inc(clutch_bucket_group);
1724 }
1725 
1726 /*
1727  * sched_clutch_thread_run_bucket_decr() / sched_clutch_run_bucket_decr()
1728  *
1729  * Decrement the run count for the clutch bucket associated with the
1730  * thread.
1731  */
1732 uint32_t
1733 sched_clutch_thread_run_bucket_decr(
1734 	thread_t thread,
1735 	sched_bucket_t bucket)
1736 {
1737 	if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
1738 		return 0;
1739 	}
1740 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
1741 	return sched_clutch_run_bucket_decr(clutch, bucket);
1742 }
1743 
1744 static uint32_t
1745 sched_clutch_run_bucket_decr(
1746 	sched_clutch_t clutch,
1747 	sched_bucket_t bucket)
1748 {
1749 	assert(bucket != TH_BUCKET_RUN);
1750 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
1751 	return sched_clutch_bucket_group_run_count_dec(clutch_bucket_group);
1752 }
1753 
1754 /*
1755  * sched_clutch_bucket_group_timeshare_update()
1756  *
1757  * Routine to update the load and priority shift for the clutch_bucket_group
1758  * every sched_tick. For multi-cluster platforms, each QoS level will have multiple
1759  * clutch buckets with runnable threads in them. So it is important to maintain
1760  * the timesharing information at the clutch_bucket_group level instead of
1761  * individual clutch buckets (because the algorithm is trying to timeshare all
1762  * threads at the same QoS irrespective of which hierarchy they are enqueued in).
1763  *
1764  * The routine is called from the sched tick handling code to make sure this value
1765  * is updated at least once every sched tick. For clutch bucket groups which have
1766  * not been runnable for very long, the clutch_bucket_group maintains a "last
1767  * updated schedtick" parameter. As threads become runnable in the clutch bucket group,
1768  * if this value is outdated, the load and shifts are updated.
1769  *
1770  * Possible optimization:
1771  * - The current algorithm samples the load every sched tick (125ms).
1772  *   This is prone to spikes in runnable counts; if that turns out to be
1773  *   a problem, a simple solution would be to do the EWMA trick to sample
1774  *   load at every load_tick (30ms) and use the averaged value for the pri
1775  *   shift calculation.
1776  */
1777 static void
1778 sched_clutch_bucket_group_timeshare_update(
1779 	sched_clutch_bucket_group_t clutch_bucket_group,
1780 	sched_clutch_bucket_t clutch_bucket,
1781 	uint64_t ctime)
1782 {
1783 	if (clutch_bucket_group->scbg_bucket < TH_BUCKET_SHARE_FG) {
1784 		/* No timesharing needed for fixed priority Above UI threads */
1785 		return;
1786 	}
1787 
1788 	/*
1789 	 * Update the timeshare parameters for the clutch bucket group
1790 	 * if they haven't been updated in this tick.
1791 	 */
1792 	uint32_t sched_ts = os_atomic_load(&clutch_bucket_group->scbg_timeshare_tick, relaxed);
1793 	uint32_t current_sched_ts = sched_tick;
1794 	if (sched_ts < current_sched_ts) {
1795 		os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, current_sched_ts, relaxed);
1796 		/* NCPU wide workloads should not experience decay */
1797 		uint64_t bucket_group_run_count = os_atomic_load_wide(&clutch_bucket_group->scbg_blocked_data.scct_count, relaxed) - 1;
1798 		uint32_t bucket_group_load = (uint32_t)(bucket_group_run_count / processor_avail_count);
1799 		bucket_group_load = MIN(bucket_group_load, NRQS - 1);
1800 		uint32_t pri_shift = sched_fixed_shift - sched_load_shifts[bucket_group_load];
1801 		/* Ensure that the pri_shift value is reasonable */
1802 		pri_shift = (pri_shift > SCHED_PRI_SHIFT_MAX) ? INT8_MAX : pri_shift;
1803 		os_atomic_store(&clutch_bucket_group->scbg_pri_shift, pri_shift, relaxed);
1804 	}
1805 
1806 	/*
1807 	 * Update the clutch bucket priority; this allows clutch buckets that have been pending
1808 	 * for a long time to get an updated interactivity score.
1809 	 */
1810 	sched_clutch_bucket_update(clutch_bucket, clutch_bucket->scb_root, ctime, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
1811 }
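/*
 * Illustrative walk-through of the load/pri_shift update above (hypothetical
 * thread counts; sched_fixed_shift and sched_load_shifts[] are machine
 * initialized): with 8 runnable/running threads on an 8-CPU platform the computed
 * load is (8 - 1) / 8 = 0, so the group is treated as unloaded and sees no extra
 * decay (the "NCPU wide" case noted above); with 17 threads it is (17 - 1) / 8 = 2.
 * Larger loads typically index to larger sched_load_shifts[] entries, producing a
 * smaller pri_shift and hence heavier usage-based priority depression. A pri_shift
 * beyond SCHED_PRI_SHIFT_MAX is stored as INT8_MAX, which callers treat as
 * "no depression" (see the pri_shift < INT8_MAX check in
 * sched_clutch_thread_clutch_update()).
 */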
1812 
1813 /*
1814  * sched_clutch_thread_clutch_update()
1815  *
1816  * Routine called when the thread changes its thread group. The current
1817  * implementation relies on the fact that the thread group is changed only from
1818  * the context of the thread itself or when the thread is runnable but not in a
1819  * runqueue. Due to this fact, the thread group change causes only counter
1820  * updates in the old & new clutch buckets and no hierarchy changes. The routine
1821  * also attributes the CPU used so far to the old clutch.
1822  */
1823 void
1824 sched_clutch_thread_clutch_update(
1825 	thread_t thread,
1826 	sched_clutch_t old_clutch,
1827 	sched_clutch_t new_clutch)
1828 {
1829 	uint32_t cpu_delta;
1830 
1831 	if (old_clutch) {
1832 		assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
1833 
1834 		sched_clutch_run_bucket_decr(old_clutch, thread->th_sched_bucket);
1835 		/*
1836 		 * Calculate the CPU used by this thread in the old bucket and
1837 		 * add it to the old clutch bucket. This uses the same CPU usage
1838 		 * logic as update_priority etc.
1839 		 */
1840 		sched_tick_delta(thread, cpu_delta);
1841 		if (thread->pri_shift < INT8_MAX) {
1842 			thread->sched_usage += cpu_delta;
1843 		}
1844 		thread->cpu_delta += cpu_delta;
1845 		if (!SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1846 			sched_clutch_bucket_group_t clutch_bucket_group = &(old_clutch->sc_clutch_groups[thread->th_sched_bucket]);
1847 			sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, cpu_delta);
1848 		}
1849 	}
1850 
1851 	if (new_clutch) {
1852 		sched_clutch_run_bucket_incr(new_clutch, thread->th_sched_bucket);
1853 	}
1854 }
1855 
1856 /* Thread Insertion/Removal/Selection routines */
1857 
1858 #if CONFIG_SCHED_EDGE
1859 
1860 /*
1861  * Edge Scheduler Bound Thread Support
1862  *
1863  * The edge scheduler allows threads to be bound to specific clusters. The scheduler
1864  * maintains a separate runq on the clutch root to hold these bound threads. These
1865  * bound threads count towards the root priority and thread count, but are ignored
1866  * for thread migration/steal decisions. Bound threads that are enqueued in the
1867  * separate runq have the th_bound_cluster_enqueued flag set to allow easy
1868  * removal.
1869  *
1870  * Bound Threads Timesharing
1871  * The bound threads share the timesharing properties of the clutch bucket group they are
1872  * part of. They contribute to the load and use priority shifts/decay values from the
1873  * clutch bucket group.
1874  */
1875 
1876 static boolean_t
1877 sched_edge_bound_thread_insert(
1878 	sched_clutch_root_t root_clutch,
1879 	thread_t thread,
1880 	integer_t options)
1881 {
1882 	/* Update the clutch runnable count and priority */
1883 	sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
1884 	sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
1885 	if (root_bucket->scrb_bound_thread_runq.count == 0) {
1886 		sched_clutch_root_bucket_runnable(root_bucket, root_clutch, mach_absolute_time());
1887 	}
1888 
1889 	assert((thread->th_bound_cluster_enqueued) == false);
1890 	run_queue_enqueue(&root_bucket->scrb_bound_thread_runq, thread, options);
1891 	thread->th_bound_cluster_enqueued = true;
1892 
1893 	/* Increment the urgency counter for the root if necessary */
1894 	sched_clutch_root_urgency_inc(root_clutch, thread);
1895 
1896 	int16_t root_old_pri = root_clutch->scr_priority;
1897 	sched_clutch_root_pri_update(root_clutch);
1898 	return root_clutch->scr_priority > root_old_pri;
1899 }
1900 
1901 static void
1902 sched_edge_bound_thread_remove(
1903 	sched_clutch_root_t root_clutch,
1904 	thread_t thread)
1905 {
1906 	sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
1907 	assert((thread->th_bound_cluster_enqueued) == true);
1908 	run_queue_remove(&root_bucket->scrb_bound_thread_runq, thread);
1909 	thread->th_bound_cluster_enqueued = false;
1910 
1911 	/* Decrement the urgency counter for the root if necessary */
1912 	sched_clutch_root_urgency_dec(root_clutch, thread);
1913 
1914 	/* Update the clutch runnable count and priority */
1915 	sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
1916 	if (root_bucket->scrb_bound_thread_runq.count == 0) {
1917 		sched_clutch_root_bucket_empty(root_bucket, root_clutch, mach_absolute_time());
1918 	}
1919 	sched_clutch_root_pri_update(root_clutch);
1920 }
1921 
1922 /*
1923  * Edge Scheduler cluster shared resource threads load balancing
1924  *
1925  * The Edge scheduler attempts to load balance cluster shared resource intensive threads
1926  * across clusters in order to reduce contention on the shared resources. It achieves
1927  * that by maintaining the runnable and running shared resource load on each cluster
1928  * and balancing the load across multiple clusters.
1929  *
1930  * The current implementation for cluster shared resource load balancing looks at
1931  * the per-cluster load at thread runnable time to enqueue the thread in the appropriate
1932  * cluster. The thread is enqueued in the cluster bound runqueue to ensure idle CPUs
1933  * do not steal/rebalance shared resource threads. Some more details for the implementation:
1934  *
1935  * - When threads are tagged as shared resource, they go through the cluster selection logic
1936  *   which looks at cluster shared resource loads and picks a cluster accordingly. The thread is
1937  *   enqueued in the cluster bound runqueue.
1938  *
1939  * - When the threads start running and call avoid_processor, the load balancing logic will be
1940  *   invoked and cause the thread to be sent to a more preferred cluster if one exists and has
1941  *   no shared resource load.
1942  *
1943  * - If a CPU in a preferred cluster is going idle and that cluster has no more shared load,
1944  *   it will look at running shared resource threads on foreign clusters and actively rebalance them.
1945  *
1946  * - Runnable shared resource threads are not stolen by the preferred cluster CPUs as they
1947  *   go idle intentionally.
1948  *
1949  * - One caveat of this design is that if a preferred CPU has already run and finished its shared
1950  *   resource thread execution, it will not go out and steal the runnable thread in the non-preferred cluster.
1951  *   The rebalancing will happen when the thread actually runs on a non-preferred cluster and one of the
1952  *   events listed above happen.
1953  *
1954  * - Also it currently does not consider other properties such as thread priorities and
1955  *   qos level thread load in the thread placement decision.
1956  *
1957  * Edge Scheduler cluster shared resource thread scheduling policy
1958  *
1959  * The threads for shared resources can be scheduled using one of the two policies:
1960  *
1961  * EDGE_SHARED_RSRC_SCHED_POLICY_RR
1962  * This policy distributes the threads so that they spread across all available clusters
1963  * irrespective of type. The idea is that this scheduling policy will put a shared resource
1964  * thread on each cluster on the platform before it starts doubling up on clusters.
1965  *
1966  * EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST
1967  * This policy distributes threads so that the threads first fill up all the capacity on
1968  * the preferred cluster and its homogeneous peers before spilling to different core type.
1969  * The current implementation defines capacity based on the number of CPUs in the cluster;
1970  * so a cluster's shared resource is considered full if there are "n" runnable + running
1971  * shared resource threads on the cluster with n cpus. This policy is different from the
1972  * default scheduling policy of the edge scheduler since this always tries to fill up the
1973  * native clusters to capacity even when non-native clusters might be idle.
1974  */
1975 __options_decl(edge_shared_rsrc_sched_policy_t, uint32_t, {
1976 	EDGE_SHARED_RSRC_SCHED_POLICY_RR                = 0,
1977 	EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST      = 1,
1978 });
1979 
1980 const edge_shared_rsrc_sched_policy_t edge_shared_rsrc_policy[CLUSTER_SHARED_RSRC_TYPE_COUNT] = {
1981 	[CLUSTER_SHARED_RSRC_TYPE_RR] = EDGE_SHARED_RSRC_SCHED_POLICY_RR,
1982 	[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST,
1983 };
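/*
 * Minimal sketch of the NATIVE_FIRST capacity notion described above (hypothetical
 * helper, not part of this file): a cluster is considered full for a shared
 * resource once the number of runnable + running shared resource threads on it
 * reaches its CPU count.
 */
#if 0   /* illustrative only */
#include <stdbool.h>
#include <stdint.h>

static bool
shared_rsrc_cluster_full_sketch(uint32_t runnable_load, uint32_t running_load, uint32_t ncpus)
{
	/* "n" shared resource threads on a cluster with n CPUs means no spare capacity */
	return (runnable_load + running_load) >= ncpus;
}
#endif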
1984 
1985 static void
1986 sched_edge_shared_rsrc_runnable_load_incr(sched_clutch_root_t root_clutch, thread_t thread)
1987 {
1988 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
1989 		root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_RR]++;
1990 		thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_RR] = true;
1991 	}
1992 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
1993 		root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST]++;
1994 		thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = true;
1995 	}
1996 }
1997 
1998 static void
1999 sched_edge_shared_rsrc_runnable_load_decr(sched_clutch_root_t root_clutch, thread_t thread)
2000 {
2001 	for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
2002 		if (thread->th_shared_rsrc_enqueued[shared_rsrc_type]) {
2003 			thread->th_shared_rsrc_enqueued[shared_rsrc_type] = false;
2004 			root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type]--;
2005 		}
2006 	}
2007 }
2008 
2009 uint16_t
2010 sched_edge_shared_rsrc_runnable_load(sched_clutch_root_t root_clutch, cluster_shared_rsrc_type_t shared_rsrc_type)
2011 {
2012 	return root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type];
2013 }
2014 
2015 /*
2016  * sched_edge_shared_rsrc_idle()
2017  *
2018  * Routine used to determine if the constrained resource for the pset is idle. This is
2019  * used by a CPU going idle to decide if it should rebalance a running shared resource
2020  * thread from a non-preferred cluster.
2021  */
2022 static boolean_t
2023 sched_edge_shared_rsrc_idle(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
2024 {
2025 	return sched_pset_cluster_shared_rsrc_load(pset, shared_rsrc_type) == 0;
2026 }
2027 
2028 /*
2029  * sched_edge_thread_shared_rsrc_type
2030  *
2031  * This routine decides if a given thread needs special handling for being a
2032  * heavy shared resource user. It is valid for the same thread to be using
2033  * several shared resources at the same time and have multiple policy flags set.
2034  * This routine determines which of those properties will be used for load
2035  * balancing and migration decisions.
2036  */
2037 static cluster_shared_rsrc_type_t
2038 sched_edge_thread_shared_rsrc_type(thread_t thread)
2039 {
2040 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
2041 		return CLUSTER_SHARED_RSRC_TYPE_RR;
2042 	}
2043 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
2044 		return CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST;
2045 	}
2046 	return CLUSTER_SHARED_RSRC_TYPE_NONE;
2047 }
2048 
2049 #endif /* CONFIG_SCHED_EDGE */
2050 
2051 /*
2052  * sched_clutch_thread_bound_lookup()
2053  *
2054  * Routine to lookup the highest priority runnable thread in a bounded root bucket.
2055  */
2056 static thread_t
2057 sched_clutch_thread_bound_lookup(
2058 	__unused sched_clutch_root_t root_clutch,
2059 	sched_clutch_root_bucket_t root_bucket)
2060 {
2061 	return run_queue_peek(&root_bucket->scrb_bound_thread_runq);
2062 }
2063 
2064 /*
2065  * Clutch Bucket Group Thread Counts and Pending time calculation
2066  *
2067  * The pending time on the clutch_bucket_group allows the scheduler to track if it
2068  * needs to ageout the CPU usage because the clutch_bucket_group has been pending for
2069  * a very long time. The pending time is set to the timestamp as soon as a thread becomes
2070  * runnable. When a thread is picked up for execution from this clutch_bucket_group, the
2071  * pending time is advanced to the time of thread selection.
2072  *
2073  * Since threads for a clutch bucket group can be added or removed from multiple CPUs
2074  * simultaneously, it is important that the updates to thread counts and pending timestamps
2075  * happen atomically. The implementation relies on the following aspects to make that work
2076  * as expected:
2077  * - The clutch scheduler would be deployed on single cluster platforms where the pset lock
2078  *   is held when threads are added/removed and pending timestamps are updated
2079  * - The thread count and pending timestamp can be updated atomically using double wide
2080  *   128 bit atomics
2081  *
2082  * Clutch bucket group interactivity timestamp and score updates also rely on the properties
2083  * above to atomically update the interactivity score for a clutch bucket group.
2084  */
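/*
 * Userspace-style sketch of the "double wide" update pattern described above
 * (hypothetical names and types; the kernel uses os_atomic_rmw_loop on the packed
 * scct fields): the count and the timestamp share one 128-bit quantity so that a
 * single compare-exchange updates both together.
 */
#if 0   /* illustrative only */
#include <stdatomic.h>
#include <stdint.h>

typedef union {
	__uint128_t             packed;
	struct {
		uint64_t        count;
		uint64_t        timestamp;
	};
} counter_time_sketch_t;

static void
counter_time_inc_sketch(_Atomic __uint128_t *target, uint64_t now)
{
	counter_time_sketch_t ov, nv;
	ov.packed = atomic_load_explicit(target, memory_order_relaxed);
	do {
		nv = ov;
		nv.count = ov.count + 1;
		if (ov.count == 0) {
			/* First thread became runnable; start the pending clock */
			nv.timestamp = now;
		}
	} while (!atomic_compare_exchange_weak_explicit(target, &ov.packed, nv.packed,
	    memory_order_relaxed, memory_order_relaxed));
}
#endif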
2085 
2086 #if CONFIG_SCHED_EDGE
2087 
2088 static void
2089 sched_clutch_bucket_group_thr_count_inc(
2090 	sched_clutch_bucket_group_t clutch_bucket_group,
2091 	uint64_t timestamp)
2092 {
2093 	sched_clutch_counter_time_t old_pending_data;
2094 	sched_clutch_counter_time_t new_pending_data;
2095 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2096 		new_pending_data.scct_count = old_pending_data.scct_count + 1;
2097 		new_pending_data.scct_timestamp = old_pending_data.scct_timestamp;
2098 		if (old_pending_data.scct_count == 0) {
2099 		        new_pending_data.scct_timestamp = timestamp;
2100 		}
2101 	});
2102 }
2103 
2104 static void
2105 sched_clutch_bucket_group_thr_count_dec(
2106 	sched_clutch_bucket_group_t clutch_bucket_group,
2107 	uint64_t timestamp)
2108 {
2109 	sched_clutch_counter_time_t old_pending_data;
2110 	sched_clutch_counter_time_t new_pending_data;
2111 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2112 		new_pending_data.scct_count = old_pending_data.scct_count - 1;
2113 		if (new_pending_data.scct_count == 0) {
2114 		        new_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2115 		} else {
2116 		        new_pending_data.scct_timestamp = timestamp;
2117 		}
2118 	});
2119 }
2120 
2121 static uint8_t
2122 sched_clutch_bucket_group_pending_ageout(
2123 	sched_clutch_bucket_group_t clutch_bucket_group,
2124 	uint64_t timestamp)
2125 {
2126 	int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2127 	sched_clutch_counter_time_t old_pending_data;
2128 	sched_clutch_counter_time_t new_pending_data;
2129 	uint8_t cpu_usage_shift = 0;
2130 
2131 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2132 		cpu_usage_shift = 0;
2133 		uint64_t old_pending_ts = old_pending_data.scct_timestamp;
2134 		bool old_update = (old_pending_ts >= timestamp);
2135 		bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2136 		bool no_bucket_load = (bucket_load == 0);
2137 		if (old_update || no_pending_time || no_bucket_load) {
2138 		        os_atomic_rmw_loop_give_up();
2139 		}
2140 
2141 		/* Calculate the time the clutch bucket group has been pending */
2142 		uint64_t pending_delta = timestamp - old_pending_ts;
2143 		uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] * bucket_load;
2144 		if (pending_delta < interactivity_delta) {
2145 		        os_atomic_rmw_loop_give_up();
2146 		}
2147 		cpu_usage_shift = (pending_delta / interactivity_delta);
2148 		new_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2149 		new_pending_data.scct_count = old_pending_data.scct_count;
2150 	});
2151 	return cpu_usage_shift;
2152 }
2153 
2154 static uint8_t
2155 sched_clutch_bucket_group_interactivity_score_calculate(
2156 	sched_clutch_bucket_group_t clutch_bucket_group,
2157 	uint64_t timestamp)
2158 {
2159 	if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
2160 		/*
2161 		 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
2162 		 * priorities, make sure all AboveUI buckets are marked interactive.
2163 		 */
2164 		assert(clutch_bucket_group->scbg_interactivity_data.scct_count == (2 * sched_clutch_bucket_group_interactive_pri));
2165 		return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2166 	}
2167 	/* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */
2168 	uint8_t pending_intervals = sched_clutch_bucket_group_pending_ageout(clutch_bucket_group, timestamp);
2169 	/* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */
2170 	sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group, pending_intervals);
2171 	uint8_t interactivity_score = sched_clutch_interactivity_from_cpu_data(clutch_bucket_group);
2172 	sched_clutch_counter_time_t old_interactivity_data;
2173 	sched_clutch_counter_time_t new_interactivity_data;
2174 
2175 	bool score_updated = os_atomic_rmw_loop(&clutch_bucket_group->scbg_interactivity_data.scct_packed, old_interactivity_data.scct_packed, new_interactivity_data.scct_packed, relaxed, {
2176 		if (old_interactivity_data.scct_timestamp >= timestamp) {
2177 		        os_atomic_rmw_loop_give_up();
2178 		}
2179 		new_interactivity_data.scct_timestamp = timestamp;
2180 		if (old_interactivity_data.scct_timestamp != 0) {
2181 		        new_interactivity_data.scct_count = interactivity_score;
2182 		}
2183 	});
2184 	if (score_updated) {
2185 		return (uint8_t)new_interactivity_data.scct_count;
2186 	} else {
2187 		return (uint8_t)old_interactivity_data.scct_count;
2188 	}
2189 }
2190 
2191 #else /* CONFIG_SCHED_EDGE */
2192 
2193 /*
2194  * For the clutch scheduler, atomicity is ensured by making sure all operations
2195  * are happening under the pset lock of the only cluster present on the platform.
2196  */
2197 static void
2198 sched_clutch_bucket_group_thr_count_inc(
2199 	sched_clutch_bucket_group_t clutch_bucket_group,
2200 	uint64_t timestamp)
2201 {
2202 	sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2203 	if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2204 		clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2205 	}
2206 	clutch_bucket_group->scbg_pending_data.scct_count++;
2207 }
2208 
2209 static void
2210 sched_clutch_bucket_group_thr_count_dec(
2211 	sched_clutch_bucket_group_t clutch_bucket_group,
2212 	uint64_t timestamp)
2213 {
2214 	sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2215 	clutch_bucket_group->scbg_pending_data.scct_count--;
2216 	if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2217 		clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2218 	} else {
2219 		clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2220 	}
2221 }
2222 
2223 static uint8_t
2224 sched_clutch_bucket_group_pending_ageout(
2225 	sched_clutch_bucket_group_t clutch_bucket_group,
2226 	uint64_t timestamp)
2227 {
2228 	sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2229 	int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2230 	uint64_t old_pending_ts = clutch_bucket_group->scbg_pending_data.scct_timestamp;
2231 	bool old_update = (old_pending_ts >= timestamp);
2232 	bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2233 	bool no_bucket_load = (bucket_load == 0);
2234 	if (old_update || no_pending_time || no_bucket_load) {
2235 		return 0;
2236 	}
2237 	uint64_t pending_delta = timestamp - old_pending_ts;
2238 	uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] * bucket_load;
2239 	if (pending_delta < interactivity_delta) {
2240 		return 0;
2241 	}
2242 	uint8_t cpu_usage_shift = (pending_delta / interactivity_delta);
2243 	clutch_bucket_group->scbg_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2244 	return cpu_usage_shift;
2245 }
2246 
2247 static uint8_t
2248 sched_clutch_bucket_group_interactivity_score_calculate(
2249 	sched_clutch_bucket_group_t clutch_bucket_group,
2250 	uint64_t timestamp)
2251 {
2252 	sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2253 	if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) {
2254 		/*
2255 		 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
2256 		 * priorities, make sure all AboveUI buckets are marked interactive.
2257 		 */
2258 		assert(clutch_bucket_group->scbg_interactivity_data.scct_count == (2 * sched_clutch_bucket_group_interactive_pri));
2259 		return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2260 	}
2261 	/* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */
2262 	uint8_t pending_intervals = sched_clutch_bucket_group_pending_ageout(clutch_bucket_group, timestamp);
2263 	/* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */
2264 	sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group, pending_intervals);
2265 	uint8_t interactivity_score = sched_clutch_interactivity_from_cpu_data(clutch_bucket_group);
2266 	if (timestamp > clutch_bucket_group->scbg_interactivity_data.scct_timestamp) {
2267 		clutch_bucket_group->scbg_interactivity_data.scct_count = interactivity_score;
2268 		clutch_bucket_group->scbg_interactivity_data.scct_timestamp = timestamp;
2269 		return interactivity_score;
2270 	} else {
2271 		return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2272 	}
2273 }
2274 
2275 #endif /* CONFIG_SCHED_EDGE */
2276 
2277 /*
2278  * Clutch Bucket Group Run Count and Blocked Time Accounting
2279  *
2280  * The clutch bucket group maintains the number of runnable/running threads in the group.
2281  * Since the blocked time of the clutch bucket group is based on this count, it is
2282  * important to make sure the blocking timestamp and the run count are updated atomically.
2283  *
2284  * Since the run count increments happen without any pset locks held, the scheduler updates
2285  * the count & timestamp using double wide 128 bit atomics.
2286  */
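/*
 * Example of the blocked-time accounting below (hypothetical timestamps): if the
 * last runnable thread of a group blocks at t = 1000 (run count 1 -> 0, so
 * scct_timestamp is set to 1000) and a thread becomes runnable again at t = 1600
 * (run count 0 -> 1), the 600 units of blocked time are added to
 * scbcd_cpu_blocked, clamped to sched_clutch_bucket_group_adjust_threshold, and
 * the timestamp is invalidated until the group blocks again.
 */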
2287 
2288 static uint32_t
2289 sched_clutch_bucket_group_run_count_inc(
2290 	sched_clutch_bucket_group_t clutch_bucket_group)
2291 {
2292 	sched_clutch_counter_time_t old_blocked_data;
2293 	sched_clutch_counter_time_t new_blocked_data;
2294 
2295 	bool update_blocked_time = false;
2296 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2297 		new_blocked_data.scct_count = old_blocked_data.scct_count + 1;
2298 		new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2299 		update_blocked_time = false;
2300 		if (old_blocked_data.scct_count == 0) {
2301 		        new_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
2302 		        update_blocked_time = true;
2303 		}
2304 	});
2305 	if (update_blocked_time && (old_blocked_data.scct_timestamp != SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID)) {
2306 		uint64_t ctime = mach_absolute_time();
2307 		if (ctime > old_blocked_data.scct_timestamp) {
2308 			uint64_t blocked_time = ctime - old_blocked_data.scct_timestamp;
2309 			blocked_time = MIN(blocked_time, sched_clutch_bucket_group_adjust_threshold);
2310 			os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed);
2311 		}
2312 	}
2313 	return (uint32_t)new_blocked_data.scct_count;
2314 }
2315 
2316 static uint32_t
2317 sched_clutch_bucket_group_run_count_dec(
2318 	sched_clutch_bucket_group_t clutch_bucket_group)
2319 {
2320 	sched_clutch_counter_time_t old_blocked_data;
2321 	sched_clutch_counter_time_t new_blocked_data;
2322 
2323 	uint64_t ctime = mach_absolute_time();
2324 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2325 		new_blocked_data.scct_count = old_blocked_data.scct_count - 1;
2326 		new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2327 		if (new_blocked_data.scct_count == 0) {
2328 		        new_blocked_data.scct_timestamp = ctime;
2329 		}
2330 	});
2331 	return (uint32_t)new_blocked_data.scct_count;
2332 }
2333 
2334 /*
2335  * sched_clutch_thread_insert()
2336  *
2337  * Routine to insert a thread into the sched clutch hierarchy.
2338  * Update the counts at all levels of the hierarchy and insert the nodes
2339  * as they become runnable. Always called with the pset lock held.
2340  */
2341 static boolean_t
2342 sched_clutch_thread_insert(
2343 	sched_clutch_root_t root_clutch,
2344 	thread_t thread,
2345 	integer_t options)
2346 {
2347 	boolean_t result = FALSE;
2348 
2349 	sched_clutch_hierarchy_locked_assert(root_clutch);
2350 #if CONFIG_SCHED_EDGE
2351 	sched_edge_cluster_cumulative_count_incr(root_clutch, thread->th_sched_bucket);
2352 	sched_edge_shared_rsrc_runnable_load_incr(root_clutch, thread);
2353 	/*
2354 	 * Check if the thread is bound and is being enqueued in its desired bound cluster.
2355 	 * One scenario where a bound thread might not be getting enqueued in the bound cluster
2356 	 * hierarchy would be if the thread is "soft" bound and the bound cluster is
2357 	 * de-recommended. In that case, the thread should be treated as an unbound
2358 	 * thread.
2359 	 */
2360 	if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) && (sched_edge_thread_bound_cluster_id(thread) == root_clutch->scr_cluster_id)) {
2361 		return sched_edge_bound_thread_insert(root_clutch, thread, options);
2362 	}
2363 	/*
2364 	 * Use bound runqueue for shared resource threads. See "cluster shared resource
2365 	 * threads load balancing" section for details.
2366 	 */
2367 	if (sched_edge_thread_shared_rsrc_type(thread) != CLUSTER_SHARED_RSRC_TYPE_NONE) {
2368 		return sched_edge_bound_thread_insert(root_clutch, thread, options);
2369 	}
2370 
2371 #endif /* CONFIG_SCHED_EDGE */
2372 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
2373 	assert(thread->thread_group == clutch->sc_tg);
2374 
2375 	uint64_t current_timestamp = mach_absolute_time();
2376 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2377 	sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2378 	assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
2379 
2380 	/*
2381 	 * Thread linkage in clutch_bucket
2382 	 *
2383 	 * A thread has a few linkages within the clutch bucket:
2384 	 * - A stable priority queue linkage which is the main runqueue (based on sched_pri) for the clutch bucket
2385 	 * - A regular priority queue linkage which is based on thread's base/promoted pri (used for clutch bucket priority calculation)
2386 	 * - A queue linkage used for timesharing operations of threads at the scheduler tick
2387 	 */
2388 
2389 	/* Insert thread into the clutch_bucket stable priority runqueue using sched_pri */
2390 	thread->th_clutch_runq_link.stamp = current_timestamp;
2391 	priority_queue_entry_set_sched_pri(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link, thread->sched_pri,
2392 	    (options & SCHED_TAILQ) ? PRIORITY_QUEUE_ENTRY_NONE : PRIORITY_QUEUE_ENTRY_PREEMPTED);
2393 	priority_queue_insert(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2394 
2395 	/* Insert thread into clutch_bucket priority queue based on the promoted or base priority */
2396 	priority_queue_entry_set_sched_pri(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link,
2397 	    sched_thread_sched_pri_promoted(thread) ? thread->sched_pri : thread->base_pri, false);
2398 	priority_queue_insert(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2399 
2400 	/* Insert thread into timesharing queue of the clutch bucket */
2401 	enqueue_tail(&clutch_bucket->scb_thread_timeshare_queue, &thread->th_clutch_timeshare_link);
2402 
2403 	/* Increment the urgency counter for the root if necessary */
2404 	sched_clutch_root_urgency_inc(root_clutch, thread);
2405 
2406 	os_atomic_inc(&clutch->sc_thr_count, relaxed);
2407 	sched_clutch_bucket_group_thr_count_inc(clutch_bucket->scb_group, current_timestamp);
2408 
2409 	/* Enqueue the clutch into the hierarchy (if needed) and update properties; pick the insertion order based on thread options */
2410 	sched_clutch_bucket_options_t scb_options = (options & SCHED_HEADQ) ? SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ : SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ;
2411 	if (clutch_bucket->scb_thr_count == 0) {
2412 		sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2413 		sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2414 		result = sched_clutch_bucket_runnable(clutch_bucket, root_clutch, current_timestamp, scb_options);
2415 	} else {
2416 		sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2417 		sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2418 		result = sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, scb_options);
2419 	}
2420 
2421 	KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2422 	    root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2423 	    SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2424 	return result;
2425 }
2426 
2427 /*
2428  * sched_clutch_thread_remove()
2429  *
2430  * Routine to remove a thread from the sched clutch hierarchy.
2431  * Update the counts at all levels of the hierarchy and remove the nodes
2432  * as they become empty. Always called with the pset lock held.
2433  */
2434 static void
2435 sched_clutch_thread_remove(
2436 	sched_clutch_root_t root_clutch,
2437 	thread_t thread,
2438 	uint64_t current_timestamp,
2439 	sched_clutch_bucket_options_t options)
2440 {
2441 	sched_clutch_hierarchy_locked_assert(root_clutch);
2442 #if CONFIG_SCHED_EDGE
2443 	sched_edge_cluster_cumulative_count_decr(root_clutch, thread->th_sched_bucket);
2444 	sched_edge_shared_rsrc_runnable_load_decr(root_clutch, thread);
2445 
2446 	if (thread->th_bound_cluster_enqueued) {
2447 		sched_edge_bound_thread_remove(root_clutch, thread);
2448 		return;
2449 	}
2450 #endif /* CONFIG_SCHED_EDGE */
2451 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
2452 	assert(thread->thread_group == clutch->sc_tg);
2453 	thread_assert_runq_nonnull(thread);
2454 
2455 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2456 	sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2457 	assert(clutch_bucket->scb_root == root_clutch);
2458 
2459 	/* Decrement the urgency counter for the root if necessary */
2460 	sched_clutch_root_urgency_dec(root_clutch, thread);
2461 	/* Remove thread from the clutch_bucket */
2462 	priority_queue_remove(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2463 	remqueue(&thread->th_clutch_timeshare_link);
2464 
2465 	priority_queue_remove(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2466 
2467 	/*
2468 	 * Warning: After this point, the thread's scheduling fields may be
2469 	 * modified by other cores that acquire the thread lock.
2470 	 */
2471 	thread_clear_runq(thread);
2472 
2473 	/* Update counts at various levels of the hierarchy */
2474 	os_atomic_dec(&clutch->sc_thr_count, relaxed);
2475 	sched_clutch_bucket_group_thr_count_dec(clutch_bucket->scb_group, current_timestamp);
2476 	sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
2477 	sched_clutch_thr_count_dec(&clutch_bucket->scb_thr_count);
2478 
2479 	/* Remove the clutch from hierarchy (if needed) and update properties */
2480 	if (clutch_bucket->scb_thr_count == 0) {
2481 		sched_clutch_bucket_empty(clutch_bucket, root_clutch, current_timestamp, options);
2482 	} else {
2483 		sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, options);
2484 	}
2485 
2486 	KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2487 	    root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2488 	    SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2489 }
2490 
2491 /*
2492  * sched_clutch_thread_unbound_lookup()
2493  *
2494  * Routine to find the highest unbound thread in the root clutch.
2495  * Helps find threads easily for steal/migrate scenarios in the
2496  * Edge scheduler.
2497  */
2498 static thread_t
2499 sched_clutch_thread_unbound_lookup(
2500 	sched_clutch_root_t root_clutch,
2501 	sched_clutch_root_bucket_t root_bucket)
2502 {
2503 	sched_clutch_hierarchy_locked_assert(root_clutch);
2504 
2505 	/* Find the highest priority clutch bucket in this root bucket */
2506 	sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_bucket);
2507 	assert(clutch_bucket != NULL);
2508 
2509 	/* Find the highest priority runnable thread in this clutch bucket */
2510 	thread_t thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
2511 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
2512 	    thread_tid(thread), thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, 0, 0);
2513 	return thread;
2514 }
2515 
2516 /*
2517  * sched_clutch_thread_highest_remove()
2518  *
2519  * Routine to find and remove the highest priority thread
2520  * from the sched clutch hierarchy. The algorithm looks at the
2521  * hierarchy for the most eligible runnable thread and calls
2522  * sched_clutch_thread_remove(). Always called with the
2523  * pset lock held.
2524  */
2525 static thread_t
2526 sched_clutch_thread_highest_remove(
2527 	sched_clutch_root_t root_clutch)
2528 {
2529 	sched_clutch_hierarchy_locked_assert(root_clutch);
2530 	uint64_t current_timestamp = mach_absolute_time();
2531 
2532 	sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(root_clutch, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
2533 	if (root_bucket == NULL) {
2534 		return THREAD_NULL;
2535 	}
2536 
2537 	thread_t highest_thread = THREAD_NULL;
2538 	if (root_bucket->scrb_bound) {
2539 		highest_thread = sched_clutch_thread_bound_lookup(root_clutch, root_bucket);
2540 	} else {
2541 		highest_thread = sched_clutch_thread_unbound_lookup(root_clutch, root_bucket);
2542 	}
2543 	sched_clutch_thread_remove(root_clutch, highest_thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
2544 	return highest_thread;
2545 }
2546 
2547 /* High level global accessor routines */
2548 
2549 /*
2550  * sched_clutch_root_urgency()
2551  *
2552  * Routine to get the urgency of the highest runnable
2553  * thread in the hierarchy.
2554  */
2555 static uint32_t
2556 sched_clutch_root_urgency(
2557 	sched_clutch_root_t root_clutch)
2558 {
2559 	return root_clutch->scr_urgency;
2560 }
2561 
2562 /*
2563  * sched_clutch_root_count_sum()
2564  *
2565  * The count_sum mechanism is used for scheduler runq
2566  * statistics calculation. It's only useful for debugging
2567  * purposes; since it takes a mach_absolute_time() call on
2568  * other scheduler implementations, it's better to avoid
2569  * populating this until absolutely necessary.
2570  */
2571 static uint32_t
2572 sched_clutch_root_count_sum(
2573 	__unused sched_clutch_root_t root_clutch)
2574 {
2575 	return 0;
2576 }
2577 
2578 /*
2579  * sched_clutch_root_priority()
2580  *
2581  * Routine to get the priority of the highest runnable
2582  * thread in the hierarchy.
2583  */
2584 static int
2585 sched_clutch_root_priority(
2586 	sched_clutch_root_t root_clutch)
2587 {
2588 	return root_clutch->scr_priority;
2589 }
2590 
2591 /*
2592  * sched_clutch_root_count()
2593  *
2594  * Returns total number of runnable threads in the hierarchy.
2595  */
2596 uint32_t
2597 sched_clutch_root_count(
2598 	sched_clutch_root_t root_clutch)
2599 {
2600 	return root_clutch->scr_thr_count;
2601 }
2602 
2603 #if CONFIG_SCHED_EDGE
2604 
2605 /*
2606  * sched_clutch_root_foreign_empty()
2607  *
2608  * Routine to check if the foreign clutch bucket priority list is empty for a cluster.
2609  */
2610 static boolean_t
2611 sched_clutch_root_foreign_empty(
2612 	sched_clutch_root_t root_clutch)
2613 {
2614 	return priority_queue_empty(&root_clutch->scr_foreign_buckets);
2615 }
2616 
2617 /*
2618  * sched_clutch_root_highest_foreign_thread_remove()
2619  *
2620  * Routine to return the thread in the highest priority clutch bucket in a cluster.
2621  * Routine to remove and return the highest priority thread from the highest priority foreign clutch bucket in a cluster.
2622  */
2623 static thread_t
2624 sched_clutch_root_highest_foreign_thread_remove(
2625 	sched_clutch_root_t root_clutch)
2626 {
2627 	thread_t thread = THREAD_NULL;
2628 	if (priority_queue_empty(&root_clutch->scr_foreign_buckets)) {
2629 		return thread;
2630 	}
2631 	sched_clutch_bucket_t clutch_bucket = priority_queue_max(&root_clutch->scr_foreign_buckets, struct sched_clutch_bucket, scb_foreignlink);
2632 	thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
2633 	sched_clutch_thread_remove(root_clutch, thread, mach_absolute_time(), 0);
2634 	return thread;
2635 }
2636 
2637 #endif /* CONFIG_SCHED_EDGE */
2638 
2639 /*
2640  * sched_clutch_thread_pri_shift()
2641  *
2642  * Routine to get the priority shift value for a thread.
2643  * Since the timesharing is done at the clutch_bucket level,
2644  * this routine gets the clutch_bucket and retrieves the
2645  * values from there.
2646  */
2647 uint32_t
2648 sched_clutch_thread_pri_shift(
2649 	thread_t thread,
2650 	sched_bucket_t bucket)
2651 {
2652 	if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2653 		return INT8_MAX;
2654 	}
2655 	assert(bucket != TH_BUCKET_RUN);
2656 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
2657 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
2658 	return os_atomic_load(&clutch_bucket_group->scbg_pri_shift, relaxed);
2659 }
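
/*
 * Rough sketch of how the pri_shift returned above is consumed by the timeshare
 * priority math elsewhere in the scheduler (approximate, for illustration only):
 *
 *   sched_pri ~= base_pri - (sched_usage >> pri_shift)
 *
 * A larger pri_shift therefore makes accumulated CPU usage decay priority more
 * slowly, and the INT8_MAX returned for ineligible threads effectively disables
 * usage-based decay for them.
 */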
2660 
2661 #pragma mark -- Clutch Scheduler Algorithm
2662 
2663 static void
2664 sched_clutch_init(void);
2665 
2666 static thread_t
2667 sched_clutch_steal_thread(processor_set_t pset);
2668 
2669 static void
2670 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context);
2671 
2672 static boolean_t
2673 sched_clutch_processor_enqueue(processor_t processor, thread_t thread,
2674     sched_options_t options);
2675 
2676 static boolean_t
2677 sched_clutch_processor_queue_remove(processor_t processor, thread_t thread);
2678 
2679 static ast_t
2680 sched_clutch_processor_csw_check(processor_t processor);
2681 
2682 static boolean_t
2683 sched_clutch_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
2684 
2685 static int
2686 sched_clutch_runq_count(processor_t processor);
2687 
2688 static boolean_t
2689 sched_clutch_processor_queue_empty(processor_t processor);
2690 
2691 static uint64_t
2692 sched_clutch_runq_stats_count_sum(processor_t processor);
2693 
2694 static int
2695 sched_clutch_processor_bound_count(processor_t processor);
2696 
2697 static void
2698 sched_clutch_pset_init(processor_set_t pset);
2699 
2700 static void
2701 sched_clutch_processor_init(processor_t processor);
2702 
2703 static thread_t
2704 sched_clutch_choose_thread(processor_t processor, int priority, ast_t reason);
2705 
2706 static void
2707 sched_clutch_processor_queue_shutdown(processor_t processor);
2708 
2709 static sched_mode_t
2710 sched_clutch_initial_thread_sched_mode(task_t parent_task);
2711 
2712 static uint32_t
2713 sched_clutch_initial_quantum_size(thread_t thread);
2714 
2715 static bool
2716 sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread, __unused ast_t reason);
2717 
2718 static uint32_t
2719 sched_clutch_run_incr(thread_t thread);
2720 
2721 static uint32_t
2722 sched_clutch_run_decr(thread_t thread);
2723 
2724 static void
2725 sched_clutch_update_thread_bucket(thread_t thread);
2726 
2727 static void
2728 sched_clutch_thread_group_recommendation_change(struct thread_group *tg, cluster_type_t new_recommendation);
2729 
2730 const struct sched_dispatch_table sched_clutch_dispatch = {
2731 	.sched_name                                     = "clutch",
2732 	.init                                           = sched_clutch_init,
2733 	.timebase_init                                  = sched_timeshare_timebase_init,
2734 	.processor_init                                 = sched_clutch_processor_init,
2735 	.pset_init                                      = sched_clutch_pset_init,
2736 	.maintenance_continuation                       = sched_timeshare_maintenance_continue,
2737 	.choose_thread                                  = sched_clutch_choose_thread,
2738 	.steal_thread_enabled                           = sched_steal_thread_enabled,
2739 	.steal_thread                                   = sched_clutch_steal_thread,
2740 	.compute_timeshare_priority                     = sched_compute_timeshare_priority,
2741 	.choose_node                                    = sched_choose_node,
2742 	.choose_processor                               = choose_processor,
2743 	.processor_enqueue                              = sched_clutch_processor_enqueue,
2744 	.processor_queue_shutdown                       = sched_clutch_processor_queue_shutdown,
2745 	.processor_queue_remove                         = sched_clutch_processor_queue_remove,
2746 	.processor_queue_empty                          = sched_clutch_processor_queue_empty,
2747 	.priority_is_urgent                             = priority_is_urgent,
2748 	.processor_csw_check                            = sched_clutch_processor_csw_check,
2749 	.processor_queue_has_priority                   = sched_clutch_processor_queue_has_priority,
2750 	.initial_quantum_size                           = sched_clutch_initial_quantum_size,
2751 	.initial_thread_sched_mode                      = sched_clutch_initial_thread_sched_mode,
2752 	.can_update_priority                            = can_update_priority,
2753 	.update_priority                                = update_priority,
2754 	.lightweight_update_priority                    = lightweight_update_priority,
2755 	.quantum_expire                                 = sched_default_quantum_expire,
2756 	.processor_runq_count                           = sched_clutch_runq_count,
2757 	.processor_runq_stats_count_sum                 = sched_clutch_runq_stats_count_sum,
2758 	.processor_bound_count                          = sched_clutch_processor_bound_count,
2759 	.thread_update_scan                             = sched_clutch_thread_update_scan,
2760 	.multiple_psets_enabled                         = TRUE,
2761 	.sched_groups_enabled                           = FALSE,
2762 	.avoid_processor_enabled                        = TRUE,
2763 	.thread_avoid_processor                         = sched_clutch_thread_avoid_processor,
2764 	.processor_balance                              = sched_SMT_balance,
2765 
2766 	.rt_runq                                        = sched_rtlocal_runq,
2767 	.rt_init                                        = sched_rtlocal_init,
2768 	.rt_queue_shutdown                              = sched_rtlocal_queue_shutdown,
2769 	.rt_runq_scan                                   = sched_rtlocal_runq_scan,
2770 	.rt_runq_count_sum                              = sched_rtlocal_runq_count_sum,
2771 	.rt_steal_thread                                = sched_rtlocal_steal_thread,
2772 
2773 	.qos_max_parallelism                            = sched_qos_max_parallelism,
2774 	.check_spill                                    = sched_check_spill,
2775 	.ipi_policy                                     = sched_ipi_policy,
2776 	.thread_should_yield                            = sched_thread_should_yield,
2777 	.run_count_incr                                 = sched_clutch_run_incr,
2778 	.run_count_decr                                 = sched_clutch_run_decr,
2779 	.update_thread_bucket                           = sched_clutch_update_thread_bucket,
2780 	.pset_made_schedulable                          = sched_pset_made_schedulable,
2781 	.thread_group_recommendation_change             = sched_clutch_thread_group_recommendation_change,
2782 	.cpu_init_completed                             = NULL,
2783 	.thread_eligible_for_pset                       = NULL,
2784 };
2785 
2786 __attribute__((always_inline))
2787 static inline run_queue_t
2788 sched_clutch_bound_runq(processor_t processor)
2789 {
2790 	return &processor->runq;
2791 }
2792 
2793 __attribute__((always_inline))
2794 static inline sched_clutch_root_t
2795 sched_clutch_processor_root_clutch(processor_t processor)
2796 {
2797 	return &processor->processor_set->pset_clutch_root;
2798 }
2799 
2800 __attribute__((always_inline))
2801 static inline run_queue_t
2802 sched_clutch_thread_bound_runq(processor_t processor, __assert_only thread_t thread)
2803 {
2804 	assert(thread->bound_processor == processor);
2805 	return sched_clutch_bound_runq(processor);
2806 }
2807 
2808 static uint32_t
2809 sched_clutch_initial_quantum_size(thread_t thread)
2810 {
2811 	if (thread == THREAD_NULL) {
2812 		return std_quantum;
2813 	}
2814 	assert(sched_clutch_thread_quantum[thread->th_sched_bucket] <= UINT32_MAX);
2815 	return (uint32_t)sched_clutch_thread_quantum[thread->th_sched_bucket];
2816 }
2817 
2818 static sched_mode_t
2819 sched_clutch_initial_thread_sched_mode(task_t parent_task)
2820 {
2821 	if (parent_task == kernel_task) {
2822 		return TH_MODE_FIXED;
2823 	} else {
2824 		return TH_MODE_TIMESHARE;
2825 	}
2826 }
2827 
2828 static void
2829 sched_clutch_processor_init(processor_t processor)
2830 {
2831 	run_queue_init(&processor->runq);
2832 }
2833 
2834 static void
2835 sched_clutch_pset_init(processor_set_t pset)
2836 {
2837 	sched_clutch_root_init(&pset->pset_clutch_root, pset);
2838 }
2839 
2840 static void
2841 sched_clutch_tunables_init(void)
2842 {
2843 	sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
2844 	sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
2845 	sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
2846 	clock_interval_to_absolutetime_interval(SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS,
2847 	    NSEC_PER_USEC, &sched_clutch_bucket_group_adjust_threshold);
2848 	assert(sched_clutch_bucket_group_adjust_threshold <= CLUTCH_CPU_DATA_MAX);
2849 	sched_clutch_us_to_abstime(sched_clutch_bucket_group_pending_delta_us, sched_clutch_bucket_group_pending_delta);
2850 }
2851 
2852 static void
2853 sched_clutch_init(void)
2854 {
2855 	if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
2856 		sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
2857 	}
2858 	sched_timeshare_init();
2859 	sched_clutch_tunables_init();
2860 }
2861 
2862 static thread_t
2863 sched_clutch_choose_thread(
2864 	processor_t      processor,
2865 	int              priority,
2866 	__unused ast_t            reason)
2867 {
2868 	int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
2869 	uint32_t clutch_count = sched_clutch_root_count(sched_clutch_processor_root_clutch(processor));
2870 	run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2871 	boolean_t choose_from_boundq = false;
2872 
2873 	if (bound_runq->highq < priority &&
2874 	    clutch_pri < priority) {
2875 		return THREAD_NULL;
2876 	}
2877 
2878 	if (bound_runq->count && clutch_count) {
2879 		if (bound_runq->highq >= clutch_pri) {
2880 			choose_from_boundq = true;
2881 		}
2882 	} else if (bound_runq->count) {
2883 		choose_from_boundq = true;
2884 	} else if (clutch_count) {
2885 		choose_from_boundq = false;
2886 	} else {
2887 		return THREAD_NULL;
2888 	}
2889 
2890 	thread_t thread = THREAD_NULL;
2891 	if (choose_from_boundq == false) {
2892 		sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
2893 		thread = sched_clutch_thread_highest_remove(pset_clutch_root);
2894 	} else {
2895 		thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
2896 	}
2897 	return thread;
2898 }
2899 
2900 static boolean_t
2901 sched_clutch_processor_enqueue(
2902 	processor_t       processor,
2903 	thread_t          thread,
2904 	sched_options_t   options)
2905 {
2906 	boolean_t       result;
2907 
2908 	thread_set_runq_locked(thread, processor);
2909 	if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2910 		sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
2911 		result = sched_clutch_thread_insert(pset_clutch_root, thread, options);
2912 	} else {
2913 		run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
2914 		result = run_queue_enqueue(rq, thread, options);
2915 	}
2916 	return result;
2917 }
2918 
2919 static boolean_t
2920 sched_clutch_processor_queue_empty(processor_t processor)
2921 {
2922 	return sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0 &&
2923 	       sched_clutch_bound_runq(processor)->count == 0;
2924 }
2925 
2926 static ast_t
2927 sched_clutch_processor_csw_check(processor_t processor)
2928 {
2929 	boolean_t       has_higher;
2930 	int             pri;
2931 
2932 	if (sched_clutch_thread_avoid_processor(processor, current_thread(), AST_NONE)) {
2933 		return AST_PREEMPT | AST_URGENT;
2934 	}
2935 
2936 	run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2937 	int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
2938 
2939 	assert(processor->active_thread != NULL);
2940 
2941 	pri = MAX(clutch_pri, bound_runq->highq);
2942 
2943 	if (processor->first_timeslice) {
2944 		has_higher = (pri > processor->current_pri);
2945 	} else {
2946 		has_higher = (pri >= processor->current_pri);
2947 	}
2948 
2949 	if (has_higher) {
2950 		if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
2951 			return AST_PREEMPT | AST_URGENT;
2952 		}
2953 
2954 		if (bound_runq->urgency > 0) {
2955 			return AST_PREEMPT | AST_URGENT;
2956 		}
2957 
2958 		return AST_PREEMPT;
2959 	}
2960 
2961 	return AST_NONE;
2962 }
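
/*
 * Worked example for the preemption check above (illustrative only): with the running
 * thread at priority 31 and a clutch root priority of 31, no AST is requested while
 * processor->first_timeslice is set (strict > comparison), but AST_PREEMPT is returned
 * once the first timeslice is consumed (>= comparison); AST_URGENT is added only when
 * the hierarchy or the bound runqueue reports urgent work.
 */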
2963 
2964 static boolean_t
2965 sched_clutch_processor_queue_has_priority(processor_t    processor,
2966     int            priority,
2967     boolean_t      gte)
2968 {
2969 	run_queue_t bound_runq = sched_clutch_bound_runq(processor);
2970 
2971 	int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);
2972 
2973 	if (gte) {
2974 		return qpri >= priority;
2975 	} else {
2976 		return qpri > priority;
2977 	}
2978 }
2979 
2980 static int
2981 sched_clutch_runq_count(processor_t processor)
2982 {
2983 	return (int)sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) + sched_clutch_bound_runq(processor)->count;
2984 }
2985 
2986 static uint64_t
2987 sched_clutch_runq_stats_count_sum(processor_t processor)
2988 {
2989 	uint64_t bound_sum = sched_clutch_bound_runq(processor)->runq_stats.count_sum;
2990 
2991 	if (processor->cpu_id == processor->processor_set->cpu_set_low) {
2992 		return bound_sum + sched_clutch_root_count_sum(sched_clutch_processor_root_clutch(processor));
2993 	} else {
2994 		return bound_sum;
2995 	}
2996 }
2997 static int
2998 sched_clutch_processor_bound_count(processor_t processor)
2999 {
3000 	return sched_clutch_bound_runq(processor)->count;
3001 }
3002 
3003 static void
3004 sched_clutch_processor_queue_shutdown(processor_t processor)
3005 {
3006 	processor_set_t pset = processor->processor_set;
3007 	sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3008 	thread_t        thread;
3009 	queue_head_t    tqueue;
3010 
3011 	/* We only need to migrate threads if this is the last active processor in the pset */
3012 	if (pset->online_processor_count > 0) {
3013 		pset_unlock(pset);
3014 		return;
3015 	}
3016 
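	/*
	 * Drain the hierarchy into a local queue while the pset lock is held, then drop
	 * the lock before re-dispatching: thread_setrun() takes the thread lock and may
	 * need to lock a different pset when choosing a new processor.
	 */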
3017 	queue_init(&tqueue);
3018 	while (sched_clutch_root_count(pset_clutch_root) > 0) {
3019 		thread = sched_clutch_thread_highest_remove(pset_clutch_root);
3020 		enqueue_tail(&tqueue, &thread->runq_links);
3021 	}
3022 
3023 	pset_unlock(pset);
3024 
3025 	qe_foreach_element_safe(thread, &tqueue, runq_links) {
3026 		remqueue(&thread->runq_links);
3027 		thread_lock(thread);
3028 		thread_setrun(thread, SCHED_TAILQ);
3029 		thread_unlock(thread);
3030 	}
3031 }
3032 
3033 static boolean_t
3034 sched_clutch_processor_queue_remove(
3035 	processor_t processor,
3036 	thread_t    thread)
3037 {
3038 	processor_set_t         pset = processor->processor_set;
3039 
3040 	pset_lock(pset);
3041 
3042 	if (processor == thread_get_runq_locked(thread)) {
3043 		/*
3044 		 * Thread is on a run queue and we have a lock on
3045 		 * that run queue.
3046 		 */
3047 		if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3048 			sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3049 			sched_clutch_thread_remove(pset_clutch_root, thread, mach_absolute_time(), SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
3050 		} else {
3051 			run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
3052 			run_queue_remove(rq, thread);
3053 		}
3054 	} else {
3055 		/*
3056 		 * The thread left the run queue before we could
3057 		 * lock the run queue.
3058 		 */
3059 		thread_assert_runq_null(thread);
3060 		processor = PROCESSOR_NULL;
3061 	}
3062 
3063 	pset_unlock(pset);
3064 
3065 	return processor != PROCESSOR_NULL;
3066 }
3067 
3068 static thread_t
3069 sched_clutch_steal_thread(__unused processor_set_t pset)
3070 {
3071 	/* Thread stealing is not enabled for single cluster clutch scheduler platforms */
3072 	return THREAD_NULL;
3073 }
3074 
3075 static void
3076 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context)
3077 {
3078 	boolean_t               restart_needed = FALSE;
3079 	processor_t             processor = processor_list;
3080 	processor_set_t         pset;
3081 	thread_t                thread;
3082 	spl_t                   s;
3083 
3084 	/*
3085 	 *  We update the threads associated with each processor (bound and idle threads)
3086 	 *  and then update the threads in each pset runqueue.
3087 	 */
3088 
3089 	do {
3090 		do {
3091 			pset = processor->processor_set;
3092 
3093 			s = splsched();
3094 			pset_lock(pset);
3095 
3096 			restart_needed = runq_scan(sched_clutch_bound_runq(processor), scan_context);
3097 
3098 			pset_unlock(pset);
3099 			splx(s);
3100 
3101 			if (restart_needed) {
3102 				break;
3103 			}
3104 
3105 			thread = processor->idle_thread;
3106 			if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
3107 				if (thread_update_add_thread(thread) == FALSE) {
3108 					restart_needed = TRUE;
3109 					break;
3110 				}
3111 			}
3112 		} while ((processor = processor->processor_list) != NULL);
3113 
3114 		/* Ok, we now have a collection of candidates -- fix them. */
3115 		thread_update_process_threads();
3116 	} while (restart_needed);
3117 
3118 	pset_node_t node = &pset_node0;
3119 	pset = node->psets;
3120 
3121 	do {
3122 		do {
3123 			restart_needed = FALSE;
3124 			while (pset != NULL) {
3125 				s = splsched();
3126 				pset_lock(pset);
3127 
3128 				if (sched_clutch_root_count(&pset->pset_clutch_root) > 0) {
3129 					for (sched_bucket_t bucket = TH_BUCKET_SHARE_FG; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
3130 						restart_needed = runq_scan(&pset->pset_clutch_root.scr_bound_buckets[bucket].scrb_bound_thread_runq, scan_context);
3131 						if (restart_needed) {
3132 							break;
3133 						}
3134 					}
3135 					queue_t clutch_bucket_list = &pset->pset_clutch_root.scr_clutch_buckets;
3136 					sched_clutch_bucket_t clutch_bucket;
3137 					qe_foreach_element(clutch_bucket, clutch_bucket_list, scb_listlink) {
3138 						sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, scan_context->sched_tick_last_abstime);
3139 						restart_needed = sched_clutch_timeshare_scan(&clutch_bucket->scb_thread_timeshare_queue, clutch_bucket->scb_thr_count, scan_context);
3140 						if (restart_needed) {
3141 							break;
3142 						}
3143 					}
3144 				}
3145 
3146 				pset_unlock(pset);
3147 				splx(s);
3148 
3149 				if (restart_needed) {
3150 					break;
3151 				}
3152 				pset = pset->pset_list;
3153 			}
3154 
3155 			if (restart_needed) {
3156 				break;
3157 			}
3158 		} while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
3159 
3160 		/* Ok, we now have a collection of candidates -- fix them. */
3161 		thread_update_process_threads();
3162 	} while (restart_needed);
3163 }
3164 
3165 extern int sched_allow_rt_smt;
3166 
3167 /* Return true if this thread should not continue running on this processor */
3168 static bool
3169 sched_clutch_thread_avoid_processor(processor_t processor, thread_t thread, __unused ast_t reason)
3170 {
3171 	if (processor->processor_primary != processor) {
3172 		/*
3173 		 * This is a secondary SMT processor.  If the primary is running
3174 		 * a realtime thread, only allow realtime threads on the secondary.
3175 		 */
3176 		if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
3177 			return true;
3178 		}
3179 	}
3180 
3181 	return false;
3182 }
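
/*
 * Example of the rule above (illustrative): when the primary of an SMT pair is running
 * a realtime thread (current_pri >= BASEPRI_RTQUEUES), a timeshare thread on the
 * secondary is told to move away; a realtime thread may remain on the secondary only
 * if the sched_allow_rt_smt tunable is set.
 */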
3183 
3184 /*
3185  * For the clutch scheduler, the run counts are maintained in the clutch
3186  * buckets (i.e. the thread group scheduling structure).
3187  */
3188 static uint32_t
3189 sched_clutch_run_incr(thread_t thread)
3190 {
3191 	assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
3192 	uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3193 	sched_clutch_thread_run_bucket_incr(thread, thread->th_sched_bucket);
3194 	return new_count;
3195 }
3196 
3197 static uint32_t
3198 sched_clutch_run_decr(thread_t thread)
3199 {
3200 	assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);
3201 	uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3202 	sched_clutch_thread_run_bucket_decr(thread, thread->th_sched_bucket);
3203 	return new_count;
3204 }
3205 
3206 /*
3207  * For threads that have changed sched_pri without changing the
3208  * base_pri for any reason other than decay, use the sched_pri
3209  * as the bucketizing priority instead of base_pri. All such
3210  * changes are typically due to kernel locking primitive boosts
3211  * or demotions.
3212  */
3213 static boolean_t
3214 sched_thread_sched_pri_promoted(thread_t thread)
3215 {
3216 	return (thread->sched_flags & TH_SFLAG_PROMOTED) ||
3217 	       (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) ||
3218 	       (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) ||
3219 	       (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) ||
3220 	       (thread->kern_promotion_schedpri != 0);
3221 }
3222 
3223 /*
3224  * Routine to update the scheduling bucket for the thread.
3225  *
3226  * In the clutch scheduler implementation, the thread's bucket
3227  * is based on sched_pri if it was promoted due to a kernel
3228  * primitive; otherwise it's based on the thread's base_pri. This
3229  * enhancement allows promoted threads to reach a higher priority
3230  * bucket and potentially get selected sooner for scheduling.
3231  *
3232  * Also, the clutch scheduler does not honor fixed priority below
3233  * FG priority. It simply puts those threads in the corresponding
3234  * timeshare bucket. The reason for doing that is that it is
3235  * extremely hard to define the scheduling properties of such threads
3236  * and they typically lead to performance issues.
3237  */
3238 
3239 void
3240 sched_clutch_update_thread_bucket(thread_t thread)
3241 {
3242 	sched_bucket_t old_bucket = thread->th_sched_bucket;
3243 	thread_assert_runq_null(thread);
3244 	int pri = (sched_thread_sched_pri_promoted(thread)) ? thread->sched_pri : thread->base_pri;
3245 	sched_bucket_t new_bucket = sched_clutch_thread_bucket_map(thread, pri);
3246 
3247 	if (old_bucket == new_bucket) {
3248 		return;
3249 	}
3250 
3251 	thread->th_sched_bucket = new_bucket;
3252 	thread->pri_shift = sched_clutch_thread_pri_shift(thread, new_bucket);
3253 	/*
3254 	 * Since this is called after the thread has been removed from the runq,
3255 	 * only the run counts need to be updated. The re-insert into the runq
3256 	 * would put the thread into the correct new bucket's runq.
3257 	 */
3258 	if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
3259 		sched_clutch_thread_run_bucket_decr(thread, old_bucket);
3260 		sched_clutch_thread_run_bucket_incr(thread, new_bucket);
3261 	}
3262 }
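
/*
 * Illustrative scenario for the bucket update above: a timeshare thread with base_pri 31
 * that gets promoted to priority 80 by a kernel mutex is mapped using 80 and may move to
 * a higher bucket; once the promotion is dropped, the next update re-maps it using
 * base_pri and the run counts shift back to the original bucket.
 */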
3263 
3264 static void
3265 sched_clutch_thread_group_recommendation_change(__unused struct thread_group *tg, __unused cluster_type_t new_recommendation)
3266 {
3267 	/* The clutch scheduler ignores the recommendation because it does not migrate
3268 	 * threads between cluster types independently of the Edge scheduler.
3269 	 */
3270 }
3271 
3272 #if CONFIG_SCHED_EDGE
3273 
3274 /* Implementation of the AMP version of the clutch scheduler */
3275 
3276 static void
3277 sched_edge_init(void);
3278 
3279 static void
3280 sched_edge_pset_init(processor_set_t pset);
3281 
3282 static thread_t
3283 sched_edge_processor_idle(processor_set_t pset);
3284 
3285 static ast_t
3286 sched_edge_processor_csw_check(processor_t processor);
3287 
3288 static boolean_t
3289 sched_edge_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
3290 
3291 static boolean_t
3292 sched_edge_processor_queue_empty(processor_t processor);
3293 
3294 static thread_t
3295 sched_edge_choose_thread(processor_t processor, int priority, ast_t reason);
3296 
3297 static void
3298 sched_edge_processor_queue_shutdown(processor_t processor);
3299 
3300 static processor_t
3301 sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread);
3302 
3303 static bool
3304 sched_edge_thread_avoid_processor(processor_t processor, thread_t thread, ast_t reason);
3305 
3306 static bool
3307 sched_edge_balance(processor_t cprocessor, processor_set_t cpset);
3308 
3309 static void
3310 sched_edge_check_spill(processor_set_t pset, thread_t thread);
3311 
3312 static bool
3313 sched_edge_thread_should_yield(processor_t processor, thread_t thread);
3314 
3315 static void
3316 sched_edge_pset_made_schedulable(processor_t processor, processor_set_t dst_pset, boolean_t drop_lock);
3317 
3318 static void
3319 sched_edge_cpu_init_completed(void);
3320 
3321 static bool
3322 sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset);
3323 
3324 static bool
3325 sched_edge_steal_thread_enabled(processor_set_t pset);
3326 
3327 static sched_ipi_type_t
3328 sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event);
3329 
3330 static uint32_t
3331 sched_edge_qos_max_parallelism(int qos, uint64_t options);
3332 
3333 static uint32_t
3334 sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket);
3335 
3336 const struct sched_dispatch_table sched_edge_dispatch = {
3337 	.sched_name                                     = "edge",
3338 	.init                                           = sched_edge_init,
3339 	.timebase_init                                  = sched_timeshare_timebase_init,
3340 	.processor_init                                 = sched_clutch_processor_init,
3341 	.pset_init                                      = sched_edge_pset_init,
3342 	.maintenance_continuation                       = sched_timeshare_maintenance_continue,
3343 	.choose_thread                                  = sched_edge_choose_thread,
3344 	.steal_thread_enabled                           = sched_edge_steal_thread_enabled,
3345 	.steal_thread                                   = sched_edge_processor_idle,
3346 	.compute_timeshare_priority                     = sched_compute_timeshare_priority,
3347 	.choose_node                                    = sched_choose_node,
3348 	.choose_processor                               = sched_edge_choose_processor,
3349 	.processor_enqueue                              = sched_clutch_processor_enqueue,
3350 	.processor_queue_shutdown                       = sched_edge_processor_queue_shutdown,
3351 	.processor_queue_remove                         = sched_clutch_processor_queue_remove,
3352 	.processor_queue_empty                          = sched_edge_processor_queue_empty,
3353 	.priority_is_urgent                             = priority_is_urgent,
3354 	.processor_csw_check                            = sched_edge_processor_csw_check,
3355 	.processor_queue_has_priority                   = sched_edge_processor_queue_has_priority,
3356 	.initial_quantum_size                           = sched_clutch_initial_quantum_size,
3357 	.initial_thread_sched_mode                      = sched_clutch_initial_thread_sched_mode,
3358 	.can_update_priority                            = can_update_priority,
3359 	.update_priority                                = update_priority,
3360 	.lightweight_update_priority                    = lightweight_update_priority,
3361 	.quantum_expire                                 = sched_default_quantum_expire,
3362 	.processor_runq_count                           = sched_clutch_runq_count,
3363 	.processor_runq_stats_count_sum                 = sched_clutch_runq_stats_count_sum,
3364 	.processor_bound_count                          = sched_clutch_processor_bound_count,
3365 	.thread_update_scan                             = sched_clutch_thread_update_scan,
3366 	.multiple_psets_enabled                         = TRUE,
3367 	.sched_groups_enabled                           = FALSE,
3368 	.avoid_processor_enabled                        = TRUE,
3369 	.thread_avoid_processor                         = sched_edge_thread_avoid_processor,
3370 	.processor_balance                              = sched_edge_balance,
3371 
3372 	.rt_runq                                        = sched_rtlocal_runq,
3373 	.rt_init                                        = sched_rtlocal_init,
3374 	.rt_queue_shutdown                              = sched_rtlocal_queue_shutdown,
3375 	.rt_runq_scan                                   = sched_rtlocal_runq_scan,
3376 	.rt_runq_count_sum                              = sched_rtlocal_runq_count_sum,
3377 	.rt_steal_thread                                = sched_rtlocal_steal_thread,
3378 
3379 	.qos_max_parallelism                            = sched_edge_qos_max_parallelism,
3380 	.check_spill                                    = sched_edge_check_spill,
3381 	.ipi_policy                                     = sched_edge_ipi_policy,
3382 	.thread_should_yield                            = sched_edge_thread_should_yield,
3383 	.run_count_incr                                 = sched_clutch_run_incr,
3384 	.run_count_decr                                 = sched_clutch_run_decr,
3385 	.update_thread_bucket                           = sched_clutch_update_thread_bucket,
3386 	.pset_made_schedulable                          = sched_edge_pset_made_schedulable,
3387 	.thread_group_recommendation_change             = NULL,
3388 	.cpu_init_completed                             = sched_edge_cpu_init_completed,
3389 	.thread_eligible_for_pset                       = sched_edge_thread_eligible_for_pset,
3390 };
3391 
3392 static bitmap_t sched_edge_available_pset_bitmask[BITMAP_LEN(MAX_PSETS)];
3393 
3394 /*
3395  * sched_edge_pset_available()
3396  *
3397  * Routine to determine if a pset is available for scheduling.
3398  */
3399 static bool
3400 sched_edge_pset_available(processor_set_t pset)
3401 {
3402 	if (pset == NULL) {
3403 		return false;
3404 	}
3405 	return pset_available_cpu_count(pset) > 0;
3406 }
3407 
3408 /*
3409  * sched_edge_thread_bound_cluster_id()
3410  *
3411  * Routine to determine which cluster a particular thread is bound to. Uses
3412  * the th_bound_cluster_id field on the thread to map back to a specific cluster id.
3413  *
3414  * <Edge Multi-cluster Support Needed>
3415  */
3416 static uint32_t
3417 sched_edge_thread_bound_cluster_id(thread_t thread)
3418 {
3419 	assert(SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread));
3420 	return thread->th_bound_cluster_id;
3421 }
3422 
3423 /* Forward declaration for some thread migration routines */
3424 static boolean_t sched_edge_foreign_runnable_thread_available(processor_set_t pset);
3425 static boolean_t sched_edge_foreign_running_thread_available(processor_set_t pset);
3426 static processor_set_t sched_edge_steal_candidate(processor_set_t pset);
3427 static processor_set_t sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks);
3428 
3429 /*
3430  * sched_edge_config_set()
3431  *
3432  * Support to update an edge configuration. Typically used by CLPC to affect thread migration
3433  * policies in the scheduler.
3434  */
3435 static void
3436 sched_edge_config_set(uint32_t src_cluster, uint32_t dst_cluster, sched_clutch_edge edge_config)
3437 {
3438 	sched_clutch_edge *edge = &pset_array[src_cluster]->sched_edges[dst_cluster];
3439 	edge->sce_edge_packed = edge_config.sce_edge_packed;
3440 }
3441 
3442 /*
3443  * sched_edge_config_get()
3444  *
3445  * Support to get an edge configuration. Typically used by CLPC to query edge configs to decide
3446  * if it needs to update edges.
3447  */
3448 static sched_clutch_edge
3449 sched_edge_config_get(uint32_t src_cluster, uint32_t dst_cluster)
3450 {
3451 	return pset_array[src_cluster]->sched_edges[dst_cluster];
3452 }
3453 
3454 /*
3455  * sched_edge_matrix_set()
3456  *
3457  * Routine to update various edges in the cluster edge matrix. The edge_changes_bitmap
3458  * indicates which edges need to be updated. Both the edge_matrix & edge_changes_bitmap
3459  * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
3460  */
3461 void
3462 sched_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changes_bitmap, __unused uint64_t flags, uint64_t matrix_order)
3463 {
3464 	uint32_t edge_index = 0;
3465 	for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
3466 		for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
3467 			if (edge_changes_bitmap[edge_index]) {
3468 				sched_edge_config_set(src_cluster, dst_cluster, edge_matrix[edge_index]);
3469 			}
3470 			edge_index++;
3471 		}
3472 	}
3473 }
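
/*
 * Minimal sketch of how a caller might update a single edge through the flattened
 * matrix interface above (variable names here are purely illustrative):
 *
 *   uint64_t order = sched_edge_max_clusters;
 *   uint64_t idx   = (uint64_t)src * order + dst;   // row-major flattening used above
 *   edge_matrix[idx] = new_edge_config;
 *   edge_changes_bitmap[idx] = true;
 *   sched_edge_matrix_set(edge_matrix, edge_changes_bitmap, 0, order);
 */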
3474 
3475 /*
3476  * sched_edge_matrix_get()
3477  *
3478  * Routine to retrieve various edges in the cluster edge matrix. The edge_request_bitmap
3479  * indicates which edges need to be retrieved. Both the edge_matrix & edge_request_bitmap
3480  * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array.
3481  */
3482 void
3483 sched_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_request_bitmap, __unused uint64_t flags, uint64_t matrix_order)
3484 {
3485 	uint32_t edge_index = 0;
3486 	for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) {
3487 		for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) {
3488 			if (edge_request_bitmap[edge_index]) {
3489 				edge_matrix[edge_index] = sched_edge_config_get(src_cluster, dst_cluster);
3490 			}
3491 			edge_index++;
3492 		}
3493 	}
3494 }
3495 
3496 /*
3497  * sched_edge_init()
3498  *
3499  * Routine to initialize the data structures for the Edge scheduler.
3500  */
3501 static void
3502 sched_edge_init(void)
3503 {
3504 	if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
3505 		sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
3506 	}
3507 	sched_timeshare_init();
3508 	sched_clutch_tunables_init();
3509 	sched_edge_max_clusters = ml_get_cluster_count();
3510 }
3511 
3512 static void
3513 sched_edge_pset_init(processor_set_t pset)
3514 {
3515 	uint32_t pset_cluster_id = pset->pset_cluster_id;
3516 	pset->pset_type = (pset->pset_cluster_type == PSET_AMP_P) ? CLUSTER_TYPE_P : CLUSTER_TYPE_E;
3517 
3518 	/* Set the edge weight and properties for the pset itself */
3519 	bitmap_clear(pset->foreign_psets, pset_cluster_id);
3520 	bitmap_clear(pset->native_psets, pset_cluster_id);
3521 	bitmap_clear(pset->local_psets, pset_cluster_id);
3522 	bitmap_clear(pset->remote_psets, pset_cluster_id);
3523 	pset->sched_edges[pset_cluster_id].sce_edge_packed = (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0}.sce_edge_packed;
3524 	sched_clutch_root_init(&pset->pset_clutch_root, pset);
3525 	bitmap_set(sched_edge_available_pset_bitmask, pset_cluster_id);
3526 }
3527 
3528 static thread_t
3529 sched_edge_choose_thread(
3530 	processor_t      processor,
3531 	int              priority,
3532 	__unused ast_t            reason)
3533 {
3534 	int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
3535 	run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3536 	boolean_t choose_from_boundq = false;
3537 
3538 	if ((bound_runq->highq < priority) &&
3539 	    (clutch_pri < priority)) {
3540 		return THREAD_NULL;
3541 	}
3542 
3543 	if (bound_runq->highq >= clutch_pri) {
3544 		choose_from_boundq = true;
3545 	}
3546 
3547 	thread_t thread = THREAD_NULL;
3548 	if (choose_from_boundq == false) {
3549 		sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3550 		thread = sched_clutch_thread_highest_remove(pset_clutch_root);
3551 	} else {
3552 		thread = run_queue_dequeue(bound_runq, SCHED_HEADQ);
3553 	}
3554 	return thread;
3555 }
3556 
3557 static boolean_t
3558 sched_edge_processor_queue_empty(processor_t processor)
3559 {
3560 	return (sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0) &&
3561 	       (sched_clutch_bound_runq(processor)->count == 0);
3562 }
3563 
3564 static void
3565 sched_edge_check_spill(__unused processor_set_t pset, __unused thread_t thread)
3566 {
3567 	assert(thread->bound_processor == PROCESSOR_NULL);
3568 }
3569 
3570 __options_decl(sched_edge_thread_yield_reason_t, uint32_t, {
3571 	SCHED_EDGE_YIELD_RUNQ_NONEMPTY       = 0x0,
3572 	SCHED_EDGE_YIELD_FOREIGN_RUNNABLE    = 0x1,
3573 	SCHED_EDGE_YIELD_FOREIGN_RUNNING     = 0x2,
3574 	SCHED_EDGE_YIELD_STEAL_POSSIBLE      = 0x3,
3575 	SCHED_EDGE_YIELD_DISALLOW            = 0x4,
3576 });
3577 
3578 static bool
3579 sched_edge_thread_should_yield(processor_t processor, __unused thread_t thread)
3580 {
3581 	if (!sched_edge_processor_queue_empty(processor) || (rt_runq_count(processor->processor_set) > 0)) {
3582 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3583 		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_RUNQ_NONEMPTY);
3584 		return true;
3585 	}
3586 
3587 	/*
3588 	 * The yield logic should follow the same logic that steal_thread() does, since
3589 	 * thread_should_yield() is effectively a quick check of whether any other
3590 	 * thread would execute on this CPU if the current thread gave up the CPU.
3591 	 * So it needs to provide the same answer as the steal_thread()/
3592 	 * processor idle logic.
3593 	 */
3594 	if (sched_edge_foreign_runnable_thread_available(processor->processor_set)) {
3595 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3596 		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNABLE);
3597 		return true;
3598 	}
3599 	if (sched_edge_foreign_running_thread_available(processor->processor_set)) {
3600 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3601 		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNING);
3602 		return true;
3603 	}
3604 
3605 	processor_set_t steal_candidate = sched_edge_steal_candidate(processor->processor_set);
3606 	if (steal_candidate != NULL) {
3607 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
3608 		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_STEAL_POSSIBLE);
3609 		return true;
3610 	}
3611 
3612 	KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE, thread_tid(thread), processor->processor_set->pset_cluster_id,
3613 	    0, SCHED_EDGE_YIELD_DISALLOW);
3614 	return false;
3615 }
3616 
3617 static ast_t
3618 sched_edge_processor_csw_check(processor_t processor)
3619 {
3620 	boolean_t       has_higher;
3621 	int             pri;
3622 
3623 	int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor));
3624 	run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3625 
3626 	assert(processor->active_thread != NULL);
3627 
3628 	pri = MAX(clutch_pri, bound_runq->highq);
3629 
3630 	if (processor->first_timeslice) {
3631 		has_higher = (pri > processor->current_pri);
3632 	} else {
3633 		has_higher = (pri >= processor->current_pri);
3634 	}
3635 
3636 	if (has_higher) {
3637 		if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) {
3638 			return AST_PREEMPT | AST_URGENT;
3639 		}
3640 
3641 		if (bound_runq->urgency > 0) {
3642 			return AST_PREEMPT | AST_URGENT;
3643 		}
3644 
3645 		return AST_PREEMPT;
3646 	}
3647 
3648 	return AST_NONE;
3649 }
3650 
3651 static boolean_t
3652 sched_edge_processor_queue_has_priority(processor_t    processor,
3653     int            priority,
3654     boolean_t      gte)
3655 {
3656 	run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3657 
3658 	int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq);
3659 	if (gte) {
3660 		return qpri >= priority;
3661 	} else {
3662 		return qpri > priority;
3663 	}
3664 }
3665 
3666 static void
3667 sched_edge_processor_queue_shutdown(processor_t processor)
3668 {
3669 	processor_set_t pset = processor->processor_set;
3670 	sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3671 	thread_t        thread;
3672 	queue_head_t    tqueue;
3673 
3674 	/* We only need to migrate threads if this is the last active or last recommended processor in the pset */
3675 	if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
3676 		pset_unlock(pset);
3677 		return;
3678 	}
3679 
3680 	bitmap_clear(sched_edge_available_pset_bitmask, pset->pset_cluster_id);
3681 
3682 	queue_init(&tqueue);
3683 	while (sched_clutch_root_count(pset_clutch_root) > 0) {
3684 		thread = sched_clutch_thread_highest_remove(pset_clutch_root);
3685 		enqueue_tail(&tqueue, &thread->runq_links);
3686 	}
3687 	pset_unlock(pset);
3688 
3689 	qe_foreach_element_safe(thread, &tqueue, runq_links) {
3690 		remqueue(&thread->runq_links);
3691 		thread_lock(thread);
3692 		thread_setrun(thread, SCHED_TAILQ);
3693 		thread_unlock(thread);
3694 	}
3695 }
3696 
3697 /*
3698  * sched_edge_cluster_load_metric()
3699  *
3700  * The load metric for a cluster is a measure of the average scheduling latency
3701  * experienced by threads on that cluster. It is a product of the average number
3702  * of threads in the runqueue and the average execution time for threads. The metric
3703  * has special values in the following cases:
3704  * - UINT32_MAX: If the cluster is not available for scheduling, its load is set to
3705  *   the maximum value to disallow any threads from migrating to this cluster.
3706  * - 0: If there are idle CPUs in the cluster or an empty runqueue; this allows threads
3707  *   to be spread across the platform quickly for ncpu wide workloads.
3708  */
3709 static uint32_t
3710 sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket)
3711 {
3712 	if (sched_edge_pset_available(pset) == false) {
3713 		return UINT32_MAX;
3714 	}
3715 	return (uint32_t)sched_get_pset_load_average(pset, sched_bucket);
3716 }
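/*
 * Illustrative note (added commentary, not part of the original source): callers in
 * this file interpret the metric as follows. sched_edge_steal_possible() treats a
 * non-zero load on a heterogeneous candidate cluster as permission to steal, while a
 * load of 0 on the preferred cluster (idle CPUs or an empty runqueue) causes
 * sched_edge_migrate_candidate() to keep the thread on the preferred cluster without
 * evaluating any edges. A non-zero preferred-cluster load triggers
 * sched_edge_migrate_edges_evaluate(), which compares this metric across clusters
 * against the configured edge migration weights.
 */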
3717 
3718 /*
3719  *
3720  * Edge Scheduler Steal/Rebalance logic
3721  *
3722  * = Generic scheduler logic =
3723  *
3724  * The SCHED(steal_thread) scheduler callout is invoked when the processor does not
3725  * find any thread for execution in its runqueue. The aim of the steal operation
3726  * is to find other threads running/runnable in other clusters which should be
3727  * executed here.
3728  *
3729  * If the steal callout does not return a thread, the thread_select() logic calls
3730  * SCHED(processor_balance) callout which is supposed to IPI other CPUs to rebalance
3731  * threads and idle out the current CPU.
3732  *
3733  * = SCHED(steal_thread) for Edge Scheduler =
3734  *
3735  * The edge scheduler hooks into sched_edge_processor_idle() for steal_thread. This
3736  * routine tries to do the following operations in order:
3737  * (1) Find foreign runnable threads in non-native cluster
3738  *     runqueues (sched_edge_foreign_runnable_thread_remove())
3739  * (2) Check if foreign threads are running on the non-native
3740  *     clusters (sched_edge_foreign_running_thread_available())
3741  *         - If yes, return THREAD_NULL for the steal callout and
3742  *         perform rebalancing as part of SCHED(processor_balance) i.e. sched_edge_balance()
3743  * (3) Steal a thread from another cluster based on edge
3744  *     weights (sched_edge_steal_thread())
3745  *
3746  * = SCHED(processor_balance) for Edge Scheduler =
3747  *
3748  * If steal_thread did not return a thread for the processor, use
3749  * sched_edge_balance() to rebalance foreign running threads and idle out this CPU.
3750  *
3751  * = Clutch Bucket Preferred Cluster Overrides =
3752  *
3753  * Since these operations (just like thread migrations on enqueue)
3754  * move threads across clusters, they need support for handling clutch
3755  * bucket group level preferred cluster recommendations.
3756  * For (1), a clutch bucket will be in the foreign runnable queue based
3757  * on the clutch bucket group preferred cluster.
3758  * For (2), the running thread will set the bit on the processor based
3759  * on its preferred cluster type.
3760  * For (3), the edge configuration would prevent threads from being stolen
3761  * in the wrong direction.
3762  *
3763  * = SCHED(thread_should_yield) =
3764  * The thread_should_yield() logic needs to have the same logic as sched_edge_processor_idle()
3765  * since that is expecting the same answer as if thread_select() was called on a core
3766  * with an empty runqueue.
3767  */
3768 
3769 static bool
3770 sched_edge_steal_thread_enabled(__unused processor_set_t pset)
3771 {
3772 	/*
3773 	 * For the edge scheduler, steal eligibility is gated by sched_edge_steal_candidate()
3774 	 */
3775 	return true;
3776 }
3777 
3778 static processor_set_t
3779 sched_edge_steal_candidate(processor_set_t pset)
3780 {
3781 	uint32_t dst_cluster_id = pset->pset_cluster_id;
3782 	for (int cluster_id = 0; cluster_id < sched_edge_max_clusters; cluster_id++) {
3783 		processor_set_t candidate_pset = pset_array[cluster_id];
3784 		if (cluster_id == dst_cluster_id) {
3785 			continue;
3786 		}
3787 		if (candidate_pset == NULL) {
3788 			continue;
3789 		}
3790 		sched_clutch_edge *incoming_edge = &pset_array[cluster_id]->sched_edges[dst_cluster_id];
3791 		if (incoming_edge->sce_steal_allowed && (bitmap_lsb_first(candidate_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) != -1)) {
3792 			return candidate_pset;
3793 		}
3794 	}
3795 	return NULL;
3796 }
3797 
3798 static boolean_t
3799 sched_edge_foreign_runnable_thread_available(processor_set_t pset)
3800 {
3801 	/* Find all the clusters that are foreign for this cluster */
3802 	bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
3803 	for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
3804 		/*
3805 		 * For each cluster, see if there are any runnable foreign threads.
3806 		 * This check is currently being done without the pset lock to make it cheap for
3807 		 * the common case.
3808 		 */
3809 		processor_set_t target_pset = pset_array[cluster];
3810 		if (sched_edge_pset_available(target_pset) == false) {
3811 			continue;
3812 		}
3813 
3814 		if (!sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
3815 			return true;
3816 		}
3817 	}
3818 	return false;
3819 }
3820 
3821 static thread_t
3822 sched_edge_foreign_runnable_thread_remove(processor_set_t pset, uint64_t ctime)
3823 {
3824 	thread_t thread = THREAD_NULL;
3825 
3826 	/* Find all the clusters that are foreign for this cluster */
3827 	bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
3828 	for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
3829 		/*
3830 		 * For each cluster, see if there are any runnable foreign threads.
3831 		 * This check is currently being done without the pset lock to make it cheap for
3832 		 * the common case.
3833 		 */
3834 		processor_set_t target_pset = pset_array[cluster];
3835 		if (sched_edge_pset_available(target_pset) == false) {
3836 			continue;
3837 		}
3838 
3839 		if (sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
3840 			continue;
3841 		}
3842 		/*
3843 		 * Looks like there are runnable foreign threads in the hierarchy; lock the pset
3844 		 * and get the highest priority thread.
3845 		 */
3846 		pset_lock(target_pset);
3847 		if (sched_edge_pset_available(target_pset)) {
3848 			thread = sched_clutch_root_highest_foreign_thread_remove(&target_pset->pset_clutch_root);
3849 			sched_update_pset_load_average(target_pset, ctime);
3850 		}
3851 		pset_unlock(target_pset);
3852 
3853 		/*
3854 		 * Edge Scheduler Optimization
3855 		 *
3856 		 * The current implementation immediately returns as soon as it finds a foreign
3857 		 * runnable thread. This could be enhanced to look at highest priority threads
3858 		 * from all foreign clusters and pick the highest amongst them. That would need
3859 		 * some form of global state across psets to make that kind of a check cheap.
3860 		 */
3861 		if (thread != THREAD_NULL) {
3862 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNABLE) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, target_pset->pset_cluster_id, 0);
3863 			break;
3864 		}
3865 		/* Looks like the thread escaped after the check but before the pset lock was taken; continue the search */
3866 	}
3867 
3868 	return thread;
3869 }
3870 
3871 /*
3872  * sched_edge_cpu_running_foreign_shared_rsrc_available()
3873  *
3874  * Routine to determine if the thread running on a CPU is a shared resource thread
3875  * and can be rebalanced to the cluster with an idle CPU. It is used to determine if
3876  * a CPU going idle on a pset should rebalance a running shared resource heavy thread
3877  * from another non-ideal cluster based on the former's shared resource load.
3878  */
3879 static boolean_t
3880 sched_edge_cpu_running_foreign_shared_rsrc_available(processor_set_t target_pset, int foreign_cpu, processor_set_t idle_pset)
3881 {
3882 	boolean_t idle_pset_shared_rsrc_rr_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_RR);
3883 	if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_RR], foreign_cpu) && !idle_pset_shared_rsrc_rr_idle) {
3884 		return false;
3885 	}
3886 
3887 	boolean_t idle_pset_shared_rsrc_biu_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST);
3888 	if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST], foreign_cpu) && !idle_pset_shared_rsrc_biu_idle) {
3889 		return false;
3890 	}
3891 	return true;
3892 }
3893 
3894 static boolean_t
3895 sched_edge_foreign_running_thread_available(processor_set_t pset)
3896 {
3897 	bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
3898 	int cluster = -1;
3899 	while ((cluster = sched_edge_iterate_clusters_ordered(pset, foreign_pset_bitmap[0], cluster)) != -1) {
3900 		/* Skip the pset if it's not schedulable */
3901 		processor_set_t target_pset = pset_array[cluster];
3902 		if (sched_edge_pset_available(target_pset) == false) {
3903 			continue;
3904 		}
3905 
3906 		uint64_t running_foreign_bitmap = target_pset->cpu_state_map[PROCESSOR_RUNNING] & target_pset->cpu_running_foreign;
3907 		for (int cpu_foreign = bit_first(running_foreign_bitmap); cpu_foreign >= 0; cpu_foreign = bit_next(running_foreign_bitmap, cpu_foreign)) {
3908 			if (!sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpu_foreign, pset)) {
3909 				continue;
3910 			}
3911 			return true;
3912 		}
3913 	}
3914 	return false;
3915 }
3916 
3917 static bool
3918 sched_edge_steal_possible(processor_set_t idle_pset, processor_set_t candidate_pset)
3919 {
3920 	int highest_runnable_bucket = bitmap_lsb_first(candidate_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
3921 	if (highest_runnable_bucket == -1) {
3922 		/* Candidate cluster runq is empty */
3923 		return false;
3924 	}
3925 
3926 	if (idle_pset->pset_cluster_type == candidate_pset->pset_cluster_type) {
3927 		/* Always allow stealing from homogeneous clusters */
3928 		return true;
3929 	} else {
3930 		/* Use the load metrics for highest runnable bucket since that would be stolen next */
3931 		int32_t candidate_load = sched_edge_cluster_load_metric(candidate_pset, (sched_bucket_t)highest_runnable_bucket);
3932 		return candidate_load > 0;
3933 	}
3934 }
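/*
 * Illustrative example (added commentary, not part of the original source): an idle
 * E-cluster CPU may steal from a P-cluster only when the P-cluster's load metric for
 * its highest runnable bucket is non-zero, i.e. the P-cluster has no idle CPUs and a
 * non-empty runqueue. Between clusters of the same type, a non-empty candidate
 * runqueue is sufficient to allow the steal.
 */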
3935 
3936 static thread_t
3937 sched_edge_steal_thread(processor_set_t pset, uint64_t candidate_pset_bitmap)
3938 {
3939 	thread_t thread = THREAD_NULL;
3940 
3941 	/*
3942 	 * Edge Scheduler Optimization
3943 	 *
3944 	 * The logic today bails as soon as it finds a cluster where the cluster load is
3945 	 * greater than the edge weight. Maybe it should have a more advanced version
3946 	 * which looks for the maximum delta etc.
3947 	 */
3948 	int cluster_id = -1;
3949 	while ((cluster_id = sched_edge_iterate_clusters_ordered(pset, candidate_pset_bitmap, cluster_id)) != -1) {
3950 		processor_set_t steal_from_pset = pset_array[cluster_id];
3951 		if (steal_from_pset == NULL) {
3952 			continue;
3953 		}
3954 		sched_clutch_edge *incoming_edge = &pset_array[cluster_id]->sched_edges[pset->pset_cluster_id];
3955 		if (incoming_edge->sce_steal_allowed == false) {
3956 			continue;
3957 		}
3958 		pset_lock(steal_from_pset);
3959 		if (sched_edge_steal_possible(pset, steal_from_pset)) {
3960 			uint64_t current_timestamp = mach_absolute_time();
3961 			sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(&steal_from_pset->pset_clutch_root, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY);
3962 			thread = sched_clutch_thread_unbound_lookup(&steal_from_pset->pset_clutch_root, root_bucket);
3963 			sched_clutch_thread_remove(&steal_from_pset->pset_clutch_root, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
3964 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STEAL) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, steal_from_pset->pset_cluster_id, 0);
3965 			sched_update_pset_load_average(steal_from_pset, current_timestamp);
3966 		}
3967 		pset_unlock(steal_from_pset);
3968 		if (thread != THREAD_NULL) {
3969 			break;
3970 		}
3971 	}
3972 	return thread;
3973 }
3974 
3975 /*
3976  * sched_edge_processor_idle()
3977  *
3978  * The routine is the implementation for steal_thread() for the Edge scheduler.
3979  */
3980 static thread_t
3981 sched_edge_processor_idle(processor_set_t pset)
3982 {
3983 	thread_t thread = THREAD_NULL;
3984 
3985 	uint64_t ctime = mach_absolute_time();
3986 
3987 	processor_t processor = current_processor();
3988 	bit_clear(pset->pending_spill_cpu_mask, processor->cpu_id);
3989 
3990 	/* Each of the operations acquire the lock for the pset they target */
3991 	pset_unlock(pset);
3992 
3993 	/* Find highest priority runnable thread on all non-native clusters */
3994 	thread = sched_edge_foreign_runnable_thread_remove(pset, ctime);
3995 	if (thread != THREAD_NULL) {
3996 		return thread;
3997 	}
3998 
3999 	/* Find highest priority runnable thread on all native clusters */
4000 	thread = sched_edge_steal_thread(pset, pset->native_psets[0]);
4001 	if (thread != THREAD_NULL) {
4002 		return thread;
4003 	}
4004 
4005 	/* Find foreign running threads to rebalance; the actual rebalance is done in sched_edge_balance() */
4006 	boolean_t rebalance_needed = sched_edge_foreign_running_thread_available(pset);
4007 	if (rebalance_needed) {
4008 		return THREAD_NULL;
4009 	}
4010 
4011 	/* No foreign threads found; find a thread to steal from all clusters based on weights/loads etc. */
4012 	thread = sched_edge_steal_thread(pset, pset->native_psets[0] | pset->foreign_psets[0]);
4013 	return thread;
4014 }
4015 
4016 /* Return true if this shared resource thread has a better cluster to run on */
4017 static bool
4018 sched_edge_shared_rsrc_migrate_possible(thread_t thread, processor_set_t preferred_pset, processor_set_t current_pset)
4019 {
4020 	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4021 	uint64_t current_pset_load = sched_pset_cluster_shared_rsrc_load(current_pset, shared_rsrc_type);
4022 	/*
4023 	 * Adjust the current pset load to discount the current thread only if the current pset is a preferred pset type. This allows the
4024 	 * scheduler to rebalance threads from non-preferred cluster to an idle cluster of the preferred type.
4025 	 *
4026 	 * Edge Scheduler Optimization
4027 	 * For multi-cluster machines, it might be useful to enhance this mechanism to migrate between clusters of the preferred type.
4028 	 */
4029 	uint64_t current_pset_adjusted_load = (current_pset->pset_type != preferred_pset->pset_type) ? current_pset_load : (current_pset_load - 1);
4030 
4031 	uint64_t eligible_pset_bitmask = 0;
4032 	if (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST) {
4033 		/*
4034 		 * For the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy, the load balancing occurs
4035 		 * only among clusters native with the preferred cluster.
4036 		 */
4037 		eligible_pset_bitmask = preferred_pset->native_psets[0];
4038 		bit_set(eligible_pset_bitmask, preferred_pset->pset_cluster_id);
4039 	} else {
4040 		/* For EDGE_SHARED_RSRC_SCHED_POLICY_RR, the load balancing happens among all clusters */
4041 		eligible_pset_bitmask = sched_edge_available_pset_bitmask[0];
4042 	}
4043 
4044 	/* For each eligible cluster check if there is an under-utilized cluster; return true if there is */
4045 	for (int cluster_id = bit_first(eligible_pset_bitmask); cluster_id >= 0; cluster_id = bit_next(eligible_pset_bitmask, cluster_id)) {
4046 		if (cluster_id == current_pset->pset_cluster_id) {
4047 			continue;
4048 		}
4049 		uint64_t cluster_load = sched_pset_cluster_shared_rsrc_load(pset_array[cluster_id], shared_rsrc_type);
4050 		if (current_pset_adjusted_load > cluster_load) {
4051 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHARED_RSRC_MIGRATE) | DBG_FUNC_NONE, current_pset_load, current_pset->pset_cluster_id, cluster_load, cluster_id);
4052 			return true;
4053 		}
4054 	}
4055 	return false;
4056 }
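/*
 * Illustrative example (added commentary with hypothetical numbers, not part of the
 * original source): a round-robin (RR) shared resource thread running on a cluster
 * with an RR load of 3 sees its own cluster's adjusted load as 2 (itself discounted,
 * assuming the current cluster is of the preferred type). If any other eligible
 * cluster reports an RR load below 2, the routine returns true and the thread is
 * allowed to migrate toward the less loaded cluster.
 */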
4057 
4058 /* Return true if this thread should not continue running on this processor */
4059 static bool
4060 sched_edge_thread_avoid_processor(processor_t processor, thread_t thread, ast_t reason)
4061 {
4062 	processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
4063 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
4064 		return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
4065 	}
4066 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
4067 		if (processor->processor_set->pset_type != preferred_pset->pset_type) {
4068 			return true;
4069 		}
4070 		return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
4071 	}
4072 
4073 	/*
4074 	 * For long running parallel workloads, it is important to rebalance threads across
4075 	 * E/P clusters so that they make equal forward progress. This is achieved through
4076 	 * threads expiring their quantum on the non-preferred cluster type and explicitly
4077 	 * rebalancing to the preferred cluster runqueue.
4078 	 */
4079 	if (processor->processor_set->pset_type != preferred_pset->pset_type) {
4080 		return true;
4081 	}
4082 	/* If the preferred pset for the thread is now idle, try and migrate thread to that cluster */
4083 	if ((processor->processor_set != preferred_pset) &&
4084 	    (sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket) == 0)) {
4085 		return true;
4086 	}
4087 
4088 	/*
4089 	 * On quantum expiry, check the migration bitmask to decide if this thread should be migrated off this core.
4090 	 * A migration is only recommended if there's also an idle core available that needn't be avoided.
4091 	 */
4092 	if (reason & AST_QUANTUM) {
4093 		if (bit_test(processor->processor_set->perfcontrol_cpu_migration_bitmask, processor->cpu_id)) {
4094 			uint64_t non_avoided_idle_primary_map = processor->processor_set->cpu_state_map[PROCESSOR_IDLE] & processor->processor_set->recommended_bitmask & ~processor->processor_set->perfcontrol_cpu_migration_bitmask;
4095 			if (non_avoided_idle_primary_map != 0) {
4096 				return true;
4097 			}
4098 		}
4099 	}
4100 
4101 	return false;
4102 }
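/*
 * Illustrative example (added commentary, not part of the original source): a regular
 * thread whose preferred cluster is a P-cluster but which is currently running on an
 * E-core returns true here, so it gets rebalanced toward the preferred cluster type.
 * A thread already on the preferred cluster type is only asked to move if the
 * preferred pset itself has gone idle, or if its CPU is flagged in the perfcontrol
 * migration bitmask at quantum expiry and a non-avoided idle core is available.
 */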
4103 
4104 static bool
4105 sched_edge_balance(__unused processor_t cprocessor, processor_set_t cpset)
4106 {
4107 	assert(cprocessor == current_processor());
4108 	pset_unlock(cpset);
4109 
4110 	uint64_t ast_processor_map = 0;
4111 	sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
4112 
4113 	bitmap_t *foreign_pset_bitmap = cpset->foreign_psets;
4114 	for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
4115 		/* Skip the pset if it's not schedulable */
4116 		processor_set_t target_pset = pset_array[cluster];
4117 		if (sched_edge_pset_available(target_pset) == false) {
4118 			continue;
4119 		}
4120 
4121 		pset_lock(target_pset);
4122 		uint64_t cpu_running_foreign_map = (target_pset->cpu_running_foreign & target_pset->cpu_state_map[PROCESSOR_RUNNING]);
4123 		for (int cpuid = lsb_first(cpu_running_foreign_map); cpuid >= 0; cpuid = lsb_next(cpu_running_foreign_map, cpuid)) {
4124 			if (!sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpuid, cpset)) {
4125 				continue;
4126 			}
4127 			processor_t target_cpu = processor_array[cpuid];
4128 			ipi_type[target_cpu->cpu_id] = sched_ipi_action(target_cpu, NULL, SCHED_IPI_EVENT_REBALANCE);
4129 			if (ipi_type[cpuid] != SCHED_IPI_NONE) {
4130 				bit_set(ast_processor_map, cpuid);
4131 			}
4132 		}
4133 		pset_unlock(target_pset);
4134 	}
4135 
4136 	for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
4137 		processor_t ast_processor = processor_array[cpuid];
4138 		sched_ipi_perform(ast_processor, ipi_type[cpuid]);
4139 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNING) | DBG_FUNC_NONE, 0, cprocessor->cpu_id, cpuid, 0);
4140 	}
4141 
4142 	/* Core should light-weight idle using WFE if it just sent out rebalance IPIs */
4143 	return ast_processor_map != 0;
4144 }
4145 
4146 /*
4147  * sched_edge_migration_check()
4148  *
4149  * Routine to evaluate an edge between two clusters to decide if migration is possible
4150  * across that edge. Also updates the selected_pset and max_edge_delta out parameters
4151  * accordingly. The return value indicates if the invoking routine should short circuit
4152  * the search, since an ideal candidate has been found. The routine looks at the regular
4153  * edges and cluster loads or the shared resource loads based on the type of thread.
4154  */
4155 static bool
4156 sched_edge_migration_check(uint32_t cluster_id, processor_set_t preferred_pset,
4157     uint32_t preferred_cluster_load, thread_t thread, processor_set_t *selected_pset, uint32_t *max_edge_delta)
4158 {
4159 	uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
4160 	cluster_type_t preferred_cluster_type = pset_type_for_id(preferred_cluster_id);
4161 	processor_set_t dst_pset = pset_array[cluster_id];
4162 	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4163 	bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
4164 
4165 	if (cluster_id == preferred_cluster_id) {
4166 		return false;
4167 	}
4168 
4169 	if (dst_pset == NULL) {
4170 		return false;
4171 	}
4172 
4173 	sched_clutch_edge *edge = preferred_pset->sched_edges;
4174 	if (edge[cluster_id].sce_migration_allowed == false) {
4175 		return false;
4176 	}
4177 	uint32_t dst_load = shared_rsrc_thread ? (uint32_t)sched_pset_cluster_shared_rsrc_load(dst_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(dst_pset, thread->th_sched_bucket);
4178 	if (dst_load == 0) {
4179 		/* The candidate cluster is idle; select it immediately for execution */
4180 		*selected_pset = dst_pset;
4181 		*max_edge_delta = preferred_cluster_load;
4182 		return true;
4183 	}
4184 
4185 	uint32_t edge_delta = 0;
4186 	if (dst_load > preferred_cluster_load) {
4187 		return false;
4188 	}
4189 	edge_delta = preferred_cluster_load - dst_load;
4190 	if (!shared_rsrc_thread && (edge_delta < edge[cluster_id].sce_migration_weight)) {
4191 		/*
4192 		 * For non shared resource threads, use the edge migration weight to decide if
4193 		 * this cluster is over-committed at the QoS level of this thread.
4194 		 */
4195 		return false;
4196 	}
4197 
4198 	if (edge_delta < *max_edge_delta) {
4199 		return false;
4200 	}
4201 	if (edge_delta == *max_edge_delta) {
4202 		/* If the edge delta is the same as the max delta, make sure a homogeneous cluster is picked */
4203 		boolean_t selected_homogeneous = (pset_type_for_id((*selected_pset)->pset_cluster_id) == preferred_cluster_type);
4204 		boolean_t candidate_homogeneous = (pset_type_for_id(dst_pset->pset_cluster_id) == preferred_cluster_type);
4205 		if (selected_homogeneous || !candidate_homogeneous) {
4206 			return false;
4207 		}
4208 	}
4209 	/* dst_pset seems to be the best candidate for migration; however other candidates should still be evaluated */
4210 	*max_edge_delta = edge_delta;
4211 	*selected_pset = dst_pset;
4212 	return false;
4213 }
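/*
 * Worked example (added commentary with hypothetical numbers, not part of the
 * original source): suppose the preferred P-cluster reports a load of 120 and a
 * candidate E-cluster reports 40, with the P->E edge configured with
 * sce_migration_weight = 64 (the default set in sched_edge_cpu_init_completed()).
 * Then edge_delta = 120 - 40 = 80 >= 64, so a regular (non shared resource) thread
 * keeps the E-cluster as a candidate, recorded if 80 beats the best delta seen so
 * far. Had the candidate reported a load of 0, it would have been selected
 * immediately and the search short-circuited.
 */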
4214 
4215 /*
4216  * sched_edge_iterate_clusters_ordered()
4217  *
4218  * Routine to iterate clusters in die local order. For multi-die machines,
4219  * the routine ensures that the candidate clusters on the same die as the
4220  * passed in pset are returned before the remote die clusters. This should
4221  * be used in all places where cluster selection in die order matters.
4222  */
4223 
4224 static int
4225 sched_edge_iterate_clusters_ordered(processor_set_t starting_pset, uint64_t candidate_map, int previous_cluster)
4226 {
4227 	int cluster_id = -1;
4228 
4229 	uint64_t local_candidate_map = starting_pset->local_psets[0] & candidate_map;
4230 	uint64_t remote_candidate_map = starting_pset->remote_psets[0] & candidate_map;
4231 
4232 	if (previous_cluster == -1) {
4233 		/* previous_cluster == -1 indicates the initial condition */
4234 		cluster_id = bit_first(local_candidate_map);
4235 		if (cluster_id != -1) {
4236 			return cluster_id;
4237 		}
4238 		return bit_first(remote_candidate_map);
4239 	} else {
4240 		/*
4241 		 * After the initial condition, the routine attempts to return a
4242 		 * cluster in the previous_cluster's locality. If none is available,
4243 		 * it looks at remote clusters.
4244 		 */
4245 		if (bit_test(local_candidate_map, previous_cluster)) {
4246 			cluster_id = bit_next(local_candidate_map, previous_cluster);
4247 			if (cluster_id != -1) {
4248 				return cluster_id;
4249 			} else {
4250 				return bit_first(remote_candidate_map);
4251 			}
4252 		}
4253 		return bit_next(remote_candidate_map, previous_cluster);
4254 	}
4255 }
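/*
 * Illustrative note (added commentary, not part of the original source): on a
 * hypothetical two-die system where clusters 0 and 1 share a die with the starting
 * pset and cluster 2 is on the remote die, the iterator exhausts the die-local
 * candidates (0 and 1, in bitmap order) before ever returning cluster 2. Callers get
 * locality-ordered candidates simply by looping until -1 is returned.
 */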
4256 
4257 /*
4258  * sched_edge_migrate_edges_evaluate()
4259  *
4260  * Routine to find the candidate for thread migration based on edge weights.
4261  *
4262  * Returns the most ideal cluster for execution of this thread based on outgoing edges of the preferred pset. Can
4263  * return preferred_pset if it is the most ideal destination for this thread.
4264  */
4265 static processor_set_t
4266 sched_edge_migrate_edges_evaluate(processor_set_t preferred_pset, uint32_t preferred_cluster_load, thread_t thread)
4267 {
4268 	processor_set_t selected_pset = preferred_pset;
4269 	uint32_t max_edge_delta = 0;
4270 	bool search_complete = false;
4271 	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4272 	bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
4273 
4274 	bitmap_t *foreign_pset_bitmap = preferred_pset->foreign_psets;
4275 	bitmap_t *native_pset_bitmap = preferred_pset->native_psets;
4276 	/* Always start the search with the native clusters */
4277 	int cluster_id = -1;
4278 	while ((cluster_id = sched_edge_iterate_clusters_ordered(preferred_pset, native_pset_bitmap[0], cluster_id)) != -1) {
4279 		search_complete = sched_edge_migration_check(cluster_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
4280 		if (search_complete) {
4281 			break;
4282 		}
4283 	}
4284 
4285 	if (search_complete) {
4286 		return selected_pset;
4287 	}
4288 
4289 	if (shared_rsrc_thread && (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST)) {
4290 		/*
4291 		 * If the shared resource scheduling policy is EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST, the scheduler tries
4292 		 * to fill up the preferred cluster and its homogeneous peers first.
4293 		 */
4294 
4295 		if (max_edge_delta > 0) {
4296 			/*
4297 			 * This represents that there is a peer cluster of the same type as the preferred cluster (since the code
4298 			 * above only looks at the native_psets) which has fewer threads of the shared resource type than the
4299 			 * preferred cluster. This indicates that there is capacity on a native cluster where this thread
4300 			 * should be placed.
4301 			 */
4302 			return selected_pset;
4303 		}
4304 		/*
4305 		 * Indicates that all peer native clusters are at the same shared resource usage; check if the preferred cluster has
4306 		 * any more capacity left.
4307 		 */
4308 		if (sched_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) < pset_available_cpu_count(preferred_pset)) {
4309 			return preferred_pset;
4310 		}
4311 		/*
4312 		 * Looks like the preferred cluster and all its native peers are full with shared resource threads; need to start looking
4313 		 * at non-native clusters for capacity.
4314 		 */
4315 	}
4316 
4317 	/* Now look at the non-native clusters */
4318 	cluster_id = -1;
4319 	while ((cluster_id = sched_edge_iterate_clusters_ordered(preferred_pset, foreign_pset_bitmap[0], cluster_id)) != -1) {
4320 		search_complete = sched_edge_migration_check(cluster_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
4321 		if (search_complete) {
4322 			break;
4323 		}
4324 	}
4325 	return selected_pset;
4326 }
4327 
4328 /*
4329  * sched_edge_candidate_alternative()
4330  *
4331  * Routine to find an alternative cluster from candidate_cluster_bitmap since the
4332  * selected_pset is not available for execution. The logic tries to prefer homogeneous
4333  * clusters over heterogeneous clusters since this is typically used in thread
4334  * placement decisions.
4335  */
4336 _Static_assert(MAX_PSETS <= 64, "Unable to fit maximum number of psets in uint64_t bitmask");
4337 static processor_set_t
4338 sched_edge_candidate_alternative(processor_set_t selected_pset, uint64_t candidate_cluster_bitmap)
4339 {
4340 	/*
4341 	 * It looks like the most ideal pset is not available for scheduling currently.
4342 	 * Try to find a homogeneous cluster that is still available.
4343 	 */
4344 	uint64_t available_native_clusters = selected_pset->native_psets[0] & candidate_cluster_bitmap;
4345 	int available_cluster_id = lsb_first(available_native_clusters);
4346 	if (available_cluster_id == -1) {
4347 		/* Looks like none of the homogeneous clusters are available; pick the first available cluster */
4348 		available_cluster_id = bit_first(candidate_cluster_bitmap);
4349 	}
4350 	assert(available_cluster_id != -1);
4351 	return pset_array[available_cluster_id];
4352 }
4353 
4354 /*
4355  * sched_edge_switch_pset_lock()
4356  *
4357  * Helper routine for sched_edge_migrate_candidate() which switches pset locks (if needed) based on
4358  * switch_pset_locks.
4359  * Returns the newly locked pset after the switch.
4360  */
4361 static processor_set_t
4362 sched_edge_switch_pset_lock(processor_set_t selected_pset, processor_set_t locked_pset, bool switch_pset_locks)
4363 {
4364 	if (!switch_pset_locks) {
4365 		return locked_pset;
4366 	}
4367 	if (selected_pset != locked_pset) {
4368 		pset_unlock(locked_pset);
4369 		pset_lock(selected_pset);
4370 		return selected_pset;
4371 	} else {
4372 		return locked_pset;
4373 	}
4374 }
4375 
4376 /*
4377  * sched_edge_amp_rebalance_pset()
4378  *
4379  * Routine to decide where a thread which is eligible for AMP rebalance (i.e.
4380  * has executed on non-preferred cluster type for a while) should be enqueued.
4381  * The algorithm maintains a history of AMP rebalance decisions on the clutch
4382  * bucket group of the workload and round-robins between clusters to ensure
4383  * that all threads get a chance on the performance cores and make equal
4384  * progress.
4385  */
4386 static processor_set_t
4387 sched_edge_amp_rebalance_pset(processor_set_t preferred_pset, thread_t thread)
4388 {
4389 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
4390 	sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[thread->th_sched_bucket];
4391 
4392 	uint32_t last_chosen_cluster, new_chosen_cluster;
4393 
4394 	/* Only AMP rebalance within clusters native to the preferred cluster */
4395 	uint64_t eligible_pset_bitmask = preferred_pset->native_psets[0];
4396 	/* Preferred cluster is also eligible for rebalancing */
4397 	bit_set(eligible_pset_bitmask, preferred_pset->pset_cluster_id);
4398 	/* Atomically update the AMP rebalance cluster for the clutch bucket group */
4399 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_amp_rebalance_last_chosen, last_chosen_cluster, new_chosen_cluster, relaxed, {
4400 		if (last_chosen_cluster == UINT32_MAX) {
4401 		        new_chosen_cluster = preferred_pset->pset_cluster_id;
4402 		} else {
4403 		        new_chosen_cluster = lsb_next(eligible_pset_bitmask, last_chosen_cluster);
4404 		        if (new_chosen_cluster == -1) {
4405 		                /* Rotate to the start of the eligible bitmask */
4406 		                new_chosen_cluster = lsb_first(eligible_pset_bitmask);
4407 			}
4408 		}
4409 	});
4410 	return pset_array[new_chosen_cluster];
4411 }
4412 
4413 /*
4414  * sched_edge_migrate_candidate()
4415  *
4416  * Routine to find an appropriate cluster for scheduling a thread. The routine looks at the properties of
4417  * the thread and the preferred cluster to determine the best available pset for scheduling.
4418  *
4419  * The switch_pset_locks parameter defines whether the routine should switch pset locks to provide an
4420  * accurate scheduling decision. This mode is typically used when choosing a pset for scheduling a thread since the
4421  * decision has to be synchronized with another CPU changing the recommendation of clusters available
4422  * on the system. If this parameter is set to false, this routine returns the best effort indication of
4423  * the cluster the thread should be scheduled on. It is typically used in fast path contexts (such as
4424  * SCHED(thread_avoid_processor)) to determine if there is a possibility of scheduling this thread on a
4425  * more appropriate cluster.
4426  *
4427  * Routine returns the most ideal cluster for scheduling. If switch_pset_locks is set, it ensures that the
4428  * resultant pset lock is held.
4429  */
4430 static processor_set_t
4431 sched_edge_migrate_candidate(_Nullable processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks)
4432 {
4433 	processor_set_t selected_pset = preferred_pset;
4434 	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4435 	bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
4436 
4437 	if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
4438 		/* For bound threads always recommend the cluster its bound to */
4439 		selected_pset = pset_array[sched_edge_thread_bound_cluster_id(thread)];
4440 		if (selected_pset != NULL) {
4441 			locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
4442 			if (sched_edge_pset_available(selected_pset) || (SCHED_CLUTCH_THREAD_CLUSTER_BOUND_SOFT(thread) == false)) {
4443 				/*
4444 				 * If the bound cluster is not available, check if the thread is soft bound. For soft bound threads,
4445 				 * fall through to the regular cluster selection logic which handles unavailable clusters
4446 				 * appropriately. If the thread is hard bound, then return the bound cluster always.
4447 				 */
4448 				return selected_pset;
4449 			}
4450 		}
4451 	}
4452 
4453 	uint64_t candidate_cluster_bitmap = mask(sched_edge_max_clusters);
4454 #if DEVELOPMENT || DEBUG
4455 	extern int enable_task_set_cluster_type;
4456 	task_t task = get_threadtask(thread);
4457 	if (enable_task_set_cluster_type && (task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) {
4458 		processor_set_t pset_hint = task->pset_hint;
4459 		if (pset_hint && (selected_pset == NULL || selected_pset->pset_cluster_type != pset_hint->pset_cluster_type)) {
4460 			selected_pset = pset_hint;
4461 			goto migrate_candidate_available_check;
4462 		}
4463 	}
4464 #endif
4465 
4466 	if (preferred_pset == NULL) {
4467 		/* The preferred_pset has not finished initializing at boot */
4468 		goto migrate_candidate_available_check;
4469 	}
4470 
4471 	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
4472 		/* For realtime threads, try and schedule them on the preferred pset always */
4473 		goto migrate_candidate_available_check;
4474 	}
4475 
4476 	uint32_t preferred_cluster_load = shared_rsrc_thread ? (uint32_t)sched_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket);
4477 	if (preferred_cluster_load == 0) {
4478 		goto migrate_candidate_available_check;
4479 	}
4480 
4481 	/*
4482 	 * If a thread is being rebalanced for achieving equal progress of parallel workloads,
4483 	 * it needs to end up on the preferred runqueue. This mechanism should only be used for
4484 	 * threads which have been previously migrated to the non-preferred cluster type.
4485 	 *
4486 	 * The AMP rebalancing mechanism is available for regular threads or shared resource
4487 	 * threads with the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy.
4488 	 */
4489 	bool amp_rebalance_eligible = (!shared_rsrc_thread) || (shared_rsrc_thread && (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST));
4490 	if (amp_rebalance_eligible) {
4491 		boolean_t amp_rebalance = (thread->reason & (AST_REBALANCE | AST_QUANTUM)) == (AST_REBALANCE | AST_QUANTUM);
4492 		if (amp_rebalance) {
4493 			boolean_t non_preferred_pset = (thread->last_processor->processor_set->pset_type != preferred_pset->pset_type);
4494 			if (non_preferred_pset) {
4495 				selected_pset = sched_edge_amp_rebalance_pset(preferred_pset, thread);
4496 				goto migrate_candidate_available_check;
4497 			}
4498 		}
4499 	}
4500 
4501 	/* Look at edge weights to decide the most ideal migration candidate for this thread */
4502 	selected_pset = sched_edge_migrate_edges_evaluate(preferred_pset, preferred_cluster_load, thread);
4503 
4504 migrate_candidate_available_check:
4505 	if (selected_pset == NULL) {
4506 		/* The selected_pset has not finished initializing at boot */
4507 		pset_unlock(locked_pset);
4508 		return NULL;
4509 	}
4510 
4511 	locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
4512 	if (sched_edge_pset_available(selected_pset) == true) {
4513 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_OVERLOAD) | DBG_FUNC_NONE, thread_tid(thread), preferred_pset->pset_cluster_id, selected_pset->pset_cluster_id, preferred_cluster_load);
4514 		return selected_pset;
4515 	}
4516 	/* Looks like selected_pset is not available for scheduling; remove it from candidate_cluster_bitmap */
4517 	bitmap_clear(&candidate_cluster_bitmap, selected_pset->pset_cluster_id);
4518 	if (__improbable(bitmap_first(&candidate_cluster_bitmap, sched_edge_max_clusters) == -1)) {
4519 		pset_unlock(locked_pset);
4520 		return NULL;
4521 	}
4522 	/* Try and find an alternative for the selected pset */
4523 	selected_pset = sched_edge_candidate_alternative(selected_pset, candidate_cluster_bitmap);
4524 	goto migrate_candidate_available_check;
4525 }
4526 
4527 static processor_t
4528 sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread)
4529 {
4530 	/* Bound threads don't call this function */
4531 	assert(thread->bound_processor == PROCESSOR_NULL);
4532 	processor_t chosen_processor = PROCESSOR_NULL;
4533 
4534 	/*
4535 	 * sched_edge_preferred_pset() returns the preferred pset for a given thread.
4536 	 * It should take the passed in "pset" as a hint which represents the recency metric for
4537 	 * pset selection logic.
4538 	 */
4539 	processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
4540 	processor_set_t chosen_pset = preferred_pset;
4541 	/*
4542 	 * If the preferred pset is overloaded, find a pset which is the best candidate to migrate
4543 	 * threads to. sched_edge_migrate_candidate() returns the preferred pset
4544 	 * if it has capacity; otherwise finds the best candidate pset to migrate this thread to.
4545 	 *
4546 	 * Edge Scheduler Optimization
4547 	 * It might be useful to build a recency metric for the thread for multiple clusters and
4548 	 * factor that into the migration decisions.
4549 	 */
4550 	chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, pset, true);
4551 	if (chosen_pset) {
4552 		chosen_processor = choose_processor(chosen_pset, processor, thread);
4553 	}
4554 	/* For RT threads, choose_processor() can return a different cluster than the one passed into it */
4555 	assert(chosen_processor ? chosen_processor->processor_set->pset_type == chosen_pset->pset_type : true);
4556 	return chosen_processor;
4557 }
4558 
4559 /*
4560  * sched_edge_clutch_bucket_threads_drain()
4561  *
4562  * Drains all the runnable threads which are not restricted to the root_clutch (due to clutch
4563  * bucket overrides etc.) into a local thread queue.
4564  */
4565 static void
4566 sched_edge_clutch_bucket_threads_drain(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch, queue_t clutch_threads)
4567 {
4568 	thread_t thread = THREAD_NULL;
4569 	uint64_t current_timestamp = mach_approximate_time();
4570 	qe_foreach_element_safe(thread, &clutch_bucket->scb_thread_timeshare_queue, th_clutch_timeshare_link) {
4571 		sched_clutch_thread_remove(root_clutch, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
4572 		enqueue_tail(clutch_threads, &thread->runq_links);
4573 	}
4574 }
4575 
4576 /*
4577  * sched_edge_run_drained_threads()
4578  *
4579  * Makes all drained threads in a local queue runnable.
4580  */
4581 static void
4582 sched_edge_run_drained_threads(queue_t clutch_threads)
4583 {
4584 	thread_t thread;
4585 	/* Now setrun all the threads in the local queue */
4586 	qe_foreach_element_safe(thread, clutch_threads, runq_links) {
4587 		remqueue(&thread->runq_links);
4588 		thread_lock(thread);
4589 		thread_setrun(thread, SCHED_TAILQ);
4590 		thread_unlock(thread);
4591 	}
4592 }
4593 
4594 /*
4595  * sched_edge_update_preferred_cluster()
4596  *
4597  * Routine to update the preferred cluster for QoS buckets within a thread group.
4598  * The buckets to be updated are specifed as a bitmap (clutch_bucket_modify_bitmap).
4599  */
4600 static void
4601 sched_edge_update_preferred_cluster(
4602 	sched_clutch_t sched_clutch,
4603 	bitmap_t *clutch_bucket_modify_bitmap,
4604 	uint32_t *tg_bucket_preferred_cluster)
4605 {
4606 	for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
4607 		os_atomic_store(&sched_clutch->sc_clutch_groups[bucket].scbg_preferred_cluster, tg_bucket_preferred_cluster[bucket], relaxed);
4608 	}
4609 }
4610 
4611 /*
4612  * sched_edge_migrate_thread_group_runnable_threads()
4613  *
4614  * Routine to implement the migration of threads on a cluster when the thread group
4615  * recommendation is updated. The migration works using a 2-phase
4616  * algorithm.
4617  *
4618  * Phase 1: With the pset lock held, check the recommendation of the clutch buckets.
4619  * For each clutch bucket, if it needs to be migrated immediately, drain the threads
4620  * into a local thread queue. Otherwise mark the clutch bucket as native/foreign as
4621  * appropriate.
4622  *
4623  * Phase 2: After unlocking the pset, drain all the threads from the local thread
4624  * queue and mark them runnable which should land them in the right hierarchy.
4625  *
4626  * The routine assumes that the preferences for the clutch buckets/clutch bucket
4627  * groups have already been updated by the caller.
4628  *
4629  * - Called with the pset locked and interrupts disabled.
4630  * - Returns with the pset unlocked.
4631  */
4632 static void
4633 sched_edge_migrate_thread_group_runnable_threads(
4634 	sched_clutch_t sched_clutch,
4635 	sched_clutch_root_t root_clutch,
4636 	bitmap_t *clutch_bucket_modify_bitmap,
4637 	__unused uint32_t *tg_bucket_preferred_cluster,
4638 	bool migrate_immediately)
4639 {
4640 	/* Queue to hold threads that have been drained from clutch buckets to be migrated */
4641 	queue_head_t clutch_threads;
4642 	queue_init(&clutch_threads);
4643 
4644 	for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
4645 		/* Get the clutch bucket for this cluster and sched bucket */
4646 		sched_clutch_bucket_group_t clutch_bucket_group = &(sched_clutch->sc_clutch_groups[bucket]);
4647 		sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
4648 		sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
4649 		if (scb_root == NULL) {
4650 			/* Clutch bucket not runnable or already in the right hierarchy; nothing to do here */
4651 			assert(clutch_bucket->scb_thr_count == 0);
4652 			continue;
4653 		}
4654 		assert(scb_root == root_clutch);
4655 		uint32_t clutch_bucket_preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
4656 
4657 		if (migrate_immediately) {
4658 			/*
4659 			 * For transitions where threads need to be migrated immediately, drain the threads into a
4660 			 * local queue unless we are looking at the clutch buckets for the newly recommended
4661 			 * cluster.
4662 			 */
4663 			if (root_clutch->scr_cluster_id != clutch_bucket_preferred_cluster) {
4664 				sched_edge_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads);
4665 			} else {
4666 				sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
4667 			}
4668 		} else {
4669 			/* Check if this cluster is the same type as the newly recommended cluster */
4670 			boolean_t homogeneous_cluster = (pset_type_for_id(root_clutch->scr_cluster_id) == pset_type_for_id(clutch_bucket_preferred_cluster));
4671 			/*
4672 			 * If threads do not have to be migrated immediately, just change the native/foreign
4673 			 * flag on the clutch bucket.
4674 			 */
4675 			if (homogeneous_cluster) {
4676 				sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
4677 			} else {
4678 				sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
4679 			}
4680 		}
4681 	}
4682 
4683 	pset_unlock(root_clutch->scr_pset);
4684 	sched_edge_run_drained_threads(&clutch_threads);
4685 }
4686 
4687 /*
4688  * sched_edge_migrate_thread_group_running_threads()
4689  *
4690  * Routine to find all running threads of a thread group on a specific cluster
4691  * and IPI them if they need to be moved immediately.
4692  */
4693 static void
4694 sched_edge_migrate_thread_group_running_threads(
4695 	sched_clutch_t sched_clutch,
4696 	sched_clutch_root_t root_clutch,
4697 	__unused bitmap_t *clutch_bucket_modify_bitmap,
4698 	uint32_t *tg_bucket_preferred_cluster,
4699 	bool migrate_immediately)
4700 {
4701 	if (migrate_immediately == false) {
4702 		/* If CLPC has recommended not to move threads immediately, nothing to do here */
4703 		return;
4704 	}
4705 
4706 	/*
4707 	 * Edge Scheduler Optimization
4708 	 *
4709 	 * When the system has a large number of clusters and cores, it might be useful to
4710 	 * narrow down the iteration by using a thread running bitmap per clutch.
4711 	 */
4712 	uint64_t ast_processor_map = 0;
4713 	sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
4714 
4715 	uint64_t running_map = root_clutch->scr_pset->cpu_state_map[PROCESSOR_RUNNING];
4716 	/*
4717 	 * Iterate all CPUs and look for the ones running threads from this thread group and are
4718 	 * not restricted to the specific cluster (due to overrides etc.)
4719 	 */
4720 	for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
4721 		processor_t src_processor = processor_array[cpuid];
4722 		boolean_t expected_tg = (src_processor->current_thread_group == sched_clutch->sc_tg);
4723 		sched_bucket_t processor_sched_bucket = src_processor->processor_set->cpu_running_buckets[cpuid];
4724 		if (processor_sched_bucket == TH_BUCKET_SCHED_MAX) {
4725 			continue;
4726 		}
4727 		boolean_t non_preferred_cluster = tg_bucket_preferred_cluster[processor_sched_bucket] != root_clutch->scr_cluster_id;
4728 
4729 		if (expected_tg && non_preferred_cluster) {
4730 			ipi_type[cpuid] = sched_ipi_action(src_processor, NULL, SCHED_IPI_EVENT_REBALANCE);
4731 			if (ipi_type[cpuid] != SCHED_IPI_NONE) {
4732 				bit_set(ast_processor_map, cpuid);
4733 			} else if (src_processor == current_processor()) {
4734 				bit_set(root_clutch->scr_pset->pending_AST_PREEMPT_cpu_mask, cpuid);
4735 				ast_t new_preempt = update_pending_nonurgent_preemption(src_processor, AST_PREEMPT);
4736 				ast_on(new_preempt);
4737 			}
4738 		}
4739 	}
4740 
4741 	/* Perform all the IPIs */
4742 	if (bit_first(ast_processor_map) != -1) {
4743 		for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
4744 			processor_t ast_processor = processor_array[cpuid];
4745 			sched_ipi_perform(ast_processor, ipi_type[cpuid]);
4746 		}
4747 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, thread_group_get_id(sched_clutch->sc_tg), ast_processor_map, 0, 0);
4748 	}
4749 }
4750 
4751 /*
4752  * sched_edge_tg_preferred_cluster_change()
4753  *
4754  * Routine to handle changes to a thread group's recommendation. In the Edge Scheduler, the preferred cluster
4755  * is specified on a per-QoS basis within a thread group. The routine updates the preferences and performs
4756  * thread migrations based on the policy specified by CLPC.
4757  * tg_bucket_preferred_cluster is an array of size TH_BUCKET_SCHED_MAX which specifies the new preferred cluster
4758  * for each QoS within the thread group.
4759  */
4760 void
4761 sched_edge_tg_preferred_cluster_change(struct thread_group *tg, uint32_t *tg_bucket_preferred_cluster, sched_perfcontrol_preferred_cluster_options_t options)
4762 {
4763 	sched_clutch_t clutch = sched_clutch_for_thread_group(tg);
4764 	/*
4765 	 * In order to optimize the processing, create a bitmap which represents all QoS buckets
4766 	 * for which the preferred cluster has changed.
4767 	 */
4768 	bitmap_t clutch_bucket_modify_bitmap[BITMAP_LEN(TH_BUCKET_SCHED_MAX)] = {0};
4769 	for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
4770 		uint32_t old_preferred_cluster = sched_edge_clutch_bucket_group_preferred_cluster(&clutch->sc_clutch_groups[bucket]);
4771 		uint32_t new_preferred_cluster = tg_bucket_preferred_cluster[bucket];
4772 		if (old_preferred_cluster != new_preferred_cluster) {
4773 			bitmap_set(clutch_bucket_modify_bitmap, bucket);
4774 		}
4775 	}
4776 	if (bitmap_lsb_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
4777 		/* No changes in any clutch buckets; nothing to do here */
4778 		return;
4779 	}
4780 
4781 	for (uint32_t cluster_id = 0; cluster_id < sched_edge_max_clusters; cluster_id++) {
4782 		processor_set_t pset = pset_array[cluster_id];
4783 		spl_t s = splsched();
4784 		pset_lock(pset);
4785 		/*
4786 		 * The first operation is to update the preferred cluster for all QoS buckets within the
4787 		 * thread group so that any future threads becoming runnable would see the new preferred
4788 		 * cluster value.
4789 		 */
4790 		sched_edge_update_preferred_cluster(clutch, clutch_bucket_modify_bitmap, tg_bucket_preferred_cluster);
4791 		/*
4792 		 * Currently iterates all clusters looking for running threads for a TG to be migrated. Can be optimized
4793 		 * by keeping a per-clutch bitmap of clusters running threads for a particular TG.
4794 		 *
4795 		 * Edge Scheduler Optimization
4796 		 */
4797 		/* Migrate all running threads of the TG on this cluster based on options specified by CLPC */
4798 		sched_edge_migrate_thread_group_running_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
4799 		    tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNING));
4800 		/* Migrate all runnable threads of the TG in this cluster's hierarchy based on options specified by CLPC */
4801 		sched_edge_migrate_thread_group_runnable_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
4802 		    tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNABLE));
4803 		/* sched_edge_migrate_thread_group_runnable_threads() returns with pset unlocked */
4804 		splx(s);
4805 	}
4806 }
4807 
4808 /*
4809  * sched_edge_pset_made_schedulable()
4810  *
4811  * Routine to migrate all the clutch buckets which are not in their recommended
4812  * pset hierarchy now that a new pset has become runnable. Its possible that this
4813  * routine is called when the pset is already marked schedulable.
4814  *
4815  * Invoked with the pset lock held and interrupts disabled.
4816  */
4817 static void
4818 sched_edge_pset_made_schedulable(__unused processor_t processor, processor_set_t dst_pset, boolean_t drop_lock)
4819 {
4820 	if (bitmap_test(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id)) {
4821 		/* Nothing to do here since pset is already marked schedulable */
4822 		if (drop_lock) {
4823 			pset_unlock(dst_pset);
4824 		}
4825 		return;
4826 	}
4827 
4828 	bitmap_set(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id);
4829 
4830 	thread_t thread = sched_edge_processor_idle(dst_pset);
4831 	if (thread != THREAD_NULL) {
4832 		thread_lock(thread);
4833 		thread_setrun(thread, SCHED_TAILQ);
4834 		thread_unlock(thread);
4835 	}
4836 
4837 	if (!drop_lock) {
4838 		pset_lock(dst_pset);
4839 	}
4840 }
4841 
4842 /*
4843  * sched_edge_cpu_init_completed()
4844  *
4845  * Callback routine from the platform layer once all CPUs/clusters have been initialized. This
4846  * provides an opportunity for the edge scheduler to initialize all the edge parameters.
4847  */
4848 static void
4849 sched_edge_cpu_init_completed(void)
4850 {
4851 	spl_t s = splsched();
4852 	for (int src_cluster_id = 0; src_cluster_id < sched_edge_max_clusters; src_cluster_id++) {
4853 		processor_set_t src_pset = pset_array[src_cluster_id];
4854 		pset_lock(src_pset);
4855 
4856 		/* For each cluster, set all its outgoing edge parameters */
4857 		for (int dst_cluster_id = 0; dst_cluster_id < sched_edge_max_clusters; dst_cluster_id++) {
4858 			if (dst_cluster_id == src_cluster_id) {
4859 				continue;
4860 			}
4861 			processor_set_t dst_pset = pset_array[dst_cluster_id];
4862 			if (src_pset->pset_type == dst_pset->pset_type) {
4863 				/* P->P/E->E edge config */
4864 				bitmap_clear(src_pset->foreign_psets, dst_cluster_id);
4865 				bitmap_set(src_pset->native_psets, dst_cluster_id);
4866 				sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 1, .sce_steal_allowed = 1});
4867 			} else if ((src_pset->pset_type == CLUSTER_TYPE_P) && (dst_pset->pset_type == CLUSTER_TYPE_E)) {
4868 				/* P->E edge config */
4869 				bitmap_set(src_pset->foreign_psets, dst_cluster_id);
4870 				bitmap_clear(src_pset->native_psets, dst_cluster_id);
4871 				sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 64, .sce_migration_allowed = 1, .sce_steal_allowed = 1});
4872 			} else {
4873 				/* E->P edge config */
4874 				bitmap_set(src_pset->foreign_psets, dst_cluster_id);
4875 				bitmap_clear(src_pset->native_psets, dst_cluster_id);
4876 				sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0});
4877 			}
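			/* Classify the destination cluster as local or remote depending on whether it shares a die with the source cluster */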
4878 			bool clusters_local = (ml_get_die_id(src_cluster_id) == ml_get_die_id(dst_cluster_id));
4879 			if (clusters_local) {
4880 				bitmap_set(src_pset->local_psets, dst_cluster_id);
4881 				bitmap_clear(src_pset->remote_psets, dst_cluster_id);
4882 			} else {
4883 				bitmap_set(src_pset->remote_psets, dst_cluster_id);
4884 				bitmap_clear(src_pset->local_psets, dst_cluster_id);
4885 			}
4886 		}
4887 
4888 		pset_unlock(src_pset);
4889 	}
4890 	splx(s);
4891 }
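/*
 * Illustrative example (not part of the original source): for a hypothetical
 * system with one E-cluster (id 0) and one P-cluster (id 1), the defaults set
 * above would produce an edge matrix along these lines (CLPC may later tune
 * the weights/flags via sched_edge_config_set()):
 *
 *   src -> dst   sce_migration_allowed   sce_steal_allowed   sce_migration_weight
 *   E -> P       0                       0                   0
 *   P -> E       1                       1                   64
 *
 * i.e. work preferring the P-cluster may migrate to or be stolen by the
 * E-cluster, while E-preferring work is not pushed to the P-cluster by
 * default; homogeneous edges (P->P, E->E) would allow migration and stealing
 * with weight 0.
 */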
4892 
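/*
 * sched_edge_thread_eligible_for_pset()
 *
 * A thread is eligible to run on a pset if that pset is its preferred cluster,
 * or if the edge from its preferred cluster to this pset allows migration.
 */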
4893 static bool
4894 sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset)
4895 {
4896 	uint32_t preferred_cluster_id = sched_edge_thread_preferred_cluster(thread);
4897 	if (preferred_cluster_id == pset->pset_cluster_id) {
4898 		return true;
4899 	} else {
4900 		processor_set_t preferred_pset = pset_array[preferred_cluster_id];
4901 		return preferred_pset->sched_edges[pset->pset_cluster_id].sce_migration_allowed;
4902 	}
4903 }
4904 
4905 extern int sched_amp_spill_deferred_ipi;
4906 extern int sched_amp_pcores_preempt_immediate_ipi;
4907 
4908 int sched_edge_migrate_ipi_immediate = 1;
4909 
4910 sched_ipi_type_t
4911 sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
4912 {
4913 	processor_set_t pset = dst->processor_set;
4914 	assert(dst != current_processor());
4915 
4916 	boolean_t deferred_ipi_supported = false;
4917 #if defined(CONFIG_SCHED_DEFERRED_AST)
4918 	deferred_ipi_supported = true;
4919 #endif /* CONFIG_SCHED_DEFERRED_AST */
4920 
4921 	switch (event) {
4922 	case SCHED_IPI_EVENT_SPILL:
4923 		/* For Spill events, use deferred IPIs if sched_amp_spill_deferred_ipi is set */
4924 		if (deferred_ipi_supported) {
4925 			return sched_ipi_deferred_policy(pset, dst, thread, event);
4926 		}
4927 		break;
4928 	case SCHED_IPI_EVENT_PREEMPT:
4929 		/* For preemption, the default policy is to use deferred IPIs
4930 		 * for Non-RT P-core preemption. Override that behavior if
4931 		 * sched_amp_pcores_preempt_immediate_ipi is set
4932 		 */
4933 		if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
4934 			if (sched_amp_pcores_preempt_immediate_ipi && (pset_type_for_id(pset->pset_cluster_id) == CLUSTER_TYPE_P)) {
4935 				return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4936 			}
4937 			if (sched_edge_migrate_ipi_immediate) {
4938 				processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
4939 				/*
4940 				 * For IPI'ing CPUs that are homogeneous with the preferred cluster, use immediate IPIs
4941 				 */
4942 				if (preferred_pset->pset_type == pset->pset_type) {
4943 					return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4944 				}
4945 				/*
4946 				 * For workloads that are going wide, it might be useful to use an immediate IPI to
4947 				 * wake up the idle CPU if the scheduler estimates that the preferred pset will
4948 				 * be busy for the deferred IPI timeout. The Edge Scheduler uses the avg execution
4949 				 * latency on the preferred pset as an estimate of busyness.
4950 				 */
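				/*
				 * Worked example (hypothetical numbers, not from the source): if the
				 * deferred-IPI timeout reported by ml_cpu_signal_deferred_get_timer() is
				 * 64000 ns and the preferred pset's average thread execution time for
				 * this bucket is 80 us, then 80 * NSEC_PER_USEC = 80000 ns >= 64000 ns,
				 * so an immediate IPI is used instead of waiting out the deferred timer.
				 */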
4951 				if ((preferred_pset->pset_execution_time[thread->th_sched_bucket].pset_avg_thread_execution_time * NSEC_PER_USEC) >= ml_cpu_signal_deferred_get_timer()) {
4952 					return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
4953 				}
4954 			}
4955 		}
4956 		break;
4957 	default:
4958 		break;
4959 	}
4960 	/* Default back to the global policy for all other scenarios */
4961 	return sched_ipi_policy(dst, thread, dst_idle, event);
4962 }
4963 
4964 /*
4965  * sched_edge_qos_max_parallelism()
4966  */
4967 uint32_t
4968 sched_edge_qos_max_parallelism(int qos, uint64_t options)
4969 {
4970 	uint32_t ecpu_count = ml_get_cpu_number_type(CLUSTER_TYPE_E, false, false);
4971 	uint32_t pcpu_count = ml_get_cpu_number_type(CLUSTER_TYPE_P, false, false);
4972 	uint32_t ecluster_count = ml_get_cluster_number_type(CLUSTER_TYPE_E);
4973 	uint32_t pcluster_count = ml_get_cluster_number_type(CLUSTER_TYPE_P);
4974 
4975 	if (options & QOS_PARALLELISM_REALTIME) {
4976 		/* For realtime threads on AMP, we want to limit the
4977 		 * parallelism width to just the P-cores since we do not
4978 		 * spill/rebalance RT threads.
4979 		 */
4980 		return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? pcluster_count : pcpu_count;
4981 	}
4982 
4983 	/*
4984 	 * The Edge scheduler supports per-QoS recommendations for thread groups.
4985 	 * This enables lower QoS buckets (such as UT) to be scheduled on all
4986 	 * CPUs on the system.
4987 	 *
4988 	 * The only restriction is for BG/Maintenance QoS classes for which the
4989 	 * performance controller would never recommend execution on the P-cores.
4990 	 * If that policy changes in the future, this value should be changed.
4991 	 */
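	/*
	 * Worked example (hypothetical topology, not from the source): on a system
	 * with 4 E-cores in one E-cluster and 8 P-cores across two P-clusters, this
	 * returns 4 for BG/MT (or 1 with QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE),
	 * 12 for other QoS classes (or 3 with the cluster option), and 8 for the
	 * QOS_PARALLELISM_REALTIME case above (or 2 with the cluster option).
	 */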
4992 	switch (qos) {
4993 	case THREAD_QOS_BACKGROUND:
4994 	case THREAD_QOS_MAINTENANCE:
4995 		return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? ecluster_count : ecpu_count;
4996 	default:
4997 		return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? (ecluster_count + pcluster_count) : (ecpu_count + pcpu_count);
4998 	}
4999 }
5000 
5001 
5002 
5003 #endif /* CONFIG_SCHED_EDGE */
5004 
5005 #endif /* CONFIG_SCHED_CLUTCH */
5006