xref: /xnu-12377.41.6/osfmk/kern/sched_clutch.c (revision bbb1b6f9e71b8cdde6e5cd6f4841f207dee3d828)
1 /*
2  * Copyright (c) 2018 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #if !SCHED_TEST_HARNESS
30 
31 #include <kern/debug.h>
32 #include <kern/kern_types.h>
33 #include <kern/machine.h>
34 #include <kern/misc_protos.h>
35 #include <kern/queue.h>
36 #include <kern/sched_clutch.h>
37 #include <kern/sched.h>
38 #include <kern/task.h>
39 #include <kern/thread.h>
40 
41 #include <mach/mach_types.h>
42 #include <mach/machine.h>
43 
44 #include <machine/atomic.h>
45 #include <machine/machine_cpu.h>
46 #include <machine/machine_routines.h>
47 #include <machine/sched_param.h>
48 
49 #include <sys/kdebug.h>
50 
51 #endif /* !SCHED_TEST_HARNESS */
52 
53 #include <kern/processor.h>
54 #include <kern/sched_prim.h>
55 #include <kern/sched_rt.h>
56 
57 #if CONFIG_SCHED_EDGE
58 #include <kern/sched_amp_common.h>
59 #endif /* CONFIG_SCHED_EDGE */
60 
61 #if CONFIG_SCHED_CLUTCH
62 
63 #if CONFIG_SCHED_SMT
64 #error "The clutch scheduler does not support CONFIG_SCHED_SMT."
65 #endif /* CONFIG_SCHED_SMT */
66 
67 #define SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION 1
68 typedef union {
69 	struct __attribute__((packed)) {
70 		unsigned int version                            : 4;
71 		unsigned int traverse_mode                      : 3;
72 		unsigned int cluster_id                         : 6;
73 		unsigned int selection_was_edf                  : 1;
74 		unsigned int selection_was_cluster_bound        : 1;
75 		unsigned int selection_opened_starvation_avoidance_window  : 1;
76 		unsigned int selection_opened_warp_window       : 1;
77 		unsigned int starvation_avoidance_window_close  : 12;
78 		unsigned int warp_window_close                  : 12;
79 		unsigned int reserved                           : 23;  /* For future usage */
80 	} trace_data;
81 	uint64_t scdts_trace_data_packed;
82 } sched_clutch_dbg_thread_select_packed_t;
83 
84 static_assert(TH_BUCKET_SCHED_MAX == 6, "Ensure layout of sched_clutch_dbg_thread_select_packed can fit root bucket bitmasks");
85 static_assert(sizeof(sched_clutch_dbg_thread_select_packed_t) <= sizeof(uint64_t), "Ensure sched_clutch_dbg_thread_select_packed_t can fit in one tracepoint argument");
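/*
 * Illustrative sketch (hypothetical call site, not a specific tracepoint): since the
 * packed union fits in a single 64-bit value, the thread-selection path can record its
 * selection metadata in one kdebug argument, roughly:
 *
 *	sched_clutch_dbg_thread_select_packed_t dbg = {0};
 *	dbg.trace_data.version = SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION;
 *	dbg.trace_data.cluster_id = root_clutch->scr_cluster_id;
 *	dbg.trace_data.selection_was_edf = true;
 *	KDBG(<thread-select debugid>, dbg.scdts_trace_data_packed);
 */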
86 
87 /* Forward declarations of static routines */
88 
89 /* Root level hierarchy management */
90 static void sched_clutch_root_init(sched_clutch_root_t, processor_set_t);
91 static void sched_clutch_root_bucket_init(sched_clutch_root_bucket_t, sched_bucket_t, bool);
92 static void sched_clutch_root_pri_update(sched_clutch_root_t);
93 static void sched_clutch_root_urgency_inc(sched_clutch_root_t, thread_t);
94 static void sched_clutch_root_urgency_dec(sched_clutch_root_t, thread_t);
95 
96 __enum_decl(sched_clutch_highest_root_bucket_type_t, uint32_t, {
97 	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_NONE           = 0,
98 	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY   = 1,
99 	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL            = 2,
100 });
101 __enum_decl(sched_clutch_traverse_mode_t, uint32_t, {
102 	SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY   = 0,
103 	SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT = 1,
104 	SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT           = 2,
105 });
106 static_assert(SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT < (1 << 3), "Ensure traverse mode can be encoded within 3 bits of sched_clutch_dbg_thread_select_packed_t");
107 static sched_clutch_root_bucket_t sched_clutch_root_highest_root_bucket(sched_clutch_root_t, uint64_t, sched_clutch_highest_root_bucket_type_t, sched_clutch_root_bucket_t, thread_t, bool *, sched_clutch_traverse_mode_t, sched_clutch_dbg_thread_select_packed_t *);
108 
109 #if CONFIG_SCHED_EDGE
110 /* Support for foreign threads on AMP platforms */
111 static boolean_t sched_clutch_root_foreign_empty(sched_clutch_root_t);
112 static thread_t sched_clutch_root_highest_foreign_thread_remove(sched_clutch_root_t);
113 #endif /* CONFIG_SCHED_EDGE */
114 
115 /* Root bucket level hierarchy management */
116 static uint64_t sched_clutch_root_bucket_deadline_calculate(sched_clutch_root_bucket_t, uint64_t);
117 static void sched_clutch_root_bucket_deadline_update(sched_clutch_root_bucket_t, sched_clutch_root_t, uint64_t, bool);
118 static int sched_clutch_root_highest_runnable_qos(sched_clutch_root_t, sched_clutch_highest_root_bucket_type_t);
119 
120 /* Options for clutch bucket ordering in the runq */
121 __options_decl(sched_clutch_bucket_options_t, uint32_t, {
122 	SCHED_CLUTCH_BUCKET_OPTIONS_NONE        = 0x0,
123 	/* Round robin clutch bucket on thread removal */
124 	SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR  = 0x1,
125 	/* Insert clutch bucket at head (for thread preemption) */
126 	SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ       = 0x2,
127 	/* Insert clutch bucket at tail (default) */
128 	SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ       = 0x4,
129 });
130 
131 /* Clutch bucket level hierarchy management */
132 static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
133 static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
134 static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
135 static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
136 static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
137 static uint8_t sched_clutch_bucket_pri_calculate(sched_clutch_bucket_t, uint64_t);
138 
139 /* Clutch bucket group level properties management */
140 static void sched_clutch_bucket_group_cpu_usage_update(sched_clutch_bucket_group_t, uint64_t);
141 static void sched_clutch_bucket_group_cpu_adjust(sched_clutch_bucket_group_t, uint8_t);
142 static void sched_clutch_bucket_group_pri_shift_update(sched_clutch_bucket_group_t);
143 static uint8_t sched_clutch_bucket_group_pending_ageout(sched_clutch_bucket_group_t, uint64_t);
144 static uint32_t sched_clutch_bucket_group_run_count_inc(sched_clutch_bucket_group_t);
145 static uint32_t sched_clutch_bucket_group_run_count_dec(sched_clutch_bucket_group_t);
146 static uint8_t sched_clutch_bucket_group_interactivity_score_calculate(sched_clutch_bucket_group_t, uint64_t);
147 
148 /* Clutch timeshare properties updates */
149 static uint32_t sched_clutch_run_bucket_incr(sched_clutch_t, sched_bucket_t);
150 static uint32_t sched_clutch_run_bucket_decr(sched_clutch_t, sched_bucket_t);
151 
152 /* Clutch membership management */
153 static boolean_t sched_clutch_thread_insert(sched_clutch_root_t, thread_t, integer_t);
154 static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t, sched_clutch_bucket_options_t);
155 static thread_t sched_clutch_hierarchy_thread_highest(sched_clutch_root_t, processor_t, thread_t, sched_clutch_traverse_mode_t);
156 
157 /* Clutch properties updates */
158 static uint32_t sched_clutch_root_urgency(sched_clutch_root_t);
159 static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t);
160 static int sched_clutch_root_priority(sched_clutch_root_t);
161 static sched_clutch_bucket_t sched_clutch_root_bucket_highest_clutch_bucket(sched_clutch_root_t, sched_clutch_root_bucket_t, processor_t _Nullable processor, thread_t _Nullable prev_thread, bool *_Nullable chose_prev_thread);
162 
163 /* Clutch thread properties */
164 static boolean_t sched_thread_sched_pri_promoted(thread_t);
165 static inline sched_clutch_bucket_t sched_clutch_bucket_for_thread(sched_clutch_root_t, thread_t);
166 static inline sched_clutch_bucket_group_t sched_clutch_bucket_group_for_thread(thread_t);
167 
168 /* General utilities */
169 static inline bool sched_clutch_pri_greater_than_tiebreak(int, int, bool);
170 
171 #if CONFIG_SCHED_EDGE
172 /* System based routines */
173 static uint32_t sched_edge_thread_bound_cluster_id(thread_t);
174 
175 /* Global indicating the maximum number of clusters on the current platform */
176 static int sched_edge_max_clusters = 0;
177 #endif /* CONFIG_SCHED_EDGE */
178 
179 /* Helper debugging routines */
180 static inline void sched_clutch_hierarchy_locked_assert(sched_clutch_root_t);
181 
182 extern processor_set_t pset_array[MAX_PSETS];
183 
184 /*
185  * Special markers for buckets that have invalid WCELs/quantums etc.
186  */
187 #define SCHED_CLUTCH_INVALID_TIME_32 ((uint32_t)~0)
188 #define SCHED_CLUTCH_INVALID_TIME_64 ((uint64_t)~0)
189 
190 /*
191  * Root level bucket WCELs
192  *
193  * The root level bucket selection algorithm is an Earliest Deadline
194  * First (EDF) algorithm where the deadline for a bucket is defined
195  * by its worst-case-execution-latency (WCEL) and the timestamp at
196  * which the bucket was made runnable.
197  *
198  */
199 static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = {
200 	SCHED_CLUTCH_INVALID_TIME_32,                   /* FIXPRI */
201 	0,                                              /* FG */
202 	37500,                                          /* IN (37.5ms) */
203 	75000,                                          /* DF (75ms) */
204 	150000,                                         /* UT (150ms) */
205 	250000                                          /* BG (250ms) */
206 };
207 static uint64_t sched_clutch_root_bucket_wcel[TH_BUCKET_SCHED_MAX] = {0};
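/*
 * Worked example (timestamps illustrative): a DF bucket made runnable at abstime T
 * gets deadline T + 75ms, while a UT bucket that became runnable at T - 80ms holds
 * deadline T + 70ms. EDF therefore picks UT first despite DF being the higher QoS;
 * the warp mechanism below is what lets higher QoS buckets jump ahead within a
 * bounded window.
 */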
208 
209 /*
210  * Root level bucket warp
211  *
212  * Each root level bucket has a warp value associated with it as well.
213  * The warp value allows the root bucket to effectively warp ahead of
214  * lower priority buckets for a limited time even if it has a later
215  * deadline. The warping behavior provides extra (but limited)
216  * opportunity for high priority buckets to remain responsive.
217  */
218 
219 /* Special warp deadline value to indicate that the bucket has not used any warp yet */
220 #define SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED    (SCHED_CLUTCH_INVALID_TIME_64)
221 
222 /* Warp window durations for various tiers */
223 static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = {
224 	SCHED_CLUTCH_INVALID_TIME_32,                   /* FIXPRI */
225 	8000,                                           /* FG (8ms)*/
226 	4000,                                           /* IN (4ms) */
227 	2000,                                           /* DF (2ms) */
228 	1000,                                           /* UT (1ms) */
229 	0                                               /* BG (0ms) */
230 };
231 static uint64_t sched_clutch_root_bucket_warp[TH_BUCKET_SCHED_MAX] = {0};
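/*
 * Worked example (durations illustrative): continuing the example above, the DF root
 * bucket carries 2ms of warp. When it loses the EDF comparison to UT, the warp bitmap
 * shows a higher-QoS bucket with warp remaining, so DF is selected anyway and a 2ms
 * warp window is opened (scrb_warped_deadline = now + scrb_warp_remaining). Once the
 * window expires, DF competes purely on deadlines again; its warp budget is refreshed
 * the next time it is selected in the natural EDF order.
 */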
232 
233 /*
234  * Thread level quantum
235  *
236  * The algorithm defines quantums for threads at various buckets. This
237  * (combined with the root level bucket quantums) restricts how much
238  * the lower priority levels can preempt the higher priority threads.
239  */
240 
241 #if XNU_TARGET_OS_OSX
242 static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
243 	10000,                                          /* FIXPRI (10ms) */
244 	10000,                                          /* FG (10ms) */
245 	10000,                                          /* IN (10ms) */
246 	10000,                                          /* DF (10ms) */
247 	4000,                                           /* UT (4ms) */
248 	2000                                            /* BG (2ms) */
249 };
250 #else /* XNU_TARGET_OS_OSX */
251 static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
252 	10000,                                          /* FIXPRI (10ms) */
253 	10000,                                          /* FG (10ms) */
254 	8000,                                           /* IN (8ms) */
255 	6000,                                           /* DF (6ms) */
256 	4000,                                           /* UT (4ms) */
257 	2000                                            /* BG (2ms) */
258 };
259 #endif /* XNU_TARGET_OS_OSX */
260 
261 static uint64_t sched_clutch_thread_quantum[TH_BUCKET_SCHED_MAX] = {0};
262 
263 /*
264  * sched_clutch_us_to_abstime()
265  *
266  * Initializer for converting all durations in usec to abstime
267  */
268 static void
269 sched_clutch_us_to_abstime(uint32_t *us_vals, uint64_t *abstime_vals)
270 {
271 	for (int i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
272 		if (us_vals[i] == SCHED_CLUTCH_INVALID_TIME_32) {
273 			abstime_vals[i] = SCHED_CLUTCH_INVALID_TIME_64;
274 		} else {
275 			clock_interval_to_absolutetime_interval(us_vals[i],
276 			    NSEC_PER_USEC, &abstime_vals[i]);
277 		}
278 	}
279 }
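/*
 * Minimal usage sketch (the actual call site lives in the scheduler init path and is
 * assumed here): each of the microsecond tables above is expected to be converted once
 * at boot, along the lines of:
 *
 *	sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
 *	sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
 *	sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
 */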
280 
281 /* Clutch/Edge Scheduler Debugging support */
282 #define SCHED_CLUTCH_DBG_THR_COUNT_PACK(a, b, c)        ((uint64_t)c | ((uint64_t)b << 16) | ((uint64_t)a << 32))
283 
284 #if DEVELOPMENT || DEBUG
285 
286 kern_return_t
287 sched_clutch_thread_group_cpu_time_for_thread(thread_t thread, int sched_bucket, uint64_t *cpu_stats)
288 {
289 	if (sched_bucket < 0 || sched_bucket >= TH_BUCKET_MAX) {
290 		return KERN_INVALID_ARGUMENT;
291 	}
292 	sched_clutch_bucket_group_t clutch_bucket_group = &sched_clutch_for_thread(thread)->sc_clutch_groups[sched_bucket];
293 	sched_clutch_bucket_cpu_data_t scb_cpu_data;
294 	scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, relaxed);
295 	cpu_stats[0] = scb_cpu_data.cpu_data.scbcd_cpu_used;
296 	cpu_stats[1] = scb_cpu_data.cpu_data.scbcd_cpu_blocked;
297 	return KERN_SUCCESS;
298 }
299 
300 /*
301  * sched_clutch_hierarchy_locked_assert()
302  *
303  * Debugging helper routine. Asserts that the hierarchy is locked. The locking
304  * for the hierarchy depends on where the hierarchy is hooked. The current
305  * implementation hooks the hierarchy at the pset, so the hierarchy is locked
306  * using the pset lock.
307  */
308 static inline void
309 sched_clutch_hierarchy_locked_assert(
310 	sched_clutch_root_t root_clutch)
311 {
312 	pset_assert_locked(root_clutch->scr_pset);
313 }
314 
315 #else /* DEVELOPMENT || DEBUG */
316 
317 static inline void
318 sched_clutch_hierarchy_locked_assert(
319 	__unused sched_clutch_root_t root_clutch)
320 {
321 }
322 
323 #endif /* DEVELOPMENT || DEBUG */
324 
325 /*
326  * sched_clutch_thr_count_inc()
327  *
328  * Increment thread count at a hierarchy level with overflow checks.
329  */
330 static void
331 sched_clutch_thr_count_inc(
332 	uint16_t *thr_count)
333 {
334 	if (__improbable(os_inc_overflow(thr_count))) {
335 		panic("sched_clutch thread count overflowed!");
336 	}
337 }
338 
339 /*
340  * sched_clutch_thr_count_dec()
341  *
342  * Decrement thread count at a hierarchy level with underflow checks.
343  */
344 static void
345 sched_clutch_thr_count_dec(
346 	uint16_t *thr_count)
347 {
348 	if (__improbable(os_dec_overflow(thr_count))) {
349 		panic("sched_clutch thread count underflowed!");
350 	}
351 }
352 
353 static sched_bucket_t
354 sched_convert_pri_to_bucket(uint8_t priority)
355 {
356 	sched_bucket_t bucket = TH_BUCKET_RUN;
357 
358 	if (priority > BASEPRI_USER_INITIATED) {
359 		bucket = TH_BUCKET_SHARE_FG;
360 	} else if (priority > BASEPRI_DEFAULT) {
361 		bucket = TH_BUCKET_SHARE_IN;
362 	} else if (priority > BASEPRI_UTILITY) {
363 		bucket = TH_BUCKET_SHARE_DF;
364 	} else if (priority > MAXPRI_THROTTLE) {
365 		bucket = TH_BUCKET_SHARE_UT;
366 	} else {
367 		bucket = TH_BUCKET_SHARE_BG;
368 	}
369 	return bucket;
370 }
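/*
 * Worked example (assuming the usual base priorities: BASEPRI_USER_INITIATED 37,
 * BASEPRI_DEFAULT 31, BASEPRI_UTILITY 20, MAXPRI_THROTTLE 4): a timeshare thread at
 * priority 31 fails the first two comparisons and lands in TH_BUCKET_SHARE_DF, while a
 * priority 4 thread fails all of them and falls through to TH_BUCKET_SHARE_BG.
 */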
371 
372 /*
373  * sched_clutch_thread_bucket_map()
374  *
375  * Map a thread to a scheduling bucket for the clutch/edge scheduler
376  * based on its scheduling mode and the priority attribute passed in.
377  */
378 static sched_bucket_t
379 sched_clutch_thread_bucket_map(thread_t thread, int pri)
380 {
381 	switch (thread->sched_mode) {
382 	case TH_MODE_FIXED:
383 		if (pri >= BASEPRI_FOREGROUND) {
384 			return TH_BUCKET_FIXPRI;
385 		} else {
386 			return sched_convert_pri_to_bucket(pri);
387 		}
388 
389 	case TH_MODE_REALTIME:
390 		return TH_BUCKET_FIXPRI;
391 
392 	case TH_MODE_TIMESHARE:
393 		return sched_convert_pri_to_bucket(pri);
394 
395 	default:
396 		panic("unexpected mode: %d", thread->sched_mode);
397 		break;
398 	}
399 }
400 
401 /*
402  * The clutch scheduler attempts to age out the CPU usage of clutch bucket groups
403  * based on the amount of time they have been pending and the load at that
404  * scheduling bucket level. Since the clutch bucket groups are global (i.e. they span
405  * multiple clusters), it is important to keep the load as a global counter as well.
406  */
407 static uint32_t _Atomic sched_clutch_global_bucket_load[TH_BUCKET_SCHED_MAX];
408 
409 /*
410  * sched_clutch_root_init()
411  *
412  * Routine to initialize the scheduler hierarchy root.
413  */
414 static void
415 sched_clutch_root_init(
416 	sched_clutch_root_t root_clutch,
417 	processor_set_t pset)
418 {
419 	root_clutch->scr_thr_count = 0;
420 	root_clutch->scr_priority = NOPRI;
421 	root_clutch->scr_urgency = 0;
422 	root_clutch->scr_pset = pset;
423 #if CONFIG_SCHED_EDGE
424 	root_clutch->scr_cluster_id = pset->pset_cluster_id;
425 	for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
426 		root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type] = 0;
427 	}
428 #else /* CONFIG_SCHED_EDGE */
429 	root_clutch->scr_cluster_id = 0;
430 #endif /* CONFIG_SCHED_EDGE */
431 
432 	/* Initialize the queue which maintains all runnable clutch_buckets for timesharing purposes */
433 	queue_init(&root_clutch->scr_clutch_buckets);
434 
435 	/* Initialize the priority queue which maintains all runnable foreign clutch buckets */
436 	priority_queue_init(&root_clutch->scr_foreign_buckets);
437 	bzero(&root_clutch->scr_cumulative_run_count, sizeof(root_clutch->scr_cumulative_run_count));
438 	bitmap_zero(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
439 	bitmap_zero(root_clutch->scr_bound_warp_available, TH_BUCKET_SCHED_MAX);
440 	priority_queue_init(&root_clutch->scr_bound_root_buckets);
441 
442 	/* Initialize the bitmap and priority queue of runnable root buckets */
443 	priority_queue_init(&root_clutch->scr_unbound_root_buckets);
444 	bitmap_zero(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
445 	bitmap_zero(root_clutch->scr_unbound_warp_available, TH_BUCKET_SCHED_MAX);
446 
447 	/* Initialize all the root buckets */
448 	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
449 		sched_clutch_root_bucket_init(&root_clutch->scr_unbound_buckets[i], i, false);
450 		sched_clutch_root_bucket_init(&root_clutch->scr_bound_buckets[i], i, true);
451 	}
452 }
453 
454 /*
455  * Clutch Bucket Runqueues
456  *
457  * The clutch buckets are maintained in a runq at the root bucket level. The
458  * runq organization allows clutch buckets to be ordered based on various
459  * factors such as:
460  *
461  * - Clutch buckets are round robin'ed at the same priority level when a
462  *   thread is selected from a clutch bucket. This prevents a clutch bucket
463  *   from starving out other clutch buckets at the same priority.
464  *
465  * - Clutch buckets are inserted at the head when they become runnable due to
466  *   thread preemption. This allows threads that were preempted to maintain
467  *   their order in the queue.
468  */
469 
470 /*
471  * sched_clutch_bucket_runq_init()
472  *
473  * Initialize a clutch bucket runq.
474  */
475 static void
476 sched_clutch_bucket_runq_init(
477 	sched_clutch_bucket_runq_t clutch_buckets_rq)
478 {
479 	clutch_buckets_rq->scbrq_highq = NOPRI;
480 	for (uint8_t i = 0; i < BITMAP_LEN(NRQS); i++) {
481 		clutch_buckets_rq->scbrq_bitmap[i] = 0;
482 	}
483 	clutch_buckets_rq->scbrq_count = 0;
484 	for (int i = 0; i < NRQS; i++) {
485 		circle_queue_init(&clutch_buckets_rq->scbrq_queues[i]);
486 	}
487 }
488 
489 /*
490  * sched_clutch_bucket_runq_empty()
491  *
492  * Returns whether a clutch bucket runq is empty.
493  */
494 static boolean_t
495 sched_clutch_bucket_runq_empty(
496 	sched_clutch_bucket_runq_t clutch_buckets_rq)
497 {
498 	return clutch_buckets_rq->scbrq_count == 0;
499 }
500 
501 /*
502  * sched_clutch_bucket_runq_peek()
503  *
504  * Returns the highest priority clutch bucket in the runq.
505  */
506 static sched_clutch_bucket_t
507 sched_clutch_bucket_runq_peek(
508 	sched_clutch_bucket_runq_t clutch_buckets_rq)
509 {
510 	if (clutch_buckets_rq->scbrq_count > 0) {
511 		circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_buckets_rq->scbrq_highq];
512 		return cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink);
513 	} else {
514 		return NULL;
515 	}
516 }
517 
518 /*
519  * sched_clutch_bucket_runq_enqueue()
520  *
521  * Enqueue a clutch bucket into the runq based on the options passed in.
522  */
523 static void
524 sched_clutch_bucket_runq_enqueue(
525 	sched_clutch_bucket_runq_t clutch_buckets_rq,
526 	sched_clutch_bucket_t clutch_bucket,
527 	sched_clutch_bucket_options_t options)
528 {
529 	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
530 	if (circle_queue_empty(queue)) {
531 		circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
532 		bitmap_set(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
533 		if (clutch_bucket->scb_priority > clutch_buckets_rq->scbrq_highq) {
534 			clutch_buckets_rq->scbrq_highq = clutch_bucket->scb_priority;
535 		}
536 	} else {
537 		if (options & SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ) {
538 			circle_enqueue_head(queue, &clutch_bucket->scb_runqlink);
539 		} else {
540 			/*
541 			 * Default behavior (handles SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ &
542 			 * SCHED_CLUTCH_BUCKET_OPTIONS_NONE)
543 			 */
544 			circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
545 		}
546 	}
547 	clutch_buckets_rq->scbrq_count++;
548 }
549 
550 /*
551  * sched_clutch_bucket_runq_remove()
552  *
553  * Remove a clutch bucket from the runq.
554  */
555 static void
556 sched_clutch_bucket_runq_remove(
557 	sched_clutch_bucket_runq_t clutch_buckets_rq,
558 	sched_clutch_bucket_t clutch_bucket)
559 {
560 	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
561 	circle_dequeue(queue, &clutch_bucket->scb_runqlink);
562 	assert(clutch_buckets_rq->scbrq_count > 0);
563 	clutch_buckets_rq->scbrq_count--;
564 	if (circle_queue_empty(queue)) {
565 		bitmap_clear(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
566 		clutch_buckets_rq->scbrq_highq = bitmap_first(clutch_buckets_rq->scbrq_bitmap, NRQS);
567 	}
568 }
569 
570 static void
571 sched_clutch_bucket_runq_rotate(
572 	sched_clutch_bucket_runq_t clutch_buckets_rq,
573 	sched_clutch_bucket_t clutch_bucket)
574 {
575 	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
576 	assert(clutch_bucket == cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink));
577 	circle_queue_rotate_head_forward(queue);
578 }
579 
580 /*
581  * sched_clutch_root_bucket_init()
582  *
583  * Routine to initialize root buckets.
584  */
585 static void
586 sched_clutch_root_bucket_init(
587 	sched_clutch_root_bucket_t root_bucket,
588 	sched_bucket_t bucket,
589 	bool bound_root_bucket)
590 {
591 	root_bucket->scrb_bucket = bucket;
592 	if (bound_root_bucket) {
593 		/* For bound root buckets, initialize the bound thread runq. */
594 		root_bucket->scrb_bound = true;
595 		run_queue_init(&root_bucket->scrb_bound_thread_runq);
596 	} else {
597 		/*
598 		 * The unbounded root buckets contain a runq of runnable clutch buckets
599 		 * which then hold the runnable threads.
600 		 */
601 		root_bucket->scrb_bound = false;
602 		sched_clutch_bucket_runq_init(&root_bucket->scrb_clutch_buckets);
603 	}
604 	priority_queue_entry_init(&root_bucket->scrb_pqlink);
605 	root_bucket->scrb_pqlink.deadline = 0;
606 	root_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
607 	root_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[root_bucket->scrb_bucket];
608 	root_bucket->scrb_starvation_avoidance = false;
609 	root_bucket->scrb_starvation_ts = 0;
610 }
611 
612 /*
613  * Special case scheduling for Above UI bucket.
614  *
615  * AboveUI threads are typically system-critical threads that need low latency,
616  * which is why they are handled specially.
617  *
618  * Since the priority ranges for the AboveUI and FG Timeshare buckets overlap, it is
619  * important to maintain some native priority order between those buckets. For unbounded
620  * root buckets, the policy is to compare the highest clutch buckets of both root buckets; if the
621  * Above UI bucket is higher, schedule it immediately. Otherwise fall through to the
622  * deadline-based scheduling, which should pick up the timeshare buckets. For the bound
623  * case, the policy simply compares the priorities of the highest runnable threads in
624  * the above UI and timeshare buckets.
625  *
626  * The implementation allows extremely low latency CPU access for Above UI threads
627  * while supporting the use case of high priority timeshare threads contending with
628  * lower priority fixed priority threads.
629  */
630 
631 
632 /*
633  * sched_clutch_root_unbound_select_aboveui()
634  *
635  * Routine to determine if the above UI unbounded bucket should be selected for execution.
636  *
637  * Writes the highest unbound (timeshare FG vs. above UI) bucket, its priority, and whether
638  * it is an above UI bucket into the pointer parameters.
639  */
640 static void
641 sched_clutch_root_unbound_select_aboveui(
642 	sched_clutch_root_t root_clutch,
643 	sched_clutch_root_bucket_t *highest_bucket,
644 	int *highest_pri,
645 	bool *highest_is_aboveui,
646 	sched_clutch_root_bucket_t _Nullable prev_bucket,
647 	thread_t _Nullable prev_thread)
648 {
649 	/* First determine the highest Clutch bucket */
650 	sched_clutch_root_bucket_t higher_root_bucket = NULL;
651 	sched_clutch_bucket_t higher_clutch_bucket = NULL;
652 	int higher_bucket_sched_pri = -1;
653 	bool higher_is_aboveui = false;
654 	/* Consider unbound Above UI */
655 	if (bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_FIXPRI)) {
656 		higher_root_bucket = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
657 		higher_clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, higher_root_bucket, NULL, NULL, NULL);
658 		higher_bucket_sched_pri = priority_queue_max_sched_pri(&higher_clutch_bucket->scb_clutchpri_prioq);
659 		higher_is_aboveui = true;
660 	}
661 	/* Consider unbound Timeshare FG */
662 	if (bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SHARE_FG)) {
663 		sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_unbound_buckets[TH_BUCKET_SHARE_FG];
664 		sched_clutch_bucket_t clutch_bucket_sharefg = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, root_bucket_sharefg, NULL, NULL, NULL);
665 		/* Strict greater-than because unbound timeshare FG root bucket loses all priority ties at this level */
666 		if (higher_root_bucket == NULL || clutch_bucket_sharefg->scb_priority > higher_clutch_bucket->scb_priority) {
667 			higher_root_bucket = root_bucket_sharefg;
668 			higher_clutch_bucket = clutch_bucket_sharefg;
669 			higher_bucket_sched_pri = priority_queue_max_sched_pri(&higher_clutch_bucket->scb_clutchpri_prioq);
670 			higher_is_aboveui = false;
671 		}
672 	}
673 	/* Consider the previous thread */
674 	if (prev_thread != NULL) {
675 		assert(prev_bucket->scrb_bound == false);
676 		sched_clutch_bucket_group_t prev_clutch_bucket_group = sched_clutch_bucket_group_for_thread(prev_thread);
677 		int prev_clutch_bucket_pri = prev_thread->sched_pri + (int)(os_atomic_load(&prev_clutch_bucket_group->scbg_interactivity_data.scct_count, relaxed));
678 		sched_clutch_bucket_t prev_clutch_bucket = sched_clutch_bucket_for_thread(root_clutch, prev_thread);
679 		bool prev_bucket_should_win_ties = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI && higher_is_aboveui == false;
680 		if (higher_clutch_bucket == NULL ||
681 		    sched_clutch_pri_greater_than_tiebreak(prev_clutch_bucket_pri, higher_clutch_bucket->scb_priority, prev_bucket_should_win_ties)) {
682 			higher_root_bucket = prev_bucket;
683 			higher_clutch_bucket = prev_clutch_bucket;
684 			higher_bucket_sched_pri = prev_thread->sched_pri;
685 			higher_is_aboveui = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI;
686 		}
687 	}
688 	/* Compare highest priority in the highest unbound Clutch bucket to highest priority seen from the bound buckets */
689 	if (higher_root_bucket != NULL) {
690 		bool unbound_should_win_ties = higher_is_aboveui == true && *highest_is_aboveui == false;
691 		if (sched_clutch_pri_greater_than_tiebreak(higher_bucket_sched_pri, *highest_pri, unbound_should_win_ties)) {
692 			*highest_pri = higher_bucket_sched_pri;
693 			*highest_bucket = higher_root_bucket;
694 			*highest_is_aboveui = higher_is_aboveui;
695 		}
696 	}
697 }
698 
699 /*
700  * sched_clutch_root_bound_select_aboveui()
701  *
702  * Routine to determine if the above UI bounded bucket should be selected for execution.
703  *
704  * Writes the highest bound (timeshare FG vs. above UI) bucket, its priority, and whether
705  * it is an above UI bucket into the pointer parameters.
706  */
707 static void
708 sched_clutch_root_bound_select_aboveui(
709 	sched_clutch_root_t root_clutch,
710 	sched_clutch_root_bucket_t *highest_bucket,
711 	int *highest_pri,
712 	bool *highest_is_aboveui,
713 	sched_clutch_root_bucket_t _Nullable prev_bucket,
714 	thread_t _Nullable prev_thread)
715 {
716 	/* Consider bound Above UI */
717 	sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
718 	if (bitmap_test(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_FIXPRI) &&
719 	    sched_clutch_pri_greater_than_tiebreak(root_bucket_aboveui->scrb_bound_thread_runq.highq, *highest_pri, *highest_is_aboveui == false)) {
720 		*highest_pri = root_bucket_aboveui->scrb_bound_thread_runq.highq;
721 		*highest_bucket = root_bucket_aboveui;
722 		*highest_is_aboveui = true;
723 	}
724 	/* Consider bound Timeshare FG */
725 	sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_bound_buckets[TH_BUCKET_SHARE_FG];
726 	if (bitmap_test(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SHARE_FG) &&
727 	    sched_clutch_pri_greater_than_tiebreak(root_bucket_sharefg->scrb_bound_thread_runq.highq, *highest_pri, false)) {
728 		*highest_pri = root_bucket_sharefg->scrb_bound_thread_runq.highq;
729 		*highest_bucket = root_bucket_sharefg;
730 		*highest_is_aboveui = false;
731 	}
732 	/* Consider the previous thread */
733 	if (prev_thread != NULL) {
734 		assert(prev_bucket->scrb_bound == true);
735 		bool prev_bucket_should_win_ties = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI && *highest_is_aboveui == false;
736 		if (sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, *highest_pri, prev_bucket_should_win_ties)) {
737 			*highest_pri = prev_thread->sched_pri;
738 			*highest_bucket = prev_bucket;
739 			*highest_is_aboveui = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI;
740 		}
741 	}
742 }
743 
744 /*
745  * sched_clutch_root_highest_runnable_qos()
746  *
747  * Returns the index of the highest-QoS root bucket which is currently runnable.
748  */
749 static int
750 sched_clutch_root_highest_runnable_qos(
751 	sched_clutch_root_t root_clutch,
752 	sched_clutch_highest_root_bucket_type_t type)
753 {
754 	int highest_unbound_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
755 	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
756 		return highest_unbound_bucket;
757 	}
758 	assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
759 	int highest_bound_bucket = bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
760 	if (highest_bound_bucket == -1) {
761 		return highest_unbound_bucket;
762 	}
763 	if (highest_unbound_bucket == -1) {
764 		return highest_bound_bucket;
765 	}
766 	/* Both bound and unbound buckets are runnable, return the higher QoS */
767 	return MIN(highest_bound_bucket, highest_unbound_bucket);
768 }
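/*
 * Example: bucket indices grow as QoS drops (TH_BUCKET_FIXPRI is the lowest index), so
 * if the bound bitmap has TH_BUCKET_SHARE_DF runnable and the unbound bitmap has
 * TH_BUCKET_SHARE_IN runnable, MIN() selects the IN index, i.e. the higher QoS of the two.
 */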
769 
770 /*
771  * sched_clutch_root_highest_aboveui_root_bucket()
772  *
773  * Routine to determine if an above UI root bucket should be selected for execution.
774  *
775  * Returns the root bucket if we should run an above UI bucket or NULL otherwise.
776  */
777 static sched_clutch_root_bucket_t
778 sched_clutch_root_highest_aboveui_root_bucket(
779 	sched_clutch_root_t root_clutch,
780 	sched_clutch_highest_root_bucket_type_t type,
781 	sched_clutch_root_bucket_t _Nullable prev_bucket,
782 	thread_t _Nullable prev_thread,
783 	bool *chose_prev_thread)
784 {
785 	assert((prev_thread == NULL && prev_bucket == NULL) || (prev_thread != NULL && prev_bucket != NULL));
786 	assert((type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL) || (prev_bucket == NULL));
787 
788 	sched_clutch_root_bucket_t highest_bucket = NULL;
789 	int highest_pri = -1;
790 	bool highest_is_aboveui = false;
791 
792 	/* Forward previous thread to the correct comparison logic, based on boundness */
793 	sched_clutch_root_bucket_t bound_prev_bucket = NULL, unbound_prev_bucket = NULL;
794 	thread_t bound_prev_thread = NULL, unbound_prev_thread = NULL;
795 	if (prev_thread != NULL) {
796 		if (prev_bucket->scrb_bound) {
797 			bound_prev_bucket = prev_bucket;
798 			bound_prev_thread = prev_thread;
799 		} else {
800 			unbound_prev_bucket = prev_bucket;
801 			unbound_prev_thread = prev_thread;
802 		}
803 	}
804 
805 	/* Consider bound Above UI vs. Timeshare FG first, so those buckets will win ties against the corresponding unbound buckets */
806 	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL) {
807 		sched_clutch_root_bound_select_aboveui(root_clutch, &highest_bucket, &highest_pri, &highest_is_aboveui, bound_prev_bucket, bound_prev_thread);
808 	}
809 
810 	/* Consider unbound Above UI vs. Timeshare FG */
811 	sched_clutch_root_unbound_select_aboveui(root_clutch, &highest_bucket, &highest_pri, &highest_is_aboveui, unbound_prev_bucket, unbound_prev_thread);
812 	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
813 		return highest_is_aboveui ? highest_bucket : NULL;
814 	}
815 	assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
816 
817 	/* Determine whether we already know to continue running the previous thread */
818 	if (prev_thread != NULL &&
819 	    bitmap_test(highest_bucket->scrb_bound ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap, highest_bucket->scrb_bucket) == false) {
820 		/* Highest bucket we saw is empty, so the previous thread must have been the highest */
821 		assert(highest_bucket == prev_bucket);
822 		*chose_prev_thread = true;
823 	}
824 
825 	return highest_is_aboveui ? highest_bucket : NULL;
826 }
827 
828 /*
829  * sched_clutch_root_highest_root_bucket()
830  *
831  * Main routine to find the highest runnable root level bucket.
832  * This routine is called from performance sensitive contexts; so it is
833  * crucial to keep this O(1). The options parameter determines if
834  * the selection logic should look at unbounded threads only (for
835  * cross-cluster stealing operations) or both bounded and unbounded
836  * threads (for selecting next thread for execution on current cluster).
837  */
838 static sched_clutch_root_bucket_t
839 sched_clutch_root_highest_root_bucket(
840 	sched_clutch_root_t root_clutch,
841 	uint64_t timestamp,
842 	sched_clutch_highest_root_bucket_type_t type,
843 	sched_clutch_root_bucket_t _Nullable prev_bucket,
844 	thread_t _Nullable prev_thread,
845 	bool *chose_prev_thread,
846 	sched_clutch_traverse_mode_t mode,
847 	sched_clutch_dbg_thread_select_packed_t *debug_info)
848 {
849 	assert((prev_thread == NULL && prev_bucket == NULL) || (prev_thread != NULL && prev_bucket != NULL));
850 	assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL || (prev_thread == NULL));
851 	assert(prev_thread == NULL || (mode != SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY));
852 	sched_clutch_hierarchy_locked_assert(root_clutch);
853 
854 	int highest_runnable_bucket = sched_clutch_root_highest_runnable_qos(root_clutch, type);
855 	if (highest_runnable_bucket == -1) {
856 		/*
857 		 * The Clutch hierarchy has no runnable threads. We can continue running
858 		 * whatever was running previously.
859 		 */
860 		assert(sched_clutch_root_count(root_clutch) == 0 || type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY);
861 		*chose_prev_thread = true;
862 		if (prev_thread != NULL) {
863 			debug_info->trace_data.selection_was_edf = true;
864 		}
865 		return prev_bucket;
866 	}
867 
868 	/* Consider Above UI threads, in comparison to Timeshare FG threads */
869 	sched_clutch_root_bucket_t highest_aboveui_bucket = sched_clutch_root_highest_aboveui_root_bucket(root_clutch, type, prev_bucket, prev_thread, chose_prev_thread);
870 	if (highest_aboveui_bucket != NULL) {
871 		debug_info->trace_data.selection_was_edf = true;
872 		return highest_aboveui_bucket;
873 	}
874 
875 	/*
876 	 * Above UI bucket is not runnable or has a low priority runnable thread; use the
877 	 * earliest deadline model to schedule threads. The idea is that as the timeshare
878 	 * buckets use CPU, they will drop their interactivity score/sched priority and
879 	 * allow the low priority AboveUI buckets to be scheduled.
880 	 */
881 
882 	/* Find the earliest deadline bucket */
883 	sched_clutch_root_bucket_t edf_bucket;
884 	bool edf_bucket_enqueued_normally;
885 
886 evaluate_root_buckets:
887 	edf_bucket = NULL;
888 	edf_bucket_enqueued_normally = true;
889 
890 	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
891 		edf_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
892 	} else {
893 		assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
894 		sched_clutch_root_bucket_t unbound_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
895 		sched_clutch_root_bucket_t bound_bucket = priority_queue_min(&root_clutch->scr_bound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
896 		if (bound_bucket && unbound_bucket) {
897 			/* If bound and unbound root buckets are runnable, select the one with the earlier deadline */
898 			edf_bucket = (bound_bucket->scrb_pqlink.deadline <= unbound_bucket->scrb_pqlink.deadline) ? bound_bucket : unbound_bucket;
899 		} else {
900 			edf_bucket = (bound_bucket) ? bound_bucket : unbound_bucket;
901 		}
902 	}
903 	if (edf_bucket == NULL) {
904 		/* The timeshare portion of the runqueue is empty */
905 		assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
906 		assert(prev_thread != NULL);
907 		*chose_prev_thread = true;
908 		if (prev_thread != NULL) {
909 			debug_info->trace_data.selection_was_edf = true;
910 		}
911 		return prev_bucket;
912 	}
913 	if (prev_bucket != NULL && prev_bucket->scrb_pqlink.deadline < edf_bucket->scrb_pqlink.deadline) {
914 		/* The previous thread's root bucket has the earliest deadline and is not currently enqueued */
915 		edf_bucket = prev_bucket;
916 		edf_bucket_enqueued_normally = false;
917 	}
918 
919 	if (edf_bucket->scrb_starvation_avoidance) {
920 		/* Check if the EDF bucket is in an expired starvation avoidance window */
921 		uint64_t starvation_window = sched_clutch_thread_quantum[edf_bucket->scrb_bucket];
922 		if (timestamp >= (edf_bucket->scrb_starvation_ts + starvation_window)) {
923 			/* Starvation avoidance window is over; update deadline and re-evaluate EDF */
924 			edf_bucket->scrb_starvation_avoidance = false;
925 			edf_bucket->scrb_starvation_ts = 0;
926 			sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp, edf_bucket_enqueued_normally);
927 			bit_set(debug_info->trace_data.starvation_avoidance_window_close, edf_bucket->scrb_bound * TH_BUCKET_SCHED_MAX + edf_bucket->scrb_bucket);
928 			goto evaluate_root_buckets;
929 		}
930 	}
931 
932 	/*
933 	 * Check if any of the buckets have warp available. The implementation only allows root buckets to warp ahead of
934 	 * buckets of the same type (i.e. bound/unbound). This is because warping is a concept that only
935 	 * makes sense between root buckets of the same type, since it is effectively a scheduling advantage over a lower
936 	 * QoS root bucket.
937 	 */
938 	bitmap_t *warp_available_bitmap = (edf_bucket->scrb_bound) ? (root_clutch->scr_bound_warp_available) : (root_clutch->scr_unbound_warp_available);
939 	int warp_bucket_index = bitmap_lsb_first(warp_available_bitmap, TH_BUCKET_SCHED_MAX);
940 
941 	/* Allow the prev_bucket to use its warp as well */
942 	bool prev_bucket_warping = (prev_bucket != NULL) && (prev_bucket->scrb_bound == edf_bucket->scrb_bound) &&
943 	    prev_bucket->scrb_bucket < edf_bucket->scrb_bucket && (prev_bucket->scrb_warp_remaining > 0) &&
944 	    (warp_bucket_index == -1 || prev_bucket->scrb_bucket < warp_bucket_index);
945 
946 	bool non_edf_bucket_can_warp = (warp_bucket_index != -1 && warp_bucket_index < edf_bucket->scrb_bucket) || prev_bucket_warping;
947 
948 	if (non_edf_bucket_can_warp == false) {
949 		/* No higher buckets have warp left; best choice is the EDF based bucket */
950 		debug_info->trace_data.selection_was_edf = true;
951 
952 		bool should_update_edf_starvation_state = edf_bucket == prev_bucket || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT;
953 		if (edf_bucket->scrb_starvation_avoidance == false && should_update_edf_starvation_state) {
954 			/* Looks like the EDF bucket is not in starvation avoidance mode; check if it should be */
955 			if (highest_runnable_bucket < edf_bucket->scrb_bucket || (prev_bucket != NULL && prev_bucket->scrb_bucket < edf_bucket->scrb_bucket)) {
956 				/*
957 				 * Since a higher bucket is runnable, it indicates that the EDF bucket should be in starvation avoidance.
958 				 *
959 				 * The starvation avoidance window is allocated as a single quantum for the starved bucket, enforced
960 				 * simultaneously across all CPUs in the cluster. The idea is to grant the starved bucket roughly one
961 				 * quantum per core, each time the bucket reaches the earliest deadline position. Note that this
962 				 * cadence is driven by the difference between the starved bucket's and highest-runnable bucket's WCELs.
963 				 */
964 				edf_bucket->scrb_starvation_avoidance = true;
965 				edf_bucket->scrb_starvation_ts = timestamp;
966 				debug_info->trace_data.selection_opened_starvation_avoidance_window = true;
967 			} else {
968 				/* EDF bucket is being selected in the natural order; update deadline and reset warp */
969 				sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp, edf_bucket_enqueued_normally);
970 				edf_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[edf_bucket->scrb_bucket];
971 				edf_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
972 				if (edf_bucket_enqueued_normally) {
973 					if (edf_bucket->scrb_bound) {
974 						bitmap_set(root_clutch->scr_bound_warp_available, edf_bucket->scrb_bucket);
975 					} else {
976 						bitmap_set(root_clutch->scr_unbound_warp_available, edf_bucket->scrb_bucket);
977 					}
978 				}
979 			}
980 		}
981 		*chose_prev_thread = !edf_bucket_enqueued_normally;
982 		return edf_bucket;
983 	}
984 
985 	/*
986 	 * Looks like there is a root bucket which is higher in the natural priority
987 	 * order than edf_bucket and might have some warp remaining.
988 	 */
989 	assert(prev_bucket_warping || warp_bucket_index >= 0);
990 	sched_clutch_root_bucket_t warp_bucket = NULL;
991 	if (prev_bucket_warping) {
992 		assert(warp_bucket_index == -1 || prev_bucket->scrb_bucket < warp_bucket_index);
993 		warp_bucket = prev_bucket;
994 	} else {
995 		warp_bucket = (edf_bucket->scrb_bound) ? &root_clutch->scr_bound_buckets[warp_bucket_index] : &root_clutch->scr_unbound_buckets[warp_bucket_index];
996 	}
997 
998 	bool warp_is_being_utilized = warp_bucket == prev_bucket || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT;
999 
1000 	if (warp_bucket->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
1001 		if (warp_is_being_utilized) {
1002 			/* Root bucket has not used any of its warp; set a deadline to expire its warp and return it */
1003 			warp_bucket->scrb_warped_deadline = timestamp + warp_bucket->scrb_warp_remaining;
1004 			sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp, !prev_bucket_warping);
1005 			debug_info->trace_data.selection_opened_warp_window = true;
1006 		}
1007 		*chose_prev_thread = prev_bucket_warping;
1008 		debug_info->trace_data.selection_was_edf = false;
1009 		assert(warp_bucket != edf_bucket);
1010 		return warp_bucket;
1011 	}
1012 	if (warp_bucket->scrb_warped_deadline > timestamp) {
1013 		/* Root bucket already has a warp window open with some warp remaining */
1014 		if (warp_is_being_utilized) {
1015 			sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp, !prev_bucket_warping);
1016 		}
1017 		*chose_prev_thread = prev_bucket_warping;
1018 		debug_info->trace_data.selection_was_edf = false;
1019 		return warp_bucket;
1020 	}
1021 
1022 	/*
1023 	 * For this bucket, warp window was opened sometime in the past but has now
1024 	 * expired. Mark the bucket as not available for warp anymore and re-run the
1025 	 * warp bucket selection logic.
1026 	 */
1027 	warp_bucket->scrb_warp_remaining = 0;
1028 	if (!prev_bucket_warping) {
1029 		if (warp_bucket->scrb_bound) {
1030 			bitmap_clear(root_clutch->scr_bound_warp_available, warp_bucket->scrb_bucket);
1031 		} else {
1032 			bitmap_clear(root_clutch->scr_unbound_warp_available, warp_bucket->scrb_bucket);
1033 		}
1034 	}
1035 	bit_set(debug_info->trace_data.warp_window_close, warp_bucket->scrb_bound * TH_BUCKET_SCHED_MAX + warp_bucket->scrb_bucket);
1036 	goto evaluate_root_buckets;
1037 }
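/*
 * Worked example of the starvation avoidance path above (timings illustrative): suppose
 * the UT root bucket reaches the earliest deadline while FG is still runnable. Rather
 * than refreshing UT's deadline, UT is marked scrb_starvation_avoidance with
 * scrb_starvation_ts = now, and it keeps winning EDF for roughly one UT thread quantum
 * (4ms). Once now >= scrb_starvation_ts + 4ms, the window closes, UT's deadline is
 * pushed out by its WCEL, and the root buckets are re-evaluated.
 */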
1038 
1039 static inline bool
1040 sched_clutch_bucket_is_above_timeshare(sched_bucket_t bucket)
1041 {
1042 	return bucket == TH_BUCKET_FIXPRI;
1043 }
1044 
1045 /*
1046  * sched_clutch_root_bucket_deadline_calculate()
1047  *
1048  * Calculate the deadline for the bucket based on its WCEL
1049  */
1050 static uint64_t
1051 sched_clutch_root_bucket_deadline_calculate(
1052 	sched_clutch_root_bucket_t root_bucket,
1053 	uint64_t timestamp)
1054 {
1055 	/* For fixpri AboveUI bucket always return it as the earliest deadline */
1056 	/* For the fixpri AboveUI bucket, always return the earliest possible deadline (0) */
1057 		return 0;
1058 	}
1059 
1060 	/* For all timeshare buckets set the deadline as current time + worst-case-execution-latency */
1061 	return timestamp + sched_clutch_root_bucket_wcel[root_bucket->scrb_bucket];
1062 }
1063 
1064 /*
1065  * sched_clutch_root_bucket_deadline_update()
1066  *
1067  * Routine to update the deadline of the root bucket when it is selected.
1068  * Updating the deadline also moves the root_bucket in the EDF priority
1069  * queue.
1070  */
1071 static void
1072 sched_clutch_root_bucket_deadline_update(
1073 	sched_clutch_root_bucket_t root_bucket,
1074 	sched_clutch_root_t root_clutch,
1075 	uint64_t timestamp,
1076 	bool bucket_is_enqueued)
1077 {
1078 	if (sched_clutch_bucket_is_above_timeshare(root_bucket->scrb_bucket)) {
1079 		/* The algorithm never uses the deadlines for scheduling TH_BUCKET_FIXPRI bucket */
1080 		return;
1081 	}
1082 
1083 	uint64_t old_deadline = root_bucket->scrb_pqlink.deadline;
1084 	uint64_t new_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
1085 	if (__improbable(old_deadline > new_deadline)) {
1086 		panic("old_deadline (%llu) > new_deadline (%llu); root_bucket (%d); timestamp (%llu)", old_deadline, new_deadline, root_bucket->scrb_bucket, timestamp);
1087 	}
1088 	if (old_deadline != new_deadline) {
1089 		root_bucket->scrb_pqlink.deadline = new_deadline;
1090 		if (bucket_is_enqueued) {
1091 			struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
1092 			priority_queue_entry_increased(prioq, &root_bucket->scrb_pqlink);
1093 		}
1094 	}
1095 }
1096 
1097 /*
1098  * sched_clutch_root_bucket_runnable()
1099  *
1100  * Routine to insert a newly runnable root bucket into the hierarchy.
1101  * Also updates the deadline and warp parameters as necessary.
1102  */
1103 static void
1104 sched_clutch_root_bucket_runnable(
1105 	sched_clutch_root_bucket_t root_bucket,
1106 	sched_clutch_root_t root_clutch,
1107 	uint64_t timestamp)
1108 {
1109 	/* Mark the root bucket as runnable */
1110 	bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
1111 	bitmap_set(runnable_bitmap, root_bucket->scrb_bucket);
1112 
1113 	if (sched_clutch_bucket_is_above_timeshare(root_bucket->scrb_bucket)) {
1114 		/* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
1115 		return;
1116 	}
1117 
1118 	if (root_bucket->scrb_starvation_avoidance == false) {
1119 		/*
1120 		 * Only update the deadline if the bucket was not in starvation avoidance mode. If the bucket was in
1121 		 * starvation avoidance and its window has expired, the highest root bucket selection logic will notice
1122 		 * that and fix it up.
1123 		 */
1124 		root_bucket->scrb_pqlink.deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
1125 	}
1126 	struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
1127 	priority_queue_insert(prioq, &root_bucket->scrb_pqlink);
1128 	if (root_bucket->scrb_warp_remaining) {
1129 		/* Since the bucket has some warp remaining and it is now runnable, mark it as available for warp */
1130 		bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
1131 		bitmap_set(warp_bitmap, root_bucket->scrb_bucket);
1132 	}
1133 }
1134 
1135 /*
1136  * sched_clutch_root_bucket_empty()
1137  *
1138  * Routine to remove an empty root bucket from the hierarchy.
1139  * Also updates the deadline and warp parameters as necessary.
1140  */
1141 static void
1142 sched_clutch_root_bucket_empty(
1143 	sched_clutch_root_bucket_t root_bucket,
1144 	sched_clutch_root_t root_clutch,
1145 	uint64_t timestamp)
1146 {
1147 	bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
1148 	bitmap_clear(runnable_bitmap, root_bucket->scrb_bucket);
1149 
1150 	if (sched_clutch_bucket_is_above_timeshare(root_bucket->scrb_bucket)) {
1151 		/* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
1152 		return;
1153 	}
1154 
1155 	struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
1156 	priority_queue_remove(prioq, &root_bucket->scrb_pqlink);
1157 
1158 	bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
1159 	bitmap_clear(warp_bitmap, root_bucket->scrb_bucket);
1160 
1161 	if (root_bucket->scrb_warped_deadline != SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
1162 		if (root_bucket->scrb_warped_deadline > timestamp) {
1163 			/*
1164 			 * For root buckets that were using the warp, check if the warp
1165 			 * deadline is in the future. If yes, remove the wall time the
1166 			 * warp was active and update the warp remaining. This allows
1167 			 * the root bucket to use the remaining warp the next time it
1168 			 * becomes runnable.
1169 			 */
1170 			root_bucket->scrb_warp_remaining = root_bucket->scrb_warped_deadline - timestamp;
1171 		} else {
1172 			/*
1173 			 * If the root bucket's warped deadline is in the past, it has used up
1174 			 * all the warp it was assigned. Empty out its warp remaining.
1175 			 */
1176 			root_bucket->scrb_warp_remaining = 0;
1177 		}
1178 	}
1179 }
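/*
 * Worked example for the warp accounting above (numbers illustrative): an IN root
 * bucket opens its 4ms warp window at time T (scrb_warped_deadline = T + 4ms) and goes
 * empty at T + 1ms; the unused 3ms is preserved in scrb_warp_remaining for the next
 * time the bucket becomes runnable. Had it gone empty after T + 4ms, the remaining
 * warp would be zeroed instead.
 */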
1180 
1181 static int
1182 sched_clutch_global_bucket_load_get(
1183 	sched_bucket_t bucket)
1184 {
1185 	return (int)os_atomic_load(&sched_clutch_global_bucket_load[bucket], relaxed);
1186 }
1187 
1188 /*
1189  * sched_clutch_root_pri_update()
1190  *
1191  * The root level priority is used for thread selection and preemption
1192  * logic.
1193  *
1194  * The logic uses the same decision as thread selection for deciding between the
1195  * above UI and timeshare buckets. If one of the timesharing buckets has to be
1196  * used for priority calculation, the logic is slightly different from thread
1197  * selection, because thread selection considers deadlines, warps etc. to
1198  * decide the optimal bucket at a given timestamp. Since the priority
1199  * value is used for preemption decisions only, it needs to be based on the
1200  * highest runnable thread available in the timeshare domain. This logic can
1201  * be made more sophisticated if there are cases of unnecessary preemption
1202  * being seen in workloads.
1203  */
1204 static void
1205 sched_clutch_root_pri_update(
1206 	sched_clutch_root_t root_clutch)
1207 {
1208 	sched_clutch_hierarchy_locked_assert(root_clutch);
1209 	int16_t root_bound_pri = NOPRI;
1210 	int16_t root_unbound_pri = NOPRI;
1211 
1212 	/* Consider bound root buckets */
1213 	if (bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
1214 		goto root_pri_update_unbound;
1215 	}
1216 	sched_clutch_root_bucket_t highest_bound_root_bucket = NULL;
1217 	__unused int highest_bound_root_bucket_pri = -1;
1218 	bool highest_bound_root_bucket_is_fixpri = false;
1219 	sched_clutch_root_bound_select_aboveui(root_clutch, &highest_bound_root_bucket, &highest_bound_root_bucket_pri, &highest_bound_root_bucket_is_fixpri, NULL, NULL);
1220 	if (highest_bound_root_bucket_is_fixpri == false) {
1221 		int root_bucket_index = bitmap_lsb_next(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
1222 		assert(root_bucket_index != -1);
1223 		highest_bound_root_bucket = &root_clutch->scr_bound_buckets[root_bucket_index];
1224 	}
1225 	root_bound_pri = highest_bound_root_bucket->scrb_bound_thread_runq.highq;
1226 
1227 root_pri_update_unbound:
1228 	/* Consider unbound root buckets */
1229 	if (bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
1230 		goto root_pri_update_complete;
1231 	}
1232 	sched_clutch_root_bucket_t highest_unbound_root_bucket = NULL;
1233 	__unused int highest_unbound_root_bucket_pri = -1;
1234 	bool highest_unbound_root_bucket_is_fixpri = false;
1235 	sched_clutch_root_unbound_select_aboveui(root_clutch, &highest_unbound_root_bucket, &highest_unbound_root_bucket_pri, &highest_unbound_root_bucket_is_fixpri, NULL, NULL);
1236 	if (highest_unbound_root_bucket_is_fixpri == false) {
1237 		int root_bucket_index = bitmap_lsb_next(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
1238 		assert(root_bucket_index != -1);
1239 		highest_unbound_root_bucket = &root_clutch->scr_unbound_buckets[root_bucket_index];
1240 	}
1241 
1242 	/* For the selected root bucket, find the highest priority clutch bucket */
1243 	sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, highest_unbound_root_bucket, NULL, NULL, NULL);
1244 	root_unbound_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
1245 
1246 root_pri_update_complete:
1247 	root_clutch->scr_priority = MAX(root_bound_pri, root_unbound_pri);
1248 }
1249 
1250 /*
1251  * sched_clutch_root_urgency_inc()
1252  *
1253  * Routine to increment the urgency at the root level based on the thread
1254  * priority that is being inserted into the hierarchy. The root urgency
1255  * counter is updated based on the urgency of threads in any of the
1256  * clutch buckets which are part of the hierarchy.
1257  *
1258  * Always called with the pset lock held.
1259  */
1260 static void
1261 sched_clutch_root_urgency_inc(
1262 	sched_clutch_root_t root_clutch,
1263 	thread_t thread)
1264 {
1265 	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
1266 		root_clutch->scr_urgency++;
1267 	}
1268 }
1269 
1270 /*
1271  * sched_clutch_root_urgency_dec()
1272  *
1273  * Routine to decrement the urgency at the root level based on the thread
1274  * priority that is being removed from the hierarchy. The root urgency
1275  * counter is updated based on the urgency of threads in any of the
1276  * clutch buckets which are part of the hierarchy.
1277  *
1278  * Always called with the pset lock held.
1279  */
1280 static void
1281 sched_clutch_root_urgency_dec(
1282 	sched_clutch_root_t root_clutch,
1283 	thread_t thread)
1284 {
1285 	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
1286 		root_clutch->scr_urgency--;
1287 	}
1288 }
1289 
1290 /*
1291  * Clutch bucket level scheduling
1292  *
1293  * The second level of scheduling is the clutch bucket level scheduling
1294  * which tries to schedule thread groups within root_buckets. Each
1295  * clutch represents a thread group and a clutch_bucket_group represents
1296  * threads at a particular sched_bucket within that thread group. The
1297  * clutch_bucket_group contains a clutch_bucket per cluster on the system
1298  * where it holds the runnable threads destined for execution on that
1299  * cluster.
1300  *
1301  * The goal of this level of scheduling is to allow interactive thread
1302  * groups low latency access to the CPU. It also provides slight
1303  * scheduling preference for App and unrestricted thread groups.
1304  *
1305  * The clutch bucket scheduling algorithm measures an interactivity
1306  * score for all clutch bucket groups. The interactivity score is based
1307  * on the ratio of the CPU used and the voluntary blocking of threads
1308  * within the clutch bucket group. The algorithm is very close to the ULE
1309  * scheduler on FreeBSD in terms of calculations. The interactivity
1310  * score provides an interactivity boost in the range of
1311  * [0:SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI * 2] which allows interactive
1312  * thread groups to win over CPU spinners.
1313  *
1314  * The interactivity score of the clutch bucket group is combined with the
1315  * highest base/promoted priority of threads in the clutch bucket to form
1316  * the overall priority of the clutch bucket.
1317  */
1318 
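
/*
 * For example, with the default interactivity priority of 8 (see
 * SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT below), the boost ranges
 * from roughly 0 for a purely CPU-bound thread group to 16 for one that is
 * almost always voluntarily blocked; the exact mapping from CPU used/blocked
 * time to the score is implemented in sched_clutch_interactivity_from_cpu_data().
 */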
1319 /* Priority boost range for interactivity */
1320 #define SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT     (8)
1321 static uint8_t sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
1322 
1323 /* Window to scale the CPU usage and blocked values (currently 500ms); it's the threshold on used + blocked */
1324 static uint64_t sched_clutch_bucket_group_adjust_threshold = 0;
1325 #define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS      (500000)
1326 
1327 /* The ratio to scale the cpu/blocked time per window */
1328 #define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO                (10)
1329 
1330 /* Initial value for voluntary blocking time for the clutch_bucket */
1331 #define SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID          (uint64_t)(~0)
1332 
1333 /* Value indicating the clutch bucket is not pending execution */
1334 #define SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID             ((uint64_t)(~0))
1335 
1336 /*
1337  * Thread group CPU starvation avoidance
1338  *
1339  * In heavily CPU contended scenarios, it is possible that some thread groups
1340  * which have a low interactivity score do not get CPU time at all. In order to
1341  * resolve that, the scheduler tries to ageout the CPU usage of the clutch
1342  * bucket group when it has been pending execution for a certain time as defined
1343  * by the sched_clutch_bucket_group_pending_delta_us values below.
1344  *
1345  * The values chosen here are very close to the WCEL values for each sched bucket.
1346  * These values are added into the pending interval used to determine how
1347  * frequently we will ageout the CPU usage, ensuring a reasonable limit on the
1348  * frequency.
1349  */
1350 static uint32_t sched_clutch_bucket_group_pending_delta_us[TH_BUCKET_SCHED_MAX] = {
1351 	SCHED_CLUTCH_INVALID_TIME_32,           /* FIXPRI */
1352 	10000,                                  /* FG */
1353 	37500,                                  /* IN */
1354 	75000,                                  /* DF */
1355 	150000,                                 /* UT */
1356 	250000,                                 /* BG */
1357 };
1358 static uint64_t sched_clutch_bucket_group_pending_delta[TH_BUCKET_SCHED_MAX] = {0};
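
/*
 * For illustration (approximate behavior): with these defaults, a UT clutch
 * bucket group that stays runnable without being selected accrues roughly one
 * ageout interval per 150ms of pending time; per
 * sched_clutch_bucket_group_cpu_pending_adjust() below, each interval improves
 * the group's interactivity score by about one point, gradually pulling a
 * starved group back into contention.
 */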
1359 
1360 /*
1361  * sched_clutch_bucket_init()
1362  *
1363  * Initializer for clutch buckets.
1364  */
1365 static void
1366 sched_clutch_bucket_init(
1367 	sched_clutch_bucket_t clutch_bucket,
1368 	sched_clutch_bucket_group_t clutch_bucket_group,
1369 	sched_bucket_t bucket)
1370 {
1371 	clutch_bucket->scb_bucket = bucket;
1372 	/* scb_priority will be recalculated when a thread is inserted in the clutch bucket */
1373 	clutch_bucket->scb_priority = 0;
1374 #if CONFIG_SCHED_EDGE
1375 	clutch_bucket->scb_foreign = false;
1376 	priority_queue_entry_init(&clutch_bucket->scb_foreignlink);
1377 #endif /* CONFIG_SCHED_EDGE */
1378 	clutch_bucket->scb_group = clutch_bucket_group;
1379 	clutch_bucket->scb_root = NULL;
1380 	priority_queue_init(&clutch_bucket->scb_clutchpri_prioq);
1381 	priority_queue_init(&clutch_bucket->scb_thread_runq);
1382 	queue_init(&clutch_bucket->scb_thread_timeshare_queue);
1383 }
1384 
1385 /*
1386  * sched_clutch_bucket_group_init()
1387  *
1388  * Initializer for clutch bucket groups.
1389  */
1390 static void
1391 sched_clutch_bucket_group_init(
1392 	sched_clutch_bucket_group_t clutch_bucket_group,
1393 	sched_clutch_t clutch,
1394 	sched_bucket_t bucket)
1395 {
1396 	bzero(clutch_bucket_group, sizeof(struct sched_clutch_bucket_group));
1397 	clutch_bucket_group->scbg_bucket = bucket;
1398 	clutch_bucket_group->scbg_clutch = clutch;
1399 
1400 	int max_clusters = ml_get_cluster_count();
1401 	clutch_bucket_group->scbg_clutch_buckets = kalloc_type(struct sched_clutch_bucket, max_clusters, Z_WAITOK | Z_ZERO);
1402 	for (int i = 0; i < max_clusters; i++) {
1403 		sched_clutch_bucket_init(&clutch_bucket_group->scbg_clutch_buckets[i], clutch_bucket_group, bucket);
1404 	}
1405 
1406 	os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, 0, relaxed);
1407 	os_atomic_store(&clutch_bucket_group->scbg_pri_shift, INT8_MAX, relaxed);
1408 	os_atomic_store(&clutch_bucket_group->scbg_preferred_cluster, pset0.pset_cluster_id, relaxed);
1409 	/*
1410 	 * All thread groups should be initialized to be interactive; this allows the newly launched
1411 	 * thread groups to fairly compete with already running thread groups.
1412 	 */
1413 	clutch_bucket_group->scbg_interactivity_data.scct_count = (sched_clutch_bucket_group_interactive_pri * 2);
1414 	clutch_bucket_group->scbg_interactivity_data.scct_timestamp = 0;
1415 	os_atomic_store(&clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked, (clutch_cpu_data_t)sched_clutch_bucket_group_adjust_threshold, relaxed);
1416 	clutch_bucket_group->scbg_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
1417 	clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
1418 }
1419 
1420 static void
1421 sched_clutch_bucket_group_destroy(
1422 	sched_clutch_bucket_group_t clutch_bucket_group)
1423 {
1424 	kfree_type(struct sched_clutch_bucket, ml_get_cluster_count(),
1425 	    clutch_bucket_group->scbg_clutch_buckets);
1426 }
1427 
1428 /*
1429  * sched_clutch_init_with_thread_group()
1430  *
1431  * Initialize the sched_clutch when the thread group is being created
1432  */
1433 void
1434 sched_clutch_init_with_thread_group(
1435 	sched_clutch_t clutch,
1436 	struct thread_group *tg)
1437 {
1438 	os_atomic_store(&clutch->sc_thr_count, 0, relaxed);
1439 
1440 	/* Initialize all the clutch buckets */
1441 	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
1442 		sched_clutch_bucket_group_init(&(clutch->sc_clutch_groups[i]), clutch, i);
1443 	}
1444 
1445 	/* Grouping specific fields */
1446 	clutch->sc_tg = tg;
1447 }
1448 
1449 /*
1450  * sched_clutch_destroy()
1451  *
1452  * Destructor for clutch; called from thread group release code.
1453  */
1454 void
1455 sched_clutch_destroy(
1456 	sched_clutch_t clutch)
1457 {
1458 	assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0);
1459 	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
1460 		sched_clutch_bucket_group_destroy(&(clutch->sc_clutch_groups[i]));
1461 	}
1462 }
1463 
1464 #if CONFIG_SCHED_EDGE
1465 
1466 /*
1467  * Edge Scheduler Preferred Cluster Mechanism
1468  *
1469  * In order to have better control over various QoS buckets within a thread group, the Edge
1470  * scheduler allows CLPC to specify a preferred cluster for each QoS level in a TG. These
1471  * preferences are stored at the sched_clutch_bucket_group level since that represents all
1472  * threads at a particular QoS level within a sched_clutch. For any lookup of preferred
1473  * cluster, the logic always goes back to the preference stored at the clutch_bucket_group.
1474  */
1475 
1476 static uint32_t
1477 sched_edge_clutch_bucket_group_preferred_cluster(sched_clutch_bucket_group_t clutch_bucket_group)
1478 {
1479 	return os_atomic_load(&clutch_bucket_group->scbg_preferred_cluster, relaxed);
1480 }
1481 
1482 static uint32_t
1483 sched_clutch_bucket_preferred_cluster(sched_clutch_bucket_t clutch_bucket)
1484 {
1485 	return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket->scb_group);
1486 }
1487 
1488 uint32_t
1489 sched_edge_thread_preferred_cluster(thread_t thread)
1490 {
1491 	if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1492 		/* For threads bound to a specific cluster, return the bound cluster id */
1493 		return sched_edge_thread_bound_cluster_id(thread);
1494 	}
1495 
1496 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
1497 	sched_bucket_t sched_bucket = thread->th_sched_bucket;
1498 	if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
1499 		sched_bucket = sched_clutch_thread_bucket_map(thread, thread->base_pri);
1500 	}
1501 	sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[sched_bucket];
1502 	return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket_group);
1503 }
1504 
1505 /*
1506  * Edge Scheduler Foreign Bucket Support
1507  *
1508  * In the Edge Scheduler, each cluster maintains a priority queue of clutch buckets containing
1509  * threads that are not native to the cluster. A clutch bucket is considered native if its
1510  * preferred cluster has the same type as the cluster its enqueued in. The foreign clutch
1511  * bucket priority queue is used for rebalance operations to get threads back to their native
1512  * cluster quickly.
1513  *
1514  * It is possible to make this policy even more aggressive by considering all clusters that
1515  * are not the preferred cluster as the foreign cluster, but that would mean a lot of thread
1516  * migrations which might have performance implications.
1517  */
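
/*
 * For example, a clutch bucket whose preferred cluster is a P-cluster but
 * which is currently enqueued in an E-cluster hierarchy is marked foreign and
 * therefore becomes a candidate for a rebalance back to a P-cluster; the same
 * bucket enqueued on any P-cluster remains native even if that cluster is not
 * its preferred one.
 */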
1518 
1519 static void
1520 sched_clutch_bucket_mark_native(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
1521 {
1522 	if (clutch_bucket->scb_foreign) {
1523 		clutch_bucket->scb_foreign = false;
1524 		priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1525 	}
1526 }
1527 
1528 static void
1529 sched_clutch_bucket_mark_foreign(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
1530 {
1531 	if (!clutch_bucket->scb_foreign) {
1532 		clutch_bucket->scb_foreign = true;
1533 		priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
1534 		priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1535 	}
1536 }
1537 
1538 /*
1539  * Edge Scheduler Cumulative Load Average
1540  *
1541  * The Edge scheduler maintains a per-QoS/scheduling bucket load average for
1542  * making thread migration decisions. The per-bucket load is maintained as a
1543  * cumulative count since higher scheduling buckets impact load on lower buckets
1544  * for thread migration decisions.
1545  *
1546  */
1547 
1548 static void
1549 sched_edge_cluster_cumulative_count_incr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1550 {
1551 	switch (bucket) {
1552 	case TH_BUCKET_FIXPRI:    os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1553 	case TH_BUCKET_SHARE_FG:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1554 	case TH_BUCKET_SHARE_IN:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1555 	case TH_BUCKET_SHARE_DF:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1556 	case TH_BUCKET_SHARE_UT:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1557 	case TH_BUCKET_SHARE_BG:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1558 	default:
1559 		panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_incr()");
1560 	}
1561 }
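
/*
 * Note on the deliberate fallthrough above: the counter for a given bucket
 * also reflects runnable threads in all higher buckets. E.g. making a
 * TH_BUCKET_SHARE_IN thread runnable bumps the IN, DF, UT and BG counters, so
 * scr_cumulative_run_count[TH_BUCKET_SHARE_UT] reads as "runnable threads at
 * UT or above".
 */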
1562 
1563 static void
1564 sched_edge_cluster_cumulative_count_decr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1565 {
1566 	switch (bucket) {
1567 	case TH_BUCKET_FIXPRI:    os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1568 	case TH_BUCKET_SHARE_FG:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1569 	case TH_BUCKET_SHARE_IN:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1570 	case TH_BUCKET_SHARE_DF:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1571 	case TH_BUCKET_SHARE_UT:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1572 	case TH_BUCKET_SHARE_BG:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1573 	default:
1574 		panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_decr()");
1575 	}
1576 }
1577 
1578 uint16_t
1579 sched_edge_cluster_cumulative_count(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1580 {
1581 	return os_atomic_load(&root_clutch->scr_cumulative_run_count[bucket], relaxed);
1582 }
1583 
1584 #endif /* CONFIG_SCHED_EDGE */
1585 
1586 /*
1587  * sched_clutch_bucket_hierarchy_insert()
1588  *
1589  * Routine to insert a newly runnable clutch_bucket into the root hierarchy.
1590  */
1591 static void
1592 sched_clutch_bucket_hierarchy_insert(
1593 	sched_clutch_root_t root_clutch,
1594 	sched_clutch_bucket_t clutch_bucket,
1595 	sched_bucket_t bucket,
1596 	uint64_t timestamp,
1597 	sched_clutch_bucket_options_t options)
1598 {
1599 	sched_clutch_hierarchy_locked_assert(root_clutch);
1600 	if (sched_clutch_bucket_is_above_timeshare(bucket) == false) {
1601 		/* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */
1602 		enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink);
1603 	}
1604 #if CONFIG_SCHED_EDGE
1605 	/* Check if the bucket is a foreign clutch bucket and add it to the foreign buckets list */
1606 	uint32_t preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
1607 	if (pset_type_for_id(preferred_cluster) != pset_type_for_id(root_clutch->scr_cluster_id)) {
1608 		sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
1609 	}
1610 #endif /* CONFIG_SCHED_EDGE */
1611 	sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1612 
1613 	/* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */
1614 	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1615 		sched_clutch_root_bucket_runnable(root_bucket, root_clutch, timestamp);
1616 	}
1617 
1618 	/* Insert the clutch bucket into the root bucket run queue with order based on options */
1619 	sched_clutch_bucket_runq_enqueue(&root_bucket->scrb_clutch_buckets, clutch_bucket, options);
1620 	os_atomic_store(&clutch_bucket->scb_root, root_clutch, relaxed);
1621 	os_atomic_inc(&sched_clutch_global_bucket_load[bucket], relaxed);
1622 }
1623 
1624 /*
1625  * sched_clutch_bucket_hierarchy_remove()
1626  *
1627  * Routine to remove an empty clutch bucket from the root hierarchy.
1628  */
1629 static void
1630 sched_clutch_bucket_hierarchy_remove(
1631 	sched_clutch_root_t root_clutch,
1632 	sched_clutch_bucket_t clutch_bucket,
1633 	sched_bucket_t bucket,
1634 	uint64_t timestamp,
1635 	__unused sched_clutch_bucket_options_t options)
1636 {
1637 	sched_clutch_hierarchy_locked_assert(root_clutch);
1638 	if (sched_clutch_bucket_is_above_timeshare(bucket) == false) {
1639 		/* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */
1640 		remqueue(&clutch_bucket->scb_listlink);
1641 	}
1642 #if CONFIG_SCHED_EDGE
1643 	sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
1644 #endif /* CONFIG_SCHED_EDGE */
1645 
1646 	sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1647 
1648 	/* Remove the clutch bucket from the root bucket priority queue */
1649 	sched_clutch_bucket_runq_remove(&root_bucket->scrb_clutch_buckets, clutch_bucket);
1650 	os_atomic_store(&clutch_bucket->scb_root, NULL, relaxed);
1651 
1652 	/* If the root bucket priority queue is now empty, remove it from the root priority queue */
1653 	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1654 		sched_clutch_root_bucket_empty(root_bucket, root_clutch, timestamp);
1655 	}
1656 	os_atomic_dec(&sched_clutch_global_bucket_load[bucket], relaxed);
1657 }
1658 
1659 /*
1660  * sched_clutch_bucket_base_pri()
1661  *
1662  * Calculates the "base" priority of the clutch bucket, which is equal to the max of the
1663  * highest base_pri and the highest sched_pri in the clutch bucket.
1664  */
1665 static uint8_t
1666 sched_clutch_bucket_base_pri(
1667 	sched_clutch_bucket_t clutch_bucket)
1668 {
1669 	assert(priority_queue_empty(&clutch_bucket->scb_thread_runq) == false);
1670 	/*
1671 	 * Since the clutch bucket can contain threads that are members of the group due
1672 	 * to the sched_pri being promoted or due to their base pri, the base priority of
1673 	 * the entire clutch bucket should be based on the highest thread (promoted or base)
1674 	 * in the clutch bucket.
1675 	 */
1676 	uint8_t max_pri = 0;
1677 	if (!priority_queue_empty(&clutch_bucket->scb_clutchpri_prioq)) {
1678 		max_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
1679 	}
1680 	return max_pri;
1681 }
1682 
1683 /*
1684  * sched_clutch_interactivity_from_cpu_data()
1685  *
1686  * Routine to calculate the interactivity score of a clutch bucket group from its CPU usage
1687  */
1688 static uint8_t
1689 sched_clutch_interactivity_from_cpu_data(sched_clutch_bucket_group_t clutch_bucket_group)
1690 {
1691 	sched_clutch_bucket_cpu_data_t scb_cpu_data;
1692 	scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, relaxed);
1693 	clutch_cpu_data_t cpu_used = scb_cpu_data.cpu_data.scbcd_cpu_used;
1694 	clutch_cpu_data_t cpu_blocked = scb_cpu_data.cpu_data.scbcd_cpu_blocked;
1695 	uint8_t interactive_score = 0;
1696 
1697 	if ((cpu_blocked == 0) && (cpu_used == 0)) {
1698 		return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
1699 	}
1700 	/*
1701 	 * For all timeshare buckets, calculate the interactivity score of the bucket
1702 	 * and add it to the base priority
1703 	 */
1704 	if (cpu_blocked > cpu_used) {
1705 		/* Interactive clutch_bucket case */
1706 		interactive_score = sched_clutch_bucket_group_interactive_pri +
1707 		    ((sched_clutch_bucket_group_interactive_pri * (cpu_blocked - cpu_used)) / cpu_blocked);
1708 	} else {
1709 		/* Non-interactive clutch_bucket case */
1710 		interactive_score = ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) / cpu_used);
1711 	}
1712 	return interactive_score;
1713 }
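
/*
 * Worked example (illustrative numbers): with
 * sched_clutch_bucket_group_interactive_pri at its default of 8, a group with
 * cpu_blocked = 300 and cpu_used = 100 (in units of the adjust window) scores
 * 8 + (8 * (300 - 100)) / 300 = 13, while a group with cpu_used = 300 and
 * cpu_blocked = 100 scores (8 * 100) / 300 = 2.
 */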
1714 
1715 /*
1716  * sched_clutch_bucket_pri_calculate()
1717  *
1718  * The priority calculation algorithm for the clutch_bucket is a slight
1719  * modification on the ULE interactivity score. It uses the base priority
1720  * of the clutch bucket and applies an interactivity score boost to the
1721  * highly responsive clutch buckets.
1722  */
1723 static uint8_t
1724 sched_clutch_bucket_pri_calculate(
1725 	sched_clutch_bucket_t clutch_bucket,
1726 	uint64_t timestamp)
1727 {
1728 	/* For empty clutch buckets, return priority 0 */
1729 	if (clutch_bucket->scb_thr_count == 0) {
1730 		return 0;
1731 	}
1732 
1733 	uint8_t base_pri = sched_clutch_bucket_base_pri(clutch_bucket);
1734 	uint8_t interactive_score = sched_clutch_bucket_group_interactivity_score_calculate(clutch_bucket->scb_group, timestamp);
1735 
1736 	assert(((uint64_t)base_pri + interactive_score) <= UINT8_MAX);
1737 	uint8_t pri = base_pri + interactive_score;
1738 	if (pri != clutch_bucket->scb_priority) {
1739 		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_PRI) | DBG_FUNC_NONE,
1740 		    thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, pri, interactive_score, 0);
1741 	}
1742 	return pri;
1743 }
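
/*
 * E.g. (hypothetical values): a clutch bucket whose highest thread has a
 * base/promoted priority of 31 and whose group currently has an interactivity
 * score of 13 is ordered in its root bucket runq at priority 44; the
 * tracepoint above fires only when that combined value actually changes.
 */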
1744 
1745 /*
1746  * sched_clutch_root_bucket_highest_clutch_bucket()
1747  *
1748  * Routine to find the highest priority clutch bucket
1749  * within the root bucket.
1750  */
1751 static sched_clutch_bucket_t
1752 sched_clutch_root_bucket_highest_clutch_bucket(
1753 	sched_clutch_root_t root_clutch,
1754 	sched_clutch_root_bucket_t root_bucket,
1755 	processor_t _Nullable processor,
1756 	thread_t _Nullable prev_thread,
1757 	bool *_Nullable chose_prev_thread)
1758 {
1759 	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1760 		if (prev_thread != NULL) {
1761 			*chose_prev_thread = true;
1762 			return sched_clutch_bucket_for_thread(root_clutch, prev_thread);
1763 		}
1764 		return NULL;
1765 	}
1766 	sched_clutch_bucket_t clutch_bucket = sched_clutch_bucket_runq_peek(&root_bucket->scrb_clutch_buckets);
1767 	/* Consider the Clutch bucket of the previous thread */
1768 	if (prev_thread != NULL) {
1769 		assert(chose_prev_thread != NULL);
1770 		sched_clutch_bucket_group_t prev_clutch_bucket_group = sched_clutch_bucket_group_for_thread(prev_thread);
1771 		int prev_clutch_bucket_pri = prev_thread->sched_pri + (int)(os_atomic_load(&prev_clutch_bucket_group->scbg_interactivity_data.scct_count, relaxed));
1772 		sched_clutch_bucket_t prev_clutch_bucket = sched_clutch_bucket_for_thread(root_clutch, prev_thread);
1773 		if (prev_clutch_bucket != clutch_bucket &&
1774 		    sched_clutch_pri_greater_than_tiebreak(prev_clutch_bucket_pri, clutch_bucket->scb_priority, processor->first_timeslice)) {
1775 			*chose_prev_thread = true;
1776 			return prev_clutch_bucket;
1777 		}
1778 	}
1779 	return clutch_bucket;
1780 }
1781 
1782 /*
1783  * sched_clutch_bucket_runnable()
1784  *
1785  * Perform all operations needed when a new clutch bucket becomes runnable.
1786  * It involves inserting the clutch_bucket into the hierarchy and updating the
1787  * root priority appropriately.
1788  */
1789 static boolean_t
1790 sched_clutch_bucket_runnable(
1791 	sched_clutch_bucket_t clutch_bucket,
1792 	sched_clutch_root_t root_clutch,
1793 	uint64_t timestamp,
1794 	sched_clutch_bucket_options_t options)
1795 {
1796 	sched_clutch_hierarchy_locked_assert(root_clutch);
1797 	/* Since the clutch bucket became newly runnable, update its pending timestamp */
1798 	clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1799 	sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1800 
1801 	/* Update the timesharing properties of this clutch_bucket_group; also done every sched_tick */
1802 	sched_clutch_bucket_group_pri_shift_update(clutch_bucket->scb_group);
1803 
1804 	int16_t root_old_pri = root_clutch->scr_priority;
1805 	sched_clutch_root_pri_update(root_clutch);
1806 	return root_clutch->scr_priority > root_old_pri;
1807 }
1808 
1809 /*
1810  * sched_clutch_bucket_update()
1811  *
1812  * Update the clutch_bucket's position in the hierarchy. This routine is
1813  * called when a new thread is inserted or removed from a runnable clutch
1814  * bucket. The options specify some properties about the clutch bucket
1815  * insertion order into the clutch bucket runq.
1816  */
1817 static boolean_t
1818 sched_clutch_bucket_update(
1819 	sched_clutch_bucket_t clutch_bucket,
1820 	sched_clutch_root_t root_clutch,
1821 	uint64_t timestamp,
1822 	sched_clutch_bucket_options_t options)
1823 {
1824 	sched_clutch_hierarchy_locked_assert(root_clutch);
1825 	uint64_t new_pri = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1826 	sched_clutch_bucket_runq_t bucket_runq = &root_clutch->scr_unbound_buckets[clutch_bucket->scb_bucket].scrb_clutch_buckets;
1827 	if (new_pri == clutch_bucket->scb_priority) {
1828 		/*
1829 		 * If SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR is specified, move the clutch bucket
1830 		 * to the end of the runq. Typically used when a thread is selected for execution
1831 		 * from a clutch bucket.
1832 		 */
1833 		if (options & SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR) {
1834 			sched_clutch_bucket_runq_rotate(bucket_runq, clutch_bucket);
1835 		}
1836 		return false;
1837 	}
1838 	sched_clutch_bucket_runq_remove(bucket_runq, clutch_bucket);
1839 #if CONFIG_SCHED_EDGE
1840 	if (clutch_bucket->scb_foreign) {
1841 		priority_queue_remove(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1842 	}
1843 #endif /* CONFIG_SCHED_EDGE */
1844 	clutch_bucket->scb_priority = new_pri;
1845 #if CONFIG_SCHED_EDGE
1846 	if (clutch_bucket->scb_foreign) {
1847 		priority_queue_entry_set_sched_pri(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink, clutch_bucket->scb_priority, 0);
1848 		priority_queue_insert(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink);
1849 	}
1850 #endif /* CONFIG_SCHED_EDGE */
1851 	sched_clutch_bucket_runq_enqueue(bucket_runq, clutch_bucket, options);
1852 
1853 	int16_t root_old_pri = root_clutch->scr_priority;
1854 	sched_clutch_root_pri_update(root_clutch);
1855 	return root_clutch->scr_priority > root_old_pri;
1856 }
1857 
1858 /*
1859  * sched_clutch_bucket_empty()
1860  *
1861  * Perform all the operations needed when a clutch_bucket is no longer runnable.
1862  * It involves removing the clutch bucket from the hierarchy and updating the root
1863  * priority appropriately.
1864  */
1865 static void
1866 sched_clutch_bucket_empty(
1867 	sched_clutch_bucket_t clutch_bucket,
1868 	sched_clutch_root_t root_clutch,
1869 	uint64_t timestamp,
1870 	sched_clutch_bucket_options_t options)
1871 {
1872 	sched_clutch_hierarchy_locked_assert(root_clutch);
1873 	assert3u(clutch_bucket->scb_thr_count, ==, 0);
1874 	sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1875 
1876 	/* Update the timesharing properties of this clutch_bucket_group; also done every sched_tick */
1877 	sched_clutch_bucket_group_pri_shift_update(clutch_bucket->scb_group);
1878 
1879 	clutch_bucket->scb_priority = 0;
1880 	sched_clutch_root_pri_update(root_clutch);
1881 }
1882 
1883 /*
1884  * sched_clutch_cpu_usage_update()
1885  *
1886  * Routine to update CPU usage of the thread in the hierarchy.
1887  */
1888 void
1889 sched_clutch_cpu_usage_update(
1890 	thread_t thread,
1891 	uint64_t delta)
1892 {
1893 	if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread) || SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1894 		return;
1895 	}
1896 
1897 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
1898 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
1899 	sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, delta);
1900 }
1901 
1902 /*
1903  * sched_clutch_bucket_group_cpu_usage_update()
1904  *
1905  * Routine to update the CPU usage of the clutch_bucket.
1906  */
1907 static void
1908 sched_clutch_bucket_group_cpu_usage_update(
1909 	sched_clutch_bucket_group_t clutch_bucket_group,
1910 	uint64_t delta)
1911 {
1912 	if (sched_clutch_bucket_is_above_timeshare(clutch_bucket_group->scbg_bucket)) {
1913 		/* Since the Above UI bucket always has the maximum interactivity score, nothing to do here */
1914 		return;
1915 	}
1916 	delta = MIN(delta, sched_clutch_bucket_group_adjust_threshold);
1917 	os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_used), (clutch_cpu_data_t)delta, relaxed);
1918 }
1919 
1920 /*
1921  * sched_clutch_bucket_group_cpu_pending_adjust()
1922  *
1923  * Routine to calculate the adjusted CPU usage value based on the pending intervals. The calculation is done
1924  * such that one "pending interval" provides one point improvement in interactivity score.
1925  */
1926 static inline uint64_t
1927 sched_clutch_bucket_group_cpu_pending_adjust(
1928 	uint64_t cpu_used,
1929 	uint64_t cpu_blocked,
1930 	uint8_t pending_intervals)
1931 {
1932 	uint64_t cpu_used_adjusted = 0;
1933 	if (cpu_blocked < cpu_used) {
1934 		cpu_used_adjusted = (sched_clutch_bucket_group_interactive_pri * cpu_blocked * cpu_used);
1935 		cpu_used_adjusted = cpu_used_adjusted / ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) + (cpu_used * pending_intervals));
1936 	} else {
1937 		uint64_t adjust_factor = (cpu_blocked * pending_intervals) / sched_clutch_bucket_group_interactive_pri;
1938 		cpu_used_adjusted = (adjust_factor > cpu_used) ? 0 : (cpu_used - adjust_factor);
1939 	}
1940 	return cpu_used_adjusted;
1941 }
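
/*
 * Worked example (illustrative numbers): with an interactive pri of 8,
 * cpu_used = 400, cpu_blocked = 100 and pending_intervals = 2, the first
 * branch yields (8 * 100 * 400) / ((8 * 100) + (400 * 2)) = 200, so the
 * non-interactive score (8 * cpu_blocked) / cpu_used improves from 2 to 4,
 * i.e. exactly one point per pending interval, as intended.
 */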
1942 
1943 /*
1944  * sched_clutch_bucket_group_cpu_adjust()
1945  *
1946  * Routine to scale the cpu usage and blocked time once the sum gets bigger
1947  * than sched_clutch_bucket_group_adjust_threshold. Allows the values to remain
1948  * manageable and maintain the same ratio while allowing clutch buckets to
1949  * adjust behavior and reflect in the interactivity score in a reasonable
1950  * amount of time. Also adjusts the CPU usage based on pending_intervals
1951  * which allows ageout of CPU to avoid starvation in highly contended scenarios.
1952  */
1953 static void
1954 sched_clutch_bucket_group_cpu_adjust(
1955 	sched_clutch_bucket_group_t clutch_bucket_group,
1956 	uint8_t pending_intervals)
1957 {
1958 	sched_clutch_bucket_cpu_data_t old_cpu_data = {};
1959 	sched_clutch_bucket_cpu_data_t new_cpu_data = {};
1960 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, old_cpu_data.scbcd_cpu_data_packed, new_cpu_data.scbcd_cpu_data_packed, relaxed, {
1961 		clutch_cpu_data_t cpu_used = old_cpu_data.cpu_data.scbcd_cpu_used;
1962 		clutch_cpu_data_t cpu_blocked = old_cpu_data.cpu_data.scbcd_cpu_blocked;
1963 
1964 		if ((pending_intervals == 0) && (cpu_used + cpu_blocked) < sched_clutch_bucket_group_adjust_threshold) {
1965 		        /* No changes to the CPU used and blocked values */
1966 		        os_atomic_rmw_loop_give_up();
1967 		}
1968 		if ((cpu_used + cpu_blocked) >= sched_clutch_bucket_group_adjust_threshold) {
1969 		        /* Only keep the recent CPU history to better indicate how this TG has been behaving */
1970 		        cpu_used = cpu_used / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
1971 		        cpu_blocked = cpu_blocked / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
1972 		}
1973 		/* Use the shift passed in to ageout the CPU usage */
1974 		cpu_used = (clutch_cpu_data_t)sched_clutch_bucket_group_cpu_pending_adjust(cpu_used, cpu_blocked, pending_intervals);
1975 		new_cpu_data.cpu_data.scbcd_cpu_used = cpu_used;
1976 		new_cpu_data.cpu_data.scbcd_cpu_blocked = cpu_blocked;
1977 	});
1978 }
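
/*
 * For example, with the default 500ms adjust threshold and an adjust ratio of
 * 10, a group that has accumulated 300ms of CPU used and 200ms blocked crosses
 * the threshold and is rescaled to 30ms/20ms; the used:blocked ratio (and
 * therefore the interactivity score) is preserved while keeping the counters
 * weighted toward recent behavior.
 */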
1979 
1980 /*
1981  * Thread level scheduling algorithm
1982  *
1983  * The thread level scheduling algorithm uses the mach timeshare
1984  * decay based algorithm to achieve sharing between threads within the
1985  * same clutch bucket. The load/priority shifts etc. are all maintained
1986  * at the clutch bucket level and used for decay calculation of the
1987  * threads. The load sampling is still driven off the scheduler tick
1988  * for runnable clutch buckets (it does not use the new higher frequency
1989  * EWMA based load calculation). The idea is that the contention and load
1990  * within clutch_buckets should be limited enough to not see heavy decay
1991  * and timeshare effectively.
1992  */
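
/*
 * Conceptually (a sketch, not the exact code), each runnable thread's
 * effective priority decays as roughly:
 *
 *     sched_pri = base_pri - (sched_usage >> pri_shift)
 *
 * where the thread's pri_shift is taken from its clutch bucket group's
 * scbg_pri_shift (see sched_clutch_bucket_group_pri_shift_update() below),
 * so lightly loaded groups see little or no decay.
 */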
1993 
1994 /*
1995  * sched_clutch_thread_run_bucket_incr() / sched_clutch_run_bucket_incr()
1996  *
1997  * Increment the run count for the clutch bucket associated with the
1998  * thread.
1999  */
2000 uint32_t
2001 sched_clutch_thread_run_bucket_incr(
2002 	thread_t thread,
2003 	sched_bucket_t bucket)
2004 {
2005 	if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2006 		return 0;
2007 	}
2008 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
2009 	return sched_clutch_run_bucket_incr(clutch, bucket);
2010 }
2011 
2012 static uint32_t
2013 sched_clutch_run_bucket_incr(
2014 	sched_clutch_t clutch,
2015 	sched_bucket_t bucket)
2016 {
2017 	assert(bucket != TH_BUCKET_RUN);
2018 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
2019 	return sched_clutch_bucket_group_run_count_inc(clutch_bucket_group);
2020 }
2021 
2022 /*
2023  * sched_clutch_thread_run_bucket_decr() / sched_clutch_run_bucket_decr()
2024  *
2025  * Decrement the run count for the clutch bucket associated with the
2026  * thread.
2027  */
2028 uint32_t
2029 sched_clutch_thread_run_bucket_decr(
2030 	thread_t thread,
2031 	sched_bucket_t bucket)
2032 {
2033 	if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2034 		return 0;
2035 	}
2036 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
2037 	return sched_clutch_run_bucket_decr(clutch, bucket);
2038 }
2039 
2040 static uint32_t
2041 sched_clutch_run_bucket_decr(
2042 	sched_clutch_t clutch,
2043 	sched_bucket_t bucket)
2044 {
2045 	assert(bucket != TH_BUCKET_RUN);
2046 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
2047 	return sched_clutch_bucket_group_run_count_dec(clutch_bucket_group);
2048 }
2049 
2050 /*
2051  * sched_clutch_bucket_group_pri_shift_update()
2052  *
2053  * Routine to update the priority shift for a clutch bucket group,
2054  * necessary for timesharing correctly with priority decay within a
2055  * thread group + QoS.
2056  */
2057 static void
2058 sched_clutch_bucket_group_pri_shift_update(
2059 	sched_clutch_bucket_group_t clutch_bucket_group)
2060 {
2061 	if (sched_clutch_bucket_is_above_timeshare(clutch_bucket_group->scbg_bucket)) {
2062 		/* No timesharing needed for fixed priority Above UI threads */
2063 		return;
2064 	}
2065 
2066 	/*
2067 	 * Update the timeshare parameters for the clutch bucket group
2068 	 * if they haven't been updated in this tick.
2069 	 */
2070 	uint32_t sched_ts = os_atomic_load(&clutch_bucket_group->scbg_timeshare_tick, relaxed);
2071 	uint32_t current_sched_ts = os_atomic_load(&sched_tick, relaxed);
2072 	if (sched_ts < current_sched_ts) {
2073 		os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, current_sched_ts, relaxed);
2074 		/* NCPU wide workloads should not experience decay */
2075 		uint64_t bucket_group_run_count = os_atomic_load_wide(&clutch_bucket_group->scbg_blocked_data.scct_count, relaxed) - 1;
2076 		uint32_t bucket_group_load = (uint32_t)(bucket_group_run_count / processor_avail_count);
2077 		bucket_group_load = MIN(bucket_group_load, NRQS - 1);
2078 		uint32_t pri_shift = sched_fixed_shift - sched_load_shifts[bucket_group_load];
2079 		/* Ensure that the pri_shift value is reasonable */
2080 		pri_shift = (pri_shift > SCHED_PRI_SHIFT_MAX) ? INT8_MAX : pri_shift;
2081 		os_atomic_store(&clutch_bucket_group->scbg_pri_shift, pri_shift, relaxed);
2082 	}
2083 }
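
/*
 * Example of the "NCPU wide workloads should not experience decay" adjustment
 * above (illustrative): on an 8-CPU system, a clutch bucket group with 8
 * runnable threads computes a load of (8 - 1) / 8 = 0, which maps to a
 * pri_shift that effectively disables decay for its threads; timeshare decay
 * only kicks in once the group's runnable count meaningfully exceeds the
 * number of available CPUs.
 */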
2084 
2085 /*
2086  * sched_clutch_bucket_group_timeshare_update()
2087  *
2088  * Routine to update the priority shift and priority for the clutch_bucket_group
2089  * every sched_tick. For multi-cluster platforms, each QoS level will have multiple
2090  * clutch buckets with runnable threads in them. So it is important to maintain
2091  * the timesharing information at the clutch_bucket_group level instead of
2092  * individual clutch buckets (because the algorithm is trying to timeshare all
2093  * threads at the same QoS irrespective of which hierarchy they are enqueued in).
2094  *
2095  * The routine is called from the sched tick handling code to make sure this value
2096  * is updated at least once every sched tick. For clutch bucket groups which have
2097  * not been runnable for very long, the clutch_bucket_group maintains a "last
2098  * updated schedtick" parameter. As threads become runnable in the clutch bucket group,
2099  * if this value is outdated, we update the priority shift.
2100  *
2101  * Possible optimization:
2102  * - The current algorithm samples the load at most once every sched tick (125ms).
2103  *   This is prone to spikes in runnable counts; if that turns out to be
2104  *   a problem, a simple solution would be to do the EWMA trick to sample
2105  *   load at every load_tick (30ms) and use the averaged value for the pri
2106  *   shift calculation.
2107  */
2108 static void
2109 sched_clutch_bucket_group_timeshare_update(
2110 	sched_clutch_bucket_group_t clutch_bucket_group,
2111 	sched_clutch_bucket_t clutch_bucket,
2112 	uint64_t ctime)
2113 {
2114 	if (sched_clutch_bucket_is_above_timeshare(clutch_bucket_group->scbg_bucket)) {
2115 		/* No timesharing needed for fixed priority Above UI threads */
2116 		return;
2117 	}
2118 	sched_clutch_bucket_group_pri_shift_update(clutch_bucket_group);
2119 	/*
2120 	 * Update the clutch bucket priority; this allows clutch buckets that have been pending
2121 	 * for a long time to get an updated interactivity score.
2122 	 */
2123 	sched_clutch_bucket_update(clutch_bucket, clutch_bucket->scb_root, ctime, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
2124 }
2125 
2126 /*
2127  * Calculate the CPU used by this thread and attribute it to the
2128  * thread's current scheduling bucket and clutch bucket group, or
2129  * a previous clutch bucket group if specified.
2130  * Also update the general scheduler CPU usage, matching
2131  * what we do for lightweight_update_priority().
2132  */
2133 static inline void
2134 sched_clutch_thread_tick_delta(thread_t thread, sched_clutch_bucket_group_t _Nullable clutch_bucket_group)
2135 {
2136 	uint32_t cpu_delta;
2137 	sched_tick_delta(thread, cpu_delta);
2138 	if (thread->pri_shift < INT8_MAX) {
2139 		thread->sched_usage += cpu_delta;
2140 	}
2141 	thread->cpu_delta += cpu_delta;
2142 	if (clutch_bucket_group != NULL) {
2143 		sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, cpu_delta);
2144 	} else {
2145 		sched_clutch_cpu_usage_update(thread, cpu_delta);
2146 	}
2147 }
2148 
2149 /*
2150  * sched_clutch_thread_clutch_update()
2151  *
2152  * Routine called when the thread changes its thread group. The current
2153  * implementation relies on the fact that the thread group is changed only from
2154  * the context of the thread itself or when the thread is runnable but not in a
2155  * runqueue. Due to this fact, the thread group change causes only counter
2156  * updates in the old & new clutch buckets and no hierarchy changes. The routine
2157  * also attributes the CPU used so far to the old clutch.
2158  */
2159 void
2160 sched_clutch_thread_clutch_update(
2161 	thread_t thread,
2162 	sched_clutch_t old_clutch,
2163 	sched_clutch_t new_clutch)
2164 {
2165 	if (old_clutch) {
2166 		assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
2167 
2168 		sched_clutch_run_bucket_decr(old_clutch, thread->th_sched_bucket);
2169 
2170 		/* Attribute CPU usage with the old clutch */
2171 		sched_clutch_bucket_group_t old_clutch_bucket_group = NULL;
2172 		if (!SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
2173 			old_clutch_bucket_group = &(old_clutch->sc_clutch_groups[thread->th_sched_bucket]);
2174 		}
2175 		sched_clutch_thread_tick_delta(thread, old_clutch_bucket_group);
2176 	}
2177 
2178 	if (new_clutch) {
2179 		sched_clutch_run_bucket_incr(new_clutch, thread->th_sched_bucket);
2180 	}
2181 }
2182 
2183 /* Thread Insertion/Removal/Selection routines */
2184 
2185 #if CONFIG_SCHED_EDGE
2186 
2187 /*
2188  * Edge Scheduler Bound Thread Support
2189  *
2190  * The edge scheduler allows threads to be bound to specific clusters. The scheduler
2191  * maintains a separate runq on the clutch root to hold these bound threads. These
2192  * bound threads count towards the root priority and thread count, but are ignored
2193  * for thread migration/steal decisions. Bound threads that are enqueued in the
2194  * separate runq have the th_bound_cluster_enqueued flag set to allow easy
2195  * removal.
2196  *
2197  * Bound Threads Timesharing
2198  * The bound threads share the timesharing properties of the clutch bucket group they are
2199  * part of. They contribute to the load and use priority shifts/decay values from the
2200  * clutch bucket group.
2201  */
2202 
2203 static boolean_t
2204 sched_edge_bound_thread_insert(
2205 	sched_clutch_root_t root_clutch,
2206 	thread_t thread,
2207 	integer_t options)
2208 {
2209 	/* Update the clutch runnable count and priority */
2210 	sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2211 	sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
2212 	if (root_bucket->scrb_bound_thread_runq.count == 0) {
2213 		sched_clutch_root_bucket_runnable(root_bucket, root_clutch, mach_absolute_time());
2214 	}
2215 
2216 	assert((thread->th_bound_cluster_enqueued) == false);
2217 	run_queue_enqueue(&root_bucket->scrb_bound_thread_runq, thread, options);
2218 	thread->th_bound_cluster_enqueued = true;
2219 
2220 	/*
2221 	 * Trigger an update to the thread's clutch bucket group's priority shift parameters,
2222 	 * needed for global timeshare within a clutch bucket group.
2223 	 */
2224 	sched_clutch_bucket_group_pri_shift_update(sched_clutch_bucket_group_for_thread(thread));
2225 
2226 	/* Increment the urgency counter for the root if necessary */
2227 	sched_clutch_root_urgency_inc(root_clutch, thread);
2228 
2229 	int16_t root_old_pri = root_clutch->scr_priority;
2230 	sched_clutch_root_pri_update(root_clutch);
2231 	return root_clutch->scr_priority > root_old_pri;
2232 }
2233 
2234 static void
2235 sched_edge_bound_thread_remove(
2236 	sched_clutch_root_t root_clutch,
2237 	thread_t thread)
2238 {
2239 	sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
2240 	assert((thread->th_bound_cluster_enqueued) == true);
2241 	run_queue_remove(&root_bucket->scrb_bound_thread_runq, thread);
2242 	thread->th_bound_cluster_enqueued = false;
2243 
2244 	/* Decrement the urgency counter for the root if necessary */
2245 	sched_clutch_root_urgency_dec(root_clutch, thread);
2246 
2247 	/* Update the clutch runnable count and priority */
2248 	sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
2249 	if (root_bucket->scrb_bound_thread_runq.count == 0) {
2250 		sched_clutch_root_bucket_empty(root_bucket, root_clutch, mach_absolute_time());
2251 	}
2252 	sched_clutch_root_pri_update(root_clutch);
2253 
2254 	/*
2255 	 * Trigger an update to the thread's clutch bucket group's priority shift parameters,
2256 	 * needed for global timeshare within a clutch bucket group.
2257 	 */
2258 	sched_clutch_bucket_group_pri_shift_update(sched_clutch_bucket_group_for_thread(thread));
2259 }
2260 
2261 /*
2262  * Edge Scheduler cluster shared resource threads load balancing
2263  *
2264  * The Edge scheduler attempts to load balance cluster shared resource intensive threads
2265  * across clusters in order to reduce contention on the shared resources. It achieves
2266  * that by maintaining the runnable and running shared resource load on each cluster
2267  * and balancing the load across multiple clusters.
2268  *
2269  * The current implementation for cluster shared resource load balancing looks at
2270  * the per-cluster load at thread runnable time to enqueue the thread in the appropriate
2271  * cluster. The thread is enqueued in the cluster bound runqueue to ensure idle CPUs
2272  * do not steal/rebalance shared resource threads. Some more details for the implementation:
2273  *
2274  * - When threads are tagged as shared resource, they go through the cluster selection logic
2275  *   which looks at cluster shared resource loads and picks a cluster accordingly. The thread is
2276  *   enqueued in the cluster bound runqueue.
2277  *
2278  * - When the threads start running and call avoid_processor, the load balancing logic will be
2279  *   invoked and cause the thread to be sent to a more preferred cluster if one exists and has
2280  *   no shared resource load.
2281  *
2282  * - If a CPU in a preferred cluster is going idle and that cluster has no more shared load,
2283  *   it will look at running shared resource threads on foreign clusters and actively rebalance them.
2284  *
2285  * - Runnable shared resource threads are not stolen by the preferred cluster CPUs as they
2286  *   go idle intentionally.
2287  *
2288  * - One caveat of this design is that if a preferred CPU has already run and finished its shared
2289  *   resource thread execution, it will not go out and steal the runnable thread in the non-preferred cluster.
2290  *   The rebalancing will happen when the thread actually runs on a non-preferred cluster and one of the
2291  *   events listed above happen.
2292  *
2293  * - Also, it currently does not consider other properties such as thread priorities and
2294  *   QoS-level thread load in the thread placement decision.
2295  *
2296  * Edge Scheduler cluster shared resource thread scheduling policy
2297  *
2298  * The threads for shared resources can be scheduled using one of the two policies:
2299  *
2300  * EDGE_SHARED_RSRC_SCHED_POLICY_RR
2301  * This policy distributes the threads so that they spread across all available clusters
2302  * irrespective of type. The idea is that this scheduling policy will put a shared resource
2303  * thread on each cluster on the platform before it starts doubling up on clusters.
2304  *
2305  * EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST
2306  * This policy distributes threads so that the threads first fill up all the capacity on
2307  * the preferred cluster and its homogeneous peers before spilling to different core type.
2308  * The current implementation defines capacity based on the number of CPUs in the cluster;
2309  * so a cluster's shared resource is considered full if there are "n" runnable + running
2310  * shared resource threads on the cluster with n cpus. This policy is different from the
2311  * default scheduling policy of the edge scheduler since this always tries to fill up the
2312  * native clusters to capacity even when non-native clusters might be idle.
2313  */
2314 __options_decl(edge_shared_rsrc_sched_policy_t, uint32_t, {
2315 	EDGE_SHARED_RSRC_SCHED_POLICY_RR                = 0,
2316 	EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST      = 1,
2317 });
2318 
2319 static const edge_shared_rsrc_sched_policy_t edge_shared_rsrc_policy[CLUSTER_SHARED_RSRC_TYPE_COUNT] = {
2320 	[CLUSTER_SHARED_RSRC_TYPE_RR] = EDGE_SHARED_RSRC_SCHED_POLICY_RR,
2321 	[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST,
2322 };
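
/*
 * E.g. under EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST, a preferred cluster
 * with 4 CPUs is treated as full once 4 shared resource threads are runnable
 * or running there, so a fifth such thread spills to a homogeneous peer or,
 * failing that, to a different core type. Under
 * EDGE_SHARED_RSRC_SCHED_POLICY_RR, the same five threads would instead be
 * spread one per cluster before any cluster received a second one.
 */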
2323 
2324 static void
2325 sched_edge_shared_rsrc_runnable_load_incr(sched_clutch_root_t root_clutch, thread_t thread)
2326 {
2327 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
2328 		root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_RR]++;
2329 		thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_RR] = true;
2330 	}
2331 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
2332 		root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST]++;
2333 		thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = true;
2334 	}
2335 }
2336 
2337 static void
2338 sched_edge_shared_rsrc_runnable_load_decr(sched_clutch_root_t root_clutch, thread_t thread)
2339 {
2340 	for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
2341 		if (thread->th_shared_rsrc_enqueued[shared_rsrc_type]) {
2342 			thread->th_shared_rsrc_enqueued[shared_rsrc_type] = false;
2343 			root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type]--;
2344 		}
2345 	}
2346 }
2347 
2348 uint16_t
2349 sched_edge_shared_rsrc_runnable_load(sched_clutch_root_t root_clutch, cluster_shared_rsrc_type_t shared_rsrc_type)
2350 {
2351 	return root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type];
2352 }
2353 
2354 /*
2355  * sched_edge_shared_rsrc_idle()
2356  *
2357  * Routine used to determine if the constrained resource for the pset is idle. This is
2358  * used by a CPU going idle to decide if it should rebalance a running shared resource
2359  * thread from a non-preferred cluster.
2360  */
2361 static boolean_t
2362 sched_edge_shared_rsrc_idle(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
2363 {
2364 	return sched_pset_cluster_shared_rsrc_load(pset, shared_rsrc_type) == 0;
2365 }
2366 
2367 /*
2368  * sched_edge_thread_shared_rsrc_type
2369  *
2370  * This routine decides if a given thread needs special handling for being a
2371  * heavy shared resource user. It is valid for the same thread to be using
2372  * several shared resources at the same time and have multiple policy flags set.
2373  * This routine determines which of those properties will be used for load
2374  * balancing and migration decisions.
2375  */
2376 static cluster_shared_rsrc_type_t
2377 sched_edge_thread_shared_rsrc_type(thread_t thread)
2378 {
2379 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
2380 		return CLUSTER_SHARED_RSRC_TYPE_RR;
2381 	}
2382 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
2383 		return CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST;
2384 	}
2385 	return CLUSTER_SHARED_RSRC_TYPE_NONE;
2386 }
2387 
2388 #endif /* CONFIG_SCHED_EDGE */
2389 
2390 /*
2391  * sched_clutch_thread_bound_lookup()
2392  *
2393  * Routine to lookup the highest priority runnable thread in a bounded root bucket.
2394  */
2395 static thread_t
2396 sched_clutch_thread_bound_lookup(
2397 	__unused sched_clutch_root_t root_clutch,
2398 	sched_clutch_root_bucket_t root_bucket,
2399 	processor_t processor,
2400 	thread_t _Nullable prev_thread)
2401 {
2402 	assert(root_bucket->scrb_bound == true);
2403 	thread_t bound_thread = run_queue_peek(&root_bucket->scrb_bound_thread_runq);
2404 	if ((prev_thread != NULL) &&
2405 	    (bound_thread == NULL || sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, bound_thread->sched_pri, processor->first_timeslice))) {
2406 		return prev_thread;
2407 	}
2408 	assert(bound_thread != THREAD_NULL);
2409 	return bound_thread;
2410 }
2411 
2412 /*
2413  * Clutch Bucket Group Thread Counts and Pending time calculation
2414  *
2415  * The pending time on the clutch_bucket_group allows the scheduler to track if it
2416  * needs to age out the CPU usage because the clutch_bucket_group has been pending for
2417  * a very long time. The pending time is set to the timestamp as soon as a thread becomes
2418  * runnable. When a thread is picked up for execution from this clutch_bucket_group, the
2419  * pending time is advanced to the time of thread selection.
2420  *
2421  * Since threads for a clutch bucket group can be added or removed from multiple CPUs
2422  * simultaneously, it is important that the updates to thread counts and pending timestamps
2423  * happen atomically. The implementation relies on the following aspects to make that work
2424  * as expected:
2425  * - The clutch scheduler is only deployed on single-cluster platforms, where the pset lock
2426  *   is held when threads are added/removed and pending timestamps are updated
2427  * - The thread count and pending timestamp can be updated atomically using double-wide
2428  *   128-bit atomics
2429  *
2430  * Clutch bucket group interactivity timestamp and score updates also rely on the properties
2431  * above to atomically update the interactivity score for a clutch bucket group.
2432  */
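/*
 * Worked example of the pending ageout below (editorial addition, illustrative
 * numbers only): suppose a clutch bucket group has been pending for
 * pending_delta = 50ms, while interactivity_delta (the per-bucket pending delta
 * tunable plus bucket_load * thread quantum) works out to 20ms. Then
 *
 *     cpu_usage_shift = pending_delta / interactivity_delta = 2
 *
 * and the stored pending timestamp is advanced by 2 * interactivity_delta, so
 * the leftover 10ms keeps counting towards the next ageout interval. The
 * returned shift is then used by sched_clutch_bucket_group_cpu_adjust() to age
 * out the group's CPU usage, which boosts its interactivity score.
 */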
2433 
2434 #if CONFIG_SCHED_EDGE
2435 
2436 static void
2437 sched_clutch_bucket_group_thr_count_inc(
2438 	sched_clutch_bucket_group_t clutch_bucket_group,
2439 	uint64_t timestamp)
2440 {
2441 	sched_clutch_counter_time_t old_pending_data;
2442 	sched_clutch_counter_time_t new_pending_data;
2443 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2444 		new_pending_data.scct_count = old_pending_data.scct_count + 1;
2445 		new_pending_data.scct_timestamp = old_pending_data.scct_timestamp;
2446 		if (old_pending_data.scct_count == 0) {
2447 		        new_pending_data.scct_timestamp = timestamp;
2448 		}
2449 	});
2450 }
2451 
2452 static void
2453 sched_clutch_bucket_group_thr_count_dec(
2454 	sched_clutch_bucket_group_t clutch_bucket_group,
2455 	uint64_t timestamp)
2456 {
2457 	sched_clutch_counter_time_t old_pending_data;
2458 	sched_clutch_counter_time_t new_pending_data;
2459 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2460 		new_pending_data.scct_count = old_pending_data.scct_count - 1;
2461 		if (new_pending_data.scct_count == 0) {
2462 		        new_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2463 		} else {
2464 		        new_pending_data.scct_timestamp = timestamp;
2465 		}
2466 	});
2467 }
2468 
2469 static uint8_t
2470 sched_clutch_bucket_group_pending_ageout(
2471 	sched_clutch_bucket_group_t clutch_bucket_group,
2472 	uint64_t timestamp)
2473 {
2474 	int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2475 	sched_clutch_counter_time_t old_pending_data;
2476 	sched_clutch_counter_time_t new_pending_data;
2477 	uint8_t cpu_usage_shift = 0;
2478 
2479 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2480 		cpu_usage_shift = 0;
2481 		uint64_t old_pending_ts = old_pending_data.scct_timestamp;
2482 		bool old_update = (old_pending_ts >= timestamp);
2483 		bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2484 		bool no_bucket_load = (bucket_load == 0);
2485 		if (old_update || no_pending_time || no_bucket_load) {
2486 		        os_atomic_rmw_loop_give_up();
2487 		}
2488 
2489 		/* Calculate the time the clutch bucket group has been pending */
2490 		uint64_t pending_delta = timestamp - old_pending_ts;
2491 		/*
2492 		 * Other buckets should get a chance to run first before artificially boosting
2493 		 * this clutch bucket group's interactivity score, at least when the entire root
2494 		 * bucket is getting a large enough share of CPU.
2495 		 */
2496 		uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] + (bucket_load * sched_clutch_thread_quantum[clutch_bucket_group->scbg_bucket]);
2497 		if (pending_delta < interactivity_delta) {
2498 		        os_atomic_rmw_loop_give_up();
2499 		}
2500 		cpu_usage_shift = (pending_delta / interactivity_delta);
2501 		new_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2502 		new_pending_data.scct_count = old_pending_data.scct_count;
2503 	});
2504 	return cpu_usage_shift;
2505 }
2506 
2507 static boolean_t
2508 sched_edge_thread_should_be_inserted_as_bound(
2509 	sched_clutch_root_t root_clutch,
2510 	thread_t thread)
2511 {
2512 	/*
2513 	 * Check if the thread is bound and is being enqueued in its desired bound cluster.
2514 	 * If the thread is cluster-bound but to a different cluster, we should enqueue as unbound.
2515 	 */
2516 	if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) && (sched_edge_thread_bound_cluster_id(thread) == root_clutch->scr_cluster_id)) {
2517 		return TRUE;
2518 	}
2519 	/*
2520 	 * Use bound runqueue for shared resource threads. See "cluster shared resource
2521 	 * threads load balancing" section for details.
2522 	 */
2523 	if (sched_edge_thread_shared_rsrc_type(thread) != CLUSTER_SHARED_RSRC_TYPE_NONE) {
2524 		return TRUE;
2525 	}
2526 	return FALSE;
2527 }
2528 
2529 #else /* CONFIG_SCHED_EDGE */
2530 
2531 /*
2532  * For the clutch scheduler, atomicity is ensured by making sure all operations
2533  * happen under the pset lock of the only cluster present on the platform.
2534  */
2535 static void
2536 sched_clutch_bucket_group_thr_count_inc(
2537 	sched_clutch_bucket_group_t clutch_bucket_group,
2538 	uint64_t timestamp)
2539 {
2540 	sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2541 	if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2542 		clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2543 	}
2544 	clutch_bucket_group->scbg_pending_data.scct_count++;
2545 }
2546 
2547 static void
2548 sched_clutch_bucket_group_thr_count_dec(
2549 	sched_clutch_bucket_group_t clutch_bucket_group,
2550 	uint64_t timestamp)
2551 {
2552 	sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2553 	clutch_bucket_group->scbg_pending_data.scct_count--;
2554 	if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2555 		clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2556 	} else {
2557 		clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2558 	}
2559 }
2560 
2561 static uint8_t
2562 sched_clutch_bucket_group_pending_ageout(
2563 	sched_clutch_bucket_group_t clutch_bucket_group,
2564 	uint64_t timestamp)
2565 {
2566 	sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2567 	int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2568 	uint64_t old_pending_ts = clutch_bucket_group->scbg_pending_data.scct_timestamp;
2569 	bool old_update = (old_pending_ts >= timestamp);
2570 	bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2571 	bool no_bucket_load = (bucket_load == 0);
2572 	if (old_update || no_pending_time || no_bucket_load) {
2573 		return 0;
2574 	}
2575 	uint64_t pending_delta = timestamp - old_pending_ts;
2576 	/*
2577 	 * Other buckets should get a chance to run first before artificially boosting
2578 	 * this clutch bucket group's interactivity score, at least when the entire root
2579 	 * bucket is getting a large enough share of CPU.
2580 	 */
2581 	uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] + (bucket_load * sched_clutch_thread_quantum[clutch_bucket_group->scbg_bucket]);
2582 	if (pending_delta < interactivity_delta) {
2583 		return 0;
2584 	}
2585 	uint8_t cpu_usage_shift = (pending_delta / interactivity_delta);
2586 	clutch_bucket_group->scbg_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2587 	return cpu_usage_shift;
2588 }
2589 
2590 #endif /* CONFIG_SCHED_EDGE */
2591 
2592 static uint8_t
2593 sched_clutch_bucket_group_interactivity_score_calculate(
2594 	sched_clutch_bucket_group_t clutch_bucket_group,
2595 	uint64_t timestamp)
2596 {
2597 	if (sched_clutch_bucket_is_above_timeshare(clutch_bucket_group->scbg_bucket)) {
2598 		/*
2599 		 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
2600 		 * priorities, make sure all AboveUI buckets are marked interactive.
2601 		 */
2602 		assert(clutch_bucket_group->scbg_interactivity_data.scct_count == (2 * sched_clutch_bucket_group_interactive_pri));
2603 		return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2604 	}
2605 	/* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */
2606 	uint8_t pending_intervals = sched_clutch_bucket_group_pending_ageout(clutch_bucket_group, timestamp);
2607 	/* Adjust CPU stats based on the calculated shift so that only recent behavior is used */
2608 	sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group, pending_intervals);
2609 	uint8_t interactivity_score = sched_clutch_interactivity_from_cpu_data(clutch_bucket_group);
2610 	/* Write back any interactivity score update */
2611 #if CONFIG_SCHED_EDGE
2612 	sched_clutch_counter_time_t old_interactivity_data;
2613 	sched_clutch_counter_time_t new_interactivity_data;
2614 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_interactivity_data.scct_packed, old_interactivity_data.scct_packed, new_interactivity_data.scct_packed, relaxed, {
2615 		new_interactivity_data.scct_count = old_interactivity_data.scct_count;
2616 		if (old_interactivity_data.scct_timestamp >= timestamp) {
2617 		        os_atomic_rmw_loop_give_up();
2618 		}
2619 		new_interactivity_data.scct_timestamp = timestamp;
2620 		if (old_interactivity_data.scct_timestamp != 0) {
2621 		        new_interactivity_data.scct_count = interactivity_score;
2622 		}
2623 	});
2624 	return (uint8_t)new_interactivity_data.scct_count;
2625 #else /* !CONFIG_SCHED_EDGE */
2626 	sched_clutch_hierarchy_locked_assert(&pset0.pset_clutch_root);
2627 	if (timestamp > clutch_bucket_group->scbg_interactivity_data.scct_timestamp) {
2628 		clutch_bucket_group->scbg_interactivity_data.scct_count = interactivity_score;
2629 		clutch_bucket_group->scbg_interactivity_data.scct_timestamp = timestamp;
2630 	}
2631 	return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2632 #endif /* !CONFIG_SCHED_EDGE */
2633 }
2634 
2635 /*
2636  * Clutch Bucket Group Run Count and Blocked Time Accounting
2637  *
2638  * The clutch bucket group maintains the number of runnable/running threads in the group.
2639  * Since the blocked time of the clutch bucket group is based on this count, it is
2640  * important to make sure the blocking timestamp and the run count are updated atomically.
2641  *
2642  * Since the run count increments happen without any pset locks held, the scheduler updates
2643  * the count & timestamp using double-wide 128-bit atomics.
2644  */
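/*
 * Worked example (editorial addition): if the last runnable/running thread of
 * a clutch bucket group blocks at time T1, run_count_dec() below records T1 as
 * the blocked timestamp. When a thread of the group becomes runnable again at
 * T2, run_count_inc() observes the 0 -> 1 transition and accumulates
 * MIN(T2 - T1, sched_clutch_bucket_group_adjust_threshold) into
 * scbcd_cpu_blocked, so a group cannot bank an unbounded amount of blocked
 * time while it sits idle.
 */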
2645 
2646 static uint32_t
2647 sched_clutch_bucket_group_run_count_inc(
2648 	sched_clutch_bucket_group_t clutch_bucket_group)
2649 {
2650 	sched_clutch_counter_time_t old_blocked_data;
2651 	sched_clutch_counter_time_t new_blocked_data;
2652 
2653 	bool update_blocked_time = false;
2654 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2655 		new_blocked_data.scct_count = old_blocked_data.scct_count + 1;
2656 		new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2657 		update_blocked_time = false;
2658 		if (old_blocked_data.scct_count == 0) {
2659 		        new_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
2660 		        update_blocked_time = true;
2661 		}
2662 	});
2663 	if (update_blocked_time && (old_blocked_data.scct_timestamp != SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID)) {
2664 		uint64_t ctime = mach_absolute_time();
2665 		if (ctime > old_blocked_data.scct_timestamp) {
2666 			uint64_t blocked_time = ctime - old_blocked_data.scct_timestamp;
2667 			blocked_time = MIN(blocked_time, sched_clutch_bucket_group_adjust_threshold);
2668 			os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed);
2669 		}
2670 	}
2671 	return (uint32_t)new_blocked_data.scct_count;
2672 }
2673 
2674 static uint32_t
2675 sched_clutch_bucket_group_run_count_dec(
2676 	sched_clutch_bucket_group_t clutch_bucket_group)
2677 {
2678 	sched_clutch_counter_time_t old_blocked_data;
2679 	sched_clutch_counter_time_t new_blocked_data;
2680 
2681 	uint64_t ctime = mach_absolute_time();
2682 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2683 		new_blocked_data.scct_count = old_blocked_data.scct_count - 1;
2684 		new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2685 		if (new_blocked_data.scct_count == 0) {
2686 		        new_blocked_data.scct_timestamp = ctime;
2687 		}
2688 	});
2689 	return (uint32_t)new_blocked_data.scct_count;
2690 }
2691 
2692 static inline sched_clutch_bucket_t
2693 sched_clutch_bucket_for_thread(
2694 	sched_clutch_root_t root_clutch,
2695 	thread_t thread)
2696 {
2697 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
2698 	assert(thread->thread_group == clutch->sc_tg);
2699 
2700 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2701 	sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2702 	assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
2703 
2704 	return clutch_bucket;
2705 }
2706 
2707 static inline sched_clutch_bucket_group_t
2708 sched_clutch_bucket_group_for_thread(thread_t prev_thread)
2709 {
2710 	sched_clutch_t clutch = sched_clutch_for_thread_group(prev_thread->thread_group);
2711 	return &clutch->sc_clutch_groups[prev_thread->th_sched_bucket];
2712 }
2713 
2714 /*
2715  * sched_clutch_thread_insert()
2716  *
2717  * Routine to insert a thread into the sched clutch hierarchy.
2718  * Update the counts at all levels of the hierarchy and insert the nodes
2719  * as they become runnable. Always called with the pset lock held.
2720  */
2721 static boolean_t
2722 sched_clutch_thread_insert(
2723 	sched_clutch_root_t root_clutch,
2724 	thread_t thread,
2725 	integer_t options)
2726 {
2727 	boolean_t result = FALSE;
2728 
2729 	sched_clutch_hierarchy_locked_assert(root_clutch);
2730 #if CONFIG_SCHED_EDGE
2731 	sched_edge_cluster_cumulative_count_incr(root_clutch, thread->th_sched_bucket);
2732 	sched_edge_shared_rsrc_runnable_load_incr(root_clutch, thread);
2733 
2734 	if (sched_edge_thread_should_be_inserted_as_bound(root_clutch, thread)) {
2735 		/*
2736 		 * Includes threads bound to this specific cluster as well as all
2737 		 * shared resource threads.
2738 		 */
2739 		return sched_edge_bound_thread_insert(root_clutch, thread, options);
2740 	}
2741 #endif /* CONFIG_SCHED_EDGE */
2742 
2743 	uint64_t current_timestamp = mach_absolute_time();
2744 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
2745 	assert(thread->thread_group == clutch->sc_tg);
2746 	sched_clutch_bucket_t clutch_bucket = sched_clutch_bucket_for_thread(root_clutch, thread);
2747 	assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
2748 
2749 	/*
2750 	 * Thread linkage in clutch_bucket
2751 	 *
2752 	 * A thread has a few linkages within the clutch bucket:
2753 	 * - A stable priority queue linkage which is the main runqueue (based on sched_pri) for the clutch bucket
2754 	 * - A regular priority queue linkage which is based on thread's base/promoted pri (used for clutch bucket priority calculation)
2755 	 * - A queue linkage used for timesharing operations of threads at the scheduler tick
2756 	 */
2757 
2758 	/* Insert thread into the clutch_bucket stable priority runqueue using sched_pri */
2759 	thread->th_clutch_runq_link.stamp = current_timestamp;
2760 	priority_queue_entry_set_sched_pri(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link, thread->sched_pri,
2761 	    (options & SCHED_TAILQ) ? PRIORITY_QUEUE_ENTRY_NONE : PRIORITY_QUEUE_ENTRY_PREEMPTED);
2762 	priority_queue_insert(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2763 
2764 	/* Insert thread into clutch_bucket priority queue based on the promoted or base priority */
2765 	priority_queue_entry_set_sched_pri(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link,
2766 	    sched_thread_sched_pri_promoted(thread) ? thread->sched_pri : thread->base_pri, false);
2767 	priority_queue_insert(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2768 
2769 	/* Insert thread into timesharing queue of the clutch bucket */
2770 	enqueue_tail(&clutch_bucket->scb_thread_timeshare_queue, &thread->th_clutch_timeshare_link);
2771 
2772 	/* Increment the urgency counter for the root if necessary */
2773 	sched_clutch_root_urgency_inc(root_clutch, thread);
2774 
2775 	os_atomic_inc(&clutch->sc_thr_count, relaxed);
2776 	sched_clutch_bucket_group_thr_count_inc(clutch_bucket->scb_group, current_timestamp);
2777 
2778 	/* Enqueue the clutch into the hierarchy (if needed) and update properties; pick the insertion order based on thread options */
2779 	sched_clutch_bucket_options_t scb_options = (options & SCHED_HEADQ) ? SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ : SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ;
2780 	if (clutch_bucket->scb_thr_count == 0) {
2781 		sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2782 		sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2783 		result = sched_clutch_bucket_runnable(clutch_bucket, root_clutch, current_timestamp, scb_options);
2784 	} else {
2785 		sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2786 		sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2787 		result = sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, scb_options);
2788 	}
2789 
2790 	KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2791 	    root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2792 	    SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2793 	return result;
2794 }
2795 
2796 /*
2797  * sched_clutch_thread_remove()
2798  *
2799  * Routine to remove a thread from the sched clutch hierarchy.
2800  * Update the counts at all levels of the hierarchy and remove the nodes
2801  * as they become empty. Always called with the pset lock held.
2802  */
2803 static void
2804 sched_clutch_thread_remove(
2805 	sched_clutch_root_t root_clutch,
2806 	thread_t thread,
2807 	uint64_t current_timestamp,
2808 	sched_clutch_bucket_options_t options)
2809 {
2810 	sched_clutch_hierarchy_locked_assert(root_clutch);
2811 #if CONFIG_SCHED_EDGE
2812 	sched_edge_cluster_cumulative_count_decr(root_clutch, thread->th_sched_bucket);
2813 	sched_edge_shared_rsrc_runnable_load_decr(root_clutch, thread);
2814 
2815 	if (thread->th_bound_cluster_enqueued) {
2816 		sched_edge_bound_thread_remove(root_clutch, thread);
2817 		return;
2818 	}
2819 #endif /* CONFIG_SCHED_EDGE */
2820 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
2821 	assert(thread->thread_group == clutch->sc_tg);
2822 	thread_assert_runq_nonnull(thread);
2823 
2824 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2825 	sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2826 	assert(clutch_bucket->scb_root == root_clutch);
2827 
2828 	/* Decrement the urgency counter for the root if necessary */
2829 	sched_clutch_root_urgency_dec(root_clutch, thread);
2830 	/* Remove thread from the clutch_bucket */
2831 	priority_queue_remove(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2832 	remqueue(&thread->th_clutch_timeshare_link);
2833 
2834 	priority_queue_remove(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2835 
2836 	/*
2837 	 * Warning: After this point, the thread's scheduling fields may be
2838 	 * modified by other cores that acquire the thread lock.
2839 	 */
2840 	thread_clear_runq(thread);
2841 
2842 	/* Update counts at various levels of the hierarchy */
2843 	os_atomic_dec(&clutch->sc_thr_count, relaxed);
2844 	sched_clutch_bucket_group_thr_count_dec(clutch_bucket->scb_group, current_timestamp);
2845 	sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
2846 	sched_clutch_thr_count_dec(&clutch_bucket->scb_thr_count);
2847 
2848 	/* Remove the clutch from hierarchy (if needed) and update properties */
2849 	if (clutch_bucket->scb_thr_count == 0) {
2850 		sched_clutch_bucket_empty(clutch_bucket, root_clutch, current_timestamp, options);
2851 	} else {
2852 		sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, options);
2853 	}
2854 
2855 	KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2856 	    root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2857 	    SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2858 }
2859 
2860 /*
2861  * sched_clutch_thread_unbound_lookup()
2862  *
2863  * Routine to find the highest unbound thread in the root clutch.
2864  * Helps find threads easily for steal/migrate scenarios in the
2865  * Edge scheduler.
2866  */
2867 static thread_t
2868 sched_clutch_thread_unbound_lookup(
2869 	sched_clutch_root_t root_clutch,
2870 	sched_clutch_root_bucket_t root_bucket,
2871 	processor_t _Nullable processor,
2872 	thread_t _Nullable prev_thread)
2873 {
2874 	assert(processor != NULL || prev_thread == NULL);
2875 	assert(root_bucket->scrb_bound == false);
2876 	sched_clutch_hierarchy_locked_assert(root_clutch);
2877 
2878 	/* Find the highest priority clutch bucket in this root bucket */
2879 	bool chose_prev_thread = false;
2880 	sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, root_bucket, processor, prev_thread, &chose_prev_thread);
2881 	assert(clutch_bucket != NULL);
2882 
2883 	if (chose_prev_thread) {
2884 		/* We have determined that prev_thread is the highest thread, based on the Clutch bucket level policy */
2885 		assert(processor != NULL && prev_thread != NULL);
2886 		return prev_thread;
2887 	}
2888 
2889 	/* Find the highest priority runnable thread in this clutch bucket */
2890 	thread_t thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
2891 	assert(thread != NULL);
2892 
2893 	/* Consider the previous thread */
2894 	if (prev_thread != NULL &&
2895 	    sched_clutch_bucket_for_thread(root_clutch, prev_thread) == clutch_bucket &&
2896 	    sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, thread->sched_pri, processor->first_timeslice)) {
2897 		thread = prev_thread;
2898 	}
2899 
2900 	return thread;
2901 }
2902 
2903 static sched_clutch_root_bucket_t
2904 sched_clutch_root_bucket_for_thread(
2905 	sched_clutch_root_t root_clutch,
2906 	thread_t prev_thread)
2907 {
2908 #if CONFIG_SCHED_EDGE
2909 	if (sched_edge_thread_should_be_inserted_as_bound(root_clutch, prev_thread)) {
2910 		return &root_clutch->scr_bound_buckets[prev_thread->th_sched_bucket];
2911 	}
2912 #endif /* CONFIG_SCHED_EDGE */
2913 	return &root_clutch->scr_unbound_buckets[prev_thread->th_sched_bucket];
2914 }
2915 
2916 /*
2917  * sched_clutch_hierarchy_thread_highest()
2918  *
2919  * Routine to traverse the Clutch hierarchy and return the highest thread which
2920  * should be selected to run next, optionally comparing against the previously
2921  * running thread. Removes the highest thread with sched_clutch_thread_remove()
2922  * depending on the traverse mode and whether it is the previously running thread.
2923  * Always called with the pset lock held.
2924  */
2925 static thread_t
2926 sched_clutch_hierarchy_thread_highest(
2927 	sched_clutch_root_t root_clutch,
2928 	processor_t processor,
2929 	thread_t _Nullable prev_thread,
2930 	sched_clutch_traverse_mode_t mode)
2931 {
2932 	assert(mode != SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || prev_thread == NULL);
2933 	sched_clutch_hierarchy_locked_assert(root_clutch);
2934 
2935 	thread_t highest_thread = NULL;
2936 	uint64_t current_timestamp = mach_absolute_time();
2937 	bool chose_prev_thread = false;
2938 	sched_clutch_dbg_thread_select_packed_t debug_info = {0};
2939 	sched_clutch_root_bucket_t prev_root_bucket = prev_thread != NULL ? sched_clutch_root_bucket_for_thread(root_clutch, prev_thread) : NULL;
2940 	sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(root_clutch, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL, prev_root_bucket, prev_thread, &chose_prev_thread, mode, &debug_info);
2941 	if (chose_prev_thread) {
2942 		/* We determined that we should keep running the previous thread */
2943 		highest_thread = processor->active_thread;
2944 		goto done_selecting_thread;
2945 	}
2946 	if (root_bucket == NULL) {
2947 		/* The Clutch hierarchy has no runnable threads, including the previous thread */
2948 		assert(sched_clutch_root_count(root_clutch) == 0);
2949 		assert(prev_thread == NULL);
2950 		return NULL;
2951 	}
2952 	if (root_bucket != prev_root_bucket) {
2953 		/* We have ruled out continuing to run the previous thread, based on the root bucket level policy */
2954 		prev_thread = NULL;
2955 		assert((mode == SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT) || (prev_root_bucket == NULL) ||
2956 		    (prev_root_bucket->scrb_bucket >= root_bucket->scrb_bucket) || (root_bucket->scrb_starvation_avoidance) ||
2957 		    (prev_root_bucket->scrb_bound != root_bucket->scrb_bound) ||
2958 		    (root_bucket->scrb_warp_remaining > 0 && root_bucket->scrb_warped_deadline > current_timestamp && prev_root_bucket->scrb_warp_remaining == 0));
2959 	}
2960 
2961 	if (root_bucket->scrb_bound) {
2962 		highest_thread = sched_clutch_thread_bound_lookup(root_clutch, root_bucket, processor, prev_thread);
2963 	} else {
2964 		highest_thread = sched_clutch_thread_unbound_lookup(root_clutch, root_bucket, processor, prev_thread);
2965 	}
2966 
2967 	if (mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY ||
2968 	    (mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT && highest_thread != processor->active_thread)) {
2969 		assert(mode != SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || highest_thread != processor->active_thread);
2970 		sched_clutch_thread_remove(root_clutch, highest_thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
2971 	}
2972 
2973 done_selecting_thread:
2974 	debug_info.trace_data.version = SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION;
2975 	debug_info.trace_data.traverse_mode = mode;
2976 	debug_info.trace_data.cluster_id = root_clutch->scr_cluster_id;
2977 	debug_info.trace_data.selection_was_cluster_bound = root_bucket->scrb_bound;
2978 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
2979 	    thread_tid(highest_thread), thread_group_get_id(highest_thread->thread_group), root_bucket->scrb_bucket, debug_info.scdts_trace_data_packed, 0);
2980 	return highest_thread;
2981 }
2982 
2983 /* High level global accessor routines */
2984 
2985 /*
2986  * sched_clutch_root_urgency()
2987  *
2988  * Routine to get the urgency of the highest runnable
2989  * thread in the hierarchy.
2990  */
2991 static uint32_t
2992 sched_clutch_root_urgency(
2993 	sched_clutch_root_t root_clutch)
2994 {
2995 	return root_clutch->scr_urgency;
2996 }
2997 
2998 /*
2999  * sched_clutch_root_count_sum()
3000  *
3001  * The count_sum mechanism is used for scheduler runq
3002  * statistics calculation. It's only useful for debugging
3003  * purposes; since it takes a mach_absolute_time() call on
3004  * other scheduler implementations, it's better to avoid
3005  * populating this until absolutely necessary.
3006  */
3007 static uint32_t
3008 sched_clutch_root_count_sum(
3009 	__unused sched_clutch_root_t root_clutch)
3010 {
3011 	return 0;
3012 }
3013 
3014 /*
3015  * sched_clutch_root_priority()
3016  *
3017  * Routine to get the priority of the highest runnable
3018  * thread in the hierarchy.
3019  */
3020 static int
3021 sched_clutch_root_priority(
3022 	sched_clutch_root_t root_clutch)
3023 {
3024 	return root_clutch->scr_priority;
3025 }
3026 
3027 /*
3028  * sched_clutch_root_count()
3029  *
3030  * Returns total number of runnable threads in the hierarchy.
3031  */
3032 uint32_t
3033 sched_clutch_root_count(
3034 	sched_clutch_root_t root_clutch)
3035 {
3036 	return root_clutch->scr_thr_count;
3037 }
3038 
3039 #if CONFIG_SCHED_EDGE
3040 
3041 /*
3042  * sched_clutch_root_foreign_empty()
3043  *
3044  * Routine to check if the foreign clutch bucket priority list is empty for a cluster.
3045  */
3046 static boolean_t
3047 sched_clutch_root_foreign_empty(
3048 	sched_clutch_root_t root_clutch)
3049 {
3050 	return priority_queue_empty(&root_clutch->scr_foreign_buckets);
3051 }
3052 
3053 /*
3054  * sched_clutch_root_highest_foreign_thread_remove()
3055  *
3056  * Routine to return the thread in the highest priority clutch bucket in a cluster.
3057  * Must be called with the pset for the cluster locked.
3058  */
3059 static thread_t
3060 sched_clutch_root_highest_foreign_thread_remove(
3061 	sched_clutch_root_t root_clutch)
3062 {
3063 	thread_t thread = THREAD_NULL;
3064 	if (priority_queue_empty(&root_clutch->scr_foreign_buckets)) {
3065 		return thread;
3066 	}
3067 	sched_clutch_bucket_t clutch_bucket = priority_queue_max(&root_clutch->scr_foreign_buckets, struct sched_clutch_bucket, scb_foreignlink);
3068 	thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
3069 	sched_clutch_thread_remove(root_clutch, thread, mach_absolute_time(), 0);
3070 	return thread;
3071 }
3072 
3073 #endif /* CONFIG_SCHED_EDGE */
3074 
3075 /*
3076  * sched_clutch_thread_pri_shift()
3077  *
3078  * Routine to get the priority shift value for a thread.
3079  * Since the timesharing is done at the clutch_bucket level,
3080  * this routine gets the clutch_bucket and retrieves the
3081  * values from there.
3082  */
3083 uint32_t
3084 sched_clutch_thread_pri_shift(
3085 	thread_t thread,
3086 	sched_bucket_t bucket)
3087 {
3088 	if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3089 		return INT8_MAX;
3090 	}
3091 	assert(bucket != TH_BUCKET_RUN);
3092 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
3093 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
3094 	return os_atomic_load(&clutch_bucket_group->scbg_pri_shift, relaxed);
3095 }
3096 
3097 #pragma mark -- Clutch Scheduler Algorithm
3098 
3099 static void
3100 sched_clutch_init(void);
3101 
3102 static thread_t
3103 sched_clutch_steal_thread(processor_set_t pset);
3104 
3105 #if !SCHED_TEST_HARNESS
3106 
3107 static void
3108 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context);
3109 
3110 #endif /* !SCHED_TEST_HARNESS */
3111 
3112 static boolean_t
3113 sched_clutch_processor_enqueue(processor_t processor, thread_t thread,
3114     sched_options_t options);
3115 
3116 static boolean_t
3117 sched_clutch_processor_queue_remove(processor_t processor, thread_t thread);
3118 
3119 static ast_t
3120 sched_clutch_processor_csw_check(processor_t processor);
3121 
3122 static boolean_t
3123 sched_clutch_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
3124 
3125 static int
3126 sched_clutch_runq_count(processor_t processor);
3127 
3128 static boolean_t
3129 sched_clutch_processor_queue_empty(processor_t processor);
3130 
3131 #if !SCHED_TEST_HARNESS
3132 
3133 static uint64_t
3134 sched_clutch_runq_stats_count_sum(processor_t processor);
3135 
3136 #endif /* !SCHED_TEST_HARNESS */
3137 
3138 static int
3139 sched_clutch_processor_bound_count(processor_t processor);
3140 
3141 static void
3142 sched_clutch_pset_init(processor_set_t pset);
3143 
3144 static void
3145 sched_clutch_processor_init(processor_t processor);
3146 
3147 static thread_t
3148 sched_clutch_processor_highest_thread(processor_t processor, sched_clutch_traverse_mode_t mode);
3149 
3150 static thread_t
3151 sched_clutch_choose_thread(processor_t processor, int priority, thread_t prev_thread, ast_t reason);
3152 
3153 #if !SCHED_TEST_HARNESS
3154 
3155 static void
3156 sched_clutch_processor_queue_shutdown(processor_t processor);
3157 
3158 #endif /* !SCHED_TEST_HARNESS */
3159 
3160 static sched_mode_t
3161 sched_clutch_initial_thread_sched_mode(task_t parent_task);
3162 
3163 static uint32_t
3164 sched_clutch_initial_quantum_size(thread_t thread);
3165 
3166 static uint32_t
3167 sched_clutch_run_incr(thread_t thread);
3168 
3169 static uint32_t
3170 sched_clutch_run_decr(thread_t thread);
3171 
3172 static void
3173 sched_clutch_update_thread_bucket(thread_t thread);
3174 
3175 #if !SCHED_TEST_HARNESS
3176 
3177 static void
3178 sched_clutch_thread_group_recommendation_change(struct thread_group *tg, cluster_type_t new_recommendation);
3179 
3180 #endif /* !SCHED_TEST_HARNESS */
3181 
3182 const struct sched_dispatch_table sched_clutch_dispatch = {
3183 	.sched_name                                     = "clutch",
3184 	.init                                           = sched_clutch_init,
3185 	.timebase_init                                  = sched_timeshare_timebase_init,
3186 	.processor_init                                 = sched_clutch_processor_init,
3187 	.pset_init                                      = sched_clutch_pset_init,
3188 	.choose_thread                                  = sched_clutch_choose_thread,
3189 	.steal_thread_enabled                           = sched_steal_thread_enabled,
3190 	.steal_thread                                   = sched_clutch_steal_thread,
3191 	.processor_enqueue                              = sched_clutch_processor_enqueue,
3192 	.processor_queue_remove                         = sched_clutch_processor_queue_remove,
3193 	.processor_queue_empty                          = sched_clutch_processor_queue_empty,
3194 	.priority_is_urgent                             = priority_is_urgent,
3195 	.processor_csw_check                            = sched_clutch_processor_csw_check,
3196 	.processor_queue_has_priority                   = sched_clutch_processor_queue_has_priority,
3197 	.initial_quantum_size                           = sched_clutch_initial_quantum_size,
3198 	.initial_thread_sched_mode                      = sched_clutch_initial_thread_sched_mode,
3199 	.processor_runq_count                           = sched_clutch_runq_count,
3200 	.processor_bound_count                          = sched_clutch_processor_bound_count,
3201 	.multiple_psets_enabled                         = TRUE,
3202 	.avoid_processor_enabled                        = FALSE,
3203 	.thread_avoid_processor                         = NULL,
3204 	.update_thread_bucket                           = sched_clutch_update_thread_bucket,
3205 	.cpu_init_completed                             = NULL,
3206 	.thread_eligible_for_pset                       = NULL,
3207 
3208 	.rt_choose_processor                            = sched_rt_choose_processor,
3209 	.rt_steal_thread                                = NULL,
3210 	.rt_init_pset                                   = sched_rt_init_pset,
3211 	.rt_init_completed                              = sched_rt_init_completed,
3212 	.rt_runq_count_sum                              = sched_rt_runq_count_sum,
3213 
3214 #if !SCHED_TEST_HARNESS
3215 	.maintenance_continuation                       = sched_timeshare_maintenance_continue,
3216 	.compute_timeshare_priority                     = sched_compute_timeshare_priority,
3217 	.choose_node                                    = sched_choose_node,
3218 	.choose_processor                               = choose_processor,
3219 	.processor_queue_shutdown                       = sched_clutch_processor_queue_shutdown,
3220 	.can_update_priority                            = can_update_priority,
3221 	.update_priority                                = update_priority,
3222 	.lightweight_update_priority                    = lightweight_update_priority,
3223 	.quantum_expire                                 = sched_default_quantum_expire,
3224 	.processor_runq_stats_count_sum                 = sched_clutch_runq_stats_count_sum,
3225 	.thread_update_scan                             = sched_clutch_thread_update_scan,
3226 	.processor_balance                              = sched_SMT_balance,
3227 	.qos_max_parallelism                            = sched_qos_max_parallelism,
3228 	.check_spill                                    = sched_check_spill,
3229 	.ipi_policy                                     = sched_ipi_policy,
3230 	.thread_should_yield                            = sched_thread_should_yield,
3231 	.run_count_incr                                 = sched_clutch_run_incr,
3232 	.run_count_decr                                 = sched_clutch_run_decr,
3233 	.pset_made_schedulable                          = sched_pset_made_schedulable,
3234 	.thread_group_recommendation_change             = sched_clutch_thread_group_recommendation_change,
3235 
3236 	.rt_queue_shutdown                              = sched_rt_queue_shutdown,
3237 	.rt_runq_scan                                   = sched_rt_runq_scan,
3238 #endif /* !SCHED_TEST_HARNESS */
3239 };
3240 
3241 __attribute__((always_inline))
3242 static inline run_queue_t
3243 sched_clutch_bound_runq(processor_t processor)
3244 {
3245 	return &processor->runq;
3246 }
3247 
3248 __attribute__((always_inline))
3249 static inline sched_clutch_root_t
3250 sched_clutch_processor_root_clutch(processor_t processor)
3251 {
3252 	return &processor->processor_set->pset_clutch_root;
3253 }
3254 
3255 __attribute__((always_inline))
3256 static inline run_queue_t
3257 sched_clutch_thread_bound_runq(processor_t processor, __assert_only thread_t thread)
3258 {
3259 	assert(thread->bound_processor == processor);
3260 	return sched_clutch_bound_runq(processor);
3261 }
3262 
3263 static uint32_t
3264 sched_clutch_initial_quantum_size(thread_t thread)
3265 {
3266 	if (thread == THREAD_NULL) {
3267 		return std_quantum;
3268 	}
3269 	assert(sched_clutch_thread_quantum[thread->th_sched_bucket] <= UINT32_MAX);
3270 	return (uint32_t)sched_clutch_thread_quantum[thread->th_sched_bucket];
3271 }
3272 
3273 static sched_mode_t
3274 sched_clutch_initial_thread_sched_mode(task_t parent_task)
3275 {
3276 	if (parent_task == kernel_task) {
3277 		return TH_MODE_FIXED;
3278 	} else {
3279 		return TH_MODE_TIMESHARE;
3280 	}
3281 }
3282 
3283 static void
3284 sched_clutch_processor_init(processor_t processor)
3285 {
3286 	run_queue_init(&processor->runq);
3287 }
3288 
3289 static void
3290 sched_clutch_pset_init(processor_set_t pset)
3291 {
3292 	sched_clutch_root_init(&pset->pset_clutch_root, pset);
3293 }
3294 
3295 static void
3296 sched_clutch_tunables_init(void)
3297 {
3298 	sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
3299 	sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
3300 	sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
3301 	clock_interval_to_absolutetime_interval(SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS,
3302 	    NSEC_PER_USEC, &sched_clutch_bucket_group_adjust_threshold);
3303 	assert(sched_clutch_bucket_group_adjust_threshold <= CLUTCH_CPU_DATA_MAX);
3304 	sched_clutch_us_to_abstime(sched_clutch_bucket_group_pending_delta_us, sched_clutch_bucket_group_pending_delta);
3305 }
3306 
3307 static void
3308 sched_clutch_init(void)
3309 {
3310 	if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
3311 		sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
3312 	}
3313 	sched_timeshare_init();
3314 	sched_clutch_tunables_init();
3315 }
3316 
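/*
 * sched_clutch_pri_greater_than_tiebreak()
 *
 * Callers pass processor->first_timeslice as one_wins_ties, with the previously
 * running thread's priority as pri_one, so a thread that is still within its
 * first quantum wins priority ties against equally important runnable threads
 * instead of being switched out.
 */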
3317 static inline bool
3318 sched_clutch_pri_greater_than_tiebreak(int pri_one, int pri_two, bool one_wins_ties)
3319 {
3320 	if (one_wins_ties) {
3321 		return pri_one >= pri_two;
3322 	} else {
3323 		return pri_one > pri_two;
3324 	}
3325 }
3326 
3327 /*
3328  * sched_clutch_processor_highest_thread()
3329  *
3330  * Routine to determine the highest thread on the entire cluster runqueue which
3331  * should be selected to run next, optionally comparing against the previously
3332  * running thread. Removes the highest thread from the runqueue, depending on the
3333  * traverse mode and whether the highest thread is the previously running thread.
3334  *
3335  * Always called with the pset lock held. Assumes that processor->active_thread
3336  * may be locked and modified by another processor.
3337  */
3338 static thread_t
3339 sched_clutch_processor_highest_thread(
3340 	processor_t      processor,
3341 	sched_clutch_traverse_mode_t mode)
3342 {
3343 	sched_clutch_root_t root_clutch = sched_clutch_processor_root_clutch(processor);
3344 	int clutch_pri = sched_clutch_root_priority(root_clutch);
3345 	run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3346 	int bound_pri = bound_runq->highq;
3347 
3348 	bool has_prev_thread = mode == SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT;
3349 	thread_t prev_thread = has_prev_thread ? processor->active_thread : NULL;
3350 
3351 	if (bound_runq->count == 0 && root_clutch->scr_thr_count == 0) {
3352 		/* The runqueue is totally empty */
3353 		assert(bound_pri < MINPRI && clutch_pri < MINPRI);
3354 		return prev_thread;
3355 	}
3356 
3357 	if (has_prev_thread) {
3358 		if (prev_thread->sched_pri >= BASEPRI_RTQUEUES) {
3359 			/* The previous thread is real-time and thus guaranteed higher than the non-RT runqueue */
3360 			return prev_thread;
3361 		}
3362 		/* Allow the previous thread to influence the priority comparison of Clutch hierarchy vs. processor-bound runqueue */
3363 		if (prev_thread->bound_processor != NULL) {
3364 			bound_pri = MAX(bound_pri, prev_thread->sched_pri);
3365 		} else {
3366 			clutch_pri = MAX(clutch_pri, prev_thread->sched_pri);
3367 		}
3368 	}
3369 
3370 	bool prev_thread_is_not_processor_bound = has_prev_thread && (prev_thread->bound_processor == NULL);
3371 	bool prev_thread_is_processor_bound = has_prev_thread && (prev_thread->bound_processor != NULL);
3372 	thread_t next_thread = prev_thread;
3373 	if (clutch_pri > bound_pri) {
3374 		if (root_clutch->scr_thr_count == 0) {
3375 			goto found_thread;
3376 		}
3377 		next_thread = sched_clutch_hierarchy_thread_highest(root_clutch, processor, prev_thread_is_not_processor_bound ? prev_thread : NULL, mode);
3378 	} else {
3379 		if (bound_runq->count == 0 ||
3380 		    (prev_thread_is_processor_bound && sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, bound_runq->highq, processor->first_timeslice))) {
3381 			goto found_thread;
3382 		}
3383 		next_thread = (mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY) ?
3384 		    run_queue_dequeue(bound_runq, SCHED_HEADQ) : run_queue_peek(bound_runq);
3385 		assert(mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || next_thread != prev_thread);
3386 	}
3387 found_thread:
3388 	assert(next_thread != NULL);
3389 	return next_thread;
3390 }
3391 
3392 static thread_t
3393 sched_clutch_choose_thread(
3394 	processor_t      processor,
3395 	__unused int              priority,
3396 	thread_t _Nullable        prev_thread,
3397 	__unused ast_t            reason)
3398 {
3399 	assert(prev_thread == NULL || prev_thread == processor->active_thread);
3400 	return sched_clutch_processor_highest_thread(processor, prev_thread != NULL ? SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT : SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY);
3401 }
3402 
3403 static boolean_t
3404 sched_clutch_processor_enqueue(
3405 	processor_t       processor,
3406 	thread_t          thread,
3407 	sched_options_t   options)
3408 {
3409 	boolean_t       result;
3410 
3411 	thread_set_runq_locked(thread, processor);
3412 	if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3413 		sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3414 		result = sched_clutch_thread_insert(pset_clutch_root, thread, options);
3415 	} else {
3416 		run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
3417 		result = run_queue_enqueue(rq, thread, options);
3418 	}
3419 	return result;
3420 }
3421 
3422 static boolean_t
3423 sched_clutch_processor_queue_empty(processor_t processor)
3424 {
3425 	return sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0 &&
3426 	       sched_clutch_bound_runq(processor)->count == 0;
3427 }
3428 
3429 static ast_t
3430 sched_clutch_processor_csw_check(processor_t processor)
3431 {
3432 	assert(processor->active_thread != NULL);
3433 	thread_t runqueue_thread = sched_clutch_processor_highest_thread(processor, SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT);
3434 	if (runqueue_thread != processor->active_thread) {
3435 		/* Found a better thread to run */
3436 		if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0 ||
3437 		    sched_clutch_bound_runq(processor)->urgency > 0) {
3438 			return AST_PREEMPT | AST_URGENT;
3439 		}
3440 		return AST_PREEMPT;
3441 	}
3442 	return AST_NONE;
3443 }
3444 
3445 static boolean_t
3446 sched_clutch_processor_queue_has_priority(
3447 	__unused processor_t    processor,
3448 	__unused int            priority,
3449 	__unused boolean_t      gte)
3450 {
3451 	/*
3452 	 * Never short-circuit the Clutch runqueue by returning FALSE here. Instead,
3453 	 * thread_select() should always go through sched_clutch_choose_thread().
3454 	 */
3455 	return TRUE;
3456 }
3457 
3458 static int
3459 sched_clutch_runq_count(processor_t processor)
3460 {
3461 	return (int)sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) + sched_clutch_bound_runq(processor)->count;
3462 }
3463 
3464 #if !SCHED_TEST_HARNESS
3465 
3466 static uint64_t
3467 sched_clutch_runq_stats_count_sum(processor_t processor)
3468 {
3469 	uint64_t bound_sum = sched_clutch_bound_runq(processor)->runq_stats.count_sum;
3470 
3471 	if (processor->cpu_id == processor->processor_set->cpu_set_low) {
3472 		return bound_sum + sched_clutch_root_count_sum(sched_clutch_processor_root_clutch(processor));
3473 	} else {
3474 		return bound_sum;
3475 	}
3476 }
3477 
3478 #endif /* !SCHED_TEST_HARNESS */
3479 
3480 static int
3481 sched_clutch_processor_bound_count(processor_t processor)
3482 {
3483 	return sched_clutch_bound_runq(processor)->count;
3484 }
3485 
3486 #if !SCHED_TEST_HARNESS
3487 
3488 static void
3489 sched_clutch_processor_queue_shutdown(processor_t processor)
3490 {
3491 	processor_set_t pset = processor->processor_set;
3492 	sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3493 	thread_t        thread;
3494 	queue_head_t    tqueue;
3495 
3496 	/* We only need to migrate threads if this is the last active processor in the pset */
3497 	if (pset->online_processor_count > 0) {
3498 		pset_unlock(pset);
3499 		return;
3500 	}
3501 
3502 	queue_init(&tqueue);
3503 	while (sched_clutch_root_count(pset_clutch_root) > 0) {
3504 		thread = sched_clutch_hierarchy_thread_highest(pset_clutch_root, processor, NULL, SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY);
3505 		enqueue_tail(&tqueue, &thread->runq_links);
3506 	}
3507 
3508 	pset_unlock(pset);
3509 
3510 	qe_foreach_element_safe(thread, &tqueue, runq_links) {
3511 		remqueue(&thread->runq_links);
3512 		thread_lock(thread);
3513 		thread_setrun(thread, SCHED_TAILQ);
3514 		thread_unlock(thread);
3515 	}
3516 }
3517 
3518 #endif /* !SCHED_TEST_HARNESS */
3519 
3520 static boolean_t
3521 sched_clutch_processor_queue_remove(
3522 	processor_t processor,
3523 	thread_t    thread)
3524 {
3525 	processor_set_t         pset = processor->processor_set;
3526 
3527 	pset_lock(pset);
3528 
3529 	if (processor == thread_get_runq_locked(thread)) {
3530 		/*
3531 		 * Thread is on a run queue and we have a lock on
3532 		 * that run queue.
3533 		 */
3534 		if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3535 			sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3536 			sched_clutch_thread_remove(pset_clutch_root, thread, mach_absolute_time(), SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
3537 		} else {
3538 			run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
3539 			run_queue_remove(rq, thread);
3540 		}
3541 	} else {
3542 		/*
3543 		 * The thread left the run queue before we could
3544 		 * lock the run queue.
3545 		 */
3546 		thread_assert_runq_null(thread);
3547 		processor = PROCESSOR_NULL;
3548 	}
3549 
3550 	pset_unlock(pset);
3551 
3552 	return processor != PROCESSOR_NULL;
3553 }
3554 
3555 static thread_t
3556 sched_clutch_steal_thread(__unused processor_set_t pset)
3557 {
3558 	/* Thread stealing is not enabled for single cluster clutch scheduler platforms */
3559 	return THREAD_NULL;
3560 }
3561 
3562 #if !SCHED_TEST_HARNESS
3563 
3564 static void
3565 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context)
3566 {
3567 	boolean_t               restart_needed = FALSE;
3568 	processor_t             processor = processor_list;
3569 	processor_set_t         pset;
3570 	thread_t                thread;
3571 	spl_t                   s;
3572 
3573 	/*
3574 	 *  We update the threads associated with each processor (bound and idle threads)
3575 	 *  and then update the threads in each pset runqueue.
3576 	 */
3577 
3578 	do {
3579 		do {
3580 			pset = processor->processor_set;
3581 
3582 			s = splsched();
3583 			pset_lock(pset);
3584 
3585 			restart_needed = runq_scan(sched_clutch_bound_runq(processor), scan_context);
3586 
3587 			pset_unlock(pset);
3588 			splx(s);
3589 
3590 			if (restart_needed) {
3591 				break;
3592 			}
3593 
3594 			thread = processor->idle_thread;
3595 			if (thread != THREAD_NULL && thread->sched_stamp != os_atomic_load(&sched_tick, relaxed)) {
3596 				if (thread_update_add_thread(thread) == FALSE) {
3597 					restart_needed = TRUE;
3598 					break;
3599 				}
3600 			}
3601 		} while ((processor = processor->processor_list) != NULL);
3602 
3603 		/* Ok, we now have a collection of candidates -- fix them. */
3604 		thread_update_process_threads();
3605 	} while (restart_needed);
3606 
3607 	pset_node_t node = &pset_node0;
3608 	pset = node->psets;
3609 
3610 	do {
3611 		do {
3612 			restart_needed = FALSE;
3613 			while (pset != NULL) {
3614 				s = splsched();
3615 				pset_lock(pset);
3616 
3617 				if (sched_clutch_root_count(&pset->pset_clutch_root) > 0) {
3618 					for (sched_bucket_t bucket = TH_BUCKET_SHARE_FG; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
3619 						restart_needed = runq_scan(&pset->pset_clutch_root.scr_bound_buckets[bucket].scrb_bound_thread_runq, scan_context);
3620 						if (restart_needed) {
3621 							break;
3622 						}
3623 					}
3624 					queue_t clutch_bucket_list = &pset->pset_clutch_root.scr_clutch_buckets;
3625 					sched_clutch_bucket_t clutch_bucket;
3626 					qe_foreach_element(clutch_bucket, clutch_bucket_list, scb_listlink) {
3627 						sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, scan_context->sched_tick_last_abstime);
3628 						restart_needed = sched_clutch_timeshare_scan(&clutch_bucket->scb_thread_timeshare_queue, clutch_bucket->scb_thr_count, scan_context);
3629 						if (restart_needed) {
3630 							break;
3631 						}
3632 					}
3633 				}
3634 
3635 				pset_unlock(pset);
3636 				splx(s);
3637 
3638 				if (restart_needed) {
3639 					break;
3640 				}
3641 				pset = pset->pset_list;
3642 			}
3643 
3644 			if (restart_needed) {
3645 				break;
3646 			}
3647 		} while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
3648 
3649 		/* Ok, we now have a collection of candidates -- fix them. */
3650 		thread_update_process_threads();
3651 	} while (restart_needed);
3652 }
3653 
3654 /*
3655  * For threads that have changed sched_pri without changing the
3656  * base_pri for any reason other than decay, use the sched_pri
3657  * as the bucketizing priority instead of base_pri. All such
3658  * changes are typically due to boosts or demotions by kernel
3659  * locking primitives.
3660  */
3661 static boolean_t
3662 sched_thread_sched_pri_promoted(thread_t thread)
3663 {
3664 	return (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) ||
3665 	       (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) ||
3666 	       (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) ||
3667 	       (thread->kern_promotion_schedpri != 0);
3668 }
3669 
3670 #endif /* !SCHED_TEST_HARNESS */
3671 
3672 /*
3673  * For the clutch scheduler, the run counts are maintained in the clutch
3674  * buckets (i.e thread group scheduling structure).
3675  */
3676 static uint32_t
3677 sched_clutch_run_incr(thread_t thread)
3678 {
3679 	assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
3680 	uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3681 	sched_clutch_thread_run_bucket_incr(thread, thread->th_sched_bucket);
3682 	return new_count;
3683 }
3684 
3685 static uint32_t
3686 sched_clutch_run_decr(thread_t thread)
3687 {
3688 	assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);
3689 	uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3690 	sched_clutch_thread_run_bucket_decr(thread, thread->th_sched_bucket);
3691 	return new_count;
3692 }
3693 
3694 /*
3695  * Routine to update the scheduling bucket for the thread.
3696  *
3697  * In the clutch scheduler implementation, the thread's bucket
3698  * is based on sched_pri if it was promoted due to a kernel
3699  * primitive; otherwise it is based on the thread's base_pri. This
3700  * enhancement allows promoted threads to reach a higher priority
3701  * bucket and potentially get selected sooner for scheduling.
3702  *
3703  * Also, the clutch scheduler does not honor fixed priority below
3704  * FG priority. It simply puts those threads in the corresponding
3705  * timeshare bucket. The reason for doing that is that it is
3706  * extremely hard to define the scheduling properties of such threads
3707  * and they typically lead to performance issues.
3708  *
3709  * Called with the thread lock held and the thread held off the runqueue.
3710  */
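/*
 * For example, a timeshare thread whose sched_pri has been boosted above its
 * base_pri by a kernel mutex promotion is bucketed according to the boosted
 * sched_pri, so it can land in a higher root bucket until the promotion is
 * dropped and the bucket is recomputed from base_pri again.
 */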
3711 
3712 void
3713 sched_clutch_update_thread_bucket(thread_t thread)
3714 {
3715 	sched_bucket_t old_bucket = thread->th_sched_bucket;
3716 	thread_assert_runq_null(thread);
3717 	int pri = (sched_thread_sched_pri_promoted(thread)) ? thread->sched_pri : thread->base_pri;
3718 	sched_bucket_t new_bucket = sched_clutch_thread_bucket_map(thread, pri);
3719 
3720 	if (old_bucket == new_bucket) {
3721 		return;
3722 	}
3723 
3724 	/* Bypass accounting CPU usage for a newly created thread */
3725 	if (old_bucket != TH_BUCKET_RUN) {
3726 		/* Attribute CPU usage with the old scheduling bucket */
3727 		sched_clutch_thread_tick_delta(thread, NULL);
3728 	}
3729 
3730 	/* Transition to the new sched_bucket */
3731 	thread->th_sched_bucket = new_bucket;
3732 	thread->pri_shift = sched_clutch_thread_pri_shift(thread, new_bucket);
3733 
3734 	/*
3735 	 * Since this is called after the thread has been removed from the runq,
3736 	 * only the run counts need to be updated. The re-insert into the runq
3737 	 * would put the thread into the correct new bucket's runq.
3738 	 */
3739 	if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
3740 		sched_clutch_thread_run_bucket_decr(thread, old_bucket);
3741 		sched_clutch_thread_run_bucket_incr(thread, new_bucket);
3742 	}
3743 }
3744 
3745 #if !SCHED_TEST_HARNESS
3746 
3747 static void
3748 sched_clutch_thread_group_recommendation_change(__unused struct thread_group *tg, __unused cluster_type_t new_recommendation)
3749 {
3750 	/* Clutch ignores the recommendation because Clutch does not migrate
3751 	 * threads between cluster types independently from the Edge scheduler.
3752 	 */
3753 }
3754 
3755 #endif /* !SCHED_TEST_HARNESS */
3756 
3757 #if CONFIG_SCHED_EDGE
3758 
3759 /* Implementation of the AMP version of the clutch scheduler */
3760 
3761 static void
3762 sched_edge_init(void);
3763 
3764 static void
3765 sched_edge_pset_init(processor_set_t pset);
3766 
3767 static thread_t
3768 sched_edge_processor_idle(processor_set_t pset);
3769 
3770 static boolean_t
3771 sched_edge_processor_queue_empty(processor_t processor);
3772 
3773 static void
3774 sched_edge_processor_queue_shutdown(processor_t processor);
3775 
3776 static processor_t
3777 sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread, sched_options_t *options_inout);
3778 
3779 static void
3780 sched_edge_quantum_expire(thread_t thread);
3781 
3782 static bool
3783 sched_edge_thread_avoid_processor(processor_t processor, thread_t thread, ast_t reason);
3784 
3785 static bool
3786 sched_edge_balance(processor_t cprocessor, processor_set_t cpset);
3787 
3788 static void
3789 sched_edge_check_spill(processor_set_t pset, thread_t thread);
3790 
3791 static bool
3792 sched_edge_thread_should_yield(processor_t processor, thread_t thread);
3793 
3794 static void
3795 sched_edge_pset_made_schedulable(processor_t processor, processor_set_t dst_pset, boolean_t drop_lock);
3796 
3797 static void
3798 sched_edge_cpu_init_completed(void);
3799 
3800 static bool
3801 sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset);
3802 
3803 static bool
3804 sched_edge_steal_thread_enabled(processor_set_t pset);
3805 
3806 static sched_ipi_type_t
3807 sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event);
3808 
3809 static uint32_t
3810 sched_edge_qos_max_parallelism(int qos, uint64_t options);
3811 
3812 static uint32_t
3813 sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket);
3814 
3815 static uint32_t
3816 sched_edge_run_count_incr(thread_t thread);
3817 
3818 static bool
3819 sched_edge_stir_the_pot_core_type_is_desired(processor_set_t pset);
3820 
3821 const struct sched_dispatch_table sched_edge_dispatch = {
3822 	.sched_name                                     = "edge",
3823 	.init                                           = sched_edge_init,
3824 	.timebase_init                                  = sched_timeshare_timebase_init,
3825 	.processor_init                                 = sched_clutch_processor_init,
3826 	.pset_init                                      = sched_edge_pset_init,
3827 	.choose_thread                                  = sched_clutch_choose_thread,
3828 	.steal_thread_enabled                           = sched_edge_steal_thread_enabled,
3829 	.steal_thread                                   = sched_edge_processor_idle,
3830 	.choose_processor                               = sched_edge_choose_processor,
3831 	.processor_enqueue                              = sched_clutch_processor_enqueue,
3832 	.processor_queue_remove                         = sched_clutch_processor_queue_remove,
3833 	.processor_queue_empty                          = sched_edge_processor_queue_empty,
3834 	.priority_is_urgent                             = priority_is_urgent,
3835 	.processor_csw_check                            = sched_clutch_processor_csw_check,
3836 	.processor_queue_has_priority                   = sched_clutch_processor_queue_has_priority,
3837 	.initial_quantum_size                           = sched_clutch_initial_quantum_size,
3838 	.initial_thread_sched_mode                      = sched_clutch_initial_thread_sched_mode,
3839 	.processor_runq_count                           = sched_clutch_runq_count,
3840 	.processor_bound_count                          = sched_clutch_processor_bound_count,
3841 	.multiple_psets_enabled                         = TRUE,
3842 	.avoid_processor_enabled                        = TRUE,
3843 	.thread_avoid_processor                         = sched_edge_thread_avoid_processor,
3844 	.processor_balance                              = sched_edge_balance,
3845 	.qos_max_parallelism                            = sched_edge_qos_max_parallelism,
3846 	.check_spill                                    = sched_edge_check_spill,
3847 	.ipi_policy                                     = sched_edge_ipi_policy,
3848 	.thread_should_yield                            = sched_edge_thread_should_yield,
3849 	.update_thread_bucket                           = sched_clutch_update_thread_bucket,
3850 	.cpu_init_completed                             = sched_edge_cpu_init_completed,
3851 	.thread_eligible_for_pset                       = sched_edge_thread_eligible_for_pset,
3852 
3853 	.rt_choose_processor                            = sched_rt_choose_processor,
3854 	.rt_steal_thread                                = sched_rt_steal_thread,
3855 	.rt_init_pset                                   = sched_rt_init_pset,
3856 	.rt_init_completed                              = sched_rt_init_completed,
3857 	.rt_runq_count_sum                              = sched_rt_runq_count_sum,
3858 
3859 #if !SCHED_TEST_HARNESS
3860 	.maintenance_continuation                       = sched_timeshare_maintenance_continue,
3861 	.compute_timeshare_priority                     = sched_compute_timeshare_priority,
3862 	.choose_node                                    = sched_choose_node,
3863 	.processor_queue_shutdown                       = sched_edge_processor_queue_shutdown,
3864 	.can_update_priority                            = can_update_priority,
3865 	.update_priority                                = update_priority,
3866 	.lightweight_update_priority                    = lightweight_update_priority,
3867 	.quantum_expire                                 = sched_edge_quantum_expire,
3868 	.processor_runq_stats_count_sum                 = sched_clutch_runq_stats_count_sum,
3869 	.thread_update_scan                             = sched_clutch_thread_update_scan,
3870 	.run_count_incr                                 = sched_edge_run_count_incr,
3871 	.run_count_decr                                 = sched_clutch_run_decr,
3872 	.pset_made_schedulable                          = sched_edge_pset_made_schedulable,
3873 	.thread_group_recommendation_change             = NULL,
3874 
3875 	.rt_queue_shutdown                              = sched_rt_queue_shutdown,
3876 	.rt_runq_scan                                   = sched_rt_runq_scan,
3877 #endif /* !SCHED_TEST_HARNESS */
3878 };
3879 
3880 static bitmap_t sched_edge_available_pset_bitmask[BITMAP_LEN(MAX_PSETS)];
3881 
3882 /*
3883  * sched_edge_thread_bound_cluster_id()
3884  *
3885  * Routine to determine which cluster a particular thread is bound to. Uses
3886  * the th_bound_cluster_id field on the thread to map back to a specific cluster id.
3887  *
3888  * <Edge Multi-cluster Support Needed>
3889  */
3890 static uint32_t
3891 sched_edge_thread_bound_cluster_id(thread_t thread)
3892 {
3893 	assert(SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread));
3894 	return thread->th_bound_cluster_id;
3895 }
3896 
3897 /* Forward declaration for some thread migration routines */
3898 static boolean_t sched_edge_foreign_runnable_thread_available(processor_set_t pset);
3899 static boolean_t sched_edge_foreign_running_thread_available(processor_set_t pset);
3900 static processor_set_t sched_edge_steal_candidate(processor_set_t pset);
3901 static processor_set_t sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks, processor_t *processor_hint_out, sched_options_t *options_inout);
3902 
3903 static_assert(sizeof(sched_clutch_edge) == sizeof(uint64_t), "sched_clutch_edge fits in 64 bits");
3904 
3905 /*
3906  * sched_edge_config_set()
3907  *
3908  * Support to update an edge configuration. Typically used by CLPC to affect thread migration
3909  * policies in the scheduler.
3910  */
3911 static void
3912 sched_edge_config_set(uint32_t src_cluster, uint32_t dst_cluster, sched_bucket_t bucket, sched_clutch_edge edge_config)
3913 {
3914 	os_atomic_store(&pset_array[src_cluster]->sched_edges[dst_cluster][bucket], edge_config, relaxed);
3915 }
3916 
3917 /*
3918  * sched_edge_config_get()
3919  *
3920  * Support to get an edge configuration. Typically used by CLPC to query edge configs to decide
3921  * if it needs to update edges.
3922  */
3923 static sched_clutch_edge
3924 sched_edge_config_get(uint32_t src_cluster, uint32_t dst_cluster, sched_bucket_t bucket)
3925 {
3926 	return os_atomic_load(&pset_array[src_cluster]->sched_edges[dst_cluster][bucket], relaxed);
3927 }
3928 
3929 /*
3930  * sched_edge_config_pset_push()
3931  *
3932  * After using sched_edge_config_set() to update edge tunables outgoing from a particular source
3933  * pset, this function should be called in order to propagate the updates to derived metadata for
3934  * the pset, such as search orders for outgoing spill and steal.
3935  */
3936 static void
3937 sched_edge_config_pset_push(uint32_t src_pset_id)
3938 {
3939 	processor_set_t src_pset = pset_array[src_pset_id];
3940 	uint8_t search_order_len = sched_edge_max_clusters - 1;
3941 	sched_pset_search_order_sort_data_t search_order_datas[MAX_PSETS - 1];
3942 	for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
3943 		uint8_t dst_pset_id = 0;
3944 		for (int i = 0; i < search_order_len; i++, dst_pset_id++) {
3945 			if (dst_pset_id == src_pset->pset_id) {
3946 				dst_pset_id++;
3947 			}
3948 			search_order_datas[i].spsosd_src_pset = src_pset;
3949 			search_order_datas[i].spsosd_dst_pset_id = dst_pset_id;
3950 			sched_clutch_edge edge = sched_edge_config_get(src_pset->pset_id, dst_pset_id, bucket);
3951 			search_order_datas[i].spsosd_migration_weight = edge.sce_migration_allowed ?
3952 			    edge.sce_migration_weight : UINT32_MAX;
3953 		}
3954 		sched_pset_search_order_compute(&src_pset->spill_search_order[bucket],
3955 		    search_order_datas, search_order_len, sched_edge_search_order_weight_then_locality_cmp);
3956 	}
3957 }
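/*
 * Example of the ordering computed above: on a 4-cluster system, pushing the
 * config for src_pset_id 1 considers dst psets {0, 2, 3} (the source pset is
 * skipped), sorts them by ascending migration weight (edges with migration
 * disallowed are weighted UINT32_MAX), breaks ties in favor of die-local
 * psets and then lower pset ids, and stores the result as that pset's
 * per-bucket spill/steal search order.
 */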
3958 
3959 static int
3960 sched_edge_search_order_weight_then_locality(const void *a, const void *b)
3961 {
3962 	const sched_pset_search_order_sort_data_t *data_a = (const sched_pset_search_order_sort_data_t *)a;
3963 	const sched_pset_search_order_sort_data_t *data_b = (const sched_pset_search_order_sort_data_t *)b;
3964 	assert3p(data_a->spsosd_src_pset, ==, data_b->spsosd_src_pset);
3965 	assert3u(data_a->spsosd_dst_pset_id, !=, data_b->spsosd_dst_pset_id);
3966 	/*
3967 	 * Sort based on lowest edge migration weight, followed by die-local psets
3968 	 * first, followed by lowest pset id.
3969 	 */
3970 	if (data_a->spsosd_migration_weight != data_b->spsosd_migration_weight) {
3971 		return (data_a->spsosd_migration_weight < data_b->spsosd_migration_weight) ? -1 : 1;
3972 	}
3973 
3974 	bool is_local_a = bitmap_test(data_a->spsosd_src_pset->local_psets, data_a->spsosd_dst_pset_id);
3975 	bool is_local_b = bitmap_test(data_b->spsosd_src_pset->local_psets, data_b->spsosd_dst_pset_id);
3976 	if (is_local_a != is_local_b) {
3977 		return is_local_a ? -1 : 1;
3978 	}
3979 
3980 	if (data_a->spsosd_dst_pset_id != data_b->spsosd_dst_pset_id) {
3981 		return (data_a->spsosd_dst_pset_id < data_b->spsosd_dst_pset_id) ? -1 : 1;
3982 	}
3983 	return 0;
3984 }
3985 
3986 cmpfunc_t sched_edge_search_order_weight_then_locality_cmp = &sched_edge_search_order_weight_then_locality;
3987 
3988 /*
3989  * sched_edge_matrix_set()
3990  *
3991  * Routine to update various edges in the edge migration graph. The edge_changed array
3992  * indicates which edges need to be updated. Both the edge_matrix and edge_changed arrays
3993  * are matrices with dimension num_psets * num_psets * TH_BUCKET_SCHED_MAX, flattened into a
3994  * single-dimensional array.
3995  */
3996 void
3997 sched_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changed, __unused uint64_t flags,
3998     __assert_only uint64_t num_psets)
3999 {
4000 	assert3u(num_psets, ==, sched_edge_max_clusters);
4001 	uint32_t edge_index = 0;
4002 	for (uint32_t src_cluster = 0; src_cluster < sched_edge_max_clusters; src_cluster++) {
4003 		for (uint32_t dst_cluster = 0; dst_cluster < sched_edge_max_clusters; dst_cluster++) {
4004 			for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
4005 				if (edge_changed[edge_index]) {
4006 					sched_edge_config_set(src_cluster, dst_cluster, bucket, edge_matrix[edge_index]);
4007 				}
4008 				edge_index++;
4009 			}
4010 		}
4011 		sched_edge_config_pset_push(src_cluster);
4012 	}
4013 }
4014 
4015 /*
4016  * sched_edge_matrix_get()
4017  *
4018  * Routine to retrieve various edges in the edge migration graph. The edge_requested array
4019  * indicates which edges need to be retrieved. Both the edge_matrix and edge_requested arrays
4020  * are matrices with dimension num_psets * num_psets * TH_BUCKET_SCHED_MAX, flattened into a
4021  * single-dimensional array.
4022  */
4023 void
4024 sched_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_requested, __unused uint64_t flags,
4025     __assert_only uint64_t num_psets)
4026 {
4027 	assert3u(num_psets, ==, sched_edge_max_clusters);
4028 	uint32_t edge_index = 0;
4029 	for (uint32_t src_pset = 0; src_pset < sched_edge_max_clusters; src_pset++) {
4030 		for (uint32_t dst_pset = 0; dst_pset < sched_edge_max_clusters; dst_pset++) {
4031 			for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
4032 				if (edge_requested[edge_index]) {
4033 					edge_matrix[edge_index] = sched_edge_config_get(src_pset, dst_pset, bucket);
4034 				}
4035 				edge_index++;
4036 			}
4037 		}
4038 	}
4039 }
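/*
 * As implied by the loop structure above, the flattened matrices are indexed
 * in (src, dst, bucket) order:
 *
 *   edge_index = ((src * num_psets) + dst) * TH_BUCKET_SCHED_MAX + bucket
 *
 * e.g. with num_psets == 3, the edge from cluster 1 to cluster 2 for bucket 0
 * lives at index (1 * 3 + 2) * TH_BUCKET_SCHED_MAX.
 */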
4040 
4041 
4042 /*
4043  * sched_edge_init()
4044  *
4045  * Routine to initialize the data structures for the Edge scheduler.
4046  */
4047 static void
4048 sched_edge_init(void)
4049 {
4050 	if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
4051 		sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
4052 	}
4053 	sched_timeshare_init();
4054 	sched_clutch_tunables_init();
4055 	sched_edge_max_clusters = ml_get_cluster_count();
4056 }
4057 
4058 static void
4059 sched_edge_pset_init(processor_set_t pset)
4060 {
4061 	uint32_t pset_cluster_id = pset->pset_cluster_id;
4062 	pset->pset_type = pset_cluster_type_to_cluster_type(pset->pset_cluster_type);
4063 	/* Each pset must declare an AMP type */
4064 	assert(pset->pset_type != CLUSTER_TYPE_SMP);
4065 
4066 	/* Set the edge weight and properties for the pset itself */
4067 	bitmap_clear(pset->foreign_psets, pset_cluster_id);
4068 	bitmap_clear(pset->native_psets, pset_cluster_id);
4069 	bitmap_clear(pset->local_psets, pset_cluster_id);
4070 	bitmap_clear(pset->remote_psets, pset_cluster_id);
4071 	bzero(&pset->sched_edges, sizeof(pset->sched_edges));
4072 	bzero(&pset->max_parallel_cores, sizeof(pset->max_parallel_cores));
4073 	bzero(&pset->max_parallel_clusters, sizeof(pset->max_parallel_clusters));
4074 	for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
4075 		sched_pset_search_order_init(pset, &pset->spill_search_order[bucket]);
4076 	}
4077 	sched_clutch_root_init(&pset->pset_clutch_root, pset);
4078 	bitmap_set(sched_edge_available_pset_bitmask, pset_cluster_id);
4079 }
4080 
4081 static boolean_t
4082 sched_edge_processor_queue_empty(processor_t processor)
4083 {
4084 	return (sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0) &&
4085 	       (sched_clutch_bound_runq(processor)->count == 0);
4086 }
4087 
4088 static void
4089 sched_edge_check_spill(__unused processor_set_t pset, __unused thread_t thread)
4090 {
4091 	assert(thread->bound_processor == PROCESSOR_NULL);
4092 }
4093 
4094 __options_decl(sched_edge_thread_yield_reason_t, uint32_t, {
4095 	SCHED_EDGE_YIELD_RUNQ_NONEMPTY       = 0x0,
4096 	SCHED_EDGE_YIELD_FOREIGN_RUNNABLE    = 0x1,
4097 	SCHED_EDGE_YIELD_FOREIGN_RUNNING     = 0x2,
4098 	SCHED_EDGE_YIELD_STEAL_POSSIBLE      = 0x3,
4099 	SCHED_EDGE_YIELD_DISALLOW            = 0x4,
4100 });
4101 
4102 static bool
4103 sched_edge_thread_should_yield(processor_t processor, __unused thread_t thread)
4104 {
4105 	if (!sched_edge_processor_queue_empty(processor) || (rt_runq_count(processor->processor_set) > 0)) {
4106 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
4107 		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_RUNQ_NONEMPTY);
4108 		return true;
4109 	}
4110 
4111 	/*
4112 	 * The yield logic should follow the same logic that steal_thread() does.
4113 	 * thread_should_yield() is effectively a quick check of whether, if the
4114 	 * current thread gave up the CPU, there is any other thread that would
4115 	 * execute on this CPU. So it needs to provide the same answer as the
4116 	 * steal_thread()/processor idle logic.
4117 	 */
4118 	if (sched_edge_foreign_runnable_thread_available(processor->processor_set)) {
4119 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
4120 		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNABLE);
4121 		return true;
4122 	}
4123 	if (sched_edge_foreign_running_thread_available(processor->processor_set)) {
4124 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
4125 		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNING);
4126 		return true;
4127 	}
4128 
4129 	processor_set_t steal_candidate = sched_edge_steal_candidate(processor->processor_set);
4130 	if (steal_candidate != NULL) {
4131 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
4132 		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_STEAL_POSSIBLE);
4133 		return true;
4134 	}
4135 
4136 	KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE, thread_tid(thread), processor->processor_set->pset_cluster_id,
4137 	    0, SCHED_EDGE_YIELD_DISALLOW);
4138 	return false;
4139 }
4140 
4141 #if !SCHED_TEST_HARNESS
4142 
4143 static void
4144 sched_edge_processor_queue_shutdown(processor_t processor)
4145 {
4146 	processor_set_t pset = processor->processor_set;
4147 	sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
4148 	thread_t        thread;
4149 	queue_head_t    tqueue;
4150 
4151 	/* We only need to migrate threads if this is the last active or last recommended processor in the pset */
4152 	if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
4153 		pset_unlock(pset);
4154 		return;
4155 	}
4156 
4157 	bitmap_clear(sched_edge_available_pset_bitmask, pset->pset_cluster_id);
4158 
4159 	queue_init(&tqueue);
4160 	while (sched_clutch_root_count(pset_clutch_root) > 0) {
4161 		thread = sched_clutch_hierarchy_thread_highest(pset_clutch_root, processor, NULL, SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY);
4162 		enqueue_tail(&tqueue, &thread->runq_links);
4163 	}
4164 	pset_unlock(pset);
4165 
4166 	qe_foreach_element_safe(thread, &tqueue, runq_links) {
4167 		remqueue(&thread->runq_links);
4168 		thread_lock(thread);
4169 		thread_setrun(thread, SCHED_TAILQ);
4170 		thread_unlock(thread);
4171 	}
4172 }
4173 
4174 #endif /* !SCHED_TEST_HARNESS */
4175 
4176 /*
4177  * sched_edge_cluster_load_metric()
4178  *
4179  * The load metric for a cluster is a measure of the average scheduling latency
4180  * experienced by threads on that cluster. It is a product of the average number
4181  * of threads in the runqueue and the average execution time for threads. The metric
4182  * has special values in the following cases:
4183  * - UINT32_MAX: If the cluster is not available for scheduling, its load is set to
4184  *   the maximum value to disallow any threads to migrate to this cluster.
4185  * - 0: If there are idle CPUs in the cluster or an empty runqueue; this allows threads
4186  *   to be spread across the platform quickly for ncpu wide workloads.
4187  */
4188 static uint32_t
4189 sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket)
4190 {
4191 	if (pset_is_recommended(pset) == false) {
4192 		return UINT32_MAX;
4193 	}
4194 	return (uint32_t)sched_get_pset_load_average(pset, sched_bucket);
4195 }
4196 
4197 /*
4198  *
4199  * Edge Scheduler Steal/Rebalance logic
4200  *
4201  * = Generic scheduler logic =
4202  *
4203  * The SCHED(steal_thread) scheduler callout is invoked when the processor does not
4204  * find any thread for execution in its runqueue. The aim of the steal operation
4205  * is to find other threads running/runnable in other clusters which should be
4206  * executed here.
4207  *
4208  * If the steal callout does not return a thread, the thread_select() logic calls
4209  * SCHED(processor_balance) callout which is supposed to IPI other CPUs to rebalance
4210  * threads and idle out the current CPU.
4211  *
4212  * = SCHED(steal_thread) for Edge Scheduler =
4213  *
4214  * The edge scheduler hooks into sched_edge_processor_idle() for steal_thread. This
4215  * routine tries to do the following operations in order:
4216  * (1) Find foreign runnable threads in non-native cluster
4217  *     runqueues (sched_edge_foreign_runnable_thread_remove())
4218  * (2) Check if foreign threads are running on the non-native
4219  *     clusters (sched_edge_foreign_running_thread_available())
4220  *         - If yes, return THREAD_NULL for the steal callout and
4221  *         perform rebalancing as part of SCHED(processor_balance) i.e. sched_edge_balance()
4222  * (3) Steal a thread from another cluster based on edge
4223  *     weights (sched_edge_steal_thread())
4224  *
4225  * = SCHED(processor_balance) for Edge Scheduler =
4226  *
4227  * If steal_thread did not return a thread for the processor, use
4228  * sched_edge_balance() to rebalance foreign running threads and idle out this CPU.
4229  *
4230  * = Clutch Bucket Preferred Cluster Overrides =
4231  *
4232  * Since these operations (just like thread migrations on enqueue)
4233  * move threads across clusters, they need support for handling clutch
4234  * bucket group level preferred cluster recommendations.
4235  * For (1), a clutch bucket will be in the foreign runnable queue based
4236  * on the clutch bucket group preferred cluster.
4237  * For (2), the running thread will set the bit on the processor based
4238  * on its preferred cluster type.
4239  * For (3), the edge configuration would prevent threads from being stolen
4240  * in the wrong direction.
4241  *
4242  * = SCHED(thread_should_yield) =
4243  * The thread_should_yield() logic needs to have the same logic as sched_edge_processor_idle()
4244  * since that is expecting the same answer as if thread_select() was called on a core
4245  * with an empty runqueue.
4246  */
4247 
4248 static bool
4249 sched_edge_steal_thread_enabled(__unused processor_set_t pset)
4250 {
4251 	/*
4252 	 * For edge scheduler, the gating for steal is being done by sched_edge_steal_candidate()
4253 	 */
4254 	return true;
4255 }
4256 
4257 static processor_set_t
4258 sched_edge_steal_candidate(processor_set_t pset)
4259 {
4260 	uint32_t dst_cluster_id = pset->pset_cluster_id;
4261 	for (int cluster_id = 0; cluster_id < sched_edge_max_clusters; cluster_id++) {
4262 		processor_set_t candidate_pset = pset_array[cluster_id];
4263 		if (cluster_id == dst_cluster_id) {
4264 			continue;
4265 		}
4266 		if (candidate_pset == NULL) {
4267 			continue;
4268 		}
4269 		int highest_bucket = bitmap_lsb_first(candidate_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
4270 		if (highest_bucket != -1) {
4271 			/* Assumes that higher root buckets have less restrictive sce_steal_allowed edges */
4272 			sched_clutch_edge edge = sched_edge_config_get(cluster_id, dst_cluster_id, highest_bucket);
4273 			if (edge.sce_steal_allowed) {
4274 				return candidate_pset;
4275 			}
4276 		}
4277 	}
4278 	return NULL;
4279 }
4280 
4281 static boolean_t
4282 sched_edge_foreign_runnable_thread_available(processor_set_t pset)
4283 {
4284 	/* Find all the clusters that are foreign for this cluster */
4285 	bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
4286 	for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
4287 		/*
4288 		 * For each cluster, see if there are any runnable foreign threads.
4289 		 * This check is currently being done without the pset lock to make it cheap for
4290 		 * the common case.
4291 		 */
4292 		processor_set_t target_pset = pset_array[cluster];
4293 		if (pset_is_recommended(target_pset) == false) {
4294 			continue;
4295 		}
4296 
4297 		if (!sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
4298 			return true;
4299 		}
4300 	}
4301 	return false;
4302 }
4303 
4304 static thread_t
4305 sched_edge_foreign_runnable_thread_remove(processor_set_t pset, uint64_t ctime)
4306 {
4307 	thread_t thread = THREAD_NULL;
4308 
4309 	/* Find all the clusters that are foreign for this cluster */
4310 	bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
4311 	for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
4312 		/*
4313 		 * For each cluster, see if there are any runnable foreign threads.
4314 		 * This check is currently being done without the pset lock to make it cheap for
4315 		 * the common case.
4316 		 */
4317 		processor_set_t target_pset = pset_array[cluster];
4318 		if (pset_is_recommended(target_pset) == false) {
4319 			continue;
4320 		}
4321 
4322 		if (sched_clutch_root_foreign_empty(&target_pset->pset_clutch_root)) {
4323 			continue;
4324 		}
4325 		/*
4326 		 * Looks like there are runnable foreign threads in the hierarchy; lock the pset
4327 		 * and get the highest priority thread.
4328 		 */
4329 		pset_lock(target_pset);
4330 		if (pset_is_recommended(target_pset)) {
4331 			thread = sched_clutch_root_highest_foreign_thread_remove(&target_pset->pset_clutch_root);
4332 			sched_update_pset_load_average(target_pset, ctime);
4333 		}
4334 		pset_unlock(target_pset);
4335 
4336 		/*
4337 		 * Edge Scheduler Optimization
4338 		 *
4339 		 * The current implementation immediately returns as soon as it finds a foreign
4340 		 * runnable thread. This could be enhanced to look at highest priority threads
4341 		 * from all foreign clusters and pick the highest amongst them. That would need
4342 		 * some form of global state across psets to make that kind of a check cheap.
4343 		 */
4344 		if (thread != THREAD_NULL) {
4345 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNABLE) | DBG_FUNC_NONE, thread_tid(thread), pset->pset_cluster_id, target_pset->pset_cluster_id, 0);
4346 			break;
4347 		}
4348 		/* Looks like the thread escaped after the check but before the pset lock was taken; continue the search */
4349 	}
4350 
4351 	return thread;
4352 }
4353 
4354 /*
4355  * sched_edge_cpu_running_foreign_shared_rsrc_available()
4356  *
4357  * Routine to determine if the thread running on a CPU is a shared resource thread
4358  * and can be rebalanced to the cluster with an idle CPU. It is used to determine if
4359  * a CPU going idle on a pset should rebalance a running shared resource heavy thread
4360  * from another non-ideal cluster based on the former's shared resource load.
4361  */
4362 static boolean_t
4363 sched_edge_cpu_running_foreign_shared_rsrc_available(processor_set_t target_pset, int foreign_cpu, processor_set_t idle_pset)
4364 {
4365 	boolean_t idle_pset_shared_rsrc_rr_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_RR);
4366 	if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_RR], foreign_cpu) && !idle_pset_shared_rsrc_rr_idle) {
4367 		return false;
4368 	}
4369 
4370 	boolean_t idle_pset_shared_rsrc_biu_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST);
4371 	if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST], foreign_cpu) && !idle_pset_shared_rsrc_biu_idle) {
4372 		return false;
4373 	}
4374 	return true;
4375 }
4376 
4377 static boolean_t
4378 sched_edge_foreign_running_thread_available(processor_set_t pset)
4379 {
4380 	bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
4381 	sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
4382 	while (sched_iterate_psets_ordered(pset, &pset->spill_search_order[0], foreign_pset_bitmap[0], &istate)) {
4383 		/* Skip the pset if its not schedulable */
4384 		processor_set_t target_pset = pset_array[istate.spis_pset_id];
4385 		if (pset_is_recommended(target_pset) == false) {
4386 			continue;
4387 		}
4388 
4389 		uint64_t running_foreign_bitmap = target_pset->cpu_state_map[PROCESSOR_RUNNING] & target_pset->cpu_running_foreign;
4390 		for (int cpu_foreign = bit_first(running_foreign_bitmap); cpu_foreign >= 0; cpu_foreign = bit_next(running_foreign_bitmap, cpu_foreign)) {
4391 			if (!sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpu_foreign, pset)) {
4392 				continue;
4393 			}
4394 			return true;
4395 		}
4396 	}
4397 	return false;
4398 }
4399 
4400 /*
4401  * sched_edge_steal_possible()
4402  *
4403  * Determines whether we can and should steal a thread from
4404  * the candidate_pset to run it on the idle_pset. When returning
4405  * true, the function also writes the scheduling bucket that we
4406  * should steal from into the bucket_for_steal out parameter.
4407  *
4408  * Always called with the pset lock for candidate_pset held.
4409  */
4410 static bool
4411 sched_edge_steal_possible(processor_set_t idle_pset, processor_set_t candidate_pset, sched_bucket_t *bucket_for_steal)
4412 {
4413 	sched_clutch_root_t candidate_clutch_root = &candidate_pset->pset_clutch_root;
4414 
4415 	int highest_runnable_bucket = sched_clutch_root_highest_runnable_qos(candidate_clutch_root, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY);
4416 	if (highest_runnable_bucket == -1) {
4417 		/* Candidate cluster runq is empty of unbound threads */
4418 		return false;
4419 	}
4420 
4421 	for (int unbound_qos = highest_runnable_bucket; unbound_qos >= 0; unbound_qos = bitmap_lsb_next(candidate_clutch_root->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX, unbound_qos)) {
4422 		/* Confirm we are allowed to steal across the edge at this QoS */
4423 		sched_clutch_edge edge = sched_edge_config_get(candidate_pset->pset_cluster_id, idle_pset->pset_cluster_id, unbound_qos);
4424 		if (edge.sce_steal_allowed == false) {
4425 			continue;
4426 		}
4427 		if (edge.sce_migration_weight == 0) {
4428 			/* Allow free stealing across a zero edge weight, even with idle cores in the candidate pset */
4429 			*bucket_for_steal = (sched_bucket_t)unbound_qos;
4430 			return true;
4431 		}
4432 		uint32_t candidate_runq_depth = os_atomic_load(&candidate_pset->pset_runnable_depth[unbound_qos], relaxed);
4433 		if (candidate_runq_depth > pset_available_cpu_count(candidate_pset)) {
4434 			/* Candidate cluster has excess load at this QoS (and at least one unbound thread we can steal!) */
4435 			*bucket_for_steal = (sched_bucket_t)unbound_qos;
4436 			return true;
4437 		}
4438 	}
4439 	/* None of the unbound root buckets are available for steal */
4440 	return false;
4441 }
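/*
 * Example: with a migration weight of 0 on the candidate -> idle edge, an
 * unbound thread may be stolen even while the candidate pset still has idle
 * cores; with a non-zero weight, the steal only proceeds once the candidate's
 * runnable depth at that QoS exceeds its available CPU count.
 */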
4442 
4443 static thread_t
4444 sched_edge_steal_thread(processor_set_t pset, uint64_t candidate_pset_bitmap)
4445 {
4446 	thread_t stolen_thread = THREAD_NULL;
4447 
4448 	/*
4449 	 * Edge Scheduler Optimization
4450 	 *
4451 	 * The logic today bails as soon as it finds a cluster where the cluster load is
4452 	 * greater than the edge weight. Maybe it should have a more advanced version
4453 	 * which looks for the maximum delta etc.
4454 	 */
4455 	sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
4456 	while (sched_iterate_psets_ordered(pset, &pset->spill_search_order[0], candidate_pset_bitmap, &istate)) {
4457 		processor_set_t steal_from_pset = pset_array[istate.spis_pset_id];
4458 		if (steal_from_pset == NULL) {
4459 			continue;
4460 		}
4461 		bool steal_allowed = false;
4462 		for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
4463 			sched_clutch_edge edge = sched_edge_config_get(istate.spis_pset_id, pset->pset_cluster_id, bucket);
4464 			if (edge.sce_steal_allowed) {
4465 				steal_allowed = true;
4466 				break;
4467 			}
4468 		}
4469 		if (steal_allowed == false) {
4470 			continue;
4471 		}
4472 		pset_lock(steal_from_pset);
4473 		sched_bucket_t bucket_for_steal;
4474 		if (sched_edge_steal_possible(pset, steal_from_pset, &bucket_for_steal)) {
4475 			uint64_t current_timestamp = mach_absolute_time();
4476 			sched_clutch_root_t clutch_root_for_steal = &steal_from_pset->pset_clutch_root;
4477 			stolen_thread = sched_clutch_thread_unbound_lookup(clutch_root_for_steal, &clutch_root_for_steal->scr_unbound_buckets[bucket_for_steal], NULL, NULL);
4478 			sched_clutch_thread_remove(clutch_root_for_steal, stolen_thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
4479 
4480 			sched_clutch_dbg_thread_select_packed_t debug_info = {0};
4481 			debug_info.trace_data.version = SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION;
4482 			debug_info.trace_data.traverse_mode = SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY;
4483 			debug_info.trace_data.cluster_id = steal_from_pset->pset_cluster_id;
4484 			debug_info.trace_data.selection_was_cluster_bound = false;
4485 			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
4486 			    thread_tid(stolen_thread), thread_group_get_id(stolen_thread->thread_group), bucket_for_steal, debug_info.scdts_trace_data_packed, 0);
4487 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STEAL) | DBG_FUNC_NONE, thread_tid(stolen_thread), pset->pset_cluster_id, steal_from_pset->pset_cluster_id, 0);
4488 
4489 			sched_update_pset_load_average(steal_from_pset, current_timestamp);
4490 		}
4491 		pset_unlock(steal_from_pset);
4492 		if (stolen_thread != THREAD_NULL) {
4493 			break;
4494 		}
4495 	}
4496 	return stolen_thread;
4497 }
4498 
4499 /*
4500  * sched_edge_processor_idle()
4501  *
4502  * The routine is the implementation for steal_thread() for the Edge scheduler.
4503  */
4504 static thread_t
4505 sched_edge_processor_idle(processor_set_t pset)
4506 {
4507 	thread_t thread = THREAD_NULL;
4508 
4509 	uint64_t ctime = mach_absolute_time();
4510 
4511 	processor_t processor = current_processor();
4512 	bit_clear(pset->pending_spill_cpu_mask, processor->cpu_id);
4513 
4514 	/* Each of the operations acquire the lock for the pset they target */
4515 	pset_unlock(pset);
4516 
4517 	/* Find highest priority runnable thread on all non-native clusters */
4518 	thread = sched_edge_foreign_runnable_thread_remove(pset, ctime);
4519 	if (thread != THREAD_NULL) {
4520 		return thread;
4521 	}
4522 
4523 	/* Find highest priority runnable thread on all native clusters */
4524 	thread = sched_edge_steal_thread(pset, pset->native_psets[0]);
4525 	if (thread != THREAD_NULL) {
4526 		return thread;
4527 	}
4528 
4529 	/* Find foreign running threads to rebalance; the actual rebalance is done in sched_edge_balance() */
4530 	boolean_t rebalance_needed = sched_edge_foreign_running_thread_available(pset);
4531 	if (rebalance_needed) {
4532 		return THREAD_NULL;
4533 	}
4534 
4535 	/* No foreign threads found; find a thread to steal from all clusters based on weights/loads etc. */
4536 	thread = sched_edge_steal_thread(pset, pset->native_psets[0] | pset->foreign_psets[0]);
4537 	return thread;
4538 }
4539 
4540 /* Return true if this shared resource thread has a better cluster to run on */
4541 static bool
4542 sched_edge_shared_rsrc_migrate_possible(thread_t thread, processor_set_t preferred_pset, processor_set_t current_pset)
4543 {
4544 	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4545 	uint64_t current_pset_load = sched_pset_cluster_shared_rsrc_load(current_pset, shared_rsrc_type);
4546 	/*
4547 	 * Adjust the current pset load to discount the current thread only if the current pset is a preferred pset type. This allows the
4548 	 * scheduler to rebalance threads from non-preferred cluster to an idle cluster of the preferred type.
4549 	 *
4550 	 * Edge Scheduler Optimization
4551 	 * For multi-cluster machines, it might be useful to enhance this mechanism to migrate between clusters of the preferred type.
4552 	 */
4553 	uint64_t current_pset_adjusted_load = (current_pset->pset_type != preferred_pset->pset_type) ? current_pset_load : (current_pset_load - 1);
4554 
4555 	uint64_t eligible_pset_bitmask = 0;
4556 	if (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST) {
4557 		/*
4558 		 * For the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy, the load balancing occurs
4559 		 * only among clusters native with the preferred cluster.
4560 		 */
4561 		eligible_pset_bitmask = preferred_pset->native_psets[0];
4562 		bit_set(eligible_pset_bitmask, preferred_pset->pset_cluster_id);
4563 	} else {
4564 		/* For EDGE_SHARED_RSRC_SCHED_POLICY_RR, the load balancing happens among all clusters */
4565 		eligible_pset_bitmask = sched_edge_available_pset_bitmask[0];
4566 	}
4567 
4568 	/* For each eligible cluster check if there is an under-utilized cluster; return true if there is */
4569 	for (int cluster_id = bit_first(eligible_pset_bitmask); cluster_id >= 0; cluster_id = bit_next(eligible_pset_bitmask, cluster_id)) {
4570 		if (cluster_id == current_pset->pset_cluster_id) {
4571 			continue;
4572 		}
4573 		uint64_t cluster_load = sched_pset_cluster_shared_rsrc_load(pset_array[cluster_id], shared_rsrc_type);
4574 		if (current_pset_adjusted_load > cluster_load) {
4575 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHARED_RSRC_MIGRATE) | DBG_FUNC_NONE, current_pset_load, current_pset->pset_cluster_id, cluster_load, cluster_id);
4576 			return true;
4577 		}
4578 	}
4579 	return false;
4580 }
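/*
 * Example: a shared resource thread running on a non-preferred cluster with a
 * shared resource load of 3 can migrate if any eligible cluster currently has
 * a load below 3; if the thread is already on a cluster of the preferred type,
 * its own contribution is discounted first, so it only moves when another
 * eligible cluster is loaded strictly below the current cluster's remaining load.
 */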
4581 
4582 /*
4583  * Stir-the-pot Registry:
4584  *
4585  * Global state tracking which cores currently have threads that
4586  * are ready to be stirred onto cores of the opposite type.
4587  *
4588  * The registry state updates are implemented with atomic transaction
4589  * operations rather than a global lock, in order to avoid the cost
4590  * of serializing some of the most frequent registry state update
4591  * callsites that depend on consistent speed--namely the
4592  * preemption check and context-switch paths. The most expensive
4593  * state update, in sched_edge_stir_the_pot_try_trigger_swap(), only
4594  * happens at quantum expiration, which should allow cheaper
4595  * operations at other callsites to win the race.
4596  */
4597 typedef unsigned __int128 sched_edge_stp_registry_t;
4598 _Atomic sched_edge_stp_registry_t sched_edge_stir_the_pot_global_registry = 0LL;
4599 #define SESTP_BITS_PER_CORE (2)
4600 #define SESTP_BIT_POS(cpu_id) ((sched_edge_stp_registry_t)(cpu_id * SESTP_BITS_PER_CORE))
4601 #define SESTP_MASK(cpu_id) ((sched_edge_stp_registry_t)mask(SESTP_BITS_PER_CORE) << SESTP_BIT_POS(cpu_id))
4602 static_assert((SESTP_BITS_PER_CORE * MAX_CPUS) <= (sizeof(sched_edge_stp_registry_t) * 8),
4603     "Global registry must fit per-core bits for each core");
4604 
4605 #define SESTP_EXTRACT_STATE(registry, cpu_id) ((registry >> SESTP_BIT_POS(cpu_id)) & mask(SESTP_BITS_PER_CORE))
4606 #define SESTP_SET_STATE(registry, cpu_id, state) ((registry & ~SESTP_MASK(cpu_id)) | ((sched_edge_stp_registry_t)state << SESTP_BIT_POS(cpu_id)))
4607 __enum_decl(sched_edge_stp_state_t, uint8_t, {
4608 	SCHED_EDGE_STP_NOT_WANT   = 0,
4609 	SCHED_EDGE_STP_REQUESTED  = 1,
4610 	SCHED_EDGE_STP_PENDING    = 2,
4611 	SCHED_EDGE_STP_MAX        = SCHED_EDGE_STP_PENDING
4612 });
4613 static_assert(SCHED_EDGE_STP_MAX <= mask(SESTP_BITS_PER_CORE),
4614     "Per-core stir-the-pot request state must fit in per-core bits");
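/*
 * Example of the registry layout defined above: with SESTP_BITS_PER_CORE == 2,
 * the state for cpu_id 3 occupies bits [7:6] of the 128-bit registry.
 * SESTP_EXTRACT_STATE(registry, 3) shifts the registry right by 6 and masks
 * off two bits; SESTP_SET_STATE(registry, 3, state) clears those two bits and
 * ORs in the new state value.
 */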
4615 
4616 #if OS_ATOMIC_USE_LLSC
4617 #error "Expecting CAS implementation of os_atomic_rmw_loop()"
4618 #endif /* OS_ATOMIC_USE_LLSC */
4619 
4620 static cpumap_t sched_edge_p_core_map = 0ULL;
4621 static cpumap_t sched_edge_non_p_core_map = 0ULL;
4622 
4623 /*
4624  * In order to reduce the chance of picking the same CPUs over
4625  * and over unfairly for stir-the-pot swaps, use an offset value
4626  * for the lsb selection, which rotates by one index each time
4627  * the choice is evaluated.
4628  */
4629 static _Atomic uint64_t sched_edge_stp_selection_p_core_offset = 0;
4630 static _Atomic uint64_t sched_edge_stp_selection_non_p_core_offset = 0;
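/*
 * Example of the rotated candidate selection used below: if the opposite-type
 * core map contains CPUs {2, 4, 5} and the rotating offset modulo the
 * candidate count is 1, the search starts at CPU 4; the candidate map is then
 * rotated right by that bit position so the lsb-first scan visits CPUs 4, 5
 * and 2 in that order.
 */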
4631 
4632 /*
4633  * sched_edge_stir_the_pot_try_trigger_swap()
4634  *
4635  * Search for an eligible swap candidate on the opposite core
4636  * type, and if one is found, initiate a swap for stir-the-pot.
4637  * From a P-core, initiating means sending an inbox message and IPI
4638  * to the swapping lower performance core. For initiating swap from
4639  * a lower performance core, only an inbox message needs to be sent
4640  * to itself, naming the P-core for swap.
4641  * If no eligible candidate is found, mark the current processor
4642  * as requesting stir-the-pot swap--that is unless a swap has already
4643  * been initiated for this core, in which case we should sit tight.
4644  * Thread lock must be held.
4645  */
4646 static inline int
4647 sched_edge_stir_the_pot_try_trigger_swap(thread_t thread)
4648 {
4649 	processor_t self_processor = current_processor();
4650 	int self_cpu = self_processor->cpu_id;
4651 	/*
4652 	 * Prepare the core mask of candidate cores (of the opposite type),
4653 	 * and compute an offset where the candidate search should begin,
4654 	 * to avoid unfairly swapping with the same cores repeatedly.
4655 	 */
4656 	cpumap_t swap_candidates_map;
4657 	uint64_t offset;
4658 	if (sched_edge_stir_the_pot_core_type_is_desired(self_processor->processor_set)) {
4659 		swap_candidates_map = sched_edge_non_p_core_map;
4660 		offset = os_atomic_inc_orig(&sched_edge_stp_selection_non_p_core_offset, relaxed);
4661 	} else {
4662 		swap_candidates_map = sched_edge_p_core_map;
4663 		offset = os_atomic_inc_orig(&sched_edge_stp_selection_p_core_offset, relaxed);
4664 	}
4665 	int num_candidates = bit_count(swap_candidates_map);
4666 	if (num_candidates == 0) {
4667 		/* Too early in boot, no cores of opposite type */
4668 		return -1;
4669 	}
4670 	int cpu_of_type_offset_ind = offset % num_candidates;
4671 	int search_start_ind = lsb_first(swap_candidates_map);
4672 	for (int i = 0; i < cpu_of_type_offset_ind; i++) {
4673 		search_start_ind = lsb_next(swap_candidates_map, search_start_ind);
4674 		assert3s(search_start_ind, !=, -1);
4675 	}
4676 	assert3s(search_start_ind, !=, -1);
4677 	swap_candidates_map = bit_ror64(swap_candidates_map, search_start_ind);
4678 	/*
4679 	 * Search the registry for candidate cores of the opposite type which
4680 	 * have requested swap.
4681 	 */
4682 	int swap_cpu;
4683 	sched_edge_stp_registry_t old_registry, new_registry, intermediate_registry;
4684 	sched_edge_stp_state_t self_state;
4685 	/* BEGIN IGNORE CODESTYLE */
4686 	os_atomic_rmw_loop(&sched_edge_stir_the_pot_global_registry,
4687 	    old_registry, new_registry, relaxed, {
4688 		swap_cpu = -1;
4689 		self_state = SESTP_EXTRACT_STATE(old_registry, self_cpu);
4690 		if (self_state == SCHED_EDGE_STP_PENDING) {
4691 			/*
4692 			 * Another core already initiated a swap with us, so we should
4693 			 * wait for that one to finish rather than initiate or request
4694 			 * a new one.
4695 			 */
4696 			os_atomic_rmw_loop_give_up(break);
4697 		}
4698 		/* Scan candidates */
4699 		for (int rotid = lsb_first(swap_candidates_map); rotid != -1; rotid = lsb_next(swap_candidates_map, rotid)) {
4700 			int candidate_cpu = (rotid + search_start_ind) % 64; // un-rotate the bit
4701 			sched_edge_stp_state_t candidate_state = SESTP_EXTRACT_STATE(old_registry, candidate_cpu);
4702 			if (candidate_state == SCHED_EDGE_STP_REQUESTED) {
4703 				sched_bucket_t candidate_qos = os_atomic_load(
4704 				    &processor_array[candidate_cpu]->processor_set->cpu_running_buckets[candidate_cpu], relaxed);
4705 				if (candidate_qos == thread->th_sched_bucket) {
4706 					/* Found a requesting candidate of matching QoS */
4707 					swap_cpu = candidate_cpu;
4708 					break;
4709 				}
4710 			}
4711 		}
4712 		if (swap_cpu == -1) {
4713 			/* No candidates requesting swap, so mark this core as requesting */
4714 			intermediate_registry = SESTP_SET_STATE(old_registry, self_cpu, SCHED_EDGE_STP_REQUESTED);
4715 		} else {
4716 			/*
4717 			 * Mark candidate core as selected/pending for swap, and mark
4718 			 * current CPU as not needing a swap anymore, since we will now
4719 			 * start one.
4720 			 */
4721 			intermediate_registry = SESTP_SET_STATE(old_registry, self_cpu, SCHED_EDGE_STP_PENDING);
4722 			intermediate_registry = SESTP_SET_STATE(intermediate_registry, swap_cpu, SCHED_EDGE_STP_PENDING);
4723 		}
4724 		new_registry = intermediate_registry;
4725 	});
4726 	/* END IGNORE CODESTYLE */
4727 	/* Leave debug tracepoints for tracking any updates to registry state */
4728 	if (self_state != SCHED_EDGE_STP_PENDING) {
4729 		if (swap_cpu == -1) {
4730 			if (self_state != SCHED_EDGE_STP_REQUESTED) {
4731 				/* Now requesting */
4732 				KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) |
4733 				    DBG_FUNC_START, 0, self_cpu, cpu_of_type_offset_ind, 0);
4734 			}
4735 		} else {
4736 			if (self_state == SCHED_EDGE_STP_REQUESTED) {
4737 				/* Now pending */
4738 				KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) |
4739 				    DBG_FUNC_END, 1, self_cpu, cpu_of_type_offset_ind, 0);
4740 			}
4741 			int swap_state = SESTP_EXTRACT_STATE(old_registry, swap_cpu);
4742 			if (swap_state == SCHED_EDGE_STP_REQUESTED) {
4743 				/* Swap core now pending */
4744 				KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) |
4745 				    DBG_FUNC_END, 1, swap_cpu, cpu_of_type_offset_ind, 0);
4746 			}
4747 		}
4748 	}
4749 	if (swap_cpu != -1) {
4750 		/* Initiate a stir-the-pot swap */
4751 		assert3s(swap_cpu, <, ml_get_topology_info()->num_cpus);
4752 		assert3s(swap_cpu, !=, self_processor->cpu_id);
4753 		processor_t swap_processor = processor_array[swap_cpu];
4754 		if (swap_processor == PROCESSOR_NULL) {
4755 			/* Unlikely early boot initialization race */
4756 			return -1;
4757 		}
4758 		assert3u(sched_edge_stir_the_pot_core_type_is_desired(swap_processor->processor_set), !=,
4759 		    sched_edge_stir_the_pot_core_type_is_desired(self_processor->processor_set));
4760 		if (sched_edge_stir_the_pot_core_type_is_desired(self_processor->processor_set)) {
4761 			/*
4762 			 * Send a message and IPI notification to the lower-performance
4763 			 * core we found which wants to swap, so it will know to send its
4764 			 * thread back here.
4765 			 */
4766 			os_atomic_store(&swap_processor->stir_the_pot_inbox_cpu, self_cpu, relaxed);
4767 			processor_set_t swap_pset = swap_processor->processor_set;
4768 			pset_lock(swap_pset);
4769 			sched_ipi_type_t ipi_type = sched_ipi_action(swap_processor, NULL,
4770 			    SCHED_IPI_EVENT_REBALANCE);
4771 			pset_unlock(swap_pset);
4772 			sched_ipi_perform(swap_processor, ipi_type);
4773 		} else {
4774 			/*
4775 			 * Send a message to self to send this thread to the swap P-core. The P-core
4776 			 * will clear its own pending state once it commits to the incoming
4777 			 * swap thread.
4778 			 */
4779 			os_atomic_store(&self_processor->stir_the_pot_inbox_cpu, swap_cpu, relaxed);
4780 		}
4781 	}
4782 	KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) | DBG_FUNC_NONE,
4783 	    (swap_cpu != -1) ? 1 : 0, swap_cpu, old_registry, cpu_of_type_offset_ind);
4784 	return swap_cpu;
4785 }
4786 
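/*
 * A brief summary (inferred from the routines in this file; the encoding itself is
 * defined by the SESTP_* macros earlier) of the per-CPU stir-the-pot registry states:
 *   SCHED_EDGE_STP_NOT_WANT  : the on-core thread is not eligible for a swap.
 *   SCHED_EDGE_STP_REQUESTED : the on-core thread is eligible and is waiting for a
 *                              matching-QoS partner on the opposite core type.
 *   SCHED_EDGE_STP_PENDING   : a partner was found and a swap has been initiated on
 *                              both CPUs; cleared once the swap is committed or the
 *                              thread becomes ineligible.
 */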
4787 /*
4788  * sched_edge_stir_the_pot_clear_registry_entry()
4789  *
4790  * Mark the current CPU as NOT containing a thread which is eligible
4791  * to be swapped for stir-the-pot.
4792  * Preemption must be disabled.
4793  */
4794 void
4795 sched_edge_stir_the_pot_clear_registry_entry(void)
4796 {
4797 	int self_cpu = current_processor()->cpu_id;
4798 	sched_edge_stp_state_t self_state;
4799 	sched_edge_stp_registry_t old_registry, new_registry;
4800 	os_atomic_rmw_loop(&sched_edge_stir_the_pot_global_registry,
4801 	    old_registry, new_registry, relaxed, {
4802 		self_state = SESTP_EXTRACT_STATE(old_registry, self_cpu);
4803 		if (self_state == SCHED_EDGE_STP_NOT_WANT) {
4804 		        /* State already cleared, nothing to be done */
4805 		        os_atomic_rmw_loop_give_up(break);
4806 		}
4807 		new_registry = SESTP_SET_STATE(old_registry, self_cpu, SCHED_EDGE_STP_NOT_WANT);
4808 	});
4809 	if (self_state == SCHED_EDGE_STP_REQUESTED) {
4810 		/* Request was cleared */
4811 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) | DBG_FUNC_END,
4812 		    2, self_cpu, 0, 0);
4813 	}
4814 }
4815 
4816 /*
4817  * sched_edge_stir_the_pot_set_registry_entry()
4818  *
4819  * Mark the current CPU as containing a thread which is eligible
4820  * to be swapped to a core of the opposite type for stir-the-pot.
4821  * Preemption must be disabled.
4822  */
4823 static inline void
4824 sched_edge_stir_the_pot_set_registry_entry(void)
4825 {
4826 	int self_cpu = current_processor()->cpu_id;
4827 	sched_edge_stp_state_t self_state;
4828 	sched_edge_stp_registry_t old_registry, new_registry;
4829 	bool newly_requested = os_atomic_rmw_loop(&sched_edge_stir_the_pot_global_registry,
4830 	    old_registry, new_registry, relaxed, {
4831 		self_state = SESTP_EXTRACT_STATE(old_registry, self_cpu);
4832 		if (self_state == SCHED_EDGE_STP_REQUESTED) {
4833 		        /* Core already registered, nothing to be done */
4834 		        os_atomic_rmw_loop_give_up(break);
4835 		}
4836 		new_registry = SESTP_SET_STATE(old_registry, self_cpu, SCHED_EDGE_STP_REQUESTED);
4837 	});
4838 	if (newly_requested) {
4839 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) | DBG_FUNC_START,
4840 		    3, self_cpu, self_state, 0);
4841 	}
4842 }
4843 
4844 /* Stir-the-pot is designed for sharing time on the P-cores */
4845 static inline bool
4846 sched_edge_stir_the_pot_core_type_is_desired(processor_set_t pset)
4847 {
4848 	return pset->pset_type == CLUSTER_TYPE_P;
4849 }
4850 
4851 /*
4852  * sched_edge_stir_the_pot_thread_eligible()
4853  *
4854  * Determine whether a thread is eligible to engage in a
4855  * stir-the-pot swap. It must be P-recommended, unbound, and not
4856  * stir-the-pot swap. It must be P-recommended, unbound, and not a
4857  * round-robin shared-resource thread. Additionally, it must have already
4858  * expired a quantum on its current core type.
4859 static inline bool
4860 sched_edge_stir_the_pot_thread_eligible(thread_t thread)
4861 {
4862 	processor_set_t preferred_pset;
4863 	if ((thread == THREAD_NULL) ||
4864 	    ((preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)]) == PROCESSOR_SET_NULL)) {
4865 		/* Still initializing at boot */
4866 		return false;
4867 	}
4868 	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4869 	bool right_kind_of_thread =
4870 	    sched_edge_stir_the_pot_core_type_is_desired(preferred_pset) &&
4871 	    (thread->sched_mode != TH_MODE_REALTIME) &&
4872 	    ((thread->state & TH_IDLE) == 0) &&
4873 	    SCHED_CLUTCH_THREAD_ELIGIBLE(thread) &&
4874 	    (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) == false) &&
4875 	    (shared_rsrc_type == CLUSTER_SHARED_RSRC_TYPE_NONE ||
4876 	    shared_rsrc_type == CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST);
4877 	bool ready_for_swap = sched_edge_stir_the_pot_core_type_is_desired(current_processor()->processor_set) ?
4878 	    thread->th_expired_quantum_on_higher_core :
4879 	    thread->th_expired_quantum_on_lower_core;
4880 	return right_kind_of_thread && ready_for_swap;
4881 }
4882 
4883 /*
4884  * sched_edge_stir_the_pot_check_inbox_for_thread()
4885  *
4886  * Check whether this thread on a non-P-core has been chosen by a P-core to
4887  * swap places for stir-the-pot, optionally consuming the inbox message.
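 * Returns the CPU id stored in the inbox (the P-core chosen for the swap) if the
 * thread can accept it, or -1 otherwise.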
4888  * Preemption must be disabled.
4889  */
4890 static inline int
4891 sched_edge_stir_the_pot_check_inbox_for_thread(thread_t thread, bool consume_message)
4892 {
4893 	processor_t self_processor = current_processor();
4894 	int dst_cpu = -1;
4895 	if (sched_edge_stir_the_pot_thread_eligible(thread)) {
4896 		/* Thread can accept the inbox message */
4897 		dst_cpu = os_atomic_load(&self_processor->stir_the_pot_inbox_cpu, relaxed);
4898 	} else {
4899 		/* Ensure registry state is cleared for ineligible thread, if it hasn't been already */
4900 		sched_edge_stir_the_pot_clear_registry_entry();
4901 		/*
4902 		 * Note, we don't clear a possible inbox message, in case an eligible
4903 		 * thread comes back on-core quickly to receive it.
4904 		 */
4905 	}
4906 	if (consume_message) {
4907 		/*
4908 		 * Unconditionally clear inbox, since either we are triggering a
4909 		 * swap now or ultimately discarding the message because conditions
4910 		 * have changed (thread not eligible).
4911 		 */
4912 		os_atomic_store(&self_processor->stir_the_pot_inbox_cpu, -1, relaxed);
4913 		/*
4914 		 * We may have delayed requesting a stir-the-pot swap for the current thread
4915 		 * due to a pending inbox message for the previous thread. Now that such
4916 		 * a message has been received, finish updating the registry state.
4917 		 */
4918 		if (sched_edge_stir_the_pot_thread_eligible(self_processor->active_thread)) {
4919 			sched_edge_stir_the_pot_set_registry_entry();
4920 		}
4921 	}
4922 	return dst_cpu;
4923 }
4924 
4925 /*
4926  * sched_edge_stir_the_pot_update_registry_state()
4927  *
4928  * Update stir-the-pot state for the current processor based on its
4929  * (possibly new) current thread. This sets or clears the registry state
4930  * which indicates whether the processor is running a thread that wants
4931  * and is eligible to be swapped with a thread on the opposite core type.
4932  * Preemption must be disabled.
4933  */
4934 void
4935 sched_edge_stir_the_pot_update_registry_state(thread_t thread)
4936 {
4937 	processor_t self_processor = current_processor();
4938 	/*
4939 	 * Clear corresponding th_expired_quantum_on_ field now that thread
4940 	 * is getting a chance to run on the opposite type.
4941 	 */
4942 	if (sched_edge_stir_the_pot_core_type_is_desired(self_processor->processor_set)) {
4943 		thread->th_expired_quantum_on_lower_core = false;
4944 	} else {
4945 		thread->th_expired_quantum_on_higher_core = false;
4946 	}
4947 	if (sched_edge_stir_the_pot_thread_eligible(thread)) {
4948 		int inbox_message = os_atomic_load(&self_processor->stir_the_pot_inbox_cpu, relaxed);
4949 		if (inbox_message == -1) {
4950 			/* Set the registry bit */
4951 			sched_edge_stir_the_pot_set_registry_entry();
4952 		} else {
4953 			assert(sched_edge_stir_the_pot_core_type_is_desired(self_processor->processor_set) == false);
4954 			/*
4955 			 * There's an inbox message which still needs to be used at the next
4956 			 * migration decision, so avoid starting a new request or clearing the
4957 			 * interim pending status until then.
4958 			 */
4959 		}
4960 	} else {
4961 		/* Thread is ineligible for swap, so clear the registry bit */
4962 		sched_edge_stir_the_pot_clear_registry_entry();
4963 	}
4964 }
4965 
4966 /*
4967  * sched_edge_quantum_expire()
4968  *
4969  * Update stir-the-pot eligibility and drive stir-the-pot swaps.
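 * Marks the thread as having expired a quantum on its current core type and, if
 * the thread is now stir-the-pot eligible, tries to trigger a swap.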
4970  * Thread lock must be held.
4971  */
4972 static void
4973 sched_edge_quantum_expire(thread_t thread)
4974 {
4975 	if (sched_edge_stir_the_pot_core_type_is_desired(current_processor()->processor_set)) {
4976 		thread->th_expired_quantum_on_higher_core = true;
4977 	} else {
4978 		thread->th_expired_quantum_on_lower_core = true;
4979 	}
4980 	if (sched_edge_stir_the_pot_thread_eligible(thread)) {
4981 		sched_edge_stir_the_pot_try_trigger_swap(thread);
4982 	}
4983 }
4984 
4985 /*
4986  * sched_edge_run_count_incr()
4987  *
4988  * Update runnable thread counts in the same way as
4989  * sched_clutch_run_incr(), and reset the per-thread
4990  * quantum-expired tracking used by stir-the-pot, since
4991  * the thread is unblocking.
4992  */
4993 static uint32_t
4994 sched_edge_run_count_incr(thread_t thread)
4995 {
4996 	uint32_t new_count = sched_clutch_run_incr(thread);
4997 	/* Thread is unblocking and so resets its quantum tracking */
4998 	thread->th_expired_quantum_on_lower_core = false;
4999 	thread->th_expired_quantum_on_higher_core = false;
5000 	return new_count;
5001 }
5002 
5003 /* Return true if this thread should not continue running on this processor */
5004 static bool
5005 sched_edge_thread_avoid_processor(processor_t processor, thread_t thread, ast_t reason)
5006 {
5007 	if (thread->bound_processor == processor) {
5008 		/* Thread is bound here */
5009 		return false;
5010 	}
5011 
5012 	/*
5013 	 * On quantum expiry, check the migration bitmask to see whether this thread should be migrated off this core.
5014 	 * A migration is only recommended if there's also an idle core available that needn't be avoided.
5015 	 */
5016 	if (reason & AST_QUANTUM) {
5017 		if (bit_test(processor->processor_set->perfcontrol_cpu_migration_bitmask, processor->cpu_id)) {
5018 			uint64_t non_avoided_idle_primary_map = processor->processor_set->cpu_state_map[PROCESSOR_IDLE] & processor->processor_set->recommended_bitmask & ~processor->processor_set->perfcontrol_cpu_migration_bitmask;
5019 			if (non_avoided_idle_primary_map != 0) {
5020 				return true;
5021 			}
5022 		}
5023 	}
5024 
5025 	processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
5026 
5027 	if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) &&
5028 	    preferred_pset->pset_id != processor->processor_set->pset_id &&
5029 	    pset_type_is_recommended(preferred_pset)) {
5030 		/* We should send this thread to the bound cluster */
5031 		return true;
5032 	}
5033 
5034 	sched_clutch_edge edge = (thread->sched_pri >= BASEPRI_RTQUEUES)
5035 	    ? sched_rt_config_get(preferred_pset->pset_cluster_id, processor->processor_set->pset_cluster_id)
5036 	    : sched_edge_config_get(preferred_pset->pset_cluster_id, processor->processor_set->pset_cluster_id, thread->th_sched_bucket);
5037 	if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) == false &&
5038 	    preferred_pset->pset_id != processor->processor_set->pset_id &&
5039 	    edge.sce_migration_allowed == false &&
5040 	    edge.sce_steal_allowed == false) {
5041 		/*
5042 		 * Thread isn't allowed to be here, according to the edge migration graph.
5043 		 * Perhaps the thread's priority or boundness or its thread group's preferred
5044 		 * pset or the edge migration graph changed.
5045 		 *
5046 		 * We should only preempt after confirming the thread actually has a
5047 		 * recommended, allowed alternative pset to run on.
5048 		 */
5049 		for (uint32_t pset_id = 0; pset_id < sched_edge_max_clusters; pset_id++) {
5050 			if (pset_id == processor->processor_set->pset_id) {
5051 				continue;
5052 			}
5053 			edge = (thread->sched_pri >= BASEPRI_RTQUEUES)
5054 			    ? sched_rt_config_get(preferred_pset->pset_id, pset_id)
5055 			    : sched_edge_config_get(preferred_pset->pset_id, pset_id, thread->th_sched_bucket);
5056 			if (pset_is_recommended(pset_array[pset_id]) && ((pset_id == preferred_pset->pset_id) || edge.sce_migration_allowed)) {
5057 				/* Thread can be run elsewhere. */
5058 				return true;
5059 			}
5060 		}
5061 	}
5062 
5063 	/* Evaluate shared resource policies */
5064 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
5065 		return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
5066 	}
5067 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
5068 		if (processor->processor_set->pset_type != preferred_pset->pset_type &&
5069 		    pset_type_is_recommended(preferred_pset)) {
5070 			return true;
5071 		}
5072 		return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
5073 	}
5074 
5075 	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5076 		return false;
5077 	}
5078 	/* ~~ No realtime or shared resource threads beyond this point ~~ */
5079 
5080 	/*
5081 	 * Stir-the-Pot:
5082 	 * A non-P-core should preempt if a P-core has been found to which the current,
5083 	 * quantum-expired thread can be swapped for stir-the-pot. This lets threads in a
5084 	 * multi-threaded workload share time on the P-cores so they make roughly equal
5085 	 * forward progress.
5086 	 */
5087 	if (sched_edge_stir_the_pot_check_inbox_for_thread(thread, false) != -1) {
5088 		return true;
5089 	}
5090 
5091 	/*
5092 	 * Compaction:
5093 	 * If the preferred pset for the thread is now idle, try and migrate the thread to that cluster.
5094 	 */
5095 	if ((processor->processor_set != preferred_pset) &&
5096 	    (sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket) == 0)) {
5097 		return true;
5098 	}
5099 
5100 	/*
5101 	 * Running Rebalance:
5102 	 * We are willing to preempt the thread in order to migrate it onto an idle core
5103 	 * of the preferred type.
5104 	 */
5105 	if ((processor->processor_set->pset_type != preferred_pset->pset_type) &&
5106 	    pset_type_is_recommended(preferred_pset)) {
5107 		/* Scan for idle pset */
5108 		for (uint32_t pset_id = 0; pset_id < sched_edge_max_clusters; pset_id++) {
5109 			processor_set_t candidate_pset = pset_array[pset_id];
5110 			edge = sched_edge_config_get(preferred_pset->pset_id, pset_id, thread->th_sched_bucket);
5111 			if ((candidate_pset->pset_type == preferred_pset->pset_type) &&
5112 			    edge.sce_migration_allowed &&
5113 			    (sched_edge_cluster_load_metric(candidate_pset, thread->th_sched_bucket) == 0)) {
5114 				return true;
5115 			}
5116 		}
5117 	}
5118 
5119 	return false;
5120 }
5121 
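/*
 * sched_edge_balance()
 *
 * Scans the foreign psets for CPUs that are currently running threads foreign to
 * those clusters (subject to shared-resource availability on this pset) and sends
 * rebalance IPIs so those threads can be migrated. Returns true if any IPIs were
 * sent, in which case the caller should idle light-weight using WFE.
 */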
5122 static bool
5123 sched_edge_balance(__unused processor_t cprocessor, processor_set_t cpset)
5124 {
5125 	assert(cprocessor == current_processor());
5126 	pset_unlock(cpset);
5127 
5128 	uint64_t ast_processor_map = 0;
5129 	sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
5130 
5131 	bitmap_t *foreign_pset_bitmap = cpset->foreign_psets;
5132 	for (int cluster = bitmap_first(foreign_pset_bitmap, sched_edge_max_clusters); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
5133 		/* Skip the pset if it's not schedulable */
5134 		processor_set_t target_pset = pset_array[cluster];
5135 		if (pset_is_recommended(target_pset) == false) {
5136 			continue;
5137 		}
5138 
5139 		pset_lock(target_pset);
5140 		uint64_t cpu_running_foreign_map = (target_pset->cpu_running_foreign & target_pset->cpu_state_map[PROCESSOR_RUNNING]);
5141 		for (int cpuid = lsb_first(cpu_running_foreign_map); cpuid >= 0; cpuid = lsb_next(cpu_running_foreign_map, cpuid)) {
5142 			if (!sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpuid, cpset)) {
5143 				continue;
5144 			}
5145 			processor_t target_cpu = processor_array[cpuid];
5146 			ipi_type[target_cpu->cpu_id] = sched_ipi_action(target_cpu, NULL, SCHED_IPI_EVENT_REBALANCE);
5147 			if (ipi_type[cpuid] != SCHED_IPI_NONE) {
5148 				bit_set(ast_processor_map, cpuid);
5149 			}
5150 		}
5151 		pset_unlock(target_pset);
5152 	}
5153 
5154 	for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
5155 		processor_t ast_processor = processor_array[cpuid];
5156 		sched_ipi_perform(ast_processor, ipi_type[cpuid]);
5157 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNING) | DBG_FUNC_NONE, 0, cprocessor->cpu_id, cpuid, 0);
5158 	}
5159 
5160 	/* Core should light-weight idle using WFE if it just sent out rebalance IPIs */
5161 	return ast_processor_map != 0;
5162 }
5163 
5164 /*
5165  * sched_edge_migration_check()
5166  *
5167  * Routine to evaluate an edge between two clusters to decide if migration is possible
5168  * across that edge. Also updates the selected_pset and max_edge_delta out parameters
5169  * accordingly. The return value indicates if the invoking routine should short circuit
5170  * the search, since an ideal candidate has been found. The routine looks at the regular
5171  * edges and cluster loads or the shared resource loads based on the type of thread.
5172  */
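 *
 * For illustration (assuming the default P->E migration weight of 64 configured in
 * sched_edge_cpu_init_completed() below): with preferred_cluster_load = 100 and
 * dst_load = 50, edge_delta = 50 < 64, so a non-shared-resource thread is not
 * migrated across that edge; with dst_load = 20, edge_delta = 80 >= 64 and the
 * destination becomes a candidate.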
5173 static bool
5174 sched_edge_migration_check(uint32_t cluster_id, processor_set_t preferred_pset,
5175     uint32_t preferred_cluster_load, thread_t thread, processor_set_t *selected_pset, uint32_t *max_edge_delta)
5176 {
5177 	uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
5178 	cluster_type_t preferred_cluster_type = pset_type_for_id(preferred_cluster_id);
5179 	processor_set_t dst_pset = pset_array[cluster_id];
5180 	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
5181 	bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
5182 
5183 	if (cluster_id == preferred_cluster_id) {
5184 		return false;
5185 	}
5186 
5187 	if (dst_pset == NULL) {
5188 		return false;
5189 	}
5190 
5191 	sched_clutch_edge edge = sched_edge_config_get(preferred_cluster_id, cluster_id, thread->th_sched_bucket);
5192 	if (edge.sce_migration_allowed == false) {
5193 		return false;
5194 	}
5195 	uint32_t dst_load = shared_rsrc_thread ? (uint32_t)sched_pset_cluster_shared_rsrc_load(dst_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(dst_pset, thread->th_sched_bucket);
5196 	if (dst_load == 0) {
5198 		/* The candidate cluster is idle; select it immediately for execution */
5199 		*selected_pset = dst_pset;
5200 		*max_edge_delta = preferred_cluster_load;
5201 		return true;
5202 	}
5203 
5204 	uint32_t edge_delta = 0;
5205 	if (dst_load > preferred_cluster_load) {
5206 		return false;
5207 	}
5208 	edge_delta = preferred_cluster_load - dst_load;
5209 	if (!shared_rsrc_thread && (edge_delta < edge.sce_migration_weight)) {
5210 		/*
5211 		 * For non shared resource threads, use the edge migration weight to decide if
5212 		 * this cluster is over-committed at the QoS level of this thread.
5213 		 */
5214 		return false;
5215 	}
5216 
5217 	if (edge_delta < *max_edge_delta) {
5218 		return false;
5219 	}
5220 	if (edge_delta == *max_edge_delta) {
5221 		/* If the edge delta is the same as the max delta, make sure a homogeneous cluster is picked */
5222 		boolean_t selected_homogeneous = ((*selected_pset)->pset_type == preferred_cluster_type);
5223 		boolean_t candidate_homogeneous = (dst_pset->pset_type == preferred_cluster_type);
5224 		if (selected_homogeneous || !candidate_homogeneous) {
5225 			return false;
5226 		}
5227 	}
5228 	/* dst_pset seems to be the best candidate for migration; however other candidates should still be evaluated */
5229 	*max_edge_delta = edge_delta;
5230 	*selected_pset = dst_pset;
5231 	return false;
5232 }
5233 
5234 /*
5235  * sched_edge_migrate_edges_evaluate()
5236  *
5237  * Routine to find the candidate for thread migration based on edge weights.
5238  *
5239  * Returns the most ideal cluster for execution of this thread based on outgoing edges of the preferred pset. Can
5240  * return preferred_pset if it's the most ideal destination for this thread.
5241  */
5242 static processor_set_t
5243 sched_edge_migrate_edges_evaluate(processor_set_t preferred_pset, uint32_t preferred_cluster_load, thread_t thread)
5244 {
5245 	processor_set_t selected_pset = preferred_pset;
5246 	uint32_t max_edge_delta = 0;
5247 	bool search_complete = false;
5248 	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
5249 	bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
5250 
5251 	bitmap_t *foreign_pset_bitmap = preferred_pset->foreign_psets;
5252 	bitmap_t *native_pset_bitmap = preferred_pset->native_psets;
5253 	/* Always start the search with the native clusters */
5254 	sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
5255 	while (sched_iterate_psets_ordered(preferred_pset, &preferred_pset->spill_search_order[thread->th_sched_bucket], native_pset_bitmap[0], &istate)) {
5256 		search_complete = sched_edge_migration_check(istate.spis_pset_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
5257 		if (search_complete) {
5258 			break;
5259 		}
5260 	}
5261 
5262 	if (search_complete) {
5263 		return selected_pset;
5264 	}
5265 
5266 	if (shared_rsrc_thread && (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST)) {
5267 		/*
5268 		 * If the shared resource scheduling policy is EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST, the scheduler tries
5269 		 * to fill up the preferred cluster and its homogeneous peers first.
5270 		 */
5271 
5272 		if (max_edge_delta > 0) {
5273 			/*
5274 			 * This means there is a peer cluster of the same type as the preferred cluster (since the code
5275 			 * above only looks at the native_psets) which is running fewer threads of this shared resource
5276 			 * type than the preferred cluster. This indicates that there is capacity on a native cluster
5277 			 * where this thread should be placed.
5278 			 */
5279 			return selected_pset;
5280 		}
5281 		/*
5282 		 * Indicates that all peer native clusters are at the same shared resource usage; check if the preferred cluster has
5283 		 * any more capacity left.
5284 		 */
5285 		if (sched_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) < pset_available_cpu_count(preferred_pset)) {
5286 			return preferred_pset;
5287 		}
5288 		/*
5289 		 * Looks like the preferred cluster and all its native peers are full with shared resource threads; need to start looking
5290 		 * at non-native clusters for capacity.
5291 		 */
5292 	}
5293 
5294 	/* Now look at the non-native clusters */
5295 	istate = SCHED_PSET_ITERATE_STATE_INIT;
5296 	while (sched_iterate_psets_ordered(preferred_pset, &preferred_pset->spill_search_order[thread->th_sched_bucket], foreign_pset_bitmap[0], &istate)) {
5297 		search_complete = sched_edge_migration_check(istate.spis_pset_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
5298 		if (search_complete) {
5299 			break;
5300 		}
5301 	}
5302 	return selected_pset;
5303 }
5304 
5305 /*
5306  * sched_edge_candidate_alternative()
5307  *
5308  * Routine to find an alternative cluster from candidate_cluster_bitmap since the
5309  * selected_pset is not available for execution. The logic tries to prefer homogeneous
5310  * clusters over heterogeneous clusters since this is typically used in thread
5311  * placement decisions.
5312  */
5313 _Static_assert(MAX_PSETS <= 64, "Unable to fit maximum number of psets in uint64_t bitmask");
5314 static processor_set_t
5315 sched_edge_candidate_alternative(processor_set_t selected_pset, uint64_t candidate_cluster_bitmap)
5316 {
5317 	/*
5318 	 * It looks like the most ideal pset is not available for scheduling currently.
5319 	 * Try to find a homogeneous cluster that is still available.
5320 	 */
5321 	uint64_t available_native_clusters = selected_pset->native_psets[0] & candidate_cluster_bitmap;
5322 	int available_cluster_id = lsb_first(available_native_clusters);
5323 	if (available_cluster_id == -1) {
5324 		/* Looks like none of the homogeneous clusters are available; pick the first available cluster */
5325 		available_cluster_id = bit_first(candidate_cluster_bitmap);
5326 	}
5327 	assert(available_cluster_id != -1);
5328 	return pset_array[available_cluster_id];
5329 }
5330 
5331 /*
5332  * sched_edge_switch_pset_lock()
5333  *
5334  * Helper routine for sched_edge_migrate_candidate() which switches pset locks (if needed) based on
5335  * switch_pset_locks.
5336  * Returns the newly locked pset after the switch.
5337  */
5338 static processor_set_t
5339 sched_edge_switch_pset_lock(processor_set_t selected_pset, processor_set_t locked_pset, bool switch_pset_locks)
5340 {
5341 	if (!switch_pset_locks) {
5342 		return locked_pset;
5343 	}
5344 	if (selected_pset != locked_pset) {
5345 		pset_unlock(locked_pset);
5346 		pset_lock(selected_pset);
5347 		return selected_pset;
5348 	} else {
5349 		return locked_pset;
5350 	}
5351 }
5352 
5353 /*
5354  * sched_edge_migrate_candidate()
5355  *
5356  * Routine to find an appropriate cluster for scheduling a thread. The routine looks at the properties of
5357  * the thread and the preferred cluster to determine the best available pset for scheduling.
5358  *
5359  * The switch_pset_locks parameter defines whether the routine should switch pset locks to provide an
5360  * accurate scheduling decision. This mode is typically used when choosing a pset for scheduling a thread since the
5361  * decision has to be synchronized with another CPU changing the recommendation of clusters available
5362  * on the system. If this parameter is set to false, this routine returns the best effort indication of
5363  * the cluster the thread should be scheduled on. It is typically used in fast path contexts (such as
5364  * SCHED(thread_avoid_processor)) to determine if there is a possibility of scheduling this thread on a
5365  * more appropriate cluster.
5366  *
5367  * Routine returns the most ideal cluster for scheduling. If switch_pset_locks is set, it ensures that the
5368  * resultant pset lock is held.
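 *
 * If the chosen pset turns out not to be recommended, the routine retries via
 * sched_edge_candidate_alternative() until a recommended pset is found, or returns
 * NULL (with the pset lock dropped) once all candidates are exhausted.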
5369  */
5370 static processor_set_t
5371 sched_edge_migrate_candidate(processor_set_t _Nullable preferred_pset, thread_t thread,
5372     processor_set_t locked_pset, bool switch_pset_locks, processor_t *processor_hint_out,
5373     sched_options_t *options_inout)
5374 {
5375 	processor_set_t selected_pset = preferred_pset;
5376 	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
5377 	bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
5378 	bool stirring_the_pot = false;
5379 
5380 	if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
5381 		/*
5382 		 * For cluster-bound threads, choose the cluster to which the thread is bound, unless that
5383 		 * cluster is unavailable. If it's not available, fall through to the regular cluster selection
5384 		 * logic which handles derecommended clusters appropriately.
5385 		 */
5386 		selected_pset = pset_array[sched_edge_thread_bound_cluster_id(thread)];
5387 		if (selected_pset != NULL) {
5388 			locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
5389 			if (pset_is_recommended(selected_pset)) {
5390 				return selected_pset;
5391 			}
5392 		}
5393 	}
5394 
5395 	uint64_t candidate_cluster_bitmap = mask(sched_edge_max_clusters);
5396 #if DEVELOPMENT || DEBUG
5397 	extern int enable_task_set_cluster_type;
5398 	task_t task = get_threadtask(thread);
5399 	if (enable_task_set_cluster_type && (task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) {
5400 		processor_set_t pset_hint = task->pset_hint;
5401 		if (pset_hint && (selected_pset == NULL || selected_pset->pset_cluster_type != pset_hint->pset_cluster_type)) {
5402 			selected_pset = pset_hint;
5403 			goto migrate_candidate_available_check;
5404 		}
5405 	}
5406 #endif
5407 
5408 	if (preferred_pset == NULL) {
5409 		/* The preferred_pset has not finished initializing at boot */
5410 		goto migrate_candidate_available_check;
5411 	}
5412 
5413 	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5414 		/* For realtime threads, try and schedule them on the preferred pset always */
5415 		goto migrate_candidate_available_check;
5416 	}
5417 
5418 	uint32_t preferred_cluster_load = shared_rsrc_thread ? (uint32_t)sched_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket);
5419 	if (preferred_cluster_load == 0) {
5420 		goto migrate_candidate_available_check;
5421 	}
5422 
5423 	/*
5424 	 * If this thread has expired quantum on a non-preferred core and is waiting on
5425 	 * "stir-the-pot" to get a turn running on a P-core, check our processor inbox for
5426 	 * stir-the-pot to see if an eligible P-core has already been found for swap.
5427 	 * If so, try to migrate to the corresponding pset and also carry over the
5428 	 * processor hint to preempt that specific P-core.
5429 	 *
5430 	 * The AMP rebalancing mechanism is available for regular threads or shared resource
5431 	 * threads with the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy.
5432 	 */
5433 	int stir_the_pot_swap_cpu = sched_edge_stir_the_pot_check_inbox_for_thread(thread, true);
5434 	if (stir_the_pot_swap_cpu != -1) {
5435 		*processor_hint_out = processor_array[stir_the_pot_swap_cpu];
5436 		selected_pset = processor_array[stir_the_pot_swap_cpu]->processor_set;
5437 		stirring_the_pot = true;
5438 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) | DBG_FUNC_NONE,
5439 		    2, stir_the_pot_swap_cpu, 0, 0);
5440 		goto migrate_candidate_available_check;
5441 	}
5442 
5443 	/* Look at edge weights to decide the most ideal migration candidate for this thread */
5444 	selected_pset = sched_edge_migrate_edges_evaluate(preferred_pset, preferred_cluster_load, thread);
5445 
5446 migrate_candidate_available_check:
5447 	if (selected_pset == NULL) {
5448 		/* The selected_pset has not finished initializing at boot */
5449 		pset_unlock(locked_pset);
5450 		return NULL;
5451 	}
5452 
5453 	locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
5454 	if (pset_is_recommended(selected_pset) == true) {
5455 		/* Committing to the pset */
5456 		if (stirring_the_pot) {
5457 			*options_inout |= SCHED_STIR_POT;
5458 		}
5459 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_OVERLOAD) | DBG_FUNC_NONE, thread_tid(thread), preferred_pset->pset_cluster_id, selected_pset->pset_cluster_id, preferred_cluster_load);
5460 		return selected_pset;
5461 	}
5462 	stirring_the_pot = false;
5463 	/* Looks like selected_pset is not available for scheduling; remove it from candidate_cluster_bitmap */
5464 	bitmap_clear(&candidate_cluster_bitmap, selected_pset->pset_cluster_id);
5465 	if (__improbable(bitmap_first(&candidate_cluster_bitmap, sched_edge_max_clusters) == -1)) {
5466 		pset_unlock(locked_pset);
5467 		return NULL;
5468 	}
5469 	/* Try and find an alternative for the selected pset */
5470 	selected_pset = sched_edge_candidate_alternative(selected_pset, candidate_cluster_bitmap);
5471 	goto migrate_candidate_available_check;
5472 }
5473 
5474 static processor_t
5475 sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread, sched_options_t *options_inout)
5476 {
5477 	/* Bound threads don't call this function */
5478 	assert(thread->bound_processor == PROCESSOR_NULL);
5479 	processor_t chosen_processor = PROCESSOR_NULL;
5480 
5481 	/*
5482 	 * sched_edge_preferred_pset() returns the preferred pset for a given thread.
5483  * It should take the passed-in "pset" as a hint, which represents the recency metric for
5484 	 * pset selection logic.
5485 	 */
5486 	processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
5487 	processor_set_t chosen_pset = preferred_pset;
5488 	/*
5489 	 * If the preferred pset is overloaded, find a pset which is the best candidate to migrate
5490 	 * threads to. sched_edge_migrate_candidate() returns the preferred pset
5491 	 * if it has capacity; otherwise finds the best candidate pset to migrate this thread to.
5492 	 *
5493 	 * Edge Scheduler Optimization
5494 	 * It might be useful to build a recency metric for the thread for multiple clusters and
5495 	 * factor that into the migration decisions.
5496 	 */
5497 	chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, pset, true, &processor, options_inout);
5498 	if (chosen_pset) {
5499 		chosen_processor = choose_processor(chosen_pset, processor, thread, options_inout);
5500 	}
5501 	return chosen_processor;
5502 }
5503 
5504 /*
5505  * sched_edge_clutch_bucket_threads_drain()
5506  *
5507  * Drains all the runnable threads which are not restricted to the root_clutch (due to clutch
5508  * bucket overrides etc.) into a local thread queue.
5509  */
5510 static void
5511 sched_edge_clutch_bucket_threads_drain(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch, queue_t clutch_threads)
5512 {
5513 	thread_t thread = THREAD_NULL;
5514 	uint64_t current_timestamp = mach_approximate_time();
5515 	qe_foreach_element_safe(thread, &clutch_bucket->scb_thread_timeshare_queue, th_clutch_timeshare_link) {
5516 		sched_clutch_thread_remove(root_clutch, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
5517 		enqueue_tail(clutch_threads, &thread->runq_links);
5518 	}
5519 }
5520 
5521 #if !SCHED_TEST_HARNESS
5522 
5523 /*
5524  * sched_edge_run_drained_threads()
5525  *
5526  * Makes all drained threads in a local queue runnable.
5527  */
5528 static void
5529 sched_edge_run_drained_threads(queue_t clutch_threads)
5530 {
5531 	thread_t thread;
5532 	/* Now setrun all the threads in the local queue */
5533 	qe_foreach_element_safe(thread, clutch_threads, runq_links) {
5534 		remqueue(&thread->runq_links);
5535 		thread_lock(thread);
5536 		thread_setrun(thread, SCHED_TAILQ);
5537 		thread_unlock(thread);
5538 	}
5539 }
5540 
5541 #endif /* !SCHED_TEST_HARNESS */
5542 
5543 /*
5544  * sched_edge_update_preferred_cluster()
5545  *
5546  * Routine to update the preferred cluster for QoS buckets within a thread group.
5547  * The buckets to be updated are specified as a bitmap (clutch_bucket_modify_bitmap).
5548  */
5549 static void
5550 sched_edge_update_preferred_cluster(
5551 	sched_clutch_t sched_clutch,
5552 	bitmap_t *clutch_bucket_modify_bitmap,
5553 	uint32_t *tg_bucket_preferred_cluster)
5554 {
5555 	for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
5556 		os_atomic_store(&sched_clutch->sc_clutch_groups[bucket].scbg_preferred_cluster, tg_bucket_preferred_cluster[bucket], relaxed);
5557 	}
5558 }
5559 
5560 #if !SCHED_TEST_HARNESS
5561 
5562 /*
5563  * sched_edge_migrate_thread_group_runnable_threads()
5564  *
5565  * Routine to implement the migration of threads on a cluster when the thread group
5566  * recommendation is updated. The migration works using a 2-phase
5567  * algorithm.
5568  *
5569  * Phase 1: With the pset lock held, check the recommendation of the clutch buckets.
5570  * For each clutch bucket, if it needs to be migrated immediately, drain the threads
5571  * into a local thread queue. Otherwise mark the clutch bucket as native/foreign as
5572  * appropriate.
5573  *
5574  * Phase 2: After unlocking the pset, drain all the threads from the local thread
5575  * queue and mark them runnable which should land them in the right hierarchy.
5576  *
5577  * The routine assumes that the preferences for the clutch buckets/clutch bucket
5578  * groups have already been updated by the caller.
5579  *
5580  * - Called with the pset locked and interrupts disabled.
5581  * - Returns with the pset unlocked.
5582  */
5583 static void
5584 sched_edge_migrate_thread_group_runnable_threads(
5585 	sched_clutch_t sched_clutch,
5586 	sched_clutch_root_t root_clutch,
5587 	bitmap_t *clutch_bucket_modify_bitmap,
5588 	__unused uint32_t *tg_bucket_preferred_cluster,
5589 	bool migrate_immediately)
5590 {
5591 	/* Queue to hold threads that have been drained from clutch buckets to be migrated */
5592 	queue_head_t clutch_threads;
5593 	queue_init(&clutch_threads);
5594 
5595 	for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
5596 		/* Get the clutch bucket for this cluster and sched bucket */
5597 		sched_clutch_bucket_group_t clutch_bucket_group = &(sched_clutch->sc_clutch_groups[bucket]);
5598 		sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
5599 		sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed);
5600 		if (scb_root == NULL) {
5601 			/* Clutch bucket not runnable or already in the right hierarchy; nothing to do here */
5602 			assert(clutch_bucket->scb_thr_count == 0);
5603 			continue;
5604 		}
5605 		assert(scb_root == root_clutch);
5606 		uint32_t clutch_bucket_preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
5607 
5608 		if (migrate_immediately) {
5609 			/*
5610 			 * For transitions where threads need to be migrated immediately, drain the threads into a
5611 			 * local queue unless we are looking at the clutch buckets for the newly recommended
5612 			 * cluster.
5613 			 */
5614 			if (root_clutch->scr_cluster_id != clutch_bucket_preferred_cluster) {
5615 				sched_edge_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads);
5616 			} else {
5617 				sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
5618 			}
5619 		} else {
5620 			/* Check if this cluster is the same type as the newly recommended cluster */
5621 			boolean_t homogeneous_cluster = (pset_type_for_id(root_clutch->scr_cluster_id) == pset_type_for_id(clutch_bucket_preferred_cluster));
5622 			/*
5623 			 * If threads do not have to be migrated immediately, just change the native/foreign
5624 			 * flag on the clutch bucket.
5625 			 */
5626 			if (homogeneous_cluster) {
5627 				sched_clutch_bucket_mark_native(clutch_bucket, root_clutch);
5628 			} else {
5629 				sched_clutch_bucket_mark_foreign(clutch_bucket, root_clutch);
5630 			}
5631 		}
5632 	}
5633 
5634 	pset_unlock(root_clutch->scr_pset);
5635 	sched_edge_run_drained_threads(&clutch_threads);
5636 }
5637 
5638 /*
5639  * sched_edge_migrate_thread_group_running_threads()
5640  *
5641  * Routine to find all running threads of a thread group on a specific cluster
5642  * and IPI them if they need to be moved immediately.
5643  */
5644 static void
5645 sched_edge_migrate_thread_group_running_threads(
5646 	sched_clutch_t sched_clutch,
5647 	sched_clutch_root_t root_clutch,
5648 	__unused bitmap_t *clutch_bucket_modify_bitmap,
5649 	uint32_t *tg_bucket_preferred_cluster,
5650 	bool migrate_immediately)
5651 {
5652 	if (migrate_immediately == false) {
5653 		/* If CLPC has recommended not to move threads immediately, nothing to do here */
5654 		return;
5655 	}
5656 
5657 	/*
5658 	 * Edge Scheduler Optimization
5659 	 *
5660 	 * When the system has a large number of clusters and cores, it might be useful to
5661 	 * narrow down the iteration by using a thread running bitmap per clutch.
5662 	 */
5663 	uint64_t ast_processor_map = 0;
5664 	sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
5665 
5666 	uint64_t running_map = root_clutch->scr_pset->cpu_state_map[PROCESSOR_RUNNING];
5667 	/*
5668 	 * Iterate all CPUs and look for the ones running threads from this thread group and are
5669 	 * not restricted to the specific cluster (due to overrides etc.)
5670 	 */
5671 	for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
5672 		processor_t src_processor = processor_array[cpuid];
5673 		boolean_t expected_tg = (src_processor->current_thread_group == sched_clutch->sc_tg);
5674 		sched_bucket_t processor_sched_bucket = src_processor->processor_set->cpu_running_buckets[cpuid];
5675 		if (processor_sched_bucket == TH_BUCKET_SCHED_MAX) {
5676 			continue;
5677 		}
5678 		boolean_t non_preferred_cluster = tg_bucket_preferred_cluster[processor_sched_bucket] != root_clutch->scr_cluster_id;
5679 
5680 		if (expected_tg && non_preferred_cluster) {
5681 			ipi_type[cpuid] = sched_ipi_action(src_processor, NULL, SCHED_IPI_EVENT_REBALANCE);
5682 			if (ipi_type[cpuid] != SCHED_IPI_NONE) {
5683 				bit_set(ast_processor_map, cpuid);
5684 			} else if (src_processor == current_processor()) {
5685 				bit_set(root_clutch->scr_pset->pending_AST_PREEMPT_cpu_mask, cpuid);
5686 				ast_t new_preempt = update_pending_nonurgent_preemption(src_processor, AST_PREEMPT);
5687 				ast_on(new_preempt);
5688 			}
5689 		}
5690 	}
5691 
5692 	/* Perform all the IPIs */
5693 	if (bit_first(ast_processor_map) != -1) {
5694 		for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
5695 			processor_t ast_processor = processor_array[cpuid];
5696 			sched_ipi_perform(ast_processor, ipi_type[cpuid]);
5697 		}
5698 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, thread_group_get_id(sched_clutch->sc_tg), ast_processor_map, 0, 0);
5699 	}
5700 }
5701 
5702 /*
5703  * sched_edge_tg_preferred_cluster_change()
5704  *
5705  * Routine to handle changes to a thread group's recommendation. In the Edge Scheduler, the preferred cluster
5706  * is specified on a per-QoS basis within a thread group. The routine updates the preferences and performs
5707  * thread migrations based on the policy specified by CLPC.
5708  * tg_bucket_preferred_cluster is an array of size TH_BUCKET_SCHED_MAX which specifies the new preferred cluster
5709  * for each QoS within the thread group.
5710  */
5711 void
5712 sched_edge_tg_preferred_cluster_change(struct thread_group *tg, uint32_t *tg_bucket_preferred_cluster, sched_perfcontrol_preferred_cluster_options_t options)
5713 {
5714 	sched_clutch_t clutch = sched_clutch_for_thread_group(tg);
5715 	/*
5716 	 * In order to optimize the processing, create a bitmap which represents all QoS buckets
5717 	 * for which the preferred cluster has changed.
5718 	 */
5719 	bitmap_t clutch_bucket_modify_bitmap[BITMAP_LEN(TH_BUCKET_SCHED_MAX)] = {0};
5720 	for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
5721 		uint32_t old_preferred_cluster = sched_edge_clutch_bucket_group_preferred_cluster(&clutch->sc_clutch_groups[bucket]);
5722 		uint32_t new_preferred_cluster = tg_bucket_preferred_cluster[bucket];
5723 		if (old_preferred_cluster != new_preferred_cluster) {
5724 			bitmap_set(clutch_bucket_modify_bitmap, bucket);
5725 		}
5726 		KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREFERRED_PSET) | DBG_FUNC_NONE,
5727 		    thread_group_get_id(tg), bucket, new_preferred_cluster, options);
5728 	}
5729 	if (bitmap_lsb_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
5730 		/* No changes in any clutch buckets; nothing to do here */
5731 		return;
5732 	}
5733 
5734 	/*
5735 	 * The first operation is to update the preferred cluster for all QoS buckets within the
5736 	 * thread group so that any future threads becoming runnable would see the new preferred
5737 	 * cluster value.
5738 	 */
5739 	sched_edge_update_preferred_cluster(clutch, clutch_bucket_modify_bitmap, tg_bucket_preferred_cluster);
5740 
5741 	for (uint32_t cluster_id = 0; cluster_id < sched_edge_max_clusters; cluster_id++) {
5742 		processor_set_t pset = pset_array[cluster_id];
5743 		spl_t s = splsched();
5744 		pset_lock(pset);
5745 		/*
5746 		 * Currently iterates all clusters looking for running threads for a TG to be migrated. Can be optimized
5747 		 * by keeping a per-clutch bitmap of clusters running threads for a particular TG.
5748 		 *
5749 		 * Edge Scheduler Optimization
5750 		 */
5751 		/* Migrate all running threads of the TG on this cluster based on options specified by CLPC */
5752 		sched_edge_migrate_thread_group_running_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
5753 		    tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNING));
5754 		/* Migrate all runnable threads of the TG in this cluster's hierarchy based on options specified by CLPC */
5755 		sched_edge_migrate_thread_group_runnable_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
5756 		    tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNABLE));
5757 		/* sched_edge_migrate_thread_group_runnable_threads() returns with pset unlocked */
5758 		splx(s);
5759 	}
5760 }
5761 
5762 /*
5763  * sched_edge_pset_made_schedulable()
5764  *
5765  * Routine to migrate all the clutch buckets which are not in their recommended
5766  * pset hierarchy now that a new pset has become runnable. It's possible that this
5767  * routine is called when the pset is already marked schedulable.
5768  *
5769  * Invoked with the pset lock held and interrupts disabled.
5770  */
5771 static void
5772 sched_edge_pset_made_schedulable(__unused processor_t processor, processor_set_t dst_pset, boolean_t drop_lock)
5773 {
5774 	if (bitmap_test(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id)) {
5775 		/* Nothing to do here since pset is already marked schedulable */
5776 		if (drop_lock) {
5777 			pset_unlock(dst_pset);
5778 		}
5779 		return;
5780 	}
5781 
5782 	bitmap_set(sched_edge_available_pset_bitmask, dst_pset->pset_cluster_id);
5783 
5784 	thread_t thread = sched_edge_processor_idle(dst_pset);
5785 	if (thread != THREAD_NULL) {
5786 		thread_lock(thread);
5787 		thread_setrun(thread, SCHED_TAILQ);
5788 		thread_unlock(thread);
5789 	}
5790 
5791 	if (!drop_lock) {
5792 		pset_lock(dst_pset);
5793 	}
5794 }
5795 
5796 #endif /* !SCHED_TEST_HARNESS */
5797 
5798 
5799 /*
5800  * sched_edge_cpu_init_completed()
5801  *
5802  * Callback routine from the platform layer once all CPUs/clusters have been initialized. This
5803  * provides an opportunity for the edge scheduler to initialize all the edge parameters.
5804  */
5805 static void
5806 sched_edge_cpu_init_completed(void)
5807 {
5808 	/* Now that all cores have registered, compute bitmaps for different core types */
5809 	for (int pset_id = 0; pset_id < sched_edge_max_clusters; pset_id++) {
5810 		processor_set_t pset = pset_array[pset_id];
5811 		if (sched_edge_stir_the_pot_core_type_is_desired(pset)) {
5812 			os_atomic_or(&sched_edge_p_core_map, pset->cpu_bitmask, relaxed);
5813 		} else {
5814 			os_atomic_or(&sched_edge_non_p_core_map, pset->cpu_bitmask, relaxed);
5815 		}
5816 	}
5817 	/* Build policy table for setting edge weight tunables based on cluster types */
5818 	sched_clutch_edge edge_config_defaults[MAX_CPU_TYPES][MAX_CPU_TYPES];
5819 	sched_clutch_edge free_spill = (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 1, .sce_steal_allowed = 1};
5820 	sched_clutch_edge no_spill = (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0};
5821 	sched_clutch_edge weighted_spill = (sched_clutch_edge){.sce_migration_weight = 64, .sce_migration_allowed = 1, .sce_steal_allowed = 1};
5822 	/* P -> P */
5823 	edge_config_defaults[CLUSTER_TYPE_P][CLUSTER_TYPE_P] = free_spill;
5824 	/* E -> E */
5825 	edge_config_defaults[CLUSTER_TYPE_E][CLUSTER_TYPE_E] = free_spill;
5826 	/* P -> E */
5827 	edge_config_defaults[CLUSTER_TYPE_P][CLUSTER_TYPE_E] = weighted_spill;
5828 	/* E -> P */
5829 	edge_config_defaults[CLUSTER_TYPE_E][CLUSTER_TYPE_P] = no_spill;
5830 
5831 	spl_t s = splsched();
5832 	for (int src_cluster_id = 0; src_cluster_id < sched_edge_max_clusters; src_cluster_id++) {
5833 		processor_set_t src_pset = pset_array[src_cluster_id];
5834 		pset_lock(src_pset);
5835 
5836 		/* Each pset recommendation is at least allowed to access its own cluster */
5837 		for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
5838 			src_pset->max_parallel_cores[bucket] = src_pset->cpu_set_count;
5839 			src_pset->max_parallel_clusters[bucket] = 1;
5840 		}
5841 
5842 		/* For each cluster, set all its outgoing edge parameters */
5843 		for (int dst_cluster_id = 0; dst_cluster_id < sched_edge_max_clusters; dst_cluster_id++) {
5844 			processor_set_t dst_pset = pset_array[dst_cluster_id];
5845 			if (dst_cluster_id == src_cluster_id) {
5846 				continue;
5847 			}
5848 
5849 			bool clusters_homogenous = (src_pset->pset_type == dst_pset->pset_type);
5850 			if (clusters_homogenous) {
5851 				bitmap_clear(src_pset->foreign_psets, dst_cluster_id);
5852 				bitmap_set(src_pset->native_psets, dst_cluster_id);
5853 				/* Default realtime policy: spill allowed among homogeneous psets. */
5854 				sched_rt_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge) {
5855 					.sce_migration_allowed = true,
5856 					.sce_steal_allowed = true,
5857 					.sce_migration_weight = 0,
5858 				});
5859 			} else {
5860 				bitmap_set(src_pset->foreign_psets, dst_cluster_id);
5861 				bitmap_clear(src_pset->native_psets, dst_cluster_id);
5862 				/* Default realtime policy: disallow spill among heterogeneous psets. */
5863 				sched_rt_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge) {
5864 					.sce_migration_allowed = false,
5865 					.sce_steal_allowed = false,
5866 					.sce_migration_weight = 0,
5867 				});
5868 			}
5869 
5870 			bool clusters_local = (ml_get_die_id(src_cluster_id) == ml_get_die_id(dst_cluster_id));
5871 			if (clusters_local) {
5872 				bitmap_set(src_pset->local_psets, dst_cluster_id);
5873 				bitmap_clear(src_pset->remote_psets, dst_cluster_id);
5874 			} else {
5875 				bitmap_set(src_pset->remote_psets, dst_cluster_id);
5876 				bitmap_clear(src_pset->local_psets, dst_cluster_id);
5877 			}
5878 
5879 			for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
5880 				/* Set tunables for an edge based on the cluster types at either ends of it */
5881 				sched_clutch_edge edge_config = edge_config_defaults[src_pset->pset_type][dst_pset->pset_type];
5882 				sched_edge_config_set(src_cluster_id, dst_cluster_id, bucket, edge_config);
5883 				if (edge_config.sce_migration_allowed) {
5884 					src_pset->max_parallel_cores[bucket] += dst_pset->cpu_set_count;
5885 					src_pset->max_parallel_clusters[bucket] += 1;
5886 				}
5887 			}
5888 		}
5889 		sched_edge_config_pset_push(src_cluster_id);
5890 
5891 		pset_unlock(src_pset);
5892 	}
5893 	splx(s);
5894 }
5895 
5896 static bool
5897 sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset)
5898 {
5899 	uint32_t preferred_cluster_id = sched_edge_thread_preferred_cluster(thread);
5900 	if (preferred_cluster_id == pset->pset_cluster_id) {
5901 		return true;
5902 	} else {
5903 		sched_clutch_edge edge;
5904 		if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5905 			edge = sched_rt_config_get(preferred_cluster_id, pset->pset_id);
5906 		} else {
5907 			edge = sched_edge_config_get(preferred_cluster_id, pset->pset_cluster_id, thread->th_sched_bucket);
5908 		}
5909 		return edge.sce_migration_allowed;
5910 	}
5911 }
5912 
5913 extern int sched_amp_spill_deferred_ipi;
5914 extern int sched_amp_pcores_preempt_immediate_ipi;
5915 
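/*
 * When non-zero, sched_edge_ipi_policy() upgrades non-RT preemption IPIs to immediate
 * ones if the destination CPU is homogeneous with the thread's preferred cluster, or
 * if the preferred cluster is expected to remain busy past the deferred-IPI timeout.
 */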
5916 int sched_edge_migrate_ipi_immediate = 1;
5917 
5918 sched_ipi_type_t
5919 sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
5920 {
5921 	processor_set_t pset = dst->processor_set;
5922 	assert(dst != current_processor());
5923 
5924 	boolean_t deferred_ipi_supported = false;
5925 #if defined(CONFIG_SCHED_DEFERRED_AST)
5926 	deferred_ipi_supported = true;
5927 #endif /* CONFIG_SCHED_DEFERRED_AST */
5928 
5929 	switch (event) {
5930 	case SCHED_IPI_EVENT_SPILL:
5931 		/* For Spill event, use deferred IPIs if sched_amp_spill_deferred_ipi set */
5932 		if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
5933 			return sched_ipi_deferred_policy(pset, dst, thread, event);
5934 		}
5935 		break;
5936 	case SCHED_IPI_EVENT_PREEMPT:
5937 		/* For preemption, the default policy is to use deferred IPIs
5938 		 * for Non-RT P-core preemption. Override that behavior if
5939 		 * sched_amp_pcores_preempt_immediate_ipi is set
5940 		 */
5941 		if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
5942 			if (sched_amp_pcores_preempt_immediate_ipi && (pset_type_for_id(pset->pset_cluster_id) == CLUSTER_TYPE_P)) {
5943 				return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
5944 			}
5945 			if (sched_edge_migrate_ipi_immediate) {
5946 				processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
5947 				/*
5948 				 * For IPI'ing CPUs that are homogeneous with the preferred cluster, use immediate IPIs
5949 				 */
5950 				if (preferred_pset->pset_type == pset->pset_type) {
5951 					return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
5952 				}
5953 				/*
5954 				 * For workloads that are going wide, it might be useful to use Immediate IPI to
5955 				 * wakeup the idle CPU if the scheduler estimates that the preferred pset will
5956 				 * be busy for the deferred IPI timeout. The Edge Scheduler uses the avg execution
5957 				 * latency on the preferred pset as an estimate of busyness.
5958 				 */
5959 				if ((preferred_pset->pset_execution_time[thread->th_sched_bucket].pset_avg_thread_execution_time * NSEC_PER_USEC) >= ml_cpu_signal_deferred_get_timer()) {
5960 					return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
5961 				}
5962 			}
5963 		}
5964 		break;
5965 	default:
5966 		break;
5967 	}
5968 	/* Default back to the global policy for all other scenarios */
5969 	return sched_ipi_policy(dst, thread, dst_idle, event);
5970 }
5971 
5972 
5973 /*
5974  * sched_edge_qos_max_parallelism()
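 *
 * Returns the recommended maximum parallelism for the given QoS: a CPU count by
 * default, or a cluster count when QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE is set.
 * Realtime is limited to the P-cores; BG/maintenance to the E-cores.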
5975  */
5976 uint32_t
5977 sched_edge_qos_max_parallelism(int qos, uint64_t options)
5978 {
5979 	cluster_type_t low_core_type = CLUSTER_TYPE_E;
5980 	cluster_type_t high_core_type = CLUSTER_TYPE_P;
5981 
5982 	if (options & QOS_PARALLELISM_REALTIME) {
5983 		/* For realtime threads on AMP, we want to limit
5984 		 * the width to just the P-cores since we
5985 		 * do not spill/rebalance for RT threads.
5986 		 */
5987 		uint32_t high_cpu_count = ml_get_cpu_number_type(high_core_type, false, false);
5988 		uint32_t high_cluster_count = ml_get_cluster_number_type(high_core_type);
5989 		return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? high_cluster_count : high_cpu_count;
5990 	}
5991 
5992 	/*
5993 	 * The Edge scheduler supports per-QoS recommendations for thread groups.
5994 	 * This enables lower QoS buckets (such as UT) to be scheduled on all
5995 	 * CPUs on the system.
5996 	 *
5997 	 * The only restriction is for BG/Maintenance QoS classes for which the
5998 	 * performance controller would never recommend execution on the P-cores.
5999 	 * If that policy changes in the future, this value should be changed.
6000 	 */
6001 	switch (qos) {
6002 	case THREAD_QOS_BACKGROUND:
6003 	case THREAD_QOS_MAINTENANCE:;
6004 		uint32_t low_cpu_count = ml_get_cpu_number_type(low_core_type, false, false);
6005 		uint32_t low_cluster_count = ml_get_cluster_number_type(low_core_type);
6006 		return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? low_cluster_count : low_cpu_count;
6007 	default:;
6008 		uint32_t total_cpus = ml_get_cpu_count();
6009 		uint32_t total_clusters = ml_get_cluster_count();
6010 		return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? total_clusters : total_cpus;
6011 	}
6012 }
6013 
6014 
6015 #endif /* CONFIG_SCHED_EDGE */
6016 
6017 #endif /* CONFIG_SCHED_CLUTCH */
6018