1 /*
2  * Copyright (c) 2018 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #if !SCHED_TEST_HARNESS
30 
31 #include <kern/debug.h>
32 #include <kern/kern_types.h>
33 #include <kern/machine.h>
34 #include <kern/misc_protos.h>
35 #include <kern/queue.h>
36 #include <kern/sched_clutch.h>
37 #include <kern/sched.h>
38 #include <kern/task.h>
39 #include <kern/thread.h>
40 
41 #include <mach/mach_types.h>
42 #include <mach/machine.h>
43 
44 #include <machine/atomic.h>
45 #include <machine/machine_cpu.h>
46 #include <machine/machine_routines.h>
47 #include <machine/sched_param.h>
48 
49 #include <sys/kdebug.h>
50 
51 #endif /* !SCHED_TEST_HARNESS */
52 
53 #include <kern/processor.h>
54 #include <kern/sched_prim.h>
55 #include <kern/sched_rt.h>
56 
57 #if CONFIG_SCHED_EDGE
58 #include <kern/sched_amp_common.h>
59 #endif /* CONFIG_SCHED_EDGE */
60 
61 #if CONFIG_SCHED_CLUTCH
62 
63 #if CONFIG_SCHED_SMT
64 #error "The clutch scheduler does not support CONFIG_SCHED_SMT."
65 #endif /* CONFIG_SCHED_SMT */
66 
67 #define SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION 1
68 typedef union {
69 	struct __attribute__((packed)) {
70 		unsigned int version                            : 4;
71 		unsigned int traverse_mode                      : 3;
72 		unsigned int cluster_id                         : 6;
73 		unsigned int selection_was_edf                  : 1;
74 		unsigned int selection_was_cluster_bound        : 1;
75 		unsigned int selection_opened_starvation_avoidance_window  : 1;
76 		unsigned int selection_opened_warp_window       : 1;
77 		unsigned int starvation_avoidance_window_close  : 12;
78 		unsigned int warp_window_close                  : 12;
79 		unsigned int reserved                           : 23;  /* For future usage */
80 	} trace_data;
81 	uint64_t scdts_trace_data_packed;
82 } sched_clutch_dbg_thread_select_packed_t;
83 
84 static_assert(TH_BUCKET_SCHED_MAX == 6, "Ensure layout of sched_clutch_dbg_thread_select_packed can fit root bucket bitmasks");
85 static_assert(sizeof(sched_clutch_dbg_thread_select_packed_t) <= sizeof(uint64_t), "Ensure sched_clutch_dbg_thread_select_packed_t can fit in one tracepoint argument");
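/*
 * A minimal sketch (illustrative only, not part of the scheduler) of how the
 * packed union above is meant to be consumed: the selection path fills in the
 * individual bitfields and then reads scdts_trace_data_packed so the whole
 * record fits in a single 64-bit tracepoint argument. The function name and
 * the literal values below are hypothetical.
 */
#if 0 /* illustrative only */
static uint64_t
example_pack_thread_select_debug_info(void)
{
	sched_clutch_dbg_thread_select_packed_t debug_info = {0};
	debug_info.trace_data.version = SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION;
	debug_info.trace_data.cluster_id = 2;         /* cluster the selection ran on */
	debug_info.trace_data.selection_was_edf = 1;  /* selection came from the EDF path */
	return debug_info.scdts_trace_data_packed;    /* emitted as one tracepoint argument */
}
#endif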
86 
87 /* Forward declarations of static routines */
88 
89 /* Root level hierarchy management */
90 static void sched_clutch_root_init(sched_clutch_root_t, processor_set_t);
91 static void sched_clutch_root_bucket_init(sched_clutch_root_bucket_t, sched_bucket_t, bool);
92 static void sched_clutch_root_pri_update(sched_clutch_root_t);
93 static void sched_clutch_root_urgency_inc(sched_clutch_root_t, thread_t);
94 static void sched_clutch_root_urgency_dec(sched_clutch_root_t, thread_t);
95 
96 __enum_decl(sched_clutch_highest_root_bucket_type_t, uint32_t, {
97 	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_NONE           = 0,
98 	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY   = 1,
99 	SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL            = 2,
100 });
101 __enum_decl(sched_clutch_traverse_mode_t, uint32_t, {
102 	SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY   = 0,
103 	SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT = 1,
104 	SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT           = 2,
105 });
106 static_assert(SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT < (1 << 3), "Ensure traverse mode can be encoded within 3 bits of sched_clutch_dbg_thread_select_packed_t");
107 static sched_clutch_root_bucket_t sched_clutch_root_highest_root_bucket(sched_clutch_root_t, uint64_t, sched_clutch_highest_root_bucket_type_t, sched_clutch_root_bucket_t, thread_t, bool *, sched_clutch_traverse_mode_t, sched_clutch_dbg_thread_select_packed_t *);
108 
109 /* Root bucket level hierarchy management */
110 static uint64_t sched_clutch_root_bucket_deadline_calculate(sched_clutch_root_bucket_t, uint64_t);
111 static void sched_clutch_root_bucket_deadline_update(sched_clutch_root_bucket_t, sched_clutch_root_t, uint64_t, bool);
112 static int sched_clutch_root_highest_runnable_qos(sched_clutch_root_t, sched_clutch_highest_root_bucket_type_t);
113 
114 /* Options for clutch bucket ordering in the runq */
115 __options_decl(sched_clutch_bucket_options_t, uint32_t, {
116 	SCHED_CLUTCH_BUCKET_OPTIONS_NONE        = 0x0,
117 	/* Round robin clutch bucket on thread removal */
118 	SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR  = 0x1,
119 	/* Insert clutch bucket at head (for thread preemption) */
120 	SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ       = 0x2,
121 	/* Insert clutch bucket at tail (default) */
122 	SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ       = 0x4,
123 });
124 
125 /* Clutch bucket level hierarchy management */
126 static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
127 static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
128 static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
129 static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
130 static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
131 static uint8_t sched_clutch_bucket_pri_calculate(sched_clutch_bucket_t, uint64_t);
132 
133 /* Clutch bucket group level properties management */
134 static void sched_clutch_bucket_group_cpu_usage_update(sched_clutch_bucket_group_t, uint64_t);
135 static void sched_clutch_bucket_group_cpu_adjust(sched_clutch_bucket_group_t, uint8_t);
136 static void sched_clutch_bucket_group_pri_shift_update(sched_clutch_bucket_group_t);
137 static uint8_t sched_clutch_bucket_group_pending_ageout(sched_clutch_bucket_group_t, uint64_t);
138 static uint32_t sched_clutch_bucket_group_run_count_inc(sched_clutch_bucket_group_t);
139 static uint32_t sched_clutch_bucket_group_run_count_dec(sched_clutch_bucket_group_t);
140 static uint8_t sched_clutch_bucket_group_interactivity_score_calculate(sched_clutch_bucket_group_t, uint64_t);
141 
142 /* Clutch timeshare properties updates */
143 static uint32_t sched_clutch_run_bucket_incr(sched_clutch_t, sched_bucket_t);
144 static uint32_t sched_clutch_run_bucket_decr(sched_clutch_t, sched_bucket_t);
145 
146 /* Clutch membership management */
147 static boolean_t sched_clutch_thread_insert(sched_clutch_root_t, thread_t, integer_t);
148 static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t, sched_clutch_bucket_options_t);
149 static thread_t sched_clutch_hierarchy_thread_highest(sched_clutch_root_t, processor_t, thread_t, sched_clutch_traverse_mode_t);
150 
151 /* Clutch properties updates */
152 static uint32_t sched_clutch_root_urgency(sched_clutch_root_t);
153 static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t);
154 static int sched_clutch_root_priority(sched_clutch_root_t);
155 static sched_clutch_bucket_t sched_clutch_root_bucket_highest_clutch_bucket(sched_clutch_root_t, sched_clutch_root_bucket_t, processor_t _Nullable processor, thread_t _Nullable prev_thread, bool *_Nullable chose_prev_thread);
156 
157 /* Clutch thread properties */
158 static boolean_t sched_thread_sched_pri_promoted(thread_t);
159 static inline sched_clutch_bucket_t sched_clutch_bucket_for_thread(sched_clutch_root_t, thread_t);
160 static inline sched_clutch_bucket_group_t sched_clutch_bucket_group_for_thread(thread_t);
161 
162 /* General utilities */
163 static inline bool sched_clutch_pri_greater_than_tiebreak(int, int, bool);
164 
165 #if CONFIG_SCHED_EDGE
166 
167 /* System based routines */
168 static uint32_t sched_edge_thread_bound_cluster_id(thread_t);
169 static bool sched_edge_pset_peek_steal_possible(processor_set_t, processor_set_t, bitmap_t);
170 
171 #endif /* CONFIG_SCHED_EDGE */
172 
173 /* Helper debugging routines */
174 static inline void sched_clutch_hierarchy_locked_assert(sched_clutch_root_t);
175 
176 /*
177  * Special markers for buckets that have invalid WCELs/quantums etc.
178  */
179 #define SCHED_CLUTCH_INVALID_TIME_32 ((uint32_t)~0)
180 #define SCHED_CLUTCH_INVALID_TIME_64 ((uint64_t)~0)
181 
182 /*
183  * Root level bucket WCELs
184  *
185  * The root level bucket selection algorithm is an Earliest Deadline
186  * First (EDF) algorithm where the deadline for a bucket is defined
187  * by its worst-case-execution-latency (WCEL) and the timestamp at
188  * which the bucket became runnable.
189  *
190  */
191 static uint32_t sched_clutch_root_bucket_wcel_us[TH_BUCKET_SCHED_MAX] = {
192 	SCHED_CLUTCH_INVALID_TIME_32,                   /* FIXPRI */
193 	0,                                              /* FG */
194 	37500,                                          /* IN (37.5ms) */
195 	75000,                                          /* DF (75ms) */
196 	150000,                                         /* UT (150ms) */
197 	250000                                          /* BG (250ms) */
198 };
199 static uint64_t sched_clutch_root_bucket_wcel[TH_BUCKET_SCHED_MAX] = {0};
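/*
 * A worked example (illustrative only) of how these WCELs feed the EDF
 * selection; sched_clutch_root_bucket_deadline_calculate() below performs the
 * real computation in absolute time units. The millisecond figures are shown
 * for intuition only.
 */
#if 0 /* illustrative only */
static uint64_t
example_edf_deadline(sched_bucket_t bucket, uint64_t runnable_timestamp)
{
	/*
	 * A timeshare root bucket that became runnable at time T gets deadline
	 * T + WCEL[bucket]. With the table above, an FG bucket runnable at
	 * T = 1000ms has deadline 1000ms (WCEL 0), while a UT bucket runnable
	 * at T = 900ms has deadline 900ms + 150ms = 1050ms, so FG is picked
	 * first even though UT became runnable earlier.
	 */
	return runnable_timestamp + sched_clutch_root_bucket_wcel[bucket];
}
#endif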
200 
201 /*
202  * Root level bucket warp
203  *
204  * Each root level bucket has a warp value associated with it as well.
205  * The warp value allows the root bucket to effectively warp ahead of
206  * lower priority buckets for a limited time even if it has a later
207  * deadline. The warping behavior provides extra (but limited)
208  * opportunity for high priority buckets to remain responsive.
209  */
210 
211 /* Special warp deadline value to indicate that the bucket has not used any warp yet */
212 #define SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED    (SCHED_CLUTCH_INVALID_TIME_64)
213 
214 /* Warp window durations for various tiers */
215 static uint32_t sched_clutch_root_bucket_warp_us[TH_BUCKET_SCHED_MAX] = {
216 	SCHED_CLUTCH_INVALID_TIME_32,                   /* FIXPRI */
217 	8000,                                           /* FG (8ms)*/
218 	4000,                                           /* IN (4ms) */
219 	2000,                                           /* DF (2ms) */
220 	1000,                                           /* UT (1ms) */
221 	0                                               /* BG (0ms) */
222 };
223 static uint64_t sched_clutch_root_bucket_warp[TH_BUCKET_SCHED_MAX] = {0};
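/*
 * Illustrative sketch (not the scheduler's actual helper) of how warp
 * interacts with the deadlines above: a higher QoS root bucket that loses the
 * EDF comparison may still run ahead of the EDF winner for up to its warp
 * budget before the lower QoS bucket is honored.
 */
#if 0 /* illustrative only */
static bool
example_bucket_may_warp(sched_clutch_root_bucket_t higher, sched_clutch_root_bucket_t edf_winner, uint64_t now)
{
	/*
	 * E.g. an FG bucket with 8ms of warp remaining can open a warp window
	 * (scrb_warped_deadline = now + scrb_warp_remaining) and be selected
	 * ahead of a UT bucket that currently holds the earliest deadline.
	 * Once the window expires, the FG bucket falls back to plain EDF order.
	 */
	return (higher->scrb_bucket < edf_winner->scrb_bucket) &&
	       (higher->scrb_warp_remaining > 0) &&
	       ((higher->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) ||
	       (higher->scrb_warped_deadline > now));
}
#endif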
224 
225 /*
226  * Thread level quantum
227  *
228  * The algorithm defines quantums for threads at various buckets. This
229  * (combined with the root level bucket quantums) restricts how much
230  * the lower priority levels can preempt the higher priority threads.
231  */
232 
233 #if XNU_TARGET_OS_OSX
234 static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
235 	10000,                                          /* FIXPRI (10ms) */
236 	10000,                                          /* FG (10ms) */
237 	10000,                                          /* IN (10ms) */
238 	10000,                                          /* DF (10ms) */
239 	4000,                                           /* UT (4ms) */
240 	2000                                            /* BG (2ms) */
241 };
242 #else /* XNU_TARGET_OS_OSX */
243 static uint32_t sched_clutch_thread_quantum_us[TH_BUCKET_SCHED_MAX] = {
244 	10000,                                          /* FIXPRI (10ms) */
245 	10000,                                          /* FG (10ms) */
246 	8000,                                           /* IN (8ms) */
247 	6000,                                           /* DF (6ms) */
248 	4000,                                           /* UT (4ms) */
249 	2000                                            /* BG (2ms) */
250 };
251 #endif /* XNU_TARGET_OS_OSX */
252 
253 static uint64_t sched_clutch_thread_quantum[TH_BUCKET_SCHED_MAX] = {0};
254 
255 /*
256  * sched_clutch_us_to_abstime()
257  *
258  * Initializer for converting all durations in usec to abstime
259  */
260 static void
261 sched_clutch_us_to_abstime(uint32_t *us_vals, uint64_t *abstime_vals)
262 {
263 	for (int i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
264 		if (us_vals[i] == SCHED_CLUTCH_INVALID_TIME_32) {
265 			abstime_vals[i] = SCHED_CLUTCH_INVALID_TIME_64;
266 		} else {
267 			clock_interval_to_absolutetime_interval(us_vals[i],
268 			    NSEC_PER_USEC, &abstime_vals[i]);
269 		}
270 	}
271 }
272 
273 /* Clutch/Edge Scheduler Debugging support */
274 #define SCHED_CLUTCH_DBG_THR_COUNT_PACK(a, b, c)        ((uint64_t)c | ((uint64_t)b << 16) | ((uint64_t)a << 32))
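/*
 * Example (illustrative only) of what the packing macro above produces: three
 * 16-bit counts become one 64-bit tracepoint argument, with `a` in the highest
 * occupied 16 bits and `c` in the lowest.
 */
#if 0 /* illustrative only */
static uint64_t
example_pack_thr_counts(void)
{
	uint16_t a = 3, b = 2, c = 1;
	/* (3 << 32) | (2 << 16) | 1 == 0x0000000300020001 */
	return SCHED_CLUTCH_DBG_THR_COUNT_PACK(a, b, c);
}
#endif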
275 
276 #if DEVELOPMENT || DEBUG
277 
278 kern_return_t
279 sched_clutch_thread_group_cpu_time_for_thread(thread_t thread, int sched_bucket, uint64_t *cpu_stats)
280 {
281 	if (sched_bucket < 0 || sched_bucket >= TH_BUCKET_MAX) {
282 		return KERN_INVALID_ARGUMENT;
283 	}
284 	sched_clutch_bucket_group_t clutch_bucket_group = &sched_clutch_for_thread(thread)->sc_clutch_groups[sched_bucket];
285 	sched_clutch_bucket_cpu_data_t scb_cpu_data;
286 	scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, relaxed);
287 	cpu_stats[0] = scb_cpu_data.cpu_data.scbcd_cpu_used;
288 	cpu_stats[1] = scb_cpu_data.cpu_data.scbcd_cpu_blocked;
289 	return KERN_SUCCESS;
290 }
291 
292 /*
293  * sched_clutch_hierarchy_locked_assert()
294  *
295  * Debugging helper routine. Asserts that the hierarchy is locked. The locking
296  * for the hierarchy depends on where the hierarchy is hooked. The current
297  * implementation hooks the hierarchy at the pset, so the hierarchy is locked
298  * using the pset lock.
299  */
300 static inline void
301 sched_clutch_hierarchy_locked_assert(
302 	sched_clutch_root_t root_clutch)
303 {
304 	pset_assert_locked(root_clutch->scr_pset);
305 }
306 
307 #else /* DEVELOPMENT || DEBUG */
308 
309 static inline void
310 sched_clutch_hierarchy_locked_assert(
311 	__unused sched_clutch_root_t root_clutch)
312 {
313 }
314 
315 #endif /* DEVELOPMENT || DEBUG */
316 
317 /*
318  * sched_clutch_thr_count_inc()
319  *
320  * Increment thread count at a hierarchy level with overflow checks.
321  */
322 static void
323 sched_clutch_thr_count_inc(
324 	uint16_t *thr_count)
325 {
326 	if (__improbable(os_inc_overflow(thr_count))) {
327 		panic("sched_clutch thread count overflowed!");
328 	}
329 }
330 
331 /*
332  * sched_clutch_thr_count_dec()
333  *
334  * Decrement thread count at a hierarchy level with underflow checks.
335  */
336 static void
337 sched_clutch_thr_count_dec(
338 	uint16_t *thr_count)
339 {
340 	if (__improbable(os_dec_overflow(thr_count))) {
341 		panic("sched_clutch thread count underflowed!");
342 	}
343 }
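/*
 * The two helpers above rely on os_inc_overflow()/os_dec_overflow() reporting
 * when the 16-bit counter would wrap. A minimal sketch of the semantics being
 * relied upon, assuming the helpers behave like the compiler builtin shown
 * below (the function name is hypothetical):
 */
#if 0 /* illustrative only */
static bool
example_inc_with_overflow_check(uint16_t *thr_count)
{
	uint16_t incremented;
	bool overflowed = __builtin_add_overflow(*thr_count, 1, &incremented);
	if (!overflowed) {
		*thr_count = incremented;
	}
	return overflowed; /* caller panics when this is true */
}
#endif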
344 
345 static sched_bucket_t
346 sched_convert_pri_to_bucket(uint8_t priority)
347 {
348 	sched_bucket_t bucket = TH_BUCKET_RUN;
349 
350 	if (priority > BASEPRI_USER_INITIATED) {
351 		bucket = TH_BUCKET_SHARE_FG;
352 	} else if (priority > BASEPRI_DEFAULT) {
353 		bucket = TH_BUCKET_SHARE_IN;
354 	} else if (priority > BASEPRI_UTILITY) {
355 		bucket = TH_BUCKET_SHARE_DF;
356 	} else if (priority > MAXPRI_THROTTLE) {
357 		bucket = TH_BUCKET_SHARE_UT;
358 	} else {
359 		bucket = TH_BUCKET_SHARE_BG;
360 	}
361 	return bucket;
362 }
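/*
 * Worked examples of the mapping above (illustrative only): a priority just
 * above BASEPRI_USER_INITIATED lands in FG, one just above BASEPRI_DEFAULT in
 * IN, and anything at or below MAXPRI_THROTTLE in BG.
 */
#if 0 /* illustrative only */
static void
example_pri_to_bucket(void)
{
	assert(sched_convert_pri_to_bucket(BASEPRI_USER_INITIATED + 1) == TH_BUCKET_SHARE_FG);
	assert(sched_convert_pri_to_bucket(BASEPRI_DEFAULT + 1) == TH_BUCKET_SHARE_IN);
	assert(sched_convert_pri_to_bucket(BASEPRI_UTILITY + 1) == TH_BUCKET_SHARE_DF);
	assert(sched_convert_pri_to_bucket(MAXPRI_THROTTLE + 1) == TH_BUCKET_SHARE_UT);
	assert(sched_convert_pri_to_bucket(MAXPRI_THROTTLE) == TH_BUCKET_SHARE_BG);
}
#endif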
363 
364 /*
365  * sched_clutch_thread_bucket_map()
366  *
367  * Map a thread to a scheduling bucket for the clutch/edge scheduler
368  * based on its scheduling mode and the priority attribute passed in.
369  */
370 static sched_bucket_t
371 sched_clutch_thread_bucket_map(thread_t thread, int pri)
372 {
373 	switch (thread->sched_mode) {
374 	case TH_MODE_FIXED:
375 		if (pri >= BASEPRI_FOREGROUND) {
376 			return TH_BUCKET_FIXPRI;
377 		} else {
378 			return sched_convert_pri_to_bucket(pri);
379 		}
380 
381 	case TH_MODE_REALTIME:
382 		return TH_BUCKET_FIXPRI;
383 
384 	case TH_MODE_TIMESHARE:
385 		return sched_convert_pri_to_bucket(pri);
386 
387 	default:
388 		panic("unexpected mode: %d", thread->sched_mode);
389 		break;
390 	}
391 }
392 
393 /*
394  * The clutch scheduler attempts to age out the CPU usage of clutch bucket groups
395  * based on the amount of time they have been pending and the load at that
396  * scheduling bucket level. Since the clutch bucket groups are global (i.e. they span
397  * multiple clusters), it is important to keep the load as a global counter as well.
398  */
399 static uint32_t _Atomic sched_clutch_global_bucket_load[TH_BUCKET_SCHED_MAX];
400 
401 /*
402  * sched_clutch_root_init()
403  *
404  * Routine to initialize the scheduler hierarchy root.
405  */
406 static void
407 sched_clutch_root_init(
408 	sched_clutch_root_t root_clutch,
409 	processor_set_t pset)
410 {
411 	root_clutch->scr_thr_count = 0;
412 	root_clutch->scr_priority = NOPRI;
413 	root_clutch->scr_urgency = 0;
414 	root_clutch->scr_pset = pset;
415 #if CONFIG_SCHED_EDGE
416 	root_clutch->scr_cluster_id = pset->pset_cluster_id;
417 	for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
418 		root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type] = 0;
419 	}
420 	/* Initialize the silos for tracking steal eligibility */
421 	bitmap_zero((bitmap_t *)root_clutch->scr_populated_steal_silos, MAX_PSETS);
422 	for (pset_id_t p = 0; p < MAX_PSETS; p++) {
423 		bitmap_zero((bitmap_t *)root_clutch->scr_steal_silos[p].sess_populated_steal_queues, TH_BUCKET_SCHED_MAX);
424 		for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
425 			priority_queue_init(&root_clutch->scr_steal_silos[p].sess_steal_queues[bucket]);
426 		}
427 	}
428 #else /* CONFIG_SCHED_EDGE */
429 	root_clutch->scr_cluster_id = 0;
430 #endif /* CONFIG_SCHED_EDGE */
431 
432 	/* Initialize the queue which maintains all runnable clutch_buckets for timesharing purposes */
433 	queue_init(&root_clutch->scr_clutch_buckets);
434 
435 	bzero(&root_clutch->scr_cumulative_run_count, sizeof(root_clutch->scr_cumulative_run_count));
436 	bitmap_zero(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
437 	bitmap_zero(root_clutch->scr_bound_warp_available, TH_BUCKET_SCHED_MAX);
438 	priority_queue_init(&root_clutch->scr_bound_root_buckets);
439 
440 	/* Initialize the bitmap and priority queue of runnable root buckets */
441 	priority_queue_init(&root_clutch->scr_unbound_root_buckets);
442 	bitmap_zero(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
443 	bitmap_zero(root_clutch->scr_unbound_warp_available, TH_BUCKET_SCHED_MAX);
444 
445 	/* Initialize all the root buckets */
446 	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
447 		sched_clutch_root_bucket_init(&root_clutch->scr_unbound_buckets[i], i, false);
448 		sched_clutch_root_bucket_init(&root_clutch->scr_bound_buckets[i], i, true);
449 	}
450 }
451 
452 /*
453  * Clutch Bucket Runqueues
454  *
455  * The clutch buckets are maintained in a runq at the root bucket level. The
456  * runq organization allows clutch buckets to be ordered based on various
457  * factors such as:
458  *
459  * - Clutch buckets are round robin'ed at the same priority level when a
460  *   thread is selected from a clutch bucket. This prevents a clutch bucket
461  *   from starving out other clutch buckets at the same priority.
462  *
463  * - Clutch buckets are inserted at the head when they become runnable due to
464  *   thread preemption. This allows threads that were preempted to maintain
465  *   their order in the queue.
466  */
467 
468 /*
469  * sched_clutch_bucket_runq_init()
470  *
471  * Initialize a clutch bucket runq.
472  */
473 static void
474 sched_clutch_bucket_runq_init(
475 	sched_clutch_bucket_runq_t clutch_buckets_rq)
476 {
477 	clutch_buckets_rq->scbrq_highq = NOPRI;
478 	for (uint8_t i = 0; i < BITMAP_LEN(NRQS); i++) {
479 		clutch_buckets_rq->scbrq_bitmap[i] = 0;
480 	}
481 	clutch_buckets_rq->scbrq_count = 0;
482 	for (int i = 0; i < NRQS; i++) {
483 		circle_queue_init(&clutch_buckets_rq->scbrq_queues[i]);
484 	}
485 }
486 
487 /*
488  * sched_clutch_bucket_runq_empty()
489  *
490  * Returns whether a clutch bucket runq is empty.
491  */
492 static boolean_t
493 sched_clutch_bucket_runq_empty(
494 	sched_clutch_bucket_runq_t clutch_buckets_rq)
495 {
496 	return clutch_buckets_rq->scbrq_count == 0;
497 }
498 
499 /*
500  * sched_clutch_bucket_runq_peek()
501  *
502  * Returns the highest priority clutch bucket in the runq.
503  */
504 static sched_clutch_bucket_t
505 sched_clutch_bucket_runq_peek(
506 	sched_clutch_bucket_runq_t clutch_buckets_rq)
507 {
508 	if (clutch_buckets_rq->scbrq_count > 0) {
509 		circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_buckets_rq->scbrq_highq];
510 		return cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink);
511 	} else {
512 		return NULL;
513 	}
514 }
515 
516 /*
517  * sched_clutch_bucket_runq_enqueue()
518  *
519  * Enqueue a clutch bucket into the runq based on the options passed in.
520  */
521 static void
522 sched_clutch_bucket_runq_enqueue(
523 	sched_clutch_bucket_runq_t clutch_buckets_rq,
524 	sched_clutch_bucket_t clutch_bucket,
525 	sched_clutch_bucket_options_t options)
526 {
527 	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
528 	if (circle_queue_empty(queue)) {
529 		circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
530 		bitmap_set(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
531 		if (clutch_bucket->scb_priority > clutch_buckets_rq->scbrq_highq) {
532 			clutch_buckets_rq->scbrq_highq = clutch_bucket->scb_priority;
533 		}
534 	} else {
535 		if (options & SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ) {
536 			circle_enqueue_head(queue, &clutch_bucket->scb_runqlink);
537 		} else {
538 			/*
539 			 * Default behavior (handles SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ &
540 			 * SCHED_CLUTCH_BUCKET_OPTIONS_NONE)
541 			 */
542 			circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
543 		}
544 	}
545 	clutch_buckets_rq->scbrq_count++;
546 }
547 
548 /*
549  * sched_clutch_bucket_runq_remove()
550  *
551  * Remove a clutch bucket from the runq.
552  */
553 static void
554 sched_clutch_bucket_runq_remove(
555 	sched_clutch_bucket_runq_t clutch_buckets_rq,
556 	sched_clutch_bucket_t clutch_bucket)
557 {
558 	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
559 	circle_dequeue(queue, &clutch_bucket->scb_runqlink);
560 	assert(clutch_buckets_rq->scbrq_count > 0);
561 	clutch_buckets_rq->scbrq_count--;
562 	if (circle_queue_empty(queue)) {
563 		bitmap_clear(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
564 		clutch_buckets_rq->scbrq_highq = bitmap_first(clutch_buckets_rq->scbrq_bitmap, NRQS);
565 	}
566 }
567 
568 static void
569 sched_clutch_bucket_runq_rotate(
570 	sched_clutch_bucket_runq_t clutch_buckets_rq,
571 	sched_clutch_bucket_t clutch_bucket)
572 {
573 	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
574 	assert(clutch_bucket == cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink));
575 	circle_queue_rotate_head_forward(queue);
576 }
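/*
 * Illustrative sketch of how the runq primitives above combine to implement
 * the same-priority round robin described earlier: the caller peeks the
 * highest clutch bucket and, after taking a thread from it, rotates that
 * priority level so a different bucket at the same priority is considered
 * first next time. (example_* is a hypothetical caller, not part of this file.)
 */
#if 0 /* illustrative only */
static sched_clutch_bucket_t
example_pick_and_round_robin(sched_clutch_bucket_runq_t runq)
{
	sched_clutch_bucket_t clutch_bucket = sched_clutch_bucket_runq_peek(runq);
	if (clutch_bucket != NULL) {
		/* Move the chosen bucket behind its same-priority peers */
		sched_clutch_bucket_runq_rotate(runq, clutch_bucket);
	}
	return clutch_bucket;
}
#endif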
577 
578 /*
579  * sched_clutch_root_bucket_init()
580  *
581  * Routine to initialize root buckets.
582  */
583 static void
584 sched_clutch_root_bucket_init(
585 	sched_clutch_root_bucket_t root_bucket,
586 	sched_bucket_t bucket,
587 	bool bound_root_bucket)
588 {
589 	root_bucket->scrb_bucket = bucket;
590 	if (bound_root_bucket) {
591 		/* For bound root buckets, initialize the bound thread runq. */
592 		root_bucket->scrb_bound = true;
593 		run_queue_init(&root_bucket->scrb_bound_thread_runq);
594 	} else {
595 		/*
596 		 * The unbounded root buckets contain a runq of runnable clutch buckets
597 		 * which then hold the runnable threads.
598 		 */
599 		root_bucket->scrb_bound = false;
600 		sched_clutch_bucket_runq_init(&root_bucket->scrb_clutch_buckets);
601 	}
602 	priority_queue_entry_init(&root_bucket->scrb_pqlink);
603 	root_bucket->scrb_pqlink.deadline = 0;
604 	root_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
605 	root_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[root_bucket->scrb_bucket];
606 	root_bucket->scrb_starvation_avoidance = false;
607 	root_bucket->scrb_starvation_ts = 0;
608 }
609 
610 /*
611  * Special case scheduling for Above UI bucket.
612  *
613  * AboveUI threads are typically system-critical threads that need low latency,
614  * which is why they are handled specially.
615  *
616  * Since the priority ranges for the AboveUI and FG Timeshare buckets overlap, it is
617  * important to maintain some native priority order between those buckets. For unbounded
618  * root buckets, the policy is to compare the highest clutch buckets of both root buckets; if the
619  * Above UI bucket is higher, schedule it immediately. Otherwise fall through to the
620  * deadline based scheduling, which should pick up the timeshare buckets. For the bound
621  * case, the policy simply compares the priority of the highest runnable threads in
622  * the above UI and timeshare buckets.
623  *
624  * The implementation allows extremely low latency CPU access for Above UI threads
625  * while supporting the use case of high priority timeshare threads contending with
626  * lower priority fixed priority threads.
627  */
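/*
 * The tie-breaking described above is funneled through
 * sched_clutch_pri_greater_than_tiebreak(). Its definition is not in this
 * excerpt; a minimal sketch of the comparison it is assumed to perform, based
 * on how the callers below use it, would be:
 */
#if 0 /* illustrative only; assumed shape, not the actual definition */
static inline bool
example_pri_greater_than_tiebreak(int pri, int other_pri, bool pri_wins_ties)
{
	return (pri > other_pri) || (pri_wins_ties && (pri == other_pri));
}
#endif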
628 
629 
630 /*
631  * sched_clutch_root_unbound_select_aboveui()
632  *
633  * Routine to determine if the above UI unbounded bucket should be selected for execution.
634  *
635  * Writes the highest unbound (timeshare FG vs. above UI) bucket, its priority, and whether
636  * it is an above UI bucket into the pointer parameters.
637  */
638 static void
639 sched_clutch_root_unbound_select_aboveui(
640 	sched_clutch_root_t root_clutch,
641 	sched_clutch_root_bucket_t *highest_bucket,
642 	int *highest_pri,
643 	bool *highest_is_aboveui,
644 	sched_clutch_root_bucket_t _Nullable prev_bucket,
645 	thread_t _Nullable prev_thread)
646 {
647 	/* First determine the highest Clutch bucket */
648 	sched_clutch_root_bucket_t higher_root_bucket = NULL;
649 	sched_clutch_bucket_t higher_clutch_bucket = NULL;
650 	int higher_bucket_sched_pri = -1;
651 	bool higher_is_aboveui = false;
652 	/* Consider unbound Above UI */
653 	if (bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_FIXPRI)) {
654 		higher_root_bucket = &root_clutch->scr_unbound_buckets[TH_BUCKET_FIXPRI];
655 		higher_clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, higher_root_bucket, NULL, NULL, NULL);
656 		higher_bucket_sched_pri = priority_queue_max_sched_pri(&higher_clutch_bucket->scb_clutchpri_prioq);
657 		higher_is_aboveui = true;
658 	}
659 	/* Consider unbound Timeshare FG */
660 	if (bitmap_test(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SHARE_FG)) {
661 		sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_unbound_buckets[TH_BUCKET_SHARE_FG];
662 		sched_clutch_bucket_t clutch_bucket_sharefg = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, root_bucket_sharefg, NULL, NULL, NULL);
663 		/* Strict greater-than because unbound timeshare FG root bucket loses all priority ties at this level */
664 		if (higher_root_bucket == NULL || clutch_bucket_sharefg->scb_priority > higher_clutch_bucket->scb_priority) {
665 			higher_root_bucket = root_bucket_sharefg;
666 			higher_clutch_bucket = clutch_bucket_sharefg;
667 			higher_bucket_sched_pri = priority_queue_max_sched_pri(&higher_clutch_bucket->scb_clutchpri_prioq);
668 			higher_is_aboveui = false;
669 		}
670 	}
671 	/* Consider the previous thread */
672 	if (prev_thread != NULL) {
673 		assert(prev_bucket->scrb_bound == false);
674 		sched_clutch_bucket_group_t prev_clutch_bucket_group = sched_clutch_bucket_group_for_thread(prev_thread);
675 		int prev_clutch_bucket_pri = prev_thread->sched_pri + (int)(os_atomic_load(&prev_clutch_bucket_group->scbg_interactivity_data.scct_count, relaxed));
676 		sched_clutch_bucket_t prev_clutch_bucket = sched_clutch_bucket_for_thread(root_clutch, prev_thread);
677 		bool prev_bucket_should_win_ties = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI && higher_is_aboveui == false;
678 		if (higher_clutch_bucket == NULL ||
679 		    sched_clutch_pri_greater_than_tiebreak(prev_clutch_bucket_pri, higher_clutch_bucket->scb_priority, prev_bucket_should_win_ties)) {
680 			higher_root_bucket = prev_bucket;
681 			higher_clutch_bucket = prev_clutch_bucket;
682 			higher_bucket_sched_pri = prev_thread->sched_pri;
683 			higher_is_aboveui = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI;
684 		}
685 	}
686 	/* Compare highest priority in the highest unbound Clutch bucket to highest priority seen from the bound buckets */
687 	if (higher_root_bucket != NULL) {
688 		bool unbound_should_win_ties = higher_is_aboveui == true && *highest_is_aboveui == false;
689 		if (sched_clutch_pri_greater_than_tiebreak(higher_bucket_sched_pri, *highest_pri, unbound_should_win_ties)) {
690 			*highest_pri = higher_bucket_sched_pri;
691 			*highest_bucket = higher_root_bucket;
692 			*highest_is_aboveui = higher_is_aboveui;
693 		}
694 	}
695 }
696 
697 /*
698  * sched_clutch_root_bound_select_aboveui()
699  *
700  * Routine to determine if the above UI bounded bucket should be selected for execution.
701  *
702  * Writes the highest bound (timeshare FG vs. above UI) bucket, its priority, and whether
703  * it is an above UI bucket into the pointer parameters.
704  */
705 static void
706 sched_clutch_root_bound_select_aboveui(
707 	sched_clutch_root_t root_clutch,
708 	sched_clutch_root_bucket_t *highest_bucket,
709 	int *highest_pri,
710 	bool *highest_is_aboveui,
711 	sched_clutch_root_bucket_t _Nullable prev_bucket,
712 	thread_t _Nullable prev_thread)
713 {
714 	/* Consider bound Above UI */
715 	sched_clutch_root_bucket_t root_bucket_aboveui = &root_clutch->scr_bound_buckets[TH_BUCKET_FIXPRI];
716 	if (bitmap_test(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_FIXPRI) &&
717 	    sched_clutch_pri_greater_than_tiebreak(root_bucket_aboveui->scrb_bound_thread_runq.highq, *highest_pri, *highest_is_aboveui == false)) {
718 		*highest_pri = root_bucket_aboveui->scrb_bound_thread_runq.highq;
719 		*highest_bucket = root_bucket_aboveui;
720 		*highest_is_aboveui = true;
721 	}
722 	/* Consider bound Timeshare FG */
723 	sched_clutch_root_bucket_t root_bucket_sharefg = &root_clutch->scr_bound_buckets[TH_BUCKET_SHARE_FG];
724 	if (bitmap_test(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SHARE_FG) &&
725 	    sched_clutch_pri_greater_than_tiebreak(root_bucket_sharefg->scrb_bound_thread_runq.highq, *highest_pri, false)) {
726 		*highest_pri = root_bucket_sharefg->scrb_bound_thread_runq.highq;
727 		*highest_bucket = root_bucket_sharefg;
728 		*highest_is_aboveui = false;
729 	}
730 	/* Consider the previous thread */
731 	if (prev_thread != NULL) {
732 		assert(prev_bucket->scrb_bound == true);
733 		bool prev_bucket_should_win_ties = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI && *highest_is_aboveui == false;
734 		if (sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, *highest_pri, prev_bucket_should_win_ties)) {
735 			*highest_pri = prev_thread->sched_pri;
736 			*highest_bucket = prev_bucket;
737 			*highest_is_aboveui = prev_bucket->scrb_bucket == TH_BUCKET_FIXPRI;
738 		}
739 	}
740 }
741 
742 /*
743  * sched_clutch_root_highest_runnable_qos()
744  *
745  * Returns the index of the highest-QoS root bucket which is currently runnable.
746  */
747 static int
748 sched_clutch_root_highest_runnable_qos(
749 	sched_clutch_root_t root_clutch,
750 	sched_clutch_highest_root_bucket_type_t type)
751 {
752 	int highest_unbound_bucket = bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
753 	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
754 		return highest_unbound_bucket;
755 	}
756 	assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
757 	int highest_bound_bucket = bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX);
758 	if (highest_bound_bucket == -1) {
759 		return highest_unbound_bucket;
760 	}
761 	if (highest_unbound_bucket == -1) {
762 		return highest_bound_bucket;
763 	}
764 	/* Both bound and unbound buckets are runnable, return the higher QoS */
765 	return MIN(highest_bound_bucket, highest_unbound_bucket);
766 }
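/*
 * Illustrative note on the MIN() above: root bucket indices grow from
 * TH_BUCKET_FIXPRI (highest QoS) to TH_BUCKET_SHARE_BG (lowest QoS), so the
 * numerically smaller runnable index is the higher QoS.
 */
#if 0 /* illustrative only */
static void
example_highest_runnable_qos(void)
{
	/* Bound DF runnable, unbound FG runnable -> FG (smaller index) wins */
	int highest_bound_bucket = TH_BUCKET_SHARE_DF;
	int highest_unbound_bucket = TH_BUCKET_SHARE_FG;
	assert(MIN(highest_bound_bucket, highest_unbound_bucket) == TH_BUCKET_SHARE_FG);
}
#endif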
767 
768 /*
769  * sched_clutch_root_highest_aboveui_root_bucket()
770  *
771  * Routine to determine if an above UI root bucket should be selected for execution.
772  *
773  * Returns the root bucket if we should run an above UI bucket or NULL otherwise.
774  */
775 static sched_clutch_root_bucket_t
776 sched_clutch_root_highest_aboveui_root_bucket(
777 	sched_clutch_root_t root_clutch,
778 	sched_clutch_highest_root_bucket_type_t type,
779 	sched_clutch_root_bucket_t _Nullable prev_bucket,
780 	thread_t _Nullable prev_thread,
781 	bool *chose_prev_thread)
782 {
783 	assert((prev_thread == NULL && prev_bucket == NULL) || (prev_thread != NULL && prev_bucket != NULL));
784 	assert((type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL) || (prev_bucket == NULL));
785 
786 	sched_clutch_root_bucket_t highest_bucket = NULL;
787 	int highest_pri = -1;
788 	bool highest_is_aboveui = false;
789 
790 	/* Forward previous thread to the correct comparison logic, based on boundness */
791 	sched_clutch_root_bucket_t bound_prev_bucket = NULL, unbound_prev_bucket = NULL;
792 	thread_t bound_prev_thread = NULL, unbound_prev_thread = NULL;
793 	if (prev_thread != NULL) {
794 		if (prev_bucket->scrb_bound) {
795 			bound_prev_bucket = prev_bucket;
796 			bound_prev_thread = prev_thread;
797 		} else {
798 			unbound_prev_bucket = prev_bucket;
799 			unbound_prev_thread = prev_thread;
800 		}
801 	}
802 
803 	/* Consider bound Above UI vs. Timeshare FG first, so those buckets will win ties against the corresponding unbound buckets */
804 	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL) {
805 		sched_clutch_root_bound_select_aboveui(root_clutch, &highest_bucket, &highest_pri, &highest_is_aboveui, bound_prev_bucket, bound_prev_thread);
806 	}
807 
808 	/* Consider unbound Above UI vs. Timeshare FG */
809 	sched_clutch_root_unbound_select_aboveui(root_clutch, &highest_bucket, &highest_pri, &highest_is_aboveui, unbound_prev_bucket, unbound_prev_thread);
810 	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
811 		return highest_is_aboveui ? highest_bucket : NULL;
812 	}
813 	assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
814 
815 	/* Determine whether we already know to continue running the previous thread */
816 	if (prev_thread != NULL &&
817 	    bitmap_test(highest_bucket->scrb_bound ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap, highest_bucket->scrb_bucket) == false) {
818 		/* Highest bucket we saw is empty, so the previous thread must have been the highest */
819 		assert(highest_bucket == prev_bucket);
820 		*chose_prev_thread = true;
821 	}
822 
823 	return highest_is_aboveui ? highest_bucket : NULL;
824 }
825 
826 /*
827  * sched_clutch_root_highest_root_bucket()
828  *
829  * Main routine to find the highest runnable root level bucket.
830  * This routine is called from performance-sensitive contexts, so it is
831  * crucial to keep this O(1). The type parameter determines whether
832  * the selection logic should look at unbounded threads only (for
833  * cross-cluster stealing operations) or at both bounded and unbounded
834  * threads (for selecting the next thread to execute on the current cluster).
835  */
836 static sched_clutch_root_bucket_t
837 sched_clutch_root_highest_root_bucket(
838 	sched_clutch_root_t root_clutch,
839 	uint64_t timestamp,
840 	sched_clutch_highest_root_bucket_type_t type,
841 	sched_clutch_root_bucket_t _Nullable prev_bucket,
842 	thread_t _Nullable prev_thread,
843 	bool *chose_prev_thread,
844 	sched_clutch_traverse_mode_t mode,
845 	sched_clutch_dbg_thread_select_packed_t *debug_info)
846 {
847 	assert((prev_thread == NULL && prev_bucket == NULL) || (prev_thread != NULL && prev_bucket != NULL));
848 	assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL || (prev_thread == NULL));
849 	assert(prev_thread == NULL || (mode != SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY));
850 	sched_clutch_hierarchy_locked_assert(root_clutch);
851 
852 	int highest_runnable_bucket = sched_clutch_root_highest_runnable_qos(root_clutch, type);
853 	if (highest_runnable_bucket == -1) {
854 		/*
855 		 * The Clutch hierarchy has no runnable threads. We can continue running
856 		 * whatever was running previously.
857 		 */
858 		assert(sched_clutch_root_count(root_clutch) == 0 || type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY);
859 		*chose_prev_thread = true;
860 		if (prev_thread != NULL) {
861 			debug_info->trace_data.selection_was_edf = true;
862 		}
863 		return prev_bucket;
864 	}
865 
866 	/* Consider Above UI threads, in comparison to Timeshare FG threads */
867 	sched_clutch_root_bucket_t highest_aboveui_bucket = sched_clutch_root_highest_aboveui_root_bucket(root_clutch, type, prev_bucket, prev_thread, chose_prev_thread);
868 	if (highest_aboveui_bucket != NULL) {
869 		debug_info->trace_data.selection_was_edf = true;
870 		return highest_aboveui_bucket;
871 	}
872 
873 	/*
874 	 * Above UI bucket is not runnable or has a low priority runnable thread; use the
875 	 * earliest deadline model to schedule threads. The idea is that as the timeshare
876 	 * buckets use CPU, they will drop their interactivity score/sched priority and
877 	 * allow the low priority AboveUI buckets to be scheduled.
878 	 */
879 
880 	/* Find the earliest deadline bucket */
881 	sched_clutch_root_bucket_t edf_bucket;
882 	bool edf_bucket_enqueued_normally;
883 
884 evaluate_root_buckets:
885 	edf_bucket = NULL;
886 	edf_bucket_enqueued_normally = true;
887 
888 	if (type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_UNBOUND_ONLY) {
889 		edf_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
890 	} else {
891 		assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
892 		sched_clutch_root_bucket_t unbound_bucket = priority_queue_min(&root_clutch->scr_unbound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
893 		sched_clutch_root_bucket_t bound_bucket = priority_queue_min(&root_clutch->scr_bound_root_buckets, struct sched_clutch_root_bucket, scrb_pqlink);
894 		if (bound_bucket && unbound_bucket) {
895 			/* If bound and unbound root buckets are runnable, select the one with the earlier deadline */
896 			edf_bucket = (bound_bucket->scrb_pqlink.deadline <= unbound_bucket->scrb_pqlink.deadline) ? bound_bucket : unbound_bucket;
897 		} else {
898 			edf_bucket = (bound_bucket) ? bound_bucket : unbound_bucket;
899 		}
900 	}
901 	if (edf_bucket == NULL) {
902 		/* The timeshare portion of the runqueue is empty */
903 		assert(type == SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL);
904 		assert(prev_thread != NULL);
905 		*chose_prev_thread = true;
906 		if (prev_thread != NULL) {
907 			debug_info->trace_data.selection_was_edf = true;
908 		}
909 		return prev_bucket;
910 	}
911 	if (prev_bucket != NULL && prev_bucket->scrb_pqlink.deadline < edf_bucket->scrb_pqlink.deadline) {
912 		/* The previous thread's root bucket has the earliest deadline and is not currently enqueued */
913 		edf_bucket = prev_bucket;
914 		edf_bucket_enqueued_normally = false;
915 	}
916 
917 	if (edf_bucket->scrb_starvation_avoidance) {
918 		/* Check if the EDF bucket is in an expired starvation avoidance window */
919 		uint64_t starvation_window = sched_clutch_thread_quantum[edf_bucket->scrb_bucket];
920 		if (timestamp >= (edf_bucket->scrb_starvation_ts + starvation_window)) {
921 			/* Starvation avoidance window is over; update deadline and re-evaluate EDF */
922 			edf_bucket->scrb_starvation_avoidance = false;
923 			edf_bucket->scrb_starvation_ts = 0;
924 			sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp, edf_bucket_enqueued_normally);
925 			bit_set(debug_info->trace_data.starvation_avoidance_window_close, edf_bucket->scrb_bound * TH_BUCKET_SCHED_MAX + edf_bucket->scrb_bucket);
926 			goto evaluate_root_buckets;
927 		}
928 	}
929 
930 	/*
931 	 * Check if any of the buckets have warp available. The implementation only allows root buckets to warp ahead of
932 	 * buckets of the same type (i.e. bound/unbound). That is because warping only
933 	 * makes sense between root buckets of the same type, since it is effectively a scheduling advantage over a lower
934 	 * QoS root bucket.
935 	 */
936 	bitmap_t *warp_available_bitmap = (edf_bucket->scrb_bound) ? (root_clutch->scr_bound_warp_available) : (root_clutch->scr_unbound_warp_available);
937 	int warp_bucket_index = bitmap_lsb_first(warp_available_bitmap, TH_BUCKET_SCHED_MAX);
938 
939 	/* Allow the prev_bucket to use its warp as well */
940 	bool prev_bucket_warping = (prev_bucket != NULL) && (prev_bucket->scrb_bound == edf_bucket->scrb_bound) &&
941 	    prev_bucket->scrb_bucket < edf_bucket->scrb_bucket && (prev_bucket->scrb_warp_remaining > 0) &&
942 	    (warp_bucket_index == -1 || prev_bucket->scrb_bucket < warp_bucket_index);
943 
944 	bool non_edf_bucket_can_warp = (warp_bucket_index != -1 && warp_bucket_index < edf_bucket->scrb_bucket) || prev_bucket_warping;
945 
946 	if (non_edf_bucket_can_warp == false) {
947 		/* No higher buckets have warp left; best choice is the EDF based bucket */
948 		debug_info->trace_data.selection_was_edf = true;
949 
950 		bool should_update_edf_starvation_state = edf_bucket == prev_bucket || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT;
951 		if (edf_bucket->scrb_starvation_avoidance == false && should_update_edf_starvation_state) {
952 			/* Looks like the EDF bucket is not in starvation avoidance mode; check if it should be */
953 			if (highest_runnable_bucket < edf_bucket->scrb_bucket || (prev_bucket != NULL && prev_bucket->scrb_bucket < edf_bucket->scrb_bucket)) {
954 				/*
955 				 * Since a higher bucket is runnable, it indicates that the EDF bucket should be in starvation avoidance.
956 				 *
957 				 * The starvation avoidance window is allocated as a single quantum for the starved bucket, enforced
958 				 * simultaneously across all CPUs in the cluster. The idea is to grant the starved bucket roughly one
959 				 * quantum per core, each time the bucket reaches the earliest deadline position. Note that this
960 				 * cadence is driven by the difference between the starved bucket's and highest-runnable bucket's WCELs.
961 				 */
962 				edf_bucket->scrb_starvation_avoidance = true;
963 				edf_bucket->scrb_starvation_ts = timestamp;
964 				debug_info->trace_data.selection_opened_starvation_avoidance_window = true;
965 			} else {
966 				/* EDF bucket is being selected in the natural order; update deadline and reset warp */
967 				sched_clutch_root_bucket_deadline_update(edf_bucket, root_clutch, timestamp, edf_bucket_enqueued_normally);
968 				edf_bucket->scrb_warp_remaining = sched_clutch_root_bucket_warp[edf_bucket->scrb_bucket];
969 				edf_bucket->scrb_warped_deadline = SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED;
970 				if (edf_bucket_enqueued_normally) {
971 					if (edf_bucket->scrb_bound) {
972 						bitmap_set(root_clutch->scr_bound_warp_available, edf_bucket->scrb_bucket);
973 					} else {
974 						bitmap_set(root_clutch->scr_unbound_warp_available, edf_bucket->scrb_bucket);
975 					}
976 				}
977 			}
978 		}
979 		*chose_prev_thread = !edf_bucket_enqueued_normally;
980 		return edf_bucket;
981 	}
982 
983 	/*
984 	 * Looks like there is a root bucket which is higher in the natural priority
985 	 * order than edf_bucket and might have some warp remaining.
986 	 */
987 	assert(prev_bucket_warping || warp_bucket_index >= 0);
988 	sched_clutch_root_bucket_t warp_bucket = NULL;
989 	if (prev_bucket_warping) {
990 		assert(warp_bucket_index == -1 || prev_bucket->scrb_bucket < warp_bucket_index);
991 		warp_bucket = prev_bucket;
992 	} else {
993 		warp_bucket = (edf_bucket->scrb_bound) ? &root_clutch->scr_bound_buckets[warp_bucket_index] : &root_clutch->scr_unbound_buckets[warp_bucket_index];
994 	}
995 
996 	bool warp_is_being_utilized = warp_bucket == prev_bucket || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT;
997 
998 	if (warp_bucket->scrb_warped_deadline == SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
999 		if (warp_is_being_utilized) {
1000 			/* Root bucket has not used any of its warp; set a deadline to expire its warp and return it */
1001 			warp_bucket->scrb_warped_deadline = timestamp + warp_bucket->scrb_warp_remaining;
1002 			sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp, !prev_bucket_warping);
1003 			debug_info->trace_data.selection_opened_warp_window = true;
1004 		}
1005 		*chose_prev_thread = prev_bucket_warping;
1006 		debug_info->trace_data.selection_was_edf = false;
1007 		assert(warp_bucket != edf_bucket);
1008 		return warp_bucket;
1009 	}
1010 	if (warp_bucket->scrb_warped_deadline > timestamp) {
1011 		/* Root bucket already has a warp window open with some warp remaining */
1012 		if (warp_is_being_utilized) {
1013 			sched_clutch_root_bucket_deadline_update(warp_bucket, root_clutch, timestamp, !prev_bucket_warping);
1014 		}
1015 		*chose_prev_thread = prev_bucket_warping;
1016 		debug_info->trace_data.selection_was_edf = false;
1017 		return warp_bucket;
1018 	}
1019 
1020 	/*
1021 	 * For this bucket, warp window was opened sometime in the past but has now
1022 	 * expired. Mark the bucket as not available for warp anymore and re-run the
1023 	 * warp bucket selection logic.
1024 	 */
1025 	warp_bucket->scrb_warp_remaining = 0;
1026 	if (!prev_bucket_warping) {
1027 		if (warp_bucket->scrb_bound) {
1028 			bitmap_clear(root_clutch->scr_bound_warp_available, warp_bucket->scrb_bucket);
1029 		} else {
1030 			bitmap_clear(root_clutch->scr_unbound_warp_available, warp_bucket->scrb_bucket);
1031 		}
1032 	}
1033 	bit_set(debug_info->trace_data.warp_window_close, warp_bucket->scrb_bound * TH_BUCKET_SCHED_MAX + warp_bucket->scrb_bucket);
1034 	goto evaluate_root_buckets;
1035 }
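/*
 * Worked example (illustrative only) of the starvation avoidance window used
 * above: if a BG root bucket wins the EDF comparison while an FG bucket is
 * also runnable, BG is granted a window of one BG thread quantum starting at
 * the selection timestamp; once that window has elapsed, its deadline is
 * recomputed and the EDF evaluation is re-run. The helper name is hypothetical.
 */
#if 0 /* illustrative only */
static bool
example_starvation_window_expired(sched_clutch_root_bucket_t root_bucket, uint64_t timestamp)
{
	/* Window length is one thread quantum for the starved bucket */
	uint64_t starvation_window = sched_clutch_thread_quantum[root_bucket->scrb_bucket];
	return timestamp >= (root_bucket->scrb_starvation_ts + starvation_window);
}
#endif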
1036 
1037 static inline bool
1038 sched_clutch_bucket_is_above_timeshare(sched_bucket_t bucket)
1039 {
1040 	return bucket == TH_BUCKET_FIXPRI;
1041 }
1042 
1043 /*
1044  * sched_clutch_root_bucket_deadline_calculate()
1045  *
1046  * Calculate the deadline for the bucket based on its WCEL
1047  */
1048 static uint64_t
1049 sched_clutch_root_bucket_deadline_calculate(
1050 	sched_clutch_root_bucket_t root_bucket,
1051 	uint64_t timestamp)
1052 {
1053 	/* For the fixpri AboveUI bucket, always return the earliest possible deadline */
1054 	if (sched_clutch_bucket_is_above_timeshare(root_bucket->scrb_bucket)) {
1055 		return 0;
1056 	}
1057 
1058 	/* For all timeshare buckets set the deadline as current time + worst-case-execution-latency */
1059 	return timestamp + sched_clutch_root_bucket_wcel[root_bucket->scrb_bucket];
1060 }
1061 
1062 /*
1063  * sched_clutch_root_bucket_deadline_update()
1064  *
1065  * Routine to update the deadline of the root bucket when it is selected.
1066  * Updating the deadline also moves the root_bucket in the EDF priority
1067  * queue.
1068  */
1069 static void
1070 sched_clutch_root_bucket_deadline_update(
1071 	sched_clutch_root_bucket_t root_bucket,
1072 	sched_clutch_root_t root_clutch,
1073 	uint64_t timestamp,
1074 	bool bucket_is_enqueued)
1075 {
1076 	if (sched_clutch_bucket_is_above_timeshare(root_bucket->scrb_bucket)) {
1077 		/* The algorithm never uses the deadlines for scheduling TH_BUCKET_FIXPRI bucket */
1078 		return;
1079 	}
1080 
1081 	uint64_t old_deadline = root_bucket->scrb_pqlink.deadline;
1082 	uint64_t new_deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
1083 	if (__improbable(old_deadline > new_deadline)) {
1084 		panic("old_deadline (%llu) > new_deadline (%llu); root_bucket (%d); timestamp (%llu)", old_deadline, new_deadline, root_bucket->scrb_bucket, timestamp);
1085 	}
1086 	if (old_deadline != new_deadline) {
1087 		root_bucket->scrb_pqlink.deadline = new_deadline;
1088 		if (bucket_is_enqueued) {
1089 			struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
1090 			priority_queue_entry_increased(prioq, &root_bucket->scrb_pqlink);
1091 		}
1092 	}
1093 }
1094 
1095 /*
1096  * sched_clutch_root_bucket_runnable()
1097  *
1098  * Routine to insert a newly runnable root bucket into the hierarchy.
1099  * Also updates the deadline and warp parameters as necessary.
1100  */
1101 static void
1102 sched_clutch_root_bucket_runnable(
1103 	sched_clutch_root_bucket_t root_bucket,
1104 	sched_clutch_root_t root_clutch,
1105 	uint64_t timestamp)
1106 {
1107 	/* Mark the root bucket as runnable */
1108 	bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
1109 	bitmap_set(runnable_bitmap, root_bucket->scrb_bucket);
1110 
1111 	if (sched_clutch_bucket_is_above_timeshare(root_bucket->scrb_bucket)) {
1112 		/* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
1113 		return;
1114 	}
1115 
1116 	if (root_bucket->scrb_starvation_avoidance == false) {
1117 		/*
1118 		 * Only update the deadline if the bucket was not in starvation avoidance mode. If the bucket was in
1119 		 * starvation avoidance and its window has expired, the highest root bucket selection logic will notice
1120 		 * that and fix it up.
1121 		 */
1122 		root_bucket->scrb_pqlink.deadline = sched_clutch_root_bucket_deadline_calculate(root_bucket, timestamp);
1123 	}
1124 	struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
1125 	priority_queue_insert(prioq, &root_bucket->scrb_pqlink);
1126 	if (root_bucket->scrb_warp_remaining) {
1127 		/* Since the bucket has some warp remaining and it is now runnable, mark it as available for warp */
1128 		bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
1129 		bitmap_set(warp_bitmap, root_bucket->scrb_bucket);
1130 	}
1131 }
1132 
1133 /*
1134  * sched_clutch_root_bucket_empty()
1135  *
1136  * Routine to remove an empty root bucket from the hierarchy.
1137  * Also updates the deadline and warp parameters as necessary.
1138  */
1139 static void
1140 sched_clutch_root_bucket_empty(
1141 	sched_clutch_root_bucket_t root_bucket,
1142 	sched_clutch_root_t root_clutch,
1143 	uint64_t timestamp)
1144 {
1145 	bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap;
1146 	bitmap_clear(runnable_bitmap, root_bucket->scrb_bucket);
1147 
1148 	if (sched_clutch_bucket_is_above_timeshare(root_bucket->scrb_bucket)) {
1149 		/* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */
1150 		return;
1151 	}
1152 
1153 	struct priority_queue_deadline_min *prioq = (root_bucket->scrb_bound) ? &root_clutch->scr_bound_root_buckets : &root_clutch->scr_unbound_root_buckets;
1154 	priority_queue_remove(prioq, &root_bucket->scrb_pqlink);
1155 
1156 	bitmap_t *warp_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_warp_available : root_clutch->scr_unbound_warp_available;
1157 	bitmap_clear(warp_bitmap, root_bucket->scrb_bucket);
1158 
1159 	if (root_bucket->scrb_warped_deadline != SCHED_CLUTCH_ROOT_BUCKET_WARP_UNUSED) {
1160 		if (root_bucket->scrb_warped_deadline > timestamp) {
1161 			/*
1162 			 * For root buckets that were using the warp, check if the warp
1163 			 * deadline is in the future. If so, deduct the wall time for which the
1164 			 * warp was active and save the remaining warp. This allows
1165 			 * the root bucket to use the remaining warp the next time it
1166 			 * becomes runnable.
1167 			 */
1168 			root_bucket->scrb_warp_remaining = root_bucket->scrb_warped_deadline - timestamp;
1169 		} else {
1170 			/*
1171 			 * If the root bucket's warped deadline is in the past, it has used up
1172 			 * all the warp it was assigned. Empty out its warp remaining.
1173 			 */
1174 			root_bucket->scrb_warp_remaining = 0;
1175 		}
1176 	}
1177 }
1178 
1179 static int
1180 sched_clutch_global_bucket_load_get(
1181 	sched_bucket_t bucket)
1182 {
1183 	return (int)os_atomic_load(&sched_clutch_global_bucket_load[bucket], relaxed);
1184 }
1185 
1186 /*
1187  * sched_clutch_root_pri_update()
1188  *
1189  * The root level priority is used for thread selection and preemption
1190  * logic.
1191  *
1192  * The logic uses the same decision as thread selection for deciding between the
1193  * above UI and timeshare buckets. If one of the timesharing buckets has to be
1194  * used for priority calculation, the logic is slightly different from thread
1195  * selection, because thread selection considers deadlines, warps etc. to
1196  * decide the optimal bucket at a given timestamp. Since the priority
1197  * value is used for preemption decisions only, it needs to be based on the
1198  * highest runnable thread available in the timeshare domain. This logic can
1199  * be made more sophisticated if there are cases of unnecessary preemption
1200  * being seen in workloads.
1201  */
1202 static void
1203 sched_clutch_root_pri_update(
1204 	sched_clutch_root_t root_clutch)
1205 {
1206 	sched_clutch_hierarchy_locked_assert(root_clutch);
1207 	int16_t root_bound_pri = NOPRI;
1208 	int16_t root_unbound_pri = NOPRI;
1209 
1210 	/* Consider bound root buckets */
1211 	if (bitmap_lsb_first(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
1212 		goto root_pri_update_unbound;
1213 	}
1214 	sched_clutch_root_bucket_t highest_bound_root_bucket = NULL;
1215 	__unused int highest_bound_root_bucket_pri = -1;
1216 	bool highest_bound_root_bucket_is_fixpri = false;
1217 	sched_clutch_root_bound_select_aboveui(root_clutch, &highest_bound_root_bucket, &highest_bound_root_bucket_pri, &highest_bound_root_bucket_is_fixpri, NULL, NULL);
1218 	if (highest_bound_root_bucket_is_fixpri == false) {
1219 		int root_bucket_index = bitmap_lsb_next(root_clutch->scr_bound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
1220 		assert(root_bucket_index != -1);
1221 		highest_bound_root_bucket = &root_clutch->scr_bound_buckets[root_bucket_index];
1222 	}
1223 	root_bound_pri = highest_bound_root_bucket->scrb_bound_thread_runq.highq;
1224 
1225 root_pri_update_unbound:
1226 	/* Consider unbound root buckets */
1227 	if (bitmap_lsb_first(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
1228 		goto root_pri_update_complete;
1229 	}
1230 	sched_clutch_root_bucket_t highest_unbound_root_bucket = NULL;
1231 	__unused int highest_unbound_root_bucket_pri = -1;
1232 	bool highest_unbound_root_bucket_is_fixpri = false;
1233 	sched_clutch_root_unbound_select_aboveui(root_clutch, &highest_unbound_root_bucket, &highest_unbound_root_bucket_pri, &highest_unbound_root_bucket_is_fixpri, NULL, NULL);
1234 	if (highest_unbound_root_bucket_is_fixpri == false) {
1235 		int root_bucket_index = bitmap_lsb_next(root_clutch->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX, TH_BUCKET_FIXPRI);
1236 		assert(root_bucket_index != -1);
1237 		highest_unbound_root_bucket = &root_clutch->scr_unbound_buckets[root_bucket_index];
1238 	}
1239 
1240 	/* For the selected root bucket, find the highest priority clutch bucket */
1241 	sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, highest_unbound_root_bucket, NULL, NULL, NULL);
1242 	root_unbound_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
1243 
1244 root_pri_update_complete:
1245 	root_clutch->scr_priority = MAX(root_bound_pri, root_unbound_pri);
1246 }
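/*
 * Illustrative example (not a new rule, just the logic above applied): if the
 * bound FIXPRI root bucket holds a priority 60 thread while the top thread of
 * the highest unbound timeshare clutch bucket sits at base priority 47,
 * scr_priority becomes MAX(60, 47) == 60; preemption checks then compare that
 * value against the priority of the currently running thread.
 */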
1247 
1248 /*
1249  * sched_clutch_root_urgency_inc()
1250  *
1251  * Routine to increment the urgency at the root level based on the thread
1252  * priority that is being inserted into the hierarchy. The root urgency
1253  * counter is updated based on the urgency of threads in any of the
1254  * clutch buckets which are part of the hierarchy.
1255  *
1256  * Always called with the pset lock held.
1257  */
1258 static void
1259 sched_clutch_root_urgency_inc(
1260 	sched_clutch_root_t root_clutch,
1261 	thread_t thread)
1262 {
1263 	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
1264 		root_clutch->scr_urgency++;
1265 	}
1266 }
1267 
1268 /*
1269  * sched_clutch_root_urgency_dec()
1270  *
1271  * Routine to decrement the urgency at the root level based on the thread
1272  * priority that is being removed from the hierarchy. The root urgency
1273  * counter is updated based on the urgency of threads in any of the
1274  * clutch buckets which are part of the hierarchy.
1275  *
1276  * Always called with the pset lock held.
1277  */
1278 static void
1279 sched_clutch_root_urgency_dec(
1280 	sched_clutch_root_t root_clutch,
1281 	thread_t thread)
1282 {
1283 	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
1284 		root_clutch->scr_urgency--;
1285 	}
1286 }
1287 
1288 /*
1289  * Clutch bucket level scheduling
1290  *
1291  * The second level of scheduling is the clutch bucket level scheduling
1292  * which tries to schedule thread groups within root_buckets. Each
1293  * clutch represents a thread group and a clutch_bucket_group represents
1294  * threads at a particular sched_bucket within that thread group. The
1295  * clutch_bucket_group contains a clutch_bucket per cluster on the system
1296  * where it holds the runnable threads destined for execution on that
1297  * cluster.
1298  *
1299  * The goal of this level of scheduling is to allow interactive thread
1300  * groups low latency access to the CPU. It also provides slight
1301  * scheduling preference for App and unrestricted thread groups.
1302  *
1303  * The clutch bucket scheduling algorithm measures an interactivity
1304  * score for all clutch bucket groups. The interactivity score is based
1305  * on the ratio of the CPU used and the voluntary blocking of threads
1306  * within the clutch bucket group. The algorithm is very close to the ULE
1307  * scheduler on FreeBSD in terms of calculations. The interactivity
1308  * score provides an interactivity boost in the range of
1309  * [0:SCHED_CLUTCH_BUCKET_INTERACTIVE_PRI * 2] which allows interactive
1310  * thread groups to win over CPU spinners.
1311  *
1312  * The interactivity score of the clutch bucket group is combined with the
1313  * highest base/promoted priority of threads in the clutch bucket to form
1314  * the overall priority of the clutch bucket.
1315  */
1316 
1317 /* Priority boost range for interactivity */
1318 #define SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT     (8)
1319 static uint8_t sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
1320 
1321 /* Window to scale the cpu usage and blocked values (currently 500ms); it's the threshold of used+blocked */
1322 static uint64_t sched_clutch_bucket_group_adjust_threshold = 0;
1323 #define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS      (500000)
1324 
1325 /* The ratio to scale the cpu/blocked time per window */
1326 #define SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO                (10)
1327 
1328 /* Initial value for voluntary blocking time for the clutch_bucket */
1329 #define SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID          (uint64_t)(~0)
1330 
1331 /* Value indicating the clutch bucket is not pending execution */
1332 #define SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID             ((uint64_t)(~0))
1333 
1334 /*
1335  * Thread group CPU starvation avoidance
1336  *
1337  * In heavily CPU contended scenarios, it is possible that some thread groups
1338  * which have a low interactivity score do not get CPU time at all. In order to
1339  * resolve that, the scheduler tries to ageout the CPU usage of the clutch
1340  * bucket group when it has been pending execution for a certain time as defined
1341  * by the sched_clutch_bucket_group_pending_delta_us values below.
1342  *
1343  * The values chosen here are very close to the WCEL values for each sched bucket.
1344  * These values are added into the pending interval used to determine how
1345  * frequently we will ageout the CPU usage, ensuring a reasonable limit on the
1346  * frequency.
1347  */
1348 static uint32_t sched_clutch_bucket_group_pending_delta_us[TH_BUCKET_SCHED_MAX] = {
1349 	SCHED_CLUTCH_INVALID_TIME_32,           /* FIXPRI */
1350 	10000,                                  /* FG */
1351 	37500,                                  /* IN */
1352 	75000,                                  /* DF */
1353 	150000,                                 /* UT */
1354 	250000,                                 /* BG */
1355 };
1356 static uint64_t sched_clutch_bucket_group_pending_delta[TH_BUCKET_SCHED_MAX] = {0};
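/*
 * Illustrative example, assuming the default deltas above: a DF clutch bucket
 * group that has been pending execution for ~150ms has crossed two 75ms
 * pending intervals, so its CPU usage is aged out twice, worth roughly two
 * points of interactivity score improvement (see
 * sched_clutch_bucket_group_cpu_pending_adjust()).
 */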
1357 
1358 /*
1359  * sched_clutch_bucket_init()
1360  *
1361  * Initializer for clutch buckets.
1362  */
1363 static void
1364 sched_clutch_bucket_init(
1365 	sched_clutch_bucket_t clutch_bucket,
1366 	sched_clutch_bucket_group_t clutch_bucket_group,
1367 	sched_bucket_t bucket)
1368 {
1369 	clutch_bucket->scb_bucket = bucket;
1370 	/* scb_priority will be recalculated when a thread is inserted in the clutch bucket */
1371 	clutch_bucket->scb_priority = 0;
1372 #if CONFIG_SCHED_EDGE
1373 	clutch_bucket->scb_preferred_pset_when_enqueued = PSET_ID_INVALID;
1374 	priority_queue_entry_init(&clutch_bucket->scb_stealqlink);
1375 #endif /* CONFIG_SCHED_EDGE */
1376 	clutch_bucket->scb_group = clutch_bucket_group;
1377 	clutch_bucket->scb_root = NULL;
1378 	priority_queue_init(&clutch_bucket->scb_clutchpri_prioq);
1379 	priority_queue_init(&clutch_bucket->scb_thread_runq);
1380 	queue_init(&clutch_bucket->scb_thread_timeshare_queue);
1381 }
1382 
1383 /*
1384  * sched_clutch_bucket_group_init()
1385  *
1386  * Initializer for clutch bucket groups.
1387  */
1388 static void
1389 sched_clutch_bucket_group_init(
1390 	sched_clutch_bucket_group_t clutch_bucket_group,
1391 	sched_clutch_t clutch,
1392 	sched_bucket_t bucket)
1393 {
1394 	bzero(clutch_bucket_group, sizeof(struct sched_clutch_bucket_group));
1395 	clutch_bucket_group->scbg_bucket = bucket;
1396 	clutch_bucket_group->scbg_clutch = clutch;
1397 
1398 	int max_clusters = ml_get_cluster_count();
1399 	clutch_bucket_group->scbg_clutch_buckets = kalloc_type(struct sched_clutch_bucket, max_clusters, Z_WAITOK | Z_ZERO);
1400 	for (int i = 0; i < max_clusters; i++) {
1401 		sched_clutch_bucket_init(&clutch_bucket_group->scbg_clutch_buckets[i], clutch_bucket_group, bucket);
1402 	}
1403 
1404 	os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, 0, relaxed);
1405 	os_atomic_store(&clutch_bucket_group->scbg_pri_shift, INT8_MAX, relaxed);
1406 	os_atomic_store(&clutch_bucket_group->scbg_preferred_cluster, sched_boot_pset->pset_cluster_id, relaxed);
1407 	/*
1408 	 * All thread groups should be initialized to be interactive; this allows the newly launched
1409 	 * thread groups to fairly compete with already running thread groups.
1410 	 */
1411 	clutch_bucket_group->scbg_interactivity_data.scct_count = (sched_clutch_bucket_group_interactive_pri * 2);
1412 	clutch_bucket_group->scbg_interactivity_data.scct_timestamp = 0;
1413 	os_atomic_store(&clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked, (clutch_cpu_data_t)sched_clutch_bucket_group_adjust_threshold, relaxed);
1414 	clutch_bucket_group->scbg_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
1415 	clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
1416 }
1417 
1418 static void
1419 sched_clutch_bucket_group_destroy(
1420 	sched_clutch_bucket_group_t clutch_bucket_group)
1421 {
1422 	kfree_type(struct sched_clutch_bucket, ml_get_cluster_count(),
1423 	    clutch_bucket_group->scbg_clutch_buckets);
1424 }
1425 
1426 /*
1427  * sched_clutch_init_with_thread_group()
1428  *
1429  * Initialize the sched_clutch when the thread group is being created
1430  */
1431 void
1432 sched_clutch_init_with_thread_group(
1433 	sched_clutch_t clutch,
1434 	struct thread_group *tg)
1435 {
1436 	os_atomic_store(&clutch->sc_thr_count, 0, relaxed);
1437 
1438 	/* Initialize all the clutch buckets */
1439 	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
1440 		sched_clutch_bucket_group_init(&(clutch->sc_clutch_groups[i]), clutch, i);
1441 	}
1442 
1443 	/* Grouping specific fields */
1444 	clutch->sc_tg = tg;
1445 }
1446 
1447 /*
1448  * sched_clutch_destroy()
1449  *
1450  * Destructor for clutch; called from thread group release code.
1451  */
1452 void
1453 sched_clutch_destroy(
1454 	sched_clutch_t clutch)
1455 {
1456 	assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0);
1457 	for (uint32_t i = 0; i < TH_BUCKET_SCHED_MAX; i++) {
1458 		sched_clutch_bucket_group_destroy(&(clutch->sc_clutch_groups[i]));
1459 	}
1460 }
1461 
1462 #if CONFIG_SCHED_EDGE
1463 
1464 /*
1465  * Edge Scheduler Preferred Cluster Mechanism
1466  *
1467  * In order to have better control over various QoS buckets within a thread group, the Edge
1468  * scheduler allows CLPC to specify a preferred cluster for each QoS level in a TG. These
1469  * preferences are stored at the sched_clutch_bucket_group level since that represents all
1470  * threads at a particular QoS level within a sched_clutch. For any lookup of preferred
1471  * cluster, the logic always goes back to the preference stored at the clutch_bucket_group.
1472  */
1473 
1474 static uint32_t
1475 sched_edge_clutch_bucket_group_preferred_cluster(sched_clutch_bucket_group_t clutch_bucket_group)
1476 {
1477 	return os_atomic_load(&clutch_bucket_group->scbg_preferred_cluster, relaxed);
1478 }
1479 
1480 static uint32_t
1481 sched_clutch_bucket_preferred_cluster(sched_clutch_bucket_t clutch_bucket)
1482 {
1483 	return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket->scb_group);
1484 }
1485 
1486 uint32_t
1487 sched_edge_thread_preferred_cluster(thread_t thread)
1488 {
1489 	if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1490 		/* For threads bound to a specific cluster, return the bound cluster id */
1491 		return sched_edge_thread_bound_cluster_id(thread);
1492 	}
1493 
1494 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
1495 	sched_bucket_t sched_bucket = thread->th_sched_bucket;
1496 	if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
1497 		sched_bucket = sched_clutch_thread_bucket_map(thread, thread->base_pri);
1498 	}
1499 	sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[sched_bucket];
1500 	return sched_edge_clutch_bucket_group_preferred_cluster(clutch_bucket_group);
1501 }
1502 
1503 /*
1504  * Edge Scheduler Steal Silo Support
1505  *
1506  * Steal mechanisms in the Edge scheduler, including foreign rebalance
1507  * and regular work-stealing, are implemented using steal "silos"
1508  * on every pset tracking the clutch buckets with steal-able threads,
1509  * where each steal silo on a pset corresponds to a possible preferred
1510  * pset recommendation. Silos are comprised of per-bucket steal
1511  * queues. This amount of subdivision allows for fine-grained steal
1512  * policies which stay precisely in-sync with the complex Edge matrix.
1513  */
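/*
 * Sketch of the intended lookup path (illustrative only): a CPU searching for
 * work destined for pset P would test bit P in
 * root_clutch->scr_populated_steal_silos, then pick a populated bucket out of
 * that silo's sess_populated_steal_queues bitmap and peek the matching
 * per-bucket priority queue for the best clutch bucket to steal from. The
 * actual steal/rebalance policy lives in the Edge-specific code.
 */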
1514 
1515 /*
1516  * sched_edge_steal_silo_from_pset_id()
1517  *
1518  * Routine to return the steal silo corresponding to a particular
1519  * preferred pset on a root clutch.
1520  */
1521 static sched_edge_steal_silo_t
1522 sched_edge_steal_silo_from_pset_id(pset_id_t preferred_pset_id, sched_clutch_root_t root_clutch)
1523 {
1524 	return &root_clutch->scr_steal_silos[preferred_pset_id];
1525 }
1526 
1527 /*
1528  * sched_edge_steal_silo_clutch_bucket_unclassify()
1529  *
1530  * Routine to reset a clutch bucket's steal silo tracking on the
1531  * pset where it is enqueued, necessary when dequeueing a clutch
1532  * bucket or changing its priority.
1533  * Always called with the pset lock held.
1534  */
1535 static void
1536 sched_edge_steal_silo_clutch_bucket_unclassify(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch)
1537 {
1538 	assert3u(clutch_bucket->scb_preferred_pset_when_enqueued, !=, PSET_ID_INVALID);
1539 	sched_edge_steal_silo_t steal_silo =
1540 	    sched_edge_steal_silo_from_pset_id(clutch_bucket->scb_preferred_pset_when_enqueued, root_clutch);
1541 	struct priority_queue_sched_max *steal_queue = &steal_silo->sess_steal_queues[clutch_bucket->scb_bucket];
1542 	priority_queue_remove(steal_queue, &clutch_bucket->scb_stealqlink);
1543 	if (priority_queue_empty(steal_queue)) {
1544 		/* Last bucket from this steal queue */
1545 		atomic_bit_clear(steal_silo->sess_populated_steal_queues, clutch_bucket->scb_bucket, memory_order_relaxed);
1546 	}
1547 	if (os_atomic_load(steal_silo->sess_populated_steal_queues, relaxed) == 0) {
1548 		/* Last populated steal queue from this silo */
1549 		atomic_bit_clear(root_clutch->scr_populated_steal_silos,
1550 		    clutch_bucket->scb_preferred_pset_when_enqueued, memory_order_relaxed);
1551 	}
1552 	clutch_bucket->scb_preferred_pset_when_enqueued = PSET_ID_INVALID;
1553 }
1554 
1555 /*
1556  * sched_edge_steal_silo_clutch_bucket_classify()
1557  *
1558  * Routine to establish a clutch bucket's steal silo tracking on
1559  * the pset where it is (being) enqueued. Can be used to update the
1560  * tracking of a previously enqueued clutch bucket.
1561  * Always called with the pset lock held.
1562  */
1563 static void
1564 sched_edge_steal_silo_clutch_bucket_classify(sched_clutch_bucket_t clutch_bucket,
1565     sched_clutch_root_t root_clutch, uint32_t preferred_pset_id)
1566 {
1567 	if (clutch_bucket->scb_preferred_pset_when_enqueued != PSET_ID_INVALID) {
1568 		if (clutch_bucket->scb_preferred_pset_when_enqueued == preferred_pset_id) {
1569 			/* Already classified correctly */
1570 			return;
1571 		} else {
1572 			/* Remove from previous queue */
1573 			sched_edge_steal_silo_clutch_bucket_unclassify(clutch_bucket, root_clutch);
1574 		}
1575 	}
1576 	assert3u(clutch_bucket->scb_preferred_pset_when_enqueued, ==, PSET_ID_INVALID);
1577 	/*
1578 	 * Insert clutch bucket into the steal silo matching its preferred pset
1579 	 * and into the queue in the silo matching its scheduling bucket.
1580 	 */
1581 	clutch_bucket->scb_preferred_pset_when_enqueued = preferred_pset_id;
1582 	sched_edge_steal_silo_t steal_silo =
1583 	    sched_edge_steal_silo_from_pset_id(clutch_bucket->scb_preferred_pset_when_enqueued, root_clutch);
1584 	struct priority_queue_sched_max *steal_queue = &steal_silo->sess_steal_queues[clutch_bucket->scb_bucket];
1585 	priority_queue_entry_set_sched_pri(steal_queue, &clutch_bucket->scb_stealqlink, clutch_bucket->scb_priority, 0);
1586 	priority_queue_insert(steal_queue, &clutch_bucket->scb_stealqlink);
1587 	atomic_bit_set(steal_silo->sess_populated_steal_queues, clutch_bucket->scb_bucket, memory_order_relaxed);
1588 	atomic_bit_set(root_clutch->scr_populated_steal_silos, clutch_bucket->scb_preferred_pset_when_enqueued, memory_order_relaxed);
1589 }
1590 
1591 /*
1592  * Edge Scheduler Cumulative Load Average
1593  *
1594  * The Edge scheduler maintains a per-QoS/scheduling bucket load average for
1595  * making thread migration decisions. The per-bucket load is maintained as a
1596  * cumulative count since higher scheduling buckets impact load on lower buckets
1597  * for thread migration decisions.
1598  */
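/*
 * For example, a runnable TH_BUCKET_SHARE_IN thread contributes to the IN, DF,
 * UT and BG cumulative counts below, so the count read for any bucket reflects
 * all runnable threads at that QoS level or above.
 */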
1599 
1600 static void
1601 sched_edge_cluster_cumulative_count_incr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1602 {
1603 	switch (bucket) {
1604 	case TH_BUCKET_FIXPRI:    os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1605 	case TH_BUCKET_SHARE_FG:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1606 	case TH_BUCKET_SHARE_IN:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1607 	case TH_BUCKET_SHARE_DF:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1608 	case TH_BUCKET_SHARE_UT:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1609 	case TH_BUCKET_SHARE_BG:  os_atomic_inc(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1610 	default:
1611 		panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_incr()");
1612 	}
1613 }
1614 
1615 static void
1616 sched_edge_cluster_cumulative_count_decr(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1617 {
1618 	switch (bucket) {
1619 	case TH_BUCKET_FIXPRI:    os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_FIXPRI], relaxed); OS_FALLTHROUGH;
1620 	case TH_BUCKET_SHARE_FG:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_FG], relaxed); OS_FALLTHROUGH;
1621 	case TH_BUCKET_SHARE_IN:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_IN], relaxed); OS_FALLTHROUGH;
1622 	case TH_BUCKET_SHARE_DF:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_DF], relaxed); OS_FALLTHROUGH;
1623 	case TH_BUCKET_SHARE_UT:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_UT], relaxed); OS_FALLTHROUGH;
1624 	case TH_BUCKET_SHARE_BG:  os_atomic_dec(&root_clutch->scr_cumulative_run_count[TH_BUCKET_SHARE_BG], relaxed); break;
1625 	default:
1626 		panic("Unexpected sched_bucket passed to sched_edge_cluster_cumulative_count_decr()");
1627 	}
1628 }
1629 
1630 uint16_t
1631 sched_edge_cluster_cumulative_count(sched_clutch_root_t root_clutch, sched_bucket_t bucket)
1632 {
1633 	return os_atomic_load(&root_clutch->scr_cumulative_run_count[bucket], relaxed);
1634 }
1635 
1636 #endif /* CONFIG_SCHED_EDGE */
1637 
1638 /*
1639  * sched_clutch_bucket_hierarchy_insert()
1640  *
1641  * Routine to insert a newly runnable clutch_bucket into the root hierarchy.
1642  */
1643 static void
1644 sched_clutch_bucket_hierarchy_insert(
1645 	sched_clutch_root_t root_clutch,
1646 	sched_clutch_bucket_t clutch_bucket,
1647 	sched_bucket_t bucket,
1648 	uint64_t timestamp,
1649 	sched_clutch_bucket_options_t options)
1650 {
1651 	sched_clutch_hierarchy_locked_assert(root_clutch);
1652 	if (sched_clutch_bucket_is_above_timeshare(bucket) == false) {
1653 		/* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */
1654 		enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink);
1655 	}
1656 #if CONFIG_SCHED_EDGE
1657 	/* Classify the clutch bucket into the steal silo corresponding to its preferred cluster */
1658 	uint32_t preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
1659 	sched_edge_steal_silo_clutch_bucket_classify(clutch_bucket, root_clutch, preferred_cluster);
1660 #endif /* CONFIG_SCHED_EDGE */
1661 	sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1662 
1663 	/* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */
1664 	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1665 		sched_clutch_root_bucket_runnable(root_bucket, root_clutch, timestamp);
1666 	}
1667 
1668 	/* Insert the clutch bucket into the root bucket run queue with order based on options */
1669 	sched_clutch_bucket_runq_enqueue(&root_bucket->scrb_clutch_buckets, clutch_bucket, options);
1670 	clutch_bucket->scb_root = root_clutch;
1671 	os_atomic_inc(&sched_clutch_global_bucket_load[bucket], relaxed);
1672 }
1673 
1674 /*
1675  * sched_clutch_bucket_hierarchy_remove()
1676  *
1677  * Routine to remove an empty clutch bucket from the root hierarchy.
1678  */
1679 static void
1680 sched_clutch_bucket_hierarchy_remove(
1681 	sched_clutch_root_t root_clutch,
1682 	sched_clutch_bucket_t clutch_bucket,
1683 	sched_bucket_t bucket,
1684 	uint64_t timestamp,
1685 	__unused sched_clutch_bucket_options_t options)
1686 {
1687 	sched_clutch_hierarchy_locked_assert(root_clutch);
1688 	if (sched_clutch_bucket_is_above_timeshare(bucket) == false) {
1689 		/* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */
1690 		remqueue(&clutch_bucket->scb_listlink);
1691 	}
1692 #if CONFIG_SCHED_EDGE
1693 	sched_edge_steal_silo_clutch_bucket_unclassify(clutch_bucket, root_clutch);
1694 #endif /* CONFIG_SCHED_EDGE */
1695 
1696 	sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_unbound_buckets[bucket];
1697 
1698 	/* Remove the clutch bucket from the root bucket priority queue */
1699 	sched_clutch_bucket_runq_remove(&root_bucket->scrb_clutch_buckets, clutch_bucket);
1700 	clutch_bucket->scb_root = NULL;
1701 
1702 	/* If the root bucket priority queue is now empty, remove it from the root priority queue */
1703 	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1704 		sched_clutch_root_bucket_empty(root_bucket, root_clutch, timestamp);
1705 	}
1706 	os_atomic_dec(&sched_clutch_global_bucket_load[bucket], relaxed);
1707 }
1708 
1709 /*
1710  * sched_clutch_bucket_base_pri()
1711  *
1712  * Calculates the "base" priority of the clutch bucket, which is equal to the max of the
1713  * highest base_pri and the highest sched_pri in the clutch bucket.
1714  */
1715 static uint8_t
1716 sched_clutch_bucket_base_pri(
1717 	sched_clutch_bucket_t clutch_bucket)
1718 {
1719 	assert(priority_queue_empty(&clutch_bucket->scb_thread_runq) == false);
1720 	/*
1721 	 * Since the clutch bucket can contain threads that are members of the group due
1722 	 * to the sched_pri being promoted or due to their base pri, the base priority of
1723 	 * the entire clutch bucket should be based on the highest thread (promoted or base)
1724 	 * in the clutch bucket.
1725 	 */
1726 	uint8_t max_pri = 0;
1727 	if (!priority_queue_empty(&clutch_bucket->scb_clutchpri_prioq)) {
1728 		max_pri = priority_queue_max_sched_pri(&clutch_bucket->scb_clutchpri_prioq);
1729 	}
1730 	return max_pri;
1731 }
1732 
1733 /*
1734  * sched_clutch_interactivity_from_cpu_data()
1735  *
1736  * Routine to calculate the interactivity score of a clutch bucket group from its CPU usage
1737  */
1738 static uint8_t
1739 sched_clutch_interactivity_from_cpu_data(sched_clutch_bucket_group_t clutch_bucket_group)
1740 {
1741 	sched_clutch_bucket_cpu_data_t scb_cpu_data;
1742 	scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, relaxed);
1743 	clutch_cpu_data_t cpu_used = scb_cpu_data.cpu_data.scbcd_cpu_used;
1744 	clutch_cpu_data_t cpu_blocked = scb_cpu_data.cpu_data.scbcd_cpu_blocked;
1745 	uint8_t interactive_score = 0;
1746 
1747 	if ((cpu_blocked == 0) && (cpu_used == 0)) {
1748 		return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
1749 	}
1750 	/*
1751 	 * For all timeshare buckets, calculate the interactivity score of the bucket
1752 	 * and add it to the base priority
1753 	 */
1754 	if (cpu_blocked > cpu_used) {
1755 		/* Interactive clutch_bucket case */
1756 		interactive_score = sched_clutch_bucket_group_interactive_pri +
1757 		    ((sched_clutch_bucket_group_interactive_pri * (cpu_blocked - cpu_used)) / cpu_blocked);
1758 	} else {
1759 		/* Non-interactive clutch_bucket case */
1760 		interactive_score = ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) / cpu_used);
1761 	}
1762 	return interactive_score;
1763 }
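/*
 * Worked example, assuming the default interactive pri of 8: a clutch bucket
 * group that blocks twice as long as it runs (cpu_blocked == 2 * cpu_used)
 * scores 8 + (8 * (cpu_blocked - cpu_used)) / cpu_blocked == 12, while a CPU
 * spinner with negligible blocked time scores (8 * cpu_blocked) / cpu_used,
 * i.e. close to 0. Both values stay within the documented
 * [0 : 2 * interactive_pri] range.
 */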
1764 
1765 /*
1766  * sched_clutch_bucket_pri_calculate()
1767  *
1768  * The priority calculation algorithm for the clutch_bucket is a slight
1769  * modification on the ULE interactivity score. It uses the base priority
1770  * of the clutch bucket and applies an interactivity score boost to the
1771  * highly responsive clutch buckets.
1772  */
1773 static uint8_t
1774 sched_clutch_bucket_pri_calculate(
1775 	sched_clutch_bucket_t clutch_bucket,
1776 	uint64_t timestamp)
1777 {
1778 	/* For empty clutch buckets, return priority 0 */
1779 	if (clutch_bucket->scb_thr_count == 0) {
1780 		return 0;
1781 	}
1782 
1783 	uint8_t base_pri = sched_clutch_bucket_base_pri(clutch_bucket);
1784 	uint8_t interactive_score = sched_clutch_bucket_group_interactivity_score_calculate(clutch_bucket->scb_group, timestamp);
1785 
1786 	assert(((uint64_t)base_pri + interactive_score) <= UINT8_MAX);
1787 	uint8_t pri = base_pri + interactive_score;
1788 	if (pri != clutch_bucket->scb_priority) {
1789 		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_PRI) | DBG_FUNC_NONE,
1790 		    thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket, pri, interactive_score, 0);
1791 	}
1792 	return pri;
1793 }
1794 
1795 /*
1796  * sched_clutch_root_bucket_highest_clutch_bucket()
1797  *
1798  * Routine to find the highest priority clutch bucket
1799  * within the root bucket.
1800  */
1801 static sched_clutch_bucket_t
1802 sched_clutch_root_bucket_highest_clutch_bucket(
1803 	sched_clutch_root_t root_clutch,
1804 	sched_clutch_root_bucket_t root_bucket,
1805 	processor_t _Nullable processor,
1806 	thread_t _Nullable prev_thread,
1807 	bool *_Nullable chose_prev_thread)
1808 {
1809 	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
1810 		if (prev_thread != NULL) {
1811 			*chose_prev_thread = true;
1812 			return sched_clutch_bucket_for_thread(root_clutch, prev_thread);
1813 		}
1814 		return NULL;
1815 	}
1816 	sched_clutch_bucket_t clutch_bucket = sched_clutch_bucket_runq_peek(&root_bucket->scrb_clutch_buckets);
1817 	/* Consider the Clutch bucket of the previous thread */
1818 	if (prev_thread != NULL) {
1819 		assert(chose_prev_thread != NULL);
1820 		sched_clutch_bucket_group_t prev_clutch_bucket_group = sched_clutch_bucket_group_for_thread(prev_thread);
1821 		int prev_clutch_bucket_pri = prev_thread->sched_pri + (int)(os_atomic_load(&prev_clutch_bucket_group->scbg_interactivity_data.scct_count, relaxed));
1822 		sched_clutch_bucket_t prev_clutch_bucket = sched_clutch_bucket_for_thread(root_clutch, prev_thread);
1823 		if (prev_clutch_bucket != clutch_bucket &&
1824 		    sched_clutch_pri_greater_than_tiebreak(prev_clutch_bucket_pri, clutch_bucket->scb_priority, processor->first_timeslice)) {
1825 			*chose_prev_thread = true;
1826 			return prev_clutch_bucket;
1827 		}
1828 	}
1829 	return clutch_bucket;
1830 }
1831 
1832 /*
1833  * sched_clutch_bucket_runnable()
1834  *
1835  * Perform all operations needed when a new clutch bucket becomes runnable.
1836  * It involves inserting the clutch_bucket into the hierarchy and updating the
1837  * root priority appropriately.
1838  */
1839 static boolean_t
1840 sched_clutch_bucket_runnable(
1841 	sched_clutch_bucket_t clutch_bucket,
1842 	sched_clutch_root_t root_clutch,
1843 	uint64_t timestamp,
1844 	sched_clutch_bucket_options_t options)
1845 {
1846 	sched_clutch_hierarchy_locked_assert(root_clutch);
1847 	/* Since the clutch bucket became newly runnable, update its pending timestamp */
1848 	clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1849 	sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1850 
1851 	/* Update the timesharing properties of this clutch_bucket_group; also done every sched_tick */
1852 	sched_clutch_bucket_group_pri_shift_update(clutch_bucket->scb_group);
1853 
1854 	int16_t root_old_pri = root_clutch->scr_priority;
1855 	sched_clutch_root_pri_update(root_clutch);
1856 	return root_clutch->scr_priority > root_old_pri;
1857 }
1858 
1859 /*
1860  * sched_clutch_bucket_update()
1861  *
1862  * Update the clutch_bucket's position in the hierarchy. This routine is
1863  * called when a new thread is inserted or removed from a runnable clutch
1864  * bucket. The options specify some properties about the clutch bucket
1865  * insertion order into the clutch bucket runq.
1866  */
1867 static boolean_t
1868 sched_clutch_bucket_update(
1869 	sched_clutch_bucket_t clutch_bucket,
1870 	sched_clutch_root_t root_clutch,
1871 	uint64_t timestamp,
1872 	sched_clutch_bucket_options_t options)
1873 {
1874 	sched_clutch_hierarchy_locked_assert(root_clutch);
1875 	uint64_t new_pri = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
1876 	sched_clutch_bucket_runq_t bucket_runq = &root_clutch->scr_unbound_buckets[clutch_bucket->scb_bucket].scrb_clutch_buckets;
1877 	if (new_pri == clutch_bucket->scb_priority) {
1878 		/*
1879 		 * If SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR is specified, move the clutch bucket
1880 		 * to the end of the runq. Typically used when a thread is selected for execution
1881 		 * from a clutch bucket.
1882 		 */
1883 		if (options & SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR) {
1884 			sched_clutch_bucket_runq_rotate(bucket_runq, clutch_bucket);
1885 		}
1886 		return false;
1887 	}
1888 	sched_clutch_bucket_runq_remove(bucket_runq, clutch_bucket);
1889 #if CONFIG_SCHED_EDGE
1890 	/* Need to update the clutch bucket's priority ranking in its steal silo queue */
1891 	pset_id_t pset_preference = clutch_bucket->scb_preferred_pset_when_enqueued;
1892 	sched_edge_steal_silo_clutch_bucket_unclassify(clutch_bucket, root_clutch);
1893 #endif /* CONFIG_SCHED_EDGE */
1894 	clutch_bucket->scb_priority = new_pri;
1895 #if CONFIG_SCHED_EDGE
1896 	sched_edge_steal_silo_clutch_bucket_classify(clutch_bucket, root_clutch, pset_preference);
1897 #endif /* CONFIG_SCHED_EDGE */
1898 	sched_clutch_bucket_runq_enqueue(bucket_runq, clutch_bucket, options);
1899 
1900 	int16_t root_old_pri = root_clutch->scr_priority;
1901 	sched_clutch_root_pri_update(root_clutch);
1902 	return root_clutch->scr_priority > root_old_pri;
1903 }
1904 
1905 /*
1906  * sched_clutch_bucket_empty()
1907  *
1908  * Perform all the operations needed when a clutch_bucket is no longer runnable.
1909  * It involves removing the clutch bucket from the hierarchy and updating the root
1910  * priority appropriately.
1911  */
1912 static void
1913 sched_clutch_bucket_empty(
1914 	sched_clutch_bucket_t clutch_bucket,
1915 	sched_clutch_root_t root_clutch,
1916 	uint64_t timestamp,
1917 	sched_clutch_bucket_options_t options)
1918 {
1919 	sched_clutch_hierarchy_locked_assert(root_clutch);
1920 	assert3u(clutch_bucket->scb_thr_count, ==, 0);
1921 	sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
1922 
1923 	/* Update the timesharing properties of this clutch_bucket_group; also done every sched_tick */
1924 	sched_clutch_bucket_group_pri_shift_update(clutch_bucket->scb_group);
1925 
1926 	clutch_bucket->scb_priority = 0;
1927 	sched_clutch_root_pri_update(root_clutch);
1928 }
1929 
1930 /*
1931  * sched_clutch_cpu_usage_update()
1932  *
1933  * Routine to update CPU usage of the thread in the hierarchy.
1934  */
1935 void
1936 sched_clutch_cpu_usage_update(
1937 	thread_t thread,
1938 	uint64_t delta)
1939 {
1940 	if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread) || SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
1941 		return;
1942 	}
1943 
1944 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
1945 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
1946 	sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, delta);
1947 }
1948 
1949 /*
1950  * sched_clutch_bucket_group_cpu_usage_update()
1951  *
1952  * Routine to update the CPU usage of the clutch_bucket.
1953  */
1954 static void
1955 sched_clutch_bucket_group_cpu_usage_update(
1956 	sched_clutch_bucket_group_t clutch_bucket_group,
1957 	uint64_t delta)
1958 {
1959 	if (sched_clutch_bucket_is_above_timeshare(clutch_bucket_group->scbg_bucket)) {
1960 		/* Since Above UI bucket has maximum interactivity score always, nothing to do here */
1961 		return;
1962 	}
1963 	delta = MIN(delta, sched_clutch_bucket_group_adjust_threshold);
1964 	os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_used), (clutch_cpu_data_t)delta, relaxed);
1965 }
1966 
1967 /*
1968  * sched_clutch_bucket_group_cpu_pending_adjust()
1969  *
1970  * Routine to calculate the adjusted CPU usage value based on the pending intervals. The calculation is done
1971  * such that one "pending interval" provides one point improvement in interactivity score.
1972  */
1973 static inline uint64_t
1974 sched_clutch_bucket_group_cpu_pending_adjust(
1975 	uint64_t cpu_used,
1976 	uint64_t cpu_blocked,
1977 	uint8_t pending_intervals)
1978 {
1979 	uint64_t cpu_used_adjusted = 0;
1980 	if (cpu_blocked < cpu_used) {
1981 		cpu_used_adjusted = (sched_clutch_bucket_group_interactive_pri * cpu_blocked * cpu_used);
1982 		cpu_used_adjusted = cpu_used_adjusted / ((sched_clutch_bucket_group_interactive_pri * cpu_blocked) + (cpu_used * pending_intervals));
1983 	} else {
1984 		uint64_t adjust_factor = (cpu_blocked * pending_intervals) / sched_clutch_bucket_group_interactive_pri;
1985 		cpu_used_adjusted = (adjust_factor > cpu_used) ? 0 : (cpu_used - adjust_factor);
1986 	}
1987 	return cpu_used_adjusted;
1988 }
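/*
 * Worked example, assuming the default interactive pri of 8: with
 * cpu_used = 100, cpu_blocked = 400 and pending_intervals = 1, the else branch
 * computes adjust_factor = (400 * 1) / 8 = 50, so the adjusted CPU usage drops
 * from 100 to 50; plugging the adjusted value into the interactivity formula
 * raises the score from 14 to 15, i.e. roughly one point per pending interval
 * as intended.
 */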
1989 
1990 /*
1991  * sched_clutch_bucket_group_cpu_adjust()
1992  *
1993  * Routine to scale the cpu usage and blocked time once the sum gets bigger
1994  * than sched_clutch_bucket_group_adjust_threshold. Allows the values to remain
1995  * manageable and maintain the same ratio while allowing clutch buckets to
1996  * adjust behavior and reflect in the interactivity score in a reasonable
1997  * amount of time. Also adjusts the CPU usage based on pending_intervals
1998  * which allows ageout of CPU to avoid starvation in highly contended scenarios.
1999  */
2000 static void
2001 sched_clutch_bucket_group_cpu_adjust(
2002 	sched_clutch_bucket_group_t clutch_bucket_group,
2003 	uint8_t pending_intervals)
2004 {
2005 	sched_clutch_bucket_cpu_data_t old_cpu_data = {};
2006 	sched_clutch_bucket_cpu_data_t new_cpu_data = {};
2007 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, old_cpu_data.scbcd_cpu_data_packed, new_cpu_data.scbcd_cpu_data_packed, relaxed, {
2008 		clutch_cpu_data_t cpu_used = old_cpu_data.cpu_data.scbcd_cpu_used;
2009 		clutch_cpu_data_t cpu_blocked = old_cpu_data.cpu_data.scbcd_cpu_blocked;
2010 
2011 		if ((pending_intervals == 0) && (cpu_used + cpu_blocked) < sched_clutch_bucket_group_adjust_threshold) {
2012 		        /* No changes to the CPU used and blocked values */
2013 		        os_atomic_rmw_loop_give_up();
2014 		}
2015 		if ((cpu_used + cpu_blocked) >= sched_clutch_bucket_group_adjust_threshold) {
2016 		        /* Only keep the recent CPU history to better indicate how this TG has been behaving */
2017 		        cpu_used = cpu_used / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
2018 		        cpu_blocked = cpu_blocked / SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO;
2019 		}
2020 		/* Use the shift passed in to ageout the CPU usage */
2021 		cpu_used = (clutch_cpu_data_t)sched_clutch_bucket_group_cpu_pending_adjust(cpu_used, cpu_blocked, pending_intervals);
2022 		new_cpu_data.cpu_data.scbcd_cpu_used = cpu_used;
2023 		new_cpu_data.cpu_data.scbcd_cpu_blocked = cpu_blocked;
2024 	});
2025 }
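/*
 * Note on the scaling step above: once cpu_used + cpu_blocked reaches the
 * ~500ms adjust threshold, both values are divided by
 * SCHED_CLUTCH_BUCKET_GROUP_ADJUST_RATIO (10). The used:blocked ratio, and
 * therefore the interactivity score, is preserved (modulo integer truncation)
 * while only roughly the most recent ~50ms of history continues to dominate.
 */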
2026 
2027 /*
2028  * Thread level scheduling algorithm
2029  *
2030  * The thread level scheduling algorithm uses the mach timeshare
2031  * decay based algorithm to achieve sharing between threads within the
2032  * same clutch bucket. The load/priority shifts etc. are all maintained
2033  * at the clutch bucket level and used for decay calculation of the
2034  * threads. The load sampling is still driven off the scheduler tick
2035  * for runnable clutch buckets (it does not use the new higher frequency
2036  * EWMA based load calculation). The idea is that the contention and load
2037  * within a clutch_bucket should remain low enough that threads do not see
2038  * heavy decay and can timeshare effectively.
2039  */
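/*
 * Rough sketch of the decay math (assuming the standard timeshare computation
 * elsewhere in sched_prim): a thread's effective priority is approximately
 * base_pri - (sched_usage >> pri_shift), where pri_shift comes from the
 * thread's clutch bucket group (see sched_clutch_bucket_group_pri_shift_update()
 * below), so lightly loaded clutch bucket groups decay their threads more
 * slowly.
 */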
2040 
2041 /*
2042  * sched_clutch_thread_run_bucket_incr() / sched_clutch_run_bucket_incr()
2043  *
2044  * Increment the run count for the clutch bucket associated with the
2045  * thread.
2046  */
2047 uint32_t
2048 sched_clutch_thread_run_bucket_incr(
2049 	thread_t thread,
2050 	sched_bucket_t bucket)
2051 {
2052 	if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2053 		return 0;
2054 	}
2055 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
2056 	return sched_clutch_run_bucket_incr(clutch, bucket);
2057 }
2058 
2059 static uint32_t
2060 sched_clutch_run_bucket_incr(
2061 	sched_clutch_t clutch,
2062 	sched_bucket_t bucket)
2063 {
2064 	assert(bucket != TH_BUCKET_RUN);
2065 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
2066 	return sched_clutch_bucket_group_run_count_inc(clutch_bucket_group);
2067 }
2068 
2069 /*
2070  * sched_clutch_thread_run_bucket_decr() / sched_clutch_run_bucket_decr()
2071  *
2072  * Decrement the run count for the clutch bucket associated with the
2073  * thread.
2074  */
2075 uint32_t
2076 sched_clutch_thread_run_bucket_decr(
2077 	thread_t thread,
2078 	sched_bucket_t bucket)
2079 {
2080 	if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
2081 		return 0;
2082 	}
2083 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
2084 	return sched_clutch_run_bucket_decr(clutch, bucket);
2085 }
2086 
2087 static uint32_t
2088 sched_clutch_run_bucket_decr(
2089 	sched_clutch_t clutch,
2090 	sched_bucket_t bucket)
2091 {
2092 	assert(bucket != TH_BUCKET_RUN);
2093 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
2094 	return sched_clutch_bucket_group_run_count_dec(clutch_bucket_group);
2095 }
2096 
2097 /*
2098  * sched_clutch_bucket_group_pri_shift_update()
2099  *
2100  * Routine to update the priority shift for a clutch bucket group,
2101  * necessary for timesharing correctly with priority decay within a
2102  * thread group + QoS.
2103  */
2104 static void
2105 sched_clutch_bucket_group_pri_shift_update(
2106 	sched_clutch_bucket_group_t clutch_bucket_group)
2107 {
2108 	if (sched_clutch_bucket_is_above_timeshare(clutch_bucket_group->scbg_bucket)) {
2109 		/* No timesharing needed for fixed priority Above UI threads */
2110 		return;
2111 	}
2112 
2113 	/*
2114 	 * Update the timeshare parameters for the clutch bucket group
2115 	 * if they haven't been updated in this tick.
2116 	 */
2117 	uint32_t sched_ts = os_atomic_load(&clutch_bucket_group->scbg_timeshare_tick, relaxed);
2118 	uint32_t current_sched_ts = os_atomic_load(&sched_tick, relaxed);
2119 	if (sched_ts < current_sched_ts) {
2120 		os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, current_sched_ts, relaxed);
2121 		/* NCPU wide workloads should not experience decay */
2122 		uint64_t bucket_group_run_count = os_atomic_load_wide(&clutch_bucket_group->scbg_blocked_data.scct_count, relaxed) - 1;
2123 		uint32_t bucket_group_load = (uint32_t)(bucket_group_run_count / processor_avail_count);
2124 		bucket_group_load = MIN(bucket_group_load, NRQS - 1);
2125 		uint32_t pri_shift = sched_fixed_shift - sched_load_shifts[bucket_group_load];
2126 		/* Ensure that the pri_shift value is reasonable */
2127 		pri_shift = (pri_shift > SCHED_PRI_SHIFT_MAX) ? INT8_MAX : pri_shift;
2128 		os_atomic_store(&clutch_bucket_group->scbg_pri_shift, pri_shift, relaxed);
2129 	}
2130 }
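/*
 * Illustrative behavior, assuming the stock sched_load_shifts table from
 * sched_prim: a clutch bucket group whose run count stays at or below
 * processor_avail_count computes bucket_group_load == 0, which yields a
 * pri_shift above SCHED_PRI_SHIFT_MAX and therefore INT8_MAX, i.e. effectively
 * no decay (the "NCPU wide workloads" case noted above). Heavier loads select
 * progressively smaller shifts and decay faster.
 */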
2131 
2132 /*
2133  * sched_clutch_bucket_group_timeshare_update()
2134  *
2135  * Routine to update the priority shift and priority for the clutch_bucket_group
2136  * every sched_tick. For multi-cluster platforms, each QoS level will have multiple
2137  * clutch buckets with runnable threads in them. So it is important to maintain
2138  * the timesharing information at the clutch_bucket_group level instead of
2139  * individual clutch buckets (because the algorithm is trying to timeshare all
2140  * threads at the same QoS irrespective of which hierarchy they are enqueued in).
2141  *
2142  * The routine is called from the sched tick handling code to make sure this value
2143  * is updated at least once every sched tick. For clutch bucket groups which have
2144  * not been runnable for very long, the clutch_bucket_group maintains a "last
2145  * updated schedtick" parameter. As threads become runnable in the clutch bucket group,
2146  * if this value is outdated, we update the priority shift.
2147  *
2148  * Possible optimization:
2149  * - The current algorithm samples the load at most once every sched tick (125ms).
2150  *   This is prone to spikes in runnable counts; if that turns out to be
2151  *   a problem, a simple solution would be to do the EWMA trick to sample
2152  *   load at every load_tick (30ms) and use the averaged value for the pri
2153  *   shift calculation.
2154  */
2155 static void
2156 sched_clutch_bucket_group_timeshare_update(
2157 	sched_clutch_bucket_group_t clutch_bucket_group,
2158 	sched_clutch_bucket_t clutch_bucket,
2159 	uint64_t ctime)
2160 {
2161 	if (sched_clutch_bucket_is_above_timeshare(clutch_bucket_group->scbg_bucket)) {
2162 		/* No timesharing needed for fixed priority Above UI threads */
2163 		return;
2164 	}
2165 	sched_clutch_bucket_group_pri_shift_update(clutch_bucket_group);
2166 	/*
2167 	 * Update the clutch bucket priority; this allows clutch buckets that have been pending
2168 	 * for a long time to get an updated interactivity score.
2169 	 */
2170 	sched_clutch_bucket_update(clutch_bucket, clutch_bucket->scb_root, ctime, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
2171 }
2172 
2173 /*
2174  * Calculate the CPU used by this thread and attribute it to the
2175  * thread's current scheduling bucket and clutch bucket group, or
2176  * a previous clutch bucket group if specified.
2177  * Also update the general scheduler CPU usage, matching
2178  * what we do for lightweight_update_priority().
2179  */
2180 static inline void
2181 sched_clutch_thread_tick_delta(thread_t thread, sched_clutch_bucket_group_t _Nullable clutch_bucket_group)
2182 {
2183 	uint32_t cpu_delta;
2184 	sched_tick_delta(thread, cpu_delta);
2185 	if (thread->pri_shift < INT8_MAX) {
2186 		thread->sched_usage += cpu_delta;
2187 	}
2188 	thread->cpu_delta += cpu_delta;
2189 	if (clutch_bucket_group != NULL) {
2190 		sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, cpu_delta);
2191 	} else {
2192 		sched_clutch_cpu_usage_update(thread, cpu_delta);
2193 	}
2194 }
2195 
2196 /*
2197  * sched_clutch_thread_clutch_update()
2198  *
2199  * Routine called when the thread changes its thread group. The current
2200  * implementation relies on the fact that the thread group is changed only from
2201  * the context of the thread itself or when the thread is runnable but not in a
2202  * runqueue. Due to this fact, the thread group change causes only counter
2203  * updates in the old & new clutch buckets and no hierarchy changes. The routine
2204  * also attributes the CPU used so far to the old clutch.
2205  */
2206 void
2207 sched_clutch_thread_clutch_update(
2208 	thread_t thread,
2209 	sched_clutch_t old_clutch,
2210 	sched_clutch_t new_clutch)
2211 {
2212 	if (old_clutch) {
2213 		assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
2214 
2215 		sched_clutch_run_bucket_decr(old_clutch, thread->th_sched_bucket);
2216 
2217 		/* Attribute CPU usage with the old clutch */
2218 		sched_clutch_bucket_group_t old_clutch_bucket_group = NULL;
2219 		if (!SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
2220 			old_clutch_bucket_group = &(old_clutch->sc_clutch_groups[thread->th_sched_bucket]);
2221 		}
2222 		sched_clutch_thread_tick_delta(thread, old_clutch_bucket_group);
2223 	}
2224 
2225 	if (new_clutch) {
2226 		sched_clutch_run_bucket_incr(new_clutch, thread->th_sched_bucket);
2227 	}
2228 }
2229 
2230 /* Thread Insertion/Removal/Selection routines */
2231 
2232 #if CONFIG_SCHED_EDGE
2233 
2234 /*
2235  * Edge Scheduler Bound Thread Support
2236  *
2237  * The edge scheduler allows threads to be bound to specific clusters. The scheduler
2238  * maintains a separate runq on the clutch root to hold these bound threads. These
2239  * bound threads count towards the root priority and thread count, but are ignored
2240  * for thread migration/steal decisions. Bound threads that are enqueued in the
2241  * separate runq have the th_bound_cluster_enqueued flag set to allow easy
2242  * removal.
2243  *
2244  * Bound Threads Timesharing
2245  * The bound threads share the timesharing properties of the clutch bucket group they are
2246  * part of. They contribute to the load and use priority shifts/decay values from the
2247  * clutch bucket group.
2248  */
2249 
2250 static boolean_t
2251 sched_edge_bound_thread_insert(
2252 	sched_clutch_root_t root_clutch,
2253 	thread_t thread,
2254 	integer_t options)
2255 {
2256 	/* Update the clutch runnable count and priority */
2257 	sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2258 	sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
2259 	if (root_bucket->scrb_bound_thread_runq.count == 0) {
2260 		sched_clutch_root_bucket_runnable(root_bucket, root_clutch, mach_absolute_time());
2261 	}
2262 
2263 	assert((thread->th_bound_cluster_enqueued) == false);
2264 	run_queue_enqueue(&root_bucket->scrb_bound_thread_runq, thread, options);
2265 	thread->th_bound_cluster_enqueued = true;
2266 
2267 	/*
2268 	 * Trigger an update to the thread's clutch bucket group's priority shift parameters,
2269 	 * needed for global timeshare within a clutch bucket group.
2270 	 */
2271 	sched_clutch_bucket_group_pri_shift_update(sched_clutch_bucket_group_for_thread(thread));
2272 
2273 	/* Increment the urgency counter for the root if necessary */
2274 	sched_clutch_root_urgency_inc(root_clutch, thread);
2275 
2276 	int16_t root_old_pri = root_clutch->scr_priority;
2277 	sched_clutch_root_pri_update(root_clutch);
2278 	return root_clutch->scr_priority > root_old_pri;
2279 }
2280 
2281 static void
2282 sched_edge_bound_thread_remove(
2283 	sched_clutch_root_t root_clutch,
2284 	thread_t thread)
2285 {
2286 	sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_bound_buckets[thread->th_sched_bucket];
2287 	assert((thread->th_bound_cluster_enqueued) == true);
2288 	run_queue_remove(&root_bucket->scrb_bound_thread_runq, thread);
2289 	thread->th_bound_cluster_enqueued = false;
2290 
2291 	/* Decrement the urgency counter for the root if necessary */
2292 	sched_clutch_root_urgency_dec(root_clutch, thread);
2293 
2294 	/* Update the clutch runnable count and priority */
2295 	sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
2296 	if (root_bucket->scrb_bound_thread_runq.count == 0) {
2297 		sched_clutch_root_bucket_empty(root_bucket, root_clutch, mach_absolute_time());
2298 	}
2299 	sched_clutch_root_pri_update(root_clutch);
2300 
2301 	/*
2302 	 * Trigger an update to the thread's clutch bucket group's priority shift parameters,
2303 	 * needed for global timeshare within a clutch bucket group.
2304 	 */
2305 	sched_clutch_bucket_group_pri_shift_update(sched_clutch_bucket_group_for_thread(thread));
2306 }
2307 
2308 /*
2309  * Edge Scheduler cluster shared resource threads load balancing
2310  *
2311  * The Edge scheduler attempts to load balance cluster shared resource intensive threads
2312  * across clusters in order to reduce contention on the shared resources. It achieves
2313  * that by maintaining the runnable and running shared resource load on each cluster
2314  * and balancing the load across multiple clusters.
2315  *
2316  * The current implementation for cluster shared resource load balancing looks at
2317  * the per-cluster load at thread runnable time to enqueue the thread in the appropriate
2318  * cluster. The thread is enqueued in the cluster bound runqueue to ensure idle CPUs
2319  * do not steal/rebalance shared resource threads. Some more details for the implementation:
2320  *
2321  * - When threads are tagged as shared resource, they go through the cluster selection logic
2322  *   which looks at cluster shared resource loads and picks a cluster accordingly. The thread is
2323  *   enqueued in the cluster bound runqueue.
2324  *
2325  * - When the threads start running and call avoid_processor, the load balancing logic will be
2326  *   invoked and cause the thread to be sent to a more preferred cluster if one exists and has
2327  *   no shared resource load.
2328  *
2329  * - If a CPU in a preferred cluster is going idle and that cluster has no more shared load,
2330  *   it will look at running shared resource threads on foreign clusters and actively rebalance them.
2331  *
2332  * - Runnable shared resource threads are not stolen by the preferred cluster CPUs as they
2333  *   go idle intentionally.
2334  *
2335  * - One caveat of this design is that if a preferred CPU has already run and finished its shared
2336  *   resource thread execution, it will not go out and steal a runnable shared resource thread from
2337  *   a non-preferred cluster. The rebalancing happens only when the thread actually runs on a
2338  *   non-preferred cluster and one of the events listed above occurs.
2339  *
2340  * - The placement decision also does not currently consider other properties such as thread
2341  *   priority or per-QoS thread load.
2342  *
2343  * Edge Scheduler cluster shared resource thread scheduling policy
2344  *
2345  * Shared resource threads can be scheduled using one of two policies:
2346  *
2347  * EDGE_SHARED_RSRC_SCHED_POLICY_RR
2348  * This policy distributes the threads so that they spread across all available clusters
2349  * irrespective of type. The idea is that this scheduling policy will put a shared resource
2350  * thread on each cluster on the platform before it starts doubling up on clusters.
2351  *
2352  * EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST
2353  * This policy distributes threads so that they first fill up all the capacity on
2354  * the preferred cluster and its homogeneous peers before spilling to a different core type.
2355  * The current implementation defines capacity based on the number of CPUs in the cluster,
2356  * so a cluster with "n" CPUs is considered full once it has "n" runnable + running
2357  * shared resource threads. This policy is different from the
2358  * default scheduling policy of the edge scheduler since it always tries to fill up the
2359  * native clusters to capacity even when non-native clusters might be idle.
2360  */
2361 __options_decl(edge_shared_rsrc_sched_policy_t, uint32_t, {
2362 	EDGE_SHARED_RSRC_SCHED_POLICY_RR                = 0,
2363 	EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST      = 1,
2364 });
2365 
2366 static const edge_shared_rsrc_sched_policy_t edge_shared_rsrc_policy[CLUSTER_SHARED_RSRC_TYPE_COUNT] = {
2367 	[CLUSTER_SHARED_RSRC_TYPE_RR] = EDGE_SHARED_RSRC_SCHED_POLICY_RR,
2368 	[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST,
2369 };
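
/*
 * Illustrative sketch only: a hypothetical helper showing how the two policies
 * above could pick a target cluster from a candidate list, using
 * sched_edge_pset_cluster_shared_rsrc_load() for the per-cluster load. The
 * pset_is_native() predicate and the use of online_processor_count as the
 * capacity limit are assumptions made for the sketch; the real selection logic
 * also accounts for cluster recommendations and other edge state.
 *
 *	static processor_set_t
 *	shared_rsrc_choose_cluster_sketch(processor_set_t *candidates, int count,
 *	    cluster_shared_rsrc_type_t type, edge_shared_rsrc_sched_policy_t policy)
 *	{
 *		bool native_has_capacity = false;
 *		if (policy == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST) {
 *			for (int i = 0; i < count; i++) {
 *				// NATIVE_FIRST: a native cluster is "full" at one thread per CPU
 *				if (pset_is_native(candidates[i]) &&
 *				    sched_edge_pset_cluster_shared_rsrc_load(candidates[i], type) <
 *				    (uint64_t)candidates[i]->online_processor_count) {
 *					native_has_capacity = true;
 *					break;
 *				}
 *			}
 *		}
 *		processor_set_t best = NULL;
 *		uint64_t best_load = UINT64_MAX;
 *		for (int i = 0; i < count; i++) {
 *			if (native_has_capacity && !pset_is_native(candidates[i])) {
 *				continue;  // do not spill while native capacity remains
 *			}
 *			uint64_t load = sched_edge_pset_cluster_shared_rsrc_load(candidates[i], type);
 *			if (load < best_load) {  // RR: lowest load wins irrespective of type
 *				best_load = load;
 *				best = candidates[i];
 *			}
 *		}
 *		return best;
 *	}
 */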
2370 
2371 static void
2372 sched_edge_shared_rsrc_runnable_load_incr(sched_clutch_root_t root_clutch, thread_t thread)
2373 {
2374 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
2375 		root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_RR]++;
2376 		thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_RR] = true;
2377 	}
2378 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
2379 		root_clutch->scr_shared_rsrc_load_runnable[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST]++;
2380 		thread->th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST] = true;
2381 	}
2382 }
2383 
2384 static void
2385 sched_edge_shared_rsrc_runnable_load_decr(sched_clutch_root_t root_clutch, thread_t thread)
2386 {
2387 	for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
2388 		if (thread->th_shared_rsrc_enqueued[shared_rsrc_type]) {
2389 			thread->th_shared_rsrc_enqueued[shared_rsrc_type] = false;
2390 			root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type]--;
2391 		}
2392 	}
2393 }
2394 
2395 uint16_t
2396 sched_edge_shared_rsrc_runnable_load(sched_clutch_root_t root_clutch, cluster_shared_rsrc_type_t shared_rsrc_type)
2397 {
2398 	return root_clutch->scr_shared_rsrc_load_runnable[shared_rsrc_type];
2399 }
2400 
2401 static uint64_t
2402 sched_edge_pset_cluster_shared_rsrc_load(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
2403 {
2404 	/* Prevent migrations to derecommended clusters */
2405 	if (!pset_is_recommended(pset)) {
2406 		return UINT64_MAX;
2407 	}
2408 	return os_atomic_load(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], relaxed);
2409 }
2410 
2411 /*
2412  * sched_edge_shared_rsrc_idle()
2413  *
2414  * Routine used to determine if the constrained resource for the pset is idle. This is
2415  * used by a CPU going idle to decide if it should rebalance a running shared resource
2416  * thread from a non-preferred cluster.
2417  */
2418 static boolean_t
2419 sched_edge_shared_rsrc_idle(processor_set_t pset, cluster_shared_rsrc_type_t shared_rsrc_type)
2420 {
2421 	return sched_edge_pset_cluster_shared_rsrc_load(pset, shared_rsrc_type) == 0;
2422 }
2423 
2424 /*
2425  * sched_edge_thread_shared_rsrc_type
2426  *
2427  * This routine decides if a given thread needs special handling for being a
2428  * heavy shared resource user. It is valid for the same thread to be using
2429  * several shared resources at the same time and have multiple policy flags set.
2430  * This routine determines which of those properties will be used for load
2431  * balancing and migration decisions.
2432  */
2433 static cluster_shared_rsrc_type_t
2434 sched_edge_thread_shared_rsrc_type(thread_t thread)
2435 {
2436 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
2437 		return CLUSTER_SHARED_RSRC_TYPE_RR;
2438 	}
2439 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
2440 		return CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST;
2441 	}
2442 	return CLUSTER_SHARED_RSRC_TYPE_NONE;
2443 }
2444 
2445 #endif /* CONFIG_SCHED_EDGE */
2446 
2447 /*
2448  * sched_clutch_thread_bound_lookup()
2449  *
2450  * Routine to look up the highest priority runnable thread in a bound root bucket.
2451  */
2452 static thread_t
2453 sched_clutch_thread_bound_lookup(
2454 	__unused sched_clutch_root_t root_clutch,
2455 	sched_clutch_root_bucket_t root_bucket,
2456 	processor_t processor,
2457 	thread_t _Nullable prev_thread)
2458 {
2459 	assert(root_bucket->scrb_bound == true);
2460 	thread_t bound_thread = run_queue_peek(&root_bucket->scrb_bound_thread_runq);
2461 	if ((prev_thread != NULL) &&
2462 	    (bound_thread == NULL || sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, bound_thread->sched_pri, processor->first_timeslice))) {
2463 		return prev_thread;
2464 	}
2465 	assert(bound_thread != THREAD_NULL);
2466 	return bound_thread;
2467 }
2468 
2469 /*
2470  * Clutch Bucket Group Thread Counts and Pending time calculation
2471  *
2472  * The pending time on the clutch_bucket_group allows the scheduler to track if it
2473  * needs to age out the CPU usage because the clutch_bucket_group has been pending for
2474  * a very long time. The pending time is set to the timestamp as soon as a thread becomes
2475  * runnable. When a thread is picked up for execution from this clutch_bucket_group, the
2476  * pending time is advanced to the time of thread selection.
2477  *
2478  * Since threads for a clutch bucket group can be added or removed from multiple CPUs
2479  * simultaneously, it is important that the updates to thread counts and pending timestamps
2480  * happen atomically. The implementation relies on the following aspects to make that work
2481  * as expected:
2482  * - The clutch scheduler is only deployed on single cluster platforms, where the pset lock
2483  *   is held when threads are added/removed and pending timestamps are updated
2484  * - The thread count and pending timestamp can be updated atomically using double wide
2485  *   128 bit atomics
2486  *
2487  * Clutch bucket group interactivity timestamp and score updates also rely on the properties
2488  * above to atomically update the interactivity score for a clutch bucket group.
2489  */
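
/*
 * A minimal sketch (illustrative only) of the packed layout that the double
 * wide atomics below rely on; the actual type is sched_clutch_counter_time_t:
 *
 *	typedef union {
 *		struct {
 *			uint64_t scct_count;       // runnable/pending thread count
 *			uint64_t scct_timestamp;   // timestamp of the last count transition
 *		};
 *		unsigned __int128 scct_packed;     // both fields updated as one unit
 *	} counter_time_sketch_t;                   // via os_atomic_rmw_loop()
 */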
2490 
2491 #if CONFIG_SCHED_EDGE
2492 
2493 static void
2494 sched_clutch_bucket_group_thr_count_inc(
2495 	sched_clutch_bucket_group_t clutch_bucket_group,
2496 	uint64_t timestamp)
2497 {
2498 	sched_clutch_counter_time_t old_pending_data;
2499 	sched_clutch_counter_time_t new_pending_data;
2500 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2501 		new_pending_data.scct_count = old_pending_data.scct_count + 1;
2502 		new_pending_data.scct_timestamp = old_pending_data.scct_timestamp;
2503 		if (old_pending_data.scct_count == 0) {
2504 		        new_pending_data.scct_timestamp = timestamp;
2505 		}
2506 	});
2507 }
2508 
2509 static void
2510 sched_clutch_bucket_group_thr_count_dec(
2511 	sched_clutch_bucket_group_t clutch_bucket_group,
2512 	uint64_t timestamp)
2513 {
2514 	sched_clutch_counter_time_t old_pending_data;
2515 	sched_clutch_counter_time_t new_pending_data;
2516 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2517 		new_pending_data.scct_count = old_pending_data.scct_count - 1;
2518 		if (new_pending_data.scct_count == 0) {
2519 		        new_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2520 		} else {
2521 		        new_pending_data.scct_timestamp = timestamp;
2522 		}
2523 	});
2524 }
2525 
2526 static uint8_t
2527 sched_clutch_bucket_group_pending_ageout(
2528 	sched_clutch_bucket_group_t clutch_bucket_group,
2529 	uint64_t timestamp)
2530 {
2531 	int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2532 	sched_clutch_counter_time_t old_pending_data;
2533 	sched_clutch_counter_time_t new_pending_data;
2534 	uint8_t cpu_usage_shift = 0;
2535 
2536 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_pending_data.scct_packed, old_pending_data.scct_packed, new_pending_data.scct_packed, relaxed, {
2537 		cpu_usage_shift = 0;
2538 		uint64_t old_pending_ts = old_pending_data.scct_timestamp;
2539 		bool old_update = (old_pending_ts >= timestamp);
2540 		bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2541 		bool no_bucket_load = (bucket_load == 0);
2542 		if (old_update || no_pending_time || no_bucket_load) {
2543 		        os_atomic_rmw_loop_give_up();
2544 		}
2545 
2546 		/* Calculate the time the clutch bucket group has been pending */
2547 		uint64_t pending_delta = timestamp - old_pending_ts;
2548 		/*
2549 		 * Other buckets should get a chance to run first before artificially boosting
2550 		 * this clutch bucket group's interactivity score, at least when the entire root
2551 		 * bucket is getting a large enough share of CPU.
2552 		 */
2553 		uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] + (bucket_load * sched_clutch_thread_quantum[clutch_bucket_group->scbg_bucket]);
2554 		if (pending_delta < interactivity_delta) {
2555 		        os_atomic_rmw_loop_give_up();
2556 		}
2557 		cpu_usage_shift = (pending_delta / interactivity_delta);
2558 		new_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2559 		new_pending_data.scct_count = old_pending_data.scct_count;
2560 	});
2561 	return cpu_usage_shift;
2562 }
2563 
2564 static boolean_t
2565 sched_edge_thread_should_be_inserted_as_bound(
2566 	sched_clutch_root_t root_clutch,
2567 	thread_t thread)
2568 {
2569 	/*
2570 	 * Check if the thread is bound and is being enqueued in its desired bound cluster.
2571 	 * If the thread is cluster-bound but to a different cluster, we should enqueue as unbound.
2572 	 */
2573 	if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) && (sched_edge_thread_bound_cluster_id(thread) == root_clutch->scr_cluster_id)) {
2574 		return TRUE;
2575 	}
2576 	/*
2577 	 * Use bound runqueue for shared resource threads. See "cluster shared resource
2578 	 * threads load balancing" section for details.
2579 	 */
2580 	if (sched_edge_thread_shared_rsrc_type(thread) != CLUSTER_SHARED_RSRC_TYPE_NONE) {
2581 		return TRUE;
2582 	}
2583 	return FALSE;
2584 }
2585 
2586 #else /* CONFIG_SCHED_EDGE */
2587 
2588 /*
2589  * For the clutch scheduler, atomicity is ensured by making sure all operations
2590  * happen under the pset lock of the only cluster present on the platform.
2591  */
2592 static void
2593 sched_clutch_bucket_group_thr_count_inc(
2594 	sched_clutch_bucket_group_t clutch_bucket_group,
2595 	uint64_t timestamp)
2596 {
2597 	sched_clutch_hierarchy_locked_assert(&sched_boot_pset->pset_clutch_root);
2598 	if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2599 		clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2600 	}
2601 	clutch_bucket_group->scbg_pending_data.scct_count++;
2602 }
2603 
2604 static void
2605 sched_clutch_bucket_group_thr_count_dec(
2606 	sched_clutch_bucket_group_t clutch_bucket_group,
2607 	uint64_t timestamp)
2608 {
2609 	sched_clutch_hierarchy_locked_assert(&sched_boot_pset->pset_clutch_root);
2610 	clutch_bucket_group->scbg_pending_data.scct_count--;
2611 	if (clutch_bucket_group->scbg_pending_data.scct_count == 0) {
2612 		clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID;
2613 	} else {
2614 		clutch_bucket_group->scbg_pending_data.scct_timestamp = timestamp;
2615 	}
2616 }
2617 
2618 static uint8_t
2619 sched_clutch_bucket_group_pending_ageout(
2620 	sched_clutch_bucket_group_t clutch_bucket_group,
2621 	uint64_t timestamp)
2622 {
2623 	sched_clutch_hierarchy_locked_assert(&sched_boot_pset->pset_clutch_root);
2624 	int bucket_load = sched_clutch_global_bucket_load_get(clutch_bucket_group->scbg_bucket);
2625 	uint64_t old_pending_ts = clutch_bucket_group->scbg_pending_data.scct_timestamp;
2626 	bool old_update = (old_pending_ts >= timestamp);
2627 	bool no_pending_time = (old_pending_ts == SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID);
2628 	bool no_bucket_load = (bucket_load == 0);
2629 	if (old_update || no_pending_time || no_bucket_load) {
2630 		return 0;
2631 	}
2632 	uint64_t pending_delta = timestamp - old_pending_ts;
2633 	/*
2634 	 * Other buckets should get a chance to run first before artificially boosting
2635 	 * this clutch bucket group's interactivity score, at least when the entire root
2636 	 * bucket is getting a large enough share of CPU.
2637 	 */
2638 	uint64_t interactivity_delta = sched_clutch_bucket_group_pending_delta[clutch_bucket_group->scbg_bucket] + (bucket_load * sched_clutch_thread_quantum[clutch_bucket_group->scbg_bucket]);
2639 	if (pending_delta < interactivity_delta) {
2640 		return 0;
2641 	}
2642 	uint8_t cpu_usage_shift = (pending_delta / interactivity_delta);
2643 	clutch_bucket_group->scbg_pending_data.scct_timestamp = old_pending_ts + (cpu_usage_shift * interactivity_delta);
2644 	return cpu_usage_shift;
2645 }
2646 
2647 #endif /* CONFIG_SCHED_EDGE */
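
/*
 * Worked example of the ageout math above, using illustrative numbers only:
 * with bucket_load = 4, a per-bucket pending delta of 10ms and a thread
 * quantum of 10ms, interactivity_delta = 10ms + (4 * 10ms) = 50ms. A clutch
 * bucket group that has been pending for 120ms therefore returns
 * cpu_usage_shift = 120 / 50 = 2, and its pending timestamp is advanced by
 * 2 * 50ms = 100ms so that the residual 20ms continues to accumulate.
 */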
2648 
2649 static uint8_t
2650 sched_clutch_bucket_group_interactivity_score_calculate(
2651 	sched_clutch_bucket_group_t clutch_bucket_group,
2652 	uint64_t timestamp)
2653 {
2654 	if (sched_clutch_bucket_is_above_timeshare(clutch_bucket_group->scbg_bucket)) {
2655 		/*
2656 		 * Since the root bucket selection algorithm for Above UI looks at clutch bucket
2657 		 * priorities, make sure all AboveUI buckets are marked interactive.
2658 		 */
2659 		assert(clutch_bucket_group->scbg_interactivity_data.scct_count == (2 * sched_clutch_bucket_group_interactive_pri));
2660 		return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2661 	}
2662 	/* Check if the clutch bucket group CPU usage needs to be aged out due to pending time */
2663 	uint8_t pending_intervals = sched_clutch_bucket_group_pending_ageout(clutch_bucket_group, timestamp);
2664 	/* Adjust CPU stats based on the calculated shift and to make sure only recent behavior is used */
2665 	sched_clutch_bucket_group_cpu_adjust(clutch_bucket_group, pending_intervals);
2666 	uint8_t interactivity_score = sched_clutch_interactivity_from_cpu_data(clutch_bucket_group);
2667 	/* Write back any interactivity score update */
2668 #if CONFIG_SCHED_EDGE
2669 	sched_clutch_counter_time_t old_interactivity_data;
2670 	sched_clutch_counter_time_t new_interactivity_data;
2671 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_interactivity_data.scct_packed, old_interactivity_data.scct_packed, new_interactivity_data.scct_packed, relaxed, {
2672 		new_interactivity_data.scct_count = old_interactivity_data.scct_count;
2673 		if (old_interactivity_data.scct_timestamp >= timestamp) {
2674 		        os_atomic_rmw_loop_give_up();
2675 		}
2676 		new_interactivity_data.scct_timestamp = timestamp;
2677 		if (old_interactivity_data.scct_timestamp != 0) {
2678 		        new_interactivity_data.scct_count = interactivity_score;
2679 		}
2680 	});
2681 	return (uint8_t)new_interactivity_data.scct_count;
2682 #else /* !CONFIG_SCHED_EDGE */
2683 	sched_clutch_hierarchy_locked_assert(&sched_boot_pset->pset_clutch_root);
2684 	if (timestamp > clutch_bucket_group->scbg_interactivity_data.scct_timestamp) {
2685 		clutch_bucket_group->scbg_interactivity_data.scct_count = interactivity_score;
2686 		clutch_bucket_group->scbg_interactivity_data.scct_timestamp = timestamp;
2687 	}
2688 	return (uint8_t)clutch_bucket_group->scbg_interactivity_data.scct_count;
2689 #endif /* !CONFIG_SCHED_EDGE */
2690 }
2691 
2692 /*
2693  * Clutch Bucket Group Run Count and Blocked Time Accounting
2694  *
2695  * The clutch bucket group maintains the number of runnable/running threads in the group.
2696  * Since the blocked time of the clutch bucket group is based on this count, it is
2697  * important to make sure the blocking timestamp and the run count are updated atomically.
2698  *
2699  * Since the run count increments happen without any pset locks held, the scheduler updates
2700  * the count & timestamp using double wide 128 bit atomics.
2701  */
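
/*
 * Sketch of the state transitions implemented below (illustrative): the run
 * count and blocked timestamp always move together under the 128 bit atomics.
 *
 *	run count 1 -> 0:  scct_timestamp = mach_absolute_time()
 *	                   (the whole clutch bucket group is now blocked)
 *	run count 0 -> 1:  blocked time = now - scct_timestamp, capped at
 *	                   sched_clutch_bucket_group_adjust_threshold, is added to
 *	                   scbcd_cpu_blocked and scct_timestamp is invalidated
 */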
2702 
2703 static uint32_t
2704 sched_clutch_bucket_group_run_count_inc(
2705 	sched_clutch_bucket_group_t clutch_bucket_group)
2706 {
2707 	sched_clutch_counter_time_t old_blocked_data;
2708 	sched_clutch_counter_time_t new_blocked_data;
2709 
2710 	bool update_blocked_time = false;
2711 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2712 		new_blocked_data.scct_count = old_blocked_data.scct_count + 1;
2713 		new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2714 		update_blocked_time = false;
2715 		if (old_blocked_data.scct_count == 0) {
2716 		        new_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID;
2717 		        update_blocked_time = true;
2718 		}
2719 	});
2720 	if (update_blocked_time && (old_blocked_data.scct_timestamp != SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID)) {
2721 		uint64_t ctime = mach_absolute_time();
2722 		if (ctime > old_blocked_data.scct_timestamp) {
2723 			uint64_t blocked_time = ctime - old_blocked_data.scct_timestamp;
2724 			blocked_time = MIN(blocked_time, sched_clutch_bucket_group_adjust_threshold);
2725 			os_atomic_add(&(clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked), (clutch_cpu_data_t)blocked_time, relaxed);
2726 		}
2727 	}
2728 	return (uint32_t)new_blocked_data.scct_count;
2729 }
2730 
2731 static uint32_t
2732 sched_clutch_bucket_group_run_count_dec(
2733 	sched_clutch_bucket_group_t clutch_bucket_group)
2734 {
2735 	sched_clutch_counter_time_t old_blocked_data;
2736 	sched_clutch_counter_time_t new_blocked_data;
2737 
2738 	uint64_t ctime = mach_absolute_time();
2739 	os_atomic_rmw_loop(&clutch_bucket_group->scbg_blocked_data.scct_packed, old_blocked_data.scct_packed, new_blocked_data.scct_packed, relaxed, {
2740 		new_blocked_data.scct_count = old_blocked_data.scct_count - 1;
2741 		new_blocked_data.scct_timestamp = old_blocked_data.scct_timestamp;
2742 		if (new_blocked_data.scct_count == 0) {
2743 		        new_blocked_data.scct_timestamp = ctime;
2744 		}
2745 	});
2746 	return (uint32_t)new_blocked_data.scct_count;
2747 }
2748 
2749 static inline sched_clutch_bucket_t
2750 sched_clutch_bucket_for_thread(
2751 	sched_clutch_root_t root_clutch,
2752 	thread_t thread)
2753 {
2754 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
2755 	assert(thread->thread_group == clutch->sc_tg);
2756 
2757 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2758 	sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2759 	assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
2760 
2761 	return clutch_bucket;
2762 }
2763 
2764 static inline sched_clutch_bucket_group_t
2765 sched_clutch_bucket_group_for_thread(thread_t prev_thread)
2766 {
2767 	sched_clutch_t clutch = sched_clutch_for_thread_group(prev_thread->thread_group);
2768 	return &clutch->sc_clutch_groups[prev_thread->th_sched_bucket];
2769 }
2770 
2771 /*
2772  * sched_clutch_thread_insert()
2773  *
2774  * Routine to insert a thread into the sched clutch hierarchy.
2775  * Update the counts at all levels of the hierarchy and insert the nodes
2776  * as they become runnable. Always called with the pset lock held.
2777  */
2778 static boolean_t
2779 sched_clutch_thread_insert(
2780 	sched_clutch_root_t root_clutch,
2781 	thread_t thread,
2782 	integer_t options)
2783 {
2784 	boolean_t result = FALSE;
2785 
2786 	sched_clutch_hierarchy_locked_assert(root_clutch);
2787 #if CONFIG_SCHED_EDGE
2788 	sched_edge_cluster_cumulative_count_incr(root_clutch, thread->th_sched_bucket);
2789 	sched_edge_shared_rsrc_runnable_load_incr(root_clutch, thread);
2790 
2791 	if (sched_edge_thread_should_be_inserted_as_bound(root_clutch, thread)) {
2792 		/*
2793 		 * Includes threads bound to this specific cluster as well as all
2794 		 * shared resource threads.
2795 		 */
2796 		return sched_edge_bound_thread_insert(root_clutch, thread, options);
2797 	}
2798 #endif /* CONFIG_SCHED_EDGE */
2799 
2800 	uint64_t current_timestamp = mach_absolute_time();
2801 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
2802 	assert(thread->thread_group == clutch->sc_tg);
2803 	sched_clutch_bucket_t clutch_bucket = sched_clutch_bucket_for_thread(root_clutch, thread);
2804 	assert((clutch_bucket->scb_root == NULL) || (clutch_bucket->scb_root == root_clutch));
2805 
2806 	/*
2807 	 * Thread linkage in clutch_bucket
2808 	 *
2809 	 * A thread has a few linkages within the clutch bucket:
2810 	 * - A stable priority queue linkage which is the main runqueue (based on sched_pri) for the clutch bucket
2811 	 * - A regular priority queue linkage which is based on thread's base/promoted pri (used for clutch bucket priority calculation)
2812 	 * - A queue linkage used for timesharing operations of threads at the scheduler tick
2813 	 */
2814 
2815 	/* Insert thread into the clutch_bucket stable priority runqueue using sched_pri */
2816 	thread->th_clutch_runq_link.stamp = current_timestamp;
2817 	priority_queue_entry_set_sched_pri(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link, thread->sched_pri,
2818 	    (options & SCHED_TAILQ) ? PRIORITY_QUEUE_ENTRY_NONE : PRIORITY_QUEUE_ENTRY_PREEMPTED);
2819 	priority_queue_insert(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2820 
2821 	/* Insert thread into clutch_bucket priority queue based on the promoted or base priority */
2822 	priority_queue_entry_set_sched_pri(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link,
2823 	    sched_thread_sched_pri_promoted(thread) ? thread->sched_pri : thread->base_pri, false);
2824 	priority_queue_insert(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2825 
2826 	/* Insert thread into timesharing queue of the clutch bucket */
2827 	enqueue_tail(&clutch_bucket->scb_thread_timeshare_queue, &thread->th_clutch_timeshare_link);
2828 
2829 	/* Increment the urgency counter for the root if necessary */
2830 	sched_clutch_root_urgency_inc(root_clutch, thread);
2831 
2832 	os_atomic_inc(&clutch->sc_thr_count, relaxed);
2833 	sched_clutch_bucket_group_thr_count_inc(clutch_bucket->scb_group, current_timestamp);
2834 
2835 	/* Enqueue the clutch into the hierarchy (if needed) and update properties; pick the insertion order based on thread options */
2836 	sched_clutch_bucket_options_t scb_options = (options & SCHED_HEADQ) ? SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ : SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ;
2837 	if (clutch_bucket->scb_thr_count == 0) {
2838 		sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2839 		sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2840 		result = sched_clutch_bucket_runnable(clutch_bucket, root_clutch, current_timestamp, scb_options);
2841 	} else {
2842 		sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count);
2843 		sched_clutch_thr_count_inc(&root_clutch->scr_thr_count);
2844 		result = sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, scb_options);
2845 	}
2846 
2847 	KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2848 	    root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2849 	    SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2850 	return result;
2851 }
2852 
2853 /*
2854  * sched_clutch_thread_remove()
2855  *
2856  * Routine to remove a thread from the sched clutch hierarchy.
2857  * Update the counts at all levels of the hierarchy and remove the nodes
2858  * as they become empty. Always called with the pset lock held.
2859  */
2860 static void
2861 sched_clutch_thread_remove(
2862 	sched_clutch_root_t root_clutch,
2863 	thread_t thread,
2864 	uint64_t current_timestamp,
2865 	sched_clutch_bucket_options_t options)
2866 {
2867 	sched_clutch_hierarchy_locked_assert(root_clutch);
2868 #if CONFIG_SCHED_EDGE
2869 	sched_edge_cluster_cumulative_count_decr(root_clutch, thread->th_sched_bucket);
2870 	sched_edge_shared_rsrc_runnable_load_decr(root_clutch, thread);
2871 
2872 	if (thread->th_bound_cluster_enqueued) {
2873 		sched_edge_bound_thread_remove(root_clutch, thread);
2874 		return;
2875 	}
2876 #endif /* CONFIG_SCHED_EDGE */
2877 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
2878 	assert(thread->thread_group == clutch->sc_tg);
2879 	thread_assert_runq_nonnull(thread);
2880 
2881 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[thread->th_sched_bucket]);
2882 	sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
2883 	assert(clutch_bucket->scb_root == root_clutch);
2884 
2885 	/* Decrement the urgency counter for the root if necessary */
2886 	sched_clutch_root_urgency_dec(root_clutch, thread);
2887 	/* Remove thread from the clutch_bucket */
2888 	priority_queue_remove(&clutch_bucket->scb_thread_runq, &thread->th_clutch_runq_link);
2889 	remqueue(&thread->th_clutch_timeshare_link);
2890 
2891 	priority_queue_remove(&clutch_bucket->scb_clutchpri_prioq, &thread->th_clutch_pri_link);
2892 
2893 	/*
2894 	 * Warning: After this point, the thread's scheduling fields may be
2895 	 * modified by other cores that acquire the thread lock.
2896 	 */
2897 	thread_clear_runq(thread);
2898 
2899 	/* Update counts at various levels of the hierarchy */
2900 	os_atomic_dec(&clutch->sc_thr_count, relaxed);
2901 	sched_clutch_bucket_group_thr_count_dec(clutch_bucket->scb_group, current_timestamp);
2902 	sched_clutch_thr_count_dec(&root_clutch->scr_thr_count);
2903 	sched_clutch_thr_count_dec(&clutch_bucket->scb_thr_count);
2904 
2905 	/* Remove the clutch from hierarchy (if needed) and update properties */
2906 	if (clutch_bucket->scb_thr_count == 0) {
2907 		sched_clutch_bucket_empty(clutch_bucket, root_clutch, current_timestamp, options);
2908 	} else {
2909 		sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, options);
2910 	}
2911 
2912 	KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THR_COUNT) | DBG_FUNC_NONE,
2913 	    root_clutch->scr_cluster_id, thread_group_get_id(clutch_bucket->scb_group->scbg_clutch->sc_tg), clutch_bucket->scb_bucket,
2914 	    SCHED_CLUTCH_DBG_THR_COUNT_PACK(root_clutch->scr_thr_count, os_atomic_load(&clutch->sc_thr_count, relaxed), clutch_bucket->scb_thr_count));
2915 }
2916 
2917 /*
2918  * sched_clutch_thread_unbound_lookup()
2919  *
2920  * Routine to find the highest unbound thread in the root clutch.
2921  * Helps find threads easily for steal/migrate scenarios in the
2922  * Edge scheduler.
2923  */
2924 static thread_t
2925 sched_clutch_thread_unbound_lookup(
2926 	sched_clutch_root_t root_clutch,
2927 	sched_clutch_root_bucket_t root_bucket,
2928 	processor_t _Nullable processor,
2929 	thread_t _Nullable prev_thread)
2930 {
2931 	assert(processor != NULL || prev_thread == NULL);
2932 	assert(root_bucket->scrb_bound == false);
2933 	sched_clutch_hierarchy_locked_assert(root_clutch);
2934 
2935 	/* Find the highest priority clutch bucket in this root bucket */
2936 	bool chose_prev_thread = false;
2937 	sched_clutch_bucket_t clutch_bucket = sched_clutch_root_bucket_highest_clutch_bucket(root_clutch, root_bucket, processor, prev_thread, &chose_prev_thread);
2938 	assert(clutch_bucket != NULL);
2939 
2940 	if (chose_prev_thread) {
2941 		/* We have determined that prev_thread is the highest thread, based on the Clutch bucket level policy */
2942 		assert(processor != NULL && prev_thread != NULL);
2943 		return prev_thread;
2944 	}
2945 
2946 	/* Find the highest priority runnable thread in this clutch bucket */
2947 	thread_t thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
2948 	assert(thread != NULL);
2949 
2950 	/* Consider the previous thread */
2951 	if (prev_thread != NULL &&
2952 	    sched_clutch_bucket_for_thread(root_clutch, prev_thread) == clutch_bucket &&
2953 	    sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, thread->sched_pri, processor->first_timeslice)) {
2954 		thread = prev_thread;
2955 	}
2956 
2957 	return thread;
2958 }
2959 
2960 static sched_clutch_root_bucket_t
2961 sched_clutch_root_bucket_for_thread(
2962 	sched_clutch_root_t root_clutch,
2963 	thread_t prev_thread)
2964 {
2965 #if CONFIG_SCHED_EDGE
2966 	if (sched_edge_thread_should_be_inserted_as_bound(root_clutch, prev_thread)) {
2967 		return &root_clutch->scr_bound_buckets[prev_thread->th_sched_bucket];
2968 	}
2969 #endif /* CONFIG_SCHED_EDGE */
2970 	return &root_clutch->scr_unbound_buckets[prev_thread->th_sched_bucket];
2971 }
2972 
2973 /*
2974  * sched_clutch_hierarchy_thread_highest()
2975  *
2976  * Routine to traverse the Clutch hierarchy and return the highest thread which
2977  * should be selected to run next, optionally comparing against the previously
2978  * running thread. Removes the highest thread with sched_clutch_thread_remove()
2979  * depending on the traverse mode and whether it is the previously running thread.
2980  * Always called with the pset lock held.
2981  */
2982 static thread_t
2983 sched_clutch_hierarchy_thread_highest(
2984 	sched_clutch_root_t root_clutch,
2985 	processor_t processor,
2986 	thread_t _Nullable prev_thread,
2987 	sched_clutch_traverse_mode_t mode)
2988 {
2989 	assert(mode != SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || prev_thread == NULL);
2990 	sched_clutch_hierarchy_locked_assert(root_clutch);
2991 
2992 	thread_t highest_thread = NULL;
2993 	uint64_t current_timestamp = mach_absolute_time();
2994 	bool chose_prev_thread = false;
2995 	sched_clutch_dbg_thread_select_packed_t debug_info = {0};
2996 	sched_clutch_root_bucket_t prev_root_bucket = prev_thread != NULL ? sched_clutch_root_bucket_for_thread(root_clutch, prev_thread) : NULL;
2997 	sched_clutch_root_bucket_t root_bucket = sched_clutch_root_highest_root_bucket(root_clutch, current_timestamp, SCHED_CLUTCH_HIGHEST_ROOT_BUCKET_ALL, prev_root_bucket, prev_thread, &chose_prev_thread, mode, &debug_info);
2998 	if (chose_prev_thread) {
2999 		/* We disambiguated that we want to keep running the previous thread */
3000 		highest_thread = processor->active_thread;
3001 		goto done_selecting_thread;
3002 	}
3003 	if (root_bucket == NULL) {
3004 		/* The Clutch hierarchy has no runnable threads, including the previous thread */
3005 		assert(sched_clutch_root_count(root_clutch) == 0);
3006 		assert(prev_thread == NULL);
3007 		return NULL;
3008 	}
3009 	if (root_bucket != prev_root_bucket) {
3010 		/* We have ruled out continuing to run the previous thread, based on the root bucket level policy */
3011 		prev_thread = NULL;
3012 		assert((mode == SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT) || (prev_root_bucket == NULL) ||
3013 		    (prev_root_bucket->scrb_bucket >= root_bucket->scrb_bucket) || (root_bucket->scrb_starvation_avoidance) ||
3014 		    (prev_root_bucket->scrb_bound != root_bucket->scrb_bound) ||
3015 		    (root_bucket->scrb_warp_remaining > 0 && root_bucket->scrb_warped_deadline > current_timestamp && prev_root_bucket->scrb_warp_remaining == 0));
3016 	}
3017 
3018 	if (root_bucket->scrb_bound) {
3019 		highest_thread = sched_clutch_thread_bound_lookup(root_clutch, root_bucket, processor, prev_thread);
3020 	} else {
3021 		highest_thread = sched_clutch_thread_unbound_lookup(root_clutch, root_bucket, processor, prev_thread);
3022 	}
3023 
3024 	if (mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY ||
3025 	    (mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT && highest_thread != processor->active_thread)) {
3026 		assert(mode != SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || highest_thread != processor->active_thread);
3027 		sched_clutch_thread_remove(root_clutch, highest_thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
3028 	}
3029 
3030 done_selecting_thread:
3031 	debug_info.trace_data.version = SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION;
3032 	debug_info.trace_data.traverse_mode = mode;
3033 	debug_info.trace_data.cluster_id = root_clutch->scr_cluster_id;
3034 	debug_info.trace_data.selection_was_cluster_bound = root_bucket->scrb_bound;
3035 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE,
3036 	    thread_tid(highest_thread), thread_group_get_id(highest_thread->thread_group), root_bucket->scrb_bucket, debug_info.scdts_trace_data_packed, 0);
3037 	return highest_thread;
3038 }
3039 
3040 /* High level global accessor routines */
3041 
3042 /*
3043  * sched_clutch_root_urgency()
3044  *
3045  * Routine to get the urgency of the highest runnable
3046  * thread in the hierarchy.
3047  */
3048 static uint32_t
3049 sched_clutch_root_urgency(
3050 	sched_clutch_root_t root_clutch)
3051 {
3052 	return root_clutch->scr_urgency;
3053 }
3054 
3055 /*
3056  * sched_clutch_root_count_sum()
3057  *
3058  * The count_sum mechanism is used for scheduler runq
3059  * statistics calculation. It's only useful for debugging
3060  * purposes; since it takes a mach_absolute_time() on
3061  * other scheduler implementations, it's better to avoid
3062  * populating this until absolutely necessary.
3063  */
3064 static uint32_t
3065 sched_clutch_root_count_sum(
3066 	__unused sched_clutch_root_t root_clutch)
3067 {
3068 	return 0;
3069 }
3070 
3071 /*
3072  * sched_clutch_root_priority()
3073  *
3074  * Routine to get the priority of the highest runnable
3075  * thread in the hierarchy.
3076  */
3077 static int
3078 sched_clutch_root_priority(
3079 	sched_clutch_root_t root_clutch)
3080 {
3081 	return root_clutch->scr_priority;
3082 }
3083 
3084 /*
3085  * sched_clutch_root_count()
3086  *
3087  * Returns total number of runnable threads in the hierarchy.
3088  */
3089 uint32_t
3090 sched_clutch_root_count(
3091 	sched_clutch_root_t root_clutch)
3092 {
3093 	return root_clutch->scr_thr_count;
3094 }
3095 
3096 /*
3097  * sched_clutch_thread_pri_shift()
3098  *
3099  * Routine to get the priority shift value for a thread.
3100  * Since the timesharing is done at the clutch_bucket level,
3101  * this routine gets the clutch_bucket and retrieves the
3102  * values from there.
3103  */
3104 uint32_t
3105 sched_clutch_thread_pri_shift(
3106 	thread_t thread,
3107 	sched_bucket_t bucket)
3108 {
3109 	if (!SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3110 		return INT8_MAX;
3111 	}
3112 	assert(bucket != TH_BUCKET_RUN);
3113 	sched_clutch_t clutch = sched_clutch_for_thread(thread);
3114 	sched_clutch_bucket_group_t clutch_bucket_group = &(clutch->sc_clutch_groups[bucket]);
3115 	return os_atomic_load(&clutch_bucket_group->scbg_pri_shift, relaxed);
3116 }
3117 
3118 #pragma mark -- Clutch Scheduler Algorithm
3119 
3120 static void
3121 sched_clutch_init(void);
3122 
3123 static thread_t
3124 sched_clutch_steal_thread(processor_set_t pset);
3125 
3126 #if !SCHED_TEST_HARNESS
3127 
3128 static void
3129 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context);
3130 
3131 #endif /* !SCHED_TEST_HARNESS */
3132 
3133 static boolean_t
3134 sched_clutch_processor_enqueue(processor_t processor, thread_t thread,
3135     sched_options_t options);
3136 
3137 static boolean_t
3138 sched_clutch_processor_queue_remove(processor_t processor, thread_t thread);
3139 
3140 static ast_t
3141 sched_clutch_processor_csw_check(processor_t processor);
3142 
3143 static boolean_t
3144 sched_clutch_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);
3145 
3146 static int
3147 sched_clutch_runq_count(processor_t processor);
3148 
3149 static boolean_t
3150 sched_clutch_processor_queue_empty(processor_t processor);
3151 
3152 #if !SCHED_TEST_HARNESS
3153 
3154 static uint64_t
3155 sched_clutch_runq_stats_count_sum(processor_t processor);
3156 
3157 #endif /* !SCHED_TEST_HARNESS */
3158 
3159 static int
3160 sched_clutch_processor_bound_count(processor_t processor);
3161 
3162 static void
3163 sched_clutch_pset_init(processor_set_t pset);
3164 
3165 static void
3166 sched_clutch_processor_init(processor_t processor);
3167 
3168 static thread_t
3169 sched_clutch_processor_highest_thread(processor_t processor, sched_clutch_traverse_mode_t mode);
3170 
3171 static thread_t
3172 sched_clutch_choose_thread(processor_t processor, int priority, thread_t prev_thread, ast_t reason);
3173 
3174 #if !SCHED_TEST_HARNESS
3175 
3176 static void
3177 sched_clutch_processor_queue_shutdown(processor_t processor, struct pulled_thread_queue * threadq);
3178 
3179 #endif /* !SCHED_TEST_HARNESS */
3180 
3181 static sched_mode_t
3182 sched_clutch_initial_thread_sched_mode(task_t parent_task);
3183 
3184 static uint32_t
3185 sched_clutch_initial_quantum_size(thread_t thread);
3186 
3187 static uint32_t
3188 sched_clutch_run_incr(thread_t thread);
3189 
3190 static uint32_t
3191 sched_clutch_run_decr(thread_t thread);
3192 
3193 static void
3194 sched_clutch_update_thread_bucket(thread_t thread);
3195 
3196 #if !SCHED_TEST_HARNESS
3197 
3198 static void
3199 sched_clutch_thread_group_recommendation_change(struct thread_group *tg, cluster_type_t new_recommendation);
3200 
3201 #endif /* !SCHED_TEST_HARNESS */
3202 
3203 const struct sched_dispatch_table sched_clutch_dispatch = {
3204 	.sched_name                                     = "clutch",
3205 	.init                                           = sched_clutch_init,
3206 	.timebase_init                                  = sched_timeshare_timebase_init,
3207 	.processor_init                                 = sched_clutch_processor_init,
3208 	.pset_init                                      = sched_clutch_pset_init,
3209 	.choose_thread                                  = sched_clutch_choose_thread,
3210 	.steal_thread_enabled                           = sched_steal_thread_enabled,
3211 	.steal_thread                                   = sched_clutch_steal_thread,
3212 	.processor_enqueue                              = sched_clutch_processor_enqueue,
3213 	.processor_queue_remove                         = sched_clutch_processor_queue_remove,
3214 	.processor_queue_empty                          = sched_clutch_processor_queue_empty,
3215 	.priority_is_urgent                             = priority_is_urgent,
3216 	.processor_csw_check                            = sched_clutch_processor_csw_check,
3217 	.processor_queue_has_priority                   = sched_clutch_processor_queue_has_priority,
3218 	.initial_quantum_size                           = sched_clutch_initial_quantum_size,
3219 	.initial_thread_sched_mode                      = sched_clutch_initial_thread_sched_mode,
3220 	.processor_runq_count                           = sched_clutch_runq_count,
3221 	.processor_bound_count                          = sched_clutch_processor_bound_count,
3222 	.multiple_psets_enabled                         = TRUE,
3223 	.avoid_processor_enabled                        = FALSE,
3224 	.thread_avoid_processor                         = NULL,
3225 	.update_thread_bucket                           = sched_clutch_update_thread_bucket,
3226 	.cpu_init_completed                             = NULL,
3227 	.thread_eligible_for_pset                       = NULL,
3228 	.update_pset_load_average                       = sched_update_pset_load_average,
3229 	.update_pset_avg_execution_time                 = sched_update_pset_avg_execution_time,
3230 
3231 	.rt_choose_processor                            = sched_rt_choose_processor,
3232 	.rt_steal_thread                                = NULL,
3233 	.rt_init_pset                                   = sched_rt_init_pset,
3234 	.rt_init_completed                              = sched_rt_init_completed,
3235 	.rt_runq_count_sum                              = sched_rt_runq_count_sum,
3236 
3237 #if !SCHED_TEST_HARNESS
3238 	.maintenance_continuation                       = sched_timeshare_maintenance_continue,
3239 	.compute_timeshare_priority                     = sched_compute_timeshare_priority,
3240 	.choose_node                                    = sched_choose_node,
3241 	.choose_processor                               = choose_processor,
3242 	.processor_queue_shutdown                       = sched_clutch_processor_queue_shutdown,
3243 	.can_update_priority                            = can_update_priority,
3244 	.update_priority                                = update_priority,
3245 	.lightweight_update_priority                    = lightweight_update_priority,
3246 	.quantum_expire                                 = sched_default_quantum_expire,
3247 	.processor_runq_stats_count_sum                 = sched_clutch_runq_stats_count_sum,
3248 	.thread_update_scan                             = sched_clutch_thread_update_scan,
3249 	.processor_balance                              = sched_SMT_balance,
3250 	.qos_max_parallelism                            = sched_qos_max_parallelism,
3251 	.check_spill                                    = sched_check_spill,
3252 	.ipi_policy                                     = sched_ipi_policy,
3253 	.thread_should_yield                            = sched_thread_should_yield,
3254 	.run_count_incr                                 = sched_clutch_run_incr,
3255 	.run_count_decr                                 = sched_clutch_run_decr,
3256 	.pset_made_schedulable                          = sched_pset_made_schedulable,
3257 	.thread_group_recommendation_change             = sched_clutch_thread_group_recommendation_change,
3258 
3259 	.rt_queue_shutdown                              = sched_rt_queue_shutdown,
3260 	.rt_runq_scan                                   = sched_rt_runq_scan,
3261 #endif /* !SCHED_TEST_HARNESS */
3262 };
3263 
3264 __attribute__((always_inline))
3265 static inline run_queue_t
3266 sched_clutch_bound_runq(processor_t processor)
3267 {
3268 	return &processor->runq;
3269 }
3270 
3271 __attribute__((always_inline))
3272 static inline sched_clutch_root_t
3273 sched_clutch_processor_root_clutch(processor_t processor)
3274 {
3275 	return &processor->processor_set->pset_clutch_root;
3276 }
3277 
3278 __attribute__((always_inline))
3279 static inline run_queue_t
3280 sched_clutch_thread_bound_runq(processor_t processor, __assert_only thread_t thread)
3281 {
3282 	assert(thread->bound_processor == processor);
3283 	return sched_clutch_bound_runq(processor);
3284 }
3285 
3286 static uint32_t
3287 sched_clutch_initial_quantum_size(thread_t thread)
3288 {
3289 	if (thread == THREAD_NULL) {
3290 		return std_quantum;
3291 	}
3292 	assert(sched_clutch_thread_quantum[thread->th_sched_bucket] <= UINT32_MAX);
3293 	return (uint32_t)sched_clutch_thread_quantum[thread->th_sched_bucket];
3294 }
3295 
3296 static sched_mode_t
3297 sched_clutch_initial_thread_sched_mode(task_t parent_task)
3298 {
3299 	if (parent_task == kernel_task) {
3300 		return TH_MODE_FIXED;
3301 	} else {
3302 		return TH_MODE_TIMESHARE;
3303 	}
3304 }
3305 
3306 static void
3307 sched_clutch_processor_init(processor_t processor)
3308 {
3309 	run_queue_init(&processor->runq);
3310 }
3311 
3312 static void
3313 sched_clutch_pset_init(processor_set_t pset)
3314 {
3315 	sched_clutch_root_init(&pset->pset_clutch_root, pset);
3316 }
3317 
3318 static void
3319 sched_clutch_tunables_init(void)
3320 {
3321 	sched_clutch_us_to_abstime(sched_clutch_root_bucket_wcel_us, sched_clutch_root_bucket_wcel);
3322 	sched_clutch_us_to_abstime(sched_clutch_root_bucket_warp_us, sched_clutch_root_bucket_warp);
3323 	sched_clutch_us_to_abstime(sched_clutch_thread_quantum_us, sched_clutch_thread_quantum);
3324 	clock_interval_to_absolutetime_interval(SCHED_CLUTCH_BUCKET_GROUP_ADJUST_THRESHOLD_USECS,
3325 	    NSEC_PER_USEC, &sched_clutch_bucket_group_adjust_threshold);
3326 	assert(sched_clutch_bucket_group_adjust_threshold <= CLUTCH_CPU_DATA_MAX);
3327 	sched_clutch_us_to_abstime(sched_clutch_bucket_group_pending_delta_us, sched_clutch_bucket_group_pending_delta);
3328 }
3329 
3330 static void
3331 sched_clutch_init(void)
3332 {
3333 	if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
3334 		sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
3335 	}
3336 	sched_timeshare_init();
3337 	sched_clutch_tunables_init();
3338 }
3339 
3340 static inline bool
3341 sched_clutch_pri_greater_than_tiebreak(int pri_one, int pri_two, bool one_wins_ties)
3342 {
3343 	if (one_wins_ties) {
3344 		return pri_one >= pri_two;
3345 	} else {
3346 		return pri_one > pri_two;
3347 	}
3348 }
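
/*
 * Usage note (illustrative): callers pass processor->first_timeslice as
 * one_wins_ties, so a previously running thread which still has quantum
 * remaining wins priority ties against the runqueue, e.g.:
 *
 *	sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri,
 *	    bound_thread->sched_pri, processor->first_timeslice);
 */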
3349 
3350 /*
3351  * sched_clutch_processor_highest_thread()
3352  *
3353  * Routine to determine the highest thread on the entire cluster runqueue which
3354  * should be selected to run next, optionally comparing against the previously
3355  * running thread. Removes the highest thread from the runqueue, depending on the
3356  * traverse mode and whether the highest thread is the previously running thread.
3357  *
3358  * Always called with the pset lock held. Assumes that processor->active_thread
3359  * may be locked and modified by another processor.
3360  */
3361 static thread_t
3362 sched_clutch_processor_highest_thread(
3363 	processor_t      processor,
3364 	sched_clutch_traverse_mode_t mode)
3365 {
3366 	sched_clutch_root_t root_clutch = sched_clutch_processor_root_clutch(processor);
3367 	int clutch_pri = sched_clutch_root_priority(root_clutch);
3368 	run_queue_t bound_runq = sched_clutch_bound_runq(processor);
3369 	int bound_pri = bound_runq->highq;
3370 
3371 	bool has_prev_thread = mode == SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT;
3372 	thread_t prev_thread = has_prev_thread ? processor->active_thread : NULL;
3373 
3374 	if (bound_runq->count == 0 && root_clutch->scr_thr_count == 0) {
3375 		/* The runqueue is totally empty */
3376 		assert(bound_pri < MINPRI && clutch_pri < MINPRI);
3377 		return prev_thread;
3378 	}
3379 
3380 	if (has_prev_thread) {
3381 		if (prev_thread->sched_pri >= BASEPRI_RTQUEUES) {
3382 			/* The previous thread is real-time and thus guaranteed higher than the non-RT runqueue */
3383 			return prev_thread;
3384 		}
3385 		/* Allow the previous thread to influence the priority comparison of Clutch hierarchy vs. processor-bound runqueue */
3386 		if (prev_thread->bound_processor != NULL) {
3387 			bound_pri = MAX(bound_pri, prev_thread->sched_pri);
3388 		} else {
3389 			clutch_pri = MAX(clutch_pri, prev_thread->sched_pri);
3390 		}
3391 	}
3392 
3393 	bool prev_thread_is_not_processor_bound = has_prev_thread && (prev_thread->bound_processor == NULL);
3394 	bool prev_thread_is_processor_bound = has_prev_thread && (prev_thread->bound_processor != NULL);
3395 	thread_t next_thread = prev_thread;
3396 	if (clutch_pri > bound_pri) {
3397 		if (root_clutch->scr_thr_count == 0) {
3398 			goto found_thread;
3399 		}
3400 		next_thread = sched_clutch_hierarchy_thread_highest(root_clutch, processor, prev_thread_is_not_processor_bound ? prev_thread : NULL, mode);
3401 	} else {
3402 		if (bound_runq->count == 0 ||
3403 		    (prev_thread_is_processor_bound && sched_clutch_pri_greater_than_tiebreak(prev_thread->sched_pri, bound_runq->highq, processor->first_timeslice))) {
3404 			goto found_thread;
3405 		}
3406 		next_thread = (mode == SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT || mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY) ?
3407 		    run_queue_dequeue(bound_runq, SCHED_HEADQ) : run_queue_peek(bound_runq);
3408 		assert(mode == SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY || next_thread != prev_thread);
3409 	}
3410 found_thread:
3411 	assert(next_thread != NULL);
3412 	return next_thread;
3413 }
3414 
3415 static thread_t
3416 sched_clutch_choose_thread(
3417 	processor_t      processor,
3418 	__unused int              priority,
3419 	thread_t _Nullable        prev_thread,
3420 	__unused ast_t            reason)
3421 {
3422 	assert(prev_thread == NULL || prev_thread == processor->active_thread);
3423 	return sched_clutch_processor_highest_thread(processor, prev_thread != NULL ? SCHED_CLUTCH_TRAVERSE_REMOVE_CONSIDER_CURRENT : SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY);
3424 }
3425 
3426 static boolean_t
3427 sched_clutch_processor_enqueue(
3428 	processor_t       processor,
3429 	thread_t          thread,
3430 	sched_options_t   options)
3431 {
3432 	boolean_t       result;
3433 
3434 	thread_set_runq_locked(thread, processor);
3435 	if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3436 		sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3437 		result = sched_clutch_thread_insert(pset_clutch_root, thread, options);
3438 	} else {
3439 		run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
3440 		result = run_queue_enqueue(rq, thread, options);
3441 	}
3442 	return result;
3443 }
3444 
3445 static boolean_t
3446 sched_clutch_processor_queue_empty(processor_t processor)
3447 {
3448 	return sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0 &&
3449 	       sched_clutch_bound_runq(processor)->count == 0;
3450 }
3451 
3452 static ast_t
3453 sched_clutch_processor_csw_check(processor_t processor)
3454 {
3455 	assert(processor->active_thread != NULL);
3456 	thread_t runqueue_thread = sched_clutch_processor_highest_thread(processor, SCHED_CLUTCH_TRAVERSE_CHECK_PREEMPT);
3457 	if (runqueue_thread != processor->active_thread) {
3458 		/* Found a better thread to run */
3459 		if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0 ||
3460 		    sched_clutch_bound_runq(processor)->urgency > 0) {
3461 			return AST_PREEMPT | AST_URGENT;
3462 		}
3463 		return AST_PREEMPT;
3464 	}
3465 	return AST_NONE;
3466 }
3467 
3468 static boolean_t
3469 sched_clutch_processor_queue_has_priority(
3470 	__unused processor_t    processor,
3471 	__unused int            priority,
3472 	__unused boolean_t      gte)
3473 {
3474 	/*
3475 	 * Never short-circuit the Clutch runqueue by returning FALSE here. Instead,
3476 	 * thread_select() should always go through sched_clutch_choose_thread().
3477 	 */
3478 	return TRUE;
3479 }
3480 
3481 static int
3482 sched_clutch_runq_count(processor_t processor)
3483 {
3484 	return (int)sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) + sched_clutch_bound_runq(processor)->count;
3485 }
3486 
3487 #if !SCHED_TEST_HARNESS
3488 
3489 static uint64_t
3490 sched_clutch_runq_stats_count_sum(processor_t processor)
3491 {
3492 	uint64_t bound_sum = sched_clutch_bound_runq(processor)->runq_stats.count_sum;
3493 
3494 	if (processor->cpu_id == processor->processor_set->cpu_set_low) {
3495 		return bound_sum + sched_clutch_root_count_sum(sched_clutch_processor_root_clutch(processor));
3496 	} else {
3497 		return bound_sum;
3498 	}
3499 }
3500 
3501 #endif /* !SCHED_TEST_HARNESS */
3502 
3503 static int
3504 sched_clutch_processor_bound_count(processor_t processor)
3505 {
3506 	return sched_clutch_bound_runq(processor)->count;
3507 }
3508 
3509 #if !SCHED_TEST_HARNESS
3510 
3511 static void
3512 sched_clutch_processor_queue_shutdown(processor_t processor, struct pulled_thread_queue * threadq)
3513 {
3514 	processor_set_t pset = processor->processor_set;
3515 	sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3516 
3517 	/* We only need to migrate threads if this is the last active processor in the pset */
3518 	if (pset->online_processor_count == 0) {
3519 		while (sched_clutch_root_count(pset_clutch_root) > 0) {
3520 			thread_t thread = sched_clutch_hierarchy_thread_highest(
3521 				pset_clutch_root, processor, NULL, SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY);
3522 			pulled_thread_queue_enqueue(threadq, thread);
3523 		}
3524 	}
3525 
3526 	pset_unlock(pset);
3527 }
3528 
3529 #endif /* !SCHED_TEST_HARNESS */
3530 
3531 static boolean_t
3532 sched_clutch_processor_queue_remove(
3533 	processor_t processor,
3534 	thread_t    thread)
3535 {
3536 	processor_set_t         pset = processor->processor_set;
3537 
3538 	pset_lock(pset);
3539 
3540 	if (processor == thread_get_runq_locked(thread)) {
3541 		/*
3542 		 * Thread is on a run queue and we have a lock on
3543 		 * that run queue.
3544 		 */
3545 		if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) {
3546 			sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
3547 			sched_clutch_thread_remove(pset_clutch_root, thread, mach_absolute_time(), SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
3548 		} else {
3549 			run_queue_t rq = sched_clutch_thread_bound_runq(processor, thread);
3550 			run_queue_remove(rq, thread);
3551 		}
3552 	} else {
3553 		/*
3554 		 * The thread left the run queue before we could
3555 		 * lock the run queue.
3556 		 */
3557 		thread_assert_runq_null(thread);
3558 		processor = PROCESSOR_NULL;
3559 	}
3560 
3561 	pset_unlock(pset);
3562 
3563 	return processor != PROCESSOR_NULL;
3564 }
3565 
3566 static thread_t
3567 sched_clutch_steal_thread(__unused processor_set_t pset)
3568 {
3569 	/* Thread stealing is not enabled for single cluster clutch scheduler platforms */
3570 	return THREAD_NULL;
3571 }
3572 
3573 #if !SCHED_TEST_HARNESS
3574 
3575 static void
3576 sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context)
3577 {
3578 	boolean_t               restart_needed = FALSE;
3579 	processor_t             processor = processor_list;
3580 	processor_set_t         pset;
3581 	thread_t                thread;
3582 	spl_t                   s;
3583 
3584 	/*
3585 	 *  We update the threads associated with each processor (bound and idle threads)
3586 	 *  and then update the threads in each pset runqueue.
3587 	 */
3588 
3589 	do {
3590 		do {
3591 			pset = processor->processor_set;
3592 
3593 			s = splsched();
3594 			pset_lock(pset);
3595 
3596 			restart_needed = runq_scan(sched_clutch_bound_runq(processor), scan_context);
3597 
3598 			pset_unlock(pset);
3599 			splx(s);
3600 
3601 			if (restart_needed) {
3602 				break;
3603 			}
3604 
3605 			thread = processor->idle_thread;
3606 			if (thread != THREAD_NULL && thread->sched_stamp != os_atomic_load(&sched_tick, relaxed)) {
3607 				if (thread_update_add_thread(thread) == FALSE) {
3608 					restart_needed = TRUE;
3609 					break;
3610 				}
3611 			}
3612 		} while ((processor = processor->processor_list) != NULL);
3613 
3614 		/* Ok, we now have a collection of candidates -- fix them. */
3615 		thread_update_process_threads();
3616 	} while (restart_needed);
3617 
3618 	pset_node_t node = &pset_node0;
3619 	pset = node->psets;
3620 
3621 	do {
3622 		do {
3623 			restart_needed = FALSE;
3624 			while (pset != NULL) {
3625 				s = splsched();
3626 				pset_lock(pset);
3627 
3628 				if (sched_clutch_root_count(&pset->pset_clutch_root) > 0) {
3629 					for (sched_bucket_t bucket = TH_BUCKET_SHARE_FG; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
3630 						restart_needed = runq_scan(&pset->pset_clutch_root.scr_bound_buckets[bucket].scrb_bound_thread_runq, scan_context);
3631 						if (restart_needed) {
3632 							break;
3633 						}
3634 					}
3635 					queue_t clutch_bucket_list = &pset->pset_clutch_root.scr_clutch_buckets;
3636 					sched_clutch_bucket_t clutch_bucket;
3637 					qe_foreach_element(clutch_bucket, clutch_bucket_list, scb_listlink) {
3638 						sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, scan_context->sched_tick_last_abstime);
3639 						restart_needed = sched_clutch_timeshare_scan(&clutch_bucket->scb_thread_timeshare_queue, clutch_bucket->scb_thr_count, scan_context);
3640 						if (restart_needed) {
3641 							break;
3642 						}
3643 					}
3644 				}
3645 
3646 				pset_unlock(pset);
3647 				splx(s);
3648 
3649 				if (restart_needed) {
3650 					break;
3651 				}
3652 				pset = pset->pset_list;
3653 			}
3654 
3655 			if (restart_needed) {
3656 				break;
3657 			}
3658 		} while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
3659 
3660 		/* Ok, we now have a collection of candidates -- fix them. */
3661 		thread_update_process_threads();
3662 	} while (restart_needed);
3663 }
3664 
3665 /*
3666  * For threads that have changed sched_pri without changing the
3667  * base_pri for any reason other than decay, use the sched_pri
3668  * as the bucketizing priority instead of base_pri. All such
3669  * changes are typically due to boosts or demotions from kernel
3670  * locking primitives.
3671  */
3672 static boolean_t
3673 sched_thread_sched_pri_promoted(thread_t thread)
3674 {
3675 	return (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) ||
3676 	       (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) ||
3677 	       (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) ||
3678 	       (thread->kern_promotion_schedpri != 0);
3679 }
3680 
3681 #endif /* !SCHED_TEST_HARNESS */
3682 
3683 /*
3684  * For the clutch scheduler, the run counts are maintained in the clutch
3685  * buckets (i.e thread group scheduling structure).
3686  */
3687 static uint32_t
3688 sched_clutch_run_incr(thread_t thread)
3689 {
3690 	assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
3691 	uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3692 	sched_clutch_thread_run_bucket_incr(thread, thread->th_sched_bucket);
3693 	return new_count;
3694 }
3695 
3696 static uint32_t
3697 sched_clutch_run_decr(thread_t thread)
3698 {
3699 	assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);
3700 	uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
3701 	sched_clutch_thread_run_bucket_decr(thread, thread->th_sched_bucket);
3702 	return new_count;
3703 }
3704 
3705 /*
3706  * Routine to update the scheduling bucket for the thread.
3707  *
3708  * In the clutch scheduler implementation, the thread's bucket
3709  * is based on sched_pri if it was promoted due to a kernel
3710  * primitive; otherwise it is based on the thread's base_pri. This
3711  * enhancement allows promoted threads to reach a higher priority
3712  * bucket and potentially get selected sooner for scheduling.
3713  *
3714  * Also, the clutch scheduler does not honor fixed priority below
3715  * FG priority. It simply puts those threads in the corresponding
3716  * timeshare bucket. The reason for doing that is that it is
3717  * extremely hard to define the scheduling properties of such threads
3718  * and they typically lead to performance issues.
3719  *
3720  * Called with the thread lock held and the thread held off the runqueue.
3721  */
3722 
3723 void
3724 sched_clutch_update_thread_bucket(thread_t thread)
3725 {
3726 	sched_bucket_t old_bucket = thread->th_sched_bucket;
3727 	thread_assert_runq_null(thread);
3728 	int pri = (sched_thread_sched_pri_promoted(thread)) ? thread->sched_pri : thread->base_pri;
3729 	sched_bucket_t new_bucket = sched_clutch_thread_bucket_map(thread, pri);
3730 
3731 	if (old_bucket == new_bucket) {
3732 		return;
3733 	}
3734 
3735 	/* Bypass accounting CPU usage for a newly created thread */
3736 	if (old_bucket != TH_BUCKET_RUN) {
3737 		/* Attribute CPU usage with the old scheduling bucket */
3738 		sched_clutch_thread_tick_delta(thread, NULL);
3739 	}
3740 
3741 	/* Transition to the new sched_bucket */
3742 	thread->th_sched_bucket = new_bucket;
3743 	thread->pri_shift = sched_clutch_thread_pri_shift(thread, new_bucket);
3744 
3745 	/*
3746 	 * Since this is called after the thread has been removed from the runq,
3747 	 * only the run counts need to be updated. The re-insert into the runq
3748 	 * would put the thread into the correct new bucket's runq.
3749 	 */
3750 	if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
3751 		sched_clutch_thread_run_bucket_decr(thread, old_bucket);
3752 		sched_clutch_thread_run_bucket_incr(thread, new_bucket);
3753 	}
3754 }
3755 
3756 #if !SCHED_TEST_HARNESS
3757 
3758 static void
3759 sched_clutch_thread_group_recommendation_change(__unused struct thread_group *tg, __unused cluster_type_t new_recommendation)
3760 {
3761 	/* Clutch ignores the recommendation because, unlike the Edge scheduler,
3762 	 * Clutch does not migrate threads between cluster types on its own.
3763 	 */
3764 }
3765 
3766 #endif /* !SCHED_TEST_HARNESS */
3767 
3768 #if CONFIG_SCHED_EDGE
3769 
3770 /* Implementation of the AMP version of the clutch scheduler */
3771 
3772 static void
3773 sched_edge_init(void);
3774 
3775 static void
3776 sched_edge_pset_init(processor_set_t pset);
3777 
3778 static thread_t
3779 sched_edge_processor_idle(processor_set_t pset);
3780 
3781 static boolean_t
3782 sched_edge_processor_queue_empty(processor_t processor);
3783 
3784 static void
3785 sched_edge_processor_queue_shutdown(processor_t processor, struct pulled_thread_queue * threadq);
3786 
3787 static processor_t
3788 sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread, sched_options_t *options_inout);
3789 
3790 static void
3791 sched_edge_quantum_expire(thread_t thread);
3792 
3793 static bool
3794 sched_edge_thread_avoid_processor(processor_t processor, thread_t thread, ast_t reason);
3795 
3796 static bool
3797 sched_edge_balance(processor_t cprocessor, processor_set_t cpset);
3798 
3799 static void
3800 sched_edge_check_spill(processor_set_t pset, thread_t thread);
3801 
3802 static bool
3803 sched_edge_thread_should_yield(processor_t processor, thread_t thread);
3804 
3805 static void
3806 sched_edge_pset_made_schedulable(processor_set_t pset);
3807 
3808 static void
3809 sched_edge_cpu_init_completed(void);
3810 
3811 static bool
3812 sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset);
3813 
3814 static bool
3815 sched_edge_steal_thread_enabled(processor_set_t pset);
3816 
3817 static sched_ipi_type_t
3818 sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event);
3819 
3820 static uint32_t
3821 sched_edge_qos_max_parallelism(int qos, uint64_t options);
3822 
3823 static void
3824 sched_edge_update_pset_load_average(processor_set_t pset, uint64_t curtime);
3825 
3826 static void
3827 sched_edge_update_pset_avg_execution_time(processor_set_t pset, uint64_t execution_time, uint64_t curtime, sched_bucket_t sched_bucket);
3828 
3829 static uint32_t
3830 sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket);
3831 
3832 static uint32_t
3833 sched_edge_run_count_incr(thread_t thread);
3834 
3835 static bool
3836 sched_edge_stir_the_pot_core_type_is_desired(processor_set_t pset);
3837 
3838 const struct sched_dispatch_table sched_edge_dispatch = {
3839 	.sched_name                                     = "edge",
3840 	.init                                           = sched_edge_init,
3841 	.timebase_init                                  = sched_timeshare_timebase_init,
3842 	.processor_init                                 = sched_clutch_processor_init,
3843 	.pset_init                                      = sched_edge_pset_init,
3844 	.choose_thread                                  = sched_clutch_choose_thread,
3845 	.steal_thread_enabled                           = sched_edge_steal_thread_enabled,
3846 	.steal_thread                                   = sched_edge_processor_idle,
3847 	.choose_processor                               = sched_edge_choose_processor,
3848 	.processor_enqueue                              = sched_clutch_processor_enqueue,
3849 	.processor_queue_remove                         = sched_clutch_processor_queue_remove,
3850 	.processor_queue_empty                          = sched_edge_processor_queue_empty,
3851 	.priority_is_urgent                             = priority_is_urgent,
3852 	.processor_csw_check                            = sched_clutch_processor_csw_check,
3853 	.processor_queue_has_priority                   = sched_clutch_processor_queue_has_priority,
3854 	.initial_quantum_size                           = sched_clutch_initial_quantum_size,
3855 	.initial_thread_sched_mode                      = sched_clutch_initial_thread_sched_mode,
3856 	.processor_runq_count                           = sched_clutch_runq_count,
3857 	.processor_bound_count                          = sched_clutch_processor_bound_count,
3858 	.multiple_psets_enabled                         = TRUE,
3859 	.avoid_processor_enabled                        = TRUE,
3860 	.thread_avoid_processor                         = sched_edge_thread_avoid_processor,
3861 	.processor_balance                              = sched_edge_balance,
3862 	.qos_max_parallelism                            = sched_edge_qos_max_parallelism,
3863 	.check_spill                                    = sched_edge_check_spill,
3864 	.ipi_policy                                     = sched_edge_ipi_policy,
3865 	.thread_should_yield                            = sched_edge_thread_should_yield,
3866 	.update_thread_bucket                           = sched_clutch_update_thread_bucket,
3867 	.cpu_init_completed                             = sched_edge_cpu_init_completed,
3868 	.thread_eligible_for_pset                       = sched_edge_thread_eligible_for_pset,
3869 	.update_pset_load_average                       = sched_edge_update_pset_load_average,
3870 	.update_pset_avg_execution_time                 = sched_edge_update_pset_avg_execution_time,
3871 
3872 	.rt_choose_processor                            = sched_rt_choose_processor,
3873 	.rt_steal_thread                                = sched_rt_steal_thread,
3874 	.rt_init_pset                                   = sched_rt_init_pset,
3875 	.rt_init_completed                              = sched_rt_init_completed,
3876 	.rt_runq_count_sum                              = sched_rt_runq_count_sum,
3877 
3878 #if !SCHED_TEST_HARNESS
3879 	.maintenance_continuation                       = sched_timeshare_maintenance_continue,
3880 	.compute_timeshare_priority                     = sched_compute_timeshare_priority,
3881 	.choose_node                                    = sched_choose_node,
3882 	.processor_queue_shutdown                       = sched_edge_processor_queue_shutdown,
3883 	.can_update_priority                            = can_update_priority,
3884 	.update_priority                                = update_priority,
3885 	.lightweight_update_priority                    = lightweight_update_priority,
3886 	.quantum_expire                                 = sched_edge_quantum_expire,
3887 	.processor_runq_stats_count_sum                 = sched_clutch_runq_stats_count_sum,
3888 	.thread_update_scan                             = sched_clutch_thread_update_scan,
3889 	.run_count_incr                                 = sched_edge_run_count_incr,
3890 	.run_count_decr                                 = sched_clutch_run_decr,
3891 	.pset_made_schedulable                          = sched_edge_pset_made_schedulable,
3892 	.thread_group_recommendation_change             = NULL,
3893 
3894 	.rt_queue_shutdown                              = sched_rt_queue_shutdown,
3895 	.rt_runq_scan                                   = sched_rt_runq_scan,
3896 #endif /* !SCHED_TEST_HARNESS */
3897 };
3898 
3899 static _Atomic bitmap_t sched_edge_available_pset_bitmask[BITMAP_LEN(MAX_PSETS)];
3900 
3901 /*
3902  * sched_edge_thread_bound_cluster_id()
3903  *
3904  * Routine to determine which cluster a particular thread is bound to. Uses
3905  * the sched_flags on the thread to map back to a specific cluster id.
3906  *
3907  * <Edge Multi-cluster Support Needed>
3908  */
3909 static uint32_t
3910 sched_edge_thread_bound_cluster_id(thread_t thread)
3911 {
3912 	assert(SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread));
3913 	return thread->th_bound_cluster_id;
3914 }
3915 
3916 /* Forward declaration for some thread migration routines */
3917 static boolean_t sched_edge_foreign_running_thread_available(processor_set_t pset);
3918 static processor_set_t sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks, processor_t *processor_hint_out, sched_options_t *options_inout);
3919 
3920 static_assert(sizeof(sched_clutch_edge) == sizeof(uint64_t), "sched_clutch_edge fits in 64 bits");
3921 
3922 #define PERMISSIVE_MIGRATION_BUCKET (TH_BUCKET_FIXPRI)
3923 
3924 /*
3925  * sched_edge_config_set()
3926  *
3927  * Support to update an edge configuration. Typically used by CLPC to affect thread migration
3928  * policies in the scheduler.
3929  */
3930 static void
3931 sched_edge_config_set(uint32_t src_cluster, uint32_t dst_cluster, sched_bucket_t bucket, sched_clutch_edge edge_config)
3932 {
3933 	os_atomic_store(&pset_for_id(src_cluster)->sched_edges[dst_cluster][bucket], edge_config, relaxed);
3934 }
3935 
3936 /*
3937  * sched_edge_config_get()
3938  *
3939  * Support to get an edge configuration. Typically used by CLPC to query edge configs to decide
3940  * if it needs to update edges.
3941  */
3942 static sched_clutch_edge
3943 sched_edge_config_get(uint32_t src_cluster, uint32_t dst_cluster, sched_bucket_t bucket)
3944 {
3945 	return os_atomic_load(&pset_array[src_cluster]->sched_edges[dst_cluster][bucket], relaxed);
3946 }
3947 
3948 /*
3949  * sched_edge_config_pset_push()
3950  *
3951  * After using sched_edge_config_set() to update edge tunables outgoing from a particular source
3952  * pset, this function should be called in order to propagate the updates to derived metadata for
3953  * the pset, such as search orders for outgoing spill and steal.
3954  */
3955 static void
3956 sched_edge_config_pset_push(uint32_t src_pset_id)
3957 {
3958 	processor_set_t src_pset = pset_array[src_pset_id];
3959 	uint8_t search_order_len = sched_num_psets - 1;
3960 	sched_pset_search_order_sort_data_t search_order_datas[MAX_PSETS - 1];
3961 	for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
3962 		uint8_t dst_pset_id = 0;
3963 		for (int i = 0; i < search_order_len; i++, dst_pset_id++) {
3964 			if (dst_pset_id == src_pset->pset_id) {
3965 				dst_pset_id++;
3966 			}
3967 			search_order_datas[i].spsosd_src_pset = src_pset;
3968 			search_order_datas[i].spsosd_dst_pset_id = dst_pset_id;
3969 			sched_clutch_edge edge = sched_edge_config_get(src_pset->pset_id, dst_pset_id, bucket);
3970 			search_order_datas[i].spsosd_migration_weight = edge.sce_migration_allowed ?
3971 			    edge.sce_migration_weight : UINT32_MAX;
3972 		}
3973 		sched_pset_search_order_compute(&src_pset->spill_search_order[bucket],
3974 		    search_order_datas, search_order_len, sched_edge_search_order_weight_then_locality_cmp);
3975 	}
3976 }
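/*
 * For illustration only (a hypothetical caller, not part of the original
 * source): a single edge update would follow the same set-then-push pattern
 * used by sched_edge_matrix_set() below, with the sce_* fields referenced
 * elsewhere in this file:
 *
 *     sched_clutch_edge edge = {0};
 *     edge.sce_migration_allowed = 1;
 *     edge.sce_steal_allowed     = 1;
 *     edge.sce_migration_weight  = 2;
 *     sched_edge_config_set(src_cluster, dst_cluster, bucket, edge);
 *     sched_edge_config_pset_push(src_cluster);
 *     sched_edge_config_final_push();
 */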
3977 
3978 static int
3979 sched_edge_search_order_weight_then_locality(const void *a, const void *b)
3980 {
3981 	const sched_pset_search_order_sort_data_t *data_a = (const sched_pset_search_order_sort_data_t *)a;
3982 	const sched_pset_search_order_sort_data_t *data_b = (const sched_pset_search_order_sort_data_t *)b;
3983 	assert3p(data_a->spsosd_src_pset, ==, data_b->spsosd_src_pset);
3984 	assert3u(data_a->spsosd_dst_pset_id, !=, data_b->spsosd_dst_pset_id);
3985 	/*
3986 	 * Sort based on lowest edge migration weight, followed by die-local psets
3987 	 * first, followed by lowest pset id.
3988 	 */
3989 	if (data_a->spsosd_migration_weight != data_b->spsosd_migration_weight) {
3990 		return (data_a->spsosd_migration_weight < data_b->spsosd_migration_weight) ? -1 : 1;
3991 	}
3992 
3993 	bool is_local_a = bitmap_test(data_a->spsosd_src_pset->local_psets, data_a->spsosd_dst_pset_id);
3994 	bool is_local_b = bitmap_test(data_b->spsosd_src_pset->local_psets, data_b->spsosd_dst_pset_id);
3995 	if (is_local_a != is_local_b) {
3996 		return is_local_a ? -1 : 1;
3997 	}
3998 
3999 	if (data_a->spsosd_dst_pset_id != data_b->spsosd_dst_pset_id) {
4000 		return (data_a->spsosd_dst_pset_id < data_b->spsosd_dst_pset_id) ? -1 : 1;
4001 	}
4002 	return 0;
4003 }
4004 
4005 cmpfunc_t sched_edge_search_order_weight_then_locality_cmp = &sched_edge_search_order_weight_then_locality;
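/*
 * For illustration (hypothetical weights, not part of the original source):
 * with edges outgoing from pset 0 of weight 2 to pset 1 and weight 1 to
 * psets 2 and 3, where only pset 3 is die-local to pset 0, the comparator
 * above produces the spill search order {3, 2, 1}: lowest migration weight
 * first, die-local psets breaking ties, then lowest pset id.
 */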
4006 
4007 #if DEVELOPMENT || DEBUG || SCHED_TEST_HARNESS
4008 
4009 /*
4010  * sched_edge_config_verify_non_decreasing_qos_strictness()
4011  *
4012  * Routine to validate the assumption that higher QoSes
4013  * will be configured with the less restrictive migration
4014  * allowance for each edge in the matrix. This allows
4015  * early-exiting searches when migration is disallowed for
4016  * a higher QoS edge.
4017  * Returns true if no violations were discovered.
4018  */
4019 static inline bool
4020 sched_edge_config_verify_non_decreasing_qos_strictness(
4021 	pset_id_t src_id, pset_id_t dst_id, sched_bucket_t bucket)
4022 {
4023 	if (bucket == PERMISSIVE_MIGRATION_BUCKET) {
4024 		return true;
4025 	}
4026 	sched_clutch_edge edge = sched_edge_config_get(src_id, dst_id, bucket);
4027 	sched_clutch_edge higher_bucket_edge = sched_edge_config_get(src_id, dst_id, bucket - 1);
4028 	if ((edge.sce_migration_allowed && !higher_bucket_edge.sce_migration_allowed) ||
4029 	    (edge.sce_steal_allowed && !higher_bucket_edge.sce_steal_allowed)) {
4030 		kprintf("warn: Edge matrix config violates non-decreasing strictness "
4031 		    "across buckets %u and %u for edge %u->%u\n",
4032 		    bucket - 1, bucket, src_id, dst_id);
4033 		return false;
4034 	}
4035 	return true;
4036 }
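/*
 * For illustration (hypothetical configuration, not part of the original
 * source): since lower bucket indices correspond to higher QoS, the check
 * above would flag a matrix where the TH_BUCKET_SHARE_UT edge from pset 1
 * to pset 0 allows migration or steal while the next higher QoS edge
 * (TH_BUCKET_SHARE_DF) for the same pset pair does not.
 */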
4037 
4038 static bool
4039 sched_edge_config_verify_transitive_traverse(pset_id_t dst_id, pset_id_t curr_id,
4040     sched_bucket_t qos, bitmap_t *visited_map)
4041 {
4042 	if (bitmap_test(visited_map, curr_id)) {
4043 		/* Been there, done that */
4044 		return true;
4045 	}
4046 	bitmap_set(visited_map, curr_id);
4047 	bool pass = true;
4048 	for (pset_id_t next_id = 0; next_id < sched_num_psets; next_id++) {
4049 		if (next_id == curr_id) {
4050 			continue;
4051 		}
4052 		sched_clutch_edge path_edge = sched_edge_config_get(next_id, curr_id, qos);
4053 		if (path_edge.sce_migration_allowed) {
4054 			/*
4055 			 * We have found a migration path from next_id to dst_id.
4056 			 * Verify that the direct edge agrees.
4057 			 */
4058 			if (next_id != dst_id) {
4059 				sched_clutch_edge direct_edge = sched_edge_config_get(next_id, dst_id, qos);
4060 				if (!direct_edge.sce_migration_allowed || !direct_edge.sce_steal_allowed) {
4061 					pass = false;
4062 					kprintf("warn: Edge matrix config violates transitive property across "
4063 					    "psets %u->%u for scheduling bucket %u\n", next_id, dst_id, qos);
4064 				}
4065 			}
4066 			/* DFS onward */
4067 			pass = sched_edge_config_verify_transitive_traverse(dst_id, next_id, qos, visited_map) && pass;
4068 		}
4069 	}
4070 	return pass;
4071 }
4072 
4073 /*
4074  * sched_edge_config_verify_transitive()
4075  *
4076  * Routine to validate transitivity of the Edge matrix which
4077  * helps ensure that the configured migration policy minimizes
4078  * scheduling latency by allowing threads to directly spill to
4079  * idle cores where they are allowed to run, rather than
4080  * arrive on those cores only via steal operations.
4081  * Returns true if no violations were discovered.
4082  */
4083 static bool
4084 sched_edge_config_verify_transitive(pset_id_t dst_id)
4085 {
4086 	bool pass = true;
4087 	for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
4088 		/*
4089 		 * Depth-first-search paths to get to the destination pset,
4090 		 * and verify that each path also has a matching direct edge
4091 		 * from start to finish.
4092 		 */
4093 		bitmap_t visited_map[BITMAP_LEN(MAX_PSETS)] = {0};
4094 		pass = sched_edge_config_verify_transitive_traverse(dst_id, dst_id, bucket, visited_map) && pass;
4095 	}
4096 	return pass;
4097 }
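/*
 * For illustration (hypothetical configuration, not part of the original
 * source): if, for a given bucket, migration is allowed along the edges
 * 2 -> 1 and 1 -> 0, a thread could reach pset 0 via pset 1; the check
 * above therefore requires the direct edge 2 -> 0 to allow both migration
 * and steal for that bucket.
 */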
4098 
4099 /*
4100  * sched_edge_config_verify()
4101  *
4102  * Performs checks to validate assumed properties of the Edge matrix,
4103  * such as transitivity.
4104  * Returns true if no violations were discovered.
4105  */
4106 static bool
4107 sched_edge_config_verify(void)
4108 {
4109 	bool pass = true;
4110 	sched_edge_matrix_iterate(src_id, dst_id, bucket, { \
4111 		pass = sched_edge_config_verify_non_decreasing_qos_strictness(src_id, dst_id, bucket) && pass;
4112 	});
4113 	for (pset_id_t dst_id = 0; dst_id < sched_num_psets; dst_id++) {
4114 		pass = sched_edge_config_verify_transitive(dst_id) && pass;
4115 	}
4116 	return pass;
4117 }
4118 
4119 #endif /* DEVELOPMENT || DEBUG || SCHED_TEST_HARNESS */
4120 
4121 /*
4122  * sched_edge_config_final_push()
4123  *
4124  * After using sched_edge_config_set() to update edge tunables outgoing from every pset,
4125  * this function is called in order to propagate the updates to derived global metadata,
4126  * such as short-cut bitmasks.
4127  */
4128 static void
4129 sched_edge_config_final_push(void)
4130 {
4131 	for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
4132 		for (pset_id_t dst_id = 0; dst_id < sched_num_psets; dst_id++) {
4133 			bitmap_t updated_steal_map[BITMAP_LEN(MAX_PSETS)] = {0};
4134 			for (pset_id_t src_id = 0; src_id < sched_num_psets; src_id++) {
4135 				sched_clutch_edge edge = sched_edge_config_get(src_id, dst_id, bucket);
4136 				if ((dst_id == src_id) || edge.sce_migration_allowed) {
4137 					bitmap_set(updated_steal_map, src_id);
4138 				}
4139 			}
4140 			sched_clutch_root_t dst_root = &pset_array[dst_id]->pset_clutch_root;
4141 			os_atomic_store(dst_root->scr_incoming_migration_allowed[bucket], updated_steal_map[0], relaxed);
4142 		}
4143 	}
4144 }
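/*
 * For illustration (hypothetical configuration, not part of the original
 * source): with three psets where, for a given bucket, only the edge 2 -> 0
 * allows migration, the loop above stores 0b101 into pset 0's
 * scr_incoming_migration_allowed[bucket]: bit 0 because src == dst and
 * bit 2 for the allowed incoming edge.
 */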
4145 
4146 /*
4147  * sched_edge_matrix_set()
4148  *
4149  * Routine to update various edges in the edge migration graph. The edge_changed array
4150  * indicates which edges need to be updated. Both the edge_matrix and edge_changed arrays
4151  * are matrices with dimension num_psets * num_psets * TH_BUCKET_SCHED_MAX, flattened into a
4152  * single-dimensional array.
4153  */
4154 void
4155 sched_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changed, __unused uint64_t flags,
4156     __assert_only uint64_t num_psets)
4157 {
4158 	assert3u(num_psets, ==, sched_num_psets);
4159 	uint32_t edge_index = 0;
4160 	for (uint32_t src_cluster = 0; src_cluster < sched_num_psets; src_cluster++) {
4161 		for (uint32_t dst_cluster = 0; dst_cluster < sched_num_psets; dst_cluster++) {
4162 			for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
4163 				if (edge_changed[edge_index]) {
4164 					sched_edge_config_set(src_cluster, dst_cluster, bucket, edge_matrix[edge_index]);
4165 				}
4166 				edge_index++;
4167 			}
4168 		}
4169 		sched_edge_config_pset_push(src_cluster);
4170 	}
4171 	sched_edge_config_final_push();
4172 }
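/*
 * For illustration (not part of the original source): given the loop nesting
 * above, the flattened index of the edge from src to dst at a given bucket is
 *
 *     edge_index = ((src * num_psets) + dst) * TH_BUCKET_SCHED_MAX + bucket
 *
 * e.g., with 3 psets and TH_BUCKET_SCHED_MAX == 6, the edge 1 -> 2 at
 * bucket 1 sits at index ((1 * 3) + 2) * 6 + 1 = 31.
 */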
4173 
4174 /*
4175  * sched_edge_matrix_get()
4176  *
4177  * Routine to retrieve various edges in the edge migration graph. The edge_requested array
4178  * indicates which edges need to be retrieved. Both the edge_matrix and edge_requested arrays
4179  * are matrices with dimension num_psets * num_psets * TH_BUCKET_SCHED_MAX, flattened into a
4180  * single-dimensional array.
4181  */
4182 void
4183 sched_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_requested, __unused uint64_t flags,
4184     __assert_only uint64_t num_psets)
4185 {
4186 	assert3u(num_psets, ==, sched_num_psets);
4187 	uint32_t edge_index = 0;
4188 	for (uint32_t src_pset = 0; src_pset < sched_num_psets; src_pset++) {
4189 		for (uint32_t dst_pset = 0; dst_pset < sched_num_psets; dst_pset++) {
4190 			for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
4191 				if (edge_requested[edge_index]) {
4192 					edge_matrix[edge_index] = sched_edge_config_get(src_pset, dst_pset, bucket);
4193 				}
4194 				edge_index++;
4195 			}
4196 		}
4197 	}
4198 }
4199 
4200 
4201 /*
4202  * sched_edge_init()
4203  *
4204  * Routine to initialize the data structures for the Edge scheduler.
4205  */
4206 static void
4207 sched_edge_init(void)
4208 {
4209 	if (!PE_parse_boot_argn("sched_clutch_bucket_group_interactive_pri", &sched_clutch_bucket_group_interactive_pri, sizeof(sched_clutch_bucket_group_interactive_pri))) {
4210 		sched_clutch_bucket_group_interactive_pri = SCHED_CLUTCH_BUCKET_GROUP_INTERACTIVE_PRI_DEFAULT;
4211 	}
4212 	sched_timeshare_init();
4213 	sched_clutch_tunables_init();
4214 	assert3s(sched_num_psets, >, 0);
4215 	assert3s(sched_num_psets, <=, (int)MAX_PSETS);
4216 }
4217 
4218 static void
4219 sched_edge_pset_init(processor_set_t pset)
4220 {
4221 	uint32_t pset_cluster_id = pset->pset_cluster_id;
4222 	pset->pset_type = pset_cluster_type_to_cluster_type(pset->pset_cluster_type);
4223 	/* Each pset must declare an AMP type */
4224 	assert(pset->pset_type != CLUSTER_TYPE_SMP);
4225 
4226 	/* Set the edge weight and properties for the pset itself */
4227 	bitmap_clear(pset->foreign_psets, pset_cluster_id);
4228 	bitmap_clear(pset->native_psets, pset_cluster_id);
4229 	bitmap_clear(pset->local_psets, pset_cluster_id);
4230 	bitmap_clear(pset->remote_psets, pset_cluster_id);
4231 	bzero(&pset->sched_edges, sizeof(pset->sched_edges));
4232 	bzero(&pset->max_parallel_cores, sizeof(pset->max_parallel_cores));
4233 	bzero(&pset->max_parallel_clusters, sizeof(pset->max_parallel_clusters));
4234 	for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
4235 		sched_pset_search_order_init(pset, &pset->spill_search_order[bucket]);
4236 	}
4237 	sched_clutch_root_init(&pset->pset_clutch_root, pset);
4238 	atomic_bitmap_set(sched_edge_available_pset_bitmask, pset_cluster_id, memory_order_relaxed);
4239 }
4240 
4241 static boolean_t
4242 sched_edge_processor_queue_empty(processor_t processor)
4243 {
4244 	return (sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0) &&
4245 	       (sched_clutch_bound_runq(processor)->count == 0);
4246 }
4247 
4248 static void
4249 sched_edge_check_spill(__unused processor_set_t pset, __unused thread_t thread)
4250 {
4251 	assert(thread->bound_processor == PROCESSOR_NULL);
4252 }
4253 
4254 __options_decl(sched_edge_thread_yield_reason_t, uint32_t, {
4255 	SCHED_EDGE_YIELD_RUNQ_NONEMPTY       = 0x0,
4256 	/* SCHED_EDGE_YIELD_FOREIGN_RUNNABLE    = 0x1, unused */
4257 	SCHED_EDGE_YIELD_FOREIGN_RUNNING     = 0x2,
4258 	SCHED_EDGE_YIELD_STEAL_POSSIBLE      = 0x3,
4259 	SCHED_EDGE_YIELD_DISALLOW            = 0x4,
4260 });
4261 
4262 /*
4263  * sched_edge_thread_should_yield()
4264  *
4265  * Routine for a fast-path decision on whether to proceed with
4266  * depressing the priority of, and considering preempting, a
4267  * yielding thread.
4268  * Called with preemption disabled but WITHOUT the pset lock held.
4269  */
4270 static bool
4271 sched_edge_thread_should_yield(processor_t processor, __unused thread_t thread)
4272 {
4273 	/* Self runqueue case exactly matches sched_thread_should_yield() */
4274 	if (!sched_edge_processor_queue_empty(processor) || (rt_runq_count(processor->processor_set) > 0)) {
4275 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
4276 		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_RUNQ_NONEMPTY);
4277 		return true;
4278 	}
4279 
4280 	/* Scan for running rebalance opportunity */
4281 	if (sched_edge_foreign_running_thread_available(processor->processor_set)) {
4282 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
4283 		    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_FOREIGN_RUNNING);
4284 		return true;
4285 	}
4286 
4287 	/* Scan for steal opportunity */
4288 	sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
4289 	uint64_t try_all_mask = ~0ULL;
4290 	while (sched_iterate_psets_ordered(processor->processor_set,
4291 	    &processor->processor_set->spill_search_order[TH_BUCKET_FIXPRI], try_all_mask, &istate)) {
4292 		processor_set_t target_pset = pset_array[istate.spis_pset_id];
4293 		if (sched_edge_pset_peek_steal_possible(target_pset, processor->processor_set, try_all_mask)) {
4294 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
4295 			    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_STEAL_POSSIBLE);
4296 			return true;
4297 		}
4298 	}
4299 
4300 	/*
4301 	 * Note, the current yield policy in thread_select() does NOT attempt
4302 	 * to steal or rebalance before falling back to continue running the
4303 	 * yielding thread.
4304 	 */
4305 	KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHOULD_YIELD) | DBG_FUNC_NONE,
4306 	    thread_tid(thread), processor->processor_set->pset_cluster_id, 0, SCHED_EDGE_YIELD_DISALLOW);
4307 	return false;
4308 }
4309 
4310 #if !SCHED_TEST_HARNESS
4311 
4312 static void
4313 sched_edge_processor_queue_shutdown(processor_t processor, struct pulled_thread_queue * threadq)
4314 {
4315 	processor_set_t pset = processor->processor_set;
4316 	sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor);
4317 
4318 	/* We only need to migrate threads if this is the last active or last recommended processor in the pset */
4319 	if (pset->online_processor_count == 0 || !pset_is_recommended(pset)) {
4320 		atomic_bitmap_clear(sched_edge_available_pset_bitmask, pset->pset_id, memory_order_relaxed);
4321 
4322 		while (sched_clutch_root_count(pset_clutch_root) > 0) {
4323 			thread_t thread = sched_clutch_hierarchy_thread_highest(pset_clutch_root,
4324 			    processor, NULL, SCHED_CLUTCH_TRAVERSE_REMOVE_HIERARCHY_ONLY);
4325 			pulled_thread_queue_enqueue(threadq, thread);
4326 		}
4327 	}
4328 
4329 	pset_unlock(pset);
4330 }
4331 
4332 #endif /* !SCHED_TEST_HARNESS */
4333 
4334 /*
4335  * The Edge scheduler uses average scheduling latency as the metric for making
4336  * thread migration decisions. One component of avg scheduling latency is the load
4337  * average on the cluster.
4338  *
4339  * Load Average Fixed Point Arithmetic
4340  *
4341  * The load average is maintained as a 24.8 fixed point arithmetic value for precision.
4342  * When multiplied by the average execution time, it needs to be rounded up (based on
4343  * the most significant bit of the fractional part) for better accuracy. After rounding
4344  * up, the whole number part of the value is used as the actual load value for
4345  * migrate/steal decisions.
4346  */
4347 #define SCHED_PSET_LOAD_EWMA_FRACTION_BITS 8
4348 #define SCHED_PSET_LOAD_EWMA_ROUND_BIT     (1 << (SCHED_PSET_LOAD_EWMA_FRACTION_BITS - 1))
4349 #define SCHED_PSET_LOAD_EWMA_FRACTION_MASK ((1 << SCHED_PSET_LOAD_EWMA_FRACTION_BITS) - 1)
4350 #define SCHED_PSET_LOAD_EWMA_TC_NSECS 10000000u
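/*
 * For illustration (hypothetical values, not part of the original source):
 * a load average of 2.75 is stored in 24.8 fixed point as 0x2C0. Rounding it
 * to a whole number adds the round bit and shifts:
 *
 *     (0x2C0 + SCHED_PSET_LOAD_EWMA_ROUND_BIT) >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS
 *   = (0x2C0 + 0x080) >> 8
 *   = 3
 *
 * With an average execution time of 500us, the resulting load metric below
 * would be 3 * 500 = 1500.
 */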
4351 
4352 inline static int
4353 sched_edge_get_pset_load_average(processor_set_t pset, sched_bucket_t sched_bucket)
4354 {
4355 	uint64_t load_average = os_atomic_load(&pset->pset_load_average[sched_bucket], relaxed);
4356 	uint64_t avg_execution_time = os_atomic_load(&pset->pset_execution_time[sched_bucket].pset_avg_thread_execution_time, relaxed);
4357 	/*
4358 	 * Since a load average of 0 indicates an idle cluster, don't allow an average
4359 	 * execution time less than 1us to cause a cluster to appear idle.
4360 	 */
4361 	avg_execution_time = MAX(avg_execution_time, 1ULL);
4362 	return (int)(((load_average + SCHED_PSET_LOAD_EWMA_ROUND_BIT) >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS) * avg_execution_time);
4363 }
4364 
4365 /*
4366  * sched_edge_pset_running_higher_bucket()
4367  *
4368  * Routine to calculate cumulative running counts for each scheduling
4369  * bucket. This effectively lets the load calculation determine whether a
4370  * cluster is running any threads at a QoS lower than the thread being
4371  * migrated.
4372  */
4373 static void
4374 sched_edge_pset_running_higher_bucket(processor_set_t pset, uint32_t *running_higher)
4375 {
4376 	bitmap_t *active_map = &pset->cpu_state_map[PROCESSOR_RUNNING];
4377 	bzero(running_higher, sizeof(uint32_t) * TH_BUCKET_SCHED_MAX);
4378 
4379 	/* Count the running threads per bucket */
4380 	for (int cpu = bitmap_first(active_map, MAX_CPUS); cpu >= 0; cpu = bitmap_next(active_map, cpu)) {
4381 		sched_bucket_t cpu_bucket = os_atomic_load(&pset->cpu_running_buckets[cpu], relaxed);
4382 		/* Don't count idle threads */
4383 		if (cpu_bucket < TH_BUCKET_SCHED_MAX) {
4384 			running_higher[cpu_bucket]++;
4385 		}
4386 	}
4387 
4388 	/* Calculate the cumulative running counts as a prefix sum */
4389 	for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX - 1; bucket++) {
4390 		running_higher[bucket + 1] += running_higher[bucket];
4391 	}
4392 }
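/*
 * For illustration (hypothetical counts, not part of the original source):
 * if a pset has 1 CPU running a TH_BUCKET_FIXPRI thread and 2 CPUs running
 * TH_BUCKET_SHARE_FG threads, the per-bucket counts {1, 2, 0, ...} become
 * {1, 3, 3, ...} after the prefix sum above, i.e. running_higher[bucket] is
 * the number of CPUs running threads at that bucket or any higher one.
 */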
4393 
4394 /*
4395  * sched_edge_update_pset_load_average()
4396  *
4397  * Updates the load average for each sched bucket for a cluster.
4398  * This routine must be called with the pset lock held.
4399  */
4400 static void
4401 sched_edge_update_pset_load_average(processor_set_t pset, uint64_t curtime)
4402 {
4403 	int avail_cpu_count = pset_available_cpu_count(pset);
4404 	if (avail_cpu_count == 0) {
4405 		/* Looks like the pset is not runnable any more; nothing to do here */
4406 		return;
4407 	}
4408 
4409 	/*
4410 	 * Edge Scheduler Optimization
4411 	 *
4412 	 * See if more callers of this routine can pass in timestamps to avoid the
4413 	 * mach_absolute_time() call here.
4414 	 */
4415 
4416 	if (!curtime) {
4417 		curtime = mach_absolute_time();
4418 	}
4419 	uint64_t last_update = os_atomic_load(&pset->pset_load_last_update, relaxed);
4420 	int64_t delta_ticks = curtime - last_update;
4421 	if (delta_ticks < 0) {
4422 		return;
4423 	}
4424 
4425 	uint64_t delta_nsecs = 0;
4426 	absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);
4427 
4428 	if (__improbable(delta_nsecs > UINT32_MAX)) {
4429 		delta_nsecs = UINT32_MAX;
4430 	}
4431 
4432 	/* Update the shared resource load on the pset */
4433 	for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
4434 		uint64_t shared_rsrc_runnable_load = sched_edge_shared_rsrc_runnable_load(&pset->pset_clutch_root, shared_rsrc_type);
4435 		uint64_t shared_rsrc_running_load = bit_count(pset->cpu_running_cluster_shared_rsrc_thread[shared_rsrc_type]);
4436 		uint64_t new_shared_load = shared_rsrc_runnable_load + shared_rsrc_running_load;
4437 		uint64_t old_shared_load = os_atomic_xchg(&pset->pset_cluster_shared_rsrc_load[shared_rsrc_type], new_shared_load, relaxed);
4438 		if (old_shared_load != new_shared_load) {
4439 			KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_SHARED_LOAD) | DBG_FUNC_NONE, pset->pset_cluster_id, shared_rsrc_type, new_shared_load, shared_rsrc_running_load);
4440 		}
4441 	}
4442 
4443 	uint32_t running_higher[TH_BUCKET_SCHED_MAX];
4444 	sched_edge_pset_running_higher_bucket(pset, running_higher);
4445 
4446 	for (sched_bucket_t sched_bucket = TH_BUCKET_FIXPRI; sched_bucket < TH_BUCKET_SCHED_MAX; sched_bucket++) {
4447 		uint64_t old_load_average = os_atomic_load(&pset->pset_load_average[sched_bucket], relaxed);
4448 		uint64_t old_load_average_factor = old_load_average * SCHED_PSET_LOAD_EWMA_TC_NSECS;
4449 		uint32_t current_runq_depth = sched_edge_cluster_cumulative_count(&pset->pset_clutch_root, sched_bucket) +  rt_runq_count(pset) + running_higher[sched_bucket];
4450 		os_atomic_store(&pset->pset_runnable_depth[sched_bucket], current_runq_depth, relaxed);
4451 
4452 		uint32_t current_load = current_runq_depth / avail_cpu_count;
4453 		/*
4454 		 * For the new load average multiply current_load by delta_nsecs (which results in a 32.0 value).
4455 		 * Since we want to maintain the load average as a 24.8 fixed arithmetic value for precision, the
4456 		 * new load average needs to be shifted before it can be added to the old load average.
4457 		 */
4458 		uint64_t new_load_average_factor = (current_load * delta_nsecs) << SCHED_PSET_LOAD_EWMA_FRACTION_BITS;
4459 
4460 		/*
4461 		 * For extremely parallel workloads, it is important that the load average on a cluster moves zero to non-zero
4462 		 * instantly to allow threads to be migrated to other (potentially idle) clusters quickly. Hence use the EWMA
4463 		 * when the system is already loaded; otherwise for an idle system use the latest load average immediately.
4464 		 */
4465 		int old_load_shifted = (int)((old_load_average + SCHED_PSET_LOAD_EWMA_ROUND_BIT) >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
4466 		boolean_t load_uptick = (old_load_shifted == 0) && (current_load != 0);
4467 		boolean_t load_downtick = (old_load_shifted != 0) && (current_load == 0);
4468 		uint64_t load_average;
4469 		if (load_uptick || load_downtick) {
4470 			load_average = (current_load << SCHED_PSET_LOAD_EWMA_FRACTION_BITS);
4471 		} else {
4472 			/* Indicates a loaded system; use EWMA for load average calculation */
4473 			load_average = (old_load_average_factor + new_load_average_factor) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
4474 		}
4475 		os_atomic_store(&pset->pset_load_average[sched_bucket], load_average, relaxed);
4476 		if (load_average != old_load_average) {
4477 			KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_LOAD_AVG) | DBG_FUNC_NONE, pset->pset_cluster_id, (load_average >> SCHED_PSET_LOAD_EWMA_FRACTION_BITS), load_average & SCHED_PSET_LOAD_EWMA_FRACTION_MASK, sched_bucket);
4478 			os_atomic_store(&pset->pset_load_last_update, curtime, relaxed);
4479 		}
4480 	}
4481 	os_atomic_store(&pset->pset_load_last_update, curtime, relaxed);
4482 }
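/*
 * For illustration (hypothetical values, not part of the original source):
 * ignoring the idle-transition special case, the EWMA computed above is
 *
 *     load_avg' = (load_avg * TC + (current_load << 8) * delta_nsecs) / (delta_nsecs + TC)
 *
 * with TC = SCHED_PSET_LOAD_EWMA_TC_NSECS (10ms) and load_avg kept in 24.8
 * fixed point. With an old load_avg of 2.0 (0x200), current_load of 4 and
 * delta_nsecs of 10ms, the new value is
 * (0x200 * 10ms + (4 << 8) * 10ms) / 20ms = 0x300, i.e. 3.0.
 */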
4483 
4484 static void
4485 sched_edge_update_pset_avg_execution_time(processor_set_t pset, uint64_t execution_time, uint64_t curtime, sched_bucket_t sched_bucket)
4486 {
4487 	pset_execution_time_t old_execution_time_packed, new_execution_time_packed;
4488 	uint64_t avg_thread_execution_time = 0;
4489 
4490 	os_atomic_rmw_loop(&pset->pset_execution_time[sched_bucket].pset_execution_time_packed,
4491 	    old_execution_time_packed.pset_execution_time_packed,
4492 	    new_execution_time_packed.pset_execution_time_packed, relaxed, {
4493 		uint64_t last_update = old_execution_time_packed.pset_execution_time_last_update;
4494 		int64_t delta_ticks = curtime - last_update;
4495 		if (delta_ticks <= 0) {
4496 		        /*
4497 		         * Its possible that another CPU came in and updated the pset_execution_time
4498 		         * It's possible that another CPU came in and updated the pset_execution_time
4499 		         * be an approximate measure per cluster, ignore the older update.
4500 		         */
4501 		        os_atomic_rmw_loop_give_up(return );
4502 		}
4503 		uint64_t delta_nsecs = 0;
4504 		absolutetime_to_nanoseconds(delta_ticks, &delta_nsecs);
4505 
4506 		uint64_t nanotime = 0;
4507 		absolutetime_to_nanoseconds(execution_time, &nanotime);
4508 		uint64_t execution_time_us = nanotime / NSEC_PER_USEC;
4509 
4510 		/*
4511 		 * Since the average execution time is stored in microseconds, avoid rounding errors in
4512 		 * the EWMA calculation by only using a non-zero previous value.
4513 		 */
4514 		uint64_t old_avg_thread_execution_time = MAX(old_execution_time_packed.pset_avg_thread_execution_time, 1ULL);
4515 
4516 		uint64_t old_execution_time = (old_avg_thread_execution_time * SCHED_PSET_LOAD_EWMA_TC_NSECS);
4517 		uint64_t new_execution_time = (execution_time_us * delta_nsecs);
4518 
4519 		avg_thread_execution_time = (old_execution_time + new_execution_time) / (delta_nsecs + SCHED_PSET_LOAD_EWMA_TC_NSECS);
4520 		new_execution_time_packed.pset_avg_thread_execution_time = avg_thread_execution_time;
4521 		new_execution_time_packed.pset_execution_time_last_update = curtime;
4522 	});
4523 	if (new_execution_time_packed.pset_avg_thread_execution_time != old_execution_time_packed.pset_avg_thread_execution_time) {
4524 		KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_AVG_EXEC_TIME) | DBG_FUNC_NONE, pset->pset_cluster_id, avg_thread_execution_time, sched_bucket);
4525 	}
4526 }
4527 
4528 /*
4529  * sched_edge_cluster_load_metric()
4530  *
4531  * The load metric for a cluster is a measure of the average scheduling latency
4532  * experienced by threads on that cluster. It is a product of the average number
4533  * of threads in the runqueue and the average execution time for threads. The metric
4534  * has special values in the following cases:
4535  * - UINT32_MAX: If the cluster is not available for scheduling, its load is set to
4536  *   the maximum value to disallow any threads to migrate to this cluster.
4537  * - 0: If there are idle CPUs in the cluster or an empty runqueue; this allows threads
4538  *   to be spread across the platform quickly for ncpu wide workloads.
4539  */
4540 static uint32_t
4541 sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket)
4542 {
4543 	if (pset_is_recommended(pset) == false) {
4544 		return UINT32_MAX;
4545 	}
4546 	return (uint32_t)sched_edge_get_pset_load_average(pset, sched_bucket);
4547 }
4548 
4549 /*
4550  *
4551  * Edge Scheduler Steal/Rebalance logic
4552  *
4553  * = Generic scheduler logic =
4554  *
4555  * The SCHED(steal_thread) scheduler callout is invoked when the processor does not
4556  * find any thread for execution in its runqueue. The aim of the steal operation
4557  * is to find other threads running/runnable in other clusters which should be
4558  * executed here.
4559  *
4560  * If the steal callout does not return a thread, the thread_select() logic calls
4561  * SCHED(processor_balance) callout which is supposed to IPI other CPUs to rebalance
4562  * threads and idle out the current CPU.
4563  *
4564  * = SCHED(steal_thread) for Edge Scheduler =
4565  *
4566  * The edge scheduler hooks into sched_edge_processor_idle() for steal_thread. This
4567  * routine tries to do the following operations in order:
4568  * (1) Find foreign runnable threads in non-native cluster
4569  *     runqueues (sched_edge_foreign_runnable_thread_remove())
4570  * (2) Check if foreign threads are running on the non-native
4571  *     clusters (sched_edge_foreign_running_thread_available())
4572  *         - If yes, return THREAD_NULL for the steal callout and
4573  *         perform rebalancing as part of SCHED(processor_balance) i.e. sched_edge_balance()
4574  * (3) Steal a thread from another cluster based on edge
4575  *     weights (sched_edge_steal_thread())
4576  *
4577  * = SCHED(processor_balance) for Edge Scheduler =
4578  *
4579  * If steal_thread did not return a thread for the processor, use
4580  * sched_edge_balance() to rebalance foreign running threads and idle out this CPU.
4581  *
4582  * = Clutch Bucket Preferred Cluster Overrides =
4583  *
4584  * Since these operations (just like thread migrations on enqueue)
4585  * move threads across clusters, they need support for handling clutch
4586  * bucket group level preferred pset recommendations.
4587  * For (1), a clutch bucket will be enqueued in the corresponding steal
4588  * silo and queue based on its preferred pset and scheduling bucket
4589  * respectively.
4590  * For (2), the running thread will set the bit on the processor based
4591  * on its preferred cluster type.
4592  * For (3), the edge configuration would prevent threads from being stolen
4593  * in the wrong direction.
4594  *
4595  * = SCHED(thread_should_yield) =
4596  * The thread_should_yield() logic should remain close to matching what
4597  * thread_select() would do for a yielding thread. Note, cases where
4598  * thread_should_yield() answers "yes" but thread_select() does not
4599  * context-switch out the yielding thread still result in a transient
4600  * priority drop for the yielding thread (not to mention timing effects
4601  * from choosing to consult thread_select()), which could racily affect
4602  * migration decisions happening from other cores.
4603  */
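/*
 * A rough sketch (not part of the original source) of the idle-processor
 * flow described above, as implemented by sched_edge_processor_idle()
 * (not shown in this excerpt):
 *
 *     (1) thread = sched_edge_foreign_runnable_thread_remove(pset, ctime);
 *     (2) if no thread was found and
 *         sched_edge_foreign_running_thread_available(pset) is true, return
 *         THREAD_NULL so that sched_edge_balance() can IPI the remote CPU;
 *     (3) otherwise attempt a steal based on the edge weights.
 */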
4604 
4605 static bool
4606 sched_edge_steal_thread_enabled(__unused processor_set_t pset)
4607 {
4608 	return true;
4609 }
4610 
4611 /*
4612  * sched_edge_pset_peek_steal_possible()
4613  *
4614  * Routine to fast-path evaluate whether the steal_from_pset may
4615  * contain threads eligible to be stolen to the idle_pset.
4616  * Can be called WITHOUT either pset locked.
4617  */
4618 static inline bool
4619 sched_edge_pset_peek_steal_possible(
4620 	processor_set_t steal_from_pset,
4621 	processor_set_t idle_pset,
4622 	bitmap_t silos_filter)
4623 {
4624 	bitmap_t populated_silos =
4625 	    os_atomic_load(steal_from_pset->pset_clutch_root.scr_populated_steal_silos, relaxed);
4626 	bitmap_t permissive_migration_allowed_map =
4627 	    os_atomic_load(idle_pset->pset_clutch_root.scr_incoming_migration_allowed[PERMISSIVE_MIGRATION_BUCKET], relaxed);
4628 	bitmap_t eligible_silos = silos_filter & populated_silos & permissive_migration_allowed_map;
4629 	if (eligible_silos == 0) {
4630 		/* No eligible silos that contain threads */
4631 		return false;
4632 	}
4633 	for (int silo_id = lsb_first(eligible_silos); silo_id >= 0; silo_id = lsb_next(eligible_silos, silo_id)) {
4634 		sched_edge_steal_silo_t steal_silo =
4635 		    sched_edge_steal_silo_from_pset_id((pset_id_t)silo_id, &steal_from_pset->pset_clutch_root);
4636 		bitmap_t populated_queues = os_atomic_load(steal_silo->sess_populated_steal_queues, relaxed);
4637 		int highest_populated_bucket = lsb_first(populated_queues);
4638 		if (highest_populated_bucket != -1) {
4639 			sched_clutch_edge silo_edge =
4640 			    sched_edge_config_get(silo_id, idle_pset->pset_id, highest_populated_bucket);
4641 			if (silo_edge.sce_steal_allowed || (silo_id == idle_pset->pset_id)) {
4642 				/* Found eligible candidate */
4643 				return true;
4644 			}
4645 		}
4646 	}
4647 	/* Silos only contain threads of QoSes not allowed to be stolen across the edge */
4648 	return false;
4649 }
4650 
4651 
4652 /*
4653  * Configurable behaviors when looking for threads to steal
4654  * out of a particular pset.
4655  */
4656 __options_decl(sched_edge_steal_options_t, uint8_t, {
4657 	SCHED_EDGE_STEAL_OPTIONS_NONE                 = 0x0,
4658 	/* Only steal when there are more threads at the QoS than CPUs in the pset */
4659 	SCHED_EDGE_STEAL_OPTIONS_ONLY_EXCESS_LOAD     = 0x1,
4660 });
4661 
4662 /*
4663  * sched_edge_pset_steal_thread()
4664  *
4665  * Routine to return the highest QoS thread enqueued in
4666  * steal_from_pset which is eligible to be stolen to
4667  * idle_pset, based on the policy configured in steal_options
4668  * combined with the Edge matrix.
4669  * Always called with the steal_from_pset locked.
4670  */
4671 static thread_t
4672 sched_edge_pset_steal_thread(
4673 	processor_set_t steal_from_pset,
4674 	processor_set_t idle_pset,
4675 	bitmap_t silos_filter,
4676 	sched_edge_steal_options_t steal_options)
4677 {
4678 	bitmap_t populated_silos =
4679 	    os_atomic_load(steal_from_pset->pset_clutch_root.scr_populated_steal_silos, relaxed);
4680 	bitmap_t silos_to_search = populated_silos & silos_filter;
4681 	thread_t highest_pri_thread = THREAD_NULL;
4682 	sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
4683 	while (sched_iterate_psets_ordered(idle_pset, &idle_pset->spill_search_order[TH_BUCKET_FIXPRI],
4684 	    silos_to_search, &istate)) {
4685 		int silo_id = istate.spis_pset_id;
4686 		sched_edge_steal_silo_t steal_silo =
4687 		    sched_edge_steal_silo_from_pset_id(silo_id, &steal_from_pset->pset_clutch_root);
4688 		bitmap_t populated_queues = os_atomic_load(steal_silo->sess_populated_steal_queues, relaxed);
4689 		for (int bucket = lsb_first(populated_queues); bucket >= 0; bucket = lsb_next(populated_queues, bucket)) {
4690 			sched_clutch_edge silo_edge = sched_edge_config_get(silo_id, idle_pset->pset_id, bucket);
4691 			if ((silo_edge.sce_steal_allowed == false) && (silo_id != idle_pset->pset_id)) {
4692 				/*
4693 				 * Stealing not allowed to the idle_pset for threads of this QoS and
4694 				 * recommended to this silo.
4695 				 * Assume that a higher QoS disallowing steal implies the same for
4696 				 * all lower QoSes.
4697 				 */
4698 				break;
4699 			}
4700 			if (steal_options & SCHED_EDGE_STEAL_OPTIONS_ONLY_EXCESS_LOAD) {
4701 				if (silo_edge.sce_migration_weight != 0) {
4702 					uint32_t candidate_runq_depth = os_atomic_load(&steal_from_pset->pset_runnable_depth[bucket], relaxed);
4703 					if (candidate_runq_depth <= pset_available_cpu_count(steal_from_pset)) {
4704 						/* No excess threads at or above this bucket */
4705 						continue;
4706 					}
4707 				}
4708 			}
4709 			/* Thread candidate found */
4710 			struct priority_queue_sched_max *steal_queue = &steal_silo->sess_steal_queues[bucket];
4711 			sched_clutch_bucket_t clutch_bucket = priority_queue_max(steal_queue, struct sched_clutch_bucket, scb_stealqlink);
4712 			thread_t thread = priority_queue_max(&clutch_bucket->scb_thread_runq, struct thread, th_clutch_runq_link);
4713 			/* Bias ties in favor of psets earlier in the search order */
4714 			if ((highest_pri_thread == THREAD_NULL) || (thread->sched_pri > highest_pri_thread->sched_pri)) {
4715 				highest_pri_thread = thread;
4716 			}
4717 			/* Since this thread is from the highest eligible QoS we found in this silo, move on to search other silos */
4718 			break;
4719 		}
4720 	}
4721 	return highest_pri_thread;
4722 }
4723 
4724 static thread_t
4725 sched_edge_foreign_runnable_thread_remove(processor_set_t idle_pset, uint64_t ctime)
4726 {
4727 	thread_t thread = THREAD_NULL;
4728 
4729 	/*
4730 	 * Search all the psets that are foreign for the idle_pset,
4731 	 * iterating in reverse spill order to prioritize rescuing
4732 	 * threads from their least desired, most "distant" spill
4733 	 * location.
4734 	 */
4735 	sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
4736 	istate.spis_options = SCHED_PSET_ITERATE_STATE_OPTIONS_REVERSE;
4737 	while (sched_iterate_psets_ordered(idle_pset, &idle_pset->spill_search_order[PERMISSIVE_MIGRATION_BUCKET],
4738 	    idle_pset->foreign_psets[0], &istate)) {
4739 		processor_set_t target_pset = pset_array[istate.spis_pset_id];
4740 		/*
4741 		 * For each pset, see if there are any runnable foreign threads.
4742 		 * This check is currently being done without the pset lock to make it cheap for
4743 		 * the common case.
4744 		 */
4745 		pset_node_t dst_node = pset_node_for_pset_cluster_type(idle_pset->pset_cluster_type);
4746 		if (!sched_edge_pset_peek_steal_possible(target_pset, idle_pset, dst_node->pset_map)) {
4747 			continue;
4748 		}
4749 		/*
4750 		 * Looks like there are runnable foreign threads in the hierarchy; lock the pset
4751 		 * and get the highest priority thread.
4752 		 */
4753 		pset_lock(target_pset);
4754 		thread = sched_edge_pset_steal_thread(target_pset, idle_pset, dst_node->pset_map,
4755 		    SCHED_EDGE_STEAL_OPTIONS_NONE);
4756 		if (thread != THREAD_NULL) {
4757 			sched_clutch_thread_remove(&target_pset->pset_clutch_root, thread, ctime, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
4758 			SCHED(update_pset_load_average)(target_pset, ctime);
4759 		}
4760 		pset_unlock(target_pset);
4761 
4762 		/*
4763 		 * Edge Scheduler Optimization
4764 		 *
4765 		 * The current implementation immediately returns as soon as it finds a foreign
4766 		 * runnable thread. This could be enhanced to look at highest priority threads
4767 		 * from all foreign clusters and pick the highest amongst them. That would need
4768 		 * some form of global state across psets to make that kind of a check cheap.
4769 		 */
4770 		if (thread != THREAD_NULL) {
4771 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNABLE) | DBG_FUNC_NONE, thread_tid(thread), idle_pset->pset_id, target_pset->pset_id, 0);
4772 			break;
4773 		}
4774 		/* Looks like the thread escaped after the check but before the pset lock was taken; continue the search */
4775 	}
4776 
4777 	return thread;
4778 }
4779 
4780 /*
4781  * sched_edge_cpu_running_foreign_shared_rsrc_available()
4782  *
4783  * Routine to determine if the thread running on a CPU is a shared resource thread
4784  * and can be rebalanced to the cluster with an idle CPU. It is used to determine if
4785  * a CPU going idle on a pset should rebalance a running shared resource heavy thread
4786  * from another non-ideal cluster, based on the idle pset's shared resource load.
4787  */
4788 static boolean_t
4789 sched_edge_cpu_running_foreign_shared_rsrc_available(processor_set_t target_pset, int foreign_cpu, processor_set_t idle_pset)
4790 {
4791 	boolean_t idle_pset_shared_rsrc_rr_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_RR);
4792 	if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_RR], foreign_cpu) && !idle_pset_shared_rsrc_rr_idle) {
4793 		return false;
4794 	}
4795 
4796 	boolean_t idle_pset_shared_rsrc_biu_idle = sched_edge_shared_rsrc_idle(idle_pset, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST);
4797 	if (bit_test(target_pset->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST], foreign_cpu) && !idle_pset_shared_rsrc_biu_idle) {
4798 		return false;
4799 	}
4800 	return true;
4801 }
4802 
4803 static boolean_t
4804 sched_edge_foreign_running_thread_available(processor_set_t pset)
4805 {
4806 	bitmap_t *foreign_pset_bitmap = pset->foreign_psets;
4807 	sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
4808 	while (sched_iterate_psets_ordered(pset, &pset->spill_search_order[PERMISSIVE_MIGRATION_BUCKET], foreign_pset_bitmap[0], &istate)) {
4809 		/* Skip the pset if it's not schedulable */
4810 		processor_set_t target_pset = pset_array[istate.spis_pset_id];
4811 		if (pset_is_recommended(target_pset) == false) {
4812 			continue;
4813 		}
4814 
4815 		uint64_t running_foreign_bitmap = target_pset->cpu_state_map[PROCESSOR_RUNNING] & target_pset->cpu_running_foreign;
4816 		for (int cpu_foreign = bit_first(running_foreign_bitmap); cpu_foreign >= 0; cpu_foreign = bit_next(running_foreign_bitmap, cpu_foreign)) {
4817 			if (sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpu_foreign, pset)) {
4818 				return true;
4819 			}
4820 		}
4821 	}
4822 	return false;
4823 }
4824 
4825 static thread_t
4826 sched_edge_steal_thread(processor_set_t idle_pset, uint64_t candidate_pset_bitmap)
4827 {
4828 	thread_t stolen_thread = THREAD_NULL;
4829 
4830 	/*
4831 	 * Edge Scheduler Optimization
4832 	 *
4833 	 * The logic today bails as soon as it finds a cluster where the cluster load is
4834 	 * greater than the edge weight. Maybe it should have a more advanced version
4835 	 * which looks for the maximum delta etc.
4836 	 */
4837 	sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
4838 	while (sched_iterate_psets_ordered(idle_pset, &idle_pset->spill_search_order[PERMISSIVE_MIGRATION_BUCKET], candidate_pset_bitmap, &istate)) {
4839 		processor_set_t steal_from_pset = pset_array[istate.spis_pset_id];
4840 		bitmap_t migration_allowed_map =
4841 		    os_atomic_load(idle_pset->pset_clutch_root.scr_incoming_migration_allowed[PERMISSIVE_MIGRATION_BUCKET], relaxed);
4842 		if (!sched_edge_pset_peek_steal_possible(steal_from_pset, idle_pset, migration_allowed_map)) {
4843 			continue;
4844 		}
4845 		pset_lock(steal_from_pset);
4846 
4847 		sched_edge_steal_options_t steal_options = SCHED_EDGE_STEAL_OPTIONS_ONLY_EXCESS_LOAD;
4848 		stolen_thread = sched_edge_pset_steal_thread(steal_from_pset, idle_pset, migration_allowed_map, steal_options);
4849 
4850 		if (stolen_thread != THREAD_NULL) {
4851 			uint64_t current_timestamp = mach_absolute_time();
4852 			sched_clutch_thread_remove(&steal_from_pset->pset_clutch_root, stolen_thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
4853 			SCHED(update_pset_load_average)(steal_from_pset, current_timestamp);
4854 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STEAL) | DBG_FUNC_NONE, thread_tid(stolen_thread), idle_pset->pset_id, steal_from_pset->pset_id, 0);
4855 		}
4856 
4857 		pset_unlock(steal_from_pset);
4858 		if (stolen_thread != THREAD_NULL) {
4859 			break;
4860 		}
4861 	}
4862 	return stolen_thread;
4863 }
4864 
4865 /*
4866  * sched_edge_processor_idle()
4867  *
4868  * The routine is the implementation for steal_thread() for the Edge scheduler.
4869  */
4870 static thread_t
4871 sched_edge_processor_idle(processor_set_t pset)
4872 {
4873 	thread_t thread = THREAD_NULL;
4874 
4875 	uint64_t ctime = mach_absolute_time();
4876 
4877 	processor_t processor = current_processor();
4878 	bit_clear(pset->pending_spill_cpu_mask, processor->cpu_id);
4879 
4880 	/* Each of the operations acquire the lock for the pset they target */
4881 	pset_unlock(pset);
4882 
4883 	/* Find highest priority runnable thread on all non-native clusters */
4884 	thread = sched_edge_foreign_runnable_thread_remove(pset, ctime);
4885 	if (thread != THREAD_NULL) {
4886 		return thread;
4887 	}
4888 
4889 	/* Find highest priority runnable thread on all native clusters */
4890 	thread = sched_edge_steal_thread(pset, pset->native_psets[0]);
4891 	if (thread != THREAD_NULL) {
4892 		return thread;
4893 	}
4894 
4895 	/* Find foreign running threads to rebalance; the actual rebalance is done in sched_edge_balance() */
4896 	boolean_t rebalance_needed = sched_edge_foreign_running_thread_available(pset);
4897 	if (rebalance_needed) {
4898 		return THREAD_NULL;
4899 	}
4900 
4901 	/* No foreign-enqueued threads found; find a thread to steal from all clusters based on weights/loads etc. */
4902 	thread = sched_edge_steal_thread(pset, pset->native_psets[0] | pset->foreign_psets[0]);
4903 	return thread;
4904 }
4905 
4906 /* Return true if this shared resource thread has a better cluster to run on */
4907 static bool
4908 sched_edge_shared_rsrc_migrate_possible(thread_t thread, processor_set_t preferred_pset, processor_set_t current_pset)
4909 {
4910 	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
4911 	uint64_t current_pset_load = sched_edge_pset_cluster_shared_rsrc_load(current_pset, shared_rsrc_type);
4912 	/*
4913 	 * Adjust the current pset load to discount the current thread only if the current pset is a preferred pset type. This allows the
4914 	 * scheduler to rebalance threads from non-preferred cluster to an idle cluster of the preferred type.
4915 	 *
4916 	 * Edge Scheduler Optimization
4917 	 * For multi-cluster machines, it might be useful to enhance this mechanism to migrate between clusters of the preferred type.
4918 	 */
4919 	uint64_t current_pset_adjusted_load = (current_pset->pset_type != preferred_pset->pset_type) ? current_pset_load : (current_pset_load - 1);
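	/*
	 * Illustrative example of the adjustment above (loads hypothetical): if the
	 * current pset is of the preferred type with a shared resource load of 3, its
	 * adjusted load is 2 (discounting this thread); an eligible peer cluster with a
	 * load of 1 then makes migration possible below, while a load of 2 or more does not.
	 */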
4920 
4921 	uint64_t eligible_pset_bitmask = 0;
4922 	if (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST) {
4923 		/*
4924 		 * For the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy, the load balancing occurs
4925 		 * only among clusters native with the preferred cluster.
4926 		 */
4927 		eligible_pset_bitmask = preferred_pset->native_psets[0];
4928 		bit_set(eligible_pset_bitmask, preferred_pset->pset_cluster_id);
4929 	} else {
4930 		/* For EDGE_SHARED_RSRC_SCHED_POLICY_RR, the load balancing happens among all clusters */
4931 		eligible_pset_bitmask = os_atomic_load(&sched_edge_available_pset_bitmask[0], relaxed);
4932 	}
4933 
4934 	/* For each eligible cluster check if there is an under-utilized cluster; return true if there is */
4935 	for (int cluster_id = bit_first(eligible_pset_bitmask); cluster_id >= 0; cluster_id = bit_next(eligible_pset_bitmask, cluster_id)) {
4936 		if (cluster_id == current_pset->pset_cluster_id) {
4937 			continue;
4938 		}
4939 		uint64_t cluster_load = sched_edge_pset_cluster_shared_rsrc_load(pset_array[cluster_id], shared_rsrc_type);
4940 		if (current_pset_adjusted_load > cluster_load) {
4941 			KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_SHARED_RSRC_MIGRATE) | DBG_FUNC_NONE, current_pset_load, current_pset->pset_cluster_id, cluster_load, cluster_id);
4942 			return true;
4943 		}
4944 	}
4945 	return false;
4946 }
4947 
4948 /*
4949  * Stir-the-pot Registry:
4950  *
4951  * Global state tracking which cores currently have threads that
4952  * are ready to be stirred onto cores of the opposite type.
4953  *
4954  * The registry state updates are implemented with atomic transaction
4955  * operations rather than a global lock, in order to avoid the cost
4956  * of serializing some of the most frequent registry state update
4957  * callsites that depend on consistent speed--namely the
4958  * preemption check and context-switch paths. The most expensive
4959  * state update, in sched_edge_stir_the_pot_try_trigger_swap(), only
4960  * happens at quantum expiration, which should allow cheaper
4961  * operations at other callsites to win the race.
4962  */
4963 typedef unsigned __int128 sched_edge_stp_registry_t;
4964 _Atomic sched_edge_stp_registry_t sched_edge_stir_the_pot_global_registry = 0LL;
4965 #define SESTP_BITS_PER_CORE (2)
4966 #define SESTP_BIT_POS(cpu_id) ((sched_edge_stp_registry_t)(cpu_id * SESTP_BITS_PER_CORE))
4967 #define SESTP_MASK(cpu_id) ((sched_edge_stp_registry_t)mask(SESTP_BITS_PER_CORE) << SESTP_BIT_POS(cpu_id))
4968 static_assert((SESTP_BITS_PER_CORE * MAX_CPUS) <= (sizeof(sched_edge_stp_registry_t) * 8),
4969     "Global registry must fit per-core bits for each core");
4970 
4971 #define SESTP_EXTRACT_STATE(registry, cpu_id) ((registry >> SESTP_BIT_POS(cpu_id)) & mask(SESTP_BITS_PER_CORE))
4972 #define SESTP_SET_STATE(registry, cpu_id, state) ((registry & ~SESTP_MASK(cpu_id)) | ((sched_edge_stp_registry_t)state << SESTP_BIT_POS(cpu_id)))
4973 __enum_decl(sched_edge_stp_state_t, uint8_t, {
4974 	SCHED_EDGE_STP_NOT_WANT   = 0,
4975 	SCHED_EDGE_STP_REQUESTED  = 1,
4976 	SCHED_EDGE_STP_PENDING    = 2,
4977 	SCHED_EDGE_STP_MAX        = SCHED_EDGE_STP_PENDING
4978 });
4979 static_assert(SCHED_EDGE_STP_MAX <= mask(SESTP_BITS_PER_CORE),
4980     "Per-core stir-the-pot request state must fit in per-core bits");
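/*
 * Worked example of the registry packing (CPU ID chosen arbitrarily): with
 * SESTP_BITS_PER_CORE == 2, CPU 3 occupies bits 6-7 of the registry. Starting
 * from an empty registry, SESTP_SET_STATE(0, 3, SCHED_EDGE_STP_REQUESTED)
 * yields 0x40, and SESTP_EXTRACT_STATE(0x40, 3) returns SCHED_EDGE_STP_REQUESTED;
 * the 2-bit fields of all other cores are left untouched by SESTP_SET_STATE().
 */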
4981 
4982 #if OS_ATOMIC_USE_LLSC
4983 #error "Expecting CAS implementation of os_atomic_rmw_loop()"
4984 #endif /* OS_ATOMIC_USE_LLSC */
4985 
4986 static cpumap_t sched_edge_p_core_map = 0ULL;
4987 static cpumap_t sched_edge_non_p_core_map = 0ULL;
4988 
4989 /*
4990  * In order to reduce the chance of picking the same CPUs over
4991  * and over unfairly for stir-the-pot swaps, use an offset value
4992  * for the lsb selection, which rotates by one index each time
4993  * the choice is evaluated.
4994  */
4995 static _Atomic uint64_t sched_edge_stp_selection_p_core_offset = 0;
4996 static _Atomic uint64_t sched_edge_stp_selection_non_p_core_offset = 0;
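/*
 * Illustrative example of the rotating start index (CPU IDs hypothetical): if the
 * candidate map covers CPUs {4, 5, 6, 7} and the offset counter reads 6, then
 * cpu_of_type_offset_ind below is 6 % 4 == 2, search_start_ind resolves to CPU 6,
 * and bit_ror64() rotates the map so the scan visits CPUs in the order 6, 7, 4, 5.
 */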
4997 
4998 /*
4999  * sched_edge_stir_the_pot_try_trigger_swap()
5000  *
5001  * Search for an eligible swap candidate on the opposite core
5002  * type, and if one is found, initiate a swap for stir-the-pot.
5003  * From a P-core, initiating means sending an inbox message and IPI
5004  * to the swapping lower performance core. For initiating swap from
5005  * a lower performance core, only an inbox message needs to be sent
5006  * to itself, naming the P-core for swap.
5007  * If no eligible candidate is found, mark the current processor
5008  * as requesting stir-the-pot swap--that is unless a swap has already
5009  * been initiated for this core, in which case we should sit tight.
5010  * Thread lock must be held.
5011  */
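/*
 * Sketch of the per-core registry transitions driven by this routine and its
 * helpers (a summary, not an exhaustive state machine):
 *   NOT_WANT  -> REQUESTED  an eligible on-core thread wants a swap (registered at
 *                           context switch, or at quantum expiry when no peer
 *                           requester of matching QoS was found).
 *   *         -> PENDING    this core initiated a swap with a requesting peer, or a
 *                           peer selected this (REQUESTED) core as its swap target.
 *   *         -> NOT_WANT   the on-core thread becomes ineligible or the swap
 *                           completes; see sched_edge_stir_the_pot_clear_registry_entry().
 */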
5012 static inline int
5013 sched_edge_stir_the_pot_try_trigger_swap(thread_t thread)
5014 {
5015 	processor_t self_processor = current_processor();
5016 	int self_cpu = self_processor->cpu_id;
5017 	/*
5018 	 * Prepare the core mask of candidate cores (of the opposite type),
5019 	 * and compute an offset where the candidate search should begin,
5020 	 * to avoid unfairly swapping with the same cores repeatedly.
5021 	 */
5022 	cpumap_t swap_candidates_map;
5023 	uint64_t offset;
5024 	if (sched_edge_stir_the_pot_core_type_is_desired(self_processor->processor_set)) {
5025 		swap_candidates_map = sched_edge_non_p_core_map;
5026 		offset = os_atomic_inc_orig(&sched_edge_stp_selection_non_p_core_offset, relaxed);
5027 	} else {
5028 		swap_candidates_map = sched_edge_p_core_map;
5029 		offset = os_atomic_inc_orig(&sched_edge_stp_selection_p_core_offset, relaxed);
5030 	}
5031 	int num_candidates = bit_count(swap_candidates_map);
5032 	if (num_candidates == 0) {
5033 		/* Too early in boot, no cores of opposite type */
5034 		return -1;
5035 	}
5036 	int cpu_of_type_offset_ind = offset % num_candidates;
5037 	int search_start_ind = lsb_first(swap_candidates_map);
5038 	for (int i = 0; i < cpu_of_type_offset_ind; i++) {
5039 		search_start_ind = lsb_next(swap_candidates_map, search_start_ind);
5040 		assert3s(search_start_ind, !=, -1);
5041 	}
5042 	assert3s(search_start_ind, !=, -1);
5043 	swap_candidates_map = bit_ror64(swap_candidates_map, search_start_ind);
5044 	/*
5045 	 * Search the registry for candidate cores of the opposite type which
5046 	 * have requested swap.
5047 	 */
5048 	int swap_cpu;
5049 	sched_edge_stp_registry_t old_registry, new_registry, intermediate_registry;
5050 	sched_edge_stp_state_t self_state;
5051 	/* BEGIN IGNORE CODESTYLE */
5052 	os_atomic_rmw_loop(&sched_edge_stir_the_pot_global_registry,
5053 	    old_registry, new_registry, relaxed, {
5054 		swap_cpu = -1;
5055 		self_state = SESTP_EXTRACT_STATE(old_registry, self_cpu);
5056 		if (self_state == SCHED_EDGE_STP_PENDING) {
5057 			/*
5058 			 * Another core already initiated a swap with us, so we should
5059 			 * wait for that one to finish rather than initiate or request
5060 			 * a new one.
5061 			 */
5062 			os_atomic_rmw_loop_give_up(break);
5063 		}
5064 		/* Scan candidates */
5065 		for (int rotid = lsb_first(swap_candidates_map); rotid != -1; rotid = lsb_next(swap_candidates_map, rotid)) {
5066 			int candidate_cpu = (rotid + search_start_ind) % 64; // un-rotate the bit
5067 			sched_edge_stp_state_t candidate_state = SESTP_EXTRACT_STATE(old_registry, candidate_cpu);
5068 			if (candidate_state == SCHED_EDGE_STP_REQUESTED) {
5069 				sched_bucket_t candidate_qos = os_atomic_load(
5070 				    &processor_array[candidate_cpu]->processor_set->cpu_running_buckets[candidate_cpu], relaxed);
5071 				if (candidate_qos == thread->th_sched_bucket) {
5072 					/* Found a requesting candidate of matching QoS */
5073 					swap_cpu = candidate_cpu;
5074 					break;
5075 				}
5076 			}
5077 		}
5078 		if (swap_cpu == -1) {
5079 			/* No candidates requesting swap, so mark this core as requesting */
5080 			intermediate_registry = SESTP_SET_STATE(old_registry, self_cpu, SCHED_EDGE_STP_REQUESTED);
5081 		} else {
5082 			/*
5083 			 * Mark candidate core as selected/pending for swap, and mark
5084 			 * current CPU as not needing a swap anymore, since we will now
5085 			 * start one.
5086 			 */
5087 			intermediate_registry = SESTP_SET_STATE(old_registry, self_cpu, SCHED_EDGE_STP_PENDING);
5088 			intermediate_registry = SESTP_SET_STATE(intermediate_registry, swap_cpu, SCHED_EDGE_STP_PENDING);
5089 		}
5090 		new_registry = intermediate_registry;
5091 	});
5092 	/* END IGNORE CODESTYLE */
5093 	/* Leave debug tracepoints for tracking any updates to registry state */
5094 	if (self_state != SCHED_EDGE_STP_PENDING) {
5095 		if (swap_cpu == -1) {
5096 			if (self_state != SCHED_EDGE_STP_REQUESTED) {
5097 				/* Now requesting */
5098 				KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) |
5099 				    DBG_FUNC_START, 0, self_cpu, cpu_of_type_offset_ind, 0);
5100 			}
5101 		} else {
5102 			if (self_state == SCHED_EDGE_STP_REQUESTED) {
5103 				/* Now pending */
5104 				KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) |
5105 				    DBG_FUNC_END, 1, self_cpu, cpu_of_type_offset_ind, 0);
5106 			}
5107 			int swap_state = SESTP_EXTRACT_STATE(old_registry, swap_cpu);
5108 			if (swap_state == SCHED_EDGE_STP_REQUESTED) {
5109 				/* Swap core now pending */
5110 				KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) |
5111 				    DBG_FUNC_END, 1, swap_cpu, cpu_of_type_offset_ind, 0);
5112 			}
5113 		}
5114 	}
5115 	if (swap_cpu != -1) {
5116 		/* Initiate a stir-the-pot swap */
5117 		assert3s(swap_cpu, <, ml_get_topology_info()->num_cpus);
5118 		assert3s(swap_cpu, !=, self_processor->cpu_id);
5119 		processor_t swap_processor = processor_array[swap_cpu];
5120 		if (swap_processor == PROCESSOR_NULL) {
5121 			/* Unlikely early boot initialization race */
5122 			return -1;
5123 		}
5124 		assert3u(sched_edge_stir_the_pot_core_type_is_desired(swap_processor->processor_set), !=,
5125 		    sched_edge_stir_the_pot_core_type_is_desired(self_processor->processor_set));
5126 		if (sched_edge_stir_the_pot_core_type_is_desired(self_processor->processor_set)) {
5127 			/*
5128 			 * Send a message and IPI notification to the lower-performance
5129 			 * core we found which wants to swap, so it will know to send its
5130 			 * thread back here.
5131 			 */
5132 			os_atomic_store(&swap_processor->stir_the_pot_inbox_cpu, self_cpu, relaxed);
5133 			processor_set_t swap_pset = swap_processor->processor_set;
5134 			pset_lock(swap_pset);
5135 			sched_ipi_type_t ipi_type = sched_ipi_action(swap_processor, NULL,
5136 			    SCHED_IPI_EVENT_REBALANCE);
5137 			pset_unlock(swap_pset);
5138 			sched_ipi_perform(swap_processor, ipi_type);
5139 		} else {
5140 			/*
5141 			 * Send a message to self to send this thread to the swap P-core. The P-core
5142 			 * will clear its own pending state upon committing to the incoming swap
5143 			 * thread once that happens.
5144 			 */
5145 			os_atomic_store(&self_processor->stir_the_pot_inbox_cpu, swap_cpu, relaxed);
5146 		}
5147 	}
5148 	KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) | DBG_FUNC_NONE,
5149 	    (swap_cpu != -1) ? 1 : 0, swap_cpu, old_registry, cpu_of_type_offset_ind);
5150 	return swap_cpu;
5151 }
5152 
5153 /*
5154  * sched_edge_stir_the_pot_clear_registry_entry()
5155  *
5156  * Mark the current CPU as NOT containing a thread which is eligible
5157  * to be swapped for stir-the-pot.
5158  * Preemption must be disabled.
5159  */
5160 void
5161 sched_edge_stir_the_pot_clear_registry_entry(void)
5162 {
5163 	int self_cpu = current_processor()->cpu_id;
5164 	sched_edge_stp_state_t self_state;
5165 	sched_edge_stp_registry_t old_registry, new_registry;
5166 	os_atomic_rmw_loop(&sched_edge_stir_the_pot_global_registry,
5167 	    old_registry, new_registry, relaxed, {
5168 		self_state = SESTP_EXTRACT_STATE(old_registry, self_cpu);
5169 		if (self_state == SCHED_EDGE_STP_NOT_WANT) {
5170 		        /* State already cleared, nothing to be done */
5171 		        os_atomic_rmw_loop_give_up(break);
5172 		}
5173 		new_registry = SESTP_SET_STATE(old_registry, self_cpu, SCHED_EDGE_STP_NOT_WANT);
5174 	});
5175 	if (self_state == SCHED_EDGE_STP_REQUESTED) {
5176 		/* Request was cleared */
5177 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) | DBG_FUNC_END,
5178 		    2, self_cpu, 0, 0);
5179 	}
5180 }
5181 
5182 /*
5183  * sched_edge_stir_the_pot_set_registry_entry()
5184  *
5185  * Mark the current CPU as containing a thread which is eligible
5186  * to be swapped to a core of the opposite type for stir-the-pot.
5187  * Preemption must be disabled.
5188  */
5189 static inline void
5190 sched_edge_stir_the_pot_set_registry_entry(void)
5191 {
5192 	int self_cpu = current_processor()->cpu_id;
5193 	sched_edge_stp_state_t self_state;
5194 	sched_edge_stp_registry_t old_registry, new_registry;
5195 	bool newly_requested = os_atomic_rmw_loop(&sched_edge_stir_the_pot_global_registry,
5196 	    old_registry, new_registry, relaxed, {
5197 		self_state = SESTP_EXTRACT_STATE(old_registry, self_cpu);
5198 		if (self_state == SCHED_EDGE_STP_REQUESTED) {
5199 		        /* Core already registered, nothing to be done */
5200 		        os_atomic_rmw_loop_give_up(break);
5201 		}
5202 		new_registry = SESTP_SET_STATE(old_registry, self_cpu, SCHED_EDGE_STP_REQUESTED);
5203 	});
5204 	if (newly_requested) {
5205 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) | DBG_FUNC_START,
5206 		    3, self_cpu, self_state, 0);
5207 	}
5208 }
5209 
5210 /* Stir-the-pot is designed for sharing time on the P-cores */
5211 static inline bool
5212 sched_edge_stir_the_pot_core_type_is_desired(processor_set_t pset)
5213 {
5214 	return pset->pset_type == CLUSTER_TYPE_P;
5215 }
5216 
5217 /*
5218  * sched_edge_stir_the_pot_thread_eligible()
5219  *
5220  * Determine whether a thread is eligible to engage in a
5221  * stir-the-pot swap. It must be P-recommended, unbound, and not
5222  * round-robin shared resource. Additionally, it must have already
5223  * expired a quantum on its current core type.
5224  */
5225 static inline bool
5226 sched_edge_stir_the_pot_thread_eligible(thread_t thread)
5227 {
5228 	processor_set_t preferred_pset;
5229 	if ((thread == THREAD_NULL) ||
5230 	    ((preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)]) == PROCESSOR_SET_NULL)) {
5231 		/* Still initializing at boot */
5232 		return false;
5233 	}
5234 	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
5235 	bool right_kind_of_thread =
5236 	    sched_edge_stir_the_pot_core_type_is_desired(preferred_pset) &&
5237 	    (thread->sched_mode != TH_MODE_REALTIME) &&
5238 	    ((thread->state & TH_IDLE) == 0) &&
5239 	    SCHED_CLUTCH_THREAD_ELIGIBLE(thread) &&
5240 	    (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) == false) &&
5241 	    (shared_rsrc_type == CLUSTER_SHARED_RSRC_TYPE_NONE ||
5242 	    shared_rsrc_type == CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST);
5243 	bool ready_for_swap = sched_edge_stir_the_pot_core_type_is_desired(current_processor()->processor_set) ?
5244 	    thread->th_expired_quantum_on_higher_core :
5245 	    thread->th_expired_quantum_on_lower_core;
5246 	return right_kind_of_thread && ready_for_swap;
5247 }
5248 
5249 /*
5250  * sched_edge_stir_the_pot_check_inbox_for_thread()
5251  *
5252  * Check whether this thread on a non-P-core has been chosen by a P-core to
5253  * swap places for stir-the-pot, optionally consuming the inbox message.
5254  * Preemption must be disabled.
5255  */
5256 static inline int
5257 sched_edge_stir_the_pot_check_inbox_for_thread(thread_t thread, bool consume_message)
5258 {
5259 	processor_t self_processor = current_processor();
5260 	int dst_cpu = -1;
5261 	if (sched_edge_stir_the_pot_thread_eligible(thread)) {
5262 		/* Thread can accept the inbox message */
5263 		dst_cpu = os_atomic_load(&self_processor->stir_the_pot_inbox_cpu, relaxed);
5264 	} else {
5265 		/* Ensure registry state is cleared for ineligible thread, if it hasn't been already */
5266 		sched_edge_stir_the_pot_clear_registry_entry();
5267 		/*
5268 		 * Note, we don't clear a possible inbox message, in case an eligible
5269 		 * thread comes back on-core quickly to receive it.
5270 		 */
5271 	}
5272 	if (consume_message) {
5273 		/*
5274 		 * Unconditionally clear inbox, since either we are triggering a
5275 		 * swap now or ultimately discarding the message because conditions
5276 		 * have changed (thread not eligible).
5277 		 */
5278 		os_atomic_store(&self_processor->stir_the_pot_inbox_cpu, -1, relaxed);
5279 		/*
5280 		 * We may have delayed requesting a stir-the-pot swap for the current thread
5281 		 * due to a pending inbox message for the previous thread. Now that such a
5282 		 * message has been received, finish updating the registry state.
5283 		 */
5284 		if (sched_edge_stir_the_pot_thread_eligible(self_processor->active_thread)) {
5285 			sched_edge_stir_the_pot_set_registry_entry();
5286 		}
5287 	}
5288 	return dst_cpu;
5289 }
5290 
5291 /*
5292  * sched_edge_stir_the_pot_update_registry_state()
5293  *
5294  * Update stir-the-pot state for the current processor based on its
5295  * (possibly new) current thread. This sets or clears the registry state
5296  * which indicates whether the processor is running a thread that wants
5297  * and is eligible to be swapped with a thread on the opposite core type.
5298  * Preemption must be disabled.
5299  */
5300 void
5301 sched_edge_stir_the_pot_update_registry_state(thread_t thread)
5302 {
5303 	processor_t self_processor = current_processor();
5304 	/*
5305 	 * Clear corresponding th_expired_quantum_on_ field now that thread
5306 	 * is getting a chance to run on the opposite type.
5307 	 */
5308 	if (sched_edge_stir_the_pot_core_type_is_desired(self_processor->processor_set)) {
5309 		thread->th_expired_quantum_on_lower_core = false;
5310 	} else {
5311 		thread->th_expired_quantum_on_higher_core = false;
5312 	}
5313 	if (sched_edge_stir_the_pot_thread_eligible(thread)) {
5314 		int inbox_message = os_atomic_load(&self_processor->stir_the_pot_inbox_cpu, relaxed);
5315 		if (inbox_message == -1) {
5316 			/* Set the registry bit */
5317 			sched_edge_stir_the_pot_set_registry_entry();
5318 		} else {
5319 			assert(sched_edge_stir_the_pot_core_type_is_desired(self_processor->processor_set) == false);
5320 			/*
5321 			 * There's an inbox message which still needs to be used at the next
5322 			 * migration decision, so avoid starting a new request or clearing the
5323 			 * interim pending status until then.
5324 			 */
5325 		}
5326 	} else {
5327 		/* Thread is ineligible for swap, so clear the registry bit */
5328 		sched_edge_stir_the_pot_clear_registry_entry();
5329 	}
5330 }
5331 
5332 /*
5333  * sched_edge_quantum_expire()
5334  *
5335  * Update stir-the-pot eligibility and drive stir-the-pot swaps.
5336  * Thread lock must be held.
5337  */
5338 static void
5339 sched_edge_quantum_expire(thread_t thread)
5340 {
5341 	if (sched_edge_stir_the_pot_core_type_is_desired(current_processor()->processor_set)) {
5342 		thread->th_expired_quantum_on_higher_core = true;
5343 	} else {
5344 		thread->th_expired_quantum_on_lower_core = true;
5345 	}
5346 	if (sched_edge_stir_the_pot_thread_eligible(thread)) {
5347 		sched_edge_stir_the_pot_try_trigger_swap(thread);
5348 	}
5349 }
5350 
5351 /*
5352  * sched_edge_run_count_incr()
5353  *
5354  * Update runnable thread counts in the same way as
5355  * sched_clutch_run_incr(), and reset per-thread, quantum-
5356  * expired tracking used by stir-the-pot, as the thread
5357  * is unblocking.
5358  */
5359 static uint32_t
5360 sched_edge_run_count_incr(thread_t thread)
5361 {
5362 	uint32_t new_count = sched_clutch_run_incr(thread);
5363 	/* Thread is unblocking and so resets its quantum tracking */
5364 	thread->th_expired_quantum_on_lower_core = false;
5365 	thread->th_expired_quantum_on_higher_core = false;
5366 	return new_count;
5367 }
5368 
5369 /* Return true if this thread should not continue running on this processor */
5370 static bool
5371 sched_edge_thread_avoid_processor(processor_t processor, thread_t thread, ast_t reason)
5372 {
5373 	if (thread->bound_processor == processor) {
5374 		/* Thread is bound here */
5375 		return false;
5376 	}
5377 
5378 	/*
5379 	 * On quantum expiry, check the migration bitmask to see whether this thread should be migrated off this core.
5380 	 * A migration is only recommended if there's also an idle core available that needn't be avoided.
5381 	 */
5382 	if (reason & AST_QUANTUM) {
5383 		if (bit_test(processor->processor_set->perfcontrol_cpu_migration_bitmask, processor->cpu_id)) {
5384 			uint64_t non_avoided_idle_primary_map = processor->processor_set->cpu_state_map[PROCESSOR_IDLE] & processor->processor_set->recommended_bitmask & ~processor->processor_set->perfcontrol_cpu_migration_bitmask;
5385 			if (non_avoided_idle_primary_map != 0) {
5386 				return true;
5387 			}
5388 		}
5389 	}
5390 
5391 	processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
5392 
5393 	if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) &&
5394 	    preferred_pset->pset_id != processor->processor_set->pset_id &&
5395 	    pset_type_is_recommended(preferred_pset)) {
5396 		/* We should send this thread to the bound cluster */
5397 		return true;
5398 	}
5399 
5400 	sched_clutch_edge edge = (thread->sched_pri >= BASEPRI_RTQUEUES)
5401 	    ? sched_rt_config_get(preferred_pset->pset_cluster_id, processor->processor_set->pset_cluster_id)
5402 	    : sched_edge_config_get(preferred_pset->pset_cluster_id, processor->processor_set->pset_cluster_id, thread->th_sched_bucket);
5403 	if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) == false &&
5404 	    preferred_pset->pset_id != processor->processor_set->pset_id &&
5405 	    edge.sce_migration_allowed == false &&
5406 	    edge.sce_steal_allowed == false) {
5407 		/*
5408 		 * Thread isn't allowed to be here, according to the edge migration graph.
5409 		 * Perhaps the thread's priority or boundness or its thread group's preferred
5410 		 * pset or the edge migration graph changed.
5411 		 *
5412 		 * We should only preempt after confirming the thread actually has a
5413 		 * recommended, allowed alternative pset to run on.
5414 		 */
5415 		for (uint32_t pset_id = 0; pset_id < sched_num_psets; pset_id++) {
5416 			if (pset_id == processor->processor_set->pset_id) {
5417 				continue;
5418 			}
5419 			edge = (thread->sched_pri >= BASEPRI_RTQUEUES)
5420 			    ? sched_rt_config_get(preferred_pset->pset_id, pset_id)
5421 			    : sched_edge_config_get(preferred_pset->pset_id, pset_id, thread->th_sched_bucket);
5422 			if (pset_is_recommended(pset_array[pset_id]) && ((pset_id == preferred_pset->pset_id) || edge.sce_migration_allowed)) {
5423 				/* Thread can be run elsewhere. */
5424 				return true;
5425 			}
5426 		}
5427 	}
5428 
5429 	/* Evaluate shared resource policies */
5430 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) {
5431 		return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
5432 	}
5433 	if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST)) {
5434 		if (processor->processor_set->pset_type != preferred_pset->pset_type &&
5435 		    pset_type_is_recommended(preferred_pset)) {
5436 			return true;
5437 		}
5438 		return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set);
5439 	}
5440 
5441 	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5442 		return false;
5443 	}
5444 	/* ~~ No realtime or shared resource threads beyond this point ~~ */
5445 
5446 	/*
5447 	 * Stir-the-Pot:
5448 	 * A non-P-core should preempt if a P-core has been found to which the current,
5449 	 * quantum-expired thread can be swapped for stir-the-pot. This lets threads in a
5450 	 * multi-threaded workload share time on the P-cores so they make roughly equal
5451 	 * forward progress.
5452 	 */
5453 	if (sched_edge_stir_the_pot_check_inbox_for_thread(thread, false) != -1) {
5454 		return true;
5455 	}
5456 
5457 	/*
5458 	 * Compaction:
5459 	 * If the preferred pset for the thread is now idle, try and migrate the thread to that cluster.
5460 	 */
5461 	if ((processor->processor_set != preferred_pset) &&
5462 	    (sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket) == 0)) {
5463 		return true;
5464 	}
5465 
5466 	/*
5467 	 * Running Rebalance:
5468 	 * We are willing to preempt the thread in order to migrate it onto an idle core
5469 	 * of the preferred type.
5470 	 */
5471 	if ((processor->processor_set->pset_type != preferred_pset->pset_type) &&
5472 	    pset_type_is_recommended(preferred_pset)) {
5473 		/* Scan for idle pset */
5474 		for (uint32_t pset_id = 0; pset_id < sched_num_psets; pset_id++) {
5475 			processor_set_t candidate_pset = pset_array[pset_id];
5476 			edge = sched_edge_config_get(preferred_pset->pset_id, pset_id, thread->th_sched_bucket);
5477 			if ((candidate_pset->pset_type == preferred_pset->pset_type) &&
5478 			    edge.sce_migration_allowed &&
5479 			    (sched_edge_cluster_load_metric(candidate_pset, thread->th_sched_bucket) == 0)) {
5480 				return true;
5481 			}
5482 		}
5483 	}
5484 
5485 	return false;
5486 }
5487 
5488 static bool
5489 sched_edge_balance(__unused processor_t cprocessor, processor_set_t cpset)
5490 {
5491 	assert(cprocessor == current_processor());
5492 	pset_unlock(cpset);
5493 
5494 	uint64_t ast_processor_map = 0;
5495 	sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
5496 
5497 	bitmap_t *foreign_pset_bitmap = cpset->foreign_psets;
5498 	for (int cluster = bitmap_first(foreign_pset_bitmap, sched_num_psets); cluster >= 0; cluster = bitmap_next(foreign_pset_bitmap, cluster)) {
5499 		/* Skip the pset if it's not schedulable */
5500 		processor_set_t target_pset = pset_array[cluster];
5501 		if (pset_is_recommended(target_pset) == false) {
5502 			continue;
5503 		}
5504 
5505 		pset_lock(target_pset);
5506 		uint64_t cpu_running_foreign_map = (target_pset->cpu_running_foreign & target_pset->cpu_state_map[PROCESSOR_RUNNING]);
5507 		for (int cpuid = lsb_first(cpu_running_foreign_map); cpuid >= 0; cpuid = lsb_next(cpu_running_foreign_map, cpuid)) {
5508 			if (!sched_edge_cpu_running_foreign_shared_rsrc_available(target_pset, cpuid, cpset)) {
5509 				continue;
5510 			}
5511 			processor_t target_cpu = processor_array[cpuid];
5512 			ipi_type[target_cpu->cpu_id] = sched_ipi_action(target_cpu, NULL, SCHED_IPI_EVENT_REBALANCE);
5513 			if (ipi_type[cpuid] != SCHED_IPI_NONE) {
5514 				bit_set(ast_processor_map, cpuid);
5515 			}
5516 		}
5517 		pset_unlock(target_pset);
5518 	}
5519 
5520 	for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
5521 		processor_t ast_processor = processor_array[cpuid];
5522 		sched_ipi_perform(ast_processor, ipi_type[cpuid]);
5523 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_REBAL_RUNNING) | DBG_FUNC_NONE, 0, cprocessor->cpu_id, cpuid, 0);
5524 	}
5525 
5526 	/* Core should light-weight idle using WFE if it just sent out rebalance IPIs */
5527 	return ast_processor_map != 0;
5528 }
5529 
5530 /*
5531  * sched_edge_migration_check()
5532  *
5533  * Routine to evaluate an edge between two clusters to decide if migration is possible
5534  * across that edge. Also updates the selected_pset and max_edge_delta out parameters
5535  * accordingly. The return value indicates if the invoking routine should short circuit
5536  * the search, since an ideal candidate has been found. The routine looks at the regular
5537  * edges and cluster loads or the shared resource loads based on the type of thread.
5538  * edges and cluster loads or the shared resource loads based on the type of thread.
 */
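/*
 * Illustrative example of the edge-delta logic below (loads and weights hypothetical):
 * with a preferred cluster load of 8, a destination load of 3 and an
 * sce_migration_weight of 2, the edge delta is 5 >= 2, so the destination stays in
 * contention and is selected if 5 also exceeds the best delta seen so far. A
 * destination load of 0 instead selects that cluster immediately and ends the search.
 */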
5539 static bool
5540 sched_edge_migration_check(uint32_t cluster_id, processor_set_t preferred_pset,
5541     uint32_t preferred_cluster_load, thread_t thread, processor_set_t *selected_pset, uint32_t *max_edge_delta)
5542 {
5543 	uint32_t preferred_cluster_id = preferred_pset->pset_cluster_id;
5544 	cluster_type_t preferred_cluster_type = pset_type_for_id(preferred_cluster_id);
5545 	processor_set_t dst_pset = pset_array[cluster_id];
5546 	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
5547 	bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
5548 
5549 	if (cluster_id == preferred_cluster_id) {
5550 		return false;
5551 	}
5552 
5553 	if (dst_pset == NULL) {
5554 		return false;
5555 	}
5556 
5557 	sched_clutch_edge edge = sched_edge_config_get(preferred_cluster_id, cluster_id, thread->th_sched_bucket);
5558 	if (edge.sce_migration_allowed == false) {
5559 		return false;
5560 	}
5561 	uint32_t dst_load = shared_rsrc_thread ? (uint32_t)sched_edge_pset_cluster_shared_rsrc_load(dst_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(dst_pset, thread->th_sched_bucket);
5562 	if (dst_load == 0) {
5564 		/* The candidate cluster is idle; select it immediately for execution */
5565 		*selected_pset = dst_pset;
5566 		*max_edge_delta = preferred_cluster_load;
5567 		return true;
5568 	}
5569 
5570 	uint32_t edge_delta = 0;
5571 	if (dst_load > preferred_cluster_load) {
5572 		return false;
5573 	}
5574 	edge_delta = preferred_cluster_load - dst_load;
5575 	if (!shared_rsrc_thread && (edge_delta < edge.sce_migration_weight)) {
5576 		/*
5577 		 * For non shared resource threads, use the edge migration weight to decide if
5578 		 * this cluster is over-committed at the QoS level of this thread.
5579 		 */
5580 		return false;
5581 	}
5582 
5583 	if (edge_delta < *max_edge_delta) {
5584 		return false;
5585 	}
5586 	if (edge_delta == *max_edge_delta) {
5587 		/* If the edge delta is the same as the max delta, make sure a homogeneous cluster is picked */
5588 		boolean_t selected_homogeneous = ((*selected_pset)->pset_type == preferred_cluster_type);
5589 		boolean_t candidate_homogeneous = (dst_pset->pset_type == preferred_cluster_type);
5590 		if (selected_homogeneous || !candidate_homogeneous) {
5591 			return false;
5592 		}
5593 	}
5594 	/* dst_pset seems to be the best candidate for migration; however other candidates should still be evaluated */
5595 	*max_edge_delta = edge_delta;
5596 	*selected_pset = dst_pset;
5597 	return false;
5598 }
5599 
5600 /*
5601  * sched_edge_migrate_edges_evaluate()
5602  *
5603  * Routine to find the candidate for thread migration based on edge weights.
5604  *
5605  * Returns the most ideal cluster for execution of this thread based on outgoing edges of the preferred pset. Can
5606  * return preferred_pset if it's the most ideal destination for this thread.
5607  */
5608 static processor_set_t
5609 sched_edge_migrate_edges_evaluate(processor_set_t preferred_pset, uint32_t preferred_cluster_load, thread_t thread)
5610 {
5611 	processor_set_t selected_pset = preferred_pset;
5612 	uint32_t max_edge_delta = 0;
5613 	bool search_complete = false;
5614 	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
5615 	bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
5616 
5617 	bitmap_t *foreign_pset_bitmap = preferred_pset->foreign_psets;
5618 	bitmap_t *native_pset_bitmap = preferred_pset->native_psets;
5619 	/* Always start the search with the native clusters */
5620 	sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
5621 	while (sched_iterate_psets_ordered(preferred_pset, &preferred_pset->spill_search_order[thread->th_sched_bucket], native_pset_bitmap[0], &istate)) {
5622 		search_complete = sched_edge_migration_check(istate.spis_pset_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
5623 		if (search_complete) {
5624 			break;
5625 		}
5626 	}
5627 
5628 	if (search_complete) {
5629 		return selected_pset;
5630 	}
5631 
5632 	if (shared_rsrc_thread && (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST)) {
5633 		/*
5634 		 * If the shared resource scheduling policy is EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST, the scheduler tries
5635 		 * to fill up the preferred cluster and its homogeneous peers first.
5636 		 */
5637 
5638 		if (max_edge_delta > 0) {
5639 			/*
5640 			 * This means there is a peer cluster of the same type as the preferred cluster (since the code
5641 			 * above only looks at the native_psets) with fewer threads of the shared resource type than the
5642 			 * preferred cluster. This indicates that there is capacity on a native cluster where this thread
5643 			 * should be placed.
5644 			 */
5645 			return selected_pset;
5646 		}
5647 		/*
5648 		 * Indicates that all peer native clusters are at the same shared resource usage; check if the preferred cluster has
5649 		 * any more capacity left.
5650 		 */
5651 		if (sched_edge_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) < pset_available_cpu_count(preferred_pset)) {
5652 			return preferred_pset;
5653 		}
5654 		/*
5655 		 * Looks like the preferred cluster and all its native peers are full with shared resource threads; need to start looking
5656 		 * at non-native clusters for capacity.
5657 		 */
5658 	}
5659 
5660 	/* Now look at the non-native clusters */
5661 	istate = SCHED_PSET_ITERATE_STATE_INIT;
5662 	while (sched_iterate_psets_ordered(preferred_pset, &preferred_pset->spill_search_order[thread->th_sched_bucket], foreign_pset_bitmap[0], &istate)) {
5663 		search_complete = sched_edge_migration_check(istate.spis_pset_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta);
5664 		if (search_complete) {
5665 			break;
5666 		}
5667 	}
5668 	return selected_pset;
5669 }
5670 
5671 /*
5672  * sched_edge_candidate_alternative()
5673  *
5674  * Routine to find an alternative cluster from candidate_cluster_bitmap since the
5675  * selected_pset is not available for execution. The logic tries to prefer homogeneous
5676  * clusters over heterogeneous clusters since this is typically used in thread
5677  * placement decisions.
5678  */
5679 _Static_assert(MAX_PSETS <= 64, "Unable to fit maximum number of psets in uint64_t bitmask");
5680 static processor_set_t
5681 sched_edge_candidate_alternative(processor_set_t selected_pset, uint64_t candidate_cluster_bitmap)
5682 {
5683 	/*
5684 	 * It looks like the most ideal pset is not available for scheduling currently.
5685 	 * Try to find a homogeneous cluster that is still available.
5686 	 */
5687 	uint64_t available_native_clusters = selected_pset->native_psets[0] & candidate_cluster_bitmap;
5688 	int available_cluster_id = lsb_first(available_native_clusters);
5689 	if (available_cluster_id == -1) {
5690 		/* Looks like none of the homogeneous clusters are available; pick the first available cluster */
5691 		available_cluster_id = bit_first(candidate_cluster_bitmap);
5692 	}
5693 	assert(available_cluster_id != -1);
5694 	return pset_array[available_cluster_id];
5695 }
5696 
5697 /*
5698  * sched_edge_switch_pset_lock()
5699  *
5700  * Helper routine for sched_edge_migrate_candidate() which switches pset locks (if needed) based on
5701  * switch_pset_locks.
5702  * Returns the newly locked pset after the switch.
5703  */
5704 static processor_set_t
5705 sched_edge_switch_pset_lock(processor_set_t selected_pset, processor_set_t locked_pset, bool switch_pset_locks)
5706 {
5707 	if (!switch_pset_locks) {
5708 		return locked_pset;
5709 	}
5710 	if (selected_pset != locked_pset) {
5711 		pset_unlock(locked_pset);
5712 		pset_lock(selected_pset);
5713 		return selected_pset;
5714 	} else {
5715 		return locked_pset;
5716 	}
5717 }
5718 
5719 /*
5720  * sched_edge_migrate_candidate()
5721  *
5722  * Routine to find an appropriate cluster for scheduling a thread. The routine looks at the properties of
5723  * the thread and the preferred cluster to determine the best available pset for scheduling.
5724  *
5725  * The switch_pset_locks parameter defines whether the routine should switch pset locks to provide an
5726  * accurate scheduling decision. This mode is typically used when choosing a pset for scheduling a thread since the
5727  * decision has to be synchronized with another CPU changing the recommendation of clusters available
5728  * on the system. If this parameter is set to false, this routine returns the best effort indication of
5729  * the cluster the thread should be scheduled on. It is typically used in fast path contexts (such as
5730  * SCHED(thread_avoid_processor) to determine if there is a possibility of scheduling this thread on a
5731  * SCHED(thread_avoid_processor)) to determine if there is a possibility of scheduling this thread on a
5732  *
5733  * Routine returns the most ideal cluster for scheduling. If switch_pset_locks is set, it ensures that the
5734  * resultant pset lock is held.
5735  */
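/*
 * Rough order of evaluation below (summary only): cluster-bound threads get their
 * bound cluster if it is recommended; otherwise the pset-hint, boot-initialization,
 * realtime and idle-preferred-cluster shortcuts are tried, then a pending
 * stir-the-pot inbox message, then the edge-weight evaluation; finally the chosen
 * pset is checked for availability and, if derecommended, an alternative is picked.
 */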
5736 static processor_set_t
5737 sched_edge_migrate_candidate(processor_set_t _Nullable preferred_pset, thread_t thread,
5738     processor_set_t locked_pset, bool switch_pset_locks, processor_t *processor_hint_out,
5739     sched_options_t *options_inout)
5740 {
5741 	processor_set_t selected_pset = preferred_pset;
5742 	cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread);
5743 	bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE);
5744 	bool stirring_the_pot = false;
5745 
5746 	if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) {
5747 		/*
5748 		 * For cluster-bound threads, choose the cluster to which the thread is bound, unless that
5749 		 * cluster is unavailable. If it's not available, fall through to the regular cluster selection
5750 		 * logic which handles derecommended clusters appropriately.
5751 		 */
5752 		selected_pset = pset_array[sched_edge_thread_bound_cluster_id(thread)];
5753 		if (selected_pset != NULL) {
5754 			locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
5755 			if (pset_is_recommended(selected_pset)) {
5756 				return selected_pset;
5757 			}
5758 		}
5759 	}
5760 
5761 	uint64_t candidate_cluster_bitmap = mask(sched_num_psets);
5762 #if DEVELOPMENT || DEBUG
5763 	extern int enable_task_set_cluster_type;
5764 	task_t task = get_threadtask(thread);
5765 	if (enable_task_set_cluster_type && (task->t_flags & TF_USE_PSET_HINT_CLUSTER_TYPE)) {
5766 		processor_set_t pset_hint = task->pset_hint;
5767 		if (pset_hint && (selected_pset == NULL || selected_pset->pset_cluster_type != pset_hint->pset_cluster_type)) {
5768 			selected_pset = pset_hint;
5769 			goto migrate_candidate_available_check;
5770 		}
5771 	}
5772 #endif
5773 
5774 	if (preferred_pset == NULL) {
5775 		/* The preferred_pset has not finished initializing at boot */
5776 		goto migrate_candidate_available_check;
5777 	}
5778 
5779 	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
5780 		/* For realtime threads, try and schedule them on the preferred pset always */
5781 		goto migrate_candidate_available_check;
5782 	}
5783 
5784 	uint32_t preferred_cluster_load = shared_rsrc_thread ? (uint32_t)sched_edge_pset_cluster_shared_rsrc_load(preferred_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket);
5785 	if (preferred_cluster_load == 0) {
5786 		goto migrate_candidate_available_check;
5787 	}
5788 
5789 	/*
5790 	 * If this thread has expired quantum on a non-preferred core and is waiting on
5791 	 * "stir-the-pot" to get a turn running on a P-core, check our processor inbox for
5792 	 * stir-the-pot to see if an eligible P-core has already been found for swap.
5793 	 * If so, try to migrate to the corresponding pset and also carry over the
5794 	 * processor hint to preempt that specific P-core.
5795 	 *
5796 	 * The AMP rebalancing mechanism is available for regular threads or shared resource
5797 	 * threads with the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy.
5798 	 */
5799 	int stir_the_pot_swap_cpu = sched_edge_stir_the_pot_check_inbox_for_thread(thread, true);
5800 	if (stir_the_pot_swap_cpu != -1) {
5801 		*processor_hint_out = processor_array[stir_the_pot_swap_cpu];
5802 		selected_pset = processor_array[stir_the_pot_swap_cpu]->processor_set;
5803 		stirring_the_pot = true;
5804 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) | DBG_FUNC_NONE,
5805 		    2, stir_the_pot_swap_cpu, 0, 0);
5806 		goto migrate_candidate_available_check;
5807 	}
5808 
5809 	/* Look at edge weights to decide the most ideal migration candidate for this thread */
5810 	selected_pset = sched_edge_migrate_edges_evaluate(preferred_pset, preferred_cluster_load, thread);
5811 
5812 migrate_candidate_available_check:
5813 	if (selected_pset == NULL) {
5814 		/* The selected_pset has not finished initializing at boot */
5815 		pset_unlock(locked_pset);
5816 		return NULL;
5817 	}
5818 
5819 	locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks);
5820 	if (pset_is_recommended(selected_pset) == true) {
5821 		/* Committing to the pset */
5822 		if (stirring_the_pot) {
5823 			*options_inout |= SCHED_STIR_POT;
5824 		}
5825 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_OVERLOAD) | DBG_FUNC_NONE, thread_tid(thread), preferred_pset->pset_cluster_id, selected_pset->pset_cluster_id, preferred_cluster_load);
5826 		return selected_pset;
5827 	}
5828 	stirring_the_pot = false;
5829 	/* Looks like selected_pset is not available for scheduling; remove it from candidate_cluster_bitmap */
5830 	bitmap_clear(&candidate_cluster_bitmap, selected_pset->pset_cluster_id);
5831 	if (__improbable(bitmap_first(&candidate_cluster_bitmap, sched_num_psets) == -1)) {
5832 		pset_unlock(locked_pset);
5833 		return NULL;
5834 	}
5835 	/* Try and find an alternative for the selected pset */
5836 	selected_pset = sched_edge_candidate_alternative(selected_pset, candidate_cluster_bitmap);
5837 	goto migrate_candidate_available_check;
5838 }
5839 
5840 static processor_t
5841 sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread, sched_options_t *options_inout)
5842 {
5843 	/* Bound threads don't call this function */
5844 	assert(thread->bound_processor == PROCESSOR_NULL);
5845 	processor_t chosen_processor = PROCESSOR_NULL;
5846 
5847 	/*
5848 	 * sched_edge_preferred_pset() returns the preferred pset for a given thread.
5849 	 * It should take the passed in "pset" as a hint which represents the recency metric for
5850 	 * pset selection logic.
5851 	 */
5852 	processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
5853 	processor_set_t chosen_pset = preferred_pset;
5854 	/*
5855 	 * If the preferred pset is overloaded, find a pset which is the best candidate to migrate
5856 	 * threads to. sched_edge_migrate_candidate() returns the preferred pset
5857 	 * if it has capacity; otherwise finds the best candidate pset to migrate this thread to.
5858 	 *
5859 	 * Edge Scheduler Optimization
5860 	 * It might be useful to build a recency metric for the thread for multiple clusters and
5861 	 * factor that into the migration decisions.
5862 	 */
5863 	chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, pset, true, &processor, options_inout);
5864 	if (chosen_pset) {
5865 		chosen_processor = choose_processor(chosen_pset, processor, thread, options_inout);
5866 	}
5867 	return chosen_processor;
5868 }
5869 
5870 /*
5871  * sched_edge_clutch_bucket_threads_drain()
5872  *
5873  * Drains all the runnable threads which are not restricted to the root_clutch (due to clutch
5874  * bucket overrides etc.) into a local thread queue.
5875  */
5876 static void
5877 sched_edge_clutch_bucket_threads_drain(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch, struct pulled_thread_queue * threadq)
5878 {
5879 	thread_t thread = THREAD_NULL;
5880 	uint64_t current_timestamp = mach_approximate_time();
5881 	qe_foreach_element_safe(thread, &clutch_bucket->scb_thread_timeshare_queue, th_clutch_timeshare_link) {
5882 		sched_clutch_thread_remove(root_clutch, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_NONE);
5883 		pulled_thread_queue_enqueue(threadq, thread);
5884 	}
5885 }
5886 
5887 /*
5888  * sched_edge_update_preferred_cluster()
5889  *
5890  * Routine to update the preferred cluster for QoS buckets within a thread group.
5891  * The buckets to be updated are specified as a bitmap (clutch_bucket_modify_bitmap).
5892  */
5893 static void
5894 sched_edge_update_preferred_cluster(
5895 	sched_clutch_t sched_clutch,
5896 	bitmap_t *clutch_bucket_modify_bitmap,
5897 	uint32_t *tg_bucket_preferred_cluster)
5898 {
5899 	for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
5900 		os_atomic_store(&sched_clutch->sc_clutch_groups[bucket].scbg_preferred_cluster, tg_bucket_preferred_cluster[bucket], relaxed);
5901 	}
5902 }
5903 
5904 #if !SCHED_TEST_HARNESS
5905 
5906 /*
5907  * sched_edge_migrate_thread_group_runnable_threads()
5908  *
5909  * Routine to implement the migration of threads on a cluster when the thread group
5910  * recommendation is updated. The migration works using a 2-phase
5911  * algorithm.
5912  *
5913  * Phase 1: With the pset lock held, check the recommendation of the clutch buckets.
5914  * For each clutch bucket, if it needs to be migrated immediately, drain the threads
5915  * into a local thread queue. Otherwise mark the clutch bucket as native/foreign as
5916  * appropriate.
5917  *
5918  * Phase 2: After unlocking the pset, drain all the threads from the local thread
5919  * queue and mark them runnable which should land them in the right hierarchy.
5920  *
5921  * The routine assumes that the preferences for the clutch buckets/clutch bucket
5922  * groups have already been updated by the caller.
5923  *
5924  * - Called with the pset locked and interrupts disabled.
5925  * - Returns with the pset unlocked.
5926  */
5927 static void
5928 sched_edge_migrate_thread_group_runnable_threads(
5929 	sched_clutch_t sched_clutch,
5930 	sched_clutch_root_t root_clutch,
5931 	bitmap_t *clutch_bucket_modify_bitmap,
5932 	__unused uint32_t *tg_bucket_preferred_cluster,
5933 	bool migrate_immediately, struct pulled_thread_queue * threadq)
5934 {
5935 	sched_clutch_hierarchy_locked_assert(root_clutch);
5936 
5937 	for (int bucket = bitmap_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX); bucket >= 0; bucket = bitmap_next(clutch_bucket_modify_bitmap, bucket)) {
5938 		/* Get the clutch bucket for this cluster and sched bucket */
5939 		sched_clutch_bucket_group_t clutch_bucket_group = &(sched_clutch->sc_clutch_groups[bucket]);
5940 		sched_clutch_bucket_t clutch_bucket = &(clutch_bucket_group->scbg_clutch_buckets[root_clutch->scr_cluster_id]);
5941 		if (clutch_bucket->scb_root == NULL) {
5942 			/* Clutch bucket not runnable or already in the right hierarchy; nothing to do here */
5943 			assert3u(clutch_bucket->scb_thr_count, ==, 0);
5944 			continue;
5945 		}
5946 		assert3p(clutch_bucket->scb_root, ==, root_clutch);
5947 		uint32_t clutch_bucket_preferred_cluster = sched_clutch_bucket_preferred_cluster(clutch_bucket);
5948 
5949 		sched_edge_steal_silo_clutch_bucket_classify(clutch_bucket, root_clutch, clutch_bucket_preferred_cluster);
5950 
5951 		if (migrate_immediately && (root_clutch->scr_cluster_id != clutch_bucket_preferred_cluster)) {
5952 			/*
5953 			 * For transitions where threads need to be migrated immediately, drain the threads into a
5954 			 * local queue unless we are looking at the clutch buckets for the newly recommended
5955 			 * cluster.
5956 			 */
5957 			sched_edge_clutch_bucket_threads_drain(clutch_bucket, clutch_bucket->scb_root, threadq);
5958 		}
5959 	}
5960 
5961 	pset_unlock(root_clutch->scr_pset);
5962 }
5963 
5964 /*
5965  * sched_edge_migrate_thread_group_running_threads()
5966  *
5967  * Routine to find all running threads of a thread group on a specific cluster
5968  * and IPI them if they need to be moved immediately.
5969  */
5970 static void
5971 sched_edge_migrate_thread_group_running_threads(
5972 	sched_clutch_t sched_clutch,
5973 	sched_clutch_root_t root_clutch,
5974 	__unused bitmap_t *clutch_bucket_modify_bitmap,
5975 	uint32_t *tg_bucket_preferred_cluster,
5976 	bool migrate_immediately)
5977 {
5978 	if (migrate_immediately == false) {
5979 		/* If CLPC has recommended not to move threads immediately, nothing to do here */
5980 		return;
5981 	}
5982 
5983 	/*
5984 	 * Edge Scheduler Optimization
5985 	 *
5986 	 * When the system has a large number of clusters and cores, it might be useful to
5987 	 * narrow down the iteration by using a thread running bitmap per clutch.
5988 	 */
5989 	uint64_t ast_processor_map = 0;
5990 	sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
5991 
5992 	uint64_t running_map = root_clutch->scr_pset->cpu_state_map[PROCESSOR_RUNNING];
5993 	/*
5994 	 * Iterate over all CPUs and look for the ones that are running threads from this thread
5995 	 * group and are not restricted to this specific cluster (due to overrides etc.)
5996 	 */
5997 	for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
5998 		processor_t src_processor = processor_array[cpuid];
5999 		boolean_t expected_tg = (src_processor->current_thread_group == sched_clutch->sc_tg);
6000 		sched_bucket_t processor_sched_bucket = src_processor->processor_set->cpu_running_buckets[cpuid];
6001 		if (processor_sched_bucket == TH_BUCKET_SCHED_MAX) {
6002 			continue;
6003 		}
6004 		boolean_t non_preferred_cluster = tg_bucket_preferred_cluster[processor_sched_bucket] != root_clutch->scr_cluster_id;
6005 
6006 		if (expected_tg && non_preferred_cluster) {
6007 			ipi_type[cpuid] = sched_ipi_action(src_processor, NULL, SCHED_IPI_EVENT_REBALANCE);
6008 			if (ipi_type[cpuid] != SCHED_IPI_NONE) {
6009 				bit_set(ast_processor_map, cpuid);
6010 			} else if (src_processor == current_processor()) {
6011 				bit_set(root_clutch->scr_pset->pending_AST_PREEMPT_cpu_mask, cpuid);
6012 				ast_t new_preempt = update_pending_nonurgent_preemption(src_processor, AST_PREEMPT);
6013 				ast_on(new_preempt);
6014 			}
6015 		}
6016 	}
6017 
6018 	/* Perform all the IPIs */
6019 	if (bit_first(ast_processor_map) != -1) {
6020 		for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
6021 			processor_t ast_processor = processor_array[cpuid];
6022 			sched_ipi_perform(ast_processor, ipi_type[cpuid]);
6023 		}
6024 		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, thread_group_get_id(sched_clutch->sc_tg), ast_processor_map, 0, 0);
6025 	}
6026 }
6027 
6028 /*
6029  * sched_edge_tg_preferred_cluster_change()
6030  *
6031  * Routine to handle changes to a thread group's recommendation. In the Edge Scheduler, the preferred cluster
6032  * is specified on a per-QoS basis within a thread group. The routine updates the preferences and performs
6033  * thread migrations based on the policy specified by CLPC.
6034  * tg_bucket_preferred_cluster is an array of size TH_BUCKET_SCHED_MAX which specifies the new preferred cluster
6035  * for each QoS within the thread group.
6036  */
6037 void
6038 sched_edge_tg_preferred_cluster_change(struct thread_group *tg, uint32_t *tg_bucket_preferred_cluster, sched_perfcontrol_preferred_cluster_options_t options)
6039 {
6040 	sched_clutch_t clutch = sched_clutch_for_thread_group(tg);
6041 	/*
6042 	 * In order to optimize the processing, create a bitmap which represents all QoS buckets
6043 	 * for which the preferred cluster has changed.
6044 	 */
6045 	bitmap_t clutch_bucket_modify_bitmap[BITMAP_LEN(TH_BUCKET_SCHED_MAX)] = {0};
6046 	for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
6047 		uint32_t old_preferred_cluster = sched_edge_clutch_bucket_group_preferred_cluster(&clutch->sc_clutch_groups[bucket]);
6048 		uint32_t new_preferred_cluster = tg_bucket_preferred_cluster[bucket];
6049 		if (old_preferred_cluster != new_preferred_cluster) {
6050 			bitmap_set(clutch_bucket_modify_bitmap, bucket);
6051 		}
6052 		KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREFERRED_PSET) | DBG_FUNC_NONE,
6053 		    thread_group_get_id(tg), bucket, new_preferred_cluster, options);
6054 	}
6055 	if (bitmap_lsb_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX) == -1) {
6056 		/* No changes in any clutch buckets; nothing to do here */
6057 		return;
6058 	}
6059 
6060 	/*
6061 	 * The first operation is to update the preferred cluster for all QoS buckets within the
6062 	 * thread group so that any future threads becoming runnable would see the new preferred
6063 	 * cluster value.
6064 	 */
6065 	sched_edge_update_preferred_cluster(clutch, clutch_bucket_modify_bitmap, tg_bucket_preferred_cluster);
6066 
6067 	for (uint32_t cluster_id = 0; cluster_id < sched_num_psets; cluster_id++) {
6068 		processor_set_t pset = pset_array[cluster_id];
6069 		struct pulled_thread_queue *threadq = pulled_thread_queue_prepare();
6070 
6071 		spl_t s = splsched();
6072 		pset_lock(pset);
6073 		/*
6074 		 * Edge Scheduler Optimization
6075 		 *
6076 		 * This currently iterates all clusters looking for running threads of the TG to migrate. It could be
6077 		 * optimized by keeping a per-clutch bitmap of the clusters running threads for a particular TG.
6078 		 */
6079 		/* Migrate all running threads of the TG on this cluster based on options specified by CLPC */
6080 		sched_edge_migrate_thread_group_running_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
6081 		    tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNING));
6082 		/* Migrate all runnable threads of the TG in this cluster's hierarchy based on options specified by CLPC */
6083 		sched_edge_migrate_thread_group_runnable_threads(clutch, &pset->pset_clutch_root, clutch_bucket_modify_bitmap,
6084 		    tg_bucket_preferred_cluster, (options & SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNABLE), threadq);
6085 		/* sched_edge_migrate_thread_group_runnable_threads() returns with pset unlocked */
6086 		splx(s);
6087 
6088 		pulled_thread_queue_flush(threadq);
6089 	}
6090 }
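/*
 * Illustrative sketch (comment only, values hypothetical): a recommendation that keeps most
 * QoS buckets of a thread group on cluster 0 but steers the UT and BG buckets to cluster 1,
 * migrating currently running threads immediately.
 *
 *	uint32_t preferred[TH_BUCKET_SCHED_MAX];
 *	for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
 *		preferred[bucket] = 0;
 *	}
 *	preferred[TH_BUCKET_SHARE_UT] = 1;
 *	preferred[TH_BUCKET_SHARE_BG] = 1;
 *	sched_edge_tg_preferred_cluster_change(tg, preferred,
 *	    SCHED_PERFCONTROL_PREFERRED_CLUSTER_MIGRATE_RUNNING);
 */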
6091 
6092 /*
6093  * sched_edge_pset_made_schedulable()
6094  *
6095  * Pset may already be marked schedulable. Called at least once when new
6096  * processor(s) are made available.
6097  *
6098  * Invoked with the pset lock held and interrupts disabled.
6099  */
6100 static void
6101 sched_edge_pset_made_schedulable(
6102 	processor_set_t pset)
6103 {
6104 	/* Mark the pset as schedulable. The bit may already be set if the pset was already schedulable. */
6105 	atomic_bit_set(sched_edge_available_pset_bitmask, pset->pset_id, memory_order_relaxed);
6106 }
6107 #endif /* !SCHED_TEST_HARNESS */
6108 
6109 
6110 /*
6111  * sched_edge_cpu_init_completed()
6112  *
6113  * Callback routine from the platform layer once all CPUs/clusters have been initialized. This
6114  * provides an opportunity for the edge scheduler to initialize all the edge parameters.
6115  */
6116 static void
6117 sched_edge_cpu_init_completed(void)
6118 {
6119 	/* Now that all cores have registered, compute bitmaps for different core types */
6120 	for (int pset_id = 0; pset_id < sched_num_psets; pset_id++) {
6121 		processor_set_t pset = pset_array[pset_id];
6122 		if (sched_edge_stir_the_pot_core_type_is_desired(pset)) {
6123 			os_atomic_or(&sched_edge_p_core_map, pset->cpu_bitmask, relaxed);
6124 		} else {
6125 			os_atomic_or(&sched_edge_non_p_core_map, pset->cpu_bitmask, relaxed);
6126 		}
6127 	}
6128 	/* Build policy table for setting edge weight tunables based on cluster types */
6129 	sched_clutch_edge edge_config_defaults[MAX_CPU_TYPES][MAX_CPU_TYPES];
6130 	sched_clutch_edge free_spill = (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 1, .sce_steal_allowed = 1};
6131 	sched_clutch_edge no_spill = (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0};
6132 	sched_clutch_edge weighted_spill = (sched_clutch_edge){.sce_migration_weight = 64, .sce_migration_allowed = 1, .sce_steal_allowed = 1};
6133 	/* P -> P */
6134 	edge_config_defaults[CLUSTER_TYPE_P][CLUSTER_TYPE_P] = free_spill;
6135 	/* E -> E */
6136 	edge_config_defaults[CLUSTER_TYPE_E][CLUSTER_TYPE_E] = free_spill;
6137 	/* P -> E */
6138 	edge_config_defaults[CLUSTER_TYPE_P][CLUSTER_TYPE_E] = weighted_spill;
6139 	/* E -> P */
6140 	edge_config_defaults[CLUSTER_TYPE_E][CLUSTER_TYPE_P] = no_spill;
6141 
6142 	spl_t s = splsched();
6143 	for (int src_cluster_id = 0; src_cluster_id < sched_num_psets; src_cluster_id++) {
6144 		processor_set_t src_pset = pset_array[src_cluster_id];
6145 		pset_lock(src_pset);
6146 
6147 		/* At a minimum, each pset's parallelism limits cover its own cluster */
6148 		for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
6149 			src_pset->max_parallel_cores[bucket] = src_pset->cpu_set_count;
6150 			src_pset->max_parallel_clusters[bucket] = 1;
6151 		}
6152 
6153 		/* For each cluster, set all its outgoing edge parameters */
6154 		for (int dst_cluster_id = 0; dst_cluster_id < sched_num_psets; dst_cluster_id++) {
6155 			processor_set_t dst_pset = pset_array[dst_cluster_id];
6156 			if (dst_cluster_id == src_cluster_id) {
6157 				continue;
6158 			}
6159 
6160 			bool clusters_homogenous = (src_pset->pset_type == dst_pset->pset_type);
6161 			if (clusters_homogenous) {
6162 				bitmap_clear(src_pset->foreign_psets, dst_cluster_id);
6163 				bitmap_set(src_pset->native_psets, dst_cluster_id);
6164 				/* Default realtime policy: spill allowed among homogeneous psets. */
6165 				sched_rt_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge) {
6166 					.sce_migration_allowed = true,
6167 					.sce_steal_allowed = true,
6168 					.sce_migration_weight = 0,
6169 				});
6170 			} else {
6171 				bitmap_set(src_pset->foreign_psets, dst_cluster_id);
6172 				bitmap_clear(src_pset->native_psets, dst_cluster_id);
6173 				/* Default realtime policy: disallow spill among heterogeneous psets. */
6174 				sched_rt_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge) {
6175 					.sce_migration_allowed = false,
6176 					.sce_steal_allowed = false,
6177 					.sce_migration_weight = 0,
6178 				});
6179 			}
6180 
6181 			bool clusters_local = (ml_get_die_id(src_cluster_id) == ml_get_die_id(dst_cluster_id));
6182 			if (clusters_local) {
6183 				bitmap_set(src_pset->local_psets, dst_cluster_id);
6184 				bitmap_clear(src_pset->remote_psets, dst_cluster_id);
6185 			} else {
6186 				bitmap_set(src_pset->remote_psets, dst_cluster_id);
6187 				bitmap_clear(src_pset->local_psets, dst_cluster_id);
6188 			}
6189 
6190 			for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) {
6191 				/* Set tunables for an edge based on the cluster types at either end of it */
6192 				sched_clutch_edge edge_config = edge_config_defaults[src_pset->pset_type][dst_pset->pset_type];
6193 				sched_edge_config_set(src_cluster_id, dst_cluster_id, bucket, edge_config);
6194 				if (edge_config.sce_migration_allowed) {
6195 					src_pset->max_parallel_cores[bucket] += dst_pset->cpu_set_count;
6196 					src_pset->max_parallel_clusters[bucket] += 1;
6197 				}
6198 			}
6199 		}
6200 		sched_edge_config_pset_push(src_cluster_id);
6201 
6202 		pset_unlock(src_pset);
6203 	}
6204 	sched_edge_config_final_push();
6205 #if DEVELOPMENT || DEBUG
6206 	assert(sched_edge_config_verify());
6207 #endif /* DEVELOPMENT || DEBUG */
6208 	splx(s);
6209 }
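/*
 * For illustration (hypothetical two-cluster system, cluster 0 = E, cluster 1 = P), the
 * defaults built above yield the following per-bucket edges:
 *
 *	E(0) -> P(1): migration/steal disallowed, weight 0   (no_spill)
 *	P(1) -> E(0): migration/steal allowed,    weight 64  (weighted_spill)
 *
 * so the P-cluster's max_parallel_cores also counts the E-cores, while the E-cluster's
 * width stays limited to its own cores.
 */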
6210 
6211 static bool
6212 sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset)
6213 {
6214 	uint32_t preferred_cluster_id = sched_edge_thread_preferred_cluster(thread);
6215 	if (preferred_cluster_id == pset->pset_cluster_id) {
6216 		return true;
6217 	} else {
6218 		sched_clutch_edge edge;
6219 		if (thread->sched_pri >= BASEPRI_RTQUEUES) {
6220 			edge = sched_rt_config_get(preferred_cluster_id, pset->pset_id);
6221 		} else {
6222 			edge = sched_edge_config_get(preferred_cluster_id, pset->pset_cluster_id, thread->th_sched_bucket);
6223 		}
6224 		return edge.sce_migration_allowed;
6225 	}
6226 }
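/*
 * Illustrative sketch (comment only), assuming the default edge matrix set up in
 * sched_edge_cpu_init_completed(): a timeshare thread preferring an E-cluster is not eligible
 * for a P-cluster pset (E -> P migration disallowed), while a thread preferring a P-cluster
 * remains eligible for E-cluster psets (P -> E migration allowed).
 *
 *	if (sched_edge_thread_eligible_for_pset(thread, pset)) {
 *		// pset is the preferred cluster, or migration from the preferred cluster
 *		// into pset is allowed by the (RT or timeshare) edge configuration
 *	}
 */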
6227 
6228 extern int sched_amp_spill_deferred_ipi;
6229 extern int sched_amp_pcores_preempt_immediate_ipi;
6230 
6231 int sched_edge_migrate_ipi_immediate = 1;
6232 
6233 sched_ipi_type_t
6234 sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
6235 {
6236 	processor_set_t pset = dst->processor_set;
6237 	assert(dst != current_processor());
6238 
6239 	boolean_t deferred_ipi_supported = false;
6240 #if defined(CONFIG_SCHED_DEFERRED_AST)
6241 	deferred_ipi_supported = true;
6242 #endif /* CONFIG_SCHED_DEFERRED_AST */
6243 
6244 	switch (event) {
6245 	case SCHED_IPI_EVENT_SPILL:
6246 		/* For Spill events, use deferred IPIs if sched_amp_spill_deferred_ipi is set */
6247 		if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
6248 			return sched_ipi_deferred_policy(pset, dst, thread, event);
6249 		}
6250 		break;
6251 	case SCHED_IPI_EVENT_PREEMPT:
6252 		/* For preemption, the default policy is to use deferred IPIs
6253 		 * for non-RT P-core preemption. Override that behavior if
6254 		 * sched_amp_pcores_preempt_immediate_ipi is set.
6255 		 */
6256 		if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
6257 			if (sched_amp_pcores_preempt_immediate_ipi && (pset_type_for_id(pset->pset_cluster_id) == CLUSTER_TYPE_P)) {
6258 				return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
6259 			}
6260 			if (sched_edge_migrate_ipi_immediate) {
6261 				processor_set_t preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)];
6262 				/*
6263 				 * For IPI'ing CPUs that are homogeneous with the preferred cluster, use immediate IPIs
6264 				 */
6265 				if (preferred_pset->pset_type == pset->pset_type) {
6266 					return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
6267 				}
6268 				/*
6269 				 * For workloads that are going wide, it might be useful to use an immediate IPI to
6270 				 * wake up the idle CPU if the scheduler estimates that the preferred pset will
6271 				 * be busy for the deferred IPI timeout. The Edge Scheduler uses the avg execution
6272 				 * latency on the preferred pset as an estimate of busyness.
6273 				 */
6274 				if ((preferred_pset->pset_execution_time[thread->th_sched_bucket].pset_avg_thread_execution_time * NSEC_PER_USEC) >= ml_cpu_signal_deferred_get_timer()) {
6275 					return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
6276 				}
6277 			}
6278 		}
6279 		break;
6280 	default:
6281 		break;
6282 	}
6283 	/* Default back to the global policy for all other scenarios */
6284 	return sched_ipi_policy(dst, thread, dst_idle, event);
6285 }
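/*
 * Worked example (hypothetical numbers) for the sched_edge_migrate_ipi_immediate heuristic
 * above: if the average thread execution time for the thread's bucket on the preferred pset
 * is 80us and the deferred IPI timer is 64us, then 80 * NSEC_PER_USEC = 80,000ns >= 64,000ns,
 * so the preferred pset is expected to stay busy past the deferral window and an immediate
 * (or idle) IPI is sent instead of a deferred one.
 */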
6286 
6287 
6288 /*
6289  * sched_edge_qos_max_parallelism()
6290  */
6291 uint32_t
6292 sched_edge_qos_max_parallelism(int qos, uint64_t options)
6293 {
6294 	cluster_type_t low_core_type = CLUSTER_TYPE_E;
6295 	cluster_type_t high_core_type = CLUSTER_TYPE_P;
6296 
6297 	if (options & QOS_PARALLELISM_REALTIME) {
6298 		/* For realtime threads on AMP, limit the width to just
6299 		 * the P-cores since RT threads are not spilled/rebalanced
6300 		 * across heterogeneous clusters.
6301 		 */
6302 		uint32_t high_cpu_count = ml_get_cpu_number_type(high_core_type, false, false);
6303 		uint32_t high_cluster_count = ml_get_cluster_number_type(high_core_type);
6304 		return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? high_cluster_count : high_cpu_count;
6305 	}
6306 
6307 	/*
6308 	 * The Edge scheduler supports per-QoS recommendations for thread groups.
6309 	 * This enables lower QoS buckets (such as UT) to be scheduled on all
6310 	 * CPUs on the system.
6311 	 *
6312 	 * The only restriction is for BG/Maintenance QoS classes for which the
6313 	 * performance controller would never recommend execution on the P-cores.
6314 	 * If that policy changes in the future, this value should be changed.
6315 	 */
6316 	switch (qos) {
6317 	case THREAD_QOS_BACKGROUND:
6318 	case THREAD_QOS_MAINTENANCE:;
6319 		uint32_t low_cpu_count = ml_get_cpu_number_type(low_core_type, false, false);
6320 		uint32_t low_cluster_count = ml_get_cluster_number_type(low_core_type);
6321 		return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? low_cluster_count : low_cpu_count;
6322 	default:;
6323 		uint32_t total_cpus = ml_get_cpu_count();
6324 		uint32_t total_clusters = ml_get_cluster_count();
6325 		return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? total_clusters : total_cpus;
6326 	}
6327 }
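/*
 * Worked example (hypothetical topology: one E-cluster with 4 cores, two P-clusters with
 * 4 cores each):
 *
 *	QOS_PARALLELISM_REALTIME              -> 8 cores  (2 with QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE)
 *	THREAD_QOS_BACKGROUND / _MAINTENANCE  -> 4 cores  (1 with QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE)
 *	any other QoS                         -> 12 cores (3 with QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE)
 */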
6328 
6329 
6330 #endif /* CONFIG_SCHED_EDGE */
6331 
6332 #endif /* CONFIG_SCHED_CLUTCH */
6333