/*
 * Copyright (c) 2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <mach/mach_types.h>
#include <mach/machine.h>
#include <machine/machine_routines.h>
#include <machine/sched_param.h>
#include <machine/machine_cpu.h>
#include <kern/kern_types.h>
#include <kern/debug.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/sched_rt.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <machine/atomic.h>
#include <sys/kdebug.h>
#include <kern/sched_amp_common.h>
#include <stdatomic.h>

#if __AMP__

/* Configuration shared with the Edge scheduler */

/*
 * We see performance gains from sending immediate IPIs to P-cores to run
 * P-eligible threads, and fewer P-to-E migrations when deferred IPIs are
 * used for spill.
 */
int sched_amp_spill_deferred_ipi = 1;
int sched_amp_pcores_preempt_immediate_ipi = 1;

#if !CONFIG_SCHED_EDGE

/* Exported globals */
processor_set_t ecore_set = NULL;
processor_set_t pcore_set = NULL;

/*
 * sched_amp_init()
 *
 * Initialize the pcore_set and ecore_set globals which describe the
 * P/E processor sets.
 */
void
sched_amp_init(void)
{
	sched_timeshare_init();
}

#define PSET_LOAD_NUMERATOR_SHIFT   16
#define PSET_LOAD_FRACTIONAL_SHIFT   4

inline int
sched_amp_get_pset_load_average(processor_set_t pset, __unused sched_bucket_t sched_bucket)
{
	return (int)pset->load_average >> (PSET_LOAD_NUMERATOR_SHIFT - PSET_LOAD_FRACTIONAL_SHIFT);
}
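
/*
 * Note on the fixed-point format (illustrative only): pset->load_average
 * carries the thread count with PSET_LOAD_NUMERATOR_SHIFT (16) fractional
 * bits, while callers of sched_amp_get_pset_load_average() see a value with
 * PSET_LOAD_FRACTIONAL_SHIFT (4) fractional bits, i.e. units of 1/16 of a
 * thread.  For example, assuming the average has converged on a steady load
 * of 3 running/runnable threads:
 *     load_average            = 3 << 16 = 196608
 *     get_pset_load_average() = 196608 >> (16 - 4) = 48 = 3.0 * 16
 */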

void
sched_amp_update_pset_load_average(processor_set_t pset, __unused uint64_t curtime)
{
	int non_rt_load = pset->pset_runq.count;
	int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + non_rt_load + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT);
	int new_load_average = ((int)pset->load_average + load) >> 1;

	pset->load_average = new_load_average;
#if (DEVELOPMENT || DEBUG)
	if (pset->pset_cluster_type == PSET_AMP_P) {
		KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_LOAD_AVERAGE) | DBG_FUNC_NONE, sched_amp_get_pset_load_average(pset, 0), (bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset)));
	}
#endif
}
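
/*
 * Sketch of the averaging above (values assumed for illustration): each
 * update halves the previous average and adds half of the current sample,
 * i.e. an exponential moving average with a weight of 1/2.  Starting from
 * load_average == 0, a constant sample of 2 threads (2 << 16 == 131072)
 * converges as 65536, 98304, 114688, ... toward 131072.
 */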

/* Spill threshold load average is ncpus in pset + (sched_amp_spill_count / (1 << PSET_LOAD_FRACTIONAL_SHIFT)) */
int sched_amp_spill_count = 3;
int sched_amp_idle_steal = 1;
int sched_amp_spill_steal = 1;

/*
 * sched_perfcontrol_inherit_recommendation_from_tg changes the AMP
 * scheduling policy away from the default and allows the policy to be
 * modified at run-time.
 *
 * Once modified from the default, the policy toggles between "follow
 * thread group" and "restrict to E".
 */

_Atomic sched_perfctl_class_policy_t sched_perfctl_policy_util = SCHED_PERFCTL_POLICY_DEFAULT;
_Atomic sched_perfctl_class_policy_t sched_perfctl_policy_bg = SCHED_PERFCTL_POLICY_DEFAULT;

/*
 * sched_amp_spill_threshold()
 *
 * Routine to calculate the spill threshold, which decides whether the cluster should spill.
 */
int
sched_amp_spill_threshold(processor_set_t pset)
{
	int recommended_processor_count = bit_count(pset->recommended_bitmask & pset->cpu_bitmask);

	return (recommended_processor_count << PSET_LOAD_FRACTIONAL_SHIFT) + sched_amp_spill_count;
}
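
/*
 * Worked example (hypothetical values, for illustration only): with 4
 * recommended P-cores and the default sched_amp_spill_count of 3, the
 * threshold is (4 << 4) + 3 = 67.  Compared against the Q4 value returned
 * by sched_amp_get_pset_load_average(), spilling becomes eligible once the
 * average running + runnable count exceeds roughly 4 + 3/16 threads.
 */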

/*
 * pset_signal_spill()
 *
 * Routine to signal a running/idle CPU to cause a spill onto that CPU.
 * Called with pset locked, returns unlocked
 */
void
pset_signal_spill(processor_set_t pset, int spilled_thread_priority)
{
	processor_t processor;
	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;

	uint64_t idle_map = pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE];
	for (int cpuid = lsb_first(idle_map); cpuid >= 0; cpuid = lsb_next(idle_map, cpuid)) {
		processor = processor_array[cpuid];
		if (bit_set_if_clear(pset->pending_spill_cpu_mask, processor->cpu_id)) {
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_SIGNAL_SPILL) | DBG_FUNC_NONE, processor->cpu_id, 0, 0, 0);

			processor->deadline = UINT64_MAX;

			if (processor == current_processor()) {
				pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
				if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
					KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
					    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 6);
				}
			} else {
				ipi_type = sched_ipi_action(processor, NULL, SCHED_IPI_EVENT_SPILL);
			}
			pset_unlock(pset);
			sched_ipi_perform(processor, ipi_type);
			return;
		}
	}

	processor_t ast_processor = NULL;
	ast_t preempt = AST_NONE;
	uint64_t running_map = pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_RUNNING];
	for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
		processor = processor_array[cpuid];
		if (processor->current_recommended_pset_type == PSET_AMP_P) {
			/* Already running a spilled P-core recommended thread */
			continue;
		}
		if (bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
			/* Already received a spill signal */
			continue;
		}
		if (processor->current_pri >= spilled_thread_priority) {
			/* Already running a higher or equal priority thread */
			continue;
		}

		/* Found a suitable processor */
		bit_set(pset->pending_spill_cpu_mask, processor->cpu_id);
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_SIGNAL_SPILL) | DBG_FUNC_NONE, processor->cpu_id, 1, 0, 0);
		if (processor == current_processor()) {
			preempt = AST_PREEMPT;
		}
		ipi_type = sched_ipi_action(processor, NULL, SCHED_IPI_EVENT_SPILL);
		if (ipi_type != SCHED_IPI_NONE) {
			ast_processor = processor;
		}
		break;
	}

	pset_unlock(pset);
	sched_ipi_perform(ast_processor, ipi_type);

	if (preempt != AST_NONE) {
		ast_t new_preempt = update_pending_nonurgent_preemption(processor, preempt);
		ast_on(new_preempt);
	}
}

/*
 * pset_should_accept_spilled_thread()
 *
 * Routine to decide if pset should accept spilled threads.
 * This function must be safe to call (to use as a hint) without holding the pset lock.
 */
bool
pset_should_accept_spilled_thread(processor_set_t pset, int spilled_thread_priority)
{
	if (!pset) {
		return false;
	}

	if ((pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
		return true;
	}

	uint64_t cpu_map = (pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_RUNNING]);

	for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) {
		processor_t processor = processor_array[cpuid];

		if (processor->current_recommended_pset_type == PSET_AMP_P) {
			/* This processor is already running a spilled thread */
			continue;
		}

		if (processor->current_pri < spilled_thread_priority) {
			return true;
		}
	}

	return false;
}

/*
 * should_spill_to_ecores()
 *
 * Spill policy is implemented here
 */
bool
should_spill_to_ecores(processor_set_t nset, thread_t thread)
{
	if (nset->pset_cluster_type == PSET_AMP_E) {
		/* Not relevant if ecores already preferred */
		return false;
	}

	if (!pset_is_recommended(ecore_set)) {
		/* E cores must be recommended */
		return false;
	}

	if (thread->th_bound_cluster_id == pcore_set->pset_id) {
		/* Thread bound to the P-cluster */
		return false;
	}

	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		/* Never spill realtime threads */
		return false;
	}

	if ((nset->recommended_bitmask & nset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
		/* Don't spill if the P-cluster still has idle cores */
		return false;
	}

	if ((sched_amp_get_pset_load_average(nset, 0) >= sched_amp_spill_threshold(nset)) &&  /* The P-cores are already loaded */
	    pset_should_accept_spilled_thread(ecore_set, thread->sched_pri)) { /* An E-core is idle or running a lower priority thread */
		return true;
	}

	return false;
}

/*
 * sched_amp_check_spill()
 *
 * Routine to check if the thread should be spilled and signal the pset if needed.
 */
void
sched_amp_check_spill(processor_set_t pset, thread_t thread)
{
	/* pset is unlocked */

	/* Bound threads don't call this function */
	assert(thread->bound_processor == PROCESSOR_NULL);

	if (should_spill_to_ecores(pset, thread)) {
		pset_lock(ecore_set);

		pset_signal_spill(ecore_set, thread->sched_pri);
		/* returns with ecore_set unlocked */
	}
}
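
/*
 * Illustrative flow (hypothetical caller, not part of this file): a P-pset
 * enqueue path would call sched_amp_check_spill(pcore_set, thread) with no
 * pset locks held; if the spill policy passes, the E-pset is locked here,
 * pset_signal_spill() picks an idle or lower-priority E-core, and the lock
 * is dropped before the IPI is actually sent.
 */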

/*
 * sched_amp_steal_threshold()
 *
 * Routine to calculate the steal threshold
 */
int
sched_amp_steal_threshold(processor_set_t pset, bool spill_pending)
{
	int recommended_processor_count = bit_count(pset->recommended_bitmask & pset->cpu_bitmask);

	return (recommended_processor_count << PSET_LOAD_FRACTIONAL_SHIFT) + (spill_pending ? sched_amp_spill_steal : sched_amp_idle_steal);
}

/*
 * sched_amp_steal_thread_enabled()
 *
 * Stealing is enabled only for the E-pset, and only while a P-pset exists
 * and has processors online.
 */
bool
sched_amp_steal_thread_enabled(processor_set_t pset)
{
	return (pset->pset_cluster_type == PSET_AMP_E) && (pcore_set != NULL) && (pcore_set->online_processor_count > 0);
}
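
/*
 * Worked example (values assumed for illustration): for a pset with 4
 * recommended processors, the steal threshold is (4 << 4) + 1 = 65 with the
 * default tunables (both sched_amp_idle_steal and sched_amp_spill_steal are
 * 1), i.e. roughly 4 + 1/16 threads in the Q4 load-average units.
 */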

/*
 * sched_amp_balance()
 *
 * Invoked with pset locked, returns with pset unlocked
 */
bool
sched_amp_balance(processor_t cprocessor, processor_set_t cpset)
{
	assert(cprocessor == current_processor());

	pset_unlock(cpset);

	if (!ecore_set || cpset->pset_cluster_type == PSET_AMP_E || !cprocessor->is_recommended) {
		return false;
	}

	/*
	 * cprocessor is an idle, recommended P core processor.
	 * Look for P-eligible threads that have spilled to an E core
	 * and coax them to come back.
	 */
	processor_set_t pset = ecore_set;

	pset_lock(pset);

	processor_t eprocessor;
	uint64_t ast_processor_map = 0;

	sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
	uint64_t running_map = pset->cpu_state_map[PROCESSOR_RUNNING];
	for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
		eprocessor = processor_array[cpuid];
		if ((eprocessor->current_pri < BASEPRI_RTQUEUES) &&
		    (eprocessor->current_recommended_pset_type == PSET_AMP_P)) {
			ipi_type[eprocessor->cpu_id] = sched_ipi_action(eprocessor, NULL, SCHED_IPI_EVENT_REBALANCE);
			if (ipi_type[eprocessor->cpu_id] != SCHED_IPI_NONE) {
				bit_set(ast_processor_map, eprocessor->cpu_id);
				assert(eprocessor != cprocessor);
			}
		}
	}

	pset_unlock(pset);

	for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
		processor_t ast_processor = processor_array[cpuid];
		sched_ipi_perform(ast_processor, ipi_type[cpuid]);
	}

	/* Core should light-weight idle using WFE if it just sent out rebalance IPIs */
	return ast_processor_map != 0;
}

/*
 * Helper function for sched_amp_thread_group_recommendation_change().
 * Find all the cores in the pset running threads from the thread_group tg
 * and send them a rebalance interrupt.
 */
void
sched_amp_bounce_thread_group_from_ecores(processor_set_t pset, struct thread_group *tg)
{
	if (!pset) {
		return;
	}

	assert(pset->pset_cluster_type == PSET_AMP_E);
	uint64_t ast_processor_map = 0;
	sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};

	spl_t s = splsched();
	pset_lock(pset);

	uint64_t running_map = pset->cpu_state_map[PROCESSOR_RUNNING];
	for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
		processor_t eprocessor = processor_array[cpuid];
		if (eprocessor->current_thread_group == tg) {
			ipi_type[eprocessor->cpu_id] = sched_ipi_action(eprocessor, NULL, SCHED_IPI_EVENT_REBALANCE);
			if (ipi_type[eprocessor->cpu_id] != SCHED_IPI_NONE) {
				bit_set(ast_processor_map, eprocessor->cpu_id);
			} else if (eprocessor == current_processor()) {
				ast_on(AST_PREEMPT);
				bit_set(pset->pending_AST_PREEMPT_cpu_mask, eprocessor->cpu_id);
			}
		}
	}

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, tg, ast_processor_map, 0, 0);

	pset_unlock(pset);

	for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
		processor_t ast_processor = processor_array[cpuid];
		sched_ipi_perform(ast_processor, ipi_type[cpuid]);
	}

	splx(s);
}

/*
 * sched_amp_ipi_policy()
 */
sched_ipi_type_t
sched_amp_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
{
	processor_set_t pset = dst->processor_set;
	assert(dst != current_processor());

	boolean_t deferred_ipi_supported = false;
#if defined(CONFIG_SCHED_DEFERRED_AST)
	deferred_ipi_supported = true;
#endif /* CONFIG_SCHED_DEFERRED_AST */

	switch (event) {
	case SCHED_IPI_EVENT_SPILL:
		/* For spill events, use deferred IPIs if sched_amp_spill_deferred_ipi is set */
		if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
			return sched_ipi_deferred_policy(pset, dst, thread, event);
		}
		break;
	case SCHED_IPI_EVENT_PREEMPT:
		/*
		 * For preemption, the default policy is to use deferred IPIs
		 * for non-RT P-core preemption. Override that behavior if
		 * sched_amp_pcores_preempt_immediate_ipi is set.
		 */
		if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
			if (sched_amp_pcores_preempt_immediate_ipi && (pset == pcore_set)) {
				return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
			}
		}
		break;
	default:
		break;
	}
	/* Default back to the global policy for all other scenarios */
	return sched_ipi_policy(dst, thread, dst_idle, event);
}
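
/*
 * Example decisions with the default tunables (illustrative only): a spill
 * signal takes the deferred-IPI path when CONFIG_SCHED_DEFERRED_AST is
 * configured, while a non-realtime preemption targeting a P-core is upgraded
 * to SCHED_IPI_IDLE or SCHED_IPI_IMMEDIATE because
 * sched_amp_pcores_preempt_immediate_ipi defaults to 1.  Everything else
 * falls through to the generic sched_ipi_policy().
 */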

/*
 * sched_amp_qos_max_parallelism()
 */
uint32_t
sched_amp_qos_max_parallelism(int qos, uint64_t options)
{
	uint32_t ecount = ecore_set ? ecore_set->cpu_set_count : 0;
	uint32_t pcount = pcore_set ? pcore_set->cpu_set_count : 0;

	/*
	 * The AMP scheduler does not support more than 1 of each type of cluster
	 * but the P-cluster is optional (e.g. watchOS)
	 */
	uint32_t ecluster_count = ecount ? 1 : 0;
	uint32_t pcluster_count = pcount ? 1 : 0;

	if (options & QOS_PARALLELISM_REALTIME) {
		/*
		 * For realtime threads on AMP, limit the width to just the
		 * P-cores since we do not spill/rebalance for RT threads.
		 */
		return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? pcluster_count : pcount;
	}

	/*
	 * The default AMP scheduler policy is to run utility and bg
	 * threads on E-cores only.  Run-time policy adjustment unlocks
	 * the ability of utility and bg threads to be scheduled based on
	 * run-time conditions.
	 */
	switch (qos) {
	case THREAD_QOS_UTILITY:
		if (os_atomic_load(&sched_perfctl_policy_util, relaxed) == SCHED_PERFCTL_POLICY_DEFAULT) {
			return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? ecluster_count : ecount;
		} else {
			return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? (ecluster_count + pcluster_count) : (ecount + pcount);
		}
	case THREAD_QOS_BACKGROUND:
	case THREAD_QOS_MAINTENANCE:
		if (os_atomic_load(&sched_perfctl_policy_bg, relaxed) == SCHED_PERFCTL_POLICY_DEFAULT) {
			return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? ecluster_count : ecount;
		} else {
			return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? (ecluster_count + pcluster_count) : (ecount + pcount);
		}
	default:
		return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? (ecluster_count + pcluster_count) : (ecount + pcount);
	}
}
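
/*
 * Illustrative widths on a hypothetical 4E + 2P system with the default
 * perfctl policies: realtime requests report 2 (the P-cores), utility/bg
 * report 4 (the E-cores), and everything else reports 6.  With
 * QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE the same requests report the
 * cluster counts instead: 1, 1, and 2 respectively.
 */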

pset_node_t
sched_amp_choose_node(thread_t thread)
{
	pset_cluster_type_t pset_cluster_type = (recommended_pset_type(thread) == PSET_AMP_P) ? PSET_AMP_P : PSET_AMP_E;
	pset_node_t node = pset_node_for_pset_cluster_type(pset_cluster_type);
	return ((node != NULL) && (node->pset_map != 0)) ? node : &pset_node0;
}
#endif /* !CONFIG_SCHED_EDGE */
#endif /* __AMP__ */