/*
 * Copyright (c) 2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <mach/mach_types.h>
#include <mach/machine.h>
#include <machine/machine_routines.h>
#include <machine/sched_param.h>
#include <machine/machine_cpu.h>
#include <kern/kern_types.h>
#include <kern/debug.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <machine/atomic.h>
#include <sys/kdebug.h>
#include <kern/sched_amp_common.h>
#include <stdatomic.h>

#if __AMP__

/* Configuration shared with the Edge scheduler */

/*
 * We see performance gains from sending immediate IPIs to P-cores to run
 * P-eligible threads, and fewer P-to-E migrations from using deferred IPIs
 * for spill.
 */
int sched_amp_spill_deferred_ipi = 1;
int sched_amp_pcores_preempt_immediate_ipi = 1;

#if !CONFIG_SCHED_EDGE

/* Exported globals */
processor_set_t ecore_set = NULL;
processor_set_t pcore_set = NULL;

/*
 * sched_amp_init()
 *
 * AMP scheduler initialization. The pcore_set and ecore_set globals that
 * describe the P/E processor sets are populated separately as the psets
 * come up.
 */
void
sched_amp_init(void)
{
	sched_timeshare_init();
}

/* Spill threshold load average is ncpus in pset + (sched_amp_spill_count / (1 << PSET_LOAD_FRACTIONAL_SHIFT)) */
int sched_amp_spill_count = 3;
int sched_amp_idle_steal = 1;
int sched_amp_spill_steal = 1;

/*
 * sched_perfcontrol_inherit_recommendation_from_tg changes the AMP
 * scheduling policy away from the default and allows the policy to be
 * modified at run-time.
 *
 * Once modified from the default, the policy toggles between "follow
 * thread group" and "restrict to E".
 */

_Atomic sched_perfctl_class_policy_t sched_perfctl_policy_util = SCHED_PERFCTL_POLICY_DEFAULT;
_Atomic sched_perfctl_class_policy_t sched_perfctl_policy_bg = SCHED_PERFCTL_POLICY_DEFAULT;

/*
 * sched_amp_spill_threshold()
 *
 * Routine to calculate the spill threshold which decides whether a cluster should spill.
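 *
 * The threshold is a fixed-point value: the number of recommended CPUs in the
 * pset scaled by PSET_LOAD_FRACTIONAL_SHIFT, plus the small sched_amp_spill_count
 * margin. As an illustrative (hypothetical) example, with 4 recommended P-cores
 * and the default sched_amp_spill_count of 3, the threshold is
 * (4 << PSET_LOAD_FRACTIONAL_SHIFT) + 3, i.e. roughly one runnable thread's
 * worth of load per recommended CPU before spilling is considered.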
 */
int
sched_amp_spill_threshold(processor_set_t pset)
{
	int recommended_processor_count = bit_count(pset->recommended_bitmask & pset->cpu_bitmask);

	return (recommended_processor_count << PSET_LOAD_FRACTIONAL_SHIFT) + sched_amp_spill_count;
}

/*
 * pset_signal_spill()
 *
 * Routine to signal a running/idle CPU to cause a spill onto that CPU.
 * Called with pset locked, returns unlocked
 */
void
pset_signal_spill(processor_set_t pset, int spilled_thread_priority)
{
	processor_t processor;
	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;

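	/* First preference: an idle recommended CPU that can take the spilled thread immediately. */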
	uint64_t idle_map = pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE];
	for (int cpuid = lsb_first(idle_map); cpuid >= 0; cpuid = lsb_next(idle_map, cpuid)) {
		processor = processor_array[cpuid];
		if (bit_set_if_clear(pset->pending_spill_cpu_mask, processor->cpu_id)) {
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_SIGNAL_SPILL) | DBG_FUNC_NONE, processor->cpu_id, 0, 0, 0);

			processor->deadline = UINT64_MAX;

			if (processor == current_processor()) {
				pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
				if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
					KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
					    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 6);
				}
			} else {
				ipi_type = sched_ipi_action(processor, NULL, SCHED_IPI_EVENT_SPILL);
			}
			pset_unlock(pset);
			sched_ipi_perform(processor, ipi_type);
			return;
		}
	}

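	/*
	 * No idle CPU accepted the spill; fall back to a running recommended CPU
	 * that is below the spilled thread's priority and hasn't already been
	 * signalled.
	 */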
	processor_t ast_processor = NULL;
	ast_t preempt = AST_NONE;
	uint64_t running_map = pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_RUNNING];
	for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
		processor = processor_array[cpuid];
		if (processor->current_recommended_pset_type == PSET_AMP_P) {
			/* Already running a spilled P-core recommended thread */
			continue;
		}
		if (bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
			/* Already received a spill signal */
			continue;
		}
		if (processor->current_pri >= spilled_thread_priority) {
			/* Already running a higher or equal priority thread */
			continue;
		}

		/* Found a suitable processor */
		bit_set(pset->pending_spill_cpu_mask, processor->cpu_id);
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_SIGNAL_SPILL) | DBG_FUNC_NONE, processor->cpu_id, 1, 0, 0);
		if (processor == current_processor()) {
			preempt = AST_PREEMPT;
		}
		ipi_type = sched_ipi_action(processor, NULL, SCHED_IPI_EVENT_SPILL);
		if (ipi_type != SCHED_IPI_NONE) {
			ast_processor = processor;
		}
		break;
	}

	pset_unlock(pset);
	sched_ipi_perform(ast_processor, ipi_type);

	if (preempt != AST_NONE) {
		ast_t new_preempt = update_pending_nonurgent_preemption(processor, preempt);
		ast_on(new_preempt);
	}
}

/*
 * pset_should_accept_spilled_thread()
 *
 * Routine to decide if pset should accept spilled threads.
 * This function must be safe to call (to use as a hint) without holding the pset lock.
 */
bool
pset_should_accept_spilled_thread(processor_set_t pset, int spilled_thread_priority)
{
	if (!pset) {
		return false;
	}

	if ((pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
		return true;
	}

	uint64_t cpu_map = (pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_RUNNING]);

	for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) {
		processor_t processor = processor_array[cpuid];

		if (processor->current_recommended_pset_type == PSET_AMP_P) {
			/* This processor is already running a spilled thread */
			continue;
		}

		if (processor->current_pri < spilled_thread_priority) {
			return true;
		}
	}

	return false;
}

/*
 * should_spill_to_ecores()
 *
 * The spill policy is implemented here: decide whether a P-eligible thread
 * should spill from the P-cluster to the E-cluster.
 */
bool
should_spill_to_ecores(processor_set_t nset, thread_t thread)
{
	if (nset->pset_cluster_type == PSET_AMP_E) {
		/* Not relevant if ecores already preferred */
		return false;
	}

	if (!pset_is_recommended(ecore_set)) {
		/* E cores must be recommended */
		return false;
	}

	if (thread->th_bound_cluster_id == pcore_set->pset_id) {
		/* Thread bound to the P-cluster */
		return false;
	}

	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		/* Never spill realtime threads */
		return false;
	}

	if ((nset->recommended_bitmask & nset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
		/* Don't spill while this (P) cluster still has idle recommended cores */
		return false;
	}

	if ((sched_get_pset_load_average(nset, 0) >= sched_amp_spill_threshold(nset)) &&  /* P-cores are already loaded beyond the spill threshold */
	    pset_should_accept_spilled_thread(ecore_set, thread->sched_pri)) { /* and an E-core is idle or running something lower priority */
		return true;
	}

	return false;
}

/*
 * sched_amp_check_spill()
 *
 * Routine to check if the thread should be spilled and signal the pset if needed.
 */
void
sched_amp_check_spill(processor_set_t pset, thread_t thread)
{
	/* pset is unlocked */

	/* Bound threads don't call this function */
	assert(thread->bound_processor == PROCESSOR_NULL);

	if (should_spill_to_ecores(pset, thread)) {
		pset_lock(ecore_set);

		pset_signal_spill(ecore_set, thread->sched_pri);
		/* returns with ecore_set unlocked */
	}
}

/*
 * sched_amp_steal_threshold()
 *
 * Routine to calculate the steal threshold
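 *
 * Like the spill threshold, this is a fixed-point figure: the number of
 * recommended CPUs in the pset scaled by PSET_LOAD_FRACTIONAL_SHIFT, plus a
 * small margin (sched_amp_spill_steal or sched_amp_idle_steal) depending on
 * whether a spill is already pending.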
 */
int
sched_amp_steal_threshold(processor_set_t pset, bool spill_pending)
{
	int recommended_processor_count = bit_count(pset->recommended_bitmask & pset->cpu_bitmask);

	return (recommended_processor_count << PSET_LOAD_FRACTIONAL_SHIFT) + (spill_pending ? sched_amp_spill_steal : sched_amp_idle_steal);
}

/*
 * sched_amp_steal_thread_enabled()
 *
 * Thread stealing is enabled only for the E-pset, and only while a P-cluster
 * with online processors exists to steal from.
 */
bool
sched_amp_steal_thread_enabled(processor_set_t pset)
{
	return (pset->pset_cluster_type == PSET_AMP_E) && (pcore_set != NULL) && (pcore_set->online_processor_count > 0);
}

/*
 * sched_amp_balance()
 *
 * Invoked with pset locked, returns with pset unlocked
 */
bool
sched_amp_balance(processor_t cprocessor, processor_set_t cpset)
{
	assert(cprocessor == current_processor());

	pset_unlock(cpset);

	if (!ecore_set || cpset->pset_cluster_type == PSET_AMP_E || !cprocessor->is_recommended) {
		return false;
	}

	/*
	 * cprocessor is an idle, recommended P core processor.
	 * Look for P-eligible threads that have spilled to an E core
	 * and coax them to come back.
	 */
	processor_set_t pset = ecore_set;

	pset_lock(pset);

	processor_t eprocessor;
	uint64_t ast_processor_map = 0;

	sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
	uint64_t running_map = pset->cpu_state_map[PROCESSOR_RUNNING];
	for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
		eprocessor = processor_array[cpuid];
		if ((eprocessor->current_pri < BASEPRI_RTQUEUES) &&
		    (eprocessor->current_recommended_pset_type == PSET_AMP_P)) {
			ipi_type[eprocessor->cpu_id] = sched_ipi_action(eprocessor, NULL, SCHED_IPI_EVENT_REBALANCE);
			if (ipi_type[eprocessor->cpu_id] != SCHED_IPI_NONE) {
				bit_set(ast_processor_map, eprocessor->cpu_id);
				assert(eprocessor != cprocessor);
			}
		}
	}

	pset_unlock(pset);

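	/* Send the rebalance IPIs only after dropping the E-pset lock. */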
	for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
		processor_t ast_processor = processor_array[cpuid];
		sched_ipi_perform(ast_processor, ipi_type[cpuid]);
	}

	/* Core should light-weight idle using WFE if it just sent out rebalance IPIs */
	return ast_processor_map != 0;
}

/*
 * Helper function for sched_amp_thread_group_recommendation_change()
 * Find all the cores in the pset running threads from the thread_group tg
 * and send them a rebalance interrupt.
 */
void
sched_amp_bounce_thread_group_from_ecores(processor_set_t pset, struct thread_group *tg)
{
	if (!pset) {
		return;
	}

	assert(pset->pset_cluster_type == PSET_AMP_E);
	uint64_t ast_processor_map = 0;
	sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};

	spl_t s = splsched();
	pset_lock(pset);

	uint64_t running_map = pset->cpu_state_map[PROCESSOR_RUNNING];
	for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
		processor_t eprocessor = processor_array[cpuid];
		if (eprocessor->current_thread_group == tg) {
			ipi_type[eprocessor->cpu_id] = sched_ipi_action(eprocessor, NULL, SCHED_IPI_EVENT_REBALANCE);
			if (ipi_type[eprocessor->cpu_id] != SCHED_IPI_NONE) {
				bit_set(ast_processor_map, eprocessor->cpu_id);
			} else if (eprocessor == current_processor()) {
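				/* The local core cannot IPI itself; request a preemption check directly via AST. */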
				ast_on(AST_PREEMPT);
				bit_set(pset->pending_AST_PREEMPT_cpu_mask, eprocessor->cpu_id);
			}
		}
	}

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, tg, ast_processor_map, 0, 0);

	pset_unlock(pset);

	for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
		processor_t ast_processor = processor_array[cpuid];
		sched_ipi_perform(ast_processor, ipi_type[cpuid]);
	}

	splx(s);
}

/*
 * sched_amp_ipi_policy()
 */
sched_ipi_type_t
sched_amp_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
{
	processor_set_t pset = dst->processor_set;
	assert(dst != current_processor());

	boolean_t deferred_ipi_supported = false;
#if defined(CONFIG_SCHED_DEFERRED_AST)
	deferred_ipi_supported = true;
#endif /* CONFIG_SCHED_DEFERRED_AST */

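	/*
	 * Spill and non-RT P-core preemption events get AMP-specific treatment
	 * below; everything else falls through to the global IPI policy.
	 */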
	switch (event) {
	case SCHED_IPI_EVENT_SPILL:
		/* For the Spill event, use deferred IPIs if sched_amp_spill_deferred_ipi is set */
		if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
			return sched_ipi_deferred_policy(pset, dst, thread, event);
		}
		break;
	case SCHED_IPI_EVENT_PREEMPT:
		/* For preemption, the default policy is to use deferred IPIs
		 * for Non-RT P-core preemption. Override that behavior if
		 * sched_amp_pcores_preempt_immediate_ipi is set
		 */
		if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
			if (sched_amp_pcores_preempt_immediate_ipi && (pset == pcore_set)) {
				return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
			}
		}
		break;
	default:
		break;
	}
	/* Default back to the global policy for all other scenarios */
	return sched_ipi_policy(dst, thread, dst_idle, event);
}

/*
 * sched_amp_qos_max_parallelism()
 */
uint32_t
sched_amp_qos_max_parallelism(int qos, uint64_t options)
{
	uint32_t ecount = ecore_set ? ecore_set->cpu_set_count : 0;
	uint32_t pcount = pcore_set ? pcore_set->cpu_set_count : 0;

	/*
	 * The AMP scheduler does not support more than 1 of each type of cluster
	 * but the P-cluster is optional (e.g. watchOS)
	 */
	uint32_t ecluster_count = ecount ? 1 : 0;
	uint32_t pcluster_count = pcount ? 1 : 0;

	if (options & QOS_PARALLELISM_REALTIME) {
		/* For realtime threads on AMP, limit the width to just the
		 * P-cores since we do not spill/rebalance RT threads.
		 */
		return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? pcluster_count : pcount;
	}

	/*
	 * The default AMP scheduler policy is to run utility and bg
	 * threads on E-Cores only.  Run-time policy adjustment unlocks the
	 * ability of utility and bg threads to be scheduled based on
	 * run-time conditions.
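	 *
	 * As an illustrative (hypothetical) example, on a 4E+4P system with the
	 * default policy, THREAD_QOS_UTILITY reports a width of 4 (E-cores only),
	 * while QoS classes handled by the default case below report 8; with
	 * QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE those same cases report 1 and 2
	 * clusters respectively.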
	 */
	switch (qos) {
	case THREAD_QOS_UTILITY:
		if (os_atomic_load(&sched_perfctl_policy_util, relaxed) == SCHED_PERFCTL_POLICY_DEFAULT) {
			return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? ecluster_count : ecount;
		} else {
			return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? (ecluster_count + pcluster_count) : (ecount + pcount);
		}
	case THREAD_QOS_BACKGROUND:
	case THREAD_QOS_MAINTENANCE:
		if (os_atomic_load(&sched_perfctl_policy_bg, relaxed) == SCHED_PERFCTL_POLICY_DEFAULT) {
			return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? ecluster_count : ecount;
		} else {
			return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? (ecluster_count + pcluster_count) : (ecount + pcount);
		}
	default:
		return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? (ecluster_count + pcluster_count) : (ecount + pcount);
	}
}

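/*
 * sched_amp_choose_node()
 *
 * Pick the P- or E-cluster node based on the thread's recommended pset type,
 * falling back to pset_node0 if the preferred node has no psets.
 */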
pset_node_t
sched_amp_choose_node(thread_t thread)
{
	pset_node_t node = (recommended_pset_type(thread) == PSET_AMP_P) ? pcore_node : ecore_node;
	return ((node != NULL) && (node->pset_map != 0)) ? node : &pset_node0;
}
#endif /* !CONFIG_SCHED_EDGE */
#endif /* __AMP__ */