/*
 * Copyright (c) 2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <mach/mach_types.h>
#include <mach/machine.h>
#include <machine/machine_routines.h>
#include <machine/sched_param.h>
#include <machine/machine_cpu.h>
#include <kern/kern_types.h>
#include <kern/debug.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <machine/atomic.h>
#include <sys/kdebug.h>
#include <kern/sched_amp_common.h>
#include <stdatomic.h>

#if __AMP__

/* Exported globals */
processor_set_t ecore_set = NULL;
processor_set_t pcore_set = NULL;

/*
 * sched_amp_init()
 *
 * AMP scheduler initialization. The ecore_set and pcore_set globals
 * above describe the E/P processor sets and are populated elsewhere
 * during pset initialization; this routine only sets up the underlying
 * timeshare scheduler state.
 */
void
sched_amp_init(void)
{
	sched_timeshare_init();
}

/* Spill threshold load average is ncpus in pset + (sched_amp_spill_count / (1 << PSET_LOAD_FRACTIONAL_SHIFT)) */
int sched_amp_spill_count = 3;
int sched_amp_idle_steal = 1;
int sched_amp_spill_steal = 1;

/*
 * We see performance gains from sending immediate IPIs to P-cores to run
 * P-eligible threads, and fewer P-to-E migrations when using deferred IPIs
 * for spill.
 */
int sched_amp_spill_deferred_ipi = 1;
int sched_amp_pcores_preempt_immediate_ipi = 1;

/*
 * sched_perfcontrol_inherit_recommendation_from_tg changes the AMP
 * scheduling policy away from the default and allows the policy to be
 * modified at run-time.
 *
 * Once modified from the default, the policy toggles between "follow
 * thread group" and "restrict to E".
 */

_Atomic sched_perfctl_class_policy_t sched_perfctl_policy_util = SCHED_PERFCTL_POLICY_DEFAULT;
_Atomic sched_perfctl_class_policy_t sched_perfctl_policy_bg = SCHED_PERFCTL_POLICY_DEFAULT;
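
/*
 * These policies are read with os_atomic_load(..., relaxed) by
 * sched_amp_qos_max_parallelism() below when sizing the parallelism
 * width for UTILITY and BG/MAINTENANCE threads.
 */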

/*
 * sched_amp_spill_threshold()
 *
 * Routine to calculate the spill threshold, which decides if a cluster should spill.
 */
int
sched_amp_spill_threshold(processor_set_t pset)
{
	int recommended_processor_count = bit_count(pset->recommended_bitmask & pset->cpu_bitmask);

	return (recommended_processor_count << PSET_LOAD_FRACTIONAL_SHIFT) + sched_amp_spill_count;
}
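
/*
 * Worked example (counts hypothetical): with 4 recommended CPUs in the
 * pset, the threshold is (4 << PSET_LOAD_FRACTIONAL_SHIFT) +
 * sched_amp_spill_count, i.e. a fixed-point load average just above
 * one runnable thread per recommended CPU.
 */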

/*
 * pset_signal_spill()
 *
 * Routine to signal a running/idle CPU to cause a spill onto that CPU.
 * Called with the pset locked; returns with it unlocked.
 */
void
pset_signal_spill(processor_set_t pset, int spilled_thread_priority)
{
	processor_t processor;
	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;

	uint64_t idle_map = pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE];
	for (int cpuid = lsb_first(idle_map); cpuid >= 0; cpuid = lsb_next(idle_map, cpuid)) {
		processor = processor_array[cpuid];
		if (bit_set_if_clear(pset->pending_spill_cpu_mask, processor->cpu_id)) {
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_SIGNAL_SPILL) | DBG_FUNC_NONE, processor->cpu_id, 0, 0, 0);

			processor->deadline = UINT64_MAX;

			if (processor == current_processor()) {
				pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
				if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
					KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
					    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 6);
				}
			} else {
				ipi_type = sched_ipi_action(processor, NULL, SCHED_IPI_EVENT_SPILL);
			}
			pset_unlock(pset);
			sched_ipi_perform(processor, ipi_type);
			return;
		}
	}

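	/*
	 * No recommended idle CPU took the signal: fall back to poking a
	 * running CPU that is below the spilled thread's priority and is
	 * not already servicing a spill.
	 */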
	processor_t ast_processor = NULL;
	ast_t preempt = AST_NONE;
	uint64_t running_map = pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_RUNNING];
	for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
		processor = processor_array[cpuid];
		if (processor->current_recommended_pset_type == PSET_AMP_P) {
			/* Already running a spilled P-core recommended thread */
			continue;
		}
		if (bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
			/* Already received a spill signal */
			continue;
		}
		if (processor->current_pri >= spilled_thread_priority) {
			/* Already running a higher or equal priority thread */
			continue;
		}

		/* Found a suitable processor */
		bit_set(pset->pending_spill_cpu_mask, processor->cpu_id);
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_SIGNAL_SPILL) | DBG_FUNC_NONE, processor->cpu_id, 1, 0, 0);
		if (processor == current_processor()) {
			preempt = AST_PREEMPT;
		}
		ipi_type = sched_ipi_action(processor, NULL, SCHED_IPI_EVENT_SPILL);
		if (ipi_type != SCHED_IPI_NONE) {
			ast_processor = processor;
		}
		break;
	}

	pset_unlock(pset);
	sched_ipi_perform(ast_processor, ipi_type);

	if (preempt != AST_NONE) {
		ast_t new_preempt = update_pending_nonurgent_preemption(processor, preempt);
		ast_on(new_preempt);
	}
}

/*
 * pset_should_accept_spilled_thread()
 *
 * Routine to decide if pset should accept spilled threads.
 * This function must be safe to call (to use as a hint) without holding the pset lock.
 */
bool
pset_should_accept_spilled_thread(processor_set_t pset, int spilled_thread_priority)
{
	if (!pset) {
		return false;
	}

	if ((pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
		return true;
	}

	uint64_t cpu_map = (pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_RUNNING]);

	for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) {
		processor_t processor = processor_array[cpuid];

		if (processor->current_recommended_pset_type == PSET_AMP_P) {
			/* This processor is already running a spilled thread */
			continue;
		}

		if (processor->current_pri < spilled_thread_priority) {
			return true;
		}
	}

	return false;
}
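
/*
 * Since callers may not hold the pset lock, the bitmask and
 * per-processor fields read above are unsynchronized snapshots; the
 * return value is a hint only and may be stale by the time it is used.
 */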

/*
 * should_spill_to_ecores()
 *
 * Spill policy is implemented here.
 */
bool
should_spill_to_ecores(processor_set_t nset, thread_t thread)
{
	if (nset->pset_cluster_type == PSET_AMP_E) {
		/* Not relevant if ecores already preferred */
		return false;
	}

	if (!pset_is_recommended(ecore_set)) {
		/* E cores must be recommended */
		return false;
	}

	if (thread->th_bound_cluster_id == pcore_set->pset_id) {
		/* Thread bound to the P-cluster */
		return false;
	}

	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		/* Never spill realtime threads */
		return false;
	}

	if ((nset->recommended_bitmask & nset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
		/* Don't spill while the P-cluster still has idle cores */
		return false;
	}

	if ((sched_get_pset_load_average(nset, 0) >= sched_amp_spill_threshold(nset)) &&  /* There is already a load on P cores */
	    pset_should_accept_spilled_thread(ecore_set, thread->sched_pri)) { /* There are lower priority E cores */
		return true;
	}

	return false;
}
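
/*
 * Worked example (hypothetical 4P+4E system): a priority-31 timeshare
 * thread bound for a P-cluster with no idle cores and a load average at
 * or above sched_amp_spill_threshold() spills if some recommended E core
 * is idle or running below priority 31.
 */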

/*
 * sched_amp_check_spill()
 *
 * Routine to check if the thread should be spilled and signal the pset if needed.
 */
void
sched_amp_check_spill(processor_set_t pset, thread_t thread)
{
	/* pset is unlocked */

	/* Bound threads don't call this function */
	assert(thread->bound_processor == PROCESSOR_NULL);

	if (should_spill_to_ecores(pset, thread)) {
		pset_lock(ecore_set);

		pset_signal_spill(ecore_set, thread->sched_pri);
		/* returns with ecore_set unlocked */
	}
}

/*
 * sched_amp_steal_threshold()
 *
 * Routine to calculate the steal threshold.
 */
int
sched_amp_steal_threshold(processor_set_t pset, bool spill_pending)
{
	int recommended_processor_count = bit_count(pset->recommended_bitmask & pset->cpu_bitmask);

	return (recommended_processor_count << PSET_LOAD_FRACTIONAL_SHIFT) + (spill_pending ? sched_amp_spill_steal : sched_amp_idle_steal);
}
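
/*
 * Like sched_amp_spill_threshold() above, this is fixed-point arithmetic
 * in PSET_LOAD_FRACTIONAL_SHIFT units: the recommended CPU count plus a
 * small fractional bias (sched_amp_spill_steal with a spill pending,
 * sched_amp_idle_steal otherwise). With the default tunables (1 vs. 3),
 * the steal threshold sits slightly below the spill threshold.
 */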

/*
 * sched_amp_steal_thread_enabled()
 */
bool
sched_amp_steal_thread_enabled(processor_set_t pset)
{
	return (pset->pset_cluster_type == PSET_AMP_E) && (pcore_set != NULL) && (pcore_set->online_processor_count > 0);
}

/*
 * sched_amp_balance()
 *
 * Invoked with the pset locked; returns with the pset unlocked.
 */
bool
sched_amp_balance(processor_t cprocessor, processor_set_t cpset)
{
	assert(cprocessor == current_processor());

	pset_unlock(cpset);

	if (!ecore_set || cpset->pset_cluster_type == PSET_AMP_E || !cprocessor->is_recommended) {
		return false;
	}

	/*
	 * cprocessor is an idle, recommended P core processor.
	 * Look for P-eligible threads that have spilled to an E core
	 * and coax them to come back.
	 */
	processor_set_t pset = ecore_set;

	pset_lock(pset);

	processor_t eprocessor;
	uint64_t ast_processor_map = 0;

	sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
	uint64_t running_map = pset->cpu_state_map[PROCESSOR_RUNNING];
	for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
		eprocessor = processor_array[cpuid];
		if ((eprocessor->current_pri < BASEPRI_RTQUEUES) &&
		    (eprocessor->current_recommended_pset_type == PSET_AMP_P)) {
			ipi_type[eprocessor->cpu_id] = sched_ipi_action(eprocessor, NULL, SCHED_IPI_EVENT_REBALANCE);
			if (ipi_type[eprocessor->cpu_id] != SCHED_IPI_NONE) {
				bit_set(ast_processor_map, eprocessor->cpu_id);
				assert(eprocessor != cprocessor);
			}
		}
	}

	pset_unlock(pset);

	for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
		processor_t ast_processor = processor_array[cpuid];
		sched_ipi_perform(ast_processor, ipi_type[cpuid]);
	}

	/* The core should do a lightweight idle using WFE if it just sent out rebalance IPIs */
	return ast_processor_map != 0;
}

/*
 * Helper function for sched_amp_thread_group_recommendation_change().
 * Find all the cores in the pset running threads from the thread group tg
 * and send them a rebalance interrupt.
 */
void
sched_amp_bounce_thread_group_from_ecores(processor_set_t pset, struct thread_group *tg)
{
	if (!pset) {
		return;
	}

	assert(pset->pset_cluster_type == PSET_AMP_E);
	uint64_t ast_processor_map = 0;
	sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};

	spl_t s = splsched();
	pset_lock(pset);

	uint64_t running_map = pset->cpu_state_map[PROCESSOR_RUNNING];
	for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
		processor_t eprocessor = processor_array[cpuid];
		if (eprocessor->current_thread_group == tg) {
			ipi_type[eprocessor->cpu_id] = sched_ipi_action(eprocessor, NULL, SCHED_IPI_EVENT_REBALANCE);
			if (ipi_type[eprocessor->cpu_id] != SCHED_IPI_NONE) {
				bit_set(ast_processor_map, eprocessor->cpu_id);
			} else if (eprocessor == current_processor()) {
				ast_on(AST_PREEMPT);
				bit_set(pset->pending_AST_PREEMPT_cpu_mask, eprocessor->cpu_id);
			}
		}
	}

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, tg, ast_processor_map, 0, 0);

	pset_unlock(pset);

	for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
		processor_t ast_processor = processor_array[cpuid];
		sched_ipi_perform(ast_processor, ipi_type[cpuid]);
	}

	splx(s);
}

/*
 * sched_amp_ipi_policy()
 */
sched_ipi_type_t
sched_amp_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
{
	processor_set_t pset = dst->processor_set;
	assert(dst != current_processor());

	boolean_t deferred_ipi_supported = false;
#if defined(CONFIG_SCHED_DEFERRED_AST)
	deferred_ipi_supported = true;
#endif /* CONFIG_SCHED_DEFERRED_AST */

	switch (event) {
	case SCHED_IPI_EVENT_SPILL:
		/* For spill events, use deferred IPIs if sched_amp_spill_deferred_ipi is set */
		if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
			return sched_ipi_deferred_policy(pset, dst, thread, event);
		}
		break;
	case SCHED_IPI_EVENT_PREEMPT:
		/*
		 * For preemption, the default policy is to use deferred IPIs
		 * for non-RT P-core preemption. Override that behavior if
		 * sched_amp_pcores_preempt_immediate_ipi is set.
		 */
		if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
			if (sched_amp_pcores_preempt_immediate_ipi && (pset == pcore_set)) {
				return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
			}
		}
		break;
	default:
		break;
	}
	/* Default back to the global policy for all other scenarios */
	return sched_ipi_policy(dst, thread, dst_idle, event);
}
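
/*
 * Summary of the policy above:
 *   SPILL                    -> deferred IPI, when supported and enabled
 *   PREEMPT (non-RT, P-core) -> immediate (or idle) IPI, when enabled
 *   everything else          -> global sched_ipi_policy()
 */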

/*
 * sched_amp_qos_max_parallelism()
 */
uint32_t
sched_amp_qos_max_parallelism(int qos, uint64_t options)
{
	uint32_t ecount = ecore_set ? ecore_set->cpu_set_count : 0;
	uint32_t pcount = pcore_set ? pcore_set->cpu_set_count : 0;

	/*
	 * The AMP scheduler does not support more than 1 of each type of cluster,
	 * but the P-cluster is optional (e.g. watchOS).
	 */
	uint32_t ecluster_count = ecount ? 1 : 0;
	uint32_t pcluster_count = pcount ? 1 : 0;

	if (options & QOS_PARALLELISM_REALTIME) {
		/*
		 * For realtime threads on AMP, we want to limit the width to
		 * just the P-cores, since we do not spill/rebalance for RT threads.
		 */
		return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? pcluster_count : pcount;
	}

	/*
	 * The default AMP scheduler policy is to run utility and bg
	 * threads on E-cores only. Run-time policy adjustment unlocks the
	 * ability of utility and bg threads to be scheduled based on
	 * run-time conditions.
	 */
	switch (qos) {
	case THREAD_QOS_UTILITY:
		if (os_atomic_load(&sched_perfctl_policy_util, relaxed) == SCHED_PERFCTL_POLICY_DEFAULT) {
			return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? ecluster_count : ecount;
		} else {
			return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? (ecluster_count + pcluster_count) : (ecount + pcount);
		}
	case THREAD_QOS_BACKGROUND:
	case THREAD_QOS_MAINTENANCE:
		if (os_atomic_load(&sched_perfctl_policy_bg, relaxed) == SCHED_PERFCTL_POLICY_DEFAULT) {
			return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? ecluster_count : ecount;
		} else {
			return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? (ecluster_count + pcluster_count) : (ecount + pcount);
		}
	default:
		return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? (ecluster_count + pcluster_count) : (ecount + pcount);
	}
}
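
/*
 * Worked example (hypothetical 4E+4P system): UTILITY under the default
 * perfctl policy gets a width of 4 (ecount), and 8 (ecount + pcount)
 * once the policy is changed from the default. REALTIME gets 4 (pcount),
 * or 1 (pcluster_count) when QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE is
 * requested.
 */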

pset_node_t
sched_amp_choose_node(thread_t thread)
{
	pset_node_t node = (recommended_pset_type(thread) == PSET_AMP_P) ? pcore_node : ecore_node;
	return ((node != NULL) && (node->pset_map != 0)) ? node : &pset_node0;
}
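
/*
 * Falls back to pset_node0 when the preferred node is absent or its
 * pset_map is empty (e.g. no P-cluster, as on watch-class hardware),
 * so callers always receive a usable node.
 */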

#endif /* __AMP__ */