/*
 * Copyright (c) 2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <mach/mach_types.h>
#include <mach/machine.h>
#include <machine/machine_routines.h>
#include <machine/sched_param.h>
#include <machine/machine_cpu.h>
#include <kern/kern_types.h>
#include <kern/debug.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <machine/atomic.h>
#include <sys/kdebug.h>
#include <kern/sched_amp_common.h>
#include <stdatomic.h>

#if __AMP__

/* Configuration shared with the Edge scheduler */

/*
 * We see performance gains from sending immediate IPIs to P-cores to run
 * P-eligible threads, and fewer P-to-E migrations when deferred IPIs are
 * used for spill.
 */
int sched_amp_spill_deferred_ipi = 1;
int sched_amp_pcores_preempt_immediate_ipi = 1;

#if !CONFIG_SCHED_EDGE

/* Exported globals */
processor_set_t ecore_set = NULL;
processor_set_t pcore_set = NULL;

/*
 * sched_amp_init()
 *
 * Initialize the pcore_set and ecore_set globals which describe the
 * P/E processor sets.
 */
void
sched_amp_init(void)
{
	sched_timeshare_init();
}

/* Spill threshold load average is ncpus in pset + (sched_amp_spill_count / (1 << PSET_LOAD_FRACTIONAL_SHIFT)) */
int sched_amp_spill_count = 3;
int sched_amp_idle_steal = 1;
int sched_amp_spill_steal = 1;

/*
 * sched_perfcontrol_inherit_recommendation_from_tg changes the AMP
 * scheduling policy away from the default and allows the policy to be
 * modified at run-time.
 *
 * Once modified from the default, the policy toggles between "follow
 * thread group" and "restrict to E-cores".
 */

_Atomic sched_perfctl_class_policy_t sched_perfctl_policy_util = SCHED_PERFCTL_POLICY_DEFAULT;
_Atomic sched_perfctl_class_policy_t sched_perfctl_policy_bg = SCHED_PERFCTL_POLICY_DEFAULT;
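
/*
 * These per-class policies are consulted by sched_amp_qos_max_parallelism()
 * below: with the default policy, utility and bg threads are reported at
 * E-core width only; once the policy is changed at run-time, their reported
 * parallelism widens to include the P-cores as well.
 */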

/*
 * sched_amp_spill_threshold()
 *
 * Routine to calculate the spill threshold which decides if a cluster should spill.
 */
int
sched_amp_spill_threshold(processor_set_t pset)
{
	int recommended_processor_count = bit_count(pset->recommended_bitmask & pset->cpu_bitmask);

	return (recommended_processor_count << PSET_LOAD_FRACTIONAL_SHIFT) + sched_amp_spill_count;
}
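
/*
 * Worked example (illustrative values only; PSET_LOAD_FRACTIONAL_SHIFT is
 * assumed to be 4 here purely for the arithmetic): with 4 recommended CPUs
 * in the pset and the default sched_amp_spill_count of 3, the threshold is
 * (4 << 4) + 3 = 67 in the same fixed-point units used by
 * sched_get_pset_load_average(), i.e. a pset load average of roughly 4.19.
 */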

/*
 * pset_signal_spill()
 *
 * Routine to signal a running/idle CPU to cause a spill onto that CPU.
 * Called with the pset locked; returns with it unlocked.
 */
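/*
 * The search proceeds in two passes: the first pass picks an idle
 * recommended CPU and marks it in pending_spill_cpu_mask; if none is found,
 * the second pass picks a running recommended CPU that is not already
 * executing a P-recommended thread, has not already been signalled for
 * spill, and is running at a lower priority than the spilled thread.
 */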
void
pset_signal_spill(processor_set_t pset, int spilled_thread_priority)
{
	processor_t processor;
	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;

	uint64_t idle_map = pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE];
	for (int cpuid = lsb_first(idle_map); cpuid >= 0; cpuid = lsb_next(idle_map, cpuid)) {
		processor = processor_array[cpuid];
		if (bit_set_if_clear(pset->pending_spill_cpu_mask, processor->cpu_id)) {
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_SIGNAL_SPILL) | DBG_FUNC_NONE, processor->cpu_id, 0, 0, 0);

			processor->deadline = UINT64_MAX;

			if (processor == current_processor()) {
				pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
				if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
					KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
					    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 6);
				}
			} else {
				ipi_type = sched_ipi_action(processor, NULL, SCHED_IPI_EVENT_SPILL);
			}
			pset_unlock(pset);
			sched_ipi_perform(processor, ipi_type);
			return;
		}
	}

	processor_t ast_processor = NULL;
	ast_t preempt = AST_NONE;
	uint64_t running_map = pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_RUNNING];
	for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
		processor = processor_array[cpuid];
		if (processor->current_recommended_pset_type == PSET_AMP_P) {
			/* Already running a spilled P-core recommended thread */
			continue;
		}
		if (bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
			/* Already received a spill signal */
			continue;
		}
		if (processor->current_pri >= spilled_thread_priority) {
			/* Already running a higher or equal priority thread */
			continue;
		}

		/* Found a suitable processor */
		bit_set(pset->pending_spill_cpu_mask, processor->cpu_id);
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_SIGNAL_SPILL) | DBG_FUNC_NONE, processor->cpu_id, 1, 0, 0);
		if (processor == current_processor()) {
			preempt = AST_PREEMPT;
		}
		ipi_type = sched_ipi_action(processor, NULL, SCHED_IPI_EVENT_SPILL);
		if (ipi_type != SCHED_IPI_NONE) {
			ast_processor = processor;
		}
		break;
	}

	pset_unlock(pset);
	sched_ipi_perform(ast_processor, ipi_type);

	if (preempt != AST_NONE) {
		ast_t new_preempt = update_pending_nonurgent_preemption(processor, preempt);
		ast_on(new_preempt);
	}
}

/*
 * pset_should_accept_spilled_thread()
 *
 * Routine to decide if pset should accept spilled threads.
 * This function must be safe to call (to use as a hint) without holding the pset lock.
 */
bool
pset_should_accept_spilled_thread(processor_set_t pset, int spilled_thread_priority)
{
	if (!pset) {
		return false;
	}

	if ((pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
		return true;
	}

	uint64_t cpu_map = (pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_RUNNING]);

	for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) {
		processor_t processor = processor_array[cpuid];

		if (processor->current_recommended_pset_type == PSET_AMP_P) {
			/* This processor is already running a spilled thread */
			continue;
		}

		if (processor->current_pri < spilled_thread_priority) {
			return true;
		}
	}

	return false;
}

/*
 * should_spill_to_ecores()
 *
 * Spill policy is implemented here.
 */
bool
should_spill_to_ecores(processor_set_t nset, thread_t thread)
{
	if (nset->pset_cluster_type == PSET_AMP_E) {
		/* Not relevant if E-cores are already preferred */
		return false;
	}

	if (!pset_is_recommended(ecore_set)) {
		/* E-cores must be recommended */
		return false;
	}

	if (thread->th_bound_cluster_id == pcore_set->pset_id) {
		/* Thread bound to the P-cluster */
		return false;
	}

	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		/* Never spill realtime threads */
		return false;
	}

	if ((nset->recommended_bitmask & nset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
		/* Don't spill while the P-cluster still has idle recommended cores */
		return false;
	}

	if ((sched_get_pset_load_average(nset, 0) >= sched_amp_spill_threshold(nset)) && /* P-cores are loaded above the spill threshold */
	    pset_should_accept_spilled_thread(ecore_set, thread->sched_pri)) { /* E-cores can accept a thread at this priority */
		return true;
	}

	return false;
}
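
/*
 * In short: a non-RT thread that is not bound to the P-cluster is spilled to
 * the E-cluster only when the E-cluster is recommended, the P-cluster has no
 * idle recommended cores and its load average is at or above the spill
 * threshold, and the E-cluster can accept a thread at this priority.
 */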

/*
 * sched_amp_check_spill()
 *
 * Routine to check if the thread should be spilled and signal the pset if needed.
 */
void
sched_amp_check_spill(processor_set_t pset, thread_t thread)
{
	/* pset is unlocked */

	/* Bound threads don't call this function */
	assert(thread->bound_processor == PROCESSOR_NULL);

	if (should_spill_to_ecores(pset, thread)) {
		pset_lock(ecore_set);

		pset_signal_spill(ecore_set, thread->sched_pri);
		/* returns with ecore_set unlocked */
	}
}

/*
 * sched_amp_steal_threshold()
 *
 * Routine to calculate the steal threshold
 */
int
sched_amp_steal_threshold(processor_set_t pset, bool spill_pending)
{
	int recommended_processor_count = bit_count(pset->recommended_bitmask & pset->cpu_bitmask);

	return (recommended_processor_count << PSET_LOAD_FRACTIONAL_SHIFT) + (spill_pending ? sched_amp_spill_steal : sched_amp_idle_steal);
}
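
/*
 * Note: this has the same fixed-point form as sched_amp_spill_threshold(),
 * but with a smaller additive margin (sched_amp_idle_steal or
 * sched_amp_spill_steal, both 1 by default, versus a sched_amp_spill_count
 * of 3), so the resulting threshold value is slightly lower than the spill
 * threshold for the same pset.
 */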

/*
 * sched_amp_steal_thread_enabled()
 */
bool
sched_amp_steal_thread_enabled(processor_set_t pset)
{
	return (pset->pset_cluster_type == PSET_AMP_E) && (pcore_set != NULL) && (pcore_set->online_processor_count > 0);
}

/*
 * sched_amp_balance()
 *
 * Invoked with pset locked, returns with pset unlocked
 */
bool
sched_amp_balance(processor_t cprocessor, processor_set_t cpset)
{
	assert(cprocessor == current_processor());

	pset_unlock(cpset);

	if (!ecore_set || cpset->pset_cluster_type == PSET_AMP_E || !cprocessor->is_recommended) {
		return false;
	}

	/*
	 * cprocessor is an idle, recommended P core processor.
	 * Look for P-eligible threads that have spilled to an E core
	 * and coax them to come back.
	 */
	processor_set_t pset = ecore_set;

	pset_lock(pset);

	processor_t eprocessor;
	uint64_t ast_processor_map = 0;

	sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
	uint64_t running_map = pset->cpu_state_map[PROCESSOR_RUNNING];
	for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
		eprocessor = processor_array[cpuid];
		if ((eprocessor->current_pri < BASEPRI_RTQUEUES) &&
		    (eprocessor->current_recommended_pset_type == PSET_AMP_P)) {
			ipi_type[eprocessor->cpu_id] = sched_ipi_action(eprocessor, NULL, SCHED_IPI_EVENT_REBALANCE);
			if (ipi_type[eprocessor->cpu_id] != SCHED_IPI_NONE) {
				bit_set(ast_processor_map, eprocessor->cpu_id);
				assert(eprocessor != cprocessor);
			}
		}
	}

	pset_unlock(pset);

	for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
		processor_t ast_processor = processor_array[cpuid];
		sched_ipi_perform(ast_processor, ipi_type[cpuid]);
	}

	/* Core should light-weight idle using WFE if it just sent out rebalance IPIs */
	return ast_processor_map != 0;
}

/*
 * Helper function for sched_amp_thread_group_recommendation_change()
 * Find all the cores in the pset running threads from the thread_group tg
 * and send them a rebalance interrupt.
 */
void
sched_amp_bounce_thread_group_from_ecores(processor_set_t pset, struct thread_group *tg)
{
	if (!pset) {
		return;
	}

	assert(pset->pset_cluster_type == PSET_AMP_E);
	uint64_t ast_processor_map = 0;
	sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};

	spl_t s = splsched();
	pset_lock(pset);

	uint64_t running_map = pset->cpu_state_map[PROCESSOR_RUNNING];
	for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
		processor_t eprocessor = processor_array[cpuid];
		if (eprocessor->current_thread_group == tg) {
			ipi_type[eprocessor->cpu_id] = sched_ipi_action(eprocessor, NULL, SCHED_IPI_EVENT_REBALANCE);
			if (ipi_type[eprocessor->cpu_id] != SCHED_IPI_NONE) {
				bit_set(ast_processor_map, eprocessor->cpu_id);
			} else if (eprocessor == current_processor()) {
				ast_on(AST_PREEMPT);
				bit_set(pset->pending_AST_PREEMPT_cpu_mask, eprocessor->cpu_id);
			}
		}
	}

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, tg, ast_processor_map, 0, 0);

	pset_unlock(pset);

	for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
		processor_t ast_processor = processor_array[cpuid];
		sched_ipi_perform(ast_processor, ipi_type[cpuid]);
	}

	splx(s);
}

/*
 * sched_amp_ipi_policy()
 */
sched_ipi_type_t
sched_amp_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
{
	processor_set_t pset = dst->processor_set;
	assert(dst != current_processor());

	boolean_t deferred_ipi_supported = false;
#if defined(CONFIG_SCHED_DEFERRED_AST)
	deferred_ipi_supported = true;
#endif /* CONFIG_SCHED_DEFERRED_AST */

	switch (event) {
	case SCHED_IPI_EVENT_SPILL:
		/* For spill events, use deferred IPIs if sched_amp_spill_deferred_ipi is set */
		if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
			return sched_ipi_deferred_policy(pset, dst, thread, event);
		}
		break;
	case SCHED_IPI_EVENT_PREEMPT:
		/*
		 * For preemption, the default policy is to use deferred IPIs
		 * for non-RT P-core preemption. Override that behavior if
		 * sched_amp_pcores_preempt_immediate_ipi is set.
		 */
		if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
			if (sched_amp_pcores_preempt_immediate_ipi && (pset == pcore_set)) {
				return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
			}
		}
		break;
	default:
		break;
	}
	/* Default back to the global policy for all other scenarios */
	return sched_ipi_policy(dst, thread, dst_idle, event);
}
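
/*
 * With both tunables left at their default of 1: spill events use
 * sched_ipi_deferred_policy() when CONFIG_SCHED_DEFERRED_AST is built in,
 * and non-RT preemption targeting a P-core is signalled with SCHED_IPI_IDLE
 * or SCHED_IPI_IMMEDIATE depending on whether the destination is idle. All
 * other cases fall through to the global sched_ipi_policy().
 */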

/*
 * sched_amp_qos_max_parallelism()
 */
uint32_t
sched_amp_qos_max_parallelism(int qos, uint64_t options)
{
	uint32_t ecount = ecore_set ? ecore_set->cpu_set_count : 0;
	uint32_t pcount = pcore_set ? pcore_set->cpu_set_count : 0;

	/*
	 * The AMP scheduler does not support more than 1 of each type of cluster,
	 * but the P-cluster is optional (e.g. watchOS).
	 */
	uint32_t ecluster_count = ecount ? 1 : 0;
	uint32_t pcluster_count = pcount ? 1 : 0;

	if (options & QOS_PARALLELISM_REALTIME) {
		/*
		 * For realtime threads on AMP, we want to limit the width to
		 * just the P-cores since we do not spill/rebalance RT threads.
		 */
		return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? pcluster_count : pcount;
	}

	/*
	 * The default AMP scheduler policy is to run utility and bg
	 * threads on E-cores only. Run-time policy adjustment unlocks the
	 * ability of utility and bg threads to be scheduled based on
	 * run-time conditions.
	 */
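	/*
	 * Illustrative example (hypothetical 4 E-core + 2 P-core topology):
	 * with the default perfctl policy, THREAD_QOS_UTILITY returns 4
	 * (ecount), or 1 (ecluster_count) when
	 * QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE is passed; after a run-time
	 * policy change it returns 6 (ecount + pcount) or 2 (both clusters)
	 * respectively.
	 */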
	switch (qos) {
	case THREAD_QOS_UTILITY:
		if (os_atomic_load(&sched_perfctl_policy_util, relaxed) == SCHED_PERFCTL_POLICY_DEFAULT) {
			return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? ecluster_count : ecount;
		} else {
			return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? (ecluster_count + pcluster_count) : (ecount + pcount);
		}
	case THREAD_QOS_BACKGROUND:
	case THREAD_QOS_MAINTENANCE:
		if (os_atomic_load(&sched_perfctl_policy_bg, relaxed) == SCHED_PERFCTL_POLICY_DEFAULT) {
			return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? ecluster_count : ecount;
		} else {
			return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? (ecluster_count + pcluster_count) : (ecount + pcount);
		}
	default:
		return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? (ecluster_count + pcluster_count) : (ecount + pcount);
	}
}

pset_node_t
sched_amp_choose_node(thread_t thread)
{
	pset_node_t node = (recommended_pset_type(thread) == PSET_AMP_P) ? pcore_node : ecore_node;
	return ((node != NULL) && (node->pset_map != 0)) ? node : &pset_node0;
}
#endif /* !CONFIG_SCHED_EDGE */
#endif /* __AMP__ */