/*
 * Copyright (c) 2013-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <mach/mach_types.h>
#include <mach/machine.h>

#include <machine/machine_routines.h>
#include <machine/sched_param.h>
#include <machine/machine_cpu.h>

#include <kern/kern_types.h>
#include <kern/debug.h>
#include <kern/mach_param.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/task.h>
#include <kern/thread.h>

#include <sys/kdebug.h>

/*
 * Theory Statement
 *
 * How does the task scheduler work?
 *
 * It schedules threads across a few levels.
 *
 * RT threads are dealt with above us
 * Bound threads go into the per-processor runq
 * Non-bound threads are linked on their task's sched_group's runq
 * sched_groups' sched_entries are linked on the pset's runq
 *
 * TODO: make this explicit - bound threads should have a different enqueue function
 *
 * When we choose a new thread, we will decide whether to look at the bound runqueue, the global runqueue
 * or the current group's runqueue, then dequeue the next thread in that runqueue.
 *
 * We then manipulate the sched_entries to reflect the invariant that:
 * Each non-empty priority level in a group's runq is represented by one sched_entry enqueued in the global
 * runqueue.
 *
 * A sched_entry represents a chance at running - for each priority in each task, there is one chance of getting
 * to run.  This reduces the excess contention bonus given to processes which have work spread among many threads
 * as compared to processes which do the same amount of work under fewer threads.
 *
 * NOTE: Currently, the multiq scheduler only supports one pset.
 *
 * NOTE ABOUT thread->sched_pri:
 *
 * It can change after enqueue - it's changed without the pset lock but with the thread lock if thread->runq is 0.
 * Therefore we can only depend on it not changing during the enqueue and remove path, not the dequeue.
 *
 * TODO: Future features:
 *
 * Decouple the task priority from the sched_entry priority, allowing for:
 *      fast task priority change without having to iterate and re-dispatch all threads in the task.
 *              i.e. task-wide priority, task-wide boosting
 *      fancier group decay features
 *
 * Group (or task) decay:
 *      Decay is used for a few different things:
 *              Prioritizing latency-needing threads over throughput-needing threads for time-to-running
 *              Balancing work between threads in a process
 *              Balancing work done at the same priority between different processes
 *              Recovering from priority inversions between two threads in the same process
 *              Recovering from priority inversions between two threads in different processes
 *              Simulating a proportional share scheduler by allowing lower priority threads
 *                to run for a certain percentage of the time
 *
 *      Task decay lets us separately address the 'same process' and 'different process' needs,
 *      which will allow us to make smarter tradeoffs in different cases.
 *      For example, we could resolve priority inversion in the same process by reordering threads without dropping the
 *      process below low priority threads in other processes.
 *
 * One lock to rule them all (or at least all the runqueues) instead of the pset locks
 *
 * Shrink sched_entry size to the size of a queue_chain_t by inferring priority, group, and perhaps runq field.
 * The entries array is 5K currently so it'd be really great to reduce.
 * One way to get sched_group below 4K without a new runq structure would be to remove the extra queues above realtime.
 *
 * When preempting a processor, store a flag saying if the preemption
 * was from a thread in the same group or different group,
 * and tell choose_thread about it.
 *
 * When choosing a processor, bias towards those running in the same
 * group as I am running (at the same priority, or within a certain band?).
 *
 * Decide if we need to support psets.
 * Decide how to support psets - do we need duplicate entries for each pset,
 * or can we get away with putting the entry in either one or the other pset?
 *
 * Consider the right way to handle runq count - I don't want to iterate groups.
 * Perhaps keep a global counter.
 * Alternate option - remove it from choose_processor. It doesn't add much value
 * now that we have a global runq.
 *
 * Need a better way of finding the group to target instead of looking at current_task.
 * Perhaps choose_thread could pass in the current thread?
 *
 * Consider unifying runq copy-pastes.
 *
 * Thoughts on having a group central quantum bucket:
 *
 * I see two algorithms to decide quanta:
 * A) Hand off only when switching thread to thread in the same group
 * B) Allocate and return quanta to the group's pool
 *
 * Issues:
 * If a task blocks completely, should it come back with the leftover quanta
 * or brand new quanta?
 *
 * Should I put a flag saying 'zero out the quantum you grab when you're dispatched'?
 *
 * Resolution:
 * Handing off quanta between threads will help with jumping around in the current task
 * but will not help when a thread from a different task is involved.
 * Need an algorithm that works with round-robining between threads in different tasks.
 *
 * But wait - round-robining can only be triggered by quantum expiration or blocking.
 * We need something that works with preemption or yielding - that's the more interesting idea.
 *
 * Existing algorithm - preemption doesn't reset the quantum, and puts the thread on the head of the runq.
 * Blocking or quantum expiration does reset the quantum, and puts the thread on the tail of the runq.
 *
 * New algorithm -
 * Hand off quanta when hopping between threads with the same sched_group.
 * Even if a thread was blocked, it uses the last thread's remaining quantum when it starts.
 *
 * If we use the 'only cycle the entry at quantum expiration' algorithm, then the quantum pool starts getting
 * interesting.
 *
 * A thought - perhaps the handoff approach doesn't work so well in the presence of
 * non-handoff wakeups, i.e. wake other thread then wait then block - it doesn't mean that the
 * woken thread will be what I switch to - another processor may have stolen it.
 * What do we do there?
 *
 * Conclusions:
 * We currently don't know of a scenario where quantum buckets on the task are beneficial.
 * We will instead hand off the quantum between threads in the task, and keep the quantum
 * on the preempted thread if it's preempted by something outside the task.
 */
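
/*
 * Illustrative picture of the structure described above (example added for
 * clarity; the priorities and names are hypothetical).  Two groups with
 * runnable threads at priorities 47 and 31 produce three entries in the
 * global entry queue, plus a separate per-processor runq for bound threads:
 *
 *   pset->pset_runq (entry_queue_t)              per-processor bound runq
 *     pri 47: [A.entries[47]] [B.entries[47]]      pri 31: [bound thread]
 *     pri 31: [A.entries[31]]
 *
 *   group A runq: pri 47: [t1, t2]   pri 31: [t3]
 *   group B runq: pri 47: [t4]
 *
 * Dequeuing an entry at pri 47 selects a group; the thread then comes from
 * that group's runq at its highest priority.
 */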

#if DEBUG || DEVELOPMENT
#define MULTIQ_SANITY_CHECK
#endif

typedef struct sched_entry {
	queue_chain_t           entry_links;
	int16_t                 sched_pri;      /* scheduled (current) priority */
	int16_t                 runq;
	int32_t                 pad;
} *sched_entry_t;

typedef run_queue_t entry_queue_t;                      /* A run queue that holds sched_entries instead of threads */
typedef run_queue_t group_runq_t;                       /* A run queue that is part of a sched_group */

#define SCHED_ENTRY_NULL        ((sched_entry_t) 0)
#define MULTIQ_ERUNQ            (-4)                    /* Indicates entry is on the main runq */

/* Each level in the run queue corresponds to one entry in the entries array */
struct sched_group {
	struct sched_entry      entries[NRQS];
	struct run_queue        runq;
	queue_chain_t           sched_groups;
};
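
/*
 * Invariant: entries[i].sched_pri == i (established in sched_group_create()
 * and asserted in sched_group_destroy()), and 'entries' is the first member
 * of struct sched_group.  group_for_entry() below depends on both facts to
 * recover the owning group from a bare entry pointer.
 */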

/*
 * Keep entry on the head of the runqueue while dequeueing threads.
 * Only cycle it to the end of the runqueue when a thread in the task
 * hits its quantum.
 */
static boolean_t        deep_drain = FALSE;

/* Verify the consistency of the runq before touching it */
static boolean_t        multiq_sanity_check = FALSE;

/*
 * Draining threads from the current task is preferred
 * when they're less than X steps below the current
 * global highest priority
 */
#define DEFAULT_DRAIN_BAND_LIMIT MAXPRI
static integer_t        drain_band_limit;

/*
 * Don't go below this priority level if there is something above it in another task
 */
#define DEFAULT_DRAIN_DEPTH_LIMIT MAXPRI_THROTTLE
static integer_t        drain_depth_limit;

/*
 * Don't favor the task when there's something above this priority in another task.
 */
#define DEFAULT_DRAIN_CEILING BASEPRI_FOREGROUND
static integer_t        drain_ceiling;

static ZONE_DECLARE(sched_group_zone, "sched groups",
    sizeof(struct sched_group), ZC_NOCALLOUT);

static uint64_t         num_sched_groups = 0;
static queue_head_t     sched_groups;

static LCK_GRP_DECLARE(sched_groups_lock_grp, "sched_groups");
static LCK_MTX_DECLARE(sched_groups_lock, &sched_groups_lock_grp);

static void
sched_multiq_init(void);

static thread_t
sched_multiq_steal_thread(processor_set_t pset);

static void
sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context);

static boolean_t
sched_multiq_processor_enqueue(processor_t processor, thread_t thread,
    sched_options_t options);

static boolean_t
sched_multiq_processor_queue_remove(processor_t processor, thread_t thread);

void
sched_multiq_quantum_expire(thread_t thread);

static ast_t
sched_multiq_processor_csw_check(processor_t processor);

static boolean_t
sched_multiq_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);

static int
sched_multiq_runq_count(processor_t processor);

static boolean_t
sched_multiq_processor_queue_empty(processor_t processor);

static uint64_t
sched_multiq_runq_stats_count_sum(processor_t processor);

static int
sched_multiq_processor_bound_count(processor_t processor);

static void
sched_multiq_pset_init(processor_set_t pset);

static void
sched_multiq_processor_init(processor_t processor);

static thread_t
sched_multiq_choose_thread(processor_t processor, int priority, ast_t reason);

static void
sched_multiq_processor_queue_shutdown(processor_t processor);

static sched_mode_t
sched_multiq_initial_thread_sched_mode(task_t parent_task);

static bool
sched_multiq_thread_avoid_processor(processor_t processor, thread_t thread);

const struct sched_dispatch_table sched_multiq_dispatch = {
	.sched_name                                     = "multiq",
	.init                                           = sched_multiq_init,
	.timebase_init                                  = sched_timeshare_timebase_init,
	.processor_init                                 = sched_multiq_processor_init,
	.pset_init                                      = sched_multiq_pset_init,
	.maintenance_continuation                       = sched_timeshare_maintenance_continue,
	.choose_thread                                  = sched_multiq_choose_thread,
	.steal_thread_enabled                           = sched_steal_thread_DISABLED,
	.steal_thread                                   = sched_multiq_steal_thread,
	.compute_timeshare_priority                     = sched_compute_timeshare_priority,
	.choose_node                                    = sched_choose_node,
	.choose_processor                               = choose_processor,
	.processor_enqueue                              = sched_multiq_processor_enqueue,
	.processor_queue_shutdown                       = sched_multiq_processor_queue_shutdown,
	.processor_queue_remove                         = sched_multiq_processor_queue_remove,
	.processor_queue_empty                          = sched_multiq_processor_queue_empty,
	.priority_is_urgent                             = priority_is_urgent,
	.processor_csw_check                            = sched_multiq_processor_csw_check,
	.processor_queue_has_priority                   = sched_multiq_processor_queue_has_priority,
	.initial_quantum_size                           = sched_timeshare_initial_quantum_size,
	.initial_thread_sched_mode                      = sched_multiq_initial_thread_sched_mode,
	.can_update_priority                            = can_update_priority,
	.update_priority                                = update_priority,
	.lightweight_update_priority                    = lightweight_update_priority,
	.quantum_expire                                 = sched_multiq_quantum_expire,
	.processor_runq_count                           = sched_multiq_runq_count,
	.processor_runq_stats_count_sum                 = sched_multiq_runq_stats_count_sum,
	.processor_bound_count                          = sched_multiq_processor_bound_count,
	.thread_update_scan                             = sched_multiq_thread_update_scan,
	.multiple_psets_enabled                         = FALSE,
	.sched_groups_enabled                           = TRUE,
	.avoid_processor_enabled                        = TRUE,
	.thread_avoid_processor                         = sched_multiq_thread_avoid_processor,
	.processor_balance                              = sched_SMT_balance,

	.rt_runq                                        = sched_rtlocal_runq,
	.rt_init                                        = sched_rtlocal_init,
	.rt_queue_shutdown                              = sched_rtlocal_queue_shutdown,
	.rt_runq_scan                                   = sched_rtlocal_runq_scan,
	.rt_runq_count_sum                              = sched_rtlocal_runq_count_sum,
	.rt_steal_thread                                = sched_rtlocal_steal_thread,

	.qos_max_parallelism                            = sched_qos_max_parallelism,
	.check_spill                                    = sched_check_spill,
	.ipi_policy                                     = sched_ipi_policy,
	.thread_should_yield                            = sched_thread_should_yield,
	.run_count_incr                                 = sched_run_incr,
	.run_count_decr                                 = sched_run_decr,
	.update_thread_bucket                           = sched_update_thread_bucket,
	.pset_made_schedulable                          = sched_pset_made_schedulable,
	.cpu_init_completed                             = NULL,
	.thread_eligible_for_pset                       = NULL,
};


static void
sched_multiq_init(void)
{
#if defined(MULTIQ_SANITY_CHECK)
	PE_parse_boot_argn("-multiq-sanity-check", &multiq_sanity_check, sizeof(multiq_sanity_check));
#endif

	PE_parse_boot_argn("-multiq-deep-drain", &deep_drain, sizeof(deep_drain));

	if (!PE_parse_boot_argn("multiq_drain_ceiling", &drain_ceiling, sizeof(drain_ceiling))) {
		drain_ceiling = DEFAULT_DRAIN_CEILING;
	}

	if (!PE_parse_boot_argn("multiq_drain_depth_limit", &drain_depth_limit, sizeof(drain_depth_limit))) {
		drain_depth_limit = DEFAULT_DRAIN_DEPTH_LIMIT;
	}

	if (!PE_parse_boot_argn("multiq_drain_band_limit", &drain_band_limit, sizeof(drain_band_limit))) {
		drain_band_limit = DEFAULT_DRAIN_BAND_LIMIT;
	}

	printf("multiq scheduler config: deep-drain %d, ceiling %d, depth limit %d, band limit %d, sanity check %d\n",
	    deep_drain, drain_ceiling, drain_depth_limit, drain_band_limit, multiq_sanity_check);

	queue_init(&sched_groups);

	sched_timeshare_init();
}

static void
sched_multiq_processor_init(processor_t processor)
{
	run_queue_init(&processor->runq);
}

static void
sched_multiq_pset_init(processor_set_t pset)
{
	run_queue_init(&pset->pset_runq);
}

static sched_mode_t
sched_multiq_initial_thread_sched_mode(task_t parent_task)
{
	if (parent_task == kernel_task) {
		return TH_MODE_FIXED;
	} else {
		return TH_MODE_TIMESHARE;
	}
}

sched_group_t
sched_group_create(void)
{
	sched_group_t       sched_group;

	if (!SCHED(sched_groups_enabled)) {
		return SCHED_GROUP_NULL;
	}

	sched_group = zalloc_flags(sched_group_zone, Z_WAITOK | Z_ZERO);

	run_queue_init(&sched_group->runq);

	for (size_t i = 0; i < NRQS; i++) {
		sched_group->entries[i].runq = 0;
		sched_group->entries[i].sched_pri = (int16_t)i;
	}

	lck_mtx_lock(&sched_groups_lock);
	queue_enter(&sched_groups, sched_group, sched_group_t, sched_groups);
	num_sched_groups++;
	lck_mtx_unlock(&sched_groups_lock);

	return sched_group;
}
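
/*
 * Minimal lifecycle sketch (example only; the call site below is
 * hypothetical): sched_group_create()/sched_group_destroy() bracket the
 * lifetime of a task's group, and destroy requires the group's runq to
 * have drained first.
 */
#if 0 /* illustrative example, not part of the build */
static void
sched_group_lifecycle_example(void)
{
	sched_group_t group = sched_group_create();
	if (group != SCHED_GROUP_NULL) {
		/* ... threads in the owning task run on group->runq ... */
		/* destroy only once the runq is empty (asserted below) */
		sched_group_destroy(group);
	}
}
#endif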

void
sched_group_destroy(sched_group_t sched_group)
{
	if (!SCHED(sched_groups_enabled)) {
		assert(sched_group == SCHED_GROUP_NULL);
		return;
	}

	assert(sched_group != SCHED_GROUP_NULL);
	assert(sched_group->runq.count == 0);

	for (int i = 0; i < NRQS; i++) {
		assert(sched_group->entries[i].runq == 0);
		assert(sched_group->entries[i].sched_pri == i);
	}

	lck_mtx_lock(&sched_groups_lock);
	queue_remove(&sched_groups, sched_group, sched_group_t, sched_groups);
	num_sched_groups--;
	lck_mtx_unlock(&sched_groups_lock);

	zfree(sched_group_zone, sched_group);
}

__attribute__((always_inline))
static inline entry_queue_t
multiq_main_entryq(processor_t processor)
{
	return (entry_queue_t)&processor->processor_set->pset_runq;
}

__attribute__((always_inline))
static inline run_queue_t
multiq_bound_runq(processor_t processor)
{
	return &processor->runq;
}

__attribute__((always_inline))
static inline sched_entry_t
group_entry_for_pri(sched_group_t group, integer_t pri)
{
	return &group->entries[pri];
}

__attribute__((always_inline))
static inline sched_group_t
group_for_entry(sched_entry_t entry)
{
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-align"
	sched_group_t group = (sched_group_t)(entry - entry->sched_pri);
#pragma clang diagnostic pop
	return group;
}
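
/*
 * How group_for_entry() works: 'entries' is the first member of struct
 * sched_group and entries[i].sched_pri == i, so for entry == &group->entries[pri],
 * entry - entry->sched_pri == &group->entries[0], which has the same address
 * as the group itself.  A small sketch of that arithmetic (example only):
 */
#if 0 /* illustrative example, not part of the build */
static void
group_for_entry_example(void)
{
	struct sched_group g;

	/* initialize the entries as sched_group_create() does */
	for (int i = 0; i < NRQS; i++) {
		g.entries[i].sched_pri = (int16_t)i;
	}

	/* walking back sched_pri slots from any entry recovers the group */
	assert(group_for_entry(&g.entries[42]) == &g);
}
#endif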

/* Peek at the head of the runqueue */
static sched_entry_t
entry_queue_first_entry(entry_queue_t rq)
{
	assert(rq->count != 0);

	circle_queue_t queue = &rq->queues[rq->highq];

	sched_entry_t entry = cqe_queue_first(queue, struct sched_entry, entry_links);

	assert(entry->sched_pri == rq->highq);

	return entry;
}

#if defined(MULTIQ_SANITY_CHECK)

#if MACH_ASSERT
__attribute__((always_inline))
static inline boolean_t
queue_chain_linked(queue_chain_t* chain)
{
	if (chain->next != NULL) {
		assert(chain->prev != NULL);
		return TRUE;
	} else {
		assert(chain->prev == NULL);
		return FALSE;
	}
}
#endif /* MACH_ASSERT */

static thread_t
group_first_thread(sched_group_t group)
{
	group_runq_t rq = &group->runq;

	assert(rq->count != 0);

	circle_queue_t queue = &rq->queues[rq->highq];

	thread_t thread = cqe_queue_first(queue, struct thread, runq_links);

	assert(thread != THREAD_NULL);
	assert_thread_magic(thread);

	assert(thread->sched_group == group);

	/* TODO: May not be safe */
	assert(thread->sched_pri == rq->highq);

	return thread;
}

/* Asserts if entry is not in entry runq at pri */
static void
entry_queue_check_entry(entry_queue_t runq, sched_entry_t entry, int expected_pri)
{
	circle_queue_t q;
	sched_entry_t elem;

	assert(queue_chain_linked(&entry->entry_links));
	assert(entry->runq == MULTIQ_ERUNQ);

	q = &runq->queues[expected_pri];

	cqe_foreach_element(elem, q, entry_links) {
		if (elem == entry) {
			return;
		}
	}

	panic("runq %p doesn't contain entry %p at pri %d", runq, entry, expected_pri);
}

/* Asserts if thread is not in group at its priority */
static void
sched_group_check_thread(sched_group_t group, thread_t thread)
{
	circle_queue_t q;
	thread_t elem;
	int pri = thread->sched_pri;

	assert(thread->runq != PROCESSOR_NULL);

	q = &group->runq.queues[pri];

	cqe_foreach_element(elem, q, runq_links) {
		if (elem == thread) {
			return;
		}
	}

	panic("group %p doesn't contain thread %p at pri %d", group, thread, pri);
}

static void
global_check_entry_queue(entry_queue_t main_entryq)
{
	if (main_entryq->count == 0) {
		return;
	}

	sched_entry_t entry = entry_queue_first_entry(main_entryq);

	assert(entry->runq == MULTIQ_ERUNQ);

	sched_group_t group = group_for_entry(entry);

	thread_t thread = group_first_thread(group);

	__assert_only sched_entry_t thread_entry = group_entry_for_pri(thread->sched_group, thread->sched_pri);

	assert(entry->sched_pri == group->runq.highq);

	assert(entry == thread_entry);
	assert(thread->runq != PROCESSOR_NULL);
}

static void
group_check_run_queue(entry_queue_t main_entryq, sched_group_t group)
{
	if (group->runq.count == 0) {
		return;
	}

	thread_t thread = group_first_thread(group);

	assert(thread->runq != PROCESSOR_NULL);

	sched_entry_t sched_entry = group_entry_for_pri(thread->sched_group, thread->sched_pri);

	entry_queue_check_entry(main_entryq, sched_entry, thread->sched_pri);

	assert(sched_entry->sched_pri == thread->sched_pri);
	assert(sched_entry->runq == MULTIQ_ERUNQ);
}

#endif /* defined(MULTIQ_SANITY_CHECK) */

/*
 * The run queue must not be empty.
 */
static sched_entry_t
entry_queue_dequeue_entry(entry_queue_t rq)
{
	sched_entry_t   sched_entry;
	circle_queue_t  queue = &rq->queues[rq->highq];

	assert(rq->count > 0);
	assert(!circle_queue_empty(queue));

	sched_entry = cqe_dequeue_head(queue, struct sched_entry, entry_links);

	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
	rq->count--;
	if (SCHED(priority_is_urgent)(rq->highq)) {
		rq->urgency--; assert(rq->urgency >= 0);
	}
	if (circle_queue_empty(queue)) {
		rq_bitmap_clear(rq->bitmap, rq->highq);
		rq->highq = bitmap_first(rq->bitmap, NRQS);
	}

	sched_entry->runq = 0;

	return sched_entry;
}

/*
 * The entry must not already be enqueued.
 * Returns TRUE if the entry is now the highest-priority entry in the run queue.
 */
static boolean_t
entry_queue_enqueue_entry(
	entry_queue_t rq,
	sched_entry_t entry,
	integer_t     options)
{
	int             sched_pri = entry->sched_pri;
	circle_queue_t  queue = &rq->queues[sched_pri];
	boolean_t       result = FALSE;

	assert(entry->runq == 0);

	if (circle_queue_empty(queue)) {
		circle_enqueue_tail(queue, &entry->entry_links);

		rq_bitmap_set(rq->bitmap, sched_pri);
		if (sched_pri > rq->highq) {
			rq->highq = sched_pri;
			result = TRUE;
		}
	} else {
		if (options & SCHED_TAILQ) {
			circle_enqueue_tail(queue, &entry->entry_links);
		} else {
			circle_enqueue_head(queue, &entry->entry_links);
		}
	}
	if (SCHED(priority_is_urgent)(sched_pri)) {
		rq->urgency++;
	}
	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
	rq->count++;

	entry->runq = MULTIQ_ERUNQ;

	return result;
}

/*
 * The entry must be in this runqueue.
 */
static void
entry_queue_remove_entry(
	entry_queue_t  rq,
	sched_entry_t  entry)
{
	int sched_pri = entry->sched_pri;

#if defined(MULTIQ_SANITY_CHECK)
	if (multiq_sanity_check) {
		entry_queue_check_entry(rq, entry, sched_pri);
	}
#endif

	remqueue(&entry->entry_links);

	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
	rq->count--;
	if (SCHED(priority_is_urgent)(sched_pri)) {
		rq->urgency--; assert(rq->urgency >= 0);
	}

	if (circle_queue_empty(&rq->queues[sched_pri])) {
		/* update run queue status */
		rq_bitmap_clear(rq->bitmap, sched_pri);
		rq->highq = bitmap_first(rq->bitmap, NRQS);
	}

	entry->runq = 0;
}

static void
entry_queue_change_entry(
	entry_queue_t rq,
	sched_entry_t entry,
	integer_t     options)
{
	int            sched_pri   = entry->sched_pri;
	circle_queue_t queue       = &rq->queues[sched_pri];

#if defined(MULTIQ_SANITY_CHECK)
	if (multiq_sanity_check) {
		entry_queue_check_entry(rq, entry, sched_pri);
	}
#endif

	circle_dequeue(queue, &entry->entry_links);
	if (options & SCHED_TAILQ) {
		circle_enqueue_tail(queue, &entry->entry_links);
	} else {
		circle_enqueue_head(queue, &entry->entry_links);
	}
}
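
/*
 * Note on SCHED_TAILQ vs SCHED_HEADQ in the helpers above, following the
 * policy in the theory statement: SCHED_TAILQ queues behind existing work
 * at that priority (e.g. after blocking or quantum expiration, with a fresh
 * quantum), while SCHED_HEADQ queues in front (e.g. a preempted thread that
 * should resume first).
 */
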
/*
 * The run queue must not be empty.
 *
 * Sets queue_empty to TRUE if the queue is now empty at thread_pri.
 */
static thread_t
group_run_queue_dequeue_thread(
	group_runq_t   rq,
	integer_t     *thread_pri,
	boolean_t     *queue_empty)
{
	thread_t        thread;
	circle_queue_t  queue = &rq->queues[rq->highq];

	assert(rq->count > 0);
	assert(!circle_queue_empty(queue));

	*thread_pri = rq->highq;

	thread = cqe_dequeue_head(queue, struct thread, runq_links);
	assert_thread_magic(thread);

	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
	rq->count--;
	if (SCHED(priority_is_urgent)(rq->highq)) {
		rq->urgency--; assert(rq->urgency >= 0);
	}
	if (circle_queue_empty(queue)) {
		rq_bitmap_clear(rq->bitmap, rq->highq);
		rq->highq = bitmap_first(rq->bitmap, NRQS);
		*queue_empty = TRUE;
	} else {
		*queue_empty = FALSE;
	}

	return thread;
}

/*
 * The thread must not already be on a run queue.
 * Returns TRUE if the queue was empty at thread_pri.
 */
static boolean_t
group_run_queue_enqueue_thread(
	group_runq_t   rq,
	thread_t       thread,
	integer_t      thread_pri,
	integer_t      options)
{
	circle_queue_t  queue = &rq->queues[thread_pri];
	boolean_t       result = FALSE;

	assert(thread->runq == PROCESSOR_NULL);
	assert_thread_magic(thread);

	if (circle_queue_empty(queue)) {
		circle_enqueue_tail(queue, &thread->runq_links);

		rq_bitmap_set(rq->bitmap, thread_pri);
		if (thread_pri > rq->highq) {
			rq->highq = thread_pri;
		}
		result = TRUE;
	} else {
		if (options & SCHED_TAILQ) {
			circle_enqueue_tail(queue, &thread->runq_links);
		} else {
			circle_enqueue_head(queue, &thread->runq_links);
		}
	}
	if (SCHED(priority_is_urgent)(thread_pri)) {
		rq->urgency++;
	}
	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
	rq->count++;

	return result;
}

/*
 * The thread must be in this runqueue.
 * Returns TRUE if the queue is now empty at thread_pri.
 */
static boolean_t
group_run_queue_remove_thread(
	group_runq_t    rq,
	thread_t        thread,
	integer_t       thread_pri)
{
	circle_queue_t  queue = &rq->queues[thread_pri];
	boolean_t       result = FALSE;

	assert_thread_magic(thread);
	assert(thread->runq != PROCESSOR_NULL);

	circle_dequeue(queue, &thread->runq_links);

	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
	rq->count--;
	if (SCHED(priority_is_urgent)(thread_pri)) {
		rq->urgency--; assert(rq->urgency >= 0);
	}

	if (circle_queue_empty(queue)) {
		/* update run queue status */
		rq_bitmap_clear(rq->bitmap, thread_pri);
		rq->highq = bitmap_first(rq->bitmap, NRQS);
		result = TRUE;
	}

	thread->runq = PROCESSOR_NULL;

	return result;
}
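
/*
 * Bookkeeping pattern shared by the queue helpers above: every enqueue,
 * dequeue, and remove keeps rq->bitmap (which priority levels are
 * non-empty), rq->highq (highest non-empty level), rq->count, and
 * rq->urgency consistent, so peeking at the highest runnable priority
 * stays O(1) and never requires iterating the queues.
 */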

/*
 * A thread's sched pri may change out from under us because
 * we're clearing thread->runq here without the thread locked.
 * Do not rely on it to be the same as when we enqueued.
 */
static thread_t
sched_global_dequeue_thread(entry_queue_t main_entryq)
{
	boolean_t pri_level_empty = FALSE;
	sched_entry_t entry;
	group_runq_t group_runq;
	thread_t thread;
	integer_t thread_pri;
	sched_group_t group;

	assert(main_entryq->count > 0);

	entry = entry_queue_dequeue_entry(main_entryq);

	group = group_for_entry(entry);
	group_runq = &group->runq;

	thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);

	thread->runq = PROCESSOR_NULL;

	if (!pri_level_empty) {
		entry_queue_enqueue_entry(main_entryq, entry, SCHED_TAILQ);
	}

	return thread;
}

/* Dequeue a thread from the global runq without moving the entry */
static thread_t
sched_global_deep_drain_dequeue_thread(entry_queue_t main_entryq)
{
	boolean_t pri_level_empty = FALSE;
	sched_entry_t entry;
	group_runq_t group_runq;
	thread_t thread;
	integer_t thread_pri;
	sched_group_t group;

	assert(main_entryq->count > 0);

	entry = entry_queue_first_entry(main_entryq);

	group = group_for_entry(entry);
	group_runq = &group->runq;

	thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);

	thread->runq = PROCESSOR_NULL;

	if (pri_level_empty) {
		entry_queue_remove_entry(main_entryq, entry);
	}

	return thread;
}

static thread_t
sched_group_dequeue_thread(
	entry_queue_t main_entryq,
	sched_group_t group)
{
	group_runq_t group_runq = &group->runq;
	boolean_t pri_level_empty = FALSE;
	thread_t thread;
	integer_t thread_pri;

	thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);

	thread->runq = PROCESSOR_NULL;

	if (pri_level_empty) {
		entry_queue_remove_entry(main_entryq, group_entry_for_pri(group, thread_pri));
	}

	return thread;
}

static void
sched_group_remove_thread(
	entry_queue_t main_entryq,
	sched_group_t group,
	thread_t thread)
{
	integer_t thread_pri = thread->sched_pri;
	sched_entry_t sched_entry = group_entry_for_pri(group, thread_pri);

#if defined(MULTIQ_SANITY_CHECK)
	if (multiq_sanity_check) {
		global_check_entry_queue(main_entryq);
		group_check_run_queue(main_entryq, group);

		sched_group_check_thread(group, thread);
		entry_queue_check_entry(main_entryq, sched_entry, thread_pri);
	}
#endif

	boolean_t pri_level_empty = group_run_queue_remove_thread(&group->runq, thread, thread_pri);

	if (pri_level_empty) {
		entry_queue_remove_entry(main_entryq, sched_entry);
	}

#if defined(MULTIQ_SANITY_CHECK)
	if (multiq_sanity_check) {
		global_check_entry_queue(main_entryq);
		group_check_run_queue(main_entryq, group);
	}
#endif
}

static void
sched_group_enqueue_thread(
	entry_queue_t        main_entryq,
	sched_group_t        group,
	thread_t             thread,
	integer_t            options)
{
#if defined(MULTIQ_SANITY_CHECK)
	if (multiq_sanity_check) {
		global_check_entry_queue(main_entryq);
		group_check_run_queue(main_entryq, group);
	}
#endif

	int sched_pri = thread->sched_pri;

	boolean_t pri_level_was_empty = group_run_queue_enqueue_thread(&group->runq, thread, sched_pri, options);

	if (pri_level_was_empty) {
		/*
		 * TODO: Need to figure out if passing options here is a good idea or not
		 * What effects would it have?
		 */
		entry_queue_enqueue_entry(main_entryq, &group->entries[sched_pri], options);
	} else if (options & SCHED_HEADQ) {
		/* The thread should be at the head of the line - move its entry to the front */
		entry_queue_change_entry(main_entryq, &group->entries[sched_pri], options);
	}
}

/*
 *  Locate a thread to execute from the run queue and return it.
 *  Only choose a thread with greater or equal priority.
 *
 *  pset is locked, thread is not locked.
 *
 *  Returns THREAD_NULL if it cannot find a valid thread.
 *
 *  Note: we cannot rely on the value of thread->sched_pri in this path because
 *  we don't have the thread locked.
 *
 *  TODO: Remove tracepoints
 */
static thread_t
sched_multiq_choose_thread(
	processor_t      processor,
	int              priority,
	ast_t            reason)
{
	entry_queue_t   main_entryq = multiq_main_entryq(processor);
	run_queue_t     bound_runq  = multiq_bound_runq(processor);

	boolean_t choose_bound_runq = FALSE;

	if (bound_runq->highq < priority &&
	    main_entryq->highq < priority) {
		return THREAD_NULL;
	}

	if (bound_runq->count && main_entryq->count) {
		if (bound_runq->highq >= main_entryq->highq) {
			choose_bound_runq = TRUE;
		} else {
			/* Use main runq */
		}
	} else if (bound_runq->count) {
		choose_bound_runq = TRUE;
	} else if (main_entryq->count) {
		/* Use main runq */
	} else {
		return THREAD_NULL;
	}

	if (choose_bound_runq) {
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
		    MACH_MULTIQ_BOUND, main_entryq->highq, bound_runq->highq, 0, 0);

		return run_queue_dequeue(bound_runq, SCHED_HEADQ);
	}

	sched_group_t group = current_thread()->sched_group;

#if defined(MULTIQ_SANITY_CHECK)
	if (multiq_sanity_check) {
		global_check_entry_queue(main_entryq);
		group_check_run_queue(main_entryq, group);
	}
#endif

	/*
	 * Determine if we should look at the group or the global queue
	 *
	 * TODO:
	 * Perhaps pass reason as a 'should look inside' argument to choose_thread
	 * Should YIELD AST override drain limit?
	 */
	if (group->runq.count != 0 && (reason & AST_PREEMPTION) == 0) {
		boolean_t favor_group = TRUE;

		integer_t global_pri = main_entryq->highq;
		integer_t group_pri  = group->runq.highq;

		/*
		 * Favor the current group if the group is still the globally highest.
		 *
		 * Otherwise, consider choosing a thread from the current group
		 * even if it's lower priority than the global highest priority.
		 */
		if (global_pri > group_pri) {
			/*
			 * If there's something elsewhere above the depth limit,
			 * don't pick a thread below the limit.
			 */
			if (global_pri > drain_depth_limit && group_pri <= drain_depth_limit) {
				favor_group = FALSE;
			}

			/*
			 * If there's something at or above the ceiling,
			 * don't favor the group.
			 */
			if (global_pri >= drain_ceiling) {
				favor_group = FALSE;
			}

			/*
			 * Don't go more than X steps below the global highest
			 */
			if ((global_pri - group_pri) >= drain_band_limit) {
				favor_group = FALSE;
			}
		}

		if (favor_group) {
			/* Pull from local runq */
			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
			    MACH_MULTIQ_GROUP, global_pri, group_pri, 0, 0);

			return sched_group_dequeue_thread(main_entryq, group);
		}
	}

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
	    MACH_MULTIQ_GLOBAL, main_entryq->highq, group->runq.highq, 0, 0);

	/* Couldn't pull from local runq, pull from global runq instead */
	if (deep_drain) {
		return sched_global_deep_drain_dequeue_thread(main_entryq);
	} else {
		return sched_global_dequeue_thread(main_entryq);
	}
}
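
/*
 * Worked example of the drain limits above, assuming the default tunables
 * (drain_ceiling = BASEPRI_FOREGROUND, drain_depth_limit = MAXPRI_THROTTLE,
 * drain_band_limit = MAXPRI); the priorities are hypothetical:
 *
 *   global_pri = 40, group_pri = 35: the group is favored - the global
 *   highest is below the ceiling, the group is above the depth limit, and
 *   the gap of 5 is within the band.
 *
 *   global_pri = 60, group_pri = 35: another task has work at or above the
 *   ceiling, so the group is not favored and the global runq is drained.
 */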

/*
 * Thread must be locked, and not already be on a run queue.
 * pset is locked.
 */
static boolean_t
sched_multiq_processor_enqueue(
	processor_t      processor,
	thread_t         thread,
	sched_options_t  options)
{
	boolean_t       result;

	assert(processor == thread->chosen_processor);

	if (thread->bound_processor != PROCESSOR_NULL) {
		assert(thread->bound_processor == processor);

		result = run_queue_enqueue(multiq_bound_runq(processor), thread, options);
		thread->runq = processor;

		return result;
	}

	sched_group_enqueue_thread(multiq_main_entryq(processor),
	    thread->sched_group,
	    thread, options);

	thread->runq = processor;

	return FALSE;
}

/*
 * Called in the context of the thread, with the thread and pset unlocked,
 * after updating the thread's priority but before propagating that priority
 * to the processor.
 */
void
sched_multiq_quantum_expire(thread_t thread)
{
	if (deep_drain) {
		/*
		 * Move the entry at this priority to the end of the queue,
		 * to allow the next task a shot at running.
		 */

		processor_t processor = thread->last_processor;
		processor_set_t pset = processor->processor_set;
		entry_queue_t entryq = multiq_main_entryq(processor);

		pset_lock(pset);

		sched_entry_t entry = group_entry_for_pri(thread->sched_group, processor->current_pri);

		if (entry->runq == MULTIQ_ERUNQ) {
			entry_queue_change_entry(entryq, entry, SCHED_TAILQ);
		}

		pset_unlock(pset);
	}
}

static boolean_t
sched_multiq_processor_queue_empty(processor_t processor)
{
	return multiq_main_entryq(processor)->count == 0 &&
	       multiq_bound_runq(processor)->count == 0;
}

static ast_t
sched_multiq_processor_csw_check(processor_t processor)
{
	boolean_t       has_higher;
	int             pri;

	if (sched_multiq_thread_avoid_processor(processor, current_thread())) {
		return AST_PREEMPT | AST_URGENT;
	}

	entry_queue_t main_entryq = multiq_main_entryq(processor);
	run_queue_t   bound_runq  = multiq_bound_runq(processor);

	assert(processor->active_thread != NULL);

	pri = MAX(main_entryq->highq, bound_runq->highq);

	if (processor->first_timeslice) {
		has_higher = (pri > processor->current_pri);
	} else {
		has_higher = (pri >= processor->current_pri);
	}

	if (has_higher) {
		if (main_entryq->urgency > 0) {
			return AST_PREEMPT | AST_URGENT;
		}

		if (bound_runq->urgency > 0) {
			return AST_PREEMPT | AST_URGENT;
		}

		return AST_PREEMPT;
	}

	return AST_NONE;
}

static boolean_t
sched_multiq_processor_queue_has_priority(
	processor_t   processor,
	int           priority,
	boolean_t     gte)
{
	run_queue_t main_runq  = multiq_main_entryq(processor);
	run_queue_t bound_runq = multiq_bound_runq(processor);

	int qpri = MAX(main_runq->highq, bound_runq->highq);

	if (gte) {
		return qpri >= priority;
	} else {
		return qpri > priority;
	}
}

static int
sched_multiq_runq_count(processor_t processor)
{
	/*
	 *  TODO: Decide whether to keep a count of runnable threads in the pset
	 *  or just return something less than the true count.
	 *
	 *  This needs to be fast, so no iterating the whole runq.
	 *
	 *  Another possible decision is to remove this - with the global runq
	 *  it doesn't make much sense.
	 */
	return multiq_main_entryq(processor)->count + multiq_bound_runq(processor)->count;
}

static uint64_t
sched_multiq_runq_stats_count_sum(processor_t processor)
{
	/*
	 * TODO: This one does need to go through all the runqueues, but it's only needed for
	 * the sched stats tool
	 */

	uint64_t bound_sum = multiq_bound_runq(processor)->runq_stats.count_sum;

	if (processor->cpu_id == processor->processor_set->cpu_set_low) {
		return bound_sum + multiq_main_entryq(processor)->runq_stats.count_sum;
	} else {
		return bound_sum;
	}
}

static int
sched_multiq_processor_bound_count(processor_t processor)
{
	return multiq_bound_runq(processor)->count;
}

static void
sched_multiq_processor_queue_shutdown(processor_t processor)
{
	processor_set_t pset = processor->processor_set;
	entry_queue_t   main_entryq = multiq_main_entryq(processor);
	thread_t        thread;
	queue_head_t    tqueue;

	/* We only need to migrate threads if this is the last active processor in the pset */
	if (pset->online_processor_count > 0) {
		pset_unlock(pset);
		return;
	}

	queue_init(&tqueue);

	/* Note that we do not remove bound threads from the queues here */

	while (main_entryq->count > 0) {
		thread = sched_global_dequeue_thread(main_entryq);
		enqueue_tail(&tqueue, &thread->runq_links);
	}

	pset_unlock(pset);

	qe_foreach_element_safe(thread, &tqueue, runq_links) {
		remqueue(&thread->runq_links);

		thread_lock(thread);

		thread_setrun(thread, SCHED_TAILQ);

		thread_unlock(thread);
	}
}

/*
 * Thread is locked.
 *
 * This is why we can never read sched_pri unless we have the thread locked -
 * which we do in the enqueue and remove cases, but not the dequeue case.
 */
static boolean_t
sched_multiq_processor_queue_remove(
	processor_t processor,
	thread_t    thread)
{
	boolean_t removed = FALSE;
	processor_set_t pset = processor->processor_set;

	pset_lock(pset);

	if (thread->runq != PROCESSOR_NULL) {
		/*
		 * Thread is on a run queue and we have a lock on
		 * that run queue.
		 */

		assert(thread->runq == processor);

		if (thread->bound_processor != PROCESSOR_NULL) {
			assert(processor == thread->bound_processor);
			run_queue_remove(multiq_bound_runq(processor), thread);
			thread->runq = PROCESSOR_NULL;
		} else {
			sched_group_remove_thread(multiq_main_entryq(processor),
			    thread->sched_group,
			    thread);
		}

		removed = TRUE;
	}

	pset_unlock(pset);

	return removed;
}

/* pset is locked, returned unlocked */
static thread_t
sched_multiq_steal_thread(processor_set_t pset)
{
	pset_unlock(pset);
	return THREAD_NULL;
}

/*
 * Scan the global queue for candidate groups, and scan those groups for
 * candidate threads.
 *
 * TODO: This iterates every group runq in its entirety for each entry it has in the runq, which is O(N^2)
 *       Instead, iterate only the queue in the group runq matching the priority of the entry.
 *
 * Returns TRUE if retry is needed.
 */
static boolean_t
group_scan(entry_queue_t runq, sched_update_scan_context_t scan_context)
{
	int count       = runq->count;
	int queue_index;

	assert(count >= 0);

	if (count == 0) {
		return FALSE;
	}

	for (queue_index = bitmap_first(runq->bitmap, NRQS);
	    queue_index >= 0;
	    queue_index = bitmap_next(runq->bitmap, queue_index)) {
		sched_entry_t entry;

		cqe_foreach_element(entry, &runq->queues[queue_index], entry_links) {
			assert(count > 0);

			sched_group_t group = group_for_entry(entry);
			if (group->runq.count > 0) {
				if (runq_scan(&group->runq, scan_context)) {
					return TRUE;
				}
			}
			count--;
		}
	}

	return FALSE;
}

static void
sched_multiq_thread_update_scan(sched_update_scan_context_t scan_context)
{
	boolean_t               restart_needed = FALSE;
	processor_t             processor = processor_list;
	processor_set_t         pset;
	thread_t                thread;
	spl_t                   s;

	/*
	 *  We update the threads associated with each processor (bound and idle threads)
	 *  and then update the threads in each pset runqueue.
	 */

	do {
		do {
			pset = processor->processor_set;

			s = splsched();
			pset_lock(pset);

			restart_needed = runq_scan(multiq_bound_runq(processor), scan_context);

			pset_unlock(pset);
			splx(s);

			if (restart_needed) {
				break;
			}

			thread = processor->idle_thread;
			if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
				if (thread_update_add_thread(thread) == FALSE) {
					restart_needed = TRUE;
					break;
				}
			}
		} while ((processor = processor->processor_list) != NULL);

		/* Ok, we now have a collection of candidates -- fix them. */
		thread_update_process_threads();
	} while (restart_needed);

	pset = &pset0;

	do {
		do {
			s = splsched();
			pset_lock(pset);

			restart_needed = group_scan(&pset->pset_runq, scan_context);

			pset_unlock(pset);
			splx(s);

			if (restart_needed) {
				break;
			}
		} while ((pset = pset->pset_list) != NULL);

		/* Ok, we now have a collection of candidates -- fix them. */
		thread_update_process_threads();
	} while (restart_needed);
}

extern int sched_allow_rt_smt;

/* Return true if this thread should not continue running on this processor */
static bool
sched_multiq_thread_avoid_processor(processor_t processor, thread_t thread)
{
	if (processor->processor_primary != processor) {
		/*
		 * This is a secondary SMT processor.  If the primary is running
		 * a realtime thread, only allow realtime threads on the secondary.
		 */
		if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
			return true;
		}
	}

	return false;
}