/*
 * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */

#include <mach/boolean.h>
#include <mach/thread_switch.h>
#include <kern/counter.h>
#include <kern/ipc_kobject.h>
#include <kern/processor.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/spl.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/policy_internal.h>

#include <mach/policy.h>

#include <kern/syscall_subr.h>
#include <mach/mach_host_server.h>
#include <mach/mach_syscalls.h>
#include <sys/kdebug.h>
#include <kern/ast.h>

#if DEVELOPMENT || DEBUG
SCALABLE_COUNTER_DECLARE(mach_eventlink_handoff_success_count);
#endif /* DEVELOPMENT || DEBUG */

static void thread_depress_abstime(uint64_t interval);
static void thread_depress_ms(mach_msg_timeout_t interval);

/* Called from commpage to take a delayed preemption when exiting
 * the "Preemption Free Zone" (PFZ).
 */
kern_return_t
pfz_exit(
	__unused struct pfz_exit_args *args)
{
	/* For now, nothing special to do.  We'll pick up the ASTs on kernel exit. */

	return KERN_SUCCESS;
}


/*
 * swtch and swtch_pri both attempt to context switch (logic in
 * thread_block no-ops the context switch if nothing would happen).
 * A boolean is returned that indicates whether there is anything
 * else runnable.  That's no excuse to spin, though.
 */
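
/*
 * Illustrative userspace sketch (not part of this file): swtch_pri() is
 * the trap stub exposed to user programs, and work_available() is a
 * hypothetical caller-side predicate.  Per the note above, keep yielding
 * while other threads are runnable rather than spinning:
 *
 *	while (!work_available() && swtch_pri(0))
 *		continue;
 */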

static void
swtch_continue(void)
{
	processor_t     myprocessor;
	boolean_t       result;

	disable_preemption();
	myprocessor = current_processor();
	result = SCHED(thread_should_yield)(myprocessor, current_thread());
	enable_preemption();

	ml_delay_on_yield();

	thread_syscall_return(result);
	/*NOTREACHED*/
}

boolean_t
swtch(
	__unused struct swtch_args *args)
{
	processor_t myprocessor;

	disable_preemption();
	myprocessor = current_processor();
	if (!SCHED(thread_should_yield)(myprocessor, current_thread())) {
		mp_enable_preemption();

		return FALSE;
	}
	enable_preemption();

	thread_yield_with_continuation((thread_continue_t)swtch_continue, NULL);
}

static void
swtch_pri_continue(void)
{
	processor_t     myprocessor;
	boolean_t       result;

	thread_depress_abort(current_thread());

	disable_preemption();
	myprocessor = current_processor();
	result = SCHED(thread_should_yield)(myprocessor, current_thread());
	mp_enable_preemption();

	ml_delay_on_yield();

	thread_syscall_return(result);
	/*NOTREACHED*/
}

boolean_t
swtch_pri(
	__unused struct swtch_pri_args *args)
{
	processor_t myprocessor;

	disable_preemption();
	myprocessor = current_processor();
	if (!SCHED(thread_should_yield)(myprocessor, current_thread())) {
		mp_enable_preemption();

		return FALSE;
	}
	enable_preemption();

	thread_depress_abstime(thread_depress_time);

	thread_yield_with_continuation((thread_continue_t)swtch_pri_continue, NULL);
}

static void
thread_switch_continue(void *parameter, __unused int ret)
{
	thread_t self = current_thread();
	int option = (int)(intptr_t)parameter;

	if (option == SWITCH_OPTION_DEPRESS || option == SWITCH_OPTION_OSLOCK_DEPRESS) {
		thread_depress_abort(self);
	}

	ml_delay_on_yield();

	thread_syscall_return(KERN_SUCCESS);
	/*NOTREACHED*/
}

/*
 * thread_switch:
 *
 * Context switch.  User may supply thread hint.
 */
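/*
 * Illustrative userspace calls (a sketch, not part of this file):
 * 'target_port' is a hypothetical send right for another thread, and the
 * SWITCH_OPTION_* constants come from <mach/thread_switch.h>.
 *
 *	thread_switch(MACH_PORT_NULL, SWITCH_OPTION_DEPRESS, 10);
 *		-> yield with priority depressed for up to 10 ms
 *
 *	thread_switch(target_port, SWITCH_OPTION_NONE, 0);
 *		-> attempt a direct handoff to a specific thread
 */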
kern_return_t
thread_switch(
	struct thread_switch_args *args)
{
	thread_t thread = THREAD_NULL;
	thread_t self = current_thread();
	mach_port_name_t thread_name = args->thread_name;
	int option = args->option;
	mach_msg_timeout_t option_time = args->option_time;
	uint32_t scale_factor = NSEC_PER_MSEC;
	boolean_t depress_option = FALSE;
	boolean_t wait_option = FALSE;
	wait_interrupt_t interruptible = THREAD_ABORTSAFE;
	port_intrans_options_t ptt_options = PORT_INTRANS_THREAD_NOT_CURRENT_THREAD;

	/*
	 * Validate and process option.
	 *
	 * OSLock boosting only applies to other threads
	 * in your same task (even if you have a port for
	 * a thread in another task)
	 */
	switch (option) {
	case SWITCH_OPTION_NONE:
		break;
	case SWITCH_OPTION_WAIT:
		wait_option = TRUE;
		break;
	case SWITCH_OPTION_DEPRESS:
		depress_option = TRUE;
		break;
	case SWITCH_OPTION_DISPATCH_CONTENTION:
		scale_factor = NSEC_PER_USEC;
		wait_option = TRUE;
		interruptible |= THREAD_WAIT_NOREPORT;
		break;
	case SWITCH_OPTION_OSLOCK_DEPRESS:
		depress_option = TRUE;
		interruptible |= THREAD_WAIT_NOREPORT;
		ptt_options |= PORT_INTRANS_THREAD_IN_CURRENT_TASK;
		break;
	case SWITCH_OPTION_OSLOCK_WAIT:
		wait_option = TRUE;
		interruptible |= THREAD_WAIT_NOREPORT;
		ptt_options |= PORT_INTRANS_THREAD_IN_CURRENT_TASK;
		break;
	default:
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * Translate the port name if supplied.
	 */
	if (thread_name != MACH_PORT_NULL) {
		thread = port_name_to_thread(thread_name, ptt_options);
	}

	if (option == SWITCH_OPTION_OSLOCK_DEPRESS || option == SWITCH_OPTION_OSLOCK_WAIT) {
		if (thread != THREAD_NULL) {
			/*
			 * Attempt to kick the lock owner up to our same IO throttling tier.
			 * If the thread is currently blocked in throttle_lowpri_io(),
			 * it will immediately break out.
			 *
			 * TODO: SFI break out?
			 */
			int new_policy = proc_get_effective_thread_policy(self, TASK_POLICY_IO);

			set_thread_iotier_override(thread, new_policy);
		}
	}

	/*
	 * Try to handoff if supplied.
	 */
	if (thread != THREAD_NULL) {
		spl_t s = splsched();

		/* This may return a different thread if the target is pushing on something */
		thread_t pulled_thread = thread_run_queue_remove_for_handoff(thread);

		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SWITCH) | DBG_FUNC_NONE,
		    thread_tid(thread), thread->state,
		    pulled_thread ? TRUE : FALSE, 0, 0);

		if (pulled_thread != THREAD_NULL) {
#if DEVELOPMENT || DEBUG
			counter_inc_preemption_disabled(&mach_eventlink_handoff_success_count);
#endif /* DEVELOPMENT || DEBUG */

			/* We can't be dropping the last ref here */
			thread_deallocate_safe(thread);

			if (wait_option) {
				assert_wait_timeout((event_t)assert_wait_timeout, interruptible,
				    option_time, scale_factor);
			} else if (depress_option) {
				thread_depress_ms(option_time);
			}

			thread_run(self, thread_switch_continue, (void *)(intptr_t)option, pulled_thread);
			__builtin_unreachable();
		}

		splx(s);

		thread_deallocate(thread);
	}

	if (wait_option) {
		assert_wait_timeout((event_t)assert_wait_timeout, interruptible, option_time, scale_factor);
	} else {
		disable_preemption();
		bool should_yield = SCHED(thread_should_yield)(current_processor(), current_thread());
		enable_preemption();

		if (should_yield == false) {
			/* Early-return if yielding to the scheduler will not be beneficial */
			return KERN_SUCCESS;
		}

		if (depress_option) {
			thread_depress_ms(option_time);
		}
	}

	thread_yield_with_continuation(thread_switch_continue, (void *)(intptr_t)option);
	__builtin_unreachable();
}

void
thread_yield_with_continuation(
	thread_continue_t continuation,
	void *parameter)
{
	assert(continuation);
	thread_block_reason(continuation, parameter, AST_YIELD);
	__builtin_unreachable();
}

/* This function is called after an assert_wait(), therefore it must not
 * cause another wait until after the thread_run() or thread_block().
 *
 * The following are the calling conventions for thread ref deallocation:
 *
 * 1) If no continuation is provided, then the thread ref is consumed
 *    (thread_handoff_deallocate convention).
 *
 * 2) If a continuation is provided with option THREAD_HANDOFF_SETRUN_NEEDED,
 *    then the thread ref is always consumed.
 *
 * 3) If a continuation is provided with option THREAD_HANDOFF_NONE, then the
 *    thread ref is not consumed and it is up to the continuation to
 *    deallocate the thread reference.
 */
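/*
 * Condensed sketch of convention (1) above (illustrative only): 'target'
 * is a thread_t on which the caller holds a reference, and the handoff
 * consumes that reference.
 *
 *	wait_result_t wr = thread_handoff_deallocate(target, THREAD_HANDOFF_NONE);
 */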
static wait_result_t
thread_handoff_internal(thread_t thread, thread_continue_t continuation,
    void *parameter, thread_handoff_option_t option)
{
	thread_t self = current_thread();

	/*
	 * Try to handoff if supplied.
	 */
	if (thread != THREAD_NULL) {
		spl_t s = splsched();

		thread_t pulled_thread = thread_prepare_for_handoff(thread, option);

		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SWITCH) | DBG_FUNC_NONE,
		    thread_tid(thread), thread->state,
		    pulled_thread ? TRUE : FALSE, 0, 0);

		/* Deallocate thread ref if needed */
		if (continuation == NULL || (option & THREAD_HANDOFF_SETRUN_NEEDED)) {
			/* Use the safe version of thread deallocate */
			thread_deallocate_safe(thread);
		}

		if (pulled_thread != THREAD_NULL) {
#if DEVELOPMENT || DEBUG
			counter_inc_preemption_disabled(&mach_eventlink_handoff_success_count);
#endif /* DEVELOPMENT || DEBUG */

			int result = thread_run(self, continuation, parameter, pulled_thread);

			splx(s);
			return result;
		}

		splx(s);
	}

	int result = thread_block_parameter(continuation, parameter);
	return result;
}

void
thread_handoff_parameter(thread_t thread, thread_continue_t continuation,
    void *parameter, thread_handoff_option_t option)
{
	thread_handoff_internal(thread, continuation, parameter, option);
	panic("NULL continuation passed to %s", __func__);
	__builtin_unreachable();
}

wait_result_t
thread_handoff_deallocate(thread_t thread, thread_handoff_option_t option)
{
	return thread_handoff_internal(thread, NULL, NULL, option);
}

/*
 * Thread depression
 *
 * This mechanism drops a thread to priority 0 in order for it to yield to
 * all other runnable threads on the system.  It can be canceled or timed out,
 * whereupon the thread goes back to where it was.
 *
 * Note that TH_SFLAG_DEPRESS and TH_SFLAG_POLLDEPRESS are never set at the
 * same time.  DEPRESS always defers to POLLDEPRESS.
 *
 * DEPRESS only lasts across a single thread_block call, and never returns
 * to userspace.
 * POLLDEPRESS can be active anywhere up until thread termination.
 */
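
/*
 * Condensed sketch of the DEPRESS pattern as used in this file (see
 * swtch_pri above); 'continue_fn' stands in for the continuation:
 *
 *	thread_depress_abstime(thread_depress_time);        // drop to priority 0
 *	thread_yield_with_continuation(continue_fn, NULL);  // block with AST_YIELD
 *
 * ...and then, from the continuation:
 *
 *	thread_depress_abort(current_thread());             // restore priority
 */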

void
thread_depress_timer_setup(thread_t self)
{
	self->depress_timer = kalloc_type(struct timer_call,
	    Z_ZERO | Z_WAITOK | Z_NOFAIL);
	timer_call_setup(self->depress_timer, thread_depress_expire, self);
}

/*
 * Depress thread's priority to lowest possible for the specified interval,
 * with an interval of zero resulting in no timeout being scheduled.
 *
 * Must block with AST_YIELD afterwards to take effect
 */
void
thread_depress_abstime(uint64_t interval)
{
	thread_t self = current_thread();

	spl_t s = splsched();
	thread_lock(self);

	assert((self->sched_flags & TH_SFLAG_DEPRESS) == 0);

	if ((self->sched_flags & TH_SFLAG_POLLDEPRESS) == 0) {
		self->sched_flags |= TH_SFLAG_DEPRESS;
		thread_recompute_sched_pri(self, SETPRI_LAZY);

		if (interval != 0) {
			uint64_t deadline;

			clock_absolutetime_interval_to_deadline(interval, &deadline);
			if (!timer_call_enter(self->depress_timer, deadline, TIMER_CALL_USER_CRITICAL)) {
				self->depress_timer_active++;
			}
		}
	}

	thread_unlock(self);
	splx(s);
}

void
thread_depress_ms(mach_msg_timeout_t interval)
{
	uint64_t abstime;

	clock_interval_to_absolutetime_interval(interval, NSEC_PER_MSEC, &abstime);
	thread_depress_abstime(abstime);
}

/*
 * Priority depression expiration.
 */
void
thread_depress_expire(void *p0,
    __unused void *p1)
{
	thread_t thread = (thread_t)p0;

	spl_t s = splsched();
	thread_lock(thread);

	assert((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) != TH_SFLAG_DEPRESSED_MASK);

	if (--thread->depress_timer_active == 0) {
		thread->sched_flags &= ~TH_SFLAG_DEPRESSED_MASK;
		if ((thread->state & TH_RUN) == TH_RUN) {
			thread->last_basepri_change_time = mach_absolute_time();
		}
		thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
	}

	thread_unlock(thread);
	splx(s);
}

/*
 * Prematurely abort priority depression if there is one.
 */
kern_return_t
thread_depress_abort(thread_t thread)
{
	kern_return_t result = KERN_NOT_DEPRESSED;

	spl_t s = splsched();
	thread_lock(thread);

	assert((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) != TH_SFLAG_DEPRESSED_MASK);

	/*
	 * User-triggered depress-aborts should not get out
	 * of the poll-depress, but they should cancel a regular depress.
	 */
	if ((thread->sched_flags & TH_SFLAG_POLLDEPRESS) == 0) {
		result = thread_depress_abort_locked(thread);
	}

	thread_unlock(thread);
	splx(s);

	return result;
}

/*
 * Prematurely abort priority depression or poll depression if one is active.
 * Called with the thread locked.
 */
kern_return_t
thread_depress_abort_locked(thread_t thread)
{
	if ((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) == 0) {
		return KERN_NOT_DEPRESSED;
	}

	assert((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) != TH_SFLAG_DEPRESSED_MASK);

	thread->sched_flags &= ~TH_SFLAG_DEPRESSED_MASK;
	if ((thread->state & TH_RUN) == TH_RUN) {
		thread->last_basepri_change_time = mach_absolute_time();
	}

	thread_recompute_sched_pri(thread, SETPRI_LAZY);

	if (timer_call_cancel(thread->depress_timer)) {
		thread->depress_timer_active--;
	}

	return KERN_SUCCESS;
}

/*
 * Invoked as part of a polling operation like a no-timeout port receive
 *
 * Forces a fixpri thread to yield if it is detected polling without blocking for too long.
 */
void
thread_poll_yield(thread_t self)
{
	assert(self == current_thread());
	assert((self->sched_flags & TH_SFLAG_DEPRESS) == 0);

	if (self->sched_mode != TH_MODE_FIXED) {
		return;
	}

	spl_t s = splsched();

	uint64_t abstime = mach_absolute_time();
	uint64_t total_computation = abstime -
	    self->computation_epoch + self->computation_metered;

	if (total_computation >= max_poll_computation) {
		thread_lock(self);

		self->computation_epoch = abstime;
		self->computation_interrupt_epoch = recount_current_thread_interrupt_time_mach();
		self->computation_metered = 0;

		uint64_t yield_expiration = abstime +
		    (total_computation >> sched_poll_yield_shift);
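
		/*
		 * Illustrative arithmetic: a poll-yield shift of 4 depresses
		 * the thread for 1/16th of its accumulated computation time,
		 * e.g. 16ms of unblocked polling earns a ~1ms depression
		 * before thread_depress_expire() restores the priority.
		 */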

		if (!timer_call_enter(self->depress_timer, yield_expiration,
		    TIMER_CALL_USER_CRITICAL)) {
			self->depress_timer_active++;
		}

		self->sched_flags |= TH_SFLAG_POLLDEPRESS;
		thread_recompute_sched_pri(self, SETPRI_DEFAULT);

		thread_unlock(self);
	}
	splx(s);
}

/*
 * Kernel-internal interface to yield for a specified period
 *
 * WARNING: Will still yield to priority 0 even if the thread is holding a contended lock!
 */
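/*
 * Illustrative usage (a sketch): yield the CPU for roughly one
 * millisecond, if the scheduler agrees that yielding is worthwhile:
 *
 *	thread_yield_internal(1);
 */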
void
thread_yield_internal(mach_msg_timeout_t ms)
{
	thread_t self = current_thread();

	assert((self->sched_flags & TH_SFLAG_DEPRESSED_MASK) != TH_SFLAG_DEPRESSED_MASK);

	processor_t myprocessor;

	disable_preemption();
	myprocessor = current_processor();
	if (!SCHED(thread_should_yield)(myprocessor, self)) {
		mp_enable_preemption();

		return;
	}
	enable_preemption();

	thread_depress_ms(ms);

	thread_block_reason(THREAD_CONTINUE_NULL, NULL, AST_YIELD);

	thread_depress_abort(self);
}

/*
 * This yields to a possible non-urgent preemption pending on the current processor.
 *
 * This is useful when doing a long computation in the kernel without returning to userspace.
 *
 * As opposed to other yielding mechanisms, this does not drop the priority of the current thread.
 */
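/*
 * Illustrative usage (a sketch; process_item() is a hypothetical
 * long-running loop body in kernel code):
 *
 *	for (size_t i = 0; i < n; i++) {
 *		process_item(i);
 *		if ((i % 1024) == 0)
 *			thread_yield_to_preemption();
 *	}
 */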
void
thread_yield_to_preemption()
{
	/*
	 * ast_pending() should ideally be called with interrupts disabled, but
	 * the check here is fine because csw_check() will do the right thing.
	 */
	ast_t *pending_ast = ast_pending();
	ast_t ast = AST_NONE;
	processor_t p;

	if (*pending_ast & AST_PREEMPT) {
		thread_t self = current_thread();

		spl_t s = splsched();

		p = current_processor();
		thread_lock(self);
		ast = csw_check(self, p, AST_YIELD);
		ast_on(ast);
		thread_unlock(self);

		if (ast != AST_NONE) {
			(void)thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
		}

		splx(s);
	}
}