xref: /xnu-10002.1.13/osfmk/kern/syscall_subr.c (revision 1031c584a5e37aff177559b9f69dbd3c8c3fd30a)
/*
 * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */

#include <mach/boolean.h>
#include <mach/thread_switch.h>
#include <ipc/ipc_port.h>
#include <ipc/ipc_space.h>
#include <kern/counter.h>
#include <kern/ipc_kobject.h>
#include <kern/processor.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/spl.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/policy_internal.h>

#include <mach/policy.h>

#include <kern/syscall_subr.h>
#include <mach/mach_host_server.h>
#include <mach/mach_syscalls.h>
#include <sys/kdebug.h>
#include <kern/ast.h>

static void thread_depress_abstime(uint64_t interval);
static void thread_depress_ms(mach_msg_timeout_t interval);

/* Called from commpage to take a delayed preemption when exiting
 * the "Preemption Free Zone" (PFZ).
 */
kern_return_t
pfz_exit(
	__unused        struct pfz_exit_args *args)
{
	/* For now, nothing special to do.  We'll pick up the ASTs on kernel exit. */

	return KERN_SUCCESS;
}


/*
 *	swtch and swtch_pri both attempt to context switch (logic in
 *	thread_block no-ops the context switch if nothing would happen).
 *	A boolean is returned that indicates whether there is anything
 *	else runnable.  That's no excuse to spin, though.
 */

static void
swtch_continue(void)
{
	processor_t     myprocessor;
	boolean_t       result;

	disable_preemption();
	myprocessor = current_processor();
	result = SCHED(thread_should_yield)(myprocessor, current_thread());
	enable_preemption();

	ml_delay_on_yield();

	thread_syscall_return(result);
	/*NOTREACHED*/
}

boolean_t
swtch(
	__unused struct swtch_args *args)
{
	processor_t     myprocessor;

	disable_preemption();
	myprocessor = current_processor();
	if (!SCHED(thread_should_yield)(myprocessor, current_thread())) {
		mp_enable_preemption();

		return FALSE;
	}
	enable_preemption();

	thread_yield_with_continuation((thread_continue_t)swtch_continue, NULL);
}

static void
swtch_pri_continue(void)
{
	processor_t     myprocessor;
	boolean_t       result;

	thread_depress_abort(current_thread());

	disable_preemption();
	myprocessor = current_processor();
	result = SCHED(thread_should_yield)(myprocessor, current_thread());
	mp_enable_preemption();

	ml_delay_on_yield();

	thread_syscall_return(result);
	/*NOTREACHED*/
}

boolean_t
swtch_pri(
	__unused        struct swtch_pri_args *args)
{
	processor_t     myprocessor;

	disable_preemption();
	myprocessor = current_processor();
	if (!SCHED(thread_should_yield)(myprocessor, current_thread())) {
		mp_enable_preemption();

		return FALSE;
	}
	enable_preemption();

	thread_depress_abstime(thread_depress_time);

	thread_yield_with_continuation((thread_continue_t)swtch_pri_continue, NULL);
}
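
/*
 * Illustrative userspace sketch (not part of this file; try_lock() is a
 * hypothetical helper): swtch_pri(), declared for user programs in
 * <mach/mach_traps.h>, is the classic spin-then-yield backoff primitive.
 * Note that the priority argument is ignored by the kernel (args is
 * __unused above); the depression interval is the global
 * thread_depress_time.
 *
 *	while (!try_lock(&lock)) {
 *		(void)swtch_pri(0);
 *	}
 */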

static void
thread_switch_continue(void *parameter, __unused int ret)
{
	thread_t self = current_thread();
	int option = (int)(intptr_t)parameter;

	if (option == SWITCH_OPTION_DEPRESS || option == SWITCH_OPTION_OSLOCK_DEPRESS) {
		thread_depress_abort(self);
	}

	ml_delay_on_yield();

	thread_syscall_return(KERN_SUCCESS);
	/*NOTREACHED*/
}

/*
 *	thread_switch:
 *
 *	Context switch.  User may supply thread hint.
 */
kern_return_t
thread_switch(
	struct thread_switch_args *args)
{
	thread_t                        thread = THREAD_NULL;
	thread_t                        self = current_thread();
	mach_port_name_t                thread_name = args->thread_name;
	int                             option = args->option;
	mach_msg_timeout_t              option_time = args->option_time;
	uint32_t                        scale_factor = NSEC_PER_MSEC;
	boolean_t                       depress_option = FALSE;
	boolean_t                       wait_option = FALSE;
	wait_interrupt_t                interruptible = THREAD_ABORTSAFE;
	port_intrans_options_t          ptt_options = PORT_INTRANS_THREAD_NOT_CURRENT_THREAD;

	/*
	 *	Validate and process the option.
	 *
	 *	OSLock boosting only applies to other threads in the
	 *	caller's task (even if the caller holds a port for a
	 *	thread in another task).
	 */
	switch (option) {
	case SWITCH_OPTION_NONE:
		break;
	case SWITCH_OPTION_WAIT:
		wait_option = TRUE;
		break;
	case SWITCH_OPTION_DEPRESS:
		depress_option = TRUE;
		break;
	case SWITCH_OPTION_DISPATCH_CONTENTION:
		scale_factor = NSEC_PER_USEC;
		wait_option = TRUE;
		interruptible |= THREAD_WAIT_NOREPORT;
		break;
	case SWITCH_OPTION_OSLOCK_DEPRESS:
		depress_option = TRUE;
		interruptible |= THREAD_WAIT_NOREPORT;
		ptt_options |= PORT_INTRANS_THREAD_IN_CURRENT_TASK;
		break;
	case SWITCH_OPTION_OSLOCK_WAIT:
		wait_option = TRUE;
		interruptible |= THREAD_WAIT_NOREPORT;
		ptt_options |= PORT_INTRANS_THREAD_IN_CURRENT_TASK;
		break;
	default:
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * Translate the port name if supplied.
	 */
	if (thread_name != MACH_PORT_NULL) {
		thread = port_name_to_thread(thread_name, ptt_options);
	}

	if (option == SWITCH_OPTION_OSLOCK_DEPRESS || option == SWITCH_OPTION_OSLOCK_WAIT) {
		if (thread != THREAD_NULL) {
			/*
			 * Attempt to kick the lock owner up to our same IO throttling tier.
			 * If the thread is currently blocked in throttle_lowpri_io(),
			 * it will immediately break out.
			 *
			 * TODO: SFI break out?
			 */
			int new_policy = proc_get_effective_thread_policy(self, TASK_POLICY_IO);

			set_thread_iotier_override(thread, new_policy);
		}
	}

	/*
	 * Try to hand off if a thread was supplied.
	 */
	if (thread != THREAD_NULL) {
		spl_t s = splsched();

		/* This may return a different thread if the target is pushing on something */
		thread_t pulled_thread = thread_run_queue_remove_for_handoff(thread);

		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SWITCH) | DBG_FUNC_NONE,
		    thread_tid(thread), thread->state,
		    pulled_thread ? TRUE : FALSE, 0, 0);

		if (pulled_thread != THREAD_NULL) {
			/* We can't be dropping the last ref here */
			thread_deallocate_safe(thread);

			if (wait_option) {
				assert_wait_timeout((event_t)assert_wait_timeout, interruptible,
				    option_time, scale_factor);
			} else if (depress_option) {
				thread_depress_ms(option_time);
			}

			thread_run(self, thread_switch_continue, (void *)(intptr_t)option, pulled_thread);
			__builtin_unreachable();
		}

		splx(s);

		thread_deallocate(thread);
	}

	if (wait_option) {
		assert_wait_timeout((event_t)assert_wait_timeout, interruptible, option_time, scale_factor);
	} else {
		disable_preemption();
		bool should_yield = SCHED(thread_should_yield)(current_processor(), current_thread());
		enable_preemption();

		if (should_yield == false) {
			/* Early-return if yielding to the scheduler will not be beneficial */
			return KERN_SUCCESS;
		}

		if (depress_option) {
			thread_depress_ms(option_time);
		}
	}

	thread_yield_with_continuation(thread_switch_continue, (void *)(intptr_t)option);
	__builtin_unreachable();
}
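
/*
 * Illustrative userspace sketch (not part of this file; owner_port is a
 * hypothetical send right for the owning thread): a directed handoff to a
 * lock owner with a 10 ms depression, using the trap declaration from
 * <mach/mach_traps.h> and the option constants from <mach/thread_switch.h>:
 *
 *	kern_return_t kr = thread_switch(owner_port, SWITCH_OPTION_DEPRESS, 10);
 *
 * With SWITCH_OPTION_DEPRESS, option_time is interpreted in milliseconds
 * (scale_factor == NSEC_PER_MSEC above), while
 * SWITCH_OPTION_DISPATCH_CONTENTION switches the scale to microseconds.
 */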

void
thread_yield_with_continuation(
	thread_continue_t       continuation,
	void                    *parameter)
{
	assert(continuation);
	thread_block_reason(continuation, parameter, AST_YIELD);
	__builtin_unreachable();
}

/* This function is called after an assert_wait(); therefore it must not
 * cause another wait until after the thread_run() or thread_block().
 *
 * The following are the calling conventions for thread ref deallocation:
 *
 * 1) If no continuation is provided, then the thread ref is consumed
 * (the thread_handoff_deallocate convention).
 *
 * 2) If a continuation is provided with option THREAD_HANDOFF_SETRUN_NEEDED,
 * then the thread ref is always consumed.
 *
 * 3) If a continuation is provided with option THREAD_HANDOFF_NONE, then the
 * thread ref is not consumed and it is up to the continuation to deallocate
 * the thread reference.
 */
static wait_result_t
thread_handoff_internal(thread_t thread, thread_continue_t continuation,
    void *parameter, thread_handoff_option_t option)
{
	thread_t self = current_thread();

	/*
	 * Try to hand off if a thread was supplied.
	 */
	if (thread != THREAD_NULL) {
		spl_t s = splsched();

		thread_t pulled_thread = thread_prepare_for_handoff(thread, option);

		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_THREAD_SWITCH) | DBG_FUNC_NONE,
		    thread_tid(thread), thread->state,
		    pulled_thread ? TRUE : FALSE, 0, 0);

		/* Deallocate the thread ref if needed */
		if (continuation == NULL || (option & THREAD_HANDOFF_SETRUN_NEEDED)) {
			/* Use the safe version of thread deallocate */
			thread_deallocate_safe(thread);
		}

		if (pulled_thread != THREAD_NULL) {
			int result = thread_run(self, continuation, parameter, pulled_thread);

			splx(s);
			return result;
		}

		splx(s);
	}

	int result = thread_block_parameter(continuation, parameter);
	return result;
}

void
thread_handoff_parameter(thread_t thread, thread_continue_t continuation,
    void *parameter, thread_handoff_option_t option)
{
	thread_handoff_internal(thread, continuation, parameter, option);
	panic("NULL continuation passed to %s", __func__);
	__builtin_unreachable();
}

wait_result_t
thread_handoff_deallocate(thread_t thread, thread_handoff_option_t option)
{
	return thread_handoff_internal(thread, NULL, NULL, option);
}
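
/*
 * Illustrative kernel-internal sketch (an assumption, not a call site in
 * this file): convention (1) above, handing off to a thread reference the
 * caller owns and wants consumed:
 *
 *	thread_t target = ...;   (caller holds a +1 ref on target)
 *	wait_result_t wr = thread_handoff_deallocate(target, THREAD_HANDOFF_NONE);
 *
 * Because no continuation is passed, thread_handoff_internal() drops the
 * reference whether or not the handoff actually pulls the target thread.
 */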

/*
 * Thread depression
 *
 * This mechanism drops a thread to priority 0 in order for it to yield to
 * all other runnable threads on the system.  It can be canceled or timed out,
 * whereupon the thread goes back to where it was.
 *
 * Note that TH_SFLAG_DEPRESS and TH_SFLAG_POLLDEPRESS are never set at the
 * same time.  DEPRESS always defers to POLLDEPRESS.
 *
 * DEPRESS only lasts across a single thread_block call, and never returns
 * to userspace.
 * POLLDEPRESS can be active anywhere up until thread termination.
 */
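
/*
 * Illustrative lifecycle sketch (an assumption; it mirrors
 * thread_yield_internal() below): depress, block with AST_YIELD so the
 * depression takes effect, then clean up on wakeup:
 *
 *	thread_depress_ms(5);
 *	thread_block_reason(THREAD_CONTINUE_NULL, NULL, AST_YIELD);
 *	thread_depress_abort(current_thread());
 */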

void
thread_depress_timer_setup(thread_t self)
{
	self->depress_timer = kalloc_type(struct timer_call,
	    Z_ZERO | Z_WAITOK | Z_NOFAIL);
	timer_call_setup(self->depress_timer, thread_depress_expire, self);
}

/*
 * Depress the thread's priority to the lowest possible for the specified
 * interval, with an interval of zero resulting in no timeout being scheduled.
 *
 * The thread must block with AST_YIELD afterwards for this to take effect.
 */
void
thread_depress_abstime(uint64_t interval)
{
	thread_t self = current_thread();

	spl_t s = splsched();
	thread_lock(self);

	assert((self->sched_flags & TH_SFLAG_DEPRESS) == 0);

	if ((self->sched_flags & TH_SFLAG_POLLDEPRESS) == 0) {
		self->sched_flags |= TH_SFLAG_DEPRESS;
		thread_recompute_sched_pri(self, SETPRI_LAZY);

		if (interval != 0) {
			uint64_t deadline;

			clock_absolutetime_interval_to_deadline(interval, &deadline);
			if (!timer_call_enter(self->depress_timer, deadline, TIMER_CALL_USER_CRITICAL)) {
				self->depress_timer_active++;
			}
		}
	}

	thread_unlock(self);
	splx(s);
}

void
thread_depress_ms(mach_msg_timeout_t interval)
{
	uint64_t abstime;

	clock_interval_to_absolutetime_interval(interval, NSEC_PER_MSEC, &abstime);
	thread_depress_abstime(abstime);
}
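
/*
 * For example (illustrative): thread_depress_ms(10) scales 10 ms, i.e.
 * 10 * NSEC_PER_MSEC nanoseconds, into absolute-time units and then behaves
 * exactly like thread_depress_abstime() of that interval.
 */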

/*
 *	Priority depression expiration.
 */
void
thread_depress_expire(void      *p0,
    __unused void      *p1)
{
	thread_t thread = (thread_t)p0;

	spl_t s = splsched();
	thread_lock(thread);

	assert((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) != TH_SFLAG_DEPRESSED_MASK);

	if (--thread->depress_timer_active == 0) {
		thread->sched_flags &= ~TH_SFLAG_DEPRESSED_MASK;
		if ((thread->state & TH_RUN) == TH_RUN) {
			thread->last_basepri_change_time = mach_absolute_time();
		}
		thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
	}

	thread_unlock(thread);
	splx(s);
}

/*
 * Prematurely abort priority depression if there is one.
 */
kern_return_t
thread_depress_abort(thread_t thread)
{
	kern_return_t result = KERN_NOT_DEPRESSED;

	spl_t s = splsched();
	thread_lock(thread);

	assert((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) != TH_SFLAG_DEPRESSED_MASK);

	/*
	 * User-triggered depress-aborts should not break a thread out of
	 * poll-depress, but they should cancel a regular depress.
	 */
	if ((thread->sched_flags & TH_SFLAG_POLLDEPRESS) == 0) {
		result = thread_depress_abort_locked(thread);
	}

	thread_unlock(thread);
	splx(s);

	return result;
}

/*
 * Prematurely abort priority depression or poll depression if one is active.
 * Called with the thread locked.
 */
kern_return_t
thread_depress_abort_locked(thread_t thread)
{
	if ((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) == 0) {
		return KERN_NOT_DEPRESSED;
	}

	assert((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) != TH_SFLAG_DEPRESSED_MASK);

	thread->sched_flags &= ~TH_SFLAG_DEPRESSED_MASK;
	if ((thread->state & TH_RUN) == TH_RUN) {
		thread->last_basepri_change_time = mach_absolute_time();
	}

	thread_recompute_sched_pri(thread, SETPRI_LAZY);

	if (timer_call_cancel(thread->depress_timer)) {
		thread->depress_timer_active--;
	}

	return KERN_SUCCESS;
}

/*
 * Invoked as part of a polling operation like a no-timeout port receive.
 *
 * Forces a fixed-priority thread to yield if it has been polling without
 * blocking for too long.
 */
void
thread_poll_yield(thread_t self)
{
	assert(self == current_thread());
	assert((self->sched_flags & TH_SFLAG_DEPRESS) == 0);

	if (self->sched_mode != TH_MODE_FIXED) {
		return;
	}

	spl_t s = splsched();

	uint64_t abstime = mach_absolute_time();
	uint64_t total_computation = abstime -
	    self->computation_epoch + self->computation_metered;

	if (total_computation >= max_poll_computation) {
		thread_lock(self);

		self->computation_epoch   = abstime;
		self->computation_metered = 0;

		uint64_t yield_expiration = abstime +
		    (total_computation >> sched_poll_yield_shift);

		if (!timer_call_enter(self->depress_timer, yield_expiration,
		    TIMER_CALL_USER_CRITICAL)) {
			self->depress_timer_active++;
		}

		self->sched_flags |= TH_SFLAG_POLLDEPRESS;
		thread_recompute_sched_pri(self, SETPRI_DEFAULT);

		thread_unlock(self);
	}
	splx(s);
}
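
/*
 * Worked example (illustrative; assumes sched_poll_yield_shift == 4): a
 * fixed-priority thread that has accumulated 16 ms of computation while
 * polling is poll-depressed for 16 ms >> 4 == 1 ms before the depress
 * timer fires and restores its priority.
 */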

/*
 * Kernel-internal interface to yield for a specified period
 *
 * WARNING: Will still yield to priority 0 even if the thread is holding a contended lock!
 */
void
thread_yield_internal(mach_msg_timeout_t ms)
{
	thread_t self = current_thread();

	assert((self->sched_flags & TH_SFLAG_DEPRESSED_MASK) != TH_SFLAG_DEPRESSED_MASK);

	processor_t     myprocessor;

	disable_preemption();
	myprocessor = current_processor();
	if (!SCHED(thread_should_yield)(myprocessor, self)) {
		mp_enable_preemption();

		return;
	}
	enable_preemption();

	thread_depress_ms(ms);

	thread_block_reason(THREAD_CONTINUE_NULL, NULL, AST_YIELD);

	thread_depress_abort(self);
}
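
/*
 * Illustrative kernel-internal sketch (an assumption; try_operation() is a
 * hypothetical helper): backing off for 1 ms between retries of a contended
 * operation:
 *
 *	while (!try_operation()) {
 *		thread_yield_internal(1);
 *	}
 */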

/*
 * This yields to a possible non-urgent preemption pending on the current processor.
 *
 * This is useful when doing a long computation in the kernel without returning to userspace.
 *
 * As opposed to other yielding mechanisms, this does not drop the priority of the current thread.
 */
void
thread_yield_to_preemption(void)
{
	/*
	 * ast_pending() should ideally be called with interrupts disabled, but
	 * the check here is fine because csw_check() will do the right thing.
	 */
	ast_t *pending_ast = ast_pending();
	ast_t ast = AST_NONE;
	processor_t p;

	if (*pending_ast & AST_PREEMPT) {
		thread_t self = current_thread();

		spl_t s = splsched();

		p = current_processor();
		thread_lock(self);
		ast = csw_check(self, p, AST_YIELD);
		ast_on(ast);
		thread_unlock(self);

		if (ast != AST_NONE) {
			(void)thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
		}

		splx(s);
	}
}
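
/*
 * Illustrative kernel-internal sketch (an assumption; process_element() is
 * a hypothetical helper): a long in-kernel loop periodically offering a
 * preemption point without dropping its own priority:
 *
 *	for (size_t i = 0; i < nelem; i++) {
 *		process_element(i);
 *		if ((i % 1024) == 0) {
 *			thread_yield_to_preemption();
 *		}
 *	}
 */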