xref: /xnu-10002.61.3/bsd/dev/i386/systemcalls.c (revision 0f4c859e951fba394238ab619495c4e1d54d0f34)
/*
 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
28 #include <kern/task.h>
29 #include <kern/thread.h>
30 #include <kern/assert.h>
31 #include <kern/clock.h>
32 #include <kern/locks.h>
33 #include <kern/sched_prim.h>
34 #include <kern/debug.h>
35 #include <mach/machine/thread_status.h>
36 #include <mach/thread_act.h>
37 
38 #include <sys/kernel.h>
39 #include <sys/vm.h>
40 #include <sys/proc_internal.h>
41 #include <sys/syscall.h>
42 #include <sys/systm.h>
43 #include <sys/user.h>
44 #include <sys/errno.h>
45 #include <sys/kdebug.h>
46 #include <sys/sysent.h>
47 #include <sys/sysproto.h>
48 #include <sys/kauth.h>
49 #include <sys/systm.h>
50 #include <sys/bitstring.h>
51 
52 #include <security/audit/audit.h>
53 
54 #include <i386/seg.h>
55 #include <i386/machine_routines.h>
56 #include <mach/i386/syscall_sw.h>
57 
58 #include <machine/pal_routines.h>
59 
60 #if CONFIG_MACF
61 #include <security/mac_framework.h>
62 #endif
63 
64 #if CONFIG_DTRACE
65 extern int32_t dtrace_systrace_syscall(struct proc *, void *, int *);
66 extern void dtrace_systrace_syscall_return(unsigned short, int, int *);
67 #endif
68 
69 extern void unix_syscall(x86_saved_state_t *);
70 extern void unix_syscall64(x86_saved_state_t *);
71 extern void *find_user_regs(thread_t);
72 
73 /* dynamically generated at build time based on syscalls.master */
74 extern const char *syscallnames[];
75 
/*
 * Non-zero when `code` is one of the kdebug trace system calls
 * (SYS_kdebug_trace, SYS_kdebug_trace64, SYS_kdebug_trace_string).
 * The dispatchers in this file use it to skip emitting syscall entry/exit
 * trace events for those calls.
 *
 * Note: `code` is evaluated up to three times; pass a side-effect-free
 * expression.
 */
#define code_is_kdebug_trace(code) (((code) == SYS_kdebug_trace) ||   \
	                            ((code) == SYS_kdebug_trace64) || \
	                            ((code) == SYS_kdebug_trace_string))
79 
80 /*
81  * Function:	unix_syscall
82  *
83  * Inputs:	regs	- pointer to i386 save area
84  *
85  * Outputs:	none
86  */
87 __attribute__((noreturn))
88 void
unix_syscall(x86_saved_state_t * state)89 unix_syscall(x86_saved_state_t *state)
90 {
91 	thread_t                thread;
92 	void                    *vt;
93 	unsigned int            code, syscode;
94 	const struct sysent     *callp;
95 
96 	int                     error;
97 	vm_offset_t             params;
98 	struct proc             *p;
99 	struct uthread          *uthread;
100 	x86_saved_state32_t     *regs;
101 	pid_t                   pid;
102 
103 	assert(is_saved_state32(state));
104 	regs = saved_state32(state);
105 #if DEBUG
106 	if (regs->eax == 0x800) {
107 		thread_exception_return();
108 	}
109 #endif
110 	thread = current_thread();
111 	uthread = get_bsdthread_info(thread);
112 	p = current_proc();
113 
114 	uthread_reset_proc_refcount(uthread);
115 
116 	code    = regs->eax & I386_SYSCALL_NUMBER_MASK;
117 	syscode = (code < nsysent) ? code : SYS_invalid;
118 	DEBUG_KPRINT_SYSCALL_UNIX("unix_syscall: code=%d(%s) eip=%u\n",
119 	    code, syscallnames[syscode], (uint32_t)regs->eip);
120 	params = (vm_offset_t) (regs->uesp + sizeof(int));
121 
122 	regs->efl &= ~(EFL_CF);
123 
124 	callp = &sysent[syscode];
125 
126 	if (__improbable(callp == sysent)) {
127 		code = fuword(params);
128 		params += sizeof(int);
129 		syscode = (code < nsysent) ? code : SYS_invalid;
130 		callp = &sysent[syscode];
131 	}
132 
133 	vt = (void *)uthread->uu_arg;
134 
135 	if (callp->sy_arg_bytes != 0) {
136 #if CONFIG_REQUIRES_U32_MUNGING
137 		sy_munge_t      *mungerp;
138 #else
139 #error U32 syscalls on x86_64 kernel requires munging
140 #endif
141 		uint32_t         nargs;
142 
143 		assert((unsigned) callp->sy_arg_bytes <= sizeof(uthread->uu_arg));
144 		nargs = callp->sy_arg_bytes;
145 		error = copyin((user_addr_t) params, (char *) vt, nargs);
146 		if (error) {
147 			regs->eax = error;
148 			regs->efl |= EFL_CF;
149 			thread_exception_return();
150 			/* NOTREACHED */
151 		}
152 
153 		if (__probable(!code_is_kdebug_trace(code))) {
154 			uint32_t *uip = vt;
155 			KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
156 			    uip[0], uip[1], uip[2], uip[3]);
157 		}
158 
159 #if CONFIG_REQUIRES_U32_MUNGING
160 		mungerp = callp->sy_arg_munge32;
161 
162 		if (mungerp != NULL) {
163 			(*mungerp)(vt);
164 		}
165 #endif
166 	} else {
167 		KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START);
168 	}
169 
170 	/*
171 	 * Delayed binding of thread credential to process credential, if we
172 	 * are not running with an explicitly set thread credential.
173 	 */
174 	kauth_cred_thread_update(thread, p);
175 
176 	uthread->uu_rval[0] = 0;
177 	uthread->uu_rval[1] = 0;
178 	uthread->uu_flag |= UT_NOTCANCELPT;
179 	uthread->syscall_code = code;
180 	pid = proc_pid(p);
181 
182 #ifdef CONFIG_IOCOUNT_TRACE
183 	uthread->uu_iocount = 0;
184 	uthread->uu_vpindex = 0;
185 #endif
186 
187 #if CONFIG_MACF
188 	if (__improbable(proc_syscall_filter_mask(p) != NULL && !bitstr_test(proc_syscall_filter_mask(p), syscode))) {
189 		error = mac_proc_check_syscall_unix(p, syscode);
190 		if (error) {
191 			goto skip_syscall;
192 		}
193 	}
194 #endif /* CONFIG_MACF */
195 
196 	AUDIT_SYSCALL_ENTER(code, p, uthread);
197 	error = (*(callp->sy_call))((void *) p, (void *) vt, &(uthread->uu_rval[0]));
198 	AUDIT_SYSCALL_EXIT(code, p, uthread, error);
199 
200 #if CONFIG_MACF
201 skip_syscall:
202 #endif /* CONFIG_MACF */
203 
204 #ifdef CONFIG_IOCOUNT_TRACE
205 	if (uthread->uu_iocount) {
206 		printf("system call returned with uu_iocount(%d) != 0\n",
207 		    uthread->uu_iocount);
208 	}
209 #endif
210 #if CONFIG_DTRACE
211 	uthread->t_dtrace_errno = error;
212 #endif /* CONFIG_DTRACE */
213 
214 	if (__improbable(error == ERESTART)) {
215 		/*
216 		 * Move the user's pc back to repeat the syscall:
217 		 * 5 bytes for a sysenter, or 2 for an int 8x.
218 		 * The SYSENTER_TF_CS covers single-stepping over a sysenter
219 		 * - see debug trap handler in idt.s/idt64.s
220 		 */
221 
222 		pal_syscall_restart(thread, state);
223 	} else if (error != EJUSTRETURN) {
224 		if (__improbable(error)) {
225 			regs->eax = error;
226 			regs->efl |= EFL_CF;    /* carry bit */
227 		} else { /* (not error) */
228 			 /*
229 			  * We split retval across two registers, in case the
230 			  * syscall had a 64-bit return value, in which case
231 			  * eax/edx matches the function call ABI.
232 			  */
233 			regs->eax = uthread->uu_rval[0];
234 			regs->edx = uthread->uu_rval[1];
235 		}
236 	}
237 
238 	DEBUG_KPRINT_SYSCALL_UNIX(
239 		"unix_syscall: error=%d retval=(%u,%u)\n",
240 		error, regs->eax, regs->edx);
241 
242 	uthread->uu_flag &= ~UT_NOTCANCELPT;
243 	uthread->syscall_code = 0;
244 
245 #if DEBUG || DEVELOPMENT
246 	kern_allocation_name_t
247 	prior __assert_only = thread_set_allocation_name(NULL);
248 	assertf(prior == NULL, "thread_set_allocation_name(\"%s\") not cleared", kern_allocation_get_name(prior));
249 #endif /* DEBUG || DEVELOPMENT */
250 
251 	if (__improbable(uthread->uu_lowpri_window)) {
252 		/*
253 		 * task is marked as a low priority I/O type
254 		 * and the I/O we issued while in this system call
255 		 * collided with normal I/O operations... we'll
256 		 * delay in order to mitigate the impact of this
257 		 * task on the normal operation of the system
258 		 */
259 		throttle_lowpri_io(1);
260 	}
261 	if (__probable(!code_is_kdebug_trace(code))) {
262 		KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
263 		    error, uthread->uu_rval[0], uthread->uu_rval[1], pid);
264 	}
265 
266 	if (__improbable(callp->sy_call == (sy_call_t *)execve && !error)) {
267 		pal_execve_return(thread);
268 	}
269 
270 	uthread_assert_zero_proc_refcount(uthread);
271 	thread_exception_return();
272 	/* NOTREACHED */
273 }
274 
275 __attribute__((noreturn))
276 void
unix_syscall64(x86_saved_state_t * state)277 unix_syscall64(x86_saved_state_t *state)
278 {
279 	thread_t        thread;
280 	void                    *vt;
281 	unsigned int    code, syscode;
282 	const struct sysent   *callp;
283 	int             args_in_regs;
284 	boolean_t       args_start_at_rdi;
285 	int             error;
286 	struct proc     *p;
287 	struct uthread  *uthread;
288 	x86_saved_state64_t *regs;
289 	pid_t           pid;
290 
291 	assert(is_saved_state64(state));
292 	regs = saved_state64(state);
293 #if     DEBUG
294 	if (regs->rax == 0x2000800) {
295 		thread_exception_return();
296 	}
297 #endif
298 	thread = current_thread();
299 	uthread = get_bsdthread_info(thread);
300 	p = current_proc();
301 
302 	uthread_reset_proc_refcount(uthread);
303 
304 	/* Verify that we are not being called from a task without a proc */
305 	if (__improbable(p == NULL)) {
306 		regs->rax = EPERM;
307 		regs->isf.rflags |= EFL_CF;
308 		task_terminate_internal(current_task());
309 		thread_exception_return();
310 		/* NOTREACHED */
311 	}
312 
313 	code    = regs->rax & SYSCALL_NUMBER_MASK;
314 	syscode = (code < nsysent) ? code : SYS_invalid;
315 	DEBUG_KPRINT_SYSCALL_UNIX(
316 		"unix_syscall64: code=%d(%s) rip=%llx\n",
317 		code, syscallnames[syscode], regs->isf.rip);
318 	callp = &sysent[syscode];
319 
320 	vt = (void *)uthread->uu_arg;
321 
322 	if (__improbable(callp == sysent)) {
323 		/*
324 		 * indirect system call... system call number
325 		 * passed as 'arg0'
326 		 */
327 		code    = regs->rdi;
328 		syscode = (code < nsysent) ? code : SYS_invalid;
329 		callp   = &sysent[syscode];
330 		args_start_at_rdi = FALSE;
331 		args_in_regs = 5;
332 	} else {
333 		args_start_at_rdi = TRUE;
334 		args_in_regs = 6;
335 	}
336 
337 	if (callp->sy_narg != 0) {
338 		assert(callp->sy_narg <= 8); /* size of uu_arg */
339 
340 		args_in_regs = MIN(args_in_regs, callp->sy_narg);
341 		memcpy(vt, args_start_at_rdi ? &regs->rdi : &regs->rsi, args_in_regs * sizeof(syscall_arg_t));
342 
343 		if (!code_is_kdebug_trace(code)) {
344 			uint64_t *uip = vt;
345 
346 			KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
347 			    uip[0], uip[1], uip[2], uip[3]);
348 		}
349 
350 		if (__improbable(callp->sy_narg > args_in_regs)) {
351 			int copyin_count;
352 
353 			copyin_count = (callp->sy_narg - args_in_regs) * sizeof(syscall_arg_t);
354 
355 			error = copyin((user_addr_t)(regs->isf.rsp + sizeof(user_addr_t)), (char *)&uthread->uu_arg[args_in_regs], copyin_count);
356 			if (error) {
357 				regs->rax = error;
358 				regs->isf.rflags |= EFL_CF;
359 				thread_exception_return();
360 				/* NOTREACHED */
361 			}
362 		}
363 	} else {
364 		KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START);
365 	}
366 
367 	/*
368 	 * Delayed binding of thread credential to process credential, if we
369 	 * are not running with an explicitly set thread credential.
370 	 */
371 	kauth_cred_thread_update(thread, p);
372 
373 	uthread->uu_rval[0] = 0;
374 	uthread->uu_rval[1] = 0;
375 	uthread->uu_flag |= UT_NOTCANCELPT;
376 	uthread->syscall_code = code;
377 	pid = proc_pid(p);
378 
379 #ifdef CONFIG_IOCOUNT_TRACE
380 	uthread->uu_iocount = 0;
381 	uthread->uu_vpindex = 0;
382 #endif
383 
384 #if CONFIG_MACF
385 	if (__improbable(proc_syscall_filter_mask(p) != NULL && !bitstr_test(proc_syscall_filter_mask(p), syscode))) {
386 		error = mac_proc_check_syscall_unix(p, syscode);
387 		if (error) {
388 			goto skip_syscall;
389 		}
390 	}
391 #endif /* CONFIG_MACF */
392 
393 	AUDIT_SYSCALL_ENTER(code, p, uthread);
394 	error = (*(callp->sy_call))((void *) p, vt, &(uthread->uu_rval[0]));
395 	AUDIT_SYSCALL_EXIT(code, p, uthread, error);
396 
397 #if CONFIG_MACF
398 skip_syscall:
399 #endif /* CONFIG_MACF */
400 
401 #ifdef CONFIG_IOCOUNT_TRACE
402 	if (uthread->uu_iocount) {
403 		printf("system call returned with uu_iocount(%d) != 0\n",
404 		    uthread->uu_iocount);
405 	}
406 #endif
407 
408 #if CONFIG_DTRACE
409 	uthread->t_dtrace_errno = error;
410 #endif /* CONFIG_DTRACE */
411 
412 	if (__improbable(error == ERESTART)) {
413 		/*
414 		 * all system calls come through via the syscall instruction
415 		 * in 64 bit mode... its 2 bytes in length
416 		 * move the user's pc back to repeat the syscall:
417 		 */
418 		pal_syscall_restart( thread, state );
419 	} else if (error != EJUSTRETURN) {
420 		if (__improbable(error)) {
421 			regs->rax = error;
422 			regs->isf.rflags |= EFL_CF;     /* carry bit */
423 		} else { /* (not error) */
424 			switch (callp->sy_return_type) {
425 			case _SYSCALL_RET_INT_T:
426 				regs->rax = uthread->uu_rval[0];
427 				regs->rdx = uthread->uu_rval[1];
428 				break;
429 			case _SYSCALL_RET_UINT_T:
430 				regs->rax = ((u_int)uthread->uu_rval[0]);
431 				regs->rdx = ((u_int)uthread->uu_rval[1]);
432 				break;
433 			case _SYSCALL_RET_OFF_T:
434 			case _SYSCALL_RET_ADDR_T:
435 			case _SYSCALL_RET_SIZE_T:
436 			case _SYSCALL_RET_SSIZE_T:
437 			case _SYSCALL_RET_UINT64_T:
438 				regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
439 				regs->rdx = 0;
440 				break;
441 			case _SYSCALL_RET_NONE:
442 				break;
443 			default:
444 				panic("unix_syscall: unknown return type");
445 				break;
446 			}
447 			regs->isf.rflags &= ~EFL_CF;
448 		}
449 	}
450 
451 	DEBUG_KPRINT_SYSCALL_UNIX(
452 		"unix_syscall64: error=%d retval=(%llu,%llu)\n",
453 		error, regs->rax, regs->rdx);
454 
455 	uthread->uu_flag &= ~UT_NOTCANCELPT;
456 	uthread->syscall_code = 0;
457 
458 #if DEBUG || DEVELOPMENT
459 	kern_allocation_name_t
460 	prior __assert_only = thread_set_allocation_name(NULL);
461 	assertf(prior == NULL, "thread_set_allocation_name(\"%s\") not cleared", kern_allocation_get_name(prior));
462 #endif /* DEBUG || DEVELOPMENT */
463 
464 	if (__improbable(uthread->uu_lowpri_window)) {
465 		/*
466 		 * task is marked as a low priority I/O type
467 		 * and the I/O we issued while in this system call
468 		 * collided with normal I/O operations... we'll
469 		 * delay in order to mitigate the impact of this
470 		 * task on the normal operation of the system
471 		 */
472 		throttle_lowpri_io(1);
473 	}
474 	if (__probable(!code_is_kdebug_trace(code))) {
475 		KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
476 		    error, uthread->uu_rval[0], uthread->uu_rval[1], pid);
477 	}
478 
479 	uthread_assert_zero_proc_refcount(uthread);
480 	thread_exception_return();
481 	/* NOTREACHED */
482 }
483 
484 
485 void
unix_syscall_return(int error)486 unix_syscall_return(int error)
487 {
488 	thread_t                thread;
489 	struct uthread          *uthread;
490 	struct proc *p;
491 	unsigned int code;
492 	const struct sysent *callp;
493 
494 	thread = current_thread();
495 	uthread = get_bsdthread_info(thread);
496 
497 	pal_register_cache_state(thread, DIRTY);
498 
499 	p = current_proc();
500 
501 	if (proc_is64bit(p)) {
502 		x86_saved_state64_t *regs;
503 
504 		regs = saved_state64(find_user_regs(thread));
505 
506 		code = uthread->syscall_code;
507 		callp = (code >= nsysent) ? &sysent[SYS_invalid] : &sysent[code];
508 
509 #if CONFIG_DTRACE
510 		if (callp->sy_call == dtrace_systrace_syscall) {
511 			dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
512 		}
513 #endif /* CONFIG_DTRACE */
514 		AUDIT_SYSCALL_EXIT(code, p, uthread, error);
515 
516 		if (error == ERESTART) {
517 			/*
518 			 * repeat the syscall
519 			 */
520 			pal_syscall_restart( thread, find_user_regs(thread));
521 		} else if (error != EJUSTRETURN) {
522 			if (error) {
523 				regs->rax = error;
524 				regs->isf.rflags |= EFL_CF;     /* carry bit */
525 			} else { /* (not error) */
526 				switch (callp->sy_return_type) {
527 				case _SYSCALL_RET_INT_T:
528 					regs->rax = uthread->uu_rval[0];
529 					regs->rdx = uthread->uu_rval[1];
530 					break;
531 				case _SYSCALL_RET_UINT_T:
532 					regs->rax = ((u_int)uthread->uu_rval[0]);
533 					regs->rdx = ((u_int)uthread->uu_rval[1]);
534 					break;
535 				case _SYSCALL_RET_OFF_T:
536 				case _SYSCALL_RET_ADDR_T:
537 				case _SYSCALL_RET_SIZE_T:
538 				case _SYSCALL_RET_SSIZE_T:
539 				case _SYSCALL_RET_UINT64_T:
540 					regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
541 					regs->rdx = 0;
542 					break;
543 				case _SYSCALL_RET_NONE:
544 					break;
545 				default:
546 					panic("unix_syscall: unknown return type");
547 					break;
548 				}
549 				regs->isf.rflags &= ~EFL_CF;
550 			}
551 		}
552 		DEBUG_KPRINT_SYSCALL_UNIX(
553 			"unix_syscall_return: error=%d retval=(%llu,%llu)\n",
554 			error, regs->rax, regs->rdx);
555 	} else {
556 		x86_saved_state32_t     *regs;
557 
558 		regs = saved_state32(find_user_regs(thread));
559 
560 		regs->efl &= ~(EFL_CF);
561 
562 		code = uthread->syscall_code;
563 		callp = (code >= nsysent) ? &sysent[SYS_invalid] : &sysent[code];
564 
565 #if CONFIG_DTRACE
566 		if (callp->sy_call == dtrace_systrace_syscall) {
567 			dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
568 		}
569 #endif /* CONFIG_DTRACE */
570 		AUDIT_SYSCALL_EXIT(code, p, uthread, error);
571 
572 		if (error == ERESTART) {
573 			pal_syscall_restart( thread, find_user_regs(thread));
574 		} else if (error != EJUSTRETURN) {
575 			if (error) {
576 				regs->eax = error;
577 				regs->efl |= EFL_CF;    /* carry bit */
578 			} else { /* (not error) */
579 				regs->eax = uthread->uu_rval[0];
580 				regs->edx = uthread->uu_rval[1];
581 			}
582 		}
583 		DEBUG_KPRINT_SYSCALL_UNIX(
584 			"unix_syscall_return: error=%d retval=(%u,%u)\n",
585 			error, regs->eax, regs->edx);
586 	}
587 
588 
589 	uthread->uu_flag &= ~UT_NOTCANCELPT;
590 	uthread->syscall_code = 0;
591 
592 #if DEBUG || DEVELOPMENT
593 	kern_allocation_name_t
594 	prior __assert_only = thread_set_allocation_name(NULL);
595 	assertf(prior == NULL, "thread_set_allocation_name(\"%s\") not cleared", kern_allocation_get_name(prior));
596 #endif /* DEBUG || DEVELOPMENT */
597 
598 	if (uthread->uu_lowpri_window) {
599 		/*
600 		 * task is marked as a low priority I/O type
601 		 * and the I/O we issued while in this system call
602 		 * collided with normal I/O operations... we'll
603 		 * delay in order to mitigate the impact of this
604 		 * task on the normal operation of the system
605 		 */
606 		throttle_lowpri_io(1);
607 	}
608 	if (!code_is_kdebug_trace(code)) {
609 		KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
610 		    error, uthread->uu_rval[0], uthread->uu_rval[1], proc_getpid(p));
611 	}
612 
613 	thread_exception_return();
614 	/* NOTREACHED */
615 }
616