xref: /xnu-11215.41.3/bsd/dev/i386/systemcalls.c (revision 33de042d024d46de5ff4e89f2471de6608e37fa4)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 #include <kern/task.h>
29 #include <kern/thread.h>
30 #include <kern/assert.h>
31 #include <kern/clock.h>
32 #include <kern/locks.h>
33 #include <kern/sched_prim.h>
34 #include <kern/debug.h>
35 #include <mach/machine/thread_status.h>
36 #include <mach/thread_act.h>
37 
38 #include <sys/kernel.h>
39 #include <sys/vm.h>
40 #include <sys/proc_internal.h>
41 #include <sys/syscall.h>
42 #include <sys/systm.h>
43 #include <sys/user.h>
44 #include <sys/errno.h>
45 #include <sys/kdebug.h>
46 #include <sys/sysent.h>
47 #include <sys/sysproto.h>
48 #include <sys/kauth.h>
49 #include <sys/systm.h>
50 #include <sys/bitstring.h>
51 
52 #include <security/audit/audit.h>
53 
54 #include <i386/seg.h>
55 #include <i386/machine_routines.h>
56 #include <mach/i386/syscall_sw.h>
57 
58 #include <machine/pal_routines.h>
59 
60 #if CONFIG_MACF
61 #include <security/mac_framework.h>
62 #endif
63 
64 #if CONFIG_DTRACE
65 extern int32_t dtrace_systrace_syscall(struct proc *, void *, int *);
66 extern void dtrace_systrace_syscall_return(unsigned short, int, int *);
67 #endif
68 
69 extern void unix_syscall(x86_saved_state_t *);
70 extern void unix_syscall64(x86_saved_state_t *);
71 extern void *find_user_regs(thread_t);
72 
73 /* dynamically generated at build time based on syscalls.master */
74 extern const char *syscallnames[];
75 
76 #define code_is_kdebug_trace(code) (((code) == SYS_kdebug_trace) ||   \
77 	                            ((code) == SYS_kdebug_trace64) || \
78 	                            ((code) == SYS_kdebug_trace_string))
79 
80 /*
81  * Function:	unix_syscall
82  *
83  * Inputs:	regs	- pointer to i386 save area
84  *
85  * Outputs:	none
86  */
__attribute__((noreturn))
void
unix_syscall(x86_saved_state_t *state)
{
	thread_t                thread;
	void                    *vt;
	unsigned int            code, syscode;
	const struct sysent     *callp;

	int                     error;
	vm_offset_t             params;
	struct proc             *p;
	struct uthread          *uthread;
	x86_saved_state32_t     *regs;
	pid_t                   pid;

	assert(is_saved_state32(state));
	regs = saved_state32(state);
#if DEBUG
	/* DEBUG-only early out for trap number 0x800 (test hook). */
	if (regs->eax == 0x800) {
		thread_exception_return();
	}
#endif
	thread = current_thread();
	uthread = get_bsdthread_info(thread);
	p = get_thread_ro(thread)->tro_proc;

	uthread_reset_proc_refcount(uthread);

	/* Clamp out-of-range syscall numbers to SYS_invalid (rejected below). */
	code    = regs->eax & I386_SYSCALL_NUMBER_MASK;
	syscode = (code < nsysent) ? code : SYS_invalid;
	DEBUG_KPRINT_SYSCALL_UNIX("unix_syscall: code=%d(%s) eip=%u\n",
	    code, syscallnames[syscode], (uint32_t)regs->eip);
	/* 32-bit arguments live on the user stack, just above the return address. */
	params = (vm_offset_t) (regs->uesp + sizeof(int));

	/* Pre-clear carry; it is the user-visible error indicator. */
	regs->efl &= ~(EFL_CF);

	callp = &sysent[syscode];

	if (__improbable(callp == sysent)) {
		/*
		 * Indirect system call (syscall(2), sysent[0]): the real
		 * syscall number is the first word on the user stack;
		 * advance params past it.
		 */
		code = fuword(params);
		params += sizeof(int);
		syscode = (code < nsysent) ? code : SYS_invalid;
		callp = &sysent[syscode];
	}

	/* Arguments are staged into the uthread's scratch area. */
	vt = (void *)uthread->uu_arg;

	if (callp->sy_arg_bytes != 0) {
#if CONFIG_REQUIRES_U32_MUNGING
		sy_munge_t      *mungerp;
#else
#error U32 syscalls on x86_64 kernel requires munging
#endif
		uint32_t         nargs;

		assert((unsigned) callp->sy_arg_bytes <= sizeof(uthread->uu_arg));
		nargs = callp->sy_arg_bytes;
		/* Copy the raw 32-bit argument words in from the user stack. */
		error = copyin((user_addr_t) params, (char *) vt, nargs);
		if (error) {
			/* Bad user stack: report the error and bail out early. */
			regs->eax = error;
			regs->efl |= EFL_CF;
			thread_exception_return();
			/* NOTREACHED */
		}

		/* Don't emit trace events for the kdebug syscalls themselves. */
		if (__probable(!code_is_kdebug_trace(code))) {
			uint32_t *uip = vt;
			KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
			    uip[0], uip[1], uip[2], uip[3]);
		}

#if CONFIG_REQUIRES_U32_MUNGING
		/* Munge the packed 32-bit args into the 64-bit kernel layout. */
		mungerp = callp->sy_arg_munge32;

		if (mungerp != NULL) {
			(*mungerp)(vt);
		}
#endif
	} else {
		KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START);
	}

	current_cached_proc_cred_update();

	uthread->uu_rval[0] = 0;
	uthread->uu_rval[1] = 0;
	/* Suppress cancelation while executing the syscall body. */
	uthread->uu_flag |= UT_NOTCANCELPT;
	/* Remember the code for unix_syscall_return()/diagnostics. */
	uthread->syscall_code = code;
	pid = proc_pid(p);

#ifdef CONFIG_IOCOUNT_TRACE
	uthread->uu_iocount = 0;
	uthread->uu_vpindex = 0;
#endif

#if CONFIG_MACF
	/*
	 * If the proc has a syscall filter mask that does not include this
	 * syscall, ask MAC whether to allow it; on denial, skip the call
	 * and propagate the error.
	 */
	if (__improbable(proc_syscall_filter_mask(p) != NULL && !bitstr_test(proc_syscall_filter_mask(p), syscode))) {
		error = mac_proc_check_syscall_unix(p, syscode);
		if (error) {
			goto skip_syscall;
		}
	}
#endif /* CONFIG_MACF */

	/* Dispatch to the actual syscall implementation. */
	AUDIT_SYSCALL_ENTER(code, p, uthread);
	error = (*(callp->sy_call))((void *) p, (void *) vt, &(uthread->uu_rval[0]));
	AUDIT_SYSCALL_EXIT(code, p, uthread, error);

#if CONFIG_MACF
skip_syscall:
#endif /* CONFIG_MACF */

#ifdef CONFIG_IOCOUNT_TRACE
	if (uthread->uu_iocount) {
		printf("system call(%d) returned with uu_iocount(%d) != 0\n",
		    syscode, uthread->uu_iocount);
	}
#endif
#if CONFIG_DTRACE
	uthread->t_dtrace_errno = error;
#endif /* CONFIG_DTRACE */

	if (__improbable(error == ERESTART)) {
		/*
		 * Move the user's pc back to repeat the syscall:
		 * 5 bytes for a sysenter, or 2 for an int 8x.
		 * The SYSENTER_TF_CS covers single-stepping over a sysenter
		 * - see debug trap handler in idt.s/idt64.s
		 */

		pal_syscall_restart(thread, state);
	} else if (error != EJUSTRETURN) {
		if (__improbable(error)) {
			regs->eax = error;
			regs->efl |= EFL_CF;    /* carry bit */
		} else { /* (not error) */
			 /*
			  * We split retval across two registers, in case the
			  * syscall had a 64-bit return value, in which case
			  * eax/edx matches the function call ABI.
			  */
			regs->eax = uthread->uu_rval[0];
			regs->edx = uthread->uu_rval[1];
		}
	}

	DEBUG_KPRINT_SYSCALL_UNIX(
		"unix_syscall: error=%d retval=(%u,%u)\n",
		error, regs->eax, regs->edx);

	uthread->uu_flag &= ~UT_NOTCANCELPT;
	uthread->syscall_code = 0;

#if DEBUG || DEVELOPMENT
	/* Verify the syscall did not leave an allocation name tagged. */
	kern_allocation_name_t
	prior __assert_only = thread_set_allocation_name(NULL);
	assertf(prior == NULL, "thread_set_allocation_name(\"%s\") not cleared", kern_allocation_get_name(prior));
#endif /* DEBUG || DEVELOPMENT */

	if (__improbable(uthread->uu_lowpri_window)) {
		/*
		 * task is marked as a low priority I/O type
		 * and the I/O we issued while in this system call
		 * collided with normal I/O operations... we'll
		 * delay in order to mitigate the impact of this
		 * task on the normal operation of the system
		 */
		throttle_lowpri_io(1);
	}
	if (__probable(!code_is_kdebug_trace(code))) {
		KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
		    error, uthread->uu_rval[0], uthread->uu_rval[1], pid);
	}

	/* Successful execve needs PAL fixup of the user state before return. */
	if (__improbable(callp->sy_call == (sy_call_t *)execve && !error)) {
		pal_execve_return(thread);
	}

	uthread_assert_zero_proc_refcount(uthread);
	thread_exception_return();
	/* NOTREACHED */
}
270 
/*
 * Function:	unix_syscall64
 *
 * 64-bit BSD system call dispatcher.  Arguments arrive in registers
 * (rdi, rsi, ...) and are scraped out of the saved state; any overflow
 * arguments are copied in from the user stack.
 *
 * Inputs:	state	- pointer to x86_64 save area
 *
 * Outputs:	none (exits to user space via thread_exception_return)
 */
__attribute__((noreturn))
void
unix_syscall64(x86_saved_state_t *state)
{
	thread_t        thread;
	void                    *vt;
	unsigned int    code, syscode;
	const struct sysent   *callp;
	int             args_in_regs;
	boolean_t       args_start_at_rdi;
	int             error;
	struct proc     *p;
	struct uthread  *uthread;
	x86_saved_state64_t *regs;
	pid_t           pid;

	assert(is_saved_state64(state));
	regs = saved_state64(state);
#if     DEBUG
	/* DEBUG-only early out for trap number 0x2000800 (test hook). */
	if (regs->rax == 0x2000800) {
		thread_exception_return();
	}
#endif
	thread = current_thread();
	uthread = get_bsdthread_info(thread);
	p = current_proc();

	uthread_reset_proc_refcount(uthread);

	/* Verify that we are not being called from a task without a proc */
	if (__improbable(p == NULL)) {
		regs->rax = EPERM;
		regs->isf.rflags |= EFL_CF;
		task_terminate_internal(current_task());
		thread_exception_return();
		/* NOTREACHED */
	}

	/* Clamp out-of-range syscall numbers to SYS_invalid (rejected below). */
	code    = regs->rax & SYSCALL_NUMBER_MASK;
	syscode = (code < nsysent) ? code : SYS_invalid;
	DEBUG_KPRINT_SYSCALL_UNIX(
		"unix_syscall64: code=%d(%s) rip=%llx\n",
		code, syscallnames[syscode], regs->isf.rip);
	callp = &sysent[syscode];

	/* Arguments are staged into the uthread's scratch area. */
	vt = (void *)uthread->uu_arg;

	if (__improbable(callp == sysent)) {
		/*
		 * indirect system call... system call number
		 * passed as 'arg0'
		 */
		code    = regs->rdi;
		syscode = (code < nsysent) ? code : SYS_invalid;
		callp   = &sysent[syscode];
		/* rdi carried the syscall number, so the real args begin at rsi. */
		args_start_at_rdi = FALSE;
		args_in_regs = 5;
	} else {
		args_start_at_rdi = TRUE;
		args_in_regs = 6;
	}

	if (callp->sy_narg != 0) {
		assert(callp->sy_narg <= 8); /* size of uu_arg */

		/* Scrape the register-passed arguments out of the save area. */
		args_in_regs = MIN(args_in_regs, callp->sy_narg);
		memcpy(vt, args_start_at_rdi ? &regs->rdi : &regs->rsi, args_in_regs * sizeof(syscall_arg_t));

		/* Don't emit trace events for the kdebug syscalls themselves. */
		if (!code_is_kdebug_trace(code)) {
			uint64_t *uip = vt;

			KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
			    uip[0], uip[1], uip[2], uip[3]);
		}

		if (__improbable(callp->sy_narg > args_in_regs)) {
			int copyin_count;

			/* Remaining args spilled onto the user stack, above the return address. */
			copyin_count = (callp->sy_narg - args_in_regs) * sizeof(syscall_arg_t);

			error = copyin((user_addr_t)(regs->isf.rsp + sizeof(user_addr_t)), (char *)&uthread->uu_arg[args_in_regs], copyin_count);
			if (error) {
				/* Bad user stack: report the error and bail out early. */
				regs->rax = error;
				regs->isf.rflags |= EFL_CF;
				thread_exception_return();
				/* NOTREACHED */
			}
		}
	} else {
		KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START);
	}

	current_cached_proc_cred_update();

	uthread->uu_rval[0] = 0;
	uthread->uu_rval[1] = 0;
	/* Suppress cancelation while executing the syscall body. */
	uthread->uu_flag |= UT_NOTCANCELPT;
	/* Remember the code for unix_syscall_return()/diagnostics. */
	uthread->syscall_code = code;
	pid = proc_pid(p);

#ifdef CONFIG_IOCOUNT_TRACE
	uthread->uu_iocount = 0;
	uthread->uu_vpindex = 0;
#endif

#if CONFIG_MACF
	/*
	 * If the proc has a syscall filter mask that does not include this
	 * syscall, ask MAC whether to allow it; on denial, skip the call
	 * and propagate the error.
	 */
	if (__improbable(proc_syscall_filter_mask(p) != NULL && !bitstr_test(proc_syscall_filter_mask(p), syscode))) {
		error = mac_proc_check_syscall_unix(p, syscode);
		if (error) {
			goto skip_syscall;
		}
	}
#endif /* CONFIG_MACF */

	/* Dispatch to the actual syscall implementation. */
	AUDIT_SYSCALL_ENTER(code, p, uthread);
	error = (*(callp->sy_call))((void *) p, vt, &(uthread->uu_rval[0]));
	AUDIT_SYSCALL_EXIT(code, p, uthread, error);

#if CONFIG_MACF
skip_syscall:
#endif /* CONFIG_MACF */

#ifdef CONFIG_IOCOUNT_TRACE
	if (uthread->uu_iocount) {
		printf("system call(%d) returned with uu_iocount(%d) != 0\n",
		    syscode, uthread->uu_iocount);
	}
#endif

#if CONFIG_DTRACE
	uthread->t_dtrace_errno = error;
#endif /* CONFIG_DTRACE */

	if (__improbable(error == ERESTART)) {
		/*
		 * all system calls come through via the syscall instruction
		 * in 64 bit mode... its 2 bytes in length
		 * move the user's pc back to repeat the syscall:
		 */
		pal_syscall_restart( thread, state );
	} else if (error != EJUSTRETURN) {
		if (__improbable(error)) {
			regs->rax = error;
			regs->isf.rflags |= EFL_CF;     /* carry bit */
		} else { /* (not error) */
			/* Fold uu_rval into rax/rdx per the declared return type. */
			switch (callp->sy_return_type) {
			case _SYSCALL_RET_INT_T:
				regs->rax = uthread->uu_rval[0];
				regs->rdx = uthread->uu_rval[1];
				break;
			case _SYSCALL_RET_UINT_T:
				regs->rax = ((u_int)uthread->uu_rval[0]);
				regs->rdx = ((u_int)uthread->uu_rval[1]);
				break;
			case _SYSCALL_RET_OFF_T:
			case _SYSCALL_RET_ADDR_T:
			case _SYSCALL_RET_SIZE_T:
			case _SYSCALL_RET_SSIZE_T:
			case _SYSCALL_RET_UINT64_T:
				/* 64-bit result spans both halves of uu_rval. */
				regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
				regs->rdx = 0;
				break;
			case _SYSCALL_RET_NONE:
				break;
			default:
				panic("unix_syscall: unknown return type");
				break;
			}
			regs->isf.rflags &= ~EFL_CF;
		}
	}

	DEBUG_KPRINT_SYSCALL_UNIX(
		"unix_syscall64: error=%d retval=(%llu,%llu)\n",
		error, regs->rax, regs->rdx);

	uthread->uu_flag &= ~UT_NOTCANCELPT;
	uthread->syscall_code = 0;

#if DEBUG || DEVELOPMENT
	/* Verify the syscall did not leave an allocation name tagged. */
	kern_allocation_name_t
	prior __assert_only = thread_set_allocation_name(NULL);
	assertf(prior == NULL, "thread_set_allocation_name(\"%s\") not cleared", kern_allocation_get_name(prior));
#endif /* DEBUG || DEVELOPMENT */

	if (__improbable(uthread->uu_lowpri_window)) {
		/*
		 * task is marked as a low priority I/O type
		 * and the I/O we issued while in this system call
		 * collided with normal I/O operations... we'll
		 * delay in order to mitigate the impact of this
		 * task on the normal operation of the system
		 */
		throttle_lowpri_io(1);
	}
	if (__probable(!code_is_kdebug_trace(code))) {
		KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
		    error, uthread->uu_rval[0], uthread->uu_rval[1], pid);
	}

	uthread_assert_zero_proc_refcount(uthread);
	thread_exception_return();
	/* NOTREACHED */
}
475 
476 
/*
 * Function:	unix_syscall_return
 *
 * Common completion path for system calls that finish out of line
 * (e.g. resumed from a continuation).  Performs the same return-value
 * propagation, audit/DTrace hooks, low-priority I/O throttling and
 * kdebug end-tracing as the inline dispatchers, for both 32- and
 * 64-bit processes.
 *
 * Inputs:	error	- errno-style result of the completed syscall
 *			  (or ERESTART/EJUSTRETURN)
 *
 * Outputs:	none (exits to user space via thread_exception_return)
 */
void
unix_syscall_return(int error)
{
	thread_t                thread;
	struct uthread          *uthread;
	struct proc *p;
	unsigned int code;
	const struct sysent *callp;

	thread = current_thread();
	uthread = get_bsdthread_info(thread);

	/* We are about to modify the saved user register state. */
	pal_register_cache_state(thread, DIRTY);

	p = current_proc();

	if (proc_is64bit(p)) {
		x86_saved_state64_t *regs;

		regs = saved_state64(find_user_regs(thread));

		/* syscall_code was stashed by the dispatcher at entry. */
		code = uthread->syscall_code;
		callp = (code >= nsysent) ? &sysent[SYS_invalid] : &sysent[code];

#if CONFIG_DTRACE
		if (callp->sy_call == dtrace_systrace_syscall) {
			dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
		}
#endif /* CONFIG_DTRACE */
		AUDIT_SYSCALL_EXIT(code, p, uthread, error);

		if (error == ERESTART) {
			/*
			 * repeat the syscall
			 */
			pal_syscall_restart( thread, find_user_regs(thread));
		} else if (error != EJUSTRETURN) {
			if (error) {
				regs->rax = error;
				regs->isf.rflags |= EFL_CF;     /* carry bit */
			} else { /* (not error) */
				/* Fold uu_rval into rax/rdx per the declared return type. */
				switch (callp->sy_return_type) {
				case _SYSCALL_RET_INT_T:
					regs->rax = uthread->uu_rval[0];
					regs->rdx = uthread->uu_rval[1];
					break;
				case _SYSCALL_RET_UINT_T:
					regs->rax = ((u_int)uthread->uu_rval[0]);
					regs->rdx = ((u_int)uthread->uu_rval[1]);
					break;
				case _SYSCALL_RET_OFF_T:
				case _SYSCALL_RET_ADDR_T:
				case _SYSCALL_RET_SIZE_T:
				case _SYSCALL_RET_SSIZE_T:
				case _SYSCALL_RET_UINT64_T:
					/* 64-bit result spans both halves of uu_rval. */
					regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
					regs->rdx = 0;
					break;
				case _SYSCALL_RET_NONE:
					break;
				default:
					panic("unix_syscall: unknown return type");
					break;
				}
				regs->isf.rflags &= ~EFL_CF;
			}
		}
		DEBUG_KPRINT_SYSCALL_UNIX(
			"unix_syscall_return: error=%d retval=(%llu,%llu)\n",
			error, regs->rax, regs->rdx);
	} else {
		x86_saved_state32_t     *regs;

		regs = saved_state32(find_user_regs(thread));

		/* Pre-clear carry; it is the user-visible error indicator. */
		regs->efl &= ~(EFL_CF);

		code = uthread->syscall_code;
		callp = (code >= nsysent) ? &sysent[SYS_invalid] : &sysent[code];

#if CONFIG_DTRACE
		if (callp->sy_call == dtrace_systrace_syscall) {
			dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
		}
#endif /* CONFIG_DTRACE */
		AUDIT_SYSCALL_EXIT(code, p, uthread, error);

		if (error == ERESTART) {
			/* back the pc up so the syscall instruction repeats */
			pal_syscall_restart( thread, find_user_regs(thread));
		} else if (error != EJUSTRETURN) {
			if (error) {
				regs->eax = error;
				regs->efl |= EFL_CF;    /* carry bit */
			} else { /* (not error) */
				/* 64-bit results split across eax/edx per the call ABI. */
				regs->eax = uthread->uu_rval[0];
				regs->edx = uthread->uu_rval[1];
			}
		}
		DEBUG_KPRINT_SYSCALL_UNIX(
			"unix_syscall_return: error=%d retval=(%u,%u)\n",
			error, regs->eax, regs->edx);
	}


	uthread->uu_flag &= ~UT_NOTCANCELPT;
	uthread->syscall_code = 0;

#if DEBUG || DEVELOPMENT
	/* Verify the syscall did not leave an allocation name tagged. */
	kern_allocation_name_t
	prior __assert_only = thread_set_allocation_name(NULL);
	assertf(prior == NULL, "thread_set_allocation_name(\"%s\") not cleared", kern_allocation_get_name(prior));
#endif /* DEBUG || DEVELOPMENT */

	if (uthread->uu_lowpri_window) {
		/*
		 * task is marked as a low priority I/O type
		 * and the I/O we issued while in this system call
		 * collided with normal I/O operations... we'll
		 * delay in order to mitigate the impact of this
		 * task on the normal operation of the system
		 */
		throttle_lowpri_io(1);
	}
	if (!code_is_kdebug_trace(code)) {
		KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
		    error, uthread->uu_rval[0], uthread->uu_rval[1], proc_getpid(p));
	}

	thread_exception_return();
	/* NOTREACHED */
}
608