1 /*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 #include <kern/task.h>
29 #include <kern/thread.h>
30 #include <kern/assert.h>
31 #include <kern/clock.h>
32 #include <kern/locks.h>
33 #include <kern/sched_prim.h>
34 #include <kern/debug.h>
35 #include <mach/machine/thread_status.h>
36 #include <mach/thread_act.h>
37
38 #include <sys/kernel.h>
39 #include <sys/vm.h>
40 #include <sys/proc_internal.h>
41 #include <sys/syscall.h>
42 #include <sys/systm.h>
43 #include <sys/user.h>
44 #include <sys/errno.h>
45 #include <sys/kdebug.h>
46 #include <sys/sysent.h>
47 #include <sys/sysproto.h>
48 #include <sys/kauth.h>
49 #include <sys/systm.h>
50 #include <sys/bitstring.h>
51
52 #include <security/audit/audit.h>
53
54 #include <i386/seg.h>
55 #include <i386/machine_routines.h>
56 #include <mach/i386/syscall_sw.h>
57
58 #include <machine/pal_routines.h>
59
60 #if CONFIG_MACF
61 #include <security/mac_framework.h>
62 #endif
63
64 #if CONFIG_DTRACE
65 extern int32_t dtrace_systrace_syscall(struct proc *, void *, int *);
66 extern void dtrace_systrace_syscall_return(unsigned short, int, int *);
67 #endif
68
69 extern void unix_syscall(x86_saved_state_t *);
70 extern void unix_syscall64(x86_saved_state_t *);
71 extern void *find_user_regs(thread_t);
72
73 /* dynamically generated at build time based on syscalls.master */
74 extern const char *syscallnames[];
75
/*
 * True when `code` is one of the kdebug trace syscalls.  Used below to
 * suppress the kdebug entry/exit tracepoints for those syscalls
 * (presumably to avoid emitting trace events about tracing itself).
 */
#define code_is_kdebug_trace(code) (((code) == SYS_kdebug_trace) || \
    ((code) == SYS_kdebug_trace64) || \
    ((code) == SYS_kdebug_trace_string))
79
80 /*
81 * Function: unix_syscall
82 *
83 * Inputs: regs - pointer to i386 save area
84 *
85 * Outputs: none
86 */
__attribute__((noreturn))
void
unix_syscall(x86_saved_state_t *state)
{
	thread_t thread;
	void *vt;
	unsigned int code, syscode;
	const struct sysent *callp;

	int error;
	vm_offset_t params;
	struct proc *p;
	struct uthread *uthread;
	x86_saved_state32_t *regs;
	pid_t pid;

	/* 32-bit user thread: the save area must be the 32-bit layout */
	assert(is_saved_state32(state));
	regs = saved_state32(state);
#if DEBUG
	/* DEBUG-only hook: syscall number 0x800 returns to user immediately */
	if (regs->eax == 0x800) {
		thread_exception_return();
	}
#endif
	thread = current_thread();
	uthread = get_bsdthread_info(thread);
	p = get_thread_ro(thread)->tro_proc;

	uthread_reset_proc_refcount(uthread);

	/* Clamp out-of-range syscall numbers to SYS_invalid for the table lookup. */
	code = regs->eax & I386_SYSCALL_NUMBER_MASK;
	syscode = (code < nsysent) ? code : SYS_invalid;
	DEBUG_KPRINT_SYSCALL_UNIX("unix_syscall: code=%d(%s) eip=%u\n",
	    code, syscallnames[syscode], (uint32_t)regs->eip);
	/* user arguments sit on the user stack just above the return address */
	params = (vm_offset_t) (regs->uesp + sizeof(int));

	/* clear the carry (error) flag; it is set again below only on failure */
	regs->efl &= ~(EFL_CF);

	callp = &sysent[syscode];

	if (__improbable(callp == sysent)) {
		/*
		 * indirect system call (entry 0): the real syscall number is
		 * the first word on the user stack; shift params past it.
		 */
		code = fuword(params);
		params += sizeof(int);
		syscode = (code < nsysent) ? code : SYS_invalid;
		callp = &sysent[syscode];
	}

	/* uu_arg is the per-thread staging buffer for syscall arguments */
	vt = (void *)uthread->uu_arg;

	if (callp->sy_arg_bytes != 0) {
#if CONFIG_REQUIRES_U32_MUNGING
		sy_munge_t *mungerp;
#else
#error U32 syscalls on x86_64 kernel requires munging
#endif
		uint32_t nargs;

		assert((unsigned) callp->sy_arg_bytes <= sizeof(uthread->uu_arg));
		nargs = callp->sy_arg_bytes;
		/* copy the stack-based arguments into uu_arg */
		error = copyin((user_addr_t) params, (char *) vt, nargs);
		if (error) {
			/* bad user stack: report the error and bail out early */
			regs->eax = error;
			regs->efl |= EFL_CF;
			thread_exception_return();
			/* NOTREACHED */
		}

		/* entry tracepoint, skipped for the kdebug trace syscalls themselves */
		if (__probable(!code_is_kdebug_trace(code))) {
			uint32_t *uip = vt;
			KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
			    uip[0], uip[1], uip[2], uip[3]);
		}

#if CONFIG_REQUIRES_U32_MUNGING
		/* rewrite the 32-bit user argument layout for the 64-bit kernel */
		mungerp = callp->sy_arg_munge32;

		if (mungerp != NULL) {
			(*mungerp)(vt);
		}
#endif
	} else {
		KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START);
	}

	current_cached_proc_cred_update();

	uthread->uu_rval[0] = 0;
	uthread->uu_rval[1] = 0;
	/* UT_NOTCANCELPT: thread is inside a non-cancellation-point syscall */
	uthread->uu_flag |= UT_NOTCANCELPT;
	uthread->syscall_code = code;
	/* cache the pid now for the exit tracepoint below */
	pid = proc_pid(p);

#ifdef CONFIG_IOCOUNT_TRACE
	uthread->uu_iocount = 0;
	uthread->uu_vpindex = 0;
#endif

#if CONFIG_MACF
	/* only syscalls absent from the filter mask are passed to MAC for checking */
	if (__improbable(proc_syscall_filter_mask(p) != NULL && !bitstr_test(proc_syscall_filter_mask(p), syscode))) {
		error = mac_proc_check_syscall_unix(p, syscode);
		if (error) {
			goto skip_syscall;
		}
	}
#endif /* CONFIG_MACF */

	AUDIT_SYSCALL_ENTER(code, p, uthread);
	/* dispatch to the syscall implementation; results land in uu_rval */
	error = (*(callp->sy_call))((void *) p, (void *) vt, &(uthread->uu_rval[0]));
	AUDIT_SYSCALL_EXIT(code, p, uthread, error);

#if CONFIG_MACF
skip_syscall:
#endif /* CONFIG_MACF */

#ifdef CONFIG_IOCOUNT_TRACE
	/* diagnostic: a syscall should not leak vnode iocounts */
	if (uthread->uu_iocount) {
		printf("system call(%d) returned with uu_iocount(%d) != 0\n",
		    syscode, uthread->uu_iocount);
	}
#endif
#if CONFIG_DTRACE
	uthread->t_dtrace_errno = error;
#endif /* CONFIG_DTRACE */

	if (__improbable(error == ERESTART)) {
		/*
		 * Move the user's pc back to repeat the syscall:
		 * 5 bytes for a sysenter, or 2 for an int 8x.
		 * The SYSENTER_TF_CS covers single-stepping over a sysenter
		 * - see debug trap handler in idt.s/idt64.s
		 */

		pal_syscall_restart(thread, state);
	} else if (error != EJUSTRETURN) {
		/* EJUSTRETURN: the syscall already set up the registers itself */
		if (__improbable(error)) {
			regs->eax = error;
			regs->efl |= EFL_CF;    /* carry bit */
		} else { /* (not error) */
			/*
			 * We split retval across two registers, in case the
			 * syscall had a 64-bit return value, in which case
			 * eax/edx matches the function call ABI.
			 */
			regs->eax = uthread->uu_rval[0];
			regs->edx = uthread->uu_rval[1];
		}
	}

	DEBUG_KPRINT_SYSCALL_UNIX(
		"unix_syscall: error=%d retval=(%u,%u)\n",
		error, regs->eax, regs->edx);

	uthread->uu_flag &= ~UT_NOTCANCELPT;
	uthread->syscall_code = 0;

#if DEBUG || DEVELOPMENT
	/* any allocation name set during the syscall must have been cleared */
	kern_allocation_name_t
	prior __assert_only = thread_set_allocation_name(NULL);
	assertf(prior == NULL, "thread_set_allocation_name(\"%s\") not cleared", kern_allocation_get_name(prior));
#endif /* DEBUG || DEVELOPMENT */

	if (__improbable(uthread->uu_lowpri_window)) {
		/*
		 * task is marked as a low priority I/O type
		 * and the I/O we issued while in this system call
		 * collided with normal I/O operations... we'll
		 * delay in order to mitigate the impact of this
		 * task on the normal operation of the system
		 */
		throttle_lowpri_io(1);
	}
	/* exit tracepoint, skipped for the kdebug trace syscalls themselves */
	if (__probable(!code_is_kdebug_trace(code))) {
		KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
		    error, uthread->uu_rval[0], uthread->uu_rval[1], pid);
	}

	/* successful execve gets extra PAL fixup before returning to user */
	if (__improbable(callp->sy_call == (sy_call_t *)execve && !error)) {
		pal_execve_return(thread);
	}

	uthread_assert_zero_proc_refcount(uthread);
	thread_exception_return();
	/* NOTREACHED */
}
270
271 __attribute__((noreturn))
272 void
unix_syscall64(x86_saved_state_t * state)273 unix_syscall64(x86_saved_state_t *state)
274 {
275 thread_t thread;
276 void *vt;
277 unsigned int code, syscode;
278 const struct sysent *callp;
279 int args_in_regs;
280 boolean_t args_start_at_rdi;
281 int error;
282 struct proc *p;
283 struct uthread *uthread;
284 x86_saved_state64_t *regs;
285 pid_t pid;
286
287 assert(is_saved_state64(state));
288 regs = saved_state64(state);
289 #if DEBUG
290 if (regs->rax == 0x2000800) {
291 thread_exception_return();
292 }
293 #endif
294 thread = current_thread();
295 uthread = get_bsdthread_info(thread);
296 p = current_proc();
297
298 uthread_reset_proc_refcount(uthread);
299
300 /* Verify that we are not being called from a task without a proc */
301 if (__improbable(p == NULL)) {
302 regs->rax = EPERM;
303 regs->isf.rflags |= EFL_CF;
304 task_terminate_internal(current_task());
305 thread_exception_return();
306 /* NOTREACHED */
307 }
308
309 code = regs->rax & SYSCALL_NUMBER_MASK;
310 syscode = (code < nsysent) ? code : SYS_invalid;
311 DEBUG_KPRINT_SYSCALL_UNIX(
312 "unix_syscall64: code=%d(%s) rip=%llx\n",
313 code, syscallnames[syscode], regs->isf.rip);
314 callp = &sysent[syscode];
315
316 vt = (void *)uthread->uu_arg;
317
318 if (__improbable(callp == sysent)) {
319 /*
320 * indirect system call... system call number
321 * passed as 'arg0'
322 */
323 code = regs->rdi;
324 syscode = (code < nsysent) ? code : SYS_invalid;
325 callp = &sysent[syscode];
326 args_start_at_rdi = FALSE;
327 args_in_regs = 5;
328 } else {
329 args_start_at_rdi = TRUE;
330 args_in_regs = 6;
331 }
332
333 if (callp->sy_narg != 0) {
334 assert(callp->sy_narg <= 8); /* size of uu_arg */
335
336 args_in_regs = MIN(args_in_regs, callp->sy_narg);
337 memcpy(vt, args_start_at_rdi ? ®s->rdi : ®s->rsi, args_in_regs * sizeof(syscall_arg_t));
338
339 if (!code_is_kdebug_trace(code)) {
340 uint64_t *uip = vt;
341
342 KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
343 uip[0], uip[1], uip[2], uip[3]);
344 }
345
346 if (__improbable(callp->sy_narg > args_in_regs)) {
347 int copyin_count;
348
349 copyin_count = (callp->sy_narg - args_in_regs) * sizeof(syscall_arg_t);
350
351 error = copyin((user_addr_t)(regs->isf.rsp + sizeof(user_addr_t)), (char *)&uthread->uu_arg[args_in_regs], copyin_count);
352 if (error) {
353 regs->rax = error;
354 regs->isf.rflags |= EFL_CF;
355 thread_exception_return();
356 /* NOTREACHED */
357 }
358 }
359 } else {
360 KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START);
361 }
362
363 current_cached_proc_cred_update();
364
365 uthread->uu_rval[0] = 0;
366 uthread->uu_rval[1] = 0;
367 uthread->uu_flag |= UT_NOTCANCELPT;
368 uthread->syscall_code = code;
369 pid = proc_pid(p);
370
371 #ifdef CONFIG_IOCOUNT_TRACE
372 uthread->uu_iocount = 0;
373 uthread->uu_vpindex = 0;
374 #endif
375
376 #if CONFIG_MACF
377 if (__improbable(proc_syscall_filter_mask(p) != NULL && !bitstr_test(proc_syscall_filter_mask(p), syscode))) {
378 error = mac_proc_check_syscall_unix(p, syscode);
379 if (error) {
380 goto skip_syscall;
381 }
382 }
383 #endif /* CONFIG_MACF */
384
385 AUDIT_SYSCALL_ENTER(code, p, uthread);
386 error = (*(callp->sy_call))((void *) p, vt, &(uthread->uu_rval[0]));
387 AUDIT_SYSCALL_EXIT(code, p, uthread, error);
388
389 #if CONFIG_MACF
390 skip_syscall:
391 #endif /* CONFIG_MACF */
392
393 #ifdef CONFIG_IOCOUNT_TRACE
394 if (uthread->uu_iocount) {
395 printf("system call(%d) returned with uu_iocount(%d) != 0\n",
396 syscode, uthread->uu_iocount);
397 }
398 #endif
399
400 #if CONFIG_DTRACE
401 uthread->t_dtrace_errno = error;
402 #endif /* CONFIG_DTRACE */
403
404 if (__improbable(error == ERESTART)) {
405 /*
406 * all system calls come through via the syscall instruction
407 * in 64 bit mode... its 2 bytes in length
408 * move the user's pc back to repeat the syscall:
409 */
410 pal_syscall_restart( thread, state );
411 } else if (error != EJUSTRETURN) {
412 if (__improbable(error)) {
413 regs->rax = error;
414 regs->isf.rflags |= EFL_CF; /* carry bit */
415 } else { /* (not error) */
416 switch (callp->sy_return_type) {
417 case _SYSCALL_RET_INT_T:
418 regs->rax = uthread->uu_rval[0];
419 regs->rdx = uthread->uu_rval[1];
420 break;
421 case _SYSCALL_RET_UINT_T:
422 regs->rax = ((u_int)uthread->uu_rval[0]);
423 regs->rdx = ((u_int)uthread->uu_rval[1]);
424 break;
425 case _SYSCALL_RET_OFF_T:
426 case _SYSCALL_RET_ADDR_T:
427 case _SYSCALL_RET_SIZE_T:
428 case _SYSCALL_RET_SSIZE_T:
429 case _SYSCALL_RET_UINT64_T:
430 regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
431 regs->rdx = 0;
432 break;
433 case _SYSCALL_RET_NONE:
434 break;
435 default:
436 panic("unix_syscall: unknown return type");
437 break;
438 }
439 regs->isf.rflags &= ~EFL_CF;
440 }
441 }
442
443 DEBUG_KPRINT_SYSCALL_UNIX(
444 "unix_syscall64: error=%d retval=(%llu,%llu)\n",
445 error, regs->rax, regs->rdx);
446
447 uthread->uu_flag &= ~UT_NOTCANCELPT;
448 uthread->syscall_code = 0;
449
450 #if DEBUG || DEVELOPMENT
451 kern_allocation_name_t
452 prior __assert_only = thread_set_allocation_name(NULL);
453 assertf(prior == NULL, "thread_set_allocation_name(\"%s\") not cleared", kern_allocation_get_name(prior));
454 #endif /* DEBUG || DEVELOPMENT */
455
456 if (__improbable(uthread->uu_lowpri_window)) {
457 /*
458 * task is marked as a low priority I/O type
459 * and the I/O we issued while in this system call
460 * collided with normal I/O operations... we'll
461 * delay in order to mitigate the impact of this
462 * task on the normal operation of the system
463 */
464 throttle_lowpri_io(1);
465 }
466 if (__probable(!code_is_kdebug_trace(code))) {
467 KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
468 error, uthread->uu_rval[0], uthread->uu_rval[1], pid);
469 }
470
471 uthread_assert_zero_proc_refcount(uthread);
472 thread_exception_return();
473 /* NOTREACHED */
474 }
475
476
/*
 * unix_syscall_return
 *
 * Finish a Unix syscall from outside the normal unix_syscall/unix_syscall64
 * return path: the in-progress syscall number is taken from
 * uthread->syscall_code, and error/retval are written into the saved user
 * registers for whichever ABI (32- or 64-bit) the process uses.
 * Does not return; exits to user space via thread_exception_return().
 */
void
unix_syscall_return(int error)
{
	thread_t thread;
	struct uthread *uthread;
	struct proc *p;
	unsigned int code;
	const struct sysent *callp;

	thread = current_thread();
	uthread = get_bsdthread_info(thread);

	pal_register_cache_state(thread, DIRTY);

	p = current_proc();

	if (proc_is64bit(p)) {
		x86_saved_state64_t *regs;

		regs = saved_state64(find_user_regs(thread));

		/* the syscall number was stashed on uthread at entry */
		code = uthread->syscall_code;
		callp = (code >= nsysent) ? &sysent[SYS_invalid] : &sysent[code];

#if CONFIG_DTRACE
		if (callp->sy_call == dtrace_systrace_syscall) {
			dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
		}
#endif /* CONFIG_DTRACE */
		AUDIT_SYSCALL_EXIT(code, p, uthread, error);

		if (error == ERESTART) {
			/*
			 * repeat the syscall
			 */
			pal_syscall_restart( thread, find_user_regs(thread));
		} else if (error != EJUSTRETURN) {
			/* EJUSTRETURN: the syscall already set up the registers itself */
			if (error) {
				regs->rax = error;
				regs->isf.rflags |= EFL_CF;     /* carry bit */
			} else { /* (not error) */
				/* marshal the return value per the syscall's declared type */
				switch (callp->sy_return_type) {
				case _SYSCALL_RET_INT_T:
					regs->rax = uthread->uu_rval[0];
					regs->rdx = uthread->uu_rval[1];
					break;
				case _SYSCALL_RET_UINT_T:
					regs->rax = ((u_int)uthread->uu_rval[0]);
					regs->rdx = ((u_int)uthread->uu_rval[1]);
					break;
				case _SYSCALL_RET_OFF_T:
				case _SYSCALL_RET_ADDR_T:
				case _SYSCALL_RET_SIZE_T:
				case _SYSCALL_RET_SSIZE_T:
				case _SYSCALL_RET_UINT64_T:
					/* 64-bit result stored across both uu_rval slots */
					regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
					regs->rdx = 0;
					break;
				case _SYSCALL_RET_NONE:
					break;
				default:
					panic("unix_syscall: unknown return type");
					break;
				}
				regs->isf.rflags &= ~EFL_CF;
			}
		}
		DEBUG_KPRINT_SYSCALL_UNIX(
			"unix_syscall_return: error=%d retval=(%llu,%llu)\n",
			error, regs->rax, regs->rdx);
	} else {
		x86_saved_state32_t *regs;

		regs = saved_state32(find_user_regs(thread));

		/* clear the carry (error) flag; set again below only on failure */
		regs->efl &= ~(EFL_CF);

		/* the syscall number was stashed on uthread at entry */
		code = uthread->syscall_code;
		callp = (code >= nsysent) ? &sysent[SYS_invalid] : &sysent[code];

#if CONFIG_DTRACE
		if (callp->sy_call == dtrace_systrace_syscall) {
			dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
		}
#endif /* CONFIG_DTRACE */
		AUDIT_SYSCALL_EXIT(code, p, uthread, error);

		if (error == ERESTART) {
			/* back up the user pc to repeat the syscall */
			pal_syscall_restart( thread, find_user_regs(thread));
		} else if (error != EJUSTRETURN) {
			if (error) {
				regs->eax = error;
				regs->efl |= EFL_CF;    /* carry bit */
			} else { /* (not error) */
				/* 32-bit retval split across eax/edx */
				regs->eax = uthread->uu_rval[0];
				regs->edx = uthread->uu_rval[1];
			}
		}
		DEBUG_KPRINT_SYSCALL_UNIX(
			"unix_syscall_return: error=%d retval=(%u,%u)\n",
			error, regs->eax, regs->edx);
	}


	uthread->uu_flag &= ~UT_NOTCANCELPT;
	uthread->syscall_code = 0;

#if DEBUG || DEVELOPMENT
	/* any allocation name set during the syscall must have been cleared */
	kern_allocation_name_t
	prior __assert_only = thread_set_allocation_name(NULL);
	assertf(prior == NULL, "thread_set_allocation_name(\"%s\") not cleared", kern_allocation_get_name(prior));
#endif /* DEBUG || DEVELOPMENT */

	if (uthread->uu_lowpri_window) {
		/*
		 * task is marked as a low priority I/O type
		 * and the I/O we issued while in this system call
		 * collided with normal I/O operations... we'll
		 * delay in order to mitigate the impact of this
		 * task on the normal operation of the system
		 */
		throttle_lowpri_io(1);
	}
	/* exit tracepoint, skipped for the kdebug trace syscalls themselves */
	if (!code_is_kdebug_trace(code)) {
		KDBG_RELEASE(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
		    error, uthread->uu_rval[0], uthread->uu_rval[1], proc_getpid(p));
	}

	thread_exception_return();
	/* NOTREACHED */
}
608