xref: /xnu-8792.41.9/bsd/dev/dtrace/systrace.c (revision 5c2921b07a2480ab43ec66f5b9e41cb872bc554f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <ptrauth.h>
27 
28 #include <kern/thread.h>
29 #include <mach/thread_status.h>
30 
31 /* XXX All of these should really be derived from syscall_sw.h */
32 #if defined (__x86_64__)
33 #define SYSCALL_CLASS_SHIFT 24
34 #define SYSCALL_CLASS_MASK  (0xFF << SYSCALL_CLASS_SHIFT)
35 #define SYSCALL_NUMBER_MASK (~SYSCALL_CLASS_MASK)
36 #define I386_SYSCALL_NUMBER_MASK (0xFFFF)
37 #endif
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/proc.h>
42 #include <sys/errno.h>
43 #include <sys/ioctl.h>
44 #include <sys/conf.h>
45 #include <sys/fcntl.h>
46 #include <sys/syscall.h>
47 #include <miscfs/devfs/devfs.h>
48 
49 #include <sys/dtrace.h>
50 #include <sys/dtrace_impl.h>
51 #include <sys/systrace_args.h>
52 #include "systrace.h"
53 #include <sys/stat.h>
54 #include <sys/systm.h>
55 #include <sys/conf.h>
56 #include <sys/user.h>
57 
58 #include <machine/pal_routines.h>
59 
60 #if defined (__x86_64__)
61 #define SYSTRACE_ARTIFICIAL_FRAMES      2
62 #define MACHTRACE_ARTIFICIAL_FRAMES 3
63 #elif defined(__arm64__)
64 #define SYSTRACE_ARTIFICIAL_FRAMES  2
65 #define MACHTRACE_ARTIFICIAL_FRAMES 3
66 #else
67 #error Unknown Architecture
68 #endif
69 
70 #define SYSTRACE_NARGS (int)(sizeof(((uthread_t)NULL)->uu_arg) / sizeof(((uthread_t)NULL)->uu_arg[0]))
71 
72 #include <sys/sysent.h>
73 #define sy_callc sy_call /* Map Solaris slot name to Darwin's */
74 #define NSYSCALL nsysent /* and is less than 500 or so */
75 
76 extern const char *syscallnames[];
77 
78 #include <sys/dtrace_glue.h>
79 #define casptr dtrace_casptr
80 #define membar_enter dtrace_membar_producer
81 
82 #define LOADABLE_SYSCALL(a) 0 /* Not pertinent to Darwin. */
83 #define LOADED_SYSCALL(a) 1 /* Not pertinent to Darwin. */
84 
85 static LCK_MTX_DECLARE_ATTR(dtrace_systrace_lock,
86     &dtrace_lck_grp, &dtrace_lck_attr);           /* probe state lock */
87 
88 systrace_sysent_t *systrace_sysent = NULL;
89 void (*systrace_probe)(dtrace_id_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t);
90 
91 static uint64_t systrace_getargval(void *, dtrace_id_t, void *, int, int);
92 static void systrace_getargdesc(void *, dtrace_id_t, void *, dtrace_argdesc_t *);
93 
94 void
systrace_stub(dtrace_id_t id,uint64_t arg0,uint64_t arg1,uint64_t arg2,uint64_t arg3,uint64_t arg4)95 systrace_stub(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
96     uint64_t arg2, uint64_t arg3, uint64_t arg4)
97 {
98 #pragma unused(id,arg0,arg1,arg2,arg3,arg4)
99 }
100 
101 int32_t
dtrace_systrace_syscall(struct proc * pp,void * uap,int * rv)102 dtrace_systrace_syscall(struct proc *pp, void *uap, int *rv)
103 {
104 	unsigned short      code;       /* The system call number */
105 
106 	systrace_sysent_t *sy;
107 	dtrace_id_t id;
108 	int32_t rval;
109 	syscall_arg_t *ip = (syscall_arg_t *)uap;
110 	uint64_t uargs[SYSTRACE_NARGS] = {0};
111 
112 #if defined (__x86_64__)
113 	{
114 		pal_register_cache_state(current_thread(), VALID);
115 		x86_saved_state_t   *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread());
116 
117 		if (is_saved_state64(tagged_regs)) {
118 			x86_saved_state64_t *regs = saved_state64(tagged_regs);
119 			code = regs->rax & SYSCALL_NUMBER_MASK;
120 			/*
121 			 * Check for indirect system call... system call number
122 			 * passed as 'arg0'
123 			 */
124 			if (code == 0) {
125 				code = regs->rdi;
126 			}
127 		} else {
128 			code = saved_state32(tagged_regs)->eax & I386_SYSCALL_NUMBER_MASK;
129 
130 			if (code == 0) {
131 				vm_offset_t params = (vm_offset_t) (saved_state32(tagged_regs)->uesp + sizeof(int));
132 				code = fuword(params);
133 			}
134 		}
135 	}
136 #elif defined(__arm64__)
137 	{
138 		/*
139 		 * On arm64, syscall numbers depend on a flavor (indirect or not)
140 		 * ... and for u32 can be in either r0 or r12
141 		 * ... and for u64 can be in either x0 or x16
142 		 */
143 
144 		/* see bsd/dev/arm/systemcalls.c:arm_get_syscall_number */
145 		arm_saved_state_t *arm_regs = (arm_saved_state_t *) find_user_regs(current_thread());
146 
147 		if (is_saved_state32(arm_regs)) {
148 			/* Check for indirect system call */
149 			if (saved_state32(arm_regs)->r[12] != 0) {
150 				code = saved_state32(arm_regs)->r[12];
151 			} else {
152 				code = saved_state32(arm_regs)->r[0];
153 			}
154 		} else {
155 			/* Check for indirect system call */
156 			if (saved_state64(arm_regs)->x[ARM64_SYSCALL_CODE_REG_NUM] != 0) {
157 				code = saved_state64(arm_regs)->x[ARM64_SYSCALL_CODE_REG_NUM];
158 			} else {
159 				code = saved_state64(arm_regs)->x[0];
160 			}
161 		}
162 	}
163 #else
164 #error Unknown Architecture
165 #endif
166 
167 	// Bounds "check" the value of code a la unix_syscall
168 	sy = (code >= nsysent) ? &systrace_sysent[SYS_invalid] : &systrace_sysent[code];
169 
170 	systrace_args(code, ip, uargs);
171 
172 	if ((id = sy->stsy_entry) != DTRACE_IDNONE) {
173 		uthread_t uthread = current_uthread();
174 		if (uthread) {
175 			uthread->t_dtrace_syscall_args = uargs;
176 		}
177 
178 		static_assert(SYSTRACE_NARGS >= 5, "not enough system call arguments");
179 		(*systrace_probe)(id, uargs[0], uargs[1], uargs[2], uargs[3], uargs[4]);
180 
181 		if (uthread) {
182 			uthread->t_dtrace_syscall_args = NULL;
183 		}
184 	}
185 
186 
187 
188 #if 0 /* XXX */
189 	/*
190 	 * APPLE NOTE: Not implemented.
191 	 * We want to explicitly allow DTrace consumers to stop a process
192 	 * before it actually executes the meat of the syscall.
193 	 */
194 	p = ttoproc(curthread);
195 	mutex_enter(&p->p_lock);
196 	if (curthread->t_dtrace_stop && !curthread->t_lwp->lwp_nostop) {
197 		curthread->t_dtrace_stop = 0;
198 		stop(PR_REQUESTED, 0);
199 	}
200 	mutex_exit(&p->p_lock);
201 #endif
202 
203 	rval = (*sy->stsy_underlying)(pp, uap, rv);
204 
205 	if ((id = sy->stsy_return) != DTRACE_IDNONE) {
206 		uint64_t munged_rv0, munged_rv1;
207 		uthread_t uthread = current_uthread();
208 
209 		if (uthread) {
210 			uthread->t_dtrace_errno = rval; /* Establish t_dtrace_errno now in case this enabling refers to it. */
211 		}
212 		/*
213 		 * "Decode" rv for use in the call to dtrace_probe()
214 		 */
215 		if (rval == ERESTART) {
216 			munged_rv0 = -1LL; /* System call will be reissued in user mode. Make DTrace report a -1 return. */
217 			munged_rv1 = -1LL;
218 		} else if (rval != EJUSTRETURN) {
219 			if (rval) {
220 				munged_rv0 = -1LL; /* Mimic what libc will do. */
221 				munged_rv1 = -1LL;
222 			} else {
223 				switch (sy->stsy_return_type) {
224 				case _SYSCALL_RET_INT_T:
225 					munged_rv0 = rv[0];
226 					munged_rv1 = rv[1];
227 					break;
228 				case _SYSCALL_RET_UINT_T:
229 					munged_rv0 = ((u_int)rv[0]);
230 					munged_rv1 = ((u_int)rv[1]);
231 					break;
232 				case _SYSCALL_RET_OFF_T:
233 				case _SYSCALL_RET_UINT64_T:
234 					munged_rv0 = *(u_int64_t *)rv;
235 					munged_rv1 = 0LL;
236 					break;
237 				case _SYSCALL_RET_ADDR_T:
238 				case _SYSCALL_RET_SIZE_T:
239 				case _SYSCALL_RET_SSIZE_T:
240 					munged_rv0 = *(user_addr_t *)rv;
241 					munged_rv1 = 0LL;
242 					break;
243 				case _SYSCALL_RET_NONE:
244 					munged_rv0 = 0LL;
245 					munged_rv1 = 0LL;
246 					break;
247 				default:
248 					munged_rv0 = 0LL;
249 					munged_rv1 = 0LL;
250 					break;
251 				}
252 			}
253 		} else {
254 			munged_rv0 = 0LL;
255 			munged_rv1 = 0LL;
256 		}
257 
258 		/*
259 		 * <http://mail.opensolaris.org/pipermail/dtrace-discuss/2007-January/003276.html> says:
260 		 *
261 		 * "This is a bit of an historical artifact. At first, the syscall provider just
262 		 * had its return value in arg0, and the fbt and pid providers had their return
263 		 * values in arg1 (so that we could use arg0 for the offset of the return site).
264 		 *
265 		 * We inevitably started writing scripts where we wanted to see the return
266 		 * values from probes in all three providers, and we made this script easier
267 		 * to write by replicating the syscall return values in arg1 to match fbt and
268 		 * pid. We debated briefly about removing the return value from arg0, but
269 		 * decided that it would be less confusing to have the same data in two places
270 		 * than to have some non-helpful, non-intuitive value in arg0.
271 		 *
272 		 * This change was made 4/23/2003 according to the DTrace project's putback log."
273 		 */
274 		(*systrace_probe)(id, munged_rv0, munged_rv0, munged_rv1, (uint64_t)rval, 0);
275 	}
276 
277 	return rval;
278 }
279 
280 void
dtrace_systrace_syscall_return(unsigned short code,int rval,int * rv)281 dtrace_systrace_syscall_return(unsigned short code, int rval, int *rv)
282 {
283 	systrace_sysent_t *sy;
284 	dtrace_id_t id;
285 
286 	// Bounds "check" the value of code a la unix_syscall_return
287 	sy = (code >= nsysent) ? &systrace_sysent[SYS_invalid] : &systrace_sysent[code];
288 
289 	if ((id = sy->stsy_return) != DTRACE_IDNONE) {
290 		uint64_t munged_rv0, munged_rv1;
291 		uthread_t uthread = current_uthread();
292 
293 		if (uthread) {
294 			uthread->t_dtrace_errno = rval; /* Establish t_dtrace_errno now in case this enabling refers to it. */
295 		}
296 		/*
297 		 * "Decode" rv for use in the call to dtrace_probe()
298 		 */
299 		if (rval == ERESTART) {
300 			munged_rv0 = -1LL; /* System call will be reissued in user mode. Make DTrace report a -1 return. */
301 			munged_rv1 = -1LL;
302 		} else if (rval != EJUSTRETURN) {
303 			if (rval) {
304 				munged_rv0 = -1LL; /* Mimic what libc will do. */
305 				munged_rv1 = -1LL;
306 			} else {
307 				switch (sy->stsy_return_type) {
308 				case _SYSCALL_RET_INT_T:
309 					munged_rv0 = rv[0];
310 					munged_rv1 = rv[1];
311 					break;
312 				case _SYSCALL_RET_UINT_T:
313 					munged_rv0 = ((u_int)rv[0]);
314 					munged_rv1 = ((u_int)rv[1]);
315 					break;
316 				case _SYSCALL_RET_OFF_T:
317 				case _SYSCALL_RET_UINT64_T:
318 					munged_rv0 = *(u_int64_t *)rv;
319 					munged_rv1 = 0LL;
320 					break;
321 				case _SYSCALL_RET_ADDR_T:
322 				case _SYSCALL_RET_SIZE_T:
323 				case _SYSCALL_RET_SSIZE_T:
324 					munged_rv0 = *(user_addr_t *)rv;
325 					munged_rv1 = 0LL;
326 					break;
327 				case _SYSCALL_RET_NONE:
328 					munged_rv0 = 0LL;
329 					munged_rv1 = 0LL;
330 					break;
331 				default:
332 					munged_rv0 = 0LL;
333 					munged_rv1 = 0LL;
334 					break;
335 				}
336 			}
337 		} else {
338 			munged_rv0 = 0LL;
339 			munged_rv1 = 0LL;
340 		}
341 
342 		(*systrace_probe)(id, munged_rv0, munged_rv0, munged_rv1, (uint64_t)rval, 0);
343 	}
344 }
345 
/*
 * Encoding of the per-probe cookie passed to dtrace_probe_create():
 * the low SYSTRACE_SHIFT bits hold the syscall number, and the bit at
 * position SYSTRACE_SHIFT is set for entry probes and clear for return
 * probes.
 */
#define SYSTRACE_SHIFT                  16

/* Build a cookie from a syscall number. */
#define SYSTRACE_ENTRY(id)              ((1 << SYSTRACE_SHIFT) | (id))
#define SYSTRACE_RETURN(id)             (id)

/* Take a cookie apart again. */
#define SYSTRACE_ISENTRY(x)             ((int)(x) >> SYSTRACE_SHIFT)
#define SYSTRACE_SYSNUM(x)              ((int)(x) & ((1 << SYSTRACE_SHIFT) - 1))

/*
 * NOTE(review): NSYSCALL is defined as `nsysent` earlier in this file. If
 * nsysent is a variable rather than a macro, the preprocessor evaluates it
 * as 0 here and this guard can never trigger - confirm.
 */
#if ((1 << SYSTRACE_SHIFT) <= NSYSCALL)
#error 1 << SYSTRACE_SHIFT must exceed number of system calls
#endif
355 
356 static dtrace_provider_id_t systrace_id;
357 
358 /*
359  * APPLE NOTE: Avoid name clash with Darwin automagic conf symbol.
360  * See balanced undef below.
361  */
362 #define systrace_init _systrace_init
363 
364 static void
systrace_init(const struct sysent * actual,systrace_sysent_t ** interposed)365 systrace_init(const struct sysent *actual, systrace_sysent_t **interposed)
366 {
367 	systrace_sysent_t *ssysent = *interposed;  /* Avoid sysent shadow warning
368 	                                            *       from bsd/sys/sysent.h */
369 	unsigned int i;
370 
371 	if (ssysent == NULL) {
372 		*interposed = ssysent = kmem_zalloc(sizeof(systrace_sysent_t) *
373 		    NSYSCALL, KM_SLEEP);
374 	}
375 
376 	for (i = 0; i < NSYSCALL; i++) {
377 		/* Use of volatile protects the if statement below from being optimized away */
378 		const volatile struct sysent *a = &actual[i];
379 		systrace_sysent_t *s = &ssysent[i];
380 
381 		if (LOADABLE_SYSCALL(a) && !LOADED_SYSCALL(a)) {
382 			continue;
383 		}
384 
385 		if (a->sy_callc == dtrace_systrace_syscall) {
386 			continue;
387 		}
388 
389 		s->stsy_underlying = a->sy_callc;
390 		s->stsy_return_type = a->sy_return_type;
391 	}
392 }
393 
394 
395 /*ARGSUSED*/
396 static void
systrace_provide(void * arg,const dtrace_probedesc_t * desc)397 systrace_provide(void *arg, const dtrace_probedesc_t *desc)
398 {
399 #pragma unused(arg) /* __APPLE__ */
400 	unsigned int i;
401 
402 	if (desc != NULL) {
403 		return;
404 	}
405 
406 	systrace_init(sysent, &systrace_sysent);
407 
408 	for (i = 0; i < NSYSCALL; i++) {
409 		if (systrace_sysent[i].stsy_underlying == NULL) {
410 			continue;
411 		}
412 
413 		if (dtrace_probe_lookup(systrace_id, NULL,
414 		    syscallnames[i], "entry") != 0) {
415 			continue;
416 		}
417 
418 		(void) dtrace_probe_create(systrace_id, NULL, syscallnames[i],
419 		    "entry", SYSTRACE_ARTIFICIAL_FRAMES,
420 		    (void *)((uintptr_t)SYSTRACE_ENTRY(i)));
421 		(void) dtrace_probe_create(systrace_id, NULL, syscallnames[i],
422 		    "return", SYSTRACE_ARTIFICIAL_FRAMES,
423 		    (void *)((uintptr_t)SYSTRACE_RETURN(i)));
424 
425 		systrace_sysent[i].stsy_entry = DTRACE_IDNONE;
426 		systrace_sysent[i].stsy_return = DTRACE_IDNONE;
427 	}
428 }
429 #undef systrace_init
430 
431 /*ARGSUSED*/
432 static void
systrace_destroy(void * arg,dtrace_id_t id,void * parg)433 systrace_destroy(void *arg, dtrace_id_t id, void *parg)
434 {
435 #pragma unused(arg,id) /* __APPLE__ */
436 
437 	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
438 
439 #pragma unused(sysnum)  /* __APPLE__ */
440 	/*
441 	 * There's nothing to do here but assert that we have actually been
442 	 * disabled.
443 	 */
444 	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
445 		ASSERT(systrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE);
446 	} else {
447 		ASSERT(systrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
448 	}
449 }
450 
451 /*ARGSUSED*/
452 static int
systrace_enable(void * arg,dtrace_id_t id,void * parg)453 systrace_enable(void *arg, dtrace_id_t id, void *parg)
454 {
455 #pragma unused(arg) /* __APPLE__ */
456 
457 	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
458 	int enabled = (systrace_sysent[sysnum].stsy_entry != DTRACE_IDNONE ||
459 	    systrace_sysent[sysnum].stsy_return != DTRACE_IDNONE);
460 
461 	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
462 		systrace_sysent[sysnum].stsy_entry = id;
463 	} else {
464 		systrace_sysent[sysnum].stsy_return = id;
465 	}
466 
467 	if (enabled) {
468 		ASSERT(sysent[sysnum].sy_callc == dtrace_systrace_syscall);
469 		return 0;
470 	}
471 
472 	lck_mtx_lock(&dtrace_systrace_lock);
473 	if (sysent[sysnum].sy_callc == systrace_sysent[sysnum].stsy_underlying) {
474 		/* It is not possible to write to sysent[] directly because it is const. */
475 		vm_offset_t dss = ptrauth_nop_cast(vm_offset_t, &dtrace_systrace_syscall);
476 		ml_nofault_copy((vm_offset_t)&dss, (vm_offset_t)&sysent[sysnum].sy_callc, sizeof(vm_offset_t));
477 	}
478 	lck_mtx_unlock(&dtrace_systrace_lock);
479 
480 	return 0;
481 }
482 
483 /*ARGSUSED*/
484 static void
systrace_disable(void * arg,dtrace_id_t id,void * parg)485 systrace_disable(void *arg, dtrace_id_t id, void *parg)
486 {
487 #pragma unused(arg,id) /* __APPLE__ */
488 
489 	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
490 	int disable = (systrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE ||
491 	    systrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
492 
493 	if (disable) {
494 		/*
495 		 * Usage of volatile protects the if statement below from being optimized away.
496 		 *
497 		 * Compilers are clever and know that const array values can't change in time
498 		 * and the if below is always false. That is because it can't see that DTrace
499 		 * injects dtrace_systrace_syscall dynamically and violates constness of the
500 		 * array.
501 		 */
502 		volatile const struct sysent *syscallent = &sysent[sysnum];
503 
504 		lck_mtx_lock(&dtrace_systrace_lock);
505 		if (syscallent->sy_callc == dtrace_systrace_syscall) {
506 			ml_nofault_copy((vm_offset_t)&systrace_sysent[sysnum].stsy_underlying,
507 			    (vm_offset_t)&syscallent->sy_callc, sizeof(vm_offset_t));
508 		}
509 		lck_mtx_unlock(&dtrace_systrace_lock);
510 	}
511 
512 	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
513 		systrace_sysent[sysnum].stsy_entry = DTRACE_IDNONE;
514 	} else {
515 		systrace_sysent[sysnum].stsy_return = DTRACE_IDNONE;
516 	}
517 }
518 
519 static dtrace_pattr_t systrace_attr = {
520 	{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
521 	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
522 	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
523 	{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
524 	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
525 };
526 
527 static dtrace_pops_t systrace_pops = {
528 	.dtps_provide =         systrace_provide,
529 	.dtps_provide_module =  NULL,
530 	.dtps_enable =          systrace_enable,
531 	.dtps_disable =         systrace_disable,
532 	.dtps_suspend =         NULL,
533 	.dtps_resume =          NULL,
534 	.dtps_getargdesc =      systrace_getargdesc,
535 	.dtps_getargval =       systrace_getargval,
536 	.dtps_usermode =        NULL,
537 	.dtps_destroy =         systrace_destroy
538 };
539 
540 static int
systrace_attach(dev_info_t * devi)541 systrace_attach(dev_info_t *devi)
542 {
543 	systrace_probe = (void*)&dtrace_probe;
544 	membar_enter();
545 
546 	if (ddi_create_minor_node(devi, "systrace", S_IFCHR, 0,
547 	    DDI_PSEUDO, 0) == DDI_FAILURE ||
548 	    dtrace_register("syscall", &systrace_attr, DTRACE_PRIV_USER, NULL,
549 	    &systrace_pops, NULL, &systrace_id) != 0) {
550 		systrace_probe = systrace_stub;
551 		ddi_remove_minor_node(devi, NULL);
552 		return DDI_FAILURE;
553 	}
554 
555 	return DDI_SUCCESS;
556 }
557 
558 
559 /*
560  * APPLE NOTE:  systrace_detach not implemented
561  */
562 #if !defined(__APPLE__)
563 static int
systrace_detach(dev_info_t * devi,ddi_detach_cmd_t cmd)564 systrace_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
565 {
566 	switch (cmd) {
567 	case DDI_DETACH:
568 		break;
569 	case DDI_SUSPEND:
570 		return DDI_SUCCESS;
571 	default:
572 		return DDI_FAILURE;
573 	}
574 
575 	if (dtrace_unregister(systrace_id) != 0) {
576 		return DDI_FAILURE;
577 	}
578 
579 	ddi_remove_minor_node(devi, NULL);
580 	systrace_probe = systrace_stub;
581 	return DDI_SUCCESS;
582 }
583 #endif /* __APPLE__ */
584 
585 
586 typedef kern_return_t (*mach_call_t)(void *);
587 
588 /* APPLE NOTE: From #include <kern/syscall_sw.h> which may be changed for 64 bit! */
589 #if CONFIG_REQUIRES_U32_MUNGING
590 typedef void mach_munge_t(void *);
591 #elif __arm__ && (__BIGGEST_ALIGNMENT__ > 4)
592 typedef int mach_munge_t(const void *, void *);
593 #endif
594 
595 typedef struct {
596 	unsigned char           mach_trap_arg_count; /* Number of trap arguments (Arch independant) */
597 	unsigned char           mach_trap_u32_words; /* number of 32-bit words to copyin for U32 */
598 	unsigned char           mach_trap_returns_port;
599 	unsigned char           __mach_trap_padding;
600 	kern_return_t         (*mach_trap_function)(void *);
601 #if CONFIG_REQUIRES_U32_MUNGING || (__arm__ && (__BIGGEST_ALIGNMENT__ > 4))
602 	mach_munge_t           *mach_trap_arg_munge32; /* system call argument munger routine for 32-bit */
603 #endif
604 #if MACH_ASSERT
605 	const char             *mach_trap_name;
606 #endif /* MACH_ASSERT */
607 } mach_trap_t;
608 
609 
610 #define MACH_TRAP_TABLE_COUNT   128
611 
612 extern const mach_trap_t        mach_trap_table[MACH_TRAP_TABLE_COUNT];
613 extern const int                mach_trap_count;
614 extern const char * const       mach_syscall_name_table[MACH_TRAP_TABLE_COUNT];
615 
616 
617 /* XXX From osfmk/i386/bsd_i386.c */
618 struct mach_call_args {
619 	syscall_arg_t arg1;
620 	syscall_arg_t arg2;
621 	syscall_arg_t arg3;
622 	syscall_arg_t arg4;
623 	syscall_arg_t arg5;
624 	syscall_arg_t arg6;
625 	syscall_arg_t arg7;
626 	syscall_arg_t arg8;
627 	syscall_arg_t arg9;
628 };
629 
/* From here on NSYSCALL counts Mach traps rather than BSD syscalls. */
#undef NSYSCALL
#define NSYSCALL mach_trap_count

/*
 * The probe cookie keeps the trap number in its low SYSTRACE_SHIFT bits, so
 * 1 << SYSTRACE_SHIFT must exceed the number of Mach traps.
 *
 * mach_trap_count is a variable, not a macro, so it cannot be tested by the
 * preprocessor: an identifier that is not a macro evaluates as 0 inside #if,
 * which made the previous check against NSYSCALL unable to fire. The check
 * now uses the declared size of the trap table instead.
 * NOTE(review): assumes mach_trap_count never exceeds MACH_TRAP_TABLE_COUNT - confirm.
 */
#if ((1 << SYSTRACE_SHIFT) <= MACH_TRAP_TABLE_COUNT)
#error 1 << SYSTRACE_SHIFT must exceed number of Mach traps
#endif
636 
637 typedef struct machtrace_sysent {
638 	dtrace_id_t     stsy_entry;
639 	dtrace_id_t     stsy_return;
640 	kern_return_t   (*stsy_underlying)(void *);
641 	int32_t         stsy_return_type;
642 } machtrace_sysent_t;
643 
644 static machtrace_sysent_t *machtrace_sysent = NULL;
645 
646 void (*machtrace_probe)(dtrace_id_t, uint64_t, uint64_t,
647     uint64_t, uint64_t, uint64_t);
648 
649 static uint64_t machtrace_getarg(void *, dtrace_id_t, void *, int, int);
650 
651 static dtrace_provider_id_t machtrace_id;
652 
653 static kern_return_t
dtrace_machtrace_syscall(struct mach_call_args * args)654 dtrace_machtrace_syscall(struct mach_call_args *args)
655 {
656 	int code;       /* The mach call number */
657 
658 	machtrace_sysent_t *sy;
659 	dtrace_id_t id;
660 	kern_return_t rval;
661 #if 0 /* XXX */
662 	proc_t *p;
663 #endif
664 	syscall_arg_t *ip = (syscall_arg_t *)args;
665 	mach_call_t mach_call;
666 
667 #if defined (__x86_64__)
668 	{
669 		pal_register_cache_state(current_thread(), VALID);
670 		x86_saved_state_t   *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread());
671 
672 		if (is_saved_state64(tagged_regs)) {
673 			code = saved_state64(tagged_regs)->rax & SYSCALL_NUMBER_MASK;
674 		} else {
675 			code = -saved_state32(tagged_regs)->eax;
676 		}
677 	}
678 #elif defined(__arm64__)
679 	{
680 		/* From arm/thread_status.h:get_saved_state_svc_number */
681 		arm_saved_state_t *arm_regs = (arm_saved_state_t *) find_user_regs(current_thread());
682 		if (is_saved_state32(arm_regs)) {
683 			code = (int)saved_state32(arm_regs)->r[12];
684 		} else {
685 			code = (int)saved_state64(arm_regs)->x[ARM64_SYSCALL_CODE_REG_NUM];
686 		}
687 
688 		/* From bsd/arm64.c:mach_syscall */
689 		ASSERT(code < 0);    /* Otherwise it would be a Unix syscall */
690 		code = -code;
691 	}
692 #else
693 #error Unknown Architecture
694 #endif
695 
696 	sy = &machtrace_sysent[code];
697 
698 	if ((id = sy->stsy_entry) != DTRACE_IDNONE) {
699 		uthread_t uthread = current_uthread();
700 
701 		if (uthread) {
702 			uthread->t_dtrace_syscall_args = (void *)ip;
703 		}
704 
705 		(*machtrace_probe)(id, *ip, *(ip + 1), *(ip + 2), *(ip + 3), *(ip + 4));
706 
707 		if (uthread) {
708 			uthread->t_dtrace_syscall_args = (void *)0;
709 		}
710 	}
711 
712 #if 0 /* XXX */
713 	/*
714 	 * APPLE NOTE:  Not implemented.
715 	 * We want to explicitly allow DTrace consumers to stop a process
716 	 * before it actually executes the meat of the syscall.
717 	 */
718 	p = ttoproc(curthread);
719 	mutex_enter(&p->p_lock);
720 	if (curthread->t_dtrace_stop && !curthread->t_lwp->lwp_nostop) {
721 		curthread->t_dtrace_stop = 0;
722 		stop(PR_REQUESTED, 0);
723 	}
724 	mutex_exit(&p->p_lock);
725 #endif
726 
727 	mach_call = (mach_call_t)(*sy->stsy_underlying);
728 	rval = mach_call(args);
729 
730 	if ((id = sy->stsy_return) != DTRACE_IDNONE) {
731 		(*machtrace_probe)(id, (uint64_t)rval, 0, 0, 0, 0);
732 	}
733 
734 	return rval;
735 }
736 
737 static void
machtrace_init(const mach_trap_t * actual,machtrace_sysent_t ** interposed)738 machtrace_init(const mach_trap_t *actual, machtrace_sysent_t **interposed)
739 {
740 	machtrace_sysent_t *msysent = *interposed;
741 	int i;
742 
743 	if (msysent == NULL) {
744 		*interposed = msysent = kmem_zalloc(sizeof(machtrace_sysent_t) *
745 		    NSYSCALL, KM_SLEEP);
746 	}
747 
748 	for (i = 0; i < NSYSCALL; i++) {
749 		const volatile mach_trap_t *a = &actual[i];
750 		machtrace_sysent_t *s = &msysent[i];
751 
752 		if (LOADABLE_SYSCALL(a) && !LOADED_SYSCALL(a)) {
753 			continue;
754 		}
755 
756 		if (a->mach_trap_function == (mach_call_t)(dtrace_machtrace_syscall)) {
757 			continue;
758 		}
759 
760 		s->stsy_underlying = a->mach_trap_function;
761 	}
762 }
763 
764 /*ARGSUSED*/
765 static void
machtrace_provide(void * arg,const dtrace_probedesc_t * desc)766 machtrace_provide(void *arg, const dtrace_probedesc_t *desc)
767 {
768 #pragma unused(arg) /* __APPLE__ */
769 
770 	int i;
771 
772 	if (desc != NULL) {
773 		return;
774 	}
775 
776 	machtrace_init(mach_trap_table, &machtrace_sysent);
777 
778 	for (i = 0; i < NSYSCALL; i++) {
779 		if (machtrace_sysent[i].stsy_underlying == NULL) {
780 			continue;
781 		}
782 
783 		if (dtrace_probe_lookup(machtrace_id, NULL,
784 		    mach_syscall_name_table[i], "entry") != 0) {
785 			continue;
786 		}
787 
788 		(void) dtrace_probe_create(machtrace_id, NULL, mach_syscall_name_table[i],
789 		    "entry", MACHTRACE_ARTIFICIAL_FRAMES,
790 		    (void *)((uintptr_t)SYSTRACE_ENTRY(i)));
791 		(void) dtrace_probe_create(machtrace_id, NULL, mach_syscall_name_table[i],
792 		    "return", MACHTRACE_ARTIFICIAL_FRAMES,
793 		    (void *)((uintptr_t)SYSTRACE_RETURN(i)));
794 
795 		machtrace_sysent[i].stsy_entry = DTRACE_IDNONE;
796 		machtrace_sysent[i].stsy_return = DTRACE_IDNONE;
797 	}
798 }
799 
800 /*ARGSUSED*/
801 static void
machtrace_destroy(void * arg,dtrace_id_t id,void * parg)802 machtrace_destroy(void *arg, dtrace_id_t id, void *parg)
803 {
804 #pragma unused(arg,id) /* __APPLE__ */
805 	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
806 
807 #pragma unused(sysnum) /* __APPLE__ */
808 
809 	/*
810 	 * There's nothing to do here but assert that we have actually been
811 	 * disabled.
812 	 */
813 	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
814 		ASSERT(machtrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE);
815 	} else {
816 		ASSERT(machtrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
817 	}
818 }
819 
820 /*ARGSUSED*/
821 static int
machtrace_enable(void * arg,dtrace_id_t id,void * parg)822 machtrace_enable(void *arg, dtrace_id_t id, void *parg)
823 {
824 #pragma unused(arg) /* __APPLE__ */
825 
826 	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
827 	int enabled = (machtrace_sysent[sysnum].stsy_entry != DTRACE_IDNONE ||
828 	    machtrace_sysent[sysnum].stsy_return != DTRACE_IDNONE);
829 
830 	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
831 		machtrace_sysent[sysnum].stsy_entry = id;
832 	} else {
833 		machtrace_sysent[sysnum].stsy_return = id;
834 	}
835 
836 	if (enabled) {
837 		ASSERT(mach_trap_table[sysnum].mach_trap_function == (void *)dtrace_machtrace_syscall);
838 		return 0;
839 	}
840 
841 	lck_mtx_lock(&dtrace_systrace_lock);
842 
843 	if (mach_trap_table[sysnum].mach_trap_function == machtrace_sysent[sysnum].stsy_underlying) {
844 		/* It is not possible to write to mach_trap_table[] directly because it is const. */
845 		vm_offset_t dss = ptrauth_nop_cast(vm_offset_t, &dtrace_machtrace_syscall);
846 		ml_nofault_copy((vm_offset_t)&dss, (vm_offset_t)&mach_trap_table[sysnum].mach_trap_function, sizeof(vm_offset_t));
847 	}
848 
849 	lck_mtx_unlock(&dtrace_systrace_lock);
850 
851 	return 0;
852 }
853 
854 /*ARGSUSED*/
855 static void
machtrace_disable(void * arg,dtrace_id_t id,void * parg)856 machtrace_disable(void *arg, dtrace_id_t id, void *parg)
857 {
858 #pragma unused(arg,id) /* __APPLE__ */
859 
860 	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
861 	int disable = (machtrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE ||
862 	    machtrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
863 
864 	if (disable) {
865 		/*
866 		 * Usage of volatile protects the if statement below from being optimized away.
867 		 *
868 		 * Compilers are clever and know that const array values can't change in time
869 		 * and the if below is always false. That is because it can't see that DTrace
870 		 * injects dtrace_machtrace_syscall dynamically and violates constness of the
871 		 * array.
872 		 */
873 		volatile const mach_trap_t *machtrap = &mach_trap_table[sysnum];
874 
875 		lck_mtx_lock(&dtrace_systrace_lock);
876 		if (machtrap->mach_trap_function == (mach_call_t)dtrace_machtrace_syscall) {
877 			ml_nofault_copy((vm_offset_t)&machtrace_sysent[sysnum].stsy_underlying,
878 			    (vm_offset_t)&machtrap->mach_trap_function, sizeof(vm_offset_t));
879 		}
880 		lck_mtx_unlock(&dtrace_systrace_lock);
881 	}
882 
883 	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
884 		machtrace_sysent[sysnum].stsy_entry = DTRACE_IDNONE;
885 	} else {
886 		machtrace_sysent[sysnum].stsy_return = DTRACE_IDNONE;
887 	}
888 }
889 
890 static dtrace_pattr_t machtrace_attr = {
891 	{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
892 	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
893 	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
894 	{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
895 	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
896 };
897 
898 static dtrace_pops_t machtrace_pops = {
899 	.dtps_provide =         machtrace_provide,
900 	.dtps_provide_module =  NULL,
901 	.dtps_enable =          machtrace_enable,
902 	.dtps_disable =         machtrace_disable,
903 	.dtps_suspend =         NULL,
904 	.dtps_resume =          NULL,
905 	.dtps_getargdesc =      NULL,
906 	.dtps_getargval =       machtrace_getarg,
907 	.dtps_usermode =        NULL,
908 	.dtps_destroy =         machtrace_destroy
909 };
910 
911 static int
machtrace_attach(dev_info_t * devi)912 machtrace_attach(dev_info_t *devi)
913 {
914 	machtrace_probe = dtrace_probe;
915 	membar_enter();
916 
917 	if (ddi_create_minor_node(devi, "machtrace", S_IFCHR, 0,
918 	    DDI_PSEUDO, 0) == DDI_FAILURE ||
919 	    dtrace_register("mach_trap", &machtrace_attr, DTRACE_PRIV_USER, NULL,
920 	    &machtrace_pops, NULL, &machtrace_id) != 0) {
921 		machtrace_probe = (void*)&systrace_stub;
922 		ddi_remove_minor_node(devi, NULL);
923 		return DDI_FAILURE;
924 	}
925 
926 	return DDI_SUCCESS;
927 }
928 
929 d_open_t _systrace_open;
930 
931 int
_systrace_open(dev_t dev,int flags,int devtype,struct proc * p)932 _systrace_open(dev_t dev, int flags, int devtype, struct proc *p)
933 {
934 #pragma unused(dev,flags,devtype,p)
935 	return 0;
936 }
937 
938 #define SYSTRACE_MAJOR  -24 /* let the kernel pick the device number */
939 
940 static struct cdevsw systrace_cdevsw =
941 {
942 	.d_open = _systrace_open,
943 	.d_close = eno_opcl,
944 	.d_read = eno_rdwrt,
945 	.d_write = eno_rdwrt,
946 	.d_ioctl = eno_ioctl,
947 	.d_stop = eno_stop,
948 	.d_reset = eno_reset,
949 	.d_select = eno_select,
950 	.d_mmap = eno_mmap,
951 	.d_strategy = eno_strat,
952 	.d_reserved_1 = eno_getc,
953 	.d_reserved_2 = eno_putc,
954 };
955 
956 void systrace_init( void );
957 
958 void
systrace_init(void)959 systrace_init( void )
960 {
961 	if (dtrace_sdt_probes_restricted()) {
962 		return;
963 	}
964 
965 	int majdevno = cdevsw_add(SYSTRACE_MAJOR, &systrace_cdevsw);
966 
967 	if (majdevno < 0) {
968 		printf("systrace_init: failed to allocate a major number!\n");
969 		return;
970 	}
971 
972 	systrace_attach((dev_info_t*)(uintptr_t)majdevno);
973 	machtrace_attach((dev_info_t*)(uintptr_t)majdevno);
974 }
975 #undef SYSTRACE_MAJOR
976 
977 static uint64_t
systrace_getargval(void * arg,dtrace_id_t id,void * parg,int argno,int aframes)978 systrace_getargval(void *arg, dtrace_id_t id, void *parg, int argno, int aframes)
979 {
980 #pragma unused(arg,id,parg,aframes)     /* __APPLE__ */
981 	uint64_t val = 0;
982 	uint64_t *uargs = NULL;
983 
984 	uthread_t uthread = current_uthread();
985 
986 	if (uthread) {
987 		uargs = uthread->t_dtrace_syscall_args;
988 	}
989 	if (!uargs) {
990 		return 0;
991 	}
992 	if (argno < 0 || argno >= SYSTRACE_NARGS) {
993 		return 0;
994 	}
995 
996 	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
997 	val = uargs[argno];
998 	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
999 	return val;
1000 }
1001 
1002 static void
systrace_getargdesc(void * arg,dtrace_id_t id,void * parg,dtrace_argdesc_t * desc)1003 systrace_getargdesc(void *arg, dtrace_id_t id, void *parg,
1004     dtrace_argdesc_t *desc)
1005 {
1006 #pragma unused(arg, id)
1007 	int sysnum = SYSTRACE_SYSNUM(parg);
1008 	uthread_t uthread = current_uthread();
1009 	uint64_t *uargs = NULL;
1010 
1011 	if (!uthread) {
1012 		desc->dtargd_ndx = DTRACE_ARGNONE;
1013 		return;
1014 	}
1015 
1016 	uargs = uthread->t_dtrace_syscall_args;
1017 
1018 	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
1019 		systrace_entry_setargdesc(sysnum, desc->dtargd_ndx,
1020 		    desc->dtargd_native, sizeof(desc->dtargd_native));
1021 	} else {
1022 		systrace_return_setargdesc(sysnum, desc->dtargd_ndx,
1023 		    desc->dtargd_native, sizeof(desc->dtargd_native));
1024 	}
1025 
1026 	if (desc->dtargd_native[0] == '\0') {
1027 		desc->dtargd_ndx = DTRACE_ARGNONE;
1028 	}
1029 }
1030 
1031 static uint64_t
machtrace_getarg(void * arg,dtrace_id_t id,void * parg,int argno,int aframes)1032 machtrace_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes)
1033 {
1034 #pragma unused(arg,id,parg,aframes)     /* __APPLE__ */
1035 	uint64_t val = 0;
1036 	syscall_arg_t *stack = (syscall_arg_t *)NULL;
1037 
1038 	uthread_t uthread = current_uthread();
1039 
1040 	if (uthread) {
1041 		stack = (syscall_arg_t *)uthread->t_dtrace_syscall_args;
1042 	}
1043 
1044 	if (!stack) {
1045 		return 0;
1046 	}
1047 
1048 	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
1049 	/* dtrace_probe arguments arg0 .. arg4 are 64bits wide */
1050 	val = (uint64_t)*(stack + argno);
1051 	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
1052 	return val;
1053 }
1054