xref: /xnu-8020.140.41/bsd/dev/dtrace/systrace.c (revision 27b03b360a988dfd3dfdf34262bb0042026747cc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <ptrauth.h>
27 
28 #include <kern/thread.h>
29 #include <mach/thread_status.h>
30 
31 /* XXX All of these should really be derived from syscall_sw.h */
#if defined (__x86_64__)
/* Bit position of the class field inside the raw 64-bit syscall number. */
#define SYSCALL_CLASS_SHIFT 24
/* Selects the class byte (bits 24..31). */
#define SYSCALL_CLASS_MASK  (0xFF << SYSCALL_CLASS_SHIFT)
/* Everything except the class byte; applied to rax to get the call number. */
#define SYSCALL_NUMBER_MASK (~SYSCALL_CLASS_MASK)
/* Applied to eax when the saved state is 32-bit. */
#define I386_SYSCALL_NUMBER_MASK (0xFFFF)
#endif
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/proc.h>
42 #include <sys/errno.h>
43 #include <sys/ioctl.h>
44 #include <sys/conf.h>
45 #include <sys/fcntl.h>
46 #include <sys/syscall.h>
47 #include <miscfs/devfs/devfs.h>
48 
49 #include <sys/dtrace.h>
50 #include <sys/dtrace_impl.h>
51 #include <sys/systrace_args.h>
52 #include "systrace.h"
53 #include <sys/stat.h>
54 #include <sys/systm.h>
55 #include <sys/conf.h>
56 #include <sys/user.h>
57 
58 #include <machine/pal_routines.h>
59 
/*
 * Frame counts handed to dtrace_probe_create() for the syscall and
 * mach_trap providers.  Every supported architecture currently uses the
 * same values; anything else is a build error.
 */
#if defined(__x86_64__) || defined(__arm__) || defined(__arm64__)
#define SYSTRACE_ARTIFICIAL_FRAMES      2
#define MACHTRACE_ARTIFICIAL_FRAMES     3
#else
#error Unknown Architecture
#endif

/* Number of elements in the per-uthread uu_arg[] array, as an int. */
#define SYSTRACE_NARGS \
	((int)(sizeof(((uthread_t)NULL)->uu_arg) / \
	    sizeof(((uthread_t)NULL)->uu_arg[0])))
71 
72 #include <sys/sysent.h>
#define sy_callc sy_call /* Map Solaris slot name to Darwin's */
#define NSYSCALL nsysent /* and is less than 500 or so */

/* Table of system call names; indexed by system call number when probes are created. */
extern const char *syscallnames[];
77 
78 #include <sys/dtrace_glue.h>
/* Map Solaris primitive names onto their Darwin DTrace equivalents. */
#define casptr dtrace_casptr
#define membar_enter dtrace_membar_producer

/*
 * Solaris loadable-syscall predicates.  Both are constants here, so the
 * "loadable but not loaded" test in the init routines is never true.
 */
#define LOADABLE_SYSCALL(a) 0 /* Not pertinent to Darwin. */
#define LOADED_SYSCALL(a) 1 /* Not pertinent to Darwin. */

/* Held while the enable/disable callbacks patch the dispatch tables. */
static LCK_MTX_DECLARE_ATTR(dtrace_systrace_lock,
    &dtrace_lck_grp, &dtrace_lck_attr);           /* probe state lock */

/* Shadow table: one slot per system call, allocated lazily by the provide callback. */
systrace_sysent_t *systrace_sysent = NULL;
/* Probe entry point called by the interposed syscall handlers. */
void (*systrace_probe)(dtrace_id_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t);

/* Forward declarations for the callbacks referenced by systrace_pops. */
static uint64_t systrace_getargval(void *, dtrace_id_t, void *, int, int);
static void systrace_getargdesc(void *, dtrace_id_t, void *, dtrace_argdesc_t *);
93 
94 void
systrace_stub(dtrace_id_t id,uint64_t arg0,uint64_t arg1,uint64_t arg2,uint64_t arg3,uint64_t arg4)95 systrace_stub(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
96     uint64_t arg2, uint64_t arg3, uint64_t arg4)
97 {
98 #pragma unused(id,arg0,arg1,arg2,arg3,arg4)
99 }
100 
101 int32_t
dtrace_systrace_syscall(struct proc * pp,void * uap,int * rv)102 dtrace_systrace_syscall(struct proc *pp, void *uap, int *rv)
103 {
104 	unsigned short      code;       /* The system call number */
105 
106 	systrace_sysent_t *sy;
107 	dtrace_id_t id;
108 	int32_t rval;
109 	syscall_arg_t *ip = (syscall_arg_t *)uap;
110 	uint64_t uargs[SYSTRACE_NARGS] = {0};
111 
112 #if defined (__x86_64__)
113 	{
114 		pal_register_cache_state(current_thread(), VALID);
115 		x86_saved_state_t   *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread());
116 
117 		if (is_saved_state64(tagged_regs)) {
118 			x86_saved_state64_t *regs = saved_state64(tagged_regs);
119 			code = regs->rax & SYSCALL_NUMBER_MASK;
120 			/*
121 			 * Check for indirect system call... system call number
122 			 * passed as 'arg0'
123 			 */
124 			if (code == 0) {
125 				code = regs->rdi;
126 			}
127 		} else {
128 			code = saved_state32(tagged_regs)->eax & I386_SYSCALL_NUMBER_MASK;
129 
130 			if (code == 0) {
131 				vm_offset_t params = (vm_offset_t) (saved_state32(tagged_regs)->uesp + sizeof(int));
132 				code = fuword(params);
133 			}
134 		}
135 	}
136 #elif defined(__arm__)
137 	{
138 		/*
139 		 * On arm, syscall numbers depend on a flavor (indirect or not)
140 		 * and can be in either r0 or r12  (always u32)
141 		 */
142 
143 		/* See bsd/dev/arm/systemcalls.c:arm_get_syscall_number */
144 		arm_saved_state_t *arm_regs = (arm_saved_state_t *) find_user_regs(current_thread());
145 
146 		/* Check for indirect system call */
147 		if (arm_regs->r[12] != 0) {
148 			code = arm_regs->r[12];
149 		} else {
150 			code = arm_regs->r[0];
151 		}
152 	}
153 #elif defined(__arm64__)
154 	{
155 		/*
156 		 * On arm64, syscall numbers depend on a flavor (indirect or not)
157 		 * ... and for u32 can be in either r0 or r12
158 		 * ... and for u64 can be in either x0 or x16
159 		 */
160 
161 		/* see bsd/dev/arm/systemcalls.c:arm_get_syscall_number */
162 		arm_saved_state_t *arm_regs = (arm_saved_state_t *) find_user_regs(current_thread());
163 
164 		if (is_saved_state32(arm_regs)) {
165 			/* Check for indirect system call */
166 			if (saved_state32(arm_regs)->r[12] != 0) {
167 				code = saved_state32(arm_regs)->r[12];
168 			} else {
169 				code = saved_state32(arm_regs)->r[0];
170 			}
171 		} else {
172 			/* Check for indirect system call */
173 			if (saved_state64(arm_regs)->x[ARM64_SYSCALL_CODE_REG_NUM] != 0) {
174 				code = saved_state64(arm_regs)->x[ARM64_SYSCALL_CODE_REG_NUM];
175 			} else {
176 				code = saved_state64(arm_regs)->x[0];
177 			}
178 		}
179 	}
180 #else
181 #error Unknown Architecture
182 #endif
183 
184 	// Bounds "check" the value of code a la unix_syscall
185 	sy = (code >= nsysent) ? &systrace_sysent[SYS_invalid] : &systrace_sysent[code];
186 
187 	systrace_args(code, ip, uargs);
188 
189 	if ((id = sy->stsy_entry) != DTRACE_IDNONE) {
190 		uthread_t uthread = current_uthread();
191 		if (uthread) {
192 			uthread->t_dtrace_syscall_args = uargs;
193 		}
194 
195 		static_assert(SYSTRACE_NARGS >= 5, "not enough system call arguments");
196 		(*systrace_probe)(id, uargs[0], uargs[1], uargs[2], uargs[3], uargs[4]);
197 
198 		if (uthread) {
199 			uthread->t_dtrace_syscall_args = NULL;
200 		}
201 	}
202 
203 
204 
205 #if 0 /* XXX */
206 	/*
207 	 * APPLE NOTE: Not implemented.
208 	 * We want to explicitly allow DTrace consumers to stop a process
209 	 * before it actually executes the meat of the syscall.
210 	 */
211 	p = ttoproc(curthread);
212 	mutex_enter(&p->p_lock);
213 	if (curthread->t_dtrace_stop && !curthread->t_lwp->lwp_nostop) {
214 		curthread->t_dtrace_stop = 0;
215 		stop(PR_REQUESTED, 0);
216 	}
217 	mutex_exit(&p->p_lock);
218 #endif
219 
220 	rval = (*sy->stsy_underlying)(pp, uap, rv);
221 
222 	if ((id = sy->stsy_return) != DTRACE_IDNONE) {
223 		uint64_t munged_rv0, munged_rv1;
224 		uthread_t uthread = current_uthread();
225 
226 		if (uthread) {
227 			uthread->t_dtrace_errno = rval; /* Establish t_dtrace_errno now in case this enabling refers to it. */
228 		}
229 		/*
230 		 * "Decode" rv for use in the call to dtrace_probe()
231 		 */
232 		if (rval == ERESTART) {
233 			munged_rv0 = -1LL; /* System call will be reissued in user mode. Make DTrace report a -1 return. */
234 			munged_rv1 = -1LL;
235 		} else if (rval != EJUSTRETURN) {
236 			if (rval) {
237 				munged_rv0 = -1LL; /* Mimic what libc will do. */
238 				munged_rv1 = -1LL;
239 			} else {
240 				switch (sy->stsy_return_type) {
241 				case _SYSCALL_RET_INT_T:
242 					munged_rv0 = rv[0];
243 					munged_rv1 = rv[1];
244 					break;
245 				case _SYSCALL_RET_UINT_T:
246 					munged_rv0 = ((u_int)rv[0]);
247 					munged_rv1 = ((u_int)rv[1]);
248 					break;
249 				case _SYSCALL_RET_OFF_T:
250 				case _SYSCALL_RET_UINT64_T:
251 					munged_rv0 = *(u_int64_t *)rv;
252 					munged_rv1 = 0LL;
253 					break;
254 				case _SYSCALL_RET_ADDR_T:
255 				case _SYSCALL_RET_SIZE_T:
256 				case _SYSCALL_RET_SSIZE_T:
257 					munged_rv0 = *(user_addr_t *)rv;
258 					munged_rv1 = 0LL;
259 					break;
260 				case _SYSCALL_RET_NONE:
261 					munged_rv0 = 0LL;
262 					munged_rv1 = 0LL;
263 					break;
264 				default:
265 					munged_rv0 = 0LL;
266 					munged_rv1 = 0LL;
267 					break;
268 				}
269 			}
270 		} else {
271 			munged_rv0 = 0LL;
272 			munged_rv1 = 0LL;
273 		}
274 
275 		/*
276 		 * <http://mail.opensolaris.org/pipermail/dtrace-discuss/2007-January/003276.html> says:
277 		 *
278 		 * "This is a bit of an historical artifact. At first, the syscall provider just
279 		 * had its return value in arg0, and the fbt and pid providers had their return
280 		 * values in arg1 (so that we could use arg0 for the offset of the return site).
281 		 *
282 		 * We inevitably started writing scripts where we wanted to see the return
283 		 * values from probes in all three providers, and we made this script easier
284 		 * to write by replicating the syscall return values in arg1 to match fbt and
285 		 * pid. We debated briefly about removing the return value from arg0, but
286 		 * decided that it would be less confusing to have the same data in two places
287 		 * than to have some non-helpful, non-intuitive value in arg0.
288 		 *
289 		 * This change was made 4/23/2003 according to the DTrace project's putback log."
290 		 */
291 		(*systrace_probe)(id, munged_rv0, munged_rv0, munged_rv1, (uint64_t)rval, 0);
292 	}
293 
294 	return rval;
295 }
296 
297 void
dtrace_systrace_syscall_return(unsigned short code,int rval,int * rv)298 dtrace_systrace_syscall_return(unsigned short code, int rval, int *rv)
299 {
300 	systrace_sysent_t *sy;
301 	dtrace_id_t id;
302 
303 	// Bounds "check" the value of code a la unix_syscall_return
304 	sy = (code >= nsysent) ? &systrace_sysent[SYS_invalid] : &systrace_sysent[code];
305 
306 	if ((id = sy->stsy_return) != DTRACE_IDNONE) {
307 		uint64_t munged_rv0, munged_rv1;
308 		uthread_t uthread = current_uthread();
309 
310 		if (uthread) {
311 			uthread->t_dtrace_errno = rval; /* Establish t_dtrace_errno now in case this enabling refers to it. */
312 		}
313 		/*
314 		 * "Decode" rv for use in the call to dtrace_probe()
315 		 */
316 		if (rval == ERESTART) {
317 			munged_rv0 = -1LL; /* System call will be reissued in user mode. Make DTrace report a -1 return. */
318 			munged_rv1 = -1LL;
319 		} else if (rval != EJUSTRETURN) {
320 			if (rval) {
321 				munged_rv0 = -1LL; /* Mimic what libc will do. */
322 				munged_rv1 = -1LL;
323 			} else {
324 				switch (sy->stsy_return_type) {
325 				case _SYSCALL_RET_INT_T:
326 					munged_rv0 = rv[0];
327 					munged_rv1 = rv[1];
328 					break;
329 				case _SYSCALL_RET_UINT_T:
330 					munged_rv0 = ((u_int)rv[0]);
331 					munged_rv1 = ((u_int)rv[1]);
332 					break;
333 				case _SYSCALL_RET_OFF_T:
334 				case _SYSCALL_RET_UINT64_T:
335 					munged_rv0 = *(u_int64_t *)rv;
336 					munged_rv1 = 0LL;
337 					break;
338 				case _SYSCALL_RET_ADDR_T:
339 				case _SYSCALL_RET_SIZE_T:
340 				case _SYSCALL_RET_SSIZE_T:
341 					munged_rv0 = *(user_addr_t *)rv;
342 					munged_rv1 = 0LL;
343 					break;
344 				case _SYSCALL_RET_NONE:
345 					munged_rv0 = 0LL;
346 					munged_rv1 = 0LL;
347 					break;
348 				default:
349 					munged_rv0 = 0LL;
350 					munged_rv1 = 0LL;
351 					break;
352 				}
353 			}
354 		} else {
355 			munged_rv0 = 0LL;
356 			munged_rv1 = 0LL;
357 		}
358 
359 		(*systrace_probe)(id, munged_rv0, munged_rv0, munged_rv1, (uint64_t)rval, 0);
360 	}
361 }
362 
/*
 * Encoding of the per-probe cookie (the parg handed to the provider
 * callbacks): the low SYSTRACE_SHIFT bits hold the system call number and
 * the bit above them is set for "entry" probes and clear for "return" probes.
 */
#define SYSTRACE_SHIFT                  16
#define SYSTRACE_ISENTRY(x)             ((int)(x) >> SYSTRACE_SHIFT)
#define SYSTRACE_SYSNUM(x)              ((int)(x) & ((1 << SYSTRACE_SHIFT) - 1))
#define SYSTRACE_ENTRY(id)              ((1 << SYSTRACE_SHIFT) | (id))
#define SYSTRACE_RETURN(id)             (id)

/*
 * NOTE(review): NSYSCALL expands to nsysent.  Unless nsysent is itself a
 * preprocessor macro, #if evaluates it as 0 and this guard can never fire --
 * confirm against <sys/sysent.h>.
 */
#if ((1 << SYSTRACE_SHIFT) <= NSYSCALL)
#error 1 << SYSTRACE_SHIFT must exceed number of system calls
#endif
372 
373 static dtrace_provider_id_t systrace_id;
374 
375 /*
376  * APPLE NOTE: Avoid name clash with Darwin automagic conf symbol.
377  * See balanced undef below.
378  */
379 #define systrace_init _systrace_init
380 
381 static void
systrace_init(const struct sysent * actual,systrace_sysent_t ** interposed)382 systrace_init(const struct sysent *actual, systrace_sysent_t **interposed)
383 {
384 	systrace_sysent_t *ssysent = *interposed;  /* Avoid sysent shadow warning
385 	                                            *       from bsd/sys/sysent.h */
386 	unsigned int i;
387 
388 	if (ssysent == NULL) {
389 		*interposed = ssysent = kmem_zalloc(sizeof(systrace_sysent_t) *
390 		    NSYSCALL, KM_SLEEP);
391 	}
392 
393 	for (i = 0; i < NSYSCALL; i++) {
394 		/* Use of volatile protects the if statement below from being optimized away */
395 		const volatile struct sysent *a = &actual[i];
396 		systrace_sysent_t *s = &ssysent[i];
397 
398 		if (LOADABLE_SYSCALL(a) && !LOADED_SYSCALL(a)) {
399 			continue;
400 		}
401 
402 		if (a->sy_callc == dtrace_systrace_syscall) {
403 			continue;
404 		}
405 
406 		s->stsy_underlying = a->sy_callc;
407 		s->stsy_return_type = a->sy_return_type;
408 	}
409 }
410 
411 
412 /*ARGSUSED*/
413 static void
systrace_provide(void * arg,const dtrace_probedesc_t * desc)414 systrace_provide(void *arg, const dtrace_probedesc_t *desc)
415 {
416 #pragma unused(arg) /* __APPLE__ */
417 	unsigned int i;
418 
419 	if (desc != NULL) {
420 		return;
421 	}
422 
423 	systrace_init(sysent, &systrace_sysent);
424 
425 	for (i = 0; i < NSYSCALL; i++) {
426 		if (systrace_sysent[i].stsy_underlying == NULL) {
427 			continue;
428 		}
429 
430 		if (dtrace_probe_lookup(systrace_id, NULL,
431 		    syscallnames[i], "entry") != 0) {
432 			continue;
433 		}
434 
435 		(void) dtrace_probe_create(systrace_id, NULL, syscallnames[i],
436 		    "entry", SYSTRACE_ARTIFICIAL_FRAMES,
437 		    (void *)((uintptr_t)SYSTRACE_ENTRY(i)));
438 		(void) dtrace_probe_create(systrace_id, NULL, syscallnames[i],
439 		    "return", SYSTRACE_ARTIFICIAL_FRAMES,
440 		    (void *)((uintptr_t)SYSTRACE_RETURN(i)));
441 
442 		systrace_sysent[i].stsy_entry = DTRACE_IDNONE;
443 		systrace_sysent[i].stsy_return = DTRACE_IDNONE;
444 	}
445 }
446 #undef systrace_init
447 
448 /*ARGSUSED*/
449 static void
systrace_destroy(void * arg,dtrace_id_t id,void * parg)450 systrace_destroy(void *arg, dtrace_id_t id, void *parg)
451 {
452 #pragma unused(arg,id) /* __APPLE__ */
453 
454 	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
455 
456 #pragma unused(sysnum)  /* __APPLE__ */
457 	/*
458 	 * There's nothing to do here but assert that we have actually been
459 	 * disabled.
460 	 */
461 	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
462 		ASSERT(systrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE);
463 	} else {
464 		ASSERT(systrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
465 	}
466 }
467 
468 /*ARGSUSED*/
469 static int
systrace_enable(void * arg,dtrace_id_t id,void * parg)470 systrace_enable(void *arg, dtrace_id_t id, void *parg)
471 {
472 #pragma unused(arg) /* __APPLE__ */
473 
474 	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
475 	int enabled = (systrace_sysent[sysnum].stsy_entry != DTRACE_IDNONE ||
476 	    systrace_sysent[sysnum].stsy_return != DTRACE_IDNONE);
477 
478 	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
479 		systrace_sysent[sysnum].stsy_entry = id;
480 	} else {
481 		systrace_sysent[sysnum].stsy_return = id;
482 	}
483 
484 	if (enabled) {
485 		ASSERT(sysent[sysnum].sy_callc == dtrace_systrace_syscall);
486 		return 0;
487 	}
488 
489 	lck_mtx_lock(&dtrace_systrace_lock);
490 	if (sysent[sysnum].sy_callc == systrace_sysent[sysnum].stsy_underlying) {
491 		/* It is not possible to write to sysent[] directly because it is const. */
492 		vm_offset_t dss = ptrauth_nop_cast(vm_offset_t, &dtrace_systrace_syscall);
493 		ml_nofault_copy((vm_offset_t)&dss, (vm_offset_t)&sysent[sysnum].sy_callc, sizeof(vm_offset_t));
494 	}
495 	lck_mtx_unlock(&dtrace_systrace_lock);
496 
497 	return 0;
498 }
499 
500 /*ARGSUSED*/
501 static void
systrace_disable(void * arg,dtrace_id_t id,void * parg)502 systrace_disable(void *arg, dtrace_id_t id, void *parg)
503 {
504 #pragma unused(arg,id) /* __APPLE__ */
505 
506 	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
507 	int disable = (systrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE ||
508 	    systrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
509 
510 	if (disable) {
511 		/*
512 		 * Usage of volatile protects the if statement below from being optimized away.
513 		 *
514 		 * Compilers are clever and know that const array values can't change in time
515 		 * and the if below is always false. That is because it can't see that DTrace
516 		 * injects dtrace_systrace_syscall dynamically and violates constness of the
517 		 * array.
518 		 */
519 		volatile const struct sysent *syscallent = &sysent[sysnum];
520 
521 		lck_mtx_lock(&dtrace_systrace_lock);
522 		if (syscallent->sy_callc == dtrace_systrace_syscall) {
523 			ml_nofault_copy((vm_offset_t)&systrace_sysent[sysnum].stsy_underlying,
524 			    (vm_offset_t)&syscallent->sy_callc, sizeof(vm_offset_t));
525 		}
526 		lck_mtx_unlock(&dtrace_systrace_lock);
527 	}
528 
529 	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
530 		systrace_sysent[sysnum].stsy_entry = DTRACE_IDNONE;
531 	} else {
532 		systrace_sysent[sysnum].stsy_return = DTRACE_IDNONE;
533 	}
534 }
535 
/*
 * Stability attributes passed to dtrace_register() for the "syscall"
 * provider.  Each row is a { stability, stability, class } triple.
 * NOTE(review): presumably the rows are, in order, the provider, module,
 * function, name and args attributes -- confirm against dtrace_pattr_t.
 */
static dtrace_pattr_t systrace_attr = {
	{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
	{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
};
543 
544 static dtrace_pops_t systrace_pops = {
545 	.dtps_provide =         systrace_provide,
546 	.dtps_provide_module =  NULL,
547 	.dtps_enable =          systrace_enable,
548 	.dtps_disable =         systrace_disable,
549 	.dtps_suspend =         NULL,
550 	.dtps_resume =          NULL,
551 	.dtps_getargdesc =      systrace_getargdesc,
552 	.dtps_getargval =       systrace_getargval,
553 	.dtps_usermode =        NULL,
554 	.dtps_destroy =         systrace_destroy
555 };
556 
557 static int
systrace_attach(dev_info_t * devi)558 systrace_attach(dev_info_t *devi)
559 {
560 	systrace_probe = (void*)&dtrace_probe;
561 	membar_enter();
562 
563 	if (ddi_create_minor_node(devi, "systrace", S_IFCHR, 0,
564 	    DDI_PSEUDO, 0) == DDI_FAILURE ||
565 	    dtrace_register("syscall", &systrace_attr, DTRACE_PRIV_USER, NULL,
566 	    &systrace_pops, NULL, &systrace_id) != 0) {
567 		systrace_probe = systrace_stub;
568 		ddi_remove_minor_node(devi, NULL);
569 		return DDI_FAILURE;
570 	}
571 
572 	return DDI_SUCCESS;
573 }
574 
575 
576 /*
577  * APPLE NOTE:  systrace_detach not implemented
578  */
579 #if !defined(__APPLE__)
580 static int
systrace_detach(dev_info_t * devi,ddi_detach_cmd_t cmd)581 systrace_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
582 {
583 	switch (cmd) {
584 	case DDI_DETACH:
585 		break;
586 	case DDI_SUSPEND:
587 		return DDI_SUCCESS;
588 	default:
589 		return DDI_FAILURE;
590 	}
591 
592 	if (dtrace_unregister(systrace_id) != 0) {
593 		return DDI_FAILURE;
594 	}
595 
596 	ddi_remove_minor_node(devi, NULL);
597 	systrace_probe = systrace_stub;
598 	return DDI_SUCCESS;
599 }
600 #endif /* __APPLE__ */
601 
602 
603 typedef kern_return_t (*mach_call_t)(void *);
604 
605 /* APPLE NOTE: From #include <kern/syscall_sw.h> which may be changed for 64 bit! */
606 #if CONFIG_REQUIRES_U32_MUNGING
607 typedef void mach_munge_t(void *);
608 #elif __arm__ && (__BIGGEST_ALIGNMENT__ > 4)
609 typedef int mach_munge_t(const void *, void *);
610 #endif
611 
/*
 * Local copy of the Mach trap table entry layout, used to declare
 * mach_trap_table[] below.  The optional members are guarded by the same
 * configuration macros as the mach_munge_t typedefs above.
 */
typedef struct {
	unsigned char           mach_trap_arg_count; /* Number of trap arguments (Arch independent) */
	unsigned char           mach_trap_u32_words; /* number of 32-bit words to copyin for U32 */
	unsigned char           mach_trap_returns_port;
	unsigned char           __mach_trap_padding;
	kern_return_t         (*mach_trap_function)(void *); /* handler; the slot machtrace patches */
#if CONFIG_REQUIRES_U32_MUNGING || (__arm__ && (__BIGGEST_ALIGNMENT__ > 4))
	mach_munge_t           *mach_trap_arg_munge32; /* system call argument munger routine for 32-bit */
#endif
#if MACH_ASSERT
	const char             *mach_trap_name;
#endif /* MACH_ASSERT */
} mach_trap_t;
625 
626 
627 #define MACH_TRAP_TABLE_COUNT   128
628 
629 extern const mach_trap_t        mach_trap_table[MACH_TRAP_TABLE_COUNT];
630 extern const int                mach_trap_count;
631 extern const char * const       mach_syscall_name_table[MACH_TRAP_TABLE_COUNT];
632 
633 
634 /* XXX From osfmk/i386/bsd_i386.c */
/*
 * Argument block received by dtrace_machtrace_syscall(): nine consecutive
 * syscall_arg_t slots, which that function also reads as an array.
 */
struct mach_call_args {
	syscall_arg_t arg1;
	syscall_arg_t arg2;
	syscall_arg_t arg3;
	syscall_arg_t arg4;
	syscall_arg_t arg5;
	syscall_arg_t arg6;
	syscall_arg_t arg7;
	syscall_arg_t arg8;
	syscall_arg_t arg9;
};
646 
/* From here on NSYSCALL means the number of Mach traps, not BSD system calls. */
#undef NSYSCALL
#define NSYSCALL mach_trap_count

/*
 * NOTE(review): mach_trap_count is declared above as an extern const int,
 * not a macro, so #if evaluates it as 0 and this guard cannot fire.
 */
#if ((1 << SYSTRACE_SHIFT) <= NSYSCALL)
#error 1 << SYSTRACE_SHIFT must exceed number of Mach traps
#endif
653 
/* Per-trap shadow record kept by the mach_trap provider. */
typedef struct machtrace_sysent {
	dtrace_id_t     stsy_entry;             /* enabled "entry" probe, or DTRACE_IDNONE */
	dtrace_id_t     stsy_return;            /* enabled "return" probe, or DTRACE_IDNONE */
	kern_return_t   (*stsy_underlying)(void *); /* original trap handler */
	int32_t         stsy_return_type;
} machtrace_sysent_t;

/* Shadow table with one slot per Mach trap; allocated lazily by machtrace_init(). */
static machtrace_sysent_t *machtrace_sysent = NULL;

/* Probe entry point called by dtrace_machtrace_syscall(). */
void (*machtrace_probe)(dtrace_id_t, uint64_t, uint64_t,
    uint64_t, uint64_t, uint64_t);

/* Forward declaration for the callback referenced by machtrace_pops. */
static uint64_t machtrace_getarg(void *, dtrace_id_t, void *, int, int);

/* Provider id filled in by dtrace_register() in machtrace_attach(). */
static dtrace_provider_id_t machtrace_id;
669 
670 static kern_return_t
dtrace_machtrace_syscall(struct mach_call_args * args)671 dtrace_machtrace_syscall(struct mach_call_args *args)
672 {
673 	int code;       /* The mach call number */
674 
675 	machtrace_sysent_t *sy;
676 	dtrace_id_t id;
677 	kern_return_t rval;
678 #if 0 /* XXX */
679 	proc_t *p;
680 #endif
681 	syscall_arg_t *ip = (syscall_arg_t *)args;
682 	mach_call_t mach_call;
683 
684 #if defined (__x86_64__)
685 	{
686 		pal_register_cache_state(current_thread(), VALID);
687 		x86_saved_state_t   *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread());
688 
689 		if (is_saved_state64(tagged_regs)) {
690 			code = saved_state64(tagged_regs)->rax & SYSCALL_NUMBER_MASK;
691 		} else {
692 			code = -saved_state32(tagged_regs)->eax;
693 		}
694 	}
695 #elif defined(__arm__)
696 	{
697 		/* r12 has the machcall number, but it is -ve */
698 		arm_saved_state_t *arm_regs = (arm_saved_state_t *) find_user_regs(current_thread());
699 		code = (int)arm_regs->r[12];
700 		ASSERT(code < 0);    /* Otherwise it would be a Unix syscall */
701 		code = -code;
702 	}
703 #elif defined(__arm64__)
704 	{
705 		/* From arm/thread_status.h:get_saved_state_svc_number */
706 		arm_saved_state_t *arm_regs = (arm_saved_state_t *) find_user_regs(current_thread());
707 		if (is_saved_state32(arm_regs)) {
708 			code = (int)saved_state32(arm_regs)->r[12];
709 		} else {
710 			code = (int)saved_state64(arm_regs)->x[ARM64_SYSCALL_CODE_REG_NUM];
711 		}
712 
713 		/* From bsd/arm64.c:mach_syscall */
714 		ASSERT(code < 0);    /* Otherwise it would be a Unix syscall */
715 		code = -code;
716 	}
717 #else
718 #error Unknown Architecture
719 #endif
720 
721 	sy = &machtrace_sysent[code];
722 
723 	if ((id = sy->stsy_entry) != DTRACE_IDNONE) {
724 		uthread_t uthread = current_uthread();
725 
726 		if (uthread) {
727 			uthread->t_dtrace_syscall_args = (void *)ip;
728 		}
729 
730 		(*machtrace_probe)(id, *ip, *(ip + 1), *(ip + 2), *(ip + 3), *(ip + 4));
731 
732 		if (uthread) {
733 			uthread->t_dtrace_syscall_args = (void *)0;
734 		}
735 	}
736 
737 #if 0 /* XXX */
738 	/*
739 	 * APPLE NOTE:  Not implemented.
740 	 * We want to explicitly allow DTrace consumers to stop a process
741 	 * before it actually executes the meat of the syscall.
742 	 */
743 	p = ttoproc(curthread);
744 	mutex_enter(&p->p_lock);
745 	if (curthread->t_dtrace_stop && !curthread->t_lwp->lwp_nostop) {
746 		curthread->t_dtrace_stop = 0;
747 		stop(PR_REQUESTED, 0);
748 	}
749 	mutex_exit(&p->p_lock);
750 #endif
751 
752 	mach_call = (mach_call_t)(*sy->stsy_underlying);
753 	rval = mach_call(args);
754 
755 	if ((id = sy->stsy_return) != DTRACE_IDNONE) {
756 		(*machtrace_probe)(id, (uint64_t)rval, 0, 0, 0, 0);
757 	}
758 
759 	return rval;
760 }
761 
762 static void
machtrace_init(const mach_trap_t * actual,machtrace_sysent_t ** interposed)763 machtrace_init(const mach_trap_t *actual, machtrace_sysent_t **interposed)
764 {
765 	machtrace_sysent_t *msysent = *interposed;
766 	int i;
767 
768 	if (msysent == NULL) {
769 		*interposed = msysent = kmem_zalloc(sizeof(machtrace_sysent_t) *
770 		    NSYSCALL, KM_SLEEP);
771 	}
772 
773 	for (i = 0; i < NSYSCALL; i++) {
774 		const volatile mach_trap_t *a = &actual[i];
775 		machtrace_sysent_t *s = &msysent[i];
776 
777 		if (LOADABLE_SYSCALL(a) && !LOADED_SYSCALL(a)) {
778 			continue;
779 		}
780 
781 		if (a->mach_trap_function == (mach_call_t)(dtrace_machtrace_syscall)) {
782 			continue;
783 		}
784 
785 		s->stsy_underlying = a->mach_trap_function;
786 	}
787 }
788 
789 /*ARGSUSED*/
790 static void
machtrace_provide(void * arg,const dtrace_probedesc_t * desc)791 machtrace_provide(void *arg, const dtrace_probedesc_t *desc)
792 {
793 #pragma unused(arg) /* __APPLE__ */
794 
795 	int i;
796 
797 	if (desc != NULL) {
798 		return;
799 	}
800 
801 	machtrace_init(mach_trap_table, &machtrace_sysent);
802 
803 	for (i = 0; i < NSYSCALL; i++) {
804 		if (machtrace_sysent[i].stsy_underlying == NULL) {
805 			continue;
806 		}
807 
808 		if (dtrace_probe_lookup(machtrace_id, NULL,
809 		    mach_syscall_name_table[i], "entry") != 0) {
810 			continue;
811 		}
812 
813 		(void) dtrace_probe_create(machtrace_id, NULL, mach_syscall_name_table[i],
814 		    "entry", MACHTRACE_ARTIFICIAL_FRAMES,
815 		    (void *)((uintptr_t)SYSTRACE_ENTRY(i)));
816 		(void) dtrace_probe_create(machtrace_id, NULL, mach_syscall_name_table[i],
817 		    "return", MACHTRACE_ARTIFICIAL_FRAMES,
818 		    (void *)((uintptr_t)SYSTRACE_RETURN(i)));
819 
820 		machtrace_sysent[i].stsy_entry = DTRACE_IDNONE;
821 		machtrace_sysent[i].stsy_return = DTRACE_IDNONE;
822 	}
823 }
824 
825 /*ARGSUSED*/
826 static void
machtrace_destroy(void * arg,dtrace_id_t id,void * parg)827 machtrace_destroy(void *arg, dtrace_id_t id, void *parg)
828 {
829 #pragma unused(arg,id) /* __APPLE__ */
830 	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
831 
832 #pragma unused(sysnum) /* __APPLE__ */
833 
834 	/*
835 	 * There's nothing to do here but assert that we have actually been
836 	 * disabled.
837 	 */
838 	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
839 		ASSERT(machtrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE);
840 	} else {
841 		ASSERT(machtrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
842 	}
843 }
844 
845 /*ARGSUSED*/
846 static int
machtrace_enable(void * arg,dtrace_id_t id,void * parg)847 machtrace_enable(void *arg, dtrace_id_t id, void *parg)
848 {
849 #pragma unused(arg) /* __APPLE__ */
850 
851 	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
852 	int enabled = (machtrace_sysent[sysnum].stsy_entry != DTRACE_IDNONE ||
853 	    machtrace_sysent[sysnum].stsy_return != DTRACE_IDNONE);
854 
855 	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
856 		machtrace_sysent[sysnum].stsy_entry = id;
857 	} else {
858 		machtrace_sysent[sysnum].stsy_return = id;
859 	}
860 
861 	if (enabled) {
862 		ASSERT(mach_trap_table[sysnum].mach_trap_function == (void *)dtrace_machtrace_syscall);
863 		return 0;
864 	}
865 
866 	lck_mtx_lock(&dtrace_systrace_lock);
867 
868 	if (mach_trap_table[sysnum].mach_trap_function == machtrace_sysent[sysnum].stsy_underlying) {
869 		/* It is not possible to write to mach_trap_table[] directly because it is const. */
870 		vm_offset_t dss = ptrauth_nop_cast(vm_offset_t, &dtrace_machtrace_syscall);
871 		ml_nofault_copy((vm_offset_t)&dss, (vm_offset_t)&mach_trap_table[sysnum].mach_trap_function, sizeof(vm_offset_t));
872 	}
873 
874 	lck_mtx_unlock(&dtrace_systrace_lock);
875 
876 	return 0;
877 }
878 
879 /*ARGSUSED*/
880 static void
machtrace_disable(void * arg,dtrace_id_t id,void * parg)881 machtrace_disable(void *arg, dtrace_id_t id, void *parg)
882 {
883 #pragma unused(arg,id) /* __APPLE__ */
884 
885 	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
886 	int disable = (machtrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE ||
887 	    machtrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
888 
889 	if (disable) {
890 		/*
891 		 * Usage of volatile protects the if statement below from being optimized away.
892 		 *
893 		 * Compilers are clever and know that const array values can't change in time
894 		 * and the if below is always false. That is because it can't see that DTrace
895 		 * injects dtrace_machtrace_syscall dynamically and violates constness of the
896 		 * array.
897 		 */
898 		volatile const mach_trap_t *machtrap = &mach_trap_table[sysnum];
899 
900 		lck_mtx_lock(&dtrace_systrace_lock);
901 		if (machtrap->mach_trap_function == (mach_call_t)dtrace_machtrace_syscall) {
902 			ml_nofault_copy((vm_offset_t)&machtrace_sysent[sysnum].stsy_underlying,
903 			    (vm_offset_t)&machtrap->mach_trap_function, sizeof(vm_offset_t));
904 		}
905 		lck_mtx_unlock(&dtrace_systrace_lock);
906 	}
907 
908 	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
909 		machtrace_sysent[sysnum].stsy_entry = DTRACE_IDNONE;
910 	} else {
911 		machtrace_sysent[sysnum].stsy_return = DTRACE_IDNONE;
912 	}
913 }
914 
/*
 * Stability attributes passed to dtrace_register() for the "mach_trap"
 * provider.  Each row is a { stability, stability, class } triple.
 * NOTE(review): presumably the rows are, in order, the provider, module,
 * function, name and args attributes -- confirm against dtrace_pattr_t.
 */
static dtrace_pattr_t machtrace_attr = {
	{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
	{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
};
922 
923 static dtrace_pops_t machtrace_pops = {
924 	.dtps_provide =         machtrace_provide,
925 	.dtps_provide_module =  NULL,
926 	.dtps_enable =          machtrace_enable,
927 	.dtps_disable =         machtrace_disable,
928 	.dtps_suspend =         NULL,
929 	.dtps_resume =          NULL,
930 	.dtps_getargdesc =      NULL,
931 	.dtps_getargval =       machtrace_getarg,
932 	.dtps_usermode =        NULL,
933 	.dtps_destroy =         machtrace_destroy
934 };
935 
936 static int
machtrace_attach(dev_info_t * devi)937 machtrace_attach(dev_info_t *devi)
938 {
939 	machtrace_probe = dtrace_probe;
940 	membar_enter();
941 
942 	if (ddi_create_minor_node(devi, "machtrace", S_IFCHR, 0,
943 	    DDI_PSEUDO, 0) == DDI_FAILURE ||
944 	    dtrace_register("mach_trap", &machtrace_attr, DTRACE_PRIV_USER, NULL,
945 	    &machtrace_pops, NULL, &machtrace_id) != 0) {
946 		machtrace_probe = (void*)&systrace_stub;
947 		ddi_remove_minor_node(devi, NULL);
948 		return DDI_FAILURE;
949 	}
950 
951 	return DDI_SUCCESS;
952 }
953 
954 d_open_t _systrace_open;
955 
956 int
_systrace_open(dev_t dev,int flags,int devtype,struct proc * p)957 _systrace_open(dev_t dev, int flags, int devtype, struct proc *p)
958 {
959 #pragma unused(dev,flags,devtype,p)
960 	return 0;
961 }
962 
#define SYSTRACE_MAJOR  -24 /* let the kernel pick the device number */

/*
 * Character device switch for the systrace device.  Only open is backed by
 * a function defined in this file (_systrace_open); every other entry is
 * filled with an eno_* or nulldev handler.
 */
static struct cdevsw systrace_cdevsw =
{
	.d_open = _systrace_open,
	.d_close = eno_opcl,
	.d_read = eno_rdwrt,
	.d_write = eno_rdwrt,
	.d_ioctl = eno_ioctl,
	.d_stop = (stop_fcn_t *)nulldev,
	.d_reset = (reset_fcn_t *)nulldev,
	.d_select = eno_select,
	.d_mmap = eno_mmap,
	.d_strategy = eno_strat,
	.d_reserved_1 = eno_getc,
	.d_reserved_2 = eno_putc,
};
980 
981 void systrace_init( void );
982 
983 void
systrace_init(void)984 systrace_init( void )
985 {
986 	if (dtrace_sdt_probes_restricted()) {
987 		return;
988 	}
989 
990 	int majdevno = cdevsw_add(SYSTRACE_MAJOR, &systrace_cdevsw);
991 
992 	if (majdevno < 0) {
993 		printf("systrace_init: failed to allocate a major number!\n");
994 		return;
995 	}
996 
997 	systrace_attach((dev_info_t*)(uintptr_t)majdevno);
998 	machtrace_attach((dev_info_t*)(uintptr_t)majdevno);
999 }
1000 #undef SYSTRACE_MAJOR
1001 
1002 static uint64_t
systrace_getargval(void * arg,dtrace_id_t id,void * parg,int argno,int aframes)1003 systrace_getargval(void *arg, dtrace_id_t id, void *parg, int argno, int aframes)
1004 {
1005 #pragma unused(arg,id,parg,aframes)     /* __APPLE__ */
1006 	uint64_t val = 0;
1007 	uint64_t *uargs = NULL;
1008 
1009 	uthread_t uthread = current_uthread();
1010 
1011 	if (uthread) {
1012 		uargs = uthread->t_dtrace_syscall_args;
1013 	}
1014 	if (!uargs) {
1015 		return 0;
1016 	}
1017 	if (argno < 0 || argno >= SYSTRACE_NARGS) {
1018 		return 0;
1019 	}
1020 
1021 	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
1022 	val = uargs[argno];
1023 	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
1024 	return val;
1025 }
1026 
1027 static void
systrace_getargdesc(void * arg,dtrace_id_t id,void * parg,dtrace_argdesc_t * desc)1028 systrace_getargdesc(void *arg, dtrace_id_t id, void *parg,
1029     dtrace_argdesc_t *desc)
1030 {
1031 #pragma unused(arg, id)
1032 	int sysnum = SYSTRACE_SYSNUM(parg);
1033 	uthread_t uthread = current_uthread();
1034 	uint64_t *uargs = NULL;
1035 
1036 	if (!uthread) {
1037 		desc->dtargd_ndx = DTRACE_ARGNONE;
1038 		return;
1039 	}
1040 
1041 	uargs = uthread->t_dtrace_syscall_args;
1042 
1043 	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
1044 		systrace_entry_setargdesc(sysnum, desc->dtargd_ndx,
1045 		    desc->dtargd_native, sizeof(desc->dtargd_native));
1046 	} else {
1047 		systrace_return_setargdesc(sysnum, desc->dtargd_ndx,
1048 		    desc->dtargd_native, sizeof(desc->dtargd_native));
1049 	}
1050 
1051 	if (desc->dtargd_native[0] == '\0') {
1052 		desc->dtargd_ndx = DTRACE_ARGNONE;
1053 	}
1054 }
1055 
1056 static uint64_t
machtrace_getarg(void * arg,dtrace_id_t id,void * parg,int argno,int aframes)1057 machtrace_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes)
1058 {
1059 #pragma unused(arg,id,parg,aframes)     /* __APPLE__ */
1060 	uint64_t val = 0;
1061 	syscall_arg_t *stack = (syscall_arg_t *)NULL;
1062 
1063 	uthread_t uthread = current_uthread();
1064 
1065 	if (uthread) {
1066 		stack = (syscall_arg_t *)uthread->t_dtrace_syscall_args;
1067 	}
1068 
1069 	if (!stack) {
1070 		return 0;
1071 	}
1072 
1073 	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
1074 	/* dtrace_probe arguments arg0 .. arg4 are 64bits wide */
1075 	val = (uint64_t)*(stack + argno);
1076 	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
1077 	return val;
1078 }
1079